Ejemplo n.º 1
0
def match(word_str):
    """Find the city entry whose key is closest to ``word_str``.

    The mapping is loaded with ``init_city('citys.list')`` and every key is
    compared to ``word_str`` with ``distance``. When several keys share the
    smallest distance, the one iterated last wins (the comparison is ``<=``).

    :param word_str: text to compare against the keys of the city map
    :return: ``(value, key)`` of the closest entry, or ``(None, None)`` when
        the city map is empty
    """
    city_map = init_city('citys.list')

    best_key = None
    best_val = None
    best_dist = None

    # Seeding with None instead of ``city_map.items()[0]`` keeps this working
    # on Python 3 (dict views are not indexable) and on an empty map.
    for key, val in city_map.items():
        d = distance(key, word_str)
        if best_dist is None or d <= best_dist:
            best_key = key
            best_val = val
            best_dist = d

    return best_val, best_key
Ejemplo n.º 2
0
    def matches(self, name):
        """
        Tell whether ``name`` is close enough to this guest's name.

        Returns True when ``distance(self.name, name)`` does not exceed
        ``RestrictedGuest.MIN_LEVENSHTEIN_DIST``.
        """
        name_distance = distance(self.name, name)
        return name_distance <= RestrictedGuest.MIN_LEVENSHTEIN_DIST
Ejemplo n.º 3
0
def rewrite_link_target(insee_code, place_label):
    """Turn a place label into an HTML link to the matching place.

    Places are looked up by ``commune_insee_code``. With a single candidate it
    is used directly; with several, an exact label match is preferred and
    otherwise the candidate whose label has the smallest ``distance`` to
    ``place_label`` is taken (first one on ties).

    :param insee_code: value compared against ``Place.commune_insee_code``
    :param place_label: label used as link text and for matching
    :return: an ``<a>`` element string, or ``place_label`` unchanged when no
        place is found
    """
    candidates = Place.query.filter(Place.commune_insee_code == insee_code).all()

    place = None
    if len(candidates) == 1:
        place = candidates[0]
    elif len(candidates) > 1:
        # exact label match first
        place = next((p for p in candidates if p.label == place_label), None)
        if place is None:
            # otherwise fall back to the closest label
            place = min(candidates,
                        key=lambda p: distance(place_label, p.label))

    if not place:
        print('@@ NOT FOUND', insee_code, place_label)
        return place_label

    return '<a href="{0}/places/{1}">{2}</a>'.format(
        current_app.config['APP_URL_PREFIX'], place.id, place_label)
Ejemplo n.º 4
0
def main():
    """Run letter recognition over every test image and print a report.

    Image ids are taken from the ``image_<id>.png`` files found in
    ``data_directory``; the matching ``text_<id>.txt`` file holds the expected
    text. For each image the letters returned by ``parse_letters`` are passed
    to ``recognise`` through a worker pool, and the predicted text is compared
    with the expected text using ``distance``.
    """
    test_data_ids = sorted(
        int(name.replace('image_', '').replace('.png', ''))
        for name in os.listdir(data_directory)
        if name.startswith('image'))

    X = []
    Y = []
    for i in test_data_ids:
        img = cv2.imread(os.path.join(data_directory, 'image_%s.png' % i), cv2.IMREAD_GRAYSCALE)
        X.append(np.asarray(img, dtype='uint8'))
        Y.append(read_file_content(os.path.join(data_directory, 'text_%s.txt' % i)))
    X = np.array(X)
    X = X / 255

    # One pool for the whole run, always shut down afterwards (the previous
    # version created a new pool per image and never released it).
    pool = Pool(threads)
    try:
        for testing_index, (data_id, current_X) in enumerate(zip(test_data_ids, X)):
            # Report the file that was actually loaded: ids need not be 0..n-1.
            image_path = os.path.join(data_directory, 'image_%s.png' % data_id)
            letter_dimensions = list(parse_letters(1 - current_X))

            # recognise still receives the positional index, as before.
            predicted_letters = pool.map(
                recognise,
                [(testing_index, dim) for dim in letter_dimensions])

            actual_text = Y[testing_index].replace('\n', ' ')
            predicted_text = ''.join(predicted_letters)
            print('> ' + image_path)
            print('>> Number of threads:   ', threads)
            print('>> Number of letters:   ', len(letter_dimensions))
            print('>> Levenshtein distance:', distance(actual_text, predicted_text))
            print('>> Length difference:   ', len(predicted_text) - len(actual_text))
            print('>> Actual:')
            print(actual_text)
            print('>> Predicted:')
            print(predicted_text)
            print()
    finally:
        pool.close()
        pool.join()
Ejemplo n.º 5
0
def compare_strings(str1, str2):
    """Return the ``distance`` between normalised forms of two strings.

    Each input is lower-cased, stripped of spaces and cut at the first comma
    before being compared. If an ``AttributeError`` is raised along the way
    (for instance an input without ``lower()``), 9999 is returned instead.
    """
    def _normalise(text):
        # lower-case, drop spaces, keep only what precedes the first comma
        lowered = text.lower()
        without_spaces = lowered.replace(" ", "")
        return str(without_spaces.split(",")[0])

    try:
        result = distance(_normalise(str1), _normalise(str2))
    except AttributeError:
        result = 9999
    return result
Ejemplo n.º 6
0
def get_words_with_levenshtein_1(words):
    """Return the first pair of different words whose ``distance`` is 1.

    Pairs are examined in nested iteration order over ``words``; equal words
    are skipped without computing a distance.

    :param words: collection of words, iterated once per outer element
    :return: ``(word_1, word_2)`` for the first pair at distance 1
    :raises RuntimeError: when no such pair exists
    """
    for first in words:
        for second in words:
            if first != second and distance(first, second) == 1:
                return first, second
    raise RuntimeError('No words with levenshtein_1 found')
Ejemplo n.º 7
0
 def most_distant_word(self, words_to_choose, words_origin):
     """Pick a word from ``words_to_choose`` that is unlike ``words_origin``.

     Every candidate is scored with its smallest ``ld.distance`` to any word
     in ``words_origin``. The first candidate whose score exceeds the
     threshold (2) is returned; if none does, the candidate with the highest
     score is returned (the earliest one on ties).
     """
     # avoid near-duplicates such as car/cars (or similarly spelled Hebrew words)
     threshold = 2

     scored = []
     for candidate in words_to_choose:
         nearest = min([ld.distance(candidate, origin) for origin in words_origin])
         scored.append((candidate, nearest))

     for candidate, nearest in scored:
         if nearest > threshold:
             return candidate

     # nothing is far enough: settle for the most distant candidate
     return sorted(scored, key=lambda pair: pair[1], reverse=True)[0][0]
Ejemplo n.º 8
0
 def get_similar_for_groups(self, positive: Iterable[str], negative: Iterable[str] = (),
                            top_n=3) -> NGramSimilarityDict:
     """Return up to ``top_n`` similar entries that are not near-duplicates.

     ``self.kv.most_similar`` is asked for ``top_n * 2`` candidates. A
     candidate ``w`` is kept when ``len(w) > 1``, ``w[0]`` is not in
     ``self.stopWords`` and ``distance(w[0], p)`` is greater than 3 for every
     ``p`` in ``positive``.

     :param positive: words passed as the positive group; also used for the
         near-duplicate filter. Any iterable is accepted, including one-shot
         generators, because it is materialised once up front.
     :param negative: words passed as the negative group
     :param top_n: maximum number of entries returned
     :return: the first ``top_n`` candidates that passed the filter
     """
     # Materialise once: the words are needed both for the query and for the
     # filter below, and a generator would be exhausted after the first use.
     positive = tuple(positive)
     most_similar = self.kv.most_similar(positive, tuple(negative), top_n * 2)

     # filter too similar words
     # NOTE(review): len(w) measures the candidate entry itself, not w[0];
     # confirm whether a minimum word length was intended here.
     most_similar_res = [
         w for w in most_similar
         if len(w) > 1 and w[0] not in self.stopWords
         and all(distance(w[0], p) > 3 for p in positive)
     ]
     return most_similar_res[:top_n]
Ejemplo n.º 9
0
    def __sub__(self, other):
        """Return the edit distance between two objects' stripped contents.

        ``strippedContents`` is filled in on first use for both operands by
        calling ``stripJavaCode`` on ``contents``. When the two stripped
        lengths differ by more than ``max(5% of their sum, 50)`` the
        comparison is skipped and 99999 is returned.
        """
        for operand in (self, other):
            if operand.strippedContents is None:
                operand.strippedContents = operand.stripJavaCode(operand.contents)

        own_length = len(self.strippedContents)
        other_length = len(other.strippedContents)

        # lengths too far apart: return the sentinel without computing a distance
        allowed_gap = max(0.05 * (own_length + other_length), 50)
        if abs(own_length - other_length) > allowed_gap:
            return 99999

        return _levenshtein.distance(self.strippedContents, other.strippedContents)
Ejemplo n.º 10
0
    def __sub__(self, other):
        """Return the edit distance between two objects' stripped contents.

        Both operands get ``strippedContents`` computed on demand through
        ``stripJavaCode(contents)``. If the stripped lengths differ by more
        than ``max(0.05 * (sum of both lengths), 50)``, 99999 is returned
        instead of a computed distance.
        """
        if self.strippedContents is None:
            self.strippedContents = self.stripJavaCode(self.contents)
        if other.strippedContents is None:
            other.strippedContents = other.stripJavaCode(other.contents)

        mine = self.strippedContents
        theirs = other.strippedContents

        length_gap = abs(len(mine) - len(theirs))
        tolerance = max(0.05 * (len(mine) + len(theirs)), 50)
        if length_gap > tolerance:
            # sizes too different to be worth comparing
            return 99999

        return _levenshtein.distance(mine, theirs)
Ejemplo n.º 11
0
 def _compute_edit_distance_matrix(input):
     """
     Computes the pairwise edit distance between the items of a list.

     Entry ``[i, j]`` holds ``distance(input[i], input[j])`` for ``i != j``.
     Diagonal entries are set to a very large value
     (``sys.maxsize - 1000``) rather than 0.

     :param input: list of items accepted by ``distance``
     :return: square numpy matrix with one row and one column per item
     """
     assert isinstance(input, list)
     size = len(input)
     matrix = numpy.zeros(shape=(size, size))

     # sys.maxint was removed in Python 3; sys.maxsize exists on 2 and 3.
     # NOTE(review): presumably the large diagonal keeps an item from being
     # picked as its own closest neighbour - confirm against the callers.
     self_distance = sys.maxsize - 1000

     for i, item_a in enumerate(input):
         for j, item_b in enumerate(input):
             if i == j:
                 matrix[i, j] = self_distance
             else:
                 matrix[i, j] = distance(item_a, item_b)
     return matrix
Ejemplo n.º 12
0
def levenshteinClosest(word):
    """Return the words of ``freqDict`` that are closest to ``word``.

    Only distances ``d`` with ``1 <= d < len(word) / 2`` are considered. The
    result is every word of ``freqDict`` at the smallest such distance, in
    ``freqDict`` iteration order. When no word qualifies (or the word is too
    short for any distance to be allowed) the result is ``[word]``.

    :param word: the word to look up
    :return: list of closest candidates, or ``[word]`` when there are none
    """
    max_operations = len(word) / 2
    if not 1 < max_operations:
        # no distance is allowed for such a short word
        return [word]

    # Compute each distance exactly once and group the words by it, instead
    # of rescanning the whole dictionary for every candidate distance.
    words_by_distance = {}
    for w in freqDict:
        words_by_distance.setdefault(lv.distance(word, w), []).append(w)

    allowed = [d for d in words_by_distance if 1 <= d < max_operations]
    if not allowed:
        return [word]
    return words_by_distance[min(allowed)]
Ejemplo n.º 13
0
def search_model(input_value, **kwargs):
    """Find the model instance whose field best matches ``input_value``.

    Each item is scored against ``input_value``:

    * 0 when the soundex codes of both values are equal,
    * 1 when the simplified input (alphanumerics only, lower-cased) is
      contained in the simplified field value,
    * otherwise the ``distance`` between the raw values, kept only when it is
      below 10.

    The item with the lowest score is returned if that score is below 5; on
    ties the item seen first wins.

    :param input_value: text to search for
    :param kwargs: ``model`` (model class to query), ``field`` (attribute name
        to compare) and optionally ``filter`` (dict of filter arguments)
    :return: the best matching item, or ``None`` when the input is not a
        string, ``model``/``field`` are missing, or nothing is close enough
    """
    model = kwargs.get("model", False)
    field = kwargs.get("field", False)
    filter_args = kwargs.get("filter", False)

    # ``unicode`` only exists on Python 2.
    try:
        string_types = (str, unicode)
    except NameError:
        string_types = (str,)

    if not (isinstance(input_value, string_types) and model and field):
        return None

    def simplify(value):
        return re.sub("[^A-Za-z0-9]", "", value).strip().replace(" ", "").lower()

    text_soundex = soundex.encode_word(input_value)
    simple_text = simplify(input_value)

    if filter_args:
        model_items = model.objects.filter(**filter_args)
    else:
        model_items = model.objects.all()

    matches = []
    for item in model_items:
        field_value = getattr(item, field, "")

        if text_soundex == soundex.encode_word(field_value):
            matches.append({"distance": 0, "item": item})
        elif simple_text in simplify(field_value):
            matches.append({"distance": 1, "item": item})
        else:
            word_distance = distance(input_value, field_value)
            if word_distance < 10:
                matches.append({"distance": word_distance, "item": item})

    if not matches:
        return None

    # min() returns the first of equally good matches, which is what the
    # stable sort used previously produced, without re-sorting per item.
    closest_match = min(matches, key=itemgetter("distance"))
    if closest_match["distance"] < 5:
        return closest_match["item"]
    return None
Ejemplo n.º 14
0
def le_levenshtein_percent(percent, string1, string2):
    """
    Returns True if Levenshtein distance between string1 and string2,
    divided by max length, is less than or equal to the given percent.

    Two empty strings are considered equal, so the result is True for them.

    :param percent: Percent expressed as a decimal between 0 and 1
    :param string1: First string to compare
    :param string2: Second string to compare
    :return: True or False
    :raises ValueError: if percent is not in the range [0, 1)

    >>> le_levenshtein_percent(0.2, 'Riyaz', 'Riaz')
    True
    >>> le_levenshtein_percent(0.2, 'Riyaz', 'Riazz')
    False

    """
    if not 0 <= percent < 1:
        raise ValueError(
            'percent must be greater that or equal to 0 and less than 1')
    max_len = max(len(string1), len(string2))
    if max_len == 0:
        # Both strings are empty: nothing differs, and dividing by the
        # maximum length would raise ZeroDivisionError.
        return True
    dist = distance(string1, string2)
    # float() keeps this a true division on Python 2 as well.
    return float(dist) / max_len <= percent
Ejemplo n.º 15
0
def parse_text(request):
    """
    Match the request's text against the samples of the enabled language model.

    Every sample of every enabled intent is scored: 0 when its soundex code
    equals that of the text, 1 when the simplified text is contained in the
    simplified sample, otherwise the ``distance`` between text and sample
    (kept only when below 10). The intent of the lowest-scoring sample is
    reported if that score is below 5.

    :param request: incoming request, converted with ``request_to_dict``
    :return: a 200 ``Response`` holding ``_text`` and, when a language model
        is enabled, ``intent`` (``None`` when nothing matched closely enough)
    """
    text = request_to_dict(request).get("text")

    try:
        interaction_model = LanguageModel.objects.get(enabled=True)
    except LanguageModel.DoesNotExist:
        # no enabled model: echo the text back without an intent
        return Response({"_text": text}, status=status.HTTP_200_OK, headers=NO_CACHE_HEADERS)

    # NOTE(review): text is None when the request has no "text" key, and
    # text.strip() below would then raise - confirm callers always send it.
    query_soundex = soundex.encode_word(text)
    query_compact = text.strip().replace(" ", "").lower()

    candidates = []
    for intent in interaction_model.intents.filter(enabled=True):
        for sample in intent.samples:
            sample_soundex = soundex.encode_word(sample)
            edit_distance = distance(text, sample)
            sample_compact = sample.strip().replace(" ", "").lower()

            if query_soundex == sample_soundex:
                score = 0
            elif query_compact in sample_compact:
                score = 1
            elif edit_distance < 10:
                score = edit_distance
            else:
                continue

            candidates.append(
                {"distance": score, "intent": intent.name, "sample": sample}
            )

    # first candidate with the lowest score, if there is any candidate at all
    best = min(candidates, key=itemgetter("distance")) if candidates else None
    if best is not None and best.get("distance") < 5:
        intent_name = best.get("intent")
    else:
        intent_name = None

    resp = {"_text": text, "intent": intent_name}
    return Response(resp, status=status.HTTP_200_OK, headers=NO_CACHE_HEADERS)
Ejemplo n.º 16
0
def calc_distance(w0, w1):
    """Return ``_levenshtein.distance`` between ``w0`` and ``w1``."""
    edit_distance = _levenshtein.distance(w0, w1)
    return edit_distance
Ejemplo n.º 17
0
def calculateLevenshteinDistance(s1, s2):
    """Return ``_levenshtein.distance`` between ``s1`` and ``s2``."""
    result = _levenshtein.distance(s1, s2)
    return result
Ejemplo n.º 18
0
def calc_levenshtein(tup):
    """Return ``_levenshtein.distance`` between the first two items of ``tup``.

    Taking a single indexable argument lets the function be used where only
    one argument per call is supplied.
    """
    first = tup[0]
    second = tup[1]
    return _levenshtein.distance(first, second)
Ejemplo n.º 19
0
def calculateLevenshteinDistance(string1, string2):
    """Return ``_levenshtein.distance`` between ``string1`` and ``string2``."""
    edit_distance = _levenshtein.distance(string1, string2)
    return edit_distance