def match(word_str):
    """Return the city entry whose key is closest to ``word_str``.

    The city table is loaded with ``init_city('citys.list')`` on every call
    and each key is compared with ``word_str`` using ``distance``.

    :param word_str: text to look up.
    :return: ``(value, key)`` of the closest entry, or ``None`` when the
        table is empty. When several keys are equally close, the one that
        comes last in the table's iteration order wins.
    """
    city_map = init_city('citys.list')

    best = None  # (distance, key, value) of the closest entry so far
    # Iterating the items directly avoids ``items()[0]``, which only works
    # on Python 2 where ``items()`` returns a list.
    for key, val in city_map.items():
        d = distance(key, word_str)
        # "<=" keeps the tie-break: a later, equally close key replaces
        # an earlier one.
        if best is None or d <= best[0]:
            best = (d, key, val)

    if best is None:
        return None
    return best[2], best[1]
def matches(self, name):
    """
    Tell whether ``name`` is close enough to this restricted guest's name.

    Returns True when ``distance(self.name, name)`` does not exceed
    ``RestrictedGuest.MIN_LEVENSHTEIN_DIST``.
    """
    gap = distance(self.name, name)
    limit = RestrictedGuest.MIN_LEVENSHTEIN_DIST
    return gap <= limit
def rewrite_link_target(insee_code, place_label):
    """Turn a place label into an HTML link to the matching ``Place``.

    Places are looked up by ``commune_insee_code``. With a single hit that
    place is used; with several, an exact label match is preferred and
    otherwise the place whose label has the smallest ``distance`` to
    ``place_label`` (first one on ties).

    :return: an ``<a>`` tag pointing at the place, or ``place_label``
        unchanged (after printing a notice) when no place was found.
    """
    candidates = Place.query.filter(Place.commune_insee_code == insee_code).all()

    chosen = None
    if len(candidates) == 1:
        chosen = candidates[0]
    elif len(candidates) > 1:
        # Prefer an exact label match.
        chosen = next((c for c in candidates if c.label == place_label), None)
        if chosen is None:
            # Otherwise fall back to the closest label.
            chosen = min(candidates,
                         key=lambda c: distance(place_label, c.label))

    if not chosen:
        print('@@ NOT FOUND', insee_code, place_label)
        return place_label

    return '<a href="{0}/places/{1}">{2}</a>'.format(
        current_app.config['APP_URL_PREFIX'], chosen.id, place_label)
def main():
    """Run letter recognition on every test image and print a report.

    For each ``image_<id>.png`` in ``data_directory`` the image is loaded in
    grayscale and scaled by 1/255, and the matching ``text_<id>.txt`` is read
    as the expected text. Each image is inverted (``1 - image``), split with
    ``parse_letters``, and the pieces are passed to ``recognise`` through a
    worker pool. The Levenshtein distance and length difference between the
    expected and predicted text are printed per image.
    """
    test_data_ids = sorted(
        int(name.replace('image_', '').replace('.png', ''))
        for name in os.listdir(data_directory)
        if name.startswith('image')
    )

    X = []
    Y = []
    for i in test_data_ids:
        img = cv2.imread(os.path.join(data_directory, 'image_%s.png' % i),
                         cv2.IMREAD_GRAYSCALE)
        X.append(np.asarray(img, dtype='uint8'))
        Y.append(read_file_content(os.path.join(data_directory, 'text_%s.txt' % i)))
    X = np.array(X)
    X = X / 255

    # One pool for the whole run, always shut down afterwards. Creating a
    # new pool per image without closing it leaks worker processes/threads.
    pool = Pool(threads)
    try:
        for testing_index, current_X in enumerate(X):
            # NOTE(review): the path and the index handed to ``recognise``
            # use the enumerate position, not the id parsed from the file
            # name; they agree only when ids are 0..n-1 - confirm.
            image_path = os.path.join(data_directory, 'image_%s.png' % testing_index)
            letter_dimensions = list(parse_letters(1 - current_X))

            predicted_letters = pool.map(
                recognise,
                zip([testing_index] * len(letter_dimensions), letter_dimensions))

            actual_text = Y[testing_index].replace('\n', ' ')
            predicted_text = ''.join(predicted_letters)

            print('> ' + image_path)
            print('>> Number of threads: ', threads)
            print('>> Number of letters: ', len(letter_dimensions))
            print('>> Levenshtein distance:', distance(actual_text, predicted_text))
            print('>> Length difference: ', len(predicted_text) - len(actual_text))
            print('>> Actual:')
            print(actual_text)
            print('>> Predicted:')
            print(predicted_text)
            print()
    finally:
        pool.close()
        pool.join()
def compare_strings(str1, str2):
    """Return the ``distance`` between the normalised forms of two strings.

    Each input is lower-cased, stripped of spaces and cut at the first
    comma before comparison. If an ``AttributeError`` is raised along the
    way (for instance an input without ``lower``), 9999 is returned.
    """
    def _normalise(value):
        lowered = value.lower()
        compact = lowered.replace(" ", "")
        head = compact.split(",")[0]
        return str(head)

    try:
        left = _normalise(str1)
        right = _normalise(str2)
        return distance(left, right)
    except AttributeError:
        return 9999
def get_words_with_levenshtein_1(words):
    """Return the first ordered pair of distinct words at ``distance`` 1.

    Pairs are scanned with ``words`` as both the outer and the inner loop;
    equal words are skipped.

    :raises RuntimeError: when no such pair exists.
    """
    close_pairs = (
        (first, second)
        for first in words
        for second in words
        if not first == second and distance(first, second) == 1
    )
    found = next(close_pairs, None)
    if found is None:
        raise RuntimeError('No words with levenshtein_1 found')
    return found
def most_distant_word(self, words_to_choose, words_origin):
    """Pick a candidate word that is not too similar to any origin word.

    Every candidate is scored by its smallest ``ld.distance`` to the words in
    ``words_origin``. The first candidate scoring above the threshold is
    returned; if none does, the first candidate with the highest score is.
    """
    # Skip near-duplicates of the origin words, e.g. car/cars.
    min_gap = 2

    scored = []
    for candidate in words_to_choose:
        nearest = min(ld.distance(candidate, origin) for origin in words_origin)
        scored.append((candidate, nearest))

    for candidate, nearest in scored:
        if nearest > min_gap:
            return candidate

    # Nothing cleared the threshold: take the most distant candidate.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[0][0]
def get_similar_for_groups(self, positive: Iterable[str], negative: Iterable[str] = (), top_n=3) -> NGramSimilarityDict:
    """Return up to ``top_n`` similar entries that are not near-duplicates.

    ``self.kv.most_similar`` is asked for ``top_n * 2`` candidates. A
    candidate is kept when it has more than one element, its first element
    is not in ``self.stopWords``, and that first element is more than 3
    ``distance`` units away from every positive word.

    :param positive: words passed to ``most_similar`` as positives.
    :param negative: words passed to ``most_similar`` as negatives.
    :param top_n: maximum number of entries to return.
    :return: the first ``top_n`` candidates that passed the filter.
    """
    # Materialise once: ``positive`` is used both for the query and for the
    # filter below, so a one-shot iterable would be exhausted otherwise.
    positive = tuple(positive)
    candidates = self.kv.most_similar(positive, tuple(negative), top_n * 2)

    # Filter out words that are too similar to a positive word. ``all``
    # is vacuously true when there are no positives, so nothing is dropped
    # in that case.
    kept = [
        entry
        for entry in candidates
        if len(entry) > 1
        and entry[0] not in self.stopWords
        and all(distance(entry[0], word) > 3 for word in positive)
    ]
    return kept[:top_n]
def __sub__(self, other):
    """Return the Levenshtein distance between two stripped contents.

    ``strippedContents`` is filled in on demand from ``stripJavaCode`` for
    both operands. When the lengths differ by more than the larger of 50
    and 5% of their combined length, 99999 is returned without computing
    the distance.
    """
    for doc in (self, other):
        if doc.strippedContents is None:
            doc.strippedContents = doc.stripJavaCode(doc.contents)

    own_len = len(self.strippedContents)
    other_len = len(other.strippedContents)
    tolerance = max(0.05 * (own_len + other_len), 50)
    if abs(own_len - other_len) > tolerance:
        return 99999

    return _levenshtein.distance(self.strippedContents, other.strippedContents)
def __sub__(self, other):
    """Return the Levenshtein distance between two stripped contents.

    Each operand's ``strippedContents`` is computed with ``stripJavaCode``
    if it is still ``None``. If the two lengths are further apart than
    ``max(5% of the combined length, 50)``, the sentinel 99999 is returned
    instead of a distance.
    """
    if self.strippedContents is None:
        self.strippedContents = self.stripJavaCode(self.contents)
    if other.strippedContents is None:
        other.strippedContents = other.stripJavaCode(other.contents)

    mine = self.strippedContents
    theirs = other.strippedContents
    size_gap = abs(len(mine) - len(theirs))
    allowed_gap = max(0.05 * (len(mine) + len(theirs)), 50)

    if size_gap > allowed_gap:
        return 99999
    return _levenshtein.distance(mine, theirs)
def _compute_edit_distance_matrix(input):
    """
    Computes the pairwise edit distance between the items of a list.

    :param input: list of items accepted by ``distance``
    :return: square numpy matrix where cell ``[i, j]`` holds
        ``distance(input[i], input[j])``; the diagonal is filled with a very
        large value (the platform's maximum int minus 1000) instead of 0
    """
    assert isinstance(input, list)

    # ``sys.maxint`` only exists on Python 2; use ``sys.maxsize`` elsewhere.
    diagonal_value = getattr(sys, 'maxint', sys.maxsize) - 1000

    size = len(input)
    matrix = numpy.zeros(shape=(size, size))
    for i, item_a in enumerate(input):
        for j, item_b in enumerate(input):
            matrix[i, j] = diagonal_value if i == j else distance(item_a, item_b)
    return matrix
def levenshteinClosest(word):
    """Return the words in ``freqDict`` closest to ``word``.

    Only distances from 1 up to (but excluding) ``len(word) / 2`` are
    considered. All words at the smallest such distance are returned in
    ``freqDict`` iteration order. When there is none, the result is
    ``[word]``.
    """
    limit = len(word) / 2
    if not 1 < limit:
        # The word is too short for any distance to be allowed.
        return [word]

    # Single pass: every distance is computed once rather than once per
    # candidate radius.
    best = None
    candidateArray = []
    for w in freqDict:
        d = lv.distance(word, w)
        if not 1 <= d < limit:
            continue
        if best is None or d < best:
            best = d
            candidateArray = [w]
        elif d == best:
            candidateArray.append(w)

    if not candidateArray:
        candidateArray.append(word)
    return candidateArray
def search_model(input_value, **kwargs):
    """Find the model instance whose field best matches ``input_value``.

    Keyword arguments:
        model:  model class queried through ``model.objects``.
        field:  name of the attribute compared on each instance.
        filter: optional dict passed to ``model.objects.filter``; without it
                all objects are scanned.

    Each instance is scored 0 when its soundex code equals that of the
    input, 1 when the input (alphanumerics only, lower-cased) is contained
    in the field value, otherwise its ``distance`` to the input if that is
    below 10. The lowest-scoring instance is returned when its score is
    below 5.

    :return: the best matching instance, or ``None`` when the input is not
        a string, ``model``/``field`` is missing, or nothing is close enough.
    """
    model = kwargs.get("model", False)
    field = kwargs.get("field", False)
    filter_args = kwargs.get("filter", False)

    try:
        string_types = (str, unicode)
    except NameError:  # no separate ``unicode`` type on Python 3
        string_types = (str,)

    if not (isinstance(input_value, string_types) and model and field):
        return None

    text_soundex = soundex.encode_word(input_value)
    simple_text = (re.sub("[^A-Za-z0-9]", "", input_value)
                   .strip().replace(" ", "").lower())

    if filter_args:
        model_items = model.objects.filter(**filter_args)
    else:
        model_items = model.objects.all()

    matches = []
    for item in model_items:
        field_value = getattr(item, field, "")
        field_soundex = soundex.encode_word(field_value)
        simple_field_text = (re.sub("[^A-Za-z0-9]", "", field_value)
                             .strip().replace(" ", "").lower())
        if text_soundex == field_soundex:
            matches.append({"distance": 0, "item": item})
        elif simple_text in simple_field_text:
            matches.append({"distance": 1, "item": item})
        else:
            # Only pay for the edit distance when the cheaper checks fail.
            word_distance = distance(input_value, field_value)
            if word_distance < 10:
                matches.append({"distance": word_distance, "item": item})

    if not matches:
        return None

    # ``min`` returns the first of equally good matches, like a stable sort.
    best = min(matches, key=itemgetter("distance"))
    if best["distance"] < 5:
        return best["item"]
    # Nothing close enough: return None instead of dereferencing None.
    return None
def le_levenshtein_percent(percent, string1, string2):
    """
    Returns True if Levenshtein distance between string1 and string2,
    divided by max length, is less than or equal to the given percent.

    Two empty strings are considered equal and always return True.

    :param percent: Percent expressed as a decimal between 0 and 1
    :param string1: First string to compare
    :param string2: Second string to compare
    :return: True or False
    :raises ValueError: if percent is not in the range [0, 1)

    >>> le_levenshtein_percent(0.2, 'Riyaz', 'Riaz')
    True
    >>> le_levenshtein_percent(0.2, 'Riyaz', 'Riazz')
    False
    """
    if not 0 <= percent < 1:
        raise ValueError(
            'percent must be greater that or equal to 0 and less than 1')

    max_len = max(len(string1), len(string2))
    if max_len == 0:
        # Both strings are empty: identical, and dividing would be 0 / 0.
        return True

    dist = distance(string1, string2)
    # float() keeps this a true division even under Python 2 semantics.
    return float(dist) / max_len <= percent
def parse_text(request):
    """
    Match the request text against the samples of the enabled language
    model, using soundex and Levenshtein distance, and report the intent.

    Every sample of every enabled intent is scored: 0 when its soundex code
    equals that of the text, 1 when the text (spaces removed, lower-cased)
    is contained in the sample, otherwise the edit distance if it is below
    10. The intent of the first lowest-scoring sample is reported when that
    score is below 5; otherwise the intent is ``None``. When no enabled
    language model exists, only the text is echoed back.

    :param request: incoming request; its ``text`` entry is interpreted
    :return: ``Response`` with status 200 and the no-cache headers
    """
    payload = request_to_dict(request)
    text = payload.get("text")

    try:
        language_model = LanguageModel.objects.get(enabled=True)
    except LanguageModel.DoesNotExist:
        body = {"_text": text}
    else:
        spoken_code = soundex.encode_word(text)
        squashed_text = text.strip().replace(" ", "").lower()

        scored = []  # (score, intent name) per accepted sample
        for intent in language_model.intents.filter(enabled=True):
            for sample in intent.samples:
                sample_code = soundex.encode_word(sample)
                edit_gap = distance(text, sample)
                squashed_sample = sample.strip().replace(" ", "").lower()

                if spoken_code == sample_code:
                    score = 0
                elif squashed_text in squashed_sample:
                    score = 1
                elif edit_gap < 10:
                    score = edit_gap
                else:
                    continue
                scored.append((score, intent.name))

        # First entry with the lowest score, as a stable sort would give.
        best = min(scored, key=lambda pair: pair[0]) if scored else None
        if best is not None and best[0] < 5:
            intent_name = best[1]
        else:
            intent_name = None

        body = {
            "_text": text,
            "intent": intent_name,
        }

    return Response(body, status=status.HTTP_200_OK, headers=NO_CACHE_HEADERS)
def calc_distance(w0, w1):
    """Return ``_levenshtein.distance`` between ``w0`` and ``w1``."""
    edit_distance = _levenshtein.distance(w0, w1)
    return edit_distance
def calculateLevenshteinDistance(s1, s2):
    """Return ``_levenshtein.distance`` between ``s1`` and ``s2``."""
    result = _levenshtein.distance(s1, s2)
    return result
def calc_levenshtein(tup):
    """Return ``_levenshtein.distance`` between the first two items of ``tup``.

    Takes a single sequence argument rather than two separate values.
    """
    first = tup[0]
    second = tup[1]
    return _levenshtein.distance(first, second)
def calculateLevenshteinDistance(string1, string2):
    """Return ``_levenshtein.distance`` between ``string1`` and ``string2``."""
    edit_distance = _levenshtein.distance(string1, string2)
    return edit_distance