def find_most_similar_str(str_list, s):
    # Requires numpy as `np` and a Levenshtein implementation bound to
    # `lev_distance` (imports not shown in the original snippet).
    distances = []
    for s2 in str_list:
        distances.append(lev_distance(s, s2))
    distances = np.array(distances)
    min_dst_idx = np.argmin(distances)
    return str_list[min_dst_idx], distances[min_dst_idx]
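# Usage sketch for find_most_similar_str (not from the original source).
# It assumes `lev_distance` is a standard Levenshtein implementation,
# e.g. Levenshtein.distance from the python-Levenshtein package.
import numpy as np
from Levenshtein import distance as lev_distance

candidates = ['apple', 'banana', 'cherry']
best, dist = find_most_similar_str(candidates, 'appl')
# best == 'apple', dist == 1 (one insertion turns 'appl' into 'apple')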
def search(self, filename):
    with open(filename, encoding='utf8') as fan_file:
        fan = sp(fan_file.read())

    # Create the fan windows: one flattened vector per
    # `self.window_size`-token sliding window over the fan work.
    fan_vectors = mk_vectors(fan)
    fan_win_vectors = numpy.array([
        fan_vectors[i:i + self.window_size, :].ravel()
        for i in range(fan_vectors.shape[0] - self.window_size + 1)
    ])

    duplicate_records = defaultdict(list)
    for fan_ix, row in enumerate(fan_win_vectors):
        self._windows_processed += 1
        results = self.engine.neighbours(row)

        # Extract data about the original script
        # embedded in the engine's results.
        results = [(match_ix, match_str, distance)
                   for vec, (match_ix, match_str), distance in results
                   if distance < self.distance_threshold]

        # Create a new record with original script
        # information and fan work information.
        for match_ix, match_str, distance in results:
            fan_context = str(fan[fan_ix:fan_ix + self.window_size])
            lev_d = lev_distance(match_str, fan_context)
            for window_ix in range(self.window_size):
                fan_word_ix = fan_ix + window_ix
                fan_word = fan[fan_word_ix].orth_
                fan_orth_id = fan[fan_word_ix].orth
                orig_word_ix = match_ix + window_ix
                orig_word = self.word_lowercase[orig_word_ix]
                orig_orth_id = self.orth_id[orig_word_ix]
                char = self.character[orig_word_ix]
                scene = self.scene[orig_word_ix]

                duplicate_records[(filename, fan_word_ix)].append(
                    # NOTE: This **must** match the definition
                    # of `record_structure` above.
                    [filename, fan_word_ix, fan_word, fan_orth_id,
                     orig_word_ix, orig_word, orig_orth_id,
                     char, scene, distance, lev_d, distance * lev_d]
                )

    # To deduplicate `duplicate_records`, we pick the single best
    # match, as measured by the combined distance (list index 11) for
    # the given n-gram match that first identified the word.
    for k, dset in duplicate_records.items():
        duplicate_records[k] = min(dset, key=itemgetter(11))

    return sorted(duplicate_records.values())
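# Sketch of the deduplication idiom used at the end of `search` (not from
# the original source). Each (filename, word index) key can accumulate
# several candidate matches; keeping the row with the smallest combined
# distance (list index 11) collapses them to the single best match per
# fan word. The row values here are illustrative placeholders.
from collections import defaultdict
from operator import itemgetter

records = defaultdict(list)
records[('fic.txt', 0)] = [
    ['fic.txt', 0, 'to', 7, 10, 'to', 7, 'HAMLET', 1, 0.2, 3, 0.6],
    ['fic.txt', 0, 'to', 7, 42, 'to', 7, 'HORATIO', 3, 0.1, 2, 0.2],
]
for k, dset in records.items():
    records[k] = min(dset, key=itemgetter(11))
# records[('fic.txt', 0)] is now the second row (0.2 < 0.6).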
def _find_other_entries_with_name(ocr_entries, entry_with_name_bot_left):
    # Keep entries whose text is nearly identical (Levenshtein distance
    # of at most 2) but whose bounding box is not exactly the same, so
    # the anchor entry itself is excluded.
    other_entries_with_name = [
        entry for entry in ocr_entries
        if lev_distance(entry.text, entry_with_name_bot_left.text) <= 2
        and not (entry.max_x == entry_with_name_bot_left.max_x
                 and entry.min_x == entry_with_name_bot_left.min_x
                 and entry.max_y == entry_with_name_bot_left.max_y
                 and entry.min_y == entry_with_name_bot_left.min_y)
    ]
    return other_entries_with_name
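# Usage sketch for _find_other_entries_with_name (the OcrEntry type is
# hypothetical; the entries only need `.text` and the four bounding-box
# attributes the function compares).
from collections import namedtuple
from Levenshtein import distance as lev_distance

OcrEntry = namedtuple('OcrEntry', 'text min_x max_x min_y max_y')
entries = [
    OcrEntry('Jane Doe', 10, 80, 100, 120),  # anchor entry
    OcrEntry('Jane Doe', 10, 80, 400, 420),  # same text, different box
    OcrEntry('Jane Do',  10, 78, 700, 720),  # text within distance 2
    OcrEntry('Totals',   10, 60, 900, 920),  # unrelated text
]
others = _find_other_entries_with_name(entries, entries[0])
# -> the second and third entries; the anchor excludes itself because
#    its bounding box matches exactly.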
def get_lev_distance(shows):
    """
    Adds a `lev_distance` attribute to each show: a lookup from every
    other show's japanese_title to the Levenshtein distance between
    the two shows' japanese_titles.
    """
    ## Create the attribute for all shows
    for show in shows:
        show.lev_distance = dict()

    ## Compile Levenshtein distances
    for show in shows:
        for other in shows:
            ## Skip the show itself
            if other == show:
                continue
            ## If the other show has already done the calculation,
            ## reuse it (the distance is symmetric)
            if show.japanese_title in other.lev_distance:
                show.lev_distance[other.japanese_title] = other.lev_distance[
                    show.japanese_title]
            ## Otherwise calculate it
            else:
                show.lev_distance[other.japanese_title] = lev_distance(
                    show.japanese_title, other.japanese_title)
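# Usage sketch for get_lev_distance (the Show class is hypothetical;
# the function only requires a `japanese_title` attribute).
class Show:
    def __init__(self, japanese_title):
        self.japanese_title = japanese_title

shows = [Show('Shingeki no Kyojin'),
         Show('Shingeki no Kyojin 2'),
         Show('One Piece')]
get_lev_distance(shows)
shows[0].lev_distance['Shingeki no Kyojin 2']  # -> 2 (two insertions)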
def get_lev_dist_list(query, data):
    # Levenshtein distance from each string in `query` to the single
    # `data` string.
    query_data_levs = [lev_distance(q, data) for q in query]
    return query_data_levs
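# Usage sketch for get_lev_dist_list (not from the original source):
get_lev_dist_list(['cat', 'bat', 'rats'], 'hat')  # -> [1, 1, 2]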
def levenshtein_distance(string1, string2):
    # Thin wrapper that delegates to the underlying `lev_distance`
    # implementation.
    distance = lev_distance(string1, string2)
    return distance
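# Usage sketch: the classic textbook pair.
levenshtein_distance('kitten', 'sitting')  # -> 3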