def A_star_search(tree, source_word, goal_word):
    """Search for a chain of words from ``source_word`` to ``goal_word`` using A*.

    Neighbours of a word are the results of ``bk_search(tree, word, 1)``; the
    neighbour word is read from index 1 of every result. Both the step cost and
    the heuristic are ``editdistance.distance``.

    Args:
        tree: Search structure handed unchanged to ``bk_search``.
        source_word: Word the search starts from.
        goal_word: Word the search tries to reach.

    Returns:
        ``reconstruct_path(came_from, goal_word)`` when the goal is reached,
        otherwise ``reconstruct_path(came_from, source_word)``.
    """
    # Words that have been fully expanded.
    visited = set()
    # Words discovered but not expanded yet; only the start is known at first.
    frontier = {source_word}
    # Best known predecessor of every discovered word.
    predecessor = {}
    # Cheapest known cost from the start to each word.
    cost_from_start = {source_word: 0}
    # Known cost plus heuristic estimate of the remaining cost to the goal.
    estimated_total = {
        source_word: editdistance.distance(source_word, goal_word)
    }

    while frontier:
        # Pick the frontier word with the lowest estimated total cost.
        current = find_min(estimated_total, frontier)
        if current == goal_word:
            return reconstruct_path(predecessor, current)

        frontier.remove(current)
        visited.add(current)

        for candidate in bk_search(tree, current, 1):
            neighbour = candidate[1]
            if neighbour in visited:
                # Already expanded; nothing more to learn here.
                continue

            cost_via_current = (cost_from_start.get(current, sys.maxsize)
                                + editdistance.distance(current, neighbour))

            if neighbour in frontier:
                if cost_via_current >= cost_from_start.get(neighbour, sys.maxsize):
                    # The route already recorded is at least as cheap.
                    continue
            else:
                frontier.add(neighbour)

            # Best route to this neighbour so far: record it.
            predecessor[neighbour] = current
            cost_from_start[neighbour] = cost_via_current
            estimated_total[neighbour] = (
                cost_via_current + editdistance.distance(neighbour, goal_word))

    # No route exists: fall back to the path built from the source word alone.
    return reconstruct_path(predecessor, source_word)
def get_bot_accuracies(bot, scored_qa_pairs=None, min_qa_bot_confidence=.2):
    """ Compare answers from bot to answers in test set

    For every question the bot is primed with each text returned by
    ``scrape_wikipedia.find_document_texts`` in turn, until one of its replies
    scores above ``min_qa_bot_confidence``. The best reply is then compared to
    the reference answer with a vector similarity and three normalised edit
    distances (raw, lower-cased/stripped, character-folded).

    Args:
        bot: Object exposing ``reset_context(text)`` and ``reply(question)``;
            ``reply`` must return sortable ``(confidence, answer)`` pairs.
        scored_qa_pairs: Iterable of dicts with ``question``, ``answer`` and
            ``topic`` keys, a string passed to ``load_qa_dataset``, or ``None``
            to load the default dataset.
        min_qa_bot_confidence: Confidence above which a reply is accepted and
            the remaining context texts are skipped.

    Returns:
        A list of dict copies, one per input pair, with the added keys
        ``bot_answer``, ``bot_w2v_similarity``, ``bot_ed_distance``,
        ``bot_ed_distance_low``, ``bot_ed_distance_folded`` and
        ``bot_accuracy``. The input dicts themselves also receive these keys.

    >>> from qary.skills import glossary_bots
    >>> bot = glossary_bots.Bot()
    >>> scored_qa_pairs = [dict(question='What is RMSE?', answer='Root Mean Square Error', score=.9, topic='ds')]
    >>> get_bot_accuracies(bot=bot, scored_qa_pairs=scored_qa_pairs)[0]['bot_accuracy']
    1.0
    >>> scored_qa_pairs = [dict(question='What is RMSE?', answer='root-mean-sqr-error', score=.9, topic='ds')]
    >>> get_bot_accuracies(bot=bot, scored_qa_pairs=scored_qa_pairs)[0]
    {'question': 'What is RMSE?', 'answer': 'root-mean-sqr-error', 'score': 0.9, 'topic': 'ds', 'bot_answer': 'Root Mean Square Error', 'bot_w2v_similarity': 0.64..., 'bot_ed_distance': 0.52..., 'bot_ed_distance_low': 0.31..., 'bot_ed_distance_folded': 0.15..., 'bot_accuracy': 0.65...}
    """
    if scored_qa_pairs is None:
        scored_qa_pairs = load_qa_dataset()
    if isinstance(scored_qa_pairs, str):
        scored_qa_pairs = load_qa_dataset(scored_qa_pairs)

    validated_qa_pairs = []
    for truth in scored_qa_pairs:
        texts = scrape_wikipedia.find_document_texts(
            topic=truth['topic'], max_results=10)

        # Reset per question: without this an empty `texts` would either raise
        # NameError (first question) or reuse the previous question's replies.
        replies = []
        for context in texts:
            bot.reset_context(context)
            replies = sorted(bot.reply(truth['question']))
            # `replies` is sorted ascending, so the last entry is the best one.
            if replies and replies[-1][0] > min_qa_bot_confidence:
                break
        replies = replies or [(0, "Sorry, I don't know.")]

        answer = truth['answer']
        bot_answer = replies[-1][1]
        # Guard against an empty reference answer (division by zero).
        raw_len = max(len(answer), 1)
        stripped_len = max(len(answer.strip()), 1)

        truth['bot_answer'] = bot_answer
        truth['bot_w2v_similarity'] = nlp(bot_answer).similarity(nlp(answer))
        truth['bot_ed_distance'] = distance(answer, bot_answer) / raw_len
        truth['bot_ed_distance_low'] = distance(
            answer.lower().strip(), bot_answer.lower().strip()) / stripped_len
        truth['bot_ed_distance_folded'] = distance(
            fold_characters(answer), fold_characters(bot_answer)) / stripped_len

        # Equal-weight blend of semantic similarity and (1 - mean edit distance).
        mean_ed_distance = (truth['bot_ed_distance']
                            + truth['bot_ed_distance_low']
                            + truth['bot_ed_distance_folded']) / 3
        truth['bot_accuracy'] = (.5 * truth['bot_w2v_similarity']
                                 + .5 * (1 - mean_ed_distance))
        validated_qa_pairs.append(dict(truth))
    return validated_qa_pairs
def cer(_pred, _true, norm=True):
    """
    Computes the Character Error Rate, defined as the edit distance between
    the two sentences after all spaces have been removed.

    Arguments:
        _pred (string): space-separated sentence (prediction)
        _true (string): space-separated sentence (reference)
        norm (bool): when true, divide the distance by the number of
            characters in the space-free reference (or by 1 if it is empty)

    Returns:
        float: the raw or normalised edit distance
    """
    hypothesis = _pred.replace(" ", "")
    reference = _true.replace(" ", "")
    edits = float(editdistance.distance(hypothesis, reference))
    if not norm:
        return edits
    # An empty reference is normalised by 1 to avoid dividing by zero.
    return edits / (len(reference) or 1)
def main():
    """Compare predicted words with the labelled words and dump the result.

    Reads the label-to-word mapping from ``json_file`` and the predictions
    from ``predicted_csv_file`` (both module-level names), keeps the most
    confident prediction per label, computes the edit distance between the
    actual and predicted word of each label, prints two summary counts and
    writes the merged table to ``comparison.csv``.
    """
    with open(json_file) as f:
        label_dict = json.load(f)
    # JSON object keys are strings; the labels are compared as integers.
    label_dict = {int(key): val for key, val in label_dict.items()}

    actual_df = pd.DataFrame.from_dict(label_dict, orient='index')
    actual_df.reset_index(level=0, inplace=True)
    actual_df.columns = ['label', 'actual_word']

    predicted_df = pd.read_csv(predicted_csv_file)
    # Keep only the highest-confidence row of every predicted label.
    best_rows = predicted_df.groupby('predicted_label')['confidence'].idxmax()
    predicted_df = predicted_df.loc[best_rows]
    predicted_df = predicted_df.rename(columns={
        'word': 'predicted_word',
        'predicted_label': 'label'
    })

    combined_df = actual_df.merge(predicted_df, on='label')
    combined_df = combined_df.sort_values(by=['label'])
    combined_df['edit_distance'] = combined_df.apply(
        lambda row: ed.distance(row.actual_word, row.predicted_word), axis=1)

    print("Number of total rows :", combined_df.shape[0])
    # `.count()` counts every non-null entry of the boolean mask (i.e. all
    # rows); `.sum()` counts only the rows where the mask is True.
    print("Number of rows where editdistance is zero:",
          (combined_df.edit_distance == 0).sum())
    combined_df.to_csv("comparison.csv")
def calc_pattern_reliability(self):
    """Calculate the average edit distance between the spatial pattern of
    each core assembly and each of its significant patterns.

    For every core a binary vector is built (ones at ``core_pidx[c][0][0]``)
    and compared, via ``ed.distance``, with every column of that core's
    pattern raster.

    Returns:
        tuple: ``(per-core distance arrays, per-core mean distances,
        mean of the per-core means)``; the means ignore NaNs.
    """
    n_cores = self.get_ncores()
    distances_per_core = [[]] * n_cores
    mean_per_core = [[]] * n_cores
    core_patch_idx = self.get_core_PatchIdx()

    for core in range(n_cores):
        raster = self.get_patterns_raster()[core]
        n_patterns = raster.shape[1]

        # Binary membership vector of the core assembly.
        core_vector = np.zeros(raster.shape[0])
        core_vector[core_patch_idx[core][0][0]] = 1
        core_as_list = core_vector.tolist()

        # Start from NaN so unfilled slots are ignored by nanmean.
        distances = np.full(n_patterns, np.nan)
        for col in range(n_patterns):
            distances[col] = ed.distance(core_as_list, raster[:, col].tolist())

        distances_per_core[core] = distances
        mean_per_core[core] = np.nanmean(distances)

    grand_mean = np.nanmean(mean_per_core)
    return distances_per_core, mean_per_core, grand_mean
def evaluate_image(self, gt, pred):
    """Compute character-level precision, recall and h-mean for one image.

    For each ground-truth/prediction pair the number of correct characters is
    ``max(len(pred), len(gt)) - edit_distance``. Precision divides the summed
    correct count by the total predicted length, recall by the total
    ground-truth length.

    Args:
        gt: Sequence of ground-truth texts.
        pred: Sequence of predicted texts, paired with ``gt`` by position.

    Returns:
        dict with keys ``pred_num``, ``gt_num``, ``correct_num``,
        ``precision``, ``recall`` and ``hmean``. Precision and recall are 0
        when their denominator is 0 (no predicted / no ground-truth
        characters) instead of raising ZeroDivisionError.
    """
    correct_num = 0
    pred_sum_num = 0
    gt_sum_num = 0
    for gt_text, pred_text in zip(gt, pred):
        pred_num = len(pred_text)
        gt_num = len(gt_text)
        dist = distance(pred_text, gt_text)
        correct_num += max(pred_num, gt_num) - dist
        pred_sum_num += pred_num
        gt_sum_num += gt_num

    # Empty predictions or empty ground truth would otherwise divide by zero.
    precision = correct_num / pred_sum_num if pred_sum_num else 0
    recall = correct_num / gt_sum_num if gt_sum_num else 0
    if precision + recall == 0:
        hmean = 0
    else:
        hmean = 2.0 * precision * recall / (precision + recall)

    return {
        'pred_num': pred_sum_num,
        'gt_num': gt_sum_num,
        'correct_num': correct_num,
        'precision': precision,
        'recall': recall,
        'hmean': hmean
    }
def update(self, preds, targets):
    """Accumulate edit distance and target length over one batch.

    Zeros are removed from each target (via ``.tolist()``) before it is
    compared with the matching prediction, which is used as is.

    Args:
        preds: Indexable batch of predicted sequences.
        targets: Indexable batch of targets, each exposing ``.tolist()``.
    """
    for row, pred_list in enumerate(preds):
        # Drop every 0 entry from the target before comparing.
        targets_list = [token for token in targets[row].tolist() if token != 0]
        self.edit_distance += editdistance.distance(targets_list, pred_list)
        self.target_length += len(targets_list)
def compute_cer(predicts: List[Char], targets: List[Char], indistinguish: bool) -> Tuple[torch.Tensor, torch.Tensor]:
    '''
    Calculate the edit distances needed for a CER between paired sequences

    Params:
    -------
    - predicts: List of predicted character sequences
    - targets: List of target character sequences, paired by position
    - indistinguish: True lower-cases every character first
      (case-insensitive comparison); False compares case-sensitively

    Returns:
    --------
    - distances: tensor holding one edit distance per pair
    - n_references: tensor holding the length of each target
    '''
    assert type(predicts) == type(
        targets), 'predicts and targets must be the same type'
    assert len(predicts) == len(
        targets), 'predicts and targets must have the same length'

    if indistinguish:
        predicts = [[str.lower(ch) for ch in predict] for predict in predicts]
        targets = [[str.lower(ch) for ch in target] for target in targets]

    pair_distances = []
    for predict, target in zip(predicts, targets):
        pair_distances.append(ed.distance(predict, target))

    distances = torch.tensor(pair_distances)
    num_references = torch.tensor([len(target) for target in targets])
    return distances, num_references
def error_rate(hyps, targets, verbose=0):
    """Compute the character error rate (== label error rate) in percent.

    Args:
        hyps: Sequence of hypothesis label sequences.
        targets: Sequence of reference label sequences, same length as
            ``hyps``.
        verbose: When greater than 0, print every pair and its distance.

    Returns:
        float: ``100 * total_edits / total_target_length``. When the total
        target length is 0 the result is ``0.0`` if no edits were needed and
        ``inf`` otherwise, instead of raising ZeroDivisionError.
    """
    assert len(hyps) == len(targets)

    tot_edits = 0.0
    tot_len = 0.0
    for idx, (h, t) in enumerate(zip(hyps, targets)):
        distance = editdistance.distance(np.array(h), np.array(t))

        if verbose > 0:
            # If necessary, get 'alphabet' as argument after which you can compare strings.
            # CHECK: Make sure no blanks/ class #0 in here
            print("error_rate() [" + str(idx) + "] hyps: " + str(tensorList2list(h)))
            print("error_rate() [" + str(idx) + "] targets: " + str(tensorList2list(t)))
            print("error_rate() [" + str(idx) + "] distance: " + str(distance))

        tot_edits += distance
        tot_len += len(t)

    # No reference labels at all: the ratio is undefined, so report a perfect
    # score only if nothing had to be edited.
    if tot_len == 0:
        return 0.0 if tot_edits == 0 else float('inf')

    # Compute character error rate (CER) == label error rate (LER)
    cer = (tot_edits * 100.0) / tot_len
    return cer
def _get_most_similar_entity(self, text, entities, keywords, unknown_dist, unknown):
    """
    Return the entity whose keywords are closest to any word of ``text``.

    Every entity is paired with its list of keywords.
    entities = [...]
    keywords = [[], [], ...]

    Punctuation is stripped from ``text``, the text is split on single
    spaces, and every word and keyword is reduced to the normal form of its
    first ``self._morph.parse`` result. The entity holding the smallest
    word/keyword edit distance wins; ``unknown`` is returned unless that
    distance is below ``unknown_dist``.
    """
    def normal_form(token):
        return self._morph.parse(token)[0].normal_form

    text = text.translate(str.maketrans('', '', string.punctuation))
    words = [normal_form(word) for word in text.split(" ")]

    best_dist = 10000000
    best_entity = None
    for entity_keywords, entity in zip(keywords, entities):
        # Skip empty entities
        if len(entity_keywords) == 0:
            continue
        normalized_keywords = [normal_form(keyword) for keyword in entity_keywords]

        for word in words:
            # NOTE(review): words are normalised a second time here, exactly
            # as before; normal forms are not guaranteed to be idempotent.
            word = normal_form(word)
            closest = min(editdistance.distance(word, keyword)
                          for keyword in normalized_keywords)
            if closest < best_dist:
                best_entity = entity
                best_dist = closest

    return best_entity if best_dist < unknown_dist else unknown
def replace_suspect_word_to_sentence(word, sent, dis=1):
    """Replace spans of ``sent`` that sound like ``word`` with ``word``.

    Both strings are converted to pinyin (``TONE3`` style) with any trailing
    tone marker found in ``tone`` removed. A window of as many syllables as
    ``word`` slides over the sentence; where the joined pinyin of the window
    is within edit distance ``dis`` of the joined pinyin of ``word``, the
    characters at that position are replaced by ``word``.

    Windows containing an empty syllable, or a single-character syllable that
    is not in ``letters``, are skipped. (Previously the validation loop only
    ``break``-ed out of itself, so such windows were still compared.)

    Args:
        word: Replacement word.
        sent: Sentence to correct.
        dis: Maximum pinyin edit distance that counts as a match.

    Returns:
        str: The sentence with matching spans replaced.
    """
    def _toneless_pinyin(text):
        # One entry per syllable, without the trailing tone marker.
        return [
            syl[0][:-1] if syl[0][-1] in tone else syl[0]
            for syl in pypinyin.pinyin(text, style=pypinyin.TONE3)
        ]

    def _is_valid(syllable):
        if len(syllable) == 0:
            return False
        return len(syllable) != 1 or syllable in letters

    sent_pinyin = _toneless_pinyin(sent)
    sent_chars = list(sent)
    word_pinyin = _toneless_pinyin(word)
    word_len = len(word_pinyin)
    # Loop invariant: the pinyin of the replacement word never changes.
    word_edit = ''.join(word_pinyin)

    replace_pos = []
    for i in range(len(sent_pinyin) - word_len + 1):
        window = sent_pinyin[i:i + word_len]
        if not all(_is_valid(syllable) for syllable in window):
            continue
        if editdistance.distance(''.join(window), word_edit) <= dis:
            replace_pos.append(i)

    for pos in replace_pos:
        sent_chars[pos:pos + word_len] = word
    return ''.join(sent_chars)
def compute_distance(self, predict: str, target: str) -> float:
    """Compute the edit distance between two strings, normalised by the
    length of ``target``.

    Args:
        predict: Predicted string.
        target: Reference string.

    Returns:
        ``ed.distance(predict, target) / len(target)``. An empty ``target``
        is normalised by 1 instead of raising ZeroDivisionError.
    """
    distance = float(ed.distance(predict, target))
    # Guard the empty reference, as the file's `cer` helper does.
    return distance / max(len(target), 1)
def clean_up(df, threshold, field):
    '''
    Homogenises a field of the data frame by replacing similar entries with a
    canonical form, defined as the most repeated form within each group of
    similar entries.

    Two strings are similar when ``2 * distance / (len(a) + len(b))`` is
    greater than 0 and at most ``threshold``. Occurrence counts are taken
    once, before any replacement is made.

    Parameters:
    df          The bottin pandas dataframe to be cleaned. It is left
                untouched.
    threshold   A float (0-1] representing the maximum relative distance for
                2 strings to be considered similar
    field       A string containing the name of the field to be cleaned up.
                The field must be string-valued

    returns:
    clean_df    The cleaned up dataframe
    '''
    clean_df = df.copy()
    # Work on a copy of the column: replacing in place on `df[field]` would
    # modify the caller's data frame.
    series = df[field].copy()
    counts = series.value_counts(sort=True)
    unique = series.unique()

    for i, str1 in enumerate(unique):
        for str2 in unique[i + 1:]:
            dist = 2 * distance(str1, str2) / (len(str1) + len(str2))
            if 0 < dist <= threshold:
                # The more frequent spelling wins; ties go to the later one.
                if counts[str2] < counts[str1]:
                    canon, abr = str1, str2
                else:
                    canon, abr = str2, str1
                series = series.replace(to_replace=abr, value=canon)

    clean_df[field] = series
    return clean_df
def _min_distance_between_texts(self, src: str, target: str) -> int:
    """Return the smallest edit distance between ``target`` and any window
    of ``src``.

    Both texts are converted with ``self._text_to_normal_forms``. The target
    forms are joined with spaces; every run of consecutive source forms of
    length 1 up to ``1 + self._max_window_size`` is joined the same way and
    compared with it.

    Returns:
        The minimum distance found, or ``sys.maxsize`` when ``src`` yields no
        normal forms.
    """
    src_norms, _ = self._text_to_normal_forms(src)
    tar_norms, _ = self._text_to_normal_forms(target)
    target_text = " ".join(tar_norms)

    best = sys.maxsize
    # Linear time (of normal forms)
    for start, first_word in enumerate(src_norms):
        phrase = first_word
        best = min(editdistance.distance(phrase, target_text), best)
        # Grow the phrase one following word at a time.
        tail = src_norms[start + 1:start + 1 + self._max_window_size]
        for next_word in tail:
            phrase = " ".join((phrase, next_word))
            best = min(editdistance.distance(phrase, target_text), best)
    return best
def update(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
    """Accumulate the per-sample normalised edit distance of one batch.

    Tokens contained in ``self.ignore_tokens`` are removed from both
    sequences. Each sample adds ``distance / max(len(pred), len(target))`` to
    ``self.error``; ``self.total`` grows by the batch size.

    Args:
        preds: Batch of predicted token ids, indexed along dimension 0.
        targets: Batch of target token ids, paired with ``preds`` by row.
    """
    N = preds.shape[0]
    for ind in range(N):
        pred = [tok for tok in preds[ind].tolist() if tok not in self.ignore_tokens]
        target = [tok for tok in targets[ind].tolist() if tok not in self.ignore_tokens]
        longest = max(len(pred), len(target))
        if longest == 0:
            # Both sequences are empty after filtering: they are identical,
            # so the sample adds no error (and must not divide by zero).
            continue
        distance = editdistance.distance(pred, target)
        self.error = self.error + distance / longest
    self.total = self.total + N
def compute_distance(self, predict: str, target: str) -> float:
    """Compute the word-level edit distance between two strings.

    Each argument is joined into a single string and split on single spaces;
    the distance between the two word lists is divided by the number of
    target words.
    """
    predicted_words = "".join(predict).split(" ")
    target_words = "".join(target).split(" ")
    word_edits = float(ed.distance(predicted_words, target_words))
    return word_edits / len(target_words)
def check_strict(gt, pred):
    """Grade a prediction against the ground truth.

    Returns:
        2 for an exact match (edit distance 0), 1 when the edit distance is
        below 30% of the longer sequence, and 0 otherwise, including whenever
        the lengths differ by two or more.
    """
    gt_len, pred_len = len(gt), len(pred)
    # Cheap rejection before computing the edit distance.
    if abs(gt_len - pred_len) >= 2:
        return 0

    dis = editdistance.distance(gt, pred)
    if dis == 0:
        return 2
    return 1 if dis < max(gt_len, pred_len) * 0.3 else 0
def update(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
    """Accumulate the total edit distance and total length of one batch.

    Tokens contained in ``self.ignore_tokens`` are removed from both
    sequences. ``self.dist_leven`` grows by each pair's edit distance and
    ``self.len_total`` by the length of the longer sequence of the pair.
    """
    def without_ignored(row):
        return [tok for tok in row.tolist() if tok not in self.ignore_tokens]

    for row in range(preds.shape[0]):
        pred_tokens = without_ignored(preds[row])
        target_tokens = without_ignored(targets[row])
        self.dist_leven += editdistance.distance(pred_tokens, target_tokens)
        self.len_total += max(len(pred_tokens), len(target_tokens))
def cer(_pred, _true, norm=True):
    """
    Computes the Character Error Rate using the `editdistance` library.

    Spaces are removed from both sentences before they are compared.

    Parameters
    ----------
    _pred : str
        space-separated sentence (prediction)
    _true : str
        space-separated sentence (ground truth)
    norm : bool
        divide by the length of the space-free ground truth
        (an empty ground truth divides by 1)

    Returns
    -------
    float
        the edit distance, normalised when `norm` is true
    """
    _pred = _pred.replace(" ", "")
    _true = _true.replace(" ", "")
    raw_distance = float(editdistance.distance(_pred, _true))
    divisor = max(len(_true), 1) if norm else 1
    return raw_distance / divisor
def _levenshtein_candidates(self, predict_word):
    """Return the dictionary words closest to ``predict_word``.

    Args:
        predict_word: Word to look up.

    Returns:
        list: Every word of ``self.dict_words`` whose edit distance to
        ``predict_word`` equals the minimum, in dictionary order. An empty
        dictionary yields an empty list instead of raising ValueError.
    """
    dist = {word: ed.distance(predict_word, word) for word in self.dict_words}
    if not dist:
        # min() of an empty sequence would raise; there are no candidates.
        return []
    min_dist = min(dist.values())
    return [word for word, value in dist.items() if value == min_dist]
def findWord(parola, insieme):
    """Find the word of ``insieme`` closest to ``parola``.

    Returns:
        tuple: ``(0, parola)`` when the word itself is in the collection,
        otherwise ``(distance, word)`` for the first word with the smallest
        edit distance, or ``(sys.maxsize, None)`` for an empty collection.
    """
    if parola in insieme:
        return 0, parola

    best_distance, best_word = sys.maxsize, None
    for candidate in insieme:
        candidate_distance = distance(parola, candidate)
        # Strict comparison: the earliest of equally close words is kept.
        if candidate_distance < best_distance:
            best_distance, best_word = candidate_distance, candidate
    return best_distance, best_word
def test(model, pe, TestSet, max_len, make_mask, class_num, converter, error_analysis=True):
    """Evaluate ``model`` on ``TestSet`` and print character/word accuracy.

    Predictions are cut at the first ``'[s]'`` marker. CCR is the mean of
    ``1 - edit_distance / max(len(text), len(label))`` over the data set and
    WCR the share of exactly matching samples, both in percent rounded to two
    decimals.

    Args:
        model: Model to evaluate; switched to eval mode.
        pe, max_len, make_mask, class_num, converter: Passed through to
            ``predict`` unchanged.
        TestSet: Data set yielding ``(images, labels)`` batches through a
            ``DataLoader``; must expose its size as ``TestSet.len``.
        error_analysis: When true, images are passed through ``norm`` before
            prediction and every misread image is saved under
            ``./error_analysis_TwoDAttention/``.

    Returns:
        The CCR percentage.
    """
    model.eval()
    loader = DataLoader(TestSet,
                        batch_size=250,
                        shuffle=False,
                        num_workers=8,
                        pin_memory=True,
                        drop_last=False)
    CCR = 0.0
    n_wrong = 0  # samples whose prediction differs from the label
    start = time.time()
    for images, labels in loader:
        if error_analysis:
            images_norm = torch.stack([norm(item) for item in images], dim=0)
        else:
            images_norm = images
        texts = predict(model, pe, images_norm, max_len, make_mask, class_num,
                        converter)
        for i, (text, label) in enumerate(zip(texts, labels)):
            # Keep everything before the end marker. Slicing with
            # text.find('[s]') dropped the last character whenever the marker
            # was missing (find() returns -1).
            text = text.split('[s]', 1)[0]
            if text != label:
                n_wrong += 1
                if error_analysis:
                    try:
                        name = label + " " + text + ".jpg"
                        ToPIL(images[i]).save(
                            "./error_analysis_TwoDAttention/" + name)
                    except Exception:
                        # Best-effort debugging dump: a file that cannot be
                        # written must not abort the evaluation.
                        pass
            try:
                longest = max(len(text), len(label))
                if longest == 0:
                    # Two empty strings are a perfect match.
                    NED = 1.0
                else:
                    NED = 1 - editdistance.distance(text, label) / longest
                CCR += NED
            except Exception:
                # Samples that cannot be scored are skipped, as before.
                pass
    CCR /= TestSet.len
    CCR = round(100 * CCR, 2)
    WCR = (TestSet.len - n_wrong) / TestSet.len
    WCR = round(100 * WCR, 2)
    end = time.time()
    print(f"CCR:{CCR}%;WCR:{WCR}%;time consumed:{time_interval(end-start)}")
    torch.cuda.empty_cache()
    return CCR
def tree_insertion(tree, word):
    """Insert ``word`` into a tree keyed by edit distance.

    ``tree`` is a list whose first element is the root node. Every node has
    the form ``[word, children]`` where ``children`` maps the edit distance
    between the node's word and a child's word to that child node.

    Args:
        tree: List holding the root node; an empty list receives ``word`` as
            its root.
        word: Word to insert.
    """
    if len(tree) == 0:
        tree.append([word, {}])
        return

    current = tree[0]
    while current is not None:
        key = editdistance.distance(word, current[0])
        children = current[1]
        # Descend along the edge labelled with this distance, if any.
        current = children.get(key)
        if not current:
            # Free slot: the new word becomes a leaf here.
            children[key] = [word, {}]
def compute_norm_wer(pred_tokens, tgt_tokens, split_token=' '):
    # type: (List[str], List[str], str) -> float
    """Return the word error rate of ``pred_tokens`` against ``tgt_tokens``.

    Both token lists are grouped into words with ``split_by_token`` and each
    group is concatenated into one string. The edit distance between the two
    word lists is divided by the number of target words.
    """
    hypothesis = list(map(''.join, split_by_token(pred_tokens, split_token)))
    reference = list(map(''.join, split_by_token(tgt_tokens, split_token)))
    return ed.distance(hypothesis, reference) / len(reference)
def update(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
    """Accumulate the per-sample normalised edit distance of one batch.

    Args:
        preds: Batch of predicted token ids, indexed along dimension 0.
        targets: Batch of target token ids, paired with ``preds`` by row.

    ``self.error`` grows by ``distance / max(len(pred), len(target))`` for
    every sample and ``self.total`` by the batch size.
    """
    N = preds.shape[0]
    for ind in range(N):
        # Drop the ignored tokens,
        # e.g. [0, 2, 1] -> [2] (presumably with 0 and 1 ignored)
        pred = [
            tok for tok in preds[ind].tolist() if tok not in self.ignore_tokens
        ]
        target = [
            tok for tok in targets[ind].tolist()
            if tok not in self.ignore_tokens
        ]
        denominator = max(len(pred), len(target))
        if denominator == 0:
            # Nothing left on either side: identical sequences, zero error.
            # Dividing here used to raise ZeroDivisionError.
            continue
        # Number of add / edit / delete operations needed to turn one
        # sequence into the other.
        distance = editdistance.distance(pred, target)
        error = distance / denominator
        # NOTE(review): original author asked whether self.error is
        # initialised to 0 — confirm in the metric's constructor.
        self.error = self.error + error
    self.total = self.total + N
def test_swap(self):
    """Check the 'swap' mode of ``typos_trans``.

    One perturbed sample is requested: its text must differ from the original
    by an edit distance of at most 4. An empty text yields no samples and a
    text made of symbols and digits still yields one.
    """
    typos_trans.mode = 'swap'
    generated = typos_trans.transform(data_sample, n=1)
    self.assertTrue(1 == len(generated))

    for sample in generated:
        perturbed_text = sample.get_text('x')
        original_text = data_sample.get_text('x')
        self.assertTrue(perturbed_text != original_text)
        self.assertTrue(
            editdistance.distance(perturbed_text, original_text) <= 4)

    # Empty input: nothing can be swapped.
    empty_sample = SASample({'x': '', 'y': "negative"})
    self.assertEqual([], typos_trans.transform(empty_sample))

    # Symbols and digits only: one sample is still expected.
    symbol_sample = SASample({'x': '~!@#$%^7890"\'', 'y': "negative"})
    self.assertEqual(1, len(typos_trans.transform(symbol_sample)))
def dist(self, o):
    """Return the edit distance between the string forms of two data arrays.

    Entries equal to ``self.nanValue`` are neutralised first: on copies of
    the data, each such entry takes the other object's value at the same
    position, so a missing value cannot itself create a difference.

    NOTE(review): the comparison runs on ``str()`` of the arrays. If these
    are NumPy arrays (``.copy()`` and mask indexing suggest so), large arrays
    are abbreviated with ``...`` when printed — confirm this is intended.

    Args:
        o: Another object exposing ``.data``.

    Returns:
        ``editdistance.distance`` of the two string representations.
    """
    import editdistance

    mine = self.data.copy()
    theirs = o.data.copy()

    # neutralize NaNs: first fill our gaps from the other side ...
    missing_in_mine = mine == self.nanValue
    mine[missing_in_mine] = theirs[missing_in_mine]
    # ... then fill their gaps from our (already patched) copy.
    missing_in_theirs = theirs == self.nanValue
    theirs[missing_in_theirs] = mine[missing_in_theirs]

    return editdistance.distance(str(mine), str(theirs))
def find_match(source_word):
    """Finds the best match for a source word

    Every line of ``common_words.txt`` (trailing whitespace removed) is
    compared with ``source_word``; one of the words with the smallest edit
    distance is picked with ``choice``.

    Args:
        source_word: Word to correct.

    Returns:
        One of the closest words from the file.

    Raises:
        IndexError: If the file contains no lines (``choice`` on an empty
            list).
    """
    # Start from infinity so that every word is a candidate; the former
    # sentinel of 100 silently discarded words further away than that.
    min_dist = float('inf')
    optimal_words = []
    # The context manager closes the file, which used to be left open.
    with open('common_words.txt', 'r') as target_file:
        for line in target_file:
            target_word = line.rstrip()
            # Compute the distance once per word instead of up to three times.
            dist = distance(source_word, target_word)
            if dist < min_dist:
                min_dist = dist
                # re-initialize the list, with only this word as a possible correction
                optimal_words = [target_word]
            elif dist == min_dist:
                # Add this word to the list
                optimal_words.append(target_word)
    return choice(optimal_words)
def compute_global_wer(pred_tokens, tgt_tokens, split_token=' '):
    # type: (List[str], List[str], str) -> Tuple[int, int]
    """Return the word-level edit distance and the number of target words.

    Both token lists are grouped into words with ``split_by_token`` and each
    group is concatenated into one string before the lists are compared.
    """
    def to_words(tokens):
        return [
            ''.join(word_tokens)
            for word_tokens in split_by_token(tokens, split_token)
        ]

    pred_words = to_words(pred_tokens)
    tgt_words = to_words(tgt_tokens)
    return ed.distance(pred_words, tgt_words), len(tgt_words)
def find_string_distances(embeddings, words, phoc_levels):
    '''
    Map every row of `embeddings` back to a word and measure how far it is
    from the actual word.

    embeddings   2d array with one row per word (its first dimension is the
                 number of rows processed)
    words        sequence of the actual words, one per embedding row
    phoc_levels  list of levels, passed to `predict_word_from_embedding`
                 together with the row and the length of the actual word

    returns      list with the edit distance between each actual word and
                 the word mapped from its embedding
    '''
    mapped_words = [
        predict_word_from_embedding(embeddings[row], phoc_levels,
                                    len(words[row]))
        for row in range(embeddings.shape[0])
    ]
    return [
        ed.distance(word, mapped_words[pos])
        for pos, word in enumerate(words)
    ]