def process_rel_candidate_for_drop_led(relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_rel):
    simple_sentence = " ".join(simple_sentences)
    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())
    temp_nodeset, temp_filtered_mod_pos = boxer_graph.drop_relation(nodeset, relnode_candidate, filtered_mod_pos)
    sentence_after_drop = boxer_graph.extract_main_sentence(temp_nodeset, main_sent_dict, temp_filtered_mod_pos)
    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())
    isDrop = compare_edit_distance(opr_drop_rel, edit_dist_after_drop, edit_dist_before_drop)
    return isDrop
def process_ood_candidate_for_drop_led(oodnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_ood):
    simple_sentence = " ".join(simple_sentences)
    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())
    temp_nodeset = nodeset[:]
    temp_nodeset.remove(oodnode_candidate)
    sentence_after_drop = boxer_graph.extract_main_sentence(temp_nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())
    isDrop = compare_edit_distance(opr_drop_ood, edit_dist_after_drop, edit_dist_before_drop)
    return isDrop
def make_compatible(input_str):
    # NOTE: relies on module-level `rer_out`, `allprods` and `dist`;
    # the `input_str` argument is unused in the original.
    for i in range(len(rer_out['taglist'])):
        if rer_out['taglist'][i] == "Org":
            for j in allprods:
                if dist.edit_distance(rer_out['wordlist'][i], j) < 2:
                    rer_out['wordlist'][i] = j
                    break
        if rer_out['taglist'][i] == "Family":
            for j in allprods:
                for k in allprods[j]:
                    if dist.edit_distance(rer_out['wordlist'][i], k) < 4:
                        rer_out['wordlist'][i] = k
                        break
def process_mod_candidate_for_drop_led(modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_mod):
    simple_sentence = " ".join(simple_sentences)
    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())
    modcand_position_to_process = modcand_to_process[0]
    temp_filtered_mod_pos = filtered_mod_pos[:] + [modcand_position_to_process]
    sentence_after_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, temp_filtered_mod_pos)
    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())
    isDrop = compare_edit_distance(opr_drop_mod, edit_dist_after_drop, edit_dist_before_drop)
    return isDrop
def one2ArrEditDistance(self, sen, arr):
    score = []
    for l in arr:
        score.append(edit_distance(sen, l))
    if len(score) != 0:
        return sum(score) * 1.0 / len(score)
    return 0
def levenshtein_sort(self, keyword, domains):
    """
    Sort domains by Levenshtein edit-distance

    :param keyword: str input source
    :param domains: domains list
    :rtype: list
    :return: sorted names list
    """
    # distance counter
    # transpositions - ab == ba
    distance = lambda s, d: edit_distance(s, d, transpositions=True)
    # remove zone
    get_str = lambda domain: re.sub('([.][a-z]{2,4})+$', '', domain)
    domains = list(map(get_str, domains))  # list() so the in-place sort below works on Python 3
    # Sorter (bubble sort on distance to the keyword)
    for i in range(len(domains)):
        for j in range(len(domains) - 1):
            if (distance(keyword, get_str(domains[j])) >
                    distance(keyword, get_str(domains[j + 1]))):
                tmp = copy(domains[j + 1])
                domains[j + 1] = domains[j]
                domains[j] = tmp
    return domains
def mean_char_edit_distance(candidates, references):
    total_distance = 0
    total_target_length = 0
    for y, t in zip(candidates, references):
        total_distance += edit_distance(y, t)
        total_target_length += len(t)
    return total_distance / total_target_length
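# A minimal usage sketch for mean_char_edit_distance above (hypothetical
# data; only nltk's edit_distance is assumed to be importable):
from nltk.metrics.distance import edit_distance

candidates = ["kitten", "flaw"]
references = ["sitting", "lawn"]
# edit_distance("kitten", "sitting") = 3 and edit_distance("flaw", "lawn") = 2,
# so the mean char edit distance is (3 + 2) / (7 + 4) ≈ 0.4545
print(mean_char_edit_distance(candidates, references))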
def __init__(self):
    self.stemmer = LancasterStemmer()
    self.stem_mapping = {}
    self.stemmed_trie = TrieNode()
    self.trie = TrieNode()
    self.singles_lst = []
    self.black_listed_stems = set([])
    loaded = cPickle.load(open(DICTIONARY, 'r'))
    print len(loaded)
    loaded += CUSTOM
    loaded = set(loaded)
    most_common = cPickle.load(open(MOST_COMMON, 'r'))
    for word in most_common:
        self.black_listed_stems.add(self.stem(word))
    #print self.black_listed_stems
    for word in loaded:
        word = word.lower()
        if word not in most_common[:TOP_K_FILTER]:
            self.trie.insert(word)
            stemmed_word = self.stem(word)
            if stemmed_word in self.stem_mapping:
                previous = self.stem_mapping[stemmed_word]
                edist = distance.edit_distance(word, previous)
                if edist > 2:
                    pass
                    #print 'warning: %s dropped in favor of %s' % (word, previous)
            else:
                if stemmed_word not in self.black_listed_stems:
                    self.stem_mapping[stemmed_word] = word
                    self.stemmed_trie.insert(stemmed_word)
def select_anagrams(token, structures):
    """Select possible anagrams for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    anagrams = {}
    focus_alphabet = generate_alphabet_from_word(token[1])
    token_hash = anagram_hash(token)
    hash_list = []
    for c in structures["alphabet"]:
        for f in focus_alphabet:
            hash_list.append(token_hash + c - f)
    hash_counter = Counter(hash_list)  # Counting retrieval occurence
    for h in set(hash_counter.keys()).intersection(set(structures["anagrams"].keys())):
        count = hash_counter[h]
        anag_list = [anag for anag in structures["anagrams"][h]
                     if edit_distance(anag, token) <= 3]
        for anag in anag_list:
            anag_score = rate_anagram(structures["occurence_map"], token, anag, count)
            if anag_score > 0:
                anagrams[anag] = anag_score
    return anagrams
def get_candidates(self, word, D=1):
    """If word is in lexicon returns [(word, 1.0)].
    Otherwise returns a list with all the words in the lexicon that have a
    distance of D or less (D is the Levenshtein edit-distance).
    If there is no such word, returns [(word, 0.0)]
    """
    word = word.lower()
    if word in self.fdist:
        return [(word, 1.0)]
    candidates = []
    counts = []
    for w, c in self.fdist.iteritems():
        if edit_distance(w, word) <= D:
            candidates.append(w)
            counts.append(c)
    if len(candidates) == 0:
        candidates.append(word)
        counts.append(0)
    probs = [float(c) / self.wcount for c in counts]
    return sorted(zip(candidates, probs), key=lambda x: x[1], reverse=True)
def get_geonames_code(m):
    lat = session.scalar(m._geo_ponto.y)
    lon = session.scalar(m._geo_ponto.x)
    places = geonames_reverse(lat, lon)
    for place in places:
        nome1 = m.nome.strip().lower()
        nome2 = place[u'name'].strip().lower()
        if edit_distance(nome1, nome2) < 2:
            return int(place[u'geonameId'])
def replace(self, word):
    suggestions = self.spell_dict.suggest(word)
    if suggestions:
        for suggestion in suggestions:
            if edit_distance(word, suggestion) <= self.max_dist:
                # return the suggestion that passed the distance check;
                # the original returned suggestions[0] regardless of which
                # suggestion matched
                return suggestion
    return word
def strip_synonyms(output_set, exclude_set):
    # Remove synonyms that have Levenshtein distance of 1, AFTER removing plurals.
    for word in output_set:
        for synset in wn.synsets(word):
            for synonym in synset.lemma_names():
                if edit_distance(word, synonym) == 1:
                    exclude_set.add(synonym)
    output_set.difference_update(exclude_set)
    return output_set, exclude_set
def close_enough_buckets(first_bucket, second_bucket, dist):
    if first_bucket == second_bucket:
        return False
    elif edit_distance(first_bucket, second_bucket) <= dist:
        return True
    else:
        return False
def eval(references):
    string_distances = {'siddharthan': [], 'bayes_no_variation': [], 'bayes_variation': []}
    jaccard_distances = {'siddharthan': [], 'bayes_no_variation': [], 'bayes_variation': []}
    for reference in references:
        print reference
        string_distances['siddharthan'].append(edit_distance(reference['original'], reference['siddharthan']))
        string_distances['bayes_no_variation'].append(edit_distance(reference['original'], reference['bayes_no_variation']))
        string_distances['bayes_variation'].append(edit_distance(reference['original'], reference['bayes_variation']))
        # jaccard_distances['siddharthan'].append(jaccard_distance(reference['original'], reference['siddharthan']))
        # jaccard_distances['bayes_no_variation'].append(jaccard_distance(reference['original'], reference['bayes_no_variation']))
        # jaccard_distances['bayes_variation'].append(jaccard_distance(reference['original'], reference['bayes_variation']))
    print 'String distances: '
    print 'siddharthan: ', mean_confidence_interval(string_distances['siddharthan'])
    print 'bayes_no_variation: ', mean_confidence_interval(string_distances['bayes_no_variation'])
    print 'bayes_variation: ', mean_confidence_interval(string_distances['bayes_variation'])
    print 10 * '-'
def model_evaluate(model, d, gt):
    # the original referenced an undefined `model_h` here; using the
    # `model` parameter appears to be the intent
    model.add_data(d, trunc=25)
    inf = model.states_list[0].stateseq
    inf = list(inf)
    dist = edit_distance(gt, inf)
    s_gt, s_inf = set(gt), set(inf)
    iou = len(s_gt.intersection(s_inf)) / len(s_gt.union(s_inf))
    return dist, iou
def get_X(lines, features, cache):
    if cache is None:
        cache = {}
    tokenizer = RegexpTokenizer(r'[a-z]+')
    X = []
    for line1 in lines:
        vector = []
        for line2 in lines:
            vector.append(edit_distance(line1, line2) / max(len(line1), len(line2)))
        max_v = max(vector)
        for i in range(len(vector)):
            vector[i] = vector[i] / max_v
        syn_dist = {}
        for word in features:
            syn_dist[word] = 1
        for word in set(tokenizer.tokenize(line1.lower())):
            if word in stopwords.words('english'):
                continue
            for word2 in features:
                if len(wn.synsets(word)) == 0 or len(wn.synsets(word2)) == 0:
                    continue
                else:
                    if word not in cache:
                        cache[word] = {}
                    if word2 not in cache[word]:
                        similarity = [w1.wup_similarity(w2)
                                      for w1 in wn.synsets(word, pos=wn.NOUN) + wn.synsets(word, pos=wn.VERB)
                                      for w2 in wn.synsets(word2, pos=wn.NOUN) + wn.synsets(word2, pos=wn.VERB)]
                        similarity = [s for s in similarity if s]
                        if len(similarity) != 0:
                            cache[word][word2] = max(similarity)
                        else:
                            cache[word][word2] = None
                        #cache[word][word2] = wn.synsets(word)[0].path_similarity(wn.synsets(word2)[0])
                        #cache[word][word2] = wn.synsets(word)[0].wup_similarity(wn.synsets(word2)[0])
                    if not cache[word][word2]:
                        continue
                    dist = 1 - cache[word][word2]
                    if dist < syn_dist[word2]:
                        syn_dist[word2] = dist
        for word in features:
            vector.append(syn_dist[word])
        X.append(vector)
    return X, cache
def _match_by_edit_distance(full_text, text_to_match):
    text_to_match = text_to_match.replace("-LRB-", "(").replace("-RRB-", ")")
    text_to_match = text_to_match.replace("-LCB-", "{").replace("-RCB-", "}")
    text_to_match = re.sub(r'\[\\\]\\\)\]$', ')', text_to_match)
    try:
        end_point = (text_to_match.index(" ") if " " in text_to_match else len(text_to_match))
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)]
                             for m in re.finditer(re.escape(text_to_match[0:end_point]),
                                                  full_text, re.U | re.I)]
    except:
        import sys
        print(full_text)
        print()
        print(text_to_match)
        sys.exit(1)
    if len(potential_matches) == 0:
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)]
                             for m in re.finditer(re.escape(text_to_match[0]), full_text, re.U)]
    if len(potential_matches) == 0:
        text_to_match = text_to_match.replace("(", "[")
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)]
                             for m in re.finditer(re.escape(text_to_match[0]), full_text, re.U)]
    potential_matches = [(p[0:p.rindex(text_to_match[-1]) + 1]
                          if text_to_match[-1] in p and len(p) > len(text_to_match)
                          else p)
                         for p in potential_matches]
    if len(potential_matches) == 0:
        # No idea why this would ever happen, but it does
        return text_to_match
    match_with_lowest_edit_distance = ""
    lowest_edit_distance = -1
    for match in potential_matches:
        e_d = edit_distance(match, text_to_match)
        if lowest_edit_distance == -1 or e_d <= lowest_edit_distance:
            lowest_edit_distance = e_d
            match_with_lowest_edit_distance = match
    result = match_with_lowest_edit_distance.strip()
    if text_to_match[-1] in result:
        while result[-1] != text_to_match[-1]:
            result = result[0:-1]
    elif text_to_match[-1] == '"' and re.search(r'["”\u201d]', result):
        while result[-1] not in ['"', '”', "\u201d"]:
            result = result[0:-1]
    elif text_to_match[-1] not in [']', '}', ')'] and text_to_match[-2:] != "..":
        while result[-1] != text_to_match[-1]:
            result += full_text[full_text.index(result) + len(result)][-1]
    return result
def probs_metric(inverse=False):
    rand_p = Vec2(random() * table.width + table.min_point.x,
                  random() * table.height + table.min_point.y)
    try:
        bestmeaning, bestsentence = generate_sentence(rand_p, False, scene, speaker,
                                                      usebest=True, golden=inverse, printing=printing)
        sampled_landmark, sampled_relation = bestmeaning.args[0], bestmeaning.args[3]
        golden_posteriors = get_all_sentence_posteriors(bestsentence, meanings,
                                                        golden=(not inverse), printing=printing)
        # lmk_prior = speaker.get_landmark_probability(sampled_landmark, landmarks, PointRepresentation(rand_p))[0]
        all_lmk_probs = speaker.all_landmark_probs(landmarks,
                                                   Landmark(None, PointRepresentation(rand_p), None))
        all_lmk_probs = dict(zip(landmarks, all_lmk_probs))
        lmk_prior = all_lmk_probs[sampled_landmark]
        head_on = speaker.get_head_on_viewpoint(sampled_landmark)
        rel_prior = speaker.get_probabilities_points(np.array([rand_p]), sampled_relation,
                                                     head_on, sampled_landmark)
        lmk_post = golden_posteriors[sampled_landmark]
        rel_post = golden_posteriors[sampled_relation]
        ps = np.array([golden_posteriors[lmk] * golden_posteriors[rel] for lmk, rel in meanings])
        rank = None
        for i, p in enumerate(ps):
            lmk, rel = meanings[i]
            # logger( '%f, %s' % (p, m2s(lmk,rel)))
            head_on = speaker.get_head_on_viewpoint(lmk)
            # ps[i] *= speaker.get_landmark_probability(lmk, landmarks, PointRepresentation(rand_p))[0]
            ps[i] *= all_lmk_probs[lmk]
            ps[i] *= speaker.get_probabilities_points(np.array([rand_p]), rel, head_on, lmk)
            if lmk == sampled_landmark and rel == sampled_relation:
                idx = i
        ps += epsilon
        ps = ps / ps.sum()
        prob = ps[idx]
        rank = sorted(ps, reverse=True).index(prob)
        entropy = entropy_of_probs(ps)
    except (ParseError, RuntimeError) as e:
        logger(e)
        lmk_prior = 0
        rel_prior = 0
        lmk_post = 0
        rel_post = 0
        prob = 0
        rank = len(meanings) - 1
        entropy = 0
        distances = [[None]]

    head_on = speaker.get_head_on_viewpoint(sampled_landmark)
    all_descs = speaker.get_all_meaning_descriptions(trajector, scene, sampled_landmark,
                                                     sampled_relation, head_on, 1)
    distances = []
    for desc in all_descs:
        distances.append([edit_distance(bestsentence, desc), desc])
    distances.sort()
    return lmk_prior, rel_prior, lmk_post, rel_post, \
        prob, entropy, rank, distances[0][0], type(sampled_relation)
def validate_password_dictionary(value):
    """
    Ensures that the password is not too similar to a defined set of dictionary words
    """
    password_max_edit_distance = getattr(settings, "PASSWORD_DICTIONARY_EDIT_DISTANCE_THRESHOLD", None)
    password_dictionary = getattr(settings, "PASSWORD_DICTIONARY", None)
    if password_max_edit_distance and password_dictionary:
        for word in password_dictionary:
            distance = edit_distance(value, word)
            if distance <= password_max_edit_distance:
                raise ValidationError(_("Too similar to a restricted dictionary word."),
                                      code="dictionary_word")
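# Hedged sanity check of the threshold logic above: with a hypothetical
# PASSWORD_DICTIONARY_EDIT_DISTANCE_THRESHOLD of 1 and "password" in
# PASSWORD_DICTIONARY, a value like "passw0rd" would be rejected.
from nltk.metrics.distance import edit_distance
assert edit_distance("passw0rd", "password") == 1  # one substitution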
def areExpansionsSimilar(expansion_1, expansion_2):
    expansion_1 = expansion_1.lower().replace(u"-", u" ")
    expansion_2 = expansion_2.lower().replace(u"-", u" ")
    #numActualWords = len(expansion_1)
    #numPredictedWords = len(expansion_2)
    if (expansion_1 == expansion_2
            or AcronymExpansion.startsSameWay(expansion_1, expansion_2)
            or edit_distance(expansion_1, expansion_2) <= 2):  # max(numActualWords, numPredictedWords)):
        return True
    return False
def run(self, token):
    try:
        spellchk = self.correct(token.lower(), self.lWords)
        if spellchk == token and spellchk not in self.lWords[token[0]]:
            return "#" + token
        else:
            if edit_distance(token, spellchk) <= 2:
                return spellchk
            else:
                return "#" + token
    except:
        return "#" + token
def attributeMatches(val1, val2, mode="exact", threshold=0):
    if mode == "ignore":
        return True
    if mode == "exact":
        return val1 is not None and val2 is not None and \
               val1 == val2
    if mode == "fuzzy":
        return val1 is not None and val2 is not None and \
               abs(len(val1) - len(val2)) <= threshold and \
               edit_distance(val1, val2) <= threshold
    if mode == "do not differ":
        return val1 is None or val2 is None or val1 == val2
    return False
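# Minimal sketch of the "fuzzy" mode above: the lengths may differ by at
# most `threshold`, and the edit distance must also stay within it.
print(attributeMatches("color", "colour", mode="fuzzy", threshold=1))   # True
print(attributeMatches("color", "colours", mode="fuzzy", threshold=1))  # False (length gap is 2)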
def matchWord(tokens, words):
    s = Set(tokens).intersection(Set(words))
    if len(s) > 0:
        return [(w, 1.0, w) for w in s]
    else:
        from nltk.metrics import distance
        import operator
        result = []
        for token in Set(tokens):
            vals = {w: (1.0 - float(distance.edit_distance(token, w)) / float(max(len(token), len(w))))
                    for w in words}
            sortedvals = sorted(vals.iteritems(), key=operator.itemgetter(1), reverse=True)
            result.append((token, sortedvals[0][1], sortedvals[0][0]))
        return sorted(result, key=lambda tup: tup[1], reverse=True)
def update_summary(qa_pairs, baidu_data):
    assert qa_pairs is not None
    assert baidu_data is not None
    # load qa pairs
    reader = open(qa_pairs, 'r')
    pairs = []
    for line in reader:
        # print line
        words = line.split(' ')
        first = words[0]
        second = words[-1]
        pairs.append([first, second])
    print 'number of pairs loaded:', len(pairs)
    s1 = u'辛弃疾的名作《永遇乐.京口北固亭怀古》中”凭谁问,廉颇老矣“的下一句是什么?'
    s2 = u'辛弃疾的名作《永遇乐.京口北固亭怀古》中”凭谁问,廉颇老矣“的下一句是什么?'
    print edit_distance(s1, s2)
    # print pairs[0][0]
    # parse tree and update its summary
    tree = ET.parse(baidu_data)
    root = tree.getroot()
    for question in root.getchildren():
        q_text = question.findall('q')[0].text.strip()
        cnt = 0
        for p in pairs:
            print edit_distance(q_text, p[0])
            if edit_distance(q_text, p[0]) < 3:
                # Element.remove() takes an Element, not a tag name;
                # the original passed the string 'summary'
                old = question.find('summary')
                if old is not None:
                    question.remove(old)
                ans = ET.SubElement(question, 'summary')
                ans.text = p[1]
                break
        break
    xml_string = ET.tostring(root, encoding='utf-8')
    result = xml.dom.minidom.parseString(xml_string)
    # print result.toprettyxml()
    return
def average_edit_distance(n):
    import nltk.metrics.distance as d
    edits = [e["noteText"] for e in activity_logs_for_note(n) if e["action"] == "note-save"]
    ##//ActivityLog.objects.filter(noteid=n["id"],action="note-edit").order_by("when").values_list("noteText")
    distances = []
    if len(edits) > 1:
        #print "edits: %s " % repr(edits)
        for i in range(0, len(edits) - 1):
            if edits[i] is None or edits[i + 1] is None:
                continue
            distances.append(d.edit_distance(edits[i], edits[i + 1]))
    if len(distances) > 0:
        return make_feature("edit_distance", median(distances))
    return make_feature('edit_distance', MISSING)
def group_worlds(tags: List[str], tokens: List[str]) -> Dict[str, List[str]]:
    spans = from_bio(tags, 'world')
    with_strings = [(" ".join(tokens[i:j]), i, j) for i, j in spans]
    with_strings.sort(key=lambda x: len(x[0]), reverse=True)
    substring_groups: List[List[Tuple[str, int, int]]] = []
    ambiguous = []
    for string, i, j in with_strings:
        found = None
        for group_index, group in enumerate(substring_groups):
            for string_g, _, _ in group:
                if string in string_g:
                    if found is None:
                        found = group_index
                    elif found != group_index:
                        found = -1  # Found multiple times
        if found is None:
            substring_groups.append([(string, i, j)])
        elif found >= 0:
            substring_groups[found].append((string, i, j))
        else:
            ambiguous.append((string, i, j))
    nofit = []
    if len(substring_groups) > 2:
        substring_groups.sort(key=len, reverse=True)
        for extra in substring_groups[2:]:
            best_distance = 999
            best_index = None
            string = extra[0][0]  # Use the longest string
            for index, group in enumerate(substring_groups[:2]):
                for string_g, _, _ in group:
                    distance = edit_distance(string_g, string)
                    if distance < best_distance:
                        best_distance = distance
                        best_index = index
            # Heuristics for "close enough"
            if best_index is not None and best_distance < len(string) - 1:
                substring_groups[best_index] += extra
            else:
                nofit.append(extra)
    else:
        substring_groups += [[("N/A", 999, 999)]] * 2  # padding
    substring_groups = substring_groups[:2]
    # Sort by first occurrence
    substring_groups.sort(key=lambda x: min([y[1] for y in x]))
    world_dict = {}
    for index, group in enumerate(substring_groups):
        world_strings = delete_duplicates([x[0] for x in group])
        world_dict['world' + str(index + 1)] = world_strings
    return world_dict
def select_lower_edit_distance(ref_word, word_list):
    """Get the word(s) with the lowest edit distance

    Parameters:
        ref_word (:func:`str`): Word to correct
        word_list (list): List of proposals

    Returns:
        list - All words tied for the lowest edit distance
    """
    word_dict = {word: edit_distance(ref_word, word) for word in word_list}
    min_dist = min(word_dict.values())
    return [word for word, dist in word_dict.items() if dist == min_dist]
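# Minimal usage sketch for select_lower_edit_distance (hypothetical inputs):
print(select_lower_edit_distance("helo", ["hello", "helios", "halo"]))
# distances are 1, 2, 1 -> ['hello', 'halo'] (all ties are returned, in input order)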
def select_ocrsims(token, structures):
    """Select similar words for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values)
    """
    delta = 2
    ocr_sims = {}
    word_hash = ocr_key_hash(token)
    sim_hash_list = {}  # Using a dictionary avoids multiple entries if a key is retrieved twice
    key_index = -1
    # for (key, value) in word_hash:
    for key, value in word_hash:
        key_index += 1
        sim_hash = deepcopy(word_hash)
        for d in range(-delta, delta + 1):
            if d != 0:
                card = max(int(value) + d, 1)
                sim_hash[key_index] = (key, card)
                # Rebuild OCR key string
                sim_hash_str = ""
                for k, v in sim_hash:
                    sim_hash_str += k + str(v)
                if sim_hash_str in structures["ocrkeys"]:
                    card_diff = abs(int(value) - card)
                    sim_hash_list[sim_hash_str] = [(sim_word, card_diff)
                                                   for sim_word in structures["ocrkeys"][sim_hash_str]
                                                   if edit_distance(sim_word, token) <= 2]
    for sim_hash_str, sim_list in sim_hash_list.items():
        for sim_word, card_diff in sim_list:
            sim_score = rate_ocr_key(structures["occurence_map"], token, sim_word, card_diff)
            if sim_score > 0:
                ocr_sims[sim_word] = sim_score
    return ocr_sims
def strip_words(text, strip_this):
    """ if text starts with something that looks like strip_this then strip it """
    letters = functools.partial(re.sub, '[^a-z]+', '')
    ltext = letters(text.lower())
    lstrip = letters(strip_this.lower())
    best = (4, 0, '')
    for i in range(1, 1 + min(50, len(ltext), len(lstrip))):
        a, b = ltext[:i], lstrip[:i]
        score = (float(edit_distance(a, b)) / len(b), -len(b), b)
        if score < best:
            best = score
    density, length, letters = best
    if best[0] > 0.1 or abs(length) < min(len(lstrip), 10):
        return text
    return strip_letters(text, letters).lstrip(' .?;:()-\t\n')
from nltk.metrics.distance import edit_distance

def my_edit_distance(str1, str2):
    m = len(str1) + 1
    n = len(str2) + 1
    table = {}
    for i in range(m):
        table[i, 0] = i
    for j in range(n):
        table[0, j] = j
    for i in range(1, m):
        for j in range(1, n):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            table[i, j] = min(table[i, j - 1] + 1,
                              table[i - 1, j] + 1,
                              table[i - 1, j - 1] + cost)
    # index with m-1, n-1 instead of the leaked loop variables, so empty
    # inputs don't leave i/j undefined
    return table[m - 1, n - 1]

print("Our Algorithm :", my_edit_distance("hand", "and"))
print("NLTK Algorithm :", edit_distance("hand", "and"))
def _get_features(self, s_ent, t_ent):
    """
    compute all LR model features
    :param s_ent:
    :param t_ent:
    :return:
    """
    s_name_tokens, s_stem_tokens, s_lemm_tokens, s_char_tokens, s_alias_tokens, s_def_tokens = self._compute_tokens(s_ent)
    t_name_tokens, t_stem_tokens, t_lemm_tokens, t_char_tokens, t_alias_tokens, t_def_tokens = self._compute_tokens(t_ent)

    has_same_canonical_name = (s_name_tokens == t_name_tokens)
    has_same_stemmed_name = (s_stem_tokens == t_stem_tokens)
    has_same_lemmatized_name = (s_lemm_tokens == t_lemm_tokens)
    has_same_char_tokens = (s_char_tokens == t_char_tokens)
    has_alias_in_common = (len(set(s_alias_tokens).intersection(set(t_alias_tokens))) > 0)

    # initialize similarity features
    name_token_jaccard_similarity = 1.0
    inverse_name_token_edit_distance = 1.0
    name_stem_jaccard_similarity = 1.0
    inverse_name_stem_edit_distance = 1.0
    name_lemm_jaccard_similarity = 1.0
    inverse_name_lemm_edit_distance = 1.0
    name_char_jaccard_similarity = 1.0
    inverse_name_char_edit_distance = 1.0

    # jaccard similarity and token edit distance
    max_changes = len(s_name_tokens) + len(t_name_tokens)
    max_char_changes = len(s_char_tokens) + len(t_char_tokens)
    if not has_same_canonical_name:
        name_token_jaccard_similarity = string_utils.get_jaccard_similarity(
            set(s_name_tokens), set(t_name_tokens))
        inverse_name_token_edit_distance = 1.0 - edit_distance(
            s_name_tokens, t_name_tokens) / max_changes
    if not has_same_stemmed_name:
        name_stem_jaccard_similarity = string_utils.get_jaccard_similarity(
            set(s_stem_tokens), set(t_stem_tokens))
        inverse_name_stem_edit_distance = 1.0 - edit_distance(
            s_stem_tokens, t_stem_tokens) / max_changes
    if not has_same_lemmatized_name:
        name_lemm_jaccard_similarity = string_utils.get_jaccard_similarity(
            set(s_lemm_tokens), set(t_lemm_tokens))
        inverse_name_lemm_edit_distance = 1.0 - edit_distance(
            s_lemm_tokens, t_lemm_tokens) / max_changes
    if not has_same_char_tokens:
        name_char_jaccard_similarity = string_utils.get_jaccard_similarity(
            set(s_char_tokens), set(t_char_tokens))
        inverse_name_char_edit_distance = 1 - edit_distance(
            s_char_tokens, t_char_tokens) / max_char_changes

    max_alias_token_jaccard = 0.0
    min_alias_edit_distance = 1.0
    best_s_alias = s_ent['aliases'][0]
    best_t_alias = t_ent['aliases'][0]
    if not has_alias_in_common:
        for s_ind, s_a_tokens in enumerate(s_alias_tokens):
            for t_ind, t_a_tokens in enumerate(t_alias_tokens):
                if s_a_tokens and t_a_tokens:
                    j_ind = string_utils.get_jaccard_similarity(
                        set(s_a_tokens), set(t_a_tokens))
                    if j_ind > max_alias_token_jaccard:
                        max_alias_token_jaccard = j_ind
                        best_s_alias = s_ent['aliases'][s_ind]
                        best_t_alias = t_ent['aliases'][t_ind]
                    e_dist = edit_distance(s_a_tokens, t_a_tokens) / (
                        len(s_a_tokens) + len(t_a_tokens))
                    if e_dist < min_alias_edit_distance:
                        min_alias_edit_distance = e_dist

    # has any relationships
    has_parents = (len(s_ent['par_relations']) > 0 and len(t_ent['par_relations']) > 0)
    has_children = (len(s_ent['chd_relations']) > 0 and len(t_ent['chd_relations']) > 0)

    percent_parents_in_common = 0.0
    percent_children_in_common = 0.0

    # any relationships in common
    if has_parents:
        max_parents_in_common = (len(s_ent['par_relations']) + len(t_ent['par_relations'])) / 2
        percent_parents_in_common = len(
            s_ent['par_relations'].intersection(t_ent['par_relations'])) / max_parents_in_common
    if has_children:
        max_children_in_common = (len(s_ent['chd_relations']) + len(t_ent['chd_relations'])) / 2
        percent_children_in_common = len(
            s_ent['chd_relations'].intersection(t_ent['chd_relations'])) / max_children_in_common

    # tuple(...) makes the acronyms hashable and comparable; the original
    # built generator objects here, which never compare equal inside set()
    s_acronyms = [tuple(i[0] for i in a) for a in s_alias_tokens]
    t_acronyms = [tuple(i[0] for i in a) for a in t_alias_tokens]
    has_same_acronym = (len(set(s_acronyms).intersection(set(t_acronyms))) > 0)

    s_name_root, s_name_heads = self._dependency_parse(s_ent['canonical_name'])
    t_name_root, t_name_heads = self._dependency_parse(t_ent['canonical_name'])
    has_same_name_root_word = (s_name_root == t_name_root)
    has_same_name_chunk_heads = (s_name_heads == t_name_heads)
    name_chunk_heads_jaccard_similarity = string_utils.get_jaccard_similarity(
        s_name_heads, t_name_heads)

    s_alias_root, s_alias_heads = self._dependency_parse(best_s_alias)
    t_alias_root, t_alias_heads = self._dependency_parse(best_t_alias)
    has_same_alias_root_word = (s_alias_root == t_alias_root)
    has_same_alias_chunk_heads = (s_alias_heads == t_alias_heads)
    alias_chunk_heads_jaccard_similarity = string_utils.get_jaccard_similarity(
        s_alias_heads, t_alias_heads)

    def_jaccard_similarity = string_utils.get_jaccard_similarity(
        set(s_def_tokens), set(t_def_tokens))

    # form feature vector
    feature_vec = [
        FloatField(float(has_same_canonical_name)),
        FloatField(float(has_same_stemmed_name)),
        FloatField(float(has_same_lemmatized_name)),
        FloatField(float(has_same_char_tokens)),
        FloatField(float(has_alias_in_common)),
        FloatField(name_token_jaccard_similarity),
        FloatField(inverse_name_token_edit_distance),
        FloatField(name_stem_jaccard_similarity),
        FloatField(inverse_name_stem_edit_distance),
        FloatField(name_lemm_jaccard_similarity),
        FloatField(inverse_name_lemm_edit_distance),
        FloatField(name_char_jaccard_similarity),
        FloatField(inverse_name_char_edit_distance),
        FloatField(max_alias_token_jaccard),
        FloatField(1.0 - min_alias_edit_distance),
        FloatField(percent_parents_in_common),
        FloatField(percent_children_in_common),
        FloatField(float(has_same_acronym)),
        FloatField(float(has_same_name_root_word)),
        FloatField(float(has_same_name_chunk_heads)),
        FloatField(name_chunk_heads_jaccard_similarity),
        FloatField(float(has_same_alias_root_word)),
        FloatField(float(has_same_alias_chunk_heads)),
        FloatField(alias_chunk_heads_jaccard_similarity),
        FloatField(def_jaccard_similarity)
    ]
    return feature_vec
def Greedy_Decode_Eval(Net, datasets, args):
    # TestNet = Net.eval()
    epoch_size = len(datasets) // args.test_batch_size
    batch_iterator = iter(DataLoader(datasets, args.test_batch_size, shuffle=True,
                                     num_workers=args.num_workers, collate_fn=collate_fn))
    c_matrix = np.zeros((len(CHARS) - 1, len(CHARS) - 1))
    Tp = 0
    Tn_1 = 0
    Tn_2 = 0
    t_chars = 0
    T_c = 0
    T_f = 0
    T_fc = 0
    t_fchars = 0
    norm_ed = 0
    res_chars = np.zeros(len(CHARS))
    t1 = time.time()
    count = 0
    for i in tqdm(range(epoch_size)):
        # load train data
        images, labels, lengths, filenames = next(batch_iterator)
        start = 0
        targets = []
        for length in lengths:
            label = labels[start:start + length]
            targets.append(label)
            start += length
        targets = np.array([el.numpy() for el in targets])
        imgs = images.numpy().copy()
        if args.cuda:
            images = Variable(images.cuda())
        else:
            images = Variable(images)
        # forward
        prebs = Net(images)
        # greedy decode
        prebs = prebs.cpu().detach().numpy()
        preb_labels = list()
        for i in range(prebs.shape[0]):
            preb = prebs[i, :, :]
            preb_label = list()
            for j in range(preb.shape[1]):
                preb_label.append(np.argmax(preb[:, j], axis=0))
            no_repeat_blank_label = list()
            pre_c = preb_label[0]
            if pre_c != len(CHARS) - 1:
                no_repeat_blank_label.append(pre_c)
            for c in preb_label:  # drop repeated labels and blank labels
                if (pre_c == c) or (c == len(CHARS) - 1):
                    if c == len(CHARS) - 1:
                        pre_c = c
                    continue
                no_repeat_blank_label.append(c)
                pre_c = c
            preb_labels.append(no_repeat_blank_label)
        #print(len(preb_labels))
        for i, label in enumerate(preb_labels):
            if args.postprocess:
                label = postprocess(label)
            correct = False
            X = i
            lb = ""
            tg = ""
            for j in targets[i]:
                x = int(j)
                tg += CHARS[x]
            for j in label:
                lb += CHARS[j]
            norm_ed_img = 0
            # if len(lb)==0 or len(tg)==0 or tg == '0':
            #     norm_ed_img = -1
            if len(tg) > len(lb):
                norm_ed_img = 1 - edit_distance(lb, tg) / len(tg)
                norm_ed += norm_ed_img
            else:
                norm_ed_img = 1 - edit_distance(lb, tg) / len(lb)
                norm_ed += norm_ed_img
            # show image and its predict label
            t_chars += len(targets[i])
            for j in range(len(label)):
                if j >= len(targets[i]):
                    continue
                if label[j] == targets[i][j]:
                    res_chars[label[j]] += 1
                    T_c += 1
            if args.show:
                show(imgs[i], label, targets[i])
            if len(label) != len(targets[i]):
                #print(abs(len(label)-len(targets[i])))
                Tn_1 += 1
            else:
                c_matrix = cmatrix(c_matrix, label, targets[i])
                t_fchars += len(targets[i])
                for j in range(len(label)):
                    if j >= len(targets[i]):
                        continue
                    if label[j] == targets[i][j]:
                        res_chars[label[j]] += 1
                        T_fc += 1
                fuzzy = 0
                for x in range(len(label)):
                    if targets[i][x] == label[x]:
                        fuzzy += 1
                if fuzzy / len(label) >= 0.75:
                    T_f += 1
                if (np.asarray(targets[i]) == np.asarray(label)).all():
                    Tp += 1
                    correct = True
                else:
                    Tn_2 += 1
                    # print(lb,tg)
            if args.save_pred_results:
                if not os.path.isdir('./testpreds'):
                    os.makedirs('./testpreds')
                with open("./testpreds/preds.csv", 'a+', newline='') as f:
                    l1 = [filenames[X].split('\\')[1], tg, lb, norm_ed_img]
                    csv_writer = writer(f)
                    csv_writer.writerow(l1)
                    # the with-block closes the file (the original's bare `f.close` was a no-op)
            if args.save_pred_images:
                if not os.path.isdir('./testpreds'):
                    os.makedirs('./testpreds')
                if not os.path.isdir('./testpreds/images'):
                    os.makedirs('./testpreds/images')
                basename = os.path.basename(filenames[X])
                #newname = 'testpreds/images/new_'+basename.split('.')[0]+"__"+lb+'.png'
                newname = 'testpreds/images/' + f'{count}__' + lb + '.png'
                count += 1
                #if not correct:
                shutil.copy(filenames[X], newname)
    if args.evaluate:
        evaluate_and_save(c_matrix)
    Acc = Tp * 1.0 / (Tp + Tn_1 + Tn_2)
    print("[Info] Test Accuracy: {} [{}:{}:{}:{}]".format(Acc, Tp, Tn_1, Tn_2, (Tp + Tn_1 + Tn_2)))
    print(f"[Info] 75%+ Accuracy: {T_f/(Tp+Tn_1+Tn_2)} [{T_f}/{(Tp+Tn_1+Tn_2)}]")
    t2 = time.time()
    print(f'[Info] Global Char Accuracy:{T_c/t_chars} [{T_c}/{t_chars}] ')
    print(f'[Info] Char Accuracy on full length match:{T_fc/t_fchars} [{T_fc}/{t_fchars}] ')
    print(f"[Info] Length accuracy: {(Tp+Tn_2)/(Tp+Tn_1+Tn_2)}")
    print(f"[Info] Norm_ed: {norm_ed/(Tp+Tn_1+Tn_2)}")
    # print('Per char: ')
    # for i in range(10):
    #     print(i, ": ", res_chars[i] / T_c)
    # for i in range(10, len(CHARS) - 1):
    #     print(chr(55 + i), ': ', res_chars[i] / T_c)
    print("[Info] Test Speed: {}s 1/{}]".format((t2 - t1) / len(datasets), len(datasets)))
__author__ = 'user'

from nltk.metrics import distance as dist

# the transpositions flag allows transposition edits (e.g., "ab" -> "ba")
s1 = 'dr mark keane'
s2 = 'mr mark bean'
s3 = 'rain'
s4 = 'shine'
s5 = 'mr rowan atkinson'
s6 = 'mr bean'

ans = dist.edit_distance(s1, s2, transpositions=False)
print(ans)
ans = dist.edit_distance(s3, s4, transpositions=False)
print(ans)
ans = dist.edit_distance(s5, s6, transpositions=False)
print(ans)

# nltk.metrics.distance has no `levenschtein` function (the original called
# one); edit_distance already computes the Levenshtein distance, so it is
# used again below.
ans = dist.edit_distance(s1, s2)
print(ans)
ans = dist.edit_distance(s3, s4)
print(ans)
ans = dist.edit_distance(s5, s6)
print(ans)
def validation(model, criterion, evaluation_loader, converter, opt):
    """ Validation or Evaluation """
    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    for i, (image_tensors, labels) in enumerate(evaluation_loader):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        image = image_tensors.to(device)
        # For max length prediction
        length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device)
        text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device)

        text_for_loss, length_for_loss = converter.encode(labels, batch_max_length=opt.batch_max_length)

        start_time = time.time()
        if 'CTC' in opt.Prediction:
            preds = model(image, text_for_pred)
            forward_time = time.time() - start_time

            # Calculate evaluation loss for CTC decoder.
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            # permute 'preds' to use CTCloss format
            if opt.baiduCTC:
                cost = criterion(preds.permute(1, 0, 2), text_for_loss, preds_size, length_for_loss) / batch_size
            else:
                cost = criterion(preds.log_softmax(2).permute(1, 0, 2), text_for_loss, preds_size, length_for_loss)

            # Select max probability (greedy decoding) then decode index to character
            if opt.baiduCTC:
                _, preds_index = preds.max(2)
                preds_index = preds_index.view(-1)
            else:
                _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index.data, preds_size.data)
        else:
            preds = model(image, text_for_pred, is_train=False)
            forward_time = time.time() - start_time

            preds = preds[:, :text_for_loss.shape[1] - 1, :]
            target = text_for_loss[:, 1:]  # without [GO] Symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

            # select max probability (greedy decoding) then decode index to character
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)

        infer_time += forward_time
        valid_loss_avg.add(cost)

        # calculate accuracy & confidence score
        preds_prob = F.softmax(preds, dim=2)
        preds_max_prob, _ = preds_prob.max(dim=2)
        confidence_score_list = []
        for gt, pred, pred_max_prob in zip(labels, preds_str, preds_max_prob):
            if 'Attn' in opt.Prediction:
                gt = gt[:gt.find('[s]')]
                pred_EOS = pred.find('[s]')
                pred = pred[:pred_EOS]  # prune after "end of sentence" token ([s])
                pred_max_prob = pred_max_prob[:pred_EOS]

            # To evaluate 'case sensitive model' with alphanumeric and case insensitve setting.
            if opt.sensitive and opt.data_filtering_off:
                pred = pred.lower()
                gt = gt.lower()
                alphanumeric_case_insensitve = '0123456789abcdefghijklmnopqrstuvwxyz'
                out_of_alphanumeric_case_insensitve = f'[^{alphanumeric_case_insensitve}]'
                pred = re.sub(out_of_alphanumeric_case_insensitve, '', pred)
                gt = re.sub(out_of_alphanumeric_case_insensitve, '', gt)

            if pred == gt:
                n_correct += 1

            '''
            (old version) ICDAR2017 DOST Normalized Edit Distance
            https://rrc.cvc.uab.es/?ch=7&com=tasks
            "For each word we calculate the normalized edit distance to the
            length of the ground truth transcription."
            if len(gt) == 0:
                norm_ED += 1
            else:
                norm_ED += edit_distance(pred, gt) / len(gt)
            '''

            # ICDAR2019 Normalized Edit Distance
            if len(gt) == 0 or len(pred) == 0:
                norm_ED += 0
            elif len(gt) > len(pred):
                norm_ED += 1 - edit_distance(pred, gt) / len(gt)
            else:
                norm_ED += 1 - edit_distance(pred, gt) / len(pred)

            # calculate confidence score (= multiply of pred_max_prob)
            try:
                confidence_score = pred_max_prob.cumprod(dim=0)[-1]
            except:
                confidence_score = 0  # for empty pred case, when prune after "end of sentence" token ([s])
            confidence_score_list.append(confidence_score)
            # print(pred, gt, pred==gt, confidence_score)

    accuracy = n_correct / float(length_of_data) * 100
    norm_ED = norm_ED / float(length_of_data)  # ICDAR2019 Normalized Edit Distance

    return valid_loss_avg.val(), accuracy, norm_ED, preds_str, confidence_score_list, labels, infer_time, length_of_data
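# Standalone sketch of the ICDAR2019 normalized edit distance used in the
# validation functions above; it is equivalent to
# 1 - ED / max(len(gt), len(pred)), with 0 for empty inputs.
from nltk.metrics.distance import edit_distance

def norm_ed_icdar2019(pred, gt):
    if len(gt) == 0 or len(pred) == 0:
        return 0.0
    return 1.0 - edit_distance(pred, gt) / max(len(gt), len(pred))

print(norm_ed_icdar2019("hallo", "hello"))  # 1 - 1/5 = 0.8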
def validation(model, criterion, evaluation_loader, converter, opt):
    """ validation or evaluation """
    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    for i, (image_tensors, labels) in enumerate(evaluation_loader):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        image = image_tensors.to(device)
        # For max length prediction
        length_for_pred = torch.IntTensor([opt.batch_max_length] * batch_size).to(device)
        text_for_pred = torch.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0).to(device)

        text_for_loss, length_for_loss = converter.encode(labels, batch_max_length=opt.batch_max_length)

        start_time = time.time()
        if 'CTC' in opt.Prediction:
            preds = model(image, text_for_pred).log_softmax(2)
            forward_time = time.time() - start_time

            # Calculate evaluation loss for CTC decoder.
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            # permute 'preds' to use CTCloss format
            torch.backends.cudnn.enabled = False
            cost = criterion(preds.permute(1, 0, 2).to(device), text_for_loss.to(device),
                             preds_size.to(device), length_for_loss.to(device))
            torch.backends.cudnn.enabled = True

            # Select max probability (greedy decoding) then decode index to character
            _, preds_index = preds.max(2)
            preds_index = preds_index.view(-1)
            preds_str = converter.decode(preds_index.data, preds_size.data)
        else:
            preds = model(image, text_for_pred, is_train=False)
            forward_time = time.time() - start_time

            preds = preds[:, :text_for_loss.shape[1] - 1, :]
            target = text_for_loss[:, 1:]  # without [GO] Symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

            # select max probability (greedy decoding) then decode index to character
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)

        infer_time += forward_time
        valid_loss_avg.add(cost)

        # calculate accuracy & confidence score
        preds_prob = F.softmax(preds, dim=2)
        preds_max_prob, _ = preds_prob.max(dim=2)
        confidence_score_list = []
        for gt, pred, pred_max_prob in zip(labels, preds_str, preds_max_prob):
            if 'Attn' in opt.Prediction:
                gt = gt[:gt.find('[s]')]
                pred_EOS = pred.find('[s]')
                pred = pred[:pred_EOS]  # prune after "end of sentence" token ([s])
                pred_max_prob = pred_max_prob[:pred_EOS]

            if pred == gt:
                n_correct += 1
            if len(gt) == 0:
                norm_ED += 1
            else:
                norm_ED += edit_distance(pred, gt) / len(gt)

            # calculate confidence score (= multiply of pred_max_prob)
            try:
                confidence_score = pred_max_prob.cumprod(dim=0)[-1]
            except:
                confidence_score = 0  # for empty pred case, when prune after "end of sentence" token ([s])
            confidence_score_list.append(confidence_score)
            # print(pred, gt, pred==gt, confidence_score)

    accuracy = n_correct / float(length_of_data) * 100

    return valid_loss_avg.val(), accuracy, norm_ED, preds_str, confidence_score_list, labels, infer_time, length_of_data
def calEditDistance(barK, truP):
    dist = edit_distance(barK, truP)
    return dist
def validation(model, criterion, eval_loader, converter, opt, tqdm_position=1):
    """validation or evaluation"""
    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    for i, (image_tensors, labels) in tqdm(
            enumerate(eval_loader),
            total=len(eval_loader),
            position=tqdm_position,
            leave=False,
    ):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        image = image_tensors.to(device)
        # For max length prediction
        labels_index, labels_length = converter.encode(labels, batch_max_length=opt.batch_max_length)

        if "CTC" in opt.Prediction:
            start_time = time.time()
            preds = model(image)
            forward_time = time.time() - start_time

            # Calculate evaluation loss for CTC decoder.
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            # permute 'preds' to use CTCloss format
            cost = criterion(
                preds.log_softmax(2).permute(1, 0, 2),
                labels_index,
                preds_size,
                labels_length,
            )
        else:
            text_for_pred = (torch.LongTensor(batch_size).fill_(converter.dict["[SOS]"]).to(device))

            start_time = time.time()
            preds = model(image, text_for_pred, is_train=False)
            forward_time = time.time() - start_time

            target = labels_index[:, 1:]  # without [SOS] Symbol
            cost = criterion(
                preds.contiguous().view(-1, preds.shape[-1]),
                target.contiguous().view(-1),
            )

        # select max probability (greedy decoding) then decode index to character
        _, preds_index = preds.max(2)
        preds_size = torch.IntTensor([preds.size(1)] * preds_index.size(0)).to(device)
        preds_str = converter.decode(preds_index, preds_size)

        infer_time += forward_time
        valid_loss_avg.add(cost)

        # calculate accuracy & confidence score
        preds_prob = F.softmax(preds, dim=2)
        preds_max_prob, _ = preds_prob.max(dim=2)
        confidence_score_list = []
        for gt, prd, prd_max_prob in zip(labels, preds_str, preds_max_prob):
            if "Attn" in opt.Prediction:
                prd_EOS = prd.find("[EOS]")
                prd = prd[:prd_EOS]  # prune after "end of sentence" token ([EOS])
                prd_max_prob = prd_max_prob[:prd_EOS]

            """
            In our experiment, if the model predicts at least one [UNK] token,
            we count the word prediction as incorrect.
            To not take account of [UNK] token, use the below line.
            prd = prd.replace('[UNK]', '')
            """

            # To evaluate 'case sensitive model' with alphanumeric and case insensitve setting. = same with ASTER
            gt = gt.lower()
            prd = prd.lower()
            alphanumeric_case_insensitve = "0123456789abcdefghijklmnopqrstuvwxyz"
            out_of_alphanumeric_case_insensitve = f"[^{alphanumeric_case_insensitve}]"
            gt = re.sub(out_of_alphanumeric_case_insensitve, "", gt)
            prd = re.sub(out_of_alphanumeric_case_insensitve, "", prd)

            if opt.NED:
                # ICDAR2019 Normalized Edit Distance
                if len(gt) == 0 or len(prd) == 0:
                    norm_ED += 0
                elif len(gt) > len(prd):
                    norm_ED += 1 - edit_distance(prd, gt) / len(gt)
                else:
                    norm_ED += 1 - edit_distance(prd, gt) / len(prd)
            else:
                if prd == gt:
                    n_correct += 1

            # calculate confidence score (= multiply of prd_max_prob)
            try:
                confidence_score = prd_max_prob.cumprod(dim=0)[-1]
            except:
                confidence_score = 0  # for empty pred case, when prune after "end of sentence" token ([EOS])
            confidence_score_list.append(confidence_score)

    if opt.NED:
        # ICDAR2019 Normalized Edit Distance. In web page, they report % of norm_ED (= norm_ED * 100).
        score = norm_ED / float(length_of_data) * 100
    else:
        score = n_correct / float(length_of_data) * 100  # accuracy

    return (
        valid_loss_avg.val(),
        score,
        preds_str,
        confidence_score_list,
        labels,
        infer_time,
        length_of_data,
    )
# rosalind_ba5g
# edit distance
import numpy as np
from nltk.metrics import distance

f = open('rosalind_ba5g.txt')
a = f.readline().rstrip()
b = f.readline().rstrip()

print(distance.edit_distance(a, b))

# Wagner-Fischer algorithm
def edit_distance(s, t):
    m, n = len(s), len(t)
    d = np.zeros((m + 1, n + 1), dtype=int)
    d[:, 0] = np.arange(m + 1)
    d[0, :] = np.arange(n + 1)
    for j in np.arange(n):
        for i in np.arange(m):
            if s[i] == t[j]:
                d[i + 1, j + 1] = d[i, j]
            else:
                d[i + 1, j + 1] = min(d[i, j + 1] + 1, d[i + 1, j] + 1, d[i, j] + 1)
    return d[m, n]

print(edit_distance(a, b))
from nltk.metrics.distance import edit_distance
import codecs

# List of questions
questions = []

# Read the cybora file
cybora_aiml = codecs.open("./core/base/cybora.aiml", "r", encoding="utf-8")

# Test phrase
phrase = 'meu nome é rodrigo e o seu?'

# Smallest distance found so far
distance_less = len(phrase)

# Initialize variables used by the algorithm
delta = 0
trigger = ''

# Run the test (NOTE: `questions` is empty here; presumably it is meant to
# be populated from cybora_aiml before this loop)
for question in questions:
    # Compute distance
    delta = edit_distance(phrase, question)
    # Check distance
    if delta < distance_less:
        # Update results
        distance_less = delta
        trigger = question

print trigger, distance_less
def get_features(self, lhs, rhs, alignment=['0-0']):
    """calculate and return features for a rule"""
    features = {}
    # no changes
    if lhs == rhs:
        return features
    # indicate if tokens are aligned
    rfound = [(rtok[0] == '[') for rtok in rhs]
    lfound = [(ltok[0] == '[') for ltok in lhs]
    # iterate through aligned tokens--assume no NTs
    for tuple in alignment:
        lind, rind = [int(i) for i in tuple.split('-')]
        lfound[lind] = True
        rfound[rind] = True
        if lhs[lind] == rhs[rind]:
            continue
        # calculate features for substitution
        self.increment(features, 'substituted')
        self.increment(features, 'char-ld', edit_distance(lhs[lind], rhs[rind]))
        if rhs[rind] in self.dictionary.suggest(lhs[lind]):
            if self.dictionary.check(lhs[lind]):
                self.increment(features, 'alternate-spelling', v=self.get_weight(rhs[rind]))
            else:
                self.increment(features, 'mispelled', v=self.get_weight(rhs[rind]))
        # if the tokens aren't the same, compare them
        ltok, lpos = self.get_pos(lhs[lind])
        rtok, rpos = self.get_pos(rhs[rind])
        self.increment(features, '%s-%s' % (lpos, rpos))
        if rpos[0] == 'W' or rpos[0] == 'C':
            self.increment(features, '%s-error' % (rpos[:2]))
        else:
            self.increment(features, '%s-error' % (rpos[0]))
        # compare lemmas/morphology
        if self.do_morph:
            try:
                lmorph = self.get_morphology(ltok)
                rmorph = self.get_morphology(rtok)
                if lmorph[0] != rmorph[0]:
                    if len(lmorph) + len(rmorph) > 2:
                        self.increment(features, 'diff-lemma-diff-morph')
                    else:
                        self.increment(features, 'diff-lemma-same-morph')
                else:
                    self.increment(features, 'same-lemma-diff-morph')
                    if len(lmorph) + len(rmorph) > 2:
                        self.increment(features,
                                       'morph-%s-%s' % ('+'.join(lmorph[1:]), '+'.join(rmorph[1:])))
            except:
                # the original formatted an undefined rule index into this message
                sys.stderr.write('Error handling morphology in rule\n')
    # calculate features for deletion
    for deltok in self.analyze_unaligned(lfound, lhs):
        self.increment(features, 'deleted')
        self.increment(features, deltok + '-')
    # calculate features for insertion
    for instok in self.analyze_unaligned(rfound, rhs):
        self.increment(features, 'inserted')
        self.increment(features, '-' + instok)
    self.increment(features, 'tok-ld', edit_distance(lhs, rhs))
    return features
def lev_no_case_sens(a, b):
    a = a.lower()
    b = b.lower()
    dist = edit_distance(a, b)
    return dist
def did_you_mean(keyword, keyword_pool):
    candidates = list(keyword_pool)
    closest_match_idx = np.argmin(
        [edit_distance(keyword, candidate) for candidate in candidates])
    return candidates[closest_match_idx]
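# Minimal usage sketch for did_you_mean (hypothetical pool; numpy and nltk
# are assumed installed):
print(did_you_mean("improt", ["import", "print", "input"]))  # -> "import"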
def lev_dist(first, second):
    return edit_distance(first, second)
def getFeatureVector(self, mention, entity):
    features = []
    page_title = self._db.getPageTitle(entity)
    page_title = utils.text.normalize_unicode(page_title) if page_title is not None else None
    mention_text = utils.text.normalize_unicode(mention.mention_text())
    for feature in self.feature_names:
        # Count features
        if feature == 'prior':
            features.append(self._stats.getCandidatePrior(entity))
        elif feature == 'prior_yamada':
            features.append(self._stats.getCandidatePriorYamadaStyle(entity))
        elif feature == 'normalized_prior':
            features.append(self._stats.getCandidatePrior(entity, normalized=True))
        elif feature == 'normalized_log_prior':
            features.append(self._stats.getCandidatePrior(entity, normalized=True, log=True))
        elif feature == 'relative_prior':
            if entity in mention.candidates:
                count = 0
                for cand in mention.candidates:
                    count += self._stats.getCandidatePrior(cand)
                if count == 0:
                    features.append(float(0))
                else:
                    features.append(float(self._stats.getCandidatePrior(entity)) / count)
            else:
                features.append(float(0))
        elif feature == 'cond_prior':
            features.append(self._stats.getCandidateConditionalPrior(entity, mention))
        elif feature == 'n_of_candidates':
            features.append(len(mention.candidates))
        elif feature == 'max_prior':
            max_prior = self._stats.getCandidateConditionalPrior(entity, mention)
            for m in mention.document().mentions:
                if entity in m.candidates and \
                        self._stats.getCandidateConditionalPrior(entity, m) > max_prior:
                    max_prior = self._stats.getCandidateConditionalPrior(entity, m)
            features.append(max_prior)
        # string similarity features
        elif feature == 'entity_title_starts_or_ends_with_mention':
            x = 1 if page_title is not None and (
                page_title.lower().startswith(mention_text.lower()) or
                page_title.lower().endswith(mention_text.lower())) else 0
            features.append(x)
        elif feature == 'mention_text_starts_or_ends_with_entity':
            x = 1 if page_title is not None and (
                mention_text.lower().startswith(page_title.lower()) or
                mention_text.lower().endswith(page_title.lower())) else 0
            features.append(x)
        elif feature == 'edit_distance':
            features.append(edit_distance(page_title.lower(), mention_text.lower())
                            if page_title is not None else 0)
        # context similarity features
        elif feature == 'yamada_context_similarity':
            if not hasattr(mention.document(), 'yamada_context_nouns'):
                mention.document().yamada_context_nouns = \
                    self._opennlp.list_nouns(mention.document().sentences)
            if not hasattr(mention.document(), 'yamada_context_embd'):
                mention.document().yamada_context_embd = dict()
            if mention_text not in mention.document().yamada_context_embd:
                context_embd = self.yamada_txt_to_embd.text_to_embedding(
                    mention.document().yamada_context_nouns, mention_text)
                mention.document().yamada_context_embd[mention_text] = context_embd
            context_embd = mention.document().yamada_context_embd[mention_text]
            entity_embd = self.yamada_txt_to_embd.from_the_cache(entity)
            self.n += 1
            if entity_embd is not None:
                s = self.yamada_txt_to_embd.similarity(context_embd, entity_embd)
                # print self.yamada_txt_to_embd.similarity(context_embd, entity_embd)
                features.append(s)
                if s > 0:
                    self.nn += 1
            else:
                #print 0
                features.append(0.0)
            if self.n % 100 == 0:
                print "yamada got sim", self.nn / float(self.n)
        elif feature == 'our_context_similarity':
            if not hasattr(mention.document(), 'our_context_nouns'):
                mention.document().our_context_nouns = \
                    self._w2v.get_nouns(mention.document().sentences)
            if not hasattr(mention.document(), 'our_context_embd'):
                mention.document().our_context_embd = dict()
            if mention_text not in mention.document().our_context_embd:
                context_embd = self._w2v.text_to_embedding(
                    mention.document().our_context_nouns, mention_text)
                mention.document().our_context_embd[mention_text] = context_embd
            context_embd = mention.document().our_context_embd[mention_text]
            entity_embd = self._w2v.get_entity_vec(entity)
            if entity_embd is not None:
                print self._w2v.similarity(context_embd, entity_embd)
                features.append(self._w2v.similarity(context_embd, entity_embd))
            else:
                print 0
                features.append(0.0)
        elif feature.startswith('model_'):
            x = self.models_as_features_predictors[feature[6:]].predict_prob(mention, entity)
            features.append(x)
        else:
            # raising a bare string (as the original did) is invalid
            raise Exception("feature undefined")
    return features
def validation(model, criterion, evaluation_loader, converter, opt):
    """ validation or evaluation """
    for p in model.parameters():
        p.requires_grad = False

    n_correct = 0
    norm_ED = 0
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    for i, (image_tensors, labels) in enumerate(evaluation_loader):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        with torch.no_grad():
            image = image_tensors.cuda()
            # For max length prediction
            length_for_pred = torch.cuda.IntTensor([opt.batch_max_length] * batch_size)
            text_for_pred = torch.cuda.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0)

        text_for_loss, length_for_loss = converter.encode(labels)

        start_time = time.time()
        if 'CTC' in opt.Prediction:
            preds = model(image, text_for_pred).log_softmax(2)
            forward_time = time.time() - start_time

            # Calculate evaluation loss for CTC decoder.
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            preds = preds.permute(1, 0, 2)  # to use CTCloss format
            cost = criterion(preds, text_for_loss, preds_size, length_for_loss)

            # Select max probability (greedy decoding) then decode index to character
            _, preds_index = preds.max(2)
            preds_index = preds_index.transpose(1, 0).contiguous().view(-1)
            preds_str = converter.decode(preds_index.data, preds_size.data)
        else:
            preds = model(image, text_for_pred, is_train=False)
            forward_time = time.time() - start_time

            preds = preds[:, :text_for_loss.shape[1] - 1, :]
            target = text_for_loss[:, 1:]  # without [GO] Symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))

            # select max probability (greedy decoding) then decode index to character
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)

        infer_time += forward_time
        valid_loss_avg.add(cost)

        # calculate accuracy.
        for pred, gt in zip(preds_str, labels):
            if 'Attn' in opt.Prediction:
                pred = pred[:pred.find('[s]')]  # prune after "end of sentence" token ([s])
                gt = gt[:gt.find('[s]')]
            if pred == gt:
                n_correct += 1
            if len(gt) == 0:
                norm_ED += 1
            else:
                norm_ED += edit_distance(pred, gt) / len(gt)

    accuracy = n_correct / float(length_of_data) * 100

    return valid_loss_avg.val(), accuracy, norm_ED, preds_str, labels, infer_time, length_of_data
def levenshtein(self, a, b):
    return distance.edit_distance(a, b)
def evaluate(sentx, senty):
    sent_max_len = max(len(list(sentx)), len(list(senty)))
    if sent_max_len == 0:
        return 0
    return edit_distance(sentx, senty) / sent_max_len
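# Minimal usage sketch for evaluate: the edit distance normalized by the
# length of the longer input.
print(evaluate("kitten", "sitting"))  # 3 / 7 ≈ 0.4286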
def distance(name, query):
    return edit_distance(name, query)
def cmp_text_edit_distance(self, annotation, candidate, entire_annotation):
    """Edit distance between the annotation and the candidate text."""
    result = edit_distance(annotation, candidate)
    return result
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_text', '-i', required=True, type=str)
    parser.add_argument('--output_json', '-o', required=True, type=str)
    parser.add_argument('--tmp_dir', '-t', default='results/tmp', type=str)
    parser.add_argument('--classifier_type', '-ct', default='RelationClassification', type=str,
                        choices=['RelationClassification', 'fasttext'])
    parser.add_argument('--classifier_model', '-c', nargs='*', type=str)
    parser.add_argument('--classifier_preprocessor', '-cp', nargs='*', type=str)
    parser.add_argument('--use_amr', '-uamr', action='store_true')
    parser.add_argument('--amrs_from', type=str)
    parser.add_argument('--tokenize', action='store_true')
    parser.add_argument('--use_sdg', '-usdg', action='store_true')
    parser.add_argument('--sdg_model', '-sdg', default='stanford', type=str,
                        choices=['stanford', 'spacy'])
    parser.add_argument('--entity_recognizer', '-er', default='None', type=str,
                        choices=['None', 'tagNERv2', 'tagNERv3', 'byteNER'])
    parser.add_argument('--entities_from', type=str)
    parser.add_argument('--anonymize', '-a', action='store_true')
    parser.add_argument('--add_symmetric_pairs', '-sym', action='store_true')
    parser.add_argument('--ensembling_mode', '-ens', type=str, default='average',
                        choices=['average', 'majority_vote'])
    args = parser.parse_args()

    basename = os.path.basename(args.input_text)
    args.tmp_dir = os.path.abspath(args.tmp_dir)
    print("Using tmp dir: {}".format(args.tmp_dir))
    ensure_dir(args.tmp_dir)  # not very necessary

    if args.entity_recognizer == 'None':
        if args.entities_from:
            print("Getting ground truth entities and pairs...")
            with io.open(args.entities_from, encoding='utf-8') as fr:
                with io.open(args.output_json, 'w', encoding='utf-8') as fw:
                    ground_truth = fr.read()
                    fw.write(ground_truth)
            print('Done\n')
        else:
            raise Exception("--entities_from is not specified")
    elif args.entity_recognizer == 'tagNERv2':
        tokenized_input = os.path.join(args.tmp_dir, '{}.tokenized.txt'.format(basename))
        entities_output = os.path.join(args.tmp_dir, '{}.tokenized.txt.IOB'.format(basename))
        # candidate_tuples_json = os.path.join(args.tmp_dir, '{}.candidates.json'.format(basename))
        print("Tokenizing...")
        # print("Adding spaces around -")
        with io.open(args.input_text, encoding='utf-8') as fr:
            with io.open(tokenized_input, 'w', encoding='utf-8') as fw:
                for line in fr.readlines():
                    id, sentence = line[:-1].split('\t')  # \n symbol
                    #sentence = sentence.replace('-',' - ')
                    sentence = ' '.join(sentence.split())
                    fw.write("{}\t{}\n".format(id, sentence))
        print('Running tagNERv2...')
        check_call(['bash', 'tag_NER.sh', '-i', tokenized_input, '-f', 'IOB'],
                   cwd='submodules/tag_NER_v2')
        print('Done\n')
        # the output is entities_output
        print('Building interaction tuples with unknown labels...')
        check_call([
            'python', 'iob_to_bind_json.py',
            '--input_text', args.input_text,
            '--input_iob2', entities_output,
            '--output_json', args.output_json
        ])  #candidate_tuples_json])
        print('Done\n')
    elif args.entity_recognizer == 'tagNERv3':
        tokenized_input = os.path.join(args.tmp_dir, '{}.tokenized.txt'.format(basename))
        entities_output = os.path.join(args.tmp_dir, '{}.tokenized.txt.IOB'.format(basename))
        # candidate_tuples_json = os.path.join(args.tmp_dir, '{}.candidates.json'.format(basename))
        print("Tokenizing...")
        # print("Adding spaces around -")
        with io.open(args.input_text, encoding='utf-8') as fr:
            with io.open(tokenized_input, 'w', encoding='utf-8') as fw:
                for line in fr.readlines():
                    id, sentence = line[:-1].split('\t')  # \n symbol
                    #sentence = sentence.replace('-',' - ')
                    sentence = ' '.join(sentence.split())
                    fw.write("{}\t{}\n".format(id, sentence))
        print('Running tagNERv3 inside Docker...')
        with open(tokenized_input, 'r') as f_in, open(entities_output, 'w') as f_out:
            p = run([
                'nvidia-docker', 'run', '-i', '--rm', 'yerevann/tag-ner-v3',
                '-i', '/dev/stdin', '-f', 'IOB'
            ], stdin=f_in, stdout=f_out)
        print('Done\n')
        # the output is entities_output
        print('Building interaction tuples with unknown labels...')
        check_call([
            'python', 'iob_to_bind_json.py',
            '--input_text', args.input_text,
            '--input_iob2', entities_output,
            '--output_json', args.output_json
        ])  #candidate_tuples_json])
        print('Done\n')
    elif args.entity_recognizer == 'byteNER':
        input_without_ids = os.path.join(args.tmp_dir, '{}.noids.txt'.format(basename))
        entities_output = os.path.join(args.tmp_dir, '{}.IOB'.format(basename))
        entities_output_chr = os.path.join(args.tmp_dir, '{}.IOB.chr'.format(basename))
        print('Removing IDs from input for byteNER')
        with io.open(args.input_text, encoding='utf-8') as fr:
            with io.open(input_without_ids, 'w', encoding='utf-8') as fw:
                for line in fr.readlines():
                    id, sentence = line[:-1].split('\t')  # \n symbol
                    fw.write("{}\n".format(sentence))
        print("Done")
        print('Running byteNER...')
        # requires Keras 2.0.6 on python2!
        env = os.environ.copy()
        env['KERAS_BACKEND'] = 'theano'
        env['THEANO_FLAGS'] = 'dnn.enabled=False'
        check_call([
            'python2', 'tagger.py', '-m',
            'models/20CNN,dropout0.5,bytedrop0.3,lr0.0001,bytes,bpe,blstm,crf,biocreative.model',
            '-i', input_without_ids, '-o', entities_output,
            '--output_format', 'iob'
        ], cwd='submodules/byteNER', env=env)
        print('Done\n')
        print('Building interaction tuples with unknown labels...')
        check_call([
            'python', 'iob_to_bind_json.py', '--character_level',
            '--input_text', args.input_text,
            '--input_iob2', entities_output_chr,
            '--output_json', args.output_json
        ])  #candidate_tuples_json])
        print('Done\n')

    pretokenized_input = os.path.join(args.tmp_dir, '{}.pretokenized.txt'.format(basename))
    if args.tokenize:
        with open(args.input_text, 'r', encoding='utf-8') as fr:
            with open(pretokenized_input, 'w', encoding='utf-8') as fw:
                for line in fr:
                    id, sentence = line[:-1].split('\t')
                    sentence = double_normalize_text(sentence, lower=False)
                    # although fasttext vectors require lower(), RelClass handles it internally
                    fw.write("{}\t{}\n".format(id, sentence))
    else:
        with open(args.input_text, 'r', encoding='utf-8') as fr:
            with open(pretokenized_input, 'w', encoding='utf-8') as fw:
                fw.write(fr.read())

    if args.add_symmetric_pairs:  # useful for symmetric interactions like `bind`
        with io.open(args.output_json, 'r', encoding='utf-8') as f:
            dense = json.load(f)
        print("Adding symmetric pairs...")
        for sentence in dense:
            sym = []
            for i, pair in enumerate(sentence['extracted_information']):
                reverse_pair = pair.copy()
                reverse_pair['participant_a'] = pair['participant_b']
                reverse_pair['participant_b'] = pair['participant_a']
                reverse_pair['_sym_of'] = i
                sym.append(reverse_pair)
            sentence['extracted_information'] += sym
        with io.open(args.output_json, 'w', encoding='utf-8') as f:
            json.dump(dense, f)

    if args.use_amr:
        print('Adding AMRs...')
        if args.amrs_from:
            with open(args.amrs_from, 'r', encoding='utf-8') as f:
                amrs = json.load(f)
            amr_dict = {}
            for sample in amrs:
                amr_dict[sample['id']] = sample['amr']
            with open(args.output_json, 'r', encoding='utf-8') as f:
                data = json.load(f)
            for sentence in data:
                sentence['amr'] = amr_dict[sentence['id']]
            with open(args.output_json, 'w', encoding='utf-8') as f:
                json.dump(data, f)
        else:
            check_call([
                'python3', 'add_amr.py',
                '--input_text', pretokenized_input,
                '--input_json', args.output_json,
                '--model', 'amr2_bio7_best_after_2_fscore_0.6118.m',
                #'--model', 'bio_model_best.m',
                '--output_json', args.output_json,
                '--tmp_dir', args.tmp_dir
            ])
        print('Done\n')

        print('Extracting AMR paths...')
        check_call([
            'python3', 'append_amr_paths.py',
            '--input_json', args.output_json,
            '--output_json', args.output_json,
            '--tmp_dir', args.tmp_dir
        ])
        print('Done\n')

        print('Appending Amr Soft-Matching Statistics...')
        with open(args.output_json, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for sentence in data:
            for info in sentence['extracted_information']:
                participant_a = info['participant_a']
                participant_b = info['participant_b']
                if not info['amr_path']:
                    info['amr_path'] = '{} _nopath_ {}'.format(participant_a, participant_b)
                    info['amr_soft_match_distance_a'] = -1
                    info['amr_soft_match_distance_b'] = -1
                else:
                    amr_match_a = info['amr_path'].split()[0]
                    amr_match_b = info['amr_path'].split()[-1]
                    info['amr_soft_match_distance_a'] = edit_distance(participant_a, amr_match_a)
                    info['amr_soft_match_distance_b'] = edit_distance(participant_b, amr_match_b)
        with open(args.output_json, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        print('Done')

    if args.use_sdg:
        print('Adding Stanford Dependency Graphs...')
        check_call([
            'python', 'add_sdg.py',
            '--input_text', pretokenized_input,
            '--input_json', args.output_json,
            '--output_json', args.output_json,
            '--model', args.sdg_model,
            '--tmp_dir', args.tmp_dir
        ])
        print('Done\n')

        print('Extracting SDG paths...')
        check_call([
            'python', 'append_sdg_paths.py',
            '--input_json', args.output_json,
            '--output_json', args.output_json
        ])
        print('Done\n')

        print('Appending SDG Soft-Matching Statistics...')
        with open(args.output_json, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for sentence in data:
            for info in sentence['extracted_information']:
                participant_a = info['participant_a']
                participant_b = info['participant_b']
                if not info['sdg_path']:
                    info['sdg_path'] = '{} _nopath_ {}'.format(participant_a, participant_b)
                    info['sdg_soft_match_distance_a'] = -1
                    info['sdg_soft_match_distance_b'] = -1
                else:
                    sdg_match_a = info['sdg_path'].split()[0]
                    sdg_match_b = info['sdg_path'].split()[-1]
                    info['sdg_soft_match_distance_a'] = edit_distance(participant_a, sdg_match_a)
                    info['sdg_soft_match_distance_b'] = edit_distance(participant_b, sdg_match_b)
        with open(args.output_json, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        print('Done')

    # raise Exception("Classifier is not ready!")
    before_classifier = os.path.join(args.tmp_dir, '{}.before-classifier.json'.format(basename))
    after_classifier = os.path.join(args.tmp_dir, '{}.after-classifier.0.json'.format(basename))
    after_classifier_format_string = os.path.join(
        args.tmp_dir, '{}.after-classifier.{}.json'.format(basename, "{}"))

    print("Converting dense JSON to flat JSON: {} ...".format(before_classifier))
    with io.open(args.output_json, encoding='utf-8') as fr:
        dense = json.load(fr)
    flat = {}
    if args.anonymize:
        print("Anonymizing...")
    for sentence in dense:
        for i, pair in enumerate(sentence['extracted_information']):
            id = "{}|{}".format(sentence['id'], i)
            sentence['extracted_information'][i]['id'] = id
            flat[id] = {
                'text': sentence['text'],
                'interaction_tuple': [
                    pair['interaction_type'], '',
                    pair['participant_a'], pair['participant_b']
                ],
                'label': 1 if pair['label'] != 0 else 0  # TODO: -1s
            }
            if '_sym_of' in pair:
                flat[id]['_sym_of'] = "{}|{}".format(sentence['id'], pair['_sym_of'])
            tokenized_text = None
            if args.anonymize:
                placeholder_a = '__participant_a__'
placeholder_b = '__participant_b__' flat[id]['interaction_tuple'][2] = placeholder_a flat[id]['interaction_tuple'][3] = placeholder_b if not args.use_sdg and not args.use_amr: raise NotImplementedError('Anonymization for this \ setting is not implemented') if args.use_sdg: sdg_match_a = pair['sdg_path'].split()[0] sdg_match_b = pair['sdg_path'].split()[-1] pair['sdg_path'] = pair['sdg_path'].replace( sdg_match_a, placeholder_a) pair['sdg_path'] = pair['sdg_path'].replace( sdg_match_b, placeholder_b) tokenized_text = sentence['tokenized_text'] tokenized_text = [ placeholder_a if word == sdg_match_a else word for word in tokenized_text ] tokenized_text = [ placeholder_b if word == sdg_match_b else word for word in tokenized_text ] # sdg = sentence['sdg'].replace(sdg_match_a, # placeholder_a) # sdg = sdg.replace(sdg_match_b, # placeholder_b) if args.use_amr: amr_match_a = pair['amr_path'].split()[0] amr_match_b = pair['amr_path'].split()[-1] pair['amr_path'] = pair['amr_path'].replace( amr_match_a, placeholder_a) pair['amr_path'] = pair['amr_path'].replace( amr_match_b, placeholder_b) if args.use_sdg: participant_a = sdg_match_a participant_b = sdg_match_b else: participant_a = amr_match_a participant_b = amr_match_b text = flat[id]['text'] text = text.replace(participant_a, placeholder_a) text = text.replace(participant_b, placeholder_b) flat[id]['text'] = text if 'amr_path' in pair: flat[id]['amr_path'] = pair['amr_path'] if 'sdg_path' in pair: flat[id]['sdg_path'] = pair['sdg_path'] if 'tokenized_text' in sentence: if tokenized_text is not None: # custom, anonymized version flat[id]['tokenized_text'] = tokenized_text else: # general version flat[id]['tokenized_text'] = sentence['tokenized_text'] if 'pos_tags' in sentence: flat[id]['pos_tags'] = sentence['pos_tags'] flat_json_string = json.dumps(flat, indent=True) with io.open(before_classifier, 'w', encoding='utf-8') as fw: fw.write(flat_json_string) print("Done!") print('Detecting true interactions using {} ...'.format( args.classifier_type)) if args.classifier_type == "RelationClassification": for i, (model, processor) in enumerate( zip(args.classifier_model, args.classifier_preprocessor)): print('Running model number {}'.format(i)) print('Model filepath: {}'.format(model)) check_call([ 'python2', 'predict.py', '--input_path', before_classifier, '--output_path', after_classifier_format_string.format(i), '--processor_path', processor, '--model_path', model, ], cwd='submodules/RelationClassification/') elif args.classifier_type == "fasttext": # TODO: this does not support multiple models! # TODO: this is pretty ugly. 
# Preprocessing and postprocessing for fasttext and RelClass should be at the same level before_fasttext = os.path.join( args.tmp_dir, '{}.before-fasttext.txt'.format(basename)) fasttext_keys = [] with io.open(before_fasttext, 'w', encoding='utf-8') as fw: for k, v in flat.items(): fw.write("{}\n".format(v['text'])) fasttext_keys.append(k) fasttext_output = check_output([ 'fasttext', 'predict', args.classifier_model, before_fasttext, #after_classifier ]) fasttext_labels = fasttext_output.decode('utf-8').split('\n') for i, k in enumerate(fasttext_keys): label_string = fasttext_labels[i] if not label_string.startswith("__label__"): print("Error: invalid label: {}".format(label_string)) else: flat[k]['prediction'] = int(label_string[9:]) flat_json_string = json.dumps(flat, indent=True) with io.open(after_classifier, 'w', encoding='utf-8') as fw: fw.write(flat_json_string) for after_classifier in sorted( glob(after_classifier_format_string.format('*'))): print("Reading classifier output from flat JSON: {} ...".format( after_classifier)) with io.open(after_classifier, encoding='utf-8') as fr: flat = json.load(fr) found = 0 missing = 0 for sentence in dense: for pair in sentence['extracted_information']: if pair['id'] in flat: if 'predictions' not in pair: pair['predictions'] = [] if 'probabilities' not in pair: pair['probabilities'] = [] pair['predictions'].append( flat[pair['id']]['prediction']) if 'probabilities' in flat[pair['id']]: pair['probabilities'].append( flat[pair['id']]['probabilities']) found += 1 else: missing += 1 print("{}/{} items did not have predictions in {}".format( missing, missing + found, after_classifier)) # Performing Ensembling for sentence in dense: for pair in sentence['extracted_information']: if args.ensembling_mode == 'majority_vote': if sum(pair['predictions']) / len(pair['predictions']) < 0.5: pair['label'] = 0 else: pair['label'] = 1 else: # args.ensembling_mode = 'average' prob = np.array(pair['probabilities']).mean(axis=0) pair['label'] = int(prob.argmax()) with io.open(args.output_json, 'w', encoding='utf-8') as fw: dense_json_string = json.dumps(dense, indent=True) fw.write(dense_json_string) print("Done!")
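# A minimal sketch of the two ensembling modes used in main() above, in
# isolation, assuming numpy; the function name and inputs are illustrative:
# 'majority_vote' averages hard 0/1 predictions, while 'average' averages the
# per-class probability vectors of the individual models and takes the argmax.
import numpy as np

def ensemble_label(predictions, probabilities, mode='average'):
    if mode == 'majority_vote':
        return 0 if sum(predictions) / len(predictions) < 0.5 else 1
    # mode == 'average'
    return int(np.array(probabilities).mean(axis=0).argmax())

print(ensemble_label([1, 0, 1], None, mode='majority_vote'))           # 1
print(ensemble_label(None, [[0.9, 0.1], [0.2, 0.8]], mode='average'))  # 0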
def MatchURLTraces(d, m):
    dtraces = sorted(d.traces.items())
    mtraces = sorted(m.traces.items())
    dTrStr = utils.getURLFeatures(dtraces, d)
    mTrStr = utils.getURLFeatures(mtraces, m)
    print("*" * 100)
    print("Desktop features:", len(dTrStr))
    for tr in dTrStr:
        print('\t', tr[0])
    print("Mobile features:", len(mTrStr))
    for tr in mTrStr:
        print('\t', tr[0])
    print("*" * 100)
    x, y = len(dTrStr), len(mTrStr)
    data = numpy.zeros(shape=(x, y))
    for i in range(x):
        for j in range(y):
            df, dStr = dTrStr[i]
            mf, mStr = mTrStr[j]
            dist = distance.edit_distance(dStr, mStr)
            # keep the pair only if the distance is small relative to the longer string
            t = max(len(dStr), len(mStr))
            if dist < URL_THRESH * t:
                print("++", end=' ')
                data[i, j] = dist
            else:
                print("--", end=' ')
                data[i, j] = 100000000
            print(dist, t, dStr, mStr, df, mf)
    print(data)
    # pad the cost matrix to a square so the Munkres algorithm can run on it
    N = max(x, y)
    print("Resizing", x, y, " to ", N)
    matrix = numpy.ones(shape=(N, N)) * 100000000
    matrix[:x, :y] = data
    mwbgm = bipartitematching.Munkres()
    indexes = mwbgm.compute(matrix)
    bipartitematching.print_matrix(matrix, msg='Lowest cost through this matrix:')
    total = 0
    for row, column in indexes:
        if row < x and column < y:
            value = data[row][column]
            if value > 10000000:
                continue
            total += value
            print('(%d, %d) -> %d ' % (row, column, value), end=' ')
            print('==> %s = %s' % (dTrStr[row][0], mTrStr[column][0]))
    print('total cost: %d' % total)
    print("*" * 100)
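# A self-contained sketch of the core idea above: build an edit-distance cost
# matrix between two small feature lists and let the Hungarian (Munkres)
# algorithm pick the cheapest one-to-one assignment. Assumes the `munkres`
# package (pip install munkres) and nltk; the URL lists are illustrative only.
from munkres import Munkres
from nltk.metrics.distance import edit_distance

desktop = ['/home', '/products/list', '/cart/view']
mobile = ['/m/home', '/m/products', '/m/cart']

cost = [[edit_distance(a, b) for b in mobile] for a in desktop]
for row, col in Munkres().compute(cost):
    print('{} <-> {} (cost {})'.format(desktop[row], mobile[col], cost[row][col]))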
for i in toktok.tokenize(s):
    if re.match(r_exp_an, i):
        print("REGEXP test start:\t", end="")
    if i.lower() in stopwords:
        r_oraciones.append(i)
        print('Stopwords test OK:', i)
    elif re.match(r_exp_n, i):
        r_oraciones.append(i)
        print('REGEXP-Numbers test OK:', i)
    else:
        print("Searching for " + str(i) + ": ")
        x = None
        rnk = 3
        for j in datos:
            if (len(i) + 3) >= len(j):
                distancia = edit_distance(i.lower(), j)
                if distancia < rnk:
                    rnk = distancia
                    if rnk == 0:
                        x = j
                        print(i, j, distancia, rnk, x, str(len(i) + 3), str(len(j)),
                              (len(i) + 3) >= len(j))
                        break
                    else:
                        x = j if (rnk < 2) else None
        if x is None:
            print('\tChanged->No')
            r_oraciones.append(i)
        else:
            print('\tChanged->Yes: ' + x)
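# The loop above is essentially a fuzzy dictionary lookup. A minimal
# self-contained version of that idea, assuming nltk; the helper name and
# the `lexicon` contents are illustrative.
from nltk.metrics.distance import edit_distance

def closest_word(token, lexicon, max_dist=2):
    """Return the lexicon entry nearest to `token`, or None if all are too far."""
    best, best_dist = None, max_dist + 1
    for entry in lexicon:
        d = edit_distance(token.lower(), entry)
        if d < best_dist:
            best, best_dist = entry, d
            if d == 0:
                break
    return best

print(closest_word('hous', ['house', 'mouse', 'horse']))  # 'house'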
real_words = set(model)

# generate the test dataset of (correct, misspelling) pairs;
# lines starting with "$" hold the correct spelling, following lines hold
# misspellings of it (the file is assumed to begin with a "$" line)
data = pd.DataFrame(columns=["Correct", "Misspelling"])
alphabet = set('abcdefghijklmnopqrstuvwxyz')
with open("data/misspelling.txt", "r") as f:
    for i in tqdm(f):
        if i[0] == "$":
            correct = i[1:].lower().strip()
        else:
            i = i.lower().strip()
            if (i not in real_words and not (set(i) - alphabet)
                    and not (set(correct) - alphabet)
                    and 0 < edit_distance(correct, i) <= 2):
                data = data.append({'Correct': correct, 'Misspelling': i},
                                   ignore_index=True)
data.to_csv(path_or_buf='data/testdata.txt', sep=' ', index=False, header=False)
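# A quick check of the filter condition above in isolation, assuming nltk;
# the word list and pairs are illustrative. Pairs are kept only when the
# misspelling is not itself a real word and sits within edit distance 1-2
# of the correct form.
from nltk.metrics.distance import edit_distance

real_words = {'because', 'believe'}
pairs = [('because', 'becuase'), ('because', 'because'), ('because', 'xyzzyqq')]
for correct, attempt in pairs:
    keep = attempt not in real_words and 0 < edit_distance(correct, attempt) <= 2
    print(correct, attempt, keep)  # True, False, False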
def get_values(entities, domain):
    _random, bayes_random = {}, {}
    bayes_no_variation, bayes_variation = {}, {}
    siddharthan, deemter = {}, {}
    for _id in entities:
        evaluation = p.load(open(os.path.join(properties.evaluation_dir, _id)))
        for fold in evaluation:
            if fold not in bayes_random:
                # initialize one result dict per system for this fold
                for result in (_random, bayes_random, bayes_no_variation,
                               bayes_variation, siddharthan, deemter):
                    result[fold] = {'y_real': [], 'y_pred': [], 'string': [], 'jaccard': []}
            for item in evaluation[fold]:
                item_domain = get_domain(item['features']['fname'])
                if domain == item_domain or domain == '':
                    string_real = item['real']['reference']
                    string_random = item['random']['reference']
                    string_bayes_random = item['bayes_random']['reference'][0][0]
                    string_bayes_no_variation = item['bayes_no_variation']['reference'][0][0]
                    string_bayes_variation = item['bayes_variation']['reference'][0][0]
                    string_siddharthan = item['siddharthan']['reference']
                    string_deemter = item['deemter']['reference']

                    dist_random = edit_distance(string_random, string_real)
                    dist_bayes_random = edit_distance(string_bayes_random, string_real)
                    dist_bayes_no_variation = edit_distance(string_bayes_no_variation, string_real)
                    dist_bayes_variation = edit_distance(string_bayes_variation, string_real)
                    dist_siddharthan = edit_distance(string_siddharthan, string_real)
                    dist_deemter = edit_distance(string_deemter, string_real)

                    tokens_real = set(nltk.word_tokenize(string_real))
                    tokens_random = set(nltk.word_tokenize(string_random))
                    tokens_bayes_random = set(nltk.word_tokenize(string_bayes_random))
                    tokens_bayes_no_variation = set(nltk.word_tokenize(string_bayes_no_variation))
                    tokens_bayes_variation = set(nltk.word_tokenize(string_bayes_variation))
                    tokens_siddharthan = set(nltk.word_tokenize(string_siddharthan))
                    tokens_deemter = set(nltk.word_tokenize(string_deemter))

                    jaccard_random = jaccard_distance(tokens_random, tokens_real)
                    jaccard_bayes_random = jaccard_distance(tokens_bayes_random, tokens_real)
                    jaccard_bayes_no_variation = jaccard_distance(tokens_bayes_no_variation, tokens_real)
                    jaccard_bayes_variation = jaccard_distance(tokens_bayes_variation, tokens_real)
                    jaccard_siddharthan = jaccard_distance(tokens_siddharthan, tokens_real)
                    jaccard_deemter = jaccard_distance(tokens_deemter, tokens_real)

                    bayes_random[fold]['y_real'].append(item['real']['label'])
                    bayes_random[fold]['y_pred'].append(item['bayes_random']['label'][0])
                    bayes_random[fold]['string'].append(dist_bayes_random)
                    bayes_random[fold]['jaccard'].append(jaccard_bayes_random)

                    bayes_no_variation[fold]['y_real'].append(item['real']['label'])
                    bayes_no_variation[fold]['y_pred'].append(item['bayes_no_variation']['label'][0])
                    bayes_no_variation[fold]['string'].append(dist_bayes_no_variation)
                    bayes_no_variation[fold]['jaccard'].append(jaccard_bayes_no_variation)

                    bayes_variation[fold]['y_real'].append(item['real']['label'])
                    bayes_variation[fold]['y_pred'].append(item['bayes_variation']['label'][0])
                    bayes_variation[fold]['string'].append(dist_bayes_variation)
                    bayes_variation[fold]['jaccard'].append(jaccard_bayes_variation)

                    _random[fold]['y_real'].append(item['real']['label'])
                    _random[fold]['y_pred'].append(item['random']['label'])
                    _random[fold]['string'].append(dist_random)
                    _random[fold]['jaccard'].append(jaccard_random)

                    siddharthan[fold]['y_real'].append(item['real']['label'])
                    siddharthan[fold]['y_pred'].append(item['siddharthan']['label'])
                    siddharthan[fold]['string'].append(dist_siddharthan)
                    siddharthan[fold]['jaccard'].append(jaccard_siddharthan)

                    deemter[fold]['y_real'].append(item['real']['label'])
                    deemter[fold]['y_pred'].append(item['deemter']['label'])
                    deemter[fold]['string'].append(dist_deemter)
                    deemter[fold]['jaccard'].append(jaccard_deemter)
    return _random, bayes_random, bayes_no_variation, bayes_variation, siddharthan, deemter
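# A minimal sketch of the two distances used above, assuming nltk: edit_distance
# compares raw strings character by character, while jaccard_distance compares
# token sets. The example strings are illustrative.
import nltk
from nltk.metrics.distance import edit_distance, jaccard_distance

real = "the red house on the hill"
generated = "a red house on a hill"
print(edit_distance(generated, real))  # character-level Levenshtein distance
# nltk.word_tokenize needs the 'punkt' tokenizer models downloaded
tokens_real = set(nltk.word_tokenize(real))
tokens_gen = set(nltk.word_tokenize(generated))
print(jaccard_distance(tokens_gen, tokens_real))  # 1 - |intersection| / |union|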
def validation(model, criterion, evaluation_loader, converter, opt, eval_data=None):
    """ validation or evaluation """
    for p in model.parameters():
        p.requires_grad = False

    n_correct = 0
    norm_ED = 0
    max_length = opt.batch_max_length
    length_of_data = 0
    infer_time = 0
    valid_loss_avg = Averager()

    if 'Transformer' in opt.SequenceModeling:
        text_pos = torch.arange(1, max_length + 2, dtype=torch.long,
                                device='cuda').expand(evaluation_loader.batch_size, -1)

    for i, (image_tensors, labels) in enumerate(evaluation_loader):
        batch_size = image_tensors.size(0)
        length_of_data = length_of_data + batch_size
        with torch.no_grad():
            image = image_tensors.cuda()
            # for max-length prediction
            length_for_pred = torch.cuda.IntTensor([opt.batch_max_length] * batch_size)
            text_for_pred = torch.cuda.LongTensor(batch_size, opt.batch_max_length + 1).fill_(0)
            if 'Transformer' in opt.SequenceModeling:
                text_for_loss, length_for_loss, text_pos_for_loss = converter.encode(
                    labels, opt.batch_max_length)
            elif 'CTC' in opt.Prediction:
                text_for_loss, length_for_loss = converter.encode(labels)
            else:
                text_for_loss, length_for_loss = converter.encode(labels, opt.batch_max_length)

        start_time = time.time()
        if 'Transformer' in opt.SequenceModeling:
            batch_text_pos = text_pos[:batch_size]
            preds = model(image, text_for_pred, is_train=False, tgt_pos=batch_text_pos)
            forward_time = time.time() - start_time
            preds = preds[:, :text_for_loss.shape[1] - 1, :]
            target = text_for_loss[:, 1:]  # without [GO] symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))
            # select max probability (greedy decoding), then decode index to character
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)
        elif 'CTC' in opt.Prediction:
            preds = model(image, text_for_pred).log_softmax(2)
            forward_time = time.time() - start_time
            # calculate evaluation loss for the CTC decoder
            preds_size = torch.IntTensor([preds.size(1)] * batch_size)
            preds = preds.permute(1, 0, 2)  # to use CTCLoss format
            cost = criterion(preds, text_for_loss, preds_size, length_for_loss)
            # select max probability (greedy decoding), then decode index to character
            _, preds_index = preds.max(2)
            preds_index = preds_index.transpose(1, 0).contiguous().view(-1)
            preds_str = converter.decode(preds_index.data, preds_size.data)
        else:
            preds = model(image, text_for_pred, is_train=False)
            forward_time = time.time() - start_time
            preds = preds[:, :text_for_loss.shape[1] - 1, :]
            target = text_for_loss[:, 1:]  # without [GO] symbol
            cost = criterion(preds.contiguous().view(-1, preds.shape[-1]),
                             target.contiguous().view(-1))
            # select max probability (greedy decoding), then decode index to character
            _, preds_index = preds.max(2)
            preds_str = converter.decode(preds_index, length_for_pred)
            labels = converter.decode(text_for_loss[:, 1:], length_for_loss)

        print('forward_time', forward_time * 1000, 'ms')
        infer_time += forward_time
        valid_loss_avg.add(cost)

        # calculate accuracy and accumulate the normalized edit distance
        for pred, gt in zip(preds_str, labels):
            if 'Transformer' in opt.SequenceModeling:
                pred = pred[:pred.find('</s>')]
                gt = gt[:gt.find('</s>')]
            elif 'Attn' in opt.Prediction:
                # prune after the "end of sentence" token ([s])
                pred = pred[:pred.find('[s]')]
                gt = gt[:gt.find('[s]')]
            if pred == gt:
                n_correct += 1
            norm_ED += edit_distance(pred, gt) / max(1, len(gt))  # guard against empty gt

    accuracy = n_correct / float(length_of_data) * 100
    return valid_loss_avg.val(), accuracy, norm_ED, preds_str, labels, infer_time, length_of_data
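# A minimal sketch of the normalized-edit-distance metric accumulated above,
# assuming nltk; the function name is illustrative. Dividing by the ground-truth
# length makes scores comparable across labels of different lengths.
from nltk.metrics.distance import edit_distance

def normalized_edit_distance(pred, gt):
    """Levenshtein distance scaled by ground-truth length (0 = exact match)."""
    if len(gt) == 0:
        return 0.0 if len(pred) == 0 else 1.0
    return edit_distance(pred, gt) / len(gt)

print(normalized_edit_distance('he1lo', 'hello'))  # 0.2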
def find_correct_case(word, case_mode, structures):
    """Select the best case between a set of already encountered cases

    Parameters:
        word (:func:`str`): Word to correct
        case_mode (int): Choice between lower or upper case (extra choice
            for indecisive cases)
        structures (dict): List of structures needed to perform the choice

    Returns:
        :func:`str` - Corrected word
    """
    variations = {
        key: structures["occurence_map"][key]
        for key in structures["altcase"][word.lower()]
    }
    variations = sorted(variations.items(), key=operator.itemgetter(1), reverse=True)
    tmp_vars = []
    if case_mode == 0:  # upper-case spelling
        for var in variations:
            _word = var[0]
            if _word[0].isupper() and sum(char.isupper() for char in _word) > 2:
                tmp_vars.append(var)
        if len(tmp_vars) == 0:
            tmp_vars = variations
    elif case_mode == 1:  # lower case with capital initial
        for var in variations:
            _word = var[0]
            if _word[0].isupper() and sum(char.isupper() for char in _word) <= 2:
                tmp_vars.append(var)
        if len(tmp_vars) == 0:
            tmp_vars = variations
    else:  # case_mode == -1 (no capital letters found)
        tmp_vars = variations
    max_occ = tmp_vars[0][1]
    dist_vars = {
        term: edit_distance(word, term)
        for term, occ in tmp_vars if occ == max_occ
    }
    if len(dist_vars) == 1:
        return next(iter(dist_vars))
    # several terms with the maximum occurrence count still exist
    dist_vars = sorted(dist_vars.items(), key=operator.itemgetter(1))
    min_dist = dist_vars[0][1]
    min_dist_vars = [term for term, dist in dist_vars if dist == min_dist]
    if len(min_dist_vars) == 1:
        return min_dist_vars[0]
    # several terms with the same Levenshtein distance exist
    term_ascii_code = {term: [ord(ch) for ch in term] for term in min_dist_vars}
    for ascii_code in term_ascii_code.values():
        for i in range(len(ascii_code)):
            code = ascii_code[i]
            # non a-zA-Z chars get a 0 value
            if code < 65 or 90 < code < 97 or code > 122:
                ascii_code[i] = 0
    if case_mode >= 0:
        ascii_val = min(term_ascii_code.values())
    else:
        ascii_val = max(term_ascii_code.values())
    t = [t for t, v in term_ascii_code.items() if v == ascii_val]
    if len(t) > 1:
        raise ValueError("Too many values in final array")
    return t[0]
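# A hypothetical call to find_correct_case, assuming `structures` maps each
# lower-cased word to its case variants seen in a corpus ("altcase") plus
# their occurrence counts ("occurence_map"); all values below are illustrative.
structures = {
    "altcase": {"nasa": ["NASA", "Nasa", "nasa"]},
    "occurence_map": {"NASA": 42, "Nasa": 42, "nasa": 3},
}
print(find_correct_case("nasa", 0, structures))  # expected to pick "NASA"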
if __name__ == '__main__':
    a = ' Herbert Karagan'
    b = 'Karajan hervert wisloW '  # their distance is 2
    q = 'hervert'
    w = 'erberth'
    e = 'Hervert'  # their distance is 3
    y = ' qW rJRs Pin'
    w = 'PiN qw qweqwe lñlqw svkf RHqT'  # note: this reassigns w from above
    j = 'Qw RjrS pInqw'
    serie1 = pd.Series(['qwe we', 'Ana Palacios', 'pedro biescas'])
    serie2 = pd.Series([' we qweR', ' Palacios ewAna ', ' biescas pedro'])
    print(edit_distance(q, w))
    print(edit_distance(q, e))
    print(lev_no_case_sens(q, e))
    print(bow_dist(a, b))
    print(bow_dist(y, w))
    print(bow_dist(y, w, case_sens=True))
    print(bow_dist(y, j))
    print(bow_dist(y, j, case_sens=True))
    print(dist_string_to_series('pedro viehscas', serie2))
    print(dist_series_to_series(serie1, serie2))
    print(dist_series_to_series_paralell(serie1, serie2))
def distance(self, first_word, second_word):
    return edit_distance(first_word, second_word)
def score_prediction(y_true, y_pred):
    """Score predictions on IAM, using Levenshtein distance to calculate
    the character error rate (CER)

    Parameters
    ----------
    y_true: list
        list of ground truth labels
    y_pred: list
        list of predicted labels

    Returns
    -------
    CER: float
        character error rate (mean normalized edit distance)
    WER: float
        word error rate
    """
    words_identified = 0
    total_normalized_distance = 0
    for i in range(len(y_pred)):
        # exact match counts as a correctly identified word
        if y_true[i] == y_pred[i]:
            words_identified += 1
        # edit distance normalized by the longer of the two strings
        levenshtein_distance = edit_distance(y_true[i], y_pred[i])
        n_char = np.maximum(len(y_true[i]), len(y_pred[i]))
        total_normalized_distance += levenshtein_distance / n_char
    CER = float(total_normalized_distance / len(y_true))
    WER = (len(y_pred) - words_identified) / len(y_pred)
    return CER, WER
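# A quick sanity check of score_prediction with illustrative labels: an exact
# match contributes 0 to CER, and each mismatched word raises WER.
y_true = ['hello', 'world']
y_pred = ['hello', 'w0rld']
cer, wer = score_prediction(y_true, y_pred)
print(cer)  # 0.1 -> (0/5 + 1/5) / 2
print(wer)  # 0.5 -> 1 of 2 words wrong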