def is_response_close_enough_using_leveinstein_with_text_separation(
        self, response, expected_response):
    acceptable_levenshtein_threshold = 0.5
    if len(response) < len(expected_response):
        expected_response = self._return_response_in_same_length(
            response, expected_response)
        for res in expected_response:
            if stringdist.levenshtein(
                    response, res) / 33 < acceptable_levenshtein_threshold:
                return True
    elif len(response) > len(expected_response):
        response = self._return_response_in_same_length(
            expected_response, response)
        for res in response:
            if stringdist.levenshtein(
                    expected_response, res) / 33 < acceptable_levenshtein_threshold:
                return True
    else:
        if stringdist.levenshtein(
                response, expected_response) / 33 < acceptable_levenshtein_threshold:
            return True
    return False
def get_city(cities, city_field):
    if '/' in city_field:
        city_field = city_field.split('/')[1]
    city_name = util.remove_accents_in_string(city_field.lower().strip())
    closest_match = None
    closest_distance = 100
    for city in cities:
        db_city_name = util.remove_accents_in_string(
            city['fields']['name'].lower().strip())
        if city['fields']['alias']:
            db_city_alias = util.remove_accents_in_string(
                city['fields']['alias'].lower().strip())
        else:
            db_city_alias = 'this is never ever a city...'
        distance = stringdist.levenshtein(city_name, db_city_name)
        distance_alias = stringdist.levenshtein(city_name, db_city_alias)
        if (distance < closest_distance and distance < 6) or \
                (distance_alias < closest_distance and distance_alias < 6):
            # If the Levenshtein distance is close enough, this will do!
            closest_match = city
            closest_distance = min(distance, distance_alias)
            print('city: ' + db_city_name + ' distance: ' + str(closest_distance))
    return closest_match
def ocr_metrics(pred_texts, gt_texts, lower=True):
    '''
    Takes predicted texts and ground-truth texts as arguments.
    lower: if set, both texts are converted to lowercase before comparison.
    Returns the Character Error Rate (CER), Word Error Rate (WER)
    and Sequence Error Rate (SER).
    '''
    cer, wer, ser = [], [], []
    for pred, gt in zip(pred_texts, gt_texts):
        if lower:
            pred, gt = pred.lower(), gt.lower()

        # CER: edit distance between character sequences
        pred_cer, gt_cer = list(pred), list(gt)
        dist = stringdist.levenshtein(pred_cer, gt_cer)
        cer.append(dist / max(len(pred_cer), len(gt_cer)))

        # WER: edit distance between word sequences
        pred_wer, gt_wer = pred.split(), gt.split()
        dist = stringdist.levenshtein(pred_wer, gt_wer)
        wer.append(dist / max(len(pred_wer), len(gt_wer)))

        # SER: edit distance between whole sequences
        pred_ser, gt_ser = [pred], [gt]
        dist = stringdist.levenshtein(pred_ser, gt_ser)
        ser.append(dist / max(len(pred_ser), len(gt_ser)))

    return np.mean([cer, wer, ser], axis=1)
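# A standalone sketch (not from the project above) of the same per-sample
# arithmetic on plain strings, with made-up prediction and ground-truth text.
# At the word level a positional mismatch count stands in for the edit
# distance over token lists; the two are equivalent here because both
# sentences have the same number of words.
import stringdist

pred, gt = "hallo world", "hello world"
char_dist = stringdist.levenshtein(pred, gt)
print("CER:", char_dist / max(len(pred), len(gt)))  # 1 edit / 11 chars ~ 0.09
word_errors = sum(p != g for p, g in zip(pred.split(), gt.split()))
print("WER:", word_errors / max(len(pred.split()), len(gt.split())))  # 1 / 2 = 0.5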
def sort_displayfile(filename):
    dist_name = stringdist.levenshtein(os.path.splitext(filename)[0],
                                       DISPLAYFILE_PROTOTYPE_NAME)
    dist_ext = stringdist.levenshtein(os.path.splitext(filename)[1][1:],
                                      DISPLAYFILE_PROTOTYPE_EXT)
    status_note(['[displayfile] Distance between names ', DISPLAYFILE_PROTOTYPE_NAME,
                 ' and ', os.path.splitext(filename)[0], ' is ', dist_name],
                d=is_debug)
    status_note(['[displayfile] Distance between extensions ', DISPLAYFILE_PROTOTYPE_EXT,
                 ' and ', os.path.splitext(filename)[1][1:], ' is ', dist_ext],
                d=is_debug)
    status_note(['[displayfile] Combined distance: ', dist_name + dist_ext])
    return dist_name + dist_ext
def bbc(headline):
    headline_list = []
    print("headline: " + headline)

    # Collect and parse the first page of search results
    url = 'https://www.bbc.co.uk/search?q=' + headline
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Pull the result list from the search page
    news_list = soup.find("ol", class_="search-results")

    # Check if there is an error
    if news_list is None:
        print("Error: No article with that date and headline")
        return None
    news_list_items = news_list.find_all('a')

    # Collect the headline text and link of every result
    for news in news_list_items:
        names = news.text
        link = news.attrs['href']
        if len(names) > 0 and names[0] != '\n':
            headline_list.append((names, link))

    # Calculate the Levenshtein distance between the entered headline and the BBC headlines
    l_min = stringdist.levenshtein(headline_list[0][0], headline)
    closest_headline = headline_list[0]
    for i in range(1, len(headline_list)):
        l_value = stringdist.levenshtein(headline_list[i][0], headline)
        if l_value < l_min:
            l_min = l_value
            closest_headline = headline_list[i]

    # Set up the article's content
    article_text = ''

    # Collect and parse the article with the smallest Levenshtein distance
    url = closest_headline[1]
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Pull the date and all paragraphs from the story body
    date = soup.find("div", {"class": "date"}).text
    print("Date: ", date)
    article = soup.find("div", {"class": "story-body__inner"}).findAll('p')
    if article is None:
        print("Error: Content of headline ", closest_headline[0],
              "couldn't be found at", closest_headline[1])
        return None
    for element in article:
        article_text += '\n' + ''.join(element.findAll(text=True))

    result = {'title': closest_headline[0], 'date': date, 'content': article_text}
    return result
def sort_mainfile(filename):
    dist_name = stringdist.levenshtein(os.path.splitext(filename)[0],
                                       MAINFILE_PROTOTYPE_NAME)
    padded_ext = os.path.splitext(filename)[1][1:].zfill(len(MAINFILE_PROTOTYPE_EXT))
    dist_ext = stringdist.levenshtein(padded_ext, MAINFILE_PROTOTYPE_EXT)
    status_note(['[mainfile] Distance between names ', MAINFILE_PROTOTYPE_NAME,
                 ' and ', os.path.splitext(filename)[0], ' is ', dist_name],
                d=is_debug)
    status_note(['[mainfile] Distance between extensions ', MAINFILE_PROTOTYPE_EXT,
                 ' and ', padded_ext, ' is ', dist_ext],
                d=is_debug)
    status_note(['[mainfile] Combined distance: ', dist_name + dist_ext])
    return dist_name + dist_ext
def MaliciusnessAnalysis(self, word_list, data, brand_list, keyword_list):
    check_similary = []
    data['ConsecutiveCharacterRepeat'] = 0
    for word in word_list:
        if word.lower() in dic_BrandNames.keys():
            brand_list.append(word.lower())
        if word.lower() in sensitive_list:
            keyword_list.append(word.lower())
    data['KeywordCount'] = len(keyword_list)
    data['BrandNameCount'] = len(brand_list)

    # Count the distinct target keywords and brand names
    list_key = []
    count_key = 0
    for word in keyword_list:
        if word not in list_key:
            count_key += 1
            list_key.append(word)
    list_brand = []
    count_brand = 0
    for word in brand_list:
        if word not in list_brand:
            count_brand += 1
            list_brand.append(word)
    data['TargetBrandNameCount'] = count_brand
    data['TargetKeywordCount'] = count_key

    for word in word_list:
        if word.lower() not in brand_list and word.lower() not in keyword_list:
            check_similary.append(word.lower())

    # Words within one edit of a brand name
    similar_word_list = []
    for word in check_similary:
        for brand in dic_BrandNames.keys():
            if (stringdist.levenshtein(word, brand) < 2
                    and word not in similar_word_list):
                similar_word_list.append(word)
                data['ConsecutiveCharacterRepeat'] = 1
    data['SimilarBrandNameCount'] = len(similar_word_list)

    # Words within one edit of a sensitive keyword
    for word in check_similary:
        for sens in sensitive_list:
            if (stringdist.levenshtein(word, sens) < 2
                    and word not in similar_word_list):
                # print(word, sens)
                similar_word_list.append(word)
                data['ConsecutiveCharacterRepeat'] = 1
    data['SimilarKeywordCount'] = len(
        similar_word_list) - data['SimilarBrandNameCount']
    return similar_word_list
def getLocations(src, dest):
    '''Matches the user input strings with the best possible locations
    in the database using edit distance.'''
    mindist = 1000
    mindist1 = 1000
    srcname = ""
    destname = ""
    for node in nodes:
        if stringdist.levenshtein(src, node.name) < mindist:
            mindist = stringdist.levenshtein(src, node.name)
            srcname = node.name
        if stringdist.levenshtein(dest, node.name) < mindist1:
            mindist1 = stringdist.levenshtein(dest, node.name)
            destname = node.name
    return srcname, destname
def getKeyWord(rankedPhrases):
    # Start with the maximum possible distance ratio (1)
    min_dist_ratio = 1
    driv = ""
    for driver in drivers:
        indic = drivers.get(driver)
        div = 0
        total_ratio = 0
        for key_val in indic:
            for key_words in rankedPhrases:
                # Get the Levenshtein distance between each pair of words
                # if the row data is not None
                if key_words is not None:
                    dist = stringdist.levenshtein(key_val.lower(), key_words.lower())
                    curr_dist_ratio = dist / longeststring([key_val, key_words])
                    total_ratio += curr_dist_ratio
                    div = div + 1
        total_ratio = total_ratio / div
        if total_ratio < min_dist_ratio:
            min_dist_ratio = total_ratio
            driv = driver
    # The category with the lowest average normalized Levenshtein distance
    # across all key words is used for this specific row.
    if min_dist_ratio < 0.87:
        driv = "Other"
    return driv
def cluster_words(words, thresh=8):
    """Return clusters of words, where a word is added to a cluster when its
    average Levenshtein distance to the cluster is below a threshold.

    Each word is actually a tuple, with the word being the first item, and
    any other data in subsequent items.
    """
    import stringdist

    clusters = []
    for w1 in words:
        placed = False
        for cluster in clusters:
            # Average distance to all words already in the cluster
            ad = sum(
                stringdist.levenshtein(slugify(w1[0]), slugify(w2[0]))
                for w2 in cluster) / float(len(cluster))
            if ad < thresh:
                cluster.add(w1)
                placed = True
                break
            elif any(dmeta_sub(w1[0], w2[0]) < thresh for w2 in cluster):
                cluster.add(w1)
                placed = True
                break
        if not placed:
            clusters.append(set([w1]))

    return clusters
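# A minimal usage sketch for cluster_words(), with made-up word tuples. The
# surrounding project supplies slugify() and dmeta_sub(); the stand-ins below
# are assumptions added only so the sketch is self-contained.
import stringdist

def slugify(s):
    # Stand-in: lowercase and trim (the real helper may normalise more).
    return s.strip().lower()

def dmeta_sub(a, b):
    # Stand-in: plain Levenshtein distance (the real helper is presumably phonetic).
    return stringdist.levenshtein(a, b)

words = [("levenshtein", "id1"), ("levenstein", "id2"), ("ox", "id3")]
for cluster in cluster_words(words, thresh=8):
    print(sorted(cluster))
# The two near-identical spellings land in one cluster; "ox" is more than
# 8 edits from both, so it forms a cluster of its own.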
def is_response_close_enough_using_leveinstein(self, response, expected_response):
    acceptable_levenshtein_threshold = 0.5
    return stringdist.levenshtein(
        response, expected_response) / 33 < acceptable_levenshtein_threshold
def target_detected_call(self, data):
    global on_box
    if data.data != "invalid":
        dist_low = 1000
        corrected_target = "should not appear"
        # Snap the recognised text to the closest known subject
        for subject in subjects:
            dist = stringdist.levenshtein(subject, data.data)
            if dist < dist_low:
                dist_low = dist
                corrected_target = subject
        print "target: ", corrected_target
        self.pub_target.publish(corrected_target)
        self.state_after_speak = "speaking_again"
        self.state = "speaking"
        print "state: ", self.state
        self.pub_speak.publish("We will deliver to " + corrected_target)
        while self.state != "speaking_again":
            1 + 1
        self.last_target_read = corrected_target
    else:
        print "target: ", data.data
        self.pub_target.publish(data.data)
        self.state_after_speak = "roaming"
        self.state = "speaking"
        print "state: ", self.state
        self.pub_speak.publish("Could not read message")
        while self.state != "roaming":
            1 + 1
            # print "hi", self.state
    rospy.sleep(3)
    on_box = "no"
    print "on_box :", on_box
def reply(prompt):
    # cleaning
    prompt = prompt.lower()
    for char in prompt:
        if not char.isalnum() and char != ' ':
            prompt = prompt.replace(char, "")

    # check for exact query matches
    for keys, value in gettuples():
        if prompt in keys:
            return value

    # return with max similarity if max sim > 0.3
    max_value = ["Sorry! I didn't understand that."]
    max_simil = 0.3
    for keys, value in gettuples():
        for key in keys:
            simil = (get_cosine(key, prompt) +
                     SequenceMatcher(None, key, prompt).ratio() +
                     (1 - stringdist.levenshtein(key, prompt) / 15)) / 3
            if simil > max_simil:
                max_value = value
                max_simil = simil
    return max_value
def UMI_correct(UMI):
    """Corrects a UMI by up to one letter."""
    for item in UMI_dict:
        if stringdist.levenshtein(item, UMI) <= 1:
            return item
    # No dictionary entry within one edit of the UMI
    return "BAD"
def rectify(self, word):
    """Speller predictions."""
    # The query is mapped to character n-grams
    char_ngrams_list = self.vectorizer.transform([word]).tocoo().col
    # print('--------')
    # print(word)

    # Calculate the number of n-gram matches for each term
    counter = Counter()
    for token_id in char_ngrams_list:
        for word_id in self.index[token_id]:
            counter[word_id] += 1

    # Search for the nearest term among the selected terms
    closest_word = word
    minimal_distance = 1000

    # Look for a "good" fix in the top matches by n-grams
    for suggest in counter.most_common(n=self.n_candidates):
        suggest_word = self.words_list[suggest[0]]
        # TODO: your code here
        # you can use any libraries and sources except the original texts
        distance = Distancer.levenshtein(word, suggest_word)
        if distance < minimal_distance:
            minimal_distance = distance
            closest_word = suggest_word
    # print(closest_word)
    return closest_word
def forward(self, prediction, target):
    logits = prediction[0]  # prediction is (logits, lengths)
    feature_lengths = prediction[1].int()
    labels = target
    logits = torch.transpose(logits, 0, 1)
    logits = logits.cpu()

    # beam decoder
    output, scores, timesteps, out_seq_len = self.decoder.decode(
        probs=logits, seq_lens=feature_lengths)

    # ------------------ GREEDY DECODE ------------------
    _, max_probs = torch.max(logits, 2)
    strings, offsets = self.greedy_decoder.decode(probs=logits)

    predictions = []
    time_stamps = []
    ls = 0
    for i in range(len(strings)):
        pred = strings[i][0]
        phone_pred = []
        for j in pred:
            phone_pred.append(self.phoneme_list[self.label_map.index(j)])
        predictions.append(phone_pred)
        time_stamps.append(offsets[i][0].float() / 100)

        if target is not None:
            true = "".join(self.label_map[l] for l in labels[i])
            ls += stringdist.levenshtein(strings[i][0], true)

    return predictions, time_stamps, ls / len(strings)
def get_ranked_ontology_matches(cleaned_term):
    '''Get ranked matches from the ontology'''
    ontology_matches = simstring_searcher.ranked_search(
        cleaned_term, SIMILARITY_THRESHOLD)

    # Weight relevant UMLS matches based on word ordering
    weighted_matches = {}
    for ontology_match in ontology_matches:
        # Get the term and CUI from the ontology
        ontology_term = ontology_match[1]
        ontology_cui = term_to_cui[ontology_term]

        # Calculate the Levenshtein distance for ranking
        levenshtein_distance = stringdist.levenshtein(ontology_term, cleaned_term)

        # Construct the match key
        key = ontology_term + ' :: UMLS ' + ontology_cui
        weighted_matches[key] = levenshtein_distance

    # Construct a list of terms ranked by Levenshtein distance
    ranked_matches = [
        ranked_pair[0]
        for ranked_pair in sorted(weighted_matches.items(), key=lambda kv: kv[1])
    ]
    return ranked_matches
def get_doctor_job(input_text, LIST_SPECIALITY_NAME):
    input_text = input_text.strip().lower()
    list_values_lev = [stringdist.levenshtein(input_text, spec)
                       for spec in LIST_SPECIALITY_NAME]
    min_dist = min(list_values_lev)
    if min_dist < 5:
        return True, LIST_SPECIALITY_NAME[list_values_lev.index(min_dist)]
    else:
        return False, LIST_SPECIALITY_NAME[list_values_lev.index(min_dist)]
def cer(words):
    """Character error rate (CER), defined as Levenshtein distance
    normalized by reference word length."""
    val = [
        (0 if gold == norm else stringdist.levenshtein(gold, norm) / len(gold))
        for (gold, norm) in words
    ]
    return val
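# A quick usage sketch for cer(), with made-up (gold, normalised) word pairs.
import stringdist

word_pairs = [("colour", "color"), ("theatre", "theatre"), ("centre", "senter")]
print(cer(word_pairs))
# -> [0.1666..., 0, 0.5]: one edit over a six-character reference, an exact
#    match, and three edits over a six-character reference.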
def closest_string(base_word, word_list):
    lowest_distance = 100
    closest_word = ''
    for word in word_list:
        distance = stringdist.levenshtein(base_word, word)
        if distance <= lowest_distance:
            closest_word = word
            lowest_distance = distance
    return closest_word
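# A quick usage sketch for closest_string(), with an invented candidate list.
import stringdist

print(closest_string("recieve", ["receive", "receipt", "recent"]))
# -> "receive": the transposed letters cost 2 plain-Levenshtein edits,
#    less than the distance to either of the other candidates.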
def compareSimilarity(name, list_name, dicordRate):
    print("my data ", name)
    print("data from the DB ", list_name)
    string_length = len(name)
    value = stringdist.levenshtein(name, list_name)
    print('error rate : ', value)
    if dicordRate >= value / string_length:
        return 1
    else:
        return 2
def sort_mainfile(filename):
    dist_name = stringdist.levenshtein(
        os.path.splitext(filename)[0], MAINFILE_PROTOTYPE_NAME)
    padded_ext = os.path.splitext(filename)[1][1:].zfill(
        len(MAINFILE_PROTOTYPE_EXT))
    dist_ext = stringdist.levenshtein(padded_ext, MAINFILE_PROTOTYPE_EXT)
    help.status_note([
        '[mainfile] Distance between names ', MAINFILE_PROTOTYPE_NAME, ' and ',
        os.path.splitext(filename)[0], ' is ', dist_name
    ], d=is_debug)
    help.status_note([
        '[mainfile] Distance between extensions ', MAINFILE_PROTOTYPE_EXT,
        ' and ', padded_ext, ' is ', dist_ext
    ], d=is_debug)
    help.status_note(['[mainfile] Combined distance: ', dist_name + dist_ext])
    return dist_name + dist_ext
def correct_headers(s):
    distance = 100
    mark = ""
    for k in groundtruth.headers:
        this_distance = stringdist.levenshtein(
            s.replace(" ", "").lower(),
            groundtruth.headers[k].replace(" ", "").lower())
        if this_distance < distance:
            distance = this_distance
            mark = k
    return distance, groundtruth.headers[mark]
def closest_name_match(local_name, group_language, translations):
    closest = float('inf')
    closest_index = -1
    for index, name in translations[group_language].iteritems():
        if "+" not in local_name and "+" in name:
            # Quick fix for people searching for forms without giving the form itself.
            name = name.split("+")[0]
        dst = stringdist.levenshtein(local_name, name)
        if dst < closest:
            closest = dst
            closest_index = index
    return translations[group_language][closest_index]
def is_typo(name0, name1):
    """
    Gets: name0 - a name string, name1 - a name string.
    Returns: True if the Levenshtein distance between the names is <= 1,
    otherwise False.
    """
    return levenshtein(name0, name1) <= 1
def sort_displayfile(filename):
    dist_name = stringdist.levenshtein(
        os.path.splitext(filename)[0], DISPLAYFILE_PROTOTYPE_NAME)
    dist_ext = stringdist.levenshtein(
        os.path.splitext(filename)[1][1:], DISPLAYFILE_PROTOTYPE_EXT)
    help.status_note([
        '[displayfile] Distance between names ', DISPLAYFILE_PROTOTYPE_NAME,
        ' and ', os.path.splitext(filename)[0], ' is ', dist_name
    ], d=is_debug)
    help.status_note([
        '[displayfile] Distance between extensions ', DISPLAYFILE_PROTOTYPE_EXT,
        ' and ', os.path.splitext(filename)[1][1:], ' is ', dist_ext
    ], d=is_debug)
    help.status_note(
        ['[displayfile] Combined distance: ', dist_name + dist_ext])
    return dist_name + dist_ext
def line_data(gt_line, htr_line, htr_index):
    # Remove leading spaces and compress runs of spaces in the line.
    expected = ' '.join(gt_line.split())
    obtained = ' '.join(htr_line.split())

    # The stringdist package's levenshtein_norm() divides by the longer of
    # the two strings, but it is more conventional in OCR papers and software
    # to divide by the length of the reference.
    distance = levenshtein(expected, obtained)
    if len(expected) > 0:
        cer = '{:.2f}'.format(100 * float(distance) / len(expected))
    else:
        cer = '100.00'
    return Line(htr_index, distance, cer, expected, obtained)
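# A small standalone check (not part of the original module) of the CER
# normalisation used above: raw Levenshtein distance divided by the length
# of the ground-truth line, using made-up strings.
import stringdist

gt_line = "the quick brown fox"
htr_line = "the qucik brown fax"
dist = stringdist.levenshtein(gt_line, htr_line)
print(dist, '{:.2f}'.format(100 * float(dist) / len(gt_line)))
# -> 3 15.79  (three character errors over a 19-character reference)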
def handle_cocktail_recipe(req, res, tokens):
    make_words = {
        "рецепт", "коктейль", "приготовить", "сделать", "изготовить", "создать"
    }
    if len(tokens.intersection(make_words)):
        for word in make_words:
            if word in tokens:
                tokens.remove(word)
        if "как" in tokens:
            tokens.remove("как")
        if "с" in tokens or "из" in tokens:
            if "сок" in tokens:
                tokens.remove("сок")
        answer = []
        wth = 'с'
        if wth in tokens or "из" in tokens:
            if wth in tokens:
                tokens.remove(wth)
            if "из" in tokens:
                tokens.remove("из")
            for token in tokens:
                if token in GLOBAL_DATA['INGREDIENTS']:
                    answer += GLOBAL_DATA['INGREDIENTS'][token]
        if "без" in tokens:
            pass
        if len(answer) > 0:
            res['response']['text'] = gen_text_cocktail(answer[0])
            res['response']['buttons'] = get_suggests_cocktails(answer[1:5])
            return True
        else:
            result_list = defaultdict(float)
            for word in GLOBAL_DATA['COCKTAILS_WORDS']:
                for unit in GLOBAL_DATA['COCKTAILS_WORDS'][word]:
                    for token in tokens:
                        score = stringdist.levenshtein(word, token)
                        if score < 1:
                            score = 1
                        if score <= 8:
                            result_list[unit] += (1 / score) ** 2
            sorted_list = sorted(result_list.items(),
                                 key=operator.itemgetter(1), reverse=True)
            if sorted_list and sorted_list[0][1] > 0.25:
                res['response']['text'] = gen_text_cocktail(sorted_list[0][0])
                res['response']['buttons'] = get_suggests_cocktails(
                    x[0] for x in sorted_list[1:5])
                return True
    return False
def getContactDetails(name):
    random.seed(datetime.now())
    File = open('./firstnames.txt')
    firstnames = File.read()
    namesDistance = []
    for firstname in firstnames.splitlines():
        namesDistance.append((stringdist.levenshtein(firstname, name), firstname))
    contact = contacts[randint(0, 42000) % len(contacts)]
    contact['firstname'] = sorted(namesDistance)[0][1]
    return contact
def parse_phrase(input_text, list_of_words, desirable_dist):
    input_text = input_text.strip().lower()
    print(input_text)
    value_of_dist = [
        stringdist.levenshtein(input_text, word) for word in list_of_words
    ]
    print(value_of_dist)
    min_dist = min(value_of_dist)
    print(min_dist)
    print(list_of_words[value_of_dist.index(min_dist)])
    if min_dist <= desirable_dist:
        return True, list_of_words[value_of_dist.index(min_dist)]
    else:
        return False, list_of_words[value_of_dist.index(min_dist)]
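# A hypothetical call to parse_phrase(), with an invented command list and
# threshold, to illustrate the (matched, best_candidate) return value.
commands = ["start", "stop", "status"]
print(parse_phrase("  Strat ", commands, 2))
# -> (True, 'start'): after stripping and lowercasing, "strat" is within
#    2 edits of "start".
print(parse_phrase("help", commands, 2))
# -> (False, 'stop'): even the closest command is 3 edits away.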
def clean(output, map_string):
    map = get_map(map_string)
    result = "".join(
        [symbol for symbol in output if symbol in accepted_symbols])

    # Check the cleaned result against the likely matching location strings
    strings = get_map_location_strings(map)

    # Error check
    matching_list = [(string, stringdist.levenshtein(result, string))
                     for string in strings]
    if matching_list:
        min_match = min(matching_list, key=lambda pairs: pairs[1])
        best_match = min_match[0] if min_match[1] <= 2 and min_match[0] != '' else None
        return best_match
    return None
def parse_name_body(self, address_parts):
    # Stick the remaining parts back together
    # TODO return best matches under a value
    name = ' '.join(address_parts)
    match = [None, 10]
    if name in names:
        return name
    for n in names:
        score = levenshtein(name, n)
        if score < match[1]:
            match = [n, score]
    return match[0]
def check_matches(db_links, id_mp, lvs_max):
    print("check_matches() id_mp: %s, lvs_max: %s" % (id_mp, lvs_max))

    query = "SELECT M.id_matches, M.id_linksbase_1, M.id_linksbase_2, "
    query += "X.id_base, Y.id_base, X.ego_familyname_str, Y.ego_familyname_str "
    query += "FROM links_match.matches as M, "
    query += "links_prematch.links_base as X, "
    query += "links_prematch.links_base as Y "
    query += "WHERE M.id_match_process = %s " % id_mp
    query += "AND X.id_base = id_linksbase_1 "
    query += "AND Y.id_base = id_linksbase_2 "
    query += "ORDER BY id_matches LIMIT 5;"

    if debug:
        print(query)

    resp = db_links.query(query)
    if len(resp) == 0:
        print("No corresponding links_base records found for id_match_process %d" % id_mp)

    for rec in resp:
        # print(str(rec))
        id_matches = rec["id_matches"]
        id_linksbase_1 = rec["id_linksbase_1"]
        id_linksbase_2 = rec["id_linksbase_2"]
        X_id_base = rec["id_base"]
        Y_id_base = rec["Y.id_base"]
        X_ego_familyname_str = rec["ego_familyname_str"]
        Y_ego_familyname_str = rec["Y.ego_familyname_str"]

        lvs = stringdist.levenshtein(X_ego_familyname_str, Y_ego_familyname_str)
        msg = "OK "
        if lvs > lvs_max:
            msg = "ERR"

        print("id_matches: %s, id_linksbase_1&2: %s, %s, lvsd: %2d: %s "
              "ego_familyname_str 1&2: %s, %s" %
              (id_matches, id_linksbase_1, id_linksbase_2, lvs, msg,
               X_ego_familyname_str, Y_ego_familyname_str))
    print("")