def fuzzy_match(words, assoc_dict, assoc_nouns):
    """Attach each word to the category holding its most similar token.

    Every word is scored with ``fuzz.ratio`` against every token of every
    category in ``assoc_dict``. The category of the first token that reaches
    the highest score is remembered, and the word is appended to that
    category's token list and to ``assoc_nouns`` only when the best score
    is strictly above 84.

    Both ``assoc_dict`` and ``assoc_nouns`` are modified in place and are
    also returned as a tuple. A word that has been appended becomes a token
    itself, so it takes part in the scoring of the words that follow it.
    """
    for candidate in words:
        best_score, best_category = 0, ''
        # Lazily score the candidate against every known token, category by
        # category, in dictionary order.
        scored = (
            (fuzz.ratio(candidate, known), category)
            for category, known_tokens in assoc_dict.items()
            for known in known_tokens
        )
        for score, category in scored:
            # Strict comparison: on ties the earliest token keeps the win.
            if score > best_score:
                best_score, best_category = score, category

        if best_category == '' or best_score <= 84:
            continue
        assoc_dict[best_category].append(candidate)
        assoc_nouns.append(candidate)
    return assoc_dict, assoc_nouns
def normalize_company_list(company_list):
    """Map similarly named companies in ``company_list`` to a common name.

    Each name is compared (after stripping surrounding whitespace and
    lower-casing) with every name in the list using ``fuzz.ratio``. All
    entries scoring above 75 against the current name are replaced by the
    value ``make_normalized_name`` returns for that group of matching
    entries.

    Matching always uses the names as they were on entry; replacements
    written to ``company_list`` do not affect later comparisons.

    NOTE(review): this assumes ``make_normalized_name`` accepts a list of
    the matching original names and returns one common name for them --
    confirm against its definition.

    Args:
        company_list: mutable sequence of company name strings. It is
            modified in place.

    Returns:
        The same ``company_list`` object, with matching entries replaced.
        An empty input is returned unchanged.
    """
    # len() rather than truthiness so the guard does not depend on how the
    # container defines its boolean value.
    if len(company_list) == 0:
        return company_list

    # Normalise once up front; the comparisons below reuse these values.
    strip_names = [name.strip().lower() for name in company_list]

    for cur_name in strip_names:
        match_idxs = [
            j for j, other_name in enumerate(strip_names)
            if fuzz.ratio(cur_name, other_name) > 75
        ]
        if not match_idxs:
            continue
        # can improve the make_normalized_name as necessary
        common_name = make_normalized_name(
            [company_list[j] for j in match_idxs]
        )
        # Assign index by index: a plain list cannot be indexed by a list.
        for j in match_idxs:
            company_list[j] = common_name
    return company_list
where freq > 10 and pmi > 20 and entropy >0.1 order by freq desc """); rows = cursor.fetchall() # for d in rows: FINAL_WORDS.add(d['word']) cursor.close() conn.close() ##2 识别相似字符串 ,频率相同,或相差5%以内, 取pmi大5倍以上的 choices = word_set.copy() for w in tqdm(word_set): score_list = [] for b in choices: ratio = fuzzywuzzy.ratio(w, b) if ratio > 0.6: score_list.append((b, ratio)) score_list.sort(key=lambda i:i[1], reverse=True) if len(score_list)<1: continue word = wordbook[w] likely_word = {} for sw, _ in score_list: sword = wordbook[sw] dratio = sword['pmi'] / float(word['pmi']) likely_word[dratio] = sw if len(likely_word)==0: FINAL_WORDS.add(w) else:
def need_to_change_author(self, first, second, setting_ratio):
    """Report whether ``first`` and ``second`` are similar enough.

    Returns ``True`` when ``fuzz.ratio(first, second)`` is greater than or
    equal to ``setting_ratio``, and ``False`` otherwise.
    """
    similarity = fuzz.ratio(first, second)
    return bool(similarity >= setting_ratio)
def testRatioUnicodeString(self):
    """A lone U+00C1 character scores 0 against the ASCII string "ABCD"."""
    accented = "\u00C1"
    ascii_text = "ABCD"
    self.assertEqual(0, fuzzywuzzy.ratio(accented, ascii_text))
def testEmptyStringsScore0(self):
    """Two empty strings score 0 for both ratio and partial_ratio."""
    plain_score = fuzzywuzzy.ratio("", "")
    self.assertEqual(plain_score, 0)

    partial_score = fuzzywuzzy.partial_ratio("", "")
    self.assertEqual(partial_score, 0)
def testCaseInsensitive(self):
    """s1 and s2 match fully only after passing through full_process."""
    raw_score = fuzzywuzzy.ratio(self.s1, self.s2)
    self.assertNotEqual(raw_score, 100)

    processed_first = fuzzywuzzy.full_process(self.s1)
    processed_second = fuzzywuzzy.full_process(self.s2)
    processed_score = fuzzywuzzy.ratio(processed_first, processed_second)
    self.assertEqual(processed_score, 100)
def testEqual(self):
    """The fixtures s1 and s1a must score a full 100 against each other."""
    score = fuzzywuzzy.ratio(self.s1, self.s1a)
    self.assertEqual(score, 100)
dtset2 = dtset2.reset_index(drop=True) dtset1_test = dtset1_test.reset_index(drop=True) dtset2_test = dtset2_test.reset_index(drop=True) while flag: F_threshold += 0.5 F_count = 0 F_dobW = F_threshold F_uniqueNms = [] F_labels = {} for i in range(len(dtset1)): F_labels[i] = -1 for i in range(len(dtset1)): for j in range(len(dtset1)): F_fn_C = (max( (dtset1[i]['fn'].str.len()), (dtset1[j]['fn'].str.len()))) * ( 1 - fuzz.ratio(dtset1[i]['fn'], dtset1[j]['fn']) / 100) F_ln_C = (max( (dtset1[i]['ln'].str.len()), (dtset1[j]['ln'].str.len()))) * ( 1 - fuzz.ratio(dtset1[i]['ln'], dtset1[j]['ln']) / 100) F_dobC = (dtset1[i]['dob'] != dtset1[j]['dob']) * dobW if (F_fn_C + F_ln_C + F_dobC < threshold): if F_labels[j] == -1 and F_labels[i] == -1: F_labels[j] = F_count F_count += 1 F_labels[i] = F_labels[j] else: if F_labels[j] == -1: F_labels[j] = F_labels[i] elif F_labels[i] == -1: F_labels[i] = F_labels[j] else: