def fuzzy_match(words, assoc_dict, assoc_nouns):
    """Attach each word to the category holding its closest fuzzy match.

    Every word is scored with ``fuzz.ratio`` against every token of every
    category in ``assoc_dict``.  When the best score is above 84 and the
    winning category key is not the empty string, the word is appended to
    that category's token list and to ``assoc_nouns``.

    Both containers are modified in place, so a word accepted earlier is
    itself a candidate token for the words that follow.

    Returns the same ``assoc_dict`` and ``assoc_nouns`` objects as a tuple.
    """
    for candidate in words:
        scored = (
            (fuzz.ratio(candidate, token), category)
            for category, tokens in assoc_dict.items()
            for token in tokens
        )
        # max() keeps the first of several equal scores, which is the same
        # winner a strict "greater than" scan would keep.
        best_score, best_category = max(
            scored, key=lambda pair: pair[0], default=(0, '')
        )

        if best_score > 84 and best_category != '':
            assoc_dict[best_category].append(candidate)
            assoc_nouns.append(candidate)

    return assoc_dict, assoc_nouns
Example #2
0
def normalize_company_list(company_list):
    """
    Map companies whose names are close fuzzy matches onto a common name.

    Each name is stripped and lower-cased, then compared with every name
    (itself included) using ``fuzz.ratio``.  The entries scoring above 75
    form a group, and the group is replaced by the result of
    ``make_normalized_name`` applied to it.

    ``company_list`` is modified in place and returned.  An empty input is
    returned unchanged.

    NOTE(review): the grouped read/write ``company_list[match_idxs]`` needs
    a container that accepts a list of positions as an index (for example a
    NumPy array); a plain Python list raises TypeError there -- confirm what
    callers pass and what ``make_normalized_name`` returns.
    """
    strip_names = [n.strip().lower() for n in company_list]

    # Compare against the name of the current iteration; the reference name
    # used to be pinned to the first company for every pass.
    for cur_name in strip_names:
        match_idxs = [
            j for j, next_name in enumerate(strip_names)
            if fuzz.ratio(cur_name, next_name) > 75
        ]

        # can improve the make_normalized_name as necessary
        matched_names = company_list[match_idxs]
        company_list[match_idxs] = make_normalized_name(matched_names)

    return company_list
Example #3
0
def normalize_company_list(company_list):
    """
    Map companies whose names are close fuzzy matches onto a common name.

    Each name is stripped and lower-cased, then compared with every name
    (itself included) using ``fuzz.ratio``.  The entries scoring above 75
    form a group, and the group is replaced by the result of
    ``make_normalized_name`` applied to it.

    ``company_list`` is modified in place and returned.  An empty input is
    returned unchanged.

    NOTE(review): the grouped read/write ``company_list[match_idxs]`` needs
    a container that accepts a list of positions as an index (for example a
    NumPy array); a plain Python list raises TypeError there -- confirm what
    callers pass and what ``make_normalized_name`` returns.
    """
    strip_names = [n.strip().lower() for n in company_list]

    # Compare against the name of the current iteration; the reference name
    # used to be pinned to the first company for every pass.
    for cur_name in strip_names:
        match_idxs = [
            j for j, next_name in enumerate(strip_names)
            if fuzz.ratio(cur_name, next_name) > 75
        ]

        # can improve the make_normalized_name as necessary
        matched_names = company_list[match_idxs]
        company_list[match_idxs] = make_normalized_name(matched_names)

    return company_list
                where freq > 10 and pmi > 20 and entropy >0.1
                order by freq desc
                """);
# Fetch every row of the query result, then release the cursor and the
# connection.
rows = cursor.fetchall()
# for d in rows:  FINAL_WORDS.add(d['word'])
cursor.close()
conn.close()


## 2. Identify similar strings: where the frequency is equal, or differs by
##    5% or less, take the one whose pmi is more than 5 times larger.
# Candidates are a copy of word_set, so every word is also compared with
# itself.
choices = word_set.copy()

for w in tqdm(word_set):
    # Collect (candidate, score) pairs for every candidate above the cut-off.
    score_list = []
    for b in choices:
        ratio = fuzzywuzzy.ratio(w, b)
        # NOTE(review): if `ratio` is on a 0-100 scale, 0.6 admits almost
        # every pair -- confirm the scale this scorer returns.
        if ratio > 0.6:
            score_list.append((b, ratio))
    # Best-scoring candidates first.
    score_list.sort(key=lambda i:i[1], reverse=True)
    
    if len(score_list)<1: continue
    word = wordbook[w]
    
    # Maps (candidate pmi / this word's pmi) -> candidate word; candidates
    # that produce the same quotient overwrite one another.
    likely_word = {}
    for sw, _ in score_list:
        sword = wordbook[sw]
        dratio = sword['pmi'] / float(word['pmi']) 
        likely_word[dratio] = sw
    # NOTE(review): score_list is non-empty here (see the `continue` above),
    # so likely_word always has at least one entry and this test is never
    # true.
    if len(likely_word)==0:
        FINAL_WORDS.add(w)
    else:
 def need_to_change_author(self, first, second, setting_ratio):
     """Return True when ``fuzz.ratio(first, second)`` reaches ``setting_ratio``.

     The comparison is inclusive; anything below the threshold gives False.
     """
     similarity = fuzz.ratio(first, second)
     return True if similarity >= setting_ratio else False
 def testRatioUnicodeString(self):
     # A single accented capital has no character in common with "ABCD",
     # so the expected score is 0.
     accented, plain = "\u00C1", "ABCD"
     self.assertEqual(0, fuzzywuzzy.ratio(accented, plain))
 def testEmptyStringsScore0(self):
     # Two empty strings are expected to score 0 with both scorers.
     empty = ""
     plain_score = fuzzywuzzy.ratio(empty, empty)
     self.assertEqual(plain_score, 0)
     partial_score = fuzzywuzzy.partial_ratio(empty, empty)
     self.assertEqual(partial_score, 0)
 def testCaseInsensitive(self):
     # The raw fixtures must not be a perfect match ...
     raw_score = fuzzywuzzy.ratio(self.s1, self.s2)
     self.assertNotEqual(raw_score, 100)
     # ... but must become one once both have gone through full_process.
     processed_first = fuzzywuzzy.full_process(self.s1)
     processed_second = fuzzywuzzy.full_process(self.s2)
     processed_score = fuzzywuzzy.ratio(processed_first, processed_second)
     self.assertEqual(processed_score, 100)
 def testEqual(self):
     # The s1 and s1a fixtures are expected to be a perfect match.
     score = fuzzywuzzy.ratio(self.s1, self.s1a)
     self.assertEqual(score, 100)
Example #10
0
# Rebuild the index of each dataset with drop=True.
# presumably these are pandas DataFrames, in which case the old index is
# discarded rather than kept as a column -- TODO confirm
dtset2 = dtset2.reset_index(drop=True)
dtset1_test = dtset1_test.reset_index(drop=True)
dtset2_test = dtset2_test.reset_index(drop=True)
# Each pass raises the threshold by 0.5 and recomputes the labels from
# scratch.
while flag:
    F_threshold += 0.5
    F_count = 0
    F_dobW = F_threshold
    F_uniqueNms = []
    F_labels = {}
    # -1 marks a row that has not been given a label yet.
    for i in range(len(dtset1)):
        F_labels[i] = -1
    # Compare every pair (i, j), including i == j and both orderings.
    for i in range(len(dtset1)):
        for j in range(len(dtset1)):
            # First-name cost: the longer length times (1 - ratio / 100).
            F_fn_C = (max(
                (dtset1[i]['fn'].str.len()), (dtset1[j]['fn'].str.len()))) * (
                    1 - fuzz.ratio(dtset1[i]['fn'], dtset1[j]['fn']) / 100)
            # Last-name cost, computed the same way on 'ln'.
            F_ln_C = (max(
                (dtset1[i]['ln'].str.len()), (dtset1[j]['ln'].str.len()))) * (
                    1 - fuzz.ratio(dtset1[i]['ln'], dtset1[j]['ln']) / 100)
            # Date-of-birth cost: dobW when the two values differ, else 0.
            # NOTE(review): this line and the next read `dobW` and
            # `threshold`, while the loop above sets `F_dobW` and
            # `F_threshold`; the un-prefixed names are not assigned in the
            # visible code -- confirm which ones are intended.
            F_dobC = (dtset1[i]['dob'] != dtset1[j]['dob']) * dobW
            if (F_fn_C + F_ln_C + F_dobC < threshold):
                if F_labels[j] == -1 and F_labels[i] == -1:
                    # Neither row is labelled: open a new label for both.
                    F_labels[j] = F_count
                    F_count += 1
                    F_labels[i] = F_labels[j]
                else:
                    # Exactly one row is labelled: copy its label across.
                    if F_labels[j] == -1:
                        F_labels[j] = F_labels[i]
                    elif F_labels[i] == -1:
                        F_labels[i] = F_labels[j]
                    else: