def find_match_levenshtein_metaphone(self, token, canonical):
    """Find dictionary words close to `token` and phonetically identical to it.

    Collects dictionary entries within Levenshtein distance 2 of `token`,
    keeps those whose Metaphone code equals `token`'s, and reports whether
    `canonical` is among them.

    NOTE(review): `token` and the entries of `self.dicts` appear to be byte
    strings (each side is `.decode("utf-8")`-ed before use) — confirm this
    runs under Python 2.

    :returns: (phonetic_matches, is_match)
    """
    best = 2
    close_words = []
    for entry in self.dicts:
        dist = jellyfish.levenshtein_distance(
            token, entry.decode("utf-8").lower())
        # Words accepted under an earlier, looser threshold stay in the
        # list even after `best` tightens (preserves original behaviour).
        if dist <= best:
            best = dist
            close_words.append(entry.lower())
    token_code = jellyfish.metaphone(token.decode("utf-8"))
    phonetic_matches = [
        word for word in close_words
        if jellyfish.metaphone(word.decode("utf-8")) == token_code
    ]
    return phonetic_matches, canonical in phonetic_matches
def get_event_code(self, key, language, dimension="2D", event_type="MT"):
    """Returns EventCode

    :param str key: Movie name
    :param str language: Movie language
    :param dimension: Movie dimension, can be 2D, 2D 4DX, 3D, 3D 4DX, or IMAX 3D
    :type dimension: str
    :param event_type: Event types( MT(Movies), CT(Events), PL(Plays), SP(Sports))
    :type event_type: str
    :return: Event Code
    :rtype: str
    :raises BMSError: If the event code is not found
    """
    listing = self.quickbook(event_type)
    shows = listing['moviesData']['BookMyShow']['arrEvents']
    # Titles are compared by whitespace-stripped Metaphone code, so minor
    # spelling differences in `key` still match.
    key = metaphone(key).replace(' ', '')
    for show in shows:
        if key != metaphone(show['EventTitle']).replace(' ', ''):
            continue
        for child in show['ChildEvents']:
            if (language == child['EventLanguage']
                    and dimension == child['EventDimension']):
                return child['EventCode']
    raise BMSError(
        "Event code not found! Please check the Movie name and other options"
    )
def get_similarity_score(self, str_a, str_b):
    """Phonetic similarity of two name entities, in the range [0, 1].

    Both inputs are transliterated to pinyin, Metaphone-encoded, and
    compared with a custom phone edit distance.

    :param str_a: (String) name entity
    :param str_b: (String) name entity
    :return: (float) similarity score in [0, 1]
    """
    # Note: the keyword is spelled `space_seperated` in get_pinyin's API.
    pin_a = self.get_pinyin("".join(str_a), space_seperated=False)
    pin_b = self.get_pinyin("".join(str_b), space_seperated=False)
    code_a = jellyfish.metaphone(pin_a)
    code_b = jellyfish.metaphone(pin_b)
    dist = self._phone_edit_distance(code_a, code_b)
    # Normalisation constant; zero only when both codes are empty.
    denom = max(len(code_a), len(code_b)) * 4 - abs(len(code_a) - len(code_b))
    if denom == 0:
        return 1 if str_a == str_b else 0
    return 1 - dist / denom
def token_similarity(a, b):
    """Score how well two tokens align, in [-1, 1].

    1.0  exact (case/whitespace-insensitive) match
    -1.0 forbidden pairing (word vs whitespace, word vs punctuation)
    0.9  both punctuation, or an approximate phonetic match
    else scaled Jaro-Winkler distance of the raw words
    """
    left, right = a.word, b.word
    # Case-insensitive match; any whitespace matches any whitespace.
    if left.lower().strip() == right.lower().strip():
        return 1.
    # Words may never map to whitespace (XOR of the two tests).
    if isspace(left) != isspace(right):
        return -1.
    # Punctuation pairs with punctuation only.
    if ispunc(left) and ispunc(right):
        return 0.9
    if ispunc(left) != ispunc(right):
        return -1.
    # Approximate phonetic match, tried in the original precedence order.
    if left.isalpha() and right.isalpha():
        for encode in (jf.metaphone, jf.soundex, jf.nysiis,
                       jf.match_rating_codex):
            if encode(left) == encode(right):
                return 0.9
    return jf.jaro_winkler(left, right)
def phonetic_similarity(ref, result):
    """Jaro-Winkler similarity between the Metaphone encodings of two strings.

    :param ref: reference string
    :param result: candidate string to compare against `ref`
    :return: float in [0, 1]
    """
    # Fix: the original assigned `targetTmp = result` / `refTmp = ref` and
    # immediately overwrote both — the dead stores are removed.
    ref_code = jellyfish.metaphone(ref)
    target_code = jellyfish.metaphone(result)
    return jellyfish.jaro_winkler(ref_code, target_code)
def similarity_factor(s1, s2):
    """
    Returns float number which corresponds to similarity
    order of two strings s1 and s2
    """
    # Character-level similarities, scaled to [0, 100].
    seq_ratio = difflib.SequenceMatcher(None, s1, s2).ratio() * 100
    unigram = ngram.NGram.compare(s1, s2, N=1) * 100
    partial = fuzz.partial_ratio(s1, s2)
    # Phonetic similarities via Jaccard distance over encodings.
    meta = (1 - distance.jaccard(jellyfish.metaphone(unicode(s1)).lower(),
                                 jellyfish.metaphone(unicode(s2)).lower())) * 100
    sound = (1 - distance.jaccard(jellyfish.soundex(unicode(s1)).lower(),
                                  jellyfish.soundex(unicode(s2)).lower())) * 100
    # Include the soundex score only when it exceeds the character-level mean.
    if mean([seq_ratio, unigram, partial]) < sound:
        return mean([seq_ratio, unigram, partial, sound, meta])
    return mean([seq_ratio, unigram, partial, meta])
def compare_for_seniority_finding(s1, s2):
    """
    Returns True when s2 is similar to s1 according to a combination of
    fuzzy and phonetic measures. s1 - main string, s2 - string from list
    for comparison.
    """
    partial = fuzz.partial_ratio(s1, s2)
    meta = (1 - distance.jaccard(jellyfish.metaphone(unicode(s1)).lower(),
                                 jellyfish.metaphone(unicode(s2)).lower())) * 100
    sound = (1 - distance.jaccard(jellyfish.soundex(unicode(s1)).lower(),
                                  jellyfish.soundex(unicode(s2)).lower())) * 100
    mrc = (1 - distance.jaccard(jellyfish.match_rating_codex(unicode(s1)).lower(),
                                jellyfish.match_rating_codex(unicode(s2)).lower())) * 100
    # All four thresholds must hold simultaneously.
    return partial >= 50 and sound > 70 and meta > 65 and mrc > 65
def phonetic_encoded_jaro_winkler_sim(ref, result):
    """Jaro-Winkler similarity of the Metaphone encodings of two strings.

    Emits a debug trace of both raw and encoded strings when the
    module-level `Debug` flag is set.
    """
    ref_code = jellyfish.metaphone(ref)
    target_code = jellyfish.metaphone(result)
    if Debug:
        print("Result: \t\t" + result)
        print("Converted result: \t" + target_code)
        print("Ref: \t\t\t" + ref)
        print("Converted ref: \t\t" + ref_code)
        print(
            "------------------------------------------------------------------------------"
        )
    return jellyfish.jaro_winkler(ref_code, target_code)
def correct(self, wrongWord):
    """Return candidate corrections for `wrongWord`, sorted by ScoreRcd order.

    Candidates are gathered from the trigram inverted index (within edit
    distance 2) and the Metaphone inverted index, deduplicated, then scored
    by corpus frequency plus closeness, with a bonus for an exact
    Metaphone match.
    """
    candidates = []
    candidateDistList = []
    # Gather candidates sharing at least one trigram with the misspelling.
    wWTGrams = self.getGrams(wrongWord, SpellChecker.invertMapGram)
    for trigram in wWTGrams:
        if trigram in SpellChecker.invertTriMap:
            addList = []
            tmpList = SpellChecker.invertTriMap[trigram]
            for tmp in tmpList:
                ed = self.compED(tmp, wrongWord)
                # Only keep near misses (edit distance <= 2).
                if ed <= 2:
                    addList.append(tmp)
            candidates = candidates + addList
    # Soundex-based candidate gathering was tried and disabled.
    #soundexHash = jellyfish.soundex(wrongWord)
    #if soundexHash in SpellChecker.invertSoundexMap:
    #    candidates = candidates + SpellChecker.invertSoundexMap[soundexHash]
    #candidates = list(set(candidates))
    # Add every dictionary word with the same Metaphone code, then dedupe.
    metaHash = jellyfish.metaphone(wrongWord)
    if metaHash in SpellChecker.invertMetaMap:
        candidates = candidates + SpellChecker.invertMetaMap[metaHash]
    candidates = list(set(candidates))
    #print (len(candidates))
    for candidate in candidates:
        # Skip candidates whose length differs too much, and the word itself.
        if abs(len(candidate) - len(wrongWord)) > 2:
            continue
        if wrongWord == candidate:
            continue
        ed = self.compED(candidate, wrongWord)
        # NOTE(review): `jd` and `gd` are computed but never used below.
        jd = jellyfish.jaro_distance(wrongWord, candidate)
        gd = self.getJackSim(
            self.getGrams(candidate, SpellChecker.jackardGram),
            self.getGrams(wrongWord, SpellChecker.jackardGram))
        # Score = relative corpus frequency + (max length - edit distance);
        # higher is better.
        score = float(SpellChecker.dictCountMap[candidate]) / float(
            SpellChecker.totalCount) + (
                max(len(candidate), len(wrongWord)) - ed)
        # Small bonus for an exact phonetic match.
        if jellyfish.metaphone(wrongWord) == jellyfish.metaphone(
                candidate):
            score = score + 0.1
        # Alternative phonetic bonuses, currently disabled:
        #if jellyfish.soundex(wrongWord) == jellyfish.soundex(candidate):
        #    score = score+0.1
        #if jellyfish.nysiis(wrongWord) == jellyfish.nysiis(candidate):
        #    score = score+0.1
        #if jellyfish.match_rating_codex(wrongWord) == jellyfish.match_rating_codex(candidate):
        #    score = score+0.1
        tmpCandidate = ScoreRcd(candidate, ed, score)
        candidateDistList.append(tmpCandidate)
    # Relies on ScoreRcd defining its own comparison/sort order.
    candidateDistList.sort()
    return candidateDistList
def phonetic_encoded_jaccard_sim(str1, str2):
    """Jaccard similarity of the Metaphone encodings of two strings.

    Prints a debug trace of raw and encoded inputs when the module-level
    `Debug` flag is set.
    """
    ref_code = jellyfish.metaphone(str1)
    target_code = jellyfish.metaphone(str2)
    if Debug:
        print("Result: \t\t" + str2)
        print("Converted result: \t" + target_code)
        print("Ref: \t\t\t" + str1)
        print("Converted ref: \t\t" + ref_code)
        print(
            "------------------------------------------------------------------------------"
        )
    return jaccard_sim(ref_code, target_code)
def phonetic_similarity(ref, result):
    """Jaro-Winkler similarity (rounded to 5 places) between the Metaphone
    encodings of `ref` and `result`.

    Prints a debug trace when the module-level `Debug` flag is set.
    """
    # Fix: removed the dead stores `targetTmp = result` / `refTmp = ref`
    # that were immediately overwritten in the original.
    ref_code = jellyfish.metaphone(ref)
    target_code = jellyfish.metaphone(result)
    if Debug:
        print("Result: \t\t" + result)
        print("Converted result: \t" + target_code)
        print("Ref: \t\t\t" + ref)
        print("Converted ref: \t\t" + ref_code)
        print(
            "------------------------------------------------------------------------------"
        )
    return round(jellyfish.jaro_winkler(ref_code, target_code), 5)
def similarity_factor(s1, s2):
    """
    Returns float number which corresponds to similarity
    order of two strings s1 and s2
    """
    char_scores = [
        difflib.SequenceMatcher(None, s1, s2).ratio() * 100,
        ngram.NGram.compare(s1, s2, N=1) * 100,
        fuzz.partial_ratio(s1, s2),
    ]
    meta_score = (1 - distance.jaccard(
        jellyfish.metaphone(unicode(s1)).lower(),
        jellyfish.metaphone(unicode(s2)).lower())) * 100
    sound_score = (1 - distance.jaccard(
        jellyfish.soundex(unicode(s1)).lower(),
        jellyfish.soundex(unicode(s2)).lower())) * 100
    # The soundex score only joins the average when it beats the
    # character-level mean.
    if mean(char_scores) < sound_score:
        return mean(char_scores + [sound_score, meta_score])
    return mean(char_scores + [meta_score])
def compare_for_seniority_finding(s1, s2):
    """
    Returns True when s2 is similar to s1 under all four measures
    (fuzzy partial ratio plus three phonetic Jaccard scores).
    s1 - main string, s2 - string from list for comparison.
    """
    u1, u2 = unicode(s1), unicode(s2)
    partial = fuzz.partial_ratio(s1, s2)
    meta = (1 - distance.jaccard(jellyfish.metaphone(u1).lower(),
                                 jellyfish.metaphone(u2).lower())) * 100
    sound = (1 - distance.jaccard(jellyfish.soundex(u1).lower(),
                                  jellyfish.soundex(u2).lower())) * 100
    mrc = (1 - distance.jaccard(jellyfish.match_rating_codex(u1).lower(),
                                jellyfish.match_rating_codex(u2).lower())) * 100
    return partial >= 50 and sound > 70 and meta > 65 and mrc > 65
def string_match(x, y):
    """True when two UTF-8 byte strings share the same Metaphone encoding.

    NOTE(review): inputs are decoded with `.decode('utf-8')`, which implies
    byte-string (Python 2 style) callers — confirm.

    :return: bool
    """
    # Fix: collapsed the `if ...: return True / return False` idiom into a
    # direct boolean return.
    return jf.metaphone(x.decode('utf-8')) == jf.metaphone(y.decode('utf-8'))
def __init__(self):
    """Load the frequency dictionary and build the class-level inverted
    indexes (trigram, Soundex, and Metaphone) used for candidate lookup.
    """
    # `readDitionary` spelling preserved: it is the existing API name.
    SpellChecker.dictCountMap = self.readDitionary(
        '../data/count_1w100k.txt')
    for key in SpellChecker.dictCountMap:
        SpellChecker.totalCount += SpellChecker.dictCountMap[key]
    for word in SpellChecker.dictCountMap:
        # Trigram inverted index: gram -> [words containing it].
        for tgram in self.getGrams(word, SpellChecker.invertMapGram):
            SpellChecker.invertTriMap.setdefault(tgram, []).append(word)
        # Phonetic inverted indexes: code -> [words sharing the code].
        SpellChecker.invertSoundexMap.setdefault(
            jellyfish.soundex(word), []).append(word)
        SpellChecker.invertMetaMap.setdefault(
            jellyfish.metaphone(word), []).append(word)
def simple_example():
    """Print a tour of jellyfish's string-comparison and phonetic APIs."""
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'
    comparisons = [
        ('levenshtein_distance', jellyfish.levenshtein_distance),
        ('damerau_levenshtein_distance', jellyfish.damerau_levenshtein_distance),
        ('hamming_distance', jellyfish.hamming_distance),
        ('jaro_distance', jellyfish.jaro_distance),
        ('jaro_similarity', jellyfish.jaro_similarity),
        ('jaro_winkler', jellyfish.jaro_winkler),
        ('jaro_winkler_similarity', jellyfish.jaro_winkler_similarity),
        ('match_rating_comparison', jellyfish.match_rating_comparison),
    ]
    for fname, func in comparisons:
        print("jellyfish.{}({}, {}) = {}.".format(
            fname, str1, str2, func(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'
    encoders = [
        ('metaphone', jellyfish.metaphone),
        ('soundex', jellyfish.soundex),
        ('nysiis', jellyfish.nysiis),
        ('match_rating_codex', jellyfish.match_rating_codex),
    ]
    for fname, func in encoders:
        print("jellyfish.{}({}) = {}.".format(fname, ss, func(ss)))
def make_shortcode(self, name):
    """Build an 8-character shortcode from a name.

    The prefix is the first 4 characters of the name's Metaphone code
    (falling back to the raw name when the code is too short); the rest
    is filled from an 8-digit zero-padded random number, with the whole
    thing truncated to 8 characters.
    """
    cleaned = name.upper().replace(" ", "")
    prefix = jellyfish.metaphone(cleaned)[:4]
    if len(prefix) < 4:
        prefix = cleaned[:4]
    suffix = str(random.randint(0, 99999999)).zfill(8)
    return (prefix + suffix)[:8]
def pickle_data():
    """Read the `states` table from data.db, enrich each row, and pickle
    the result to us/states.pkl."""
    conn = sqlite3.connect(os.path.abspath(os.path.join(PWD, 'data.db')))
    conn.row_factory = dict_factory
    cursor = conn.cursor()
    cursor.execute("""SELECT * FROM states ORDER BY name""")
    states = []
    for row in cursor:
        # Precompute the phonetic key used for fuzzy name lookup.
        row['name_metaphone'] = jellyfish.metaphone(row['name'])
        # SQLite stores booleans as 0/1; convert to real bools.
        for flag in ('is_territory', 'is_obsolete',
                     'is_contiguous', 'is_continental'):
            row[flag] = row[flag] == 1
        row['time_zones'] = row['time_zones'].split(',')
        states.append(row)
    pkl_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(states, pkl_file)
def fuzzy(string):
    """Return a JSON response with four phonetic encodings of `string`."""
    encoders = {
        "metaphone": jellyfish.metaphone,
        "soundex": jellyfish.soundex,
        "nysiis": jellyfish.nysiis,
        "match_rating_codex": jellyfish.match_rating_codex,
    }
    return jsonify({name: encode(string) for name, encode in encoders.items()})
def pickle_state_data():
    """Dump the enriched `states` table from data.db into us/states.pkl."""
    dbpath = os.path.abspath(os.path.join(PWD, 'data.db'))
    conn = sqlite3.connect(dbpath)
    conn.row_factory = dict_factory
    cur = conn.cursor()
    cur.execute("""SELECT * FROM states ORDER BY name""")
    states = []
    for record in cur:
        # Phonetic key used for fuzzy state-name lookup.
        record['name_metaphone'] = jellyfish.metaphone(record['name'])
        # Integer flags from SQLite become booleans.
        record['is_territory'] = record['is_territory'] == 1
        record['is_obsolete'] = record['is_obsolete'] == 1
        record['is_contiguous'] = record['is_contiguous'] == 1
        record['is_continental'] = record['is_continental'] == 1
        record['time_zones'] = record['time_zones'].split(',')
        states.append(record)
    pkl_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(states, pkl_file)
def metaphone(x):
    """Metaphone encoding of a UTF-8 byte string (Python 2 style input)."""
    return jf.metaphone(x.decode('utf-8'))
def pickle_data():
    """Read the `states` table from data.db and pickle it to us/states.pkl.

    The pickle is written with protocol 2 so the file remains loadable
    from Python 2 even when built under Python 3.
    """
    conn = sqlite3.connect(os.path.abspath(os.path.join(PWD, 'data.db')))
    conn.row_factory = dict_factory
    cursor = conn.cursor()
    cursor.execute("""SELECT * FROM states ORDER BY name""")
    states = []
    for row in cursor:
        row['name_metaphone'] = jellyfish.metaphone(row['name'])
        for flag in ('is_territory', 'is_obsolete',
                     'is_contiguous', 'is_continental'):
            row[flag] = row[flag] == 1
        row['time_zones'] = row['time_zones'].split(',')
        states.append(row)
    pkl_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))
    with open(pkl_path, 'wb') as pkl_file:
        # protocol=2 keeps the .pkl compatible with Python 2 readers.
        pickle.dump(states, pkl_file, protocol=2)
def find_min_dist(lyrics): nonlocal min_dist nonlocal min_dist_idx nonlocal phrase nonlocal idx # Find best match phrase in lyrics min_dist_this_lyrics = 10000 min_dist_start_idx = 0 min_dist_end_idx = 0 lyrics_met = jellyfish.metaphone(lyrics).split(' ') for i in range(0, len(lyrics_met) - len(test_met)): this_lyrics_met = lyrics_met[i:i + len(test_met)] if this_lyrics_met[0] == test_met[0]: dist = jellyfish.levenshtein_distance(''.join(test_met), ''.join(this_lyrics_met)) if dist < min_dist_this_lyrics: min_dist_this_lyrics = dist min_dist_start_idx = i min_dist_end_idx = i + len(test_met) # Check against global min if min_dist_this_lyrics < min_dist: min_dist = min_dist_this_lyrics min_dist_idx = idx phrase = ' '.join(lyrics.split(' ')[min_dist_start_idx:min_dist_end_idx]) # Increment global idx idx += 1
def metaphone():
    """For each misspelled word, find the dictionary entries whose Metaphone
    codes are closest under a custom DP alignment, and write the best
    matches (one line per misspelling) to metaphone_result.txt.

    Relies on module-level `wiki_misspell` (iterable of misspellings) and
    `my_dict` (iterable of dictionary words).
    """
    fw6 = open('metaphone_result.txt', 'w')
    for line in wiki_misspell:
        string = line.strip()
        dis = 100000
        bests = ""
        string_s = jellyfish.metaphone(string)
        for entry in my_dict:
            # NOTE(review): the result of strip() is discarded here; the
            # un-stripped `entry` is encoded below. Stripping happens only
            # when the entry is recorded in `bests`.
            entry.strip()
            entry_s = jellyfish.metaphone(entry)
            len_entry = len(entry_s) + 1
            len_string = len(string_s) + 1
            # DP table over the two Metaphone codes. This is NOT a standard
            # edit distance: matching characters SUBTRACT 1 (rewarding long
            # matching runs and allowing negative scores).
            distance_m = [[0 for i in range(len_string)]
                          for i in range(len_entry)]
            for i in range(0, len_entry):
                distance_m[i][0] = 0
            for i in range(0, len_string):
                distance_m[0][i] = 0
            for i in range(1, len_entry):
                for j in range(1, len_string):
                    if entry_s[i - 1] == string_s[j - 1]:
                        distance_m[i][j] = min(
                            distance_m[i - 1][j - 1] - 1,
                            distance_m[i - 1][j] + 1,
                            distance_m[i][j - 1] + 1,
                        )
                    else:
                        distance_m[i][j] = min(
                            distance_m[i - 1][j - 1] + 1,
                            distance_m[i - 1][j] + 1,
                            distance_m[i][j - 1] + 1,
                        )
            tem_dis = distance_m[len_entry - 1][len_string - 1]
            if tem_dis < dis:
                # New best score: reset the result list to this entry.
                dis = tem_dis
                bests = " "
                bests = entry.strip()
            elif tem_dis == dis:
                # Tie: accumulate all equally-good entries.
                bests += ' ' + entry.strip()
        print(dis, string, bests)
        fw6.write(bests + '\n')
    fw6.close()
def _word_similarity_score(a, b):
    """Score how well two words align, in [0, 1].

    Ordered rules: exact match (1.0) > case/space-insensitive match (0.95)
    > whitespace/punctuation compatibility checks > ampersand exception
    (0.85) > no-alpha match (0.85) > phonetic matches (0.9) > scaled
    Jaro-Winkler fallback.
    """
    if a == b:
        return 1.
    # Case and whitespace insensitive comparison.
    if a.lower().strip() == b.lower().strip():
        return 0.95
    # Whitespace may only pair with whitespace (XOR of the two tests).
    if _isspace(a) != _isspace(b):
        return 0
    # Ampersand/"and" is an allowed punctuation exception.
    if _match_ampersand(a, b):
        return 0.85
    # Punctuation pairs with punctuation only.
    if _ispunc(a) and _ispunc(b):
        return 0.95
    if _ispunc(a) != _ispunc(b):
        return 0
    # Strip to alphabetic characters: phonetic functions can segfault on
    # empty strings, and all-non-alpha tokens (e.g. line numbers) should
    # match each other.
    a_alpha = u''.join([ch for ch in a if ch.isalpha()])
    b_alpha = u''.join([ch for ch in b if ch.isalpha()])
    if a_alpha == '' and b_alpha == '':
        return 0.85
    # Approximate phonetic matches, in the original precedence order.
    if jf.match_rating_comparison(a_alpha, b_alpha):
        return 0.9
    for encode in (jf.metaphone, jf.soundex, jf.nysiis):
        if encode(a_alpha) == encode(b_alpha):
            return 0.9
    # Use scaled Jaro-Winkler distance on the raw words.
    return jf.jaro_winkler(a, b)
def get_ethnicity_list(input_list):
    """Map each name in `input_list` to an ethnicity label.

    Looks up the Metaphone code of each name in the module-level
    `D4name_ethnicity_meta` dict; names with no entry map to 'other'.

    :param input_list: iterable of name strings
    :return: list of ethnicity labels, parallel to `input_list`
    """
    output_list = []
    for name in input_list:
        meta_key = jellyfish.metaphone(unicode(name))
        # Fix: `dict.has_key` is deprecated and removed in Python 3;
        # the `in` operator is equivalent on both 2 and 3.
        if meta_key in D4name_ethnicity_meta:
            output_list.append(D4name_ethnicity_meta[meta_key])
        else:
            output_list.append('other')
    return output_list
def phonetic(addressline):
    """Create a Metaphone representation of an address or partial address.

    Tokens that START with a digit (house numbers etc.) are kept verbatim;
    all other tokens are Metaphone-encoded. The pieces are concatenated
    with no separator.

    :param addressline: address string
    :return: concatenated phonetic string
    """
    # Fix: regex patterns are now raw strings — '\s' and '\d' are invalid
    # escape sequences in ordinary string literals (DeprecationWarning,
    # then SyntaxError in future Python versions).
    words = re.split(r'\s+', addressline)
    phonetics = []
    for word in words:
        if re.match(r'\d', word):
            phonetics.append(word)
        else:
            phonetics.append(jellyfish.metaphone(word))
    return ''.join(phonetics)
def process_stop_words(text):
    """Metaphone-encode the non-stop-words of `text`.

    Digit tokens are kept as-is (lowercased); stop words (per the
    module-level `stop_words_set`) are dropped; everything else is
    replaced by its Metaphone code.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered.isdigit():
            kept.append(lowered)
        elif token and lowered not in stop_words_set:
            kept.append(jf.metaphone(lowered))
    return ' '.join(kept)
def scoring(self, suggestion, phrase):
    """Score a spelling suggestion against the query phrase.

    Higher internal `total` means a better suggestion; the value is
    negated before returning (callers sort ascending). Distance buckets
    dominate, corpus frequency nudges, rare words are penalised, and an
    exact Metaphone match earns a bonus.
    """
    total = 0
    # Edit-distance buckets (the conditions are mutually exclusive).
    if suggestion.distance == 0:
        total += 2000
    if suggestion.suggest_rule == SuggestRule.PREFIX:
        total += 500
    if suggestion.distance == 1:
        total += 300
    if suggestion.distance == 2:
        total += 100
    if suggestion.distance > 2:
        total += (100 - suggestion.distance * 10)
    # Tiny frequency nudge; heavy penalty for rare words.
    total += suggestion.count / 100000000
    if suggestion.count < 100000:
        total -= 10000000 / suggestion.count
    # Bonus when the suggestion sounds identical to the phrase.
    if jellyfish.metaphone(suggestion.term) == jellyfish.metaphone(phrase):
        total += 50
    print(str(suggestion) + "score is : " + str(total))
    return total * -1
def __init__(self, plainEntity):
    """
    Instantiates a new encoded entity object.

    Requires 'plainEntity': plain text entity to encode. Byte strings are
    decoded from UTF-8 first (Python 2 `str`/`unicode` handling).
    """
    text = plainEntity
    if isinstance(text, str):
        text = unicode(text, 'utf-8')
    self.plain = text
    self.encoded = jellyfish.metaphone(text)
def __init__(self, **kwargs):
    """Copy all keyword arguments onto the instance, then best-effort
    derive `name_metaphone` from the `name` attribute via jellyfish.
    """
    for key, value in kwargs.items():
        self.__dict__[key] = value
    try:
        import jellyfish
        self.__dict__["name_metaphone"] = jellyfish.metaphone(
            self.__dict__["name"])
    except Exception:
        # Fix: narrowed the bare `except:` so KeyboardInterrupt/SystemExit
        # propagate. Still deliberately best-effort — jellyfish may be
        # missing (ImportError) or `name` absent (KeyError).
        pass
def process_stop_words(text):
    """Return `text` with stop words removed and remaining words replaced
    by Metaphone codes; purely-numeric tokens are kept (lowercased).
    """
    pieces = []
    for word in text.split():
        lower_word = word.lower()
        if lower_word.isdigit():
            pieces.append(lower_word)
            continue
        # Non-digit token: keep only if it is not a stop word.
        if word and lower_word not in stop_words_set:
            pieces.append(jf.metaphone(lower_word))
    return " ".join(pieces)
def measure_string_distance(s1, s2, method):
    '''
    Compare two strings with one of five methods, selected by code:

    1. Jaro-Winkler distance            -> float in [0, 1]
    2. Damerau-Levenshtein (normalised) -> float in [0, 1]
    3. Metaphone equality               -> 1 (match) or 0
    4. NYSIIS equality                  -> 1 (match) or 0
    5. match_rating_codex equality      -> 1 (match) or 0

    Either input being empty short-circuits to 0.
    '''
    result = 0
    if s1 == '' or s2 == '':
        return result
    if method == 1:
        result = jellyfish.jaro_winkler(s1, s2)
    elif method == 2:
        try:
            diff = jellyfish.damerau_levenshtein_distance(s1, s2)
            # NOTE(review): true division assumed (Python 3); under
            # Python 2 this would be integer division — confirm runtime.
            result = 1 - (diff / max(len(s1), len(s2)))
        except Exception:
            # Fix: narrowed the bare `except:`; still falls back to 0 when
            # jellyfish rejects the input.
            result = 0
    elif method == 3:
        result = 1 if jellyfish.metaphone(s1) == jellyfish.metaphone(s2) else 0
    elif method == 4:
        result = 1 if jellyfish.nysiis(s1) == jellyfish.nysiis(s2) else 0
    elif method == 5:
        result = 1 if jellyfish.match_rating_codex(
            s1) == jellyfish.match_rating_codex(s2) else 0
    return result
def process(self, term):
    """Return the best spelling candidate for `term`, or `term` itself.

    Candidates are scored by a weighted mix of phonetic (Metaphone) and
    literal Levenshtein distance; only candidates sharing `term`'s first
    letter and containing no apostrophe are eligible.
    """
    if term == '':
        return term
    candidates = self.generate_candidates(term)
    if candidates:
        # Lower score == better: 60% phonetic distance, 40% literal.
        scores = [
            (0.6 * levenshtein_distance(metaphone(c), metaphone(term))
             + 0.4 * levenshtein_distance(c, term), idx)
            for idx, c in enumerate(candidates)
        ]
        min_value = 1000
        min_idx = -1
        for score, idx in scores:
            if (candidates[idx].startswith(term[0])
                    and "'" not in candidates[idx]):
                if score < min_value:
                    min_value = score
                    min_idx = idx
        # BUG FIX: the original assigned `term = candidates[min_idx]`
        # unconditionally, so when no candidate qualified (min_idx == -1)
        # it silently returned the LAST candidate. Keep the input instead.
        if min_idx >= 0:
            term = candidates[min_idx]
    return term
def bestcandidate(wrd):
    """Return a replacement candidate for `wrd` from the Brown word
    clusters, or the string 'No' when nothing suitable is found.

    Cluster members must pass the spell checker (`chant.check`) and lie
    within edit distance 2 literally or distance 1 phonetically. The LAST
    accepted candidate is returned; any failure (missing cluster, empty
    candidate list, ...) yields 'No'.
    """
    accepted = []
    try:
        # Walk every cluster that contains the input word.
        for rec in bcluster._word[wrd]:
            cluster_id = rec['cluster']
            for member in bcluster._cluster[cluster_id]:
                candidate = member['word']
                lev = jellyfish.levenshtein_distance(wrd, candidate)
                code_in = jellyfish.metaphone(wrd)
                code_cand = jellyfish.metaphone(candidate)
                if chant.check(candidate):
                    # Filter to a tight literal or phonetic distance.
                    if lev <= 2 or jellyfish.levenshtein_distance(
                            code_in, code_cand) <= 1:
                        accepted.append((candidate, member['count']))
        # IndexError on an empty list is swallowed below -> 'No'.
        return accepted[-1][0]
    except Exception:
        return 'No'
def featurize(df):
    """Add string- and phonetic-similarity feature columns to `df`.

    Expects the first two columns to be the string pair; a third column,
    if present, is treated as the label. Adds fuzzy-ratio, IPA, scaled
    Levenshtein, four binary phonetic-equality features, and one column
    per algorithm in the module-level `algos`/`algo_names` lists.
    """
    # Normalise column names: 3 cols -> a/b/target, 2 -> a/b, otherwise
    # rename the first two columns in place.
    if len(df.columns) == 3:
        df.columns = ['a', 'b', 'target']
    elif len(df.columns) == 2:
        df.columns = ['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b'})
    # ASCII-fold and strip everything but letters, lowercased.
    df['TM_A'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['a']).lower()), axis=1)
    df['TM_B'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['b']).lower()), axis=1)
    # FuzzyWuzzy ratio features.
    df['partial'] = df.apply(lambda row: fuzz.partial_ratio(row.TM_A, row.TM_B), axis=1)
    df['tkn_sort'] = df.apply(lambda row: fuzz.token_sort_ratio(row.TM_A, row.TM_B), axis=1)
    df['tkn_set'] = df.apply(lambda row: fuzz.token_set_ratio(row.TM_A, row.TM_B), axis=1)
    df['sum_ipa'] = df.apply(lambda row: sum_ipa(row.TM_A, row.TM_B), axis=1)
    # Jellyfish levenshtein
    df['levenshtein'] = df.apply(lambda row: jellyfish.levenshtein_distance(row.TM_A, row.TM_B), axis=1)
    # Scale Levenshtein column to [0, 1] across this dataframe.
    scaler = MinMaxScaler()
    df['levenshtein'] = scaler.fit_transform(df['levenshtein'].values.reshape(-1, 1))
    # Jellyfish phoneme equality features (1 = same encoding, 0 = different).
    df['metaphone'] = df.apply(
        lambda row: 1 if jellyfish.metaphone(row.TM_A) == jellyfish.metaphone(row.TM_B) else 0, axis=1)
    df['nysiis'] = df.apply(
        lambda row: 1 if jellyfish.nysiis(row.TM_A) == jellyfish.nysiis(row.TM_B) else 0, axis=1)
    df['mtch_rtng_cdx'] = df.apply(
        lambda row: 1 if jellyfish.match_rating_codex(row.TM_A) == jellyfish.match_rating_codex(row.TM_B) else 0, axis=1)
    df['pshp_soundex_first'] = df.apply(
        lambda row: 1 if pshp_soundex_first.encode(row.TM_A) == pshp_soundex_first.encode(row.TM_B) else 0, axis=1)
    # One similarity column per configured algorithm object.
    for i, algo in enumerate(algos):
        df[algo_names[i]] = df.apply(lambda row: algo.sim(row.TM_A, row.TM_B), axis=1)
    return df
def test_metaphone(self):
    """Known Metaphone encodings, including an accented-input case."""
    samples = [
        ("metaphone", 'MTFN'),
        ("wHErE", "WR"),
        ("shell", "XL"),
        ("this is a difficult string", "0S IS A TFKLT STRNK"),
        ("aeromancy", "ERMNS"),
        ("Antidisestablishmentarianism", "ANTTSSTBLXMNTRNSM"),
        ("sunlight labs", "SNLT LBS"),
        ("sonlite laabz", "SNLT LBS"),
        (u"Çáŕẗéř", "KRTR"),
    ]
    for word, expected in samples:
        self.assertEqual(jellyfish.metaphone(word), expected)
def lookup(val, field=None, use_cache=True):
    """
    Semi-fuzzy state lookup.

    With no `field`, the lookup key is inferred: two digits match a FIPS
    code, two letters a state abbreviation, and anything else is matched
    against the Metaphone of state names (tolerating misspelled but
    phonetically accurate names). Passing `field` instead performs an
    exact, case-sensitive comparison against that attribute.

    Non-None results are cached; pass `use_cache=False` to bypass the
    cache. Returns None when nothing matches.
    """
    import jellyfish

    if field is None:
        # Infer the attribute to match from the shape of `val`.
        if FIPS_RE.match(val):
            field = 'fips'
        elif ABBR_RE.match(val):
            val = val.upper()
            field = 'abbr'
        else:
            val = jellyfish.metaphone(val)
            field = 'name_metaphone'

    cache_key = "%s:%s" % (field, val)
    if use_cache and cache_key in _lookup_cache:
        return _lookup_cache[cache_key]

    hit = next((state for state in STATES_AND_TERRITORIES
                if getattr(state, field) == val), None)
    if hit is not None:
        # Only successful lookups are cached (misses stay uncached).
        _lookup_cache[cache_key] = hit
        return hit
# Damerau-Levenshtein Distance # Jaro Distance # Jaro-Winkler Distance # Match Rating Approach Comparison # Hamming Distance # Phonetic encoding: # American Soundex # Metaphone # NYSIIS (New York State Identification and Intelligence System) # Match Rating Codex import jellyfish print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish')) # 2; 编辑距离 print(jellyfish.jaro_distance('jellyfish', 'smellyfish')) # 0.89629629629629637 print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')) # 1; 编辑距离, 带翻转的 print(jellyfish.metaphone('Jellyfish')) # 'JLFX' print(jellyfish.soundex('Jellyfish')) # 'J412' print(jellyfish.nysiis('Jellyfish')) # 'JALYF' print(jellyfish.match_rating_codex('Jellyfish')) # 'JLLFSH' ################################################################## ## Lenvenshtein import Levenshtein print(Levenshtein.hamming('hello', 'helol')) # 2; 计算汉明距离; 要求 str1 和 str2 必须长度一致; 是描述两个等长字串之间对应位置上不同字符的个数 print(Levenshtein.distance('hello', 'helol')) # 2; 计算编辑距离(也成 Levenshtein 距离); 是描述由一个字串转化成另一个字串最少的操作次数, 在其中的操作包括插入 & 删除 & 替换 print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf')) # 5 print(Levenshtein.ratio('hello', 'helol')) # 0.8; 计算莱文斯坦比; 计算公式 r = (sum - ldist) / sum, 其中 sum 是指 str1 和 str2 字串的长度总和, ldist 是类编辑距离 # 注意: 这里的类编辑距离不是 2 中所说的编辑距离, 2 中三种操作中每个操作+1, 而在此处, 删除、插入依然+1, 但是替换+2 # 这样设计的目的: ratio('a', 'c'), sum=2, 按 2 中计算为(2-1)/2 = 0.5,' a','c'没有重合, 显然不合算, 但是替换操作+2, 就可以解决这个问题 print(Levenshtein.jaro('hello', 'helol')) # 0.9333333333333332; 计算 jaro 距离; 用于健康普查 print(Levenshtein.jaro_winkler('hello', 'helol')) # 0.9533333333333333; 计算 Jaro – Winkler 距离
# Python 2 demo of the jellyfish API (print statements, not functions).
# NOTE(review): the whole section is duplicated verbatim below — the code
# runs (and prints) everything twice; left as-is to preserve behaviour.
import jellyfish
print jellyfish.levenshtein_distance('jellyfish', 'smellyfish')  #2
print jellyfish.jaro_distance('jellyfish', 'smellyfish')  #0.89629629629629637
print jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')  #1
print jellyfish.metaphone('Jellyfish')  #'JLFX'
print jellyfish.soundex('Jellyfish')  #'J412'
print jellyfish.nysiis('Jellyfish')  #'JALYF'
print jellyfish.match_rating_codex('Jellyfish')  #'JLLFSH'
# Duplicate of the block above (kept intentionally — see note at top).
import jellyfish
print jellyfish.levenshtein_distance('jellyfish', 'smellyfish')  #2
print jellyfish.jaro_distance('jellyfish', 'smellyfish')  #0.89629629629629637
print jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')  #1
print jellyfish.metaphone('Jellyfish')  #'JLFX'
print jellyfish.soundex('Jellyfish')  #'J412'
print jellyfish.nysiis('Jellyfish')  #'JALYF'
print jellyfish.match_rating_codex('Jellyfish')
def extract_feature(name, standard):
    """
    (string, string) --> [boolean, boolean, boolean, int, int, int,
                          boolean, boolean, boolean, int]

    Extracts ten features comparing a name to its standard form and
    returns them as a list (empty list when either input is empty).
    Python 2 code (`xrange`).

    >>> extract_feature('ARINCK', 'AAFTINK')
    [0,0,0,1,1,1,?, ?, ?, 1]
    """
    if not name or not standard:
        return []
    f_list = []  # features list
    # f1: Boolean -- first 2 letters equal
    f_list.append(name[:2] == standard[:2])
    # f2: Boolean -- last 2 letters equal
    f_list.append(name[-2:] == standard[-2:])
    # f3: Boolean -- lengths equal
    f_list.append(len(name) == len(standard))
    # f4: Int -- absolute difference of lengths
    f_list.append(abs(len(name) - len(standard)))
    # f5: Int -- length of the longest common prefix
    # (if the loop never breaks, i ends at len(name) and i-1 is the full
    # prefix length)
    for i in xrange(1, len(name) + 1):
        if not name[:i] == standard[:i]:
            break
    # print i, name, standard
    f_list.append(i - 1)
    # f6: Int -- length of the longest common suffix
    for i in range(len(name)):
        if not name[-i - 1:] == standard[-i - 1:]:
            break
    f_list.append(i)
    # f7: Boolean -- Soundex codes equal
    import jellyfish
    f_list.append(jellyfish.soundex(name) == jellyfish.soundex(standard))
    # f8: Boolean -- Metaphone codes equal
    f_list.append(jellyfish.metaphone(name) == jellyfish.metaphone(standard))
    # f9: Boolean -- any common double-metaphone code
    # NOTE(review): the `break` only exits the inner loop; the outer loop
    # keeps running after a match is found (harmless, just extra work).
    from preModules import metaphone
    dm_flag = False  # whether the two words share any double-metaphone code
    for dm1 in metaphone.doublemetaphone(name):
        for dm2 in metaphone.doublemetaphone(standard):
            if dm1 and dm2 and dm1 == dm2:
                dm_flag = True
                break
    f_list.append(dm_flag)
    # f10: Int -- length of the longest common substring
    from modules.basic_modules.basic import longest_common_substring
    f_list.append(len(longest_common_substring(name, standard)))
    return f_list
def metaph(list_in_df):
    """Return the Metaphone encoding of every entry in the given
    list/Series (Python 2: entries are coerced with `unicode`)."""
    return [jellyfish.metaphone(unicode(entry)) for entry in list_in_df]
from collections import Counter
import jellyfish

# Input data file
input_datafile = '/Users/MariaAthena/Dropbox/00 Imperial College/1601 Workforce Analytics/Assignments/BS1810_IndividualPart1_EngesaethMaria/Data/D3_patent_data.csv'

# Load data file containing required data to create ethnicity dictionary
ethnicfile = open('/Users/MariaAthena/Dropbox/00 Imperial College/1601 Workforce Analytics/Assignments/BS1810_IndividualPart1_EngesaethMaria/Data/D4name_ethnicity.pkl', 'rb')

# Create ethnicity_dict: {'names': 'ethnicity of name'}
ethnicity_dict = pickle.load(ethnicfile)
ethnicfile.close()

# Re-key the dictionary by the Metaphone of each name.
# NOTE(review): this mutates the dict while iterating its keys — safe only
# under Python 2, where .keys() returns a list snapshot; under Python 3 it
# raises RuntimeError. Also, newly inserted phonetic keys could be revisited
# if iteration order reached them — confirm intended runtime is Python 2.
for key in ethnicity_dict.keys():
    phonetic_key = jellyfish.metaphone(unicode(key))
    # replaces phonetic key with old key
    ethnicity_dict[phonetic_key] = ethnicity_dict.pop(key)


# Helper functions

# Calculating the Herfindahl index (sum of squared shares; 1 = fully
# concentrated, 1/n = evenly spread over n categories)
def herfindahl(input_list):
    cntry_cnt = Counter(input_list)
    vals = cntry_cnt.values()
    prob = 0
    for val in vals:
        # float() guards against Python 2 integer division
        prob = prob + (val / float(sum(vals))) ** 2
    return prob
import jellyfish #checking if two words are homophones (not much accurate) x,y = map(str,input("Enter two words : ").split()) if(jellyfish.metaphone(x) == jellyfish.metaphone(y) or jellyfish.soundex(x) == jellyfish.soundex(y)): print("Homophones !") else: print("Not Homophones !") ''' #check difference between two words #returns number of changes print(jellyfish.levenshtein_distance(x,y)) '''
# %% Define helper functions # Herfindal Index def herf(input_list): from collections import Counter counts = Counter(input_list) denom = sum(counts.values()) ans = sum([(x/float(denom))**2 for x in counts.values()]) return ans # %% Step 1 - Read in the data d4 = pd.read_csv("../../data/D4_ethnic_surnames.csv") # %% Step 2 - Convert names to metaphone representations d4_long = pd.melt(d4) d4_long.columns = ['ethnicity', 'name'] d4_long['meta'] = [jellyfish.metaphone(unicode(name)) for name in d4_long.name] # d4_long = d4_long.drop_duplicates() # %% Step 3 - Get patents data d3 = pd.read_csv("../../data/D3_patent_data.csv") # %% Step 4 - Patent lastname ethnicities ## 4.1 Reshape patents data # inventor names d3_inv_names = pd.concat([d3.pnum, d3.lastname.apply(lambda y: pd.Series(y.split(';')))], #d3.cntries.apply(lambda y: pd.Series(y.split(';')))], axis = 1) d3_inv_names_melt = pd.melt(d3_inv_names, id_vars = 'pnum',
def phonetic_match(s1, s2):
    """Return True when the two strings share the same Metaphone encoding,
    i.e. they are considered phonetically identical after processing."""
    encode = jellyfish.metaphone
    return encode(s1) == encode(s2)
def add_blocking_code(blocking_type=2):
    '''(int) -> None

    Populate the ``metaphone`` column of the ``all_persons`` table; this
    column serves as the blocking key when adding new potential matches to
    the carr_match table.

    blocking_type 1: metaphone of the last name only; UPDATE statements are
                     accumulated and flushed in batches of 10,000.
    blocking_type 2: metaphone of "first_name last_name"; one UPDATE per row,
                     with elapsed-time progress printed every 10,000 rows.
    '''
    import time
    if blocking_type == 1:
        import jellyfish
        count = 0
        # Fetch every person's id and last name up front.
        query = 'Select id, last_name from all_persons'
        cur1 = run_query(query)
        ref_list = []
        for row in cur1.fetchall():
            ref_list.append([row[0], row[1]])
        cur1.close()
        query = ''
        for ref in ref_list:
            count += 1
            # Flush the accumulated multi-statement batch every 10,000 rows
            # so the query string stays bounded.
            if count % 10000 == 0:
                if query:
                    cur = run_query(query)
                    cur.fetchall()
                    cur.close()
                    query = ''
            metaphone = jellyfish.metaphone(ref[1])
            # NOTE(review): SQL built by string concatenation -- a metaphone
            # containing a double quote breaks the statement, and this is an
            # injection risk; prefer parameterized queries if run_query
            # supports them (TODO confirm).
            query += 'update all_persons set metaphone = "' + metaphone + '" where id =' + str(ref[0]) + ';'
        # Flush whatever remains in the final partial batch.
        if query:
            cur = run_query(query)
            cur.fetchall()
            cur.close()
            query = ''
    if blocking_type == 2:
        start = time.time()
        import jellyfish
        count = 0
        # importing all names and their ids
        query = 'Select id, concat(first_name, " ", last_name) from all_persons'
        cur1 = run_query(query)
        ref_list = []
        for row in cur1.fetchall():
            ref_list.append([row[0], row[1]])
        cur1.close()
        # HACK: hard-coded resume offset -- presumably the first 4,300,000
        # rows were handled in an earlier run; verify before reusing.
        for ref in ref_list[4300000:]:
            count += 1
            if count % 10000 == 0:
                end = time.time()
                elapsed = end - start
                # Python 2 print statement: progress counter + elapsed seconds.
                print count, elapsed
                # start = time.time()
            metaphone = jellyfish.metaphone(ref[1])
            # NOTE(review): same string-concatenation risk as in type 1 above.
            query = 'update all_persons set metaphone = "' + metaphone + '" where id =' + str(ref[0]) + ';'
            cur = run_query(query)
            cur.fetchall()
            cur.close()
def transform(self, data):
    """Metaphone-encode *data* when it is a string; otherwise return None."""
    if not isinstance(data, basestring):
        return None
    return metaphone(unicode(data))
def loadAuthors(authorfile, printaffilwordfreq=False):
    """Parse the authors CSV into per-author feature dicts keyed by author id.

    :param authorfile: path to a CSV whose header row is skipped; each row is
        read as (id, name, affiliation, ...) -- assumed from the indexing
        below, TODO confirm against the file.
    :param printaffilwordfreq: when True, only word-frequency statistics are
        computed/printed by computeTFIDFs and the function returns None.
    :return: dict {author_id: feature-dict} with parsed name parts, blocking
        keys (iFfL/fFfL/fFiL), metaphone of the full name, IDF weights, and
        TF-IDF vectors for affiliation and full name.
    """
    reader = csv.reader(open(authorfile, 'rb'))
    reader.next()  # skip the header row (Python 2 iterator protocol)
    authors = []
    lastname_cnt = defaultdict(int)
    iFfL_cnt = defaultdict(int)
    affiliations = []
    fullnames = []
    print_err("Parsing names and counts")
    #[^~:_`@\?\\|\'/\"\.\-0-9a-z;,\n\r \+\-\)\}&%\$\*\{\>=\^]
    # Customize nameparser's vocab: drop tokens that are more often real
    # names than honorifics, and add a few extra suffixes.
    titles_c = nameparser.constants.TITLES - set(['wing', 'lord', 'mg', 'mate', 'king', 'sharif', 'sheikh', 'rt', 'lama', 'gen', 'bg', 'baba', 'ab'])
    suffixes_c = nameparser.constants.SUFFIXES | set(['junior', 'senior', 'vii'])
    prefixes_c = nameparser.constants.PREFIXES - set(['bin'])  # more common as first name
    # Maps author id -> row index into affiliations / fullnames lists.
    id2affiliation = {}
    id2fullname = {}
    for i, line in verbose_iter(reader):
        # Transliterate everything after the id column to plain ASCII.
        line[1:] = [unidecode(unicode(cell, 'utf-8')) for cell in line[1:]]
        if line[2]:
            id2affiliation[int(line[0])] = len(affiliations)
            line[2] = strip_punc(line[2].lower())
            affiliations.append(line[2])
        fullnm = strip_punc(line[1].lower().encode('ascii'))
        if fullnm:
            id2fullname[int(line[0])] = len(fullnames)
            fullnames.append(fullnm)
        if printaffilwordfreq:
            continue  # word-frequency mode: skip the expensive name parsing
        hn = HumanName(line[1].replace('-', ' '), titles_c=titles_c, prefixes_c=prefixes_c, suffixes_c=suffixes_c)
        ai = {
            'fullname_joined': hn.full_name,
            'name_title': hn.title,
            'name_first': hn.first,
            'name_middle': hn.middle,
            'name_last': hn.last,
            'name_suffix': hn.suffix
        }
        # Normalize every parsed part: lowercase ASCII, punctuation stripped.
        ai = {k: strip_punc(v.lower().encode('ascii'), space_dashes=False) for k, v in ai.iteritems()}
        ai['name'] = hn.full_name.lower().strip().encode('ascii').translate(None, ';')
        ai['fullname'] = strip_punc(hn.full_name.lower().encode('ascii'))
        ai['fullname_parsed'] = ai['name_first'] + ai['name_middle'] + ai['name_last'] + ai['name_suffix']
        ai['affiliation'] = line[2]
        # Phonetic key of the full name with spaces removed.
        ai['metaphone_fullname'] = jellyfish.metaphone(ai['fullname']).encode('ascii').translate(None, ' ')
        # iFfL blocking key: first initial + full last name, with prefixed
        # fallbacks ('L:', 'F:', 'ID:') so partial names cannot collide with
        # the normal initial+lastname form.
        if ai['name_last']:
            if ai['name_first']:
                ai['iFfL'] = ai['name_first'][0] + ai['name_last']
            else:
                ai['iFfL'] = 'L:' + ai['name_last']
        elif ai['name_first']:
            ai['iFfL'] = 'F:' + ai['name_first']  # use full first name if no last name
        else:
            ai['iFfL'] = 'ID:' + line[0]
        # fFfL / fFiL keys: full first + full last / full first + last initial.
        if ai['name_last'] and ai['name_first']:
            ai['fFfL'] = ai['name_first'] + ai['name_last']
            ai['fFiL'] = ai['name_first'] + ai['name_last'][0]
        else:
            ai['fFfL'] = ai['iFfL']
            ai['fFiL'] = ai['iFfL']
        # Backstops: guarantee the fullname fields are never empty strings.
        if not ai['fullname_joined']:
            ai['fullname_joined'] = 'ID:' + line[0]
        if not ai['fullname']:
            ai['fullname'] = 'ID:' + line[0]
        if not ai['fullname_parsed']:
            ai['fullname_parsed'] = ai['fullname']
        authors.append((int(line[0]), ai))
        lastname_cnt[ai['name_last']] += 1
        iFfL_cnt[ai['iFfL']] += 1
    print_err("Computing TF-IDF of affiliations")
    # min_df = 2 because though we deduct non common words, they should be significant first
    affil_tfidf = computeTFIDFs(affiliations, 'all', min_df=2, words_freq=printaffilwordfreq)
    if printaffilwordfreq:
        print "-----"
    name_tfidf = computeTFIDFs(fullnames, None, min_df=2, ngram_range=(1,3), words_freq=printaffilwordfreq, token_pattern=u'(?u)\\b[a-zA-Z][a-zA-Z]+\\b')
    if printaffilwordfreq:
        return  # frequency-printing mode produces no author dict
    print_err("Calculating IDFs")
    # IDF weight per blocking key: log(N / key frequency).
    iFfL_IDF = dict(zip(iFfL_cnt.keys(), np.log(float(len(authors)) / np.array(iFfL_cnt.values()))))
    lastname_IDF = dict(zip(lastname_cnt.keys(), np.log(float(len(authors)) / np.array(lastname_cnt.values()))))
    print_err("Packing it into a list")
    # Attach the IDF weights and TF-IDF vectors to each author record.
    for i, a in enumerate(authors):
        authors[i][1]['iFfL_idf'] = iFfL_IDF[a[1]['iFfL']]
        authors[i][1]['lastname_idf'] = lastname_IDF[a[1]['name_last']]
        if len(a[1]['affiliation']) == 0:
            authors[i][1]['affil_tfidf'] = None
        else:
            authors[i][1]['affil_tfidf'] = affil_tfidf[id2affiliation[a[0]]]
        if a[0] in id2fullname:
            authors[i][1]['fullname_tfidf'] = name_tfidf[id2fullname[a[0]]]
        else:
            authors[i][1]['fullname_tfidf'] = None
        if (i+1) % 10000 == 0:
            print_err(i+1)  # progress indicator
    authors_dict = dict(authors)
    return authors_dict