def morphologique(a: str, b: str) -> bool:
    """Return True when the two words share the same Soundex code.

    :param a: a word
    :param b: a word
    :return: True if both a and b sound the same.
    """
    code_a = soundex(a)
    code_b = soundex(b)
    return code_a == code_b
def find_match_levenshtein_soundex(self, token, canonical):
    """Find dictionary words close to *token* (Levenshtein distance <= 2)
    that also share its Soundex code, and report whether *canonical* is
    among them.

    :param token: misspelled word (UTF-8 bytes; entries in self.dicts are
        bytes as well — TODO confirm against caller)
    :param canonical: the expected correct word
    :return: (list of phonetically matching candidates, True if canonical
        is one of them)

    Bug fixed: candidates gathered at an earlier, larger best_score were
    kept even after a closer word lowered the threshold; the candidate
    list is now reset whenever a strictly better score is found.
    """
    candidates = []
    best_score = 2
    for word in self.dicts:
        score = jellyfish.levenshtein_distance(
            token, word.decode("utf-8").lower())
        if score < best_score:
            best_score = score
            candidates = [word.lower()]  # drop now-too-distant candidates
        elif score == best_score:
            candidates.append(word.lower())
    # Keep only candidates that also sound like the token.
    token_soundex = jellyfish.soundex(token.decode("utf-8"))
    match_soundex = [
        match for match in candidates
        if jellyfish.soundex(match.decode("utf-8")) == token_soundex
    ]
    is_match = canonical in match_soundex
    return match_soundex, is_match
def match_to_lib(sound, sound_lib):
    """Return the library entry closest to *sound*.

    Falls back to a Soundex ("sounds like") comparison when no literal
    close match exists; returns [] when nothing matches at all.
    """
    hits = get_close_matches(sound, sound_lib, 1)
    if hits:
        return hits[0]
    # No literal match: compare phonetic encodings instead.
    encoded_lib = [soundex(entry) for entry in sound_lib]
    phonetic_hits = get_close_matches(soundex(sound), encoded_lib, 1)
    if not phonetic_hits:
        # Doesn't sound like anything in sound_lib.
        return []
    # Map the matched encoding back to the original library entry.
    return sound_lib[encoded_lib.index(phonetic_hits[0])]
def update_entry(entry: dict) -> dict:
    """Split PaxName into name parts, transliterate each part to Latin
    letters, and attach Soundex keys.

    :param entry: record with a "PaxName" key ("Last First Second" order —
        TODO confirm against data source)
    :return: the same dict, mutated in place.

    Bug fixed: the *_en fields were computed twice — first with
    transliterate(), then overwritten via translit() — so the *_sx Soundex
    keys were derived from values that were immediately discarded. The
    final *_en values are now computed once and the Soundex keys come
    from them.
    """
    name = entry["PaxName"].split()  # Get name.
    entry["PassengerLastName"] = name[0] if len(name) >= 1 else ""
    entry["PassengerFirstName"] = name[1] if len(name) >= 2 else ""
    entry["PassengerSecondName"] = name[2] if len(name) >= 3 else ""
    # Transliterate each part once, then derive its Soundex key.
    for part in ("First", "Second", "Last"):
        source = entry["Passenger{}Name".format(part)]
        latin = translit(source, "ru",
                         reversed=True).replace("'", "").upper()
        entry["Passenger{}Name_en".format(part)] = latin
        entry["Passenger{}Name_sx".format(part)] = soundex(latin)
    return entry
def apply_soundex(misspell, dictionary):
    """For each misspelled word, predict the five dictionary words whose
    Soundex codes are closest by Levenshtein distance.

    :param misspell: iterable of possibly misspelled words
    :param dictionary: iterable of known-correct words
    :return: list parallel to *misspell*; each element is either the word
        itself (already correct, or contains '/') or a list of up to five
        predicted corrections.

    Improvement: jf.soundex(mis_word) is loop-invariant with respect to
    the inner dictionary scan and is now computed once per misspelled
    word instead of once per dictionary entry.
    """
    count = 0
    result = []
    for mis_word in misspell:
        if mis_word in dictionary:
            result.append(mis_word)
        elif '/' in mis_word:
            # do not predict when word contains '/', a lazy method
            result.append(mis_word)
        else:
            soundex_mis = jf.soundex(mis_word)  # hoisted out of the loop
            scored = [
                (dict_word,
                 jf.levenshtein_distance(soundex_mis, jf.soundex(dict_word)))
                for dict_word in dictionary
            ]
            first_five_pred = sorted(
                scored, key=operator.itemgetter(1))[:5]
            result.append([word for word, _ in first_five_pred])
        count += 1
        print("Processing: {} / {}".format(count, len(misspell)), end='\r')
    return result
def token_similarity(a, b):
    """Score how well token *a* aligns with token *b*.

    Returns 1.0 for case/whitespace-insensitive equality, -1.0 for
    word-vs-whitespace or word-vs-punctuation pairs, 0.9 for matching
    punctuation or any phonetic-code agreement, otherwise the
    Jaro-Winkler similarity of the two words.
    """
    left, right = a.word, b.word
    # Strings are a case insensitive match; any whitespace matches any
    # whitespace.
    if left.lower().strip() == right.lower().strip():
        return 1.
    # Make it impossible for words to map to whitespace.
    if bool(isspace(left)) != bool(isspace(right)):
        return -1.
    # Punctuation maps only to punctuation.
    if ispunc(left) and ispunc(right):
        return 0.9
    if bool(ispunc(left)) != bool(ispunc(right)):
        return -1.
    # Strings sound alike (approximate phonetic match).
    if left.isalpha() and right.isalpha():
        for encode in (jf.metaphone, jf.soundex, jf.nysiis,
                       jf.match_rating_codex):
            if encode(left) == encode(right):
                return 0.9
    # Use scaled Jaro-Winkler distance.
    return jf.jaro_winkler(left, right)
def remove_similar_sounds(l,r):
    """Cancel tokens whose Soundex codes match across the two lists
    (a multiset difference on phonetic signatures) and return the sorted
    leftovers as a (new_l, new_r) tuple.
    """
    l_sig = []
    r_sig = []
    l_map = defaultdict(lambda:[])
    r_map = defaultdict(lambda:[])
    # Debug guard: caller is expected to pass equal-length lists.
    if len(l)!=len(r):
        pdb.set_trace()
    # Record each left token's Soundex code and bucket tokens by code.
    for tok in l:
        sig = jellyfish.soundex(tok)
        l_sig.append(sig)
        l_map[sig].append(tok)
    # Each right token cancels one matching left code; unmatched right
    # tokens are kept.
    for tok in r:
        sig = jellyfish.soundex(tok)
        if sig in l_sig:
            l_sig.remove(sig)
        else:
            r_sig.append(sig)
            r_map[sig].append(tok)
    # Expand the surviving codes back into tokens.
    # NOTE(review): if a code survives more than once in l_sig, every token
    # in its bucket is re-emitted each time — confirm this is intended.
    new_l = []
    for item in l_sig:
        for i in l_map[item]:
            new_l.append(i)
    new_r = []
    for item in r_sig:
        for i in r_map[item]:
            new_r.append(i)
    # Debug guard: the cancellation should keep both sides the same size.
    if len(new_l)!=len(new_r):
        pdb.set_trace()
    return (sorted(new_l),sorted(new_r))
def match_strings(self, string1, string2):
    """Return True when any word of *string1* shares a Soundex code with
    any word of *string2*."""
    sounds = {jellyfish.soundex(word) for word in string2.split()}
    return any(jellyfish.soundex(word) in sounds
               for word in string1.split())
def augment_data(df: pd.DataFrame) -> pd.DataFrame:
    """Augment dataframe of FEBRL person data with blocking keys and
    cleanup for comparison step.

    Args:
        df: pandas dataframe containing FEBRL-generated person data

    Returns:
        Augmented dataframe.

    Bug fixed: the docstring promised the augmented frame but the
    function previously returned None; it now returns df.
    """
    df["surname"] = df["surname"].fillna("")
    df["first_name"] = df["first_name"].fillna("")

    # Soundex phonetic encodings.
    df["soundex_surname"] = df["surname"].apply(jellyfish.soundex)
    df["soundex_firstname"] = df["first_name"].apply(jellyfish.soundex)

    # NYSIIS phonetic encodings.
    df["nysiis_surname"] = df["surname"].apply(jellyfish.nysiis)
    df["nysiis_firstname"] = df["first_name"].apply(jellyfish.nysiis)

    # Last 3 of SSID, zero-padded; missing/falsy ids propagate None.
    df["ssid_last3"] = df["soc_sec_id"].apply(
        lambda x: str(x)[-3:].zfill(3) if x else None)
    df["soc_sec_id"] = df["soc_sec_id"].astype(str)

    # DOB to date object.
    df["dob"] = df["date_of_birth"].apply(dob_to_date)
    return df
def Soundex(word, dict):
    """Return the dictionary tokens (stripped, lowercased) whose Soundex
    code equals that of *word*. Inputs are UTF-8 bytes (Python 2 style).
    """
    target = jellyfish.soundex(word.decode('utf-8'))
    matches = []
    for token in dict:
        stripped = token.strip()
        if jellyfish.soundex(stripped.decode('utf-8')) == target:
            matches.append(stripped.lower())
    return matches
def get_block_key(name1, name2, input_type='REFERENCE'):
    """
    from name1 and name2 generates a blocking key for
    input_type of reference: first name, last_name
    for input_type of document: first_name_last_name, first_name_last_name
    """
    if input_type == 'REFERENCE':
        # Normalize: first whitespace token, lowercased, apostrophes
        # stripped; ISO-8859-1 -> UTF-8 re-encode (Python 2 bytes flow).
        name1 = name1.split()[0].lower().replace("'", "").decode("ISO-8859-1").encode('utf8', 'ignore')
        name2 = name2.split()[0].lower().replace("'", "").decode("ISO-8859-1").encode('utf8', 'ignore')
        # Prefix/suffix fragments plus a joint Soundex signature.
        feature_set = {'f3f': name1[:3], 'l2f': name1[-2:],
                       'f3l': name2[:3], 'l2l': name2[-2:],
                       'soundex': jellyfish.soundex(name1) + '_' + \
                                  jellyfish.soundex(name2)}
        # TODO: For now we're not considering the gender! Please check it later!
        block_key = feature_set.get('f3f', '') + '_' + feature_set.get('l2f', '') + '_' + feature_set.get('f3l', '') + '_' + feature_set.get(
            'l2l', '') + '_' + feature_set.get('soundex', '')
    if input_type == 'DOCUMENT':
        # Recurse on the first and last whitespace token of each full
        # name, then sort so the combined key is order-independent.
        name1 = name1.strip().replace(' ', '_')
        name2 = name2.strip().replace(' ', '_')
        blocks = sorted([get_block_key(name1.split('_')[0], name1.split('_')[-1]),
                         get_block_key(name2.split('_')[0], name2.split('_')[-1])])
        block_key = '_'.join(blocks).decode('utf-8', 'ignore')
    return block_key
def get_soundex(self, word1, word2, old_word):
    """Return word2 if it sounds like word1 (same Soundex code),
    else old_word if it does, else word1 itself.

    Improvement: jellyfish.soundex(word1) was computed twice; it is now
    computed once and reused.
    """
    code1 = jellyfish.soundex(word1)
    if code1 == jellyfish.soundex(word2):
        return word2
    if code1 == jellyfish.soundex(old_word):
        return old_word
    return word1
def check_street(name):
    """Classify *name* as "street_name" or "other".

    A name containing any integer token, or ending in "null", is "other".
    Otherwise it is a street name when any token matches the street-type
    vocabulary literally, or phonetically by Soundex.

    Bugs fixed: the phonetic fallback compared soundex(code) — the Soundex
    of an already-encoded code — against the raw lowercase words, so it
    could never match as intended; it now compares each input token's
    Soundex code against the vocabulary's codes. The bare except is
    narrowed to ValueError (the only error int() raises here).
    """
    tokens = [part.lower() for part in name.strip().split(" ")]
    # Any numeric token disqualifies the name.
    for item in tokens:
        try:
            int(item)
            return "other"
        except ValueError:
            continue
    if tokens[-1] == "null":
        return "other"
    street_words = [
        'boulevard', 'parkway', 'east', 'west', 'street', 'avenue',
        'lane', 'place', 'road', 'broadway', 'beach', 'drive', 'trail',
        'circle', 'promenade', 'transit', 'park', 'highway',
        'expressway', 'parkway', 'overpass', 'tunnel', 'slip', 'bridge',
        'exit', 'loop', 'court', 'ramp', 'alley', 'entrance', 'heights',
        'oval'
    ]
    # Literal vocabulary hit.
    if any(word in tokens for word in street_words):
        return "street_name"
    # Phonetic hit: token's Soundex code appears among vocabulary codes.
    street_codes = {soundex(word) for word in street_words}
    if any(soundex(token) in street_codes for token in tokens):
        return "street_name"
    return "other"
def shortword(sen):
    """Normalize shortened/noisy words in sentence *sen*.

    Pipeline per token: expand known slang via the external ``dictionary``
    module; keep tokens containing digits verbatim; keep correctly spelled
    tokens (enchant ``d.check``); otherwise substitute the first spelling
    suggestion whose Soundex code is close to the token's. The result is
    appended to ../data_sets/shortword.txt and returned.

    NOTE(review): indentation reconstructed from a collapsed source line —
    verify block structure against the original file.
    """
    if "'re" in sen:
        sen=sen.replace("'re"," are")
    #sent=pytypo.correct_sentence(sen)
    spl=sen.split()
    s=''
    number=['0','1','2','3','4','5','6','7','8','9']
    for string in spl:
        count=0
        nd=0   # NOTE(review): nd, h and al appear unused below
        z=0
        h=0
        val=0
        al=len(string)
        # if not d.check(string) and string[-1]!='z':
        #     string=pytypo.correct_sentence(string)
        # if al>1:
        #     if string[-1]==string[-2] and not d.check(string):
        #         string=pytypo.cut_repeat(string,1)
        # 1) Known abbreviation: emit its dictionary expansion.
        for name,v in dictionary.dic.iteritems():
            for a in v:
                if string==a:
                    count+=1
                    s+=name+' '
                    z=1
                    break
        if z:
            continue
        # 2) Tokens containing any digit are kept verbatim.
        for num in number:
            if num in string:
                s+=string+' '
                val+=1
                break
        if val==1:
            continue
        # 3) Correctly spelled tokens are kept.
        if d.check(string):
            s+=string+' '
            continue
        else:
            # 4) Otherwise take the first suggestion with the same Soundex
            #    initial, a digit part within 5, and greater length.
            try:
                a=jellyfish.soundex(unicode(string))
                x=int(a[1:])
                z=d.suggest(string)
                for line in z:
                    if d.check(line):
                        b=jellyfish.soundex(unicode(line))
                        y=int(b[1:])
                        if (a[0]==b[0])and(abs(x-y))<=5 and len(line)>len(string):
                            s+=line+' '
                            count+=1
                            break
            except UnicodeDecodeError:
                ex=0
    # Append the normalized sentence to the output corpus.
    f = open("../data_sets/shortword.txt",'a')
    f.write(str(s)+"\n");
    return s
def match_strings(self, string1, string2):
    """True when some word pair across the two strings shares a Soundex
    code; False otherwise."""
    for first in string1.split():
        first_code = jellyfish.soundex(first)
        if any(jellyfish.soundex(second) == first_code
               for second in string2.split()):
            return True
    return False
def compare_pred(x, y):
    """Graded equality: 1.0 for exact match, 0.5 for near match
    (edit distance <= 1 and identical Soundex code), else 0.0."""
    if x == y:
        return 1.
    near = editdistance.eval(x, y) <= 1
    if near and jellyfish.soundex(x) == jellyfish.soundex(y):
        return 0.5
    return 0.
def similarity_factor(s1, s2):
    """
    Returns float number which corresponds to similarity
    order of two strings s1 and s2
    """
    # Non-phonetic scores, each scaled to 0-100.
    diffl = difflib.SequenceMatcher(None, s1, s2).ratio() * 100
    ng = ngram.NGram.compare(s1, s2, N=1) * 100
    fpr = fuzz.partial_ratio(s1, s2)
    # Phonetic scores: Jaccard agreement of Metaphone / Soundex codes.
    u1, u2 = unicode(s1), unicode(s2)
    jac_metaphone = (1 - distance.jaccard(
        jellyfish.metaphone(u1).lower(),
        jellyfish.metaphone(u2).lower())) * 100
    jac_soundex = (1 - distance.jaccard(
        jellyfish.soundex(u1).lower(),
        jellyfish.soundex(u2).lower())) * 100
    # Include the Soundex term only when it beats the non-phonetic mean.
    base = [diffl, ng, fpr]
    if mean(base) < jac_soundex:
        return mean(base + [jac_soundex, jac_metaphone])
    return mean(base + [jac_metaphone])
def soundexSimilarity(self, s1, s2):
    """Fraction of positions at which the Soundex codes of *s1* and *s2*
    agree, normalized by the longer code's length; 0 when encoding fails.

    Fixes: the bare ``except`` is narrowed to ``Exception`` (still
    covers jellyfish encoding errors without swallowing SystemExit /
    KeyboardInterrupt), and an empty code pair no longer raises
    ZeroDivisionError — it returns 0.
    """
    try:
        sdx1 = jf.soundex(s1)
        sdx2 = jf.soundex(s2)
    except Exception:
        return 0
    longest = max(len(sdx1), len(sdx2))
    if longest == 0:
        return 0
    return sum(1 if a == b else 0 for a, b in zip(sdx1, sdx2)) / longest
def compare_for_seniority_finding(s1, s2):
    """
    Returns the input word if it is similar (according to corresponding
    algorithms) to some another word.
    s1 - main string, s2 - string from list for comparison
    """
    # Fuzzy partial-ratio similarity (0-100).
    fpr = fuzz.partial_ratio(s1, s2)
    # Jaccard agreement (0-100) of Metaphone, Soundex and Match Rating
    # codes of the two strings.
    jac_metaphone = (1-distance.jaccard(jellyfish.metaphone(unicode(s1)).lower(), jellyfish.metaphone(unicode(s2)).lower()))*100
    jac_soundex = (1-distance.jaccard(jellyfish.soundex(unicode(s1)).lower(), jellyfish.soundex(unicode(s2)).lower()))*100
    jac_mrc = (1-distance.jaccard(jellyfish.match_rating_codex(unicode(s1)).lower(), jellyfish.match_rating_codex(unicode(s2)).lower()))*100
    # All four thresholds must pass for the strings to count as similar.
    return fpr >= 50 and jac_soundex > 70 and jac_metaphone > 65 and jac_mrc > 65
def update_entry(entry: dict) -> dict:
    """Rewrite PassengerBirthDate into ISO YYYY-MM-DD order (assumes the
    incoming value is MM?DD?YYYY by position — TODO confirm source
    format) and attach Soundex keys for the three name parts. Mutates
    and returns *entry*."""
    raw = entry["PassengerBirthDate"]
    entry["PassengerBirthDate"] = "-".join((raw[6:10], raw[0:2], raw[3:5]))
    for part in ("First", "Second", "Last"):
        entry["Passenger%sName_sx" % part] = soundex(
            entry["Passenger%sName" % part])
    return entry
def update_entry(entry: dict) -> dict:
    """Split entry["name"] into last/first/second parts (missing parts
    become "") and attach a Soundex key for each. Mutates and returns
    *entry*."""
    last, first, second = (entry["name"].split() + ["", "", ""])[:3]
    entry["PassengerLastName"] = last
    entry["PassengerFirstName"] = first
    entry["PassengerSecondName"] = second
    entry["PassengerFirstName_sx"] = soundex(first)
    entry["PassengerSecondName_sx"] = soundex(second)
    entry["PassengerLastName_sx"] = soundex(last)
    return entry
def get_best_matched(query_word='', jacard_list=None):
    """Pick the top-ranked Jaccard candidate.

    Accepted outright at score >= 0.40; for 0.30 < score < 0.40 accepted
    only when the Soundex tails (code minus its leading letter) of query
    and candidate agree; otherwise None. '$' markers are stripped from
    the returned word.
    """
    top_word = jacard_list[0][0]
    top_score = jacard_list[0][1]
    if top_score >= 0.40:
        return top_word.replace("$", '')
    if 0.30 < top_score < 0.40:
        query_code = jellyfish.soundex(query_word)
        word_code = jellyfish.soundex(top_word)
        if query_code[1:] == word_code[1:]:
            return top_word.replace("$", '')
        return None
    return None
def similarity_factor(s1, s2):
    """
    Returns float number which corresponds to similarity
    order of two strings s1 and s2
    """
    # Non-phonetic scores, each scaled to 0-100.
    diffl = difflib.SequenceMatcher(None, s1, s2).ratio() * 100
    ng = ngram.NGram.compare(s1, s2, N=1) * 100
    fpr = fuzz.partial_ratio(s1, s2)
    # Phonetic scores: Jaccard agreement of Metaphone / Soundex codes.
    jac_metaphone = (1 - distance.jaccard(
        jellyfish.metaphone(unicode(s1)).lower(),
        jellyfish.metaphone(unicode(s2)).lower())) * 100
    jac_soundex = (1 - distance.jaccard(
        jellyfish.soundex(unicode(s1)).lower(),
        jellyfish.soundex(unicode(s2)).lower())) * 100
    # Include the Soundex term only when it beats the non-phonetic mean.
    return mean([diffl, ng, fpr, jac_soundex, jac_metaphone
                 ]) if mean([diffl, ng, fpr]) < jac_soundex else mean(
                     [diffl, ng, fpr, jac_metaphone])
def soundex():
    """For every misspelled word in the module-level ``wiki_misspell``,
    find the ``my_dict`` entries whose Soundex codes minimize a custom DP
    alignment score, and write the best candidates (space-separated, one
    line per misspelling) to soundex_result.txt.
    """
    fw5 = open('soundex_result.txt', 'w')
    for line in wiki_misspell:
        string = line.strip()
        dis = 100000  # best (lowest) score seen so far
        bests = ""
        string_s = jellyfish.soundex(string)
        for entry in my_dict:
            entry.strip()  # NOTE(review): result discarded — strip() is not in-place
            entry_s = jellyfish.soundex(entry)
            # tem_dis = distance(entry_s, string_s)
            len_entry = len(entry_s) + 1
            len_string = len(string_s) + 1
            # DP table over the two Soundex codes, borders set to 0.
            distance_m = [[0 for i in range(len_string)]
                          for i in range(len_entry)]
            for i in range(0, len_entry):
                distance_m[i][0] = 0
            for i in range(0, len_string):
                distance_m[0][i] = 0
            for i in range(1, len_entry):
                for j in range(1, len_string):
                    if entry_s[i - 1] == string_s[j - 1]:
                        # NOTE(review): the -1 reward on a match makes this
                        # a custom alignment score, not standard edit
                        # distance — confirm intended.
                        distance_m[i][j] = min(
                            distance_m[i - 1][j - 1] - 1,
                            distance_m[i - 1][j] + 1,
                            distance_m[i][j - 1] + 1,
                        )
                    else:
                        distance_m[i][j] = min(
                            distance_m[i - 1][j - 1] + 1,
                            distance_m[i - 1][j] + 1,
                            distance_m[i][j - 1] + 1,
                        )
            tem_dis = distance_m[len_entry - 1][len_string - 1]
            # Track the minimum; ties accumulate space-separated.
            if tem_dis < dis:
                dis = tem_dis
                bests = " "  # NOTE(review): immediately overwritten below
                bests = entry.strip()
            elif tem_dis == dis:
                bests += ' ' + entry.strip()
        print(dis, string, bests)
        fw5.write(bests + '\n')
    fw5.close()
def find_correct_words(word):
    """Return the words from dict.txt (one per line) whose Soundex code
    matches that of *word*; on any error (missing file, encoding issue)
    print it and return the matches gathered so far.

    Improvements: the query word's Soundex code is computed once instead
    of once per dictionary line, and the file-handle variable no longer
    shadows the ``dict`` builtin.
    """
    correct_words = []
    dic_path = "dict.txt"
    try:
        with open(dic_path) as handle:
            word_soundex = jellyfish.soundex(word)  # hoisted out of the loop
            for line in handle:
                dict_word = line.strip()
                if jellyfish.soundex(dict_word) == word_soundex:
                    correct_words.append(dict_word)
    except Exception as e:
        print(e)
    return correct_words
def compare_for_seniority_finding(s1, s2):
    """
    Returns the input word if it is similar (according to corresponding
    algorithms) to some another word.
    s1 - main string, s2 - string from list for comparison
    """
    u1, u2 = unicode(s1), unicode(s2)

    def jac(encoder):
        # Jaccard agreement (0-100) of the two phonetic codes.
        return (1 - distance.jaccard(encoder(u1).lower(),
                                     encoder(u2).lower())) * 100

    fpr = fuzz.partial_ratio(s1, s2)
    jac_metaphone = jac(jellyfish.metaphone)
    jac_soundex = jac(jellyfish.soundex)
    jac_mrc = jac(jellyfish.match_rating_codex)
    # All four thresholds must pass.
    return (fpr >= 50 and jac_soundex > 70 and jac_metaphone > 65
            and jac_mrc > 65)
def extract_block_key(person, gender_names):
    """Build a blocking-key feature dict for *person*.

    Copies identifying fields, infers gender from the first given name
    (via the gender_names lookup) when not supplied, and composes a
    block key from name fragments plus a Soundex signature. When either
    name is missing the key fields are set to empty strings.

    Bug fixed: the guard read ``p['first_name']`` although the parameter
    is named ``person`` (NameError at runtime); it now uses ``person``.
    """
    feature_set = {'id': person['id'],
                   'first_name': person['first_name'].replace("'", ""),
                   'last_name': person['last_name'].replace("'", ""),
                   'role': person['role'],
                   'register_type': person['register_type'],
                   'register_id': person['register_id']}
    if person['gender'] == "male" or person['gender'] == "female":
        feature_set['gender'] = person['gender']
    if person['first_name'] and person['last_name']:
        # Infer gender from the first given name when not provided.
        if not (person['gender'] == "male" or person['gender'] == "female"):
            first_split = person['first_name'].split()[0]
            if first_split in gender_names['male']:
                feature_set['gender'] = "male"
            if first_split in gender_names['female']:
                feature_set['gender'] = "female"
            if first_split not in gender_names['male'] and first_split not in gender_names['female']:
                feature_set['gender'] = "unknown"
        # Prefix/suffix fragments of the first given and last names.
        feature_set['f3f'] = person['first_name'].split()[0][:3].replace("'", "")
        feature_set['l2f'] = person['first_name'].split()[0][-2:].replace("'", "")
        feature_set['f3l'] = person['last_name'].split()[0][:3].replace("'", "")
        feature_set['l2l'] = person['last_name'].split()[0][-2:].replace("'", "")
        feature_set['soundex'] = jellyfish.soundex(person['first_name'].split()[0].replace("'", "")) + '_' + \
                                 jellyfish.soundex(person['last_name'].split()[0].replace("'", ""))
        feature_set['block_key'] = (feature_set.get('gender', '') + '_' +
                                    feature_set.get('f3f', '') + '_' +
                                    feature_set.get('f3l', '') + '_' +
                                    feature_set.get('l2f', '') + '_' +
                                    feature_set.get('l2l', '') + '_' +
                                    feature_set.get('soundex', ''))
    else:
        feature_set['gender'] = ''
        feature_set['soundex'] = ''
        feature_set['block_key'] = ''
        feature_set['f3f'] = ''
        feature_set['f3l'] = ''
        feature_set['l2l'] = ''
        feature_set['l2f'] = ''
    return feature_set
def phonetic(s, method):
    """ Phonetically encode the values in the Series.

    :param method: The algorithm that is used to phonetically encode the
        values. The possible options are 'soundex' (`wikipedia
        <https://en.wikipedia.org/wiki/Soundex>`_) and 'nysiis' (`wikipedia
        <https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System>`_).
    :type method: str

    :return: A Series with phonetic encoded values.
    :rtype: pandas.Series

    .. note::

        The 'soundex' and 'nysiis' algorithms use the package 'jellyfish'.
        It can be installed with pip (``pip install jellyfish``).

    Fixes: a failed ``import jellyfish`` previously only printed a hint
    and then crashed later with NameError; the ImportError is now
    re-raised after the hint. The unknown-method error is a ValueError
    (a subclass of Exception, so existing handlers keep working).
    """
    try:
        import jellyfish
    except ImportError:
        print("Install jellyfish to use string encoding.")
        raise

    # Strip everything except word characters, hyphen and underscore
    # before encoding.
    s = clean(s, replace_by_none='[^\-\_A-Za-z0-9]+')

    if method == 'soundex':
        return s.str.upper().apply(
            lambda x: jellyfish.soundex(x) if pandas.notnull(x) else np.nan)
    elif method == 'nysiis':
        return s.str.upper().apply(
            lambda x: jellyfish.nysiis(x) if pandas.notnull(x) else np.nan)
    else:
        raise ValueError("Phonetic encoding method not found")
def fuzzy(string):
    """Return a JSON response containing four phonetic encodings of
    *string* (Metaphone, Soundex, NYSIIS, Match Rating codex)."""
    encodings = {
        "metaphone": jellyfish.metaphone(string),
        "soundex": jellyfish.soundex(string),
        "nysiis": jellyfish.nysiis(string),
        "match_rating_codex": jellyfish.match_rating_codex(string),
    }
    return jsonify(encodings)
def simple_example():
    """Demo: print the result of each jellyfish string-comparison and
    phonetic-encoding function on fixed sample words."""
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'

    print("jellyfish.levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.levenshtein_distance(str1, str2)))
    print("jellyfish.damerau_levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.damerau_levenshtein_distance(str1, str2)))
    print("jellyfish.hamming_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.hamming_distance(str1, str2)))
    # NOTE(review): both the old names (jaro_distance / jaro_winkler) and
    # the newer *_similarity names are exercised here — presumably to
    # compare the two APIs; confirm the installed jellyfish version
    # provides all four.
    print("jellyfish.jaro_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_distance(str1, str2)))
    print("jellyfish.jaro_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_similarity(str1, str2)))
    print("jellyfish.jaro_winkler({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler(str1, str2)))
    print("jellyfish.jaro_winkler_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler_similarity(str1, str2)))
    print("jellyfish.match_rating_comparison({}, {}) = {}.".format(
        str1, str2, jellyfish.match_rating_comparison(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'

    print("jellyfish.metaphone({}) = {}.".format(ss, jellyfish.metaphone(ss)))
    print("jellyfish.soundex({}) = {}.".format(ss, jellyfish.soundex(ss)))
    print("jellyfish.nysiis({}) = {}.".format(ss, jellyfish.nysiis(ss)))
    print("jellyfish.match_rating_codex({}) = {}.".format(
        ss, jellyfish.match_rating_codex(ss)))
def __init__(self):
    """Load the word-frequency dictionary and build the trigram, Soundex
    and Metaphone inverted indexes used for candidate lookup.

    NOTE(review): state is stored on the SpellChecker class itself, so
    every instance shares (and re-extends) the same maps — confirm this
    is intended.

    Bug fixed: when a word's Metaphone code was unseen, the original code
    reused the ``tmpWordList`` object left over from the Soundex step,
    appending into (and aliasing) a Soundex bucket. Each index now gets
    its own fresh bucket via dict.setdefault.
    """
    SpellChecker.dictCountMap = self.readDitionary(
        '../data/count_1w100k.txt')
    # Total token count, used for probability scoring elsewhere.
    for key in SpellChecker.dictCountMap:
        SpellChecker.totalCount += SpellChecker.dictCountMap[key]
    for word in SpellChecker.dictCountMap:
        # Index the word under each of its n-grams.
        for tgram in self.getGrams(word, SpellChecker.invertMapGram):
            SpellChecker.invertTriMap.setdefault(tgram, []).append(word)
        # Index by Soundex and Metaphone codes (independent buckets).
        SpellChecker.invertSoundexMap.setdefault(
            jellyfish.soundex(word), []).append(word)
        SpellChecker.invertMetaMap.setdefault(
            jellyfish.metaphone(word), []).append(word)
def soundex_mapping(self, word):
    """Map a spoken digit word to its integer 0-9 via its Soundex code;
    unrecognized codes map to 0.

    Fixes: spelling of the progress message ("Performinig"), and the
    bare ``except`` — which hid real errors such as a non-string input —
    is narrowed to the expected KeyError for an unknown Soundex code.
    """
    print("Performing Soundex Mapping")
    # Soundex codes of the spoken digits zero..nine.
    sounds = {"Z600": 0, "O500": 1, "T000": 2, "T600": 3, "F600": 4,
              "F100": 5, "S200": 6, "S150": 7, "E230": 8, "N500": 9}
    try:
        return sounds[jellyfish.soundex(word)]
    except KeyError:
        return 0
def get_match(self, dictList, token):
    """Return (bestMatch, soundex candidates, n-gram ranked candidates)
    for *token*.

    Candidates are the dictionary words sharing the token's Soundex
    code; with several candidates they are re-ranked by n-gram search
    and the top hit wins; a single candidate wins outright; otherwise
    bestMatch is "".
    """
    soundex_token = jellyfish.soundex(token)
    candidates = [entry for entry in dictList
                  if jellyfish.soundex(entry) == soundex_token]
    candidatesGram = []
    bestMatch = ""
    if len(candidates) == 1:
        bestMatch = candidates[0]
    elif len(candidates) > 1:
        candidatesGram = ngram.NGram(candidates).search(token)
        if candidatesGram:
            bestMatch = candidatesGram[0][0]
    return bestMatch, candidates, candidatesGram
def get_most_similar(self, word):
    """Return up to self.closest_neighbours words from the Soundex bucket
    of *word*, ordered by increasing Levenshtein distance to it."""
    bucket = self.phonetic_map[jellyfish.soundex(word)]
    ranked = sorted(
        bucket,
        key=lambda candidate: jellyfish.levenshtein_distance(candidate, word))
    return ranked[:self.closest_neighbours]
def _word_similarity_score(a, b):
    """Score how plausibly word *a* corresponds to word *b*: 1.0 exact,
    0.95 case/whitespace-insensitive or punctuation pair, 0.9 phonetic
    agreement, 0.85 special cases, 0 forbidden pairings, otherwise the
    Jaro-Winkler similarity. Order of checks matters — earlier rules
    short-circuit later ones."""
    if a == b:
        return 1.
    # Case and whitespace insensitive comparison.
    if a.lower().strip() == b.lower().strip():
        return 0.95
    # Penalize whitespace matching to non-whitespace.
    if ((_isspace(a) and not _isspace(b)) or
        (not _isspace(a) and _isspace(b))):
        return 0
    # Exceptions to punctuation.
    if _match_ampersand(a, b):
        return 0.85
    # Penalize punctuation matching to non-punctuation.
    if _ispunc(a) and _ispunc(b):
        return 0.95
    if ((_ispunc(a) and not _ispunc(b)) or
        (not _ispunc(a) and _ispunc(b))):
        return 0
    # Problems with phonetic match functions segfaulting on
    # empty strings. Also beneficial to match strings with
    # no alpha characters to each other (e.g., line numbers).
    a_alpha = u''.join([ c for c in a if c.isalpha() ])
    b_alpha = u''.join([ c for c in b if c.isalpha() ])
    if a_alpha == '' and b_alpha == '':
        return 0.85
    # Strings sound alike (approximate phonetic match).
    if jf.match_rating_comparison(a_alpha, b_alpha):
        return 0.9
    if jf.metaphone(a_alpha) == jf.metaphone(b_alpha):
        return 0.9
    if jf.soundex(a_alpha) == jf.soundex(b_alpha):
        return 0.9
    if jf.nysiis(a_alpha) == jf.nysiis(b_alpha):
        return 0.9
    # Use scaled Jaro-Winkler distance.
    return jf.jaro_winkler(a, b)
def correct(self, wrongWord):
    """Return scored correction candidates for *wrongWord*, sorted.

    Candidates come from the trigram inverted index; each is scored by a
    product of n-gram Jaccard similarity, corpus frequency, and inverse
    edit/Jaro distances, with +0.1 bonuses per phonetic-code agreement.
    """
    candidates = []
    candidateDistList = []
    wWTGrams = self.getGrams(wrongWord, SpellChecker.invertMapGram)
    # Gather every word sharing at least one trigram, de-duplicated.
    for trigram in wWTGrams:
        if trigram in SpellChecker.invertTriMap:
            candidates = candidates + SpellChecker.invertTriMap[trigram]
    candidates = list(set(candidates))
    #print (len(candidates))
    for candidate in candidates:
        # Prune: too different in length, or identical to the input.
        if abs(len(candidate) - len(wrongWord)) > 2:
            continue
        if wrongWord == candidate:
            continue
        ed = self.compED(candidate, wrongWord)
        jd = jellyfish.jaro_distance(wrongWord, candidate)
        gd = self.getJackSim(
            self.getGrams(candidate, SpellChecker.jackardGram),
            self.getGrams(wrongWord, SpellChecker.jackardGram))
        # NOTE(review): jaro_distance is a similarity in [0, 1], so the
        # 1/(jd+1) factor penalises more-similar candidates — confirm
        # this inversion is intended.
        score = gd * SpellChecker.dictCountMap[
            candidate] / SpellChecker.totalCount * (1 / (ed + 1)) * (1 /
                                                                     (jd + 1))
        # Phonetic agreement bonuses (+0.1 each).
        if jellyfish.metaphone(wrongWord) == jellyfish.metaphone(
                candidate):
            score = score + 0.1
        if jellyfish.soundex(wrongWord) == jellyfish.soundex(candidate):
            score = score + 0.1
        if jellyfish.nysiis(wrongWord) == jellyfish.nysiis(candidate):
            score = score + 0.1
        if jellyfish.match_rating_codex(
                wrongWord) == jellyfish.match_rating_codex(candidate):
            score = score + 0.1
        tmpCandidate = ScoreRcd(candidate, ed, score)
        candidateDistList.append(tmpCandidate)
    candidateDistList.sort()
    return candidateDistList
def get_hash(word, hash_type):
    """Return the phonetic encoding of *word* selected by *hash_type*
    (one of "SOUNDEX", "NYSIIS", "MRA", "METAPHONE").

    :raises NotImplementedError: for any other hash_type.
    """
    encoder_names = {
        "SOUNDEX": "soundex",
        "NYSIIS": "nysiis",
        "MRA": "match_rating_codex",
        "METAPHONE": "metaphone",
    }
    if hash_type not in encoder_names:
        raise NotImplementedError(
            "approach '{}' not implemented".format(hash_type))
    return getattr(jellyfish, encoder_names[hash_type])(word)
def test_soundex(self):
    """Spot-check jellyfish.soundex against known codes, including edge
    cases: empty string, single letter, and accented input."""
    expected = [
        ("Washington", "W252"),
        ("Lee", "L000"),
        ("Gutierrez", "G362"),
        ("Pfister", "P236"),
        ("Jackson", "J250"),
        ("Tymczak", "T522"),
        ("", ""),
        ("A", "A000"),
        (u"Çáŕẗéř", "C636"),
    ]
    for name, code in expected:
        self.assertEqual(jellyfish.soundex(name), code)
def transform(self, data):
    """Soundex-encode *data* when it is a string (Python 2 basestring);
    non-strings yield None."""
    if not isinstance(data, basestring):
        return None
    return soundex(unicode(data))
def extract_feature(name, standard):
    """
    (string, string) --> [boolean, boolean, boolean, int, int, int,
                          boolean, boolean, boolean, int]

    extracts various features for each record (name, standard) and
    exports results in form of a list of booleans and integers.

    >>> extract_feature('ARINCK', 'AAFTINK')
    [0,0,0,1,1,1,?, ?, ?, 1]
    """
    # Guard: empty inputs yield no features.
    if not name or not standard:
        return []
    f_list = []  # features list
    # f1: Boolean feature -- If first 2 letters of name and standard name are equal
    f_list.append(name[:2] == standard[:2])
    # f2: Boolean feature -- If last 2 letters of name and standard name are equal
    f_list.append(name[-2:] == standard[-2:])
    # f3: Boolean feature -- If size of name and standard name are equal
    f_list.append(len(name) == len(standard))
    # f4: Number feature -- absolute difference of name size and standard size
    f_list.append(abs(len(name) - len(standard)))
    # f5: Number feature -- Number of longest first equal chars
    # (relies on the loop variable leaking out of the for statement)
    for i in xrange(1,len(name)+1):
        if not name[:i] == standard[:i]:
            break
    # print i, name, standard
    f_list.append(i-1)
    # f6: Number feature -- Number of longest last equal chars
    for i in range(len(name)):
        if not name[-i-1:] == standard[-i-1:]:
            break
    f_list.append(i)
    # f7: Boolean feature -- if soundex code of name and standard name is equal
    import jellyfish
    f_list.append(jellyfish.soundex(name) == jellyfish.soundex(standard))
    # f8: Boolean feature -- if metaphone code of name and standard name is equal
    f_list.append(jellyfish.metaphone(name) == jellyfish.metaphone(standard))
    # f9: Boolean feature -- if double-metaphone code of name and standard name is equal
    from preModules import metaphone
    dm_flag = False  # a flag that shows whether two words have any common double-metaphone or not
    for dm1 in metaphone.doublemetaphone(name):
        for dm2 in metaphone.doublemetaphone(standard):
            if dm1 and dm2 and dm1 == dm2:
                dm_flag = True
                break
    f_list.append(dm_flag)
    # f10: Number feature -- longest common chars between name and its standard name
    from modules.basic_modules.basic import longest_common_substring
    f_list.append(len(longest_common_substring(name, standard)))
    return f_list
# Check whether two words are homophones using phonetic encodings
# (Metaphone and Soundex; not highly accurate).
import jellyfish

x, y = map(str, input("Enter two words : ").split())

same_metaphone = jellyfish.metaphone(x) == jellyfish.metaphone(y)
same_soundex = jellyfish.soundex(x) == jellyfish.soundex(y)
print("Homophones !" if same_metaphone or same_soundex else "Not Homophones !")

'''
#check difference between two words
#returns number of changes
print(jellyfish.levenshtein_distance(x,y))
'''
# Jaro Distance # Jaro-Winkler Distance # Match Rating Approach Comparison # Hamming Distance # Phonetic encoding: # American Soundex # Metaphone # NYSIIS (New York State Identification and Intelligence System) # Match Rating Codex import jellyfish print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish')) # 2; 编辑距离 print(jellyfish.jaro_distance('jellyfish', 'smellyfish')) # 0.89629629629629637 print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')) # 1; 编辑距离, 带翻转的 print(jellyfish.metaphone('Jellyfish')) # 'JLFX' print(jellyfish.soundex('Jellyfish')) # 'J412' print(jellyfish.nysiis('Jellyfish')) # 'JALYF' print(jellyfish.match_rating_codex('Jellyfish')) # 'JLLFSH' ################################################################## ## Lenvenshtein import Levenshtein print(Levenshtein.hamming('hello', 'helol')) # 2; 计算汉明距离; 要求 str1 和 str2 必须长度一致; 是描述两个等长字串之间对应位置上不同字符的个数 print(Levenshtein.distance('hello', 'helol')) # 2; 计算编辑距离(也成 Levenshtein 距离); 是描述由一个字串转化成另一个字串最少的操作次数, 在其中的操作包括插入 & 删除 & 替换 print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf')) # 5 print(Levenshtein.ratio('hello', 'helol')) # 0.8; 计算莱文斯坦比; 计算公式 r = (sum - ldist) / sum, 其中 sum 是指 str1 和 str2 字串的长度总和, ldist 是类编辑距离 # 注意: 这里的类编辑距离不是 2 中所说的编辑距离, 2 中三种操作中每个操作+1, 而在此处, 删除、插入依然+1, 但是替换+2 # 这样设计的目的: ratio('a', 'c'), sum=2, 按 2 中计算为(2-1)/2 = 0.5,' a','c'没有重合, 显然不合算, 但是替换操作+2, 就可以解决这个问题 print(Levenshtein.jaro('hello', 'helol')) # 0.9333333333333332; 计算 jaro 距离; 用于健康普查 print(Levenshtein.jaro_winkler('hello', 'helol')) # 0.9533333333333333; 计算 Jaro – Winkler 距离
# NOTE(review): Python 2 print-statement syntax — this block does not parse
# under Python 3. The demo is also duplicated verbatim below; presumably a
# copy-paste accident — confirm before deleting either copy.
import jellyfish
print jellyfish.levenshtein_distance('jellyfish', 'smellyfish')  #2
print jellyfish.jaro_distance('jellyfish', 'smellyfish')  #0.89629629629629637
print jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')  #1
print jellyfish.metaphone('Jellyfish')  #'JLFX'
print jellyfish.soundex('Jellyfish')  #'J412'
print jellyfish.nysiis('Jellyfish')  #'JALYF'
print jellyfish.match_rating_codex('Jellyfish')  #'JLLFSH'

# Second, identical copy of the demo above.
import jellyfish
print jellyfish.levenshtein_distance('jellyfish', 'smellyfish')  #2
print jellyfish.jaro_distance('jellyfish', 'smellyfish')  #0.89629629629629637
print jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')  #1
print jellyfish.metaphone('Jellyfish')  #'JLFX'
print jellyfish.soundex('Jellyfish')  #'J412'
print jellyfish.nysiis('Jellyfish')  #'JALYF'
print jellyfish.match_rating_codex('Jellyfish')