def token_similarity(a, b):
    """Score how well token ``a`` aligns with token ``b``.

    Both arguments must expose a ``word`` string attribute.

    Returns:
        * ``1.0``  - the words are equal ignoring case and surrounding
          whitespace (so any whitespace token matches any other).
        * ``-1.0`` - exactly one of the words is whitespace, or exactly one
          is punctuation; this makes such a pairing impossible.
        * ``0.9``  - both words are punctuation, or both are alphabetic and
          share a metaphone, soundex, NYSIIS or match-rating code.
        * otherwise the Jaro-Winkler similarity of the two words.
    """
    word_a, word_b = a.word, b.word

    # Strings are a case insensitive match.
    # Whitespace-only words strip to '' and therefore match each other.
    if word_a.lower().strip() == word_b.lower().strip():
        return 1.

    # Make it impossible for words to map to whitespace.
    if bool(isspace(word_a)) != bool(isspace(word_b)):
        return -1.

    # Punctuation maps to punctuation with a high score ...
    a_is_punc, b_is_punc = bool(ispunc(word_a)), bool(ispunc(word_b))
    if a_is_punc and b_is_punc:
        return 0.9
    # ... but it is impossible for words to map to punctuation.
    if a_is_punc != b_is_punc:
        return -1.

    # Strings sound alike (approximate phonetic match).
    if word_a.isalpha() and word_b.isalpha():
        encoders = (jf.metaphone, jf.soundex, jf.nysiis,
                    jf.match_rating_codex)
        if any(encode(word_a) == encode(word_b) for encode in encoders):
            return 0.9

    # Use Jaro-Winkler similarity. Newer jellyfish releases expose it as
    # ``jaro_winkler_similarity`` and dropped the old ``jaro_winkler`` name,
    # so prefer the new name and fall back to the old one.
    jaro_winkler = getattr(jf, 'jaro_winkler_similarity', None)
    if jaro_winkler is None:
        jaro_winkler = jf.jaro_winkler
    return jaro_winkler(word_a, word_b)
def compare_for_seniority_finding(s1, s2):
    """
    Tell whether s2 is similar enough to s1 to count as a match.

    s1 - main string, s2 - string from list for comparison

    Four percentage scores are computed: fuzz.partial_ratio on the raw
    strings, and (1 - distance.jaccard) * 100 on the lower-cased metaphone,
    soundex and match-rating codes. The result is True only when the partial
    ratio is at least 50, the soundex score exceeds 70 and the metaphone and
    match-rating scores both exceed 65.
    """
    def phonetic_score(encode):
        # Percentage similarity of the two lower-cased phonetic codes.
        code_1 = encode(unicode(s1)).lower()
        code_2 = encode(unicode(s2)).lower()
        return (1 - distance.jaccard(code_1, code_2)) * 100

    partial_score = fuzz.partial_ratio(s1, s2)
    metaphone_score = phonetic_score(jellyfish.metaphone)
    soundex_score = phonetic_score(jellyfish.soundex)
    mrc_score = phonetic_score(jellyfish.match_rating_codex)

    return (partial_score >= 50 and soundex_score > 70
            and metaphone_score > 65 and mrc_score > 65)
def mrc():
    """Print match-rating codes and comparisons for sample EN and DE tokens.

    For each language the tokens, their ``match_rating_codex`` codes and the
    ``match_rating_comparison`` result of three hand-picked pairs are written
    to standard output.
    """

    def report(tokens, labelled_pairs):
        # print tokens
        print('Tokens: ', end='')
        for token in tokens:
            print(token, ' | ', end='')

        # print codes
        print('\n', end="")
        print('Codes: ', end='')
        for token in tokens:
            print(jellyfish.match_rating_codex(token), ' | ', end='')

        # print string match comparisons
        print('\n', end="")
        print('Comparisons: ', end='')
        for label, left, right in labelled_pairs:
            print(label, jellyfish.match_rating_comparison(left, right))

    # english -----------------------------
    print('Running Match Rating Codex (EN)...')
    report(
        ['Ball Bearing', 'bll brng', 'Centrifugal', 'centrifigal',
         'PUmp', 'pmp'],
        [
            ('Ball Bearing, bll brng: ', 'Ball Bearing', 'bll brng'),
            ('Centrifugal, centrifigal: ', 'Centrifugal', 'centrifigal'),
            ('PUmp, pmp: ', 'PUmp', 'pmp'),
        ],
    )

    # german -----------------------------
    print('\n\nRunning Match Rating Codex Comparison (DE)...')
    report(
        ['Kugellager', 'kugelagr', 'Zentrifugal', 'zentrifkl',
         'PUmpe', 'pmp'],
        [
            ('Kugellager, kugelagr: ', 'Kugellager', 'kugelagr'),
            ('Zentrifugal, zentrifkl: ', 'Zentrifugal', 'zentrifkl'),
            ('PUmpe, pmp: ', 'PUmpe', 'pmp'),
        ],
    )
def compare_for_seniority_finding(s1, s2):
    """
    Report whether s2 passes every similarity threshold against s1.

    s1 - main string, s2 - string from list for comparison

    Returns True when fuzz.partial_ratio(s1, s2) is at least 50 and the
    (1 - distance.jaccard) * 100 score of the lower-cased phonetic codes is
    above 70 for soundex and above 65 for metaphone and match-rating codex;
    otherwise False.
    """
    fpr = fuzz.partial_ratio(s1, s2)

    # One percentage score per phonetic encoder, computed the same way.
    scores = {}
    for label, encode in (('metaphone', jellyfish.metaphone),
                          ('soundex', jellyfish.soundex),
                          ('mrc', jellyfish.match_rating_codex)):
        first_code = encode(unicode(s1)).lower()
        second_code = encode(unicode(s2)).lower()
        scores[label] = (1 - distance.jaccard(first_code, second_code)) * 100

    return (fpr >= 50 and scores['soundex'] > 70
            and scores['metaphone'] > 65 and scores['mrc'] > 65)
def fuzzy(string):
    """Return a ``jsonify`` response holding four phonetic codes of ``string``.

    The payload maps each encoder name (metaphone, soundex, nysiis,
    match_rating_codex) to the value the same-named jellyfish function
    returns for ``string``.
    """
    encoder_names = ("metaphone", "soundex", "nysiis", "match_rating_codex")
    payload = {}
    for encoder_name in encoder_names:
        payload[encoder_name] = getattr(jellyfish, encoder_name)(string)
    return jsonify(payload)
def simple_example():
    """Print the result of each jellyfish comparison and encoding function.

    The comparison functions are run on 'jellyfish' vs 'smellyfish'; the
    phonetic encoders are run on 'Jellyfish'. Everything goes to stdout.
    """

    def show_comparison(template, compare):
        # Fill the template with both words and the comparison result.
        print(template.format(str1, str2, compare(str1, str2)))

    def show_encoding(template, encode):
        # Fill the template with the word and its phonetic code.
        print(template.format(ss, encode(ss)))

    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'

    show_comparison("jellyfish.levenshtein_distance({}, {}) = {}.",
                    jellyfish.levenshtein_distance)
    show_comparison("jellyfish.damerau_levenshtein_distance({}, {}) = {}.",
                    jellyfish.damerau_levenshtein_distance)
    show_comparison("jellyfish.hamming_distance({}, {}) = {}.",
                    jellyfish.hamming_distance)
    show_comparison("jellyfish.jaro_distance({}, {}) = {}.",
                    jellyfish.jaro_distance)
    show_comparison("jellyfish.jaro_similarity({}, {}) = {}.",
                    jellyfish.jaro_similarity)
    show_comparison("jellyfish.jaro_winkler({}, {}) = {}.",
                    jellyfish.jaro_winkler)
    show_comparison("jellyfish.jaro_winkler_similarity({}, {}) = {}.",
                    jellyfish.jaro_winkler_similarity)
    show_comparison("jellyfish.match_rating_comparison({}, {}) = {}.",
                    jellyfish.match_rating_comparison)

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'

    show_encoding("jellyfish.metaphone({}) = {}.", jellyfish.metaphone)
    show_encoding("jellyfish.soundex({}) = {}.", jellyfish.soundex)
    show_encoding("jellyfish.nysiis({}) = {}.", jellyfish.nysiis)
    show_encoding("jellyfish.match_rating_codex({}) = {}.",
                  jellyfish.match_rating_codex)
def compare_context(phraselist_nst, ngramlist):
    """Check that every phrase word sounds like some n-gram token.

    Each token of ``ngramlist`` that is not in ``stwords`` is reduced to its
    match-rating code. A word of ``phraselist_nst`` is considered covered
    when the Levenshtein distance between its own code and at least one of
    those codes is 0 or 1.

    Args:
        phraselist_nst: iterable of words (byte strings or text).
        ngramlist: iterable of context tokens (byte strings or text).

    Returns:
        True when every word is covered (vacuously True for an empty phrase
        list), False as soon as one word has no close code.
    """
    def codex(token):
        # Byte strings are decoded as UTF-8 with undecodable bytes dropped;
        # text is passed through unchanged because Python 3 ``str`` has no
        # ``decode`` method.
        if isinstance(token, bytes):
            token = token.decode('utf-8', 'ignore')
        return jf.match_rating_codex(token)

    baselist = [codex(k) for k in ngramlist if k not in stwords]

    for wd in phraselist_nst:
        phonetic = codex(wd)
        if not any(jf.levenshtein_distance(phonetic, base) <= 1
                   for base in baselist):
            return False
    return True
def test_match_rating_codex(self):
    """match_rating_codex yields the expected code for each sample name."""
    expected_codes = (
        ("Byrne", "BYRN"),
        ("Boern", "BRN"),
        ("Smith", "SMTH"),
        ("Smyth", "SMYTH"),
        ("Catherine", "CTHRN"),
        ("Kathryn", "KTHRYN"),
    )
    for name, expected in expected_codes:
        actual = jellyfish.match_rating_codex(name)
        self.assertEqual(actual, expected)
def measure_string_distance(s1, s2, method):
    '''
    Compare two strings with the method selected by ``method``.

    Two methods focus on string similarity and three on phonetic encoding.
    Method code to method name:
    1. jaro-winkler similarity
    2. damerau-levenshtein distance, normalised to 1 - distance / max length
    3. Metaphone
    4. NYSIIS
    5. match_rating_codex

    note:
    for methods 3, 4 and 5 the result is 1 (codes match) or 0 (no match);
    for methods 1 and 2 the result is a value in the range [0, 1].

    Returns 0 when either string is empty, when ``method`` is not one of the
    codes above, or when method 2 fails on the given inputs.
    '''
    if s1 == '' or s2 == '':
        return 0

    if method == 1:
        # Newer jellyfish releases renamed jaro_winkler to
        # jaro_winkler_similarity; support both spellings.
        jaro_winkler = getattr(jellyfish, 'jaro_winkler_similarity', None)
        if jaro_winkler is None:
            jaro_winkler = jellyfish.jaro_winkler
        return jaro_winkler(s1, s2)

    if method == 2:
        try:
            diff = jellyfish.damerau_levenshtein_distance(s1, s2)
            # float() keeps this a true division on Python 2 as well.
            return 1 - (float(diff) / max(len(s1), len(s2)))
        except Exception:
            # Best effort: inputs the library rejects score 0, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return 0

    if method == 3:
        return 1 if jellyfish.metaphone(s1) == jellyfish.metaphone(s2) else 0

    if method == 4:
        return 1 if jellyfish.nysiis(s1) == jellyfish.nysiis(s2) else 0

    if method == 5:
        same_codex = (jellyfish.match_rating_codex(s1)
                      == jellyfish.match_rating_codex(s2))
        return 1 if same_codex else 0

    # Unknown method codes deliberately fall through to 0 rather than raise.
    return 0
def featurize(df):
    """Add string-similarity feature columns for the first two columns of df.

    The first two columns are named 'a' and 'b' (a third, when there are
    exactly three, is named 'target'). 'TM_A' / 'TM_B' hold the transliterated,
    lower-cased, letters-only versions of 'a' / 'b'; every other feature is
    computed row by row from that pair. With two or three columns the frame
    is modified in place; otherwise the features are added to the frame
    returned by ``rename``. The featurised frame is returned.
    """
    column_count = len(df.columns)
    if column_count == 3:
        df.columns = ['a', 'b', 'target']
    elif column_count == 2:
        df.columns = ['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b'})

    def letters_only(text):
        # Transliterate, lower-case, then drop everything but ASCII letters.
        return re.sub('[^a-zA-Z]+', '', unidecode.unidecode(text).lower())

    def per_pair(measure):
        # Apply a two-argument measure to the cleaned pair of every row.
        return df.apply(lambda row: measure(row.TM_A, row.TM_B), axis=1)

    def same_code(encode):
        # 1 when both cleaned strings share the same phonetic code, else 0.
        return per_pair(
            lambda left, right: 1 if encode(left) == encode(right) else 0)

    df['TM_A'] = df.apply(lambda row: letters_only(row['a']), axis=1)
    df['TM_B'] = df.apply(lambda row: letters_only(row['b']), axis=1)

    df['partial'] = per_pair(fuzz.partial_ratio)
    df['tkn_sort'] = per_pair(fuzz.token_sort_ratio)
    df['tkn_set'] = per_pair(fuzz.token_set_ratio)

    df['sum_ipa'] = per_pair(sum_ipa)

    # Jellyfish levenshtein
    df['levenshtein'] = per_pair(jellyfish.levenshtein_distance)
    # Scale Levenshtein column
    scaler = MinMaxScaler()
    raw_distances = df['levenshtein'].values.reshape(-1, 1)
    df['levenshtein'] = scaler.fit_transform(raw_distances)

    # Jellyfish phoneme
    df['metaphone'] = same_code(jellyfish.metaphone)
    df['nysiis'] = same_code(jellyfish.nysiis)
    df['mtch_rtng_cdx'] = same_code(jellyfish.match_rating_codex)

    df['pshp_soundex_first'] = same_code(pshp_soundex_first.encode)

    # One extra column per configured algorithm, named by algo_names.
    for i, algo in enumerate(algos):
        df[algo_names[i]] = per_pair(algo.sim)

    return df
def get_hash(word, hash_type):
    """Return the phonetic hash of ``word`` for the requested algorithm.

    Args:
        word: the string to encode.
        hash_type: one of "SOUNDEX", "NYSIIS", "MRA" (match rating codex)
            or "METAPHONE".

    Returns:
        The code produced by the matching jellyfish encoder.

    Raises:
        NotImplementedError: if ``hash_type`` is not one of the names above.
    """
    # Early returns avoid a local named ``hash``, which shadowed the builtin.
    if hash_type == "SOUNDEX":
        return jellyfish.soundex(word)
    if hash_type == "NYSIIS":
        return jellyfish.nysiis(word)
    if hash_type == "MRA":
        return jellyfish.match_rating_codex(word)
    if hash_type == "METAPHONE":
        return jellyfish.metaphone(word)
    raise NotImplementedError(
        "approach '{}' not implemented".format(hash_type))
def correct(self, wrongWord):
    """Return scored correction candidates for ``wrongWord``, sorted.

    Candidates are the words listed in ``SpellChecker.invertTriMap`` under
    any gram of ``wrongWord``. Words whose length differs by more than 2, and
    ``wrongWord`` itself, are skipped. Each remaining word gets a base score
    built from gram overlap, relative dictionary frequency, edit distance and
    the Jaro value, plus 0.1 for every phonetic encoder (metaphone, soundex,
    NYSIIS, match rating codex) whose code equals that of ``wrongWord``.

    Returns:
        A list of ``ScoreRcd(candidate, edit_distance, score)`` objects in
        the order produced by ``list.sort()``.
    """
    # Collect candidates in a set: repeated list concatenation followed by
    # de-duplication was quadratic in the number of candidates.
    candidates = set()
    for trigram in self.getGrams(wrongWord, SpellChecker.invertMapGram):
        if trigram in SpellChecker.invertTriMap:
            candidates.update(SpellChecker.invertTriMap[trigram])

    encoders = (jellyfish.metaphone, jellyfish.soundex, jellyfish.nysiis,
                jellyfish.match_rating_codex)
    # Codes of wrongWord do not change per candidate; compute them once, and
    # only when some candidate actually survives the filters.
    wrong_codes = None

    candidateDistList = []
    for candidate in candidates:
        if abs(len(candidate) - len(wrongWord)) > 2:
            continue
        if wrongWord == candidate:
            continue

        ed = self.compED(candidate, wrongWord)
        jd = jellyfish.jaro_distance(wrongWord, candidate)
        gd = self.getJackSim(
            self.getGrams(candidate, SpellChecker.jackardGram),
            self.getGrams(wrongWord, SpellChecker.jackardGram))

        # NOTE(review): jellyfish's jaro_distance is a similarity (higher
        # means closer), so 1 / (jd + 1) lowers the score of closer words;
        # kept as-is, confirm this is intended.
        score = gd * SpellChecker.dictCountMap[
            candidate] / SpellChecker.totalCount * (1 / (ed + 1)) * (1 /
                                                                     (jd + 1))

        if wrong_codes is None:
            wrong_codes = [encode(wrongWord) for encode in encoders]
        for encode, wrong_code in zip(encoders, wrong_codes):
            if encode(candidate) == wrong_code:
                score = score + 0.1

        candidateDistList.append(ScoreRcd(candidate, ed, score))

    candidateDistList.sort()
    return candidateDistList
def compare(word1, dictionary):
    """Find the dictionary word with the highest weighted phonetic score.

    For every word in ``dictionary`` the soundex, metaphone, NYSIIS and
    match-rating codes are compared with those of ``word1`` through
    ``levenshtein`` and combined with weights 0.2, 0.3, 0.3 and 0.2.

    Returns:
        ``(score, word)`` for the word with the largest combined score, or
        ``(0, None)`` when no word scores above 0.

    NOTE(review): whether ``levenshtein`` yields a distance or a similarity
    is not visible here; the largest value wins either way - confirm.
    """
    weighted_encoders = (
        (jellyfish.soundex, 0.2),
        (jellyfish.metaphone, 0.3),
        (jellyfish.nysiis, 0.3),
        (jellyfish.match_rating_codex, 0.2),
    )
    # Codes of the query word are fixed, so encode it once up front.
    query_codes = [encode(word1) for encode, _ in weighted_encoders]

    best_score, best_word = 0, None
    for word2 in dictionary:
        word_codes = [encode(word2) for encode, _ in weighted_encoders]
        sim = sum(
            levenshtein(query_code, word_code) * weight
            for query_code, word_code, (_, weight)
            in zip(query_codes, word_codes, weighted_encoders)
        )
        if sim > best_score:
            best_score, best_word = sim, word2
    return (best_score, best_word)
def phonetic_similarity(word1, word2):
    """Return the weighted sum of edit distances between phonetic codes.

    Both words are encoded with metaphone, NYSIIS, soundex and match rating
    codex. For each encoder the Levenshtein distance between the two codes
    is multiplied by ``weightage[<encoder name>]`` and the products are
    added up.

    NOTE(review): the total grows with the distances, so larger values mean
    less alike despite the function name - confirm against callers.
    """
    algorithms = ('metaphone', 'nysiis', 'soundex', 'match_rating_codex')

    codes_1 = {name: getattr(jellyfish, name)(word1) for name in algorithms}
    codes_2 = {name: getattr(jellyfish, name)(word2) for name in algorithms}

    cumulative_score = 0
    for name in algorithms:
        code_distance = jellyfish.levenshtein_distance(
            codes_1[name], codes_2[name])
        cumulative_score += code_distance * weightage[name]
    return cumulative_score
def main():
    """Print a tour of jellyfish's encoders, stemmer and string comparisons.

    Everything is written to standard output; nothing is returned.
    """
    # declare test strings
    # rem: u prefix is required jellyfish convention
    str1 = u'Jellyfish'
    str2 = u'Smellyfish'

    # test Phonetic Encoding
    print('\nPhonetic Encoding ----------------------------')

    # Metaphone
    r1 = jellyfish.metaphone(str1)
    r2 = jellyfish.metaphone(str2)
    print('Metaphone: ', r1, ", ", r2)

    # American Soundex
    r1 = jellyfish.soundex(str1)
    r2 = jellyfish.soundex(str2)
    print('Soundex: ', r1, ", ", r2)

    # NYSIIS
    r1 = jellyfish.nysiis(str1)
    r2 = jellyfish.nysiis(str2)
    print('NYSIIS: ', r1, ", ", r2)

    # Match Rating Codex
    r1 = jellyfish.match_rating_codex(str1)
    r2 = jellyfish.match_rating_codex(str2)
    print('Match Rating Codex: ', r1, ", ", r2)

    # test Stemming
    print('\nStemming -------------------------------------')
    pStr1 = u'Jellyfished'
    pStr2 = u'Smellyfishing'
    # Stem the inflected forms defined for this purpose; the unsuffixed
    # str1/str2 were being stemmed before, leaving pStr1/pStr2 unused.
    r1 = jellyfish.porter_stem(pStr1)
    r2 = jellyfish.porter_stem(pStr2)
    print('Porter Stemmer: ', r1, ", ", r2)

    # test String Comparison
    print('\nString Comparisons ---------------------------')

    # Levenshtein Distance
    r = jellyfish.levenshtein_distance(str1, str2)
    print('Levenshtein Distance: ', r)

    # Damerau-Levenshtein Distance
    r = jellyfish.damerau_levenshtein_distance(str1, str2)
    print('Damerau-Levenshtein Distance: ', r)

    # Hamming Distance
    # (The next three results used to be stored in ``result`` while the
    # stale ``r`` from the previous step was printed.)
    r = jellyfish.hamming_distance(str1, str2)
    print('Hamming Distance: ', r)

    # Jaro Distance
    r = jellyfish.jaro_distance(str1, str2)
    print('Jaro Distance: ', r)

    # Jaro-Winkler Distance
    r = jellyfish.jaro_winkler(str1, str2)
    print('Jaro-Winkler Distance: ', r)

    # Match Rating Approach (comparison)
    r = jellyfish.match_rating_comparison(str1, str2)
    print('Match Rating Comparison: ', r)

    # end program
    print('Done.')
# Jaro Distance
# Jaro-Winkler Distance
# Match Rating Approach Comparison
# Hamming Distance
# Phonetic encoding:
# American Soundex
# Metaphone
# NYSIIS (New York State Identification and Intelligence System)
# Match Rating Codex
import jellyfish

# Sample inputs shared by the jellyfish calls below.
fish, smelly_fish, swapped_fish = 'jellyfish', 'smellyfish', 'jellyfihs'
fish_name = 'Jellyfish'

print(jellyfish.levenshtein_distance(fish, smelly_fish))  # 2; edit distance
print(jellyfish.jaro_distance(fish, smelly_fish))  # 0.89629629629629637
print(jellyfish.damerau_levenshtein_distance(fish, swapped_fish))  # 1; edit distance that also counts transpositions

print(jellyfish.metaphone(fish_name))  # 'JLFX'
print(jellyfish.soundex(fish_name))  # 'J412'
print(jellyfish.nysiis(fish_name))  # 'JALYF'
print(jellyfish.match_rating_codex(fish_name))  # 'JLLFSH'

##################################################################
## Levenshtein
import Levenshtein

# Sample inputs shared by most of the Levenshtein calls below.
hello, scrambled = 'hello', 'helol'

# 2; Hamming distance. str1 and str2 must have the same length; it is the
# number of positions at which two equal-length strings differ.
print(Levenshtein.hamming(hello, scrambled))

# 2; edit distance (also called Levenshtein distance): the minimum number of
# operations needed to turn one string into the other, where an operation is
# an insertion, a deletion or a substitution.
print(Levenshtein.distance(hello, scrambled))
print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf'))  # 5

# 0.8; Levenshtein ratio: r = (sum - ldist) / sum, where sum is the combined
# length of str1 and str2 and ldist is an edit-like distance.
# Note: this edit-like distance is not the edit distance described above.
# There each of the three operations costs 1; here deletion and insertion
# still cost 1 but substitution costs 2.
# Rationale: for ratio('a', 'c'), sum = 2; with the plain edit distance the
# result would be (2 - 1) / 2 = 0.5 even though 'a' and 'c' share nothing,
# which is unreasonable. Charging 2 for a substitution fixes that.
print(Levenshtein.ratio(hello, scrambled))

print(Levenshtein.jaro(hello, scrambled))  # 0.9333333333333332; Jaro distance; used in health census surveys
print(Levenshtein.jaro_winkler(hello, scrambled))  # 0.9533333333333333; Jaro-Winkler distance
continue #if ed ==0: # ed =1 jd=jellyfish.jaro_distance(wrongWord,candidate) #if jd==0: # jd =1 gd = getJackSim(getGrams(candidate,jackardGram),getGrams(wrongWord,jackardGram)) score = gd * dictCountMap[candidate]/totalCount * (1/(ed+1)) * (1/(jd+1)) #New Code if jellyfish.metaphone(wrongWord) == jellyfish.metaphone(candidate): score = score+0.1 if jellyfish.soundex(wrongWord) == jellyfish.soundex(candidate): score = score+0.1 if jellyfish.nysiis(wrongWord) == jellyfish.nysiis(candidate): score = score+0.1 if jellyfish.match_rating_codex(wrongWord) == jellyfish.match_rating_codex(candidate): score = score+0.1 tmpCandidate = ScoreRcd(candidate,ed, score) ; candidateDistList.append(tmpCandidate) candidateDistList.sort() maxIter = 10 if len(candidateDistList) < maxIter: maxIter = len(candidateDistList) for i in range(0,maxIter): out = out + candidateDistList[i].getScore() + ' ' print (out)
import os
import spellcheck
import jellyfish


def _as_text(code):
    """Return ``code`` as text, decoding UTF-8 byte strings leniently.

    Text values pass through unchanged, because Python 3 ``str`` has no
    ``decode`` method; undecodable bytes are dropped.
    """
    if isinstance(code, bytes):
        return code.decode('utf-8', 'ignore')
    return code


s = u'piece'

# Lower-cased match-rating codes of two similar-looking words.
s1 = _as_text(jellyfish.match_rating_codex(u'place').lower())
s2 = _as_text(jellyfish.match_rating_codex(u'plaid').lower())

# Single-argument print calls behave the same on Python 2 and Python 3.
print(u'%s %s' % (s1, s2))
print(jellyfish.levenshtein_distance(s1, s2))
#print jellyfish.levenshtein_distance(s,u'thruout')
def transform(self, data):
    """Return the match-rating code of ``data`` when it is a string.

    Anything that is not a ``basestring`` instance yields ``None``.
    """
    if not isinstance(data, basestring):
        # Non-string input is not encoded.
        return None
    text = unicode(data)
    return match_rating_codex(text)
def match_rating_codex(s):
    """Return ``jellyfish.match_rating_codex(s)`` unchanged."""
    codex = jellyfish.match_rating_codex(s)
    return codex
from jellyfish import soundex, metaphone, match_rating_codex

# Print every concept (the first tab-separated field of each line) next to
# the metaphone codes of its '_'-separated words.
# The with-block closes the file once the loop ends; the handle used to be
# left open.
with open("senticnet5.txt", "r") as sn:
    for line in sn:
        concept = line.split('\t')[0]
        words = concept.split('_')

        # All three encodings are computed, but only the metaphone codes are
        # printed below.
        sndx = [soundex(word) for word in words]
        metaphn = [metaphone(word) for word in words]
        codex = [match_rating_codex(word) for word in words]

        print(concept, '\t', '_'.join(metaphn))
import re
from jellyfish import soundex,metaphone,match_rating_codex


def _read_text(path):
    """Return the whole content of ``path``, closing the file afterwards."""
    with open(path, "r") as handle:
        return handle.read()


def _print_matches(title, corpus, code):
    """Print ``title``, then each line of ``corpus`` that contains ``code``.

    A line matches when ``code`` equals one of its " \\t "-separated fields.
    """
    print(title)
    for line in corpus.split("\n"):
        if code in line.split(" \t "):
            print(line)


# Load the three lookup tables; the handles used to be left open.
sentic = _read_text("codex.txt")
sentic1 = _read_text("concepts+soundex.txt")
sentic2 = _read_text("concepts+metaphone.txt")

string = input("Enter a string: ")
cdx = match_rating_codex(string)
print(cdx)
sdx = soundex(string)
meta = metaphone(string)

_print_matches("Codex Results\n", sentic, cdx)
_print_matches("Soundex Results\n", sentic1, sdx)
_print_matches("Metaphone Results\n", sentic2, meta)
# nx.draw(G, pos, with_labels=True, node_size=0)

# ---------------------------------------------> jellyfish <-------------------------------------------- #
# String comparison
grape_1 = 'Ma'
grape_2 = 'Mariette'
# The results below are discarded; these calls are only useful when run
# interactively.
jf.levenshtein_distance(grape_1, grape_2)
jf.jaro_distance(grape_1, grape_2)
jf.damerau_levenshtein_distance(grape_1, grape_2)

# Phonetic encoding
jf.metaphone(grape_1)
jf.soundex(grape_1)
jf.nysiis(grape_1)
jf.match_rating_codex(grape_1)
jf.match_rating_codex(grape_2)

# ---------------------------------------------> Udacity <-------------------------------------------- #
scores = [3.0, 1.0, 0.2]
scores2 = np.array([[1, 2, 3, 6],
                    [2, 4, 5, 6],
                    [3, 8, 7, 6]])


def softmax(x):
    """Compute softmax values for each set of scores in x.

    The normalisation runs along axis 0, so for a 2-D input every column is
    treated as one set of scores and sums to 1 in the result.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; also avoids taking the max of an empty array.
        return np.exp(x)
    # Subtracting the per-set maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing on large scores.
    shifted = np.exp(x - np.max(x, axis=0))
    return shifted / np.sum(shifted, axis=0)


print(softmax(scores))
def match_rating_codex(s):
    """Return the match-rating code of ``s``, or ``None`` when ``s`` is None.

    Identity comparison is used so that objects with a custom ``__eq__``
    cannot be mistaken for ``None``.
    """
    if s is None:
        return None
    return J.match_rating_codex(s)
import jellyfish

# Single-argument print(...) calls print the same text on Python 2 and
# Python 3; the bare print statements used before are Python-2-only.
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))  #2
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))  #0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))  #1

print(jellyfish.metaphone('Jellyfish'))  #'JLFX'
print(jellyfish.soundex('Jellyfish'))  #'J412'
print(jellyfish.nysiis('Jellyfish'))  #'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))  #'JLLFSH'

# NOTE(review): the same demo is repeated a second time below, exactly as in
# the original; it looks like an accidental duplicate - confirm before
# removing, since dropping it halves the output.
import jellyfish

print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))  #2
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))  #0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))  #1

print(jellyfish.metaphone('Jellyfish'))  #'JLFX'
print(jellyfish.soundex('Jellyfish'))  #'J412'
print(jellyfish.nysiis('Jellyfish'))  #'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))