def test_gh_words(self):
    # Pins the Double Metaphone (primary, secondary) codes for words ending
    # in "gh"; every expected primary code here ends in "F".
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("laugh", ("LF", "")),
        ("cough", ("KF", "")),
        ("rough", ("RF", "")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def phonetic_similarity(some, other, use_equivalences=False):
    """Score how alike two strings sound using their Double Metaphone codes.

    Args:
        some: First string.
        other: Second string.
        use_equivalences: When true, every code is first mapped through
            ``metaphone_representative`` before being compared.

    Returns:
        ``1.0`` when the inputs are equal or share the same dmetaphone
        result, ``0.0`` when either input is falsy or no pair of non-empty
        codes exists, otherwise the best ``string_similarity`` score over
        all pairs of non-empty codes.
    """
    if some == other:
        return 1.0
    if not (some and other):
        return 0.0

    codes_some = phonetics.dmetaphone(some)
    codes_other = phonetics.dmetaphone(other)
    if codes_some == codes_other:
        return 1.0

    def comparable(code):
        # Optionally collapse a code to its equivalence-class representative.
        return metaphone_representative(code) if use_equivalences else code

    # Empty codes (e.g. a missing secondary encoding) take no part in the
    # comparison.
    scores = [
        string_similarity(comparable(left), comparable(right))
        for left in codes_some
        if left
        for right in codes_other
        if right
    ]
    return max(scores) if scores else 0.0
def test_cc_words(self):
    # Pins the Double Metaphone (primary, secondary) codes for words
    # containing "cc"; each expected primary code contains "KS".
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("accident", ("AKSTNT", "")),
        ("accede", ("AKST", "")),
        ("succeed", ("SKST", "")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def test_various_german(self):
    # Pins the Double Metaphone (primary, secondary) codes for a few
    # German-style "ch" words; only "ach" expects a secondary code.
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("ach", ("AX", "AK")),
        ("bacher", ("PKR", "")),
        ("macher", ("MKR", "")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def test_dutch_origin(self):
    # Pins the Double Metaphone (primary, secondary) codes for words that
    # start with "sch"; the last two expect both an "X..." and an "SK..."
    # encoding.
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("school", ("SKL", "")),
        ("schooner", ("SKNR", "")),
        ("schermerhorn", ("XRMRRN", "SKRMRRN")),
        ("schenker", ("XNKR", "SKNKR")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def test_various_spanish(self):
    # Pins the Double Metaphone (primary, secondary) codes for several
    # Spanish-origin words, including one multi-word input.
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("bajador", ("PJTR", "PHTR")),
        ("cabrillo", ("KPRL", "KPR")),
        ("gallegos", ("KLKS", "KKS")),
        ("San Jacinto", ("SNHSNT", "")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def test_mc_words(self):
    # Pins the Double Metaphone (primary, secondary) codes for "Mac"/"Mc"
    # names, written both with and without a space after the prefix.
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("mac caffrey", ("MKFR", "")),
        ("mac gregor", ("MKRKR", "")),
        ("mc crae", ("MKR", "")),
        ("mcclain", ("MKLN", "")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def phonetic_weight(wordone, wordtwo):
    """Turn the phonetic closeness of two words into a rhyme weight.

    Both words are encoded with ``phonetics.dmetaphone`` and the two
    results are compared with ``enchant.utils.levenshtein``.

    Returns:
        ``rhyme_weighting`` (a module-level setting) when the distance is 0,
        half of it when the distance is 1, and ``0`` otherwise.
    """
    # NOTE(review): the complete dmetaphone results are handed to the
    # distance function as-is; confirm that comparing them whole (rather
    # than, say, only the primary codes) is what is intended.
    distance = enchant.utils.levenshtein(
        phonetics.dmetaphone(wordone),
        phonetics.dmetaphone(wordtwo),
    )
    if distance == 0:
        return rhyme_weighting
    if distance == 1:
        return (rhyme_weighting/2)
    return 0
def test_pb_words(self):
    # Pins the Double Metaphone (primary, secondary) codes for "pb" words
    # plus "wright"/"right" (which expect the same code) and "left".
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("Campbell", ("KMPL", "")),
        ("raspberry", ("RSPR", "")),
        ("wright", ("RT", "")),
        ("right", ("RT", "")),
        ("left", ("LFT", "")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def similarity(word1, word2):
    """Combine three phonetic comparisons of two words into one number.

    Each word is encoded with NYSIIS, Metaphone and Double Metaphone; for
    every encoding the two results are compared with ``levenshtein`` and
    the three values are mixed with fixed weights 0.2, 0.4 and 0.6.

    NOTE(review): the Double Metaphone results are passed to
    ``levenshtein`` exactly as ``phonetics.dmetaphone`` returns them;
    confirm the distance function in scope accepts that value.
    """
    def encoded_distance(encode):
        # Distance between the two words under one phonetic encoder.
        return levenshtein(encode(word1), encode(word2))

    nysiis_distance = encoded_distance(phonetics.nysiis)
    metaphone_distance = encoded_distance(phonetics.metaphone)
    dmetaphone_distance = encoded_distance(phonetics.dmetaphone)

    # Linear combination of the three distances.
    return nysiis_distance * 0.2 + metaphone_distance * 0.4 + dmetaphone_distance * 0.6
def similarity(self, other: str) -> float:
    """Blend several string comparisons of ``self.name`` and ``other``.

    The result mixes, with fixed weights, the ``SequenceMatcher`` ratio,
    the Levenshtein distance of the raw strings, the Levenshtein distance
    of the Metaphone codes, and the smallest Levenshtein distance between
    any pair of Double Metaphone codes. The weighted sum is divided by 4.

    Args:
        other: The string to compare ``self.name`` with.

    Returns:
        The weighted score as a float.
    """
    res_seqmat = SequenceMatcher(None, self.name, other).ratio()
    res_lev = Levenshtein.distance(self.name, other)
    res_met = Levenshtein.distance(phonetics.metaphone(self.name),
                                   phonetics.metaphone(other))

    phon_this = phonetics.dmetaphone(self.name)
    phon_other = phonetics.dmetaphone(other)
    # Skip empty codes: an absent secondary encoding is reported as '', and
    # comparing '' with '' yields distance 0, which previously forced this
    # term to 0 for every pair of words that both lack a secondary code.
    codes_this = [code for code in phon_this if code]
    codes_other = [code for code in phon_other if code]
    if codes_this and codes_other:
        res_dmet = min(Levenshtein.distance(first, second)
                       for first in codes_this
                       for second in codes_other)
    else:
        # At least one side has no usable code; compare the primary codes.
        res_dmet = Levenshtein.distance(phon_this[0], phon_other[0])

    # NOTE(review): the first term is a ratio while the others are
    # distances, and the weights do not sum to 1 -- left as-is so existing
    # scores keep their scale.
    weights = {"seqmat": 0.1, "lev": 0.5, "met": 0.2, "dmet": 0.3}
    return (res_seqmat * weights['seqmat'] + res_lev * weights['lev']
            + res_met * weights['met'] + res_dmet * weights['dmet']) / 4.0
def genMetaPhoneScore(str1, str2):
    '''
    Return a score for the phonetic match of two strings.

    Each string is reduced to its primary Double Metaphone code; the score
    is the number of positions at which the two codes carry the same
    character, divided by the length of the longer code.

    Returns 0.0 when either input is the empty string, when encoding fails,
    or when neither input produces a code (previously this last case raised
    ZeroDivisionError).
    '''
    if str1 == "" or str2 == "":
        return 0.0
    try:
        code1 = phonetics.dmetaphone(str1)[0]
        code2 = phonetics.dmetaphone(str2)[0]
    except Exception:
        # Best effort: input that cannot be encoded counts as "no match".
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
        return 0.0

    longest = max(len(code1), len(code2))
    if longest == 0:
        return 0.0
    matches = sum(1 for left, right in zip(code1, code2) if left == right)
    return float(matches) / longest
def dmetaphone_fuzzy_match(x, against, strategy=MatchStrategy.PARTIAL_TOKEN_SORT_RATIO):
    """Average the fuzzy-match score over all Double Metaphone code pairs.

    The first two codes of ``x`` and of ``against`` are paired up; pairs
    with an empty code on either side are ignored. Every remaining pair is
    scored with ``fuzzy_match`` under ``strategy``.

    Returns:
        The mean of the pair scores, or ``0`` when there is no pair of
        non-empty codes.
    """
    codes_x = phonetics.dmetaphone(x)
    codes_against = phonetics.dmetaphone(against)

    pair_scores = [
        fuzzy_match(left, right, strategy)
        for left in (codes_x[0], codes_x[1])
        if left
        for right in (codes_against[0], codes_against[1])
        if right
    ]
    if not pair_scores:
        return 0  # should never happen
    return sum(pair_scores) / len(pair_scores)
def test_ch_words(self):
    # Pins the Double Metaphone (primary, secondary) codes for "ch" words.
    # All but "chore" expect the "ch" to be encoded as "K"; "chore" expects
    # "X". No word here expects a secondary code.
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("Charac", ("KRK", "")),
        ("Charis", ("KRS", "")),
        ("chord", ("KRT", "")),
        ("Chym", ("KM", "")),
        ("Chia", ("K", "")),
        ("chem", ("KM", "")),
        ("chore", ("XR", "")),
        ("orchestra", ("ARKSTR", "")),
        ("architect", ("ARKTKT", "")),
        ("orchid", ("ARKT", "")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def find_phonetic_similarity(name, nameset):
    """Return the entry of ``nameset`` that sounds most like ``name``.

    Both ``name`` and every candidate are reduced to their primary Double
    Metaphone code and compared with ``levenshtein``. The previous version
    encoded ``name`` with Soundex but the candidates with Double Metaphone
    (and passed the whole dmetaphone result to the distance function), so
    the two sides were never in the same encoding.

    Args:
        name: The name to look up.
        nameset: Iterable of candidate names.

    Returns:
        The candidate with the smallest distance (the first one on ties),
        or ``""`` when ``nameset`` is empty.
    """
    candidates = list(nameset)
    if not candidates:
        return ""

    name_code = phonetics.dmetaphone(name)[0]

    def code_distance(word):
        # Edit distance between the candidate's primary code and the name's.
        return levenshtein(phonetics.dmetaphone(word)[0], name_code)

    return min(candidates, key=code_distance)
def test_homophones(self):
    # Each pair of spellings is expected to produce the same Double
    # Metaphone result.
    homophone_pairs = (
        (u"tolled", u"told"),
        (u"katherine", u"catherine"),
        (u"brian", u"bryan"),
    )
    for first, second in homophone_pairs:
        self.assertEqual(phonetics.dmetaphone(first),
                         phonetics.dmetaphone(second))
def extractLexFeatures(w1, w2):
    '''
    Build the lexical feature string for the word pair (w1, w2).

    The result is a comma-separated line with, in order:
      * length of w1 and of w2 after removing the BPE mark,
      * their length ratio with two decimals (0.00 when the second word
        is empty after removing the mark),
      * Levenshtein distance between w1 and w2 as given,
      * character n-gram scores from char_ngram for n = 2, 3, 4,
        rounded to four decimals,
      * Levenshtein distance between the primary Double Metaphone codes.
    '''
    # Lengths are measured without the BPE mark.
    s1 = w1.replace(bpeMark, '')
    s2 = w2.replace(bpeMark, '')
    len1, len2 = len(s1), len(s2)
    ratio = len1/len2 if len2 else 0.00
    fields = [str(len1), str(len2), str("{0:.2f}".format(ratio))]

    # Levenshtein between tokens
    fields.append(str(Levenshtein.distance(w1, w2)))

    # Character n-gram scores for n = 2, 3, 4
    for size in (2, 3, 4):
        fields.append(str(round(char_ngram(w1, w2, size), 4)))

    # The word-embedding cosine similarity used to be computed here; it was
    # moved to the estimation of semantic features.

    # Levenshtein between Metaphone 2 phonetic keys of the tokens
    # TODO: port the java version of metaphone 3
    keys1 = phonetics.dmetaphone(w1)
    keys2 = phonetics.dmetaphone(w2)
    fields.append(str(Levenshtein.distance(keys1[0], keys2[0])))

    return ','.join(fields)
def test_various_italian(self):
    # Pins the Double Metaphone (primary, secondary) codes for several
    # Italian-origin words; only the last two expect a secondary code.
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("bacci", ("PX", "")),
        ("bertucci", ("PRTX", "")),
        ("bellocchio", ("PLX", "")),
        ("bacchus", ("PKS", "")),
        ("focaccia", ("FKX", "")),
        ("chianti", ("KNT", "")),
        ("tagliaro", ("TKLR", "TLR")),
        ("biaggi", ("PJ", "PK")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def test_similar_names(self):
    # First part: pins the (primary, secondary) codes of three spellings of
    # one surname. Second part: checks that two differently spelled names
    # have exactly one Double Metaphone code in common.
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expected_codes = (
        (u"Bartosz", ('PRTS', 'PRTX')),
        (u"Bartosch", ('PRTX', '')),
        (u"Bartos", ('PRTS', '')),
    )
    for name, expected in expected_codes:
        self.assertEqual(phonetics.dmetaphone(name), expected)

    shared_codes = (
        (u"Jablonski", u"Yablonsky", ['APLNSK']),
        (u"Smith", u"Schmidt", ['XMT']),
    )
    for first, second, expected in shared_codes:
        common = set(phonetics.dmetaphone(first)).intersection(
            phonetics.dmetaphone(second))
        self.assertEqual(list(common), expected)
def read(self) -> bool:
    """
    Read in list of country names and ISO codes and insert one row per
    country into the geo database.

    Every entry of ``country_dict`` becomes a row with feature code 'ADM0'.
    The country name is replaced by a translation when ``trans_table`` has
    one for a language in ``self.lang_list``. All inserts happen between
    ``self.geodb.db.begin()`` and ``self.geodb.db.commit()``.

    Returns:
        Always ``False``.
        NOTE(review): the meaning of the return value is not visible here
        (presumably "no error") -- confirm against the callers.
    """
    if self.progress is not None:
        self.progress.update_progress(100, "Read ISO countries...")

    # list of all countries and their ISO codes
    # This also includes some common aliases
    self.geodb.db.begin()

    self.logger.debug(self.lang_list)

    # Add country names to DB
    for ky, row in country_dict.items():
        # Localize country names to specified
        for lang in self.lang_list:
            # If we have a translation table for this language, then apply it
            if trans_table.get(lang):
                tbl = trans_table.get(lang)
                # Look up the country translation
                if tbl.get(ky):
                    # The loop variable itself is overwritten, so the
                    # translated name is what gets normalized and stored below.
                    ky = tbl.get(ky)
                    break  # Apply first translation in list

        # Create Geo_row
        # ('paris', 'fr', '07', '012', '12.345', '45.123', 'PPL')
        geo_row = [None] * GeoDB.Entry.MAX
        geo_row[GeoDB.Entry.NAME] = GeoKeys.normalize(ky)
        # The SDX column receives the primary Double Metaphone code of the
        # normalized name.
        sdx = phonetics.dmetaphone(geo_row[GeoDB.Entry.NAME])
        geo_row[GeoDB.Entry.SDX] = sdx[0]
        geo_row[GeoDB.Entry.ISO] = row[CnRow.ISO].lower()
        geo_row[GeoDB.Entry.ADM1] = ''
        geo_row[GeoDB.Entry.ADM2] = ''
        geo_row[GeoDB.Entry.LAT] = row[CnRow.LAT]
        geo_row[GeoDB.Entry.LON] = row[CnRow.LON]
        geo_row[GeoDB.Entry.FEAT] = 'ADM0'
        # The lower-cased ISO code is also used as the row ID.
        geo_row[GeoDB.Entry.ID] = row[CnRow.ISO].lower()

        self.geodb.insert(geo_row=geo_row, feat_code='ADM0')

    self.geodb.db.commit()
    return False
def databaseTI():
    """Extract tags from every course description and store them in `tags`.

    For each row of `newcourses` the description is run through
    ``entities_text``; every distinct entity name is inserted into `tags`
    together with the course id and the primary Double Metaphone code of
    the tag. Uses the module-level ``cursor`` and ``connection``; the
    cursor is closed and the connection committed at the end.

    Values are passed to the driver as query parameters instead of being
    concatenated into the SQL text, so tags containing quotes no longer
    break (or inject into) the statement.
    NOTE(review): the ``%s`` placeholders assume a DB-API driver with the
    "format"/"pyformat" paramstyle (the usual MySQL drivers) -- confirm
    against the driver that creates ``cursor``.

    Returns:
        Always ``True``.
    """
    cursor.execute('SELECT `description`,`count` FROM `newcourses`')
    courses = cursor.fetchall()
    if courses:
        # Debug output of the first description; guarded so that an empty
        # table no longer raises IndexError.
        print(courses[0][0])

    for collection in courses:
        description, course_count = collection[0], collection[1]
        print(course_count)

        # A set drops duplicate entity names within one description.
        tags_final = {tag.name for tag in entities_text(description)}

        cursor.execute(
            'SELECT `courseid` FROM `newcourses` WHERE count = %s',
            (course_count,),
        )
        result = cursor.fetchall()
        print(result)
        course_id = result[0][0]

        for tag in tags_final:
            cursor.execute(
                'INSERT INTO `tags`(`course_id`, `tag`, `tagmp`) VALUES (%s, %s, %s)',
                (course_id, tag, phonetics.dmetaphone(tag)[0]),
            )

    print("->", data)
    cursor.close()
    connection.commit()
    print("-<> ", phonetics.dmetaphone('blockchain'))
    return True
def test_g3_words(self):
    # Pins the Double Metaphone (primary, secondary) codes for words where
    # "g" is followed by "e", "i" or "y"; every case expects both a "K" and
    # a "J" reading, in the order listed.
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("gya", ("K", "J")),
        ("ges", ("KS", "JS")),
        ("gep", ("KP", "JP")),
        ("geb", ("KP", "JP")),
        ("gel", ("KL", "JL")),
        ("gey", ("K", "J")),
        ("gib", ("KP", "JP")),
        ("gil", ("KL", "JL")),
        ("gin", ("KN", "JN")),
        ("gie", ("K", "J")),
        ("gei", ("K", "J")),
        ("ger", ("KR", "JR")),
        ("danger", ("TNJR", "TNKR")),
        ("manager", ("MNKR", "MNJR")),
        ("dowager", ("TKR", "TJR")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def get_phonetic_transcriptions(word):
    """Return the result of ``phonetics.dmetaphone`` for ``word``, unchanged."""
    return phonetics.dmetaphone(word)
def test_various_french(self):
    # Pins the Double Metaphone (primary, secondary) codes for two
    # French-origin words.
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("rogier", ("RJ", "RKR")),
        ("breaux", ("PR", "")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def test_various_slavic(self):
    # Pins the Double Metaphone (primary, secondary) codes for a "-wski"
    # name. assertEqual replaces the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+.
    self.assertEqual(phonetics.dmetaphone("Wewski"), ("ASK", "FFSK"))
def test_various_chinese(self):
    # Pins the Double Metaphone (primary, secondary) codes for "zhao".
    # assertEqual replaces the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+.
    self.assertEqual(phonetics.dmetaphone("zhao"), ("J", ""))
from itertools import combinations, product

import editdistance
import fuzzy
import phonetics

# Exploratory output: Double Metaphone results for two name spellings and
# for three full postal addresses.
print(phonetics.dmetaphone('catherine'))
print(phonetics.dmetaphone('kathryn'))
print(phonetics.dmetaphone('335 Deinhard Lane, Mc Call, ID 83638'))
print(phonetics.dmetaphone('5105 Berwyn Road, College Park, MD 20740'))
print(phonetics.dmetaphone('5105 Berwin Road, College Park, MD 20740'))

name1 = '5105 Berwyn Road, College Park, MD 20740'
name2 = '5105 Berwin Road, College Park, MD 20740'
name3 = '335 Deinhard Lane, Mc Call, ID 83638'

# Edit distance between NYSIIS codes: name1 vs. name2 differ by one letter,
# name1 vs. name3 are unrelated addresses.
nysiis_score = editdistance.eval(fuzzy.nysiis(name1), fuzzy.nysiis(name2))
other_nysiis_score = editdistance.eval(fuzzy.nysiis(name1), fuzzy.nysiis(name3))
print(nysiis_score)
print(other_nysiis_score)


# copied over from polish.py
def loadTestData(inPath=r'testData.txt'):
    """Read ``inPath`` and return its lines without the line terminator.

    Only a trailing newline is removed. The previous ``x[:-1]`` slice also
    cut off the last real character of a final line that had no trailing
    newline.
    """
    with open(inPath, 'r') as handle:
        return [line[:-1] if line.endswith('\n') else line for line in handle]


matchStrings = loadTestData()
def test_double_result(self):
    # "richard" is expected to yield two distinct codes (primary and
    # secondary). assertEqual replaces the deprecated assertEquals alias,
    # which no longer exists in Python 3.12+.
    self.assertEqual(phonetics.dmetaphone(u"richard"), ('RXRT', 'RKRT'))
def test_th_words(self):
    # Pins the Double Metaphone (primary, secondary) codes for two "Th"
    # names; both expect the same primary code starting with "T".
    # assertEqual is used because the assertEquals alias is deprecated and
    # no longer exists in Python 3.12+.
    expectations = (
        ("Thomas", ("TMS", "")),
        ("Thames", ("TMS", "")),
    )
    for word, expected in expectations:
        self.assertEqual(phonetics.dmetaphone(word), expected)
def test_single_result(self):
    # "aubrey" is expected to yield only a primary code; the secondary slot
    # is the empty string. assertEqual replaces the deprecated assertEquals
    # alias, which no longer exists in Python 3.12+.
    self.assertEqual(phonetics.dmetaphone(u"aubrey"), ('APR', ''))