def test_cc_words(self):
    """Check the code pairs produced for words containing "cc"."""
    self.assertEqual(doublemetaphone("accident"), ("AKSTNT", "AKSTNT"))
    self.assertEqual(doublemetaphone("accede"), ("AKST", "AKST"))
    self.assertEqual(doublemetaphone("succeed"), ("SKST", "SKST"))
def test_gh_words(self):
    """Check the code pairs produced for words ending in "gh"."""
    self.assertEqual(doublemetaphone("laugh"), ("LF", "LF"))
    self.assertEqual(doublemetaphone("cough"), ("KF", "KF"))
    self.assertEqual(doublemetaphone("rough"), ("RF", "RF"))
def test_various_german(self):
    """Check the code pairs for "ach", "bacher" and "macher"."""
    self.assertEqual(doublemetaphone("ach"), ("AK", "AK"))
    self.assertEqual(doublemetaphone("bacher"), ("PKR", "PKR"))
    self.assertEqual(doublemetaphone("macher"), ("MKR", "MKR"))
def test_dutch_origin(self):
    """Check the code pairs for words starting with "sch".

    The last two inputs are expected to yield two different codes.
    """
    self.assertEqual(doublemetaphone("school"), ("SKL", "SKL"))
    self.assertEqual(doublemetaphone("schooner"), ("SKNR", "SKNR"))
    self.assertEqual(doublemetaphone("schermerhorn"), ("XRMRRN", "SKRMRRN"))
    self.assertEqual(doublemetaphone("schenker"), ("XNKR", "SKNKR"))
def test_mc_words(self):
    """Check the code pairs for "mac"/"mc" names, with and without a space."""
    self.assertEqual(doublemetaphone("mac caffrey"), ("MKFR", "MKFR"))
    self.assertEqual(doublemetaphone("mac gregor"), ("MKRKR", "MKRKR"))
    self.assertEqual(doublemetaphone("mc crae"), ("MKR", "MKR"))
    self.assertEqual(doublemetaphone("mcclain"), ("MKLN", "MKLN"))
def test_various_spanish(self):
    """Check the code pairs for a handful of Spanish words and names."""
    self.assertEqual(doublemetaphone("bajador"), ("PJTR", "PHTR"))
    self.assertEqual(doublemetaphone("cabrillo"), ("KPRL", "KPR"))
    self.assertEqual(doublemetaphone("gallegos"), ("KLKS", "KKS"))
    self.assertEqual(doublemetaphone("San Jacinto"), ("SNHSNT", "SNHSNT"))
def match(value1, value2):
    """Compare two values by their ``doublemetaphone`` codes.

    Both values are encoded with ``doublemetaphone.doublemetaphone`` and
    the first two codes of each result are cross-compared.

    Returns:
        ``1.0`` if any of the four pairings is equal, otherwise ``0.0``.
        The second-vs-second pairing only counts when the shared code is
        not the empty string.
    """
    codes_a = doublemetaphone.doublemetaphone(value1)
    codes_b = doublemetaphone.doublemetaphone(value2)

    checks = (
        codes_a[0] == codes_b[0],
        codes_a[0] == codes_b[1],
        codes_a[1] == codes_b[0],
        # Chained comparison: equal AND not the empty string.
        codes_a[1] == codes_b[1] != '',
    )
    if any(checks):
        return 1.0
    return 0.0
def metaphoneToken(field):
    """Return the set of non-empty ``doublemetaphone`` codes of each token.

    ``field`` is split on whitespace, duplicate tokens are dropped, every
    remaining token is encoded, and all truthy codes are pooled in one set.
    """
    unique_tokens = set(field.split())
    all_codes = itertools.chain.from_iterable(
        doublemetaphone(token) for token in unique_tokens
    )
    # filter(None, ...) keeps only truthy codes, i.e. drops empty strings.
    return set(filter(None, all_codes))
def match(data1, data2, fields1, fields2):
    """Pair up records whose selected fields all agree phonetically.

    Args:
        data1: Mapping of record key to a mapping of field name to value.
        data2: Same structure as ``data1``.
        fields1: Field names to read from ``data1`` records.
        fields2: Field names to read from ``data2`` records, paired
            positionally with ``fields1`` via ``zip``.

    Returns:
        A list of ``(key1, key2, 1)`` tuples, one for every record pair in
        which each compared field pair shares at least one code.  For a
        field pair, the first two codes of each side are cross-compared;
        the second-vs-second pairing only counts when non-empty.
    """
    encoded1 = {
        key: {
            field: doublemetaphone.doublemetaphone(data1[key][field])
            for field in data1[key]
        }
        for key in data1
    }
    encoded2 = {
        key: {
            field: doublemetaphone.doublemetaphone(data2[key][field])
            for field in data2[key]
        }
        for key in data2
    }

    pairs = []
    for key1, codes1 in encoded1.items():
        for key2, codes2 in encoded2.items():
            # Every field pair is evaluated (no early exit) before deciding.
            field_hits = []
            for field1, field2 in zip(fields1, fields2):
                left = codes1.get(field1)
                right = codes2.get(field2)
                field_hits.append(any((
                    left[0] == right[0],
                    left[0] == right[1],
                    left[1] == right[0],
                    left[1] == right[1] != '',
                )))
            if all(field_hits):
                pairs.append((key1, key2, 1))
    return pairs
def match(data1, data2, fields1, fields2, threshold):
    """Pair up rows whose values all agree phonetically, position by position.

    Args:
        data1: Iterable of rows, each an iterable of values.
        data2: Same structure as ``data1``.
        fields1: Not used by this function.
        fields2: Not used by this function.
        threshold: Not used by this function.

    Returns:
        A list of ``(index1, index2, 1)`` tuples, one for every row pair in
        which each positionally zipped value pair shares at least one code.
        For a value pair, the first two codes of each side are
        cross-compared; the second-vs-second pairing only counts when
        non-empty.
    """
    encoded1 = [
        [doublemetaphone.doublemetaphone(value) for value in row]
        for row in data1
    ]
    encoded2 = [
        [doublemetaphone.doublemetaphone(value) for value in row]
        for row in data2
    ]

    pairs = []
    for index1, codes1 in enumerate(encoded1):
        for index2, codes2 in enumerate(encoded2):
            # Every column is evaluated (no early exit) before deciding.
            column_hits = [
                any((
                    left[0] == right[0],
                    left[0] == right[1],
                    left[1] == right[0],
                    left[1] == right[1] != '',
                ))
                for left, right in zip(codes1, codes2)
            ]
            if all(column_hits):
                pairs.append((index1, index2, 1))
    return pairs
def doubleMetaphone(field):
    """Return the set of non-empty ``doublemetaphone`` codes of ``field``.

    The whole field is encoded as a single string; falsy codes (such as
    an empty string) are dropped.

    Examples:

    .. code:: python

        > print(doubleMetaphone('John Woodward'))
        > {'ANTRT', 'JNTRT'}
    """
    codes = doublemetaphone(field)
    return set(filter(None, codes))
def match(value1, value2):
    """Compare two multi-word values by their per-word metaphone codes.

    Each value is split on single spaces and every word is encoded with
    ``doublemetaphone.doublemetaphone``.  The per-word results are then
    transposed so that all first codes are joined into one space-separated
    string and all second codes into another.

    Returns:
        ``1.0`` if any of the four pairings of those joined strings is
        equal, otherwise ``0.0``.  The second-vs-second pairing only
        counts when the shared string is not empty.
    """
    words_a = [
        doublemetaphone.doublemetaphone(word) for word in value1.split(' ')
    ]
    words_b = [
        doublemetaphone.doublemetaphone(word) for word in value2.split(' ')
    ]

    # zip(*...) transposes [(c0, c1), (c0, c1), ...] into per-position groups.
    joined_a = [' '.join(column) for column in zip(*words_a)]
    joined_b = [' '.join(column) for column in zip(*words_b)]

    checks = (
        joined_a[0] == joined_b[0],
        joined_a[0] == joined_b[1],
        joined_a[1] == joined_b[0],
        joined_a[1] == joined_b[1] != '',
    )
    if any(checks):
        return 1.0
    return 0.0
def test_ch_words(self):
    """Check the code pairs produced for words containing "ch"."""
    self.assertEqual(doublemetaphone("Charac"), ("KRK", "KRK"))
    self.assertEqual(doublemetaphone("Charis"), ("KRS", "KRS"))
    self.assertEqual(doublemetaphone("chord"), ("KRT", "KRT"))
    self.assertEqual(doublemetaphone("Chym"), ("KM", "KM"))
    self.assertEqual(doublemetaphone("Chia"), ("K", "K"))
    self.assertEqual(doublemetaphone("chem"), ("KM", "KM"))
    self.assertEqual(doublemetaphone("chore"), ("XR", "XR"))
    self.assertEqual(doublemetaphone("orchestra"), ("ARKSTR", "ARKSTR"))
    self.assertEqual(doublemetaphone("architect"), ("ARKTKT", "ARKTKT"))
    self.assertEqual(doublemetaphone("orchid"), ("ARKT", "ARKT"))
def metaphoneToken(field):
    """Return the set of non-empty ``doublemetaphone`` codes of each token.

    ``field`` is split on whitespace, duplicate tokens are dropped, every
    remaining token is encoded, and all truthy codes are pooled in one set.

    Examples:

    .. code:: python

        > print(metaphoneToken('John Woodward'))
        > {'AN', 'ATRT', 'FTRT', 'JN'}
    """
    unique_tokens = set(field.split())
    return {
        code
        for token in unique_tokens
        for code in doublemetaphone(token)
        if code
    }
def tokenFeatures(token):
    """Build the feature dictionary for a single token.

    The token is lower-cased and stripped of surrounding punctuation
    (``token_clean``), then stripped of every non-word character
    (``token_abbrev``).  A bare ``&`` is kept as-is for both forms.

    Relies on names defined outside this function: ``doublemetaphone``,
    ``VOWELS_Y``, ``digits``, ``vowelRatio``, ``ratios``, ``PREPOSITIONS``,
    ``gender_names`` and ``ngrams``.

    Args:
        token: The raw token string.

    Returns:
        A dict with the fixed feature keys below, plus ``prefix_N`` /
        ``suffix_N`` entries (N from 1 up to at most 5, never the whole
        token) and one ``True`` entry per item yielded by
        ``ngrams(token_abbrev, 3)`` and ``ngrams(token_abbrev, 4)``.
    """
    # NOTE: the original test was ``token in (u'&')``, which is a substring
    # check against the string '&' (no tuple is created).  The only other
    # input it accepted was the empty string, which yields the same cleaned
    # forms through the general branch below.
    if token == u'&':
        token_clean = token_abbrev = token
    else:
        token_clean = re.sub(r'(^[\W]*)|([^.\w]*$)', u'', token.lower())
        token_abbrev = re.sub(r'\W', u'', token_clean)

    metaphone = doublemetaphone(token_abbrev)

    # Bracket/quote on both ends vs. on at least one end of the raw token.
    fully_bracketed = bool(re.match(r'["(\']\w+[")\']', token))
    partly_bracketed = re.match(r'(["(\']\w+)|(\w+[")\'])', token)

    features = {
        'nopunc': token_abbrev,
        'abbrev': token_clean.endswith('.'),
        'comma': token.endswith(','),
        'hyphenated': '-' in token_clean,
        'contracted': "'" in token_clean,
        'bracketed': bool(partly_bracketed and not fully_bracketed),
        'fullbracketed': fully_bracketed,
        'length': len(token_abbrev),
        'initial': len(token_abbrev) == 1 and token_abbrev.isalpha(),
        # The first character is deliberately excluded from the vowel check.
        'has.vowels': bool(set(token_abbrev[1:]) & set(VOWELS_Y)),
        'just.letters': token_abbrev.isalpha(),
        'roman': set('xvi').issuperset(token_abbrev),
        # NOTE(review): str.endswith needs a str or a tuple of str here;
        # presumably VOWELS_Y is a tuple of characters -- confirm.
        'endswith.vowel': token_abbrev.endswith(VOWELS_Y),
        'digits': digits(token_abbrev),
        'metaphone1': metaphone[0],
        'metaphone2': metaphone[1],
        'more.vowels': vowelRatio(token_abbrev),
        'in.names': token_abbrev.upper() in ratios,
        'prepositions': token_abbrev in PREPOSITIONS,
        'first.name': ratios.get(token_abbrev.upper(), 0),
        'gender_ratio': gender_names.get(token_abbrev, False),
        'possessive': token_clean.endswith("'s"),
    }

    # Proper prefixes/suffixes only (never the full token), lengths 1..5.
    for i in range(1, min(len(token_abbrev), 6)):
        features['prefix_%s' % i] = token_abbrev[:i]
        features['suffix_%s' % i] = token_abbrev[-i:]

    for tri_gram in ngrams(token_abbrev, 3):
        features[tri_gram] = True

    for four_gram in ngrams(token_abbrev, 4):
        features[four_gram] = True

    return features
def test_various_italian(self):
    """Check the code pairs for a handful of Italian words and names."""
    self.assertEqual(doublemetaphone("bacci"), ("PX", "PX"))
    self.assertEqual(doublemetaphone("bertucci"), ("PRTX", "PRTX"))
    self.assertEqual(doublemetaphone("bellocchio"), ("PLX", "PLX"))
    self.assertEqual(doublemetaphone("bacchus"), ("PKS", "PKS"))
    self.assertEqual(doublemetaphone("focaccia"), ("FKX", "FKX"))
    self.assertEqual(doublemetaphone("chianti"), ("KNT", "KNT"))
    self.assertEqual(doublemetaphone("tagliaro"), ("TKLR", "TLR"))
    self.assertEqual(doublemetaphone("biaggi"), ("PJ", "PK"))
def test_homophones(self):
    """Check that each pair of similar-sounding words encodes identically."""
    homophone_pairs = (
        (u"tolled", u"told"),
        (u"katherine", u"catherine"),
        (u"brian", u"bryan"),
    )
    for first, second in homophone_pairs:
        self.assertEqual(doublemetaphone(first), doublemetaphone(second))
def test_similar_names(self):
    """Check codes for spelling variants of similar names.

    The last two checks assert that the two spellings share exactly one
    code.
    """
    # Disabled in the original: non-ASCII input "Bartoš", expected
    # ('PRTS', 'PRTS').
    self.assertEqual(doublemetaphone(u"Bartosz"), ('PRTS', 'PRTX'))
    self.assertEqual(doublemetaphone(u"Bartosch"), ('PRTX', 'PRTX'))
    self.assertEqual(doublemetaphone(u"Bartos"), ('PRTS', 'PRTS'))

    result = set(doublemetaphone(u"Jablonski")).intersection(
        doublemetaphone(u"Yablonsky"))
    self.assertEqual(list(result), ['APLNSK'])

    result = set(doublemetaphone(u"Smith")).intersection(
        doublemetaphone(u"Schmidt"))
    self.assertEqual(list(result), ['XMT'])
def doubleMetaphone(field):
    """Return the set of non-empty ``doublemetaphone`` codes of ``field``."""
    codes = set()
    for code in doublemetaphone(field):
        # Skip falsy codes such as the empty string.
        if code:
            codes.add(code)
    return codes
def test_th_words(self):
    """Check that "Thomas" and "Thames" both encode to "TMS"."""
    self.assertEqual(doublemetaphone("Thomas"), ("TMS", "TMS"))
    self.assertEqual(doublemetaphone("Thames"), ("TMS", "TMS"))
def test_pb_words(self):
    """Check the code pairs for words containing "pb"."""
    self.assertEqual(doublemetaphone("Campbell"), ("KMPL", "KMPL"))
    self.assertEqual(doublemetaphone("raspberry"), ("RSPR", "RSPR"))
def test_g3_words(self):
    """Check the code pairs for "g" followed by e/i/y, plus "-ger" words.

    Every input here is expected to yield two different codes (a "K" and
    a "J" variant).
    """
    self.assertEqual(doublemetaphone("gya"), ("K", "J"))
    self.assertEqual(doublemetaphone("ges"), ("KS", "JS"))
    self.assertEqual(doublemetaphone("gep"), ("KP", "JP"))
    self.assertEqual(doublemetaphone("geb"), ("KP", "JP"))
    self.assertEqual(doublemetaphone("gel"), ("KL", "JL"))
    self.assertEqual(doublemetaphone("gey"), ("K", "J"))
    self.assertEqual(doublemetaphone("gib"), ("KP", "JP"))
    self.assertEqual(doublemetaphone("gil"), ("KL", "JL"))
    self.assertEqual(doublemetaphone("gin"), ("KN", "JN"))
    self.assertEqual(doublemetaphone("gie"), ("K", "J"))
    self.assertEqual(doublemetaphone("gei"), ("K", "J"))
    self.assertEqual(doublemetaphone("ger"), ("KR", "JR"))
    self.assertEqual(doublemetaphone("danger"), ("TNJR", "TNKR"))
    self.assertEqual(doublemetaphone("manager"), ("MNKR", "MNJR"))
    self.assertEqual(doublemetaphone("dowager"), ("TKR", "TJR"))
def test_single_result(self):
    """Check an input whose two codes are expected to be identical."""
    self.assertEqual(doublemetaphone(u"aubrey"), ('APR', 'APR'))
def test_non_english_unicode(self):
    """Check the code pair for an input containing non-ASCII letters."""
    self.assertEqual(doublemetaphone("andestādītu"), ('ANTSTTT', 'ANTSTTT'))
def test_double_result(self):
    """Check an input whose two codes are expected to differ."""
    self.assertEqual(doublemetaphone(u"richard"), ('RXRT', 'RKRT'))
def test_general_word_list(self):
    """Check the code pairs for a mixed list of words and names."""
    self.assertEqual(doublemetaphone('Jose'), ('HS', 'HS'))
    self.assertEqual(doublemetaphone('cambrillo'), ('KMPRL', 'KMPR'))
    self.assertEqual(doublemetaphone('otto'), ('AT', 'AT'))
    self.assertEqual(doublemetaphone('aubrey'), ('APR', 'APR'))
    self.assertEqual(doublemetaphone('maurice'), ('MRS', 'MRS'))
    self.assertEqual(doublemetaphone('auto'), ('AT', 'AT'))
    self.assertEqual(doublemetaphone('maisey'), ('MS', 'MS'))
    self.assertEqual(doublemetaphone('catherine'), ('K0RN', 'KTRN'))
    self.assertEqual(doublemetaphone('geoff'), ('JF', 'KF'))
    self.assertEqual(doublemetaphone('Chile'), ('XL', 'XL'))
    self.assertEqual(doublemetaphone('katherine'), ('K0RN', 'KTRN'))
    self.assertEqual(doublemetaphone('steven'), ('STFN', 'STFN'))
    self.assertEqual(doublemetaphone('zhang'), ('JNK', 'JNK'))
    self.assertEqual(doublemetaphone('bob'), ('PP', 'PP'))
    self.assertEqual(doublemetaphone('ray'), ('R', 'R'))
    self.assertEqual(doublemetaphone('Tux'), ('TKS', 'TKS'))
    self.assertEqual(doublemetaphone('bryan'), ('PRN', 'PRN'))
    self.assertEqual(doublemetaphone('bryce'), ('PRS', 'PRS'))
    self.assertEqual(doublemetaphone('Rapelje'), ('RPL', 'RPL'))
    self.assertEqual(doublemetaphone('richard'), ('RXRT', 'RKRT'))
    self.assertEqual(doublemetaphone('solilijs'), ('SLLS', 'SLLS'))
    self.assertEqual(doublemetaphone('Dallas'), ('TLS', 'TLS'))
    self.assertEqual(doublemetaphone('Schwein'), ('XN', 'XFN'))
    self.assertEqual(doublemetaphone('dave'), ('TF', 'TF'))
    self.assertEqual(doublemetaphone('eric'), ('ARK', 'ARK'))
    self.assertEqual(doublemetaphone('Parachute'), ('PRKT', 'PRKT'))
    self.assertEqual(doublemetaphone('brian'), ('PRN', 'PRN'))
    self.assertEqual(doublemetaphone('randy'), ('RNT', 'RNT'))
    self.assertEqual(doublemetaphone('Through'), ('0R', 'TR'))
    self.assertEqual(doublemetaphone('Nowhere'), ('NR', 'NR'))
    self.assertEqual(doublemetaphone('heidi'), ('HT', 'HT'))
    self.assertEqual(doublemetaphone('Arnow'), ('ARN', 'ARNF'))
    self.assertEqual(doublemetaphone('Thumbail'), ('0MPL', 'TMPL'))
def metaphoneToken(field):
    """Return the set of non-empty ``doublemetaphone`` codes of each token.

    ``field`` is split on whitespace, duplicate tokens are dropped, every
    remaining token is encoded, and all truthy codes are pooled in one set.
    """
    # Encode every distinct token first, then pool the codes.
    encoded_tokens = [doublemetaphone(token) for token in set(field.split())]

    codes = set()
    for token_codes in encoded_tokens:
        for code in token_codes:
            if code:
                codes.add(code)
    return codes
def test_various_chinese(self):
    """Check the code pair for "zhao"."""
    self.assertEqual(doublemetaphone("zhao"), ("J", "J"))
def test_various_slavic(self):
    """Check the code pair for "Wewski", which yields two different codes."""
    self.assertEqual(doublemetaphone("Wewski"), ("ASK", "FFSK"))
def _dm_result_main(s):
    """Return only the first code of ``doublemetaphone(s)``."""
    codes = doublemetaphone(s)
    return codes[0]
def test_various_french(self):
    """Check the code pairs for "rogier" and "breaux"."""
    self.assertEqual(doublemetaphone("rogier"), ("RJ", "RJR"))
    self.assertEqual(doublemetaphone("breaux"), ("PR", "PR"))