コード例 #1
0
 def test_cc_words(self):
     """Check the code pairs expected for sample words containing "cc"."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("accident", ("AKSTNT", "AKSTNT")),
         ("accede", ("AKST", "AKST")),
         ("succeed", ("SKST", "SKST")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #2
0
 def test_gh_words(self):
     """Check the code pairs expected for sample words ending in "gh"."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("laugh", ("LF", "LF")),
         ("cough", ("KF", "KF")),
         ("rough", ("RF", "RF")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #3
0
 def test_various_german(self):
     """Check the code pairs expected for the German sample words."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("ach", ("AK", "AK")),
         ("bacher", ("PKR", "PKR")),
         ("macher", ("MKR", "MKR")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #4
0
 def test_dutch_origin(self):
     """Check the code pairs expected for the "sch" sample words."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("school", ("SKL", "SKL")),
         ("schooner", ("SKNR", "SKNR")),
         ("schermerhorn", ("XRMRRN", "SKRMRRN")),
         ("schenker", ("XNKR", "SKNKR")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #5
0
 def test_mc_words(self):
     """Check the code pairs expected for sample "mac"/"mc" names."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("mac caffrey", ("MKFR", "MKFR")),
         ("mac gregor", ("MKRKR", "MKRKR")),
         ("mc crae", ("MKR", "MKR")),
         ("mcclain", ("MKLN", "MKLN")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #6
0
 def test_various_spanish(self):
     """Check the code pairs expected for the Spanish sample words."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("bajador", ("PJTR", "PHTR")),
         ("cabrillo", ("KPRL", "KPR")),
         ("gallegos", ("KLKS", "KKS")),
         ("San Jacinto", ("SNHSNT", "SNHSNT")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #7
0
ファイル: fuzzymetaphone.py プロジェクト: sskadamb/csvmatch
def match(value1, value2):
    """Return 1.0 when the two values share a metaphone code, else 0.0.

    Each value is encoded with ``doublemetaphone.doublemetaphone`` and the
    codes at index 0 and index 1 of both results are compared in all four
    combinations.

    Args:
        value1: first value to encode.
        value2: second value to encode.

    Returns:
        ``1.0`` if any comparison succeeds, otherwise ``0.0``.
    """
    codes1 = doublemetaphone.doublemetaphone(value1)
    codes2 = doublemetaphone.doublemetaphone(value2)
    first1, second1 = codes1[0], codes1[1]
    first2, second2 = codes2[0], codes2[1]

    # Fix: the original only guarded the second-vs-second comparison against
    # empty codes.  An empty second code is treated as "no alternative" (that
    # is what the original `!= ''` guard expresses), so it must not match an
    # empty first code in the cross comparisons either.  Two equal first
    # codes still match, as before.
    matched = (
        first1 == first2
        or (second2 != '' and first1 == second2)
        or (second1 != '' and second1 == first2)
        or (second1 != '' and second1 == second2)
    )
    return 1.0 if matched else 0.0
コード例 #8
0
ファイル: predicates.py プロジェクト: jamal2300/dedupe-1
def metaphoneToken(field):
    """Return the set of non-empty codes for the distinct tokens of field.

    The field is split on whitespace, duplicate tokens are dropped, each
    remaining token is passed to ``doublemetaphone``, and every truthy code
    from those results is collected.
    """
    codes = set()
    for token in set(field.split()):
        codes.update(code for code in doublemetaphone(token) if code)
    return codes
コード例 #9
0
def match(data1, data2, fields1, fields2):
    """Pair up records of data1 and data2 whose compared fields share a code.

    Every field value of every record is encoded with
    ``doublemetaphone.doublemetaphone``.  A record pair matches when, for
    each ``(field1, field2)`` pair taken from ``zip(fields1, fields2)``, the
    two encodings share a code.

    Args:
        data1: indexable by record key, each record indexable by field name.
        data2: same layout as data1.
        fields1: field names looked up in data1 records.
        fields2: field names looked up in data2 records, aligned by position
            with fields1.

    Returns:
        A list of ``(data1key, data2key, 1)`` tuples, one per matching pair,
        in iteration order of data1 then data2.
    """
    def encode(data):
        # Same traversal as before: iterate keys, then index each value.
        return {
            key: {field: doublemetaphone.doublemetaphone(data[key][field])
                  for field in data[key]}
            for key in data
        }

    def share_code(codes1, codes2):
        first1, second1 = codes1[0], codes1[1]
        first2, second2 = codes2[0], codes2[1]
        # Fix: an empty second code is treated as "no alternative" (that is
        # what the original `!= ''` guard expresses), so it is now ignored in
        # the cross comparisons too instead of matching an empty first code.
        return bool(
            first1 == first2
            or (second2 != '' and first1 == second2)
            or (second1 != '' and second1 == first2)
            or (second1 != '' and second1 == second2)
        )

    encoded1 = encode(data1)
    encoded2 = encode(data2)
    # Materialised once so the same pairs are reused for every record pair.
    field_pairs = list(zip(fields1, fields2))

    matches = []
    for key1, record1 in encoded1.items():
        for key2, record2 in encoded2.items():
            # all() stops at the first field pair that disagrees; the
            # original kept comparing the remaining fields needlessly and
            # reused the name `match` for a local flag.
            if all(share_code(record1.get(field1), record2.get(field2))
                   for field1, field2 in field_pairs):
                matches.append((key1, key2, 1))
    return matches
コード例 #10
0
ファイル: fuzzymetaphone.py プロジェクト: vkhokhla/csvmatch
def match(data1, data2, fields1, fields2, threshold): # threshold is unused
    """Pair up rows of data1 and data2 whose aligned values share a code.

    Every value of every row is encoded with
    ``doublemetaphone.doublemetaphone``.  Rows are compared position by
    position (``zip`` of the two encoded rows); a row pair matches when every
    compared position shares a code.

    Args:
        data1: iterable of rows, each an iterable of values.
        data2: same layout as data1.
        fields1: not read by this function.
        fields2: not read by this function.
        threshold: not read by this function.

    Returns:
        A list of ``(index1, index2, 1)`` tuples, one per matching row pair,
        where the indices are row positions in data1 and data2.
    """
    def share_code(codes1, codes2):
        first1, second1 = codes1[0], codes1[1]
        first2, second2 = codes2[0], codes2[1]
        # Fix: an empty second code is treated as "no alternative" (that is
        # what the original `!= ''` guard expresses), so it is now ignored in
        # the cross comparisons too instead of matching an empty first code.
        return bool(
            first1 == first2
            or (second2 != '' and first1 == second2)
            or (second1 != '' and second1 == first2)
            or (second1 != '' and second1 == second2)
        )

    encoded1 = [[doublemetaphone.doublemetaphone(value) for value in row] for row in data1]
    encoded2 = [[doublemetaphone.doublemetaphone(value) for value in row] for row in data2]

    matches = []
    for index1, codes_row1 in enumerate(encoded1):
        for index2, codes_row2 in enumerate(encoded2):
            # all() stops at the first position that disagrees; the original
            # kept comparing the remaining positions needlessly and reused
            # the name `match` for a local flag.
            if all(share_code(codes1, codes2)
                   for codes1, codes2 in zip(codes_row1, codes_row2)):
                matches.append((index1, index2, 1))
    return matches
コード例 #11
0
def doubleMetaphone(field):
    """Return the set of non-empty codes ``doublemetaphone`` yields for field.

    Falsy entries of the result (such as an empty string) are dropped.

    Examples:
    .. code:: python
        > print(doubleMetaphone('John Woodward'))
        > {'ANTRT', 'JNTRT'}
    """
    codes = doublemetaphone(field)
    # filter(None, ...) keeps exactly the truthy items.
    return set(filter(None, codes))
コード例 #12
0
ファイル: fuzzymetaphone.py プロジェクト: stungkit/csvmatch
def match(value1, value2):
    """Return 1.0 when the two values share a word-wise metaphone code.

    Each value is split on single spaces and every word is encoded with
    ``doublemetaphone.doublemetaphone``.  The per-word codes are then joined
    with spaces position-wise, giving one string built from the index-0 codes
    and one built from the index-1 codes.  Those strings are compared in all
    four combinations.

    Args:
        value1: first string to encode.
        value2: second string to encode.

    Returns:
        ``1.0`` if any comparison succeeds, otherwise ``0.0``.
    """
    def phrase_codes(value):
        per_word = [doublemetaphone.doublemetaphone(word) for word in value.split(' ')]
        return [' '.join(codes) for codes in zip(*per_word)]

    codes1 = phrase_codes(value1)
    codes2 = phrase_codes(value2)
    first1, second1 = codes1[0], codes1[1]
    first2, second2 = codes2[0], codes2[1]

    # Fix: joining the per-word codes turns "every word has an empty second
    # code" into a string of spaces, so the original `!= ''` guard never
    # fired for multi-word values.  A second code with no content besides
    # the separators is now treated as absent, and is ignored in the cross
    # comparisons as well.  Two equal first codes still match, as before.
    has_second1 = second1.strip() != ''
    has_second2 = second2.strip() != ''
    matched = (
        first1 == first2
        or (has_second2 and first1 == second2)
        or (has_second1 and second1 == first2)
        or (has_second1 and second1 == second2)
    )
    return 1.0 if matched else 0.0
コード例 #13
0
 def test_ch_words(self):
     """Check the code pairs expected for sample words containing "ch"."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("Charac", ("KRK", "KRK")),
         ("Charis", ("KRS", "KRS")),
         ("chord", ("KRT", "KRT")),
         ("Chym", ("KM", "KM")),
         ("Chia", ("K", "K")),
         ("chem", ("KM", "KM")),
         ("chore", ("XR", "XR")),
         ("orchestra", ("ARKSTR", "ARKSTR")),
         ("architect", ("ARKTKT", "ARKTKT")),
         ("orchid", ("ARKT", "ARKT")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #14
0
def metaphoneToken(field):
    """Return the set of non-empty codes for the distinct tokens of field.

    The field is split on whitespace, duplicate tokens are dropped, each
    remaining token is passed to ``doublemetaphone``, and every truthy code
    from those results is collected.

    Examples:
    .. code:: python
        > print(metaphoneToken('John Woodward'))
        > {'AN', 'ATRT', 'FTRT', 'JN'}
    """
    unique_tokens = set(field.split())
    per_token_codes = [doublemetaphone(token) for token in unique_tokens]
    # filter(None, ...) keeps exactly the truthy codes.
    return set(filter(None, itertools.chain.from_iterable(per_token_codes)))
コード例 #15
0
def tokenFeatures(token) :
    """Build the feature dictionary describing a single token.

    The token is normalised into two forms: ``token_clean`` (lower-cased,
    with leading non-word characters and trailing characters other than word
    characters and '.' removed) and ``token_abbrev`` (``token_clean`` with
    every remaining non-word character removed).  A lone ``&`` is kept as is.

    The returned dict holds the named features listed below, plus
    ``prefix_N``/``suffix_N`` entries for N from 1 up to 5 (always shorter
    than the token), plus one ``True`` entry per item yielded by
    ``ngrams(token_abbrev, 3)`` and ``ngrams(token_abbrev, 4)``.

    NOTE(review): ``digits``, ``vowelRatio``, ``ngrams``, ``ratios``,
    ``gender_names``, ``PREPOSITIONS`` and ``VOWELS_Y`` are defined outside
    this block; their exact semantics are not visible here.
    """
    # Fix: the original wrote `token in (u'&')`.  Without a trailing comma
    # that is a substring test against the string u'&', not tuple
    # membership.  An empty token normalises to empty strings either way.
    if token == u'&' :
        token_clean = token_abbrev = token
    else :
        token_clean = re.sub(r'(^[\W]*)|([^.\w]*$)', u'', token.lower())
        token_abbrev = re.sub(r'\W', u'', token_clean)

    metaphone = doublemetaphone(token_abbrev)

    # Evaluated once and reused by both bracket features.
    fully_bracketed = bool(re.match(r'["(\']\w+[")\']', token))
    partly_bracketed = bool(re.match(r'(["(\']\w+)|(\w+[")\'])', token))

    features = {'nopunc' : token_abbrev,
                'abbrev' : token_clean.endswith('.'),
                'comma'  : token.endswith(','),
                'hyphenated' : '-' in token_clean,
                'contracted' : "'" in token_clean,
                'bracketed' : partly_bracketed and not fully_bracketed,
                'fullbracketed' : fully_bracketed,
                'length' : len(token_abbrev),
                'initial' : len(token_abbrev) == 1 and token_abbrev.isalpha(),
                # The first character is deliberately excluded by the slice.
                'has.vowels'  : bool(set(token_abbrev[1:]) & set(VOWELS_Y)),
                'just.letters' : token_abbrev.isalpha(),
                # True when every character is one of x, v, i (also for '').
                'roman' : set('xvi').issuperset(token_abbrev),
                'endswith.vowel' : token_abbrev.endswith(VOWELS_Y),
                'digits' : digits(token_abbrev),
                'metaphone1' : metaphone[0],
                'metaphone2' : metaphone[1],
                'more.vowels' : vowelRatio(token_abbrev),
                'in.names' : token_abbrev.upper() in ratios,
                'prepositions' : token_abbrev in PREPOSITIONS,
                'first.name' : ratios.get(token_abbrev.upper(), 0),
                'gender_ratio' : gender_names.get(token_abbrev, False),
                'possessive' : token_clean.endswith("'s")
                }

    # Affixes of length 1..5, never the whole token: same range the original
    # produced with `range(1, len)` plus a break once i exceeded 4.
    for i in range(1, min(len(token_abbrev), 6)) :
        features['prefix_%s' % i] = token_abbrev[:i]
        features['suffix_%s' % i] = token_abbrev[-i:]

    # 3-grams first, then 4-grams, each recorded as a boolean feature.
    for size in (3, 4) :
        for gram in ngrams(token_abbrev, size) :
            features[gram] = True

    return features
コード例 #16
0
def tokenFeatures(token) :
    """Build the feature dictionary describing a single token.

    The token is normalised into two forms: ``token_clean`` (lower-cased,
    with leading non-word characters and trailing characters other than word
    characters and '.' removed) and ``token_abbrev`` (``token_clean`` with
    every remaining non-word character removed).  A lone ``&`` is kept as is.

    The returned dict holds the named features listed below, plus
    ``prefix_N``/``suffix_N`` entries for N from 1 up to 5 (always shorter
    than the token), plus one ``True`` entry per item yielded by
    ``ngrams(token_abbrev, 3)`` and ``ngrams(token_abbrev, 4)``.

    NOTE(review): ``digits``, ``vowelRatio``, ``ngrams``, ``ratios``,
    ``gender_names``, ``PREPOSITIONS`` and ``VOWELS_Y`` are defined outside
    this block; their exact semantics are not visible here.
    """
    # Fix: the original wrote `token in (u'&')`.  Without a trailing comma
    # that is a substring test against the string u'&', not tuple
    # membership.  An empty token normalises to empty strings either way.
    if token == u'&' :
        token_clean = token_abbrev = token
    else :
        token_clean = re.sub(r'(^[\W]*)|([^.\w]*$)', u'', token.lower())
        token_abbrev = re.sub(r'\W', u'', token_clean)

    metaphone = doublemetaphone(token_abbrev)

    # Evaluated once and reused by both bracket features.
    fully_bracketed = bool(re.match(r'["(\']\w+[")\']', token))
    partly_bracketed = bool(re.match(r'(["(\']\w+)|(\w+[")\'])', token))

    features = {'nopunc' : token_abbrev,
                'abbrev' : token_clean.endswith('.'),
                'comma'  : token.endswith(','),
                'hyphenated' : '-' in token_clean,
                'contracted' : "'" in token_clean,
                'bracketed' : partly_bracketed and not fully_bracketed,
                'fullbracketed' : fully_bracketed,
                'length' : len(token_abbrev),
                'initial' : len(token_abbrev) == 1 and token_abbrev.isalpha(),
                # The first character is deliberately excluded by the slice.
                'has.vowels'  : bool(set(token_abbrev[1:]) & set(VOWELS_Y)),
                'just.letters' : token_abbrev.isalpha(),
                # True when every character is one of x, v, i (also for '').
                'roman' : set('xvi').issuperset(token_abbrev),
                'endswith.vowel' : token_abbrev.endswith(VOWELS_Y),
                'digits' : digits(token_abbrev),
                'metaphone1' : metaphone[0],
                'metaphone2' : metaphone[1],
                'more.vowels' : vowelRatio(token_abbrev),
                'in.names' : token_abbrev.upper() in ratios,
                'prepositions' : token_abbrev in PREPOSITIONS,
                'first.name' : ratios.get(token_abbrev.upper(), 0),
                'gender_ratio' : gender_names.get(token_abbrev, False),
                'possessive' : token_clean.endswith("'s")
                }

    # Affixes of length 1..5, never the whole token: same range the original
    # produced with `range(1, len)` plus a break once i exceeded 4.
    for i in range(1, min(len(token_abbrev), 6)) :
        features['prefix_%s' % i] = token_abbrev[:i]
        features['suffix_%s' % i] = token_abbrev[-i:]

    # 3-grams first, then 4-grams, each recorded as a boolean feature.
    for size in (3, 4) :
        for gram in ngrams(token_abbrev, size) :
            features[gram] = True

    return features
コード例 #17
0
 def test_various_italian(self):
     """Check the code pairs expected for the Italian sample words."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("bacci", ("PX", "PX")),
         ("bertucci", ("PRTX", "PRTX")),
         ("bellocchio", ("PLX", "PLX")),
         ("bacchus", ("PKS", "PKS")),
         ("focaccia", ("FKX", "FKX")),
         ("chianti", ("KNT", "KNT")),
         ("tagliaro", ("TKLR", "TLR")),
         ("biaggi", ("PJ", "PK")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #18
0
 def test_homophones(self):
     """Each pair of spellings must produce identical results."""
     homophone_pairs = (
         (u"tolled", u"told"),
         (u"katherine", u"catherine"),
         (u"brian", u"bryan"),
     )
     for first_spelling, second_spelling in homophone_pairs:
         first_codes = doublemetaphone(first_spelling)
         second_codes = doublemetaphone(second_spelling)
         self.assertEqual(first_codes, second_codes)
コード例 #19
0
    def test_similar_names(self):
        """Check codes for spelling variants of one name and shared codes of name pairs."""
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.

        # NOTE(review): this non-ASCII case was already disabled in the
        # original; the reason is not visible here.
        #result = doublemetaphone("Bartoš")
        #self.assertEqual(result, ('PRTS', 'PRTS'))
        cases = [
            (u"Bartosz", ('PRTS', 'PRTX')),
            (u"Bartosch", ('PRTX', 'PRTX')),
            (u"Bartos", ('PRTS', 'PRTS')),
        ]
        for name, expected in cases:
            # The name is passed as the message so a failure names its input.
            self.assertEqual(doublemetaphone(name), expected, name)

        # Each pair of names is expected to have exactly one code in common.
        shared = set(doublemetaphone(u"Jablonski")).intersection(
            doublemetaphone(u"Yablonsky"))
        self.assertEqual(list(shared), ['APLNSK'])
        shared = set(doublemetaphone(u"Smith")).intersection(
            doublemetaphone(u"Schmidt"))
        self.assertEqual(list(shared), ['XMT'])
コード例 #20
0
ファイル: predicates.py プロジェクト: datamade/dedupe
def doubleMetaphone(field):
    """Return the set of truthy codes that ``doublemetaphone`` yields for field."""
    non_empty_codes = set()
    for code in doublemetaphone(field):
        if code:
            non_empty_codes.add(code)
    return non_empty_codes
コード例 #21
0
 def test_th_words(self):
     """Check the code pairs expected for sample names starting with "Th"."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("Thomas", ("TMS", "TMS")),
         ("Thames", ("TMS", "TMS")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #22
0
 def test_pb_words(self):
     """Check the code pairs expected for sample words containing "pb"."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("Campbell", ("KMPL", "KMPL")),
         ("raspberry", ("RSPR", "RSPR")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #23
0
 def test_g3_words(self):
     """Check the code pairs expected for sample words with "g" before e, i or y."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("gya", ("K", "J")),
         ("ges", ("KS", "JS")),
         ("gep", ("KP", "JP")),
         ("geb", ("KP", "JP")),
         ("gel", ("KL", "JL")),
         ("gey", ("K", "J")),
         ("gib", ("KP", "JP")),
         ("gil", ("KL", "JL")),
         ("gin", ("KN", "JN")),
         ("gie", ("K", "J")),
         ("gei", ("K", "J")),
         ("ger", ("KR", "JR")),
         ("danger", ("TNJR", "TNKR")),
         ("manager", ("MNKR", "MNJR")),
         ("dowager", ("TKR", "TJR")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #24
0
 def test_single_result(self):
     """Check a word whose two expected codes are identical."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(doublemetaphone(u"aubrey"), ('APR', 'APR'))
コード例 #25
0
 def test_non_english_unicode(self):
     """Check the code pair expected for a word containing non-ASCII letters."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(doublemetaphone("andestādītu"), ('ANTSTTT', 'ANTSTTT'))
コード例 #26
0
 def test_double_result(self):
     """Check a word whose two expected codes differ."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(doublemetaphone(u"richard"), ('RXRT', 'RKRT'))
コード例 #27
0
 def test_general_word_list(self):
     """Check the code pairs expected for a mixed list of sample words."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ('Jose', ('HS', 'HS')),
         ('cambrillo', ('KMPRL', 'KMPR')),
         ('otto', ('AT', 'AT')),
         ('aubrey', ('APR', 'APR')),
         ('maurice', ('MRS', 'MRS')),
         ('auto', ('AT', 'AT')),
         ('maisey', ('MS', 'MS')),
         ('catherine', ('K0RN', 'KTRN')),
         ('geoff', ('JF', 'KF')),
         ('Chile', ('XL', 'XL')),
         ('katherine', ('K0RN', 'KTRN')),
         ('steven', ('STFN', 'STFN')),
         ('zhang', ('JNK', 'JNK')),
         ('bob', ('PP', 'PP')),
         ('ray', ('R', 'R')),
         ('Tux', ('TKS', 'TKS')),
         ('bryan', ('PRN', 'PRN')),
         ('bryce', ('PRS', 'PRS')),
         ('Rapelje', ('RPL', 'RPL')),
         ('richard', ('RXRT', 'RKRT')),
         ('solilijs', ('SLLS', 'SLLS')),
         ('Dallas', ('TLS', 'TLS')),
         ('Schwein', ('XN', 'XFN')),
         ('dave', ('TF', 'TF')),
         ('eric', ('ARK', 'ARK')),
         ('Parachute', ('PRKT', 'PRKT')),
         ('brian', ('PRN', 'PRN')),
         ('randy', ('RNT', 'RNT')),
         ('Through', ('0R', 'TR')),
         ('Nowhere', ('NR', 'NR')),
         ('heidi', ('HT', 'HT')),
         ('Arnow', ('ARN', 'ARNF')),
         ('Thumbail', ('0MPL', 'TMPL')),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)
コード例 #28
0
ファイル: predicates.py プロジェクト: datamade/dedupe
def metaphoneToken(field):
    """Return the set of non-empty codes for the distinct tokens of field.

    The field is split on whitespace, duplicate tokens are dropped, each
    remaining token is passed to ``doublemetaphone``, and every truthy code
    from those results is collected.
    """
    distinct_tokens = set(field.split())
    all_codes = set().union(
        *(doublemetaphone(token) for token in distinct_tokens))
    return {code for code in all_codes if code}
コード例 #29
0
 def test_various_chinese(self):
     """Check the code pair expected for the Chinese sample word."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(doublemetaphone("zhao"), ("J", "J"))
コード例 #30
0
 def test_various_slavic(self):
     """Check the code pair expected for the Slavic sample name."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(doublemetaphone("Wewski"), ("ASK", "FFSK"))
コード例 #31
0
def _dm_result_main(s):
    """Return the first element of the ``doublemetaphone`` result for s."""
    codes = doublemetaphone(s)
    return codes[0]
コード例 #32
0
ファイル: predicates.py プロジェクト: zhenglinyi/dedupe
def doubleMetaphone(field):
    """Return the set of truthy codes that ``doublemetaphone`` yields for field."""
    # filter(None, ...) keeps exactly the truthy items, dropping empty codes.
    return set(filter(None, doublemetaphone(field)))
コード例 #33
0
 def test_various_french(self):
     """Check the code pairs expected for the French sample words."""
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     cases = [
         ("rogier", ("RJ", "RJR")),
         ("breaux", ("PR", "PR")),
     ]
     for word, expected in cases:
         # The word is passed as the message so a failure names its input.
         self.assertEqual(doublemetaphone(word), expected, word)