Beispiel #1
0
    def find_match_levenshtein_metaphone(self, token, canonical):
        """Find dictionary words close to `token` and check for `canonical`.

        Collects dictionary words within Levenshtein distance 2 of `token`,
        keeps those whose Metaphone code matches `token`'s, and reports
        whether `canonical` is among the survivors.

        :param token: query word (bytes; decoded as UTF-8 before metaphone)
        :param canonical: expected correct spelling to look for
        :return: tuple (match_metaphone, is_match)
        """
        candidates = []
        best_score = 2  # accept only words within edit distance 2
        for word in self.dicts:
            # NOTE(review): dictionary entries appear to be byte strings
            # (they are decoded below) — confirm against self.dicts loader.
            score = jellyfish.levenshtein_distance(
                token,
                word.decode("utf-8").lower())
            if score <= best_score:
                # NOTE(review): best_score shrinks over time, but earlier
                # (worse-scoring) candidates are never pruned from the
                # list — confirm this accumulation is intended.
                best_score = score
                candidates.append(word.lower())

        # Keep only candidates that also *sound* like the token.
        token_metaphone = jellyfish.metaphone(token.decode("utf-8"))
        match_metaphone = [
            match for match in candidates
            if jellyfish.metaphone(match.decode("utf-8")) == token_metaphone
        ]

        #G = ngram.NGram(match_metaphone)
        #best_candidates = G.search(token, threshold=0.5)

        #results = [item[0] for item in best_candidates]

        # The correction succeeded if the canonical spelling survived both filters.
        is_match = False
        for word in match_metaphone:
            if word == canonical:
                is_match = True
                break

        #if len(best_candidates) > 0:
        #    best_match = best_candidates[0][0]
        #else:
        #    best_match = ""

        return match_metaphone, is_match
Beispiel #2
0
    def get_event_code(self, key, language, dimension="2D", event_type="MT"):
        """Returns EventCode

        :param str key: Movie name
        :param str language: Movie language
        :param dimension: Movie dimension, can be 2D, 2D 4DX, 3D, 3D 4DX, or IMAX 3D
        :type dimension: str
        :param event_type: Event types( MT(Movies), CT(Events), PL(Plays), SP(Sports))
        :type event_type: str
        :return: Event Code
        :rtype: str
        :raises BMSError: If the event code is not found
        """
        quickbook = self.quickbook(event_type)
        movies = quickbook['moviesData']['BookMyShow']['arrEvents']
        # Compare titles by metaphone code (spaces stripped) so phonetically
        # equivalent spellings of the movie name still match.
        key = metaphone(key).replace(' ', '')
        for movie in movies:
            if key == metaphone(movie['EventTitle']).replace(' ', ''):
                for child in movie['ChildEvents']:
                    if (language == child['EventLanguage']
                            and dimension == child['EventDimension']):
                        return child['EventCode']
        # No movie/child event matched. (The original attached this raise to a
        # `for ... else`; since the loop contains no `break`, the else branch
        # always ran after the loop, so a plain post-loop raise is equivalent
        # and less misleading.)
        raise BMSError(
            "Event code not found! Please check the Movie name and other options"
        )
Beispiel #3
0
    def get_similarity_score(self, str_a, str_b):
        """Compare the phonetic similarity of two strings.

        Both inputs are converted to pinyin, then Metaphone-encoded, and the
        weighted phone edit distance is scaled into a similarity score.

        :param str_a: (String) name entity
        :param str_b: (String) name entity
        :return: (float) similarity score ranging between 0 and 1
        """
        pinyin_a = self.get_pinyin("".join(str_a), space_seperated=False)
        pinyin_b = self.get_pinyin("".join(str_b), space_seperated=False)
        phone_a = jellyfish.metaphone(pinyin_a)
        phone_b = jellyfish.metaphone(pinyin_b)

        # Scale the phone edit distance by the maximum achievable score.
        edit_distance = self._phone_edit_distance(phone_a, phone_b)
        len_a, len_b = len(phone_a), len(phone_b)
        max_score = max(len_a, len_b) * 4 - abs(len_a - len_b)

        # Both encodings empty: fall back to exact string comparison.
        if max_score == 0:
            return 1 if str_a == str_b else 0

        return 1 - edit_distance / max_score
Beispiel #4
0
def token_similarity(a, b):
    """Score how well two tokens correspond, in [-1, 1].

    Exact (case/whitespace-insensitive) matches score 1; whitespace and
    punctuation may only map to their own kind; alphabetic tokens that
    share a phonetic code score 0.9; otherwise Jaro-Winkler decides.
    """
    # Strings are a case insensitive match.
    # Match any whitespace to any whitespace.
    if a.word.lower().strip() == b.word.lower().strip():
        return 1.

    # Make it impossible for words to map to whitespace.
    if ((isspace(a.word) and not isspace(b.word))
            or (not isspace(a.word) and isspace(b.word))):
        return -1.

    # Punctuation maps well to punctuation, but never to non-punctuation.
    if ispunc(a.word) and ispunc(b.word):
        return 0.9
    if ((ispunc(a.word) and not ispunc(b.word))
            or (not ispunc(a.word) and ispunc(b.word))):
        return -1.

    # Strings sound alike (approximate phonetic match).
    if a.word.isalpha() and b.word.isalpha():
        if jf.metaphone(a.word) == jf.metaphone(b.word):
            return 0.9
        if jf.soundex(a.word) == jf.soundex(b.word):
            return 0.9
        if jf.nysiis(a.word) == jf.nysiis(b.word):
            return 0.9
        if jf.match_rating_codex(a.word) == jf.match_rating_codex(b.word):
            return 0.9

    # Use scaled Jaro-Winkler distance.
    return jf.jaro_winkler(a.word, b.word)
def phonetic_similarity(ref, result):
    """Return the Jaro-Winkler similarity of the Metaphone encodings.

    :param ref: reference string
    :param result: candidate string compared against the reference
    :return: similarity in [0, 1]
    """
    # (The original had dead initialisers `targetTmp = result` and
    # `refTmp = ref` that were immediately overwritten — removed.)
    targetTmp = jellyfish.metaphone(result)
    refTmp = jellyfish.metaphone(ref)

    return jellyfish.jaro_winkler(refTmp, targetTmp)
def similarity_factor(s1, s2):
    """ Returns float number which corresponds to similarity order of two strings s1 and s2 """
    # Character-level similarity measures (all scaled to 0-100).
    diffl = difflib.SequenceMatcher(None, s1, s2).ratio() * 100
    ng = ngram.NGram.compare(s1, s2, N=1) * 100
    fpr = fuzz.partial_ratio(s1, s2)
    # Phonetic similarity: Jaccard over metaphone / soundex codes.
    meta_a = jellyfish.metaphone(unicode(s1)).lower()
    meta_b = jellyfish.metaphone(unicode(s2)).lower()
    jac_metaphone = (1 - distance.jaccard(meta_a, meta_b)) * 100
    sdx_a = jellyfish.soundex(unicode(s1)).lower()
    sdx_b = jellyfish.soundex(unicode(s2)).lower()
    jac_soundex = (1 - distance.jaccard(sdx_a, sdx_b)) * 100
    base = [diffl, ng, fpr]
    # Fold the soundex score in only when it beats the textual average.
    if mean(base) < jac_soundex:
        return mean(base + [jac_soundex, jac_metaphone])
    return mean(base + [jac_metaphone])
def compare_for_seniority_finding(s1, s2):
    """ Returns the input word if it is similar (according to corresponding algorithms) to some another word.
        s1 - main string, s2 - string from list for comparison
    """
    u1, u2 = unicode(s1), unicode(s2)
    fpr = fuzz.partial_ratio(s1, s2)
    # Jaccard similarity of each phonetic encoding, as a percentage.
    jac_metaphone = (1 - distance.jaccard(jellyfish.metaphone(u1).lower(),
                                          jellyfish.metaphone(u2).lower())) * 100
    jac_soundex = (1 - distance.jaccard(jellyfish.soundex(u1).lower(),
                                        jellyfish.soundex(u2).lower())) * 100
    jac_mrc = (1 - distance.jaccard(jellyfish.match_rating_codex(u1).lower(),
                                    jellyfish.match_rating_codex(u2).lower())) * 100
    # All phonetic thresholds must hold together with the fuzzy ratio.
    return fpr >= 50 and jac_soundex > 70 and jac_metaphone > 65 and jac_mrc > 65
def phonetic_encoded_jaro_winkler_sim(ref, result):
    """Jaro-Winkler similarity computed on the Metaphone encodings."""
    ref_code = jellyfish.metaphone(ref)
    target_code = jellyfish.metaphone(result)
    if Debug:
        # Trace the raw strings and their phonetic codes.
        print("Result: \t\t" + result)
        print("Converted result: \t" + target_code)
        print("Ref: \t\t\t" + ref)
        print("Converted ref: \t\t" + ref_code)
        print(
            "------------------------------------------------------------------------------"
        )
    return jellyfish.jaro_winkler(ref_code, target_code)
    def correct(self, wrongWord):
        """Return scored spelling candidates for `wrongWord`.

        Candidates are gathered from the inverted trigram index (kept when
        within edit distance 2) and from the inverted Metaphone index, then
        scored by corpus frequency, edit-distance closeness, and a phonetic
        bonus.

        :param wrongWord: the misspelled word to correct
        :return: sorted list of ScoreRcd(candidate, edit_distance, score)
        """
        candidates = []
        candidateDistList = []
        # Trigrams of the misspelled word drive the index lookup.
        wWTGrams = self.getGrams(wrongWord, SpellChecker.invertMapGram)

        for trigram in wWTGrams:
            if trigram in SpellChecker.invertTriMap:
                addList = []
                tmpList = SpellChecker.invertTriMap[trigram]
                for tmp in tmpList:
                    ed = self.compED(tmp, wrongWord)
                    if ed <= 2:
                        addList.append(tmp)
                candidates = candidates + addList

        #soundexHash = jellyfish.soundex(wrongWord)
        #if soundexHash in SpellChecker.invertSoundexMap:
        #	candidates = candidates + SpellChecker.invertSoundexMap[soundexHash]
        #candidates = list(set(candidates))

        # Add words sharing the Metaphone code, then de-duplicate everything.
        metaHash = jellyfish.metaphone(wrongWord)
        if metaHash in SpellChecker.invertMetaMap:
            candidates = candidates + SpellChecker.invertMetaMap[metaHash]
        candidates = list(set(candidates))

        #print (len(candidates))

        for candidate in candidates:
            # Skip candidates whose length differs too much, and the word itself.
            if abs(len(candidate) - len(wrongWord)) > 2:
                continue
            if wrongWord == candidate:
                continue
            ed = self.compED(candidate, wrongWord)
            jd = jellyfish.jaro_distance(wrongWord, candidate)  # NOTE(review): unused
            gd = self.getJackSim(
                self.getGrams(candidate, SpellChecker.jackardGram),
                self.getGrams(wrongWord, SpellChecker.jackardGram))  # NOTE(review): unused
            # Score = relative corpus frequency + closeness in edit distance.
            score = float(SpellChecker.dictCountMap[candidate]) / float(
                SpellChecker.totalCount) + (
                    max(len(candidate), len(wrongWord)) - ed)
            # Small phonetic bonus when the candidate sounds like the input.
            if jellyfish.metaphone(wrongWord) == jellyfish.metaphone(
                    candidate):
                score = score + 0.1
            #if jellyfish.soundex(wrongWord) == jellyfish.soundex(candidate):
            #	score = score+0.1
            #if jellyfish.nysiis(wrongWord) == jellyfish.nysiis(candidate):
            #	score = score+0.1
            #if jellyfish.match_rating_codex(wrongWord) == jellyfish.match_rating_codex(candidate):
            #	score = score+0.1
            tmpCandidate = ScoreRcd(candidate, ed, score)
            candidateDistList.append(tmpCandidate)
        # Sort order is defined by ScoreRcd's comparison methods.
        candidateDistList.sort()
        return candidateDistList
def phonetic_encoded_jaccard_sim(str1, str2):
    """Jaccard similarity computed on the Metaphone encodings of two strings."""
    ref_code = jellyfish.metaphone(str1)
    target_code = jellyfish.metaphone(str2)
    if Debug:
        # Trace the raw strings and their phonetic codes.
        print("Result: \t\t" + str2)
        print("Converted result: \t" + target_code)
        print("Ref: \t\t\t" + str1)
        print("Converted ref: \t\t" + ref_code)
        print(
            "------------------------------------------------------------------------------"
        )

    return jaccard_sim(ref_code, target_code)
Beispiel #11
0
def phonetic_similarity(ref, result):
    """Jaro-Winkler similarity of the Metaphone codes, rounded to 5 digits.

    :param ref: reference string
    :param result: candidate string compared against the reference
    :return: rounded similarity in [0, 1]
    """
    # (The original had dead initialisers `targetTmp = result` and
    # `refTmp = ref` that were immediately overwritten — removed.)
    targetTmp = jellyfish.metaphone(result)
    refTmp = jellyfish.metaphone(ref)
    if Debug:
        # Trace the raw strings and their phonetic codes.
        print("Result: \t\t" + result)
        print("Converted result: \t" + targetTmp)
        print("Ref: \t\t\t" + ref)
        print("Converted ref: \t\t" + refTmp)
        print(
            "------------------------------------------------------------------------------"
        )
    return round(jellyfish.jaro_winkler(refTmp, targetTmp), 5)
def similarity_factor(s1, s2):
    """ Returns float number which corresponds to similarity order of two strings s1 and s2 """
    # Text-based similarity measures, scaled to 0-100.
    diffl = difflib.SequenceMatcher(None, s1, s2).ratio() * 100
    ng = ngram.NGram.compare(s1, s2, N=1) * 100
    fpr = fuzz.partial_ratio(s1, s2)
    # Phonetic similarity: Jaccard over the metaphone / soundex encodings.
    code_m1 = jellyfish.metaphone(unicode(s1)).lower()
    code_m2 = jellyfish.metaphone(unicode(s2)).lower()
    jac_metaphone = (1 - distance.jaccard(code_m1, code_m2)) * 100
    code_s1 = jellyfish.soundex(unicode(s1)).lower()
    code_s2 = jellyfish.soundex(unicode(s2)).lower()
    jac_soundex = (1 - distance.jaccard(code_s1, code_s2)) * 100
    textual = [diffl, ng, fpr]
    # Only include soundex when it exceeds the average of the textual scores.
    if mean(textual) < jac_soundex:
        return mean(textual + [jac_soundex, jac_metaphone])
    return mean(textual + [jac_metaphone])
def compare_for_seniority_finding(s1, s2):
    """ Returns the input word if it is similar (according to corresponding algorithms) to some another word.
        s1 - main string, s2 - string from list for comparison
    """
    u1 = unicode(s1)
    u2 = unicode(s2)
    fpr = fuzz.partial_ratio(s1, s2)

    def _pct(code_a, code_b):
        # Jaccard similarity of two phonetic codes, as a percentage.
        return (1 - distance.jaccard(code_a.lower(), code_b.lower())) * 100

    jac_metaphone = _pct(jellyfish.metaphone(u1), jellyfish.metaphone(u2))
    jac_soundex = _pct(jellyfish.soundex(u1), jellyfish.soundex(u2))
    jac_mrc = _pct(jellyfish.match_rating_codex(u1),
                   jellyfish.match_rating_codex(u2))
    return fpr >= 50 and jac_soundex > 70 and jac_metaphone > 65 and jac_mrc > 65
def string_match(x, y):
    """Return True when two UTF-8 byte strings are phonetically identical.

    Both inputs are decoded and compared by their Metaphone encoding.

    :param x: first word (UTF-8 bytes)
    :param y: second word (UTF-8 bytes)
    :return: bool — True when the Metaphone codes match
    """
    # converts the string into unicode
    x = x.decode('utf-8')
    y = y.decode('utf-8')

    # Identical metaphone encodings mean the words sound alike;
    # the comparison result is the answer (no if/return-True needed).
    return jf.metaphone(x) == jf.metaphone(y)
    def __init__(self):
        """Load the word-frequency dictionary and build the inverted
        trigram, Soundex and Metaphone indexes used by the spell checker.

        All state lives on the SpellChecker class itself, so every
        instance shares the same indexes.
        """
        SpellChecker.dictCountMap = self.readDitionary(
            '../data/count_1w100k.txt')
        # Total corpus count, used later to normalise word frequencies.
        for key in SpellChecker.dictCountMap:
            SpellChecker.totalCount += SpellChecker.dictCountMap[key]
        for word in SpellChecker.dictCountMap:
            # Inverted index: trigram -> words containing that trigram.
            tGList = self.getGrams(word, SpellChecker.invertMapGram)
            for tgram in tGList:
                SpellChecker.invertTriMap.setdefault(tgram, []).append(word)

            # Inverted index: Soundex code -> words sharing that code.
            soundexHash = jellyfish.soundex(word)
            SpellChecker.invertSoundexMap.setdefault(soundexHash,
                                                     []).append(word)

            # Inverted index: Metaphone code -> words sharing that code.
            # BUGFIX: the original reused a shared `tmpWordList` without
            # resetting it between the Soundex and Metaphone sections, so a
            # previously-unseen Metaphone key could alias (and mutate) the
            # Soundex list. `setdefault` always appends to the right list.
            metaHash = jellyfish.metaphone(word)
            SpellChecker.invertMetaMap.setdefault(metaHash, []).append(word)
Beispiel #16
0
def simple_example():
    """Demonstrate the jellyfish string-distance and phonetic-encoding APIs
    by printing each function's result for a fixed pair of strings."""
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'

    print("jellyfish.levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.levenshtein_distance(str1, str2)))
    print("jellyfish.damerau_levenshtein_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.damerau_levenshtein_distance(str1, str2)))
    print("jellyfish.hamming_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.hamming_distance(str1, str2)))
    print("jellyfish.jaro_distance({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_distance(str1, str2)))
    print("jellyfish.jaro_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_similarity(str1, str2)))
    print("jellyfish.jaro_winkler({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler(str1, str2)))
    print("jellyfish.jaro_winkler_similarity({}, {}) = {}.".format(
        str1, str2, jellyfish.jaro_winkler_similarity(str1, str2)))
    print("jellyfish.match_rating_comparison({}, {}) = {}.".format(
        str1, str2, jellyfish.match_rating_comparison(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'

    print("jellyfish.metaphone({}) = {}.".format(ss, jellyfish.metaphone(ss)))
    print("jellyfish.soundex({}) = {}.".format(ss, jellyfish.soundex(ss)))
    print("jellyfish.nysiis({}) = {}.".format(ss, jellyfish.nysiis(ss)))
    print("jellyfish.match_rating_codex({}) = {}.".format(
        ss, jellyfish.match_rating_codex(ss)))
Beispiel #17
0
 def make_shortcode(self, name):
     """Build an 8-character shortcode from `name`.

     The prefix is up to 4 characters of the Metaphone code of the
     upper-cased, space-stripped name (falling back to the raw name when
     the code is shorter than 4 characters); the remainder is filled from
     a zero-padded random number, truncated to 8 characters total.
     """
     upper_name = name.upper().replace(" ", "")
     shortcode = jellyfish.metaphone(upper_name)[:4]
     if len(shortcode) < 4:
         shortcode = upper_name[:4]
     # NOTE(review): if the fallback is still shorter than 4 characters,
     # extra random digits fill the gap — confirm that is acceptable.
     shortcode += str(random.randint(0, 99999999)).zfill(8)
     return shortcode[:8]
Beispiel #18
0
def pickle_data():
    """Load the state rows from the SQLite database, normalise them, and
    pickle the resulting list of dicts to us/states.pkl."""
    dbpath = os.path.abspath(os.path.join(PWD, 'data.db'))

    conn = sqlite3.connect(dbpath)
    conn.row_factory = dict_factory

    cursor = conn.cursor()
    cursor.execute("""SELECT * FROM states ORDER BY name""")

    states = []
    for row in cursor:
        # Pre-compute the phonetic key used for fuzzy name lookups.
        row['name_metaphone'] = jellyfish.metaphone(row['name'])
        # SQLite stores booleans as 0/1; convert the flag columns to bools.
        for flag in ('is_territory', 'is_obsolete', 'is_contiguous',
                     'is_continental'):
            row[flag] = row[flag] == 1
        row['time_zones'] = row['time_zones'].split(',')
        states.append(row)

    pkl_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(states, pkl_file)
Beispiel #19
0
def fuzzy(string):
    """Return a JSON response containing the four jellyfish phonetic
    encodings (metaphone, soundex, NYSIIS, match rating codex) of `string`."""
    return jsonify({
        "metaphone": jellyfish.metaphone(string),
        "soundex": jellyfish.soundex(string),
        "nysiis": jellyfish.nysiis(string),
        "match_rating_codex": jellyfish.match_rating_codex(string),
    })
Beispiel #20
0
def pickle_state_data():
    """Read every state row from data.db, normalise the columns, and dump
    the list of dicts to us/states.pkl."""
    database_path = os.path.abspath(os.path.join(PWD, 'data.db'))

    connection = sqlite3.connect(database_path)
    connection.row_factory = dict_factory

    cur = connection.cursor()
    cur.execute("""SELECT * FROM states ORDER BY name""")

    states = []
    for record in cur:
        # Phonetic key for fuzzy lookups by state name.
        record['name_metaphone'] = jellyfish.metaphone(record['name'])
        # Integer flag columns become real booleans.
        for flag in ('is_territory', 'is_obsolete', 'is_contiguous',
                     'is_continental'):
            record[flag] = record[flag] == 1
        record['time_zones'] = record['time_zones'].split(',')
        states.append(record)

    output_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))
    with open(output_path, 'wb') as output_file:
        pickle.dump(states, output_file)
def metaphone(x):
    """Decode a UTF-8 byte string and return its Metaphone encoding."""
    decoded = x.decode('utf-8')
    return jf.metaphone(decoded)
Beispiel #22
0
 def make_shortcode(self, name):
     """Generate an 8-character shortcode for `name`.

     Uses up to 4 characters of the name's Metaphone code (or the raw
     upper-cased name if the code is too short) followed by random digits,
     truncated to 8 characters.
     """
     upper_name = name.upper().replace(" ", "")
     shortcode = jellyfish.metaphone(upper_name)[:4]
     if len(shortcode) < 4:
         shortcode = upper_name[:4]
     # Random, zero-padded digits pad the code out to at least 8 chars.
     shortcode += str(random.randint(0, 99999999)).zfill(8)
     return shortcode[:8]
Beispiel #23
0
def pickle_data():
    """Read state rows from data.db, normalise them, and pickle the list
    with a Python-2-compatible protocol."""
    db_file = os.path.abspath(os.path.join(PWD, 'data.db'))

    connection = sqlite3.connect(db_file)
    connection.row_factory = dict_factory

    cur = connection.cursor()
    cur.execute("""SELECT * FROM states ORDER BY name""")

    states = []
    for record in cur:
        # Phonetic key for fuzzy lookups by state name.
        record['name_metaphone'] = jellyfish.metaphone(record['name'])
        # SQLite keeps booleans as integers; coerce the flag columns.
        for flag in ('is_territory', 'is_obsolete', 'is_contiguous',
                     'is_continental'):
            record[flag] = record[flag] == 1
        record['time_zones'] = record['time_zones'].split(',')
        states.append(record)

    out_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))
    with open(out_path, 'wb') as out_file:
        # Protocol 2 keeps the pickle loadable from Python 2 even when
        # this file is generated under Python 3.
        pickle.dump(states, out_file, protocol=2)
    def find_min_dist(lyrics):
        """Closure: scan one lyrics string for the best phonetic match of the
        enclosing scope's `test_met` phrase, updating the enclosing
        `min_dist` / `min_dist_idx` / `phrase` state and advancing `idx`.

        NOTE(review): `test_met` comes from the enclosing scope (not visible
        here) — presumably the metaphone tokens of the query phrase; confirm.
        """
        nonlocal min_dist
        nonlocal min_dist_idx
        nonlocal phrase
        nonlocal idx

        # Find best match phrase in lyrics
        min_dist_this_lyrics = 10000  # sentinel: larger than any real distance
        min_dist_start_idx = 0
        min_dist_end_idx = 0
        # Slide a window of len(test_met) metaphone tokens over the lyrics.
        lyrics_met = jellyfish.metaphone(lyrics).split(' ')
        for i in range(0, len(lyrics_met) - len(test_met)):
            this_lyrics_met = lyrics_met[i:i + len(test_met)]
            # Only consider windows that start with the same phonetic token.
            if this_lyrics_met[0] == test_met[0]:
                dist = jellyfish.levenshtein_distance(''.join(test_met), ''.join(this_lyrics_met))
                if dist < min_dist_this_lyrics:
                    min_dist_this_lyrics = dist
                    min_dist_start_idx = i
                    min_dist_end_idx = i + len(test_met)

        # Check against global min
        if min_dist_this_lyrics < min_dist:
            min_dist = min_dist_this_lyrics
            min_dist_idx = idx
            # Recover the matching words from the original (unencoded) lyrics.
            phrase = ' '.join(lyrics.split(' ')[min_dist_start_idx:min_dist_end_idx])

        # Increment global idx
        idx += 1
Beispiel #25
0
def metaphone():
    """For each misspelled word, find the dictionary entries whose Metaphone
    codes align best under a custom DP score, and write them to a file.

    Reads the module-level `wiki_misspell` iterable and `my_dict` word list;
    writes the best candidates (space-separated) per line to
    metaphone_result.txt and prints each decision.
    """
    fw6 = open('metaphone_result.txt', 'w')
    for line in wiki_misspell:
        string = line.strip()
        dis = 100000  # best (lowest) alignment score seen so far
        bests = ""    # space-separated candidates achieving that score
        string_s = jellyfish.metaphone(string)
        for entry in my_dict:
            entry.strip()  # NOTE(review): result discarded — likely meant `entry = entry.strip()`
            entry_s = jellyfish.metaphone(entry)

            # Dynamic-programming table over the two metaphone codes.
            # NOTE(review): a character match scores -1 (a reward) while
            # mismatches and gaps cost +1, and the first row/column are 0 —
            # this is an alignment-style score, not a standard edit distance.
            len_entry = len(entry_s) + 1
            len_string = len(string_s) + 1
            distance_m = [[0 for i in range(len_string)]
                          for i in range(len_entry)]
            for i in range(0, len_entry):
                distance_m[i][0] = 0
            for i in range(0, len_string):
                distance_m[0][i] = 0

            for i in range(1, len_entry):
                for j in range(1, len_string):
                    if entry_s[i - 1] == string_s[j - 1]:
                        distance_m[i][j] = min(
                            distance_m[i - 1][j - 1] - 1,
                            distance_m[i - 1][j] + 1,
                            distance_m[i][j - 1] + 1,
                        )
                    else:
                        distance_m[i][j] = min(
                            distance_m[i - 1][j - 1] + 1,
                            distance_m[i - 1][j] + 1,
                            distance_m[i][j - 1] + 1,
                        )

            tem_dis = distance_m[len_entry - 1][len_string - 1]

            # Track all entries tied for the best score.
            if tem_dis < dis:
                dis = tem_dis
                bests = " "
                bests = entry.strip()
            elif tem_dis == dis:
                bests += ' ' + entry.strip()

        print(dis, string, bests)
        fw6.write(bests + '\n')
    fw6.close()
Beispiel #26
0
def _word_similarity_score(a, b):
    """Score how similar two word strings are, in [0, 1].

    Exact matches score 1; whitespace and punctuation only match their own
    kind; phonetic matches score 0.9; otherwise Jaro-Winkler decides.
    """
    if a == b:
        return 1.

    # Case and whitespace insensitive comparison.
    if a.lower().strip() == b.lower().strip():
        return 0.95

    # Penalize whitespace matching to non-whitespace.
    if ((_isspace(a) and not _isspace(b)) or
        (not _isspace(a) and _isspace(b))):
        return 0

    # Exceptions to punctuation.
    if _match_ampersand(a, b):
        return 0.85
    # Punctuation matches punctuation well...
    if _ispunc(a) and _ispunc(b):
        return 0.95
    # ...but punctuation never matches non-punctuation.
    if ((_ispunc(a) and not _ispunc(b)) or
        (not _ispunc(a) and _ispunc(b))):
        return 0

    # Problems with phonetic match functions segfaulting on
    # empty strings. Also beneficial to match strings with
    # no alpha characters to each other (e.g., line numbers).
    a_alpha = u''.join([ c for c in a if c.isalpha() ])
    b_alpha = u''.join([ c for c in b if c.isalpha() ])
    if a_alpha == '' and b_alpha == '':
        return 0.85

    # Strings sound alike (approximate phonetic match).
    if jf.match_rating_comparison(a_alpha, b_alpha):
        return 0.9
    if jf.metaphone(a_alpha) == jf.metaphone(b_alpha):
        return 0.9
    if jf.soundex(a_alpha) == jf.soundex(b_alpha):
        return 0.9
    if jf.nysiis(a_alpha) == jf.nysiis(b_alpha):
        return 0.9

    # Use scaled Jaro-Winkler distance.
    return jf.jaro_winkler(a, b)
Beispiel #27
0
def get_ethnicity_list(input_list):
    """Map each name in `input_list` to an ethnicity via its Metaphone code.

    Names whose phonetic code is not present in the D4name_ethnicity_meta
    lookup table map to 'other'.

    :param input_list: iterable of name strings
    :return: list of ethnicity labels, one per input name
    """
    output_list = []
    for i in input_list:
        temp = jellyfish.metaphone(unicode(i))
        # `in` replaces dict.has_key(), which is Python-2-only and was
        # removed in Python 3; behavior is identical.
        if temp in D4name_ethnicity_meta:
            output_list.append(D4name_ethnicity_meta[temp])
        else:
            output_list.append('other')

    return output_list
Beispiel #28
0
def phonetic(addressline):
    """Create a metaphone representation of an address or partial address.

    Tokens that start with a digit (house numbers etc.) are kept verbatim,
    since metaphone would drop the digits; every other word is replaced by
    its Metaphone encoding. The pieces are concatenated without separators.
    """
    # Raw strings for the regex patterns: '\s' and '\d' are invalid escape
    # sequences in ordinary string literals (SyntaxWarning on Python 3).
    words = re.split(r'\s+', addressline)
    phonetics = []
    for word in words:
        # re.match anchors at the start, so this tests the first character.
        if re.match(r'\d', word):
            phonetics.append(word)
        else:
            phonetics.append(jellyfish.metaphone(word))
    return ''.join(phonetics)
Beispiel #29
0
def process_stop_words(text):
    """Metaphone-encode every non-stop-word token; digit tokens pass through."""
    result = []
    for word in text.split():
        lower_word = word.lower()
        if lower_word.isdigit():
            # Keep numeric tokens verbatim (metaphone would drop digits).
            result.append(lower_word)
        elif word and lower_word not in stop_words_set:
            result.append(jf.metaphone(lower_word))
    return ' '.join(result)
Beispiel #30
0
 def scoring(self, suggestion, phrase):
     """Score a spelling suggestion; more negative return values are better.

     Combines edit-distance buckets, a prefix bonus, corpus frequency, and
     a phonetic-match bonus; the result is negated so that an ascending
     sort ranks the best suggestion first.
     """
     score = 0
     # Exact match dominates everything else.
     if (suggestion.distance == 0):
         score += 2000
     # Prefix completions get a sizeable bonus.
     if (suggestion.suggest_rule == SuggestRule.PREFIX):
         score += 500
     # Graduated bonuses by edit distance; beyond 2 the bonus decays.
     if (suggestion.distance == 1):
         score += 300
     if (suggestion.distance == 2):
         score += 100
     if (suggestion.distance > 2):
         score += (100 - suggestion.distance * 10)
     # Frequency contributes a tiny tie-breaker; rare words are penalised.
     score += suggestion.count / 100000000
     if (suggestion.count < 100000):
         score -= 10000000 / suggestion.count
     # Phonetic bonus when the suggestion sounds like the query phrase.
     if (jellyfish.metaphone(
             suggestion.term) == jellyfish.metaphone(phrase)):
         score += 50
     # NOTE(review): debug print left enabled — confirm it is wanted.
     print(str(suggestion) + "score is : " + str(score))
     return score * -1
Beispiel #31
0
    def __init__(self, plainEntity):
        """
    Instantiates a new encoded entity object.
    Requires 'plainEntity': plain text entity to encode
    """

        # Python 2 idiom: promote byte strings to unicode before encoding.
        if isinstance(plainEntity, str):
            plainEntity = unicode(plainEntity, 'utf-8')

        # Keep both the raw text and its Metaphone (phonetic) encoding.
        self.plain = plainEntity
        self.encoded = jellyfish.metaphone(plainEntity)
Beispiel #32
0
    def __init__(self, **kwargs):
        """Store every keyword argument as an attribute and, when the
        optional jellyfish dependency is available, precompute the
        metaphone encoding of the `name` attribute as `name_metaphone`."""
        for k, v in kwargs.items():
            self.__dict__[k] = v

        try:
            import jellyfish

            self.__dict__["name_metaphone"] = jellyfish.metaphone(
                self.__dict__["name"])
        except Exception:
            # Best-effort: jellyfish may be missing or `name` absent.
            # `except Exception` (not a bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            pass
Beispiel #33
0
def process_stop_words(text):
    """Replace each non-stop-word with its metaphone code; keep digits as-is."""
    encoded = []
    for token in text.split():
        token_lower = token.lower()
        if token_lower.isdigit():
            # Numeric tokens are preserved (metaphone would drop them).
            encoded.append(token_lower)
        elif token and token_lower not in stop_words_set:
            encoded.append(jf.metaphone(token_lower))
    return " ".join(encoded)
Beispiel #34
0
def measure_string_distance(s1, s2, method):
    '''
            Measure the similarity of two strings.

            Two methods focus on string similarity and three on phonetic encoding.
            Method code to method name:
            1. jaro-winkler distance
            2. damerau-levenshtein distance
            3. Metaphone
            4. NYSIIS
            5. match_rating_codex

            note:
                    methods 3, 4 and 5 compare phonetic encodings and can only
                    return 1 (match) or 0 (not match); methods 1 and 2 return
                    a value in the range [0, 1].  Empty input or an unknown
                    method code returns 0.
    '''
    result = 0

    # Empty strings cannot be compared meaningfully.
    if s1 == '' or s2 == '':
        return result

    if method == 1:
        result = jellyfish.jaro_winkler(s1, s2)
    elif method == 2:
        try:
            diff = jellyfish.damerau_levenshtein_distance(s1, s2)
            # NOTE(review): under Python 2 this would be integer division —
            # confirm the intended runtime is Python 3.
            result = 1 - (diff / max(len(s1), len(s2)))
        except Exception:
            # The distance function can raise on unsupported input
            # (e.g. non-ASCII on some versions); treat as "no similarity".
            result = 0
    elif method == 3:
        result = 1 if jellyfish.metaphone(s1) == jellyfish.metaphone(s2) else 0
    elif method == 4:
        result = 1 if jellyfish.nysiis(s1) == jellyfish.nysiis(s2) else 0
    elif method == 5:
        result = 1 if jellyfish.match_rating_codex(
            s1) == jellyfish.match_rating_codex(s2) else 0

    return result
 def process(self, term):
     """Return the best spelling correction for `term`, or `term` itself.

     Candidates from `generate_candidates` are ranked by a weighted mix of
     phonetic (metaphone) and literal Levenshtein distance; only candidates
     sharing the first letter of `term` and containing no apostrophe are
     considered.
     """
     if term == '':
         return term
     candidates = self.generate_candidates(term)
     if candidates:
         # Weighted score: 60% phonetic distance, 40% literal distance.
         scores = [
             (0.6 * levenshtein_distance(metaphone(i), metaphone(term)) +
              0.4 * levenshtein_distance(i, term), idx)
             for idx, i in enumerate(candidates)
         ]
         min_value = 1000
         min_idx = -1
         if len(scores) > 0:
             for score, idx in scores:
                 if candidates[idx].startswith(
                         term[0]) and "'" not in candidates[idx]:
                     if score < min_value:
                         min_value = score
                         min_idx = idx
             # NOTE(review): if no candidate passes the filters, min_idx
             # stays -1 and the LAST candidate is returned — confirm.
             term = candidates[min_idx]
     return term
Beispiel #36
0
def bestcandidate(wrd):
    """Return a spelling candidate for `wrd` from its Brown cluster, or 'No'.

    Looks up the word's Brown cluster, then keeps cluster members that pass
    the dictionary check and lie within a small character or phonetic
    distance; returns the last surviving candidate.
    """
    w = wrd
    candidate_list = []
    try:
        #Check the Brown word clusters
        c = bcluster._word[w]
        # NOTE(review): only the LAST record's cluster id is kept — confirm.
        for rec in c:
            d = rec['cluster']
        recs = bcluster._cluster[d]
        for rec in recs:
            candidate = rec['word']
            levenshtein = jellyfish.levenshtein_distance(w,candidate)
            n2 = jellyfish.metaphone(w)
            n3 = jellyfish.metaphone(candidate)
            # chant.check keeps only real dictionary words.
            if chant.check(candidate):
                #Filter the candidates within a specific character and phonetic distance
                if levenshtein <= 2 or jellyfish.levenshtein_distance(n2, n3) <= 1:
                    candidate_list.append((candidate, rec['count']))
        # NOTE(review): returns the LAST surviving candidate — confirm the
        # cluster list ordering makes this the best choice.
        return candidate_list[-1][0]
    except Exception:
        # Any failure (unknown word, empty candidate list) => no suggestion.
        return 'No'
Beispiel #37
0
def featurize(df):
    """Append fuzzy-matching and phonetic feature columns to a pair DataFrame.

    The first two columns are treated as the strings to compare ('a', 'b');
    an optional third column is kept as 'target'. Returns the same frame
    with the similarity features added.
    """
    # Normalise column names so the feature lambdas below can rely on them.
    if len(df.columns)==3:
        df.columns=['a', 'b', 'target']
    elif len(df.columns)==2:
        df.columns=['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b' })
        
    # ASCII-fold, lower-case, and strip non-letters before comparing.
    df['TM_A'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['a']).lower()), axis=1)
    df['TM_B'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['b']).lower()), axis=1)

    # Fuzzy string-similarity ratios (0-100).
    df['partial'] = df.apply(lambda row: fuzz.partial_ratio(row.TM_A,row.TM_B), axis=1)
    df['tkn_sort'] = df.apply(lambda row: fuzz.token_sort_ratio(row.TM_A,row.TM_B), axis=1)
    df['tkn_set'] = df.apply(lambda row: fuzz.token_set_ratio(row.TM_A,row.TM_B), axis=1)
    
    df['sum_ipa'] = df.apply(lambda row: sum_ipa(row.TM_A,row.TM_B), axis=1)
    
    # Jellyfish levenshtein
    df['levenshtein']= df.apply(lambda row: jellyfish.levenshtein_distance(row.TM_A,row.TM_B), axis=1)
    # Scale Levenshtein column
    scaler = MinMaxScaler()
    df['levenshtein'] = scaler.fit_transform(df['levenshtein'].values.reshape(-1,1))

    # Jellyfish phoneme: binary "encodings match" indicators.
    df['metaphone'] = df.apply(
        lambda row: 1 if jellyfish.metaphone(row.TM_A)==jellyfish.metaphone(row.TM_B) else 0, axis=1)
    df['nysiis'] = df.apply(
        lambda row: 1 if jellyfish.nysiis(row.TM_A)==jellyfish.nysiis(row.TM_B) else 0, axis=1)
    df['mtch_rtng_cdx'] = df.apply(
        lambda row: 1 if jellyfish.match_rating_codex(row.TM_A)==jellyfish.match_rating_codex(row.TM_B) else 0, axis=1)
    
    df['pshp_soundex_first'] = df.apply(
        lambda row: 1 if pshp_soundex_first.encode(row.TM_A)==pshp_soundex_first.encode(row.TM_B) else 0, axis=1)
    
    # One column per extra similarity algorithm configured at module level.
    for i, algo in enumerate(algos):
            df[algo_names[i]] = df.apply(lambda row: algo.sim(row.TM_A, row.TM_B), axis=1)
    
    return df
Beispiel #38
0
    def test_metaphone(self):
        """Spot-check jellyfish.metaphone against known encodings, including
        multi-word input and accent folding for non-ASCII characters."""
        cases = [("metaphone", 'MTFN'),
                 ("wHErE", "WR"),
                 ("shell", "XL"),
                 ("this is a difficult string", "0S IS A TFKLT STRNK"),
                 ("aeromancy", "ERMNS"),
                 ("Antidisestablishmentarianism", "ANTTSSTBLXMNTRNSM"),
                 ("sunlight labs", "SNLT LBS"),
                 ("sonlite laabz", "SNLT LBS"),
                 (u"Çáŕẗéř", "KRTR"),
                 ]

        for (s1, code) in cases:
            self.assertEqual(jellyfish.metaphone(s1), code)
Beispiel #39
0
def lookup(val, field=None, use_cache=True):
    """ Semi-fuzzy state lookup. This method will make a best effort
        attempt at finding the state based on the lookup value provided.

          * two digits will search for FIPS code
          * two letters will search for state abbreviation
          * anything else will try to match the metaphone of state names

        Metaphone is used to allow for incorrect, but phonetically accurate,
        spelling of state names.

        Exact matches can be done on any attribute on State objects by passing
        the `field` argument. This skips the fuzzy-ish matching and does an
        exact, case-sensitive comparison against the specified field.

        This method caches non-None results, but the cache can be bypassed
        with the `use_cache=False` argument.
    """

    import jellyfish

    # Infer which attribute to compare against from the shape of `val`.
    if field is None:
        if FIPS_RE.match(val):
            field = 'fips'
        elif ABBR_RE.match(val):
            val = val.upper()
            field = 'abbr'
        else:
            # Fall back to phonetic matching on the state name.
            val = jellyfish.metaphone(val)
            field = 'name_metaphone'

    # see if result is in cache
    cache_key = "%s:%s" % (field, val)
    if use_cache and cache_key in _lookup_cache:
        return _lookup_cache[cache_key]

    for state in STATES_AND_TERRITORIES:
        if val == getattr(state, field):
            _lookup_cache[cache_key] = state
            return state
    # Implicitly returns None when nothing matches (and caches nothing).
#     Damerau-Levenshtein Distance
#     Jaro Distance
#     Jaro-Winkler Distance
#     Match Rating Approach Comparison
#     Hamming Distance

# Phonetic encoding:
#     American Soundex
#     Metaphone
#     NYSIIS (New York State Identification and Intelligence System)
#     Match Rating Codex
import jellyfish
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))  # 2; edit distance
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))  # 0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))  # 1; edit distance that also counts transpositions
print(jellyfish.metaphone('Jellyfish'))  # 'JLFX'
print(jellyfish.soundex('Jellyfish'))  # 'J412'
print(jellyfish.nysiis('Jellyfish'))  # 'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))  # 'JLLFSH'

##################################################################
## Levenshtein
import Levenshtein
print(Levenshtein.hamming('hello', 'helol'))  # 2; Hamming distance: str1 and str2 must be the same length; counts the positions at which the two equal-length strings differ
print(Levenshtein.distance('hello', 'helol'))  # 2; edit (Levenshtein) distance: the minimum number of insert/delete/substitute operations needed to turn one string into the other
print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf'))  # 5
print(Levenshtein.ratio('hello', 'helol'))  # 0.8; Levenshtein ratio: r = (sum - ldist) / sum, where sum is the combined length of str1 and str2 and ldist is a modified edit distance
# note: this modified edit distance is not the plain edit distance above: there each of the three operations costs +1, here delete and insert still cost +1 but substitution costs +2
# rationale: for ratio('a', 'c'), sum = 2; with +1 substitution the result would be (2-1)/2 = 0.5 even though 'a' and 'c' share nothing -- charging +2 for substitution fixes this
print(Levenshtein.jaro('hello', 'helol'))  # 0.9333333333333332; Jaro distance, used e.g. in census record linkage
print(Levenshtein.jaro_winkler('hello', 'helol'))  # 0.9533333333333333; Jaro-Winkler distance
# NOTE(review): Python 2 demo -- uses the print statement; convert to
# print() for Python 3. Expected output is shown in the comment below
# each call.
import jellyfish
# String-distance measures:
print jellyfish.levenshtein_distance('jellyfish', 'smellyfish')
#2
print jellyfish.jaro_distance('jellyfish', 'smellyfish')
#0.89629629629629637
print jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')
#1

# Phonetic encodings:
print jellyfish.metaphone('Jellyfish')
#'JLFX'
print jellyfish.soundex('Jellyfish')
#'J412'
print jellyfish.nysiis('Jellyfish')
#'JALYF'
print jellyfish.match_rating_codex('Jellyfish')
#'JLLFSH'
# NOTE(review): duplicate of the Python 2 demo above -- print statements,
# expected output in the comment below each call.
import jellyfish
# String-distance measures:
print jellyfish.levenshtein_distance('jellyfish', 'smellyfish')
#2
print jellyfish.jaro_distance('jellyfish', 'smellyfish')
#0.89629629629629637
print jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')
#1

# Phonetic encodings:
print jellyfish.metaphone('Jellyfish')
#'JLFX'
print jellyfish.soundex('Jellyfish')
#'J412'
print jellyfish.nysiis('Jellyfish')
#'JALYF'
print jellyfish.match_rating_codex('Jellyfish')
def extract_feature(name, standard):
    """ (string, string) --> [boolean, boolean, boolean, int, int, int, boolean, boolean, boolean, int]
    extracts various features for each record (name, standard) and exports results in form of a list of booleans and integers.

    >>> extract_feature('ARINCK', 'AAFTINK')
     [0,0,0,1,1,1,?, ?, ?, 1]

    """
    # Empty/None inputs yield no features.
    if not name or not standard:
        return []

    f_list = []  # features list

    # f1: Boolean feature -- first 2 letters of name and standard are equal
    f_list.append(name[:2] == standard[:2])
    # f2: Boolean feature -- last 2 letters of name and standard are equal
    f_list.append(name[-2:] == standard[-2:])

    # f3: Boolean feature -- sizes of name and standard are equal
    f_list.append(len(name) == len(standard))

    # f4: Number feature -- absolute difference of name size and standard size
    f_list.append(abs(len(name) - len(standard)))

    # f5: Number feature -- number of longest first equal chars.
    # (range replaces the Python-2-only xrange; iteration is identical.)
    for i in range(1, len(name) + 1):
        if not name[:i] == standard[:i]:
            break
    # NOTE(review): when `name` is entirely a prefix of `standard` the loop
    # finishes without breaking and this undercounts the common prefix by
    # one; kept as-is so the feature values stay identical to the original.
    f_list.append(i - 1)

    # f6: Number feature -- number of longest last equal chars
    for i in range(len(name)):
        if not name[-i - 1:] == standard[-i - 1:]:
            break
    # NOTE(review): same off-by-one as f5 when every suffix of `name` matches.
    f_list.append(i)

    # f7: Boolean feature -- Soundex codes of name and standard are equal
    import jellyfish
    f_list.append(jellyfish.soundex(name) == jellyfish.soundex(standard))

    # f8: Boolean feature -- Metaphone codes of name and standard are equal
    f_list.append(jellyfish.metaphone(name) == jellyfish.metaphone(standard))

    # f9: Boolean feature -- the two names share any non-empty
    # double-metaphone code (project-local doublemetaphone implementation)
    from preModules import metaphone
    dm_flag = False  # True once any common double-metaphone code is found
    for dm1 in metaphone.doublemetaphone(name):
        for dm2 in metaphone.doublemetaphone(standard):
            if dm1 and dm2 and dm1 == dm2:
                dm_flag = True
                break

    f_list.append(dm_flag)

    # f10: Number feature -- length of the longest common substring of
    # name and standard (project-local helper)
    from modules.basic_modules.basic import longest_common_substring
    f_list.append(len(longest_common_substring(name, standard)))

    return f_list
def metaph(list_in_df):
    """Return the Metaphone code for every entry of *list_in_df*.

    Each element is coerced with unicode() (Python 2) before encoding,
    as in the original; the manual append loop is replaced by an
    equivalent list comprehension.
    """
    return [jellyfish.metaphone(unicode(tx)) for tx in list_in_df]
from collections import Counter
import pickle  # fix: pickle.load is used below but was never imported

import jellyfish


# Input data file
input_datafile = '/Users/MariaAthena/Dropbox/00 Imperial College/1601 Workforce Analytics/Assignments/BS1810_IndividualPart1_EngesaethMaria/Data/D3_patent_data.csv'

# Load data file containing required data to create ethnicity dictionary
ethnicfile = open('/Users/MariaAthena/Dropbox/00 Imperial College/1601 Workforce Analytics/Assignments/BS1810_IndividualPart1_EngesaethMaria/Data/D4name_ethnicity.pkl', 'rb')
# Create ethnicity_dict: {'names': 'ethnicity of name'}
ethnicity_dict = pickle.load(ethnicfile)
ethnicfile.close()

# Re-key ethnicity_dict by the Metaphone encoding of each name.
# list(...) snapshots the keys so the dict can be mutated inside the loop
# (required on Python 3, harmless on Python 2; unicode() below is Python 2).
# NOTE(review): names that share a Metaphone code collapse into a single
# entry -- the last one processed wins.
for key in list(ethnicity_dict.keys()):
    phonetic_key = jellyfish.metaphone(unicode(key))
    # replace the original key with its phonetic equivalent
    ethnicity_dict[phonetic_key] = ethnicity_dict.pop(key)


# Helper functions

# Calculating the Herfindahl index
def herfindahl(input_list):
    cntry_cnt = Counter(input_list)
    vals = cntry_cnt.values()
    prob = 0
    for val in vals:
        prob = prob + (val / float(sum(vals))) ** 2
    return prob
import jellyfish

# Check whether two words are homophones by comparing their Metaphone and
# Soundex encodings (a rough heuristic, not very accurate).
x,y = map(str,input("Enter two words : ").split())
if(jellyfish.metaphone(x) == jellyfish.metaphone(y) or jellyfish.soundex(x) == jellyfish.soundex(y)):
    print("Homophones !")
else:
    print("Not Homophones !")
'''
#check difference between two words
#returns number of changes
print(jellyfish.levenshtein_distance(x,y))
'''
Beispiel #46
0
# %% Define helper functions
# Herfindal Index
def herf(input_list):
    """Herfindahl index: sum of the squared shares of each distinct value."""
    from collections import Counter
    tally = Counter(input_list)
    total = sum(tally.values())
    # Each count's share of the total, squared and summed.
    return sum((c / float(total)) ** 2 for c in tally.values())

# %% Step 1 - Read in the data
d4 = pd.read_csv("../../data/D4_ethnic_surnames.csv")

# %% Step 2 - Convert names to metaphone representations
# (unicode() is Python 2; melt turns each column into ethnicity/name rows --
#  presumably one column per ethnicity in d4, TODO confirm)
d4_long = pd.melt(d4)
d4_long.columns = ['ethnicity', 'name']
d4_long['meta'] = [jellyfish.metaphone(unicode(name)) for name in d4_long.name]
# d4_long = d4_long.drop_duplicates()

# %% Step 3 - Get patents data
d3 = pd.read_csv("../../data/D3_patent_data.csv")

# %% Step 4 - Patent lastname ethnicities
## 4.1 Reshape patents data
# inventor names: `lastname` holds a ';'-separated list per patent; split it
# into one column per inventor, kept alongside the patent number.
d3_inv_names = pd.concat([d3.pnum, 
                             d3.lastname.apply(lambda y: pd.Series(y.split(';')))], 
                             #d3.cntries.apply(lambda y: pd.Series(y.split(';')))],
                             axis = 1)
                             
d3_inv_names_melt = pd.melt(d3_inv_names, 
                             id_vars = 'pnum', 
def phonetic_match(s1, s2):
    """Return True when *s1* and *s2* share the same Metaphone encoding."""
    code_a = jellyfish.metaphone(s1)
    code_b = jellyfish.metaphone(s2)
    return code_a == code_b
Beispiel #48
0
def add_blocking_code(blocking_type=2):
    '''(int) -> ()
    adds new potential matches according to the blocking technique to the carr_match table.

    blocking_type 1: metaphone of last name; UPDATE statements are
                     accumulated and flushed in batches of 10000 rows.
    blocking_type 2: metaphone of "first_name last_name"; one UPDATE per
                     row, printing progress timing every 10000 rows
                     (Python 2 print statement).

    NOTE(review): the SQL is built by string concatenation; a metaphone
    value containing a double quote would break the statement. Use
    parameterized queries if run_query's driver supports them.
    run_query appears to be a project helper returning a cursor-like
    object (fetchall/close) -- confirm.
    '''
    import time

    if blocking_type == 1:
        import jellyfish

        count = 0
        query = 'Select id, last_name from all_persons'
        cur1 = run_query(query)  # fetch id and last_name for every person

        ref_list = []
        for row in cur1.fetchall():
            ref_list.append([row[0], row[1]])
        cur1.close()
        query = ''
        for ref in ref_list:
            count += 1
            # Flush the accumulated multi-statement UPDATE every 10000 rows.
            if count % 10000 == 0:
                if query:
                    # print query 
                    cur = run_query(query)
                    cur.fetchall()
                    cur.close()
                    query = ''
            metaphone = jellyfish.metaphone(ref[1])
            query += 'update all_persons set metaphone = "' + metaphone + '" where id =' + str(ref[0]) + ';'
        # Flush whatever remains after the loop.
        if query:
            cur = run_query(query)
            cur.fetchall()
            cur.close()
            query = ''

    if blocking_type == 2:
        start = time.time()
        import jellyfish

        count = 0

        # importing all names and their ids
        query = 'Select id, concat(first_name, " ", last_name) from all_persons'
        cur1 = run_query(query)  # fetch id and full name for every person
        ref_list = []
        for row in cur1.fetchall():
            ref_list.append([row[0], row[1]])
        cur1.close()

        # NOTE(review): the slice skips the first 4,300,000 rows --
        # presumably resuming a previous partial run; confirm before reuse.
        for ref in ref_list[4300000:]:
            count += 1
            if count % 10000 == 0:
                end = time.time()
                elapsed = end - start
                # Python 2 print statement: progress counter and elapsed time.
                print count, elapsed
                # start = time.time()

            metaphone = jellyfish.metaphone(ref[1])
            query = 'update all_persons set metaphone = "' + metaphone + '" where id =' + str(ref[0]) + ';'
            cur = run_query(query)
            cur.fetchall()
            cur.close()
 def transform(self, data):
     # Return the Metaphone code of *data* when it is a string; any other
     # input falls through and implicitly returns None.
     # Python 2 only: relies on basestring and unicode().
     if isinstance(data, basestring):
         return metaphone(unicode(data))
def loadAuthors(authorfile, printaffilwordfreq=False):
	"""Parse the author CSV into a dict {author_id: feature-dict}.

	For each row, builds parsed-name fields (via nameparser.HumanName),
	blocking keys (iFfL, fFfL, fFiL, metaphone of the full name), IDF
	weights for last names and iFfL keys, and TF-IDF vectors for
	affiliations and full names. With printaffilwordfreq=True it only
	computes/prints word frequencies and returns early (None).
	Python 2 only (reader.next(), unicode, iteritems, print statement).
	"""
	reader = csv.reader(open(authorfile, 'rb'))
	reader.next()
	authors = []
	lastname_cnt = defaultdict(int)
	iFfL_cnt = defaultdict(int)
	affiliations = []
	fullnames = []
 	print_err("Parsing names and counts")
	#[^~:_`@\?\\|\'/\"\.\-0-9a-z;,\n\r \+\-\)\}&%\$\*\{\>=\^]
	# Tune nameparser's constant sets; dropped tokens are more common as
	# real name parts than as titles/prefixes in this data.
	titles_c = nameparser.constants.TITLES - set(['wing', 'lord', 'mg', 'mate', 'king', 'sharif', 'sheikh', 'rt', 'lama', 'gen', 'bg', 'baba', 'ab'])
	suffixes_c = nameparser.constants.SUFFIXES | set(['junior', 'senior', 'vii']) 
	prefixes_c = nameparser.constants.PREFIXES - set(['bin']) # more common as first name

	# Maps author id -> index into affiliations / fullnames lists.
	id2affiliation = {}
	id2fullname = {}
	for i, line in verbose_iter(reader):
		line[1:] = [unidecode(unicode(cell, 'utf-8')) for cell in line[1:]]

  		if line[2]:
			id2affiliation[int(line[0])] = len(affiliations)
			line[2] = strip_punc(line[2].lower())
			affiliations.append(line[2])

		fullnm = strip_punc(line[1].lower().encode('ascii'))
		if fullnm:
			id2fullname[int(line[0])] = len(fullnames)
			fullnames.append(fullnm)
		if printaffilwordfreq:
			continue
		
		hn = HumanName(line[1].replace('-', ' '), titles_c=titles_c, prefixes_c=prefixes_c, suffixes_c=suffixes_c)
		ai = {
 			'fullname_joined': hn.full_name,
 			'name_title': hn.title,
 			'name_first': hn.first,
 			'name_middle': hn.middle,
 			'name_last': hn.last,
 			'name_suffix': hn.suffix
		}
		# Normalize every parsed field to lowercase ascii without punctuation.
		ai = {k: strip_punc(v.lower().encode('ascii'), space_dashes=False) for k, v in ai.iteritems()}
		ai['name'] = hn.full_name.lower().strip().encode('ascii').translate(None, ';')
		ai['fullname'] = strip_punc(hn.full_name.lower().encode('ascii'))
		ai['fullname_parsed'] = ai['name_first'] + ai['name_middle'] + ai['name_last'] + ai['name_suffix']
		ai['affiliation'] = line[2]
		ai['metaphone_fullname'] = jellyfish.metaphone(ai['fullname']).encode('ascii').translate(None, ' ')
		# Blocking key iFfL = first initial + last name, with typed fallbacks
		# ('L:', 'F:', 'ID:') when parts are missing.
		if ai['name_last']:
			if ai['name_first']:
				ai['iFfL'] = ai['name_first'][0] + ai['name_last']
			else:
				ai['iFfL'] = 'L:' + ai['name_last']
		elif ai['name_first']:
			ai['iFfL'] = 'F:' + ai['name_first'] # use full first name if no last name
		else:
			ai['iFfL'] = 'ID:' + line[0]
		if ai['name_last'] and ai['name_first']:
			ai['fFfL'] = ai['name_first'] + ai['name_last']
			ai['fFiL'] = ai['name_first'] + ai['name_last'][0]
		else:
			ai['fFfL'] = ai['iFfL']
			ai['fFiL'] = ai['iFfL']

		# Fall back to the author id so these keys are never empty strings.
		if not ai['fullname_joined']:
			ai['fullname_joined'] = 'ID:' + line[0]
		if not ai['fullname']:
			ai['fullname'] = 'ID:' + line[0]
		if not ai['fullname_parsed']:
			ai['fullname_parsed'] = ai['fullname']

		authors.append((int(line[0]), ai))
 		lastname_cnt[ai['name_last']] += 1
  		iFfL_cnt[ai['iFfL']] += 1

	print_err("Computing TF-IDF of affiliations")

	# min_df = 2 because though we deduct non common words, they should be significant first
	affil_tfidf = computeTFIDFs(affiliations, 'all', min_df=2, words_freq=printaffilwordfreq)
	if printaffilwordfreq:
		print "-----"
	name_tfidf = computeTFIDFs(fullnames, None, min_df=2, ngram_range=(1,3), words_freq=printaffilwordfreq, token_pattern=u'(?u)\\b[a-zA-Z][a-zA-Z]+\\b')
	if printaffilwordfreq:
		return

	print_err("Calculating IDFs")
	# IDF = log(N / count) per iFfL key and per last name.
	iFfL_IDF = dict(zip(iFfL_cnt.keys(), np.log(float(len(authors)) / np.array(iFfL_cnt.values()))))
	lastname_IDF = dict(zip(lastname_cnt.keys(), np.log(float(len(authors)) / np.array(lastname_cnt.values()))))

	print_err("Packing it into a list")
 	for i, a in enumerate(authors):
 		authors[i][1]['iFfL_idf'] = iFfL_IDF[a[1]['iFfL']]
 		authors[i][1]['lastname_idf'] = lastname_IDF[a[1]['name_last']]
 		if len(a[1]['affiliation']) == 0:
 			authors[i][1]['affil_tfidf'] = None
 		else:
			authors[i][1]['affil_tfidf'] = affil_tfidf[id2affiliation[a[0]]]
		if a[0] in id2fullname:
 			authors[i][1]['fullname_tfidf'] = name_tfidf[id2fullname[a[0]]]
 		else:
			authors[i][1]['fullname_tfidf'] = None
			
		if (i+1) % 10000 == 0:
			print_err(i+1)
 	authors_dict = dict(authors)
 	return authors_dict