def find_match_levenshtein_metaphone(self, token, canonical):
    """Find dictionary words close to `token` and phonetically identical to it.

    Collects dictionary entries within Levenshtein distance 2 of `token`,
    keeps those whose Metaphone code equals `token`'s, and reports whether
    `canonical` is among them.

    NOTE(review): `token` and the entries of `self.dicts` appear to be byte
    strings (each side is `.decode("utf-8")`-ed before use) — confirm this
    runs under Python 2.

    :returns: (phonetic_matches, is_match)
    """
    best = 2
    close_words = []
    for entry in self.dicts:
        dist = jellyfish.levenshtein_distance(
            token, entry.decode("utf-8").lower())
        # Words accepted under an earlier, looser threshold stay in the
        # list even after `best` tightens (preserves original behaviour).
        if dist <= best:
            best = dist
            close_words.append(entry.lower())
    token_code = jellyfish.metaphone(token.decode("utf-8"))
    phonetic_matches = [
        word for word in close_words
        if jellyfish.metaphone(word.decode("utf-8")) == token_code
    ]
    return phonetic_matches, canonical in phonetic_matches
def get_event_code(self, key, language, dimension="2D", event_type="MT"):
    """Returns EventCode

    :param str key: Movie name
    :param str language: Movie language
    :param dimension: Movie dimension, can be 2D, 2D 4DX, 3D, 3D 4DX, or IMAX 3D
    :type dimension: str
    :param event_type: Event types( MT(Movies), CT(Events), PL(Plays), SP(Sports))
    :type event_type: str
    :return: Event Code
    :rtype: str
    :raises BMSError: If the event code is not found
    """
    listing = self.quickbook(event_type)
    shows = listing['moviesData']['BookMyShow']['arrEvents']
    # Titles are compared by whitespace-stripped Metaphone code, so minor
    # spelling differences in `key` still match.
    key = metaphone(key).replace(' ', '')
    for show in shows:
        if key != metaphone(show['EventTitle']).replace(' ', ''):
            continue
        for child in show['ChildEvents']:
            if (language == child['EventLanguage']
                    and dimension == child['EventDimension']):
                return child['EventCode']
    raise BMSError(
        "Event code not found! Please check the Movie name and other options"
    )
def get_similarity_score(self, str_a, str_b):
    """Phonetic similarity of two name entities, in the range [0, 1].

    Both inputs are transliterated to pinyin, Metaphone-encoded, and
    compared with a custom phone edit distance.

    :param str_a: (String) name entity
    :param str_b: (String) name entity
    :return: (float) similarity score in [0, 1]
    """
    # Note: the keyword is spelled `space_seperated` in get_pinyin's API.
    pin_a = self.get_pinyin("".join(str_a), space_seperated=False)
    pin_b = self.get_pinyin("".join(str_b), space_seperated=False)
    code_a = jellyfish.metaphone(pin_a)
    code_b = jellyfish.metaphone(pin_b)
    dist = self._phone_edit_distance(code_a, code_b)
    # Normalisation constant; zero only when both codes are empty.
    denom = max(len(code_a), len(code_b)) * 4 - abs(len(code_a) - len(code_b))
    if denom == 0:
        return 1 if str_a == str_b else 0
    return 1 - dist / denom
def token_similarity(a, b):
    """Score how well two tokens align, in [-1, 1].

    1.0  exact (case/whitespace-insensitive) match
    -1.0 forbidden pairing (word vs whitespace, word vs punctuation)
    0.9  both punctuation, or an approximate phonetic match
    else scaled Jaro-Winkler distance of the raw words
    """
    left, right = a.word, b.word
    # Case-insensitive match; any whitespace matches any whitespace.
    if left.lower().strip() == right.lower().strip():
        return 1.
    # Words may never map to whitespace (XOR of the two tests).
    if isspace(left) != isspace(right):
        return -1.
    # Punctuation pairs with punctuation only.
    if ispunc(left) and ispunc(right):
        return 0.9
    if ispunc(left) != ispunc(right):
        return -1.
    # Approximate phonetic match, tried in the original precedence order.
    if left.isalpha() and right.isalpha():
        for encode in (jf.metaphone, jf.soundex, jf.nysiis,
                       jf.match_rating_codex):
            if encode(left) == encode(right):
                return 0.9
    return jf.jaro_winkler(left, right)
def phonetic_similarity(ref, result):
    """Jaro-Winkler similarity between the Metaphone encodings of two strings.

    :param ref: reference string
    :param result: candidate string to compare against `ref`
    :return: float in [0, 1]
    """
    # Fix: the original assigned `targetTmp = result` / `refTmp = ref` and
    # immediately overwrote both — the dead stores are removed.
    ref_code = jellyfish.metaphone(ref)
    target_code = jellyfish.metaphone(result)
    return jellyfish.jaro_winkler(ref_code, target_code)
def similarity_factor(s1, s2):
    """
    Returns float number which corresponds to similarity
    order of two strings s1 and s2
    """
    # Character-level similarities, scaled to [0, 100].
    seq_ratio = difflib.SequenceMatcher(None, s1, s2).ratio() * 100
    unigram = ngram.NGram.compare(s1, s2, N=1) * 100
    partial = fuzz.partial_ratio(s1, s2)
    # Phonetic similarities via Jaccard distance over encodings.
    meta = (1 - distance.jaccard(jellyfish.metaphone(unicode(s1)).lower(),
                                 jellyfish.metaphone(unicode(s2)).lower())) * 100
    sound = (1 - distance.jaccard(jellyfish.soundex(unicode(s1)).lower(),
                                  jellyfish.soundex(unicode(s2)).lower())) * 100
    # Include the soundex score only when it exceeds the character-level mean.
    if mean([seq_ratio, unigram, partial]) < sound:
        return mean([seq_ratio, unigram, partial, sound, meta])
    return mean([seq_ratio, unigram, partial, meta])
def compare_for_seniority_finding(s1, s2):
    """
    Returns True when s2 is similar to s1 according to a combination of
    fuzzy and phonetic measures. s1 - main string, s2 - string from list
    for comparison.
    """
    partial = fuzz.partial_ratio(s1, s2)
    meta = (1 - distance.jaccard(jellyfish.metaphone(unicode(s1)).lower(),
                                 jellyfish.metaphone(unicode(s2)).lower())) * 100
    sound = (1 - distance.jaccard(jellyfish.soundex(unicode(s1)).lower(),
                                  jellyfish.soundex(unicode(s2)).lower())) * 100
    mrc = (1 - distance.jaccard(jellyfish.match_rating_codex(unicode(s1)).lower(),
                                jellyfish.match_rating_codex(unicode(s2)).lower())) * 100
    # All four thresholds must hold simultaneously.
    return partial >= 50 and sound > 70 and meta > 65 and mrc > 65
def phonetic_encoded_jaro_winkler_sim(ref, result):
    """Jaro-Winkler similarity of the Metaphone encodings of two strings.

    Emits a debug trace of both raw and encoded strings when the
    module-level `Debug` flag is set.
    """
    ref_code = jellyfish.metaphone(ref)
    target_code = jellyfish.metaphone(result)
    if Debug:
        print("Result: \t\t" + result)
        print("Converted result: \t" + target_code)
        print("Ref: \t\t\t" + ref)
        print("Converted ref: \t\t" + ref_code)
        print(
            "------------------------------------------------------------------------------"
        )
    return jellyfish.jaro_winkler(ref_code, target_code)
def correct(self, wrongWord):
    """Return candidate corrections for `wrongWord`, sorted by ScoreRcd order.

    Candidates are gathered from the trigram inverted index (within edit
    distance 2) and the Metaphone inverted index, deduplicated, then scored
    by corpus frequency plus closeness, with a bonus for an exact
    Metaphone match.
    """
    candidates = []
    candidateDistList = []
    # Gather candidates sharing at least one trigram with the misspelling.
    wWTGrams = self.getGrams(wrongWord, SpellChecker.invertMapGram)
    for trigram in wWTGrams:
        if trigram in SpellChecker.invertTriMap:
            addList = []
            tmpList = SpellChecker.invertTriMap[trigram]
            for tmp in tmpList:
                ed = self.compED(tmp, wrongWord)
                # Only keep near misses (edit distance <= 2).
                if ed <= 2:
                    addList.append(tmp)
            candidates = candidates + addList
    # Soundex-based candidate gathering was tried and disabled.
    #soundexHash = jellyfish.soundex(wrongWord)
    #if soundexHash in SpellChecker.invertSoundexMap:
    #    candidates = candidates + SpellChecker.invertSoundexMap[soundexHash]
    #candidates = list(set(candidates))
    # Add every dictionary word with the same Metaphone code, then dedupe.
    metaHash = jellyfish.metaphone(wrongWord)
    if metaHash in SpellChecker.invertMetaMap:
        candidates = candidates + SpellChecker.invertMetaMap[metaHash]
    candidates = list(set(candidates))
    #print (len(candidates))
    for candidate in candidates:
        # Skip candidates whose length differs too much, and the word itself.
        if abs(len(candidate) - len(wrongWord)) > 2:
            continue
        if wrongWord == candidate:
            continue
        ed = self.compED(candidate, wrongWord)
        # NOTE(review): `jd` and `gd` are computed but never used below.
        jd = jellyfish.jaro_distance(wrongWord, candidate)
        gd = self.getJackSim(
            self.getGrams(candidate, SpellChecker.jackardGram),
            self.getGrams(wrongWord, SpellChecker.jackardGram))
        # Score = relative corpus frequency + (max length - edit distance);
        # higher is better.
        score = float(SpellChecker.dictCountMap[candidate]) / float(
            SpellChecker.totalCount) + (
                max(len(candidate), len(wrongWord)) - ed)
        # Small bonus for an exact phonetic match.
        if jellyfish.metaphone(wrongWord) == jellyfish.metaphone(
                candidate):
            score = score + 0.1
        # Alternative phonetic bonuses, currently disabled:
        #if jellyfish.soundex(wrongWord) == jellyfish.soundex(candidate):
        #    score = score+0.1
        #if jellyfish.nysiis(wrongWord) == jellyfish.nysiis(candidate):
        #    score = score+0.1
        #if jellyfish.match_rating_codex(wrongWord) == jellyfish.match_rating_codex(candidate):
        #    score = score+0.1
        tmpCandidate = ScoreRcd(candidate, ed, score)
        candidateDistList.append(tmpCandidate)
    # Relies on ScoreRcd defining its own comparison/sort order.
    candidateDistList.sort()
    return candidateDistList
def phonetic_encoded_jaccard_sim(str1, str2):
    """Jaccard similarity of the Metaphone encodings of two strings.

    Prints a debug trace of raw and encoded inputs when the module-level
    `Debug` flag is set.
    """
    ref_code = jellyfish.metaphone(str1)
    target_code = jellyfish.metaphone(str2)
    if Debug:
        print("Result: \t\t" + str2)
        print("Converted result: \t" + target_code)
        print("Ref: \t\t\t" + str1)
        print("Converted ref: \t\t" + ref_code)
        print(
            "------------------------------------------------------------------------------"
        )
    return jaccard_sim(ref_code, target_code)
def phonetic_similarity(ref, result):
    """Jaro-Winkler similarity (rounded to 5 places) between the Metaphone
    encodings of `ref` and `result`.

    Prints a debug trace when the module-level `Debug` flag is set.
    """
    # Fix: removed the dead stores `targetTmp = result` / `refTmp = ref`
    # that were immediately overwritten in the original.
    ref_code = jellyfish.metaphone(ref)
    target_code = jellyfish.metaphone(result)
    if Debug:
        print("Result: \t\t" + result)
        print("Converted result: \t" + target_code)
        print("Ref: \t\t\t" + ref)
        print("Converted ref: \t\t" + ref_code)
        print(
            "------------------------------------------------------------------------------"
        )
    return round(jellyfish.jaro_winkler(ref_code, target_code), 5)
def similarity_factor(s1, s2):
    """
    Returns float number which corresponds to similarity
    order of two strings s1 and s2
    """
    char_scores = [
        difflib.SequenceMatcher(None, s1, s2).ratio() * 100,
        ngram.NGram.compare(s1, s2, N=1) * 100,
        fuzz.partial_ratio(s1, s2),
    ]
    meta_score = (1 - distance.jaccard(
        jellyfish.metaphone(unicode(s1)).lower(),
        jellyfish.metaphone(unicode(s2)).lower())) * 100
    sound_score = (1 - distance.jaccard(
        jellyfish.soundex(unicode(s1)).lower(),
        jellyfish.soundex(unicode(s2)).lower())) * 100
    # The soundex score only joins the average when it beats the
    # character-level mean.
    if mean(char_scores) < sound_score:
        return mean(char_scores + [sound_score, meta_score])
    return mean(char_scores + [meta_score])
def compare_for_seniority_finding(s1, s2):
    """
    Returns True when s2 is similar to s1 under all four measures
    (fuzzy partial ratio plus three phonetic Jaccard scores).
    s1 - main string, s2 - string from list for comparison.
    """
    u1, u2 = unicode(s1), unicode(s2)
    partial = fuzz.partial_ratio(s1, s2)
    meta = (1 - distance.jaccard(jellyfish.metaphone(u1).lower(),
                                 jellyfish.metaphone(u2).lower())) * 100
    sound = (1 - distance.jaccard(jellyfish.soundex(u1).lower(),
                                  jellyfish.soundex(u2).lower())) * 100
    mrc = (1 - distance.jaccard(jellyfish.match_rating_codex(u1).lower(),
                                jellyfish.match_rating_codex(u2).lower())) * 100
    return partial >= 50 and sound > 70 and meta > 65 and mrc > 65
def string_match(x, y):
    """True when two UTF-8 byte strings share the same Metaphone encoding.

    NOTE(review): inputs are decoded with `.decode('utf-8')`, which implies
    byte-string (Python 2 style) callers — confirm.

    :return: bool
    """
    # Fix: collapsed the `if ...: return True / return False` idiom into a
    # direct boolean return.
    return jf.metaphone(x.decode('utf-8')) == jf.metaphone(y.decode('utf-8'))
def __init__(self):
    """Load the frequency dictionary and build the class-level inverted
    indexes (trigram, Soundex, and Metaphone) used for candidate lookup.
    """
    # `readDitionary` spelling preserved: it is the existing API name.
    SpellChecker.dictCountMap = self.readDitionary(
        '../data/count_1w100k.txt')
    for key in SpellChecker.dictCountMap:
        SpellChecker.totalCount += SpellChecker.dictCountMap[key]
    for word in SpellChecker.dictCountMap:
        # Trigram inverted index: gram -> [words containing it].
        for tgram in self.getGrams(word, SpellChecker.invertMapGram):
            SpellChecker.invertTriMap.setdefault(tgram, []).append(word)
        # Phonetic inverted indexes: code -> [words sharing the code].
        SpellChecker.invertSoundexMap.setdefault(
            jellyfish.soundex(word), []).append(word)
        SpellChecker.invertMetaMap.setdefault(
            jellyfish.metaphone(word), []).append(word)
def simple_example():
    """Print a tour of jellyfish's string-comparison and phonetic APIs."""
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'
    comparisons = [
        ('levenshtein_distance', jellyfish.levenshtein_distance),
        ('damerau_levenshtein_distance', jellyfish.damerau_levenshtein_distance),
        ('hamming_distance', jellyfish.hamming_distance),
        ('jaro_distance', jellyfish.jaro_distance),
        ('jaro_similarity', jellyfish.jaro_similarity),
        ('jaro_winkler', jellyfish.jaro_winkler),
        ('jaro_winkler_similarity', jellyfish.jaro_winkler_similarity),
        ('match_rating_comparison', jellyfish.match_rating_comparison),
    ]
    for fname, func in comparisons:
        print("jellyfish.{}({}, {}) = {}.".format(
            fname, str1, str2, func(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'
    encoders = [
        ('metaphone', jellyfish.metaphone),
        ('soundex', jellyfish.soundex),
        ('nysiis', jellyfish.nysiis),
        ('match_rating_codex', jellyfish.match_rating_codex),
    ]
    for fname, func in encoders:
        print("jellyfish.{}({}) = {}.".format(fname, ss, func(ss)))
def make_shortcode(self, name):
    """Build an 8-character shortcode from a name.

    The prefix is the first 4 characters of the name's Metaphone code
    (falling back to the raw name when the code is too short); the rest
    is filled from an 8-digit zero-padded random number, with the whole
    thing truncated to 8 characters.
    """
    cleaned = name.upper().replace(" ", "")
    prefix = jellyfish.metaphone(cleaned)[:4]
    if len(prefix) < 4:
        prefix = cleaned[:4]
    suffix = str(random.randint(0, 99999999)).zfill(8)
    return (prefix + suffix)[:8]
def pickle_data():
    """Read the `states` table from data.db, enrich each row, and pickle
    the result to us/states.pkl."""
    conn = sqlite3.connect(os.path.abspath(os.path.join(PWD, 'data.db')))
    conn.row_factory = dict_factory
    cursor = conn.cursor()
    cursor.execute("""SELECT * FROM states ORDER BY name""")
    states = []
    for row in cursor:
        # Precompute the phonetic key used for fuzzy name lookup.
        row['name_metaphone'] = jellyfish.metaphone(row['name'])
        # SQLite stores booleans as 0/1; convert to real bools.
        for flag in ('is_territory', 'is_obsolete',
                     'is_contiguous', 'is_continental'):
            row[flag] = row[flag] == 1
        row['time_zones'] = row['time_zones'].split(',')
        states.append(row)
    pkl_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(states, pkl_file)
def fuzzy(string):
    """Return a JSON response with four phonetic encodings of `string`."""
    encoders = {
        "metaphone": jellyfish.metaphone,
        "soundex": jellyfish.soundex,
        "nysiis": jellyfish.nysiis,
        "match_rating_codex": jellyfish.match_rating_codex,
    }
    return jsonify({name: encode(string) for name, encode in encoders.items()})
def pickle_state_data():
    """Dump the enriched `states` table from data.db into us/states.pkl."""
    dbpath = os.path.abspath(os.path.join(PWD, 'data.db'))
    conn = sqlite3.connect(dbpath)
    conn.row_factory = dict_factory
    cur = conn.cursor()
    cur.execute("""SELECT * FROM states ORDER BY name""")
    states = []
    for record in cur:
        # Phonetic key used for fuzzy state-name lookup.
        record['name_metaphone'] = jellyfish.metaphone(record['name'])
        # Integer flags from SQLite become booleans.
        record['is_territory'] = record['is_territory'] == 1
        record['is_obsolete'] = record['is_obsolete'] == 1
        record['is_contiguous'] = record['is_contiguous'] == 1
        record['is_continental'] = record['is_continental'] == 1
        record['time_zones'] = record['time_zones'].split(',')
        states.append(record)
    pkl_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(states, pkl_file)
def metaphone(x):
    """Metaphone encoding of a UTF-8 byte string (Python 2 style input)."""
    return jf.metaphone(x.decode('utf-8'))
def pickle_data():
    """Read the `states` table from data.db and pickle it to us/states.pkl.

    The pickle is written with protocol 2 so the file remains loadable
    from Python 2 even when built under Python 3.
    """
    conn = sqlite3.connect(os.path.abspath(os.path.join(PWD, 'data.db')))
    conn.row_factory = dict_factory
    cursor = conn.cursor()
    cursor.execute("""SELECT * FROM states ORDER BY name""")
    states = []
    for row in cursor:
        row['name_metaphone'] = jellyfish.metaphone(row['name'])
        for flag in ('is_territory', 'is_obsolete',
                     'is_contiguous', 'is_continental'):
            row[flag] = row[flag] == 1
        row['time_zones'] = row['time_zones'].split(',')
        states.append(row)
    pkl_path = os.path.abspath(os.path.join(PWD, 'us', 'states.pkl'))
    with open(pkl_path, 'wb') as pkl_file:
        # protocol=2 keeps the .pkl compatible with Python 2 readers.
        pickle.dump(states, pkl_file, protocol=2)
def find_min_dist(lyrics): nonlocal min_dist nonlocal min_dist_idx nonlocal phrase nonlocal idx # Find best match phrase in lyrics min_dist_this_lyrics = 10000 min_dist_start_idx = 0 min_dist_end_idx = 0 lyrics_met = jellyfish.metaphone(lyrics).split(' ') for i in range(0, len(lyrics_met) - len(test_met)): this_lyrics_met = lyrics_met[i:i + len(test_met)] if this_lyrics_met[0] == test_met[0]: dist = jellyfish.levenshtein_distance(''.join(test_met), ''.join(this_lyrics_met)) if dist < min_dist_this_lyrics: min_dist_this_lyrics = dist min_dist_start_idx = i min_dist_end_idx = i + len(test_met) # Check against global min if min_dist_this_lyrics < min_dist: min_dist = min_dist_this_lyrics min_dist_idx = idx phrase = ' '.join(lyrics.split(' ')[min_dist_start_idx:min_dist_end_idx]) # Increment global idx idx += 1
def metaphone():
    """For each misspelled word, find the dictionary entries whose Metaphone
    codes are closest under a custom DP alignment, and write the best
    matches (one line per misspelling) to metaphone_result.txt.

    Relies on module-level `wiki_misspell` (iterable of misspellings) and
    `my_dict` (iterable of dictionary words).
    """
    fw6 = open('metaphone_result.txt', 'w')
    for line in wiki_misspell:
        string = line.strip()
        dis = 100000
        bests = ""
        string_s = jellyfish.metaphone(string)
        for entry in my_dict:
            # NOTE(review): the result of strip() is discarded here; the
            # un-stripped `entry` is encoded below. Stripping happens only
            # when the entry is recorded in `bests`.
            entry.strip()
            entry_s = jellyfish.metaphone(entry)
            len_entry = len(entry_s) + 1
            len_string = len(string_s) + 1
            # DP table over the two Metaphone codes. This is NOT a standard
            # edit distance: matching characters SUBTRACT 1 (rewarding long
            # matching runs and allowing negative scores).
            distance_m = [[0 for i in range(len_string)]
                          for i in range(len_entry)]
            for i in range(0, len_entry):
                distance_m[i][0] = 0
            for i in range(0, len_string):
                distance_m[0][i] = 0
            for i in range(1, len_entry):
                for j in range(1, len_string):
                    if entry_s[i - 1] == string_s[j - 1]:
                        distance_m[i][j] = min(
                            distance_m[i - 1][j - 1] - 1,
                            distance_m[i - 1][j] + 1,
                            distance_m[i][j - 1] + 1,
                        )
                    else:
                        distance_m[i][j] = min(
                            distance_m[i - 1][j - 1] + 1,
                            distance_m[i - 1][j] + 1,
                            distance_m[i][j - 1] + 1,
                        )
            tem_dis = distance_m[len_entry - 1][len_string - 1]
            if tem_dis < dis:
                # New best score: reset the result list to this entry.
                dis = tem_dis
                bests = " "
                bests = entry.strip()
            elif tem_dis == dis:
                # Tie: accumulate all equally-good entries.
                bests += ' ' + entry.strip()
        print(dis, string, bests)
        fw6.write(bests + '\n')
    fw6.close()
def _word_similarity_score(a, b):
    """Score how well two words align, in [0, 1].

    Ordered rules: exact match (1.0) > case/space-insensitive match (0.95)
    > whitespace/punctuation compatibility checks > ampersand exception
    (0.85) > no-alpha match (0.85) > phonetic matches (0.9) > scaled
    Jaro-Winkler fallback.
    """
    if a == b:
        return 1.
    # Case and whitespace insensitive comparison.
    if a.lower().strip() == b.lower().strip():
        return 0.95
    # Whitespace may only pair with whitespace (XOR of the two tests).
    if _isspace(a) != _isspace(b):
        return 0
    # Ampersand/"and" is an allowed punctuation exception.
    if _match_ampersand(a, b):
        return 0.85
    # Punctuation pairs with punctuation only.
    if _ispunc(a) and _ispunc(b):
        return 0.95
    if _ispunc(a) != _ispunc(b):
        return 0
    # Strip to alphabetic characters: phonetic functions can segfault on
    # empty strings, and all-non-alpha tokens (e.g. line numbers) should
    # match each other.
    a_alpha = u''.join([ch for ch in a if ch.isalpha()])
    b_alpha = u''.join([ch for ch in b if ch.isalpha()])
    if a_alpha == '' and b_alpha == '':
        return 0.85
    # Approximate phonetic matches, in the original precedence order.
    if jf.match_rating_comparison(a_alpha, b_alpha):
        return 0.9
    for encode in (jf.metaphone, jf.soundex, jf.nysiis):
        if encode(a_alpha) == encode(b_alpha):
            return 0.9
    # Use scaled Jaro-Winkler distance on the raw words.
    return jf.jaro_winkler(a, b)
def get_ethnicity_list(input_list):
    """Map each name in `input_list` to an ethnicity label.

    Looks up the Metaphone code of each name in the module-level
    `D4name_ethnicity_meta` dict; names with no entry map to 'other'.

    :param input_list: iterable of name strings
    :return: list of ethnicity labels, parallel to `input_list`
    """
    output_list = []
    for name in input_list:
        meta_key = jellyfish.metaphone(unicode(name))
        # Fix: `dict.has_key` is deprecated and removed in Python 3;
        # the `in` operator is equivalent on both 2 and 3.
        if meta_key in D4name_ethnicity_meta:
            output_list.append(D4name_ethnicity_meta[meta_key])
        else:
            output_list.append('other')
    return output_list
def phonetic(addressline):
    """Create a Metaphone representation of an address or partial address.

    Tokens that START with a digit (house numbers etc.) are kept verbatim;
    all other tokens are Metaphone-encoded. The pieces are concatenated
    with no separator.

    :param addressline: address string
    :return: concatenated phonetic string
    """
    # Fix: regex patterns are now raw strings — '\s' and '\d' are invalid
    # escape sequences in ordinary string literals (DeprecationWarning,
    # then SyntaxError in future Python versions).
    words = re.split(r'\s+', addressline)
    phonetics = []
    for word in words:
        if re.match(r'\d', word):
            phonetics.append(word)
        else:
            phonetics.append(jellyfish.metaphone(word))
    return ''.join(phonetics)
def process_stop_words(text):
    """Metaphone-encode the non-stop-words of `text`.

    Digit tokens are kept as-is (lowercased); stop words (per the
    module-level `stop_words_set`) are dropped; everything else is
    replaced by its Metaphone code.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered.isdigit():
            kept.append(lowered)
        elif token and lowered not in stop_words_set:
            kept.append(jf.metaphone(lowered))
    return ' '.join(kept)
def scoring(self, suggestion, phrase):
    """Score a spelling suggestion against the query phrase.

    Higher internal `total` means a better suggestion; the value is
    negated before returning (callers sort ascending). Distance buckets
    dominate, corpus frequency nudges, rare words are penalised, and an
    exact Metaphone match earns a bonus.
    """
    total = 0
    # Edit-distance buckets (the conditions are mutually exclusive).
    if suggestion.distance == 0:
        total += 2000
    if suggestion.suggest_rule == SuggestRule.PREFIX:
        total += 500
    if suggestion.distance == 1:
        total += 300
    if suggestion.distance == 2:
        total += 100
    if suggestion.distance > 2:
        total += (100 - suggestion.distance * 10)
    # Tiny frequency nudge; heavy penalty for rare words.
    total += suggestion.count / 100000000
    if suggestion.count < 100000:
        total -= 10000000 / suggestion.count
    # Bonus when the suggestion sounds identical to the phrase.
    if jellyfish.metaphone(suggestion.term) == jellyfish.metaphone(phrase):
        total += 50
    print(str(suggestion) + "score is : " + str(total))
    return total * -1
def __init__(self, plainEntity):
    """
    Instantiates a new encoded entity object.

    Requires 'plainEntity': plain text entity to encode. Byte strings are
    decoded from UTF-8 first (Python 2 `str`/`unicode` handling).
    """
    text = plainEntity
    if isinstance(text, str):
        text = unicode(text, 'utf-8')
    self.plain = text
    self.encoded = jellyfish.metaphone(text)
def __init__(self, **kwargs):
    """Copy all keyword arguments onto the instance, then best-effort
    derive `name_metaphone` from the `name` attribute via jellyfish.
    """
    for key, value in kwargs.items():
        self.__dict__[key] = value
    try:
        import jellyfish
        self.__dict__["name_metaphone"] = jellyfish.metaphone(
            self.__dict__["name"])
    except Exception:
        # Fix: narrowed the bare `except:` so KeyboardInterrupt/SystemExit
        # propagate. Still deliberately best-effort — jellyfish may be
        # missing (ImportError) or `name` absent (KeyError).
        pass
def process_stop_words(text):
    """Return `text` with stop words removed and remaining words replaced
    by Metaphone codes; purely-numeric tokens are kept (lowercased).
    """
    pieces = []
    for word in text.split():
        lower_word = word.lower()
        if lower_word.isdigit():
            pieces.append(lower_word)
            continue
        # Non-digit token: keep only if it is not a stop word.
        if word and lower_word not in stop_words_set:
            pieces.append(jf.metaphone(lower_word))
    return " ".join(pieces)
def measure_string_distance(s1, s2, method):
    '''
    Compare two strings with one of five methods, selected by code:

    1. Jaro-Winkler distance            -> float in [0, 1]
    2. Damerau-Levenshtein (normalised) -> float in [0, 1]
    3. Metaphone equality               -> 1 (match) or 0
    4. NYSIIS equality                  -> 1 (match) or 0
    5. match_rating_codex equality      -> 1 (match) or 0

    Either input being empty short-circuits to 0.
    '''
    result = 0
    if s1 == '' or s2 == '':
        return result
    if method == 1:
        result = jellyfish.jaro_winkler(s1, s2)
    elif method == 2:
        try:
            diff = jellyfish.damerau_levenshtein_distance(s1, s2)
            # NOTE(review): true division assumed (Python 3); under
            # Python 2 this would be integer division — confirm runtime.
            result = 1 - (diff / max(len(s1), len(s2)))
        except Exception:
            # Fix: narrowed the bare `except:`; still falls back to 0 when
            # jellyfish rejects the input.
            result = 0
    elif method == 3:
        result = 1 if jellyfish.metaphone(s1) == jellyfish.metaphone(s2) else 0
    elif method == 4:
        result = 1 if jellyfish.nysiis(s1) == jellyfish.nysiis(s2) else 0
    elif method == 5:
        result = 1 if jellyfish.match_rating_codex(
            s1) == jellyfish.match_rating_codex(s2) else 0
    return result
def process(self, term):
    """Return the best spelling candidate for `term`, or `term` itself.

    Candidates are scored by a weighted mix of phonetic (Metaphone) and
    literal Levenshtein distance; only candidates sharing `term`'s first
    letter and containing no apostrophe are eligible.
    """
    if term == '':
        return term
    candidates = self.generate_candidates(term)
    if candidates:
        # Lower score == better: 60% phonetic distance, 40% literal.
        scores = [
            (0.6 * levenshtein_distance(metaphone(c), metaphone(term))
             + 0.4 * levenshtein_distance(c, term), idx)
            for idx, c in enumerate(candidates)
        ]
        min_value = 1000
        min_idx = -1
        for score, idx in scores:
            if (candidates[idx].startswith(term[0])
                    and "'" not in candidates[idx]):
                if score < min_value:
                    min_value = score
                    min_idx = idx
        # BUG FIX: the original assigned `term = candidates[min_idx]`
        # unconditionally, so when no candidate qualified (min_idx == -1)
        # it silently returned the LAST candidate. Keep the input instead.
        if min_idx >= 0:
            term = candidates[min_idx]
    return term
def bestcandidate(wrd):
    """Return a replacement candidate for `wrd` from the Brown word
    clusters, or the string 'No' when nothing suitable is found.

    Cluster members must pass the spell checker (`chant.check`) and lie
    within edit distance 2 literally or distance 1 phonetically. The LAST
    accepted candidate is returned; any failure (missing cluster, empty
    candidate list, ...) yields 'No'.
    """
    accepted = []
    try:
        # Walk every cluster that contains the input word.
        for rec in bcluster._word[wrd]:
            cluster_id = rec['cluster']
            for member in bcluster._cluster[cluster_id]:
                candidate = member['word']
                lev = jellyfish.levenshtein_distance(wrd, candidate)
                code_in = jellyfish.metaphone(wrd)
                code_cand = jellyfish.metaphone(candidate)
                if chant.check(candidate):
                    # Filter to a tight literal or phonetic distance.
                    if lev <= 2 or jellyfish.levenshtein_distance(
                            code_in, code_cand) <= 1:
                        accepted.append((candidate, member['count']))
        # IndexError on an empty list is swallowed below -> 'No'.
        return accepted[-1][0]
    except Exception:
        return 'No'
def featurize(df):
    """Add string- and phonetic-similarity feature columns to `df`.

    Expects the first two columns to be the string pair; a third column,
    if present, is treated as the label. Adds fuzzy-ratio, IPA, scaled
    Levenshtein, four binary phonetic-equality features, and one column
    per algorithm in the module-level `algos`/`algo_names` lists.
    """
    # Normalise column names: 3 cols -> a/b/target, 2 -> a/b, otherwise
    # rename the first two columns in place.
    if len(df.columns) == 3:
        df.columns = ['a', 'b', 'target']
    elif len(df.columns) == 2:
        df.columns = ['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b'})
    # ASCII-fold and strip everything but letters, lowercased.
    df['TM_A'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['a']).lower()), axis=1)
    df['TM_B'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['b']).lower()), axis=1)
    # FuzzyWuzzy ratio features.
    df['partial'] = df.apply(lambda row: fuzz.partial_ratio(row.TM_A, row.TM_B), axis=1)
    df['tkn_sort'] = df.apply(lambda row: fuzz.token_sort_ratio(row.TM_A, row.TM_B), axis=1)
    df['tkn_set'] = df.apply(lambda row: fuzz.token_set_ratio(row.TM_A, row.TM_B), axis=1)
    df['sum_ipa'] = df.apply(lambda row: sum_ipa(row.TM_A, row.TM_B), axis=1)
    # Jellyfish levenshtein
    df['levenshtein'] = df.apply(lambda row: jellyfish.levenshtein_distance(row.TM_A, row.TM_B), axis=1)
    # Scale Levenshtein column to [0, 1] across this dataframe.
    scaler = MinMaxScaler()
    df['levenshtein'] = scaler.fit_transform(df['levenshtein'].values.reshape(-1, 1))
    # Jellyfish phoneme equality features (1 = same encoding, 0 = different).
    df['metaphone'] = df.apply(
        lambda row: 1 if jellyfish.metaphone(row.TM_A) == jellyfish.metaphone(row.TM_B) else 0, axis=1)
    df['nysiis'] = df.apply(
        lambda row: 1 if jellyfish.nysiis(row.TM_A) == jellyfish.nysiis(row.TM_B) else 0, axis=1)
    df['mtch_rtng_cdx'] = df.apply(
        lambda row: 1 if jellyfish.match_rating_codex(row.TM_A) == jellyfish.match_rating_codex(row.TM_B) else 0, axis=1)
    df['pshp_soundex_first'] = df.apply(
        lambda row: 1 if pshp_soundex_first.encode(row.TM_A) == pshp_soundex_first.encode(row.TM_B) else 0, axis=1)
    # One similarity column per configured algorithm object.
    for i, algo in enumerate(algos):
        df[algo_names[i]] = df.apply(lambda row: algo.sim(row.TM_A, row.TM_B), axis=1)
    return df
def test_metaphone(self):
    """Known Metaphone encodings, including an accented-input case."""
    samples = [
        ("metaphone", 'MTFN'),
        ("wHErE", "WR"),
        ("shell", "XL"),
        ("this is a difficult string", "0S IS A TFKLT STRNK"),
        ("aeromancy", "ERMNS"),
        ("Antidisestablishmentarianism", "ANTTSSTBLXMNTRNSM"),
        ("sunlight labs", "SNLT LBS"),
        ("sonlite laabz", "SNLT LBS"),
        (u"Çáŕẗéř", "KRTR"),
    ]
    for word, expected in samples:
        self.assertEqual(jellyfish.metaphone(word), expected)
def lookup(val, field=None, use_cache=True):
    """
    Semi-fuzzy state lookup.

    With no `field`, the lookup key is inferred: two digits match a FIPS
    code, two letters a state abbreviation, and anything else is matched
    against the Metaphone of state names (tolerating misspelled but
    phonetically accurate names). Passing `field` instead performs an
    exact, case-sensitive comparison against that attribute.

    Non-None results are cached; pass `use_cache=False` to bypass the
    cache. Returns None when nothing matches.
    """
    import jellyfish

    if field is None:
        # Infer the attribute to match from the shape of `val`.
        if FIPS_RE.match(val):
            field = 'fips'
        elif ABBR_RE.match(val):
            val = val.upper()
            field = 'abbr'
        else:
            val = jellyfish.metaphone(val)
            field = 'name_metaphone'

    cache_key = "%s:%s" % (field, val)
    if use_cache and cache_key in _lookup_cache:
        return _lookup_cache[cache_key]

    hit = next((state for state in STATES_AND_TERRITORIES
                if getattr(state, field) == val), None)
    if hit is not None:
        # Only successful lookups are cached (misses stay uncached).
        _lookup_cache[cache_key] = hit
        return hit
# Damerau-Levenshtein Distance # Jaro Distance # Jaro-Winkler Distance # Match Rating Approach Comparison # Hamming Distance # Phonetic encoding: # American Soundex # Metaphone # NYSIIS (New York State Identification and Intelligence System) # Match Rating Codex import jellyfish print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish')) # 2; 编辑距离 print(jellyfish.jaro_distance('jellyfish', 'smellyfish')) # 0.89629629629629637 print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')) # 1; 编辑距离, 带翻转的 print(jellyfish.metaphone('Jellyfish')) # 'JLFX' print(jellyfish.soundex('Jellyfish')) # 'J412' print(jellyfish.nysiis('Jellyfish')) # 'JALYF' print(jellyfish.match_rating_codex('Jellyfish')) # 'JLLFSH' ################################################################## ## Lenvenshtein import Levenshtein print(Levenshtein.hamming('hello', 'helol')) # 2; 计算汉明距离; 要求 str1 和 str2 必须长度一致; 是描述两个等长字串之间对应位置上不同字符的个数 print(Levenshtein.distance('hello', 'helol')) # 2; 计算编辑距离(也成 Levenshtein 距离); 是描述由一个字串转化成另一个字串最少的操作次数, 在其中的操作包括插入 & 删除 & 替换 print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf')) # 5 print(Levenshtein.ratio('hello', 'helol')) # 0.8; 计算莱文斯坦比; 计算公式 r = (sum - ldist) / sum, 其中 sum 是指 str1 和 str2 字串的长度总和, ldist 是类编辑距离 # 注意: 这里的类编辑距离不是 2 中所说的编辑距离, 2 中三种操作中每个操作+1, 而在此处, 删除、插入依然+1, 但是替换+2 # 这样设计的目的: ratio('a', 'c'), sum=2, 按 2 中计算为(2-1)/2 = 0.5,' a','c'没有重合, 显然不合算, 但是替换操作+2, 就可以解决这个问题 print(Levenshtein.jaro('hello', 'helol')) # 0.9333333333333332; 计算 jaro 距离; 用于健康普查 print(Levenshtein.jaro_winkler('hello', 'helol')) # 0.9533333333333333; 计算 Jaro – Winkler 距离
# Python 2 demo of the jellyfish API (print statements, not functions).
# NOTE(review): the whole section is duplicated verbatim below — the code
# runs (and prints) everything twice; left as-is to preserve behaviour.
import jellyfish
print jellyfish.levenshtein_distance('jellyfish', 'smellyfish')  #2
print jellyfish.jaro_distance('jellyfish', 'smellyfish')  #0.89629629629629637
print jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')  #1
print jellyfish.metaphone('Jellyfish')  #'JLFX'
print jellyfish.soundex('Jellyfish')  #'J412'
print jellyfish.nysiis('Jellyfish')  #'JALYF'
print jellyfish.match_rating_codex('Jellyfish')  #'JLLFSH'
# Duplicate of the block above (kept intentionally — see note at top).
import jellyfish
print jellyfish.levenshtein_distance('jellyfish', 'smellyfish')  #2
print jellyfish.jaro_distance('jellyfish', 'smellyfish')  #0.89629629629629637
print jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')  #1
print jellyfish.metaphone('Jellyfish')  #'JLFX'
print jellyfish.soundex('Jellyfish')  #'J412'
print jellyfish.nysiis('Jellyfish')  #'JALYF'
print jellyfish.match_rating_codex('Jellyfish')
def extract_feature(name, standard):
    """
    (string, string) --> [boolean, boolean, boolean, int, int, int,
                          boolean, boolean, boolean, int]

    Extracts ten features comparing a name to its standard form and
    returns them as a list (empty list when either input is empty).
    Python 2 code (`xrange`).

    >>> extract_feature('ARINCK', 'AAFTINK')
    [0,0,0,1,1,1,?, ?, ?, 1]
    """
    if not name or not standard:
        return []
    f_list = []  # features list
    # f1: Boolean -- first 2 letters equal
    f_list.append(name[:2] == standard[:2])
    # f2: Boolean -- last 2 letters equal
    f_list.append(name[-2:] == standard[-2:])
    # f3: Boolean -- lengths equal
    f_list.append(len(name) == len(standard))
    # f4: Int -- absolute difference of lengths
    f_list.append(abs(len(name) - len(standard)))
    # f5: Int -- length of the longest common prefix
    # (if the loop never breaks, i ends at len(name) and i-1 is the full
    # prefix length)
    for i in xrange(1, len(name) + 1):
        if not name[:i] == standard[:i]:
            break
    # print i, name, standard
    f_list.append(i - 1)
    # f6: Int -- length of the longest common suffix
    for i in range(len(name)):
        if not name[-i - 1:] == standard[-i - 1:]:
            break
    f_list.append(i)
    # f7: Boolean -- Soundex codes equal
    import jellyfish
    f_list.append(jellyfish.soundex(name) == jellyfish.soundex(standard))
    # f8: Boolean -- Metaphone codes equal
    f_list.append(jellyfish.metaphone(name) == jellyfish.metaphone(standard))
    # f9: Boolean -- any common double-metaphone code
    # NOTE(review): the `break` only exits the inner loop; the outer loop
    # keeps running after a match is found (harmless, just extra work).
    from preModules import metaphone
    dm_flag = False  # whether the two words share any double-metaphone code
    for dm1 in metaphone.doublemetaphone(name):
        for dm2 in metaphone.doublemetaphone(standard):
            if dm1 and dm2 and dm1 == dm2:
                dm_flag = True
                break
    f_list.append(dm_flag)
    # f10: Int -- length of the longest common substring
    from modules.basic_modules.basic import longest_common_substring
    f_list.append(len(longest_common_substring(name, standard)))
    return f_list
def metaph(list_in_df):
    """Return the Metaphone encoding of every entry in the given
    list/Series (Python 2: entries are coerced with `unicode`)."""
    return [jellyfish.metaphone(unicode(entry)) for entry in list_in_df]
from collections import Counter
import jellyfish

# Input data file
input_datafile = '/Users/MariaAthena/Dropbox/00 Imperial College/1601 Workforce Analytics/Assignments/BS1810_IndividualPart1_EngesaethMaria/Data/D3_patent_data.csv'

# Load data file containing required data to create ethnicity dictionary
ethnicfile = open('/Users/MariaAthena/Dropbox/00 Imperial College/1601 Workforce Analytics/Assignments/BS1810_IndividualPart1_EngesaethMaria/Data/D4name_ethnicity.pkl', 'rb')

# Create ethnicity_dict: {'names': 'ethnicity of name'}
ethnicity_dict = pickle.load(ethnicfile)
ethnicfile.close()

# Re-key the dictionary by the Metaphone of each name.
# NOTE(review): this mutates the dict while iterating its keys — safe only
# under Python 2, where .keys() returns a list snapshot; under Python 3 it
# raises RuntimeError. Also, newly inserted phonetic keys could be revisited
# if iteration order reached them — confirm intended runtime is Python 2.
for key in ethnicity_dict.keys():
    phonetic_key = jellyfish.metaphone(unicode(key))
    # replaces phonetic key with old key
    ethnicity_dict[phonetic_key] = ethnicity_dict.pop(key)


# Helper functions

# Calculating the Herfindahl index (sum of squared shares; 1 = fully
# concentrated, 1/n = evenly spread over n categories)
def herfindahl(input_list):
    cntry_cnt = Counter(input_list)
    vals = cntry_cnt.values()
    prob = 0
    for val in vals:
        # float() guards against Python 2 integer division
        prob = prob + (val / float(sum(vals))) ** 2
    return prob
import jellyfish #checking if two words are homophones (not much accurate) x,y = map(str,input("Enter two words : ").split()) if(jellyfish.metaphone(x) == jellyfish.metaphone(y) or jellyfish.soundex(x) == jellyfish.soundex(y)): print("Homophones !") else: print("Not Homophones !") ''' #check difference between two words #returns number of changes print(jellyfish.levenshtein_distance(x,y)) '''
# %% Define helper functions # Herfindal Index def herf(input_list): from collections import Counter counts = Counter(input_list) denom = sum(counts.values()) ans = sum([(x/float(denom))**2 for x in counts.values()]) return ans # %% Step 1 - Read in the data d4 = pd.read_csv("../../data/D4_ethnic_surnames.csv") # %% Step 2 - Convert names to metaphone representations d4_long = pd.melt(d4) d4_long.columns = ['ethnicity', 'name'] d4_long['meta'] = [jellyfish.metaphone(unicode(name)) for name in d4_long.name] # d4_long = d4_long.drop_duplicates() # %% Step 3 - Get patents data d3 = pd.read_csv("../../data/D3_patent_data.csv") # %% Step 4 - Patent lastname ethnicities ## 4.1 Reshape patents data # inventor names d3_inv_names = pd.concat([d3.pnum, d3.lastname.apply(lambda y: pd.Series(y.split(';')))], #d3.cntries.apply(lambda y: pd.Series(y.split(';')))], axis = 1) d3_inv_names_melt = pd.melt(d3_inv_names, id_vars = 'pnum',
def phonetic_match(s1, s2):
    """Return True when the two strings share the same Metaphone encoding,
    i.e. they are considered phonetically identical after processing."""
    encode = jellyfish.metaphone
    return encode(s1) == encode(s2)
def add_blocking_code(blocking_type=2):
    '''(int) -> None

    Populate the ``metaphone`` column of the ``all_persons`` table; this
    column serves as the blocking key when adding new potential matches to
    the carr_match table.

    blocking_type 1: metaphone of the last name only; UPDATE statements are
                     accumulated and flushed in batches of 10,000.
    blocking_type 2: metaphone of "first_name last_name"; one UPDATE per row,
                     with elapsed-time progress printed every 10,000 rows.
    '''
    import time
    if blocking_type == 1:
        import jellyfish
        count = 0
        # Fetch every person's id and last name up front.
        query = 'Select id, last_name from all_persons'
        cur1 = run_query(query)
        ref_list = []
        for row in cur1.fetchall():
            ref_list.append([row[0], row[1]])
        cur1.close()
        query = ''
        for ref in ref_list:
            count += 1
            # Flush the accumulated multi-statement batch every 10,000 rows
            # so the query string stays bounded.
            if count % 10000 == 0:
                if query:
                    cur = run_query(query)
                    cur.fetchall()
                    cur.close()
                    query = ''
            metaphone = jellyfish.metaphone(ref[1])
            # NOTE(review): SQL built by string concatenation -- a metaphone
            # containing a double quote breaks the statement, and this is an
            # injection risk; prefer parameterized queries if run_query
            # supports them (TODO confirm).
            query += 'update all_persons set metaphone = "' + metaphone + '" where id =' + str(ref[0]) + ';'
        # Flush whatever remains in the final partial batch.
        if query:
            cur = run_query(query)
            cur.fetchall()
            cur.close()
            query = ''
    if blocking_type == 2:
        start = time.time()
        import jellyfish
        count = 0
        # importing all names and their ids
        query = 'Select id, concat(first_name, " ", last_name) from all_persons'
        cur1 = run_query(query)
        ref_list = []
        for row in cur1.fetchall():
            ref_list.append([row[0], row[1]])
        cur1.close()
        # HACK: hard-coded resume offset -- presumably the first 4,300,000
        # rows were handled in an earlier run; verify before reusing.
        for ref in ref_list[4300000:]:
            count += 1
            if count % 10000 == 0:
                end = time.time()
                elapsed = end - start
                # Python 2 print statement: progress counter + elapsed seconds.
                print count, elapsed
                # start = time.time()
            metaphone = jellyfish.metaphone(ref[1])
            # NOTE(review): same string-concatenation risk as in type 1 above.
            query = 'update all_persons set metaphone = "' + metaphone + '" where id =' + str(ref[0]) + ';'
            cur = run_query(query)
            cur.fetchall()
            cur.close()
def transform(self, data):
    """Metaphone-encode *data* when it is a string; otherwise return None."""
    if not isinstance(data, basestring):
        return None
    return metaphone(unicode(data))
def loadAuthors(authorfile, printaffilwordfreq=False):
    """Parse the authors CSV into per-author feature dicts keyed by author id.

    :param authorfile: path to a CSV whose header row is skipped; each row is
        read as (id, name, affiliation, ...) -- assumed from the indexing
        below, TODO confirm against the file.
    :param printaffilwordfreq: when True, only word-frequency statistics are
        computed/printed by computeTFIDFs and the function returns None.
    :return: dict {author_id: feature-dict} with parsed name parts, blocking
        keys (iFfL/fFfL/fFiL), metaphone of the full name, IDF weights, and
        TF-IDF vectors for affiliation and full name.
    """
    reader = csv.reader(open(authorfile, 'rb'))
    reader.next()  # skip the header row (Python 2 iterator protocol)
    authors = []
    lastname_cnt = defaultdict(int)
    iFfL_cnt = defaultdict(int)
    affiliations = []
    fullnames = []
    print_err("Parsing names and counts")
    #[^~:_`@\?\\|\'/\"\.\-0-9a-z;,\n\r \+\-\)\}&%\$\*\{\>=\^]
    # Customize nameparser's vocab: drop tokens that are more often real
    # names than honorifics, and add a few extra suffixes.
    titles_c = nameparser.constants.TITLES - set(['wing', 'lord', 'mg', 'mate', 'king', 'sharif', 'sheikh', 'rt', 'lama', 'gen', 'bg', 'baba', 'ab'])
    suffixes_c = nameparser.constants.SUFFIXES | set(['junior', 'senior', 'vii'])
    prefixes_c = nameparser.constants.PREFIXES - set(['bin'])  # more common as first name
    # Maps author id -> row index into affiliations / fullnames lists.
    id2affiliation = {}
    id2fullname = {}
    for i, line in verbose_iter(reader):
        # Transliterate everything after the id column to plain ASCII.
        line[1:] = [unidecode(unicode(cell, 'utf-8')) for cell in line[1:]]
        if line[2]:
            id2affiliation[int(line[0])] = len(affiliations)
            line[2] = strip_punc(line[2].lower())
            affiliations.append(line[2])
        fullnm = strip_punc(line[1].lower().encode('ascii'))
        if fullnm:
            id2fullname[int(line[0])] = len(fullnames)
            fullnames.append(fullnm)
        if printaffilwordfreq:
            continue  # word-frequency mode: skip the expensive name parsing
        hn = HumanName(line[1].replace('-', ' '), titles_c=titles_c, prefixes_c=prefixes_c, suffixes_c=suffixes_c)
        ai = {
            'fullname_joined': hn.full_name,
            'name_title': hn.title,
            'name_first': hn.first,
            'name_middle': hn.middle,
            'name_last': hn.last,
            'name_suffix': hn.suffix
        }
        # Normalize every parsed part: lowercase ASCII, punctuation stripped.
        ai = {k: strip_punc(v.lower().encode('ascii'), space_dashes=False) for k, v in ai.iteritems()}
        ai['name'] = hn.full_name.lower().strip().encode('ascii').translate(None, ';')
        ai['fullname'] = strip_punc(hn.full_name.lower().encode('ascii'))
        ai['fullname_parsed'] = ai['name_first'] + ai['name_middle'] + ai['name_last'] + ai['name_suffix']
        ai['affiliation'] = line[2]
        # Phonetic key of the full name with spaces removed.
        ai['metaphone_fullname'] = jellyfish.metaphone(ai['fullname']).encode('ascii').translate(None, ' ')
        # iFfL blocking key: first initial + full last name, with prefixed
        # fallbacks ('L:', 'F:', 'ID:') so partial names cannot collide with
        # the normal initial+lastname form.
        if ai['name_last']:
            if ai['name_first']:
                ai['iFfL'] = ai['name_first'][0] + ai['name_last']
            else:
                ai['iFfL'] = 'L:' + ai['name_last']
        elif ai['name_first']:
            ai['iFfL'] = 'F:' + ai['name_first']  # use full first name if no last name
        else:
            ai['iFfL'] = 'ID:' + line[0]
        # fFfL / fFiL keys: full first + full last / full first + last initial.
        if ai['name_last'] and ai['name_first']:
            ai['fFfL'] = ai['name_first'] + ai['name_last']
            ai['fFiL'] = ai['name_first'] + ai['name_last'][0]
        else:
            ai['fFfL'] = ai['iFfL']
            ai['fFiL'] = ai['iFfL']
        # Backstops: guarantee the fullname fields are never empty strings.
        if not ai['fullname_joined']:
            ai['fullname_joined'] = 'ID:' + line[0]
        if not ai['fullname']:
            ai['fullname'] = 'ID:' + line[0]
        if not ai['fullname_parsed']:
            ai['fullname_parsed'] = ai['fullname']
        authors.append((int(line[0]), ai))
        lastname_cnt[ai['name_last']] += 1
        iFfL_cnt[ai['iFfL']] += 1
    print_err("Computing TF-IDF of affiliations")
    # min_df = 2 because though we deduct non common words, they should be significant first
    affil_tfidf = computeTFIDFs(affiliations, 'all', min_df=2, words_freq=printaffilwordfreq)
    if printaffilwordfreq:
        print "-----"
    name_tfidf = computeTFIDFs(fullnames, None, min_df=2, ngram_range=(1,3), words_freq=printaffilwordfreq, token_pattern=u'(?u)\\b[a-zA-Z][a-zA-Z]+\\b')
    if printaffilwordfreq:
        return  # frequency-printing mode produces no author dict
    print_err("Calculating IDFs")
    # IDF weight per blocking key: log(N / key frequency).
    iFfL_IDF = dict(zip(iFfL_cnt.keys(), np.log(float(len(authors)) / np.array(iFfL_cnt.values()))))
    lastname_IDF = dict(zip(lastname_cnt.keys(), np.log(float(len(authors)) / np.array(lastname_cnt.values()))))
    print_err("Packing it into a list")
    # Attach the IDF weights and TF-IDF vectors to each author record.
    for i, a in enumerate(authors):
        authors[i][1]['iFfL_idf'] = iFfL_IDF[a[1]['iFfL']]
        authors[i][1]['lastname_idf'] = lastname_IDF[a[1]['name_last']]
        if len(a[1]['affiliation']) == 0:
            authors[i][1]['affil_tfidf'] = None
        else:
            authors[i][1]['affil_tfidf'] = affil_tfidf[id2affiliation[a[0]]]
        if a[0] in id2fullname:
            authors[i][1]['fullname_tfidf'] = name_tfidf[id2fullname[a[0]]]
        else:
            authors[i][1]['fullname_tfidf'] = None
        if (i+1) % 10000 == 0:
            print_err(i+1)  # progress indicator
    authors_dict = dict(authors)
    return authors_dict