Beispiel #1
0
def soft_idf(name1, name2, counts, sim_func=jaccard):
    ''' Implementation of Soft IDF matching.  Finds the best pairing of tokens in the two
    names based on some similarity metric, and then scores that pairing based on the token
    similarity and the inverse document frequency (IDF) of the terms.

    name1, name2 -- the two names to compare; each is split with tokens().
    counts       -- mapping from token to occurrence count.  Tokens are looked up with
                    counts[t], so a collections.Counter yields 0 for unseen tokens while a
                    plain dict raises KeyError.  len(counts) is used as the IDF numerator.
    sim_func     -- token-level similarity function, called as sim_func(a, b).

    Returns the weighted similarity of the matched token pairs divided by the product of
    the two names' IDF norms.  Returns 0.0 when that normaliser is zero (for example when
    either name produces no tokens) instead of raising ZeroDivisionError.

    math.log raises ValueError if counts is empty while the names do contain tokens,
    because the IDF argument is then 0. '''
    name1_tokens = list(tokens(name1))
    name2_tokens = list(tokens(name2))
    # token_match is used as a mapping {token of name1: token of name2} below.
    tok_match = token_match(name1_tokens, name2_tokens, sim_func)

    # One IDF entry per distinct token across both names; the +1 keeps the divisor
    # non-zero for tokens whose count is 0.
    idf = {t: math.log(len(counts) / (counts[t] + 1))
           for t in set(name1_tokens).union(name2_tokens)}

    def sx(tk):
        # Norm of the length-scaled IDF weights; falsy (empty) tokens are skipped but
        # still counted in len(tk).
        return math.sqrt(sum([(idf[t] / len(tk)) ** 2 for t in tk if t]))

    normaliser = sx(name1_tokens) * sx(name2_tokens)
    if not normaliser:
        # No tokens, or every IDF weight is zero: nothing to score against.
        return 0.0

    weighted = sum(sim_func(a, b) * idf[a] / len(name1_tokens) * idf[b] / len(name2_tokens)
                   for a, b in tok_match.items())
    return weighted / normaliser
Beispiel #2
0
def term_counts(names):
    ''' Tally how often each token occurs across every name in names.

    Each name is split with tokens(); the result is a Counter keyed by token. '''
    # Tokenise every name up front, then count the flattened token stream.
    per_name_tokens = [tokens(name) for name in names]
    all_tokens = itertools.chain.from_iterable(per_name_tokens)
    return Counter(all_tokens)