コード例 #1
0
ファイル: named.py プロジェクト: Glank/rdp
 def ident(self, name):
     """Resolve ``name`` to a known entity, prompting when the match is ambiguous.

     Every row yielded by ``self.entity_type.result_mapper.map(results)`` is
     scored against the standardized ``name`` with
     ``jaccard_ngram_dist(stand, name, 3)``.  The ``self.max_matches`` rows
     with the lowest distance are kept, and every kept row whose distance is
     within ``self.max_percent_diff`` of the best one (difference divided by
     the mean of the two distances) is treated as a possible answer.

     Args:
         name: The name to identify, exactly as supplied by the caller.

     Returns:
         The ``(given, uri)`` pair of the best candidate when it is the only
         possibility; otherwise whatever ``self.prompt_possibles`` returns
         for the original name and the list of close candidates.

     Raises:
         IndexError: If no candidate is left to choose from (the query
             produced no rows, or ``self.max_matches`` trimmed them all).
     """
     requested = name
     results = self.entity_type.query.get_results()
     mapper = self.entity_type.result_mapper
     name = mapper.stand(name)

     candidates = []
     for uri, given, stand in mapper.map(results):
         dist = jaccard_ngram_dist(stand, name, 3)
         candidates.append(((given, uri), dist))
         # Trim while streaming so the working list does not keep every row
         # (this only shrinks the list when max_matches is at most 100).
         if len(candidates) > 100:
             candidates.sort(key=lambda entry: entry[1])
             candidates = candidates[:self.max_matches]
     candidates.sort(key=lambda entry: entry[1])
     candidates = candidates[:self.max_matches]

     # Fail with a descriptive message instead of an anonymous
     # "list index out of range" from candidates[0]; the exception type is
     # kept so existing handlers still apply.
     if not candidates:
         raise IndexError(
             "no candidate entities to match %r against" % (requested,))

     best_match, best_dist = candidates[0]
     possibles = [best_match]
     for match, dist in candidates[1:]:
         total = dist + best_dist
         if total == 0:
             # Both distances are zero: identical scores, and no division by 0.
             percent_diff = 0
         else:
             # float() keeps this a true division on Python 2 as well.
             percent_diff = (dist - best_dist) * 2 / float(total)
         if percent_diff < self.max_percent_diff:
             possibles.append(match)

     if len(possibles) > 1:
         return self.prompt_possibles(requested, possibles)
     return possibles[0]
コード例 #2
0
ファイル: levenshtein.py プロジェクト: Glank/rdp
    'adam smith',
    'bob smith',
    'carl smith',
    'dale jones',
    'ernest kirstein'
]

import ngrams
# Names that the demo below scores against one another, pair by pair.
# Alternative input set: to_compare = ['rj smith', 'rj', 'cs smith']
to_compare = ['tom smith', 'john smith', 'tom']
def run_comps(f, to_compare):
    """Print ``f(s1, s2)`` for every unordered pair of items in ``to_compare``.

    Each item is paired with every item that follows it, so a pair is
    visited exactly once, in list order.  Each output line holds the result
    of ``f`` followed by the ``repr`` of both items, separated by single
    spaces.

    Args:
        f: Callable taking two items and returning the value to print.
        to_compare: Sequence of items to compare pairwise.
    """
    for i, s1 in enumerate(to_compare):
        for s2 in to_compare[i + 1:]:
            # One parenthesized argument prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("%s %r %r" % (f(s1, s2), s1, s2))

# First pass: distances straight from ngrams.jaccard_ngram_dist (the 3 is
# presumably the n-gram size -- confirm against the ngrams module).
# Each print takes a single parenthesized argument, so the output is the same
# under the Python 2 print statement and the Python 3 print function.
print("Without Info:")
d = lambda a, b: ngrams.jaccard_ngram_dist(a, b, 3)
run_comps(d, to_compare)

# Show the sample names, one per line and tab-indented, then build the model
# from them that the second pass passes to info_ngram_dist.
print("\nSample Set:")
print('\t' + '\n\t'.join(names))
model = ngram_model(names, 3)

# Second pass: the same pairs, scored by info_ngram_dist with the model.
print("\nWith Info:")
d = lambda a, b: info_ngram_dist(3, a, b, model)
run_comps(d, to_compare)