def get_lshtein_shortlist(srcfile, tgtfiles, listsize=3):
    """Return the targets whose stripped HTML is closest to the source's.

    Closeness is ``lshtein.distance`` between ``strip_text(srcfile['html'])``
    and ``strip_text(t['html'])`` for each target ``t``.

    Args:
        srcfile: mapping with an ``'html'`` entry.
        tgtfiles: iterable of mappings, each with an ``'html'`` entry.
        listsize: maximum number of targets to return.

    Returns:
        A new list of at most ``listsize`` targets ordered by ascending
        distance; targets with equal distance keep their input order.
        The list is shorter than ``listsize`` when fewer targets are
        available, and empty when there are none or ``listsize`` <= 0.
    """
    candidates = list(tgtfiles)
    if listsize <= 0 or not candidates:
        return []

    # The source text is the same for every comparison, so strip it once.
    src_text = strip_text(srcfile['html'])

    def distance_to_source(tgt):
        return lshtein.distance(src_text, strip_text(tgt['html']))

    # sorted() is stable and evaluates the key once per element, which gives
    # the same order as repeatedly extracting the first minimum while
    # computing each distance a single time and never comparing the target
    # dicts themselves.
    return sorted(candidates, key=distance_to_source)[:listsize]
def get_lshtein_shortlist(srcfile, tgtfiles, listsize=3):
    """Return the targets whose stripped HTML is closest to the source's.

    Closeness is ``lshtein.distance`` between ``strip_text(srcfile['html'])``
    and ``strip_text(t['html'])`` for each target ``t``.

    Args:
        srcfile: mapping with an ``'html'`` entry.
        tgtfiles: iterable of mappings, each with an ``'html'`` entry.
        listsize: maximum number of targets to return.

    Returns:
        A new list of at most ``listsize`` targets ordered by ascending
        distance; targets with equal distance keep their input order.
        The list is shorter than ``listsize`` when fewer targets are
        available, and empty when there are none or ``listsize`` <= 0.
    """
    candidates = list(tgtfiles)
    if listsize <= 0 or not candidates:
        return []

    # The source text is the same for every comparison, so strip it once.
    src_text = strip_text(srcfile['html'])

    def distance_to_source(tgt):
        return lshtein.distance(src_text, strip_text(tgt['html']))

    # sorted() is stable and evaluates the key once per element, which gives
    # the same order as repeatedly extracting the first minimum while
    # computing each distance a single time and never comparing the target
    # dicts themselves.
    return sorted(candidates, key=distance_to_source)[:listsize]
def _relative_score(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if it is undefined."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator


def get_match(srcfile, slist, hweight, nweight, uweight, c, v):
    """Pick the best-scoring target in ``slist`` for ``srcfile`` and claim it.

    Each target gets a confidence that is the weighted sum of three scores,
    all measured relative to the source:

    * html: ``(len(src_text) - distance(src_text, tgt_text)) / len(src_text)``
      on the ``strip_text`` output of both ``'html'`` entries;
    * num: the fraction of the source's ``get_numbers`` values that also
      occur in the target;
    * url: ``(len(src_url) - distance(src_url, tgt_url)) / len(src_url)``.

    A score whose denominator is zero (source has no stripped text, no
    numbers, or an empty URL) counts as 0.0 instead of raising
    ZeroDivisionError.

    Args:
        srcfile: mapping with ``'html'``, ``'url'`` and ``'conf'`` entries.
        slist: sequence of target mappings with ``'html'``, ``'url'``,
            ``'filename'`` and ``'match'`` entries.
        hweight: weight of the html score.
        nweight: weight of the number score.
        uweight: weight of the url score.
        c: minimum confidence required to accept a match.
        v: when true, print the per-target scores.

    Returns:
        ``(target, confidence)`` when a target was claimed, otherwise
        ``None`` (empty shortlist, best confidence below ``c``, or the
        target keeps its current match).

    Side effects:
        On success ``target['match']`` is set to ``srcfile``; if the target
        was already matched, the displaced entry gets ``'match'`` reset to
        ``None`` and ``'conf'`` reset to 0.0.
    """
    # Nothing to match against; the original indexed slist[0] before
    # checking the length and raised IndexError here.
    if not slist:
        return None

    # Source-side values do not change between targets, so compute them once.
    src_text = strip_text(srcfile['html'])
    src_numbers = set(get_numbers(srcfile['html']))
    src_url = srcfile['url']

    match = slist[0]
    bconf = 0.0
    for t in slist:
        html = lshtein.distance(src_text, strip_text(t['html']))
        html_rel = _relative_score(len(src_text) - html, len(src_text))

        shared_numbers = src_numbers & set(get_numbers(t['html']))
        num_rel = _relative_score(len(shared_numbers), len(src_numbers))

        url = lshtein.distance(src_url, t['url'])
        url_rel = _relative_score(len(src_url) - url, len(src_url))

        conf = url_rel * uweight + num_rel * nweight + html_rel * hweight

        if v:
            # A single parenthesised argument prints the same text as the
            # former two-item print statement and parses on Python 2 and 3.
            report = (
                ("TGTFILE: ", t['filename']),
                ("url: ", url_rel),
                ("num: ", num_rel),
                ("html: ", html_rel),
                ("conf:", conf),
            )
            for label, value in report:
                print("%s %s" % (label, value))

        if conf > bconf:
            match = t
            bconf = conf

    if bconf < c:  # below threshold
        return None

    if match['match'] is not None:
        # NOTE(review): this compares the *new* source's stored 'conf' with
        # the new confidence; comparing the confidence of the pairing the
        # target already has (match['match']['conf']) may have been
        # intended. Behaviour kept as-is; confirm against the caller.
        if srcfile['conf'] < bconf:  # better match: displace the old one
            match['match']['match'] = None
            match['match']['conf'] = 0.0
            match['match'] = srcfile
            return (match, bconf)
        return None  # srcfile not matched

    match['match'] = srcfile
    return (match, bconf)
def _relative_score(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if it is undefined."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator


def get_match(srcfile, slist, hweight, nweight, uweight, c, v):
    """Pick the best-scoring target in ``slist`` for ``srcfile`` and claim it.

    Each target gets a confidence that is the weighted sum of three scores,
    all measured relative to the source:

    * html: ``(len(src_text) - distance(src_text, tgt_text)) / len(src_text)``
      on the ``strip_text`` output of both ``'html'`` entries;
    * num: the fraction of the source's ``get_numbers`` values that also
      occur in the target;
    * url: ``(len(src_url) - distance(src_url, tgt_url)) / len(src_url)``.

    A score whose denominator is zero (source has no stripped text, no
    numbers, or an empty URL) counts as 0.0 instead of raising
    ZeroDivisionError.

    Args:
        srcfile: mapping with ``'html'``, ``'url'`` and ``'conf'`` entries.
        slist: sequence of target mappings with ``'html'``, ``'url'``,
            ``'filename'`` and ``'match'`` entries.
        hweight: weight of the html score.
        nweight: weight of the number score.
        uweight: weight of the url score.
        c: minimum confidence required to accept a match.
        v: when true, print the per-target scores.

    Returns:
        ``(target, confidence)`` when a target was claimed, otherwise
        ``None`` (empty shortlist, best confidence below ``c``, or the
        target keeps its current match).

    Side effects:
        On success ``target['match']`` is set to ``srcfile``; if the target
        was already matched, the displaced entry gets ``'match'`` reset to
        ``None`` and ``'conf'`` reset to 0.0.
    """
    # Nothing to match against; the original indexed slist[0] before
    # checking the length and raised IndexError here.
    if not slist:
        return None

    # Source-side values do not change between targets, so compute them once.
    src_text = strip_text(srcfile['html'])
    src_numbers = set(get_numbers(srcfile['html']))
    src_url = srcfile['url']

    match = slist[0]
    bconf = 0.0
    for t in slist:
        html = lshtein.distance(src_text, strip_text(t['html']))
        html_rel = _relative_score(len(src_text) - html, len(src_text))

        shared_numbers = src_numbers & set(get_numbers(t['html']))
        num_rel = _relative_score(len(shared_numbers), len(src_numbers))

        url = lshtein.distance(src_url, t['url'])
        url_rel = _relative_score(len(src_url) - url, len(src_url))

        conf = url_rel * uweight + num_rel * nweight + html_rel * hweight

        if v:
            # A single parenthesised argument prints the same text as the
            # former two-item print statement and parses on Python 2 and 3.
            report = (
                ("TGTFILE: ", t['filename']),
                ("url: ", url_rel),
                ("num: ", num_rel),
                ("html: ", html_rel),
                ("conf:", conf),
            )
            for label, value in report:
                print("%s %s" % (label, value))

        if conf > bconf:
            match = t
            bconf = conf

    if bconf < c:  # below threshold
        return None

    if match['match'] is not None:
        # NOTE(review): this compares the *new* source's stored 'conf' with
        # the new confidence; comparing the confidence of the pairing the
        # target already has (match['match']['conf']) may have been
        # intended. Behaviour kept as-is; confirm against the caller.
        if srcfile['conf'] < bconf:  # better match: displace the old one
            match['match']['match'] = None
            match['match']['conf'] = 0.0
            match['match'] = srcfile
            return (match, bconf)
        return None  # srcfile not matched

    match['match'] = srcfile
    return (match, bconf)
def test_basic_distance(self):
    """Check ``lshtein.distance`` against a few hand-computed string pairs."""
    # (first string, second string, expected distance)
    expectations = (
        ("word", "word", 0),
        ("word", "", 4),
        ("", "word", 4),
        ("word", "word 2", 2),
        ("words", "word", 1),
        ("word", "woord", 1),
    )
    for first, second, expected in expectations:
        assert lshtein.distance(first, second) == expected
def test_basic_distance(self):
    """Check ``lshtein.distance`` against a few hand-computed string pairs."""
    # NOTE(review): the comparer instance is constructed but never used;
    # every assertion below goes through the module-level lshtein.distance.
    # Kept in case construction itself is meant to be exercised -- confirm.
    comparer = lshtein.LevenshteinComparer()

    # (first string, second string, expected distance)
    expectations = (
        ("word", "word", 0),
        ("word", "", 4),
        ("", "word", 4),
        ("word", "word 2", 2),
        ("words", "word", 1),
        ("word", "woord", 1),
    )
    for first, second, expected in expectations:
        assert lshtein.distance(first, second) == expected