def step5_calc_minpath_dt(p_txt_folder='tnt', o_txt_folder='txt', size_limit=-1): res1 = [] res2 = [] skipped = [] print ":: Calculate minimal path deviation ." print " :: processed text folder:", p_txt_folder print " :: original text folder:", o_txt_folder print "-" * 64 print " Deviation | Text1 size | Text2 size | Text diff | Distance | " print "-" * 64 for s in gen_string_pairs(p_txt_folder, o_txt_folder): if size_limit > 0 and (len(s[1]) > size_limit or len(s[0]) > size_limit): skipped.append(os.path.basename(s[0])) continue m = UprightDistanceMatrix() fLevDistance2().fill_matrix(s[0], s[1], m) p = fMinPath()(m) #p = fLevPath(insPriority = 1)(m) i = 0 mdt1 = mdt2 = 0 for col in range(m.cols - 1, -1, -1): arow = (col * m.rows) // m.cols while i < len(p) and p[i][1] == col: dt = arow - p[i][0] if dt < 0: if dt < mdt2: mdt2 = dt else: if dt > mdt1: mdt1 = dt i += 1 res1.append(mdt1) res2.append(mdt2) print " [%4d ..%3d] " % (mdt2, mdt1), "| %6d " % len( s[0]), "| %6d " % len(s[1]), " | %3d " % ( len(s[0]) - len(s[1])), " | %3d " % m.get( m.rows - 1, m.cols - 1), "| %s " % os.path.basename(s[3]) res1.sort() res2.sort() print "=" * 52 print " Maximal deviation = [", res2[0], " .. ", res1[len(res1) - 1], "]" print "=" * 52
def test_lev3(self):
    # Every Levenshtein implementation must fill an equal distance matrix
    # for each sample pair in TEXTS.
    for t in TEXTS:
        s1, s2 = t[0], t[1]

        # Python implementations writing into a caller-supplied matrix.
        m1 = DistanceMatrix()
        fClassicalLevDistance().fill_matrix(s1, s2, m1)
        m2 = DistanceMatrix()
        fLevDistance().fill_matrix(s1, s2, m2)
        m3 = DistanceMatrix()
        fLevDistance2().fill_matrix(s1, s2, m3)

        # Diagonal variant; the target matrix is built with the argument 111.
        # NOTE(review): presumably the default cell value (compare
        # def_value=111 used for m7_1) -- confirm against DistanceMatrix.
        m4 = DistanceMatrix(111)
        fLevDistanceDiag(len(s1) - 1).fill_matrix(s1, s2, m4)
        m5 = fLevDistanceDiag(len(s1) - 1).matrix(s1, s2)

        # C-code
        m6 = c_lev_distance.fLevDistance().matrix(s1, s2)
        m6_1 = DistanceMatrix()
        c_lev_distance.fLevDistance().fill_matrix(s1, s2, m6_1)
        m7 = c_lev_distance.fLevDistanceDiag(len(s1) - 1).matrix(s1, s2)
        m7_1 = DistanceMatrix(def_value=111)
        c_lev_distance.fLevDistanceDiag(len(s1) - 1).fill_matrix(s1, s2, m7_1)

        self.assertEqual(m1, m2)
        self.assertEqual(m2, m3)
        self.assertEqual(m3, m4)
        self.assertEqual(m4, m5)
        self.assertEqual(m5, m6)
        self.assertEqual(m6, m7)
        self.assertEqual(m6, m6_1)
        self.assertEqual(m7, m7_1)

        # test C and Python diagonal
        m8 = c_lev_distance.fLevDistanceDiag(2).matrix(s1, s2)
        m9 = fLevDistanceDiag(2).matrix(s1, s2)
        # Compare the two narrow-diagonal results with each other.
        self.assertEqual(m8, m9)
def test_lev3(self):
    # Every Levenshtein implementation must fill an equal distance matrix
    # for each sample pair in TEXTS.
    for t in TEXTS:
        s1, s2 = t[0], t[1]

        # Python implementations writing into a caller-supplied matrix.
        m1 = DistanceMatrix()
        fClassicalLevDistance().fill_matrix(s1, s2, m1)
        m2 = DistanceMatrix()
        fLevDistance().fill_matrix(s1, s2, m2)
        m3 = DistanceMatrix()
        fLevDistance2().fill_matrix(s1, s2, m3)

        # Diagonal variant; the target matrix is built with the argument 111.
        # NOTE(review): presumably the default cell value (compare
        # def_value=111 used for m7_1) -- confirm against DistanceMatrix.
        m4 = DistanceMatrix(111)
        fLevDistanceDiag(len(s1) - 1).fill_matrix(s1, s2, m4)
        m5 = fLevDistanceDiag(len(s1) - 1).matrix(s1, s2)

        # C-code
        m6 = c_lev_distance.fLevDistance().matrix(s1, s2)
        m6_1 = DistanceMatrix()
        c_lev_distance.fLevDistance().fill_matrix(s1, s2, m6_1)
        m7 = c_lev_distance.fLevDistanceDiag(len(s1) - 1).matrix(s1, s2)
        m7_1 = DistanceMatrix(def_value=111)
        c_lev_distance.fLevDistanceDiag(len(s1) - 1).fill_matrix(s1, s2, m7_1)

        self.assertEqual(m1, m2)
        self.assertEqual(m2, m3)
        self.assertEqual(m3, m4)
        self.assertEqual(m4, m5)
        self.assertEqual(m5, m6)
        self.assertEqual(m6, m7)
        self.assertEqual(m6, m6_1)
        self.assertEqual(m7, m7_1)

        # test C and Python diagonal
        m8 = c_lev_distance.fLevDistanceDiag(2).matrix(s1, s2)
        m9 = fLevDistanceDiag(2).matrix(s1, s2)
        # Compare the two narrow-diagonal results with each other.
        self.assertEqual(m8, m9)
def step5_calc_minpath_dt(p_txt_folder = 'tnt', o_txt_folder = 'txt', size_limit = -1): res1 = [] res2 = [] skipped = [] print ":: Calculate minimal path deviation ." print " :: processed text folder:", p_txt_folder print " :: original text folder:", o_txt_folder print "-"*64 print " Deviation | Text1 size | Text2 size | Text diff | Distance | " print "-"*64 for s in gen_string_pairs(p_txt_folder, o_txt_folder): if size_limit > 0 and (len(s[1]) > size_limit or len(s[0]) > size_limit): skipped.append(os.path.basename(s[0])) continue m = UprightDistanceMatrix() fLevDistance2().fill_matrix(s[0], s[1], m) p = fMinPath()(m) #p = fLevPath(insPriority = 1)(m) i = 0 mdt1 = mdt2 = 0 for col in range(m.cols - 1, -1, -1): arow = (col * m.rows) // m.cols while i < len(p) and p[i][1] == col: dt = arow - p[i][0] if dt < 0: if dt < mdt2: mdt2 = dt else: if dt > mdt1: mdt1 = dt i += 1 res1.append(mdt1) res2.append(mdt2) print " [%4d ..%3d] " % (mdt2, mdt1), "| %6d " % len(s[0]), "| %6d " % len(s[1]), " | %3d " % (len(s[0]) - len(s[1])), " | %3d " % m.get(m.rows-1, m.cols-1), "| %s " % os.path.basename(s[3]) res1.sort() res2.sort() print "=" * 52 print " Maximal deviation = [", res2[0], " .. ", res1[len(res1) - 1], "]" print "=" * 52
def test_lev1(self):
    # A string compared with itself must be at distance 0 for every
    # implementation.
    def expect_zero(functor_class, text):
        # Build the functor with default arguments and compare text to itself.
        self.assertEqual(functor_class()(text, text), 0)

    plain = "abcdef gk lmn"
    expect_zero(fLevDistance, plain)
    expect_zero(fLevDistance2, plain)
    expect_zero(fClassicalLevDistance, plain)
    expect_zero(fLevDistanceDiag, plain)

    # C code testing
    expect_zero(c_lev_distance.fLevDistance, plain)
    expect_zero(c_lev_distance.fLevDistanceDiag, plain)

    # unicode strings (contains Greek characters)
    greek = u"abcdef \u03A0 gk \u03A3 lmn \u03A9"
    expect_zero(c_lev_distance.fLevDistance, greek)
    expect_zero(c_lev_distance.fLevDistanceDiag, greek)
def test_lev1(self):
    # A string compared with itself must be at distance 0 for every
    # implementation.
    def expect_zero(functor_class, text):
        # Build the functor with default arguments and compare text to itself.
        self.assertEqual(functor_class()(text, text), 0)

    plain = "abcdef gk lmn"
    expect_zero(fLevDistance, plain)
    expect_zero(fLevDistance2, plain)
    expect_zero(fClassicalLevDistance, plain)
    expect_zero(fLevDistanceDiag, plain)

    # C code testing
    expect_zero(c_lev_distance.fLevDistance, plain)
    expect_zero(c_lev_distance.fLevDistanceDiag, plain)

    # unicode strings (contains Greek characters)
    greek = u"abcdef \u03A0 gk \u03A3 lmn \u03A9"
    expect_zero(c_lev_distance.fLevDistance, greek)
    expect_zero(c_lev_distance.fLevDistanceDiag, greek)
def test_lev2(self):
    # Every implementation must return the same distance as
    # fClassicalLevDistance for each sample pair in TEXTS.
    for sample in TEXTS:
        left, right = sample[0], sample[1]
        expected = fClassicalLevDistance()(left, right)

        self.assertEqual(fLevDistance()(left, right), expected)
        self.assertEqual(fLevDistance2()(left, right), expected)

        # C-code
        self.assertEqual(c_lev_distance.fLevDistance()(left, right), expected)
        c_diag = c_lev_distance.fLevDistanceDiag(len(left))
        self.assertEqual(c_diag(left, right), expected)

        # diagonal levenshtein
        py_diag = fLevDistanceDiag(len(left))
        self.assertEqual(py_diag(left, right), expected)

        # test diag bounds handling: the argument is ten times the length
        # of the first text
        py_wide = fLevDistanceDiag(10 * len(left))
        self.assertEqual(py_wide(left, right), expected)
        c_wide = c_lev_distance.fLevDistanceDiag(10 * len(left))
        self.assertEqual(c_wide(left, right), expected)
def test_lev2(self):
    # Every implementation must return the same distance as
    # fClassicalLevDistance for each sample pair in TEXTS.
    for sample in TEXTS:
        left, right = sample[0], sample[1]
        expected = fClassicalLevDistance()(left, right)

        self.assertEqual(fLevDistance()(left, right), expected)
        self.assertEqual(fLevDistance2()(left, right), expected)

        # C-code
        self.assertEqual(c_lev_distance.fLevDistance()(left, right), expected)
        c_diag = c_lev_distance.fLevDistanceDiag(len(left))
        self.assertEqual(c_diag(left, right), expected)

        # diagonal levenshtein
        py_diag = fLevDistanceDiag(len(left))
        self.assertEqual(py_diag(left, right), expected)

        # test diag bounds handling: the argument is ten times the length
        # of the first text
        py_wide = fLevDistanceDiag(10 * len(left))
        self.assertEqual(py_wide(left, right), expected)
        c_wide = c_lev_distance.fLevDistanceDiag(10 * len(left))
        self.assertEqual(c_wide(left, right), expected)
def step3_calc_lev_distance(): for s in gen_string_pairs("cleaned-tnt", "txt"): d = fLevDistance2()(s[0], s[1]) print "Lev distance [", os.path.basename(s[2]), "(", len(s[0]),"b),", os.path.basename(s[3]),"(", len(s[1]),")] = ", d
def step4_calc_minpath(): for s in gen_string_pairs("tnt", "txt"): m = UprightDistanceMatrix() fLevDistance2().fill_matrix(s[0], s[1], m) print fMinPath()(m)
def step3_calc_lev_distance(): for s in gen_string_pairs("cleaned-tnt", "txt"): d = fLevDistance2()(s[0], s[1]) print "Lev distance [", os.path.basename(s[2]), "(", len( s[0]), "b),", os.path.basename(s[3]), "(", len(s[1]), ")] = ", d