Beispiel #1
0
def step5_calc_minpath_dt(p_txt_folder='tnt',
                          o_txt_folder='txt',
                          size_limit=-1):
    """Print how far each minimal path strays from the matrix diagonal.

    For every item yielded by ``gen_string_pairs(p_txt_folder,
    o_txt_folder)`` a distance matrix is filled by ``fLevDistance2``, a path
    through it is obtained from ``fMinPath`` and, for every path cell, the
    signed offset between the proportional "diagonal" row of its column and
    the path row is measured.  One table row is printed per pair and the
    overall extremes are printed at the end.

    Arguments:
        p_txt_folder -- first folder handed to ``gen_string_pairs``.
        o_txt_folder -- second folder handed to ``gen_string_pairs``.
        size_limit   -- when positive, pairs in which either text is longer
                        (by ``len``) than this value are skipped.

    Returns nothing; all results are written to standard output.
    """
    max_devs = []  # largest non-negative offset of every processed pair
    min_devs = []  # most negative offset of every processed pair

    # Every print below uses the single-argument parenthesised form: it emits
    # the same text as the former Python 2 print statements and is also valid
    # as the Python 3 print function.
    print(":: Calculate minimal path deviation .")
    print("     :: processed text folder: %s" % (p_txt_folder,))
    print("     :: original  text folder: %s" % (o_txt_folder,))
    print("-" * 64)
    print("   Deviation   | Text1 size | Text2 size | Text diff  | Distance | ")
    print("-" * 64)
    for s in gen_string_pairs(p_txt_folder, o_txt_folder):
        # Oversized pairs are skipped silently.  The former ``skipped`` list
        # was never read (and took the basename of the text itself), so it
        # has been dropped.
        if size_limit > 0 and (len(s[1]) > size_limit
                               or len(s[0]) > size_limit):
            continue

        m = UprightDistanceMatrix()
        fLevDistance2().fill_matrix(s[0], s[1], m)
        p = fMinPath()(m)

        # Walk the columns from last to first while consuming the path
        # sequentially; this assumes the path cells are ordered by
        # descending column index -- TODO confirm against fMinPath.
        i = 0
        mdt1 = mdt2 = 0
        for col in range(m.cols - 1, -1, -1):
            # Row that a straight corner-to-corner line crosses in this column.
            arow = (col * m.rows) // m.cols
            while i < len(p) and p[i][1] == col:
                dt = arow - p[i][0]
                mdt1 = max(mdt1, dt)
                mdt2 = min(mdt2, dt)
                i += 1

        max_devs.append(mdt1)
        min_devs.append(mdt2)

        print(" ".join((
            " [%4d ..%3d] " % (mdt2, mdt1),
            "|   %6d  " % len(s[0]),
            "|  %6d  " % len(s[1]),
            " |   %3d    " % (len(s[0]) - len(s[1])),
            " |   %3d   " % m.get(m.rows - 1, m.cols - 1),
            "| %s " % os.path.basename(s[3]),
        )))

    print("=" * 52)
    if max_devs:
        # min()/max() replace the former sort-then-index lookup.
        print(" Maximal deviation = [ %s  ..  %s ]"
              % (min(min_devs), max(max_devs)))
    else:
        # Nothing was processed (no pairs, or all of them skipped): the
        # original code raised IndexError at this point.
        print(" Maximal deviation = [ n/a ] (no text pairs were processed)")
    print("=" * 52)
Beispiel #2
0
    def test_lev3(self):
        """Check that all matrix builders produce equal matrices for TEXTS.

        For each entry of ``TEXTS`` the matrices produced by the Python
        implementations and by the ``c_lev_distance`` (C code) ones, both
        through ``fill_matrix`` into a caller-supplied ``DistanceMatrix`` and
        through ``matrix``, must compare equal.
        """
        for t in TEXTS:
            s1, s2 = t[0], t[1]

            m1 = DistanceMatrix()
            fClassicalLevDistance().fill_matrix(s1, s2, m1)

            m2 = DistanceMatrix()
            fLevDistance().fill_matrix(s1, s2, m2)

            m3 = DistanceMatrix()
            fLevDistance2().fill_matrix(s1, s2, m3)

            m4 = DistanceMatrix(111)
            fLevDistanceDiag(len(s1) - 1).fill_matrix(s1, s2, m4)

            m5 = fLevDistanceDiag(len(s1) - 1).matrix(s1, s2)

            # C-code
            m6 = c_lev_distance.fLevDistance().matrix(s1, s2)

            m6_1 = DistanceMatrix()
            c_lev_distance.fLevDistance().fill_matrix(s1, s2, m6_1)
            m7 = c_lev_distance.fLevDistanceDiag(len(s1) - 1).matrix(s1, s2)

            m7_1 = DistanceMatrix(def_value=111)
            c_lev_distance.fLevDistanceDiag(len(s1) - 1).fill_matrix(
                s1, s2, m7_1)

            self.assertEqual(m1, m2)
            self.assertEqual(m2, m3)
            self.assertEqual(m3, m4)
            self.assertEqual(m4, m5)
            self.assertEqual(m5, m6)
            self.assertEqual(m6, m7)
            self.assertEqual(m6, m6_1)
            self.assertEqual(m7, m7_1)

            # test C and Python diagonal
            m8 = c_lev_distance.fLevDistanceDiag(2).matrix(s1, s2)
            m9 = fLevDistanceDiag(2).matrix(s1, s2)
            # Compare the two matrices just built; the previous assertion
            # re-checked m6 against m7 and left m8/m9 unverified.
            self.assertEqual(m8, m9)
Beispiel #3
0
    def test_lev3(self):
        """Verify that Python and C matrix builders agree on every TEXTS pair.

        Matrices are built with each implementation (``fill_matrix`` into a
        fresh ``DistanceMatrix`` or the ``matrix`` shortcut) and compared
        pairwise with ``assertEqual``.
        """
        for t in TEXTS:
            s1, s2 = t[0], t[1]
            diag_arg = len(s1) - 1

            m1 = DistanceMatrix()
            fClassicalLevDistance().fill_matrix(s1, s2, m1)

            m2 = DistanceMatrix()
            fLevDistance().fill_matrix(s1, s2, m2)

            m3 = DistanceMatrix()
            fLevDistance2().fill_matrix(s1, s2, m3)

            m4 = DistanceMatrix(111)
            fLevDistanceDiag(diag_arg).fill_matrix(s1, s2, m4)

            m5 = fLevDistanceDiag(diag_arg).matrix(s1, s2)

            # C-code
            m6 = c_lev_distance.fLevDistance().matrix(s1, s2)

            m6_1 = DistanceMatrix()
            c_lev_distance.fLevDistance().fill_matrix(s1, s2, m6_1)
            m7 = c_lev_distance.fLevDistanceDiag(diag_arg).matrix(s1, s2)

            m7_1 = DistanceMatrix(def_value=111)
            c_lev_distance.fLevDistanceDiag(diag_arg).fill_matrix(
                s1, s2, m7_1)

            # Same comparisons, in the same order, as the former
            # one-assertion-per-line list.
            for left, right in ((m1, m2), (m2, m3), (m3, m4), (m4, m5),
                                (m5, m6), (m6, m7), (m6, m6_1), (m7, m7_1)):
                self.assertEqual(left, right)

            # test C and Python diagonal
            m8 = c_lev_distance.fLevDistanceDiag(2).matrix(s1, s2)
            m9 = fLevDistanceDiag(2).matrix(s1, s2)
            # Fixed copy-paste slip: this used to repeat assertEqual(m6, m7),
            # so the two matrices built above were never compared.
            self.assertEqual(m8, m9)
Beispiel #4
0
def step5_calc_minpath_dt(p_txt_folder='tnt', o_txt_folder='txt', size_limit=-1):
    """Report the deviation of the minimal path from the matrix diagonal.

    Each item produced by ``gen_string_pairs(p_txt_folder, o_txt_folder)``
    is turned into an ``UprightDistanceMatrix`` (filled by
    ``fLevDistance2``); ``fMinPath`` supplies a path through it.  For every
    path cell the offset ``diagonal_row - path_row`` is computed, the
    per-pair extremes are printed as a table row, and the extremes over all
    pairs are printed as a summary.

    Arguments:
        p_txt_folder -- first folder argument for ``gen_string_pairs``.
        o_txt_folder -- second folder argument for ``gen_string_pairs``.
        size_limit   -- if greater than zero, a pair is skipped when the
                        ``len`` of either text exceeds it.

    Output goes to standard output; nothing is returned.
    """
    overall_low = None   # smallest (most negative) offset seen so far
    overall_high = None  # largest offset seen so far

    # Single-argument print(...) produces the same bytes as the old Python 2
    # print statements while also being valid Python 3.
    print(":: Calculate minimal path deviation .")
    print("     :: processed text folder: %s" % (p_txt_folder,))
    print("     :: original  text folder: %s" % (o_txt_folder,))
    print("-" * 64)
    print("   Deviation   | Text1 size | Text2 size | Text diff  | Distance | ")
    print("-" * 64)

    for s in gen_string_pairs(p_txt_folder, o_txt_folder):
        size1, size2 = len(s[0]), len(s[1])
        if size_limit > 0 and (size2 > size_limit or size1 > size_limit):
            # Too large: skip.  (The old never-read ``skipped`` list, which
            # stored the basename of the text rather than of a file name,
            # has been removed.)
            continue

        m = UprightDistanceMatrix()
        fLevDistance2().fill_matrix(s[0], s[1], m)
        p = fMinPath()(m)

        # Columns are visited right-to-left and the path is consumed in
        # order; presumably fMinPath returns cells sorted by descending
        # column -- verify against its implementation.
        i = 0
        high = low = 0
        for col in range(m.cols - 1, -1, -1):
            # Row where the corner-to-corner straight line meets this column.
            arow = (col * m.rows) // m.cols
            while i < len(p) and p[i][1] == col:
                dt = arow - p[i][0]
                if dt < low:
                    low = dt
                elif dt > high:
                    high = dt
                i += 1

        overall_low = low if overall_low is None else min(overall_low, low)
        overall_high = high if overall_high is None else max(overall_high, high)

        row = [
            " [%4d ..%3d] " % (low, high),
            "|   %6d  " % size1,
            "|  %6d  " % size2,
            " |   %3d    " % (size1 - size2),
            " |   %3d   " % m.get(m.rows - 1, m.cols - 1),
            "| %s " % os.path.basename(s[3]),
        ]
        print(" ".join(row))

    print("=" * 52)
    if overall_low is None:
        # No pair was processed; the original indexed an empty list here and
        # raised IndexError.
        print(" Maximal deviation = [ n/a ] (no text pairs were processed)")
    else:
        print(" Maximal deviation = [ %s  ..  %s ]" % (overall_low, overall_high))
    print("=" * 52)
Beispiel #5
0
    def test_lev1(self):
        """A string compared with itself must yield distance 0.

        The plain string is checked with the Python implementations and the
        ``c_lev_distance`` ones; the unicode string is checked with the
        ``c_lev_distance`` ones only.
        """
        plain = "abcdef gk lmn"
        for factory in (fLevDistance, fLevDistance2, fClassicalLevDistance,
                        fLevDistanceDiag):
            self.assertEqual(factory()(plain, plain), 0)

        # C code testing
        for factory in (c_lev_distance.fLevDistance,
                        c_lev_distance.fLevDistanceDiag):
            self.assertEqual(factory()(plain, plain), 0)

        # unicode strings (contains Greek characters)
        greek = u"abcdef \u03A0 gk \u03A3 lmn \u03A9"
        for factory in (c_lev_distance.fLevDistance,
                        c_lev_distance.fLevDistanceDiag):
            self.assertEqual(factory()(greek, greek), 0)
Beispiel #6
0
 def test_lev1(self):
     """Every checked distance callable must return 0 for identical inputs."""
     def expect_zero(distance, text):
         # The same object is passed as both arguments, as in s1 = s2 = ...
         self.assertEqual(distance(text, text), 0)

     plain = "abcdef gk lmn"
     expect_zero(fLevDistance(), plain)
     expect_zero(fLevDistance2(), plain)
     expect_zero(fClassicalLevDistance(), plain)
     expect_zero(fLevDistanceDiag(), plain)

     # C code testing
     expect_zero(c_lev_distance.fLevDistance(), plain)
     expect_zero(c_lev_distance.fLevDistanceDiag(), plain)

     # unicode strings (contains Greek characters)
     greek = u"abcdef \u03A0 gk \u03A3 lmn \u03A9"
     expect_zero(c_lev_distance.fLevDistance(), greek)
     expect_zero(c_lev_distance.fLevDistanceDiag(), greek)
Beispiel #7
0
    def test_lev2(self):
        """Compare every distance implementation with the classical one.

        For each entry of ``TEXTS`` the value of ``fClassicalLevDistance`` is
        the reference; the other Python and ``c_lev_distance`` callables must
        return the same value.
        """
        for sample in TEXTS:
            left, right = sample[0], sample[1]
            expected = fClassicalLevDistance()(left, right)
            exact = len(left)
            oversized = 10 * len(left)

            # Python implementations followed by the C-code one.
            for factory in (fLevDistance, fLevDistance2,
                            c_lev_distance.fLevDistance):
                self.assertEqual(factory()(left, right), expected)

            # Diagonal variants: C-code and Python with len(left), then both
            # again with a ten times larger argument (diag bounds handling).
            for diag_factory, diag_arg in (
                    (c_lev_distance.fLevDistanceDiag, exact),
                    (fLevDistanceDiag, exact),
                    (fLevDistanceDiag, oversized),
                    (c_lev_distance.fLevDistanceDiag, oversized)):
                self.assertEqual(diag_factory(diag_arg)(left, right), expected)
Beispiel #8
0
    def test_lev2(self):
        """All distance callables must agree with ``fClassicalLevDistance``.

        Runs once per entry of ``TEXTS``, using the entry's first two
        elements as the strings to compare.
        """
        for entry in TEXTS:
            first, second = entry[0], entry[1]
            reference = fClassicalLevDistance()(first, second)

            def check(distance):
                # Apply the callable to the current pair and compare the
                # result with the classical value.
                self.assertEqual(distance(first, second), reference)

            check(fLevDistance())
            check(fLevDistance2())

            # C-code
            check(c_lev_distance.fLevDistance())
            check(c_lev_distance.fLevDistanceDiag(len(first)))

            # diagonal levenshtein
            check(fLevDistanceDiag(len(first)))

            # test diag bounds handling
            check(fLevDistanceDiag(10 * len(first)))
            check(c_lev_distance.fLevDistanceDiag(10 * len(first)))
Beispiel #9
0
def step3_calc_lev_distance():
    """Print the ``fLevDistance2`` distance of every "cleaned-tnt"/"txt" pair.

    For each item yielded by ``gen_string_pairs("cleaned-tnt", "txt")`` the
    first two elements are the compared strings; elements 2 and 3 are passed
    through ``os.path.basename`` to label the output line.
    """
    for s in gen_string_pairs("cleaned-tnt", "txt"):
        d = fLevDistance2()(s[0], s[1])
        # One pre-formatted string in parentheses prints the same text as the
        # old comma-separated Python 2 print statement (items joined by one
        # space) and is also valid as the Python 3 print function.
        print("Lev distance [ %s ( %s b), %s ( %s )] =  %s" % (
            os.path.basename(s[2]), len(s[0]),
            os.path.basename(s[3]), len(s[1]), d))
Beispiel #10
0
def step4_calc_minpath():
    """Print the ``fMinPath`` result for every "tnt"/"txt" string pair.

    For each item yielded by ``gen_string_pairs("tnt", "txt")`` an
    ``UprightDistanceMatrix`` is filled by ``fLevDistance2`` from the item's
    first two elements, and the value returned by ``fMinPath`` for that
    matrix is printed.
    """
    for s in gen_string_pairs("tnt", "txt"):
        m = UprightDistanceMatrix()
        fLevDistance2().fill_matrix(s[0], s[1], m)
        # Parenthesised single-argument form: identical output under the
        # Python 2 print statement, and valid as the Python 3 print function.
        print(fMinPath()(m))
Beispiel #11
0
def step4_calc_minpath():
    """Compute and print a minimal path for each "tnt"/"txt" pair.

    Items come from ``gen_string_pairs("tnt", "txt")``; elements 0 and 1 of
    each item are fed to ``fLevDistance2().fill_matrix`` together with a new
    ``UprightDistanceMatrix``, and ``fMinPath`` is then applied to it.
    """
    for s in gen_string_pairs("tnt", "txt"):
        matrix = UprightDistanceMatrix()
        fLevDistance2().fill_matrix(s[0], s[1], matrix)
        path = fMinPath()(matrix)
        # print(path) with a single argument behaves the same as the former
        # "print path" statement on Python 2 and also parses on Python 3.
        print(path)
Beispiel #12
0
def step3_calc_lev_distance():
    """Print one distance line per "cleaned-tnt"/"txt" string pair.

    Pairs come from ``gen_string_pairs("cleaned-tnt", "txt")``.  Elements 0
    and 1 of each item are compared with ``fLevDistance2``; the basenames of
    elements 2 and 3 and the lengths of both strings are shown next to the
    resulting distance.
    """
    # Layout of the former comma-separated print statement: every item was
    # separated by exactly one space, hence the doubled space before the
    # distance (the preceding literal already ended with a space).
    template = "Lev distance [ %s ( %s b), %s ( %s )] =  %s"
    for s in gen_string_pairs("cleaned-tnt", "txt"):
        d = fLevDistance2()(s[0], s[1])
        fields = (os.path.basename(s[2]), len(s[0]),
                  os.path.basename(s[3]), len(s[1]), d)
        # Single-argument print(...) works on both Python 2 and Python 3.
        print(template % fields)