Example #1
0
    def test_lev5(self):
        """Check that all matrix implementations produce the same edit path.

        For every entry of TEXTS the classical Levenshtein matrix is filled
        into four matrix implementations.  For each of them the path from
        fMinPath must equal the path from fLevPath, and
        operations_to_text(path, s1, s2) must equal s2.  Finally the four
        paths are compared with each other.
        """
        # Factories instead of instances: a new matrix is needed for every
        # text pair, and the subset variant is sized by the first text.
        matrix_factories = (
            lambda text: DistanceMatrix(),
            lambda text: UprightDistanceMatrix(),
            lambda text: HybridSubsetDistanceMatrix(),
            lambda text: UprightSubsetDistanceMatrix(len(text)),
        )

        for pair in TEXTS:
            source, target = pair[0], pair[1]

            paths = []
            for make_matrix in matrix_factories:
                matrix = make_matrix(source)
                fClassicalLevDistance().fill_matrix(source, target, matrix)
                path = fMinPath()(matrix)
                self.assertEqual(path, fLevPath()(matrix))
                self.assertEqual(
                    target, operations_to_text(path, source, target))
                paths.append(path)

            # Comparing neighbours (1-2, 2-3, 3-4) ties all four together.
            for earlier, later in zip(paths, paths[1:]):
                self.assertEqual(earlier, later)
Example #2
0
    def test_lev5(self):
        """Verify path agreement across the four distance matrix variants.

        Each TEXTS entry supplies two strings.  The classical Levenshtein
        matrix is filled into every matrix variant; the fMinPath result has
        to match fLevPath and operations_to_text must return the second
        string.  The resulting paths must also be equal to one another.
        """
        def checked_path(matrix, first, second):
            # Fill the given matrix and run the per-matrix assertions.
            fClassicalLevDistance().fill_matrix(first, second, matrix)
            found = fMinPath()(matrix)
            self.assertEqual(found, fLevPath()(matrix))
            self.assertEqual(second, operations_to_text(found, first, second))
            return found

        for entry in TEXTS:
            first, second = entry[0], entry[1]

            plain = checked_path(DistanceMatrix(), first, second)
            upright = checked_path(UprightDistanceMatrix(), first, second)
            hybrid = checked_path(HybridSubsetDistanceMatrix(), first, second)
            # The upright subset matrix is constructed from the first length.
            subset = checked_path(
                UprightSubsetDistanceMatrix(len(first)), first, second)

            self.assertEqual(plain, upright)
            self.assertEqual(upright, hybrid)
            self.assertEqual(hybrid, subset)
Example #3
0
def step5_calc_minpath_dt(p_txt_folder='tnt',
                          o_txt_folder='txt',
                          size_limit=-1):
    """Print how far each minimal edit path deviates from the diagonal.

    For every item yielded by ``gen_string_pairs(p_txt_folder,
    o_txt_folder)`` an ``UprightDistanceMatrix`` is filled with
    ``fLevDistance2`` and the path returned by ``fMinPath`` is compared,
    column by column, with the row ``(col * rows) // cols``.  One table row
    is printed per pair, followed by the extremes over all pairs.

    Args:
        p_txt_folder: processed text folder, forwarded to gen_string_pairs.
        o_txt_folder: original text folder, forwarded to gen_string_pairs.
        size_limit: when positive, pairs in which either text is longer
            than this many characters are skipped.

    Output uses single-argument ``print(...)`` so the text is the same as
    the former print statements on Python 2 while also parsing on Python 3.
    """
    upper_devs = []  # per pair: largest deviation >= 0
    lower_devs = []  # per pair: smallest deviation <= 0
    skipped = []     # file names of pairs dropped because of size_limit

    print(":: Calculate minimal path deviation .")
    print("     :: processed text folder: %s" % (p_txt_folder,))
    print("     :: original  text folder: %s" % (o_txt_folder,))
    print("-" * 64)
    print("   Deviation   | Text1 size | Text2 size | Text diff  | Distance | ")
    print("-" * 64)
    for s in gen_string_pairs(p_txt_folder, o_txt_folder):
        text1, text2 = s[0], s[1]
        # s[3] is what the table prints as the file name; s[0] is the text
        # itself, so it must not be used to name a skipped entry.
        name = os.path.basename(s[3])

        if size_limit > 0 and (len(text1) > size_limit
                               or len(text2) > size_limit):
            skipped.append(name)
            continue

        m = UprightDistanceMatrix()
        fLevDistance2().fill_matrix(text1, text2, m)
        path = fMinPath()(m)
        # Alternative path builder: fLevPath(insPriority=1)(m)

        # Walk columns from last to first, consuming path cells in order.
        # Assumes the path is a sequence of (row, col) cells ordered by
        # decreasing column -- TODO confirm against fMinPath.
        pos = 0
        upper = lower = 0
        for col in range(m.cols - 1, -1, -1):
            diag_row = (col * m.rows) // m.cols
            while pos < len(path) and path[pos][1] == col:
                dev = diag_row - path[pos][0]
                upper = max(upper, dev)
                lower = min(lower, dev)
                pos += 1

        upper_devs.append(upper)
        lower_devs.append(lower)

        # Joined with single spaces, exactly as a comma-separated Python 2
        # print statement would emit the pieces.
        print(" ".join((
            " [%4d ..%3d] " % (lower, upper),
            "|   %6d  " % len(text1),
            "|  %6d  " % len(text2),
            " |   %3d    " % (len(text1) - len(text2)),
            " |   %3d   " % m.get(m.rows - 1, m.cols - 1),
            "| %s " % name,
        )))

    print("=" * 52)
    if upper_devs:
        print(" Maximal deviation = [ %s  ..  %s ]"
              % (min(lower_devs), max(upper_devs)))
    else:
        # Nothing was processed (no pairs, or all skipped): avoid indexing
        # into empty result lists.
        print(" Maximal deviation = n/a (no text pairs processed)")
    print("=" * 52)
    if skipped:
        print(" Skipped %d pair(s) over size limit: %s"
              % (len(skipped), ", ".join(skipped)))
Example #4
0
def step5_calc_minpath_dt(p_txt_folder='tnt', o_txt_folder='txt', size_limit=-1):
    """Report the deviation of each minimal edit path from the diagonal.

    Every item from ``gen_string_pairs(p_txt_folder, o_txt_folder)`` is
    turned into an ``UprightDistanceMatrix`` filled by ``fLevDistance2``.
    The ``fMinPath`` result is then compared per column with the row
    ``(col * rows) // cols``.  A table row is printed for each pair and a
    summary with the overall extremes is printed at the end.

    Args:
        p_txt_folder: processed text folder, forwarded to gen_string_pairs.
        o_txt_folder: original text folder, forwarded to gen_string_pairs.
        size_limit: when positive, a pair is skipped if either text is
            longer than this many characters.

    All output goes through single-argument ``print(...)``, which emits the
    same text as the previous print statements on Python 2 and is also
    valid Python 3.
    """
    positive_extremes = []  # per pair: largest deviation >= 0
    negative_extremes = []  # per pair: smallest deviation <= 0
    skipped = []            # file names of pairs over size_limit

    print(":: Calculate minimal path deviation .")
    print("     :: processed text folder: %s" % (p_txt_folder,))
    print("     :: original  text folder: %s" % (o_txt_folder,))
    print("-" * 64)
    print("   Deviation   | Text1 size | Text2 size | Text diff  | Distance | ")
    print("-" * 64)
    for s in gen_string_pairs(p_txt_folder, o_txt_folder):
        text1, text2 = s[0], s[1]
        # The file name lives in s[3] (it is what the table prints);
        # s[0] holds the text and is useless as a name for skipped pairs.
        name = os.path.basename(s[3])

        too_long = len(text1) > size_limit or len(text2) > size_limit
        if size_limit > 0 and too_long:
            skipped.append(name)
            continue

        m = UprightDistanceMatrix()
        fLevDistance2().fill_matrix(text1, text2, m)
        path = fMinPath()(m)
        # Alternative path builder: fLevPath(insPriority=1)(m)

        # Columns are visited from last to first while path cells are
        # consumed in order.  Assumes path cells are (row, col) ordered by
        # decreasing column -- TODO confirm against fMinPath.
        pos = 0
        highest = lowest = 0
        for col in range(m.cols - 1, -1, -1):
            diag_row = (col * m.rows) // m.cols
            while pos < len(path) and path[pos][1] == col:
                dev = diag_row - path[pos][0]
                highest = max(highest, dev)
                lowest = min(lowest, dev)
                pos += 1

        positive_extremes.append(highest)
        negative_extremes.append(lowest)

        # Pieces are joined by one space each, matching what a
        # comma-separated Python 2 print statement writes.
        pieces = (
            " [%4d ..%3d] " % (lowest, highest),
            "|   %6d  " % len(text1),
            "|  %6d  " % len(text2),
            " |   %3d    " % (len(text1) - len(text2)),
            " |   %3d   " % m.get(m.rows - 1, m.cols - 1),
            "| %s " % name,
        )
        print(" ".join(pieces))

    print("=" * 52)
    if positive_extremes:
        print(" Maximal deviation = [ %s  ..  %s ]"
              % (min(negative_extremes), max(positive_extremes)))
    else:
        # No pair was processed: the result lists are empty, so there is
        # nothing to index into.
        print(" Maximal deviation = n/a (no text pairs processed)")
    print("=" * 52)
    if skipped:
        print(" Skipped %d pair(s) over size limit: %s"
              % (len(skipped), ", ".join(skipped)))
Example #5
0
    def test_ldistance(self):
        """Exercise the Levenshtein functors, their matrices and the min path.

        Checks the scalar distances of fClassicalLevDistance and
        fLevDistance, the matrix both of them fill for "Texmt" / "sdText",
        the cells yielded by fMinPath for that matrix, and the boolean
        result of fDistanceMatch(fLevDistance).
        """
        distance_cases = (("Text", "Text", 0), ("Texmt", "sdText", 3))
        for make_distance in (fClassicalLevDistance, fLevDistance):
            for left, right, expected in distance_cases:
                # A new functor instance is created for every comparison.
                self.assertEqual(make_distance()(left, right), expected)

        expected_matrix = [
            [1, 2, 2, 3, 4, 5],
            [2, 2, 3, 2, 3, 4],
            [3, 3, 3, 3, 2, 3],
            [4, 4, 4, 4, 3, 3],
            [5, 5, 5, 5, 4, 3],
        ]

        def assert_matrix(matrix):
            # Compare every expected cell with matrix.get(row, col).
            for row, expected_row in enumerate(expected_matrix):
                for col, expected_cell in enumerate(expected_row):
                    self.assertEqual(expected_cell, matrix.get(row, col))

        classical = DistanceMatrix()
        fClassicalLevDistance().fill_matrix("Texmt", "sdText", classical)
        assert_matrix(classical)

        # Here the return value of fill_matrix is used as the matrix.
        m = fLevDistance().fill_matrix("Texmt", "sdText", DistanceMatrix())
        assert_matrix(m)

        expected_path = [(4, 5), (3, 4), (2, 4), (1, 3), (0, 2), (0, 1), (0, 0)]
        for index, cell in enumerate(fMinPath()(m)):
            self.assertEqual(cell, expected_path[index])

        for candidate, should_match in (("Text 2", True), ("Tessxt 2", False)):
            matcher = fDistanceMatch(fLevDistance)
            self.assertEqual(matcher("Text 1", candidate), should_match)
Example #6
0
    def test_ldistance(self):
        """Test distances, filled matrices, the min path and distance match.

        The same expectations are applied to fClassicalLevDistance and
        fLevDistance: distance 0 for identical strings, distance 3 for
        "Texmt" / "sdText", and an identical filled matrix.  The fMinPath
        cells and the fDistanceMatch(fLevDistance) results are checked too.
        """
        for functor in (fClassicalLevDistance, fLevDistance):
            # Each assertion gets its own freshly built functor.
            self.assertEqual(functor()("Text", "Text"), 0)
            self.assertEqual(functor()("Texmt", "sdText"), 3)

        em = [[1, 2, 2, 3, 4, 5],
              [2, 2, 3, 2, 3, 4],
              [3, 3, 3, 3, 2, 3],
              [4, 4, 4, 4, 3, 3],
              [5, 5, 5, 5, 4, 3]]
        # Flattened (row, col, value) triples, in row-major order.
        expected_cells = [(r, c, value)
                          for r, row_values in enumerate(em)
                          for c, value in enumerate(row_values)]

        filled = DistanceMatrix()
        fClassicalLevDistance().fill_matrix("Texmt", "sdText", filled)
        for r, c, value in expected_cells:
            self.assertEqual(value, filled.get(r, c))

        # fill_matrix's return value is taken as the matrix from here on.
        m = fLevDistance().fill_matrix("Texmt", "sdText", DistanceMatrix())
        for r, c, value in expected_cells:
            self.assertEqual(value, m.get(r, c))

        mp = [(4, 5), (3, 4), (2, 4), (1, 3), (0, 2), (0, 1), (0, 0)]
        for position, step in enumerate(fMinPath()(m)):
            self.assertEqual(step, mp[position])

        near_match = fDistanceMatch(fLevDistance)
        self.assertEqual(near_match("Text 1", "Text 2"), True)

        far_match = fDistanceMatch(fLevDistance)
        self.assertEqual(far_match("Text 1", "Tessxt 2"), False)
Example #7
0
def step4_calc_minpath(p_txt_folder="tnt", o_txt_folder="txt"):
    """Print the minimal edit path for every text pair.

    For each item yielded by ``gen_string_pairs(p_txt_folder,
    o_txt_folder)`` an ``UprightDistanceMatrix`` is filled with
    ``fLevDistance2`` from the item's first two elements, and the result of
    ``fMinPath`` on that matrix is printed.

    Args:
        p_txt_folder: first folder passed to gen_string_pairs; defaults to
            the previously hard-coded "tnt".
        o_txt_folder: second folder passed to gen_string_pairs; defaults to
            the previously hard-coded "txt".
    """
    for pair in gen_string_pairs(p_txt_folder, o_txt_folder):
        matrix = UprightDistanceMatrix()
        fLevDistance2().fill_matrix(pair[0], pair[1], matrix)
        # Single-argument print(...) emits the same text as the print
        # statement on Python 2 and is also valid on Python 3.
        print(fMinPath()(matrix))
Example #8
0
# Wall-clock start; read again at the bottom for the "Elapsed time" report.
t = time.time()

#prepare_data()


# Pipeline steps; all of them are currently commented out.
#step1_fetch_text(os.path.join(data_dir, "xml"))
#step2_run_tnt(os.path.join(data_dir, "txt"))
#step3_calc_lev_distance()
#step4_calc_minpath()
#step5_calc_minpath_dt()
#step6_cleantnt_txt()
#step5_calc_minpath_dt("cleaned-tnt", "txt", size_limit=1000)



# NOTE: fLevDistanceDiag is only referenced by the commented-out call below.
from gravity.tae.match.lev_distance import  fLevDistanceDiag
#s1 = "Song about   Alice dream"
#s2 = "Song al?ce dream" 

# Sample strings for the ad-hoc matrix dump that follows.
s1 = "  abc d dfg  rer klm"
s2 = "abc dfg  klm" 


# NOTE(review): the meaning of the 111 argument is not visible here --
# confirm against DistanceMatrix.
m = DistanceMatrix(111)
#fLevDistanceDiag(1).fill_matrix(s2, s1, m)
# Argument order is s2 first, s1 second, both here and in toString below.
fLevDistance().fill_matrix(s2, s1, m)
# Print the matrix rendering together with the path computed by fMinPath.
print m.toString(s2, s1, fMinPath()(m))


# Seconds elapsed since t was taken at the top of the script.
print "Elapsed time: ", (time.time()  - t)
Example #9
0
def step4_calc_minpath(p_txt_folder="tnt", o_txt_folder="txt"):
    """Compute and print the minimal edit path of each text pair.

    Iterates over ``gen_string_pairs(p_txt_folder, o_txt_folder)``; for
    each item, fills a new ``UprightDistanceMatrix`` via ``fLevDistance2``
    using the item's first two elements and prints what ``fMinPath``
    returns for that matrix.

    Args:
        p_txt_folder: first folder given to gen_string_pairs.  The default
            "tnt" is the value that used to be hard-coded.
        o_txt_folder: second folder given to gen_string_pairs.  The default
            "txt" is the value that used to be hard-coded.
    """
    for texts in gen_string_pairs(p_txt_folder, o_txt_folder):
        distance_matrix = UprightDistanceMatrix()
        fLevDistance2().fill_matrix(texts[0], texts[1], distance_matrix)
        min_path = fMinPath()(distance_matrix)
        # print(...) with one argument behaves like the print statement on
        # Python 2 and also parses on Python 3.
        print(min_path)
Example #10
0
    step1_fetch_text(os.path.join(data_dir, "xml"))
    step2_run_tnt(os.path.join(data_dir, "txt"))
    step6_cleantnt_txt()


# Wall-clock start; read again at the bottom for the "Elapsed time" report.
t = time.time()

#prepare_data()

# Pipeline steps; all of them are currently commented out.
#step1_fetch_text(os.path.join(data_dir, "xml"))
#step2_run_tnt(os.path.join(data_dir, "txt"))
#step3_calc_lev_distance()
#step4_calc_minpath()
#step5_calc_minpath_dt()
#step6_cleantnt_txt()
#step5_calc_minpath_dt("cleaned-tnt", "txt", size_limit=1000)

# NOTE: fLevDistanceDiag is only referenced by the commented-out call below.
from gravity.tae.match.lev_distance import fLevDistanceDiag
#s1 = "Song about   Alice dream"
#s2 = "Song al?ce dream"

# Sample strings for the ad-hoc matrix dump that follows.
s1 = "  abc d dfg  rer klm"
s2 = "abc dfg  klm"

# NOTE(review): the meaning of the 111 argument is not visible here --
# confirm against DistanceMatrix.
m = DistanceMatrix(111)
#fLevDistanceDiag(1).fill_matrix(s2, s1, m)
# Argument order is s2 first, s1 second, both here and in toString below.
fLevDistance().fill_matrix(s2, s1, m)
# Print the matrix rendering together with the path computed by fMinPath.
print m.toString(s2, s1, fMinPath()(m))

# Seconds elapsed since t was taken at the top of the script.
print "Elapsed time: ", (time.time() - t)