def dfs(self, node, esp=0, maxDist=0.3):
    """
    Walk the tree depth-first, looking for adjacent child nodes whose
    edit distance is within maxDist and grouping them into the same
    component.

    For every node visited, a list with one entry per child is stored in
    ``node.result``: ``False`` when the child was not grouped with a
    neighbour, otherwise the value of ``self._comp`` at the time it was
    grouped. Children left ungrouped are visited recursively.

    @param Node node: current node of the dfs
    @param int esp: tree level
    @param float maxDist: maximum proportion of difference for two
        adjacent nodes to be grouped into the same component
    """
    # NOTE(review): s1/s2 look like scratch variables, but they are kept as
    # module globals because code outside this block may read them - confirm
    # and make them local if nothing does.
    global s1
    global s2
    s1 = ""
    s2 = ""
    children = node.childNodes
    result = [False] * len(children)
    in_group = False

    for x, c in enumerate(children):
        similar = False
        # The first child has no left neighbour, and children with
        # height < 3 are never grouped.
        if x != 0 and c.height >= 3:
            s2 = c.str
            # Cheap length pre-check before computing the edit distance.
            if (len(s1) <= len(s2) * (1.0 + maxDist) and
                    len(s2) <= len(s1) * (1.0 + maxDist)):
                longest = max(len(s1), len(s2))
                if longest:
                    ratio = float(stringDistance(s1, s2)) / longest
                else:
                    # Two empty strings are identical; avoid dividing 0 by 0.
                    ratio = 0.0
                similar = ratio <= maxDist

        # The current child becomes the left neighbour of the next one.
        s1 = c.str
        if similar:
            result[x] = self._comp
            if not in_group:
                # First pair of a run: the left neighbour joins it too.
                result[x - 1] = self._comp
            in_group = True
        else:
            in_group = False
            self._comp += 1

    node.result = result

    for x, c in enumerate(children):
        if not result[x]:
            # Propagate maxDist so the whole traversal uses the caller's
            # threshold instead of falling back to the default.
            self.dfs(c, esp + 1, maxDist)
def test_stringDistance(self):
    """ Test if costs and distance value are correct """
    print('\n - Test: utils.distances.stringDistance()')
    add = 1
    change = 2
    delete = 4
    # Cost table handed to stringDistance: 'a'dd, 'd'elete, 'c'hange.
    cost = {'a': add, 'd': delete, 'c': change}

    # (first string, second string, expected distance)
    cases = [
        ('aa', 'aba', add),
        ('a', 'ab', add),
        ('a', 'ba', add),
        ('a', 'b', change),
        ('ba', 'a', delete),
        ('aba', 'aa', delete),
    ]
    for number, (first, second, expected) in enumerate(cases, 1):
        dist = distances.stringDistance(first, second, cost)
        # Report the value actually asserted against, so a failure message
        # shows the real expectation for that case.
        self.assertEqual(dist, expected,
                         '%d: %d!=%d' % (number, dist, expected))
def match(node, maxDist=0, height=3, tags=False, printtag=False):
    """
    Group adjacent, similar children of ``node`` into numbered components.

    Each child is compared with its left neighbour using the edit distance
    divided by the length of the longer of the two strings. Children whose
    ratio is at most ``maxDist`` share a component number.

    @param node: object exposing ``childNodes``; each child exposes
        ``height`` plus ``str`` and/or ``tags``
    @param maxDist: largest distance/length ratio still considered a match
    @param height: children with a smaller ``height`` are never matched
    @param tags: compare ``child.tags`` instead of ``child.str``
    @param printtag: print every compared pair
    @return: list with one entry per child: ``False`` when the child is not
        part of any component, otherwise its component number (1 or more)
    """
    global __debug
    if __debug:
        print("Debug mode: eri.utils.match.match()")

    children = node.childNodes
    result = [False] * len(children)
    in_group = False   # True while the previous child belongs to a run
    comp = 0
    prev = ""

    for x, c in enumerate(children):
        cur = c.tags if tags else c.str
        similar = False

        # The first child has no left neighbour, and low children are
        # never matched.
        if x != 0 and c.height >= height:
            if printtag or __debug:
                print('str: %s %s' % (prev, cur))

            # NOTE(review): with `or` this length pre-check is always true
            # for maxDist >= 0 (one length is always <= the other), so it
            # filters nothing. Confirm whether `and` was intended; it is
            # left untouched because switching would change which pairs
            # match.
            if (len(prev) <= len(cur) * (1.0 + maxDist) or
                    len(cur) <= len(prev) * (1.0 + maxDist)):
                longest = max(len(prev), len(cur))
                if longest:
                    d = float(stringDistance(prev, cur)) / longest
                else:
                    # Two empty strings are identical; avoid dividing 0 by 0.
                    d = 0.0
                if __debug:
                    print('distance: %s' % (d,))
                similar = d <= maxDist

        # The current child becomes the left neighbour of the next one.
        prev = cur
        if similar:
            result[x] = comp
            if not in_group:
                # First pair of a run: the left neighbour joins it too.
                result[x - 1] = comp
            in_group = True
        else:
            in_group = False
            comp += 1

    if __debug:
        print("return: eri.utils.match.match()")
    return result
from eri.utils.distances import stringDiff, stringDistance

# Fixed pair of overlapping text fragments used as the workload.
left_text = "eturn one that starts earliest in"
right_text = "starts earliest in a, and of all those maximal matching"

# Number of repetitions (same count as iterating from 1 up to 9999).
rounds = 9999

# Call both routines repeatedly on the same inputs; results are discarded.
# NOTE(review): both calls are assumed to belong to the loop body - confirm.
for _ in xrange(rounds):
    stringDiff(left_text, right_text)
    stringDistance(left_text, right_text)