Ejemplo n.º 1
0
    def dfs(self, node, esp=0, maxDist=0.3):
        """
        Walk the tree depth-first, grouping adjacent sibling nodes whose
        normalised edit distance is at most maxDist into the same component.

        Every child of ``node`` is compared with the sibling immediately
        before it. Children that match receive the current component id
        (``self._comp``) in ``node.result``; children left as ``False`` are
        then visited recursively with the same ``maxDist``.

        @param Node node: Current node of the traversal
        @param int esp: Level in the tree (only forwarded to recursive calls)
        @param float maxDist: Maximum proportion of difference for two
        siblings to be grouped into the same component
        """
        # NOTE(review): s1/s2 are kept as module globals only because code
        # outside this view may read them; nothing in this method needs them
        # to be global.
        global s1
        global s2
        s1 = ""
        s2 = ""

        children = node.childNodes
        result = [False] * len(children)
        match = False

        for x, c in enumerate(children):
            if x == 0 or c.height < 3:
                # The first child has no predecessor, and children with
                # height below 3 are never compared against theirs.
                similar = False
            else:
                s2 = c.str
                # Cheap length pre-filter before computing the edit distance.
                similar = (len(s1) <= len(s2) * (1.0 + maxDist) and
                           len(s2) <= len(s1) * (1.0 + maxDist))
                if similar:
                    longest = max(len(s1), len(s2))
                    # Two empty strings are identical; avoid dividing by zero.
                    ratio = (float(stringDistance(s1, s2)) / longest
                             if longest else 0.0)
                    similar = ratio <= maxDist

            # The current child is always the reference for the next sibling.
            s1 = c.str

            if similar:
                result[x] = self._comp
                if not match:
                    # First match of a run: the predecessor joins it too.
                    result[x - 1] = self._comp
                match = True
            else:
                match = False
                self._comp += 1

        node.result = result
        for child, label in zip(children, result):
            if not label:
                # Forward maxDist so the caller's threshold is honoured at
                # every depth, not just at the top level.
                self.dfs(child, esp + 1, maxDist)
Ejemplo n.º 2
0
    def test_stringDistance(self):
        """
        Check that stringDistance applies the per-operation costs and returns
        the expected distance for a set of single-edit string pairs.
        """
        print('\n - Test: utils.distances.stringDistance()')
        add = 1
        change = 2
        delete = 4
        cost = { 'a': add, 'd': delete, 'c': change }

        # (source, target, expected distance); the position in this list is
        # the case number shown in the failure message.
        cases = [
            ('aa', 'aba', add),
            ('a', 'ab', add),
            ('a', 'ba', add),
            ('a', 'b', change),
            ('ba', 'a', delete),
            ('aba', 'aa', delete),
        ]

        for number, (source, target, expected) in enumerate(cases, 1):
            dist = distances.stringDistance(source, target, cost)
            # Report the value this case actually expects, so a failure
            # message is not misleading.
            self.assertEqual(dist, expected,
                             '%d: %d!=%d' % (number, dist, expected))
Ejemplo n.º 3
0
def match(node, maxDist=0, height=3, tags=False, printtag=False):
    global __debug
    if __debug:
        print "Debug mode: eri.utils.match.match()"

    s1 = ""
    s2 = ""
    result = [False] * len(node.childNodes)
    match = False
    _comp = 0

    for x in xrange(0,len(node.childNodes)):
        c = node.childNodes[x]

        #primary test to not match low height
        if x == 0 or c.height < height:
            s1 = c.tags if tags else c.str
            match = False
            _comp += 1
            continue

        s2 = c.tags if tags else c.str

        if printtag or __debug:
            print 'str:', s1, s2

        # match test
        if len(s1) <= (len(s2) * (1.0+maxDist)) or\
          len(s2) <= (len(s1) * (1.0+maxDist)):
            d = float(stringDistance(s1,s2))/max(len(s1), len(s2))
            if __debug:
                print 'distance:', d
            #save match component
            if d <= maxDist :
                s1 = c.tags if tags else c.str
                result[x] = _comp
                result[x-1] = _comp if not match else result[x-1]
                match = True
                continue

        # not match
        s1 = c.tags if tags else c.str
        match = False
        _comp += 1
    #for

    if __debug:
        print "return: eri.utils.match.match()"

    return result
Ejemplo n.º 4
0
from eri.utils.distances import stringDiff, stringDistance
# Fixed pair of input strings used for every call in the loop below.
a = "eturn one that starts earliest in"
b = "starts earliest in a, and of all those maximal matching"

# Call both helpers 9999 times (xrange(1, 10000)) on the same pair and
# discard the results.
# NOTE(review): presumably a timing/profiling driver - confirm.
for i in xrange(1, 10000):
    stringDiff(a,b)
    stringDistance(a,b)