Ejemplo n.º 1
0
def file(files, extractor, configurator, output=None):
    """
    Roda uma extrator com as configuracoes especificadas para um conjunto de arquivos

    @param extractor: Extrator to extract information from dom tree
    @param configurator: Configurator to provide the information to run benchmark
    @param output: std to print output messagens
    """
    import urllib
    parser = ParseDom()
    marker = configurator.marker()
    metric = configurator.metric()

    proof = configurator.proof()

    print "Doc\tPre\tRec\tlext\tlpro\tfile_name"
    for id, filePath in enumerate(files):
        fileName = filePath.split('/')[-1]
        marker.reset()
        if filePath[0:4] == "http":
            htmlString = urllib.urlopen(filePath).read()
        else:
            htmlString = open(filePath, 'r').read()

        dom = parser.parse(htmlString)
        p = proof.getProof(dom)
        r = extractor.process(dom, marker)

        v = 0
        t = len(p)
        if marker.labels.has_key('table'):
            v = len(marker.labels['table'])

        if t == 0:
            if v == 0:
                x = (1,1)
            else:
                x = (0,1)

        elif v > 0:
            x = metric.process(marker.labels['table'], p)

        precision = x[0]
        recall = x[1]
        print "%d\t%.02f\t%0.2f\t%d\t%d\t%s" % \
            (id+1, precision, recall, v, t, fileName)

        if output:
            out = open('out/%d.html' % (id+1), 'w')
            print >>out, r
        else:
            pass
Ejemplo n.º 2
0
#            print 'find productlist'
            self.labels['productlist'].append(node)
        elif node.hasAttribute('proof_product') and \
          node.getAttribute('proof_product') == 'true':
            self.labels['product'].append(node)

        for child in node.childNodes:
            self.dfs(child)

    def getProof(self, dom):
        """
        Rebuild the label collection for ``dom`` and return it.

        Calls ``self.__resetLabels()``, walks ``dom`` with ``self.dfs`` and
        then returns ``self.labels``.
        """
        self.__resetLabels()
        self.dfs(dom)
        collected = self.labels
        return collected

if __name__ == "__main__":
    from eri.utils.parsedom import ParseDom
    import sys
    if len(sys.argv) < 2:
        print "erro"
    else:

        p = ParseDom()

        d = p.parse(sys.argv[1])
 #       c = CeProof()
 #       c.getProof(d)

Ejemplo n.º 3
0
if __name__ == '__main__':
    # Command-line entry point: run the Coloring extractor on one document
    # and print the result.
    #
    #   usage: <script> <URI> [output_file]
    #     URI          local path, or a location starting with "http"
    #     output_file  optional; the result goes to stdout when omitted

    from eri.utils.parsedom import ParseDom
    from eri.marker import Marker

    if len(sys.argv) < 2:
        raise SystemExit("use: %s <URI> [output_file]" % sys.argv[0])

    filePath = sys.argv[1]

    # Only a file opened here is closed in the finally clause below;
    # sys.stdout is left open.
    writeToFile = len(sys.argv) > 2
    if writeToFile:
        out = open(sys.argv[2], 'w')
    else:
        out = sys.stdout

    try:
        if filePath[0:4] == "http":
            import urllib
            # urllib.urlopen objects are not context managers in Python 2.
            response = urllib.urlopen(filePath)
            try:
                htmlString = response.read()
            finally:
                response.close()
        else:
            with open(filePath, 'r') as source:
                htmlString = source.read()

        marker = Marker()
        parser = ParseDom()
        dom = parser.parse(htmlString)

        extractor = Coloring()
        result = extractor.process(dom, marker)
        print >> out, result
    finally:
        if writeToFile:
            out.close()

Ejemplo n.º 4
0
if __name__ == '__main__':
    # Command-line entry point: run the Table extractor on one document and
    # print the result.
    #
    #   usage: <script> <URI> [output_file]
    #     URI          local path, or a location starting with "http"
    #     output_file  optional; the result goes to stdout when omitted

    from eri.utils.parsedom import ParseDom
    from eri.markercoloring import MarkerColoring as Marker

    if len(sys.argv) < 2:
        raise SystemExit("use: %s <URI> [output_file]" % sys.argv[0])

    filePath = sys.argv[1]

    # Only a file opened here is closed in the finally clause below;
    # sys.stdout is left open.
    writeToFile = len(sys.argv) > 2
    if writeToFile:
        out = open(sys.argv[2], 'w')
    else:
        out = sys.stdout

    try:
        if filePath[0:4] == "http":
            import urllib
            # urllib.urlopen objects are not context managers in Python 2.
            response = urllib.urlopen(filePath)
            try:
                htmlString = response.read()
            finally:
                response.close()
        else:
            with open(filePath, 'r') as source:
                htmlString = source.read()

        marker = Marker()
        parser = ParseDom()
        # Parse the markup loaded above; previously filePath was passed and
        # htmlString was never used.
        # NOTE(review): confirm ParseDom.parse() expects markup, not a path.
        dom = parser.parse(htmlString)

        extractor = Table()
        result = extractor.process(dom, marker)
        print >> out, result
    finally:
        if writeToFile:
            out.close()