# NOTE(review): Python 2 source -- "print >> stderr" is Py2-only syntax.
def feed_page(self, name, data):
     # Feed one fetched page into the indexer.
     #
     # name: either the special link-info page name, or an identifier of the
     #       form "<baseid>/<url-without-scheme>" -- TODO confirm with caller.
     # data: raw page text (serialized link-info table or HTML document).
     if name == self.linkinfo:
         # The link-info page holds one eval-able "(name, strs)" tuple per line.
         print >> stderr, 'Loading: %r' % name
         for line in data.split('\n'):
             if line:
                 # SECURITY: eval() executes arbitrary code if this data is
                 # untrusted -- ast.literal_eval would be the safe choice.
                 # (Also note: rebinding "name" here shadows the parameter.)
                 (name, strs) = eval(line)
                 self.dic[name] = strs
     else:
         try:
             n = name.index('/')
         except ValueError:
             # Malformed name (no "/" separator): silently ignore the page.
             return
         # Everything after the first "/" is treated as the original host+path.
         base_href = 'http://' + name[n + 1:]
         if not self.linkinfo:
             # No link-info page configured: harvest link text on the fly.
             self.baseid = name[:n]
             handler = HTMLLinkFinder(self, base_href, self)
             parser = HTMLParser3(handler, charset=self.default_charset)
             parser.feed_byte(data)
             parser.close()
         if not self.acldb or self.acldb.allowed(name):
             tree = parse(data,
                          charset=self.default_charset,
                          base_href=base_href)
             n = self.analyzer.add_tree(name, tree)
             print >> stderr, 'Added: %d: %s' % (n, name)
         else:
             # Page rejected by the ACL database.
             print >> stderr, 'Skipped: %s' % name
     return
# Beispiel #2  (scrape artifact: "Beispiel" is German for "example")
# 0            (scrape artifact: score counter from the source page)
 def feed_page(self, name, data):
   # Duplicate of the feed_page above (scrape artifact) -- Python 2 only.
   # Feeds one page: either the serialized link-info table or an HTML page.
   if name == self.linkinfo:
     print >>stderr, 'Loading: %r' % name
     for line in data.split('\n'):
       if line:
         # SECURITY: eval() on page data runs arbitrary code if untrusted --
         # ast.literal_eval would be the safe choice.
         (name,strs) = eval(line)
         self.dic[name] = strs
   else:
     try:
       n = name.index('/')
     except ValueError:
       # No "/" separator in the name: not a page we can handle.
       return
     base_href = 'http://'+name[n+1:]
     if not self.linkinfo:
       # No link-info page configured: harvest link text from the HTML itself.
       self.baseid = name[:n]
       handler = HTMLLinkFinder(self, base_href, self)
       parser = HTMLParser3(handler, charset=self.default_charset)
       parser.feed_byte(data)
       parser.close()
     if not self.acldb or self.acldb.allowed(name):
       tree = parse(data, charset=self.default_charset, base_href=base_href)
       n = self.analyzer.add_tree(name, tree)
       print >>stderr, 'Added: %d: %s' % (n, name)
     else:
       # Page rejected by the ACL database.
       print >>stderr, 'Skipped: %s' % name
   return
    def __iter__(self):
        """Cluster pages by layout and extract fields from matching pages.

        Items that are disabled, templated, or whose HTML cannot be
        fetched are yielded unchanged; all other pages are fed to a
        LayoutAnalyzer, clustered, and re-yielded with the fields
        extracted from their matched layout merged in.
        """
        previous = self.previous
        # Analysis tuning knobs (values unchanged from the original).
        (debug, cluster_threshold, title_threshold, score_threshold) = (0, 0.97, 0.6, 100)
        mangle_pat = None
        linkinfo = 'linkinfo'
        #
        analyzer = LayoutAnalyzer(debug=debug)
        if mangle_pat:
            analyzer.set_encoder(mangle_pat)

        feeder = PageFeeder(analyzer, linkinfo=linkinfo, acldb=None,
                                default_charset=default_charset, debug=debug)

        items = []
        for item in previous:
            content = self.getHtml(item)
            if self.disable or not self.condition(item):
                yield item
            elif item.get('_template'):
                yield item
            elif content is not None:
                feeder.feed_page(item['_site_url'] + item['_path'], content)
                items.append(item)
            else:
                yield item
        # BUGFIX: close the feeder once, after ALL pages are fed.  The
        # original closed it inside the loop, i.e. right after the first
        # item, while later iterations kept calling feed_page on it.
        feeder.close()

        if items:
            self.clusters = {}
            clusters = analyzer.analyze(cluster_threshold, title_threshold)
            patternset = LayoutPatternSet()
            patternset.pats = [c for c in clusters
                               if c.pattern and score_threshold <= c.score]

        #default_charset='iso-8859-1'
        pat_threshold = 0.8
        self.debug = 0
        strict = True
        for item in items:
            content = self.getHtml(item)
            name = item['_site_url'] + item['_path']
            if name == 'linkinfo':
                continue
            tree = parse(content, charset=default_charset)
            (pat1, layout) = patternset.identify_layout(tree, pat_threshold, strict=strict)
            etree = lxml.html.fromstring(content)
            newfields = self.dump_text(name, pat1, layout, etree, item['_path'])
            if newfields:
                # BUGFIX: the original format string ("PASS: '******', ...")
                # had one %s placeholder but two arguments were passed,
                # which makes the logging call raise/record a format error.
                self.log.info("PASS: '%s', matched=%s", item.get('_path'), newfields.keys())
            else:
                self.log.info("FAIL: '%s'", item.get('_path'))

            # Guard: dump_text can return a falsy value (see FAIL branch);
            # item.update(None) would raise TypeError.
            item.update(newfields or {})
            yield item
    def analyse(self, previous):
        """
        Perform automatic field extraction without hints.

        Feeds every fetchable page into a LayoutAnalyzer, clusters the
        pages by layout, then re-yields each item with the fields
        extracted from its matched layout merged in.  Items whose HTML
        cannot be fetched are yielded unchanged.

        Deep magic.
        """
        # Analysis tuning knobs (values unchanged from the original).
        (debug, cluster_threshold, title_threshold, score_threshold) = (0, 0.97, 0.6, 100)
        mangle_pat = None
        linkinfo = 'linkinfo'
        #
        analyzer = LayoutAnalyzer(debug=debug)
        if mangle_pat:
            analyzer.set_encoder(mangle_pat)

        feeder = PageFeeder(analyzer, linkinfo=linkinfo, acldb=None,
                                default_charset=default_charset, debug=debug)

        items = []
        for item in previous:
            content = self.getHtml(item)
            if content is not None:
                feeder.feed_page(item['_site_url'] + item['_path'], content)
                items.append(item)
            else:
                yield item
        # BUGFIX: close the feeder once, after ALL pages are fed.  The
        # original closed it inside the loop, i.e. right after the first
        # item, while later iterations kept calling feed_page on it.
        feeder.close()

        self.clusters = {}
        clusters = analyzer.analyze(cluster_threshold, title_threshold)
        patternset = LayoutPatternSet()
        patternset.pats = [c for c in clusters
                           if c.pattern and score_threshold <= c.score]

        #default_charset='iso-8859-1'
        pat_threshold = 0.8
        self.debug = 0
        strict = True
        for item in items:
            content = self.getHtml(item)
            name = item['_site_url'] + item['_path']
            if name == 'linkinfo':
                continue
            tree = parse(content, charset=default_charset)
            (pat1, layout) = patternset.identify_layout(tree, pat_threshold, strict=strict)
            etree = lxml.html.fromstring(content)
            # Guard: dump_text may return a falsy value (cf. __iter__'s FAIL
            # branch); item.update(None) would raise TypeError.
            item.update(self.dump_text(name, pat1, layout, etree) or {})
            yield item
 def feed_page(self, name, fp):
     # Parse every page except the link-info pseudo-page and hand the
     # resulting tree to feed_tree.
     if name != self.linkinfo:
         tree = parse(fp, charset=self.default_charset)
         self.feed_tree(name, tree)
     return