def feed_page(self, name, data):
    """Feed one crawled page into the analyzer.

    If *name* equals ``self.linkinfo``, *data* holds one repr'd
    ``(name, strings)`` tuple per line; these are loaded into
    ``self.dic``.  Otherwise *data* is raw HTML for the page *name*
    (format ``baseid/url``): links are extracted with HTMLLinkFinder /
    HTMLParser3 and, when the ACL allows the page, its parsed tree is
    added to ``self.analyzer``.
    """
    # Local imports keep this fix self-contained within the block.
    import ast
    import sys
    if name == self.linkinfo:
        sys.stderr.write('Loading: %r\n' % name)
        for line in data.split('\n'):
            if line:
                # SECURITY: the line originates from crawled data, so
                # use ast.literal_eval instead of eval -- it accepts
                # only plain literals (the repr'd tuples the crawler
                # writes) and cannot execute arbitrary code.
                (name, strs) = ast.literal_eval(line)
                self.dic[name] = strs
    else:
        try:
            n = name.index('/')
        except ValueError:
            # No "baseid/" prefix: nothing we can do with this entry.
            return
        base_href = 'http://' + name[n + 1:]
        # NOTE(review): this condition looks odd -- 'if not self.baseid:'
        # may have been intended; behavior preserved as-is.
        if not self.linkinfo:
            self.baseid = name[:n]
        handler = HTMLLinkFinder(self, base_href, self)
        parser = HTMLParser3(handler, charset=self.default_charset)
        parser.feed_byte(data)
        parser.close()
        if not self.acldb or self.acldb.allowed(name):
            tree = parse(data, charset=self.default_charset, base_href=base_href)
            n = self.analyzer.add_tree(name, tree)
            sys.stderr.write('Added: %d: %s\n' % (n, name))
        else:
            sys.stderr.write('Skipped: %s\n' % name)
    return
def feed_page(self, name, data):
    """Feed one crawled page into the analyzer.

    NOTE(review): this definition duplicates an earlier ``feed_page``
    in the same file; consider removing one of the two.

    The special ``self.linkinfo`` entry carries repr'd
    ``(name, strings)`` tuples, one per line, loaded into ``self.dic``.
    Any other *name* is raw HTML: its links are scanned with
    HTMLLinkFinder / HTMLParser3 and, if the ACL permits, the parsed
    tree is handed to ``self.analyzer``.
    """
    # Local imports keep this fix self-contained within the block.
    import ast
    import sys
    if name == self.linkinfo:
        sys.stderr.write('Loading: %r\n' % name)
        for line in data.splitlines():
            if not line:
                continue
            # SECURITY: crawled data -- accept only plain literals
            # (ast.literal_eval) rather than eval'ing arbitrary code.
            entry, strs = ast.literal_eval(line)
            self.dic[entry] = strs
        return
    try:
        slash = name.index('/')
    except ValueError:
        # No "baseid/" prefix: entry cannot be processed.
        return
    base_href = 'http://' + name[slash + 1:]
    # NOTE(review): 'if not self.baseid:' may have been intended here;
    # original condition preserved.
    if not self.linkinfo:
        self.baseid = name[:slash]
    handler = HTMLLinkFinder(self, base_href, self)
    parser = HTMLParser3(handler, charset=self.default_charset)
    parser.feed_byte(data)
    parser.close()
    if not self.acldb or self.acldb.allowed(name):
        tree = parse(data, charset=self.default_charset, base_href=base_href)
        count = self.analyzer.add_tree(name, tree)
        sys.stderr.write('Added: %d: %s\n' % (count, name))
    else:
        sys.stderr.write('Skipped: %s\n' % name)
    return
def __iter__(self):
    """Analyze the pages behind ``self.previous`` and yield enriched items.

    Pass 1: feed every eligible page into a LayoutAnalyzer via a
    PageFeeder.  Items that are disabled, fail ``self.condition``,
    carry a ``_template`` marker, or have no HTML are yielded
    untouched.  Pass 2: cluster the fed pages into layout patterns,
    then for each collected item extract text fields matching its
    layout (``dump_text``) and merge them into the item before
    yielding it.
    """
    previous = self.previous
    # Analysis tuning knobs (kept inline, as in the original).
    (debug, cluster_threshold, title_threshold, score_threshold) = (0, 0.97, 0.6, 100)
    mangle_pat = None
    linkinfo = 'linkinfo'
    analyzer = LayoutAnalyzer(debug=debug)
    if mangle_pat:
        analyzer.set_encoder(mangle_pat)
    feeder = PageFeeder(analyzer, linkinfo=linkinfo, acldb=None,
                        default_charset=default_charset, debug=debug)
    # Pass 1: feed pages; pass everything else through unchanged.
    items = []
    for item in previous:
        content = self.getHtml(item)
        if self.disable or not self.condition(item):
            yield item
        elif item.get('_template'):
            yield item
        elif content is not None:
            feeder.feed_page(item['_site_url'] + item['_path'], content)
            items.append(item)
        else:
            yield item
    feeder.close()
    if items:
        # Pass 2: cluster layouts and keep only confident patterns.
        self.clusters = {}
        clusters = analyzer.analyze(cluster_threshold, title_threshold)
        patternset = LayoutPatternSet()
        patternset.pats = [c for c in clusters
                           if c.pattern and score_threshold <= c.score]
        pat_threshold = 0.8
        self.debug = 0
        strict = True
        for item in items:
            content = self.getHtml(item)
            name = item['_site_url'] + item['_path']
            if name == 'linkinfo':
                continue
            tree = parse(content, charset=default_charset)
            (pat1, layout) = patternset.identify_layout(tree, pat_threshold,
                                                        strict=strict)
            etree = lxml.html.fromstring(content)
            newfields = self.dump_text(name, pat1, layout, etree, item['_path'])
            if newfields:
                # BUG FIX: the format string was "PASS: '******', ..."
                # -- one placeholder for two arguments, so the logging
                # call could never format.  Restored the '%s'.
                self.log.info("PASS: '%s', matched=%s",
                              item.get('_path'), newfields.keys())
                # BUG FIX: update() used to run unconditionally and
                # raised TypeError when dump_text returned None.
                item.update(newfields)
            else:
                self.log.info("FAIL: '%s'", item.get('_path'))
            yield item
def analyse(self, previous):
    """Perform automatic field extraction without hints.  Deep magic.

    Pass 1: feed every page with HTML content into a LayoutAnalyzer
    via a PageFeeder; items with no HTML are yielded untouched.
    Pass 2: cluster the fed pages into layout patterns, then for each
    collected item extract matching text fields (``dump_text``), merge
    them into the item, and yield it.
    """
    # Analysis tuning knobs (kept inline, as in the original).
    (debug, cluster_threshold, title_threshold, score_threshold) = (0, 0.97, 0.6, 100)
    mangle_pat = None
    linkinfo = 'linkinfo'
    analyzer = LayoutAnalyzer(debug=debug)
    if mangle_pat:
        analyzer.set_encoder(mangle_pat)
    feeder = PageFeeder(analyzer, linkinfo=linkinfo, acldb=None,
                        default_charset=default_charset, debug=debug)
    # Pass 1: feed pages; pass items without HTML through unchanged.
    items = []
    for item in previous:
        content = self.getHtml(item)
        if content is not None:
            feeder.feed_page(item['_site_url'] + item['_path'], content)
            items.append(item)
        else:
            yield item
    feeder.close()
    # Pass 2: cluster layouts and keep only confident patterns.
    self.clusters = {}
    clusters = analyzer.analyze(cluster_threshold, title_threshold)
    patternset = LayoutPatternSet()
    patternset.pats = [c for c in clusters
                       if c.pattern and score_threshold <= c.score]
    pat_threshold = 0.8
    self.debug = 0
    strict = True
    for item in items:
        content = self.getHtml(item)
        name = item['_site_url'] + item['_path']
        if name == 'linkinfo':
            continue
        tree = parse(content, charset=default_charset)
        (pat1, layout) = patternset.identify_layout(tree, pat_threshold,
                                                    strict=strict)
        etree = lxml.html.fromstring(content)
        newfields = self.dump_text(name, pat1, layout, etree)
        # BUG FIX: update() used to run unconditionally and raised
        # TypeError when dump_text returned None.
        if newfields:
            item.update(newfields)
        yield item
def feed_page(self, name, fp):
    """Parse *fp* as HTML and hand the resulting tree to ``feed_tree``.

    The special ``self.linkinfo`` entry is skipped entirely.
    """
    if name == self.linkinfo:
        return
    tree = parse(fp, charset=self.default_charset)
    self.feed_tree(name, tree)