def feed_page(self, name, data):
    """Consume one page from the crawl archive.

    Two cases:
      * *name* equals self.linkinfo: *data* is the link-info table,
        one "(name, strs)" tuple per line, loaded into self.dic.
      * otherwise *name* is expected to look like "<baseid>/<host+path>":
        the page is scanned for links, and (ACL permitting) parsed and
        fed to the analyzer.
    """
    if name == self.linkinfo:
        print >> stderr, 'Loading: %r' % name
        for line in data.split('\n'):
            if line:
                # SECURITY: eval() executes arbitrary code if the crawl
                # archive is untrusted; if these lines are plain literal
                # tuples, ast.literal_eval would be a safe replacement.
                # NOTE: this rebinds the *name* parameter on purpose -
                # it is only used as the dict key from here on.
                (name, strs) = eval(line)
                self.dic[name] = strs
    else:
        try:
            n = name.index('/')
        except ValueError:
            # Not in "<baseid>/<url>" form - silently skip the page.
            return
        # Everything after the first '/' is host+path; rebuild the URL.
        base_href = 'http://' + name[n + 1:]
        if not self.linkinfo:
            self.baseid = name[:n]
        # NOTE(review): *self* is passed twice here; presumably the
        # handler wants both a consumer and a reference-text db - confirm
        # against HTMLLinkFinder's signature.
        handler = HTMLLinkFinder(self, base_href, self)
        parser = HTMLParser3(handler, charset=self.default_charset)
        parser.feed_byte(data)
        parser.close()
        # Feed the analyzer only when no ACL is configured or it allows
        # this page.
        if not self.acldb or self.acldb.allowed(name):
            tree = parse(data, charset=self.default_charset, base_href=base_href)
            n = self.analyzer.add_tree(name, tree)
            print >> stderr, 'Added: %d: %s' % (n, name)
        else:
            print >> stderr, 'Skipped: %s' % name
    return
def parse(x, base_href=None, charset=None, stylesheet=None):
    """Parse *x* into an HTML document tree.

    *x* may be a unicode string, a byte string, or an open file-like
    object (which is closed after parsing).  Returns the element tree
    produced by HTMLDocumentBuilder.
    """
    builder = HTMLDocumentBuilder(base_href=base_href, stylesheet=stylesheet)
    parser = HTMLParser3(builder, charset=charset)
    # Dispatch on the input type; the close() that finalizes the tree
    # was previously duplicated in every branch and is hoisted here.
    if isinstance(x, unicode):
        parser.feed_unicode(x)
    elif isinstance(x, str):
        parser.feed_byte(x)
    else:
        parser.feed_file(x)
    e = parser.close()
    # File-like inputs are closed after the parser, as before.
    if not isinstance(x, basestring):
        x.close()
    return e
def parse1(self, fp, url, mimetype, charset, visited):
    """Extract links from one fetched document; forward unvisited
    http pages to the consumer (if one is configured)."""
    if mimetype not in self.ACCEPT_TYPE:
        return
    finder = HTMLLinkFinder(self, url, self.reftxtdb)
    p = HTMLParser3(finder, charset=charset)
    content = fp.read()
    p.feed_byte(content)
    p.close()
    if not self.consumer or visited:
        return
    resolved = urljoin(finder.base_href, url)
    if resolved.startswith('http://'):
        # 'http:/' is 6 chars, so [6:] keeps the leading slash and the
        # key becomes "<baseid>/<host><path>".
        key = self.baseid + resolved[6:]
        self.consumer.feed_page(key, content)
        if self.debug:
            print >> stderr, 'FEED: %r' % key
    return
        pass  # closes a definition whose header is outside this chunk

    def feed(self, s):
        """Encode *s* to the configured output charset and write it to
        stdout, flushing immediately.  'replace' substitutes any
        unencodable characters instead of raising."""
        sys.stdout.write(self.encoder(s, 'replace')[0])
        sys.stdout.flush()
        return

def usage():
    # Print usage and exit with the conventional "bad arguments" status.
    print 'usage: html2txt.py [-c charset_in] [-C charset_out] files ...'
    sys.exit(2)

# --- command-line driver ---
try:
    (opts, args) = getopt.getopt(sys.argv[1:], 'c:C:')
except getopt.GetoptError:
    usage()
# -c sets the input charset, -C the output charset; both default to utf-8.
(charset_in, charset_out) = ('utf-8', 'utf-8')
for (k, v) in opts:
    if k == '-c':
        charset_in = v
    elif k == '-C':
        charset_out = v
# With no file arguments, read from stdin.
if not args:
    args = ['-']
for url in args:
    if url == '-':
        fp = sys.stdin
    elif url.startswith('http:') or url.startswith('ftp:'):
        fp = urllib.urlopen(url)
    else:
        fp = file(url)
    # Parse each input and dump its text content to stdout.
    # NOTE(review): feed_file() is chained with .close(), so it
    # presumably returns the parser - confirm against HTMLParser3.
    # Also note fp.close() is called even when fp is sys.stdin.
    p = HTMLParser3(HTMLTextHandler(out(charset_out)), charset=charset_in)
    p.feed_file(fp).close()
    fp.close()