def set_charset(self, charset):
    """Switch this object and its handler over to *charset*.

    The codec returned by ``getcodec(charset)`` is stored on ``self.codec``
    and the charset name is then forwarded to ``self.handler``.  When
    ``self.debug`` is set, the new charset is first reported on stderr.
    """
    if self.debug:
        print >>sys.stderr, 'set_charset: %s' % charset
    # Resolve the codec before touching any state so a failed lookup
    # leaves both self.codec and the handler as they were.
    codec = getcodec(charset)
    self.codec = codec
    self.handler.set_charset(charset)
self.codec = codec return def close(self): pass def feed(self, s): sys.stdout.write(s.encode(self.codec, 'replace')) sys.stdout.flush() return def usage(): print 'usage: html2txt.py [-c charset_in] [-C charset_out] files ...' sys.exit(2) try: (opts, args) = getopt.getopt(sys.argv[1:], 'vc:C:') except getopt.GetoptError: usage() (verbose, charset_in, charset_out) = (False, None, 'utf-8') for (k,v) in opts: if k == '-v': verbose = True elif k == '-c': charset_in = v elif k == '-C': charset_out = v if not args: args = ['-'] for url in args: if url == '-': fp = sys.stdin elif url.startswith('http:') or url.startswith('ftp:'): fp = urllib.urlopen(url) else: fp = file(url) p = HTMLParser3(HTMLTextHandler(out(getcodec(charset_out))), charset=charset_in) p.feedfile(fp).close() fp.close()
p = HTMLParser3(HTMLDocumentBuilder(base_href=base_href), charset=charset, debug=debug) p.feed(s) return p.close() # main if __name__ == "__main__": import getopt from htmlutils import getcodec from urllib import urlopen def usage(): print "usage: htmldom.py [-d] [-c charset_in] [-C charset_out] [url ...]" sys.exit(2) try: (opts, args) = getopt.getopt(sys.argv[1:], "dc:C:") except getopt.GetoptError: usage() (debug, charset_in, charset_out) = (False, 'iso-8859-1', 'iso-8859-1') for (k, v) in opts: if k == "-d": debug += 1 elif k == "-c": charset_in = v elif k == "-C": charset_out = v codec = getcodec(charset_out) for url in args: fp = urlopen(url) root = parsefile(fp, base_href=url, charset=charset_in) fp.close() if debug: root.debug() else: root.dump(codec=codec)