Ejemplo n.º 1
0
 def set_charset(self, charset):
   """Switch to a new charset and the codec that goes with it.

   The codec returned by getcodec() for `charset` is stored on
   self.codec, and the charset is then passed on to self.handler.
   When self.debug is truthy, the change is first reported on stderr.
   """
   if self.debug:
     message = 'set_charset: %s' % charset
     sys.stderr.write(message + '\n')
   self.codec = getcodec(charset)
   self.handler.set_charset(charset)
Ejemplo n.º 2
0
      self.codec = codec
      return
    def close(self):
      """Do nothing; always returns None."""
      return None
    def feed(self, s):
      """Encode `s` with self.codec and write it to stdout, flushing at once.

      The 'replace' error handler is used, so characters the codec cannot
      represent are substituted instead of raising an encoding error.
      """
      encoded = s.encode(self.codec, 'replace')
      stream = sys.stdout
      stream.write(encoded)
      stream.flush()
  def usage():
    """Print the command-line synopsis to stdout and exit with status 2."""
    synopsis = 'usage: html2txt.py [-c charset_in] [-C charset_out] files ...'
    # Single-argument print(...) behaves the same as the print statement.
    print(synopsis)
    raise SystemExit(2)
  # Parse the command line; a malformed one prints the usage text and exits.
  try:
    (opts, args) = getopt.getopt(sys.argv[1:], 'vc:C:')
  except getopt.GetoptError:
    usage()
  # Defaults: input charset unspecified (None), output encoded as utf-8.
  # NOTE(review): verbose is parsed from -v but not used in the visible code.
  (verbose, charset_in, charset_out) = (False, None, 'utf-8')
  for (k,v) in opts:
    if k == '-v': verbose = True
    elif k == '-c': charset_in = v
    elif k == '-C': charset_out = v
  # With no file arguments, read a single document from standard input.
  if not args: args = ['-']
  for url in args:
    # Pick the input source: '-' is stdin, a recognized URL scheme is
    # fetched with urllib, and anything else is opened as a local file.
    if url == '-':
      fp = sys.stdin
    elif url.startswith(('http:', 'https:', 'ftp:')):
      fp = urllib.urlopen(url)
    else:
      fp = open(url)
    try:
      p = HTMLParser3(HTMLTextHandler(out(getcodec(charset_out))), charset=charset_in)
      p.feedfile(fp).close()
    finally:
      # Release the input even when parsing raises.  stdin is left open so
      # that a later '-' argument does not hit an already-closed stream.
      if fp is not sys.stdin:
        fp.close()
Ejemplo n.º 3
0
  p = HTMLParser3(HTMLDocumentBuilder(base_href=base_href), charset=charset, debug=debug)
  p.feed(s)
  return p.close()

# main
if __name__ == "__main__":
  # Command-line driver: parse each URL given as an argument and either
  # dump the resulting tree (default) or call its debug() method (-d).
  import getopt
  from htmlutils import getcodec
  from urllib import urlopen
  def usage():
    """Print the command-line synopsis to stdout and exit with status 2."""
    print("usage: htmldom.py [-d] [-c charset_in] [-C charset_out] [url ...]")
    sys.exit(2)
  try:
    (opts, args) = getopt.getopt(sys.argv[1:], "dc:C:")
  except getopt.GetoptError:
    usage()
  # debug is a counter (each -d adds one), so it starts at 0 rather than False.
  (debug, charset_in, charset_out) = (0, 'iso-8859-1', 'iso-8859-1')
  for (k, v) in opts:
    if k == "-d": debug += 1
    elif k == "-c": charset_in = v
    elif k == "-C": charset_out = v
  # The output codec is the same for every URL, so look it up once.
  codec = getcodec(charset_out)
  for url in args:
    fp = urlopen(url)
    try:
      root = parsefile(fp, base_href=url, charset=charset_in)
    finally:
      # Close the handle even if parsing raises, so it is never leaked.
      fp.close()
    if debug:
      root.debug()
    else:
      root.dump(codec=codec)