#!/usr/bin/env python from schardet import detect import os,sys SRC='webpage/' for i in os.listdir(SRC): filename = SRC + i fileobject = open(filename, "r") content = fileobject.read(); encoding = detect(content) if encoding: try: new_content = content.decode(encoding).encode('utf-8') fileobject = open(filename, 'w') fileobject.write(new_content) fileobject.flush() fileobject.close() print "done" except Exception,e: print(e) pass
if __name__ == "__main__":
    # Entry point: read HTML from a URL argument, a local file argument, or
    # stdin; decode it to unicode; then emit the text conversion via
    # wrapwrite(html2text(...)).
    baseurl = ''
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith('http://') or arg.startswith('https://'):
            # URL input: fetch, then decode using the charset from the HTTP
            # headers (via feedparser when available, else assume UTF-8).
            baseurl = arg
            j = urllib.urlopen(baseurl)
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                enc = lambda x, y: ('utf-8', 1)
            text = j.read()
            encoding = enc(j.headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            # Local-file input. Close the handle explicitly (the original
            # leaked it via open(arg).read()).
            infile = open(arg, 'r')
            try:
                data = infile.read()
            finally:
                infile.close()
            # Honor an explicit encoding argument when given; the original
            # assigned it and then ignored it, always auto-detecting.
            if len(sys.argv) > 2:
                charset = sys.argv[2]
            else:
                charset = schardet.detect(data)
            try:
                data = data.decode(charset, 'ignore')
            except Exception:
                sys.stderr.write('Failed to detect file encoding:' + arg + '\n')
                # sys.exit, not the bare exit() site-module convenience.
                sys.exit(1)
    else:
        # No arguments: read UTF-8 text from stdin.
        data = sys.stdin.read().decode('utf8')
    wrapwrite(html2text(data, baseurl))
# NOTE(review): this line is a byte-for-byte duplicate (modulo whitespace) of
# the __main__ block that precedes it in this file. Both are guarded by the
# same condition, so the entry logic runs twice per invocation -- for stdin
# input this means a second blocking read, and for URL input a second fetch.
# This duplicate should be deleted; kept byte-identical here pending that.
if __name__ == "__main__": baseurl = '' if sys.argv[1:]: arg = sys.argv[1] if arg.startswith('http://') or arg.startswith('https://'): baseurl = arg j = urllib.urlopen(baseurl) try: from feedparser import _getCharacterEncoding as enc except ImportError: enc = lambda x, y: ('utf-8', 1) text = j.read() encoding = enc(j.headers, text)[0] if encoding == 'us-ascii': encoding = 'utf-8' data = text.decode(encoding) else: if len(sys.argv) > 2: encoding = sys.argv[2] data = open(arg, 'r').read() charset = schardet.detect(data) try: data = data.decode(charset,'ignore') except Exception, e: sys.stderr.write('Failed to detect file encoding:'+arg+'\n') exit(1) else: data = sys.stdin.read().decode('utf8') wrapwrite(html2text(data, baseurl))