Esempio n. 1
0
#!/usr/bin/env python
from schardet import detect 
import os,sys
# Directory whose files are converted to UTF-8 in place.
SRC = 'webpage/'

for i in os.listdir(SRC):
    filename = os.path.join(SRC, i)
    if not os.path.isfile(filename):
        # os.listdir() also yields sub-directories; open() on one would
        # raise and abort the whole run, so skip anything that is not a file.
        continue

    # Read the raw bytes: the encoding detector and decode() must see the
    # file exactly as stored, without any newline translation.
    with open(filename, 'rb') as fileobject:
        content = fileobject.read()

    # NOTE(review): detect() is assumed to return an encoding name usable
    # with decode(), or a falsy value when nothing was detected -- this is
    # inferred from how the result is used below; confirm against schardet.
    encoding = detect(content)
    if not encoding:
        continue

    try:
        # Decode first so that a failure leaves the original file untouched;
        # the file is only opened for writing (truncated) once the new
        # content is ready.
        new_content = content.decode(encoding).encode('utf-8')
        with open(filename, 'wb') as fileobject:
            fileobject.write(new_content)
        print("done")
    except Exception as e:
        # Best effort: report the problem and carry on with the next file.
        print(e)
Esempio n. 2
0
if __name__ == "__main__":
    # Command-line entry point.
    #   <url>              fetch the page and convert it
    #   <file> [encoding]  convert a local file; the optional second argument
    #                      names its encoding, otherwise it is auto-detected
    #   (no arguments)     convert UTF-8 text read from stdin
    baseurl = ''
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith(('http://', 'https://')):
            baseurl = arg
            j = urllib.urlopen(baseurl)
            try:
                text = j.read()
                headers = j.headers
            finally:
                # Release the connection even if reading fails.
                j.close()
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                # Without feedparser, fall back to assuming UTF-8.
                enc = lambda x, y: ('utf-8', 1)
            encoding = enc(headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            # Binary mode so detection and decoding see the untouched bytes.
            with open(arg, 'rb') as f:
                data = f.read()
            if len(sys.argv) > 2:
                # An explicitly supplied encoding overrides auto-detection
                # (previously this argument was read but never used).
                charset = sys.argv[2]
            else:
                charset = schardet.detect(data)
            try:
                data = data.decode(charset, 'ignore')
            except Exception:
                # Covers a failed detection (non-string result) as well as
                # an unknown encoding name.
                sys.stderr.write('Failed to detect file encoding:' + arg +
                                 '\n')
                # sys.exit rather than the site-provided exit() builtin,
                # which is not guaranteed to exist in every interpreter.
                sys.exit(1)
    else:
        data = sys.stdin.read().decode('utf8')

    wrapwrite(html2text(data, baseurl))
Esempio n. 3
0
if __name__ == "__main__":
    # Command-line entry point.
    #   <url>              fetch the page and convert it
    #   <file> [encoding]  convert a local file; the optional second argument
    #                      names its encoding, otherwise it is auto-detected
    #   (no arguments)     convert UTF-8 text read from stdin
    baseurl = ''
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith(('http://', 'https://')):
            baseurl = arg
            j = urllib.urlopen(baseurl)
            try:
                text = j.read()
                headers = j.headers
            finally:
                # Release the connection even if reading fails.
                j.close()
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                # Without feedparser, fall back to assuming UTF-8.
                enc = lambda x, y: ('utf-8', 1)
            encoding = enc(headers, text)[0]
            if encoding == 'us-ascii':
                encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            # Binary mode so detection and decoding see the untouched bytes.
            with open(arg, 'rb') as f:
                data = f.read()
            if len(sys.argv) > 2:
                # An explicitly supplied encoding overrides auto-detection
                # (previously this argument was read but never used).
                charset = sys.argv[2]
            else:
                charset = schardet.detect(data)
            try:
                data = data.decode(charset, 'ignore')
            except Exception:
                # Covers a failed detection (non-string result) as well as
                # an unknown encoding name.
                sys.stderr.write('Failed to detect file encoding:'+arg+'\n')
                # sys.exit rather than the site-provided exit() builtin,
                # which is not guaranteed to exist in every interpreter.
                sys.exit(1)
    else:
        data = sys.stdin.read().decode('utf8')

    wrapwrite(html2text(data, baseurl))