Esempio n. 1
0
def get_decoded_raw(name):
    """Read the file ``name`` and return ``(raw, syntax)``.

    ``syntax`` is whatever ``syntax_from_mime(name, guess_type(name))``
    reports. ``raw`` is the file content, decoded to text when possible:

    * ``syntax is None``: try UTF-8; keep the bytes if that fails.
    * ``'raster_image'``: the bytes are returned untouched.
    * ``'html'`` / ``'xml'``: first element of ``xml_to_unicode(raw)``.
    * anything else: honour a ``coding:`` / ``coding=`` declaration found
      in the first 1024 bytes, otherwise use ``force_encoding``; if that
      codec is unknown or fails, fall back to UTF-8 and finally to the
      undecoded bytes.
    """
    from calibre.ebooks.chardet import xml_to_unicode, force_encoding
    with open(name, 'rb') as f:
        raw = f.read()
    syntax = syntax_from_mime(name, guess_type(name))
    if syntax is None:
        try:
            raw = raw.decode('utf-8')
        except ValueError:
            pass
    elif syntax != 'raster_image':
        if syntax in {'html', 'xml'}:
            raw = xml_to_unicode(raw, verbose=True)[0]
        else:
            m = re.search(br"coding[:=]\s*([-\w.]+)", raw[:1024], flags=re.I)
            # The pattern is a bytes pattern, so the captured group is
            # bytes: compare against bytes literals, otherwise the
            # '8bit' exclusion can never match on Python 3.
            if m is not None and m.group(1) != b'8bit':
                enc = m.group(1)
                if enc == b'unicode':
                    enc = 'utf-8'
            else:
                enc = force_encoding(raw, verbose=True)
            # bytes.decode() requires a text codec name on Python 3; a
            # bytes name would raise TypeError, which the handler below
            # does not (and should not) swallow.
            if isinstance(enc, bytes):
                enc = enc.decode('ascii', 'replace')
            try:
                raw = raw.decode(enc)
            except (LookupError, ValueError):
                # Unknown codec or undecodable data: last resort is UTF-8,
                # and failing that the raw bytes are returned as-is.
                try:
                    raw = raw.decode('utf-8')
                except ValueError:
                    pass
    return raw, syntax
Esempio n. 2
0
def get_decoded_raw(name):
    """Load ``name`` from disk and return a ``(raw, syntax)`` pair.

    ``syntax`` comes from ``syntax_from_mime(name, guess_type(name))``.
    ``raw`` is text when a decode succeeds and the original bytes
    otherwise:

    * ``'raster_image'`` content is never decoded.
    * ``'html'`` and ``'xml'`` content is converted by ``xml_to_unicode``.
    * When ``syntax`` is ``None`` only UTF-8 is attempted.
    * For every other syntax, a ``coding:`` / ``coding=`` declaration in
      the first 1024 bytes selects the codec (``unicode`` is treated as
      UTF-8, ``8bit`` is ignored); without a usable declaration
      ``force_encoding`` picks it. UTF-8 is the fallback if that codec
      is unknown or fails.
    """
    from calibre.ebooks.chardet import xml_to_unicode, force_encoding
    with open(name, 'rb') as f:
        raw = f.read()
    syntax = syntax_from_mime(name, guess_type(name))

    if syntax == 'raster_image':
        return raw, syntax
    if syntax in {'html', 'xml'}:
        return xml_to_unicode(raw, verbose=True)[0], syntax

    enc = 'utf-8'
    if syntax is not None:
        m = re.search(br"coding[:=]\s*([-\w.]+)", raw[:1024], flags=re.I)
        declared = m.group(1) if m is not None else None
        # ``declared`` is bytes (bytes pattern), so it must be compared
        # with bytes literals for the checks to work on Python 3.
        if declared is None or declared == b'8bit':
            enc = force_encoding(raw, verbose=True)
        elif declared != b'unicode':
            enc = declared
        # bytes.decode() only accepts a text codec name on Python 3;
        # passing bytes raises TypeError, which is not handled below.
        if isinstance(enc, bytes):
            enc = enc.decode('ascii', 'replace')

    try:
        return raw.decode(enc), syntax
    except (LookupError, ValueError):
        # Unknown codec or undecodable data: fall through to UTF-8.
        pass
    try:
        raw = raw.decode('utf-8')
    except ValueError:
        # Not valid UTF-8 either: hand back the undecoded bytes.
        pass
    return raw, syntax
 def run(self, path_to_ebook):
     """Re-encode (and optionally reformat) a text file into a temp file.

     The file at ``path_to_ebook`` is read as bytes, its encoding is
     detected with ``force_encoding`` and it is decoded with undecodable
     bytes replaced. When ``prefs['reformat']`` is set, the text is passed
     through ``ptxt2ftxt``, ``ftxtclean`` and ``ftxt2markdown`` using the
     corresponding ``prefs`` options. The result is written as UTF-8 to a
     file obtained from ``self.temporary_file('.txt')``.

     Returns the ``name`` attribute of that temporary file.
     """
     print("reformatter: "+path_to_ebook)
     # Read in binary mode: the encoding detector needs the undecoded
     # bytes, and the context manager guarantees the handle is closed.
     with open(path_to_ebook, 'rb') as f:
         raw = f.read()
     encoding = force_encoding(raw, True)
     print("Detected encoding: ", encoding)
     # bytes.decode() works on both Python 2 and 3, unlike unicode().
     txt = raw.decode(encoding, 'replace')
     # reformat
     if prefs['reformat']:
         print("reformatting...")
         from ptxt2ftxt import ptxt2ftxt, ftxtclean
         from ftxt2markdown import ftxt2markdown
         txt = ptxt2ftxt(txt, para_by_mark=prefs['para_by_mark'])
         txt = ftxtclean(txt, pretty_quote=prefs['pretty_quote'], correct_word_break=prefs['correct_word_break'])
         txt = ftxt2markdown(txt, guessChapter=prefs['guess_chapter'], guessParaSep=prefs['insert_empty_paragraph'])
     # save as temporary file; close it even if the write fails
     out = self.temporary_file('.txt')
     try:
         out.write(txt.encode('utf-8'))
     finally:
         out.close()
     print("save as ", out.name)
     return out.name