def get_decoded_raw(name):
    """Read the file *name* and decode its contents to text where possible.

    The syntax is obtained from ``syntax_from_mime(name, guess_type(name))``
    and drives how the bytes are decoded:

    * ``None``: try UTF-8, otherwise keep the raw bytes.
    * ``'raster_image'``: the bytes are returned untouched.
    * ``'html'`` / ``'xml'``: decoded with ``xml_to_unicode``.
    * anything else: honour a ``coding:``/``coding=`` declaration found in
      the first 1024 bytes, otherwise use ``force_encoding``; fall back to
      UTF-8 and finally to the undecoded bytes.

    Returns:
        A ``(raw, syntax)`` tuple. ``raw`` is text when decoding succeeded
        and the original bytes otherwise.
    """
    from calibre.ebooks.chardet import xml_to_unicode, force_encoding

    with open(name, 'rb') as f:
        raw = f.read()
    syntax = syntax_from_mime(name, guess_type(name))

    if syntax == 'raster_image':
        return raw, syntax

    if syntax is None:
        try:
            raw = raw.decode('utf-8')
        except ValueError:
            pass  # deliberate best effort: hand back the undecoded bytes
        return raw, syntax

    if syntax in {'html', 'xml'}:
        return xml_to_unicode(raw, verbose=True)[0], syntax

    m = re.search(br"coding[:=]\s*([-\w.]+)", raw[:1024], flags=re.I)
    declared = m.group(1) if m is not None else None
    # The match group is bytes, so it must be compared with bytes literals;
    # comparing against the str '8bit' never matches on Python 3.
    if declared is None or declared == b'8bit':
        enc = force_encoding(raw, verbose=True)
    elif declared == b'unicode':
        enc = 'utf-8'
    else:
        enc = declared
    if isinstance(enc, bytes):
        # bytes.decode() requires a text codec name; passing bytes raises a
        # TypeError on Python 3 that the handler below would not catch. The
        # bytes pattern only matches ASCII word characters, so this is safe.
        enc = enc.decode('ascii', 'replace')

    try:
        raw = raw.decode(enc)
    except (LookupError, ValueError):
        # Unknown codec name or undecodable data: try UTF-8 as a last resort.
        try:
            raw = raw.decode('utf-8')
        except ValueError:
            pass  # deliberate best effort: hand back the undecoded bytes
    return raw, syntax
def run(self, path_to_ebook):
    """Re-encode (and optionally reformat) a text ebook into a UTF-8 temp file.

    The file at *path_to_ebook* is read as bytes, its encoding is detected
    with ``force_encoding`` and the content is decoded with undecodable
    bytes replaced. When ``prefs['reformat']`` is truthy the text is passed
    through ``ptxt2ftxt``, ``ftxtclean`` and ``ftxt2markdown`` using the
    corresponding ``prefs`` options. The result is written, UTF-8 encoded,
    to a file obtained from ``self.temporary_file('.txt')``.

    Returns:
        The ``name`` attribute of the temporary file that was written.
    """
    print("reformatter: "+path_to_ebook)

    # Read in binary mode so the encoding detector sees the exact bytes on
    # disk, and use a context manager so the handle is always closed.
    with open(path_to_ebook, 'rb') as f:
        raw = f.read()
    encoding = force_encoding(raw, True)
    print("Detected encoding: ", encoding)
    # bytes.decode() is equivalent to unicode(raw, encoding, errors=...) on
    # Python 2 and, unlike the ``unicode`` builtin, also exists on Python 3.
    txt = raw.decode(encoding, 'replace')

    # reformat
    if prefs['reformat']:
        print("reformatting...")
        from ptxt2ftxt import ptxt2ftxt, ftxtclean
        from ftxt2markdown import ftxt2markdown
        txt = ptxt2ftxt(txt, para_by_mark=prefs['para_by_mark'])
        txt = ftxtclean(txt,
                        pretty_quote=prefs['pretty_quote'],
                        correct_word_break=prefs['correct_word_break'])
        txt = ftxt2markdown(txt,
                            guessChapter=prefs['guess_chapter'],
                            guessParaSep=prefs['insert_empty_paragraph'])

    # save as temporary file
    # (named ``out`` rather than ``tempfile`` to avoid shadowing the stdlib
    # module name)
    out = self.temporary_file('.txt')
    try:
        out.write(txt.encode('utf-8'))
    finally:
        # Close the handle even if the write fails.
        out.close()
    print("save as ", out.name)
    return out.name