def get_metadata(stream):
    '''
    Read the metadata, and the cover when one is found, from a MOBI stream.

    Streams whose first three bytes are ``TPZ`` are passed to the Topaz
    metadata reader instead. Otherwise the metadata comes from
    ``MetadataHeader``: the metadata attached to its ``exth`` attribute is
    used when present, and when the header has no ``exth`` at all and the
    file is smaller than 4 MiB the book is extracted into a temporary
    directory so that metadata embedded in the content can be used.

    :param stream: Seekable binary stream containing the book.
    :return: The metadata object; ``cover_data`` is set to ``('jpg', data)``
             when a recognised cover image could be converted.
    '''
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ptempfile import TemporaryDirectory
    from calibre.ebooks.mobi.reader.headers import MetadataHeader
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre.utils.img import save_cover_data_to
    from calibre import CurrentDir

    stream.seek(0)
    try:
        raw = stream.read(3)
    except Exception:
        raw = b''
    stream.seek(0)
    if raw == b'TPZ':
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)

    from calibre.utils.logging import Log
    log = Log()
    try:
        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    except Exception:
        # The stream has no usable ``name``; start from a placeholder title.
        mi = MetaInformation(_('Unknown'), [_('Unknown')])
    mh = MetadataHeader(stream, log)
    if mh.title and mh.title != _('Unknown'):
        mi.title = mh.title

    if mh.exth is not None:
        if mh.exth.mi is not None:
            mi = mh.exth.mi
    else:
        # Assume the file is huge unless its real size can be measured.
        size = 1024**3
        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
            pos = stream.tell()
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(pos)
        if size < 4 * 1024 * 1024:
            with TemporaryDirectory('_mobi_meta_reader') as tdir, CurrentDir(tdir):
                mr = MobiReader(stream, log)
                parse_cache = {}
                mr.extract_content(tdir, parse_cache)
                if mr.embedded_mi is not None:
                    mi = mr.embedded_mi

    if hasattr(mh.exth, 'cover_offset'):
        cover_index = mh.first_image_index + mh.exth.cover_offset
        data = mh.section_data(int(cover_index))
    else:
        # No explicit cover record: try the first image section instead.
        try:
            data = mh.section_data(mh.first_image_index)
        except Exception:
            data = b''
    if data and what(None, data) in {'jpg', 'jpeg', 'gif', 'png', 'bmp', 'webp'}:
        try:
            mi.cover_data = ('jpg', save_cover_data_to(data))
        except Exception:
            log.exception('Failed to read MOBI cover')
    return mi
def get_metadata(stream):
    '''
    Read the metadata, and the cover when one is found, from a MOBI stream.

    A stream starting with ``TPZ`` is delegated to the Topaz metadata
    reader. For everything else ``MetadataHeader`` is consulted; if it has
    no ``exth`` at all and the file is under 4 MiB, the book content is
    extracted into a temporary directory to look for embedded metadata.

    :param stream: Seekable binary stream containing the book.
    :return: The metadata object; ``cover_data`` is set to ``('jpg', data)``
             when a recognised cover image could be converted.
    '''
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ptempfile import TemporaryDirectory
    from calibre.ebooks.mobi.reader.headers import MetadataHeader
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre.utils.magick.draw import save_cover_data_to
    from calibre import CurrentDir

    stream.seek(0)
    try:
        raw = stream.read(3)
    except Exception:
        # Keep the fallback the same type as the b'TPZ' it is compared to.
        raw = b''
    stream.seek(0)
    if raw == b'TPZ':
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)

    from calibre.utils.logging import Log
    log = Log()
    try:
        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    except Exception:
        # The stream has no usable ``name``; start from a placeholder title.
        mi = MetaInformation(_('Unknown'), [_('Unknown')])
    mh = MetadataHeader(stream, log)
    if mh.title and mh.title != _('Unknown'):
        mi.title = mh.title

    if mh.exth is not None:
        if mh.exth.mi is not None:
            mi = mh.exth.mi
    else:
        # Assume the file is huge unless its real size can be measured.
        size = 1024**3
        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
            pos = stream.tell()
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(pos)
        if size < 4*1024*1024:
            with TemporaryDirectory('_mobi_meta_reader') as tdir:
                with CurrentDir(tdir):
                    mr = MobiReader(stream, log)
                    parse_cache = {}
                    mr.extract_content(tdir, parse_cache)
                    if mr.embedded_mi is not None:
                        mi = mr.embedded_mi

    if hasattr(mh.exth, 'cover_offset'):
        cover_index = mh.first_image_index + mh.exth.cover_offset
        data = mh.section_data(int(cover_index))
    else:
        # No explicit cover record: try the first image section instead.
        try:
            data = mh.section_data(mh.first_image_index)
        except Exception:
            data = b''
    if data and what(None, data) in {'jpg', 'jpeg', 'gif', 'png', 'bmp', 'webp'}:
        try:
            mi.cover_data = ('jpg', save_cover_data_to(data, 'cover.jpg',
                                                       return_data=True))
        except Exception:
            log.exception('Failed to read MOBI cover')
    return mi
def get_pages_pagebreak_tag(self, mobi_file_path):
    '''
    Compute page positions from the pagebreak tags in the book's markup.

    Every tag whose lower-cased text contains ``pagebreak`` (for example
    <mbp:pagebreak>) marks a page; the recorded position is the offset just
    past the tag. Books with a non-zero encryption type are handed to
    ``get_pages_fast`` instead.
    '''
    reader = MobiReader(mobi_file_path, default_log)
    if reader.book_header.encryption_type != 0:
        # DRMed book
        return self.get_pages_fast(mobi_file_path)

    # Get the MOBI html.
    reader.extract_text()
    markup = as_bytes(reader.mobi_html.lower())
    return [tag.end() for tag in re.finditer(b'<[^>]*pagebreak[^>]*>', markup)]
def get_pages_pagebreak_tag(self, mobi_file_path):
    '''
    Determine pages based on the presence of <mbp:pagebreak>.

    Returns a list with, for every tag containing ``pagebreak``, the offset
    just past that tag in the lower-cased markup. Books with a non-zero
    encryption type are handed to ``get_pages_fast`` instead.
    '''
    # Get the MOBI html.
    mr = MobiReader(mobi_file_path, default_log)
    if mr.book_header.encryption_type != 0:
        # DRMed book
        return self.get_pages_fast(mobi_file_path)
    mr.extract_text()

    html = mr.mobi_html.lower()
    # Always search bytes with a bytes pattern: a text pattern raises
    # TypeError against bytes markup on Python 3, and working on bytes
    # keeps the reported positions as byte offsets.
    if not isinstance(html, bytes):
        html = html.encode('utf-8')
    return [m.end() for m in re.finditer(b'<[^>]*pagebreak[^>]*>', html)]
def parse_mobi(pathtoebook, book_fmt):
    '''
    Yield ``(start, text)`` for every run of text between two HTML tags.

    ``start`` is the offset of the first byte of the text in the markup and
    ``text`` is the raw bytes found there. For ``'AZW3'`` books the markup
    is the concatenation of the parts built by ``Mobi8Reader``; otherwise it
    is the ``mobi_html`` extracted by ``MobiReader``.
    '''
    reader = MobiReader(pathtoebook, default_log)
    is_azw3 = book_fmt == 'AZW3'

    # use code from calibre.ebooks.mobi.reader.mobi8:Mobi8Reader.__call__
    offset = 1
    if is_azw3 and reader.kf8_type == 'joint':
        offset = reader.kf8_boundary + 2
    reader.extract_text(offset=offset)

    markup = reader.mobi_html
    if is_azw3:
        kf8 = Mobi8Reader(reader, default_log)
        kf8.kf8_sections = reader.sections[offset-1:]
        kf8.read_indices()
        kf8.build_parts()
        markup = b''.join(kf8.parts)

    # match text between HTML tags
    for hit in re.finditer(b'>[^<>]+<', markup):
        # Drop the surrounding '>' and '<' from both position and text.
        yield (hit.start() + 1, hit.group(0)[1:-1])
def do_explode(path, dest):
    '''
    Run ``Mobi8Reader`` (in ``for_tweak`` mode) on the book at ``path``
    from inside ``dest``.

    Returns a tuple of the absolute path returned by the reader and the
    reader's ``encrypted_fonts``.
    '''
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader

    with open(path, 'rb') as stream:
        mobi6 = MobiReader(stream, default_log, None, None)
        with CurrentDir(dest):
            kf8 = Mobi8Reader(mobi6, default_log, for_tweak=True)
            # Resolved while still inside CurrentDir(dest).
            opf = os.path.abspath(kf8())
            obfuscated_fonts = kf8.encrypted_fonts

    return opf, obfuscated_fonts
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert a MOBI stream into files in the current directory and return
    the path of the resulting OPF.

    KF8 books are handed to ``Mobi8Reader``; for those ``self.is_kf8`` is
    set, ``self.mobi_is_joint`` records whether the type is ``'joint'`` and
    ``self.encrypted_fonts`` is copied from the reader. For other books the
    parsed trees collected in the parse cache are serialised to disk and
    ``accelerators['pagebreaks']`` is set.

    :param stream: The book to read.
    :param options: Conversion options; ``input_encoding`` and
                    ``debug_pipeline`` are passed to ``MobiReader``.
    :param file_ext: Unused here.
    :param log: Log object, called directly for informational messages.
    :param accelerators: Dictionary updated in place.
    '''
    self.is_kf8 = False
    self.mobi_is_joint = False

    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from lxml import html
    parse_cache = {}

    def read_book(**extra):
        # Build a reader and, for non-KF8 books, extract into the cwd.
        reader = MobiReader(stream, log, options.input_encoding,
                            options.debug_pipeline, **extra)
        if reader.kf8_type is None:
            reader.extract_content('.', parse_cache)
        return reader

    try:
        mr = read_book()
    except Exception:
        # Retry with try_extra_data_fix enabled. Only real errors trigger
        # the retry; KeyboardInterrupt/SystemExit now propagate.
        mr = read_book(try_extra_data_fix=True)

    if mr.kf8_type is not None:
        log('Found KF8 MOBI of type %r' % mr.kf8_type)
        if mr.kf8_type == 'joint':
            self.mobi_is_joint = True
        from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
        mr = Mobi8Reader(mr, log)
        opf = os.path.abspath(mr())
        self.encrypted_fonts = mr.encrypted_fonts
        self.is_kf8 = True
        return opf

    raw = parse_cache.pop('calibre_raw_mobi_markup', False)
    if raw:
        if isinstance(raw, unicode_type):
            raw = raw.encode('utf-8')
        with lopen('debug-raw.html', 'wb') as f:
            f.write(raw)
    from calibre.ebooks.oeb.base import close_self_closing_tags
    for path, root in parse_cache.items():
        raw = html.tostring(root, encoding='utf-8', method='xml',
                            include_meta_content_type=False)
        raw = close_self_closing_tags(raw)
        with lopen(path, 'wb') as q:
            q.write(raw)
    accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
    return mr.created_opf_path
def do_explode(path, dest):
    '''
    Run ``Mobi8Reader`` on the book at ``path`` from inside ``dest`` and
    return the absolute path it reports.

    Any ``debug-raw.html`` left in ``dest`` is removed afterwards.
    '''
    with open(path, 'rb') as stream:
        mr = MobiReader(stream, default_log, None, None)

        with CurrentDir(dest):
            mr = Mobi8Reader(mr, default_log)
            opf = os.path.abspath(mr())
            try:
                os.remove('debug-raw.html')
            except OSError:
                # Best effort: the file may simply not exist.
                pass

    return opf
def do_explode(path, dest):
    '''
    Run ``Mobi8Reader`` on the book at ``path`` from inside ``dest``.

    Returns a tuple of the absolute path reported by the reader and the
    reader's ``encrypted_fonts``. Any ``debug-raw.html`` left in ``dest``
    is removed afterwards.
    '''
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
    with open(path, 'rb') as stream:
        mr = MobiReader(stream, default_log, None, None)

        with CurrentDir(dest):
            mr = Mobi8Reader(mr, default_log)
            opf = os.path.abspath(mr())
            obfuscated_fonts = mr.encrypted_fonts
            try:
                os.remove('debug-raw.html')
            except OSError:
                # Best effort: the file may simply not exist.
                pass

    return opf, obfuscated_fonts
def parse_mobi(book_path):
    '''
    Yield ``(text, start)`` for every run of text between two HTML tags.

    ``text`` is the matched bytes decoded as UTF-8 and ``start`` is the
    offset of its first byte in the markup. Raises ``Exception('JointMOBI')``
    when the reader reports a ``'joint'`` KF8 type.
    '''
    # use code from calibre.ebooks.mobi.reader.mobi8:Mobi8Reader.__call__
    # and calibre.ebook.conversion.plugins.mobi_input:MOBIInput.convert
    # https://github.com/kevinhendricks/KindleUnpack/blob/master/lib/mobi_k8proc.py#L216
    try:
        reader = MobiReader(book_path, default_log)
    except Exception:
        reader = MobiReader(book_path, default_log, try_extra_data_fix=True)

    if reader.kf8_type == 'joint':
        raise Exception('JointMOBI')
    reader.check_for_drm()
    reader.extract_text()

    markup = reader.mobi_html
    if reader.kf8_type == 'standalone':
        kf8 = Mobi8Reader(reader, default_log)
        kf8.kf8_sections = reader.sections
        kf8.read_indices()
        kf8.build_parts()
        markup = b''.join(kf8.parts)

    # match text between HTML tags
    for hit in re.finditer(b'>[^<>]+<', markup):
        # Drop the surrounding '>' and '<' from both text and position.
        text = hit.group(0)[1:-1].decode('utf-8')
        yield (text, hit.start() + 1)
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert a MOBI stream into files in the current directory and return
    the path of the resulting OPF.

    KF8 books are processed by ``Mobi8Reader`` and set ``self.is_kf8``,
    ``self.mobi_is_joint`` (for the ``'joint'`` type) and
    ``self.encrypted_fonts``. For all other books the parsed trees from the
    parse cache are written out and ``accelerators['pagebreaks']`` is set.
    ``file_ext`` is not used here.
    '''
    self.is_kf8 = False
    self.mobi_is_joint = False

    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from lxml import html
    parse_cache = {}
    try:
        mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_pipeline)
        if mr.kf8_type is None:
            mr.extract_content('.', parse_cache)
    except Exception:
        # Second attempt with try_extra_data_fix enabled. Catching only
        # Exception keeps KeyboardInterrupt/SystemExit from starting it.
        mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_pipeline, try_extra_data_fix=True)
        if mr.kf8_type is None:
            mr.extract_content('.', parse_cache)

    if mr.kf8_type is not None:
        log('Found KF8 MOBI of type %r'%mr.kf8_type)
        if mr.kf8_type == 'joint':
            self.mobi_is_joint = True
        from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
        mr = Mobi8Reader(mr, log)
        opf = os.path.abspath(mr())
        self.encrypted_fonts = mr.encrypted_fonts
        self.is_kf8 = True
        return opf

    # The raw markup, when cached, is written out separately.
    raw = parse_cache.pop('calibre_raw_mobi_markup', False)
    if raw:
        if isinstance(raw, unicode_type):
            raw = raw.encode('utf-8')
        with lopen('debug-raw.html', 'wb') as f:
            f.write(raw)
    from calibre.ebooks.oeb.base import close_self_closing_tags
    for f, root in parse_cache.items():
        raw = html.tostring(root, encoding='utf-8', method='xml',
                            include_meta_content_type=False)
        raw = close_self_closing_tags(raw)
        with lopen(f, 'wb') as q:
            q.write(raw)
    accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
    return mr.created_opf_path
def do_explode(path, dest):
    '''
    Run ``Mobi8Reader`` on the book at ``path`` from inside ``dest``.

    Returns a tuple of the absolute path reported by the reader and the
    reader's ``encrypted_fonts``. Afterwards the ``images`` directory is
    emptied if it holds nothing but ``.unknown`` files, and any
    ``debug-raw.html`` is removed.
    '''
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
    with open(path, 'rb') as stream:
        mr = MobiReader(stream, default_log, None, None)

        with CurrentDir(dest):
            mr = Mobi8Reader(mr, default_log)
            opf = os.path.abspath(mr())
            obfuscated_fonts = mr.encrypted_fonts

            # If there are no images then the azw3 input plugin dumps all
            # binary records as .unknown images, remove them
            if os.path.isdir('images'):
                files = os.listdir('images')
                if all(x.endswith('.unknown') for x in files):
                    for f in files:
                        os.remove(os.path.join('images', f))

            try:
                os.remove('debug-raw.html')
            except OSError:
                # Best effort: the file may simply not exist.
                pass

    return opf, obfuscated_fonts
def get_pages_accurate(self, mobi_file_path):
    '''
    A more accurate, but slower and more resource intensive, way of
    working out page positions by scanning the uncompressed text.

    The model is an average paperback: 32 lines per page and at most 70
    characters per line. Each paragraph starts a new line and every 70
    characters (minus markup) in a paragraph start another one. The
    position of every 32nd line is recorded as a page.

    This could be made more accurate by treating
    <div class="mbp_pagebreak" /> as a new page marker and <br> elements
    as empty lines.

    Books with a non-zero encryption type are handed to ``get_pages_fast``.
    '''
    # Get the MOBI html.
    reader = MobiReader(mobi_file_path, default_log)
    if reader.book_header.encryption_type != 0:
        # DRMed book
        return self.get_pages_fast(mobi_file_path)
    reader.extract_text()

    # States
    in_tag = in_p = check_p = closing = False
    p_char_count = 0

    # Positions of every line. A line is either a paragraph starting
    # or every 70 characters in a paragraph.
    lines = []

    # We want this to be as fast as possible, so the text is scanned in a
    # single pass instead of using re or string functions that would each
    # parse it again.
    #
    # Using .lower() is fine because the text is not modified; only the
    # characters and their positions within the stream matter.
    data = bytearray(as_bytes(reader.mobi_html.lower()))
    slash, p, lt, gt = (ord(ch) for ch in '/p<>')

    for pos, c in enumerate(data):
        # Check if we are starting or stopping a p tag.
        if check_p:
            if c == slash:
                closing = True
                continue
            if c == p:
                in_p = not closing
                if in_p:
                    lines.append(pos - 2)
            check_p = closing = False
            continue

        if c == lt:
            in_tag = check_p = True
            continue
        if c == gt:
            in_tag = check_p = False
            continue

        if in_p and not in_tag:
            p_char_count += 1
            if p_char_count == 70:
                lines.append(pos)
                p_char_count = 0

    # Every 32 lines is a new page
    return lines[::32]
def get_pages_accurate(self, mobi_file_path):
    '''
    A more accurate but much more resource intensive and slower
    method to calculate the page length.

    Parses the uncompressed text. In an average paperback book
    there are 32 lines per page and a maximum of 70 characters
    per line.

    Each paragraph starts a new line and every 70 characters
    (minus markup) in a paragraph starts a new line. The
    position of every 32nd line is marked as a new page.

    This can be made more accurate by accounting for
    <div class="mbp_pagebreak" /> as a new page marker.
    And <br> elements as an empty line.

    Books with a non-zero encryption type are handed to ``get_pages_fast``.
    '''
    # Get the MOBI html.
    mr = MobiReader(mobi_file_path, default_log)
    if mr.book_header.encryption_type != 0:
        # DRMed book
        return self.get_pages_fast(mobi_file_path)
    mr.extract_text()

    # States
    in_tag = False
    in_p = False
    check_p = False
    closing = False
    p_char_count = 0

    # Get positions of every line
    # A line is either a paragraph starting
    # or every 70 characters in a paragraph.
    lines = []

    # We want this to be as fast as possible so we
    # are going to do one pass across the text. re
    # and string functions will parse the text each
    # time they are called.
    #
    # We can use .lower() here because we are
    # not modifying the text. In this case the case
    # doesn't matter just the absolute character and
    # the position within the stream.
    html = mr.mobi_html.lower()
    if not isinstance(html, bytes):
        html = html.encode('utf-8')
    # Scan integer byte values: iterating bytes yields ints on Python 3,
    # which never compare equal to one-character strings.
    data = bytearray(html)
    slash, p, lt, gt = bytearray(b'/p<>')

    for pos, c in enumerate(data):
        # Check if we are starting or stopping a p tag.
        if check_p:
            if c == slash:
                closing = True
                continue
            elif c == p:
                if closing:
                    in_p = False
                else:
                    in_p = True
                    lines.append(pos - 2)
            check_p = False
            closing = False
            continue

        if c == lt:
            in_tag = True
            check_p = True
            continue
        elif c == gt:
            in_tag = False
            check_p = False
            continue

        if in_p and not in_tag:
            p_char_count += 1
            if p_char_count == 70:
                lines.append(pos)
                p_char_count = 0

    # Every 32 lines is a new page. Slicing replaces the xrange() loop,
    # which does not exist on Python 3.
    return lines[::32]
def get_metadata(stream):
    '''
    Read the metadata, and the cover when one is found, from a MOBI stream.

    Streams whose first three bytes are ``TPZ`` are passed to the Topaz
    metadata reader instead. Otherwise the metadata comes from
    ``MetadataHeader``; when it has no ``exth`` at all and the file is
    smaller than 4 MiB the book is extracted into a temporary directory so
    that metadata embedded in the content can be used.

    :param stream: Seekable binary stream containing the book.
    :return: The metadata object; ``cover_data`` is set to ``('jpg', data)``
             when the cover image could be re-encoded as JPEG.
    '''
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ptempfile import TemporaryDirectory
    from calibre.ebooks.mobi.reader.headers import MetadataHeader
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre import CurrentDir

    try:
        from PIL import Image as PILImage
    except ImportError:
        # Fall back to a top-level ``Image`` module when ``PIL`` is missing.
        import Image as PILImage

    stream.seek(0)
    try:
        raw = stream.read(3)
    except Exception:
        raw = b''
    stream.seek(0)
    if raw == b'TPZ':
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)

    from calibre.utils.logging import Log
    log = Log()
    try:
        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    except Exception:
        # The stream has no usable ``name``; start from a placeholder title.
        mi = MetaInformation(_('Unknown'), [_('Unknown')])
    mh = MetadataHeader(stream, log)
    if mh.title and mh.title != _('Unknown'):
        mi.title = mh.title

    if mh.exth is not None:
        if mh.exth.mi is not None:
            mi = mh.exth.mi
    else:
        # Assume the file is huge unless its real size can be measured.
        size = 1024**3
        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
            pos = stream.tell()
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(pos)
        if size < 4*1024*1024:
            with TemporaryDirectory('_mobi_meta_reader') as tdir:
                with CurrentDir(tdir):
                    mr = MobiReader(stream, log)
                    parse_cache = {}
                    mr.extract_content(tdir, parse_cache)
                    if mr.embedded_mi is not None:
                        mi = mr.embedded_mi

    if hasattr(mh.exth, 'cover_offset'):
        cover_index = mh.first_image_index + mh.exth.cover_offset
        data = mh.section_data(int(cover_index))
    else:
        # No explicit cover record: try the first image section instead.
        try:
            data = mh.section_data(mh.first_image_index)
        except Exception:
            data = b''

    # Empty data means there is no cover, which is not worth a traceback.
    if data:
        try:
            # Image.open() only identifies the file; pixel data is decoded
            # by convert()/save(), so a corrupt cover fails there and those
            # calls need the same protection.
            im = PILImage.open(cStringIO.StringIO(data))
            obuf = cStringIO.StringIO()
            im.convert('RGB').save(obuf, format='JPEG')
        except Exception:
            log.exception('Failed to read MOBI cover')
        else:
            mi.cover_data = ('jpg', obuf.getvalue())
    return mi
def get_metadata(stream):
    '''
    Read the metadata, and the cover when one is found, from a MOBI stream.

    A stream starting with ``TPZ`` is delegated to the Topaz metadata
    reader. For everything else ``MetadataHeader`` is consulted; if it has
    no ``exth`` at all and the file is under 4 MiB, the book content is
    extracted into a temporary directory to look for embedded metadata.

    :param stream: Seekable binary stream containing the book.
    :return: The metadata object; ``cover_data`` is set to ``('jpg', data)``
             when the cover image could be re-encoded as JPEG.
    '''
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ptempfile import TemporaryDirectory
    from calibre.ebooks.mobi.reader.headers import MetadataHeader
    from calibre.ebooks.mobi.reader.mobi6 import MobiReader
    from calibre import CurrentDir

    try:
        from PIL import Image as PILImage
    except ImportError:
        # Fall back to a top-level ``Image`` module when ``PIL`` is missing.
        import Image as PILImage

    stream.seek(0)
    try:
        raw = stream.read(3)
    except Exception:
        raw = b''
    stream.seek(0)
    if raw == b'TPZ':
        from calibre.ebooks.metadata.topaz import get_metadata
        return get_metadata(stream)

    from calibre.utils.logging import Log
    log = Log()
    try:
        mi = MetaInformation(os.path.basename(stream.name), [_('Unknown')])
    except Exception:
        # The stream has no usable ``name``; start from a placeholder title.
        mi = MetaInformation(_('Unknown'), [_('Unknown')])
    mh = MetadataHeader(stream, log)
    if mh.title and mh.title != _('Unknown'):
        mi.title = mh.title

    if mh.exth is not None:
        if mh.exth.mi is not None:
            mi = mh.exth.mi
    else:
        # Assume the file is huge unless its real size can be measured.
        size = 1024**3
        if hasattr(stream, 'seek') and hasattr(stream, 'tell'):
            pos = stream.tell()
            stream.seek(0, 2)
            size = stream.tell()
            stream.seek(pos)
        if size < 4 * 1024 * 1024:
            with TemporaryDirectory('_mobi_meta_reader') as tdir:
                with CurrentDir(tdir):
                    mr = MobiReader(stream, log)
                    parse_cache = {}
                    mr.extract_content(tdir, parse_cache)
                    if mr.embedded_mi is not None:
                        mi = mr.embedded_mi

    if hasattr(mh.exth, 'cover_offset'):
        cover_index = mh.first_image_index + mh.exth.cover_offset
        data = mh.section_data(int(cover_index))
    else:
        # No explicit cover record: try the first image section instead.
        try:
            data = mh.section_data(mh.first_image_index)
        except Exception:
            data = b''

    if not data:
        # No cover record at all; nothing to convert and nothing to log.
        return mi

    try:
        # Opening is lazy: decoding happens in convert()/save(), so the
        # whole conversion is guarded, not just the open() call.
        im = PILImage.open(cStringIO.StringIO(data))
        obuf = cStringIO.StringIO()
        im.convert('RGB').save(obuf, format='JPEG')
    except Exception:
        log.exception('Failed to read MOBI cover')
    else:
        mi.cover_data = ('jpg', obuf.getvalue())
    return mi