def load_cdic(self, cdic):
    """Parse one CDIC record and append its phrase entries to self.dictionary.

    Each entry is stored as ``(bytes, flag)`` where ``flag`` is the raw
    0x8000 bit from the length word (non-zero marks a complete word).
    """
    if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
        raise MobiError('Invalid CDIC header')
    phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
    # A CDIC record holds at most 1 << bits entries; the final record of a
    # multi-record dictionary may hold fewer.
    count = min(1 << bits, phrases - len(self.dictionary))
    read_u16 = struct.Struct(b'>H').unpack_from

    def entry_at(offset):
        # Offsets are relative to byte 16; each entry is a 16-bit length
        # word (low 15 bits = byte count, high bit = "complete word" flag)
        # followed by the phrase bytes.
        length, = read_u16(cdic, 16 + offset)
        text = cdic[18 + offset:18 + offset + (length & 0x7fff)]
        return (text, length & 0x8000)

    offsets = struct.unpack_from(b'>%dH' % count, cdic, 16)
    self.dictionary += [entry_at(off) for off in offsets]
def __init__(self, stream):
    # stream: seekable file-like object containing the whole MOBI file;
    # it is wrapped in a StreamSlicer so records can be patched in place.
    self.stream = stream
    data = self.data = StreamSlicer(stream)
    # Bytes 60-68 of the PalmDB header are the type/creator pair.
    self.type = data[60:68]
    if self.type != b"BOOKMOBI":
        # Not a MOBI book; leave the object mostly uninitialised.
        return
    # Number of PDB records (big-endian uint16 at offset 76).
    self.nrecs, = unpack('>H', data[76:78])
    record0 = self.record0 = self.record(0)
    # MOBI header length lives at 0x14 in record 0; zero means a
    # non-standard file we cannot safely patch.
    mobi_header_length, = unpack('>I', record0[0x14:0x18])
    if not mobi_header_length:
        raise MobiError(
            "Non-standard file format. Try 'Convert E-Books' with MOBI as Input and Output formats."
        )
    self.encryption_type, = unpack('>H', record0[12:14])
    codepage, = unpack('>I', record0[28:32])
    # 65001 is the Windows codepage number for UTF-8; everything else is
    # treated as cp1252 (the classic MOBI default).
    self.codec = 'utf-8' if codepage == 65001 else 'cp1252'
    image_base, = unpack('>I', record0[108:112])  # NOTE(review): local appears unused here
    # flags word at offset 128; bit 0x40 signals the presence of an EXTH header.
    flags, = self.flags, = unpack('>I', record0[128:132])
    have_exth = self.have_exth = (flags & 0x40) != 0
    self.cover_record = self.thumbnail_record = None
    self.timestamp = None
    self.pdbrecords = self.get_pdbrecords()
    self.drm_block = None
    if self.encryption_type != 0:
        if self.have_exth:
            # DRM info is only reachable through EXTH; preserve it so the
            # record can be written back unchanged.
            self.drm_block = self.fetchDRMdata()
        else:
            raise MobiError(
                'Unable to set metadata on DRM file without EXTH header')
    self.original_exth_records = {}
    if not have_exth:
        # Synthesize an empty EXTH header so metadata can be stored.
        self.create_exth()
        self.have_exth = True
    # Fetch timestamp, cover_record, thumbnail_record
    self.fetchEXTHFields()
def extract_text(self, offset=1):
    """Decompress the book's text records into ``self.mobi_html`` (bytes).

    :param offset: index of the first text record (1 for MOBI 6, shifted
        for the KF8 part of a joint file).
    :return: list of record indices consumed, so callers can skip them
        when scanning the remaining records.
    :raises MobiError: if the compression type is unrecognised.
    """
    self.log.debug('Extracting text...')
    text_sections = [
        self.text_section(i) for i in range(
            offset, min(self.book_header.records + offset, len(self.sections)))
    ]
    processed_records = list(
        range(offset - 1, self.book_header.records + offset))
    self.mobi_html = b''
    # NOTE(review): these compression_type comparisons use str literals; if
    # BookHeader stores the raw two bytes from the header, they would never
    # match on Python 3 — verify against BookHeader before changing.
    if self.book_header.compression_type == 'DH':
        # Huffman/CDIC compression: the huff/cdic tables live in their own
        # records, which are also marked as processed.
        huffs = [
            self.sections[i][0] for i in range(
                self.book_header.huff_offset, self.book_header.huff_offset +
                self.book_header.huff_number)
        ]
        processed_records += list(
            range(
                self.book_header.huff_offset, self.book_header.huff_offset +
                self.book_header.huff_number))
        huff = HuffReader(huffs)
        unpack = huff.unpack
    elif self.book_header.compression_type == '\x00\x02':
        unpack = decompress_doc  # PalmDOC LZ77
    elif self.book_header.compression_type == '\x00\x01':
        unpack = lambda x: x  # stored uncompressed
    else:
        raise MobiError('Unknown compression algorithm: %s' %
                        repr(self.book_header.compression_type))
    self.mobi_html = b''.join(map(unpack, text_sections))
    if self.mobi_html.endswith(b'#'):
        self.mobi_html = self.mobi_html[:-1]
    # self.mobi_html is bytes (b''.join above), so every substring test and
    # replacement below must use bytes literals; the original used str
    # literals, which raises TypeError on Python 3.
    if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
        self.mobi_html = self.mobi_html.replace(b'\r ', b'\n\n ')
    self.mobi_html = self.mobi_html.replace(b'\0', b'')
    if self.book_header.codec == 'cp1252':
        self.mobi_html = self.mobi_html.replace(b'\x1e', b'')  # record separator
        self.mobi_html = self.mobi_html.replace(b'\x02', b'')  # start of text
    return processed_records
def load_huff(self, huff):
    """Parse a HUFF record: the code tables used to decode Huffman/CDIC text.

    Populates ``self.dict1`` (256 entries, indexed by the top byte of the
    code stream), ``self.mincode``/``self.maxcode`` (per code length) and
    resets ``self.dictionary`` ready for the CDIC records.

    :raises MobiError: if the record does not carry the HUFF magic.
    """
    if huff[0:8] != b'HUFF\x00\x00\x00\x18':
        raise MobiError('Invalid HUFF header')
    off1, off2 = struct.unpack_from(b'>LL', huff, 8)

    def dict1_unpack(v):
        # v packs: low 5 bits = code length, bit 0x80 = terminal flag,
        # remaining high bits = max code for that prefix.
        codelen, term, maxcode = v & 0x1f, v & 0x80, v >> 8
        assert codelen != 0
        if codelen <= 8:
            assert term
        maxcode = ((maxcode + 1) << (32 - codelen)) - 1
        return (codelen, term, maxcode)
    # tuple(): the original assigned the bare map() iterator, which on
    # Python 3 is one-shot and not subscriptable, breaking decompression.
    self.dict1 = tuple(map(dict1_unpack, struct.unpack_from(b'>256L', huff, off1)))

    dict2 = struct.unpack_from(b'>64L', huff, off2)
    self.mincode, self.maxcode = (), ()
    # Prepend a dummy entry so tuples are indexed directly by code length.
    for codelen, mincode in enumerate((0,) + dict2[0::2]):
        self.mincode += (mincode << (32 - codelen), )
    for codelen, maxcode in enumerate((0,) + dict2[1::2]):
        self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, )

    self.dictionary = []
def update(self, mi, asin=None):
    """Rewrite the EXTH metadata block (and cover/thumbnail records) from *mi*.

    :param mi: a calibre Metadata object supplying the new values.
    :param asin: optional Amazon ASIN; when given it is written to EXTH
        records 113 and 504.
    :raises MobiError: if the file is not BOOKMOBI or has no EXTH record.
    """
    mi.title = normalize(mi.title)

    def update_exth_record(rec):
        # Queue the new record and drop any original with the same id so
        # it is not duplicated when the leftovers are appended below.
        recs.append(rec)
        if rec[0] in self.original_exth_records:
            self.original_exth_records.pop(rec[0])

    if self.type != b"BOOKMOBI":
        raise MobiError(
            "Setting metadata only supported for MOBI files of type 'BOOK'.\n"
            "\tThis is a %r file of type %r" % (self.type[0:4], self.type[4:8]))

    recs = []
    added_501 = False
    try:
        from calibre.ebooks.conversion.config import load_defaults
        prefs = load_defaults('mobi_output')
        pas = prefs.get('prefer_author_sort', False)
        kindle_pdoc = prefs.get('personal_doc', None)
        share_not_sync = prefs.get('share_not_sync', False)
    except:
        # Best effort: fall back to defaults when conversion prefs are
        # unavailable (kept broad deliberately — must never abort update).
        pas = False
        kindle_pdoc = None
        share_not_sync = False

    if mi.author_sort and pas:
        # We want an EXTH field per author...
        authors = mi.author_sort.split(' & ')
        for author in authors:
            update_exth_record(
                (100, normalize(author).encode(self.codec, 'replace')))
    elif mi.authors:
        authors = mi.authors
        for author in authors:
            update_exth_record(
                (100, normalize(author).encode(self.codec, 'replace')))
    if mi.publisher:
        update_exth_record(
            (101, normalize(mi.publisher).encode(self.codec, 'replace')))
    if mi.comments:
        # Strip user annotations
        a_offset = mi.comments.find('<div class="user_annotations">')
        ad_offset = mi.comments.find('<hr class="annotations_divider" />')
        if a_offset >= 0:
            mi.comments = mi.comments[:a_offset]
        if ad_offset >= 0:
            mi.comments = mi.comments[:ad_offset]
        update_exth_record(
            (103, normalize(mi.comments).encode(self.codec, 'replace')))
    if mi.isbn:
        update_exth_record((104, mi.isbn.encode(self.codec, 'replace')))
    if mi.tags:
        # FIXME: Keep a single subject per EXTH field?
        subjects = '; '.join(mi.tags)
        update_exth_record(
            (105, normalize(subjects).encode(self.codec, 'replace')))
        if kindle_pdoc and kindle_pdoc in mi.tags:
            added_501 = True
            update_exth_record((501, b'PDOC'))
    if mi.pubdate:
        update_exth_record(
            (106, unicode_type(mi.pubdate).encode(self.codec, 'replace')))
    elif mi.timestamp:
        update_exth_record(
            (106, unicode_type(mi.timestamp).encode(self.codec, 'replace')))
    elif self.timestamp:
        update_exth_record((106, self.timestamp))
    else:
        update_exth_record(
            (106, nowf().isoformat().encode(self.codec, 'replace')))
    if self.cover_record is not None:
        update_exth_record((201, pack('>I', self.cover_rindex)))
        update_exth_record((203, pack('>I', 0)))
    if self.thumbnail_record is not None:
        update_exth_record((202, pack('>I', self.thumbnail_rindex)))
    # Add a 113 record if not present to allow Amazon syncing.
    # EXTH record payloads are bytes (they are written verbatim into the
    # BytesIO below), so the 501 value must be compared against b'EBOK';
    # the original compared against the str 'EBOK', which never matched.
    if (113 not in self.original_exth_records and
            self.original_exth_records.get(501, None) == b'EBOK' and
            not added_501 and not share_not_sync):
        from uuid import uuid4
        update_exth_record((113, unicode_type(uuid4()).encode(self.codec)))
    if asin is not None:
        update_exth_record((113, asin.encode(self.codec)))
        update_exth_record((504, asin.encode(self.codec)))
    # Add a 112 record with actual UUID
    if getattr(mi, 'uuid', None):
        update_exth_record(
            (112, ("calibre:%s" % mi.uuid).encode(self.codec, 'replace')))
    if 503 in self.original_exth_records:
        update_exth_record((503, mi.title.encode(self.codec, 'replace')))
    # Update book producer
    if getattr(mi, 'book_producer', False):
        update_exth_record(
            (108, mi.book_producer.encode(self.codec, 'replace')))
    # Set langcode in EXTH header
    if not mi.is_null('language'):
        lang = canonicalize_lang(mi.language)
        lang = lang_as_iso639_1(lang) or lang
        if lang:
            update_exth_record((524, lang.encode(self.codec, 'replace')))
    # Include remaining original EXTH fields
    for id in sorted(self.original_exth_records):
        recs.append((id, self.original_exth_records[id]))
    recs = sorted(recs, key=lambda x: (x[0], x[0]))

    # Serialise the EXTH block: per-record headers, then pad to a 4-byte
    # boundary (always at least one NUL).
    exth = io.BytesIO()
    for code, data in recs:
        exth.write(pack('>II', code, len(data) + 8))
        exth.write(data)
    exth = exth.getvalue()
    trail = len(exth) % 4
    pad = b'\0' * (4 - trail)  # Always pad w/ at least 1 byte
    exth = [b'EXTH', pack('>II', len(exth) + 12, len(recs)), exth, pad]
    exth = b''.join(exth)

    if getattr(self, 'exth', None) is None:
        raise MobiError('No existing EXTH record. Cannot update metadata.')

    if not mi.is_null('language'):
        self.record0[92:96] = iana2mobi(mi.language)
    self.create_exth(exth=exth, new_title=mi.title)

    # Fetch updated timestamp, cover_record, thumbnail_record
    self.fetchEXTHFields()

    if mi.cover_data[1] or mi.cover:
        try:
            data = mi.cover_data[1]
            if not data:
                with open(mi.cover, 'rb') as f:
                    data = f.read()
        except:
            # Best effort: a missing/unreadable cover must not abort the
            # metadata update.
            pass
        else:
            if is_image(self.cover_record):
                size = len(self.cover_record)
                cover = rescale_image(data, size)
                if len(cover) <= size:
                    # Records are fixed-size; pad in place rather than resize.
                    cover += b'\0' * (size - len(cover))
                    self.cover_record[:] = cover
            if is_image(self.thumbnail_record):
                size = len(self.thumbnail_record)
                thumbnail = rescale_image(data, size, dimen=MAX_THUMB_DIMEN)
                if len(thumbnail) <= size:
                    thumbnail += b'\0' * (size - len(thumbnail))
                    self.thumbnail_record[:] = thumbnail
        return
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
             try_extra_data_fix=False):
    """Open a MOBI/PRC file and parse its PalmDB structure and book header.

    :param filename_or_stream: path or a readable binary stream.
    :param log: logger used for progress/debug output.
    :param user_encoding: override for the text codec, passed to BookHeader.
    :raises TopazError: for Amazon Topaz books.
    :raises KFXError: for KFX (DRMION) containers.
    :raises MobiError: when the PalmDB type is neither BOOKMOBI nor TEXTREAD.
    """
    self.log = log
    self.debug = debug
    self.embedded_mi = None
    self.warned_about_trailing_entry_corruption = False
    self.base_css_rules = textwrap.dedent('''
            body { text-align: justify }

            blockquote { margin: 0em 0em 0em 2em; }

            p { margin: 0em; text-indent: 1.5em }

            .bold { font-weight: bold }

            .italic { font-style: italic }

            .underline { text-decoration: underline }

            .mbp_pagebreak {
                page-break-after: always; margin: 0; display: block
            }
            ''')
    self.tag_css_rules = {}
    self.left_margins = {}
    self.text_indents = {}

    if hasattr(filename_or_stream, 'read'):
        stream = filename_or_stream
        stream.seek(0)
    else:
        stream = open(filename_or_stream, 'rb')

    raw = stream.read()
    # raw is bytes, so all signature checks below must use bytes literals;
    # the original compared against str, which fails on Python 3.
    if raw.startswith(b'TPZ'):
        raise TopazError(_('This is an Amazon Topaz book. It cannot be processed.'))
    if raw.startswith(b'\xeaDRMION\xee'):
        raise KFXError()

    self.header = raw[0:72]
    self.name = self.header[:32].replace(b'\x00', b'')
    self.num_sections, = struct.unpack('>H', raw[76:78])

    self.ident = self.header[0x3C:0x3C + 8].upper()
    if self.ident not in [b'BOOKMOBI', b'TEXTREAD']:
        raise MobiError('Unknown book type: %s' % repr(self.ident))

    self.sections = []
    self.section_headers = []
    # PDB record index: 8 bytes per entry starting at offset 78
    # (uint32 offset, uint8 flags, 24-bit value).
    for i in range(self.num_sections):
        offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
        flags, val = a1, a2 << 16 | a3 << 8 | a4
        self.section_headers.append((offset, flags, val))

    def section(section_number):
        # A section runs from its offset to the next section's offset
        # (or end of file for the last one).
        if section_number == self.num_sections - 1:
            end_off = len(raw)
        else:
            end_off = self.section_headers[section_number + 1][0]
        off = self.section_headers[section_number][0]
        return raw[off:end_off]

    for i in range(self.num_sections):
        self.sections.append((section(i), self.section_headers[i]))

    self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
                                       user_encoding, self.log,
                                       try_extra_data_fix=try_extra_data_fix)
    self.name = self.name.decode(self.book_header.codec, 'replace')
    self.kf8_type = None
    k8i = getattr(self.book_header.exth, 'kf8_header', None)

    # Ancient PRC files from Baen can have random values for
    # mobi_version, so be conservative
    if (self.book_header.mobi_version == 8 and
            hasattr(self.book_header, 'skelidx')):
        self.kf8_type = 'standalone'
    elif k8i is not None:  # Check for joint mobi 6 and kf 8 file
        try:
            raw = self.sections[k8i - 1][0]
        except:
            raw = None
        if raw == b'BOUNDARY':
            try:
                self.book_header = BookHeader(self.sections[k8i][0],
                                              self.ident, user_encoding,
                                              self.log)
                self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
                self.book_header.mobi6_records = bh.records

                # Need the first_image_index from the mobi 6 header as well
                for x in ('first_image_index',):
                    setattr(self.book_header, x, getattr(bh, x))

                # We need to do this because the MOBI 6 text extract code
                # does not know anything about the kf8 offset
                if hasattr(self.book_header, 'huff_offset'):
                    self.book_header.huff_offset += k8i

                self.kf8_type = 'joint'
                self.kf8_boundary = k8i - 1
            except:
                # Malformed KF8 part: fall back to plain MOBI 6.
                self.book_header = bh
def identity(self):
    """Return the PalmDB type/creator pair (b'BOOKMOBI' or b'TEXTREAD').

    :raises MobiError: for any other book type.
    """
    self.stream.seek(60)
    ident = self.stream.read(8).upper()
    # stream.read() returns bytes, so compare against bytes literals; the
    # original compared against str values and therefore always raised.
    if ident not in [b'BOOKMOBI', b'TEXTREAD']:
        raise MobiError('Unknown book type: %s' % ident)
    return ident