def load_cdic(self, cdic):
    """Read one CDIC record and extend ``self.dictionary`` with its phrases.

    Each dictionary entry is a ``(bytes, flag)`` pair, where ``flag`` is the
    high bit of the stored length word (0x8000 when set, else 0).
    """
    if not cdic.startswith(b'CDIC\x00\x00\x00\x10'):
        raise MobiError('Invalid CDIC header')
    phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
    # A record holds at most 1 << bits phrases; the final record may hold fewer.
    count = min(1 << bits, phrases - len(self.dictionary))
    read_word = struct.Struct(b'>H').unpack_from
    entries = []
    # The offset table at byte 16 holds `count` offsets, each relative to
    # byte 16; every entry is a length word followed by the phrase bytes.
    for off in struct.unpack_from(b'>%dH' % count, cdic, 16):
        word, = read_word(cdic, 16 + off)
        length = word & 0x7fff
        entries.append((cdic[18 + off:18 + off + length], word & 0x8000))
    self.dictionary += entries
def extract_text(self, offset=1):
    """Decompress all text records into ``self.mobi_html``.

    :param offset: index of the first text record (normally 1, i.e. just
        after the header record)
    :return: list of section indices consumed (text records plus any
        HUFF/CDIC table records), so callers can skip them later
    :raises MobiError: if the header declares an unknown compression type
    """
    self.log.debug('Extracting text...')
    last = min(self.book_header.records + offset, len(self.sections))
    text_sections = [self.text_section(i) for i in range(offset, last)]
    processed_records = list(range(offset - 1, self.book_header.records + offset))
    self.mobi_html = b''

    compression = self.book_header.compression_type
    if compression == b'DH':
        # HUFF/CDIC compression: the huffman tables live in their own records.
        huff_start = self.book_header.huff_offset
        huff_end = huff_start + self.book_header.huff_number
        huffs = [self.sections[i][0] for i in range(huff_start, huff_end)]
        processed_records += list(range(huff_start, huff_end))
        unpack = HuffReader(huffs).unpack
    elif compression == b'\x00\x02':
        unpack = decompress_doc
    elif compression == b'\x00\x01':
        def unpack(section):  # stored uncompressed
            return section
    else:
        raise MobiError('Unknown compression algorithm: %r' % compression)

    self.mobi_html = b''.join(map(unpack, text_sections))
    if self.mobi_html.endswith(b'#'):
        self.mobi_html = self.mobi_html[:-1]

    if (self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower()):
        self.mobi_html = self.mobi_html.replace(b'\r ', b'\n\n ')
    self.mobi_html = self.mobi_html.replace(b'\0', b'')
    if self.book_header.codec == 'cp1252':
        # record separator
        self.mobi_html = self.mobi_html.replace(b'\x1e', b'')
        # start of text
        self.mobi_html = self.mobi_html.replace(b'\x02', b'')
    return processed_records
def load_huff(self, huff):
    """Parse a HUFF record into the code tables used for decompression.

    Populates ``self.dict1`` (256 per-byte entries), ``self.mincode`` and
    ``self.maxcode`` (indexed by code length, 0..32), and resets
    ``self.dictionary``.
    """
    if not huff.startswith(b'HUFF\x00\x00\x00\x18'):
        raise MobiError('Invalid HUFF header')
    off1, off2 = struct.unpack_from(b'>LL', huff, 8)

    def decode_entry(value):
        # Packed as: low 5 bits code length, bit 7 terminal flag,
        # remaining high bits the raw max code.
        codelen = value & 0x1f
        term = value & 0x80
        maxcode = value >> 8
        assert codelen != 0
        if codelen <= 8:
            assert term
        maxcode = ((maxcode + 1) << (32 - codelen)) - 1
        return (codelen, term, maxcode)

    self.dict1 = tuple(decode_entry(v) for v in struct.unpack_from(b'>256L', huff, off1))

    dict2 = struct.unpack_from(b'>64L', huff, off2)
    # Interleaved (mincode, maxcode) pairs; prepend a dummy entry so the
    # tuples are indexed directly by code length.
    mincodes = (0,) + dict2[0::2]
    maxcodes = (0,) + dict2[1::2]
    self.mincode = tuple(code << (32 - i) for i, code in enumerate(mincodes))
    self.maxcode = tuple(((code + 1) << (32 - i)) - 1 for i, code in enumerate(maxcodes))
    self.dictionary = []
def identity(self):
    """Return the upper-cased 8-byte PalmDB type identifier.

    :raises MobiError: if the stream is not a BOOKMOBI/TEXTREAD file
    """
    # The type/creator identifier lives at offset 60 of the PalmDB header.
    self.stream.seek(60)
    ident = self.stream.read(8).upper()
    if ident in (b'BOOKMOBI', b'TEXTREAD'):
        return ident
    raise MobiError('Unknown book type: %s' % ident)
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None, try_extra_data_fix=False):
    """Read a MOBI/PalmDoc container, split it into sections and parse its header.

    :param filename_or_stream: path to the book, or a seekable binary stream
    :param log: logger object (must provide at least ``debug``)
    :param user_encoding: optional codec overriding the one declared in the header
    :param debug: opaque debug flag, stored on the instance
    :param try_extra_data_fix: forwarded to ``BookHeader``
    :raises TopazError: for Amazon Topaz files, which this reader cannot process
    :raises KFXError: for KFX (DRMION) containers
    :raises MobiError: when the PalmDB type is not BOOKMOBI/TEXTREAD
    """
    self.log = log
    self.debug = debug
    self.embedded_mi = None
    self.warned_about_trailing_entry_corruption = False
    # Baseline CSS applied to every converted book.
    self.base_css_rules = textwrap.dedent(''' body { text-align: justify } blockquote { margin: 0em 0em 0em 2em; } p { margin: 0em; text-indent: 1.5em } .bold { font-weight: bold } .italic { font-style: italic } .underline { text-decoration: underline } .mbp_pagebreak { page-break-after: always; margin: 0; display: block } ''')
    self.tag_css_rules = {}
    self.left_margins = {}
    self.text_indents = {}

    # Accept either an already-open stream or a filesystem path.
    if hasattr(filename_or_stream, 'read'):
        stream = filename_or_stream
        stream.seek(0)
    else:
        stream = open(filename_or_stream, 'rb')

    raw = stream.read()
    # Reject container formats this reader cannot handle.
    if raw.startswith(b'TPZ'):
        raise TopazError('This is an Amazon Topaz book. It cannot be ' 'processed.')
    if raw.startswith(b'\xeaDRMION\xee'):
        raise KFXError()

    # PalmDB header: 32-byte NUL-padded name, section count at offset 76,
    # 8-byte type/creator identifier at offset 0x3C.
    self.header = raw[0:72]
    self.name = self.header[:32].replace(b'\x00', b'')
    self.num_sections, = struct.unpack('>H', raw[76:78])
    self.ident = self.header[0x3C:0x3C + 8].upper()
    if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
        raise MobiError('Unknown book type: %s' % repr(self.ident))

    self.sections = []
    self.section_headers = []
    # Each 8-byte section-table entry is a 4-byte offset, a flags byte,
    # and a 3-byte value packed into the remaining bytes.
    for i in range(self.num_sections):
        offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
        flags, val = a1, a2 << 16 | a3 << 8 | a4
        self.section_headers.append((offset, flags, val))

    def section(section_number):
        # A section spans from its own offset up to the next section's
        # offset (or to the end of the file for the last section).
        if section_number == self.num_sections - 1:
            end_off = len(raw)
        else:
            end_off = self.section_headers[section_number + 1][0]
        off = self.section_headers[section_number][0]
        return raw[off:end_off]

    for i in range(self.num_sections):
        self.sections.append((section(i), self.section_headers[i]))

    bh = BookHeader(self.sections[0][0], self.ident, user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
    self.book_header = bh
    self.name = self.name.decode(self.book_header.codec, 'replace')
    self.kf8_type = None
    k8i = getattr(self.book_header.exth, 'kf8_header', None)

    # Ancient PRC files from Baen can have random values for
    # mobi_version, so be conservative
    if (self.book_header.mobi_version == 8 and hasattr(self.book_header, 'skelidx')):
        self.kf8_type = 'standalone'
    elif k8i is not None:  # Check for joint mobi 6 and kf 8 file
        # NOTE: `raw` is deliberately rebound here to the candidate
        # boundary record; the full file contents are no longer needed.
        try:
            raw = self.sections[k8i-1][0]
        except Exception:
            raw = None
        if raw == b'BOUNDARY':
            try:
                # Re-parse the header from the KF8 half of the joint file.
                self.book_header = BookHeader(self.sections[k8i][0], self.ident, user_encoding, self.log)
                _kfii = self.book_header.first_image_index + k8i
                self.book_header.kf8_first_image_index = _kfii
                self.book_header.mobi6_records = bh.records

                # Need the first_image_index from the mobi 6 header as well
                for x in ('first_image_index',):
                    setattr(self.book_header, x, getattr(bh, x))

                # We need to do this because the MOBI 6 text extract code
                # does not know anything about the kf8 offset
                if hasattr(self.book_header, 'huff_offset'):
                    self.book_header.huff_offset += k8i

                self.kf8_type = 'joint'
                self.kf8_boundary = k8i-1
            except Exception:
                # KF8 header looks broken; fall back to the MOBI 6 header.
                self.book_header = bh