Example #1
0
    def load_cdic(self, cdic):
        """Append the phrases found in one CDIC record to ``self.dictionary``.

        Every appended entry is a ``(phrase_bytes, flag)`` tuple, where
        ``flag`` is the top bit (0x8000) of the phrase's 16-bit length word
        and the low 15 bits give the phrase length.

        Raises MobiError when the record does not begin with the expected
        8-byte CDIC header.
        """
        if cdic[:8] != b'CDIC\x00\x00\x00\x10':
            raise MobiError('Invalid CDIC header')
        phrase_total, code_bits = struct.unpack_from(b'>LL', cdic, 8)
        # The entry count is capped both by 2**bits and by the number of
        # phrases that are still missing from the dictionary.
        count = min(1 << code_bits, phrase_total - len(self.dictionary))
        read_u16 = struct.Struct(b'>H').unpack_from

        def phrase_at(rel):
            # Offsets in the table are relative to byte 16 of the record.
            start = 16 + rel
            length_word = read_u16(cdic, start)[0]
            data_start = start + 2
            payload = cdic[data_start:data_start + (length_word & 0x7fff)]
            return (payload, length_word & 0x8000)

        offsets = struct.unpack_from(b'>%dH' % count, cdic, 16)
        self.dictionary += (phrase_at(rel) for rel in offsets)
Example #2
0
    def extract_text(self, offset=1):
        """Decompress the text records and store the result in ``mobi_html``.

        ``offset`` is the index of the first text record. The record range is
        clipped to the number of available sections. After decompression a
        single trailing ``#`` is dropped, NUL bytes are removed and, for
        cp1252 books, the 0x1e and 0x02 control bytes are stripped as well.

        Returns the list of record indices that were consumed (text records,
        plus the huff records when the book uses 'DH' compression).

        Raises MobiError for an unrecognised compression type.
        """
        self.log.debug('Extracting text...')
        header = self.book_header
        stop = min(header.records + offset, len(self.sections))
        text_sections = [self.text_section(idx) for idx in range(offset, stop)]
        processed_records = list(range(offset - 1, header.records + offset))

        self.mobi_html = b''

        compression = header.compression_type
        if compression == b'DH':
            huff_records = range(header.huff_offset,
                                 header.huff_offset + header.huff_number)
            huffs = [self.sections[idx][0] for idx in huff_records]
            processed_records.extend(huff_records)
            unpack = HuffReader(huffs).unpack
        elif compression == b'\x00\x02':
            unpack = decompress_doc
        elif compression == b'\x00\x01':
            # Records are stored uncompressed.
            def unpack(data):
                return data
        else:
            raise MobiError('Unknown compression algorithm: %r' %
                            compression)

        html = b''.join(map(unpack, text_sections))
        if html.endswith(b'#'):
            html = html[:-1]
        self.mobi_html = html

        looks_like_html = b'<html' in html[:300].lower()
        if header.ancient and not looks_like_html:
            html = html.replace(b'\r ', b'\n\n ')
        html = html.replace(b'\0', b'')
        self.mobi_html = html

        if header.codec == 'cp1252':
            # 0x1e is the record separator, 0x02 the start-of-text marker.
            for control in (b'\x1e', b'\x02'):
                html = html.replace(control, b'')
            self.mobi_html = html
        return processed_records
Example #3
0
    def load_huff(self, huff):
        """Parse a HUFF record into the decoder lookup tables.

        Sets ``self.dict1`` (256 ``(codelen, term, maxcode)`` tuples),
        ``self.mincode`` and ``self.maxcode`` (33 entries each, indexed by
        code length and shifted left to fill 32 bits) and resets
        ``self.dictionary`` to an empty list.

        Raises MobiError when the record does not begin with the expected
        8-byte HUFF header.
        """
        if huff[:8] != b'HUFF\x00\x00\x00\x18':
            raise MobiError('Invalid HUFF header')
        table1_off, table2_off = struct.unpack_from(b'>LL', huff, 8)

        def decode_entry(word):
            # Low 5 bits: code length; bit 7: terminal flag; the bits above
            # the low byte hold the maximum code for this entry.
            codelen = word & 0x1f
            term = word & 0x80
            assert codelen != 0
            if codelen <= 8:
                assert term
            upper = (((word >> 8) + 1) << (32 - codelen)) - 1
            return (codelen, term, upper)

        packed_dict1 = struct.unpack_from(b'>256L', huff, table1_off)
        self.dict1 = tuple(decode_entry(word) for word in packed_dict1)

        # The second table interleaves (mincode, maxcode) pairs; index 0 is
        # padded with a zero so positions line up with code lengths.
        pairs = struct.unpack_from(b'>64L', huff, table2_off)
        lows = (0, ) + pairs[0::2]
        highs = (0, ) + pairs[1::2]
        self.mincode = tuple(
            low << (32 - bits) for bits, low in enumerate(lows))
        self.maxcode = tuple(
            ((high + 1) << (32 - bits)) - 1 for bits, high in enumerate(highs))

        self.dictionary = []
Example #4
0
 def identity(self):
     """Return the upper-cased 8-byte book identifier stored at offset 60.

     Raises MobiError when the identifier is neither BOOKMOBI nor TEXTREAD.
     """
     stream = self.stream
     stream.seek(60)
     magic = stream.read(8).upper()
     if magic in (b'BOOKMOBI', b'TEXTREAD'):
         return magic
     raise MobiError('Unknown book type: %s' % magic)
Example #5
0
    def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
                 try_extra_data_fix=False):
        """Load a book, split it into sections and parse its book header.

        :param filename_or_stream: Either a path, which is opened in binary
            mode and closed again once read, or an object with a ``read``
            attribute, which is rewound with ``seek(0)``, read in full and
            left open for the caller.
        :param log: Logger stored as ``self.log`` and passed to
            ``BookHeader``.
        :param user_encoding: Passed through to ``BookHeader``.
        :param debug: Stored as ``self.debug``.
        :param try_extra_data_fix: Passed through to the ``BookHeader`` that
            is built from section 0.
        :raises TopazError: If the data starts with ``TPZ``.
        :raises KFXError: If the data starts with the DRMION signature.
        :raises MobiError: If the 8-byte identifier at offset 0x3C is neither
            ``BOOKMOBI`` nor ``TEXTREAD``.
        """
        self.log = log
        self.debug = debug
        self.embedded_mi = None
        self.warned_about_trailing_entry_corruption = False
        self.base_css_rules = textwrap.dedent('''
                body { text-align: justify }

                blockquote { margin: 0em 0em 0em 2em; }

                p { margin: 0em; text-indent: 1.5em }

                .bold { font-weight: bold }

                .italic { font-style: italic }

                .underline { text-decoration: underline }

                .mbp_pagebreak {
                    page-break-after: always; margin: 0; display: block
                }
                ''')
        self.tag_css_rules = {}
        self.left_margins = {}
        self.text_indents = {}

        if hasattr(filename_or_stream, 'read'):
            # Caller-owned stream: rewind and read it, but leave it open.
            filename_or_stream.seek(0)
            raw = filename_or_stream.read()
        else:
            # We opened this file ourselves, so release the handle as soon
            # as the contents are in memory.
            with open(filename_or_stream, 'rb') as stream:
                raw = stream.read()

        if raw.startswith(b'TPZ'):
            raise TopazError('This is an Amazon Topaz book. It cannot be '
                             'processed.')
        if raw.startswith(b'\xeaDRMION\xee'):
            raise KFXError()

        self.header = raw[0:72]
        self.name = self.header[:32].replace(b'\x00', b'')
        self.num_sections, = struct.unpack_from('>H', raw, 76)

        self.ident = self.header[0x3C:0x3C + 8].upper()
        if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
            raise MobiError('Unknown book type: %s' % repr(self.ident))

        # The section table starts at byte 78: one 8-byte entry per section,
        # holding a 32-bit offset, a flags byte and a 24-bit value.
        self.sections = []
        self.section_headers = []
        for i in range(self.num_sections):
            offset, flags, hi, mid, lo = struct.unpack_from('>LBBBB', raw,
                                                            78 + i * 8)
            val = hi << 16 | mid << 8 | lo
            self.section_headers.append((offset, flags, val))

        # A section runs up to the start of the next one; the last section
        # runs to the end of the data.
        last = self.num_sections - 1
        for i, section_header in enumerate(self.section_headers):
            if i == last:
                end_off = len(raw)
            else:
                end_off = self.section_headers[i + 1][0]
            self.sections.append((raw[section_header[0]:end_off],
                                  section_header))

        bh = BookHeader(self.sections[0][0], self.ident, user_encoding,
                        self.log, try_extra_data_fix=try_extra_data_fix)
        self.book_header = bh
        self.name = self.name.decode(self.book_header.codec, 'replace')
        self.kf8_type = None
        k8i = getattr(self.book_header.exth, 'kf8_header', None)

        # Ancient PRC files from Baen can have random values for
        # mobi_version, so be conservative
        if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
                                                           'skelidx')):
            self.kf8_type = 'standalone'
        elif k8i is not None:  # Check for joint mobi 6 and kf 8 file
            # Best effort: any failure to fetch the preceding section simply
            # means this is not treated as a joint file.
            try:
                boundary = self.sections[k8i-1][0]
            except Exception:
                boundary = None
            if boundary == b'BOUNDARY':
                try:
                    self.book_header = BookHeader(self.sections[k8i][0],
                                                  self.ident, user_encoding,
                                                  self.log)
                    _kfii = self.book_header.first_image_index + k8i
                    self.book_header.kf8_first_image_index = _kfii
                    self.book_header.mobi6_records = bh.records

                    # Need the first_image_index from the mobi 6 header as well
                    self.book_header.first_image_index = bh.first_image_index

                    # We need to do this because the MOBI 6 text extract code
                    # does not know anything about the kf8 offset
                    if hasattr(self.book_header, 'huff_offset'):
                        self.book_header.huff_offset += k8i

                    self.kf8_type = 'joint'
                    self.kf8_boundary = k8i-1
                except Exception:
                    # Best effort: fall back to the MOBI 6 header if the KF8
                    # header cannot be parsed.
                    self.book_header = bh