Esempio n. 1
0
 def load_cdic(self, cdic):
     """Append the phrases stored in one CDIC record to ``self.dictionary``.

     After the 8-byte magic the record holds two big-endian 32-bit values
     (total phrase count and a bit width), followed at byte 16 by a table
     of 16-bit offsets.  Each offset is relative to byte 16 and points at
     a 16-bit length word followed by the phrase bytes.

     Every phrase is stored as ``(data, flag)`` where ``data`` is the
     phrase bytes (length taken from the low 15 bits of the length word)
     and ``flag`` is the high bit of the length word, kept as
     ``0x8000`` or ``0``.

     :param cdic: raw bytes of the CDIC record.
     :raises MobiError: if the record does not start with the CDIC magic.
     """
     if cdic[:8] != b'CDIC\x00\x00\x00\x10':
         raise MobiError('Invalid CDIC header')

     phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
     # A record holds at most 2**bits phrases, and never more than are
     # still missing from the dictionary.
     still_missing = phrases - len(self.dictionary)
     count = min(1 << bits, still_missing)

     read_u16 = struct.Struct(b'>H').unpack_from
     offsets = struct.unpack_from(b'>%dH' % count, cdic, 16)

     def entries():
         for off in offsets:
             (length_word,) = read_u16(cdic, 16 + off)
             start = 18 + off
             stop = start + (length_word & 0x7fff)
             yield (cdic[start:stop], length_word & 0x8000)

     self.dictionary += entries()
Esempio n. 2
0
    def __init__(self, stream):
        """Read the header fields needed to rewrite metadata in ``stream``.

        If the 8-byte type at offset 60 is not ``BOOKMOBI`` the constructor
        returns immediately after setting ``stream``, ``data`` and ``type``.

        :param stream: file-like object wrapped in a ``StreamSlicer``.
        :raises MobiError: if the MOBI header length in record 0 is zero, or
            if the book is encrypted and has no EXTH header.
        """
        self.stream = stream
        self.data = StreamSlicer(stream)
        data = self.data
        self.type = data[60:68]

        if self.type != b"BOOKMOBI":
            return

        (self.nrecs,) = unpack('>H', data[76:78])
        self.record0 = self.record(0)
        record0 = self.record0

        (mobi_header_length,) = unpack('>I', record0[0x14:0x18])
        if mobi_header_length == 0:
            raise MobiError(
                "Non-standard file format.  Try 'Convert E-Books' with MOBI as Input and Output formats."
            )

        (self.encryption_type,) = unpack('>H', record0[12:14])
        (codepage,) = unpack('>I', record0[28:32])
        # Code page 65001 selects UTF-8; everything else is read as cp1252.
        if codepage == 65001:
            self.codec = 'utf-8'
        else:
            self.codec = 'cp1252'

        # Decoded but not used further in this method.
        (image_base,) = unpack('>I', record0[108:112])
        (self.flags,) = unpack('>I', record0[128:132])
        # Bit 0x40 of the flags word says whether an EXTH header is present.
        exth_present = (self.flags & 0x40) != 0
        self.have_exth = exth_present

        self.cover_record = None
        self.thumbnail_record = None
        self.timestamp = None
        self.pdbrecords = self.get_pdbrecords()

        self.drm_block = None
        encrypted = self.encryption_type != 0
        if encrypted:
            if not self.have_exth:
                raise MobiError(
                    'Unable to set metadata on DRM file without EXTH header')
            self.drm_block = self.fetchDRMdata()

        self.original_exth_records = {}
        if not exth_present:
            # No EXTH header in the file: create one so it can be filled in.
            self.create_exth()
            self.have_exth = True

        # Populate timestamp, cover_record and thumbnail_record.
        self.fetchEXTHFields()
Esempio n. 3
0
    def extract_text(self, offset=1):
        """Decompress the text records and store the result in ``self.mobi_html``.

        Text records ``offset`` .. ``offset + book_header.records - 1``
        (clamped to the number of sections) are read through
        ``self.text_section``, decompressed according to
        ``book_header.compression_type`` and concatenated.  The joined bytes
        are then cleaned up: one trailing ``#`` is dropped, NUL bytes are
        removed, and for cp1252 books the 0x1e and 0x02 control bytes are
        removed too.

        ``self.mobi_html`` is bytes, so every search/replace argument below
        must be bytes as well; str arguments raise ``TypeError`` on Python 3.

        :param offset: index of the first text record.
        :returns: list of record indices consumed (text records, the record
            just before them, and any HUFF/CDIC records).
        :raises MobiError: for an unrecognised compression type.
        """
        self.log.debug('Extracting text...')
        header = self.book_header
        last = min(header.records + offset, len(self.sections))
        text_sections = [self.text_section(i) for i in range(offset, last)]
        processed_records = list(range(offset - 1, header.records + offset))

        self.mobi_html = b''

        # The header may expose the 2-byte compression marker as bytes or as
        # str; normalise to bytes so both spellings are recognised.
        compression = header.compression_type
        if isinstance(compression, str):
            compression = compression.encode('latin-1', 'replace')

        if compression == b'DH':
            first_huff = header.huff_offset
            huff_records = range(first_huff, first_huff + header.huff_number)
            huffs = [self.sections[i][0] for i in huff_records]
            processed_records += list(huff_records)
            unpack = HuffReader(huffs).unpack
        elif compression == b'\x00\x02':
            unpack = decompress_doc
        elif compression == b'\x00\x01':
            unpack = None  # stored uncompressed
        else:
            raise MobiError('Unknown compression algorithm: %s' %
                            repr(header.compression_type))

        if unpack is not None:
            text_sections = map(unpack, text_sections)
        html = b''.join(text_sections)

        if html.endswith(b'#'):
            html = html[:-1]

        if header.ancient and b'<html' not in html[:300].lower():
            html = html.replace(b'\r ', b'\n\n ')
        html = html.replace(b'\0', b'')
        if header.codec == 'cp1252':
            html = html.replace(b'\x1e', b'')  # record separator
            html = html.replace(b'\x02', b'')  # start of text

        self.mobi_html = html
        return processed_records
Esempio n. 4
0
    def load_huff(self, huff):
        """Load the decoding tables from a HUFF record.

        Sets ``self.dict1`` (256 ``(codelen, term, maxcode)`` tuples),
        ``self.mincode`` and ``self.maxcode`` (33 entries each, indexed by
        code length 0..32, left-aligned to 32 bits) and resets
        ``self.dictionary`` to an empty list.

        ``dict1`` is materialised as a tuple: on Python 3 a bare ``map``
        object would be a one-shot iterator that cannot be indexed, and the
        sanity checks inside ``dict1_unpack`` would not run until someone
        iterated it.

        :param huff: raw bytes of the HUFF record.
        :raises MobiError: if the record does not start with the HUFF magic.
        :raises AssertionError: if a ``dict1`` entry has a zero code length,
            or a code length <= 8 without the terminal flag.
        """
        if huff[0:8] != b'HUFF\x00\x00\x00\x18':
            raise MobiError('Invalid HUFF header')
        # Byte offsets of the two tables within the record.
        off1, off2 = struct.unpack_from(b'>LL', huff, 8)

        def dict1_unpack(v):
            # Low 5 bits: code length; bit 0x80: terminal flag; the upper
            # 24 bits: maximum code, stored right-aligned.
            codelen, term, maxcode = v & 0x1f, v & 0x80, v >> 8
            assert codelen != 0
            if codelen <= 8:
                assert term
            maxcode = ((maxcode + 1) << (32 - codelen)) - 1
            return (codelen, term, maxcode)

        self.dict1 = tuple(
            map(dict1_unpack, struct.unpack_from(b'>256L', huff, off1)))

        # dict2 interleaves (mincode, maxcode) pairs for code lengths 1..32;
        # a dummy entry is prepended so the tables can be indexed by length.
        dict2 = struct.unpack_from(b'>64L', huff, off2)
        self.mincode = tuple(
            mincode << (32 - codelen)
            for codelen, mincode in enumerate((0,) + dict2[0::2]))
        self.maxcode = tuple(
            ((maxcode + 1) << (32 - codelen)) - 1
            for codelen, maxcode in enumerate((0,) + dict2[1::2]))

        self.dictionary = []
Esempio n. 5
0
    def update(self, mi, asin=None):
        """Rebuild the EXTH block from ``mi`` and refresh cover/thumbnail images.

        Records generated from ``mi`` replace original records with the same
        code; every remaining entry of ``self.original_exth_records`` is
        carried over unchanged.  The serialised block is handed to
        ``self.create_exth`` together with the new title.

        :param mi: metadata object.  ``title``, ``author_sort``, ``authors``,
            ``publisher``, ``comments``, ``isbn``, ``tags``, ``pubdate``,
            ``timestamp``, ``uuid``, ``book_producer``, ``language``,
            ``cover`` and ``cover_data`` are read.  ``mi.title`` is
            normalized in place and ``mi.comments`` is truncated at the
            user-annotation markers.
        :param asin: optional ASIN string, written to EXTH 113 and 504.
        :raises MobiError: if the file is not ``BOOKMOBI`` or there is no
            existing EXTH record to replace.
        """
        mi.title = normalize(mi.title)

        def update_exth_record(rec):
            # A freshly generated record supersedes the original with the
            # same code, so drop the original to avoid writing both.
            recs.append(rec)
            self.original_exth_records.pop(rec[0], None)

        def encode(text):
            return text.encode(self.codec, 'replace')

        if self.type != b"BOOKMOBI":
            raise MobiError(
                "Setting metadata only supported for MOBI files of type 'BOOK'.\n"
                "\tThis is a %r file of type %r" %
                (self.type[0:4], self.type[4:8]))

        recs = []
        added_501 = False
        try:
            from calibre.ebooks.conversion.config import load_defaults
            prefs = load_defaults('mobi_output')
            pas = prefs.get('prefer_author_sort', False)
            kindle_pdoc = prefs.get('personal_doc', None)
            share_not_sync = prefs.get('share_not_sync', False)
        except Exception:
            # Preferences are optional; fall back to defaults if they cannot
            # be loaded for any reason.
            pas = False
            kindle_pdoc = None
            share_not_sync = False

        # 100: one EXTH field per author.
        if mi.author_sort and pas:
            authors = mi.author_sort.split(' & ')
        elif mi.authors:
            authors = mi.authors
        else:
            authors = ()
        for author in authors:
            update_exth_record((100, encode(normalize(author))))

        if mi.publisher:
            update_exth_record((101, encode(normalize(mi.publisher))))

        if mi.comments:
            # Strip user annotations
            a_offset = mi.comments.find('<div class="user_annotations">')
            ad_offset = mi.comments.find('<hr class="annotations_divider" />')
            if a_offset >= 0:
                mi.comments = mi.comments[:a_offset]
            if ad_offset >= 0:
                mi.comments = mi.comments[:ad_offset]
            update_exth_record((103, encode(normalize(mi.comments))))

        if mi.isbn:
            update_exth_record((104, encode(mi.isbn)))

        if mi.tags:
            # FIXME: Keep a single subject per EXTH field?
            subjects = '; '.join(mi.tags)
            update_exth_record((105, encode(normalize(subjects))))

            if kindle_pdoc and kindle_pdoc in mi.tags:
                added_501 = True
                update_exth_record((501, b'PDOC'))

        # 106: first available of pubdate, timestamp, the timestamp already
        # stored in the file, or the current time.
        if mi.pubdate:
            update_exth_record((106, encode(unicode_type(mi.pubdate))))
        elif mi.timestamp:
            update_exth_record((106, encode(unicode_type(mi.timestamp))))
        elif self.timestamp:
            update_exth_record((106, self.timestamp))
        else:
            update_exth_record((106, encode(nowf().isoformat())))

        if self.cover_record is not None:
            update_exth_record((201, pack('>I', self.cover_rindex)))
            update_exth_record((203, pack('>I', 0)))
        if self.thumbnail_record is not None:
            update_exth_record((202, pack('>I', self.thumbnail_rindex)))

        # Add a 113 record if not present to allow Amazon syncing.
        # EXTH payloads are bytes (they are written straight into a BytesIO
        # below), so the comparison must use a bytes literal.
        if (113 not in self.original_exth_records
                and self.original_exth_records.get(501, None) == b'EBOK'
                and not added_501 and not share_not_sync):
            from uuid import uuid4
            update_exth_record((113, unicode_type(uuid4()).encode(self.codec)))

        if asin is not None:
            update_exth_record((113, asin.encode(self.codec)))
            update_exth_record((504, asin.encode(self.codec)))

        # Add a 112 record with actual UUID
        if getattr(mi, 'uuid', None):
            update_exth_record((112, encode("calibre:%s" % mi.uuid)))
        # 503 is only rewritten when the file already carried one.
        if 503 in self.original_exth_records:
            update_exth_record((503, encode(mi.title)))

        # Update book producer
        if getattr(mi, 'book_producer', False):
            update_exth_record((108, encode(mi.book_producer)))

        # Set langcode in EXTH header
        if not mi.is_null('language'):
            lang = canonicalize_lang(mi.language)
            lang = lang_as_iso639_1(lang) or lang
            if lang:
                update_exth_record((524, encode(lang)))

        # Include remaining original EXTH fields
        for code in sorted(self.original_exth_records):
            recs.append((code, self.original_exth_records[code]))
        # Stable sort by record code keeps repeated codes (e.g. several
        # authors) in insertion order.
        recs.sort(key=lambda rec: rec[0])

        # Each record is: code (4 bytes), total length (4 bytes), payload.
        body = io.BytesIO()
        for code, data in recs:
            body.write(pack('>II', code, len(data) + 8))
            body.write(data)
        body = body.getvalue()
        trail = len(body) % 4
        pad = b'\0' * (4 - trail)  # Always pad w/ at least 1 byte
        exth = b''.join(
            [b'EXTH', pack('>II', len(body) + 12, len(recs)), body, pad])

        if getattr(self, 'exth', None) is None:
            raise MobiError('No existing EXTH record. Cannot update metadata.')

        if not mi.is_null('language'):
            self.record0[92:96] = iana2mobi(mi.language)
        self.create_exth(exth=exth, new_title=mi.title)

        # Fetch updated timestamp, cover_record, thumbnail_record
        self.fetchEXTHFields()

        if not (mi.cover_data[1] or mi.cover):
            return

        try:
            data = mi.cover_data[1]
            if not data:
                with open(mi.cover, 'rb') as f:
                    data = f.read()
        except Exception:
            # Best effort: if the cover cannot be read, leave the images in
            # the file untouched.
            return

        # Images are replaced in place, so the new data must fit into the
        # existing record; it is NUL-padded up to the old size.
        if is_image(self.cover_record):
            size = len(self.cover_record)
            cover = rescale_image(data, size)
            if len(cover) <= size:
                cover += b'\0' * (size - len(cover))
                self.cover_record[:] = cover
        if is_image(self.thumbnail_record):
            size = len(self.thumbnail_record)
            thumbnail = rescale_image(data, size, dimen=MAX_THUMB_DIMEN)
            if len(thumbnail) <= size:
                thumbnail += b'\0' * (size - len(thumbnail))
                self.thumbnail_record[:] = thumbnail
Esempio n. 6
0
    def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
            try_extra_data_fix=False):
        """Read a MOBI/PRC container and parse its book header(s).

        The whole file is read into memory, split into sections using the
        record table that starts at byte 78, and section 0 is parsed as a
        ``BookHeader``.  If the EXTH data points at a KF8 header that is
        preceded by a ``BOUNDARY`` section, the KF8 header replaces
        ``self.book_header`` and ``self.kf8_type`` becomes ``'joint'``.

        The file content is bytes, so every marker it is compared against
        is a bytes literal.

        :param filename_or_stream: path, or an object with ``read``/``seek``.
            A stream passed in is rewound and left open; a path is opened
            and closed here.
        :param log: logger, stored and passed on to ``BookHeader``.
        :param user_encoding: passed on to ``BookHeader``.
        :param debug: stored as ``self.debug``.
        :param try_extra_data_fix: passed on to the first ``BookHeader``.
        :raises TopazError: if the data starts with ``TPZ``.
        :raises KFXError: if the data starts with the KFX DRMION marker.
        :raises MobiError: if the type field is neither ``BOOKMOBI`` nor
            ``TEXTREAD``.
        """
        self.log = log
        self.debug = debug
        self.embedded_mi = None
        self.warned_about_trailing_entry_corruption = False
        self.base_css_rules = textwrap.dedent('''
                body { text-align: justify }

                blockquote { margin: 0em 0em 0em 2em; }

                p { margin: 0em; text-indent: 1.5em }

                .bold { font-weight: bold }

                .italic { font-style: italic }

                .underline { text-decoration: underline }

                .mbp_pagebreak {
                    page-break-after: always; margin: 0; display: block
                }
                ''')
        self.tag_css_rules = {}
        self.left_margins = {}
        self.text_indents = {}

        if hasattr(filename_or_stream, 'read'):
            # Caller-owned stream: rewind it, but leave closing to the caller.
            filename_or_stream.seek(0)
            raw = filename_or_stream.read()
        else:
            # We opened it, so we close it.
            with open(filename_or_stream, 'rb') as stream:
                raw = stream.read()

        if raw.startswith(b'TPZ'):
            raise TopazError(_('This is an Amazon Topaz book. It cannot be processed.'))
        if raw.startswith(b'\xeaDRMION\xee'):
            raise KFXError()

        self.header   = raw[0:72]
        self.name     = self.header[:32].replace(b'\x00', b'')
        self.num_sections, = struct.unpack('>H', raw[76:78])

        self.ident = self.header[0x3C:0x3C + 8].upper()
        if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
            raise MobiError('Unknown book type: %s' % repr(self.ident))

        # Record table: 8 bytes per section = 32-bit offset, 8-bit flags and
        # a 24-bit value.
        self.sections = []
        self.section_headers = []
        for i in range(self.num_sections):
            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
            flags, val = a1, a2 << 16 | a3 << 8 | a4
            self.section_headers.append((offset, flags, val))

        # A section runs up to the start of the next one; the last section
        # runs to the end of the file.
        last = self.num_sections - 1
        for i, section_header in enumerate(self.section_headers):
            start = section_header[0]
            end = len(raw) if i == last else self.section_headers[i + 1][0]
            self.sections.append((raw[start:end], section_header))

        self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
            user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
        self.name = self.name.decode(self.book_header.codec, 'replace')
        self.kf8_type = None
        k8i = getattr(self.book_header.exth, 'kf8_header', None)

        # Ancient PRC files from Baen can have random values for
        # mobi_version, so be conservative
        if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
            'skelidx')):
            self.kf8_type = 'standalone'
        elif k8i is not None:  # Check for joint mobi 6 and kf 8 file
            try:
                boundary = self.sections[k8i-1][0]
            except Exception:
                boundary = None
            if boundary == b'BOUNDARY':
                try:
                    self.book_header = BookHeader(self.sections[k8i][0],
                            self.ident, user_encoding, self.log)
                    self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
                    self.book_header.mobi6_records = bh.records

                    # Need the first_image_index from the mobi 6 header as well
                    self.book_header.first_image_index = bh.first_image_index

                    # We need to do this because the MOBI 6 text extract code
                    # does not know anything about the kf8 offset
                    if hasattr(self.book_header, 'huff_offset'):
                        self.book_header.huff_offset += k8i

                    self.kf8_type = 'joint'
                    self.kf8_boundary = k8i-1
                except Exception:
                    # Unusable KF8 header: fall back to the MOBI 6 header.
                    self.book_header = bh
Esempio n. 7
0
 def identity(self):
     """Return the upper-cased 8-byte book type stored at offset 60.

     The value is compared against literals of the same type as the data
     read, so a binary stream (which yields bytes on Python 3) is
     recognised as well as a text stream.

     :returns: the identifier as read from ``self.stream``, upper-cased.
     :raises MobiError: if it is neither ``BOOKMOBI`` nor ``TEXTREAD``.
     """
     self.stream.seek(60)
     ident = self.stream.read(8).upper()
     if isinstance(ident, bytes):
         known = (b'BOOKMOBI', b'TEXTREAD')
     else:
         known = ('BOOKMOBI', 'TEXTREAD')
     if ident not in known:
         raise MobiError('Unknown book type: %s' % ident)
     return ident