def __init__(self, record0, offset):
    """Decode the PalmDOC/MOBI header fields found in ``record0.raw``.

    :param record0: object whose ``raw`` attribute holds the bytes of the
        header record.
    :param offset: index of the header record. The record pointers named
        in ``self.relative_records`` are stored relative to the header
        and are rewritten as ``offset + value``.
    :raises ValueError: if bytes 16-20 are not ``b'MOBI'``, or if a header
        with ``length >= 248`` has a meta orth index that is neither
        ``NULL_INDEX`` nor equal to the section index.
    """
    self.raw = record0.raw
    self.header_offset = offset

    # --- Bytes 0-16: compression, text layout and encryption ---
    self.compression_raw = self.raw[:2]
    self.compression = {
        1: 'No compression',
        2: 'PalmDoc compression',
        17480: 'HUFF/CDIC compression',
    }.get(struct.unpack(b'>H', self.compression_raw)[0],
          repr(self.compression_raw))
    self.unused = self.raw[2:4]
    self.text_length, = struct.unpack(b'>I', self.raw[4:8])
    self.number_of_text_records, self.text_record_size = \
        struct.unpack(b'>HH', self.raw[8:12])
    self.encryption_type_raw, = struct.unpack(b'>H', self.raw[12:14])
    self.encryption_type = {
        0: 'No encryption',
        1: 'Old mobipocket encryption',
        2: 'Mobipocket encryption',
    }.get(self.encryption_type_raw, repr(self.encryption_type_raw))
    self.unknown = self.raw[14:16]

    # --- Bytes 16 onwards: the MOBI header proper ---
    self.identifier = self.raw[16:20]
    if self.identifier != b'MOBI':
        raise ValueError('Identifier %r unknown' % self.identifier)

    self.length, = struct.unpack(b'>I', self.raw[20:24])
    self.type_raw, = struct.unpack(b'>I', self.raw[24:28])
    self.type = {
        2: 'Mobipocket book',
        3: 'PalmDOC book',
        4: 'Audio',
        257: 'News',
        258: 'News Feed',
        259: 'News magazine',
        513: 'PICS',
        514: 'Word',
        515: 'XLS',
        516: 'PPT',
        517: 'TEXT',
        518: 'HTML',
    }.get(self.type_raw, repr(self.type_raw))

    self.encoding_raw, = struct.unpack(b'>I', self.raw[28:32])
    self.encoding = {
        1252: 'cp1252',
        65001: 'utf-8',
    }.get(self.encoding_raw, repr(self.encoding_raw))
    self.uid = self.raw[32:36]
    self.file_version, = struct.unpack(b'>I', self.raw[36:40])
    self.meta_orth_indx, self.meta_infl_indx = struct.unpack(
        b'>II', self.raw[40:48])
    self.secondary_index_record, = struct.unpack(b'>I', self.raw[48:52])
    self.reserved = self.raw[52:80]
    self.first_non_book_record, = struct.unpack(b'>I', self.raw[80:84])
    self.fullname_offset, = struct.unpack(b'>I', self.raw[84:88])
    self.fullname_length, = struct.unpack(b'>I', self.raw[88:92])

    # The locale word carries the language id in its low byte and the
    # sub-language id in the byte starting at bit 10.
    self.locale_raw, = struct.unpack(b'>I', self.raw[92:96])
    langcode = self.locale_raw
    langid = langcode & 0xFF
    sublangid = (langcode >> 10) & 0xFF
    self.language = main_language.get(langid, 'ENGLISH')
    self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')

    self.input_language = self.raw[96:100]
    # NOTE: the attribute name below is misspelled but is kept as-is since
    # it is part of this object's visible attribute set.
    self.output_langauage = self.raw[100:104]
    self.min_version, = struct.unpack(b'>I', self.raw[104:108])
    self.first_image_index, = struct.unpack(b'>I', self.raw[108:112])
    self.huffman_record_offset, = struct.unpack(b'>I', self.raw[112:116])
    self.huffman_record_count, = struct.unpack(b'>I', self.raw[116:120])
    self.datp_record_offset, = struct.unpack(b'>I', self.raw[120:124])
    self.datp_record_count, = struct.unpack(b'>I', self.raw[124:128])
    self.exth_flags, = struct.unpack(b'>I', self.raw[128:132])
    self.has_exth = bool(self.exth_flags & 0x40)

    # DRM fields are only read when both the declared header length and
    # the actual record are long enough to contain them.
    self.has_drm_data = self.length >= 174 and len(self.raw) >= 184
    if self.has_drm_data:
        self.unknown3 = self.raw[132:168]
        self.drm_offset, self.drm_count, self.drm_size, self.drm_flags = \
            struct.unpack(b'>4I', self.raw[168:184])

    self.has_extra_data_flags = (self.length >= 232 and
                                 len(self.raw) >= 232 + 16)
    self.has_fcis_flis = False
    self.has_multibytes = self.has_indexing_bytes = \
        self.has_uncrossable_breaks = False
    self.extra_data_flags = 0
    if self.has_extra_data_flags:
        self.unknown4 = self.raw[184:192]
        if self.file_version < 8:
            self.first_text_record, self.last_text_record = \
                struct.unpack_from(b'>HH', self.raw, 192)
            # unpack_from always returns a tuple; unpack the single value
            # so fdst_count is an int here, as it is in the other branch.
            self.fdst_count, = struct.unpack_from(b'>L', self.raw, 196)
        else:
            self.fdst_idx, self.fdst_count = struct.unpack_from(
                b'>LL', self.raw, 192)
            if self.fdst_count <= 1:
                self.fdst_idx = NULL_INDEX
        (self.fcis_number, self.fcis_count, self.flis_number,
         self.flis_count) = struct.unpack(b'>IIII', self.raw[200:216])
        self.unknown6 = self.raw[216:224]
        self.srcs_record_index = struct.unpack(b'>I', self.raw[224:228])[0]
        self.num_srcs_records = struct.unpack(b'>I', self.raw[228:232])[0]
        self.unknown7 = self.raw[232:240]
        self.extra_data_flags = struct.unpack(b'>I', self.raw[240:244])[0]
        self.has_multibytes = bool(self.extra_data_flags & 0b1)
        self.has_indexing_bytes = bool(self.extra_data_flags & 0b10)
        self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100)
        self.primary_index_record, = struct.unpack(
            b'>I', self.raw[244:248])

    if self.length >= 248:
        (self.sect_idx, self.skel_idx, self.datp_idx,
         self.oth_idx) = struct.unpack_from(b'>4L', self.raw, 248)
        self.unknown9 = self.raw[264:self.length + 16]
        if self.meta_orth_indx not in {NULL_INDEX, self.sect_idx}:
            raise ValueError('KF8 header has different Meta orth and '
                             'section indices')

    # The following are all relative to the position of the header record
    # make them absolute for ease of debugging
    self.relative_records = {
        'sect_idx', 'skel_idx', 'datp_idx', 'oth_idx', 'meta_orth_indx',
        'huffman_record_offset', 'first_non_book_record',
        'datp_record_offset', 'fcis_number', 'flis_number',
        'primary_index_record', 'fdst_idx', 'first_image_index',
    }
    for x in self.relative_records:
        # Some of these attributes are only set for longer headers, and
        # NULL_INDEX values are left untouched.
        if hasattr(self, x) and getattr(self, x) != NULL_INDEX:
            setattr(self, x, self.header_offset + getattr(self, x))

    # Try to find the first non-text record
    # Default to first record after all text records
    self.first_resource_record = offset + 1 + self.number_of_text_records
    pointer = min(getattr(self, 'first_non_book_record', NULL_INDEX),
                  getattr(self, 'first_image_index', NULL_INDEX))
    if pointer != NULL_INDEX:
        self.first_resource_record = max(pointer,
                                         self.first_resource_record)
    self.last_resource_record = NULL_INDEX

    if self.has_exth:
        # The EXTH block starts right after the MOBI header, whose length
        # is counted from byte 16.
        self.exth_offset = 16 + self.length
        self.exth = EXTHHeader(self.raw[self.exth_offset:])
        self.end_of_exth = self.exth_offset + self.exth.length
        self.bytes_after_exth = self.raw[
            self.end_of_exth:self.fullname_offset]
        if self.exth.kf8_header_index is not None and offset == 0:
            # MOBI 6 header in a joint file, adjust
            # self.last_resource_record
            self.last_resource_record = self.exth.kf8_header_index - 2
def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
    """Parse the book header fields from the bytes of the header record.

    :param raw: bytes of the header record.
    :param ident: file identifier; ``'TEXTREAD'`` sets the initial
        codepage to 1252 and forces ``extra_flags`` to 0.
    :param user_encoding: codec to assume when the stored codepage is not
        recognised (``'cp1252'`` is used if this is false).
    :param log: logger providing ``warn`` and ``exception``.
    :param try_extra_data_fix: when true, a header of length 0xE4 is
        treated as having no extra data flags.
    """
    self.log = log
    self.compression_type = raw[:2]
    self.records, self.records_size = struct.unpack('>HH', raw[8:12])
    self.encryption_type, = struct.unpack('>H', raw[12:14])
    if ident == 'TEXTREAD':
        self.codepage = 1252
    if len(raw) <= 16:
        # Nothing follows the first 16 bytes: fall back to fixed defaults.
        self.codec = 'cp1252'
        self.extra_flags = 0
        self.title = _('Unknown')
        self.language = 'ENGLISH'
        self.sublanguage = 'NEUTRAL'
        self.exth_flag, self.exth = 0, None
        self.ancient = True
        self.first_image_index = -1
        self.mobi_version = 1
    else:
        self.ancient = False
        self.doctype = raw[16:20]
        self.length, self.type, self.codepage, self.unique_id, \
            self.version = struct.unpack('>LLLLL', raw[20:40])

        try:
            self.codec = {
                1252: 'cp1252',
                65001: 'utf-8',
            }[self.codepage]
        except (IndexError, KeyError):
            self.codec = 'cp1252' if not user_encoding else user_encoding
            log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
                                                           self.codec))

        # Some KF8 files have header length == 264 (generated by kindlegen
        # 2.9?). See https://bugs.launchpad.net/bugs/1179144
        # We choose 500 for future versions of kindlegen
        max_header_length = 500

        if (ident == 'TEXTREAD' or self.length < 0xE4 or
                self.length > max_header_length or
                (try_extra_data_fix and self.length == 0xE4)):
            self.extra_flags = 0
        else:
            self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

        if self.compression_type == 'DH':
            self.huff_offset, self.huff_number = struct.unpack(
                '>LL', raw[0x70:0x78])

        # The title is stored as an (offset, length) pair into raw.
        toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
        tend = toff + tlen
        self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
        langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
        langid = langcode & 0xFF
        sublangid = (langcode >> 10) & 0xFF
        self.language = main_language.get(langid, 'ENGLISH')
        self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
        self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
        self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]

        self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
        self.exth = None
        if not isinstance(self.title, unicode):
            self.title = self.title.decode(self.codec, 'replace')
        if self.exth_flag & 0x40:
            # Best effort: a broken EXTH block is logged and ignored.
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            try:
                self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
                                       self.title)
                self.exth.mi.uid = self.unique_id
                if self.exth.mi.is_null('language'):
                    try:
                        self.exth.mi.language = mobi2iana(langid,
                                                          sublangid)
                    except Exception:
                        self.log.exception('Unknown language code')
            except Exception:
                self.log.exception('Invalid EXTH header')
                self.exth_flag = 0

        self.ncxidx = NULL_INDEX
        if len(raw) >= 0xF8:
            self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)

        # Ancient PRC files from Baen can have random values for
        # mobi_version, so be conservative
        if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
            self.dividx, self.skelidx, self.datpidx, self.othidx = \
                struct.unpack_from(b'>4L', raw, 0xF8)

            # need to use the FDST record to find out how to properly
            # unpack the raw_ml into pieces it is simply a table of start
            # and end locations for each flow piece
            self.fdstidx, self.fdstcnt = struct.unpack_from(
                b'>2L', raw, 0xC0)
            # if cnt is 1 or less, fdst section number can be garbage
            if self.fdstcnt <= 1:
                self.fdstidx = NULL_INDEX
        else:  # Null values
            self.skelidx = self.dividx = self.othidx = self.fdstidx = \
                NULL_INDEX
def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
    """Read the book header fields out of the header record bytes.

    :param raw: bytes of the header record.
    :param ident: file identifier; when it equals ``"TEXTREAD"`` the
        codepage starts out as 1252 and ``extra_flags`` is set to 0.
    :param user_encoding: codec used when the stored codepage is unknown;
        if false, ``"cp1252"`` is assumed.
    :param log: logger providing ``warn`` and ``exception``.
    :param try_extra_data_fix: when true, a header whose length is exactly
        0xE4 gets ``extra_flags`` set to 0.
    """
    self.log = log
    self.compression_type = raw[:2]
    self.records, self.records_size = struct.unpack(">HH", raw[8:12])
    self.encryption_type, = struct.unpack(">H", raw[12:14])
    if ident == "TEXTREAD":
        self.codepage = 1252
    if len(raw) <= 16:
        # Record holds no more than its first 16 bytes: use fixed defaults.
        self.codec = "cp1252"
        self.extra_flags = 0
        self.title = _("Unknown")
        self.language = "ENGLISH"
        self.sublanguage = "NEUTRAL"
        self.exth_flag, self.exth = 0, None
        self.ancient = True
        self.first_image_index = -1
        self.mobi_version = 1
    else:
        self.ancient = False
        self.doctype = raw[16:20]
        (
            self.length,
            self.type,
            self.codepage,
            self.unique_id,
            self.version,
        ) = struct.unpack(">LLLLL", raw[20:40])

        try:
            self.codec = {1252: "cp1252", 65001: "utf-8"}[self.codepage]
        except (IndexError, KeyError):
            self.codec = "cp1252" if not user_encoding else user_encoding
            log.warn(
                "Unknown codepage %d. Assuming %s"
                % (self.codepage, self.codec)
            )

        # Some KF8 files have header length == 264 (generated by kindlegen
        # 2.9?). See https://bugs.launchpad.net/bugs/1179144
        # We choose 500 for future versions of kindlegen
        max_header_length = 500

        if (
            ident == "TEXTREAD"
            or self.length < 0xE4
            or self.length > max_header_length
            or (try_extra_data_fix and self.length == 0xE4)
        ):
            self.extra_flags = 0
        else:
            self.extra_flags, = struct.unpack(">H", raw[0xF2:0xF4])

        if self.compression_type == "DH":
            self.huff_offset, self.huff_number = struct.unpack(
                ">LL", raw[0x70:0x78]
            )

        # The title is located through an (offset, length) pair.
        toff, tlen = struct.unpack(">II", raw[0x54:0x5C])
        tend = toff + tlen
        self.title = raw[toff:tend] if tend < len(raw) else _("Unknown")
        langcode = struct.unpack("!L", raw[0x5C:0x60])[0]
        langid = langcode & 0xFF
        sublangid = (langcode >> 10) & 0xFF
        self.language = main_language.get(langid, "ENGLISH")
        self.sublanguage = sub_language.get(sublangid, "NEUTRAL")
        self.mobi_version = struct.unpack(">I", raw[0x68:0x6C])[0]
        self.first_image_index = struct.unpack(">L", raw[0x6C : 0x6C + 4])[0]

        self.exth_flag, = struct.unpack(">L", raw[0x80:0x84])
        self.exth = None
        if not isinstance(self.title, unicode):
            self.title = self.title.decode(self.codec, "replace")
        if self.exth_flag & 0x40:
            # EXTH parsing is best effort: failures are logged, not raised.
            # Exception (not a bare except) is caught so interpreter-exit
            # signals such as KeyboardInterrupt still propagate.
            try:
                self.exth = EXTHHeader(
                    raw[16 + self.length :], self.codec, self.title
                )
                self.exth.mi.uid = self.unique_id
                if self.exth.mi.is_null("language"):
                    try:
                        self.exth.mi.language = mobi2iana(langid, sublangid)
                    except Exception:
                        self.log.exception("Unknown language code")
            except Exception:
                self.log.exception("Invalid EXTH header")
                self.exth_flag = 0

        self.ncxidx = NULL_INDEX
        if len(raw) >= 0xF8:
            self.ncxidx, = struct.unpack_from(b">L", raw, 0xF4)

        # Ancient PRC files from Baen can have random values for
        # mobi_version, so be conservative
        if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
            (
                self.dividx,
                self.skelidx,
                self.datpidx,
                self.othidx,
            ) = struct.unpack_from(b">4L", raw, 0xF8)

            # need to use the FDST record to find out how to properly
            # unpack the raw_ml into pieces it is simply a table of start
            # and end locations for each flow piece
            self.fdstidx, self.fdstcnt = struct.unpack_from(b">2L", raw, 0xC0)
            # if cnt is 1 or less, fdst section number can be garbage
            if self.fdstcnt <= 1:
                self.fdstidx = NULL_INDEX
        else:  # Null values
            self.skelidx = self.dividx = self.othidx = self.fdstidx = NULL_INDEX
def __init__(self, raw, ident, user_encoding, log, try_extra_data_fix=False):
    """Parse the book header fields from the bytes of the header record.

    :param raw: bytes of the header record.
    :param ident: file identifier; ``'TEXTREAD'`` sets the initial
        codepage to 1252 and forces ``extra_flags`` to 0.
    :param user_encoding: codec to assume when the stored codepage is not
        recognised (``'cp1252'`` is used if this is false).
    :param log: logger providing ``warn`` and ``exception``.
    :param try_extra_data_fix: when true, a header of length 0xE4 is
        treated as having no extra data flags.
    """
    self.log = log
    self.compression_type = raw[:2]
    self.records, self.records_size = struct.unpack('>HH', raw[8:12])
    self.encryption_type, = struct.unpack('>H', raw[12:14])
    if ident == 'TEXTREAD':
        self.codepage = 1252
    if len(raw) <= 16:
        # Nothing follows the first 16 bytes: fall back to fixed defaults.
        self.codec = 'cp1252'
        self.extra_flags = 0
        self.title = _('Unknown')
        self.language = 'ENGLISH'
        self.sublanguage = 'NEUTRAL'
        self.exth_flag, self.exth = 0, None
        self.ancient = True
        self.first_image_index = -1
        self.mobi_version = 1
    else:
        self.ancient = False
        self.doctype = raw[16:20]
        self.length, self.type, self.codepage, self.unique_id, \
            self.version = struct.unpack('>LLLLL', raw[20:40])

        try:
            self.codec = {
                1252: 'cp1252',
                65001: 'utf-8',
            }[self.codepage]
        except (IndexError, KeyError):
            self.codec = 'cp1252' if not user_encoding else user_encoding
            log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
                                                           self.codec))

        # Some KF8 files have header length == 256 (generated by kindlegen
        # 2.7?). See https://bugs.launchpad.net/bugs/1067310
        max_header_length = 0x100

        if (ident == 'TEXTREAD' or self.length < 0xE4 or
                self.length > max_header_length or
                (try_extra_data_fix and self.length == 0xE4)):
            self.extra_flags = 0
        else:
            self.extra_flags, = struct.unpack('>H', raw[0xF2:0xF4])

        if self.compression_type == 'DH':
            self.huff_offset, self.huff_number = struct.unpack(
                '>LL', raw[0x70:0x78])

        # The title is stored as an (offset, length) pair into raw.
        toff, tlen = struct.unpack('>II', raw[0x54:0x5c])
        tend = toff + tlen
        self.title = raw[toff:tend] if tend < len(raw) else _('Unknown')
        langcode = struct.unpack('!L', raw[0x5C:0x60])[0]
        langid = langcode & 0xFF
        sublangid = (langcode >> 10) & 0xFF
        self.language = main_language.get(langid, 'ENGLISH')
        self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
        self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
        self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]

        self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
        self.exth = None
        if not isinstance(self.title, unicode):
            self.title = self.title.decode(self.codec, 'replace')
        if self.exth_flag & 0x40:
            # Best effort: a broken EXTH block is logged and ignored.
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            try:
                self.exth = EXTHHeader(raw[16 + self.length:], self.codec,
                                       self.title)
                self.exth.mi.uid = self.unique_id
                if self.exth.mi.is_null('language'):
                    try:
                        self.exth.mi.language = mobi2iana(langid,
                                                          sublangid)
                    except Exception:
                        self.log.exception('Unknown language code')
            except Exception:
                self.log.exception('Invalid EXTH header')
                self.exth_flag = 0

        self.ncxidx = NULL_INDEX
        if len(raw) >= 0xF8:
            self.ncxidx, = struct.unpack_from(b'>L', raw, 0xF4)

        # Ancient PRC files from Baen can have random values for
        # mobi_version, so be conservative
        if self.mobi_version == 8 and len(raw) >= (0xF8 + 16):
            self.dividx, self.skelidx, self.datpidx, self.othidx = \
                struct.unpack_from(b'>4L', raw, 0xF8)

            # need to use the FDST record to find out how to properly
            # unpack the raw_ml into pieces it is simply a table of start
            # and end locations for each flow piece
            self.fdstidx, self.fdstcnt = struct.unpack_from(
                b'>2L', raw, 0xC0)
            # if cnt is 1 or less, fdst section number can be garbage
            if self.fdstcnt <= 1:
                self.fdstidx = NULL_INDEX
        else:  # Null values
            self.skelidx = self.dividx = self.othidx = self.fdstidx = \
                NULL_INDEX
def __init__(self, record0, offset):
    """Decode the PalmDOC/MOBI header fields found in ``record0.raw``.

    :param record0: object whose ``raw`` attribute holds the bytes of the
        header record.
    :param offset: index of the header record; it is added to every
        record pointer named in ``self.relative_records`` whose value is
        not ``NULL_INDEX``.
    :raises ValueError: if bytes 16-20 are not ``b'MOBI'``, or if a header
        with ``length >= 248`` has a meta orth index that is neither
        ``NULL_INDEX`` nor equal to the section index.
    """
    raw = self.raw = record0.raw
    self.header_offset = offset

    # --- Bytes 0-16: compression, text layout and encryption ---
    self.compression_raw = raw[:2]
    self.compression = {
        1: 'No compression',
        2: 'PalmDoc compression',
        17480: 'HUFF/CDIC compression',
    }.get(struct.unpack(b'>H', self.compression_raw)[0],
          repr(self.compression_raw))
    self.unused = raw[2:4]
    self.text_length, = struct.unpack(b'>I', raw[4:8])
    self.number_of_text_records, self.text_record_size = \
        struct.unpack(b'>HH', raw[8:12])
    self.encryption_type_raw, = struct.unpack(b'>H', raw[12:14])
    self.encryption_type = {
        0: 'No encryption',
        1: 'Old mobipocket encryption',
        2: 'Mobipocket encryption',
    }.get(self.encryption_type_raw, repr(self.encryption_type_raw))
    self.unknown = raw[14:16]

    # --- Bytes 16 onwards: the MOBI header proper ---
    self.identifier = raw[16:20]
    if self.identifier != b'MOBI':
        raise ValueError('Identifier %r unknown'%self.identifier)

    self.length, = struct.unpack(b'>I', raw[20:24])
    self.type_raw, = struct.unpack(b'>I', raw[24:28])
    self.type = {
        2: 'Mobipocket book',
        3: 'PalmDOC book',
        4: 'Audio',
        257: 'News',
        258: 'News Feed',
        259: 'News magazine',
        513: 'PICS',
        514: 'Word',
        515: 'XLS',
        516: 'PPT',
        517: 'TEXT',
        518: 'HTML',
    }.get(self.type_raw, repr(self.type_raw))

    self.encoding_raw, = struct.unpack(b'>I', raw[28:32])
    self.encoding = {
        1252: 'cp1252',
        65001: 'utf-8',
    }.get(self.encoding_raw, repr(self.encoding_raw))
    self.uid = raw[32:36]
    self.file_version, = struct.unpack(b'>I', raw[36:40])
    self.meta_orth_indx, self.meta_infl_indx = struct.unpack(
        b'>II', raw[40:48])
    self.secondary_index_record, = struct.unpack(b'>I', raw[48:52])
    self.reserved = raw[52:80]
    self.first_non_book_record, = struct.unpack(b'>I', raw[80:84])
    self.fullname_offset, = struct.unpack(b'>I', raw[84:88])
    self.fullname_length, = struct.unpack(b'>I', raw[88:92])

    # The locale word carries the language id in its low byte and the
    # sub-language id in the byte starting at bit 10.
    self.locale_raw, = struct.unpack(b'>I', raw[92:96])
    langcode = self.locale_raw
    langid = langcode & 0xFF
    sublangid = (langcode >> 10) & 0xFF
    self.language = main_language.get(langid, 'ENGLISH')
    self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')

    self.input_language = raw[96:100]
    # NOTE: the attribute name below is misspelled but is kept as-is since
    # it is part of this object's visible attribute set.
    self.output_langauage = raw[100:104]
    self.min_version, = struct.unpack(b'>I', raw[104:108])
    self.first_image_index, = struct.unpack(b'>I', raw[108:112])
    self.huffman_record_offset, = struct.unpack(b'>I', raw[112:116])
    self.huffman_record_count, = struct.unpack(b'>I', raw[116:120])
    self.datp_record_offset, = struct.unpack(b'>I', raw[120:124])
    self.datp_record_count, = struct.unpack(b'>I', raw[124:128])
    self.exth_flags, = struct.unpack(b'>I', raw[128:132])
    self.has_exth = bool(self.exth_flags & 0x40)

    # DRM fields are only read when both the declared header length and
    # the actual record are long enough to contain them.
    self.has_drm_data = self.length >= 174 and len(raw) >= 184
    if self.has_drm_data:
        self.unknown3 = raw[132:168]
        self.drm_offset, self.drm_count, self.drm_size, self.drm_flags = \
            struct.unpack(b'>4I', raw[168:184])

    self.has_extra_data_flags = self.length >= 232 and len(raw) >= 232+16
    self.has_fcis_flis = False
    self.has_multibytes = self.has_indexing_bytes = \
        self.has_uncrossable_breaks = False
    self.extra_data_flags = 0
    if self.has_extra_data_flags:
        self.unknown4 = raw[184:192]
        if self.file_version < 8:
            self.first_text_record, self.last_text_record = \
                struct.unpack_from(b'>HH', raw, 192)
            # unpack_from returns a tuple even for a single field; take
            # element 0 so fdst_count is an int, matching the else branch.
            self.fdst_count = struct.unpack_from(b'>L', raw, 196)[0]
        else:
            self.fdst_idx, self.fdst_count = struct.unpack_from(
                b'>LL', raw, 192)
            if self.fdst_count <= 1:
                self.fdst_idx = NULL_INDEX
        (self.fcis_number, self.fcis_count, self.flis_number,
         self.flis_count) = struct.unpack(b'>IIII', raw[200:216])
        self.unknown6 = raw[216:224]
        self.srcs_record_index = struct.unpack(b'>I', raw[224:228])[0]
        self.num_srcs_records = struct.unpack(b'>I', raw[228:232])[0]
        self.unknown7 = raw[232:240]
        self.extra_data_flags = struct.unpack(b'>I', raw[240:244])[0]
        self.has_multibytes = bool(self.extra_data_flags & 0b1)
        self.has_indexing_bytes = bool(self.extra_data_flags & 0b10)
        self.has_uncrossable_breaks = bool(self.extra_data_flags & 0b100)
        self.primary_index_record, = struct.unpack(b'>I', raw[244:248])

    if self.length >= 248:
        (self.sect_idx, self.skel_idx, self.datp_idx, self.oth_idx
         ) = struct.unpack_from(b'>4L', raw, 248)
        self.unknown9 = raw[264:self.length+16]
        if self.meta_orth_indx not in {NULL_INDEX, self.sect_idx}:
            raise ValueError('KF8 header has different Meta orth and '
                             'section indices')

    # The following are all relative to the position of the header record
    # make them absolute for ease of debugging
    self.relative_records = {'sect_idx', 'skel_idx', 'datp_idx', 'oth_idx',
                             'meta_orth_indx', 'huffman_record_offset',
                             'first_non_book_record', 'datp_record_offset',
                             'fcis_number', 'flis_number',
                             'primary_index_record', 'fdst_idx',
                             'first_image_index'}
    for x in self.relative_records:
        # Some of these attributes are only set for longer headers, and
        # NULL_INDEX values are left untouched.
        if hasattr(self, x) and getattr(self, x) != NULL_INDEX:
            setattr(self, x, self.header_offset+getattr(self, x))

    # Try to find the first non-text record
    # Default to first record after all text records
    self.first_resource_record = offset + 1 + self.number_of_text_records
    pointer = min(getattr(self, 'first_non_book_record', NULL_INDEX),
                  getattr(self, 'first_image_index', NULL_INDEX))
    if pointer != NULL_INDEX:
        self.first_resource_record = max(pointer,
                                         self.first_resource_record)
    self.last_resource_record = NULL_INDEX

    if self.has_exth:
        # The EXTH block starts right after the MOBI header, whose length
        # is counted from byte 16.
        self.exth_offset = 16 + self.length
        self.exth = EXTHHeader(raw[self.exth_offset:])
        self.end_of_exth = self.exth_offset + self.exth.length
        self.bytes_after_exth = raw[self.end_of_exth:self.fullname_offset]
        if self.exth.kf8_header_index is not None and offset == 0:
            # MOBI 6 header in a joint file, adjust
            # self.last_resource_record
            self.last_resource_record = self.exth.kf8_header_index - 2