def __init__(self, raw):
    """Parse an FDST record, storing its (start, end) flow-section pairs.

    :param raw: the raw bytes of the FDST record
    :raises ValueError: if the magic, header layout or trailing data is
        not what a valid KF8 FDST record contains
    """
    if raw[:4] != b"FDST":
        raise ValueError("KF8 does not have a valid FDST record")
    self.sec_off, self.num_sections = struct.unpack_from(b">LL", raw, 4)
    # The section table is expected to start right after the 12 byte header
    if self.sec_off != 12:
        raise ValueError("FDST record has unknown extra fields")
    fmt = b">%dL" % (2 * self.num_sections)
    flat = struct.unpack_from(fmt, raw, self.sec_off)
    trailing = raw[self.sec_off + struct.calcsize(fmt):]
    if trailing:
        raise ValueError("FDST record has trailing data: "
                         "%s" % format_bytes(trailing))
    # Pair up alternating start/end offsets
    starts, ends = flat[::2], flat[1::2]
    self.sections = tuple(izip(starts, ends))
def dump_record(self, r, dat):
    """Produce a human-readable dump of one text record's TBS indexing data.

    :param r: the text record (must have ``idx`` and ``trailing_data``)
    :param dat: dict with ``geom`` (start, end offsets of the record) and
        the index entries that ``starts``, ``ends`` or are ``complete``
        within this record
    :return: ``(tbs_type, ans)`` where ``ans`` is a list of report lines
    """
    ans = []
    ans.append('\nRecord #%d: Starts at: %d Ends at: %d' % (r.idx,
        dat['geom'][0], dat['geom'][1]))
    s, e, c = dat['starts'], dat['ends'], dat['complete']
    ans.append(('\tContains: %d index entries '
                '(%d ends, %d complete, %d starts)') % tuple(map(len, (
                    s + e + c, e, c, s))))
    byts = bytearray(r.trailing_data.get('indexing', b''))
    ans.append('TBS bytes: %s' % format_bytes(byts))
    for typ, entries in (('Ends', e), ('Complete', c), ('Starts', s)):
        if entries:
            ans.append('\t%s:' % typ)
            for x in entries:
                ans.append(('\t\tIndex Entry: %s (Parent index: %s, '
                            'Depth: %d, Offset: %d, Size: %d) [%s]') % (
                    x.index, x.parent_index, x.depth, x.offset, x.size,
                    x.label))

    def bin4(num):
        # Zero padded four digit binary rendering of a TBS flag nibble
        ans = bin(num)[2:]
        return as_bytes('0' * (4 - len(ans)) + ans)

    def repr_extra(x):
        # FIX: previously ignored its argument and closed over the outer
        # ``extra`` variable; now renders whatever mapping it is given
        return str({bin4(k): v for k, v in iteritems(x)})

    tbs_type = 0
    is_periodical = self.doc_type in (257, 258, 259)
    if len(byts):
        outermost_index, extra, consumed = decode_tbs(byts, flag_size=3)
        byts = byts[consumed:]
        for k in extra:
            tbs_type |= k
        ans.append('\nTBS: %d (%s)' % (tbs_type, bin4(tbs_type)))
        ans.append('Outermost index: %d' % outermost_index)
        ans.append('Unknown extra start bytes: %s' % repr_extra(extra))
        if is_periodical:  # Hierarchical periodical
            try:
                byts, a = self.interpret_periodical(
                    tbs_type, byts, dat['geom'][0])
            except Exception:
                # Best effort debug tool: report the failure but keep
                # dumping the remaining records (was a bare except)
                import traceback
                traceback.print_exc()
                a = []
                print('Failed to decode TBS bytes for record: %d' % r.idx)
            ans += a
        if byts:
            sbyts = tuple(hex(b)[2:] for b in byts)
            ans.append('Remaining bytes: %s' % ' '.join(sbyts))

    ans.append('')
    return tbs_type, ans
def __init__(self, raw):
    """Decode an FDST record into flow-section (start, end) offset pairs.

    :param raw: raw bytes of the FDST record
    :raises ValueError: on bad magic, unexpected header layout or
        trailing bytes after the section table
    """
    if raw[:4] != b'FDST':
        raise ValueError('KF8 does not have a valid FDST record')
    self.sec_off, self.num_sections = struct.unpack_from(b'>LL', raw, 4)
    # Section table must immediately follow the 12 byte fixed header
    if self.sec_off != 12:
        raise ValueError('FDST record has unknown extra fields')
    section_fmt = b'>%dL' % (2 * self.num_sections)
    offsets = struct.unpack_from(section_fmt, raw, self.sec_off)
    leftover = raw[self.sec_off + struct.calcsize(section_fmt):]
    if leftover:
        raise ValueError('FDST record has trailing data: '
                         '%s' % format_bytes(leftover))
    # Alternating values are start/end offsets; pair them up
    self.sections = tuple(zip(offsets[::2], offsets[1::2]))
def __init__(self, raw):
    """Read the FDST record and record its section boundary pairs.

    :param raw: raw record bytes, expected to start with the FDST magic
    :raises ValueError: if the record does not match the known layout
    """
    if raw[:4] != b'FDST':
        raise ValueError('KF8 does not have a valid FDST record')
    self.sec_off, self.num_sections = struct.unpack_from(b'>LL', raw, 4)
    if self.sec_off != 12:
        # Only the 12 byte header followed directly by the table is known
        raise ValueError('FDST record has unknown extra fields')
    table_fmt = b'>%dL' % (self.num_sections * 2)
    values = struct.unpack_from(table_fmt, raw, self.sec_off)
    extra = raw[self.sec_off + struct.calcsize(table_fmt):]
    if extra:
        raise ValueError('FDST record has trailing data: '
                         '%s' % format_bytes(extra))
    self.sections = tuple(izip(values[::2], values[1::2]))
def dump_record(self, r, dat):
    """Dump the TBS (trailing byte sequence) indexing info of one record.

    :param r: text record with ``idx`` and ``trailing_data`` attributes
    :param dat: dict carrying the record geometry (``geom``) and the
        index entries starting, ending or completely contained in it
    :return: ``(tbs_type, ans)``, ``ans`` being the list of report lines
    """
    ans = []
    ans.append('\nRecord #%d: Starts at: %d Ends at: %d'%(r.idx,
        dat['geom'][0], dat['geom'][1]))
    s, e, c = dat['starts'], dat['ends'], dat['complete']
    ans.append(('\tContains: %d index entries '
        '(%d ends, %d complete, %d starts)')%tuple(map(len, (s+e+c, e,
            c, s))))
    byts = bytearray(r.trailing_data.get('indexing', b''))
    ans.append('TBS bytes: %s'%format_bytes(byts))
    for typ, entries in (('Ends', e), ('Complete', c), ('Starts', s)):
        if entries:
            ans.append('\t%s:'%typ)
            for x in entries:
                ans.append(('\t\tIndex Entry: %s (Parent index: %s, '
                    'Depth: %d, Offset: %d, Size: %d) [%s]')%(
                    x.index, x.parent_index, x.depth, x.offset, x.size,
                    x.label))

    def bin4(num):
        # Render a TBS flag nibble as a zero padded 4 digit binary string
        ans = bin(num)[2:]
        return as_bytes('0'*(4-len(ans)) + ans)

    def repr_extra(x):
        # FIX: used to ignore ``x`` and close over the outer ``extra``;
        # now formats the mapping it is actually passed
        return str({bin4(k):v for k, v in iteritems(x)})

    tbs_type = 0
    is_periodical = self.doc_type in (257, 258, 259)
    if len(byts):
        outermost_index, extra, consumed = decode_tbs(byts, flag_size=3)
        byts = byts[consumed:]
        for k in extra:
            tbs_type |= k
        ans.append('\nTBS: %d (%s)'%(tbs_type, bin4(tbs_type)))
        ans.append('Outermost index: %d'%outermost_index)
        ans.append('Unknown extra start bytes: %s'%repr_extra(extra))
        if is_periodical:  # Hierarchical periodical
            try:
                byts, a = self.interpret_periodical(tbs_type, byts,
                        dat['geom'][0])
            except Exception:
                # Keep dumping other records even if this one fails
                # (previously a bare except)
                import traceback
                traceback.print_exc()
                a = []
                print('Failed to decode TBS bytes for record: %d'%r.idx)
            ans += a
        if byts:
            sbyts = tuple(hex(b)[2:] for b in byts)
            ans.append('Remaining bytes: %s'%' '.join(sbyts))

    ans.append('')
    return tbs_type, ans
def __init__(self, records, codec):
    """Parse the CNCX (compiled NCX) records into an offset -> string map.

    :param records: iterable of raw CNCX records (objects with a ``raw``
        bytes attribute)
    :param codec: character encoding used to decode the stored strings

    Each record occupies a 0x10000 wide offset bucket, so keys are
    ``local_offset + record_number * 0x10000`` — presumably matching how
    index entries reference CNCX strings; verify against the callers.
    """
    self.records = OrderedDict()
    record_offset = 0
    for record in records:
        raw = record.raw
        pos = 0
        while pos < len(raw):
            # Entries are a variable-width length (decint) followed by
            # that many bytes of encoded text
            length, consumed = decint(raw[pos:])
            if length > 0:
                try:
                    self.records[pos+record_offset] = raw[
                        pos+consumed:pos+consumed+length].decode(codec)
                except Exception:
                    # Narrowed from a bare except: store the undecodable
                    # tail as a hex dump and skip the rest of this record
                    byts = raw[pos:]
                    r = format_bytes(byts)
                    print('CNCX entry at offset %d has unknown format %s'%(
                        pos+record_offset, r))
                    self.records[pos+record_offset] = r
                    # Force the while loop to terminate for this record
                    pos = len(raw)
            pos += consumed+length
        record_offset += 0x10000
def __str__(self):
    """Render the entire MOBI header as a human readable multi-line report."""
    ans = ['*' * 20 + ' MOBI %d Header ' % self.file_version + '*' * 20]
    a = ans.append

    def i(d, x):
        # Print a field, showing NULL_INDEX values as the string 'NULL'
        x = 'NULL' if x == NULL_INDEX else x
        a('%s: %s' % (d, x))

    def r(d, attr):
        # Print a record pointer; for relative records show both the
        # absolute value and the value relative to the header offset
        x = getattr(self, attr)
        if attr in self.relative_records and x != NULL_INDEX:
            a('%s: Absolute: %d Relative: %d' % (d, x, x - self.header_offset))
        else:
            i(d, x)

    a('Compression: %s' % self.compression)
    a('Unused: %r' % self.unused)
    a('Text length: %d' % self.text_length)
    a('Number of text records: %d' % self.number_of_text_records)
    a('Text record size: %d' % self.text_record_size)
    a('Encryption: %s' % self.encryption_type)
    a('Unknown: %r' % self.unknown)
    a('Identifier: %r' % self.identifier)
    a('Header length: %d' % self.length)
    a('Type: %s' % self.type)
    a('Encoding: %s' % self.encoding)
    a('UID: %r' % self.uid)
    a('File version: %d' % self.file_version)
    r('Meta Orth Index', 'meta_orth_indx')
    r('Meta Infl Index', 'meta_infl_indx')
    r('Secondary index record', 'secondary_index_record')
    a('Reserved: %r' % self.reserved)
    r('First non-book record', 'first_non_book_record')
    a('Full name offset: %d' % self.fullname_offset)
    a('Full name length: %d bytes' % self.fullname_length)
    a('Langcode: %r' % self.locale_raw)
    a('Language: %s' % self.language)
    a('Sub language: %s' % self.sublanguage)
    a('Input language: %r' % self.input_language)
    a('Output language: %r' % self.output_langauage)
    a('Min version: %d' % self.min_version)
    r('First Image index', 'first_image_index')
    r('Huffman record offset', 'huffman_record_offset')
    a('Huffman record count: %d' % self.huffman_record_count)
    r('Huffman table offset', 'datp_record_offset')
    a('Huffman table length: %r' % self.datp_record_count)
    a('EXTH flags: %s (%s)' % (bin(self.exth_flags)[2:], self.has_exth))
    # The DRM fields are only present when the header says so
    if self.has_drm_data:
        a('Unknown3: %r' % self.unknown3)
        r('DRM Offset', 'drm_offset')
        a('DRM Count: %s' % self.drm_count)
        a('DRM Size: %s' % self.drm_size)
        a('DRM Flags: %r' % self.drm_flags)
    if self.has_extra_data_flags:
        a('Unknown4: %r' % self.unknown4)
        # first_text_record is only set for some header variants —
        # presumably newer (KF8) files; verify against the parser
        if hasattr(self, 'first_text_record'):
            a('First content record: %d' % self.first_text_record)
            a('Last content record: %d' % self.last_text_record)
        else:
            r('FDST Index', 'fdst_idx')
        a('FDST Count: %d' % self.fdst_count)
        r('FCIS number', 'fcis_number')
        a('FCIS count: %d' % self.fcis_count)
        r('FLIS number', 'flis_number')
        a('FLIS count: %d' % self.flis_count)
        a('Unknown6: %r' % self.unknown6)
        r('SRCS record index', 'srcs_record_index')
        a('Number of SRCS records?: %d' % self.num_srcs_records)
        a('Unknown7: %r' % self.unknown7)
        a(('Extra data flags: %s (has multibyte: %s) '
           '(has indexing: %s) (has uncrossable breaks: %s)') % (
            bin(self.extra_data_flags), self.has_multibytes,
            self.has_indexing_bytes, self.has_uncrossable_breaks))
        r('NCX index', 'primary_index_record')
    # Longer headers carry the KF8 index pointers
    if self.length >= 248:
        r('Sections Index', 'sect_idx')
        r('SKEL Index', 'skel_idx')
        r('DATP Index', 'datp_idx')
        r('Other Index', 'oth_idx')
        if self.unknown9:
            a('Unknown9: %r' % self.unknown9)

    ans = '\n'.join(ans)

    if self.has_exth:
        ans += '\n\n' + str(self.exth)
    ans += '\n\nBytes after EXTH (%d bytes): %s' % (len(
        self.bytes_after_exth), format_bytes(self.bytes_after_exth))
    ans += '\nNumber of bytes after full name: %d' % (
        len(self.raw) - (self.fullname_offset + self.fullname_length))
    ans += '\nRecord 0 length: %d' % len(self.raw)
    return ans
def read_tbs(self):
    """Decode the TBS (trailing byte sequence) indexing of every text
    record, filling ``self.indexing_data`` with a readable description
    per record.

    The index is also re-encoded with the writer's TBS implementation
    and a warning is printed when the calculated bytes differ from the
    bytes actually present in the file.
    """
    from calibre.ebooks.mobi.writer8.tbs import (Entry, DOC,
            collect_indexing_data, encode_strands_as_sequences,
            sequences_to_bytes, calculate_all_tbs, NegativeStrandIndex)
    entry_map = []
    for index in self.ncx_index:
        # Pad with Nones so the Entry tuple is always fully populated
        vals = list(index)[:-1] + [None, None, None, None]
        entry_map.append(Entry(*(vals[:12])))

    indexing_data = collect_indexing_data(entry_map, list(map(len,
        self.text_records)))
    self.indexing_data = [DOC + '\n' + textwrap.dedent('''\
            Index Entry lines are of the form:
            depth:index_number [action] parent (index_num-parent) Geometry

            Where Geometry is the start and end of the index entry w.r.t
            the start of the text record.

            ''')]

    # Try the default TBS type first, falling back when strand indices
    # go negative
    tbs_type = 8
    try:
        calculate_all_tbs(indexing_data)
    except NegativeStrandIndex:
        calculate_all_tbs(indexing_data, tbs_type=5)
        tbs_type = 5

    for i, strands in enumerate(indexing_data):
        rec = self.text_records[i]
        tbs_bytes = rec.trailing_data.get('indexing', b'')
        desc = ['Record #%d'%i]
        for s, strand in enumerate(strands):
            desc.append('Strand %d'%s)
            for entries in itervalues(strand):
                for e in entries:
                    desc.append(
                        ' %s%d [%-9s] parent: %s (%d) Geometry: (%d, %d)'%(
                            e.depth * (' ') + '- ', e.index, e.action,
                            e.parent, e.index-(e.parent or 0),
                            e.start-i*RECORD_SIZE,
                            e.start+e.length-i*RECORD_SIZE))
        desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
        flag_sz = 3
        sequences = []
        otbs = tbs_bytes
        while tbs_bytes:
            try:
                val, extra, consumed = decode_tbs(tbs_bytes,
                        flag_size=flag_sz)
            except Exception:
                # Narrowed from a bare except: undecodable bytes end the
                # loop and are reported as "Remaining bytes" below
                break
            flag_sz = 4  # only the first sequence uses a 3 bit flag field
            tbs_bytes = tbs_bytes[consumed:]
            extra = {bin(k):v for k, v in iteritems(extra)}
            sequences.append((val, extra))
        for j, seq in enumerate(sequences):
            desc.append('Sequence #%d: %r %r'%(j, seq[0], seq[1]))
        if tbs_bytes:
            desc.append('Remaining bytes: %s'%format_bytes(tbs_bytes))
        # Round trip check: re-encode and compare with the file's bytes
        calculated_sequences = encode_strands_as_sequences(strands,
                tbs_type=tbs_type)
        try:
            calculated_bytes = sequences_to_bytes(calculated_sequences)
        except Exception:
            # Narrowed from a bare except; sentinel guarantees a mismatch
            calculated_bytes = b'failed to calculate tbs bytes'
        if calculated_bytes != otbs:
            print('WARNING: TBS mismatch for record %d'%i)
            desc.append('WARNING: TBS mismatch!')
            desc.append('Calculated sequences: %r'%calculated_sequences)
        desc.append('')
        self.indexing_data.append('\n'.join(desc))
def read_tbs(self):
    """Decode and verify the TBS (trailing byte sequence) indexing data
    for every text record, appending a human readable description of
    each record to ``self.indexing_data``.

    Also re-encodes the index with the writer's TBS code and warns when
    the calculated bytes differ from those stored in the file.
    """
    from calibre.ebooks.mobi.writer8.tbs import (
        Entry, DOC, collect_indexing_data, encode_strands_as_sequences,
        sequences_to_bytes, calculate_all_tbs, NegativeStrandIndex)
    entry_map = []
    for index in self.ncx_index:
        # Pad with Nones so the Entry tuple is always fully populated
        vals = list(index)[:-1] + [None, None, None, None]
        entry_map.append(Entry(*(vals[:12])))

    indexing_data = collect_indexing_data(
        entry_map, list(map(len, self.text_records)))
    self.indexing_data = [
        DOC + '\n' + textwrap.dedent('''\
            Index Entry lines are of the form:
            depth:index_number [action] parent (index_num-parent) Geometry

            Where Geometry is the start and end of the index entry w.r.t
            the start of the text record.

            ''')
    ]

    # Try the default TBS type, falling back when strand indices go
    # negative
    tbs_type = 8
    try:
        calculate_all_tbs(indexing_data)
    except NegativeStrandIndex:
        calculate_all_tbs(indexing_data, tbs_type=5)
        tbs_type = 5

    for i, strands in enumerate(indexing_data):
        rec = self.text_records[i]
        tbs_bytes = rec.trailing_data.get('indexing', b'')
        desc = ['Record #%d' % i]
        for s, strand in enumerate(strands):
            desc.append('Strand %d' % s)
            # FIX: dict.itervalues() is Python 2 only; .values() iterates
            # identically on both Python 2 and 3
            for entries in strand.values():
                for e in entries:
                    desc.append(
                        ' %s%d [%-9s] parent: %s (%d) Geometry: (%d, %d)'
                        % (e.depth * (' ') + '- ', e.index, e.action,
                           e.parent, e.index - (e.parent or 0),
                           e.start - i * RECORD_SIZE,
                           e.start + e.length - i * RECORD_SIZE))
        desc.append('TBS Bytes: ' + format_bytes(tbs_bytes))
        flag_sz = 3
        sequences = []
        otbs = tbs_bytes
        while tbs_bytes:
            try:
                val, extra, consumed = decode_tbs(tbs_bytes,
                                                  flag_size=flag_sz)
            except Exception:
                # Narrowed from a bare except: stop decoding and report
                # the leftovers as "Remaining bytes" below
                break
            flag_sz = 4  # only the first sequence has a 3 bit flag field
            # FIX: dict.iteritems() is Python 2 only; .items() works on
            # both Python 2 and 3
            extra = {bin(k): v for k, v in extra.items()}
            tbs_bytes = tbs_bytes[consumed:]
            sequences.append((val, extra))
        for j, seq in enumerate(sequences):
            desc.append('Sequence #%d: %r %r' % (j, seq[0], seq[1]))
        if tbs_bytes:
            desc.append('Remaining bytes: %s' % format_bytes(tbs_bytes))
        # Round trip check: re-encode and compare with the stored bytes
        calculated_sequences = encode_strands_as_sequences(
            strands, tbs_type=tbs_type)
        try:
            calculated_bytes = sequences_to_bytes(calculated_sequences)
        except Exception:
            # Narrowed from a bare except; sentinel forces a mismatch
            calculated_bytes = b'failed to calculate tbs bytes'
        if calculated_bytes != otbs:
            print('WARNING: TBS mismatch for record %d' % i)
            desc.append('WARNING: TBS mismatch!')
            desc.append('Calculated sequences: %r' % calculated_sequences)
        desc.append('')
        self.indexing_data.append('\n'.join(desc))
def __str__(self):
    """Render the entire MOBI header as a human readable multi-line report."""
    ans = ['*'*20 + ' MOBI %d Header '%self.file_version + '*'*20]
    a = ans.append

    def i(d, x):
        # Print a field, showing NULL_INDEX values as the string 'NULL'
        x = 'NULL' if x == NULL_INDEX else x
        a('%s: %s'%(d, x))

    def r(d, attr):
        # Print a record pointer; for relative records show both the
        # absolute value and the value relative to the header offset
        x = getattr(self, attr)
        if attr in self.relative_records and x != NULL_INDEX:
            a('%s: Absolute: %d Relative: %d'%(d, x, x-self.header_offset))
        else:
            i(d, x)

    a('Compression: %s'%self.compression)
    a('Unused: %r'%self.unused)
    a('Text length: %d'%self.text_length)
    a('Number of text records: %d'%self.number_of_text_records)
    a('Text record size: %d'%self.text_record_size)
    a('Encryption: %s'%self.encryption_type)
    a('Unknown: %r'%self.unknown)
    a('Identifier: %r'%self.identifier)
    a('Header length: %d'% self.length)
    a('Type: %s'%self.type)
    a('Encoding: %s'%self.encoding)
    a('UID: %r'%self.uid)
    a('File version: %d'%self.file_version)
    r('Meta Orth Index', 'meta_orth_indx')
    r('Meta Infl Index', 'meta_infl_indx')
    r('Secondary index record', 'secondary_index_record')
    a('Reserved: %r'%self.reserved)
    r('First non-book record', 'first_non_book_record')
    a('Full name offset: %d'%self.fullname_offset)
    a('Full name length: %d bytes'%self.fullname_length)
    a('Langcode: %r'%self.locale_raw)
    a('Language: %s'%self.language)
    a('Sub language: %s'%self.sublanguage)
    a('Input language: %r'%self.input_language)
    a('Output language: %r'%self.output_langauage)
    a('Min version: %d'%self.min_version)
    r('First Image index', 'first_image_index')
    r('Huffman record offset', 'huffman_record_offset')
    a('Huffman record count: %d'%self.huffman_record_count)
    r('Huffman table offset', 'datp_record_offset')
    a('Huffman table length: %r'%self.datp_record_count)
    a('EXTH flags: %s (%s)'%(bin(self.exth_flags)[2:], self.has_exth))
    # The DRM fields are only present when the header says so
    if self.has_drm_data:
        a('Unknown3: %r'%self.unknown3)
        r('DRM Offset', 'drm_offset')
        a('DRM Count: %s'%self.drm_count)
        a('DRM Size: %s'%self.drm_size)
        a('DRM Flags: %r'%self.drm_flags)
    if self.has_extra_data_flags:
        a('Unknown4: %r'%self.unknown4)
        # first_text_record is only set for some header variants —
        # presumably newer (KF8) files; verify against the parser
        if hasattr(self, 'first_text_record'):
            a('First content record: %d'%self.first_text_record)
            a('Last content record: %d'%self.last_text_record)
        else:
            r('FDST Index', 'fdst_idx')
        a('FDST Count: %d'% self.fdst_count)
        r('FCIS number', 'fcis_number')
        a('FCIS count: %d'% self.fcis_count)
        r('FLIS number', 'flis_number')
        a('FLIS count: %d'% self.flis_count)
        a('Unknown6: %r'% self.unknown6)
        r('SRCS record index', 'srcs_record_index')
        a('Number of SRCS records?: %d'%self.num_srcs_records)
        a('Unknown7: %r'%self.unknown7)
        a(('Extra data flags: %s (has multibyte: %s) '
            '(has indexing: %s) (has uncrossable breaks: %s)')%(
                bin(self.extra_data_flags), self.has_multibytes,
                self.has_indexing_bytes, self.has_uncrossable_breaks))
        r('NCX index', 'primary_index_record')
    # Longer headers carry the KF8 index pointers
    if self.length >= 248:
        r('Sections Index', 'sect_idx')
        r('SKEL Index', 'skel_idx')
        r('DATP Index', 'datp_idx')
        r('Other Index', 'oth_idx')
        if self.unknown9:
            a('Unknown9: %r'%self.unknown9)

    ans = '\n'.join(ans)

    if self.has_exth:
        ans += '\n\n' + str(self.exth)
    ans += '\n\nBytes after EXTH (%d bytes): %s'%(
            len(self.bytes_after_exth),
            format_bytes(self.bytes_after_exth))
    ans += '\nNumber of bytes after full name: %d' % (len(self.raw) -
            (self.fullname_offset + self.fullname_length))
    ans += '\nRecord 0 length: %d'%len(self.raw)
    return ans