def _parse_ereader_header132(raw):
    """Decode the first 54 bytes of *raw* into a ``Storage`` of header fields.

    The bytes are read as big-endian: one 16-bit value, one 32-bit value and
    then twenty-four 16-bit values.  Three of the decoded values have no
    known meaning and are discarded.

    Raises ``struct.error`` if *raw* holds fewer than 54 bytes.
    """
    # Attribute names in wire order; ``None`` marks a value that is decoded
    # but not stored.
    field_names = (
        "compression",
        None,  # unknown
        "encoding",
        "number_small_pages",
        "number_large_pages",
        "non_text_records",
        "number_chapters",
        "number_small_index",
        "number_large_index",
        "number_images",
        "number_links",
        "metadata_available",
        None,  # unknown
        "number_footnotes",
        "number_sidebars",
        "chapter_index_records",
        "magic_2560",
        "small_page_index_record",
        "large_page_index_record",
        "image_data_record",
        "links_record",
        "metadata_record",
        None,  # unknown
        "footnote_record",
        "sidebar_record",
        "last_data_record",
    )
    h = Storage()
    values = struct.unpack(
        # 2 + 4 + 24 * 2 = 54 bytes, 26 values.
        ">HL"
        "HHHHHHHH"
        "HHHHHHHH"
        "HHHHHHHH",
        raw[:54],
    )
    for attr, value in zip(field_names, values):
        if attr is not None:
            setattr(h, attr, value)
    return h
def _parse_palmdoc_header(raw):
    """Decode the first 16 bytes of *raw* into a ``Storage`` of header fields.

    The layout is big-endian: two 16-bit values, a 32-bit value, two 16-bit
    values and a final 32-bit value.  The second value is unused.

    Raises ``struct.error`` if *raw* holds fewer than 16 bytes.
    """
    h = Storage()
    (
        compression,
        _unused,
        text_length,
        record_count,
        record_size,
        current_position,
    ) = struct.unpack(">HHLHHL", raw[0:0x10])
    h.compression = compression
    h.text_length = text_length
    h.record_count = record_count
    h.record_size = record_size
    h.current_position = current_position
    return h
def parse_opf_xml(rawxml):
    """Parse OPF XML and collect the children of its main sections.

    *rawxml* is first normalised with ``xml_to_unicode`` and anything before
    the first ``<`` is dropped.  The result is parsed with a recovering
    parser, so malformed markup is tolerated where possible.

    Returns a ``Storage`` that maps each of ``metadata``, ``manifest``,
    ``spine`` and ``guide`` to a list of ``(tag, attrib, text)`` tuples, one
    per child element.  A section that is missing or has no children is
    simply absent from the result.
    """
    rawxml, _encoding = xml_to_unicode(rawxml, strip_encoding_pats=True,
                                       resolve_entities=True, assume_utf8=True)
    rawxml = rawxml[rawxml.find('<'):]
    tree = etree.fromstring(rawxml, etree.XMLParser(recover=True))
    opf = Storage()
    if tree is None:
        # NOTE(review): a recovering parser can apparently yield no root for
        # unrecoverable input; treat that the same as "no sections found"
        # instead of failing on ``tree.find`` below.
        return opf
    for section in ('metadata', 'manifest', 'spine', 'guide'):
        subtree = tree.find('opf:%s' % section, namespaces=NAMESPACES)
        if subtree is None:
            continue
        # Iterate the element directly; ``getchildren()`` is deprecated.
        for el in subtree:
            opf.setdefault(section, []).append((el.tag, el.attrib, el.text))
    return opf
def _parse_plucker_header(raw):
    """Decode a Plucker header and its reserved-record table from *raw*.

    The first 6 bytes hold three big-endian 16-bit values: ``uid``,
    ``compression`` and ``records``.  They are followed by ``records``
    entries of 4 bytes each, every entry being a ``(name, record id)`` pair
    of 16-bit values.

    The returned ``Storage`` additionally carries:

    * ``reserved``  -- dict mapping record id to name
    * ``home_html`` -- the record id of the entry whose name is 0, or
      ``None`` when there is no such entry (the last one wins if several
      exist)

    Raises ``struct.error`` if *raw* is too short for the declared entries.
    """
    h = Storage()
    h.uid, h.compression, h.records = struct.unpack(">HHH", raw[0:6])
    h.home_html = None
    reserved = {}
    # ``range`` keeps this consistent with the other parsers in this file;
    # ``records`` is a 16-bit value, so the sequence is always small.
    for i in range(h.records):
        start = 6 + 4 * i
        name, record_id = struct.unpack(">HH", raw[start:start + 4])
        reserved[record_id] = name
        if name == 0:
            h.home_html = record_id
    h.reserved = reserved
    return h
def _parse_pdb_header(stream):
    """Read the PDB header and record list from *stream*.

    *stream* must be a seekable binary file object positioned at the start
    of the header.  78 bytes of fixed header are read, followed by one
    8-byte record-list entry per record.  The stream is left positioned at
    end of file.

    Returns a ``Storage`` with the decoded header fields plus ``records``,
    a list of ``(offset, length)`` tuples.  Each length is the distance to
    the next record's offset; the last record extends to end of file.  The
    list is empty when the header declares zero records.

    ``name`` is cleaned up: NUL characters are removed and every run of
    characters outside a small allowed set is replaced by an underscore.

    Raises ``struct.error`` if the stream ends before the header or the
    record list is complete.
    """
    pdbheader = Storage()

    # PDB fields
    (
        pdbheader.name,
        pdbheader.attributes,
        pdbheader.version,
        pdbheader.creation_timestamp,
        pdbheader.modification_timestamp,
        pdbheader.last_backup_timestamp,
        pdbheader.modification_number,
        pdbheader.appinfo_offset,
        pdbheader.sortinfo_offset,
        pdbheader.type,
        pdbheader.creator,
        pdbheader.uniqueidseed,
        pdbheader.nextrecordlistid,
        pdbheader.num_records,
    ) = struct.unpack(">32sHHLLLLLL4s4sLLH", stream.read(78))

    # Record offsets and lengths.  Only the leading 32-bit offset of each
    # 8-byte entry is used.  Reading exactly ``num_records`` entries means a
    # database with no records no longer consumes a non-existent entry.
    offsets = [
        struct.unpack(">LBBBB", stream.read(8))[0]
        for _ in range(pdbheader.num_records)
    ]

    # Whence 2 seeks relative to end of file; the file size closes the
    # last record.
    stream.seek(0, 2)
    offsets.append(stream.tell())
    pdbheader.records = [
        (start, next_start - start)
        for start, next_start in zip(offsets, offsets[1:])
    ]

    # Clean up some of the fields
    pdbheader.name = re.sub("[^-A-Za-z0-9'\";:,. ]+", "_",
                            pdbheader.name.replace("\x00", ""))
    return pdbheader
def _parse_exth_header(raw):
    """Decode an EXTH block from *raw* into a ``Storage``.

    The first 12 bytes give ``identifier`` (4 bytes), ``header_length`` and
    ``record_count`` (big-endian 32-bit each).  The records follow; each
    starts with a 32-bit ``type`` and a 32-bit ``length``, where ``length``
    counts the whole record including those 8 bytes.  The remainder of a
    record is stored as ``data``.

    The parsed records are returned in order as ``exth.records``.

    Raises ``struct.error`` if the data ends before ``record_count`` record
    headers have been read.
    """
    exth = Storage()
    (
        exth.identifier,
        exth.header_length,
        exth.record_count,
    ) = struct.unpack('>4sLL', raw[:12])

    payload = raw[12:]
    parsed = []
    offset = 0
    remaining = exth.record_count
    while remaining > 0:
        remaining -= 1
        record = Storage()
        (
            record.type,
            record.length,
        ) = struct.unpack('>LL', payload[offset:offset + 8])
        # ``length`` covers the 8-byte record header as well as the data.
        record_end = offset + record.length
        record.data = payload[offset + 8:record_end]
        offset = record_end
        parsed.append(record)

    exth.records = parsed
    return exth
def _parse_ztxt_header(raw):
    """Decode the first 24 bytes of *raw* into a ``Storage`` of header fields.

    All values are big-endian.  One single-byte reserved value is decoded
    and discarded.

    Raises ``struct.error`` if *raw* holds fewer than 24 bytes.
    """
    h = Storage()
    (
        h.version,              # 16-bit
        h.record_count,         # 16-bit
        h.data_size,            # 32-bit
        h.record_size,          # 16-bit
        h.number_bookmarks,     # 16-bit
        h.bookmark_record,      # 16-bit
        h.number_annotations,   # 16-bit
        h.annotation_record,    # 16-bit
        h.flags,                # 8-bit
        _reserved,              # 8-bit, ignored
        h.crc32,                # 32-bit
    ) = struct.unpack(">HHLHHHHHBBL", raw[0:24])
    return h
def _parse_ereader_header202(raw):
    """Decode the first 10 bytes of *raw* into a ``Storage``.

    Only ``version`` (bytes 0-1) and ``non_text_records`` (bytes 8-9) are
    kept, both big-endian 16-bit values; the six bytes in between are
    skipped.

    Raises ``struct.error`` if *raw* holds fewer than 10 bytes.
    """
    # Unfortunately, this header format is mostly unknown
    h = Storage()
    version, _unknown, non_text_records = struct.unpack(">H6sH", raw[:10])
    h.version = version
    h.non_text_records = non_text_records
    return h
def _parse_mobi_header(raw):
    """Decode a MOBI header from *raw* into a ``Storage``.

    Parsing proceeds in stages:

    1. The first 16 bytes are always decoded.  If *raw* is no longer than
       16 bytes, only those fields are returned.
    2. Bytes 0x10-0x83 give ``identifier`` (4 bytes) followed by 28
       big-endian 32-bit fields.
    3. If *raw* is at least 0xb4 bytes long, the four ``drm_*`` fields are
       read from 0xa4-0xb3; otherwise those attributes are not set.
    4. ``extra_flags`` is read from 0xf2-0xf3 when ``header_length`` lies
       between 0xe4 and 0xf8 inclusive, and is 0 otherwise.
    5. ``fullname`` is the slice described by ``fullname_offset`` and
       ``fullname_length`` when it lies inside *raw*, else ``None``.
    6. When bit 0x40 of ``exth_flags`` is set, the data following the header
       (offset ``16 + header_length``) is parsed with ``_parse_exth_header``
       and stored as ``exth``.

    Raises ``struct.error`` if *raw* is too short for a field that is read.
    """
    mobiheader = Storage()
    (
        mobiheader.compression,
        _unused,
        mobiheader.text_length,
        mobiheader.record_count,
        mobiheader.record_size,
        mobiheader.encryption,
        _unknown,
    ) = struct.unpack('>HHLHHHH', raw[0:0x10])

    # Some ancient MOBI files have no more metadata than this
    if len(raw) <= 16:
        return mobiheader

    (
        mobiheader.identifier,
        mobiheader.header_length,
        mobiheader.mobi_type,
        mobiheader.text_encoding,
        mobiheader.unique_id,
        mobiheader.file_version,
        mobiheader.ortographic_index_record,
        mobiheader.inflection_index_record,
        mobiheader.index_names_record,
        mobiheader.index_keys_record,
        mobiheader.extra_index0_record,
        mobiheader.extra_index1_record,
        mobiheader.extra_index2_record,
        mobiheader.extra_index3_record,
        mobiheader.extra_index4_record,
        mobiheader.extra_index5_record,
        mobiheader.first_nonbook_record,
        mobiheader.fullname_offset,
        mobiheader.fullname_length,
        mobiheader.locale,
        mobiheader.dictionary_input_language,
        mobiheader.dictionary_output_language,
        mobiheader.min_version,
        mobiheader.first_image_record,
        mobiheader.huffman_record,
        mobiheader.huffman_record_count,
        mobiheader.huffman_table_record,
        mobiheader.huffman_table_length,
        mobiheader.exth_flags,
        # 4 + 28 * 4 = 0x74 bytes: the identifier plus 28 32-bit fields.
    ) = struct.unpack('>4s28L', raw[0x10:0x84])

    if len(raw) >= 0xb4:
        (
            mobiheader.drm_offset,
            mobiheader.drm_count,
            mobiheader.drm_size,
            mobiheader.drm_flags,
        ) = struct.unpack('>LLLL', raw[0xa4:0xb4])

    if 0xe4 <= mobiheader.header_length <= 0xf8:
        mobiheader.extra_flags, = struct.unpack('>H', raw[0xf2:0xf4])
    else:
        mobiheader.extra_flags = 0

    fullname_end = mobiheader.fullname_offset + mobiheader.fullname_length
    # ``<=`` rather than ``<``: a name that ends exactly at the end of
    # ``raw`` is still fully contained in it.
    if fullname_end <= len(raw):
        mobiheader.fullname = raw[mobiheader.fullname_offset:fullname_end]
    else:
        mobiheader.fullname = None

    if mobiheader.exth_flags & 0x40:
        mobiheader.exth = _parse_exth_header(raw[16 + mobiheader.header_length:])

    return mobiheader