def get_metadata(stream, extract_cover=True): """ Return metadata as a L{MetaInfo} object """ mi = MetaInformation(None, [_('Unknown')]) stream.seek(0) pheader = PdbHeaderReader(stream) # Only Dropbook produced 132 byte record0 files are supported if len(pheader.section_data(0)) == 132: hr = HeaderRecord(pheader.section_data(0)) if hr.compression in (2, 10) and hr.has_metadata == 1: try: mdata = pheader.section_data(hr.metadata_offset) mdata = mdata.decode('cp1252', 'replace').split('\x00') mi.title = re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[0]) mi.authors = [re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[1])] mi.publisher = re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[3]) mi.isbn = re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', mdata[4]) except Exception: pass if extract_cover: mi.cover_data = get_cover(pheader, hr) if not mi.title: mi.title = pheader.title if pheader.title else _('Unknown') return mi
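# Hedged usage sketch (not part of the original module): read metadata from a
# hypothetical eReader PDB file. get_metadata() only needs a seekable binary
# stream, since it rewinds to the start itself.
def _example_read_metadata(path='book.pdb'):
    with open(path, 'rb') as stream:
        mi = get_metadata(stream, extract_cover=False)
        print(mi.title, mi.authors, mi.publisher, mi.isbn)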
def set_metadata(stream, mi):
    pheader = PdbHeaderReader(stream)

    # Only Dropbook produced 132 byte record0 files are supported
    if len(pheader.section_data(0)) != 132:
        return

    sections = [pheader.section_data(x) for x in range(0, pheader.section_count())]
    hr = HeaderRecord(sections[0])

    if hr.compression not in (2, 10):
        return

    # Record 0 is patched in place below, so it must be mutable
    sections[0] = bytearray(sections[0])

    # Create a metadata record for the file if one does not already exist
    if not hr.has_metadata:
        sections += [b'', b'MeTaInFo\x00']
        last_data = len(sections) - 1

        for i in range(0, 132, 2):
            val, = struct.unpack('>H', sections[0][i:i + 2])
            if val >= hr.last_data_offset:
                sections[0][i:i + 2] = struct.pack('>H', last_data)

        sections[0][24:26] = struct.pack('>H', 1)  # Set has metadata
        sections[0][44:46] = struct.pack('>H', last_data - 1)  # Set location of metadata
        sections[0][52:54] = struct.pack('>H', last_data)  # Ensure last data offset is updated

    # Merge the metadata into the file
    file_mi = get_metadata(stream, False)
    file_mi.smart_update(mi)
    sections[hr.metadata_offset] = (
        '%s\x00%s\x00%s\x00%s\x00%s\x00' %
        (file_mi.title, authors_to_string(file_mi.authors), '',
         file_mi.publisher, file_mi.isbn)).encode('cp1252', 'replace')

    # Rebuild the PDB wrapper because the offsets have changed due to the
    # new metadata.
    pheader_builder = PdbHeaderBuilder(pheader.ident, pheader.title)
    stream.seek(0)
    stream.truncate(0)
    pheader_builder.build_header([len(x) for x in sections], stream)

    # Write the data back to the file
    for item in sections:
        stream.write(item)
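# Hedged usage sketch (not part of the original module): merge new metadata
# into an existing eReader PDB in place. The file must be opened read/write
# because set_metadata() truncates and rewrites the whole stream.
def _example_write_metadata(path='book.pdb'):
    mi = MetaInformation('New Title', ['New Author'])
    with open(path, 'r+b') as stream:
        set_metadata(stream, mi)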
def get_pages_fast(self, mobi_file_path):
    '''
    2300 characters of uncompressed text per page. This is not meant to map
    1 to 1 to a print book but to be a close enough measure. A test book was
    chosen and the characters were counted on one page. This number was
    rounded to 2240, then 60 characters of markup were added to the total,
    giving 2300.

    Uncompressed text length is used because it's easily accessible in MOBI
    files (part of the header). Also, it's faster to work off the length than
    to decompress and parse the actual text.
    '''
    text_length = 0
    pages = []
    count = 0

    with lopen(mobi_file_path, 'rb') as mf:
        phead = PdbHeaderReader(mf)
        r0 = phead.section_data(0)
        text_length = struct.unpack('>I', r0[4:8])[0]

    while count < text_length:
        pages.append(count)
        count += 2300

    return pages
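# Worked example (illustrative, not in the original source): the loop above is
# equivalent to stepping a range() by 2300, so a book with 10,000 uncompressed
# characters yields five page-start offsets.
def _example_page_offsets(text_length=10000):
    pages = list(range(0, text_length, 2300))
    assert pages == [0, 2300, 4600, 6900, 9200]
    return pages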
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)
    section_data = None
    for i in range(1, pheader.num_sections):
        raw_data = pheader.section_data(i)
        section_header = SectionHeader(raw_data)
        if section_header.type == DATATYPE_METADATA:
            section_data = raw_data[8:]
            break

    if not section_data:
        return mi

    default_encoding = 'latin-1'
    record_count, = struct.unpack('>H', section_data[0:2])

    adv = 0
    title = None
    author = None
    pubdate = 0
    for i in range(record_count):
        try:
            type, length = struct.unpack_from('>HH', section_data, 2 + adv)
        except struct.error:
            break

        # CharSet
        if type == 1:
            val, = struct.unpack('>H', section_data[6+adv:8+adv])
            default_encoding = MIBNUM_TO_NAME.get(val, 'latin-1')
        # Author
        elif type == 4:
            author = section_data[6+adv:6+adv+(2*length)]
        # Title
        elif type == 5:
            title = section_data[6+adv:6+adv+(2*length)]
        # Publication Date
        elif type == 6:
            pubdate, = struct.unpack('>I', section_data[6+adv:6+adv+4])

        adv += 2*length

    if title:
        mi.title = title.replace(b'\0', b'').decode(default_encoding, 'replace')
    if author:
        author = author.replace(b'\0', b'').decode(default_encoding, 'replace')
        mi.authors = author.split(',')
    mi.pubdate = datetime.fromtimestamp(pubdate)

    return mi
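# Toy record (illustrative, not in the original source) showing the layout the
# loop above parses: a big-endian 2-byte type code and a 2-byte length, then
# the record data. Type 1 is CharSet and 106 is IANA's MIBenum for UTF-8; the
# length field's exact semantics follow the `adv += 2*length` bookkeeping above.
def _example_charset_record():
    record = struct.pack('>HHH', 1, 3, 106)  # type=1 (CharSet), length, MIBenum
    rtype, length = struct.unpack_from('>HH', record, 0)
    val, = struct.unpack('>H', record[4:6])
    assert (rtype, val) == (1, 106)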
def get_pages_exact(self, mobi_file_path, page_count):
    '''
    Given a specified page count (such as from a custom column), create
    our array of pages for the apnx file by dividing by the content size
    of the book.
    '''
    pages = []
    count = 0

    with lopen(mobi_file_path, 'rb') as mf:
        phead = PdbHeaderReader(mf)
        r0 = phead.section_data(0)
        text_length = struct.unpack('>I', r0[4:8])[0]

    chars_per_page = int(text_length // page_count)
    while count < text_length:
        pages.append(count)
        count += chars_per_page

    if len(pages) > page_count:
        # Rounding created extra page entries
        pages = pages[:page_count]

    return pages
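# Worked example (illustrative, not in the original source): a 230,000-character
# book with a user-supplied page_count of 100 gives 2300 characters per page,
# matching the heuristic constant used by get_pages_fast() above.
def _example_chars_per_page(text_length=230000, page_count=100):
    chars_per_page = text_length // page_count
    assert chars_per_page == 2300
    return chars_per_page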