def extract_text(self, offset=1):
    """Decompress the book's text records into ``self.mobi_html``.

    Reads the text records starting at section index ``offset``, picks a
    decompressor from ``self.book_header.compression_type``, joins the
    decompressed records into one byte string, applies a few byte-level
    clean-ups and stores the result in ``self.mobi_html``.

    :param offset: Index of the first text record in ``self.sections``.
    :return: List of the record indices this method accounts for:
        ``offset - 1`` through ``offset + records - 1``, followed by the
        Huffman table records when the book is Huffman ('DH') compressed.
        Note that this range is not clamped to ``len(self.sections)``,
        unlike the range of text records that is actually read.
    :raises MobiError: If the compression type is not recognised.
    """
    self.log.debug('Extracting text...')
    header = self.book_header

    # Never read past the end of the section table, even if the header
    # claims more text records than the file contains.
    text_end = min(header.records + offset, len(self.sections))
    text_sections = [self.text_section(i) for i in range(offset, text_end)]
    processed_records = list(range(offset - 1, header.records + offset))

    # Reset first so the attribute is empty if decompression fails below.
    self.mobi_html = b''

    # The markers are matched as both text and bytes: the header parser is
    # not visible from here, and on Python 3 a value sliced from raw record
    # data would be bytes and never equal the str literals.
    compression = header.compression_type
    if compression in ('DH', b'DH'):
        huff_records = list(range(header.huff_offset,
                                  header.huff_offset + header.huff_number))
        huffs = [self.sections[i][0] for i in huff_records]
        processed_records += huff_records
        unpack = HuffReader(huffs).unpack
    elif compression in ('\x00\x02', b'\x00\x02'):
        unpack = decompress_doc
    elif compression in ('\x00\x01', b'\x00\x01'):
        # Records are stored as-is; pass them through untouched.
        unpack = lambda x: x
    else:
        raise MobiError('Unknown compression algorithm: %s' %
                        repr(header.compression_type))

    # Everything from here on operates on bytes, so every literal must be a
    # bytes literal; mixing in text literals raises TypeError on Python 3.
    html = b''.join(map(unpack, text_sections))
    if html.endswith(b'#'):
        # Drop a single trailing '#' byte.
        html = html[:-1]

    if header.ancient and b'<html' not in html[:300].lower():
        # Old-format book with no HTML markup near the start: turn
        # carriage-return + space into a blank line followed by a space.
        html = html.replace(b'\r ', b'\n\n ')
    html = html.replace(b'\0', b'')
    if header.codec == 'cp1252':
        html = html.replace(b'\x1e', b'')  # record separator
        html = html.replace(b'\x02', b'')  # start of text

    self.mobi_html = html
    return processed_records
def huffit(off, cnt):
    """Build a Huffman unpacker from ``cnt`` consecutive records.

    ``self`` is not a parameter: it is resolved from the enclosing scope,
    which is not visible in this part of the file.

    :param off: Index of the first Huffman record in ``self.records``.
    :param cnt: Number of consecutive Huffman records to read.
    :return: Tuple ``(record_numbers, unpack)``: the list of record indices
        that were consumed and the ``unpack`` method of the ``HuffReader``
        built from the ``raw`` data of those records.
    """
    # ``range`` rather than ``xrange``: the latter does not exist on
    # Python 3, and the result is materialised into a list either way.
    huffman_record_nums = list(range(off, off + cnt))
    huffrecs = [self.records[r].raw for r in huffman_record_nums]
    huffs = HuffReader(huffrecs)
    return huffman_record_nums, huffs.unpack