def extract_content(self, output_dir):
    """
    Write the book's text and images below ``output_dir``.

    The text pages are concatenated and converted from PML to HTML with
    ``PML_HTMLizer``; footnote and sidebar pages, when the header record
    reports any, are appended under their own headings. The HTML is saved
    as ``index.html`` (UTF-8) in ``output_dir`` and every image is saved
    in ``output_dir/images/``. Missing directories are created.

    :param output_dir: Directory to extract into; made absolute first.
    :return: Whatever ``self.create_opf(output_dir, images, toc)`` returns.
    """
    from calibre.ebooks.pml.pmlconverter import footnote_to_html, sidebar_to_html
    from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

    output_dir = os.path.abspath(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    header = self.header_record

    title = self.mi.title
    if not isinstance(title, unicode_type):
        title = title.decode('utf-8', 'replace')

    html = u'<html><head><title>%s</title></head><body>' % title

    # Gather the raw PML of every text page (pages are numbered from 1).
    page_texts = []
    for page_no in range(1, header.num_text_pages + 1):
        self.log.debug('Extracting text page %i' % page_no)
        page_texts.append(self.get_text_page(page_no))
    pml = u''.join(page_texts)

    converter = PML_HTMLizer()
    html += converter.parse_pml(pml, 'index.html')
    toc = converter.get_toc()

    if header.footnote_count > 0:
        html += '<br /><h1>%s</h1>' % _('Footnotes')
        # The record at footnote_offset is scanned for NUL-terminated ids;
        # the records after it, up to offset + count, are the footnotes.
        id_record = self.section_data(header.footnote_offset)
        codec = 'cp1252' if self.encoding is None else self.encoding
        footnoteids = re.findall('\\w+(?=\x00)', id_record.decode(codec))
        first_page = header.footnote_offset + 1
        stop_page = header.footnote_offset + header.footnote_count
        for position, page_no in enumerate(range(first_page, stop_page)):
            self.log.debug('Extracting footnote page %i' % page_no)
            # Pages without a matching id get an empty id.
            note_id = footnoteids[position] if position < len(footnoteids) else ''
            html += footnote_to_html(note_id, self.decompress_text(page_no))

    if header.sidebar_count > 0:
        html += '<br /><h1>%s</h1>' % _('Sidebar')
        # Same layout as the footnotes: an id record followed by the pages.
        id_record = self.section_data(header.sidebar_offset)
        codec = 'cp1252' if self.encoding is None else self.encoding
        sidebarids = re.findall('\\w+(?=\x00)', id_record.decode(codec))
        first_page = header.sidebar_offset + 1
        stop_page = header.sidebar_offset + header.sidebar_count
        for position, page_no in enumerate(range(first_page, stop_page)):
            self.log.debug('Extracting sidebar page %i' % page_no)
            bar_id = sidebarids[position] if position < len(sidebarids) else ''
            html += sidebar_to_html(bar_id, self.decompress_text(page_no))

    html += '</body></html>'

    with CurrentDir(output_dir):
        with open('index.html', 'wb') as index:
            self.log.debug('Writing text to index.html')
            index.write(html.encode('utf-8'))

    image_dir = os.path.join(output_dir, 'images/')
    if not os.path.exists(image_dir):
        os.makedirs(image_dir)

    images = []
    with CurrentDir(image_dir):
        for image_no in range(0, header.num_image_pages):
            name, img = self.get_image(header.image_data_offset + image_no)
            images.append(name)
            with open(name, 'wb') as imgf:
                self.log.debug('Writing image %s to images/' % name)
                imgf.write(img)

    return self.create_opf(output_dir, images, toc)
def extract_content(self, output_dir):
    """
    Extract the book into ``output_dir`` as HTML plus images.

    Steps, in order:

    1. Create ``output_dir`` if it does not exist.
    2. Concatenate all text pages and convert the PML to HTML with
       ``PML_HTMLizer``, keeping the table of contents it produces.
    3. Append footnote and sidebar pages under their own headings when
       the header record reports a positive count for them.
    4. Write the HTML, UTF-8 encoded, to ``output_dir/index.html``.
    5. Write every image to ``output_dir/images/``.

    :param output_dir: Target directory; converted to an absolute path.
    :return: The value returned by
             ``self.create_opf(output_dir, images, toc)``.
    """
    from calibre.ebooks.pml.pmlconverter import footnote_to_html, sidebar_to_html
    from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

    output_dir = os.path.abspath(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    title = self.mi.title
    # type(u'') is the text type on both Python 2 (unicode) and Python 3
    # (str); the bare name ``unicode`` does not exist on Python 3.
    if not isinstance(title, type(u'')):
        title = title.decode('utf-8', 'replace')

    html = u'<html><head><title>%s</title></head><body>' % title

    # Collect the pages and join once instead of growing a string per page.
    pml_pages = []
    for i in range(1, self.header_record.num_text_pages + 1):
        self.log.debug('Extracting text page %i' % i)
        pml_pages.append(self.get_text_page(i))
    pml = u''.join(pml_pages)

    hizer = PML_HTMLizer()
    html += hizer.parse_pml(pml, 'index.html')
    toc = hizer.get_toc()

    if self.header_record.footnote_count > 0:
        html += '<br /><h1>%s</h1>' % _('Footnotes')
        # The record at footnote_offset is scanned for NUL-terminated ids.
        # The backslash is doubled because '\w' is an invalid escape in a
        # normal string literal; '\x00' must stay a real NUL character.
        footnoteids = re.findall(
            '\\w+(?=\x00)',
            self.section_data(self.header_record.footnote_offset).decode(
                'cp1252' if self.encoding is None else self.encoding))
        footnote_pages = range(
            self.header_record.footnote_offset + 1,
            self.header_record.footnote_offset + self.header_record.footnote_count)
        for idx, i in enumerate(footnote_pages):
            self.log.debug('Extracting footnote page %i' % i)
            # Fall back to an empty id when there are more pages than ids.
            fid = footnoteids[idx] if idx < len(footnoteids) else ''
            html += footnote_to_html(fid, self.decompress_text(i))

    if self.header_record.sidebar_count > 0:
        html += '<br /><h1>%s</h1>' % _('Sidebar')
        # Sidebars are handled like footnotes: id record first, pages after.
        sidebarids = re.findall(
            '\\w+(?=\x00)',
            self.section_data(self.header_record.sidebar_offset).decode(
                'cp1252' if self.encoding is None else self.encoding))
        sidebar_pages = range(
            self.header_record.sidebar_offset + 1,
            self.header_record.sidebar_offset + self.header_record.sidebar_count)
        for idx, i in enumerate(sidebar_pages):
            self.log.debug('Extracting sidebar page %i' % i)
            sid = sidebarids[idx] if idx < len(sidebarids) else ''
            html += sidebar_to_html(sid, self.decompress_text(i))

    html += '</body></html>'

    with CurrentDir(output_dir):
        with open('index.html', 'wb') as index:
            self.log.debug('Writing text to index.html')
            index.write(html.encode('utf-8'))

    images_dir = os.path.join(output_dir, 'images/')
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)

    images = []
    with CurrentDir(images_dir):
        for i in range(0, self.header_record.num_image_pages):
            name, img = self.get_image(self.header_record.image_data_offset + i)
            images.append(name)
            with open(name, 'wb') as imgf:
                self.log.debug('Writing image %s to images/' % name)
                imgf.write(img)

    opf_path = self.create_opf(output_dir, images, toc)

    return opf_path