def generate_toc(self, oeb_book, ref_url, output_dir):
    '''
    Generate table of contents.

    Walks ``oeb_book.toc`` recursively and returns a ``div`` element
    wrapping nested ``ul``/``li``/``a`` elements. Link targets are made
    relative to the directory of ``ref_url``; TOC hrefs are resolved
    while ``output_dir`` is the current directory.
    '''
    def as_text(value):
        # Bytes are decoded as UTF-8, anything else is passed through.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def build_node(current_node, parent=None):
        if parent is None:
            # Top level call: start a fresh list.
            container = etree.Element('ul')
        elif len(current_node.nodes):
            # Nested entries get their own sub-list under the parent item.
            container = element(parent, ('ul'))
        else:
            container = parent

        for child in current_node.nodes:
            entry = element(container, 'li')
            target = as_text(
                relpath(os.path.abspath(polyglot.unquote(child.href)),
                        os.path.dirname(ref_url)))
            anchor = element(entry, 'a', href=clean_xml_chars(target))

            label = as_text(child.title)
            if label:
                # Collapse every run of whitespace into a single space.
                label = re.sub(r'\s+', ' ', label)
            anchor.text = clean_xml_chars(label)

            build_node(child, entry)
        return container

    with directory.CurrentDir(output_dir):
        wrapper = etree.Element('div')
        wrapper.append(build_node(oeb_book.toc))
        return wrapper
def extract_content(self, output_dir):
    '''
    Write the text as ``index.html`` and the images into ``images/``
    below ``output_dir``, then create the OPF.

    Returns whatever ``self.create_opf`` returns for the output directory
    and the list of written image names.
    '''
    from ebook_converter.ebooks.pml.pmlconverter import pml_to_html

    output_dir = os.path.abspath(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Collect the PML of every text page, pages are numbered from 1.
    pages = []
    for page_no in range(1, self.header_record.num_text_pages + 1):
        self.log.debug('Extracting text page %i', page_no)
        pages.append(self.get_text_page(page_no))
    pml = ''.join(pages)

    title = self.mi.title
    if not isinstance(title, str):
        title = title.decode('utf-8', 'replace')

    template = '<html><head><title>%s</title></head><body>%s</body></html>'
    html = template % (title, pml_to_html(pml))

    with directory.CurrentDir(output_dir):
        with open('index.html', 'wb') as index:
            self.log.debug('Writing text to index.html')
            index.write(html.encode('utf-8'))

    images_dir = os.path.join(output_dir, 'images/')
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)

    images = []
    with directory.CurrentDir(images_dir):
        for section in range(self.header_record.non_text_offset,
                             len(self.sections)):
            name, img = self.get_image(section)
            if not name:
                # Sections without an image name are skipped.
                continue
            images.append(name)
            with open(name, 'wb') as imgf:
                self.log.debug('Writing image %s to images/', name)
                imgf.write(img)

    return self.create_opf(output_dir, images)
def dump_images(self, output_dir):
    '''
    This is primarily used for debugging and 3rd party tools to
    get the images in the file.

    Every image page is written into ``output_dir`` (created when it
    does not exist) under the name reported by ``self.get_image``.
    '''
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with directory.CurrentDir(output_dir):
        for page_index in range(self.header_record.num_image_pages):
            section = self.header_record.image_data_offset + page_index
            name, img = self.get_image(section)
            with open(name, 'wb') as imgf:
                imgf.write(img)
def create_opf(self, output_dir, images):
    '''
    Write ``metadata.opf`` into ``output_dir`` and return its path.

    The manifest lists ``index.html`` followed by every name in
    ``images`` prefixed with ``images/``; the spine holds only
    ``index.html``.
    '''
    with directory.CurrentDir(output_dir):
        opf = OPFCreator(output_dir, self.mi)

        manifest = [('index.html', None)]
        manifest.extend((os.path.join('images/', name), None)
                        for name in images)

        opf.create_manifest(manifest)
        opf.create_spine(['index.html'])
        with open('metadata.opf', 'wb') as opffile:
            opf.render(opffile)

    return os.path.join(output_dir, 'metadata.opf')
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Write ``oeb_book`` as a directory tree below ``output_path``.

    The OPF, NCX and page map documents returned by
    ``oeb_book.to_opf2`` are serialized first, then every manifest item
    is written to the location given by its href and unloaded from
    memory.

    :param oeb_book: book to write out.
    :param output_path: target directory, created when missing.
    :param input_plugin: not used by this method.
    :param opts: conversion options, stored on ``self.opts``.
    :param log: logger, stored on ``self.log``.
    '''
    self.log, self.opts = log, opts
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    with directory.CurrentDir(output_path):
        results = oeb_book.to_opf2(page_map=True)
        for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
            href, root = results.pop(key, [None, None])
            if root is None:
                continue

            if key == OPF_MIME:
                # The workarounds are best effort: a failure is logged and
                # the conversion carries on. Only ``Exception`` is caught so
                # KeyboardInterrupt/SystemExit still propagate.
                try:
                    self.workaround_nook_cover_bug(root)
                except Exception:
                    self.log.exception('Something went wrong while '
                                       'trying to workaround Nook '
                                       'cover bug, ignoring')
                try:
                    self.workaround_pocketbook_cover_bug(root)
                except Exception:
                    self.log.exception('Something went wrong while '
                                       'trying to workaround '
                                       'Pocketbook cover bug, '
                                       'ignoring')
                self.migrate_lang_code(root)

            raw = etree.tostring(root, pretty_print=True,
                                 encoding='utf-8', xml_declaration=True)
            if key == OPF_MIME:
                # Needed as I can't get lxml to output opf:role and
                # not output <opf:metadata> as well
                raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
            with open(href, 'wb') as f:
                f.write(raw)

        for item in oeb_book.manifest:
            if (not self.opts.expand_css and
                    item.media_type in OEB_STYLES and
                    hasattr(item.data, 'cssText') and
                    'nook' not in self.opts.output_profile.short_name):
                condense_sheet(item.data)

            path = os.path.abspath(polyglot.unquote(item.href))
            item_dir = os.path.dirname(path)
            if not os.path.exists(item_dir):
                os.makedirs(item_dir)
            with open(path, 'wb') as f:
                f.write(item.bytes_representation)
            item.unload_data_from_memory(memory=path)
def encrypt_fonts(self, uris, tdir, _uuid):  # {{{
    '''
    Obfuscate font files in place and build the matching encryption XML.

    The first 1024 bytes of every existing font are XOR-ed with a 16 byte
    key derived from the hexadecimal digits of ``_uuid``.

    :param uris: iterable of ``/`` separated font paths relative to
                 ``tdir``.
    :param tdir: directory the font paths are resolved against.
    :param _uuid: identifier supplying the key; needs at least 16
                  hexadecimal digits.
    :returns: the ``<encryption>`` document as a string, or ``None`` when
              no font was obfuscated.
    :raises ValueError: when ``_uuid`` has fewer than 16 hex digits.
    '''
    from xml.sax.saxutils import escape

    key = re.sub(r'[^a-fA-F0-9]', '', _uuid)
    if len(key) < 16:
        raise ValueError('UUID identifier %r is invalid' % _uuid)
    key = bytearray(polyglot.from_hex_bytes((key + key)[:32]))

    entry_template = (' <enc:EncryptedData>'
                      ' <enc:EncryptionMethod'
                      ' Algorithm="http://ns.adobe.com/pdf/enc#RC"/>'
                      ' <enc:CipherData>'
                      ' <enc:CipherReference URI="%s"/>'
                      ' </enc:CipherData>'
                      ' </enc:EncryptedData> ')

    with directory.CurrentDir(tdir):
        paths = [os.path.join(*x.split('/')) for x in uris]
        uris = dict(zip(uris, paths))
        fonts = []
        for uri, path in list(uris.items()):
            if not os.path.exists(path):
                uris.pop(uri)
                continue
            self.log.debug('Encrypting font: %s', uri)
            with open(path, 'r+b') as f:
                data = f.read(1024)
                if len(data) < 1024:
                    # The file is left untouched, so it must not be listed
                    # as encrypted either.
                    self.log.warning('Font %s is invalid, ignoring', path)
                    continue
                data = bytearray(data)
                f.seek(0)
                f.write(bytes(bytearray(data[i] ^ key[i % 16]
                                        for i in range(1024))))
            if not isinstance(uri, str):
                uri = uri.decode('utf-8')
            # The URI ends up in a double quoted XML attribute: escape the
            # markup characters and the quote as entities.
            fonts.append(entry_template % escape(uri, {'"': '&quot;'}))

        if not fonts:
            return None

        ans = ('<encryption'
               ' xmlns="urn:oasis:names:tc:opendocument:xmlns:container"'
               ' xmlns:enc="http://www.w3.org/2001/04/xmlenc#"'
               ' xmlns:deenc="http://ns.adobe.com/digitaleditions/enc"> ')
        ans += '\n'.join(fonts)
        ans += '\n</encryption>'
        return ans
def __call__(self, stream, odir, log):
    '''
    Extract an ODT ``stream`` into ``odir`` as XHTML plus an OPF file.

    Writes ``index.xhtml``, extracts the pictures from the archive and
    renders ``metadata.opf`` whose manifest lists every file found below
    the output directory.

    :param stream: seekable file object with the ODT document.
    :param odir: output directory, created when missing.
    :param log: logger used for progress and error messages.
    :returns: absolute path of the written ``metadata.opf``.
    '''
    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.ebooks.metadata.odt import get_metadata
    from ebook_converter.ebooks.metadata.opf2 import OPFCreator

    if not os.path.exists(odir):
        os.makedirs(odir)
    with directory.CurrentDir(odir):
        log.info('Extracting ODT file...')
        stream.seek(0)
        mi = get_metadata(stream, 'odt')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']
        self.filter_load(stream, mi, log)

        # NOTE(gryf): Here is a workaround for ODF2XHTML.xhtml() method,
        # which expects, that all lines are strings.
        html = ''.join([str(line) for line in self.lines])
        # A blanket img specification like this causes problems
        # with EPUB output as the containing element often has
        # an absolute height and width set that is larger than
        # the available screen real estate
        html = html.replace('img { width: 100%; height: 100%; }', '')
        # odf2xhtml creates empty title tag
        html = html.replace('<title></title>',
                            '<title>%s</title>' % (mi.title, ))
        try:
            html = self.fix_markup(html, log)
        except Exception:
            # Best effort: keep the unfiltered markup, but do not swallow
            # KeyboardInterrupt/SystemExit.
            log.exception('Failed to filter CSS, conversion may be slow')
        with open('index.xhtml', 'wb') as f:
            f.write(polyglot.as_bytes(html))

        zf = ZipFile(stream, 'r')
        self.extract_pictures(zf)

        opf = OPFCreator(os.path.abspath(os.getcwd()), mi)
        opf.create_manifest([(os.path.abspath(os.path.join(r, f2)), None)
                             for r, _, fnames in os.walk(os.getcwd())
                             for f2 in fnames])
        opf.create_spine([os.path.abspath('index.xhtml')])
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.abspath('metadata.opf')
def __call__(self, stream, options, file_ext, log, accelerators,
             output_dir):
    '''
    Run ``self.convert`` with ``output_dir`` as the current directory.

    Every entry already present in ``output_dir`` is removed before the
    conversion starts.

    :returns: whatever ``self.convert`` returns.
    '''
    try:
        log.info('InputFormatPlugin: %s running', self.name)
        if hasattr(stream, 'name'):
            log.info('on: %s', stream.name)
    except Exception:
        # In case stdout is broken. Only ``Exception`` is swallowed so that
        # KeyboardInterrupt/SystemExit still propagate.
        pass

    with directory.CurrentDir(output_dir):
        for x in os.listdir('.'):
            if os.path.isdir(x):
                shutil.rmtree(x)
            else:
                os.remove(x)

        ret = self.convert(stream, options, file_ext, log, accelerators)

    return ret
def create_opf(self, output_dir, images, toc):
    '''
    Write ``metadata.opf`` and ``toc.ncx`` into ``output_dir``.

    When ``cover.png`` is among ``images`` it is set as the cover in
    ``self.mi``. The manifest lists ``index.html`` followed by every
    image below ``images``; the spine holds only ``index.html``.

    :returns: path of ``metadata.opf`` inside ``output_dir``.
    '''
    with directory.CurrentDir(output_dir):
        if 'cover.png' in images:
            self.mi.cover = os.path.join('images', 'cover.png')

        opf = OPFCreator(output_dir, self.mi)

        manifest = [('index.html', None)]
        manifest.extend((os.path.join('images', name), None)
                        for name in images)

        opf.create_manifest(manifest)
        opf.create_spine(['index.html'])
        opf.set_toc(toc)

        with open('metadata.opf', 'wb') as opffile, \
                open('toc.ncx', 'wb') as tocfile:
            opf.render(opffile, tocfile, 'toc.ncx')

    return os.path.join(output_dir, 'metadata.opf')
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir

    :param output_dir: directory that receives the output; the pdf is
                       copied there as ``src.pdf`` for the conversion.
    :param pdf_path: path of the pdf file to convert.
    :param no_images: when true, ``-i`` is passed to pdftohtml.
    :param as_xml: when true, ``-xml`` is passed and ``index.xml`` is
                   produced instead of ``index.html``; the html post
                   processing and the outline extraction are skipped.
    :raises ConversionError: when pdftohtml cannot be found or exits with
                             a non zero return code.
    :raises DRMError: when the index file is missing or smaller than
                      100 bytes.
    '''
    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with directory.CurrentDir(output_dir):
        cmd = ['pdftohtml', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
               '-nodrm', os.path.basename(pdfsrc), os.path.basename(index)]
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            ret = subprocess.call(cmd, stderr=logf._fd, stdout=logf._fd)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError('Could not find pdftohtml, check it is '
                                      'in your PATH') from err
            raise
        logf.flush()
        logf.close()

        # The log holds raw tool output: read it as bytes and decode
        # leniently so that undecodable bytes cannot abort the conversion.
        with open(logf.name, 'rb') as fobj:
            out = fobj.read().decode('utf-8', 'replace').strip()

        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: '
                                  '%d\n%s' % (ret, out))
        if out:
            print("pdftohtml log:")
            print(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with open(index, 'r+b') as i:
                raw = i.read().decode('utf-8', 'replace')
                raw = flip_images(raw)
                raw = raw.replace(
                    '<head',
                    "<!-- created by ebook-converter's pdftohtml -->\n <head",
                    1)
                # Rewind and empty the file, it is rewritten below.
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags,
                # this breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw,
                             flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw,
                             flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"',
                             raw, flags=re.I)
                raw = entities.xml_replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')
                i.write(raw.encode('utf-8'))

            # Second run over the first page only, as xml on stdout; the
            # result is handed to parse_outline.
            cmd = ['pdftohtml', '-f', '1', '-l', '1', '-xml', '-i', '-enc',
                   'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q',
                   '-stdout', os.path.basename(pdfsrc)]
            raw = subprocess.check_output(cmd).strip()
            if raw:
                parse_outline(raw, output_dir)

        # Removing the copied source is best effort.
        try:
            os.remove(pdfsrc)
        except Exception:
            pass
def extract_content(self, output_dir):
    '''
    Write the text (including footnotes and sidebars) as ``index.html``
    and the images into ``images/`` below ``output_dir``, then create
    the OPF.

    Returns whatever ``self.create_opf`` returns for the output
    directory, the written image names and the table of contents
    collected while converting the PML.
    '''
    from ebook_converter.ebooks.pml.pmlconverter import footnote_to_html, sidebar_to_html
    from ebook_converter.ebooks.pml.pmlconverter import PML_HTMLizer

    def render_notes(heading, first_section, count, to_html, log_message):
        # The section at ``first_section`` holds the note ids, the
        # sections after it hold the note texts.
        fragments = ['<br /><h1>%s</h1>' % heading]
        encoding = 'cp1252' if self.encoding is None else self.encoding
        note_ids = re.findall(
            '\\w+(?=\x00)',
            self.section_data(first_section).decode(encoding))
        sections = range(first_section + 1, first_section + count)
        for position, section in enumerate(sections):
            self.log.debug(log_message, section)
            # Notes without a matching id get an empty one.
            note_id = note_ids[position] if position < len(note_ids) else ''
            fragments.append(to_html(note_id, self.decompress_text(section)))
        return ''.join(fragments)

    output_dir = os.path.abspath(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    title = self.mi.title
    if not isinstance(title, str):
        title = title.decode('utf-8', 'replace')
    html = '<html><head><title>%s</title></head><body>' % title

    # Collect the PML of every text page, pages are numbered from 1.
    pages = []
    for page_no in range(1, self.header_record.num_text_pages + 1):
        self.log.debug('Extracting text page %i', page_no)
        pages.append(self.get_text_page(page_no))
    pml = ''.join(pages)

    hizer = PML_HTMLizer()
    html += hizer.parse_pml(pml, 'index.html')
    toc = hizer.get_toc()

    if self.header_record.footnote_count > 0:
        html += render_notes('Footnotes',
                             self.header_record.footnote_offset,
                             self.header_record.footnote_count,
                             footnote_to_html,
                             'Extracting footnote page %i')

    if self.header_record.sidebar_count > 0:
        html += render_notes('Sidebar',
                             self.header_record.sidebar_offset,
                             self.header_record.sidebar_count,
                             sidebar_to_html,
                             'Extracting sidebar page %i')

    html += '</body></html>'

    with directory.CurrentDir(output_dir):
        with open('index.html', 'wb') as index:
            self.log.debug('Writing text to index.html')
            index.write(html.encode('utf-8'))

    images_dir = os.path.join(output_dir, 'images/')
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)

    images = []
    with directory.CurrentDir(images_dir):
        for page_index in range(self.header_record.num_image_pages):
            section = self.header_record.image_data_offset + page_index
            name, img = self.get_image(section)
            images.append(name)
            with open(name, 'wb') as imgf:
                self.log.debug('Writing image %s to images/', name)
                imgf.write(img)

    return self.create_opf(output_dir, images, toc)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Render ``oeb_book`` as templated HTML files and zip them.

    An index page and one page per spine item are rendered with Templite
    into a temporary directory, which is then packed into the zip file at
    ``output_path`` and removed. When ``opts.extract_to`` is set the
    archive is additionally extracted there, replacing any existing
    content.

    :param oeb_book: book to write out.
    :param output_path: path of the zip file to create.
    :param input_plugin: not used by this method.
    :param opts: conversion options; ``template_html_index``,
                 ``template_html`` and ``template_css`` may name custom
                 template files, the bundled defaults are used for those
                 that are ``None``.
    :param log: logger, stored on ``self.log``.
    '''
    from lxml import etree
    from ebook_converter.utils import zipfile
    from templite import Templite
    from ebook_converter.ebooks.html.meta import EasyMeta

    def read_template(custom_path, default_resource):
        # Custom and bundled templates are both read as bytes and decoded
        # exactly once, so the result is always text.
        path = custom_path
        if path is None:
            path = pkg_resources.resource_filename('ebook_converter',
                                                   default_resource)
        with open(path, 'rb') as fobj:
            return fobj.read().decode('utf-8')

    # read template files
    template_html_index_data = read_template(
        opts.template_html_index, 'data/html_export_default_index.tmpl')
    template_html_data = read_template(
        opts.template_html, 'data/html_export_default.tmpl')
    template_css_data = read_template(
        opts.template_css, 'data/html_export_default.css')

    self.log = log
    self.opts = opts
    meta = EasyMeta(oeb_book.metadata)

    tempdir = os.path.realpath(PersistentTemporaryDirectory())
    output_file = os.path.join(
        tempdir,
        os.path.basename(re.sub(r'\.zip', '', output_path) + '.html'))
    output_dir = re.sub(r'\.html', '', output_file) + '_files'

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    css_path = output_dir + os.sep + 'calibreHtmlOutBasicCss.css'
    with open(css_path, 'wb') as f:
        f.write(template_css_data.encode('utf-8'))

    with open(output_file, 'wb') as f:
        html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
        templite = Templite(template_html_index_data)
        nextLink = oeb_book.spine[0].href
        nextLink = relpath(output_dir + os.sep + nextLink,
                           os.path.dirname(output_file))
        cssLink = relpath(os.path.abspath(css_path),
                          os.path.dirname(output_file))
        tocUrl = relpath(output_file, os.path.dirname(output_file))
        t = templite.render(has_toc=bool(oeb_book.toc.count()),
                            toc=html_toc, meta=meta, nextLink=nextLink,
                            tocUrl=tocUrl, cssLink=cssLink,
                            firstContentPageLink=nextLink)
        if isinstance(t, str):
            t = t.encode('utf-8')
        f.write(t)

    with directory.CurrentDir(output_dir):
        for item in oeb_book.manifest:
            path = os.path.abspath(polyglot.unquote(item.href))
            item_dir = os.path.dirname(path)
            if not os.path.exists(item_dir):
                os.makedirs(item_dir)
            if item.spine_position is not None:
                # Spine items are rendered in the loop below; only an
                # empty file is created here.
                with open(path, 'wb') as f:
                    pass
            else:
                with open(path, 'wb') as f:
                    f.write(item.bytes_representation)
                item.unload_data_from_memory(memory=path)

        for item in oeb_book.spine:
            path = os.path.abspath(polyglot.unquote(item.href))
            item_dir = os.path.dirname(path)
            root = item.data.getroottree()

            # get & clean HTML <HEAD>-data
            head = root.xpath(
                '//h:head',
                namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
            head_content = etree.tostring(head, pretty_print=True,
                                          encoding='unicode')
            head_content = re.sub(r'\<\/?head.*\>', '', head_content)
            head_content = re.sub(
                re.compile(r'\<style.*\/style\>', re.M | re.S), '',
                head_content)
            head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>',
                                  head_content)

            # get & clean HTML <BODY>-data
            body = root.xpath(
                '//h:body',
                namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
            ebook_content = etree.tostring(body, pretty_print=True,
                                           encoding='unicode')
            ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
            ebook_content = re.sub(r'<(div|a|span)([^>]*)/>',
                                   r'<\1\2></\1>', ebook_content)

            # generate link to next page
            if item.spine_position + 1 < len(oeb_book.spine):
                nextLink = oeb_book.spine[item.spine_position + 1].href
                nextLink = relpath(os.path.abspath(nextLink), item_dir)
            else:
                nextLink = None

            # generate link to previous page
            if item.spine_position > 0:
                prevLink = oeb_book.spine[item.spine_position - 1].href
                prevLink = relpath(os.path.abspath(prevLink), item_dir)
            else:
                prevLink = None

            cssLink = relpath(os.path.abspath(css_path), item_dir)
            tocUrl = relpath(output_file, item_dir)
            firstContentPageLink = oeb_book.spine[0].href

            # render template
            templite = Templite(template_html_data)

            def toc():
                return self.generate_html_toc(oeb_book, path, output_dir)

            t = templite.render(ebookContent=ebook_content,
                                prevLink=prevLink, nextLink=nextLink,
                                has_toc=bool(oeb_book.toc.count()),
                                toc=toc, tocUrl=tocUrl,
                                head_content=head_content, meta=meta,
                                cssLink=cssLink,
                                firstContentPageLink=firstContentPageLink)

            # write html to file
            with open(path, 'wb') as f:
                f.write(t.encode('utf-8'))
            item.unload_data_from_memory(memory=path)

    zfile = zipfile.ZipFile(output_path, "w")
    try:
        zfile.add_dir(output_dir, os.path.basename(output_dir))
        zfile.write(output_file, os.path.basename(output_file),
                    zipfile.ZIP_DEFLATED)

        if opts.extract_to:
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            os.makedirs(opts.extract_to)
            zfile.extractall(opts.extract_to)
            self.log.info('Zip file extracted to %s', opts.extract_to)
    finally:
        # Close the archive even when packing or extracting fails.
        zfile.close()

    # cleanup temp dir
    shutil.rmtree(tempdir)
def extract_content(self, output_dir):
    '''
    Write text records and images to ``output_dir`` and convert the
    result with the html input plugin.

    Every text record becomes ``<uid>.html`` in ``output_dir``, every
    image (single or composite) becomes ``images/<uid>.jpg``. The home
    record is then handed to the html input plugin.

    :returns: the value returned by the html input plugin's ``convert``.
    :raises Exception: when the home.html record cannot be determined.
    '''
    # Each text record is independent (unless the continuation
    # value is set in the previous record). Put each converted
    # text record into a separate file. We will reference the
    # home.html file as the first file and let the HTML input
    # plugin assemble the order based on hyperlinks.
    with directory.CurrentDir(output_dir):
        for uid, num in self.uid_text_secion_number.items():
            self.log.debug('Writing record with uid: %s as %s.html', uid,
                           uid)
            with open('%s.html' % uid, 'wb') as htmlf:
                html = '<html><body>'
                section_header, section_data = self.sections[num]
                if section_header.type == DATATYPE_PHTML:
                    # NOTE(review): unlike the compressed branch below, the
                    # result is not decoded here - confirm what
                    # process_phtml returns for uncompressed data.
                    html += self.process_phtml(
                        section_data.data,
                        section_data.header.paragraph_offsets)
                elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                    d = self.decompress_phtml(section_data.data)
                    html += self.process_phtml(
                        d, section_data.header.paragraph_offsets).decode(
                            self.get_text_uid_encoding(section_header.uid),
                            'replace')
                html += '</body></html>'
                htmlf.write(html.encode('utf-8'))

    # Images.
    # Remember which uids were written, composite images need them.
    images = set()
    images_dir = os.path.join(output_dir, 'images/')
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)
    with directory.CurrentDir(images_dir):
        # Single images.
        for uid, num in self.uid_image_section_number.items():
            section_header, section_data = self.sections[num]
            if section_data:
                idata = None
                if section_header.type == DATATYPE_TBMP:
                    idata = section_data
                elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                    if self.header_record.compression == 1:
                        idata = decompress_doc(section_data)
                    elif self.header_record.compression == 2:
                        idata = zlib.decompress(section_data)
                try:
                    save_cover_data_to(idata, '%s.jpg' % uid,
                                       compression_quality=70)
                    images.add(uid)
                    self.log.debug('Wrote image with uid %s to '
                                   'images/%s.jpg', uid, uid)
                except Exception as e:
                    self.log.error('Failed to write image with uid %s: %s',
                                   uid, e)
            else:
                self.log.error('Failed to write image with uid %s: '
                               'No data.', uid)

        # Composite images.
        # We're going to use the already compressed .jpg images here.
        for uid, num in self.uid_composite_image_section_number.items():
            try:
                section_header, section_data = self.sections[num]
                # Get the final width and height.
                width = 0
                height = 0
                for row in section_data.layout:
                    row_width = 0
                    col_height = 0
                    for col in row:
                        if col not in images:
                            raise Exception('Image with uid: %s missing.' %
                                            col)
                        with open('%s.jpg' % col, 'rb') as part:
                            w, h = identify(part)[1:]
                        row_width += w
                        if col_height < h:
                            col_height = h
                    if width < row_width:
                        width = row_width
                    height += col_height
                # Create a new image the total size of all image
                # parts. Put the parts into the new image.
                with Canvas(width, height) as canvas:
                    y_off = 0
                    for row in section_data.layout:
                        x_off = 0
                        largest_height = 0
                        for col in row:
                            with open('%s.jpg' % col, 'rb') as part:
                                im = image_from_data(part.read())
                            canvas.compose(im, x_off, y_off)
                            w, h = im.width(), im.height()
                            x_off += w
                            if largest_height < h:
                                largest_height = h
                        y_off += largest_height
                # The file has to be opened for binary writing, the default
                # mode is read only.
                with open('%s.jpg' % uid, 'wb') as out:
                    out.write(canvas.export(compression_quality=70))
                self.log.debug('Wrote composite image with uid %s to '
                               'images/%s.jpg', uid, uid)
            except Exception as e:
                self.log.error('Failed to write composite image with '
                               'uid %s: %s', uid, e)

    # Run the HTML through the html processing plugin.
    from ebook_converter.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(self.options, opt.option.name, opt.recommended_value)
    self.options.input_encoding = 'utf-8'
    odi = self.options.debug_pipeline
    self.options.debug_pipeline = None
    try:
        # Determine the home.html record uid. This should be set in the
        # reserved values in the metadata record. home.html is the first
        # text record (should have hyper link references to other records)
        # in the document.
        try:
            home_html = self.header_record.home_html
            if not home_html:
                # Dict views cannot be indexed, take the first key instead.
                home_html = next(iter(self.uid_text_secion_number))
        except Exception as err:
            raise Exception('Could not determine home.html') from err
        # Generate oeb from html conversion.
        oeb = html_input.convert(open('%s.html' % home_html, 'rb'),
                                 self.options, 'html', self.log, {})
    finally:
        # Restore the caller's setting even when the conversion fails.
        self.options.debug_pipeline = odi
    return oeb