def __call__(self, oeb, opts):
    """Replace ``data:`` image URIs in spine documents with file references.

    Every ``<img src="data:image/...">`` found in a spine item is decoded,
    its image format is sniffed, and the ``src`` attribute is rewritten to
    the relative href of whatever ``self.convert_image_data_uri`` returns
    for the decoded bytes.

    :param oeb: The book being processed; ``oeb.spine`` is iterated and
        ``oeb.log`` is stored on ``self.log``.
    :param opts: Conversion options (not used here).
    """
    from ebook_converter.utils.imghdr import what
    self.log = oeb.log
    attr_path = XPath('//h:img[@src]')
    for item in oeb.spine:
        root = item.data
        # Spine items whose data is not a parsed tree cannot be queried.
        if not hasattr(root, 'xpath'):
            continue
        for img in attr_path(root):
            raw = img.get('src', '')
            if not raw.startswith('data:'):
                continue
            # "data:<mediatype>[;base64],<payload>" -> (header, payload)
            header, data = raw.partition(',')[0::2]
            if not header.startswith('data:image/') or not data:
                continue
            if ';base64' in header:
                # Whitespace may be embedded in the payload; strip it
                # before decoding.
                data = re.sub(r'\s+', '', data)
                try:
                    data = from_base64_bytes(data)
                except Exception:
                    self.log.error('Found invalid base64 encoded data '
                                   'URI, ignoring it')
                    continue
            else:
                # Percent-encoded payloads can carry arbitrary binary
                # bytes. Decoding them to text first (as unquote() does)
                # replaces every byte sequence that is not valid UTF-8
                # with U+FFFD and corrupts the image, so decode straight
                # to bytes instead.
                data = urllib.parse.unquote_to_bytes(data)
            data = as_bytes(data)
            fmt = what(None, data)
            if not fmt:
                self.log.warn('Image encoded as data URL has unknown '
                              'format, ignoring')
                continue
            img.set(
                'src',
                item.relhref(self.convert_image_data_uri(data, fmt, oeb)))
def _create_html_root(self, hhcpath, log, encoding):
    """Write an HTML file next to ``hhcpath`` built from its contents.

    The .hhc data is read, decoded with ``encoding`` and parsed; the
    section nodes found in it become a tree of nested ``DIV``/``A``
    elements. When there is at most one section node the decoded .hhc
    text itself is written instead.

    :param hhcpath: Path of the .hhc file; the output uses the same path
        with the extension replaced by ``.html``.
    :param log: Logger used for a debug message with the node count.
    :param encoding: Encoding used to decode the raw .hhc bytes.
    :return: Tuple ``(htmlpath, toc)``.
    """
    hhcdata = self._read_file(hhcpath).decode(encoding)
    hhcdata = xml_to_unicode(hhcdata, verbose=True,
                             strip_encoding_pats=True,
                             resolve_entities=True)[0]
    toc = self._process_nodes(html.fromstring(hhcdata))
    log.debug('Found %d section nodes' % toc.count())
    htmlpath = os.path.splitext(hhcpath)[0] + ".html"
    # NOTE: ``base`` is read by unquote_path() through its closure and is
    # rebound further down, so later calls see the new value.
    base = os.path.dirname(os.path.abspath(htmlpath))

    def unquote(x):
        if isinstance(x, str):
            x = x.encode('utf-8')
        return _unquote(x).decode('utf-8')

    def unquote_path(x):
        # Prefer the name as given; fall back to the unquoted form only
        # when that is the one which exists on disk.
        unquoted = unquote(x)
        if os.path.exists(os.path.join(base, x)):
            return x
        if os.path.exists(os.path.join(base, unquoted)):
            return unquoted
        return x

    def donode(item, parent, root_dir, subpath):
        for child in item:
            title = child.title
            if not title:
                continue
            raw = unquote_path(child.href or '')
            rsrcpath = os.path.join(subpath, os.path.basename(raw))
            in_subpath = os.path.exists(os.path.join(root_dir, rsrcpath))
            if not in_subpath and os.path.exists(
                    os.path.join(root_dir, raw)):
                rsrcpath = raw
            if '%' not in rsrcpath:
                rsrcpath = urlquote(rsrcpath)
            if not raw:
                # Nodes without a target get an empty href.
                rsrcpath = ''
            node = builder.DIV(builder.A(title, href=rsrcpath))
            donode(child, node, root_dir, subpath)
            parent.append(node)

    with open(htmlpath, 'wb') as f:
        if toc.count() > 1:
            first_href = unquote_path(toc[0].href)
            subpath = os.path.dirname(first_href)
            base = os.path.dirname(f.name)
            container = builder.DIV()
            donode(toc, container, base, subpath)
            document = builder.HTML(builder.BODY(container))
            f.write(html.tostring(document, encoding='utf-8',
                                  pretty_print=True))
        else:
            f.write(as_bytes(hhcdata))
    return htmlpath, toc
def write(self, name='styles.css'):
    """Write the stylesheet to the file ``name``.

    ``self.CSS`` is written first, followed by one rule per non-empty
    entry of ``self.text_styles`` (selector prefix ``ts``) and
    ``self.block_styles`` (selector prefix ``bs``). The selector is the
    prefix followed by the entry's index.

    :param name: Path of the CSS file to create.
    """
    def join(style):
        # One "prop : value;" declaration per line. Dropping the final
        # character removes the semicolon after the last declaration.
        declarations = '\n\t'.join(
            '%s : %s;' % (prop, value) for prop, value in style.items())
        return declarations[:-1]

    style_groups = [(self.text_styles, 'ts'), (self.block_styles, 'bs')]
    with open(name, 'wb') as f:
        f.write(as_bytes(self.CSS))
        for styles, prefix in style_groups:
            for index, style in enumerate(styles):
                if style:
                    selector = '.%s%d' % (prefix, index)
                    rule = ''.join(
                        (selector, ' {\n\t', join(style), '\n}\n\n'))
                    f.write(as_bytes(rule))
def get_table(raw, name):
    '''
    Look up a table in the font data ``raw`` by its tag.

    The tag comparison is case-insensitive. Returns the tuple
    ``(table, table_index, table_offset, table_checksum)`` for the first
    match, or four ``None`` values when no table has that tag.
    '''
    wanted = as_bytes(name.lower())
    for entry in get_tables(raw):
        tag, table, index, offset, checksum = entry
        if tag.lower() == wanted:
            return table, index, offset, checksum
    return (None,) * 4
def __call__(self, stream, odir, log):
    """Extract the ODT document in ``stream`` into ``odir``.

    Writes ``index.xhtml`` and ``metadata.opf`` into ``odir``, creating
    the directory if needed, and hands the archive to
    ``self.extract_pictures``.

    :param stream: Seekable file object containing the ODT file.
    :param odir: Output directory; the work is done with it as the
        current directory.
    :param log: Logger; called directly and used for ``log.exception``.
    :return: Absolute path of the generated ``metadata.opf``.
    """
    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.ebooks.metadata.odt import get_metadata
    from ebook_converter.ebooks.metadata.opf2 import OPFCreator

    if not os.path.exists(odir):
        os.makedirs(odir)
    with CurrentDir(odir):
        log('Extracting ODT file...')
        stream.seek(0)
        mi = get_metadata(stream, 'odt')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']
        self.filter_load(stream, mi, log)

        # NOTE(gryf): Here is a workaround for ODF2XHTML.xhtml() method,
        # which expects, that all lines are strings.
        html = ''.join([str(l) for l in self.lines])
        # A blanket img specification like this causes problems
        # with EPUB output as the containing element often has
        # an absolute height and width set that is larger than
        # the available screen real estate
        html = html.replace('img { width: 100%; height: 100%; }', '')
        # odf2xhtml creates empty title tag
        html = html.replace('<title></title>',
                            '<title>%s</title>' % (mi.title, ))
        try:
            html = self.fix_markup(html, log)
        except Exception:
            # Best effort: keep the unfiltered markup. Catching Exception
            # rather than using a bare except lets KeyboardInterrupt and
            # SystemExit propagate.
            log.exception('Failed to filter CSS, conversion may be slow')
        with open('index.xhtml', 'wb') as f:
            f.write(as_bytes(html))
        # Close the archive object once the pictures are extracted
        # instead of leaving it open until garbage collection.
        with ZipFile(stream, 'r') as zf:
            self.extract_pictures(zf)
        opf = OPFCreator(os.path.abspath(os.getcwd()), mi)
        opf.create_manifest([(os.path.abspath(f2), None)
                             for f2 in walk(os.getcwd())])
        opf.create_spine([os.path.abspath('index.xhtml')])
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.abspath('metadata.opf')
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert the RTF file behind ``stream`` to XHTML plus an OPF.

    The RTF is turned into XML by ``self.generate_xml``, transformed to
    HTML with the packaged ``data/rtf.xsl`` stylesheet and written to
    ``index.xhtml`` in the current directory, alongside ``metadata.opf``.

    :param stream: Open file object of the RTF file; ``stream.name`` is
        passed to ``self.generate_xml`` and the stream is re-read for
        metadata.
    :param options: Conversion options, stored on ``self.opts``.
    :param file_ext: Not used here.
    :param log: Logger, stored on ``self.log``.
    :param accelerators: Not used here.
    :return: Absolute path of the generated ``metadata.opf``.
    :raises ValueError: If the RTF parser raises
        ``RtfInvalidCodeException``.
    """
    from ebook_converter.ebooks.metadata.meta import get_metadata
    from ebook_converter.ebooks.metadata.opf2 import OPFCreator
    from ebook_converter.ebooks.rtf2xml.ParseRtf import \
        RtfInvalidCodeException
    from ebook_converter.ebooks.rtf.input import InlineClass

    self.opts = options
    self.log = log
    self.log('Converting RTF to XML...')
    try:
        xml = self.generate_xml(stream.name)
    except RtfInvalidCodeException as e:
        self.log.exception('Unable to parse RTF')
        raise ValueError('This RTF file has a feature calibre does not '
                         'support. Convert it to HTML first and then try '
                         'it.\n%s' % e) from e

    # Bind the image map unconditionally: it is consulted below for every
    # <rtf:pict num="..."> element, including when no picture directory
    # was produced or image extraction failed.
    imap = {}
    d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
    if d:
        try:
            imap = self.extract_images(d[0])
        except Exception:
            self.log.exception('Failed to extract images...')

    self.log('Parsing XML...')
    doc = etree.fromstring(xml)
    border_styles = self.convert_borders(doc)
    for pict in doc.xpath(
            '//rtf:pict[@num]',
            namespaces={'rtf': 'http://rtf2xml.sourceforge.net/'}):
        num = int(pict.get('num'))
        name = imap.get(num, None)
        if name is not None:
            # Replace the picture number with the extracted image name.
            pict.set('num', name)

    self.log('Converting XML to HTML...')
    inline_class = InlineClass(self.log)
    with open(
            pkg_resources.resource_filename('ebook_converter',
                                            'data/rtf.xsl')) as fobj:
        styledoc = etree.fromstring(fobj.read())
    extensions = {('calibre', 'inline-class'): inline_class}
    transform = etree.XSLT(styledoc, extensions=extensions)
    result = transform(doc)
    html = u'index.xhtml'
    with open(html, 'wb') as f:
        res = as_bytes(transform.tostring(result))
        # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
        # clean multiple \n
        res = re.sub(b'\n+', b'\n', res)
        # Replace newlines inserted by the 'empty_paragraphs' option in
        # rtf2xml with html blank lines
        # res = re.sub('\s*<body>', '<body>', res)
        # res = re.sub('(?<=\n)\n{2}',
        # u'<p>\u00a0</p>\n'.encode('utf-8'), res)
        f.write(res)
    self.write_inline_css(inline_class, border_styles)
    stream.seek(0)
    mi = get_metadata(stream, 'rtf')
    if not mi.title:
        mi.title = 'Unknown'
    if not mi.authors:
        mi.authors = ['Unknown']
    opf = OPFCreator(os.getcwd(), mi)
    opf.create_manifest([(u'index.xhtml', None)])
    opf.create_spine([u'index.xhtml'])
    # Use a context manager so the OPF file is flushed and closed before
    # its path is returned.
    with open(u'metadata.opf', 'wb') as f:
        opf.render(f)
    return os.path.abspath(u'metadata.opf')
def convert(self, oeb, output_path, input_plugin, opts, log):
    """Write ``oeb`` as an EPUB container at ``output_path``.

    Runs the EPUB-specific transforms and workarounds on the book, makes
    sure it has a table of contents and a UUID identifier, delegates to
    the ``oeb`` output plugin in a temporary directory and packs that
    directory into the container. When ``opts.extract_to`` is set the
    finished EPUB is also unpacked there, replacing whatever existed.

    :param oeb: The book to write.
    :param output_path: Destination path of the EPUB file.
    :param input_plugin: Input plugin; its optional ``encrypted_fonts``
        attribute is read.
    :param opts: Conversion options (some ``mobi_*`` options are set on
        it when ``epub_inline_toc`` is enabled).
    :param log: Logger.
    """
    self.log, self.opts, self.oeb = log, opts, oeb

    if self.opts.epub_inline_toc:
        from ebook_converter.ebooks.mobi.writer8.toc import TOCAdder
        opts.mobi_toc_at_start = not opts.epub_toc_at_end
        opts.mobi_passthrough = False
        opts.no_inline_toc = False
        TOCAdder(oeb, opts, replace_previous_inline_toc=True,
                 ignore_existing_toc=True)

    if self.opts.epub_flatten:
        from ebook_converter.ebooks.oeb.transforms.filenames import FlatFilenames
        FlatFilenames()(oeb, opts)
    else:
        from ebook_converter.ebooks.oeb.transforms.filenames import UniqueFilenames
        UniqueFilenames()(oeb, opts)

    self.workaround_ade_quirks()
    self.workaround_webkit_quirks()
    self.upshift_markup()
    from ebook_converter.ebooks.oeb.transforms.rescale import RescaleImages
    RescaleImages(check_colorspaces=True)(oeb, opts)

    from ebook_converter.ebooks.oeb.transforms.split import Split
    split = Split(not self.opts.dont_split_on_page_breaks,
                  max_flow_size=self.opts.flow_size*1024)
    split(self.oeb, self.opts)

    from ebook_converter.ebooks.oeb.transforms.cover import CoverManager
    cm = CoverManager(
        no_default_cover=self.opts.no_default_epub_cover,
        no_svg_cover=self.opts.no_svg_cover,
        preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
    cm(self.oeb, self.opts, self.log)

    self.workaround_sony_quirks()

    if self.oeb.toc.count() == 0:
        self.log.warn('This EPUB file has no Table of Contents. '
                      'Creating a default TOC')
        first = next(iter(self.oeb.spine))
        self.oeb.toc.add('Start', first.href)

    identifiers = oeb.metadata['identifier']
    _uuid = None
    for x in identifiers:
        # An identifier may have no scheme attribute at all; treat that
        # as an empty scheme instead of calling .lower() on None, so the
        # "urn:uuid:" prefix check still gets a chance to match.
        scheme = x.get(base.tag('opf', 'scheme'), None) or ''
        if scheme.lower() == 'uuid' or str(x).startswith('urn:uuid:'):
            _uuid = str(x).split(':')[-1]
            break
    encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

    if _uuid is None:
        self.log.warn('No UUID identifier found')
        _uuid = str(uuid.uuid4())
        oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid)

    if encrypted_fonts and not _uuid.startswith('urn:uuid:'):
        # Apparently ADE requires this value to start with urn:uuid:
        # for some absurd reason, or it will throw a hissy fit and refuse
        # to use the obfuscated fonts.
        for x in identifiers:
            if str(x) == _uuid:
                x.content = 'urn:uuid:' + _uuid

    with TemporaryDirectory('_epub_output') as tdir:
        from ebook_converter.customize.ui import plugin_for_output_format
        metadata_xml = None
        extra_entries = []
        if self.is_periodical:
            if self.opts.output_profile.epub_periodical_format == 'sony':
                from ebook_converter.ebooks.epub.periodical import sony_metadata
                metadata_xml, atom_xml = sony_metadata(oeb)
                extra_entries = [('atom.xml', 'application/atom+xml',
                                  atom_xml)]
        oeb_output = plugin_for_output_format('oeb')
        oeb_output.convert(oeb, tdir, input_plugin, opts, log)
        # The first .opf / .ncx file found in the output directory is
        # the one that gets used.
        opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
        self.condense_ncx([os.path.join(tdir, x)
                           for x in os.listdir(tdir)
                           if x.endswith('.ncx')][0])
        if self.opts.epub_version == '3':
            self.upgrade_to_epub3(tdir, opf)
        encryption = None
        if encrypted_fonts:
            encryption = self.encrypt_fonts(encrypted_fonts, tdir, _uuid)

        from ebook_converter.ebooks.epub import initialize_container
        with initialize_container(output_path, os.path.basename(opf),
                                  extra_entries=extra_entries) as epub:
            epub.add_dir(tdir)
            if encryption is not None:
                epub.writestr('META-INF/encryption.xml',
                              as_bytes(encryption))
            if metadata_xml is not None:
                epub.writestr('META-INF/metadata.xml',
                              metadata_xml.encode('utf-8'))
        if opts.extract_to is not None:
            from ebook_converter.utils.zipfile import ZipFile
            # Start from a clean target: remove any existing directory
            # or file at that path before extracting.
            if os.path.exists(opts.extract_to):
                if os.path.isdir(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                else:
                    os.remove(opts.extract_to)
            os.mkdir(opts.extract_to)
            with ZipFile(output_path) as zf:
                zf.extractall(path=opts.extract_to)
            self.log.info('EPUB extracted to', opts.extract_to)
def get_css(self, oeb_book):
    """Return the concatenated CSS of all stylesheets in the manifest.

    Each manifest item whose media type is ``text/css`` contributes its
    ``data.cssText`` as bytes, followed by a blank line.

    :param oeb_book: Book whose ``manifest`` is scanned.
    :return: The combined stylesheet bytes (empty when there are none).
    """
    return b''.join(
        as_bytes(sheet.data.cssText) + b'\n\n'
        for sheet in oeb_book.manifest
        if sheet.media_type == 'text/css'
    )
def encode_string(raw):
    """Return ``raw`` as bytes prefixed with a single length byte.

    The prefix holds the byte length of the encoded payload, so a payload
    longer than 255 bytes raises ``ValueError``.
    """
    payload = bytearray(as_bytes(raw))
    encoded = bytearray([len(payload)])
    encoded.extend(payload)
    return bytes(encoded)