Example #1
0
 def __call__(self, oeb, opts):
     """Replace inline ``data:`` image URIs in spine documents.

     Each ``img`` element with a ``src`` attribute is inspected in every
     spine item whose ``data`` offers an ``xpath`` method. When the
     ``src`` is a ``data:image/...`` URI, its payload is decoded, the
     image format is sniffed, and ``src`` is pointed at
     ``item.relhref()`` of whatever ``self.convert_image_data_uri``
     returns for the decoded bytes.

     Images that cannot be decoded, or whose format is not recognised,
     are logged and left untouched.

     :param oeb: the book being processed; ``oeb.log`` and ``oeb.spine``
         are read here.
     :param opts: conversion options; not used by this method.
     """
     from ebook_converter.utils.imghdr import what
     self.log = oeb.log
     attr_path = XPath('//h:img[@src]')
     for item in oeb.spine:
         root = item.data
         if not hasattr(root, 'xpath'):
             # Not a parsed document, nothing to search.
             continue
         for img in attr_path(root):
             raw = img.get('src', '')
             if not raw.startswith('data:'):
                 continue
             # "data:<mediatype>[;base64],<payload>" -> (header, payload)
             header, data = raw.partition(',')[0::2]
             if not header.startswith('data:image/') or not data:
                 continue
             if ';base64' in header:
                 # Whitespace may be embedded in the payload; drop it
                 # before decoding.
                 data = re.sub(r'\s+', '', data)
                 try:
                     data = from_base64_bytes(data)
                 except Exception:
                     self.log.error('Found invalid base64 encoded data '
                                    'URI, ignoring it')
                     continue
             else:
                 # Percent-encoded payloads carry raw octets. Decoding
                 # them to text first would replace every byte sequence
                 # that is not valid UTF-8 and so corrupt binary images;
                 # decode straight to bytes instead.
                 data = urllib.parse.unquote_to_bytes(data)
             data = as_bytes(data)
             fmt = what(None, data)
             if not fmt:
                 self.log.warn('Image encoded as data URL has unknown '
                               'format, ignoring')
                 continue
             img.set(
                 'src',
                 item.relhref(self.convert_image_data_uri(data, fmt, oeb)))
Example #2
0
    def _create_html_root(self, hhcpath, log, encoding):
        """Write an HTML file next to ``hhcpath`` and return it with the TOC.

        The file at ``hhcpath`` is read through ``self._read_file``, decoded
        with ``encoding``, passed through ``xml_to_unicode`` and parsed with
        ``html.fromstring``; ``self._process_nodes`` turns the parsed tree
        into ``toc``.

        The output path is ``hhcpath`` with its extension replaced by
        ``.html``. When ``toc.count()`` is greater than 1 the file receives
        a generated tree of nested ``DIV``/``A`` elements (one per titled
        TOC entry); otherwise it receives the decoded hhc text unchanged.

        :param hhcpath: path of the hhc file to read.
        :param log: logger; only ``log.debug`` is used here.
        :param encoding: encoding used to decode the raw hhc bytes.
        :return: tuple ``(htmlpath, toc)``.
        """

        hhcdata = self._read_file(hhcpath)
        hhcdata = hhcdata.decode(encoding)
        hhcdata = xml_to_unicode(hhcdata, verbose=True,
                                 strip_encoding_pats=True,
                                 resolve_entities=True)[0]
        hhcroot = html.fromstring(hhcdata)
        toc = self._process_nodes(hhcroot)
        log.debug('Found %d section nodes' % toc.count())
        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
        # Directory against which hrefs are probed on disk. Note that this
        # variable is rebound further down, inside the ``with`` block.
        base = os.path.dirname(os.path.abspath(htmlpath))

        # Unquote ``x`` and return text. ``_unquote`` is defined outside
        # this block; presumably it works on bytes, hence the encode/decode
        # round trip -- TODO confirm.
        def unquote(x):
            if isinstance(x, str):
                x = x.encode('utf-8')
            return _unquote(x).decode('utf-8')

        # Return the unquoted form of ``x`` only when the path as given does
        # not exist under ``base`` but the unquoted one does; otherwise
        # return ``x`` unchanged. ``base`` here is the enclosing function's
        # variable, so calls made after it is rebound see the new value.
        def unquote_path(x):
            y = unquote(x)
            if (not os.path.exists(os.path.join(base, x)) and
                    os.path.exists(os.path.join(base, y))):
                x = y
            return x

        # Recursively append one DIV(A(title, href)) to ``parent`` for every
        # child of ``item`` that has a title. The ``base`` parameter shadows
        # the enclosing variable inside this function only.
        def donode(item, parent, base, subpath):
            for child in item:
                title = child.title
                if not title:
                    continue
                raw = unquote_path(child.href or '')
                # First try the file name relative to ``subpath``; fall back
                # to the href as given when only that one exists on disk.
                rsrcname = os.path.basename(raw)
                rsrcpath = os.path.join(subpath, rsrcname)
                if (not os.path.exists(os.path.join(base, rsrcpath)) and
                        os.path.exists(os.path.join(base, raw))):
                    rsrcpath = raw

                # Paths that already contain '%' are left alone, presumably
                # to avoid quoting an already-quoted path.
                if '%' not in rsrcpath:
                    rsrcpath = urlquote(rsrcpath)
                # Entries without an href get an empty link target.
                if not raw:
                    rsrcpath = ''
                c = builder.DIV(builder.A(title, href=rsrcpath))
                donode(child, c, base, subpath)
                parent.append(c)

        with open(htmlpath, 'wb') as f:
            if toc.count() > 1:
                # NOTE(review): unlike donode() there is no ``or ''``
                # fallback here -- confirm the first entry always has an
                # href.
                path0 = toc[0].href
                path0 = unquote_path(path0)
                # The directory of the first entry is used as the sub
                # directory for all generated links.
                subpath = os.path.dirname(path0)
                # Rebinds the variable that unquote_path() closes over.
                base = os.path.dirname(f.name)
                root = builder.DIV()
                donode(toc, root, base, subpath)
                raw = html.tostring(builder.HTML(builder.BODY(root)),
                                    encoding='utf-8',
                                    pretty_print=True)
                f.write(raw)
            else:
                f.write(as_bytes(hhcdata))
        return htmlpath, toc
Example #3
0
    def write(self, name='styles.css'):
        """Write the stylesheet to the file called ``name``.

        The file starts with ``self.CSS``. It is followed by one rule for
        every non-empty entry of ``self.text_styles`` (selector
        ``.ts<index>``) and then of ``self.block_styles`` (selector
        ``.bs<index>``), where ``<index>`` is the position of the entry in
        its sequence.

        :param name: path of the file to create or overwrite.
        """
        def declarations(style):
            # One "property : value;" line per item. Every line ends with
            # ';', so trimming the last character of the joined text drops
            # the final semicolon (and is a no-op for an empty style).
            lines = ['%s : %s;' % (prop, value)
                     for prop, value in style.items()]
            return '\n\t'.join(lines)[:-1]

        with open(name, 'wb') as out:
            out.write(as_bytes(self.CSS))
            groups = (('ts', self.text_styles), ('bs', self.block_styles))
            for prefix, styles in groups:
                for index, style in enumerate(styles):
                    if style:
                        selector = '.%s%d' % (prefix, index)
                        rule = (selector + ' {\n\t' + declarations(style) +
                                '\n}\n\n')
                        out.write(as_bytes(rule))
Example #4
0
def get_table(raw, name):
    '''
    Look up the table called ``name`` among the tables of the font ``raw``.

    Tags are compared case-insensitively. Returns the tuple
    ``(table, table_index, table_offset, table_checksum)`` of the first
    match, or a tuple of four ``None`` values when no table matches.
    '''
    wanted = as_bytes(name.lower())
    for entry in get_tables(raw):
        tag, table, index, offset, checksum = entry
        if tag.lower() != wanted:
            continue
        return table, index, offset, checksum
    return (None,) * 4
Example #5
0
    def __call__(self, stream, odir, log):
        """Convert the ODT document in ``stream`` into files under ``odir``.

        Writes ``index.xhtml``, the pictures extracted from the archive and
        ``metadata.opf``. All relative paths below are used inside
        ``CurrentDir(odir)``, which presumably makes ``odir`` the working
        directory for the duration of the block.

        :param stream: seekable file-like object holding the ODT file.
        :param odir: output directory; created when it does not exist.
        :param log: logger, callable and offering ``exception()``.
        :return: absolute path of the generated ``metadata.opf``.
        """
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks.metadata.odt import get_metadata
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator

        if not os.path.exists(odir):
            os.makedirs(odir)
        with CurrentDir(odir):
            log('Extracting ODT file...')
            stream.seek(0)
            mi = get_metadata(stream, 'odt')
            if not mi.title:
                mi.title = 'Unknown'
            if not mi.authors:
                mi.authors = ['Unknown']
            # Defined elsewhere; expected to fill self.lines, read below.
            self.filter_load(stream, mi, log)

            # NOTE(gryf): Here is a workaround for ODF2XHTML.xhtml() method,
            # which expects, that all lines are strings.
            html = ''.join(str(line) for line in self.lines)

            # A blanket img specification like this causes problems
            # with EPUB output as the containing element often has
            # an absolute height and width set that is larger than
            # the available screen real estate
            html = html.replace('img { width: 100%; height: 100%; }', '')
            # odf2xhtml creates empty title tag
            html = html.replace('<title></title>',
                                '<title>%s</title>' % (mi.title, ))
            try:
                html = self.fix_markup(html, log)
            except Exception:
                # Best effort: keep the unfiltered markup. Only ordinary
                # errors are swallowed, so KeyboardInterrupt and SystemExit
                # still propagate.
                log.exception('Failed to filter CSS, conversion may be slow')
            with open('index.xhtml', 'wb') as f:
                f.write(as_bytes(html))
            zf = ZipFile(stream, 'r')
            self.extract_pictures(zf)
            opf = OPFCreator(os.path.abspath(os.getcwd()), mi)
            # Every file found below the working directory goes into the
            # manifest.
            opf.create_manifest([(os.path.abspath(f2), None)
                                 for f2 in walk(os.getcwd())])
            opf.create_spine([os.path.abspath('index.xhtml')])
            with open('metadata.opf', 'wb') as f:
                opf.render(f)
            return os.path.abspath('metadata.opf')
Example #6
0
    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the RTF document in ``stream`` to XHTML plus an OPF file.

        The RTF is first turned into XML by ``self.generate_xml``, then
        transformed with the bundled ``data/rtf.xsl`` stylesheet.
        ``index.xhtml`` and ``metadata.opf`` are written to the current
        working directory.

        :param stream: seekable file object of the RTF file; ``stream.name``
            is handed to ``self.generate_xml``.
        :param options: conversion options, stored as ``self.opts``.
        :param file_ext: not used by this method.
        :param log: logger, stored as ``self.log``.
        :param accelerators: not used by this method.
        :return: absolute path of the generated ``metadata.opf``.
        :raises ValueError: when the RTF parser reports
            ``RtfInvalidCodeException``.
        """
        from ebook_converter.ebooks.metadata.meta import get_metadata
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.rtf2xml.ParseRtf import \
            RtfInvalidCodeException
        from ebook_converter.ebooks.rtf.input import InlineClass
        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException as e:
            self.log.exception('Unable to parse RTF')
            raise ValueError('This RTF file has a feature calibre does not '
                             'support. Convert it to HTML first and then try '
                             'it.\n%s' % e) from e

        # Maps picture numbers to image names. It must exist even when no
        # picture directory was produced or extraction fails, because the
        # rtf:pict loop below always consults it.
        imap = {}
        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            try:
                imap = self.extract_images(d[0])
            except Exception:
                self.log.exception('Failed to extract images...')

        self.log('Parsing XML...')
        doc = etree.fromstring(xml)
        border_styles = self.convert_borders(doc)
        # Replace picture numbers with the extracted image names, where one
        # is known.
        for pict in doc.xpath(
                '//rtf:pict[@num]',
                namespaces={'rtf': 'http://rtf2xml.sourceforge.net/'}):
            num = int(pict.get('num'))
            name = imap.get(num, None)
            if name is not None:
                pict.set('num', name)

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
        with open(
                pkg_resources.resource_filename('ebook_converter',
                                                'data/rtf.xsl')) as fobj:
            styledoc = etree.fromstring(fobj.read())
        extensions = {('calibre', 'inline-class'): inline_class}
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)
        html = u'index.xhtml'
        with open(html, 'wb') as f:
            res = as_bytes(transform.tostring(result))
            # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
            # clean multiple \n
            res = re.sub(b'\n+', b'\n', res)
            # Replace newlines inserted by the 'empty_paragraphs' option in
            # rtf2xml with html blank lines
            # res = re.sub('\s*<body>', '<body>', res)
            # res = re.sub('(?<=\n)\n{2}',
            # u'<p>\u00a0</p>\n'.encode('utf-8'), res)
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']
        opf = OPFCreator(os.getcwd(), mi)
        opf.create_manifest([(u'index.xhtml', None)])
        opf.create_spine([u'index.xhtml'])
        # Close the file deterministically so the OPF is flushed to disk
        # before its path is returned.
        with open(u'metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.abspath(u'metadata.opf')
    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write the book ``oeb`` as an EPUB file at ``output_path``.

        Steps, in order: optional inline TOC, file name normalisation,
        reader workarounds, image rescaling, splitting, cover handling,
        default TOC creation, UUID identifier handling, serialisation
        through the ``oeb`` output plugin into a temporary directory, and
        packaging into the EPUB container. When ``opts.extract_to`` is set
        the finished EPUB is additionally unpacked there.

        :param oeb: the book to write.
        :param output_path: path of the EPUB file to create.
        :param input_plugin: input plugin; its optional ``encrypted_fonts``
            attribute is read and it is passed on to the ``oeb`` plugin.
        :param opts: conversion options.
        :param log: logger.
        """
        self.log, self.opts, self.oeb = log, opts, oeb

        if self.opts.epub_inline_toc:
            from ebook_converter.ebooks.mobi.writer8.toc import TOCAdder
            opts.mobi_toc_at_start = not opts.epub_toc_at_end
            opts.mobi_passthrough = False
            opts.no_inline_toc = False
            TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)

        if self.opts.epub_flatten:
            from ebook_converter.ebooks.oeb.transforms.filenames import FlatFilenames
            FlatFilenames()(oeb, opts)
        else:
            from ebook_converter.ebooks.oeb.transforms.filenames import UniqueFilenames
            UniqueFilenames()(oeb, opts)

        self.workaround_ade_quirks()
        self.workaround_webkit_quirks()
        self.upshift_markup()
        from ebook_converter.ebooks.oeb.transforms.rescale import RescaleImages
        RescaleImages(check_colorspaces=True)(oeb, opts)

        from ebook_converter.ebooks.oeb.transforms.split import Split
        split = Split(not self.opts.dont_split_on_page_breaks,
                max_flow_size=self.opts.flow_size*1024
                )
        split(self.oeb, self.opts)

        from ebook_converter.ebooks.oeb.transforms.cover import CoverManager
        cm = CoverManager(
                no_default_cover=self.opts.no_default_epub_cover,
                no_svg_cover=self.opts.no_svg_cover,
                preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
        cm(self.oeb, self.opts, self.log)

        self.workaround_sony_quirks()

        if self.oeb.toc.count() == 0:
            self.log.warn('This EPUB file has no Table of Contents. '
                    'Creating a default TOC')
            first = next(iter(self.oeb.spine))
            self.oeb.toc.add('Start', first.href)

        # Find an existing UUID identifier, recognised either by its
        # opf:scheme attribute or by a urn:uuid: prefix in its text.
        identifiers = oeb.metadata['identifier']
        _uuid = None
        for x in identifiers:
            # The scheme attribute may be absent; treat that as an empty
            # scheme so the urn:uuid: check below still gets its turn
            # instead of failing on None.lower().
            scheme = x.get(base.tag('opf', 'scheme'), None) or ''
            if (scheme.lower() == 'uuid' or
                    str(x).startswith('urn:uuid:')):
                _uuid = str(x).split(':')[-1]
                break
        encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

        if _uuid is None:
            self.log.warn('No UUID identifier found')
            _uuid = str(uuid.uuid4())
            oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid)

        if encrypted_fonts and not _uuid.startswith('urn:uuid:'):
            # Apparently ADE requires this value to start with urn:uuid:
            # for some absurd reason, or it will throw a hissy fit and refuse
            # to use the obfuscated fonts.
            for x in identifiers:
                if str(x) == _uuid:
                    x.content = 'urn:uuid:' + _uuid

        with TemporaryDirectory('_epub_output') as tdir:
            from ebook_converter.customize.ui import plugin_for_output_format
            metadata_xml = None
            extra_entries = []
            if self.is_periodical:
                if self.opts.output_profile.epub_periodical_format == 'sony':
                    from ebook_converter.ebooks.epub.periodical import sony_metadata
                    metadata_xml, atom_xml = sony_metadata(oeb)
                    extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
            # Serialise the book into tdir, then post-process the generated
            # OPF and NCX files found there.
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
                    if x.endswith('.ncx')][0])
            if self.opts.epub_version == '3':
                self.upgrade_to_epub3(tdir, opf)
            encryption = None
            if encrypted_fonts:
                encryption = self.encrypt_fonts(encrypted_fonts, tdir, _uuid)

            from ebook_converter.ebooks.epub import initialize_container
            with initialize_container(output_path, os.path.basename(opf),
                    extra_entries=extra_entries) as epub:
                epub.add_dir(tdir)
                if encryption is not None:
                    epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
                if metadata_xml is not None:
                    epub.writestr('META-INF/metadata.xml',
                            metadata_xml.encode('utf-8'))
            if opts.extract_to is not None:
                from ebook_converter.utils.zipfile import ZipFile
                # Start from an empty directory: whatever currently sits at
                # extract_to (file or directory) is removed first.
                if os.path.exists(opts.extract_to):
                    if os.path.isdir(opts.extract_to):
                        shutil.rmtree(opts.extract_to)
                    else:
                        os.remove(opts.extract_to)
                os.mkdir(opts.extract_to)
                with ZipFile(output_path) as zf:
                    zf.extractall(path=opts.extract_to)
                self.log.info('EPUB extracted to', opts.extract_to)
Example #8
0
 def get_css(self, oeb_book):
     """Return the CSS of all stylesheets in ``oeb_book`` as one bytes value.

     Every manifest item whose ``media_type`` is ``'text/css'``
     contributes its ``data.cssText`` followed by two newlines, in
     manifest order. The result is ``b''`` when there is no such item.
     """
     chunks = (
         as_bytes(entry.data.cssText) + b'\n\n'
         for entry in oeb_book.manifest
         if entry.media_type == 'text/css'
     )
     return b''.join(chunks)
Example #9
0
def encode_string(raw):
    """Return ``raw`` as bytes, preceded by one byte holding its length.

    ``raw`` is converted with ``as_bytes``; the length counted is that of
    the converted value. A length that does not fit into a single byte
    (more than 255) makes the ``bytearray`` construction raise
    ``ValueError``.
    """
    payload = bytearray(as_bytes(raw))
    length_prefix = bytearray((len(payload),))
    return bytes(length_prefix + payload)