Example #1
0
 def _metadata_from_opf(self, opf):
     """Fill ``self.oeb.metadata`` from an OPF document.

     The OPF tree is serialised, re-read through the ``OPF`` metadata class
     and converted into a book-metadata object that is then copied into the
     OEB metadata. Defaults are supplied for a missing language, book
     producer, title and author, and a fresh UUID identifier is always
     added and recorded as ``self.oeb.uid``.

     :param opf: OPF tree to read; must be serialisable by
         ``etree.tostring`` (presumably an lxml element -- confirm against
         the caller).
     """
     from ebook_converter.ebooks.metadata.opf2 import OPF
     from ebook_converter.ebooks.oeb.transforms.metadata import \
         meta_info_to_oeb_metadata

     serialized = etree.tostring(opf, xml_declaration=True, encoding='utf-8')
     parsed = OPF(io.BytesIO(serialized))

     writing_mode = parsed.primary_writing_mode
     if writing_mode:
         self.oeb.metadata.primary_writing_mode = writing_mode

     mi = parsed.to_book_metadata()
     if not mi.language:
         # Fall back to the configured language, using '-' as separator.
         mi.language = get_lang().replace('_', '-')
     self.oeb.metadata.add('language', mi.language)
     if not mi.book_producer:
         producer_fields = {'a': __appname__, 'v': __version__}
         mi.book_producer = (
             '%(a)s (%(v)s) [http://%(a)s-ebook.com]' % producer_fields)
     meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger)

     metadata = self.oeb.metadata
     metadata.add('identifier', str(uuid.uuid4()), id='uuid_id',
                  scheme='uuid')
     # The identifier just appended becomes the book's unique id.
     self.oeb.uid = self.oeb.metadata.identifier[-1]
     if not metadata.title:
         metadata.add('title', self.oeb.translate('Unknown'))

     # A creator with no role, or with the 'aut' role, counts as an author.
     has_author = any(
         getattr(creator, 'role', '').lower() in ('', 'aut')
         for creator in metadata.creator)
     if not has_author:
         metadata.add('creator', self.oeb.translate('Unknown'), role='aut')
Example #2
0
 def read_embedded_metadata(self, root, elem, guide):
     """Read metadata embedded in the document and detach the cover image.

     ``elem`` is serialised, wrapped in a ``<package>`` element and parsed
     with ``OPF``; the resulting metadata is stored in ``self.embedded_mi``.
     If ``guide`` is given, its first ``reference`` whose type contains
     ``cover`` is resolved to an element of ``root`` by id. The first
     ``img`` element at or after that anchor (in document order) supplies
     ``self.embedded_mi.cover`` through its ``src`` attribute and is then
     removed from the tree.

     :param root: Root element of the document (lxml element; ``xpath``,
         ``iter`` and ``getparent`` are used on it and its descendants).
     :param elem: Element holding the embedded metadata.
     :param guide: Element containing ``reference`` descendants, or
         ``None`` to skip cover detection.
     """
     raw = (b'<?xml version="1.0" encoding="utf-8" ?>\n<package>' +
            html.tostring(elem, encoding='utf-8') + b'</package>')
     opf = OPF(io.BytesIO(raw))
     self.embedded_mi = opf.to_book_metadata()
     if guide is None:
         return
     for ref in guide.xpath('descendant::reference'):
         if 'cover' not in ref.get('type', '').lower():
             continue
         href = ref.get('href', '')
         if href.startswith('#'):
             href = href[1:]
         # Pass the id as an XPath variable instead of interpolating it
         # into the expression: an href containing a double quote would
         # otherwise yield an invalid XPath and raise.
         anchors = root.xpath('//*[@id=$anchor_id]', anchor_id=href)
         if anchors:
             cover_anchor = anchors[0]
             reached = False
             for node in root.iter():
                 if node is cover_anchor:
                     reached = True
                 if reached and node.tag == 'img':
                     self.embedded_mi.cover = node.get('src', None)
                     node.getparent().remove(node)
                     break
         # Only the first cover reference is considered.
         break
Example #3
0
def opf_metadata(opfpath):
    """Read book metadata (and cover data, if readable) from an OPF file.

    :param opfpath: Either a filesystem path to the OPF file or a readable
        binary stream. For a stream, its ``name`` attribute (falling back
        to the current working directory) is used to locate the directory
        against which the cover path is resolved.
    :return: The metadata object produced by ``OPF.to_book_metadata()``,
        with ``cover_data`` set to ``(extension, bytes)`` when the cover
        file is readable. ``None`` when the OPF has no application id or
        when any error occurs while reading; the error's traceback is
        printed rather than raised.
    :raises OSError: If ``opfpath`` is a path that cannot be opened.
    """
    owns_stream = not hasattr(opfpath, 'read')
    if owns_stream:
        stream = open(opfpath, 'rb')
    else:
        stream = opfpath
        opfpath = getattr(stream, 'name', os.getcwd())
    try:
        opf = OPF(stream, os.path.dirname(opfpath))
        if opf.application_id is None:
            return None
        mi = opf.to_book_metadata()
        cover = getattr(opf, 'cover', None)
        if cover:
            cpath = os.path.join(os.path.dirname(opfpath), cover)
            if os.access(cpath, os.R_OK):
                fmt = cpath.rpartition('.')[-1]
                with open(cpath, 'rb') as cover_file:
                    data = cover_file.read()
                mi.cover_data = (fmt, data)
        return mi
    except Exception:
        # Best effort: report the problem and signal "no metadata".
        import traceback
        traceback.print_exc()
        return None
    finally:
        # Close only what this function opened; a caller-supplied stream
        # remains the caller's responsibility.
        if owns_stream:
            stream.close()
Example #4
0
    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert an HTMLZ archive into an OEB book.

        The archive is extracted into the current working directory, a
        single top level HTML file is selected and decoded, and the result
        is passed through the ``html`` input plugin. Metadata read from the
        original stream and the cover referenced by a top level OPF file
        (if any) are then attached to the resulting book.

        :param stream: HTMLZ archive, passed to ``ZipFile`` and to
            ``get_file_type_metadata``.
        :param options: Conversion options. ``input_encoding`` is read and
            then forced to ``'utf-8'``; the html plugin's recommended
            option values are applied to it; ``debug_pipeline`` is
            temporarily cleared while the html plugin runs.
        :param file_ext: File extension used for the metadata lookup.
        :param log: Logger used for warnings and metadata transfer.
        :param accelerators: Not used by this method.
        :return: The OEB book produced by the html input plugin.
        :raises Exception: If no top level HTML file exists or the chosen
            file is empty.
        """
        from ebook_converter.ebooks.chardet import xml_to_unicode
        from ebook_converter.ebooks.metadata.opf2 import OPF
        from ebook_converter.utils.zipfile import ZipFile

        self.log = log

        # Extract content from zip archive.
        zf = ZipFile(stream)
        zf.extractall()

        # Find the HTML file in the archive. It needs to be
        # top level.
        index = u''
        multiple_html = False
        # Get a list of all top level files in the archive.
        top_levels = [x for x in os.listdir(u'.') if os.path.isfile(x)]
        # Try to find an index file.
        for x in top_levels:
            if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
                index = x
                break
        # Look for multiple HTML files in the archive. We look at the
        # top level files only as only they matter in HTMLZ.
        for x in top_levels:
            if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
                # Set index to the first HTML file found if it's not
                # called index.
                if not index:
                    index = x
                elif x != index:
                    # Do not count the already chosen index file itself,
                    # otherwise a lone index.html triggers the warning.
                    multiple_html = True
        # Warn the user if there are multiple HTML files in the archive.
        # HTMLZ supports a single HTML file. A conversion of an HTMLZ
        # archive with multiple HTML files probably won't turn out as the
        # user expects. With multiple HTML files ZIP input should be used
        # in place of HTMLZ.
        if multiple_html:
            log.warn('Multiple HTML files found in the archive. Only %s will '
                     'be used.' % index)

        if not index:
            raise Exception('No top level HTML file found.')
        with open(index, 'rb') as tf:
            raw = tf.read()
        if not raw:
            raise Exception('Top level HTML file %s is empty' % index)

        # Encoding
        if options.input_encoding:
            ienc = options.input_encoding
        else:
            ienc = xml_to_unicode(raw[:4096])[-1]
        html = raw.decode(ienc, 'replace')

        # Run the HTML through the html processing plugin.
        from ebook_converter.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwd()
        # Pick a name for the re-encoded copy that does not clash with a
        # file already extracted from the archive.
        htmlfile = os.path.join(base, u'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = u'index%d.html' % c
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        try:
            # Generate oeb from html conversion.
            with open(htmlfile, 'rb') as f:
                oeb = html_input.convert(f, options, 'html', log, {})
        finally:
            # Restore the caller's setting and drop the temporary copy even
            # if the html plugin fails.
            options.debug_pipeline = odi
            os.remove(htmlfile)

        # Set metadata from file.
        from ebook_converter.customize.ui import get_file_type_metadata
        from ebook_converter.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        # Get the cover path from the OPF.
        cover_path = None
        opf = None
        for x in top_levels:
            if os.path.splitext(x)[1].lower() == u'.opf':
                opf = x
                break
        if opf:
            opf = OPF(opf, basedir=os.getcwd())
            cover_path = opf.raster_cover or opf.cover
        # Set the cover.
        if cover_path:
            with open(os.path.join(os.getcwd(), cover_path), 'rb') as cf:
                cdata = cf.read()
            cover_name = os.path.basename(cover_path)
            item_id, href = oeb.manifest.generate('cover', cover_name)
            oeb.manifest.add(item_id,
                             href,
                             mimetypes.guess_type(cover_name)[0],
                             data=cdata)
            oeb.guide.add('cover', 'Cover', href)

        return oeb
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write an OEB book as an HTMLZ archive.

        The book is rendered to a single HTML file in a temporary
        directory together with an optional external stylesheet, the
        referenced images, a ``cover.jpg`` (when cover data is available)
        and a ``metadata.opf``. The directory is then added to a new zip
        archive at ``output_path``.

        :param oeb_book: Book to serialise.
        :param output_path: Path of the archive to create.
        :param input_plugin: Not used by this method.
        :param opts: Options; ``htmlz_css_type``, ``htmlz_class_style`` and
            ``htmlz_title_filename`` are read here.
        :param log: Logger handed to the HTML serialiser.
        """
        from lxml import etree
        from ebook_converter.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
        from ebook_converter.ebooks.metadata.opf2 import OPF, metadata_to_opf
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.utils.filenames import ascii_filename

        # HTML
        if opts.htmlz_css_type == 'inline':
            from ebook_converter.ebooks.htmlz.oeb2html import \
                OEB2HTMLInlineCSSizer
            OEB2HTMLizer = OEB2HTMLInlineCSSizer
        elif opts.htmlz_css_type == 'tag':
            from ebook_converter.ebooks.htmlz.oeb2html import \
                OEB2HTMLNoCSSizer
            OEB2HTMLizer = OEB2HTMLNoCSSizer
        else:
            from ebook_converter.ebooks.htmlz.oeb2html import \
                OEB2HTMLClassCSSizer as OEB2HTMLizer

        with TemporaryDirectory(u'_htmlz_output') as tdir:
            htmlizer = OEB2HTMLizer(log)
            html = htmlizer.oeb2html(oeb_book, opts)

            fname = u'index'
            if opts.htmlz_title_filename:
                from ebook_converter.utils.filenames import \
                    shorten_components_to
                title = ascii_filename(str(oeb_book.metadata.title[0]))
                fname = shorten_components_to(100, (title,))[0]
            with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
                if isinstance(html, str):
                    html = html.encode('utf-8')
                tf.write(html)

            # CSS
            if (opts.htmlz_css_type == 'class' and
                    opts.htmlz_class_style == 'external'):
                css = htmlizer.get_css(oeb_book)
                # The file is opened in binary mode; encode text the same
                # way the HTML above is encoded.
                if isinstance(css, str):
                    css = css.encode('utf-8')
                with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                    tf.write(css)

            # Images
            images = htmlizer.images
            if images:
                images_dir = os.path.join(tdir, u'images')
                if not os.path.exists(images_dir):
                    os.makedirs(images_dir)
                for item in oeb_book.manifest:
                    if item.media_type in OEB_IMAGES and item.href in images:
                        if item.media_type == SVG_MIME:
                            # tostring(..., encoding='unicode') returns
                            # str, which cannot be written to a file
                            # opened in 'wb'; encode it first.
                            data = etree.tostring(
                                item.data, encoding='unicode').encode('utf-8')
                        else:
                            data = item.data
                        img_path = os.path.join(images_dir,
                                                images[item.href])
                        with open(img_path, 'wb') as img:
                            img.write(data)

            # Cover
            cover_path = None
            try:
                cover_data = None
                if oeb_book.metadata.cover:
                    term = oeb_book.metadata.cover[0].term
                    cover_data = oeb_book.guide[term].item.data
                if cover_data:
                    from ebook_converter.utils.img import save_cover_data_to
                    cover_path = os.path.join(tdir, u'cover.jpg')
                    # Create the (empty) file before handing the path to
                    # save_cover_data_to.
                    with open(cover_path, 'w') as cf:
                        cf.write('')
                    save_cover_data_to(cover_data, cover_path)
            except Exception:
                # A missing or broken cover must not abort the conversion,
                # but KeyboardInterrupt/SystemExit are allowed through.
                import traceback
                traceback.print_exc()

            # Metadata
            with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
                opf = OPF(io.BytesIO(etree.tostring(
                    oeb_book.metadata.to_opf1(), encoding='UTF-8')))
                mi = opf.to_book_metadata()
                if cover_path:
                    mi.cover = u'cover.jpg'
                mdataf.write(metadata_to_opf(mi))

            htmlz = ZipFile(output_path, 'w')
            try:
                htmlz.add_dir(tdir)
            finally:
                # Close explicitly so the archive is finalised before the
                # temporary directory is removed, rather than relying on
                # garbage collection.
                htmlz.close()