Esempio n. 1
0
 def __init__(self, href_or_path, basedir=None, is_path=True):
     """Initialise the link from either a filesystem path or an href.

     Sets ``path`` (absolute local path, or ``None``), ``fragment``
     (``''`` unless parsed from a local href), ``mime_type`` (guessed from
     ``href_or_path``, falling back to ``application/octet-stream``) and
     ``_href`` (the raw href when it points to a non-file URL scheme).

     :param href_or_path: a filesystem path when ``is_path`` is true,
         otherwise a URL/href string.
     :param basedir: directory used to resolve relative paths. ``None``
         (the default) means the current working directory at call time.
     :param is_path: whether ``href_or_path`` is a path rather than an href.
     """
     if basedir is None:
         # Resolved per call: a ``basedir=os.getcwd()`` default would be
         # evaluated only once, when the function is defined, and would go
         # stale as soon as the process changes directory.
         basedir = os.getcwd()
     self._href = None
     self._basedir = basedir
     self.path = None
     self.fragment = ''
     try:
         self.mime_type = mimetypes.guess_type(href_or_path)[0]
     except Exception:
         # Guessing is best effort; any failure means "unknown type".
         self.mime_type = None
     if self.mime_type is None:
         self.mime_type = 'application/octet-stream'
     if is_path:
         path = href_or_path
         # Decode before joining: os.path.join() refuses to mix a str
         # basedir with a bytes path.
         if isinstance(path, bytes):
             path = path.decode(sys.getfilesystemencoding())
         if not os.path.isabs(path):
             path = os.path.abspath(os.path.join(basedir, path))
         self.path = path
     else:
         url = urllib.parse.urlparse(href_or_path)
         if url[0] not in ('', 'file'):
             # Non-local scheme: keep the href verbatim, no local path.
             self._href = href_or_path
         else:
             pc = url[2]
             if isinstance(pc, str):
                 pc = pc.encode('utf-8')
             pc = polyglot.unquote(pc).decode('utf-8')
             self.path = os.path.abspath(
                 os.path.join(basedir, pc.replace('/', os.sep)))
             self.fragment = polyglot.unquote(url[-1])
Esempio n. 2
0
        def process_navpoint(np, dest):
            """Convert ``np`` and its descendants into entries under ``dest``.

            The play order comes from ``get_attr(np, 1)`` and falls back to
            1 when it cannot be converted to an int. When ``np`` has a label
            with non-empty text and a content node, a new item is added to
            ``dest`` and the children of ``np`` are attached to that item;
            otherwise the children are attached directly to ``dest``.
            """
            try:
                play_order = int(get_attr(np, 1))
            except Exception:
                play_order = 1

            target = dest
            labels = nl_path(np)
            if labels:
                # Only the first label is used; its text nodes are joined.
                text = ''.join(
                    etree.tostring(piece,
                                   method='text',
                                   encoding='unicode',
                                   with_tail=False)
                    for piece in txt_path(labels[0]))
                contents = content_path(np)
                if contents and text:
                    purl = urllib.parse.urlparse(contents[0].get('src'))
                    target = dest.add_item(polyglot.unquote(purl.path),
                                           polyglot.unquote(purl.fragment),
                                           text)
                    target.play_order = play_order

            for child in np_path(np):
                process_navpoint(child, target)
Esempio n. 3
0
 def _cover_from_html(self, hcover):
     """Render the HTML cover ``hcover`` and add the result to the manifest.

     The book is written to a temporary directory, the page referenced by
     ``hcover.href`` is passed to ``render_html_svg_workaround`` and the
     result (or ``b''`` when the renderer returns nothing) is stored as a
     new manifest item generated from the ``cover`` / ``cover.jpg`` names.

     :return: the manifest item that was added.
     """
     from ebook_converter.ebooks import render_html_svg_workaround
     with TemporaryDirectory('_html_cover') as tdir:
         OEBWriter()(self.oeb, tdir)
         cover_page = os.path.join(tdir, polyglot.unquote(hcover.href))
         data = render_html_svg_workaround(cover_page, self.logger) or b''
     manifest = self.oeb.manifest
     item_id, item_href = manifest.generate('cover', 'cover.jpg')
     return manifest.add(item_id, item_href, base.JPEG_MIME, data=data)
Esempio n. 4
0
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` as a directory of files under ``output_path``.

        The OPF, NCX and page-map documents returned by
        ``oeb_book.to_opf2(page_map=True)`` are serialised first, then every
        manifest item is written to the path given by its unquoted href,
        relative to ``output_path``.

        :param oeb_book: the book to write.
        :param output_path: destination directory; created if missing.
        :param input_plugin: unused by this method.
        :param opts: conversion options, stored on ``self.opts``;
            ``expand_css`` and ``output_profile.short_name`` are consulted.
        :param log: logger, stored on ``self.log``.
        """
        self.log, self.opts = log, opts
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(output_path, exist_ok=True)
        with directory.CurrentDir(output_path):
            results = oeb_book.to_opf2(page_map=True)
            for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
                href, root = results.pop(key, [None, None])
                if root is None:
                    continue
                if key == OPF_MIME:
                    # The device workarounds are best effort: log and carry
                    # on, but let KeyboardInterrupt/SystemExit propagate.
                    try:
                        self.workaround_nook_cover_bug(root)
                    except Exception:
                        self.log.exception('Something went wrong while '
                                           'trying to workaround Nook '
                                           'cover bug, ignoring')
                    try:
                        self.workaround_pocketbook_cover_bug(root)
                    except Exception:
                        self.log.exception('Something went wrong while '
                                           'trying to workaround '
                                           'Pocketbook cover bug, '
                                           'ignoring')
                    self.migrate_lang_code(root)
                raw = etree.tostring(root,
                                     pretty_print=True,
                                     encoding='utf-8',
                                     xml_declaration=True)
                if key == OPF_MIME:
                    # Needed as I can't get lxml to output opf:role and
                    # not output <opf:metadata> as well
                    raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
                with open(href, 'wb') as f:
                    f.write(raw)

            for item in oeb_book.manifest:
                if (not self.opts.expand_css and item.media_type in OEB_STYLES
                        and hasattr(item.data, 'cssText')
                        and 'nook' not in self.opts.output_profile.short_name):
                    condense_sheet(item.data)
                path = os.path.abspath(polyglot.unquote(item.href))
                os.makedirs(os.path.dirname(path), exist_ok=True)
                with open(path, 'wb') as f:
                    f.write(item.bytes_representation)
                item.unload_data_from_memory(memory=path)
Esempio n. 5
0
    def read_from_opf(self, opfreader):
        """Locate the table of contents referenced by an OPF and read it.

        The TOC reference is taken, in order, from the ``toc`` attribute of
        the ``spine`` element, from the guide reference of type ``toc``, or
        from the first manifest item whose href contains ``toc``. A value of
        ``ncx`` or ``ncxtoc`` is treated as a manifest id of an NCX file;
        anything else is treated as the href of an HTML TOC.
        """
        spine = opfreader.soup.find('spine', toc=True)
        toc = spine['toc'] if spine is not None else None
        if toc is None:
            try:
                guide = opfreader.soup.find('guide')
                toc = guide.find('reference', attrs={'type': 'toc'})['href']
            except Exception:
                for item in opfreader.manifest:
                    if 'toc' in item.href().lower():
                        toc = item.href()
                        break

        if toc is None:
            return

        if toc.lower() in ('ncx', 'ncxtoc'):
            entry = opfreader.manifest.item(toc.lower())
            ncx_path = getattr(entry, 'path', entry)
            if ncx_path and os.access(ncx_path, os.R_OK):
                try:
                    self.read_ncx_toc(ncx_path)
                except Exception as err:
                    print('WARNING: Invalid NCX file:', err)
                return
            # Manifest entry unusable: fall back to any NCX in the base dir.
            candidates = glob.glob(
                os.path.join(os.path.abspath(self.base_path), '*.ncx'))
            if candidates:
                self.read_ncx_toc(candidates[0])
            return

        html_toc = urllib.parse.urlparse(polyglot.unquote(toc))[2]
        html_toc = html_toc.replace('/', os.sep)
        if not os.path.isabs(html_toc):
            html_toc = os.path.join(self.base_path, html_toc)
        try:
            if not os.path.exists(html_toc):
                # Bug in BAEN OPF files
                fixed_name = os.path.basename(html_toc).replace('_top.htm',
                                                                '_toc.htm')
                html_toc = os.path.join(os.path.dirname(html_toc),
                                        fixed_name)
            self.read_html_toc(html_toc)
        except Exception:
            print('WARNING: Could not read Table of Contents. '
                  'Continuing anyway.')
Esempio n. 6
0
    def insert_cover(self):
        """Put a title page at the start of the spine.

        If the guide already has a ``titlepage`` entry, the manifest item it
        points to is used. Otherwise a new ``titlepage.xhtml`` is generated
        from the guide's ``cover`` href using either ``non_svg_template`` or
        ``svg_template``; nothing happens when there is no cover href. The
        guide's ``cover`` and ``titlepage`` references, and the TOC's
        ``item_that_refers_to_cover`` if present, are pointed at the
        inserted item.
        """
        guide, manifest = self.oeb.guide, self.oeb.manifest
        if 'titlepage' in guide:
            key = urllib.parse.urldefrag(self.oeb.guide['titlepage'].href)[0]
            item = self.oeb.manifest.hrefs[key]
        else:
            if 'cover' not in guide:
                return
            cover_href = guide['cover'].href
            if cover_href is None:
                return
            width, height = self.inspect_cover(cover_href)
            if width == -1 or height == -1:
                self.log.warning('Failed to read cover dimensions')
                width, height = 600, 800
            # The placeholders are filled in on the instance's template
            # itself, so the substitution persists after this call.
            substitutions = (
                ('__viewbox__', '0 0 %d %d' % (width, height)),
                ('__width__', str(width)),
                ('__height__', str(height)),
            )
            for placeholder, value in substitutions:
                self.svg_template = self.svg_template.replace(placeholder,
                                                              value)

            if self.no_svg_cover:
                template = self.non_svg_template
            else:
                template = self.svg_template
            markup = template % polyglot.unquote(cover_href)
            item_id, item_href = manifest.generate('titlepage',
                                                   'titlepage.xhtml')
            item = manifest.add(item_id,
                                item_href,
                                mimetypes.guess_type('t.xhtml')[0],
                                data=etree.fromstring(markup))

        if item is None:
            return
        self.oeb.spine.insert(0, item, True)
        if 'cover' not in self.oeb.guide.refs:
            self.oeb.guide.add('cover', 'Title Page', 'a')
        self.oeb.guide.refs['cover'].href = item.href
        if 'titlepage' in self.oeb.guide.refs:
            self.oeb.guide.refs['titlepage'].href = item.href
        toc_item = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
        if toc_item is not None:
            toc_item.href = item.href
Esempio n. 7
0
    def rewrite_links(self, url):
        """Return ``url`` rewritten through ``self.map``.

        The fragment is split off, the remainder is made absolute relative
        to ``self.current_item`` and normalised. If the result is a key of
        ``self.map``, the replacement registered for the fragment (or for
        ``None`` when there is no fragment) is returned relative to the
        current item, with the original fragment re-attached. In every
        other case, including an unparseable URL, ``url`` is returned
        unchanged.
        """
        target, anchor = urllib.parse.urldefrag(url)
        try:
            # ValueError: unparseable URL, or href with non utf-8 quoting.
            target = base.urlnormalize(self.current_item.abshref(target))
        except ValueError:
            return url
        if target not in self.map:
            return url

        replacement = self.map[target][anchor or None]
        replacement = self.current_item.relhref(replacement)
        if anchor:
            replacement = '#'.join((polyglot.unquote(replacement), anchor))
        return replacement
Esempio n. 8
0
 def build_node(current_node, parent=None):
     """Recursively render ``current_node.nodes`` as nested ``ul``/``li``.

     A root ``ul`` element is created when ``parent`` is ``None``;
     otherwise a ``ul`` is appended to ``parent`` only if the node has
     children. Each child becomes an ``li`` holding an ``a`` whose href is
     the child's unquoted href made relative to the directory of
     ``ref_url`` and whose text is the whitespace-collapsed title.

     :return: the ``ul`` element used for this level, or ``parent``
         unchanged when it was given and the node has no children.
     """
     if parent is None:
         parent = etree.Element('ul')
     elif len(current_node.nodes):
         parent = element(parent, 'ul')

     for child in current_node.nodes:
         entry = element(parent, 'li')
         target = relpath(os.path.abspath(polyglot.unquote(child.href)),
                          os.path.dirname(ref_url))
         if isinstance(target, bytes):
             target = target.decode('utf-8')
         anchor = element(entry, 'a', href=clean_xml_chars(target))

         label = child.title
         if isinstance(label, bytes):
             label = label.decode('utf-8')
         if label:
             label = re.sub(r'\s+', ' ', label)
         anchor.text = clean_xml_chars(label)

         build_node(child, entry)
     return parent
Esempio n. 9
0
def parse_html_toc(data):
    """Yield ``(href, fragment, text)`` for every ``a[href]`` in ``data``.

    ``data`` may be bytes, in which case it is first converted with
    ``xml_to_unicode``. The href is unquoted before being split; ``href``
    is the stripped path component, ``fragment`` is the stripped fragment
    or ``None`` when the link has none, and ``text`` is the element
    serialised with the ``text`` method.
    """
    from html5_parser import parse
    from ebook_converter.utils.cleantext import clean_xml_chars
    from lxml import etree

    if isinstance(data, bytes):
        data = xml_to_unicode(data,
                              strip_encoding_pats=True,
                              resolve_entities=True)[0]
    root = parse(clean_xml_chars(data),
                 maybe_xhtml=True,
                 keep_doctype=False,
                 sanitize_names=True)
    for anchor in root.xpath('//*[@href and local-name()="a"]'):
        parts = urllib.parse.urlparse(polyglot.unquote(anchor.get('href')))
        fragment = parts[5].strip() if parts[5] else None
        href = parts[2].strip()
        yield href, fragment, etree.tostring(anchor,
                                             method='text',
                                             encoding='unicode')
Esempio n. 10
0
 def unquote(x):
     """Unquote ``x`` via ``polyglot.unquote`` on its UTF-8 bytes.

     A ``str`` argument is UTF-8 encoded first; the unquoted result is
     decoded back from UTF-8.
     """
     raw = x.encode('utf-8') if isinstance(x, str) else x
     return polyglot.unquote(raw).decode('utf-8')
Esempio n. 11
0
    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Export ``oeb_book`` as a zip archive of templated HTML pages.

        An index page plus a ``*_files`` directory containing every
        manifest item are produced in a temporary directory. Spine items
        are rendered through the HTML page template. Both are then added
        to a zip archive at ``output_path``. When ``opts.extract_to`` is
        set, that directory is recreated and the archive is extracted
        into it.

        :param oeb_book: the book to export; its ``manifest``, ``spine``,
            ``toc`` and ``metadata`` are read.
        :param output_path: path of the zip archive to create.
        :param input_plugin: unused by this method.
        :param opts: conversion options, stored on ``self.opts``;
            ``template_html_index``, ``template_html``, ``template_css``
            and ``extract_to`` are consulted.
        :param log: logger, stored on ``self.log``.
        """
        from lxml import etree
        from ebook_converter.utils import zipfile
        from templite import Templite
        from ebook_converter.ebooks.html.meta import EasyMeta

        def read_template(custom_path, default_resource):
            # Always read bytes and decode exactly once. Opening the bundled
            # defaults in text mode and then calling .decode() on the
            # resulting str raises AttributeError.
            if custom_path is None:
                custom_path = pkg_resources.resource_filename(
                    'ebook_converter', default_resource)
            with open(custom_path, 'rb') as fobj:
                return fobj.read().decode('utf-8')

        template_html_index_data = read_template(
            opts.template_html_index, 'data/html_export_default_index.tmpl')
        template_html_data = read_template(
            opts.template_html, 'data/html_export_default.tmpl')
        template_css_data = read_template(
            opts.template_css, 'data/html_export_default.css')

        self.log = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        xhtml_ns = {'h': 'http://www.w3.org/1999/xhtml'}
        # Compiled once instead of once per spine item.
        style_block = re.compile(r'\<style.*\/style\>', re.M | re.S)

        tempdir = os.path.realpath(PersistentTemporaryDirectory())
        try:
            output_file = os.path.join(
                tempdir,
                os.path.basename(re.sub(r'\.zip', '', output_path) + '.html'))
            output_dir = re.sub(r'\.html', '', output_file) + '_files'
            os.makedirs(output_dir, exist_ok=True)

            css_path = output_dir + os.sep + 'calibreHtmlOutBasicCss.css'
            with open(css_path, 'wb') as f:
                f.write(template_css_data.encode('utf-8'))

            # Index page
            with open(output_file, 'wb') as f:
                html_toc = self.generate_html_toc(oeb_book, output_file,
                                                  output_dir)
                templite = Templite(template_html_index_data)
                nextLink = oeb_book.spine[0].href
                nextLink = relpath(output_dir + os.sep + nextLink,
                                   os.path.dirname(output_file))
                cssLink = relpath(os.path.abspath(css_path),
                                  os.path.dirname(output_file))
                tocUrl = relpath(output_file, os.path.dirname(output_file))
                t = templite.render(has_toc=bool(oeb_book.toc.count()),
                                    toc=html_toc,
                                    meta=meta,
                                    nextLink=nextLink,
                                    tocUrl=tocUrl,
                                    cssLink=cssLink,
                                    firstContentPageLink=nextLink)
                if isinstance(t, str):
                    t = t.encode('utf-8')
                f.write(t)

            with directory.CurrentDir(output_dir):
                for item in oeb_book.manifest:
                    path = os.path.abspath(polyglot.unquote(item.href))
                    os.makedirs(os.path.dirname(path), exist_ok=True)
                    if item.spine_position is not None:
                        # Spine items are rendered below; only create the
                        # (empty) file here.
                        with open(path, 'wb') as f:
                            pass
                    else:
                        with open(path, 'wb') as f:
                            f.write(item.bytes_representation)
                        item.unload_data_from_memory(memory=path)

                for item in oeb_book.spine:
                    path = os.path.abspath(polyglot.unquote(item.href))
                    item_dir = os.path.dirname(path)
                    root = item.data.getroottree()

                    # get & clean HTML <HEAD>-data
                    head = root.xpath('//h:head', namespaces=xhtml_ns)[0]
                    head_content = etree.tostring(head,
                                                  pretty_print=True,
                                                  encoding='unicode')
                    head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                    head_content = style_block.sub('', head_content)
                    head_content = re.sub(r'<(title)([^>]*)/>',
                                          r'<\1\2></\1>', head_content)

                    # get & clean HTML <BODY>-data
                    body = root.xpath('//h:body', namespaces=xhtml_ns)[0]
                    ebook_content = etree.tostring(body,
                                                   pretty_print=True,
                                                   encoding='unicode')
                    ebook_content = re.sub(r'\<\/?body.*\>', '',
                                           ebook_content)
                    ebook_content = re.sub(r'<(div|a|span)([^>]*)/>',
                                           r'<\1\2></\1>', ebook_content)

                    # generate link to next page
                    if item.spine_position + 1 < len(oeb_book.spine):
                        nextLink = oeb_book.spine[item.spine_position +
                                                  1].href
                        nextLink = relpath(os.path.abspath(nextLink),
                                           item_dir)
                    else:
                        nextLink = None

                    # generate link to previous page
                    if item.spine_position > 0:
                        prevLink = oeb_book.spine[item.spine_position -
                                                  1].href
                        prevLink = relpath(os.path.abspath(prevLink),
                                           item_dir)
                    else:
                        prevLink = None

                    cssLink = relpath(os.path.abspath(css_path), item_dir)
                    tocUrl = relpath(output_file, item_dir)
                    firstContentPageLink = oeb_book.spine[0].href

                    # render template; the TOC is passed as a callable so
                    # it is only generated if the template asks for it
                    templite = Templite(template_html_data)
                    toc = lambda: self.generate_html_toc(oeb_book, path,
                                                         output_dir)
                    t = templite.render(
                        ebookContent=ebook_content,
                        prevLink=prevLink,
                        nextLink=nextLink,
                        has_toc=bool(oeb_book.toc.count()),
                        toc=toc,
                        tocUrl=tocUrl,
                        head_content=head_content,
                        meta=meta,
                        cssLink=cssLink,
                        firstContentPageLink=firstContentPageLink)

                    # write html to file
                    with open(path, 'wb') as f:
                        f.write(t.encode('utf-8'))
                    item.unload_data_from_memory(memory=path)

            zfile = zipfile.ZipFile(output_path, "w")
            try:
                zfile.add_dir(output_dir, os.path.basename(output_dir))
                zfile.write(output_file, os.path.basename(output_file),
                            zipfile.ZIP_DEFLATED)

                if opts.extract_to:
                    if os.path.exists(opts.extract_to):
                        shutil.rmtree(opts.extract_to)
                    os.makedirs(opts.extract_to)
                    zfile.extractall(opts.extract_to)
                    self.log.info('Zip file extracted to %s',
                                  opts.extract_to)
            finally:
                # Close the archive even if adding or extracting failed.
                zfile.close()
        finally:
            # cleanup temp dir, also when the conversion failed part-way
            shutil.rmtree(tempdir)