Example #1
def parse_outline(raw, output_dir):
    raw = clean_xml_chars(
        xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
    outline = etree.fromstring(raw).xpath('(//outline)[1]')
    if outline:
        from ebook_converter.ebooks.oeb.polish.toc import TOC, create_ncx
        outline = outline[0]
        toc = TOC()
        count = [0]  # a list, so the nested function can mutate it

        def process_node(node, toc):
            for child in node.iterchildren('*'):
                if child.tag == 'outline':
                    parent = toc.children[-1] if toc.children else toc
                    process_node(child, parent)
                else:
                    if child.text:
                        page = child.get('page', '1')
                        toc.add(child.text, 'index.html', 'p' + page)
                        count[0] += 1

        process_node(outline, toc)
        if count[0] > 2:
            root = create_ncx(toc, (lambda x: x), 'pdftohtml', 'en',
                              'pdftohtml')
            with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
                f.write(
                    etree.tostring(root,
                                   pretty_print=True,
                                   with_tail=False,
                                   encoding='utf-8',
                                   xml_declaration=True))
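
A minimal driver for this helper, assuming the XML outline emitted by
pdftohtml has been saved to disk (the file name outline.xml below is
hypothetical):

    import os

    with open('outline.xml', 'rb') as f:  # hypothetical pdftohtml outline
        raw = f.read()
    # Writes <output_dir>/toc.ncx, but only when more than two entries exist.
    parse_outline(raw, os.getcwd())
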
Example #2
    def _create_html_root(self, hhcpath, log, encoding):

        hhcdata = self._read_file(hhcpath)
        hhcdata = hhcdata.decode(encoding)
        hhcdata = xml_to_unicode(hhcdata, verbose=True,
                                 strip_encoding_pats=True,
                                 resolve_entities=True)[0]
        hhcroot = html.fromstring(hhcdata)
        toc = self._process_nodes(hhcroot)
        log.debug('Found %d section nodes' % toc.count())
        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
        base = os.path.dirname(os.path.abspath(htmlpath))

        def unquote(x):
            if isinstance(x, str):
                x = x.encode('utf-8')
            return _unquote(x).decode('utf-8')

        def unquote_path(x):
            y = unquote(x)
            if (not os.path.exists(os.path.join(base, x)) and
                    os.path.exists(os.path.join(base, y))):
                x = y
            return x

        def donode(item, parent, base, subpath):
            for child in item:
                title = child.title
                if not title:
                    continue
                raw = unquote_path(child.href or '')
                rsrcname = os.path.basename(raw)
                rsrcpath = os.path.join(subpath, rsrcname)
                if (not os.path.exists(os.path.join(base, rsrcpath)) and
                        os.path.exists(os.path.join(base, raw))):
                    rsrcpath = raw

                if '%' not in rsrcpath:
                    rsrcpath = urlquote(rsrcpath)
                if not raw:
                    rsrcpath = ''
                c = builder.DIV(builder.A(title, href=rsrcpath))
                donode(child, c, base, subpath)
                parent.append(c)

        with open(htmlpath, 'wb') as f:
            if toc.count() > 1:
                path0 = toc[0].href
                path0 = unquote_path(path0)
                subpath = os.path.dirname(path0)
                base = os.path.dirname(f.name)
                root = builder.DIV()
                donode(toc, root, base, subpath)
                raw = html.tostring(builder.HTML(builder.BODY(root)),
                                    encoding='utf-8',
                                    pretty_print=True)
                f.write(raw)
            else:
                f.write(as_bytes(hhcdata))
        return htmlpath, toc
Example #3
def parse_html5(raw,
                decoder=None,
                log=None,
                discard_namespaces=False,
                line_numbers=True,
                linenumber_attribute=None,
                replace_entities=True,
                fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = entities.xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw,
                              maybe_xhtml=not discard_namespaces,
                              line_number_attr=linenumber_attribute,
                              keep_doctype=False,
                              sanitize_names=True)
    if ((discard_namespaces and root.tag != 'html') or
        (not discard_namespaces and
         (root.tag != '{%s}%s' % (const.XHTML_NS, 'html') or root.prefix))):
        raise ValueError('Failed to parse correctly, root has tag: %s and '
                         'prefix: %s' % (root.tag, root.prefix))
    return root
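
A quick sketch of driving this parser; the data-lnum attribute name below is
an arbitrary choice, not part of the API:

    raw = b'<!DOCTYPE html><p>Hello<p>World'  # tag soup is accepted
    root = parse_html5(raw, linenumber_attribute='data-lnum')
    # The root is <html> in the XHTML namespace; each element carries the
    # source line it came from in the requested attribute.
    print(root.tag)
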
Example #4
    def read_ncx_toc(self, toc, root=None):
        self.base_path = os.path.dirname(toc)
        if root is None:
            with open(toc, 'rb') as f:
                raw = xml_to_unicode(f.read(),
                                     assume_utf8=True,
                                     strip_encoding_pats=True)[0]
            root = etree.fromstring(raw)
        xpn = {'re': 'http://exslt.org/regular-expressions'}
        XPath = functools.partial(etree.XPath, namespaces=xpn)

        def get_attr(node, default=None, attr='playorder'):
            for name, val in node.attrib.items():
                if name and val and name.lower().endswith(attr):
                    return val
            return default

        nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
        txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
        content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
        np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

        def process_navpoint(np, dest):
            try:
                play_order = int(get_attr(np, 1))
            except Exception:
                play_order = 1
            href = fragment = text = None
            nd = dest
            nl = nl_path(np)
            if nl:
                nl = nl[0]
                text = ''
                for txt in txt_path(nl):
                    text += etree.tostring(txt,
                                           method='text',
                                           encoding='unicode',
                                           with_tail=False)
                content = content_path(np)
                if content and text:
                    content = content[0]
                    # if get_attr(content, attr='src'):
                    purl = urllib.parse.urlparse(content.get('src'))
                    href = polyglot.unquote(purl[2])
                    fragment = polyglot.unquote(purl[5])
                    nd = dest.add_item(href, fragment, text)
                    nd.play_order = play_order

            for c in np_path(np):
                process_navpoint(c, nd)

        nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
        if not nm:
            raise ValueError('NCX files must have a <navmap> element.')
        nm = nm[0]

        for child in np_path(nm):
            process_navpoint(child, self)
Example #5
def parse_html(markup):
    if isinstance(markup, str):
        markup = chardet.strip_encoding_declarations(markup)
        markup = chardet.substitute_entites(markup)
    else:
        markup = chardet.xml_to_unicode(markup,
                                        strip_encoding_pats=True,
                                        resolve_entities=True)[0]
    markup = cleantext.clean_xml_chars(markup)
    return html5_soup.parse(markup, return_root=False)
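
A minimal sketch of the two input paths; with return_root=False the call
returns the whole parsed tree rather than just its root element:

    tree = parse_html('<p>Tom &amp; Jerry</p>')   # str: entities substituted
    tree = parse_html(b'<p>Tom &amp; Jerry</p>')  # bytes: decoded first
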
Example #6
def parse(raw,
          decoder=None,
          log=None,
          line_numbers=True,
          linenumber_attribute=None,
          replace_entities=True,
          force_html5_parse=False):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        # Handle &#0;
        raw = entities.xml_replace_entities(raw).replace('\0', '')
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag, as it can cause
    # problems (especially doctypes); preserve the original line numbers
    # by inserting newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw,
                                      limit=10 * 1024,
                                      preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw,
                           log=log,
                           line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False,
                           fix_newlines=False)
    try:
        ans = etree.fromstring(raw)
        if ans.tag != '{%s}html' % const.XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            # etree.Element as the tag matches only element nodes, skipping
            # comments and processing instructions
            for elem in ans.iter(etree.Element):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw,
                           log=log,
                           line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False,
                           fix_newlines=False)
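
A sketch of the two code paths (both inputs are made up): well-formed,
namespaced XHTML takes the strict lxml route, while anything else falls
through to parse_html5():

    xhtml = (b'<html xmlns="http://www.w3.org/1999/xhtml">'
             b'<head><title>t</title></head><body><p>ok</p></body></html>')
    root = parse(xhtml)                    # strict XML parse succeeds
    soup = parse(b'<p>unclosed tag soup')  # falls back to parse_html5
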
Example #7
def parse_opf(stream_or_path):
    stream = stream_or_path
    if not hasattr(stream, 'read'):
        stream = open(stream, 'rb')
    raw = stream.read()
    if not raw:
        raise ValueError('Empty file: ' + getattr(stream, 'name', 'stream'))
    raw, encoding = xml_to_unicode(raw,
                                   strip_encoding_pats=True,
                                   resolve_entities=True,
                                   assume_utf8=True)
    raw = raw[raw.find('<'):]
    root = etree.fromstring(clean_xml_chars(raw))
    if root is None:
        raise ValueError('Not an OPF file')
    return root
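
Callers may pass either a file system path or an object with a read()
method; for example (the file name is hypothetical):

    root = parse_opf('metadata.opf')
    print(root.tag)  # typically '{http://www.idpf.org/2007/opf}package'
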
Example #8
def html2text(html):
    from html2text import HTML2Text
    import re
    if isinstance(html, bytes):
        from ebook_converter.ebooks.chardet import xml_to_unicode
        html = xml_to_unicode(html,
                              strip_encoding_pats=True,
                              resolve_entities=True)[0]
    # replace <u> tags with <span> as <u> becomes emphasis in html2text
    html = re.sub(r'<\s*(?P<solidus>/?)\s*[uU]\b(?P<rest>[^>]*)>',
                  r'<\g<solidus>span\g<rest>>', html)
    h2t = HTML2Text()
    h2t.default_image_alt = 'Unnamed image'
    h2t.body_width = 0
    h2t.single_line_break = True
    h2t.emphasis_mark = '*'
    return h2t.handle(html)
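
For instance, the <u> substitution keeps underlined text from coming out
wrapped in emphasis markers:

    print(html2text(b'Some <u>underlined</u> words'))
    # -> 'Some underlined words' rather than emphasis-marked text
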
Example #9
        def fget(self):
            if self.compressed_info_size == 0:
                raise LRFException("This document has no meta info")
            size = self.compressed_info_size - 4
            self._file.seek(self.info_start)
            try:
                src = zlib.decompress(self._file.read(size))
                if len(src) != self.uncompressed_info_size:
                    raise LRFException("Decompression of document meta info "
                                       "yielded unexpected results")

                src = xml_to_unicode(src,
                                     strip_encoding_pats=True,
                                     resolve_entities=True,
                                     assume_utf8=True)[0]
                return minidom.parseString(src)
            except zlib.error:
                raise LRFException("Unable to decompress document meta "
                                   "information")
Example #10
def parse_html_toc(data):
    from html5_parser import parse
    from ebook_converter.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True,
                              resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False,
                 sanitize_names=True)
    for a in root.xpath('//*[@href and local-name()="a"]'):
        purl = urllib.parse.urlparse(unquote(a.get('href')))
        href, fragment = purl[2], purl[5]
        if not fragment:
            fragment = None
        else:
            fragment = fragment.strip()
        href = href.strip()

        txt = etree.tostring(a, method='text', encoding='unicode')
        yield href, fragment, txt
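
The generator yields (href, fragment, text) triples, so a caller might do
(the file name is hypothetical):

    with open('toc.html', 'rb') as f:  # hypothetical HTML table of contents
        for href, fragment, text in parse_html_toc(f.read()):
            print(href, fragment, text.strip())
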
Example #11
def render_html_svg_workaround(path_to_html, log, width=590, height=750):
    from ebook_converter.ebooks.oeb.base import SVG_NS
    with open(path_to_html, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    data = None
    if SVG_NS in raw:
        try:
            data = extract_cover_from_embedded_svg(
                raw, os.path.dirname(path_to_html), log)
        except Exception:
            pass
    if data is None:
        try:
            data = extract_calibre_cover(raw, os.path.dirname(path_to_html),
                                         log)
        except Exception:
            pass

    if data is None:
        data = render_html_data(path_to_html, width, height)
    return data
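
A hedged call-site sketch, assuming a calibre-style log object and a
hypothetical input file:

    import logging

    data = render_html_svg_workaround('titlepage.xhtml',
                                      logging.getLogger('render'))
    if data:
        with open('cover.jpg', 'wb') as f:  # hypothetical destination
            f.write(data)
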
Example #12
    def postprocess_book(self, oeb, opts, log):
        from ebook_converter.ebooks.oeb.base import XPath, XHTML
        for item in oeb.spine:
            root = item.data
            if not hasattr(root, 'xpath'):
                continue
            for bad in ('metadata', 'guide'):
                metadata = XPath('//h:'+bad)(root)
                if metadata:
                    for x in metadata:
                        x.getparent().remove(x)
            body = XPath('//h:body')(root)
            if body:
                body = body[0]
                if len(body) == 1 and body[0].tag == XHTML('pre'):
                    pre = body[0]
                    from ebook_converter.ebooks.txt.processor import \
                        convert_basic, separate_paragraphs_single_line
                    from ebook_converter.ebooks.chardet import xml_to_unicode
                    self.log('LIT file with all text in single <pre> tag '
                             'detected')
                    html = separate_paragraphs_single_line(pre.text)
                    html = convert_basic(html).replace('<html>',
                                                       '<html xmlns="%s">' %
                                                       const.XHTML_NS)
                    html = xml_to_unicode(html, strip_encoding_pats=True,
                                          resolve_entities=True)[0]
                    if opts.smarten_punctuation:
                        # SmartyPants skips text inside <pre> tags
                        from ebook_converter.ebooks.conversion import \
                                preprocess
                        html = preprocess.smarten_punctuation(html, self.log)
                    root = etree.fromstring(html)
                    body = XPath('//h:body')(root)
                    pre.tag = XHTML('div')
                    pre.text = ''
                    for elem in body:
                        ne = copy.deepcopy(elem)
                        pre.append(ne)
Example #13
def _get_fbroot(raw):
    raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    root = etree.fromstring(raw)
    return ensure_namespace(root)
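
A sketch of a call, with a hypothetical FB2 file:

    with open('book.fb2', 'rb') as f:  # hypothetical input
        root = _get_fbroot(f.read())
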
Example #14
    def convert_epub3_nav(self, nav_path, opf, log, opts):
        from lxml import etree
        from ebook_converter.ebooks.chardet import xml_to_unicode
        from ebook_converter.ebooks.oeb.polish.parsing import parse
        from ebook_converter.ebooks.oeb.base import serialize
        from ebook_converter.ebooks.oeb.polish.toc import first_child
        from tempfile import NamedTemporaryFile
        with open(nav_path, 'rb') as f:
            raw = f.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                             assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/'
                               'ncx/" version="2005-1" xml:lang="eng">'
                               '<navMap/></ncx>')
        navmap = ncx[0]
        et = '{%s}type' % const.EPUB_NS
        bn = os.path.basename(nav_path)

        def add_from_li(li, parent):
            href = text = None
            for x in li.iterchildren(base.tag('xhtml', 'a'),
                                     base.tag('xhtml', 'span')):
                text = etree.tostring(
                    x, method='text', encoding='unicode',
                    with_tail=False).strip() or ' '.join(
                        x.xpath('descendant-or-self::*/@title')).strip()
                href = x.get('href')
                if href:
                    if href.startswith('#'):
                        href = bn + href
                break
            np = parent.makeelement(base.tag('ncx', 'navPoint'))
            parent.append(np)
            np.append(np.makeelement(base.tag('ncx', 'navLabel')))
            np[0].append(np.makeelement(base.tag('ncx', 'text')))
            np[0][0].text = text
            if href:
                np.append(
                    np.makeelement(base.tag('ncx', 'content'),
                                   attrib={'src': href}))
            return np

        def process_nav_node(node, toc_parent):
            for li in node.iterchildren(base.tag('xhtml', 'li')):
                child = add_from_li(li, toc_parent)
                ol = first_child(li, base.tag('xhtml', 'ol'))
                if child is not None and ol is not None:
                    process_nav_node(ol, child)

        for nav in root.iterdescendants(base.tag('xhtml', 'nav')):
            if nav.get(et) == 'toc':
                ol = first_child(nav, base.tag('xhtml', 'ol'))
                if ol is not None:
                    process_nav_node(ol, navmap)
                    break
        else:
            return

        with NamedTemporaryFile(suffix='.ncx',
                                dir=os.path.dirname(nav_path),
                                delete=False) as f:
            f.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/')
        ncx_id = opf.create_manifest_item(ncx_href, base.NCX_MIME,
                                          append=True).get('id')
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
        url = os.path.relpath(nav_path).replace(os.sep, '/')
        opts.epub3_nav_href = base.urlnormalize(url)
        opts.epub3_nav_parsed = root
        if getattr(self, 'removed_cover', None):
            changed = False
            base_path = os.path.dirname(nav_path)
            for elem in root.xpath('//*[@href]'):
                href, frag = elem.get('href').partition('#')[::2]
                link_path = (os.path.relpath(
                    os.path.join(base_path, urllib.parse.unquote(href)),
                    base_path))
                abs_href = base.urlnormalize(link_path)
                if abs_href == self.removed_cover:
                    changed = True
                    elem.set('data-calibre-removed-titlepage', '1')
            if changed:
                with open(nav_path, 'wb') as f:
                    f.write(base.serialize(root, 'application/xhtml+xml'))
Example #15
def parse_html(data,
               log=None,
               decoder=None,
               preprocessor=None,
               filename='<string>',
               non_html_file_tags=frozenset()):
    if log is None:
        log = LOG

    filename = force_unicode(filename, enc=filesystem_encoding)

    if not isinstance(data, str):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]

    data = strip_encoding_declarations(data)
    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
    pre = ''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:  # Handle user defined entities
            # kindlegen produces invalid xhtml with uppercase attribute names
            # if fed HTML 4 with uppercase attribute names, so try to detect
            # and compensate for that.
            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>',
                                          pre) is not None
            # Process private entities
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                pat = re.compile(r'&(%s);' %
                                 ('|'.join(list(user_entities.keys()))))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)

    if preprocessor is not None:
        data = preprocessor(data)

    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')
    data = raw = clean_word_doc(data, log)

    # Try with more & more drastic measures to parse
    try:
        data = etree.fromstring(data)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug('Initial parse failed, using more forgiving parsers')
        raw = data = xml_replace_entities(raw)
        try:
            data = etree.fromstring(data)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug('Parsing %s as HTML' % filename)
            data = raw
            try:
                data = html5_parse(data)
            except Exception:
                log.exception(
                    'HTML 5 parsing failed, falling back to older parsers')
                data = _html4_parse(data)

    if has_html4_doctype or data.tag == 'HTML' or (
            len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():
            try:
                x.tag = x.tag.lower()
                for key, val in tuple(x.attrib.items()):
                    del x.attrib[key]
                    key = key.lower()
                    x.attrib[key] = val
            except:
                pass

    if barename(data.tag) != 'html':
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn('File %r does not appear to be (X)HTML' % filename)
        nroot = etree.fromstring('<html></html>')
        has_body = False
        for child in list(data):
            if isinstance(child.tag,
                          (str, bytes)) and barename(child.tag) == 'body':
                has_body = True
                break
        parent = nroot
        if not has_body:
            log.warn('File %r appears to be an HTML fragment' % filename)
            nroot = etree.fromstring('<html><body/></html>')
            parent = nroot[0]
        for child in list(data.iter()):
            oparent = child.getparent()
            if oparent is not None:
                oparent.remove(child)
            parent.append(child)
        data = nroot

    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn('Forcing', filename, 'into XHTML namespace')
        data.attrib['xmlns'] = const.XHTML_NS
        data = etree.tostring(data, encoding='unicode')

        try:
            data = etree.fromstring(data)
        except:
            data = data.replace(':=', '=').replace(':>', '>')
            data = data.replace('<http:/>', '')
            try:
                data = etree.fromstring(data)
            except etree.XMLSyntaxError:
                log.warn('Stripping comments from %s' % filename)
                data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', data)
                data = data.replace(
                    "<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", '')
                data = data.replace("<?xml version='1.0' encoding='utf-8'??>",
                                    '')
                try:
                    data = etree.fromstring(data)
                except etree.XMLSyntaxError:
                    log.warn('Stripping meta tags from %s' % filename)
                    data = re.sub(r'<meta\s+[^>]+?>', '', data)
                    data = etree.fromstring(data)
    elif namespace(data.tag) != const.XHTML_NS:
        # OEB_DOC_NS, but possibly others
        ns = namespace(data.tag)
        attrib = dict(data.attrib)
        nroot = etree.Element(XHTML('html'),
                              nsmap={None: const.XHTML_NS},
                              attrib=attrib)
        for elem in data.iterdescendants():
            if isinstance(elem.tag, (str, bytes)) and \
                namespace(elem.tag) == ns:
                elem.tag = XHTML(barename(elem.tag))
        for elem in data:
            nroot.append(elem)
        data = nroot

    # Remove non default prefixes referring to the XHTML namespace
    data = ensure_namespace_prefixes(data, {None: const.XHTML_NS})

    data = merge_multiple_html_heads_and_bodies(data, log)
    # Ensure has a <head/>
    head = xpath(data, '/h:html/h:head')
    head = head[0] if head else None
    if head is None:
        log.warn('File %s missing <head/> element' % filename)
        head = etree.Element(XHTML('head'))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML('title'))
        title.text = 'Unknown'
    elif not xpath(data, '/h:html/h:head/h:title'):
        title = etree.SubElement(head, XHTML('title'))
        title.text = 'Unknown'
    # Ensure <title> is not empty
    title = xpath(data, '/h:html/h:head/h:title')[0]
    if not title.text or not title.text.strip():
        title.text = 'Unknown'
    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
    meta = etree.SubElement(head,
                            XHTML('meta'),
                            attrib={'http-equiv': 'Content-Type'})
    meta.set('content',
             'text/html; charset=utf-8')  # Ensure content is second attribute

    # Ensure has a <body/>
    if not xpath(data, '/h:html/h:body'):
        body = xpath(data, '//h:body')
        if body:
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn('File %s missing <body/> element' % filename)
            etree.SubElement(data, XHTML('body'))

    # Remove microsoft office markup
    r = [
        x for x in data.iterdescendants(etree.Element)
        if 'microsoft-com' in x.tag
    ]
    for x in r:
        x.tag = XHTML('span')

    def remove_elem(a):
        p = a.getparent()
        idx = p.index(a) - 1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ''
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.tail

    # Remove hyperlinks with no content as they cause rendering
    # artifacts in browser based renderers
    # Also remove empty <b>, <u> and <i> tags
    for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
        if a.get('id', None) is None and a.get('name', None) is None \
                and len(a) == 0 and not a.text:
            remove_elem(a)

    # Convert <br>s with content into paragraphs as ADE can't handle
    # them
    for br in xpath(data, '//h:br'):
        if len(br) > 0 or br.text:
            br.tag = XHTML('div')

    # Remove any stray text in the <head> section and format it nicely
    data.text = '\n  '
    head = xpath(data, '//h:head')
    if head:
        head = head[0]
        head.text = '\n    '
        head.tail = '\n  '
        for child in head:
            child.tail = '\n    '
        # dedent the tail of the last child so </head> closes at indent level
        child.tail = '\n  '

    return data
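
Whatever goes in, what comes out is an lxml tree in the XHTML namespace with
<head>, a non-empty <title> and a <body>; a sketch with a deliberately
incomplete fragment (the file name is hypothetical):

    root = parse_html(b'<p>fragment only</p>', filename='frag.html')
    print(root.tag)  # -> '{http://www.w3.org/1999/xhtml}html'
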
Example #16
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isinstance(src, bytes):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or 'Unknown'

    # Author
    authors = authors_to_string(get_all('authors')) or 'Unknown'

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields
    for field in ('comments',):
        val = get(field)
        if val:
            setattr(mi, field, val.replace('&', '&amp;')
                    .replace('<', '&lt;').replace('>', '&gt;')
                    .replace('"', '&quot;').replace("'", '&apos;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for k, v in meta_tag_ids.items():
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi
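
A usage sketch, assuming raw HTML with calibre-style meta tags (the file
name is hypothetical):

    with open('book.html', 'rb') as f:  # hypothetical input
        mi = get_metadata_(f.read())
    print(mi.title, mi.authors)
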
Example #17
    def convert(self, stream, options, file_ext, log, accelerators):
        from ebook_converter.ebooks.chardet import xml_to_unicode
        from ebook_converter.ebooks.metadata.opf2 import OPF
        from ebook_converter.utils.zipfile import ZipFile

        self.log = log
        html = u''
        top_levels = []

        # Extract content from zip archive.
        zf = ZipFile(stream)
        zf.extractall()

        # Find the HTML file in the archive. It needs to be
        # top level.
        index = u''
        multiple_html = False
        # Get a list of all top level files in the archive.
        for x in os.listdir(u'.'):
            if os.path.isfile(x):
                top_levels.append(x)
        # Try to find an index file.
        for x in top_levels:
            if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
                index = x
                break
        # Look for multiple HTML files in the archive. We look at the
        # top level files only as only they matter in HTMLZ.
        for x in top_levels:
            if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
                # Set index to the first HTML file found if it's not
                # called index.
                if not index:
                    index = x
                else:
                    multiple_html = True
        # Warn the user if there are multiple HTML files in the archive. HTMLZ
        # supports a single HTML file, so a conversion from an archive with
        # multiple HTML files probably won't turn out as the user expects.
        # With multiple HTML files, ZIP input should be used instead of HTMLZ.
        if multiple_html:
            log.warn('Multiple HTML files found in the archive. Only %s will '
                     'be used.' % index)

        if index:
            with open(index, 'rb') as tf:
                html = tf.read()
        else:
            raise Exception('No top level HTML file found.')

        if not html:
            raise Exception('Top level HTML file %s is empty' % index)

        # Encoding
        if options.input_encoding:
            ienc = options.input_encoding
        else:
            ienc = xml_to_unicode(html[:4096])[-1]
        html = html.decode(ienc, 'replace')

        # Run the HTML through the html processing plugin.
        from ebook_converter.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwd()
        htmlfile = os.path.join(base, u'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = u'index%d.html' % c
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log, {})
        options.debug_pipeline = odi
        os.remove(htmlfile)

        # Set metadata from file.
        from ebook_converter.customize.ui import get_file_type_metadata
        from ebook_converter.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        # Get the cover path from the OPF.
        cover_path = None
        opf = None
        for x in top_levels:
            if os.path.splitext(x)[1].lower() == u'.opf':
                opf = x
                break
        if opf:
            opf = OPF(opf, basedir=os.getcwd())
            cover_path = opf.raster_cover or opf.cover
        # Set the cover.
        if cover_path:
            cdata = None
            with open(os.path.join(os.getcwd(), cover_path), 'rb') as cf:
                cdata = cf.read()
            cover_name = os.path.basename(cover_path)
            id, href = oeb.manifest.generate('cover', cover_name)
            oeb.manifest.add(id,
                             href,
                             mimetypes.guess_type(cover_name)[0],
                             data=cdata)
            oeb.guide.add('cover', 'Cover', href)

        return oeb
Example #18
    def convert(self, stream, options, file_ext, log, accelerators):
        from ebook_converter.ebooks.metadata.fb2 import ensure_namespace
        from ebook_converter.ebooks.metadata.fb2 import get_fb2_data
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.metadata.meta import get_metadata
        from ebook_converter.ebooks.chardet import xml_to_unicode
        self.log = log
        log.debug('Parsing XML...')
        raw = get_fb2_data(stream)[0]
        raw = raw.replace(b'\0', b'')
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             assume_utf8=True,
                             resolve_entities=True)[0]
        try:
            doc = etree.fromstring(raw)
        except etree.XMLSyntaxError:
            # escape stray ampersands, preserving the space after them
            doc = etree.fromstring(raw.replace('& ', '&amp; '))
        if doc is None:
            raise ValueError('The FB2 file is not valid XML')
        doc = ensure_namespace(doc)
        try:
            fb_ns = doc.nsmap[doc.prefix]
        except Exception:
            fb_ns = FB2NS

        NAMESPACES = {'f': fb_ns, 'l': const.XLINK_NS}
        stylesheets = doc.xpath('//*[local-name() = "stylesheet" and '
                                '@type="text/css"]')
        css = ''
        for s in stylesheets:
            css += etree.tostring(
                s, encoding='unicode', method='text', with_tail=False) + '\n\n'
        if css:
            import css_parser
            import logging
            parser = css_parser.CSSParser(fetcher=None,
                                          log=logging.getLogger('calibre.css'))

            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % const.XHTML_NS
            text = XHTML_CSS_NAMESPACE + css
            log.debug('Parsing stylesheet...')
            stylesheet = parser.parseString(text)
            stylesheet.namespaces['h'] = const.XHTML_NS
            css = stylesheet.cssText
            if isinstance(css, bytes):
                css = css.decode('utf-8', 'replace')
            css = css.replace('h|style', 'h|span')
            css = re.sub(r'name\s*=\s*', 'class=', css)
        self.extract_embedded_content(doc)
        log.debug('Converting XML to HTML...')
        with open(
                pkg_resources.resource_filename('ebook_converter',
                                                'data/fb2.xsl')) as f:
            ss = f.read()
        ss = ss.replace("__FB_NS__", fb_ns)
        if options.no_inline_fb2_toc:
            log.info('Disabling generation of inline FB2 TOC')
            ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                            re.DOTALL).sub('', ss)

        styledoc = etree.fromstring(ss)

        transform = etree.XSLT(styledoc)
        result = transform(doc)

        # Handle links of type note and cite
        notes = {
            a.get('href')[1:]: a
            for a in result.xpath('//a[@link_note and @href]')
            if a.get('href').startswith('#')
        }
        cites = {
            a.get('link_cite'): a
            for a in result.xpath('//a[@link_cite]') if not a.get('href', '')
        }
        all_ids = {x for x in result.xpath('//*/@id')}
        for cite, a in cites.items():
            note = notes.get(cite, None)
            if note:
                c = 1
                while 'cite%d' % c in all_ids:
                    c += 1
                if not note.get('id', None):
                    note.set('id', 'cite%d' % c)
                    all_ids.add(note.get('id'))
                a.set('href', '#%s' % note.get('id'))
        for x in result.xpath('//*[@link_note or @link_cite]'):
            x.attrib.pop('link_note', None)
            x.attrib.pop('link_cite', None)

        for img in result.xpath('//img[@src]'):
            src = img.get('src')
            img.set('src', self.binary_map.get(src, src))
        index = transform.tostring(result)
        with open('index.xhtml', 'wb') as f:
            f.write(index.encode('utf-8'))
        with open('inline-styles.css', 'wb') as f:
            f.write(css.encode('utf-8'))
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']
        cpath = None
        if mi.cover_data and mi.cover_data[1]:
            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
                f.write(mi.cover_data[1])
            cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
        else:
            for img in doc.xpath('//f:coverpage/f:image',
                                 namespaces=NAMESPACES):
                href = img.get('{%s}href' % const.XLINK_NS,
                               img.get('href', None))
                if href is not None:
                    if href.startswith('#'):
                        href = href[1:]
                    cpath = os.path.abspath(href)
                    break

        opf = OPFCreator(os.getcwd(), mi)
        entries = [(f2, mimetypes.guess_type(f2)[0])
                   for f2 in os.listdir(u'.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])
        if cpath:
            opf.guide.set_cover(cpath)
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.join(os.getcwd(), 'metadata.opf')