예제 #1
0
파일: parsing.py 프로젝트: AEliu/calibre
    def test_polish_parser(self):
        """Test parsing with the HTML5 parser used for polishing.

        Runs every shared check in ``basic_checks`` against ``parse`` and then
        verifies namespace discarding, reported source line numbers, attribute
        ordering and the treatment of namespace declarations on ``<html>``.
        """
        for test in basic_checks:
            test(self, parse)

        # With namespaces discarded, un-prefixed XPath must find <b>, and the
        # <b> opened inside <svg> must not remain a child of <svg>
        root = parse('<html><p><svg><image /><b></svg>&nbsp;\n<b>xxx', discard_namespaces=True)
        self.assertTrue(root.xpath('//b'), 'Namespaces not discarded')
        self.assertFalse(root.xpath('//svg/b'), 'The <b> was not moved out of <svg>')

        # Each element must report the expected source line, with and without
        # namespace discarding
        expected_lines = {'html':2, 'head':3, 'body':3, 'p':3, 'svg':4, 'image':4, 'b':5}
        for ds in (False, True):
            src = '\n<html>\n<p>\n<svg><image />\n<b></svg>&nbsp'
            root = parse(src, discard_namespaces=ds)
            # items() instead of iteritems(): iteritems() only exists on
            # Python 2, items() works on both Python 2 and Python 3
            for tag, lnum in expected_lines.items():
                elem = root.xpath('//*[local-name()="%s"]' % tag)[0]
                self.assertEqual(lnum, elem.sourceline, 'Line number incorrect for %s, source: %s:' % (tag, src))

        # The attribute values are numbered in source order, so iterating the
        # parsed attributes must yield 1, 2, 3, ...
        for ds in (False, True):
            src = '\n<html>\n<p b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8><svg b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8>\n'
            root = parse(src, discard_namespaces=ds)
            for tag in ('p', 'svg'):
                for i, (k, v) in enumerate(root.xpath('//*[local-name()="%s"]' % tag)[0].items()):
                    self.assertEqual(i+1, int(v))

        root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:xml="http://www.w3.org/XML/1998/namespace"><body/></html>')
        self.assertNotIn('xmlnsU0003Axml', root.attrib, 'xml namespace declaration not removed')

        root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:extra="extra"><body/></html>')
        self.assertIn('extra', root.nsmap, 'Extra namespace declaration on <html> tag not preserved')
예제 #2
0
파일: parsing.py 프로젝트: wh0197m/calibre
    def test_polish_parser(self):
        """Test parsing with the HTML5 parser used for polishing.

        Applies all of ``basic_checks`` to ``parse`` and then checks namespace
        discarding, source line numbers, attribute order and namespace
        declarations on the ``<html>`` tag.
        """
        for test in basic_checks:
            test(self, parse)

        # A <b> started inside <svg> must be found by a plain //b query and
        # must not stay nested under <svg>
        root = parse('<html><p><svg><image /><b></svg>&nbsp;\n<b>xxx', discard_namespaces=True)
        self.assertTrue(root.xpath('//b'), 'Namespaces not discarded')
        self.assertFalse(root.xpath('//svg/b'), 'The <b> was not moved out of <svg>')

        # Source line numbers have to be right in both namespace modes
        for ds in (False, True):
            src = '\n<html>\n<p>\n<svg><image />\n<b></svg>&nbsp'
            root = parse(src, discard_namespaces=ds)
            line_for_tag = {'html':2, 'head':3, 'body':3, 'p':3, 'svg':4, 'image':4, 'b':5}
            # dict.iteritems() was removed in Python 3; items() is available
            # on every Python version
            for tag, lnum in line_for_tag.items():
                elem = root.xpath('//*[local-name()="%s"]' % tag)[0]
                self.assertEqual(lnum, elem.sourceline, 'Line number incorrect for %s, source: %s:' % (tag, src))

        # Attributes carry their 1-based source position as value, so the
        # parsed order must match the written order
        for ds in (False, True):
            src = '\n<html>\n<p b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8><svg b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8>\n'
            root = parse(src, discard_namespaces=ds)
            for tag in ('p', 'svg'):
                for i, (k, v) in enumerate(root.xpath('//*[local-name()="%s"]' % tag)[0].items()):
                    self.assertEqual(i+1, int(v))

        root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:xml="http://www.w3.org/XML/1998/namespace"><body/></html>')
        self.assertNotIn('xmlnsU0003Axml', root.attrib, 'xml namespace declaration not removed')

        root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:extra="extra"><body/></html>')
        self.assertIn('extra', root.nsmap, 'Extra namespace declaration on <html> tag not preserved')
예제 #3
0
파일: book.py 프로젝트: won2930015/calibre
    def search(self, text, index, backwards=False):
        """Find the nearest spine item, relative to ``index``, that contains ``text``.

        Spine items strictly after ``index`` (or strictly before it, visited in
        reverse order, when ``backwards`` is true) are parsed one at a time.
        The lower-cased text of their ``<body>`` elements, excluding
        ``<script>``, ``<style>`` and ``<del>`` content, is searched for the
        lower-cased ``text``.

        :param text: The string to look for; compared via ``lower()``.
        :param index: Spine position to start from; this item is not searched.
        :param backwards: If true, search towards the start of the spine.
        :return: The spine index of the first matching item, or ``None`` if no
            item matches.
        """
        from calibre.ebooks.oeb.polish.parsing import parse
        pmap = list(enumerate(self.spine))
        if backwards:
            pmap.reverse()
        q = text.lower()
        for i, path in pmap:
            if (backwards and i < index) or (not backwards and i > index):
                with open(path, 'rb') as f:
                    raw = f.read().decode(path.encoding)
                root = parse(raw)
                fragments = []

                def serialize(elem):
                    # Collect text in document order: the element's own text,
                    # then its children, then the tail that follows its end tag
                    if elem.text:
                        fragments.append(elem.text.lower())
                    for child in elem.iterchildren():
                        # Comments and processing instructions have a
                        # non-string tag, so they fail the rpartition check
                        if hasattr(
                                getattr(child, 'tag', None),
                                'rpartition') and child.tag.rpartition(
                                    '}')[-1] not in {'script', 'style', 'del'}:
                            serialize(child)
                        elif getattr(child, 'tail', None):
                            # Skipped node: its content is ignored but the
                            # text following it still belongs to the parent
                            fragments.append(child.tail.lower())
                    # The tail has to come after the children; emitting it
                    # before them scrambled the text and made phrases that
                    # span inline elements impossible to match
                    if elem.tail:
                        fragments.append(elem.tail.lower())

                for body in root.xpath('//*[local-name() = "body"]'):
                    # Text after </body> is not part of the body content
                    body.tail = None
                    serialize(body)

                if q in ''.join(fragments):
                    return i
예제 #4
0
파일: view.py 프로젝트: AtulKumar2/calibre
def beautify_text(raw, syntax):
    """Return a pretty-printed version of ``raw`` for the given ``syntax``.

    ``'css'`` is parsed with cssutils and re-serialized as ``text/css``,
    ``'xml'`` is parsed with lxml and run through ``pretty_xml_tree``, and any
    other value is parsed as HTML and run through ``pretty_html_tree``.
    """
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations

    if syntax == 'css':
        # Stylesheets are the only syntax that does not go through lxml
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_cssutils_serialization
        from cssutils import CSSParser, log
        setup_cssutils_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False

        def no_fetch(x):
            # We dont care about @import rules
            return (None, None)

        css_parser = CSSParser(loglevel=logging.WARNING, fetcher=no_fetch, log=_css_logger)
        sheet = css_parser.parseString(raw, href='<string>', validate=False)
        return serialize(sheet, 'text/css')

    if syntax == 'xml':
        tree = etree.fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(tree)
    else:
        tree = parse(raw, line_numbers=False)
        pretty_html_tree(None, tree)
    return etree.tostring(tree, encoding=unicode)
예제 #5
0
def beautify_text(raw, syntax):
    """Pretty-print ``raw`` according to ``syntax`` and return the result.

    CSS (``syntax == 'css'``) is parsed with css_parser and serialized as
    ``text/css``. XML (``syntax == 'xml'``) is parsed with lxml and passed to
    ``pretty_xml_tree``. Everything else is treated as HTML and passed to
    ``pretty_html_tree``.
    """
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations

    if syntax == 'css':
        # The CSS branch returns early; it never produces an lxml tree
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_css_parser_serialization
        from css_parser import CSSParser, log
        setup_css_parser_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False
        parser_options = {
            'loglevel': logging.WARNING,
            # We dont care about @import rules
            'fetcher': lambda x: (None, None),
            'log': _css_logger,
        }
        stylesheet = CSSParser(**parser_options).parseString(raw, href='<string>', validate=False)
        return serialize(stylesheet, 'text/css')

    if syntax == 'xml':
        document = etree.fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(document)
    else:
        document = parse(raw, line_numbers=False)
        pretty_html_tree(None, document)
    return etree.tostring(document, encoding=unicode)
예제 #6
0
파일: tests.py 프로젝트: j-howell/calibre
    def test_cfi_decode(self):
        from calibre.ebooks.oeb.polish.parsing import parse
        root = parse('''
<html>
<head></head>
<body id="body01">
        <p>…</p>
        <p>…</p>
        <p>…</p>
        <p>…</p>
        <p id="para05">xxx<em>yyy</em>0123456789</p>
        <p>…</p>
        <p>…</p>
        <img id="svgimg" src="foo.svg" alt="…"/>
        <p>…</p>
        <p><span>hello</span><span>goodbye</span>text here<em>adieu</em>text there</p>
    </body>
</html>
''', line_numbers=True, linenumber_attribute='data-lnum')
        body = root[-1]

        def test(cfi, expected):
            self.assertIs(decode_cfi(root, cfi), expected)

        for cfi in '/4 /4[body01] /900[body01] /2[body01]'.split():
            test(cfi, body)

        for i in range(len(body)):
            test('/4/{}'.format((i + 1)*2), body[i])

        p = body[4]
        test('/4/999[para05]', p)
        test('/4/999[para05]/2', p[0])
예제 #7
0
def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.xhtml', toc_name='toc.ncx'):
    ''' Create an empty book in the specified format at the specified location.

    :param mi: Metadata for the new book; converted to an OPF tree with
        ``metadata_to_opf`` and used for the title, authors and language.
    :param path: Where to write the book; made absolute before use.
    :param fmt: ``'azw3'`` builds the book via ``opf_to_azw3``; any other
        value writes an EPUB zip archive.
    :param opf_name: Name of the OPF file inside the book.
    :param html_name: Name of the single (X)HTML file inside the book.
    :param toc_name: Name of the NCX table of contents inside the book.
    '''
    path = os.path.abspath(path)
    # Use the first non-empty <language> element of the OPF, falling back to
    # 'und' when there is none
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    for lang_elem in opf.xpath('//*[local-name()="language"]'):
        if lang_elem.text:
            lang = lang_elem.text
            break
    lang = lang_as_iso639_1(lang) or lang

    # Manifest with the HTML file and the NCX, plus a spine that references
    # the HTML file and names the NCX as its toc
    opfns = OPF_NAMESPACES['opf']
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)
    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
    '''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')
    # Fill the placeholders of the bundled template, then normalise the
    # markup by parsing, prettifying and re-serializing it
    HTML = P('templates/new_book.html', data=True).decode('utf-8').replace(
        '_LANGUAGE_', prepare_string_for_xml(lang, True)
    ).replace(
        '_TITLE_', prepare_string_for_xml(mi.title)
    ).replace(
        '_AUTHORS_', prepare_string_for_xml(authors_to_string(mi.authors))
    ).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang), encoding='utf-8', xml_declaration=True, pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True, pretty_print=True)
    if fmt == 'azw3':
        # Write the three files into a scratch directory and convert from there
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name, DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
            # 0o755 is the same value as the old ``0755`` literal, but it is
            # valid syntax on Python 2.6+ and Python 3 alike
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
예제 #8
0
파일: book.py 프로젝트: AEliu/calibre
    def search(self, text, index, backwards=False):
        """Return the index of the nearest spine item containing ``text``.

        Only spine items strictly after ``index`` are considered, or strictly
        before it (in reverse order) when ``backwards`` is true. Each
        candidate is parsed and the lower-cased text of its ``<body>``
        elements, without ``<script>``, ``<style>`` and ``<del>`` content, is
        searched for the lower-cased ``text``.

        :param text: The string to look for; compared via ``lower()``.
        :param index: Spine position to start from; this item is not searched.
        :param backwards: If true, search towards the start of the spine.
        :return: The matching spine index, or ``None`` when nothing matches.
        """
        from calibre.ebooks.oeb.polish.parsing import parse
        pmap = list(enumerate(self.spine))
        if backwards:
            pmap.reverse()
        q = text.lower()
        for i, path in pmap:
            if (backwards and i < index) or (not backwards and i > index):
                with open(path, 'rb') as f:
                    raw = f.read().decode(path.encoding)
                root = parse(raw)
                fragments = []

                def serialize(elem):
                    # Document order is: own text, children, then own tail
                    if elem.text:
                        fragments.append(elem.text.lower())
                    for child in elem.iterchildren():
                        # Nodes with a non-string tag (comments, processing
                        # instructions) fail the rpartition check
                        if hasattr(getattr(child, 'tag', None), 'rpartition') and child.tag.rpartition('}')[-1] not in {'script', 'style', 'del'}:
                            serialize(child)
                        elif getattr(child, 'tail', None):
                            # The skipped node's tail is still visible text
                            fragments.append(child.tail.lower())
                    # Appending the tail before the children put it ahead of
                    # text that precedes it in the document, so phrases
                    # crossing inline elements could never be found
                    if elem.tail:
                        fragments.append(elem.tail.lower())

                for body in root.xpath('//*[local-name() = "body"]'):
                    # Whatever follows </body> is not body content
                    body.tail = None
                    serialize(body)

                if q in ''.join(fragments):
                    return i
예제 #9
0
    def test_cfi_decode(self):
        from calibre.ebooks.oeb.polish.parsing import parse
        root = parse('''
<html>
<head></head>
<body id="body01">
        <p>…</p>
        <p>…</p>
        <p>…</p>
        <p>…</p>
        <p id="para05">xxx<em>yyy</em>0123456789</p>
        <p>…</p>
        <p>…</p>
        <img id="svgimg" src="foo.svg" alt="…"/>
        <p>…</p>
        <p><span>hello</span><span>goodbye</span>text here<em>adieu</em>text there</p>
    </body>
</html>
''',
                     line_numbers=True,
                     linenumber_attribute='data-lnum')
        body = root[-1]

        def test(cfi, expected):
            self.assertIs(decode_cfi(root, cfi), expected)

        for cfi in '/4 /4[body01] /900[body01] /2[body01]'.split():
            test(cfi, body)

        for i in range(len(body)):
            test('/4/{}'.format((i + 1) * 2), body[i])

        p = body[4]
        test('/4/999[para05]', p)
        test('/4/999[para05]/2', p[0])
예제 #10
0
def parse_html(raw):
    """Parse ``raw`` with line numbers recorded and return it as HTML bytes.

    The source line of each element is stored in its ``data-lnum`` attribute.
    The serialized result is UTF-8 encoded unless ``serialize`` already
    returned bytes.
    """
    def utf8_decoder(x):
        return x.decode('utf-8')

    tree = parse(raw, decoder=utf8_decoder, line_numbers=True, linenumber_attribute='data-lnum')
    markup = serialize(tree, 'text/html')
    return markup if isinstance(markup, bytes) else markup.encode('utf-8')
예제 #11
0
    def convert_epub3_nav(self, nav_path, opf, log):
        """Build an NCX from the EPUB 3 navigation document at ``nav_path``.

        The first ``<nav>`` whose epub ``type`` attribute is ``toc`` and that
        has an ``<ol>`` child is converted into nested ``navPoint`` elements.
        The NCX is written to a new ``.ncx`` file next to the nav document,
        added to the manifest of ``opf`` and set as the ``toc`` of every
        spine element. Nothing is written when no such ``<nav>`` exists.
        """
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX
        from calibre.ebooks.oeb.polish.toc import first_child
        from tempfile import NamedTemporaryFile
        with lopen(nav_path, 'rb') as nav_file:
            raw = nav_file.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
        navmap = ncx[0]
        type_attr = '{%s}type' % EPUB_NS
        nav_basename = os.path.basename(nav_path)

        def add_from_li(li, parent):
            href = text = None
            # Only the first <a> or <span> child of the <li> is looked at
            for x in li.iterchildren(XHTML('a'), XHTML('span')):
                text = etree.tostring(x, method='text', encoding=unicode, with_tail=False).strip()
                if not text:
                    # No visible text: fall back to the title attributes
                    text = ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
                href = x.get('href')
                if href and href.startswith('#'):
                    # Fragment-only links point into the nav document itself
                    href = nav_basename + href
                break
            point = parent.makeelement(NCX('navPoint'))
            parent.append(point)
            label = point.makeelement(NCX('navLabel'))
            point.append(label)
            label_text = point.makeelement(NCX('text'))
            label.append(label_text)
            label_text.text = text
            if href:
                point.append(point.makeelement(NCX('content'), attrib={'src':href}))
            return point

        def process_nav_node(node, toc_parent):
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                ol = first_child(li, XHTML('ol'))
                if child is not None and ol is not None:
                    process_nav_node(ol, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(type_attr) != 'toc':
                continue
            ol = first_child(nav, XHTML('ol'))
            if ol is None:
                continue
            process_nav_node(ol, navmap)
            break
        else:
            # No usable toc <nav> was found
            return

        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as out:
            out.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_id = opf.add_path_to_manifest(out.name, NCX_MIME)
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
예제 #12
0
    def convert_epub3_nav(self, nav_path, opf, log):
        """Convert an EPUB 3 nav document into an NCX and register it in ``opf``.

        The ``<ol>`` of the first ``<nav>`` with epub ``type`` equal to
        ``toc`` (and having such an ``<ol>``) is walked recursively to create
        ``navPoint`` entries. The resulting NCX is saved as a new ``.ncx``
        file in the directory of ``nav_path``, added to the manifest and
        referenced from each spine element. Returns early, without writing
        anything, if there is no such ``<nav>``.
        """
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX
        from calibre.ebooks.oeb.polish.toc import first_child
        from tempfile import NamedTemporaryFile
        with lopen(nav_path, 'rb') as stream:
            raw = stream.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
        navmap = ncx[0]
        epub_type = '{%s}type' % EPUB_NS
        base_name = os.path.basename(nav_path)

        def add_from_li(li, parent):
            href = text = None
            # The loop body runs at most once: for the first <a>/<span> child
            for x in li.iterchildren(XHTML('a'), XHTML('span')):
                visible = etree.tostring(x, method='text', encoding=unicode, with_tail=False).strip()
                # Use the title attributes when there is no visible text
                text = visible if visible else ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
                href = x.get('href')
                if href and href.startswith('#'):
                    # A bare fragment refers to the nav document itself
                    href = base_name + href
                break
            nav_point = parent.makeelement(NCX('navPoint'))
            parent.append(nav_point)
            nav_label = nav_point.makeelement(NCX('navLabel'))
            nav_point.append(nav_label)
            text_elem = nav_point.makeelement(NCX('text'))
            nav_label.append(text_elem)
            text_elem.text = text
            if href:
                content = nav_point.makeelement(NCX('content'), attrib={'src':href})
                nav_point.append(content)
            return nav_point

        def process_nav_node(node, toc_parent):
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                ol = first_child(li, XHTML('ol'))
                if child is None or ol is None:
                    continue
                process_nav_node(ol, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(epub_type) == 'toc':
                ol = first_child(nav, XHTML('ol'))
                if ol is None:
                    continue
                process_nav_node(ol, navmap)
                break
        else:
            # Nothing to convert
            return

        ncx_bytes_kwargs = {'encoding': 'utf-8'}
        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as ncx_file:
            ncx_file.write(etree.tostring(ncx, **ncx_bytes_kwargs))
        ncx_id = opf.add_path_to_manifest(ncx_file.name, NCX_MIME)
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
예제 #13
0
파일: spell.py 프로젝트: prajoria/calibre
def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
    """Read the words in ``text`` as HTML and merge them into ``words``.

    ``text`` has its entities replaced and is parsed as the content of a
    ``<div>``. The counter stored under ``words[None]`` is increased by the
    count found here. Every location found is pointed at ``node`` with
    ``(False, attr)`` as its item before being added to ``words``.
    """
    markup = '<html><body><div>%s</div></body></html>' % replace_entities(text)
    root = parse(markup, decoder=lambda x:x.decode('utf-8'))
    found = defaultdict(list)
    # The None key holds a running count rather than a list of locations
    found[None] = 0
    read_words_from_html(root, found, file_name, locale)
    words[None] += found.pop(None)
    for key, locations in iteritems(found):
        for location in locations:
            location.location_node = node
            location.node_item = (False, attr)
        words[key].extend(locations)
예제 #14
0
    def test_polish_parser(self):
        """Test parsing with the HTML5 parser used for polishing.

        Runs all of ``basic_checks`` against ``parse`` and then verifies
        namespace discarding, source line numbers and attribute ordering.
        """
        for test in basic_checks:
            test(self, parse)

        # A plain //b query must succeed, and the <b> opened inside <svg>
        # must not be left as a child of <svg>
        root = parse('<html><p><svg><image /><b></svg>&nbsp;\n<b>xxx', discard_namespaces=True)
        self.assertTrue(root.xpath('//b'), 'Namespaces not discarded')
        self.assertFalse(root.xpath('//svg/b'), 'The <b> was not moved out of <svg>')

        # Line numbers must be correct in both namespace modes
        for ds in (False, True):
            src = '\n<html>\n<p>\n<svg><image />\n<b></svg>&nbsp'
            root = parse(src, discard_namespaces=ds)
            # items() works on Python 2 and 3, whereas iteritems() raises
            # AttributeError on Python 3
            for tag, lnum in {'html':2, 'head':3, 'body':3, 'p':3, 'svg':4, 'image':4, 'b':5}.items():
                elem = root.xpath('//*[local-name()="%s"]' % tag)[0]
                self.assertEqual(lnum, elem.sourceline, 'Line number incorrect for %s, source: %s:' % (tag, src))

        # Attribute values equal their 1-based position in the source, so
        # the parsed attribute order must match the written order
        for ds in (False, True):
            src = '\n<html>\n<p b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8><svg b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8>\n'
            root = parse(src, discard_namespaces=ds)
            for tag in ('p', 'svg'):
                for i, (k, v) in enumerate(root.xpath('//*[local-name()="%s"]' % tag)[0].items()):
                    self.assertEqual(i+1, int(v))
예제 #15
0
def find_first_matching_rule(container,
                             html_file_name,
                             raw_html,
                             class_data,
                             lnum_attr='data-lnum'):
    """Find the first CSS rule that matches the element described by ``class_data``.

    ``class_data['sourceline_address']`` is a ``(line number, tag names)``
    pair and ``class_data['class']`` is the class name. The element is located
    through the line number attribute ``lnum_attr`` written during parsing.
    ``<style>`` elements and stylesheet ``<link>`` elements are then checked
    in document order.

    Raises ``NoMatchingTagFound`` if the tags on that line do not start with
    the given tag names, and ``NoMatchingRuleFound`` if no sheet yields a
    match.
    """
    lnum, tags = class_data['sourceline_address']
    class_name = class_data['class']
    root = parse(raw_html,
                 decoder=lambda x: x.decode('utf-8'),
                 line_numbers=True,
                 linenumber_attribute=lnum_attr)
    candidates = root.xpath(f'//*[@{lnum_attr}={lnum}]')
    depth = len(tags)
    names_on_line = [barename(elem.tag) for elem in candidates]
    if names_on_line[:depth] != tags:
        raise NoMatchingTagFound(
            f'No tag matching the specification was found in {html_file_name}')
    # The last tag of the address is the element the rule has to match
    target_elem = candidates[depth - 1]
    select = Select(root, ignore_inappropriate_pseudo_classes=True)

    for elem in root.iter('*'):
        name = barename(elem.tag)
        if name == 'style':
            if not elem.text or elem.get('type', 'text/css') != 'text/css':
                continue
            try:
                sheet = container.parse_css(elem.text)
            except Exception:
                # Unparseable inline sheet: move on to the next candidate
                continue
            match = find_first_rule_that_matches_elem(
                container, target_elem, select, class_name, sheet.cssRules,
                html_file_name)
            if match is not None:
                style_address = (int(elem.get(lnum_attr)), ['style'])
                return match._replace(style_tag_address=style_address)
        elif name == 'link':
            href = elem.get('href')
            if not href or elem.get('rel') != 'stylesheet':
                continue
            sname = container.href_to_name(href, html_file_name)
            try:
                sheet = container.parsed(sname)
            except Exception:
                continue
            if not hasattr(sheet, 'cssRules'):
                # Whatever was parsed is not a stylesheet object
                continue
            match = find_first_rule_that_matches_elem(
                container, target_elem, select, class_name, sheet.cssRules,
                sname)
            if match is not None:
                return match

    raise NoMatchingRuleFound(
        f'No CSS rules that apply to the specified tag in {html_file_name} with the class {class_name} found'
    )
예제 #16
0
파일: basic.py 프로젝트: zyhong/calibre
def complete_anchor(name, data_conn):
    """Return anchor completion data for the file ``name``.

    The file data is fetched over ``data_conn`` and cached in ``file_cache``
    on first use. Text data that parses successfully is cached as
    ``(root, anchor map)``. When the cached entry holds an anchor map, the
    result is ``(sorted anchor names, anchor map, {})``; otherwise ``None``.
    """
    if name not in file_cache:
        raw = get_data(data_conn, 'file_data', name)
        entry = raw
        if isinstance(raw, unicode_type):
            try:
                root = parse(raw, decoder=lambda x:x.decode('utf-8'))
            except Exception:
                # Unparseable markup: cache the raw data as it is
                pass
            else:
                entry = (root, create_anchor_map(root))
        file_cache[name] = entry

    cached = file_cache[name]
    if not isinstance(cached, tuple) or len(cached) <= 1:
        return None
    anchor_map = cached[1]
    if not isinstance(anchor_map, dict):
        return None
    ordered_names = tuple(sorted(frozenset(anchor_map), key=numeric_sort_key))
    return ordered_names, anchor_map, {}
예제 #17
0
파일: basic.py 프로젝트: AEliu/calibre
def complete_anchor(name, data_conn):
    """Return anchor completion data for the file ``name``.

    On first use the file data is fetched over ``data_conn`` and stored in
    ``file_cache``; text data that parses successfully is stored as
    ``(root, anchor map)``. When the cached entry holds an anchor map, the
    result is ``(frozenset of anchor names, anchor map, {})``; otherwise
    ``None``.
    """
    if name not in file_cache:
        raw = get_data(data_conn, 'file_data', name)
        entry = raw
        if isinstance(raw, type('')):
            try:
                root = parse(raw, decoder=lambda x:x.decode('utf-8'))
            except Exception:
                # Parsing failed: keep the raw data in the cache
                pass
            else:
                entry = (root, create_anchor_map(root))
        file_cache[name] = entry

    cached = file_cache[name]
    has_anchor_map = (
        isinstance(cached, tuple)
        and len(cached) > 1
        and isinstance(cached[1], dict)
    )
    if not has_anchor_map:
        return None
    anchor_map = cached[1]
    return frozenset(anchor_map), anchor_map, {}
예제 #18
0
def render_jacket(mi,
                  output_profile,
                  alt_title=_('Unknown'),
                  alt_tags=[],
                  alt_comments='',
                  alt_publisher='',
                  rescale_fonts=False,
                  alt_authors=None):
    """Render the book jacket for ``mi`` and return it as a parsed HTML tree.

    The jacket template and stylesheet are loaded from the bundled resources,
    filled with escaped metadata from ``mi`` and parsed with the polish
    parser. The tree is then passed through ``postprocess_jacket`` and
    ``pretty_html_tree`` before being returned.

    :param mi: Metadata object that supplies title, authors, series,
        publisher, pubdate, rating, tags, comments, identifiers and custom
        fields.
    :param output_profile: Supplies the rating characters; also passed to
        ``Tags`` and ``postprocess_jacket``.
    :param alt_title: Title used when ``mi`` has no title.
    :param alt_tags: Tags used when ``mi.tags`` is empty.
        NOTE(review): mutable default; this function only passes it on to
        ``Tags`` and never mutates it itself.
    :param alt_comments: Comments used when ``mi.comments`` is empty.
    :param alt_publisher: Publisher used when ``mi`` has no publisher.
    :param rescale_fonts: If true, the children of each ``<body>`` are wrapped
        in a ``<div data-calibre-rescale="100">``.
    :param alt_authors: Authors used when ``mi`` has no authors; defaults to
        a single translated 'Unknown' entry.
    :return: The root element of the rendered jacket.
    """
    css = P('jacket/stylesheet.css', data=True).decode('utf-8')
    template = P('jacket/template.xhtml', data=True).decode('utf-8')

    # Drop HTML comments from the template and /* ... */ comments from the CSS
    template = re.sub(r'<!--.*?-->', '', template, flags=re.DOTALL)
    css = re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL)

    try:
        title_str = alt_title if mi.is_null('title') else mi.title
    except:
        title_str = _('Unknown')
    title_str = escape(title_str)
    title = '<span class="title">%s</span>' % title_str

    series = Series(mi.series, mi.series_index)
    try:
        publisher = mi.publisher if not mi.is_null(
            'publisher') else alt_publisher
    except:
        publisher = ''
    publisher = escape(publisher)

    # Only the year of the publication date is shown; an undefined date or
    # any failure while formatting it yields an empty string
    try:
        if is_date_undefined(mi.pubdate):
            pubdate = ''
        else:
            dt = as_local_time(mi.pubdate)
            pubdate = strftime('%Y', dt.timetuple())
    except:
        pubdate = ''

    rating = get_rating(mi.rating, output_profile.ratings_char,
                        output_profile.empty_ratings_char)

    tags = Tags((mi.tags if mi.tags else alt_tags), output_profile)

    comments = mi.comments if mi.comments else alt_comments
    comments = comments.strip()
    if comments:
        comments = comments_to_html(comments)

    # Temporarily substitute the fallback authors so that format_authors()
    # has something to format, then restore the original value
    orig = mi.authors
    if mi.is_null('authors'):
        mi.authors = list(alt_authors or (_('Unknown'), ))
    try:
        author = mi.format_authors()
    except:
        author = ''
    mi.authors = orig
    author = escape(author)
    # Filled in by generate_html() and later handed to postprocess_jacket()
    has_data = {}

    def generate_html(comments):
        # Build the template arguments, format the template with them and
        # return the result with encoding declarations stripped
        display = Attributes()
        args = dict(
            xmlns=XHTML_NS,
            title_str=title_str,
            identifiers=Identifiers(mi.identifiers),
            css=css,
            title=title,
            author=author,
            publisher=publisher,
            pubdate_label=_('Published'),
            pubdate=pubdate,
            series_label=ngettext('Series', 'Series', 1),
            series=series,
            rating_label=_('Rating'),
            rating=rating,
            tags_label=_('Tags'),
            tags=tags,
            comments=comments,
            footer='',
            display=display,
            searchable_tags=' '.join(
                escape(t) + 'ttt' for t in tags.tags_list),
        )
        # Custom fields are exposed under their key with '#' replaced by '_',
        # together with a '<key>_label' entry and a display attribute
        for key in mi.custom_field_keys():
            m = mi.get_user_metadata(key, False) or {}
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                dkey = key.replace('#', '_')
                dt = m.get('datatype')
                if dt == 'series':
                    args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
                elif dt == 'rating':
                    args[dkey] = rating_to_stars(
                        mi.get(key),
                        m.get('display', {}).get('allow_half_stars', False))
                elif dt == 'comments':
                    val = val or ''
                    # How the comment text is turned into markup depends on
                    # the field's 'interpret_as' display setting
                    ctype = m.get('display', {}).get('interpret_as') or 'html'
                    if ctype == 'long-text':
                        val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(
                            val)
                    elif ctype == 'short-text':
                        val = '<span>%s</span>' % escape(val)
                    elif ctype == 'markdown':
                        val = markdown(val)
                    else:
                        val = comments_to_html(val)
                    args[dkey] = val
                else:
                    args[dkey] = escape(val)
                args[dkey + '_label'] = escape(display_name)
                setattr(display, dkey,
                        'none' if mi.is_null(key) else 'initial')
            except Exception:
                # if the val (custom column contents) is None, don't add to args
                pass

        # Debugging aid, permanently disabled by the constant condition
        if False:
            print("Custom column values available in jacket template:")
            for key in args.keys():
                if key.startswith('_') and not key.endswith('_label'):
                    print(" {}: {}".format('#' + key[1:], args[key]))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')
        has_data['series'] = bool(series)
        has_data['tags'] = bool(tags)
        has_data['rating'] = bool(rating)
        has_data['pubdate'] = bool(pubdate)
        # Sections without data get the display value 'none'
        for k, v in has_data.items():
            setattr(display, k, 'initial' if v else 'none')
        display.title = 'initial'
        if mi.identifiers:
            display.identifiers = 'initial'

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)

        return strip_encoding_declarations(generated_html)

    from calibre.ebooks.oeb.polish.parsing import parse
    raw = generate_html(comments)
    root = parse(raw, line_numbers=False, force_html5_parse=True)

    if rescale_fonts:
        # We ensure that the conversion pipeline will set the font sizes for
        # text in the jacket to the same size as the font sizes for the rest of
        # the text in the book. That means that as long as the jacket uses
        # relative font sizes (em or %), the post conversion font size will be
        # the same as for text in the main book. So text with size x em will
        # be rescaled to the same value in both the jacket and the main content.
        #
        # We cannot use data-calibre-rescale 100 on the body tag as that will just
        # give the body tag a font size of 1em, which is useless.
        for body in root.xpath('//*[local-name()="body"]'):
            fw = body.makeelement(XHTML('div'))
            fw.set('data-calibre-rescale', '100')
            # Move every child of <body> into the wrapper, then attach the
            # wrapper as the body's only child
            for child in body:
                fw.append(child)
            body.append(fw)
    postprocess_jacket(root, output_profile, has_data)
    from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
    pretty_html_tree(None, root)
    return root
예제 #19
0
 def test_lxml_tostring(self):
     """Test for a bug in some versions of lxml that causes incorrect serialization of sub-trees."""
     from html5_parser import parse
     paragraphs = parse('<p>a<p>b<p>c').xpath('//p')
     # Serializing the first <p> must not include its following siblings
     serialized = etree.tostring(paragraphs[0], encoding=str)
     self.assertEqual(serialized, '<p>a</p>')
예제 #20
0
파일: preview.py 프로젝트: palerdot/calibre
def parse_html(raw):
    """Parse ``raw`` and return the resulting tree serialized as ``text/html`` bytes.

    ``parse()`` is called with a UTF-8 decoder, with line numbers enabled and
    with ``data-lnum`` as the line number attribute.
    """
    root = parse(raw, decoder=lambda x: x.decode("utf-8"), line_numbers=True, linenumber_attribute="data-lnum")
    ans = serialize(root, "text/html")
    # Guard against serialize() handing back already-encoded bytes: only text
    # needs encoding, and bytes objects have no usable .encode() on Python 3.
    if not isinstance(ans, bytes):
        ans = ans.encode("utf-8")
    return ans
예제 #21
0
    def convert_epub3_nav(self, nav_path, opf, log, opts):
        ''' Build an NCX file from the EPUB 3 navigation document at nav_path.

        The nav document is parsed and the first ``nav`` element whose
        epub ``type`` attribute is ``toc`` and which has an ``ol`` child is
        converted into NCX ``navPoint`` elements. If there is no such element
        nothing is written and the method returns early. Otherwise the NCX is
        saved to a temporary file next to nav_path, added to the OPF manifest,
        set as the ``toc`` of every spine element, and the nav href and parsed
        tree are stored on opts. When ``self.removed_cover`` is set, links in
        the nav document that resolve to it are marked and the nav document is
        written back to nav_path.
        '''
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
        from calibre.ebooks.oeb.polish.toc import first_child
        from tempfile import NamedTemporaryFile
        with lopen(nav_path, 'rb') as nav_file:
            raw = nav_file.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = etree.fromstring(
            '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>'
        )
        navmap = ncx[0]
        et = '{%s}type' % EPUB_NS
        bn = os.path.basename(nav_path)

        def add_from_li(li, parent):
            # Only the first <a>/<span> child of the list item is considered
            label = target = None
            source = next(li.iterchildren(XHTML('a'), XHTML('span')), None)
            if source is not None:
                label = etree.tostring(
                    source, method='text', encoding=unicode, with_tail=False).strip()
                if not label:
                    # No text content, fall back to the title attributes
                    label = ' '.join(
                        source.xpath('descendant-or-self::*/@title')).strip()
                target = source.get('href')
                if target and target.startswith('#'):
                    # Fragment only link, prefix it with the nav file name
                    target = bn + target
            nav_point = parent.makeelement(NCX('navPoint'))
            parent.append(nav_point)
            nav_label = nav_point.makeelement(NCX('navLabel'))
            nav_point.append(nav_label)
            label_text = nav_point.makeelement(NCX('text'))
            nav_label.append(label_text)
            label_text.text = label
            if target:
                nav_point.append(
                    nav_point.makeelement(NCX('content'), attrib={'src': target}))
            return nav_point

        def process_nav_node(node, toc_parent):
            for item in node.iterchildren(XHTML('li')):
                entry = add_from_li(item, toc_parent)
                nested = first_child(item, XHTML('ol'))
                if entry is None or nested is None:
                    continue
                process_nav_node(nested, entry)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(et) != 'toc':
                continue
            top_list = first_child(nav, XHTML('ol'))
            if top_list is None:
                continue
            process_nav_node(top_list, navmap)
            break
        else:
            # No usable toc nav element was found
            return

        nav_dir = os.path.dirname(nav_path)
        with NamedTemporaryFile(suffix='.ncx', dir=nav_dir, delete=False) as ncx_file:
            ncx_file.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_href = os.path.relpath(ncx_file.name, os.getcwdu()).replace(os.sep, '/')
        manifest_item = opf.create_manifest_item(ncx_href, NCX_MIME, append=True)
        ncx_id = manifest_item.get('id')
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
        nav_href = os.path.relpath(nav_path).replace(os.sep, '/')
        opts.epub3_nav_href = urlnormalize(nav_href)
        opts.epub3_nav_parsed = root

        if not getattr(self, 'removed_cover', None):
            return
        marked = False
        for link in root.xpath('//*[@href]'):
            # Ignore the fragment part of the link
            href = link.get('href').partition('#')[0]
            joined = os.path.join(nav_dir, urlunquote(href))
            resolved = urlnormalize(os.path.relpath(joined, nav_dir))
            if resolved == self.removed_cover:
                marked = True
                link.set('data-calibre-removed-titlepage', '1')
        if marked:
            with open(nav_path, 'wb') as nav_out:
                nav_out.write(serialize(root, 'application/xhtml+xml'))
예제 #22
0
파일: preview.py 프로젝트: JimmXinu/calibre
def parse_html(raw):
    ''' Parse raw and return the resulting tree serialized as text/html bytes.

    parse() is called with a UTF-8 decoder, with line numbers enabled and with
    data-lnum as the line number attribute. A serialization result that is not
    already bytes is encoded as UTF-8.
    '''
    tree = parse(
        raw,
        decoder=lambda data: data.decode('utf-8'),
        line_numbers=True,
        linenumber_attribute='data-lnum',
    )
    markup = serialize(tree, 'text/html')
    return markup if isinstance(markup, bytes) else markup.encode('utf-8')
예제 #23
0
파일: preview.py 프로젝트: Kielek/calibre
def parse_html(raw):
    ''' Parse raw and return the resulting tree serialized as text/html bytes.

    parse() is called with a UTF-8 decoder, with line numbers enabled and with
    data-lnum as the line number attribute.
    '''
    root = parse(raw, decoder=lambda x:x.decode('utf-8'), line_numbers=True, linenumber_attribute='data-lnum')
    ans = serialize(root, 'text/html')
    # Only encode text: if serialize() already produced bytes, calling
    # .encode() on them would fail on Python 3.
    if not isinstance(ans, bytes):
        ans = ans.encode('utf-8')
    return ans
예제 #24
0
파일: preview.py 프로젝트: suman95/calibre
def parse_html(raw):
    ''' Parse raw and return the resulting tree serialized as text/html bytes.

    parse() is called with a UTF-8 decoder, with line numbers enabled and with
    data-lnum as the line number attribute.
    '''
    root = parse(raw,
                 decoder=lambda x: x.decode('utf-8'),
                 line_numbers=True,
                 linenumber_attribute='data-lnum')
    ans = serialize(root, 'text/html')
    # Only encode text: if serialize() already produced bytes, calling
    # .encode() on them would fail on Python 3.
    if not isinstance(ans, bytes):
        ans = ans.encode('utf-8')
    return ans
예제 #25
0
파일: spell.py 프로젝트: prajoria/calibre
def count_chars_in_escaped_html(text, counter, file_name, node, attr, locale):
    ''' Count the characters in text, which is treated as entity escaped HTML.

    Entities in text are replaced, the result is wrapped in a minimal HTML
    document and parsed, and the parsed tree is passed on to
    count_chars_in_html() together with counter, file_name and locale. The
    node and attr arguments are accepted but not used here.
    '''
    unescaped = replace_entities(text)
    document = '<html><body><div>%s</div></body></html>' % unescaped
    tree = parse(document, decoder=lambda data: data.decode('utf-8'))
    count_chars_in_html(tree, counter, file_name, locale)
예제 #26
0
파일: create.py 프로젝트: MarioJC/calibre
def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.xhtml', toc_name='toc.ncx'):
    ''' Create an empty book in the specified format at the specified location.

    :param mi: Metadata of the new book. Its ``title``, ``authors`` and
        ``is_null()`` are used here and it is also passed to
        ``metadata_to_opf()`` and ``create_toc()``.
    :param path: Location of the book file to create.
    :param fmt: Format of the book, must be one of ``valid_empty_formats``.
    :param opf_name: Name of the OPF file inside the book.
    :param html_name: Name of the single HTML file inside the book.
    :param toc_name: Name of the NCX file inside the book.
    :raises ValueError: If ``fmt`` is not in ``valid_empty_formats``.
    '''
    if fmt not in valid_empty_formats:
        raise ValueError('Cannot create empty book in the %s format' % fmt)
    if fmt == 'txt':
        with open(path, 'wb') as f:
            if not mi.is_null('title'):
                title = mi.title
                # The file is opened in binary mode, so text has to be
                # encoded explicitly instead of relying on an implicit
                # (ASCII only) conversion.
                if not isinstance(title, bytes):
                    title = title.encode('utf-8')
                f.write(title)
        return
    if fmt == 'docx':
        from calibre.ebooks.conversion.plumber import Plumber
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.utils.logging import default_log
        p = Plumber('a.docx', 'b.docx', default_log)
        p.setup_options()
        # Use the word default of one inch page margins
        for x in 'left right top bottom'.split():
            setattr(p.opts, 'margin_' + x, 72)
        DOCX(p.opts, default_log).write(path, mi, create_empty_document=True)
        return
    path = os.path.abspath(path)
    # Use the first non-empty language from the metadata, 'und' otherwise
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    for lang_elem in opf.xpath('//*[local-name()="language"]'):
        if lang_elem.text:
            lang = lang_elem.text
            break
    lang = lang_as_iso639_1(lang) or lang

    # Add a manifest (the HTML file and the NCX) and a spine to the OPF
    opfns = OPF_NAMESPACES['opf']
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)
    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
    '''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')
    # Fill in the placeholders of the new book template
    HTML = P('templates/new_book.html', data=True).decode('utf-8').replace(
        '_LANGUAGE_', prepare_string_for_xml(lang, True)
    ).replace(
        '_TITLE_', prepare_string_for_xml(mi.title)
    ).replace(
        '_AUTHORS_', prepare_string_for_xml(authors_to_string(mi.authors))
    ).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang), encoding='utf-8', xml_declaration=True, pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True, pretty_print=True)
    if fmt == 'azw3':
        # Write the files into a temporary directory and convert that to AZW3
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name, DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
            # 0o755 is the octal form accepted by both Python 2.6+ and
            # Python 3 (a bare leading zero is a syntax error on Python 3)
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
예제 #27
0
파일: pretty.py 프로젝트: alfaniel/calibre
def fix_html(raw):
    ''' Parse raw, serialize the resulting tree as text/html and return it decoded from UTF-8. '''
    tree = parse(raw)
    serialized = serialize(tree, 'text/html')
    return serialized.decode('utf-8')
예제 #28
0
def create_book(mi,
                path,
                fmt='epub',
                opf_name='metadata.opf',
                html_name='start.xhtml',
                toc_name='toc.ncx'):
    ''' Write a new, empty book described by the metadata mi to path.

    fmt selects the format and must be one of valid_empty_formats, otherwise
    a ValueError is raised. opf_name, html_name and toc_name are the names
    given to the OPF, HTML and NCX files inside the book.
    '''
    if fmt not in valid_empty_formats:
        raise ValueError('Cannot create empty book in the %s format' % fmt)

    if fmt == 'txt':
        # A text book is just the title, if there is one
        with open(path, 'wb') as out:
            if not mi.is_null('title'):
                out.write(as_bytes(mi.title))
        return

    if fmt == 'docx':
        from calibre.ebooks.conversion.plumber import Plumber
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.utils.logging import default_log
        plumber = Plumber('a.docx', 'b.docx', default_log)
        plumber.setup_options()
        # Use the word default of one inch page margins
        for side in ('left', 'right', 'top', 'bottom'):
            setattr(plumber.opts, 'margin_' + side, 72)
        writer = DOCX(plumber.opts, default_log)
        writer.write(path, mi, create_empty_document=True)
        return

    path = os.path.abspath(path)
    opf = metadata_to_opf(mi, as_string=False)
    # First non-empty language in the metadata, 'und' when there is none
    lang = next(
        (node.text for node in opf.xpath('//*[local-name()="language"]')
         if node.text),
        'und')
    lang = lang_as_iso639_1(lang) or lang

    opfns = OPF_NAMESPACES['opf']
    item_tag = '{%s}item' % opfns
    manifest = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, manifest)
    # (id, href, name used to guess the media type) for each manifest item
    manifest_items = (
        ('start', html_name, 'a.xhtml'),
        ('ncx', toc_name, toc_name),
    )
    for item_id, href, type_source in manifest_items:
        item = manifest.makeelement(item_tag, href=href, id=item_id)
        item.set('media-type', guess_type(type_source))
        manifest.append(item)
    spine = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, spine)
    spine.append(spine.makeelement('{%s}itemref' % opfns, idref='start'))

    container_template = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
    '''
    container_xml = container_template.format(
        prepare_string_for_xml(opf_name, True)).encode('utf-8')

    # Fill in the placeholders of the new book template
    html = P('templates/new_book.html', data=True).decode('utf-8')
    html = html.replace('_LANGUAGE_', prepare_string_for_xml(lang, True))
    html = html.replace('_TITLE_', prepare_string_for_xml(mi.title))
    html = html.replace(
        '_AUTHORS_', prepare_string_for_xml(authors_to_string(mi.authors)))
    tree = parse(html.encode('utf-8'))
    pretty_html_tree(None, tree)
    html_data = serialize(tree, 'text/html')

    xml_options = dict(encoding='utf-8', xml_declaration=True, pretty_print=True)
    ncx_data = etree.tostring(create_toc(mi, opf, html_name, lang), **xml_options)
    pretty_xml_tree(opf)
    opf_data = etree.tostring(opf, **xml_options)

    if fmt == 'azw3':
        # Write the files into a temporary directory and convert that to AZW3
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            files = ((opf_name, opf_data), (html_name, html_data), (toc_name, ncx_data))
            for name, data in files:
                with open(name, 'wb') as out:
                    out.write(data)
            opf_dir = os.path.dirname(os.path.abspath(opf_name))
            container = Container(opf_dir, opf_name, DevNull())
            opf_to_azw3(opf_name, path, container)
        return

    with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
        zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
        zf.writestr('META-INF/', b'', 0o755)
        zf.writestr('META-INF/container.xml', container_xml)
        zf.writestr(opf_name, opf_data)
        zf.writestr(html_name, html_data)
        zf.writestr(toc_name, ncx_data)
예제 #29
0
    def convert_epub3_nav(self, nav_path, opf, log, opts):
        ''' Generate an NCX file from the EPUB 3 navigation document at nav_path.

        The first ``nav`` element whose epub ``type`` attribute is ``toc`` and
        that has an ``ol`` child is converted into NCX ``navPoint`` elements.
        If no such element exists the method returns without writing anything.
        Otherwise the NCX is written to a temporary file in the directory of
        nav_path, added to the manifest of opf and set as the ``toc`` of every
        spine element. The href of the nav document and its parsed tree are
        stored on opts as ``epub3_nav_href`` and ``epub3_nav_parsed``.

        When ``self.removed_cover`` is set, every element with an href equal to
        it is given a ``data-calibre-removed-titlepage`` attribute and the nav
        document is written back to nav_path if anything was marked.
        '''
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
        from calibre.ebooks.oeb.polish.toc import first_child
        from tempfile import NamedTemporaryFile
        with lopen(nav_path, 'rb') as f:
            raw = f.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
        root = parse(raw, log=log)
        # Skeleton NCX document, navmap is its (initially empty) navMap element
        ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
        navmap = ncx[0]
        et = '{%s}type' % EPUB_NS
        bn = os.path.basename(nav_path)

        def add_from_li(li, parent):
            # Create a navPoint under parent from the first <a> or <span>
            # child of li and return it
            href = text = None
            for x in li.iterchildren(XHTML('a'), XHTML('span')):
                # Use the text content, falling back to the title attributes
                text = etree.tostring(x, method='text', encoding=unicode, with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
                href = x.get('href')
                if href:
                    if href.startswith('#'):
                        # Fragment only link, prefix it with the nav file name
                        href = bn + href
                # Only the first matching child is used
                break
            np = parent.makeelement(NCX('navPoint'))
            parent.append(np)
            np.append(np.makeelement(NCX('navLabel')))
            np[0].append(np.makeelement(NCX('text')))
            np[0][0].text = text
            if href:
                np.append(np.makeelement(NCX('content'), attrib={'src':href}))
            return np

        def process_nav_node(node, toc_parent):
            # Convert every <li> child of node, recursing into nested <ol>
            # lists (first_child presumably returns None when there is no
            # matching child -- confirm against its definition)
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                ol = first_child(li, XHTML('ol'))
                if child is not None and ol is not None:
                    process_nav_node(ol, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(et) == 'toc':
                ol = first_child(nav, XHTML('ol'))
                if ol is not None:
                    process_nav_node(ol, navmap)
                    break
        else:
            # The loop finished without a break: no toc nav with an <ol> found
            return

        # delete=False keeps the file on disk after the with block ends
        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
            f.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_href = os.path.relpath(f.name, os.getcwdu()).replace(os.sep, '/')
        ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
        opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
        opts.epub3_nav_parsed = root
        if getattr(self, 'removed_cover', None):
            changed = False
            base_path = os.path.dirname(nav_path)
            for elem in root.xpath('//*[@href]'):
                # [::2] drops the '#' separator; frag is not used below
                href, frag = elem.get('href').partition('#')[::2]
                link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
                abs_href = urlnormalize(link_path)
                if abs_href == self.removed_cover:
                    changed = True
                    elem.set('data-calibre-removed-titlepage', '1')
            if changed:
                # Persist the marked links back to the nav document
                with open(nav_path, 'wb') as f:
                    f.write(serialize(root, 'application/xhtml+xml'))