Example #1
 def get(x, single=True):
     ans = m[x]
     if single:
         ans = clean_xml_chars(ans[0]) if ans else ''
     else:
         ans = [clean_xml_chars(y) for y in ans]
     return ans
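For reference, here is a minimal stand-in sketch of clean_xml_chars (an assumption; calibre's real implementation in calibre.utils.cleantext differs in detail). It strips the code points that the XML 1.0 Char production forbids, which is why the examples on this page run text through it before building XML or HTML trees:

import re

# XML 1.0 permits #x9, #xA, #xD, #x20-#xD7FF, #xE000-#xFFFD and
# #x10000-#x10FFFF; everything else (NUL, most C0 controls) must go.
_ILLEGAL_XML = re.compile(
    '[^\x09\x0a\x0d\x20-\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]')

def clean_xml_chars_sketch(text):
    # Hypothetical stand-in for calibre.utils.cleantext.clean_xml_chars
    return _ILLEGAL_XML.sub('', text)

print(clean_xml_chars_sketch('form\x0cfeed, \x00NUL'))  # -> 'formfeed, NUL'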
Example #3
 def navpoint(parent, np):
     text = np.text
     if not text:
         text = ''
     c[1] += 1
     item_id = 'num_%d'%c[1]
     text = clean_xml_chars(text)
     elem = E.navPoint(
             E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
             E.content(src=unicode_type(np.href)+(('#' + unicode_type(np.fragment))
                 if np.fragment else '')),
             id=item_id,
             playOrder=str(np.play_order)
     )
     au = getattr(np, 'author', None)
     if au:
         au = re.sub(r'\s+', ' ', au)
         elem.append(C.meta(au, name='author'))
     desc = getattr(np, 'description', None)
     if desc:
         desc = re.sub(r'\s+', ' ', desc)
         try:
             elem.append(C.meta(desc, name='description'))
         except ValueError:
             elem.append(C.meta(clean_xml_chars(desc), name='description'))
     idx = getattr(np, 'toc_thumbnail', None)
     if idx:
         elem.append(C.meta(idx, name='toc_thumbnail'))
     parent.append(elem)
     for np2 in np:
         navpoint(elem, np2)
Example #4
 def navpoint(parent, np):
     text = np.text
     if not text:
         text = ''
     c[1] += 1
     item_id = 'num_%d' % c[1]
     text = clean_xml_chars(text)
     elem = E.navPoint(
         E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
         E.content(src=unicode_type(np.href) + (
             ('#' + unicode_type(np.fragment)) if np.fragment else '')),
         id=item_id,
         playOrder=unicode_type(np.play_order))
     au = getattr(np, 'author', None)
     if au:
         au = re.sub(r'\s+', ' ', au)
         elem.append(C.meta(au, name='author'))
     desc = getattr(np, 'description', None)
     if desc:
         desc = re.sub(r'\s+', ' ', desc)
         try:
             elem.append(C.meta(desc, name='description'))
         except ValueError:
             elem.append(
                 C.meta(clean_xml_chars(desc), name='description'))
     idx = getattr(np, 'toc_thumbnail', None)
     if idx:
         elem.append(C.meta(idx, name='toc_thumbnail'))
     parent.append(elem)
     for np2 in np:
         navpoint(elem, np2)
Example #5
 def process_metadata(self, idx, content, codec):
     if idx == 100:
         if self.mi.is_null('authors'):
             self.mi.authors = []
         au = clean_xml_chars(self.decode(content).strip())
         self.mi.authors.append(au)
         if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
             self.mi.author_sort = au.strip()
     elif idx == 101:
         self.mi.publisher = clean_xml_chars(self.decode(content).strip())
         if self.mi.publisher in {'Unknown', _('Unknown')}:
             self.mi.publisher = None
     elif idx == 103:
         self.mi.comments = clean_xml_chars(self.decode(content).strip())
     elif idx == 104:
         raw = check_isbn(self.decode(content).strip().replace('-', ''))
         if raw:
             self.mi.isbn = raw
     elif idx == 105:
         if not self.mi.tags:
             self.mi.tags = []
         self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
         self.mi.tags = list(set(self.mi.tags))
     elif idx == 106:
         try:
             self.mi.pubdate = parse_date(content, as_utc=False)
         except:
             pass
     elif idx == 108:
         self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
     elif idx == 112:  # dc:source set in some EBSP amazon samples
         try:
             content = content.decode(codec).strip()
             isig = 'urn:isbn:'
             if content.lower().startswith(isig):
                 raw = check_isbn(content[len(isig):])
                 if raw and not self.mi.isbn:
                     self.mi.isbn = raw
             elif content.startswith('calibre:'):
                 # calibre book uuid is stored here by recent calibre
                 # releases
                 cid = content[len('calibre:'):]
                 if cid:
                     self.mi.application_id = self.mi.uuid = cid
         except:
             pass
     elif idx == 113:  # ASIN or other id
         try:
             self.uuid = content.decode('ascii')
             self.mi.set_identifier('mobi-asin', self.uuid)
         except:
             self.uuid = None
     elif idx == 116:
         self.start_offset, = struct.unpack(b'>L', content)
     elif idx == 121:
         self.kf8_header, = struct.unpack(b'>L', content)
         if self.kf8_header == NULL_INDEX:
             self.kf8_header = None
Example #6
    def _generate(self, f, feeds, cutoff, extra_css=None, style=None):
        from calibre.utils.cleantext import clean_xml_chars
        feed = feeds[f]
        head = HEAD(TITLE(feed.title))
        if style:
            head.append(STYLE(style, type='text/css'))
        if extra_css:
            head.append(STYLE(extra_css, type='text/css'))
        body = BODY()
        body.append(self.get_navbar(f, feeds))

        div = DIV(
                H2(feed.title,
                    CLASS('calibre_feed_title', 'calibre_rescale_160')),
                CLASS('calibre_rescale_100')
              )
        body.append(div)
        if getattr(feed, 'image', None):
            div.append(DIV(IMG(
                alt=feed.image_alt if feed.image_alt else '',
                src=feed.image_url
                ),
                CLASS('calibre_feed_image')))
        if getattr(feed, 'description', None):
            d = DIV(clean_xml_chars(feed.description), CLASS('calibre_feed_description',
                'calibre_rescale_80'))
            d.append(BR())
            div.append(d)
        ul = UL(CLASS('calibre_article_list'))
        for i, article in enumerate(feed.articles):
            if not getattr(article, 'downloaded', False):
                continue
            li = LI(
                    A(article.title, CLASS('article calibre_rescale_120',
                                    href=article.url)),
                    SPAN(article.formatted_date, CLASS('article_date')),
                    CLASS('calibre_rescale_100', id='article_%d'%i,
                            style='padding-bottom:0.5em')
                    )
            if article.summary:
                li.append(DIV(clean_xml_chars(cutoff(article.text_summary)),
                    CLASS('article_description', 'calibre_rescale_70')))
            ul.append(li)
        div.append(ul)
        div.append(self.get_navbar(f, feeds, top=False))
        self.root = HTML(head, body)
        if self.html_lang:
            self.root.set('lang', self.html_lang)
Example #8
 def navpoint(parent, np):
     text = np.text
     if not text:
         text = ""
     c[1] += 1
     item_id = "num_%d" % c[1]
     text = clean_xml_chars(text)
     elem = E.navPoint(
         E.navLabel(E.text(re.sub(r"\s+", " ", text))),
         E.content(src=unicode(np.href) + (("#" + unicode(np.fragment)) if np.fragment else "")),
         id=item_id,
         playOrder=str(np.play_order),
     )
     au = getattr(np, "author", None)
     if au:
         au = re.sub(r"\s+", " ", au)
         elem.append(C.meta(au, name="author"))
     desc = getattr(np, "description", None)
     if desc:
         desc = re.sub(r"\s+", " ", desc)
         elem.append(C.meta(desc, name="description"))
     idx = getattr(np, "toc_thumbnail", None)
     if idx:
         elem.append(C.meta(idx, name="toc_thumbnail"))
     parent.append(elem)
     for np2 in np:
         navpoint(elem, np2)
Example #9
def parse_outline(raw, output_dir):
    from lxml import etree
    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
    raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
    outline = etree.fromstring(raw, parser=RECOVER_PARSER).xpath('(//outline)[1]')
    if outline:
        from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
        outline = outline[0]
        toc = TOC()
        count = [0]

        def process_node(node, toc):
            for child in node.iterdescendants('*'):
                if child.tag == 'outline':
                    parent = toc.children[-1] if toc.children else toc
                    process_node(child, parent)
                else:
                    page = child.get('page', '1')
                    toc.add(child.text, 'index.html', page)
                    count[0] += 1
        process_node(outline, toc)
        if count[0] > 2:
            root = create_ncx(toc, (lambda x:x), 'pdftohtml', 'en', 'pdftohtml')
            with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
                f.write(etree.tostring(root, pretty_print=True, with_tail=False, encoding='utf-8', xml_declaration=True))
Example #10
    def _read_opf(self):
        data = self.oeb.container.read(None)
        data = self.oeb.decode(data)
        data = XMLDECL_RE.sub('', data)
        data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                OPF1_NS, data)
        try:
            opf = etree.fromstring(data)
        except etree.XMLSyntaxError:
            data = xml_replace_entities(clean_xml_chars(data), encoding=None)
            try:
                opf = etree.fromstring(data)
                self.logger.warn('OPF contains invalid HTML named entities')
            except etree.XMLSyntaxError:
                data = re.sub(r'(?is)<tours>.+</tours>', '', data)
                data = data.replace('<dc-metadata>',
                    '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
                try:
                    opf = etree.fromstring(data)
                    self.logger.warn('OPF contains invalid tours section')
                except etree.XMLSyntaxError:
                    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
                    opf = etree.fromstring(data, parser=RECOVER_PARSER)
                    self.logger.warn('OPF contains invalid markup, trying to parse it anyway')

        ns = namespace(opf.tag)
        if ns not in ('', OPF1_NS, OPF2_NS):
            raise OEBError('Invalid namespace %r for OPF document' % ns)
        opf = self._clean_opf(opf)
        return opf
Example #11
    def html(self):
        raw = original_html = self.toHtml()
        check = self.toPlainText().strip()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
        raw = self.comments_pat.sub('', raw)
        if not check and '<img' not in raw.lower():
            return ''

        try:
            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
        except Exception:
            root = parse(clean_xml_chars(raw), maybe_xhtml=False, sanitize_names=True)
        if root.xpath('//meta[@name="calibre-dont-sanitize"]'):
            # Bypass cleanup if special meta tag exists
            return original_html

        try:
            cleanup_qt_markup(root)
        except Exception:
            import traceback
            traceback.print_exc()
        elems = []
        for body in root.xpath('//body'):
            if body.text:
                elems.append(body.text)
            elems += [html.tostring(x, encoding='unicode') for x in body if
                x.tag not in ('script', 'style')]

        if len(elems) > 1:
            ans = '<div>%s</div>'%(''.join(elems))
        else:
            ans = ''.join(elems)
            if not ans.startswith('<'):
                ans = '<p>%s</p>'%ans
        return xml_replace_entities(ans)
Example #12
def parse_html5(raw,
                decoder=None,
                log=None,
                discard_namespaces=False,
                line_numbers=True,
                linenumber_attribute=None,
                replace_entities=True,
                fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw,
                              maybe_xhtml=not discard_namespaces,
                              line_number_attr=linenumber_attribute,
                              keep_doctype=False,
                              sanitize_names=True)
    if (discard_namespaces and root.tag != 'html') or (
            not discard_namespaces and
        (root.tag != '{{{}}}{}'.format(XHTML_NS, 'html') or root.prefix)):
        raise ValueError(
            'Failed to parse correctly, root has tag: {} and prefix: {}'.
            format(root.tag, root.prefix))
    return root
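A usage sketch with hypothetical input: without the clean_xml_chars call above, a stray control character would survive entity replacement and newline normalization and could make the lxml tree builder reject the text.

root = parse_html5(b'<p>one\x02two</p>')
print(root.tag)  # '{http://www.w3.org/1999/xhtml}html', since namespaces are kept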
Example #13
def parse_outline(raw, output_dir):
    from lxml import etree
    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
    raw = clean_xml_chars(
        xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
    outline = etree.fromstring(raw,
                               parser=RECOVER_PARSER).xpath('(//outline)[1]')
    if outline:
        from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
        outline = outline[0]
        toc = TOC()
        count = [0]

        def process_node(node, toc):
            for child in node.iterchildren('*'):
                if child.tag == 'outline':
                    parent = toc.children[-1] if toc.children else toc
                    process_node(child, parent)
                else:
                    if child.text:
                        page = child.get('page', '1')
                        toc.add(child.text, 'index.html', 'p' + page)
                        count[0] += 1

        process_node(outline, toc)
        if count[0] > 2:
            root = create_ncx(toc, (lambda x: x), 'pdftohtml', 'en',
                              'pdftohtml')
            with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
                f.write(
                    etree.tostring(root,
                                   pretty_print=True,
                                   with_tail=False,
                                   encoding='utf-8',
                                   xml_declaration=True))
Example #14
 def append_text(el, attr):
     try:
         setattr(el, attr, (getattr(el, attr) or '') + data)
     except ValueError:
         text = data.replace('\u000c', ' ')
         try:
             setattr(el, attr, (getattr(el, attr) or '') + text)
         except ValueError:
             setattr(el, attr, (getattr(el, attr) or '') + clean_xml_chars(text))
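The nested try/except above exists because lxml raises ValueError when an element is assigned text that is not XML-compatible. A short demonstration, assuming lxml is installed:

from lxml import etree

el = etree.Element('p')
try:
    el.text = 'NUL byte \x00 here'
except ValueError:
    # lxml rejects strings containing NUL bytes and similar invalid
    # characters; this is what the clean_xml_chars fallback recovers from.
    el.text = 'NUL byte here'
print(etree.tostring(el))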
Example #15
 def build_node(current_node, parent=None):
     if parent is None:
         parent = etree.Element('ul')
     elif len(current_node.nodes):
         parent = element(parent, ('ul'))
     for node in current_node.nodes:
         point = element(parent, 'li')
         href = relpath(abspath(unquote(node.href)), dirname(ref_url))
         if isinstance(href, bytes):
             href = href.decode('utf-8')
         link = element(point, 'a', href=clean_xml_chars(href))
         title = node.title
         if isinstance(title, bytes):
             title = title.decode('utf-8')
         if title:
             title = re.sub(r'\s+', ' ', title)
         link.text = clean_xml_chars(title)
         build_node(node, point)
     return parent
Example #18
def parse_html(markup):
    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode, substitute_entites
    from calibre.utils.cleantext import clean_xml_chars
    if isinstance(markup, unicode_type):
        markup = strip_encoding_declarations(markup)
        markup = substitute_entites(markup)
    else:
        markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
    markup = clean_xml_chars(markup)
    from html5_parser.soup import parse
    return parse(markup, return_root=False)
Example #19
 def __init__(self, id, title, url, author, summary, published, content):
     from lxml import html
     self.downloaded = False
     self.id = id
     if not title or not isinstance(title, string_or_bytes):
         title = _('Unknown')
     title = force_unicode(title, 'utf-8')
     self._title = clean_xml_chars(title).strip()
     try:
         self._title = re.sub(r'&(\S+?);',
             entity_to_unicode, self._title)
     except:
         pass
     self._title = clean_ascii_chars(self._title)
     self.url = url
     self.author = author
     self.toc_thumbnail = None
     self.internal_toc_entries = ()
     if author and not isinstance(author, str):
         author = author.decode('utf-8', 'replace')
     if summary and not isinstance(summary, str):
         summary = summary.decode('utf-8', 'replace')
     summary = clean_xml_chars(summary) if summary else summary
     self.summary = summary
     if summary and '<' in summary:
         try:
             s = html.fragment_fromstring(summary, create_parent=True)
             summary = html.tostring(s, method='text', encoding='unicode')
         except:
             print('Failed to process article summary, deleting:')
             print(summary.encode('utf-8'))
             traceback.print_exc()
             summary = ''
     self.text_summary = clean_ascii_chars(summary)
     self.author = author
     self.content = content
     self.date = published
     self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
     self.localtime = self.utctime.astimezone(local_tz)
     self._formatted_date = None
Example #20
 def __init__(self, id, title, url, author, summary, published, content):
     from lxml import html
     self.downloaded = False
     self.id = id
     if not title or not isinstance(title, string_or_bytes):
         title = _('Unknown')
     title = force_unicode(title, 'utf-8')
     self._title = clean_xml_chars(title).strip()
     try:
         self._title = re.sub(r'&(\S+?);',
             entity_to_unicode, self._title)
     except:
         pass
     self._title = clean_ascii_chars(self._title)
     self.url = url
     self.author = author
     self.toc_thumbnail = None
     if author and not isinstance(author, unicode_type):
         author = author.decode('utf-8', 'replace')
     if summary and not isinstance(summary, unicode_type):
         summary = summary.decode('utf-8', 'replace')
     summary = clean_xml_chars(summary) if summary else summary
     self.summary = summary
     if summary and '<' in summary:
         try:
             s = html.fragment_fromstring(summary, create_parent=True)
             summary = html.tostring(s, method='text', encoding=unicode_type)
         except:
             print('Failed to process article summary, deleting:')
             print(summary.encode('utf-8'))
             traceback.print_exc()
             summary = u''
     self.text_summary = clean_ascii_chars(summary)
     self.author = author
     self.content = content
     self.date = published
     self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
     self.localtime = self.utctime.astimezone(local_tz)
     self._formatted_date = None
Example #21
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True)
    if (discard_namespaces and root.tag != 'html') or (
        not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
Example #22
def parse_opf(stream_or_path):
    stream = stream_or_path
    if not hasattr(stream, 'read'):
        stream = open(stream, 'rb')
    raw = stream.read()
    if not raw:
        raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
    raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
    raw = raw[raw.find('<'):]
    root = safe_xml_fromstring(clean_xml_chars(raw))
    if root is None:
        raise ValueError('Not an OPF file')
    return root
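The raw[raw.find('<'):] slice above drops any BOM or junk bytes that precede the first tag, so the parser starts at real markup:

raw = '\ufeffgarbage before markup <package/>'
print(raw[raw.find('<'):])  # '<package/>'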
Example #23
def html5_parse(data, max_nesting_depth=100):
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    data = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    # Check that the asinine HTML 5 algorithm did not result in a tree with
    # insane nesting depths
    for x in data.iterdescendants():
        if isinstance(x.tag, basestring) and len(x) == 0:  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
                raise ValueError('HTML 5 parsing resulted in a tree with nesting'
                        ' depth > %d'%max_nesting_depth)
    return data
Example #25
def save_html(browser, output_dir, postprocess_html, url, recursion_level):
    import html5lib
    from calibre.utils.cleantext import clean_xml_chars
    html = strip_encoding_declarations(browser.html)
    if isinstance(html, unicode):
        html = clean_xml_chars(html)
    root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
    root = postprocess_html(root, url, recursion_level)
    if root is None:
        # user wants this page to be aborted
        raise AbortFetch('%s was aborted during postprocess' % url)
    with open(os.path.join(output_dir, 'index.html'), 'wb') as f:
        from lxml.html import tostring
        f.write(tostring(root, include_meta_content_type=True, encoding='utf-8', pretty_print=True))
        return f.name
Example #27
def parse_html_toc(data):
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    for a in root.xpath('//*[@href and local-name()="a"]'):
        purl = urlparse(unquote(a.get('href')))
        href, fragment = purl[2], purl[5]
        if not fragment:
            fragment = None
        else:
            fragment = fragment.strip()
        href = href.strip()

        txt = etree.tostring(a, method='text', encoding='unicode')
        yield href, fragment, txt
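The purl[2] and purl[5] indexing relies on urlparse returning a 6-tuple of (scheme, netloc, path, params, query, fragment):

from urllib.parse import urlparse

purl = urlparse('chapter1.html#sec2')
print(purl[2], purl[5])  # chapter1.html sec2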
Example #28
    def index_to_soup(self, url_or_raw, raw=False):
        '''
        Convenience method that takes a URL to the index page and returns
        a parsed lxml tree representation of it. See http://lxml.de/tutorial.html

        `url_or_raw`: Either a URL or the downloaded index page as a string
        '''
        if re.match(r'\w+://', url_or_raw):
            self.jsbrowser.start_load(url_or_raw)
            html = self.jsbrowser.html
        else:
            html = url_or_raw
        if isinstance(html, bytes):
            html = xml_to_unicode(html)[0]
        html = strip_encoding_declarations(html)
        if raw:
            return html
        import html5lib
        root = html5lib.parse(clean_xml_chars(html), treebuilder='lxml', namespaceHTMLElements=False).getroot()
        return root
Example #29
def sanitize(s):
    return unicodedata.normalize(
        'NFC', clean_xml_chars(clean_ascii_chars(force_unicode(s or ''))))
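The NFC normalization step composes combining sequences into precomposed characters, so visually identical strings compare equal after sanitizing:

import unicodedata

decomposed = 'e\u0301'  # 'e' plus a combining acute accent
print(unicodedata.normalize('NFC', decomposed) == '\u00e9')  # True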
Example #30
def clean(x):
    if isinstance(x, string_or_bytes):
        x = clean_xml_chars(x)
    return x
Example #31
    def extract_content(self, output_dir, parse_cache):
        output_dir = os.path.abspath(output_dir)
        self.check_for_drm()
        processed_records = self.extract_text()
        if self.debug is not None:
            parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
        self.add_anchors()
        self.processed_html = self.processed_html.decode(self.book_header.codec,
            'ignore')
        self.processed_html = self.processed_html.replace('</</', '</')
        self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
                self.processed_html)
        self.processed_html = self.processed_html.replace('\ufeff', '')
        # Remove tags of the form <xyz: ...> as they can cause issues further
        # along the pipeline
        self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
                self.processed_html)

        self.processed_html = strip_encoding_declarations(self.processed_html)
        self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
            self.processed_html)
        image_name_map = self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
        self.cleanup_html()

        self.log.debug('Parsing HTML...')
        self.processed_html = clean_xml_chars(self.processed_html)
        try:
            root = html.fromstring(self.processed_html)
            if len(root.xpath('//html')) > 5:
                root = html.fromstring(self.processed_html.replace('\x0c',
                    '').replace('\x14', ''))
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = html.fromstring(self.processed_html)
        if root.xpath('descendant::p/descendant::p'):
            from html5_parser import parse
            self.log.warning('Malformed markup, parsing using html5-parser')
            self.processed_html = strip_encoding_declarations(self.processed_html)
            # These trip up the html5 parser causing all content to be placed
            # under the <guide> tag
            self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I)
            self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I)
            try:
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            except Exception:
                self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
                self.processed_html = self.remove_random_bytes(self.processed_html)
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            if len(root.xpath('body/descendant::*')) < 1:
                # There are probably stray </html>s in the markup
                self.processed_html = self.processed_html.replace('</html>',
                        '')
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)

        if root.tag != 'html':
            self.log.warn('File does not have opening <html> tag')
            nroot = html.fromstring('<html><head></head><body></body></html>')
            bod = nroot.find('body')
            for child in list(root):
                child.getparent().remove(child)
                bod.append(child)
            root = nroot

        htmls = list(root.xpath('//html'))

        if len(htmls) > 1:
            self.log.warn('Markup contains multiple <html> tags, merging.')
            # Merge all <head> and <body> sections
            for h in htmls:
                p = h.getparent()
                if hasattr(p, 'remove'):
                    p.remove(h)
            bodies, heads = root.xpath('//body'), root.xpath('//head')
            for x in root:
                root.remove(x)
            head, body = map(root.makeelement, ('head', 'body'))
            for h in heads:
                for x in h:
                    h.remove(x)
                    head.append(x)
            for b in bodies:
                for x in b:
                    b.remove(x)
                    body.append(x)
            root.append(head), root.append(body)
        for x in root.xpath('//script'):
            x.getparent().remove(x)

        head = root.xpath('//head')
        if head:
            head = head[0]
        else:
            head = root.makeelement('head', {})
            root.insert(0, head)
        head.text = '\n\t'
        link = head.makeelement('link', {'type':'text/css',
            'href':'styles.css', 'rel':'stylesheet'})
        head.insert(0, link)
        link.tail = '\n\t'
        title = head.xpath('descendant::title')
        m = head.makeelement('meta', {'http-equiv':'Content-Type',
            'content':'text/html; charset=utf-8'})
        head.insert(0, m)
        if not title:
            title = head.makeelement('title', {})
            try:
                title.text = self.book_header.title
            except ValueError:
                title.text = clean_ascii_chars(self.book_header.title)
            title.tail = '\n\t'
            head.insert(0, title)
            head.text = '\n\t'

        self.upshift_markup(root, image_name_map)
        guides = root.xpath('//guide')
        guide = guides[0] if guides else None
        metadata_elems = root.xpath('//metadata')
        if metadata_elems and self.book_header.exth is None:
            self.read_embedded_metadata(root, metadata_elems[0], guide)
        for elem in guides + metadata_elems:
            elem.getparent().remove(elem)
        htmlfile = os.path.join(output_dir, 'index.html')
        try:
            for ref in guide.xpath('descendant::reference'):
                if 'href' in ref.attrib:
                    ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
        except AttributeError:
            pass

        def write_as_utf8(path, data):
            if isinstance(data, unicode_type):
                data = data.encode('utf-8')
            with lopen(path, 'wb') as f:
                f.write(data)

        parse_cache[htmlfile] = root
        self.htmlfile = htmlfile
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
        opf.render(lopen(self.created_opf_path, 'wb'), ncx,
            ncx_manifest_entry=ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
            write_as_utf8(ncx_path, ncx)

        css = [self.base_css_rules, '\n\n']
        for cls, rule in self.tag_css_rules.items():
            css.append('.%s { %s }\n\n' % (cls, rule))
        write_as_utf8('styles.css', ''.join(css))

        if self.book_header.exth is not None or self.embedded_mi is not None:
            self.log.debug('Creating OPF...')
            ncx = io.BytesIO()
            opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
            opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
                ncx_manifest_entry)
            ncx = ncx.getvalue()
            if ncx:
                write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
Example #32
def build_index(books, num, search, sort, order, start, total, url_base, CKEYS, prefix, have_kobo_browser=False):
    logo = DIV(IMG(src=prefix + "/static/calibre.png", alt=__appname__), id="logo")

    search_box = build_search_box(num, search, sort, order, prefix)
    navigation = build_navigation(start, num, total, prefix + url_base)
    navigation2 = build_navigation(start, num, total, prefix + url_base)
    bookt = TABLE(id="listing")

    body = BODY(logo, search_box, navigation, HR(CLASS("spacer")), bookt, HR(CLASS("spacer")), navigation2)

    # Book list {{{
    for book in books:
        thumbnail = TD(
            IMG(type="image/jpeg", border="0", src=prefix + "/get/thumb/%s" % book["id"]), CLASS("thumbnail")
        )

        data = TD()
        for fmt in book["formats"].split(","):
            if not fmt or fmt.lower().startswith("original_"):
                continue
            file_extension = "kepub.epub" if have_kobo_browser and fmt.lower() == "kepub" else fmt
            a = quote(ascii_filename(book["authors"]))
            t = quote(ascii_filename(book["title"]))
            s = SPAN(
                A(fmt.lower(), href=prefix + "/get/%s/%s-%s_%d.%s" % (fmt, a, t, book["id"], file_extension.lower())),
                CLASS("button"),
            )
            s.tail = u""
            data.append(s)

        div = DIV(CLASS("data-container"))
        data.append(div)

        series = u"[%s - %s]" % (book["series"], book["series_index"]) if book["series"] else ""
        tags = u"Tags=[%s]" % book["tags"] if book["tags"] else ""

        ctext = ""
        for key in CKEYS:
            val = book.get(key, None)
            if val:
                ctext += "%s=[%s] " % tuple(val.split(":#:"))

        first = SPAN(
            u"\u202f%s %s by %s"
            % (clean_xml_chars(book["title"]), clean_xml_chars(series), clean_xml_chars(book["authors"])),
            CLASS("first-line"),
        )
        div.append(first)
        second = SPAN(u"%s - %s %s %s" % (book["size"], book["timestamp"], tags, ctext), CLASS("second-line"))
        div.append(second)

        bookt.append(TR(thumbnail, data))
    # }}}

    body.append(
        DIV(
            A(
                _("Switch to the full interface (non-mobile interface)"),
                href=prefix + "/browse",
                style="text-decoration: none; color: blue",
                title=_(
                    "The full interface gives you many more features, " "but it may not work well on a small screen"
                ),
            ),
            style="text-align:center",
        )
    )
    return HTML(
        HEAD(
            TITLE(__appname__ + " Library"),
            LINK(rel="icon", href="//calibre-ebook.com/favicon.ico", type="image/x-icon"),
            LINK(rel="stylesheet", type="text/css", href=prefix + "/mobile/style.css"),
            LINK(rel="apple-touch-icon", href="/static/calibre.png"),
            META(name="robots", content="noindex"),
        ),  # End head
        body,
    )  # End html
Example #34
    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None
        self.page_progression_direction = None
        self.primary_writing_mode = None

        self.decode = lambda x: clean_ascii_chars(x.decode(codec, 'replace'))

        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if idx >= 100 and idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # cdetype
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503:  # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = self.decode(content)
                except:
                    pass
            elif idx == 524:  # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except:
                    pass
            elif idx == 525:
                try:
                    pwm = content.decode(codec)
                    if pwm:
                        self.primary_writing_mode = pwm
                except Exception:
                    pass
            elif idx == 527:
                try:
                    ppd = content.decode(codec)
                    if ppd:
                        self.page_progression_direction = ppd
                except Exception:
                    pass
            # else:
            #    print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))
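The unpack loop above walks EXTH records, each laid out as a 4-byte big-endian id followed by a 4-byte size that counts the whole record including its 8-byte header. A minimal iterator sketch over the same layout:

import struct

def iter_exth_records(raw, num_items):
    # Each record: <id:uint32 be><size:uint32 be><content: size - 8 bytes>
    pos = 0
    for _ in range(num_items):
        idx, size = struct.unpack('>LL', raw[pos:pos + 8])
        yield idx, raw[pos + 8:pos + size]
        pos += size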
Example #36
 def process_metadata(self, idx, content, codec):
     if idx == 100:
         if self.mi.is_null('authors'):
             self.mi.authors = []
         au = clean_xml_chars(self.decode(content).strip())
         # Author names in Amazon MOBI files are usually in LN, FN format,
         # try to detect and auto-correct that.
         m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
         if m is not None:
             if tweaks['author_sort_copy_method'] != 'copy':
                 self.mi.authors.append(m.group(2) + ' ' + m.group(1))
             else:
                 self.mi.authors.append(m.group())
             if self.mi.is_null('author_sort'):
                 self.mi.author_sort = m.group()
         else:
             self.mi.authors.append(au)
     elif idx == 101:
         self.mi.publisher = clean_xml_chars(self.decode(content).strip())
         if self.mi.publisher in {'Unknown', _('Unknown')}:
             self.mi.publisher = None
     elif idx == 103:
         self.mi.comments = clean_xml_chars(self.decode(content).strip())
     elif idx == 104:
         raw = check_isbn(self.decode(content).strip().replace('-', ''))
         if raw:
             self.mi.isbn = raw
     elif idx == 105:
         if not self.mi.tags:
             self.mi.tags = []
         self.mi.tags.extend([
             x.strip()
             for x in clean_xml_chars(self.decode(content)).split(';')
         ])
         self.mi.tags = list(set(self.mi.tags))
     elif idx == 106:
         try:
             self.mi.pubdate = parse_date(content, as_utc=False)
         except:
             pass
     elif idx == 108:
         self.mi.book_producer = clean_xml_chars(
             self.decode(content).strip())
     elif idx == 109:
         self.mi.rights = clean_xml_chars(self.decode(content).strip())
     elif idx == 112:  # dc:source set in some EBSP amazon samples
         try:
             content = content.decode(codec).strip()
             isig = 'urn:isbn:'
             if content.lower().startswith(isig):
                 raw = check_isbn(content[len(isig):])
                 if raw and not self.mi.isbn:
                     self.mi.isbn = raw
             elif content.startswith('calibre:'):
                 # calibre book uuid is stored here by recent calibre
                 # releases
                 cid = content[len('calibre:'):]
                 if cid:
                     self.mi.application_id = self.mi.uuid = cid
         except:
             pass
     elif idx == 113:  # ASIN or other id
         try:
             self.uuid = content.decode('ascii')
             self.mi.set_identifier('mobi-asin', self.uuid)
         except:
             self.uuid = None
     elif idx == 116:
         self.start_offset, = struct.unpack(b'>L', content)
     elif idx == 121:
         self.kf8_header, = struct.unpack(b'>L', content)
         if self.kf8_header == NULL_INDEX:
             self.kf8_header = None
Example #37
def clean(x):
    if isinstance(x, str):
        x = clean_xml_chars(x)
    return x
Example #38
    def _generate(self, f, feeds, cutoff, extra_css=None, style=None):
        from calibre.utils.cleantext import clean_xml_chars

        def trim_title(title,clip=18):
            if len(title)>clip:
                tokens = title.split(' ')
                new_title_tokens = []
                new_title_len = 0
                if len(tokens[0]) > clip:
                    return tokens[0][:clip] + '...'
                for token in tokens:
                    if len(token) + new_title_len < clip:
                        new_title_tokens.append(token)
                        new_title_len += len(token)
                    else:
                        new_title_tokens.append('...')
                        title = ' '.join(new_title_tokens)
                        break
            return title

        self.IS_HTML = False
        feed = feeds[f]

        # Construct the navbar
        navbar_t = TABLE(CLASS('touchscreen_navbar'))
        navbar_tr = TR()

        # Previous Section
        link = ''
        if f > 0:
            link = A(CLASS('feed_link'),
                     trim_title(feeds[f-1].title),
                     href='../feed_%d/index.html' % int(f-1))
        navbar_tr.append(TD(CLASS('feed_prev'),link))

        # Up to Sections
        link = A(_('Sections'), href="../index.html")
        navbar_tr.append(TD(CLASS('feed_up'),link))

        # Next Section
        link = ''
        if f < len(feeds)-1:
            link = A(CLASS('feed_link'),
                     trim_title(feeds[f+1].title),
                     href='../feed_%d/index.html' % int(f+1))
        navbar_tr.append(TD(CLASS('feed_next'),link))
        navbar_t.append(navbar_tr)
        top_navbar = navbar_t
        bottom_navbar = copy.copy(navbar_t)
        # print "\n%s\n" % etree.tostring(navbar_t, pretty_print=True)

        # Build the page
        head = HEAD(TITLE(feed.title))
        if style:
            head.append(STYLE(style, type='text/css'))
        if extra_css:
            head.append(STYLE(extra_css, type='text/css'))
        body = BODY()
        div = DIV(
                top_navbar,
                H2(feed.title, CLASS('feed_title'))
                )
        body.append(div)

        if getattr(feed, 'image', None):
            div.append(DIV(IMG(
                alt=feed.image_alt if feed.image_alt else '',
                src=feed.image_url
                ),
                CLASS('calibre_feed_image')))
        if getattr(feed, 'description', None):
            d = DIV(clean_xml_chars(feed.description), CLASS('calibre_feed_description',
                'calibre_rescale_80'))
            d.append(BR())
            div.append(d)

        for i, article in enumerate(feed.articles):
            if not getattr(article, 'downloaded', False):
                continue

            div_td = DIV(CLASS('article_summary'),
                    A(article.title, CLASS('summary_headline','calibre_rescale_120',
                                    href=article.url)))
            if article.author:
                div_td.append(DIV(article.author,
                    CLASS('summary_byline', 'calibre_rescale_100')))
            if article.summary:
                div_td.append(DIV(cutoff(article.text_summary),
                    CLASS('summary_text', 'calibre_rescale_100')))
            div.append(div_td)

        div.append(bottom_navbar)
        self.root = HTML(head, body)
        if self.html_lang:
            self.root.set('lang', self.html_lang)
Example #40
def sanitize(s):
    return clean_xml_chars(clean_ascii_chars(force_unicode(s or '')))
Example #41
 def process_metadata(self, idx, content, codec):
     if idx == 100:
         if self.mi.is_null('authors'):
             self.mi.authors = []
         au = clean_xml_chars(self.decode(content).strip())
         # Author names in Amazon MOBI files are usually in LN, FN format,
         # try to detect and auto-correct that.
         m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
         if m is not None:
             if tweaks['author_sort_copy_method'] != 'copy':
                 self.mi.authors.append(m.group(2) + ' ' + m.group(1))
             else:
                 self.mi.authors.append(m.group())
             if self.mi.is_null('author_sort'):
                 self.mi.author_sort = m.group()
         else:
             self.mi.authors.append(au)
     elif idx == 101:
         self.mi.publisher = clean_xml_chars(self.decode(content).strip())
         if self.mi.publisher in {'Unknown', _('Unknown')}:
             self.mi.publisher = None
     elif idx == 103:
         self.mi.comments = clean_xml_chars(self.decode(content).strip())
     elif idx == 104:
         raw = check_isbn(self.decode(content).strip().replace('-', ''))
         if raw:
             self.mi.isbn = raw
     elif idx == 105:
         if not self.mi.tags:
             self.mi.tags = []
         self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
         self.mi.tags = list(set(self.mi.tags))
     elif idx == 106:
         try:
             self.mi.pubdate = parse_date(content, as_utc=False)
         except:
             pass
     elif idx == 108:
         self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
     elif idx == 109:
         self.mi.rights = clean_xml_chars(self.decode(content).strip())
     elif idx == 112:  # dc:source set in some EBSP amazon samples
         try:
             content = content.decode(codec).strip()
             isig = 'urn:isbn:'
             if content.lower().startswith(isig):
                 raw = check_isbn(content[len(isig):])
                 if raw and not self.mi.isbn:
                     self.mi.isbn = raw
             elif content.startswith('calibre:'):
                 # calibre book uuid is stored here by recent calibre
                 # releases
                 cid = content[len('calibre:'):]
                 if cid:
                     self.mi.application_id = self.mi.uuid = cid
         except:
             pass
     elif idx == 113:  # ASIN or other id
         try:
             self.uuid = content.decode('ascii')
             self.mi.set_identifier('mobi-asin', self.uuid)
         except:
             self.uuid = None
     elif idx == 116:
         self.start_offset, = struct.unpack(b'>L', content)
     elif idx == 121:
         self.kf8_header, = struct.unpack(b'>L', content)
         if self.kf8_header == NULL_INDEX:
             self.kf8_header = None
Example #42
def html5_parse(data, max_nesting_depth=100):
    import html5lib, warnings
    # HTML5 parsing algorithm idiocy: http://code.google.com/p/html5lib/issues/detail?id=195
    data = fix_self_closing_cdata_tags(data)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        try:
            data = html5lib.parse(data, treebuilder='lxml').getroot()
        except ValueError:
            from calibre.utils.cleantext import clean_xml_chars
            data = html5lib.parse(clean_xml_chars(data), treebuilder='lxml').getroot()

    # Check that the asinine HTML 5 algorithm did not result in a tree with
    # insane nesting depths
    for x in data.iterdescendants():
        if isinstance(x.tag, basestring) and len(x) == 0:  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
                raise ValueError('html5lib resulted in a tree with nesting'
                        ' depth > %d'%max_nesting_depth)

    # html5lib has the most inelegant handling of namespaces I have ever seen
    # Try to reconstitute destroyed namespace info
    xmlns_declaration = '{%s}'%XMLNS_NS
    non_html5_namespaces = {}
    seen_namespaces = set()
    for elem in tuple(data.iter(tag=etree.Element)):
        elem.attrib.pop('xmlns', None)
        # Set lang correctly
        xl = elem.attrib.pop('xmlU0003Alang', None)
        if xl is not None and 'lang' not in elem.attrib:
            elem.attrib['lang'] = xl
        namespaces = {}
        for x in tuple(elem.attrib):
            if x.startswith('xmlnsU') or x.startswith(xmlns_declaration):
                # A namespace declaration
                val = elem.attrib.pop(x)
                if x.startswith('xmlnsU0003A'):
                    prefix = x[11:]
                    namespaces[prefix] = val

        remapped_namespaces = {}
        if namespaces:
            # Some destroyed namespace declarations were found
            p = elem.getparent()
            if p is None:
                # We handle the root node later
                non_html5_namespaces = namespaces
            else:
                idx = p.index(elem)
                p.remove(elem)
                elem = clone_element(elem, nsmap=namespaces)
                p.insert(idx, elem)
                remapped_namespaces = {ns:namespaces[ns] for ns in set(namespaces) - set(elem.nsmap)}

        b = barename(elem.tag)
        idx = b.find('U0003A')
        if idx > -1:
            prefix, tag = b[:idx], b[idx+6:]
            ns = elem.nsmap.get(prefix, None)
            if ns is None:
                ns = non_html5_namespaces.get(prefix, None)
            if ns is None:
                ns = remapped_namespaces.get(prefix, None)
            if ns is not None:
                elem.tag = '{%s}%s'%(ns, tag)

        for b in tuple(elem.attrib):
            idx = b.find('U0003A')
            if idx > -1:
                prefix, tag = b[:idx], b[idx+6:]
                ns = elem.nsmap.get(prefix, None)
                if ns is None:
                    ns = non_html5_namespaces.get(prefix, None)
                if ns is None:
                    ns = remapped_namespaces.get(prefix, None)
                if ns is not None:
                    elem.attrib['{%s}%s'%(ns, tag)] = elem.attrib.pop(b)

        seen_namespaces |= set(elem.nsmap.itervalues())

    nsmap = dict(html5lib.constants.namespaces)
    nsmap[None] = nsmap.pop('html')
    non_html5_namespaces.update(nsmap)
    nsmap = non_html5_namespaces

    data = clone_element(data, nsmap=nsmap, in_context=False)

    # Remove unused namespace declarations
    fnsmap = {k:v for k,v in nsmap.iteritems() if v in seen_namespaces and v !=
            XMLNS_NS}
    return clone_element(data, nsmap=fnsmap, in_context=False)
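
A usage sketch, assuming the helpers html5_parse relies on (fix_self_closing_cdata_tags, node_depth, clone_element, barename, XMLNS_NS) are in scope as above. The 'U0003A' marker the code searches for is how html5lib's lxml treebuilder coerces a ':' that would be invalid in an lxml name:

markup = '<html xmlns:svg="http://www.w3.org/2000/svg"><body><p>hi</p></body></html>'
root = html5_parse(markup)
print(root.tag)  # e.g. '{http://www.w3.org/1999/xhtml}html' after namespace repair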
Example No. 43
0
 def process_metadata(self, idx, content, codec):
     if idx == 100:
         if self.mi.is_null('authors'):
             self.mi.authors = []
         au = clean_xml_chars(self.decode(content).strip())
         self.mi.authors.append(au)
         if self.mi.is_null('author_sort') and re.match(
                 r'\S+?\s*,\s+\S+', au.strip()):
             self.mi.author_sort = au.strip()
     elif idx == 101:
         self.mi.publisher = clean_xml_chars(self.decode(content).strip())
         if self.mi.publisher in {'Unknown', _('Unknown')}:
             self.mi.publisher = None
     elif idx == 103:
         self.mi.comments = clean_xml_chars(self.decode(content).strip())
     elif idx == 104:
         raw = check_isbn(self.decode(content).strip().replace('-', ''))
         if raw:
             self.mi.isbn = raw
     elif idx == 105:
         if not self.mi.tags:
             self.mi.tags = []
         self.mi.tags.extend([
             x.strip()
             for x in clean_xml_chars(self.decode(content)).split(';')
         ])
         self.mi.tags = list(set(self.mi.tags))
     elif idx == 106:
         try:
             self.mi.pubdate = parse_date(content, as_utc=False)
         except:
             pass
     elif idx == 108:
         self.mi.book_producer = clean_xml_chars(
             self.decode(content).strip())
     elif idx == 112:  # dc:source set in some EBSP amazon samples
         try:
             content = content.decode(codec).strip()
             isig = 'urn:isbn:'
             if content.lower().startswith(isig):
                 raw = check_isbn(content[len(isig):])
                 if raw and not self.mi.isbn:
                     self.mi.isbn = raw
             elif content.startswith('calibre:'):
                 # calibre book uuid is stored here by recent calibre
                 # releases
                 cid = content[len('calibre:'):]
                 if cid:
                     self.mi.application_id = self.mi.uuid = cid
         except:
             pass
     elif idx == 113:  # ASIN or other id
         try:
             self.uuid = content.decode('ascii')
             self.mi.set_identifier('mobi-asin', self.uuid)
         except:
             self.uuid = None
     elif idx == 116:
         self.start_offset, = struct.unpack(b'>L', content)
     elif idx == 121:
         self.kf8_header, = struct.unpack(b'>L', content)
         if self.kf8_header == NULL_INDEX:
             self.kf8_header = None
Example No. 44
0
def sanitize(s):
    return clean_xml_chars(clean_ascii_chars(force_unicode(s or '')))
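
A quick demonstration, assuming force_unicode, clean_ascii_chars and clean_xml_chars are imported from calibre as in the surrounding codebase; the composition coerces any input (including None) to unicode and strips control characters that are illegal in XML:

print(sanitize(b'Title\x00 with\x0b junk'))  # -> u'Title with junk'
print(sanitize(None))                        # -> u''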
Example No. 45
0
    def extract_content(self, output_dir, parse_cache):
        output_dir = os.path.abspath(output_dir)
        self.check_for_drm()
        processed_records = self.extract_text()
        if self.debug is not None:
            parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
        self.add_anchors()
        self.processed_html = self.processed_html.decode(self.book_header.codec,
            'ignore')
        self.processed_html = self.processed_html.replace('</</', '</')
        self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
                self.processed_html)
        self.processed_html = self.processed_html.replace(u'\ufeff', '')
        # Remove tags of the form <xyz: ...> as they can cause issues further
        # along the pipeline
        self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
                self.processed_html)

        self.processed_html = strip_encoding_declarations(self.processed_html)
        self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
            self.processed_html)
        self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
        self.cleanup_html()

        self.log.debug('Parsing HTML...')
        self.processed_html = clean_xml_chars(self.processed_html)
        try:
            root = html.fromstring(self.processed_html)
            if len(root.xpath('//html')) > 5:
                root = html.fromstring(self.processed_html.replace('\x0c',
                    '').replace('\x14', ''))
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = html.fromstring(self.processed_html)
        if root.xpath('descendant::p/descendant::p'):
            from html5_parser import parse
            self.log.warning('Malformed markup, parsing using html5-parser')
            self.processed_html = strip_encoding_declarations(self.processed_html)
            try:
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            except Exception:
                self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
                self.processed_html = self.remove_random_bytes(self.processed_html)
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            if len(root.xpath('body/descendant::*')) < 1:
                # There are probably stray </html>s in the markup
                self.processed_html = self.processed_html.replace('</html>',
                        '')
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)

        if root.tag != 'html':
            self.log.warn('File does not have opening <html> tag')
            nroot = html.fromstring('<html><head></head><body></body></html>')
            bod = nroot.find('body')
            for child in list(root):
                child.getparent().remove(child)
                bod.append(child)
            root = nroot

        htmls = list(root.xpath('//html'))

        if len(htmls) > 1:
            self.log.warn('Markup contains multiple <html> tags, merging.')
            # Merge all <head> and <body> sections
            for h in htmls:
                p = h.getparent()
                if hasattr(p, 'remove'):
                    p.remove(h)
            bodies, heads = root.xpath('//body'), root.xpath('//head')
            for x in root:
                root.remove(x)
            head, body = map(root.makeelement, ('head', 'body'))
            for h in heads:
                for x in h:
                    h.remove(x)
                    head.append(x)
            for b in bodies:
                for x in b:
                    b.remove(x)
                    body.append(x)
            root.append(head)
            root.append(body)
        for x in root.xpath('//script'):
            x.getparent().remove(x)

        head = root.xpath('//head')
        if head:
            head = head[0]
        else:
            head = root.makeelement('head', {})
            root.insert(0, head)
        head.text = '\n\t'
        link = head.makeelement('link', {'type':'text/css',
            'href':'styles.css', 'rel':'stylesheet'})
        head.insert(0, link)
        link.tail = '\n\t'
        title = head.xpath('descendant::title')
        m = head.makeelement('meta', {'http-equiv':'Content-Type',
            'content':'text/html; charset=utf-8'})
        head.insert(0, m)
        if not title:
            title = head.makeelement('title', {})
            try:
                title.text = self.book_header.title
            except ValueError:
                title.text = clean_ascii_chars(self.book_header.title)
            title.tail = '\n\t'
            head.insert(0, title)
            head.text = '\n\t'

        self.upshift_markup(root)
        guides = root.xpath('//guide')
        guide = guides[0] if guides else None
        metadata_elems = root.xpath('//metadata')
        if metadata_elems and self.book_header.exth is None:
            self.read_embedded_metadata(root, metadata_elems[0], guide)
        for elem in guides + metadata_elems:
            elem.getparent().remove(elem)
        htmlfile = os.path.join(output_dir, 'index.html')
        try:
            for ref in guide.xpath('descendant::reference'):
                if 'href' in ref.attrib:
                    ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
        except AttributeError:
            pass

        def write_as_utf8(path, data):
            if isinstance(data, unicode_type):
                data = data.encode('utf-8')
            with lopen(path, 'wb') as f:
                f.write(data)

        parse_cache[htmlfile] = root
        self.htmlfile = htmlfile
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
        opf.render(lopen(self.created_opf_path, 'wb'), ncx,
            ncx_manifest_entry=ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
            write_as_utf8(ncx_path, ncx)

        css = [self.base_css_rules, '\n\n']
        for cls, rule in self.tag_css_rules.items():
            css.append('.%s { %s }\n\n' % (cls, rule))
        write_as_utf8('styles.css', ''.join(css))

        if self.book_header.exth is not None or self.embedded_mi is not None:
            self.log.debug('Creating OPF...')
            ncx = io.BytesIO()
            opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
            opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
                ncx_manifest_entry)
            ncx = ncx.getvalue()
            if ncx:
                write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
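
As a standalone illustration of one cleanup step above, the regex that removes pseudo-tags of the form <xyz: ...> (they are not valid HTML and derail the downstream parsers) behaves like this:

import re

html_fragment = 'before <foo: bar="1"> middle </foo: > after'
print(re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '', html_fragment))
# -> 'before  middle  after'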
Example No. 46
0
def build_index(books, num, search, sort, order, start, total, url_base, CKEYS,
        prefix, have_kobo_browser=False):
    logo = DIV(IMG(src=prefix+'/static/calibre.png', alt=__appname__), id='logo')

    search_box = build_search_box(num, search, sort, order, prefix)
    navigation = build_navigation(start, num, total, prefix+url_base)
    navigation2 = build_navigation(start, num, total, prefix+url_base)
    bookt = TABLE(id='listing')

    body = BODY(
        logo,
        search_box,
        navigation,
        HR(CLASS('spacer')),
        bookt,
        HR(CLASS('spacer')),
        navigation2
    )

    # Book list {{{
    for book in books:
        thumbnail = TD(
                IMG(type='image/jpeg', border='0',
                    src=prefix+'/get/thumb/%s' %
                            book['id']),
                CLASS('thumbnail'))

        data = TD()
        for fmt in book['formats'].split(','):
            if not fmt or fmt.lower().startswith('original_'):
                continue
            file_extension = "kepub.epub" if have_kobo_browser and fmt.lower() == "kepub" else fmt
            a = quote(ascii_filename(book['authors']))
            t = quote(ascii_filename(book['title']))
            s = SPAN(
                A(
                    fmt.lower(),
                    href=prefix+'/get/%s/%s-%s_%d.%s' % (fmt, a, t,
                        book['id'], file_extension.lower())
                ),
                CLASS('button'))
            s.tail = u''
            data.append(s)

        div = DIV(CLASS('data-container'))
        data.append(div)

        series = u'[%s - %s]'%(book['series'], book['series_index']) \
                if book['series'] else ''
        tags = u'Tags=[%s]'%book['tags'] if book['tags'] else ''

        ctext = ''
        for key in CKEYS:
            val = book.get(key, None)
            if val:
                ctext += '%s=[%s] '%tuple(val.split(':#:'))

        first = SPAN(u'\u202f%s %s by %s' % (clean_xml_chars(book['title']), clean_xml_chars(series),
            clean_xml_chars(book['authors'])), CLASS('first-line'))
        div.append(first)
        second = SPAN(u'%s - %s %s %s' % (book['size'],
            book['timestamp'],
            tags, ctext), CLASS('second-line'))
        div.append(second)

        bookt.append(TR(thumbnail, data))
    # }}}

    body.append(DIV(
        A(_('Switch to the full interface (non-mobile interface)'),
            href=prefix+"/browse",
            style="text-decoration: none; color: blue",
            title=_('The full interface gives you many more features, '
                'but it may not work well on a small screen')),
        style="text-align:center"))
    return HTML(
        HEAD(
            TITLE(__appname__ + ' Library'),
            LINK(rel='icon', href='http://calibre-ebook.com/favicon.ico',
                type='image/x-icon'),
            LINK(rel='stylesheet', type='text/css',
                href=prefix+'/mobile/style.css'),
            LINK(rel='apple-touch-icon', href="/static/calibre.png"),
            META(name="robots", content="noindex")
        ),  # End head
        body
    )  # End html
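
Reading the lookups above, each entry in `books` appears to be a plain dict with at least the following keys; this shape is inferred from the code, not a documented schema:

example_book = {
    'id': 42,
    'title': 'Example Title',
    'authors': 'Jane Doe',
    'formats': 'EPUB,MOBI',            # comma-separated format list
    'series': 'Example Series',        # may be falsy
    'series_index': 1,
    'tags': 'fiction, sample',         # may be falsy
    'size': '1.2 MB',
    'timestamp': '2013-01-01',
    '#mycolumn': 'My column:#:value',  # custom-column values split on ':#:'
}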
Example No. 47
0
def clean(x):
    if isinstance(x, basestring):
        x = clean_xml_chars(x)
    return x
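
A minimal usage sketch (Python 2, where basestring exists), assuming clean_xml_chars from calibre.utils.cleantext; non-string values pass through untouched, so clean() can safely be mapped over heterogeneous metadata values:

print(clean(u'ok\x02 text'))  # illegal XML control char stripped -> u'ok text'
print(clean(42))              # non-strings returned unchanged -> 42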
Example No. 48
0
    def _generate(self, f, feeds, cutoff, extra_css=None, style=None):
        from calibre.utils.cleantext import clean_xml_chars

        def trim_title(title, clip=18):
            if len(title) > clip:
                tokens = title.split(' ')
                new_title_tokens = []
                new_title_len = 0
                if len(tokens[0]) > clip:
                    return tokens[0][:clip] + '...'
                for token in tokens:
                    if len(token) + new_title_len < clip:
                        new_title_tokens.append(token)
                        new_title_len += len(token)
                    else:
                        new_title_tokens.append('...')
                        title = ' '.join(new_title_tokens)
                        break
            return title
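        # Worked examples (clip=18): 'A fairly long section title' becomes
        # 'A fairly long ...' since 'A', 'fairly' and 'long' fit under the
        # clip while 'section' would overflow; a single token longer than
        # the clip is cut to its first 18 characters plus '...'.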

        self.IS_HTML = False
        feed = feeds[f]

        # Construct the navbar
        navbar_t = TABLE(CLASS('touchscreen_navbar'))
        navbar_tr = TR()

        # Previous Section
        link = ''
        if f > 0:
            link = A(CLASS('feed_link'),
                     trim_title(feeds[f-1].title),
                     href='../feed_%d/index.html' % int(f-1))
        navbar_tr.append(TD(CLASS('feed_prev'),link))

        # Up to Sections
        link = A(_('Sections'), href="../index.html")
        navbar_tr.append(TD(CLASS('feed_up'),link))

        # Next Section
        link = ''
        if f < len(feeds)-1:
            link = A(CLASS('feed_link'),
                     trim_title(feeds[f+1].title),
                     href='../feed_%d/index.html' % int(f+1))
        navbar_tr.append(TD(CLASS('feed_next'),link))
        navbar_t.append(navbar_tr)
        top_navbar = navbar_t
        bottom_navbar = copy.copy(navbar_t)
        # print "\n%s\n" % etree.tostring(navbar_t, pretty_print=True)

        # Build the page
        head = HEAD(TITLE(feed.title))
        if style:
            head.append(STYLE(style, type='text/css'))
        if extra_css:
            head.append(STYLE(extra_css, type='text/css'))
        body = BODY()
        div = DIV(
                top_navbar,
                H2(feed.title, CLASS('feed_title'))
                )
        body.append(div)

        if getattr(feed, 'image', None):
            div.append(DIV(IMG(
                alt=feed.image_alt if feed.image_alt else '',
                src=feed.image_url
                ),
                CLASS('calibre_feed_image')))
        if getattr(feed, 'description', None):
            d = DIV(clean_xml_chars(feed.description), CLASS('calibre_feed_description',
                'calibre_rescale_80'))
            d.append(BR())
            div.append(d)

        for i, article in enumerate(feed.articles):
            if not getattr(article, 'downloaded', False):
                continue

            div_td = DIV(CLASS('article_summary'),
                    A(article.title, CLASS('summary_headline','calibre_rescale_120',
                                    href=article.url)))
            if article.author:
                div_td.append(DIV(article.author,
                    CLASS('summary_byline', 'calibre_rescale_100')))
            if article.summary:
                div_td.append(DIV(cutoff(article.text_summary),
                    CLASS('summary_text', 'calibre_rescale_100')))
            div.append(div_td)

        div.append(bottom_navbar)
        self.root = HTML(head, body)
        if self.html_lang:
            self.root.set('lang', self.html_lang)