Exemple #1
0
    def remove_old_cover(self, cover_item, new_cover_href=None):
        """Remove ``cover_item`` from the book and fix up pages that used it.

        The item is removed from the manifest. Every ``<img src>`` and SVG
        ``<image xlink:href>`` in the spine that resolves to the old cover's
        href is then either re-pointed at ``new_cover_href`` (when given) or
        deleted; when the image's parent is an ``<svg>`` element, that
        wrapper element is deleted instead of the image alone. Spine items
        that had something deleted and are left with no text and no images
        are removed from the spine, the manifest and the guide.

        :param cover_item: manifest item of the cover being removed.
        :param new_cover_href: href of a replacement cover, or ``None`` to
            strip the references instead of rewriting them.
        """
        from calibre.ebooks.oeb.base import XPath, XLINK
        from lxml import etree

        def detach(node):
            # Remove ``node`` from its parent while keeping its tail text.
            # lxml stores the text following an element on the element
            # itself, so a plain remove() would silently drop that text.
            parent = node.getparent()
            if parent is None:
                # Already detached, e.g. a second cover image inside an
                # <svg> wrapper that was removed on an earlier iteration.
                return False
            tail = node.tail
            if tail:
                previous = node.getprevious()
                if previous is not None:
                    previous.tail = (previous.tail or '') + tail
                else:
                    parent.text = (parent.text or '') + tail
            parent.remove(node)
            return True

        self.oeb.manifest.remove(cover_item)

        # Remove any references to the cover in the HTML
        affected_items = set()
        image_xp = XPath('//h:img[@src]|//svg:image[@xl:href]')
        for item in self.oeb.spine:
            try:
                images = image_xp(item.data)
            except Exception:
                # Best effort: items whose data cannot be queried are skipped
                images = ()
            removed = False
            for img in images:
                href = img.get('src') or img.get(XLINK('href'))
                try:
                    href = item.abshref(href)
                except Exception:
                    continue  # Invalid URL, ignore
                if href != cover_item.href:
                    continue
                if new_cover_href is not None:
                    replacement_href = item.relhref(new_cover_href)
                    attr = 'src' if img.tag.endswith('img') else XLINK(
                        'href')
                    img.set(attr, replacement_href)
                    continue
                p = img.getparent()
                if p is None:
                    continue
                # An <svg> that directly wraps the image goes away with it
                target = p if p.tag.endswith('}svg') else img
                if detach(target):
                    removed = True
            if removed:
                affected_items.add(item)

        # Check if the resulting HTML has no content, if so remove it
        body_xp = XPath('//h:body')
        content_xp = XPath('//h:img|//svg:svg')
        for item in affected_items:
            body = body_xp(item.data)
            if body:
                text = etree.tostring(body[0],
                                      method='text',
                                      encoding='unicode')
            else:
                text = ''
            text = re.sub(r'\s+', '', text)
            if not text and not content_xp(item.data):
                self.log('Removing %s as it is a wrapper around'
                         ' the cover image' % item.href)
                self.oeb.spine.remove(item)
                self.oeb.manifest.remove(item)
                self.oeb.guide.remove_by_href(item.href)
Exemple #2
0
def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
    """Rewrite the links of the HTML file ``name`` into virtualized form.

    ``name`` is added to ``virtualized_names`` and every link in the parsed
    document is passed through the replacer built by
    ``create_link_replacer``. Afterwards each HTML ``<a href>`` and SVG
    ``<a>`` is inspected: a link that now starts with ``link_uid`` is
    neutralised to ``javascript:void(0)``, recorded in ``link_to_map`` and
    tagged with a ``data-<link_uid>`` JSON attribute; any other non-empty
    link is made to open in a new window.

    Returns True if the replacer recorded ``name`` as changed.
    """
    modified = set()
    html_anchors = XPath('//h:a[@href]')
    svg_anchors = XPath('//svg:a')
    replacer = create_link_replacer(container, link_uid, modified)

    virtualized_names.add(name)
    root = container.parsed(name)
    rewrite_links(root, partial(replacer, name))

    def process_anchor(anchor, attr='href'):
        target = anchor.get(attr) or ''
        if target.startswith(link_uid):
            # Internal link: disable navigation and record where it points
            anchor.set(attr, 'javascript:void(0)')
            decoded = decode_url(target.split('|')[1])
            lname, lfrag = decoded[0], decoded[1]
            referrers = link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set())
            referrers.add(name)
            payload = json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False)
            anchor.set('data-' + link_uid, payload)
            return
        if target:
            # Any other link opens in a new window
            anchor.set('target', '_blank')
            anchor.set('rel', 'noopener noreferrer')

    for anchor in html_anchors(root):
        process_anchor(anchor)
    xlink_href = XLINK('href')
    for anchor in svg_anchors(root):
        process_anchor(anchor, xlink_href)

    return name in modified
Exemple #3
0
 def map_resources(self, oeb_book):
     """Assign output names to images and link ids to spine documents.

     Every manifest item whose media type is in ``OEB_IMAGES`` gets an
     entry in ``self.images`` mapping its href to a zero-padded name built
     from the running image count and the original extension. For every
     manifest item that is in the spine, a link id is requested for the
     item itself and for each link attribute under ``<body>`` whose target
     (without fragment) is in ``self.base_hrefs``.

     :param oeb_book: the book whose manifest and spine are scanned.
     """
     # The set of link-bearing attributes is the same for every document,
     # so build it once instead of once per spine item.
     link_attrs = set(html.defs.link_attrs)
     link_attrs.add(XLINK('href'))

     for item in oeb_book.manifest:
         if item.media_type in OEB_IMAGES and item.href not in self.images:
             ext = os.path.splitext(item.href)[1]
             fname = '%s%s' % (len(self.images), ext)
             self.images[item.href] = fname.zfill(10)
         if item not in oeb_book.spine:
             continue
         self.get_link_id(item.href)
         body = item.data.find(XHTML('body'))
         if body is None:
             # Document without a <body>: nothing to scan for links
             continue
         for el in body.iter():
             # Skip nodes whose tag is not a string; the check is kept
             # best-effort as in the original code.
             try:
                 if not isinstance(el.tag, basestring):
                     continue
             except Exception:
                 continue
             attribs = el.attrib
             for attr in attribs:
                 if attr not in link_attrs:
                     continue
                 href = item.abshref(attribs[attr])
                 href, frag = urldefrag(href)
                 if href in self.base_hrefs:
                     self.get_link_id(href, frag)
Exemple #4
0
def transform_html(container, name, virtualize_resources, link_uid, link_to_map, virtualized_names):
    """Prepare the HTML file ``name`` for rendering and write it back.

    Steps, in order: tag every ``<img src>`` that resolves to a container
    name with ``data-calibre-src``; strip all attributes from ``<link>``
    elements that are not ``text/css`` stylesheets; transform inline
    styles; rewrite anchors (through ``virtualize_html`` when
    ``virtualize_resources`` is true, otherwise locally); finally
    serialise the tree with ``html_as_json`` and write it to ``name``.
    """
    html_anchors = XPath('//h:a[@href]')
    svg_anchors = XPath('//svg:a')
    images = XPath('//h:img[@src]')
    resource_links = XPath('//h:link[@href]')
    root = container.parsed(name)
    changed_names = set()
    link_replacer = create_link_replacer(container, link_uid, changed_names)

    # Used for viewing images
    for img in images(root):
        resolved = container.href_to_name(img.get('src'), name)
        if resolved:
            img.set('data-calibre-src', resolved)

    # Disable non-stylesheet link tags. Such a link will not be loaded by
    # the browser anyway and will cause the resource load check to hang
    for link in resource_links(root):
        link_type = (link.get('type') or 'text/css').lower()
        link_rel = (link.get('rel') or 'stylesheet').lower()
        is_stylesheet = link_type == 'text/css' and link_rel == 'stylesheet'
        if not is_stylesheet:
            link.attrib.clear()

    def transform_and_virtualize_sheet(sheet):
        sheet_changed = transform_sheet(sheet)
        if not virtualize_resources:
            return sheet_changed
        replaceUrls(sheet, partial(link_replacer, name))
        if name not in changed_names:
            return sheet_changed
        virtualized_names.add(name)
        return True

    # Transform <style> and style=""
    transform_inline_styles(container, name, transform_sheet=transform_and_virtualize_sheet, transform_style=transform_declaration)

    if virtualize_resources:
        virtualize_html(container, name, link_uid, link_to_map, virtualized_names)
    else:

        def rewrite_anchor(anchor, attr='href'):
            target = anchor.get(attr)
            if target:
                target = link_replacer(name, target)
            if not (target and target.startswith(link_uid)):
                return
            # Internal link: disable navigation and record where it points
            anchor.set(attr, 'javascript:void(0)')
            decoded = decode_url(target.split('|')[1])
            lname, lfrag = decoded[0], decoded[1]
            referrers = link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set())
            referrers.add(name)
            payload = json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False)
            anchor.set('data-' + link_uid, payload)

        for anchor in html_anchors(root):
            rewrite_anchor(anchor)
        xlink_href = XLINK('href')
        for anchor in svg_anchors(root):
            rewrite_anchor(anchor, xlink_href)

    serialized = html_as_json(root)
    with container.open(name, 'wb') as out:
        out.write(serialized)
Exemple #5
0
 def dataize_svg(self, item, svg=None):
     """Inline manifest resources referenced by an SVG as ``data:`` URIs.

     Every SVG element with an ``xlink:href`` whose path (fragment
     removed, resolved against ``item``) is in the manifest has that
     attribute replaced by a base64 ``data:`` URI built from the linked
     item. References with an empty path or to unknown hrefs are left
     untouched.

     :param item: manifest item the SVG belongs to; used to resolve hrefs.
     :param svg: tree to process; defaults to ``item.data``.
     :return: the (modified in place) SVG tree.
     """
     if svg is None:
         svg = item.data
     hrefs = self.oeb.manifest.hrefs
     xlink_href = XLINK('href')
     for elem in xpath(svg, '//svg:*[@xl:href]'):
         href = urlnormalize(elem.attrib[xlink_href])
         path = urldefrag(href)[0]
         if not path:
             continue
         abshref = item.abshref(path)
         if abshref not in hrefs:
             continue
         linkee = hrefs[abshref]
         # b64encode produces a single unbroken line. The previously used
         # base64.encodestring inserted a newline every 76 characters into
         # the URI and no longer exists in Python 3.9+.
         # NOTE(review): assumes str(linkee) yields the raw item bytes
         # (Python 2 semantics) -- confirm before porting to Python 3.
         data = base64.b64encode(str(linkee))
         data = "data:%s;base64,%s" % (linkee.media_type, data)
         elem.attrib[xlink_href] = data
     return svg
Exemple #6
0
    def virtualize_resources(self):
        """Rewrite internal links in stylesheets, documents and SVG images.

        URLs that resolve to a name in this container (and same-document
        fragments) are replaced by ``<link_uid>|<encoded name+frag>|``.
        In documents, anchors carrying such a URL are neutralised to
        ``javascript:void(0)`` and annotated with a ``data-<link_uid>``
        JSON attribute, while all other anchors are made to open in a new
        window. Every file whose content was altered is passed to
        ``self.dirty``.
        """
        changed = set()
        link_uid = self.book_render_data['link_uid']
        resource_template = link_uid + '|{}|'
        xlink_xpath = XPath('//*[@xl:href]')
        link_xpath = XPath('//h:a[@href]')

        def link_replacer(base, url):
            # Returns the virtualized form of ``url`` as seen from ``base``
            # and records ``base`` in ``changed`` when a rewrite happened.
            if url.startswith('#'):
                frag = urlunquote(url[1:])
                if not frag:
                    return url
                changed.add(base)
                return resource_template.format(encode_url(base, frag))
            purl = urlparse(url)
            # Anything with a host, a query, a non-file scheme, or an
            # empty/absolute path is left untouched.
            if purl.netloc or purl.query:
                return url
            if purl.scheme and purl.scheme != 'file':
                return url
            if not purl.path or purl.path.startswith('/'):
                return url
            url, frag = purl.path, purl.fragment
            name = self.href_to_name(url, base)
            if name:
                frag = urlunquote(frag)
                url = resource_template.format(encode_url(name, frag))
                changed.add(base)
            return url

        for name, mt in self.mime_map.iteritems():
            mt = mt.lower()
            if mt in OEB_STYLES:
                replaceUrls(self.parsed(name), partial(link_replacer, name))
                self.virtualized_names.add(name)
            elif mt in OEB_DOCS:
                self.virtualized_names.add(name)
                root = self.parsed(name)
                rewrite_links(root, partial(link_replacer, name))
                for a in link_xpath(root):
                    href = a.get('href')
                    if href.startswith(link_uid):
                        a.set('href', 'javascript:void(0)')
                        parts = decode_url(href.split('|')[1])
                        a.set('data-' + link_uid, json.dumps({'name':parts[0], 'frag':parts[1]}, ensure_ascii=False))
                    else:
                        a.set('target', '_blank')
                        # Without this the opened page can reach back into
                        # this window through window.opener.
                        a.set('rel', 'noopener noreferrer')
                    changed.add(name)
            elif mt == 'image/svg+xml':
                self.virtualized_names.add(name)
                # ``changed`` must stay a set here: link_replacer adds to
                # it and it is iterated below. Rebinding it to a boolean
                # (as the code once did) breaks both.
                xlink = XLINK('href')
                for elem in xlink_xpath(self.parsed(name)):
                    elem.set(xlink, link_replacer(name, elem.get(xlink)))

        for changed_name in changed:
            self.dirty(changed_name)
Exemple #7
0
 def dataize_svg(self, item, svg=None):
     """Point SVG references to manifest items at temporary files.

     For each SVG element with an ``xlink:href`` whose path (fragment
     removed, resolved against ``item``) is in the manifest, the linked
     item is written to a persistent temporary file and the attribute is
     replaced by that file's name. The file name is also appended to
     ``self.temp_files``. The file suffix comes from ``what()`` and falls
     back to ``jpg``.

     :param item: manifest item the SVG belongs to; used to resolve hrefs.
     :param svg: tree to process; defaults to ``item.data``.
     :return: the (modified in place) SVG tree.
     """
     tree = item.data if svg is None else svg
     manifest_hrefs = self.oeb.manifest.hrefs
     href_attr = XLINK('href')
     for node in xpath(tree, '//svg:*[@xl:href]'):
         normalized = urlnormalize(node.attrib[href_attr])
         target = urldefrag(normalized)[0]
         if not target:
             continue
         resolved = item.abshref(target)
         if resolved not in manifest_hrefs:
             continue
         raw = str(manifest_hrefs[resolved])
         suffix = '.' + (what(None, raw) or 'jpg')
         with PersistentTemporaryFile(suffix=suffix) as tmp:
             tmp.write(raw)
             self.temp_files.append(tmp.name)
         node.attrib[href_attr] = tmp.name
     return tree
Exemple #8
0
def extract_cover_from_embedded_svg(html, base, log):
    """Return the bytes of the image wrapped by a single-image SVG cover page.

    The markup qualifies only when it contains exactly one ``<svg>``
    element whose sole child is an SVG ``<image>``. The image's
    ``xlink:href`` is split on ``/`` and joined onto ``base``; if that
    file is readable its contents are returned.

    :param html: markup of the cover page, parsed with ``etree.fromstring``.
    :param base: directory against which the image href is resolved.
    :param log: unused by this function.
    :return: the raw file contents, or ``None`` when the page does not
        match, the image has no href, or the file is not readable.
    """
    from lxml import etree
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    # NOTE(review): etree.fromstring uses lxml's default parser settings;
    # if ``html`` can come from untrusted books, consider a hardened parser.
    root = etree.fromstring(html)

    svg = XPath('//svg:svg')(root)
    if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):
        image = svg[0][0]
        href = image.get(XLINK('href'), None)
        if not href:
            # Check before building the path: an <image> without an
            # xlink:href used to crash on href.split().
            return None
        path = os.path.join(base, *href.split('/'))
        if os.access(path, os.R_OK):
            # Context manager so the file handle is closed promptly
            with open(path, 'rb') as f:
                return f.read()
    return None
Exemple #9
0
def extract_cover_from_embedded_svg(html, base, log):
    """Extract the image wrapped by a single-image SVG cover page.

    The markup qualifies only when it contains exactly one ``<svg>``
    element whose sole child is an SVG ``<image>`` with a non-empty
    ``xlink:href``. That href is split on ``/``, joined onto ``base`` and
    handed to ``return_raster_image``, whose result is returned. In every
    other case the function returns ``None``. ``log`` is not used.
    """
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    from calibre.utils.xml_parse import safe_xml_fromstring
    root = safe_xml_fromstring(html)

    svg_nodes = XPath('//svg:svg')(root)
    if len(svg_nodes) != 1:
        return
    wrapper = svg_nodes[0]
    if len(wrapper) != 1 or wrapper[0].tag != SVG('image'):
        return
    href = wrapper[0].get(XLINK('href'), None)
    if not href:
        return
    return return_raster_image(os.path.join(base, *href.split('/')))
Exemple #10
0
    def virtualize_resources(self):
        """Rewrite internal links in stylesheets, documents and SVG images.

        Links are passed through the replacer built by
        ``create_link_replacer``. In documents, anchors whose href then
        starts with ``link_uid`` are neutralised to ``javascript:void(0)``,
        recorded in ``book_render_data['link_to_map']`` and annotated with
        a ``data-<link_uid>`` JSON attribute; all other anchors are made to
        open in a new window. In SVG images every non-fragment
        ``xlink:href`` is rewritten. Every file recorded as changed is
        passed to ``self.dirty``.
        """
        changed = set()
        link_uid = self.book_render_data['link_uid']
        xlink_xpath = XPath('//*[@xl:href]')
        link_xpath = XPath('//h:a[@href]')
        link_replacer = create_link_replacer(self, link_uid, changed)

        ltm = self.book_render_data['link_to_map']

        for name, mt in iteritems(self.mime_map):
            mt = mt.lower()
            if mt in OEB_STYLES:
                replaceUrls(self.parsed(name), partial(link_replacer, name))
                self.virtualized_names.add(name)
            elif mt in OEB_DOCS:
                self.virtualized_names.add(name)
                root = self.parsed(name)
                rewrite_links(root, partial(link_replacer, name))
                for a in link_xpath(root):
                    href = a.get('href')
                    if href.startswith(link_uid):
                        a.set('href', 'javascript:void(0)')
                        parts = decode_url(href.split('|')[1])
                        lname, lfrag = parts[0], parts[1]
                        ltm.setdefault(lname,
                                       {}).setdefault(lfrag or '',
                                                      set()).add(name)
                        a.set(
                            'data-' + link_uid,
                            json.dumps({
                                'name': lname,
                                'frag': lfrag
                            },
                                       ensure_ascii=False))
                    else:
                        a.set('target', '_blank')
                        a.set('rel', 'noopener noreferrer')
                    # Either branch modified the anchor, so the document
                    # must be marked dirty even when the replacer itself
                    # rewrote nothing (e.g. only external links present).
                    changed.add(name)
            elif mt == 'image/svg+xml':
                self.virtualized_names.add(name)
                xlink = XLINK('href')
                altered = False
                for elem in xlink_xpath(self.parsed(name)):
                    href = elem.get(xlink)
                    if not href.startswith('#'):
                        elem.set(xlink, link_replacer(name, href))
                        altered = True
                if altered:
                    changed.add(name)

        for changed_name in changed:
            self.dirty(changed_name)
Exemple #11
0
def transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names):
    """Virtualize the ``xlink:href`` references of the SVG file ``name``.

    Does nothing unless ``virtualize_resources`` is true. Otherwise every
    ``xlink:href`` that does not start with ``#`` is replaced by the
    result of the link replacer. If at least one attribute was rewritten,
    ``name`` is added to ``virtualized_names`` and the item is dirtied and
    committed on ``container``.
    """
    if virtualize_resources:
        replacer = create_link_replacer(container, link_uid, set())
        href_attr = XLINK('href')
        rewrote_any = False
        find_refs = XPath('//*[@xl:href]')
        for node in find_refs(container.parsed(name)):
            target = node.get(href_attr)
            if target.startswith('#'):
                # Same-document references are left as they are
                continue
            node.set(href_attr, replacer(name, target))
            rewrote_any = True
        if rewrote_any:
            virtualized_names.add(name)
            container.dirty(name)
            container.commit_item(name)
Exemple #12
0
def find_cover_image_in_page(container, cover_page):
    """Return the name of the lone image on ``cover_page``, if there is one.

    The page must have exactly one ``<body>``, no non-whitespace text, and
    at most one image reference (``<img src>`` or an ``<image>`` inside an
    ``<svg>``) with a non-empty href. The returned value is whatever
    ``container.href_to_name`` yields for that href. ``None`` is returned
    when any condition fails or no image is present.
    """
    root = container.parsed(cover_page)
    bodies = XPath('//h:body')(root)
    if len(bodies) != 1:
        return
    body = bodies[0]
    find_images = XPath('descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')
    hrefs = (node.get('src') or node.get(XLINK('href')) for node in find_images(body))
    images = [container.href_to_name(href, base=cover_page) for href in hrefs if href]
    visible_text = re.sub(r'\s+', '', xml2text(body))
    if visible_text or len(images) > 1:
        # Document has more content than a single image
        return
    if images:
        return images[0]
Exemple #13
0
    def virtualize_resources(self):
        """Rewrite internal links in stylesheets, documents and SVG images.

        URLs that resolve to a name in this container (and same-document
        fragments) become ``<link_uid>|<encoded name+frag>|``; URLs that
        resolve to a name the container does not have become
        ``missing:<quoted name>``. In documents, non-stylesheet ``<link>``
        elements are stripped of their attributes, anchors carrying a
        virtualized URL are neutralised, recorded in
        ``book_render_data['link_to_map']`` and annotated with a
        ``data-<link_uid>`` JSON attribute, and all other anchors are made
        to open in a new window. Finally the sets in ``link_to_map`` are
        converted to tuples and every changed file is passed to
        ``self.dirty``.
        """
        changed = set()
        link_uid = self.book_render_data['link_uid']
        resource_template = link_uid + '|{}|'
        xlink_xpath = XPath('//*[@xl:href]')
        link_xpath = XPath('//h:a[@href]')
        res_link_xpath = XPath('//h:link[@href]')

        def link_replacer(base, url):
            # Returns the virtualized form of ``url`` as seen from ``base``
            # and records ``base`` in ``changed`` when a rewrite happened.
            if url.startswith('#'):
                frag = urlunquote(url[1:])
                if frag:
                    changed.add(base)
                    return resource_template.format(encode_url(base, frag))
                return url
            purl = urlparse(url)
            foreign_scheme = purl.scheme and purl.scheme != 'file'
            if purl.netloc or purl.query or foreign_scheme:
                return url
            if not purl.path or purl.path.startswith('/'):
                return url
            name = self.href_to_name(purl.path, base)
            if not name:
                # Unresolvable: only the path part is handed back
                return purl.path
            if self.has_name(name):
                frag = urlunquote(purl.fragment)
                rewritten = resource_template.format(encode_url(name, frag))
            else:
                if isinstance(name, unicode):
                    name = name.encode('utf-8')
                rewritten = 'missing:' + force_unicode(quote(name), 'utf-8')
            changed.add(base)
            return rewritten

        ltm = self.book_render_data['link_to_map']

        for name, mt in self.mime_map.iteritems():
            mt = mt.lower()
            if mt in OEB_STYLES:
                replaceUrls(self.parsed(name), partial(link_replacer, name))
                self.virtualized_names.add(name)
            elif mt in OEB_DOCS:
                self.virtualized_names.add(name)
                root = self.parsed(name)
                for link in res_link_xpath(root):
                    link_type = (link.get('type') or 'text/css').lower()
                    link_rel = (link.get('rel') or 'stylesheet').lower()
                    if link_type == 'text/css' and link_rel == 'stylesheet':
                        continue
                    # This link will not be loaded by the browser anyway
                    # and will cause the resource load check to hang
                    link.attrib.clear()
                    changed.add(name)
                rewrite_links(root, partial(link_replacer, name))
                for anchor in link_xpath(root):
                    href = anchor.get('href')
                    if href.startswith(link_uid):
                        anchor.set('href', 'javascript:void(0)')
                        decoded = decode_url(href.split('|')[1])
                        lname, lfrag = decoded[0], decoded[1]
                        referrers = ltm.setdefault(lname, {}).setdefault(
                            lfrag or '', set())
                        referrers.add(name)
                        payload = json.dumps(
                            {'name': lname, 'frag': lfrag},
                            ensure_ascii=False)
                        anchor.set('data-' + link_uid, payload)
                    else:
                        anchor.set('target', '_blank')
                        anchor.set('rel', 'noopener noreferrer')
                    changed.add(name)
            elif mt == 'image/svg+xml':
                self.virtualized_names.add(name)
                changed.add(name)
                href_attr = XLINK('href')
                for node in xlink_xpath(self.parsed(name)):
                    node.set(href_attr, link_replacer(name, node.get(href_attr)))

        for anchor_map in ltm.itervalues():
            for frag_key in tuple(anchor_map):
                # needed for JSON serialization
                anchor_map[frag_key] = tuple(anchor_map[frag_key])

        for changed_name in changed:
            self.dirty(changed_name)