Example #1
0
def extract_cover_from_embedded_svg(html, base, log):
    """Return the bytes of the image wrapped by a page's lone SVG element.

    ``html`` is parsed with ``etree.fromstring`` and searched for ``svg:svg``
    elements. An image is only extracted when there is exactly one such
    element and its only child is an ``svg:image``.

    :param html: Markup of the page to inspect.
    :param base: Directory against which the image's ``xlink:href`` is
        resolved; the href is split on ``/`` and joined onto it.
    :param log: Not used by this function.
    :return: The contents of the referenced file as bytes, or ``None`` when
        the page does not have the expected shape, the image has no (or an
        empty) ``xlink:href``, or the file is not readable.
    """
    from lxml import etree
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    root = etree.fromstring(html)

    svg = XPath('//svg:svg')(root)
    if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):
        image = svg[0][0]
        href = image.get(XLINK('href'), None)
        # Test the href before splitting it: an <image> without xlink:href
        # yields None, which has no split().
        if href:
            path = os.path.join(base, *href.split('/'))
            if os.access(path, os.R_OK):
                # Context manager so the handle is closed deterministically.
                with open(path, 'rb') as f:
                    return f.read()
    return None
Example #2
0
def extract_cover_from_embedded_svg(html, base, log):
    """Load the image referenced by a page's single SVG wrapper.

    The page qualifies only when it contains exactly one ``svg:svg`` element
    whose sole child is an ``svg:image`` carrying a non-empty ``xlink:href``.

    :param html: Markup of the page, parsed with ``safe_xml_fromstring``.
    :param base: Directory onto which the ``/``-separated href is joined.
    :param log: Not used by this function.
    :return: Whatever ``return_raster_image`` returns for the resolved path,
        or ``None`` when the page does not qualify.
    """
    from calibre.ebooks.oeb.base import SVG, XLINK, XPath
    from calibre.utils.xml_parse import safe_xml_fromstring

    document = safe_xml_fromstring(html)
    svg_elements = XPath('//svg:svg')(document)

    # Anything other than one <svg> holding one <image> is not handled here.
    if len(svg_elements) != 1:
        return None
    wrapper = svg_elements[0]
    if len(wrapper) != 1:
        return None
    picture = wrapper[0]
    if picture.tag != SVG('image'):
        return None

    target = picture.get(XLINK('href'), None)
    if not target:
        return None
    return return_raster_image(os.path.join(base, *target.split('/')))
Example #3
0
    def virtualize_resources(self):
        """Rewrite links in the book's stylesheets, documents and SVG images.

        Every entry of ``self.mime_map`` is dispatched on its lower-cased
        MIME type:

        * ``OEB_STYLES``: URLs in the parsed sheet are passed through the
          link replacer via ``replaceUrls``.
        * ``OEB_DOCS``: links are rewritten via ``rewrite_links``. Each
          ``h:a`` whose href then starts with ``link_uid`` becomes a
          ``javascript:void(0)`` link carrying the decoded target as JSON in
          a ``data-<link_uid>`` attribute, and is recorded in
          ``book_render_data['link_to_map']``. Every other anchor gets
          ``target="_blank"`` and ``rel="noopener noreferrer"``.
        * ``image/svg+xml``: every ``xlink:href`` that does not start with
          ``#`` is replaced by the link replacer's result.

        Names of the first and third kind, and documents, are added to
        ``self.virtualized_names``. Finally ``self.dirty`` is called for each
        name collected in the ``changed`` set.
        """
        changed = set()
        link_uid = self.book_render_data['link_uid']
        xlink_xpath = XPath('//*[@xl:href]')
        link_xpath = XPath('//h:a[@href]')
        link_replacer = create_link_replacer(self, link_uid, changed)

        link_to_map = self.book_render_data['link_to_map']

        for name, mime in iteritems(self.mime_map):
            mime = mime.lower()

            if mime in OEB_STYLES:
                replaceUrls(self.parsed(name), partial(link_replacer, name))
                self.virtualized_names.add(name)
                continue

            if mime in OEB_DOCS:
                self.virtualized_names.add(name)
                root = self.parsed(name)
                rewrite_links(root, partial(link_replacer, name))
                for anchor in link_xpath(root):
                    target = anchor.get('href')
                    if not target.startswith(link_uid):
                        # Not a virtualized link: open it in a new context.
                        anchor.set('target', '_blank')
                        anchor.set('rel', 'noopener noreferrer')
                        continue
                    anchor.set('href', 'javascript:void(0)')
                    decoded = decode_url(target.split('|')[1])
                    dest_name, dest_frag = decoded[0], decoded[1]
                    # link_to_map: destination name -> fragment -> set of
                    # names that link to it.
                    per_fragment = link_to_map.setdefault(dest_name, {})
                    per_fragment.setdefault(dest_frag or '', set()).add(name)
                    payload = {'name': dest_name, 'frag': dest_frag}
                    anchor.set('data-' + link_uid,
                               json.dumps(payload, ensure_ascii=False))
                continue

            if mime == 'image/svg+xml':
                self.virtualized_names.add(name)
                xlink_attr = XLINK('href')
                touched = False
                for node in xlink_xpath(self.parsed(name)):
                    ref = node.get(xlink_attr)
                    if ref.startswith('#'):
                        continue
                    node.set(xlink_attr, link_replacer(name, ref))
                    touched = True
                if touched:
                    changed.add(name)

        mark_dirty = self.dirty
        for changed_name in changed:
            mark_dirty(changed_name)
Example #4
0
    def virtualize_resources(self):
        """Rewrite internal links in stylesheets, documents and SVG images.

        Every entry of ``self.mime_map`` is dispatched on its MIME type:

        * ``OEB_STYLES``: URLs in the parsed sheet are passed through
          ``link_replacer`` via ``replaceUrls``.
        * ``OEB_DOCS``: links are rewritten via ``rewrite_links``. Each
          ``h:a`` whose href then starts with ``link_uid`` becomes a
          ``javascript:void(0)`` link with the encoded target stored in a
          ``data-<link_uid>`` attribute; every other anchor gets
          ``target="_blank"``.
        * ``image/svg+xml``: every ``xlink:href`` that does not start with
          ``#`` is passed through ``link_replacer``.

        Finally ``self.dirty`` is called for each name collected in
        ``changed``.
        """
        changed = set()
        link_uid = self.book_render_data['link_uid']
        resource_template = link_uid + '|{}|'
        xlink_xpath = XPath('//*[@xl:href]')
        link_xpath = XPath('//h:a[@href]')

        def link_replacer(base, url):
            """Return the virtualized form of ``url`` as found in ``base``.

            URLs with a netloc or query, with a scheme other than ``file``,
            with an empty or absolute path, or that ``href_to_name`` cannot
            resolve are returned unchanged. Whenever a URL is rewritten,
            ``base`` is added to ``changed``.
            """
            if url.startswith('#'):
                frag = urlunquote(url[1:])
                if not frag:
                    return url
                changed.add(base)
                return resource_template.format(encode_url(base, frag))
            purl = urlparse(url)
            if purl.netloc or purl.query:
                return url
            if purl.scheme and purl.scheme != 'file':
                return url
            if not purl.path or purl.path.startswith('/'):
                return url
            url, frag = purl.path, purl.fragment
            name = self.href_to_name(url, base)
            if name:
                frag = urlunquote(frag)
                url = resource_template.format(encode_url(name, frag))
                changed.add(base)
            return url

        for name, mt in self.mime_map.iteritems():
            if mt in OEB_STYLES:
                replaceUrls(self.parsed(name), partial(link_replacer, name))
            elif mt in OEB_DOCS:
                root = self.parsed(name)
                rewrite_links(root, partial(link_replacer, name))
                for a in link_xpath(root):
                    href = a.get('href')
                    if href.startswith(link_uid):
                        a.set('href', 'javascript:void(0)')
                        a.set('data-' + link_uid, href.split('|')[1])
                    else:
                        a.set('target', '_blank')
                    # Every anchor is modified one way or the other.
                    changed.add(name)
            elif mt == 'image/svg+xml':
                # ``changed`` must stay the set shared with link_replacer
                # and iterated below; rebinding it here (e.g. to a bool)
                # breaks both. link_replacer records the name itself
                # whenever it actually rewrites a URL.
                xlink = XLINK('href')
                for elem in xlink_xpath(self.parsed(name)):
                    href = elem.get(xlink)
                    # Leave same-document references (``#id``) untouched so
                    # they still refer to elements inside this SVG.
                    if not href.startswith('#'):
                        elem.set(xlink, link_replacer(name, href))

        for changed_name in changed:
            self.dirty(changed_name)
Example #5
0
def transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names):
    """Virtualize the external ``xlink:href`` references of one SVG image.

    Does nothing unless ``virtualize_resources`` is true. Otherwise every
    ``xlink:href`` in the parsed item that does not start with ``#`` is
    replaced by the result of the link replacer. If at least one reference
    was replaced, ``name`` is added to ``virtualized_names`` and the item is
    marked dirty and committed on ``container``.

    :return: ``None``.
    """
    if not virtualize_resources:
        return

    replace_link = create_link_replacer(container, link_uid, set())
    href_attr = XLINK('href')
    rewritten = False
    find_references = XPath('//*[@xl:href]')

    for node in find_references(container.parsed(name)):
        reference = node.get(href_attr)
        if reference.startswith('#'):
            # Same-document reference: leave it alone.
            continue
        node.set(href_attr, replace_link(name, reference))
        rewritten = True

    if not rewritten:
        return
    virtualized_names.add(name)
    container.dirty(name)
    container.commit_item(name)
Example #6
0
def find_cover_image_in_page(container, cover_page):
    """Return the name of the single image that makes up ``cover_page``.

    The page must have exactly one ``h:body``. All ``h:img`` elements with a
    ``src`` and all ``svg:image`` elements nested in an ``svg:svg`` below
    that body are collected, resolving each non-empty href with
    ``container.href_to_name`` relative to ``cover_page``.

    :return: The resolved name of the image when the body holds no
        non-whitespace text and at most one image reference; ``None`` when
        there is no unique body, when the page has text or more than one
        image, or when it has no image at all.
    """
    root = container.parsed(cover_page)
    bodies = XPath('//h:body')(root)
    if len(bodies) != 1:
        return None
    body = bodies[0]

    find_images = XPath('descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')
    # Lazy, so each href is resolved right after it is read.
    hrefs = (
        node.get('src') or node.get(XLINK('href'))
        for node in find_images(body)
    )
    images = [
        container.href_to_name(href, base=cover_page)
        for href in hrefs if href
    ]

    visible_text = re.sub(r'\s+', '', xml2text(body))
    if visible_text or len(images) > 1:
        # Document has more content than a single image
        return None
    return images[0] if images else None
Example #7
0
def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
    """Rewrite the links of one HTML item so they use virtualized targets.

    ``name`` is added to ``virtualized_names`` and the links of the parsed
    item are rewritten with ``rewrite_links``. Afterwards every ``h:a`` with
    an ``href`` and every ``svg:a`` (via ``xlink:href``) is post-processed:

    * a link whose value starts with ``link_uid`` becomes
      ``javascript:void(0)``, gets a ``data-<link_uid>`` attribute holding
      the decoded target name and fragment as JSON, and is recorded in
      ``link_to_map``;
    * any other non-empty link gets ``target="_blank"`` and
      ``rel="noopener noreferrer"``.

    :return: ``True`` if the link replacer recorded ``name`` as changed.
    """
    changed = set()
    anchors_xpath = XPath('//h:a[@href]')
    svg_anchors_xpath = XPath('//svg:a')
    replace_link = create_link_replacer(container, link_uid, changed)

    virtualized_names.add(name)
    root = container.parsed(name)
    rewrite_links(root, partial(replace_link, name))

    def virtualize_anchor(anchor, attr='href'):
        target = anchor.get(attr) or ''
        if target.startswith(link_uid):
            anchor.set(attr, 'javascript:void(0)')
            decoded = decode_url(target.split('|')[1])
            dest_name, dest_frag = decoded[0], decoded[1]
            # link_to_map: destination name -> fragment -> linking names.
            per_fragment = link_to_map.setdefault(dest_name, {})
            per_fragment.setdefault(dest_frag or '', set()).add(name)
            payload = {'name': dest_name, 'frag': dest_frag}
            anchor.set('data-' + link_uid,
                       json.dumps(payload, ensure_ascii=False))
            return
        if target:
            anchor.set('target', '_blank')
            anchor.set('rel', 'noopener noreferrer')

    for anchor in anchors_xpath(root):
        virtualize_anchor(anchor)
    svg_href = XLINK('href')
    for anchor in svg_anchors_xpath(root):
        virtualize_anchor(anchor, svg_href)

    return name in changed
Example #8
0
    def virtualize_resources(self):
        """Rewrite internal links in stylesheets, documents and SVG images.

        Every entry of ``self.mime_map`` is dispatched on its lower-cased
        MIME type and its name is added to ``self.virtualized_names``:

        * ``OEB_STYLES``: URLs in the parsed sheet are passed through
          ``link_replacer`` via ``replaceUrls``.
        * ``OEB_DOCS``: ``h:link`` elements that are not ``text/css``
          stylesheets have their attributes cleared, links are rewritten via
          ``rewrite_links``, and each ``h:a`` is post-processed. One whose
          href starts with ``link_uid`` becomes ``javascript:void(0)``, gets
          a JSON ``data-<link_uid>`` attribute and is recorded in
          ``book_render_data['link_to_map']``. Any other anchor gets
          ``target="_blank"`` and ``rel="noopener noreferrer"``.
        * ``image/svg+xml``: every ``xlink:href`` that does not start with
          ``#`` is passed through ``link_replacer``.

        Afterwards the sets stored in ``link_to_map`` are converted to tuples
        and ``self.dirty`` is called for each name collected in ``changed``.
        """
        changed = set()
        link_uid = self.book_render_data['link_uid']
        resource_template = link_uid + '|{}|'
        xlink_xpath = XPath('//*[@xl:href]')
        link_xpath = XPath('//h:a[@href]')
        res_link_xpath = XPath('//h:link[@href]')

        def link_replacer(base, url):
            """Return the virtualized form of ``url`` as found in ``base``.

            URLs with a netloc or query, with a scheme other than ``file``,
            with an empty or absolute path, or that ``href_to_name`` cannot
            resolve are returned unchanged. A resolved name that
            ``has_name`` rejects yields a ``missing:`` URL. Whenever a URL
            is rewritten, ``base`` is added to ``changed``.
            """
            if url.startswith('#'):
                frag = urlunquote(url[1:])
                if not frag:
                    return url
                changed.add(base)
                return resource_template.format(encode_url(base, frag))
            purl = urlparse(url)
            if purl.netloc or purl.query:
                return url
            if purl.scheme and purl.scheme != 'file':
                return url
            if not purl.path or purl.path.startswith('/'):
                return url
            url, frag = purl.path, purl.fragment
            name = self.href_to_name(url, base)
            if name:
                if self.has_name(name):
                    frag = urlunquote(frag)
                    url = resource_template.format(encode_url(name, frag))
                else:
                    if isinstance(name, unicode):
                        name = name.encode('utf-8')
                    url = 'missing:' + force_unicode(quote(name), 'utf-8')
                changed.add(base)
            return url

        ltm = self.book_render_data['link_to_map']

        for name, mt in self.mime_map.iteritems():
            mt = mt.lower()
            if mt in OEB_STYLES:
                replaceUrls(self.parsed(name), partial(link_replacer, name))
                self.virtualized_names.add(name)
            elif mt in OEB_DOCS:
                self.virtualized_names.add(name)
                root = self.parsed(name)
                for link in res_link_xpath(root):
                    ltype = (link.get('type') or 'text/css').lower()
                    rel = (link.get('rel') or 'stylesheet').lower()
                    if ltype != 'text/css' or rel != 'stylesheet':
                        # This link will not be loaded by the browser anyway
                        # and will cause the resource load check to hang
                        link.attrib.clear()
                        changed.add(name)
                rewrite_links(root, partial(link_replacer, name))
                for a in link_xpath(root):
                    href = a.get('href')
                    if href.startswith(link_uid):
                        a.set('href', 'javascript:void(0)')
                        parts = decode_url(href.split('|')[1])
                        lname, lfrag = parts[0], parts[1]
                        # ltm: destination name -> fragment -> linking names
                        ltm.setdefault(lname,
                                       {}).setdefault(lfrag or '',
                                                      set()).add(name)
                        a.set(
                            'data-' + link_uid,
                            json.dumps({
                                'name': lname,
                                'frag': lfrag
                            },
                                       ensure_ascii=False))
                    else:
                        a.set('target', '_blank')
                        a.set('rel', 'noopener noreferrer')
                    # Every anchor is modified one way or the other.
                    changed.add(name)
            elif mt == 'image/svg+xml':
                self.virtualized_names.add(name)
                changed.add(name)
                xlink = XLINK('href')
                for elem in xlink_xpath(self.parsed(name)):
                    href = elem.get(xlink)
                    # Leave same-document references (``#id``) untouched so
                    # they still refer to elements inside this SVG.
                    if not href.startswith('#'):
                        elem.set(xlink, link_replacer(name, href))

        for name, amap in ltm.iteritems():
            for k, v in tuple(amap.iteritems()):
                amap[k] = tuple(v)  # needed for JSON serialization

        for changed_name in changed:
            self.dirty(changed_name)
Example #9
0
def transform_html(container, name, virtualize_resources, link_uid,
                   link_to_map, virtualized_names):
    """Transform one HTML item and write it back serialized by html_as_json.

    Steps, in order:

    1. Each ``h:img`` whose ``src`` resolves to a name gets that name in a
       ``data-calibre-src`` attribute.
    2. Each ``h:link`` that is not a ``text/css`` stylesheet has all of its
       attributes cleared.
    3. Inline ``<style>`` sheets and ``style`` attributes are transformed
       through ``transform_inline_styles``; when ``virtualize_resources`` is
       true the sheet URLs also go through the link replacer.
    4. With ``virtualize_resources`` the item's links are handled by
       ``virtualize_html``. Without it, ``h:a`` and ``svg:a`` links that the
       link replacer maps to a ``link_uid`` URL become
       ``javascript:void(0)`` links with a JSON ``data-<link_uid>``
       attribute and are recorded in ``link_to_map``.
    5. The result of ``html_as_json(root)`` is written to the item opened
       with ``container.open(name, 'wb')``.
    """
    anchors_xpath = XPath('//h:a[@href]')
    svg_anchors_xpath = XPath('//svg:a')
    images_xpath = XPath('//h:img[@src]')
    head_links_xpath = XPath('//h:link[@href]')
    root = container.parsed(name)
    changed_names = set()
    replace_link = create_link_replacer(container, link_uid, changed_names)

    # Used for viewing images
    for image in images_xpath(root):
        resolved = container.href_to_name(image.get('src'), name)
        if resolved:
            image.set('data-calibre-src', resolved)

    # Disable non-stylesheet link tags. This link will not be loaded by the
    # browser anyway and will causes the resource load check to hang
    for head_link in head_links_xpath(root):
        link_type = (head_link.get('type') or 'text/css').lower()
        relation = (head_link.get('rel') or 'stylesheet').lower()
        is_stylesheet = link_type == 'text/css' and relation == 'stylesheet'
        if not is_stylesheet:
            head_link.attrib.clear()

    def transform_and_virtualize_sheet(sheet):
        modified = transform_sheet(sheet)
        if not virtualize_resources:
            return modified
        replaceUrls(sheet, partial(replace_link, name))
        if name in changed_names:
            virtualized_names.add(name)
            return True
        return modified

    # Transform <style> and style=""
    transform_inline_styles(container,
                            name,
                            transform_sheet=transform_and_virtualize_sheet,
                            transform_style=transform_declaration)

    if virtualize_resources:
        virtualize_html(container, name, link_uid, link_to_map,
                        virtualized_names)
    else:

        def record_link(anchor, attr='href'):
            target = anchor.get(attr)
            if not target:
                return
            target = replace_link(name, target)
            if not target or not target.startswith(link_uid):
                return
            anchor.set(attr, 'javascript:void(0)')
            decoded = decode_url(target.split('|')[1])
            dest_name, dest_frag = decoded[0], decoded[1]
            # link_to_map: destination name -> fragment -> linking names.
            per_fragment = link_to_map.setdefault(dest_name, {})
            per_fragment.setdefault(dest_frag or '', set()).add(name)
            payload = {'name': dest_name, 'frag': dest_frag}
            anchor.set('data-' + link_uid,
                       json.dumps(payload, ensure_ascii=False))

        for anchor in anchors_xpath(root):
            record_link(anchor)
        svg_href = XLINK('href')
        for anchor in svg_anchors_xpath(root):
            record_link(anchor, svg_href)

    serialized = html_as_json(root)
    with container.open(name, 'wb') as stream:
        stream.write(serialized)