def extract_cover_from_embedded_svg(html, base, log):
    """Return the bytes of the image wrapped by the page's sole SVG element.

    The markup in ``html`` is parsed and searched for ``svg:svg`` elements.
    Only when there is exactly one such element, and it has exactly one
    child which is an SVG ``image``, is the image's ``xlink:href`` resolved
    against ``base`` (href components are split on ``/``) and the file read.

    :param html: markup to parse (passed straight to ``etree.fromstring``).
    :param base: directory against which the image href is resolved.
    :param log: unused by this function; kept for interface compatibility.
    :return: the raw bytes of the referenced file, or ``None`` when the
        document does not match the pattern, the image has no href, or the
        file is not readable.
    """
    from lxml import etree
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK

    root = etree.fromstring(html)
    svg = XPath('//svg:svg')(root)
    if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):
        image = svg[0][0]
        href = image.get(XLINK('href'), None)
        # Check href *before* splitting it: an <image> without xlink:href
        # yields None, and None.split() would raise AttributeError.
        if not href:
            return None
        path = os.path.join(base, *href.split('/'))
        if os.access(path, os.R_OK):
            # Context manager so the file handle is closed deterministically.
            with open(path, 'rb') as f:
                return f.read()
    return None
def extract_cover_from_embedded_svg(html, base, log):
    """Extract the cover from a page that consists of one SVG-wrapped image.

    ``html`` is parsed with ``safe_xml_fromstring``. If the document holds
    exactly one ``svg:svg`` element whose only child is an SVG ``image``
    carrying a non-empty ``xlink:href``, that href is joined onto ``base``
    and the result of ``return_raster_image`` for the path is returned.
    In every other case the function returns ``None``. ``log`` is unused.
    """
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    from calibre.utils.xml_parse import safe_xml_fromstring

    svg_nodes = XPath('//svg:svg')(safe_xml_fromstring(html))
    if len(svg_nodes) != 1:
        return None
    wrapper = svg_nodes[0]
    if len(wrapper) != 1:
        return None
    only_child = wrapper[0]
    if only_child.tag != SVG('image'):
        return None

    href = only_child.get(XLINK('href'), None)
    if not href:
        return None
    image_path = os.path.join(base, *href.split('/'))
    return return_raster_image(image_path)
def virtualize_resources(self):
    """Rewrite links in stylesheets, HTML documents and SVG images.

    Every entry of ``self.mime_map`` is visited. Stylesheets and HTML
    documents have their URLs passed through the replacer returned by
    ``create_link_replacer`` (defined elsewhere). HTML anchors whose
    rewritten href starts with the book's ``link_uid`` are turned into
    ``javascript:void(0)`` links carrying a JSON ``data-<link_uid>``
    attribute and are recorded in ``book_render_data['link_to_map']``;
    all other anchors get ``target="_blank"`` and
    ``rel="noopener noreferrer"``. SVG files have their non-fragment
    ``xlink:href`` values rewritten. Finally ``self.dirty`` is called for
    every name collected in the ``changed`` set.
    """
    changed = set()
    link_uid = self.book_render_data['link_uid']
    xlink_xpath = XPath('//*[@xl:href]')
    link_xpath = XPath('//h:a[@href]')
    link_replacer = create_link_replacer(self, link_uid, changed)
    ltm = self.book_render_data['link_to_map']

    def virtualize_anchor(anchor, doc_name):
        # Anchors not pointing at a virtualized resource open externally.
        target = anchor.get('href')
        if not target.startswith(link_uid):
            anchor.set('target', '_blank')
            anchor.set('rel', 'noopener noreferrer')
            return
        anchor.set('href', 'javascript:void(0)')
        decoded = decode_url(target.split('|')[1])
        dest_name, dest_frag = decoded[0], decoded[1]
        # Record that doc_name links to (dest_name, dest_frag).
        by_frag = ltm.setdefault(dest_name, {})
        by_frag.setdefault(dest_frag or '', set()).add(doc_name)
        payload = json.dumps({
            'name': dest_name,
            'frag': dest_frag
        }, ensure_ascii=False)
        anchor.set('data-' + link_uid, payload)

    for name, mt in iteritems(self.mime_map):
        mt = mt.lower()

        if mt in OEB_STYLES:
            replaceUrls(self.parsed(name), partial(link_replacer, name))
            self.virtualized_names.add(name)
            continue

        if mt in OEB_DOCS:
            self.virtualized_names.add(name)
            root = self.parsed(name)
            rewrite_links(root, partial(link_replacer, name))
            for anchor in link_xpath(root):
                virtualize_anchor(anchor, name)
            continue

        if mt == 'image/svg+xml':
            self.virtualized_names.add(name)
            xlink = XLINK('href')
            altered = False
            for elem in xlink_xpath(self.parsed(name)):
                href = elem.get(xlink)
                if href.startswith('#'):
                    # Pure in-document fragment references are left alone.
                    continue
                elem.set(xlink, link_replacer(name, href))
                altered = True
            if altered:
                changed.add(name)

    for changed_name in changed:
        self.dirty(changed_name)
def virtualize_resources(self):
    """Rewrite intra-book links so they point at virtualized resources.

    For every entry of ``self.mime_map``:

    * stylesheets have their URLs passed through ``link_replacer``;
    * HTML documents have their links rewritten; each ``<a href>`` whose
      rewritten href starts with ``link_uid`` becomes a
      ``javascript:void(0)`` link with the encoded target stored in a
      ``data-<link_uid>`` attribute, while every other anchor gets
      ``target="_blank"``;
    * SVG images have every ``xlink:href`` passed through
      ``link_replacer``.

    ``self.dirty`` is then called for each name recorded in ``changed``.
    """
    changed = set()
    link_uid = self.book_render_data['link_uid']
    resource_template = link_uid + '|{}|'
    xlink_xpath = XPath('//*[@xl:href]')
    link_xpath = XPath('//h:a[@href]')

    def link_replacer(base, url):
        """Return the virtualized form of ``url`` found in file ``base``.

        URLs that are empty fragments, have a netloc or query, use a
        non-``file`` scheme, have no path, or have an absolute path are
        returned unchanged. ``base`` is added to ``changed`` whenever the
        URL is rewritten.
        """
        if url.startswith('#'):
            frag = urlunquote(url[1:])
            if not frag:
                return url
            changed.add(base)
            return resource_template.format(encode_url(base, frag))
        purl = urlparse(url)
        if purl.netloc or purl.query:
            return url
        if purl.scheme and purl.scheme != 'file':
            return url
        if not purl.path or purl.path.startswith('/'):
            return url
        url, frag = purl.path, purl.fragment
        name = self.href_to_name(url, base)
        if name:
            frag = urlunquote(frag)
            url = resource_template.format(encode_url(name, frag))
            changed.add(base)
        return url

    for name, mt in self.mime_map.iteritems():
        if mt in OEB_STYLES:
            replaceUrls(self.parsed(name), partial(link_replacer, name))
        elif mt in OEB_DOCS:
            root = self.parsed(name)
            rewrite_links(root, partial(link_replacer, name))
            for a in link_xpath(root):
                href = a.get('href')
                if href.startswith(link_uid):
                    a.set('href', 'javascript:void(0)')
                    a.set('data-' + link_uid, href.split('|')[1])
                else:
                    a.set('target', '_blank')
                changed.add(name)
        elif mt == 'image/svg+xml':
            # Do NOT rebind ``changed`` here: it is the set shared with
            # link_replacer and iterated below. Assigning a bool to it
            # made later ``changed.add`` calls and the final ``map`` fail.
            # link_replacer itself records ``name`` when it rewrites a URL.
            xlink = XLINK('href')
            for elem in xlink_xpath(self.parsed(name)):
                elem.set(xlink, link_replacer(name, elem.get(xlink)))

    tuple(map(self.dirty, changed))
def transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names):
    """Virtualize the ``xlink:href`` references inside one SVG file.

    Does nothing when ``virtualize_resources`` is falsy. Otherwise every
    element with an ``xl:href`` attribute whose value does not start with
    ``#`` has that value replaced by the output of the link replacer. If at
    least one attribute was rewritten, ``name`` is added to
    ``virtualized_names`` and the item is marked dirty and committed on
    ``container``.
    """
    if virtualize_resources:
        replace = create_link_replacer(container, link_uid, set())
        href_attr = XLINK('href')
        rewritten = 0
        find_linked = XPath('//*[@xl:href]')
        for node in find_linked(container.parsed(name)):
            target = node.get(href_attr)
            if target.startswith('#'):
                # Fragment-only references stay inside the same file.
                continue
            node.set(href_attr, replace(name, target))
            rewritten += 1
        if rewritten:
            virtualized_names.add(name)
            container.dirty(name)
            container.commit_item(name)
def find_cover_image_in_page(container, cover_page):
    """Return the name of the single image that makes up ``cover_page``.

    The page must have exactly one ``<body>``. All ``<img src>`` elements
    and SVG ``<image>`` elements below it are collected (resolved to
    container names via ``container.href_to_name``). ``None`` is returned
    when the body contains any non-whitespace text, more than one image,
    or no image at all.
    """
    bodies = XPath('//h:body')(container.parsed(cover_page))
    if len(bodies) != 1:
        return
    body = bodies[0]

    image_nodes = XPath('descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')(body)
    hrefs = (node.get('src') or node.get(XLINK('href')) for node in image_nodes)
    images = [
        container.href_to_name(href, base=cover_page)
        for href in hrefs if href
    ]

    visible_text = re.sub(r'\s+', '', xml2text(body))
    if visible_text or len(images) > 1:
        # The page carries more than just one image, so it is not a
        # plain cover wrapper.
        return
    return images[0] if images else None
def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
    """Virtualize the links of one HTML file and report whether it changed.

    ``name`` is added to ``virtualized_names`` and all links in the parsed
    document are rewritten through the link replacer. Afterwards every
    ``<a href>`` and every SVG ``<a>`` (via ``xlink:href``) is processed:
    links starting with ``link_uid`` become ``javascript:void(0)`` with a
    JSON ``data-<link_uid>`` attribute and an entry in ``link_to_map``;
    other non-empty links get ``target="_blank"`` and
    ``rel="noopener noreferrer"``.

    Returns ``True`` when the link replacer recorded ``name`` as changed.
    """
    changed = set()
    find_html_anchors = XPath('//h:a[@href]')
    find_svg_anchors = XPath('//svg:a')
    replace = create_link_replacer(container, link_uid, changed)
    virtualized_names.add(name)
    root = container.parsed(name)
    rewrite_links(root, partial(replace, name))

    def handle_link(anchor, attr='href'):
        target = anchor.get(attr) or ''
        if not target.startswith(link_uid):
            if target:
                # External (non-virtualized) link.
                anchor.set('target', '_blank')
                anchor.set('rel', 'noopener noreferrer')
            return
        anchor.set(attr, 'javascript:void(0)')
        decoded = decode_url(target.split('|')[1])
        dest_name, dest_frag = decoded[0], decoded[1]
        sources = link_to_map.setdefault(dest_name, {}).setdefault(dest_frag or '', set())
        sources.add(name)
        payload = json.dumps({
            'name': dest_name,
            'frag': dest_frag
        }, ensure_ascii=False)
        anchor.set('data-' + link_uid, payload)

    for anchor in find_html_anchors(root):
        handle_link(anchor)
    xhref = XLINK('href')
    for anchor in find_svg_anchors(root):
        handle_link(anchor, xhref)

    was_changed = name in changed
    return was_changed
def virtualize_resources(self):
    """Rewrite intra-book links so they point at virtualized resources.

    For every entry of ``self.mime_map`` (MIME type compared lowercased):

    * stylesheets have their URLs passed through ``link_replacer``;
    * HTML documents first have every ``<link href>`` that is not a
      ``text/css`` stylesheet stripped of its attributes, then have their
      links rewritten. Anchors whose rewritten href starts with
      ``link_uid`` become ``javascript:void(0)`` links with a JSON
      ``data-<link_uid>`` attribute and are recorded in
      ``book_render_data['link_to_map']``; all other anchors get
      ``target="_blank"`` and ``rel="noopener noreferrer"``;
    * SVG images have every ``xlink:href`` passed through
      ``link_replacer``.

    Afterwards the sets stored in the link map are converted to tuples
    and ``self.dirty`` is called for each name recorded in ``changed``.
    """
    changed = set()
    link_uid = self.book_render_data['link_uid']
    resource_template = link_uid + '|{}|'
    xlink_xpath = XPath('//*[@xl:href]')
    link_xpath = XPath('//h:a[@href]')
    res_link_xpath = XPath('//h:link[@href]')

    def link_replacer(base, url):
        """Return the virtualized form of ``url`` found in file ``base``.

        URLs that are empty fragments, have a netloc or query, use a
        non-``file`` scheme, have no path, or have an absolute path are
        returned unchanged. Links that resolve to a name the container
        does not have are turned into ``missing:`` URLs. ``base`` is
        added to ``changed`` whenever the URL is rewritten.
        """
        if url.startswith('#'):
            frag = urlunquote(url[1:])
            if not frag:
                return url
            changed.add(base)
            return resource_template.format(encode_url(base, frag))
        purl = urlparse(url)
        if purl.netloc or purl.query:
            return url
        if purl.scheme and purl.scheme != 'file':
            return url
        if not purl.path or purl.path.startswith('/'):
            return url
        url, frag = purl.path, purl.fragment
        name = self.href_to_name(url, base)
        if name:
            if self.has_name(name):
                frag = urlunquote(frag)
                url = resource_template.format(encode_url(name, frag))
            else:
                if isinstance(name, unicode):
                    name = name.encode('utf-8')
                url = 'missing:' + force_unicode(quote(name), 'utf-8')
            changed.add(base)
        return url

    ltm = self.book_render_data['link_to_map']

    for name, mt in self.mime_map.iteritems():
        mt = mt.lower()
        if mt in OEB_STYLES:
            replaceUrls(self.parsed(name), partial(link_replacer, name))
            self.virtualized_names.add(name)
        elif mt in OEB_DOCS:
            self.virtualized_names.add(name)
            root = self.parsed(name)
            for link in res_link_xpath(root):
                ltype = (link.get('type') or 'text/css').lower()
                rel = (link.get('rel') or 'stylesheet').lower()
                if ltype != 'text/css' or rel != 'stylesheet':
                    # This link will not be loaded by the browser anyway
                    # and will cause the resource load check to hang
                    link.attrib.clear()
                    changed.add(name)
            rewrite_links(root, partial(link_replacer, name))
            for a in link_xpath(root):
                href = a.get('href')
                if href.startswith(link_uid):
                    a.set('href', 'javascript:void(0)')
                    parts = decode_url(href.split('|')[1])
                    lname, lfrag = parts[0], parts[1]
                    ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
                    a.set(
                        'data-' + link_uid, json.dumps({
                            'name': lname,
                            'frag': lfrag
                        }, ensure_ascii=False))
                else:
                    a.set('target', '_blank')
                    # Must be a single-line literal: a line break inside
                    # the quotes is a syntax error and would corrupt the
                    # rel value.
                    a.set('rel', 'noopener noreferrer')
                changed.add(name)
        elif mt == 'image/svg+xml':
            self.virtualized_names.add(name)
            changed.add(name)
            xlink = XLINK('href')
            for elem in xlink_xpath(self.parsed(name)):
                elem.set(xlink, link_replacer(name, elem.get(xlink)))

    for name, amap in ltm.iteritems():
        # Iterate over a snapshot because the values are replaced in place.
        for k, v in tuple(amap.iteritems()):
            amap[k] = tuple(v)  # needed for JSON serialization

    tuple(map(self.dirty, changed))
def transform_html(container, name, virtualize_resources, link_uid, link_to_map, virtualized_names):
    """Prepare one HTML file for the viewer and write it back as JSON.

    Steps, in order:

    1. Each ``<img src>`` that resolves to a container name gets a
       ``data-calibre-src`` attribute holding that name.
    2. Each ``<link href>`` that is not a ``text/css`` stylesheet has all
       of its attributes cleared.
    3. Inline ``<style>`` sheets and ``style=""`` declarations are
       transformed; when ``virtualize_resources`` is set, sheet URLs are
       also passed through the link replacer.
    4. Links are either fully virtualized via ``virtualize_html`` or,
       when ``virtualize_resources`` is falsy, only internal anchors are
       converted and recorded in ``link_to_map``.
    5. The tree is serialized with ``html_as_json`` and written to
       ``name`` in the container.
    """
    find_anchors = XPath('//h:a[@href]')
    find_svg_anchors = XPath('//svg:a')
    find_images = XPath('//h:img[@src]')
    find_res_links = XPath('//h:link[@href]')
    root = container.parsed(name)
    rewritten_names = set()
    replace_link = create_link_replacer(container, link_uid, rewritten_names)

    # Used for viewing images
    for img in find_images(root):
        resolved = container.href_to_name(img.get('src'), name)
        if resolved:
            img.set('data-calibre-src', resolved)

    # Disable non-stylesheet link tags. Such a link will not be loaded by
    # the browser anyway and will cause the resource load check to hang
    for link in find_res_links(root):
        link_type = (link.get('type') or 'text/css').lower()
        link_rel = (link.get('rel') or 'stylesheet').lower()
        if (link_type, link_rel) != ('text/css', 'stylesheet'):
            link.attrib.clear()

    def transform_and_virtualize_sheet(sheet):
        sheet_changed = transform_sheet(sheet)
        if not virtualize_resources:
            return sheet_changed
        replaceUrls(sheet, partial(replace_link, name))
        if name in rewritten_names:
            virtualized_names.add(name)
            return True
        return sheet_changed

    # Transform <style> and style=""
    transform_inline_styles(
        container, name,
        transform_sheet=transform_and_virtualize_sheet,
        transform_style=transform_declaration)

    def record_internal_link(anchor, attr='href'):
        # Used only when resources are not virtualized: anchors that the
        # replacer maps to an in-book target are converted, everything
        # else is left untouched.
        target = anchor.get(attr)
        if not target:
            return
        target = replace_link(name, target)
        if not (target and target.startswith(link_uid)):
            return
        anchor.set(attr, 'javascript:void(0)')
        decoded = decode_url(target.split('|')[1])
        dest_name, dest_frag = decoded[0], decoded[1]
        sources = link_to_map.setdefault(dest_name, {}).setdefault(dest_frag or '', set())
        sources.add(name)
        payload = json.dumps({
            'name': dest_name,
            'frag': dest_frag
        }, ensure_ascii=False)
        anchor.set('data-' + link_uid, payload)

    if virtualize_resources:
        virtualize_html(container, name, link_uid, link_to_map, virtualized_names)
    else:
        for anchor in find_anchors(root):
            record_internal_link(anchor)
        xhref = XLINK('href')
        for anchor in find_svg_anchors(root):
            record_internal_link(anchor, xhref)

    # Serialize before opening the output so a serialization failure
    # cannot leave a truncated file behind.
    serialized = html_as_json(root)
    with container.open(name, 'wb') as out:
        out.write(serialized)