def remove_old_cover(self, cover_item, new_cover_href=None):
    """Remove ``cover_item`` from the book and fix documents that used it.

    ``cover_item`` is removed from the manifest. Every ``<img src>`` and
    ``<svg:image xlink:href>`` in the spine that resolves to
    ``cover_item.href`` is then either re-pointed at ``new_cover_href``
    (made relative to the containing document) or, when ``new_cover_href``
    is ``None``, removed from its document. When the image's parent tag
    ends with ``}svg`` the parent element is removed instead of the image.

    Documents that had an image removed and are left with no text and no
    ``<img>``/``<svg>`` elements are dropped from the spine, the manifest
    and the guide.
    """
    from calibre.ebooks.oeb.base import XPath, XLINK
    from lxml import etree

    self.oeb.manifest.remove(cover_item)

    # Pass 1: rewrite or strip every reference to the old cover.
    find_images = XPath('//h:img[@src]|//svg:image[@xl:href]')
    stripped_pages = set()
    for page in self.oeb.spine:
        try:
            candidates = find_images(page.data)
        except Exception:
            candidates = ()
        stripped = False
        for node in candidates:
            target = node.get('src') or node.get(XLINK('href'))
            try:
                target = page.abshref(target)
            except Exception:
                # Invalid URL, ignore
                continue
            if target != cover_item.href:
                continue
            if new_cover_href is None:
                parent = node.getparent()
                if parent.tag.endswith('}svg'):
                    parent.getparent().remove(parent)
                else:
                    parent.remove(node)
                stripped = True
            else:
                replacement_href = page.relhref(new_cover_href)
                if node.tag.endswith('img'):
                    attr = 'src'
                else:
                    attr = XLINK('href')
                node.set(attr, replacement_href)
        if stripped:
            stripped_pages.add(page)

    # Pass 2: drop pages that were nothing but a wrapper for the cover.
    for page in stripped_pages:
        bodies = XPath('//h:body')(page.data)
        text = ''
        if bodies:
            text = etree.tostring(bodies[0], method='text', encoding='unicode')
        if re.sub(r'\s+', '', text):
            continue
        if XPath('//h:img|//svg:svg')(page.data):
            continue
        self.log('Removing %s as it is a wrapper around'
                 ' the cover image' % page.href)
        self.oeb.spine.remove(page)
        self.oeb.manifest.remove(page)
        self.oeb.guide.remove_by_href(page.href)
def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):
    """Rewrite the links of the HTML document ``name`` in place.

    All links in the parsed document are first passed through the replacer
    built by ``create_link_replacer``. Afterwards every ``<a href>`` and
    ``<svg:a>`` is post-processed:

    * a link whose value starts with ``link_uid`` is neutralised to
      ``javascript:void(0)``; its decoded target name and fragment are
      stored as JSON in a ``data-<link_uid>`` attribute and recorded in
      ``link_to_map[target][fragment]`` as a set of referring names;
    * any other non-empty link gets ``target="_blank"`` and
      ``rel="noopener noreferrer"``.

    ``name`` is always added to ``virtualized_names``. Returns whether
    ``name`` ended up in the set handed to ``create_link_replacer``
    (presumably meaning the replacer rewrote at least one link).
    """
    touched = set()
    find_html_anchors = XPath('//h:a[@href]')
    find_svg_anchors = XPath('//svg:a')
    replacer = create_link_replacer(container, link_uid, touched)

    virtualized_names.add(name)
    root = container.parsed(name)
    rewrite_links(root, partial(replacer, name))

    def annotate_anchor(anchor, attr='href'):
        value = anchor.get(attr) or ''
        if value.startswith(link_uid):
            # Internal link: disable navigation and expose the target as data.
            anchor.set(attr, 'javascript:void(0)')
            decoded = decode_url(value.split('|')[1])
            lname, lfrag = decoded[0], decoded[1]
            referrers = link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set())
            referrers.add(name)
            payload = json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False)
            anchor.set('data-' + link_uid, payload)
            return
        if value:
            anchor.set('target', '_blank')
            anchor.set('rel', 'noopener noreferrer')

    for anchor in find_html_anchors(root):
        annotate_anchor(anchor)
    svg_href = XLINK('href')
    for anchor in find_svg_anchors(root):
        annotate_anchor(anchor, svg_href)

    return name in touched
def map_resources(self, oeb_book):
    """Assign output names to images and link ids to spine documents.

    For every manifest item whose media type is in ``OEB_IMAGES`` and that
    is not yet in ``self.images``, a file name of the form
    ``<running index><original extension>``, zero-padded to 10 characters,
    is stored in ``self.images`` keyed by the item's href.

    For every manifest item that is also in the spine, a link id is
    requested for the item itself, and for each link-carrying attribute
    found under its ``<body>`` whose target (without fragment) is in
    ``self.base_hrefs``.

    :param oeb_book: book object exposing ``manifest`` and ``spine``.
    """
    # The set of link-carrying attribute names does not depend on the item,
    # so build it once instead of once per spine document.
    link_attrs = set(html.defs.link_attrs)
    link_attrs.add(XLINK('href'))

    for item in oeb_book.manifest:
        if item.media_type in OEB_IMAGES and item.href not in self.images:
            ext = os.path.splitext(item.href)[1]
            fname = '%s%s' % (len(self.images), ext)
            self.images[item.href] = fname.zfill(10)

        if item not in oeb_book.spine:
            continue

        self.get_link_id(item.href)
        body = item.data.find(XHTML('body'))
        if body is None:
            # A document without a <body> has no links to scan; previously
            # this crashed with AttributeError on ``None.iter()``.
            continue

        for el in body.iter():
            try:
                # Presumably skips comments / processing instructions, whose
                # ``tag`` is not a string in lxml -- TODO confirm.
                if not isinstance(el.tag, basestring):
                    continue
            except Exception:
                # Kept as best-effort: a node whose tag cannot be inspected
                # is ignored rather than aborting the whole scan.
                continue
            attribs = el.attrib
            for attr in attribs:
                if attr not in link_attrs:
                    continue
                href, frag = urldefrag(item.abshref(attribs[attr]))
                if href in self.base_hrefs:
                    self.get_link_id(href, frag)
def transform_html(container, name, virtualize_resources, link_uid, link_to_map, virtualized_names):
    """Prepare the HTML document ``name`` for rendering and write it back.

    Steps, in order:

    1. Every ``<img src>`` that resolves to a container name gets that name
       in a ``data-calibre-src`` attribute (used for viewing images).
    2. Every ``<link href>`` that is not a ``text/css`` stylesheet has all
       of its attributes cleared.
    3. ``<style>`` elements and ``style=""`` attributes are transformed; when
       ``virtualize_resources`` is true their URLs are also passed through
       the link replacer and ``name`` is added to ``virtualized_names`` if
       the replacer recorded a change for it.
    4. Links are handled: via ``virtualize_html`` when
       ``virtualize_resources`` is true, otherwise only links the replacer
       maps to a ``link_uid`` value are neutralised and recorded in
       ``link_to_map``.
    5. The tree is serialised with ``html_as_json`` and written to ``name``
       in the container.
    """
    find_anchors = XPath('//h:a[@href]')
    find_svg_anchors = XPath('//svg:a')
    find_images = XPath('//h:img[@src]')
    find_head_links = XPath('//h:link[@href]')
    root = container.parsed(name)
    changed_names = set()
    link_replacer = create_link_replacer(container, link_uid, changed_names)

    # Used for viewing images
    for image in find_images(root):
        resolved = container.href_to_name(image.get('src'), name)
        if resolved:
            image.set('data-calibre-src', resolved)

    # Disable <link> tags that are not stylesheets: the browser will not
    # load them anyway and they cause the resource load check to hang.
    for node in find_head_links(root):
        mime = (node.get('type') or 'text/css').lower()
        relation = (node.get('rel') or 'stylesheet').lower()
        if not (mime == 'text/css' and relation == 'stylesheet'):
            node.attrib.clear()

    def transform_and_virtualize_sheet(sheet):
        changed = transform_sheet(sheet)
        if not virtualize_resources:
            return changed
        replaceUrls(sheet, partial(link_replacer, name))
        if name in changed_names:
            virtualized_names.add(name)
            changed = True
        return changed

    # Transform <style> and style=""
    transform_inline_styles(
        container, name,
        transform_sheet=transform_and_virtualize_sheet,
        transform_style=transform_declaration)

    if not virtualize_resources:

        def neutralise_internal_link(anchor, attr='href'):
            original = anchor.get(attr)
            if not original:
                return
            rewritten = link_replacer(name, original)
            if not rewritten or not rewritten.startswith(link_uid):
                return
            anchor.set(attr, 'javascript:void(0)')
            decoded = decode_url(rewritten.split('|')[1])
            lname, lfrag = decoded[0], decoded[1]
            referrers = link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set())
            referrers.add(name)
            payload = json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False)
            anchor.set('data-' + link_uid, payload)

        for anchor in find_anchors(root):
            neutralise_internal_link(anchor)
        svg_href = XLINK('href')
        for anchor in find_svg_anchors(root):
            neutralise_internal_link(anchor, svg_href)
    else:
        virtualize_html(container, name, link_uid, link_to_map, virtualized_names)

    # Serialise before opening the target so a serialisation failure cannot
    # leave a truncated file behind.
    serialized = html_as_json(root)
    with container.open(name, 'wb') as dest:
        dest.write(serialized)
def dataize_svg(self, item, svg=None):
    """Inline manifest resources referenced from an SVG as ``data:`` URIs.

    Every SVG element carrying an ``xlink:href`` whose target (fragment
    stripped, resolved relative to ``item``) is present in the manifest has
    that attribute replaced by ``data:<media type>;base64,<payload>``.
    References with an empty path or pointing outside the manifest are left
    untouched.

    :param item: manifest item the SVG belongs to; used to resolve hrefs.
    :param svg: tree to process; defaults to ``item.data``.
    :return: the (mutated) ``svg`` tree.
    """
    if svg is None:
        svg = item.data
    hrefs = self.oeb.manifest.hrefs
    xlink_href = XLINK('href')
    for elem in xpath(svg, '//svg:*[@xl:href]'):
        href = urlnormalize(elem.attrib[xlink_href])
        path = urldefrag(href)[0]
        if not path:
            continue
        abshref = item.abshref(path)
        if abshref not in hrefs:
            continue
        linkee = hrefs[abshref]
        # b64encode instead of the deprecated encodestring: the latter
        # inserts a newline every 76 bytes, which would put line breaks
        # inside the data URI.
        # NOTE(review): str(linkee) is presumably the item's serialised
        # bytes (Python 2 str) -- confirm against the manifest item class.
        payload = base64.b64encode(str(linkee))
        if not isinstance(payload, str):
            # b64encode returns bytes where bytes and str are distinct types.
            payload = payload.decode('ascii')
        elem.attrib[xlink_href] = "data:%s;base64,%s" % (linkee.media_type, payload)
    return svg
def virtualize_resources(self):
    """Rewrite intra-book links in stylesheets, documents and SVG images.

    Links that resolve to a name in the book are replaced by
    ``<link_uid>|<encoded name+fragment>|``. In HTML documents, anchors
    carrying such a value are neutralised to ``javascript:void(0)`` with the
    decoded target stored as JSON in a ``data-<link_uid>`` attribute; all
    other anchors get ``target="_blank"``. Every processed name is added to
    ``self.virtualized_names`` and every modified name is passed to
    ``self.dirty``.
    """
    changed = set()
    link_uid = self.book_render_data['link_uid']
    resource_template = link_uid + '|{}|'
    xlink_xpath = XPath('//*[@xl:href]')
    link_xpath = XPath('//h:a[@href]')

    def link_replacer(base, url):
        """Return the virtualized form of ``url`` found in ``base``.

        URLs that are external, absolute, carry a query, or do not resolve
        to a book name are returned unchanged.
        """
        if url.startswith('#'):
            # Same-document fragment link.
            frag = urlunquote(url[1:])
            if not frag:
                return url
            changed.add(base)
            return resource_template.format(encode_url(base, frag))
        purl = urlparse(url)
        if purl.netloc or purl.query:
            return url
        if purl.scheme and purl.scheme != 'file':
            return url
        if not purl.path or purl.path.startswith('/'):
            return url
        url, frag = purl.path, purl.fragment
        name = self.href_to_name(url, base)
        if name:
            frag = urlunquote(frag)
            url = resource_template.format(encode_url(name, frag))
            changed.add(base)
        return url

    for name, mt in self.mime_map.iteritems():
        mt = mt.lower()
        if mt in OEB_STYLES:
            replaceUrls(self.parsed(name), partial(link_replacer, name))
            self.virtualized_names.add(name)
        elif mt in OEB_DOCS:
            self.virtualized_names.add(name)
            root = self.parsed(name)
            rewrite_links(root, partial(link_replacer, name))
            for a in link_xpath(root):
                href = a.get('href')
                if href.startswith(link_uid):
                    a.set('href', 'javascript:void(0)')
                    parts = decode_url(href.split('|')[1])
                    a.set('data-' + link_uid, json.dumps({'name':parts[0], 'frag':parts[1]}, ensure_ascii=False))
                else:
                    a.set('target', '_blank')
                # Every anchor is modified one way or the other.
                changed.add(name)
        elif mt == 'image/svg+xml':
            self.virtualized_names.add(name)
            # Record the SVG as changed. This used to rebind ``changed`` to
            # False, which broke ``changed.add`` inside link_replacer and
            # the final iteration over ``changed``.
            changed.add(name)
            xlink = XLINK('href')
            for elem in xlink_xpath(self.parsed(name)):
                elem.set(xlink, link_replacer(name, elem.get(xlink)))

    for dirty_name in changed:
        self.dirty(dirty_name)
def dataize_svg(self, item, svg=None):
    """Point SVG ``xlink:href`` references at temporary copies on disk.

    For every SVG element with an ``xlink:href`` whose target (fragment
    stripped, resolved relative to ``item``) is in the manifest, the
    target's ``str()`` form is written to a persistent temporary file. The
    suffix is the type reported by ``what`` (``jpg`` as fallback). The file
    name is appended to ``self.temp_files`` and becomes the new attribute
    value.

    :param item: manifest item the SVG belongs to; used to resolve hrefs.
    :param svg: tree to process; defaults to ``item.data``.
    :return: the (mutated) ``svg`` tree.
    """
    svg = item.data if svg is None else svg
    manifest_hrefs = self.oeb.manifest.hrefs
    for node in xpath(svg, '//svg:*[@xl:href]'):
        normalized = urlnormalize(node.attrib[XLINK('href')])
        path = urldefrag(normalized)[0]
        if path:
            target = item.abshref(path)
            if target in manifest_hrefs:
                raw = str(manifest_hrefs[target])
                extension = what(None, raw) or 'jpg'
                with PersistentTemporaryFile(suffix='.' + extension) as tmp:
                    tmp.write(raw)
                    self.temp_files.append(tmp.name)
                node.attrib[XLINK('href')] = tmp.name
    return svg
def extract_cover_from_embedded_svg(html, base, log):
    """Return the bytes of the image wrapped by a single-image SVG page.

    The markup qualifies only when it contains exactly one ``<svg:svg>``
    element whose only child is an ``<svg:image>``. The image's
    ``xlink:href`` is split on ``/`` and joined onto ``base``; if that file
    is readable its contents are returned.

    :param html: markup of the candidate cover page.
    :param base: directory the image href is resolved against.
    :param log: unused by this function.
    :return: the raw file contents, or ``None`` when the page does not
        match, the image has no href, or the file is not readable.
    """
    from lxml import etree
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    # NOTE(review): parsed with lxml's default parser; if ``html`` can come
    # from an untrusted book, consider a parser with entity resolution off.
    root = etree.fromstring(html)

    svg = XPath('//svg:svg')(root)
    if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):
        image = svg[0][0]
        href = image.get(XLINK('href'), None)
        if not href:
            # Check before splitting: a missing attribute used to raise
            # AttributeError on ``None.split``.
            return None
        path = os.path.join(base, *href.split('/'))
        if os.access(path, os.R_OK):
            # Context manager so the handle is closed deterministically.
            with open(path, 'rb') as f:
                return f.read()
    return None
def extract_cover_from_embedded_svg(html, base, log):
    """Return the raster image wrapped by a single-image SVG page.

    The markup qualifies only when it contains exactly one ``<svg:svg>``
    element whose only child is an ``<svg:image>`` with a non-empty
    ``xlink:href``. That href is split on ``/``, joined onto ``base`` and
    handed to ``return_raster_image``, whose result is returned. In every
    other case the function returns ``None``. ``log`` is unused here.
    """
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    from calibre.utils.xml_parse import safe_xml_fromstring

    document = safe_xml_fromstring(html)
    svg_elements = XPath('//svg:svg')(document)
    if len(svg_elements) != 1:
        return
    wrapper = svg_elements[0]
    if len(wrapper) != 1:
        return
    only_child = wrapper[0]
    if only_child.tag != SVG('image'):
        return
    href = only_child.get(XLINK('href'), None)
    if not href:
        return
    return return_raster_image(os.path.join(base, *href.split('/')))
def virtualize_resources(self):
    """Rewrite intra-book links in stylesheets, documents and SVG images.

    * Stylesheets: URLs are passed through the link replacer.
    * Documents: links are rewritten; anchors whose href then starts with
      ``link_uid`` are neutralised to ``javascript:void(0)``, get their
      decoded target as JSON in ``data-<link_uid>`` and are recorded in
      ``book_render_data['link_to_map']``. All other anchors get
      ``target="_blank"`` and ``rel="noopener noreferrer"``.
    * SVG images: every ``xlink:href`` not starting with ``#`` is passed
      through the link replacer and the image is marked as changed.

    Every processed name is added to ``self.virtualized_names``; every
    changed name is finally passed to ``self.dirty``.
    """
    changed = set()
    link_uid = self.book_render_data['link_uid']
    find_xlinks = XPath('//*[@xl:href]')
    find_anchors = XPath('//h:a[@href]')
    link_replacer = create_link_replacer(self, link_uid, changed)
    link_to_map = self.book_render_data['link_to_map']

    def process_document(name):
        self.virtualized_names.add(name)
        root = self.parsed(name)
        rewrite_links(root, partial(link_replacer, name))
        for anchor in find_anchors(root):
            href = anchor.get('href')
            if not href.startswith(link_uid):
                anchor.set('target', '_blank')
                anchor.set('rel', 'noopener noreferrer')
                continue
            anchor.set('href', 'javascript:void(0)')
            decoded = decode_url(href.split('|')[1])
            lname, lfrag = decoded[0], decoded[1]
            link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
            payload = json.dumps({'name': lname, 'frag': lfrag}, ensure_ascii=False)
            anchor.set('data-' + link_uid, payload)

    def process_svg(name):
        self.virtualized_names.add(name)
        xlink = XLINK('href')
        rewrote_any = False
        for node in find_xlinks(self.parsed(name)):
            href = node.get(xlink)
            if href.startswith('#'):
                continue
            node.set(xlink, link_replacer(name, href))
            rewrote_any = True
        if rewrote_any:
            changed.add(name)

    for name, mime in iteritems(self.mime_map):
        mime = mime.lower()
        if mime in OEB_STYLES:
            replaceUrls(self.parsed(name), partial(link_replacer, name))
            self.virtualized_names.add(name)
        elif mime in OEB_DOCS:
            process_document(name)
        elif mime == 'image/svg+xml':
            process_svg(name)

    for changed_name in changed:
        self.dirty(changed_name)
def transform_svg_image(container, name, link_uid, virtualize_resources, virtualized_names):
    """Virtualize the ``xlink:href`` references of the SVG image ``name``.

    Does nothing unless ``virtualize_resources`` is true. Otherwise every
    ``xlink:href`` that does not start with ``#`` is replaced by the result
    of the link replacer. If at least one reference was rewritten, ``name``
    is added to ``virtualized_names`` and the item is marked dirty and
    committed in the container.
    """
    if not virtualize_resources:
        return

    replacer = create_link_replacer(container, link_uid, set())
    href_attr = XLINK('href')
    find_xlinks = XPath('//*[@xl:href]')

    rewrote_any = False
    for node in find_xlinks(container.parsed(name)):
        current = node.get(href_attr)
        if current.startswith('#'):
            # Fragment-only references stay untouched.
            continue
        node.set(href_attr, replacer(name, current))
        rewrote_any = True

    if not rewrote_any:
        return
    virtualized_names.add(name)
    container.dirty(name)
    container.commit_item(name)
def find_cover_image_in_page(container, cover_page):
    """Return the name of the sole image on ``cover_page``, if any.

    The page must have exactly one ``<body>``. Image references are taken
    from ``<img src>`` and from ``<svg:image>`` inside ``<svg:svg>``, each
    resolved to a container name relative to ``cover_page``. The name is
    returned only when the body has no non-whitespace text and exactly one
    image reference; otherwise ``None`` is returned.
    """
    bodies = XPath('//h:body')(container.parsed(cover_page))
    if len(bodies) != 1:
        return
    body = bodies[0]

    image_nodes = XPath('descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')(body)
    hrefs = (node.get('src') or node.get(XLINK('href')) for node in image_nodes)
    images = [container.href_to_name(href, base=cover_page) for href in hrefs if href]

    visible_text = re.sub(r'\s+', '', xml2text(body))
    # Text or more than one image means the page is more than a cover wrapper.
    if not visible_text and len(images) == 1:
        return images[0]
def virtualize_resources(self):
    """Rewrite intra-book links in stylesheets, documents and SVG images.

    Links resolving to an existing book name become
    ``<link_uid>|<encoded name+fragment>|``; links resolving to a name the
    book does not contain become ``missing:<quoted name>``. In documents,
    non-stylesheet ``<link>`` tags have their attributes cleared, anchors
    with a virtualized href are neutralised to ``javascript:void(0)`` and
    recorded in ``book_render_data['link_to_map']``, and all other anchors
    get ``target="_blank"`` and ``rel="noopener noreferrer"``.

    Every processed name is added to ``self.virtualized_names``. The sets in
    ``link_to_map`` are converted to tuples, and every changed name is
    passed to ``self.dirty``.
    """
    changed = set()
    link_uid = self.book_render_data['link_uid']
    resource_template = link_uid + '|{}|'
    xlink_xpath = XPath('//*[@xl:href]')
    link_xpath = XPath('//h:a[@href]')
    res_link_xpath = XPath('//h:link[@href]')

    def link_replacer(base, url):
        """Return the virtualized form of ``url`` found in ``base``.

        URLs that are external, absolute, carry a query, or do not resolve
        to a name are returned unchanged.
        """
        if url.startswith('#'):
            # Same-document fragment link.
            frag = urlunquote(url[1:])
            if not frag:
                return url
            changed.add(base)
            return resource_template.format(encode_url(base, frag))
        purl = urlparse(url)
        if purl.netloc or purl.query:
            return url
        if purl.scheme and purl.scheme != 'file':
            return url
        if not purl.path or purl.path.startswith('/'):
            return url
        url, frag = purl.path, purl.fragment
        name = self.href_to_name(url, base)
        if name:
            if self.has_name(name):
                frag = urlunquote(frag)
                url = resource_template.format(encode_url(name, frag))
            else:
                # Target is not in the book: emit a recognisable marker.
                if isinstance(name, unicode):
                    name = name.encode('utf-8')
                url = 'missing:' + force_unicode(quote(name), 'utf-8')
            changed.add(base)
        return url

    ltm = self.book_render_data['link_to_map']

    for name, mt in self.mime_map.iteritems():
        mt = mt.lower()
        if mt in OEB_STYLES:
            replaceUrls(self.parsed(name), partial(link_replacer, name))
            self.virtualized_names.add(name)
        elif mt in OEB_DOCS:
            self.virtualized_names.add(name)
            root = self.parsed(name)
            for link in res_link_xpath(root):
                ltype = (link.get('type') or 'text/css').lower()
                rel = (link.get('rel') or 'stylesheet').lower()
                if ltype != 'text/css' or rel != 'stylesheet':
                    # This link will not be loaded by the browser anyway
                    # and would cause the resource load check to hang
                    link.attrib.clear()
                    changed.add(name)
            rewrite_links(root, partial(link_replacer, name))
            for a in link_xpath(root):
                href = a.get('href')
                if href.startswith(link_uid):
                    a.set('href', 'javascript:void(0)')
                    parts = decode_url(href.split('|')[1])
                    lname, lfrag = parts[0], parts[1]
                    ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
                    a.set(
                        'data-' + link_uid,
                        json.dumps({
                            'name': lname,
                            'frag': lfrag
                        }, ensure_ascii=False))
                else:
                    a.set('target', '_blank')
                    # Single-line literal with a single space, as a rel
                    # token list requires.
                    a.set('rel', 'noopener noreferrer')
                # Every anchor is modified one way or the other.
                changed.add(name)
        elif mt == 'image/svg+xml':
            self.virtualized_names.add(name)
            changed.add(name)
            xlink = XLINK('href')
            for elem in xlink_xpath(self.parsed(name)):
                elem.set(xlink, link_replacer(name, elem.get(xlink)))

    for _target, amap in ltm.iteritems():
        # Snapshot the items so values can be reassigned while looping.
        for k, v in tuple(amap.iteritems()):
            amap[k] = tuple(v)  # needed for JSON serialization

    for dirty_name in changed:
        self.dirty(dirty_name)