def iterlinks(self, name, get_line_numbers=True):
    '''
    Iterate over all links in the file called name.

    If get_line_numbers is True, the results are tuples of the form
    (link, line_number, offset), where line_number is the line on which the
    link occurs and offset is the number of characters from the start of
    that line. Note that a non-zero offset may span several lines. If
    get_line_numbers is False, only the link itself is yielded.

    For the OPF and for files of the type guessed for 'toc.ncx' the offset
    is always reported as 0. Files of any other unrecognised type yield
    nothing.
    '''
    media_type = self.mime_map.get(name, guess_type(name))

    # The OPF is recognised by name, before the media type is looked at.
    if name == self.opf_name:
        for node in self.opf_xpath('//*[@href]'):
            if get_line_numbers:
                yield node.get('href'), node.sourceline, 0
            else:
                yield node.get('href')
        return

    kind = media_type.lower()

    if kind in OEB_DOCS:
        # The bare name iterlinks resolves to the module-level function,
        # not to this method.
        for el, _attr, link, pos in iterlinks(self.parsed(name)):
            if get_line_numbers:
                yield link, el.sourceline, pos
            else:
                yield link
        return

    if kind in OEB_STYLES:
        if not get_line_numbers:
            for link in getUrls(self.parsed(name)):
                yield link
            return
        with self.open(name, 'rb') as f:
            # Line endings are normalised to '\n' before positions are
            # computed from the decoded text.
            raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
            locate = PositionFinder(raw)
            in_comment = CommentFinder(raw)
            for link, offset in itercsslinks(raw):
                if in_comment(offset):
                    continue
                lnum, col = locate(offset)
                yield link, lnum, col
        return

    if kind == guess_type('toc.ncx'):
        for node in self.parsed(name).xpath('//*[@src]'):
            if get_line_numbers:
                yield node.get('src'), node.sourceline, 0
            else:
                yield node.get('src')
def iterlinks(self, name, get_line_numbers=True):
    """
    Iterate over all links in the file called name.

    If get_line_numbers is True, the results are tuples of the form
    (link, line_number, offset), where line_number is the line on which the
    link occurs and offset is the number of characters from the start of
    that line. Note that a non-zero offset may span several lines. If
    get_line_numbers is False, only the link itself is yielded.

    For style sheets the line number is always reported as 0 and the offset
    is the value produced by itercsslinks for the decoded text. For the OPF
    and for files of the type guessed for "toc.ncx" the offset is always 0.
    Files of any other unrecognised type yield nothing.
    """
    media_type = self.mime_map.get(name, guess_type(name))

    # The OPF is recognised by name, before the media type is looked at.
    if name == self.opf_name:
        for node in self.opf_xpath("//*[@href]"):
            target = node.get("href")
            if get_line_numbers:
                yield target, node.sourceline, 0
            else:
                yield target
        return

    lowered = media_type.lower()

    if lowered in OEB_DOCS:
        # The bare name iterlinks resolves to the module-level function,
        # not to this method.
        for el, _attr, link, pos in iterlinks(self.parsed(name)):
            if get_line_numbers:
                yield link, el.sourceline, pos
            else:
                yield link
    elif lowered in OEB_STYLES:
        if get_line_numbers:
            with self.open(name) as f:
                raw = self.decode(f.read())
                for link, offset in itercsslinks(raw):
                    yield link, 0, offset
        else:
            for link in getUrls(self.parsed(name)):
                yield link
    elif lowered == guess_type("toc.ncx"):
        for node in self.parsed(name).xpath("//*[@src]"):
            target = node.get("src")
            if get_line_numbers:
                yield target, node.sourceline, 0
            else:
                yield target
def iterlinks(self, name, get_line_numbers=True):
    '''
    Iterate over all links in the file called name.

    If get_line_numbers is True, the results are tuples of the form
    (link, line_number, offset), where line_number is the line on which the
    link occurs and offset is the number of characters from the start of
    that line. Note that a non-zero offset may span several lines. If
    get_line_numbers is False, only the link itself is yielded.

    For the OPF and for files of the type guessed for 'toc.ncx' the offset
    is always reported as 0. Files of any other unrecognised type yield
    nothing.
    '''
    media_type = self.mime_map.get(name, guess_type(name))

    if name == self.opf_name:
        # The OPF is recognised by name, before the media type is looked at.
        elems = self.opf_xpath('//*[@href]')
        if get_line_numbers:
            for elem in elems:
                yield elem.get('href'), elem.sourceline, 0
        else:
            for elem in elems:
                yield elem.get('href')
    else:
        mt = media_type.lower()
        if mt in OEB_DOCS:
            # The bare name iterlinks resolves to the module-level
            # function, not to this method.
            found = iterlinks(self.parsed(name))
            if get_line_numbers:
                for el, attr, link, pos in found:
                    yield link, el.sourceline, pos
            else:
                for el, attr, link, pos in found:
                    yield link
        elif mt in OEB_STYLES:
            if get_line_numbers:
                with self.open(name, 'rb') as f:
                    text = self.decode(f.read())
                    # Normalise every line ending to '\n' before positions
                    # are computed from the text.
                    raw = text.replace('\r\n', '\n').replace('\r', '\n')
                    position = PositionFinder(raw)
                    is_in_comment = CommentFinder(raw)
                    for link, offset in itercsslinks(raw):
                        if is_in_comment(offset):
                            continue
                        lnum, col = position(offset)
                        yield link, lnum, col
            else:
                for link in getUrls(self.parsed(name)):
                    yield link
        elif mt == guess_type('toc.ncx'):
            elems = self.parsed(name).xpath('//*[@src]')
            if get_line_numbers:
                for elem in elems:
                    yield elem.get('src'), elem.sourceline, 0
            else:
                for elem in elems:
                    yield elem.get('src')
def remove_links_to(container, predicate):
    '''
    Remove links from every document and style sheet in container.

    predicate must be a function that takes the arguments
    (name, href, fragment=None) and returns True iff the link should be
    removed.

    Every file that was modified is marked dirty on the container, and the
    set of the names of those files is returned.
    '''
    from calibre.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML
    find_style_tags = XPath('//h:style')
    find_styled_tags = XPath('//*[@style]')
    # Elements of these types are extracted whole instead of only losing
    # the attribute that carries the link.
    whole_element_tags = (XHTML('link'), XHTML('img'))
    changed = set()

    for name, mt in iteritems(container.mime_map):
        resolve = partial(container.href_to_name, base=name)

        if mt in OEB_STYLES and mt not in OEB_DOCS:
            if remove_links_in_sheet(resolve, container.parsed(name), predicate):
                changed.add(name)
            continue
        if mt not in OEB_DOCS:
            continue

        removed = False
        root = container.parsed(name)

        # Pass 1: links held in element attributes (or element text when
        # attr is None).
        for el, attr, href, pos in iterlinks(root, find_links_in_css=False):
            hname = container.href_to_name(href, name)
            frag = href.partition('#')[-1]
            if not predicate(hname, href, frag):
                continue
            if attr is None:
                el.text = None
            elif el.tag in whole_element_tags:
                extract(el)
            else:
                del el.attrib[attr]
            removed = True

        # Pass 2: <style> elements whose type is CSS (or unspecified).
        for tag in find_style_tags(root):
            if not tag.text:
                continue
            if (tag.get('type') or 'text/css').lower() != 'text/css':
                continue
            sheet = container.parse_css(tag.text)
            if remove_links_in_sheet(resolve, sheet, predicate):
                tag.text = css_text(sheet)
                removed = True

        # Pass 3: inline style="..." attributes.
        for tag in find_styled_tags(root):
            inline = tag.get('style')
            if not inline:
                continue
            declaration = container.parse_css(inline, is_declaration=True)
            if remove_links_in_declaration(resolve, declaration, predicate):
                removed = True
                tag.set('style', css_text(declaration))

        if removed:
            changed.add(name)

    for dirty_name in changed:
        container.dirty(dirty_name)
    return changed
def __call__(self, oeb, context):
    '''
    Remove from oeb.manifest every item that is not in use.

    An item counts as used when it is named by a metadata value (as an href
    or a manifest id), referenced from the guide, part of the spine, or
    linked to, directly or transitively, from a used document or CSS file.

    context is stored on self.opts. Nothing is returned; the manifest is
    modified in place and each removal is logged.
    '''
    import cssutils
    oeb.logger.info('Trimming unused files from manifest...')
    self.opts = context

    used = set()
    for term in oeb.metadata:
        for item in oeb.metadata[term]:
            if item.value in oeb.manifest.hrefs:
                used.add(oeb.manifest.hrefs[item.value])
            elif item.value in oeb.manifest.ids:
                used.add(oeb.manifest.ids[item.value])
    for ref in oeb.guide.values():
        path, _ = urldefrag(ref.href)
        if path in oeb.manifest.hrefs:
            used.add(oeb.manifest.hrefs[path])
    # TOC items are required to be in the spine
    for item in oeb.spine:
        used.add(item)

    # Follow links outwards from the used items until no new item turns up.
    unchecked = used
    while unchecked:
        new = set()
        for item in unchecked:
            is_markup = (item.media_type in OEB_DOCS or
                         item.media_type[-4:] in ('/xml', '+xml'))
            if is_markup and item.data is not None:
                hrefs = [r[2] for r in iterlinks(item.data)]
            elif item.media_type == CSS_MIME:
                hrefs = cssutils.getUrls(item.data)
            else:
                continue
            for href in hrefs:
                if isinstance(href, bytes):
                    href = href.decode('utf-8')
                try:
                    href = item.abshref(urlnormalize(href))
                except Exception:
                    # A link that cannot be normalised cannot match any
                    # manifest entry, so it is skipped. Exception (not a
                    # bare except) lets KeyboardInterrupt and SystemExit
                    # propagate.
                    continue
                if href in oeb.manifest.hrefs:
                    found = oeb.manifest.hrefs[href]
                    if found not in used:
                        new.add(found)
        used.update(new)
        unchecked = new

    # Iterate over a snapshot, because items are removed from the manifest
    # inside the loop.
    for item in list(oeb.manifest.values()):
        if item not in used:
            oeb.logger.info('Trimming %r from manifest' % item.href)
            oeb.manifest.remove(item)
def remove_links_to(container, predicate):
    """
    Remove links from every document and style sheet in container.

    predicate must be a function that takes the arguments
    (name, href, fragment=None) and returns True iff the link should be
    removed.

    Every file that was modified is marked dirty on the container, and the
    set of the names of those files is returned.
    """
    from calibre.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML

    def as_text(css):
        # cssutils documents CSSStyleSheet.cssText as a byte string, while
        # the element text and attribute values written below must be text.
        # UTF-8 is assumed here - TODO confirm for sheets that declare a
        # different @charset.
        text = css.cssText
        if isinstance(text, bytes):
            text = text.decode("utf-8")
        return text

    stylepath = XPath("//h:style")
    styleattrpath = XPath("//*[@style]")
    changed = set()
    # items() is used instead of iteritems() so that the loop also runs on
    # Python 3, where dict.iteritems() does not exist.
    for name, mt in container.mime_map.items():
        removed = False
        if mt in OEB_DOCS:
            root = container.parsed(name)
            # Pass 1: links held in element attributes (or element text
            # when attr is None).
            for el, attr, href, pos in iterlinks(root, find_links_in_css=False):
                hname = container.href_to_name(href, name)
                frag = href.partition("#")[-1]
                if predicate(hname, href, frag):
                    if attr is None:
                        el.text = None
                    else:
                        # <link> and <img> elements are extracted whole;
                        # other elements only lose the linking attribute.
                        if el.tag == XHTML("link") or el.tag == XHTML("img"):
                            extract(el)
                        else:
                            del el.attrib[attr]
                    removed = True
            # Pass 2: <style> elements whose type is CSS (or unspecified).
            for tag in stylepath(root):
                if tag.text and (tag.get("type") or "text/css").lower() == "text/css":
                    sheet = container.parse_css(tag.text)
                    if remove_links_in_sheet(partial(container.href_to_name, base=name), sheet, predicate):
                        tag.text = as_text(sheet)
                        removed = True
            # Pass 3: inline style="..." attributes.
            for tag in styleattrpath(root):
                style = tag.get("style")
                if style:
                    style = container.parse_css(style, is_declaration=True)
                    if remove_links_in_declaration(partial(container.href_to_name, base=name), style, predicate):
                        removed = True
                        tag.set("style", as_text(style))
        elif mt in OEB_STYLES:
            removed = remove_links_in_sheet(
                partial(container.href_to_name, base=name), container.parsed(name), predicate
            )
        if removed:
            changed.add(name)
    # A plain loop: on Python 3 map() is lazy, and the calls are made only
    # for their side effect.
    for changed_name in changed:
        container.dirty(changed_name)
    return changed
def remove_links_to(container, predicate):
    '''
    Remove links from every document and style sheet in container.

    predicate must be a function that takes the arguments
    (name, href, fragment=None) and returns True iff the link should be
    removed.

    Every file that was modified is marked dirty on the container, and the
    set of the names of those files is returned.
    '''
    from calibre.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML
    stylepath = XPath('//h:style')
    styleattrpath = XPath('//*[@style]')

    def strip_markup_links(root, name):
        # Links held in element attributes (or element text when attr is
        # None). Returns True if anything was removed.
        hit = False
        for el, attr, href, pos in iterlinks(root, find_links_in_css=False):
            hname = container.href_to_name(href, name)
            frag = href.partition('#')[-1]
            if not predicate(hname, href, frag):
                continue
            hit = True
            if attr is None:
                el.text = None
            elif el.tag == XHTML('link') or el.tag == XHTML('img'):
                # These elements are extracted whole.
                extract(el)
            else:
                del el.attrib[attr]
        return hit

    def strip_style_elements(root, name):
        # <style> elements whose type is CSS (or unspecified).
        hit = False
        for tag in stylepath(root):
            css_type = (tag.get('type') or 'text/css').lower()
            if tag.text and css_type == 'text/css':
                sheet = container.parse_css(tag.text)
                resolver = partial(container.href_to_name, base=name)
                if remove_links_in_sheet(resolver, sheet, predicate):
                    tag.text = css_text(sheet)
                    hit = True
        return hit

    def strip_style_attributes(root, name):
        # Inline style="..." attributes.
        hit = False
        for tag in styleattrpath(root):
            inline = tag.get('style')
            if inline:
                declaration = container.parse_css(inline, is_declaration=True)
                resolver = partial(container.href_to_name, base=name)
                if remove_links_in_declaration(resolver, declaration, predicate):
                    hit = True
                    tag.set('style', css_text(declaration))
        return hit

    changed = set()
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS:
            root = container.parsed(name)
            # The list literal runs all three passes, in order, before
            # any() looks at the results.
            removed = any([
                strip_markup_links(root, name),
                strip_style_elements(root, name),
                strip_style_attributes(root, name),
            ])
        elif mt in OEB_STYLES:
            removed = remove_links_in_sheet(
                partial(container.href_to_name, base=name),
                container.parsed(name), predicate)
        else:
            removed = False
        if removed:
            changed.add(name)

    for changed_name in changed:
        container.dirty(changed_name)
    return changed
def _manifest_add_missing(self, invalid):
    '''
    Add to the manifest every file that is referenced from a manifest item
    but is not itself listed in the manifest.

    Documents, XML files and style sheets are scanned for links. A
    referenced href that has no URL scheme and is not yet known is added to
    the manifest if it exists in the container, and the newly added item is
    then scanned in turn. A warning is logged for each such href, whether
    or not the file exists. Items whose data cannot be read are removed
    from the manifest once scanning has finished.

    NOTE: the invalid argument is ignored; it is immediately replaced by a
    fresh, empty set.
    '''
    import cssutils
    manifest = self.oeb.manifest
    known = set(manifest.hrefs)
    unchecked = set(manifest.values())
    cdoc = OEB_DOCS|OEB_STYLES
    invalid = set()
    while unchecked:
        new = set()
        for item in unchecked:
            data = None
            if (item.media_type in cdoc or
                    item.media_type[-4:] in ('/xml', '+xml')):
                try:
                    data = item.data
                except Exception:
                    # Exception (not a bare except) lets KeyboardInterrupt
                    # and SystemExit propagate.
                    self.oeb.log.exception(u'Failed to read from manifest '
                            u'entry with id: %s, ignoring'%item.id)
                    invalid.add(item)
                    continue
            if data is None:
                continue

            if (item.media_type in OEB_DOCS or
                    item.media_type[-4:] in ('/xml', '+xml')):
                hrefs = [r[2] for r in iterlinks(data)]
                for href in hrefs:
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    href, _ = urldefrag(href)
                    if not href:
                        continue
                    try:
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                    except Exception:
                        self.oeb.log.exception(
                            'Skipping invalid href: %r'%href)
                        continue
                    if not scheme and href not in known:
                        new.add(href)
            elif item.media_type in OEB_STYLES:
                try:
                    urls = list(cssutils.getUrls(data))
                except Exception:
                    # A style sheet whose URLs cannot be listed is treated
                    # as referencing nothing.
                    urls = []
                for url in urls:
                    href, _ = urldefrag(url)
                    href = item.abshref(urlnormalize(href))
                    scheme = urlparse(href).scheme
                    if not scheme and href not in known:
                        new.add(href)

        # Everything queued has been scanned; only items added below are
        # scanned on the next pass.
        unchecked.clear()
        warned = set()
        for href in new:
            known.add(href)
            is_invalid = False
            for item in invalid:
                if href == item.abshref(urlnormalize(href)):
                    is_invalid = True
                    break
            if is_invalid:
                continue
            if not self.oeb.container.exists(href):
                if href not in warned:
                    self.logger.warn('Referenced file %r not found' % href)
                    warned.add(href)
                continue
            if href not in warned:
                self.logger.warn('Referenced file %r not in manifest' % href)
                warned.add(href)
            # Named item_id so the builtin id() is not shadowed.
            item_id, _ = manifest.generate(id='added')
            guessed = guess_type(href)[0]
            media_type = guessed or BINARY_MIME
            added = manifest.add(item_id, href, media_type)
            unchecked.add(added)

    # Unreadable entries are dropped once, after scanning has finished, so
    # that no item is removed from the manifest twice.
    for item in invalid:
        self.oeb.manifest.remove(item)
def iterhtmllinks(container, name):
    '''
    Yield (el, attr, link) for every link in the parsed file name that is
    not on an 'a' element and for which is_external(link) is true.

    The element tag is compared case-insensitively after barename() has
    been applied to it.
    '''
    root = container.parsed(name)
    for el, attr, link, _pos in iterlinks(root):
        if barename(el.tag).lower() == 'a':
            continue
        if is_external(link):
            yield el, attr, link