Example #1
0
 def iterlinks(self, name, get_line_numbers=True):
     ''' Iterate over all links in name.

     If get_line_numbers is True, results are yielded in the form
     (link, line_number, offset), where line_number is the line at which the
     link occurs and offset is the number of characters from the start of
     the line. Note that offset could actually encompass several lines if
     not zero. If get_line_numbers is False, only the link is yielded.

     Links are looked for in the OPF, in OEB documents, in stylesheets and
     in files whose media type is that of an NCX; a name of any other type
     yields nothing. For stylesheets read with line numbers, links that
     lie inside a CSS comment are skipped. '''
     media_type = self.mime_map.get(name, guess_type(name))
     if name == self.opf_name:
         for elem in self.opf_xpath('//*[@href]'):
             yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
         return

     # Normalise the case once instead of in every comparison below
     media_type = media_type.lower()
     if media_type in OEB_DOCS:
         for el, attr, link, pos in iterlinks(self.parsed(name)):
             yield (link, el.sourceline, pos) if get_line_numbers else link
     elif media_type in OEB_STYLES:
         if get_line_numbers:
             # Read the whole file and close it before the first yield, so
             # that the handle is not kept open for as long as the caller
             # leaves this generator suspended (or abandons it).
             with self.open(name, 'rb') as f:
                 raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
             position = PositionFinder(raw)
             is_in_comment = CommentFinder(raw)
             for link, offset in itercsslinks(raw):
                 if not is_in_comment(offset):
                     lnum, col = position(offset)
                     yield link, lnum, col
         else:
             for link in getUrls(self.parsed(name)):
                 yield link
     elif media_type == guess_type('toc.ncx'):
         for elem in self.parsed(name).xpath('//*[@src]'):
             yield (elem.get('src'), elem.sourceline, 0) if get_line_numbers else elem.get('src')
Example #2
0
 def iterlinks(self, name, get_line_numbers=True):
     """ Iterate over all links in name.

     If get_line_numbers is True, results are yielded in the form
     (link, line_number, offset), where line_number is the line at which the
     link occurs and offset is the number of characters from the start of
     the line. Note that offset could actually encompass several lines if
     not zero. If get_line_numbers is False, only the link is yielded.

     For stylesheets the line number is always reported as 0 and offset is
     the value produced by itercsslinks for the decoded file contents.
     A name that is not the OPF, an OEB document, a stylesheet or a file
     with the media type of an NCX yields nothing. """
     media_type = self.mime_map.get(name, guess_type(name))
     if name == self.opf_name:
         for elem in self.opf_xpath("//*[@href]"):
             yield (elem.get("href"), elem.sourceline, 0) if get_line_numbers else elem.get("href")
         return

     # Lower-case once; every remaining branch compares the same value
     media_type = media_type.lower()
     if media_type in OEB_DOCS:
         for el, attr, link, pos in iterlinks(self.parsed(name)):
             yield (link, el.sourceline, pos) if get_line_numbers else link
     elif media_type in OEB_STYLES:
         if get_line_numbers:
             # Finish reading and close the file before yielding anything,
             # otherwise the handle stays open while the generator is
             # suspended in the caller's loop.
             with self.open(name) as f:
                 raw = self.decode(f.read())
             for link, offset in itercsslinks(raw):
                 yield link, 0, offset
         else:
             for link in getUrls(self.parsed(name)):
                 yield link
     elif media_type == guess_type("toc.ncx"):
         for elem in self.parsed(name).xpath("//*[@src]"):
             yield (elem.get("src"), elem.sourceline, 0) if get_line_numbers else elem.get("src")
Example #3
0
 def iterlinks(self, name, get_line_numbers=True):
     ''' Iterate over all links in name.

     With get_line_numbers set to True the results have the form
     (link, line_number, offset), where line_number is the line at which the
     link occurs and offset is the number of characters from the start of
     the line. Note that offset could actually encompass several lines if
     not zero. With get_line_numbers set to False only the link is yielded.

     Handles the OPF, OEB documents, stylesheets and files with the media
     type of an NCX; nothing is yielded for any other kind of file. Links
     found inside CSS comments are not reported when line numbers are
     requested for a stylesheet. '''
     media_type = self.mime_map.get(name, guess_type(name))
     if name == self.opf_name:
         for elem in self.opf_xpath('//*[@href]'):
             href = elem.get('href')
             yield (href, elem.sourceline, 0) if get_line_numbers else href
         return

     # One case-normalised value serves all of the comparisons that follow
     media_type = media_type.lower()
     if media_type in OEB_DOCS:
         for el, attr, link, pos in iterlinks(self.parsed(name)):
             yield (link, el.sourceline, pos) if get_line_numbers else link
     elif media_type in OEB_STYLES:
         if get_line_numbers:
             # The file is fully consumed here, so close it straight away
             # instead of holding the handle open across every yield.
             with self.open(name, 'rb') as f:
                 raw = self.decode(f.read())
             # Normalise line endings before positions are computed
             raw = raw.replace('\r\n', '\n').replace('\r', '\n')
             position = PositionFinder(raw)
             is_in_comment = CommentFinder(raw)
             for link, offset in itercsslinks(raw):
                 if not is_in_comment(offset):
                     lnum, col = position(offset)
                     yield link, lnum, col
         else:
             for link in getUrls(self.parsed(name)):
                 yield link
     elif media_type == guess_type('toc.ncx'):
         for elem in self.parsed(name).xpath('//*[@src]'):
             src = elem.get('src')
             yield (src, elem.sourceline, 0) if get_line_numbers else src
Example #4
0
def remove_links_to(container, predicate):
    ''' Remove from the files in container every link selected by predicate.

    predicate must be a function that takes the arguments (name, href,
    fragment=None) and returns True iff the link should be removed.

    Returns the set of names of the files that were changed; each of them
    is also passed to container.dirty(). '''
    from calibre.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML
    find_style_tags = XPath('//h:style')
    find_style_attrs = XPath('//*[@style]')
    changed = set()
    for name, mt in iteritems(container.mime_map):
        removed = False
        if mt in OEB_DOCS:
            root = container.parsed(name)
            # Links held in attributes or element text
            for el, attr, href, pos in iterlinks(root, find_links_in_css=False):
                hname = container.href_to_name(href, name)
                frag = href.partition('#')[-1]
                if not predicate(hname, href, frag):
                    continue
                if attr is None:
                    el.text = None
                elif el.tag == XHTML('link') or el.tag == XHTML('img'):
                    # These tags are dropped entirely rather than left
                    # behind without their link attribute
                    extract(el)
                else:
                    del el.attrib[attr]
                removed = True
            # Links inside <style> tags holding CSS
            for tag in find_style_tags(root):
                if not tag.text:
                    continue
                if (tag.get('type') or 'text/css').lower() != 'text/css':
                    continue
                sheet = container.parse_css(tag.text)
                to_name = partial(container.href_to_name, base=name)
                if remove_links_in_sheet(to_name, sheet, predicate):
                    tag.text = css_text(sheet)
                    removed = True
            # Links inside style="..." attributes
            for tag in find_style_attrs(root):
                css = tag.get('style')
                if not css:
                    continue
                decl = container.parse_css(css, is_declaration=True)
                to_name = partial(container.href_to_name, base=name)
                if remove_links_in_declaration(to_name, decl, predicate):
                    removed = True
                    tag.set('style', css_text(decl))
        elif mt in OEB_STYLES:
            to_name = partial(container.href_to_name, base=name)
            sheet = container.parsed(name)
            removed = remove_links_in_sheet(to_name, sheet, predicate)
        if removed:
            changed.add(name)
    for changed_name in changed:
        container.dirty(changed_name)
    return changed
Example #5
0
 def __call__(self, oeb, context):
     '''Remove from the manifest of oeb every item that is not in use.

     An item counts as used when it is referenced from the metadata (by
     href or by id), from the guide, is in the spine, or is linked to,
     directly or transitively, from a used document / XML file or CSS
     stylesheet. Every other manifest item is logged and removed.

     context is stored on self.opts.
     '''
     import cssutils
     oeb.logger.info('Trimming unused files from manifest...')
     self.opts = context
     used = set()
     for term in oeb.metadata:
         for item in oeb.metadata[term]:
             if item.value in oeb.manifest.hrefs:
                 used.add(oeb.manifest.hrefs[item.value])
             elif item.value in oeb.manifest.ids:
                 used.add(oeb.manifest.ids[item.value])
     for ref in oeb.guide.values():
         path, _ = urldefrag(ref.href)
         if path in oeb.manifest.hrefs:
             used.add(oeb.manifest.hrefs[path])
     # TOC items are required to be in the spine
     for item in oeb.spine:
         used.add(item)
     # Walk the link graph breadth first. The first pass iterates over
     # ``used`` itself, which is safe because discoveries are collected in
     # ``new`` and only merged once the pass is over.
     unchecked = used
     while unchecked:
         new = set()
         for item in unchecked:
             if (item.media_type in OEB_DOCS or
                 item.media_type[-4:] in ('/xml', '+xml')) and \
                item.data is not None:
                 hrefs = [r[2] for r in iterlinks(item.data)]
                 for href in hrefs:
                     if isinstance(href, bytes):
                         href = href.decode('utf-8')
                     try:
                         href = item.abshref(urlnormalize(href))
                     except Exception:
                         # Best effort: skip links that cannot be
                         # normalised, but let KeyboardInterrupt and
                         # SystemExit propagate.
                         continue
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
             elif item.media_type == CSS_MIME:
                 for href in cssutils.getUrls(item.data):
                     href = item.abshref(urlnormalize(href))
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
         used.update(new)
         unchecked = new
     # Iterate over a snapshot: items are removed from the manifest inside
     # the loop, which must not disturb the iteration.
     for item in list(oeb.manifest.values()):
         if item not in used:
             oeb.logger.info('Trimming %r from manifest' % item.href)
             oeb.manifest.remove(item)
Example #6
0
 def __call__(self, oeb, context):
     '''Trim unused files from the manifest of oeb.

     The starting set of used items is everything referenced by the
     metadata (matched against manifest hrefs, then ids), by the guide and
     by the spine. Links found in used documents / XML files and in used
     CSS stylesheets are then followed until no new manifest items turn
     up. Manifest items outside the resulting set are logged and removed.

     context is saved as self.opts.
     '''
     import cssutils
     oeb.logger.info('Trimming unused files from manifest...')
     self.opts = context
     used = set()
     for term in oeb.metadata:
         for item in oeb.metadata[term]:
             if item.value in oeb.manifest.hrefs:
                 used.add(oeb.manifest.hrefs[item.value])
             elif item.value in oeb.manifest.ids:
                 used.add(oeb.manifest.ids[item.value])
     for ref in oeb.guide.values():
         path, _ = urldefrag(ref.href)
         if path in oeb.manifest.hrefs:
             used.add(oeb.manifest.hrefs[path])
     # TOC items are required to be in the spine
     used.update(oeb.spine)
     # ``unchecked`` starts out as an alias of ``used``; nothing is added
     # to ``used`` until a full pass over ``unchecked`` has finished.
     unchecked = used
     while unchecked:
         new = set()
         for item in unchecked:
             if (item.media_type in OEB_DOCS or
                 item.media_type[-4:] in ('/xml', '+xml')) and \
                item.data is not None:
                 hrefs = [r[2] for r in iterlinks(item.data)]
                 for href in hrefs:
                     if isinstance(href, bytes):
                         href = href.decode('utf-8')
                     try:
                         href = item.abshref(urlnormalize(href))
                     except Exception:
                         # Unusable links are ignored on purpose; only
                         # ordinary errors are swallowed, not interpreter
                         # exit requests.
                         continue
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
             elif item.media_type == CSS_MIME:
                 for href in cssutils.getUrls(item.data):
                     href = item.abshref(urlnormalize(href))
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
         used.update(new)
         unchecked = new
     # Take a copy first, because the manifest is modified while looping
     for item in list(oeb.manifest.values()):
         if item not in used:
             oeb.logger.info('Trimming %r from manifest' % item.href)
             oeb.manifest.remove(item)
Example #7
0
def remove_links_to(container, predicate):
    """ Remove from the files in container every link selected by predicate.

    predicate must be a function that takes the arguments (name, href,
    fragment=None) and returns True iff the link should be removed.

    Returns the set of names of the files that were changed; each of them
    is also passed to container.dirty(). """
    from calibre.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML

    stylepath = XPath("//h:style")
    styleattrpath = XPath("//*[@style]")
    changed = set()
    # items() works on both Python 2 and Python 3; iteritems() was removed
    # from dict in Python 3.
    for name, mt in container.mime_map.items():
        removed = False
        if mt in OEB_DOCS:
            root = container.parsed(name)
            for el, attr, href, pos in iterlinks(root, find_links_in_css=False):
                hname = container.href_to_name(href, name)
                frag = href.partition("#")[-1]
                if predicate(hname, href, frag):
                    if attr is None:
                        el.text = None
                    elif el.tag == XHTML("link") or el.tag == XHTML("img"):
                        # These tags are removed as a whole rather than
                        # being left without their link attribute
                        extract(el)
                    else:
                        del el.attrib[attr]
                    removed = True
            # CSS inside <style> tags
            for tag in stylepath(root):
                if tag.text and (tag.get("type") or "text/css").lower() == "text/css":
                    sheet = container.parse_css(tag.text)
                    if remove_links_in_sheet(partial(container.href_to_name, base=name), sheet, predicate):
                        tag.text = sheet.cssText
                        removed = True
            # CSS inside style="..." attributes
            for tag in styleattrpath(root):
                style = tag.get("style")
                if style:
                    style = container.parse_css(style, is_declaration=True)
                    if remove_links_in_declaration(partial(container.href_to_name, base=name), style, predicate):
                        removed = True
                        tag.set("style", style.cssText)
        elif mt in OEB_STYLES:
            removed = remove_links_in_sheet(
                partial(container.href_to_name, base=name), container.parsed(name), predicate
            )
        if removed:
            changed.add(name)
    # A plain loop: dirty() is called for its side effect only
    for changed_name in changed:
        container.dirty(changed_name)
    return changed
Example #8
0
def remove_links_to(container, predicate):
    ''' Strip out of the container all links for which predicate is true.

    predicate must be a function that takes the arguments (name, href,
    fragment=None) and returns True iff the link should be removed.

    The names of the files that were modified are returned as a set, after
    container.dirty() has been called for each of them. '''
    from calibre.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML
    style_tags = XPath('//h:style')
    styled_elements = XPath('//*[@style]')
    changed = set()
    for name, mt in iteritems(container.mime_map):
        removed = False
        if mt in OEB_STYLES and mt not in OEB_DOCS:
            resolver = partial(container.href_to_name, base=name)
            removed = remove_links_in_sheet(resolver, container.parsed(name), predicate)
        elif mt in OEB_DOCS:
            root = container.parsed(name)
            for el, attr, href, pos in iterlinks(root, find_links_in_css=False):
                hname = container.href_to_name(href, name)
                frag = href.partition('#')[-1]
                if not predicate(hname, href, frag):
                    continue
                removed = True
                if attr is None:
                    # The link is the text of the element
                    el.text = None
                    continue
                is_link = el.tag == XHTML('link')
                if is_link or el.tag == XHTML('img'):
                    extract(el)
                else:
                    del el.attrib[attr]
            for tag in style_tags(root):
                css_type = (tag.get('type') or 'text/css').lower() if tag.text else None
                if css_type != 'text/css':
                    continue
                sheet = container.parse_css(tag.text)
                resolver = partial(container.href_to_name, base=name)
                if remove_links_in_sheet(resolver, sheet, predicate):
                    tag.text = css_text(sheet)
                    removed = True
            for tag in styled_elements(root):
                raw_style = tag.get('style')
                if raw_style:
                    declaration = container.parse_css(raw_style, is_declaration=True)
                    resolver = partial(container.href_to_name, base=name)
                    if remove_links_in_declaration(resolver, declaration, predicate):
                        tag.set('style', css_text(declaration))
                        removed = True
        if removed:
            changed.add(name)
    for dirty_name in changed:
        container.dirty(dirty_name)
    return changed
Example #9
0
    def _manifest_add_missing(self, invalid):
        '''Add to the manifest files that are referenced but not listed.

        Manifest items that are documents, XML files or stylesheets are
        scanned for links (iterlinks for documents / XML,
        cssutils.getUrls for stylesheets). A scheme-less link whose target
        is not yet known is added to the manifest, with a warning, if the
        file exists in the container; if it does not exist only a warning
        is logged. Newly added items are scanned in turn until a pass adds
        nothing.

        Items whose data cannot be read are logged, skipped, and removed
        from the manifest at the end of each pass.

        NOTE(review): the ``invalid`` argument is replaced by an empty set
        before it is ever read, so the value passed by the caller has no
        effect here - confirm whether that is intended.
        '''
        import cssutils
        manifest = self.oeb.manifest
        known = set(manifest.hrefs)
        unchecked = set(manifest.values())
        cdoc = OEB_DOCS|OEB_STYLES
        invalid = set()
        while unchecked:
            new = set()
            for item in unchecked:
                data = None
                is_xml = item.media_type[-4:] in ('/xml', '+xml')
                if item.media_type in cdoc or is_xml:
                    try:
                        data = item.data
                    except Exception:
                        self.oeb.log.exception(u'Failed to read from manifest '
                                u'entry with id: %s, ignoring'%item.id)
                        invalid.add(item)
                        continue
                if data is None:
                    continue

                if item.media_type in OEB_DOCS or is_xml:
                    hrefs = [r[2] for r in iterlinks(data)]
                    for href in hrefs:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
                        href, _ = urldefrag(href)
                        if not href:
                            continue
                        try:
                            href = item.abshref(urlnormalize(href))
                            scheme = urlparse(href).scheme
                        except Exception:
                            self.oeb.log.exception(
                                'Skipping invalid href: %r'%href)
                            continue
                        if not scheme and href not in known:
                            new.add(href)
                elif item.media_type in OEB_STYLES:
                    try:
                        urls = list(cssutils.getUrls(data))
                    except Exception:
                        # Best effort: a stylesheet whose URLs cannot be
                        # listed contributes no links
                        urls = []
                    for url in urls:
                        href, _ = urldefrag(url)
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                        if not scheme and href not in known:
                            new.add(href)
            unchecked.clear()
            warned = set()
            for href in new:
                known.add(href)
                if any(href == bad.abshref(urlnormalize(href))
                       for bad in invalid):
                    continue
                if not self.oeb.container.exists(href):
                    if href not in warned:
                        self.logger.warn('Referenced file %r not found' % href)
                        warned.add(href)
                    continue
                if href not in warned:
                    self.logger.warn('Referenced file %r not in manifest' % href)
                    warned.add(href)
                new_id, _ = manifest.generate(id='added')
                guessed = guess_type(href)[0]
                media_type = guessed or BINARY_MIME
                added = manifest.add(new_id, href, media_type)
                # Scan the newly added item on the next pass
                unchecked.add(added)

            for item in invalid:
                self.oeb.manifest.remove(item)
Example #10
0
def iterhtmllinks(container, name):
    ''' Yield (element, attribute, link) for every link in the parsed file
    name that is_external() accepts and that is not on an <a> tag. '''
    for elem, attribute, url, _pos in iterlinks(container.parsed(name)):
        if barename(elem.tag).lower() == 'a':
            continue
        if is_external(url):
            yield elem, attribute, url
Example #11
0
def iterhtmllinks(container, name):
    ''' Iterate over the links in the parsed file name, yielding
    (element, attribute, link) for each one that is on a tag other than
    <a> and for which is_external() is true. '''
    root = container.parsed(name)
    for el, attr, link, pos in iterlinks(root):
        on_anchor = barename(el.tag).lower() == 'a'
        if not on_anchor and is_external(link):
            yield el, attr, link