def _handle_lxml_elem(el):
    """Yield every link found on a single lxml element.

    Adapted from the source of ``lxml.html``'s ``iterlinks``, with added
    handling of attributes that hold several urls at once (the attributes
    listed in ``LIST_LINK_ATTRIBS``, i.e. src-set).

    Each yielded item is an ``(element, attribute, link, pos)`` tuple, where
    ``attribute`` may be None (indicating the link is in the element's
    text).  ``pos`` is the position where the link occurs inside the
    attribute value or text; often 0, but sometimes something else in the
    case of multi-url attributes, stylesheets or style tags.

    Note: multiple links inside of a multi-url attribute, a ``style``
    attribute or the text of a ``<style>`` tag are yielded in reversed
    order.  This makes it possible to replace or delete them from the
    string value based on their reported positions.  Otherwise, a
    modification at one position can change the positions of links reported
    later on.  Entries of an ``<object archive=...>`` attribute are the
    exception: they are yielded in the order they appear.

    :param el: lxml element; its ``tag``, ``attrib``, ``get()`` and ``text``
        are read, nothing is modified here.
    """
    attribs = el.attrib
    tag = _nons(el.tag)
    if tag == 'object':  # pragma: no cover
        # Urls of an <object> tag are resolved against its optional
        # ``codebase`` attribute before being reported.
        codebase = None
        if 'codebase' in attribs:
            codebase = el.get('codebase')
            yield el, 'codebase', codebase, 0
        for attrib in ('classid', 'data'):
            if attrib in attribs:
                value = el.get(attrib)
                if codebase is not None:
                    value = urljoin(codebase, value)
                yield el, attrib, value, 0
        if 'archive' in attribs:
            for match in _archive_re.finditer(el.get('archive')):
                value = match.group(0)
                if codebase is not None:
                    value = urljoin(codebase, value)
                yield el, 'archive', value, match.start()
    else:
        for attrib in SINGLE_LINK_ATTRIBS:
            if attrib in attribs:
                yield el, attrib, attribs[attrib], 0

        # XXX Patch for multi-url detection
        for attrib in LIST_LINK_ATTRIBS:
            if attrib in attribs:
                # yield in reversed order to simplify in-place modifications
                for match in reversed(list(_iter_srcset_urls(attribs[attrib]))):
                    raw = match.group(1)
                    # The url is reported stripped, so the start position
                    # has to skip any leading whitespace that was removed;
                    # otherwise ``pos`` would not point at the link itself.
                    leading = len(raw) - len(raw.lstrip())
                    url, start = _unquote_match(
                        raw.strip(), match.start(1) + leading)
                    yield el, attrib, url, start
    if tag == 'meta':
        http_equiv = attribs.get('http-equiv', '').lower()
        if http_equiv == 'refresh':
            content = attribs.get('content', '')
            match = _parse_meta_refresh_url(content)
            url = (match.group('url') if match else content).strip()
            # unexpected content means the redirect won't work, but we might
            # as well be permissive and yield the entire string.
            if url:
                url, pos = _unquote_match(
                    url,
                    match.start('url') if match else content.find(url))
                yield el, 'content', url, pos
    elif tag == 'param':
        valuetype = el.get('valuetype') or ''
        if valuetype.lower() == 'ref':
            # NOTE(review): ``el.get('value')`` is None when the attribute
            # is absent, so a None link can be yielded here -- confirm the
            # consumers tolerate that.
            yield el, 'value', el.get('value'), 0
    elif tag == 'style' and el.text:
        urls = [
            # (start_pos, url)
            _unquote_match(match.group(1), match.start(1))[::-1]
            for match in _iter_css_urls(el.text)
        ] + [(match.start(1), match.group(1))
             for match in _iter_css_imports(el.text)]
        # sort by start pos to bring both match sets back into order
        # and reverse the list to report correct positions despite
        # modifications
        urls.sort(reverse=True)
        for start, url in urls:
            yield el, None, url, start
    if 'style' in attribs:
        # yield in reversed order to simplify in-place modifications
        for match in reversed(list(_iter_css_urls(attribs['style']))):
            url, start = _unquote_match(match.group(1), match.start(1))
            yield el, 'style', url, start
Exemple #2
0
 def __handle(self, el):
     """Report every link found on a single lxml element to ``self.handle``.

     For each link, ``self.handle(element, attribute, link, pos)`` is called,
     where ``attribute`` is None when the link sits in the element's text and
     ``pos`` is the position of the link inside the attribute value or text.

     Multiple links inside of a multi-url attribute, a ``style`` attribute or
     the text of a ``<style>`` tag are reported in reversed order, which
     simplifies in-place modifications based on the reported positions.
     Entries of an ``<object archive=...>`` attribute are reported in the
     order they appear.

     The original docstring describes this step as building file objects and
     starting the download; presumably that work happens inside
     ``self.handle``, which is not visible here.

     :param el: lxml element straight out of the parser; its ``tag``,
         ``attrib``, ``get()`` and ``text`` are read by this method.
     """
     attribs = el.attrib
     tag = _nons(el.tag)
     if tag == 'object':
         # Urls of an <object> tag are resolved against its optional
         # ``codebase`` attribute before being reported.
         codebase = None
         if 'codebase' in attribs:
             codebase = el.get('codebase')
             self.handle(el, 'codebase', codebase, 0)
         for attrib in ('classid', 'data'):
             if attrib in attribs:
                 value = el.get(attrib)
                 if codebase is not None:
                     value = urljoin(codebase, value)
                 self.handle(el, attrib, value, 0)
         if 'archive' in attribs:
             for match in _archive_re.finditer(el.get('archive')):
                 value = match.group(0)
                 if codebase is not None:
                     value = urljoin(codebase, value)
                 self.handle(el, 'archive', value, match.start())
     else:
         for attrib in link_attrs:
             if attrib in attribs:
                 self.handle(el, attrib, attribs[attrib], 0)
         for attrib in list_link_attrs:
             if attrib in attribs:
                 # report in reversed order to simplify in-place modifications
                 for match in reversed(list(_iter_srcset_urls(attribs[attrib]))):
                     raw = match.group(1)
                     # The url is reported stripped, so the start position
                     # has to skip any leading whitespace that was removed;
                     # otherwise ``pos`` would not point at the link itself.
                     leading = len(raw) - len(raw.lstrip())
                     url, start = _unquote_match(
                         raw.strip(), match.start(1) + leading)
                     self.handle(el, attrib, url, start)
     if tag == 'meta':
         http_equiv = attribs.get('http-equiv', '').lower()
         if http_equiv == 'refresh':
             content = attribs.get('content', '')
             match = _parse_meta_refresh_url(content)
             url = (match.group('url') if match else content).strip()
             # unexpected content means the redirect won't work, but we might
             # as well be permissive and report the entire string.
             if url:
                 url, pos = _unquote_match(
                     url,
                     match.start('url') if match else content.find(url))
                 self.handle(el, 'content', url, pos)
     elif tag == 'param':
         valuetype = el.get('valuetype') or ''
         if valuetype.lower() == 'ref':
             # NOTE(review): ``el.get('value')`` is None when the attribute
             # is absent, so ``self.handle`` can receive a None link here --
             # confirm it tolerates that.
             self.handle(el, 'value', el.get('value'), 0)
     elif tag == 'style' and el.text:
         urls = [
             # (start_pos, url)
             _unquote_match(match.group(1), match.start(1))[::-1]
             for match in _iter_css_urls(el.text)
         ] + [(match.start(1), match.group(1))
              for match in _iter_css_imports(el.text)]
         # sort by start pos to bring both match sets back into order
         # and reverse the list to report correct positions despite
         # modifications
         urls.sort(reverse=True)
         for start, url in urls:
             self.handle(el, None, url, start)
     if 'style' in attribs:
         # report in reversed order to simplify in-place modifications
         for match in reversed(list(_iter_css_urls(attribs['style']))):
             url, start = _unquote_match(match.group(1), match.start(1))
             self.handle(el, 'style', url, start)