Example #1
def _create_link_from_element(
        anchor,  # type: HTMLElement
        page_url,  # type: str
        base_url,  # type: str
):
    # type: (...) -> Optional[Link]
    """
    Convert an anchor element in a simple repository page to a Link.
    """
    href = anchor.get("href")
    if not href:
        return None

    url = _clean_link(urllib_parse.urljoin(base_url, href))
    pyrequire = anchor.get('data-requires-python')
    pyrequire = unescape(pyrequire) if pyrequire else None

    yanked_reason = anchor.get('data-yanked')
    if yanked_reason:
        # This is a unicode string in Python 2 (and 3).
        yanked_reason = unescape(yanked_reason)

    link = Link(
        url,
        comes_from=page_url,
        requires_python=pyrequire,
        yanked_reason=yanked_reason,
    )

    return link
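In recent pip this helper is not called directly; a parse_links-style generator walks every anchor in the html5lib-parsed document and feeds each one through it. Below is a sketch of that driver, consistent with the iter_links variants in the later examples; it is written from memory of pip's source, so treat the exact names and signatures as assumptions rather than the definitive API.

def parse_links(page):
    # type: (HTMLPage) -> Iterable[Link]
    document = html5lib.parse(
        page.content,
        transport_encoding=_get_encoding_from_headers(page.headers),
        namespaceHTMLElements=False,
    )
    base_url = _determine_base_url(document, page.url)
    for anchor in document.findall(".//a"):
        # Anchors without an href make the helper return None; skip those.
        link = _create_link_from_element(
            anchor, page_url=page.url, base_url=base_url,
        )
        if link is not None:
            yield link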
Example #2
    def links(self):
        """
        Return the URLs of all the links on a page together with information
        about their "rel" attribute, for determining which ones to treat as
        downloads and which ones to queue for further scraping.
        """
        def clean(url):
            "Tidy up an URL."
            scheme, netloc, path, params, query, frag = urlparse(url)
            return urlunparse((scheme, netloc, quote(path),
                               params, query, frag))

        result = set()
        for match in self._href.finditer(self.data):
            d = match.groupdict('')
            rel = (d['rel1'] or d['rel2'] or d['rel3'] or
                   d['rel4'] or d['rel5'] or d['rel6'])
            url = d['url1'] or d['url2'] or d['url3']
            url = urljoin(self.base_url, url)
            url = unescape(url)
            url = self._clean_re.sub(lambda m: '%%%2x' % ord(m.group(0)), url)
            result.add((url, rel))
        # We sort the result, hoping to bring the most recent versions
        # to the front
        result = sorted(result, key=lambda t: t[0], reverse=True)
        return result
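The _clean_re.sub(...) line percent-encodes any character the regex treats as unsafe in a URL. A self-contained sketch of the idea, where the pattern below is an assumption standing in for the class's real _clean_re:

import re

# Hypothetical stand-in for self._clean_re: match anything outside a
# conservative set of URL-safe characters.
_clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)

def percent_encode(url):
    # '%%%2x' emits a literal '%' plus the character's hex code, so a
    # space (0x20) becomes '%20'.  Note '%02x' would be safer for code
    # points below 0x10, which '%2x' pads with a space instead of a zero.
    return _clean_re.sub(lambda m: '%%%2x' % ord(m.group(0)), url)

print(percent_encode('https://example.com/my package.tar.gz'))
# https://example.com/my%20package.tar.gz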
Example #3
File: index.py Project: rwols/pip
    def links(self):
        """Yields all links in the page"""
        for anchor in self.parsed.findall(".//a"):
            if anchor.get("href"):
                href = anchor.get("href")
                url = self.clean_link(urllib_parse.urljoin(
                    self.base_url, href))
                pyrequire = anchor.get('data-requires-python')
                pyrequire = unescape(pyrequire) if pyrequire else None
                yield Link(url, self, requires_python=pyrequire)
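Every variant here leans on urljoin to turn page-relative href values into absolute URLs against the page's base URL. For example, using the stdlib name (the Py2/3 code above reaches it through urllib_parse) and illustrative URLs:

from urllib.parse import urljoin

base = 'https://pypi.org/simple/pip/'
# Relative hrefs are resolved against the base URL...
print(urljoin(base, '../../packages/pip-20.0.tar.gz'))
# https://pypi.org/packages/pip-20.0.tar.gz
# ...while absolute hrefs are returned unchanged.
print(urljoin(base, 'https://files.example.org/pip-20.0.tar.gz'))
# https://files.example.org/pip-20.0.tar.gz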
Example #4
File: index.py Project: jaraco/pip
    def iter_links(self):
        """Yields all links in the page"""
        document = html5lib.parse(
            self.content,
            transport_encoding=_get_encoding_from_headers(self.headers),
            namespaceHTMLElements=False,
        )
        base_url = _determine_base_url(document, self.url)
        for anchor in document.findall(".//a"):
            if anchor.get("href"):
                href = anchor.get("href")
                url = _clean_link(urllib_parse.urljoin(base_url, href))
                pyrequire = anchor.get('data-requires-python')
                pyrequire = unescape(pyrequire) if pyrequire else None
                yield Link(url, self.url, requires_python=pyrequire)
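The _determine_base_url helper used here is what lets a page override its own URL via an HTML <base href="..."> tag. A minimal implementation consistent with this usage, written from memory of pip's source, so treat it as a sketch:

def _determine_base_url(document, page_url):
    """Return the first <base href> in the document, else the page URL.

    Anchors are then joined against this base, which is how a mirror can
    serve pages whose links resolve against a different host.
    """
    for base in document.findall(".//base"):
        href = base.get("href")
        if href is not None:
            return href
    return page_url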