Example 1
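Most of the snippets below are tests of parse_links() and HTMLPage from pip._internal.index.collector; Examples 4 and 11 show downstream code driving the same API. A rough sketch of the imports the test examples assume follows; exact paths differ between pip versions, so treat this as an approximation rather than pip's own header:

import logging
import re
import uuid
from typing import Optional
from unittest import mock

import pytest

# Inside pip's own tree the html5lib used by the older examples is the
# vendored copy, i.e. "from pip._vendor import html5lib".
import html5lib

from pip._internal.index.collector import HTMLPage, parse_links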
def test_parse_links_caches_same_page():
    html = (
        # Mark this as a unicode string for Python 2 since anchor_html
        # can contain non-ascii.
        u'<html><head><meta charset="utf-8"></head>'
        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>')
    html_bytes = html.encode('utf-8')

    page_1 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )
    page_2 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )

    parse_patcher = mock.patch("pip._internal.index.collector.html5lib.parse")
    with parse_patcher as mock_parse:
        mock_parse.return_value = html5lib.parse(
            page_1.content,
            transport_encoding=page_1.encoding,
            namespaceHTMLElements=False,
        )
        parsed_links_1 = list(parse_links(page_1))
        mock_parse.assert_called()

    with parse_patcher as mock_parse:
        parsed_links_2 = list(parse_links(page_2))
        assert parsed_links_2 == parsed_links_1
        mock_parse.assert_not_called()
Example 2
def test_parse_link_handles_deprecated_usage_properly() -> None:
    html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

    parsed_links = list(parse_links(page, use_deprecated_html5lib=True))

    assert len(parsed_links) == 2
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url
Example 3
def test_parse_links_caches_same_page_by_url() -> None:
    html = (
        "<!DOCTYPE html>"
        '<html><head><meta charset="utf-8"></head>'
        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
    )
    html_bytes = html.encode("utf-8")

    url = "https://example.com/simple/"

    page_1 = HTMLPage(
        html_bytes,
        encoding=None,
        url=url,
    )
    # Make a second page with zero content, to ensure that it's not accessed,
    # because the page was cached by url.
    page_2 = HTMLPage(
        b"",
        encoding=None,
        url=url,
    )
    # Make a third page which represents an index url, which should not be
    # cached, even for the same url. We modify the page content slightly to
    # verify that the result is not cached.
    page_3 = HTMLPage(
        re.sub(b"pkg1", b"pkg2", html_bytes),
        encoding=None,
        url=url,
        cache_link_parsing=False,
    )

    parsed_links_1 = list(parse_links(page_1, use_deprecated_html5lib=False))
    assert len(parsed_links_1) == 1
    assert "pkg1" in parsed_links_1[0].url

    parsed_links_2 = list(parse_links(page_2, use_deprecated_html5lib=False))
    assert parsed_links_2 == parsed_links_1

    parsed_links_3 = list(parse_links(page_3, use_deprecated_html5lib=False))
    assert len(parsed_links_3) == 1
    assert parsed_links_3 != parsed_links_1
    assert "pkg2" in parsed_links_3[0].url
Example 4
def fetch_page(self, location):
    m = re.search(r"/simple/([^/]+)/?", location.url)
    if not m:
        return old_fetcher(self, location)
    name = m.group(1)
    if name not in fake_index:
        fake_index[name] = (FIXTURES / f"index/{name}.html").read_bytes()
    return HTMLPage(
        fake_index[name], "utf-8", location.url, cache_link_parsing=False
    )
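A minimal sketch of how a fetch_page override like the one above might be installed. The LinkCollector target is an assumption (that is where the method lived in the pip versions these examples target), and the surrounding test plumbing is hypothetical, not the original fixture:

from pip._internal.index.collector import LinkCollector

# Keep a reference to the real method so unmatched URLs can fall back to it,
# then patch the class so /simple/<name>/ pages are read from local fixtures.
old_fetcher = LinkCollector.fetch_page
LinkCollector.fetch_page = fetch_page
try:
    ...  # exercise code that resolves requirements against the fake index
finally:
    LinkCollector.fetch_page = old_fetcher  # always restore the original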
Example 5
def test_parse_links_presents_warning_on_missing_doctype(
    caplog: pytest.LogCaptureFixture,
) -> None:
    html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

    with caplog.at_level(logging.WARN):
        parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

    assert len(parsed_links) == 2, parsed_links
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url

    assert len(caplog.records) == 1
Example 6
def test_parse_links_presents_deprecation_warning_on_non_html5_page(
    mock_deprecated: mock.Mock,
) -> None:
    html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

    parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

    assert len(parsed_links) == 2, parsed_links
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url

    mock_deprecated.assert_called_once()
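The mock_deprecated argument is a pytest fixture. A sketch of what it could look like; the patch target is an assumption about where the collector module imports pip's deprecated() helper:

from typing import Iterator
from unittest import mock

import pytest


@pytest.fixture
def mock_deprecated() -> Iterator[mock.MagicMock]:
    # Assumed patch target; adjust to wherever the collector imports deprecated().
    with mock.patch("pip._internal.index.collector.deprecated") as m:
        yield m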
Example 7
def _test_parse_links_data_attribute(anchor_html: str, attr: str,
                                     expected: Optional[str]) -> None:
    html = f'<html><head><meta charset="utf-8"></head><body>{anchor_html}</body></html>'
    html_bytes = html.encode("utf-8")
    page = HTMLPage(
        html_bytes,
        encoding=None,
        # parse_links() is cached by url, so we inject a random uuid to ensure
        # the page content isn't cached.
        url=f"https://example.com/simple-{uuid.uuid4()}/",
    )
    links = list(parse_links(page))
    (link, ) = links
    actual = getattr(link, attr)
    assert actual == expected
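In pip's suite this helper is driven by parametrized tests. A hypothetical invocation, using the standard PEP 503 data-requires-python attribute (the values here are illustrative, not pip's own parameters):

_test_parse_links_data_attribute(
    '<a href="/pkg-1.0.tar.gz" data-requires-python="&gt;=3.7"></a>',
    "requires_python",
    ">=3.7",
)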
Example 8
def test_parse_links__yanked_reason(anchor_html, expected):
    html = (
        # Mark this as a unicode string for Python 2 since anchor_html
        # can contain non-ascii.
        u'<html><head><meta charset="utf-8"></head>'
        '<body>{}</body></html>').format(anchor_html)
    html_bytes = html.encode('utf-8')
    page = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )
    links = list(parse_links(page))
    link, = links
    actual = link.yanked_reason
    assert actual == expected
Example 9
def test_parse_links_presents_warning_on_html4_doctype(
    caplog: pytest.LogCaptureFixture,
) -> None:
    html = (b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
            b'"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
            b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>')
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

    with caplog.at_level(logging.WARN):
        parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

    assert len(parsed_links) == 2, parsed_links
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url

    assert len(caplog.records) == 1
Example 10
def test_parse_links__yanked_reason(anchor_html, expected):
    html = (
        # Mark this as a unicode string for Python 2 since anchor_html
        # can contain non-ascii.
        u'<html><head><meta charset="utf-8"></head>'
        '<body>{}</body></html>').format(anchor_html)
    html_bytes = html.encode('utf-8')
    page = HTMLPage(
        html_bytes,
        encoding=None,
        # parse_links() is cached by url, so we inject a random uuid to ensure
        # the page content isn't cached.
        url='https://example.com/simple-{}/'.format(uuid.uuid4()),
    )
    links = list(parse_links(page))
    link, = links
    actual = link.yanked_reason
    assert actual == expected
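Both yanked_reason tests above rely on parametrization for anchor_html and expected. A hypothetical parametrization based on the PEP 592 data-yanked attribute (the reason strings are illustrative, not pip's own test data):

@pytest.mark.parametrize(
    "anchor_html, expected",
    [
        # No data-yanked attribute: the link is not yanked, so the reason is None.
        ('<a href="/pkg-1.0.tar.gz"></a>', None),
        # data-yanked carries the reason string, which parse_links passes through.
        ('<a href="/pkg-1.0.tar.gz" data-yanked="version withdrawn"></a>',
         "version withdrawn"),
    ],
)
def test_parse_links__yanked_reason(anchor_html, expected):
    ...  # body as in the examples above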
Example 11
        def process_url(self, url, retrieve=False):
            """Evaluate a URL as a possible download, and maybe retrieve it"""
            if url in self.scanned_urls and not retrieve:
                return
            self.scanned_urls[url] = True
            if not URL_SCHEME(url):
                self.process_filename(url)
                return
            else:
                dists = list(distros_for_url(url))
                if dists:
                    if not self.url_ok(url):
                        return
                    self.debug("Found link: %s", url)

            if dists or not retrieve or url in self.fetched_urls:
                list(map(self.add, dists))
                return  # don't need the actual page

            if not self.url_ok(url):
                self.fetched_urls[url] = True
                return

            self.info("Reading %s", url)
            self.fetched_urls[url] = True  # prevent multiple fetch attempts
            tmpl = "Download error on %s: %%s -- Some packages may not be found!"
            f = self.open_url(url, tmpl % url)
            if f is None:
                return
            if isinstance(f, urllib.error.HTTPError) and f.code == 401:
                self.info("Authentication error: %s" % f.msg)
            self.fetched_urls[f.url] = True
            if 'html' not in f.headers.get('content-type', '').lower():
                f.close()  # not html, we can't process it
                return

            base = f.url  # handle redirects
            page = f.read()

            # --- LOCAL CHANGES MADE HERE: ---

            if isinstance(page, six.text_type):
                page = page.encode('utf8')
                charset = 'utf8'
            else:
                if isinstance(f, urllib.error.HTTPError):
                    # Errors have no charset, assume latin1:
                    charset = 'latin-1'
                else:
                    try:
                        charset = f.headers.get_param('charset') or 'latin-1'
                    except AttributeError:
                        # Python 2
                        charset = f.headers.getparam('charset') or 'latin-1'
            try:
                html_page = HTMLPage(page, charset, base, cache_link_parsing=False)
            except TypeError:
                html_page = HTMLPage(page, charset, base)

            # https://github.com/buildout/buildout/issues/598
            # use_deprecated_html5lib is a required addition in pip 22.
            try:
                plinks = parse_links(html_page, use_deprecated_html5lib=False)
            except TypeError:
                plinks = parse_links(html_page)
            plinks = list(plinks)
            pip_links = [l.url for l in plinks]

            # --- END OF LOCAL CHANGES ---

            if not isinstance(page, str):
                # In Python 3 and got bytes but want str.
                page = page.decode(charset, "ignore")
            f.close()

            # --- LOCAL CHANGES MADE HERE: ---

            links = []
            for match in HREF.finditer(page):
                link = urllib.parse.urljoin(base, htmldecode(match.group(1)))
                links.append(_clean_link(link))

            # TODO: remove assertion and double index page parsing before releasing.
            assert set(pip_links) == set(links)

            for link in plinks:
                if _check_link_requires_python(link, PY_VERSION_INFO):
                    self.process_url(link.url)

            # --- END OF LOCAL CHANGES ---

            if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
                page = self.process_index(url, page)