Example #1
def test_parse_links_caches_same_page():
    html = (
        # Mark this as a unicode string for Python 2 since the HTML
        # content can contain non-ascii characters.
        u'<html><head><meta charset="utf-8"><head>'
        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>')
    html_bytes = html.encode('utf-8')

    page_1 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )
    page_2 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )

    parse_patch = mock.patch("pip._internal.index.collector.html5lib.parse")
    with parse_patch as mock_parse:
        mock_parse.return_value = html5lib.parse(
            page_1.content,
            transport_encoding=page_1.encoding,
            namespaceHTMLElements=False,
        )
        parsed_links_1 = list(parse_links(page_1))
        mock_parse.assert_called()

    with parse_patch as mock_parse:
        parsed_links_2 = list(parse_links(page_2))
        assert parsed_links_2 == parsed_links_1
        # The result for page_1 was cached, so html5lib.parse is not invoked again.
        mock_parse.assert_not_called()
Example #2
def test_parse_links_caches_same_page_by_url() -> None:
    html = (
        "<!DOCTYPE html>"
        '<html><head><meta charset="utf-8"><head>'
        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
    )
    html_bytes = html.encode("utf-8")

    url = "https://example.com/simple/"

    page_1 = IndexContent(
        html_bytes,
        "text/html",
        encoding=None,
        url=url,
    )
    # Make a second page with zero content, to ensure that it's not accessed,
    # because the page was cached by url.
    page_2 = IndexContent(
        b"",
        "text/html",
        encoding=None,
        url=url,
    )
    # Make a third page which represents an index url, which should not be
    # cached, even for the same url. We modify the page content slightly to
    # verify that the result is not cached.
    page_3 = IndexContent(
        re.sub(b"pkg1", b"pkg2", html_bytes),
        "text/html",
        encoding=None,
        url=url,
        cache_link_parsing=False,
    )

    parsed_links_1 = list(parse_links(page_1))
    assert len(parsed_links_1) == 1
    assert "pkg1" in parsed_links_1[0].url

    parsed_links_2 = list(parse_links(page_2))
    assert parsed_links_2 == parsed_links_1

    parsed_links_3 = list(parse_links(page_3))
    assert len(parsed_links_3) == 1
    assert parsed_links_3 != parsed_links_1
    assert "pkg2" in parsed_links_3[0].url
Example #3
def test_parse_link_handles_deprecated_usage_properly() -> None:
    html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)
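    # use_deprecated_html5lib=True routes parsing through pip's legacy
    # html5lib-based code path instead of the default parser.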

    parsed_links = list(parse_links(page, use_deprecated_html5lib=True))

    assert len(parsed_links) == 2
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url
Example #4
def test_parse_links_caches_same_page_by_url():
    html = ('<html><head><meta charset="utf-8"><head>'
            '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>')
    html_bytes = html.encode('utf-8')

    url = 'https://example.com/simple/'

    page_1 = HTMLPage(
        html_bytes,
        encoding=None,
        url=url,
    )
    # Make a second page with zero content, to ensure that it's not accessed,
    # because the page was cached by url.
    page_2 = HTMLPage(
        b'',
        encoding=None,
        url=url,
    )
    # Make a third page which represents an index url, which should not be
    # cached, even for the same url. We modify the page content slightly to
    # verify that the result is not cached.
    page_3 = HTMLPage(
        re.sub(b'pkg1', b'pkg2', html_bytes),
        encoding=None,
        url=url,
        cache_link_parsing=False,
    )

    parsed_links_1 = list(parse_links(page_1))
    assert len(parsed_links_1) == 1
    assert 'pkg1' in parsed_links_1[0].url

    parsed_links_2 = list(parse_links(page_2))
    assert parsed_links_2 == parsed_links_1

    parsed_links_3 = list(parse_links(page_3))
    assert len(parsed_links_3) == 1
    assert parsed_links_3 != parsed_links_1
    assert 'pkg2' in parsed_links_3[0].url
Example #5
def test_parse_links_presents_warning_on_missing_doctype(
    caplog: pytest.LogCaptureFixture,
) -> None:
    html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

    with caplog.at_level(logging.WARN):
        parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

    assert len(parsed_links) == 2, parsed_links
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url

    assert len(caplog.records) == 1
Example #6
def test_parse_links_presents_deprecation_warning_on_non_html5_page(
    mock_deprecated: mock.Mock,
) -> None:
    html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

    parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

    assert len(parsed_links) == 2, parsed_links
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url

    mock_deprecated.assert_called_once()
Example #7
def test_parse_links_json() -> None:
    json_bytes = json.dumps(
        {
            "meta": {"api-version": "1.0"},
            "name": "holygrail",
            "files": [
                {
                    "filename": "holygrail-1.0.tar.gz",
                    "url": "https://example.com/files/holygrail-1.0.tar.gz",
                    "hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"},
                    "requires-python": ">=3.7",
                    "yanked": "Had a vulnerability",
                },
                {
                    "filename": "holygrail-1.0-py3-none-any.whl",
                    "url": "/files/holygrail-1.0-py3-none-any.whl",
                    "hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"},
                    "requires-python": ">=3.7",
                    "dist-info-metadata": False,
                },
            ],
        }
    ).encode("utf8")
    page = IndexContent(
        json_bytes,
        "application/vnd.pypi.simple.v1+json",
        encoding=None,
        # parse_links() is cached by url, so we inject a random uuid to ensure
        # the page content isn't cached.
        url=f"https://example.com/simple-{uuid.uuid4()}/",
    )
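    # The second file entry above uses a relative URL ("/files/..."); the
    # expected Link below shows it resolved against the page URL.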
    links = list(parse_links(page))

    assert links == [
        Link(
            "https://example.com/files/holygrail-1.0.tar.gz",
            comes_from=page.url,
            requires_python=">=3.7",
            yanked_reason="Had a vulnerability",
            hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
        ),
        Link(
            "https://example.com/files/holygrail-1.0-py3-none-any.whl",
            comes_from=page.url,
            requires_python=">=3.7",
            yanked_reason=None,
            hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
        ),
    ]
Example #8
    def process_project_url(self, project_url, link_evaluator):
        # type: (Link, LinkEvaluator) -> List[InstallationCandidate]
        logger.debug("Fetching project page and analyzing links: %s",
                     project_url)
        html_page = self._link_collector.fetch_page(project_url)
        if html_page is None:
            return []

        page_links = list(parse_links(html_page))

        with indent_log():
            package_links = self.evaluate_links(link_evaluator,
                                                links=page_links)

        return package_links
Example #9
def _test_parse_links_data_attribute(anchor_html: str, attr: str,
                                     expected: Optional[str]) -> None:
    html = f'<html><head><meta charset="utf-8"><head><body>{anchor_html}</body></html>'
    html_bytes = html.encode("utf-8")
    page = HTMLPage(
        html_bytes,
        encoding=None,
        # parse_links() is cached by url, so we inject a random uuid to ensure
        # the page content isn't cached.
        url=f"https://example.com/simple-{uuid.uuid4()}/",
    )
    links = list(parse_links(page))
    (link, ) = links
    actual = getattr(link, attr)
    assert actual == expected
Example #10
def test_parse_links__yanked_reason(anchor_html, expected):
    html = (
        # Mark this as a unicode string for Python 2 since anchor_html
        # can contain non-ascii.
        u'<html><head><meta charset="utf-8"><head>'
        '<body>{}</body></html>').format(anchor_html)
    html_bytes = html.encode('utf-8')
    page = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )
    links = list(parse_links(page))
    link, = links
    actual = link.yanked_reason
    assert actual == expected
Example #11
def test_parse_links_presents_warning_on_html4_doctype(
    caplog: pytest.LogCaptureFixture,
) -> None:
    html = (b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
            b'"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
            b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>')
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

    with caplog.at_level(logging.WARN):
        parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

    assert len(parsed_links) == 2, parsed_links
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url

    assert len(caplog.records) == 1
Example #12
def test_parse_links__yanked_reason(anchor_html, expected):
    html = (
        # Mark this as a unicode string for Python 2 since anchor_html
        # can contain non-ascii.
        u'<html><head><meta charset="utf-8"><head>'
        '<body>{}</body></html>').format(anchor_html)
    html_bytes = html.encode('utf-8')
    page = HTMLPage(
        html_bytes,
        encoding=None,
        # parse_links() is cached by url, so we inject a random uuid to ensure
        # the page content isn't cached.
        url='https://example.com/simple-{}/'.format(uuid.uuid4()),
    )
    links = list(parse_links(page))
    link, = links
    actual = link.yanked_reason
    assert actual == expected
Example #13
    def process_project_url(
        self, project_url: Link, link_evaluator: LinkEvaluator
    ) -> List[InstallationCandidate]:
        logger.debug(
            "Fetching project page and analyzing links: %s",
            project_url,
        )
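        # fetch_response() returns an IndexContent wrapping either an HTML or
        # a JSON (PEP 691) Simple API response; parse_links() handles both.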
        index_response = self._link_collector.fetch_response(project_url)
        if index_response is None:
            return []

        page_links = list(parse_links(index_response))

        with indent_log():
            package_links = self.evaluate_links(
                link_evaluator,
                links=page_links,
            )

        return package_links
Example #14
    def process_project_url(
            self, project_url: Link,
            link_evaluator: LinkEvaluator) -> List[InstallationCandidate]:
        logger.debug(
            "Fetching project page and analyzing links: %s",
            project_url,
        )
        html_page = self._link_collector.fetch_page(project_url)
        if html_page is None:
            return []

        page_links = list(parse_links(html_page,
                                      self._use_deprecated_html5lib))

        with indent_log():
            package_links = self.evaluate_links(
                link_evaluator,
                links=page_links,
            )

        return package_links
Example #15
        def process_url(self, url, retrieve=False):
            """Evaluate a URL as a possible download, and maybe retrieve it"""
            if url in self.scanned_urls and not retrieve:
                return
            self.scanned_urls[url] = True
            if not URL_SCHEME(url):
                self.process_filename(url)
                return
            else:
                dists = list(distros_for_url(url))
                if dists:
                    if not self.url_ok(url):
                        return
                    self.debug("Found link: %s", url)

            if dists or not retrieve or url in self.fetched_urls:
                list(map(self.add, dists))
                return  # don't need the actual page

            if not self.url_ok(url):
                self.fetched_urls[url] = True
                return

            self.info("Reading %s", url)
            self.fetched_urls[url] = True  # prevent multiple fetch attempts
            tmpl = "Download error on %s: %%s -- Some packages may not be found!"
            f = self.open_url(url, tmpl % url)
            if f is None:
                return
            if isinstance(f, urllib.error.HTTPError) and f.code == 401:
                self.info("Authentication error: %s" % f.msg)
            self.fetched_urls[f.url] = True
            if 'html' not in f.headers.get('content-type', '').lower():
                f.close()  # not html, we can't process it
                return

            base = f.url  # handle redirects
            page = f.read()

            # --- LOCAL CHANGES MADE HERE: ---

            if isinstance(page, six.text_type):
                page = page.encode('utf8')
                charset = 'utf8'
            else:
                if isinstance(f, urllib.error.HTTPError):
                    # Errors have no charset, assume latin1:
                    charset = 'latin-1'
                else:
                    try:
                        charset = f.headers.get_param('charset') or 'latin-1'
                    except AttributeError:
                        # Python 2
                        charset = f.headers.getparam('charset') or 'latin-1'
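            # HTMLPage only gained the cache_link_parsing keyword in newer pip
            # releases, hence the TypeError fallback to the older signature.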
            try:
                html_page = HTMLPage(page, charset, base, cache_link_parsing=False)
            except TypeError:
                html_page = HTMLPage(page, charset, base)

            # https://github.com/buildout/buildout/issues/598
            # use_deprecated_html5lib is a required addition in pip 22.
            try:
                plinks = parse_links(html_page, use_deprecated_html5lib=False)
            except TypeError:
                plinks = parse_links(html_page)
            plinks = list(plinks)
            pip_links = [l.url for l in plinks]

            # --- END OF LOCAL CHANGES ---

            if not isinstance(page, str):
                # In Python 3 and got bytes but want str.
                page = page.decode(charset, "ignore")
            f.close()

            # --- LOCAL CHANGES MADE HERE: ---

            links = []
            for match in HREF.finditer(page):
                link = urllib.parse.urljoin(base, htmldecode(match.group(1)))
                links.append(_clean_link(link))

            # TODO: remove assertion and double index page parsing before releasing.
            assert set(pip_links) == set(links)

            for link in plinks:
                if _check_link_requires_python(link, PY_VERSION_INFO):
                    self.process_url(link.url)

            # --- END OF LOCAL CHANGES ---

            if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
                page = self.process_index(url, page)