def test_parse_links_caches_same_page():
    html = (
        # Mark this as a unicode string for Python 2 since anchor_html
        # can contain non-ascii.
        u'<html><head><meta charset="utf-8"></head>'
        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
    )
    html_bytes = html.encode('utf-8')

    page_1 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )
    page_2 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )

    mock_parse = mock.patch("pip._internal.index.collector.html5lib.parse")

    # Keep the patcher under its own name; re-binding it to the MagicMock
    # (``with mock_parse as mock_parse``) would leave the second block
    # unpatched.
    with mock_parse as mocked_parse:
        mocked_parse.return_value = html5lib.parse(
            page_1.content,
            transport_encoding=page_1.encoding,
            namespaceHTMLElements=False,
        )
        parsed_links_1 = list(parse_links(page_1))
        mocked_parse.assert_called()

    with mock_parse as mocked_parse:
        parsed_links_2 = list(parse_links(page_2))
        assert parsed_links_2 == parsed_links_1
        mocked_parse.assert_not_called()
def test_parse_link_handles_deprecated_usage_properly() -> None:
    html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

    parsed_links = list(parse_links(page, use_deprecated_html5lib=True))

    assert len(parsed_links) == 2
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url
def test_parse_links_caches_same_page_by_url() -> None:
    html = (
        "<!DOCTYPE html>"
        '<html><head><meta charset="utf-8"></head>'
        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
    )
    html_bytes = html.encode("utf-8")

    url = "https://example.com/simple/"

    page_1 = HTMLPage(
        html_bytes,
        encoding=None,
        url=url,
    )
    # Make a second page with zero content, to ensure that it's not accessed,
    # because the page was cached by url.
    page_2 = HTMLPage(
        b"",
        encoding=None,
        url=url,
    )
    # Make a third page which represents an index url, which should not be
    # cached, even for the same url. We modify the page content slightly to
    # verify that the result is not cached.
    page_3 = HTMLPage(
        re.sub(b"pkg1", b"pkg2", html_bytes),
        encoding=None,
        url=url,
        cache_link_parsing=False,
    )

    parsed_links_1 = list(parse_links(page_1, use_deprecated_html5lib=False))
    assert len(parsed_links_1) == 1
    assert "pkg1" in parsed_links_1[0].url

    parsed_links_2 = list(parse_links(page_2, use_deprecated_html5lib=False))
    assert parsed_links_2 == parsed_links_1

    parsed_links_3 = list(parse_links(page_3, use_deprecated_html5lib=False))
    assert len(parsed_links_3) == 1
    assert parsed_links_3 != parsed_links_1
    assert "pkg2" in parsed_links_3[0].url
def fetch_page(self, location):
    m = re.search(r"/simple/([^/]+)/?", location.url)
    if not m:
        return old_fetcher(self, location)
    name = m.group(1)
    if name not in fake_index:
        fake_index[name] = (FIXTURES / f"index/{name}.html").read_bytes()
    return HTMLPage(
        fake_index[name], "utf-8", location.url, cache_link_parsing=False
    )
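# A minimal sketch of how a fake fetcher like the one above might be installed
# for a test run. The patch target (LinkCollector.fetch_page) is an assumption
# inferred from the (self, location) signature, and fake_simple_index /
# fake_index are illustrative names, not necessarily what the surrounding
# suite uses.
from unittest import mock

import pytest

from pip._internal.index.collector import LinkCollector

fake_index = {}  # package name -> raw HTML bytes served instead of the network
old_fetcher = LinkCollector.fetch_page  # fall back to the real fetcher


@pytest.fixture
def fake_simple_index():
    # Route LinkCollector page fetches through fetch_page() defined above.
    with mock.patch.object(LinkCollector, "fetch_page", fetch_page):
        yield fake_index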
def test_parse_links_presents_warning_on_missing_doctype(
    caplog: pytest.LogCaptureFixture,
) -> None:
    html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

    with caplog.at_level(logging.WARN):
        parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

    assert len(parsed_links) == 2, parsed_links
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url

    assert len(caplog.records) == 1
def test_parse_links_presents_deprecation_warning_on_non_html5_page(
    mock_deprecated: mock.Mock,
) -> None:
    # mock_deprecated is injected by a mock.patch decorator (not shown in this
    # excerpt) that replaces pip's deprecated() helper.
    html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

    parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

    assert len(parsed_links) == 2, parsed_links
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url

    mock_deprecated.assert_called_once()
def _test_parse_links_data_attribute(
    anchor_html: str, attr: str, expected: Optional[str]
) -> None:
    html = (
        '<html><head><meta charset="utf-8"></head>'
        f"<body>{anchor_html}</body></html>"
    )
    html_bytes = html.encode("utf-8")
    page = HTMLPage(
        html_bytes,
        encoding=None,
        # parse_links() is cached by url, so we inject a random uuid to ensure
        # the page content isn't cached.
        url=f"https://example.com/simple-{uuid.uuid4()}/",
    )
    links = list(parse_links(page))
    (link,) = links
    actual = getattr(link, attr)
    assert actual == expected
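# A hedged example of how the helper above can be driven from a parametrized
# test. The cases follow PEP 503's data-requires-python attribute; the exact
# anchors and the test name are illustrative, and pytest / Optional are
# assumed to be imported in this module.
@pytest.mark.parametrize(
    "anchor_html, expected",
    [
        # Attribute not present.
        ('<a href="/pkg-1.0.tar.gz"></a>', None),
        # Attribute present with a value (HTML-escaped ">").
        ('<a href="/pkg-1.0.tar.gz" data-requires-python="&gt;=3.7"></a>', ">=3.7"),
    ],
)
def test_parse_links__requires_python(
    anchor_html: str, expected: Optional[str]
) -> None:
    _test_parse_links_data_attribute(anchor_html, "requires_python", expected)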
def test_parse_links__yanked_reason(anchor_html, expected):
    html = (
        # Mark this as a unicode string for Python 2 since anchor_html
        # can contain non-ascii.
        u'<html><head><meta charset="utf-8"></head>'
        '<body>{}</body></html>'
    ).format(anchor_html)
    html_bytes = html.encode('utf-8')
    page = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )
    links = list(parse_links(page))
    link, = links
    actual = link.yanked_reason
    assert actual == expected
def test_parse_links_presents_warning_on_html4_doctype(
    caplog: pytest.LogCaptureFixture,
) -> None:
    html = (
        b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
        b'"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
    )
    url = "https://example.com/simple/"
    page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)

    with caplog.at_level(logging.WARN):
        parsed_links = list(parse_links(page, use_deprecated_html5lib=False))

    assert len(parsed_links) == 2, parsed_links
    assert "pkg1-1.0" in parsed_links[0].url
    assert "pkg1-2.0" in parsed_links[1].url

    assert len(caplog.records) == 1
def test_parse_links__yanked_reason(anchor_html, expected):
    html = (
        # Mark this as a unicode string for Python 2 since anchor_html
        # can contain non-ascii.
        u'<html><head><meta charset="utf-8"></head>'
        '<body>{}</body></html>'
    ).format(anchor_html)
    html_bytes = html.encode('utf-8')
    page = HTMLPage(
        html_bytes,
        encoding=None,
        # parse_links() is cached by url, so we inject a random uuid to ensure
        # the page content isn't cached.
        url='https://example.com/simple-{}/'.format(uuid.uuid4()),
    )
    links = list(parse_links(page))
    link, = links
    actual = link.yanked_reason
    assert actual == expected
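# A minimal sketch of the kind of parametrization the yanked-reason test above
# expects (its @pytest.mark.parametrize decorator is not shown in this
# excerpt). The cases follow PEP 592's data-yanked attribute; the anchors and
# the test name are illustrative only, reusing the data-attribute helper
# defined earlier.
@pytest.mark.parametrize(
    "anchor_html, expected",
    [
        # Not yanked.
        ('<a href="/pkg-1.0.tar.gz"></a>', None),
        # Yanked, with a reason given.
        ('<a href="/pkg-1.0.tar.gz" data-yanked="bad metadata"></a>', 'bad metadata'),
    ],
)
def test_parse_links__yanked_reason_cases(anchor_html, expected):
    _test_parse_links_data_attribute(anchor_html, "yanked_reason", expected)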
def process_url(self, url, retrieve=False):
    """Evaluate a URL as a possible download, and maybe retrieve it"""
    if url in self.scanned_urls and not retrieve:
        return
    self.scanned_urls[url] = True
    if not URL_SCHEME(url):
        self.process_filename(url)
        return
    else:
        dists = list(distros_for_url(url))
        if dists:
            if not self.url_ok(url):
                return
            self.debug("Found link: %s", url)

    if dists or not retrieve or url in self.fetched_urls:
        list(map(self.add, dists))
        return  # don't need the actual page

    if not self.url_ok(url):
        self.fetched_urls[url] = True
        return

    self.info("Reading %s", url)
    self.fetched_urls[url] = True  # prevent multiple fetch attempts
    tmpl = "Download error on %s: %%s -- Some packages may not be found!"
    f = self.open_url(url, tmpl % url)
    if f is None:
        return
    if isinstance(f, urllib.error.HTTPError) and f.code == 401:
        self.info("Authentication error: %s" % f.msg)
    self.fetched_urls[f.url] = True
    if 'html' not in f.headers.get('content-type', '').lower():
        f.close()  # not html, we can't process it
        return

    base = f.url  # handle redirects
    page = f.read()

    # --- LOCAL CHANGES MADE HERE: ---
    if isinstance(page, six.text_type):
        page = page.encode('utf8')
        charset = 'utf8'
    else:
        if isinstance(f, urllib.error.HTTPError):
            # Errors have no charset, assume latin1:
            charset = 'latin-1'
        else:
            try:
                charset = f.headers.get_param('charset') or 'latin-1'
            except AttributeError:
                # Python 2
                charset = f.headers.getparam('charset') or 'latin-1'

    try:
        html_page = HTMLPage(page, charset, base, cache_link_parsing=False)
    except TypeError:
        html_page = HTMLPage(page, charset, base)

    # https://github.com/buildout/buildout/issues/598
    # use_deprecated_html5lib is a required addition in pip 22.
    try:
        plinks = parse_links(html_page, use_deprecated_html5lib=False)
    except TypeError:
        plinks = parse_links(html_page)
    plinks = list(plinks)
    pip_links = [l.url for l in plinks]
    # --- END OF LOCAL CHANGES ---

    if not isinstance(page, str):
        # In Python 3 and got bytes but want str.
        page = page.decode(charset, "ignore")
    f.close()

    # --- LOCAL CHANGES MADE HERE: ---
    links = []
    for match in HREF.finditer(page):
        link = urllib.parse.urljoin(base, htmldecode(match.group(1)))
        links.append(_clean_link(link))

    # TODO: remove assertion and double index page parsing before releasing.
    assert set(pip_links) == set(links)

    for link in plinks:
        if _check_link_requires_python(link, PY_VERSION_INFO):
            self.process_url(link.url)
    # --- END OF LOCAL CHANGES ---

    if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
        page = self.process_index(url, page)