def test_determine_base_url(html, url, expected):
    """Check that _determine_base_url extracts the expected base URL."""
    doc = html5lib.parse(
        html,
        transport_encoding=None,
        namespaceHTMLElements=False,
    )
    assert _determine_base_url(doc, url) == expected
def parse_links(
    html,      # type: bytes
    encoding,  # type: Optional[str]
    url,       # type: str
):
    # type: (...) -> Iterable[Link]
    """
    Parse an HTML document, and yield its anchor elements as Link objects.

    :param url: the URL from which the HTML was downloaded.
    """
    document = html5lib.parse(
        html,
        transport_encoding=encoding,
        namespaceHTMLElements=False,
    )
    base_url = _determine_base_url(document, url)
    # Every <a> element is a candidate link; the helper returns None for
    # anchors that do not resolve to a usable Link.
    for anchor in document.findall(".//a"):
        link = _create_link_from_element(anchor, page_url=url, base_url=base_url)
        if link is not None:
            yield link
def test_parse_links_caches_same_page():
    """Two pages with identical content/encoding/url must hit the parse cache.

    Bug fix: the original bound the patcher and the mock to the same name
    (``with mock_parse as mock_parse``).  After the first ``with`` exited,
    the name referred to the MagicMock, so the second ``with`` merely
    entered the MagicMock's context-manager protocol — it did NOT re-patch
    ``html5lib.parse`` — and ``assert_not_called()`` ran against a fresh
    child mock, verifying nothing.  Patch independently in each block.
    """
    html = (
        # Mark this as a unicode string for Python 2 since anchor_html
        # can contain non-ascii.
        u'<html><head><meta charset="utf-8"><head>'
        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
    )
    html_bytes = html.encode('utf-8')
    page_1 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )
    page_2 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )

    with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
        mock_parse.return_value = html5lib.parse(
            page_1.content,
            transport_encoding=page_1.encoding,
            namespaceHTMLElements=False,
        )
        parsed_links_1 = list(parse_links(page_1))
        mock_parse.assert_called()

    with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
        parsed_links_2 = list(parse_links(page_2))
        assert parsed_links_2 == parsed_links_1
        # Cached result: html5lib.parse must not run a second time.
        mock_parse.assert_not_called()
def __init__(self, content, url, headers=None):
    """Store a fetched index page and parse it into a DOM.

    :param content: raw response body (bytes).
    :param url: the URL the content was downloaded from.
    :param headers: optional response headers; a ``charset`` in the
        Content-Type header is forwarded to the HTML parser.
    """
    # Determine if we have any encoding information in our headers
    encoding = None
    if headers and "Content-Type" in headers:
        content_type, params = cgi.parse_header(headers["Content-Type"])
        if "charset" in params:
            encoding = params["charset"]

    self.content = content
    # html5lib renamed its ``encoding`` keyword to ``transport_encoding``;
    # the old name is rejected by current html5lib releases and every other
    # parse() call in this file already uses the new keyword.
    self.parsed = html5lib.parse(
        self.content,
        transport_encoding=encoding,
        namespaceHTMLElements=False,
    )
    self.url = url
    self.headers = headers
def iter_links(self):
    """Yields all links in the page"""
    document = html5lib.parse(
        self.content,
        transport_encoding=_get_encoding_from_headers(self.headers),
        namespaceHTMLElements=False,
    )
    base_url = _determine_base_url(document, self.url)
    for anchor in document.findall(".//a"):
        href = anchor.get("href")
        if not href:
            # Anchors without an href cannot produce a link.
            continue
        url = _clean_link(urllib_parse.urljoin(base_url, href))
        pyrequire = anchor.get('data-requires-python')
        if pyrequire:
            pyrequire = unescape(pyrequire)
        else:
            pyrequire = None
        yield Link(url, self.url, requires_python=pyrequire)
def __init__(self, content, url, headers=None):
    """Capture a fetched page and parse it, honouring any declared charset."""
    # Pull a charset out of the Content-Type header, if one was sent.
    encoding = None
    if headers and "Content-Type" in headers:
        _, params = cgi.parse_header(headers["Content-Type"])
        encoding = params.get("charset")

    self.content = content
    self.parsed = html5lib.parse(
        self.content,
        transport_encoding=encoding,
        namespaceHTMLElements=False,
    )
    self.url = url
    self.headers = headers
def parse_links(page):
    # type: (HTMLPage) -> Iterable[Link]
    """
    Parse an HTML document, and yield its anchor elements as Link objects.
    """
    page_url = page.url
    document = html5lib.parse(
        page.content,
        transport_encoding=page.encoding,
        namespaceHTMLElements=False,
    )
    base = _determine_base_url(document, page_url)
    for anchor in document.findall(".//a"):
        link = _create_link_from_element(anchor, page_url=page_url, base_url=base)
        if link is not None:
            yield link
def iter_links(self):
    # type: () -> Iterable[Link]
    """Yields all links in the page"""
    doc = html5lib.parse(
        self.content,
        transport_encoding=_get_encoding_from_headers(self.headers),
        namespaceHTMLElements=False,
    )
    base = _determine_base_url(doc, self.url)
    for anchor in doc.findall(".//a"):
        link = _create_link_from_element(
            anchor,
            page_url=self.url,
            base_url=base,
        )
        if link is not None:
            yield link
def search(self, query: str) -> SearchResult:
    """Search the configured index's ``/search`` endpoint for *query*.

    Falls back to ``DEFAULT_INDEX_URL`` when the configured index returns
    404 for the search endpoint.

    :param query: free-text search string.
    :return: list of ``Package(name, version, description)`` results.
    :raises requests.HTTPError: when the (fallback) search request fails.
    """
    pypi_simple = self.sources[0]["url"].rstrip("/")

    # ".../simple" -> ".../search" (the trailing slash survives the slice).
    if pypi_simple.endswith("/simple"):
        search_url = pypi_simple[:-6] + "search"
    else:
        search_url = pypi_simple + "/search"

    with self.environment.get_finder() as finder:
        session = finder.session
        resp = session.get(search_url, params={"q": query})
        if resp.status_code == 404:
            self.environment.project.core.ui.echo(
                termui.yellow(
                    f"{pypi_simple!r} doesn't support '/search' endpoint, fallback "
                    f"to {self.DEFAULT_INDEX_URL!r} now.\n"
                    "This may take longer depending on your network condition."
                ),
                err=True,
            )
            resp = session.get(
                f"{self.DEFAULT_INDEX_URL}/search", params={"q": query}
            )
        resp.raise_for_status()

    content = parse(resp.content, namespaceHTMLElements=False)
    results = []
    for snippet in content.findall(".//*[@class='package-snippet']"):
        # Guard against markup changes: find() returns None when an
        # expected element is missing, and the original code would then
        # raise AttributeError on ``.text``.  Skip incomplete snippets.
        name_elem = snippet.find("h3/*[@class='package-snippet__name']")
        version_elem = snippet.find("h3/*[@class='package-snippet__version']")
        if name_elem is None or version_elem is None:
            continue
        name = name_elem.text
        version = version_elem.text
        if not name or not version:
            continue
        desc_elem = snippet.find("p[@class='package-snippet__description']")
        description = desc_elem.text if desc_elem is not None else None
        if not description:
            description = ""
        results.append(Package(name, version, description))
    return results
def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
    """
    Parse an HTML document, and yield its anchor elements as Link objects.

    TODO: Remove when `html5lib` is dropped.
    """
    document = html5lib.parse(
        page.content,
        transport_encoding=page.encoding,
        namespaceHTMLElements=False,
    )
    page_url = page.url
    base_url = _determine_base_url(document, page_url)
    for anchor in document.findall(".//a"):
        # Note: the attribute mapping, not the element, is handed over here.
        link = _create_link_from_element(
            anchor.attrib,
            page_url=page_url,
            base_url=base_url,
        )
        if link is not None:
            yield link
def __init__(self, content, url, headers=None, trusted=None):
    """Capture the raw page plus request metadata, and parse the DOM."""
    self.url = url
    self.headers = headers
    self.trusted = trusted
    self.content = content
    # Parse without XHTML namespacing so lookups can use plain tag names.
    self.parsed = html5lib.parse(self.content, namespaceHTMLElements=False)
return wrapper_wrapper @with_cached_html_pages def parse_links(page): # type: (HTMLPage) -> Iterable[Link] """ Parse an HTML document, and yield its anchor elements as Link objects. """ <<<<<<< HEAD document = html5lib.bbc_parse(page.content) ======= document = html5lib.parse( page.content, transport_encoding=page.encoding, namespaceHTMLElements=False, ) >>>>>>> 241b678... create predictions url = page.url base_url = _determine_base_url(document, url) for anchor in document.findall(".//a"): link = _create_link_from_element( anchor, page_url=url, base_url=base_url, ) if link is None: continue yield link