def iter_links(self): """Yields all links in the page""" document = html5lib.parse( self.content, transport_encoding=_get_encoding_from_headers(self.headers), namespaceHTMLElements=False, ) base_url = _determine_base_url(document, self.url) for anchor in document.findall(".//a"): if anchor.get("href"): href = anchor.get("href") url = _clean_link(urllib_parse.urljoin(base_url, href)) pyrequire = anchor.get('data-requires-python') pyrequire = unescape(pyrequire) if pyrequire else None yield Link(url, self.url, requires_python=pyrequire)
def __init__(self, content, url, headers=None): # Determine if we have any encoding information in our headers encoding = None if headers and "Content-Type" in headers: content_type, params = cgi.parse_header(headers["Content-Type"]) if "charset" in params: encoding = params['charset'] self.content = content self.parsed = html5lib.parse( self.content, transport_encoding=encoding, namespaceHTMLElements=False, ) self.url = url self.headers = headers
def parse_links(page: "HTMLPage") -> Iterable[Link]: """ Parse an HTML document, and yield its anchor elements as Link objects. """ document = html5lib.parse( page.content, transport_encoding=page.encoding, namespaceHTMLElements=False, ) url = page.url base_url = _determine_base_url(document, url) for anchor in document.findall(".//a"): link = _create_link_from_element( anchor, page_url=url, base_url=base_url, ) if link is None: continue yield link