def _create_link_from_element( anchor, # type: HTMLElement page_url, # type: str base_url, # type: str ): # type: (...) -> Optional[Link] """ Convert an anchor element in a simple repository page to a Link. """ href = anchor.get("href") if not href: return None url = _clean_link(urllib_parse.urljoin(base_url, href)) pyrequire = anchor.get('data-requires-python') pyrequire = unescape(pyrequire) if pyrequire else None yanked_reason = anchor.get('data-yanked') if yanked_reason: # This is a unicode string in Python 2 (and 3). yanked_reason = unescape(yanked_reason) link = Link( url, comes_from=page_url, requires_python=pyrequire, yanked_reason=yanked_reason, ) return link
def links(self): """ Return the URLs of all the links on a page together with information about their "rel" attribute, for determining which ones to treat as downloads and which ones to queue for further scraping. """ def clean(url): "Tidy up an URL." scheme, netloc, path, params, query, frag = urlparse(url) return urlunparse((scheme, netloc, quote(path), params, query, frag)) result = set() for match in self._href.finditer(self.data): d = match.groupdict('') rel = (d['rel1'] or d['rel2'] or d['rel3'] or d['rel4'] or d['rel5'] or d['rel6']) url = d['url1'] or d['url2'] or d['url3'] url = urljoin(self.base_url, url) url = unescape(url) url = self._clean_re.sub(lambda m: '%%%2x' % ord(m.group(0)), url) result.add((url, rel)) # We sort the result, hoping to bring the most recent versions # to the front result = sorted(result, key=lambda t: t[0], reverse=True) return result
def links(self): """ Return the URLs of all the links on a page together with information about their "rel" attribute, for determining which ones to treat as downloads and which ones to queue for further scraping. """ def clean(url): "Tidy up an URL." scheme, netloc, path, params, query, frag = urlparse(url) return urlunparse( (scheme, netloc, quote(path), params, query, frag)) result = set() for match in self._href.finditer(self.data): d = match.groupdict('') rel = (d['rel1'] or d['rel2'] or d['rel3'] or d['rel4'] or d['rel5'] or d['rel6']) url = d['url1'] or d['url2'] or d['url3'] url = urljoin(self.base_url, url) url = unescape(url) url = self._clean_re.sub(lambda m: '%%%2x' % ord(m.group(0)), url) result.add((url, rel)) # We sort the result, hoping to bring the most recent versions # to the front result = sorted(result, key=lambda t: t[0], reverse=True) return result
def links(self): """Yields all links in the page""" for anchor in self.parsed.findall(".//a"): if anchor.get("href"): href = anchor.get("href") url = self.clean_link(urllib_parse.urljoin( self.base_url, href)) pyrequire = anchor.get('data-requires-python') pyrequire = unescape(pyrequire) if pyrequire else None yield Link(url, self, requires_python=pyrequire)
def links(self): """Yields all links in the page""" for anchor in self.parsed.findall(".//a"): if anchor.get("href"): href = anchor.get("href") url = self.clean_link( urllib_parse.urljoin(self.base_url, href) ) pyrequire = anchor.get('data-requires-python') pyrequire = unescape(pyrequire) if pyrequire else None yield Link(url, self, requires_python=pyrequire)
def iter_links(self): """Yields all links in the page""" document = html5lib.parse( self.content, transport_encoding=_get_encoding_from_headers(self.headers), namespaceHTMLElements=False, ) base_url = _determine_base_url(document, self.url) for anchor in document.findall(".//a"): if anchor.get("href"): href = anchor.get("href") url = _clean_link(urllib_parse.urljoin(base_url, href)) pyrequire = anchor.get('data-requires-python') pyrequire = unescape(pyrequire) if pyrequire else None yield Link(url, self.url, requires_python=pyrequire)