def find_links(self):
    """Scan this page's rendered HTML output for links and link targets.

    Non-HTML outputs are skipped. Each harvested link is normalized
    (fragment-only and relative links are resolved against this page's
    URL) and recorded in ``self.site.links``; the page's anchor targets
    are merged into ``self.site.link_targets``.
    """
    # Only HTML output can contain links worth tracking.
    if not self.output_path.endswith(".html"):
        return

    self.site.info("Finding links in {}", self)
    self._load_output()

    try:
        root = _XML(self.content)
    except Exception as e:
        # Unparseable output is logged and skipped, not treated as fatal.
        self.site.info(str(e))
        return

    assert root is not None, self.content

    found_links = self._gather_links(root)
    found_targets = self._gather_link_targets(root)

    for found_link in found_links:
        if found_link == "?":
            continue

        scheme, netloc, path, query, fragment = _urlsplit(found_link)

        # Ignore schemes we don't check (mailto:, javascript:, ...).
        if scheme and scheme not in ("file", "http", "https", "ftp"):
            continue

        # External issue trackers are deliberately not verified.
        if netloc in ("issues.apache.org", "bugzilla.redhat.com"):
            continue

        # Resolve fragment-only and relative links against this page's URL.
        is_relative = (fragment and not path) or not path.startswith("/")
        if is_relative:
            found_link = _urljoin(self.url, found_link)

        self.site.links[found_link].add(self.url)

    self.site.link_targets.update(found_targets)
def urlsplit(url):
    """Split *url* into a ``SplitResult`` of (scheme, netloc, path, query, fragment).

    Wraps the stock splitter, additionally moving any ``#`` suffix that
    was left inside the path component into the fragment field.
    """
    scheme, netloc, path, query, fragment = _urlsplit(url)

    # Defensive: re-split a fragment the underlying splitter left in the path.
    if "#" in path:
        path, fragment = path.split("#", 1)

    return SplitResult(scheme, netloc, path, query, fragment)
def urlsplit(url, auto_prefix=False):
    """Split *url*, optionally treating bare hosts as scheme-relative.

    With ``auto_prefix`` true, a URL that is neither absolute http(s)
    nor path-relative (leading ``/``) gets ``//`` prepended before
    splitting, so e.g. ``example.com/x`` parses with netloc
    ``example.com`` instead of path ``example.com/x``.
    """
    if auto_prefix:
        has_http_scheme = url.startswith("http://") or url.startswith("https://")
        if not has_http_scheme and not url.startswith("/"):
            url = "//" + url

    return _urlsplit(url)
def urlsplit(url: str) -> SplitResult:
    """Split *url* into (scheme, netloc, path, query, fragment).

    Like the stock splitter, but a ``#`` remaining in the path component
    is peeled off into the fragment field.
    """
    scheme, netloc, path, query, fragment = _urlsplit(url)

    # Common case: the underlying splitter already extracted the fragment.
    if "#" not in path:
        return SplitResult(scheme, netloc, path, query, fragment)

    # Defensive: move a stray fragment out of the path.
    path, _, fragment = path.partition("#")
    return SplitResult(scheme, netloc, path, query, fragment)