Beispiel #1
0
    def find_links(self):
        """Register the links and link targets found in this page's output.

        Returns immediately unless ``self.output_path`` ends in ``.html``.
        The loaded content is parsed with ``_XML``; if parsing raises, the
        error text is passed to ``self.site.info`` and nothing is recorded.
        For every link that is kept, ``self.url`` is added to
        ``self.site.links[link]``; the gathered link targets are merged into
        ``self.site.link_targets``.
        """
        if not self.output_path.endswith(".html"):
            return

        self.site.info("Finding links in {}", self)
        self._load_output()

        try:
            tree = _XML(self.content)
        except Exception as err:
            # Best effort: report the parse problem and skip this page.
            self.site.info(str(err))
            return

        assert tree is not None, self.content

        found_links = self._gather_links(tree)
        found_targets = self._gather_link_targets(tree)

        followed_schemes = ("file", "http", "https", "ftp")
        ignored_hosts = ("issues.apache.org", "bugzilla.redhat.com")

        for href in found_links:
            if href == "?":
                continue

            scheme, netloc, path, _query, fragment = _urlsplit(href)

            # Skip links with a scheme outside the followed set, and links
            # pointing at the ignored hosts.
            if (scheme and scheme not in followed_schemes) or netloc in ignored_hosts:
                continue

            # Fragment-only links and links whose path does not start with
            # "/" are resolved against this page's own URL.
            fragment_only = fragment and not path
            if fragment_only or not path.startswith("/"):
                href = _urljoin(self.url, href)

            self.site.links[href].add(self.url)

        self.site.link_targets.update(found_targets)
Beispiel #2
0
def urlsplit(url):
    """Split *url* into a five-field ``SplitResult``.

    The heavy lifting is delegated to ``_urlsplit``.  If the path it
    returns still contains a ``#``, the path is cut at the first ``#`` and
    the remainder replaces the fragment.
    """
    scheme, netloc, path, query, fragment = _urlsplit(url)
    before_hash, hash_mark, after_hash = path.partition("#")
    if hash_mark:
        path, fragment = before_hash, after_hash
    return SplitResult(scheme, netloc, path, query, fragment)
Beispiel #3
0
def urlsplit(url, auto_prefix=False):
    """Split *url* with ``_urlsplit``, optionally forcing a network location.

    When *auto_prefix* is true and *url* does not start with ``http://``,
    ``https://`` or ``/``, ``//`` is prepended before splitting.
    """
    already_anchored = ("http://", "https://", "/")
    if auto_prefix and not url.startswith(already_anchored):
        url = "//" + url
    return _urlsplit(url)
def urlsplit(url):
    """Return the ``SplitResult`` for *url*, moving a stray ``#`` out of the path.

    ``_urlsplit`` does the initial split.  If its path component contains
    ``#``, the text before the first ``#`` is kept as the path and the text
    after it is used as the fragment.
    """
    scheme, netloc, path, query, fragment = _urlsplit(url)
    if "#" not in path:
        return SplitResult(scheme, netloc, path, query, fragment)

    trimmed_path, late_fragment = path.split("#", 1)
    return SplitResult(scheme, netloc, trimmed_path, query, late_fragment)
def urlsplit(url: str) -> SplitResult:
    """Split *url* into scheme, netloc, path, query and fragment.

    The split itself is performed by ``_urlsplit``.  If the resulting path
    contains a ``#``, it is divided at the first occurrence: the left part
    stays the path and the right part becomes the fragment.
    """
    scheme, netloc, path, query, fragment = _urlsplit(url)
    if '#' in path:
        pieces = path.split('#', 1)
        path = pieces[0]
        fragment = pieces[1]
    return SplitResult(scheme, netloc, path, query, fragment)