Example #1
0
    def _collect_link_data(self, link_sources, link_targets):
        """Record this page's outgoing links and the link targets it defines.

        The page's own URL is always added to ``link_targets``.  When that
        URL ends in ".html", the file at ``self._output_path`` is parsed and
        walked twice:

        * every ``href``, ``src`` and ``action`` attribute without a scheme
          or network location is resolved against the page URL, and this
          page is added to ``link_sources[resolved_url]``;
        * every element ``id`` becomes the target ``<page-url>#<id>``; a
          warning is issued through ``self.site.warn`` if that target is
          already in ``link_targets`` before it is added.

        NOTE(review): ``link_sources`` is indexed without a prior insert, so
        it presumably is a ``defaultdict(set)`` -- confirm against the caller.
        """
        page_url = self.url
        link_targets.add(page_url)

        if not page_url.endswith(".html"):
            return

        document = _XML(_read_file(self._output_path))

        # Pass 1: references leaving this page.
        for node in document.iter("*"):
            attributes = node.attrib

            for attr_name in ("href", "src", "action"):
                if attr_name not in attributes:
                    continue

                parts = _urlparse.urlsplit(attributes[attr_name])

                # URLs carrying a scheme or a network location are not
                # recorded.
                if parts.scheme or parts.netloc:
                    continue

                resolved = _urlparse.urljoin(
                    page_url, _urlparse.urlunsplit(parts))
                link_sources[resolved].add(self)

        # Pass 2: anchors this page offers as link targets.
        for node in document.iter("*"):
            if "id" not in node.attrib:
                continue

            anchor_url = _urlparse.urljoin(page_url, f"#{node.attrib['id']}")

            if anchor_url in link_targets:
                self.site.warn("Duplicate link target in '{}'", anchor_url)

            link_targets.add(anchor_url)
Example #2
0
def html_table_functions():
    """Exercise the HTML table helpers and feed their output to ``_XML``.

    Three variants are covered: a plain table, a table with headings, and a
    table built from a CSV file written into a temporary working directory.
    The sample rows mix integers, strings, ``None``, the empty string and
    zero.
    """
    rows = (
        (1, 2, 3),
        ("a", "b", "c"),
        (None, "", 0),
    )
    column_headings = ("A", "B", "C")

    _XML(_html_table(rows))
    _XML(_html_table(rows, headings=column_headings))

    with working_dir():
        # The file must be closed before the CSV helper reads it back, so
        # the helper call stays outside the ``open`` block.
        with open("test.csv", "w", newline="") as csv_file:
            _csv.writer(csv_file).writerows(rows)

        _XML(_html_table_csv("test.csv"))
Example #3
0
    def find_links(self):
        """Register this page's links and link targets with the site.

        Nothing happens unless ``self.output_path`` ends in ".html".
        Otherwise the output is loaded and parsed with ``_XML``; if parsing
        raises, the error text is logged via ``self.site.info`` and the
        method returns.

        Each gathered link is then filtered and recorded:

        * the literal link "?" is skipped;
        * links with a scheme other than file/http/https/ftp are skipped;
        * links to the listed issue-tracker hosts are skipped;
        * links whose path does not start with "/" are resolved against
          ``self.url``.

        The page URL is added to ``self.site.links[link]`` for every link
        kept, and the gathered link targets are merged into
        ``self.site.link_targets``.
        """
        if not self.output_path.endswith(".html"):
            return

        self.site.info("Finding links in {}", self)
        self._load_output()

        try:
            document = _XML(self.content)
        except Exception as error:
            self.site.info(str(error))
            return

        assert document is not None, self.content

        found_links = self._gather_links(document)
        found_targets = self._gather_link_targets(document)

        checked_schemes = ("file", "http", "https", "ftp")
        ignored_hosts = ("issues.apache.org", "bugzilla.redhat.com")

        for link in found_links:
            if link == "?":
                continue

            scheme, host, path, _query, _fragment = _urlsplit(link)

            if scheme and scheme not in checked_schemes:
                continue

            if host in ignored_hosts:
                continue

            # A fragment-only link has an empty path, which also fails the
            # startswith("/") test, so this single check covers that case.
            resolved = link if path.startswith("/") else _urljoin(self.url, link)

            self.site.links[resolved].add(self.url)

        self.site.link_targets.update(found_targets)