def _collect_link_data(self, link_sources, link_targets):
    # The page itself is always a valid link target
    link_targets.add(self.url)

    if not self.url.endswith(".html"):
        return

    root = _XML(_read_file(self._output_path))

    # Collect outgoing relative links, keyed by their normalized URL
    for elem in root.iter("*"):
        for name in ("href", "src", "action"):
            try:
                url = elem.attrib[name]
            except KeyError:
                continue

            split_url = _urlparse.urlsplit(url)

            # Skip absolute links (those with a scheme or network location)
            if split_url.scheme or split_url.netloc:
                continue

            normalized_url = _urlparse.urljoin(self.url, _urlparse.urlunsplit(split_url))

            link_sources[normalized_url].add(self)

    # Register element IDs as fragment link targets, warning on duplicates
    for elem in root.iter("*"):
        if "id" in elem.attrib:
            normalized_url = _urlparse.urljoin(self.url, f"#{elem.attrib['id']}")

            if normalized_url in link_targets:
                self.site.warn("Duplicate link target in '{}'", normalized_url)

            link_targets.add(normalized_url)
def html_table_functions():
    data = (
        (1, 2, 3),
        ("a", "b", "c"),
        (None, "", 0),
    )

    # Round-trip the generated markup through the XML parser to confirm it is well formed
    _XML(_html_table(data))
    _XML(_html_table(data, headings=("A", "B", "C")))

    with working_dir():
        with open("test.csv", "w", newline="") as f:
            writer = _csv.writer(f)
            writer.writerows(data)

        _XML(_html_table_csv("test.csv"))
def find_links(self):
    # Only HTML output can contain links
    if not self.output_path.endswith(".html"):
        return

    self.site.info("Finding links in {}", self)

    self._load_output()

    # Parse the rendered output; skip files that aren't well-formed XML
    try:
        root = _XML(self.content)
    except Exception as e:
        self.site.info(str(e))
        return

    assert root is not None, self.content

    links = self._gather_links(root)
    link_targets = self._gather_link_targets(root)

    for link in links:
        if link == "?":
            continue

        scheme, netloc, path, query, fragment = _urlsplit(link)

        # Skip schemes we don't track and known external issue trackers
        if scheme and scheme not in ("file", "http", "https", "ftp"):
            continue

        if netloc in ("issues.apache.org", "bugzilla.redhat.com"):
            continue

        # Resolve fragment-only and relative links against this page's URL
        if (fragment and not path) or not path.startswith("/"):
            link = _urljoin(self.url, link)

        self.site.links[link].add(self.url)

    self.site.link_targets.update(link_targets)
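# Hypothetical sketch, not part of the original module: once every page has run
# find_links(), the site-level maps can be cross-checked to report broken internal
# links. The helper below assumes `links` maps each linked URL to the set of pages
# that reference it and `link_targets` is the set of known targets, mirroring
# self.site.links and self.site.link_targets above; the function name and `warn`
# parameter are illustrative only.
def _report_broken_links(links, link_targets, warn=print):
    # Report each internal link that never appears as a target, along with the
    # pages that reference it.
    for url, sources in sorted(links.items()):
        if url not in link_targets:
            warn(f"Broken link to '{url}' from: {', '.join(sorted(str(s) for s in sources))}")

# Example usage with plain dict/set stand-ins for the site attributes:
# _report_broken_links({"/a.html#missing": {"/index.html"}}, {"/a.html"})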