def _get_src_values(self) -> Set[str]:
    """Collect the normalised ``src`` attribute of every matching element.

    Every element under ``self._tree`` that carries a ``src`` attribute is
    visited. Its value is passed through ``helpers.fix_possible_value`` when
    ``self.base_url`` is truthy, and through ``helpers.fix_possible_url``
    otherwise.

    Side effect: each visited element has its ``src`` attribute overwritten
    with an empty string.
    NOTE(review): presumably this stops later passes over the tree from
    seeing the same value again -- confirm against the callers.

    Returns:
        The set of normalised ``src`` values.
    """
    found = set()
    for element in self._tree.iterfind(".//*[@src]"):
        # Pick the normaliser first, then read the attribute.
        fixer = (
            helpers.fix_possible_value
            if self.base_url
            else helpers.fix_possible_url
        )
        found.add(fixer(element.attrib["src"]))
        element.attrib["src"] = ""
    return found
def _get_meta_refresh_values(self) -> Set[str]:
    """Collect redirect targets declared in ``<meta http-equiv>`` tags.

    Every ``<meta>`` element under ``self._tree`` that has both an
    ``http-equiv`` and a ``content`` attribute is inspected. When the content
    contains a ``url=`` token (matched case-insensitively), the text that
    follows that token is stripped of surrounding whitespace, passed through
    ``helpers.fix_possible_value`` and added to the result.

    The value is taken from the end of the ``url=`` token itself rather than
    from the first ``=`` in the string, so content with an earlier
    ``key=value`` pair (e.g. ``"foo=1; url=http://x"``) still yields the
    target and not ``"1; url=http://x"``.

    Returns:
        The set of normalised values; tags without ``url=`` contribute
        nothing.
    """
    import re  # local import: keeps this block self-contained

    values = set()
    for tag in self._tree.iterfind(".//meta[@http-equiv][@content]"):
        content = tag.attrib["content"]
        # Slice the original string at the match position: lowercasing can
        # change string length for some characters, so indices taken from a
        # lowered copy are not safe to reuse.
        marker = re.search("url=", content, re.IGNORECASE)
        if marker is None:
            continue
        target = content[marker.end():].strip()
        values.add(helpers.fix_possible_value(target))
    return values
def _get_document_write_contents(self) -> Set[str]:
    """Extract the argument text from each ``document.write`` snippet.

    For every string returned by ``self._get_document_writes()``, the text
    between the *last* ``(`` and the *first* ``)`` is sliced out and passed
    through ``helpers.fix_possible_value``.

    Returns:
        The set of normalised argument strings, with falsy (empty) results
        left out.
    """
    extracted = set()
    for call_text in self._get_document_writes():
        # rfind/find return -1 when the paren is missing; the slice bounds
        # below deliberately keep the resulting behaviour unchanged.
        start = call_text.rfind("(") + 1
        stop = call_text.find(")")
        cleaned = helpers.fix_possible_value(call_text[start:stop])
        if cleaned:
            extracted.add(cleaned)
    return extracted
def test_fix_possible_value():
    """Check ``helpers.fix_possible_value`` on a quoted, messy value.

    The input is wrapped in double quotes and contains a backslash and a NUL
    character; the expected output has no quotes, no NUL, and a forward slash
    where the backslash was.
    """
    raw_value = '"//domain.com\\index\u0000.html"'
    expected = "//domain.com/index.html"

    assert helpers.fix_possible_value(raw_value) == expected
def get_base64_urls(self) -> Set[str]:
    """Return the base64-derived values that qualify as URLs.

    Each item from ``self.get_base64_values()`` is first passed through
    ``helpers.fix_possible_value``. Only the distinct results for which
    ``URL(value).is_url`` is truthy are kept.

    Returns:
        The set of normalised values accepted by ``URL(...).is_url``.
    """
    # Phase 1: normalise everything (the set removes duplicates).
    candidates = set()
    for encoded_value in self.get_base64_values():
        candidates.add(helpers.fix_possible_value(encoded_value))

    # Phase 2: keep only the candidates that validate as URLs.
    urls = set()
    for candidate in candidates:
        if URL(candidate).is_url:
            urls.add(candidate)
    return urls