def test_get_absolute_url(self):
    """Check how several URL forms are made absolute against a base page.

    The base page is ``https://www.example.com/hello/index.html``. The
    cases cover a scheme-relative URL, a root-relative path, a sibling
    document and a parent-directory reference.
    """
    base_url_split = get_clean_url_split(
        "https://www.example.com/hello/index.html")

    # Each pair is (expected absolute URL, URL as it would appear in a page).
    cases = (
        ("https://www.example2.com/test.js",
         "//www.example2.com/test.js"),
        ("https://www.example.com/hello2/test.html",
         "/hello2/test.html"),
        ("https://www.example.com/hello/test.html",
         "test.html"),
        ("https://www.example.com/test.html",
         "../test.html"),
    )

    for expected_url, page_url in cases:
        resolved_split = get_absolute_url_split(page_url, base_url_split)
        self.assertEqual(expected_url, resolved_split.geturl())
def _get_links(self, elements, attribute, base_url_split, original_url_split): links = [] for element in elements: if attribute in element.attrs: url = element[attribute] if not is_link(url): continue abs_url_split = get_absolute_url_split(url, base_url_split) if abs_url_split.scheme not in SUPPORTED_SCHEMES: continue link = Link(type=unicode(element.name), url_split=abs_url_split, original_url_split=original_url_split, source_str=unicode(element)) links.append(link) return links
def _get_links(self, elements, attribute, base_url_split, original_url_split): links = [] for element in elements: if attribute in element.attrs: url = element[attribute] if not self.worker_config.strict_mode: url = url.strip() if not is_link(url): continue abs_url_split = get_absolute_url_split(url, base_url_split) if abs_url_split.scheme not in SUPPORTED_SCHEMES: continue link = Link(type=unicode(element.name), url_split=abs_url_split, original_url_split=original_url_split, source_str=unicode(element)) links.append(link) return links