def test_should_not_follow_mailtos(self):
    """Check that a lone ``mailto:`` anchor produces no extracted entries."""
    # NOTE(review): the address below looks like an e-mail obfuscation
    # placeholder rather than a real address -- confirm against the
    # original fixture. It is kept verbatim here.
    markup = '<a href="mailto:[email protected]">[email protected]</a>'

    extracted = extract_domains(markup)

    self.assertEqual(len(extracted), 0)
def test_should_handle_webm_link(self):
    """Check that an anchor pointing at a ``.webm`` file yields no entries."""
    # The fixture is one string; it is split across lines purely for
    # readability (adjacent literals are concatenated at compile time).
    markup = (
        '<a href="//upload.wikimedia.org/wikipedia/commons/4/4e/Plasma_globe_23s.webm"'
        ' title="Play media" target="new">'
        '<span class="play-btn-large">'
        '<span class="mw-tmh-playtext">Play media</span>'
        '</span></a>'
    )

    extracted = extract_domains(markup)

    self.assertEqual(len(extracted), 0)
def test_should_extract_hrefs_from_a_tags(self):
    """Check that two anchors with different hrefs yield two entries."""
    # Two back-to-back anchors, one literal per anchor for readability.
    markup = (
        '<a href="http://example.com">text</a>'
        '<a href="http://example2.com">text</a>'
    )

    extracted = extract_domains(markup)

    self.assertEqual(len(extracted), 2)
def test_should_ignore_duplicate_links_that_are_page_anchors(self):
    """Check that hrefs differing only by a ``#anchor`` count as one entry."""
    # Same URL twice; the second copy only adds a fragment identifier.
    markup = (
        '<a href="http://example2.com/index.html">text</a>'
        '<a href="http://example2.com/index.html#anchor">text</a>'
    )

    extracted = extract_domains(markup)

    self.assertEqual(len(extracted), 1)
def test_should_extract_relative_hrefs_from_a_tag(self):
    """Check that a single anchor yields exactly its href as the only entry."""
    # NOTE(review): the method name mentions relative hrefs, but the
    # fixture uses an absolute URL -- confirm whether a relative-link
    # case was intended here.
    markup = '<a href="http://example.com">text</a>'
    expected_first = "http://example.com"

    extracted = extract_domains(markup)

    # Length is asserted first so an empty result fails with a clear
    # message instead of an IndexError on the lookup below.
    self.assertEqual(len(extracted), 1)
    self.assertEqual(extracted[0], expected_first)