def iter_links_by_attrib(self, element):
    '''Iterate an element by looking at its attributes for links.

    Args:
        element: a parsed document element exposing an ``attrib`` mapping.

    Yields:
        tuple: ``(attribute_name, link_value)`` pairs for attributes that
        appear to contain links.
    '''
    for attrib_name in element.attrib.keys():
        try:
            attrib_value = element.attrib.get(attrib_name)
        except ValueError:
            # lxml.etree.__getNsTag can raise ValueError: Empty tag name
            # https://bugs.python.org/issue28236
            # Treat such attributes as empty instead of aborting iteration.
            attrib_value = ""
        if attrib_name in self.LINK_ATTRIBUTES:
            if self.javascript_scraper and \
                    attrib_value.lstrip().startswith('javascript:'):
                # Inline "javascript:" URL: mine the script body for links.
                for link in self.iter_links_by_js_attrib(
                        attrib_name, percent_decode(attrib_value)):
                    yield link
            else:
                yield attrib_name, attrib_value
        elif self.javascript_scraper and \
                attrib_name[:5] in self.DYNAMIC_ATTRIBUTES:
            # Event-handler attributes (e.g. "onclick") contain scripts.
            for link in self.iter_links_by_js_attrib(
                    attrib_name, attrib_value):
                yield link
        elif attrib_name.startswith('data-'):
            # Heuristic: data-* attributes sometimes hold URLs.
            if is_likely_link(attrib_value) \
                    and not is_unlikely_link(attrib_value):
                yield attrib_name, attrib_value
        elif attrib_name == 'srcset':
            items = self.iter_links_by_srcset_attrib(
                attrib_name, attrib_value)
            for item in items:
                yield item
def iter_links_by_attrib(self, element):
    '''Iterate an element by looking at its attributes for links.'''
    for name in element.attrib.keys():
        try:
            value = element.attrib.get(name)
        except ValueError:
            # lxml.etree.__getNsTag can raise ValueError: Empty tag name
            # https://bugs.python.org/issue28236
            value = ""

        # Plain link-carrying attribute (href, src, ...).
        if name in self.LINK_ATTRIBUTES:
            has_js_url = self.javascript_scraper and \
                value.lstrip().startswith('javascript:')
            if has_js_url:
                # Mine the inline script body for links.
                for link in self.iter_links_by_js_attrib(
                        name, percent_decode(value)):
                    yield link
            else:
                yield name, value
            continue

        # Event-handler attributes (e.g. "onclick") contain scripts.
        if self.javascript_scraper and name[:5] in self.DYNAMIC_ATTRIBUTES:
            for link in self.iter_links_by_js_attrib(name, value):
                yield link
            continue

        # Heuristic: data-* attributes sometimes hold URLs.
        if name.startswith('data-'):
            if is_likely_link(value) and not is_unlikely_link(value):
                yield name, value
            continue

        if name == 'srcset':
            for item in self.iter_links_by_srcset_attrib(name, value):
                yield item
def iter_processed_text(self, file, encoding=None, base_url=None):
    '''Yield ``(text, link_info)`` pairs for the given file.

    Non-link text and rejected candidates are yielded as
    ``(original_text, False)``; accepted links are yielded as
    ``(link, link_type_or_True)``.
    '''
    for text, is_link in self.iter_text(file, encoding):
        if not is_link:
            yield (text, False)
            continue

        # Unescape the JavaScript string literal via the JSON parser.
        try:
            decoded = json.loads('"{0}"'.format(text))
        except ValueError:
            yield (text, False)
            continue

        # Discard strings that do not look like URLs.
        if is_unlikely_link(decoded) or not is_likely_link(decoded):
            yield (text, False)
            continue

        if base_url:
            candidate = urljoin_safe(base_url, decoded,
                                     allow_fragments=False)
        else:
            candidate = decoded

        if candidate:
            yield (candidate, identify_link_type(candidate) or True)
        else:
            yield (text, False)
def test_is_likely_link(self):
    '''Check is_likely_link() against known link and non-link strings.'''
    likely = (
        'image.png', 'video.mp4', '/directory', 'directory/',
        '/directory/', '../directory/', 'http://example.com/',
        'https://example.com/', 'ftp://example.com',
        'directory/index.html', 'application/windows.exe',
        '//example.com/admin',
    )
    unlikely = (
        'directory/another_directory', '12.0', '7', 'horse', '',
        'setTimeout(myTimer, 1000)', 'comment.delete',
        'example.com', 'example.net', 'example.org', 'example.edu',
    )
    for candidate in likely:
        self.assertTrue(is_likely_link(candidate))
    for candidate in unlikely:
        self.assertFalse(is_likely_link(candidate))