def iter_links_by_attrib(self, element):
    '''Iterate an element by looking at its attributes for links.

    Each attribute of ``element`` is classified by its name:

    * names in ``self.LINK_ATTRIBUTES`` are yielded directly as
      ``(attrib_name, attrib_value)``, unless JavaScript scraping is
      enabled and the value starts with ``javascript:``, in which case
      the percent-decoded value is handed to
      ``self.iter_links_by_js_attrib``;
    * when JavaScript scraping is enabled, names whose first five
      characters are in ``self.DYNAMIC_ATTRIBUTES`` are handed to
      ``self.iter_links_by_js_attrib``;
    * ``data-*`` attributes are yielded only if the value passes
      ``is_likely_link`` and is not rejected by ``is_unlikely_link``;
    * ``srcset`` is handed to ``self.iter_links_by_srcset_attrib``.

    Args:
        element: An object exposing an ``attrib`` mapping with
            ``keys()`` and ``get()``.

    Yields:
        ``(attrib_name, attrib_value)`` tuples for the direct cases, or
        whatever the delegated helper iterators produce.
    '''
    for attrib_name in element.attrib.keys():
        try:
            attrib_value = element.attrib.get(attrib_name)
        except ValueError:
            # lxml.etree.__getNsTag can raise ValueError: Empty tag name
            # https://bugs.python.org/issue28236
            # Treat the value as empty rather than aborting the scan of
            # the remaining attributes.
            attrib_value = ""

        if attrib_name in self.LINK_ATTRIBUTES:
            if self.javascript_scraper and \
                    attrib_value.lstrip().startswith('javascript:'):
                # A javascript: pseudo-URL: scrape the script text instead
                # of reporting the attribute value itself as a link.
                yield from self.iter_links_by_js_attrib(
                    attrib_name, percent_decode(attrib_value))
            else:
                yield attrib_name, attrib_value
        elif self.javascript_scraper and \
                attrib_name[:5] in self.DYNAMIC_ATTRIBUTES:
            # Only the first five characters of the name are compared.
            yield from self.iter_links_by_js_attrib(
                attrib_name, attrib_value)
        elif attrib_name.startswith('data-'):
            if is_likely_link(attrib_value) \
                    and not is_unlikely_link(attrib_value):
                yield attrib_name, attrib_value
        elif attrib_name == 'srcset':
            yield from self.iter_links_by_srcset_attrib(
                attrib_name, attrib_value)
def iter_processed_text(self, file, encoding=None, base_url=None):
    '''Yield ``(text, link_info)`` pairs with link candidates processed.

    Items come from ``self.iter_text(file, encoding)``. Anything that is
    not flagged as a link is passed through as ``(text, False)``. A
    flagged item is unescaped by parsing it as the body of a JSON string,
    checked with ``is_unlikely_link`` / ``is_likely_link`` and, when
    ``base_url`` is given, joined onto it with ``urljoin_safe``
    (fragments disallowed).

    Yields:
        ``(link, identify_link_type(link) or True)`` for accepted links,
        otherwise ``(text, False)`` with the original, unmodified text.
    '''
    for text, is_link in self.iter_text(file, encoding):
        if not is_link:
            yield (text, False)
            continue

        try:
            # Wrapping in quotes lets the JSON parser resolve the escapes.
            unescaped = json.loads('"{0}"'.format(text))
        except ValueError:
            yield (text, False)
            continue

        if not is_unlikely_link(unescaped) and is_likely_link(unescaped):
            resolved = (
                urljoin_safe(base_url, unescaped, allow_fragments=False)
                if base_url else unescaped
            )
        else:
            resolved = None

        if resolved:
            yield (resolved, identify_link_type(resolved) or True)
        else:
            # Rejected candidate or empty join result: emit as plain text.
            yield (text, False)
def iter_links_by_attrib(self, element):
    '''Scan the attributes of an element and yield the links found.

    Attributes named in ``self.LINK_ATTRIBUTES`` are yielded as
    ``(name, value)``, except that ``javascript:`` values are
    percent-decoded and passed to ``self.iter_links_by_js_attrib`` when
    JavaScript scraping is enabled. With JavaScript scraping enabled,
    attributes whose first five characters are in
    ``self.DYNAMIC_ATTRIBUTES`` also go to that helper. ``data-*``
    attributes are yielded only when the value passes the
    ``is_likely_link`` / ``is_unlikely_link`` checks, and ``srcset`` is
    passed to ``self.iter_links_by_srcset_attrib``.
    '''
    attributes = element.attrib

    for name in attributes.keys():
        try:
            value = attributes.get(name)
        except ValueError:
            # lxml.etree.__getNsTag can raise ValueError: Empty tag name
            # https://bugs.python.org/issue28236
            value = ""

        if name in self.LINK_ATTRIBUTES:
            is_js_url = (
                self.javascript_scraper
                and value.lstrip().startswith('javascript:')
            )
            if is_js_url:
                yield from self.iter_links_by_js_attrib(
                    name, percent_decode(value))
            else:
                yield name, value
            continue

        if self.javascript_scraper and name[:5] in self.DYNAMIC_ATTRIBUTES:
            yield from self.iter_links_by_js_attrib(name, value)
            continue

        if name.startswith('data-'):
            # Arbitrary data attributes are only reported when the value
            # passes both link heuristics.
            if is_likely_link(value) and not is_unlikely_link(value):
                yield name, value
            continue

        if name == 'srcset':
            yield from self.iter_links_by_srcset_attrib(name, value)
def iter_processed_text(self, file, encoding=None, base_url=None):
    '''Yield ``(text, link_info)`` pairs with link candidates processed.

    Items come from ``self.iter_text(file, encoding)``. Anything that is
    not flagged as a link is passed through as ``(text, False)``. A
    flagged item is unescaped by parsing it as the body of a JSON string,
    checked with ``is_unlikely_link`` / ``is_likely_link`` and, when
    ``base_url`` is given, joined onto it with ``urljoin_safe``
    (fragments disallowed).

    Yields:
        ``(link, identify_link_type(link) or True)`` for accepted links,
        otherwise ``(text, False)`` with the original, unmodified text.
    '''
    for text, is_link in self.iter_text(file, encoding):
        if not is_link:
            yield (text, False)
            continue

        try:
            # Wrapping in quotes lets the JSON parser resolve the escapes.
            unescaped = json.loads('"{0}"'.format(text))
        except ValueError:
            yield (text, False)
            continue

        if not is_unlikely_link(unescaped) and is_likely_link(unescaped):
            resolved = (
                urljoin_safe(base_url, unescaped, allow_fragments=False)
                if base_url else unescaped
            )
        else:
            resolved = None

        if resolved:
            yield (resolved, identify_link_type(resolved) or True)
        else:
            # Rejected candidate or empty join result: emit as plain text.
            yield (text, False)
def test_is_unlikely_link(self):
    # Table-driven check of is_unlikely_link(); the candidate string is
    # passed as the assertion message so a failure names the input.

    # Inputs the heuristic must flag as unlikely links.
    flagged = (
        'example.com+',
        'www.',
        ':example.com',
        ',example.com',
        'http:',
        '.example.com',
        'doc[0]',
        '/',
        '//',
        'application/json',
        'application/javascript',
        'text/javascript',
        'text/plain',
        '/\\/',
        'a.help',
        'div.menu',
        'apikey={YOUR_API_KEY_HERE}',
    )
    # Inputs the heuristic must leave alone.
    not_flagged = (
        'http://',
        'example',
        'example.com',
        '//example.com/assets/image.css',
        './image.css',
        '../image.css',
        'index.html',
        'body.html',
    )

    for candidate in flagged:
        self.assertTrue(is_unlikely_link(candidate), candidate)

    for candidate in not_flagged:
        self.assertFalse(is_unlikely_link(candidate), candidate)
def test_is_unlikely_link(self):
    # Table-driven check of is_unlikely_link(); the candidate string is
    # passed as the assertion message so a failure names the input.

    # Inputs the heuristic must flag as unlikely links.
    flagged = (
        'example.com+',
        'www.',
        ':example.com',
        ',example.com',
        'http:',
        '.example.com',
        'doc[0]',
        '/',
        '//',
        'application/json',
        'application/javascript',
        'text/javascript',
        'text/plain',
        '/\\/',
        'a.help',
        'div.menu',
        'apikey={YOUR_API_KEY_HERE}',
    )
    # Inputs the heuristic must leave alone.
    not_flagged = (
        'http://',
        'example',
        'example.com',
        '//example.com/assets/image.css',
        './image.css',
        '../image.css',
        'index.html',
        'body.html',
    )

    for candidate in flagged:
        self.assertTrue(is_unlikely_link(candidate), candidate)

    for candidate in not_flagged:
        self.assertFalse(is_unlikely_link(candidate), candidate)