def test_url_percent_encode(self):
    # Round-trip checks for the percent encode/decode helpers, including
    # the "plus" variants that map a space to/from '+'.
    # Each row is (expected result, function under test, input text);
    # rows run in order and the first mismatch fails the test.
    cases = (
        ('a ', percent_decode, 'a%20'),
        ('að', percent_decode, 'a%C3%B0'),
        ('a ', percent_decode_plus, 'a+'),
        ('að', percent_decode_plus, 'a%C3%B0'),
        ('a%20', percent_encode, 'a '),
        ('a%C3%B0', percent_encode, 'að'),
        ('a+', percent_encode_plus, 'a '),
        ('a%C3%B0', percent_encode_plus, 'að'),
    )

    for expected, convert, text in cases:
        self.assertEqual(expected, convert(text))
def iter_links_by_attrib(self, element):
    '''Iterate an element by looking at its attributes for links.

    Args:
        element: An object exposing an ``attrib`` mapping of attribute
            names to attribute values.

    Yields:
        ``(attrib_name, attrib_value)`` tuples for attributes taken as
        links directly, plus whatever ``iter_links_by_js_attrib`` and
        ``iter_links_by_srcset_attrib`` yield for JavaScript and
        ``srcset`` attributes.

    An attribute whose value cannot be read is treated as having an
    empty value instead of aborting the whole iteration.
    '''
    for attrib_name in element.attrib.keys():
        try:
            attrib_value = element.attrib.get(attrib_name)
        except ValueError:
            # lxml.etree.__getNsTag can raise "ValueError: Empty tag name"
            # while looking the attribute up; see
            # https://bugs.python.org/issue28236
            attrib_value = ''

        if attrib_name in self.LINK_ATTRIBUTES:
            if self.javascript_scraper and \
                    attrib_value.lstrip().startswith('javascript:'):
                # A "javascript:" pseudo-URL: decode it and let the
                # JavaScript scraper look for links inside the script.
                for link in self.iter_links_by_js_attrib(
                        attrib_name, percent_decode(attrib_value)):
                    yield link
            else:
                yield attrib_name, attrib_value

        elif self.javascript_scraper and \
                attrib_name[:5] in self.DYNAMIC_ATTRIBUTES:
            # DYNAMIC_ATTRIBUTES is matched against the first five
            # characters of the attribute name only.
            for link in self.iter_links_by_js_attrib(
                    attrib_name, attrib_value):
                yield link

        elif attrib_name.startswith('data-'):
            # Custom data attributes only count when the value passes
            # both link heuristics.
            if is_likely_link(attrib_value) \
                    and not is_unlikely_link(attrib_value):
                yield attrib_name, attrib_value

        elif attrib_name == 'srcset':
            items = self.iter_links_by_srcset_attrib(
                attrib_name, attrib_value)

            for item in items:
                yield item
def iter_links_by_attrib(self, element):
    '''Yield the links found among the attributes of an element.

    Each attribute of ``element.attrib`` is classified by name: known
    link attributes, dynamic (script) attributes, ``data-*`` attributes
    and ``srcset``. Attributes matching none of these are ignored.
    '''
    for name in element.attrib.keys():
        try:
            value = element.attrib.get(name)
        except ValueError:
            # lxml.etree.__getNsTag can raise ValueError: Empty tag name
            # https://bugs.python.org/issue28236
            value = ""

        # Known link attributes: either a "javascript:" pseudo-URL handed
        # to the JavaScript scraper, or a plain link yielded as-is.
        if name in self.LINK_ATTRIBUTES:
            is_js_url = (
                self.javascript_scraper
                and value.lstrip().startswith('javascript:')
            )
            if not is_js_url:
                yield name, value
                continue
            for js_link in self.iter_links_by_js_attrib(
                    name, percent_decode(value)):
                yield js_link
            continue

        # Dynamic attributes are recognised by their first five characters.
        if self.javascript_scraper and name[:5] in self.DYNAMIC_ATTRIBUTES:
            for js_link in self.iter_links_by_js_attrib(name, value):
                yield js_link
            continue

        # Custom data attributes count only if the value looks like a link.
        if name.startswith('data-'):
            if not is_likely_link(value) or is_unlikely_link(value):
                continue
            yield name, value
            continue

        if name == 'srcset':
            for srcset_link in self.iter_links_by_srcset_attrib(name, value):
                yield srcset_link