Example #1
0
 def test_url_percent_encode(self):
     # Each row is (expected result, converter under test, input text).
     # The rows cover both decoders and both encoders, with a space and
     # with a non-ASCII character in each case.
     cases = (
         ('a ', percent_decode, 'a%20'),
         ('að', percent_decode, 'a%C3%B0'),
         ('a ', percent_decode_plus, 'a+'),
         ('að', percent_decode_plus, 'a%C3%B0'),
         ('a%20', percent_encode, 'a '),
         ('a%C3%B0', percent_encode, 'að'),
         ('a+', percent_encode_plus, 'a '),
         ('a%C3%B0', percent_encode_plus, 'að'),
     )

     for expected, convert, text in cases:
         self.assertEqual(expected, convert(text))
Example #2
0
 def test_url_percent_encode(self):
     # Each row is (expected result, converter under test, input text).
     # The rows cover both decoders and both encoders, with a space and
     # with a non-ASCII character in each case.
     cases = (
         ('a ', percent_decode, 'a%20'),
         ('að', percent_decode, 'a%C3%B0'),
         ('a ', percent_decode_plus, 'a+'),
         ('að', percent_decode_plus, 'a%C3%B0'),
         ('a%20', percent_encode, 'a '),
         ('a%C3%B0', percent_encode, 'að'),
         ('a+', percent_encode_plus, 'a '),
         ('a%C3%B0', percent_encode_plus, 'að'),
     )

     for expected, convert, text in cases:
         self.assertEqual(expected, convert(text))
Example #3
0
    def iter_links_by_attrib(self, element):
        '''Iterate an element by looking at its attributes for links.

        Args:
            element: Object exposing an ``attrib`` mapping (``keys()`` and
                ``get()`` are used) of attribute names to values.

        Yields:
            ``(attrib_name, attrib_value)`` tuples for attributes listed in
            ``LINK_ATTRIBUTES`` and for ``data-*`` attributes whose value
            looks like a link. For JavaScript and ``srcset`` attributes,
            whatever ``iter_links_by_js_attrib`` and
            ``iter_links_by_srcset_attrib`` produce is passed through
            unchanged.
        '''
        for attrib_name in element.attrib.keys():
            try:
                attrib_value = element.attrib.get(attrib_name)
            except ValueError:
                # lxml can raise "ValueError: Empty tag name" while looking
                # up an attribute (see https://bugs.python.org/issue28236).
                # Fall back to an empty value so one malformed attribute
                # does not abort the scan of the remaining attributes.
                attrib_value = ""

            if attrib_name in self.LINK_ATTRIBUTES:
                if self.javascript_scraper and \
                        attrib_value.lstrip().startswith('javascript:'):
                    # A javascript: URL is percent-decoded before it is
                    # handed to the JavaScript attribute scraper.
                    for link in self.iter_links_by_js_attrib(
                            attrib_name, percent_decode(attrib_value)):
                        yield link
                else:
                    yield attrib_name, attrib_value

            elif self.javascript_scraper and \
                    attrib_name[:5] in self.DYNAMIC_ATTRIBUTES:
                # Only the first five characters of the name are compared
                # against DYNAMIC_ATTRIBUTES.
                for link in self.iter_links_by_js_attrib(attrib_name,
                                                         attrib_value):
                    yield link

            elif attrib_name.startswith('data-'):
                # data-* attributes are free-form, so only values that pass
                # both link heuristics are reported.
                if is_likely_link(attrib_value) \
                        and not is_unlikely_link(attrib_value):
                    yield attrib_name, attrib_value

            elif attrib_name == 'srcset':
                items = self.iter_links_by_srcset_attrib(
                    attrib_name, attrib_value)

                for item in items:
                    yield item
Example #4
0
    def iter_links_by_attrib(self, element):
        '''Walk the attributes of an element and yield the links found.

        Attributes named in ``LINK_ATTRIBUTES`` and link-like ``data-*``
        attributes are yielded as ``(name, value)`` tuples. JavaScript and
        ``srcset`` attributes are delegated to ``iter_links_by_js_attrib``
        and ``iter_links_by_srcset_attrib``, whose items are passed through.
        '''
        for name in element.attrib.keys():
            try:
                value = element.attrib.get(name)
            except ValueError:
                # lxml.etree.__getNsTag can raise ValueError: Empty tag name
                # https://bugs.python.org/issue28236
                value = ""

            # Plain link attributes (may still carry a javascript: URL).
            if name in self.LINK_ATTRIBUTES:
                is_js_url = self.javascript_scraper and \
                    value.lstrip().startswith('javascript:')

                if not is_js_url:
                    yield name, value
                    continue

                decoded = percent_decode(value)
                for link in self.iter_links_by_js_attrib(name, decoded):
                    yield link
                continue

            # Event-handler style attributes, matched on a 5-char prefix.
            if self.javascript_scraper and \
                    name[:5] in self.DYNAMIC_ATTRIBUTES:
                for link in self.iter_links_by_js_attrib(name, value):
                    yield link
                continue

            # Free-form data-* attributes: report only link-like values.
            if name.startswith('data-'):
                if is_likely_link(value) and not is_unlikely_link(value):
                    yield name, value
                continue

            if name == 'srcset':
                for item in self.iter_links_by_srcset_attrib(name, value):
                    yield item