Esempio n. 1
0
    def iter_links_by_attrib(self, element):
        '''Iterate an element by looking at its attributes for links.

        Each attribute is dispatched on its name:

        * Names in ``self.LINK_ATTRIBUTES`` are yielded as-is, unless a
          JavaScript scraper is configured and the value starts with
          ``javascript:``; then the percent-decoded value is handed to
          ``self.iter_links_by_js_attrib``.
        * Names whose first five characters are in
          ``self.DYNAMIC_ATTRIBUTES`` are handed to
          ``self.iter_links_by_js_attrib`` when a JavaScript scraper is
          configured.
        * ``data-*`` attributes are yielded only when the value passes
          ``is_likely_link`` and is not rejected by ``is_unlikely_link``.
        * ``srcset`` is handed to ``self.iter_links_by_srcset_attrib``.

        Any other attribute is ignored.

        Args:
            element: Object exposing an ``attrib`` mapping that supports
                ``keys()`` and ``get()``.

        Yields:
            ``(attrib_name, attrib_value)`` tuples from the direct branches,
            plus whatever the delegated ``iter_links_by_*`` helpers yield.
        '''
        for attrib_name in element.attrib.keys():
            try:
                attrib_value = element.attrib.get(attrib_name)
            except ValueError:
                # lxml's attribute lookup can raise
                # "ValueError: Empty tag name" on malformed names
                # (https://bugs.python.org/issue28236). Fall back to an
                # empty value so one bad attribute does not abort the scan.
                attrib_value = ''

            if attrib_name in self.LINK_ATTRIBUTES:
                if self.javascript_scraper and \
                        attrib_value.lstrip().startswith('javascript:'):
                    # A javascript: URL is script text, not a plain link.
                    for link in self.iter_links_by_js_attrib(
                            attrib_name, percent_decode(attrib_value)):
                        yield link
                else:
                    yield attrib_name, attrib_value

            elif self.javascript_scraper and \
                    attrib_name[:5] in self.DYNAMIC_ATTRIBUTES:
                # Only the five-character prefix of the name is matched.
                for link in self.iter_links_by_js_attrib(attrib_name,
                                                         attrib_value):
                    yield link

            elif attrib_name.startswith('data-'):
                # data-* values are free-form, so both heuristics must agree.
                if is_likely_link(attrib_value) \
                        and not is_unlikely_link(attrib_value):
                    yield attrib_name, attrib_value

            elif attrib_name == 'srcset':
                items = self.iter_links_by_srcset_attrib(
                    attrib_name, attrib_value)

                for item in items:
                    yield item
Esempio n. 2
0
    def iter_processed_text(self, file, encoding=None, base_url=None):
        '''Yield text fragments, marking those that resolve to links.

        Fragments come from ``self.iter_text(file, encoding)``. A fragment
        flagged as a link is decoded as the body of a JSON string literal,
        screened with ``is_unlikely_link`` / ``is_likely_link`` and, when
        ``base_url`` is given, joined against it with ``urljoin_safe``.

        Yields:
            ``(link, link_type_or_True)`` for an accepted link, otherwise
            ``(original_text, False)``.
        '''
        for fragment, flagged in self.iter_text(file, encoding):
            link = None

            if flagged:
                try:
                    unescaped = json.loads('"{0}"'.format(fragment))
                except ValueError:
                    # Not a valid string body; keep it as plain text.
                    unescaped = None

                if unescaped is not None \
                        and not is_unlikely_link(unescaped) \
                        and is_likely_link(unescaped):
                    if base_url:
                        link = urljoin_safe(base_url, unescaped,
                                            allow_fragments=False)
                    else:
                        link = unescaped

            if link:
                yield (link, identify_link_type(link) or True)
            else:
                yield (fragment, False)
Esempio n. 3
0
    def iter_links_by_attrib(self, element):
        '''Scan the attributes of an element and yield the links found.

        Every attribute is classified by name. Each branch picks an iterable
        of results, and that iterable is then drained into the caller.
        '''
        attributes = element.attrib

        for name in attributes.keys():
            try:
                value = attributes.get(name)
            except ValueError:
                # lxml.etree.__getNsTag can raise ValueError: Empty tag name
                # https://bugs.python.org/issue28236
                value = ""

            if name in self.LINK_ATTRIBUTES:
                if self.javascript_scraper \
                        and value.lstrip().startswith('javascript:'):
                    found = self.iter_links_by_js_attrib(
                        name, percent_decode(value))
                else:
                    found = ((name, value),)
            elif self.javascript_scraper \
                    and name[:5] in self.DYNAMIC_ATTRIBUTES:
                found = self.iter_links_by_js_attrib(name, value)
            elif name.startswith('data-'):
                # Free-form values: require both heuristics to agree.
                plausible = is_likely_link(value) \
                    and not is_unlikely_link(value)
                found = ((name, value),) if plausible else ()
            elif name == 'srcset':
                found = self.iter_links_by_srcset_attrib(name, value)
            else:
                found = ()

            for link in found:
                yield link
Esempio n. 4
0
    def iter_processed_text(self, file, encoding=None, base_url=None):
        '''Walk the text from ``self.iter_text`` and resolve link candidates.

        Yields:
            ``(link, link_type_or_True)`` when a flagged fragment survives
            decoding, the link heuristics and URL joining; otherwise
            ``(original_text, False)``.
        '''
        def to_link(raw):
            # Return the usable link for a candidate, or a falsy value when
            # the candidate has to be treated as plain text.
            try:
                candidate = json.loads('"{0}"'.format(raw))
            except ValueError:
                return None

            if is_unlikely_link(candidate) or not is_likely_link(candidate):
                return None

            if not base_url:
                return candidate

            return urljoin_safe(base_url, candidate, allow_fragments=False)

        for text, is_link in self.iter_text(file, encoding):
            link = to_link(text) if is_link else None

            if link:
                yield (link, identify_link_type(link) or True)
            else:
                yield (text, False)
Esempio n. 5
0
 def test_is_unlikely_link(self):
     # Strings that is_unlikely_link is expected to flag as not being links.
     rejected = (
         'example.com+',
         'www.',
         ':example.com',
         ',example.com',
         'http:',
         '.example.com',
         'doc[0]',
         '/',
         '//',
         'application/json',
         'application/javascript',
         'text/javascript',
         'text/plain',
         '/\\/',
         'a.help',
         'div.menu',
         'apikey={YOUR_API_KEY_HERE}',
     )
     # Strings that is_unlikely_link is expected to let through.
     accepted = (
         'http://',
         'example',
         'example.com',
         '//example.com/assets/image.css',
         './image.css',
         '../image.css',
         'index.html',
         'body.html',
     )

     for candidate in rejected:
         self.assertTrue(is_unlikely_link(candidate))

     for candidate in accepted:
         self.assertFalse(is_unlikely_link(candidate))
Esempio n. 6
0
 def test_is_unlikely_link(self):
     # Each row pairs an assertion with the samples it is applied to.
     # Rows and samples are checked in the order listed.
     expectations = (
         (self.assertTrue, (
             'example.com+',
             'www.',
             ':example.com',
             ',example.com',
             'http:',
             '.example.com',
             'doc[0]',
             '/',
             '//',
             'application/json',
             'application/javascript',
             'text/javascript',
             'text/plain',
             '/\\/',
             'a.help',
             'div.menu',
             'apikey={YOUR_API_KEY_HERE}',
         )),
         (self.assertFalse, (
             'http://',
             'example',
             'example.com',
             '//example.com/assets/image.css',
             './image.css',
             '../image.css',
             'index.html',
             'body.html',
         )),
     )

     for check, samples in expectations:
         for sample in samples:
             check(is_unlikely_link(sample))