Example #1
    def test_clean_link_soup(self):
        self.assertEqual(
            'http://example.com',
            clean_link_soup('http://example.com  '))
        self.assertEqual(
            'http://example.com/',
            clean_link_soup('\n\r\thttp://example.com\n\r\r\r\n\t/'))
        self.assertEqual(
            'http://example.com/ something',
            clean_link_soup('http://example.com\n\t / something  \n\r\t'))
        self.assertEqual(
            'http://example.com/dog cat/',
            clean_link_soup('http://example.com/\n dog \tcat\r/\n'))
        self.assertEqual(
            'ßðf ¤Jáßðff ßðfœ³²œ¤ œë ßfœ',
            clean_link_soup('ß\tðf ¤Jáßðf\n f ßðfœ³²œ¤ œë ßfœ '))
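These assertions pin down the behaviour of clean_link_soup: each line of the input is stripped of leading and trailing whitespace, embedded tabs are dropped, and the fragments are concatenated. A minimal sketch that satisfies the tests above, offered as an illustration rather than the library's actual implementation:

def clean_link_soup(link):
    # Strip each line, drop embedded tabs, and join the fragments back
    # together. This sketch reproduces the assertions above; the real
    # implementation may differ in detail.
    return ''.join(
        line.strip().replace('\t', '') for line in link.splitlines()
    )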
Example #2
    def scrape_file(self, file, encoding=None, base_url=None):
        '''Scrape a file for links.

        See :meth:`scrape` for the return value.
        '''
        elements = self.iter_elements(file, encoding=encoding)

        link_contexts = set()

        link_infos = self._element_walker.iter_links(elements)

        for link_info in link_infos:
            element_base_url = base_url

            # Resolve an element-supplied base link against the caller's base_url.
            if link_info.base_link:
                clean_base_url = clean_link_soup(link_info.base_link)

                if element_base_url and clean_base_url:
                    element_base_url = urljoin_safe(
                        base_url, clean_base_url
                    ) or base_url

            if element_base_url:
                url = urljoin_safe(
                    element_base_url,
                    clean_link_soup(link_info.link),
                    allow_fragments=False
                )
            else:
                url = clean_link_soup(link_info.link)

            if url:
                link_contexts.add(LinkContext(
                    url,
                    inline=link_info.inline,
                    linked=link_info.linked,
                    link_type=link_info.link_type,
                    extra=link_info
                ))

        scrape_result = ScrapeResult(link_contexts, encoding)
        scrape_result['base_url'] = base_url
        return scrape_result
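scrape_file resolves each extracted link in two steps: an element-supplied base link is first joined against the caller's base_url, and the link itself is then joined against the result. Below is a self-contained sketch of that resolution order; resolve is an illustrative helper (not part of the library), the standard library's urljoin stands in for urljoin_safe (assumed to be a join plus validation), and clean_link_soup and the allow_fragments flag are omitted for brevity:

from urllib.parse import urljoin

def resolve(link, base_url=None, base_link=None):
    # Mirror the resolution order in scrape_file() above: the element's
    # base link is resolved against base_url first, then the link itself
    # is resolved against that element-level base.
    element_base_url = base_url
    if base_link and base_url:
        element_base_url = urljoin(base_url, base_link) or base_url
    if element_base_url:
        return urljoin(element_base_url, link)
    return link

print(resolve('image.png', base_url='http://example.com/blog/entry/'))
# http://example.com/blog/entry/image.png
print(resolve('image.png', base_url='http://example.com/blog/entry/',
              base_link='/static/'))
# http://example.com/static/image.png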
Example #3
    def _process_elements(self, elements, response, base_url, link_contexts):
        robots_check_needed = self._robots
        robots_no_follow = False
        inject_refresh = True
        doc_base_url = None

        for element in elements:
            if not isinstance(element, Element):
                continue

            # Once a robots directive that forbids following links is seen,
            # remember it and skip further checks.
            if robots_check_needed and ElementWalker.robots_cannot_follow(element):
                robots_check_needed = False
                robots_no_follow = True

            # The first <base href="..."> element sets the document-level base URL.
            if not doc_base_url and element.tag == 'base':
                doc_base_url = urljoin_safe(
                    base_url, clean_link_soup(element.attrib.get('href', ''))
                )

            link_infos = self._element_walker.iter_links_element(element)

            if inject_refresh and 'Refresh' in response.fields:
                link = parse_refresh(response.fields['Refresh'])

                if link:
                    link_info = LinkInfo(
                        element=None, tag='_refresh', attrib=None,
                        link=link,
                        inline=False, linked=True,
                        base_link=None, value_type='refresh',
                        link_type=None  # treat it as a redirect
                    )
                    link_infos = itertools.chain(link_infos, [link_info])

            # The Refresh header is only considered for the first element.
            inject_refresh = False

            for link_info in link_infos:
                if self._only_relative:
                    if link_info.base_link or '://' in link_info.link:
                        continue

                if not self._is_accepted(link_info.tag):
                    continue

                element_base_url = doc_base_url or base_url

                if link_info.base_link:
                    clean_base_url = clean_link_soup(link_info.base_link)

                    if clean_base_url:
                        element_base_url = urljoin_safe(
                            base_url, clean_base_url
                        ) or base_url

                cleaned_url = clean_link_soup(link_info.link)

                if not cleaned_url:
                    continue

                url = urljoin_safe(
                    element_base_url,
                    cleaned_url,
                    allow_fragments=False
                )

                if url:
                    link_contexts.add(LinkContext(
                        url,
                        inline=link_info.inline,
                        linked=link_info.linked,
                        link_type=link_info.link_type,
                        extra=link_info,
                    ))

        return {'robots_no_follow': robots_no_follow}
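The Refresh handling above turns an HTTP Refresh header into a pseudo-link attached to the first element and marks it as a redirect-like link. A rough, self-contained sketch of what parse_refresh is assumed to do with a header value such as '5; url=http://example.com/next'; the real helper likely handles more quoting and edge cases:

import re

def parse_refresh_sketch(value):
    # Pull the target out of the 'url=' portion of a Refresh header value,
    # e.g. '5; url=http://example.com/next'; return None when absent.
    # Simplified compared with a production parser.
    match = re.search(r'url\s*=\s*(\S+)', value, re.IGNORECASE)
    if match:
        return match.group(1).strip('\'"')
    return None

print(parse_refresh_sketch('5; url=http://example.com/next'))
# http://example.com/next
print(parse_refresh_sketch('30'))
# None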