Example #1
    def test_parse_refresh(self):
        self.assertEqual('http://example.com',
                         parse_refresh('10;url="http://example.com"'))
        self.assertEqual('http://example.com',
                         parse_refresh('10;url= http://example.com '))
        self.assertEqual('example.com', parse_refresh("url =' example.com '"))
        self.assertFalse(parse_refresh('url='))
        self.assertFalse(parse_refresh('url =     '))
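The assertions above pin down the expected behaviour: the numeric delay prefix, any quoting, and surrounding whitespace are stripped, and an empty URL is treated as no URL. Below is a minimal sketch of a parser with that behaviour, not the library's actual implementation of parse_refresh, which may differ in its details.

import re

def parse_refresh(text):
    '''Return the URL from a Refresh value such as '10;url=...', or None.

    Sketch only: accepts an optional delay prefix, optional quoting and
    surrounding whitespace, mirroring the assertions in the test above.
    '''
    match = re.search(r'url\s*=\s*(.*)', text, re.IGNORECASE)
    if not match:
        return None
    url = match.group(1).strip()
    # Strip one layer of matching quotes, then re-strip whitespace that
    # may sit inside them (e.g. "' example.com '").
    if len(url) >= 2 and url[0] == url[-1] and url[0] in ('"', "'"):
        url = url[1:-1].strip()
    return url or None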
Example #2
    @classmethod
    def iter_links_meta_element(cls, element):
        """Iterate the ``meta`` element for links.

        This function handles refresh URLs.
        """
        if element.attrib.get("http-equiv", "").lower() == "refresh":
            content_value = element.attrib.get("content")

            if content_value:
                link = parse_refresh(content_value)

                if link:
                    yield LinkInfo(
                        element=element,
                        tag=element.tag,
                        attrib="http-equiv",
                        link=link,
                        inline=False,
                        linked=True,
                        base_link=None,
                        value_type="refresh",
                        link_type=None,  # treat it as a redirect
                    )
        else:
            for link_info in cls.iter_links_open_graph_meta(element):
                yield link_info
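A hypothetical usage sketch for the generator above, assuming it is a classmethod on the walker class (called ElementWalker in Example #6). It uses a stand-in object instead of the library's own element type, providing only the .tag and .attrib attributes the method actually reads.

class FakeElement:
    '''Stand-in element exposing only .tag and .attrib (hypothetical).'''

    def __init__(self, tag, attrib):
        self.tag = tag
        self.attrib = attrib

meta = FakeElement('meta', {
    'http-equiv': 'Refresh',
    'content': '10;url=http://example.com/next',
})

for link_info in ElementWalker.iter_links_meta_element(meta):
    print(link_info.link)        # expected: 'http://example.com/next'
    print(link_info.value_type)  # 'refresh'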
Example #3
    @classmethod
    def iter_links_meta_element(cls, element):
        '''Iterate the ``meta`` element for links.

        This function handles refresh URLs.
        '''
        if element.attrib.get('http-equiv', '').lower() == 'refresh':
            content_value = element.attrib.get('content')

            if content_value:
                link = parse_refresh(content_value)

                if link:
                    yield LinkInfo(
                        element=element,
                        tag=element.tag,
                        attrib='http-equiv',
                        link=link,
                        inline=False,
                        linked=True,
                        base_link=None,
                        value_type='refresh',
                        link_type=None  # treat it as a redirect
                    )
        else:
            for link_info in cls.iter_links_open_graph_meta(element):
                yield link_info
Example #4
    def test_parse_refresh(self):
        self.assertEqual(
            'http://example.com', parse_refresh('10;url="http://example.com"')
        )
        self.assertEqual(
            'http://example.com', parse_refresh('10;url= http://example.com ')
        )
        self.assertEqual(
            'example.com', parse_refresh("url =' example.com '")
        )
        self.assertFalse(
            parse_refresh('url=')
        )
        self.assertFalse(
            parse_refresh('url =     ')
        )
Example #5
    @classmethod
    def iter_links_meta_element(cls, element):
        '''Iterate the ``meta`` element for links.

        This function handles refresh URLs.
        '''
        if element.attrib.get('http-equiv', '').lower() == 'refresh':
            content_value = element.attrib.get('content')

            if content_value:
                link = parse_refresh(content_value)

                if link:
                    yield LinkInfo(
                        element=element, tag=element.tag, attrib='http-equiv',
                        link=link,
                        inline=False, linked=True,
                        base_link=None,
                        value_type='refresh',
                        link_type=None  # treat it as a redirect
                    )
        else:
            for link_info in cls.iter_links_open_graph_meta(element):
                yield link_info
Example #6
    def _process_elements(self, elements, response, base_url, link_contexts):
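        """Walk parsed document elements and collect their links.

        Resolves each link against the document's ``base`` URL (falling
        back to ``base_url``), injects at most one link parsed from the
        HTTP ``Refresh`` header, adds the results to ``link_contexts``,
        and reports whether a robots no-follow directive was seen.
        """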
        robots_check_needed = self._robots
        robots_no_follow = False
        inject_refresh = True
        doc_base_url = None

        for element in elements:
            if not isinstance(element, Element):
                continue

            if robots_check_needed and ElementWalker.robots_cannot_follow(element):
                robots_check_needed = False
                robots_no_follow = True

            if not doc_base_url and element.tag == 'base':
                doc_base_url = urljoin_safe(
                    base_url, clean_link_soup(element.attrib.get('href', ''))
                )

            link_infos = self._element_walker.iter_links_element(element)

            if inject_refresh and 'Refresh' in response.fields:
                link = parse_refresh(response.fields['Refresh'])

                if link:
                    link_info = LinkInfo(
                        element=None, tag='_refresh', attrib=None,
                        link=link,
                        inline=False, linked=True,
                        base_link=None, value_type='refresh',
                        link_type=None  # treat it as a redirect
                    )
                    link_infos = itertools.chain(link_infos, [link_info])

                inject_refresh = False
            else:
                inject_refresh = False

            for link_info in link_infos:
                if self._only_relative:
                    if link_info.base_link or '://' in link_info.link:
                        continue

                if not self._is_accepted(link_info.tag):
                    continue

                element_base_url = doc_base_url or base_url

                if link_info.base_link:
                    clean_base_url = clean_link_soup(link_info.base_link)

                    if clean_base_url:
                        element_base_url = urljoin_safe(
                            base_url, clean_base_url
                        ) or base_url

                cleaned_url = clean_link_soup(link_info.link)

                if not cleaned_url:
                    continue

                url = urljoin_safe(
                    element_base_url,
                    cleaned_url,
                    allow_fragments=False
                )

                if url:
                    link_contexts.add(LinkContext(
                        url,
                        inline=link_info.inline,
                        linked=link_info.linked,
                        link_type=link_info.link_type,
                        extra=link_info,
                    ))

        return {'robots_no_follow': robots_no_follow}
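A short sketch of the resolution step Example #6 applies to a refresh target, using the standard library's urljoin as a stand-in for the urljoin_safe helper (an assumption: urljoin_safe is taken here to behave like urljoin with extra safety checks, and the real call also passes allow_fragments=False). The Refresh value and URLs are hypothetical.

from urllib.parse import urljoin

base_url = 'http://example.com/articles/page.html'

# Hypothetical Refresh value with a relative target.
link = parse_refresh('3;url=../next.html')   # expected: '../next.html'

print(urljoin(base_url, link))               # 'http://example.com/next.html'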