def test_parse_refresh(self):
    self.assertEqual(
        'http://example.com',
        parse_refresh('10;url="http://example.com"'))
    self.assertEqual(
        'http://example.com',
        parse_refresh('10;url= http://example.com '))
    self.assertEqual(
        'example.com',
        parse_refresh("url =' example.com '"))
    self.assertFalse(parse_refresh('url='))
    self.assertFalse(parse_refresh('url = '))
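# The tests above only pin down parse_refresh() by example. The sketch below
# is a hypothetical, regex-based implementation that satisfies exactly these
# cases; the project's real parse_refresh may cover more edge cases.
import re


def parse_refresh(text):
    '''Parse a meta refresh value such as '10;url="http://example.com"'
    and return the URL, or None if no usable URL is present.'''
    match = re.search(r'url\s*=\s*(.+)', text, re.IGNORECASE)

    if not match:
        return None

    url = match.group(1).strip()

    # Strip one layer of surrounding quotes, then trim whitespace again so
    # that "url =' example.com '" yields 'example.com'.
    if url and url[0] in '"\'':
        url = url.strip('"\'').strip()

    return url or None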
@classmethod
def iter_links_meta_element(cls, element):
    '''Iterate the ``meta`` element for links.

    This function handles refresh URLs.
    '''
    if element.attrib.get('http-equiv', '').lower() == 'refresh':
        content_value = element.attrib.get('content')

        if content_value:
            link = parse_refresh(content_value)

            if link:
                yield LinkInfo(
                    element=element, tag=element.tag, attrib='http-equiv',
                    link=link,
                    inline=False, linked=True,
                    base_link=None,
                    value_type='refresh',
                    link_type=None  # treat it as a redirect
                )
    else:
        for link_info in cls.iter_links_open_graph_meta(element):
            yield link_info
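# Rough illustration of what the generator above yields for a refresh meta
# element. The stub below stands in for the walker's real Element type (only
# .tag and .attrib are read here), and the ElementWalker class name is taken
# from its use elsewhere in this module; both are assumptions.
from types import SimpleNamespace

meta_stub = SimpleNamespace(
    tag='meta',
    attrib={'http-equiv': 'Refresh',
            'content': '10;url="http://example.com/next"'},
)

for info in ElementWalker.iter_links_meta_element(meta_stub):
    # Expected: link='http://example.com/next', value_type='refresh',
    # linked=True, inline=False.
    print(info.link, info.value_type)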
def _process_elements(self, elements, response, base_url, link_contexts):
    robots_check_needed = self._robots
    robots_no_follow = False
    inject_refresh = True
    doc_base_url = None

    for element in elements:
        if not isinstance(element, Element):
            continue

        if robots_check_needed and ElementWalker.robots_cannot_follow(element):
            robots_check_needed = False
            robots_no_follow = True

        if not doc_base_url and element.tag == 'base':
            doc_base_url = urljoin_safe(
                base_url, clean_link_soup(element.attrib.get('href', ''))
            )

        link_infos = self._element_walker.iter_links_element(element)

        if inject_refresh and 'Refresh' in response.fields:
            link = parse_refresh(response.fields['Refresh'])

            if link:
                link_info = LinkInfo(
                    element=None, tag='_refresh', attrib=None,
                    link=link,
                    inline=False, linked=True,
                    base_link=None,
                    value_type='refresh',
                    link_type=None  # treat it as a redirect
                )
                link_infos = itertools.chain(link_infos, [link_info])

            inject_refresh = False
        else:
            inject_refresh = False

        for link_info in link_infos:
            if self._only_relative:
                if link_info.base_link or '://' in link_info.link:
                    continue

            if not self._is_accepted(link_info.tag):
                continue

            element_base_url = doc_base_url or base_url

            if link_info.base_link:
                clean_base_url = clean_link_soup(link_info.base_link)

                if clean_base_url:
                    element_base_url = urljoin_safe(
                        base_url, clean_base_url
                    ) or base_url

            cleaned_url = clean_link_soup(link_info.link)

            if not cleaned_url:
                continue

            url = urljoin_safe(
                element_base_url, cleaned_url, allow_fragments=False
            )

            if url:
                link_contexts.add(LinkContext(
                    url,
                    inline=link_info.inline,
                    linked=link_info.linked,
                    link_type=link_info.link_type,
                    extra=link_info,
                ))

    return {'robots_no_follow': robots_no_follow}
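# urljoin_safe() is used above with an `or base_url` fallback and an
# `if url:` guard, which implies it returns None rather than raising on bad
# input. A minimal sketch under that assumption (not necessarily the real
# implementation):
from urllib.parse import urljoin


def urljoin_safe(base_url, url, allow_fragments=True):
    try:
        result = urljoin(base_url, url, allow_fragments=allow_fragments)
    except ValueError:
        return None

    return result or None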