def test_inside_script(self):
    """A meta refresh that only appears inside <script> content.

    The default call is expected to report no redirect; passing an empty
    ``ignore_tags`` is expected to expose the redirect.
    """
    base = 'http://example.org'
    page = """ <html> <head><script>if(!foobar()){ $('<meta http-equiv="refresh" content="0;url=http://example.org/foobar_required" />').appendTo('body'); }</script></head> </html>"""

    # Default arguments: nothing should be found.
    default_result = get_meta_refresh(page, base)
    self.assertEqual(default_result, (None, None))

    # Empty ignore_tags: the tag embedded in the script text is reported.
    unfiltered_result = get_meta_refresh(page, base, ignore_tags=())
    self.assertEqual(unfiltered_result, (0.0, "http://example.org/foobar_required"))
def test_float_refresh_intervals(self):
    """Fractional refresh intervals are expected to come back as floats."""
    base = 'http://example.com'
    target = 'http://example.com/index.html'
    # (markup, expected interval) pairs, checked in the original order.
    cases = (
        ("""<meta http-equiv="refresh" content=".1;URL=index.html" />""", 0.1),
        ("""<meta http-equiv="refresh" content="3.1;URL=index.html" />""", 3.1),
    )
    for markup, interval in cases:
        self.assertEqual(get_meta_refresh(markup, base), (interval, target))
def test_redirections_in_different_ordering__in_meta_tag(self):
    """The attribute order inside the <meta> tag must not affect the result."""
    base = "http://localhost:8000"
    expected = (0.0, "http://localhost:8000/dummy.html")
    documents = (
        # http-equiv first, content second
        '<html><head><meta http-equiv="refresh" content="0;url=dummy.html"></head></html>',
        # content first, http-equiv second
        '<html><head><meta content="0;url=dummy.html" http-equiv="refresh"></head></html>',
    )
    for document in documents:
        self.assertEqual(get_meta_refresh(document, base), expected)
def test_inside_noscript(self):
    """A meta refresh wrapped in <noscript>.

    The default call is expected to report no redirect; passing an empty
    ``ignore_tags`` is expected to expose the redirect.
    """
    base = 'http://example.org'
    page = """ <html> <head><noscript><meta http-equiv="refresh" content="0;url=http://example.org/javascript_required" /></noscript></head> </html>"""

    # Default arguments: nothing should be found.
    default_result = get_meta_refresh(page, base)
    self.assertEqual(default_result, (None, None))

    # Empty ignore_tags: the tag inside <noscript> is reported.
    unfiltered_result = get_meta_refresh(page, base, ignore_tags=())
    self.assertEqual(unfiltered_result, (0.0, "http://example.org/javascript_required"))
def test_without_url(self):
    """A refresh with no url part yields (None, None); with a url it is returned."""
    base = 'http://example.org'

    # Interval only, no url component.
    interval_only = """<meta http-equiv="refresh" content="5" />"""
    self.assertEqual(get_meta_refresh(interval_only, base), (None, None))

    # Same interval, now followed by a url.
    with_url = """<meta http-equiv="refresh" content="5; url=http://example.org/newpage" /></head>"""
    self.assertEqual(get_meta_refresh(with_url, base), (5, 'http://example.org/newpage'))
def get_meta_refresh(response):
    """Return the http-equiv refresh information parsed from ``response``.

    Only the first 4096 characters returned by ``response.body_as_unicode()``
    are handed to ``html.get_meta_refresh``. The parsed value is stored in
    ``_metaref_cache`` under the response, so later calls for the same
    response reuse it.
    """
    already_parsed = response in _metaref_cache
    if not already_parsed:
        head = response.body_as_unicode()[:4096]
        refresh = html.get_meta_refresh(head, response.url, response.encoding)
        _metaref_cache[response] = refresh
    return _metaref_cache[response]
def test_relative_redirects(self):
    """A relative refresh target is expected to be resolved against the base URL."""
    base = "http://example.com/page/this.html"
    markup = """<meta http-equiv="refresh" content="3; url=other.html">"""
    expected = (3, "http://example.com/page/other.html")
    self.assertEqual(get_meta_refresh(markup, base), expected)
def test_nonascii_url_latin1_query(self):
    """Latin-1 body with non-ASCII bytes in both the URL path and the query.

    The expected URL has the path byte escaped as UTF-8 (``%C2%A3``) while
    the query keeps its Latin-1 byte before escaping (``%B5``).
    """
    base = 'http://example.com'
    raw_page = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3?unit=\xb5">"""
    expected = (3, 'http://example.com/to%C2%A3?unit=%B5')
    self.assertEqual(get_meta_refresh(raw_page, base, 'latin1'), expected)
def test_nonascii_url_utf8(self):
    """UTF-8 bytes in the URL (no encoding argument passed) get percent-escaped."""
    base = "http://example.com"
    raw_page = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
    expected = (3, "http://example.com/to%C2%A3")
    self.assertEqual(get_meta_refresh(raw_page, base), expected)
def get_meta_refresh(response):
    """Return the http-equiv refresh information parsed from ``response``.

    Only the first 4096 characters of ``response.text`` are handed to
    ``html.get_meta_refresh``, always with ``ignore_tags=('script',
    'noscript')``. The parsed value is stored in ``_metaref_cache`` under
    the response, so later calls for the same response reuse it.
    """
    already_parsed = response in _metaref_cache
    if not already_parsed:
        head = response.text[:4096]
        refresh = html.get_meta_refresh(
            head,
            response.url,
            response.encoding,
            ignore_tags=('script', 'noscript'),
        )
        _metaref_cache[response] = refresh
    return _metaref_cache[response]
def test_entities_in_redirect_url(self):
    """HTML entities around the redirect url must be decoded and stripped.

    The url is wrapped in ``&#39;`` (an encoded apostrophe) so that the
    body really contains entities; with plain quote characters the test
    would not exercise entity handling at all.
    """
    baseurl = "http://example.org"
    body = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
    self.assertEqual(
        get_meta_refresh(body, baseurl), (3, "http://www.example.com/other")
    )
def test_nonascii_url_latin1(self):
    """Latin-1 byte in the URL path of a Latin-1 body.

    The expected URL carries the character escaped as UTF-8 (``%C2%A3``).
    """
    base = 'http://example.com'
    raw_page = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
    expected = (3, 'http://example.com/to%C2%A3')
    self.assertEqual(get_meta_refresh(raw_page, base, 'latin1'), expected)
def get_meta_refresh(response, ignore_tags=('script', 'noscript')):
    """Return the http-equiv refresh information parsed from ``response``.

    Only the first 4096 characters of ``response.text`` are handed to
    ``html.get_meta_refresh``, together with ``ignore_tags``.

    The parsed value is stored in ``_metaref_cache`` keyed by the response
    alone, so a later call for the same response returns the stored value
    even if it passes a different ``ignore_tags``.
    """
    already_parsed = response in _metaref_cache
    if not already_parsed:
        head = response.text[:4096]
        refresh = html.get_meta_refresh(
            head,
            response.url,
            response.encoding,
            ignore_tags=ignore_tags,
        )
        _metaref_cache[response] = refresh
    return _metaref_cache[response]
def test_tag_name(self):
    """A tag whose name merely starts with "meta" (<metafoo>) must not match."""
    base = 'http://example.org'
    page = """ <html> <head><title>Dummy</title><metafoo http-equiv="refresh" content="5;url=http://example.org/newpage" /></head> <body>blahablsdfsal&</body> </html>"""
    nothing_found = (None, None)
    self.assertEqual(get_meta_refresh(page, base), nothing_found)
def test_multiline(self):
    """A <META> tag whose attributes are spread over several lines is parsed.

    The line breaks are written as explicit ``\\n`` escapes so that the body
    stays multi-line even if the source file is reflowed; a body on a single
    line would not exercise what this test is named for.
    """
    baseurl = 'http://example.org'
    body = """<html><head>\n<META\nHTTP-EQUIV="Refresh"\nCONTENT="1; URL=http://example.org/newpage">"""
    self.assertEqual(get_meta_refresh(body, baseurl), (1, 'http://example.org/newpage'))
def test_leading_newline_in_url(self):
    """A line break between ``URL=`` and the address must not break parsing.

    The newline is written as an explicit ``\\n`` escape so it cannot be
    lost to whitespace normalisation; with a plain space in its place the
    test would not cover the case it is named for.
    """
    baseurl = 'http://example.org'
    body = """ <html> <head><title>Dummy</title><meta http-equiv="refresh" content="0; URL=\nhttp://www.example.org/index.php" /> </head> </html>"""
    self.assertEqual(get_meta_refresh(body, baseurl), (0.0, 'http://www.example.org/index.php'))
def get_meta_refresh(response):
    """Return the http-equiv refresh information parsed from ``response``.

    The first 4096 characters of ``response.text`` are taken, every match of
    ``_noscript_re`` and then of ``_script_re`` is replaced by an empty
    string, and the remainder is handed to ``html.get_meta_refresh``. The
    parsed value is stored in ``_metaref_cache`` under the response, so
    later calls for the same response reuse it.
    """
    already_parsed = response in _metaref_cache
    if not already_parsed:
        head = response.text[:4096]
        # Same order as before: the noscript pattern first, then the script one
        # (presumably these match whole <noscript>/<script> blocks).
        for pattern in (_noscript_re, _script_re):
            head = pattern.sub(u'', head)
        refresh = html.get_meta_refresh(head, response.url, response.encoding)
        _metaref_cache[response] = refresh
    return _metaref_cache[response]
def test_get_meta_refresh(self):
    """A plain meta refresh in <head> yields its interval and absolute url."""
    base = "http://example.org"
    page = """ <html> <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head> <body>blahablsdfsal&</body> </html>"""
    expected = (5, "http://example.org/newpage")
    self.assertEqual(get_meta_refresh(page, base), expected)
def get_meta_refresh(
    response: "scrapy.http.response.text.TextResponse",
    ignore_tags: Optional[Iterable[str]] = ('script', 'noscript'),
) -> Union[Tuple[None, None], Tuple[float, str]]:
    """Return the http-equiv refresh information parsed from ``response``.

    Only the first 4096 characters of ``response.text`` are handed to
    ``html.get_meta_refresh``, together with ``ignore_tags``.

    The parsed value is stored in ``_metaref_cache`` keyed by the response
    alone, so a later call for the same response returns the stored value
    even if it passes a different ``ignore_tags``.
    """
    already_parsed = response in _metaref_cache
    if not already_parsed:
        head = response.text[:4096]
        refresh = html.get_meta_refresh(
            head,
            response.url,
            response.encoding,
            ignore_tags=ignore_tags,
        )
        _metaref_cache[response] = refresh
    return _metaref_cache[response]
def get_html_meta_refresh(response):
    """Return the redirect url found in the page's meta refresh tag.

    ``response`` is passed through ``html_to_unicode`` and the resulting
    text through ``get_meta_refresh``. According to the original note,
    that call yields an ``(interval, url)`` pair, where ``interval`` is the
    redirect delay (0 when absent), or ``(None, None)`` when the tag does
    not exist. Only the element at index 1 (the url) is returned here.
    """
    return get_meta_refresh(html_to_unicode(response))[1]
def get_url(self, response):
    """Fill in the url of the pending search result and yield the parsed item.

    The pending result is read from ``response.meta['result']``. For an
    ``HtmlResponse`` the url is taken from ``get_meta_refresh``; the result
    is then checked with ``self.isredditspam_link``, wrapped in a
    ``SearchResultItem`` and passed to ``self.parse_result``.
    """
    result = response.meta['result']
    # Stays None when the response is not an HtmlResponse.
    url = None
    if isinstance(response, HtmlResponse):
        # Only the url half of the returned pair is used; interval is ignored.
        interval, url = get_meta_refresh(response.body, response.url, response.encoding, ignore_tags=())
    # NOTE(review): the original layout was flattened -- confirm this
    # assignment is meant to run for non-HTML responses too (url is None then).
    result['url'] = url
    # mark probable spam
    if self.isredditspam_link(result['url']):
        result['spam'] = 'url'
    result = SearchResultItem(result)
    yield self.parse_result(result)
def test_nonascii_url_latin1(self):
    """Latin-1 byte in the URL of a Latin-1 body.

    The expected URL carries the byte percent-escaped as ``%A3``.
    """
    base = 'http://example.com'
    raw_page = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
    expected = (3, 'http://example.com/to%A3')
    self.assertEqual(get_meta_refresh(raw_page, base, 'latin1'), expected)
def test_entities_in_redirect_url(self):
    """HTML entities around the redirect url must be decoded and stripped.

    The url is wrapped in ``&#39;`` (an encoded apostrophe) so that the
    body really contains entities; with plain quote characters the test
    would not exercise entity handling at all.
    """
    baseurl = 'http://example.org'
    body = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
    self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://www.example.com/other'))
def test_html_comments_with_uncommented_meta_refresh(self):
    """An HTML comment next to a live meta refresh must not hide the tag."""
    base = 'http://example.com'
    # The comment is closed before the <meta>; the trailing "-->" is stray text.
    markup = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
    expected = (3, 'http://example.com/')
    self.assertEqual(get_meta_refresh(markup, base), expected)
def test_get_meta_refresh(self):
    """Exercise get_meta_refresh across the supported meta refresh variants.

    Bodies that must carry specific raw bytes (UTF-16, UTF-8 and Latin-1
    cases) are built as bytes objects, so the test has the same meaning on
    Python 2 and Python 3: ``str`` has no ``decode`` method on Python 3, and
    ``\\xNN`` escapes in a text literal denote code points there, not bytes.
    """
    baseurl = 'http://example.org'
    body = """ <html> <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head> <body>blahablsdfsal&</body> </html>"""
    self.assertEqual(get_meta_refresh(body, baseurl), (5, 'http://example.org/newpage'))

    # refresh without url should return (None, None)
    body = """<meta http-equiv="refresh" content="5" />"""
    self.assertEqual(get_meta_refresh(body, baseurl), (None, None))

    body = """<meta http-equiv="refresh" content="5; url=http://example.org/newpage" /></head>"""
    self.assertEqual(get_meta_refresh(body, baseurl), (5, 'http://example.org/newpage'))

    # meta refresh in multiple lines (explicit \n so the body really is multi-line)
    body = """<html><head>\n<META\nHTTP-EQUIV="Refresh"\nCONTENT="1; URL=http://example.org/newpage">"""
    self.assertEqual(get_meta_refresh(body, baseurl), (1, 'http://example.org/newpage'))

    # entities in the redirect url (&#39; is an encoded apostrophe)
    body = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
    self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://www.example.com/other'))

    # relative redirects
    baseurl = 'http://example.com/page/this.html'
    body = """<meta http-equiv="refresh" content="3; url=other.html">"""
    self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/page/other.html'))

    # non-standard encodings (utf-16)
    baseurl = 'http://example.com'
    body = """<meta http-equiv="refresh" content="3; url=http://example.com/redirect">"""
    # Encode directly: the literal is pure ASCII, and str.decode() does not
    # exist on Python 3.
    body = body.encode('utf-16')
    self.assertEqual(get_meta_refresh(body, baseurl, 'utf-16'), (3, 'http://example.com/redirect'))

    # non-ascii chars in the url (utf8 - default); bytes literal keeps raw bytes
    body = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
    self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/to%C2%A3'))

    # non-ascii chars in the url (latin1); bytes literal keeps the raw byte
    body = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
    self.assertEqual(get_meta_refresh(body, baseurl, 'latin1'), (3, 'http://example.com/to%C2%A3'))

    # a meta refresh inside an html comment must not trigger a redirect
    body = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
    self.assertEqual(get_meta_refresh(body, baseurl), (None, None))

    # html comments must not interfere with uncommented meta refresh header
    body = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
    self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/'))

    # float refresh intervals
    body = """<meta http-equiv="refresh" content=".1;URL=index.html" />"""
    self.assertEqual(get_meta_refresh(body, baseurl), (0.1, 'http://example.com/index.html'))

    body = """<meta http-equiv="refresh" content="3.1;URL=index.html" />"""
    self.assertEqual(get_meta_refresh(body, baseurl), (3.1, 'http://example.com/index.html'))
def test_commented_meta_refresh(self):
    """A meta refresh inside an HTML comment must not be reported."""
    base = 'http://example.com'
    markup = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
    nothing_found = (None, None)
    self.assertEqual(get_meta_refresh(markup, base), nothing_found)
def test_nonascii_url_utf8(self):
    """UTF-8 bytes in the URL (no encoding argument passed) get percent-escaped."""
    base = 'http://example.com'
    raw_page = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
    expected = (3, 'http://example.com/to%C2%A3')
    self.assertEqual(get_meta_refresh(raw_page, base), expected)
def test_relative_redirects(self):
    """A relative refresh target is expected to be resolved against the base URL."""
    base = 'http://example.com/page/this.html'
    markup = """<meta http-equiv="refresh" content="3; url=other.html">"""
    expected = (3, 'http://example.com/page/other.html')
    self.assertEqual(get_meta_refresh(markup, base), expected)