def process_response(self, response):
    """Follow an HTML meta refresh found in ``response``, if appropriate.

    The response is returned untouched when the originating request used
    the HEAD method, when the response is not an ``HtmlResponse``, when
    ``get_meta_refresh`` reports no target url, or when the refresh
    interval is not below ``self.max_delay``.

    Otherwise a new request for the target is built with
    ``self._redirect_request_using_get`` and the result of
    ``self._redirect(..., 'meta refresh')`` is returned.
    """
    origin = response.request

    # Guard: only non-HEAD requests that produced HTML are inspected.
    if origin.method == 'HEAD' or not isinstance(response, HtmlResponse):
        return response

    delay, target = get_meta_refresh(response)

    # Guard: no target, or the page asks to wait too long before refreshing.
    # The comparison is only evaluated once a target is known to exist.
    if not target or not (delay < self.max_delay):
        return response

    follow_up = self._redirect_request_using_get(origin, target)
    return self._redirect(follow_up, origin, 'meta refresh')
def test_get_meta_refresh(self):
    # Exercises get_meta_refresh() on HtmlResponse objects built from a
    # variety of <meta http-equiv="refresh"> snippets and checks the
    # returned (interval, url) pair for each of them.

    def refresh_of(page_url, markup, **response_kwargs):
        # Wrap the markup in an HtmlResponse and run the extraction on it.
        page = HtmlResponse(page_url, body=markup, **response_kwargs)
        return get_meta_refresh(page)

    base = 'http://example.org'

    # tag inside the <head> of a complete document
    markup = ''' <html> <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head> <body>blahablsdfsal&</body> </html>'''
    self.assertEqual(refresh_of(base, markup),
                     (5, 'http://example.org/newpage'))

    # an interval with no url yields (None, None)
    markup = '''<meta http-equiv="refresh" content="5" />'''
    self.assertEqual(refresh_of(base, markup), (None, None))

    # whitespace between the interval and the url part
    markup = '''<meta http-equiv="refresh" content="5; url=http://example.org/newpage" /></head>'''
    self.assertEqual(refresh_of(base, markup),
                     (5, 'http://example.org/newpage'))

    # upper-case tag and attribute names
    markup = '''<html><head> <META HTTP-EQUIV="Refresh" CONTENT="1; URL=http://example.org/newpage">'''
    self.assertEqual(refresh_of(base, markup),
                     (1, 'http://example.org/newpage'))

    # url value wrapped in single quotes: the quotes are not part of the url
    markup = '''<meta http-equiv="refresh" content="3; url='http://www.example.com/other'">'''
    self.assertEqual(refresh_of(base, markup),
                     (3, 'http://www.example.com/other'))

    # a relative target is resolved against the response url
    base = 'http://example.com/page/this.html'
    markup = '''<meta http-equiv="refresh" content="3; url=other.html">'''
    self.assertEqual(refresh_of(base, markup),
                     (3, 'http://example.com/page/other.html'))

    # body encoded as utf-16 and declared as such on the response
    base = 'http://example.com'
    markup = '''<meta http-equiv="refresh" content="3; url=http://example.com/redirect">'''
    markup = markup.decode('ascii').encode('utf-16')
    self.assertEqual(
        refresh_of(base, markup, encoding='utf-16'),
        (3, 'http://example.com/%FF%FEh%00t%00t%00p%00:%00/%00/%00e%00x%00a%00m%00p%00l%00e%00.%00c%00o%00m%00/%00r%00e%00d%00i%00r%00e%00c%00t%00'))

    # non-ascii bytes in the url, default encoding (utf8)
    markup = '''<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">'''
    self.assertEqual(refresh_of(base, markup),
                     (3, 'http://example.com/to%C2%A3'))

    # non-ascii byte in the url, response declared as latin1
    markup = '''<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">'''
    self.assertEqual(refresh_of(base, markup, encoding='latin1'),
                     (3, 'http://example.com/to%A3'))

    # a meta refresh inside an html comment must be ignored
    markup = '''<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->'''
    self.assertEqual(refresh_of(base, markup), (None, None))

    # a preceding html comment must not hide a real meta refresh
    markup = '''<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->'''
    self.assertEqual(refresh_of(base, markup),
                     (3, 'http://example.com/'))

    # fractional intervals, with and without a leading digit
    markup = '''<meta http-equiv="refresh" content=".1;URL=index.html" />'''
    self.assertEqual(refresh_of(base, markup),
                     (0.1, 'http://example.com/index.html'))
    markup = '''<meta http-equiv="refresh" content="3.1;URL=index.html" />'''
    self.assertEqual(refresh_of(base, markup),
                     (3.1, 'http://example.com/index.html'))
def test_get_meta_refresh(self):
    # Table-driven check of get_meta_refresh(): each case pairs a piece of
    # markup with the (interval, url) tuple expected for it. Cases run in a
    # fixed order and the first mismatch fails the test.

    def check(page_url, markup, expected, **response_kwargs):
        # Build the response, extract the refresh data and compare.
        response = HtmlResponse(page_url, body=markup, **response_kwargs)
        self.assertEqual(get_meta_refresh(response), expected)

    org = 'http://example.org'
    org_cases = [
        # tag inside the <head> of a complete document
        (''' <html> <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head> <body>blahablsdfsal&</body> </html>''',
         (5, 'http://example.org/newpage')),
        # an interval with no url yields (None, None)
        ('''<meta http-equiv="refresh" content="5" />''',
         (None, None)),
        # whitespace between the interval and the url part
        ('''<meta http-equiv="refresh" content="5; url=http://example.org/newpage" /></head>''',
         (5, 'http://example.org/newpage')),
        # upper-case tag and attribute names
        ('''<html><head> <META HTTP-EQUIV="Refresh" CONTENT="1; URL=http://example.org/newpage">''',
         (1, 'http://example.org/newpage')),
        # url value wrapped in single quotes
        ('''<meta http-equiv="refresh" content="3; url='http://www.example.com/other'">''',
         (3, 'http://www.example.com/other')),
    ]
    for markup, expected in org_cases:
        check(org, markup, expected)

    # a relative target is resolved against the response url
    check('http://example.com/page/this.html',
          '''<meta http-equiv="refresh" content="3; url=other.html">''',
          (3, 'http://example.com/page/other.html'))

    com = 'http://example.com'

    # body encoded as utf-16 and declared as such on the response; the
    # re-encoding is done here (not in a table) to keep it in sequence
    utf16_markup = '''<meta http-equiv="refresh" content="3; url=http://example.com/redirect">'''
    utf16_markup = utf16_markup.decode('ascii').encode('utf-16')
    check(com, utf16_markup,
          (3, 'http://example.com/%FF%FEh%00t%00t%00p%00:%00/%00/%00e%00x%00a%00m%00p%00l%00e%00.%00c%00o%00m%00/%00r%00e%00d%00i%00r%00e%00c%00t%00'),
          encoding='utf-16')

    com_cases = [
        # non-ascii bytes in the url, default encoding (utf8)
        ('''<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">''',
         {}, (3, 'http://example.com/to%C2%A3')),
        # non-ascii byte in the url, response declared as latin1
        ('''<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">''',
         {'encoding': 'latin1'}, (3, 'http://example.com/to%A3')),
        # a meta refresh inside an html comment must be ignored
        ('''<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->''',
         {}, (None, None)),
        # a preceding html comment must not hide a real meta refresh
        ('''<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->''',
         {}, (3, 'http://example.com/')),
        # fractional intervals, with and without a leading digit
        ('''<meta http-equiv="refresh" content=".1;URL=index.html" />''',
         {}, (0.1, 'http://example.com/index.html')),
        ('''<meta http-equiv="refresh" content="3.1;URL=index.html" />''',
         {}, (3.1, 'http://example.com/index.html')),
    ]
    for markup, response_kwargs, expected in com_cases:
        check(com, markup, expected, **response_kwargs)