Example #1
0
 def test_inside_script(self):
     # A meta refresh that exists only as text inside a <script> element is
     # expected to be ignored by default and reported once ignore_tags is
     # passed as an empty tuple.
     base = 'http://example.org'
     page = """
         <html>
         <head><script>if(!foobar()){ $('<meta http-equiv="refresh" content="0;url=http://example.org/foobar_required" />').appendTo('body'); }</script></head>
         </html>"""
     default_result = get_meta_refresh(page, base)
     self.assertEqual(default_result, (None, None))
     unfiltered_result = get_meta_refresh(page, base, ignore_tags=())
     self.assertEqual(unfiltered_result, (0.0, "http://example.org/foobar_required"))
Example #2
0
    def test_float_refresh_intervals(self):
        # Fractional delays, with or without a leading digit, are expected to
        # come back as floats next to the URL resolved against the base.
        base = 'http://example.com'
        target = 'http://example.com/index.html'

        markup = """<meta http-equiv="refresh" content=".1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(markup, base), (0.1, target))

        markup = """<meta http-equiv="refresh" content="3.1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(markup, base), (3.1, target))
Example #3
0
 def test_redirections_in_different_ordering__in_meta_tag(self):
     # Attribute order inside the <meta> tag must not matter: http-equiv
     # before content and content before http-equiv are expected to give
     # the same result.
     base = "http://localhost:8000"
     expected = (0.0, "http://localhost:8000/dummy.html")
     http_equiv_first = '<html><head><meta http-equiv="refresh" content="0;url=dummy.html"></head></html>'
     content_first = '<html><head><meta content="0;url=dummy.html" http-equiv="refresh"></head></html>'
     self.assertEqual(get_meta_refresh(http_equiv_first, base), expected)
     self.assertEqual(get_meta_refresh(content_first, base), expected)
Example #4
0
 def test_inside_noscript(self):
     # A meta refresh nested in <noscript> is expected to be skipped by
     # default and returned when ignore_tags is an empty tuple.
     base = 'http://example.org'
     page = """
         <html>
         <head><noscript><meta http-equiv="refresh" content="0;url=http://example.org/javascript_required" /></noscript></head>
         </html>"""
     skipped = get_meta_refresh(page, base)
     self.assertEqual(skipped, (None, None))
     found = get_meta_refresh(page, base, ignore_tags=())
     self.assertEqual(found, (0.0, "http://example.org/javascript_required"))
Example #5
0
    def test_float_refresh_intervals(self):
        # Non-integer refresh delays (".1" and "3.1") are expected to be
        # returned as floats, with the relative URL joined to the base.
        site = 'http://example.com'
        resolved = 'http://example.com/index.html'

        short_delay = """<meta http-equiv="refresh" content=".1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(short_delay, site), (0.1, resolved))

        long_delay = """<meta http-equiv="refresh" content="3.1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(long_delay, site), (3.1, resolved))
Example #6
0
 def test_inside_noscript(self):
     # The only meta refresh lives inside <noscript>: the default call is
     # expected to report nothing, the call with ignore_tags=() the URL.
     origin = 'http://example.org'
     noscript_page = """
         <html>
         <head><noscript><meta http-equiv="refresh" content="0;url=http://example.org/javascript_required" /></noscript></head>
         </html>"""
     self.assertEqual(get_meta_refresh(noscript_page, origin), (None, None))
     expected_unfiltered = (0.0, "http://example.org/javascript_required")
     self.assertEqual(
         get_meta_refresh(noscript_page, origin, ignore_tags=()),
         expected_unfiltered)
Example #7
0
 def test_inside_script(self):
     # The only meta refresh is part of a JavaScript string in <script>: the
     # default call is expected to report nothing, the call with
     # ignore_tags=() the URL.
     origin = 'http://example.org'
     script_page = """
         <html>
         <head><script>if(!foobar()){ $('<meta http-equiv="refresh" content="0;url=http://example.org/foobar_required" />').appendTo('body'); }</script></head>
         </html>"""
     self.assertEqual(get_meta_refresh(script_page, origin), (None, None))
     expected_unfiltered = (0.0, "http://example.org/foobar_required")
     self.assertEqual(
         get_meta_refresh(script_page, origin, ignore_tags=()),
         expected_unfiltered)
Example #8
0
    def test_without_url(self):
        # A refresh that carries only a delay is expected to yield
        # (None, None).
        base = 'http://example.org'
        delay_only = """<meta http-equiv="refresh" content="5" />"""
        self.assertEqual(get_meta_refresh(delay_only, base), (None, None))

        # When the url part sits on the next line of the content attribute,
        # the test expects it to be picked up.
        wrapped = """<meta http-equiv="refresh" content="5;
            url=http://example.org/newpage" /></head>"""
        self.assertEqual(get_meta_refresh(wrapped, base), (5, 'http://example.org/newpage'))
Example #9
0
    def test_without_url(self):
        site = 'http://example.org'

        # Delay without any url: no redirect is expected to be reported.
        no_target = """<meta http-equiv="refresh" content="5" />"""
        outcome = get_meta_refresh(no_target, site)
        self.assertEqual(outcome, (None, None))

        # Delay and url separated by a line break inside the attribute value:
        # the redirect is expected to be reported.
        split_value = """<meta http-equiv="refresh" content="5;
            url=http://example.org/newpage" /></head>"""
        outcome = get_meta_refresh(split_value, site)
        self.assertEqual(outcome, (5, 'http://example.org/newpage'))
Example #10
0
def get_meta_refresh(response):
    """Return the http-equiv refresh data parsed from ``response``.

    The first 4096 characters of ``response.body_as_unicode()`` are passed to
    ``html.get_meta_refresh`` together with ``response.url`` and
    ``response.encoding``. The result is stored in ``_metaref_cache`` under
    the response, so later calls for the same response skip the parsing.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]
    head = response.body_as_unicode()[:4096]
    refresh = html.get_meta_refresh(head, response.url, response.encoding)
    _metaref_cache[response] = refresh
    return refresh
Example #11
0
 def test_relative_redirects(self):
     # A relative target is expected to be resolved against the page URL.
     page_url = "http://example.com/page/this.html"
     markup = """<meta http-equiv="refresh" content="3; url=other.html">"""
     resolved = get_meta_refresh(markup, page_url)
     self.assertEqual(resolved, (3, "http://example.com/page/other.html"))
Example #12
0
 def test_nonascii_url_latin1_query(self):
     # Latin-1 body with non-ASCII bytes in both path and query: the
     # expected URL has the path byte escaped as UTF-8 (%C2%A3) while the
     # query byte keeps its latin1 value (%B5).
     base = 'http://example.com'
     raw = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3?unit=\xb5">"""
     expected = (3, 'http://example.com/to%C2%A3?unit=%B5')
     self.assertEqual(get_meta_refresh(raw, base, 'latin1'), expected)
Example #13
0
 def test_nonascii_url_utf8(self):
     # UTF-8 bytes in the URL, parsed without an explicit encoding argument
     # (utf8 - default), are expected to come back percent-escaped.
     base = "http://example.com"
     raw = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
     result = get_meta_refresh(raw, base)
     self.assertEqual(result, (3, "http://example.com/to%C2%A3"))
Example #14
0
def get_meta_refresh(response):
    """Return the http-equiv refresh data for ``response``, parsed at most once.

    The first 4096 characters of ``response.text`` are passed to
    ``html.get_meta_refresh`` with ``response.url``, ``response.encoding``
    and ``ignore_tags=('script', 'noscript')``. The result is kept in
    ``_metaref_cache`` under the response and reused on later calls.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]
    head = response.text[:4096]
    refresh = html.get_meta_refresh(
        head, response.url, response.encoding,
        ignore_tags=('script', 'noscript'))
    _metaref_cache[response] = refresh
    return refresh
Example #15
0
 def test_entities_in_redirect_url(self):
     # The url value is wrapped in &#39; entities; the expected result is
     # the bare URL without them.
     base = "http://example.org"
     markup = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
     result = get_meta_refresh(markup, base)
     self.assertEqual(result, (3, "http://www.example.com/other"))
Example #16
0
 def test_nonascii_url_latin1(self):
     # A latin1-encoded byte in the URL path is expected to end up
     # percent-escaped as UTF-8 (%C2%A3).
     base = 'http://example.com'
     raw = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
     expected = (3, 'http://example.com/to%C2%A3')
     self.assertEqual(get_meta_refresh(raw, base, 'latin1'), expected)
def get_meta_refresh(response, ignore_tags=('script', 'noscript')):
    """Return the http-equiv refresh data for ``response``, parsed at most once.

    The first 4096 characters of ``response.text`` are passed to
    ``html.get_meta_refresh`` with ``response.url``, ``response.encoding``
    and the given ``ignore_tags``. The result is kept in ``_metaref_cache``
    under the response and reused on later calls.

    NOTE(review): the cache is keyed on the response alone, so a later call
    with a different ``ignore_tags`` gets the result cached by the first call.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]
    head = response.text[:4096]
    refresh = html.get_meta_refresh(
        head, response.url, response.encoding, ignore_tags=ignore_tags)
    _metaref_cache[response] = refresh
    return refresh
Example #18
0
 def test_tag_name(self):
     # The refresh attributes sit on a <metafoo> element, whose name merely
     # starts with "meta"; the test expects no refresh to be reported.
     base = 'http://example.org'
     page = """
         <html>
         <head><title>Dummy</title><metafoo http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
         <body>blahablsdfsal&amp;</body>
         </html>"""
     nothing_found = (None, None)
     self.assertEqual(get_meta_refresh(page, base), nothing_found)
Example #19
0
 def test_multiline(self):
     # An upper-case tag whose attributes are spread over several lines is
     # expected to be parsed like a single-line tag.
     base = 'http://example.org'
     markup = """<html><head>
            <META
            HTTP-EQUIV="Refresh"
            CONTENT="1; URL=http://example.org/newpage">"""
     parsed = get_meta_refresh(markup, base)
     self.assertEqual(parsed, (1, 'http://example.org/newpage'))
Example #20
0
 def test_multiline(self):
     # The <META> tag, HTTP-EQUIV and CONTENT each sit on their own line;
     # the delay and URL are still expected to be extracted.
     site = 'http://example.org'
     expected = (1, 'http://example.org/newpage')
     spread_out = """<html><head>
            <META
            HTTP-EQUIV="Refresh"
            CONTENT="1; URL=http://example.org/newpage">"""
     self.assertEqual(get_meta_refresh(spread_out, site), expected)
Example #21
0
 def test_tag_name(self):
     # <metafoo> is not <meta>: even with http-equiv/content attributes the
     # test expects (None, None).
     site = 'http://example.org'
     lookalike = """
         <html>
         <head><title>Dummy</title><metafoo http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
         <body>blahablsdfsal&amp;</body>
         </html>"""
     outcome = get_meta_refresh(lookalike, site)
     self.assertEqual(outcome, (None, None))
Example #22
0
    def test_leading_newline_in_url(self):
        baseurl = 'http://example.org'
        body = """
        <html>
        <head><title>Dummy</title><meta http-equiv="refresh" content="0; URL=
http://www.example.org/index.php" />
        </head>
        </html>"""
        self.assertEqual(get_meta_refresh(body, baseurl), (0.0, 'http://www.example.org/index.php'))
Example #23
0
def get_meta_refresh(response):
    """Return the http-equiv refresh data for ``response``, parsed at most once.

    The first 4096 characters of ``response.text`` are taken, every match of
    ``_noscript_re`` and then of ``_script_re`` is replaced with an empty
    string, and the remainder is passed to ``html.get_meta_refresh`` with
    ``response.url`` and ``response.encoding``. The result is kept in
    ``_metaref_cache`` under the response and reused on later calls.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]
    head = response.text[:4096]
    without_noscript = _noscript_re.sub(u'', head)
    cleaned = _script_re.sub(u'', without_noscript)
    refresh = html.get_meta_refresh(cleaned, response.url, response.encoding)
    _metaref_cache[response] = refresh
    return refresh
Example #24
0
 def test_get_meta_refresh(self):
     # Basic case: a meta refresh in <head> with delay 5 and an absolute
     # URL is expected to be returned as (5, url).
     base = "http://example.org"
     page = """
         <html>
         <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
         <body>blahablsdfsal&amp;</body>
         </html>"""
     found = get_meta_refresh(page, base)
     self.assertEqual(found, (5, "http://example.org/newpage"))
Example #25
0
def get_meta_refresh(
    response: "scrapy.http.response.text.TextResponse",
    ignore_tags: Optional[Iterable[str]] = ('script', 'noscript'),
) -> Union[Tuple[None, None], Tuple[float, str]]:
    """Return the http-equiv refresh data for ``response``, parsed at most once.

    The first 4096 characters of ``response.text`` are passed to
    ``html.get_meta_refresh`` with ``response.url``, ``response.encoding``
    and the given ``ignore_tags``. The result is kept in ``_metaref_cache``
    under the response and reused on later calls.

    NOTE(review): the cache is keyed on the response alone, so a later call
    with a different ``ignore_tags`` gets the result cached by the first call.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]
    head = response.text[:4096]
    refresh = html.get_meta_refresh(
        head, response.url, response.encoding, ignore_tags=ignore_tags)
    _metaref_cache[response] = refresh
    return refresh
Example #26
0
    def test_leading_newline_in_url(self):
        baseurl = 'http://example.org'
        body = """
        <html>
        <head><title>Dummy</title><meta http-equiv="refresh" content="0; URL=
http://www.example.org/index.php" />
        </head>
        </html>"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (0.0, 'http://www.example.org/index.php'))
def get_html_meta_refresh(response):
    """Return the redirect URL announced by an HTML meta refresh tag.

    ``response`` is passed to ``html_to_unicode`` and the converted text is
    handed to ``get_meta_refresh``. Only the second element of that result
    (the URL) is returned; the first element (the refresh interval) is
    discarded.

    NOTE(review): presumably ``get_meta_refresh`` yields ``(None, None)``
    when the page has no meta refresh tag, which would make this function
    return ``None`` -- confirm. Also confirm whether callers pass a response
    object or its text.
    """
    decoded = html_to_unicode(response)
    return get_meta_refresh(decoded)[1]
Example #28
0
    def get_url(self, response):
        """Finish the result stored in ``response.meta['result']`` and yield it.

        For an ``HtmlResponse`` the stored url is replaced by the URL that
        ``get_meta_refresh`` extracts from the body. The url is then checked
        with ``self.isredditspam_link``, and the data is wrapped in a
        ``SearchResultItem`` and passed through ``self.parse_result``.
        """
        item_data = response.meta['result']

        if isinstance(response, HtmlResponse):
            # ignore_tags=() is passed explicitly -- presumably so refreshes
            # inside script/noscript are followed too; confirm.
            # NOTE(review): when the page has no meta refresh the extracted
            # URL is presumably None and still overwrites the stored url --
            # confirm this is intended.
            _delay, refresh_url = get_meta_refresh(
                response.body, response.url, response.encoding, ignore_tags=())
            item_data['url'] = refresh_url

        # mark probable spam
        if self.isredditspam_link(item_data['url']):
            item_data['spam'] = 'url'

        item = SearchResultItem(item_data)
        yield self.parse_result(item)
Example #29
0
 def test_nonascii_url_latin1(self):
     # With encoding 'latin1', the non-ASCII byte in the URL is expected to
     # be percent-escaped using its latin1 byte value (%A3).
     base = 'http://example.com'
     raw = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
     expected = (3, 'http://example.com/to%A3')
     self.assertEqual(get_meta_refresh(raw, base, 'latin1'), expected)
Example #30
0
 def test_entities_in_redirect_url(self):
     # &#39; entities surround the url value; the expected URL carries
     # neither the entities nor the quotes they stand for.
     site = 'http://example.org'
     expected = (3, 'http://www.example.com/other')
     quoted = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
     self.assertEqual(get_meta_refresh(quoted, site), expected)
Example #31
0
 def test_html_comments_with_uncommented_meta_refresh(self):
     # A closed HTML comment before the tag (and a stray "-->" after it)
     # must not hide the meta refresh that sits outside the comment.
     base = 'http://example.com'
     markup = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
     found = get_meta_refresh(markup, base)
     self.assertEqual(found, (3, 'http://example.com/'))
Example #32
0
    def test_get_meta_refresh(self):
        # Walks through a series of get_meta_refresh() scenarios in one test.
        # `baseurl` and `body` are rebound as the scenarios progress, so the
        # order of the sections below matters.

        # basic case: meta refresh in <head> with a delay and an absolute url
        baseurl = 'http://example.org'
        body = """
            <html>
            <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
            <body>blahablsdfsal&amp;</body>
            </html>"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (5, 'http://example.org/newpage'))

        # refresh without url should return (None, None)
        body = """<meta http-equiv="refresh" content="5" />"""
        self.assertEqual(get_meta_refresh(body, baseurl), (None, None))

        # delay and url separated by a line break inside the content value
        body = """<meta http-equiv="refresh" content="5;
            url=http://example.org/newpage" /></head>"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (5, 'http://example.org/newpage'))

        # meta refresh in multiple lines
        body = """<html><head>
               <META
               HTTP-EQUIV="Refresh"
               CONTENT="1; URL=http://example.org/newpage">"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (1, 'http://example.org/newpage'))

        # entities in the redirect url
        body = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (3, 'http://www.example.com/other'))

        baseurl = 'http://example.com/page/this.html'
        # relative redirects
        body = """<meta http-equiv="refresh" content="3; url=other.html">"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (3, 'http://example.com/page/other.html'))

        # non-standard encodings (utf-16)
        baseurl = 'http://example.com'
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/redirect">"""
        # NOTE(review): .decode() is called on a plain string literal, which
        # only works where that literal is a byte string (Python 2); this
        # test appears to target Python 2 -- confirm before running on 3.
        body = body.decode('ascii').encode('utf-16')
        self.assertEqual(get_meta_refresh(body, baseurl, 'utf-16'),
                         (3, 'http://example.com/redirect'))

        # non-ascii chars in the url (utf8 - default)
        # NOTE(review): the \xc2\xa3 escapes are presumably meant as two raw
        # UTF-8 bytes, i.e. again a Python 2 byte-string literal.
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (3, 'http://example.com/to%C2%A3'))

        # non-ascii chars in the url (latin1)
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
        self.assertEqual(get_meta_refresh(body, baseurl, 'latin1'),
                         (3, 'http://example.com/to%C2%A3'))

        # a meta refresh inside an html comment must not be followed
        body = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(body, baseurl), (None, None))

        # html comments must not interfere with uncommented meta refresh header
        body = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (3, 'http://example.com/'))

        # float refresh intervals
        body = """<meta http-equiv="refresh" content=".1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (0.1, 'http://example.com/index.html'))

        body = """<meta http-equiv="refresh" content="3.1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (3.1, 'http://example.com/index.html'))
Example #33
0
 def test_nonascii_url_latin1(self):
     # Latin-1 input: the expected URL keeps the byte's latin1 value when
     # percent-escaping it (%A3).
     site = 'http://example.com'
     latin1_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
     outcome = get_meta_refresh(latin1_markup, site, 'latin1')
     self.assertEqual(outcome, (3, 'http://example.com/to%A3'))
Example #34
0
 def test_commented_meta_refresh(self):
     # A meta refresh that sits entirely inside an HTML comment must not be
     # reported as a redirect.
     base = 'http://example.com'
     commented_out = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
     result = get_meta_refresh(commented_out, base)
     self.assertEqual(result, (None, None))
Example #35
0
 def test_nonascii_url_utf8(self):
     # UTF-8 bytes in the URL with no encoding argument (utf8 - default):
     # the expected URL has them percent-escaped.
     site = 'http://example.com'
     utf8_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
     expected = (3, 'http://example.com/to%C2%A3')
     self.assertEqual(get_meta_refresh(utf8_markup, site), expected)
Example #36
0
 def test_nonascii_url_latin1(self):
     # Non-ASCII character in the URL path of a latin1 document: the
     # expected URL has it percent-escaped as UTF-8 regardless.
     site = 'http://example.com'
     latin1_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
     outcome = get_meta_refresh(latin1_markup, site, 'latin1')
     self.assertEqual(outcome, (3, 'http://example.com/to%C2%A3'))
Example #37
0
 def test_nonascii_url_latin1_query(self):
     # Non-ASCII characters in path and query of a latin1 document: the
     # expected path is escaped as UTF-8 (%C2%A3), while the expected query
     # keeps the latin1 byte (%B5).
     site = 'http://example.com'
     latin1_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3?unit=\xb5">"""
     outcome = get_meta_refresh(latin1_markup, site, 'latin1')
     self.assertEqual(outcome, (3, 'http://example.com/to%C2%A3?unit=%B5'))
Example #38
0
 def test_commented_meta_refresh(self):
     # The whole <meta> tag is wrapped in <!-- -->; the test expects no
     # redirect to be found.
     site = 'http://example.com'
     no_redirect = (None, None)
     hidden = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
     self.assertEqual(get_meta_refresh(hidden, site), no_redirect)
Example #39
0
 def test_html_comments_with_uncommented_meta_refresh(self):
     # The comment is already closed before the <meta> tag starts, so the
     # tag is live markup and its redirect is expected to be reported.
     site = 'http://example.com'
     expected = (3, 'http://example.com/')
     after_comment = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
     self.assertEqual(get_meta_refresh(after_comment, site), expected)
Example #40
0
    def test_get_meta_refresh(self):
        # Walks through a series of get_meta_refresh() scenarios in one test.
        # `baseurl` and `body` are rebound as the scenarios progress, so the
        # order of the sections below matters.

        # basic case: meta refresh in <head> with a delay and an absolute url
        baseurl = 'http://example.org'
        body = """
            <html>
            <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
            <body>blahablsdfsal&amp;</body>
            </html>"""
        self.assertEqual(get_meta_refresh(body, baseurl), (5, 'http://example.org/newpage'))

        # refresh without url should return (None, None)
        body = """<meta http-equiv="refresh" content="5" />"""
        self.assertEqual(get_meta_refresh(body, baseurl), (None, None))

        # delay and url separated by a line break inside the content value
        body = """<meta http-equiv="refresh" content="5;
            url=http://example.org/newpage" /></head>"""
        self.assertEqual(get_meta_refresh(body, baseurl), (5, 'http://example.org/newpage'))

        # meta refresh in multiple lines
        body = """<html><head>
               <META
               HTTP-EQUIV="Refresh"
               CONTENT="1; URL=http://example.org/newpage">"""
        self.assertEqual(get_meta_refresh(body, baseurl), (1, 'http://example.org/newpage'))

        # entities in the redirect url
        body = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
        self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://www.example.com/other'))

        baseurl = 'http://example.com/page/this.html'
        # relative redirects
        body = """<meta http-equiv="refresh" content="3; url=other.html">"""
        self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/page/other.html'))

        # non-standard encodings (utf-16)
        baseurl = 'http://example.com'
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/redirect">"""
        # NOTE(review): .decode() is called on a plain string literal, which
        # only works where that literal is a byte string (Python 2); this
        # test appears to target Python 2 -- confirm before running on 3.
        body = body.decode('ascii').encode('utf-16')
        self.assertEqual(get_meta_refresh(body, baseurl, 'utf-16'), (3, 'http://example.com/redirect'))

        # non-ascii chars in the url (utf8 - default)
        # NOTE(review): the \xc2\xa3 escapes are presumably meant as two raw
        # UTF-8 bytes, i.e. again a Python 2 byte-string literal.
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
        self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/to%C2%A3'))

        # non-ascii chars in the url (latin1)
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
        self.assertEqual(get_meta_refresh(body, baseurl, 'latin1'), (3, 'http://example.com/to%C2%A3'))

        # a meta refresh inside an html comment must not be followed
        body = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(body, baseurl), (None, None))

        # html comments must not interfere with uncommented meta refresh header
        body = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/'))

        # float refresh intervals
        body = """<meta http-equiv="refresh" content=".1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(body, baseurl), (0.1, 'http://example.com/index.html'))

        body = """<meta http-equiv="refresh" content="3.1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(body, baseurl), (3.1, 'http://example.com/index.html'))
Example #41
0
 def test_relative_redirects(self):
     # "other.html" is relative; the expected URL places it next to
     # this.html under the page's directory.
     page_url = 'http://example.com/page/this.html'
     expected = (3, 'http://example.com/page/other.html')
     relative = """<meta http-equiv="refresh" content="3; url=other.html">"""
     self.assertEqual(get_meta_refresh(relative, page_url), expected)