コード例 #1
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
 def test_inside_script(self):
     """A refresh tag that only occurs inside <script> is skipped unless ignore_tags is empty."""
     origin = 'http://example.org'
     page = """
         <html>
         <head><script>if(!foobar()){ $('<meta http-equiv="refresh" content="0;url=http://example.org/foobar_required" />').appendTo('body'); }</script></head>
         </html>"""

     # Default call: nothing is reported for the tag embedded in the script.
     default_result = get_meta_refresh(page, origin)
     self.assertEqual(default_result, (None, None))

     # With an empty ignore_tags the very same tag is reported.
     unfiltered_result = get_meta_refresh(page, origin, ignore_tags=())
     self.assertEqual(unfiltered_result, (0.0, "http://example.org/foobar_required"))
コード例 #2
0
    def test_float_refresh_intervals(self):
        """Fractional refresh delays (".1" and "3.1") come back as floats."""
        origin = 'http://example.com'
        target = 'http://example.com/index.html'

        short_delay = """<meta http-equiv="refresh" content=".1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(short_delay, origin), (0.1, target))

        long_delay = """<meta http-equiv="refresh" content="3.1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(long_delay, origin), (3.1, target))
コード例 #3
0
ファイル: test_html.py プロジェクト: scrapy/w3lib
 def test_redirections_in_different_ordering__in_meta_tag(self):
     """The attribute order inside the meta tag must not change the result."""
     origin = "http://localhost:8000"
     markups = (
         # http-equiv first, content second
         '<html><head><meta http-equiv="refresh" content="0;url=dummy.html"></head></html>',
         # content first, http-equiv second
         '<html><head><meta content="0;url=dummy.html" http-equiv="refresh"></head></html>',
     )
     for markup in markups:
         found = get_meta_refresh(markup, origin)
         self.assertEqual(found, (0.0, "http://localhost:8000/dummy.html"))
コード例 #4
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
 def test_inside_noscript(self):
     """A refresh tag wrapped in <noscript> is skipped unless ignore_tags is empty."""
     origin = 'http://example.org'
     page = """
         <html>
         <head><noscript><meta http-equiv="refresh" content="0;url=http://example.org/javascript_required" /></noscript></head>
         </html>"""

     # Default call: the <noscript> content yields no redirect.
     filtered = get_meta_refresh(page, origin)
     self.assertEqual(filtered, (None, None))

     # Empty ignore_tags: the redirect inside <noscript> is returned.
     unfiltered = get_meta_refresh(page, origin, ignore_tags=())
     self.assertEqual(unfiltered, (0.0, "http://example.org/javascript_required"))
コード例 #5
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
    def test_float_refresh_intervals(self):
        """Check that non-integer refresh intervals are parsed as floats."""
        origin = 'http://example.com'
        # (markup, expected interval) pairs; both redirect to index.html
        cases = (
            ("""<meta http-equiv="refresh" content=".1;URL=index.html" />""", 0.1),
            ("""<meta http-equiv="refresh" content="3.1;URL=index.html" />""", 3.1),
        )
        for markup, interval in cases:
            outcome = get_meta_refresh(markup, origin)
            self.assertEqual(outcome, (interval, 'http://example.com/index.html'))
コード例 #6
0
 def test_inside_noscript(self):
     """Refresh tags inside <noscript> are only reported when ignore_tags is empty."""
     page = """
         <html>
         <head><noscript><meta http-equiv="refresh" content="0;url=http://example.org/javascript_required" /></noscript></head>
         </html>"""
     origin = 'http://example.org'
     expected_when_unfiltered = (0.0, "http://example.org/javascript_required")

     self.assertEqual(get_meta_refresh(page, origin), (None, None))
     self.assertEqual(get_meta_refresh(page, origin, ignore_tags=()), expected_when_unfiltered)
コード例 #7
0
 def test_inside_script(self):
     """Refresh tags that appear inside <script> are only reported when ignore_tags is empty."""
     page = """
         <html>
         <head><script>if(!foobar()){ $('<meta http-equiv="refresh" content="0;url=http://example.org/foobar_required" />').appendTo('body'); }</script></head>
         </html>"""
     origin = 'http://example.org'
     expected_when_unfiltered = (0.0, "http://example.org/foobar_required")

     self.assertEqual(get_meta_refresh(page, origin), (None, None))
     self.assertEqual(get_meta_refresh(page, origin, ignore_tags=()), expected_when_unfiltered)
コード例 #8
0
    def test_without_url(self):
        """A refresh with no url gives (None, None); a url on a continuation line is still found."""
        origin = 'http://example.org'

        # Interval only, no url part.
        delay_only = """<meta http-equiv="refresh" content="5" />"""
        self.assertEqual(get_meta_refresh(delay_only, origin), (None, None))

        # The url follows the interval after a line break inside the attribute.
        url_on_next_line = """<meta http-equiv="refresh" content="5;
            url=http://example.org/newpage" /></head>"""
        redirect = get_meta_refresh(url_on_next_line, origin)
        self.assertEqual(redirect, (5, 'http://example.org/newpage'))
コード例 #9
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
    def test_without_url(self):
        """Check the no-url case, then a url that starts on the line after the interval."""
        origin = 'http://example.org'
        no_redirect = (None, None)

        markup = """<meta http-equiv="refresh" content="5" />"""
        outcome = get_meta_refresh(markup, origin)
        self.assertEqual(outcome, no_redirect)

        markup = """<meta http-equiv="refresh" content="5;
            url=http://example.org/newpage" /></head>"""
        outcome = get_meta_refresh(markup, origin)
        self.assertEqual(outcome, (5, 'http://example.org/newpage'))
コード例 #10
0
def get_meta_refresh(response):
    """Parse the http-equiv refresh parameter from the given response.

    Only the first 4096 characters of ``response.body_as_unicode()`` are
    handed to ``html.get_meta_refresh``; the outcome is stored in
    ``_metaref_cache`` under the response so later calls reuse it.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]

    head = response.body_as_unicode()[0:4096]
    _metaref_cache[response] = html.get_meta_refresh(
        head, response.url, response.encoding)
    return _metaref_cache[response]
コード例 #11
0
ファイル: test_html.py プロジェクト: rrosajp/w3lib
 def test_relative_redirects(self):
     """A relative refresh url is resolved against the base url."""
     origin = "http://example.com/page/this.html"
     markup = """<meta http-equiv="refresh" content="3; url=other.html">"""
     expected = (3, "http://example.com/page/other.html")

     self.assertEqual(get_meta_refresh(markup, origin), expected)
コード例 #12
0
 def test_nonascii_url_latin1_query(self):
     """Latin-1 body with non-ASCII bytes in both the url path and the query.

     The expectation encodes the path byte as UTF-8 (%C2%A3) while the
     query byte stays latin1-encoded (%B5) before percent escaping.
     """
     origin = 'http://example.com'
     raw_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3?unit=\xb5">"""
     expected = (3, 'http://example.com/to%C2%A3?unit=%B5')

     outcome = get_meta_refresh(raw_markup, origin, 'latin1')
     self.assertEqual(outcome, expected)
コード例 #13
0
ファイル: test_html.py プロジェクト: rrosajp/w3lib
 def test_nonascii_url_utf8(self):
     """UTF-8 bytes in the url (no encoding argument given) are percent-escaped."""
     origin = "http://example.com"
     raw_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
     expected = (3, "http://example.com/to%C2%A3")

     outcome = get_meta_refresh(raw_markup, origin)
     self.assertEqual(outcome, expected)
コード例 #14
0
ファイル: response.py プロジェクト: ArturGaspar/scrapy
def get_meta_refresh(response):
    """Parse the http-equiv refresh parameter from the given response.

    The first 4096 characters of ``response.text`` are passed to
    ``html.get_meta_refresh`` with ``ignore_tags=('script', 'noscript')``.
    The outcome is kept in ``_metaref_cache`` under the response, so the
    parsing runs once per cached response.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]

    head = response.text[0:4096]
    _metaref_cache[response] = html.get_meta_refresh(
        head,
        response.url,
        response.encoding,
        ignore_tags=('script', 'noscript'),
    )
    return _metaref_cache[response]
コード例 #15
0
ファイル: test_html.py プロジェクト: rrosajp/w3lib
 def test_entities_in_redirect_url(self):
     """A url wrapped in &#39; entities is returned without them."""
     origin = "http://example.org"
     markup = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
     expected = (3, "http://www.example.com/other")

     self.assertEqual(get_meta_refresh(markup, origin), expected)
コード例 #16
0
 def test_nonascii_url_latin1(self):
     """A latin1 byte in the url path is expected to be escaped as UTF-8 (%C2%A3)."""
     origin = 'http://example.com'
     raw_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""

     outcome = get_meta_refresh(raw_markup, origin, 'latin1')
     self.assertEqual(outcome, (3, 'http://example.com/to%C2%A3'))
コード例 #17
0
def get_meta_refresh(response, ignore_tags=('script', 'noscript')):
    """Parse the http-equiv refresh parameter from the given response.

    The first 4096 characters of ``response.text`` are passed to
    ``html.get_meta_refresh`` together with ``ignore_tags``; the outcome is
    stored in ``_metaref_cache`` under the response.

    NOTE(review): the cache lookup uses the response alone, so a second
    call with a different ``ignore_tags`` returns the first call's result --
    confirm this is intended.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]

    head = response.text[0:4096]
    _metaref_cache[response] = html.get_meta_refresh(
        head, response.url, response.encoding, ignore_tags=ignore_tags,
    )
    return _metaref_cache[response]
コード例 #18
0
 def test_tag_name(self):
     """A <metafoo> element carrying refresh attributes must not be treated as <meta>."""
     origin = 'http://example.org'
     page = """
         <html>
         <head><title>Dummy</title><metafoo http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
         <body>blahablsdfsal&amp;</body>
         </html>"""

     outcome = get_meta_refresh(page, origin)
     self.assertEqual(outcome, (None, None))
コード例 #19
0
 def test_multiline(self):
     """An upper-case META tag spread over several lines is still parsed."""
     origin = 'http://example.org'
     page = """<html><head>
            <META
            HTTP-EQUIV="Refresh"
            CONTENT="1; URL=http://example.org/newpage">"""
     expected = (1, 'http://example.org/newpage')

     self.assertEqual(get_meta_refresh(page, origin), expected)
コード例 #20
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
 def test_multiline(self):
     """Check a meta refresh whose attributes each sit on their own line."""
     page = """<html><head>
            <META
            HTTP-EQUIV="Refresh"
            CONTENT="1; URL=http://example.org/newpage">"""
     origin = 'http://example.org'

     outcome = get_meta_refresh(page, origin)
     self.assertEqual(outcome, (1, 'http://example.org/newpage'))
コード例 #21
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
 def test_tag_name(self):
     """Only a real <meta> tag counts: <metafoo> with the same attributes yields nothing."""
     page = """
         <html>
         <head><title>Dummy</title><metafoo http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
         <body>blahablsdfsal&amp;</body>
         </html>"""
     origin = 'http://example.org'
     no_redirect = (None, None)

     self.assertEqual(get_meta_refresh(page, origin), no_redirect)
コード例 #22
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
    def test_leading_newline_in_url(self):
        baseurl = 'http://example.org'
        body = """
        <html>
        <head><title>Dummy</title><meta http-equiv="refresh" content="0; URL=
http://www.example.org/index.php" />
        </head>
        </html>"""
        self.assertEqual(get_meta_refresh(body, baseurl), (0.0, 'http://www.example.org/index.php'))
コード例 #23
0
def get_meta_refresh(response):
    """Parse the http-equiv refresh parameter from the given response.

    The first 4096 characters of ``response.text`` have every match of
    ``_noscript_re`` and then of ``_script_re`` removed before being passed
    to ``html.get_meta_refresh``. The outcome is stored in
    ``_metaref_cache`` under the response.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]

    head = response.text[0:4096]
    # Order matters: strip noscript matches first, then script matches.
    without_noscript = _noscript_re.sub(u'', head)
    cleaned = _script_re.sub(u'', without_noscript)
    _metaref_cache[response] = html.get_meta_refresh(
        cleaned, response.url, response.encoding)
    return _metaref_cache[response]
コード例 #24
0
ファイル: test_html.py プロジェクト: scrapy/w3lib
 def test_get_meta_refresh(self):
     """Basic case: one meta refresh in the head of a small document."""
     origin = "http://example.org"
     page = """
         <html>
         <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
         <body>blahablsdfsal&amp;</body>
         </html>"""
     expected = (5, "http://example.org/newpage")

     outcome = get_meta_refresh(page, origin)
     self.assertEqual(outcome, expected)
コード例 #25
0
ファイル: response.py プロジェクト: Ahmadch101/webcrawler
def get_meta_refresh(
    response: "scrapy.http.response.text.TextResponse",
    ignore_tags: Optional[Iterable[str]] = ('script', 'noscript'),
) -> Union[Tuple[None, None], Tuple[float, str]]:
    """Parse the http-equiv refresh parameter from the given response.

    The first 4096 characters of ``response.text`` are passed to
    ``html.get_meta_refresh`` together with ``ignore_tags``; the outcome is
    stored in ``_metaref_cache`` under the response.

    NOTE(review): the cache lookup uses the response alone, so a later call
    with a different ``ignore_tags`` gets the earlier result -- confirm
    this is intended.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]

    head = response.text[0:4096]
    _metaref_cache[response] = html.get_meta_refresh(
        head, response.url, response.encoding, ignore_tags=ignore_tags,
    )
    return _metaref_cache[response]
コード例 #26
0
    def test_leading_newline_in_url(self):
        baseurl = 'http://example.org'
        body = """
        <html>
        <head><title>Dummy</title><meta http-equiv="refresh" content="0; URL=
http://www.example.org/index.php" />
        </head>
        </html>"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (0.0, 'http://www.example.org/index.php'))
コード例 #27
0
def get_html_meta_refresh(response):
    """
    Return the redirect url found in the page's meta refresh tag.

    ``response`` is converted with ``html_to_unicode`` and the result is
    handed to ``get_meta_refresh``. Per the original notes, that call gives
    an ``(interval, url)`` pair, where ``interval`` is the integer redirect
    delay (0 when not specified), or ``(None, None)`` when the tag is absent.

    Only element 1 of that pair (the url) is returned here, not the pair.
    """
    converted = html_to_unicode(response)
    return get_meta_refresh(converted)[1]
コード例 #28
0
    def get_url(self, response):
        """Yield the parsed search result stored in ``response.meta['result']``.

        For an ``HtmlResponse`` the result's ``'url'`` is first overwritten
        with the url reported by ``get_meta_refresh`` (called with
        ``ignore_tags=()``). If ``isredditspam_link`` flags the url, the
        result's ``'spam'`` entry is set to ``'url'``.
        """
        entry = response.meta['result']

        if isinstance(response, HtmlResponse):
            _delay, target = get_meta_refresh(
                response.body, response.url, response.encoding, ignore_tags=())
            entry['url'] = target

        # mark probable spam
        flagged = self.isredditspam_link(entry['url'])
        if flagged:
            entry['spam'] = 'url'

        item = SearchResultItem(entry)
        yield self.parse_result(item)
コード例 #29
0
ファイル: test_html.py プロジェクト: christwell/w3lib
 def test_nonascii_url_latin1(self):
     """A latin1 byte in the url is expected to be percent-escaped as %A3 here."""
     origin = 'http://example.com'
     raw_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
     expected = (3, 'http://example.com/to%A3')

     self.assertEqual(get_meta_refresh(raw_markup, origin, 'latin1'), expected)
コード例 #30
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
 def test_entities_in_redirect_url(self):
     """Quote entities (&#39;) around the redirect url are not part of the result."""
     markup = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
     origin = 'http://example.org'

     outcome = get_meta_refresh(markup, origin)
     self.assertEqual(outcome, (3, 'http://www.example.com/other'))
コード例 #31
0
 def test_html_comments_with_uncommented_meta_refresh(self):
     """An HTML comment before a live meta refresh must not hide the refresh."""
     origin = 'http://example.com'
     markup = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
     expected = (3, 'http://example.com/')

     outcome = get_meta_refresh(markup, origin)
     self.assertEqual(outcome, expected)
コード例 #32
0
ファイル: test_html.py プロジェクト: nasirsphi/w3lib
    def test_get_meta_refresh(self):
        """Exercise get_meta_refresh over a series of meta-refresh variants.

        Each section rebinds ``body`` (and in places ``baseurl``) and then
        asserts the ``(interval, url)`` pair returned, or ``(None, None)``
        where no usable refresh is expected.
        """
        # basic case: one meta refresh in the head of a small document
        baseurl = 'http://example.org'
        body = """
            <html>
            <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
            <body>blahablsdfsal&amp;</body>
            </html>"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (5, 'http://example.org/newpage'))

        # refresh without url should return (None, None)
        body = """<meta http-equiv="refresh" content="5" />"""
        self.assertEqual(get_meta_refresh(body, baseurl), (None, None))

        # the url starts on the line after the interval
        body = """<meta http-equiv="refresh" content="5;
            url=http://example.org/newpage" /></head>"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (5, 'http://example.org/newpage'))

        # meta refresh in multiple lines
        body = """<html><head>
               <META
               HTTP-EQUIV="Refresh"
               CONTENT="1; URL=http://example.org/newpage">"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (1, 'http://example.org/newpage'))

        # entities in the redirect url
        body = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (3, 'http://www.example.com/other'))

        baseurl = 'http://example.com/page/this.html'
        # relative redirects
        body = """<meta http-equiv="refresh" content="3; url=other.html">"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (3, 'http://example.com/page/other.html'))

        # non-standard encodings (utf-16)
        baseurl = 'http://example.com'
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/redirect">"""
        # NOTE(review): .decode() on a plain string literal presumably relies
        # on Python 2 byte strings -- confirm the targeted Python version.
        body = body.decode('ascii').encode('utf-16')
        self.assertEqual(get_meta_refresh(body, baseurl, 'utf-16'),
                         (3, 'http://example.com/redirect'))

        # non-ascii chars in the url (utf8 - default)
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (3, 'http://example.com/to%C2%A3'))

        # non-ascii chars in the url (latin1)
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
        self.assertEqual(get_meta_refresh(body, baseurl, 'latin1'),
                         (3, 'http://example.com/to%C2%A3'))

        # a meta refresh inside an html comment must not be reported
        body = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(body, baseurl), (None, None))

        # html comments must not interfere with uncommented meta refresh header
        body = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (3, 'http://example.com/'))

        # float refresh intervals
        body = """<meta http-equiv="refresh" content=".1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (0.1, 'http://example.com/index.html'))

        body = """<meta http-equiv="refresh" content="3.1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(body, baseurl),
                         (3.1, 'http://example.com/index.html'))
コード例 #33
0
ファイル: test_html.py プロジェクト: azizur77/w3lib
 def test_nonascii_url_latin1(self):
     """Check the %A3 escaping expected for a latin1 byte in the redirect url."""
     raw_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
     origin = 'http://example.com'

     outcome = get_meta_refresh(raw_markup, origin, 'latin1')
     self.assertEqual(outcome, (3, 'http://example.com/to%A3'))
コード例 #34
0
 def test_commented_meta_refresh(self):
     """A meta refresh that sits inside an HTML comment must not be reported."""
     origin = 'http://example.com'
     markup = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""

     outcome = get_meta_refresh(markup, origin)
     self.assertEqual(outcome, (None, None))
コード例 #35
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
 def test_nonascii_url_utf8(self):
     """Check percent-escaping of UTF-8 bytes in the url when no encoding is passed."""
     raw_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
     origin = 'http://example.com'
     expected = (3, 'http://example.com/to%C2%A3')

     self.assertEqual(get_meta_refresh(raw_markup, origin), expected)
コード例 #36
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
 def test_nonascii_url_latin1(self):
     """Latin-1 path byte: the expected url carries its UTF-8 escaping (%C2%A3)."""
     raw_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
     origin = 'http://example.com'
     expected = (3, 'http://example.com/to%C2%A3')

     self.assertEqual(get_meta_refresh(raw_markup, origin, 'latin1'), expected)
コード例 #37
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
 def test_nonascii_url_latin1_query(self):
     """Non-ASCII latin1 bytes in path and query: path as UTF-8, query kept latin1.

     Only the query part is expected to stay latin1-encoded before percent
     escaping (%B5); the path byte is expected as %C2%A3.
     """
     raw_markup = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3?unit=\xb5">"""
     origin = 'http://example.com'

     outcome = get_meta_refresh(raw_markup, origin, 'latin1')
     self.assertEqual(outcome, (3, 'http://example.com/to%C2%A3?unit=%B5'))
コード例 #38
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
 def test_commented_meta_refresh(self):
     """Check that a commented-out meta refresh gives (None, None)."""
     markup = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
     origin = 'http://example.com'
     no_redirect = (None, None)

     self.assertEqual(get_meta_refresh(markup, origin), no_redirect)
コード例 #39
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
 def test_html_comments_with_uncommented_meta_refresh(self):
     """Check that a live meta refresh after a closed HTML comment is still found."""
     markup = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
     origin = 'http://example.com'

     outcome = get_meta_refresh(markup, origin)
     self.assertEqual(outcome, (3, 'http://example.com/'))
コード例 #40
0
ファイル: test_html.py プロジェクト: Dior222/w3lib
    def test_get_meta_refresh(self):
        """Run get_meta_refresh through a sequence of meta-refresh variants.

        ``body`` (and in places ``baseurl``) is rebound before each
        assertion; every assertion pins the ``(interval, url)`` pair
        returned, or ``(None, None)`` where no usable refresh is expected.
        """
        # basic case: one meta refresh in the head of a small document
        baseurl = 'http://example.org'
        body = """
            <html>
            <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
            <body>blahablsdfsal&amp;</body>
            </html>"""
        self.assertEqual(get_meta_refresh(body, baseurl), (5, 'http://example.org/newpage'))

        # refresh without url should return (None, None)
        body = """<meta http-equiv="refresh" content="5" />"""
        self.assertEqual(get_meta_refresh(body, baseurl), (None, None))

        # the url starts on the line after the interval
        body = """<meta http-equiv="refresh" content="5;
            url=http://example.org/newpage" /></head>"""
        self.assertEqual(get_meta_refresh(body, baseurl), (5, 'http://example.org/newpage'))

        # meta refresh in multiple lines
        body = """<html><head>
               <META
               HTTP-EQUIV="Refresh"
               CONTENT="1; URL=http://example.org/newpage">"""
        self.assertEqual(get_meta_refresh(body, baseurl), (1, 'http://example.org/newpage'))

        # entities in the redirect url
        body = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">"""
        self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://www.example.com/other'))

        baseurl = 'http://example.com/page/this.html'
        # relative redirects
        body = """<meta http-equiv="refresh" content="3; url=other.html">"""
        self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/page/other.html'))

        # non-standard encodings (utf-16)
        baseurl = 'http://example.com'
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/redirect">"""
        # NOTE(review): .decode() on a plain string literal presumably relies
        # on Python 2 byte strings -- confirm the targeted Python version.
        body = body.decode('ascii').encode('utf-16')
        self.assertEqual(get_meta_refresh(body, baseurl, 'utf-16'), (3, 'http://example.com/redirect'))

        # non-ascii chars in the url (utf8 - default)
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">"""
        self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/to%C2%A3'))

        # non-ascii chars in the url (latin1)
        body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
        self.assertEqual(get_meta_refresh(body, baseurl, 'latin1'), (3, 'http://example.com/to%C2%A3'))

        # a meta refresh inside an html comment must not be reported
        body = """<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(body, baseurl), (None, None))

        # html comments must not interfere with uncommented meta refresh header
        body = """<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->"""
        self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/'))

        # float refresh intervals
        body = """<meta http-equiv="refresh" content=".1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(body, baseurl), (0.1, 'http://example.com/index.html'))

        body = """<meta http-equiv="refresh" content="3.1;URL=index.html" />"""
        self.assertEqual(get_meta_refresh(body, baseurl), (3.1, 'http://example.com/index.html'))
コード例 #41
0
ファイル: test_html.py プロジェクト: Preetwinder/w3lib
 def test_relative_redirects(self):
     """Check that a relative redirect target is joined onto the base url."""
     markup = """<meta http-equiv="refresh" content="3; url=other.html">"""
     origin = 'http://example.com/page/this.html'

     outcome = get_meta_refresh(markup, origin)
     self.assertEqual(outcome, (3, 'http://example.com/page/other.html'))