Esempio n. 1
0
    def test_find_refresh_url(self):
        """Check what `find_refresh_url` extracts from meta refresh tags."""
        # Only a delay, no URL part: an empty string is expected.
        delay_only = """
            <meta http-equiv="refresh" content="5">
        """
        self.assertEqual('', find_refresh_url(delay_only))

        # Upper-case `URL=` key with a single-quoted target.
        quoted_target = """
            <meta http-equiv="refresh" content="5;URL='http://example.com/'">
        """
        self.assertEqual('http://example.com/',
                         find_refresh_url(quoted_target))

        # Same as above, but with a zero delay.
        zero_delay = """
            <meta http-equiv="refresh" content="0;URL='http://example.com/'">
        """
        self.assertEqual('http://example.com/',
                         find_refresh_url(zero_delay))

        # Lower-case `url=` key, space after the semicolon, unquoted target.
        unquoted_target = """
            <meta http-equiv="refresh" content="5; url=http://example.com/">
        """
        self.assertEqual('http://example.com/',
                         find_refresh_url(unquoted_target))
Esempio n. 2
0
    def test_find_refresh_url(self):
        """Verify the URL that `find_refresh_url` pulls out of refresh tags."""
        expected = 'http://example.com/'

        # A tag that carries just a delay yields an empty string.
        self.assertEqual('', find_refresh_url("""
            <meta http-equiv="refresh" content="5">
        """))

        # Quoted target introduced by an upper-case `URL=` key.
        self.assertEqual(expected, find_refresh_url("""
            <meta http-equiv="refresh" content="5;URL='http://example.com/'">
        """))

        # Zero delay with the same quoted target.
        self.assertEqual(expected, find_refresh_url("""
            <meta http-equiv="refresh" content="0;URL='http://example.com/'">
        """))

        # Unquoted target, lower-case `url=` key, space after the semicolon.
        self.assertEqual(expected, find_refresh_url("""
            <meta http-equiv="refresh" content="5; url=http://example.com/">
        """))
Esempio n. 3
0
    def process_request_result(self, prepare_response_func=None):
        """
        Process result of real request performed via transport extension.

        Steps, in order: optionally log the POST payload (`debug_post`
        option), reset the temporary options, build `self.doc` from the
        transport, then apply the `reuse_cookies`, `log_file`,
        `cookiefile`, `reuse_referer` and `follow_refresh` options.

        :param prepare_response_func: optional callable invoked as
            `prepare_response_func(self.transport, self)` to build the
            response document. When omitted,
            `self.transport.prepare_response(self)` is used instead.
        :returns: the result of the follow-up `self.request(...)` call if
            a meta refresh URL was found and followed, otherwise `None`.
        :raises error.GrabTooManyRedirectsError: if following the refresh
            URL would push the refresh counter above the `redirect_limit`
            option.
        """

        now = datetime.now()
        # TODO: move into separate method
        if self.config['debug_post']:
            post = self.config['post'] or self.config['multipart_post']
            if isinstance(post, dict):
                post = list(post.items())
            if post:
                limit = self.config['debug_post_limit']
                if isinstance(post, basestring):
                    post = post[:limit] + '...'
                else:
                    items = normalize_http_values(post, charset='utf-8')
                    new_items = []
                    for key, value in items:
                        # Shorten only the values that exceed the limit
                        if len(value) > limit:
                            value = value[:limit] + '...'
                        new_items.append((key, value))
                    post = '\n'.join('%-25s: %s' % x for x in new_items)
            if post:
                # Lazy logger arguments: the message is rendered only if
                # the record is actually emitted.
                logger_network.debug('[%02d] POST request:\n%s\n',
                                     self.request_counter, post)

        # It's important to delete old POST data after request is performed.
        # If POST data is not cleared then next request will try to use them
        # again!
        # The refresh counter is read before the reset because it is needed
        # below (presumably the reset clears it -- confirm against
        # `reset_temporary_options`).
        old_refresh_count = self.config['refresh_redirect_count']
        self.reset_temporary_options()

        if prepare_response_func:
            self.doc = prepare_response_func(self.transport, self)
        else:
            self.doc = self.transport.prepare_response(self)

        # Workaround: make sure the document has a reference back to this
        # object; a weak proxy is used so no strong reference is stored.
        if self.doc.grab is None:
            self.doc.grab = weakref.proxy(self)

        if self.config['reuse_cookies']:
            self.cookies.update(self.doc.cookies)

        self.doc.timestamp = now

        self.config['charset'] = self.doc.charset

        if self.config['log_file']:
            with open(self.config['log_file'], 'wb') as out:
                out.write(self.doc.body)

        if self.config['cookiefile']:
            self.cookies.save_to_file(self.config['cookiefile'])

        if self.config['reuse_referer']:
            self.config['referer'] = self.doc.url

        self.copy_request_data()

        # Should be called after `copy_request_data`
        self.save_dumps()

        if self.config['follow_refresh']:
            url = find_refresh_url(self.doc.unicode_body())
            # Both `None` and an empty string mean "no refresh target".
            # An empty string must not be passed on to `self.request`.
            if url:
                inc_count = old_refresh_count + 1
                if inc_count > self.config['redirect_limit']:
                    raise error.GrabTooManyRedirectsError()
                return self.request(url=url,
                                    refresh_redirect_count=inc_count)

        return None
Esempio n. 4
0
    def process_request_result(self, prepare_response_func=None):
        """
        Process result of real request performed via transport extension.

        Steps, in order: optionally log the POST payload (`debug_post`
        option), reset the temporary options, build `self.doc` from the
        transport, then apply the `reuse_cookies`, `log_file`,
        `cookiefile`, `reuse_referer` and `follow_refresh` options.

        :param prepare_response_func: optional callable invoked as
            `prepare_response_func(self.transport, self)` to build the
            response document. When omitted,
            `self.transport.prepare_response(self)` is used instead.
        :returns: the result of the follow-up `self.request(...)` call if
            a meta refresh URL was found and followed, otherwise `None`.
        :raises error.GrabTooManyRedirectsError: if following the refresh
            URL would push the refresh counter above the `redirect_limit`
            option.
        """

        now = datetime.now()
        # TODO: move into separate method
        if self.config['debug_post']:
            post = self.config['post'] or self.config['multipart_post']
            if isinstance(post, dict):
                post = list(post.items())
            if post:
                limit = self.config['debug_post_limit']
                if isinstance(post, basestring):
                    post = post[:limit] + '...'
                else:
                    items = normalize_http_values(post, charset='utf-8')
                    new_items = []
                    for key, value in items:
                        # Shorten only the values that exceed the limit
                        if len(value) > limit:
                            value = value[:limit] + '...'
                        new_items.append((key, value))
                    post = '\n'.join('%-25s: %s' % x for x in new_items)
            if post:
                # Lazy logger arguments: the message is rendered only if the record is emitted.
                logger_network.debug('[%02d] POST request:\n%s\n', self.request_counter, post)

        # It's important to delete old POST data after request is performed.
        # If POST data is not cleared then next request will try to use them again!
        # The refresh counter is read before the reset because it is needed below
        # (presumably the reset clears it -- confirm against `reset_temporary_options`).
        old_refresh_count = self.config['refresh_redirect_count']
        self.reset_temporary_options()

        if prepare_response_func:
            self.doc = prepare_response_func(self.transport, self)
        else:
            self.doc = self.transport.prepare_response(self)

        # Workaround: make sure the document has a reference back to this object;
        # a weak proxy is used so no strong reference is stored.
        if self.doc.grab is None:
            self.doc.grab = weakref.proxy(self)

        if self.config['reuse_cookies']:
            self.cookies.update(self.doc.cookies)

        self.doc.timestamp = now

        self.config['charset'] = self.doc.charset

        if self.config['log_file']:
            with open(self.config['log_file'], 'wb') as out:
                out.write(self.doc.body)

        if self.config['cookiefile']:
            self.cookies.save_to_file(self.config['cookiefile'])

        if self.config['reuse_referer']:
            self.config['referer'] = self.doc.url

        self.copy_request_data()

        # Should be called after `copy_request_data`
        self.save_dumps()

        if self.config['follow_refresh']:
            url = find_refresh_url(self.doc.unicode_body())
            # Both `None` and an empty string mean "no refresh target".
            # An empty string must not be passed on to `self.request`.
            if url:
                inc_count = old_refresh_count + 1
                if inc_count > self.config['redirect_limit']:
                    raise error.GrabTooManyRedirectsError()
                return self.request(url=url, refresh_redirect_count=inc_count)

        return None