Ejemplo n.º 1
0
    def test_task_priority(self):
        """Check random, constant and explicitly set task priorities."""
        # Random priority is drawn from the configured module-level range
        base.RANDOM_TASK_PRIORITY_RANGE = (10, 20)
        spider = build_spider(SimpleSpider, priority_mode='random')
        spider.setup_queue()
        task = Task('baz', url='http://xxx.com')
        self.assertEqual(task.priority, None)
        spider.add_task(task)
        self.assertTrue(10 <= task.priority <= 20)

        # Constant priority uses the module-level default value
        base.DEFAULT_TASK_PRIORITY = 33
        spider = build_spider(SimpleSpider, priority_mode='const')
        spider.setup_queue()
        task = Task('baz', url='http://xxx.com')
        self.assertEqual(task.priority, None)
        spider.add_task(task)
        self.assertEqual(33, task.priority)

        # An explicitly set priority is never overridden automatically
        base.DEFAULT_TASK_PRIORITY = 33
        spider = build_spider(SimpleSpider, priority_mode='const')
        spider.setup_queue()
        task = Task('baz', url='http://xxx.com', priority=1)
        self.assertEqual(1, task.priority)
        spider.add_task(task)
        self.assertEqual(1, task.priority)

        # An unknown priority mode must be rejected at construction time
        self.assertRaises(SpiderMisuseError,
                          lambda: SimpleSpider(priority_mode='foo'))
Ejemplo n.º 2
0
    def test_spider_custom_proxy_source(self):
        """Requests must be routed through a user-defined proxy source."""
        port = self.server.port

        class PortSpider(Spider):
            def task_page(self, grab, unused_task):
                self.stat.collect('ports',
                                  int(grab.doc.headers.get('Listen-Port', 0)))

        class SingleProxySource(BaseProxySource):
            def load(self):
                return [Proxy(ADDRESS, port, None, None, 'http')]

            def load_raw_data(self):
                return None

        spider = build_spider(PortSpider)
        spider.setup_queue()
        spider.load_proxylist(SingleProxySource())
        spider.add_task(Task('page', url='http://yandex.ru/'))
        spider.run()

        # The request must carry the original host while hitting the proxy port
        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertEqual(set(spider.stat.collections['ports']),
                         set([TEST_SERVER_PORT]))
Ejemplo n.º 3
0
    def test_task_callback(self):
        """A per-task callback must take precedence over the named handler."""
        class CallbackSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                self.meta['tokens'].append('0_handler')

        class FuncWithState(object):
            def __init__(self, tokens):
                self.tokens = tokens

            def __call__(self, grab, task):
                self.tokens.append('1_func')

        tokens = []
        func = FuncWithState(tokens)

        spider = build_spider(CallbackSpider)
        spider.meta['tokens'] = tokens
        spider.setup_queue()
        # Named handler is used when no callback is given
        spider.add_task(Task('page', url=self.server.get_url()))
        # An explicit callback overrides the named handler
        spider.add_task(Task('page', url=self.server.get_url(),
                             callback=func))
        # Callback also works with a null task name
        spider.add_task(Task(name=None, url=self.server.get_url(),
                             callback=func))
        # ...and with the default task name
        spider.add_task(Task(url=self.server.get_url(), callback=func))
        spider.run()
        self.assertEqual(['0_handler', '1_func', '1_func', '1_func'],
                         sorted(tokens))
Ejemplo n.º 4
0
    def test_spider_custom_proxy_source(self):
        """Requests must be routed through a user-defined proxy source."""
        port = self.server.port

        class PortCollectingSpider(Spider):
            def task_page(self, grab, unused_task):
                self.stat.collect(
                    'ports', int(grab.doc.headers.get('Listen-Port', 0)))

        class OneProxySource(BaseProxySource):
            def load(self):
                return [Proxy(ADDRESS, port, None, None, 'http')]

            def load_raw_data(self):
                return None

        spider = build_spider(PortCollectingSpider)
        spider.setup_queue()
        spider.load_proxylist(OneProxySource())
        spider.add_task(Task('page', url='http://yandex.ru/'))
        spider.run()

        # Host header preserved; the only port seen is the test server's
        self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
        self.assertEqual(set(spider.stat.collections['ports']),
                         set([TEST_SERVER_PORT]))
Ejemplo n.º 5
0
 def test_delay_error(self):
     """A queue without delay support must reject a delayed task."""
     spider = build_spider(self.SimpleSpider)
     self.setup_queue(spider)
     spider.task_queue.clear()
     delayed_task = Task('page', url=self.server.get_url(), delay=1)
     self.assertRaises(SpiderMisuseError, spider.add_task, delayed_task)
Ejemplo n.º 6
0
    def test_generator_with_invalid_url(self):
        """Spider must survive a generator yielding a task with a bad URL."""
        class InvalidUrlSpider(Spider):
            def task_generator(self):
                yield Task('page', url=INVALID_URL)

        spider = build_spider(InvalidUrlSpider)
        spider.run()
Ejemplo n.º 7
0
    def test_task_nohandler_error(self):
        """Running a task with no matching handler raises NoTaskHandler."""
        class HandlerlessSpider(Spider):
            pass

        spider = build_spider(HandlerlessSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        self.assertRaises(NoTaskHandler, spider.run)
Ejemplo n.º 8
0
 def test_task_retry(self):
     """A one-time 403 response must trigger a retry that then succeeds."""
     self.server.response['get.data'] = 'xxx'
     self.server.response_once['code'] = 403
     spider = build_spider(SimpleSpider)
     spider.setup_queue()
     spider.add_task(Task('baz', self.server.get_url()))
     spider.run()
     self.assertEqual(b'xxx', spider.stat.collections['SAVED_ITEM'][0])
Ejemplo n.º 9
0
    def test_generator_with_invalid_url(self):
        """A task generator producing an invalid URL must not crash run()."""

        class BadUrlSpider(Spider):
            def task_generator(self):
                yield Task('page', url=INVALID_URL)

        spider = build_spider(BadUrlSpider)
        spider.run()
Ejemplo n.º 10
0
 def test_task_retry(self):
     """After a single 403 the retry must fetch the real payload."""
     self.server.response['get.data'] = 'xxx'
     self.server.response_once['code'] = 403
     spider = build_spider(SimpleSpider)
     spider.setup_queue()
     spider.add_task(Task('baz', self.server.get_url()))
     spider.run()
     self.assertEqual(b'xxx', spider.stat.collections['SAVED_ITEM'][0])
Ejemplo n.º 11
0
    def test_fatal_error(self):
        """FatalError raised inside a handler must abort the whole run."""
        class CrashingSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                raise FatalError

        spider = build_spider(CrashingSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        self.assertRaises(FatalError, spider.run)
Ejemplo n.º 12
0
 def test_spider(self):
     """Basic end-to-end crawl: response body reaches the item collection."""
     self.server.response['get.data'] = 'Hello spider!'
     self.server.response['sleep'] = 0
     spider = build_spider(SimpleSpider)
     spider.setup_queue()
     spider.add_task(Task('baz', self.server.get_url()))
     spider.run()
     self.assertEqual(b'Hello spider!',
                      spider.stat.collections['SAVED_ITEM'][0])
Ejemplo n.º 13
0
    def test_data_nohandler_error(self):
        """Yielding Data with no matching data_* handler raises NoDataHandler."""
        class DataSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                yield Data('foo', num=1)

        spider = build_spider(DataSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        self.assertRaises(NoDataHandler, spider.run)
Ejemplo n.º 14
0
 def test_spider_nonmp_changes(self):
     """In non-multiprocess mode, changes made inside a handler must be
     visible on the main spider instance."""
     spider = build_spider(self.SimpleSpider)
     spider.setup_queue()
     spider.meta['url'] = self.server.get_url()
     spider.add_task(Task('page', self.server.get_url()))
     spider.run()
     self.assertEqual(4, spider.foo_count)
Ejemplo n.º 15
0
 def test_queue_length(self):
     """Queue size must reflect added tasks and drop to zero after run()."""
     spider = build_spider(self.SimpleSpider)
     self.setup_queue(spider)
     spider.task_queue.clear()
     for _ in six.moves.range(5):
         spider.add_task(Task('page', url=self.server.get_url()))
     self.assertEqual(5, spider.task_queue.size())
     spider.run()
     self.assertEqual(0, spider.task_queue.size())
Ejemplo n.º 16
0
 def test_spider(self):
     """Smoke test: a single task fetches and stores the response body."""
     self.server.response['get.data'] = 'Hello spider!'
     self.server.response['sleep'] = 0
     spider = build_spider(SimpleSpider)
     spider.setup_queue()
     spider.add_task(Task('baz', self.server.get_url()))
     spider.run()
     self.assertEqual(b'Hello spider!',
                      spider.stat.collections['SAVED_ITEM'][0])
Ejemplo n.º 17
0
    def test_fatal_error(self):
        """A handler raising FatalError must propagate out of run()."""
        class FatalSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                raise FatalError

        spider = build_spider(FatalSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        self.assertRaises(FatalError, spider.run)
Ejemplo n.º 18
0
    def test_task_raw(self):
        """raw=True must deliver non-2xx responses to the handler."""
        class CodeSpider(Spider):
            def task_page(self, grab, unused_task):
                self.stat.collect('codes', grab.doc.code)

        self.server.response['code'] = 502

        # Without raw, 502 responses never reach the handler
        spider = build_spider(CodeSpider, network_try_limit=1)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        spider.add_task(Task('page', url=self.server.get_url()))
        spider.run()
        self.assertEqual(0, len(spider.stat.collections['codes']))

        # With raw=True the handler sees both 502 responses
        spider = build_spider(CodeSpider, network_try_limit=1)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url(), raw=True))
        spider.add_task(Task('page', url=self.server.get_url(), raw=True))
        spider.run()
        self.assertEqual(2, len(spider.stat.collections['codes']))
Ejemplo n.º 19
0
 def get_configured_spider(self, pause=None, spider_options=None):
     """Build a SimpleSpider wired to the test server, with a clean
     cache backend and an initialized task queue."""
     options = dict(spider_options or {})
     spider = build_spider(
         SimpleSpider,
         meta={'server': self.server, 'pause': (pause or [])},
         parser_pool_size=1,
         **options
     )
     self.setup_cache(spider)
     spider.cache_reader_service.backend.clear()
     spider.setup_queue()
     return spider
Ejemplo n.º 20
0
    def test_task_limit(self):
        """network_try_limit and task_try_limit must cap request attempts."""
        class ShortTimeoutSpider(SimpleSpider):
            def create_grab_instance(self, **kwargs):
                return Grab(connect_timeout=1, timeout=1)

        self.server.response['get.data'] = 'Hello spider!'
        # Server responds slower than the 1-second grab timeout
        self.server.response['sleep'] = 1.1

        # Only one network try: the task is attempted exactly once
        spider = build_spider(ShortTimeoutSpider, network_try_limit=1)
        spider.setup_queue()
        spider.add_task(Task('baz', self.server.get_url()))
        spider.run()
        self.assertEqual(spider.stat.counters['spider:task-baz'], 1)

        # A task already past its try limit never hits the network
        spider = build_spider(SimpleSpider, task_try_limit=2)
        spider.setup_queue()
        spider.add_task(Task('baz', self.server.get_url(), task_try_count=3))
        spider.run()
        self.assertEqual(spider.stat.counters['spider:request-network'], 0)
Ejemplo n.º 21
0
    def test_task_limit(self):
        """Verify the network-try and task-try limit counters."""
        class TimeoutBoundSpider(SimpleSpider):
            def create_grab_instance(self, **kwargs):
                return Grab(connect_timeout=1, timeout=1)

        self.server.response['get.data'] = 'Hello spider!'
        # The server sleeps longer than the grab timeout
        self.server.response['sleep'] = 1.1

        # With a single network try the task runs exactly once
        spider = build_spider(TimeoutBoundSpider, network_try_limit=1)
        spider.setup_queue()
        spider.add_task(Task('baz', self.server.get_url()))
        spider.run()
        self.assertEqual(spider.stat.counters['spider:task-baz'], 1)

        # A task whose try count exceeds the limit makes no network request
        spider = build_spider(SimpleSpider, task_try_limit=2)
        spider.setup_queue()
        spider.add_task(Task('baz', self.server.get_url(), task_try_count=3))
        spider.run()
        self.assertEqual(spider.stat.counters['spider:request-network'], 0)
Ejemplo n.º 22
0
    def test_response_not_valid(self):
        """ResponseNotValid must cause retries up to task_try_limit."""
        class RejectingSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                self.stat.inc('xxx')
                raise ResponseNotValid

        spider = build_spider(RejectingSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        spider.run()
        # The handler runs once per allowed task try
        self.assertEqual(spider.task_try_limit, spider.stat.counters['xxx'])
Ejemplo n.º 23
0
    def test_add_task_invalid_url_raise_error(self):
        """raise_error=True surfaces SpiderError for malformed task URLs."""
        class EmptySpider(Spider):
            pass

        spider = build_spider(EmptySpider)
        spider.setup_queue()
        self.assertRaises(SpiderError, spider.add_task,
                          Task('page', url='zz://zz'), raise_error=True)
        # The rejected task never enters the queue
        self.assertEqual(0, spider.task_queue.size())
        # A valid URL is accepted as usual
        spider.add_task(Task('page', url='http://example.com/'))
        self.assertEqual(1, spider.task_queue.size())
Ejemplo n.º 24
0
    def test_schedule_list_clear(self):
        """clear() must also wipe delayed tasks from the schedule list."""
        spider = build_spider(self.SimpleSpider)
        self.setup_queue(spider)
        spider.task_queue.clear()

        for delay in six.moves.range(5):
            spider.add_task(Task('page', url=self.server.get_url(),
                                 delay=delay + 1))

        self.assertEqual(5, len(spider.task_queue.schedule_list))
        spider.task_queue.clear()
        self.assertEqual(0, len(spider.task_queue.schedule_list))
Ejemplo n.º 25
0
    def test_task_useragent(self):
        """A User-Agent configured on the grab object must reach the server."""
        spider = build_spider(SimpleSpider)
        spider.setup_queue()

        grab = Grab()
        grab.setup(url=self.server.get_url())
        grab.setup(user_agent='Foo')

        spider.add_task(Task('baz', grab=grab).clone())
        spider.run()
        self.assertEqual(self.server.request['headers']['User-Agent'], 'Foo')
Ejemplo n.º 26
0
 def get_configured_spider(self, pause=None, spider_options=None):
     """Return a SimpleSpider bound to the test server with cache and
     queue already prepared."""
     extra = spider_options or {}
     spider = build_spider(
         SimpleSpider,
         meta={'server': self.server, 'pause': (pause or [])},
         parser_pool_size=1,
         **extra
     )
     self.setup_cache(spider)
     spider.cache_reader_service.backend.clear()
     spider.setup_queue()
     return spider
Ejemplo n.º 27
0
    def test_task_queue_clear(self):
        """stop() in a handler must leave the task queue empty after run()."""
        class StoppingSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                self.stop()

        spider = build_spider(StoppingSpider)
        spider.setup_queue()
        for _ in six.moves.range(5):
            spider.add_task(Task('page', url=self.server.get_url()))
        self.assertEqual(5, spider.task_queue.size())
        spider.run()
        self.assertEqual(0, spider.task_queue.size())
Ejemplo n.º 28
0
    def test_task_queue_clear(self):
        """Stopping the spider from a handler drains the pending queue."""
        class SelfStoppingSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                self.stop()

        spider = build_spider(SelfStoppingSpider)
        spider.setup_queue()
        for _ in six.moves.range(5):
            spider.add_task(Task('page', url=self.server.get_url()))
        self.assertEqual(5, spider.task_queue.size())
        spider.run()
        self.assertEqual(0, spider.task_queue.size())
Ejemplo n.º 29
0
    def test_exception_from_data_handler(self):
        """An exception in a data handler is recorded in the fatal stats."""
        class FailingDataSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                yield Data('foo', num=1)

            def data_foo(self, num): # pylint: disable=unused-argument
                raise Exception('Shit happens!')

        spider = build_spider(FailingDataSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        spider.run()
        # The failing handler's name must appear in the fatal-error record
        self.assertTrue('data_foo' in spider.stat.collections['fatal'][0])
Ejemplo n.º 30
0
    def test_initial_urls(self):
        """URLs listed in initial_urls spawn 'initial' tasks on run()."""
        url = self.server.get_url()

        class InitialUrlSpider(Spider):
            initial_urls = [url]

            def task_initial(self, unused_grab, unused_task):
                self.stat.inc('foo', 1)

        spider = build_spider(InitialUrlSpider)
        spider.run()

        self.assertEqual(1, spider.stat.counters['foo'])
Ejemplo n.º 31
0
    def test_handler_result_none(self):
        """A handler yielding None must not break the run."""
        class NoneYieldingSpider(Spider):
            def prepare(self):
                # pylint: disable=attribute-defined-outside-init
                self.points = []

            def task_page(self, unused_grab, unused_task):
                yield None

        spider = build_spider(NoneYieldingSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        spider.run()
Ejemplo n.º 32
0
    def test_handler_result_none(self):
        """Yielding None from a task handler is silently accepted."""
        class NullResultSpider(Spider):
            def prepare(self):
                # pylint: disable=attribute-defined-outside-init
                self.points = []

            def task_page(self, unused_grab, unused_task):
                yield None

        spider = build_spider(NullResultSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        spider.run()
Ejemplo n.º 33
0
    def test_check_task_limits_invalid_value(self):
        """A bogus fallback name from check_task_limits raises SpiderError."""
        class BadLimitsSpider(Spider):
            def task_page(self, grab, task):
                pass

            def check_task_limits(self, task):
                # 'zz' does not match any fallback handler
                return False, 'zz'

        spider = build_spider(BadLimitsSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url(),
                             fallback_name='fallback_zz'))
        self.assertRaises(SpiderError, spider.run)
Ejemplo n.º 34
0
    def test_generator(self):
        """Every task produced by task_generator must be processed."""
        server = self.server

        class GeneratingSpider(Spider):
            def task_generator(self):
                for _ in six.moves.range(1111):
                    yield Task('page', url=server.get_url())

            def task_page(self, unused_grab, unused_task):
                self.stat.inc('count')

        spider = build_spider(GeneratingSpider)
        spider.run()
        self.assertEqual(spider.stat.counters['count'], 1111)
Ejemplo n.º 35
0
    def test_worker_restored(self):
        """Parser worker must restart after serving its request quota."""
        class PlainSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                pass

        # Two requests per parser process -> 5 tasks force two restarts
        spider = build_spider(PlainSpider, parser_requests_per_process=2)
        spider.setup_queue()
        for _ in range(5):
            spider.add_task(Task('page', url=self.server.get_url()))
        spider.run()
        self.assertTrue(
            spider.stat.counters['parser:worker-restarted'] == 2)
Ejemplo n.º 36
0
    def test_task_url(self):
        """Task URL may come from url= or from an attached grab object."""
        spider = build_spider(SimpleSpider)
        spider.setup_queue()

        # Plain url argument: no grab config is attached
        task = Task('baz', url='http://xxx.com')
        self.assertEqual('http://xxx.com', task.url)
        spider.add_task(task)
        self.assertEqual('http://xxx.com', task.url)
        self.assertEqual(None, task.grab_config)

        # URL taken from the grab object and mirrored into grab_config
        task = Task('baz', grab=Grab(url='http://yyy.com'))
        spider.add_task(task)
        self.assertEqual('http://yyy.com', task.url)
        self.assertEqual('http://yyy.com', task.grab_config['url'])
Ejemplo n.º 37
0
    def test_render_stats(self):
        """render_stats() must work after a completed run."""
        class StatSpider(Spider):
            def prepare(self):
                self.stat.logging_period = 0
                self.stat.inc('foo')

            def task_page(self, grab, task):
                pass

        spider = build_spider(StatSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        spider.run()
        # Must not raise on a spider with collected statistics
        spider.render_stats()
Ejemplo n.º 38
0
    def test_generator(self):
        """A large generator-driven crawl processes all yielded tasks."""
        server = self.server

        class BulkSpider(Spider):
            def task_generator(self):
                for _ in six.moves.range(1111):
                    yield Task('page', url=server.get_url())

            def task_page(self, unused_grab, unused_task):
                self.stat.inc('count')

        spider = build_spider(BulkSpider)
        spider.run()
        self.assertEqual(spider.stat.counters['count'], 1111)
Ejemplo n.º 39
0
    def test_handler_result_invalid(self):
        """Yielding a non-task/non-data object must raise SpiderError."""
        class BadYieldSpider(Spider):
            def prepare(self):
                # pylint: disable=attribute-defined-outside-init
                self.points = []

            def task_page(self, unused_grab, unused_task):
                yield 1

        spider = build_spider(BadYieldSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        self.assertRaises(SpiderError, spider.run)
Ejemplo n.º 40
0
    def test_task_clone_post_request(self):
        """Cloning a task must preserve the POST method of its request."""
        class CloningSpider(Spider):
            def task_foo(self, unused_grab, task):
                # Re-submit the task once; the clone keeps the POST config
                if not task.get('fin'):
                    yield task.clone(fin=True)

        spider = build_spider(CloningSpider)
        spider.setup_queue()

        grab = Grab()
        grab.setup(url=self.server.get_url(), post={'x': 'y'})
        spider.add_task(Task('foo', grab=grab))
        spider.run()
        self.assertEqual('POST', self.server.request['method'])
Ejemplo n.º 41
0
    def test_create_table(self):
        """Cache setup must recreate the cache table after it was dropped."""
        self.server.response['get.data'] = content_generator()

        class PlainSpider(Spider):
            def task_page(self, grab, task):
                pass

        spider = build_spider(PlainSpider)
        self.setup_cache(spider)
        cursor = spider.cache_reader_service.backend.cursor
        cursor.execute('begin')
        cursor.execute('DROP TABLE cache')
        cursor.execute('commit')
        # Re-running setup must rebuild the dropped table
        self.setup_cache(spider)
        spider.cache_reader_service.backend.clear()
        self.assertEqual(0, spider.cache_reader_service.backend.size())
Ejemplo n.º 42
0
    def test_handler_result_invalid(self):
        """An integer yielded from a handler is an invalid result type."""
        class InvalidResultSpider(Spider):
            def prepare(self):
                # pylint: disable=attribute-defined-outside-init
                self.points = []

            def task_page(self, unused_grab, unused_task):
                yield 1

        spider = build_spider(InvalidResultSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        self.assertRaises(SpiderError, spider.run)
Ejemplo n.º 43
0
 def test_basic_priority(self):
     """Tasks must be fetched in ascending priority order."""
     spider = build_spider(self.SimpleSpider, parser_pool_size=1,
                           thread_number=1)
     self.setup_queue(spider)
     spider.task_queue.clear()
     requested_urls = {}
     for priority in (4, 2, 1, 5):
         url = self.server.get_url() + '?p=%d' % priority
         requested_urls[priority] = url
         spider.add_task(Task('page', url=url, priority=priority))
     spider.run()
     # URLs sorted by priority must match the observed request order
     expected = [requested_urls[key] for key in sorted(requested_urls)]
     self.assertEqual(expected, spider.stat.collections['url_history'])
Ejemplo n.º 44
0
    def test_data_simple_case(self):
        """A Data item must be routed to the matching data_* handler."""
        class DataFlowSpider(Spider):
            def prepare(self):
                # pylint: disable=attribute-defined-outside-init
                self.data_processed = []

            def task_page(self, unused_grab, unused_task):
                yield Data('foo', number=1)

            def data_foo(self, number):
                self.data_processed.append(number)

        spider = build_spider(DataFlowSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        spider.run()
        self.assertEqual(spider.data_processed, [1])
Ejemplo n.º 45
0
 def test_setup_proxylist(self):
     """A proxy list loaded from a text file must be used for requests."""
     with temp_file() as proxy_file:
         lines = [x['proxy'] for x in self.extra_servers.values()]
         with open(proxy_file, 'w') as out:
             out.write('\n'.join(lines))
         # Simple test, one task
         spider = build_spider(SimpleSpider, thread_number=1)
         spider.load_proxylist(proxy_file, 'text_file')
         spider.setup_queue()
         spider.add_task(Task('baz', grab=Grab(url='http://yandex.ru',
                                               debug=True)))
         spider.run()
         serv = [x['server'] for x in self.extra_servers.values()
                 if x['server'].request['done']][0]
         self.assertEqual(serv.request['headers']['host'], 'yandex.ru')
         self.assertEqual(1, len(set(spider.stat.collections['ports'])))
Ejemplo n.º 46
0
    def test_stat_error_name_threaded_urllib3(self):
        """Read timeouts must be counted under error:read-timeout-error."""
        server = self.server
        # Server sleeps longer than the 1-second grab timeout
        server.response['sleep'] = 2

        class TimeoutSpider(Spider):
            def prepare(self):
                self.network_try_limit = 1

            def task_generator(self):
                grab = Grab(url=server.get_url(), timeout=1)
                yield Task('page', grab=grab)

            def task_page(self, grab, unused_task):
                pass

        spider = build_spider(TimeoutSpider)
        spider.run()
        self.assertTrue('error:read-timeout-error' in spider.stat.counters)
Ejemplo n.º 47
0
    def test_complex_data(self):
        """Data handler must receive named args plus extras via **kwargs."""
        class KwargsDataSpider(Spider):
            def prepare(self):
                # pylint: disable=attribute-defined-outside-init
                self.data_processed = []

            def task_page(self, unused_grab, unused_task):
                yield Data('foo', one=1, two=2, bar='gaz')

            def data_foo(self, one, two, **kwargs):
                self.data_processed.append(one)
                self.data_processed.append(two)
                self.data_processed.append(kwargs)

        spider = build_spider(KwargsDataSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        spider.run()
        self.assertEqual(spider.data_processed, [1, 2, {'bar': 'gaz'}])
Ejemplo n.º 48
0
    def test_fallback_handler_by_default_name(self):
        """task_<name>_fallback is invoked when network tries run out."""
        class FallbackSpider(Spider):
            def prepare(self):
                # pylint: disable=attribute-defined-outside-init
                self.points = []

            def task_page(self, grab, task):
                pass

            def task_page_fallback(self, unused_task):
                self.points.append(1)

        # Every request fails with 403, exhausting the single network try
        self.server.response['code'] = 403

        spider = build_spider(FallbackSpider, network_try_limit=1)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))
        spider.run()
        self.assertEqual(spider.points, [1])
Ejemplo n.º 49
0
    def test_too_many_redirects(self):
        """A redirect loop must be rejected and counted in the stats."""
        class RedirectSpider(Spider):
            def task_page(self, unused_grab, unused_task):
                pass

        spider = build_spider(RedirectSpider)
        spider.setup_queue()
        spider.add_task(Task('page', url=self.server.get_url()))

        # Server redirects every request back to itself
        self.server.response['headers'] = [
            ('Location', self.server.get_url()),
        ]
        self.server.response['code'] = 302
        spider.run()

        self.assertEqual(
            1, len(spider.stat.collections['network-count-rejected']))
        self.assertTrue('error:too-many-redirects' in spider.stat.counters)
Ejemplo n.º 50
0
    def test_grab_attribute_exception(self):
        """With raw=True the grab.exception attribute holds the error."""
        server = self.server
        # Server sleeps past the grab timeout to force a timeout error
        server.response['sleep'] = 2

        class RawErrorSpider(Spider):

            def task_generator(self):
                grab = Grab()
                grab.setup(url=server.get_url(), timeout=1)
                yield Task('page', grab=grab, raw=True)

            def task_page(self, grab, unused_task):
                self.meta['exc'] = grab.exception

        spider = build_spider(RawErrorSpider)
        spider.run()
        self.assertTrue(isinstance(spider.meta['exc'], GrabTimeoutError))
Ejemplo n.º 51
0
    def test_redirect_with_invalid_url(self):
        """A redirect pointing to an invalid URL must not crash the run."""
        server = self.server

        class RedirectTargetSpider(Spider):
            def task_generator(self):
                # pylint: disable=attribute-defined-outside-init
                self.done_counter = 0
                # pylint: enable=attribute-defined-outside-init
                yield Task('page', url=server.get_url())

            def task_page(self, grab, task):
                pass

        # First response is a 301 pointing to an unparseable location
        self.server.response_once['code'] = 301
        self.server.response_once['headers'] = [
            ('Location', INVALID_URL),
        ]
        spider = build_spider(RedirectTargetSpider, network_try_limit=1)
        spider.run()
Ejemplo n.º 52
0
    def test_setup_proxylist2(self):
        """With auto_change enabled, requests rotate over several proxies."""
        with temp_file() as proxy_file:
            proxies = [x['proxy'] for x in self.extra_servers.values()]
            with open(proxy_file, 'w') as out:
                out.write('\n'.join(proxies))

            # By default auto_change is True
            spider = build_spider(SimpleSpider, thread_number=1)
            spider.load_proxylist(proxy_file, 'text_file')
            spider.setup_queue()
            for _ in six.moves.range(10):
                spider.add_task(Task('baz', 'http://yandex.ru'))
            spider.run()

            used_servers = [x['server'] for x in self.extra_servers.values()
                            if x['server'].request['done']]
            for serv in used_servers:
                self.assertEqual(serv.request['headers']['host'],
                                 'yandex.ru')
            # More than one distinct proxy port proves rotation happened
            self.assertTrue(len(set(spider.stat.collections['ports'])) > 1)
Ejemplo n.º 53
0
    def test_schedule(self):
        """Delayed tasks must execute in the order implied by their delays."""
        server = self.server

        class DelaySpider(Spider):
            def task_generator(self):
                yield Task('page', url=server.get_url(), delay=1.5, num=3)
                yield Task('page', url=server.get_url(), delay=4.5, num=2)
                yield Task('page', url=server.get_url(), delay=3, num=4)
                yield Task('page', url=server.get_url(), num=1)

            def task_page(self, unused_grab, task):
                self.stat.collect('numbers', task.num)

        spider = build_spider(DelaySpider, thread_number=1)
        self.setup_queue(spider)
        spider.run()
        # No delay first, then ascending by delay: 1.5s, 3s, 4.5s
        self.assertEqual(spider.stat.collections['numbers'], [1, 3, 4, 2])