def test_task_priority(self):
    # Automatic random priority
    base.RANDOM_TASK_PRIORITY_RANGE = (10, 20)
    bot = build_spider(SimpleSpider, priority_mode='random')
    bot.setup_queue()
    task = Task('baz', url='http://xxx.com')
    self.assertEqual(task.priority, None)
    bot.add_task(task)
    self.assertTrue(10 <= task.priority <= 20)

    # Automatic constant priority
    base.DEFAULT_TASK_PRIORITY = 33
    bot = build_spider(SimpleSpider, priority_mode='const')
    bot.setup_queue()
    task = Task('baz', url='http://xxx.com')
    self.assertEqual(task.priority, None)
    bot.add_task(task)
    self.assertEqual(33, task.priority)

    # Automatic priority does not override explicitly set priority
    base.DEFAULT_TASK_PRIORITY = 33
    bot = build_spider(SimpleSpider, priority_mode='const')
    bot.setup_queue()
    task = Task('baz', url='http://xxx.com', priority=1)
    self.assertEqual(1, task.priority)
    bot.add_task(task)
    self.assertEqual(1, task.priority)

    self.assertRaises(SpiderMisuseError,
                      lambda: SimpleSpider(priority_mode='foo'))

def test_spider_custom_proxy_source(self):
    proxy_port = self.server.port

    class TestSpider(Spider):
        def task_page(self, grab, unused_task):
            self.stat.collect(
                'ports', int(grab.doc.headers.get('Listen-Port', 0)))

    class CustomProxySource(BaseProxySource):
        def load(self):
            return [
                Proxy(ADDRESS, proxy_port, None, None, 'http'),
            ]

        def load_raw_data(self):
            return None

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.load_proxylist(CustomProxySource())
    bot.add_task(Task('page', url='http://yandex.ru/'))
    bot.run()

    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertEqual(set(bot.stat.collections['ports']),
                     set([TEST_SERVER_PORT]))

def test_task_callback(self):
    class TestSpider(Spider):
        def task_page(self, unused_grab, unused_task):
            self.meta['tokens'].append('0_handler')

    class FuncWithState(object):
        def __init__(self, tokens):
            self.tokens = tokens

        def __call__(self, grab, task):
            self.tokens.append('1_func')

    tokens = []
    func = FuncWithState(tokens)

    bot = build_spider(TestSpider)
    bot.meta['tokens'] = tokens
    bot.setup_queue()
    # classic handler
    bot.add_task(Task('page', url=self.server.get_url()))
    # the callback option overrides the classic handler
    bot.add_task(Task('page', url=self.server.get_url(), callback=func))
    # callback and null task name
    bot.add_task(Task(name=None, url=self.server.get_url(), callback=func))
    # callback and default task name
    bot.add_task(Task(url=self.server.get_url(), callback=func))
    bot.run()

    self.assertEqual(['0_handler', '1_func', '1_func', '1_func'],
                     sorted(tokens))

def test_delay_error(self):
    bot = build_spider(self.SimpleSpider)
    self.setup_queue(bot)
    bot.task_queue.clear()
    self.assertRaises(
        SpiderMisuseError, bot.add_task,
        Task('page', url=self.server.get_url(), delay=1))

def test_generator_with_invalid_url(self):
    class SomeSpider(Spider):
        def task_generator(self):
            yield Task('page', url=INVALID_URL)

    bot = build_spider(SomeSpider)
    bot.run()

def test_task_nohandler_error(self):
    class TestSpider(Spider):
        pass

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    self.assertRaises(NoTaskHandler, bot.run)

def test_task_retry(self):
    self.server.response['get.data'] = 'xxx'
    self.server.response_once['code'] = 403
    bot = build_spider(SimpleSpider)
    bot.setup_queue()
    bot.add_task(Task('baz', self.server.get_url()))
    bot.run()
    self.assertEqual(b'xxx', bot.stat.collections['SAVED_ITEM'][0])

def test_fatal_error(self):
    class TestSpider(Spider):
        def task_page(self, unused_grab, unused_task):
            raise FatalError

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    self.assertRaises(FatalError, bot.run)

def test_spider(self):
    self.server.response['get.data'] = 'Hello spider!'
    self.server.response['sleep'] = 0
    bot = build_spider(SimpleSpider)
    bot.setup_queue()
    bot.add_task(Task('baz', self.server.get_url()))
    bot.run()
    self.assertEqual(b'Hello spider!',
                     bot.stat.collections['SAVED_ITEM'][0])

def test_data_nohandler_error(self):
    class TestSpider(Spider):
        def task_page(self, unused_grab, unused_task):
            yield Data('foo', num=1)

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    self.assertRaises(NoDataHandler, bot.run)

def test_spider_nonmp_changes(self):
    """Check that in non-multiprocess mode changes made inside
    a handler are applied to the main spider instance."""
    bot = build_spider(self.SimpleSpider)
    bot.setup_queue()
    bot.meta['url'] = self.server.get_url()
    bot.add_task(Task('page', self.server.get_url()))
    bot.run()
    self.assertEqual(4, bot.foo_count)

def test_queue_length(self):
    bot = build_spider(self.SimpleSpider)
    self.setup_queue(bot)
    bot.task_queue.clear()
    for _ in six.moves.range(5):
        bot.add_task(Task('page', url=self.server.get_url()))
    self.assertEqual(5, bot.task_queue.size())
    bot.run()
    self.assertEqual(0, bot.task_queue.size())

def test_task_raw(self):
    class TestSpider(Spider):
        def task_page(self, grab, unused_task):
            self.stat.collect('codes', grab.doc.code)

    self.server.response['code'] = 502

    bot = build_spider(TestSpider, network_try_limit=1)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertEqual(0, len(bot.stat.collections['codes']))

    bot = build_spider(TestSpider, network_try_limit=1)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url(), raw=True))
    bot.add_task(Task('page', url=self.server.get_url(), raw=True))
    bot.run()
    self.assertEqual(2, len(bot.stat.collections['codes']))

def get_configured_spider(self, pause=None, spider_options=None):
    bot = build_spider(
        SimpleSpider,
        meta={'server': self.server, 'pause': (pause or [])},
        parser_pool_size=1,
        **(spider_options or {})
    )
    self.setup_cache(bot)
    bot.cache_reader_service.backend.clear()
    bot.setup_queue()
    return bot

def test_task_limit(self):
    class CustomSimpleSpider(SimpleSpider):
        def create_grab_instance(self, **kwargs):
            return Grab(connect_timeout=1, timeout=1)

    self.server.response['get.data'] = 'Hello spider!'
    self.server.response['sleep'] = 1.1

    bot = build_spider(CustomSimpleSpider, network_try_limit=1)
    # bot.setup_grab(connect_timeout=1, timeout=1)
    bot.setup_queue()
    bot.add_task(Task('baz', self.server.get_url()))
    bot.run()
    self.assertEqual(bot.stat.counters['spider:task-baz'], 1)

    bot = build_spider(SimpleSpider, task_try_limit=2)
    bot.setup_queue()
    bot.add_task(Task('baz', self.server.get_url(), task_try_count=3))
    bot.run()
    self.assertEqual(bot.stat.counters['spider:request-network'], 0)

def test_response_not_valid(self):
    class SomeSimpleSpider(Spider):
        def task_page(self, unused_grab, unused_task):
            self.stat.inc('xxx')
            raise ResponseNotValid

    bot = build_spider(SomeSimpleSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertEqual(bot.task_try_limit, bot.stat.counters['xxx'])

def test_add_task_invalid_url_raise_error(self):
    class TestSpider(Spider):
        pass

    bot = build_spider(TestSpider)
    bot.setup_queue()
    self.assertRaises(SpiderError, bot.add_task,
                      Task('page', url='zz://zz'), raise_error=True)
    self.assertEqual(0, bot.task_queue.size())
    bot.add_task(Task('page', url='http://example.com/'))
    self.assertEqual(1, bot.task_queue.size())

def test_schedule_list_clear(self):
    bot = build_spider(self.SimpleSpider)
    self.setup_queue(bot)
    bot.task_queue.clear()
    for delay in six.moves.range(5):
        bot.add_task(Task('page', url=self.server.get_url(),
                          delay=delay + 1))
    self.assertEqual(5, len(bot.task_queue.schedule_list))
    bot.task_queue.clear()
    self.assertEqual(0, len(bot.task_queue.schedule_list))

def test_task_useragent(self):
    bot = build_spider(SimpleSpider)
    bot.setup_queue()

    grab = Grab()
    grab.setup(url=self.server.get_url())
    grab.setup(user_agent='Foo')

    task = Task('baz', grab=grab)
    bot.add_task(task.clone())
    bot.run()
    self.assertEqual(self.server.request['headers']['User-Agent'], 'Foo')

def test_task_queue_clear(self):
    class TestSpider(Spider):
        def task_page(self, unused_grab, unused_task):
            self.stop()

    bot = build_spider(TestSpider)
    bot.setup_queue()
    for _ in six.moves.range(5):
        bot.add_task(Task('page', url=self.server.get_url()))
    self.assertEqual(5, bot.task_queue.size())
    bot.run()
    self.assertEqual(0, bot.task_queue.size())

def test_exception_from_data_handler(self):
    class TestSpider(Spider):
        def task_page(self, unused_grab, unused_task):
            yield Data('foo', num=1)

        def data_foo(self, num):  # pylint: disable=unused-argument
            raise Exception('Shit happens!')

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertTrue('data_foo' in bot.stat.collections['fatal'][0])

def test_initial_urls(self):
    url = self.server.get_url()

    class TestSpider(Spider):
        initial_urls = [url]

        def task_initial(self, unused_grab, unused_task):
            self.stat.inc('foo', 1)

    bot = build_spider(TestSpider)
    bot.run()
    self.assertEqual(1, bot.stat.counters['foo'])

def test_handler_result_none(self):
    class TestSpider(Spider):
        def prepare(self):
            # pylint: disable=attribute-defined-outside-init
            self.points = []

        def task_page(self, unused_grab, unused_task):
            yield None

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()

def test_check_task_limits_invalid_value(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            pass

        def check_task_limits(self, task):
            return False, 'zz'

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url(),
                      fallback_name='fallback_zz'))
    self.assertRaises(SpiderError, bot.run)

def test_generator(self):
    server = self.server

    class TestSpider(Spider):
        def task_generator(self):
            for _ in six.moves.range(1111):
                yield Task('page', url=server.get_url())

        def task_page(self, unused_grab, unused_task):
            self.stat.inc('count')

    bot = build_spider(TestSpider)
    bot.run()
    self.assertEqual(bot.stat.counters['count'], 1111)

def test_worker_restored(self):
    class TestSpider(Spider):
        def task_page(self, unused_grab, unused_task):
            pass

    bot = build_spider(
        TestSpider,
        parser_requests_per_process=2,
    )
    bot.setup_queue()
    for _ in range(5):
        bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertTrue(bot.stat.counters['parser:worker-restarted'] == 2)

def test_task_url(self):
    bot = build_spider(SimpleSpider)
    bot.setup_queue()
    task = Task('baz', url='http://xxx.com')
    self.assertEqual('http://xxx.com', task.url)
    bot.add_task(task)
    self.assertEqual('http://xxx.com', task.url)
    self.assertEqual(None, task.grab_config)

    grab = Grab(url='http://yyy.com')
    task = Task('baz', grab=grab)
    bot.add_task(task)
    self.assertEqual('http://yyy.com', task.url)
    self.assertEqual('http://yyy.com', task.grab_config['url'])

def test_render_stats(self):
    class TestSpider(Spider):
        def prepare(self):
            self.stat.logging_period = 0
            self.stat.inc('foo')

        def task_page(self, grab, task):
            pass

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    bot.render_stats()

def test_handler_result_invalid(self):
    class TestSpider(Spider):
        def prepare(self):
            # pylint: disable=attribute-defined-outside-init
            self.points = []

        def task_page(self, unused_grab, unused_task):
            yield 1

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    # bot.run()
    # self.assertEqual(1, bot.stat.counters['spider:error-spidererror'])
    self.assertRaises(SpiderError, bot.run)

def test_task_clone_post_request(self):
    class TestSpider(Spider):
        def task_foo(self, unused_grab, task):
            if not task.get('fin'):
                yield task.clone(fin=True)

    bot = build_spider(TestSpider)
    bot.setup_queue()

    grab = Grab()
    grab.setup(url=self.server.get_url(), post={'x': 'y'})
    task = Task('foo', grab=grab)

    bot.add_task(task)
    bot.run()
    self.assertEqual('POST', self.server.request['method'])

def test_create_table(self):
    self.server.response['get.data'] = content_generator()

    class TestSpider(Spider):
        def task_page(self, grab, task):
            pass

    bot = build_spider(TestSpider)
    self.setup_cache(bot)
    bot.cache_reader_service.backend.cursor.execute('begin')
    bot.cache_reader_service.backend.cursor.execute('DROP TABLE cache')
    bot.cache_reader_service.backend.cursor.execute('commit')
    self.setup_cache(bot)
    bot.cache_reader_service.backend.clear()
    self.assertEqual(0, bot.cache_reader_service.backend.size())

def test_basic_priority(self):
    bot = build_spider(self.SimpleSpider, parser_pool_size=1,
                       thread_number=1)
    self.setup_queue(bot)
    bot.task_queue.clear()
    requested_urls = {}
    for priority in (4, 2, 1, 5):
        url = self.server.get_url() + '?p=%d' % priority
        requested_urls[priority] = url
        bot.add_task(Task('page', url=url, priority=priority))
    bot.run()
    urls = [x[1] for x in sorted(requested_urls.items(),
                                 key=lambda x: x[0])]
    self.assertEqual(urls, bot.stat.collections['url_history'])

def test_data_simple_case(self):
    class TestSpider(Spider):
        def prepare(self):
            # pylint: disable=attribute-defined-outside-init
            self.data_processed = []

        def task_page(self, unused_grab, unused_task):
            yield Data('foo', number=1)

        def data_foo(self, number):
            self.data_processed.append(number)

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertEqual(bot.data_processed, [1])

def test_setup_proxylist(self):
    with temp_file() as proxy_file:
        content = '\n'.join(x['proxy'] for x in
                            self.extra_servers.values())
        with open(proxy_file, 'w') as out:
            out.write(content)

        # Simple test, one task
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist(proxy_file, 'text_file')
        bot.setup_queue()
        bot.add_task(Task('baz', grab=Grab(url='http://yandex.ru',
                                           debug=True)))
        bot.run()

        serv = [x['server'] for x in self.extra_servers.values()
                if x['server'].request['done']][0]
        self.assertEqual(serv.request['headers']['host'], 'yandex.ru')
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))

def test_stat_error_name_threaded_urllib3(self):
    server = self.server
    server.response['sleep'] = 2

    class SimpleSpider(Spider):
        def prepare(self):
            self.network_try_limit = 1

        def task_generator(self):
            grab = Grab(url=server.get_url(), timeout=1)
            yield Task('page', grab=grab)

        def task_page(self, grab, unused_task):
            pass

    bot = build_spider(SimpleSpider)
    bot.run()
    self.assertTrue('error:read-timeout-error' in bot.stat.counters)

def test_complex_data(self):
    class TestSpider(Spider):
        def prepare(self):
            # pylint: disable=attribute-defined-outside-init
            self.data_processed = []

        def task_page(self, unused_grab, unused_task):
            yield Data('foo', one=1, two=2, bar='gaz')

        def data_foo(self, one, two, **kwargs):
            self.data_processed.append(one)
            self.data_processed.append(two)
            self.data_processed.append(kwargs)

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertEqual(bot.data_processed, [1, 2, {'bar': 'gaz'}])

def test_fallback_handler_by_default_name(self):
    class TestSpider(Spider):
        def prepare(self):
            # pylint: disable=attribute-defined-outside-init
            self.points = []

        def task_page(self, grab, task):
            pass

        def task_page_fallback(self, unused_task):
            self.points.append(1)

    self.server.response['code'] = 403

    bot = build_spider(TestSpider, network_try_limit=1)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertEqual(bot.points, [1])

def test_too_many_redirects(self):
    class TestSpider(Spider):
        def task_page(self, unused_grab, unused_task):
            pass

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    self.server.response['headers'] = [
        ('Location', self.server.get_url()),
    ]
    self.server.response['code'] = 302
    bot.run()
    self.assertEqual(
        1, len(bot.stat.collections['network-count-rejected']))
    self.assertTrue('error:too-many-redirects' in bot.stat.counters)

def test_grab_attribute_exception(self):
    server = self.server
    server.response['sleep'] = 2

    class SimpleSpider(Spider):
        def task_generator(self):
            grab = Grab()
            grab.setup(url=server.get_url(), timeout=1)
            yield Task('page', grab=grab, raw=True)

        def task_page(self, grab, unused_task):
            self.meta['exc'] = grab.exception

    bot = build_spider(SimpleSpider)
    bot.run()
    self.assertTrue(isinstance(bot.meta['exc'], GrabTimeoutError))

def test_redirect_with_invalid_url(self):
    server = self.server

    class TestSpider(Spider):
        def task_generator(self):
            # pylint: disable=attribute-defined-outside-init
            self.done_counter = 0
            # pylint: enable=attribute-defined-outside-init
            yield Task('page', url=server.get_url())

        def task_page(self, grab, task):
            pass

    self.server.response_once['code'] = 301
    self.server.response_once['headers'] = [
        ('Location', INVALID_URL),
    ]
    bot = build_spider(TestSpider, network_try_limit=1)
    bot.run()

def test_setup_proxylist2(self):
    with temp_file() as proxy_file:
        content = '\n'.join(x['proxy'] for x in
                            self.extra_servers.values())
        with open(proxy_file, 'w') as out:
            out.write(content)

        # By default auto_change is True
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist(proxy_file, 'text_file')
        bot.setup_queue()
        for _ in six.moves.range(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()

        servers = [x['server'] for x in self.extra_servers.values()
                   if x['server'].request['done']]
        for serv in servers:
            self.assertEqual(serv.request['headers']['host'], 'yandex.ru')
        self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)

def test_schedule(self):
    """Create a number of delayed tasks and check the order
    in which they are executed."""
    server = self.server

    class TestSpider(Spider):
        def task_generator(self):
            yield Task('page', url=server.get_url(), delay=1.5, num=3)
            yield Task('page', url=server.get_url(), delay=4.5, num=2)
            yield Task('page', url=server.get_url(), delay=3, num=4)
            yield Task('page', url=server.get_url(), num=1)

        def task_page(self, unused_grab, task):
            self.stat.collect('numbers', task.num)

    bot = build_spider(TestSpider, thread_number=1)
    self.setup_queue(bot)
    bot.run()
    self.assertEqual(bot.stat.collections['numbers'], [1, 3, 4, 2])