def test_timeout(self):
    bot = build_spider(SimpleSpider, meta={'server': self.server})
    self.setup_cache(bot)
    bot.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('one', self.server.get_url()))
    bot.add_task(Task('one', self.server.get_url(), delay=2))
    bot.run()
    self.assertEqual(2, bot.stat.counters['spider:request'])
    self.assertEqual(1, bot.stat.counters['spider:request-cache'])
    self.assertEqual([1, 1], bot.stat.collections['resp_counters'])

    bot = build_spider(SimpleSpider, meta={'server': self.server},
                       parser_pool_size=1)
    self.setup_cache(bot)
    # Do not clear the cache
    # bot.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('one', self.server.get_url(), priority=1))
    bot.add_task(Task('one', self.server.get_url(), priority=2,
                      cache_timeout=0, delay=1))
    bot.add_task(Task('one', self.server.get_url(), priority=3,
                      cache_timeout=10, delay=3))
    bot.add_task(Task('one', self.server.get_url(), priority=4,
                      cache_timeout=0, delay=4))
    bot.run()
    self.assertEqual([1, 2, 2, 3], bot.stat.collections['resp_counters'])
def test_setup_grab(self):
    # Simple test, one task
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.setup_grab(proxy=PROXY1)
    bot.setup_queue()
    bot.add_task(Task('baz', 'http://yandex.ru'))
    bot.run()
    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertEqual(set(bot.stat.collections['ports']),
                     set([self.server.port]))
    self.assertEqual(1, len(set(bot.stat.collections['ports'])))

    content = '%s\n%s' % (PROXY1, PROXY2)
    open('/tmp/__proxy.txt', 'w').write(content)

    # If proxy is configured with both methods
    # (setup_grab and load_proxylist)
    # then proxylist has priority
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
    bot.setup_queue()
    for x in six.moves.range(10):
        bot.add_task(Task('baz', 'http://yandex.ru'))
    bot.setup_grab(proxy=PROXY3)
    bot.run()
    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertTrue(EXTRA_PORT2 not in bot.stat.collections['ports'])
    self.assertTrue(EXTRA_PORT2 not in set(bot.stat.collections['ports']))
def test_task_priority(self):
    # Automatic random priority
    grab.spider.base.RANDOM_TASK_PRIORITY_RANGE = (10, 20)
    bot = build_spider(SimpleSpider, priority_mode='random')
    bot.setup_queue()
    task = Task('baz', url='xxx')
    self.assertEqual(task.priority, None)
    bot.add_task(task)
    self.assertTrue(10 <= task.priority <= 20)

    # Automatic constant priority
    grab.spider.base.DEFAULT_TASK_PRIORITY = 33
    bot = build_spider(SimpleSpider, priority_mode='const')
    bot.setup_queue()
    task = Task('baz', url='xxx')
    self.assertEqual(task.priority, None)
    bot.add_task(task)
    self.assertEqual(33, task.priority)

    # Automatic priority does not override an explicitly set priority
    grab.spider.base.DEFAULT_TASK_PRIORITY = 33
    bot = build_spider(SimpleSpider, priority_mode='const')
    bot.setup_queue()
    task = Task('baz', url='xxx', priority=1)
    self.assertEqual(1, task.priority)
    bot.add_task(task)
    self.assertEqual(1, task.priority)

    self.assertRaises(SpiderMisuseError,
                      lambda: SimpleSpider(priority_mode='foo'))
def test_task_cache_timeout(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            self.stat.collect('points', grab.doc.body)

    bot = build_spider(TestSpider, parser_pool_size=1)
    self.setup_cache(bot)
    bot.cache.clear()
    bot.setup_queue()
    # This task will receive the first item from the `get.data` iterator
    bot.add_task(Task('page', url=self.server.get_url()))
    # This task will be spawned in 1 second and will
    # receive cached data (cache timeout = 10 sec > 1 sec)
    bot.add_task(Task('page', url=self.server.get_url(),
                      delay=1, cache_timeout=10))
    # This task will be spawned in 2 seconds and will not
    # receive cached data (cache timeout = 0.5 sec < 2 sec).
    # So, this task will receive the next item from the `get.data` iterator
    bot.add_task(Task('page', url=self.server.get_url(),
                      delay=2, cache_timeout=0.5))
    self.server.response['get.data'] = iter([b'a', b'b'])
    bot.run()
    self.assertEqual(bot.stat.collections['points'], [b'a', b'a', b'b'])
def test_delay_error(self):
    bot = build_spider(self.SimpleSpider)
    self.setup_queue(bot)
    bot.task_queue.clear()
    self.assertRaises(SpiderMisuseError, bot.add_task,
                      Task('page', url=self.server.get_url(), delay=1))
def test_task_limit(self):
    self.server.response['get.data'] = 'Hello spider!'
    self.server.response['sleep'] = 1.1
    bot = build_spider(self.SimpleSpider, network_try_limit=1)
    bot.setup_grab(connect_timeout=1, timeout=1)
    bot.setup_queue()
    bot.add_task(Task('baz', self.server.get_url()))
    bot.run()
    self.assertEqual(bot.stat.counters['spider:task-baz'], 1)

    bot = build_spider(self.SimpleSpider, task_try_limit=2)
    bot.setup_queue()
    bot.add_task(Task('baz', self.server.get_url(), task_try_count=3))
    bot.run()
    self.assertEqual(bot.stat.counters['spider:request-network'], 0)
def test_spider_mp_changes(self):
    bot = build_spider(self.SimpleSpider)
    bot.setup_queue()
    bot.meta['url'] = self.server.get_url()
    bot.add_task(Task('page', self.server.get_url()))
    bot.run()
    self.assertEqual(2, bot.foo_count)
def test_task_callback(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            self.meta['tokens'].append('0_handler')

    class FuncWithState(object):
        def __init__(self, tokens):
            self.tokens = tokens

        def __call__(self, grab, task):
            self.tokens.append('1_func')

    tokens = []
    func = FuncWithState(tokens)

    bot = build_spider(TestSpider)
    bot.meta['tokens'] = tokens
    bot.setup_queue()
    # classic handler
    bot.add_task(Task('page', url=self.server.get_url()))
    # the callback option overrides the classic handler
    bot.add_task(Task('page', url=self.server.get_url(), callback=func))
    # callback and null task name
    bot.add_task(Task(name=None, url=self.server.get_url(), callback=func))
    # callback and default task name
    bot.add_task(Task(url=self.server.get_url(), callback=func))
    bot.run()
    self.assertEqual(['0_handler', '1_func', '1_func', '1_func'],
                     sorted(tokens))
def test_counter(self):
    bot = build_spider(SimpleSpider, meta={'server': self.server})
    self.setup_cache(bot)
    bot.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('one', self.server.get_url()))
    bot.run()
    self.assertEqual([1], bot.stat.collections['resp_counters'])
def test_generator_with_invalid_url(self):
    class SomeSpider(Spider):
        def task_generator(self):
            yield Task('page', url=INVALID_URL)

    bot = build_spider(SomeSpider)
    bot.run()
def test_task_nohandler_error(self):
    class TestSpider(Spider):
        pass

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    self.assertRaises(NoTaskHandler, bot.run)
def test_task_retry(self):
    self.server.response['get.data'] = 'xxx'
    self.server.response_once['code'] = 403
    bot = build_spider(self.SimpleSpider)
    bot.setup_queue()
    bot.add_task(Task('baz', self.server.get_url()))
    bot.run()
    self.assertEqual(b'xxx', bot.stat.collections['SAVED_ITEM'][0])
def test_spider(self):
    self.server.response['get.data'] = 'Hello spider!'
    self.server.response['sleep'] = 0
    bot = build_spider(self.SimpleSpider)
    bot.setup_queue()
    bot.add_task(Task('baz', self.server.get_url()))
    bot.run()
    self.assertEqual(b'Hello spider!', bot.stat.collections['SAVED_ITEM'][0])
def test_fatal_error(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            raise FatalError

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    self.assertRaises(FatalError, bot.run)
def test_setup_grab(self):
    # Multiple calls to `setup_grab` should accumulate
    # changes in the config object.
    bot = build_spider(self.SimpleSpider)
    bot.setup_grab(log_dir='/tmp')
    bot.setup_grab(timeout=30)
    grab = bot.create_grab_instance()
    self.assertEqual(grab.config['log_dir'], '/tmp')
    self.assertEqual(grab.config['timeout'], 30)
def test_spider_nonmp_changes(self):
    """Test that, in non-multiprocess mode, changes made inside a handler
    are applied to the main spider instance."""
    bot = build_spider(self.SimpleSpider)
    bot.setup_queue()
    bot.meta['url'] = self.server.get_url()
    bot.add_task(Task('page', self.server.get_url()))
    bot.run()
    self.assertEqual(4, bot.foo_count)
def test_data_nohandler_error(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            yield Data('foo', num=1)

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    self.assertRaises(NoDataHandler, bot.run)
def test_something(self):
    bot = build_spider(SimpleSpider, meta={'server': self.server},
                       parser_pool_size=1)
    self.setup_cache(bot)
    bot.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('foo', self.server.get_url()))
    bot.run()
    self.assertEqual([1, 1, 1, 2], bot.stat.collections['resp_counters'])
def test_clear(self):
    bot = build_spider(self.SimpleSpider)
    self.setup_queue(bot)
    bot.task_queue.clear()
    for x in six.moves.range(5):
        bot.add_task(Task('page', url=self.server.get_url()))
    self.assertEqual(5, bot.task_queue.size())
    bot.task_queue.clear()
    self.assertEqual(0, bot.task_queue.size())
def test_connection_kwargs(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            pass

    config = deepcopy(MONGODB_CONNECTION)
    # Set port that would go as **kwargs into MongoClient()
    MONGODB_CONNECTION.setdefault('port', 27017)
    bot = build_spider(TestSpider)
    bot.setup_cache(backend='mongo', **config)
def test_schedule_list_clear(self):
    bot = build_spider(self.SimpleSpider)
    self.setup_queue(bot)
    bot.task_queue.clear()
    for x in six.moves.range(5):
        bot.add_task(Task('page', url=self.server.get_url(), delay=x + 1))
    self.assertEqual(5, len(bot.task_queue.schedule_list))
    bot.task_queue.clear()
    self.assertEqual(0, len(bot.task_queue.schedule_list))
def test_task_raw(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            self.stat.collect('codes', grab.response.code)

    self.server.response['code'] = 502

    bot = build_spider(TestSpider, network_try_limit=1)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertEqual(0, len(bot.stat.collections['codes']))

    bot = build_spider(TestSpider, network_try_limit=1)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url(), raw=True))
    bot.add_task(Task('page', url=self.server.get_url(), raw=True))
    bot.run()
    self.assertEqual(2, len(bot.stat.collections['codes']))
def test_add_task_invalid_url_raise_error(self):
    class TestSpider(Spider):
        pass

    bot = build_spider(TestSpider)
    bot.setup_queue()
    self.assertRaises(SpiderError, bot.add_task,
                      Task('page', url='zz://zz'), raise_error=True)
    self.assertEqual(0, bot.task_queue.size())
    bot.add_task(Task('page', url='http://example.com/'))
    self.assertEqual(1, bot.task_queue.size())
def test_task_useragent(self):
    bot = build_spider(SimpleSpider)
    bot.setup_queue()
    g = Grab()
    g.setup(url=self.server.get_url())
    g.setup(user_agent='Foo')
    task = Task('baz', grab=g)
    bot.add_task(task.clone())
    bot.run()
    self.assertEqual(self.server.request['headers']['User-Agent'], 'Foo')
def test_only_cache_task(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            self.stat.collect('points', 1)

    bot = build_spider(TestSpider, only_cache=True)
    self.setup_cache(bot)
    bot.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('page', self.server.get_url()))
    bot.run()
    self.assertEqual(bot.stat.collections['points'], [])
def test_handler_result_none(self):
    class TestSpider(Spider):
        def prepare(self):
            self.points = []

        def task_page(self, grab, task):
            yield None

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
def test_cache_size(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            pass

    bot = build_spider(TestSpider)
    self.setup_cache(bot)
    bot.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('page', self.server.get_url()))
    bot.run()
    self.assertEqual(bot.cache.size(), 1)
def test_create_table(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            pass

    bot = build_spider(TestSpider)
    self.setup_cache(bot)
    bot.cache.cursor.execute('begin')
    bot.cache.cursor.execute('DROP TABLE cache')
    bot.cache.cursor.execute('commit')
    self.setup_cache(bot)
    bot.cache.clear()
    self.assertEqual(0, bot.cache.size())
def test_check_task_limits_invalid_value(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            pass

        def check_task_limits(self, task):
            return False, 'zz'

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url(),
                      fallback_name='fallback_zz'))
    self.assertRaises(SpiderError, bot.run)
def test_exception_from_data_handler(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            yield Data('foo', num=1)

        def data_foo(self, num):
            1 / 0

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertTrue('data_foo' in bot.stat.collections['fatal'][0])
def test_counters_and_collections(self):
    class TestSpider(Spider):
        def prepare(self):
            self.stat.logging_period = 0
            self.stat.inc('foo')

        def task_page_valid(self, grab, task):
            self.stat.inc('foo')

        def task_page_fail(self, grab, task):
            1 / 0

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page_valid', url=self.server.get_url()))
    bot.add_task(Task('page_fail', url=self.server.get_url()))
    bot.run()
    self.assertEqual(2, bot.stat.counters['foo'])
    self.assertEqual(1, len(bot.stat.collections['fatal']))
def test_task_url_and_grab_options(self):
    class TestSpider(Spider):
        def setup(self):
            # pylint: disable=attribute-defined-outside-init
            self.done = False

        def task_page(self, dummy_grab, dummy_task):
            # pylint: disable=attribute-defined-outside-init
            self.done = True

    bot = build_spider(TestSpider)
    bot.setup_queue()
    grab = Grab()
    grab.setup(url=self.server.get_url())
    self.assertRaises(SpiderMisuseError, Task, 'page', grab=grab,
                      url=self.server.get_url())
def test_setup_proxylist(self):
    with temp_file() as proxy_file:
        content = '\n'.join(x['proxy'] for x in
                            self.extra_servers.values())
        with open(proxy_file, 'w') as out:
            out.write(content)
        # Simple test, one task
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist(proxy_file, 'text_file')
        bot.setup_queue()
        bot.add_task(
            Task('baz', grab=Grab(url='http://yandex.ru', debug=True)))
        bot.run()
        serv = [x['server'] for x in self.extra_servers.values()
                if x['server'].request['done']][0]
        self.assertEqual(serv.request['headers']['host'], 'yandex.ru')
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))
def test_setup_proxylist2(self):
    with temp_file() as proxy_file:
        content = '\n'.join(x['proxy'] for x in
                            self.extra_servers.values())
        open(proxy_file, 'w').write(content)
        # By default auto_change is True
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist(proxy_file, 'text_file')
        bot.setup_queue()
        for _ in six.moves.range(10):
            bot.add_task(Task('baz', 'http://yandex.ru'))
        bot.run()
        servers = [x['server'] for x in self.extra_servers.values()
                   if x['server'].request['done']]
        for serv in servers:
            self.assertEqual(serv.request['headers']['host'], 'yandex.ru')
        self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)
def test_too_large_document(self):
    self.server.response['get.data'] = ContentGenerator(self.server)

    class TestSpider(Spider):
        def task_page(self, grab, task):
            pass

    # The maximum BSON document size is 16 megabytes.
    self.server.response['get.data'] = 'x' * (1024 * 1024 * 17)
    bot = build_spider(TestSpider)
    self.setup_cache(bot, use_compression=False)
    bot.cache_pipeline.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    patch = mock.Mock()
    with mock.patch('logging.error', patch):
        bot.run()
    self.assertEqual(bot.cache_pipeline.cache.size(), 0)
    self.assertTrue('Document too large' in patch.call_args[0][0])
def test_fallback_handler_by_default_name(self):
    class TestSpider(Spider):
        def prepare(self):
            # pylint: disable=attribute-defined-outside-init
            self.points = []

        def task_page(self, grab, task):
            pass

        def task_page_fallback(self, dummy_task):
            self.points.append(1)

    self.server.response['code'] = 403
    bot = build_spider(TestSpider, network_try_limit=1)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertEqual(bot.points, [1])
def test_complex_data(self):
    class TestSpider(Spider):
        def prepare(self):
            # pylint: disable=attribute-defined-outside-init
            self.data_processed = []

        def task_page(self, dummy_grab, dummy_task):
            yield Data('foo', one=1, two=2, bar='gaz')

        def data_foo(self, one, two, **kwargs):
            self.data_processed.append(one)
            self.data_processed.append(two)
            self.data_processed.append(kwargs)

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertEqual(bot.data_processed, [1, 2, {'bar': 'gaz'}])
def test_stat_error_name_threaded_urllib3(self):
    server = self.server
    server.response['sleep'] = 2

    class SimpleSpider(Spider):
        def prepare(self):
            self.network_try_limit = 1

        def task_generator(self):
            grab = Grab(url=server.get_url(), timeout=1)
            yield Task('page', grab=grab)

        def task_page(self, grab, unused_task):
            pass

    bot = build_spider(SimpleSpider)
    bot.run()
    self.assertTrue('error:read-timeout-error' in bot.stat.counters)
def test_setup_proxylist5(self):
    content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
    open('/tmp/__proxy.txt', 'w').write(content)
    # Disable auto_change
    # Disable auto_init
    # Proxylist will not be used by default
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file',
                       auto_change=False, auto_init=False)
    bot.setup_queue()
    for x in six.moves.range(10):
        bot.add_task(Task('baz', self.server.get_url()))
    bot.run()
    self.assertEqual(self.server.request['headers'].get('host'),
                     '%s:%s' % (ADDRESS, self.server.port))
    self.assertEqual(1, len(set(bot.stat.collections['ports'])))
    self.assertEqual(bot.stat.collections['ports'][0], self.server.port)
def test_update_grab_instance(self):
    class TestSpider(Spider):
        def update_grab_instance(self, grab):
            grab.setup(timeout=77)

        def task_generator(self):
            yield Task('page', url=self.meta['server'].get_url())
            yield Task('page', grab=Grab(url=self.meta['server'].get_url(),
                                         timeout=1))

        def task_page(self, grab, dummy_task):
            self.stat.collect('points', grab.config['timeout'])

    bot = build_spider(TestSpider, meta={'server': self.server})
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.add_task(Task('page', grab=Grab(url=self.server.get_url(),
                                        timeout=1)))
    bot.run()
    self.assertEqual(set([77]), set(bot.stat.collections['points']))
def test_redirect_with_invalid_url(self):
    server = self.server

    class TestSpider(Spider):
        def task_generator(self):
            # pylint: disable=attribute-defined-outside-init
            self.done_counter = 0
            # pylint: enable=attribute-defined-outside-init
            yield Task('page', url=server.get_url())

        def task_page(self, grab, task):
            pass

    self.server.response_once['code'] = 301
    self.server.response_once['headers'] = [
        ('Location', INVALID_URL),
    ]
    bot = build_spider(TestSpider, network_try_limit=1)
    bot.run()
def test_multiple_internal_worker_error(self):
    class TestSpider(Spider):
        """Spider subclass that contains a fatal bug in its
        overridden `process_network_result` method."""
        # pylint: disable=unused-argument
        def process_network_result(self, *args, **kwargs):
            raise Exception('Shit happens!')
        # pylint: enable=unused-argument

        def task_page(self, dummy_grab, dummy_task):
            pass

    bot = build_spider(TestSpider)
    bot.setup_queue()
    for _ in range(5):
        bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertTrue(bot.stat.counters['parser-pipeline-restore'] > 1)
def test_setup_proxylist5(self):
    with temp_file() as proxy_file:
        content = '\n'.join(x['proxy'] for x in
                            self.extra_servers.values())
        open(proxy_file, 'w').write(content)
        # Disable auto_change
        # Disable auto_init
        # Proxylist will not be used by default
        bot = build_spider(SimpleSpider, thread_number=1)
        bot.load_proxylist(proxy_file, 'text_file',
                           auto_change=False, auto_init=False)
        bot.setup_queue()
        for _ in six.moves.range(10):
            bot.add_task(Task('baz', self.server.get_url()))
        bot.run()
        self.assertEqual(self.server.request['headers'].get('host'),
                         '%s:%s' % (ADDRESS, self.server.port))
        self.assertEqual(1, len(set(bot.stat.collections['ports'])))
        self.assertEqual(bot.stat.collections['ports'][0], self.server.port)
def test_null_grab_bug(self):
    # Test the following bug:
    # * create a task and process it
    # * in the task handler, spawn another task with the grab instance
    #   received in the arguments of the current task
    server = self.server

    class SimpleSpider(Spider):
        def task_generator(self):
            yield Task('one', url=server.get_url())

        def task_one(self, grab, task):
            self.stat.inc('page_count')
            yield Task('two', grab=grab)

        def task_two(self, grab, task):
            self.stat.inc('page_count')

    bot = build_spider(SimpleSpider, thread_number=1)
    bot.run()
    self.assertEqual(2, bot.stat.counters['page_count'])
def test_things_yielded_from_data_handler(self):
    server = self.server

    class TestSpider(Spider):
        def prepare(self):
            self.data_processed = []

        def task_page(self, grab, task):
            yield Data('foo', count=task.get('count', 1))

        def data_foo(self, count):
            self.data_processed.append(count)
            if count == 1:
                yield Data('foo', count=666)
                yield Task('page', url=server.get_url(),
                           count=count + 1)

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.run()
    self.assertEqual(bot.data_processed, [1, 666, 2])
def test_schedule(self):
    """Create a number of delayed tasks and then check
    the order in which they were executed."""
    server = self.server

    class TestSpider(Spider):
        def task_generator(self):
            yield Task('page', url=server.get_url(), delay=1.5, num=3)
            yield Task('page', url=server.get_url(), delay=4.5, num=2)
            yield Task('page', url=server.get_url(), delay=3, num=4)
            yield Task('page', url=server.get_url(), num=1)

        def task_page(self, dummy_grab, task):
            self.stat.collect('numbers', task.num)

    bot = build_spider(TestSpider, thread_number=1)
    self.setup_queue(bot)
    bot.run()
    self.assertEqual(bot.stat.collections['numbers'], [1, 3, 4, 2])
def test_fallback_handler_by_fallback_name(self):
    class TestSpider(Spider):
        def prepare(self):
            self.points = []

        def task_page(self, grab, task):
            pass

        def fallback_zz(self, task):
            self.points.append(1)

    self.server.response['code'] = 403
    bot = build_spider(TestSpider, network_try_limit=1)
    bot.setup_queue()
    bot.add_task(
        Task('page', url=self.server.get_url(),
             fallback_name='fallback_zz'))
    bot.run()
    self.assertEqual(bot.points, [1])
def test_spider_custom_proxy_source(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            self.stat.collect(
                'ports', int(grab.response.headers.get('Listen-Port', 0)))

    class CustomProxySource(BaseProxySource):
        def load(self):
            return [
                Proxy(ADDRESS, TEST_SERVER_PORT, None, None, 'http'),
            ]

    bot = build_spider(TestSpider)
    bot.setup_queue()
    bot.load_proxylist(CustomProxySource())
    bot.add_task(Task('page', url='http://yandex.ru/'))
    bot.run()
    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertEqual(set(bot.stat.collections['ports']),
                     set([TEST_SERVER_PORT]))
def test_task_queue_clear(self):
    class TestSpider(Spider):
        def task_page(self, grab, task):
            self.stop()

        def task_keyboard_interrupt_page(self, grab, task):
            raise KeyboardInterrupt

    bot = build_spider(TestSpider)
    bot.setup_queue()
    for _ in six.moves.range(5):
        bot.add_task(Task('page', url=self.server.get_url()))
    self.assertEqual(5, bot.task_queue.size())
    bot.run()
    self.assertEqual(0, bot.task_queue.size())

    for _ in six.moves.range(5):
        bot.add_task(
            Task('keyboard_interrupt_page', url=self.server.get_url()))
    self.assertEqual(5, bot.task_queue.size())
    bot.run()
    self.assertEqual(0, bot.task_queue.size())
def test_no_warning(self):
    """A simple spider should not generate any warnings
    (the warnings module sends its messages to stderr)."""
    out = StringIO()
    with mock.patch('sys.stderr', out):
        server = self.server
        server.response['data'] = b'<div>test</div>'

        class SimpleSpider(Spider):
            # pylint: disable=unused-argument
            initial_urls = [server.get_url()]

            def task_initial(self, grab, task):
                yield Task('more', url=server.get_url())

            def task_more(self, grab, task):
                grab.doc('//div').text()

        bot = build_spider(SimpleSpider)
        bot.run()
    self.assertTrue(out.getvalue() == '')
def test_has_item(self):
    self.server.response['get.data'] = ContentGenerator(self.server)

    class TestSpider(Spider):
        def task_page(self, grab, task):
            pass

    bot = build_spider(TestSpider)
    self.setup_cache(bot)
    bot.cache_pipeline.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.add_task(Task('page', url=self.server.get_url('/foo')))
    bot.run()
    self.assertTrue(bot.cache_pipeline.cache
                    .has_item(self.server.get_url()))
    self.assertTrue(bot.cache_pipeline.cache
                    .has_item(self.server.get_url(), timeout=100))
    self.assertFalse(bot.cache_pipeline.cache
                     .has_item(self.server.get_url(), timeout=0))
    self.assertTrue(bot.cache_pipeline.cache
                    .has_item(self.server.get_url('/foo')))
    self.assertFalse(bot.cache_pipeline.cache
                     .has_item(self.server.get_url('/bar')))
def test_create_grab_instance(self):
    class TestSpider(Spider):
        def create_grab_instance(self, **kwargs):
            grab = super(TestSpider, self).create_grab_instance(**kwargs)
            grab.setup(timeout=77)
            return grab

        def task_generator(self):
            yield Task('page', url=self.meta['server'].get_url())
            yield Task('page', grab=Grab(url=self.meta['server'].get_url(),
                                         timeout=76))

        def task_page(self, grab, unused_task):
            self.stat.collect('points', grab.config['timeout'])

    bot = build_spider(TestSpider, meta={'server': self.server})
    bot.setup_queue()
    bot.add_task(Task('page', url=self.server.get_url()))
    bot.add_task(Task('page', grab=Grab(url=self.server.get_url(),
                                        timeout=75)))
    bot.run()
    self.assertEqual(set([77, 76, 75]),
                     set(bot.stat.collections['points']))
def test_bug1(self):
    # Test the bug:
    # * enable cache
    # * fetch document (it goes to cache)
    # * request same URL
    # * got exception
    server = self.server

    class Bug1Spider(Spider):
        def task_foo(self, grab, task):
            grab.setup(url=server.get_url())
            yield Task('bar', grab=grab)

        def task_bar(self, grab, task):
            pass

    bot = build_spider(Bug1Spider)
    self.setup_cache(bot)
    bot.cache.clear()
    bot.setup_queue()
    bot.add_task(Task('foo', self.server.get_url()))
    bot.run()
def test_inline_task(self):
    def callback(self):
        self.write(self.request.uri)
        self.finish()

    self.server.response['get.callback'] = callback
    server = self.server

    class TestSpider(Spider):
        def add_response(self, grab):
            self.stat.collect('responses', grab.doc.unicode_body())

        def task_generator(self):
            url = server.get_url('/?foo=start')
            yield Task('inline', url=url)

        def subroutine_task(self, grab):
            for x in six.moves.range(2):
                url = server.get_url('/?foo=subtask%s' % x)
                grab.setup(url=url)
                grab = yield Task(grab=grab)
                self.add_response(grab)
                self.stat.collect('calls', 'subinline%s' % x)

        @inline_task
        def task_inline(self, grab, task):
            self.add_response(grab)
            self.stat.collect('calls', 'generator')

            for x in six.moves.range(3):
                url = server.get_url('/?foo=%s' % x)
                grab.setup(url=url)
                grab = yield Task(grab=grab)
                self.add_response(grab)
                self.stat.collect('calls', 'inline%s' % x)

                grab = yield self.subroutine_task(grab)
                # In this case the grab body will be the same
                # as in the subroutine task: /?foo=subtask1
                self.add_response(grab)

            url = server.get_url('/?foo=yield')
            self.add_task(Task('yield', url=url))

        def task_yield(self, grab, task):
            self.add_response(grab)
            self.stat.collect('calls', 'yield')
            url = server.get_url('/?foo=end')
            yield Task('end', url=url)

        def task_end(self, grab, task):
            self.add_response(grab)
            self.stat.collect('calls', 'end')

    bot = build_spider(TestSpider)
    bot.run()

    self.assertEqual([
        '/?foo=start', '/?foo=0',
        '/?foo=subtask0', '/?foo=subtask1', '/?foo=subtask1',
        '/?foo=1',
        '/?foo=subtask0', '/?foo=subtask1', '/?foo=subtask1',
        '/?foo=2',
        '/?foo=subtask0', '/?foo=subtask1', '/?foo=subtask1',
        '/?foo=yield', '/?foo=end',
    ], bot.stat.collections['responses'])
    self.assertEqual([
        'generator',
        'inline0', 'subinline0', 'subinline1',
        'inline1', 'subinline0', 'subinline1',
        'inline2', 'subinline0', 'subinline1',
        'yield', 'end',
    ], bot.stat.collections['calls'])
def test_integrity_decorator_in_mp_mode(self):
    bot = build_spider(self.SimpleSpider)
    bot.setup_queue()
    bot.add_task(Task('page2', self.server.get_url()))
    bot.run()
def test_setup_proxylist(self):
    content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
    open('/tmp/__proxy.txt', 'w').write(content)

    # Simple test, one task
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
    bot.setup_queue()
    bot.add_task(Task('baz', grab=Grab(url='http://yandex.ru',
                                       debug=True)))
    bot.run()
    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertEqual(1, len(set(bot.stat.collections['ports'])))

    # By default auto_change is True
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
    bot.setup_queue()
    for x in six.moves.range(10):
        bot.add_task(Task('baz', 'http://yandex.ru'))
    bot.run()
    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)

    # Do the same test with the load_proxylist method
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file')
    bot.setup_queue()
    for x in six.moves.range(10):
        bot.add_task(Task('baz', 'http://yandex.ru'))
    bot.run()
    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertTrue(len(set(bot.stat.collections['ports'])) > 1)

    # Disable auto_change
    # By default auto_init is True
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file', auto_change=False)
    bot.setup_queue()
    for x in six.moves.range(10):
        bot.add_task(Task('baz', 'http://yandex.ru'))
    bot.run()
    self.assertEqual(self.server.request['headers']['host'], 'yandex.ru')
    self.assertEqual(1, len(set(bot.stat.collections['ports'])))

    # Disable auto_change
    # Disable auto_init
    # Proxylist will not be used by default
    bot = build_spider(SimpleSpider, thread_number=1)
    bot.load_proxylist('/tmp/__proxy.txt', 'text_file',
                       auto_change=False, auto_init=False)
    bot.setup_queue()
    for x in six.moves.range(10):
        bot.add_task(Task('baz', self.server.get_url()))
    bot.run()
    self.assertEqual(self.server.request['headers'].get('host'),
                     '%s:%s' % (ADDRESS, self.server.port))
    self.assertEqual(1, len(set(bot.stat.collections['ports'])))
    self.assertEqual(bot.stat.collections['ports'][0], self.server.port)
def test_clear_collection(self):
    bot = build_spider(self.SimpleSpider)
    self.setup_queue(bot)
    bot.task_queue.clear()
def test_task_queue_render_stats(self):
    bot = build_spider(self.SimpleSpider)
    bot.render_stats()
def test_stop_timer_invalid_input(self):
    class TestSpider(Spider):
        pass

    bot = build_spider(TestSpider)
    self.assertRaises(KeyError, bot.timer.stop, 'zzz')