def test_close(self):
     """Closing the scheduler must stop the manager and publish final stats."""
     scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1, fr2, fr3])
     # Three requests queued, one next_request() call: the test expects
     # two to be reported as pending once the scheduler is closed.
     scheduler.next_request()
     scheduler.frontier.manager.iteration = 5
     scheduler.close('reason')

     assert scheduler.frontier.manager._stopped is True
     stats = scheduler.stats_manager.stats
     assert stats.get_value('frontera/pending_requests_count') == 2
     assert stats.get_value('frontera/iterations') == 5
Beispiel #2
0
 def test_close(self):
     """Verify the shutdown path: manager flagged as stopped, stats recorded."""
     sched = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     sched.open(Spider)
     sched.frontier.manager.put_requests([fr1, fr2, fr3])
     sched.next_request()
     # The iteration counter is set by hand so the value read back from the
     # stats after close() can be checked against a known number.
     sched.frontier.manager.iteration = 5
     sched.close('reason')

     assert sched.frontier.manager._stopped is True
     collected = sched.stats_manager.stats
     assert collected.get_value('frontera/pending_requests_count') == 2
     assert collected.get_value('frontera/iterations') == 5
Beispiel #3
0
 def test_next_request_overused_keys_info(self):
     """Check the key type and overused keys passed to the frontier manager."""
     settings = Settings()
     settings['CONCURRENT_REQUESTS_PER_DOMAIN'] = 0
     settings['CONCURRENT_REQUESTS_PER_IP'] = 5
     crawler = FakeCrawler(settings)
     # slot_dict maps an IP to a (slot.active, slot.concurrency) pair.
     # Only the length of the slot.active list matters here.
     slot_dict = {
         '1.2.3': ([0] * 3, 1),
         '2.1.3': ([0] * 30, 2),
         '3.2.2': ([0] * 5, 1),
         '4.1.3': ([0] * 110, 20),
     }
     crawler.set_slots(slot_dict)
     scheduler = FronteraScheduler(crawler, manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1])

     request = scheduler.next_request()
     assert request.url == fr1.url
     assert isinstance(request, Request)

     # Keyword arguments recorded for the first get_next_requests call.
     first_call_kwargs = scheduler.frontier.manager.get_next_requests_kwargs[0]
     assert first_call_kwargs['key_type'] == 'ip'
     assert set(first_call_kwargs['overused_keys']) == {'2.1.3', '4.1.3'}

     stats = scheduler.stats_manager.stats
     assert stats.get_value('frontera/returned_requests_count') == 1
 def test_next_request_manager_finished(self):
     """A finished manager yields no request and no returned-requests stat."""
     scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1])
     # A request is queued, but the manager is marked as finished before
     # the scheduler is asked for it.
     scheduler.frontier.manager.finished = True

     assert scheduler.next_request() is None
     stats = scheduler.stats_manager.stats
     assert stats.get_value('frontera/returned_requests_count') is None
Beispiel #5
0
 def test_next_request_manager_finished(self):
     """next_request() must return None once the manager reports finished."""
     sched = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     sched.open(Spider)
     sched.frontier.manager.put_requests([fr1])
     sched.frontier.manager.finished = True

     next_one = sched.next_request()
     assert next_one is None
     # The returned-requests counter is expected to be absent from the stats.
     returned = sched.stats_manager.stats.get_value(
         'frontera/returned_requests_count')
     assert returned is None
 def test_next_request(self):
     """Three queued requests come back as Request objects and are counted."""
     scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1, fr2, fr3])

     fetched = [scheduler.next_request() for _ in range(3)]

     # URLs are compared as sets, so the order of retrieval is not checked.
     fetched_urls = {req.url for req in fetched}
     assert fetched_urls == {fr1.url, fr2.url, fr3.url}
     assert all(isinstance(req, Request) for req in fetched)
     stats = scheduler.stats_manager.stats
     assert stats.get_value('frontera/returned_requests_count') == 3
Beispiel #7
0
 def test_next_request(self):
     """Pull three requests from the scheduler and check URLs, types and stats."""
     sched = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     sched.open(Spider)
     sched.frontier.manager.put_requests([fr1, fr2, fr3])

     pulled = [sched.next_request() for _ in range(3)]

     # Set comparison: the test does not depend on the order of retrieval.
     expected_urls = {fr1.url, fr2.url, fr3.url}
     assert {item.url for item in pulled} == expected_urls
     assert all(isinstance(item, Request) for item in pulled)
     returned = sched.stats_manager.stats.get_value(
         'frontera/returned_requests_count')
     assert returned == 3
 def test_next_request_overused_keys_info(self):
     """Check that the manager receives key_type 'ip' and the expected overused keys."""
     settings = Settings()
     settings['CONCURRENT_REQUESTS_PER_DOMAIN'] = 0
     settings['CONCURRENT_REQUESTS_PER_IP'] = 5
     crawler = FakeCrawler(settings)
     # Each slot_dict key is an IP. The value pairs the slot.active list
     # (only its length is needed) with slot.concurrency.
     slot_dict = {
         '1.2.3': ([0] * 3, 1),
         '2.1.3': ([0] * 30, 2),
         '3.2.2': ([0] * 5, 1),
         '4.1.3': ([0] * 110, 20),
     }
     crawler.set_slots(slot_dict)
     sched = FronteraScheduler(crawler, manager=FakeFrontierManager)
     sched.open(Spider)
     sched.frontier.manager.put_requests([fr1])

     request = sched.next_request()
     assert request.url == fr1.url
     assert isinstance(request, Request)

     # Keyword arguments recorded for the first get_next_requests call.
     recorded = sched.frontier.manager.get_next_requests_kwargs[0]
     assert recorded['key_type'] == 'ip'
     assert set(recorded['overused_keys']) == {'2.1.3', '4.1.3'}
     returned = sched.stats_manager.stats.get_value(
         'frontera/returned_requests_count')
     assert returned == 1