Esempio n. 1
0
 def test_process_spider_output(self):
     """Spider output is passed through while the frontier records the crawl.

     Sends three requests and two items through ``process_spider_output``
     and checks that all five entries come back (requests in the leading
     positions), that the fake manager received the response and the links
     as frontier objects, and that the crawl statistics were updated.
     """
     def item_sort_key(entry):
         return sorted(entry['item'])

     first_item = {'name': 'item', 'item': 'i1'}
     second_item = {'name': 'item', 'item': 'i2'}
     spider_requests = [r1, r2, r3]
     request_count = len(spider_requests)
     result = spider_requests + [first_item, second_item]
     origin = Request(fr1.url, meta={b'frontier_request': fr1})
     resp = Response(fr1.url, request=origin)
     scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     scheduler.open(Spider)

     out = list(scheduler.process_spider_output(resp, result, Spider))
     assert len(out) == len(result)
     # The first ``request_count`` entries must be the requests.
     assert {req.url for req in out[:request_count]} == \
         {req.url for req in spider_requests}
     assert sorted(out[request_count:], key=item_sort_key) == \
         sorted([first_item, second_item], key=item_sort_key)

     manager = scheduler.frontier.manager
     assert isinstance(manager.responses[0], FResponse)
     assert manager.responses[0].url == resp.url
     assert {link.url for link in manager.links} == {r1.url, r2.url, r3.url}
     assert all(isinstance(link, FRequest) for link in manager.links)

     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat('frontera/crawled_pages_count') == 1
     assert get_stat('frontera/crawled_pages_count/200') == 1
     assert get_stat('frontera/links_extracted_count') == 3
Esempio n. 2
0
 def test_next_request_overused_keys_info(self):
     """Check the overused-key hints handed to the manager by ``next_request``.

     With per-IP concurrency configured and four fake slots installed, the
     manager must be queried with ``key_type == 'ip'`` and with ``'2.1.3'``
     and ``'4.1.3'`` as the overused keys.
     """
     settings = Settings()
     settings['CONCURRENT_REQUESTS_PER_DOMAIN'] = 0
     settings['CONCURRENT_REQUESTS_PER_IP'] = 5
     crawler = FakeCrawler(settings)
     # Each row is (ip, length of the slot.active list, slot.concurrency);
     # only the length of the active list is needed, so it is zero-filled.
     slot_rows = (
         ('1.2.3', 3, 1),
         ('2.1.3', 30, 2),
         ('3.2.2', 5, 1),
         ('4.1.3', 110, 20),
     )
     slot_dict = {
         ip: ([0] * active_count, concurrency)
         for ip, active_count, concurrency in slot_rows
     }
     crawler.set_slots(slot_dict)
     scheduler = FronteraScheduler(crawler, manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1])

     request = scheduler.next_request()
     assert request.url == fr1.url
     assert isinstance(request, Request)

     first_call_kwargs = scheduler.frontier.manager.get_next_requests_kwargs[0]
     assert first_call_kwargs['key_type'] == 'ip'
     assert set(first_call_kwargs['overused_keys']) == {'2.1.3', '4.1.3'}
     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat('frontera/returned_requests_count') == 1
 def test_process_spider_output(self):
     """Items are yielded back while requests are handed to the frontier.

     Sends three requests and two items through ``process_spider_output``
     and checks that exactly the two items are yielded, that the fake
     manager received the response and the three links as frontier
     objects, and that the crawl statistics were updated.

     The yielded dicts are compared after sorting on their ``'item'``
     value: dicts are not orderable on Python 3, so ``sorted()`` without a
     key raises ``TypeError`` there.
     """
     def item_key(entry):
         return entry['item']

     i1 = {'name': 'item', 'item': 'i1'}
     i2 = {'name': 'item', 'item': 'i2'}
     result = [r1, r2, r3, i1, i2]
     resp = Response(fr1.url,
                     request=Request(fr1.url,
                                     meta={'frontier_request': fr1}))
     crawler = FakeCrawler()
     fs = FronteraScheduler(crawler, manager=FakeFrontierManager)
     fs.open(Spider)
     yielded = list(fs.process_spider_output(resp, result, Spider))
     assert sorted(yielded, key=item_key) == sorted([i1, i2], key=item_key)
     assert isinstance(fs.frontier.manager.responses[0], FResponse)
     assert fs.frontier.manager.responses[0].url == resp.url
     assert set([request.url for request in fs.frontier.manager.links
                 ]) == set([r1.url, r2.url, r3.url])
     assert all([
         isinstance(request, FRequest)
         for request in fs.frontier.manager.links
     ])
     assert fs.stats_manager.stats.get_value(
         'frontera/crawled_pages_count') == 1
     assert fs.stats_manager.stats.get_value(
         'frontera/crawled_pages_count/200') == 1
     assert fs.stats_manager.stats.get_value(
         'frontera/links_extracted_count') == 3
Esempio n. 4
0
    def test_process_spider_output(self):
        """Requests in the spider output become frontier links.

        Sends three requests and two items through ``process_spider_output``
        with a named spider instance and checks that the output has as many
        entries as there were items, that the fake manager received the
        response and the three links as frontier objects, and that the crawl
        statistics were updated.
        """
        items = [
            {'name': 'item', 'item': 'i1'},
            {'name': 'item', 'item': 'i2'},
        ]
        requests = [r1, r2, r3]
        result = requests + items
        origin = Request(fr1.url, meta={b'frontier_request': fr1})
        resp = Response(fr1.url, request=origin)
        scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
        spider = Spider(name="testing")
        scheduler.open(spider)

        out_items = list(scheduler.process_spider_output(resp, result, spider))
        assert len(out_items) == len(items)

        manager = scheduler.frontier.manager
        link_urls = {link.url for link in manager.links}
        assert link_urls == {req.url for req in requests}

        assert isinstance(manager.responses[0], FResponse)
        assert manager.responses[0].url == resp.url
        assert link_urls == {r1.url, r2.url, r3.url}
        assert all(isinstance(link, FRequest) for link in manager.links)

        get_stat = scheduler.stats_manager.stats.get_value
        assert get_stat('frontera/crawled_pages_count') == 1
        assert get_stat('frontera/crawled_pages_count/200') == 1
        assert get_stat('frontera/links_extracted_count') == 3
 def test_next_request_manager_finished(self):
     """A finished manager yields no request and sets no returned-requests stat."""
     scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1])
     # Flag the manager as finished even though a request is still queued.
     scheduler.frontier.manager.finished = True

     assert scheduler.next_request() is None
     returned_count = scheduler.stats_manager.stats.get_value(
         'frontera/returned_requests_count')
     assert returned_count is None
 def test_redirect_disabled_enqueue_requests(self):
     """With REDIRECT_ENABLED off, rr1, rr2 and rr3 are all refused."""
     settings = Settings()
     settings['REDIRECT_ENABLED'] = False
     scheduler = FronteraScheduler(FakeCrawler(settings),
                                   manager=FakeFrontierManager)
     scheduler.open(Spider)
     for candidate in (rr1, rr2, rr3):
         assert scheduler.enqueue_request(candidate) is False
 def test_next_request(self):
     """Three queued frontier requests come back as ``Request`` objects.

     Also checks that the returned-requests stat reaches 3.
     """
     scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1, fr2, fr3])

     fetched = [scheduler.next_request() for _ in range(3)]
     fetched_urls = {request.url for request in fetched}
     assert fetched_urls == {fr1.url, fr2.url, fr3.url}
     assert all(isinstance(request, Request) for request in fetched)
     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat('frontera/returned_requests_count') == 3
Esempio n. 8
0
 def test_next_request_manager_finished(self):
     """``next_request`` returns None once the manager is marked finished.

     The returned-requests stat must stay unset in that case.
     """
     fake_crawler = FakeCrawler()
     scheduler = FronteraScheduler(fake_crawler, manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1])
     scheduler.frontier.manager.finished = True

     next_up = scheduler.next_request()
     assert next_up is None
     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat('frontera/returned_requests_count') is None
 def test_redirect_enabled_enqueue_requests(self):
     """With REDIRECT_ENABLED on, rr1, rr2 and rr3 are accepted and left pending."""
     settings = Settings()
     settings['REDIRECT_ENABLED'] = True
     scheduler = FronteraScheduler(FakeCrawler(settings),
                                   manager=FakeFrontierManager)
     scheduler.open(Spider)
     for candidate in (rr1, rr2, rr3):
         assert scheduler.enqueue_request(candidate) is True

     pending_urls = {request.url for request in scheduler._pending_requests}
     assert pending_urls == {rr1.url, rr2.url, rr3.url}
Esempio n. 10
0
 def test_enqueue_requests(self):
     """Enqueued requests reach the manager as ``FRequest`` seeds.

     Also checks that each ``enqueue_request`` call returns True and that
     the seeds stat reaches 3.
     """
     scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     scheduler.open(Spider)
     for candidate in (r1, r2, r3):
         assert scheduler.enqueue_request(candidate) is True

     seeds = scheduler.frontier.manager.seeds
     assert {seed.url for seed in seeds} == {r1.url, r2.url, r3.url}
     assert all(isinstance(seed, FRequest) for seed in seeds)
     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat('frontera/seeds_count') == 3
Esempio n. 11
0
 def test_process_exception(self):
     """A download exception is reported to the manager and counted in stats.

     The error recorded by the fake manager must pair the request URL with
     the string ``'exception'``, the name of the throwaway class raised here.
     """
     # Minimal stand-in class; only its name 'exception' is asserted on below.
     exception_cls = type('exception', (object,), {})
     scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.process_exception(r1, exception_cls(), Spider)

     failed_request, error_name = scheduler.frontier.manager.errors.pop()[:2]
     assert failed_request.url == r1.url
     assert error_name == 'exception'
     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat('frontera/request_errors_count') == 1
     assert get_stat('frontera/request_errors_count/exception') == 1
Esempio n. 12
0
 def test_next_request(self):
     """``next_request`` hands out every queued frontier request.

     Queues fr1, fr2 and fr3, pulls three requests, and checks their URLs,
     their type (``Request``) and the returned-requests stat.
     """
     fake_crawler = FakeCrawler()
     scheduler = FronteraScheduler(fake_crawler, manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1, fr2, fr3])

     pulled = [scheduler.next_request() for _ in range(3)]
     assert {req.url for req in pulled} == {fr1.url, fr2.url, fr3.url}
     assert all(isinstance(req, Request) for req in pulled)
     returned_count = scheduler.stats_manager.stats.get_value(
         'frontera/returned_requests_count')
     assert returned_count == 3
Esempio n. 13
0
 def test_redirect_disabled_enqueue_requests(self):
     """With REDIRECT_ENABLED off, only rr3 is accepted and becomes the sole seed."""
     settings = Settings()
     settings['REDIRECT_ENABLED'] = False
     scheduler = FronteraScheduler(FakeCrawler(settings),
                                   manager=FakeFrontierManager)
     scheduler.open(Spider)

     expected_outcomes = ((rr1, False), (rr2, False), (rr3, True))
     for candidate, accepted in expected_outcomes:
         assert scheduler.enqueue_request(candidate) is accepted

     seeds = scheduler.frontier.manager.seeds
     assert isinstance(seeds[0], FRequest)
     assert len(seeds) == 1
     assert seeds[0].url == rr3.url
     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat('frontera/seeds_count') == 1
Esempio n. 14
0
 def test_process_spider_output(self):
     """Items are yielded back while requests are handed to the frontier.

     Sends three requests and two items through ``process_spider_output``
     and checks that exactly the two items are yielded, that the fake
     manager received the response and the three links as frontier
     objects, and that the crawl statistics were updated.

     The yielded dicts are compared after sorting on their ``'item'``
     value: dicts are not orderable on Python 3, so ``sorted()`` without a
     key raises ``TypeError`` there.
     """
     def item_key(entry):
         return entry['item']

     i1 = {'name': 'item', 'item': 'i1'}
     i2 = {'name': 'item', 'item': 'i2'}
     result = [r1, r2, r3, i1, i2]
     resp = Response(fr1.url, request=Request(fr1.url, meta={'frontier_request': fr1}))
     crawler = FakeCrawler()
     fs = FronteraScheduler(crawler, manager=FakeFrontierManager)
     fs.open(Spider)
     yielded = list(fs.process_spider_output(resp, result, Spider))
     assert sorted(yielded, key=item_key) == sorted([i1, i2], key=item_key)
     assert isinstance(fs.frontier.manager.responses[0], FResponse)
     assert fs.frontier.manager.responses[0].url == resp.url
     assert set([request.url for request in fs.frontier.manager.links]) == set([r1.url, r2.url, r3.url])
     assert all([isinstance(request, FRequest) for request in fs.frontier.manager.links])
     assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count') == 1
     assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count/200') == 1
     assert fs.stats_manager.stats.get_value('frontera/links_extracted_count') == 3
Esempio n. 15
0
 def test_redirect_enabled_enqueue_requests(self):
     """With REDIRECT_ENABLED on, rr1/rr2 stay pending and rr3 becomes a seed.

     All three ``enqueue_request`` calls return True. rr3 is the only seed
     sent to the manager, rr1 and rr2 are kept as pending ``Request``
     objects, and the redirected-requests stat reaches 2.
     """
     settings = Settings()
     settings['REDIRECT_ENABLED'] = True
     scheduler = FronteraScheduler(FakeCrawler(settings),
                                   manager=FakeFrontierManager)
     scheduler.open(Spider)
     for candidate in (rr1, rr2, rr3):
         assert scheduler.enqueue_request(candidate) is True

     seeds = scheduler.frontier.manager.seeds
     assert len(seeds) == 1
     assert isinstance(seeds[0], FRequest)
     assert seeds[0].url == rr3.url

     pending = scheduler._pending_requests
     assert {request.url for request in pending} == {rr1.url, rr2.url}
     assert all(isinstance(request, Request) for request in pending)

     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat('frontera/seeds_count') == 1
     assert get_stat('frontera/redirected_requests_count') == 2
Esempio n. 16
0
 def test_process_spider_output(self):
     """Only the items are yielded; requests end up as frontier links.

     Sends three requests and two items through ``process_spider_output``
     and checks that the yielded entries equal the two items, that the
     fake manager received the response and the three links as frontier
     objects, and that the crawl statistics were updated.
     """
     def item_sort_key(entry):
         return sorted(entry["item"])

     first_item = {"name": "item", "item": "i1"}
     second_item = {"name": "item", "item": "i2"}
     result = [r1, r2, r3, first_item, second_item]
     origin = Request(fr1.url, meta={b"frontier_request": fr1})
     resp = Response(fr1.url, request=origin)
     scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     scheduler.open(Spider)

     yielded = list(scheduler.process_spider_output(resp, result, Spider))
     expected_items = [first_item, second_item]
     assert sorted(yielded, key=item_sort_key) == sorted(expected_items, key=item_sort_key)

     manager = scheduler.frontier.manager
     assert isinstance(manager.responses[0], FResponse)
     assert manager.responses[0].url == resp.url
     assert {link.url for link in manager.links} == {r1.url, r2.url, r3.url}
     assert all(isinstance(link, FRequest) for link in manager.links)

     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat("frontera/crawled_pages_count") == 1
     assert get_stat("frontera/crawled_pages_count/200") == 1
     assert get_stat("frontera/links_extracted_count") == 3
Esempio n. 17
0
 def test_next_request_overused_keys_info(self):
     """``next_request`` reports overused IP slots to the frontier manager.

     With per-IP concurrency configured and four fake slots installed, the
     first ``get_next_requests`` call must carry ``key_type == 'ip'`` and
     name ``'2.1.3'`` and ``'4.1.3'`` as overused keys.
     """
     settings = Settings()
     settings['CONCURRENT_REQUESTS_PER_DOMAIN'] = 0
     settings['CONCURRENT_REQUESTS_PER_IP'] = 5
     crawler = FakeCrawler(settings)
     # Keys are IPs. Each value pairs the slot.active list (only its length
     # is needed) with slot.concurrency.
     slot_dict = {
         '1.2.3': ([0] * 3, 1),
         '2.1.3': ([0] * 30, 2),
         '3.2.2': ([0] * 5, 1),
         '4.1.3': ([0] * 110, 20),
     }
     crawler.set_slots(slot_dict)
     scheduler = FronteraScheduler(crawler, manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1])

     request = scheduler.next_request()
     assert request.url == fr1.url
     assert isinstance(request, Request)

     call_kwargs = scheduler.frontier.manager.get_next_requests_kwargs[0]
     assert call_kwargs['key_type'] == 'ip'
     assert set(call_kwargs['overused_keys']) == {'2.1.3', '4.1.3'}
     returned_count = scheduler.stats_manager.stats.get_value(
         'frontera/returned_requests_count')
     assert returned_count == 1
Esempio n. 18
0
 def test_close(self):
     """Closing the scheduler stops the manager and records the final stats.

     Three requests are queued and one is pulled before closing, so two
     must be reported as pending. The iteration count set on the manager
     (5) must be copied into the stats.
     """
     scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1, fr2, fr3])
     scheduler.next_request()
     scheduler.frontier.manager.iteration = 5
     scheduler.close('reason')

     assert scheduler.frontier.manager._stopped is True
     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat('frontera/pending_requests_count') == 2
     assert get_stat('frontera/iterations') == 5
Esempio n. 19
0
 def test_close(self):
     """``close`` stops the manager and publishes pending/iteration stats.

     One of the three queued requests is consumed first, so the pending
     count must be 2. The manager's ``iteration`` value of 5 must show up
     under ``frontera/iterations``.
     """
     fake_crawler = FakeCrawler()
     scheduler = FronteraScheduler(fake_crawler, manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.frontier.manager.put_requests([fr1, fr2, fr3])
     scheduler.next_request()
     scheduler.frontier.manager.iteration = 5
     scheduler.close('reason')

     assert scheduler.frontier.manager._stopped is True
     pending_count = scheduler.stats_manager.stats.get_value(
         'frontera/pending_requests_count')
     iterations = scheduler.stats_manager.stats.get_value('frontera/iterations')
     assert pending_count == 2
     assert iterations == 5
    def test_process_spider_output(self):
        """Requests from the spider output are recorded as frontier links.

        Runs three requests and two items through ``process_spider_output``
        with a named spider instance. The output must have as many entries
        as there were items, the fake manager must hold the response and the
        three links as frontier objects, and the crawl stats must be updated.
        """
        items = [
            {'name': 'item', 'item': 'i1'},
            {'name': 'item', 'item': 'i2'},
        ]
        requests = [r1, r2, r3]
        result = requests + items
        origin = Request(fr1.url, meta={b'frontier_request': fr1})
        resp = Response(fr1.url, request=origin)
        scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
        spider = Spider(name="testing")
        scheduler.open(spider)

        out_items = list(scheduler.process_spider_output(resp, result, spider))
        assert len(out_items) == len(items)

        manager = scheduler.frontier.manager
        link_urls = {link.url for link in manager.links}
        assert link_urls == {req.url for req in requests}

        assert isinstance(manager.responses[0], FResponse)
        assert manager.responses[0].url == resp.url
        assert link_urls == {r1.url, r2.url, r3.url}
        assert all(isinstance(link, FRequest) for link in manager.links)

        get_stat = scheduler.stats_manager.stats.get_value
        assert get_stat('frontera/crawled_pages_count') == 1
        assert get_stat('frontera/crawled_pages_count/200') == 1
        assert get_stat('frontera/links_extracted_count') == 3
 def test_process_spider_output(self):
     """Requests and items both come back; the frontier records the crawl.

     Sends three requests and two items through ``process_spider_output``.
     The output must have the same length as the input, with the requests
     in the leading positions followed by the items. The fake manager must
     hold the response and the three links as frontier objects, and the
     crawl stats must be updated.
     """
     def item_sort_key(entry):
         return sorted(entry['item'])

     first_item = {'name': 'item', 'item': 'i1'}
     second_item = {'name': 'item', 'item': 'i2'}
     spider_requests = [r1, r2, r3]
     request_count = len(spider_requests)
     result = spider_requests + [first_item, second_item]
     origin = Request(fr1.url, meta={b'frontier_request': fr1})
     resp = Response(fr1.url, request=origin)
     scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
     scheduler.open(Spider)

     out = list(scheduler.process_spider_output(resp, result, Spider))
     assert len(out) == len(result)
     leading, trailing = out[:request_count], out[request_count:]
     assert {req.url for req in leading} == {req.url for req in spider_requests}
     assert sorted(trailing, key=item_sort_key) == \
         sorted([first_item, second_item], key=item_sort_key)

     manager = scheduler.frontier.manager
     assert isinstance(manager.responses[0], FResponse)
     assert manager.responses[0].url == resp.url
     assert {link.url for link in manager.links} == {r1.url, r2.url, r3.url}
     assert all(isinstance(link, FRequest) for link in manager.links)

     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat('frontera/crawled_pages_count') == 1
     assert get_stat('frontera/crawled_pages_count/200') == 1
     assert get_stat('frontera/links_extracted_count') == 3
Esempio n. 22
0
    def add_frontera_scheduler(crawler):
        """Attach a ``FronteraScheduler`` with stubbed hooks to ``crawler``.

        Builds a scheduler for ``crawler`` and replaces
        ``frontier.page_crawled``, ``frontier.links_extracted`` and
        ``stats_manager.add_crawled_page`` with stubs that return their first
        argument. A minimal engine whose ``slot.scheduler`` is that scheduler
        is then stored on ``crawler.engine``.
        """
        class Engine(object):
            def __init__(self, scheduler):
                # ``slot`` is a throwaway class used as a plain attribute holder.
                slot = type('slot', (object, ), {})
                slot.scheduler = scheduler
                self.slot = slot

        scheduler = FronteraScheduler(crawler)

        # mock these functions
        frontier = scheduler.frontier
        frontier.page_crawled = lambda x: x
        frontier.links_extracted = lambda x, y: x
        scheduler.stats_manager.add_crawled_page = lambda x, y: x

        crawler.engine = Engine(scheduler)
Esempio n. 23
0
 def test_redirect_disabled_enqueue_requests(self):
     """``enqueue_request`` returns False for rr1, rr2 and rr3 when redirects are off."""
     settings = Settings()
     settings['REDIRECT_ENABLED'] = False
     fake_crawler = FakeCrawler(settings)
     scheduler = FronteraScheduler(fake_crawler, manager=FakeFrontierManager)
     scheduler.open(Spider)

     outcomes = [scheduler.enqueue_request(candidate)
                 for candidate in (rr1, rr2, rr3)]
     assert outcomes[0] is False
     assert outcomes[1] is False
     assert outcomes[2] is False
Esempio n. 24
0
 def test_redirect_disabled_enqueue_requests(self):
     """With REDIRECT_ENABLED off, nothing is enqueued and no seed is recorded.

     ``enqueue_request`` must return False for rr1, rr2 and rr3. The fake
     manager must receive no seeds, and the seeds stat must stay unset.
     """
     settings = Settings()
     settings['REDIRECT_ENABLED'] = False
     crawler = FakeCrawler(settings)
     fs = FronteraScheduler(crawler, manager=FakeFrontierManager)
     fs.open(Spider)
     assert fs.enqueue_request(rr1) is False
     assert fs.enqueue_request(rr2) is False
     assert fs.enqueue_request(rr3) is False
     assert len(fs.frontier.manager.seeds) == 0
     # Identity check: the stat must be absent, not merely equal to None.
     assert fs.stats_manager.stats.get_value('frontera/seeds_count') is None
Esempio n. 25
0
 def test_redirect_enabled_enqueue_requests(self):
     """rr1, rr2 and rr3 are all accepted and kept pending when redirects are on."""
     settings = Settings()
     settings['REDIRECT_ENABLED'] = True
     fake_crawler = FakeCrawler(settings)
     scheduler = FronteraScheduler(fake_crawler, manager=FakeFrontierManager)
     scheduler.open(Spider)

     redirected = (rr1, rr2, rr3)
     for candidate in redirected:
         assert scheduler.enqueue_request(candidate) is True

     pending_urls = {request.url for request in scheduler._pending_requests}
     assert pending_urls == {candidate.url for candidate in redirected}
Esempio n. 26
0
 def test_enqueue_requests(self):
     """r1, r2 and r3 are accepted and forwarded to the manager as seeds.

     Every seed must be an ``FRequest`` and the seeds stat must reach 3.
     """
     fake_crawler = FakeCrawler()
     scheduler = FronteraScheduler(fake_crawler, manager=FakeFrontierManager)
     scheduler.open(Spider)

     incoming = (r1, r2, r3)
     for candidate in incoming:
         assert scheduler.enqueue_request(candidate) is True

     seeds = scheduler.frontier.manager.seeds
     assert {seed.url for seed in seeds} == {r1.url, r2.url, r3.url}
     assert all(isinstance(seed, FRequest) for seed in seeds)
     seeds_count = scheduler.stats_manager.stats.get_value('frontera/seeds_count')
     assert seeds_count == 3
Esempio n. 27
0
 def test_process_exception(self):
     """``process_exception`` forwards the failure to the manager and counts it.

     The error recorded by the fake manager must carry the URL of ``r1``
     and the name ``'exception'``. Both the total and the per-name error
     stats must be 1.
     """
     # Minimal stand-in class; only its name 'exception' is asserted on below.
     exception_cls = type('exception', (object, ), {})
     fake_crawler = FakeCrawler()
     scheduler = FronteraScheduler(fake_crawler, manager=FakeFrontierManager)
     scheduler.open(Spider)
     scheduler.process_exception(r1, exception_cls(), Spider)

     recorded = scheduler.frontier.manager.errors.pop()
     assert recorded[0].url == r1.url
     assert recorded[1] == 'exception'

     get_stat = scheduler.stats_manager.stats.get_value
     assert get_stat('frontera/request_errors_count') == 1
     assert get_stat('frontera/request_errors_count/exception') == 1
Esempio n. 28
0
 def test_redirect_enabled_enqueue_requests(self):
     """With REDIRECT_ENABLED on, rr1/rr2 are kept pending and rr3 is refused.

     ``enqueue_request`` must return True for rr1 and rr2 and False for
     rr3. The pending queue must hold exactly rr1 and rr2 as ``Request``
     objects, the seeds stat must stay unset, and the redirected-requests
     stat must reach 2.
     """
     settings = Settings()
     settings['REDIRECT_ENABLED'] = True
     crawler = FakeCrawler(settings)
     fs = FronteraScheduler(crawler, manager=FakeFrontierManager)
     fs.open(Spider)
     assert fs.enqueue_request(rr1) is True
     assert fs.enqueue_request(rr2) is True
     assert fs.enqueue_request(rr3) is False
     assert set([request.url for request in fs._pending_requests
                 ]) == set([rr1.url, rr2.url])
     assert all(
         [isinstance(request, Request) for request in fs._pending_requests])
     # Identity check: the stat must be absent, not merely equal to None.
     assert fs.stats_manager.stats.get_value('frontera/seeds_count') is None
     assert fs.stats_manager.stats.get_value(
         'frontera/redirected_requests_count') == 2