def test_process_spider_output(self):
    """Feed a mixed list of requests and items through the scheduler's
    process_spider_output and verify that every element is passed through,
    the response and extracted links reach the frontier manager, and the
    crawl/link-extraction stats are updated."""
    item_a = {'name': 'item', 'item': 'i1'}
    item_b = {'name': 'item', 'item': 'i2'}
    request_count = 3
    spider_result = [r1, r2, r3, item_a, item_b]
    response = Response(
        fr1.url,
        request=Request(fr1.url, meta={b'frontier_request': fr1}),
    )
    crawler = FakeCrawler()
    scheduler = FronteraScheduler(crawler, manager=FakeFrontierManager)
    scheduler.open(Spider)

    output = list(scheduler.process_spider_output(response, spider_result, Spider))

    # Every element of the spider result must come back out, requests first.
    assert len(output) == len(spider_result)
    emitted_requests = output[:request_count]
    assert {r.url for r in emitted_requests} == {r.url for r in spider_result[:request_count]}

    # Items follow the requests; compare order-independently.
    def item_key(entry):
        return sorted(entry['item'])

    emitted_items = output[request_count:]
    assert sorted(emitted_items, key=item_key) == sorted([item_a, item_b], key=item_key)

    # The response and the extracted links were handed to the frontier manager.
    assert isinstance(scheduler.frontier.manager.responses[0], FResponse)
    assert scheduler.frontier.manager.responses[0].url == response.url
    assert {link.url for link in scheduler.frontier.manager.links} == {r1.url, r2.url, r3.url}
    assert all(isinstance(link, FRequest) for link in scheduler.frontier.manager.links)

    # Stats bookkeeping for the crawled page and its extracted links.
    assert scheduler.stats_manager.stats.get_value('frontera/crawled_pages_count') == 1
    assert scheduler.stats_manager.stats.get_value('frontera/crawled_pages_count/200') == 1
    assert scheduler.stats_manager.stats.get_value('frontera/links_extracted_count') == 3
def test_process_spider_output(self):
    """Only items survive process_spider_output; the requests are diverted to
    the frontier manager, which records them (and the response) while the
    scheduler's stats are incremented."""
    first_item = {'name': 'item', 'item': 'i1'}
    second_item = {'name': 'item', 'item': 'i2'}
    expected_items = [first_item, second_item]
    extracted_requests = [r1, r2, r3]
    spider_output = extracted_requests + expected_items
    response = Response(
        fr1.url,
        request=Request(fr1.url, meta={b'frontier_request': fr1}),
    )
    crawler = FakeCrawler()
    scheduler = FronteraScheduler(crawler, manager=FakeFrontierManager)
    test_spider = Spider(name="testing")
    scheduler.open(test_spider)

    passed_through = list(scheduler.process_spider_output(response, spider_output, test_spider))

    # Only the items are yielded back to the caller.
    assert len(passed_through) == len(expected_items)
    assert {link.url for link in scheduler.frontier.manager.links} == {req.url for req in extracted_requests}

    # Response and links were forwarded to the frontier manager.
    assert isinstance(scheduler.frontier.manager.responses[0], FResponse)
    assert scheduler.frontier.manager.responses[0].url == response.url
    assert {link.url for link in scheduler.frontier.manager.links} == {r1.url, r2.url, r3.url}
    assert all(isinstance(link, FRequest) for link in scheduler.frontier.manager.links)

    # Stats bookkeeping.
    assert scheduler.stats_manager.stats.get_value('frontera/crawled_pages_count') == 1
    assert scheduler.stats_manager.stats.get_value('frontera/crawled_pages_count/200') == 1
    assert scheduler.stats_manager.stats.get_value('frontera/links_extracted_count') == 3
def test_process_spider_output(self):
    """process_spider_output should forward the requests to the frontier
    manager, yield only the items back, and update the crawl statistics."""
    i1 = {'name': 'item', 'item': 'i1'}
    i2 = {'name': 'item', 'item': 'i2'}
    result = [r1, r2, r3, i1, i2]
    # NOTE(review): other variants of this test use b'frontier_request' as
    # the meta key — confirm which key this frontera version expects.
    resp = Response(fr1.url,
                    request=Request(fr1.url, meta={'frontier_request': fr1}))
    crawler = FakeCrawler()
    fs = FronteraScheduler(crawler, manager=FakeFrontierManager)
    fs.open(Spider)
    # BUG FIX: dicts are unorderable in Python 3, so sorted() directly on a
    # list of item dicts raised TypeError; sort on the 'item' field instead
    # (order-independent comparison of the yielded items).
    out_items = sorted(fs.process_spider_output(resp, result, Spider),
                       key=lambda entry: entry['item'])
    assert out_items == sorted([i1, i2], key=lambda entry: entry['item'])
    # Response and extracted links were handed to the frontier manager.
    assert isinstance(fs.frontier.manager.responses[0], FResponse)
    assert fs.frontier.manager.responses[0].url == resp.url
    assert set([request.url for request in fs.frontier.manager.links]) == set([r1.url, r2.url, r3.url])
    assert all([isinstance(request, FRequest) for request in fs.frontier.manager.links])
    # Stats bookkeeping.
    assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count') == 1
    assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count/200') == 1
    assert fs.stats_manager.stats.get_value('frontera/links_extracted_count') == 3
def test_process_spider_output(self):
    """Verify that the scheduler yields only the items from the spider
    output, routes the requests/response into the frontier manager, and
    increments the crawl stats."""
    i1 = {'name': 'item', 'item': 'i1'}
    i2 = {'name': 'item', 'item': 'i2'}
    result = [r1, r2, r3, i1, i2]
    # NOTE(review): sibling variants use b'frontier_request' (bytes) as the
    # meta key — confirm which key this frontera version expects.
    resp = Response(fr1.url,
                    request=Request(fr1.url, meta={'frontier_request': fr1}))
    crawler = FakeCrawler()
    fs = FronteraScheduler(crawler, manager=FakeFrontierManager)
    fs.open(Spider)
    # BUG FIX: sorted() over raw dicts raises TypeError on Python 3 (dicts
    # are unorderable); use the 'item' field as the sort key so the items
    # can be compared order-independently.
    out_items = sorted(fs.process_spider_output(resp, result, Spider),
                       key=lambda entry: entry['item'])
    assert out_items == sorted([i1, i2], key=lambda entry: entry['item'])
    assert isinstance(fs.frontier.manager.responses[0], FResponse)
    assert fs.frontier.manager.responses[0].url == resp.url
    assert set([request.url for request in fs.frontier.manager.links]) == set([r1.url, r2.url, r3.url])
    assert all([isinstance(request, FRequest) for request in fs.frontier.manager.links])
    assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count') == 1
    assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count/200') == 1
    assert fs.stats_manager.stats.get_value('frontera/links_extracted_count') == 3
def test_process_spider_output(self):
    """Requests from the spider output are diverted into the frontier
    manager while the items are yielded back; stats reflect one crawled
    page (HTTP 200) and three extracted links."""
    first = {"name": "item", "item": "i1"}
    second = {"name": "item", "item": "i2"}
    spider_result = [r1, r2, r3, first, second]
    resp = Response(
        fr1.url,
        request=Request(fr1.url, meta={b"frontier_request": fr1}),
    )
    crawler = FakeCrawler()
    scheduler = FronteraScheduler(crawler, manager=FakeFrontierManager)
    scheduler.open(Spider)

    def by_item_chars(entry):
        # Order-independent key: the sorted characters of the 'item' value.
        return sorted(entry["item"])

    yielded = sorted(scheduler.process_spider_output(resp, spider_result, Spider),
                     key=by_item_chars)
    assert yielded == sorted([first, second], key=by_item_chars)

    manager = scheduler.frontier.manager
    assert isinstance(manager.responses[0], FResponse)
    assert manager.responses[0].url == resp.url
    assert {req.url for req in manager.links} == {r1.url, r2.url, r3.url}
    assert all(isinstance(req, FRequest) for req in manager.links)

    stats = scheduler.stats_manager.stats
    assert stats.get_value("frontera/crawled_pages_count") == 1
    assert stats.get_value("frontera/crawled_pages_count/200") == 1
    assert stats.get_value("frontera/links_extracted_count") == 3
def test_process_spider_output(self):
    """With a named Spider instance open, process_spider_output yields only
    the items; the requests end up in the frontier manager's links and the
    response in its responses, with stats incremented accordingly."""
    item_one = {'name': 'item', 'item': 'i1'}
    item_two = {'name': 'item', 'item': 'i2'}
    items = [item_one, item_two]
    seed_requests = [r1, r2, r3]
    combined = seed_requests + items
    response = Response(
        fr1.url,
        request=Request(fr1.url, meta={b'frontier_request': fr1}),
    )
    scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
    spider = Spider(name="testing")
    scheduler.open(spider)

    surviving = list(scheduler.process_spider_output(response, combined, spider))

    # Requests are consumed by the frontier; only items remain.
    assert len(surviving) == len(items)
    assert {link.url for link in scheduler.frontier.manager.links} == {req.url for req in seed_requests}

    frontier_manager = scheduler.frontier.manager
    assert isinstance(frontier_manager.responses[0], FResponse)
    assert frontier_manager.responses[0].url == response.url
    assert {link.url for link in frontier_manager.links} == {r1.url, r2.url, r3.url}
    assert all(isinstance(link, FRequest) for link in frontier_manager.links)

    stats = scheduler.stats_manager.stats
    assert stats.get_value('frontera/crawled_pages_count') == 1
    assert stats.get_value('frontera/crawled_pages_count/200') == 1
    assert stats.get_value('frontera/links_extracted_count') == 3
def test_process_spider_output(self):
    """All elements of the spider output (requests then items) are passed
    through; the frontier manager receives the response and the links, and
    the stats record one crawled 200 page with three extracted links."""
    i1 = {'name': 'item', 'item': 'i1'}
    i2 = {'name': 'item', 'item': 'i2'}
    num_requests = 3
    full_result = [r1, r2, r3, i1, i2]
    resp = Response(
        fr1.url,
        request=Request(fr1.url, meta={b'frontier_request': fr1}),
    )
    fs = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
    fs.open(Spider)

    produced = list(fs.process_spider_output(resp, full_result, Spider))
    assert len(produced) == len(full_result)

    # Leading slice holds the requests, trailing slice the items.
    request_part, item_part = produced[:num_requests], produced[num_requests:]
    assert {r.url for r in request_part} == {r.url for r in full_result[:num_requests]}

    sort_key = lambda entry: sorted(entry['item'])  # order-independent compare
    assert sorted(item_part, key=sort_key) == sorted([i1, i2], key=sort_key)

    mgr = fs.frontier.manager
    assert isinstance(mgr.responses[0], FResponse)
    assert mgr.responses[0].url == resp.url
    assert {req.url for req in mgr.links} == {r1.url, r2.url, r3.url}
    assert all(isinstance(req, FRequest) for req in mgr.links)

    assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count') == 1
    assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count/200') == 1
    assert fs.stats_manager.stats.get_value('frontera/links_extracted_count') == 3