def test_spider_crawls_links(spider, scrape_request, html_headers,
                             mock_html_twolinks):
    """Ensure spider always picks up relevant links to HTML pages.

    Feeds the spider a mocked HTML response containing exactly two
    links and checks that parse() yields one Request per link (and
    nothing but MarkupItems otherwise).
    """
    # Use only 1 user agent for easier counting
    ua = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    spider.batch_user_agents = [ua]

    # Generate a mock response based on html containing two links
    mock_response = HtmlResponse(url='http://test:12345',
                                 body=mock_html_twolinks,
                                 encoding='utf-8')
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.meta['user_agent'] = ua
    mock_response.meta['sitescan'] = factories.SiteScanFactory()
    mock_response.status = 200
    mock_response.flags = []

    # Call spider on the mock response
    pipeline_generator = spider.parse(mock_response)

    # We should have two new requests and one MarkupItem
    sites_expected = {
        mock_response.url + '/link1.html',
        mock_response.url + '/link2.html',
    }

    sites_collected = []
    for elem in pipeline_generator:
        if isinstance(elem, Request):
            sites_collected.append(elem.url)
        else:
            # Anything parse() yields besides a Request must be a MarkupItem
            assert isinstance(elem, MarkupItem)

    # Order of yielded requests is not guaranteed; compare as sets
    assert sites_expected == set(sites_collected)