コード例 #1
0
def test_spider_crawls_links(spider, scrape_request, html_headers,
                             mock_html_twolinks):
    """Check that parsing a two-link page yields a request for each link.

    A hand-built ``HtmlResponse`` is passed to ``spider.parse``. The URLs of
    every ``Request`` it yields must match exactly the two link URLs
    expected under the response URL; anything else yielded must be a
    ``MarkupItem``.
    """
    # Restrict the spider to one user agent so the yielded requests are
    # easy to account for.
    agent = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    spider.batch_user_agents = [agent]

    # Build the fake response around the two-link HTML fixture. The request
    # is attached before ``meta`` is touched, as in the original ordering.
    response = HtmlResponse(
        url='http://test:12345',
        body=mock_html_twolinks,
        encoding='utf-8',
    )
    response.request = scrape_request
    response.headers = html_headers
    response.meta['user_agent'] = agent
    response.meta['sitescan'] = factories.SiteScanFactory()
    response.status = 200
    response.flags = []

    # Run the spider's parse step against the fake response.
    results = spider.parse(response)

    # One follow-up request is expected per link found in the page.
    expected_urls = {
        response.url + suffix
        for suffix in ('/link1.html', '/link2.html')
    }

    found_urls = set()
    for item in results:
        if not isinstance(item, Request):
            # Everything that is not a follow-up request must be markup.
            assert isinstance(item, MarkupItem)
            continue
        found_urls.add(item.url)

    assert expected_urls == found_urls
コード例 #2
0
ファイル: test_spider.py プロジェクト: hallvors/spade
def test_spider_crawls_links(spider, scrape_request, html_headers,
                             mock_html_twolinks):
    """Check that parsing a two-link page yields a request for each link.

    A hand-built ``HtmlResponse`` is passed to ``spider.parse``. The URLs of
    every ``Request`` it yields must match exactly the two link URLs
    expected under the response URL; anything else yielded must be a
    ``MarkupItem``.
    """
    # Restrict the spider to one user agent so the yielded requests are
    # easy to account for.
    agent = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    spider.batch_user_agents = [agent]

    # Build the fake response around the two-link HTML fixture. The request
    # is attached before ``meta`` is touched, as in the original ordering.
    response = HtmlResponse(
        url='http://test:12345',
        body=mock_html_twolinks,
        encoding='utf-8',
    )
    response.request = scrape_request
    response.headers = html_headers
    response.meta['user_agent'] = agent
    response.meta['sitescan'] = factories.SiteScanFactory()
    response.status = 200
    response.flags = []

    # Run the spider's parse step against the fake response.
    results = spider.parse(response)

    # One follow-up request is expected per link found in the page.
    expected_urls = {
        response.url + suffix
        for suffix in ('/link1.html', '/link2.html')
    }

    found_urls = set()
    for item in results:
        if not isinstance(item, Request):
            # Everything that is not a follow-up request must be markup.
            assert isinstance(item, MarkupItem)
            continue
        found_urls.add(item.url)

    assert expected_urls == found_urls