Example no. 1
0
def test_useragents_spider(spider, scrape_request, html_headers,
                           mock_html_nolinks):
    """Ensure multiple requests with different user agent strings emitted"""
    firefox_agent = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    chrome_agent = factories.BatchUserAgentFactory.build(ua_string='Chrome / 20.0')
    spider.batch_user_agents = [firefox_agent, chrome_agent]

    # Build a fake response carrying the no-links HTML fixture
    mock_response = Response('http://test:12345', body=mock_html_nolinks)
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.status = 200
    mock_response.encoding = u'utf-8'
    mock_response.flags = []

    # Run the spider over the fake response
    pipeline_generator = spider.parse(mock_response)

    # Collect the user agent string of every emitted Request. Anything
    # that is not a Request is unexpected and fails the test.
    request_uas = []
    for emitted in pipeline_generator:
        assert isinstance(emitted, Request)
        request_uas.append(emitted.meta['user_agent'].ua_string)

    # One request per configured batch user agent
    assert set(request_uas) == set([u'Firefox / 11.0', u'Chrome / 20.0'])
Example no. 2
0
def test_spider_crawls_links(spider, scrape_request, html_headers,
                             mock_html_twolinks):
    """Ensure spider always picks up relevant links to HTML pages"""
    # A single user agent keeps the expected request count simple
    agent = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    spider.batch_user_agents = [agent]

    # Fake response whose body contains exactly two links
    mock_response = Response('http://test:12345', body=mock_html_twolinks)
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.meta['user_agent'] = agent
    mock_response.status = 200
    mock_response.encoding = u'utf-8'
    mock_response.flags = []

    # Run the spider over the fake response
    pipeline_generator = spider.parse(mock_response)

    # The spider should schedule one request per link and nothing else
    sites_expected = set([mock_response.url + '/link1.html',
                          mock_response.url + '/link2.html'])

    # Non-Request output is deliberately ignored; only scheduled URLs
    # are compared.
    sites_collected = [output.url for output in pipeline_generator
                       if isinstance(output, Request)]

    assert sites_expected == set(sites_collected)
Example no. 3
0
def test_js_item_emission(spider, linked_js_request, js_headers, mock_js):
    """JS items are emitted correctly"""
    # Fake a response carrying the JS fixture body
    mock_url = 'http://test:12345/default.js'
    mock_response = Response(mock_url, body=mock_js)
    mock_response.request = linked_js_request
    mock_response.headers = js_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # A urlscan record is needed so the emitted item can reference it
    mock_urlscan = model.URLScan.objects.create(
        site_scan=linked_js_request.meta['sitescan'],
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Run the spider over the fake response
    pipeline_generator = spider.parse(mock_response)

    # Build the item we expect the spider to emit, field by field
    item_expected = MarkupItem()
    expected_fields = [
        ('content_type', spider.get_content_type(js_headers)),
        ('filename', os.path.basename(urlparse(mock_url).path)),
        ('headers', unicode(js_headers)),
        ('meta', mock_response.meta),
        ('raw_content', mock_response.body),
        ('sitescan', linked_js_request.meta['sitescan']),
        ('urlscan', mock_urlscan),
        ('url', mock_response.url),
        ('user_agent', mock_response.meta['user_agent']),
        ('redirected_from', ''),
    ]
    for field_name, field_value in expected_fields:
        item_expected[field_name] = field_value

    # The spider must emit exactly this one item
    assert list(pipeline_generator) == [item_expected]
def _responses(request, status_codes):
    """Build one Response per status code, each linked back to *request*."""
    def _make(code):
        response = Response(request.url, status=code)
        response.request = request
        return response

    return [_make(code) for code in status_codes]
def _responses(request, status_codes):
    """Return a list of Responses for request.url, one per status code,
    each with its .request attribute pointing back at *request*."""
    responses = []
    for code in status_codes:
        response = Response(request.url, status=code)
        response.request = request
        responses.append(response)
    return responses
Example no. 6
0
def pytest_funcarg__mock_response(request):
    """
    Fake response to the scrape request -- we only fill out the fields used by
    the middleware for testing purposes
    """
    # NOTE(review): the pytest_funcarg__ prefix and getfuncargvalue are
    # legacy pytest APIs (removed in modern pytest) -- migrate to
    # @pytest.fixture / request.getfixturevalue when the pytest version
    # in use allows it.
    scrape_request = request.getfuncargvalue("scrape_request")
    mock_response = Response('http://test.com')
    mock_response.request = scrape_request
    return mock_response
Example no. 7
0
    def test_empty_content_type(self):
        """spider.parse must not raise on this advanced-search fixture page."""
        name = "ebay4"
        spider = self.smanager.create(name)
        generic_form_request = list(spider.start_requests())[0]

        # Read the fixture with a context manager instead of leaking the
        # file handle (the original open(...).read() was never closed)
        with open(join(_PATH, "data", "ebay_advanced_search.html")) as fixture:
            body = fixture.read()

        response = Response(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
                            body=body)
        response.request = generic_form_request
        # must not raise an error
        for result in spider.parse(response):
            pass
Example no. 8
0
def test_hs_mware_process_spider_output_filter_request(hs_mware):
    """Only Request objects get the hsparent meta key from the middleware."""
    response = Response('http://resp-url')
    # Feed the middleware one response and one request as spider output
    child_response = Response('http://resp-url-child')
    child_response.request = Request('http://resp-url-child-req')
    child_request = Request('http://req-url-child')
    hs_mware._seen = WeakKeyDictionary({response: 'riq'})

    output = [child_response, child_request]
    result = list(
        hs_mware.process_spider_output(response, output, Spider('test')))

    assert len(result) == 2
    # Responses pass through untouched; only requests gain the parent id
    assert result[0].meta.get(HS_PARENT_ID_KEY) is None
    assert result[1].meta[HS_PARENT_ID_KEY] == 'riq'
Example no. 9
0
def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
    """End-to-end check of request/parent id bookkeeping across middlewares."""
    # Both middlewares must share one (initially empty) seen-requests map
    assert hs_spider_middleware._seen_requests == WeakKeyDictionary()
    assert hs_downloader_middleware._seen_requests == WeakKeyDictionary()
    assert hs_spider_middleware._seen_requests is hs_downloader_middleware._seen_requests

    spider = Spider('test')
    url = 'http://resp-url'
    request_0 = Request(url)
    response_0 = Response(url)

    hs_downloader_middleware.process_request(request_0, spider)

    # process_request alone assigns nothing; ids appear only on response
    assert HS_REQUEST_ID_KEY not in request_0.meta
    assert HS_PARENT_ID_KEY not in request_0.meta
    assert len(hs_spider_middleware._seen_requests) == 0
    assert len(hs_downloader_middleware._seen_requests) == 0

    hs_downloader_middleware.process_response(request_0, response_0, spider)

    # The first processed request gets id 0 and no parent
    assert request_0.meta[HS_REQUEST_ID_KEY] == 0
    assert request_0.meta[HS_PARENT_ID_KEY] is None
    assert hs_spider_middleware._seen_requests[request_0] == 0

    response_0.request = request_0
    request_1 = Request(url)
    request_2 = Request(url)
    item1 = {}
    item2 = Item()
    output = [request_1, request_2, item1, item2]
    processed_output = list(
        hs_spider_middleware.process_spider_output(response_0, output, spider))

    # Child requests inherit parent id 0; items pass through unchanged
    assert processed_output[0] is request_1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[1] is request_2
    assert request_2.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[2] is item1
    assert processed_output[3] is item2

    # Each subsequently downloaded request receives the next sequential id
    response_1 = Response(url)
    hs_downloader_middleware.process_request(request_1, spider)
    hs_downloader_middleware.process_response(request_1, response_1, spider)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0

    response_2 = Response(url)
    hs_downloader_middleware.process_request(request_2, spider)
    hs_downloader_middleware.process_response(request_2, response_2, spider)
    assert request_2.meta[HS_REQUEST_ID_KEY] == 2
    assert request_2.meta[HS_PARENT_ID_KEY] == 0
def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
    """End-to-end check of request/parent id bookkeeping across middlewares."""
    # Both middlewares must share one (initially empty) seen-requests map
    assert hs_spider_middleware._seen_requests == WeakKeyDictionary()
    assert hs_downloader_middleware._seen_requests == WeakKeyDictionary()
    assert hs_spider_middleware._seen_requests is hs_downloader_middleware._seen_requests

    spider = Spider('test')
    url = 'http://resp-url'
    request_0 = Request(url)
    response_0 = Response(url)

    hs_downloader_middleware.process_request(request_0, spider)

    # process_request alone assigns nothing; ids appear only on response
    assert HS_REQUEST_ID_KEY not in request_0.meta
    assert HS_PARENT_ID_KEY not in request_0.meta
    assert len(hs_spider_middleware._seen_requests) == 0
    assert len(hs_downloader_middleware._seen_requests) == 0

    hs_downloader_middleware.process_response(request_0, response_0, spider)

    # The first processed request gets id 0 and no parent
    assert request_0.meta[HS_REQUEST_ID_KEY] == 0
    assert request_0.meta[HS_PARENT_ID_KEY] is None
    assert hs_spider_middleware._seen_requests[request_0] == 0

    response_0.request = request_0
    request_1 = Request(url)
    request_2 = Request(url)
    item1 = {}
    item2 = Item()
    output = [request_1, request_2, item1, item2]
    processed_output = list(hs_spider_middleware.process_spider_output(response_0, output, spider))

    # Child requests inherit parent id 0; items pass through unchanged
    assert processed_output[0] is request_1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[1] is request_2
    assert request_2.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[2] is item1
    assert processed_output[3] is item2

    # Each subsequently downloaded request receives the next sequential id
    response_1 = Response(url)
    hs_downloader_middleware.process_request(request_1, spider)
    hs_downloader_middleware.process_response(request_1, response_1, spider)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0

    response_2 = Response(url)
    hs_downloader_middleware.process_request(request_2, spider)
    hs_downloader_middleware.process_response(request_2, response_2, spider)
    assert request_2.meta[HS_REQUEST_ID_KEY] == 2
    assert request_2.meta[HS_PARENT_ID_KEY] == 0
def test_hs_mware_process_spider_input(hs_mware):
    """process_spider_input records the request through the pipe writer."""
    response = Response('http://resp-url')
    response.request = Request('http://req-url')

    hs_mware.process_spider_input(response, Spider('test'))

    # Exactly one write, carrying the expected keyword arguments
    assert hs_mware.pipe_writer.write_request.call_count == 1
    recorded = hs_mware.pipe_writer.write_request.call_args[1]
    expected = {
        'duration': 0,
        'fp': request_fingerprint(response.request),
        'method': 'GET',
        'parent': None,
        'rs': 0,
        'status': 200,
        'url': 'http://resp-url'
    }
    assert recorded == expected
    # The response is remembered with request id 0
    assert hs_mware._seen == WeakKeyDictionary({response: 0})
def test_hs_mware_process_spider_input(hs_mware):
    """process_spider_input writes one request record via the pipe writer."""
    response = Response('http://resp-url')
    response.request = Request('http://req-url')
    hs_mware.process_spider_input(response, Spider('test'))
    # Exactly one write, with the expected keyword arguments
    assert hs_mware.pipe_writer.write_request.call_count == 1
    args = hs_mware.pipe_writer.write_request.call_args[1]
    assert args == {
        'duration': 0,
        'fp': request_fingerprint(response.request),
        'method': 'GET',
        'parent': None,
        'rs': 0,
        'status': 200,
        'url': 'http://resp-url'
    }
    # The response is remembered with request id 0
    assert hs_mware._seen == WeakKeyDictionary({response: 0})
Example no. 13
0
def test_css_item_emission(spider, linked_css_request, css_headers, mock_css):
    """CSS items are emitted correctly"""
    # A single user agent keeps the expected item count simple
    agent = factories.BatchUserAgentFactory(ua_string='Firefox / 11.0')
    spider.user_agents = [agent]

    # Fake a response carrying the CSS fixture body
    mock_url = 'http://test:12345/default.css'
    mock_response = Response(mock_url, body=mock_css)
    mock_response.request = linked_css_request
    mock_response.headers = css_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # A urlscan record is needed so the emitted item can reference it
    mock_urlscan = model.URLScan.objects.create(
        site_scan=linked_css_request.meta['sitescan'],
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Run the spider over the fake response
    pipeline_generator = spider.parse(mock_response)

    # Build the item we expect the spider to emit, field by field
    item_expected = MarkupItem()
    expected_fields = [
        ('content_type', spider.get_content_type(css_headers)),
        ('filename', os.path.basename(urlparse(mock_url).path)),
        ('headers', unicode(css_headers)),
        ('meta', mock_response.meta),
        ('raw_content', mock_response.body),
        ('sitescan', linked_css_request.meta['sitescan']),
        ('urlscan', mock_urlscan),
        ('url', mock_response.url),
        ('user_agent', mock_response.meta['user_agent']),
    ]
    for field_name, field_value in expected_fields:
        item_expected[field_name] = field_value

    # Anything other than a MarkupItem is unexpected output
    item_collected = None
    for emitted in pipeline_generator:
        assert isinstance(emitted, MarkupItem)
        item_collected = emitted

    assert item_expected == item_collected
    def test_parse_declaration_doc(self):
        """A .doc declaration is converted to XML and re-dispatched for parsing."""
        response = Response('http://old.vtek.lt/vtek/.../deklaracija2012.doc', body='msword msword msword')
        response.request = scrapy.Request(response.url)
        response.request.meta['year'] = '2012'

        def fake_doc2xml(msword_body):
            # The spider must hand doc2xml the raw .doc body unchanged
            assert msword_body == 'msword msword msword'
            return 'xml xml xml'

        with mock.patch('manoseimas.scrapy.spiders.lobbyist_declarations.doc2xml', fake_doc2xml):
            with mock.patch.object(self.spider, 'parse_declaration_xml') as parse_xml_mock:
                list(self.spider.parse_declaration_doc(response))
                # Exactly one follow-up parse with a synthetic XmlResponse
                assert parse_xml_mock.call_count == 1
                forwarded = parse_xml_mock.call_args[0][0]
                # meta is carried over; the body is the converted XML
                assert forwarded.meta['year'] == '2012'
                assert forwarded.body == 'xml xml xml'
                assert isinstance(forwarded, XmlResponse)
def test_hs_mware_process_spider_input(hs_mware):
    """process_spider_input registers the request with the hubstorage job."""
    response = Response('http://resp-url')
    response.request = Request('http://req-url')
    hs_mware.hsref.job.requests.add.return_value = 'riq'

    hs_mware.process_spider_input(response, Spider('test'))

    # Exactly one add() call, carrying the expected keyword arguments
    assert hs_mware.hsref.job.requests.add.call_count == 1
    recorded = hs_mware.hsref.job.requests.add.call_args[1]
    # The timestamp is machine-generated; check its type separately
    ts = recorded.pop('ts', None)
    assert isinstance(ts, float)
    assert recorded == {
        'duration': 0,
        'fp': request_fingerprint(response.request),
        'method': 'GET',
        'parent': None,
        'rs': 0,
        'status': 200,
        'url': 'http://resp-url'}
    # The response is remembered under the id returned by add()
    assert hs_mware._seen == WeakKeyDictionary({response: 'riq'})
Example no. 16
0
    def test_process_spider_output(self):
        """Depth middleware stats and filtering across two passes."""
        req = Request('http://scrapytest.org')
        resp = Response('http://scrapytest.org')
        resp.request = req
        result = [Request('http://scrapytest.org')]

        # First pass: the child request passes through and is counted at depth 1.
        # assertEquals is a deprecated alias removed in Python 3.12 -- use
        # assertEqual throughout.
        out = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out, result)

        rdc = self.stats.get_value('request_depth_count/1', spider=self.spider)
        self.assertEqual(rdc, 1)

        req.meta['depth'] = 1

        # Second pass: with the parent already at depth 1, the child is
        # filtered out entirely.
        out2 = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out2, [])

        rdm = self.stats.get_value('request_depth_max', spider=self.spider)
        self.assertEqual(rdm, 1)
Example no. 17
0
    def test_process_spider_output(self):
        """Depth middleware stats and filtering across two passes."""
        req = Request('http://scrapytest.org')
        resp = Response('http://scrapytest.org')
        resp.request = req
        result = [Request('http://scrapytest.org')]

        # First pass: the child request passes through and is counted at depth 1.
        # assertEquals is a deprecated alias removed in Python 3.12 -- use
        # assertEqual throughout.
        out = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out, result)

        rdc = self.stats.get_value('request_depth_count/1', spider=self.spider)
        self.assertEqual(rdc, 1)

        req.meta['depth'] = 1

        # Second pass: with the parent already at depth 1, the child is
        # filtered out entirely.
        out2 = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out2, [])

        rdm = self.stats.get_value('request_depth_max', spider=self.spider)
        self.assertEqual(rdm, 1)
    def _getresponse(self, coding):
        """Return a sample Response whose body is encoded with *coding*.

        Raises ValueError if *coding* has no sample registered in FORMAT.
        """
        if coding not in FORMAT:
            raise ValueError()

        sample_name, content_encoding = FORMAT[coding]

        with open(join(SAMPLEDIR, sample_name), 'rb') as sample_file:
            raw_body = sample_file.read()

        response = Response(
            'http://scrapytest.org/',
            body=raw_body,
            headers={
                'Server': 'Yaws/1.49 Yet Another Web Server',
                'Date': 'Sun, 08 Mar 2009 00:41:03 GMT',
                'Content-Length': len(raw_body),
                'Content-Type': 'text/html',
                'Content-Encoding': content_encoding,
            })
        # The paired request advertises compressed encodings so that the
        # decompression middleware under test acts on this response.
        response.request = Request('http://scrapytest.org',
                                   headers={'Accept-Encoding': 'gzip,deflate'})
        return response
    def _getresponse(self, coding):
        """Return a sample Response whose body is encoded with *coding*.

        Raises ValueError if *coding* has no sample registered in FORMAT.
        """
        if coding not in FORMAT:
            raise ValueError()

        samplefile, contentencoding = FORMAT[coding]

        with open(join(SAMPLEDIR, samplefile), "rb") as sample:
            body = sample.read()

        headers = {
            "Server": "Yaws/1.49 Yet Another Web Server",
            "Date": "Sun, 08 Mar 2009 00:41:03 GMT",
            "Content-Length": len(body),
            "Content-Type": "text/html",
            "Content-Encoding": contentencoding,
        }

        response = Response("http://scrapytest.org/", body=body, headers=headers)
        # The paired request advertises compressed encodings so that the
        # decompression middleware under test acts on this response.
        response.request = Request("http://scrapytest.org", headers={"Accept-Encoding": "gzip,deflate"})
        return response
Example no. 20
0
    def _getresponse(self, coding):
        """Return a sample Response whose body is encoded with *coding*.

        Raises ValueError if *coding* has no sample registered in FORMAT.
        """
        if coding not in FORMAT:
            raise ValueError()

        samplefile, contentencoding = FORMAT[coding]

        with open(join(SAMPLEDIR, samplefile), 'rb') as sample:
            body = sample.read()

        headers = {
            'Server': 'Yaws/1.49 Yet Another Web Server',
            'Date': 'Sun, 08 Mar 2009 00:41:03 GMT',
            'Content-Length': len(body),
            'Content-Type': 'text/html',
            'Content-Encoding': contentencoding,
        }

        response = Response('http://scrapytest.org/', body=body, headers=headers)
        # The paired request advertises compressed encodings so that the
        # decompression middleware under test acts on this response.
        response.request = Request('http://scrapytest.org', headers={'Accept-Encoding': 'gzip, deflate'})
        return response
Example no. 21
0
    def test_parse_declaration_doc(self):
        """A .doc declaration is converted to XML and re-dispatched for parsing."""
        response = Response('http://old.vtek.lt/vtek/.../deklaracija2012.doc',
                            body='msword msword msword')
        response.request = scrapy.Request(response.url)
        response.request.meta['year'] = '2012'

        def mock_doc2xml(msword):
            # The spider must hand doc2xml the raw .doc body unchanged
            assert msword == 'msword msword msword'
            return 'xml xml xml'

        with mock.patch(
                'manoseimas.scrapy.spiders.lobbyist_declarations.doc2xml',
                mock_doc2xml):
            with mock.patch.object(self.spider,
                                   'parse_declaration_xml') as p_d_x:
                list(self.spider.parse_declaration_doc(response))
                # Exactly one follow-up parse with a synthetic XmlResponse
                assert p_d_x.call_count == 1
                new_response = p_d_x.call_args[0][0]
                # meta is carried over; the body is the converted XML
                assert new_response.meta['year'] == '2012'
                assert new_response.body == 'xml xml xml'
                assert isinstance(new_response, XmlResponse)
Example no. 22
0
def test_js_item_emission(spider, linked_js_request, js_headers, mock_js):
    """JS items are emitted correctly"""
    # Generate a mock response based on JS
    mock_url = 'http://test:12345/default.js'
    mock_response = Response(mock_url,
                             body=mock_js)
    mock_response.request = linked_js_request
    mock_response.headers = js_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # Generate a fake urlscan to use in our item comparison; the emitted
    # item is expected to reference this exact record
    mock_urlscan = model.URLScan.objects.create(
        site_scan=linked_js_request.meta['sitescan'],
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Send the mocks to the spider for processing
    pipeline_generator = spider.parse(mock_response)

    # Verify the item returned is what we expected, field by field
    # (NOTE: unicode() and u'' literals mean this is Python 2 code)
    item_expected = MarkupItem()
    item_expected['content_type'] = spider.get_content_type(js_headers)
    item_expected['filename'] = os.path.basename(urlparse(mock_url).path)
    item_expected['headers'] = unicode(js_headers)
    item_expected['meta'] = mock_response.meta
    item_expected['raw_content'] = mock_response.body
    item_expected['sitescan'] = linked_js_request.meta['sitescan']
    item_expected['urlscan'] = mock_urlscan
    item_expected['url'] = mock_response.url
    item_expected['user_agent'] = mock_response.meta['user_agent']
    item_expected['redirected_from'] = ''

    # The spider must emit exactly this one item and nothing else
    assert list(pipeline_generator) == [item_expected]