Esempio n. 1
0
    def to_frontier(self, scrapy_request):
        """request: Scrapy > Frontier

        Build a ``FrontierRequest`` carrying the URL, method, headers and
        cookies of ``scrapy_request``.

        Callable callback/errback values are replaced by the result of
        ``_find_method(self.spider, ...)`` (presumably the method name on the
        spider -- confirm against ``_find_method``).

        Side effects visible in this method:
          * a ``'frontier_request'`` entry is removed from
            ``scrapy_request.meta`` in place;
          * when that entry is a ``FrontierRequest``, its ``meta`` dict is
            reused (and updated in place) as the meta of the returned request.
        """
        raw_cookies = scrapy_request.cookies
        if isinstance(raw_cookies, dict):
            cookies = raw_cookies
        else:
            # Merge the sequence of cookie dicts into one flat dict; later
            # entries win. The previous ``sum([d.items() ...], [])`` form
            # breaks on Python 3, where ``dict.items()`` returns a view that
            # cannot be concatenated to a list.
            cookies = {}
            for cookie in raw_cookies:
                cookies.update(cookie)

        cb = scrapy_request.callback
        if callable(cb):
            cb = _find_method(self.spider, cb)
        eb = scrapy_request.errback
        if callable(eb):
            eb = _find_method(self.spider, eb)

        scrapy_meta = scrapy_request.meta
        meta = {}
        if 'frontier_request' in scrapy_meta:
            request = scrapy_meta['frontier_request']
            if isinstance(request, FrontierRequest):
                # Continue from the metadata of the originating frontier
                # request instead of starting from scratch.
                meta = request.meta
            # The entry is dropped before scrapy_meta is embedded below.
            del scrapy_meta['frontier_request']

        meta.update({
            'scrapy_callback': cb,
            'scrapy_errback': eb,
            'scrapy_meta': scrapy_meta,
            'origin_is_frontier': True,
        })
        return FrontierRequest(url=scrapy_request.url,
                               method=scrapy_request.method,
                               headers=scrapy_request.headers,
                               cookies=cookies,
                               meta=meta)
Esempio n. 2
0
 def to_frontier(self, request):
     """request: Requests > Frontier

     Copy the URL, method and headers of ``request`` into a new
     ``FrontierRequest``; cookies are copied when the request exposes a
     ``cookies`` attribute, otherwise an empty dict is used.
     """
     target_url = request.url
     http_method = request.method
     header_map = request.headers
     if hasattr(request, 'cookies'):
         cookie_map = request.cookies
     else:
         cookie_map = {}
     return FrontierRequest(url=target_url,
                            method=http_method,
                            headers=header_map,
                            cookies=cookie_map)
    def perform_test(self, output_func):
        """Run a response through ``self.smw.scrape_response`` and hand the
        result to ``output_func``.

        The request carries a ``FrontierRequest`` in its meta and a callback
        that yields a single new ``Request``. Failures raised along the
        deferred chain are appended to ``self._observer._errors``.
        """
        origin_url = 'http://www.scrapy.org'

        def request_callback(response):
            yield Request('http://frontera.org')

        def call_request_callback(result, request, spider):
            # Wrap the request's own callback in a fresh, unfired Deferred.
            pending = Deferred()
            pending.addCallback(request.callback)
            return pending

        def test_failure(failure):
            # work around for test to fail with detailed traceback
            self._observer._errors.append(failure)

        request_meta = {b'frontier_request': FrontierRequest(origin_url)}
        scrapy_request = Request(url=origin_url,
                                 callback=request_callback,
                                 meta=request_meta)
        scrapy_response = Response(url=origin_url, request=scrapy_request)

        chain = self.smw.scrape_response(call_request_callback,
                                         scrapy_response,
                                         scrapy_request,
                                         self.spider)
        chain.addCallback(output_func)
        chain.addErrback(test_failure)

        # Fire the chain with the response as the initial result.
        chain.callback(scrapy_response)
Esempio n. 4
0
def test_request_response_converters():
    """Round-trip a request and a response through the converters.

    Covers Scrapy -> Frontier -> Scrapy for the request, the same round trip
    for the response, and finally the conversion of a bare
    ``FrontierRequest`` built from a URL alone.
    """
    spider = TestSpider()
    rc = RequestConverter(spider)
    rsc = ResponseConverter(spider, rc)

    url = "http://test.com/test?param=123"
    request = ScrapyRequest(url=url,
                            callback=spider.callback,
                            errback=spider.errback,
                            body=REQUEST_BODY)
    request.meta['test_param'] = 'test_value'
    request.headers.appendlist("TestKey", "test value")
    request.cookies['MyCookie'] = 'CookieContent'

    # Scrapy request -> Frontier request
    frontier_request = rc.to_frontier(request)
    assert frontier_request.meta['scrapy_callback'] == 'callback'
    assert frontier_request.meta['scrapy_errback'] == 'errback'
    assert frontier_request.body == REQUEST_BODY
    assert frontier_request.url == url
    assert frontier_request.method == 'GET'
    assert frontier_request.headers['Testkey'] == 'test value'
    assert frontier_request.cookies['MyCookie'] == 'CookieContent'
    assert 'frontier_request' not in frontier_request.meta['scrapy_meta']

    # Frontier request -> Scrapy request
    request_converted = rc.from_frontier(frontier_request)
    assert request_converted.meta['test_param'] == 'test_value'
    assert request_converted.body == REQUEST_BODY
    assert request_converted.url == url
    assert request_converted.method == 'GET'
    assert request_converted.cookies['MyCookie'] == 'CookieContent'
    assert request_converted.headers.get('Testkey') == 'test value'

    # Some middleware could change .meta contents
    request_converted.meta['middleware_stuff'] = 'appeared'

    response = ScrapyResponse(url=url,
                              request=request_converted,
                              body=RESPONSE_BODY,
                              headers={'TestHeader': 'Test value'})

    # Scrapy response -> Frontier response
    frontier_response = rsc.to_frontier(response)
    assert frontier_response.body == RESPONSE_BODY
    assert frontier_response.meta['scrapy_meta']['test_param'] == 'test_value'
    assert frontier_response.meta['scrapy_meta'][
        'middleware_stuff'] == 'appeared'
    assert frontier_response.status_code == 200
    assert 'frontier_request' not in frontier_response.meta['scrapy_meta']

    # Frontier response -> Scrapy response
    response_converted = rsc.from_frontier(frontier_response)
    assert response_converted.body == RESPONSE_BODY
    assert response_converted.meta['test_param'] == 'test_value'
    assert response_converted.url == url
    assert response_converted.status == 200
    assert response_converted.headers['TestHeader'] == 'Test value'

    # A bare frontier request (built from the URL only) must convert as well.
    frontier_request = FrontierRequest(url)
    request_converted = rc.from_frontier(frontier_request)
    assert frontier_request.url == url
    # Check the converted object too; asserting only on the freshly built
    # frontier request would leave the conversion result unverified.
    assert request_converted.url == url
Esempio n. 5
0
    def to_frontier(self, scrapy_request):
        """request: Scrapy > Frontier

        Build a ``FrontierRequest`` from ``scrapy_request``. Besides URL,
        method, headers (copied into a plain dict) and cookies, the returned
        request's meta stores:

          * ``b'scrapy_callback'`` / ``b'scrapy_errback'``: the callback and
            errback, passed through ``_find_method(self.spider, ...)`` when
            callable;
          * ``b'scrapy_meta'`` and ``b'scrapy_body'``: the request's meta and
            body;
          * ``b'spider_state'``: ``(attribute, value)`` pairs for every name
            in the ``FRONTERA_SCHEDULER_STATE_ATTRIBUTES`` setting (``None``
            for attributes missing on the spider);
          * ``b'frontier_fingerprint'``: the ``'frontier_fingerprint'`` entry
            of the request's meta if present, else a computed fingerprint;
          * ``b'frontier_slot_prefix'`` and, optionally,
            ``b'frontier_number_of_slots'``: taken from the
            ``FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP`` entry for this
            request's callback name, formatted as ``prefix`` or
            ``prefix/number``.

        Raises:
            ValueError: if the part after ``/`` in a slot-prefix map entry is
                not an integer.
        """
        raw_cookies = scrapy_request.cookies
        if isinstance(raw_cookies, dict):
            cookies = raw_cookies
        else:
            # Merge the sequence of cookie dicts into one flat dict; later
            # entries win. ``sum([d.items() ...], [])`` cannot be used here:
            # on Python 3 ``dict.items()`` returns a view and list + view
            # raises TypeError.
            cookies = {}
            for cookie in raw_cookies:
                cookies.update(cookie)

        cb = scrapy_request.callback
        if callable(cb):
            cb = _find_method(self.spider, cb)
        eb = scrapy_request.errback
        if callable(eb):
            eb = _find_method(self.spider, eb)

        statevars = self.spider.crawler.settings.getlist(
            'FRONTERA_SCHEDULER_STATE_ATTRIBUTES', [])
        meta = {
            b'scrapy_callback':
            cb,
            b'scrapy_errback':
            eb,
            b'scrapy_meta':
            scrapy_request.meta,
            b'scrapy_body':
            scrapy_request.body,
            b'spider_state':
            [(attr, getattr(self.spider, attr, None)) for attr in statevars],
            b'origin_is_frontier':
            True,
        }

        fingerprint_scrapy_request = scrapy_request
        if fingerprint_scrapy_request.dont_filter:
            # If dont_filter is True we need to simulate "not filtering" by
            # generating a different fingerprint each time we see the same
            # request, so randomly alter the url used for fingerprinting.
            fake_url = fingerprint_scrapy_request.url + str(uuid.uuid4())
            fingerprint_scrapy_request = fingerprint_scrapy_request.replace(
                url=fake_url)
        # An explicit fingerprint in the request meta takes precedence over
        # the computed one.
        meta[b'frontier_fingerprint'] = scrapy_request.meta.get(
            'frontier_fingerprint',
            request_fingerprint(fingerprint_scrapy_request))

        callback_slot_prefix_map = self.spider.crawler.settings.getdict(
            "FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP")
        frontier_slot_prefix_num_slots = callback_slot_prefix_map.get(
            get_callback_name(scrapy_request))
        if frontier_slot_prefix_num_slots:
            # Entry format is "prefix" or "prefix/number_of_slots".
            frontier_slot_prefix, *rest = frontier_slot_prefix_num_slots.split(
                '/', 1)
            meta[b'frontier_slot_prefix'] = frontier_slot_prefix
            if rest:
                meta[b'frontier_number_of_slots'] = int(rest[0])
        return FrontierRequest(url=scrapy_request.url,
                               method=scrapy_request.method,
                               headers=dict(scrapy_request.headers.items()),
                               cookies=cookies,
                               meta=meta)
Esempio n. 6
0
 def to_frontier(cls, request):
     """request: Scrapy > Frontier

     Build a ``FrontierRequest`` with the URL, method, headers and cookies
     of ``request``. The meta starts with ``'scrapy_callback'`` (the
     request's callback) and ``'origin_is_frontier': True`` and is then
     updated with the request's own meta, so keys present in
     ``request.meta`` take precedence over those two defaults.
     """
     raw_cookies = request.cookies
     if isinstance(raw_cookies, dict):
         cookies = raw_cookies
     else:
         # Merge the sequence of cookie dicts into one flat dict; later
         # entries win. The previous ``sum([d.items() ...], [])`` form
         # breaks on Python 3, where ``dict.items()`` returns a view that
         # cannot be concatenated to a list.
         cookies = {}
         for cookie in raw_cookies:
             cookies.update(cookie)
     meta = {
         'scrapy_callback': request.callback,
         'origin_is_frontier': True,
     }
     # A falsy meta (None or empty) contributes nothing.
     meta.update(request.meta or {})
     return FrontierRequest(url=request.url,
                            method=request.method,
                            headers=request.headers,
                            cookies=cookies,
                            meta=meta)
Esempio n. 7
0
    def test_frontera_scheduler_spider_middleware_with_referer_middleware(
            self):
        """Check the output of ``self.smw.scrape_response`` for a request
        whose callback yields one new ``Request``.

        The output must contain exactly one ``Request`` whose ``Referer``
        header equals the URL of the scraped response. Failures raised along
        the deferred chain are appended to ``self._observer._errors``.
        """
        def request_callback(response):
            yield Request('http://frontera.org')

        req = Request(url='http://www.scrapy.org',
                      callback=request_callback,
                      meta={
                          b'frontier_request':
                          FrontierRequest('http://www.scrapy.org')
                      })

        res = Response(url='http://www.scrapy.org', request=req)

        def call_request_callback(result, request, spider):
            # Wrap the request's own callback in a fresh, unfired Deferred.
            dfd = Deferred()
            dfd.addCallback(request.callback)
            return dfd

        def test_middleware_output(result):
            out = list(result)
            # assertEqual instead of the deprecated assertEquals alias,
            # which was removed from unittest in Python 3.12.
            self.assertEqual(len(out), 1)
            self.assertIsInstance(out[0], Request)
            self.assertIn('Referer', out[0].headers)
            self.assertEqual(out[0].headers['Referer'], to_bytes(res.url))

        def test_failure(failure):
            # work around for test to fail with detailed traceback
            self._observer._errors.append(failure)

        dfd = self.smw.scrape_response(call_request_callback, res, req,
                                       self.spider)

        dfd.addCallback(test_middleware_output)
        dfd.addErrback(test_failure)

        # Fire the chain with the response as the initial result.
        dfd.callback(res)
Esempio n. 8
0
 def to_frontier(cls, request):
     """request: AsyncRequest > Frontier

     Only the URL and the HTTP method of ``request`` are carried over to
     the new ``FrontierRequest``.
     """
     target_url = request.url
     http_method = request.method
     return FrontierRequest(url=target_url, method=http_method)