def to_frontier(self, scrapy_request):
    """request: Scrapy > Frontier

    Convert a Scrapy Request into a FrontierRequest, preserving the
    callback/errback (as spider method names), the Scrapy meta dict and
    the cookies.  If the Scrapy request carries a 'frontier_request' in
    its meta, that frontier request's meta is reused as the base.
    """
    # Scrapy accepts cookies either as a dict or as a list of dicts;
    # normalize to a single flat dict.
    if isinstance(scrapy_request.cookies, dict):
        cookies = scrapy_request.cookies
    else:
        # dict.items() returns a view on Python 3, so the old
        # sum([d.items() ...], []) idiom raises TypeError; merge the
        # list of cookie dicts with a comprehension instead.
        cookies = {k: v for d in scrapy_request.cookies for k, v in d.items()}
    cb = scrapy_request.callback
    if callable(cb):
        cb = _find_method(self.spider, cb)
    eb = scrapy_request.errback
    if callable(eb):
        eb = _find_method(self.spider, eb)
    scrapy_meta = scrapy_request.meta
    meta = {}
    if 'frontier_request' in scrapy_meta:
        request = scrapy_meta['frontier_request']
        if isinstance(request, FrontierRequest):
            # Reuse the original frontier request's meta as the base.
            meta = request.meta
        # Remove the marker so it does not round-trip inside scrapy_meta.
        del scrapy_meta['frontier_request']
    meta.update({
        'scrapy_callback': cb,
        'scrapy_errback': eb,
        'scrapy_meta': scrapy_meta,
        'origin_is_frontier': True,
    })
    return FrontierRequest(url=scrapy_request.url,
                           method=scrapy_request.method,
                           headers=scrapy_request.headers,
                           cookies=cookies,
                           meta=meta)
def to_frontier(self, request):
    """request: Requests > Frontier"""
    # Fall back to an empty cookie jar when the request object has none.
    cookies = getattr(request, 'cookies', {})
    return FrontierRequest(url=request.url,
                           method=request.method,
                           headers=request.headers,
                           cookies=cookies)
def perform_test(self, output_func):
    """Drive the spider middleware with a canned request/response pair and
    deliver the middleware output to *output_func*."""
    def request_callback(response):
        yield Request('http://frontera.org')

    seed_request = Request(
        url='http://www.scrapy.org',
        callback=request_callback,
        meta={b'frontier_request': FrontierRequest('http://www.scrapy.org')})
    seed_response = Response(url='http://www.scrapy.org', request=seed_request)

    def call_request_callback(result, request, spider):
        inner = Deferred()
        inner.addCallback(request.callback)
        return inner

    def test_failure(failure):
        # work around for test to fail with detailed traceback
        self._observer._errors.append(failure)

    deferred = self.smw.scrape_response(
        call_request_callback, seed_response, seed_request, self.spider)
    deferred.addCallback(output_func)
    deferred.addErrback(test_failure)
    deferred.callback(seed_response)
def test_request_response_converters():
    """Round-trip requests and responses through the frontier converters
    and verify every field survives each conversion."""
    spider = TestSpider()
    request_converter = RequestConverter(spider)
    response_converter = ResponseConverter(spider, request_converter)

    url = "http://test.com/test?param=123"
    request = ScrapyRequest(url=url, callback=spider.callback,
                            errback=spider.errback, body=REQUEST_BODY)
    request.meta['test_param'] = 'test_value'
    request.headers.appendlist("TestKey", "test value")
    request.cookies['MyCookie'] = 'CookieContent'

    # Scrapy request -> frontier request
    frontier_request = request_converter.to_frontier(request)
    assert frontier_request.meta['scrapy_callback'] == 'callback'
    assert frontier_request.meta['scrapy_errback'] == 'errback'
    assert frontier_request.body == REQUEST_BODY
    assert frontier_request.url == url
    assert frontier_request.method == 'GET'
    assert frontier_request.headers['Testkey'] == 'test value'
    assert frontier_request.cookies['MyCookie'] == 'CookieContent'
    assert 'frontier_request' not in frontier_request.meta['scrapy_meta']

    # frontier request -> Scrapy request
    request_converted = request_converter.from_frontier(frontier_request)
    assert request_converted.meta['test_param'] == 'test_value'
    assert request_converted.body == REQUEST_BODY
    assert request_converted.url == url
    assert request_converted.method == 'GET'
    assert request_converted.cookies['MyCookie'] == 'CookieContent'
    assert request_converted.headers.get('Testkey') == 'test value'

    # Some middleware could change .meta contents
    request_converted.meta['middleware_stuff'] = 'appeared'

    # Scrapy response -> frontier response
    response = ScrapyResponse(url=url, request=request_converted,
                              body=RESPONSE_BODY,
                              headers={'TestHeader': 'Test value'})
    frontier_response = response_converter.to_frontier(response)
    assert frontier_response.body == RESPONSE_BODY
    assert frontier_response.meta['scrapy_meta']['test_param'] == 'test_value'
    assert frontier_response.meta['scrapy_meta']['middleware_stuff'] == 'appeared'
    assert frontier_response.status_code == 200
    assert 'frontier_request' not in frontier_response.meta['scrapy_meta']

    # frontier response -> Scrapy response
    response_converted = response_converter.from_frontier(frontier_response)
    assert response_converted.body == RESPONSE_BODY
    assert response_converted.meta['test_param'] == 'test_value'
    assert response_converted.url == url
    assert response_converted.status == 200
    assert response_converted.headers['TestHeader'] == 'Test value'

    # A bare frontier request (no scrapy metadata) must still convert.
    frontier_request = FrontierRequest(url)
    request_converted = request_converter.from_frontier(frontier_request)
    assert frontier_request.url == url
def to_frontier(self, scrapy_request):
    """request: Scrapy > Frontier

    Convert a Scrapy Request into a FrontierRequest, carrying the
    callback/errback names, meta, body, configured spider state
    attributes, a request fingerprint and (optionally) frontier slot
    assignment derived from the callback name.
    """
    # Scrapy accepts cookies either as a dict or as a list of dicts;
    # normalize to a single flat dict.
    if isinstance(scrapy_request.cookies, dict):
        cookies = scrapy_request.cookies
    else:
        # dict.items() returns a view on Python 3, so the old
        # sum([d.items() ...], []) idiom raises TypeError; merge the
        # list of cookie dicts with a comprehension instead.
        cookies = {k: v for d in scrapy_request.cookies for k, v in d.items()}
    cb = scrapy_request.callback
    if callable(cb):
        cb = _find_method(self.spider, cb)
    eb = scrapy_request.errback
    if callable(eb):
        eb = _find_method(self.spider, eb)
    # Spider attributes to snapshot into the frontier request.
    statevars = self.spider.crawler.settings.getlist(
        'FRONTERA_SCHEDULER_STATE_ATTRIBUTES', [])
    meta = {
        b'scrapy_callback': cb,
        b'scrapy_errback': eb,
        b'scrapy_meta': scrapy_request.meta,
        b'scrapy_body': scrapy_request.body,
        b'spider_state': [(attr, getattr(self.spider, attr, None))
                          for attr in statevars],
        b'origin_is_frontier': True,
    }
    fingerprint_scrapy_request = scrapy_request
    if fingerprint_scrapy_request.dont_filter:
        # If dont_filter is True, we need to simulate not filtering by
        # generating a different fingerprint each time we see the same
        # request.  So let's alter the url randomly.
        fake_url = fingerprint_scrapy_request.url + str(uuid.uuid4())
        fingerprint_scrapy_request = fingerprint_scrapy_request.replace(
            url=fake_url)
    meta[b'frontier_fingerprint'] = scrapy_request.meta.get(
        'frontier_fingerprint', request_fingerprint(fingerprint_scrapy_request))
    # Optional slot assignment: maps callback name -> "prefix[/num_slots]".
    callback_slot_prefix_map = self.spider.crawler.settings.getdict(
        "FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP")
    frontier_slot_prefix_num_slots = callback_slot_prefix_map.get(
        get_callback_name(scrapy_request))
    if frontier_slot_prefix_num_slots:
        frontier_slot_prefix, *rest = frontier_slot_prefix_num_slots.split('/', 1)
        meta[b'frontier_slot_prefix'] = frontier_slot_prefix
        if rest:
            meta[b'frontier_number_of_slots'] = int(rest[0])
    return FrontierRequest(url=scrapy_request.url,
                           method=scrapy_request.method,
                           headers=dict(scrapy_request.headers.items()),
                           cookies=cookies,
                           meta=meta)
def to_frontier(cls, request): """request: Scrapy > Frontier""" if isinstance(request.cookies, dict): cookies = request.cookies else: cookies = dict(sum([d.items() for d in request.cookies], [])) meta = { 'scrapy_callback': request.callback, 'origin_is_frontier': True, } meta.update(request.meta or {}) return FrontierRequest(url=request.url, method=request.method, headers=request.headers, cookies=cookies, meta=meta)
def test_frontera_scheduler_spider_middleware_with_referer_middleware(self):
    """The middleware chain must emit exactly one request carrying a
    Referer header equal to the originating response url."""
    def request_callback(response):
        yield Request('http://frontera.org')

    req = Request(
        url='http://www.scrapy.org',
        callback=request_callback,
        meta={b'frontier_request': FrontierRequest('http://www.scrapy.org')})
    res = Response(url='http://www.scrapy.org', request=req)

    def call_request_callback(result, request, spider):
        dfd = Deferred()
        dfd.addCallback(request.callback)
        return dfd

    def test_middleware_output(result):
        out = list(result)
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual instead.
        self.assertEqual(len(out), 1)
        self.assertIsInstance(out[0], Request)
        self.assertIn('Referer', out[0].headers)
        self.assertEqual(out[0].headers['Referer'], to_bytes(res.url))

    def test_failure(failure):
        # work around for test to fail with detailed traceback
        self._observer._errors.append(failure)

    dfd = self.smw.scrape_response(call_request_callback, res, req, self.spider)
    dfd.addCallback(test_middleware_output)
    dfd.addErrback(test_failure)
    dfd.callback(res)
def to_frontier(cls, request): """request: AsyncRequest > Frontier""" return FrontierRequest(url=request.url, method=request.method)