# NOTE(review): this test body was flattened onto single lines during
# extraction and contains a redacted/garbled expected value
# (u'http://*****:*****@name=...' fuses the form_url with a field xpath) --
# reconstruct from version control before reformatting or running it.
# Kept byte-identical below.
# Purpose (from the visible code): drives the "ebay2" generic-form spider
# through three callback hops -- two file-parameter pages served from
# file://tmp/test_params.txt, then the ebay advanced-search form page --
# and compares each batch of serialized requests (request_to_dict) against
# literal expected dicts, ending with the cartesian product of the
# Cars/Boats file values over _nkw, _nkw2 and _in_kw=1..4.
def test_generic_form_requests_with_file_field(self): name = "ebay2" spider = self.smanager.create(name) generic_form_request = list(spider.start_requests())[0] self.assertEqual(generic_form_request.url, 'file://tmp/test_params.txt') response = HtmlResponse(url='file://tmp/test_params.txt', body=open(join(_PATH, "data", "test_params.txt")).read()) response.request = generic_form_request requests = list(generic_form_request.callback(response)) request_list = [request_to_dict(req, spider) for req in requests] expected = [{'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {u'xpath': u"//form[@name='adv_search_from']", u'form_url': u'http://*****:*****@name='_nkw']", 'file_values': ['Cars', 'Boats'], u'type': u'inurl', u'value': u'file://tmp/test_params.txt'}, {u'type': u'inurl', u'name': u'_nkw2', u'value': u'file://tmp/test_params.txt'}, {u'xpath': u".//*[@name='_in_kw']", u'type': u'iterate'}]}, 'headers': {}, 'url': u'file://tmp/test_params.txt', 'dont_filter': True, 'priority': 0, 'callback': 'parse_field_url_page', 'method': 'GET', 'errback': None}] self.assertEqual(request_list, expected) generic_form_request = requests[0] self.assertEqual(generic_form_request.url, 'file://tmp/test_params.txt') response = HtmlResponse(url='file://tmp/test_params.txt', body=open(join(_PATH, "data", "test_params.txt")).read()) response.request = generic_form_request requests = list(generic_form_request.callback(response)) request_list = [request_to_dict(req, spider) for req in requests] expected = [{'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {u'xpath': u"//form[@name='adv_search_from']", u'fields': [{u'xpath': u".//*[@name='_nkw']", 'file_values': ['Cars', 'Boats'], u'type': u'inurl', u'value': u'file://tmp/test_params.txt'}, {'file_values': ['Cars', 'Boats'], u'type': u'inurl', u'name': u'_nkw2', u'value': u'file://tmp/test_params.txt'}, {u'xpath': u".//*[@name='_in_kw']", u'type': u'iterate'}], u'type': u'form', 'field_index': 1}, 'headers': {}, 'url': 
u'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'dont_filter': True, 'priority': 0, 'callback': 'parse_form_page', 'method': 'GET', 'errback': None}] self.assertEqual(request_list, expected) generic_form_request = requests[0] self.assertEqual(generic_form_request.url, 'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc') response = HtmlResponse(url="http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc", body=open(join(_PATH, "data", "ebay_advanced_search.html")).read()) response.request = generic_form_request request_list = [request_to_dict(req, spider) for req in generic_form_request.callback(response)] expected = [{'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 
'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': 
u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': 
u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Cars&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': 
u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_nkw2=Boats&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Boats&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'dont_filter': True, 'priority': 0, 'callback': 'parse', 'method': 'GET', 'errback': None}] self.assertEqual(request_list, expected)
def test_inline_callback():
    """Serializing a request whose callback is not a spider method must fail."""
    spider = MySpider()
    cb = callback_for(FakeItemPage)
    request = scrapy.Request('http://example.com/', callback=cb)
    with pytest.raises(ValueError) as exc:
        request_to_dict(request, spider)
    expected_msg = f'Function {cb} is not a method of: {spider}'
    assert str(exc.value) == expected_msg
def test_instance_method_callback():
    """Bound spider-method callbacks serialize to their method name."""
    spider = MySpider()
    cases = (
        (spider.parse_item, 'parse_item'),
        (spider.parse_web, 'parse_web'),
    )
    for callback, expected_name in cases:
        request = scrapy.Request('http://example.com/', callback=callback)
        request_dict = request_to_dict(request, spider)
        assert isinstance(request_dict, dict)
        assert request_dict['url'] == 'http://example.com/'
        assert request_dict['callback'] == expected_name
# NOTE(review): flattened onto single lines during extraction; kept
# byte-identical below -- reformat from version control, not by hand.
# This is the bytes-body (b'') variant of the generic-form test above,
# using UTF8HtmlResponse -- presumably the Python 3 / newer-Scrapy port
# of the same scenario; confirm against project history.
# Purpose (from the visible code): drives the "ebay2" generic-form spider
# through two file-parameter pages and the ebay advanced-search form,
# comparing each serialized request batch (request_to_dict) against
# literal expected dicts, ending with the Cars/Boats cartesian product
# over _nkw, _nkw2 and _in_kw=1..4 plus a final 'parse' request.
def test_generic_form_requests_with_file_field(self): name = "ebay2" spider = self.smanager.create(name) generic_form_request = list(spider.start_requests())[0] self.assertEqual(generic_form_request.url, 'file://tmp/test_params.txt') response = UTF8HtmlResponse(url='file://tmp/test_params.txt', body=open(join(_PATH, "data", "test_params.txt")).read()) response.request = generic_form_request requests = list(generic_form_request.callback(response)) request_list = [request_to_dict(req, spider) for req in requests] expected = [{'body': b'', '_encoding': 'utf-8', 'cookies': {}, 'meta': {u'xpath': u"//form[@name='adv_search_from']", u'form_url': u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc', u'type': u'form', 'field_index': 1, u'fields': [{u'xpath': u".//*[@name='_nkw']", 'file_values': ['Cars', 'Boats'], u'type': u'inurl', u'value': u'file://tmp/test_params.txt'}, {u'type': u'inurl', u'name': u'_nkw2', u'value': u'file://tmp/test_params.txt'}, {u'xpath': u".//*[@name='_in_kw']", u'type': u'iterate'}]}, 'headers': {}, 'url': u'file://tmp/test_params.txt', 'dont_filter': True, 'priority': 0, 'callback': 'parse_field_url_page', 'method': 'GET', 'errback': None}] self.assertEqual(request_list, expected) generic_form_request = requests[0] self.assertEqual(generic_form_request.url, 'file://tmp/test_params.txt') response = UTF8HtmlResponse(url='file://tmp/test_params.txt', body=open(join(_PATH, "data", "test_params.txt")).read()) response.request = generic_form_request requests = list(generic_form_request.callback(response)) request_list = [request_to_dict(req, spider) for req in requests] expected = [{'_encoding': 'utf-8', 'cookies': {}, 'dont_filter': True, 'errback': None, 'meta': {'fields': [{'type': 'inurl', 'file_values': ['Cars', 'Boats'], 'xpath': ".//*[@name='_nkw']", 'value': 'file://tmp/test_params.txt'}, {'name': '_nkw2', 'file_values': ['Cars', 'Boats'], 'type': 'inurl', 'value': 'file://tmp/test_params.txt'}, {'xpath': ".//*[@name='_in_kw']", 'type': 
'iterate'}], 'type': 'form', 'xpath': "//form[@name='adv_search_from']", 'field_index': 1}, 'method': 'GET', 'priority': 0, 'headers': {}, 'body': b'', 'url': 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'callback': 'parse_form_page'}] self.assertEqual(request_list, expected) generic_form_request = requests[0] self.assertEqual(generic_form_request.url, 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc') response = UTF8HtmlResponse(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc", body=open(join(_PATH, "data", "ebay_advanced_search.html")).read()) response.request = generic_form_request request_list = [request_to_dict(req, spider) for req in generic_form_request.callback(response)] expected = [ {'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=1&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Cars', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=2&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Cars', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=3&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Cars', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 
'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=4&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Cars', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=1&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Boats', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=2&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Boats', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=3&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Boats', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 
'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=4&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Boats', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Boats&_in_kw=1&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Cars', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Boats&_in_kw=2&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Cars', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Boats&_in_kw=3&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Cars', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 
'http://www.ebay.com/sch/i.html?_nkw=Boats&_in_kw=4&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Cars', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Boats&_in_kw=1&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Boats', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Boats&_in_kw=2&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Boats', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/i.html?_nkw=Boats&_in_kw=3&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Boats', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 
'http://www.ebay.com/sch/i.html?_nkw=Boats&_in_kw=4&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50&_nkw2=Boats', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'after_form_page', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}, {'url': 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'method': 'GET', 'cookies': {}, 'headers': {}, 'dont_filter': True, 'callback': 'parse', '_encoding': 'utf-8', 'body': b'', 'errback': None, 'priority': 0, 'meta': {}}] self.assertEqual(request_list, expected)
def _dqpush(self, request):
    # No disk queue instantiated: nothing to do (implicitly returns None).
    if self.dqs is None:
        return
    try:
        # Serialize the request into a plain dict (url, callback/errback
        # names, method, headers, body, cookies, meta, _encoding, priority,
        # dont_filter) so it can be persisted on disk.
        reqd = request_to_dict(request, self.spider)
        # Push into the priority queue. The priority is negated --
        # presumably because the disk queue pops the lowest score first,
        # matching Scrapy's scheduler convention; confirm against the
        # queue implementation.
        self.dqs.push(reqd, -request.priority)
    except ValueError as e:  # non serializable request
        if self.logunser:
            logger.error(
                "Unable to serialize request: %(request)s - reason: %(reason)s",
                {
                    'request': request,
                    'reason': e
                },
                exc_info=True,
                extra={'spider': self.spider})
        # Serialization failed: fall through (returns None).
        return
    else:
        # Successfully enqueued on disk.
        return True
def _dqpush(self, request):
    """Try to enqueue *request* on the disk queue.

    :param request: the Request to persist.
    :return: True if the serialized request was pushed; None if there is
        no disk queue or the request could not be serialized.
    """
    if self.dqs is None:
        return
    try:
        # Serialize to a plain dict and push with negated priority
        # (lower score pops first).
        reqd = request_to_dict(request, self.spider)
        self.dqs.push(reqd, -request.priority)
    except ValueError as e:  # non serializable request
        if self.logunser:
            msg = ("Unable to serialize request: %(request)s - reason:"
                   " %(reason)s - no more unserializable requests will be"
                   " logged (stats being collected)")
            logger.warning(msg, {
                'request': request,
                'reason': e
            }, exc_info=True, extra={'spider': self.spider})
            # Only log the first failure; afterwards rely on the stat.
            self.logunser = False
        self.stats.inc_value('scheduler/unserializable', spider=self.spider)
        return
    else:
        return True
def enqueue_request(self, request):
    """Serialize *request* and store it in the backing collection with a timestamp."""
    serialized = request_to_dict(request, self.spider)
    document = {
        'data': serialized,
        'created': datetime.datetime.utcnow(),
    }
    self.collection.insert(document)
# NOTE(review): flattened onto a single line during extraction; kept
# byte-identical -- reformat from version control, not by hand.
# Purpose (from the visible code): builds the pinterest.com spider, feeds
# its login request a canned login page, asserts the resulting POST form
# request serializes to the literal expected dict, then simulates a
# post-login page and asserts after_login yields the extracted link plus
# the 'popular' start URL.
def test_login_requests(self): name = "pinterest.com" spider = self.smanager.create(name) login_request = list(spider.start_requests())[0] response = HtmlResponse(url="https://pinterest.com/login/", body=open(join(_PATH, "data", "pinterest.html")).read()) response.request = login_request form_request = login_request.callback(response) expected = {'_encoding': 'utf-8', 'body': 'email=test&password=testpass&csrfmiddlewaretoken=nLZy3NMzhTswZvweHJ4KVmq9UjzaZGn3&_ch=ecnwmar2', 'callback': 'after_login', 'cookies': {}, 'dont_filter': True, 'errback': None, 'headers': {'Content-Type': ['application/x-www-form-urlencoded']}, 'meta': {}, 'method': 'POST', 'priority': 0, 'url': u'https://pinterest.com/login/?next=%2F'} self.assertEqual(request_to_dict(form_request, spider), expected) # simulate a simple response to login post from which extract a link response = HtmlResponse(url="http://pinterest.com/", body="<html><body><a href='http://pinterest.com/categories'></body></html>") result = list(spider.after_login(response)) self.assertEqual([r.url for r in result], ['http://pinterest.com/categories', 'http://pinterest.com/popular/'])
def _encode_request(self, request):
    """Serialize a request, preferring the newer ``Request.to_dict`` API
    and falling back to the legacy ``request_to_dict`` helper."""
    try:
        serialized = request.to_dict(spider=self.spider)
    except AttributeError:
        # Older Scrapy: Request has no to_dict method.
        serialized = request_to_dict(request, self.spider)
    return self.serializer.dumps(serialized)
def _encode_request(self, request):
    """Pickle *request* (as a serialized dict) with the highest protocol."""
    as_dict = request_to_dict(request, self.spider)
    return pickle.dumps(as_dict, protocol=-1)
def request_to_dict(self, request):
    '''
    Convert Request object to a dict.
    modified from scrapy.utils.reqser
    '''
    req_dict = {
        # urls should be safe (safe_string_url)
        # NOTE(review): .decode('ascii') implies request.url is bytes here,
        # i.e. Python 2-era Scrapy -- confirm before running on Python 3,
        # where Request.url is str and has no .decode.
        'url': request.url.decode('ascii'),
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
        # callback/errback are assumed to be a bound instance of the spider
        # NOTE(review): func_name is Python 2 only (__name__ on Python 3).
        'callback': None if request.callback is None else request.callback.func_name,
        'errback': None if request.errback is None else request.errback.func_name,
        # Full pickled form of the request dict (built via the module-level
        # request_to_dict helper, not this method) kept alongside the flat
        # fields -- presumably for lossless re-creation; confirm consumers.
        'request': pickle.dumps(request_to_dict(request, self.spider), protocol=-1),
    }
    return req_dict
def process_request(self, request, spider):
    """ Login if we are not logged in yet.

    NOTE(review): this is an inlineCallbacks-style generator (it yields a
    Deferred and exits via returnValue) -- confirm the decorator is applied
    where this middleware method is wired up.
    """
    # Requests produced by autologin itself, or explicitly opted out,
    # pass through untouched.
    if '_autologin' in request.meta or request.meta.get('skip_autologin'):
        returnValue(None)
    # Ensure we are logged in before letting the request proceed.
    yield self._ensure_login(request, spider)
    self.stats.set_value('autologin/logged_in', self.logged_in)
    if self.skipped:
        request.meta['autologin_active'] = False
        returnValue(None)
    elif self.logged_in:
        request.meta['autologin_active'] = True
        # Never follow a logout link -- it would destroy the session.
        logout_url = request.meta.get('autologin_logout_url', self.logout_url)
        if logout_url and logout_url in request.url:
            logger.debug('Ignoring logout request %s', request.url)
            raise IgnoreRequest
        # Save original request to be able to retry it in case of logout
        req_copy = request.replace(meta=deepcopy(request.meta))
        request.meta['_autologin'] = autologin_meta = {}
        try:
            autologin_meta['request'] = request_to_dict(req_copy, spider=spider)
        except ValueError:
            # Serialization failed, but it might be ok if we do not persist
            # requests, so store the request itself here.
            autologin_meta['request'] = req_copy
        # TODO - it should be possible to put auth cookies into the
        # cookiejar in process_response (but also check non-splash)
        if self.auth_cookies:
            request.cookies = self.auth_cookies
            autologin_meta['cookie_dict'] = {
                c['name']: c['value'] for c in self.auth_cookies}
def parse_request(request, spider, settings, testing=False, already_parsed=False):
    """Normalize a request into a plain dict for cassette storage/replay.

    NOTE(review): indentation reconstructed -- the source arrived flattened.
    The layout below assumes callback/headers/body/meta normalization runs
    only when the request was not already parsed; confirm against version
    control.
    """
    parsed_request = request
    if not already_parsed:
        parsed_request = request_to_dict(request, spider=spider)
        # Requests without an explicit callback default to spider.parse.
        if not parsed_request['callback']:
            parsed_request['callback'] = 'parse'
        parsed_request['headers'] = parse_headers(
            parsed_request['headers'], spider, settings)
        parsed_request['body'] = parsed_request['body'].decode('utf-8')
        # Recursively normalize every meta value.
        _meta = {}
        for key, value in parsed_request.get('meta').items():
            _meta[key] = parse_object(
                value, spider, testing=testing,
                already_parsed=already_parsed, settings=settings)
        parsed_request['meta'] = _meta
    # While replaying under test, drop fields configured to be ignored.
    skipped_fields = settings.get('AUTOUNIT_REQUEST_SKIPPED_FIELDS', default=[])
    if testing:
        for field in skipped_fields:
            parsed_request.pop(field)
    return parsed_request
def request_page_limit_reached(self, request, spider):
    """Return True once the per-(domain, crawlid) page budget is exhausted."""
    req_dict = request_to_dict(request, spider)
    # Key is built from the registered domain of the url plus the crawl id.
    extracted = self.extract(req_dict['url'])
    domain = "{d}.{s}".format(d=extracted.domain, s=extracted.suffix)
    crawl_id = req_dict['meta']['crawlid']
    composite_key = self.key_start + ':' + domain + ':' + crawl_id
    # First sighting of this domain/crawl pair: start the counter at zero.
    if not self.server.exists(composite_key):
        self.server.set(composite_key, 0)
    # Once the budget is spent, stop incrementing and keep reporting True.
    current_count = int(self.server.get(composite_key))
    if current_count >= self.page_limit:
        return True
    current_count = int(self.server.incr(composite_key))
    # Refresh the key's expiration on every increment.
    self.server.expire(composite_key, self.timeout)
    return current_count >= self.page_limit
def publish_links(self, requests_new, urls_other):
    """Serialize the new requests and publish them together with the
    other URLs; skip publishing entirely when both are empty."""
    serialized_requests = []
    for new_request in requests_new:
        try:
            serialized_requests.append(
                request_to_dict(new_request, self._spider))
        except ValueError as e:
            # Non-serializable request: log it and carry on with the rest.
            logger.error(
                u'Unable to serialize request: %(request)s - reason: %(reason)s',
                {
                    'request': new_request,
                    'reason': e
                },
                exc_info=True,
                extra={'spider': self._spider})
    # Nothing at all to publish.
    if not urls_other and not serialized_requests:
        return
    self._queue_links.publish({
        'requests_new': serialized_requests,
        'urls_other': urls_other,
    })
def publish_error(self, request, exception):
    """Publish the serialized request together with the exception class
    and message on the error queue."""
    payload = {
        'request': request_to_dict(request, self._spider),
        'cls': self._get_exception_class(exception),
        'msg': str(exception),
    }
    self._queue_errors.publish(payload)
def test_reference_callback_serialization(self):
    """Bound-method callbacks and errbacks round-trip by their names."""
    request = Request(
        "http://www.example.com",
        callback=self.spider.parse_item_reference,
        errback=self.spider.handle_error_reference,
    )
    self._assert_serializes_ok(request, spider=self.spider)
    serialized = request_to_dict(request, self.spider)
    self.assertEqual(serialized['callback'], 'parse_item_reference')
    self.assertEqual(serialized['errback'], 'handle_error_reference')
def _request_to_dict(cls, request, spider=None):
    """Serialize *request* to a dict, keeping only truthy fields."""
    full_dict = request_to_dict(request, spider)
    compact = {key: value for key, value in full_dict.items() if value}
    logger.debug(f"request_to_dict: {full_dict}")
    return compact
def _encode_request(self, request):
    """Encode a request object, zlib-compressing and base64-encoding a
    non-empty body; an empty bytes body becomes None."""
    encoded = request_to_dict(request, self.spider)
    if encoded.get("body"):
        compressed = zlib.compress(request.body)
        encoded["body"] = base64.urlsafe_b64encode(compressed).decode("utf-8")
    if encoded.get("body") == b"":
        encoded["body"] = None
    return encoded
def test_default_callback():
    """A request without a callback serializes with callback=None."""
    spider = MySpider()
    plain_request = scrapy.Request('http://example.com/')
    request_dict = request_to_dict(plain_request, spider)
    assert isinstance(request_dict, dict)
    assert request_dict['url'] == 'http://example.com/'
    assert request_dict['callback'] is None
def process_request(self, request: Request, spider: Spider) -> Optional[Request]:
    """Wrap *request* into a JSON POST to the fetch API endpoint, or pass
    it through (return None) when disabled / opted out / already wrapped."""
    if not self.enabled:
        return None
    try:
        crawlera_meta = request.meta[META_KEY]
    except KeyError:
        crawlera_meta = {}
    # Skip requests explicitly opted out, or ones we already wrapped.
    if crawlera_meta.get("skip") or crawlera_meta.get("original_request"):
        return None
    self._set_download_slot(request, spider)
    self.stats.inc_value("crawlera_fetch/request_count")
    self.stats.inc_value("crawlera_fetch/request_method_count/{}".format(
        request.method))
    shub_jobkey = os.environ.get("SHUB_JOBKEY")
    if shub_jobkey:
        self.default_args["job_id"] = shub_jobkey
    # assemble JSON payload
    original_body_text = request.body.decode(request.encoding)
    body = {"url": request.url, "body": original_body_text}
    if request.method != "GET":
        body["method"] = request.method
    body.update(self.default_args)
    # Per-request args from meta override the defaults.
    body.update(crawlera_meta.get("args") or {})
    body_json = json.dumps(body)
    # Keep the serialized original request (and a start timestamp) in meta
    # so the request can be restored downstream.
    additional_meta = {
        "original_request": request_to_dict(request, spider=spider),
        "timing": {
            "start_ts": time.time()
        },
    }
    crawlera_meta.update(additional_meta)
    additional_headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    if self.apikey:
        additional_headers["Authorization"] = self.auth_header
    request.headers.update(additional_headers)
    if shub_jobkey:
        request.headers["X-Crawlera-JobId"] = shub_jobkey
    # On Scrapy < 2.0, record the original url as a request flag so it is
    # still visible after the replace() below rewrites the url.
    if scrapy.version_info < (2, 0, 0):
        original_url_flag = "original url: {}".format(request.url)
        if original_url_flag not in request.flags:
            request.flags.append(original_url_flag)
    request.meta[META_KEY] = crawlera_meta
    return request.replace(url=self.url, method="POST", body=body_json)
def _encode_request(self, request):
    """Encode a request object; parser requests are encoded against the
    dedicated parse spider."""
    target_spider = self.spider
    if request.meta.get('parser_request'):
        target_spider = self.spider.parse_spider
        self.__encode_parser_request__(request, target_spider)
    return self.serializer.dumps(request_to_dict(request, target_spider))
def process_exception(self, request, exception, spider):
    """Report a download exception, attaching the pickled serialized request."""
    culprit = 'SentryDownloaderMiddleware/%s [spider: %s]' % (
        type(exception), spider.name)
    extra = {
        'culprit': culprit,
        'stack': True,
        'data': {
            'request': cPickle.dumps(request_to_dict(request, spider)),
            'exception': exception,
            'spider': spider,
        },
    }
    self.logger.error('SentryDownloaderMiddleware %s [%s]' % (exception, spider.name),
                      exc_info=True, extra=extra)
def queue_url(redis, priority: int = 0, spider: str = "tordirectory", *urls: str):
    """Push each URL onto the spider's Redis sorted-set queue.

    The score is the negated priority, so higher priorities sort first.
    """
    key = f"zqueue:{spider}"
    for target in urls:
        req = Request(target, dont_filter=True, priority=priority)
        serialized = pickle.dumps(request_to_dict(req))
        redis.execute_command('ZADD', key, -priority, serialized)
def response_to_dict(response):
    """Convert a Response (and its request) into a plain serializable dict."""
    return {
        'url': to_unicode(response.url),
        'status': int(response.status),
        'headers': dict(response.headers),
        'body': response.body,
        'flags': list(response.flags),
        'request': request_to_dict(response.request),
    }
def enqueue_start_request(self, request):
    """Serialize a start request and push it onto the request queue, if any."""
    if self.requestqueue is None:
        return
    try:
        serialized = request_to_dict(request, self)
        self.requestqueue.push(serialized)
        self.stats.inc_value('startrequests/enqueued', spider=self)
    except ValueError as e:
        # request_to_dict raises ValueError for non-serializable requests
        self.log('Non-serializable start request: %s (reason: %s)' % (request, e),
                 level=log.ERROR)
def insert_seed(self, url, catstr):
    """Build a listing seed request for *url* and push it onto the Redis queue."""
    seed = Request(url)
    seed.meta['page'] = 'listing'
    seed.meta['count'] = 0
    seed.meta['catstr'] = catstr
    serialized = request_to_dict(seed)
    # Callback is stored by name; resolved back to a method when dequeued.
    serialized['callback'] = 'parse'
    self.redis_server.rpush(self.queue_key, marshal.dumps(serialized))
def response_to_dict(response):
    """Serialize a Response, its class path, and its request into a dict."""
    cls_path = '{r.__module__}.{r.__class__.__name__}'.format(r=response)
    return {
        'cls': cls_path,
        'encoding': response.encoding,
        'request': request_to_dict(response.request),
        'url': response.url,
        'status': response.status,
        'headers': dict(response.headers),
        'meta': response.meta,
        'body': response.body,
    }
def _dqpush(self, request):
    """Push *request* onto the disk queue.

    Returns True on success, None when there is no disk queue or the
    request cannot be serialized.
    """
    if self.dqs is None:
        return
    try:
        serialized = request_to_dict(request, self.spider)
        self.dqs.push(serialized, request.priority)
    except ValueError:
        # non serializable request
        return
    stats.inc_value('scheduler/disk_enqueued', spider=self.spider)
    return True
def _request_to_dict(self, request):
    """Serialize *request*, normalizing its callback, headers and meta."""
    d = request_to_dict(request, spider=self.spider)
    if not d['callback']:
        # default callback when none was set
        d['callback'] = 'parse'
    elif isinstance(self.spider, CrawlSpider):
        # CrawlSpider requests carry the rule index that produced them.
        rule = request.meta.get('rule')
        if rule is not None:
            d['callback'] = self.spider.rules[rule].callback
    self._clean_headers(d['headers'])
    d['meta'] = self._parse_meta(d)
    return d
def _dqpush(self, request):
    """Push *request* onto the disk queue with negated priority.

    Logs (when ``self.logunser`` is set) and drops requests that cannot
    be serialized.
    """
    if self.dqs is None:
        return
    try:
        reqd = request_to_dict(request, self.spider)
        self.dqs.push(reqd, -request.priority)
    except ValueError as e:  # non serializable request
        # Fixed: the original used "except ValueError, e", Python 2-only
        # syntax that is a SyntaxError on Python 3.
        if self.logunser:
            log.msg("Unable to serialize request: %s - reason: %s" %
                    (request, str(e)), level=log.ERROR, spider=self.spider)
        return
def test_generic_form_requests(self):
    """End-to-end check of the generic form annotation for the "ebay" spider.

    Feeds a saved copy of eBay's advanced-search page to the start
    request's callback and asserts that the generated requests serialize
    to the expected dicts: one search request per iterated value of the
    ``_in_kw`` form field (1-4), plus a final 'parse' request for the
    form page itself.
    """
    name = "ebay"
    spider = self.smanager.create(name)
    generic_form_request = list(spider.start_requests())[0]
    # Replay the stored HTML fixture as the response to the form request.
    response = UTF8HtmlResponse(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
        body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
    response.request = generic_form_request
    request_list = [request_to_dict(req, spider)
                    for req in generic_form_request.callback(response)]
    expected = [
        {'_encoding': 'utf-8', 'cookies': {}, 'callback': 'after_form_page',
         'headers': {}, 'meta': {},
         'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=1&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50',
         'errback': None, 'dont_filter': True, 'priority': 0, 'method': 'GET',
         'body': b''},
        {'_encoding': 'utf-8', 'cookies': {}, 'callback': 'after_form_page',
         'headers': {}, 'meta': {},
         'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=2&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50',
         'errback': None, 'dont_filter': True, 'priority': 0, 'method': 'GET',
         'body': b''},
        {'_encoding': 'utf-8', 'cookies': {}, 'callback': 'after_form_page',
         'headers': {}, 'meta': {},
         'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=3&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50',
         'errback': None, 'dont_filter': True, 'priority': 0, 'method': 'GET',
         'body': b''},
        {'_encoding': 'utf-8', 'cookies': {}, 'callback': 'after_form_page',
         'headers': {}, 'meta': {},
         'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=4&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50',
         'errback': None, 'dont_filter': True, 'priority': 0, 'method': 'GET',
         'body': b''},
        {'_encoding': 'utf-8', 'cookies': {}, 'callback': 'parse',
         'headers': {}, 'meta': {},
         'url': 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
         'errback': None, 'dont_filter': True, 'priority': 0, 'method': 'GET',
         'body': b''}]
    self.assertEqual(request_list, expected)
def response_to_dict(response, spider, include_request=True, **kwargs):
    """Returns a dict based on a response from a spider"""
    result = {
        'time': time.time(),
        'url': response.url,
        'headers': dict(response.headers),
        'body': response.body,
    }
    if include_request:
        result['request'] = request_to_dict(response.request, spider)
    return result
def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
    """Build a stable JSON fingerprint from the request url/method/body."""
    wanted = {"url", "method", "body"}
    fp_data = {"SCRAPY_FINGERPRINT": request_fingerprint(request)}
    for key, value in request_to_dict(request).items():
        if key in wanted:
            fp_data[key] = str(value)
    # sort_keys makes the output independent of insertion order
    return json.dumps(fp_data, ensure_ascii=False, sort_keys=True)
def test_generic_form_requests_with_spider_args(self):
    """Generic form test for the "ebay3" spider where the search term comes
    from a spider argument (``search_string='Cars'``) instead of the
    annotation itself.

    As in the other form tests, the callback should yield one search
    request per iterated ``_in_kw`` value (1-4) plus a final 'parse'
    request for the form page.
    """
    name = "ebay3"
    args = {'search_string': 'Cars'}
    spider = self.smanager.create(name, **args)
    generic_form_request = list(spider.start_requests())[0]
    # Replay the stored HTML fixture as the response to the form request.
    response = HtmlResponse(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
        body=open(join(_PATH, "data", "ebay_advanced_search.html")).read())
    response.request = generic_form_request
    request_list = [request_to_dict(req, spider)
                    for req in generic_form_request.callback(response)]
    expected = [
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
         'dont_filter': True, 'priority': 0, 'callback': 'after_form_page',
         'method': 'GET', 'errback': None},
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
         'dont_filter': True, 'priority': 0, 'callback': 'after_form_page',
         'method': 'GET', 'errback': None},
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
         'dont_filter': True, 'priority': 0, 'callback': 'after_form_page',
         'method': 'GET', 'errback': None},
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=',
         'dont_filter': True, 'priority': 0, 'callback': 'after_form_page',
         'method': 'GET', 'errback': None},
        {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {},
         'url': u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc',
         'dont_filter': True, 'priority': 0, 'callback': 'parse',
         'method': 'GET', 'errback': None}]
    self.assertEqual(request_list, expected)
def response_to_dict(response):
    """Convert Response object to a dict"""
    return {
        'url': str_to_unicode(response.url, errors='ignore'),
        'headers': dict(response.headers),
        'body': str_to_unicode(response.body, errors='ignore'),
        'encoding': response.encoding,
        'status': response.status,
        'request': request_to_dict(response.request),
        'meta': response.request.meta,
    }
def enqueue_request(self, request):
    """Store *request* in MongoDB unless the dupe filter has already seen it."""
    if not request.dont_filter and self.df.request_seen(request):
        self.df.log(request, self.spider)
        return
    if self.stats:
        self.stats.inc_value('scheduler/enqueued/mongodb', spider=self.spider)
    document = {
        'data': request_to_dict(request, self.spider),
        'created': datetime.datetime.utcnow(),
    }
    self.collection.insert(document)
def response_to_dict(response, spider, include_request=True, **kwargs):
    """Returns a dict based on a response from a spider"""
    payload = {
        'time': time.time(),
        'status': response.status,
        'url': response.url,
        'headers': dict(response.headers),
        'body': response.body,
    }
    if include_request:
        payload['request'] = request_to_dict(response.request, spider)
    return payload
def enqueue_request(self, request):
    """Insert *request* into the Mongo collection, skipping url+method duplicates."""
    if not request.dont_filter:
        existing = self.collection.find_one(
            {'url': request.url, 'method': request.method})
        if existing is not None:
            return
    try:
        self.collection.insert(request_to_dict(request, self.spider))
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)
    except ValueError as e:  # non serializable request
        if self.logunser:
            log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s",
                    level=log.ERROR, spider=self.spider,
                    request=request, reason=e)
        return
def dump_url(self, url, spider, rec_url):
    """Serialize a phone-flow request for *url* and push it onto its flow queue."""
    flowid = "phoneflow-" + spider
    req = Request(url)
    req.meta['flow'] = flowid
    req.meta['rule'] = 0
    # client data forwarded alongside the request
    req.meta['cd'] = {'rec_url': rec_url, "spider": spider}
    payload = json.dumps(request_to_dict(req))
    Flow(self.server, flowid).q_push(payload)
def _dqpush(self, request):
    """Push onto the disk queue; True on success, None if disabled or unserializable."""
    if self.dqs is None:
        return
    try:
        serialized = request_to_dict(request, self.spider)
        self.dqs.push(serialized, -request.priority)
    except ValueError as e:  # non serializable request
        if self.logunser:
            log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s",
                    level=log.ERROR, spider=self.spider,
                    request=request, reason=e)
        return
    else:
        return True
def _dqpush(self, request):
    """Push onto the disk queue; True on success, None if disabled or unserializable."""
    if self.dqs is None:
        return
    try:
        payload = request_to_dict(request, self.spider)
        self.dqs.push(payload, -request.priority)
    except ValueError as e:  # non serializable request
        if self.logunser:
            logger.error("Unable to serialize request: %(request)s - reason: %(reason)s",
                         {'request': request, 'reason': e},
                         exc_info=True, extra={'spider': self.spider})
        return
    else:
        return True
def check_to_resch(self, flow, start, end):
    """check a stopped flow to see if we need to reschedule it.

    start and end are the last run's start and finish time.
    """
    resch_period = int(flow.get("interval", 0))
    if time.time() - start > resch_period:
        if 'seeds' not in flow:
            self.logger.info("flow %s: seed url not found", flow.name)
            # Fixed: without this return the loop below immediately raised
            # KeyError on the missing 'seeds' key, right after logging its
            # absence.
            return
        for url in flow['seeds']:
            req = Request(url)
            req.meta['flow'] = flow.name
            data = json.dumps(request_to_dict(req))
            flow.q_push(data)
        flow['state'] = 'ready'
        self.logger.info("flow %s: seed inserted, ready-to-go", flow.name)
def _dqpush(self, request):
    """Push onto the disk queue, logging only the first unserializable request."""
    if self.dqs is None:
        return
    try:
        entry = request_to_dict(request, self.spider)
        self.dqs.push(entry, -request.priority)
    except ValueError as e:  # non serializable request
        if self.logunser:
            msg = ("Unable to serialize request: %(request)s - reason:"
                   " %(reason)s - no more unserializable requests will be"
                   " logged (stats being collected)")
            logger.warning(msg, {'request': request, 'reason': e},
                           exc_info=True, extra={'spider': self.spider})
            # Only the first failure is logged; later ones just count.
            self.logunser = False
        self.stats.inc_value('scheduler/unserializable', spider=self.spider)
        return
    else:
        return True
def _encode_request(self, request):
    """Encode a request object"""
    as_dict = request_to_dict(request, self.spider_)
    pickled = pickle.dumps(as_dict, protocol=1)
    return self._request_to_crawldoc(as_dict, pickled)
def enqueue_request(self, request):
    """Marshal *request* and hand it to the queue client."""
    payload = marshal.dumps(request_to_dict(request, self.spider))
    self.client.push(payload)
def _assert_serializes_ok(self, request, spider=None):
    """Round-trip *request* through its dict form and check nothing changed."""
    as_dict = request_to_dict(request, spider=spider)
    restored = request_from_dict(as_dict, spider=spider)
    self._assert_same_request(request, restored)
def _encode_request(self, request):
    """Encode a request object"""
    full = request_to_dict(request, self.spider)
    # compact the dict before pickling
    compact = RequestDeCompress.reduce_request_dict(full)
    return pickle.dumps(compact, protocol=1)
def _encode_request(self, request):
    """Pickle the request (highest protocol) for queue storage."""
    as_dict = request_to_dict(request, self.spider)
    return pickle.dumps(as_dict, protocol=-1)
def push(self, request, encoded=False):
    """Push a request (or an already-encoded payload) onto the Redis list."""
    if encoded:
        data = request
    else:
        data = marshal.dumps(request_to_dict(request, self.spider))
    self.server.rpush(self.key, data)
def _eqpush(self, request):
    """JSON-encode *request* and push it onto the flow queue."""
    payload = json.dumps(request_to_dict(request, self.spider))
    self.flowmodel.q_push(payload)
def _encode_request(self, request):
    """Encode a request object"""
    return self.serializer.dumps(request_to_dict(request, self.spider))
def push(self, request, priority=0):
    """Push onto the priority queue, serializing first when configured."""
    item = request_to_dict(request, self.spider) if self.serialize else request
    super(ScrapyPriorityQueue, self).push(item, priority)
def enqueue_request(self, request):
    """Queue *request* unless filtered as a duplicate; return True if queued."""
    if not request.dont_filter and self.dupefilter.request_seen(request):
        return False
    self.queue.push(request_to_dict(request, self.spider), request.priority)
    return True
def _encode_request(self, request):
    """Encode a request object"""
    payload = request_to_dict(request, self.spider)
    return pickle.dumps(payload, protocol=-1)
def push(self, request):
    # Serialize the request and add it to the Redis sorted set, scored by
    # the negated priority so higher-priority requests sort first.
    data = marshal.dumps(request_to_dict(request, self.spider))
    pairs = {data: -request.priority}
    # NOTE(review): `**pairs` requires string keys, but marshal.dumps
    # returns bytes — on Python 3 this raises TypeError, and redis-py >= 3
    # changed the signature to zadd(name, mapping). This looks
    # Python 2 / redis-py 2 specific; confirm before porting.
    self.redis.zadd(self.key, **pairs)