Exemple #1
0
    def test_generic_form_requests_with_file_field(self):
        """Drive the ``ebay2`` spider through its file-backed generic form flow.

        Three request/response rounds are simulated, and after each one the
        requests produced by the callback are serialized with
        ``request_to_dict`` and compared with the expected dictionaries:

        1. the start request for the parameter file yields a second request
           for the same file (callback ``parse_field_url_page``);
        2. that request yields the request for the search form page
           (callback ``parse_form_page``);
        3. the form page yields one search request per combination of the
           file values and iterated field values, followed by a final
           ``parse`` request for the form URL.
        """
        params_url = 'file://tmp/test_params.txt'
        # The doubled scheme is what the assertions of this test expect.
        form_url = 'http://http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
        form_xpath = "//form[@name='adv_search_from']"
        file_values = ['Cars', 'Boats']

        def read_fixture(filename):
            # Use a context manager so the fixture file handle is closed
            # instead of being leaked until garbage collection.
            with open(join(_PATH, "data", filename)) as fixture:
                return fixture.read()

        def expected_request(url, callback, meta):
            # Serialized shape shared by every request in this test; only the
            # URL, the callback name and the meta dict vary.
            return {
                'body': '',
                '_encoding': 'utf-8',
                'cookies': {},
                'meta': meta,
                'headers': {},
                'url': url,
                'dont_filter': True,
                'priority': 0,
                'callback': callback,
                'method': 'GET',
                'errback': None,
            }

        name = "ebay2"
        spider = self.smanager.create(name)
        generic_form_request = list(spider.start_requests())[0]

        # Round 1: the parameter file is requested for the first field.
        self.assertEqual(generic_form_request.url, params_url)
        response = HtmlResponse(url=params_url, body=read_fixture("test_params.txt"))
        response.request = generic_form_request
        requests = list(generic_form_request.callback(response))
        request_list = [request_to_dict(req, spider) for req in requests]
        # NOTE(review): this meta dict was reconstructed (a URL-credential
        # masker had mangled the original literal); the reconstruction follows
        # the round 2 expectation below - confirm against the spider fixture.
        expected = [expected_request(params_url, 'parse_field_url_page', {
            'xpath': form_xpath,
            'form_url': form_url,
            'type': 'form',
            'field_index': 1,
            'fields': [
                {'xpath': ".//*[@name='_nkw']", 'file_values': ['Cars', 'Boats'],
                 'type': 'inurl', 'value': params_url},
                {'type': 'inurl', 'name': '_nkw2', 'value': params_url},
                {'xpath': ".//*[@name='_in_kw']", 'type': 'iterate'},
            ],
        })]
        self.assertEqual(request_list, expected)

        # Round 2: the parameter file is parsed for the second field and the
        # form page itself is requested.
        generic_form_request = requests[0]
        self.assertEqual(generic_form_request.url, params_url)
        response = HtmlResponse(url=params_url, body=read_fixture("test_params.txt"))
        response.request = generic_form_request

        requests = list(generic_form_request.callback(response))
        request_list = [request_to_dict(req, spider) for req in requests]
        expected = [expected_request(form_url, 'parse_form_page', {
            'xpath': form_xpath,
            'fields': [
                {'xpath': ".//*[@name='_nkw']", 'file_values': ['Cars', 'Boats'],
                 'type': 'inurl', 'value': params_url},
                {'file_values': ['Cars', 'Boats'], 'type': 'inurl',
                 'name': '_nkw2', 'value': params_url},
                {'xpath': ".//*[@name='_in_kw']", 'type': 'iterate'},
            ],
            'type': 'form',
            'field_index': 1,
        })]
        self.assertEqual(request_list, expected)

        # Round 3: the form page is expanded into the search requests.
        generic_form_request = requests[0]
        self.assertEqual(generic_form_request.url, form_url)
        response = HtmlResponse(url=form_url, body=read_fixture("ebay_advanced_search.html"))
        response.request = generic_form_request
        request_list = [request_to_dict(req, spider)
                        for req in generic_form_request.callback(response)]
        search_url = (
            'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901'
            '&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo='
            '&_okw=&_nkw2={nkw2}&_fsradio=%26LH_SpecificSeller%3D1&_udhi='
            '&_in_kw={in_kw}&_nkw={nkw}&_sacat=0&_oexkw=&_dmd=1&_saslop=1'
            '&_samilow='
        )
        # Expected order: _nkw varies slowest, then _nkw2, then _in_kw (1-4).
        expected = [
            expected_request(
                search_url.format(nkw=nkw, nkw2=nkw2, in_kw=in_kw),
                'after_form_page', {})
            for nkw in file_values
            for nkw2 in file_values
            for in_kw in range(1, 5)
        ]
        expected.append(expected_request(form_url, 'parse', {}))
        self.assertEqual(request_list, expected)
Exemple #2
0
def test_inline_callback():
    """Serializing a request whose callback is not a spider method fails.

    The callback comes from ``callback_for(FakeItemPage)``; ``request_to_dict``
    is expected to raise ``ValueError`` with a message naming the callback and
    the spider.
    """
    spider = MySpider()
    cb = callback_for(FakeItemPage)
    expected_message = f'Function {cb} is not a method of: {spider}'
    request = scrapy.Request('http://example.com/', callback=cb)

    with pytest.raises(ValueError) as raised:
        request_to_dict(request, spider)

    assert str(raised.value) == expected_message
def test_instance_method_callback():
    """Requests whose callback is a spider instance method serialize by name.

    Both ``parse_item`` and ``parse_web`` are checked: the serialized dict
    must carry the request URL and the callback's method name.
    """
    spider = MySpider()
    for method_name in ('parse_item', 'parse_web'):
        bound_callback = getattr(spider, method_name)
        request = scrapy.Request('http://example.com/', callback=bound_callback)
        serialized = request_to_dict(request, spider)
        assert isinstance(serialized, dict)
        assert serialized['url'] == 'http://example.com/'
        assert serialized['callback'] == method_name
Exemple #4
0
    def test_generic_form_requests_with_file_field(self):
        """Drive the ``ebay2`` spider through its file-backed generic form flow.

        Three request/response rounds are simulated, and after each one the
        requests produced by the callback are serialized with
        ``request_to_dict`` and compared with the expected dictionaries:

        1. the start request for the parameter file yields a second request
           for the same file (callback ``parse_field_url_page``);
        2. that request yields the request for the search form page
           (callback ``parse_form_page``);
        3. the form page yields one search request per combination of the
           file values and iterated field values, followed by a final
           ``parse`` request for the form URL.
        """
        params_url = 'file://tmp/test_params.txt'
        form_url = 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc'
        form_xpath = "//form[@name='adv_search_from']"
        file_values = ['Cars', 'Boats']

        def read_fixture(filename):
            # Use a context manager so the fixture file handle is closed
            # instead of being leaked until garbage collection.
            with open(join(_PATH, "data", filename)) as fixture:
                return fixture.read()

        def expected_request(url, callback, meta):
            # Serialized shape shared by every request in this test; only the
            # URL, the callback name and the meta dict vary.
            return {
                'body': b'',
                '_encoding': 'utf-8',
                'cookies': {},
                'meta': meta,
                'headers': {},
                'url': url,
                'dont_filter': True,
                'priority': 0,
                'callback': callback,
                'method': 'GET',
                'errback': None,
            }

        name = "ebay2"
        spider = self.smanager.create(name)
        generic_form_request = list(spider.start_requests())[0]

        # Round 1: the parameter file is requested for the first field.
        self.assertEqual(generic_form_request.url, params_url)
        response = UTF8HtmlResponse(url=params_url, body=read_fixture("test_params.txt"))
        response.request = generic_form_request
        requests = list(generic_form_request.callback(response))
        request_list = [request_to_dict(req, spider) for req in requests]
        expected = [expected_request(params_url, 'parse_field_url_page', {
            'xpath': form_xpath,
            'form_url': form_url,
            'type': 'form',
            'field_index': 1,
            'fields': [
                {'xpath': ".//*[@name='_nkw']", 'file_values': ['Cars', 'Boats'],
                 'type': 'inurl', 'value': params_url},
                {'type': 'inurl', 'name': '_nkw2', 'value': params_url},
                {'xpath': ".//*[@name='_in_kw']", 'type': 'iterate'},
            ],
        })]
        self.assertEqual(request_list, expected)

        # Round 2: the parameter file is parsed for the second field and the
        # form page itself is requested.
        generic_form_request = requests[0]
        self.assertEqual(generic_form_request.url, params_url)
        response = UTF8HtmlResponse(url=params_url, body=read_fixture("test_params.txt"))
        response.request = generic_form_request

        requests = list(generic_form_request.callback(response))
        request_list = [request_to_dict(req, spider) for req in requests]
        expected = [expected_request(form_url, 'parse_form_page', {
            'fields': [
                {'type': 'inurl', 'file_values': ['Cars', 'Boats'],
                 'xpath': ".//*[@name='_nkw']", 'value': params_url},
                {'name': '_nkw2', 'file_values': ['Cars', 'Boats'],
                 'type': 'inurl', 'value': params_url},
                {'xpath': ".//*[@name='_in_kw']", 'type': 'iterate'},
            ],
            'type': 'form',
            'xpath': form_xpath,
            'field_index': 1,
        })]
        self.assertEqual(request_list, expected)

        # Round 3: the form page is expanded into the search requests.
        generic_form_request = requests[0]
        self.assertEqual(generic_form_request.url, form_url)
        response = UTF8HtmlResponse(url=form_url, body=read_fixture("ebay_advanced_search.html"))
        response.request = generic_form_request
        request_list = [request_to_dict(req, spider)
                        for req in generic_form_request.callback(response)]
        search_url = (
            'http://www.ebay.com/sch/i.html?_nkw={nkw}&_in_kw={in_kw}&_ex_kw='
            '&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1'
            '&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1'
            '&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12'
            '&_dmd=1&_ipg=50&_nkw2={nkw2}'
        )
        # Expected order: _nkw varies slowest, then _nkw2, then _in_kw (1-4).
        expected = [
            expected_request(
                search_url.format(nkw=nkw, nkw2=nkw2, in_kw=in_kw),
                'after_form_page', {})
            for nkw in file_values
            for nkw2 in file_values
            for in_kw in range(1, 5)
        ]
        expected.append(expected_request(form_url, 'parse', {}))
        self.assertEqual(request_list, expected)
 def _dqpush(self, request):
     #self.dqs为空,也就是磁盘队列没有实例化,直接返回。
     if self.dqs is None:
         return
     try:
         #把request请求放入到对应的优先级队列中。
         #request_to_dict方法可以把request实例,转换成一个字典:
         #d = {'url': request.url.decode('ascii'), # urls should be safe (safe_string_url)
         #'callback': cb,
         #'errback': eb,
         #'method': request.method,
         #'headers': dict(request.headers),
         #'body': request.body,
         #'cookies': request.cookies,
         #'meta': request.meta,
         #'_encoding': request._encoding,
         #'priority': request.priority,
         #'dont_filter': request.dont_filter,}
         reqd = request_to_dict(request, self.spider)
         #字典根据优先级入列。
         self.dqs.push(reqd, -request.priority)
     except ValueError as e:  # non serializable request
         if self.logunser:
             logger.error(
                 "Unable to serialize request: %(request)s - reason: %(reason)s",
                 {
                     'request': request,
                     'reason': e
                 },
                 exc_info=True,
                 extra={'spider': self.spider})
         return
     else:
         #如果没有错误,则返回真。
         return True
Exemple #6
0
 def _dqpush(self, request):
     """
     :param request:
     :return:是否需要磁盘队列去重
     """
     if self.dqs is None:
         return
     try:
         reqd = request_to_dict(request, self.spider)
         self.dqs.push(reqd, -request.priority)
     except ValueError as e:  # non serializable request
         if self.logunser:
             msg = ("Unable to serialize request: %(request)s - reason:"
                    " %(reason)s - no more unserializable requests will be"
                    " logged (stats being collected)")
             logger.warning(msg, {
                 'request': request,
                 'reason': e
             },
                            exc_info=True,
                            extra={'spider': self.spider})
             self.logunser = False
         self.stats.inc_value('scheduler/unserializable',
                              spider=self.spider)
         return
     else:
         return True
Exemple #7
0
    def enqueue_request(self, request):
        """Serialize *request* and store it in ``self.collection``.

        The stored document holds the ``request_to_dict`` output under
        ``'data'`` and the current UTC time under ``'created'``.
        """
        # The body is indented with spaces only: mixing tabs under a
        # space-indented ``def`` is rejected by Python 3 with TabError.
        data = request_to_dict(request, self.spider)

        # NOTE(review): if ``self.collection`` is a pymongo collection,
        # ``insert`` is a legacy method - confirm the driver version in use.
        self.collection.insert({
            'data': data,
            'created': datetime.datetime.utcnow()
        })
Exemple #8
0
    def test_login_requests(self):
        """Check the login form request built by the ``pinterest.com`` spider.

        The spider's first start request is answered with the stored login
        page; the resulting form request must serialize to the expected POST
        dictionary. A minimal post-login page is then fed to ``after_login``
        and the URLs of the produced requests are checked.
        """
        name = "pinterest.com"
        spider = self.smanager.create(name)
        login_request = list(spider.start_requests())[0]

        # Read the fixture through a context manager so the file handle is
        # closed instead of being leaked.
        with open(join(_PATH, "data", "pinterest.html")) as fixture:
            login_page = fixture.read()
        response = HtmlResponse(url="https://pinterest.com/login/", body=login_page)
        response.request = login_request
        form_request = login_request.callback(response)
        expected = {'_encoding': 'utf-8',
            'body': 'email=test&password=testpass&csrfmiddlewaretoken=nLZy3NMzhTswZvweHJ4KVmq9UjzaZGn3&_ch=ecnwmar2',
            'callback': 'after_login',
            'cookies': {},
            'dont_filter': True,
            'errback': None,
            'headers': {'Content-Type': ['application/x-www-form-urlencoded']},
            'meta': {},
            'method': 'POST',
            'priority': 0,
            'url': u'https://pinterest.com/login/?next=%2F'}

        self.assertEqual(request_to_dict(form_request, spider), expected)

        # simulate a simple response to login post from which extract a link
        response = HtmlResponse(url="http://pinterest.com/", body="<html><body><a href='http://pinterest.com/categories'></body></html>")
        result = list(spider.after_login(response))
        self.assertEqual([r.url for r in result], ['http://pinterest.com/categories', 'http://pinterest.com/popular/'])
Exemple #9
0
 def _encode_request(self, request):
     """Encode a request object"""
     try:
         obj = request.to_dict(spider=self.spider)
     except AttributeError:
         obj = request_to_dict(request, self.spider)
     return self.serializer.dumps(obj)
Exemple #10
0
 def _encode_request(self, request):
     """Pickle the dict form of *request*.

     :param request: the request to encode
     :return: ``pickle.dumps`` output (``protocol=-1``) for the
         ``request_to_dict`` representation of the request
     """
     as_dict = request_to_dict(request, self.spider)
     return pickle.dumps(as_dict, protocol=-1)
Exemple #11
0
 def request_to_dict(self, request):
     '''
     Convert Request object to a dict.
     modified from scrapy.utils.reqser

     Besides the plain request fields, the dict carries a ``'request'`` entry
     holding a pickle of the ``request_to_dict(request, self.spider)`` result.
     The bare name inside a method body does not refer to this method, so
     that call goes to whatever ``request_to_dict`` is in the enclosing
     module scope (presumably the scrapy helper - confirm the import).
     '''
     # urls should be safe (safe_string_url); only byte strings need decoding,
     # text URLs are passed through unchanged.
     url = request.url
     if isinstance(url, bytes):
         url = url.decode('ascii')

     # callback/errback are assumed to be a bound instance of the spider.
     # ``__name__`` is used instead of ``func_name`` because the latter only
     # exists on Python 2.
     callback = request.callback
     errback = request.errback

     req_dict = {
         'url': url,
         'method': request.method,
         'headers': dict(request.headers),
         'body': request.body,
         'cookies': request.cookies,
         'meta': request.meta,
         '_encoding': request._encoding,
         'priority': request.priority,
         'dont_filter': request.dont_filter,
         'callback': None if callback is None else callback.__name__,
         'errback': None if errback is None else errback.__name__,
         'request':
         pickle.dumps(request_to_dict(request, self.spider), protocol=-1),
     }
     return req_dict
Exemple #12
0
 def process_request(self, request, spider):
     """Make sure we are logged in before the request goes out.

     Requests that already carry ``_autologin`` meta, or that set
     ``skip_autologin``, are left untouched. Otherwise login is ensured and,
     when logged in, a copy of the request is stored under
     ``request.meta['_autologin']`` and the auth cookies are attached.
     Requests whose URL contains the logout URL raise ``IgnoreRequest``.
     """
     if '_autologin' in request.meta or request.meta.get('skip_autologin'):
         returnValue(None)

     yield self._ensure_login(request, spider)
     self.stats.set_value('autologin/logged_in', self.logged_in)

     if self.skipped:
         request.meta['autologin_active'] = False
         returnValue(None)
     if not self.logged_in:
         return

     request.meta['autologin_active'] = True
     logout_url = request.meta.get('autologin_logout_url', self.logout_url)
     if logout_url and logout_url in request.url:
         logger.debug('Ignoring logout request %s', request.url)
         raise IgnoreRequest

     # Keep the original request around so it can be retried after a logout.
     req_copy = request.replace(meta=deepcopy(request.meta))
     autologin_meta = {}
     request.meta['_autologin'] = autologin_meta
     try:
         stored_request = request_to_dict(req_copy, spider=spider)
     except ValueError:
         # Serialization failed; that may be fine when requests are not
         # persisted, so fall back to keeping the request object itself.
         stored_request = req_copy
     autologin_meta['request'] = stored_request

     # TODO - it should be possible to put auth cookies into the
     # cookiejar in process_response (but also check non-splash)
     if self.auth_cookies:
         request.cookies = self.auth_cookies
         autologin_meta['cookie_dict'] = dict(
             (cookie['name'], cookie['value'])
             for cookie in self.auth_cookies)
Exemple #13
0
def parse_request(request,
                  spider,
                  settings,
                  testing=False,
                  already_parsed=False):
    """Return the dict form of *request*, normalized for comparison.

    When ``already_parsed`` is false the request is converted with
    ``request_to_dict``; a missing callback defaults to ``'parse'``, headers
    go through ``parse_headers``, the body is decoded as UTF-8 and every meta
    value goes through ``parse_object``. When ``already_parsed`` is true,
    *request* is used as given.

    When ``testing`` is true, the fields named by the
    ``AUTOUNIT_REQUEST_SKIPPED_FIELDS`` setting are removed from the dict in
    place. A configured field that is not present is ignored rather than
    raising ``KeyError``.
    """
    parsed_request = request
    if not already_parsed:
        parsed_request = request_to_dict(request, spider=spider)
        if not parsed_request['callback']:
            parsed_request['callback'] = 'parse'

        parsed_request['headers'] = parse_headers(parsed_request['headers'],
                                                  spider, settings)

        parsed_request['body'] = parsed_request['body'].decode('utf-8')

        parsed_request['meta'] = {
            key: parse_object(value,
                              spider,
                              testing=testing,
                              already_parsed=already_parsed,
                              settings=settings)
            for key, value in parsed_request.get('meta').items()
        }

    skipped_fields = settings.get('AUTOUNIT_REQUEST_SKIPPED_FIELDS',
                                  default=[])
    if testing:
        for field in skipped_fields:
            # Tolerate configured fields that this request does not have.
            parsed_request.pop(field, None)

    return parsed_request
Exemple #14
0
    def request_page_limit_reached(self, request, spider):
        """Report whether the page limit for this request's crawl is reached.

        A counter is kept on ``self.server`` under a key built from
        ``self.key_start``, the request's domain and its ``crawlid`` meta
        value. If the stored count already meets ``self.page_limit`` the
        method returns ``True`` without incrementing; otherwise the counter is
        incremented, its expiry is refreshed to ``self.timeout`` and the new
        count is compared with the limit.
        """
        serialized = request_to_dict(request, spider)

        # Domain part of the key: "<domain>.<suffix>" of the request URL.
        extracted = self.extract(serialized['url'])
        domain = "{d}.{s}".format(d=extracted.domain, s=extracted.suffix)

        crawl_id = serialized['meta']['crawlid']
        composite_key = ':'.join((self.key_start, domain, crawl_id))

        # Start the counter at zero the first time this key is seen.
        if not self.server.exists(composite_key):
            self.server.set(composite_key, 0)

        # Do not keep incrementing once the limit has been reached.
        if int(self.server.get(composite_key)) >= self.page_limit:
            return True

        updated_count = int(self.server.incr(composite_key))
        self.server.expire(composite_key, self.timeout)

        return updated_count >= self.page_limit
Exemple #15
0
    def publish_links(self, requests_new, urls_other):
        """Publish serialized new requests and other URLs to the links queue.

        Each request in *requests_new* is converted with ``request_to_dict``;
        requests that raise ``ValueError`` are logged and left out. Nothing
        is published when both the serialized list and *urls_other* are
        empty.
        """
        serialized_requests = []
        for candidate in requests_new:
            try:
                serialized_requests.append(
                    request_to_dict(candidate, self._spider))
            except ValueError as err:
                logger.error(
                    u'Unable to serialize request: %(request)s - reason: %(reason)s',
                    {'request': candidate, 'reason': err},
                    exc_info=True,
                    extra={'spider': self._spider})

        # Skip the queue entirely when there is nothing to send.
        has_payload = len(urls_other) > 0 or len(serialized_requests) > 0
        if not has_payload:
            return

        self._queue_links.publish({
            'requests_new': serialized_requests,
            'urls_other': urls_other,
        })
Exemple #16
0
 def publish_error(self, request, exception):
     """Publish an error record for *request* to the errors queue.

     The record holds the serialized request, the exception class as reported
     by ``self._get_exception_class`` and the exception's string form.
     """
     serialized_request = request_to_dict(request, self._spider)
     exception_class = self._get_exception_class(exception)
     message = str(exception)
     self._queue_errors.publish({
         'request': serialized_request,
         'cls': exception_class,
         'msg': message,
     })
Exemple #17
0
 def test_reference_callback_serialization(self):
     """Callback and errback serialize to the names expected by this test.

     The request uses ``parse_item_reference`` and ``handle_error_reference``
     of the test spider; after the round-trip check, the serialized dict must
     name them as ``callback`` and ``errback``.
     """
     spider = self.spider
     request = Request(
         "http://www.example.com",
         callback=spider.parse_item_reference,
         errback=spider.handle_error_reference,
     )
     self._assert_serializes_ok(request, spider=self.spider)
     serialized = request_to_dict(request, self.spider)
     self.assertEqual(serialized['callback'], 'parse_item_reference')
     self.assertEqual(serialized['errback'], 'handle_error_reference')
Exemple #18
0
 def _request_to_dict(cls, request, spider=None):
     """Serialize *request* and drop every entry whose value is falsy.

     The full ``request_to_dict`` output is written to the debug log; only
     the entries with truthy values are returned.
     """
     d = request_to_dict(request, spider)
     compact = {key: value for key, value in d.items() if value}
     logger.debug(f"request_to_dict: {d}")
     return compact
Exemple #19
0
 def _encode_request(self, request):
     """Encode a request object.

     Returns the ``request_to_dict`` output with the body rewritten: a
     non-empty body becomes the URL-safe base64 text of its zlib-compressed
     bytes, and an empty ``b""`` body becomes ``None``.
     """
     dic = request_to_dict(request, self.spider)
     body = dic.get("body")
     if body:
         compressed = zlib.compress(request.body)
         dic["body"] = base64.urlsafe_b64encode(compressed).decode("utf-8")
     elif body == b"":
         dic["body"] = None
     return dic
Exemple #20
0
def test_default_callback():
    """A request created without a callback serializes with ``callback`` set to None."""
    spider = MySpider()
    url = 'http://example.com/'
    serialized = request_to_dict(scrapy.Request(url), spider)

    assert isinstance(serialized, dict)
    assert serialized['url'] == url
    assert serialized['callback'] is None
    def process_request(self, request: Request,
                        spider: Spider) -> Optional[Request]:
        """Rewrite an outgoing request into a JSON POST aimed at ``self.url``.

        Returns ``None`` (request left untouched) when the middleware is
        disabled, or when the request's ``META_KEY`` meta entry has a truthy
        ``skip`` or ``original_request`` value. Otherwise returns a copy of
        the request, produced by ``request.replace``, whose URL is
        ``self.url``, whose method is ``POST`` and whose body is the JSON
        payload assembled below.

        Side effects visible here: two stats counters are incremented,
        headers and (on Scrapy < 2.0) flags of the incoming request are
        modified in place, ``request.meta[META_KEY]`` is (re)assigned, and
        ``self.default_args["job_id"]`` is set when the ``SHUB_JOBKEY``
        environment variable is present.
        """
        if not self.enabled:
            return None

        # Per-request settings live under META_KEY; absence means "no settings".
        try:
            crawlera_meta = request.meta[META_KEY]
        except KeyError:
            crawlera_meta = {}

        # "original_request" is written further down by this same method, so a
        # truthy value here means the request has already been through it.
        if crawlera_meta.get("skip") or crawlera_meta.get("original_request"):
            return None

        self._set_download_slot(request, spider)

        self.stats.inc_value("crawlera_fetch/request_count")
        self.stats.inc_value("crawlera_fetch/request_method_count/{}".format(
            request.method))

        # NOTE: this writes into the shared default_args mapping, not a copy.
        shub_jobkey = os.environ.get("SHUB_JOBKEY")
        if shub_jobkey:
            self.default_args["job_id"] = shub_jobkey

        # assemble JSON payload
        # Precedence (later wins): url/body/method < default_args < meta "args".
        original_body_text = request.body.decode(request.encoding)
        body = {"url": request.url, "body": original_body_text}
        if request.method != "GET":
            body["method"] = request.method
        body.update(self.default_args)
        body.update(crawlera_meta.get("args") or {})
        body_json = json.dumps(body)

        # The snapshot of the original request is taken here, before the
        # headers and flags are modified below.
        additional_meta = {
            "original_request": request_to_dict(request, spider=spider),
            "timing": {
                "start_ts": time.time()
            },
        }
        crawlera_meta.update(additional_meta)

        additional_headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
        }
        if self.apikey:
            additional_headers["Authorization"] = self.auth_header
        request.headers.update(additional_headers)

        if shub_jobkey:
            request.headers["X-Crawlera-JobId"] = shub_jobkey

        # Only on Scrapy < 2.0: record the original URL as a request flag,
        # avoiding duplicates.
        if scrapy.version_info < (2, 0, 0):
            original_url_flag = "original url: {}".format(request.url)
            if original_url_flag not in request.flags:
                request.flags.append(original_url_flag)

        # Needed when crawlera_meta was the fresh dict created in the
        # KeyError branch above; otherwise it re-assigns the same object.
        request.meta[META_KEY] = crawlera_meta
        return request.replace(url=self.url, method="POST", body=body_json)
Exemple #22
0
    def _encode_request(self, request):
        """Serialize ``request`` with the configured serializer.

        Requests flagged with ``meta['parser_request']`` are first handed to
        ``__encode_parser_request__`` and are serialized against
        ``self.spider.parse_spider`` instead of ``self.spider``.
        """
        target_spider = self.spider
        is_parser_request = request.meta.get('parser_request')
        if is_parser_request:
            target_spider = self.spider.parse_spider
            self.__encode_parser_request__(request, target_spider)

        return self.serializer.dumps(request_to_dict(request, target_spider))
Exemple #23
0
 def process_exception(self, request, exception, spider):
     """Log a downloader exception, attaching the pickled request dict as extra data."""
     message = 'SentryDownloaderMiddleware %s [%s]' % (exception, spider.name)
     culprit = 'SentryDownloaderMiddleware/%s [spider: %s]' % (type(exception), spider.name)
     data = {
         'request': cPickle.dumps(request_to_dict(request, spider)),
         'exception': exception,
         'spider': spider,
     }
     extra = {'culprit': culprit, 'stack': True, 'data': data}
     self.logger.error(message, exc_info=True, extra=extra)
Exemple #24
0
def queue_url(redis,
              priority: int = 0,
              spider: str = "tordirectory",
              *urls: str):
    """Enqueue ``urls`` for ``spider`` by issuing one ZADD per URL.

    Each URL becomes an unfiltered request with the given priority; the
    pickled request dict is the member and the negated priority is the score.
    The key is ``zqueue:<spider>``.
    """
    queue_key = f"zqueue:{spider}"
    for url in urls:
        request = Request(url, dont_filter=True, priority=priority)
        member = pickle.dumps(request_to_dict(request))
        redis.execute_command('ZADD', queue_key, -priority, member)
Exemple #25
0
 def response_to_dict(response):
     """Return a dict with the response's url, status, headers, body, flags and request."""
     return dict(
         url=to_unicode(response.url),
         status=int(response.status),
         headers=dict(response.headers),
         body=response.body,
         flags=list(response.flags),
         request=request_to_dict(response.request),
     )
Exemple #26
0
 def enqueue_start_request(self, request):
     """Push a start request, in dict form, onto ``self.requestqueue``.

     Does nothing when no request queue is configured. A request that
     cannot be serialized (``ValueError``) is logged at error level.
     """
     queue = self.requestqueue
     if queue is None:
         return
     try:
         queue.push(request_to_dict(request, self))
         self.stats.inc_value('startrequests/enqueued', spider=self)
     except ValueError as err:
         message = 'Non-serializable start request: %s (reason: %s)' % (request, err)
         self.log(message, level=log.ERROR)
Exemple #27
0
 def insert_seed(self, url, catstr):
     """Build a seed listing request for ``url`` and append it to the queue.

     The request is serialized without a spider, its callback is forced to
     ``'parse'``, and the marshalled dict is pushed with ``rpush`` onto
     ``self.queue_key``.
     """
     seed = Request(url)
     seed.meta.update({'page': 'listing', 'count': 0, 'catstr': catstr})

     serialized = request_to_dict(seed)
     serialized['callback'] = 'parse'
     self.redis_server.rpush(self.queue_key, marshal.dumps(serialized))
Exemple #28
0
def response_to_dict(response):
    """Return a dict describing ``response``, including its dotted class path and request."""
    class_path = '{r.__module__}.{r.__class__.__name__}'.format(r=response)
    encoding = response.encoding
    request_dict = request_to_dict(response.request)
    return {
        'cls': class_path,
        'encoding': encoding,
        'request': request_dict,
        'url': response.url,
        'status': response.status,
        'headers': dict(response.headers),
        'meta': response.meta,
        'body': response.body,
    }
Exemple #29
0
 def _dqpush(self, request):
     if self.dqs is None:
         return
     try:
         reqd = request_to_dict(request, self.spider)
         self.dqs.push(reqd, request.priority)
     except ValueError: # non serializable request
         return
     else:
         stats.inc_value('scheduler/disk_enqueued', spider=self.spider)
         return True
Exemple #30
0
 def _request_to_dict(self, request):
     """Serialize ``request`` and normalize its callback, headers and meta.

     A missing callback becomes ``'parse'``. For a ``CrawlSpider`` whose
     request carries a ``rule`` index in its meta, the callback is taken
     from that rule instead.
     """
     serialized = request_to_dict(request, spider=self.spider)
     if serialized['callback']:
         if isinstance(self.spider, CrawlSpider):
             rule_index = request.meta.get('rule')
             if rule_index is not None:
                 serialized['callback'] = self.spider.rules[rule_index].callback
     else:
         serialized['callback'] = 'parse'

     self._clean_headers(serialized['headers'])
     serialized['meta'] = self._parse_meta(serialized)
     return serialized
Exemple #31
0
 def _dqpush(self, request):
     if self.dqs is None:
         return
     try:
         reqd = request_to_dict(request, self.spider)
         self.dqs.push(reqd, -request.priority)
     except ValueError, e: # non serializable request
         if self.logunser:
             log.msg("Unable to serialize request: %s - reason: %s" % \
                 (request, str(e)), level=log.ERROR, spider=self.spider)
         return
Exemple #32
0
    def test_generic_form_requests(self):
        """The "ebay" spider's form request expands into one request per ``_in_kw`` option.

        The first start request's callback is run against the stored
        advanced-search page and the serialized results are compared with
        the expected list (four form submissions plus the original page).
        """
        name = "ebay"
        spider = self.smanager.create(name)
        generic_form_request = list(spider.start_requests())[0]

        # Read the fixture inside a context manager so the file handle is
        # closed deterministically instead of being left to the GC.
        with open(join(_PATH, "data", "ebay_advanced_search.html")) as fixture:
            page_body = fixture.read()
        response = UTF8HtmlResponse(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc", body=page_body)
        response.request = generic_form_request
        request_list = [request_to_dict(req, spider)
                        for req in generic_form_request.callback(response)]
        expected = [{'_encoding': 'utf-8', 'cookies': {}, 'callback': 'after_form_page', 'headers': {}, 'meta': {}, 'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=1&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50', 'errback': None, 'dont_filter': True, 'priority': 0, 'method': 'GET', 'body': b''}, {'_encoding': 'utf-8', 'cookies': {}, 'callback': 'after_form_page', 'headers': {}, 'meta': {}, 'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=2&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50', 'errback': None, 'dont_filter': True, 'priority': 0, 'method': 'GET', 'body': b''}, {'_encoding': 'utf-8', 'cookies': {}, 'callback': 'after_form_page', 'headers': {}, 'meta': {}, 'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=3&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50', 'errback': None, 'dont_filter': True, 'priority': 0, 'method': 'GET', 'body': b''}, {'_encoding': 'utf-8', 'cookies': {}, 'callback': 'after_form_page', 'headers': {}, 'meta': {}, 'url': 'http://www.ebay.com/sch/i.html?_nkw=Cars&_in_kw=4&_ex_kw=&_sacat=0&_okw=&_oexkw=&_adv=1&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_salic=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=&_sop=12&_dmd=1&_ipg=50', 'errback': None, 'dont_filter': True, 'priority': 0, 'method': 'GET', 'body': b''}, {'_encoding': 'utf-8', 'cookies': {}, 'callback': 'parse', 'headers': {}, 'meta': {}, 'url': 'http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'errback': None, 'dont_filter': True, 'priority': 0, 'method': 'GET', 'body': b''}]
        self.assertEqual(request_list, expected)
Exemple #33
0
def response_to_dict(response, spider, include_request=True, **kwargs):
    """Returns a dict based on a response from a spider.

    The dict holds the current timestamp plus the response's url, headers
    and body; the originating request, in dict form, is added under
    ``'request'`` unless ``include_request`` is false.
    """
    serialized = dict(
        time=time.time(),
        url=response.url,
        headers=dict(response.headers),
        body=response.body,
    )
    if not include_request:
        return serialized
    serialized['request'] = request_to_dict(response.request, spider)
    return serialized
Exemple #34
0
 def enqueue_start_request(self, request):
     """Queue a start request in dict form when a request queue is configured.

     Serialization failures (``ValueError``) are logged at error level
     rather than raised.
     """
     if self.requestqueue is not None:
         try:
             serialized = request_to_dict(request, self)
             self.requestqueue.push(serialized)
             self.stats.inc_value('startrequests/enqueued', spider=self)
         except ValueError as exc:
             text = 'Non-serializable start request: %s (reason: %s)' % (request, exc)
             self.log(text, level=log.ERROR)
 def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
     """Return a JSON string identifying ``request``.

     The JSON object holds the stringified ``url``, ``method`` and ``body``
     entries of the request's dict form plus the value of
     ``request_fingerprint(request)`` under ``SCRAPY_FINGERPRINT``; keys are
     sorted. ``to_provide`` is not used.
     """
     serialized = request_to_dict(request)
     request_data = {}
     for key in ("url", "method", "body"):
         if key in serialized:
             request_data[key] = str(serialized[key])
     fp_data = dict(request_data, SCRAPY_FINGERPRINT=request_fingerprint(request))
     return json.dumps(fp_data, ensure_ascii=False, sort_keys=True)
Exemple #36
0
    def test_generic_form_requests_with_spider_args(self):
        """The "ebay3" spider fills the search field from the ``search_string`` spider argument.

        The first start request's callback is run against the stored
        advanced-search page and the serialized results are compared with
        the expected list.
        """
        name = "ebay3"
        args = {'search_string': 'Cars'}
        spider = self.smanager.create(name, **args)
        generic_form_request = list(spider.start_requests())[0]

        # Read the fixture inside a context manager so the file handle is
        # closed deterministically instead of being left to the GC.
        with open(join(_PATH, "data", "ebay_advanced_search.html")) as fixture:
            page_body = fixture.read()
        response = HtmlResponse(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc", body=page_body)
        response.request = generic_form_request
        request_list = [request_to_dict(req, spider)
                             for req in generic_form_request.callback(response)]
        expected = [{'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=1&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=2&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=3&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/i.html?_adv=1&_ex_kw=&_ftrv=1&_ftrt=901&_sabdlo=&_sabdhi=&_sop=12&_samihi=&_ipg=50&_salic=1&_sasl=&_udlo=&_okw=&_fsradio=%26LH_SpecificSeller%3D1&_udhi=&_in_kw=4&_nkw=Cars&_sacat=0&_oexkw=&_dmd=1&_saslop=1&_samilow=', 'dont_filter': True, 'priority': 0, 'callback': 'after_form_page', 'method': 'GET', 'errback': None}, {'body': '', '_encoding': 'utf-8', 'cookies': {}, 'meta': {}, 'headers': {}, 'url': u'http://www.ebay.com/sch/ebayadvsearch/?rt=nc', 'dont_filter': True, 'priority': 0, 'callback': 'parse', 'method': 'GET', 'errback': None}]
        self.assertEqual(request_list, expected)
Exemple #37
0
def response_to_dict(response):
    """Convert Response object to a dict.

    The url and body are converted with ``str_to_unicode`` (errors
    ignored); the request is stored in dict form and its meta is exposed
    separately under ``'meta'``.
    """
    url = str_to_unicode(response.url, errors='ignore')
    headers = dict(response.headers)
    body = str_to_unicode(response.body, errors='ignore')
    return {
        'url': url,
        'headers': headers,
        'body': body,
        'encoding': response.encoding,
        'status': response.status,
        'request': request_to_dict(response.request),
        'meta': response.request.meta,
    }
    def enqueue_request(self, request):
        """Store ``request`` in the collection unless the dupefilter has seen it.

        A filtered duplicate is reported through ``self.df.log`` and nothing
        is stored. Otherwise the request's dict form is inserted together
        with a UTC creation timestamp.
        """
        is_duplicate = not request.dont_filter and self.df.request_seen(request)
        if is_duplicate:
            self.df.log(request, self.spider)
            return

        if self.stats:
            self.stats.inc_value('scheduler/enqueued/mongodb', spider=self.spider)

        document = {
            'data': request_to_dict(request, self.spider),
            'created': datetime.datetime.utcnow(),
        }
        self.collection.insert(document)
Exemple #39
0
def response_to_dict(response, spider, include_request=True, **kwargs):
    """Returns a dict based on a response from a spider.

    The dict holds the current timestamp plus the response's status, url,
    headers and body; the originating request, in dict form, is added
    under ``'request'`` unless ``include_request`` is false.
    """
    serialized = dict(
        time=time.time(),
        status=response.status,
        url=response.url,
        headers=dict(response.headers),
        body=response.body,
    )
    if not include_request:
        return serialized
    serialized['request'] = request_to_dict(response.request, spider)
    return serialized
Exemple #40
0
 def enqueue_request(self, request):
     """Insert ``request`` (dict form) into the collection unless it is a duplicate.

     A filterable request is skipped when a document with the same url and
     method already exists. A request that cannot be serialized
     (``ValueError``) is dropped, and logged when ``self.logunser`` is set.
     """
     if (not request.dont_filter
             and self.collection.find_one({'url': request.url, 'method': request.method}) is not None):
         return
     try:
         document = request_to_dict(request, self.spider)
         self.collection.insert(document)
         self.stats.inc_value('scheduler/enqueued', spider=self.spider)
     except ValueError as err:  # non serializable request
         if self.logunser:
             log.msg(
                 format="Unable to serialize request: %(request)s - reason: %(reason)s",
                 level=log.ERROR,
                 spider=self.spider,
                 request=request,
                 reason=err,
             )
         return
Exemple #41
0
    def dump_url(self,url,spider,rec_url):
        """Create a request for ``url`` and push it, as JSON, onto the spider's phone flow.

        The flow id is ``"phoneflow-" + spider``. The request meta carries
        the flow id, rule index 0 and a client-data dict with ``rec_url``
        and ``spider``.
        """
        req = Request(url)
        flowid = "phoneflow-" + spider
        req.meta.update({
            'flow': flowid,
            'rule': 0,
            'cd': {'rec_url': rec_url, "spider": spider},  # client data
        })
        data = json.dumps(request_to_dict(req))
        Flow(self.server, flowid).q_push(data)
Exemple #42
0
 def _dqpush(self, request):
     if self.dqs is None:
         return
     try:
         reqd = request_to_dict(request, self.spider)
         self.dqs.push(reqd, -request.priority)
     except ValueError as e:  # non serializable request
         if self.logunser:
             log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s",
                     level=log.ERROR, spider=self.spider,
                     request=request, reason=e)
         return
     else:
         return True
Exemple #43
0
 def _dqpush(self, request):
     if self.dqs is None:
         return
     try:
         reqd = request_to_dict(request, self.spider)
         self.dqs.push(reqd, -request.priority)
     except ValueError as e: # non serializable request
         if self.logunser:
             logger.error("Unable to serialize request: %(request)s - reason: %(reason)s",
                          {'request': request, 'reason': e},
                          exc_info=True, extra={'spider': self.spider})
         return
     else:
         return True
Exemple #44
0
    def check_to_resch(self, flow, start, end):
        """Check a stopped flow to see if we need to reschedule it.

        ``start`` and ``end`` are the last run's start and finish time
        (``end`` is not used here). When more than the flow's ``interval``
        seconds have passed since ``start``, every seed URL is pushed back
        onto the flow's queue as a JSON-encoded request dict and the flow
        state is set to ``'ready'``.

        A flow without a ``'seeds'`` entry is logged and left untouched:
        there is nothing to enqueue, so it must not be marked ready (the
        previous code logged and then failed on ``flow['seeds']``).
        """
        resch_period = int(flow.get("interval", 0))
        if time.time() - start <= resch_period:
            return

        if 'seeds' not in flow:
            self.logger.info("flow %s: seed url not found", flow.name)
            return

        for url in flow['seeds']:
            req = Request(url)
            req.meta['flow'] = flow.name
            data = json.dumps(request_to_dict(req))
            flow.q_push(data)

        flow['state'] = 'ready'
        self.logger.info("flow %s: seed inserted, ready-to-go", flow.name)
Exemple #45
0
 def _dqpush(self, request):
     if self.dqs is None:
         return
     try:
         reqd = request_to_dict(request, self.spider)
         self.dqs.push(reqd, -request.priority)
     except ValueError as e:  # non serializable request
         if self.logunser:
             msg = ("Unable to serialize request: %(request)s - reason:"
                    " %(reason)s - no more unserializable requests will be"
                    " logged (stats being collected)")
             logger.warning(msg, {'request': request, 'reason': e},
                            exc_info=True, extra={'spider': self.spider})
             self.logunser = False
         self.stats.inc_value('scheduler/unserializable',
                              spider=self.spider)
         return
     else:
         return True
 def request_to_dict(self, request):
     '''
     Convert Request object to a dict.
     modified from scrapy.utils.reqser

     Besides the plain request attributes, the result carries the names of
     the callback/errback (or None) and, under 'request', a pickle of the
     dict produced by the request_to_dict function visible at module scope.
     '''
     req_dict = {
         # urls should be safe (safe_string_url)
         'url': request.url.decode('ascii'),
         'method': request.method,
         'headers': dict(request.headers),
         'body': request.body,
         'cookies': request.cookies,
         'meta': request.meta,
         '_encoding': request._encoding,
         'priority': request.priority,
         'dont_filter': request.dont_filter,
     }
     # callback/errback are assumed to be a bound instance of the spider
     for hook in ('callback', 'errback'):
         bound = getattr(request, hook)
         req_dict[hook] = None if bound is None else bound.func_name
     serialized = request_to_dict(request, self.spider)
     req_dict['request'] = pickle.dumps(serialized, protocol=-1)
     return req_dict
Exemple #47
0
 def _encode_request(self, request):
   """Encode a request as a crawldoc built from its dict form and a protocol-1 pickle of that dict."""
   as_dict = request_to_dict(request, self.spider_)
   pickled = pickle.dumps(as_dict, protocol=1)
   return self._request_to_crawldoc(as_dict, pickled)
    def enqueue_request(self, request):
        """Marshal the request's dict form and push it to ``self.client``."""
        # Body indented with spaces only: the original mixed a tab-indented
        # line with a space-indented one, which Python 3 rejects (TabError).
        data = marshal.dumps(request_to_dict(request, self.spider))
        self.client.push(data)
 def _assert_serializes_ok(self, request, spider=None):
     """Round-trip ``request`` through its dict form and assert the copy matches."""
     restored = request_from_dict(
         request_to_dict(request, spider=spider),
         spider=spider,
     )
     self._assert_same_request(request, restored)
Exemple #50
0
 def _encode_request(self, request):
     """Encode a request object.

     The request dict is passed through
     ``RequestDeCompress.reduce_request_dict`` and pickled with protocol 1.
     """
     reduced = RequestDeCompress.reduce_request_dict(
         request_to_dict(request, self.spider)
     )
     return pickle.dumps(reduced, protocol=1)
Exemple #51
0
 def _encode_request(self, request):
   """Return the request's dict form pickled with the highest available protocol."""
   as_dict = request_to_dict(request, self.spider)
   return pickle.dumps(as_dict, protocol=-1)
Exemple #52
0
 def push(self, request, encoded=False):
     """Append a request to the list stored under ``self.key`` using ``rpush``.

     When ``encoded`` is true, ``request`` is pushed as given; otherwise
     its dict form is marshalled first.
     """
     data = request if encoded else marshal.dumps(request_to_dict(request, self.spider))
     self.server.rpush(self.key, data)
Exemple #53
0
 def _eqpush(self, request):
     """Serialize ``request`` to JSON and push it onto the flow model's queue."""
     serialized = request_to_dict(request, self.spider)
     self.flowmodel.q_push(json.dumps(serialized))
Exemple #54
0
 def _encode_request(self, request):
     """Encode a request object: dict form dumped with ``self.serializer``."""
     return self.serializer.dumps(request_to_dict(request, self.spider))
Exemple #55
0
 def push(self, request, priority=0):
     """Push ``request`` with ``priority`` via the parent queue.

     When ``self.serialize`` is set, the request is converted to its dict
     form first; otherwise it is forwarded unchanged.
     """
     item = request_to_dict(request, self.spider) if self.serialize else request
     super(ScrapyPriorityQueue, self).push(item, priority)
Exemple #56
0
 def enqueue_request(self, request):
     """Queue ``request`` and return True, or return False for a filtered duplicate.

     The dupefilter is only consulted when ``request.dont_filter`` is unset.
     """
     if request.dont_filter or not self.dupefilter.request_seen(request):
         self.queue.push(request_to_dict(request, self.spider), request.priority)
         return True
     return False
Exemple #57
0
 def _encode_request(self, request):
     """Encode a request object: its dict form pickled with the highest protocol."""
     request_dict = request_to_dict(request, self.spider)
     return pickle.dumps(request_dict, protocol=-1)
Exemple #58
0
 def push(self, request):
     """Add ``request`` under ``self.key`` via ``self.redis.zadd``.

     The marshalled dict form of the request is the member and the negated
     request priority is its score.
     """
     data = marshal.dumps(request_to_dict(request, self.spider))
     # Single member -> score mapping, expanded into keyword arguments below.
     # NOTE(review): presumably targets an older redis client whose zadd()
     # accepts name=score keyword pairs - confirm against the client version.
     pairs = {data: -request.priority}
     self.redis.zadd(self.key, **pairs)