Code example #1
 def process_response(self, request, response, spider):
     """ If we were logged out, login again and retry request.
     """
     if request.meta.get('_autologin') and self.is_logout(response):
         autologin_meta = request.meta['_autologin']
         if isinstance(autologin_meta['request'], dict):
             retryreq = request_from_dict(autologin_meta['request'], spider)
         else:
             retryreq = autologin_meta['request'].copy()
         retryreq.dont_filter = True
         logger.debug('Logout at %s: %s', retryreq.url,
                      _response_cookies(response))
         if self.logged_in:
             # We could have already done relogin after initial logout
             if any(autologin_meta['cookie_dict'].get(c['name']) !=
                    c['value'] for c in self.auth_cookies):
                 logger.debug('Request was stale, will retry %s', retryreq)
             else:
                 self.logged_in = False
                 # It's better to re-login straight away
                 yield self._ensure_login(retryreq, spider)
                 logout_count = retryreq.meta['autologin_logout_count'] = (
                     retryreq.meta.get('autologin_logout_count', 0) + 1)
                 if logout_count >= self.max_logout_count:
                     logger.debug('Max logouts exceeded, will not retry %s',
                                  retryreq)
                     raise IgnoreRequest
                 else:
                     logger.debug(
                         'Request caused log out (%d), still retrying %s',
                         logout_count, retryreq)
         returnValue(retryreq)
     returnValue(response)
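
Note: example #1 rebuilds a retry request from a dict that was stored earlier in request.meta. For readers who have not used the helper before, here is a minimal, self-contained round-trip sketch; it is not taken from any of the projects listed, DemoSpider is a made-up spider, and the import location depends on the Scrapy version.

import scrapy

try:
    # Scrapy >= 2.6
    from scrapy.utils.request import request_to_dict, request_from_dict
except ImportError:
    # older releases
    from scrapy.utils.reqser import request_to_dict, request_from_dict


class DemoSpider(scrapy.Spider):
    name = "demo"

    def parse(self, response):
        pass


spider = DemoSpider()
original = scrapy.Request("https://example.com", callback=spider.parse,
                          meta={"depth": 1})

# request_to_dict() returns a plain, pickle/JSON-friendly dict; the callback is
# stored by name, which is why the spider is needed to resolve it on the way back.
d = request_to_dict(original, spider=spider)
restored = request_from_dict(d, spider=spider)
assert restored.url == original.url
assert restored.callback == spider.parse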
Code example #2
File: cli.py Project: ryonlife/scrapy-autounit
    def update(self):
        to_update = []
        if self.fixture:
            to_update.append(self.fixture_path)
        else:
            target = os.path.join(self.callback_dir, "*.bin")
            to_update = glob(target)

        for path in to_update:
            data, _, spider, _ = prepare_callback_replay(path)

            request = request_from_dict(data['request'], spider)

            response_cls = auto_import(
                data['response'].pop('cls', 'scrapy.http.HtmlResponse')
            )
            response = response_cls(
                request=request, **data['response'])

            data["result"], _ = parse_callback_result(
                request.callback(response), spider
            )

            fixture_dir, filename = os.path.split(path)
            fixture_index = re.search(r"\d+", filename).group()
            add_sample(fixture_index, fixture_dir, filename, data)

            print("Fixture '{}' successfully updated.".format(
                os.path.relpath(path)))
Code example #3
    def next_request(self):
        entry = self.collection.find_and_modify(sort={"$natural": self.queue_order}, remove=True)
        if entry:
            request = request_from_dict(entry["data"], self.spider)
            return request

        return None
Code example #4
File: utils.py Project: asdbaihu/scrapy-autounit
    def test(self):
        fx_result = data['result']
        fx_version = data.get('python_version')

        request = request_from_dict(data['request'], spider)
        response = HtmlResponse(request=request, **data['response'])

        middlewares = []
        middleware_paths = data['middlewares']
        for mw_path in middleware_paths:
            try:
                mw_cls = load_object(mw_path)
                mw = create_instance(mw_cls, settings, crawler)
                middlewares.append(mw)
            except NotConfigured:
                continue

        crawler.signals.send_catch_log(signal=signals.spider_opened,
                                       spider=spider)

        for mw in middlewares:
            if hasattr(mw, 'process_spider_input'):
                mw.process_spider_input(response, spider)

        result = arg_to_iter(request.callback(response))
        middlewares.reverse()

        for mw in middlewares:
            if hasattr(mw, 'process_spider_output'):
                result = mw.process_spider_output(response, result, spider)

        for index, (cb_obj, fx_item) in enumerate(
                six.moves.zip_longest(result,
                                      fx_result,
                                      fillvalue=NO_ITEM_MARKER)):
            if any(item == NO_ITEM_MARKER for item in (cb_obj, fx_item)):
                raise AssertionError(
                    "The fixture's data length doesn't match with "
                    "the current callback's output length.")

            cb_obj = parse_object(cb_obj, spider)

            fx_obj = fx_item['data']
            if fx_item['type'] == 'request':
                clean_request(fx_obj, settings)
                clean_request(cb_obj, settings)
            else:
                clean_item(fx_obj, settings)
                clean_item(cb_obj, settings)

            if fx_version == 2 and six.PY3:
                fx_obj = binary_check(fx_obj, cb_obj, encoding)

            try:
                datadiff.tools.assert_equal(fx_obj, cb_obj)
            except AssertionError as e:
                six.raise_from(
                    AssertionError(
                        "Callback output #{} doesn't match recorded "
                        "output:{}".format(index, e)), None)
Code example #5
    def open(self, spider):
        self.spider = spider

        try:
            self.queue = load_object(self.queue_cls)(
                server=self.server,
                spider=spider,
                key=self.queue_key % {
                    'spider': spider.name
                },
                serializer=self.serializer,
            )
        except TypeError as e:
            raise ValueError("Failed to instantiate queue class '%s': %s",
                             self.queue_cls, e)

        try:
            # for req_dict in self.col.find({"meta.category": {"$in": list(JdSpider.included_cat_list)}},
            #                               {'_id': 0}):
            count = 0
            for req_dict in self.col.find(
                {
                    "meta.item.category": {
                        "$in": list(JdSpider.included_cat_list)
                    }
                }, {'_id': 0}):
                # for req_dict in self.col.find({}, {'_id': 0}):
                # if 'item' in req_dict['meta'] and len(req_dict['meta']['item'][GIF.SKUID]) >= 10:
                #     continue
                print(
                    '-------------------add failure request to queue-------------------'
                )
                count += 1
                req = request_from_dict(req_dict, spider)
                req.dont_filter = True
                req.meta['dont_redirect'] = False
                req.priority = 2
                self.enqueue_request(req)
            print(count)
        finally:
            self.client.close()

        try:
            self.df = load_object(self.dupefilter_cls)(
                server=self.server,
                key=self.dupefilter_key % {
                    'spider': spider.name
                },
                debug=spider.settings.getbool('DUPEFILTER_DEBUG'),
            )
        except TypeError as e:
            raise ValueError("Failed to instantiate dupefilter class '%s': %s",
                             self.dupefilter_cls, e)

        if self.flush_on_start:
            self.flush()
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" %
                       len(self.queue))
Code example #6
File: spiders.py Project: zt9/CodePool
 def _make_request(self, mframe, hframe, body):
     try:
         request = request_from_dict(pickle.loads(body), self)
     except Exception as e:
         body = body.decode()
         request = scrapy.Request(body, callback=self.parse)
     return request
Code example #7
    def next_request(self):
        entry = self.collection.find_and_modify(sort={"$natural": self.queue_order}, remove=True)
        if entry:
            request = request_from_dict(entry['data'], self.spider)
            return request

        return None
Code example #8
def get_config_requests(test_dir, spider, max_fixtures):
    curr_fixture_count = get_num_fixtures(test_dir)
    config = get_cb_settings(test_dir)
    try:
        requests_to_add = config.REQUESTS_TO_ADD
    except AttributeError:
        return []

    defaults = {
        'method': 'GET',
        'headers': None,
        'body': None,
        'cookies': None,
        'meta': None,
        '_encoding': 'utf-8',
        'priority': 0,
        'dont_filter': False,
        'errback': None,
        'flags': None,
        'cb_kwargs': None
    }
    complete_requests = []
    for req in requests_to_add:
        if curr_fixture_count < max_fixtures:
            for key, val in defaults.items():
                req[key] = req.get(key, val)
            req['callback'] = _get_method(spider, test_dir.split('/')[-1])
            req['meta']['_update'] = 1
            req['meta']['_fixture'] = curr_fixture_count + 1
            complete_requests.append(req)
            curr_fixture_count += 1
        else:
            break
    complete_requests = [request_from_dict(req) for req in complete_requests]
    return complete_requests
Code example #9
File: queue.py Project: Gerapy/GerapyRabbitMQ
 def _decode_request(self, encoded_request):
     """
     decode request
     :param encoded_request:
     :return:
     """
     return request_from_dict(pickle.loads(encoded_request), self.spider)
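
Most of the message-queue classes in this listing (examples #9, #33, #41, #50 and the scrapy-redis variants) only show the decode half. The encode half is normally the mirror image; the sketch below is an assumption about what it looks like, not code copied from GerapyRabbitMQ or any other project here.

import pickle

try:
    from scrapy.utils.request import request_to_dict  # Scrapy >= 2.6
except ImportError:
    from scrapy.utils.reqser import request_to_dict  # older releases


def encode_request(request, spider):
    """Turn a Request into bytes that a _decode_request() like the one above can restore."""
    return pickle.dumps(request_to_dict(request, spider=spider), protocol=-1)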
Code example #10
    def recurse_from_dict(self, node, spider=None):
        newnode = None
        if isinstance(node, dict):
            if '__response__' in node:
                if node['__response__'] == 'Response':
                    cls = Response
                elif node['__response__'] == 'TextResponse':
                    cls = TextResponse
                else:
                    cls = HtmlResponse

                newnode = self.response_from_dict(node, cls)
            elif '__request__' in node:
                newnode = request_from_dict(node, spider)
                for k in newnode.meta:
                    newnode.meta[k] = self.recurse_from_dict(newnode.meta[k], spider=spider)
            else:
                # plain dict: rebuild each value recursively
                newnode = {}
                for k in node:
                    newnode[k] = self.recurse_from_dict(node[k], spider=spider)
        elif isinstance(node, tuple):
            newnode = tuple(map(functools.partial(self.recurse_from_dict, spider=spider), node))
        elif isinstance(node, list):
            newnode = list(map(functools.partial(self.recurse_from_dict, spider=spider), node))
        else:
            if isinstance(node, unicode):
                node = node.encode('utf8')
            newnode = node

        return newnode
Code example #11
    def next_request(self):
        data = self.client.pop()
        if data is None or len(data) == 0:
            return None

        request = request_from_dict(marshal.loads(data), self.spider)
        return request
Code example #12
 def pop(self):
   # use atomic range/remove using multi/exec
   pipe = self.redis.pipeline()
   pipe.multi()
   pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)
   results, count = pipe.execute()
   if results:
     return request_from_dict(marshal.loads(results[0]), self.spider)
Code example #13
File: queue.py Project: echobfy/weiboSearchCrawler
 def pop(self):
     # use atomic range/remove using multi/exec
     pipe = self.redis.pipeline()
     pipe.multi()
     pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)
     results, count = pipe.execute()
     if results:
         return request_from_dict(marshal.loads(results[0]), self.spider)
Code example #14
 def _dqpop(self):
     if self.queue:
         d = self.queue.get()
         if d:
             return request_from_dict(
                 json.loads(d, object_hook=scrapy_request_decoder),
                 self.spider
             )
Code example #15
File: scheduler.py Project: usakey/Any
 def next_request(self):
     request = self.collection.find_and_modify({'last_downloaded': {'$exists': False}},
                                               sort=[('priority', pymongo.DESCENDING)],
                                               update={'$set': {'last_downloaded': datetime.today()}})
     if request:
         request = request_from_dict(request, spider=self.spider)
         self.stats.inc_value('scheduler/dequeued', spider=self.spider)
     return request
Code example #16
File: endpoint.py Project: linhr/dighub
 def dequeue_start_request(self):
     if self.requestqueue is None:
         return
     d = self.requestqueue.pop()
     if d is None:
         return
     self.stats.inc_value('startrequests/dequeued', spider=self)
     return request_from_dict(d, self)
Code example #17
 def dequeue_start_request(self):
     if self.requestqueue is None:
         return
     d = self.requestqueue.pop()
     if d is None:
         return
     self.stats.inc_value('startrequests/dequeued', spider=self)
     return request_from_dict(d, self)
Code example #18
File: squeues.py Project: atharwa-24/scrapy
        def pop(self):
            request = super().pop()

            if not request:
                return None

            request = request_from_dict(request, self.spider)
            return request
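
Example #18 is the pop() half of Scrapy's serializing queue wrapper; requests are stored as plain dicts because a Request with a bound callback cannot be pickled directly. The toy class below illustrates both halves of that pattern against an in-memory list; it is a sketch for illustration, not the wrapper from the atharwa-24/scrapy fork.

try:
    from scrapy.utils.request import request_to_dict, request_from_dict  # Scrapy >= 2.6
except ImportError:
    from scrapy.utils.reqser import request_to_dict, request_from_dict  # older releases


class ListBackedRequestQueue:
    """In-memory stand-in for a serializing request queue."""

    def __init__(self, spider):
        self.spider = spider
        self._items = []

    def push(self, request):
        # store a plain dict, not the Request object itself
        self._items.append(request_to_dict(request, spider=self.spider))

    def pop(self):
        if not self._items:
            return None
        return request_from_dict(self._items.pop(), spider=self.spider)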
Code example #19
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                if 'request' in item:
                    req = request_from_dict(pickle.loads(item['request']),
                                            self.spider)
                else:
                    req = Request(item['url'], meta=make_splash_meta({}))
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'],
                              meta=make_splash_meta({}))

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                if key != 'request':
                    req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])

            return req

        return None
Code example #20
File: queue.py Project: cfhb/crawl_youtube
 def _decode_request(self, encoded_request):
     """Decode an request previously encoded"""
     try:
       red_dict = pickle.loads(encoded_request)
       org_dict = RequestDeCompress.restore_request_dict(red_dict)
       return request_from_dict(org_dict, self.spider)
     except Exception, e:
       self.spider.log('Failed decode request:%s' % (e.message))
       return None
Code example #21
    def _decode_request(self, encoded_request):
        """Decode an request previously encoded"""
        obj = self.serializer.loads(encoded_request)
        spider = self.spider
        if obj['meta'].get('parser_request'):
            spider = self.spider.parse_spider
            self.__decode_parser_request__(obj, spider)

        return request_from_dict(obj, spider)
Code example #22
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                if 'request' in item:
                    req = request_from_dict(pickle.loads(item['request']), self.spider)
                else:
                    req = Request(item['url'], meta=make_splash_meta({}))
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'], meta=make_splash_meta({}))

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                if key != 'request' :
                    req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])

            return req

        return None
Code example #23
File: scheduler.py Project: LightKool/scraper
 def next_request(self):
     item = self._retrieve_from_queue()
     if item:
         try:
             request = request_from_dict(item, self.spider)
         except KeyError:
             request = self._request_from_dict(item)
         return self._populate_request(request, item)
     else:
         return None
Code example #24
    def next_request(self):
        entry = self.collection.find_and_modify(sort={"$natural": self.queue_order}, remove=True)
        if entry:
            request = request_from_dict(entry['data'], self.spider)
            if request and self.stats:
                self.stats.inc_value('scheduler/dequeued/mongodb', spider=self.spider)

            return request

        return None
Code example #25
 def _make_request(self, mframe, hframe, body):
     try:
         request = request_from_dict(json.loads(body), self)
     except Exception:
         # body = body.decode()
         data = json.loads(body, encoding="utf-8")
         request = scrapy.Request(data['url'],
                                  callback=self.parse,
                                  dont_filter=True,
                                  meta=data['params'])
     return request
Code example #26
File: squeues.py Project: shantikumar/Web-Scraper
        def peek(self):
            """Returns the next object to be returned by :meth:`pop`,
            but without removing it from the queue.

            Raises :exc:`NotImplementedError` if the underlying queue class does
            not implement a ``peek`` method, which is optional for queues.
            """
            request = super().peek()
            if not request:
                return None
            return request_from_dict(request, self.spider)
Code example #27
 def _decode_request(self, encoded_request):
   """Decode an request previously encoded"""
   try:
     red_dict = pickle.loads(encoded_request)
     org_dict = RequestDeCompress.restore_request_dict(red_dict)
     return request_from_dict(org_dict, self.spider)
   except Exception, e:
     #import traceback
     #print 'Failed decode reqeust'
     #print traceback.format_exc()
     self.spider.log('Failed decode request:%s, %s' % (e.message, encoded_request))
     return None
Code example #28
    def _make_request(self, mframe, hframe, body):
        """
        Choose the request strategy: simple or splash.
        {"url":"https://www.xuexi.cn/lgpage/detail/index.html?id=10522373568484213565&item_id=10522373568484213565","fields":{"content":"/html/body/div[@id='root']/div[@class='main-view']/section[@class='_3GhgGH8Y4Zh8H0uBP5aUMD _3mVsbsHWKWuZwBS5zIrFO9']/div[@class='oSnRgpdW2BnrDruxKh9We _3mVsbsHWKWuZwBS5zIrFO9']/div/div/div[@class='Iuu474S1L6y5p7yalKQbW grid-gr']/div[@class='grid-cell'][2]/section[@class='_3GhgGH8Y4Zh8H0uBP5aUMD _3mVsbsHWKWuZwBS5zIrFO9']/div[@class='oSnRgpdW2BnrDruxKh9We _3mVsbsHWKWuZwBS5zIrFO9']/div/div/div[@class='Iuu474S1L6y5p7yalKQbW grid-gr']/div[@class='grid-cell']/div[@class='render-detail-article']/div[@class='render-detail-article-content']/div[@class='render-detail-content cke-mode']"},"type":1}
        """
        try:
            item = json.loads(str(body, "utf-8"))
        except Exception:
            # request = request_from_dict(pickle.loads(body), self)
            logger.error("Malformed request payload; falling back to the pickled request")
            return request_from_dict(pickle.loads(body), self)

        return self.create_request(item)
Code example #29
File: scheduler.py Project: cfhb/crawl_youtube
 def _decode_request(self, crawl_doc):
   """Decode an request previously encoded"""
   try:
     if not crawl_doc or not crawl_doc.request or not crawl_doc.request.meta:
       self.logger_.info('recalled request: %s', crawl_doc.url)
       return self.spider_._create_request(url=crawl_doc.url,
                                           page_type=crawl_doc.page_type,
                                           doc_type=crawl_doc.doc_type,
                                           schedule_doc_type=ScheduleDocType.RECRAWL_PLAY,
                                           dont_filter=True)
     red_dict = pickle.loads(crawl_doc.request.meta)
     request = request_from_dict(red_dict, self.spider_)
     request.meta['crawl_doc'] = crawl_doc
     return request
   except:
     self.logger_.exception('failed decode request: %s', crawl_doc)
     return None
Code example #30
File: utils.py Project: hirajanwin/frontoxy
def response_from_dict(responsed):
    respcls = load_object(responsed['cls'])

    request = request_from_dict(responsed['request'])

    response = respcls(
        encoding=responsed['encoding'],
        request=request,
        url=responsed['url'],
        status=responsed['status'],
        headers=responsed['headers'],
        body=responsed['body'],
    )

    response.meta.update(responsed['meta'])

    return response
Code example #31
File: utils.py Project: sulthonzh/scrapy-autounit
    def test(self):
        fixture_objects = data['result']

        request = request_from_dict(data['request'], spider)
        response = HtmlResponse(request=request, **data['response'])

        middlewares = []
        middleware_paths = data['middlewares']
        for mw_path in middleware_paths:
            try:
                mw_cls = load_object(mw_path)
                mw = create_instance(mw_cls, settings, crawler)
                middlewares.append(mw)
            except NotConfigured:
                continue

        crawler.signals.send_catch_log(signal=signals.spider_opened,
                                       spider=spider)

        for mw in middlewares:
            if hasattr(mw, 'process_spider_input'):
                mw.process_spider_input(response, spider)

        result = request.callback(response) or []
        middlewares.reverse()

        for mw in middlewares:
            if hasattr(mw, 'process_spider_output'):
                result = mw.process_spider_output(response, result, spider)

        if isinstance(result, (Item, Request, dict)):
            result = [result]

        for index, _object in enumerate(result):
            fixture_data = fixture_objects[index]['data']
            if fixture_objects[index].get('type') == 'request':
                clean_request(fixture_data, settings)
            else:
                clean_item(fixture_data, settings)

            _object = parse_object(_object, spider)
            self.assertEqual(fixture_data, _object, 'Not equal!')
Code example #32
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()
            self.expire_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug(u"Found url to crawl {url}" \
                    .format(url=item['url']))
            if 'meta' in item:
                # item is a serialized request
                req = request_from_dict(item, self.spider)
            else:
                # item is a feed from outside, parse it manually
                req = self.request_from_feed(item)

            # extra check to add items to request
            if 'useragent' in req.meta and req.meta['useragent'] is not None:
                req.headers['User-Agent'] = req.meta['useragent']
            if 'cookie' in req.meta and req.meta['cookie'] is not None:
                if isinstance(req.meta['cookie'], dict):
                    req.cookies = req.meta['cookie']
                elif isinstance(req.meta['cookie'], string_types):
                    req.cookies = self.parse_cookie(req.meta['cookie'])

            return req

        return None
Code example #33
File: task_queue.py Project: heicks/github-crawler
 def _decode_request(self, encoded_request):
   return request_from_dict(pickle.loads(encoded_request), self.spider)
Code example #34
File: scheduler.py Project: ArturGaspar/scrapy
 def _dqpop(self):
     if self.dqs:
         d = self.dqs.pop()
         if d:
             return request_from_dict(d, self.spider)
Code example #35
 def _assert_serializes_ok(self, request, spider=None):
     d = request_to_dict(request, spider=spider)
     request2 = request_from_dict(d, spider=spider)
     self._assert_same_request(request, request2)
Code example #36
File: queue.py Project: cfhb/crawl_youtube
 def _decode_request(self, encoded_request):
     """Decode an request previously encoded"""
     red_dict = pickle.loads(encoded_request)
     org_dict = RequestDeCompress.restore_request_dict(red_dict)
     return request_from_dict(org_dict, self.spider)
Code example #37
File: queue.py Project: roycehaynes/scrapy-redis
 def _decode_request(self, encoded_request):
     """Decode a request previously encoded"""
     request = request_from_dict(pickle.loads(encoded_request), self.spider)
     return request
Code example #38
 def pop(self):
     request = super(ScrapyPriorityQueue, self).pop()
     if request and self.serialize:
         request = request_from_dict(request, self.spider)
     return request
Code example #39
 def _decode_request(self, encoded_request):
     """Decode an request previously encoded"""
     if encoded_request.get("body") and encoded_request.get("body") is not None:
         encoded_request["body"]=zlib.decompress(base64.urlsafe_b64decode(encoded_request["body"].encode("utf-8")))
     return request_from_dict(encoded_request, self.spider)
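
Example #39 expects the request body to have been zlib-compressed and then URL-safe base64-encoded before it was queued. Assuming that is the convention, the encode step would simply be the reverse of the expression above; a small sketch:

import base64
import zlib


def compress_body(body: bytes) -> str:
    """Compress a request body and make it text-safe, matching the decode in example #39."""
    return base64.urlsafe_b64encode(zlib.compress(body)).decode("utf-8")


def decompress_body(encoded: str) -> bytes:
    """Reverse of compress_body()."""
    return zlib.decompress(base64.urlsafe_b64decode(encoded.encode("utf-8")))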
Code example #40
File: scheduler.py Project: xacprod/ve1
 def _dqpop(self):
     if self.dqs:
         d = self.dqs.pop()
         if d:
             return request_from_dict(d, self.spider)
Code example #41
File: queue.py Project: wang1352083/eastmoney
 def _decode_request(self, encoded_request):
     """Decode an request previously encoded"""
     return request_from_dict(pickle.loads(encoded_request), self.spider)
Code example #42
def decode_request(data):
    """Decode an request previously encoded"""
    return request_from_dict(pickle.loads(data))
Code example #43
File: queue.py Project: 15310944349/scrapy-redis
 def _decode_request(self, encoded_request):
     """Decode an request previously encoded"""
     obj = self.serializer.loads(encoded_request)
     return request_from_dict(obj, self.spider)
Code example #44
    def process_response(self, request: Request, response: Response,
                         spider: Spider) -> Response:
        try:
            crawlera_meta = request.meta[META_KEY]
        except KeyError:
            crawlera_meta = {}

        if crawlera_meta.get(
                "skip") or not crawlera_meta.get("original_request"):
            return response

        original_request = request_from_dict(crawlera_meta["original_request"])

        self.stats.inc_value("crawlera_fetch/response_count")
        self._calculate_latency(request)

        self.stats.inc_value("crawlera_fetch/api_status_count/{}".format(
            response.status))

        if response.headers.get("X-Crawlera-Error"):
            message = response.headers["X-Crawlera-Error"].decode("utf8")
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value(
                "crawlera_fetch/response_error/{}".format(message))
            log_msg = "Error downloading <{} {}> (status: {}, X-Crawlera-Error header: {})"
            log_msg = log_msg.format(
                original_request.method,
                original_request.url,
                response.status,
                message,
            )
            if self.raise_on_error:
                raise CrawleraFetchException(log_msg)
            else:
                logger.error(log_msg)
                return response

        try:
            json_response = json.loads(response.text)
        except json.JSONDecodeError as exc:
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value(
                "crawlera_fetch/response_error/JSONDecodeError")
            log_msg = "Error decoding <{} {}> (status: {}, message: {}, lineno: {}, colno: {})"
            log_msg = log_msg.format(
                original_request.method,
                original_request.url,
                response.status,
                exc.msg,
                exc.lineno,
                exc.colno,
            )
            if self.raise_on_error:
                raise CrawleraFetchException(log_msg) from exc
            else:
                logger.error(log_msg)
                return response

        if json_response.get("crawlera_error"):
            error = json_response["crawlera_error"]
            message = json_response["body"]
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value(
                "crawlera_fetch/response_error/{}".format(error))
            log_msg = (
                "Error downloading <{} {}> (Original status: {}, Fetch API error message: {})"
            )
            log_msg = log_msg.format(
                original_request.method,
                original_request.url,
                json_response["original_status"],
                message,
            )
            if self.raise_on_error:
                raise CrawleraFetchException(log_msg)
            else:
                logger.error(log_msg)
                return response

        self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(
            json_response["original_status"]))

        crawlera_meta["upstream_response"] = {
            "status": response.status,
            "headers": response.headers,
            "body": json_response,
        }
        respcls = responsetypes.from_args(
            headers=json_response["headers"],
            url=json_response["url"],
            body=json_response["body"],
        )
        return response.replace(
            cls=respcls,
            request=original_request,
            headers=json_response["headers"],
            url=json_response["url"],
            body=json_response["body"],
            status=json_response["original_status"],
        )
Code example #45
File: pqueues.py Project: elacuesta/scrapy
 def pop(self):
     request = super(ScrapyPriorityQueue, self).pop()
     if request and self.serialize:
         request = request_from_dict(request, self.spider)
     return request
Code example #46
    def process_response(self, request: Request, response: Response, spider: Spider) -> Response:
        try:
            crawlera_meta = request.meta[META_KEY]
        except KeyError:
            crawlera_meta = {}

        if crawlera_meta.get("skip") or not crawlera_meta.get("original_request"):
            return response

        original_request = request_from_dict(crawlera_meta["original_request"], spider=spider)

        self.stats.inc_value("crawlera_fetch/response_count")
        self._calculate_latency(request)

        self.stats.inc_value("crawlera_fetch/api_status_count/{}".format(response.status))

        if response.headers.get("X-Crawlera-Error"):
            message = response.headers["X-Crawlera-Error"].decode("utf8")
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value("crawlera_fetch/response_error/{}".format(message))
            log_msg = "Error downloading <{} {}> (status: {}, X-Crawlera-Error header: {})"
            log_msg = log_msg.format(
                original_request.method,
                original_request.url,
                response.status,
                message,
            )
            if self.raise_on_error:
                raise CrawleraFetchException(log_msg)
            else:
                logger.warning(log_msg)
                return response

        try:
            json_response = json.loads(response.text)
        except json.JSONDecodeError as exc:
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value("crawlera_fetch/response_error/JSONDecodeError")
            log_msg = "Error decoding <{} {}> (status: {}, message: {}, lineno: {}, colno: {})"
            log_msg = log_msg.format(
                original_request.method,
                original_request.url,
                response.status,
                exc.msg,
                exc.lineno,
                exc.colno,
            )
            if self.raise_on_error:
                raise CrawleraFetchException(log_msg) from exc
            else:
                logger.warning(log_msg)
                return response

        server_error = json_response.get("crawlera_error") or json_response.get("error_code")
        original_status = json_response.get("original_status")
        request_id = json_response.get("id") or json_response.get("uncork_id")
        if server_error:
            message = json_response.get("body") or json_response.get("message")
            self.stats.inc_value("crawlera_fetch/response_error")
            self.stats.inc_value("crawlera_fetch/response_error/{}".format(server_error))
            log_msg = (
                "Error downloading <{} {}> (Original status: {}, "
                "Fetch API error message: {}, Request ID: {})"
            )
            log_msg = log_msg.format(
                original_request.method,
                original_request.url,
                original_status or "unknown",
                message,
                request_id or "unknown",
            )
            if self.raise_on_error:
                raise CrawleraFetchException(log_msg)
            else:
                logger.warning(log_msg)
                return response

        self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status))

        crawlera_meta["upstream_response"] = {
            "status": response.status,
            "headers": response.headers,
            "body": json_response,
        }
        try:
            resp_body = base64.b64decode(json_response["body"], validate=True)
        except (binascii.Error, ValueError):
            resp_body = json_response["body"]

        respcls = responsetypes.from_args(
            headers=json_response["headers"],
            url=json_response["url"],
            body=resp_body,
        )
        return response.replace(
            cls=respcls,
            request=original_request,
            headers=json_response["headers"],
            url=json_response["url"],
            body=resp_body,
            status=original_status or 200,
        )
Code example #47
 def _decode_request(self, encoded_request):
     """Decode an request previously encoded"""
     obj = self.serializer.loads(encoded_request)
     return request_from_dict(obj, self.spider)
Code example #48
File: scheduler.py Project: hw20686832/crawler
 def _dqpop(self):
     if self.domainmodel:
         d = self.domainmodel.q_pop()
         if d:
             return request_from_dict(json.loads(d), self.spider)
Code example #49
File: queue.py Project: mobishift2011/amzn
 def pop(self):
     # use atomic range/remove using multi/exec
     result = self.server.lpop(self.key)
     if result:
         return request_from_dict(marshal.loads(result), self.spider)
Code example #50
File: queue.py Project: 0326/distribute_crawler
 def _decode_request(self, encoded_request):
     """Decode an request previously encoded"""
     return request_from_dict(pickle.loads(encoded_request), self.spider)
Code example #51
 def __decode_parser_request__(self, obj, spider):
     if obj['meta'].get('garment'):
         serialized_requests = obj['meta']['garment']['meta']['requests_queue']
         requests_queue = [request_from_dict(req, spider) for req in serialized_requests]
         obj['meta']['garment']['meta']['requests_queue'] = requests_queue