Exemple #1
0
 def process_request(self, request, spider):
     current_req_data = self.req_data_list[self.counter % self.wx_num]
     req_data = TidyReqData.req_to_dict(
         current_req_data['load_more']['req_data'])
     request.set_method(req_data['method'])
     req_data['url_param_dict']['offset'] = request.meta['list_offset']
     url = req_data['url'] + dict_to_str(req_data['url_param_dict'])
     request._set_url(url)
     request.set_headers(req_data['headers'])
     self.counter += 1
     return None
 def process_request(self, request, spider):
     current_req_data = self.req_data_list[self.counter % self.wx_num]
     req_data = TidyReqData.req_to_dict(
         current_req_data['content']['req_data'])
     url = request._get_url()
     raw_url = copy(url)
     if "https" in raw_url:
         raw_url = raw_url.replace("https", "http")
     request.set_ext_data({"raw_url": raw_url})
     if "https" not in url:
         url = url.replace("http", "https")
     request._set_url(url)
     request.set_method(req_data['method'])
     if "Cookie" in req_data['headers']:
         req_data['headers'].pop("Cookie")
     request.set_headers(req_data['headers'])
     self.counter += 1
     return None
 def process_request(self, request, spider):
     current_req_data = self.req_data_list[self.counter % self.wx_num]
     req_data = TidyReqData.req_to_dict(
         current_req_data['getappmsgext']['req_data'])
     content_url = request._get_url()
     content_url_param_dict = str_to_dict(
         content_url.split('?')[-1], '&', '=')
     body_dict = req_data['body_dict']
     body_dict.update(content_url_param_dict)
     body_dict['comment_id'] = request.get_ext_data['comment_id']
     body_dict['is_need_reward'] = 1
     url = req_data['url'] + req_data['url_param_str']
     request._set_url(url)
     request.set_method(req_data['method'])
     request.set_headers(req_data['headers'])
     body_str = dict_to_str(body_dict)
     request._set_body(body_str)
     self.counter += 1
     return None
    def prepare_req_data(self, current_req_data, request, _type):
        """
        :param current_req_data: 本轮请求需要使用的请求参数
        :param request: Request对象
        :return: 准备爬取阅读数据的请求参数
        """
        request_data = {}

        if _type in ['getappmsgext', 'appmsg_comment']:
            req_data = TidyReqData.req_to_dict(
                current_req_data[_type]['req_data'])
        else:
            return request_data

        #根据原始文章的url构建body参数
        content_url = request._get_url()
        content_url_param_dict = str_to_dict(
            content_url.split('?')[-1], '&', '=')
        body_dict = copy(req_data['body_dict'])
        from tools.utils import update_dict_by_dict
        update_dict_by_dict(body_dict, content_url_param_dict,
                            ['mid', 'sn', 'idx', 'scene'])
        body_dict['comment_id'] = request.meta['comment_id']
        body_dict['is_need_reward'] = 1
        # 如果请求的是评论内容
        if "comment_id" in req_data['url_param_dict']:
            url_param_dict = copy(req_data['url_param_dict'])
            url_param_dict['comment_id'] = request.meta['comment_id']
            url_param_dict['idx'] = content_url_param_dict['idx']
            from tools.utils import dict_to_str
            url_param_str = dict_to_str(url_param_dict)
            request_data['url_str'] = req_data['url'] + url_param_str
        # 如果请求的是阅读量
        else:
            request_data[
                'url_str'] = req_data['url'] + req_data['url_param_str']
        request_data['header_dict'] = req_data['headers']
        request_data['body_dict'] = body_dict

        return request_data