def start_requests(self):
    """Yield the initial "load" request for the HTTP capture queued in redis.

    Steps performed:

    1. Read the id stored under the redis key ``__running_http_`` and
       delete that key.
    2. Load the matching document from ``mongo_instance.https`` and the task
       it references (``http['taskid']``) from ``mongo_instance.tasks``;
       keep both on ``self.http`` / ``self.task``.
    3. Parse the captured ``Cookie`` request header and copy the credentials
       into the shared ``FakeLoadParams.cookies`` and
       ``FakeLoadParams.params`` dicts.
    4. Set ``self.crawled_times`` to 1 and, if ``'running_in_http'`` is
       contained in the task's ``task_status``, yield one GET request to
       ``NORMAL_URLS.load`` with the parameters appended as a query string.

    Nothing is yielded when the redis key is absent, when either mongo
    lookup finds no document, or when the task status does not contain
    ``'running_in_http'``.

    :return: generator yielding at most one ``scrapy.Request``
    :raises KeyError: if the captured document or its Cookie header lacks
        one of the fields read below (e.g. ``wap_sid2``, ``wxuin``,
        ``version``)
    """
    httpid = redis_instance.get('__running_http_')
    redis_instance.delete('__running_http_')
    print('httpid %s' % httpid)
    if httpid is None:
        # ObjectId(None) would generate a brand-new id, so the lookup below
        # could never match anything; stop here instead.
        print('- no running http id in redis')
        return

    http = mongo_instance.https.find_one(filter={'_id': ObjectId(httpid)})
    if http is None:
        print('- http not found')
        return

    task_obj_id = http['taskid']
    print('taskid %s' % str(task_obj_id))
    task = mongo_instance.tasks.find_one(filter={'_id': task_obj_id})
    if task is None:
        print('- task not found')
        return
    print('- task finded')
    print(task)
    self.http = http
    self.task = task

    cookie_str = http['actionhome']['REQUEST_HEADERS']['Cookie'].replace(
        ' ', '')
    cookies = {}
    for item in cookie_str.split(';'):
        # Split on the first '=' only: cookie values may contain '='.
        name, sep, value = item.partition('=')
        if not sep:
            # Empty segment (e.g. a trailing ';') or a segment without '=':
            # there is no name/value pair to record.
            continue
        cookies[name] = value
    print('- cookies')
    print(cookies)

    # NOTE: these are class-level dicts on FakeLoadParams, so the values
    # written here are visible to every other user of FakeLoadParams.
    FakeLoadParams.cookies['pass_ticket'] = http['pass_ticket']
    FakeLoadParams.cookies['wap_sid2'] = cookies['wap_sid2']
    FakeLoadParams.cookies['wxuin'] = cookies['wxuin']
    FakeLoadParams.cookies['version'] = cookies['version']
    FakeLoadParams.params['__biz'] = http['biz']
    FakeLoadParams.params['pass_ticket'] = http['pass_ticket']
    FakeLoadParams.params['appmsg_token'] = http['appmsg_token']

    url = NORMAL_URLS.load
    # Values are concatenated verbatim (no URL-encoding), as before.
    query_string = '?' + '&'.join(
        key + '=' + val for key, val in FakeLoadParams.params.items())
    print(query_string)
    print('- FakeLoadParams cookies')
    print(FakeLoadParams.cookies)

    self.crawled_times = 1
    if 'running_in_http' in self.task['task_status']:
        yield scrapy.Request(url=url + query_string,
                             headers=FakeLoadParams.headers,
                             cookies=FakeLoadParams.cookies,
                             method='GET')
def add_nick_name(ordered_req_dict):
    """Annotate every request record with nick-name information.

    A lookup table is built from all redis keys matching ``*.nick_name``:
    the value stored under each key maps to the part of the key name before
    its first ``.``. Each entry of ``ordered_req_dict`` then receives:

    * ``'nick_name'`` - the lookup-table entry for the record's own key,
    * ``'wxuin'`` - the record's own key,
    * ``'nickname'`` - the result of ``TidyReqData.get_nickname()``.

    NOTE(review): judging by the names, the stored values are wxuin ids and
    the key prefixes are nick names - confirm against the code that writes
    the ``*.nick_name`` keys.

    :param ordered_req_dict: mapping of key -> request dict; modified in place
    :return: the same ``ordered_req_dict`` object
    :raises KeyError: if a key of ``ordered_req_dict`` has no entry in the
        lookup table
    """
    current_nickname = TidyReqData.get_nickname()

    name_by_wxuin = {}
    for redis_key in redis_instance.keys("*.nick_name"):
        key_prefix = redis_key.decode('utf8').split('.')[0]
        stored_value = redis_instance.get(redis_key).decode('utf8')
        name_by_wxuin[stored_value] = key_prefix

    for wxuin in ordered_req_dict:
        record = ordered_req_dict[wxuin]
        record['nick_name'] = name_by_wxuin[wxuin]
        record['wxuin'] = wxuin
        record['nickname'] = current_nickname

    return ordered_req_dict
def get_all_req_data():
    """Return every ``*.req`` record stored in redis, ordered by key name.

    Each value is parsed as JSON; when parsing fails the raw value is kept
    as ``str(raw_value)`` instead. Keys are decoded as UTF-8 and the result
    is sorted by the decoded key string. Keys such as
    ``'1532859863455.getappmsgext.req'`` begin with a timestamp, so string
    order matches time order as long as the timestamps have the same number
    of digits.

    :return: ``collections.OrderedDict`` mapping key name -> parsed record
        (or the ``str()`` fallback), e.g.
        ``{'1523423421446.appmsg_comment.req': {...},
        '1532859863455.getappmsgext.req': {...}}``
    """
    unordered_req_dict = {}
    # Walk over every stored request record.
    for key in redis_instance.keys("*.req"):
        req_bin_data = redis_instance.get(key)
        try:
            req_dict_data = json.loads(req_bin_data)
        except (TypeError, ValueError, RecursionError):
            # TypeError: value is None / not str-or-bytes; ValueError covers
            # JSONDecodeError and UnicodeDecodeError; RecursionError: input
            # nested too deeply. Anything else (KeyboardInterrupt, ...) must
            # propagate rather than be swallowed.
            # NOTE(review): for bytes input str() yields the "b'...'" form,
            # not decoded text - kept unchanged; confirm this is intended.
            req_dict_data = str(req_bin_data)
        unordered_req_dict[key.decode('utf8')] = req_dict_data

    # Re-insert in sorted key order so iteration follows the key names.
    return collections.OrderedDict(
        (key, unordered_req_dict[key]) for key in sorted(unordered_req_dict)
    )
def get_nickname():
    """Return the value stored under the redis key ``current_nickname``.

    The stored value is decoded as UTF-8 before being returned.

    :return: the decoded nickname string
    :raises AttributeError: if the key is absent and redis returns ``None``
    """
    raw_nickname = redis_instance.get('current_nickname')
    return raw_nickname.decode('utf8')
def get_xcx_item_list(self, nickname, hand=False):
    """Collect all request data of one mini program by driving the device.

    Flow, in order:

    1. Clear the ``*.req`` records through ``TidyReqData.flush_data``, open
       the search page and search for ``nickname``.
    2. Tap the ``JIU_QIAN_ZFJY`` button (the first column), then keep
       calling ``swap([60, 1000], [60, 250])`` until the redis key
       ``xcx_get_list_stop`` exists; afterwards perform the reverse gesture
       the same number of times to return to the top of the list.
    3. For each item from ``TidyReqData.get_xcx_req_data("*._xcx")``: items
       already known to ``xcx.doc_exist("jqzt", id)`` are skipped; the
       others are located by title, tapped, given three seconds, and left
       with the back key. The loop ends early once the redis key
       ``xcx_get_detail_stop`` holds a truthy value.
    4. Press back twice, store the raw data in mongo (``xcx_jqzt``), index
       ``jqzt`` and set the redis TTL to ``60 * 60 * 5``.

    :param nickname: name of the mini program to search for
    :param hand: manual-mode flag; not read anywhere in this method
    :return: None
    """
    print(nickname)
    TidyReqData.flush_data("*.req")
    self.home_to_search()
    self.search_xcx(nickname)

    # After entering the mini program from the first search result, pick
    # the first column.
    # NOTE(review): eval() executes whatever text the config holds; if the
    # value is always a plain coordinate literal, ast.literal_eval would be
    # the safer choice - confirm the config format before switching.
    self.oap.tap(tuple(eval(self.data['BTN']['JIU_QIAN_ZFJY'])))
    time.sleep(1)

    # Approach taken: pull the complete item list first, then visit each
    # item (the alternative was to capture and read items one at a time).
    swipe_count = 0
    while redis_instance.get("xcx_get_list_stop") is None:
        self.oap.swap([60, 1000], [60, 250])
        swipe_count += 1
        time.sleep(0.5)

    # Go back to the top of the list.
    if redis_instance.get("xcx_get_list_stop"):
        for _ in range(swipe_count):
            self.oap.swap([60, 250], [60, 1000])

    # Walk the captured item list.
    for entry in TidyReqData.get_xcx_req_data("*._xcx"):
        title = entry['title']
        print("当前文档", title)
        if not xcx.doc_exist("jqzt", entry['id']):
            # Not stored yet: locate the item on screen, open it, wait,
            # then leave it again.
            position = self.vc.click_by_words(title, tap=False)
            print(position, "", title)
            self.oap.tap(position)
            time.sleep(3)
            self.oap.key(self.data['KEY']['BACK_KEYEVENT'])
            # Limit reached: leave the loop.
            if redis_instance.get("xcx_get_detail_stop"):
                break
        # Same gesture whether the item was skipped or visited.
        self.oap.swap([60, 500], [60, 250])

    time.sleep(1)
    back_key = self.data['KEY']['BACK_KEYEVENT']
    for _ in range(2):
        self.oap.key(back_key)

    print("原始数据进入mongo %s" % ("xcx_jqzt"))
    TidyReqData.insert_xcx_to_mongo("xcx_jqzt")
    print("原始数据进入mongo %s 完成" % ("xcx_jqzt"))

    print("正在为 %s 创建索引..." % ("jqzt"))
    index_result = xcx.index_db_docs("jqzt")
    print("索引完成", index_result)

    print("redis 相关数据设置缓存时间")
    # Five hours, assuming the argument is in seconds as the log message
    # below says.
    TidyReqData.set_redis_ttl(60 * 60 * 5)
    print("redis 5小时失效时间设置完成")