def put_cookie_2_que(self, cookie):
    """Push a cookie onto the queue named by ``config.ssn_2_slv``.

    The cookie is decoded with ``loads_json`` and re-encoded with
    ``dumps_json`` before it is pushed, so only text that survives that
    round trip reaches the queue.

    :param cookie: JSON text describing the cookie.
    """
    target_queue = config.ssn_2_slv
    normalized = dumps_json(loads_json(cookie))
    redis_cli.lpush(target_queue, normalized)
def get_shixin_each_info(data_list, http, ch) -> _data:
    """Fetch the detail record of every "shixin" (dishonest debtor) entry.

    :param data_list: rows whose last element is the record id and whose
        second-to-last element is the case code.
    :param http: client exposing ``receive_and_request``.
    :param ch: captcha holder providing ``s_c_code``, ``s_c_id`` and
        ``get_new_captcha``.
    :return: list of the decoded detail responses that were non-empty.
    """
    request_headers = cnf.headers_s_info
    detail_url = cnf.url_s_info
    collected = []
    for entry in data_list:
        query = deepcopy(cnf.params_s_info)
        query.update({
            'id': entry[-1],
            'caseCode': entry[-2],
            'pCode': ch.s_c_code,
            'captchaId': ch.s_c_id,
        })
        raw = http.receive_and_request(
            url=detail_url, headers=request_headers, params=query,
            method='get')
        detail = loads_json(raw)
        if detail:
            collected.append(detail)
            continue
        # An empty response is taken to mean the captcha was rejected.
        # TODO: refresh the captcha and retry this entry.
        ch.get_new_captcha('shixin')
    return collected
def get_zhixing_each_info(data_list, http, ch) -> _data:
    """Fetch the detail record of every "zhixing" (judgment debtor) entry.

    :param data_list: rows whose last element is the record id.
    :param http: client exposing ``receive_and_request``.
    :param ch: captcha holder providing ``z_c_id``, ``z_c_code`` and
        ``get_new_captcha``.
    :return: list of the decoded detail responses that were non-empty.
    """
    data = []
    headers = cnf.headers_z_info
    url_info = cnf.url_z_info
    for each in data_list:
        params = deepcopy(cnf.params_z_info)
        # TODO: add the captcha handling part.
        params.update({
            'id': each[-1],
            'captchaId': ch.z_c_id,
            'j_captcha': ch.z_c_code,
            # Millisecond timestamp sent as the "_" query parameter.
            '_': int(time.time() * 1000),
        })
        info = loads_json(
            http.receive_and_request(url=url_info, headers=headers,
                                     params=params, method='get'))
        if not info:
            # An empty response is taken to mean the captcha was rejected:
            # request a new one and do not record the empty result, in line
            # with get_shixin_each_info.
            ch.get_new_captcha('zhixing')
        else:
            data.append(info)
    return data
def receive_new_captcha(self, choice):
    """Pop a new captcha message and return it decoded.

    :param choice: ``'zhixing'`` pops from ``'captcha_z'``; any other value
        pops from ``'captcha_s'``.
    :return: the result of ``loads_json`` on the popped message.
    """
    source = 'captcha_z' if choice == 'zhixing' else 'captcha_s'
    return loads_json(pop_msg(source))
def get_headers_with_cookie() -> dict:
    """Build request headers for ``weibo.cn`` that carry the saved cookies.

    Starts from a copy of ``firefox_request_header``, sets ``Host`` and
    adds a ``Cookie`` header made of ``name=value`` pairs joined by ``"; "``.

    :return: the new header dictionary.
    """
    headers = deepcopy(firefox_request_header)
    headers['Host'] = 'weibo.cn'
    cookie_pairs = []
    for cookie in loads_json(COOKIES_JSON):
        cookie_pairs.append("{}={}".format(cookie["name"], cookie["value"]))
    headers['Cookie'] = "; ".join(cookie_pairs)
    return headers
def parse_zhixing_shixin_list(data_list) -> _zs_list:
    """Parse a list response into rows plus the total page count.

    :param data_list: JSON text whose decoded form is a list; only its first
        element is read (keys ``result`` and ``totalPage``).
    :return: ``(rows, pages)`` where each row is
        ``[name, date, caseCode, id-as-str]``. ``([], 1)`` is returned when
        the payload cannot be decoded or is empty.
    """
    rows, pages = [], 1
    parsed = loads_json(data_list)
    if not parsed:
        # Undecodable or empty payload: nothing to parse.
        return rows, pages

    first = parsed[0]
    # A missing "result" key is treated as an empty result list.
    for item in first.get('result') or []:
        name = item.get('pname') or item.get('iname')
        date = ''
        raw_extra = item.get('jsonObject')
        if raw_extra:
            extra = loads_json(raw_extra)
            # Keep the empty default date when the nested JSON cannot be
            # decoded.
            if extra:
                date = extra.get('caseCreateTime') or extra.get('regDate')
        rows.append([name, date, item.get('caseCode'), str(item.get('id'))])
    pages = first.get('totalPage')
    return rows, pages
def parse_json(html):
    """Parse the car-series payload.

    :param html: JSON text whose decoded form has a ``data`` list.
    :return: list of ``[id, name, guochanhezijinkou]`` rows; empty when the
        payload cannot be decoded.
    """
    payload = loads_json(html)
    if payload is None:
        return []
    return [
        [item.get('id'), item.get('name'), item.get('guochanhezijinkou')]
        for item in payload.get('data')
    ]
def take_out_a_seed(self):
    """Take the next seed out of ``self.seeds_store`` and decode it.

    Seeds are handed out one at a time because the seed manager needs
    feedback and releases a fixed number of seeds per round.

    :return: the decoded seed, or ``None`` when the store is exhausted or
        the seed could not be obtained or decoded.
    """
    try:
        return loads_json(next(self.seeds_store))
    except StopIteration:
        # All seeds have been dispatched.
        logger.info('种子派发完毕')
    except Exception as e:
        # Still best-effort, but no longer reported as "dispatch finished",
        # and KeyboardInterrupt/SystemExit are no longer swallowed.
        logger.warning('取种子失败\t{0}'.format(e))
    return None
def login_weibo():
    """Log in to Weibo, reusing the saved cookies when possible.

    Password login is used when ``COOKIES_JSON`` does not exist or when
    ``is_expiry_sub()`` reports that the cookies are about to expire;
    otherwise the saved cookies are loaded into the browser.

    :return: None
    """
    if is_path_exists(COOKIES_JSON):
        print("cookies json 已存在")
        need_password_login = is_expiry_sub()
        if need_password_login:
            print("Cookies 即将过期 重新获取")
    else:
        # Without a cookie file there is nothing to reuse.
        need_password_login = True

    if need_password_login:
        _login_weibo_with_password()
    else:
        _login_weibo_with_cookies()


def _login_weibo_with_password():
    """Sign in through the login form and save the cookies to COOKIES_JSON."""
    print("帐号密码登录")
    driver_initial()
    driver = get_browser()
    try:
        print('准备登陆Weibo.cn网站...')
        driver.get(WEIBO_LOGIN_URL)
        # Wait for visibility: the original author noted that waiting with
        # presence_of_element_located did not work here.
        WebDriverWait(driver, 10).until(
            ec.visibility_of_element_located((By.ID, "loginAction")))
        # find_element(By.ID, ...) replaces find_element_by_id, which was
        # removed in Selenium 4.
        driver.find_element(By.ID, "loginName").send_keys(Mine().username)
        driver.find_element(By.ID, "loginPassword").send_keys(
            Mine().password)
        # Submitting may trigger a captcha after repeated logins or logins
        # from an unusual location.
        driver.find_element(By.ID, "loginAction").click()
        WebDriverWait(driver, 20).until(ec.url_contains('m.weibo.cn'))
        # get_cookies() returns a list of cookie dictionaries.
        sina_cookies = driver.get_cookies()
        dump_dict_to_json(sina_cookies, COOKIES_JSON)
        print('<登陆成功>')
        driver.close()
    except Exception as e:
        print("Error: <登录失败> {}".format(e))


def _login_weibo_with_cookies():
    """Load the cookies saved in COOKIES_JSON into a fresh browser session."""
    print("Cookies 登录")
    weibo_cookies = loads_json(COOKIES_JSON)
    driver_initial()
    driver = get_browser()
    driver.delete_all_cookies()
    # The site is opened once before the cookies are added.
    driver.get("https://weibo.cn/")
    for wc in weibo_cookies:
        # Drop the stored domain; tolerate cookies saved without one.
        wc.pop('domain', None)
        driver.add_cookie(wc)
    # Reload so that the request is made with the cookies attached.
    driver.get("https://weibo.cn/")
def parase_html(self, html):
    """Collect the result records from a decoded response and persist them.

    Every record found under ``data[*].result`` is appended to the returned
    list and also written to ``self.baidu_list`` via ``write_2_file``.

    :param html: JSON text of the response.
    :return: ``(records, total_num)``; ``total_num`` is the ``dispNum`` of
        the last group processed and defaults to 1.
    """
    info = []
    total_num = 1
    payload = loads_json(html)
    if payload is None:
        return info, total_num

    groups = payload.get('data')
    if not isinstance(groups, list):
        return info, total_num

    for group in groups:
        for record in group.get('result'):
            info.append(record)
            # Persist each record as it is collected.
            write_2_file(self.baidu_list, dumps_json(record))
        # dispNum is what the caller uses to decide about further pages.
        total_num = group.get('dispNum', 1)
    return info, total_num
def __delete_cookie(self, user_id):
    """Remove one user's cookie from the user-info file.

    The cookie list is loaded, every entry except the one whose ``userid``
    equals ``user_id`` is kept, and the file is re-initialised and rewritten
    with the kept entries.

    :param user_id: the user id whose cookie should be deleted.
    """
    decoded = (loads_json(raw) for raw in self.load_cookies_list())
    kept = [entry for entry in decoded if entry.get('userid') != user_id]

    # Rewrite the file from scratch with the surviving entries.
    initial_file(config.user_info_file)
    for entry in kept:
        write_2_file(config.user_info_file, dumps_json(entry))
def listn_the_psm_que():
    """Listen on ``psm_que`` forever and persist every message received.

    Before the loop starts, a ``{'psm': 'done'}`` message is pushed onto
    ``config.task_que_fb`` to report that this module has started. This
    function never returns.
    """
    print('持久化队列启动')
    redis_cli.lpush(config.task_que_fb, dumps_json({'psm': 'done'}))
    while True:
        if redis_cli.exists(psm_que):
            token_set = make_set(token, blank='', index='')
            msg = redis_cli.rpop(psm_que)
            # Another consumer may have emptied the queue between exists()
            # and rpop(); skip this round instead of decoding nothing.
            if msg:
                seed = loads_json(translate_2_json_dict(msg))
                print('{0}\t收到数据'.format(
                    datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')))
                # Hand the decoded message to the persistence step.
                do_persistence(seed, token_set)
        time.sleep(0.1)
def listen_task_que():
    """Claim a slave number, then listen on the task queue forever.

    Messages are either commands or seeds:

    * seed:    ``{"url": "xxxx", ...}``
    * command: ``{"command": "xxxx"}`` where ``ssnm`` starts session
      management, ``sedm`` starts seed management, and anything else turns
      this process into the persistence module.

    Each node takes a numeric mark from ``config.mark_que`` first (added on
    09-07 so that js files of different nodes do not get mixed up). This
    function never returns.
    """
    task_que = config.task_que
    mark = _claim_slave_mark(config.mark_que)
    print('当前slave编号\t{0}'.format(mark))

    # Only after the mark is claimed does task listening start.
    while True:
        if redis_cli.exists(task_que):
            msg = redis_cli.rpop(task_que)
            # rpop may return nothing if another node took the message.
            if msg:
                _handle_task_message(msg, mark)
        time.sleep(0.1)


def _claim_slave_mark(mark_que):
    """Pop the last mark from ``mark_que``, add one, push it back, return it."""
    while True:
        if redis_cli.exists(mark_que):
            msg = redis_cli.rpop(mark_que)
            if msg:
                mark = int(msg.decode()) + 1
                # Put the new number back for the next node to pick up.
                redis_cli.lpush(mark_que, mark)
                return mark
        time.sleep(random.random())


def _handle_task_message(msg, mark):
    """Decode one task-queue message and run the command or crawl the seed."""
    msg_dict = loads_json(msg.decode())
    print('{0}\t收到数据'.format(
        datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')))
    if msg_dict is None:
        # An undecodable message must not take the worker down.
        print('无法解析的任务消息,已忽略')
        return

    commend = msg_dict.get('command')
    if commend:
        if commend == 'ssnm':
            SessionMangement().session_main_logic()
        elif commend == 'sedm':
            SeedsMangement().seed_main_logic()
        else:
            # Become the persistence module.
            listn_the_psm_que()
        return

    # Otherwise the message is a seed. Crawling it involves requesting a
    # cookie, fetching the html, pushing the data to the psm queue and
    # reporting back to the session/seed modules.
    time.sleep(random.random() * 10)
    # The spider is created with this node's mark.
    sp = SpiderHandler(mark)
    sp.receive_seed_and_start_crawl(msg_dict)
def download_img_and_ocr(type):
    """Download the captcha image for every stored captchaId and OCR it.

    The captcha ids are read from the "zhixing" or "shixin" captcha file,
    each image is fetched and posted to the OCR service at
    ``config.url_svm``, and all recognised ``[captchaId, captcha]`` pairs
    are pushed to the queue configured for ``type``.

    :param type: ``'zhixing'`` selects the zhixing file, host and url; any
        other value selects the shixin ones.
    :return: ``True`` if at least one captcha was recognised, else ``False``.
    """
    is_go_on = False
    captcha_list = []
    # Work on a copy so that the shared config headers are not modified.
    headers = dict(config.headers_CaptchaHandler)
    if type == 'zhixing':
        captcha_file = config.file_captcha_zhixing
        captcha_url = config.url_captcha_zhixing
        headers['Host'] = config.host_zs.get('zhixing')
    else:
        captcha_file = config.file_captcha_shixin
        captcha_url = config.url_captcha_shixin
        headers['Host'] = config.host_zs.get('shixin')

    # Read every id up front so the file handle is closed before any
    # network request is made.
    with open(captcha_file, 'r', encoding='utf8') as id_file:
        captcha_ids = [line.strip() for line in id_file]

    for captcha_id in captcha_ids:
        # Fresh copy per request; the shared config params stay untouched.
        params = dict(config.params_captcha)
        params.update({'captchaId': captcha_id, 'random': random.random()})
        logger.info('下载验证码图片\t{0}\t{1}'.format(type, captcha_id))
        di = Download_img()
        img = di.receive_and_request(url=captcha_url, headers=headers,
                                     params=params, method='GET')
        if img == 'null_html':
            logger.warning('下载验证码图片失败\t{0}\t{1}'.format(type, captcha_id))
            continue

        # Send the image, base64-encoded, to the OCR service.
        payloads = {'pic': base64.b64encode(img), 'type': 'pste'}
        result = di.receive_and_request(url=config.url_svm,
                                        payloads=payloads, method='POST')
        try:
            result_dict = loads_json(result)
            if result_dict.get('status_code') == 200:
                captcha_list.append(
                    [captcha_id, result_dict.get('data').get('captcha')])
                logger.info('完成图片ocr\t{0}\t{1}'.format(type, captcha_id))
                is_go_on = True
        except Exception as e:
            # One failed recognition must not abort the remaining ids.
            logger.warning('ocr识别失败\t{0}'.format(e))

    # Only push when something was recognised.
    if captcha_list:
        que = config.que.get(type)
        logger.debug(captcha_list)
        push_2_que(que, dumps_json(captcha_list))
        logger.debug('ocr结果推入队列')
    return is_go_on
def get_sub_expriy():
    """Return the ``expiry`` value of the ``SUB`` cookie in COOKIES_JSON.

    :raises IndexError: if no cookie named ``SUB`` is present.
    """
    sub_cookies = [
        cookie for cookie in loads_json(COOKIES_JSON)
        if cookie['name'] == 'SUB'
    ]
    return sub_cookies[0]['expiry']
def verify_json_text(self, json_text):
    """Check returned data by decoding it.

    :param json_text: the JSON text to verify.
    :return: whatever ``loads_json`` produces for ``json_text``.
    """
    return loads_json(json_text)