def receive_seed_and_start_crawl(self, seed):
    """Take a seed, extract its URL, crawl it with a cookie, dispatch the result.

    Asks for a cookie first, performs the request, then either pushes the
    enriched seed to the persistence queue (data found) or feeds the seed
    back with the cookie flagged as invalid (nothing found).
    """
    print('\n开始抓取')
    target_url = seed.get('url')

    # One feedback call doubles as the request for a cookie.
    print('请求种子和cookie')
    self.feed_back_seed(seed, cookie=False)
    cookie = wait_for_msg_long(ssn_que_p)

    crawled = self.user_define_request(target_url, cookie)

    # First check whether the crawl produced anything usable.
    if not crawled or crawled == ['null']:
        # Cookie looks dead: mark it and feed everything back.
        seed.update({'cookie_status': 1, 'cookie': cookie})
        self.feed_back_seed(seed, cookie=True)
        print('完成反馈')
    else:
        # Acknowledge first, then attach the payload and persist.
        self.feed_back_seed(seed, cookie=True)
        seed.update({'data': crawled})
        redis_cli.lpush(psm_que, dumps_json(seed))
        print('有数据,放入持久化队列')

    # This seed's life-cycle is complete.
    del seed
def lets_do_spider(self):
    """Run the three lookups ('zhixing', 'shixin', 'baidu') in parallel.

    Targets API version 2.0; parameter validation is intentionally
    skipped here.  Returns the filled-in API template as a JSON string.
    """
    api_info = deepcopy(self.api_demo)
    pname = self.get_argument('pname')
    cardNum = self.get_argument('cardNum')

    # Fan the three duties out over a small process pool.
    duties = ('zhixing', 'shixin', 'baidu')
    pool = Pool(3)
    pending = [pool.apply_async(run, (duty, pname, cardNum)) for duty in duties]
    pool.close()
    pool.join()

    # Collect each worker's result under its duty name.
    for duty, task in zip(duties, pending):
        api_info['data'][duty] = task.get()

    return dumps_json(api_info)
def lets_fuck_recycle(self, data):
    """Re-crawl previously failed entries ("reheating leftovers").

    For each entry in *data*, rebuild request params from its last field
    and re-issue the request.  Entries that yield valid JSON are appended
    to the result list and persisted; entries that fail are fed back and
    collected for another round.

    :param data: iterable of records whose last element seeds the params.
    :return: tuple (info, recycle) — parsed results and entries to retry.
    """
    info = []
    recycle = []
    info_file = config.info_file.get(self.name)
    url_info = config.url_info_dict.get(self.name)
    # Copy the shared header template: the original code mutated
    # config.headers_info in place, leaking this spider's headers into
    # every other user of the config module.
    headers_info = dict(config.headers_info)
    headers_info.update(config.headers_info_dict.get(self.name))
    api = RequestAPI()
    for each in data:
        params_info = self.construct_params_info(each[-1])
        json_text = api.receive_and_request(url=url_info,
                                            headers=headers_info,
                                            params=params_info,
                                            method='GET')
        # Verify the response actually carries data.
        js_dict = self.verify_json_text(json_text)
        if js_dict is not None:
            info.append(js_dict)
            # Persist each record as it arrives.
            write_2_file(info_file, dumps_json(js_dict))
        else:
            # Request failed: report it, drop the used captcha, retry later.
            self.feedback()
            self.pop_captcha_info()
            recycle.append(each)
    return info, recycle
def push_seed_2_queue(self, seed):
    """Serialize *seed* and push it onto the task queue.

    :param seed: seed dict to enqueue.
    :return: None
    """
    redis_cli.lpush(config.task_que, dumps_json(seed))
def put_cookie_2_que(self, cookie):
    """Normalize a cookie through a JSON round-trip and enqueue it.

    The load/dump cycle guarantees the queued payload is canonical JSON.
    """
    normalized = dumps_json(loads_json(cookie))
    redis_cli.lpush(config.ssn_2_slv, normalized)
def sendall(self, msg, ip, port=9000, close=True):
    """Send *msg* as framed JSON over a fresh TCP connection.

    The message is serialized, terminated with b'\\r\\n\\r\\n' and sent to
    (ip, port).  Returns the socket (closed when *close* is True) on
    success, or None when the connection or send fails.

    :param msg: JSON-serializable payload.
    :param ip: target host.
    :param port: target port (default 9000).
    :param close: close the connection after sending (default True).
    """
    sock = None
    try:
        sock = socket.socket()
        sock.connect((ip, port))
        sock.sendall(utils.dumps_json(msg).encode() + b'\r\n\r\n')
        if close:
            utils.close_connection(sock)
        return sock
    except (TimeoutError, OSError):
        # Original version leaked the descriptor here; release it before
        # reporting failure via the (pre-existing) implicit None return.
        if sock is not None:
            sock.close()
        return None
def feed_back_seed(self, ctx, session, seed):
    """Feed *ctx* back to the session and/or seed managers.

    session=True pushes to the session-manager queue;
    seed=True pushes to the seed-manager queue.
    When neither flag is set, the seed manager still receives the context.
    """
    payload = dumps_json(ctx)
    if session and seed:
        redis_cli.lpush(slv_2_sed, payload)
        redis_cli.lpush(slv_2_ssn, payload)
    elif session:
        redis_cli.lpush(slv_2_ssn, payload)
    else:
        redis_cli.lpush(slv_2_sed, payload)
    return


# if __name__ == '__main__':
#     seed = {"brand_id": "1199", "brand": "奥迪", "serise_id": "2614", "serise": "奥迪A5", "p_type": "合资", "url": "https://www.guazi.com/xinyang/dealrecord?tag_id=22288&date=2017100", "check_city": "xinyang", "date": "2018-1", "cookie": {}, "data": [], "cookie_status": 0, "epoh": 0}
#     cookie = {"clueSourceCode": "%2A%2300", "preTime": "%7B%22last%22%3A1537928136%2C%22this%22%3A1537928136%2C%22pre%22%3A1537928136%7D", "GZ_TOKEN": "ef52toYlCiG36xYV8f3011%2BZVJgkcTK8eTkkn31WYGulmX9gKIByhmHZp1d6sg%2BtwJ3L0CbW2avGHetiKQLSM5EvM90l2XbOHFMZs97irvp8flsdbMTJlK1okNg8BAtx6RkhoQ%2BhbwYPAaLDLw", "guaZiUserInfo": "0MSnBkg0hdYQNXvlLOYi2", "userid": "620499844"}
#     sh = SpiderHandler('1')
#     # sh.receive_seed_and_start_crawl(seed)
#     sh.demo(seed, cookie)
def receive_seed_and_start_crawl(self, seed):
    """Consume a seed: pull its URL, crawl with a cookie, dispatch the outcome.

    Three outcomes:
      * real data    -> acknowledge to the seed manager, push to persistence;
      * ['redirect'] -> cookie is dead, feed back to both managers;
      * empty/null   -> cookie still valid, feed back to seed manager only.
    """
    print('\nspider获取任务')
    target = seed.get('url')

    # Request a cookie from the session manager and wait for it.
    print('请求种子和cookie')
    self.feed_back_seed(seed, session=True, seed=False)
    cookie = wait_for_msg_long(ssn_2_slv)

    # Instantiate the request/parse module for this crawl.
    fetcher = RequestModel(self.mark)
    data = fetcher.user_define_request(target, cookie)

    if data not in (['redirect'], ['null'], []):
        # Usable data: only the seed manager needs the acknowledgement.
        self.feed_back_seed(seed, session=False, seed=True)
        seed.update({'data': data})
        print('有数据,放入持久化队列\n')
        redis_cli.lpush(slv_2_psm, dumps_json(seed))
    elif data == ['redirect']:
        # Cookie expired: report to both managers using a copy of the seed.
        dead = deepcopy(seed)
        dead.update({'cookie_status': 1, 'cookie': cookie})
        self.feed_back_seed(dead, session=True, seed=True)
        print('cookie失效,完成反馈\n')
        del dead
    else:
        # No data, but the cookie is still good: return the seed only.
        self.feed_back_seed(seed, session=False, seed=True)
        print('没有数据, 完成反馈\n')

    # This seed's life-cycle ends here.
    del seed
    del fetcher
def discover(self, localhost_only=False):
    """Broadcast a DISCOVER message so other peers can contact this node.

    :param localhost_only: when True, probe ports 8000-8010 on loopback
        (local testing) instead of broadcasting on the LAN at port 8081.
    """
    broadcast_msg = {
        'operation': 'DISCOVER',
        'join': True,
        'sender': list(self.node),
        'key': utils.generate_random_id()
    }
    payload = utils.dumps_json(broadcast_msg).encode()
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        if localhost_only:
            for p in range(8000, 8011):
                # Skip our own port — no point messaging ourselves.
                if p != self.node.port:
                    sock.sendto(payload, ('127.0.0.1', p))
        else:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
            sock.sendto(payload, ('255.255.255.255', 8081))
    finally:
        # The original version leaked the socket; always release it.
        sock.close()
def parase_html(self, html):
    """Parse a Baidu result page (JSON text) into records plus a total count.

    :param html: raw JSON text of the response.
    :return: tuple (info, total_num) — extracted records and 'dispNum'
        (used to decide whether there is a next page; defaults to 1).
    """
    records = []
    total_num = 1
    parsed = loads_json(html)
    if parsed is None:
        return records, total_num
    data = parsed.get('data')
    if isinstance(data, list) and data:
        for page in data:
            for item in page.get('result'):
                records.append(item)
                # Persist each record as it is extracted.
                write_2_file(self.baidu_list, dumps_json(item))
            # 'dispNum' tells us whether another page exists.
            total_num = page.get('dispNum', 1)
    return records, total_num
def insert_seed_save(url, brand, serise, city, date, brand_id, serise_id, p_type):
    """Build a seed from the template and append it to the seed file."""
    fields = {
        'url': url,
        'brand_id': brand_id,
        'brand': brand,
        'serise_id': serise_id,
        'serise': serise,
        'check_city': city,
        'date': date,
        'p_type': p_type,
    }
    # Start from a fresh copy of the template so the demo stays pristine.
    seed = deepcopy(seed_demo)
    seed.update(fields)
    write_2_file(seed_file, dumps_json(seed))
    del seed
    return
def start_node_in_order(cmd):
    """Enqueue the start command for the node named by *cmd*.

    Pushes the command onto the task queue, then blocks until the node
    reports back that it has started.

    :param cmd: one of 'ssnm', 'sedm', 'psm'.
    """
    order = {
        'ssnm': {'cmd': {'command': 'ssnm'}, 'name': 'SessionMangement'},
        'sedm': {'cmd': {'command': 'sedm'}, 'name': 'SeedMangement'},
        'psm': {'cmd': {'command': 'psm'}, 'name': 'PersistenceMangement'}
    }
    entry = order.get(cmd)
    print('启动\t{0}'.format(entry.get('name')))
    redis_cli.lpush(config.task_que, dumps_json(entry.get('cmd')))
    # Block until the node acknowledges start-up.
    wait_feed_back()
    print('完成启动')
    return
def lets_get_cookie(cookies):
    """Extract login cookies and persist them when login succeeded.

    Success is signalled by finding at least one of the expected cookie
    names in *cookies* (a list of {'name': ..., 'value': ...} dicts).
    """
    wanted = ('userid', 'guaZiUserInfo', 'GZ_TOKEN')
    user_info = {}
    for entry in cookies:
        name = entry.get('name')
        if name in wanted:
            user_info[name] = entry.get('value')
    if user_info:
        # At least one expected cookie was present: save the account.
        write_2_file('DB/user_info.txt', dumps_json(user_info))
        print('该用户信息已经保存')
    else:
        print('登录失败,未能保存用户cookie信息')
def __delete_cookie(self, user_id):
    """Remove one user's cookie by rewriting the cookie file.

    Loads every stored cookie, drops the entry whose 'userid' matches
    *user_id*, then truncates and rewrites the file with the survivors.

    :param user_id: the user id whose cookie must be removed.
    """
    survivors = [
        parsed
        for parsed in map(loads_json, self.load_cookies_list())
        if parsed.get('userid') != user_id
    ]
    # Truncate the file, then write back everything we kept.
    initial_file(config.user_info_file)
    for item in survivors:
        write_2_file(config.user_info_file, dumps_json(item))
    return
def listn_the_psm_que():
    """Poll the persistence queue forever and persist whatever arrives.

    Announces start-up on the feedback queue, then loops: whenever
    psm_que holds a message, pop it, decode the seed and persist it.
    """
    print('持久化队列启动')
    # Tell the coordinator this worker is up.
    redis_cli.lpush(config.task_que_fb, dumps_json({'psm': 'done'}))
    while True:
        if redis_cli.exists(psm_que):
            # Rebuild the token set for each batch before persisting.
            token_set = make_set(token, blank='', index='')
            raw = redis_cli.rpop(psm_que)
            seed = loads_json(translate_2_json_dict(raw))
            print('{0}\t收到数据'.format(
                datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')))
            # Hand the decoded seed off to the persistence layer.
            do_persistence(seed, token_set)
        time.sleep(0.1)
def seed_main_logic(self):
    """Seed manager main loop.

    1. produce seeds; 2. hand seeds out; 3. react to slave feedback —
    re-queue a seed whose cookie failed, or release a fresh seed.
    """
    print('seed管理已启动')
    # Report successful start-up on the feedback queue.
    redis_cli.lpush(config.task_que_fb, dumps_json({'sedm': 'done'}))
    # Refresh the brand/series catalogue before producing seeds.
    update_brands_serise()
    print('生产种子')
    self.seeds_maker()
    # Prime the queue with the first batch of seeds.
    self.decide_push_seed_2_queue(0)
    # Listen on the slave->seed feedback queue and dispatch seeds.
    feedback_que = config.slv_2_sed
    while True:
        msg = wait_for_msg_long(feedback_que)
        print('{0}\t接收反馈'.format(
            datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')))
        if msg:
            # cookie_status decides whether the seed must be retried.
            if self.deal_feed_back(msg):
                # Failed seed: reset its cookie flag and re-queue it.
                msg.update({'cookie_status': 0})
                self.push_seed_2_queue(msg)
                continue
            # Seed handled fine: release one fresh seed instead.
            self.decide_push_seed_2_queue(1)
            print('完成推送')
        time.sleep(0.1)
def session_main_logic(self):
    """Session manager main loop (invoked from the slave side).

    After instantiation it performs the login; cookie deletion and list
    import are driven via message passing.  TODO: add a shutdown module.
    09-05 bug fix — wait mechanism: the first incoming request triggers
    counting and queueing of cookies.
    """
    print('session管理已启动')
    # Instantiate the cookie module and perform the login.
    self.logic_add_cookie()
    # # and throw every cookie onto the message queue up front
    # self.decide_psuh_cookie_2_que(0)
    # Report successful start-up on the feedback queue.
    que = config.task_que_fb
    ctx = dumps_json({'ssnm': 'done'})
    redis_cli.lpush(que, ctx)
    # Start listening on the slave->session feedback queue.
    print('开始监听ssn_req队列')
    slv_2_ssn = config.slv_2_ssn
    while True:
        msg_list = []
        msg = wait_for_msg_long(slv_2_ssn)
        print(datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
              '\t接受反馈')
        msg_list.append(msg)
        # Handle the feedback first, then dispatch one cookie if the
        # message did not already deal with it.
        is_deal = self.deal_feed_back(msg_list)
        if not is_deal:
            self.decide_psuh_cookie_2_que(1)
            print('完成cookie派发\n')
"""
def attend(client):
    # Serve one connected TCP client: read framed messages in a loop and
    # dispatch on the 'method' field until the peer stops sending.
    # NOTE(review): uses `self` from the enclosing scope — this is a
    # closure defined inside a method of the node class.
    while True:
        msg = self.recvall(client)
        if not msg:
            break
        # print(f'RECEIVED MSG {msg}')
        data = {'method': None}
        try:
            data = json.loads(msg)
        except:
            # NOTE(review): bare except silently falls back to the
            # {'method': None} sentinel on any decode failure.
            pass
        if data['method'] == 'PUBLISH':
            # Publish a value with this node as both publisher and sender.
            node = self.node.asTuple()
            self.publish(data, node, node)
        elif data['method'] == 'LOOKUP':
            # Look the value up and stream it back; file values are sent
            # raw, other values as framed JSON.
            answer = self.lookup_value(data['id'])
            founded, result, file_bytes = answer[0], answer[1], answer[2]
            if founded and result['value_type'] == 'file':
                if not file_bytes:
                    # Bytes not cached locally: pull them now.
                    file_bytes = self.recv_file()
                # try: client.sendall(file_bytes)
                # except: pass
                client.sendall(file_bytes)
            else:
                if not founded:
                    result = None
                client.sendall(utils.dumps_json(result).encode() + b'\r\n\r\n')
            # LOOKUP closes the connection after answering.
            client.close()
        elif data['method'] == 'PING':
            client.send(b'PING')
        elif data['method'] == 'STORE':
            # Store a key/value; file payloads arrive out-of-band.
            key, value = data['store_key'], data['store_value']
            publisher, sender = tuple(data['publisher']), tuple(data['sender'])
            real_value = None
            if data['value_type'] == 'file':
                real_value = self.recv_file()
            self.node.STORE(key, value, publisher, sender, data['value_type'], real_value, data['to_update'])
        elif data['method'] == 'FIND_VALUE':
            # Answer with a RESPONSE frame, then push file bytes if the
            # value is a file; finally refresh the sender's bucket entry.
            founded, result, file_bytes = self.node.FIND_VALUE(data['id'])
            answer = {'operation': 'RESPONSE',
                      'result': (founded, result, file_bytes),
                      'key': data['key'],
                      'sender': [self.node.ID, self.node.ip, self.node.port]
                      }
            # client.sendall(utils.dumps_json(answer).encode() + b'\r\n\r\n')
            answer = utils.dumps_json(answer)
            client.sendall(answer.encode() + b'\r\n\r\n')
            if founded and result['value_type'] == 'file':
                # files_bytes = utils.load_file(result['value'])
                self._send_file(file_bytes, data['sender'][1])
            if not Node.Equals(data['sender'], self.node):
                self.update(data['sender'])
        elif data['method'] == 'UPDATE':
            self._update(data['store_key'], data['store_value'], data['publisher'], data['sender'])
        elif data['method'] == 'FIND_NODE':
            # Return the closest known nodes and refresh the sender.
            result = self.node.FIND_NODE(data['id'])
            answer = {'operation': 'RESPONSE',
                      'result': result,
                      'key': data['key'],
                      'sender': [self.node.ID, self.node.ip, self.node.port]
                      }
            # client.sendall(utils.dumps_json(answer).encode() + b'\r\n\r\n')
            answer = utils.dumps_json(answer)
            client.sendall(answer.encode() + b'\r\n\r\n')
            if not Node.Equals(data['sender'], self.node):
                self.update(data['sender'])
    # Peer disconnected: terminate this handler thread.
    exit_thread()
def download_img_and_ocr(type):
    """Download every pending captcha image for *type* and OCR it.

    For each captchaId listed in the per-type captcha file, download the
    image, run it through the SVM OCR service, and collect successful
    [captchaId, text] pairs; push them onto the per-type queue at the end.
    (2018-10-29: the captcha file is cleared after each run — elsewhere.)

    :param type: 'zhixing' or 'shixin'; selects file, Host header and URL.
        NOTE: the name shadows the builtin but is kept for caller
        compatibility.
    :return: True if at least one captcha was recognized, else False.
    """
    is_go_on = False
    # Copy the shared header template: the original mutated
    # config.headers_CaptchaHandler in place, leaking this call's Host
    # header into every other user of the config module.
    headers = dict(config.headers_CaptchaHandler)
    captcha_list = []
    if type == 'zhixing':
        captcha = config.file_captcha_zhixing
        headers.update({'Host': config.host_zs.get('zhixing')})
    else:
        captcha = config.file_captcha_shixin
        headers.update({'Host': config.host_zs.get('shixin')})
    # 'with' closes the file handle the original bare open() leaked.
    with open(captcha, 'r', encoding='utf8') as captcha_file:
        for i in captcha_file:
            url = config.url_captcha_zhixing if type == 'zhixing' else config.url_captcha_shixin
            # Copy so captchaId/random don't mutate the shared
            # config.params_captcha dict (same leak as the headers).
            params = dict(config.params_captcha)
            params.update({'captchaId': i.strip(), 'random': random.random()})
            # Fetch the captcha image.
            logger.info('下载验证码图片\t{0}\t{1}'.format(type, i.strip()))
            di = Download_img()
            img = di.receive_and_request(url=url, headers=headers,
                                         params=params, method='GET')
            if img != 'null_html':
                """
                # 保存图片
                file_path = config.img_file.format(captcha)
                save_img(file_path=file_path, img=img)
                """
                # Run the OCR service on the downloaded image.
                url = config.url_svm
                img_d = base64.b64encode(img)
                payloads = {'pic': img_d, 'type': 'pste'}
                result = di.receive_and_request(url=url, payloads=payloads,
                                                method='POST')
                try:
                    result_dict = loads_json(result)
                    if result_dict.get('status_code') == 200:
                        captcha_list.append(
                            [i.strip(), result_dict.get('data').get('captcha')])
                        logger.info('完成图片ocr\t{0}\t{1}'.format(type, i.strip()))
                        is_go_on = True
                except Exception as e:
                    logger.warning('ocr识别失败\t{0}'.format(e))
            else:
                logger.warning('下载验证码图片失败\t{0}\t{1}'.format(type, i.strip()))
    # Push results only when something was actually recognized.
    if captcha_list != []:
        que = config.que.get(type)
        logger.debug(captcha_list)
        push_2_que(que, dumps_json(captcha_list))
        logger.debug('ocr结果推入队列')
    return is_go_on
def feed_back_seed(self, seed, cookie):
    """Feed *seed* back to the session manager; when *cookie* is truthy,
    also notify the seed manager."""
    payload = dumps_json(seed)
    redis_cli.lpush(ssn_que, payload)
    if cookie:
        redis_cli.lpush(sed_que, payload)
def proccess_message(self, data, addr, running_in_thread=False):
    # Dispatch one decoded UDP message on its 'operation' field:
    # DISCOVER / CONTACT / EXECUTE / RPC / RESPONSE.
    # :param data: decoded message dict.
    # :param addr: (ip, port) the datagram came from.
    # :param running_in_thread: terminate the thread when done.
    # data = json.loads(msg)
    if data['operation'] != 'DISCOVER':
        print("Data received: " + str(data))
    if data['operation'] == 'DISCOVER':
        if data['join']:
            # A new peer joins: answer with CONTACT and add it to our table.
            # addr = str(data['sender'][1]), int(data['sender'][2])
            if addr != (self.node.ip, self.node.port):
                answer = {
                    'operation': 'CONTACT',
                    'sender': list(self.node),
                    'key': data['key']
                }
                self.send_udp_msg(json.dumps(answer).encode(), addr)
                self.update(tuple(data['sender']))
                print(f"{data['sender']} joined")
        else:
            # Non-join discover: hand our TCP server address to the peer.
            if addr != (self.node.ip, self.node.port):
                ip, port = str(data['ip']), int(data['port'])
                server_addr = (self.node.ip, self.tcp_server_port)
                try:
                    self.sendall(server_addr, ip, port)
                except:
                    # NOTE(review): bare except — delivery is best-effort.
                    pass
    elif data['operation'] == 'CONTACT':
        # A peer answered our DISCOVER: record it and refresh our view.
        contact = tuple(data['sender'])
        self.update(contact)
        self.lookup_node(self.node.ID)
    # A peer has to perform a method specified by other peer by RPC
    elif data['operation'] == 'EXECUTE':
        result = None
        if data['method'] == 'FIND_NODE':
            result = self.node.FIND_NODE(data['id'])
        elif data['method'] == 'FIND_VALUE':
            # FIND_VALUE answers immediately and returns early, skipping
            # the generic RESPONSE block below.
            result = self.node.FIND_VALUE(data['id'])
            answer = {'operation': 'RESPONSE',
                      'result': result,
                      'key': data['key'],
                      'sender': [self.node.ID, self.node.ip, self.node.port]
                      }
            self.sendall(answer, addr[1])
            if 'sender' in data:
                self.update(tuple(data['sender']))
            return
        elif data['method'] == 'PING':
            result = self.node.PING()
        elif data['method'] == 'STORE':
            key, value = data['store_key'], data['store_value']
            publisher, sender = tuple(data['publisher']), tuple(data['sender'])
            result = self.node.STORE(key, value, publisher, sender, to_update=data['to_update'])
        elif data['method'] == 'LOOKUP':
            result = self.lookup_value(data["id"])
        elif data['method'] == 'PUBLISH':
            node = self.node.asTuple()
            self.publish(data, node, node)
        if result is not None:
            # Generic RESPONSE path for the methods that produced a result.
            answer = {'operation': 'RESPONSE',
                      'result': result,
                      'key': data['key'],
                      'sender': [self.node.ID, self.node.ip, self.node.port]
                      }
            answer = utils.dumps_json(answer).encode()
            self.send_udp_msg(answer, addr)
            if 'sender' in data:
                self.update(tuple(data['sender']))
    # A peer is requested to perform a RPC to other peer
    elif data['operation'] == 'RPC':
        msg = None
        if data['method'] == 'FIND_NODE':
            msg = utils.build_FIND_NODE_msg(data['id'], self.node)
        elif data['method'] == 'FIND_VALUE':
            msg = utils.build_FIND_VALUE_msg(data['id'], self.node)
        elif data['method'] == 'PING':
            msg = utils.build_PING_msg(self.node)
        elif data['method'] == 'STORE':
            msg = utils.build_STORE_msg(data['storeKey'], data['store_value'], self.node, self.node)
        if msg is not None:
            # The address of the remote peer wich it will be used as the target of the RPC
            addr = (data['ip'], data['port'])
            msg = utils.dumps_json(msg).encode()
            self.send_udp_msg(msg, addr)
    # The peer receives the answer of a RPC made before
    elif data['operation'] == 'RESPONSE':
        self.set_response(data['key'], data)
        if not Node.Equals(data['sender'], self.node):
            self.update(data['sender'])
    if running_in_thread:
        exit_thread()