def set_state(**kwargs):
    """Upsert the state record of a session into the ``state`` collection.

    Keyword Args:
        sid: Session id; also used as the filter for the update.
        state: Current state value to store.
        pre_state: Previous state value to store.
        info: Value stored under ``info``.
        need_parameters: Value stored under ``need_parameters``.
        receive: Optional; stored under ``receive`` (defaults to False).

    The write is attempted up to three times with a 0.1 s pause after each
    attempt whose raw result does not report ``ok == 1``.  If no attempt is
    acknowledged, or if anything raises, an ERROR entry is sent to
    ``mongo_log``.  Returns None.

    Raises:
        KeyError: if one of the required keyword arguments is missing.
    """
    sid, state, pre_state, info, need_parameters = [
        kwargs[name]
        for name in ('sid', 'state', 'pre_state', 'info', 'need_parameters')
    ]
    record = {
        'sid': sid,
        'state': state,
        'receive': kwargs.get('receive', False),
        'pre_state': pre_state,
        'need_parameters': need_parameters,
        'info': info,
    }
    selector = {'sid': sid}

    attempts_left = 3
    try:
        while attempts_left:
            outcome = db['state'].update_one(
                selector, {'$set': record}, upsert=True)
            if outcome.raw_result.get('ok') == 1:
                # Acknowledged by the server: nothing more to do.
                return
            time.sleep(0.1)
            attempts_left -= 1
        # Every attempt came back without ok == 1.
        logger('mongo_log', 'ERROR', 'update state error', **record)
    except:
        logger('mongo_log', 'ERROR', traceback.format_exc(), **record)
def reset_parameter(**kwargs):
    """Mark the stored parameters of a session as received.

    Sets ``receive`` to True on the ``params`` document whose ``sid`` equals
    ``kwargs['state'].sid``, creating the document if it does not exist.

    Keyword Args:
        state: Object exposing ``sid`` and ``state_name`` attributes.

    The write is attempted up to three times (0.1 s apart).  Failures are
    reported to ``mongo_log`` and never raised to the caller.  Returns None.

    Raises:
        KeyError: if ``state`` is not supplied.
    """
    state = kwargs['state']
    sid = state.sid
    log_context = {'sid': sid, 'state': state.state_name}

    try:
        for _ in range(3):
            result = db['params'].update_one(
                {'sid': sid}, {'$set': {'receive': True}}, upsert=True)
            # ``modified_count`` is 0 both when the document already had
            # receive=True and when the upsert inserted a new document, so it
            # cannot be used as the success signal for this idempotent write.
            if result.matched_count or result.upserted_id is not None:
                break
            time.sleep(0.1)
        else:
            logger('mongo_log', 'ERROR', 'update params receive error',
                   **log_context)
    except Exception:
        # Best-effort write: record the failure instead of propagating it.
        logger('mongo_log', 'ERROR', traceback.format_exc(), **log_context)
def log(self, level='ERROR', message=None, state=None, missing_dict=None):
    """Emit a state-machine log record through ``logger`` under ``state_log``.

    Args:
        level: Log level passed to ``logger``.
        message: Log message; when empty it defaults to
            ``'<execute_message>:<state_name>'`` of this state.
        state: Optional mapping whose ``state_name``, ``execute_status`` and
            ``execute_message`` entries override the values taken from
            ``self`` (missing entries become ``''``).
        missing_dict: Optional mapping merged into the ``state_log`` payload.

    Usage:
        self.log('ERROR', 'some message')
    """
    if self.crawler is not None:
        crawler = (self.crawler.__module__
                   .replace("worker.crawler.", "")
                   .replace(".main", ""))
    else:
        crawler = 'None'

    if not message:
        message = '{}:{}'.format(self.execute_message, self.state_name)

    state_log = {
        'state_name': self.state_name,
        'execute_status': self.execute_status,
        'execute_msg': self.execute_message,
        'next_action': self.next_action,
        'state_flag': self.state_flag,
    }

    if self.state_name in ('UnderVerify', 'UnderLogin'):
        # Log a filtered copy: popping the keys from self.parameters itself
        # would strip 'sid' and 'crawler' from the live state as a side
        # effect of logging.
        state_log['parameters'] = {
            key: value
            for key, value in (self.parameters or {}).items()
            if key not in ('sid', 'crawler')
        }

    if missing_dict:
        state_log.update(missing_dict)

    if state:
        state_log['state_name'] = state.get('state_name', '')
        state_log['execute_status'] = state.get('execute_status', '')
        state_log['execute_msg'] = state.get('execute_message', '')

    data = {
        'sid': self.sid,
        'crawler': crawler,
        'state_log': state_log,
    }
    logger('state_log', level, message, **data)
def update_db_data(collection, filter_data, update_data, upsert=True):
    """Apply ``{'$set': update_data}`` to one document of ``collection``.

    Args:
        collection: Name of the collection in ``db``.
        filter_data: Query selecting the document; also passed as keyword
            arguments to ``logger`` when an error is reported.
        update_data: Fields to set.
        upsert: Insert the document when nothing matches (default True).

    Returns:
        Always True; failures are only reported to ``mongo_log``.

    The write is attempted up to three times, 0.1 s apart.
    """
    try:
        for _ in range(3):
            # Honour the caller's ``upsert`` choice instead of forcing True.
            res = db[collection].update_one(
                filter_data, {'$set': update_data}, upsert=upsert)
            # A write that matched an already up-to-date document, or that
            # inserted via upsert, reports modified_count == 0 but succeeded.
            if res.matched_count or res.upserted_id is not None:
                break
            time.sleep(0.1)
        else:
            logger('mongo_log', 'ERROR', 'update data error', **filter_data)
    except Exception:
        # Best-effort write: record the failure instead of propagating it.
        logger('mongo_log', 'ERROR', traceback.format_exc(), **filter_data)
    return True
def log(self, code, msg, resp):
    """Crawler logging entry point; forwards a record to ``crawler_log``.

    code:
        crawler error:          crawler
        user error:             user
        official-site error:    website
        *network error:         network
        *system error:          system
    msg:
        status_key:      'crawl_error'
        try...except:    'crawl_error:{}'.format(msg)
    resp:
        When this is a ``requests`` Response, its request/response details
        and the name of the calling function are attached to the record.
    usage:
        self.log('crawler', 'crawl_error:{}'.format(msg), resp)
    """
    session_id = self.kwargs.get('sid', '')
    crawler_name = self.kwargs.get('crawler', '')
    for noise in ('worker.crawler.', '.main'):
        crawler_name = crawler_name.replace(noise, '')

    http_detail = {}
    if isinstance(resp, requests.models.Response):
        # inspect.stack()[1] is the frame that called log(); keep this call
        # directly in this function so the index stays valid.
        http_detail = {
            'func_name': inspect.stack()[1][3],
            'req_url': str(resp.request.url),
            'req_params': str(resp.request.body),
            'req_header': str(resp.request.headers),
            'res_status_code': str(resp.status_code),
            'res_header': str(resp.headers),
            'res_body': str(resp.text),
        }

    payload = {
        'sid': session_id,
        'crawler': crawler_name,
        'crawler_log': http_detail,
    }

    # Extra console dump, only on the host named 'w219'.
    if socket.gethostname() == 'w219':
        import pprint
        encoded = {k: v.encode('utf-8') for k, v in http_detail.items()}
        print('\n')
        print(msg)
        print(str(encoded).decode('string-escape'))
        print('line: ')
        print(inspect.stack()[1][2])
        print('\n')

    logger('crawler_log', code, msg, **payload)
def insert_db_data(table, data):
    """Insert one document or a batch of documents into ``db[table]``.

    Args:
        table: Collection name.
        data: A single document (dict or dict subclass) or an iterable of
            documents.

    Returns:
        * True when ``data`` is empty (an ERROR entry is logged, nothing is
          written).
        * The ``inserted_id`` when ``data`` is a single document.
        * True when a batch insert succeeds, False when it raises (the
          traceback is logged to ``mongo_log``).

    Errors raised while inserting a single document propagate to the caller.
    """
    if not data:
        logger('mongo_log', 'ERROR', 'There is no data to insert',
               **{'insert_data': data})
        return True

    # isinstance() so that dict subclasses (e.g. OrderedDict) are treated as
    # one document rather than being handed to insert_many().
    if isinstance(data, dict):
        return db[table].insert_one(data).inserted_id

    # insert_many() either returns a result object (always truthy) or raises,
    # so the former three-pass loop never reached a second attempt; a single
    # call has the same behaviour.
    try:
        db[table].insert_many(data)
    except Exception:
        logger('mongo_log', 'ERROR', traceback.format_exc(),
               **{'insert_data': data})
        return False
    return True
def save_status(sid, status, message, cache_time=None):
    """Store the final status of a session in the ``sid_info`` collection.

    Args:
        sid: Session id used to select (or create) the document.
        status: Value stored under ``status``.
        message: Value stored under ``message``.
        cache_time: Optional; stored under ``cache_time`` when truthy.

    Returns:
        True when the document carries a ``crawler_channel`` field (nothing
        is written in that case) or after the update loop finishes; False
        when the update raises (the traceback is logged to ``mongo_log``).

    ``end_time`` is set to the current time in whole seconds.  The update is
    attempted up to three times, 0.1 s apart.
    """
    timestamp = int(time.mktime(time.localtime()))
    update_data = {'status': status, 'message': message, 'end_time': timestamp}
    if cache_time:
        update_data['cache_time'] = cache_time

    # Sessions of the "limu" channel (documents with a crawler_channel field)
    # are left untouched.
    channel_doc = db['sid_info'].find_one({
        'sid': sid,
        'crawler_channel': {'$exists': True},
    })
    if channel_doc:
        return True

    log_context = {'sid': sid, 'update_data': update_data}
    try:
        for _ in range(3):
            result = db['sid_info'].update_one(
                {'sid': sid}, {'$set': update_data}, upsert=True)
            # An upsert that inserts reports modified_count == 0, so success
            # is judged by "matched or inserted" instead.
            if result.matched_count or result.upserted_id is not None:
                break
            time.sleep(0.1)
        else:
            logger('mongo_log', 'ERROR', 'save status error', **log_context)
    except Exception:
        logger('mongo_log', 'ERROR', traceback.format_exc(), **log_context)
        return False
    return True
def data_fusion(**kwargs):
    """Fuse freshly crawled bill logs with the cached bill data of a number.

    Keyword Args:
        final_bill_logs: Bill logs produced by the crawler.
        missing_month_list: Months the crawler reported as missing.
        tel: Phone number; key for the cache lookup via ``get_tel_data``.
        sid: Session id (only recorded in the raw snapshot).
        pad_code: Recorded in the raw snapshot.

    Returns:
        ``(call_log_list, missing_month_list, cache_hit_month,
        fusion_cost_time)`` on success.  If standardisation/fusion or the
        cache write raises, the traceback is printed and
        ``(final_bill_logs, missing_month_list, [], 0.00)`` is returned
        (``missing_month_list`` may already have been replaced by the fusion
        kernel's result at that point).

    Raises:
        KeyError: if a required keyword argument is missing.
    """
    # expected kwargs: pub_param, final_bill_logs, missing_month_list
    fusion_start_time = time.time()
    final_bill_logs = kwargs['final_bill_logs']
    missing_month_list = kwargs['missing_month_list']
    tel = kwargs['tel']
    sid = kwargs['sid']
    pad_code = kwargs['pad_code']
    bill_log_cache = get_tel_data(tel)
    cache_hit_month = []
    craw_data = None
    # De-duplicated copy of the caller's missing months.
    all_miss_list = list(set(missing_month_list))
    if bill_log_cache == None:
        # No cache entry yet: start an empty one for this number.
        bill_log_cache = {'tel': tel}
    # NOTE(review): 'uti' looks like an update timestamp (epoch seconds as a
    # string) refreshed on every call -- confirm.
    bill_log_cache['uti'] = str(int(time.time()))
    try:
        from datetime import datetime
        # Snapshot of the raw inputs, handed to register_other_bill_log();
        # failures here are printed and otherwise ignored.
        ori_dict = {
            'sid': sid,
            'tel': tel,
            'pad_code': pad_code,
            'final_bill_logs': final_bill_logs,
            'missing_month_list': missing_month_list,
            'bill_log_cache': bill_log_cache,
            'expiretime': datetime.utcnow()
        }
        register_other_bill_log(ori_dict)
    except:
        error_msg = traceback.format_exc()
        print error_msg
        pass
    try:
        # Standardise the crawler data.
        craw_data, craw_month_list = craw_data_std(tel, final_bill_logs, missing_month_list)
        craw_month_list = clean_month_list(craw_month_list, all_miss_list)
        # Every month touched by this run: crawled or reported missing.
        all_month = list(set(craw_month_list + all_miss_list))
        if len(craw_month_list) > 0 or len(all_miss_list) > 0:
            craw_data, bill_log_cache, missing_month_list, cache_hit_month = data_fusion_kernel(
                bill_log_cache, craw_data, craw_month_list, all_miss_list)
        craw_data = cross_key_name(craw_data)
        # Prune both data sets down to the months of this run.
        bill_log_cache = data_cut(bill_log_cache, all_month)
        craw_data = data_cut(craw_data, all_month)
    except:
        error_msg = traceback.format_exc()
        print error_msg
        return final_bill_logs, missing_month_list, [], 0.00
    try:
        # Persist the merged cache.
        ret = insert_data(bill_log_cache)
        pass
    except:
        message = traceback.format_exc()
        print message
        return final_bill_logs, missing_month_list, [], 0.00
    call_log_list = []
    for key, value in craw_data.items():
        # Only entries whose key is six characters long are returned
        # (presumably 'YYYYMM' month keys -- TODO confirm).
        if len(key) == 6:
            call_log_list.append(value)
    cache_hit_month = list(set(cache_hit_month))
    missing_month_list = list(set(missing_month_list))
    fusion_end_time = time.time()
    fusion_cost_time = fusion_end_time - fusion_start_time
    log_data = {
        'tel': tel,
        'bill_cache_hit_month': cache_hit_month,
        'bill_missing_month_list': missing_month_list,
        'bill_fusion_cost_time': fusion_cost_time,
        'bill_fusion_end_time': fusion_end_time,
    }
    logger('bill_data_fusion', 'INFO', '', **log_data)
    return call_log_list, missing_month_list, cache_hit_month, fusion_cost_time
def multiprocess_dama(sid, img_data, code_type):
    """Run ``dama_proccess`` in staggered worker processes and await a result.

    One worker is started immediately, a second after 20 s and a third after
    40 s without a result.  All workers share the same pipe end; the first
    message of more than two items received on the pipe is logged and
    returned.  After 60 s without such a message the call gives up.

    Args:
        sid: Session id; included in the log record and passed to workers.
        img_data: Passed through to ``dama_proccess``.
        code_type: Passed through to ``dama_proccess``.

    Returns:
        The message received from a worker, or
        ``('crawl_error', u"打码平台超时", "", None)`` on timeout.
    """
    log_name = 'multiprocess_dama'
    data = {'sid': sid}
    parent_conn, child_conn = Pipe()
    begin_time = time.time()

    def _launch(flag):
        # Create and start one worker.  ``daemon`` must be set before
        # start(); the previous spelling ``deamon`` only created an unused
        # attribute, so the workers were never daemonic.
        worker = Process(target=dama_proccess, args=(
            sid,
            img_data,
            code_type,
            flag,
            child_conn,
        ))
        worker.daemon = True
        worker.start()

    try:
        _launch(0)
        proccess_flag = 1
        print('A进程启动完毕')
        data['p1'] = 'p1'
        data['A进程启动完毕'] = 'A进程启动完毕'

        while True:
            end_time = time.time()
            elapsed = end_time - begin_time

            if elapsed > 20 and proccess_flag == 1:
                _launch(proccess_flag)
                proccess_flag = 2
                print('B进程启动完毕')
                data['B进程启动完毕'] = 'B进程启动完毕'
                data['p2'] = 'p2'

            if elapsed > 40 and proccess_flag == 2:
                _launch(proccess_flag)
                proccess_flag = 3
                print('C进程启动完毕')
                data['C进程启动完毕'] = 'C进程启动完毕'
                data['p3'] = 'p3'

            data['proccess_flag'] = proccess_flag

            if parent_conn.poll():
                task = parent_conn.recv()
                # Shorter messages are ignored and polling continues.
                if len(task) > 2:
                    # The original format string had no placeholder, so the
                    # message was never actually printed.
                    print('收到管道信息: {!r}'.format(task))
                    data['收到管道信息'] = task
                    logger(log_name, task[0], task[1], **data)
                    return task

            if elapsed > 60:
                break
            time.sleep(1)
            print(elapsed)

        logger(log_name, 'crawl_error', '打码平台超时', **data)
        return 'crawl_error', u"打码平台超时", "", None
    finally:
        # Release this process's pipe ends on every exit path, including the
        # timeout, which previously leaked them.
        parent_conn.close()
        child_conn.close()
def data_fusion(**kwargs):
    """Fuse freshly crawled call logs with the cached call data of a number.

    Keyword Args:
        final_call_logs: Call logs produced by the crawler.
        missing_month_list: Months reported as missing.
        possibly_missing_list: Months reported as possibly missing.
        part_missing_list: Months reported as partially missing.
        tel: Phone number; key for the cache lookup via ``get_tel_data``.

    Returns:
        ``(call_log_list, missing_month_list, possibly_missing_list,
        part_missing_list, cache_hit_month, fusion_cost_time)`` on success.
        When there is neither a cache entry nor crawled data, or when
        fusion or the cache write raises (the traceback is printed), the
        result is ``(final_call_logs, missing_month_list,
        possibly_missing_list, part_missing_list, [], 0)``.

    Note:
        The three month lists may be appended to in place, so the lists
        passed in by the caller can be modified.

    Raises:
        KeyError: if a required keyword argument is missing.
    """
    # expected kwargs: pub_param, final_call_logs, missing_month_list,
    # possibly_missing_list, part_missing_list
    fusion_start_time = time.time()
    final_call_logs = kwargs['final_call_logs']
    missing_month_list = kwargs['missing_month_list']
    possibly_missing_list = kwargs['possibly_missing_list']
    part_missing_list = kwargs['part_missing_list']
    tel = kwargs['tel']
    call_log_cache = get_tel_data(tel)
    cache_hit_month = []
    # One entry per fusion-kernel call; each is iterated with .items() below.
    all_move_miss = []
    all_miss_list = list(
        set(missing_month_list + possibly_missing_list + part_missing_list))
    # Nothing cached and nothing crawled: there is nothing to fuse.
    if call_log_cache == None and len(final_call_logs) < 1:
        return final_call_logs,missing_month_list,possibly_missing_list,part_missing_list,[],0
    try:
        # Standardise the crawler data.
        craw_data, craw_month_list = craw_data_std(tel, final_call_logs,
                                                   all_miss_list)
        all_miss_list = list(
            set(missing_month_list + possibly_missing_list + part_missing_list))
        craw_month_list = clean_month_list(craw_month_list, all_miss_list)
        # Every month touched by this run: crawled or reported (partly) missing.
        all_month = list(set(craw_month_list + all_miss_list))
        # The kernel is run once per non-empty month list; its last argument
        # identifies the list: 1 crawled, 0 missing, 3 possibly missing,
        # 2 partially missing.
        if len(craw_month_list) > 0:
            craw_data, call_log_cache, craw_month_list, cache_hit_list, move_miss = data_fusion_kernel(
                call_log_cache, craw_data, craw_month_list, 1)
            cache_hit_month.extend(cache_hit_list)
            all_move_miss.append(move_miss)
        if len(missing_month_list) > 0:
            craw_data, call_log_cache, missing_month_list, cache_hit_list, move_miss = data_fusion_kernel(
                call_log_cache, craw_data, missing_month_list, 0)
            cache_hit_month.extend(cache_hit_list)
            all_move_miss.append(move_miss)
        if len(possibly_missing_list) > 0:
            craw_data, call_log_cache, possibly_missing_list, cache_hit_list, move_miss = data_fusion_kernel(
                call_log_cache, craw_data, possibly_missing_list, 3)
            cache_hit_month.extend(cache_hit_list)
            all_move_miss.append(move_miss)
        if len(part_missing_list) > 0:
            craw_data, call_log_cache, part_missing_list, cache_hit_list, move_miss = data_fusion_kernel(
                call_log_cache, craw_data, part_missing_list, 2)
            cache_hit_month.extend(cache_hit_list)
            all_move_miss.append(move_miss)
        # Re-file each moved month into the list named by its value, using
        # the same 0 / 2 / 3 codes as above.
        for move_ in all_move_miss:
            for key, value in move_.items():
                if value == 0:
                    missing_month_list.append(key)
                elif value == 2:
                    part_missing_list.append(key)
                elif value == 3:
                    possibly_missing_list.append(key)
        craw_data = cross_key_name(craw_data)
        # Prune both data sets down to the months of this run.
        call_log_cache = data_cut(call_log_cache, all_month)
        craw_data = data_cut(craw_data, all_month)
    except:
        error_msg = traceback.format_exc()
        print error_msg
        return final_call_logs,missing_month_list,possibly_missing_list,part_missing_list,[],0
    try:
        # Persist the merged cache.
        ret = insert_data(call_log_cache)
        pass
    except:
        message = traceback.format_exc()
        print message
        return final_call_logs,missing_month_list,possibly_missing_list,part_missing_list,[],0
    call_log_list = []
    for key, value in craw_data.items():
        # Only dict entries carrying a 'status' key contribute; each record
        # under 'det' is tagged with the entry's key as its 'month'.
        if type(value).__name__ == 'dict':
            if value.has_key('status'):
                for x in value['det']:
                    x['month'] = key
                    call_log_list.append(x)
    cache_hit_month = list(set(cache_hit_month))
    possibly_missing_list = list(set(possibly_missing_list))
    part_missing_list = list(set(part_missing_list))
    missing_month_list = list(set(missing_month_list))
    fusion_end_time = time.time()
    fusion_cost_time = fusion_end_time - fusion_start_time
    log_data = {
        'tel': tel,
        'cache_hit_month': cache_hit_month,
        'possibly_missing_list': possibly_missing_list,
        'part_missing_list': part_missing_list,
        'missing_month_list': missing_month_list,
        'fusion_cost_time': fusion_cost_time,
        'fusion_end_time': fusion_end_time,
        'all_move_miss': all_move_miss
    }
    logger('data_fusion', 'INFO', '', **log_data)
    return call_log_list, missing_month_list, possibly_missing_list, part_missing_list, cache_hit_month, fusion_cost_time