def check_status(self):
    """Block until every task in ``self.key_list`` has reported success.

    The status flag of each task is read from the cache ``self.cache_db``
    under the key ``"<key>_status"``. The flags are polled every 0.1 seconds;
    once all of them are present and truthy they are deleted from the cache.

    Returns:
        True once every task has reported a truthy status.

    Raises:
        WorkerFailed: if any reported status flag is falsy.
        WorkerTimeout: if not all tasks have reported within
            ``60 * len(self.adgroup_id_list)`` seconds.
    """
    # NOTE(review): open question from the original author -- could polling
    # this frequently push API access over its rate limit?

    def is_done(status_dict, task_count):
        # Nothing reported yet -> keep waiting.
        if not status_dict:
            return False
        # A single falsy flag fails the whole batch immediately.
        if not all(status_dict.values()):
            raise WorkerFailed("download-failed")
        return len(status_dict) == task_count

    task_count = len(self.key_list)
    key_list = [("%s_status" % key) for key in self.key_list]
    status_dict = CacheAdpter.get_many(key_list, self.cache_db)
    has_done = is_done(status_dict, task_count)

    time_interval = 0.1
    # A timeout is enforced; hitting it counts as a failed download.
    # The budget scales with the number of adgroups being downloaded.
    # TODO: also cap it with a maximum timeout, and take global flow control
    # into account (e.g. today's keyword reports already being downloaded).
    timeout = len(self.adgroup_id_list) * 60
    st_time = time.time()
    while not has_done:
        time.sleep(time_interval)
        if time.time() - st_time >= timeout:
            # Record which tasks never reported. The cached keys carry the
            # "_status" suffix, so compare against the suffixed name;
            # comparing the bare keys would list every task as undone.
            reported = status_dict or {}
            undone = [
                key for key in self.key_list
                if ("%s_status" % key) not in reported
            ]
            log.error("timeout: total %s, finished %s, undone task=%s" % (
                task_count, len(reported), undone))
            raise WorkerTimeout("download-timeout")
        status_dict = CacheAdpter.get_many(key_list, self.cache_db)
        has_done = is_done(status_dict, task_count)

    # The outcome of this run is settled: clear the flags so they cannot
    # influence the next check. NOTE: this is only reached on success; after
    # WorkerFailed / WorkerTimeout the flags stay in the cache.
    CacheAdpter.delete_many(key_list, self.cache_db)
    return True
def sum_result(self):
    """Concatenate the cached report lists and condense them into one report.

    Returns:
        ``KeywordRpt.simply_rpt`` applied to the concatenation of all lists
        cached under ``self.key_list`` in ``self.cache_db``, or ``[]`` when
        the cache returns nothing.
    """
    result_dict = CacheAdpter.get_many(self.key_list, self.cache_db)
    if not result_dict:
        return []
    # Flatten in a single pass. The previous reduce(list.__add__, ...) copied
    # the accumulated list once per cached entry and depended on the
    # Python 2-only builtin ``reduce``.
    rpt_list = [rpt for rpt_chunk in result_dict.values() for rpt in rpt_chunk]
    return KeywordRpt.simply_rpt(rpt_list)
def check_status(self):
    """Wait until at least 85% of the sub-projects have reported a status.

    The cache ``self.db_name`` is polled every 0.3 seconds for the keys
    ``"<data_key>_status"`` of all entries in ``self.sub_prj_list``. Waiting
    stops as soon as the share of keys present reaches 0.85, or after
    20 seconds, whichever comes first.

    Returns:
        Always True; a timeout is only logged, not reported to the caller.
    """
    total = len(self.sub_prj_list)
    if not total:
        # Nothing to wait for. This also avoids the ZeroDivisionError the
        # progress ratio would raise for an empty sub-project list.
        return True

    key_list = [
        '%s_status' % sub_prj['data_key'] for sub_prj in self.sub_prj_list
    ]
    time_interval = 0.3
    time_out = 20
    done_ratio = 0.85

    value_dict = CacheAdpter.get_many(key_list, self.db_name)
    start_time = time.time()
    # ``or ()`` tolerates an empty/None reply from the cache.
    while float(len(value_dict or ())) / total < done_ratio:
        time.sleep(time_interval)
        if time.time() - start_time >= time_out:
            log.info('waiting for worker finishing time out!')
            break
        value_dict = CacheAdpter.get_many(key_list, self.db_name)
    return True
def sum_prj_result(self, sub_prj_list, db_name):
    """Merge the workers' cached keyword candidates and pick the configured ranges.

    The value cached under each sub-project's ``data_key`` is read from
    ``db_name``. Each value maps a candidate-filter name to a list of keyword
    rows; rows for the same name are concatenated across workers. Then, for
    every entry of ``self.select_conf.select_conf_list`` (numbered from 1),
    the rows of its ``candi_filter`` are capped at 10000, sorted with the
    module-level function ``sort_kwlist_by_<sort_mode>`` and reduced to the
    range given by ``select_num``.

    ``select_num`` has the form ``"<start>-<end>"``. If ``<start>`` is below
    1.0 both bounds are fractions of the row count; otherwise they are
    1-based, inclusive positions.

    Args:
        sub_prj_list: sub-project dicts; only ``'data_key'`` is read here.
        db_name: cache name passed through to ``CacheAdpter.get_many``.

    Returns:
        A list of keyword rows (lists), each extended with the 1-based index
        of the selecting config entry as a string.

    Raises:
        NameError: if no ``sort_kwlist_by_<sort_mode>`` function exists.
    """
    key_list = [prj['data_key'] for prj in sub_prj_list]
    log.info('sum project result item_id=%s' % (self.item_id))
    worker_result_dict = CacheAdpter.get_many(key_list, db_name)

    # Merge the workers' results: candidate-filter name -> keyword rows.
    candi_kw_dict = {}
    for worker_result in (worker_result_dict or {}).values():
        for candi_filter, rows in worker_result.items():
            candi_kw_dict.setdefault(candi_filter, []).extend(rows)

    # Sort each configured category and take its configured range.
    result_list = []
    for filter_index, rule in enumerate(self.select_conf.select_conf_list, 1):
        # Hard cap (applied before sorting): at most 10000 rows per category.
        kw_list = candi_kw_dict.get(rule.candi_filter, [])[0:10000]
        if not kw_list:
            continue

        # Resolve the comparator by name instead of eval()-ing a string built
        # from configuration. It is passed positionally, exactly as the
        # evaluated expression did (the ``cmp`` slot of list.sort on Python 2).
        sort_name = 'sort_kwlist_by_%s' % rule.sort_mode
        try:
            sort_cmp = globals()[sort_name]
        except KeyError:
            raise NameError("name '%s' is not defined" % sort_name)
        kw_list.sort(sort_cmp)

        # Translate the configured range into slice bounds.
        range_list = rule.select_num.split('-')
        if float(range_list[0]) < 1.0:
            # Fractions of the available rows.
            start_index = int(len(kw_list) * float(range_list[0]))
            end_index = int(len(kw_list) * float(range_list[1]))
        else:
            # 1-based inclusive positions.
            start_index = int(range_list[0]) - 1
            end_index = int(range_list[1])

        # The slice must end at end_index. The previous end bound,
        # ``end_index - start_index``, is a length, which truncated or emptied
        # every range that did not start at the beginning.
        tag = str(filter_index)
        result_list.extend(kw + [tag] for kw in kw_list[start_index:end_index])

    # TODO(wuhuaqiao): duplicate keywords are not removed here, because
    # remove_same_words(result_list) re-sorts while de-duplicating and would
    # disturb the ordering produced above.
    log.info('select keyword from kwlib,result=%s' % len(result_list))
    return result_list
def get_prj_statu(self, sub_prj_list, db_name):
    """Refresh the ``'statu'`` field of every sub-project that is not finished.

    For each entry of ``sub_prj_list`` whose ``'statu'`` is not ``'finished'``
    the cache key ``"<data_key>_statu"`` is looked up in ``db_name``; a truthy
    cached value overwrites the entry's ``'statu'`` in place. The distinct
    ``host:port`` addresses of the unfinished entries are logged first.

    Args:
        sub_prj_list: sub-project dicts with ``'statu'``, ``'data_key'``,
            ``'host'`` and ``'port'`` keys; mutated in place.
        db_name: cache name passed through to ``CacheAdpter.get_many``.

    Returns:
        None.
    """
    pending = [prj for prj in sub_prj_list if prj['statu'] != 'finished']

    # A dict is used as an insertion-built set of distinct server addresses.
    unfinished_servers = {}
    for prj in pending:
        unfinished_servers[prj['host'] + ':' + str(prj['port'])] = 1
    log.info('server is working, unfinished server is: %s' % (','.join(unfinished_servers.keys())))

    status_keys = [prj['data_key'] + '_statu' for prj in pending]
    cached = CacheAdpter.get_many(status_keys, db_name)
    if not cached:
        return

    for prj in pending:
        latest = cached.get(prj['data_key'] + '_statu', None)
        # Only a truthy cached value replaces the current state.
        if latest:
            prj['statu'] = latest
def sum_result(self):
    """Merge the workers' cached keyword candidates and pick the configured ranges.

    The value cached under each ``data_key`` of ``self.sub_prj_list`` is read
    from ``self.db_name``. Each value maps a candidate-filter name to a list
    of keyword rows; rows for the same name are concatenated across workers.
    Then, for every entry of ``self.select_conf.select_conf_list`` (numbered
    from 1), the rows of its ``candi_filter`` are capped at 10000, sorted with
    the module-level function ``sort_kwlist_by_<sort_mode>`` and reduced to
    the range given by ``select_num``.

    ``select_num`` has the form ``"<start>-<end>"``. If ``<start>`` is below
    1.0 both bounds are fractions of the row count; otherwise they are
    1-based, inclusive positions.

    Returns:
        A list of keyword rows (lists), each extended with the 1-based index
        of the selecting config entry as a string.

    Raises:
        NameError: if no ``sort_kwlist_by_<sort_mode>`` function exists.
    """
    key_list = [prj['data_key'] for prj in self.sub_prj_list]
    worker_result_dict = CacheAdpter.get_many(key_list, self.db_name)

    # Merge the workers' results: candidate-filter name -> keyword rows.
    candi_kw_dict = {}
    for worker_result in (worker_result_dict or {}).values():
        for candi_filter, rows in worker_result.items():
            candi_kw_dict.setdefault(candi_filter, []).extend(rows)

    # Sort each configured category and take its configured range.
    result_list = []
    for filter_index, rule in enumerate(self.select_conf.select_conf_list, 1):
        # Hard cap (applied before sorting): at most 10000 rows per category.
        kw_list = candi_kw_dict.get(rule.candi_filter, [])[0:10000]
        if not kw_list:
            continue

        # Resolve the comparator by name instead of eval()-ing a string built
        # from configuration. It is passed positionally, exactly as the
        # evaluated expression did (the ``cmp`` slot of list.sort on Python 2).
        sort_name = 'sort_kwlist_by_%s' % rule.sort_mode
        try:
            sort_cmp = globals()[sort_name]
        except KeyError:
            raise NameError("name '%s' is not defined" % sort_name)
        kw_list.sort(sort_cmp)

        # Translate the configured range into slice bounds.
        range_list = rule.select_num.split('-')
        if float(range_list[0]) < 1.0:
            # Fractions of the available rows.
            start_index = int(len(kw_list) * float(range_list[0]))
            end_index = int(len(kw_list) * float(range_list[1]))
        else:
            # 1-based inclusive positions.
            start_index = int(range_list[0]) - 1
            end_index = int(range_list[1])

        # The slice must end at end_index. The previous end bound,
        # ``end_index - start_index``, is a length, which truncated or emptied
        # every range that did not start at the beginning.
        tag = str(filter_index)
        result_list.extend(kw + [tag] for kw in kw_list[start_index:end_index])

    return result_list