def init():
    """Load configuration, configure logging and build the shared objects.

    Reads ``../conf/ad_svr.conf`` into ``g_conf``, sends root-logger output
    to an hour-stamped file under ``../log/`` and assigns the module globals
    ``g_adx_interface``, ``g_invert_idx_mgr``, ``g_idx_mgr``, ``g_ip_obj``,
    ``g_id_obj``, ``g_rank_bid`` and ``g_filter_obj``.

    Returns:
        A three-item list ``[service_para, service_name, process_num]`` taken
        from the ``gearman`` config section: the ``para`` value split on
        commas, the ``name`` value, and ``process_num`` converted to int.
    """
    global g_adx_interface
    global g_invert_idx_mgr
    global g_idx_mgr
    global g_ip_obj
    global g_id_obj
    global g_rank_bid
    global g_filter_obj

    hour_stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H")
    g_conf.read("../conf/ad_svr.conf")

    # NOTE(review): datefmt mixes '-' and '_' separators ('%Y-%m_%d');
    # left untouched here -- confirm whether that is intended.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='[%Y-%m_%d %H:%M:%S]',
        filename='../log/ad_svr.' + hour_stamp + '.log',
        filemode='a',
    )

    # Worker settings handed back to the caller.
    raw_para = g_conf.get("gearman", "para")
    service_name = g_conf.get("gearman", "name")
    process_num = int(g_conf.get("gearman", "process_num"))
    service_para = raw_para.split(',')

    # Load the adx map.
    adx_ids = adx_id_map(g_conf)
    g_adx_interface = adx_interface_t(adx_ids, g_conf)

    # One inverted index per name listed under [index] ridx_list.
    g_invert_idx_mgr = {}
    for ridx_name in g_conf.get("index", "ridx_list").split(","):
        inverted_index = inverted_index_t()
        inverted_index.load_file("../index/" + ridx_name + ".ridx")
        g_invert_idx_mgr[ridx_name] = inverted_index

    # One forward index per name listed under [index] idx_list.
    g_idx_mgr = {}
    for idx_name in g_conf.get("index", "idx_list").split(","):
        forward_index = index_t()
        forward_index.load_file("../index/" + idx_name + ".idx")
        g_idx_mgr[idx_name] = forward_index

    # IP region lookup and feature-id mapping.
    g_ip_obj = ip_parse_t(
        g_conf.get("file", "ip_table"),
        g_conf.get("file", "ip_region"),
    )
    g_id_obj = transform_id_t(g_conf.get("file", "fea_id_file"))

    # Feature extractor, prediction model and idea operator feed the ranker.
    feature_extractor = extract_feature_t(g_ip_obj, g_id_obj)
    model = predict_model_t(g_conf.get("file", "model"))
    idea_operator = idea_operator_t(g_conf.get("file", "idea_operate"))
    g_rank_bid = rank_bid_t(feature_extractor, model, idea_operator)
    logging.info("init complete")

    # The filter is built after the "init complete" log line, as before.
    g_filter_obj = filter_t(g_invert_idx_mgr, g_conf)

    return [service_para, service_name, process_num]
def process(input_file, fea_output_file, ml_file):
    """Turn a tab-separated request log into a feature file and an ML file.

    Each input line is split on tabs; column 1 is used as the time string,
    columns 2 and 3 are parsed as integer click and consume values, and
    column 4 is a JSON document whose ``"request"`` entry is itself a JSON
    string. Lines whose JSON cannot be decoded are logged and skipped.

    Side effects: rebinds the module globals ``g_ip_obj`` and ``g_id_obj``.

    Args:
        input_file: path of the tab-separated log to read.
        fea_output_file: path that receives a header line followed by
            ``click<TAB>consume<TAB>features`` rows.
        ml_file: path that receives ``click id:1 id:1 ...`` rows, with the
            ids sorted in ascending order.

    Raises:
        IndexError, ValueError: when a line has fewer than five columns or a
            non-integer click/consume column (unchanged from the original).
    """
    global g_ip_obj
    global g_id_obj

    single_feature_list = ["region", "app", "time", "app_type", "manufacture"]
    # Combined features are currently disabled; names previously tried here
    # were "combine-region-time" and "combine-app-region".
    combine_feature_list = []
    feature_names = single_feature_list + combine_feature_list
    header = ("click\tconsume\t" + "\t".join(single_feature_list)
              + "\t" + "\t".join(combine_feature_list) + "\n")

    # Context managers guarantee all three files are flushed and closed,
    # even when a malformed line raises part-way through.
    with open(input_file, "r") as input_fp, \
            open(fea_output_file, "w") as fea_fp, \
            open(ml_file, "w") as ml_fp:
        g_ip_obj = ip_parse_t("../data/ip.all", "../data/region.txt")
        g_id_obj = transform_id_t(g_conf.get("file", "fea_id_file"))
        fea_fp.write(header)

        for line_cnt, raw_line in enumerate(input_fp, 1):
            fields = raw_line.rstrip("\r\n").split("\t")
            click = int(fields[2])
            consume = int(fields[3])
            time_str = fields[1]
            req_json = fields[4]

            try:
                req_dict = json.loads(req_json)
                fea_json_dict = json.loads(req_dict["request"])
            except (ValueError, KeyError, TypeError):
                # Bad JSON, a missing "request" key, or a non-dict payload.
                logging.warning("load json failed[%d]", line_cnt)
                continue

            feature_value_dict = extract_feature(
                time_str, fea_json_dict,
                single_feature_list, combine_feature_list)
            # rstrip keeps the original behaviour of dropping trailing
            # empty feature values from the row.
            fea_str = "\t".join(
                [feature_value_dict[name] for name in feature_names]
            ).rstrip("\t")
            fea_fp.write("%d\t%d\t%s\n" % (click, consume, fea_str))

            # Transform feature strings to ids for the ML line.
            fea_id_list = sorted(
                [g_id_obj.get_id(fea_name) for fea_name in fea_str.split("\t")])
            fea_id_str = " ".join(
                [str(fea_id) + ":1" for fea_id in fea_id_list])
            ml_fp.write("%d %s\n" % (click, fea_id_str))
def process(input_file, output_file):
    """Extract a region feature for every request line of a tab-separated log.

    Column 4 of each input line is a JSON document whose ``"request"`` entry
    is itself a JSON string; that inner document is passed to
    ``extract_feature``. Lines whose JSON cannot be decoded are logged and
    skipped.

    Side effects: rebinds the module global ``g_ip_obj``.

    Args:
        input_file: path of the tab-separated log to read.
        output_file: path that receives ``col2<TAB>col3<TAB>region_feature``
            rows, where col2/col3 are copied verbatim from the input.

    Raises:
        IndexError, ValueError: when a line has fewer than five columns or a
            non-integer column 2 (unchanged from the original).
    """
    global g_ip_obj

    # Context managers guarantee both files are flushed and closed, even
    # when a malformed line raises part-way through.
    with open(input_file, "r") as input_fp, \
            open(output_file, "w") as output_fp:
        g_ip_obj = ip_parse_t("../data/ip.all", "../data/region.txt")

        for line_cnt, raw_line in enumerate(input_fp, 1):
            fields = raw_line.rstrip("\r\n").split("\t")
            req_json = fields[4]
            # The parsed value is not used, but the conversion is kept so a
            # non-integer click column still raises exactly as before.
            int(fields[2])

            try:
                req_dict = json.loads(req_json)
                fea_json_dict = json.loads(req_dict["request"])
            except (ValueError, KeyError, TypeError):
                # Bad JSON, a missing "request" key, or a non-dict payload.
                logging.warning("load json failed[%d]", line_cnt)
                continue

            region_feature = extract_feature(fea_json_dict)
            output_fp.write(
                "%s\t%s\t%s\n" % (fields[2], fields[3], region_feature))
def tongji(req_file):
    """Look up the region of every request IP and dump the results.

    Each line of ``req_file`` is split on the ``\\x01`` separator; field 1 is
    a JSON document whose ``"request"`` entry is a JSON string containing
    ``device.ip``. For every IP that ``ip_parse_t.search`` resolves, a row
    ``ip<TAB>result[2]<TAB>result[3]`` is written to
    ``../data/region_result``. Unresolved IPs and lines that fail anywhere
    in that pipeline are logged and skipped.

    Args:
        req_file: path of the ``\\x01``-separated request log to read.
    """
    ip_obj = ip_parse_t("../data/ip.all", "../data/region.txt")

    # Context managers guarantee the result file is flushed and both files
    # are closed even if the loop is interrupted.
    with open("../data/region_result", "w") as output_fp, \
            open(req_file, "r") as fp:
        for line_cnt, raw_line in enumerate(fp, 1):
            fields = raw_line.rstrip("\r\n").split("\001")
            try:
                json_dict = json.loads(fields[1])
                request_dict = json.loads(json_dict["request"])
                ip_str = request_dict["device"]["ip"]
                region_result_list = ip_obj.search(ip_str)
                if region_result_list is None:
                    logging.warning("ip not found[%s]", ip_str)
                    continue
                output_fp.write("%s\t%d\t%d\n" % (
                    ip_str, region_result_list[2], region_result_list[3]))
            except Exception:
                # Best-effort per line, but no longer swallows
                # KeyboardInterrupt / SystemExit like the bare except did.
                logging.warning("extract content failed[%d]", line_cnt)
                continue
def process(input_file, fea_output_file, ml_file, type):
    """Turn a tab-separated action log into a feature file and an ML file.

    Each input line is split on tabs: column 0 is the time string, column 1
    is a JSON object with ``click``, ``download``, ``install``, ``open`` and
    ``cost`` entries, and column 2 is the JSON request passed to
    ``extract_feature``. Lines whose column 2 is missing or not valid JSON
    are logged and skipped.

    Side effects: rebinds the module globals ``g_ip_obj`` and ``g_id_obj``.

    Args:
        input_file: path of the tab-separated log to read.
        fea_output_file: path that receives a header line followed by
            ``click, download, install, open, consume, features`` rows.
        ml_file: path that receives ``click fea_id_str`` rows.
        type: when equal to ``"train"``, rows with ``download == 1`` are
            written five times to ``ml_file`` instead of once. (The name
            shadows the builtin but is kept for caller compatibility.)

    Raises:
        ValueError, KeyError: when column 1 is not valid JSON or lacks one
            of the action keys (unchanged from the original).
    """
    global g_ip_obj
    global g_id_obj

    single_feature_list = ["region", "app", "time", "app_type", "manufacture"]
    # Combined features are currently disabled; names previously tried here
    # were "combine-region-time" and "combine-app-region".
    combine_feature_list = []
    header = ("click\tdownload\tinstall\topen\tconsume\t"
              + "\t".join(single_feature_list)
              + "\t" + "\t".join(combine_feature_list) + "\n")

    # Context managers guarantee all three files are flushed and closed,
    # even when a malformed line raises part-way through.
    with open(input_file, "r") as input_fp, \
            open(fea_output_file, "w") as fea_fp, \
            open(ml_file, "w") as ml_fp:
        g_ip_obj = ip_parse_t("../data/ip.all", "../data/region.txt")
        g_id_obj = transform_id_t(g_conf.get("file", "fea_id_file"))
        fea_fp.write(header)

        for line_cnt, raw_line in enumerate(input_fp, 1):
            fields = raw_line.rstrip("\r\n").split("\t")
            time_str = fields[0]

            try:
                fea_json_dict = json.loads(fields[2])
            except (ValueError, IndexError, TypeError):
                # Bad JSON or a line with fewer than three columns.
                logging.warning("load json failed[%d]", line_cnt)
                continue

            (fea_str, fea_id_str) = extract_feature(
                time_str, fea_json_dict,
                single_feature_list, combine_feature_list)

            ad_action_dict = json.loads(fields[1])
            click = ad_action_dict["click"]
            download = ad_action_dict["download"]
            install = ad_action_dict["install"]
            app_open = ad_action_dict["open"]
            consume = ad_action_dict["cost"]
            fea_fp.write("%d\t%d\t%d\t%d\t%d\t%s\n" % (
                click, download, install, app_open, consume, fea_str))

            # Downloaded samples are repeated five times in training output.
            repeat = 5 if (type == "train" and download == 1) else 1
            ml_fp.write(("%d %s\n" % (click, fea_id_str)) * repeat)
def tongji(req_file):
    """Write the region lookup result for every request IP in a log file.

    Lines of ``req_file`` are split on ``\\x01``. Field 1 holds a JSON
    document; its ``"request"`` value is a second JSON string from which
    ``device.ip`` is read and passed to ``ip_parse_t.search``. A hit produces
    one ``ip<TAB>result[2]<TAB>result[3]`` row in ``../data/region_result``;
    a miss, or any failure while handling the line, is logged and the line
    is skipped.

    Args:
        req_file: path of the ``\\x01``-separated request log to read.
    """
    ip_obj = ip_parse_t("../data/ip.all", "../data/region.txt")

    # Both handles are managed by ``with`` so the output is flushed and the
    # files are closed on every exit path.
    with open("../data/region_result", "w") as output_fp, \
            open(req_file, "r") as fp:
        for line_cnt, raw_line in enumerate(fp, 1):
            fields = raw_line.rstrip("\r\n").split("\001")
            try:
                json_dict = json.loads(fields[1])
                request_dict = json.loads(json_dict["request"])
                ip_str = request_dict["device"]["ip"]
                region_result_list = ip_obj.search(ip_str)
                if region_result_list is not None:
                    output_fp.write("%s\t%d\t%d\n" % (
                        ip_str,
                        region_result_list[2],
                        region_result_list[3]))
                else:
                    logging.warning("ip not found[%s]", ip_str)
            except Exception:
                # Keeps the per-line best-effort handling while letting
                # KeyboardInterrupt / SystemExit propagate.
                logging.warning("extract content failed[%d]", line_cnt)