def predict_ctr(input_files, ctr_model_file_name):
    """Score every parsable record in ``input_files`` and print the evaluation.

    Each line is fed to a ``join.Parser``; records for which ``get_all()``
    returns ``None`` are skipped.  For the remaining records the CTR predicted
    by the model is paired with the observed click label, and the complete
    list of ``(click_flag, pctr)`` pairs is handed to
    ``ctr_model_evaluation.predict_evaluation``, whose result is printed.

    :param input_files: iterable of join-log file paths.
    :param ctr_model_file_name: model file handed to ``ctr_model.LrCtrModel``.
    :return: None, the evaluation result is only printed.
    """
    # The previous version also opened ``out_file_name`` for writing here.
    # That name is not a parameter of this function and nothing was ever
    # written to the handle, so the call has been dropped.
    the_ctr_model = ctr_model.LrCtrModel(ctr_model_file_name, 'new')
    req_parser = join.Parser()
    line_num = 0
    actual_pred_list = []
    for input_file in input_files:
        with open(input_file, 'r') as log_file:
            for line in log_file:
                line_num += 1
                if line_num % 1000 == 0:
                    print(line_num)  # progress indicator
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                # ``== True`` is kept on purpose: only a value equal to True
                # counts as a click, as before.
                click_flag = 1 if field_dict['click_flag'] == True else 0
                actual_pred_list.append((click_flag, pctr))
    print(ctr_model_evaluation.predict_evaluation(actual_pred_list))
def predict_4_files(input_files, the_ctr_model):
    """Return ``(actual_click, predicted_ctr)`` pairs for all parsable records.

    :param input_files: iterable of join-log file paths.
    :param the_ctr_model: object exposing ``predict_ctr(feature_values)``.
    :return: list of ``(actual, pctr)`` tuples, one per record for which the
        parser's ``get_all()`` did not return ``None``; ``actual`` is 1 when
        the record's ``click_flag`` equals True and 0 otherwise.
    """
    req_parser = join.Parser()
    actual_pred_list = []
    for input_file in input_files:
        # ``with`` closes each log file as soon as it has been consumed.
        with open(input_file) as log_file:
            for line in log_file:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                actual = 1 if field_dict['click_flag'] == True else 0
                actual_pred_list.append((actual, pctr))
    return actual_pred_list
def _price_cdf(price_stat, total_imp, bin_num):
    """Bin a ``{rounded_price: impression_count}`` table into a cumulative curve.

    :return: ``(bin_size, x, y)``.  ``x`` holds the ``bin_num + 1`` bin edges
        starting at the minimum price; ``y[k]`` is the share of ``total_imp``
        that fell into the bins before edge ``k`` (so ``y[0] == 0``).  When
        all prices are identical ``bin_size`` is 0 and ``x``/``y`` are None.
    """
    print('len(stat_result price stat) %s' % len(price_stat))
    max_price = max(price_stat)
    min_price = min(price_stat)
    bin_size = (max_price - min_price) / bin_num
    if bin_size == 0:
        return bin_size, None, None

    price_bin = [0] * bin_num
    for price, count in price_stat.items():
        # NOTE(review): round() assigns a price to the *nearest* edge rather
        # than to the bin whose left edge precedes it -- kept as it was.
        bin_idx = int(round((price - min_price) / bin_size))
        if bin_idx == bin_num:  # the maximum price belongs to the last bin
            bin_idx -= 1
        price_bin[bin_idx] += count
    print('price bin %s' % (price_bin,))

    histogram = [float(count) / total_imp for count in price_bin]
    print('bin_size %s' % bin_size)
    # x: left edge of every bin plus the right edge of the last one.
    statx = [min_price + idx * bin_size for idx in range(0, bin_num + 1)]
    # y: running sum of the histogram, starting at 0.
    staty = [0]
    running = 0
    for share in histogram:
        running += share
        staty.append(running)
    return bin_size, statx, staty


def join_log_stat(file_names, bin_num, fit_flag):
    '''Aggregate join logs per (campaign, adgroup) and optionally fit a win curve.

    :param file_names: list of join-log (clear price) file paths.
    :param bin_num: number of intervals the win-price range is split into.
    :param fit_flag: when equal to True, the win-price distribution of every
        group is binned and passed to ``win_function_fitting``.
    :return: ``stat_result[(camp_id, grp_id)]``, a dict with

        * ``'impression'``: number of records of the group,
        * ``'click'``: number of records whose ``click_flag`` equals True,
        * ``'cost'``: sum of the records' win prices,
        * ``'price_stat'``: ``{win price rounded to 4 digits: count}``
          (removed once the group has been binned successfully),
        * ``'price_bin'``: ``{'bin_size', 'x', 'y'}`` (only with ``fit_flag``),
        * ``'win_function_fitting'``: ``{'fitting_para', 'filtered_xy'}`` or
          None when fitting returned no parameters (only with ``fit_flag``
          and a non-zero bin size).
    '''
    stat_result = {}  # {(camp_id, grp_id): per-group statistics}
    req_parser = join.Parser()
    for file_name in file_names:
        with open(file_name) as log_file:
            for line in log_file:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                key = (field_dict['campaign_id'], field_dict['adgroup_id'])
                if key not in stat_result:
                    stat_result[key] = {
                        'impression': 0,
                        'click': 0,
                        'cost': 0,
                        'price_stat': {}
                    }
                group = stat_result[key]
                group['impression'] += 1

                win_price = float(field_dict['win_price'])
                round_price = round(win_price, 4)
                # Fix: the histogram must count occurrences of each price and
                # the cost must add up the prices; the two increments used to
                # be swapped (price added to the histogram, 1 added to cost).
                price_stat = group['price_stat']
                price_stat[round_price] = price_stat.get(round_price, 0) + 1
                group['cost'] += win_price

                if field_dict['click_flag'] == True:
                    group['click'] += 1

    if fit_flag == True:
        # Pass 1: turn each group's price table into a binned cumulative curve.
        for key in list(stat_result.keys()):
            group = stat_result[key]
            bin_size, statx, staty = _price_cdf(
                group['price_stat'], group['impression'], bin_num)
            if bin_size != 0:
                del group['price_stat']
            group['price_bin'] = {'bin_size': bin_size, 'x': statx, 'y': staty}

        # Pass 2: fit (p0 + x) / (p1 + x) to every curve that could be binned.
        fit_fun = lambda x, p: (p[0] + x) / (p[1] + x)  # x: variable, p: parameters
        for key in list(stat_result.keys()):
            group = stat_result[key]
            if group['price_bin']['bin_size'] == 0:
                print('bin size 0')
                continue
            (fitting_para, filtered_xy) = win_function_fitting(
                x=group['price_bin']['x'],
                y=group['price_bin']['y'],
                fit_fun=fit_fun)
            if fitting_para is None:
                group['win_function_fitting'] = None
                continue
            print('c1 %s c2 %s' % (fitting_para[0], fitting_para[1]))
            group['win_function_fitting'] = {
                'fitting_para': fitting_para,
                'filtered_xy': filtered_xy
            }
    return stat_result
def history_repeat_conditional(bid_history_files, the_ctr_model, bid_model,
                               candidate_paras, condition_type, stat_result):
    '''Replay historical auctions under a per-group resource limit.

    For every record the CTR is predicted, the bid of each candidate
    parameter is computed and all candidates whose bid reaches the win price
    (from the index returned by ``binary_search`` up to the group's current
    stop index) are credited with the impression.  After each credited record
    the stop index is recomputed so that candidates which already reached
    half of the group's historical cost / clicks stop accumulating.

    :param bid_history_files: list of join-log file paths.
    :param the_ctr_model: object exposing ``predict_ctr(feature_values)``.
    :param bid_model: ``{(camp_id, grp_id): model}``; each model exposes
        ``get_bids_auc``, ``bid_strategy_type`` and ``fixed_parameter``.
    :param candidate_paras: ``{(camp_id, grp_id): [candidate parameter, ...]}``;
        groups with an empty list are ignored.
    :param condition_type: ``{(camp_id, grp_id): 'cost' | 'click'}``.
    :param stat_result: ``{(camp_id, grp_id): {'cost': ..., 'click': ...}}``
        with the historical totals the limit is derived from.
    :return: ``paras_perf[camp_grp][para]``, a dict with the keys
        ``impression``, ``avg_pctr``, ``click``, ``cost``, ``ctr``, ``ecpc``
        and ``cpm``.
    '''
    candidate_paras = {
        camp_grp: paras
        for camp_grp, paras in candidate_paras.items()
        if len(paras) != 0
    }
    bpo_logger.info(
        '**************history_repeat function start**************')
    bpo_logger.info('files:%s' % ','.join(bid_history_files))
    bpo_logger.info('************** bid_model: %s**************' %
                    (bid_model.keys()))
    bpo_logger.info('************** candidate_paras: %s**************' %
                    (candidate_paras))
    bpo_logger.info('************** condition_type: %s**************' %
                    (condition_type))

    # Pre-create the accumulators.  Only groups known to bid_model,
    # candidate_paras and condition_type are simulated.
    stop_flag_idx = {}
    result = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in candidate_paras or camp_grp not in condition_type:
            continue
        tmp_num = len(candidate_paras[camp_grp])
        stop_flag_idx[camp_grp] = tmp_num
        result[camp_grp] = [{
            'impression': 0,
            'avg_pctr': 0,
            'click': 0,
            'cost': 0
        } for _ in range(tmp_num)]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        with open(bid_history_file) as history:
            for line in history:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                # Fix: test for a missing price *before* float(); the old
                # ``win_price == None`` check ran after the conversion and
                # could never be true.
                raw_price = field_dict['win_price']
                if raw_price is None:
                    continue
                win_price = float(raw_price)
                if win_price <= 0:
                    continue
                camp_grp = (field_dict['campaign_id'], field_dict['adgroup_id'])
                if camp_grp not in result:
                    continue

                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(
                    ctr=pctr, variable_paras=candidate_paras[camp_grp])
                first_ge_idx = binary_search(candidate_bid_prices, win_price)
                stop_idx = stop_flag_idx[camp_grp]
                if first_ge_idx >= stop_idx:
                    continue

                # Non-cumulative bookkeeping: credit every winning candidate.
                clicked = 1 if field_dict['click_flag'] == True else 0
                for tmp_idx in range(first_ge_idx, stop_idx):
                    perf = result[camp_grp][tmp_idx]
                    perf['impression'] += 1
                    perf['avg_pctr'] += pctr
                    perf['click'] += clicked
                    perf['cost'] += win_price

                # Resource limit: 50% of total cost or 50% of total clicks.
                # NOTE(review): assumes binary_search returns len(list) when
                # no element qualifies, so re-running it on unchanged data is
                # a no-op -- confirm against its definition.
                limit_metric = condition_type[camp_grp]
                if limit_metric == 'cost' or limit_metric == 'click':
                    stop_flag_idx[camp_grp] = binary_search(
                        [result[camp_grp][idx][limit_metric]
                         for idx in range(0, stop_idx)],
                        stat_result[camp_grp][limit_metric] / 2)

    # Derived metrics: ctr / avg_pctr / ecpc / cpm.
    for camp_grp in result:
        for perf in result[camp_grp]:
            perf['cost'] /= 1000  # unit change: cpm is the price per 1000 impressions
            impression = perf['impression']
            click = perf['click']
            perf['ctr'] = (click + 0.0) / impression if impression != 0 else 0
            perf['avg_pctr'] = (
                (perf['avg_pctr'] + 0.0) / impression) if impression != 0 else 0
            perf['ecpc'] = perf['cost'] / click if click != 0 else 0
            perf['cpm'] = 1000 * perf['cost'] / impression if impression != 0 else 0

    # Key the result by the concrete parameter instead of its index.
    paras_perf = {
        camp_grp: {
            candidate_paras[camp_grp][idx]: result[camp_grp][idx]
            for idx in range(0, len(candidate_paras[camp_grp]))
        }
        for camp_grp in result.keys()
    }

    # Log the result.
    bpo_logger.info('**************history_repeat function end**************')
    formatter = 'paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n'
    for camp_grp in paras_perf.keys():
        log_perf_str = 'bid_history_files:%s\n' % ','.join(bid_history_files)
        log_perf_str += formatter.format(
            camp=camp_grp[0],
            grp=camp_grp[1],
            strategy=bid_model[camp_grp].bid_strategy_type,
            para=bid_model[camp_grp].fixed_parameter)
        log_perf_str += 'condition type:%s\n' % condition_type[camp_grp]
        log_perf_str += 'para click imp cost cpc ctr avg_pctr cpm\n'
        log_perf_str += 'unit: US dollar\n'
        for para in sorted(paras_perf[camp_grp].keys()):
            tmp_dict = paras_perf[camp_grp][para]
            log_perf_str += str(
                para
            ) + ' %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n' % tmp_dict
        bpo_logger.info('************parameter performance start************')
        bpo_logger.info(log_perf_str)
        bpo_logger.info('************parameter performance end************')
    return paras_perf
import sys
sys.path.append('/home/chester/KuaiPan/workspace/tukmob')
import util.join as join
import matplotlib.pyplot as pp

# Script: for every ad exchange, count how often each win price (rounded to
# two digits) occurs in that exchange's join log and save the counts as a
# line plot.
dir_pre = '/home/chester/KuaiPan/workspace/tukmob/resource/join_log/join_log_april_adx/'
for adx_name in ["adiquity", "axonix", "doubleclick", "inmobi", "nexage", "smaato", "tapsense",]:
    parser = join.Parser()
    price_stat = {}  # {rounded win price: number of records}
    with open(dir_pre + '/' + 'join_log_april_1_' + adx_name, 'r') as log_file:
        for line in log_file:
            parser.feed(line)
            field_dict = parser.get_all()
            # Skip records the parser could not decode instead of failing on
            # ``None['win_price']``.
            if field_dict is None:
                continue
            # float() first: round() does not accept a string price.
            price = round(float(field_dict['win_price']), 2)
            price_stat[price] = price_stat.get(price, 0) + 1
    sorted_price_stat = sorted(price_stat)
    pp.clf()
    pp.plot(sorted_price_stat, [price_stat[price] for price in sorted_price_stat])
    pp.title("win_price_" + adx_name)
    pp.savefig('/home/chester/win_price_' + adx_name + ".png")
def history_repeat(bid_history_files, the_ctr_model, bid_model,
                   candidate_paras):
    '''Replay historical auctions for every candidate bidding parameter.

    For every record the CTR is predicted and the bid of each candidate
    parameter is computed.  The record is credited to the candidate index
    returned by ``binary_search(candidate_bid_prices, win_price)``; once all
    files are read the per-index counters are turned into running totals, so
    each candidate ends up with its own records plus those of all lower
    indices.  This cumulative scheme cannot express a resource limit
    (budget, click cap).

    :param bid_history_files: list of join-log file paths.
    :param the_ctr_model: object exposing ``predict_ctr(feature_values)``.
    :param bid_model: ``{(camp_id, grp_id): model}``; each model exposes
        ``get_bids_auc``, ``bid_strategy_type`` and ``fixed_parameter``.
    :param candidate_paras: ``{(camp_id, grp_id): [candidate parameter, ...]}``;
        groups with an empty list are ignored.
    :return: ``paras_perf[camp_grp][para]``, a dict with the keys
        ``impression``, ``avg_pctr``, ``click``, ``cost``, ``ctr``, ``ecpc``
        and ``cpm``.
    '''
    candidate_paras = {
        camp_grp: paras
        for camp_grp, paras in candidate_paras.items()
        if len(paras) != 0
    }
    bpo_logger.info(
        '**************history_repeat function start**************')
    bpo_logger.info('files:%s' % ','.join(bid_history_files))
    bpo_logger.info('************** bid_model: %s**************' %
                    (bid_model.keys()))
    bpo_logger.info('************** candidate_paras: %s**************' %
                    (candidate_paras))

    stop_flag_idx = {}
    result = {}
    for camp_grp in bid_model.keys():
        # Fix: groups whose candidate list is missing or was filtered out
        # above used to raise KeyError here; they are skipped instead.
        if camp_grp not in candidate_paras:
            continue
        tmp_num = len(candidate_paras[camp_grp])
        stop_flag_idx[camp_grp] = tmp_num
        result[camp_grp] = [{
            'impression': 0,
            'avg_pctr': 0,
            'click': 0,
            'cost': 0
        } for _ in range(tmp_num)]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        with open(bid_history_file) as history:
            for line in history:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                # Fix: test for a missing price *before* float(); the old
                # ``win_price == None`` check ran after the conversion and
                # could never be true.
                raw_price = field_dict['win_price']
                if raw_price is None:
                    continue
                win_price = float(raw_price)
                if win_price <= 0:
                    continue
                camp_grp = (field_dict['campaign_id'], field_dict['adgroup_id'])
                # result only holds groups present in both bid_model and
                # candidate_paras; every other record is ignored.
                if camp_grp not in result:
                    continue

                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(
                    ctr=pctr, variable_paras=candidate_paras[camp_grp])
                first_ge_idx = binary_search(candidate_bid_prices, win_price)
                # Cumulative bookkeeping: credit only the first winning
                # candidate here, the running totals are built afterwards.
                if first_ge_idx < stop_flag_idx[camp_grp]:
                    perf = result[camp_grp][first_ge_idx]
                    perf['impression'] += 1
                    perf['avg_pctr'] += pctr
                    perf['click'] += (
                        1 if field_dict['click_flag'] == True else 0)
                    perf['cost'] += win_price

    # Turn the per-index counters into running totals.
    for camp_grp in result:
        for idx in range(1, len(result[camp_grp])):
            for tmp_metric in ['impression', 'avg_pctr', 'click', 'cost']:
                result[camp_grp][idx][tmp_metric] += result[camp_grp][
                    idx - 1][tmp_metric]

    # Derived metrics: ctr / avg_pctr / ecpc / cpm.
    for camp_grp in result:
        for perf in result[camp_grp]:
            perf['cost'] /= 1000  # unit change
            impression = perf['impression']
            click = perf['click']
            perf['ctr'] = (click + 0.0) / impression if impression != 0 else 0
            perf['avg_pctr'] = (
                (perf['avg_pctr'] + 0.0) / impression) if impression != 0 else 0
            perf['ecpc'] = perf['cost'] / click if click != 0 else 0
            perf['cpm'] = 1000 * perf['cost'] / impression if impression != 0 else 0

    # Key the result by the concrete parameter instead of its index.
    paras_perf = {
        camp_grp: {
            candidate_paras[camp_grp][idx]: result[camp_grp][idx]
            for idx in range(0, len(candidate_paras[camp_grp]))
        }
        for camp_grp in result.keys()
    }

    # Log the result.
    bpo_logger.info('**************history_repeat function end**************')
    formatter = 'paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n'
    for camp_grp in paras_perf.keys():
        log_perf_str = 'bid_history_files:%s\n' % ','.join(bid_history_files)
        log_perf_str += formatter.format(
            camp=camp_grp[0],
            grp=camp_grp[1],
            strategy=bid_model[camp_grp].bid_strategy_type,
            para=bid_model[camp_grp].fixed_parameter)
        log_perf_str += 'para click imp cost cpc ctr avg_pctr cpm\n'
        log_perf_str += 'unit: US dollar\n'
        for para in sorted(paras_perf[camp_grp].keys()):
            tmp_dict = paras_perf[camp_grp][para]
            log_perf_str += str(
                para
            ) + ' %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n' % tmp_dict
        bpo_logger.info('************parameter performance start************')
        bpo_logger.info(log_perf_str)
        bpo_logger.info('************parameter performance end************')
    return paras_perf
def simulate_camp(target_ecpc, input_files, ctr_model_file_name,
                  bid_landscape_model_file_name, out_file_name):
    """Simulate bidding at ``target_ecpc`` and print results by surplus bin.

    For each parsable record the bid is ``target_ecpc * pctr * 1000``.  The
    bid-landscape model supplies the win rate and the expected cost of that
    bid, and the surplus is ``(bid - expected_cost) * win_rate``.  Records
    whose bid reaches the logged win price are accumulated into surplus bins
    0.025 wide (0.025 ... 0.975).  At the end the raw surplus histogram, the
    per-bin totals and a table that accumulates from the highest bin down to
    the lowest are printed.

    :param target_ecpc: target effective cost per click used to derive bids.
    :param input_files: iterable of join-log file paths.
    :param ctr_model_file_name: model file handed to ``ctr_model.LrCtrModel``.
    :param bid_landscape_model_file_name: pickle file of the landscape model.
    :param out_file_name: created/truncated as before; nothing is written to
        it because the per-record output is disabled.
    :return: None.
    """
    simulate_result = {
        idx * 0.025: {'cnt': 0, 'cost': 0, 'click': 0}
        for idx in range(1, 40)
    }
    the_ctr_model = ctr_model.LrCtrModel(ctr_model_file_name, 'new')
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # bid-landscape models from a trusted location.
    with open(bid_landscape_model_file_name, 'rb') as model_file:
        bid_landscape_model = pickle.load(model_file)
    # Keep the historical side effect (the output file is created/truncated)
    # without leaking the handle.
    open(out_file_name, 'w').close()

    surplus_stats = {}  # {surplus rounded to 2 digits: number of records}
    req_parser = join.Parser()
    line_num = 0
    for input_file in input_files:
        with open(input_file, 'r') as log_file:
            for line in log_file:
                line_num += 1
                if line_num % 1000 == 0:
                    print(line_num)  # progress indicator
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])

                # Pull the landscape features and the win price straight from
                # the raw ``key:value|key:value`` line.
                feat_values = dict.fromkeys(feats)
                # Fix: reset per record so a line without a win price can no
                # longer reuse the previous record's value (or hit an
                # unbound name on the first record).
                win_price = None
                for key_val in line.strip('\n').split('|'):
                    if ':' not in key_val:
                        continue
                    key, val = key_val.split(':', 1)
                    if key in feats:
                        feat_values[key] = val
                    if key == win_price_field_name:
                        win_price = float(val)
                comb_value = tuple([feat_values[feat] for feat in feats])
                # NOTE(review): a tuple never equals a string, so this filter
                # never fires; the intended comparison is not visible here.
                if comb_value == 'doubleclick':
                    continue

                click_flag = 1 if field_dict['click_flag'] == True else 0
                bid = target_ecpc * pctr * 1000
                win_rate = bid_landscape_model.predict(comb_value, bid)
                expected_cost = bid_landscape_model.get_cost(comb_value, bid)
                surplus = (bid - expected_cost) * win_rate

                rounded_surplus = round(surplus, 2)
                surplus_stats[rounded_surplus] = (
                    surplus_stats.get(rounded_surplus, 0) + 1)

                # Only auctions this bid would have won are simulated.
                if win_price is None or bid < win_price:
                    continue
                surplus_bin = int(surplus / 0.025) * 0.025
                if surplus_bin in simulate_result:
                    bucket = simulate_result[surplus_bin]
                    bucket['cnt'] += 1
                    bucket['cost'] += win_price
                    if click_flag == 1:
                        bucket['click'] += 1

    print(surplus_stats)
    print(simulate_result)
    cnt, cost, click = 0, 0, 0
    print('surplus bin imp cost click ecpc ctr')
    for surplus_bin in sorted(simulate_result.keys(), reverse=True):
        cnt += simulate_result[surplus_bin]['cnt']
        cost += simulate_result[surplus_bin]['cost']
        click += simulate_result[surplus_bin]['click']
        # Fix: the top bins are usually empty, which used to raise
        # ZeroDivisionError; report 0 for undefined ratios instead.
        ecpc = cost / click if click != 0 else 0
        ctr = float(click) / cnt if cnt != 0 else 0
        print(' '.join(
            str(value)
            for value in (surplus_bin, cnt, cost, click, ecpc, ctr)))
def predict_4_files(target_ecpc, input_files, ctr_model_file_name,
                    bid_landscape_model_file_name, out_file_name):
    """Write one prediction row per record the simulated bid would have won.

    For each parsable record the bid is ``target_ecpc * pctr * 1000``.  The
    bid-landscape model supplies the win rate and expected cost of that bid.
    Records whose bid is below the logged win price are dropped; for the rest
    a tab-separated row ``click_flag, pctr, win_rate, bid, win_price,
    expected_cost, ecpc`` is appended to ``out_file_name``.

    :param target_ecpc: target effective cost per click used to derive bids.
    :param input_files: iterable of join-log file paths.
    :param ctr_model_file_name: model file handed to ``ctr_model.LrCtrModel``.
    :param bid_landscape_model_file_name: pickle file of the landscape model.
    :param out_file_name: path of the output file (overwritten).
    :return: None.
    """
    the_ctr_model = ctr_model.LrCtrModel(ctr_model_file_name, 'new')
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # bid-landscape models from a trusted location.
    with open(bid_landscape_model_file_name, 'rb') as model_file:
        bid_landscape_model = pickle.load(model_file)
    req_parser = join.Parser()
    line_num = 0
    # ``with`` guarantees the rows are flushed and the handle is closed; the
    # output file used to be left open.
    with open(out_file_name, 'w') as out_file:
        for input_file in input_files:
            with open(input_file, 'r') as log_file:
                for line in log_file:
                    line_num += 1
                    if line_num % 1000 == 0:
                        print(line_num)  # progress indicator
                    req_parser.feed(line)
                    field_dict = req_parser.get_all()
                    if field_dict is None:
                        continue
                    pctr = the_ctr_model.predict_ctr(
                        field_dict['feature_values'])

                    # Pull the landscape features and the win price straight
                    # from the raw ``key:value|key:value`` line.
                    feat_values = dict.fromkeys(feats)
                    # Fix: reset per record so a line without a win price can
                    # no longer reuse the previous record's value (or hit an
                    # unbound name on the first record).
                    win_price = None
                    for key_val in line.strip('\n').split('|'):
                        if ':' not in key_val:
                            continue
                        key, val = key_val.split(':', 1)
                        if key in feats:
                            feat_values[key] = val
                        if key == win_price_field_name:
                            win_price = float(val)
                    comb_value = tuple([feat_values[feat] for feat in feats])
                    # NOTE(review): a tuple never equals a string, so this
                    # filter never fires; the intended comparison is not
                    # visible here.
                    if comb_value == 'doubleclick':
                        continue

                    click_flag = 1 if field_dict['click_flag'] == True else 0
                    bid = target_ecpc * pctr * 1000
                    win_rate = bid_landscape_model.predict(comb_value, bid)
                    expected_cost = bid_landscape_model.get_cost(
                        comb_value, bid)
                    ecpc = float(expected_cost) / pctr
                    # Only auctions this bid would have won are reported.
                    if win_price is None or bid < win_price:
                        continue
                    out_file.write('%d\t%f\t%f\t%f\t%f\t%f\t%f\n' %
                                   (click_flag, pctr, win_rate, bid,
                                    win_price, expected_cost, ecpc))
class SampleDataProcessor():
    """Turn raw ``key:value|key:value`` join-log lines into hashed features."""

    # Fields that are never turned into features.
    # Fix: a missing comma used to fuse 'req_user_cookie_age_seconds' and
    # 'req_device_geo_city' into a single bogus name, so
    # 'req_device_geo_city' was not excluded.
    id_class = set([
        'tran_id',
        'req_id',
        'req_device_geo_latitude',
        'req_device_geo_longitude',
        'req_device_ip',
        'req_impressions_id',
        'req_ext_udi_idfa',
        'req_user_custom_data',
        'req_device_ext_nex_ifa',
        'req_device_geo_zip',
        'req_user_geo_zip',
        'req_user_ext_nex_hhi',
        'req_ext_x_uidh',
        'req_site_ref',
        'req_user_keywords',
        'req_device_dev_platform_id_sha1',
        'req_user_id',
        'req_ext_udi_udidmd5',
        'req_ext_udi_udidsha1',
        'req_ext_udi_openudid',
        'req_user_ext_nex_vzwuidh',
        'req_ext_udi_macmd5',
        'req_ext_udi_odin',
        'req_device_ext_nex_macsha1',
        'req_device_dev_platform_id_md5',
        'req_user_cookie_age_seconds',
        'req_device_hashed_idfa',
        'req_app_content_detected_vertical_weight',
        'req_user_ext_nex_dma',
        'req_app_keywords',
        'req_user_geo_city',
        'req_request_time',
        'req_device_dev_platform_id',
        'req_device_geo_city',
        'rsp_id',
        'rsp_seatbid_bid_impid',
        'rsp_seatbid_bid_nurl',
        'rsp_seatbid_bid_adm',
        'rsp_seatbid_bid_price',
        'rsp_seatbid_bid_id',
        'rsp_bidid',
        'event_action_flag',
        'event_action_tr',
        'event_action_ts',
        'event_adx',
        # Not excluded (commented out in the original list): event_ca,
        # event_ch, event_click_flag, event_click_tr, event_click_ts,
        # event_co
        'event_crv',
        'event_cur',
        'event_dev',
        'event_devt',
        'event_enct',
        # Not excluded (commented out in the original list): event_grp
        'event_impression_flag',
        'event_impression_ts',
        'event_ip',
        'event_price',
        'event_win_notice_ts'
    ])
    # One parser instance shared by all processors (class attribute).
    join_parser = join.Parser()

    def gethash(self, str):
        """Return the MurmurHash3 (seed 3419) of ``str`` modulo 16777216."""
        mmh3Value = mmh3.hash(str, 3419)
        return mmh3Value % 16777216

    def clear(self, line):
        """Drop unwanted fields and hash the remaining features of one line.

        :param line: raw log line made of ``key:value`` items joined by ``|``.
        :return: None when the line contains ``event_impression_flag:0``;
            otherwise ``(win_price, hashed_feat_vals)`` where ``win_price``
            is the float value of ``event_price`` (``''`` when absent or
            empty) and ``hashed_feat_vals`` maps ``gethash(feature)`` to the
            ``'key:value'`` feature string.
        """
        if 'event_impression_flag:0' in line:
            return
        win_price = ''  # stays '' when the line carries no event_price
        hashed_feat_vals = {}
        # The parser is still fed as before although its result is not used
        # here; its side effects are not visible from this block.
        self.join_parser.feed(line)
        self.join_parser.get_all()
        for elem in line.strip('\n').split('|'):
            if ':' not in elem:
                continue
            name, value = elem.split(':', 1)
            if value == '':
                continue
            # The click label is never used as a feature.
            if name == 'event_click_flag':
                continue
            if name == 'event_price':
                win_price = float(value)
            if name in self.id_class:
                continue
            # Hashing.  A value containing brackets is split on commas and
            # every element becomes its own ``key:element`` feature.
            if '[' in value or ']' in value:
                for feat_val in value.strip('[').strip(']').split(','):
                    feature = name + ':' + feat_val
                    hashed_feat_vals[self.gethash(feature)] = feature
            else:
                hashed_feat_vals[self.gethash(elem)] = elem
        return win_price, hashed_feat_vals

    def get_extended_feat(self):
        """Placeholder; not implemented."""
        pass
def prediction_file(self, sample_file):
    """Print bid diagnostics for up to 1000 sample lines and plot them.

    For every line whose ad exchange is not ``'youku'`` the bid is
    ``round(0.06 * pctr * 1000, 2)``.  The expected price and win rate for
    that bid come from ``self.get_cost`` / ``self.get_win_rate``, and the
    surplus is ``(bid - expected_price) * win_rate``.  Each record is printed,
    and the relative frequencies of the rounded bids, expected prices and
    surpluses are saved as PNG plots under ``/home/chester``.

    :param sample_file: path of the sample log to read.
    :return: None.
    """
    target_ecpc = 0.06
    import util.join as join
    req_parser = join.Parser()
    the_model = ctr_model.LrCtrModel(
        '/home/chester/KuaiPan/workspace/tukmob/resource/wzn/wzn', 'new')
    bid_array = []
    expected_price_array = []
    surplus_array = []
    line_no = 0
    with open(sample_file) as samples:
        for line in samples:
            if line_no == 1000:  # only the first 1000 lines are inspected
                print(line_no)
                break
            line_no += 1
            req_parser.feed(line)
            if req_parser.get_adx() == 'youku':
                continue
            # Evaluate each model once per line instead of once per use.
            # NOTE(review): assumes predict_ctr / get_win_rate are pure
            # look-ups -- confirm.
            pctr = the_model.predict_ctr(line)
            bid = round(target_ecpc * pctr * 1000, 2)
            expected_price = round(self.get_cost(line, bid), 4)
            win_rate = self.get_win_rate(line, bid)
            surplus = (bid - expected_price) * win_rate
            print('win_price pctr bid expected_price win_rate surplus')
            print('%s %s %s %s %s %s' % (req_parser.get_price(), pctr, bid,
                                         expected_price, win_rate, surplus))
            bid_array.append(round(bid, 2))
            expected_price_array.append(round(expected_price, 2))
            surplus_array.append(round(surplus, 2))

    import collections
    import matplotlib.pyplot as pp

    def _save_share_plot(values, png_path):
        # Plot the share of records per distinct value and save it as PNG.
        cnt = collections.Counter(values)
        total_cnt = len(values)
        sort_cnt_key = sorted(cnt.keys())
        pp.clf()
        pp.plot(sort_cnt_key,
                [(cnt[value] + 0.0) / total_cnt for value in sort_cnt_key])
        pp.savefig(png_path)

    _save_share_plot(bid_array, '/home/chester/bid.png')
    _save_share_plot(expected_price_array,
                     '/home/chester/expected_price_array.png')
    _save_share_plot(surplus_array, '/home/chester/surplus.png')