def history_repeat_conditional(bid_history_files, the_ctr_model, bid_model,
                               candidate_paras, condition_type, stat_result):
    """Replay bid history under a resource cap and score each candidate parameter.

    Every usable history record is re-bid with all candidate parameters of
    its ad group. The record is credited to each candidate from the index
    returned by ``binary_search(candidate_bid_prices, win_price)`` up to the
    ad group's current stop index (non-cumulative bookkeeping). After a
    record has been credited, the stop index is re-derived so that candidates
    whose accumulated cost (or clicks) reached half of the corresponding
    total in ``stat_result`` stop accumulating.

    Args:
        bid_history_files: list of joined bid-log file names.
        the_ctr_model: CTR model; ``predict_ctr(feature_values)`` is called
            for every usable record.
        bid_model: dict keyed by ``(campaign_id, adgroup_id)``; values must
            provide ``get_bids_auc``, ``bid_strategy_type`` and
            ``fixed_parameter``.
        candidate_paras: dict ``camp_grp -> sequence of candidate
            parameters``. Ad groups with an empty sequence are ignored.
        condition_type: dict ``camp_grp -> 'cost' or 'click'`` choosing which
            total is capped; any other value leaves the stop index untouched.
            Ad groups missing from this dict are not replayed.
        stat_result: dict ``camp_grp -> {'cost': ..., 'click': ...}``; half
            of the selected total is the cap.

    Returns:
        ``paras_perf[camp_grp][para]``: dict with the keys ``impression``,
        ``avg_pctr``, ``click``, ``cost``, ``ctr``, ``ecpc`` and ``cpm``.
    """
    # Ad groups without any candidate parameter cannot be replayed.
    candidate_paras = {
        camp_grp: paras
        for camp_grp, paras in candidate_paras.items()
        if len(paras) != 0
    }

    bpo_logger.info(
        '**************history_repeat function start**************')
    bpo_logger.info('files:%s' % ','.join(bid_history_files))
    bpo_logger.info('************** bid_model: %s**************' %
                    (bid_model.keys(), ))
    bpo_logger.info('************** candidate_paras: %s**************' %
                    (candidate_paras, ))
    bpo_logger.info('************** condition_type: %s**************' %
                    (condition_type, ))

    # Pre-create one counter dict per (ad group, candidate index). Only ad
    # groups that have candidates AND a condition type are replayed.
    stop_flag_idx = {}
    result = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in candidate_paras or camp_grp not in condition_type:
            continue
        tmp_num = len(candidate_paras[camp_grp])
        stop_flag_idx[camp_grp] = tmp_num
        result[camp_grp] = [{
            'impression': 0,
            'avg_pctr': 0,
            'click': 0,
            'cost': 0
        } for _ in range(tmp_num)]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        # The context manager closes each log file as soon as it is consumed.
        with open(bid_history_file) as history_fp:
            for line in history_fp:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                # Test for a missing price BEFORE converting: float(None)
                # would raise instead of letting the record be skipped.
                raw_win_price = field_dict['win_price']
                if raw_win_price is None:
                    continue
                win_price = float(raw_win_price)
                camp_grp = (field_dict['campaign_id'],
                            field_dict['adgroup_id'])
                if win_price <= 0 or camp_grp not in result:
                    continue

                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(
                    ctr=pctr, variable_paras=candidate_paras[camp_grp])
                # Presumably the index of the first candidate whose bid is
                # >= win_price (going by the variable name) -- confirm against
                # binary_search.
                first_ge_idx = binary_search(candidate_bid_prices, win_price)
                stop_idx = stop_flag_idx[camp_grp]
                if first_ge_idx >= stop_idx:
                    continue

                # Non-cumulative mode: credit the record to every candidate
                # in [first_ge_idx, stop_idx).
                counters = result[camp_grp]
                clicked = 1 if field_dict['click_flag'] == True else 0
                for tmp_idx in range(first_ge_idx, stop_idx):
                    counters[tmp_idx]['impression'] += 1
                    counters[tmp_idx]['avg_pctr'] += pctr
                    counters[tmp_idx]['click'] += clicked
                    counters[tmp_idx]['cost'] += win_price

                # Resource cap: 50% of total cost or 50% of total clicks.
                # Dividing by 2.0 keeps the threshold exact when the totals
                # are integers (Python 2 would otherwise truncate).
                if condition_type[camp_grp] == 'cost':
                    stop_flag_idx[camp_grp] = binary_search(
                        [counters[idx]['cost'] for idx in range(stop_idx)],
                        stat_result[camp_grp]['cost'] / 2.0)
                elif condition_type[camp_grp] == 'click':
                    stop_flag_idx[camp_grp] = binary_search(
                        [counters[idx]['click'] for idx in range(stop_idx)],
                        stat_result[camp_grp]['click'] / 2.0)

    # Derived metrics: ctr / avg_pctr / ecpc / cpm (0 when the denominator
    # is 0).
    for camp_grp in result:
        for tmp_dict in result[camp_grp]:
            # Unit change; cpm below is the price per thousand impressions.
            tmp_dict['cost'] /= 1000
            impression = tmp_dict['impression']
            click = tmp_dict['click']
            tmp_dict['ctr'] = (
                (click + 0.0) / impression) if impression != 0 else 0
            tmp_dict['avg_pctr'] = (
                (tmp_dict['avg_pctr'] + 0.0) /
                impression) if impression != 0 else 0
            tmp_dict['ecpc'] = tmp_dict['cost'] / click if click != 0 else 0
            tmp_dict['cpm'] = (
                1000 * tmp_dict['cost'] / impression) if impression != 0 else 0

    # Key the counters by the parameter value instead of its index.
    paras_perf = {
        camp_grp: dict(zip(candidate_paras[camp_grp], result[camp_grp]))
        for camp_grp in result
    }

    # Log one performance table per ad group.
    bpo_logger.info('**************history_repeat function end**************')
    formatter = 'paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n'
    row_format = ' %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n'
    for camp_grp in paras_perf.keys():
        log_parts = [
            'bid_history_files:%s\n' % ','.join(bid_history_files),
            formatter.format(
                camp=camp_grp[0],
                grp=camp_grp[1],
                strategy=bid_model[camp_grp].bid_strategy_type,
                para=bid_model[camp_grp].fixed_parameter),
            'condition type:%s\n' % condition_type[camp_grp],
            'para click imp cost cpc ctr avg_pctr cpm\n',
            'unit: US dollar\n',
        ]
        for para in sorted(paras_perf[camp_grp].keys()):
            log_parts.append(
                str(para) + row_format % paras_perf[camp_grp][para])
        bpo_logger.info('************parameter performance start************')
        bpo_logger.info(''.join(log_parts))
        bpo_logger.info('************parameter performance end************')
    return paras_perf
def history_repeat(bid_history_files, the_ctr_model, bid_model,
                   candidate_paras):
    """Replay bid history without a resource cap and score each candidate parameter.

    Every usable history record is re-bid with all candidate parameters of
    its ad group and booked on the single index returned by
    ``binary_search(candidate_bid_prices, win_price)``. A prefix sum over the
    candidate indices then turns the per-index counters into cumulative
    totals (cumulative bookkeeping: faster than crediting a range per
    record, but it cannot support a resource cap; see
    ``history_repeat_conditional`` for the capped variant).

    Args:
        bid_history_files: list of joined bid-log file names.
        the_ctr_model: CTR model; ``predict_ctr(feature_values)`` is called
            for every usable record.
        bid_model: dict keyed by ``(campaign_id, adgroup_id)``; values must
            provide ``get_bids_auc``, ``bid_strategy_type`` and
            ``fixed_parameter``.
        candidate_paras: dict ``camp_grp -> sequence of candidate
            parameters``. Ad groups with an empty sequence are ignored.

    Returns:
        ``paras_perf[camp_grp][para]``: dict with the keys ``impression``,
        ``avg_pctr``, ``click``, ``cost``, ``ctr``, ``ecpc`` and ``cpm``.
    """
    # Ad groups without any candidate parameter cannot be replayed.
    candidate_paras = {
        camp_grp: paras
        for camp_grp, paras in candidate_paras.items()
        if len(paras) != 0
    }

    bpo_logger.info(
        '**************history_repeat function start**************')
    bpo_logger.info('files:%s' % ','.join(bid_history_files))
    bpo_logger.info('************** bid_model: %s**************' %
                    (bid_model.keys(), ))
    bpo_logger.info('************** candidate_paras: %s**************' %
                    (candidate_paras, ))

    # Pre-create one counter dict per (ad group, candidate index). Ad groups
    # that lost all candidates in the filter above are skipped; indexing
    # candidate_paras with them would raise KeyError.
    result = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in candidate_paras:
            continue
        result[camp_grp] = [{
            'impression': 0,
            'avg_pctr': 0,
            'click': 0,
            'cost': 0
        } for _ in range(len(candidate_paras[camp_grp]))]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        # The context manager closes each log file as soon as it is consumed.
        with open(bid_history_file) as history_fp:
            for line in history_fp:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                # Test for a missing price BEFORE converting: float(None)
                # would raise instead of letting the record be skipped.
                raw_win_price = field_dict['win_price']
                if raw_win_price is None:
                    continue
                win_price = float(raw_win_price)
                camp_grp = (field_dict['campaign_id'],
                            field_dict['adgroup_id'])
                # result holds exactly the ad groups present in both
                # bid_model and candidate_paras; other requests are ignored.
                if win_price <= 0 or camp_grp not in result:
                    continue

                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(
                    ctr=pctr, variable_paras=candidate_paras[camp_grp])
                # Presumably the index of the first candidate whose bid is
                # >= win_price (going by the variable name) -- confirm against
                # binary_search.
                first_ge_idx = binary_search(candidate_bid_prices, win_price)

                counters = result[camp_grp]
                if first_ge_idx < len(counters):
                    bucket = counters[first_ge_idx]
                    bucket['impression'] += 1
                    bucket['avg_pctr'] += pctr
                    bucket['click'] += (
                        1 if field_dict['click_flag'] == True else 0)
                    bucket['cost'] += win_price

    # Prefix sum: index i ends up holding the totals of indices 0..i.
    for camp_grp in result:
        counters = result[camp_grp]
        for idx in range(1, len(counters)):
            for tmp_metric in ('impression', 'avg_pctr', 'click', 'cost'):
                counters[idx][tmp_metric] += counters[idx - 1][tmp_metric]

    # Derived metrics: ctr / avg_pctr / ecpc / cpm (0 when the denominator
    # is 0).
    for camp_grp in result:
        for tmp_dict in result[camp_grp]:
            tmp_dict['cost'] /= 1000  # unit change
            impression = tmp_dict['impression']
            click = tmp_dict['click']
            tmp_dict['ctr'] = (
                (click + 0.0) / impression) if impression != 0 else 0
            tmp_dict['avg_pctr'] = (
                (tmp_dict['avg_pctr'] + 0.0) /
                impression) if impression != 0 else 0
            tmp_dict['ecpc'] = tmp_dict['cost'] / click if click != 0 else 0
            tmp_dict['cpm'] = (
                1000 * tmp_dict['cost'] / impression) if impression != 0 else 0

    # Key the counters by the parameter value instead of its index.
    paras_perf = {
        camp_grp: dict(zip(candidate_paras[camp_grp], result[camp_grp]))
        for camp_grp in result
    }

    # Log one performance table per ad group.
    bpo_logger.info('**************history_repeat function end**************')
    formatter = 'paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n'
    row_format = ' %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n'
    for camp_grp in paras_perf.keys():
        log_parts = [
            'bid_history_files:%s\n' % ','.join(bid_history_files),
            formatter.format(
                camp=camp_grp[0],
                grp=camp_grp[1],
                strategy=bid_model[camp_grp].bid_strategy_type,
                para=bid_model[camp_grp].fixed_parameter),
            'para click imp cost cpc ctr avg_pctr cpm\n',
            'unit: US dollar\n',
        ]
        for para in sorted(paras_perf[camp_grp].keys()):
            log_parts.append(
                str(para) + row_format % paras_perf[camp_grp][para])
        bpo_logger.info('************parameter performance start************')
        bpo_logger.info(''.join(log_parts))
        bpo_logger.info('************parameter performance end************')
    return paras_perf
def bid_paras_optimization_middle(bid_history_files, the_ctr_model, bid_model,
                                  min_para, max_para, max_ecpc, cny_to_usd,
                                  stat_result, status_file):
    """Pick the best bid parameter per ad group using a capped history replay.

    For every ad group in ``bid_model`` a set of candidate parameters is
    generated, the bid history is replayed with
    ``history_repeat_conditional`` and the candidate with the most clicks
    among those passing the click and eCPC filters is selected. Per-campaign
    state (``last_middle_ecpc``, ``step_size``) is read from and written back
    to ``status_file``.

    Args:
        bid_history_files: list of joined log files (won auctions only,
            according to the original notes).
        the_ctr_model: CTR model handed to the replay.
        bid_model: dict keyed by ``(campaign_id, adgroup_id)``; values must
            provide ``get_paras_by_log`` and ``bid_strategy_type``.
        min_para: lower bound of the candidate parameter range.
        max_para: upper bound of the candidate parameter range.
        max_ecpc: eCPC cap; either one number or a dict keyed by campaign id.
        cny_to_usd: factor applied to the configured ``target_ecpc``.
        stat_result: dict ``camp_grp -> {'cost': ..., 'click': ...}`` with
            the delivered totals.
        status_file: path of the INI file; sections are campaign ids, with
            options ``last_middle_ecpc``, ``target_ecpc`` and ``step_size``.

    Returns:
        ``optimal_paras[camp_grp]``: ``[optimal_para]`` or ``[]`` when no
        candidate qualifies. Ad groups that were not replayed are absent.

    Raises:
        NameError: when all candidates that passed the filters share the
            same rounded eCPC (this includes a single surviving candidate).
    """
    # Load the per-campaign state that decides how the replay is capped.
    status_config = ConfigParser.ConfigParser()
    status_config.read(status_file)

    candidate_paras = {}
    condition_type = {}
    for camp_grp in bid_model.keys():
        # The central parameter is the mean of the allowed range.
        candidate_paras[camp_grp] = bid_model[camp_grp].get_paras_by_log(
            central_para=(min_para + max_para + 0.0) / 2,
            max_para=max_para,
            min_para=min_para,
            range_num=200)
        section = str(camp_grp[0])
        if not status_config.has_option(section, 'last_middle_ecpc'):
            continue
        stat_click = stat_result[camp_grp]['click']
        if stat_click == 0:
            # The delivered eCPC is undefined without clicks; skip this ad
            # group instead of aborting the whole run on ZeroDivisionError.
            bpo_logger.warn(
                'for camp_grp %s, stat_result has no click, skip optimization'
                % (camp_grp, ))
            continue
        # Compare the eCPC used by the previous optimisation with the eCPC
        # actually delivered; float() avoids Python 2 integer truncation.
        tmp_ecpc = float(stat_result[camp_grp]['cost']) / stat_click
        if float(status_config.get(section, 'last_middle_ecpc')) > tmp_ecpc:
            condition_type[camp_grp] = "cost"
        else:
            condition_type[camp_grp] = "click"

    paras_perf = history_repeat_conditional(bid_history_files, the_ctr_model,
                                            bid_model, candidate_paras,
                                            condition_type, stat_result)

    optimal_paras = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in paras_perf:
            continue
        perf = paras_perf[camp_grp]
        # ConfigParser section names are strings; use the same spelling as
        # the has_option() lookup above.
        section = str(camp_grp[0])

        # eCPC per parameter, rounded to 4 decimals.
        tmp_paras_ecpc = {para: round(perf[para]['ecpc'], 4) for para in perf}
        # Only parameters with at least 10 clicks are considered; results
        # based on fewer clicks are too unstable.
        para_list = [para for para in perf if perf[para]['click'] >= 10]
        if not para_list:
            bpo_logger.warn(
                'for camp_grp %s and bid strategy %s, click # of all parameters < 10 '
                % (camp_grp, bid_model[camp_grp].bid_strategy_type))
            optimal_paras[camp_grp] = []
            continue

        # Blend of the target eCPC and the previous "middle" eCPC; the
        # target's weight is 1 / 2**k. 1.0 forces true division: with
        # integer division the weight would be 0 for every k >= 1.
        target_ecpc = cny_to_usd * float(
            status_config.get(section, 'target_ecpc')) / 1000000
        last_middle_ecpc = float(status_config.get(section, 'last_middle_ecpc'))
        k = int(status_config.get(section, 'step_size'))
        target_weight = 1.0 / (2**k)
        modified_ecpc = (target_ecpc * target_weight +
                         last_middle_ecpc * (1 - target_weight))
        bpo_logger.info(
            'target_ecpc%.3f,last_middle_ecpc%.3f, modified_ecpc%.3f' %
            (target_ecpc, last_middle_ecpc, modified_ecpc))

        # NOTE(review): the blended value above is only logged. The
        # threshold that is applied below -- and persisted as
        # last_middle_ecpc -- is max_ecpc. Confirm whether the blend was
        # meant to be used instead.
        if isinstance(max_ecpc, dict):
            modified_ecpc = max_ecpc[camp_grp[0]]
        else:
            modified_ecpc = max_ecpc

        # Keep the parameters whose eCPC is below the cap.
        para_list = [
            para for para in para_list if tmp_paras_ecpc[para] < modified_ecpc
        ]
        if not para_list:
            bpo_logger.warn(
                'for camp_grp %s and bid strategy %s, ecpc # of all parameters < %.f '
                % (camp_grp, bid_model[camp_grp].bid_strategy_type,
                   modified_ecpc))
            # No parameter is below the cap: fall back to the parameter with
            # the lowest eCPC among ALL candidates.
            optimal_paras[camp_grp] = [
                min(tmp_paras_ecpc.keys(), key=lambda x: tmp_paras_ecpc[x])
            ]
            continue

        para_list.sort()

        # Strip the plateaus at both ends: leading parameters sharing the
        # first eCPC and trailing parameters sharing the last eCPC.
        ecpc_seq = [tmp_paras_ecpc[para] for para in para_list]
        first_para_ecpc, last_para_ecpc = ecpc_seq[0], ecpc_seq[-1]
        max_margin_ecpc = max(last_para_ecpc, first_para_ecpc)
        bpo_logger.info('max_margin_ecpc:%s ' % str(max_margin_ecpc))
        if all(ecpc == first_para_ecpc for ecpc in ecpc_seq):
            # NameError is kept because callers may rely on this type.
            raise NameError('parameter not change ecpc performance ')
        first_effective_idx = 0
        while ecpc_seq[first_effective_idx] == first_para_ecpc:
            first_effective_idx += 1
        last_effective_idx = len(ecpc_seq)
        while ecpc_seq[last_effective_idx - 1] == last_para_ecpc:
            last_effective_idx -= 1
        para_list = para_list[first_effective_idx:last_effective_idx]
        para_list = [
            para for para in para_list
            if tmp_paras_ecpc[para] < max_margin_ecpc * 0.8
        ]

        # Among the remaining parameters take the one with the most clicks;
        # ties resolve to the largest parameter.
        if para_list:
            optimal_paras[camp_grp] = [
                max(reversed(para_list), key=lambda para: perf[para]['click'])
            ]
        else:
            optimal_paras[camp_grp] = []

        # Persist the state. Values are stored as strings: a non-string
        # value breaks the next interpolating get() for another ad group of
        # the same campaign.
        # NOTE(review): step_size grows once per ad group, not once per
        # campaign -- confirm this is intended.
        status_config.set(section, 'last_middle_ecpc', str(modified_ecpc))
        status_config.set(
            section, 'step_size',
            str(int(status_config.get(section, 'step_size')) + 1))
        with open(status_file, 'w') as status_fp:
            status_config.write(status_fp)
    return optimal_paras
def bid_paras_optimization(bid_history_files, the_ctr_model, bid_model,
                           min_para, max_para, max_ecpc):
    """Pick the best bid parameter per ad group using an uncapped history replay.

    For every ad group in ``bid_model`` a set of candidate parameters is
    generated, the bid history is replayed with ``history_repeat`` and the
    candidate with the most clicks among those passing the click and eCPC
    filters is selected.

    Args:
        bid_history_files: list of joined log files (won auctions only,
            according to the original notes).
        the_ctr_model: CTR model handed to the replay.
        bid_model: dict keyed by ``(campaign_id, adgroup_id)``; values must
            provide ``get_paras_by_log`` and ``bid_strategy_type``.
        min_para: lower bound of the candidate parameter range.
        max_para: upper bound of the candidate parameter range.
        max_ecpc: eCPC cap; either one number or a dict keyed by campaign id.

    Returns:
        ``optimal_paras[camp_grp]``: ``[optimal_para]`` or ``[]`` when no
        candidate qualifies. Ad groups missing from the replay are absent.

    Raises:
        NameError: when all candidates that passed the filters share the
            same rounded eCPC (this includes a single surviving candidate).
    """
    candidate_paras = {}
    for camp_grp in bid_model.keys():
        # The central parameter is the mean of the allowed range.
        candidate_paras[camp_grp] = bid_model[camp_grp].get_paras_by_log(
            central_para=(min_para + max_para + 0.0) / 2,
            max_para=max_para,
            min_para=min_para,
            range_num=200)

    paras_perf = history_repeat(bid_history_files, the_ctr_model, bid_model,
                                candidate_paras)

    optimal_paras = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in paras_perf:
            continue
        perf = paras_perf[camp_grp]

        # eCPC per parameter, rounded to 4 decimals.
        tmp_paras_ecpc = {para: round(perf[para]['ecpc'], 4) for para in perf}
        # Only parameters with at least 10 clicks are considered; results
        # based on fewer clicks are too unstable.
        para_list = [para for para in perf if perf[para]['click'] >= 10]
        if not para_list:
            bpo_logger.warn(
                'for camp_grp %s and bid strategy %s, click # of all parameters < 10 '
                % (camp_grp, bid_model[camp_grp].bid_strategy_type))
            optimal_paras[camp_grp] = []
            continue

        # Keep the parameters whose eCPC is below the cap.
        if isinstance(max_ecpc, dict):
            tmp_max_ecpc = max_ecpc[camp_grp[0]]
        else:
            tmp_max_ecpc = max_ecpc
        para_list = [
            para for para in para_list if tmp_paras_ecpc[para] < tmp_max_ecpc
        ]
        if not para_list:
            bpo_logger.warn(
                'for camp_grp %s and bid strategy %s, ecpc # of all parameters < %.f '
                % (camp_grp, bid_model[camp_grp].bid_strategy_type,
                   tmp_max_ecpc))
            # No parameter is below the cap: fall back to the parameter with
            # the lowest eCPC among ALL candidates.
            optimal_paras[camp_grp] = [
                min(tmp_paras_ecpc.keys(), key=lambda x: tmp_paras_ecpc[x])
            ]
            continue

        para_list.sort()

        # Strip the plateaus at both ends: leading parameters sharing the
        # first eCPC and trailing parameters sharing the last eCPC.
        ecpc_seq = [tmp_paras_ecpc[para] for para in para_list]
        first_para_ecpc, last_para_ecpc = ecpc_seq[0], ecpc_seq[-1]
        max_margin_ecpc = max(last_para_ecpc, first_para_ecpc)
        bpo_logger.info('max_margin_ecpc:%s ' % str(max_margin_ecpc))
        if all(ecpc == first_para_ecpc for ecpc in ecpc_seq):
            # NameError is kept because callers may rely on this type.
            raise NameError('parameter not change ecpc performance ')
        first_effective_idx = 0
        while ecpc_seq[first_effective_idx] == first_para_ecpc:
            first_effective_idx += 1
        last_effective_idx = len(ecpc_seq)
        while ecpc_seq[last_effective_idx - 1] == last_para_ecpc:
            last_effective_idx -= 1
        para_list = para_list[first_effective_idx:last_effective_idx]
        para_list = [
            para for para in para_list
            if tmp_paras_ecpc[para] < max_margin_ecpc * 0.9
        ]

        # Among the remaining parameters take the one with the most clicks;
        # ties resolve to the largest parameter.
        if para_list:
            optimal_paras[camp_grp] = [
                max(reversed(para_list), key=lambda para: perf[para]['click'])
            ]
        else:
            optimal_paras[camp_grp] = []
    return optimal_paras
def history_repeat(bid_history_files, the_ctr_model, bid_model,
                   candidate_paras):
    """Replay bid history without a resource cap and score each candidate parameter.

    Every usable history record is re-bid with all candidate parameters of
    its ad group and booked on the single index returned by
    ``binary_search(candidate_bid_prices, win_price)``. A prefix sum over the
    candidate indices then turns the per-index counters into cumulative
    totals (cumulative bookkeeping: faster than crediting a range per
    record, but it cannot support a resource cap; see
    ``history_repeat_conditional`` for the capped variant).

    Args:
        bid_history_files: list of joined bid-log file names.
        the_ctr_model: CTR model; ``predict_ctr(feature_values)`` is called
            for every usable record.
        bid_model: dict keyed by ``(campaign_id, adgroup_id)``; values must
            provide ``get_bids_auc``, ``bid_strategy_type`` and
            ``fixed_parameter``.
        candidate_paras: dict ``camp_grp -> sequence of candidate
            parameters``. Ad groups with an empty sequence are ignored.

    Returns:
        ``paras_perf[camp_grp][para]``: dict with the keys ``impression``,
        ``avg_pctr``, ``click``, ``cost``, ``ctr``, ``ecpc`` and ``cpm``.
    """
    # Ad groups without any candidate parameter cannot be replayed.
    candidate_paras = {
        camp_grp: paras
        for camp_grp, paras in candidate_paras.items()
        if len(paras) != 0
    }

    bpo_logger.info('**************history_repeat function start**************')
    bpo_logger.info('files:%s' % ','.join(bid_history_files))
    bpo_logger.info('************** bid_model: %s**************' %
                    (bid_model.keys(), ))
    bpo_logger.info('************** candidate_paras: %s**************' %
                    (candidate_paras, ))

    # Pre-create one counter dict per (ad group, candidate index). Ad groups
    # that lost all candidates in the filter above are skipped; indexing
    # candidate_paras with them would raise KeyError.
    result = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in candidate_paras:
            continue
        result[camp_grp] = [
            {'impression': 0, 'avg_pctr': 0, 'click': 0, 'cost': 0}
            for _ in range(len(candidate_paras[camp_grp]))
        ]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        # The context manager closes each log file as soon as it is consumed.
        with open(bid_history_file) as history_fp:
            for line in history_fp:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                # Test for a missing price BEFORE converting: float(None)
                # would raise instead of letting the record be skipped.
                raw_win_price = field_dict['win_price']
                if raw_win_price is None:
                    continue
                win_price = float(raw_win_price)
                camp_grp = (field_dict['campaign_id'], field_dict['adgroup_id'])
                # result holds exactly the ad groups present in both
                # bid_model and candidate_paras; other requests are ignored.
                if win_price <= 0 or camp_grp not in result:
                    continue

                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(
                    ctr=pctr, variable_paras=candidate_paras[camp_grp])
                # Presumably the index of the first candidate whose bid is
                # >= win_price (going by the variable name) -- confirm against
                # binary_search.
                first_ge_idx = binary_search(candidate_bid_prices, win_price)

                counters = result[camp_grp]
                if first_ge_idx < len(counters):
                    bucket = counters[first_ge_idx]
                    bucket['impression'] += 1
                    bucket['avg_pctr'] += pctr
                    bucket['click'] += (1 if field_dict['click_flag'] == True else 0)
                    bucket['cost'] += win_price

    # Prefix sum: index i ends up holding the totals of indices 0..i.
    for camp_grp in result:
        counters = result[camp_grp]
        for idx in range(1, len(counters)):
            for tmp_metric in ('impression', 'avg_pctr', 'click', 'cost'):
                counters[idx][tmp_metric] += counters[idx - 1][tmp_metric]

    # Derived metrics: ctr / avg_pctr / ecpc / cpm (0 when the denominator
    # is 0).
    for camp_grp in result:
        for tmp_dict in result[camp_grp]:
            tmp_dict['cost'] /= 1000  # unit change
            impression = tmp_dict['impression']
            click = tmp_dict['click']
            tmp_dict['ctr'] = ((click + 0.0) / impression) if impression != 0 else 0
            tmp_dict['avg_pctr'] = ((tmp_dict['avg_pctr'] + 0.0) / impression) if impression != 0 else 0
            tmp_dict['ecpc'] = tmp_dict['cost'] / click if click != 0 else 0
            tmp_dict['cpm'] = (1000 * tmp_dict['cost'] / impression) if impression != 0 else 0

    # Key the counters by the parameter value instead of its index.
    paras_perf = {
        camp_grp: dict(zip(candidate_paras[camp_grp], result[camp_grp]))
        for camp_grp in result
    }

    # Log one performance table per ad group.
    bpo_logger.info('**************history_repeat function end**************')
    formatter = 'paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n'
    row_format = ' %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n'
    for camp_grp in paras_perf.keys():
        log_parts = [
            'bid_history_files:%s\n' % ','.join(bid_history_files),
            formatter.format(
                camp=camp_grp[0],
                grp=camp_grp[1],
                strategy=bid_model[camp_grp].bid_strategy_type,
                para=bid_model[camp_grp].fixed_parameter),
            'para click imp cost cpc ctr avg_pctr cpm\n',
            'unit: US dollar\n',
        ]
        for para in sorted(paras_perf[camp_grp].keys()):
            log_parts.append(str(para) + row_format % paras_perf[camp_grp][para])
        bpo_logger.info('************parameter performance start************')
        bpo_logger.info(''.join(log_parts))
        bpo_logger.info('************parameter performance end************')
    return paras_perf
def bid_paras_optimization(bid_history_files, the_ctr_model, bid_model,
                           min_para, max_para, max_ecpc):
    """Pick the best bid parameter per ad group using an uncapped history replay.

    For every ad group in ``bid_model`` a set of candidate parameters is
    generated, the bid history is replayed with ``history_repeat`` and the
    candidate with the most clicks among those passing the click and eCPC
    filters is selected.

    Args:
        bid_history_files: list of joined log files (won auctions only,
            according to the original notes).
        the_ctr_model: CTR model handed to the replay.
        bid_model: dict keyed by ``(campaign_id, adgroup_id)``; values must
            provide ``get_paras_by_log`` and ``bid_strategy_type``.
        min_para: lower bound of the candidate parameter range.
        max_para: upper bound of the candidate parameter range.
        max_ecpc: eCPC cap; either one number or a dict keyed by campaign id.

    Returns:
        ``optimal_paras[camp_grp]``: ``[optimal_para]`` or ``[]`` when no
        candidate qualifies. Ad groups missing from the replay are absent.

    Raises:
        NameError: when all candidates that passed the filters share the
            same rounded eCPC (this includes a single surviving candidate).
    """
    candidate_paras = {}
    for camp_grp in bid_model.keys():
        # The central parameter is the mean of the allowed range.
        candidate_paras[camp_grp] = bid_model[camp_grp].get_paras_by_log(
            central_para=(min_para + max_para + 0.0) / 2,
            max_para=max_para, min_para=min_para, range_num=200)

    paras_perf = history_repeat(bid_history_files, the_ctr_model, bid_model, candidate_paras)

    optimal_paras = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in paras_perf:
            continue
        perf = paras_perf[camp_grp]

        # eCPC per parameter, rounded to 4 decimals.
        tmp_paras_ecpc = {para: round(perf[para]['ecpc'], 4) for para in perf}
        # Only parameters with at least 10 clicks are considered; results
        # based on fewer clicks are too unstable.
        para_list = [para for para in perf if perf[para]['click'] >= 10]
        if not para_list:
            bpo_logger.warn('for camp_grp %s and bid strategy %s, click # of all parameters < 10 ' % (camp_grp, bid_model[camp_grp].bid_strategy_type))
            optimal_paras[camp_grp] = []
            continue

        # Keep the parameters whose eCPC is below the cap.
        if isinstance(max_ecpc, dict):
            tmp_max_ecpc = max_ecpc[camp_grp[0]]
        else:
            tmp_max_ecpc = max_ecpc
        para_list = [para for para in para_list if tmp_paras_ecpc[para] < tmp_max_ecpc]
        if not para_list:
            bpo_logger.warn('for camp_grp %s and bid strategy %s, ecpc # of all parameters < %.f ' % (camp_grp, bid_model[camp_grp].bid_strategy_type, tmp_max_ecpc))
            # No parameter is below the cap: fall back to the parameter with
            # the lowest eCPC among ALL candidates.
            optimal_paras[camp_grp] = [min(tmp_paras_ecpc.keys(), key=lambda x: tmp_paras_ecpc[x])]
            continue

        para_list.sort()

        # Strip the plateaus at both ends: leading parameters sharing the
        # first eCPC and trailing parameters sharing the last eCPC.
        ecpc_seq = [tmp_paras_ecpc[para] for para in para_list]
        first_para_ecpc, last_para_ecpc = ecpc_seq[0], ecpc_seq[-1]
        max_margin_ecpc = max(last_para_ecpc, first_para_ecpc)
        bpo_logger.info('max_margin_ecpc:%s ' % str(max_margin_ecpc))
        if all(ecpc == first_para_ecpc for ecpc in ecpc_seq):
            # NameError is kept because callers may rely on this type.
            raise NameError('parameter not change ecpc performance ')
        first_effective_idx = 0
        while ecpc_seq[first_effective_idx] == first_para_ecpc:
            first_effective_idx += 1
        last_effective_idx = len(ecpc_seq)
        while ecpc_seq[last_effective_idx - 1] == last_para_ecpc:
            last_effective_idx -= 1
        para_list = para_list[first_effective_idx:last_effective_idx]
        para_list = [para for para in para_list if tmp_paras_ecpc[para] < max_margin_ecpc * 0.9]

        # Among the remaining parameters take the one with the most clicks;
        # ties resolve to the largest parameter.
        if para_list:
            optimal_paras[camp_grp] = [max(reversed(para_list), key=lambda para: perf[para]['click'])]
        else:
            optimal_paras[camp_grp] = []
    return optimal_paras
def history_repeat_conditional(
    bid_history_files, the_ctr_model, bid_model, candidate_paras, condition_type, stat_result
):
    """Replay bid history under a resource cap and score each candidate parameter.

    Every usable history record is re-bid with all candidate parameters of
    its ad group. The record is credited to each candidate from the index
    returned by ``binary_search(candidate_bid_prices, win_price)`` up to the
    ad group's current stop index (non-cumulative bookkeeping). After a
    record has been credited, the stop index is re-derived so that candidates
    whose accumulated cost (or clicks) reached half of the corresponding
    total in ``stat_result`` stop accumulating.

    Args:
        bid_history_files: list of joined bid-log file names.
        the_ctr_model: CTR model; ``predict_ctr(feature_values)`` is called
            for every usable record.
        bid_model: dict keyed by ``(campaign_id, adgroup_id)``; values must
            provide ``get_bids_auc``, ``bid_strategy_type`` and
            ``fixed_parameter``.
        candidate_paras: dict ``camp_grp -> sequence of candidate
            parameters``. Ad groups with an empty sequence are ignored.
        condition_type: dict ``camp_grp -> 'cost' or 'click'`` choosing which
            total is capped; any other value leaves the stop index untouched.
            Ad groups missing from this dict are not replayed.
        stat_result: dict ``camp_grp -> {'cost': ..., 'click': ...}``; half
            of the selected total is the cap.

    Returns:
        ``paras_perf[camp_grp][para]``: dict with the keys ``impression``,
        ``avg_pctr``, ``click``, ``cost``, ``ctr``, ``ecpc`` and ``cpm``.
    """
    # Ad groups without any candidate parameter cannot be replayed.
    candidate_paras = {camp_grp: paras for camp_grp, paras in candidate_paras.items() if len(paras) != 0}

    bpo_logger.info("**************history_repeat function start**************")
    bpo_logger.info("files:%s" % ",".join(bid_history_files))
    bpo_logger.info("************** bid_model: %s**************" % (bid_model.keys(),))
    bpo_logger.info("************** candidate_paras: %s**************" % (candidate_paras,))
    bpo_logger.info("************** condition_type: %s**************" % (condition_type,))

    # Pre-create one counter dict per (ad group, candidate index). Only ad
    # groups that have candidates AND a condition type are replayed.
    stop_flag_idx = {}
    result = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in candidate_paras or camp_grp not in condition_type:
            continue
        tmp_num = len(candidate_paras[camp_grp])
        stop_flag_idx[camp_grp] = tmp_num
        result[camp_grp] = [{"impression": 0, "avg_pctr": 0, "click": 0, "cost": 0} for _ in range(tmp_num)]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        # The context manager closes each log file as soon as it is consumed.
        with open(bid_history_file) as history_fp:
            for line in history_fp:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                # Test for a missing price BEFORE converting: float(None)
                # would raise instead of letting the record be skipped.
                raw_win_price = field_dict["win_price"]
                if raw_win_price is None:
                    continue
                win_price = float(raw_win_price)
                camp_grp = (field_dict["campaign_id"], field_dict["adgroup_id"])
                if win_price <= 0 or camp_grp not in result:
                    continue

                pctr = the_ctr_model.predict_ctr(field_dict["feature_values"])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(
                    ctr=pctr, variable_paras=candidate_paras[camp_grp]
                )
                # Presumably the index of the first candidate whose bid is
                # >= win_price (going by the variable name) -- confirm against
                # binary_search.
                first_ge_idx = binary_search(candidate_bid_prices, win_price)
                stop_idx = stop_flag_idx[camp_grp]
                if first_ge_idx >= stop_idx:
                    continue

                # Non-cumulative mode: credit the record to every candidate
                # in [first_ge_idx, stop_idx).
                counters = result[camp_grp]
                clicked = 1 if field_dict["click_flag"] == True else 0
                for tmp_idx in range(first_ge_idx, stop_idx):
                    counters[tmp_idx]["impression"] += 1
                    counters[tmp_idx]["avg_pctr"] += pctr
                    counters[tmp_idx]["click"] += clicked
                    counters[tmp_idx]["cost"] += win_price

                # Resource cap: 50% of total cost or 50% of total clicks.
                # Dividing by 2.0 keeps the threshold exact when the totals
                # are integers (Python 2 would otherwise truncate).
                if condition_type[camp_grp] == "cost":
                    stop_flag_idx[camp_grp] = binary_search(
                        [counters[idx]["cost"] for idx in range(stop_idx)],
                        stat_result[camp_grp]["cost"] / 2.0,
                    )
                elif condition_type[camp_grp] == "click":
                    stop_flag_idx[camp_grp] = binary_search(
                        [counters[idx]["click"] for idx in range(stop_idx)],
                        stat_result[camp_grp]["click"] / 2.0,
                    )

    # Derived metrics: ctr / avg_pctr / ecpc / cpm (0 when the denominator
    # is 0).
    for camp_grp in result:
        for tmp_dict in result[camp_grp]:
            # Unit change; cpm below is the price per thousand impressions.
            tmp_dict["cost"] /= 1000
            impression = tmp_dict["impression"]
            click = tmp_dict["click"]
            tmp_dict["ctr"] = ((click + 0.0) / impression) if impression != 0 else 0
            tmp_dict["avg_pctr"] = ((tmp_dict["avg_pctr"] + 0.0) / impression) if impression != 0 else 0
            tmp_dict["ecpc"] = tmp_dict["cost"] / click if click != 0 else 0
            tmp_dict["cpm"] = (1000 * tmp_dict["cost"] / impression) if impression != 0 else 0

    # Key the counters by the parameter value instead of its index.
    paras_perf = {camp_grp: dict(zip(candidate_paras[camp_grp], result[camp_grp])) for camp_grp in result}

    # Log one performance table per ad group.
    bpo_logger.info("**************history_repeat function end**************")
    formatter = "paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n"
    row_format = " %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n"
    for camp_grp in paras_perf.keys():
        log_parts = [
            "bid_history_files:%s\n" % ",".join(bid_history_files),
            formatter.format(
                camp=camp_grp[0],
                grp=camp_grp[1],
                strategy=bid_model[camp_grp].bid_strategy_type,
                para=bid_model[camp_grp].fixed_parameter,
            ),
            "condition type:%s\n" % condition_type[camp_grp],
            "para click imp cost cpc ctr avg_pctr cpm\n",
            "unit: US dollar\n",
        ]
        for para in sorted(paras_perf[camp_grp].keys()):
            log_parts.append(str(para) + row_format % paras_perf[camp_grp][para])
        bpo_logger.info("************parameter performance start************")
        bpo_logger.info("".join(log_parts))
        bpo_logger.info("************parameter performance end************")
    return paras_perf
def bid_paras_optimization_middle(
    bid_history_files, the_ctr_model, bid_model, min_para, max_para, max_ecpc, cny_to_usd, stat_result, status_file
):
    """Pick the best bid parameter per ad group using a capped history replay.

    For every ad group in ``bid_model`` a set of candidate parameters is
    generated, the bid history is replayed with
    ``history_repeat_conditional`` and the candidate with the most clicks
    among those passing the click and eCPC filters is selected. Per-campaign
    state (``last_middle_ecpc``, ``step_size``) is read from and written back
    to ``status_file``.

    Args:
        bid_history_files: list of joined log files (won auctions only,
            according to the original notes).
        the_ctr_model: CTR model handed to the replay.
        bid_model: dict keyed by ``(campaign_id, adgroup_id)``; values must
            provide ``get_paras_by_log`` and ``bid_strategy_type``.
        min_para: lower bound of the candidate parameter range.
        max_para: upper bound of the candidate parameter range.
        max_ecpc: eCPC cap; either one number or a dict keyed by campaign id.
        cny_to_usd: factor applied to the configured ``target_ecpc``.
        stat_result: dict ``camp_grp -> {'cost': ..., 'click': ...}`` with
            the delivered totals.
        status_file: path of the INI file; sections are campaign ids, with
            options ``last_middle_ecpc``, ``target_ecpc`` and ``step_size``.

    Returns:
        ``optimal_paras[camp_grp]``: ``[optimal_para]`` or ``[]`` when no
        candidate qualifies. Ad groups that were not replayed are absent.

    Raises:
        NameError: when all candidates that passed the filters share the
            same rounded eCPC (this includes a single surviving candidate).
    """
    # Load the per-campaign state that decides how the replay is capped.
    status_config = ConfigParser.ConfigParser()
    status_config.read(status_file)

    candidate_paras = {}
    condition_type = {}
    for camp_grp in bid_model.keys():
        # The central parameter is the mean of the allowed range.
        candidate_paras[camp_grp] = bid_model[camp_grp].get_paras_by_log(
            central_para=(min_para + max_para + 0.0) / 2, max_para=max_para, min_para=min_para, range_num=200
        )
        section = str(camp_grp[0])
        if not status_config.has_option(section, "last_middle_ecpc"):
            continue
        stat_click = stat_result[camp_grp]["click"]
        if stat_click == 0:
            # The delivered eCPC is undefined without clicks; skip this ad
            # group instead of aborting the whole run on ZeroDivisionError.
            bpo_logger.warn("for camp_grp %s, stat_result has no click, skip optimization" % (camp_grp,))
            continue
        # Compare the eCPC used by the previous optimisation with the eCPC
        # actually delivered; float() avoids Python 2 integer truncation.
        tmp_ecpc = float(stat_result[camp_grp]["cost"]) / stat_click
        if float(status_config.get(section, "last_middle_ecpc")) > tmp_ecpc:
            condition_type[camp_grp] = "cost"
        else:
            condition_type[camp_grp] = "click"

    paras_perf = history_repeat_conditional(
        bid_history_files, the_ctr_model, bid_model, candidate_paras, condition_type, stat_result
    )

    optimal_paras = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in paras_perf:
            continue
        perf = paras_perf[camp_grp]
        # ConfigParser section names are strings; use the same spelling as
        # the has_option() lookup above.
        section = str(camp_grp[0])

        # eCPC per parameter, rounded to 4 decimals.
        tmp_paras_ecpc = {para: round(perf[para]["ecpc"], 4) for para in perf}
        # Only parameters with at least 10 clicks are considered; results
        # based on fewer clicks are too unstable.
        para_list = [para for para in perf if perf[para]["click"] >= 10]
        if not para_list:
            bpo_logger.warn(
                "for camp_grp %s and bid strategy %s, click # of all parameters < 10 "
                % (camp_grp, bid_model[camp_grp].bid_strategy_type)
            )
            optimal_paras[camp_grp] = []
            continue

        # Blend of the target eCPC and the previous "middle" eCPC; the
        # target's weight is 1 / 2**k. 1.0 forces true division: with
        # integer division the weight would be 0 for every k >= 1.
        target_ecpc = cny_to_usd * float(status_config.get(section, "target_ecpc")) / 1000000
        last_middle_ecpc = float(status_config.get(section, "last_middle_ecpc"))
        k = int(status_config.get(section, "step_size"))
        target_weight = 1.0 / (2 ** k)
        modified_ecpc = target_ecpc * target_weight + last_middle_ecpc * (1 - target_weight)
        bpo_logger.info(
            "target_ecpc%.3f,last_middle_ecpc%.3f, modified_ecpc%.3f" % (target_ecpc, last_middle_ecpc, modified_ecpc)
        )

        # NOTE(review): the blended value above is only logged. The
        # threshold that is applied below -- and persisted as
        # last_middle_ecpc -- is max_ecpc. Confirm whether the blend was
        # meant to be used instead.
        if isinstance(max_ecpc, dict):
            modified_ecpc = max_ecpc[camp_grp[0]]
        else:
            modified_ecpc = max_ecpc

        # Keep the parameters whose eCPC is below the cap.
        para_list = [para for para in para_list if tmp_paras_ecpc[para] < modified_ecpc]
        if not para_list:
            bpo_logger.warn(
                "for camp_grp %s and bid strategy %s, ecpc # of all parameters < %.f "
                % (camp_grp, bid_model[camp_grp].bid_strategy_type, modified_ecpc)
            )
            # No parameter is below the cap: fall back to the parameter with
            # the lowest eCPC among ALL candidates.
            optimal_paras[camp_grp] = [min(tmp_paras_ecpc.keys(), key=lambda x: tmp_paras_ecpc[x])]
            continue

        para_list.sort()

        # Strip the plateaus at both ends: leading parameters sharing the
        # first eCPC and trailing parameters sharing the last eCPC.
        ecpc_seq = [tmp_paras_ecpc[para] for para in para_list]
        first_para_ecpc, last_para_ecpc = ecpc_seq[0], ecpc_seq[-1]
        max_margin_ecpc = max(last_para_ecpc, first_para_ecpc)
        bpo_logger.info("max_margin_ecpc:%s " % str(max_margin_ecpc))
        if all(ecpc == first_para_ecpc for ecpc in ecpc_seq):
            # NameError is kept because callers may rely on this type.
            raise NameError("parameter not change ecpc performance ")
        first_effective_idx = 0
        while ecpc_seq[first_effective_idx] == first_para_ecpc:
            first_effective_idx += 1
        last_effective_idx = len(ecpc_seq)
        while ecpc_seq[last_effective_idx - 1] == last_para_ecpc:
            last_effective_idx -= 1
        para_list = para_list[first_effective_idx:last_effective_idx]
        para_list = [para for para in para_list if tmp_paras_ecpc[para] < max_margin_ecpc * 0.8]

        # Among the remaining parameters take the one with the most clicks;
        # ties resolve to the largest parameter.
        if para_list:
            optimal_paras[camp_grp] = [max(reversed(para_list), key=lambda para: perf[para]["click"])]
        else:
            optimal_paras[camp_grp] = []

        # Persist the state. Values are stored as strings: a non-string
        # value breaks the next interpolating get() for another ad group of
        # the same campaign.
        # NOTE(review): step_size grows once per ad group, not once per
        # campaign -- confirm this is intended.
        status_config.set(section, "last_middle_ecpc", str(modified_ecpc))
        status_config.set(section, "step_size", str(int(status_config.get(section, "step_size")) + 1))
        with open(status_file, "w") as status_fp:
            status_config.write(status_fp)
    return optimal_paras