Example #1
0
def history_repeat_conditional(bid_history_files, the_ctr_model, bid_model,
                               candidate_paras, condition_type, stat_result):
    '''
    Replay joined bid-history logs for every candidate parameter while
    enforcing a per-adgroup resource cap.

    Only adgroups that appear in bid_model, have a non-empty candidate list
    and have an entry in condition_type are simulated.

    paras:
        bid_history_files: list of joined log file names
        the_ctr_model: CTR model; predict_ctr(feature_values) is called on it
        bid_model: mapping keyed by (campaign_id, adgroup_id); each value
            provides get_bids_auc(ctr=, variable_paras=) plus the
            bid_strategy_type and fixed_parameter attributes
        candidate_paras: candidate_paras[camp_grp] -> list of candidate
            parameters (empty lists are dropped)
        condition_type: condition_type[camp_grp] -> 'cost' or 'click', the
            metric that is capped; any other value applies no cap
        stat_result: stat_result[camp_grp] -> {'cost':, 'click':}; the cap is
            half of the selected total
    return:
        paras_perf[camp_grp][para] =
            {'impression':,'avg_pctr':,'click':,'cost':,'ctr':,'ecpc':,'cpm':}
    '''
    # Adgroups without any candidate parameter cannot be simulated.
    candidate_paras = {
        camp_grp: paras
        for camp_grp, paras in candidate_paras.items()
        if len(paras) != 0
    }
    bpo_logger.info(
        '**************history_repeat function start**************')
    bpo_logger.info('files:%s' % ','.join(bid_history_files))
    bpo_logger.info('************** bid_model: %s**************' %
                    (bid_model.keys()))
    bpo_logger.info('************** candidate_paras: %s**************' %
                    (candidate_paras))
    bpo_logger.info('************** condition_type: %s**************' %
                    (condition_type))

    # stop_flag_idx[camp_grp] is the exclusive upper index of the candidates
    # that are still below their cap; it only ever gets reassigned from
    # binary_search over the already-active prefix.
    stop_flag_idx = {}
    result = {}  # one accumulator dict per candidate index
    for camp_grp in bid_model.keys():
        if camp_grp not in candidate_paras or camp_grp not in condition_type:
            continue
        tmp_num = len(candidate_paras[camp_grp])
        stop_flag_idx[camp_grp] = tmp_num
        result[camp_grp] = [{
            'impression': 0,
            'avg_pctr': 0,
            'click': 0,
            'cost': 0
        } for _ in range(tmp_num)]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        # The context manager closes each log file as soon as it is consumed.
        with open(bid_history_file) as history_fp:
            for line in history_fp:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                win_price = float(field_dict['win_price'])
                camp_grp = (field_dict['campaign_id'],
                            field_dict['adgroup_id'])
                if win_price <= 0:
                    continue
                if camp_grp not in result:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(
                    ctr=pctr, variable_paras=candidate_paras[camp_grp])
                # presumably the index of the first candidate bid that is
                # >= win_price -- confirm against binary_search
                first_ge_idx = binary_search(candidate_bid_prices, win_price)

                # Non-cumulative accounting: credit the record to every
                # still-active candidate from first_ge_idx upwards.
                if first_ge_idx < stop_flag_idx[camp_grp]:
                    clicked = 1 if field_dict['click_flag'] == True else 0
                    for tmp_idx in range(first_ge_idx,
                                         stop_flag_idx[camp_grp]):
                        perf = result[camp_grp][tmp_idx]
                        perf['impression'] += 1
                        perf['avg_pctr'] += pctr
                        perf['click'] += clicked
                        perf['cost'] += win_price

                # Resource cap: 50% of total cost or 50% of total clicks.
                capped_metric = condition_type[camp_grp]
                if capped_metric == 'cost' or capped_metric == 'click':
                    stop_flag_idx[camp_grp] = binary_search([
                        result[camp_grp][idx][capped_metric]
                        for idx in range(0, stop_flag_idx[camp_grp])
                    ], stat_result[camp_grp][capped_metric] / 2)

    # Derived metrics: ctr / avg_pctr / ecpc / cpm (0 when undefined).
    for camp_grp in result:
        for perf in result[camp_grp]:
            perf['cost'] /= 1000  # unit change; cpm is the price per 1000 impressions
            impression = perf['impression']
            click = perf['click']
            perf['ctr'] = (
                (click + 0.0) / impression) if impression != 0 else 0
            perf['avg_pctr'] = (
                (perf['avg_pctr'] + 0.0) / impression
            ) if impression != 0 else 0
            perf['ecpc'] = perf['cost'] / click if click != 0 else 0
            perf['cpm'] = (
                1000 * perf['cost'] / impression) if impression != 0 else 0

    # Key the results by the parameter value instead of its index.
    paras_perf = {
        camp_grp: {
            candidate_paras[camp_grp][idx]: result[camp_grp][idx]
            for idx in range(0, len(candidate_paras[camp_grp]))
        }
        for camp_grp in result.keys()
    }

    # Log one performance table per adgroup.
    bpo_logger.info('**************history_repeat function end**************')
    formatter = 'paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n'
    for camp_grp in paras_perf.keys():
        log_parts = ['bid_history_files:%s\n' % ','.join(bid_history_files)]
        log_parts.append(
            formatter.format(
                camp=camp_grp[0],
                grp=camp_grp[1],
                strategy=bid_model[camp_grp].bid_strategy_type,
                para=bid_model[camp_grp].fixed_parameter))
        log_parts.append('condition type:%s\n' % condition_type[camp_grp])
        log_parts.append('para click imp cost cpc ctr avg_pctr cpm\n')
        log_parts.append('unit: US dollar\n')
        for para in sorted(paras_perf[camp_grp].keys()):
            tmp_dict = paras_perf[camp_grp][para]
            log_parts.append(
                str(para) +
                ' %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n'
                % tmp_dict)
        bpo_logger.info('************parameter performance start************')
        bpo_logger.info(''.join(log_parts))
        bpo_logger.info('************parameter performance end************')
    return paras_perf
Example #2
0
def history_repeat(bid_history_files, the_ctr_model, bid_model,
                   candidate_paras):
    '''
    Replay joined bid-history logs for every candidate parameter, without
    any resource cap.

    Each record is credited only to the first candidate index returned by
    binary_search; a prefix sum afterwards turns the per-index counts into
    cumulative totals.

    paras:
        bid_history_files: list of joined log file names
        the_ctr_model: CTR model; predict_ctr(feature_values) is called on it
        bid_model: mapping keyed by (campaign_id, adgroup_id); each value
            provides get_bids_auc(ctr=, variable_paras=) plus the
            bid_strategy_type and fixed_parameter attributes
        candidate_paras: candidate_paras[camp_grp] -> list of candidate
            parameters; adgroups with an empty or missing list are skipped
    return:
        paras_perf[camp_grp][para] =
            {'impression':,'avg_pctr':,'click':,'cost':,'ctr':,'ecpc':,'cpm':}
    '''
    candidate_paras = {
        camp_grp: paras
        for camp_grp, paras in candidate_paras.items()
        if len(paras) != 0
    }
    bpo_logger.info(
        '**************history_repeat function start**************')
    bpo_logger.info('files:%s' % ','.join(bid_history_files))
    bpo_logger.info('************** bid_model: %s**************' %
                    (bid_model.keys()))
    bpo_logger.info('************** candidate_paras: %s**************' %
                    (candidate_paras))

    result = {}
    for (camp_id, grp_id) in bid_model.keys():
        camp_grp = (camp_id, grp_id)
        # Adgroups whose candidate list was empty were filtered out above;
        # indexing candidate_paras for them would raise KeyError.
        if camp_grp not in candidate_paras:
            continue
        result[camp_grp] = [{
            'impression': 0,
            'avg_pctr': 0,
            'click': 0,
            'cost': 0
        } for _ in range(len(candidate_paras[camp_grp]))]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        # The context manager closes each log file as soon as it is consumed.
        with open(bid_history_file) as history_fp:
            for line in history_fp:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                win_price = float(field_dict['win_price'])
                camp_grp = (field_dict['campaign_id'],
                            field_dict['adgroup_id'])
                # result holds exactly the adgroups present in both bid_model
                # and candidate_paras; records of other adgroups are ignored.
                if camp_grp not in result or win_price <= 0:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(
                    ctr=pctr, variable_paras=candidate_paras[camp_grp])
                # presumably the index of the first candidate bid that is
                # >= win_price -- confirm against binary_search
                first_ge_idx = binary_search(candidate_bid_prices, win_price)

                # Cumulative accounting: only touch the first candidate here;
                # the prefix sum below propagates it to the later candidates.
                # This is faster but cannot be combined with a resource cap.
                slots = result[camp_grp]
                if first_ge_idx < len(slots):
                    perf = slots[first_ge_idx]
                    perf['impression'] += 1
                    perf['avg_pctr'] += pctr
                    perf['click'] += (
                        1 if field_dict['click_flag'] == True else 0)
                    perf['cost'] += win_price

    # Prefix sum over the candidate index.
    for camp_grp in result:
        slots = result[camp_grp]
        for idx in range(1, len(slots)):
            for tmp_metric in ['impression', 'avg_pctr', 'click', 'cost']:
                slots[idx][tmp_metric] += slots[idx - 1][tmp_metric]

    # Derived metrics: ctr / avg_pctr / ecpc / cpm (0 when undefined).
    for camp_grp in result:
        for perf in result[camp_grp]:
            perf['cost'] /= 1000  # unit change
            impression = perf['impression']
            click = perf['click']
            perf['ctr'] = (
                (click + 0.0) / impression) if impression != 0 else 0
            perf['avg_pctr'] = (
                (perf['avg_pctr'] + 0.0) / impression
            ) if impression != 0 else 0
            perf['ecpc'] = perf['cost'] / click if click != 0 else 0
            perf['cpm'] = (
                1000 * perf['cost'] / impression) if impression != 0 else 0

    # Key the results by the parameter value instead of its index.
    paras_perf = {
        camp_grp: {
            candidate_paras[camp_grp][idx]: result[camp_grp][idx]
            for idx in range(0, len(candidate_paras[camp_grp]))
        }
        for camp_grp in result.keys()
    }

    # Log one performance table per adgroup.
    bpo_logger.info('**************history_repeat function end**************')
    formatter = 'paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n'
    for camp_grp in paras_perf.keys():
        log_parts = ['bid_history_files:%s\n' % ','.join(bid_history_files)]
        log_parts.append(
            formatter.format(
                camp=camp_grp[0],
                grp=camp_grp[1],
                strategy=bid_model[camp_grp].bid_strategy_type,
                para=bid_model[camp_grp].fixed_parameter))
        log_parts.append('para click imp cost cpc ctr avg_pctr cpm\n')
        log_parts.append('unit: US dollar\n')
        for para in sorted(paras_perf[camp_grp].keys()):
            tmp_dict = paras_perf[camp_grp][para]
            log_parts.append(
                str(para) +
                ' %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n'
                % tmp_dict)

        bpo_logger.info('************parameter performance start************')
        bpo_logger.info(''.join(log_parts))
        bpo_logger.info('************parameter performance end************')

    return paras_perf
Example #3
0
def bid_paras_optimization_middle(bid_history_files, the_ctr_model, bid_model,
                                  min_para, max_para, max_ecpc, cny_to_usd,
                                  stat_result, status_file):
    '''
    Pick the best bid parameter per adgroup from a capped history replay and
    record the optimization state in a status file.

    paras:
        bid_history_files: list of joined log files (win records)
        the_ctr_model: CTR model, forwarded to history_repeat_conditional
        bid_model: mapping keyed by (campaign_id, adgroup_id); each value
            provides get_paras_by_log(...) and bid_strategy_type
        min_para, max_para: bounds of the candidate range; its centre is
            their mean
        max_ecpc: eCPC upper bound, either a single number or a dict keyed by
            campaign id
        cny_to_usd: factor applied to the configured target_ecpc
        stat_result: stat_result[camp_grp] -> {'cost':, 'click':}
        status_file: ConfigParser file with one section per campaign holding
            'last_middle_ecpc', 'target_ecpc' and 'step_size'
    return:
        optimal_paras[camp_grp]: [optimal_para] or []
    raises:
        NameError: when all remaining parameters of an adgroup share the same
            rounded eCPC
    side effects:
        rewrites status_file after each adgroup that reaches the final
        selection step
    '''
    # The status file decides the replay condition for each adgroup.
    status_config = ConfigParser.ConfigParser()
    status_config.read(status_file)

    candidate_paras = {}
    condition_type = {}
    for (camp_id, grp_id) in bid_model.keys():
        camp_grp = (camp_id, grp_id)
        # The central parameter is the mean of the min and max parameter.
        candidate_paras[camp_grp] = bid_model[camp_grp].get_paras_by_log(
            central_para=(min_para + max_para + 0.0) / 2,
            max_para=max_para,
            min_para=min_para,
            range_num=200)
        # Compare the eCPC used by the previous optimization with the
        # actually delivered eCPC.
        # NOTE(review): raises ZeroDivisionError when the adgroup has no
        # clicks in stat_result -- confirm whether callers guarantee click > 0.
        tmp_ecpc = stat_result[camp_grp]['cost'] / stat_result[camp_grp][
            'click']
        if not status_config.has_option(str(camp_id), 'last_middle_ecpc'):
            continue
        if float(status_config.get(str(camp_id),
                                   'last_middle_ecpc')) > tmp_ecpc:
            condition_type[camp_grp] = "cost"
        else:
            condition_type[camp_grp] = "click"

    paras_perf = history_repeat_conditional(bid_history_files, the_ctr_model,
                                            bid_model, candidate_paras,
                                            condition_type, stat_result)
    optimal_paras = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in paras_perf:
            continue
        perf_by_para = paras_perf[camp_grp]
        # eCPC per parameter, rounded to 4 decimals.
        tmp_paras_ecpc = {
            para: round(perf_by_para[para]['ecpc'], 4)
            for para in perf_by_para
        }
        # Only parameters with at least 10 clicks are considered; results are
        # unstable with fewer clicks.
        para_list = [
            para for para in perf_by_para
            if perf_by_para[para]['click'] >= 10
        ]
        if len(para_list) == 0:
            bpo_logger.warn(
                'for camp_grp %s and bid strategy %s, click # of all parameters < 10 '
                % (camp_grp, bid_model[camp_grp].bid_strategy_type))
            optimal_paras[camp_grp] = []
            continue

        # Section names are strings; use the same form as the has_option
        # check above.
        section = str(camp_grp[0])
        # Blend of the target eCPC and the previous eCPC; only target_ecpc
        # needs the unit conversion.
        target_ecpc = cny_to_usd * float(
            status_config.get(section, 'target_ecpc')) / 1000000
        last_middle_ecpc = float(
            status_config.get(section, 'last_middle_ecpc'))
        k = int(status_config.get(section, 'step_size'))
        # Float weight so the blend is not truncated by integer division.
        weight = 1.0 / (2**k)
        modified_ecpc = target_ecpc * weight + last_middle_ecpc * (1 - weight)
        bpo_logger.info(
            'target_ecpc%.3f,last_middle_ecpc%.3f, modified_ecpc%.3f' %
            (target_ecpc, last_middle_ecpc, modified_ecpc))

        # NOTE(review): the blended value above is only logged; the bound
        # actually used for filtering and stored as last_middle_ecpc is
        # max_ecpc. Kept as in the original -- confirm this is intended.
        if isinstance(max_ecpc, dict):
            modified_ecpc = max_ecpc[camp_grp[0]]
        else:
            modified_ecpc = max_ecpc
        para_list = [
            para for para in para_list if tmp_paras_ecpc[para] < modified_ecpc
        ]
        if len(para_list) == 0:
            bpo_logger.warn(
                'for camp_grp %s and bid strategy %s, ecpc # of all parameters < %.f '
                % (camp_grp, bid_model[camp_grp].bid_strategy_type,
                   modified_ecpc))
            # No parameter meets the bound: fall back to the parameter with
            # the smallest rounded eCPC.
            optimal_paras[camp_grp] = [
                min(tmp_paras_ecpc, key=tmp_paras_ecpc.get)
            ]
            continue

        para_list.sort()

        # Trim the runs at both ends whose eCPC equals the first / last value.
        last_para_ecpc = tmp_paras_ecpc[para_list[-1]]
        first_para_ecpc = tmp_paras_ecpc[para_list[0]]
        max_margin_ecpc = max(last_para_ecpc, first_para_ecpc)
        bpo_logger.info('max_margin_ecpc:%s ' % str(max_margin_ecpc))
        try:
            last_effective_idx = len(para_list) - [
                tmp_paras_ecpc[para] == last_para_ecpc
                for para in para_list[::-1]
            ].index(False)
            first_effective_idx = [
                tmp_paras_ecpc[para] == first_para_ecpc for para in para_list
            ].index(False)
        except ValueError:
            # Every parameter has the same eCPC, nothing to choose between.
            raise NameError('parameter not change ecpc performance ')
        para_list = para_list[first_effective_idx:last_effective_idx]
        para_list = [
            para for para in para_list
            if tmp_paras_ecpc[para] < max_margin_ecpc * 0.8
        ]

        # Most clicks wins; on ties the later (larger) parameter is kept,
        # hence the reversed scan.
        if para_list:
            optimal_paras[camp_grp] = [
                max(reversed(para_list),
                    key=lambda para: perf_by_para[para]['click'])
            ]
        else:
            optimal_paras[camp_grp] = []

        # Persist the new state. Values are stored as strings so that later
        # get() calls for another adgroup of the same campaign still work.
        status_config.set(section, 'last_middle_ecpc',
                          value=str(modified_ecpc))
        status_config.set(section, 'step_size', value=str(k + 1))
        with open(status_file, 'w') as status_fp:
            status_config.write(status_fp)

    return optimal_paras
Example #4
0
def bid_paras_optimization(bid_history_files, the_ctr_model, bid_model,
                           min_para, max_para, max_ecpc):
    '''
    Pick the best bid parameter per adgroup from an uncapped history replay.

    paras:
        bid_history_files: list of joined log files (win records)
        the_ctr_model: CTR model, forwarded to history_repeat
        bid_model: mapping keyed by (campaign_id, adgroup_id); each value
            provides get_paras_by_log(...) and bid_strategy_type
        min_para, max_para: bounds of the candidate range; its centre is
            their mean
        max_ecpc: eCPC upper bound, either a single number or a dict keyed by
            campaign id
    return:
        optimal_paras[camp_grp]: [optimal_para] or []
    raises:
        NameError: when all remaining parameters of an adgroup share the same
            rounded eCPC
    '''
    candidate_paras = {}
    for (camp_id, grp_id) in bid_model.keys():
        camp_grp = (camp_id, grp_id)
        # The central parameter is the mean of the min and max parameter.
        candidate_paras[camp_grp] = bid_model[camp_grp].get_paras_by_log(
            central_para=(min_para + max_para + 0.0) / 2,
            max_para=max_para,
            min_para=min_para,
            range_num=200)

    paras_perf = history_repeat(bid_history_files, the_ctr_model, bid_model,
                                candidate_paras)
    optimal_paras = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in paras_perf:
            continue
        perf_by_para = paras_perf[camp_grp]
        # eCPC per parameter, rounded to 4 decimals.
        tmp_paras_ecpc = {
            para: round(perf_by_para[para]['ecpc'], 4)
            for para in perf_by_para
        }
        # Only parameters with at least 10 clicks are considered; results are
        # unstable with fewer clicks.
        para_list = [
            para for para in perf_by_para
            if perf_by_para[para]['click'] >= 10
        ]
        if len(para_list) == 0:
            bpo_logger.warn(
                'for camp_grp %s and bid strategy %s, click # of all parameters < 10 '
                % (camp_grp, bid_model[camp_grp].bid_strategy_type))
            optimal_paras[camp_grp] = []
            continue

        # Keep the parameters whose eCPC is below the upper bound.
        if isinstance(max_ecpc, dict):
            tmp_max_ecpc = max_ecpc[camp_grp[0]]
        else:
            tmp_max_ecpc = max_ecpc
        para_list = [
            para for para in para_list if tmp_paras_ecpc[para] < tmp_max_ecpc
        ]
        if len(para_list) == 0:
            bpo_logger.warn(
                'for camp_grp %s and bid strategy %s, ecpc # of all parameters < %.f '
                % (camp_grp, bid_model[camp_grp].bid_strategy_type,
                   tmp_max_ecpc))
            # No parameter meets the bound: fall back to the parameter with
            # the smallest rounded eCPC.
            # NOTE(review): this ranges over all parameters, including ones
            # with zero clicks whose 'ecpc' entry may be 0 -- confirm.
            optimal_paras[camp_grp] = [
                min(tmp_paras_ecpc, key=tmp_paras_ecpc.get)
            ]
            continue

        para_list.sort()

        # Trim the runs at both ends whose eCPC equals the first / last value.
        last_para_ecpc = tmp_paras_ecpc[para_list[-1]]
        first_para_ecpc = tmp_paras_ecpc[para_list[0]]
        max_margin_ecpc = max(last_para_ecpc, first_para_ecpc)
        bpo_logger.info('max_margin_ecpc:%s ' % str(max_margin_ecpc))
        try:
            last_effective_idx = len(para_list) - [
                tmp_paras_ecpc[para] == last_para_ecpc
                for para in para_list[::-1]
            ].index(False)
            first_effective_idx = [
                tmp_paras_ecpc[para] == first_para_ecpc for para in para_list
            ].index(False)
        except ValueError:
            # Every parameter has the same eCPC, nothing to choose between.
            raise NameError('parameter not change ecpc performance ')
        para_list = para_list[first_effective_idx:last_effective_idx]
        para_list = [
            para for para in para_list
            if tmp_paras_ecpc[para] < max_margin_ecpc * 0.9
        ]

        # Most clicks wins; on ties the later (larger) parameter is kept,
        # hence the reversed scan.
        if para_list:
            optimal_paras[camp_grp] = [
                max(reversed(para_list),
                    key=lambda para: perf_by_para[para]['click'])
            ]
        else:
            optimal_paras[camp_grp] = []
    return optimal_paras
def history_repeat(bid_history_files, the_ctr_model, bid_model, candidate_paras):
    '''
    Replay joined bid-history logs for every candidate parameter, without
    any resource cap.

    Each record is credited only to the first candidate index returned by
    binary_search; a prefix sum afterwards turns the per-index counts into
    cumulative totals.

    paras:
        bid_history_files: list of joined log file names
        the_ctr_model: CTR model; predict_ctr(feature_values) is called on it
        bid_model: mapping keyed by (campaign_id, adgroup_id); each value
            provides get_bids_auc(ctr=, variable_paras=) plus the
            bid_strategy_type and fixed_parameter attributes
        candidate_paras: candidate_paras[camp_grp] -> list of candidate
            parameters; adgroups with an empty or missing list are skipped
    return:
        paras_perf[camp_grp][para] =
            {'impression':,'avg_pctr':,'click':,'cost':,'ctr':,'ecpc':,'cpm':}
    '''
    candidate_paras = {camp_grp: paras for camp_grp, paras in candidate_paras.items() if len(paras) != 0}
    bpo_logger.info('**************history_repeat function start**************')
    bpo_logger.info('files:%s' % ','.join(bid_history_files))
    bpo_logger.info('************** bid_model: %s**************' % (bid_model.keys()))
    bpo_logger.info('************** candidate_paras: %s**************' % (candidate_paras))

    result = {}
    for (camp_id, grp_id) in bid_model.keys():
        camp_grp = (camp_id, grp_id)
        # Adgroups whose candidate list was empty were filtered out above;
        # indexing candidate_paras for them would raise KeyError.
        if camp_grp not in candidate_paras:
            continue
        result[camp_grp] = [
            {'impression': 0, 'avg_pctr': 0, 'click': 0, 'cost': 0}
            for _ in range(len(candidate_paras[camp_grp]))
        ]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        # The context manager closes each log file as soon as it is consumed.
        with open(bid_history_file) as history_fp:
            for line in history_fp:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                win_price = float(field_dict['win_price'])
                camp_grp = (field_dict['campaign_id'], field_dict['adgroup_id'])
                # result holds exactly the adgroups present in both bid_model
                # and candidate_paras; records of other adgroups are ignored.
                if camp_grp not in result or win_price <= 0:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(ctr=pctr, variable_paras=candidate_paras[camp_grp])
                # presumably the index of the first candidate bid that is
                # >= win_price -- confirm against binary_search
                first_ge_idx = binary_search(candidate_bid_prices, win_price)

                # Cumulative accounting: only touch the first candidate here;
                # the prefix sum below propagates it to the later candidates.
                # This is faster but cannot be combined with a resource cap.
                slots = result[camp_grp]
                if first_ge_idx < len(slots):
                    perf = slots[first_ge_idx]
                    perf['impression'] += 1
                    perf['avg_pctr'] += pctr
                    perf['click'] += (1 if field_dict['click_flag'] == True else 0)
                    perf['cost'] += win_price

    # Prefix sum over the candidate index.
    for camp_grp in result:
        slots = result[camp_grp]
        for idx in range(1, len(slots)):
            for tmp_metric in ['impression', 'avg_pctr', 'click', 'cost']:
                slots[idx][tmp_metric] += slots[idx - 1][tmp_metric]

    # Derived metrics: ctr / avg_pctr / ecpc / cpm (0 when undefined).
    for camp_grp in result:
        for perf in result[camp_grp]:
            perf['cost'] /= 1000  # unit change
            impression = perf['impression']
            click = perf['click']
            perf['ctr'] = ((click + 0.0) / impression) if impression != 0 else 0
            perf['avg_pctr'] = ((perf['avg_pctr'] + 0.0) / impression) if impression != 0 else 0
            perf['ecpc'] = perf['cost'] / click if click != 0 else 0
            perf['cpm'] = (1000 * perf['cost'] / impression) if impression != 0 else 0

    # Key the results by the parameter value instead of its index.
    paras_perf = {camp_grp: {candidate_paras[camp_grp][idx]: result[camp_grp][idx] for idx in range(0, len(candidate_paras[camp_grp]))} for camp_grp in result.keys()}

    # Log one performance table per adgroup.
    bpo_logger.info('**************history_repeat function end**************')
    formatter = 'paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n'
    for camp_grp in paras_perf.keys():
        log_parts = ['bid_history_files:%s\n' % ','.join(bid_history_files)]
        log_parts.append(formatter.format(
            camp=camp_grp[0],
            grp=camp_grp[1],
            strategy=bid_model[camp_grp].bid_strategy_type,
            para=bid_model[camp_grp].fixed_parameter))
        log_parts.append('para click imp cost cpc ctr avg_pctr cpm\n')
        log_parts.append('unit: US dollar\n')
        for para in sorted(paras_perf[camp_grp].keys()):
            tmp_dict = paras_perf[camp_grp][para]
            log_parts.append(str(para) + ' %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n' % tmp_dict)

        bpo_logger.info('************parameter performance start************')
        bpo_logger.info(''.join(log_parts))
        bpo_logger.info('************parameter performance end************')

    return paras_perf
def bid_paras_optimization(bid_history_files, the_ctr_model, bid_model, min_para, max_para, max_ecpc):
    '''
    Pick the best bid parameter per adgroup from an uncapped history replay.

    paras:
        bid_history_files: list of joined log files (win records)
        the_ctr_model: CTR model, forwarded to history_repeat
        bid_model: mapping keyed by (campaign_id, adgroup_id); each value
            provides get_paras_by_log(...) and bid_strategy_type
        min_para, max_para: bounds of the candidate range; its centre is
            their mean
        max_ecpc: eCPC upper bound, either a single number or a dict keyed by
            campaign id
    return:
        optimal_paras[camp_grp]: [optimal_para] or []
    raises:
        NameError: when all remaining parameters of an adgroup share the same
            rounded eCPC
    '''
    candidate_paras = {}
    for (camp_id, grp_id) in bid_model.keys():
        camp_grp = (camp_id, grp_id)
        # The central parameter is the mean of the min and max parameter.
        candidate_paras[camp_grp] = bid_model[camp_grp].get_paras_by_log(central_para=(min_para + max_para + 0.0) / 2, max_para=max_para, min_para=min_para, range_num=200)

    paras_perf = history_repeat(bid_history_files, the_ctr_model, bid_model, candidate_paras)
    optimal_paras = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in paras_perf:
            continue
        perf_by_para = paras_perf[camp_grp]
        # eCPC per parameter, rounded to 4 decimals.
        tmp_paras_ecpc = {para: round(perf_by_para[para]['ecpc'], 4) for para in perf_by_para}
        # Only parameters with at least 10 clicks are considered; results are
        # unstable with fewer clicks.
        para_list = [para for para in perf_by_para if perf_by_para[para]['click'] >= 10]
        if len(para_list) == 0:
            bpo_logger.warn('for camp_grp %s and bid strategy %s, click # of all parameters < 10 ' % (camp_grp, bid_model[camp_grp].bid_strategy_type))
            optimal_paras[camp_grp] = []
            continue

        # Keep the parameters whose eCPC is below the upper bound.
        if isinstance(max_ecpc, dict):
            tmp_max_ecpc = max_ecpc[camp_grp[0]]
        else:
            tmp_max_ecpc = max_ecpc
        para_list = [para for para in para_list if tmp_paras_ecpc[para] < tmp_max_ecpc]
        if len(para_list) == 0:
            bpo_logger.warn('for camp_grp %s and bid strategy %s, ecpc # of all parameters < %.f ' % (camp_grp, bid_model[camp_grp].bid_strategy_type, tmp_max_ecpc))
            # No parameter meets the bound: fall back to the parameter with
            # the smallest rounded eCPC.
            # NOTE(review): this ranges over all parameters, including ones
            # with zero clicks whose 'ecpc' entry may be 0 -- confirm.
            optimal_paras[camp_grp] = [min(tmp_paras_ecpc, key=tmp_paras_ecpc.get)]
            continue

        para_list.sort()

        # Trim the runs at both ends whose eCPC equals the first / last value.
        last_para_ecpc = tmp_paras_ecpc[para_list[-1]]
        first_para_ecpc = tmp_paras_ecpc[para_list[0]]
        max_margin_ecpc = max(last_para_ecpc, first_para_ecpc)
        bpo_logger.info('max_margin_ecpc:%s ' % str(max_margin_ecpc))
        try:
            last_effective_idx = len(para_list) - [tmp_paras_ecpc[para] == last_para_ecpc for para in para_list[::-1]].index(False)
            first_effective_idx = [tmp_paras_ecpc[para] == first_para_ecpc for para in para_list].index(False)
        except ValueError:
            # Every parameter has the same eCPC, nothing to choose between.
            raise NameError('parameter not change ecpc performance ')
        para_list = para_list[first_effective_idx:last_effective_idx]
        para_list = [para for para in para_list if tmp_paras_ecpc[para] < max_margin_ecpc * 0.9]

        # Most clicks wins; on ties the later (larger) parameter is kept,
        # hence the reversed scan.
        if para_list:
            optimal_paras[camp_grp] = [max(reversed(para_list), key=lambda para: perf_by_para[para]['click'])]
        else:
            optimal_paras[camp_grp] = []
    return optimal_paras
def history_repeat_conditional(
    bid_history_files, the_ctr_model, bid_model, candidate_paras, condition_type, stat_result
):
    """
    Replay bidding history: simulate delivery over a batch of historical records.

    For every record and every candidate parameter of the record's
    (campaign_id, adgroup_id) pair, the bid that parameter would have produced
    is compared against the recorded win price, and impression / click / cost
    counters are accumulated per parameter. After each record the range of
    parameters that is still accumulating is narrowed according to the
    resource condition ("cost" or "click") of that pair.

    paras:
        bid_history_files: list of join-file names
        the_ctr_model: CTR model; predict_ctr(feature_values) is called on it
        bid_model: dict keyed by (campaign_id, adgroup_id); each value provides
            get_bids_auc(ctr=, variable_paras=), bid_strategy_type and
            fixed_parameter
        candidate_paras: dict camp_grp -> sequence of candidate parameters;
            entries with an empty sequence are ignored
        condition_type: dict camp_grp -> "cost" or "click"; pairs missing here
            are not replayed
        stat_result: dict camp_grp -> {"cost":, "click":}; half of the selected
            entry is used as the resource limit
    return:
        paras_perf[camp_grp][para] =
            {'impression':,'avg_pctr':,'click':,'cost':,'ctr':,'ecpc':,'cpm':}
    """
    # Drop pairs that have no candidate parameter at all.
    candidate_paras = {
        camp_grp: paras for camp_grp, paras in candidate_paras.items() if len(paras) != 0
    }
    bpo_logger.info("**************history_repeat function start**************")
    bpo_logger.info("files:%s" % ",".join(bid_history_files))
    bpo_logger.info("************** bid_model: %s**************" % (bid_model.keys()))
    bpo_logger.info("************** candidate_paras: %s**************" % (candidate_paras))
    bpo_logger.info("************** condition_type: %s**************" % (condition_type))

    # stop_flag_idx[camp_grp]: exclusive upper bound of the parameter indexes
    # that still accumulate statistics.
    stop_flag_idx = {}
    # result[camp_grp][idx]: counters of the idx-th candidate parameter.
    # All keys are created up front.
    result = {}
    for camp_grp in bid_model.keys():
        # Only pairs that have candidate parameters and a condition type are
        # replayed; records of every other pair are skipped further down.
        if camp_grp not in candidate_paras or camp_grp not in condition_type:
            continue
        para_num = len(candidate_paras[camp_grp])
        stop_flag_idx[camp_grp] = para_num
        result[camp_grp] = [
            {"impression": 0, "avg_pctr": 0, "click": 0, "cost": 0} for _ in range(para_num)
        ]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        # The context manager closes every history file once it is consumed.
        with open(bid_history_file) as history_fp:
            for line in history_fp:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                win_price = float(field_dict["win_price"])
                camp_grp = (field_dict["campaign_id"], field_dict["adgroup_id"])
                if win_price <= 0:
                    continue
                if camp_grp not in result:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict["feature_values"])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(
                    ctr=pctr, variable_paras=candidate_paras[camp_grp]
                )
                # presumably the index of the first candidate bid >= win_price
                # (binary_search is defined elsewhere) -- TODO confirm
                first_ge_idx = binary_search(candidate_bid_prices, win_price)
                # Non-cumulative mode: only the parameters below the stop index
                # are credited; the range is empty when first_ge_idx has
                # already reached the stop index.
                perf_list = result[camp_grp]
                for tmp_idx in range(first_ge_idx, stop_flag_idx[camp_grp]):
                    perf = perf_list[tmp_idx]
                    perf["impression"] += 1
                    perf["avg_pctr"] += pctr
                    perf["click"] += 1 if field_dict["click_flag"] == True else 0
                    perf["cost"] += win_price
                # Resource limit: 50% of the total cost or 50% of the total
                # clicks; the stop index is recomputed from the counters of
                # the parameters that are still active.
                if condition_type[camp_grp] == "cost":
                    stop_flag_idx[camp_grp] = binary_search(
                        [perf_list[idx]["cost"] for idx in range(0, stop_flag_idx[camp_grp])],
                        stat_result[camp_grp]["cost"] / 2,
                    )
                elif condition_type[camp_grp] == "click":
                    stop_flag_idx[camp_grp] = binary_search(
                        [perf_list[idx]["click"] for idx in range(0, stop_flag_idx[camp_grp])],
                        stat_result[camp_grp]["click"] / 2,
                    )

    # Derived metrics: ctr / avg_pctr / ecpc / cpm.
    for camp_grp in result:
        for perf in result[camp_grp]:
            # Unit change: CPM is the price of one thousand impressions.
            perf["cost"] /= 1000
            impression = perf["impression"]
            click = perf["click"]
            perf["ctr"] = ((click + 0.0) / impression) if impression != 0 else 0
            perf["avg_pctr"] = ((perf["avg_pctr"] + 0.0) / impression) if impression != 0 else 0
            perf["ecpc"] = perf["cost"] / click if click != 0 else 0
            perf["cpm"] = 1000 * perf["cost"] / impression if impression != 0 else 0

    # Key the counters by the parameter value instead of the parameter index.
    paras_perf = {
        camp_grp: dict(zip(candidate_paras[camp_grp], result[camp_grp])) for camp_grp in result
    }

    # Log the outcome.
    bpo_logger.info("**************history_repeat function end**************")
    formatter = "paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n"
    for camp_grp in paras_perf.keys():
        log_parts = [
            "bid_history_files:%s\n" % ",".join(bid_history_files),
            formatter.format(
                camp=camp_grp[0],
                grp=camp_grp[1],
                strategy=bid_model[camp_grp].bid_strategy_type,
                para=bid_model[camp_grp].fixed_parameter,
            ),
            "condition type:%s\n" % condition_type[camp_grp],
            "para click imp cost cpc ctr avg_pctr cpm\n",
            "unit: US dollar\n",
        ]
        for para in sorted(paras_perf[camp_grp].keys()):
            log_parts.append(
                str(para)
                + " %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n"
                % paras_perf[camp_grp][para]
            )
        bpo_logger.info("************parameter performance start************")
        bpo_logger.info("".join(log_parts))
        bpo_logger.info("************parameter performance end************")
    return paras_perf
def bid_paras_optimization_middle(
    bid_history_files, the_ctr_model, bid_model, min_para, max_para, max_ecpc, cny_to_usd, stat_result, status_file
):
    """
    Pick the optimal bid parameter for every adgroup in the bid history.

    Candidate parameters are generated per adgroup, replayed with
    history_repeat_conditional, filtered by click count and eCPC, and the
    remaining candidate with the most clicks is selected. The status file is
    rewritten after every adgroup that reaches the selection step.

    paras:
        bid_history_files: list of joined log files passed on to
            history_repeat_conditional
        the_ctr_model: CTR model passed on to history_repeat_conditional
        bid_model: dict keyed by (campaign_id, adgroup_id); each value provides
            get_paras_by_log(...) and bid_strategy_type
        min_para, max_para: bounds of the candidate parameters; their mean is
            used as the central parameter
        max_ecpc: eCPC upper bound, either one number or a dict keyed by
            campaign id
        cny_to_usd: factor applied to the configured target_ecpc
        stat_result: dict camp_grp -> {"cost":, "click":} of the real delivery
        status_file: path of the ini-style status file; per campaign section
            the options last_middle_ecpc, target_ecpc and step_size are read
    return:
        optimal_paras[camp_grp]: [optimal_para,] or []
    raises:
        NameError: when every eCPC-eligible parameter has the same eCPC, so
            the ineffective parameters at the ends of the list cannot be
            trimmed
    """
    # Determine the type and the condition of the history replay.
    status_config = ConfigParser.ConfigParser()
    status_config.read(status_file)

    candidate_paras = {}
    condition_type = {}
    for (camp_id, grp_id) in bid_model.keys():
        camp_grp = (camp_id, grp_id)
        # The initial parameter is the mean of the largest and the smallest one.
        candidate_paras[camp_grp] = bid_model[camp_grp].get_paras_by_log(
            central_para=(min_para + max_para + 0.0) / 2, max_para=max_para, min_para=min_para, range_num=200
        )
        # Campaigns without a recorded last_middle_ecpc get no condition type
        # and are therefore not replayed; check this first so their delivery
        # statistics are never touched.
        if not status_config.has_option(str(camp_id), "last_middle_ecpc"):
            continue
        # Compare the eCPC used by the previous optimization with the eCPC of
        # the real delivery.
        # NOTE(review): raises ZeroDivisionError when the adgroup has no click.
        tmp_ecpc = stat_result[camp_grp]["cost"] / stat_result[camp_grp]["click"]
        if float(status_config.get(str(camp_id), "last_middle_ecpc")) > tmp_ecpc:
            condition_type[camp_grp] = "cost"
        else:
            condition_type[camp_grp] = "click"

    paras_perf = history_repeat_conditional(
        bid_history_files, the_ctr_model, bid_model, candidate_paras, condition_type, stat_result
    )
    optimal_paras = {}
    for camp_grp in bid_model.keys():
        if camp_grp not in paras_perf:
            continue
        camp_perf = paras_perf[camp_grp]
        # NOTE(review): the section is addressed with camp_grp[0] here but with
        # str(camp_id) in the loop above -- equivalent only for string ids.
        section = camp_grp[0]
        # tmp_paras_ecpc holds the eCPC of every parameter rounded to 4 decimals.
        tmp_paras_ecpc = {para: round(camp_perf[para]["ecpc"], 4) for para in camp_perf}
        # Only parameters with at least 10 clicks are optimized, because the
        # result is very unstable when there are too few clicks.
        para_list = [para for para in camp_perf if camp_perf[para]["click"] >= 10]
        # Abnormal case: no parameter reaches the required click count.
        if len(para_list) == 0:
            bpo_logger.warn(
                "for camp_grp %s and bid strategy %s, click # of all parameters < 10 "
                % (camp_grp, bid_model[camp_grp].bid_strategy_type)
            )
            optimal_paras[camp_grp] = []
            continue
        # Corrected eCPC: a compromise between the target eCPC and the eCPC of
        # the previous round. Only target_ecpc needs a unit conversion.
        target_ecpc = cny_to_usd * float(status_config.get(section, "target_ecpc")) / 1000000
        last_middle_ecpc = float(status_config.get(section, "last_middle_ecpc"))
        k = int(status_config.get(section, "step_size"))
        # Float weight: with integer division 1 / (2 ** k) is 0 for k >= 1 and
        # the two weights would no longer sum to 1.
        target_weight = 1.0 / (2 ** k)
        modified_ecpc = target_ecpc * target_weight + last_middle_ecpc * (1 - target_weight)
        bpo_logger.info(
            "target_ecpc%.3f,last_middle_ecpc%.3f, modified_ecpc%.3f" % (target_ecpc, last_middle_ecpc, modified_ecpc)
        )

        # Parameters below the eCPC upper bound.
        # NOTE(review): the blended value computed above is only logged; the
        # bound that is applied (and stored as last_middle_ecpc below) is
        # max_ecpc -- confirm that this override is intended.
        if isinstance(max_ecpc, dict):
            modified_ecpc = max_ecpc[camp_grp[0]]
        else:
            modified_ecpc = max_ecpc
        para_list = [para for para in para_list if tmp_paras_ecpc[para] < modified_ecpc]
        if len(para_list) == 0:
            bpo_logger.warn(
                "for camp_grp %s and bid strategy %s, ecpc # of all parameters < %.f "
                % (camp_grp, bid_model[camp_grp].bid_strategy_type, modified_ecpc)
            )
            # No parameter reaches the eCPC bound: fall back to the parameter
            # with the smallest eCPC.
            optimal_paras[camp_grp] = [min(tmp_paras_ecpc.keys(), key=lambda x: tmp_paras_ecpc[x])]
            continue

        para_list.sort()

        # Trim the ineffective parameters at both ends of the sorted list: the
        # leading run sharing the first parameter's eCPC and the trailing run
        # sharing the last parameter's eCPC.
        last_para_ecpc, first_para_ecpc = tmp_paras_ecpc[para_list[-1]], tmp_paras_ecpc[para_list[0]]
        max_margin_ecpc = max(last_para_ecpc, first_para_ecpc)
        bpo_logger.info("max_margin_ecpc:%s " % str(max_margin_ecpc))
        try:
            last_effective_idx = len(para_list) - [
                tmp_paras_ecpc[para] == last_para_ecpc for para in para_list[::-1]
            ].index(False)
            first_effective_idx = [tmp_paras_ecpc[para] == first_para_ecpc for para in para_list].index(False)
        except ValueError:
            # index(False) fails when every parameter has the same eCPC.
            raise NameError("parameter not change ecpc performance ")
        para_list = para_list[first_effective_idx:last_effective_idx]
        para_list = [para for para in para_list if tmp_paras_ecpc[para] < max_margin_ecpc * 0.8]

        # Among the remaining parameters take the one with the most clicks.
        # Scanning in reverse makes max() return the last such parameter on a
        # tie, which is what the former left fold with a strict ">" selected.
        if para_list:
            optimal_paras[camp_grp] = [max(reversed(para_list), key=lambda para: camp_perf[para]["click"])]
        else:
            optimal_paras[camp_grp] = []

        # Update the status configuration and rewrite the file. Values are
        # stored as strings: a non-string value makes a later interpolating
        # get() on the same option fail (e.g. for another adgroup of the same
        # campaign), while the written file content is the same either way.
        next_step_size = int(status_config.get(section, "step_size")) + 1
        status_config.set(section, "last_middle_ecpc", value=str(modified_ecpc))
        status_config.set(section, "step_size", value=str(next_step_size))
        # The context manager guarantees the file is flushed and closed.
        with open(status_file, "w") as status_fp:
            status_config.write(status_fp)

    return optimal_paras