def predict_ctr(input_files, ctr_model_file_name):
    """Evaluate a CTR model against joined bid logs and print the result.

    Every line of every file is fed to a ``join.Parser``; lines for which
    the parser returns ``None`` are skipped.  For the remaining lines the
    predicted CTR is paired with the observed click flag, and the list of
    ``(click_flag, pctr)`` pairs is passed to
    ``ctr_model_evaluation.predict_evaluation``, whose return value is
    printed.

    :param input_files: iterable of join-log file paths
    :param ctr_model_file_name: model file handed to ``ctr_model.LrCtrModel``
    :return: None
    """
    # NOTE: an output file used to be opened here through ``out_file_name``,
    # which is neither a parameter nor a local of this function, and the
    # handle was never written to.  The dead open() has been dropped.
    the_ctr_model = ctr_model.LrCtrModel(ctr_model_file_name, 'new')

    req_parser = join.Parser()
    line_num = 0
    actual_pred_list = []
    for input_file in input_files:
        # ``with`` closes each log file as soon as it has been consumed.
        with open(input_file, 'r') as log_file:
            for line in log_file:
                line_num += 1
                # Progress indicator every 1000 lines (counted across files).
                if line_num % 1000 == 0:
                    print(line_num)
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                click_flag = 1 if field_dict['click_flag'] == True else 0
                actual_pred_list.append((click_flag, pctr))
    print(ctr_model_evaluation.predict_evaluation(actual_pred_list))
def predict_4_files(input_files, the_ctr_model):
    """Collect (actual click, predicted CTR) pairs from joined bid logs.

    :param input_files: iterable of join-log file paths
    :param the_ctr_model: object exposing ``predict_ctr(feature_values)``
    :return: list of ``(actual, pctr)`` tuples, where ``actual`` is 1 when
        the record's ``click_flag`` equals ``True`` and 0 otherwise
    """
    req_parser = join.Parser()
    actual_pred_list = []
    for input_file in input_files:
        # ``with`` guarantees the handle is released even if parsing fails.
        with open(input_file) as log_file:
            for line in log_file:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                # The parser returns None for lines it cannot use.
                if field_dict is None:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                actual = 1 if field_dict['click_flag'] == True else 0
                actual_pred_list.append((actual, pctr))
    return actual_pred_list
def _collect_join_stats(file_names):
    """Aggregate impression/click/cost/price counters per (campaign, adgroup)."""
    stat_result = {}
    req_parser = join.Parser()
    for file_name in file_names:
        with open(file_name) as log_file:
            for line in log_file:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                # Lines the parser cannot use come back as None.
                if field_dict is None:
                    continue
                camp_grp = (field_dict['campaign_id'],
                            field_dict['adgroup_id'])
                if camp_grp not in stat_result:
                    stat_result[camp_grp] = {
                        'impression': 0,
                        'click': 0,
                        'cost': 0,
                        'price_stat': {}
                    }
                stats = stat_result[camp_grp]
                win_price = float(field_dict['win_price'])
                round_price = round(win_price, 4)
                stats['impression'] += 1
                # price_stat maps a rounded price to the NUMBER of
                # impressions won at that price; cost is the SUM of prices
                # paid.  (These two accumulators used to be swapped.)
                price_stat = stats['price_stat']
                price_stat[round_price] = price_stat.get(round_price, 0) + 1
                stats['cost'] += win_price
                if field_dict['click_flag'] == True:
                    stats['click'] += 1
    return stat_result


def _bin_prices(price_stat, total_imp, bin_num):
    """Bin a {price: impression count} map into cumulative-share samples.

    Returns the ``price_bin`` dict; its ``x`` and ``y`` are ``None`` when all
    prices are identical (bin size 0).
    """
    print('len(stat_result price stat) %s' % len(price_stat))
    min_price = min(price_stat)
    max_price = max(price_stat)
    bin_size = (max_price - min_price) / bin_num
    if bin_size == 0:
        return {'bin_size': bin_size, 'x': None, 'y': None}

    price_bin = [0] * bin_num
    for price, count in price_stat.items():
        # NOTE(review): round() (rather than floor) is kept from the
        # original; with round() the bins are centred on the x values below
        # instead of starting at them -- confirm which one is intended.
        bin_idx = int(round((price - min_price) / bin_size))
        # The maximum price lands on index bin_num; fold it into the last bin.
        price_bin[min(bin_idx, bin_num - 1)] += count
    print('price bin %s' % (price_bin,))
    print('bin_size %s' % bin_size)

    histogram = [float(count) / total_imp for count in price_bin]
    # x holds the left edge of every bin plus the right edge of the last one.
    statx = [min_price + idx * bin_size for idx in range(bin_num + 1)]
    # y[idx] is the share of impressions that fell into bins before idx.
    staty = [0]
    running = 0
    for share in histogram:
        running += share
        staty.append(running)
    return {'bin_size': bin_size, 'x': statx, 'y': staty}


def join_log_stat(file_names, bin_num, fit_flag):
    '''
    Summarise join (clear price) logs per (campaign id, adgroup id).

    file_names: list of join-log files to read
    bin_num: number of bins the observed price range is split into
    fit_flag: when equal to True, bin the prices and fit a win function
    return:
        stat_result[(camp_id, grp_id)] = {
            'impression': number of records,
            'click': number of records whose click_flag equals True,
            'cost': sum of win prices,
            'price_stat': {rounded price: impression count}
                (removed once the prices have been binned),
            'price_bin': {'bin_size': ..., 'x': statx, 'y': staty}
                (only when fit_flag; x and y are None if bin_size is 0),
            'win_function_fitting': {'fitting_para': ..., 'filtered_xy': ...}
                or None when the fit returned no parameters
                (only when fit_flag and bin_size is not 0)}
    '''
    stat_result = _collect_join_stats(file_names)

    if fit_flag == True:
        # Bin the prices of every adgroup first ...
        for stats in stat_result.values():
            price_bin = _bin_prices(stats['price_stat'], stats['impression'],
                                    bin_num)
            # The raw price map is only dropped when binning succeeded.
            if price_bin['bin_size'] != 0:
                del stats['price_stat']
            stats['price_bin'] = price_bin

        # ... then fit the win function on the binned samples.
        def fit_fun(x, p):
            # Function to fit: x is the variable, p the parameter pair.
            return (p[0] + x) / (p[1] + x)

        for stats in stat_result.values():
            price_bin = stats['price_bin']
            if price_bin['bin_size'] == 0:
                print('bin size 0')
                continue
            (fitting_para, filtered_xy) = win_function_fitting(
                x=price_bin['x'], y=price_bin['y'], fit_fun=fit_fun)
            # ``is None`` stays unambiguous even if the fit returns an array.
            if fitting_para is None:
                stats['win_function_fitting'] = None
                continue
            print('c1 %s c2 %s' % (fitting_para[0], fitting_para[1]))
            stats['win_function_fitting'] = {
                'fitting_para': fitting_para,
                'filtered_xy': filtered_xy
            }

    return stat_result
# Example #4
# 0
def history_repeat_conditional(bid_history_files, the_ctr_model, bid_model,
                               candidate_paras, condition_type, stat_result):
    '''
    Replay history: simulate delivery over a batch of historical records,
    stopping each candidate parameter once it has used up half of the
    historical cost or clicks.

    paras:
        bid_history_files: list of join-log file names
        the_ctr_model: CTR model exposing predict_ctr(feature_values)
        bid_model: {camp_grp: bid model}; each model exposes get_bids_auc,
            bid_strategy_type and fixed_parameter
        candidate_paras: {camp_grp: sequence of candidate parameters};
            entries with an empty sequence are ignored
        condition_type: {camp_grp: 'cost' | 'click'}, the limited resource;
            any other value disables the limit for that adgroup
        stat_result: {camp_grp: {'cost': ..., 'click': ...}} historical totals
    return:
        paras_perf[camp_grp][para] =
            {'impression':,'avg_pctr':,'click':,'cost':,'ctr':,'ecpc':,'cpm':}
    '''
    candidate_paras = {
        camp_grp: paras
        for camp_grp, paras in candidate_paras.items() if len(paras) != 0
    }
    bpo_logger.info(
        '**************history_repeat function start**************')
    bpo_logger.info('files:%s' % ','.join(bid_history_files))
    bpo_logger.info('************** bid_model: %s**************' %
                    (bid_model.keys()))
    bpo_logger.info('************** candidate_paras: %s**************' %
                    (candidate_paras))
    bpo_logger.info('************** condition_type: %s**************' %
                    (condition_type))

    # stop_flag_idx[camp_grp]: candidates at or beyond this index have hit
    # their resource limit and no longer accumulate.
    stop_flag_idx = {}
    result = {}
    for camp_grp in bid_model:
        # Only adgroups with a bid model, candidates and a condition count.
        if camp_grp not in candidate_paras or camp_grp not in condition_type:
            continue
        para_num = len(candidate_paras[camp_grp])
        stop_flag_idx[camp_grp] = para_num
        result[camp_grp] = [{
            'impression': 0,
            'avg_pctr': 0,
            'click': 0,
            'cost': 0
        } for _ in range(para_num)]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        with open(bid_history_file) as history:
            for line in history:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                # Test for a missing price BEFORE float(): float(None) raises.
                raw_win_price = field_dict['win_price']
                if raw_win_price is None:
                    continue
                win_price = float(raw_win_price)
                if win_price <= 0:
                    continue
                camp_grp = (field_dict['campaign_id'],
                            field_dict['adgroup_id'])
                if camp_grp not in result:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(
                    ctr=pctr, variable_paras=candidate_paras[camp_grp])
                # presumably the index of the first bid >= win_price
                # (going by the variable name) -- binary_search is defined
                # elsewhere; verify.
                first_ge_idx = binary_search(candidate_bid_prices, win_price)

                perf_list = result[camp_grp]
                stop_idx = stop_flag_idx[camp_grp]
                clicked = 1 if field_dict['click_flag'] == True else 0
                # Non-cumulative mode: every still-active candidate from
                # first_ge_idx onwards is credited with this record.
                for tmp_idx in range(first_ge_idx, stop_idx):
                    perf = perf_list[tmp_idx]
                    perf['impression'] += 1
                    perf['avg_pctr'] += pctr
                    perf['click'] += clicked
                    perf['cost'] += win_price

                # Resource limit: 50% of the historical cost or clicks.
                # 2.0 keeps integer totals from being truncated on Python 2.
                limit_metric = condition_type[camp_grp]
                if limit_metric in ('cost', 'click'):
                    stop_flag_idx[camp_grp] = binary_search(
                        [perf_list[idx][limit_metric]
                         for idx in range(stop_idx)],
                        stat_result[camp_grp][limit_metric] / 2.0)

    # Derived metrics: ctr / avg_pctr / ecpc / cpm.
    for perf_list in result.values():
        for perf in perf_list:
            perf['cost'] /= 1000  # change the unit; cpm is per 1000 imps
            impression = perf['impression']
            click = perf['click']
            perf['ctr'] = float(click) / impression if impression != 0 else 0
            perf['avg_pctr'] = (
                (perf['avg_pctr'] + 0.0) / impression
            ) if impression != 0 else 0
            perf['ecpc'] = perf['cost'] / click if click != 0 else 0
            perf['cpm'] = (1000 * perf['cost'] /
                           impression) if impression != 0 else 0

    # Key the results by the actual parameter instead of its index.
    paras_perf = {
        camp_grp: dict(zip(candidate_paras[camp_grp], perf_list))
        for camp_grp, perf_list in result.items()
    }

    # Log the results.
    bpo_logger.info('**************history_repeat function end**************')
    formatter = 'paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n'
    row_format = ' %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n'
    for camp_grp, perf_by_para in paras_perf.items():
        parts = [
            'bid_history_files:%s\n' % ','.join(bid_history_files),
            formatter.format(
                camp=camp_grp[0],
                grp=camp_grp[1],
                strategy=bid_model[camp_grp].bid_strategy_type,
                para=bid_model[camp_grp].fixed_parameter),
            'condition type:%s\n' % condition_type[camp_grp],
            'para click imp cost cpc ctr avg_pctr cpm\n',
            'unit: US dollar\n',
        ]
        for para in sorted(perf_by_para):
            parts.append(str(para) + row_format % perf_by_para[para])
        bpo_logger.info('************parameter performance start************')
        bpo_logger.info(''.join(parts))
        bpo_logger.info('************parameter performance end************')
    return paras_perf
# Example #5
# 0

import sys
sys.path.append('/home/chester/KuaiPan/workspace/tukmob')
import util.join as join
import matplotlib.pyplot as pp


# Directory holding one join log per ad exchange.
dir_pre = '/home/chester/KuaiPan/workspace/tukmob/resource/join_log/join_log_april_adx/'

# For every ad exchange: count how often each win price (rounded to two
# decimals) occurs in its join log and save the counts as a line plot.
for adx_name in ["adiquity", "axonix", "doubleclick", "inmobi", "nexage", "smaato", "tapsense",]:
    parser = join.Parser()
    price_stat = {}
    # ``with`` closes the log before the next exchange is processed.
    with open(dir_pre + '/' + 'join_log_april_1_' + adx_name, 'r') as log_file:
        for line in log_file:
            parser.feed(line)
            field_dict = parser.get_all()
            # get_all() yields None for unusable lines (the other callers
            # of join.Parser in this file check for it); skip those
            # instead of crashing on the subscript.
            if field_dict is None:
                continue
            # float() mirrors the other readers of 'win_price', which do
            # not assume the parser already returns a number.
            price = round(float(field_dict['win_price']), 2)
            price_stat[price] = price_stat.get(price, 0) + 1
    sorted_price_stat = sorted(price_stat)
    pp.clf()
    pp.plot(sorted_price_stat, [price_stat[price] for price in sorted_price_stat])
    pp.title("win_price_" + adx_name)
    pp.savefig('/home/chester/win_price_' + adx_name + ".png")

# Example #6
# 0
def history_repeat(bid_history_files, the_ctr_model, bid_model,
                   candidate_paras):
    '''
    Replay history: simulate delivery over a batch of historical records
    for every candidate parameter, without any resource limit.

    paras:
        bid_history_files: list of join-log file names
        the_ctr_model: CTR model exposing predict_ctr(feature_values)
        bid_model: {camp_grp: bid model}; each model exposes get_bids_auc,
            bid_strategy_type and fixed_parameter
        candidate_paras: {camp_grp: sequence of candidate parameters};
            entries with an empty sequence are ignored
    return:
        paras_perf[camp_grp][para] =
            {'impression':,'avg_pctr':,'click':,'cost':,'ctr':,'ecpc':,'cpm':}
    '''
    candidate_paras = {
        camp_grp: paras
        for camp_grp, paras in candidate_paras.items() if len(paras) != 0
    }
    bpo_logger.info(
        '**************history_repeat function start**************')
    bpo_logger.info('files:%s' % ','.join(bid_history_files))
    bpo_logger.info('************** bid_model: %s**************' %
                    (bid_model.keys()))
    bpo_logger.info('************** candidate_paras: %s**************' %
                    (candidate_paras))

    metrics = ('impression', 'avg_pctr', 'click', 'cost')
    result = {}
    for camp_grp in bid_model:
        # Adgroups whose candidate list is missing or empty were filtered
        # out above; skip them instead of failing with a KeyError.
        if camp_grp not in candidate_paras:
            continue
        result[camp_grp] = [
            dict.fromkeys(metrics, 0)
            for _ in range(len(candidate_paras[camp_grp]))
        ]

    req_parser = join.Parser()
    for bid_history_file in bid_history_files:
        with open(bid_history_file) as history:
            for line in history:
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                # Test for a missing price BEFORE float(): float(None) raises.
                raw_win_price = field_dict['win_price']
                if raw_win_price is None:
                    continue
                win_price = float(raw_win_price)
                if win_price <= 0:
                    continue
                camp_grp = (field_dict['campaign_id'],
                            field_dict['adgroup_id'])
                # result only holds adgroups that have both a bid model and
                # candidate parameters; other records are ignored.
                if camp_grp not in result:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])
                candidate_bid_prices = bid_model[camp_grp].get_bids_auc(
                    ctr=pctr, variable_paras=candidate_paras[camp_grp])
                # presumably the index of the first bid >= win_price
                # (going by the variable name) -- binary_search is defined
                # elsewhere; verify.
                first_ge_idx = binary_search(candidate_bid_prices, win_price)

                # Cumulative mode: credit only the first winning candidate
                # and propagate with a prefix sum afterwards.  Faster than
                # crediting every candidate, but incompatible with
                # per-candidate resource limits.
                perf_list = result[camp_grp]
                if first_ge_idx < len(perf_list):
                    perf = perf_list[first_ge_idx]
                    perf['impression'] += 1
                    perf['avg_pctr'] += pctr
                    perf['click'] += (
                        1 if field_dict['click_flag'] == True else 0)
                    perf['cost'] += win_price

    # Prefix sum: candidate idx also wins everything candidate idx-1 won.
    for perf_list in result.values():
        for idx in range(1, len(perf_list)):
            for tmp_metric in metrics:
                perf_list[idx][tmp_metric] += perf_list[idx - 1][tmp_metric]

    # Derived metrics: ctr / avg_pctr / ecpc / cpm.
    for perf_list in result.values():
        for perf in perf_list:
            perf['cost'] /= 1000  # change the unit
            impression = perf['impression']
            click = perf['click']
            perf['ctr'] = float(click) / impression if impression != 0 else 0
            perf['avg_pctr'] = (
                (perf['avg_pctr'] + 0.0) / impression
            ) if impression != 0 else 0
            perf['ecpc'] = perf['cost'] / click if click != 0 else 0
            perf['cpm'] = (1000 * perf['cost'] /
                           impression) if impression != 0 else 0

    # Key the results by the actual parameter instead of its index.
    paras_perf = {
        camp_grp: dict(zip(candidate_paras[camp_grp], perf_list))
        for camp_grp, perf_list in result.items()
    }

    # Log the results.
    bpo_logger.info('**************history_repeat function end**************')
    formatter = 'paras_performance\ncampaign id:{camp}\nadgroup id:{grp}\nbid strategy:{strategy}\nbid strategy fixed parameter:{para}\n'
    row_format = ' %(click)s %(impression)s %(cost)s %(ecpc)s %(ctr)s %(avg_pctr)s %(cpm)s\n'
    for camp_grp, perf_by_para in paras_perf.items():
        parts = [
            'bid_history_files:%s\n' % ','.join(bid_history_files),
            formatter.format(
                camp=camp_grp[0],
                grp=camp_grp[1],
                strategy=bid_model[camp_grp].bid_strategy_type,
                para=bid_model[camp_grp].fixed_parameter),
            'para click imp cost cpc ctr avg_pctr cpm\n',
            'unit: US dollar\n',
        ]
        for para in sorted(perf_by_para):
            parts.append(str(para) + row_format % perf_by_para[para])

        bpo_logger.info('************parameter performance start************')
        bpo_logger.info(''.join(parts))
        bpo_logger.info('************parameter performance end************')

    return paras_perf
def simulate_camp(target_ecpc,input_files, ctr_model_file_name, bid_landscape_model_file_name, out_file_name):
    """Simulate bidding at ``target_ecpc`` and report results by surplus bin.

    For every usable log line the bid is ``target_ecpc * pctr * 1000``; the
    bid landscape model supplies the win rate and expected cost, from which
    ``surplus = (bid - expected_cost) * win_rate``.  Lines whose bid reaches
    the recorded win price are accumulated into 0.025-wide surplus bins.
    Finally the raw statistics and a table, cumulative from the highest
    surplus bin downwards, are printed.

    :param target_ecpc: target effective cost per click used to derive bids
    :param input_files: iterable of join-log file paths
    :param ctr_model_file_name: model file handed to ``ctr_model.LrCtrModel``
    :param bid_landscape_model_file_name: pickled bid landscape model
    :param out_file_name: output file; it is created/truncated but, with the
        per-line dump disabled, nothing is written to it
    :return: None
    """
    # Surplus bins 0.025, 0.05, ..., 0.975.
    simulate_result = {idx*0.025:{'cnt':0,'cost':0,'click':0} for idx in range(1,40)}

    the_ctr_model = ctr_model.LrCtrModel(ctr_model_file_name, 'new')

    # Pickles are binary data, so open in 'rb'.
    # SECURITY: pickle.load can execute arbitrary code; only load model
    # files from a trusted source.
    with open(bid_landscape_model_file_name, 'rb') as model_file:
        bid_landscape_model = pickle.load(model_file)

    # Keep the historical side effect (file created/truncated) without
    # leaking the handle.
    open(out_file_name, 'w').close()

    surplus_stats = {}

    req_parser = join.Parser()
    line_num = 0
    for input_file in input_files:
        with open(input_file, 'r') as log_file:
            for line in log_file:
                line_num += 1
                if line_num % 1000 == 0:
                    print(line_num)
                req_parser.feed(line)
                field_dict = req_parser.get_all()
                if field_dict is None:
                    continue
                pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])

                # Pull the landscape features and the win price straight
                # from the raw '|'-separated "key:value" fields.
                feat_values = dict.fromkeys(feats)
                # Reset per line so a missing field cannot silently reuse
                # the previous line's price.
                win_price = None
                for key_val in line.strip('\n').split('|'):
                    if ':' not in key_val:
                        continue
                    key, val = key_val.split(':', 1)
                    if key in feats:
                        feat_values[key] = val
                    if key == win_price_field_name:
                        win_price = float(val)
                comb_value = tuple([feat_values[feat] for feat in feats])

                # NOTE(review): comb_value is a tuple, so this comparison
                # with a string is never true and nothing is skipped here;
                # confirm whether "'doubleclick' in comb_value" was meant.
                if comb_value == 'doubleclick':
                    continue

                click_flag = 1 if field_dict['click_flag'] == True else 0
                bid = target_ecpc*pctr*1000
                win_rate = bid_landscape_model.predict(comb_value,bid)
                expected_cost = bid_landscape_model.get_cost(comb_value,bid)

                surplus = (bid - expected_cost) * win_rate
                surplus_key = round(surplus,2)
                surplus_stats[surplus_key] = surplus_stats.get(surplus_key, 0) + 1

                # Only records this bid would have won are simulated.
                if win_price is None or bid < win_price:
                    continue
                surplus_bin = int(surplus/0.025)*0.025
                if surplus_bin in simulate_result:
                    bin_stats = simulate_result[surplus_bin]
                    bin_stats['cnt'] += 1
                    bin_stats['cost'] += win_price
                    if click_flag == 1:
                        bin_stats['click'] += 1

    print(surplus_stats)
    print(simulate_result)
    cnt,cost,click = 0,0,0
    print('surplus bin imp cost click ecpc ctr')
    for surplus_bin in sorted(simulate_result, reverse=True):
        cnt += simulate_result[surplus_bin]['cnt']
        cost += simulate_result[surplus_bin]['cost']
        click += simulate_result[surplus_bin]['click']
        # The top bins are frequently empty; report 0 instead of dividing
        # by zero.
        ecpc = cost/click if click != 0 else 0
        ctr = float(click)/cnt if cnt != 0 else 0
        print('%s %s %s %s %s %s' % (surplus_bin, cnt, cost, click, ecpc, ctr))
def predict_4_files(target_ecpc,input_files, ctr_model_file_name, bid_landscape_model_file_name, out_file_name):
    """Dump per-record bidding predictions for the records a bid would win.

    For every usable log line the bid is ``target_ecpc * pctr * 1000``.
    When that bid is at least the recorded win price, one tab-separated row
    is written to ``out_file_name``:
    click flag, pctr, win rate, bid, win price, expected cost, ecpc
    (``ecpc`` being expected cost divided by pctr).

    :param target_ecpc: target effective cost per click used to derive bids
    :param input_files: iterable of join-log file paths
    :param ctr_model_file_name: model file handed to ``ctr_model.LrCtrModel``
    :param bid_landscape_model_file_name: pickled bid landscape model
    :param out_file_name: path of the tab-separated output file (overwritten)
    :return: None
    """
    the_ctr_model = ctr_model.LrCtrModel(ctr_model_file_name, 'new')

    # Pickles are binary data, so open in 'rb'.
    # SECURITY: pickle.load can execute arbitrary code; only load model
    # files from a trusted source.
    with open(bid_landscape_model_file_name, 'rb') as model_file:
        bid_landscape_model = pickle.load(model_file)

    req_parser = join.Parser()
    line_num = 0
    # ``with`` flushes and closes the output even if a line fails to parse.
    with open(out_file_name,'w') as out_file:
        for input_file in input_files:
            with open(input_file,'r') as log_file:
                for line in log_file:
                    line_num += 1
                    if line_num % 1000 == 0:
                        print(line_num)
                    req_parser.feed(line)
                    field_dict = req_parser.get_all()
                    if field_dict is None:
                        continue
                    pctr = the_ctr_model.predict_ctr(field_dict['feature_values'])

                    # Pull the landscape features and the win price
                    # straight from the raw '|'-separated "key:value" fields.
                    feat_values = dict.fromkeys(feats)
                    # Reset per line so a missing field cannot silently
                    # reuse the previous line's price.
                    win_price = None
                    for key_val in line.strip('\n').split('|'):
                        if ':' not in key_val:
                            continue
                        key, val = key_val.split(':', 1)
                        if key in feats:
                            feat_values[key] = val
                        if key == win_price_field_name:
                            win_price = float(val)
                    comb_value = tuple([feat_values[feat] for feat in feats])

                    # NOTE(review): comb_value is a tuple, so this
                    # comparison with a string is never true and nothing is
                    # skipped here; confirm whether
                    # "'doubleclick' in comb_value" was meant.
                    if comb_value == 'doubleclick':
                        continue

                    click_flag = 1 if field_dict['click_flag'] == True else 0
                    bid = target_ecpc*pctr*1000
                    win_rate = bid_landscape_model.predict(comb_value,bid)
                    expected_cost = bid_landscape_model.get_cost(comb_value,bid)

                    # Only records this bid would have won are written.
                    if win_price is None or bid < win_price:
                        continue

                    # Computed after the filter: it is only needed for rows
                    # that are actually written.
                    ecpc = float(expected_cost)/pctr

                    out_file.write('%d\t%f\t%f\t%f\t%f\t%f\t%f\n' % (click_flag,pctr,win_rate,bid,win_price,expected_cost,ecpc))
# Example #9
# 0
class SampleDataProcessor():
    """Turn raw join-log lines into hashed feature dictionaries."""

    # Fields that are never turned into features (ids, timestamps, raw
    # response payloads, ...).  event_ca, event_ch, event_click_flag,
    # event_click_tr, event_click_ts, event_co and event_grp were commented
    # out of this set in the original source and are therefore absent.
    id_class = {
        'tran_id',
        'req_id',
        'req_device_geo_latitude',
        'req_device_geo_longitude',
        'req_device_ip',
        'req_impressions_id',
        'req_ext_udi_idfa',
        'req_user_custom_data',
        'req_device_ext_nex_ifa',
        'req_device_geo_zip',
        'req_user_geo_zip',
        'req_user_ext_nex_hhi',
        'req_ext_x_uidh',
        'req_site_ref',
        'req_user_keywords',
        'req_device_dev_platform_id_sha1',
        'req_user_id',
        'req_ext_udi_udidmd5',
        'req_ext_udi_udidsha1',
        'req_ext_udi_openudid',
        'req_user_ext_nex_vzwuidh',
        'req_ext_udi_macmd5',
        'req_ext_udi_odin',
        'req_device_ext_nex_macsha1',
        'req_device_dev_platform_id_md5',
        'req_user_cookie_age_seconds',
        'req_device_hashed_idfa',
        'req_app_content_detected_vertical_weight',
        'req_user_ext_nex_dma',
        'req_app_keywords',
        'req_user_geo_city',
        'req_request_time',
        'req_device_dev_platform_id',
        # A missing comma used to fuse this name with a repeated
        # 'req_user_cookie_age_seconds', so this field was never excluded.
        'req_device_geo_city',
        'rsp_id',
        'rsp_seatbid_bid_impid',
        'rsp_seatbid_bid_nurl',
        'rsp_seatbid_bid_adm',
        'rsp_seatbid_bid_price',
        'rsp_seatbid_bid_id',
        'rsp_bidid',
        'event_action_flag',
        'event_action_tr',
        'event_action_ts',
        'event_adx',
        'event_crv',
        'event_cur',
        'event_dev',
        'event_devt',
        'event_enct',
        'event_impression_flag',
        'event_impression_ts',
        'event_ip',
        'event_price',
        'event_win_notice_ts',
    }
    # One parser instance shared by all instances of the class.
    join_parser = join.Parser()

    def gethash(self, str):
        """Hash a feature string with mmh3 (seed 3419) into [0, 16777216).

        16777216 == 2 ** 24; Python's ``%`` with a positive modulus never
        yields a negative result.
        """
        mmh3Value = mmh3.hash(str, 3419)
        return mmh3Value % 16777216

    def clear(self, line):
        """Drop unwanted features and malformed fields from one log line.

        :param line: raw '|'-separated line of "name:value" fields
        :return: ``None`` when the line contains 'event_impression_flag:0';
            otherwise ``(win_price, hashed_feat_vals)`` where ``win_price``
            is the float value of 'event_price' ('' when absent) and
            ``hashed_feat_vals`` maps ``gethash(feature)`` to the
            "name:value" feature string
        """
        if 'event_impression_flag:0' in line:
            return None
        win_price = ''  # win (clearing) price of the record
        hashed_feat_vals = {}
        fields = line.strip('\n').split('|')

        # NOTE(review): the parsed record is not used by this method; the
        # calls are kept in case other code relies on the shared parser
        # having been fed -- confirm and remove if not.
        self.join_parser.feed(line)
        self.join_parser.get_all()

        for elem in fields:
            if ":" not in elem:
                continue
            name, value = elem.split(':', 1)
            if value == '':
                continue
            # The click flag is the label, not a feature.
            if name == 'event_click_flag':
                continue
            if name == 'event_price':
                win_price = float(value)

            # Excluded fields are not hashed.
            if name in self.id_class:
                continue
            if '[' in value or ']' in value:
                # List-valued field: one feature per list element.
                for feat_val in value.strip('[').strip(']').split(','):
                    feature = name + ':' + feat_val
                    hashed_feat_vals[self.gethash(feature)] = feature
            else:
                hashed_feat_vals[self.gethash(elem)] = elem
        return win_price, hashed_feat_vals

    def get_extended_feat(self):
        """Placeholder; currently does nothing."""
        pass
Example #10
0
    def prediction_file(self, sample_file, target_ecpc=0.06,
                        model_file='/home/chester/KuaiPan/workspace/tukmob/resource/wzn/wzn',
                        out_dir='/home/chester', max_lines=1000):
        """Print per-line bid diagnostics and plot their distributions.

        For each of the first ``max_lines`` lines of ``sample_file`` (lines
        whose exchange is 'youku' are skipped) the bid
        ``round(target_ecpc * pctr * 1000, 2)``, the expected price from
        ``self.get_cost`` and the win rate from ``self.get_win_rate`` are
        printed together with the surplus
        ``(bid - expected_price) * win_rate``.  The relative frequencies of
        the bids, expected prices and surpluses (each rounded to two
        decimals) are then saved as ``bid.png``,
        ``expected_price_array.png`` and ``surplus.png`` in ``out_dir``.

        :param sample_file: path of the sample log to read
        :param target_ecpc: target effective cost per click used for the bid
        :param model_file: model file handed to ``ctr_model.LrCtrModel``
        :param out_dir: directory the three PNG files are written to
        :param max_lines: number of lines read before stopping
        :return: None
        """
        import collections
        import os

        import matplotlib.pyplot as pp
        import util.join as join

        req_parser = join.Parser()
        the_model = ctr_model.LrCtrModel(model_file, 'new')
        bid_array = []
        expected_price_array = []
        surplus_array = []
        line_no = 0
        with open(sample_file) as samples:
            for line in samples:
                if line_no == max_lines:
                    print(line_no)
                    break
                line_no += 1
                req_parser.feed(line)
                if req_parser.get_adx() == 'youku':
                    continue
                # Query each model once per line and reuse the values.
                # NOTE(review): assumes predict_ctr / get_win_rate are pure
                # lookups, so calling them once instead of repeatedly gives
                # the same numbers -- confirm.
                pctr = the_model.predict_ctr(line)
                bid = round(target_ecpc * pctr * 1000, 2)
                expected_price = round(self.get_cost(line, bid), 4)
                win_rate = self.get_win_rate(line, bid)
                surplus = (bid - expected_price) * win_rate
                print('win_price pctr bid expected_price win_rate surplus')
                print('%s %s %s %s %s %s' % (req_parser.get_price(), pctr,
                                             bid, expected_price, win_rate,
                                             surplus))
                bid_array.append(bid)  # already rounded to two decimals
                expected_price_array.append(round(expected_price, 2))
                surplus_array.append(round(surplus, 2))

        # One relative-frequency plot per collected series.
        plots = (
            (bid_array, 'bid.png'),
            (expected_price_array, 'expected_price_array.png'),
            (surplus_array, 'surplus.png'),
        )
        for values, file_name in plots:
            cnt = collections.Counter(values)
            total_cnt = len(values)
            sort_cnt_key = sorted(cnt)
            pp.clf()
            pp.plot(sort_cnt_key,
                    [(cnt[key] + 0.0) / total_cnt for key in sort_cnt_key])
            pp.savefig(os.path.join(out_dir, file_name))