def gen_user_hour_click_count(update=False):
    '''
    Generate per-day, per-hour click counts over the full data.
    Join keys: ['userID', 'clickDay', 'clickHour', feature]
    :param update:
    :return:
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)

    ads_feature = [
        'advertiserID', 'camgaignID', 'adID', 'creativeID', 'appID',
        'appCategory'
    ]
    context_feature = ['positionID', 'sitesetID']
    state_feature = ads_feature + context_feature

    for feature in tqdm(state_feature):
        feature_path = feature_data_path + 'user_' + feature + '_click_hour.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_feature_click_hour = data.groupby(
                ['userID', 'clickDay', 'clickHour',
                 feature]).size().reset_index().rename(
                     columns={0: 'user_' + feature + '_click_hour'})
            dump_pickle(user_feature_click_hour, feature_path)
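The groupby/size/rename chain above is the counting idiom used throughout these snippets. A minimal, self-contained sketch on toy data (column values invented, not taken from the contest files):

import pandas as pd

toy = pd.DataFrame({'userID':    [1, 1, 2],
                    'clickDay':  [17, 17, 17],
                    'clickHour': [9, 9, 10],
                    'adID':      [100, 100, 200]})
# size() counts rows per key; rename() gives the unnamed count column a feature name
counts = (toy.groupby(['userID', 'clickDay', 'clickHour', 'adID'])
             .size().reset_index()
             .rename(columns={0: 'user_adID_click_hour'}))
print(counts)  # user 1 clicked adID 100 twice in hour 9; user 2 once in hour 10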
def gen_positionID_cvr_smooth(test_day):
    """
    * 新学trick:对positionID的在个别时间窗口内的ctr进行贝叶斯平滑处理
    """

    feature_path = feature_data_path + 'positionID_cvr_smooth_day_' + str(
        test_day) + '.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        data = load_pickle(raw_data_path + 'train.pkl')
        data = addTime(data)
        positionID_cvr = data[data.clickDay < test_day]
        # size() behaves like count(): it tallies the number of click rows
        I = positionID_cvr.groupby('positionID')['label'].size().reset_index()
        I.columns = ['positionID', 'I']
        # label is 1 for converted clicks, so sum() yields the conversion count
        C = positionID_cvr.groupby('positionID')['label'].sum().reset_index()
        C.columns = ['positionID', 'C']
        positionID_cvr = pd.concat([I, C['C']], axis=1)
        hyper = BayesianSmoothing(1, 1)
        hyper.update(positionID_cvr['I'].values, positionID_cvr['C'].values,
                     10000, 0.00000001)  # trials, successes, max iterations, tolerance
        alpha = hyper.alpha
        beta = hyper.beta
        # Bayesian-smooth the rate with the fitted alpha and beta
        positionID_cvr['positionID_cvr_smooth'] = (
            positionID_cvr['C'] + alpha) / (positionID_cvr['I'] + alpha + beta)
        dump_pickle(positionID_cvr[['positionID', 'positionID_cvr_smooth']],
                    feature_path)
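BayesianSmoothing is referenced but never defined in these snippets. Below is a minimal sketch that matches the constructor and the update(I, C, iter_num, epsilon) call above, using the standard fixed-point iteration for fitting a Beta prior to (trials, successes) pairs; this is an assumption about the original class, not its actual code:

import scipy.special as special

class BayesianSmoothing(object):
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def update(self, tries, success, iter_num, epsilon):
        # Iterate the fixed-point update until both parameters move less
        # than epsilon, or iter_num iterations elapse.
        for _ in range(iter_num):
            new_alpha, new_beta = self.__fixed_point(tries, success,
                                                     self.alpha, self.beta)
            if (abs(new_alpha - self.alpha) < epsilon
                    and abs(new_beta - self.beta) < epsilon):
                break
            self.alpha, self.beta = new_alpha, new_beta

    def __fixed_point(self, tries, success, alpha, beta):
        # One fixed-point step for the Beta(alpha, beta) maximum likelihood fit.
        sum_a = sum_b = sum_ab = 0.0
        for i, c in zip(tries, success):
            sum_a += special.digamma(c + alpha) - special.digamma(alpha)
            sum_b += special.digamma(i - c + beta) - special.digamma(beta)
            sum_ab += special.digamma(i + alpha + beta) - special.digamma(alpha + beta)
        return alpha * (sum_a / sum_ab), beta * (sum_b / sum_ab)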
def gen_tricks(start_day, end_day):
    '''
    Generate the trick, first_diff, last_diff and install2click features,
    joined back on global_index.
    :param start_day:
    :param end_day:
    :return:
    '''
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    actions = load_pickle(raw_data_path + 'user_app_actions.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    data = addTime(data)
    data = addAd(data)

    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            df = data.loc[data.clickDay == day]
            df = add_trick(df)
            df = add_diff(df)
            df = add_install2click(df, day, actions)
            dump_pickle(
                df[[
                    'global_index', 'trick', 'first_diff', 'last_diff',
                    'install2click'
                ]], feature_path)
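add_trick, add_diff and add_install2click are helpers defined elsewhere in the repository. As an illustration only, here is a hedged sketch of add_install2click, assuming user_app_actions carries (userID, appID, installTime) rows and that times use the same DDHHMM encoding as clickTime; the real helper may differ:

def add_install2click(df, day, actions):
    # Keep installs strictly before the current click day (assumed DDHHMM
    # encoding, so day * 10000 is midnight of that day).
    prior = actions[actions.installTime < day * 10000]
    # Most recent prior install per (user, app) pair.
    prior = prior.sort_values('installTime').drop_duplicates(
        ['userID', 'appID'], keep='last')
    df = pd.merge(df, prior[['userID', 'appID', 'installTime']],
                  'left', on=['userID', 'appID'])
    # Gap between the click and the earlier install (NaN if never installed).
    df['install2click'] = df['clickTime'] - df['installTime']
    return df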
def gen_ID_global_sum_count(
        last_day=27,
        stats_features=['positionID', 'creativeID', 'appID', 'adID',
                        'userID']):
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    del train, test
    gc.collect()
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)

    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        """
        The data comes from train.pkl and test.pkl, i.e. impression/click
        records, so aggregating with size() counts impressions.
        """
        feature_count_sum = pd.DataFrame(
            data.groupby(feature).size()).reset_index().rename(
                columns={0: feature + '_sum_count'})
        dump_pickle(feature_count_sum, feature_path)
def gen_CountVector_ID_user_clicks(ID_name,
                                   last_day=27,
                                   ID_describe_feature_names=[
                                       'age_cut', 'gender', 'education',
                                       'marriageStatus', 'haveBaby'
                                   ],
                                   drop_na=False):
    '''
    Build count-vector descriptions of each ID_name value from the clicks in
    the train and test tables.
    Join key: [ID_name]
    :param ID_name:
    :param last_day:
    :param ID_describe_feature_names:
    :param drop_na:
    :return:
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    data = data[['userID', ID_name]]
    user_info = pd.read_csv(raw_data_path + 'user.csv')

    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
                                  labels=False)
    user_info.loc[user_info.education == 7, 'education'] = 6

    user_info['hometown_province'] = user_info['hometown'].apply(
        lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(
        lambda x: x // 100)

    for feature in tqdm(ID_describe_feature_names):
        feature_path = feature_data_path + 'CountVector_' + ID_name + '_user_clicks_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if drop_na:
            feature_path += '.no_na'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        prefix_name = ID_name + '_user_clicks_' + feature
        sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                       columns=[feature],
                                       prefix=prefix_name)
        if drop_na:
            sub_user_info.drop([prefix_name + '_0'], axis=1, inplace=True)
        data = pd.merge(data, sub_user_info, 'left', 'userID')
        dummy_features = sub_user_info.columns.tolist()
        dummy_features.remove('userID')
        ID_describe_feature = data[[ID_name] + dummy_features].groupby(
            [ID_name], as_index=False).sum()
        data.drop(dummy_features, axis=1, inplace=True)
        dump_pickle(ID_describe_feature, feature_path)
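The loop above one-hot encodes one user attribute at a time and sums the dummies per ID, yielding, for example, how many of an appID's clicks came from each gender bucket. A toy illustration of that pattern (invented data):

import pandas as pd

clicks = pd.DataFrame({'appID': [10, 10, 11], 'userID': [1, 2, 3]})
users = pd.DataFrame({'userID': [1, 2, 3], 'gender': [1, 2, 1]})
dummies = pd.get_dummies(users, columns=['gender'],
                         prefix='appID_user_clicks_gender')
merged = pd.merge(clicks, dummies, 'left', 'userID')
# Summing the dummy columns per appID gives the per-attribute click counts.
count_vector = merged.drop('userID', axis=1).groupby(
    'appID', as_index=False).sum()
print(count_vector)  # appID 10: one click from gender 1, one from gender 2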
def generate_stats_feature():
    '''
    Concatenate train and test, then add summary statistics of the user
    click-count features.
    :return:
    '''
    feature_path = feature_data_path + 'UserClickStats.pkl'
    if os.path.exists(feature_path):
        print('Found', feature_path)
    else:
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        data = train.append(test)
        del train, test
        gc.collect()
        data = addTime(data)
        data = addAd(data)
        data = addPosition(data)
        data = addAppCategories(data)
        data = add_user_day_click(data)
        data = add_user_day_click_count(
            data, feature_list=['camgaignID', 'adID', 'appID', 'sitesetID'])
        # data = add_user_day_hour_count(data)
        # train_origin_features = train.columns.values.tolist()
        # test_origin_features = test.columns.values.tolist()

        feature_names = [
            'user_adID_click_day_mean',  # not every available stat is included
            'user_adID_click_day_min',
            'user_camgaignID_click_day_min',
            'user_appID_click_day_mean',
            'user_appID_click_day_max',
            'user_appID_click_day_min',
            'user_sitesetID_click_day_mean',
            'user_sitesetID_click_day_max',
            'user_sitesetID_click_day_min',
            'user_click_day_mean',
            'user_click_day_max',
            'user_click_day_min'
        ]

        print('Generating', feature_path)
        columns_day = [
            'user_adID_click_day', 'user_camgaignID_click_day',
            'user_appID_click_day', 'user_sitesetID_click_day',
            'user_click_day'
        ]
        columns_hour = [
            'user_adID_click_hour', 'user_camgaignID_click_hour',
            'user_appID_click_hour', 'user_sitesetID_click_hour'
        ]
        sub_feature = ['userID', 'clickTime']
        # data = pd.concat([train[sub_feature+columns_day+columns_hour],test[sub_feature+columns_day+columns_hour]])
        for col in tqdm(columns_day):
            data = gen_click_stats(data, col)
        # for col in tqdm(columns_day):
        #     data = add
        data = data[feature_names + ['userID']].drop_duplicates(['userID'])
        dump_pickle(data, feature_path)
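gen_click_stats is not shown in these snippets either. Judging from the feature names above ('user_*_click_day_mean/max/min'), it plausibly reduces a daily count column to per-user statistics and merges them back; a sketch under that assumption (the original helper may deduplicate differently):

def gen_click_stats(data, col):
    # Per-user mean/max/min of the daily count column `col`, merged back
    # onto the click rows. This reconstruction is an assumption based on
    # the feature names, not the original code.
    stats = data.groupby('userID')[col].agg(['mean', 'max', 'min']).reset_index()
    stats.columns = ['userID', col + '_mean', col + '_max', col + '_min']
    return pd.merge(data, stats, how='left', on='userID')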
def generate_click_trick():
    # df['origin_index'] = df.index
    feature_path = feature_data_path + 'global_tricks.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        train = pd.read_pickle(raw_data_path + 'train.pkl')
        test = pd.read_pickle(raw_data_path + 'test.pkl')
        df = train.append(test)
        df = df[[
            'global_index',
            'creativeID',
            'userID',
            'label',
            'clickTime',
        ]]
        del train, test
        df = addTime(df)
        gc.collect()
        uct_cnt = df.groupby(['userID', 'creativeID']).size().reset_index()
        uct_cnt.rename(columns={0: 'global_uct_cnt'}, inplace=True)
        df = pd.merge(df, uct_cnt, how='left', on=['userID', 'creativeID'])

        df_1 = df.sort_values(by=['userID', 'clickTime'], ascending=True)
        first = df_1.drop_duplicates('userID')
        first['global_first'] = 1
        first = first[['userID', 'clickTime', 'global_first']]
        df = pd.merge(df, first, how='left', on=['userID', 'clickTime'])

        df_2 = df.sort_values(by=['userID', 'clickTime'], ascending=False)
        last = df_2.drop_duplicates('userID')
        last['global_last'] = 1
        last = last[['userID', 'clickTime', 'global_last']]
        df = pd.merge(df, last, how='left', on=['userID', 'clickTime'])
        pd.to_pickle(
            df[[
                'clickDay',
                'global_uct_cnt',
                'global_first',
                'global_last',
            ]], feature_path)
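The first/last flags above rely on drop_duplicates keeping the first row of a sorted frame. A toy sketch of the idiom (invented data):

import pandas as pd

df = pd.DataFrame({'userID': [1, 1, 2],
                   'clickTime': [170900, 171200, 171000]})
first = (df.sort_values(['userID', 'clickTime'], ascending=True)
           .drop_duplicates('userID'))  # earliest click per user survives
first['global_first'] = 1
flagged = pd.merge(df, first, how='left', on=['userID', 'clickTime'])
# rows matching a user's earliest clickTime get global_first == 1, others NaN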
def gen_user_day_click():
    feature_path = feature_data_path + 'user_day_clicks.pkl'
    if os.path.exists(feature_path):
        print('Found ' + feature_path)
    else:
        print('Generating ' + feature_path)
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        all_data = train.append(test)
        all_data = addTime(all_data)  # add derived time columns such as clickDay
        """
        .size(): like count(), it tallies the number of rows per group.
        .reset_index().rename(columns={0: 'user_click_day'}): only renames the
        aggregated count column to 'user_click_day'.
        """
        user_click_day = pd.DataFrame(
            all_data.groupby([
                'clickDay', 'userID'
            ]).size()).reset_index().rename(columns={0: 'user_click_day'})

        # pickling here plays a role similar to persisting in Spark
        dump_pickle(user_click_day, feature_path)
def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    """
    * 新学trick:对传入的时间窗口的每一天的ctr做Laplace Smoothing
    """
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    ID_hist_cvr = None
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(
            day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            """
            - 这里做的是Laplace Smoothing
            - 参见https://blog.csdn.net/bbbeoy/article/details/71249316
            """
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (
                dfCvr['label_1'] + dfCvr['label_0'] + alpha * 2)
            sub_data = pd.merge(data.loc[data.clickDay == day,
                                         ['clickDay', key]],
                                dfCvr[[key, key + '_cvr']],
                                'left',
                                on=[
                                    key,
                                ])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']],
                        feature_path)
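A quick numeric check of the smoothed formula with the default alpha = 0.25 (toy counts, not contest data):

alpha = 0.25
print((1 + alpha) / (3 + 2 * alpha))  # 1 conversion / 3 clicks -> ~0.357 (raw 0.333)
print((0 + alpha) / (3 + 2 * alpha))  # 0 conversions / 3 clicks -> ~0.071 (raw 0.0)
# The Beta(alpha, alpha) prior pulls sparse keys toward 0.5 and keeps
# zero-conversion keys away from exactly 0.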