def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    # train_data = pd.read_csv(raw_data_path, 'train.csv')
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    ID_hist_cvr = None
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(
            day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (
                dfCvr['label_1'] + dfCvr['label_0'] + alpha * 2)
            # dfCvr['clickDay'] = day
            sub_data = pd.merge(data.loc[data.clickDay == day,
                                         ['clickDay', key]],
                                dfCvr[[key, key + '_cvr']],
                                'left',
                                on=[
                                    key,
                                ])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']],
                        feature_path)
def gen_user_group_install():
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    user_info = load_pickle(raw_data_path + 'user.pkl')
    user_info['age_cut_small'] = pd.cut(user_info['age'],
                                        bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
                                        labels=False)
    user_info['education_new'] = user_info['education']
    user_info.loc[user_info.education_new == 7, 'education_new'] = 6
    user_info_comb = user_info[['age_cut_small', 'gender',
                                'education_new']].drop_duplicates()
    user_info_comb['user_group'] = np.arange(0, user_info_comb.shape[0])
    user_info = pd.merge(user_info, user_info_comb, 'left',
                         ['age_cut_small', 'gender', 'education_new'])
    user_install = pd.merge(
        user_install,
        user_info[['userID', 'user_group', 'age_cut_small', 'gender',
                   'education_new']], 'left', 'userID')
    def update_dict(row,dic):
        dic[row['appID']] += 1
    user_group_install = None
    for i,u_g in tqdm(enumerate(user_install.user_group.unique())):
        sub_install = user_install[user_install.user_group==u_g]
        install_dict = dict((k,0) for k in user_install.appID.unique())
        install_dict['user_group'] = u_g
        install_dict['age_cut_small'] = sub_install['age_cut_small'].iloc[0]
        install_dict['gender'] = sub_install['gender'].iloc[0]
        install_dict['education_new'] = sub_install['education_new'].iloc[0]
        sub_install.apply(update_dict, args=(install_dict,),axis=1,)
        if user_group_install is None:
            user_group_install = pd.DataFrame(install_dict,index=[i,])
        else:
            user_group_install = pd.concat([user_group_install,pd.DataFrame(install_dict,index=[i,])])
    dump_pickle(user_group_install,feature_data_path+'user_group_install.pkl')
def gen_user_hour_click_count(update=False):
    '''
    Generate per-day, per-hour click counts over all data.
    Join keys: ['ID_name', 'clickDay', 'clickHour']
    :param update:
    :return:
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)

    ads_feature = [
        'advertiserID', 'camgaignID', 'adID', 'creativeID', 'appID',
        'appCategory'
    ]
    context_feature = ['positionID', 'sitesetID']
    state_feature = ads_feature + context_feature

    for feature in tqdm(state_feature):
        feature_path = feature_data_path + 'user_' + feature + '_click_hour.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_feature_click_day = data.groupby(
                ['userID', 'clickDay', 'clickHour',
                 feature]).size().reset_index().rename(
                     columns={0: 'user_' + feature + '_click_hour'})
            dump_pickle(user_feature_click_day, feature_path)
def gen_tricks(start_day, end_day):
    '''
    Generate trick features: trick, first_diff, last_diff, install2click.
    Joined on global_index.
    :param start_day:
    :param end_day:
    :return:
    '''
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    actions = load_pickle(raw_data_path + 'user_app_actions.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    data = addTime(data)
    data = addAd(data)

    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            df = data.loc[data.clickDay == day]
            df = add_trick(df)
            df = add_diff(df)
            df = add_install2click(df, day, actions)
            dump_pickle(
                df[[
                    'global_index', 'trick', 'first_diff', 'last_diff',
                    'install2click'
                ]], feature_path)
def gen_ID_global_sum_count(
        last_day=27,
        stats_features=['positionID', 'creativeID', 'appID', 'adID',
                        'userID']):
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    del train, test
    gc.collect()
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)

    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        """
        The data comes from train.pkl and test.pkl, i.e. impression/click records.
        The aggregation uses size(), so the statistic counts impressions.
        """
        feature_count_sum = pd.DataFrame(
            data.groupby(feature).size()).reset_index().rename(
                columns={0: feature + '_sum_count'})
        dump_pickle(feature_count_sum, feature_path)
def getConcatedAppIDCountVector(concated_list=None):
    '''
    Join key: ['appID'] -- calls gen_CountVector_appID_user_installed()
    :param concated_list:
    :return:
    '''
    if concated_list is None:
        concated_list = ['age_cut','gender','education','marriageStatus','haveBaby']

    concated_countvec = None
    for feature in tqdm(concated_list):
        feature_path = feature_data_path + 'CountVector_appID_user_installed_' + feature + '.pkl'
        if os.path.exists(feature_path):
            count_vec = load_pickle(feature_path)
        else:
            # call function, dump_pickle
            gen_CountVector_appID_user_installed(concated_list)

            # load_pickle
            count_vec = load_pickle(feature_path)

            """
            - 看着好像很繁琐,其实这样做可以保留中间计算数据,避免重算,是非常Spark的玩法。
            - coder几乎在任何用到pickle的地方都加入了os.path.exists(feature_path)的条件判断,这就是用pickle的正确方式。
            - 可以优化的地方:用cPickle代替pickle。
            """
        if concated_countvec is None:
            concated_countvec = count_vec
        else:
            concated_countvec = pd.merge(concated_countvec, count_vec, on='appID', how='left')
    return concated_countvec
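# The note above suggests swapping pickle for cPickle. A minimal sketch of what
# that could look like (helper names and bodies are an assumption, not the repo's
# load_pickle/dump_pickle implementation): cPickle is the C implementation on
# Python 2, while on Python 3 the standard pickle module is already C-accelerated.
try:
    import cPickle as pickle  # Python 2
except ImportError:
    import pickle  # Python 3


def load_pickle_fast(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


def dump_pickle_fast(obj, path, protocol=pickle.HIGHEST_PROTOCOL):
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol)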
def gen_CountVector_ID_user_clicks(ID_name,
                                   last_day=27,
                                   ID_describe_feature_names=[
                                       'age_cut', 'gender', 'education',
                                       'marriageStatus', 'haveBaby'
                                   ],
                                   drop_na=False):
    '''
    Generate count-description vectors for ID_name computed from the train and test tables.
    Join key: [ID_name]
    :param ID_name:
    :param last_day:
    :param ID_describe_feature_names:
    :param drop_na:
    :return:
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    data = data[['userID', ID_name]]
    user_info = pd.read_csv(raw_data_path + 'user.csv')

    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
                                  labels=False)
    user_info.loc[user_info.education == 7, 'education'] = 6

    user_info['hometown_province'] = user_info['hometown'].apply(
        lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(
        lambda x: x // 100)

    for feature in tqdm(ID_describe_feature_names):
        feature_path = feature_data_path + 'CountVector_' + ID_name + '_user_clicks_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if drop_na:
            feature_path += '.no_na'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        prefix_name = ID_name + '_user_clicks_' + feature
        sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                       columns=[feature],
                                       prefix=prefix_name)
        if drop_na:
            sub_user_info.drop([prefix_name + '_0'], axis=1, inplace=True)
        data = pd.merge(data, sub_user_info, 'left', 'userID')
        dummy_features = sub_user_info.columns.tolist()
        dummy_features.remove('userID')
        ID_describe_feature = data[[ID_name] + dummy_features].groupby(
            [ID_name], as_index=False).sum()
        data.drop(dummy_features, axis=1, inplace=True)
        dump_pickle(ID_describe_feature, feature_path)
def generate_stats_feature():
    '''
    Concatenate train and test, then add statistical features of the user click data.
    :return:
    '''
    feature_path = feature_data_path + 'UserClickStats.pkl'
    if os.path.exists(feature_path):
        print('Found', feature_path)
    else:
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        data = train.append(test)
        del train, test
        gc.collect()
        data = addTime(data)
        data = addAd(data)
        data = addPosition(data)
        data = addAppCategories(data)
        data = add_user_day_click(data)
        data = add_user_day_click_count(
            data, feature_list=['camgaignID', 'adID', 'appID', 'sitesetID'])
        # data = add_user_day_hour_count(data)
        # train_origin_features = train.columns.values.tolist()
        # test_origin_features = test.columns.values.tolist()

        feature_names = [
            'user_adID_click_day_mean',  # some of the click stats are not included here
            'user_adID_click_day_min',
            'user_camgaignID_click_day_min',
            'user_appID_click_day_mean',
            'user_appID_click_day_max',
            'user_appID_click_day_min',
            'user_sitesetID_click_day_mean',
            'user_sitesetID_click_day_max',
            'user_sitesetID_click_day_min',
            'user_click_day_mean',
            'user_click_day_max',
            'user_click_day_min'
        ]

        print('Generating', feature_path)
        columns_day = [
            'user_adID_click_day', 'user_camgaignID_click_day',
            'user_appID_click_day', 'user_sitesetID_click_day',
            'user_click_day'
        ]
        columns_hour = [
            'user_adID_click_hour', 'user_camgaignID_click_hour',
            'user_appID_click_hour', 'user_sitesetID_click_hour'
        ]
        sub_feature = ['userID', 'clickTime']
        # data = pd.concat([train[sub_feature+columns_day+columns_hour],test[sub_feature+columns_day+columns_hour]])
        for col in tqdm(columns_day):
            data = gen_click_stats(data, col)
        # for col in tqdm(columns_day):
        #     data = add
        data = data[feature_names + ['userID']].drop_duplicates(['userID'])
        dump_pickle(data, feature_path)
def gen_user_day_click():
    feature_path = feature_data_path + 'user_day_clicks.pkl'
    if os.path.exists(feature_path):
        print('Found ' + feature_path)
    else:
        print('Generating ' + feature_path)
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        all_data = train.append(test)
        all_data = addTime(all_data)
        user_click_day = pd.DataFrame(
            all_data.groupby([
                'clickDay', 'userID'
            ]).size()).reset_index().rename(columns={0: 'user_click_day'})
        dump_pickle(user_click_day, feature_path)
def gen_positionID_cvr_smooth(test_day):
    """
    * 新学trick:对positionID的在个别时间窗口内的ctr进行贝叶斯平滑处理
    """

    feature_path = feature_data_path + 'positionID_cvr_smooth_day_' + str(
        test_day) + '.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        data = load_pickle(raw_data_path + 'train.pkl')
        data = addTime(data)
        positionID_cvr = data[data.clickDay < test_day]
        I = positionID_cvr.groupby('positionID')['label'].size().reset_index(
        )  # size() behaves like count(): number of rows, i.e. impressions
        I.columns = ['positionID', 'I']
        C = positionID_cvr.groupby('positionID')['label'].sum().reset_index(
        )  # label is 1 for a click, so sum() gives the total number of clicks
        C.columns = ['positionID', 'C']
        positionID_cvr = pd.concat([I, C['C']], axis=1)
        hyper = BayesianSmoothing(1, 1)
        hyper.update(positionID_cvr['I'].values, positionID_cvr['C'].values,
                     10000, 0.00000001)  # impressions, clicks, max iterations, tolerance
        alpha = hyper.alpha
        beta = hyper.beta
        # smooth with the fitted alpha and beta
        positionID_cvr['positionID_cvr_smooth'] = (
            positionID_cvr['C'] + alpha) / (positionID_cvr['I'] + alpha + beta)
        dump_pickle(positionID_cvr[['positionID', 'positionID_cvr_smooth']],
                    feature_path)
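# BayesianSmoothing is defined elsewhere in the repo. As an illustration only
# (an assumption, not the repo's code), the usual implementation fits a
# Beta(alpha, beta) prior to the per-positionID (impressions, clicks) pairs via
# fixed-point iteration; the update(tries, success, iter_num, epsilon) signature
# below matches the call above.
import scipy.special as special


class BayesianSmoothingSketch(object):
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def update(self, tries, success, iter_num, epsilon):
        for _ in range(iter_num):
            new_alpha, new_beta = self._fixed_point_iteration(
                tries, success, self.alpha, self.beta)
            if abs(new_alpha - self.alpha) < epsilon and abs(new_beta - self.beta) < epsilon:
                break
            self.alpha, self.beta = new_alpha, new_beta

    def _fixed_point_iteration(self, tries, success, alpha, beta):
        # moment-matching fixed-point update for the Beta-Binomial model
        sum_fz1 = (special.digamma(success + alpha) - special.digamma(alpha)).sum()
        sum_fz2 = (special.digamma(tries - success + beta) - special.digamma(beta)).sum()
        sum_fz = (special.digamma(tries + alpha + beta) - special.digamma(alpha + beta)).sum()
        return alpha * (sum_fz1 / sum_fz), beta * (sum_fz2 / sum_fz)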
def gen_user_start_installed_cateA():
    '''
    Count, for each user, the number of pre-installed apps in each top-level category.
    Join key: ['userID']
    :return:
    '''
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    app_cate = pd.read_csv(raw_data_path + 'app_categories.csv')
    app_cate['cate_a'] = app_cate.appCategory.apply(lambda x: x // 100
                                                    if x > 100 else x)
    user_install = user_install.merge(app_cate, on='appID', how='left')
    for cate_a in tqdm(app_cate.cate_a.unique()):
        feature_path = feature_data_path + 'user_start_installed_cate_' + str(
            cate_a) + '.pkl'
        if os.path.exists(feature_path):
            print('Found ' + feature_path)
        else:
            print('Generating ' + feature_path)
            user_install_cate = user_install[user_install.cate_a == cate_a][[
                'userID', 'cate_a'
            ]]
            user_install_cate.rename(
                columns={'cate_a': 'user_start_install_cate_' + str(cate_a)},
                inplace=True)
            user_install_cate = user_install_cate.groupby(
                'userID', as_index=False).sum()
            dump_pickle(user_install_cate, feature_path)
def add_user_start_installed_cateA(data):
    for cate in tqdm([0, 1, 2, 3, 4, 5]):
        feature_path = feature_data_path + 'user_start_installed_cate_' + str(
            cate) + '.pkl'
        user_start_installed_cateA = load_pickle(feature_path)
        data = pd.merge(data, user_start_installed_cateA, 'left', 'userID')
    return data
def get_ConcatedAppIDTfidfVector_userinstalled(concated_list=None, mode='local', norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):
    """
    拼接键 ['appID']
	
	** 新学trick: 使用td-idf对one-hot后的特征进行转换,使数据集更加稠密。
	
    为什么要对one-hot的结果进行tfidf转换,这么做的好处是什么?
	- make matrix dense;
	- but need test to prove its usefulness;
    """

    """
    复习一下assert expression AssertionError
    if not expression:
        raise AssertionError
    """
    assert mode in ['global', 'local'], 'mode must be global or local~~'

    if concated_list is None:
        concated_list = ['age_cut', 'gender', 'education', 'marriageStatus', 'haveBaby']

    tfidf_vec = TfidfTransformer(norm=norm, use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    if mode == 'global':
        concated_countvec = getConcatedAppIDCountVector(concated_list)
        concated_countvec.set_index('appID', inplace=True)  # move 'appID' out of the feature columns into the index
        vec_columns = concated_countvec.columns  # the remaining columns are the ones to tf-idf transform
        global_tfidf_vec = tfidf_vec.fit_transform(concated_countvec).todense()  # tf-idf transform the one-hot counts
        global_tfidf_vec = pd.DataFrame(global_tfidf_vec, columns=vec_columns, index=concated_countvec.index).reset_index()  # replace the one-hot counts with the tf-idf values
        return global_tfidf_vec
    else:
        concated_tfidf_vec = None
        for feature in tqdm(concated_list):
            feature_path = feature_data_path + 'CountVector_appID_user_installed_' + feature + '.pkl'
            if os.path.exists(feature_path):
                count_vec = load_pickle(feature_path)
            else:
                gen_CountVector_appID_user_installed(concated_list)
                count_vec = load_pickle(feature_path)
            count_vec.set_index('appID', inplace=True)
            vec_columns = count_vec.columns
            local_tfidf_vec = tfidf_vec.fit_transform(count_vec).todense()
            local_tfidf_vec = pd.DataFrame(local_tfidf_vec, columns=vec_columns, index=count_vec.index).reset_index()
            if concated_tfidf_vec is None:
                concated_tfidf_vec = local_tfidf_vec
            else:
                concated_tfidf_vec = pd.merge(concated_tfidf_vec, local_tfidf_vec, on='appID', how='left')
        return concated_tfidf_vec
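# A tiny illustration (not part of the original repo) of what the tf-idf
# re-weighting above does to a raw count matrix: TfidfTransformer takes counts
# and returns idf-weighted, L2-normalized rows.
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

demo_counts = np.array([[3, 1, 0],
                        [2, 0, 0],
                        [3, 2, 1]])
demo_tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
demo_weighted = demo_tfidf.fit_transform(demo_counts).toarray()
# each row now has unit L2 norm, and columns that appear in fewer rows get a
# higher idf weight than columns that appear in every row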
def get_ConcatedAppIDTfidfVector_userinstalled(concated_list=[
    'age_cut', 'gender', 'education', 'marriageStatus', 'haveBaby'
],
                                               mode='local',
                                               norm='l2',
                                               use_idf=True,
                                               smooth_idf=True,
                                               sublinear_tf=False):
    assert mode in ['global', 'local'], 'mode must be global or local'
    tfidf_vec = TfidfTransformer(norm=norm,
                                 use_idf=use_idf,
                                 smooth_idf=smooth_idf,
                                 sublinear_tf=sublinear_tf)
    if mode == 'global':
        concated_countvec = getConcatedAppIDCountVector(concated_list)
        concated_countvec.set_index('appID', inplace=True)
        vec_columns = concated_countvec.columns
        global_tfidf_vec = tfidf_vec.fit_transform(concated_countvec).todense()
        global_tfidf_vec = pd.DataFrame(
            global_tfidf_vec,
            columns=vec_columns,
            index=concated_countvec.index).reset_index()
        return global_tfidf_vec
    else:
        concated_tfidf_vec = None
        for feature in tqdm(concated_list):
            feature_path = feature_data_path + 'CountVector_appID_user_installed_' + feature + '.pkl'
            if os.path.exists(feature_path):
                count_vec = load_pickle(feature_path)
            else:
                gen_CountVector_appID_user_installed(concated_list)
                count_vec = load_pickle(feature_path)
            count_vec.set_index('appID', inplace=True)
            vec_columns = count_vec.columns
            local_tfidf_vec = tfidf_vec.fit_transform(count_vec).todense()
            local_tfidf_vec = pd.DataFrame(
                local_tfidf_vec, columns=vec_columns,
                index=count_vec.index).reset_index()
            if concated_tfidf_vec is None:
                concated_tfidf_vec = local_tfidf_vec
            else:
                concated_tfidf_vec = pd.merge(concated_tfidf_vec,
                                              local_tfidf_vec,
                                              on='appID',
                                              how='left')
        return concated_tfidf_vec
def add_app_hist_install(data):
    feature_path = feature_data_path + 'app_hist_install.pkl'
    app_hist_install = load_pickle(feature_path)
    # normalize the cumulative install count by the number of elapsed days
    app_hist_install['app_hist_install'] = app_hist_install[
        'app_hist_install'] / (app_hist_install['clickDay'] - 1)
    data = pd.merge(data,
                    app_hist_install,
                    on=['appID', 'clickDay'],
                    how='left')
    return data
def add_smooth_pos_cvr(data, test_day):
    """
    - 将每一天的feature数据load_pickle出来
    - 和原始的data_frame进行merge后,返回
    - do next step with new data_frame
    """
    feature_path = feature_data_path + 'positionID_cvr_smooth_day_' + str(
        test_day) + '.pkl'
    smooth_pos_cvr = load_pickle(feature_path)
    data = pd.merge(data, smooth_pos_cvr, 'left', 'positionID')
    return data
def gen_user_day_click():
    feature_path = feature_data_path + 'user_day_clicks.pkl'
    if os.path.exists(feature_path):
        print('Found ' + feature_path)
    else:
        print('Generating ' + feature_path)
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        all_data = train.append(test)
        all_data = addTime(all_data)  # add time-derived columns
        """
        .size(): like count(), tallies the number of rows
        .reset_index().rename(columns={0: 'user_click_day'}): just renames the
        aggregated column to 'user_click_day'
        """
        user_click_day = pd.DataFrame(
            all_data.groupby([
                'clickDay', 'userID'
            ]).size()).reset_index().rename(columns={0: 'user_click_day'})

        # pickling here is similar to persisting an RDD in Spark
        dump_pickle(user_click_day, feature_path)
def add_user_day_click(data):
    '''
    Add the user's total click count for the day.
    :param data:
    :return:
    '''
    feature_path = feature_data_path + 'user_day_clicks.pkl'
    if not os.path.exists(feature_path):
        gen_user_day_click()
    user_click_day = load_pickle(feature_path)
    data = pd.merge(data, user_click_day, 'left', ['clickDay', 'userID'])
    return data
def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    """
    * 新学trick:对传入的时间窗口的每一天的ctr做Laplace Smoothing
    """
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    ID_hist_cvr = None
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(
            day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            """
            - 这里做的是Laplace Smoothing
            - 参见https://blog.csdn.net/bbbeoy/article/details/71249316
            """
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (
                dfCvr['label_1'] + dfCvr['label_0'] + alpha * 2)
            sub_data = pd.merge(data.loc[data.clickDay == day,
                                         ['clickDay', key]],
                                dfCvr[[key, key + '_cvr']],
                                'left',
                                on=[
                                    key,
                                ])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']],
                        feature_path)
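# A quick numeric check of the smoothing above (an illustration, not repo code):
# with alpha = 0.25, an ID with 2 conversions out of 10 clicks gets
# (2 + 0.25) / (10 + 0.5) ~= 0.2143 instead of the raw 0.2, and an ID with
# 0 conversions out of 1 click gets 0.25 / 1.5 ~= 0.1667 instead of 0.
alpha_demo = 0.25
print((2 + alpha_demo) / (10 + 2 * alpha_demo))  # ~0.2143
print((0 + alpha_demo) / (1 + 2 * alpha_demo))   # ~0.1667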
def add_hist_cvr_smooth(data, key):
    hist_cvr_smooth = None
    for day in tqdm((data.clickTime // 1000000).unique()):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(
            day) + '.pkl'
        day_cvr_smooth = load_pickle(feature_path)
        if hist_cvr_smooth is None:
            hist_cvr_smooth = day_cvr_smooth
        else:
            hist_cvr_smooth = pd.concat([hist_cvr_smooth, day_cvr_smooth],
                                        axis=0)
    data = pd.merge(data, hist_cvr_smooth, 'left', ['clickDay', key])
    return data
def gen_CountVector_appID_user_installed(appID_describe_feature_names=[
    'age_cut', 'gender', 'education', 'marriageStatus', 'haveBaby',
    'hometown_province', 'residence_province'
]):
    '''
    Generate count-description vectors for appID computed from the install table.
    :param appID_describe_feature_names:
    :return:
    '''
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    user_info = pd.read_csv(raw_data_path + 'user.csv')
    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, 65, np.inf],
                                  labels=False)
    user_info['hometown_province'] = user_info['hometown'].apply(
        lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(
        lambda x: x // 100)

    for feature in tqdm(appID_describe_feature_names):
        feature_path = feature_data_path + 'CountVector_appID_user_installed_' + feature + '.pkl'
        if os.path.exists(feature_path):
            print('Found ' + feature_path)
        else:
            print('Generating ' + feature_path)
            sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                           columns=[feature],
                                           prefix='appID_installed_' +
                                           feature)  # one-hot encode
            user_install = pd.merge(user_install,
                                    sub_user_info,
                                    on='userID',
                                    how='left')
            dummy_features = sub_user_info.columns.tolist()
            dummy_features.remove('userID')
            app_describe_feature = None
            for dummy_feature in tqdm(dummy_features):
                app_feature_installed = user_install[[
                    'appID', dummy_feature
                ]].groupby('appID', as_index=False).sum()
                if app_describe_feature is None:
                    app_describe_feature = app_feature_installed
                else:
                    app_describe_feature = pd.concat([
                        app_describe_feature,
                        app_feature_installed[[dummy_feature]]
                    ],
                                                     axis=1)
                user_install.drop(dummy_feature, inplace=True, axis=1)
            dump_pickle(app_describe_feature, feature_path)
def addAd(data):
    '''
    Join the raw ad features.
    :param data:
    :return:
    '''
    feature_path = raw_data_path + 'ad.pkl'
    ad_feature = ['adID', 'camgaignID', 'creativeID', 'advertiserID', 'appID', 'appPlatform']  # all columns of ad.csv
    if os.path.exists(feature_path):
        ad = load_pickle(feature_path)
    else:
        ad = pd.read_csv(raw_data_path + 'ad.csv')
        dump_pickle(ad, feature_path)
    return pd.merge(data, ad[ad_feature], on='creativeID', how='left')
def getConcatedAppIDCountVector(concated_list=[
    'age_cut', 'gender', 'education', 'marriageStatus', 'haveBaby'
]):
    '''
    Join key: ['appID']
    :param concated_list:
    :return:
    '''
    concated_countvec = None
    for feature in tqdm(concated_list):
        feature_path = feature_data_path + 'CountVector_appID_user_installed_' + feature + '.pkl'
        if os.path.exists(feature_path):
            count_vec = load_pickle(feature_path)
        else:
            gen_CountVector_appID_user_installed(concated_list)
            count_vec = load_pickle(feature_path)
        if concated_countvec is None:
            concated_countvec = count_vec
        else:
            concated_countvec = pd.merge(concated_countvec,
                                         count_vec,
                                         on='appID',
                                         how='left')
    return concated_countvec
def addPosition(data):
    '''
    Join the raw position features.
    :param data:
    :return:
    '''
    feature_path = raw_data_path + 'position.pkl'
    position_feature = ['positionID', 'sitesetID', 'positionType']
    if os.path.exists(feature_path):
        position = load_pickle(feature_path)
    else:
        position = pd.read_csv(raw_data_path + 'position.csv')
        dump_pickle(position, feature_path)

    return pd.merge(data, position[position_feature], on='positionID', how='left')
def add_global_count_sum(
        data,
        last_day=27,
        stats_features=['positionID', 'creativeID', 'appID', 'adID',
                        'userID']):
    """
    添加ID出现次数,根据ID_name拼接
    """
    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if not os.path.exists(feature_path):
            gen_ID_global_sum_count([feature])
        feature_count_sum = load_pickle(feature_path)
        data = data.merge(feature_count_sum, 'left', [feature])
    return data
def gen_app_start_installed():
    '''
    Record, for each appID, the number of installs logged before day one.
    Join key: ['appID']
    :return:
    '''
    feature_path = feature_data_path + 'app_start_installed.pkl'
    if os.path.exists(feature_path):
        print('Found:' + feature_path)
    else:
        print('Generating ' + feature_path)
        user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
        app_start_sum = user_install.groupby('appID').size().reset_index().rename(columns={0:'app_start_install_num'})
        del user_install
        gc.collect()
        dump_pickle(app_start_sum, feature_path)
def add_hist_cvr_smooth(data, key):
    """
    - 将每一天的feature数据load_pickle出来
    - 和原始的data_frame进行merge后,返回
    - do next step with new data_frame
    """
    hist_cvr_smooth = None
    for day in tqdm((data.clickTime // 1000000).unique()):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(
            day) + '.pkl'
        day_cvr_smooth = load_pickle(feature_path)
        if hist_cvr_smooth is None:
            hist_cvr_smooth = day_cvr_smooth
        else:
            hist_cvr_smooth = pd.concat([hist_cvr_smooth, day_cvr_smooth],
                                        axis=0)
    data = pd.merge(data, hist_cvr_smooth, 'left', ['clickDay', key])
    return data
def add_tricks(data):
    '''
    Add the trick features (trick, first_diff, last_diff, install2click) for each
    click day, generating the per-day pickle if it does not exist yet.
    :param data:
    :return:
    '''
    tricks = None
    for day in tqdm((data.clickTime // 1000000).unique()):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if not os.path.exists(feature_path):
            gen_tricks(day, day)
        day_tricks = load_pickle(feature_path)
        if tricks is None:
            tricks = day_tricks
        else:
            tricks = pd.concat([tricks, day_tricks], axis=0)
    data = pd.merge(data, tricks, 'left', 'global_index')
    return data
def get_TfidfVector_appCategory_user_action_hour(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):
    """
    **新学的trick:对时间区间级的安装量进行tf-idf转换
    - 可能是太多时间区间的安装量比较集中,maybe画图看出来的;
    - 但本来就是连续性的feature,即使长尾分布,用tfidf转换,有点小问题。还是那句话:多做实验
    - 我觉得这里做一个box-cox转化 或者 高斯归一化更好;
    """

    tfidf = TfidfTransformer(norm=norm, use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    feature_path = feature_data_path + 'CountVector_appCategory_actionHour.pkl'
    if not os.path.exists(feature_path):
        gen_CountVector_appCategory_user_action_hour()
    count_vec = load_pickle(feature_path)
    count_vec.set_index('appCategory', inplace=True)
    col_name = count_vec.columns
    tfidf_vec = pd.DataFrame(tfidf.fit_transform(count_vec).todense(), columns=col_name,
                             index=count_vec.index).reset_index()
    return tfidf_vec
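# A minimal sketch of the Box-Cox alternative suggested in the note above (an
# assumption, not repo code): transform each long-tailed count column with
# Box-Cox instead of tf-idf. scipy requires strictly positive inputs, hence
# the +1 shift on the raw counts.
from scipy import stats


def boxcox_transform_counts(count_vec_df, exclude_cols=('appCategory',)):
    out = count_vec_df.copy()
    for col in out.columns:
        if col in exclude_cols:
            continue
        transformed, _lmbda = stats.boxcox(out[col].values.astype(float) + 1)
        out[col] = transformed
    return out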
def get_TfidfVector_appCategory_user_action_hour(norm='l2',
                                                 use_idf=True,
                                                 smooth_idf=True,
                                                 sublinear_tf=False):
    tfidf = TfidfTransformer(norm=norm,
                             use_idf=use_idf,
                             smooth_idf=smooth_idf,
                             sublinear_tf=sublinear_tf)

    feature_path = feature_data_path + 'CountVector_appCategory_actionHour.pkl'
    if not os.path.exists(feature_path):
        gen_CountVector_appCategory_user_action_hour()
    count_vec = load_pickle(feature_path)
    count_vec.set_index('appCategory', inplace=True)
    col_name = count_vec.columns
    tfidf_vec = pd.DataFrame(tfidf.fit_transform(count_vec).todense(),
                             columns=col_name,
                             index=count_vec.index).reset_index()
    return tfidf_vec