# Example #1 (score: 0)
def gen_item_stats_feature(updata=False):
    """Generate and cache per-item click-statistics features.

    Loads the nominal dataset, attaches daily item click counts, derives
    per-item statistics for every ``*_click_da`` column, and dumps one row
    per ``item_id`` to ``item_click_stats.pkl``.

    Args:
        updata: force regeneration even if the cached pickle exists.
    """
    feat_path = os.path.join(feats_root, 'item_click_stats.pkl')
    if os.path.exists(feat_path) and not updata:
        print('Found ' + feat_path)
    else:
        dfal = get_nominal_dfal()
        dfal = add_item_total_da_click(dfal)
        dfal = add_item_da_feature_click(dfal)
        print('generating ' + feat_path)
        # Only the daily click-count columns feed the stats aggregation.
        # (The original also collected '_click_ho' columns but never used them.)
        columns_da = [c for c in dfal.columns.values if c.endswith('_click_da')]

        tbar = tqdm(columns_da)
        for col in tbar:
            tbar.set_description('add_item_click_stats ' + col)
            dfal = gen_item_click_stats(dfal, col)
        print('add_item_click_stats completed.')

        # Keep the derived stats columns plus the item key; one row per item.
        feat_names = [c for c in dfal.columns.values if '_click_da_' in c]
        dfal = dfal[feat_names + ['item_id']].drop_duplicates(['item_id'])
        print('gen_item_stats_feature shape:', dfal.shape)
        dump_pickle(dfal, feat_path)
    print('gen_item_stats_feature completed.')
def gen_level_aggs(col, updata=False):
    """Generate and cache windowed level-aggregation features for *col*.

    For every day except the earliest, builds 1/2/3-day look-back window
    aggregations (merged per day, then stacked across days) and dumps the
    result to ``level_aggs_<col>.pkl``.

    Args:
        col: column name to aggregate.
        updata: force regeneration even if the cached pickle exists.
    """
    feat_path = os.path.join(feats_root, 'level_aggs_{}.pkl'.format(col))
    if os.path.exists(feat_path) and not updata:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        dfal = get_nominal_dfal()[[col, 'da'] + level_cols]
        dmin = dfal.da.min()

        level_agg = None
        # Skip the earliest day: it has no preceding window to aggregate over.
        for da in sorted(dfal.da.unique())[1:]:
            da_agg = None
            for win_das in [1, 2, 3]:
                if da - win_das < dmin:
                    # Window would reach before the data starts.
                    continue
                agg = gen_level_agg_features(dfal, da, win_das, col)
                print('Generated {} {} {}'.format(col, da, win_das))
                if da_agg is None:
                    da_agg = agg
                else:
                    # Combine the different window widths for this day.
                    da_agg = da_agg.merge(agg, how='outer')
            if level_agg is None:
                level_agg = da_agg
            else:
                # Stack days vertically; fill gaps from the outer merges and
                # shrink dtypes as we go to keep memory bounded.
                level_agg = pd.concat([level_agg, da_agg], axis=0)
                level_agg.fillna(0, inplace=True)
                level_agg, _ = reduce_mem_usage(level_agg)
        print(level_agg.shape)
        level_agg, _ = reduce_mem_usage(level_agg)
        dump_pickle(level_agg, feat_path)
def gen_hist_cvr_smooth(start_da, end_da, key, alpha=0.25):
    """Cache per-day Laplace-smoothed historical CVR features for *key*.

    For each day in ``[start_da, end_da]`` computes each *key* value's
    conversion rate over all strictly earlier days, smoothed with
    pseudo-count *alpha*, and dumps one pickle per day.

    Args:
        start_da: first day (inclusive) to generate.
        end_da: last day (inclusive) to generate.
        key: column whose historical CVR is computed.
        alpha: Laplace smoothing pseudo-count.
    """
    dfal = get_nominal_dfal()
    dfal = dfal.loc[dfal.da <= end_da, [key, 'da', 'is_trade']]
    gc.collect()
    for da in tqdm(np.arange(start_da, end_da + 1)):
        feat_path = os.path.join(
            feats_root, key + '_hist_cvr_smooth_da_' + str(da) + '.pkl')
        if os.path.exists(feat_path):
            print('found ' + feat_path)
        else:
            print('generating ' + feat_path)
            # Slice first, then copy: copying the full frame before
            # filtering (the original order) wastes time and memory.
            dfcv = dfal.loc[dfal.da < da].copy()
            dfcv.is_trade = dfcv.is_trade.apply(int)
            dfcv = pd.get_dummies(dfcv, columns=['is_trade'], prefix='label')
            dfcv = dfcv.groupby([key], as_index=False).sum()
            # Smoothed CVR = (positives + alpha) / (total + 2 * alpha).
            dfcv[key + '_cvr'] = (dfcv['label_1'] + alpha) / (
                dfcv['label_0'] + dfcv['label_1'] + alpha * 2)
            result = pd.merge(dfal.loc[dfal.da == da, ['da', key]],
                              dfcv.loc[:, [key, key + '_cvr']],
                              'left',
                              on=[key])
            result.drop_duplicates(['da', key], inplace=True)
            result.sort_values(['da', key], inplace=True)
            dump_pickle(result.loc[:, ['da', key, key + '_cvr']], feat_path)
# Example #4 (score: 0)
def gen_shop_da_feature_click(updata=False):
    """Generate daily click counts per (shop, day, feature value).

    For every column in the stats list, counts clicks grouped by
    ``(shop_id, da, feat)`` and dumps one pickle per feature.
    NOTE(review): the original Chinese docstring said these were
    "user-related" stats, but the grouping key is shop_id.

    Args:
        updata: force regeneration even if a cached pickle exists.
    """
    dfal = get_nominal_dfal()
    stats_feat = [
        'item_category_list', 'item_brand_id', 'item_city_id',
        'user_gender_id', 'user_occupation_id', 'item_price_level',
        'item_sales_level', 'item_collected_level', 'item_pv_level',
        'user_age_level', 'user_star_level', 'context_page_id', 'item_id',
        'user_id'
    ]
    tbar = tqdm(stats_feat)
    for feat in tbar:
        feat_path = os.path.join(feats_root, 'shop_' + feat + '_click_da.pkl')
        if os.path.exists(feat_path) and not updata:
            tbar.set_description('Found {:>60}'.format(
                os.path.basename(feat_path)))
        else:
            tbar.set_description('Generating {:>60}'.format(
                os.path.basename(feat_path)))
            # size() of the 3-way groupby = click count per combination.
            shop_feat_click_da = dfal.groupby(
                ['shop_id', 'da', feat]).size().reset_index().rename(
                    columns={0: 'agg_shop_%s_click_da' % feat})
            dump_pickle(shop_feat_click_da, feat_path)

    print('gen_shop_da_feature_click completed.')
def gen_user_total_da_click(update=False):
    """Generate and cache each user's total click count per day.

    Args:
        update: force regeneration even if the cached pickle exists.
    """
    dfal = get_nominal_dfal()
    feat_path = os.path.join(feats_root, 'user_total_click_da.pkl')
    if os.path.exists(feat_path) and not update:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        # Reformatted: the original chained call was collapsed onto one line.
        user_all_click_da = (dfal.groupby(['user_id', 'da'])
                             .size()
                             .reset_index()
                             .rename(columns={0: 'agg_user_total_click_da'}))
        dump_pickle(user_all_click_da, feat_path)

    print('gen_user_total_da_click completed.')
def gen_id_global_sum_count(last_da=23, stats_feats=None):
    """Cache global occurrence counts for each id-like feature.

    Counts rows per distinct value of each feature over all days strictly
    before *last_da* and dumps one pickle per feature.

    Args:
        last_da: only rows with ``da < last_da`` are counted.
        stats_feats: feature columns to count; defaults to the standard
            id/hm list. (The original used a mutable default list.)
    """
    if stats_feats is None:
        stats_feats = [
            'item_id', 'shop_id', 'user_id',
            'item_brand_id', 'item_city_id', 'hm'
        ]
    dfal = get_nominal_dfal()
    dfal = dfal.loc[dfal.da < last_da, stats_feats]
    for feat in tqdm(stats_feats):
        feat_path = os.path.join(
            feats_root,
            'global_count_' + feat + '_lastda' + str(last_da) + '.pkl')
        if os.path.exists(feat_path):
            print('found ' + feat_path)
        else:
            print('generating ' + feat_path)
            # reset_index(name=...) replaces the redundant
            # pd.DataFrame(...) wrapper plus rename of column 0.
            feat_count_sum = dfal.groupby(feat).size().reset_index(
                name='agg_' + feat + '_sum_count')
            dump_pickle(feat_count_sum, feat_path)
def gen_target_aggs(col, updata=False):
    """Generate and cache windowed target (is_trade) aggregations for *col*.

    For every day except the earliest, adds 1/2/3-day look-back target
    aggregation features, then keeps rows with ``da > 17``, deduplicates
    on (col, da), and dumps the result.

    Args:
        col: column to aggregate the target over.
        updata: force regeneration even if the cached pickle exists.
    """
    feat_path = os.path.join(feats_root, 'target_aggs_{}.pkl'.format(col))
    if os.path.exists(feat_path) and not updata:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        dfal = get_nominal_dfal()[[col, 'da', 'is_trade']]
        dmin = dfal.da.min()
        # Skip the earliest day: it has no preceding window.
        for da in sorted(dfal.da.unique())[1:]:
            for win_das in [1, 2, 3]:
                if da - win_das < dmin:
                    continue
                dfal = gen_target_agg_features(dfal, da, win_das, col)
        # Only the scoring-relevant day range is kept (da > 17).
        dfal = dfal.loc[dfal.da > 17, :]
        dfal.drop(['is_trade'], inplace=True, axis=1)
        dfal.drop_duplicates([col, 'da'], inplace=True)
        dfal.fillna(0, inplace=True)
        dfal, _ = reduce_mem_usage(dfal)
        dump_pickle(dfal, feat_path)
# Example #8 (score: 0)
def gen_item_ho_feature_click(updata=False):
    """Generate hourly click counts per (item, day, hour, feature value).

    For every column in the stats list, counts clicks grouped by
    ``(item_id, da, ho, feat)`` and dumps one pickle per feature.
    NOTE(review): the original Chinese docstring said these were
    "user-related" stats, but the grouping key is item_id.

    Args:
        updata: force regeneration even if a cached pickle exists.
    """
    dfal = get_nominal_dfal()
    stats_feat = [
        'shop_id', 'user_id', 'user_gender_id', 'user_occupation_id',
        'user_age_level', 'user_star_level', 'context_page_id',
        'shop_review_num_level', 'shop_star_level'
    ]
    tbar = tqdm(stats_feat)
    for feat in tbar:
        feat_path = os.path.join(feats_root, 'item_' + feat + '_click_ho.pkl')
        if os.path.exists(feat_path) and not updata:
            tbar.set_description('Found {:>60}'.format(
                os.path.basename(feat_path)))
        else:
            tbar.set_description('Generating {:>60}'.format(
                os.path.basename(feat_path)))
            # size() of the 4-way groupby = hourly click count per combination.
            item_feat_click_ho = dfal.groupby(
                ['item_id', 'da', 'ho', feat]).size().reset_index().rename(
                    columns={0: 'agg_item_%s_click_ho' % feat})
            dump_pickle(item_feat_click_ho, feat_path)
    print('gen_item_ho_feature_click completed.')
def gen_final_dataset(tr_start_da, tr_end_da, te_da=24, updata=False):
    """Assemble the final train and test datasets.

    Concatenates the user time-delta and last-attribute feature frames onto
    the nominal data (sorted by dt), then delegates the train slice
    [tr_start_da, tr_end_da] and the test slice (da == te_da) to
    ``gen_dataset`` with their respective cache files.

    Returns:
        (dftr, dfte): the train and test frames produced by gen_dataset.
    """
    dfal = get_nominal_dfal().sort_values('dt')
    delta_feats = load_pickle('./feats/user_time_delta_feature.pkl')
    attrs_feats = load_pickle('./feats/user_last_attrs_feature.pkl')
    # Column-wise concat: the feature frames are assumed row-aligned with dfal.
    dfal = pd.concat([dfal, delta_feats, attrs_feats], axis=1)
    print(dfal.shape)

    tr_dump_file = './cache/final_dataset_tr_{}_{}.h5'.format(
        tr_start_da, tr_end_da)
    train_mask = (dfal.da >= tr_start_da) & (dfal.da <= tr_end_da)
    dftr = gen_dataset(dfal.loc[train_mask], tr_dump_file, 'tr', tr_end_da,
                       updata)

    te_dump_file = './cache/final_dataset_te_{}.h5'.format(te_da)
    dfte = gen_dataset(dfal.loc[dfal.da == te_da], te_dump_file, 'te', te_da,
                       updata)

    del dfal
    gc.collect()
    return dftr, dfte
# Example #10 (score: 0)
def gen_final_dataset(tr_start_da, tr_end_da, te_da=24):
    """Build (or load from HDF5 cache) the final train and test datasets.

    NOTE(review): this redefines the earlier ``gen_final_dataset`` in this
    file — only this later definition is effective at runtime.

    Train rows are days [tr_start_da, tr_end_da]; test rows are day te_da.
    Each half is loaded from its ``./cache/*.h5`` file when present,
    otherwise regenerated (global count sums + smoothed CVR features) and
    written back to the cache.

    Returns:
        (dftr, dfte): train and test DataFrames.
    """
    tr_dump_file = './cache/final_dataset_tr_{}_{}.h5'.format(
        tr_start_da, tr_end_da)
    te_dump_file = './cache/final_dataset_te_{}.h5'.format(te_da)

    dftr = None
    dfte = None
    if os.path.exists(tr_dump_file):
        # Cache hit: read the previously-built train dataset.
        print('Found ' + tr_dump_file)
        store = pd.HDFStore(
            tr_dump_file,
            mode='r',
            complevel=9,
        )
        dftr = store['dataset']
        store.close()
    elif dftr is None:  # NOTE(review): dftr is always None here, so this always runs on cache miss
        dfal = get_nominal_dfal()
        dftr = dfal.loc[(dfal.da >= tr_start_da) & (dfal.da <= tr_end_da)]
        print('Generating Train Dataset...')
        ##################################################################
        # add user click
        #dftr = add_user_click_stats(dftr)
        #dftr = add_user_total_da_click(dftr)
        #dftr = add_user_da_feature_click(dftr)
        #dftr = add_user_ho_feature_click(dftr)
        # add item click
        #dftr = add_item_click_stats(dftr)
        #dftr = add_item_total_da_click(dftr)
        #dftr = add_item_da_feature_click(dftr)
        #dftr = add_item_ho_feature_click(dftr)
        # add shop click
        #dftr = add_shop_click_stats(dftr)
        #dftr = add_shop_total_da_click(dftr)
        #dftr = add_shop_da_feature_click(dftr)
        #dftr = add_shop_ho_feature_click(dftr)
        # add global count sum
        dftr = add_global_count_sum(dftr, tr_end_da)

        # add smooth cvr for every categorical/id column plus hm/mi
        for c in tqdm(ordinal_cate_cols + nominal_cate_cols + identity_cols +
                      ['hm', 'mi'],
                      desc='add_hist_cvr_smooth'):
            dftr = add_hist_cvr_smooth(dftr, c)
        print('add_hist_cvr_smooth completed')

        #for c in tqdm(['item_id', 'shop_id','user_id', 'item_brand_id','item_city_id','hm', 'mi'], desc='add_target_features'):
        #    dftr = add_target_features(dftr, c)
        #print('add_target_features completed')

        # for c in tqdm(nominal_cate_cols + ['hm', 'mi', 'ho'], desc='add_level_features'):
        #    dftr = add_level_features(dftr, c)

        print('add_level_features completed')
        print(dftr.shape)
        # Persist the freshly-built train dataset to the HDF5 cache.
        store = pd.HDFStore(tr_dump_file, mode='w', complevel=9)
        store['dataset'] = dftr
        store.close()
        del dfal
        gc.collect()
        print('Generated Train Dataset')

    if os.path.exists(te_dump_file):
        # Cache hit: read the previously-built test dataset.
        print('Found ' + te_dump_file)
        store = pd.HDFStore(te_dump_file, mode='r', complevel=9)
        dfte = store['dataset']
        store.close()
    elif dfte is None:  # NOTE(review): always True here, mirrors the train branch
        dfal = get_nominal_dfal()
        dfte = dfal.loc[dfal.da == te_da]
        ##################################################################
        print('Generating Test Dataset...')
        # add user click
        #dfte = add_user_click_stats(dfte)
        #dfte = add_user_total_da_click(dfte)
        #dfte = add_user_da_feature_click(dfte)
        #dfte = add_user_ho_feature_click(dfte)
        # add item click
        #dfte = add_item_click_stats(dfte)
        #dfte = add_item_total_da_click(dfte)
        #dfte = add_item_da_feature_click(dfte)
        #dfte = add_item_ho_feature_click(dfte)
        # add shop click
        #dfte = add_shop_click_stats(dfte)
        #dfte = add_shop_total_da_click(dfte)
        #dfte = add_shop_da_feature_click(dfte)
        #dfte = add_shop_ho_feature_click(dfte)
        # add global count sum
        dfte = add_global_count_sum(dfte, te_da)

        # add smooth cvr (same column set as the train branch)
        for c in tqdm(ordinal_cate_cols + nominal_cate_cols + identity_cols +
                      ['hm', 'mi'],
                      desc='add_hist_cvr_smooth'):
            dfte = add_hist_cvr_smooth(dfte, c)
        print('add_hist_cvr_smooth completed')

        #for c in tqdm(['item_id','shop_id','user_id', 'item_brand_id','item_city_id','hm', 'mi'], desc='add_target_features'):
        #    dfte = add_target_features(dfte, c)
        #print('add_target_features completed')

        # for c in tqdm(['item_id','shop_id','user_id','item_brand_id','item_city_id', 'hm', 'mi', 'ho'], desc='add_level_features'):
        #     dfte = add_level_features(dfte, c)
        # print('add_level_features completed')
        print(dfte.shape)
        # Persist the freshly-built test dataset to the HDF5 cache.
        store = pd.HDFStore(te_dump_file, mode='w', complevel=9)
        store['dataset'] = dfte
        store.close()

        del dfal
        gc.collect()
        print('Generated Test Dataset')
    #dftr.drop(unused_cols, axis=1, inplace=True)
    #dfte.drop(unused_cols, axis=1, inplace=True)
    return dftr, dfte
# In[31]:

# Train a third CatBoost model with the tuned hyper-parameters, using the
# held-out (X_va, y_va) pair as the evaluation set.
cat3 = cb.CatBoostClassifier(**best_cat_params)
cat3 = cat3.fit(X_tr, y_tr, eval_set=(X_va, y_va))

# In[32]:

# Probability of the positive class on the test set (column 1 of
# predict_proba).
y_hat_cat3 = cat3.predict_proba(X_te)[:, 1]

# In[33]:

# Print feature importances for the trained model (helper defined elsewhere).
verbose_feature_importance_cat(cat3, X_tr)

# In[35]:

# Day 24 is the test day; keep only instance ids for the submission frame,
# then free the full dataset.
dfal = get_nominal_dfal()
hat = dfal.loc[dfal.da == 24, ['instance_id']]
del dfal
gc.collect()

# In[36]:

# Sanity check: prediction-vector length should match the submission rows.
hat.shape, y_hat_cat1.shape

# In[37]:

# Collect every model's test predictions side by side for later blending.
hat['lgb1'] = y_hat_lgb1
hat['lgb2'] = y_hat_lgb2
hat['cat1'] = y_hat_cat1
hat['cat2'] = y_hat_cat2
hat['cat3'] = y_hat_cat3