import os
import gc

import numpy as np
import pandas as pd
from tqdm import tqdm

# Project helpers (load_pickle, dump_pickle, raw_data_path, feature_data_path,
# addTime, addAd, addPosition, addAppCategories, add_trick, add_diff,
# add_install2click, add_user_day_click, add_user_day_click_count,
# gen_click_stats) are assumed to come from this repo's utility modules.
# Note: DataFrame.append is used throughout, which requires pandas < 2.0;
# replace with pd.concat([...]) on newer versions.


def gen_tricks(start_day, end_day):
    '''
    Generate the trick, first_diff, last_diff and install2click features,
    joined back on global_index.
    :param start_day:
    :param end_day:
    :return:
    '''
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    actions = load_pickle(raw_data_path + 'user_app_actions.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    data = addTime(data)
    data = addAd(data)

    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            df = data.loc[data.clickDay == day]
            df = add_trick(df)
            df = add_diff(df)
            df = add_install2click(df, day, actions)
            dump_pickle(
                df[['global_index', 'trick', 'first_diff', 'last_diff',
                    'install2click']], feature_path)
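# Usage sketch for the tricks generator (the day range is dataset-specific;
# the start/end values below are placeholders, adjust to your clickDay span):
# gen_tricks(start_day=23, end_day=31)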
def gen_user_hour_click_count(update=False):
    '''
    Generate per-day, per-hour click counts over all data.
    Join keys: ['userID', feature, 'clickDay', 'clickHour']
    :param update:
    :return:
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)

    # 'camgaignID' is the raw dataset's own (misspelled) column name; keep it.
    ads_feature = [
        'advertiserID', 'camgaignID', 'adID', 'creativeID', 'appID',
        'appCategory'
    ]
    context_feature = ['positionID', 'sitesetID']
    state_feature = ads_feature + context_feature

    for feature in tqdm(state_feature):
        feature_path = feature_data_path + 'user_' + feature + '_click_hour.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_feature_click_day = data.groupby(
                ['userID', 'clickDay', 'clickHour',
                 feature]).size().reset_index().rename(
                     columns={0: 'user_' + feature + '_click_hour'})
            dump_pickle(user_feature_click_day, feature_path)
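def _demo_hourly_click_count():
    '''
    Illustrative sketch (toy data, not part of the pipeline): shows the shape
    that the groupby in gen_user_hour_click_count produces for one feature.
    '''
    toy = pd.DataFrame({
        'userID': [1, 1, 2],
        'clickDay': [17, 17, 17],
        'clickHour': [9, 9, 10],
        'appID': [100, 100, 200],
    })
    # One row per (userID, clickDay, clickHour, appID) with its click count:
    # user 1 clicked appID 100 twice in hour 9; user 2 clicked once in hour 10.
    return toy.groupby(['userID', 'clickDay', 'clickHour',
                        'appID']).size().reset_index().rename(
                            columns={0: 'user_appID_click_hour'})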
def gen_ID_global_sum_count(
        last_day=27,
        stats_features=['positionID', 'creativeID', 'appID', 'adID',
                        'userID']):
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    del train, test
    gc.collect()
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)

    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + \
            '_lastday' + str(last_day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        # The data come from train.pkl and test.pkl, i.e. impression/click
        # records; aggregating with size() therefore counts impressions.
        feature_count_sum = pd.DataFrame(
            data.groupby(feature).size()).reset_index().rename(
                columns={0: feature + '_sum_count'})
        dump_pickle(feature_count_sum, feature_path)
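# Usage sketch (assumption: day 27 as the cut-off mirrors the default above,
# so the global counts only see the training window):
# gen_ID_global_sum_count(last_day=27)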
def gen_CountVector_ID_user_clicks(ID_name,
                                   last_day=27,
                                   ID_describe_feature_names=[
                                       'age_cut', 'gender', 'education',
                                       'marriageStatus', 'haveBaby'
                                   ],
                                   drop_na=False):
    '''
    Generate, from the concatenated train and test tables, a count vector
    describing the users who clicked each ID_name value.
    Join key: [ID_name]
    :param ID_name:
    :param last_day:
    :param ID_describe_feature_names:
    :param drop_na:
    :return:
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    data = data[['userID', ID_name]]

    user_info = pd.read_csv(raw_data_path + 'user.csv')
    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
                                  labels=False)
    user_info.loc[user_info.education == 7, 'education'] = 6
    user_info['hometown_province'] = user_info['hometown'].apply(
        lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(
        lambda x: x // 100)

    for feature in tqdm(ID_describe_feature_names):
        feature_path = feature_data_path + 'CountVector_' + ID_name + \
            '_user_clicks_' + feature + '_lastday' + str(last_day) + '.pkl'
        if drop_na:
            feature_path += '.no_na'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        prefix_name = ID_name + '_user_clicks_' + feature
        sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                       columns=[feature],
                                       prefix=prefix_name)
        if drop_na:
            sub_user_info.drop([prefix_name + '_0'], axis=1, inplace=True)
        data = pd.merge(data, sub_user_info, 'left', 'userID')
        dummy_features = sub_user_info.columns.tolist()
        dummy_features.remove('userID')
        ID_describe_feature = data[[ID_name] + dummy_features].groupby(
            [ID_name], as_index=False).sum()
        data.drop(dummy_features, axis=1, inplace=True)
        dump_pickle(ID_describe_feature, feature_path)
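def _demo_count_vector():
    '''
    Illustrative sketch (toy data, not part of the pipeline): one-hot encode a
    user attribute, then sum per ID to get the per-ID count vector that
    gen_CountVector_ID_user_clicks dumps.
    '''
    toy = pd.DataFrame({'appID': [1, 1, 2], 'gender': [0, 1, 1]})
    dummies = pd.get_dummies(toy,
                             columns=['gender'],
                             prefix='appID_user_clicks_gender')
    # Summing the dummy columns per appID counts the clicking users by gender:
    # appID 1 -> one click each from gender 0 and 1; appID 2 -> one from gender 1.
    return dummies.groupby('appID', as_index=False).sum()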
def generate_stats_feature():
    '''
    Concatenate train and test, then add statistics of the user click counts.
    :return:
    '''
    feature_path = feature_data_path + 'UserClickStats.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        data = train.append(test)
        del train, test
        gc.collect()
        data = addTime(data)
        data = addAd(data)
        data = addPosition(data)
        data = addAppCategories(data)
        data = add_user_day_click(data)
        data = add_user_day_click_count(
            data, feature_list=['camgaignID', 'adID', 'appID', 'sitesetID'])

        # Some statistics are deliberately left out of this list.
        feature_names = [
            'user_adID_click_day_mean', 'user_adID_click_day_min',
            'user_camgaignID_click_day_min', 'user_appID_click_day_mean',
            'user_appID_click_day_max', 'user_appID_click_day_min',
            'user_sitesetID_click_day_mean', 'user_sitesetID_click_day_max',
            'user_sitesetID_click_day_min', 'user_click_day_mean',
            'user_click_day_max', 'user_click_day_min'
        ]
        print('generating ' + feature_path)
        columns_day = [
            'user_adID_click_day', 'user_camgaignID_click_day',
            'user_appID_click_day', 'user_sitesetID_click_day',
            'user_click_day'
        ]
        # Hour-level counterparts exist but are not aggregated here:
        # columns_hour = ['user_adID_click_hour', 'user_camgaignID_click_hour',
        #                 'user_appID_click_hour', 'user_sitesetID_click_hour']
        for col in tqdm(columns_day):
            data = gen_click_stats(data, col)
        data = data[feature_names + ['userID']].drop_duplicates(['userID'])
        dump_pickle(data, feature_path)
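def _demo_user_click_day_stats():
    '''
    Illustrative sketch of the per-user statistics that gen_click_stats is
    expected to add (semantics assumed from the feature names; toy data only).
    '''
    toy = pd.DataFrame({
        'userID': [1, 1, 2],
        'user_appID_click_day': [3, 5, 2],
    })
    return toy.groupby('userID')['user_appID_click_day'].agg(
        ['mean', 'max', 'min']).reset_index().rename(
            columns={'mean': 'user_appID_click_day_mean',
                     'max': 'user_appID_click_day_max',
                     'min': 'user_appID_click_day_min'})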
def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    """
    Trick: apply Laplace smoothing to the historical CVR of `key`, computed
    separately for every day in the given window.
    See https://blog.csdn.net/bbbeoy/article/details/71249316
    """
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)

    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + \
            str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            # Use only the days strictly before `day` to avoid label leakage.
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            # Laplace smoothing: (conversions + alpha) / (clicks + 2 * alpha).
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (
                dfCvr['label_1'] + dfCvr['label_0'] + alpha * 2)
            sub_data = pd.merge(data.loc[data.clickDay == day,
                                         ['clickDay', key]],
                                dfCvr[[key, key + '_cvr']],
                                'left',
                                on=[key])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']],
                        feature_path)
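def _demo_laplace_smoothed_cvr(alpha=0.25):
    '''
    Illustrative sketch of the smoothing above (standalone, toy numbers): with
    3 conversions out of 10 clicks and alpha = 0.25, the smoothed CVR is
    (3 + 0.25) / (10 + 2 * 0.25) ~= 0.3095, pulled slightly toward 0.5
    compared with the raw 3 / 10 = 0.3.
    '''
    label_1, label_0 = 3, 7
    return (label_1 + alpha) / (label_1 + label_0 + alpha * 2)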