def gen_user_group_install():
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    user_info = load_pickle(raw_data_path + 'user.pkl')
    user_info['age_cut_small'] = pd.cut(user_info['age'],
                                        bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
                                        labels=False)
    user_info['education_new'] = user_info['education']
    user_info.loc[user_info.education_new == 7, 'education_new'] = 6
    # enumerate every (age_cut_small, gender, education_new) combination as a user group
    user_info_comb = user_info[['age_cut_small', 'gender', 'education_new']].drop_duplicates()
    user_info_comb['user_group'] = np.arange(0, user_info_comb.shape[0])
    user_info = pd.merge(user_info, user_info_comb, 'left',
                         ['age_cut_small', 'gender', 'education_new'])
    user_install = pd.merge(user_install,
                            user_info[['userID', 'user_group', 'age_cut_small',
                                       'gender', 'education_new']],
                            'left', 'userID')

    def update_dict(row, dic):
        dic[row['appID']] += 1

    user_group_install = None
    for i, u_g in tqdm(enumerate(user_install.user_group.unique())):
        sub_install = user_install[user_install.user_group == u_g]
        # count installs of every appID within this user group
        install_dict = dict((k, 0) for k in user_install.appID.unique())
        install_dict['user_group'] = u_g
        install_dict['age_cut_small'] = sub_install['age_cut_small'].iloc[0]
        install_dict['gender'] = sub_install['gender'].iloc[0]
        install_dict['education_new'] = sub_install['education_new'].iloc[0]
        sub_install.apply(update_dict, args=(install_dict,), axis=1)
        if user_group_install is None:
            user_group_install = pd.DataFrame(install_dict, index=[i])
        else:
            user_group_install = pd.concat([user_group_install,
                                            pd.DataFrame(install_dict, index=[i])])
    dump_pickle(user_group_install, feature_data_path + 'user_group_install.pkl')
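# The row-wise apply above walks every install record in Python. The same
# user_group x appID count matrix can be built in one pass with pd.crosstab.
# A sketch of a vectorized alternative (not the author's code), assuming the
# merged user_install frame from the function body:
install_counts = pd.crosstab(user_install['user_group'], user_install['appID'])
group_attrs = user_install.groupby('user_group')[
    ['age_cut_small', 'gender', 'education_new']].first()
user_group_install = group_attrs.join(install_counts).reset_index()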
def gen_user_hour_click_count(update=False):
    '''
    Generate per-day, per-hour click counts over all data.
    Join keys: ['ID_name', 'clickDay', 'clickHour']
    :param update:
    :return:
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    # 'camgaignID' is the (misspelled) column name as shipped in ad.csv
    ads_feature = [
        'advertiserID', 'camgaignID', 'adID', 'creativeID', 'appID',
        'appCategory'
    ]
    context_feature = ['positionID', 'sitesetID']
    stats_feature = ads_feature + context_feature
    for feature in tqdm(stats_feature):
        feature_path = feature_data_path + 'user_' + feature + '_click_hour.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_feature_click_day = data.groupby(
                ['userID', 'clickDay', 'clickHour',
                 feature]).size().reset_index().rename(
                     columns={0: 'user_' + feature + '_click_hour'})
            dump_pickle(user_feature_click_day, feature_path)
def gen_tricks(start_day, end_day):
    '''
    Generate the trick, first_diff, last_diff, and install2click features,
    joined on global_index.
    :param start_day:
    :param end_day:
    :return:
    '''
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    actions = load_pickle(raw_data_path + 'user_app_actions.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    data = addTime(data)
    data = addAd(data)
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            df = data.loc[data.clickDay == day]
            df = add_trick(df)
            df = add_diff(df)
            df = add_install2click(df, day, actions)
            dump_pickle(df[['global_index', 'trick', 'first_diff',
                            'last_diff', 'install2click']], feature_path)
def gen_ID_global_sum_count(
        last_day=27,
        stats_features=['positionID', 'creativeID', 'appID', 'adID', 'userID']):
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    del train, test
    gc.collect()
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue  # skip regeneration when the cache already exists
        print('generating ' + feature_path)
        """
        The data comes from train.pkl and test.pkl, i.e. impression/click
        records; aggregating with size() therefore counts impressions.
        """
        feature_count_sum = pd.DataFrame(
            data.groupby(feature).size()).reset_index().rename(
                columns={0: feature + '_sum_count'})
        dump_pickle(feature_count_sum, feature_path)
def getConcatedAppIDCountVector(concated_list=None):
    '''
    Join key: ['appID'] -- calls gen_CountVector_appID_user_installed()
    :param concated_list:
    :return:
    '''
    if concated_list is None:
        concated_list = ['age_cut', 'gender', 'education', 'marriageStatus', 'haveBaby']
    concated_countvec = None
    for feature in tqdm(concated_list):
        # note the '.pkl' suffix: it must match what
        # gen_CountVector_appID_user_installed() dumps
        feature_path = feature_data_path + 'CountVector_appID_user_installed_' + feature + '.pkl'
        if os.path.exists(feature_path):
            count_vec = load_pickle(feature_path)
        else:
            # generate and dump_pickle, then load_pickle
            gen_CountVector_appID_user_installed(concated_list)
            count_vec = load_pickle(feature_path)
        """
        - Looks tedious, but it keeps intermediate results on disk and avoids
          recomputation -- a very Spark-like way of working.
        - The author guards almost every pickle load with
          os.path.exists(feature_path); that is the right way to use pickle caches.
        - Possible optimisation: use cPickle instead of pickle.
        """
        if concated_countvec is None:
            concated_countvec = count_vec
        else:
            concated_countvec = pd.merge(concated_countvec, count_vec,
                                         on='appID', how='left')
    return concated_countvec
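# The check-disk-then-compute pattern praised above recurs in nearly every
# function in this file, so it can be factored into a decorator. A minimal
# sketch using the same load_pickle/dump_pickle helpers (the decorator itself
# is hypothetical, not part of the original code):
import os
from functools import wraps

def pickle_cached(path_fn):
    """Cache a DataFrame-returning function to a pickle file.

    path_fn maps the wrapped function's arguments to its cache file path.
    """
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            feature_path = path_fn(*args, **kwargs)
            if os.path.exists(feature_path):
                return load_pickle(feature_path)
            result = fn(*args, **kwargs)
            dump_pickle(result, feature_path)
            return result
        return wrapper
    return decorator
# Any gen_* function above could then return its frame and be wrapped with,
# e.g., @pickle_cached(lambda day: feature_data_path + 'tricks_day_%d.pkl' % day).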
def gen_CountVector_ID_user_clicks(ID_name,
                                   last_day=27,
                                   ID_describe_feature_names=[
                                       'age_cut', 'gender', 'education',
                                       'marriageStatus', 'haveBaby'
                                   ],
                                   drop_na=False):
    '''
    Generate ID_name count-descriptor vectors computed from the train and
    test tables. Join key: [ID_name]
    :param ID_name:
    :param last_day:
    :param ID_describe_feature_names:
    :param drop_na:
    :return:
    '''
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    data = data[['userID', ID_name]]
    user_info = pd.read_csv(raw_data_path + 'user.csv')
    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, np.inf],
                                  labels=False)
    user_info.loc[user_info.education == 7, 'education'] = 6
    user_info['hometown_province'] = user_info['hometown'].apply(lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(lambda x: x // 100)
    for feature in tqdm(ID_describe_feature_names):
        feature_path = feature_data_path + 'CountVector_' + ID_name + '_user_clicks_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if drop_na:
            feature_path += '.no_na'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
            continue
        print('generating ' + feature_path)
        prefix_name = ID_name + '_user_clicks_' + feature
        sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                       columns=[feature], prefix=prefix_name)
        if drop_na:
            sub_user_info.drop([prefix_name + '_0'], axis=1, inplace=True)
        data = pd.merge(data, sub_user_info, 'left', 'userID')
        dummy_features = sub_user_info.columns.tolist()
        dummy_features.remove('userID')
        ID_describe_feature = data[[ID_name] + dummy_features].groupby(
            [ID_name], as_index=False).sum()
        data.drop(dummy_features, axis=1, inplace=True)
        dump_pickle(ID_describe_feature, feature_path)
def generate_stats_feature():
    '''
    Concatenate train and test, then add statistical features of the user's
    click behaviour.
    :return:
    '''
    feature_path = feature_data_path + 'UserClickStats.pkl'
    if os.path.exists(feature_path):
        print('found', feature_path)
    else:
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        data = train.append(test)
        del train, test
        gc.collect()
        data = addTime(data)
        data = addAd(data)
        data = addPosition(data)
        data = addAppCategories(data)
        data = add_user_day_click(data)
        data = add_user_day_click_count(
            data, feature_list=['camgaignID', 'adID', 'appID', 'sitesetID'])
        # data = add_user_day_hour_count(data)
        feature_names = [
            'user_adID_click_day_mean',  # some stats features are not included here
            'user_adID_click_day_min', 'user_camgaignID_click_day_min',
            'user_appID_click_day_mean', 'user_appID_click_day_max',
            'user_appID_click_day_min', 'user_sitesetID_click_day_mean',
            'user_sitesetID_click_day_max', 'user_sitesetID_click_day_min',
            'user_click_day_mean', 'user_click_day_max', 'user_click_day_min'
        ]
        print('generating', feature_path)
        columns_day = [
            'user_adID_click_day', 'user_camgaignID_click_day',
            'user_appID_click_day', 'user_sitesetID_click_day', 'user_click_day'
        ]
        columns_hour = [
            'user_adID_click_hour', 'user_camgaignID_click_hour',
            'user_appID_click_hour', 'user_sitesetID_click_hour'
        ]  # currently unused
        sub_feature = ['userID', 'clickTime']  # currently unused
        for col in tqdm(columns_day):
            data = gen_click_stats(data, col)
        data = data[feature_names + ['userID']].drop_duplicates(['userID'])
        dump_pickle(data, feature_path)
def gen_positionID_cvr_smooth(test_day):
    """
    * New trick learned: Bayesian smoothing of the positionID CTR within a
      given time window.
    """
    feature_path = feature_data_path + 'positionID_cvr_smooth_day_' + str(
        test_day) + '.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        data = load_pickle(raw_data_path + 'train.pkl')
        data = addTime(data)
        positionID_cvr = data[data.clickDay < test_day]
        # size() acts like count() here: it tallies the number of impressions
        I = positionID_cvr.groupby('positionID')['label'].size().reset_index()
        I.columns = ['positionID', 'I']
        # a click has label 1, so sum() yields the total click count
        C = positionID_cvr.groupby('positionID')['label'].sum().reset_index()
        C.columns = ['positionID', 'C']
        positionID_cvr = pd.concat([I, C['C']], axis=1)
        hyper = BayesianSmoothing(1, 1)
        # pass in impressions and clicks, plus the training settings
        hyper.update(positionID_cvr['I'].values, positionID_cvr['C'].values,
                     10000, 0.00000001)
        alpha = hyper.alpha
        beta = hyper.beta
        # Bayesian smoothing using the fitted alpha and beta
        positionID_cvr['positionID_cvr_smooth'] = (
            positionID_cvr['C'] + alpha) / (positionID_cvr['I'] + alpha + beta)
        dump_pickle(positionID_cvr[['positionID', 'positionID_cvr_smooth']],
                    feature_path)
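# BayesianSmoothing itself is not defined in this file. Below is a minimal
# sketch consistent with the constructor and update() call above: it fits a
# Beta(alpha, beta) prior to (impression, click) pairs by fixed-point
# iteration on the Beta-Binomial likelihood. Treat it as an assumption about
# the helper, not the author's exact code:
from scipy.special import digamma

class BayesianSmoothing(object):
    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def update(self, imps, clks, iter_num, epsilon):
        # iterate until both hyper-parameters move less than epsilon
        for _ in range(iter_num):
            new_alpha, new_beta = self._fixed_point(imps, clks,
                                                    self.alpha, self.beta)
            if (abs(new_alpha - self.alpha) < epsilon
                    and abs(new_beta - self.beta) < epsilon):
                break
            self.alpha, self.beta = new_alpha, new_beta

    def _fixed_point(self, imps, clks, alpha, beta):
        sum_a = sum(digamma(c + alpha) - digamma(alpha) for c in clks)
        sum_b = sum(digamma(i - c + beta) - digamma(beta)
                    for i, c in zip(imps, clks))
        sum_ab = sum(digamma(i + alpha + beta) - digamma(alpha + beta)
                     for i in imps)
        return alpha * sum_a / sum_ab, beta * sum_b / sum_ab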
def gen_user_start_installed_cateA():
    '''
    Count each user's initially installed apps per top-level category.
    Join key: ['userID']
    :return:
    '''
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    app_cate = pd.read_csv(raw_data_path + 'app_categories.csv')
    # appCategory encodes the top-level category in the hundreds digit
    app_cate['cate_a'] = app_cate.appCategory.apply(lambda x: x // 100 if x > 100 else x)
    user_install = user_install.merge(app_cate, on='appID', how='left')
    for cate_a in tqdm(app_cate.cate_a.unique()):
        feature_path = feature_data_path + 'user_start_installed_cate_' + str(
            cate_a) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_install_cate = user_install[user_install.cate_a == cate_a][[
                'userID', 'cate_a'
            ]]
            user_install_cate.rename(
                columns={'cate_a': 'user_start_install_cate_' + str(cate_a)},
                inplace=True)
            # use count() so the feature is the number of installed apps in
            # this category; the original sum() scaled by the category code
            # and produced all zeros for category 0
            user_install_cate = user_install_cate.groupby(
                'userID', as_index=False).count()
            dump_pickle(user_install_cate, feature_path)
def add_user_start_installed_cateA(data):
    for cate in tqdm([0, 1, 2, 3, 4, 5]):
        # the file name must match what gen_user_start_installed_cateA() dumps
        # (the original loaded 'user_start_install_cate_', which never exists)
        feature_path = feature_data_path + 'user_start_installed_cate_' + str(
            cate) + '.pkl'
        user_start_installed_cateA = load_pickle(feature_path)
        data = pd.merge(data, user_start_installed_cateA, 'left', 'userID')
    return data
def get_ConcatedAppIDTfidfVector_userinstalled(concated_list=None,
                                               mode='local',
                                               norm='l2',
                                               use_idf=True,
                                               smooth_idf=True,
                                               sublinear_tf=False):
    """
    Join key: ['appID']
    ** New trick learned: use tf-idf to transform the one-hot features and
       make the dataset denser.
    Why run tf-idf on one-hot output, and what does it buy?
    - it makes the matrix dense;
    - but experiments are needed to prove its usefulness.
    """
    """
    Refresher on `assert expression`: it raises AssertionError when the
    expression is falsy, i.e.
        if not expression: raise AssertionError
    """
    assert mode in ['global', 'local'], 'mode must be global or local~~'
    if concated_list is None:
        concated_list = ['age_cut', 'gender', 'education', 'marriageStatus', 'haveBaby']
    tfidf_vec = TfidfTransformer(norm=norm, use_idf=use_idf,
                                 smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    if mode == 'global':
        concated_countvec = getConcatedAppIDCountVector(concated_list)
        concated_countvec.set_index('appID', inplace=True)  # move 'appID' from the feature columns into the index
        vec_columns = concated_countvec.columns  # the remaining columns are the ones to transform
        global_tfidf_vec = tfidf_vec.fit_transform(concated_countvec).todense()  # tf-idf transform of the one-hot counts
        global_tfidf_vec = pd.DataFrame(global_tfidf_vec,
                                        columns=vec_columns,
                                        index=concated_countvec.index).reset_index()  # replace the one-hot values with the tf-idf result
        return global_tfidf_vec
    else:
        concated_tfidf_vec = None
        for feature in tqdm(concated_list):
            # '.pkl' suffix must match what gen_CountVector_appID_user_installed() dumps
            feature_path = feature_data_path + 'CountVector_appID_user_installed_' + feature + '.pkl'
            if os.path.exists(feature_path):
                count_vec = load_pickle(feature_path)
            else:
                gen_CountVector_appID_user_installed(concated_list)
                count_vec = load_pickle(feature_path)
            count_vec.set_index('appID', inplace=True)
            vec_columns = count_vec.columns
            local_tfidf_vec = tfidf_vec.fit_transform(count_vec).todense()
            local_tfidf_vec = pd.DataFrame(local_tfidf_vec,
                                           columns=vec_columns,
                                           index=count_vec.index).reset_index()
            if concated_tfidf_vec is None:
                concated_tfidf_vec = local_tfidf_vec
            else:
                concated_tfidf_vec = pd.merge(concated_tfidf_vec, local_tfidf_vec,
                                              on='appID', how='left')
        return concated_tfidf_vec
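# For intuition about what TfidfTransformer does to such count vectors, here
# is a toy sketch on a made-up 3 x 4 count matrix, independent of the
# competition data:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

# toy count matrix: 3 "appIDs" x 4 demographic buckets
counts = np.array([[10, 0, 3, 1],
                   [2, 5, 0, 0],
                   [1, 1, 1, 1]])
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
# rows come back L2-normalised, with rare columns up-weighted by idf
dense = tfidf.fit_transform(counts).todense()
print(np.round(dense, 3))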
def add_app_hist_install(data):
    feature_path = feature_data_path + 'app_hist_install.pkl'
    app_hist_install = load_pickle(feature_path)
    # normalise by the number of elapsed days *before* merging, so the joined
    # column carries a per-day install rate (the original normalised after
    # the merge, which never touched `data`)
    app_hist_install['app_hist_install'] = app_hist_install[
        'app_hist_install'] / (app_hist_install['clickDay'] - 1)
    # the original call had `on` and `how` swapped and misspelled 'appID'
    data = pd.merge(data, app_hist_install, how='left', on=['appID', 'clickDay'])
    return data
def add_smooth_pos_cvr(data, test_day):
    """
    - load each day's pickled feature data,
    - merge it into the original DataFrame and return it,
    - the next step then works with the new DataFrame.
    """
    feature_path = feature_data_path + 'positionID_cvr_smooth_day_' + str(
        test_day) + '.pkl'
    smooth_pos_cvr = load_pickle(feature_path)
    data = pd.merge(data, smooth_pos_cvr, 'left', 'positionID')
    return data
def gen_user_day_click():
    feature_path = feature_data_path + 'user_day_clicks.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        train = load_pickle(raw_data_path + 'train.pkl')
        test = load_pickle(raw_data_path + 'test.pkl')
        all_data = train.append(test)
        all_data = addTime(all_data)  # add the time-derived columns
        """
        .size(): like count(), it tallies the rows in each group.
        .reset_index().rename(columns={0: 'user_click_day'}): merely renames
        the aggregated column to 'user_click_day'.
        """
        user_click_day = pd.DataFrame(
            all_data.groupby(['clickDay', 'userID']).size()).reset_index().rename(
                columns={0: 'user_click_day'})
        # pickling here plays the same role as persistence in Spark
        dump_pickle(user_click_day, feature_path)
def add_user_day_click(data):
    '''
    Add the user's total click count for the day.
    :param data:
    :return:
    '''
    feature_path = feature_data_path + 'user_day_clicks.pkl'
    if not os.path.exists(feature_path):
        gen_user_day_click()
    user_click_day = load_pickle(feature_path)
    data = pd.merge(data, user_click_day, 'left', ['clickDay', 'userID'])
    return data
def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    """
    * New trick learned: Laplace-smooth the CVR for every day inside the
      given time window.
    """
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(
            day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            """
            Laplace smoothing; see
            https://blog.csdn.net/bbbeoy/article/details/71249316
            """
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (
                dfCvr['label_1'] + dfCvr['label_0'] + alpha * 2)
            sub_data = pd.merge(data.loc[data.clickDay == day, ['clickDay', key]],
                                dfCvr[[key, key + '_cvr']], 'left', on=[key])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']], feature_path)
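# A quick sanity check of the smoothed-CVR formula above: with alpha = 0.25 a
# key seen only once is pulled toward 0.5 rather than a hard 0 or 1, while
# well-observed keys are barely moved. A toy sketch:
alpha = 0.25
for label_1, label_0 in [(1, 0), (1, 9), (100, 900)]:
    raw = label_1 / (label_1 + label_0)
    smoothed = (label_1 + alpha) / (label_1 + label_0 + 2 * alpha)
    print('%d/%d: raw=%.3f smoothed=%.3f'
          % (label_1, label_1 + label_0, raw, smoothed))
# 1/1:      raw=1.000 smoothed=0.833
# 1/10:     raw=0.100 smoothed=0.119
# 100/1000: raw=0.100 smoothed=0.100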
def gen_CountVector_appID_user_installed(appID_describe_feature_names=[
    'age_cut', 'gender', 'education', 'marriageStatus', 'haveBaby',
    'hometown_province', 'residence_province'
]):
    '''
    Generate appID count-descriptor vectors computed from the install table.
    :param appID_describe_feature_names:
    :return:
    '''
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    user_info = pd.read_csv(raw_data_path + 'user.csv')
    user_info['age_cut'] = pd.cut(user_info['age'],
                                  bins=[-1, 0, 18, 25, 35, 45, 55, 65, np.inf],
                                  labels=False)
    user_info['hometown_province'] = user_info['hometown'].apply(lambda x: x // 100)
    user_info['residence_province'] = user_info['residence'].apply(lambda x: x // 100)
    for feature in tqdm(appID_describe_feature_names):
        feature_path = feature_data_path + 'CountVector_appID_user_installed_' + feature + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            sub_user_info = pd.get_dummies(user_info[['userID', feature]],
                                           columns=[feature],
                                           prefix='appID_installed_' + feature)  # one-hot encode
            user_install = pd.merge(user_install, sub_user_info,
                                    on='userID', how='left')
            dummy_features = sub_user_info.columns.tolist()
            dummy_features.remove('userID')
            app_describe_feature = None
            for dummy_feature in tqdm(dummy_features):
                app_feature_installed = user_install[[
                    'appID', dummy_feature
                ]].groupby('appID', as_index=False).sum()
                if app_describe_feature is None:
                    app_describe_feature = app_feature_installed
                else:
                    app_describe_feature = pd.concat([
                        app_describe_feature,
                        app_feature_installed[[dummy_feature]]
                    ], axis=1)
                user_install.drop(dummy_feature, inplace=True, axis=1)
            dump_pickle(app_describe_feature, feature_path)
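# The get_dummies + groupby('appID').sum() combination above is just building
# a count matrix. A toy sketch of the same idea on made-up data:
toy = pd.DataFrame({'userID': [1, 2, 3, 4],
                    'gender': [0, 1, 1, 2],
                    'appID': [10, 10, 20, 20]})
dummies = pd.get_dummies(toy[['appID', 'gender']], columns=['gender'],
                         prefix='appID_installed_gender')
print(dummies.groupby('appID', as_index=False).sum())
#    appID  ..._gender_0  ..._gender_1  ..._gender_2
# 0     10             1             1             0
# 1     20             0             1             1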
def addAd(data):
    '''
    Join the raw ad features.
    :param data:
    :return:
    '''
    feature_path = raw_data_path + 'ad.pkl'
    # all fields of ad.csv ('camgaignID' is misspelled in the raw data)
    ad_feature = ['adID', 'camgaignID', 'creativeID', 'advertiserID',
                  'appID', 'appPlatform']
    if os.path.exists(feature_path):
        ad = load_pickle(feature_path)
    else:
        ad = pd.read_csv(raw_data_path + 'ad.csv')
        dump_pickle(ad, feature_path)
    return pd.merge(data, ad[ad_feature], on='creativeID', how='left')
def addPosition(data):
    '''
    Join the raw position features.
    :param data:
    :return:
    '''
    feature_path = raw_data_path + 'position.pkl'
    position_feature = ['positionID', 'sitesetID', 'positionType']
    if os.path.exists(feature_path):
        position = load_pickle(feature_path)
    else:
        position = pd.read_csv(raw_data_path + 'position.csv')
        dump_pickle(position, feature_path)
    return pd.merge(data, position[position_feature], on='positionID', how='left')
def add_global_count_sum(
        data,
        last_day=27,
        stats_features=['positionID', 'creativeID', 'appID', 'adID', 'userID']):
    """
    Add the occurrence count of each ID; joined on the ID name.
    """
    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + '_lastday' + str(
            last_day) + '.pkl'
        if not os.path.exists(feature_path):
            # pass the feature list by keyword: the original positional call
            # gen_ID_global_sum_count([feature]) landed the list in last_day
            gen_ID_global_sum_count(last_day=last_day, stats_features=[feature])
        feature_count_sum = load_pickle(feature_path)
        data = data.merge(feature_count_sum, 'left', [feature])
    return data
def gen_app_start_installed():
    '''
    Record each appID's install count prior to the first day.
    Join key: ['appID']
    :return:
    '''
    feature_path = feature_data_path + 'app_start_installed.pkl'
    if os.path.exists(feature_path):
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
        app_start_sum = user_install.groupby('appID').size().reset_index().rename(
            columns={0: 'app_start_install_num'})
        del user_install
        gc.collect()
        dump_pickle(app_start_sum, feature_path)
def add_hist_cvr_smooth(data, key):
    """
    - load each day's pickled feature data,
    - merge it into the original DataFrame and return it,
    - the next step then works with the new DataFrame.
    """
    hist_cvr_smooth = None
    for day in tqdm((data.clickTime // 1000000).unique()):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(
            day) + '.pkl'
        day_cvr_smooth = load_pickle(feature_path)
        if hist_cvr_smooth is None:
            hist_cvr_smooth = day_cvr_smooth
        else:
            hist_cvr_smooth = pd.concat([hist_cvr_smooth, day_cvr_smooth], axis=0)
    data = pd.merge(data, hist_cvr_smooth, 'left', ['clickDay', key])
    return data
def add_tricks(data):
    '''
    Join the per-day trick features onto data by global_index.
    :param data:
    :return:
    '''
    tricks = None
    for day in tqdm((data.clickTime // 1000000).unique()):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if not os.path.exists(feature_path):
            gen_tricks(day, day)
        day_tricks = load_pickle(feature_path)
        if tricks is None:
            tricks = day_tricks
        else:
            tricks = pd.concat([tricks, day_tricks], axis=0)
    data = pd.merge(data, tricks, 'left', 'global_index')
    return data
def get_TfidfVector_appCategory_user_action_hour(norm='l2',
                                                 use_idf=True,
                                                 smooth_idf=True,
                                                 sublinear_tf=False):
    """
    ** New trick learned: apply a tf-idf transform to the per-hour install counts.
    - Probably because installs concentrate in a few hour buckets (likely
      spotted by plotting).
    - But these are inherently continuous features; even if long-tailed,
      tf-idf is a slightly odd fit. As always: run more experiments.
    - I would rather apply a Box-Cox transform or Gaussian normalisation here.
    """
    tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
                             smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    feature_path = feature_data_path + 'CountVector_appCategory_actionHour.pkl'
    if not os.path.exists(feature_path):
        gen_CountVector_appCategory_user_action_hour()
    count_vec = load_pickle(feature_path)
    count_vec.set_index('appCategory', inplace=True)
    col_name = count_vec.columns
    tfidf_vec = pd.DataFrame(tfidf.fit_transform(count_vec).todense(),
                             columns=col_name,
                             index=count_vec.index).reset_index()
    return tfidf_vec
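# The Box-Cox / Gaussian-normalisation alternative suggested above is
# available in scikit-learn. A minimal sketch, assuming the same count_vec
# frame as in the function body (not part of the original pipeline):
from sklearn.preprocessing import PowerTransformer

# Box-Cox requires strictly positive input, hence the +1 shift;
# method='yeo-johnson' would accept zeros directly.
pt = PowerTransformer(method='box-cox', standardize=True)
boxcox_vec = pd.DataFrame(pt.fit_transform(count_vec + 1),
                          columns=count_vec.columns,
                          index=count_vec.index).reset_index()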