def run():
    """Build feature tables for both rounds: offline (label day 31, full log)
    and online submission (label day 32, subset-P candidates only)."""
    data_all = pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_all.csv'))
    data_item = pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_item.csv'))
    data_p = pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_p.csv'))

    # Attach the composite keys (user-item and user-category ids) that the
    # feature builders join on.
    for frame in (data_all, data_p):
        frame['ui_id'] = sp.get_ui_id(frame)
        frame['uc_id'] = sp.get_uc_id(frame)

    # Training round: label day 31, candidates from the full log.
    get_feature(data_all=data_all, data_p=data_p, data_item=data_item,
                label_day_rank=31, p_only=False, duration=31, save=True)
    # Submission round: label day 32, candidates restricted to subset P.
    get_feature(data_all=data_all, data_p=data_p, data_item=data_item,
                label_day_rank=32, p_only=True, duration=31, save=True)
def run():
    """Rank features by recursive feature elimination (XGBoost estimator) and
    dump the ranking table to Data/Temp/feature_rfe_.csv."""
    train_x = pd.read_csv(
        com.get_project_path('Data/Csv/FeaData/_A/fea_all_label31_dur31_sl1.csv'))
    label_source = pd.read_csv(
        com.get_project_path('Data/Csv/ClnData/csv_data_all.csv'))
    train_y_ui = sp.get_csv_label(label_source, 31)

    print('特征数量: ' + str(len(train_x.columns) - 2))
    print('训练集数量: ' + str(len(train_x)))

    # Positive label: the (user, item) pair actually interacted on the label day.
    train_y = sp.get_ui_id(train_x).isin(sp.get_ui_id(train_y_ui)).replace({
        True: 1,
        False: 0
    })
    # Ratio features (e.g. CTR) can produce inf; clamp before fitting.
    train_x = train_x.replace({np.inf: 1})

    estimator = XGBClassifier(n_estimators=10,
                              learning_rate=0.05,
                              max_depth=5,
                              colsample_bytree=0.8,
                              subsample=0.8,
                              min_child_weight=16)
    selector = RFE(estimator=estimator, n_features_to_select=30)
    feature_frame = train_x.drop(['user_id', 'item_id'], axis=1)
    selector.fit(feature_frame, train_y)

    # Pair each feature with its RFE rank (1 = kept) and sort by rank.
    ranking = zip(map(lambda v: round(v, 4), selector.ranking_),
                  feature_frame.columns)
    result = pd.DataFrame(sorted(ranking), columns=['score', 'feature'])
    result.to_csv(com.get_project_path('Data/Temp/feature_rfe_.csv'), index=None)
    print(result)
def run():
    """Load the two raw competition csvs, merge them, and persist the combined
    log plus a 10k-row sample for quick code testing."""
    item_cols = ['item_id', 'item_geo', 'item_cate']
    user_cols = ['user_id', 'item_id', 'beh_type', 'user_geo', 'item_cate', 'time']
    csv_data_item = pd.read_csv(
        com.get_project_path('Data/Csv/OriData/tianchi_fresh_comp_train_item.csv'),
        header=0, names=item_cols)
    csv_data_user = pd.read_csv(
        com.get_project_path('Data/Csv/OriData/tianchi_fresh_comp_train_user.csv'),
        header=0, names=user_cols)

    # Left join on item_id keeps every user-log row; the original author notes
    # this merge is redundant ("多此一举") since no item columns are added.
    item_ids = csv_data_item.loc[:, ['item_id']].drop_duplicates()
    csv_data_all = pd.merge(csv_data_user, item_ids, how='left', on='item_id')

    csv_data_all.to_csv(
        com.get_project_path('Data/Csv/OriData/csv_data_all.csv'), index=None)
    # Save the first 10k rows as a small fixture for testing the code.
    csv_data_all.head(10000).to_csv(
        com.get_project_path('Data/Csv/OriData/csv_data_all_h1w.csv'), index=None)
def run():
    """Clean the raw log: derive time fields, decode geohashes into lat/lon,
    then write the cleaned full set, item set, subset-P set, and a 10k sample."""
    csv_data_all = pd.read_csv(
        com.get_project_path('Data/Csv/OriData/csv_data_all.csv'))
    csv_data_item = pd.read_csv(
        com.get_project_path('Data/Csv/OriData/tianchi_fresh_comp_train_item.csv'),
        header=0, names=['item_id', 'item_geo', 'item_cate'])
    # Uncomment the next line when testing the code (10k-row sample).
    # csv_data_all = pd.read_csv(com.get_project_path('Data/Csv/OriData/csv_data_all_h1w.csv'))

    # ----- time handling -----
    stamps = pd.to_datetime(csv_data_all['time'], format='%Y%m%d %H')
    csv_data_all['hour'] = stamps.dt.hour
    days = stamps.dt.normalize()
    csv_data_all['time'] = days
    csv_data_all['week'] = days.apply(lambda d: d.weekday() + 1)
    # Dense rank of the calendar day: 1 for the first day in the data, etc.
    csv_data_all['day_rank'] = days.rank(method='dense').apply(lambda r: int(r))
    # del csv_data_all['time']

    # ----- geohash handling -----
    def _decode(column):
        # 'input_data_is_error' and NaN both become '' before decoding.
        return column.replace('input_data_is_error', '').fillna('').apply(
            lambda g: gh64.decode(g))

    # NOTE(review): the missing-value sentinels are asymmetric on purpose-looking
    # values (item: -90/180, user: 90/-180) so a missing user geo never matches
    # a missing item geo — TODO confirm that was the intent.
    csv_data_item['item_geo'] = _decode(csv_data_item['item_geo'])
    csv_data_item['item_geo_lat'] = csv_data_item['item_geo'].apply(
        lambda g: get_lat_lon(g, 0, inplace=-90))
    csv_data_item['item_geo_lon'] = csv_data_item['item_geo'].apply(
        lambda g: get_lat_lon(g, 1, inplace=180))
    del csv_data_item['item_geo']

    csv_data_all['user_geo'] = _decode(csv_data_all['user_geo'])
    csv_data_all['user_geo_lat'] = csv_data_all['user_geo'].apply(
        lambda g: get_lat_lon(g, 0, inplace=90))
    csv_data_all['user_geo_lon'] = csv_data_all['user_geo'].apply(
        lambda g: get_lat_lon(g, 1, inplace=-180))
    del csv_data_all['user_geo']

    # ----- persist cleaned outputs -----
    cln_dir = com.get_project_path('Data/Csv/ClnData/')
    log_order = ['user_id', 'day_rank', 'item_id', 'beh_type']
    com.save_csv(csv_data_all.sort_values(by=log_order), cln_dir, 'csv_data_all.csv')
    com.save_csv(csv_data_item.sort_values(by=['item_id', 'item_cate']),
                 cln_dir, 'csv_data_item.csv')
    in_subset = csv_data_all['item_id'].isin(csv_data_item['item_id'])
    com.save_csv(csv_data_all[in_subset].sort_values(by=log_order),
                 cln_dir, 'csv_data_p.csv')
    # Save the first 10k rows as a quick-test fixture.
    csv_data_all.head(10000).sort_values(by=['user_id', 'day_rank', 'item_id']).to_csv(
        com.get_project_path('Data/Csv/ClnData/csv_data_all_h1w.csv'), index=None)
def run():
    """Offline evaluation: train on label day 30, score day-31 subset-P
    candidates, keep the top ~701 pairs and report F1 against the truth."""
    # Published as module globals so the frames can be inspected after a run
    # (e.g. from an interactive session).
    global train_y_ui
    global test_y_ui
    global train_ui
    global test_ui
    train_x = pd.read_csv(
        com.get_project_path(
            'Data/Csv/FeaData/_A/fea_all_label30_dur31_sl1.csv'))
    test_x = pd.read_csv(
        com.get_project_path(
            'Data/Csv/FeaData/_A/fea_all_label31_dur31_sl1_p.csv'))
    # Ground-truth (user, item) pairs for each label day.
    train_y_ui = sp.get_csv_label(
        pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_all.csv')),
        30)
    test_y_ui = sp.get_csv_label(
        pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_p.csv')),
        31)
    print('特征数量: ' + str(len(train_x.columns) - 2))
    print('训练集数量: ' + str(len(train_x)))
    train_ui = train_x.loc[:, ['user_id', 'item_id']]
    test_ui = test_x.loc[:, ['user_id', 'item_id']]
    # 0/1 labels: candidate pair appears in the label-day truth set.
    train_y = sp.get_ui_id(train_x).isin(sp.get_ui_id(train_y_ui)).replace({
        True: 1,
        False: 0
    })
    test_y = sp.get_ui_id(test_x).isin(sp.get_ui_id(test_y_ui)).replace({
        True: 1,
        False: 0
    })
    # ########### model ############ #
    pre_label = xgb_pre(train_x.drop(['user_id', 'item_id'], axis=1),
                        train_y,
                        test_x.drop(['user_id', 'item_id'], axis=1),
                        test_y=test_y,
                        if_save_imp=False)
    # Threshold at the 701st-highest score: roughly the top 701 pairs
    # (plus ties) become positives.
    tmp = list(pre_label.sort_values(ascending=False))[700]
    pre_label = pre_label.apply(lambda a: a >= tmp).replace({
        True: 1,
        False: 0
    })
    test_x['label'] = pre_label
    test_pre_ui = test_x[test_x['label'] == 1].loc[:, ['user_id', 'item_id']]
    sp.f1_score(test_pre_ui,
                test_y_ui.loc[:, ['user_id', 'item_id']],
                if_print=True)
    # Drop the temporary column so test_x is reusable afterwards.
    del test_x['label']
def run():
    """Submission round: train on label day 31, score day-32 subset-P
    candidates, keep the top ~500 pairs and write the submission csv."""
    data_all = pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_all.csv'))
    train_x = pd.read_csv(com.get_project_path(FEATURE_PATH + 'fea_all_label31_dur31_sl3.csv'))
    train_x['ui_id'] = sp.get_ui_id(train_x)
    test_x = pd.read_csv(com.get_project_path(FEATURE_PATH + 'fea_all_label32_dur31_sl3_p.csv'))
    test_x['ui_id'] = sp.get_ui_id(test_x)

    # Positive label: the candidate pair interacted on the label day.
    label_ui = sp.get_csv_label(data_all, 31)
    label_ui['ui_id'] = sp.get_ui_id(label_ui)
    train_y = train_x['ui_id'].isin(label_ui['ui_id']).replace({True: 1, False: 0})

    print('特征数量: ' + str(len(train_x.columns) - 3))
    print('训练集数量: ' + str(len(train_x)))

    # ########### model ############ #
    drop_cols = ['user_id', 'item_id', 'ui_id']
    pre_label = xgb_pre(train_x.drop(drop_cols, axis=1), train_y,
                        test_x.drop(drop_cols, axis=1))

    # Threshold at the 501st-highest score: the top ~500 pairs become positives.
    threshold = list(pre_label.sort_values(ascending=False))[500]
    pre_label = pre_label.apply(lambda score: score >= threshold).replace({True: 1, False: 0})
    test_x['label'] = pre_label

    submission = test_x[test_x['label'] == 1].loc[:, ['user_id', 'item_id']]
    save_name = '_A_02_xgb_202001032331.csv'
    com.save_csv(submission.loc[:, ['user_id', 'item_id']],
                 com.get_project_path(RESULT_PATH), save_name)
def xgb_pre(train_x, train_y, test_x, num_round=500, params=None, test_y=None, if_save_imp=True):
    """Train an XGBoost binary classifier and score test_x.

    Args:
        train_x, train_y: training features and 0/1 labels.
        test_x: features to score.
        num_round: number of boosting rounds.
        params: xgboost parameter dict; None uses the project's tuned defaults.
        test_y: optional validation labels; when given, train/test metrics are
            reported each round using the project-wide `evalerror` feval.
        if_save_imp: dump feature importances to Data/Temp/ when True.

    Returns:
        pd.Series of predicted probabilities, positionally aligned with test_x.
    """
    dtrain = xgb.DMatrix(train_x, label=train_y)
    # label=None is accepted by DMatrix, so this also covers the no-eval case.
    dtest = xgb.DMatrix(test_x, label=test_y)
    if params is None:
        params = {
            'objective': 'binary:logistic',
            # 'objective': 'rank:pairwise',
            'eta': 0.01,
            'max_depth': 5,
            'colsample_bytree': 0.8,
            'subsample': 0.8,
            'min_child_weight': 16,
            'tree_method': 'exact',
            # 'gamma': 0.1,
            # 'scale_pos_weight': 10,
            # 'max_delta_step': 0.7,
            # 'eval_metric': 'auc',
        }
    if test_y is None:
        # Plain training run — no labels to evaluate against.
        bst = xgb.train(params, dtrain, num_boost_round=num_round)
    else:
        # Fix: previously the watchlist was built unconditionally and discarded
        # in the no-validation branch; build it only where it is used.
        watchlist = [(dtrain, 'train'), (dtest, 'test')]
        bst = xgb.train(params, dtrain, num_round, watchlist, feval=evalerror)
    if if_save_imp:
        imp_dict = bst.get_fscore(fmap='')
        imp = pd.DataFrame({
            'column': list(imp_dict.keys()),
            'importance': list(imp_dict.values())
        })
        com.save_csv(imp.sort_values(by='importance'),
                     com.get_project_path('Data/Temp/'),
                     'xgb-val_importance.csv')
    pre_label = pd.Series(bst.predict(dtest))
    return pre_label
def run():
    """Exploratory analysis of the raw log.

    Most sections are intentionally left commented out — they were run once and
    their findings are recorded in the bare-string notes below each heading.
    The only live section is the final "users appearing on a single day" plot.
    Fix applied there: the expensive per-user day-count pivot is loop-invariant
    and was recomputed on every one of the 30 iterations; it is now hoisted out
    (the author's own note flags this section as unoptimised and very slow).
    """
    csv_data_all = pd.read_csv(
        com.get_project_path('Data/Csv/OriData/csv_data_all.csv'))
    csv_data_item = pd.read_csv(com.get_project_path(
        'Data/Csv/OriData/tianchi_fresh_comp_train_item.csv'),
                                header=0,
                                names=['item_id', 'item_geo', 'item_cate'])
    csv_data_p = csv_data_all[csv_data_all['item_id'].isin(
        csv_data_item['item_id'])]
    # Uncomment the next line when testing the code (10k-row sample).
    # csv_data_all = pd.read_csv(com.get_project_path('Data/Csv/OriData/csv_data_all_h1w.csv'))
    # Time preprocessing; comment out unneeded lines to save a lot of time.
    csv_data_all['time'] = pd.to_datetime(csv_data_all['time'],
                                          format='%Y%m%d %H')
    # csv_data_all['hour'] = csv_data_all['time'].dt.hour
    csv_data_all['time'] = csv_data_all['time'].dt.normalize()
    # csv_data_all['week'] = csv_data_all['time'].apply(lambda a: a.weekday()+1)
    # csv_data_all['day'] = csv_data_all['time'].dt.day
    csv_data_all['day_rank'] = csv_data_all['time'].rank(
        method='dense').apply(lambda a: int(a))
    # ###### textual analysis ###### #
    '''
    最小日期 2014-11-18 00:00:00
    最大日期 2014-12-18 23:00:00
    '''
    # print(min(csv_data_all['time']))
    # print(max(csv_data_all['time']))
    '''
    总人数 20000
    总商品数 4758484
    '''
    # print(len(set(csv_data_all['user_id'])))
    # print(len(set(csv_data_all['item_id'])))
    '''
    同一商品只有一种类型
    '''
    # print(len(set(csv_data_item['item_id'])))
    # print(len(set(csv_data_item['item_id'].apply(lambda a: str(a)) + csv_data_item['item_cate'].apply(lambda a: str(a)))))
    '''
    同一商品会有多个经纬度
    '''
    # print(len(set(csv_data_item['item_id'])))
    # print(len(set(csv_data_item['item_id'].apply(lambda a: str(a)) + csv_data_item['item_geo'].apply(lambda a: str(a)))))
    '''
    同一用户会有多个经纬度
    '''
    # print(len(set(csv_data_all['user_id'])))
    # print(len(set(csv_data_all['user_id'].apply(lambda a: str(a)) + csv_data_all['user_geo'].apply(lambda a: str(a)))))
    '''
    找到异常用户
    '''
    # csv_user_bh_count = com.pivot_table_plus(csv_data_all, 'user_id', 'item_id', 'count', 'bh_count')
    # csv_user_day_count = com.pivot_table_plus(csv_data_all, 'user_id', 'day_rank', com.count_with_drop_duplicates_for_series, 'day_count')
    # csv_user_bh_count = pd.merge(csv_user_bh_count, csv_user_day_count, on='user_id', how='left')
    # csv_user_bh_count['bh_count_mean'] = csv_user_bh_count['bh_count'] / csv_user_bh_count['day_count']
    # csv_user_bh_count = csv_user_bh_count.sort_values(by='bh_count_mean', ascending=False).head(15)
    # print(csv_user_bh_count)
    #
    # csv_user_4_count = com.pivot_table_plus(csv_data_all[csv_data_all['user_id'].isin(csv_user_bh_count['user_id']) & (csv_data_all['beh_type']==4)],
    #                                         'user_id', 'item_id', 'count', 'bh4_count')
    # print(csv_user_4_count)
    '''
    经纬度中的字符set{'d', 'n', '4', '3', 'l', 'e', 'j', 'p', 'h', 't', '_', 'c', 'm', '5', 'v', '7', 'o', 'k', 's', '9', '0', 'g', 'w', 'r', 'u', 'q', '1', 'f', '2', 'a', 'b', 'i', '6'}
    其中'_'来自'input_data_is_error'
    整理下,正常的set为:012345679abcdefghijklmnopqrstuvw,缺少 8 x y z
    '''
    # set_geo = set(list(csv_data_all['item_geo'].dropna())+list(csv_data_all['user_geo'].dropna()))
    # str_geo = str(set_geo).replace('\'', '').replace(',', '').replace(' ', '')[1:-1]
    # print(set(str_geo))
    '''
    销售量 全集/子集
    大于1 的商品有 31235/3010 件
    大于2 的商品有 11759/1090 件
    大于10 的商品有 504/62 件
    大于20 的商品有 107/10 件
    大于50 的商品有 24/1 件
    '''
    # csv_data_cate4 = csv_data_all[csv_data_all['beh_type']==4]
    # csv_data_cate4 = pd.pivot_table(csv_data_cate4, index='item_id', values='user_id', aggfunc='count').reset_index()
    # print(csv_data_cate4[csv_data_cate4['user_id']>1])
    # print(csv_data_cate4[csv_data_cate4['user_id']>2])
    # print(csv_data_cate4[csv_data_cate4['user_id']>10])
    # print(csv_data_cate4[csv_data_cate4['user_id']>20])
    # print(csv_data_cate4[csv_data_cate4['user_id']>50])
    # csv_data_cate4 = csv_data_p[csv_data_p['beh_type']==4]
    # csv_data_cate4 = pd.pivot_table(csv_data_cate4, index='item_id', values='user_id', aggfunc='count').reset_index()
    # print(csv_data_cate4[csv_data_cate4['user_id']>1])
    # print(csv_data_cate4[csv_data_cate4['user_id']>2])
    # print(csv_data_cate4[csv_data_cate4['user_id']>10])
    # print(csv_data_cate4[csv_data_cate4['user_id']>20])
    # print(csv_data_cate4[csv_data_cate4['user_id']>50])
    '''
    有人会同一天多次买多种商品,目测的
    '''
    '''
    总的行为有23291027条
    对于子集商品只有2084859条
    '''
    # print(len(csv_data_all))
    # print(len(csv_data_all[csv_data_all['item_id'].isin(csv_data_item['item_id'])]))
    '''
    全集商品4758484种,分类9557种
    子集商品422858种,分类1054种
    '''
    # print(len(set(csv_data_all['item_id'])))
    # print(len(set(csv_data_all['item_cate'])))
    #
    # print(len(set(csv_data_item['item_id'])))
    # print(len(set(csv_data_item['item_cate'])))
    # ##### plot analysis ###### #
    '''
    销售长达x天的商品数量(两张图)
    # '''
    # csv_item_count_by_sale_day_count = csv_data_all[csv_data_all['beh_type']==4].copy().loc[:, ['day_rank', 'item_id']]
    # csv_item_count_by_sale_day_count = pd.pivot_table(csv_item_count_by_sale_day_count, index='item_id', values='day_rank',
    #                                                   aggfunc=com.count_with_drop_duplicates_for_series).reset_index().rename(columns={'day_rank': 'sale_day_count'})
    # # print(csv_item_count_by_sale_day_count)
    # csv_item_count_by_sale_day_count = pd.pivot_table(csv_item_count_by_sale_day_count, index='sale_day_count', values='item_id',
    #                                                   aggfunc='count').rename(columns={'item_id': 'item_count'}).sort_values(by='item_count', ascending=False)
    # # print(csv_item_count_by_sale_day_count)
    # csv_item_count_by_sale_day_count.plot.bar()
    # plt.xlabel('sale days count')
    # plt.savefig(com.get_project_path('Data/Graph/item_count_by_sale_day_count.jpg'))
    # # plt.show()
    #
    # csv_item_count_by_sale_day_count.tail(21).plot.bar()
    # plt.xlabel('sale days count')
    # plt.savefig(com.get_project_path('Data/Graph/item_count_by_sale_day_count_t21.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    记录长达x天的商品数量(两张图)
    '''
    # csv_item_count_by_log_day_count = csv_data_all.copy().loc[:, ['day_rank', 'item_id']]
    # csv_item_count_by_log_day_count = pd.pivot_table(csv_item_count_by_log_day_count, index='item_id', values='day_rank',
    #                                                  aggfunc=com.count_with_drop_duplicates_for_series).reset_index().rename(columns={'day_rank': 'log_day_count'})
    # # print(csv_item_count_by_log_day_count)
    # csv_item_count_by_log_day_count = pd.pivot_table(csv_item_count_by_log_day_count, index='log_day_count', values='item_id',
    #                                                  aggfunc='count').rename(columns={'item_id': 'item_count'}).sort_values(by='item_count', ascending=False)
    # # print(csv_item_count_by_log_day_count)
    # csv_item_count_by_log_day_count.plot.bar()
    # plt.xlabel('log days count')
    # plt.savefig(com.get_project_path('Data/Graph/item_count_by_log_day_count.jpg'))
    # # plt.show()
    #
    # csv_item_count_by_log_day_count.tail(21).plot.bar()
    # plt.xlabel('log days count')
    # plt.savefig(com.get_project_path('Data/Graph/item_count_by_log_day_count_t21.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    购买长达x天的个人数量
    '''
    # csv_user_count_by_sale_day_count = csv_data_all[csv_data_all['beh_type']==4].copy().loc[:, ['day_rank', 'user_id']]
    # csv_user_count_by_sale_day_count = pd.pivot_table(csv_user_count_by_sale_day_count, index='user_id', values='day_rank',
    #                                                   aggfunc=com.count_with_drop_duplicates_for_series).reset_index().rename(columns={'day_rank': 'sale_day_count'})
    # # print(csv_user_count_by_sale_day_count)
    # csv_user_count_by_sale_day_count = pd.pivot_table(csv_user_count_by_sale_day_count, index='sale_day_count', values='user_id',
    #                                                   aggfunc='count').rename(columns={'user_id': 'user_count'}).sort_values(by='user_count', ascending=False)
    # # print(csv_user_count_by_sale_day_count)
    # csv_user_count_by_sale_day_count.plot.bar()
    # plt.xlabel('sale days count')
    # plt.savefig(com.get_project_path('Data/Graph/user_count_by_sale_day_count.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    记录长达x天的个人数量
    '''
    # csv_user_count_by_log_day_count = csv_data_all.copy().loc[:, ['day_rank', 'user_id']]
    # csv_user_count_by_log_day_count = pd.pivot_table(csv_user_count_by_log_day_count, index='user_id', values='day_rank',
    #                                                  aggfunc=com.count_with_drop_duplicates_for_series).reset_index().rename(columns={'day_rank': 'log_day_count'})
    # # print(csv_user_count_by_log_day_count)
    # csv_user_count_by_log_day_count = pd.pivot_table(csv_user_count_by_log_day_count, index='log_day_count', values='user_id',
    #                                                  aggfunc='count').rename(columns={'user_id': 'user_count'}).sort_values(by='user_count', ascending=False)
    # # print(csv_user_count_by_log_day_count)
    # csv_user_count_by_log_day_count.plot.bar()
    # plt.xlabel('log days count')
    # plt.savefig(com.get_project_path('Data/Graph/user_count_by_log_day_count.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    商品全集/子集 销售前100计数
    '''
    # csv_item_sale_by_user = csv_data_all[(csv_data_all['beh_type']==4) & csv_data_all['item_id'].isin(csv_data_item['item_id'])].copy().loc[:, ['user_id', 'item_id']]
    # csv_item_sale_by_user = pd.pivot_table(csv_item_sale_by_user, index='item_id', values='user_id', aggfunc='count').rename(columns={'user_id': 'item_sale'}).sort_values(by='item_sale', ascending=False).head(100)
    #
    # csv_item_sale_by_user.plot.bar()
    # plt.xlabel('items')
    # plt.xticks(np.arange(0, 101, 10), np.arange(0, 101, 10))
    # plt.savefig(com.get_project_path('Data/Graph/item_sale_by_item_P.jpg'))
    # plt.show()
    # gc.collect()
    # csv_item_sale_by_user = csv_data_all[csv_data_all['beh_type']==4].copy().loc[:, ['user_id', 'item_id']]
    # csv_item_sale_by_user = pd.pivot_table(csv_item_sale_by_user, index='item_id', values='user_id', aggfunc='count').rename(columns={'user_id': 'item_sale'}).sort_values(by='item_sale', ascending=False).head(100)
    #
    # csv_item_sale_by_user.plot.bar()
    # plt.xlabel('items')
    # plt.xticks(np.arange(0, 101, 10), np.arange(0, 101, 10))
    # plt.savefig(com.get_project_path('Data/Graph/item_sale_by_item.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    商品记录计数
    '''
    # csv_item_log_by_user = csv_data_all.copy().loc[:, ['user_id', 'item_id']]
    # csv_item_log_by_user = pd.pivot_table(csv_item_log_by_user, index='item_id', values='user_id',
    #                                       aggfunc='count').rename(columns={'user_id': 'item_log'}).sort_values(by='item_log', ascending=False).head(100)
    #
    # csv_item_log_by_user.plot.bar()
    # plt.xlabel('items')
    # plt.xticks(np.arange(0, 101, 10), np.arange(0, 101, 10))
    # plt.savefig(com.get_project_path('Data/Graph/item_log_by_item.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    个人购买计数
    '''
    # csv_item_sale_by_user = csv_data_all[csv_data_all['beh_type']==4].copy().loc[:, ['user_id', 'item_id']]
    # csv_item_sale_by_user = pd.pivot_table(csv_item_sale_by_user, index='user_id', values='item_id',
    #                                        aggfunc='count').rename(columns={'item_id': 'item_sale'}).sort_values(by='item_sale', ascending=False)
    #
    # csv_item_sale_by_user.plot.bar()
    # plt.xticks([])
    # plt.xlabel('users')
    # plt.xticks(np.arange(0, 20001, 1000), np.arange(0, 20001, 1000), rotation=60)
    # plt.savefig(com.get_project_path('Data/Graph/item_sale_by_user.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    个人记录计数
    '''
    # csv_item_log_by_user = csv_data_all.copy().loc[:, ['user_id', 'item_id']]
    # csv_item_log_by_user = pd.pivot_table(csv_item_log_by_user, index='user_id', values='item_id',
    #                                       aggfunc='count').rename(columns={'item_id': 'item_log'}).sort_values(by='item_log', ascending=False)
    #
    # csv_item_log_by_user.plot.bar()
    # plt.xticks([])
    # plt.xlabel('users')
    # plt.xticks(np.arange(0, 20001, 1000), np.arange(0, 20001, 1000), rotation=60)
    # plt.savefig(com.get_project_path('Data/Graph/item_log_by_user.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    商品种类记录当天的增加占比和减少减少
    eg: day1[a, b, c], day2[b, c, d, e]
    increase_rate_of_log_count = [d, e] / [b, c, d, e] = 1/2
    decrement_rate_of_log_count = [a] / [a, b, c] = 1/3
    '''
    # csv_data_all_copy = csv_data_all.copy().loc[:, ['day_rank', 'item_id']]
    # csv_item_count_by_day_rank = pd.pivot_table(csv_data_all_copy, index='day_rank', values='item_id',
    #                                             aggfunc=com.count_with_drop_duplicates_for_series).reset_index()
    # csv_item_count_by_day_rank['increase_rate_of_log_count'] = [np.nan] + [
    #     len(com.get_difference_for_series(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank]['item_id'],
    #                                       csv_data_all_copy[csv_data_all_copy['day_rank'] == (day_rank - 1)][
    #                                           'item_id'])) /
    #     len(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank]) for day_rank in range(2, 32)]
    # csv_item_count_by_day_rank['decrement_rate_of_log_count'] = [np.nan] + [
    #     len(com.get_difference_for_series(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank - 1]['item_id'],
    #                                       csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank]['item_id'])) /
    #     len(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank - 1]) for day_rank in range(2, 32)]
    #
    # del csv_item_count_by_day_rank['item_id']
    # csv_item_count_by_day_rank = csv_item_count_by_day_rank.set_index('day_rank')
    #
    # csv_item_count_by_day_rank.plot()
    # plt.xticks(range(1, 32, 2))
    # plt.xlabel('day rank')
    # plt.savefig(com.get_project_path('Data/Graph/item_inc&dec_rate_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    去重人数当天的增加占比和减少减少
    '''
    # csv_data_all_copy = csv_data_all.copy().loc[:, ['day_rank', 'user_id']]
    # csv_item_count_by_day_rank = pd.pivot_table(csv_data_all_copy, index='day_rank', values='user_id', aggfunc=com.count_with_drop_duplicates_for_series).reset_index()
    # csv_item_count_by_day_rank['increase_rate_of_user_count'] = [np.nan] + [
    #     len(com.get_difference_for_series(csv_data_all_copy[csv_data_all_copy['day_rank']==day_rank]['user_id'], csv_data_all_copy[csv_data_all_copy['day_rank']==(day_rank-1)]['user_id'])) /
    #     len(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank]) for day_rank in range(2, 32)]
    # csv_item_count_by_day_rank['decrement_rate_of_user_count'] = [np.nan] + [
    #     len(com.get_difference_for_series(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank-1]['user_id'], csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank]['user_id'])) /
    #     len(csv_data_all_copy[csv_data_all_copy['day_rank'] == day_rank-1]) for day_rank in range(2, 32)]
    #
    # del csv_item_count_by_day_rank['user_id']
    # csv_item_count_by_day_rank = csv_item_count_by_day_rank.set_index('day_rank')
    #
    # csv_item_count_by_day_rank.plot()
    # plt.xticks(range(1, 32, 2))
    # plt.xlabel('day rank')
    # plt.savefig(com.get_project_path('Data/Graph/user_inc&dec_rate_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    五个星期的记录对比图
    '''
    # csv_data_week1 = csv_data_all[csv_data_all['day_rank']<=6]
    # csv_data_week1.loc[:, ['day_rank']] = csv_data_week1['day_rank']+1
    # csv_data_week1 = pd.pivot_table(csv_data_week1, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week2 = csv_data_all[(csv_data_all['day_rank']>6) & (csv_data_all['day_rank']<=13)]
    # csv_data_week2.loc[:, ['day_rank']] = csv_data_week2['day_rank']-6
    # csv_data_week2 = pd.pivot_table(csv_data_week2, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week3 = csv_data_all[(csv_data_all['day_rank']>13) & (csv_data_all['day_rank']<=20)]
    # csv_data_week3.loc[:, ['day_rank']] = csv_data_week3['day_rank']-13
    # csv_data_week3 = pd.pivot_table(csv_data_week3, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week4 = csv_data_all[(csv_data_all['day_rank']>20) & (csv_data_all['day_rank']<=27)]
    # csv_data_week4.loc[:, ['day_rank']] = csv_data_week4['day_rank']-20
    # csv_data_week4 = pd.pivot_table(csv_data_week4, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week5 = csv_data_all[csv_data_all['day_rank']>27]
    # csv_data_week5.loc[:, ['day_rank']] = csv_data_week5['day_rank']-27
    # csv_data_week5 = pd.pivot_table(csv_data_week5, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_weeks = pd.concat([csv_data_week1, csv_data_week2, csv_data_week3, csv_data_week4, csv_data_week5], axis=1)
    # csv_data_weeks.columns=['week1', 'week2', 'week3', 'week4', 'week5']
    # csv_data_weeks = csv_data_weeks.fillna(np.mean(csv_data_weeks)//2)
    # csv_data_weeks.plot.bar()
    # plt.ylabel('sale count')
    # plt.xlabel('day of the week')
    # plt.savefig(com.get_project_path('Data/Graph/log_count_by_week.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    五个星期的销售对比图
    '''
    # csv_data_all_copy = csv_data_all[csv_data_all['beh_type']==4].copy()
    # csv_data_week1 = csv_data_all_copy[csv_data_all_copy['day_rank']<=6]
    # csv_data_week1.loc[:, ['day_rank']] = csv_data_week1['day_rank']+1
    # csv_data_week1 = pd.pivot_table(csv_data_week1, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week2 = csv_data_all_copy[(csv_data_all_copy['day_rank']>6) & (csv_data_all_copy['day_rank']<=13)]
    # csv_data_week2.loc[:, ['day_rank']] = csv_data_week2['day_rank']-6
    # csv_data_week2 = pd.pivot_table(csv_data_week2, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week3 = csv_data_all_copy[(csv_data_all_copy['day_rank']>13) & (csv_data_all_copy['day_rank']<=20)]
    # csv_data_week3.loc[:, ['day_rank']] = csv_data_week3['day_rank']-13
    # csv_data_week3 = pd.pivot_table(csv_data_week3, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week4 = csv_data_all_copy[(csv_data_all_copy['day_rank']>20) & (csv_data_all_copy['day_rank']<=27)]
    # csv_data_week4.loc[:, ['day_rank']] = csv_data_week4['day_rank']-20
    # csv_data_week4 = pd.pivot_table(csv_data_week4, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_week5 = csv_data_all_copy[csv_data_all_copy['day_rank']>27]
    # csv_data_week5.loc[:, ['day_rank']] = csv_data_week5['day_rank']-27
    # csv_data_week5 = pd.pivot_table(csv_data_week5, index='day_rank', values='item_id', aggfunc='count')
    #
    # csv_data_weeks = pd.concat([csv_data_week1, csv_data_week2, csv_data_week3, csv_data_week4, csv_data_week5], axis=1)
    # csv_data_weeks.columns=['week1', 'week2', 'week3', 'week4', 'week5']
    # csv_data_weeks = csv_data_weeks.fillna(np.mean(csv_data_weeks)//2)
    # csv_data_weeks.plot.bar()
    # plt.ylabel('log count')
    # plt.xlabel('day of the week')
    # plt.savefig(com.get_project_path('Data/Graph/sale_count_by_week.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    按日期排序的销量图
    '''
    # csv_data_all_copy = csv_data_all[csv_data_all['beh_type']==4].copy()
    # csv_data_all_copy = pd.pivot_table(csv_data_all_copy, index='day_rank', values='item_id', aggfunc='count')
    # csv_data_all_copy.plot(color='g', kind='bar')
    # plt.xlabel('day rank')
    # plt.legend(['item sale'])
    # plt.savefig(com.get_project_path('Data/Graph/item_sale_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    按日期排序的记录量量图
    '''
    # csv_data_all_copy = csv_data_all.copy()
    # csv_data_all_copy = pd.pivot_table(csv_data_all_copy, index='day_rank', values='item_id', aggfunc='count')
    # csv_data_all_copy.plot(color='g', kind='bar')
    # plt.xlabel('day rank')
    # plt.legend(['log count'])
    # plt.savefig(com.get_project_path('Data/Graph/log_count_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    按日期排序的商品种类图
    '''
    # csv_data_all_copy = csv_data_all.copy()
    # csv_data_all_copy = pd.pivot_table(csv_data_all_copy, index='day_rank', values='item_id', aggfunc=com.count_with_drop_duplicates_for_series)
    # csv_data_all_copy.plot(color='b', kind='bar')
    # plt.xlabel('day rank')
    # plt.legend(['item count'])
    # plt.savefig(com.get_project_path('Data/Graph/item_count_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    按日期排序的去重人数图
    '''
    # csv_data_all_copy = csv_data_all.copy()
    # csv_data_all_copy = pd.pivot_table(csv_data_all_copy, index='day_rank', values='user_id', aggfunc=com.count_with_drop_duplicates_for_series)
    # csv_data_all_copy.plot(color='b', kind='bar')
    # plt.xlabel('day rank')
    # plt.legend(['user count'])
    # plt.savefig(com.get_project_path('Data/Graph/user_count_by_day_rank.jpg'))
    # # plt.show()
    # gc.collect()
    '''
    只在某一天出现的用户和商品计数 (没优化,跑的很慢)
    '''
    user_len = []
    # FIX: the per-user active-day count does not depend on the loop variable;
    # it was recomputed on every one of the 30 iterations. Computed once here.
    # (An unused `item_len = []` accumulator was also removed.)
    csv_user_day_count = com.pivot_table_plus(
        csv_data_all, 'user_id', 'day_rank',
        com.count_with_drop_duplicates_for_series, 'day_count')
    # Users active on exactly one day.
    csv_user_day_count = csv_user_day_count[csv_user_day_count['day_count'] == 1]
    for i in range(1, 31):
        # Distinct users seen on day i who appear on no other day.
        csv_user_only1212 = csv_data_all[csv_data_all['day_rank'] ==
                                         i].drop_duplicates('user_id')
        csv_user_only1212 = csv_user_only1212[
            csv_user_only1212['user_id'].isin(csv_user_day_count['user_id'])]
        user_len += [len(csv_user_only1212)]
    user_len = pd.DataFrame({
        'day_rank': range(1, 31),
        'user_count': user_len
    }).set_index('day_rank')
    plt.plot(user_len)
    plt.savefig(
        com.get_project_path(
            'Data/Graph/user_only_one_day_count_by_day_rank.jpg'))
    plt.show()
def get_feature(data_all, data_p, label_day_rank, duration=7, p_only=True, data_item=None, save=False):
    """Assemble the full feature table for one label day.

    Builds the user, item and user-x-item feature parts (each persisted by its
    builder), then left-joins them onto the candidate (user, item) pairs taken
    from the ui part.

    Returns the saved file path when ``save is True``, otherwise the DataFrame.
    """
    builder_kwargs = dict(data_all=data_all,
                          data_p=data_p,
                          data_item=data_item,
                          label_day_rank=label_day_rank,
                          duration=duration,
                          p_only=p_only,
                          save=True)
    # Part 1: user features
    fea_user_path = get_user_feature(**builder_kwargs)
    # fea_user_path = com.get_project_path(FEATURE_PATH) + get_save_name(label_day_rank, duration, p_only, index='user')
    # Part 2: item features
    fea_item_path = get_item_feature(**builder_kwargs)
    # fea_item_path = com.get_project_path(FEATURE_PATH) + get_save_name(label_day_rank, duration, p_only, index='item')
    # Part 3: user-x-item features
    fea_ui_path = get_ui_feature(**builder_kwargs)
    # fea_ui_path = com.get_project_path(FEATURE_PATH) + get_save_name(label_day_rank, duration, p_only, index='ui')

    # Combine: start from the candidate pairs of the ui table, then join each part.
    data_fea = pd.read_csv(fea_ui_path).loc[:, ['user_id', 'item_id']]
    for path, keys in ((fea_user_path, 'user_id'),
                       (fea_item_path, 'item_id'),
                       (fea_ui_path, ['user_id', 'item_id'])):
        data_fea = pd.merge(data_fea, pd.read_csv(path), on=keys, how='left')

    if save is not True:
        return data_fea
    save_name = get_save_name(label_day_rank, duration, p_only, index='all')
    com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
    return com.get_project_path(FEATURE_PATH) + save_name
def get_ui_feature(data_all, data_p, label_day_rank, duration=7, p_only=True, data_item=None, save=False):
    """Build user-x-item interaction features for one label day.

    Uses the `duration` days immediately before `label_day_rank`; candidate
    (user, item) pairs are browse events within the last SET_LENGTH days, from
    subset P when `p_only` else the full log.

    Returns the saved file path when ``save is True``, otherwise the DataFrame.
    """
    # Restrict both logs to the feature window [label - duration, label - 1].
    data_all = data_all[(data_all['day_rank'] >= label_day_rank - duration) &
                        (data_all['day_rank'] <= label_day_rank - 1)]
    data_p = data_p[(data_p['day_rank'] >= label_day_rank - duration) &
                    (data_p['day_rank'] <= label_day_rank - 1)]
    if p_only is True:
        data_fea = data_p[
            (data_p['day_rank'] >= label_day_rank - SET_LENGTH) &
            (data_p['beh_type'] == 1
             )].loc[:, ['user_id', 'item_id', 'item_cate', 'ui_id', 'uc_id'
                        ]].drop_duplicates()
    else:
        data_fea = data_all[
            (data_all['day_rank'] >= label_day_rank - SET_LENGTH) &
            (data_all['beh_type'] == 1
             )].loc[:, ['user_id', 'item_id', 'item_cate', 'ui_id', 'uc_id'
                        ]].drop_duplicates()
    # How many times the user browsed/favorited/carted/bought this item
    # exactly 1 / 2 / 3 days before the label day.
    for ago_time in [1, 2, 3]:
        for beh_type in [1, 2, 3, 4]:
            fea_name = 'beh_type_' + str(beh_type) + '_count&ui_id&' + str(
                ago_time) + '_day_ago'
            feature = com.pivot_table_plus(
                data_all[(data_all['beh_type'] == beh_type) &
                         (data_all['day_rank'] == label_day_rank - ago_time)],
                index='ui_id',
                values='user_id',
                aggfunc='count',
                new_name=fea_name)
            data_fea = pd.merge(data_fea, feature, on='ui_id', how='left')
            data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
            print('# -- ' + fea_name + ' complete -- #')
    # Whether the user ever bought / favorited this item within the window.
    data_fea['beh_type_4_if&ui_id'] = (data_fea['ui_id'].isin(
        data_all[data_all['beh_type'] == 4]['ui_id'])).replace({
            True: 1,
            False: 0
        })
    data_fea['beh_type_2_if&ui_id'] = (data_fea['ui_id'].isin(
        data_all[data_all['beh_type'] == 2]['ui_id'])).replace({
            True: 1,
            False: 0
        })
    # Last interaction type with this item, recoded:
    # buy -> 1, favorite -> 1.5, browse -> 2, add-to-cart -> 4 (0 = none).
    fea_name = 'beh_type_?_last&ui_id'
    feature = data_all.copy()
    feature['tmp'] = feature['day_rank'] * 100 + feature['hour']
    feature['rank'] = feature.groupby('ui_id')['tmp'].rank(ascending=0)
    feature = feature[feature['rank'] == 1]
    data_fea = pd.merge(data_fea,
                        feature.loc[:, ['ui_id', 'beh_type']],
                        on='ui_id',
                        how='left')
    data_fea[fea_name] = data_fea['beh_type'].replace({
        1: 2,
        2: 1.5,
        3: 4,
        4: 1
    }).fillna(0)
    del data_fea['beh_type']
    # Is this item the user's very last interaction in the full log / subset P?
    for data_index in ['data_all', 'data_p']:
        if data_index == 'data_all':
            data = data_all
        else:
            data = data_p
        fea_name = 'is_last&ui_id&' + data_index
        feature = data.loc[:, ['user_id', 'ui_id', 'day_rank', 'hour'
                               ]].sort_values(
                                   by=['user_id', 'day_rank', 'hour'],
                                   ascending=[0, 0, 0]).drop_duplicates('user_id')
        data_fea[fea_name] = (data_fea['ui_id'].isin(
            feature['ui_id'])).replace({
                True: 1,
                False: 0
            })
        print('# -- ' + fea_name + ' complete -- #')
    # Reverse position of this item among the user's interactions
    # in the full log / subset P (1 = earliest dense rank of its first event).
    for data_index in ['data_all', 'data_p']:
        if data_index == 'data_all':
            data = data_all
        else:
            data = data_p
        fea_name = 'last_?&ui_id&' + data_index
        feature = data.loc[:, ['ui_id', 'day_rank', 'hour']]
        feature['tmp'] = feature['day_rank'] * 100 + feature['hour']
        feature['rank'] = feature.groupby('ui_id')['tmp'].rank(method='dense',
                                                               ascending=1)
        feature = feature.sort_values(by=['rank', 'ui_id'],
                                      ascending=[True, True]).drop_duplicates(
                                          ['ui_id']).loc[:, ['ui_id', 'rank']]
        data_fea = pd.merge(data_fea, feature, on='ui_id',
                            how='left').fillna(max(feature['rank'] + 1))
        data_fea = data_fea.rename(columns={'rank': fea_name})
        print('# -- ' + fea_name + ' complete -- #')
    # Hours from the user's last browse/favorite/cart/buy of this item /
    # this item category to the label day.
    for beh_type in [1, 2, 3, 4]:
        for id_index in ['ui_id', 'uc_id']:
            fea_name = 'beh_type_' + str(
                beh_type) + '_latest_to_now_hour&' + id_index
            feature = data_all[(
                data_all['beh_type'] == beh_type
            )].loc[:, [id_index, 'day_rank', 'hour']].sort_values(
                by=['day_rank', 'hour'],
                ascending=[0, 0]).drop_duplicates(id_index)
            # BUG FIX: was `label_day_rank - label_day_rank`, which is always 0
            # and collapsed this feature to `24 - hour` regardless of recency.
            # The intended value (consistent with the 24*duration fill below
            # for "never") is days between the last event and the label day.
            # TODO(review): confirm the exact hour offset convention.
            feature[fea_name] = feature['day_rank'].apply(
                lambda a: label_day_rank - a)
            feature[fea_name] = (feature[fea_name] * 24) + (24 -
                                                            feature['hour'])
            data_fea = pd.merge(data_fea,
                                feature.loc[:, [id_index, fea_name]],
                                how='left',
                                on=id_index)
            # Pairs with no such behaviour get the window-length cap.
            data_fea[fea_name] = data_fea[fea_name].fillna(
                24 * duration).astype(int)
            print('# -- ' + fea_name + ' complete -- #')
    # Drop the join keys; only feature columns (plus user_id/item_id) remain.
    del data_fea['uc_id']
    del data_fea['ui_id']
    del data_fea['item_cate']
    if save is True:
        save_name = get_save_name(label_day_rank, duration, p_only, index='ui')
        com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
        return com.get_project_path(FEATURE_PATH) + save_name
    else:
        return data_fea
def get_item_feature(data_all,
                     data_p,
                     label_day_rank,
                     duration=7,
                     p_only=True,
                     data_item=None,
                     save=False):
    """Build item-level features over the window ending the day before the label day.

    Candidate rows are the distinct (item_id, item_cate) pairs that were viewed
    (beh_type == 1) within the last SET_LENGTH days of the window — taken from
    ``data_p`` when ``p_only`` is True, otherwise from ``data_all``.

    :param data_all: full behavior log (user_id, item_id, item_cate, beh_type,
        day_rank columns are read here).
    :param data_p: behavior log restricted to the item subset.
    :param label_day_rank: day rank of the label day; features use only days in
        [label_day_rank - duration, label_day_rank - 1].
    :param duration: feature-window length in days.
    :param p_only: if True, draw candidate items from the subset only.
    :param data_item: item table with an 'item_geo_lat' column, used by the
        geo-count features.  NOTE(review): the default None raises there —
        confirm callers always pass it, or guard the geo section.
    :param save: if True, write the feature csv under FEATURE_PATH and return
        its path; otherwise return the feature DataFrame.
    """
    # Clip both logs to the feature window [label_day_rank - duration, label_day_rank - 1].
    data_all = data_all[(data_all['day_rank'] >= label_day_rank - duration)
                        & (data_all['day_rank'] <= label_day_rank - 1)]
    data_p = data_p[(data_p['day_rank'] >= label_day_rank - duration)
                    & (data_p['day_rank'] <= label_day_rank - 1)]
    # Candidate item set: viewed within the last SET_LENGTH days of the window.
    if p_only is True:
        data_fea = data_p[(data_p['day_rank'] >= label_day_rank - SET_LENGTH)
                          & (data_p['beh_type'] == 1)].loc[:, [
                              'item_id', 'item_cate'
                          ]].drop_duplicates()
    else:
        data_fea = data_all[(data_all['day_rank'] >= label_day_rank - SET_LENGTH)
                            & (data_all['beh_type'] == 1)].loc[:, [
                                'item_id', 'item_cate'
                            ]].drop_duplicates()

    # Counts of view/favorite/cart/purchase per item / item category, over the
    # whole window and over the last 1/2/3 days.
    for item_index in ['item_id', 'item_cate']:
        for duration_time in [duration, 1, 2, 3]:
            for beh_type in [1, 2, 3, 4]:
                fea_name = 'beh_type_' + str(beh_type) + '_count&' + str(
                    item_index) + '&latest_' + str(duration_time)
                feature = com.pivot_table_plus(
                    data_all[(data_all['beh_type'] == beh_type)
                             & (data_all['day_rank'] >=
                                label_day_rank - duration_time)],
                    index=item_index,
                    values='user_id',
                    aggfunc='count',
                    new_name=fea_name)
                data_fea = pd.merge(data_fea, feature, on=item_index, how='left')
                # Items with no such behavior in the window get a 0 count.
                data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
                print('# -- ' + fea_name + ' complete -- #')

    # Ascending/descending dense rank of the favorite/purchase counts of each
    # item / item category, in the full set and in the subset.
    for data_index in ['data_all', 'data_p']:
        for item_index in ['item_id', 'item_cate']:
            for beh_type in [2, 4]:
                for ascending in [0, 1]:
                    if data_index == 'data_all':
                        data = data_all
                    else:
                        data = data_p
                    fea_name = 'count_rank' + str(ascending) + '&' + str(
                        item_index) + '&beh_type_' + str(beh_type) + '&' + str(
                            data_index)
                    feature = com.pivot_table_plus(
                        data[(data['beh_type'] == beh_type)],
                        index=item_index,
                        values='user_id',
                        aggfunc='count',
                        new_name='tmp')
                    data_fea = pd.merge(data_fea,
                                        feature.loc[:, [item_index, 'tmp']],
                                        on=item_index,
                                        how='left')
                    data_fea['tmp'] = data_fea['tmp'].fillna(0)
                    data_fea[fea_name] = data_fea['tmp'].rank(
                        ascending=ascending, method='dense')
                    print('# -- ' + fea_name + ' complete -- #')
                    # Drop the scratch column before the next iteration merges.
                    del data_fea['tmp']

    # How many distinct users viewed/favorited/carted/purchased each item /
    # item category.
    for item_index in ['item_id', 'item_cate']:
        for beh_type in [1, 2, 3, 4]:
            fea_name = 'user_count&' + item_index + '&' + 'beh_type_' + str(
                beh_type)
            feature = data_all[
                (data_all[item_index].isin(data_fea[item_index]))
                & (data_all['beh_type'] == beh_type)]
            feature = com.pivot_table_plus(
                feature,
                index=item_index,
                values='user_id',
                aggfunc=com.count_with_drop_duplicates_for_series,
                new_name=fea_name)
            data_fea = pd.merge(data_fea, feature, on=item_index, how='left')
            data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
            print('# -- ' + fea_name + ' complete -- #')

    # Conversion rate (purchases / views) per item and per item category.
    # NOTE(review): fillna(0) only covers the 0/0 (NaN) case; a nonzero
    # purchase count over a zero view count would yield inf — downstream
    # training code replaces {np.inf: 1}, so this is left as-is on purpose.
    data_fea['item_id_ctr'] = 1. * data_fea[
        'beh_type_4_count&item_id&latest_' +
        str(duration)] / data_fea['beh_type_1_count&item_id&latest_' +
                                  str(duration)]
    data_fea['item_id_ctr'] = data_fea['item_id_ctr'].fillna(0)
    data_fea['item_cate_ctr'] = 1. * data_fea[
        'beh_type_4_count&item_cate&latest_' +
        str(duration)] / data_fea['beh_type_1_count&item_cate&latest_' +
                                  str(duration)]
    data_fea['item_cate_ctr'] = data_fea['item_cate_ctr'].fillna(0)

    # How many distinct geo positions each item / item category has
    # (-90 appears to be the missing-latitude sentinel — rows with it are
    # excluded).
    for item_index in ['item_id', 'item_cate']:
        fea_name = 'geo_count&' + item_index
        feature = data_item[data_item['item_geo_lat'] != -90]
        feature = com.pivot_table_plus(
            feature, item_index, 'item_geo_lat',
            com.count_with_drop_duplicates_for_series, fea_name)
        data_fea = pd.merge(data_fea, feature, on=item_index, how='left')
        data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
        # Consistency fix: every other feature section reports progress;
        # this one previously did not.
        print('# -- ' + fea_name + ' complete -- #')

    del data_fea['item_cate']
    if save is True:
        save_name = get_save_name(label_day_rank, duration, p_only,
                                  index='item')
        com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
        return com.get_project_path(FEATURE_PATH) + save_name
    else:
        return data_fea
def get_user_feature(data_all,
                     data_p,
                     label_day_rank,
                     duration=7,
                     p_only=True,
                     data_item=None,
                     save=False):
    """Build user-level features over the window ending the day before the label day.

    Candidate rows are the distinct user_ids with a view (beh_type == 1) within
    the last SET_LENGTH days of the window — from ``data_p`` when ``p_only`` is
    True, otherwise from ``data_all``.

    :param data_all: full behavior log (user_id, item_id, beh_type, day_rank,
        hour, user_geo_lat columns are read here).
    :param data_p: behavior log restricted to the item subset.
    :param label_day_rank: day rank of the label day; features use only days in
        [label_day_rank - duration, label_day_rank - 1].
    :param duration: feature-window length in days.
    :param p_only: if True, draw candidate users from the subset only.
    :param data_item: unused here; kept for signature symmetry with
        get_item_feature.
    :param save: if True, write the feature csv under FEATURE_PATH and return
        its path; otherwise return the feature DataFrame.
    """
    # Clip both logs to the feature window [label_day_rank - duration, label_day_rank - 1].
    data_all = data_all[(data_all['day_rank'] >= label_day_rank - duration)
                        & (data_all['day_rank'] <= label_day_rank - 1)]
    data_p = data_p[(data_p['day_rank'] >= label_day_rank - duration)
                    & (data_p['day_rank'] <= label_day_rank - 1)]
    # Candidate user set: had a view within the last SET_LENGTH days of the window.
    if p_only is True:
        data_fea = data_p[
            (data_p['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_p['beh_type'] == 1)].loc[:, ['user_id']].drop_duplicates()
    else:
        data_fea = data_all[
            (data_all['day_rank'] >= label_day_rank - SET_LENGTH)
            & (data_all['beh_type'] == 1)].loc[:, ['user_id']].drop_duplicates()

    # Per-user counts of view/favorite/cart/purchase in the full set and the
    # subset, over the whole window and the last 1/2/3 days.
    for data_index in ['data_all', 'data_p']:
        for duration_time in [duration, 1, 2, 3]:
            for beh_type in [1, 2, 3, 4]:
                if data_index == 'data_all':
                    data = data_all
                else:
                    data = data_p
                fea_name = 'beh_type_' + str(
                    beh_type) + '_count&user&latest_' + str(
                        duration_time) + '&' + str(data_index)
                feature = com.pivot_table_plus(
                    data[(data['beh_type'] == beh_type)
                         & (data['day_rank'] >=
                            label_day_rank - duration_time)],
                    index='user_id',
                    values='beh_type',
                    aggfunc='count',
                    new_name=fea_name)
                data_fea = pd.merge(data_fea,
                                    feature,
                                    on='user_id',
                                    how='left')
                # Users without that behavior in the window get a 0 count.
                data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
                print('# -- ' + fea_name + ' complete -- #')

    # How many distinct items each user viewed/favorited/carted/purchased,
    # in the full set and in the subset.
    for data_index in ['data_all', 'data_p']:
        for beh_type in [1, 2, 3, 4]:
            if data_index == 'data_all':
                data = data_all
            else:
                data = data_p
            fea_name = 'item_count&' + 'user&' + 'beh_type_' + str(
                beh_type) + '&' + str(data_index)
            feature = com.pivot_table_plus(
                data[(data['beh_type'] == beh_type)],
                index='user_id',
                values='item_id',
                aggfunc=com.count_with_drop_duplicates_for_series,
                new_name=fea_name)
            data_fea = pd.merge(data_fea, feature, on='user_id', how='left')
            data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
            print('# -- ' + fea_name + ' complete -- #')

    # Per-user conversion rate (purchases / views) in the full set and subset.
    # NOTE(review): fillna(0) only covers the 0/0 (NaN) case; inf from a zero
    # view count is handled downstream via replace({np.inf: 1}).
    data_fea['user_ctr&data_all'] = 1. * data_fea[
        'beh_type_4_count&user&latest_' + str(duration) +
        '&data_all'] / data_fea['beh_type_1_count&user&latest_' +
                                str(duration) + '&data_all']
    data_fea['user_ctr&data_all'] = data_fea['user_ctr&data_all'].fillna(0)
    data_fea['user_ctr&data_p'] = 1. * data_fea[
        'beh_type_4_count&user&latest_' + str(duration) +
        '&data_p'] / data_fea['beh_type_1_count&user&latest_' +
                              str(duration) + '&data_p']
    data_fea['user_ctr&data_p'] = data_fea['user_ctr&data_p'].fillna(0)

    # Hours between the user's last view/favorite/cart/purchase and the label
    # day (hour 0 of label_day_rank).
    for beh_type in [1, 2, 3, 4]:
        fea_name = 'beh_type_' + str(beh_type) + '_latest_to_now_hour&user_id'
        # BUGFIX: filter the log by beh_type.  The original took the last
        # record of ANY behavior type, so all four differently-named features
        # were identical; the item-level counterpart does apply this filter.
        feature = data_all[(data_all['beh_type'] == beh_type
                            )].loc[:, ['user_id', 'day_rank', 'hour'
                                       ]].sort_values(
                                           by=['user_id', 'day_rank', 'hour'],
                                           ascending=[0, 0, 0
                                                      ]).drop_duplicates(
                                                          'user_id')
        feature[fea_name] = feature['day_rank'].apply(
            lambda a: label_day_rank - a)
        feature[fea_name] = (feature[fea_name] * 24) + (24 - feature['hour'])
        data_fea = pd.merge(data_fea,
                            feature.loc[:, ['user_id', fea_name]],
                            how='left',
                            on='user_id')
        # Users with no such behavior get the window length in hours.
        data_fea[fea_name] = data_fea[fea_name].fillna(24 *
                                                       duration).astype(int)
        print('# -- ' + fea_name + ' complete -- #')

    # How many distinct geo positions each user has.
    # NOTE(review): the missing-latitude sentinel here is 90, while the item
    # side uses -90 — confirm which sentinel the cleaned data actually uses.
    fea_name = 'geo_count&user_id'
    feature = data_all[data_all['user_geo_lat'] != 90]
    feature = com.pivot_table_plus(feature, 'user_id', 'user_geo_lat',
                                   com.count_with_drop_duplicates_for_series,
                                   fea_name)
    data_fea = pd.merge(data_fea, feature, on='user_id', how='left')
    data_fea[fea_name] = data_fea[fea_name].fillna(0).astype(int)
    print('# -- ' + fea_name + ' complete -- #')

    if save is True:
        save_name = get_save_name(label_day_rank, duration, p_only,
                                  index='user')
        com.save_csv(data_fea, com.get_project_path(FEATURE_PATH), save_name)
        return com.get_project_path(FEATURE_PATH) + save_name
    else:
        return data_fea
def run(): csv_data_all = pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_all.csv')) csv_data_p = pd.read_csv(com.get_project_path('Data/Csv/ClnData/csv_data_p.csv')) # 31号全部的购物车记录 # a = get_result_by_rule1(csv_data_p, beh_type=3, day_rank=31) # com.save_csv(a, com.get_project_path('Data/Csv/ResData/_Z/beh_type_3&latest_1_day_201912281826.csv'), 'beh_type_3&latest_1_day.csv') ''' 以 前一天所有在购物车 的商品交上去 ''' # for i in range(20, 30): # print("第"+str(i+1)+"天为标签") # a = get_result_by_rule1(csv_data_all, beh_type=3, day_rank=i) # b = sp.get_csv_label(csv_data_all, i+1) # print(sp.f1_score(b, a)) # # a = get_result_by_rule1(csv_data_p, beh_type=3, day_rank=i) # b = sp.get_csv_label(csv_data_p, i+1) # print(sp.f1_score(b, a)) ''' 以 前一天所有浏览过 的商品交上去 ''' # for i in range(20, 30): # print("第"+str(i+1)+"天为标签") # a = get_result_by_rule1(csv_data_all, beh_type=1, day_rank=i) # b = sp.get_csv_label(csv_data_all, i+1) # print(len(a), len(b)) # print(sp.f1_score(b, a)) # # a = get_result_by_rule1(csv_data_p, beh_type=1, day_rank=i) # b = sp.get_csv_label(csv_data_p, i+1) # print(len(a), len(b)) # print(sp.f1_score(b, a)) ''' 以 前一天所有在购物车且收藏过 的商品交上去 ''' # for i in range(20, 30): # print("\n第"+str(i+1)+"天为标签") # a = get_result_by_rule2(csv_data_all, day_rank=i) # b = sp.get_csv_label(csv_data_all, i+1) # print(len(a), len(b)) # print(sp.f1_score(b, a)) # # a = get_result_by_rule2(csv_data_p, day_rank=i) # b = sp.get_csv_label(csv_data_p, i+1) # print(len(a), len(b)) # print(sp.f1_score(b, a)) ''' 纯马后炮测试, 交 标签日期中,曾出现在前一天的购物车里 的商品 ''' # for i in range(20, 30): # print("\n第" + str(i + 1) + "天为标签") # b = sp.get_csv_label(csv_data_all, i + 1) # a = csv_data_all[(csv_data_all['day_rank']==i) & (csv_data_all['beh_type']==3)].loc[:, ['user_id', 'item_id']].drop_duplicates() # a = a[(a['item_id'] + a['user_id']).isin(b['item_id'] + b['user_id'])] # print(len(a), len(b)) # print(sp.f1_score(b, a)) # # b = sp.get_csv_label(csv_data_p, i + 1) # a = csv_data_p[(csv_data_p['day_rank']==i) & 
(csv_data_p['beh_type']==3)].loc[:, ['user_id', 'item_id']].drop_duplicates() # a = a[(a['item_id'] + a['user_id']).isin(b['item_id'] + b['user_id'])] # print(len(a), len(b)) # print(sp.f1_score(b, a)) ''' 纯马后炮测试, 交 标签日期中,曾在前一天浏览过 的商品 ''' # for i in range(20, 30): # print("\n第" + str(i + 1) + "天为标签") # b = sp.get_csv_label(csv_data_all, i + 1) # a = csv_data_all[(csv_data_all['day_rank']==i) & (csv_data_all['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates() # a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])] # print(len(a), len(b)) # print(sp.f1_score(b, a)) # # b = sp.get_csv_label(csv_data_p, i + 1) # a = csv_data_p[(csv_data_p['day_rank']==i) & (csv_data_p['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates() # a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])] # print(len(a), len(b)) # print(sp.f1_score(b, a)) ''' 纯马后炮测试, 交 标签日期中,曾在前两天浏览过 的商品 ''' # for i in range(20, 30): # print("\n第" + str(i + 1) + "天为标签") # b = sp.get_csv_label(csv_data_all, i + 1) # a = csv_data_all[(csv_data_all['day_rank']>=i-1) & (csv_data_all['day_rank']<=i) & (csv_data_all['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates() # a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])] # print(len(a), len(b)) # sp.f1_score(b, a, if_print=True) # # b = sp.get_csv_label(csv_data_p, i + 1) # a = csv_data_p[(csv_data_p['day_rank']>=i-1) & (csv_data_p['day_rank']<=i) & (csv_data_p['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates() # a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])] # print(len(a), len(b)) # sp.f1_score(b, a, if_print=True) ''' 纯马后炮测试, 交 标签日期中,曾在前七天浏览过 的商品 ''' for i in range(20, 30): print("\n第" + str(i + 1) + "天为标签") b = sp.get_csv_label(csv_data_all, i + 1) a = csv_data_all[(csv_data_all['day_rank']>=i-6) & (csv_data_all['day_rank']<=i) & (csv_data_all['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates() a = 
a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])] print(len(a), len(b)) sp.f1_score(b, a, if_print=True) b = sp.get_csv_label(csv_data_p, i + 1) a = csv_data_p[(csv_data_p['day_rank']>=i-6) & (csv_data_p['day_rank']<=i) & (csv_data_p['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates() a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])] print(len(a), len(b)) sp.f1_score(b, a, if_print=True) ''' 纯马后炮测试, 交 标签日期中,曾有过任何记录 的商品 ''' for i in range(20, 30): print("\n第" + str(i + 1) + "天为标签") b = sp.get_csv_label(csv_data_all, i + 1) a = csv_data_all[(csv_data_all['day_rank']<=i) & (csv_data_all['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates() a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])] print(len(a), len(b)) sp.f1_score(b, a, if_print=True) b = sp.get_csv_label(csv_data_p, i + 1) a = csv_data_p[(csv_data_p['day_rank']<=i) & (csv_data_p['beh_type']==1)].loc[:, ['user_id', 'item_id']].drop_duplicates() a = a[(a['item_id']+a['user_id']).isin(b['item_id']+b['user_id'])] print(len(a), len(b)) sp.f1_score(b, a, if_print=True) '''