def xgb_online(train, cv, test):
    """Train an XGBoost model with K-fold CV on train+cv and score the test set.

    Parameters
    ----------
    train, cv, test : pd.DataFrame
        Feature frames; each must contain an ``is_trade`` label column,
        which is dropped before training.

    Returns
    -------
    tuple
        ``(feat_imp, predict_test)`` — the feature-importance frame from the
        last fold's booster and the calibrated test predictions (fold-wise
        median).
    """
    train_data = train.copy()
    cv_data = cv.copy()
    test_data = test.copy()
    # Merge train and cv into a single training pool for the K-fold loop.
    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)
    folds = 5
    kf = KFold(len(train_data), n_folds=folds, shuffle=True, random_state=7)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    # FIX: was hard-coded 5; tie the column count to ``folds``.
    test_preds = np.zeros((test_data.shape[0], folds))
    for i, (train_index, cv_index) in enumerate(kf):
        print('第{}次训练...'.format(i))
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        train_feat = xgb.DMatrix(train_feat.values, label=train_Y[train_index])
        cv_feat = xgb.DMatrix(cv_feat.values, label=train_Y[cv_index])
        test_feat = xgb.DMatrix(test_data.values)
        watchlist = [(train_feat, 'train'), (cv_feat, 'val')]
        # ``params`` and ``n_round`` are module-level globals.
        clf = xgb.train(params=params, dtrain=train_feat,
                        num_boost_round=n_round, evals=watchlist,
                        early_stopping_rounds=7, verbose_eval=False)
        predict_train = clf.predict(train_feat)
        predict_cv = clf.predict(cv_feat)
        predict_test = clf.predict(test_feat)
        # Each sample lands in the train split of folds-1 folds; the sums
        # are averaged with ``folds - 1`` below.
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        # Feature importance of the current booster (last fold wins).
        features = train_data.columns
        ceate_feature_map(features)
        importance = clf.get_fscore(fmap='xgb.fmap')
        importance = sorted(importance.items(), key=operator.itemgetter(1))
        feat_imp = pd.DataFrame(importance, columns=['feature', 'fscore'])
        feat_imp['fscore'] = feat_imp['fscore'] / feat_imp['fscore'].sum()
        print(clf.best_iteration)
        print(clf.best_score)
        print(' 训练损失:', cal_log_loss(predict_train, train_Y.loc[train_index]))
        print(' 测试损失:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))
    predict_test = np.median(test_preds, axis=1)
    # Undo the negative-class down-sampling bias (``rate`` is a global).
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print(params)
    print('训练损失:', cal_log_loss(train_preds / (folds - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('test mean:', np.mean(predict_test))
    submmit_result(predict_test, 'XGB')
    return feat_imp, predict_test
def lgb_online(train, cv, test):
    """Train a LightGBM model with K-fold CV on train+cv and score the test set.

    Parameters
    ----------
    train, cv, test : pd.DataFrame
        Feature frames; each must contain an ``is_trade`` label column,
        which is dropped before training.

    Returns
    -------
    tuple
        ``(feat_imp, predict_test)`` — feature importances from the last
        fold's booster and the calibrated test predictions.
    """
    train_data = train.copy()
    cv_data = cv.copy()
    test_data = test.copy()
    # Merge train and cv into a single training pool for the K-fold loop.
    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)
    folds = 5
    kf = KFold(len(train_data), n_folds=folds, shuffle=True, random_state=7)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    # FIX: was hard-coded 5; tie the column count to ``folds``.
    test_preds = np.zeros((test_data.shape[0], folds))
    for i, (train_index, cv_index) in enumerate(kf):
        print('第{}次训练...'.format(i))
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        lgb_train = lgb.Dataset(train_feat.values, train_Y.loc[train_index])
        lgb_cv = lgb.Dataset(cv_feat.values, train_Y.loc[cv_index])
        gbm = lgb.train(params=params, train_set=lgb_train,
                        num_boost_round=6000, valid_sets=lgb_cv,
                        verbose_eval=False, early_stopping_rounds=50)
        predict_train = gbm.predict(train_feat.values)
        predict_cv = gbm.predict(cv_feat.values)
        test_preds[:, i] = gbm.predict(test_data.values)
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        # Feature importance of the current booster.
        # FIX: the original computed this identical Series twice per fold;
        # the redundant first computation was removed.
        feat_imp = pd.Series(
            gbm.feature_importance(),
            index=train_data.columns).sort_values(ascending=False)
        print(gbm.best_iteration)
        print(gbm.best_score)
        print(' 训练损失:', cal_log_loss(predict_train, train_Y.loc[train_index]))
        print(' 测试损失:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))
    predict_test = np.median(test_preds, axis=1)
    # Undo the negative-class down-sampling bias (``rate`` is a global).
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print(params)
    print('训练损失:', cal_log_loss(train_preds / (folds - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('test mean:', np.mean(predict_test))
    submmit_result(predict_test, 'LGB')
    return feat_imp, predict_test
def LR_online(train_data, cv_data, test_data):
    """Parallel K-fold logistic-regression training via ``_LR_train`` workers.

    NOTE(review): a second ``LR_online`` is defined later in this file and
    shadows this one at import time — confirm which version is intended.
    NOTE(review): unlike the later version, ``is_trade`` is NOT dropped from
    ``train_data`` before the folds are handed to ``_LR_train`` — verify that
    ``_LR_train`` drops the label itself, otherwise the target leaks into
    the features.
    """
    print('on line')
    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']
    fold = 5
    kf = KFold(len(train_data), n_folds=fold, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((test_data.shape[0], fold))
    # Materialise the per-fold (train, cv, test) triples up front so the
    # folds can be trained in parallel worker processes.
    data_in_process = [(train_data.loc[train_index], train_data.loc[cv_index],
                        test_data)
                       for i, (train_index, cv_index) in enumerate(kf)]
    index_all = [(train_index, cv_index)
                 for i, (train_index, cv_index) in enumerate(kf)]
    # One worker per fold; each ``_LR_train`` call is expected to return
    # (train, cv, test) prediction arrays — TODO confirm against _LR_train.
    with multiprocessing.Pool(fold) as p:
        k_val_list = p.map(_LR_train, data_in_process)
    for i, index, val in zip(range(fold), index_all, k_val_list):
        print('no %d train' % (i))
        train_index, cv_index = index
        predict_train, predict_cv, predict_test = val
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        print(' 训练损失:', cal_log_loss(predict_train, train_Y[train_index]))
        print(' 测试损失:', cal_log_loss(predict_cv, train_Y[cv_index]))
    # Sequential reference implementation kept by the author (no-op string).
    '''
    for i, (train_index, cv_index) in enumerate(kf):
        print('no %d train:'%(i))
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        predict_train,predict_cv,predict_test = _LR_train((train_feat,cv_feat,test_data))
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:,i] = predict_test
        print(' 训练损失:',cal_log_loss(predict_train, train_Y[train_index]))
        print(' 测试损失:',cal_log_loss(predict_cv, train_Y[cv_index]))
    '''
    # Median across folds for the test set; each sample's train prediction
    # was accumulated fold-1 times, hence the division below.
    predict_test = np.median(test_preds, axis=1)
    print('mean:', np.mean(predict_test))
    print('训练损失:', cal_log_loss(train_preds / (fold - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    submmit_result(predict_test, 'LR')
def LR_online(train_data, cv_data, test_data):
    """K-fold logistic regression on a re-balanced train+cv pool; scores test.

    NOTE(review): this definition shadows an earlier ``LR_online`` (the
    multiprocessing variant) defined above in the same file, making that
    version dead code at import time.

    Parameters
    ----------
    train_data, cv_data, test_data : pd.DataFrame
        Feature frames containing an ``is_trade`` label column.
    """
    train_data = pd.concat([train_data, cv_data], axis=0)
    # Re-balance the merged training pool (``rate`` is a module-level global).
    train_data = build_train_dataset(train_data, rate)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)
    fold = 5
    kf = KFold(len(train_data), n_folds=fold, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((test_data.shape[0], fold))
    for i, (train_index, cv_index) in enumerate(kf):
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        clf = LogisticRegression(C=1.2, fit_intercept=True, max_iter=3000,
                                 class_weight={0: 0.5, 1: 0.5})
        clf.fit(X=train_feat.values, y=train_Y[train_index])
        predict_train = clf.predict_proba(train_feat.values)[:, 1]
        predict_cv = clf.predict_proba(cv_feat.values)[:, 1]
        predict_test = clf.predict_proba(test_data.values)[:, 1]
        # Each sample appears in fold-1 training splits; averaged below.
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        print(' 训练损失:', cal_log_loss(predict_train, train_Y[train_index]))
        print(' 测试损失:', cal_log_loss(predict_cv, train_Y[cv_index]))
    predict_test = np.median(test_preds, axis=1)
    # Undo the down-sampling bias in the predicted probabilities.
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    # FIX: was hard-coded ``train_preds/4``; express it as fold-1 so it
    # stays correct if the fold count changes.
    print('训练损失:', cal_log_loss(train_preds / (fold - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    submmit_result(predict_test, 'LR')
def online(train_data, cv_data, test_data):
    """LightGBM K-fold model plus a dedicated LR re-scoring for old users.

    "Old" users are rows whose ``user_id_cvr_smooth`` history feature is not
    the missing marker ``-1``; their GBM scores are replaced by a logistic
    regression stacked on the score plus history features.

    Returns
    -------
    tuple
        ``(test_preds, feat_imp)`` — final test predictions and the last
        fold's LightGBM feature importances.

    NOTE(review): ``cv_data.index`` is shifted in place, mutating the
    caller's frame — kept for backward compatibility.
    """
    # Shift cv indices past train so the concatenated index stays unique,
    # then keep the history columns of old users aside.
    cv_data.index += len(train_data)
    train_data = pd.concat([train_data, cv_data], axis=0)
    history_cols = ['user_id_cvr_smooth', 'user_id_buy_count']
    old_user_data_train = train_data[history_cols]
    old_user_data_test = test_data[history_cols]
    # Train the GBM on the merged pool.
    train_Y = train_data['is_trade']
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)
    folds = 5
    kf = KFold(len(train_data), n_folds=folds, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((test_data.shape[0], folds))
    for i, (train_index, cv_index) in enumerate(kf):
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        print('第{}次训练...'.format(i))
        lgb_train = lgb.Dataset(train_feat.values, train_Y.loc[train_index])
        lgb_cv = lgb.Dataset(cv_feat.values, train_Y.loc[cv_index])
        gbm = lgb.train(params=params, train_set=lgb_train,
                        num_boost_round=6000, valid_sets=lgb_cv,
                        verbose_eval=False, early_stopping_rounds=200)
        predict_train = gbm.predict(train_feat.values)
        predict_cv = gbm.predict(cv_feat.values)
        test_preds[:, i] = gbm.predict(test_data.values)
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        # Feature importance of the current booster.
        # FIX: the original computed this identical Series twice per fold.
        feat_imp = pd.Series(
            gbm.feature_importance(),
            index=train_data.columns).sort_values(ascending=False)
        print(gbm.best_iteration)
        print(gbm.best_score)
        print(' 训练损失:', cal_log_loss(predict_train, train_Y.loc[train_index]))
        print(' 测试损失:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))
    test_preds = np.median(test_preds, axis=1)
    print(params)
    # FIX: was hard-coded ``/4``; each sample is in folds-1 training splits.
    print('训练损失:', cal_log_loss(train_preds / (folds - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    # Select old users (observed history, i.e. marker != -1) for re-scoring.
    train_old_user_index = old_user_data_train.loc[
        old_user_data_train.user_id_cvr_smooth != -1, :].index
    test_old_user_index = old_user_data_test.loc[
        old_user_data_test.user_id_cvr_smooth != -1, :].index
    train_old_score = cv_preds[train_old_user_index]
    test_old_score = test_preds[test_old_user_index]
    # FIX: take explicit copies — assigning a new 'y' column into a .loc
    # slice without .copy() trips pandas' SettingWithCopy behavior and may
    # silently write to a temporary.
    new_train_data = old_user_data_train.loc[
        old_user_data_train.user_id_cvr_smooth != -1, :].copy()
    new_test_data = old_user_data_test.loc[
        old_user_data_test.user_id_cvr_smooth != -1, :].copy()
    new_train_data['y'] = train_old_score
    new_test_data['y'] = test_old_score
    new_train_Y = train_Y[train_old_user_index]
    # Dedicated LR for old users: GBM score stacked with history features.
    clf = LogisticRegression(C=12, fit_intercept=True, max_iter=3000,
                             class_weight={0: 0.5, 1: 0.5})
    clf.fit(X=new_train_data.values, y=new_train_Y)
    train_LR_score = clf.predict_proba(new_train_data.values)[:, 1]
    test_LR_score = clf.predict_proba(new_test_data.values)[:, 1]
    # Splice the LR scores back over the GBM scores for old users.
    cv_preds[train_old_user_index] = train_LR_score
    test_preds[test_old_user_index] = test_LR_score
    # Old-user loss, then the overall loss after splicing.
    print('LR train:', cal_log_loss(train_LR_score, new_train_Y))
    print('All train:', cal_log_loss(cv_preds, train_Y))
    submmit_result(test_preds, 'old_and_new')
    return test_preds, feat_imp
    # NOTE(review): fragment — the enclosing ``def`` (a second-stage stacking
    # model, judging by the XGB_LR usage below) begins before this chunk and
    # is not visible here; the indentation below is reconstructed.
    clf = LinearRegression(fit_intercept=True, normalize=True, n_jobs=-1)
    clf.fit(X=predict_day7, y=train_Y)
    predict_train_2 = clf.predict(predict_day7)
    predict_cv_2 = clf.predict(predict_cv)
    print('train:', cal_log_loss(predict_train_2, train_Y))
    print('test:', cal_log_loss(predict_cv_2, cv_Y))
    print('train mean:', np.mean(predict_train_2))
    print('cv mean:', np.mean(predict_cv_2))
    return predict_cv_2


if __name__ == '__main__':
    t0 = time.time()
    # Load the day-7 training split plus cv/test feature frames.
    day7_data = _load_splited_df(path=cache_pkl_path + 'Tree_day/train_7')
    cv_data = _load_splited_df(path=cache_pkl_path + 'Tree_day/cv')
    test_data = _load_splited_df(path=cache_pkl_path + 'Tree_day/test')
    print('off line')
    # Offline evaluation pass (result discarded).
    _ = XGB_model_second(day7_data, cv_data)
    print('on line')
    # day7_data = pd.concat([day7_data, cv_data],axis=0)
    predict_cv_2 = XGB_model_second(day7_data, test_data)
    submmit_result(predict_cv_2, 'XGB_LR')
    # NOTE(review): fragment — the enclosing ``def`` (a GBDT+LR pipeline,
    # judging by the 'GBDT_LR' submission tag) begins before this chunk and
    # is not visible here; the indentation below is reconstructed.
    gbc.fit(train_data.values, train_Y)
    predict_train = gbc.predict_proba(train_data.values)[:, 1]
    predict_cv = gbc.predict_proba(cv_data.values)[:, 1]
    predict_test = gbc.predict_proba(test_data.values)[:, 1]
    # print(gbc.get_params)
    print('训练损失:', cal_log_loss(predict_train, train_Y))
    print('测试损失:', cal_log_loss(predict_cv, cv_Y))
    t1 = time.time()
    print('训练用时:', t1 - t0)
    # presumably maps samples onto GBDT-derived features (e.g. leaf
    # encodings) — confirm against gen_gbdt_feature's definition.
    new_train, new_cv, new_test = gen_gbdt_feature(gbc, train_data, cv_data,
                                                   test_data)
    print('train shap:', new_train.shape)
    print('cv shape', new_cv.shape)
    print('test shape', new_test.shape)
    # LR trained on top of the GBDT-derived features.
    clf = LogisticRegression(C=0.8, fit_intercept=True, max_iter=3000,
                             class_weight={0: 0.5, 1: 0.5})
    clf.fit(X=new_train, y=np.squeeze(train_Y))
    predict_train = clf.predict_proba(new_train)[:, 1]
    predict_cv = clf.predict_proba(new_cv)[:, 1]
    predict_test = clf.predict_proba(new_test)[:, 1]
    print('训练损失:', cal_log_loss(predict_train, train_Y))
    print('测试损失:', cal_log_loss(predict_cv, cv_Y))
    t1 = time.time()
    print('训练用时:', t1 - t0)
    submmit_result(predict_test, 'GBDT_LR')
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 16 22:13:45 2018

@author: weiqing
"""
import pandas as pd
import numpy as np
from utils import load_pickle, raw_data_path, feature_data_path, cache_pkl_path, result_path, model_path, submmit_result

if __name__ == '__main__':
    # Blend three model submissions by taking the element-wise median of
    # their predicted scores.
    XGB = pd.read_csv('../result/XGB_20180421_211412.txt', sep=' ')
    LGB = pd.read_csv('../result/LGB_20180421_172434.txt', sep=' ')
    FFM = pd.read_csv('../result/FFM_20180421_215653.txt', sep=' ')
    result = np.column_stack([
        XGB['predicted_score'].values,
        LGB['predicted_score'].values,
        FFM['predicted_score'].values,
    ])
    median = np.median(result, axis=1)
    submmit_result(median, 'median')
def change_to_result():
    """Convert the raw FFM output file into a formatted 'FFM' submission."""
    raw = pd.read_csv('../result/ffm_online_result.csv', header=None)
    scores = np.squeeze(raw.values)
    submmit_result(scores, 'FFM')
if __name__ == '__main__':
    t0 = time.time()
    # Stage 1: first-level LR predictions for the three splits.
    day7_data = _load_splited_df(path=cache_pkl_path + 'LR_day/train_7')
    cv_data = _load_splited_df(path=cache_pkl_path + 'LR_day/cv')
    test_data = _load_splited_df(path=cache_pkl_path + 'LR_day/test')
    predict_day7, predict_cv, predict_test = LR_model_first(
        day7_data, cv_data, test_data)

    # Stage 2: blend the first-stage scores with a linear model.
    train_Y = day7_data['is_trade'].values
    cv_Y = cv_data['is_trade'].values
    blender = LinearRegression(fit_intercept=True, normalize=True, n_jobs=-1)
    blender.fit(X=predict_day7, y=train_Y)
    predict_train_2 = blender.predict(predict_day7)
    predict_cv_2 = blender.predict(predict_cv)
    predict_test_2 = blender.predict(predict_test)

    print('train:', cal_log_loss(predict_train_2, train_Y))
    print('test:', cal_log_loss(predict_cv_2, cv_Y))
    for tag, scores in (('train', predict_train_2),
                        ('cv', predict_cv_2),
                        ('test', predict_test_2)):
        print(tag + ' mean:', np.mean(scores))
    submmit_result(predict_test_2, 'LR_LR')