def test(**kwargs):
    # ---------------------- Update parameters ----------------------
    opt = DefaultConfig()
    opt.update(**kwargs)
    opt.printf()

    # ---------------------- Data processing ----------------------
    # Load data
    train, test = get_test_data(opt)
    gc.collect()
    # # Build samples
    # test_sample = get_sample(train, test, load=True)
    # gc.collect()
    # # Build features
    # test_feat = get_feat(train, test_sample)
    # gc.collect()
    # Save features to file
    # test_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
    test_feat = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(test.shape[0]))
    test_feat = get_feat(train, test_feat)
    gc.collect()
    test_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}_filter.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)

    # ---------------------- Load models ----------------------
    # opt['model_name'] = 'lgb_1_90_all.pkl'
    # gbm0, use_feat0 = load_model(opt)
    opt['model_name'] = 'lgb_2017-09-23#20:14:52_0.58893.pkl'
    gbm1, use_feat1 = load_model(opt)
    # opt['model_name'] = 'lgb_2_300_top15.pkl'
    # gbm2, use_feat2 = load_model(opt)
    # opt['model_name'] = 'lgb_3_300_top10.pkl'
    # gbm3, use_feat3 = load_model(opt)
    # opt['model_name'] = 'lgb_4_300_top5.pkl'
    # gbm4, use_feat4 = load_model(opt)

    # ---------------------- Save predictions -------------------
    # test_feat.loc[:, 'pred'] = gbm0.predict(test_feat[use_feat0])
    # gc.collect()
    # res = test_feat[['orderid', 'geohashed_end_loc', 'pred']].sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(25)
    # res[['orderid', 'geohashed_end_loc']].to_hdf('/home/xuwenchao/dyj-storage/sample_25_{}_filter_leak_sample.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
    # gc.collect()
    # test_feat.loc[:, 'pred'] = gbm1.predict(test_feat[use_feat1])
    # test_feat[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/pred/pred_{}_0.58820.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
    res = predict(test_feat, use_feat1, gbm1)
    test_feat[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/pred/pred_{}_0.58893.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
    gc.collect()
    cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
    res_path = '{}/day{}_{}_wc_sample_0.58893.csv'.format(opt['result_dir'], opt['test_startday'], cur_time)
    res.to_csv(res_path, index=False)
    print('Saved test results to:', res_path)
def val(**kwargs):
    # ---------------------- Update parameters ----------------------
    opt = DefaultConfig()
    opt.update(**kwargs)
    opt.printf()

    # ---------------------- Data processing ----------------------
    # Load data
    # train1, train2, train_test = get_train_data(opt)
    # Build samples
    # train_sample = get_sample(train1, train2, load=True)
    # Build features
    # train_feat = get_feat(train_test, train_sample)
    # gc.collect()
    # train_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(opt['startday']), 'w', complib='blosc', complevel=5)
    train_feat = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf')

    # ---------------------- Load models ----------------------
    # opt['model_name'] = 'lgb_1_90_all.pkl'
    # gbm0, use_feat0 = load_model(opt)
    opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
    gbm, use_feat = load_model(opt)
    opt['model_name'] = 'lgb_2017-09-23#20:14:52_0.58893.pkl'
    gbm1, use_feat1 = load_model(opt)
    # gbm2, use_feat2 = load_model(opt)
    # opt['model_name'] = 'lgb_2017-09-03#23:24:26_0.57836.pkl'
    # gbm3, use_feat3 = load_model(opt)
    # opt['model_name'] = ''
    # gbm4, use_feat4 = load_model(opt)

    # ---------------------- Evaluation -------------------------
    train_feat.loc[:, 'pred'] = gbm.predict(train_feat[use_feat])
    gc.collect()
    train_feat[['orderid', 'geohashed_end_loc', 'pred']].to_csv('/home/xuwenchao/dyj-storage/pred/pred_23_24_0.58820.csv', index=None)
    train_feat.loc[:, 'pred'] = gbm1.predict(train_feat[use_feat1])
    gc.collect()
    train_feat[['orderid', 'geohashed_end_loc', 'pred']].to_csv('/home/xuwenchao/dyj-storage/pred/pred_23_24_0.58893.csv', index=None)
def train(**kwargs):
    # ---------------------- Update parameters ----------------------
    opt = DefaultConfig()
    opt.update(**kwargs)
    opt.printf()

    # ---------------------- Data processing ----------------------
    # Load data
    train1, train2 = get_train_data(opt)
    # Build samples
    # train_sample = get_sample(train1, train2, load=True)
    # Build features
    # train_feat = get_feat(train1, train_sample)
    # Build labels
    # train_all = get_label(train_feat, opt)
    # gc.collect()
    # train_all.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf', 'w', complib='blosc', complevel=5)
    train_all = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf')
    print(train_all.shape)

    # Select the features to use
    # opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
    # gbm, use_feat = load_model(opt)
    # predictors_100 = pd.DataFrame(data={'feature_name': gbm.feature_name(), 'feature_importance': gbm.feature_importance()})
    # predictors_100 = predictors_100.sort_values(by=['feature_importance'], ascending=False)['feature_name'].values[:100]
    # use_feat = list(predictors_100) + ['orderid', 'geohashed_end_loc', 'label'] + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
    # train_all = train_all[use_feat]
    # gc.collect()

    # -------------------- Train layer 1 ------------------------
    # ********* Prepare data **********
    # Split out a validation set
    train, val = train_test_split(train_all, test_size=0.1)
    # Choose which features to use
    # opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
    # gbm, use_feat = load_model(opt)
    filters = set([
        'orderid', 'userid', 'biketype', 'geohashed_start_loc', 'bikeid',
        'starttime', 'geohashed_end_loc', 'label'
    ])
    predictors = list(filter(lambda x: x not in filters, train_all.columns.tolist()))
    # predictors = pd.DataFrame(data={'feature_name': gbm.feature_name(), 'feature_importance': gbm.feature_importance()})
    # predictors = predictors.sort_values(by=['feature_importance'], ascending=False)['feature_name'].values[:100]
    # use_feat = list(predictors) + ['orderid', 'geohashed_end_loc'] + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
    # predictors = list(predictors_100) + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
    print('Using {} features:\n'.format(len(predictors)), predictors)

    # Build datasets
    X_train = train[predictors]
    y_train = train['label']
    X_val = val[predictors]
    y_val = val['label']
    del train, val
    gc.collect()

    # ********* LightGBM *********
    # Datasets
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
    # Parameters
    params = {
        'objective': 'binary',
        'metric': {'auc', 'binary_logloss'},
        'is_unbalance': True,
        'num_leaves': opt['lgb_leaves'],
        'learning_rate': opt['lgb_lr'],
        'feature_fraction': 0.886,
        'bagging_fraction': 0.886,
        'bagging_freq': 5
    }
    gc.collect()

    # ********** Train *********
    gbm1 = lgb.train(params,
                     lgb_train,
                     num_boost_round=1200,
                     valid_sets=[lgb_train, lgb_val],
                     early_stopping_rounds=5)
    gc.collect()

    # ********* Save the model *********
    cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
    # save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb', cur_time, score[0])
    save_path = '{}/{}_{}.pkl'.format(opt['model_dir'], 'lgb', cur_time)
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm1, fout)
    print('Saved model:', save_path)
    gc.collect()

    # ********* Evaluation *********
    # Check the score on the training set
    del X_train, y_train, X_val, y_val
    gc.collect()
    score = get_score(train_all, predictors, gbm1, opt)
    print('Training-set score: {}'.format(score))
    import sys
    sys.exit(0)

    # save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_1_300_top25')
    # with open(save_path, 'wb') as fout:
    #     pickle.dump(gbm1, fout)
    # print('Saved model (layer 1):', save_path)

    # ********* Save predictions *********
    # train_all[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/train2324_80_pred_res.hdf', 'w', complib='blosc', complevel=5)
    # print('Save train_pred_res.hdf successful!!!')
    # import sys
    # sys.exit(0)

    # -------------------- Train layer 2 ------------------------
    # opt['model_name'] = 'lgb_1_300_top25.pkl'
    # gbm1, use_feat1 = load_model(opt)
    # train_all.loc[:, 'pred'] = gbm1.predict(train_all[use_feat1])
    # Drop low-importance features, keep only the top-ranked candidates per order, and retrain.
    # (The model can later be loaded for fine-tuning, which helps especially when samples are
    # scarce; even the top 5 could be used, but the top 15 covers 99.5% of the original labels
    # and the top 10 covers 98%, so those two may work better. Alternatives:
    # 5 (+finetune), 10 (+finetune), 15 (+finetune).)
    predictors = pd.DataFrame(data={'feature_name': gbm1.feature_name(),
                                    'feature_importance': gbm1.feature_importance()})
    predictors = predictors[predictors['feature_importance'] > 0]['feature_name'].values
    print('Layer 2 uses {} features:\n'.format(len(predictors)), predictors)
    train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(15)
    # train_all = rank(train_all, 'orderid', 'pred', ascending=False)
    del train_all['pred']
    print('Layer-2 data:', train_all.shape)

    # ********* Prepare data **********
    # Split out a validation set
    train, val = train_test_split(train_all, test_size=0.1)
    # Build datasets
    X_train = train[predictors]
    y_train = train['label']
    X_val = val[predictors]
    y_val = val['label']
    del train, val
    gc.collect()
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # ********** Train *********
    gbm2 = lgb.train(params,
                     lgb_train,
                     num_boost_round=1200,
                     valid_sets=[lgb_train, lgb_val],
                     early_stopping_rounds=5
                     # init_model=gbm1  # finetune
                     )

    # ********* Evaluation *********
    # Check the score on the training set
    score = get_score(train_all, predictors, gbm2, opt)
    print('Training-set score (layer 2): {}'.format(score))

    # ********* Save the model *********
    cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
    save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_2', cur_time, score[0])
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm2, fout)
    print('Saved model (layer 2):', save_path)
    # save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_2_300_top15')
    # with open(save_path, 'wb') as fout:
    #     pickle.dump(gbm2, fout)
    # print('Saved model (layer 2):', save_path)
    import sys
    sys.exit(0)

    # -------------------- Train layer 3 ------------------------
    # Keep the top 10 candidates per order
    predictors = pd.DataFrame(data={'feature_name': gbm2.feature_name(),
                                    'feature_importance': gbm2.feature_importance()})
    predictors = predictors[predictors['feature_importance'] > 0]['feature_name'].values
    print('Layer 3 uses {} features:\n'.format(len(predictors)), predictors)
    train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(10)
    # train_all = rank(train_all, 'orderid', 'pred', ascending=False)
    del train_all['pred']
    print('Layer-3 data:', train_all.shape)

    # ********* Prepare data **********
    # Split out a validation set
    train, val = train_test_split(train_all, test_size=0.1)
    # Build datasets
    X_train = train[predictors]
    y_train = train['label']
    X_val = val[predictors]
    y_val = val['label']
    del train, val
    gc.collect()
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # ********** Train *********
    gbm3 = lgb.train(params,
                     lgb_train,
                     num_boost_round=1200,
                     valid_sets=[lgb_train, lgb_val],
                     early_stopping_rounds=5
                     # init_model=gbm2  # finetune
                     )

    # ********* Evaluation *********
    # Check the score on the training set
    score = get_score(train_all, predictors, gbm3, opt)
    print('Training-set score (layer 3): {}'.format(score))

    # ********* Save the model *********
    cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
    save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_3', cur_time, score[0])
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm3, fout)
    print('Saved model (layer 3):', save_path)
    save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_3_300_top10')
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm3, fout)
    print('Saved model (layer 3):', save_path)

    # -------------------- Train layer 4 ------------------------
    # Keep the top 5 candidates per order
    predictors = pd.DataFrame(data={'feature_name': gbm3.feature_name(),
                                    'feature_importance': gbm3.feature_importance()})
    predictors = predictors[predictors['feature_importance'] > 0]['feature_name'].values
    print('Layer 4 uses {} features:\n'.format(len(predictors)), predictors)
    train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(5)
    # train_all = rank(train_all, 'orderid', 'pred', ascending=False)
    del train_all['pred']
    print('Layer-4 data:', train_all.shape)

    # ********* Prepare data **********
    # Split out a validation set
    train, val = train_test_split(train_all, test_size=0.1)
    # Build datasets
    X_train = train[predictors]
    y_train = train['label']
    X_val = val[predictors]
    y_val = val['label']
    del train, val
    gc.collect()
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # ********** Train *********
    gbm4 = lgb.train(params,
                     lgb_train,
                     num_boost_round=1200,
                     valid_sets=[lgb_train, lgb_val],
                     early_stopping_rounds=5
                     # init_model=gbm3  # finetune
                     )

    # ********* Evaluation *********
    # Check the score on the training set
    score = get_score(train_all, predictors, gbm4, opt)
    print('Training-set score (layer 4): {}'.format(score))

    # ********* Save the model *********
    cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
    save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_4', cur_time, score[0])
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm4, fout)
    print('Saved model (layer 4):', save_path)
    save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_4_300_top5')
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm4, fout)
    print('Saved model (layer 4):', save_path)
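# Illustration only (not part of the pipeline above): each later layer keeps only the
# top-k candidate end locations per order before retraining. A minimal, self-contained
# sketch of that step on made-up data, mirroring the sort_values/groupby/head pattern
# used for head(15)/head(10)/head(5); the function name and toy values are hypothetical.
def _demo_topk_candidates(k=3):
    import pandas as pd
    cand = pd.DataFrame({
        'orderid': [1, 1, 1, 1, 2, 2, 2],
        'geohashed_end_loc': ['a', 'b', 'c', 'd', 'a', 'c', 'e'],
        'pred': [0.9, 0.7, 0.4, 0.1, 0.8, 0.5, 0.2],
    })
    # Rank candidates of each order by predicted score, then keep the k best per order.
    topk = (cand.sort_values(by=['orderid', 'pred'], ascending=False)
                .groupby('orderid').head(k))
    return topk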
def test_stack(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)
    logger = Logger()

    result_dir = '/home/dyj/'
    resmat = [result_dir + 'TextCNN1_2017-07-27#12:30:16_test_res.pt',
              result_dir + 'TextCNN2_2017-07-27#12:22:42_test_res.pt',
              result_dir + 'RNN1_2017-07-27#12:35:51_test_res.pt',
              result_dir + 'RNN2_2017-07-27#11:33:24_test_res.pt',
              result_dir + 'RCNN1_2017-07-27#11:30:42_test_res.pt',
              result_dir + 'RCNNcha_2017-07-27#16:00:33_test_res.pt',
              result_dir + 'FastText4_2017-07-28#17:20:21_test_res.pt',
              result_dir + 'FastText1_2017-07-29#10:47:46_test_res.pt']
    opt['stack_num'] = len(resmat)
    test_dataset = Stack_Dataset(resmat=resmat, test=True)
    test_loader = data.DataLoader(test_dataset, shuffle=False, batch_size=opt['batch_size'])
    test_idx = np.load(opt['test_idx'])
    topic_idx = np.load(opt['topic_idx'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(opt)
    print(model)

    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'],
                               name=opt['load_name'])

    if opt['device'] is not None:
        torch.cuda.set_device(opt['device'])
    if opt['cuda']:
        model.cuda()

    logger.info('Start testing...')
    model.eval()

    predict_label_list = []
    res = torch.Tensor(opt['test_num'], opt['class_num'])
    for i, batch in enumerate(test_loader, 0):
        batch_size = batch[0].size(0)
        resmat = batch
        resmat = [Variable(ii) for ii in resmat]
        if opt['cuda']:
            resmat = [ii.cuda() for ii in resmat]
        logit = model(resmat)
        if opt.get('save_resmat', False):
            res[i * opt['batch_size']:i * opt['batch_size'] + batch_size] = logit.data.cpu()
        predict_label_list += [list(ii) for ii in logit.topk(5, 1)[1].data]

    if opt.get('save_resmat', False):
        torch.save(res, '{}/{}_{}_test_res.pt'.format(
            opt['result_dir'], opt['model'],
            datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')))
        return

    lines = []
    for qid, top5 in zip(test_idx, predict_label_list):
        topic_ids = [topic_idx[i] for i in top5]
        lines.append('{},{}'.format(qid, ','.join(topic_ids)))
    if opt.get('load_name', None) is None:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'],
                     result_dir=opt['result_dir'])
    else:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'],
                     name=opt['load_name'], result_dir=opt['result_dir'])
def train_stack(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)
    vis = Visualizer(opt['model'])
    logger = Logger()

    result_dir = '/home/dyj/'
    resmat = [(result_dir + 'RNN10_cal_res.pt', 10),
              (result_dir + 'TextCNN10_char.pt', 10),
              (result_dir + 'TextCNN10_top1.pt', 10),
              (result_dir + 'TextCNN10_top1_char.pt', 10),
              (result_dir + 'FastText10_res.pt', 10),
              ('/mnt/result/results/TextCNN5_12h.pt', 5),
              ('/mnt/result/results/RNN1_char.pt', 1)]
    label = result_dir + 'label.pt'
    opt['stack_num'] = len(resmat)
    train_dataset = Stack_Dataset(resmat=resmat, label=label)
    train_loader = data.DataLoader(train_dataset, shuffle=True, batch_size=opt['batch_size'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(opt)
    print(model)

    if opt['use_self_loss']:
        Loss = getattr(models, opt['loss_function'])
    else:
        Loss = getattr(nn, opt['loss_function'])

    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'],
                               name=opt['load_name'])

    if opt['device'] is not None:
        torch.cuda.set_device(opt['device'])
    if opt['cuda']:
        model.cuda()

    loss_function = Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt['lr'])

    logger.info('Start running...')

    steps = 0
    model.train()
    for epoch in range(opt['base_epoch'] + 1, opt['epochs'] + 1):
        for i, batch in enumerate(train_loader, 1):
            resmat, label = batch[0:-1], batch[-1]
            resmat, label = [Variable(ii) for ii in resmat], Variable(label)
            if opt['cuda']:
                resmat, label = [ii.cuda() for ii in resmat], label.cuda()

            optimizer.zero_grad()
            logit = model(resmat)
            loss = loss_function(logit, label)
            loss.backward()
            optimizer.step()

            steps += 1
            if steps % opt['log_interval'] == 0:
                corrects = ((logit.data > opt['threshold']) == (label.data).byte()).sum()
                accuracy = 100.0 * corrects / (opt['batch_size'] * opt['class_num'])
                log_info = 'Steps[{:>8}] (epoch[{:>2}] / batch[{:>5}]) - loss: {:.6f}, acc: {:.4f} % ({} / {})'.format(
                    steps, epoch, i, loss.data[0], accuracy,
                    corrects, opt['batch_size'] * opt['class_num'])
                logger.info(log_info)
                vis.plot('loss', loss.data[0])
                precision, recall, score = get_score(logit.data.cpu(), label.data.cpu())
                logger.info('Precision {}, Recall {}, Score {}'.format(precision, recall, score))
                vis.plot('score', score)
        logger.info('Training epoch {} finished!'.format(epoch))
        # save_model(model, model_dir=opt['model_dir'], model_name=opt['model'], epoch=epoch)
        if epoch == 3:
            for param_group in optimizer.param_groups:
                param_group['lr'] = opt['lr'] * opt['lr_decay']
        save_model(model, model_dir=opt['model_dir'], model_name=opt['model'], epoch=epoch)
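# Illustration only (an assumption, not this project's actual stack model): Stack_Dataset
# yields one score matrix per base model and model(resmat) receives that list. One minimal
# way such a stacker could combine them is a learned per-model weighting. Self-contained
# sketch in plain PyTorch; the class name WeightedStack is hypothetical and not defined
# elsewhere in this repo.
import torch
import torch.nn as nn

class WeightedStack(nn.Module):
    def __init__(self, stack_num):
        super(WeightedStack, self).__init__()
        # one learnable weight per base model
        self.model_weights = nn.Parameter(torch.ones(stack_num))

    def forward(self, resmat):
        # resmat: list of (batch, class_num) score tensors, one per base model
        stacked = torch.stack(resmat, dim=0)                 # (stack_num, batch, class_num)
        w = self.model_weights / self.model_weights.sum()    # normalized model weights
        return (w.view(-1, 1, 1) * stacked).sum(dim=0)       # weighted sum over models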
def test(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)
    logger = Logger()

    prefix = ''
    if opt['use_double_length']:
        prefix += '_2'
    print(prefix)

    if opt['use_char']:
        logger.info('Load char data starting...')
        opt['embed_num'] = opt['char_embed_num']
        embed_mat = np.load(opt['char_embed'])
        test_title = np.load(opt['test_title_char' + prefix])
        test_desc = np.load(opt['test_desc_char' + prefix])
        logger.info('Load char data finished!')
    elif opt['use_word']:
        logger.info('Load word data starting...')
        opt['embed_num'] = opt['word_embed_num']
        embed_mat = np.load(opt['word_embed'])
        test_title = np.load(opt['test_title_word' + prefix])
        test_desc = np.load(opt['test_desc_word' + prefix])
        logger.info('Load word data finished!')
    elif opt['use_char_word']:
        logger.info('Load char-word data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        test_title = np.load(opt['test_title_char' + prefix])
        test_desc = np.load(opt['test_desc_word' + prefix])
        logger.info('Load char-word data finished!')
    elif opt['use_word_char']:
        logger.info('Load word-char data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        test_title = np.load(opt['test_title_word' + prefix])
        test_desc = np.load(opt['test_desc_char' + prefix])
        logger.info('Load word-char data finished!')

    test_idx = np.load(opt['test_idx'])
    topic_idx = np.load(opt['topic_idx'])
    test_dataset = Dataset(test=True, title=test_title, desc=test_desc)
    test_loader = data.DataLoader(test_dataset, shuffle=False, batch_size=opt['batch_size'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(embed_mat, opt)

    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'],
                               name=opt['load_name'])

    if opt['device'] is not None:
        torch.cuda.set_device(opt['device'])
    if opt['cuda']:
        model.cuda()

    logger.info('Start testing...')
    model.eval()

    predict_label_list = []
    res = torch.Tensor(opt['test_num'], opt['class_num'])
    for i, batch in enumerate(test_loader, 0):
        batch_size = batch[0].size(0)
        title, desc = batch
        title, desc = Variable(title), Variable(desc)
        if opt['cuda']:
            title, desc = title.cuda(), desc.cuda()
        logit = model(title, desc)
        if opt.get('save_resmat', False):
            res[i * opt['batch_size']:i * opt['batch_size'] + batch_size] = logit.data.cpu()
        predict_label_list += [list(ii) for ii in logit.topk(5, 1)[1].data]

    if opt.get('save_resmat', False):
        torch.save(res, '{}/{}_test_res.pt'.format(opt['result_dir'], opt['model']))
        return

    lines = []
    for qid, top5 in zip(test_idx, predict_label_list):
        topic_ids = [topic_idx[i] for i in top5]
        lines.append('{},{}'.format(qid, ','.join(topic_ids)))
    if opt.get('load_name', None) is None:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'],
                     result_dir=opt['result_dir'])
    else:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'],
                     name=opt['load_name'], result_dir=opt['result_dir'])
def train(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)
    vis = Visualizer(opt['model'])
    logger = Logger()

    prefix = ''
    if opt['use_double_length']:
        prefix += '_2'
    print(prefix)

    if opt['use_char']:
        logger.info('Load char data starting...')
        opt['embed_num'] = opt['char_embed_num']
        embed_mat = np.load(opt['char_embed'])
        train_title = np.load(opt['train_title_char' + prefix])
        train_desc = np.load(opt['train_desc_char' + prefix])
        train_label = np.load(opt['train_label'])
        val_title = np.load(opt['val_title_char' + prefix])
        val_desc = np.load(opt['val_desc_char' + prefix])
        val_label = np.load(opt['val_label'])
        logger.info('Load char data finished!')
    elif opt['use_word']:
        logger.info('Load word data starting...')
        opt['embed_num'] = opt['word_embed_num']
        embed_mat = np.load(opt['word_embed'])
        train_title = np.load(opt['train_title_word' + prefix])
        train_desc = np.load(opt['train_desc_word' + prefix])
        train_label = np.load(opt['train_label'])
        val_title = np.load(opt['val_title_word' + prefix])
        val_desc = np.load(opt['val_desc_word' + prefix])
        val_label = np.load(opt['val_label'])
        logger.info('Load word data finished!')
    elif opt['use_char_word']:
        logger.info('Load char-word data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        train_title = np.load(opt['train_title_char' + prefix])
        train_desc = np.load(opt['train_desc_word' + prefix])
        train_label = np.load(opt['train_label'])
        val_title = np.load(opt['val_title_char' + prefix])
        val_desc = np.load(opt['val_desc_word' + prefix])
        val_label = np.load(opt['val_label'])
        logger.info('Load char-word data finished!')
    elif opt['use_word_char']:
        logger.info('Load word-char data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        train_title = np.load(opt['train_title_word' + prefix])
        train_desc = np.load(opt['train_desc_char' + prefix])
        train_label = np.load(opt['train_label'])
        val_title = np.load(opt['val_title_word' + prefix])
        val_desc = np.load(opt['val_desc_char' + prefix])
        val_label = np.load(opt['val_label'])
        logger.info('Load word-char data finished!')

    train_dataset = Dataset(title=train_title, desc=train_desc, label=train_label, class_num=opt['class_num'])
    train_loader = data.DataLoader(train_dataset, shuffle=True, batch_size=opt['batch_size'])
    val_dataset = Dataset(title=val_title, desc=val_desc, label=val_label, class_num=opt['class_num'])
    val_loader = data.DataLoader(val_dataset, shuffle=False, batch_size=opt['batch_size'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(embed_mat, opt)
    print(model)

    loss_weight = torch.ones(opt['class_num'])
    if opt['boost']:
        if opt['base_layer'] != 0:
            cal_res = torch.load('{}/{}/layer_{}_cal_res_3.pt'.format(opt['model_dir'], opt['model'], opt['base_layer']),
                                 map_location=lambda storage, loc: storage)
            logger.info('Load cal_res successful!')
            loss_weight = torch.load('{}/{}/layer_{}_loss_weight_3.pt'.format(opt['model_dir'], opt['model'], opt['base_layer'] + 1),
                                     map_location=lambda storage, loc: storage)
        else:
            cal_res = torch.zeros(opt['val_num'], opt['class_num'])
        print('cur_layer:', opt['base_layer'] + 1,
              'loss_weight:', loss_weight.mean(), loss_weight.max(), loss_weight.min(), loss_weight.std())

    if opt['use_self_loss']:
        Loss = getattr(models, opt['loss_function'])
    else:
        Loss = getattr(nn, opt['loss_function'])

    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'],
                               name=opt['load_name'])

    if opt['cuda'] and opt['device'] is not None:
        torch.cuda.set_device(opt['device'])
    if opt['cuda']:
        model.cuda()
        loss_weight = loss_weight.cuda()

    # import sys
    # precision, recall, score = eval(val_loader, model, opt, save_res=True)
    # print precision, recall, score
    # sys.exit()

    loss_function = Loss(weight=loss_weight + 1 - loss_weight.mean())
    optimizer = torch.optim.Adam(model.parameters(), lr=opt['lr'])

    logger.info('Start running...')

    steps = 0
    model.train()
    base_epoch = opt['base_epoch']
    for epoch in range(1, opt['epochs'] + 1):
        for i, batch in enumerate(train_loader, 0):
            title, desc, label = batch
            title, desc, label = Variable(title), Variable(desc), Variable(label).float()
            if opt['cuda']:
                title, desc, label = title.cuda(), desc.cuda(), label.cuda()

            optimizer.zero_grad()
            logit = model(title, desc)
            loss = loss_function(logit, label)
            loss.backward()
            optimizer.step()

            steps += 1
            if steps % opt['log_interval'] == 0:
                corrects = ((logit.data > opt['threshold']) == (label.data).byte()).sum()
                accuracy = 100.0 * corrects / (opt['batch_size'] * opt['class_num'])
                log_info = 'Steps[{:>8}] (epoch[{:>2}] / batch[{:>5}]) - loss: {:.6f}, acc: {:.4f} % ({} / {})'.format(
                    steps, epoch + base_epoch, (i + 1), loss.data[0], accuracy,
                    corrects, opt['batch_size'] * opt['class_num'])
                logger.info(log_info)
                vis.plot('loss', loss.data[0])
                precision, recall, score = eval(batch, model, opt, isBatch=True)
                vis.plot('score', score)
        logger.info('Training epoch {} finished!'.format(epoch + base_epoch))
        precision, recall, score = eval(val_loader, model, opt)
        log_info = 'Epoch[{}] - score: {:.6f} (precision: {:.4f}, recall: {:.4f})'.format(
            epoch + base_epoch, score, precision, recall)
        vis.log(log_info)
        save_model(model, model_dir=opt['model_dir'], model_name=opt['model'],
                   epoch=epoch + base_epoch, score=score)
        if epoch + base_epoch == 2:
            model.opt['static'] = False
        elif epoch + base_epoch == 4:
            for param_group in optimizer.param_groups:
                param_group['lr'] = opt['lr'] * opt['lr_decay']
        elif epoch + base_epoch >= 5:
            if opt['boost']:
                res, truth = eval(val_loader, model, opt, return_res=True)
                ori_score = get_score(cal_res, truth)
                cal_res += res
                cur_score = get_score(cal_res, truth)
                logger.info('Layer {}: {}, Layer {}: {}'.format(opt['base_layer'], ori_score,
                                                                opt['base_layer'] + 1, cur_score))
                loss_weight = get_loss_weight(cal_res, truth)
                torch.save(cal_res, '{}/{}/layer_{}_cal_res_3.pt'.format(opt['model_dir'], opt['model'], opt['base_layer'] + 1))
                logger.info('Save cal_res successful!')
                torch.save(loss_weight, '{}/{}/layer_{}_loss_weight_3.pt'.format(opt['model_dir'], opt['model'], opt['base_layer'] + 2))
            break