def GBDT_train(self, X_train, X_valid, labels_train, labels_valid, X_test, gbdt_params_all):
    """Train (or reuse) a sklearn gradient-boosting model and score it on the validation set.

    :param X_train/labels_train: training features / labels
    :param X_valid/labels_valid: validation features / labels
    :param X_test: test features (no labels)
    :param gbdt_params_all: estimator params plus an 'objective_type' key
        ('regressor' selects GradientBoostingRegressor, anything else the classifier)
    :return: (validation predictions, test predictions, validation score)
    """
    params = gbdt_params_all.copy()
    objective_type = params.pop('objective_type')

    def _new_estimator():
        # Estimator family is chosen by the popped objective type.
        if objective_type == 'regressor':
            return GradientBoostingRegressor(**params)
        return GradientBoostingClassifier(**params)

    if not self.config.retrain:
        # Prefer continuing from a previously saved model.
        model_load = self.load_model()
        if not model_load:
            print('不存在模型:{},从头训练'.format(self.modelName))
            clf = _new_estimator()
            clf.fit(X_train, labels_train)
        else:
            # NOTE(review): sklearn GBDT .fit() refits from scratch rather than
            # training incrementally — confirm this matches the intent here.
            clf = model_load.fit(X_train, labels_train)
    else:
        clf = _new_estimator()
        clf.fit(X_train, labels_train)

    val_pred = clf.predict(X_valid.values)
    test_pred = clf.predict(X_test.values)
    scorer = defindMetrics.MyMetrics(self.config.metrics_name)
    val_score = scorer.metricsFunc(val_pred, labels_valid)
    self.save_model(clf, self.config.saveModel)
    return val_pred, test_pred, val_score
def CGB_train(self, X_train, X_valid, labels_train, labels_valid, X_test, cgb_params_all):
    """Train (or reuse) a CatBoost model with early stopping and score it on the validation set.

    :param cgb_params_all: constructor params plus 'objective_type', and optionally
        fit-time knobs 'verbose', 'early_stopping_rounds', 'cat_features'
    :return: (validation predictions, test predictions, validation score)
    """
    params = cgb_params_all.copy()
    objective_type = params.pop('objective_type')
    # Separate fit-time control knobs from the constructor parameters.
    contrl = {'verbose': 200, 'early_stopping_rounds': 100}
    for key in list(contrl):
        if key in params:
            contrl[key] = params.pop(key)
    contrl['cat_features'] = params.pop('cat_features', None)

    def _new_estimator():
        if objective_type == 'regressor':
            return CatBoostRegressor(**params)
        return CatBoostClassifier(**params)

    def _fit(estimator):
        return estimator.fit(X_train, labels_train,
                             eval_set=[(X_valid, labels_valid)],
                             verbose=contrl['verbose'],
                             early_stopping_rounds=contrl['early_stopping_rounds'],
                             cat_features=contrl['cat_features'])

    if not self.config.retrain:
        # Prefer continuing from a previously saved model.
        model_load = self.load_model()
        if not model_load:
            print('不存在模型:{},从头训练'.format(self.modelName))
            clf = _new_estimator()
            _fit(clf)
        else:
            clf = _fit(model_load)
    else:
        clf = _new_estimator()
        _fit(clf)

    # NOTE(review): CatBoost's ntree_end is an exclusive bound, so predicting with
    # ntree_end=best_iteration_ may exclude the best tree — confirm intent.
    val_pred = clf.predict(X_valid, ntree_end=clf.best_iteration_)
    test_pred = clf.predict(X_test, ntree_end=clf.best_iteration_)
    scorer = defindMetrics.MyMetrics(self.config.metrics_name)
    val_score = scorer.metricsFunc(val_pred, labels_valid)
    self.save_model(clf, self.config.saveModel)
    return val_pred, test_pred, val_score
def LGB_train(self, X_train, X_valid, labels_train, labels_valid, X_test, lgb_param_all):
    """Train (or reuse) a LightGBM model with early stopping and score it on the validation set.

    :param lgb_param_all: constructor params plus 'objective_type', and optionally
        fit-time knobs 'early_stopping_rounds', 'categorical_feature'
    :return: (validation predictions, test predictions, validation score)
    """
    contrl = {'early_stopping_rounds': 100, 'categorical_feature': 'auto'}
    params = lgb_param_all.copy()
    objective_type = params.pop('objective_type')
    # Move fit-time control knobs out of the constructor parameters.
    for key in ('early_stopping_rounds', 'categorical_feature'):
        if key in params:
            contrl[key] = params.pop(key)

    def _new_estimator():
        if objective_type == 'regressor':
            return LGBMRegressor(**params)
        return LGBMClassifier(**params)

    def _fit(estimator):
        return estimator.fit(X_train, labels_train,
                             eval_set=[(X_valid, labels_valid)],
                             eval_metric='rmse',
                             early_stopping_rounds=contrl['early_stopping_rounds'],
                             categorical_feature=contrl['categorical_feature'])

    if not self.config.retrain:
        # Prefer continuing from a previously saved model.
        model_load = self.load_model()
        if not model_load:
            print('不存在模型:{},从头训练'.format(self.modelName))
            clf = _new_estimator()
            _fit(clf)
        else:
            clf = _fit(model_load)
    else:
        clf = _new_estimator()
        _fit(clf)

    # Predict with the early-stopped best iteration.
    val_pred = clf.predict(X_valid.values, num_iteration=clf.best_iteration_)
    test_pred = clf.predict(X_test.values, num_iteration=clf.best_iteration_)
    scorer = defindMetrics.MyMetrics(self.config.metrics_name)
    val_score = scorer.metricsFunc(val_pred, labels_valid)
    self.save_model(clf, self.config.saveModel)
    return val_pred, test_pred, val_score
def XGB_train(self, X_train, X_valid, labels_train, labels_valid, X_test, xgb_params_all):
    """Train (or reuse) an XGBoost model with early stopping and score it on the validation set.

    :param xgb_params_all: constructor params plus 'objective_type', and optionally
        the fit-time knob 'early_stopping_rounds'
    :return: (validation predictions, test predictions, validation score)
    """
    xgb_param_contrl = {'early_stopping_rounds': 100}
    xgb_params = xgb_params_all.copy()
    objective_type = xgb_params.pop('objective_type')
    # Move fit-time control knobs out of the constructor parameters.
    for k in list(xgb_param_contrl):
        if k in xgb_params:
            xgb_param_contrl[k] = xgb_params.pop(k)

    def _new_estimator():
        # Estimator family is chosen by the popped objective type.
        if objective_type == 'regressor':
            return XGBRegressor(**xgb_params)
        return XGBClassifier(**xgb_params)

    def _fit(estimator):
        return estimator.fit(X_train, labels_train,
                             eval_set=[(X_valid, labels_valid)],
                             eval_metric='rmse',
                             early_stopping_rounds=xgb_param_contrl['early_stopping_rounds'])

    if not self.config.retrain:
        # Prefer continuing from a previously saved model.
        model_load = self.load_model()
        if not model_load:
            print('不存在模型:{},从头训练'.format(self.modelName))
            clf = _new_estimator()
            _fit(clf)
        else:
            clf = _fit(model_load)
    else:
        clf = _new_estimator()
        _fit(clf)

    # BUG FIX: best_iteration is a 0-based round index while ntree_limit is a tree
    # COUNT, so ntree_limit=best_iteration silently dropped the best round.
    # best_ntree_limit (== best_iteration + 1, scaled by num_parallel_tree) keeps it.
    val_xgb_pre = clf.predict(X_valid, ntree_limit=clf.best_ntree_limit)
    test_xgb_pre = clf.predict(X_test, ntree_limit=clf.best_ntree_limit)
    metrics_name = self.config.metrics_name
    myMetrics = defindMetrics.MyMetrics(metrics_name)
    score_xgb = myMetrics.metricsFunc(val_xgb_pre, labels_valid)
    self.save_model(clf, self.config.saveModel)
    return val_xgb_pre, test_xgb_pre, score_xgb
def predict_by_models(X_train, X_test, config):
    """Predict with already-trained models (a prediction-only version of run_train.py).

    Takes the labelled train set and the unlabelled test set, and replays the full
    runs x folds x bagging_size ensemble: for each bagging member it LOADS the saved
    model (joblib / pickle / keras weights) instead of training. Per fold, bagging
    predictions are averaged; per run, validation folds are concatenated and test
    predictions are averaged over folds; finally both are averaged over runs.

    :param X_train: training set including the label column config.data_label
    :param X_test: test set (no label)
    :param config: project configuration object
    :return: (train predictions DataFrame, test predictions DataFrame, final score),
        or None when a required model file is missing
    """
    skf = getKfoldIndex.get_kfold_index(X_train, config)
    target_col = config.data_label
    y_train = X_train.loc[:, target_col]
    X_train = X_train.drop([target_col], axis=1)
    metrics_name = config.metrics_name
    myMetrics = defindMetrics.MyMetrics(metrics_name)

    print("同志们让我们撸起袖子干起来!!!")
    Loss_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
    pre_sumRuns_test = np.zeros(len(X_test))
    pre_sumRuns_train = np.zeros(len(X_train))

    for run in range(config.n_runs):
        info_0 = "<=============================================================================>"
        print(info_0)
        predictions_run = np.zeros(len(X_test))
        val_oneRun = pd.DataFrame()
        for fold_, (trn_idx, val_idx) in enumerate(skf[run]):
            info_1 = "*******************************************************************************"
            info_2 = "************************* Model Run: %d, Fold: %d start *************************" % (run+1, fold_+1)
            info_3 = "*******************************************************************************\n"
            print(info_1)
            print(info_2)
            print(info_3)
            X_valid_kf = X_train.iloc[val_idx, :]
            labels_valid_kf = y_train.iloc[val_idx]
            X_train_kf = X_train.iloc[trn_idx, :]
            labels_train_kf = y_train.iloc[trn_idx]
            # Same seed formula as training, so the bagging samples are reproducible.
            rng = np.random.RandomState(2015 + 1000 * run + 10 * fold_)
            numValid = X_valid_kf.shape[0]
            numTrain = X_train_kf.shape[0]
            numTest = X_test.shape[0]
            preds_bagging_val = np.zeros((numValid, config.bagging_size), dtype=float)
            preds_bagging_test = np.zeros((numTest, config.bagging_size), dtype=float)
            # Bagging loop: load each saved member model and predict.
            for n in range(config.bagging_size):
                info_4 = '---------------------------------->bagging: {}'.format(n)
                print(info_4)
                # Re-draw the bagging sample (unused for prediction, but kept so the
                # rng state matches training exactly).
                if config.bootstrap_replacement:
                    # sample with replacement
                    sampleSize = int(numTrain*config.bootstrap_ratio)
                    index_base = rng.randint(numTrain, size=sampleSize)
                    index_meta = [i for i in range(numTrain) if i not in index_base]
                else:
                    # uniform threshold sampling
                    randnum = rng.uniform(size=numTrain)
                    index_base = [i for i in range(numTrain) if randnum[i] < config.bootstrap_ratio]
                    index_meta = [i for i in range(numTrain) if randnum[i] >= config.bootstrap_ratio]
                # BUG FIX: train_model saves models under 1-based names
                # ('run_{run+1}_fold_{fold+1}_bag_{n+1}'); the old 0-based name here
                # meant the saved models were never found.
                model_name = 'run_{}_fold_{}_bag_{}'.format(run+1, fold_+1, n+1)
                joblib_models_path = config.path_data.saveModelPath + '/joblib_models/%s/' % config.modelType \
                                     + model_name + '.pkl'
                pickle_models_path = config.path_data.saveModelPath + '/pickle_models/%s/' % config.modelType \
                                     + model_name + '.txt'
                keras_models_path = config.path_data.saveModelPath + '/keras_models/%s/' % config.modelType \
                                    + model_name + ".hdf5"
                if os.path.exists(joblib_models_path):
                    info_5 = '调用模型预测:{}'.format(model_name + '.pkl')
                    print(info_5)
                    model_load = joblib.load(joblib_models_path)
                elif os.path.exists(pickle_models_path):
                    info_6 = '调用模型预测:{}'.format(model_name + '.txt')
                    print(info_6)
                    with open(pickle_models_path, 'rb') as f:
                        model_load = pickle.load(f)
                elif os.path.exists(keras_models_path):
                    info_7 = '调用模型预测:{}'.format(model_name + '.hdf5')
                    print(info_7)
                    # Keras models store weights only: rebuild the structure first.
                    kerasModel = loadKerasModel(model_name, config)
                    model_load = kerasModel.get_modelStructure(X_train_kf.shape[1])
                    model_load.load_weights(keras_models_path)
                else:
                    print('模型库中不存在调用模型,请先建模!!!')
                    return None
                #############################################################################################
                pred_valid = model_load.predict(X_valid_kf.values)
                preds_bagging_val[:, n] = pred_valid.reshape(-1)
                # Running mean over the bagging members seen so far.
                pred_meanBagging_val = np.mean(preds_bagging_val[:, :(n + 1)], axis=1)
                score_meanBagging_val = myMetrics.metricsFunc(pred_meanBagging_val, labels_valid_kf)
                pred_test = model_load.predict(X_test.values)
                preds_bagging_test[:, n] = pred_test.reshape(-1)
                pred_meanBagging_test = np.mean(preds_bagging_test[:, :(n + 1)], axis=1)
                if (n + 1) != config.bagging_size:
                    info_10 = " - - - - - - - - - - - - - - - - ->[{}-{}-{}] score:{} shape:{} x {}".format(
                        run+1, fold_+1, n + 1, np.round(score_meanBagging_val, 6),
                        X_train_kf.shape[0], X_train_kf.shape[1])
                    print(info_10)
                else:
                    info_11 = "- - - - - - - - - - - - - - - - ->[{}-{}-{}] score:{} shape:{} x {}\n\n\n".format(
                        run+1, fold_+1, n + 1, np.round(score_meanBagging_val, 6),
                        X_train_kf.shape[0], X_train_kf.shape[1])
                    print(info_11)
                if n == (config.bagging_size-1):
                    # Fold-level loss of the fully-bagged prediction.
                    Loss_cv[run, fold_] = score_meanBagging_val
                    # Collect this fold's validation predictions.
                    val_oneFold_tmp = pd.DataFrame(data=pred_meanBagging_val, index=val_idx)
                    val_oneRun = pd.concat([val_oneRun, val_oneFold_tmp], axis=0)
                    predictions_run += pred_meanBagging_test / config.n_folds
        # Validation result of one full run (all folds concatenated).
        val_oneRun.sort_index(axis=0, ascending=True, inplace=True)
        val_oneRun.index = y_train.index
        this_run_score = myMetrics.metricsFunc(val_oneRun[0], y_train)
        predict_run_dir = config.path_data.predictPath + '/process/run_%d/%s' % (run+1, config.modelType)
        if not os.path.exists(predict_run_dir):
            os.makedirs(predict_run_dir)
        train_run_file = predict_run_dir + "/{}_train_{}.csv".format(config.mark, int(this_run_score))
        val_oneRun_df = pd.DataFrame(val_oneRun, index=y_train.index)
        val_oneRun_df.to_csv(train_run_file, index=True)
        test_run_file = predict_run_dir + "/{}_test_{}.csv".format(config.mark, int(this_run_score))
        test_oneRun_df = pd.DataFrame(predictions_run, index=X_test.index)
        test_oneRun_df.to_csv(test_run_file, index=True)
        info_12 = "******************** Model Run: {}, CV val score: {:<8.5f} ********************".format(
            run+1, this_run_score)
        print(info_12)
        info_13 = "<=============================================================================>\n\n\n\n"
        print(info_13)
        pre_sumRuns_test = pre_sumRuns_test + predictions_run
        pre_sumRuns_train = pre_sumRuns_train + val_oneRun.iloc[:, 0].values

    # Average over runs.
    pre_sumRuns_test = pre_sumRuns_test/config.n_runs
    pre_sumRuns_train = pre_sumRuns_train/config.n_runs
    pre_sumRuns_train = pd.DataFrame(pre_sumRuns_train, index=X_train.index)
    score_final = int(myMetrics.metricsFunc(pre_sumRuns_train, y_train))
    info_14 = '-------------------------------------------------------------------------------\n' + \
              '-------------------------------------------------------------------------------\n' + \
              '最终得分: %d\n\n\n\n\n\n' % score_final
    print(info_14)
    predict_final_dir = config.path_data.predictPath + '/final/%s' % config.modelType
    if not os.path.exists(predict_final_dir):
        os.makedirs(predict_final_dir)
    train_file = predict_final_dir + "/{}_train_{}.csv".format(config.mark, score_final)
    pre_sumRuns_train.to_csv(train_file, index=True)
    test_file = predict_final_dir + "/{}_test_{}.csv".format(config.mark, score_final)
    pre_sumRuns_test = pd.DataFrame(pre_sumRuns_test, index=X_test.index)
    pre_sumRuns_test.to_csv(test_file, index=True)
    return pre_sumRuns_train, pre_sumRuns_test, score_final
def train_model(X_train, X_test, param, config, use_trainedModel=False):
    """Run the full ensemble training/prediction loop.

    Takes the labelled train set and unlabelled test set, loops config.n_runs times
    (different random seed each run), with config.n_folds-fold CV per run and
    config.bagging_size bagging members per fold (each member trained on a
    bootstrap_ratio sample of the fold's train split). Bagging predictions are
    averaged; per run, validation folds are concatenated and test predictions
    averaged over folds; finally both are averaged over runs.

    :param X_train: training set including the label column config.data_label
    :param X_test: test set (no label)
    :param param: model hyper-parameters (passed through to model.train for tuning)
    :param config: project configuration object
    :param use_trainedModel: False (default) = train; True = predict with saved
        models, retraining a member on the fly only when its file is missing
    :return: (train predictions DataFrame, test predictions DataFrame, final score),
        or None when config.modelType is unknown
    """
    mark = config.mark
    # Pre-computed fold indices: runs x folds.
    skf = getKfoldIndex.get_kfold_index(X_train, config)
    target_col = config.data_label
    y_train = X_train.loc[:, target_col]
    X_train = X_train.drop([target_col], axis=1)
    metrics_name = config.metrics_name
    myMetrics = defindMetrics.MyMetrics(metrics_name)

    log_path = config.path_data.logPath + '/%s' % config.modelType
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    log_name = '/%s.log' % config.project_name
    log_file = log_path + log_name
    Mylog = DefindLog(log_file).get_logger()

    # Build a centred '#' banner for the log header (width depends on mark parity).
    if len(mark) % 2 == 0:
        long = 70
    else:
        long = 71
    start_time = time.strftime("%Y_%m_%d %H:%M:%S")
    mark_withTime = mark + ' ' + start_time
    null = 33  # left padding (columns) used by the log banner lines
    single = (long-len(mark_withTime)-2)//2
    u = '#'*long + '\n'
    m = ' '*null + '#'*single + ' ' + mark_withTime + ' ' + '#'*single + '\n'
    d = ' '*null + '#'*long + '\n'
    train_star = u+m+d
    Mylog.info(train_star)
    if config.mark_text != '':
        cu = '+'*35 + '\n'
        cd = ' '*null + '+'*35 + '\n'
        content = cu + ' '*null + config.mark_text + '\n' + cd
        Mylog.info(content)

    print("同志们让我们撸起袖子干起来!!!")
    Loss_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
    pre_sumRuns_test = np.zeros(len(X_test))
    pre_sumRuns_train = np.zeros(len(X_train))

    # BUG FIX: time.clock() was removed in Python 3.8. The intent (per the original
    # comments) is CPU time in seconds, so use time.process_time().
    start_CPU = time.process_time()
    # Wall-clock time (= CPU + IO + sleep/wait).
    start_RUN = datetime.datetime.now()

    for run in range(config.n_runs):
        info_0 = "<=============================================================================>"
        Mylog.info(info_0)
        start_CPU_oneRun = time.process_time()
        start_RUN_oneRun = datetime.datetime.now()
        predictions_run = np.zeros(len(X_test))
        val_oneRun = pd.DataFrame()
        for fold_, (trn_idx, val_idx) in enumerate(skf[run]):
            info_1 = "*******************************************************************************"
            info_2 = "************************* Model Run: %d, Fold: %d start *************************" % (run+1, fold_+1)
            info_3 = "*******************************************************************************\n"
            Mylog.info(info_1)
            Mylog.info(info_2)
            Mylog.info(info_3)
            start_CPU_oneFold = time.process_time()
            start_RUN_oneFold = datetime.datetime.now()
            X_valid_kf = X_train.iloc[val_idx, :]
            labels_valid_kf = y_train.iloc[val_idx]
            X_train_kf = X_train.iloc[trn_idx, :]
            labels_train_kf = y_train.iloc[trn_idx]
            # Deterministic per-(run, fold) seed so bagging samples are reproducible.
            rng = np.random.RandomState(2015 + 1000 * run + 10 * fold_)
            numValid = X_valid_kf.shape[0]
            numTrain = X_train_kf.shape[0]
            numTest = X_test.shape[0]
            preds_bagging_val = np.zeros((numValid, config.bagging_size), dtype=float)
            preds_bagging_test = np.zeros((numTest, config.bagging_size), dtype=float)
            # Bagging loop: each member trains on a subsample of the fold's train split.
            for n in range(config.bagging_size):
                info_4 = '---------------------------------->bagging: {}'.format(n)
                Mylog.info(info_4)
                start_CPU_oneBagging = time.process_time()
                start_RUN_oneBagging = datetime.datetime.now()
                if config.bootstrap_replacement:
                    # sample with replacement
                    sampleSize = int(numTrain*config.bootstrap_ratio)
                    index_base = rng.randint(numTrain, size=sampleSize)
                    index_meta = [i for i in range(numTrain) if i not in index_base]
                else:
                    # uniform threshold sampling
                    randnum = rng.uniform(size=numTrain)
                    index_base = [i for i in range(numTrain) if randnum[i] < config.bootstrap_ratio]
                    index_meta = [i for i in range(numTrain) if randnum[i] >= config.bootstrap_ratio]
                model_name = 'run_{}_fold_{}_bag_{}'.format(run+1, fold_+1, n+1)
                if use_trainedModel:
                    joblib_models_path = config.path_data.saveModelPath + '/joblib_models/%s/' % config.modelType \
                                         + model_name + '.pkl'
                    pickle_models_path = config.path_data.saveModelPath + '/pickle_models/%s/' % config.modelType \
                                         + model_name + '.txt'
                    keras_models_path = config.path_data.saveModelPath + '/keras_models/%s/' % config.modelType \
                                        + model_name + ".hdf5"
                    if os.path.exists(joblib_models_path):
                        info_5 = '调用模型预测:{}'.format(model_name + '.pkl')
                        Mylog.info(info_5)
                        model_load = joblib.load(joblib_models_path)
                        pred_valid = model_load.predict(X_valid_kf.values)
                        pred_test = model_load.predict(X_test.values)
                    elif os.path.exists(pickle_models_path):
                        info_6 = '调用模型预测:{}'.format(model_name + '.txt')
                        Mylog.info(info_6)
                        with open(pickle_models_path, 'rb') as f:
                            model_load = pickle.load(f)
                        pred_valid = model_load.predict(X_valid_kf.values)
                        pred_test = model_load.predict(X_test.values)
                    elif os.path.exists(keras_models_path):
                        info_7 = '调用模型预测:{}'.format(model_name + '.hdf5')
                        Mylog.info(info_7)
                        # Keras models store weights only: rebuild the structure first.
                        kerasModel = loadKerasModel(model_name, config)
                        model_load = kerasModel.get_modelStructure(X_train_kf.shape[1])
                        model_load.load_weights(keras_models_path)
                        pred_valid = model_load.predict(X_valid_kf.values)
                        pred_test = model_load.predict(X_test.values)
                    else:
                        info_8 = '不存在模型:{},临时训练'.format(model_name)
                        Mylog.info(info_8)
                        #############################################################################################
                        if config.modelType in config.treeModelLib:
                            model = loadTreeModel(model_name, config)
                        elif config.modelType in config.kerasModelLib:
                            model = loadKerasModel(model_name, config)
                        else:
                            print('请在training下的config.py中确定合适模型类型!!!')
                            return None
                        pred_valid, pred_test, score_bag_one = model.train(X_train_kf.iloc[index_base, :],
                                                                           X_valid_kf,
                                                                           labels_train_kf.iloc[index_base],
                                                                           labels_valid_kf,
                                                                           X_test, param)
                        #############################################################################################
                else:
                    info_9 = '开始训练模型:{}'.format(model_name)
                    Mylog.info(info_9)
                    #############################################################################################
                    if config.modelType in config.treeModelLib:
                        model = loadTreeModel(model_name, config)
                    elif config.modelType in config.kerasModelLib:
                        model = loadKerasModel(model_name, config)
                    else:
                        print('请在training下的config.py中确定合适模型类型!!!')
                        return None
                    pred_valid, pred_test, score_bag_one = model.train(X_train_kf.iloc[index_base, :],
                                                                       X_valid_kf,
                                                                       labels_train_kf.iloc[index_base],
                                                                       labels_valid_kf,
                                                                       X_test, param)
                    #############################################################################################
                if not use_trainedModel:
                    end_CPU_oneBagging = time.process_time()
                    end_RUN_oneBagging = datetime.datetime.now()
                    oneBagging_spend_CPU = time_tran.time_format(int(end_CPU_oneBagging - start_CPU_oneBagging))
                    oneBagging_spend_RUN = time_tran.time_format(int((end_RUN_oneBagging - start_RUN_oneBagging).total_seconds()))
                    sumBagging_spend_CPU = time_tran.time_format(int(end_CPU_oneBagging - start_CPU))
                    sumBagging_spend_RUN = time_tran.time_format(int((end_RUN_oneBagging - start_RUN).total_seconds()))
                    Mylog.info('此轮bagging,CPU耗时:{}'.format(oneBagging_spend_CPU))
                    Mylog.info('此轮bagging,RUN耗时:{}'.format(oneBagging_spend_RUN))
                    Mylog.info('从开始运行到现在,CPU耗时:{}'.format(sumBagging_spend_CPU))
                    Mylog.info('从开始运行到现在,RUN耗时:{}'.format(sumBagging_spend_RUN))
                ## this bagging iteration
                preds_bagging_val[:, n] = pred_valid.reshape(-1)
                # Running mean over the bagging members seen so far.
                pred_meanBagging_val = np.mean(preds_bagging_val[:, :(n + 1)], axis=1)
                score_meanBagging_val = myMetrics.metricsFunc(pred_meanBagging_val, labels_valid_kf)
                preds_bagging_test[:, n] = pred_test.reshape(-1)
                pred_meanBagging_test = np.mean(preds_bagging_test[:, :(n + 1)], axis=1)
                if (n + 1) != config.bagging_size:
                    info_10 = " - - - - - - - - - - - - - - - - ->[{}-{}-{}] score:{} shape:{} x {}".format(
                        run+1, fold_+1, n + 1, np.round(score_meanBagging_val, 6),
                        X_train_kf.shape[0], X_train_kf.shape[1])
                    Mylog.info(info_10)
                else:
                    info_11 = "- - - - - - - - - - - - - - - - ->[{}-{}-{}] score:{} shape:{} x {}\n\n\n".format(
                        run+1, fold_+1, n + 1, np.round(score_meanBagging_val, 6),
                        X_train_kf.shape[0], X_train_kf.shape[1])
                    Mylog.info(info_11)
                if n == (config.bagging_size-1):
                    # Fold-level loss of the fully-bagged prediction.
                    Loss_cv[run, fold_] = score_meanBagging_val
                    # Collect this fold's validation predictions.
                    val_oneFold_tmp = pd.DataFrame(data=pred_meanBagging_val, index=val_idx)
                    val_oneRun = pd.concat([val_oneRun, val_oneFold_tmp], axis=0)
                    predictions_run += pred_meanBagging_test / config.n_folds
            if not use_trainedModel:
                end_CPU_oneFold = time.process_time()
                end_RUN_oneFold = datetime.datetime.now()
                oneFold_spend_CPU = time_tran.time_format(int(end_CPU_oneFold - start_CPU_oneFold))
                oneFold_spend_RUN = time_tran.time_format(int((end_RUN_oneFold - start_RUN_oneFold).total_seconds()))
                sumFold_spend_CPU = time_tran.time_format(int(end_CPU_oneFold - start_CPU))
                sumFold_spend_RUN = time_tran.time_format(int((end_RUN_oneFold - start_RUN).total_seconds()))
                Mylog.info('此轮fold,CPU耗时:{}'.format(oneFold_spend_CPU))
                Mylog.info('此轮fold,RUN耗时:{}'.format(oneFold_spend_RUN))
                Mylog.info('从开始运行到现在,CPU耗时:{}'.format(sumFold_spend_CPU))
                Mylog.info('从开始运行到现在,RUN耗时:{}'.format(sumFold_spend_RUN))
        # Validation result of one full run (all folds concatenated).
        val_oneRun.sort_index(axis=0, ascending=True, inplace=True)
        val_oneRun.index = y_train.index
        this_run_score = myMetrics.metricsFunc(val_oneRun[0], y_train)
        predict_run_dir = config.path_data.predictPath + '/process/run_%d/%s' % (run+1, config.modelType)
        if not os.path.exists(predict_run_dir):
            os.makedirs(predict_run_dir)
        train_run_file = predict_run_dir + "/{}_train_{}.csv".format(config.mark, int(this_run_score))
        val_oneRun_df = pd.DataFrame(val_oneRun, index=y_train.index)
        val_oneRun_df.to_csv(train_run_file, index=True)
        test_run_file = predict_run_dir + "/{}_test_{}.csv".format(config.mark, int(this_run_score))
        test_oneRun_df = pd.DataFrame(predictions_run, index=X_test.index)
        test_oneRun_df.to_csv(test_run_file, index=True)
        info_12 = "******************** Model Run: {}, CV val score: {:<8.5f} ********************".format(
            run+1, this_run_score)
        Mylog.info(info_12)
        info_13 = "<=============================================================================>\n\n\n\n"
        Mylog.info(info_13)
        pre_sumRuns_test = pre_sumRuns_test + predictions_run
        pre_sumRuns_train = pre_sumRuns_train + val_oneRun.iloc[:, 0].values
        if not use_trainedModel:
            end_CPU_oneRun = time.process_time()
            end_RUN_oneRun = datetime.datetime.now()
            oneRun_spend_CPU = time_tran.time_format(int(end_CPU_oneRun - start_CPU_oneRun))
            oneRun_spend_RUN = time_tran.time_format(int((end_RUN_oneRun - start_RUN_oneRun).total_seconds()))
            sumRun_spend_CPU = time_tran.time_format(int(end_CPU_oneRun - start_CPU))
            sumRun_spend_RUN = time_tran.time_format(int((end_RUN_oneRun - start_RUN).total_seconds()))
            Mylog.info('此轮run,CPU耗时:{}'.format(oneRun_spend_CPU))
            Mylog.info('此轮run,RUN耗时:{}'.format(oneRun_spend_RUN))
            Mylog.info('从开始运行到现在,CPU耗时:{}'.format(sumRun_spend_CPU))
            Mylog.info('从开始运行到现在,RUN耗时:{}'.format(sumRun_spend_RUN))

    # Average over runs.
    pre_sumRuns_test = pre_sumRuns_test/config.n_runs
    pre_sumRuns_train = pre_sumRuns_train/config.n_runs
    pre_sumRuns_train = pd.DataFrame(pre_sumRuns_train, index=X_train.index)
    score_final = myMetrics.metricsFunc(pre_sumRuns_train, y_train)
    info_14 = '-------------------------------------------------------------------------------\n' + \
              ' '*null + '-------------------------------------------------------------------------------\n' + \
              ' '*null + '最终得分: %f\n\n\n\n\n\n' % score_final
    Mylog.info(info_14)
    predict_final_dir = config.path_data.predictPath + '/final/%s' % config.modelType
    if not os.path.exists(predict_final_dir):
        os.makedirs(predict_final_dir)
    train_file = predict_final_dir + "/{}_train_{:<8.4f}.csv".format(config.mark, score_final)
    pre_sumRuns_train.to_csv(train_file, index=True)
    train_file_cp = config.path_data.dataPrePath + "/{}_train_{:<8.4f}.csv".format(config.mark, score_final)
    pre_sumRuns_train.to_csv(train_file_cp, index=True)
    test_file = predict_final_dir + "/{}_test_{:<8.4f}.csv".format(config.mark, score_final)
    pre_sumRuns_test = pd.DataFrame(pre_sumRuns_test, index=X_test.index)
    pre_sumRuns_test.to_csv(test_file, index=True)
    test_file_cp = config.path_data.dataPrePath + "/{}_test_{:<8.4f}.csv".format(config.mark, score_final)
    pre_sumRuns_test.to_csv(test_file_cp, index=True)
    return pre_sumRuns_train, pre_sumRuns_test, score_final
def train(self, X_train, X_val, y_train, y_val, X_test, params, save_type='J'):
    """Build, train and evaluate a Keras model chosen by self.config.modelType.

    :param X_train/y_train: training features / labels (DataFrame/Series or array)
    :param X_val/y_val: validation features / labels
    :param X_test: test features
    :param params: optional keys: epochs, batch_size, show_fig, loss, optimizer, metrics
    :param save_type: unused; kept for interface compatibility
    :return: (validation predictions, test predictions, validation score),
        or None when the model type is unknown
    """
    epochs = params.get('epochs', 20)
    batch_size = params.get('batch_size', X_train.shape[0] // 10)
    show_fig = params.get('show_fig', False)
    # NOTE(review): 'loss' is not a standard Keras loss identifier — this default
    # only works if a real loss is always supplied in params; confirm.
    loss = params.get('loss', 'loss')
    optimizer = params.get('optimizer', 'adam')
    if 'metrics' in params:
        metrics = params['metrics']
        # Keras expects a list of metric names.
        if isinstance(metrics, str):
            metrics = [metrics]
    else:
        metrics = 'mse'

    def scheduler(epoch):
        # Every 20 epochs (after the first), multiply the learning rate by 0.6.
        if epoch % 20 == 0 and epoch != 0:
            lr = K.get_value(model.optimizer.lr)
            K.set_value(model.optimizer.lr, lr * 0.6)
            print("lr changed to {}".format(lr * 0.6))
        return K.get_value(model.optimizer.lr)

    reduce_lr = LearningRateScheduler(scheduler)

    # Checkpoint callback: keep only the best weights (by val_loss).
    load_model_dir = self.config.path_data.saveModelPath + '/keras_models/%s/' % self.config.modelType
    if not os.path.exists(load_model_dir):
        os.makedirs(load_model_dir)
    load_model_file = load_model_dir + self.modelName + ".hdf5"
    save_model_path = load_model_file
    # BUG FIX: the original passed skipping=1, which is not a ModelCheckpoint
    # argument and raised TypeError.
    checkpoint = keras.callbacks.ModelCheckpoint(save_model_path, monitor='val_loss', verbose=0,
                                                 save_best_only=True, mode='min',
                                                 save_weights_only=True)
    # Early stopping is defined but deliberately not in callback_lists (toggle below).
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.000001,
                                                   patience=200, verbose=0, mode='auto',
                                                   baseline=None, restore_best_weights=False)
    # callback_lists = [checkpoint, reduce_lr, early_stopping]
    callback_lists = [checkpoint, reduce_lr]

    # DataFrame -> ndarray for Keras.
    train_data_x = np.array(X_train)
    train_data_y = np.array(y_train)
    val_data_x = np.array(X_val)
    val_data_y = np.array(y_val)

    #########################################################
    h = X_train.shape[1]
    # Instantiate the configured network structure.
    if self.config.modelType == 'Resreg':
        model = FC_Resreg().structure(input_shape=(h,))
    elif self.config.modelType == 'fc_6Dmodel':
        model = FC_6Dmodel().structure(h)
    elif self.config.modelType == 'fc_8Dmodel':
        model = FC_8Dmodel().structure(h)
    else:
        print('不存在这类模型:{}'.format(self.config.modelType))
        return None
    #########################################################

    if not self.config.retrain:
        # Warm-start from previously saved weights when available.
        load_model_path = save_model_path
        if os.path.exists(load_model_path):
            try:
                model.load_weights(load_model_path)
                print("调用已有模型进行增量训练")
            except Exception as e:
                print(e)
                print('模型可能参数不匹配')
        else:
            print("不存在已有模型,重新训练")

    print(loss, optimizer, metrics)
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    model.fit(train_data_x, train_data_y, epochs=epochs, batch_size=batch_size,
              validation_data=(val_data_x, val_data_y), callbacks=callback_lists)

    # Save loss/metric curves of the training history.
    lossFig_dir = self.config.path_data.saveFigsPath + '/%s' % self.config.modelType
    if not os.path.exists(lossFig_dir):
        os.makedirs(lossFig_dir)
    lossFig_path = lossFig_dir + '/%s' % self.modelName
    metrice_loss_figs(model, lossFig_path, show_fig=show_fig)

    # PERF FIX: the original called model.predict twice per dataset just to read
    # its shape; predict once and reshape.
    val_pred_raw = model.predict(X_val)
    val_data_y_pre = val_pred_raw.reshape((val_pred_raw.shape[0],))
    test_pred_raw = model.predict(X_test)
    test_data_y_pre = test_pred_raw.reshape((test_pred_raw.shape[0],))

    metrics_name = self.config.metrics_name
    myMetrics = defindMetrics.MyMetrics(metrics_name)
    val_score = myMetrics.metricsFunc(val_data_y_pre, y_val)
    return val_data_y_pre, test_data_y_pre, val_score