def optimize_RFC(data, targets):
    """Apply Bayesian Optimization to RandomForestClassifier hyper-parameters."""

    def RFC_CV(n_estimators, min_samples_split, max_features, data, targets):
        """RandomForestClassifier with cross validation."""
        estimator = RandomForestClassifier(n_estimators=n_estimators,
                                           min_samples_split=min_samples_split,
                                           max_features=max_features)
        cval = cross_val_score(estimator, data, targets,
                               scoring='neg_log_loss', cv=3)
        return cval.mean()

    def RFC_crossval(n_estimators, min_samples_split, max_features):
        """Wrapper of RandomForestClassifier cross validation."""
        return RFC_CV(n_estimators=int(n_estimators),
                      min_samples_split=int(min_samples_split),
                      max_features=max(min(max_features, 0.999), 1e-3),
                      data=data,
                      targets=targets)

    optimizer = BayesianOptimization(f=RFC_crossval,
                                     pbounds={"n_estimators": (10, 25),
                                              "min_samples_split": (2, 25),
                                              "max_features": (0.1, 0.999)},
                                     verbose=2)
    optimizer.maximize(n_iter=10)
    return optimizer.max  # dictionary


print(Colours.green("--- Optimizing RandomForestClassifier ---"))
optimal_params = optimize_RFC(X_train, y_train)
print(optimal_params)
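# Hedged note (not part of the original snippet): the code above assumes the following
# imports are already in scope. The import path for Colours matches the one used
# elsewhere in this repository, and X_train / y_train are assumed to be prepared earlier.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours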
def test_forecast(self):
    # train the model
    print(Colours.yellow("--- CNN forecast ---"))
    source_data = unserialize_pickle(self.data_source_dump)
    stat_dict = unserialize_json(self.stat_file)
    data_flow = unserialize_numpy(self.flow_file)
    t_s_dict = unserialize_json(self.t_s_dict_file)
    model_data = DataModel(source_data, data_flow, t_s_dict, stat_dict)
    model_dict = model_data.data_source.data_config.model_dict
    # model_file = os.path.join(model_dict["dir"]["Out"], "model.yaml")
    # weight_file = os.path.join(model_dict["dir"]["Out"], "weights.h5")
    # bo_json_file = os.path.join(model_dict["dir"]["Out"], "bo_logs.json")
    # bo_json = unserialize_bo_json(bo_json_file)
    # n_input = int(bo_json["params"]["n_input"])
    n_input = 14
    try_cnn = TryCnn()
    data, targets = routing_cnn.to_supervised(
        model_data.load_data(model_dict), n_input)
    obs_value, pred_value = cnn_test(try_cnn, X=data, Y=targets,
                                     stat_dict=stat_dict)
    # obs_value, pred_value = cnn_test(data, targets, stat_dict, model_file=model_file, weight_file=weight_file)
    print("the observed values:", obs_value)
    print("the predicted values:", pred_value)
    serialize_numpy(obs_value, self.obs_file)
    serialize_numpy(pred_value, self.pred_file)
def test_colours():
    colour_wrappers = [
        (Colours.BLUE, Colours.blue),
        (Colours.BOLD, Colours.bold),
        (Colours.CYAN, Colours.cyan),
        (Colours.DARKCYAN, Colours.darkcyan),
        (Colours.GREEN, Colours.green),
        (Colours.PURPLE, Colours.purple),
        (Colours.RED, Colours.red),
        (Colours.UNDERLINE, Colours.underline),
        (Colours.YELLOW, Colours.yellow),
    ]
    for colour, wrapper in colour_wrappers:
        text1 = Colours._wrap_colour("test", colour)
        text2 = wrapper("test")
        assert text1.split("test") == [colour, Colours.END]
        assert text2.split("test") == [colour, Colours.END]
def bayesian_random_forest(frame):
    """ML model applying Random Forest Regression with Bayesian optimization
    to tune the model hyperparameters.

    input: Pandas dataframe to use in modelling
    output: (score, error, execution time)
    """
    # using https://github.com/fmfn/BayesianOptimization
    frame.iloc[:, :-1] = StandardScaler().fit_transform(frame.iloc[:, :-1])
    print("\n\n************************************************************")
    print("MODEL: Random forest regression with Bayesian optimization")
    y = frame.iloc[:, -1]  # target
    x = filterFeatures(frame)  # features
    print(Colours.green("--- Optimizing Random Forest ---"))
    t0 = time.time()
    optimize_rfr(x, y)
    execution = time.time() - t0
    print("Execution time: ", execution)
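# Hedged usage sketch (not from the original source): bayesian_random_forest expects a
# DataFrame whose last column is the target; filterFeatures and optimize_rfr are assumed
# to be defined elsewhere in this project, so the call below is illustrative only.
# import numpy as np
# import pandas as pd
# rng = np.random.default_rng(0)
# toy = pd.DataFrame(rng.normal(size=(100, 4)), columns=["f1", "f2", "f3", "target"])
# bayesian_random_forest(toy)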
def test_forecast_data_model(self):
    print(Colours.yellow("--- reading CNN ---"))
    config_file = definitions.CONFIG_FILE
    # read the model configuration file
    config_data = DataConfig(config_file)
    # prepare the data for the test period
    source_data = DataSource(config_data,
                             config_data.model_dict["data"]["tRangeTest"])
    # build the input data model object
    model_data = DataModel(source_data)
    # serialize and save the objects
    dir_temp = source_data.all_configs["temp_dir"]
    source_file = self.data_source_dump
    stat_file = self.stat_file
    flow_file = os.path.join(dir_temp, 'flow_test')
    t_s_dict_file = self.t_s_dict_file
    # Persist data_model piece by piece: serializing the whole object directly would be
    # slow, so the dicts are saved as JSON files and the array data as HDF5/NumPy files.
    serialize_pickle(source_data, source_file)
    serialize_json(model_data.stat_dict, stat_file)
    serialize_numpy(model_data.data_flow, flow_file)
    serialize_json(model_data.t_s_dict, t_s_dict_file)
"expGamma": (-4, -1) }, random_state=1234, verbose=2) optimizer.maximize(n_iter=10) print("Final result:", optimizer.max) optimizer_max = optimizer.max return optimizer_max # COMMAND ---------- # Optimizing Support Vector Machine # Run bayesian optimization function with first batch of iterations print(Colours.yellow("--- Optimizing SVM ---")) optimize_svc(data, targets) # COMMAND ---------- ###################### Bayesian Optimization ########### ###################### Random Forest Classification ########### def rfc_cv(n_estimators, min_samples_split, max_features, data, targets): """Random Forest cross validation. This function will instantiate a random forest classifier with parameters n_estimators, min_samples_split, and max_features. Combined with data and targets this will in turn be used to perform cross validation. The result of cross validation is returned. Our goal is to find combinations of n_estimators, min_samples_split, and max_features that minimzes the log loss.
    return json.dumps(result, ensure_ascii=False)


if __name__ == "__main__":
    # read the word-segmented data file
    data = pd.read_csv('seg_ratings_data.txt', sep='\t')
    # TfidfVectorizer combines CountVectorizer and TfidfTransformer; it outputs
    # the TF-IDF value of each word in each document.
    # min_df=5, max_features=10000
    tfidf_vec = TfidfVectorizer(max_features=10000)
    tfidf_matrix = tfidf_vec.fit_transform(data['comment'].astype('U'))
    # split the data set
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf_matrix, data['rating'], test_size=0.2, random_state=1)  # ,stratify = y
    # To tune the hyper-parameters, keep the two lines below enabled.
    print(Colours.yellow("--- Optimizing SVM ---"))
    params = optimize_svc(X_train, y_train)
    # The tuning yields the optimal hyper-parameter C.
    # Regularization parameter: the strength of the regularization is inversely proportional to C.
    svm = LinearSVC(C=params['expC'])
    # Uses the optimal parameter found by tuning; if no tuning is needed, comment out the
    # line above and use the fixed value below instead.
    # svm = LinearSVC(C=0.3830389007577846)
    # probability calibration
    svc = CalibratedClassifierCV(svm)
    svc.fit(X_train, y_train)
    # svc_y_pred is the predicted class; svc_y_prod is the predicted probability of each class
    svc_y_pred = svc.predict(X_test)
    # reference: https://blog.csdn.net/u011630575/article/details/79429757
    svc_y_prod = svc.predict_proba(X_test)[:, 1]
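    # Hedged follow-up sketch (not in the original source): one way to evaluate the
    # calibrated classifier above, assuming a binary rating label so that the
    # positive-class probabilities in svc_y_prod are meaningful for ROC-AUC.
    from sklearn.metrics import accuracy_score, roc_auc_score

    print("accuracy:", accuracy_score(y_test, svc_y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, svc_y_prod))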
        f=augmentation_wrapper,
        pbounds={
            "AddSentDiverse": (0, 1.0),
            "AddKSentDiverse": (0, 1.0),
            "AddAnswerPosition": (0, 1.0),
            "InvalidateAnswer": (0, 1.0),
            "PerturbAnswer": (0, 1.0),
            "AddSentDiverse_PerturbAnswer": (0, 1.0),
            "AddKSentDiverse_PerturbAnswer": (0, 1.0),
            "AddAnswerPosition_PerturbAnswer": (0, 1.0),
            "PerturbQuestion": (0, 1.0),
            "AddSentDiverse_PerturbQuestion": (0, 1.0),
            "AddAnswerPosition_PerturbQuestion": (0, 1.0)
        },
        random_state=1234,
        verbose=2)
    optimizer.maximize(
        init_points=20,
        n_iter=100
        # What follows are GP regressor parameters
        # alpha=1e-3,
        # n_restarts_optimizer=5
    )
    print("Final result:", optimizer.max)


if __name__ == "__main__":
    print(Colours.yellow("--- Optimizing Roberta ---"))
    optimize_roberta()
"""Wrapper of SVC cross validation. Notice how we transform between regular and log scale. While this is not technically necessary, it greatly improves the performance of the optimizer. """ return gms_function_value(capacity_1, capacity_2, capacity_3, capacity_4) optimizer = BayesianOptimization(f=gms_value, pbounds={ "capacity_1": (0.000001, 0.000001), "capacity_2": (0.000001, 0.000007), "capacity_3": (0.000001, 0.000007), "capacity_4": (0.000001, 0.000007) }, verbose=0) optimizer.maximize(n_iter=50) print("Final result:", optimizer.max) if __name__ == "__main__": print(Colours.yellow("--- Optimizing gms ---")) optimize_gms() print('111') df_target = pd.DataFrame() df_target['target'] = target_y df_target['action'] = target_x df_target.to_csv('process_value.csv', index=False)
#             n_estimators=int(n_estimators),
#             min_samples_split=int(min_samples_split),
#             max_features=max(min(max_features, 0.999), 1e-3),
#             data=data,
#             targets=targets,
#         )
#
#     optimizer = BayesianOptimization(
#         f=rfc_crossval,
#         pbounds={
#             "n_estimators": (10, 250),
#             "min_samples_split": (2, 25),
#             "max_features": (0.1, 0.999),
#         },
#         random_state=1234,
#         verbose=2
#     )
#     optimizer.maximize(n_iter=10)
#     print("Final result:", optimizer.max)


if __name__ == "__main__":
    # print(Colours.yellow("--- Optimizing SVM ---"))
    # optimize_svc(data, targets)

    # print(Colours.green("--- Optimizing Random Forest ---"))
    # optimize_rfc(data, targets)

    print(Colours.green("--- XGboost ---"))
    optimize_xgb(data, targets)
    # Final result: {'target': -0.2464449012248331, 'params': {'colsample_bytree': 0.3023814615005768, 'gamma': 0.9941908961396094, 'max_depth': 3.105523507289568}}
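    # Hedged sketch (not part of the original script): bayes_opt reports the best score in
    # optimizer.max["target"] and the best hyper-parameters in optimizer.max["params"] as
    # floats, so integer-valued parameters such as max_depth have to be cast back before
    # retraining a model. params_from_bo below is a hypothetical helper illustrating this.
    def params_from_bo(best):
        """Turn a bayes_opt result dict (like the one printed above) into usable XGBoost params."""
        params = dict(best["params"])
        params["max_depth"] = int(round(params["max_depth"]))
        return params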
import definitions
from bayes_opt.util import Colours
from data import *
from hydroDL import *

print('Starting ...')
configFile = definitions.CONFIG_FILE
# read the model configuration file
configData = DataConfig(configFile)
# prepare the training data
sourceData = DataSource(configData,
                        configData.model_dict["data"]["tRangeTrain"])
# build the input data model object
dataModel = DataModel(sourceData)
# train the model
print(Colours.yellow("--- Optimizing CNN ---"))
optimize_cnn(dataModel)
def test_shap(self):
    print(Colours.yellow("--- CNN shap ---"))
    source_data = unserialize_pickle(self.data_source_dump)
    stat_dict = unserialize_json(self.stat_file)
    data_flow = unserialize_numpy(self.flow_file)
    t_s_dict = unserialize_json(self.t_s_dict_file)
    model_data = DataModel(source_data, data_flow, t_s_dict, stat_dict)
    model_dict = model_data.data_source.data_config.model_dict
    n_input = 14
    try_cnn = TryCnn()
    data, targets = routing_cnn.to_supervised(
        model_data.load_data(model_dict), n_input)
    try_cnn.load_state_dict(
        torch.load(r"F:\科研类\codes\hydro-routing-cnn\checkpoint.pt"))
    # x1 = data.reshape(715, 1, -1)
    # x = x1.reshape(-1, 1, 1, x1.shape[2])
    x = data.reshape(-1, 1, data.shape[1], data.shape[2])
    x = torch.from_numpy(x).float()
    try_cnn.eval()
    # x_pred = try_cnn(x[301:306])
    # print(x[301:306])
    # print(x_pred)
    print("====== computing SHAP ========")
    # create an explainer from the model and a background data set
    background = x[np.random.choice(x.shape[0], 100, replace=False)]
    e = shap.DeepExplainer(try_cnn, background)
    # e = shap.DeepExplainer(try_cnn, x)
    shap_values = e.shap_values(x)
    shap_values_stations_days = np.abs(shap_values).sum(axis=0).reshape(
        14, data.shape[2])
    shap_days = shap_values_stations_days.sum(axis=1)
    shap_stations = shap_values_stations_days.sum(axis=0)
    # compute the base line
    # y_base = e.expected_value
    # print("value of y_base:", y_base)
    # print("sum of y_base + shap values:", y_base + shap_values.sum())
    shap_values_array = shap_values.reshape(-1, data.shape[1] * data.shape[2])
    shap_arrays_values = []
    for i in range(shap_values_array.shape[0]):
        new_array = np.zeros((shap_values_array.shape[0] - 1) * data.shape[2])
        if i == 0:
            ndarray = np.append(shap_values_array[i], new_array)
        elif i == shap_values_array.shape[0] - 1:
            ndarray = np.insert(shap_values_array[i], 0, new_array)
        else:
            ndarray = np.pad(
                shap_values_array[i],
                (i * data.shape[2],
                 (shap_values_array.shape[0] - 1 - i) * data.shape[2]),
                'constant')
        shap_arrays_values.append(ndarray)
    shap_arrays_values = np.array(shap_arrays_values)
    shap_arrays_values_abs = np.abs(shap_arrays_values).sum(axis=0).reshape(
        -1, data.shape[2])
    print(shap_arrays_values_abs)
    shap_values_days_stations = []
    for j in range(shap_arrays_values_abs.shape[0]):
        if j < 14:
            shap_values_day_state = shap_arrays_values_abs[j] / (j + 1)
        elif j >= shap_arrays_values_abs.shape[0] - 14:
            shap_values_day_state = shap_arrays_values_abs[j] / (
                shap_arrays_values_abs.shape[0] - j)
        else:
            shap_values_day_state = shap_arrays_values_abs[j] / 14
        shap_values_days_stations.append(shap_values_day_state)
    shap_values_days_stations = np.array(shap_values_days_stations)
    print(shap_values_days_stations)
    serialize_numpy(shap_values_days_stations,
                    self.shap_values_days_states_file)
    serialize_numpy(shap_days, self.shap_days_file)
    serialize_numpy(shap_stations, self.shap_stations_file)
    serialize_numpy(shap_values_stations_days,
                    self.shap_values_stations_days_file)
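    # Hedged visualization sketch (not in the original test): a quick way to inspect the
    # aggregated SHAP importances computed above. Variable names follow the test body;
    # matplotlib is an assumed, commonly available dependency.
    import matplotlib.pyplot as plt

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.bar(range(len(shap_days)), shap_days)
    ax1.set_xlabel("lag day")
    ax1.set_ylabel("summed |SHAP|")
    ax2.bar(range(len(shap_stations)), shap_stations)
    ax2.set_xlabel("station index")
    ax2.set_ylabel("summed |SHAP|")
    plt.tight_layout()
    plt.show()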