Example #1
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


def optimize_RFC(data, targets):
    """Apply Bayesian optimization to RandomForestClassifier hyper-parameters."""
    def RFC_CV(n_estimators, min_samples_split, max_features, data, targets):
        """RandomForestClassifier with cross validation."""
        estimator = RandomForestClassifier(n_estimators=n_estimators,
                                           min_samples_split=min_samples_split,
                                           max_features=max_features)
        cval = cross_val_score(estimator, data, targets, scoring='neg_log_loss', cv=3)
        return cval.mean()

    def RFC_crossval(n_estimators, min_samples_split, max_features):
        """Wrapper of RandomForestClassifier cross validation.

        The optimizer proposes continuous values, so integer parameters are cast
        and max_features is clipped to the valid (0, 1) range.
        """
        return RFC_CV(n_estimators=int(n_estimators),
                      min_samples_split=int(min_samples_split),
                      max_features=max(min(max_features, 0.999), 1e-3),
                      data=data,
                      targets=targets)

    optimizer = BayesianOptimization(f=RFC_crossval,
                                     pbounds={"n_estimators": (10, 25),
                                              "min_samples_split": (2, 25),
                                              "max_features": (0.1, 0.999)},
                                     verbose=2)
    optimizer.maximize(n_iter=10)
    return optimizer.max  # dictionary with the best score and parameters


print(Colours.green("--- Optimizing RandomForestClassifier ---"))
optimal_params = optimize_RFC(X_train, y_train)
print(optimal_params)
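
The dictionary returned by optimize_RFC has the usual bayes_opt shape, {'target': ..., 'params': {...}}. A minimal follow-up sketch (assuming the same X_train/y_train shown above) that refits the classifier with the tuned parameters:

# Sketch only: refit RandomForestClassifier with the best parameters found.
# Integer-valued hyper-parameters must be cast back because bayes_opt
# optimizes over continuous bounds.
best = optimal_params["params"]
final_rfc = RandomForestClassifier(
    n_estimators=int(best["n_estimators"]),
    min_samples_split=int(best["min_samples_split"]),
    max_features=min(best["max_features"], 0.999),
)
final_rfc.fit(X_train, y_train)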
Example #2
def test_forecast(self):
    # train the model
    print(Colours.yellow("--- CNN forecast ---"))
    source_data = unserialize_pickle(self.data_source_dump)
    stat_dict = unserialize_json(self.stat_file)
    data_flow = unserialize_numpy(self.flow_file)
    t_s_dict = unserialize_json(self.t_s_dict_file)
    model_data = DataModel(source_data, data_flow, t_s_dict, stat_dict)
    model_dict = model_data.data_source.data_config.model_dict
    # model_file = os.path.join(model_dict["dir"]["Out"], "model.yaml")
    # weight_file = os.path.join(model_dict["dir"]["Out"], "weights.h5")
    # bo_json_file = os.path.join(model_dict["dir"]["Out"], "bo_logs.json")
    # bo_json = unserialize_bo_json(bo_json_file)
    # n_input = int(bo_json["params"]["n_input"])
    n_input = 14
    try_cnn = TryCnn()
    data, targets = routing_cnn.to_supervised(
        model_data.load_data(model_dict), n_input)
    obs_value, pred_value = cnn_test(try_cnn,
                                     X=data,
                                     Y=targets,
                                     stat_dict=stat_dict)
    # obs_value, pred_value = cnn_test(data, targets, stat_dict, model_file=model_file, weight_file=weight_file)
    print("the observed values:", obs_value)
    print("the predicted values:", pred_value)
    serialize_numpy(obs_value, self.obs_file)
    serialize_numpy(pred_value, self.pred_file)
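
Since obs_value and pred_value are plain numpy arrays, a quick accuracy check can be run on them before they are serialized. A minimal sketch, assuming both arrays have the same shape, computing RMSE and the Nash-Sutcliffe efficiency commonly used for streamflow forecasts:

import numpy as np

# Sketch only: simple forecast metrics (assumes obs_value and pred_value
# are same-shaped numpy arrays).
obs = np.asarray(obs_value, dtype=float).ravel()
pred = np.asarray(pred_value, dtype=float).ravel()
rmse = np.sqrt(np.mean((pred - obs) ** 2))
nse = 1.0 - np.sum((pred - obs) ** 2) / np.sum((obs - obs.mean()) ** 2)
print("RMSE:", rmse, "NSE:", nse)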
Example #3
def test_colours():
    colour_wrappers = [
        (Colours.BLUE, Colours.blue),
        (Colours.BOLD, Colours.bold),
        (Colours.CYAN, Colours.cyan),
        (Colours.DARKCYAN, Colours.darkcyan),
        (Colours.GREEN, Colours.green),
        (Colours.PURPLE, Colours.purple),
        (Colours.RED, Colours.red),
        (Colours.UNDERLINE, Colours.underline),
        (Colours.YELLOW, Colours.yellow),
    ]

    for colour, wrapper in colour_wrappers:
        text1 = Colours._wrap_colour("test", colour)
        text2 = wrapper("test")

        assert text1.split("test") == [colour, Colours.END]
        assert text2.split("test") == [colour, Colours.END]
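
The assertions above fully determine what the wrapper does: each helper surrounds the text with the corresponding ANSI escape code and Colours.END, so splitting the result on the text yields [colour, END]. A minimal stand-in consistent with this test (the real bayes_opt.util.Colours class may differ in details):

class MiniColours:
    """Hypothetical stand-in for bayes_opt.util.Colours, sketched from the test above."""
    BLUE = "\033[94m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    END = "\033[0m"

    @classmethod
    def _wrap_colour(cls, text, colour):
        # Splitting the result on `text` yields [colour, END], as asserted above.
        return colour + text + cls.END

    @classmethod
    def green(cls, text):
        return cls._wrap_colour(text, cls.GREEN)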
Example #4
def bayesian_random_forest(frame):
    """ML model applying Random Forest regression with Bayesian optimization
    to tune the model hyper-parameters.

    input: pandas DataFrame to use in modelling
    output: (score, error, execution time)
    """
    # using https://github.com/fmfn/BayesianOptimization
    frame.iloc[:, :-1] = StandardScaler().fit_transform(frame.iloc[:, :-1])
    print("\n\n************************************************************")
    print("MODEL: Random Forest regression with Bayesian optimization")

    y = frame.iloc[:, -1]  # target
    x = filterFeatures(frame)  # features

    print(Colours.green("--- Optimizing Random Forest ---"))

    t0 = time.time()
    optimize_rfr(x, y)
    execution = time.time() - t0
    print("Execution time: ", execution)
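
optimize_rfr and filterFeatures are defined elsewhere in the script and are not shown in this excerpt. A plausible sketch of optimize_rfr, mirroring the classifier pattern from Example #1 but with RandomForestRegressor and a regression scorer (this is an assumption, not the author's actual function):

from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score


def optimize_rfr(data, targets):
    """Hypothetical regressor variant of the optimize_RFC pattern shown above."""
    def rfr_crossval(n_estimators, min_samples_split, max_features):
        estimator = RandomForestRegressor(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=max(min(max_features, 0.999), 1e-3),
        )
        # Maximizing negative MSE is equivalent to minimizing the error.
        return cross_val_score(estimator, data, targets,
                               scoring='neg_mean_squared_error', cv=3).mean()

    optimizer = BayesianOptimization(f=rfr_crossval,
                                     pbounds={"n_estimators": (10, 250),
                                              "min_samples_split": (2, 25),
                                              "max_features": (0.1, 0.999)},
                                     verbose=2)
    optimizer.maximize(n_iter=10)
    return optimizer.max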
Example #5
    def test_forecast_data_model(self):
        print(Colours.yellow("--- reading CNN ---"))
        config_file = definitions.CONFIG_FILE

        # read the model configuration file
        config_data = DataConfig(config_file)
        # prepare the training data
        source_data = DataSource(config_data,
                                 config_data.model_dict["data"]["tRangeTest"])
        # build the input data model object
        model_data = DataModel(source_data)
        # serialize and save the objects
        dir_temp = source_data.all_configs["temp_dir"]
        source_file = self.data_source_dump
        stat_file = self.stat_file
        flow_file = os.path.join(dir_temp, 'flow_test')
        t_s_dict_file = self.t_s_dict_file

        # store data_model; serializing the whole data_model at once would be slow,
        # so each part is serialized separately: dicts go to JSON files, array data to HDF5
        serialize_pickle(source_data, source_file)
        serialize_json(model_data.stat_dict, stat_file)
        serialize_numpy(model_data.data_flow, flow_file)
        serialize_json(model_data.t_s_dict, t_s_dict_file)
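
The serialize_* helpers are project-specific and not shown here. A rough sketch of what the JSON and array helpers might look like, given the comment above (dicts to JSON, array data to HDF5); these names and bodies are assumptions, not the project's actual code:

import json

import h5py
import numpy as np


def serialize_json_sketch(obj, json_file):
    # Hypothetical: dump a plain dict to a JSON file.
    with open(json_file, "w") as f:
        json.dump(obj, f, ensure_ascii=False, indent=4)


def serialize_numpy_sketch(array, h5_file):
    # Hypothetical: store a numpy array in an HDF5 file.
    with h5py.File(h5_file, "w") as f:
        f.create_dataset("data", data=np.asarray(array))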
Example #6
                                         "expGamma": (-4, -1)
                                     },
                                     random_state=1234,
                                     verbose=2)
    optimizer.maximize(n_iter=10)

    print("Final result:", optimizer.max)
    optimizer_max = optimizer.max
    return optimizer_max


# COMMAND ----------

# Optimizing Support Vector Machine
# Run bayesian optimization function with first batch of iterations
print(Colours.yellow("--- Optimizing SVM ---"))
optimize_svc(data, targets)

# COMMAND ----------


###################### Bayesian Optimization ###########
###################### Random Forest Classification ###########
def rfc_cv(n_estimators, min_samples_split, max_features, data, targets):
    """Random Forest cross validation.
    This function will instantiate a random forest classifier with parameters
    n_estimators, min_samples_split, and max_features. Combined with data and
    targets this will in turn be used to perform cross validation. The result
    of cross validation is returned.
    Our goal is to find combinations of n_estimators, min_samples_split, and
    max_features that minimize the log loss.
    """
    estimator = RandomForestClassifier(n_estimators=n_estimators,
                                       min_samples_split=min_samples_split,
                                       max_features=max_features)
    cval = cross_val_score(estimator, data, targets, scoring='neg_log_loss', cv=3)
    return cval.mean()
    return json.dumps(result, ensure_ascii=False)


if __name__ == "__main__":
    # read the word-segmented ratings file
    data = pd.read_csv('seg_ratings_data.txt', sep='\t')
    # TfidfVectorizer combines CountVectorizer + TfidfTransformer and outputs
    # the TF-IDF value of every word in every document
    # min_df=5, max_features=10000
    tfidf_vec = TfidfVectorizer(max_features=10000)
    tfidf_matrix = tfidf_vec.fit_transform(data['comment'].astype('U'))
    # split the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf_matrix, data['rating'], test_size=0.2, random_state=1)  # ,stratify = y

    # keep the block below uncommented if hyper-parameter tuning is needed
    print(Colours.yellow("--- Optimizing SVM ---"))
    params = optimize_svc(X_train, y_train)  # used to obtain the optimal hyper-parameter C
    # Regularization parameter. The strength of the regularization is inversely proportional to C.
    svm = LinearSVC(C=params['expC'])

    # to reuse the best parameter found earlier instead, comment out the tuning above
    # and uncomment the line below
    # svm = LinearSVC(C=0.3830389007577846)

    # probability calibration
    svc = CalibratedClassifierCV(svm)
    svc.fit(X_train, y_train)
    # svc_y_pred holds the predicted classes, svc_y_prod the predicted class probabilities
    svc_y_pred = svc.predict(X_test)
    # reference: https://blog.csdn.net/u011630575/article/details/79429757
    svc_y_prod = svc.predict_proba(X_test)[:, 1]
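
svc_y_pred and svc_y_prod are not evaluated in the excerpt above. A minimal follow-up sketch, assuming the binary-rating setup implied by taking column 1 of predict_proba, using standard sklearn metrics:

from sklearn.metrics import accuracy_score, roc_auc_score

# Sketch only: evaluate the calibrated SVM on the held-out split.
print("accuracy:", accuracy_score(y_test, svc_y_pred))
print("ROC AUC:", roc_auc_score(y_test, svc_y_prod))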
Example #8
        f=augmentation_wrapper,
        pbounds={
            "AddSentDiverse": (0, 1.0),
            "AddKSentDiverse": (0, 1.0),
            "AddAnswerPosition": (0, 1.0),
            "InvalidateAnswer": (0, 1.0),
            "PerturbAnswer": (0, 1.0),
            "AddSentDiverse_PerturbAnswer": (0, 1.0),
            "AddKSentDiverse_PerturbAnswer": (0, 1.0),
            "AddAnswerPosition_PerturbAnswer": (0, 1.0),
            "PerturbQuestion": (0, 1.0),
            "AddSentDiverse_PerturbQuestion": (0, 1.0),
            "AddAnswerPosition_PerturbQuestion": (0, 1.0)
        },
        random_state=1234,
        verbose=2)
    optimizer.maximize(
        init_points=20,
        n_iter=100
        # What follows are GP regressor parameters
        # alpha=1e-3,
        # n_restarts_optimizer=5
    )
    print("Final result:", optimizer.max)


if __name__ == "__main__":

    print(Colours.yellow("--- Optimizing Roberta ---"))
    optimize_roberta()
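
The commented-out alpha and n_restarts_optimizer arguments above are parameters of the underlying Gaussian process regressor. Depending on the bayes_opt version, they may need to be set on the optimizer rather than passed to maximize(); a minimal sketch using the library's set_gp_params helper:

# Sketch only: set GP regressor parameters separately, then run the search.
optimizer.set_gp_params(alpha=1e-3, n_restarts_optimizer=5)
optimizer.maximize(init_points=20, n_iter=100)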
        """Wrapper of SVC cross validation.

        Notice how we transform between regular and log scale. While this
        is not technically necessary, it greatly improves the performance
        of the optimizer.
        """
        return gms_function_value(capacity_1, capacity_2, capacity_3,
                                  capacity_4)

    optimizer = BayesianOptimization(f=gms_value,
                                     pbounds={
                                         "capacity_1": (0.000001, 0.000001),
                                         "capacity_2": (0.000001, 0.000007),
                                         "capacity_3": (0.000001, 0.000007),
                                         "capacity_4": (0.000001, 0.000007)
                                     },
                                     verbose=0)
    optimizer.maximize(n_iter=50)

    print("Final result:", optimizer.max)


if __name__ == "__main__":
    print(Colours.yellow("--- Optimizing gms ---"))
    optimize_gms()
    print('111')
    df_target = pd.DataFrame()
    df_target['target'] = target_y
    df_target['action'] = target_x
    df_target.to_csv('process_value.csv', index=False)
Example #10
#            n_estimators=int(n_estimators),
#            min_samples_split=int(min_samples_split),
#            max_features=max(min(max_features, 0.999), 1e-3),
#            data=data,
#            targets=targets,
#        )
#
#    optimizer = BayesianOptimization(
#        f=rfc_crossval,
#        pbounds={
#            "n_estimators": (10, 250),
#            "min_samples_split": (2, 25),
#            "max_features": (0.1, 0.999),
#        },
#        random_state=1234,
#        verbose=2
#    )
#    optimizer.maximize(n_iter=10)

#print("Final result:", optimizer.max)
if __name__ == "__main__":

    #    print(Colours.yellow("--- Optimizing SVM ---"))
    #    optimize_svc(data, targets)
    #    print(Colours.green("--- Optimizing Random Forest ---"))
    #
    #    optimize_rfc(data, targets)
    print(Colours.green("--- XGboost ---"))
    optimize_xgb(data, targets)
    # Final result: {'target': -0.2464449012248331, 'params': {'colsample_bytree': 0.3023814615005768, 'gamma': 0.9941908961396094, 'max_depth': 3.105523507289568}}
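
The recorded final result shows the tuned XGBoost parameters. A hypothetical follow-up, assuming xgboost's sklearn wrapper and the same data/targets, that turns those parameters back into a fitted classifier; note that max_depth must be cast to an integer:

from xgboost import XGBClassifier

# Sketch only: refit XGBoost with the parameters reported above.
best = {'colsample_bytree': 0.3023814615005768,
        'gamma': 0.9941908961396094,
        'max_depth': 3.105523507289568}
xgb_clf = XGBClassifier(colsample_bytree=best['colsample_bytree'],
                        gamma=best['gamma'],
                        max_depth=int(best['max_depth']))
xgb_clf.fit(data, targets)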
import definitions
from bayes_opt.util import Colours
from data import *
from hydroDL import *

print('Starting ...')

configFile = definitions.CONFIG_FILE
# read the model configuration file
configData = DataConfig(configFile)
# prepare the training data
sourceData = DataSource(configData,
                        configData.model_dict["data"]["tRangeTrain"])
# build the input data model object
dataModel = DataModel(sourceData)

# train the model
print(Colours.yellow("--- Optimizing CNN ---"))
optimize_cnn(dataModel)
Example #12
def test_shap(self):
    print(Colours.yellow("--- CNN shap ---"))
    source_data = unserialize_pickle(self.data_source_dump)
    stat_dict = unserialize_json(self.stat_file)
    data_flow = unserialize_numpy(self.flow_file)
    t_s_dict = unserialize_json(self.t_s_dict_file)
    model_data = DataModel(source_data, data_flow, t_s_dict, stat_dict)
    model_dict = model_data.data_source.data_config.model_dict
    n_input = 14
    try_cnn = TryCnn()
    data, targets = routing_cnn.to_supervised(
        model_data.load_data(model_dict), n_input)
    try_cnn.load_state_dict(
        torch.load(r"F:\科研类\codes\hydro-routing-cnn\checkpoint.pt"))
    # x1 = data.reshape(715, 1, -1)
    # x = x1.reshape(-1, 1, 1, x1.shape[2])
    x = data.reshape(-1, 1, data.shape[1], data.shape[2])
    x = torch.from_numpy(x).float()
    try_cnn.eval()
    # x_pred = try_cnn(x[301:306])
    # print(x[301:306])
    # print(x_pred)
    print("====== computing SHAP ======")
    # build an explainer from the model and a background sample
    background = x[np.random.choice(x.shape[0], 100, replace=False)]
    e = shap.DeepExplainer(try_cnn, background)
    # e = shap.DeepExplainer(try_cnn, x)
    shap_values = e.shap_values(x)
    shap_values_stations_days = np.abs(shap_values).sum(axis=0).reshape(
        14, data.shape[2])
    shap_days = shap_values_stations_days.sum(axis=1)
    shap_stations = shap_values_stations_days.sum(axis=0)
    # compute the baseline
    # y_base = e.expected_value
    # print("y_base value:", y_base)
    # print("sum of y_base + shap values:", y_base + shap_values.sum())
    shap_values_array = shap_values.reshape(-1,
                                            data.shape[1] * data.shape[2])
    shap_arrays_values = []
    for i in range(shap_values_array.shape[0]):
        new_array = np.zeros(
            (shap_values_array.shape[0] - 1) * data.shape[2])
        if i == 0:
            ndarray = np.append(shap_values_array[i], new_array)
        elif i == shap_values_array.shape[0] - 1:
            ndarray = np.insert(shap_values_array[i], 0, new_array)
        else:
            ndarray = np.pad(
                shap_values_array[i],
                (i * data.shape[2],
                 (shap_values_array.shape[0] - 1 - i) * data.shape[2]),
                'constant')
        shap_arrays_values.append(ndarray)
    shap_arrays_values = np.array(shap_arrays_values)
    shap_arrays_values_abs = np.abs(shap_arrays_values).sum(
        axis=0).reshape(-1, data.shape[2])
    print(shap_arrays_values_abs)
    shap_values_days_stations = []
    for j in range(shap_arrays_values_abs.shape[0]):
        if j < 14:
            shap_values_day_state = shap_arrays_values_abs[j] / (j + 1)
        elif j >= shap_arrays_values_abs.shape[0] - 14:
            shap_values_day_state = shap_arrays_values_abs[j] / (
                shap_arrays_values_abs.shape[0] - j)
        else:
            shap_values_day_state = shap_arrays_values_abs[j] / 14
        shap_values_days_stations.append(shap_values_day_state)
    shap_values_days_stations = np.array(shap_values_days_stations)
    print(shap_values_days_stations)
    serialize_numpy(shap_values_days_stations,
                    self.shap_values_days_states_file)
    serialize_numpy(shap_days, self.shap_days_file)
    serialize_numpy(shap_stations, self.shap_stations_file)
    serialize_numpy(shap_values_stations_days,
                    self.shap_values_stations_days_file)
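
shap_days and shap_stations are one-dimensional importance profiles (summed |SHAP| per lead day and per station, respectively), so they lend themselves to simple bar plots. A minimal sketch, assuming matplotlib is available, that visualizes them once the test has produced the arrays:

import matplotlib.pyplot as plt
import numpy as np

# Sketch only: bar plots of aggregated SHAP importance per lead day and per station.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.bar(np.arange(1, len(shap_days) + 1), shap_days)
ax1.set_xlabel("lead day")
ax1.set_ylabel("summed |SHAP|")
ax2.bar(np.arange(1, len(shap_stations) + 1), shap_stations)
ax2.set_xlabel("station index")
plt.tight_layout()
plt.show()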