def full_data_training(stockmodel, option_type, only_call=False, with_percentage=False):
    """
    Print the performance results over part of the dataset(*) for the given stock model and option type.

    (*) The full dataset causes hardware problems, so only a subset is used.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    :param with_percentage: bool (default=False), if the dataset needs to contain the percentage of the stock price and the strike
    """
    n_samples = 10000
    random_state = 9943

    # load the saved cross-validation results and select the best-ranked parameter set
    base_file_name = "GPR-random_search_{0}_{1}_scaled.p".format(stockmodel, option_type)
    full_file_name = pkg_resources.open_text(random_search_gpr, base_file_name).name
    dict_cv_results = modelsaver.get_model(full_file_name).cv_results_

    best_position = np.where(dict_cv_results['rank_test_neg_mean_squared_error'] == 1)
    best_model_parameters = np.array(dict_cv_results['params'])[best_position][0]

    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call,
                        with_percent=with_percentage)
    X_train, y_train, x_not_selected, y_not_selected = dm.get_random_training_data(n_samples=n_samples,
                                                                                   random_state=random_state,
                                                                                   get_not_selected_data=True)

    # note: StandardScaler ignores y, only the features are scaled
    scaler = preprocessing.StandardScaler().fit(X_train, y_train)
    X_train = scaler.transform(X_train)

    gpr_model = gaussian_process.GaussianProcessRegressor(kernel=best_model_parameters["kernel"],
                                                          normalize_y=best_model_parameters["normalize_y"],
                                                          alpha=best_model_parameters["alpha"])
    gpr_model.fit(X_train, y_train)

    X_test, y_test = dm.get_test_data()
    X_test = scaler.transform(X_test)
    x_not_selected = scaler.transform(x_not_selected)

    y_pred = gpr_model.predict(X_test)
    mse_test = mean_squared_error(y_test, y_pred=y_pred)

    y_pred_not_selected = gpr_model.predict(x_not_selected)
    mse_not_selected = mean_squared_error(y_not_selected, y_pred_not_selected)

    print(f"MSE(test data): {mse_test}")
    print(f"MSE(not selected): {mse_not_selected}")
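
# A minimal sketch of how the best parameter set is pulled out of a saved
# RandomizedSearchCV result. The dict layout mirrors sklearn's ``cv_results_``;
# the values below are made up purely for illustration:
#
#   dict_cv_results = {
#       'rank_test_neg_mean_squared_error': np.array([2, 1, 3]),
#       'params': [{'alpha': 1e-4}, {'alpha': 1e-6}, {'alpha': 1e-2}],
#   }
#   best_position = np.where(dict_cv_results['rank_test_neg_mean_squared_error'] == 1)
#   best_model_parameters = np.array(dict_cv_results['params'])[best_position][0]
#   # -> {'alpha': 1e-06}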
def cv_svr_models(stockmodel, option_type, random_state):
    """
    For the given stock model and option type, do a 3-fold cross validation of 50 random parameter sets.

    Saves all the cross validations in "SVR-random_search_{stockmodel}_{option_type}_scaled"

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param random_state: int, for the random state
    """
    datamanager = dc.DataManager(stockmodel=stockmodel, option_type=option_type)
    X, y = datamanager.get_training_data()

    # the SVR trains much faster and performs much better when the data is rescaled
    scaler = preprocessing.StandardScaler().fit(X, y)
    X = scaler.transform(X)

    svr = SVR(cache_size=1000)
    clf = RandomizedSearchCV(svr, distributions, random_state=random_state, cv=3, n_iter=50,
                             verbose=10, n_jobs=6,
                             scoring=['neg_mean_squared_error', 'r2'], refit=False)
    performance = clf.fit(X, y)

    # the "_scaled" suffix matches the file name expected by the functions that load these results
    modelsaver.save_model(performance, f"SVR-random_search_{stockmodel}_{option_type}_scaled")
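
# ``distributions`` is defined at module level and not shown here. A plausible
# sketch of such a search space (an assumption, not the exact grid used; only
# the keys are grounded in how the best parameters are read back later):
#
#   from scipy.stats import loguniform, uniform
#
#   distributions = {
#       "kernel": ["rbf", "poly", "sigmoid"],
#       "C": loguniform(1e-1, 1e3),
#       "gamma": loguniform(1e-4, 1e0),
#       "epsilon": uniform(loc=0.01, scale=0.5),
#       "degree": [2, 3, 4],
#   }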
def part_dataset_like_gpr(stockmodel, option_type, only_call=False):
    """
    Do the testing with a smaller set of datapoints, the same as the test for the Gaussian Process Regressor.

    Prints the MSE of the test data and of the part of the training data that was not used.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    """
    n_samples = 10000
    random_state = 9943

    # get the best parameters from the cross validation
    base_file_name = "SVR-random_search_{0}_{1}_scaled.p".format(stockmodel, option_type)
    full_file_name = pkg_resources.open_text(random_search_svr, base_file_name).name
    dict_cv_results = modelsaver.get_model(full_file_name).cv_results_

    best_position = np.where(dict_cv_results['rank_test_neg_mean_squared_error'] == 1)
    best_model_parameters = np.array(dict_cv_results['params'])[best_position][0]

    # get the training and test data
    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call)
    X_train, y_train, x_not_selected, y_not_selected = dm.get_random_training_data(n_samples=n_samples,
                                                                                   random_state=random_state,
                                                                                   get_not_selected_data=True)

    scaler = preprocessing.StandardScaler().fit(X_train, y_train)
    X_train = scaler.transform(X_train)

    svr_model = SVR(cache_size=2000,
                    C=best_model_parameters['C'],
                    degree=best_model_parameters['degree'],
                    epsilon=best_model_parameters['epsilon'],
                    gamma=best_model_parameters['gamma'],
                    kernel=best_model_parameters['kernel'])
    svr_model.fit(X_train, y_train)

    X_test, y_test = dm.get_test_data()
    X_test = scaler.transform(X_test)
    x_not_selected = scaler.transform(x_not_selected)

    y_pred = svr_model.predict(X_test)
    mse_test = mean_squared_error(y_test, y_pred=y_pred)

    y_pred_not_selected = svr_model.predict(x_not_selected)
    mse_not_selected = mean_squared_error(y_not_selected, y_pred_not_selected)

    print(f"MSE(test data): {mse_test:4.3f}")
    print(f"MSE(not selected): {mse_not_selected:4.3f}")
def part_dataset_like_gpr(stockmodel, option_type, only_call=False, with_percentage=False, scale=True):
    """
    Do the testing with a smaller set of datapoints, the same as the test for the Gaussian Process Regressor.

    Prints the MSE of the test data and of the part of the training data that was not used.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    :param with_percentage: bool (default=False), if the dataset needs to contain the percentage of the stock price and the strike
    :param scale: bool (default=True), whether to scale the data
    """
    n_samples = 10000
    random_state = 9943

    # get the training and test data
    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call,
                        with_percent=with_percentage)
    X_train, y_train, x_not_selected, y_not_selected = dm.get_random_training_data(n_samples=n_samples,
                                                                                   random_state=random_state,
                                                                                   get_not_selected_data=True)

    if scale:
        scaler = preprocessing.StandardScaler().fit(X_train, y_train)
        X_train = scaler.transform(X_train)

    size_layers, activations = get_best_model(stockmodel, option_type)
    nn_model = build_nn_model(X_train.shape[1], size_layers, activations)
    nn_model.fit(X_train, y_train, verbose=1, batch_size=100, epochs=100)

    X_test, y_test = dm.get_test_data()
    if scale:
        X_test = scaler.transform(X_test)
        x_not_selected = scaler.transform(x_not_selected)

    y_pred = nn_model.predict(X_test)
    mse_test = mean_squared_error(y_test, y_pred=y_pred)

    y_pred_not_selected = nn_model.predict(x_not_selected)
    mse_not_selected = mean_squared_error(y_not_selected, y_pred_not_selected)

    print(f"MSE(test data): {mse_test}")
    print(f"MSE(not selected): {mse_not_selected}")
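
# ``get_best_model`` and ``build_nn_model`` live elsewhere in the project. A
# minimal sketch of what ``build_nn_model`` could look like with Keras (the
# optimizer, the mse loss and the single output unit are assumptions inferred
# from how the function is called here):
#
#   from tensorflow import keras
#
#   def build_nn_model(input_size, size_layers, activations):
#       model = keras.Sequential()
#       model.add(keras.Input(shape=(input_size,)))
#       for size, activation in zip(size_layers, activations):
#           model.add(keras.layers.Dense(size, activation=activation))
#       model.add(keras.layers.Dense(1))  # single output: the option price
#       model.compile(optimizer="adam", loss="mse")
#       return model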
def full_dataset(stockmodel, option_type, only_call=False, with_percentage=False, scale=False):
    """
    Print the performance results over the full dataset for the given stock model and option type.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    :param with_percentage: bool (default=False), if the dataset needs to contain the percentage of the stock price and the strike
    :param scale: bool (default=False), if the dataset needs to be scaled
    """
    n_estimators = 700

    # the number of features per split depends on the model/option combination
    if (stockmodel == "BS" and option_type == "opt_standard") or stockmodel == "VG":
        max_feature = "log2"
    else:
        max_feature = 5

    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call,
                        with_percent=with_percentage)
    X_train, y_train = dm.get_training_data()

    if scale:
        scaler = preprocessing.StandardScaler().fit(X_train, y_train)
        X_train = scaler.transform(X_train)

    rf_model = RandomForestRegressor(n_jobs=8, verbose=0, max_features=max_feature,
                                     n_estimators=n_estimators)
    rf_model.fit(X_train, y_train)

    X_test, y_test = dm.get_test_data()
    if scale:
        X_test = scaler.transform(X_test)

    y_pred = rf_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred=y_pred)

    print(f"MSE: {mse}")
def full_data_training(stockmodel, option_type, only_call=False, with_percentage=False):
    """
    Print the performance results over the full dataset for the given stock model and option type.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    :param with_percentage: bool (default=False), if the dataset needs to contain the percentage of the stock price and the strike
    """
    # get the best parameters from the cross validation
    base_file_name = "SVR-random_search_{0}_{1}_scaled.p".format(stockmodel, option_type)
    full_file_name = pkg_resources.open_text(random_search_svr, base_file_name).name
    dict_cv_results = modelsaver.get_model(full_file_name).cv_results_

    best_position = np.where(dict_cv_results['rank_test_neg_mean_squared_error'] == 1)
    best_model_parameters = np.array(dict_cv_results['params'])[best_position][0]

    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call,
                        with_percent=with_percentage)
    X_train, y_train = dm.get_training_data()

    scaler = preprocessing.StandardScaler().fit(X_train, y_train)
    X_train = scaler.transform(X_train)

    svr_model = SVR(cache_size=2000,
                    C=best_model_parameters['C'],
                    degree=best_model_parameters['degree'],
                    epsilon=best_model_parameters['epsilon'],
                    gamma=best_model_parameters['gamma'],
                    kernel=best_model_parameters['kernel'])
    svr_model.fit(X_train, y_train)

    X_test, y_test = dm.get_test_data()
    X_test = scaler.transform(X_test)

    y_pred = svr_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred=y_pred)

    print(f"MSE: {mse:4.3f}")
def cv_gpr_models(stockmodel, option, random_state=None, scale=False):
    """
    For the given stock model and option type, do a 3-fold cross validation of 50 random parameter sets.

    Saves all the cross validations in f"GPR-random_search_{stockmodel}_{option}{string_scaled}"

    :param stockmodel: str, "BS", "VG" or "H"
    :param option: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param random_state: int, for the random state
    :param scale: bool (default=False), whether to scale the data
    """
    kernels = [RBF(), Matern(), DotProduct(), RationalQuadratic()]
    param_grid = {
        "normalize_y": [True, False],
        "kernel": kernels,
        # scipy.stats.uniform: alpha is drawn uniformly from [loc, loc + scale]
        "alpha": uniform(loc=0.000000001, scale=0.001)
    }

    datamanager = dc.DataManager(stockmodel=stockmodel, option_type=option)
    X, y = datamanager.get_random_training_data(10000)

    if scale:
        scaler = preprocessing.StandardScaler().fit(X, y)
        X = scaler.transform(X)

    gpr = gaussian_process.GaussianProcessRegressor(optimizer="fmin_l_bfgs_b")
    clf = RandomizedSearchCV(gpr, param_grid, random_state=random_state, cv=3, n_iter=50,
                             verbose=10, n_jobs=2,
                             scoring=['neg_mean_squared_error', 'r2'], refit=False)
    performance = clf.fit(X, y)

    string_scaled = '_scaled' if scale else ""
    modelsaver.save_model(performance, f"GPR-random_search_{stockmodel}_{option}{string_scaled}")
def full_dataset(stockmodel, option_type, only_call=False, with_percentage=False, scale=True):
    """
    Print the performance results over the full dataset for the given stock model and option type.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    :param with_percentage: bool (default=False), if the dataset needs to contain the percentage of the stock price and the strike
    :param scale: bool (default=True), if the dataset needs to be scaled
    """
    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call,
                        with_percent=with_percentage)
    X_train, y_train = dm.get_training_data()

    if scale:
        scaler = preprocessing.StandardScaler().fit(X_train, y_train)
        X_train = scaler.transform(X_train)

    size_layers, activations = get_best_model(stockmodel, option_type)
    nn_model = build_nn_model(X_train.shape[1], size_layers, activations)
    nn_model.fit(X_train, y_train, verbose=0, batch_size=100, epochs=50)

    X_test, y_test = dm.get_test_data()
    if scale:
        X_test = scaler.transform(X_test)

    y_pred = nn_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred=y_pred)

    print(f"MSE: {mse}")
def one_tree_visualisation():
    """
    Fit a random forest and save a plot of the first levels of one of its trees.
    """
    rf = RandomForestRegressor(n_estimators=100, max_features="auto", n_jobs=6, verbose=2)

    datamanager = dc.DataManager()
    X, y = datamanager.get_training_data()

    # Train
    rf.fit(X, y)

    # Extract a single tree
    estimator = rf.estimators_[8]

    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)
    tree.plot_tree(estimator, feature_names=X.columns, max_depth=2, filled=True)
    # plt.title("Random Forest: Decision Tree")
    fig.savefig('rf_individualtree.png')

    print(estimator.get_depth())
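
# A text dump can be easier to inspect than the rendered figure; a minimal
# sketch using sklearn's export_text (the depth limit is arbitrary):
#
#   from sklearn.tree import export_text
#
#   print(export_text(estimator, feature_names=list(X.columns), max_depth=2))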
def cv_layers(n_random_samples, stock_model, option_type, cv=3, batch_size=100, epochs=50,
              random_state=4173, scale=True):
    """
    Cross validation of randomly sampled neural network architectures.

    :param n_random_samples: int, number of random neural networks
    :param stock_model: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param cv: int (default=3), number of cross validations
    :param batch_size: int (default=100), batch size of the neural networks
    :param epochs: int (default=50), number of epochs for the neural networks
    :param random_state: int (default=4173)
    :param scale: bool (default=True), whether to scale the data
    :return: list of dicts with keys
        "n_layers": number of layers used,
        "size_layers": list of size n_layers, with all the sizes,
        "activations": list of size n_layers, with all the activation functions,
        "cv_result": dict with the Train and Test errors
    """
    activation_functions = ["relu", "softsign", "sigmoid", "elu"]

    datamanager = dc.DataManager(stockmodel=stock_model, option_type=option_type)
    X, y = datamanager.get_training_data()

    if scale:
        scaler = preprocessing.StandardScaler().fit(X, y)
        X = scaler.transform(X)

    results_fitting = []

    # seed both generators: the architectures are drawn with the `random`
    # module as well as with numpy
    np.random.seed(random_state)
    random.seed(random_state)

    for i in range(n_random_samples):
        first_layer_size = random.randrange(50, 301, 50)
        # 1, 2 or 3 hidden layers
        n_hidden_layers = np.random.randint(1, 4)
        # each subsequent layer shrinks relative to the first one
        size_layers = [first_layer_size // ((j + 1) ** j) for j in range(n_hidden_layers)]
        activation_layers = random.choices(activation_functions, k=n_hidden_layers)

        architecture = {"size_layers": size_layers,
                        "activations": activation_layers,
                        "input": X.shape[1]}

        gen_error = cross_validation_nn(architecture, X, y, cv=cv, batch_size=batch_size, epochs=epochs)

        nn_model_values = {"n_layers": n_hidden_layers,
                           "size_layers": size_layers,
                           "activations": activation_layers,
                           "cv_result": gen_error}

        results_fitting.append(nn_model_values)

    return results_fitting
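
# ``cross_validation_nn`` is defined elsewhere. A minimal sketch of a k-fold
# cross validation for such an architecture (KFold split, build/fit/score per
# fold; the helper names mirror the call above, everything else is an
# assumption, including the "Train"/"Test" keys taken from the docstring):
#
#   from sklearn.model_selection import KFold
#
#   def cross_validation_nn(architecture, X, y, cv=3, batch_size=100, epochs=50):
#       train_errors, test_errors = [], []
#       for train_idx, test_idx in KFold(n_splits=cv, shuffle=True).split(X):
#           model = build_nn_model(architecture["input"],
#                                  architecture["size_layers"],
#                                  architecture["activations"])
#           model.fit(X[train_idx], y[train_idx], batch_size=batch_size,
#                     epochs=epochs, verbose=0)
#           train_errors.append(mean_squared_error(y[train_idx], model.predict(X[train_idx])))
#           test_errors.append(mean_squared_error(y[test_idx], model.predict(X[test_idx])))
#       return {"Train": train_errors, "Test": test_errors}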
def rf_n_estimators(stockmodel="BS", option_type="opt_exact_standard", range_n_estimators=range(50, 751, 50),
                    save_mse=True, max_features="auto", scale=True):
    """
    Calculate the MSE for a range of numbers of estimators.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax".
        If stockmodel = "BS" -> "opt_exact_standard" is also possible
    :param range_n_estimators: list with the number of estimators for each run
    :param save_mse: bool, whether to save all the values in a file
    :param max_features: "auto", "log2" or an integer, for the splits in the tree model
    :param scale: bool, if the data needs to be scaled or not
    :return: dict with keys "Train", "Test", "oob_score", "n_estimators".
        Train = mse of the training data
        Test = mse of the test data
        oob_score = R^2 score of the out-of-bag observations
        n_estimators = list of the number of estimators
    """
    dict_option_types = {"opt_exact_standard": "SE",
                         "opt_standard": "S",
                         "opt_asianmean": "A",
                         "opt_lookbackmin": "Lmin",
                         "opt_lookbackmax": "Lmax"}

    list_results_train = []
    list_results_test = []
    list_oob_score = []

    datamanager = dc.DataManager(stockmodel=stockmodel, option_type=option_type)
    X, y = datamanager.get_training_data()
    X_test, y_test = datamanager.get_test_data()

    if scale:
        scaler = preprocessing.StandardScaler().fit(X, y)
        X = scaler.transform(X)
        X_test = scaler.transform(X_test)

    for n_estimator in range_n_estimators:
        rf_model = RandomForestRegressor(n_estimators=n_estimator, verbose=1, n_jobs=7,
                                         random_state=2458 + n_estimator,
                                         max_features=max_features, oob_score=True)
        rf_model.fit(X, y)

        mse_train = mean_squared_error(y, rf_model.predict(X))
        mse_test = mean_squared_error(y_test, rf_model.predict(X_test))
        oob_score = rf_model.oob_score_

        print(f'Train {mse_train}')
        print(f'Test {mse_test}')
        print(f'OOB score: {oob_score}')

        list_results_train.append(mse_train)
        list_results_test.append(mse_test)
        list_oob_score.append(oob_score)

    dict_result = {"Train": list_results_train,
                   "Test": list_results_test,
                   "oob_score": list_oob_score,
                   "n_estimators": range_n_estimators}

    if save_mse:
        string_scaled = "_scaled" if scale else ""
        modelsaver.save_model(dict_result,
                              f"rf_{min(range_n_estimators)}-{max(range_n_estimators)}"
                              f"-results_train_test-{stockmodel}-{dict_option_types[option_type]}"
                              f"-{max_features}{string_scaled}")

    return dict_result
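
# Example usage (a sketch; the plotting is illustrative and not part of the module):
#
#   result = rf_n_estimators(stockmodel="BS", option_type="opt_standard", scale=True)
#
#   plt.plot(result["n_estimators"], result["Train"], label="Train MSE")
#   plt.plot(result["n_estimators"], result["Test"], label="Test MSE")
#   plt.xlabel("n_estimators")
#   plt.ylabel("MSE")
#   plt.legend()
#   plt.show()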