def full_data_training(stockmodel, option_type, only_call=False, with_percentage=False):
    """
    Print the performance results over part of the dataset(*) for the given stock model and option type.

    (*) The full dataset causes hardware problems, so only a subset is used.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    :param with_percentage: bool (default=False), if the dataset needs to contain the percentage of the stock price and the strike
    """
    n_samples = 10000
    random_state = 9943

    # load the saved cross-validation results and select the best-ranked parameter set
    base_file_name = "GPR-random_search_{0}_{1}_scaled.p".format(stockmodel, option_type)
    full_file_name = pkg_resources.open_text(random_search_gpr, base_file_name).name
    dict_cv_results = modelsaver.get_model(full_file_name).cv_results_

    best_position = np.where(dict_cv_results['rank_test_neg_mean_squared_error'] == 1)
    best_model_parameters = np.array(dict_cv_results['params'])[best_position][0]

    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call,
                        with_percent=with_percentage)
    X_train, y_train, x_not_selected, y_not_selected = dm.get_random_training_data(n_samples=n_samples,
                                                                                   random_state=random_state,
                                                                                   get_not_selected_data=True)

    # note: StandardScaler ignores y, only the features are scaled
    scaler = preprocessing.StandardScaler().fit(X_train, y_train)
    X_train = scaler.transform(X_train)

    gpr_model = gaussian_process.GaussianProcessRegressor(kernel=best_model_parameters["kernel"],
                                                          normalize_y=best_model_parameters["normalize_y"],
                                                          alpha=best_model_parameters["alpha"])
    gpr_model.fit(X_train, y_train)

    X_test, y_test = dm.get_test_data()
    X_test = scaler.transform(X_test)
    x_not_selected = scaler.transform(x_not_selected)

    y_pred = gpr_model.predict(X_test)
    mse_test = mean_squared_error(y_test, y_pred=y_pred)

    y_pred_not_selected = gpr_model.predict(x_not_selected)
    mse_not_selected = mean_squared_error(y_not_selected, y_pred_not_selected)

    print(f"MSE(test data): {mse_test}")
    print(f"MSE(not selected): {mse_not_selected}")
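
# A minimal sketch of how the best parameter set is pulled out of a saved
# RandomizedSearchCV result. The dict layout mirrors sklearn's ``cv_results_``;
# the values below are made up purely for illustration:
#
#   dict_cv_results = {
#       'rank_test_neg_mean_squared_error': np.array([2, 1, 3]),
#       'params': [{'alpha': 1e-4}, {'alpha': 1e-6}, {'alpha': 1e-2}],
#   }
#   best_position = np.where(dict_cv_results['rank_test_neg_mean_squared_error'] == 1)
#   best_model_parameters = np.array(dict_cv_results['params'])[best_position][0]
#   # -> {'alpha': 1e-06}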
def cv_svr_models(stockmodel, option_type, random_state):
    """
    For the given stock model and option type, do a 3-fold cross validation of 50 random parameter sets.

    Saves all the cross validations in "SVR-random_search_{stockmodel}_{option_type}_scaled"

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param random_state: int, for the random state
    """
    datamanager = dc.DataManager(stockmodel=stockmodel, option_type=option_type)
    X, y = datamanager.get_training_data()

    # the SVR trains much faster and performs much better when the data is rescaled
    scaler = preprocessing.StandardScaler().fit(X, y)
    X = scaler.transform(X)

    svr = SVR(cache_size=1000)
    clf = RandomizedSearchCV(svr, distributions, random_state=random_state, cv=3, n_iter=50,
                             verbose=10, n_jobs=6,
                             scoring=['neg_mean_squared_error', 'r2'], refit=False)
    performance = clf.fit(X, y)

    # the "_scaled" suffix matches the file name expected by the functions that load these results
    modelsaver.save_model(performance, f"SVR-random_search_{stockmodel}_{option_type}_scaled")
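
# ``distributions`` is defined at module level and not shown here. A plausible
# sketch of such a search space (an assumption, not the exact grid used; only
# the keys are grounded in how the best parameters are read back later):
#
#   from scipy.stats import loguniform, uniform
#
#   distributions = {
#       "kernel": ["rbf", "poly", "sigmoid"],
#       "C": loguniform(1e-1, 1e3),
#       "gamma": loguniform(1e-4, 1e0),
#       "epsilon": uniform(loc=0.01, scale=0.5),
#       "degree": [2, 3, 4],
#   }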
def part_dataset_like_gpr(stockmodel, option_type, only_call=False):
    """
    Do the testing with a smaller set of datapoints, the same as the test for the Gaussian Process Regressor.

    Prints the MSE of the test data and of the part of the training data that was not used.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    """
    n_samples = 10000
    random_state = 9943

    # get the best parameters from the cross validation
    base_file_name = "SVR-random_search_{0}_{1}_scaled.p".format(stockmodel, option_type)
    full_file_name = pkg_resources.open_text(random_search_svr, base_file_name).name
    dict_cv_results = modelsaver.get_model(full_file_name).cv_results_

    best_position = np.where(dict_cv_results['rank_test_neg_mean_squared_error'] == 1)
    best_model_parameters = np.array(dict_cv_results['params'])[best_position][0]

    # get the training and test data
    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call)
    X_train, y_train, x_not_selected, y_not_selected = dm.get_random_training_data(n_samples=n_samples,
                                                                                   random_state=random_state,
                                                                                   get_not_selected_data=True)

    scaler = preprocessing.StandardScaler().fit(X_train, y_train)
    X_train = scaler.transform(X_train)

    svr_model = SVR(cache_size=2000,
                    C=best_model_parameters['C'],
                    degree=best_model_parameters['degree'],
                    epsilon=best_model_parameters['epsilon'],
                    gamma=best_model_parameters['gamma'],
                    kernel=best_model_parameters['kernel'])
    svr_model.fit(X_train, y_train)

    X_test, y_test = dm.get_test_data()
    X_test = scaler.transform(X_test)
    x_not_selected = scaler.transform(x_not_selected)

    y_pred = svr_model.predict(X_test)
    mse_test = mean_squared_error(y_test, y_pred=y_pred)

    y_pred_not_selected = svr_model.predict(x_not_selected)
    mse_not_selected = mean_squared_error(y_not_selected, y_pred_not_selected)

    print(f"MSE(test data): {mse_test:4.3f}")
    print(f"MSE(not selected): {mse_not_selected:4.3f}")
def part_dataset_like_gpr(stockmodel, option_type, only_call=False, with_percentage=False, scale=True):
    """
    Do the testing with a smaller set of datapoints, the same as the test for the Gaussian Process Regressor.

    Prints the MSE of the test data and of the part of the training data that was not used.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    :param with_percentage: bool (default=False), if the dataset needs to contain the percentage of the stock price and the strike
    :param scale: bool (default=True), whether to scale the data
    """
    n_samples = 10000
    random_state = 9943

    # get the training and test data
    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call,
                        with_percent=with_percentage)
    X_train, y_train, x_not_selected, y_not_selected = dm.get_random_training_data(n_samples=n_samples,
                                                                                   random_state=random_state,
                                                                                   get_not_selected_data=True)

    if scale:
        scaler = preprocessing.StandardScaler().fit(X_train, y_train)
        X_train = scaler.transform(X_train)

    size_layers, activations = get_best_model(stockmodel, option_type)
    nn_model = build_nn_model(X_train.shape[1], size_layers, activations)
    nn_model.fit(X_train, y_train, verbose=1, batch_size=100, epochs=100)

    X_test, y_test = dm.get_test_data()
    if scale:
        X_test = scaler.transform(X_test)
        x_not_selected = scaler.transform(x_not_selected)

    y_pred = nn_model.predict(X_test)
    mse_test = mean_squared_error(y_test, y_pred=y_pred)

    y_pred_not_selected = nn_model.predict(x_not_selected)
    mse_not_selected = mean_squared_error(y_not_selected, y_pred_not_selected)

    print(f"MSE(test data): {mse_test}")
    print(f"MSE(not selected): {mse_not_selected}")
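
# ``get_best_model`` and ``build_nn_model`` live elsewhere in the project. A
# minimal sketch of what ``build_nn_model`` could look like with Keras (the
# optimizer, the mse loss and the single output unit are assumptions inferred
# from how the function is called here):
#
#   from tensorflow import keras
#
#   def build_nn_model(input_size, size_layers, activations):
#       model = keras.Sequential()
#       model.add(keras.Input(shape=(input_size,)))
#       for size, activation in zip(size_layers, activations):
#           model.add(keras.layers.Dense(size, activation=activation))
#       model.add(keras.layers.Dense(1))  # single output: the option price
#       model.compile(optimizer="adam", loss="mse")
#       return model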
def full_dataset(stockmodel, option_type, only_call=False, with_percentage=False, scale=False):
    """
    Print the performance results over the full dataset for the given stock model and option type.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    :param with_percentage: bool (default=False), if the dataset needs to contain the percentage of the stock price and the strike
    :param scale: bool (default=False), if the dataset needs to be scaled
    """
    n_estimators = 700

    # the number of features per split depends on the model/option combination
    if (stockmodel == "BS" and option_type == "opt_standard") or stockmodel == "VG":
        max_feature = "log2"
    else:
        max_feature = 5

    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call,
                        with_percent=with_percentage)
    X_train, y_train = dm.get_training_data()

    if scale:
        scaler = preprocessing.StandardScaler().fit(X_train, y_train)
        X_train = scaler.transform(X_train)

    rf_model = RandomForestRegressor(n_jobs=8, verbose=0, max_features=max_feature,
                                     n_estimators=n_estimators)
    rf_model.fit(X_train, y_train)

    X_test, y_test = dm.get_test_data()
    if scale:
        X_test = scaler.transform(X_test)

    y_pred = rf_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred=y_pred)

    print(f"MSE: {mse}")
def full_data_training(stockmodel, option_type, only_call=False, with_percentage=False):
    """
    Print the performance results over the full dataset for the given stock model and option type.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    :param with_percentage: bool (default=False), if the dataset needs to contain the percentage of the stock price and the strike
    """
    # get the best parameters from the cross validation
    base_file_name = "SVR-random_search_{0}_{1}_scaled.p".format(stockmodel, option_type)
    full_file_name = pkg_resources.open_text(random_search_svr, base_file_name).name
    dict_cv_results = modelsaver.get_model(full_file_name).cv_results_

    best_position = np.where(dict_cv_results['rank_test_neg_mean_squared_error'] == 1)
    best_model_parameters = np.array(dict_cv_results['params'])[best_position][0]

    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call,
                        with_percent=with_percentage)
    X_train, y_train = dm.get_training_data()

    scaler = preprocessing.StandardScaler().fit(X_train, y_train)
    X_train = scaler.transform(X_train)

    svr_model = SVR(cache_size=2000,
                    C=best_model_parameters['C'],
                    degree=best_model_parameters['degree'],
                    epsilon=best_model_parameters['epsilon'],
                    gamma=best_model_parameters['gamma'],
                    kernel=best_model_parameters['kernel'])
    svr_model.fit(X_train, y_train)

    X_test, y_test = dm.get_test_data()
    X_test = scaler.transform(X_test)

    y_pred = svr_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred=y_pred)

    print(f"MSE: {mse:4.3f}")
def cv_gpr_models(stockmodel, option, random_state=None, scale=False):
    """
    For the given stock model and option type, do a 3-fold cross validation of 50 random parameter sets.

    Saves all the cross validations in f"GPR-random_search_{stockmodel}_{option}{string_scaled}"

    :param stockmodel: str, "BS", "VG" or "H"
    :param option: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param random_state: int, for the random state
    :param scale: bool (default=False), whether to scale the data
    """
    kernels = [RBF(), Matern(), DotProduct(), RationalQuadratic()]
    param_grid = {
        "normalize_y": [True, False],
        "kernel": kernels,
        # scipy.stats.uniform: alpha is drawn uniformly from [loc, loc + scale]
        "alpha": uniform(loc=0.000000001, scale=0.001)
    }

    datamanager = dc.DataManager(stockmodel=stockmodel, option_type=option)
    X, y = datamanager.get_random_training_data(10000)

    if scale:
        scaler = preprocessing.StandardScaler().fit(X, y)
        X = scaler.transform(X)

    gpr = gaussian_process.GaussianProcessRegressor(optimizer="fmin_l_bfgs_b")
    clf = RandomizedSearchCV(gpr, param_grid, random_state=random_state, cv=3, n_iter=50,
                             verbose=10, n_jobs=2,
                             scoring=['neg_mean_squared_error', 'r2'], refit=False)
    performance = clf.fit(X, y)

    string_scaled = '_scaled' if scale else ""
    modelsaver.save_model(performance, f"GPR-random_search_{stockmodel}_{option}{string_scaled}")
def full_dataset(stockmodel, option_type, only_call=False, with_percentage=False, scale=True):
    """
    Print the performance results over the full dataset for the given stock model and option type.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param only_call: bool (default=False), if the dataset only contains the call options
    :param with_percentage: bool (default=False), if the dataset needs to contain the percentage of the stock price and the strike
    :param scale: bool (default=True), if the dataset needs to be scaled
    """
    dm = dc.DataManager(stockmodel=stockmodel, option_type=option_type, only_call=only_call,
                        with_percent=with_percentage)
    X_train, y_train = dm.get_training_data()

    if scale:
        scaler = preprocessing.StandardScaler().fit(X_train, y_train)
        X_train = scaler.transform(X_train)

    size_layers, activations = get_best_model(stockmodel, option_type)
    nn_model = build_nn_model(X_train.shape[1], size_layers, activations)
    nn_model.fit(X_train, y_train, verbose=0, batch_size=100, epochs=50)

    X_test, y_test = dm.get_test_data()
    if scale:
        X_test = scaler.transform(X_test)

    y_pred = nn_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred=y_pred)

    print(f"MSE: {mse}")
def one_tree_visualisation():
    """
    Fit a random forest and save a plot of the first levels of one of its trees.
    """
    rf = RandomForestRegressor(n_estimators=100, max_features="auto", n_jobs=6, verbose=2)

    datamanager = dc.DataManager()
    X, y = datamanager.get_training_data()

    # Train
    rf.fit(X, y)

    # Extract a single tree
    estimator = rf.estimators_[8]

    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)
    tree.plot_tree(estimator, feature_names=X.columns, max_depth=2, filled=True)
    # plt.title("Random Forest: Decision Tree")
    fig.savefig('rf_individualtree.png')

    print(estimator.get_depth())
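
# A text dump can be easier to inspect than the rendered figure; a minimal
# sketch using sklearn's export_text (the depth limit is arbitrary):
#
#   from sklearn.tree import export_text
#
#   print(export_text(estimator, feature_names=list(X.columns), max_depth=2))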
def cv_layers(n_random_samples, stock_model, option_type, cv=3, batch_size=100, epochs=50,
              random_state=4173, scale=True):
    """
    Cross validation of randomly sampled neural network architectures.

    :param n_random_samples: int, number of random neural networks
    :param stock_model: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax"
    :param cv: int (default=3), number of cross validations
    :param batch_size: int (default=100), batch size of the neural networks
    :param epochs: int (default=50), number of epochs for the neural networks
    :param random_state: int (default=4173)
    :param scale: bool (default=True), whether to scale the data
    :return: list of dicts with keys
        "n_layers": number of layers used,
        "size_layers": list of size n_layers, with all the sizes,
        "activations": list of size n_layers, with all the activation functions,
        "cv_result": dict with the Train and Test errors
    """
    activation_functions = ["relu", "softsign", "sigmoid", "elu"]

    datamanager = dc.DataManager(stockmodel=stock_model, option_type=option_type)
    X, y = datamanager.get_training_data()

    if scale:
        scaler = preprocessing.StandardScaler().fit(X, y)
        X = scaler.transform(X)

    results_fitting = []

    # seed both generators: the architectures are drawn with the `random`
    # module as well as with numpy
    np.random.seed(random_state)
    random.seed(random_state)

    for i in range(n_random_samples):
        first_layer_size = random.randrange(50, 301, 50)
        # 1, 2 or 3 hidden layers
        n_hidden_layers = np.random.randint(1, 4)
        # each subsequent layer shrinks relative to the first one
        size_layers = [first_layer_size // ((j + 1) ** j) for j in range(n_hidden_layers)]
        activation_layers = random.choices(activation_functions, k=n_hidden_layers)

        architecture = {"size_layers": size_layers,
                        "activations": activation_layers,
                        "input": X.shape[1]}

        gen_error = cross_validation_nn(architecture, X, y, cv=cv, batch_size=batch_size, epochs=epochs)

        nn_model_values = {"n_layers": n_hidden_layers,
                           "size_layers": size_layers,
                           "activations": activation_layers,
                           "cv_result": gen_error}

        results_fitting.append(nn_model_values)

    return results_fitting
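
# ``cross_validation_nn`` is defined elsewhere. A minimal sketch of a k-fold
# cross validation for such an architecture (KFold split, build/fit/score per
# fold; the helper names mirror the call above, everything else is an
# assumption, including the "Train"/"Test" keys taken from the docstring):
#
#   from sklearn.model_selection import KFold
#
#   def cross_validation_nn(architecture, X, y, cv=3, batch_size=100, epochs=50):
#       train_errors, test_errors = [], []
#       for train_idx, test_idx in KFold(n_splits=cv, shuffle=True).split(X):
#           model = build_nn_model(architecture["input"],
#                                  architecture["size_layers"],
#                                  architecture["activations"])
#           model.fit(X[train_idx], y[train_idx], batch_size=batch_size,
#                     epochs=epochs, verbose=0)
#           train_errors.append(mean_squared_error(y[train_idx], model.predict(X[train_idx])))
#           test_errors.append(mean_squared_error(y[test_idx], model.predict(X[test_idx])))
#       return {"Train": train_errors, "Test": test_errors}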
def rf_n_estimators(stockmodel="BS", option_type="opt_exact_standard", range_n_estimators=range(50, 751, 50),
                    save_mse=True, max_features="auto", scale=True):
    """
    Calculate the MSE for a range of numbers of estimators.

    :param stockmodel: str, "BS", "VG" or "H"
    :param option_type: str, "opt_standard", "opt_asianmean", "opt_lookbackmin" or "opt_lookbackmax".
        If stockmodel = "BS" -> "opt_exact_standard" is also possible
    :param range_n_estimators: list with the number of estimators for each run
    :param save_mse: bool, whether to save all the values in a file
    :param max_features: "auto", "log2" or an integer, for the splits in the tree model
    :param scale: bool, if the data needs to be scaled or not
    :return: dict with keys "Train", "Test", "oob_score", "n_estimators".
        Train = mse of the training data
        Test = mse of the test data
        oob_score = R^2 score of the out-of-bag observations
        n_estimators = list of the number of estimators
    """
    dict_option_types = {"opt_exact_standard": "SE",
                         "opt_standard": "S",
                         "opt_asianmean": "A",
                         "opt_lookbackmin": "Lmin",
                         "opt_lookbackmax": "Lmax"}

    list_results_train = []
    list_results_test = []
    list_oob_score = []

    datamanager = dc.DataManager(stockmodel=stockmodel, option_type=option_type)
    X, y = datamanager.get_training_data()
    X_test, y_test = datamanager.get_test_data()

    if scale:
        scaler = preprocessing.StandardScaler().fit(X, y)
        X = scaler.transform(X)
        X_test = scaler.transform(X_test)

    for n_estimator in range_n_estimators:
        rf_model = RandomForestRegressor(n_estimators=n_estimator, verbose=1, n_jobs=7,
                                         random_state=2458 + n_estimator,
                                         max_features=max_features, oob_score=True)
        rf_model.fit(X, y)

        mse_train = mean_squared_error(y, rf_model.predict(X))
        mse_test = mean_squared_error(y_test, rf_model.predict(X_test))
        oob_score = rf_model.oob_score_

        print(f'Train {mse_train}')
        print(f'Test {mse_test}')
        print(f'OOB score: {oob_score}')

        list_results_train.append(mse_train)
        list_results_test.append(mse_test)
        list_oob_score.append(oob_score)

    dict_result = {"Train": list_results_train,
                   "Test": list_results_test,
                   "oob_score": list_oob_score,
                   "n_estimators": range_n_estimators}

    if save_mse:
        string_scaled = "_scaled" if scale else ""
        modelsaver.save_model(dict_result,
                              f"rf_{min(range_n_estimators)}-{max(range_n_estimators)}"
                              f"-results_train_test-{stockmodel}-{dict_option_types[option_type]}"
                              f"-{max_features}{string_scaled}")

    return dict_result
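
# Example usage (a sketch; the plotting is illustrative and not part of the module):
#
#   result = rf_n_estimators(stockmodel="BS", option_type="opt_standard", scale=True)
#
#   plt.plot(result["n_estimators"], result["Train"], label="Train MSE")
#   plt.plot(result["n_estimators"], result["Test"], label="Test MSE")
#   plt.xlabel("n_estimators")
#   plt.ylabel("MSE")
#   plt.legend()
#   plt.show()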