Example #1
#generate random dataset of the Franke function with noise
FrankeDS = dataset(0)
FrankeDS.generate_franke(150, 0.05)

#Normalize dataset
FrankeDS.normalize_dataset()

#Divide into train and test sets
if CV:
    FrankeDS.sort_in_k_batches(k)
else:
    FrankeDS.sort_train_test(ratio=0.2, random=False)
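
#Illustration: sort_in_k_batches above is assumed to shuffle the sample
#indices and split them into k roughly equal batches, roughly equivalent to
#   np.array_split(np.random.permutation(150), k)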

#Make model
FrankeModel = fit(FrankeDS)

#Create polynomial design matrix for the training set
X_train = FrankeModel.create_design_matrix(deg=deg)

if CV:
    # Run k-fold CV algorithm and fit models.
    sample = sampling(FrankeDS)
    sample.kfold_cross_validation(method,
                                  deg=deg,
                                  lambd=lambd,
                                  Niterations=Niterations)

    # Print metrics
    print("Cross-validation batches: k = ", k)
    print('Best train mse is in arg ', np.argmin(sample.mse_train), ' : ',
          np.min(sample.mse_train))
Example #2
DF = dataset.df
DF = DF.drop(columns=[
    "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"
])
DF = DF.drop(columns=[
    "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5",
    "BILL_AMT6"
])
DF = DF.drop(columns=["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"])

#Print and plot some info about data.
statistics.print_info_dataframe(dataset.df, DF)
statistics.print_info_input_output(dataset.XTrain, dataset.yTrain)
plot_traits(DF, show=showplot, save=saveplot)

model = fit(dataset)
model.fit_logistic_regression(delta=0.0001, iterations=iterations)
model.test_logistic_regression(data="test")

plt.title("Evolution of the accuracy score.")
plt.plot(np.linspace(1, iterations, iterations), model.training_score)
plt.show()

print(
    "Score, own model: ",
    statistics.calc_accuracy(pred=model.prediction_test,
                             target=model.y_test_target))

model.fit_logistic_regression_sklearn()
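
#For reference, a plain-numpy sketch of an accuracy score like the one used
#above (the project's statistics.calc_accuracy may be implemented differently):
def accuracy_sketch(pred, target):
    """Fraction of (rounded) predictions that equal the target labels."""
    return np.mean(np.asarray(pred).round() == np.asarray(target))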
"""
## Neural network - not done yet.
Example #3
#Normal decision trees with pruning, or XGBoost?
XGBoost = True

# Load dataset
with open('datasets.pkl', 'rb') as infile:
    Datasets = pickle.load(infile)
AME12 = Datasets[0]
AME16 = Datasets[1]
testset = Datasets[2]

#Divide into train and test sets
AME16.sort_train_test(AME12, useAME12=False)

#Make model
AME16Fit = fit(AME16)

#Create polynomial design matrix for train and test sets
X_train = AME16Fit.create_design_matrix(deg=0)
X_test = AME16Fit.create_design_matrix(x=AME16.test_x_1d, deg=0)

#Initialize inputs for Neural Network
y_train = AME16.y_1d[:, np.newaxis]
y_test = AME16.test_y_1d[:, np.newaxis]
n_samples = X_train.shape[0]

###### grid search #######

#Initialize vectors for saving values
depth_vals = np.linspace(1, 10, 10)
lmbd_vals = np.hstack((np.array([0.0]), np.logspace(-6, -1, 6)))
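
#Sketch of one way the grid search could be filled in, here with scikit-learn's
#DecisionTreeRegressor and cost-complexity pruning as a stand-in for the model
#(the original script may use XGBoost or the project's neural network instead):
from sklearn.tree import DecisionTreeRegressor

test_mse = np.zeros((len(depth_vals), len(lmbd_vals)))
for i, depth in enumerate(depth_vals):
    for j, lmbd in enumerate(lmbd_vals):
        tree = DecisionTreeRegressor(max_depth=int(depth), ccp_alpha=lmbd)
        tree.fit(X_train, y_train.ravel())
        test_mse[i, j] = np.mean((tree.predict(X_test) - y_test.ravel()) ** 2)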
Example #4
    def kfold_cross_validation(self,
                               method,
                               descent_method='SGD-skl',
                               deg=0,
                               Niterations=100,
                               lambd=0.01,
                               eta=0.000005,
                               m=5,
                               verbose=False):
        """Method that implements the k-fold cross-validation algorithm. It takes
        as input the method we want to use. if "least squares" an ordinary OLS will be evaulated.
        if "ridge" then the ridge method will be used, and respectively the same for "lasso"."""

        inst = self.inst
        lowest_mse = 1e5

        self.mse = []
        self.R2 = []
        self.mse_train = []
        self.R2_train = []
        self.bias = []
        self.variance = []
        self.accuracy = []
        self.design_matrix = fit(inst)
        self.rocaucs = []
        self.area_ratios = []
        #whole_DM = self.design_matrix.create_design_matrix(deg=deg).copy() #design matrix for the whole dataset
        #whole_y = inst.y_1d.copy() #save the whole output

        for i in range(self.inst.k):
            #pick the i-th set as test
            inst.sort_training_test_kfold(i)
            inst.fill_array_test_training()
            self.design_matrix.create_design_matrix(
                deg=deg
            )  #create design matrix for the training set, and evaluate

            if method == 'OLS':
                y_train, beta_train = self.design_matrix.fit_design_matrix_numpy(
                )
            elif method == "Ridge":
                y_train, beta_train = self.design_matrix.fit_design_matrix_ridge(
                    lambd)
            elif method == "LASSO":
                y_train, beta_train = self.design_matrix.fit_design_matrix_lasso(
                    lambd, maxiter=Niterations)
            elif method == 'logreg':
                y_train, beta_train = self.design_matrix.fit_design_matrix_logistic_regression(
                    descent_method=descent_method,
                    eta=eta,
                    Niteration=Niterations,
                    m=m,
                    verbose=verbose)

            else:
                sys.exit("Wrongly designated method: " + str(method) + " not found")

            #Predict the test-set values with the model fitted on the training set
            X_test = self.design_matrix.create_design_matrix(x=inst.test_x_1d,
                                                             N=inst.N_testing,
                                                             deg=deg)
            y_pred = self.design_matrix.test_design_matrix(beta_train,
                                                           X=X_test)

            #Take the real target values from the test dataset for comparison (and also a rescaled set)
            y_test = inst.test_y_1d
            _, y_test_rescaled = inst.rescale_back(x=inst.test_x_1d,
                                                   y=inst.test_y_1d,
                                                   split=True)
            target = y_test_rescaled.astype(int)

            #Calculate the prediction for the whole dataset
            #whole_y_pred = self.design_matrix.test_design_matrix(beta_train, X=whole_DM)

            if method == 'logreg':
                # Statistically evaluate the training set with test and predicted solution.
                y_pred_onehot = np.column_stack((1 - y_pred, y_pred))
                accuracy_batch = statistics.calc_accuracy(target, y_pred)
                rocaucs_batch = statistics.calc_rocauc(target, y_pred)

                max_area_test = statistics.calc_cumulative_auc(
                    target, make_onehot(target))
                area_ratio_batch = (statistics.calc_cumulative_auc(
                    target, y_pred_onehot) - 0.5) / (max_area_test - 0.5)
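                # The area ratio above compares the cumulative-gain area of the
                # prediction with that of a perfect model, both measured
                # relative to the 0.5 area of a random baseline.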
                self.accuracy.append(accuracy_batch)
                self.rocaucs.append(rocaucs_batch)
                self.area_ratios.append(area_ratio_batch)
            else:

                # Statistically evaluate the training set with test and predicted solution.
                mse, calc_r2 = statistics.calc_statistics(y_test, y_pred)

                # Statistically evaluate the training set with itself
                mse_train, calc_r2_train = statistics.calc_statistics(
                    inst.y_1d, y_train)

                # Get the values for the bias and the variance
                bias, variance = statistics.calc_bias_variance(y_test, y_pred)
                self.mse.append(mse)
                self.R2.append(calc_r2)
                self.mse_train.append(mse_train)
                self.R2_train.append(calc_r2_train)
                self.bias.append(bias)
                self.variance.append(variance)
                # If needed/wanted:
                if abs(mse) < lowest_mse:
                    lowest_mse = abs(mse)
                    self.best_predicting_beta = beta_train
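
# Usage sketch for the method above, mirroring the driver scripts in the other
# examples; `ds` is a hypothetical name for an already-created project dataset:
ds.sort_in_k_batches(5)
sample = sampling(ds)
sample.kfold_cross_validation('Ridge', deg=5, lambd=0.01)
print('Mean CV MSE: ', np.mean(sample.mse), ' Mean CV R2: ', np.mean(sample.R2))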
Example #5
CDds = credit_card_dataset(filename)

#Polish the dataset and divide it into input data and targets
CDds.CreditCardPolish()

#Normalize dataset
CDds.normalize_dataset()

#Divide into train and test sets
if CV:
    CDds.sort_in_k_batches(k)
else:
    CDds.sort_train_test(ratio=0.2, random=False)

#Make model
model = fit(CDds)

#Fit model
model.create_simple_design_matrix()

if CV:
    # Run k-fold CV algorithm and fit models.
    sample = sampling(CDds)
    sample.kfold_cross_validation(method,
                                  deg=deg,
                                  descent_method=desc_method,
                                  eta=input_eta,
                                  Niterations=Niterations,
                                  m=m)

    # Print metrics
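    #A hedged sketch of the metric printing for a "logreg" CV run (assumes
    #numpy as np; mirrors the printout in Example #1):
    print("Cross-validation batches: k = ", k)
    print("Mean accuracy:   ", np.mean(sample.accuracy))
    print("Mean ROC-AUC:    ", np.mean(sample.rocaucs))
    print("Mean area ratio: ", np.mean(sample.area_ratios))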
Example #6
# Normalize the dataset and divide it into k batches
dataset.normalize_dataset()
dataset.sort_in_k_batches(k)

# Run k-fold algorithm and fit models.
sample = sampling(dataset)
sample.kfold_cross_validation(k, method, deg=deg)

# Calculate statistics
print("Batches: k = ", k)
statistics.print_mse(sample.mse)
statistics.print_R2(sample.R2)

# Plotting the best fit with the lowest mse.
dataset.reload_data()
fitted = fit(dataset)
fitted.create_design_matrix(deg=deg)
z_model_norm = fitted.test_design_matrix(sample.best_predicting_beta)
rescaled_dataset = dataset.rescale_back(z=z_model_norm)
z_model = rescaled_dataset[2]

# Generate analytical solution for plotting purposes
analytical = data_generate()
analytical.generate_franke(n, noise=0)

# Plot
plot_3d(rescaled_dataset[0], rescaled_dataset[1], z_model, analytical.x_mesh,
        analytical.y_mesh, analytical.z_mesh, ["surface", "scatter"])

try:
    os.remove("backup_data.npz")
except FileNotFoundError:
    # nothing to remove if no backup file exists
    pass
Example #7
    def kfold_cross_validation(self, k, method, deg=5, lambd=1):
        """Method that implements the k-fold cross-validation algorithm. It takes
        as input the method we want to use. if "least squares" an ordinary OLS will be evaulated.
        if "ridge" then the ridge method will be used, and respectively the same for "lasso"."""

        inst = self.inst
        lowest_mse = 1e5

        self.mse = []
        self.R2 = []
        self.mse_train = []
        self.R2_train = []
        self.bias = []
        self.variance = []
        design_matrix = fit(inst)
        whole_DM = design_matrix.create_design_matrix(
            deg=deg).copy()  #design matrix for the whole dataset
        whole_z = inst.z_1d.copy()  #save the whole output

        for i in range(self.inst.k):
            #pick the i-th set as test
            inst.sort_training_test_kfold(i)
            inst.fill_array_test_training()

            design_matrix.create_design_matrix(
                deg=deg
            )  #create design matrix for the training set, and evaluate
            if method == "least squares":
                z_train, beta_train = design_matrix.fit_design_matrix_numpy()
            elif method == "ridge":
                z_train, beta_train = design_matrix.fit_design_matrix_ridge(
                    lambd)
            elif method == "lasso":
                z_train, beta_train = design_matrix.fit_design_matrix_lasso(
                    lambd)
            else:
                sys.exit("Wrongly designated method: " + str(method) + " not found")

            #Predict the test-set values with the model fitted on the training set
            X_test = design_matrix.create_design_matrix(x=inst.test_x_1d,
                                                        y=inst.test_y_1d,
                                                        z=inst.test_z_1d,
                                                        N=inst.N_testing,
                                                        deg=deg)
            z_pred = design_matrix.test_design_matrix(beta_train, X=X_test)

            #Take the real values from the dataset for comparison
            z_test = inst.test_z_1d

            #Calculate the prediction for the whole dataset
            whole_z_pred = design_matrix.test_design_matrix(beta_train,
                                                            X=whole_DM)

            # Statistically evaluate the training set with test and predicted solution.
            mse, calc_r2 = statistics.calc_statistics(z_test, z_pred)

            # Statistically evaluate the training set with itself
            mse_train, calc_r2_train = statistics.calc_statistics(
                inst.z_1d, z_train)

            # Get the values for the bias and the variance
            bias, variance = statistics.calc_bias_variance(z_test, z_pred)
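            # (calc_bias_variance is assumed to estimate the usual decomposition,
            #  roughly bias ~ mean((z_test - mean(z_pred))**2) and
            #  variance ~ var(z_pred); the project helper may differ.)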

            self.mse.append(mse)
            self.R2.append(calc_r2)
            self.mse_train.append(mse_train)
            self.R2_train.append(calc_r2_train)
            self.bias.append(bias)
            self.variance.append(variance)
            # If needed/wanted:
            if abs(mse) < lowest_mse:
                lowest_mse = abs(mse)
                self.best_predicting_beta = beta_train
Example #8
"""Ordinary least squares fit of the loaded dataset with a polynomial of
fifth order. Also adding MSE and R^2 score."""

# Load data from previously saved file
deg = 5
dataset = data_generate()
dataset.load_data()

# Or you can generate directly.
#dataset = data_generate()
#dataset.generate_franke(n=100, noise=0.2)

# Normalize the dataset
dataset.normalize_dataset()

# Fit design matrix
fitted_model = fit(dataset)

# Ordinary least square fitting
fitted_model.create_design_matrix(deg)
z_model_norm, beta = fitted_model.fit_design_matrix_numpy()

# Statistical evaluation
mse, calc_r2 = statistics.calc_statistics(dataset.z_1d, z_model_norm)
print("Mean square error: ", mse, "\n", "R2 score: ", calc_r2)

# Scale back the dataset
rescaled_dataset = dataset.rescale_back(z=z_model_norm)
#x_model = rescaled_dataset[0]
#y_model = rescaled_dataset[1]
z_model = rescaled_dataset[2]