def score(self, y_true, y_pred):
    """Returns the R^2 score of the prediction.

    Args:
        y_true (ndarray): true response values.
        y_pred (ndarray): predicted response values.

    Returns:
        float: the R^2 score.
    """
    return metrics.R2(y_true, y_pred)
def sk_learn_bootstrap(x, y, z, design_matrix, kf_reg, N_bs=100,
                       test_percent=0.4, print_results=True):
    """Scikit-learn bootstrap method."""
    x_train, x_test, y_train, y_test = sk_modsel.train_test_split(
        np.c_[x.ravel(), y.ravel()], z.ravel(),
        test_size=test_percent, shuffle=False)

    # Ensures we are on axis shape (N_observations, N_predictors)
    y_test = y_test.reshape(-1, 1)
    y_train = y_train.reshape(-1, 1)

    y_pred = np.empty((y_test.shape[0], N_bs))

    X_test = design_matrix(x_test)
    X_train = design_matrix(x_train)

    beta_coefs = []

    for i_bs in tqdm(range(N_bs), desc="SciKit-Learn bootstrap"):
        # Resamples the design matrix and response with replacement
        X_boot, y_boot = sk_utils.resample(X_train, y_train)

        kf_reg.fit(X_boot, y_boot)
        y_pred[:, i_bs] = kf_reg.predict(X_test).ravel()
        beta_coefs.append(kf_reg.coef_)

    # R^2 score, 1 - sum((y - y_approx)**2)/sum((y - mean(y))**2),
    # computed per bootstrap sample and averaged
    R2 = np.mean(metrics.R2(y_test, y_pred, axis=0))

    # Mean square error, mean((y - y_approx)**2)
    _mse = (y_test - y_pred)**2
    MSE = np.mean(np.mean(_mse, axis=1, keepdims=True))

    # Bias^2, mean((y - mean(y_approx))**2)
    _mean_pred = np.mean(y_pred, axis=1, keepdims=True)
    bias = np.mean((y_test - _mean_pred)**2)

    # Variance, mean(var(y_approx))
    var = np.mean(np.var(y_pred, axis=1, keepdims=True))

    beta_coefs_var = np.asarray(beta_coefs).var(axis=0)
    beta_coefs = np.asarray(beta_coefs).mean(axis=0)

    if print_results:
        print("R2:    {:-20.16f}".format(R2))
        print("MSE:   {:-20.16f}".format(MSE))
        print("Bias^2:{:-20.16f}".format(bias))
        print("Var(y):{:-20.16f}".format(var))
        print("Beta coefs: {}".format(beta_coefs))
        print("Beta coefs variances: {}".format(beta_coefs_var))
        print("Diff: {}".format(abs(MSE - bias - var)))

    results = {
        "y_pred": np.mean(y_pred, axis=1),
        "y_pred_var": np.var(y_pred, axis=1),
        "mse": MSE,
        "r2": R2,
        "var": var,
        "bias": bias,
        "beta_coefs": beta_coefs,
        "beta_coefs_var": beta_coefs_var,
        "beta_95c": np.sqrt(beta_coefs_var) * 2,
        "diff": abs(MSE - bias - var),
    }

    return results
def sk_learn_k_fold_cv(x, y, z, kf_reg, design_matrix, k_splits=4,
                       test_percent=0.4, print_results=True):
    """Scikit-learn method for k-fold cross validation."""
    x_train, x_test, y_train, y_test = sk_modsel.train_test_split(
        np.c_[x.ravel(), y.ravel()], z.ravel(),
        test_size=test_percent, shuffle=True)

    kf = sk_modsel.KFold(n_splits=k_splits)

    X_test = design_matrix(x_test)
    X_train = design_matrix(x_train)

    y_pred_list = []
    beta_coefs = []

    for train_index, test_index in tqdm(
            kf.split(X_train),
            desc="SciKit-Learn k-fold Cross Validation"):

        kX_train = X_train[train_index]
        kY_train = y_train[train_index]

        kf_reg.fit(kX_train, kY_train)

        # Every fold's model is evaluated on the common holdout test set
        y_pred_list.append(kf_reg.predict(X_test))
        beta_coefs.append(kf_reg.coef_)

    y_pred_list = np.asarray(y_pred_list)

    # Mean square error, mean((y - y_approx)**2)
    _mse = (y_test - y_pred_list)**2
    MSE = np.mean(np.mean(_mse, axis=0, keepdims=True))

    # Bias^2, mean((y - mean(y_approx))**2)
    _mean_pred = np.mean(y_pred_list, axis=0, keepdims=True)
    bias = np.mean((y_test - _mean_pred)**2)

    # R^2 score per fold (reduced over the observation axis, which is
    # axis 1 for the (k_splits, N_test) prediction array), then averaged
    R2 = np.mean(metrics.R2(y_test, y_pred_list, axis=1))

    # Variance, mean(var(y_approx))
    var = np.mean(np.var(y_pred_list, axis=0, keepdims=True))

    beta_coefs_var = np.asarray(beta_coefs).var(axis=0)
    beta_coefs = np.asarray(beta_coefs).mean(axis=0)

    if print_results:
        print("R2:    {:-20.16f}".format(R2))
        print("MSE:   {:-20.16f}".format(MSE))
        print("Bias^2:{:-20.16f}".format(bias))
        print("Var(y):{:-20.16f}".format(var))
        print("Beta coefs: {}".format(beta_coefs))
        print("Beta coefs variances: {}".format(beta_coefs_var))
        print("Diff: {}".format(abs(MSE - bias - var)))

    results = {
        "y_pred": np.mean(y_pred_list, axis=0),
        "y_pred_var": np.var(y_pred_list, axis=0),
        "mse": MSE,
        "r2": R2,
        "var": var,
        "bias": bias,
        "beta_coefs": beta_coefs,
        "beta_coefs_var": beta_coefs_var,
        "beta_95c": np.sqrt(beta_coefs_var) * 2,
        "diff": abs(MSE - bias - var),
    }

    return results
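# Illustrative call sketch (data setup as in _example_sk_learn_bootstrap
# above; note the argument order here is regressor before design_matrix):
#
#     res = sk_learn_k_fold_cv(
#         x, y, z, sk_model.LinearRegression(fit_intercept=False),
#         poly.transform, k_splits=4, test_percent=0.4)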
def __init__(self, x, y, z, deg=1, N_bs=100, N_cv_bs=100, k_splits=4,
             test_percent=0.4, print_results=False):
    """Manual implementation of OLS with k-fold CV, Monte Carlo CV and
    bootstrap resampling."""
    poly = sk_preproc.PolynomialFeatures(degree=deg, include_bias=True)
    X = poly.fit_transform(cp.deepcopy(np.c_[x.ravel(), y.ravel()]))

    linreg = reg.OLSRegression()
    linreg.fit(X, cp.deepcopy(z.ravel()))
    z_predict_ = linreg.predict(X).ravel()

    if print_results:
        print("R2:    {:-20.16f}".format(metrics.R2(z.ravel(), z_predict_)))
        print("MSE:   {:-20.16f}".format(metrics.mse(z.ravel(), z_predict_)))
        print("Bias:  {:-20.16f}".format(metrics.bias2(z.ravel(), z_predict_)))
        print("Beta coefs: {}".format(linreg.coef_))
        print("Beta coefs variances: {}".format(linreg.coef_var))

    self.data["regression"] = {
        "y_pred": z_predict_,
        "r2": metrics.R2(z.ravel(), z_predict_),
        "mse": metrics.mse(z.ravel(), z_predict_),
        "bias": metrics.bias2(z.ravel(), z_predict_),
        "beta_coefs": linreg.coef_,
        "beta_coefs_var": linreg.coef_var,
        "beta_95c": np.sqrt(linreg.coef_var) * 2,
    }

    # Resampling with k-fold cross validation
    kfcv = cv.kFoldCrossValidation(
        cp.deepcopy(np.c_[x.ravel(), y.ravel()]), cp.deepcopy(z.ravel()),
        reg.OLSRegression(), poly.transform)
    kfcv.cross_validate(k_splits=k_splits, test_percent=test_percent)

    if print_results:
        print("R2:    {:-20.16f}".format(kfcv.R2))
        print("MSE:   {:-20.16f}".format(kfcv.MSE))
        print("Bias^2:{:-20.16f}".format(kfcv.bias))
        print("Var(y):{:-20.16f}".format(kfcv.var))
        print("Beta coefs: {}".format(kfcv.coef_))
        print("Beta coefs variances: {}".format(kfcv.coef_var))
        print("MSE = Bias^2 + Var(y):")
        print("{} = {} + {} = {}".format(
            kfcv.MSE, kfcv.bias, kfcv.var, kfcv.bias + kfcv.var))
        print("Diff: {}".format(abs(kfcv.bias + kfcv.var - kfcv.MSE)))

    self._fill_data(kfcv, "kfoldcv")

    # Resampling with Monte Carlo cross validation
    mccv = cv.MCCrossValidation(
        cp.deepcopy(np.c_[x.ravel(), y.ravel()]), cp.deepcopy(z.ravel()),
        reg.OLSRegression(), poly.transform)
    mccv.cross_validate(N_cv_bs, k_splits=k_splits,
                        test_percent=test_percent)

    if print_results:
        print("R2:    {:-20.16f}".format(mccv.R2))
        print("MSE:   {:-20.16f}".format(mccv.MSE))
        print("Bias^2:{:-20.16f}".format(mccv.bias))
        print("Var(y):{:-20.16f}".format(mccv.var))
        print("Beta coefs: {}".format(mccv.coef_))
        print("Beta coefs variances: {}".format(mccv.coef_var))
        print("MSE = Bias^2 + Var(y):")
        print("{} = {} + {} = {}".format(
            mccv.MSE, mccv.bias, mccv.var, mccv.bias + mccv.var))
        print("Diff: {}".format(abs(mccv.bias + mccv.var - mccv.MSE)))

    self._fill_data(mccv, "mccv")

    # Resampling with bootstrapping
    bs_reg = bs.BootstrapRegression(
        cp.deepcopy(np.c_[x.ravel(), y.ravel()]), cp.deepcopy(z.ravel()),
        reg.OLSRegression(), poly.transform)
    bs_reg.bootstrap(N_bs, test_percent=test_percent)

    if print_results:
        print("R2:    {:-20.16f}".format(bs_reg.R2))
        print("MSE:   {:-20.16f}".format(bs_reg.MSE))
        print("Bias^2:{:-20.16f}".format(bs_reg.bias))
        print("Var(y):{:-20.16f}".format(bs_reg.var))
        print("Beta coefs: {}".format(bs_reg.coef_))
        print("Beta coefs variances: {}".format(bs_reg.coef_var))
        print("MSE = Bias^2 + Var(y):")
        print("{} = {} + {} = {}".format(
            bs_reg.MSE, bs_reg.bias, bs_reg.var, bs_reg.bias + bs_reg.var))
        print("Diff: {}".format(abs(bs_reg.bias + bs_reg.var - bs_reg.MSE)))

    self._fill_data(bs_reg, "bootstrap")
def __init__(self, x, y, z, deg=1, N_bs=100, N_cv_bs=100, k_splits=4,
             test_percent=0.4, print_results=False):
    """Scikit-learn implementation of OLS."""
    poly = sk_preproc.PolynomialFeatures(degree=deg, include_bias=True)
    X = poly.fit_transform(np.c_[cp.deepcopy(x).reshape(-1, 1),
                                 cp.deepcopy(y).reshape(-1, 1)])

    linreg = sk_model.LinearRegression(fit_intercept=False)
    linreg.fit(X, z.ravel())
    z_predict_ = linreg.predict(X)

    r2 = metrics.R2(z.ravel(), z_predict_)
    bias = metrics.bias2(z.ravel(), z_predict_)
    mse_error = metrics.mse(z.ravel(), z_predict_)

    # Unbiased estimate of the noise variance, then the OLS coefficient
    # variances from the diagonal of (X^T X)^{-1}
    N, P = X.shape
    z_variance = np.sum((z.ravel() - z_predict_)**2) / (N - P - 1)
    linreg_coef_var = np.diag(np.linalg.inv(X.T @ X)) * z_variance

    self.data["regression"] = {
        "y_pred": z_predict_,
        "r2": r2,
        "mse": mse_error,
        "bias": bias,
        "beta_coefs": linreg.coef_,
        "beta_coefs_var": linreg_coef_var,
        "beta_95c": np.sqrt(linreg_coef_var) * 2,
    }

    if print_results:
        print("R2:    {:-20.16f}".format(r2))
        print("MSE:   {:-20.16f}".format(mse_error))
        print("Bias:  {:-20.16f}".format(bias))
        print("Beta coefs: {}".format(linreg.coef_))
        print("Beta coefs variances: {}".format(linreg_coef_var))

    # Resampling with k-fold cross validation
    sk_kfold_res = sk_resampling.sk_learn_k_fold_cv(
        cp.deepcopy(x), cp.deepcopy(y), cp.deepcopy(z),
        sk_model.LinearRegression(fit_intercept=False), poly.transform,
        test_percent=test_percent, k_splits=k_splits,
        print_results=print_results)
    self.data["kfoldcv"] = sk_kfold_res

    # Resampling with bootstrapping
    bs_reg = bs.BootstrapRegression(
        cp.deepcopy(np.c_[x.ravel(), y.ravel()]), cp.deepcopy(z.ravel()),
        sk_model.LinearRegression(fit_intercept=False), poly.transform)
    bs_reg.bootstrap(N_bs, test_percent=test_percent)
    self._fill_data(bs_reg, "bootstrap")

    if print_results:
        print("R2:    {:-20.16f}".format(bs_reg.R2))
        print("MSE:   {:-20.16f}".format(bs_reg.MSE))
        print("Bias^2:{:-20.16f}".format(bs_reg.bias))
        print("Var(y):{:-20.16f}".format(bs_reg.var))
        print("Beta coefs: {}".format(bs_reg.coef_))
        print("Beta coefs variances: {}".format(bs_reg.coef_var))
        print("MSE = Bias^2 + Var(y):")
        print("{} = {} + {} = {}".format(
            bs_reg.MSE, bs_reg.bias, bs_reg.var, bs_reg.bias + bs_reg.var))
        print("Diff: {}".format(abs(bs_reg.bias + bs_reg.var - bs_reg.MSE)))
def cross_validate(self, k_splits=5, test_percent=0.2):
    """Performs k-fold cross validation with a common holdout test set.

    Args:
        k_splits (int): number of folds to split the training data into.
            Default is 5.
        test_percent (float): percentage of the data reserved as the
            holdout test set. Default is 0.2.
    """
    N_total_size = self.x_data.shape[0]

    # Splits the dataset into a holdout test chunk, on which bias,
    # variance etc. are found, and a chunk to perform k-fold CV on.
    holdout_test_size = int(np.floor(N_total_size * test_percent))

    # Shuffles x and y with a single shared permutation so the (x, y)
    # correspondence is preserved
    shuffle_indices = np.random.permutation(N_total_size)
    self.x_data = self.x_data[shuffle_indices]
    self.y_data = self.y_data[shuffle_indices]

    # Manual splitting
    x_holdout_test = self.x_data[:holdout_test_size, :]
    x_kfold_train = self.x_data[holdout_test_size:, :]
    y_holdout_test = self.y_data[:holdout_test_size]
    y_kfold_train = self.y_data[holdout_test_size:]

    # Sets up the holdout design matrix
    X_holdout_test = self._design_matrix(x_holdout_test)

    # Splits the k-fold training data into k folds
    x_subdata = np.array_split(x_kfold_train, k_splits, axis=0)
    y_subdata = np.array_split(y_kfold_train, k_splits, axis=0)

    beta_coefs = []
    self.y_pred_list = np.empty((k_splits, holdout_test_size))

    for ik in tqdm(range(k_splits), desc="k-fold Cross Validation"):
        # Sets up the training folds: all folds except fold ik
        set_list = list(range(k_splits))
        set_list.pop(ik)
        k_x_train = np.concatenate([x_subdata[d] for d in set_list])
        k_y_train = np.concatenate([y_subdata[d] for d in set_list])

        # Trains the method by fitting the training folds
        self.reg.fit(self._design_matrix(k_x_train), k_y_train)

        # Every fold's model is evaluated on the common holdout test set
        y_predict = self.reg.predict(X_holdout_test).ravel()

        # Stores the prediction and beta coefs
        self.y_pred_list[ik] = y_predict
        beta_coefs.append(self.reg.coef_)

    # Mean square error, mean((y - y_approx)**2)
    _mse = (y_holdout_test - self.y_pred_list)**2
    self.MSE = np.mean(np.mean(_mse, axis=0, keepdims=True))

    # Bias^2, mean((y - mean(y_approx))**2)
    _mean_pred = np.mean(self.y_pred_list, axis=0, keepdims=True)
    self.bias = np.mean((y_holdout_test - _mean_pred)**2)

    # R^2 score per fold (reduced over the observation axis), then averaged
    _R2 = metrics.R2(y_holdout_test, self.y_pred_list, axis=1)
    self.R2 = np.mean(_R2)

    # Variance, mean(var(y_approx))
    self.var = np.mean(np.var(self.y_pred_list, axis=0, keepdims=True))

    beta_coefs = np.asarray(beta_coefs)
    self.beta_coefs_var = beta_coefs.var(axis=0)
    self.beta_coefs = beta_coefs.mean(axis=0)

    self.x_pred_test = x_holdout_test
    self.y_pred = np.mean(self.y_pred_list, axis=0)
    self.y_pred_var = np.var(self.y_pred_list, axis=0)
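# Illustrative check (a sketch; assumes the class is constructed as in the
# OLS wrapper above, i.e. with X values, y values, a regressor instance and
# a design-matrix callable). After cross_validate() the decomposition
# MSE = bias^2 + var should hold up to the irreducible noise term:
#
#     kfcv = kFoldCrossValidation(np.c_[x.ravel(), y.ravel()], z.ravel(),
#                                 OLSRegression(), poly.transform)
#     kfcv.cross_validate(k_splits=4, test_percent=0.2)
#     print(abs(kfcv.MSE - (kfcv.bias + kfcv.var)))  # approx. noise variance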
def cross_validate(self, N_mc_crossvalidations, k_splits=4,
                   test_percent=0.2):
    """Performs Monte Carlo cross validation with a common holdout test set.

    Args:
        N_mc_crossvalidations (int): number of random train/test draws.
        k_splits (int): sets the test size of each draw to 1/k_splits of
            the training data. Default is 4.
        test_percent (float): percentage of the data reserved as the
            holdout test set. Default is 0.2.
    """
    # Splits the dataset into a holdout test chunk, on which bias,
    # variance etc. are found, and a chunk to perform MC-CV on.
    x_mc_train, x_holdout_test, y_mc_train, y_holdout_test = \
        sk_modsel.train_test_split(self.x_data, self.y_data,
                                   test_size=test_percent)

    holdout_test_size = y_holdout_test.shape[0]
    N_mc_data = len(x_mc_train)

    # Sets up the holdout design matrix
    X_holdout_test = self._design_matrix(x_holdout_test)

    # Test size per Monte Carlo draw
    mc_test_size = int(np.floor(N_mc_data / k_splits))

    # All available indices
    mc_indices = np.arange(N_mc_data)

    beta_coefs = []
    self.y_pred_list = np.empty((N_mc_crossvalidations, holdout_test_size))

    # Sets up the training design matrix beforehand
    X_mc_train = self._design_matrix(x_mc_train)

    for i_mc in tqdm(range(N_mc_crossvalidations),
                     desc="Monte Carlo Cross Validation"):
        # Retrieves the test indices for this MC-CV draw without
        # replacement, and uses the complement as training indices
        mccv_test_indices = np.random.choice(mc_indices, mc_test_size,
                                             replace=False)
        mccv_train_indices = np.setdiff1d(mc_indices, mccv_test_indices)

        X_train = X_mc_train[mccv_train_indices]
        k_y_train = y_mc_train[mccv_train_indices]

        # Trains the method by fitting the training draw
        self.reg.fit(X_train, k_y_train)

        # Every draw's model is evaluated on the common holdout test set
        y_predict = self.reg.predict(X_holdout_test).ravel()

        # Stores the prediction and beta coefs
        self.y_pred_list[i_mc] = y_predict
        beta_coefs.append(self.reg.coef_)

    # Mean square error, mean((y - y_approx)**2)
    _mse = (y_holdout_test - self.y_pred_list)**2
    self.MSE = np.mean(np.mean(_mse, axis=0, keepdims=True))

    # Bias^2, mean((y - mean(y_approx))**2)
    _mean_pred = np.mean(self.y_pred_list, axis=0, keepdims=True)
    self.bias = np.mean((y_holdout_test - _mean_pred)**2)

    # R^2 score per draw (reduced over the observation axis), then averaged
    _R2 = metrics.R2(y_holdout_test, self.y_pred_list, axis=1)
    self.R2 = np.mean(_R2)

    # Variance, mean(var(y_approx))
    self.var = np.mean(np.var(self.y_pred_list, axis=0, keepdims=True))

    beta_coefs = np.asarray(beta_coefs)
    self.beta_coefs_var = beta_coefs.var(axis=0)
    self.beta_coefs = beta_coefs.mean(axis=0)

    self.x_pred_test = x_holdout_test
    self.y_pred = np.mean(self.y_pred_list, axis=0)
    self.y_pred_var = np.var(self.y_pred_list, axis=0)
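# Minimal sketch of the train/test index split used in the MC-CV loop above
# (the names are illustrative): draw the test indices without replacement,
# then take the complement as training indices.
#
#     indices = np.arange(N_mc_data)
#     test_idx = np.random.choice(indices, mc_test_size, replace=False)
#     train_idx = np.setdiff1d(indices, test_idx)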
def cross_validate(self, k_splits=4, kk_splits=4, test_percent=0.2):
    """Performs nested k-fold cross validation.

    Args:
        k_splits (int): number of outer holdout folds. Default is 4.
        kk_splits (int): number of inner folds per outer fold. Default
            is 4.
        test_percent (float): unused here; the outer folds determine the
            holdout size. Default is 0.2.
    """
    N_total_size = len(self.x_data)

    # Splits the dataset into k_splits holdout chunks; each chunk is used
    # once as the holdout test set for an inner k-fold CV. Note that
    # np.split requires N_total_size to be divisible by k_splits.
    holdout_test_size = int(np.floor(N_total_size / k_splits))
    x_holdout_data = np.split(self.x_data, k_splits, axis=0)
    y_holdout_data = np.split(self.y_data, k_splits, axis=0)

    # Arrays for storing the per-holdout MSE, bias, var and R^2 scores
    MSE_arr = np.empty(k_splits)
    R2_arr = np.empty(k_splits)
    var_arr = np.empty(k_splits)
    bias_arr = np.empty(k_splits)

    beta_coefs = []
    x_pred_test = []
    y_pred_mean_list = []
    y_pred_var_list = []

    for i_holdout in tqdm(range(k_splits),
                          desc="Nested k-fold Cross Validation"):
        # Gets the holdout test data; every chunk is used exactly once
        x_holdout_test = x_holdout_data[i_holdout]
        y_holdout_test = y_holdout_data[i_holdout]

        # Sets up the remaining chunks as the outer training data
        holdout_set_list = list(range(k_splits))
        holdout_set_list.pop(i_holdout)
        x_holdout_train = np.concatenate(
            [x_holdout_data[d] for d in holdout_set_list])
        y_holdout_train = np.concatenate(
            [y_holdout_data[d] for d in holdout_set_list])

        # Sets up the holdout design matrix
        X_holdout_test = self._design_matrix(x_holdout_test)

        # Splits the outer training data into kk folds
        x_subdata = np.array_split(x_holdout_train, kk_splits, axis=0)
        y_subdata = np.array_split(y_holdout_train, kk_splits, axis=0)

        self.y_pred_list = np.empty((kk_splits, holdout_test_size))

        for ik in range(kk_splits):
            # Sets up the inner training folds: all folds except fold ik
            set_list = list(range(kk_splits))
            set_list.pop(ik)
            k_x_train = np.concatenate([x_subdata[d] for d in set_list])
            k_y_train = np.concatenate([y_subdata[d] for d in set_list])

            # Trains the method by fitting the inner training folds
            X_train = self._design_matrix(k_x_train)
            self.reg.fit(X_train, k_y_train)

            # Every inner model is evaluated on the outer holdout test set
            y_predict = self.reg.predict(X_holdout_test).ravel()

            # Stores the prediction and beta coefs
            self.y_pred_list[ik] = y_predict
            beta_coefs.append(self.reg.coef_)

        # Mean square error, mean((y - y_approx)**2)
        _mse = (y_holdout_test - self.y_pred_list)**2
        MSE_arr[i_holdout] = np.mean(np.mean(_mse, axis=0, keepdims=True))

        # Bias^2, mean((y - mean(y_approx))**2)
        _mean_pred = np.mean(self.y_pred_list, axis=0, keepdims=True)
        bias_arr[i_holdout] = np.mean((y_holdout_test - _mean_pred)**2)

        # R^2 score per inner fold, then averaged
        _R2 = metrics.R2(y_holdout_test, self.y_pred_list, axis=1)
        R2_arr[i_holdout] = np.mean(_R2)

        # Variance, mean(var(y_approx))
        _var = np.var(self.y_pred_list, axis=0, keepdims=True)
        var_arr[i_holdout] = np.mean(_var)

        x_pred_test.append(x_holdout_test)
        y_pred_mean_list.append(np.mean(self.y_pred_list, axis=0))
        y_pred_var_list.append(np.var(self.y_pred_list, axis=0))

    self.var = np.mean(var_arr)
    self.bias = np.mean(bias_arr)
    self.R2 = np.mean(R2_arr)
    self.MSE = np.mean(MSE_arr)

    beta_coefs = np.asarray(beta_coefs)
    self.beta_coefs_var = beta_coefs.var(axis=0)
    self.beta_coefs = beta_coefs.mean(axis=0)

    self.x_pred_test = np.array(x_pred_test)
    self.y_pred = np.array(y_pred_mean_list)
    self.y_pred_var = np.array(y_pred_var_list)
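# Note on the outer splitting above: np.split requires len(x_data) to be
# evenly divisible by k_splits and raises a ValueError otherwise (the inner
# np.array_split has no such requirement). A guard one could add before
# calling cross_validate (illustrative sketch):
#
#     assert len(x_data) % k_splits == 0, (
#         "nested k-fold CV requires the sample size to be divisible "
#         "by k_splits")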
def bootstrap(self, N_bs, test_percent=0.25):
    """Performs a bootstrap for a given regression method and design
    matrix function.

    Args:
        N_bs (int): number of bootstraps to perform.
        test_percent (float): percentage of the data to reserve for
            testing. Default is 0.25.
    """
    assert not isinstance(self._reg, type(None))
    assert not isinstance(self._design_matrix, type(None))
    assert test_percent < 1.0, "test_percent must be less than one."

    # Splits into training and test sets
    x_train, x_test, y_train, y_test = \
        sk_modsel.train_test_split(self.x_data, self.y_data,
                                   test_size=test_percent)
    test_size = x_test.shape[0]

    # Sets up empty arrays for gathering the relevant scores
    R2_list = np.empty(N_bs)
    beta_coefs = []
    y_pred_list = np.empty((N_bs, test_size))

    # Sets up the test design matrix
    X_test = self._design_matrix(x_test)

    # Bootstraps
    for i_bs in tqdm(range(N_bs), desc="Bootstrapping"):
        # Resamples the training data with replacement
        x_boot, y_boot = boot(x_train, y_train)
        X_boot = self._design_matrix(x_boot)

        # Fits the bootstrapped values
        self.reg.fit(X_boot, y_boot)

        # Predicts the y_test values with the bootstrapped model
        y_predict = self.reg.predict(X_test)

        # Calculates R^2 for this bootstrap sample
        R2_list[i_bs] = metrics.R2(y_test, y_predict)

        # Stores the prediction and beta coefs
        y_pred_list[i_bs] = y_predict.ravel()
        beta_coefs.append(self.reg.coef_)

    # R^2 score, 1 - sum((y - y_approx)**2)/sum((y - mean(y))**2),
    # averaged over the bootstrap samples
    self.R2 = np.mean(R2_list)

    # Mean square error, mean((y - y_approx)**2)
    _mse = np.mean((y_test.ravel() - y_pred_list)**2,
                   axis=0, keepdims=True)
    self.MSE = np.mean(_mse)

    # Bias^2, mean((y - mean(y_approx))**2)
    _y_pred_mean = np.mean(y_pred_list, axis=0, keepdims=True)
    self.bias = np.mean((y_test.ravel() - _y_pred_mean)**2)

    # Variance, mean(var(y_approx))
    self.var = np.mean(np.var(y_pred_list, axis=0, keepdims=True))

    beta_coefs = np.asarray(beta_coefs)
    self.beta_coefs_var = beta_coefs.var(axis=0)
    self.beta_coefs = beta_coefs.mean(axis=0)

    self.x_pred_test = x_test
    self.y_pred = y_pred_list.mean(axis=0)
    self.y_pred_var = y_pred_list.var(axis=0)
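# Illustrative usage sketch (construction mirrors the OLS wrappers above;
# OLSRegression and poly.transform are assumptions for the example):
#
#     bs_reg = BootstrapRegression(np.c_[x.ravel(), y.ravel()], z.ravel(),
#                                  OLSRegression(), poly.transform)
#     bs_reg.bootstrap(N_bs=100, test_percent=0.25)
#     print(bs_reg.MSE, bs_reg.bias + bs_reg.var)  # MSE approx. bias^2 + var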