def nll(self, num_models, model_type, X_train, y_train, num_averaged):
    """Average the correction factors from ``CF.nll()`` over repeated CV runs.

    For every entry of ``num_models`` the cross-validation residuals and
    model errors are generated ``num_averaged`` times, each time with a
    freshly drawn random seed, and the resulting ``a`` and ``b`` correction
    factors are summarised by their mean and standard deviation.

    Args:
        num_models: Sequence of ensemble sizes; each is forwarded as
            ``model_num`` to ``CVData.get_residuals_and_model_errors``.
        model_type: Model identifier forwarded unchanged to
            ``CVData.get_residuals_and_model_errors``.
        X_train: Training features.
        y_train: Training targets; their standard deviation is used to
            normalise residuals and model errors.
        num_averaged: Number of repeated CV runs per ensemble size.

    Returns:
        Tuple ``(a_nll, b_nll)``. Each is a list with one
        ``[model_num, mean, std]`` entry per element of ``num_models``.
    """
    # Normalisation constant shared by every run.
    stdev = np.std(y_train)

    a_nll = []
    b_nll = []
    for i, model_count in enumerate(num_models):
        a_values = np.asarray([])
        b_values = np.asarray([])

        for j in range(num_averaged):
            print("i: {}, j: {}".format(i, j))

            # Draw a new seed so every repetition uses a different CV split.
            CVD = cvd.CVData()
            seed = random.randint(1000000, 10000000)
            residuals, model_errors = CVD.get_residuals_and_model_errors(
                model_type,
                X_train,
                y_train,
                model_num=model_count,
                random_state=seed,
            )

            # Fit the correction factors on the stdev-normalised CV data.
            CF = cf.CorrectionFactors(residuals / stdev, model_errors / stdev)
            a_fit, b_fit, _ = CF.nll()
            a_values = np.append(a_values, a_fit)
            b_values = np.append(b_values, b_fit)

        # Summary statistics for the current ensemble size.
        a_nll.append([model_count, np.mean(a_values), np.std(a_values)])
        b_nll.append([model_count, np.mean(b_values), np.std(b_values)])

    return a_nll, b_nll
def _get_RF_looped(self, dataset, X_values, y_values, model_num, random_state):
    """Collect RF residuals and (un)calibrated model errors over repeated k-fold.

    For every train/test split, correction factors ``a`` and ``b`` are fitted
    on cross-validation data generated from the training portion, then
    applied to the model errors obtained on the held-out portion via
    ``self._get_RF``. All quantities are divided by the standard deviation of
    the split's training targets.

    Args:
        dataset: ``"Diffusion"`` selects 5x5 repeated k-fold; ``"Perovskite"``
            and any other value select 5-fold repeated twice (other values
            additionally print a notice).
        X_values: Feature array, indexed with the split index arrays.
        y_values: Target array, indexed with the split index arrays.
        model_num: Ensemble size; used both when fitting the correction
            factors and when evaluating the held-out data.
        random_state: Seed passed to ``RepeatedKFold``.

    Returns:
        Tuple ``(RF_resid, RF_unscaled_model_errors, RF_scaled_model_errors,
        a_array, b_array)``. The first three are flat arrays concatenated
        over all splits; the last two hold one value per split.
    """
    if dataset == "Diffusion":
        rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=random_state)
    elif dataset == "Perovskite":
        rkf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=random_state)
    else:
        rkf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=random_state)
        print("Neither 'Diffusion' nor 'Perovskite' was specified as the dataset for get_residuals_and_model_errors_looped function.")
        print("Setting repeated k-fold to 5-fold splits repeated twice.")

    # RF
    RF_unscaled_model_errors = np.asarray([])
    RF_scaled_model_errors = np.asarray([])
    RF_resid = np.asarray([])
    a_array = []
    b_array = []

    for train_index, test_index in rkf.split(X_values):
        X_train, X_test = X_values[train_index], X_values[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]

        # Get CV residuals and model errors. The ensemble size must match
        # the one used for the held-out predictions below; otherwise the
        # correction factors are fitted for a different model than the one
        # they are applied to.
        CVD = cvd.CVData()
        CV_residuals, CV_model_errors = CVD.get_residuals_and_model_errors(
            "RF", X_train, y_train, model_num=model_num)

        # Scale residuals and model errors by data set standard deviation
        stdev = np.std(y_train)
        CV_residuals = CV_residuals / stdev
        CV_model_errors = CV_model_errors / stdev

        # Get correction factors
        CF = cf.CorrectionFactors(CV_residuals, CV_model_errors)
        a, b, r_squared = CF.nll()
        print('Correction Factors:')
        print('a: ' + str(a))
        print('b: ' + str(b))
        print('r^2: ' + str(r_squared))

        # Record the newly calculated a and b values
        a_array.append(a)
        b_array.append(b)

        # Get test data residuals and model errors
        Test_residuals, Test_model_errors = self._get_RF(
            X_train, y_train, X_test, y_test, model_num)

        # Scale by the same training-set standard deviation
        Test_residuals = Test_residuals / stdev
        Test_model_errors = Test_model_errors / stdev

        # Scale model errors using the correction factors obtained above
        Test_model_errors_scaled = Test_model_errors * a + b

        # Append results from this split to the arrays to be returned
        # (axis=None flattens before joining).
        RF_unscaled_model_errors = np.concatenate(
            (RF_unscaled_model_errors, Test_model_errors), axis=None)
        RF_scaled_model_errors = np.concatenate(
            (RF_scaled_model_errors, Test_model_errors_scaled), axis=None)
        RF_resid = np.concatenate((RF_resid, Test_residuals), axis=None)

    a_array = np.asarray(a_array)
    b_array = np.asarray(b_array)
    return RF_resid, RF_unscaled_model_errors, RF_scaled_model_errors, a_array, b_array
# Model types to evaluate in this run.
# Options: "RF", "GPR", "LR"
models = ["RF", "LR"]

for model in models:
    print(f"STARTING {model} Friedman 500")

    # Output directory for this model's results
    path = f'Supplemental_Info/Friedman_500/5-Fold/{model}'
    #path = 'plots/'

    # Read the training set from disk
    X_train = np.load('friedman_500_data/training_x_values.npy')
    y_train = np.load('friedman_500_data/training_y_values.npy')

    # Cross-validation residuals and model errors on the training set
    CVD = cvd.CVData()
    CV_residuals, CV_model_errors = CVD.get_residuals_and_model_errors(
        model,
        X_train,
        y_train,
        model_num=200,
    )

    # Normalise both by the standard deviation of the training targets
    stdev = np.std(y_train)
    CV_residuals = CV_residuals / stdev
    CV_model_errors = CV_model_errors / stdev

    # Fit the correction factors and report them
    CF = cf.CorrectionFactors(CV_residuals, CV_model_errors)
    a, b, r_squared = CF.nll()
    print('Correction Factors:')
    print(f'a: {a!s}')
    print(f'b: {b!s}')
    print(f'r^2: {r_squared!s}')