Example #1
0
 def nll(self, num_models, model_type, X_train, y_train, num_averaged):
     """Average NLL correction factors over repeated CV runs.

     For each candidate ensemble size in num_models, repeats the CV
     residual/model-error collection num_averaged times (fresh random
     seed each time), fits correction factors a and b on each run, and
     summarizes them.

     Returns:
         (a_summary, b_summary): two lists of [num_models[i], mean, std]
         triples, one for a and one for b.
     """
     stdev = np.std(y_train)
     a_summary = []
     b_summary = []
     for i, n_models in enumerate(num_models):
         a_samples = np.asarray([])
         b_samples = np.asarray([])
         for j in range(num_averaged):
             print("i: {}, j: {}".format(i, j))
             # Draw a fresh CV split with a random seed for this repetition
             CVD = cvd.CVData()
             seedValue = random.randint(1000000, 10000000)
             residuals, model_errors = CVD.get_residuals_and_model_errors(
                 model_type, X_train, y_train,
                 model_num=n_models, random_state=seedValue)
             # Normalize by the training-set standard deviation
             residuals = residuals / stdev
             model_errors = model_errors / stdev
             # Fit correction factors on this run's CV data
             CF = cf.CorrectionFactors(residuals, model_errors)
             a_curr, b_curr, r_squared = CF.nll()
             a_samples = np.append(a_samples, a_curr)
             b_samples = np.append(b_samples, b_curr)
         # Summarize the repetitions for this ensemble size
         a_summary.append([n_models, np.mean(a_samples), np.std(a_samples)])
         b_summary.append([n_models, np.mean(b_samples), np.std(b_samples)])
     return a_summary, b_summary
Example #2
0
	def _get_RF_looped(self, dataset, X_values, y_values, model_num, random_state):
		"""Run repeated k-fold evaluation of an RF model, refitting the
		uncertainty correction factors (a, b) on each split's CV data.

		Returns:
			(residuals, unscaled model errors, scaled model errors,
			per-split a values, per-split b values), all as numpy arrays,
			with residuals and model errors normalized by each split's
			training-set standard deviation.
		"""
		# Pick the repeat count for the named dataset; anything else falls
		# back to 2 repeats with a warning.
		if dataset == "Diffusion":
			n_repeats = 5
		elif dataset == "Perovskite":
			n_repeats = 2
		else:
			n_repeats = 2
			print("Neither 'Diffusion' nor 'Perovskite' was specified as the dataset for get_residuals_and_model_errors_looped function.")
			print("Setting repeated k-fold to 5-fold splits repeated twice.")
		rkf = RepeatedKFold(n_splits=5, n_repeats=n_repeats, random_state=random_state)
		# Per-split results, concatenated once at the end
		resid_parts = []
		unscaled_parts = []
		scaled_parts = []
		a_list = []
		b_list = []
		for train_index, test_index in rkf.split(X_values):
			X_tr, X_te = X_values[train_index], X_values[test_index]
			y_tr, y_te = y_values[train_index], y_values[test_index]
			# Refit a and b on CV data drawn from this split's training set
			CV_residuals, CV_model_errors = cvd.CVData().get_residuals_and_model_errors("RF", X_tr, y_tr)
			stdev = np.std(y_tr)
			CV_residuals = CV_residuals / stdev
			CV_model_errors = CV_model_errors / stdev
			a, b, r_squared = cf.CorrectionFactors(CV_residuals, CV_model_errors).nll()
			print('Correction Factors:')
			print('a: ' + str(a))
			print('b: ' + str(b))
			print('r^2: ' + str(r_squared))
			a_list.append(a)
			b_list.append(b)
			# Evaluate on the held-out fold, normalized by the same stdev
			test_resid, test_errors = self._get_RF(X_tr, y_tr, X_te, y_te, model_num)
			test_resid = test_resid / stdev
			test_errors = test_errors / stdev
			resid_parts.append(test_resid)
			unscaled_parts.append(test_errors)
			# Apply the freshly fit linear calibration to the model errors
			scaled_parts.append(test_errors * a + b)
		RF_resid = np.concatenate(resid_parts, axis=None)
		RF_unscaled_model_errors = np.concatenate(unscaled_parts, axis=None)
		RF_scaled_model_errors = np.concatenate(scaled_parts, axis=None)
		return RF_resid, RF_unscaled_model_errors, RF_scaled_model_errors, np.asarray(a_list), np.asarray(b_list)
Example #3
0
# Specify what models to run
# Options: "RF", "GPR", "LR"
models = ["RF", "LR"]

# Driver loop: for each model type, load the Friedman-500 training data,
# collect CV residuals and model-error estimates, and fit the uncertainty
# correction factors a and b via CF.nll().
for model in models:
    print("STARTING {} Friedman 500".format(model))
    # Path to save files
    # NOTE(review): `path` is not used in the visible portion of this
    # script — presumably consumed by plotting/saving code below; confirm.
    path = 'Supplemental_Info/Friedman_500/5-Fold/{}'.format(model)
    #path = 'plots/'

    # Load data (pre-saved NumPy arrays for the Friedman 500-point dataset)
    X_train = np.load('friedman_500_data/training_x_values.npy')
    y_train = np.load('friedman_500_data/training_y_values.npy')

    # Get CV residuals and model errors
    # (model_num=200 — presumably the ensemble size; confirm against CVData)
    CVD = cvd.CVData()
    CV_residuals, CV_model_errors = CVD.get_residuals_and_model_errors(
        model, X_train, y_train, model_num=200)

    # Scale residuals and model errors by data set standard deviation
    # so the fitted correction factors are in normalized units
    stdev = np.std(y_train)
    CV_residuals = CV_residuals / stdev
    CV_model_errors = CV_model_errors / stdev

    # Get correction factors from the scaled CV data
    CF = cf.CorrectionFactors(CV_residuals, CV_model_errors)
    a, b, r_squared = CF.nll()
    print('Correction Factors:')
    print('a: ' + str(a))
    print('b: ' + str(b))
    print('r^2: ' + str(r_squared))