def test_linear(eng):
    X = randn(10, 2)
    y = fromarray(randn(10, 4).T, engine=eng)

    truth = asarray(fit_models(LR, X, y))
    betas = LinearRegression().fit(X, y).betas.toarray()
    assert allclose(truth, betas)

    truth = asarray(fit_models(LR, X, y, fit_intercept=False))
    betas = LinearRegression(fit_intercept=False).fit(X, y).betas.toarray()
    assert allclose(truth, betas)
def __build_model(self, var_idx, method='forward'):
    linear_reg = LinearRegression(gradient_descent=False)
    if method == 'forward':
        candidate_features = self.__best_features + [var_idx]
    elif method == 'backward':
        candidate_features = deepcopy(self.__best_features)
        candidate_features.remove(var_idx)
    X = self.X[:, candidate_features]
    linear_reg.fit(X, self.y)
    y_preds = [linear_reg.predict(x) for x in X]
    return calculate_r2(self.y, y_preds)
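# A minimal driver sketch (an illustration, not from the original source) of how
# __build_model could power one round of forward selection. The attributes
# self.X and self.__best_features are taken from the method above; everything
# else here is assumed.
def forward_step(self):
    candidates = [i for i in range(self.X.shape[1]) if i not in self.__best_features]
    r2_scores = {i: self.__build_model(i, method='forward') for i in candidates}
    best = max(r2_scores, key=r2_scores.get)
    self.__best_features.append(best)
    return best, r2_scores[best]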
def test_predict_and_score(eng):
    X = randn(10, 2)
    y = fromarray(randn(10, 4).T, engine=eng)
    model = LinearRegression().fit(X, y)
    yhat = model.predict(X).toarray()
    rsq = model.score(X, y).toarray()
    truth = hstack([yhat, rsq[:, newaxis]])
    result = model.predict_and_score(X, y).toarray()
    assert allclose(truth, result)
def test_predict(eng):
    X = randn(10, 2)
    y = fromarray(randn(10, 4).T, engine=eng)
    truth = asarray(predict_models(LR, X, y))
    predictions = LinearRegression().fit(X, y).predict(X).toarray()
    assert allclose(truth, predictions)
def test_score(eng):
    X = randn(10, 2)
    y = fromarray(randn(10, 4).T, engine=eng)
    truth = asarray(score_models(LR, X, y))
    scores = LinearRegression().fit(X, y).score(X, y).toarray()
    assert allclose(truth, scores)
def single_linear_regression_model(single_linear_regression_data):
    linear_regression_model = LinearRegression(
        independent_vars=single_linear_regression_data["independent_vars"],
        dependent_var=single_linear_regression_data["dependent_var"],
        iterations=10000,
        learning_rate=0.001,
        train_split=0.7,
        seed=123,
    )
    return linear_regression_model
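# A hypothetical fixture feeding the model factory above: the dict keys come
# from the call site, while the data itself is made up for illustration.
import numpy as np
import pytest

@pytest.fixture
def single_linear_regression_data():
    rng = np.random.default_rng(123)
    x = rng.normal(size=(100, 1))
    y = 3.0 * x[:, 0] + rng.normal(scale=0.1, size=100)
    return {"independent_vars": x, "dependent_var": y}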
def test_betas_and_scores(eng):
    X = randn(10, 2)
    y = fromarray(randn(10, 4).T, engine=eng)
    true_betas = asarray(fit_models(LR, X, y))
    true_scores = asarray(score_models(LR, X, y))
    truth = hstack([true_betas, true_scores[:, newaxis]])
    result = LinearRegression().fit(X, y).betas_and_scores.toarray()
    assert allclose(truth, result)
def build_tree(x_data, y_data):
    feature, split_val = best_split(x_data, y_data)
    node = TreeNode(feature, split_val)
    if feature is None:
        # No useful split found: make this node a leaf with a fitted linear model
        node.model = LinearRegression()
        node.model.fit(x_data, y_data)
    else:
        # Recurse on the two halves of the split
        idx = (x_data[:, feature] <= split_val)
        node.left = build_tree(x_data[idx], y_data[idx])
        node.right = build_tree(x_data[~idx], y_data[~idx])
    return node
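# A possible companion traversal (assumed, not in the original): descend to a
# leaf and delegate to its fitted LinearRegression. This assumes leaves are
# exactly the nodes where build_tree attached a .model (i.e. feature is None).
def predict_tree(node, x_row):
    if node.feature is None:  # leaf: build_tree fitted a model here
        return node.model.predict(x_row)
    if x_row[node.feature] <= node.split_val:
        return predict_tree(node.left, x_row)
    return predict_tree(node.right, x_row)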
def build_models(self, df):
    self.n, self.p = df.shape
    performances = []
    for k in range(1, self.p):
        for var_combo in itertools.combinations(df.columns[:-1], k):
            linear_reg = LinearRegression()
            X = np.asarray(df[list(var_combo)])
            self.y = np.asarray(df.iloc[:, -1])
            linear_reg.fit(X, self.y)
            y_preds = [linear_reg.predict(x) for x in X]
            adj_r2, aic, bic, r2, rss = self.__calculate_criterions(y_preds, k)
            performances.append([var_combo, k, aic, bic, rss, r2, adj_r2])
    col_names = [
        'subset', 'num_of_variables', 'aic', 'bic', 'rss', 'r2', 'adj_r2'
    ]
    self.models_summary = pd.DataFrame(performances, columns=col_names)
    self.__visualize_best_subset_performance()
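# A plausible sketch of the __calculate_criterions helper used above (assumed,
# not the original implementation), using the textbook definitions of RSS, R^2,
# adjusted R^2, AIC and BIC for a k-variable model fit on n samples.
import numpy as np

def calculate_criterions_sketch(y, y_preds, n, k):
    y = np.asarray(y)
    y_preds = np.asarray(y_preds)
    rss = np.sum((y - y_preds) ** 2)       # residual sum of squares
    tss = np.sum((y - y.mean()) ** 2)      # total sum of squares
    r2 = 1 - rss / tss
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)
    aic = n * np.log(rss / n) + 2 * k      # up to an additive constant
    bic = n * np.log(rss / n) + k * np.log(n)
    return adj_r2, aic, bic, r2, rss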
def linear_regression(self):
    # Build a preprocess.DataPreProcessor instance
    data_preprocessor = DataPreProcessor(self.df_variable, self.pd_variable)
    # Run the linear-regression preprocessing pipeline via DataPreProcessorForTask
    data_preprocessor_for_task = DataPreProcessorForTask(data_preprocessor)
    data_preprocessor_for_task.preprocess_for_linear_regression()

    regression_pd_variable = data_preprocessor.pd_variable
    regression_df_variable = data_preprocessor.df_variable

    # Fit regression.LinearRegression on the preprocessed DataFrame
    linear_regression = LinearRegression(regression_df_variable)
    linear_regression.regression()

    # Retrieve the regression results
    coefficients = linear_regression.coefficients
    intercept = linear_regression.intercept
    liner_equation = linear_regression.liner_equation
    mean_squared_error = linear_regression.mean_squared_error

    # Show the fitted equation and the MSE in message boxes
    messagebox.showinfo("liner_equation", liner_equation)
    messagebox.showinfo("mean_squared_error", mean_squared_error)

    print(coefficients)
    print(intercept)
    print(liner_equation)
    print(mean_squared_error)
def run(self):
    x, y = self.__readData()
    model = LinearRegression()
    model.fit(x, y)
    y_predicted = [model.sumForRow(row) for row in x]

    # Sanity-check plot: predictions against targets should follow a linear trend
    plt.scatter(y, y_predicted, c='r')
    plt.show()

    mean_error = model.error(y_predicted, y)
    return mean_error
def regression():
    # LinearRegression
    resultString = ''
    input1 = ''
    input2 = ''
    output1 = ''
    liner_regression = LinearRegression()
    if request.method == "POST":
        # Parse the x and y training values from the form
        input1 = request.form.get('input1')
        input_x_list = liner_regression.regression_string_2_list(input1)
        input2 = request.form.get('input2')
        input_y_list = liner_regression.regression_string_2_list(input2)

        # Fit the model, then predict for the requested x values
        liner_regression.model(input_x_list, input_y_list)
        output1 = request.form.get('output1')
        predict_x_list = liner_regression.regression_string_2_list(output1)
        predict_y_list = liner_regression.predict(predict_x_list)

        resultString = ', '.join(str(y) for y in predict_y_list)
        return render_template('regression.html', title='Regression',
                               resultString=resultString, input1=input1,
                               input2=input2, output1=output1)  # customize the title
    # GET request: render the empty form
    return render_template('regression.html', title='Regression',
                           resultString=resultString, input1=input1,
                           input2=input2, output1=output1)
def main():
    X, y = make_regression(n_samples=100, n_features=1, noise=20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    n_samples, n_features = np.shape(X)

    model = LinearRegression(n_iterations=100)
    model.fit(X_train, y_train)

    # Training error plot
    n = len(model.training_errors)
    training, = plt.plot(range(n), model.training_errors, label="Training Error")
    plt.legend(handles=[training])
    plt.title("Error Plot")
    plt.ylabel('Mean Squared Error')
    plt.xlabel('Iterations')
    plt.show()

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("Mean squared error: %s" % mse)

    y_pred_line = model.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    # Plot the results
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    plt.plot(366 * X, y_pred_line, color='b', linewidth=2, label="Prediction")
    plt.suptitle("Linear Regression")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2), ("Training data", "Test data"), loc='lower right')
    plt.show()
def main():
    X, y = make_regression(n_samples=100, n_features=1, noise=20)
    x_train, x_test, y_train, y_test = DataManipulation().train_test_split(
        X, y, test_size=0.4)
    n_samples, n_features = X.shape

    model = LinearRegression()
    model.fit(x_train, y_train)

    # Training error plot
    n = len(model.errors)
    training = plt.plot(range(n), model.errors, label='Training Errors')
    plt.title('Error plot')
    plt.xlabel('Iteration')
    plt.ylabel('Mean Squared Error')
    plt.show()

    y_pred = model.predict(x_test)
    y_pred_line = model.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    # Plot the results
    m1 = plt.scatter(366 * x_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * x_test, y_test, color=cmap(0.5), s=10)
    plt.plot(366 * X, y_pred_line, color='black', linewidth=2, label="Prediction")
    plt.suptitle("Linear Regression")
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2), ("Training data", "Test data"), loc='lower right')
    plt.show()
data = pd.read_csv('house.csv')

# Scale each feature by its maximum so gradient descent behaves well
sizeMax = data['size'].max()
data['size'] = data['size'].apply(lambda x: x / sizeMax)
bedroomMax = data['bedroom'].max()
data['bedroom'] = data['bedroom'].apply(lambda x: x / bedroomMax)

size = data['size'].values
bedroom = data['bedroom'].values
price = data['price'].values

X = np.array([np.ones(len(size)), size, bedroom]).T
Y = np.array(price)

regression = LinearRegression(alpha=0.006, iteration=1000, feature_count=2)
regression.fit(X, Y)
regression.plot()

while True:
    size = int(input("Enter size of house:"))
    bedroom = int(input("Enter number of bedroom:"))
    # Inputs must be scaled by the same maxima used during training
    print("price:", int(regression.predict(
        np.array([1, size / sizeMax, bedroom / bedroomMax]))))
def __get_full_model_r2(self):
    linear_reg = LinearRegression(gradient_descent=False)
    linear_reg.fit(self.X, self.y)
    y_preds = [linear_reg.predict(x) for x in self.X]
    return calculate_r2(self.y, y_preds)
# Scale the floor features by their maxima
floorMax = data['floor'].max()
data['floor'] = data['floor'].apply(lambda x: x / floorMax)
topFloorMax = data['top_floor'].max()
data['top_floor'] = data['top_floor'].apply(lambda x: x / topFloorMax)

price = data['price'].values
X = np.array([np.ones(len(price)), data['size'].values, data['room'].values,
              data['year'].values, data['floor'].values,
              data['top_floor'].values]).T
Y = np.array(price)

regression = LinearRegression(alpha=0.000001, iteration=300, feature_count=5)
regression.fit(X, Y)
regression.plot()

# Compare two known listings against the model's predictions;
# the floor features must be scaled by the same maxima used during training
print("price:", "13800000")
print("price:", int(regression.predict(
    np.array([1, 45, 2, 1977, 5 / floorMax, 5 / topFloorMax]))))
print()
print("price:", "15333333")
print("price:", int(regression.predict(
    np.array([1, 40, 2, 1967, 1 / floorMax, 4 / topFloorMax]))))
# Fake training set for property rental with feature scaling:
# rooms scaled as (rooms - 2.5) / 5, square footage as (sqft - 500) / 1000,
# rent as (rent - 50000) / 100000
training_set_rent = np.array([
    [(1 - 2.5) / 5, (500 - 500) / 1000, (24000 - 50000) / 100000],
    [(1 - 2.5) / 5, (1000 - 500) / 1000, (22000 - 50000) / 100000],
    [(1 - 2.5) / 5, (1000 - 500) / 1000, (22000 - 50000) / 100000],
    [(1 - 2.5) / 5, (1000 - 500) / 1000, (22000 - 50000) / 100000],
    [(2 - 2.5) / 5, (500 - 500) / 1000, (32000 - 50000) / 100000],
    [(2 - 2.5) / 5, (1000 - 500) / 1000, (29000 - 50000) / 100000],
    [(3 - 2.5) / 5, (500 - 500) / 1000, (40000 - 50000) / 100000],
    [(1 - 2.5) / 5, (500 - 500) / 1000, (24000 - 50000) / 100000],
    [(1 - 2.5) / 5, (1000 - 500) / 1000, (22000 - 50000) / 100000],
    [(2 - 2.5) / 5, (500 - 500) / 1000, (32000 - 50000) / 100000],
    [(2 - 2.5) / 5, (1000 - 500) / 1000, (29000 - 50000) / 100000],
    [(3 - 2.5) / 5, (500 - 500) / 1000, (40000 - 50000) / 100000],
])

# Create and train LR with gradient descent
linear = LinearRegression(training_set_rent)
linear.train_gradient()
linear.show_info()

# Try to predict something (un-scale the output back to rent)
print(linear.hypothesis([(1 - 2.5) / 5, (1000 - 500) / 1000]) * 100000 + 50000)

# Create and train LR with the normal equation
linear_with_normal = LinearRegression(training_set_rent)
linear_with_normal.normal_equation()
linear_with_normal.show_info()
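# Illustrative helpers (not part of the original) that make the ad-hoc scaling
# above explicit and executable:
def scale_features(rooms, sqft):
    return [(rooms - 2.5) / 5, (sqft - 500) / 1000]

def unscale_rent(scaled_rent):
    return scaled_rent * 100000 + 50000

# The prediction above is then equivalent to:
# print(unscale_rent(linear.hypothesis(scale_features(1, 1000))))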
def main():
    # from sklearn.datasets import load_boston
    # boston = load_boston()
    # print(boston.data.shape)
    logging.basicConfig(stream=sys.stdout,
                        level=logging.DEBUG if DEBUG else logging.INFO)

    dataFile = "data/housing.data"
    col_names = ["crim", "zn", "indus", "chas", "nox", "rm", "age", "dis",
                 "rad", "tax", "ptratio", "b", "lstat", "medv"]
    train_df = pd.read_csv(dataFile, names=col_names, delim_whitespace=True)
    # Every 7th row goes to the test set; the rest stay in training
    test_df = train_df.iloc[::7, :]
    train_df.drop(train_df.index[::7], inplace=True)

    train_df_features = train_df.iloc[:, :-1]
    train_df_targets = train_df.iloc[:, -1]
    test_df_features = test_df.iloc[:, :-1]
    test_df_targets = test_df.iloc[:, -1]

    # Data analysis
    print("Data analysis:")
    print("No. of attributes: ", len(train_df.iloc[0]))
    print("No. of features usable for classification: ", len(train_df.iloc[0]) - 1)
    print("Size of training data: ", len(train_df))
    print("Size of testing data: ", len(test_df))
    print("Histogram of attributes will be shown at the end of generating all results")

    print("\nPearson correlations:")
    target_col = col_names[-1]
    for col in col_names:
        if col.lower() == 'chas':  # categorical. Also, see dtypes
            continue
        print("Correlation of %s with target(%s): %f" %
              (col, target_col,
               train_df[[col, target_col]].corr(method='pearson').iloc[0, 1]))

    normalizer = DataFrameStdNormalizer(train_df_features)
    train_df_features_normalized = normalizer.get_normalized_data(train_df_features)
    test_df_features_normalized = normalizer.get_normalized_data(test_df_features)

    print("\n*********************Linear Regression*******************")
    regmodel = LinearRegression()
    evaluator = ModelEvaluator(regmodel)
    regmodel.train(train_df_features_normalized, train_df_targets)
    trainingError = evaluator.mean_squared_error(train_df_features_normalized, train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" %
          evaluator.mean_squared_error(test_df_features_normalized, test_df_targets))

    print("\n***********Ridge regression with lambda 0.01, 0.1, 1.0***************")
    for lambdaval in (0.01, 0.1, 1.0):
        regmodel = RidgeRegression(lambdaval)
        evaluator = ModelEvaluator(regmodel)
        regmodel.train(train_df_features_normalized, train_df_targets)
        trainingError = evaluator.mean_squared_error(train_df_features_normalized, train_df_targets)
        testingError = evaluator.mean_squared_error(test_df_features_normalized, test_df_targets)
        print("Ridge regression model with lambda = %f" % lambdaval)
        print("Mean squared error on training data = %f" % trainingError)
        print("Mean squared error on test data = %f" % testingError)
        print("")

    print("\n*********************Cross Validation*******************")
    # Shuffle data
    shuffled_train_df = train_df.reindex(np.random.permutation(train_df.index))
    shuffled_train_df_features = shuffled_train_df.iloc[:, :-1]
    shuffled_train_df_targets = shuffled_train_df.iloc[:, -1]
    shuffled_train_df_features_normalized = (
        DataFrameStdNormalizer(shuffled_train_df_features)
        .get_normalized_data(shuffled_train_df_features))

    lambda_error_map = {}
    for i in range(0, 6):
        lambdaval = 10.0 / (10 ** i)
        # 10-fold cross validation for this lambda
        mean_cv_error = 0
        regmodel = RidgeRegression(lambdaval)
        evaluator = ModelEvaluator(regmodel)
        chunksize = len(train_df) // 10  # integer fold size for slicing
        for fold in range(0, 10):
            test_df_cv = shuffled_train_df_features_normalized.iloc[
                fold * chunksize:fold * chunksize + chunksize]
            test_df_cv_targets = shuffled_train_df_targets.iloc[
                fold * chunksize:fold * chunksize + chunksize]
            train_df_cv = shuffled_train_df_features_normalized.drop(
                shuffled_train_df_features_normalized.index[
                    fold * chunksize:fold * chunksize + chunksize])
            train_df_cv_targets = shuffled_train_df_targets.drop(
                shuffled_train_df_targets.index[
                    fold * chunksize:fold * chunksize + chunksize])
            regmodel.train(train_df_cv, train_df_cv_targets)
            mean_cv_error += evaluator.mean_squared_error(test_df_cv, test_df_cv_targets)
        mean_cv_error /= 10
        print("MSE for lambda %f = %f" % (lambdaval, mean_cv_error))
        lambda_error_map[lambdaval] = mean_cv_error

    lambdabest = min(lambda_error_map, key=lambda_error_map.get)
    print("Lowest MSE for lambda = %f" % lambdabest)
    regmodel = RidgeRegression(lambdabest)
    regmodel.train(train_df_features_normalized, train_df_targets)
    evaluator = ModelEvaluator(regmodel)
    test_meansquarederror = evaluator.mean_squared_error(test_df_features_normalized, test_df_targets)
    print("Test error for model with lambda %f = %f" % (lambdabest, test_meansquarederror))
    print("")

    print("\n*********************Feature Selection*******************")
    print("*********************i. Max correlation*******************")
    target_col = col_names[-1]
    corr = {}
    for col in col_names:
        if col.lower() == 'chas':  # categorical. Also, see dtypes
            continue
        corr[col] = abs(train_df[[col, target_col]].corr(method='pearson').iloc[0, 1])
    # The target correlates perfectly with itself, so drop the first entry
    maxcorrcols = heapq.nlargest(5, corr, key=corr.get)[1:]
    print("Selecting the following columns with max correlation: ")
    print(maxcorrcols)
    train_df_features_maxcorr = train_df_features[maxcorrcols]
    regmodel = LinearRegression()
    regmodel.train(train_df_features_maxcorr, train_df_targets)
    evaluator = ModelEvaluator(regmodel)
    trainingError = evaluator.mean_squared_error(train_df_features[maxcorrcols], train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" %
          evaluator.mean_squared_error(test_df_features[maxcorrcols], test_df_targets))

    print("*******************ii. Max correlation with residue*****************")
    residue = train_df_targets.copy(deep=True)
    cols = []
    regmodel = LinearRegression()
    evaluator = ModelEvaluator(regmodel)
    for i in range(0, 4):
        corr = {}
        for col in col_names:
            if col.lower() in ('medv', 'chas') or col in cols:  # target, categorical, or already chosen
                continue
            corr[col] = abs(pd.concat([train_df[[col]], residue], axis=1)
                            .corr(method='pearson').iloc[0, 1])
        maxcorrcol = max(corr, key=corr.get)
        cols.append(maxcorrcol)
        print("Taking cols: %s" % maxcorrcol)
        regmodel.train(train_df_features[cols], train_df_targets)
        # Recompute the residue against the current model
        for j in range(0, len(residue)):
            residue.at[residue.index[j]] = (train_df_targets.iloc[j] -
                                            regmodel.predict(train_df_features[cols].iloc[j]))
    print("Mean squared error on train data: %f" %
          evaluator.mean_squared_error(train_df_features[cols], train_df_targets))
    print("Mean squared error on test data: %f" %
          evaluator.mean_squared_error(test_df_features[cols], test_df_targets))

    print("*********************iii. All 4 feature combinations*******************")
    bestcols = None
    besttrainmse = float("inf")
    regmodel = LinearRegression()
    evaluator = ModelEvaluator(regmodel)
    for cols in (list(x) for x in itertools.combinations(train_df_features_normalized.columns, 4)):
        regmodel.train(train_df_features_normalized[cols], train_df_targets)
        mse_train = evaluator.mean_squared_error(train_df_features_normalized[cols], train_df_targets)
        if mse_train < besttrainmse:
            bestcols = cols
            besttrainmse = mse_train
    print("Best training MSE = %f for columns:" % besttrainmse)
    print(bestcols)
    regmodel.train(train_df_features_normalized[bestcols], train_df_targets)
    print("Testing MSE of this model: %f" %
          evaluator.mean_squared_error(test_df_features_normalized[bestcols], test_df_targets))

    print("\n*********************Feature Expansion*******************")
    df_train_featuregen = train_df_features_normalized.copy(deep=True)
    df_test_featuregen = test_df_features_normalized.copy(deep=True)
    # Add all pairwise products and squares as generated features
    for cols in (list(list(x) for x in itertools.combinations(train_df_features_normalized.columns, 2)) +
                 [[col, col] for col in train_df_features_normalized.columns]):
        df_train_featuregen[cols[0] + cols[1]] = df_train_featuregen[cols[0]] * df_train_featuregen[cols[1]]
        df_test_featuregen[cols[0] + cols[1]] = df_test_featuregen[cols[0]] * df_test_featuregen[cols[1]]
    regmodel = LinearRegression()
    regmodel.train(df_train_featuregen, train_df_targets)
    evaluator = ModelEvaluator(regmodel)
    trainingError = evaluator.mean_squared_error(df_train_featuregen, train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" %
          evaluator.mean_squared_error(df_test_featuregen, test_df_targets))

    print("\n******************************** Showing histogram of attributes********************************")
    Histogrammer.plot_histgram_of_features(train_df, 3, 5)
    print("\nClose window to terminate")
    plt.show()
    return
def test_simple_linear_regression(self):
    self._run_simple_linear_regression_for_model(LinearRegression())
def test_linear_regression():
    # define sample data
    xs = np.linspace(0, 2, 50).astype(np.float32)[:, np.newaxis]
    noise = np.random.normal(0, 0.5, xs.shape).astype(np.float32)
    ys = xs + noise

    # define figure
    fig = plt.figure('linear regression')

    # show the sample data
    ax = fig.add_subplot(1, 1, 1, title='linear regression')
    ax.scatter(xs, ys, c='r', label='sample points')

    # show the real curve
    _ys = xs
    ax.plot(xs, _ys, 'g', label='real curve')

    # define linear regression operation
    sgd_optimizer = GradientDescentOptimizer(0.008, 1)
    linear_regression = LinearRegression(xs, ys, sgd_optimizer)

    # start regression
    ax.plot(xs, linear_regression.predict(xs), 'b', label='regression curve')
    plt.legend(loc='upper left')
    plt.ion()
    for step in range(50):
        plt.pause(0.01)
        linear_regression.regress_once()
        # redraw the current regression curve
        ax.lines.pop()
        ax.plot(xs, linear_regression.predict(xs), 'b', label='regression curve')
        print('step {}: loss = {}'.format(step, linear_regression.compute_loss()))

    print('\n', '-' * 20, 'optimize end', '-' * 20, '\n')
    print('weights : \n{}'.format(linear_regression.weights))
    print('loss : \n{}'.format(linear_regression.compute_loss()))
    print('-' * 50, '\n')

    print('compute directly: ')
    linear_regression.weights = LinearRegression.compute_weights(xs, ys)
    print('weights: \n{}'.format(linear_regression.weights))
    print('loss: \n{}'.format(linear_regression.compute_loss()))
    print('-' * 50, '\n')

    print('real value: ')
    linear_regression.weights = np.array([1, 0], dtype=np.float32)[:, np.newaxis]
    print('weights: \n{}'.format(
        np.array([1, 0], dtype=np.float32)[:, np.newaxis]))
    print('loss : \n{}'.format(linear_regression.compute_loss()))
    print('-' * 50, '\n')

    ax.plot(xs, linear_regression.predict(xs), 'g', label='real curve')
    plt.ioff()
    plt.savefig('linear regression.png')
    plt.show()
""" @author Victor I. Afolabi A.I. Engineer & Software developer [email protected] Created on 26 August, 2017 @ 9:33 PM. Copyright (c) 2017. victor. All rights reserved. """ # Create a LinearRegression object from regression import LinearRegression import numpy as np data = np.genfromtxt('data.csv', delimiter=',') num_iter = 1000 clf = LinearRegression(learning_rate=1e-4) clf.fit(data=data, num_iter=num_iter) print('After {:,} iterations. m = {:.2f} and b = {:.2f}'.format(num_iter, clf.m, clf.b))
def linear_solve(x_data, y_data):
    model = LinearRegression()
    model.fit(x_data, y_data)
    return model.predict(x_data)
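# Minimal usage sketch for linear_solve (assuming NumPy-compatible inputs):
# fitted values for noisy samples of y = 2x + 1 should track the line closely.
import numpy as np

x_data = np.linspace(0, 1, 20).reshape(-1, 1)
y_data = 2 * x_data.ravel() + 1 + np.random.normal(scale=0.05, size=20)
fitted = linear_solve(x_data, y_data)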
import pandas as pd
import numpy as np
from regression import LinearRegression

df = pd.read_csv("/Users/yliang/data/trunk1/spark/assembly/target/tmp/LinearRegressionSuite/datasetWithDenseFeature2/part-00000",
                 header=None)
X = np.array(df[df.columns[1:3]])
y = np.array(df[df.columns[0]])

lir = LinearRegression(fit_intercept=True, alpha=2.3, max_iter=100, tol=1e-06,
                       standardization=False,
                       lower_bound=[-np.inf, 6.0, -np.inf],
                       upper_bound=[0.0, 10.0, np.inf])
lir.fit(X, y)

print("coefficients = " + str(lir.coef_))
print("intercept = " + str(lir.intercept_))
def __test_bootstrap_fit():
    # A small implementation of a test case
    from regression import LinearRegression

    # Initial values
    N_bs = 1000
    n = 200
    noise = 0.2
    np.random.seed(1234)
    test_percent = 0.35

    # Sets up random matrices
    x = np.random.rand(n, 1)

    def func_exact(_x):
        return 2*_x*_x + np.exp(-2*_x) + noise * \
            np.random.randn(_x.shape[0], _x.shape[1])

    y = func_exact(x)

    def design_matrix(_x):
        return np.c_[np.ones(_x.shape), _x, _x*_x]

    # Sets up design matrix
    X = design_matrix(x)

    # Performs regression
    reg = LinearRegression()
    reg.fit(X, y)
    y = y.ravel()
    y_predict = reg.predict(X).ravel()
    print("Regular linear regression")
    print("R2:    {:-20.16f}".format(reg.score(y_predict, y)))
    print("MSE:   {:-20.16f}".format(metrics.mse(y, y_predict)))
    print("Beta:      ", reg.coef_.ravel())
    print("var(Beta): ", reg.coef_var.ravel())
    print("")

    # Performs a bootstrap
    print("Bootstrapping")
    bs_reg = BootstrapRegression(x, y, LinearRegression, design_matrix)
    bs_reg.bootstrap(N_bs, test_percent=test_percent)

    print("R2:    {:-20.16f}".format(bs_reg.R2))
    print("MSE:   {:-20.16f}".format(bs_reg.MSE))
    print("Bias^2:{:-20.16f}".format(bs_reg.bias))
    print("Var(y):{:-20.16f}".format(bs_reg.var))
    print("Beta:      ", bs_reg.coef_.ravel())
    print("var(Beta): ", bs_reg.coef_var.ravel())
    print("MSE = Bias^2 + Var(y) = ")
    print("{} = {} + {} = {}".format(bs_reg.MSE, bs_reg.bias, bs_reg.var,
                                     bs_reg.bias + bs_reg.var))
    print("Diff: {}".format(abs(bs_reg.bias + bs_reg.var - bs_reg.MSE)))

    import matplotlib.pyplot as plt
    plt.plot(x.ravel(), y, "o", label="Data")
    plt.plot(x.ravel(), y_predict, "o",
             label=r"Pred, $R^2={:.4f}$".format(reg.score(y_predict, y)))
    print(bs_reg.y_pred.shape, bs_reg.y_pred_var.shape)
    plt.errorbar(bs_reg.x_pred_test, bs_reg.y_pred,
                 yerr=np.sqrt(bs_reg.y_pred_var), fmt="o",
                 label=r"Bootstrap Prediction, $R^2={:.4f}$".format(bs_reg.R2))
    plt.xlabel(r"$x$")
    plt.ylabel(r"$y$")
    plt.title(r"$2x^2 + \sigma^2$")
    plt.legend()
    plt.show()
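# Reference note (standard identity, not from the original source): the
# "MSE = Bias^2 + Var(y)" check printed above follows the bias-variance
# decomposition E[(y - y_hat)^2] = (E[y_hat] - y)^2 + Var(y_hat) (+ noise),
# so bias + var should approximately equal MSE up to the irreducible noise term.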
def __test_cross_validation_methods():
    # A small implementation of a test case
    from regression import LinearRegression
    import matplotlib.pyplot as plt

    # Initial values
    n = 100
    N_bs = 1000
    k_splits = 4
    test_percent = 0.2
    noise = 0.3
    np.random.seed(1234)

    # Sets up random matrices
    x = np.random.rand(n, 1)

    def func_exact(_x):
        return 2*_x*_x + np.exp(-2*_x) + noise * \
            np.random.randn(_x.shape[0], _x.shape[1])

    y = func_exact(x)

    def design_matrix(_x):
        return np.c_[np.ones(_x.shape), _x, _x * _x]

    # Sets up design matrix
    X = design_matrix(x)

    # Performs regression
    reg = LinearRegression()
    reg.fit(X, y)
    y = y.ravel()
    y_predict = reg.predict(X).ravel()
    print("Regular linear regression")
    print("R2:    {:-20.16f}".format(reg.score(y, y_predict)))
    print("MSE:   {:-20.16f}".format(metrics.mse(y, y_predict)))
    print("Bias^2:{:-20.16f}".format(metrics.bias2(y, y_predict)))

    # Small plotter
    plt.plot(x, y, "o", label="data")
    plt.plot(x, y_predict, "o",
             label=r"Pred, $R^2={:.4f}$".format(reg.score(y, y_predict)))

    print("k-fold Cross Validation")
    kfcv = kFoldCrossValidation(x, y, LinearRegression, design_matrix)
    kfcv.cross_validate(k_splits=k_splits, test_percent=test_percent)
    print("R2:    {:-20.16f}".format(kfcv.R2))
    print("MSE:   {:-20.16f}".format(kfcv.MSE))
    print("Bias^2:{:-20.16f}".format(kfcv.bias))
    print("Var(y):{:-20.16f}".format(kfcv.var))
    print("MSE = Bias^2 + Var(y) = ")
    print("{} = {} + {} = {}".format(kfcv.MSE, kfcv.bias, kfcv.var,
                                     kfcv.bias + kfcv.var))
    print("Diff: {}".format(abs(kfcv.bias + kfcv.var - kfcv.MSE)))

    plt.errorbar(kfcv.x_pred_test, kfcv.y_pred,
                 yerr=np.sqrt(kfcv.y_pred_var), fmt="o",
                 label=r"k-fold CV, $R^2={:.4f}$".format(kfcv.R2))

    print("kk Cross Validation")
    kkcv = kkFoldCrossValidation(x, y, LinearRegression, design_matrix)
    kkcv.cross_validate(k_splits=k_splits, test_percent=test_percent)
    print("R2:    {:-20.16f}".format(kkcv.R2))
    print("MSE:   {:-20.16f}".format(kkcv.MSE))
    print("Bias^2:{:-20.16f}".format(kkcv.bias))
    print("Var(y):{:-20.16f}".format(kkcv.var))
    print("MSE = Bias^2 + Var(y) = ")
    print("{} = {} + {} = {}".format(kkcv.MSE, kkcv.bias, kkcv.var,
                                     kkcv.bias + kkcv.var))
    print("Diff: {}".format(abs(kkcv.bias + kkcv.var - kkcv.MSE)))

    plt.errorbar(kkcv.x_pred_test.ravel(), kkcv.y_pred.ravel(),
                 yerr=np.sqrt(kkcv.y_pred_var.ravel()), fmt="o",
                 label=r"kk-fold CV, $R^2={:.4f}$".format(kkcv.R2))

    print("Monte Carlo Cross Validation")
    mccv = MCCrossValidation(x, y, LinearRegression, design_matrix)
    mccv.cross_validate(N_bs, k_splits=k_splits, test_percent=test_percent)
    print("R2:    {:-20.16f}".format(mccv.R2))
    print("MSE:   {:-20.16f}".format(mccv.MSE))
    print("Bias^2:{:-20.16f}".format(mccv.bias))
    print("Var(y):{:-20.16f}".format(mccv.var))
    print("MSE = Bias^2 + Var(y) = ")
    print("{} = {} + {} = {}".format(mccv.MSE, mccv.bias, mccv.var,
                                     mccv.bias + mccv.var))
    print("Diff: {}".format(abs(mccv.bias + mccv.var - mccv.MSE)))

    print("\nCross Validation methods tested.")

    plt.errorbar(mccv.x_pred_test, mccv.y_pred,
                 yerr=np.sqrt(mccv.y_pred_var), fmt="o",
                 label=r"MC CV, $R^2={:.4f}$".format(mccv.R2))

    plt.xlabel(r"$x$")
    plt.ylabel(r"$y$")
    plt.title(r"$y=2x^2$")
    plt.legend()
    plt.show()
action="store_true") parser.add_argument( "--loo", help= "Calculate leave-one-out error. Will have an adverse effect on run-time.", action="store_true") return parser.parse_args() if __name__ == '__main__': args = parseArguments() settings = dict() if args.noPlot: settings['plot'] = False else: settings['plot'] = True if args.loo: settings['loo'] = True else: settings['loo'] = False if args.type == 'linear': # Linear Regression LinearRegression(calculate_error=settings.get('loo'), plot_now=settings.get('plot')) elif args.type == 'logistic': # Logistic Regression LogisticRegression(calculate_error=settings.get('loo'), plot_now=settings.get('plot'))