def fitLineAndPlot(independentList, dependentList, plt, order=1):
    """Fit a polynomial regression of the given order and draw its curve.

    A ``PolynomialRegression(order)`` is fitted to the data, the value
    returned by ``fit`` is printed, and the predictions for
    ``independentList`` are plotted on ``plt`` with a legend entry that
    names the order.

    Args:
        independentList: Values of the independent variable (x axis).
        dependentList: Values of the dependent variable the model is fit to.
        plt: Object providing ``plot`` and ``legend`` (presumably
            ``matplotlib.pyplot`` -- confirm against callers).
        order: Order handed to ``PolynomialRegression``; defaults to 1.
    """
    coefficient = PolynomialRegression(order).fit(independentList, dependentList)
    print('Coefficients for order %d' % order, coefficient)

    # Evaluate the fitted model on the same x values it was trained on.
    fitted_values = getPredictions(coefficient, independentList)
    curve_label = 'Order-%d' % order
    plt.plot(independentList, fitted_values, label=curve_label, linewidth=3)
    plt.legend()
def main():
    """Fit a second-order polynomial to the last 100 rows of the "nis" AQI CSV.

    Reads ``../data/aqi_nis.csv``, keeps the final 100 rows, pairs them with
    an evenly spaced x axis, fits a ``PolynomialRegression`` and shows its
    prediction and cost plots.
    """
    city = "nis"
    frame = pd.read_csv("../data/aqi_" + city + ".csv")

    # x and y are both NumPy arrays from here on.
    y = frame.to_numpy()[-100:]
    n_points = len(y)
    x = np.linspace(0, n_points, n_points)

    # NOTE(review): the split below is computed but none of its four parts
    # is used afterwards; the model is built from the full x / y.
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    model = PolynomialRegression(x, y)
    # The value returned by fit() is kept but not used again in this function.
    theta = model.fit(order=2, tol=10**-3, numIters=100, alpha=10**-3)
    model.plot_predicted()
    model.plot_cost()
def compare_dataset_size(train_datasets, test_dataset, K=10, out_dir="."):
    """
    Plot train and test error against the size of the training set.

    One PolynomialRegression(K=K) is created up front and re-fitted on each
    training dataset in turn. After every fit, compute_mse is evaluated on
    that training set and on the fixed test set, and the natural log of both
    values is recorded for plotting.

    Args:
        - train_datasets (list of dict): A list of training datasets from the
          same distribution; each is read through its "X" and "Y" keys.
        - test_dataset (dict): The test dataset, read through "X" and "Y".
        - K (int): The degree of the polynomial to fit. Note: 1 <= K <= 10
        - out_dir (str): Passed through to visualize() as out_dir.
    """
    model = PolynomialRegression(K=K)
    test_X, test_Y = test_dataset["X"], test_dataset["Y"]

    sizes = []
    log_train_errors = []
    log_test_errors = []
    for dataset in train_datasets:
        train_X, train_Y = dataset["X"], dataset["Y"]
        sizes.append(len(train_X))

        model.fit(train_X, train_Y)
        train_loss = model.compute_mse(train_X, train_Y)
        test_loss = model.compute_mse(test_X, test_Y)
        log_train_errors.append(np.log(train_loss))
        log_test_errors.append(np.log(test_loss))

    # Two series (train, test) share the same x values; each series gets its
    # own list object, exactly as many entries as there were datasets.
    visualize(
        [sizes, list(sizes)],
        [log_train_errors, log_test_errors],
        ("Train Error", "Test Error"),
        "Comparing Effects of Dataset Size",
        "Dataset size",
        "Error (Log Scale)",
        savefig=True,
        out_dir=out_dir,
    )
def compare_regularization(train_dataset, test_dataset, K, l2_coefs, title_prefix="", out_dir="."):
    """
    Generate plot to compare effects of the L2 regularization coefficient.

    For every coefficient in l2_coefs a fresh PolynomialRegression(K) is
    fitted with fit_with_l2_regularization; the natural log of compute_mse
    on the train and test data is plotted against the coefficient
    multiplied by 1e2.

    Args:
        - train_dataset (dict): Training data, read through "X" and "Y".
        - test_dataset (dict): Test data, read through "X" and "Y".
        - K: Argument passed to PolynomialRegression.
        - l2_coefs (iterable): L2 coefficients to evaluate, in plot order.
        - title_prefix (str): Text placed in front of the plot title.
        - out_dir (str): Passed through to visualize() as out_dir.
    """
    title = f"{title_prefix}Comparing Effects of Regularization"
    train_X, train_Y = train_dataset["X"], train_dataset["Y"]
    test_X, test_Y = test_dataset["X"], test_dataset["Y"]

    train_xs, test_xs = [], []
    log_train_errors, log_test_errors = [], []
    for l2_coef in l2_coefs:
        # The x axis is expressed in units of 1e-2, matching the axis label.
        train_xs.append(l2_coef * 1e2)
        test_xs.append(l2_coef * 1e2)

        model = PolynomialRegression(K)
        model.fit_with_l2_regularization(train_X, train_Y, l2_coef)
        train_loss = model.compute_mse(train_X, train_Y)
        test_loss = model.compute_mse(test_X, test_Y)
        log_train_errors.append(np.log(train_loss))
        log_test_errors.append(np.log(test_loss))

    visualize(
        [train_xs, test_xs],
        [log_train_errors, log_test_errors],
        ("Train Error", "Test Error"),
        title,
        "L2 Coefficient (Lambda Term) 1e-2",
        "Error (Log Scale)",
        savefig=True,
        out_dir=out_dir,
    )
def compare_model_complexity(train_dataset, test_dataset, Ks, title_prefix="", out_dir="."):
    """
    Generate plot to compare effects of model complexity.

    For every K in Ks a fresh PolynomialRegression(K) is fitted on the
    training data; the natural log of compute_mse on the train and test data
    is plotted against K.

    Args:
        - train_dataset (dict): Training data, read through "X" and "Y".
        - test_dataset (dict): Test data, read through "X" and "Y".
        - Ks (iterable): Polynomial degrees to evaluate, in plot order.
        - title_prefix (str): Text placed in front of the plot title.
        - out_dir (str): Passed through to visualize() as out_dir.
    """
    title = f"{title_prefix}Comparing Effects of Model Complexity"
    train_X, train_Y = train_dataset["X"], train_dataset["Y"]
    test_X, test_Y = test_dataset["X"], test_dataset["Y"]

    train_degrees, test_degrees = [], []
    log_train_errors, log_test_errors = [], []
    for K in Ks:
        train_degrees.append(K)
        test_degrees.append(K)

        model = PolynomialRegression(K)
        model.fit(train_X, train_Y)
        train_loss = model.compute_mse(train_X, train_Y)
        test_loss = model.compute_mse(test_X, test_Y)
        log_train_errors.append(np.log(train_loss))
        log_test_errors.append(np.log(test_loss))

    visualize(
        [train_degrees, test_degrees],
        [log_train_errors, log_test_errors],
        ("Train Error", "Test Error"),
        title,
        "Model Complexity (Degree of Polynomial)",
        "Error (Log Scale)",
        savefig=True,
        out_dir=out_dir,
    )
def polynomial_regression():
    """Run the polynomial-regression experiments and save their figures.

    Steps, in order:

    1. Generate N=100 samples with ``generate_regression_data`` and split
       them into 10 randomly chosen training points and 90 test points.
    2. Fit ``PolynomialRegression(degree)`` for degree 0..9, recording the
       train/test mean squared error and remembering the predictions of the
       fit with the lowest error seen so far.
    3. Q1A: plot both error curves against degree (log y axis) and save
       ``Q1A.png``.
    4. Q1B: plot the remembered best train/test predictions, ordered by x,
       over the training scatter and save ``Q1B.png``.
    5. Q5: fit a hinge-loss ``SGDClassifier`` on two blobs, draw the
       decision-function contours at -1, 0 and 1 and save ``Q5.png``.

    Takes no arguments and has no return statement; its output is the three
    image files written to the current working directory.
    """
    mse_train = []
    mse_test = []
    N = 100
    max_degree = 10
    random_list_size = 10  # number of samples used for training
    d = 4  # presumably the degree of the generating polynomial -- TODO confirm
    x, y = generate_regression_data(d, N, amount_of_noise=0.1)
    # Pre-allocate column vectors for the 10 training and 90 test samples.
    x_train, y_train = np.zeros(
        (random_list_size, 1)), np.zeros((random_list_size, 1))
    x_test, y_test = np.zeros(
        (N - random_list_size, 1)), np.zeros((N - random_list_size, 1))
    # Draw `random_list_size` distinct indices in [0, N - 1] by redrawing
    # whenever an index has already been picked.
    random_list = []
    for i in range(0, random_list_size):
        n = random.randint(0, N - 1)
        while n in random_list:
            n = random.randint(0, N - 1)
        random_list.append(n)
    counter_train = 0
    counter_test = 0
    # Route each sample to train or test; original index order is preserved
    # within each split.
    for i in range(N):
        if i in random_list:
            x_train[counter_train] = x[i]
            y_train[counter_train] = y[i]
            counter_train += 1
        else:
            x_test[counter_test] = x[i]
            y_test[counter_test] = y[i]
            counter_test += 1
    # Degrees 0 .. max_degree - 1 are evaluated.
    for degree in range(max_degree):
        p = PolynomialRegression(degree)
        p.fit(x_train, y_train)
        y_hat_train = p.predict(x_train)
        y_hat_test = p.predict(x_test)
        # Keep the predictions of the best fit so far; the comparison is
        # strict, so on a tie the earlier (lower) degree is kept.
        if len(mse_train) == 0 or min(mse_train) > mean_squared_error(y_train, y_hat_train):
            min_train_y_predict = y_hat_train
        if len(mse_test) == 0 or min(mse_test) > mean_squared_error(y_test, y_hat_test):
            min_test_y_predict = y_hat_test
        mse_train.append(mean_squared_error(y_train, y_hat_train))
        mse_test.append(mean_squared_error(y_test, y_hat_test))
        # p.visualize(x_test, y_test)
    # Q1A: train/test error as a function of polynomial degree (log scale).
    plt.figure()
    plt.plot(range(max_degree), mse_train, color='orange', label='The train error')
    plt.plot(range(max_degree), mse_test, color='blue', label='The test error')
    plt.title('error vs degree')
    plt.xlabel('degree')
    plt.ylabel('error')
    plt.yscale('log')
    plt.legend(loc="best")
    plt.savefig("Q1A.png")
    # Q1B: reorder the best predictions by ascending x so plt.plot draws a
    # curve from left to right instead of jumping between points.
    features_sorted = np.zeros(x_train.shape)
    targets_sorted = np.zeros(min_train_y_predict.shape)
    sort_indexes = x_train.argsort(axis=0)
    for i in range(len(x_train.argsort(axis=0))):
        features_sorted[i] = x_train[sort_indexes[i]]
        targets_sorted[i] = min_train_y_predict[sort_indexes[i]]
    features2_sorted = np.zeros(x_test.shape)
    targets2_sorted = np.zeros(min_test_y_predict.shape)
    sort_indexes = x_test.argsort(axis=0)
    for i in range(len(x_test.argsort(axis=0))):
        features2_sorted[i] = x_test[sort_indexes[i]]
        targets2_sorted[i] = min_test_y_predict[sort_indexes[i]]
    plt.figure()
    plt.scatter(x_train, y_train, color='blue')
    plt.plot(features_sorted, targets_sorted, color='orange', label='The lowest training error')
    plt.plot(features2_sorted, targets2_sorted, color='green', label='The lowest testing error')
    plt.title('X vs Y')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.legend(loc="best")
    plt.savefig("Q1B.png")
    # Q5: linear classification example.
    # NOTE(review): no plt.figure() is called between saving Q1B.png and the
    # drawing below, so Q5 is drawn on whichever figure is current -- confirm
    # this is intended.
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)
    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=200)
    clf.fit(X, Y)
    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)
    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    # Evaluate the decision function one grid point at a time to fill Z.
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])
        Z[i, j] = p[0]
    # Contours at -1, 0 and 1: solid for 0, dashed for the other two.
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'
    cs = plt.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolor='black', s=20, label='data points')
    # NOTE(review): index 0 presumably corresponds to the first entry of
    # `levels` (-1.0) rather than the 0.0 level the label names; also check
    # that `cs.collections` exists in the installed matplotlib -- confirm.
    cs.collections[0].set_label('h(x)=0')
    plt.axis('tight')
    plt.title('Linear Classification Example')
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.legend(loc="best")
    plt.savefig("Q5.png")
def train_x():
    """Return ``train['x']`` wrapped by ``matrix``."""
    column = train['x']
    return matrix(column)


def train_y():
    """Return ``train['y']`` wrapped by ``matrix``."""
    column = train['y']
    return matrix(column)


def test_x():
    """Return ``test['x']`` wrapped by ``matrix``."""
    column = test['x']
    return matrix(column)


def test_y():
    """Return ``test['y']`` wrapped by ``matrix``."""
    column = test['y']
    return matrix(column)


# Baseline: fit on the training split, then print the score obtained on the
# test split, followed by a blank line.
print('linear regression')
linear = LinearRegression()
linear.fit(train_x(), train_y())
print(linear.score(test_x(), test_y()))
print()

# Same procedure for the polynomial model, evaluated on the same splits.
print('polynomial regression')
polynomial = PolynomialRegression(
    [2, 3, 4],
    sigma=.000000000000972,
    iterations=100,
)
polynomial.fit(train_x(), train_y())
print(polynomial.score(test_x(), test_y()))
print()
# Plot error curves range_x = range(1, D + 1) plt.plot(range_x, training_errors, label="Training Error", marker="o") plt.plot(range_x, validation_errors, label="Validation Error", marker="o") plt.title(title) plt.xlabel('Number of Principal Components') plt.ylabel('Cross Validation MSE Scores') plt.legend() plt.show() if __name__ == '__main__': dataset = "./data/oil_500.pkl" rates = load_pickle_dataset(dataset) X = rates['X'] y = rates['y'] seed = 0 val_percent = 0.3 # Linear regression model = PolynomialRegression(1) model_selection(model, X, y, seed, 'Linear Regression', True) # Quadratic regression model = PolynomialRegression(2) model_selection(model, X, y, seed, 'Quadratic Regression', False)
from generatePolyPoints import generatePolyPoints
from polynomial_regression import PolynomialRegression

# Demo script: generate sample points, fit a second-order polynomial with the
# 'normal_equation' method and plot the fitted line.

# NOTE(review): the positional arguments are presumably (start, stop, number
# of points, polynomial coefficients) -- confirm against generatePolyPoints.
x_pts, y_pts = generatePolyPoints(0, 50, 100, [5, 1, 1], noiseLevel=2, plot=1)

# The model is constructed directly from the generated points.
PR = PolynomialRegression(x_pts, y_pts)
# The value returned by fit() is stored in theta and not used again below.
theta = PR.fit(method='normal_equation', order=2)
PR.plot_predictedPolyLine()