def test_2dimx_1dimy():
    """Two-feature X, 1-D y: only the first column drives the target,
    so the fitted model should recover slope 2 and intercept 1."""
    X_train = np.column_stack(([1, 2, 3, 4, 5, 6, 7, 8, 9],
                               [1, 2, 3, 4, 6, 5, 7, 8, 9]))
    Y_train = np.array([3, 5, 7, 9, 11, 13, 15, 17, 19])
    model = LinearRegression()
    model.fit(X_train, Y_train)
    # test fit
    assert model.coef_[0] == pytest.approx(2)
    assert model.intercept_ == pytest.approx(1)
    # test predictions
    X_test = np.column_stack(([1.5, 2.5, 3.5, 7.5],
                              [1.5, 2.5, 3.5, 7.5]))
    Y_test = np.array([4, 6, 8, 16])
    Y_pred = model.predict(X_test)
    assert np.all(Y_pred == pytest.approx(Y_test))
def test_2d_nonzero_intercept():
    """Single-feature fit of y = 2x + 1; residuals stay unset by default."""
    X_train = np.arange(1, 10)[:, np.newaxis]
    Y_train = np.array([3, 5, 7, 9, 11, 13, 15, 17, 19])
    model = LinearRegression()
    model.fit(X_train, Y_train)
    # test fit
    assert model.coef_[0] == pytest.approx(2)
    assert model.intercept_ == pytest.approx(1)
    # test predictions
    X_test = np.array([1.5, 2.5, 3.5, 7.5])[:, np.newaxis]
    Y_test = np.array([4, 6, 8, 16])
    Y_pred = model.predict(X_test)
    assert np.all(Y_pred == pytest.approx(Y_test))
    # residuals are only computed when explicitly requested at construction
    assert model.residuals_ is None
def test_2d_zero_intercept():
    """Exact fit of y = 2x (zero intercept) with residuals enabled;
    a perfect fit must yield an all-zero residual vector."""
    X_train = np.arange(1, 10)[:, np.newaxis]
    Y_train = np.array([2, 4, 6, 8, 10, 12, 14, 16, 18])
    model = LinearRegression(calculate_residuals=True)
    model.fit(X_train, Y_train)
    # test fit
    assert model.coef_[0] == pytest.approx(2)
    assert model.intercept_ == pytest.approx(0)
    assert model.residuals_.shape == (9, )
    assert np.all(model.residuals_ == pytest.approx(0))
    # test predictions
    X_test = np.array([1.5, 2.5, 3.5, 7.5])[:, np.newaxis]
    Y_test = np.array([3, 5, 7, 15])
    Y_pred = model.predict(X_test)
    assert np.all(Y_pred == pytest.approx(Y_test))
def hacky_polynomial():
    """As we only have a single feature, x, we perform polynomial regression
    by adding features for x^n up to the desired power. It's slightly hacky
    in that it doesn't give us a nice way to produce predictions other than
    performing the same power calculations for the x_test features.

    NB. This starts to diverge from the sklearn version for large powers
    (~15) I haven't figured out why but I suspect it is to do with f.p.
    precision"""
    min_x, max_x = -5, 10

    # Plot the true function with a green line
    X_show = np.linspace(min_x, max_x, 200)[:, np.newaxis]
    Y_show = poly_func(X_show)
    plt.plot(X_show, Y_show, color='green')

    # Define our training set, we add noise to y_values to make it interesting
    n_samples = 500
    X_train = np.random.uniform(min_x, max_x, size=(n_samples, 1))
    Y_exact = poly_func(X_train)
    Y_noise = 5 * np.random.standard_normal(size=n_samples)[:, np.newaxis]
    Y_train = Y_exact + Y_noise

    # Expand the single feature into columns x^1 .. x^max_pow
    max_pow = 16
    exponents = np.array(range(1, max_pow + 1))
    X_train_pow = np.power.outer(X_train[:, 0], exponents)

    # Now train a regression model
    model = LinearRegression()
    model.fit(X_train_pow, Y_train)

    # compare with SKL
    skl_model = PolynomialRegression(max_pow)
    skl_model.fit(X_train, Y_train)

    # Plot the results
    X_show_pow = np.power.outer(X_show[:, 0], exponents)
    plt.scatter(X_train, Y_train, s=1)
    plt.plot(X_show, model.predict(X_show_pow), c='red')
    plt.plot(X_show, skl_model.predict(X_show), c='blue')
    plt.show()
def linear():
    """Demo: fit a randomly-generated noisy linear dataset, print the score,
    and (in 1-D) plot the data with the fitted line of best fit."""
    # First invent some data
    n_samples = 300
    n_dim = 1
    x_max = 100
    X_train = np.random.uniform(x_max, size=(n_samples, n_dim))
    coefs = np.random.uniform(1, 10, size=n_dim)
    intercept = random.uniform(0, 100)
    noise = np.sqrt(x_max) * np.random.standard_normal(size=(n_samples, 1))
    Y_mult = np.sum(X_train * coefs, axis=1)[:, np.newaxis]
    Y_train = Y_mult + intercept + noise

    # Now train a regression model
    model = LinearRegression()
    model.fit(X_train, Y_train)

    # Add line of best fit to graph
    X_fit = np.linspace(0, x_max, 20)[:, np.newaxis]
    Y_fit = model.predict(X_fit)
    print(model.score(X_train, Y_train))

    # if we're in 1 dimension we can plot this
    if n_dim == 1:
        plt.figure(facecolor="w", figsize=(15, 10))
        plt.scatter(X_train, Y_train, s=1)
        plt.plot(X_fit, Y_fit, color='green')
        plt.show()
def test_invalid_dimensions():
    """fit() must reject a 2-D target vector and a 1-D feature matrix."""
    X_train = np.arange(1, 10)[:, np.newaxis]
    Y_train = np.array([3, 5, 7, 9, 11, 13, 15, 17, 19])
    model = LinearRegression()
    # y with an extra trailing axis is invalid
    with pytest.raises(ValueError):
        model.fit(X_train, Y_train[:, np.newaxis])
    # X flattened to 1-D is invalid
    with pytest.raises(ValueError):
        model.fit(X_train.ravel(), Y_train)
def test_nd_nonzero_intercept(n_samples=1000, dim=20):
    """Recover the coefficients and intercept of a noisy high-dimensional
    linear model, then check predictions and R^2 on fresh data.

    Fix: the targets were previously assembled with a Python list
    comprehension looping over numpy rows (`sum(point * coefs)` per point);
    replaced with vectorized matrix products, which is both the idiomatic
    numpy form and O(1) Python-level work instead of O(n_samples).
    """
    X_train = np.random.uniform(100, size=(n_samples, dim))
    coefs = np.random.uniform(1, 10, size=dim)
    intercept = random.uniform(-10, 10)
    # y = X @ coefs + intercept + small uniform noise, built in one shot
    noise = np.random.uniform(-0.2, 0.2, size=n_samples)
    Y_train = X_train @ coefs + intercept + noise
    model = LinearRegression()
    model.fit(X_train, Y_train)
    # test fit
    assert model.intercept_ == pytest.approx(intercept, abs=0.2)
    assert np.all(model.coef_ == pytest.approx(coefs, abs=0.2))
    # test predictions (noise-free targets this time)
    X_test = np.random.uniform(100, size=(50, dim))
    Y_test = X_test @ coefs + intercept
    Y_pred = model.predict(X_test)
    assert np.all(Y_pred == pytest.approx(Y_test, abs=0.2))
    assert model.score(X_test, Y_test) > 0.999
    # residuals are only computed when explicitly requested at construction
    assert model.residuals_ is None
def test_invalid_method():
    """An unrecognised solver name must be rejected at construction time."""
    with pytest.raises(ValueError):
        LinearRegression(method='bob')
def test_gradient_desc():
    """With two identical feature columns the normal-equation solver hits a
    singular matrix, but gradient descent still converges; the two columns
    share the total slope (sum of coefs ~= 4)."""
    X_train = np.transpose([[1, 2, 3, 4, 5, 6, 7, 8, 9],
                            [1, 2, 3, 4, 5, 6, 7, 8, 9]])
    Y_train = np.array([6, 10, 14, 18, 22, 26, 30, 34, 38])

    # linear-algebra solver cannot invert the singular design matrix
    model_la = LinearRegression()
    with pytest.raises(np.linalg.LinAlgError):
        model_la.fit(X_train, Y_train)

    # with default parameters gradient descent warns about non-convergence
    model_gd_no_params = LinearRegression(method='gradient_descent')
    with pytest.warns(RuntimeWarning):
        model_gd_no_params.fit(X_train, Y_train)

    # with tuned parameters it converges
    gd_params = {'tol': 0.001, 'max_iter': 2000, 'learning_rate': 2}
    model_gd = LinearRegression(method='gradient_descent', params=gd_params)
    model_gd.fit(X_train, Y_train)
    assert np.sum(model_gd.coef_) == pytest.approx(4, abs=0.1)
    assert model_gd.intercept_ == pytest.approx(2, abs=0.2)

    # test predictions
    X_test = np.transpose([[1.5, 2.5, 3.5, 7.5],
                           [1.5, 2.5, 3.5, 7.5]])
    Y_test = np.array([8, 12, 16, 32])
    Y_pred = model_gd.predict(X_test)
    assert np.all(Y_pred == pytest.approx(Y_test, abs=0.2))