def test_boston_dataset(n_bins): X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) mapper = _BinMapper(n_bins=n_bins, random_state=42) X_train_binned = mapper.fit_transform(X_train) # Init gradients and hessians to that of least squares loss gradients = -y_train.astype(G_H_DTYPE) hessians = np.ones(1, dtype=G_H_DTYPE) min_samples_leaf = 8 max_leaf_nodes = 31 grower = TreeGrower(X_train_binned, gradients, hessians, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes, n_bins=n_bins, n_bins_non_missing=mapper.n_bins_non_missing_) grower.grow() predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_) assert r2_score(y_train, predictor.predict(X_train)) > 0.85 assert r2_score(y_test, predictor.predict(X_test)) > 0.70
def test_partial_dependence_easy_target(est, power): # If the target y only depends on one feature in an obvious way (linear or # quadratic) then the partial dependence for that feature should reflect # it. # We here fit a linear regression_data model (with polynomial features if # needed) and compute r_squared to check that the partial dependence # correctly reflects the target. rng = np.random.RandomState(0) n_samples = 200 target_variable = 2 X = rng.normal(size=(n_samples, 5)) y = X[:, target_variable]**power est.fit(X, y) averaged_predictions, values = partial_dependence( est, features=[target_variable], X=X, grid_resolution=1000) new_X = values[0].reshape(-1, 1) new_y = averaged_predictions[0] # add polynomial features if needed new_X = PolynomialFeatures(degree=power).fit_transform(new_X) lr = LinearRegression().fit(new_X, new_y) r2 = r2_score(new_y, lr.predict(new_X)) assert r2 > .99
def test_regression_metrics(n_samples=50): y_true = np.arange(n_samples) y_pred = y_true + 1 assert_almost_equal(mean_squared_error(y_true, y_pred), 1.) assert_almost_equal( mean_squared_log_error(y_true, y_pred), mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred))) assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.) assert_almost_equal(median_absolute_error(y_true, y_pred), 1.) assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=0), mean_squared_error(y_true, y_pred)) # Tweedie deviance needs positive y_pred, except for p=0, # p>=2 needs positive y_true # results evaluated by sympy y_true = np.arange(1, 1 + n_samples) y_pred = 2 * y_true n = n_samples assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=-1), 5 / 12 * n * (n**2 + 2 * n + 1)) assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=1), (n + 1) * (1 - np.log(2))) assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=2), 2 * np.log(2) - 1) assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=3 / 2), ((6 * np.sqrt(2) - 8) / n) * np.sqrt(y_true).sum()) assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=3), np.sum(1 / y_true) / (4 * n))
def test_regression_custom_weights(): y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] msew = mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6]) rmsew = mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6], squared=False) maew = mean_absolute_error(y_true, y_pred, multioutput=[0.4, 0.6]) rw = r2_score(y_true, y_pred, multioutput=[0.4, 0.6]) evsw = explained_variance_score(y_true, y_pred, multioutput=[0.4, 0.6]) assert_almost_equal(msew, 0.39, decimal=2) assert_almost_equal(rmsew, 0.62, decimal=2) assert_almost_equal(maew, 0.475, decimal=3) assert_almost_equal(rw, 0.94, decimal=2) assert_almost_equal(evsw, 0.94, decimal=2) # Handling msle separately as it does not accept negative inputs. y_true = np.array([[0.5, 1], [1, 2], [7, 6]]) y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]]) msle = mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7]) msle2 = mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred), multioutput=[0.3, 0.7]) assert_almost_equal(msle, msle2, decimal=2)
def test_regression_metrics_at_limits(): assert_almost_equal(mean_squared_error([0.], [0.]), 0.00, 2) assert_almost_equal(mean_squared_error([0.], [0.], squared=False), 0.00, 2) assert_almost_equal(mean_squared_log_error([0.], [0.]), 0.00, 2) assert_almost_equal(mean_absolute_error([0.], [0.]), 0.00, 2) assert_almost_equal(median_absolute_error([0.], [0.]), 0.00, 2) assert_almost_equal(max_error([0.], [0.]), 0.00, 2) assert_almost_equal(explained_variance_score([0.], [0.]), 1.00, 2) assert_almost_equal(r2_score([0., 1], [0., 1]), 1.00, 2) assert_raises_regex( ValueError, "Mean Squared Logarithmic Error cannot be " "used when targets contain negative values.", mean_squared_log_error, [-1.], [-1.]) assert_raises_regex( ValueError, "Mean Squared Logarithmic Error cannot be " "used when targets contain negative values.", mean_squared_log_error, [1., 2., 3.], [1., -2., 3.]) assert_raises_regex( ValueError, "Mean Squared Logarithmic Error cannot be " "used when targets contain negative values.", mean_squared_log_error, [1., -2., 3.], [1., 2., 3.]) # Tweedie deviance error p = -1.2 assert_allclose(mean_tweedie_deviance([0], [1.], p=p), 2. / (2. - p), rtol=1e-3) with pytest.raises(ValueError, match="can only be used on strictly positive y_pred."): mean_tweedie_deviance([0.], [0.], p=p) assert_almost_equal(mean_tweedie_deviance([0.], [0.], p=0), 0.00, 2) msg = "only be used on non-negative y_true and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=1.0) p = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], p=p), 2. / (2. - p)) msg = "only be used on non-negative y_true and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) p = 2. assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), 0.00, atol=1e-8) msg = "can only be used on strictly positive y_true and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) p = 3. assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), 0.00, atol=1e-8) msg = "can only be used on strictly positive y_true and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) with pytest.raises(ValueError, match="deviance is only defined for p<=0 and p>=1."): mean_tweedie_deviance([0.], [0.], p=0.5)
def test_regression_scorers(): # Test regression scorers. diabetes = load_diabetes() X, y = diabetes.data, diabetes.target X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = Ridge() clf.fit(X_train, y_train) score1 = get_scorer('r2')(clf, X_test, y_test) score2 = r2_score(y_test, clf.predict(X_test)) assert_almost_equal(score1, score2)
def test_multioutput_regression(): y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) y_pred = np.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]]) error = mean_squared_error(y_true, y_pred) assert_almost_equal(error, (1. / 3 + 2. / 3 + 2. / 3) / 4.) error = mean_squared_error(y_true, y_pred, squared=False) assert_almost_equal(error, 0.645, decimal=2) error = mean_squared_log_error(y_true, y_pred) assert_almost_equal(error, 0.200, decimal=2) # mean_absolute_error and mean_squared_error are equal because # it is a binary problem. error = mean_absolute_error(y_true, y_pred) assert_almost_equal(error, (1. + 2. / 3) / 4.) error = r2_score(y_true, y_pred, multioutput='variance_weighted') assert_almost_equal(error, 1. - 5. / 2) error = r2_score(y_true, y_pred, multioutput='uniform_average') assert_almost_equal(error, -.875)
def test_regression_multioutput_array(): y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') evs = explained_variance_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(mse, [0.125, 0.5625], decimal=2) assert_array_almost_equal(mae, [0.25, 0.625], decimal=2) assert_array_almost_equal(r, [0.95, 0.93], decimal=2) assert_array_almost_equal(evs, [0.95, 0.93], decimal=2) # mean_absolute_error and mean_squared_error are equal because # it is a binary problem. y_true = [[0, 0]] * 4 y_pred = [[1, 1]] * 4 mse = mean_squared_error(y_true, y_pred, multioutput='raw_values') mae = mean_absolute_error(y_true, y_pred, multioutput='raw_values') r = r2_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(mse, [1., 1.], decimal=2) assert_array_almost_equal(mae, [1., 1.], decimal=2) assert_array_almost_equal(r, [0., 0.], decimal=2) r = r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values') assert_array_almost_equal(r, [0, -3.5], decimal=2) assert np.mean(r) == r2_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='uniform_average') evs = explained_variance_score([[0, -1], [0, 1]], [[2, 2], [1, 1]], multioutput='raw_values') assert_array_almost_equal(evs, [0, -1.25], decimal=2) # Checking for the condition in which both numerator and denominator is # zero. y_true = [[1, 3], [-1, 2]] y_pred = [[1, 4], [-1, 1]] r2 = r2_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(r2, [1., -3.], decimal=2) assert np.mean(r2) == r2_score(y_true, y_pred, multioutput='uniform_average') evs = explained_variance_score(y_true, y_pred, multioutput='raw_values') assert_array_almost_equal(evs, [1., -3.], decimal=2) assert np.mean(evs) == explained_variance_score(y_true, y_pred) # Handling msle separately as it does not accept negative inputs. y_true = np.array([[0.5, 1], [1, 2], [7, 6]]) y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]]) msle = mean_squared_log_error(y_true, y_pred, multioutput='raw_values') msle2 = mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred), multioutput='raw_values') assert_array_almost_equal(msle, msle2, decimal=2)
# Split the targets into training/testing sets diabetes_y_train = diabetes_y[:-20] diabetes_y_test = diabetes_y[-20:] # Create linear regression object regr = linear_model.LinearRegression() # Train the model using the training sets regr.fit(diabetes_X_train, diabetes_y_train) # Make predictions using the testing set diabetes_y_pred = regr.predict(diabetes_X_test) # The coefficients print('Coefficients: \n', regr.coef_) # The mean squared error print('Mean squared error: %.2f' % mean_squared_error(diabetes_y_test, diabetes_y_pred)) # The coefficient of determination: 1 is perfect prediction print('Coefficient of determination: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred)) # Plot outputs plt.scatter(diabetes_X_test, diabetes_y_test, color='black') plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3) plt.xticks(()) plt.yticks(()) plt.show()
# reported by the median absolute error (MAE). f, (ax0, ax1) = plt.subplots(1, 2, sharey=True) regr = RidgeCV() regr.fit(X_train, y_train) y_pred = regr.predict(X_test) ax0.scatter(y_test, y_pred) ax0.plot([0, 2000], [0, 2000], '--k') ax0.set_ylabel('Target predicted') ax0.set_xlabel('True Target') ax0.set_title('Ridge regression \n without target transformation') ax0.text( 100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred))) ax0.set_xlim([0, 2000]) ax0.set_ylim([0, 2000]) regr_trans = TransformedTargetRegressor(regressor=RidgeCV(), func=np.log1p, inverse_func=np.expm1) regr_trans.fit(X_train, y_train) y_pred = regr_trans.predict(X_test) ax1.scatter(y_test, y_pred) ax1.plot([0, 2000], [0, 2000], '--k') ax1.set_ylabel('Target predicted') ax1.set_xlabel('True Target') ax1.set_title('Ridge regression \n with target transformation') ax1.text(
y += 0.01 * np.random.normal(size=n_samples) # Split data in train set and test set n_samples = X.shape[0] X_train, y_train = X[:n_samples // 2], y[:n_samples // 2] X_test, y_test = X[n_samples // 2:], y[n_samples // 2:] # ############################################################################# # Lasso from mrex.linear_model import Lasso alpha = 0.1 lasso = Lasso(alpha=alpha) y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test) r2_score_lasso = r2_score(y_test, y_pred_lasso) print(lasso) print("r^2 on test data : %f" % r2_score_lasso) # ############################################################################# # ElasticNet from mrex.linear_model import ElasticNet enet = ElasticNet(alpha=alpha, l1_ratio=0.7) y_pred_enet = enet.fit(X_train, y_train).predict(X_test) r2_score_enet = r2_score(y_test, y_pred_enet) print(enet) print("r^2 on test data : %f" % r2_score_enet) m, s, _ = plt.stem(np.where(enet.coef_)[0], enet.coef_[enet.coef_ != 0],
def score(y_test, y_pred, case): r2 = r2_score(y_test, y_pred) print("r^2 on test data (%s) : %f" % (case, r2))