def test_raises_value_error_if_sample_weights_greater_than_1d():
    """Check that Ridge.fit only accepts scalar or 1D sample weights.

    For two small problem sizes, a 1D weight vector and two scalar weights
    must fit without error, while the same vector reshaped to a column or
    a row must raise ``ValueError`` with the expected message.
    """
    expected_message = "Sample weights must be 1D array or scalar"
    rng = np.random.RandomState(42)

    for n_samples, n_features in zip([2, 3], [3, 2]):
        X = rng.randn(n_samples, n_features)
        y = rng.randn(n_samples)
        weights_1d = rng.randn(n_samples) ** 2 + 1

        ridge = Ridge(alpha=1)

        # Make sure the valid weights actually work: a 1D array, then the
        # scalars 1.0 and 2.0.
        for valid_weights in (weights_1d, 1.0, 2.0):
            ridge.fit(X, y, valid_weights)

        # The same values as an (n, 1) column and a (1, n) row are 2D and
        # must be rejected.
        for invalid_weights in (weights_1d[:, np.newaxis],
                                weights_1d[np.newaxis, :]):
            assert_raise_message(
                ValueError, expected_message,
                lambda: ridge.fit(X, y, invalid_weights))
def _test_ridge_loo(filter_):
    """Compare the efficient leave-one-out of ``_RidgeGCV`` with brute force.

    Parameters
    ----------
    filter_ : callable
        Applied to ``X_diabetes`` before every ``fit``/``predict`` call on
        the GCV estimators, so the same checks can run on dense or sparse
        input.

    Returns
    -------
    list
        A one-element list holding the best alpha found by ``_RidgeGCV``.
    """
    n_samples = X_diabetes.shape[0]

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(fit_intercept=False)

    # Generalized cross-validation (efficient leave-one-out).
    K, v, Q = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = ridge_gcv._errors(v, Q, y_diabetes, 1.0)
    values, c = ridge_gcv._values(K, v, Q, y_diabetes, 1.0)

    # Brute-force leave-one-out: refit with one example removed each time.
    errors2 = []
    values2 = []
    sample_index = np.arange(n_samples)
    for i in range(n_samples):
        keep = sample_index != i
        ridge.fit(X_diabetes[keep], y_diabetes[keep])
        held_out_value = ridge.predict([X_diabetes[i]])[0]
        errors2.append((y_diabetes[i] - held_out_value) ** 2)
        values2.append(held_out_value)

    # Efficient and brute-force LOO must agree.
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # Reference best alpha.
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    best_alpha = ridge_gcv.best_alpha

    # Same best alpha with a custom loss_func.
    ridge_gcv2 = _RidgeGCV(fit_intercept=False, loss_func=mean_squared_error)
    ridge_gcv2.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.best_alpha, best_alpha)

    # Same best alpha with unit sample weights.
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.best_alpha, best_alpha)

    # Two identical responses must give two identical prediction columns.
    Y = np.vstack((y_diabetes, y_diabetes)).T
    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
                              Y_pred, decimal=5)

    return [best_alpha]
def test_ridge_singular():
    """Fit Ridge with alpha=0 on a singular design and check the score.

    The design matrix and the target are each built by stacking a random
    half on top of itself, so every row appears twice.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 6, 6
    half = n_samples // 2

    y_half = rng.randn(half)
    y = np.concatenate((y_half, y_half))
    X_half = rng.randn(half, n_features)
    X = np.concatenate((X_half, X_half), axis=0)

    ridge = Ridge(alpha=0)
    ridge.fit(X, y)
    assert_greater(ridge.score(X, y), 0.9)
def test_dtype_match_cholesky():
    """Check dtype consistency of the cholesky solver with per-target alphas.

    Kept separate from ``test_dtype_match`` for clarity; a tuple of alphas
    is passed to the cholesky solver for full coverage.
    """
    rng = np.random.RandomState(0)
    alpha = (1.0, 0.5)

    n_samples, n_features, n_target = 6, 7, 2
    X_64 = rng.randn(n_samples, n_features)
    y_64 = rng.randn(n_samples, n_target)
    X_32, y_32 = X_64.astype(np.float32), y_64.astype(np.float32)

    # 32-bit fit.
    ridge_32 = Ridge(alpha=alpha, solver='cholesky')
    ridge_32.fit(X_32, y_32)

    # 64-bit fit.
    ridge_64 = Ridge(alpha=alpha, solver='cholesky')
    ridge_64.fit(X_64, y_64)

    # All checks are grouped here, which is easier to debug.
    assert ridge_32.coef_.dtype == X_32.dtype
    assert ridge_64.coef_.dtype == X_64.dtype
    assert ridge_32.predict(X_32).dtype == X_32.dtype
    assert ridge_64.predict(X_64).dtype == X_64.dtype
    assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5)
def test_dtype_match(solver):
    """Check that ``solver`` keeps float32/float64 dtypes and agrees on coef_.

    Parameters
    ----------
    solver : str
        Value forwarded to ``Ridge(solver=...)``.
    """
    rng = np.random.RandomState(0)
    alpha = 1.0

    n_samples, n_features = 6, 5
    X_64 = rng.randn(n_samples, n_features)
    y_64 = rng.randn(n_samples)
    X_32, y_32 = X_64.astype(np.float32), y_64.astype(np.float32)

    # Both precisions use the same estimator settings.
    ridge_kwargs = dict(alpha=alpha, solver=solver, max_iter=500, tol=1e-10)

    ridge_32 = Ridge(**ridge_kwargs)
    ridge_32.fit(X_32, y_32)

    ridge_64 = Ridge(**ridge_kwargs)
    ridge_64.fit(X_64, y_64)

    # Checks are grouped at the end for easier debugging.
    assert ridge_32.coef_.dtype == X_32.dtype
    assert ridge_64.coef_.dtype == X_64.dtype
    assert ridge_32.predict(X_32).dtype == X_32.dtype
    assert ridge_64.predict(X_64).dtype == X_64.dtype
    assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4)
def test_dtype_match():
    """Check dtype consistency of several Ridge solvers on 32/64-bit data.

    For each of ``svd``, ``sparse_cg``, ``cholesky`` and ``lsqr`` the fitted
    coefficients and predictions must keep the input dtype, and the two
    precisions must agree to five decimals.
    """
    rng = np.random.RandomState(0)
    alpha = 1.0

    n_samples, n_features = 6, 5
    X_64 = rng.randn(n_samples, n_features)
    y_64 = rng.randn(n_samples)
    X_32, y_32 = X_64.astype(np.float32), y_64.astype(np.float32)

    for solver in ("svd", "sparse_cg", "cholesky", "lsqr"):
        # 32-bit fit.
        ridge_32 = Ridge(alpha=alpha, solver=solver)
        ridge_32.fit(X_32, y_32)

        # 64-bit fit.
        ridge_64 = Ridge(alpha=alpha, solver=solver)
        ridge_64.fit(X_64, y_64)

        # Checks are grouped at the end for easier debugging.
        assert ridge_32.coef_.dtype == X_32.dtype
        assert ridge_64.coef_.dtype == X_64.dtype
        assert ridge_32.predict(X_32).dtype == X_32.dtype
        assert ridge_64.predict(X_64).dtype == X_64.dtype
        assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5)
def test_ridge_sample_weights():
    """Check sample-weight handling of the cholesky solver.

    Two checks are run for each problem shape and alpha:

    * ``ridge_regression`` with ``sample_weight`` must match
      ``ridge_regression`` on data rescaled by the square root of the
      weights;
    * ``Ridge`` (default ``fit_intercept``) must match a single Newton step
      on the weighted, intercept-augmented problem.
    """
    rng = np.random.RandomState(0)

    for solver in ("cholesky", ):
        for n_samples, n_features in ((6, 5), (5, 10)):
            for alpha in (1.0, 1e-2):
                y = rng.randn(n_samples)
                X = rng.randn(n_samples, n_features)
                sample_weight = 1 + rng.rand(n_samples)

                weighted_coefs = ridge_regression(
                    X, y, alpha=alpha, sample_weight=sample_weight,
                    solver=solver)

                # For the square loss, sample weights amount to a simple
                # rescaling of X and y.
                rescaled_coefs = ridge_regression(
                    X * np.sqrt(sample_weight)[:, np.newaxis],
                    y * np.sqrt(sample_weight),
                    alpha=alpha, solver=solver)
                assert_array_almost_equal(weighted_coefs, rescaled_coefs)

                # Estimator fitted with its default fit_intercept.
                est = Ridge(alpha=alpha, solver=solver)
                est.fit(X, y, sample_weight=sample_weight)

                # Newton's method: a quadratic objective is solved in a
                # single step from a zero initialisation.
                sample_weight = np.sqrt(sample_weight)
                X_with_ones = np.column_stack((np.ones(n_samples), X))
                X_weighted = sample_weight[:, np.newaxis] * X_with_ones
                y_weighted = y * sample_weight

                # Gradient is (X*coef-y)*X + alpha*coef_[1:]; the coef
                # terms vanish because coef starts at zero.
                grad = -np.dot(y_weighted, X_weighted)

                # Hessian is (X.T*X) + alpha*I, except that the first
                # diagonal entry gets no alpha since the intercept is not
                # penalised.
                diag = alpha * np.ones(n_features + 1)
                diag[0] = 0.
                hess = np.dot(X_weighted.T, X_weighted)
                hess.flat[::n_features + 2] += diag

                coef_ = - np.dot(linalg.inv(hess), grad)
                assert_almost_equal(coef_[0], est.intercept_)
                assert_array_almost_equal(coef_[1:], est.coef_)
def test_fit_simple_backupsklearn():
    """Compare h2o4gpu.Ridge with scikit-learn's Ridge on ``simple.txt``.

    The last column of the whitespace-delimited file is the target and the
    remaining columns are the features. Coefficients and intercept of the
    h2o4gpu estimator built with ``normalize=True, random_state=1234`` must
    be close to those of scikit-learn's Ridge built with the same arguments.
    """
    # sep=r"\s+" is the documented equivalent of the deprecated
    # delim_whitespace=True.
    df = pd.read_csv("./open_data/simple.txt", sep=r"\s+")
    X = np.array(df.iloc[:, :-1], dtype='float32', order='C')
    y = np.array(df.iloc[:, -1], dtype='float32', order='C')

    Solver = h2o4gpu.Ridge

    enet = Solver(glm_stop_early=False)
    print("h2o4gpu fit()")
    enet.fit(X, y)
    print("h2o4gpu predict()")
    print(enet.predict(X))
    print("h2o4gpu score()")
    print(enet.score(X, y))

    enet_wrapper = Solver(normalize=True, random_state=1234)
    print("h2o4gpu scikit wrapper fit()")
    enet_wrapper.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet_wrapper.predict(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet_wrapper.score(X, y))

    # Public import path; the private sklearn.linear_model.ridge module is
    # not available in recent scikit-learn releases.
    from sklearn.linear_model import Ridge
    enet_sk = Ridge(normalize=True, random_state=1234)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit score()")
    print(enet_sk.score(X, y))

    # Round-trip through a float32 sparse matrix before comparing.
    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()

    print(enet_sk.coef_)
    print(enet_sk_coef)
    print(enet_wrapper.coef_)

    print(enet_sk.intercept_)
    print(enet_wrapper.intercept_)

    print(enet_sk.n_iter_)
    print(enet_wrapper.n_iter_)

    # NOTE(review): the message mentions n_iters, but only coefficients and
    # intercept are asserted below.
    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(enet_wrapper.coef_, enet_sk_coef)
    assert np.allclose(enet_wrapper.intercept_, enet_sk.intercept_)
def test_ridge_intercept():
    """Check the intercept with multiple targets (GH issue #708).

    Fitting on ``[y, 1 + y]`` must give the single-target intercept for the
    first column and that intercept plus one for the second.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 5, 10
    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples)
    Y = np.c_[y, 1. + y]

    ridge = Ridge()

    # Single-target reference.
    ridge.fit(X, y)
    single_target_intercept = ridge.intercept_

    # Two targets, the second shifted by one.
    ridge.fit(X, Y)
    assert_almost_equal(ridge.intercept_[0], single_target_intercept)
    assert_almost_equal(ridge.intercept_[1], single_target_intercept + 1.)
def _test_tolerance(filter_):
    """Check that a tighter ``tol`` does not give a lower training score.

    Parameters
    ----------
    filter_ : callable
        Applied to ``X_diabetes`` before each ``fit`` and ``score`` call.
    """
    def fit_and_score(tol):
        # Fit without intercept at the given tolerance, score on the same
        # data.
        model = Ridge(tol=tol, fit_intercept=False)
        model.fit(filter_(X_diabetes), y_diabetes)
        return model.score(filter_(X_diabetes), y_diabetes)

    score = fit_and_score(1e-5)
    score2 = fit_and_score(1e-3)

    assert score >= score2
def _test_tolerance(filter_):
    """Check that a tighter ``tol`` does not give a lower training score.

    Parameters
    ----------
    filter_ : callable
        Applied to ``X_diabetes`` before each ``fit`` and ``score`` call.
    """
    def fit_and_score(tol):
        # Fit at the given tolerance and score on the same data.
        model = Ridge(tol=tol)
        model.fit(filter_(X_diabetes), y_diabetes)
        return model.score(filter_(X_diabetes), y_diabetes)

    score = fit_and_score(1e-5)
    score2 = fit_and_score(1e-3)

    assert_true(score >= score2)
def test_ridge_vs_lstsq():
    """With alpha=0., Ridge and LinearRegression give the same coefficients.

    Both estimators are built without intercept, and the fit-and-compare
    step is run twice on the same data.
    """
    # More samples than features are needed here.
    n_samples, n_features = 5, 4
    y = rng.randn(n_samples)
    X = rng.randn(n_samples, n_features)

    ridge = Ridge(alpha=0., fit_intercept=False)
    ols = LinearRegression(fit_intercept=False)

    for _ in range(2):
        ridge.fit(X, y)
        ols.fit(X, y)
        assert_almost_equal(ridge.coef_, ols.coef_)
def test_ridge_sample_weights():
    """Check weighted Ridge fits against the closed-form solution.

    For each problem shape, and each combination of alpha, intercept flag
    and solver, the fitted coefficients (and intercept, when fitted) must
    match ``theta = (X^T W X + alpha I)^(-1) X^T W y``, where the intercept
    column is left unpenalised.
    """
    # TODO: loop over sparse data as well
    # Note: parametrizing this test with pytest results in failed
    #       assertions, meaning that it is not extremely robust

    rng = np.random.RandomState(0)
    # Materialised as a list on purpose: a bare ``product`` iterator is
    # exhausted after the first problem shape, so the second shape would
    # silently run no checks at all.
    param_grid = list(product((1.0, 1e-2), (True, False),
                              ('svd', 'cholesky', 'lsqr', 'sparse_cg')))

    for n_samples, n_features in ((6, 5), (5, 10)):

        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        for (alpha, intercept, solver) in param_grid:

            # Ridge with explicit sample_weight
            est = Ridge(alpha=alpha, fit_intercept=intercept,
                        solver=solver, tol=1e-6)
            est.fit(X, y, sample_weight=sample_weight)
            coefs = est.coef_
            inter = est.intercept_

            # Closed form of the weighted regularized least square
            # theta = (X^T W X + alpha I)^(-1) * X^T W y
            W = np.diag(sample_weight)
            if intercept is False:
                X_aug = X
                I = np.eye(n_features)
            else:
                # Prepend a column of ones for the intercept and leave it
                # out of the penalty.
                dummy_column = np.ones(shape=(n_samples, 1))
                X_aug = np.concatenate((dummy_column, X), axis=1)
                I = np.eye(n_features + 1)
                I[0, 0] = 0

            cf_coefs = linalg.solve(X_aug.T.dot(W).dot(X_aug) + alpha * I,
                                    X_aug.T.dot(W).dot(y))

            if intercept is False:
                assert_array_almost_equal(coefs, cf_coefs)
            else:
                assert_array_almost_equal(coefs, cf_coefs[1:])
                assert_almost_equal(inter, cf_coefs[0])
def test_sparse_design_with_sample_weights():
    """Check that sample weights work with sparse design matrices.

    For each of five scipy sparse formats, a Ridge fitted on the sparse
    matrix must give the same coefficients (to six decimals) as a Ridge
    fitted on the dense array.
    """
    rng = np.random.RandomState(42)

    sparse_matrix_converters = (
        sp.coo_matrix,
        sp.csr_matrix,
        sp.csc_matrix,
        sp.lil_matrix,
        sp.dok_matrix,
    )

    sparse_ridge = Ridge(alpha=1., fit_intercept=False)
    dense_ridge = Ridge(alpha=1., fit_intercept=False)

    for n_samples, n_features in zip([2, 3], [3, 2]):
        X = rng.randn(n_samples, n_features)
        y = rng.randn(n_samples)
        sample_weights = rng.randn(n_samples) ** 2 + 1

        for to_sparse in sparse_matrix_converters:
            sparse_ridge.fit(to_sparse(X), y, sample_weight=sample_weights)
            dense_ridge.fit(X, y, sample_weight=sample_weights)

            assert_array_almost_equal(sparse_ridge.coef_, dense_ridge.coef_,
                                      decimal=6)
def _test_multi_ridge_diabetes(filter_):
    """Check Ridge on two identical responses against the single response.

    Parameters
    ----------
    filter_ : callable
        Applied to ``X_diabetes`` before each ``fit`` and ``predict`` call.
    """
    n_features = X_diabetes.shape[1]

    # Simulate several responses by duplicating the target.
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge = Ridge(fit_intercept=False)

    ridge.fit(filter_(X_diabetes), Y)
    assert_equal(ridge.coef_.shape, (2, n_features))
    Y_pred = ridge.predict(filter_(X_diabetes))

    ridge.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge.predict(filter_(X_diabetes))

    # Each multi-output column must match the single-output prediction.
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
                              Y_pred, decimal=3)
def test_ridge_fit_intercept_sparse():
    """Check intercept fitting on sparse input for the sag and lsqr solvers.

    A ``sag`` Ridge fitted on CSR input must match the dense ``sag`` fit.
    A Ridge configured with ``lsqr`` must emit a ``UserWarning`` when fitted
    on CSR input and still match the dense ``sag`` fit.
    """
    X, y = make_regression(n_samples=1000, n_features=2, n_informative=2,
                           bias=10.0, random_state=42)
    X_csr = sp.csr_matrix(X)

    sag_kwargs = dict(alpha=1.0, tol=1.0e-15, solver="sag",
                      fit_intercept=True)
    dense = Ridge(**sag_kwargs)
    sparse = Ridge(**sag_kwargs)
    dense.fit(X, y)
    sparse.fit(X_csr, y)
    assert_almost_equal(dense.intercept_, sparse.intercept_)
    assert_array_almost_equal(dense.coef_, sparse.coef_)

    # Test the solver switch and the corresponding warning.
    sparse = Ridge(alpha=1.0, tol=1.0e-15, solver="lsqr",
                   fit_intercept=True)
    assert_warns(UserWarning, sparse.fit, X_csr, y)
    assert_almost_equal(dense.intercept_, sparse.intercept_)
    assert_array_almost_equal(dense.coef_, sparse.coef_)
def connectWidgets(self):
    """Initialise the widgets from default ``RidgeCV`` and ``Ridge`` values.

    Calls ``setVisible(False)`` on ``self.Ridge``, then copies the
    attributes of a freshly constructed ``RidgeCV`` into the ``*_cv``
    widgets and those of a freshly constructed ``Ridge`` into the remaining
    widgets.
    """
    self.Ridge.setVisible(False)

    # Cross-validated estimator defaults.
    cv_defaults = RidgeCV()
    self.alphasLineEdit_cv.setText(str(cv_defaults.alphas))
    self.fitInterceptCheckBox_cv.setChecked(cv_defaults.fit_intercept)
    self.normalizeCheckBox_cv.setChecked(cv_defaults.normalize)
    self.defaultComboItem(self.scoringComboBox_cv, cv_defaults.scoring)
    self.defaultComboItem(self.gCVModeComboBox_cv, cv_defaults.gcv_mode)
    self.storeCVValuesCheckBox_cv.setChecked(cv_defaults.store_cv_values)

    # Plain estimator defaults.
    plain_defaults = Ridge()
    self.alphaDoubleSpinBox.setValue(plain_defaults.alpha)
    self.fitInterceptCheckBox.setChecked(plain_defaults.fit_intercept)
    self.normalizeCheckBox.setChecked(plain_defaults.normalize)
    self.copyXCheckBox.setChecked(plain_defaults.copy_X)
    self.defaultComboItem(self.solverComboBox, plain_defaults.solver)
    self.toleranceDoubleSpinBox.setValue(plain_defaults.tol)
    self.randomStateLineEdit.setText(str(plain_defaults.random_state))
def test_ridge_shapes():
    """Check the shapes of ``coef_`` and ``intercept_`` for 1D/2D targets.

    Covers a 1D target, a single-column 2D target and a two-column target.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 5, 10
    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples)
    Y1 = y[:, np.newaxis]
    Y = np.c_[y, 1 + y]

    ridge = Ridge()

    # (target, expected coef_ shape, expected intercept_ shape)
    cases = (
        (y, (n_features, ), ()),
        (Y1, (1, n_features), (1, )),
        (Y, (2, n_features), (2, )),
    )
    for target, coef_shape, intercept_shape in cases:
        ridge.fit(X, target)
        assert_equal(ridge.coef_.shape, coef_shape)
        assert_equal(ridge.intercept_.shape, intercept_shape)
def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
             copy_X=True, max_iter=None, tol=0.001, solver='auto',
             random_state=None):
    """Store the hyperparameters and build the wrapped model from them.

    Every argument is recorded under its own name in ``self._hyperparams``
    and the whole dict is forwarded as keyword arguments to ``SKLModel``.
    """
    self._hyperparams = dict(
        alpha=alpha,
        fit_intercept=fit_intercept,
        normalize=normalize,
        copy_X=copy_X,
        max_iter=max_iter,
        tol=tol,
        solver=solver,
        random_state=random_state,
    )
    self._wrapped_model = SKLModel(**self._hyperparams)
def function(self):
    """Collect the estimator parameters currently selected in the widgets.

    Returns
    -------
    tuple
        ``(params, changed)``. ``params`` is the full parameter dict,
        including a ``'CV'`` flag taken from the cross-validate checkbox.
        ``changed`` is the result of ``self.getChangedValues`` called with
        ``params`` and a default ``RidgeCV`` (cross-validation checked) or
        ``Ridge`` (otherwise).

    Raises
    ------
    KeyError
        If the max-iterations or random-state field holds text that is
        neither an integer nor the literal ``'None'``.
    """
    def _int_or_none(text):
        # Parse an integer field; the literal 'None' maps to None. Only
        # conversion failures are caught, so unrelated errors such as
        # KeyboardInterrupt are no longer swallowed.
        try:
            return int(text)
        except (TypeError, ValueError):
            return {'None': None}[text]

    # Both fields are parsed up front, whichever branch is taken below.
    m_state = _int_or_none(self.maxNumOfIterationslineEdit.text())
    r_state = _int_or_none(self.randomStateLineEdit.text())

    use_cv = self.crossValidateCheckBox.isChecked()

    if use_cv:
        # NOTE(review): these lookups only have a 'None' key, so .get()
        # returns None for every other selection as well - confirm this
        # is intended.
        params = {
            'alphas': ast.literal_eval(self.alphasLineEdit_cv.text()),
            'fit_intercept': self.fitInterceptCheckBox_cv.isChecked(),
            'normalize': self.normalizeCheckBox_cv.isChecked(),
            'scoring': {
                'None': None
            }.get(self.scoringComboBox_cv.currentText()),
            'gcv_mode': {
                'None': None
            }.get(self.gCVModeComboBox_cv.currentText()),
            'store_cv_values': self.storeCVValuesCheckBox_cv.isChecked(),
            'CV': self.crossValidateCheckBox.isChecked()
        }
        return params, self.getChangedValues(params, RidgeCV())

    params = {
        'alpha': self.alphaDoubleSpinBox.value(),
        'copy_X': self.copyXCheckBox.isChecked(),
        'fit_intercept': self.fitInterceptCheckBox.isChecked(),
        'max_iter': m_state,
        'normalize': self.normalizeCheckBox.isChecked(),
        'solver': self.solverComboBox.currentText(),
        'tol': self.toleranceDoubleSpinBox.value(),
        'random_state': r_state,
        'CV': self.crossValidateCheckBox.isChecked()
    }
    return params, self.getChangedValues(params, Ridge())
def test_fit_simple_backupsklearn():
    """Compare h2o4gpu.Ridge with scikit-learn's Ridge on ``simple.txt``.

    The last column of the whitespace-delimited file is the target and the
    remaining columns are the features. Coefficients and intercept of the
    h2o4gpu estimator built with ``normalize=True, random_state=1234`` must
    be close to those of scikit-learn's Ridge built with the same arguments.
    """
    # sep=r"\s+" is the documented equivalent of the deprecated
    # delim_whitespace=True.
    df = pd.read_csv("./open_data/simple.txt", sep=r"\s+")
    X = np.array(df.iloc[:, :-1], dtype='float32', order='C')
    y = np.array(df.iloc[:, -1], dtype='float32', order='C')

    Solver = h2o4gpu.Ridge

    enet = Solver(glm_stop_early=False)
    print("h2o4gpu fit()")
    enet.fit(X, y)
    print("h2o4gpu predict()")
    print(enet.predict(X))
    print("h2o4gpu score()")
    print(enet.score(X, y))

    enet_wrapper = Solver(normalize=True, random_state=1234)
    print("h2o4gpu scikit wrapper fit()")
    enet_wrapper.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet_wrapper.predict(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet_wrapper.score(X, y))

    # Public import path; the private sklearn.linear_model.ridge module is
    # not available in recent scikit-learn releases.
    from sklearn.linear_model import Ridge
    enet_sk = Ridge(normalize=True, random_state=1234)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit score()")
    print(enet_sk.score(X, y))

    # Round-trip through a float32 sparse matrix before comparing.
    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()

    print(enet_sk.coef_)
    print(enet_sk_coef)
    print(enet_wrapper.coef_)

    print(enet_sk.intercept_)
    print(enet_wrapper.intercept_)

    print(enet_sk.n_iter_)
    print(enet_wrapper.n_iter_)

    # NOTE(review): the message mentions n_iters, but only coefficients and
    # intercept are asserted below.
    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(enet_wrapper.coef_, enet_sk_coef)
    assert np.allclose(enet_wrapper.intercept_, enet_sk.intercept_)
def test_n_iter():
    """Check the value of ``n_iter_`` reported by each solver.

    With a very small ``tol``, ``sag``, ``saga`` and ``lsqr`` must report
    ``max_iter`` for every target; ``sparse_cg``, ``svd`` and ``cholesky``
    must report ``None``.
    """
    n_targets = 2
    X, y = X_diabetes, y_diabetes
    y_n = np.tile(y, (n_targets, 1)).T

    for max_iter in range(1, 4):
        for solver in ('sag', 'saga', 'lsqr'):
            reg = Ridge(solver=solver, max_iter=max_iter, tol=1e-12)
            reg.fit(X, y_n)
            expected_n_iter = np.tile(max_iter, n_targets)
            assert_array_equal(reg.n_iter_, expected_n_iter)

    for solver in ('sparse_cg', 'svd', 'cholesky'):
        reg = Ridge(solver=solver, max_iter=1, tol=1e-1)
        reg.fit(X, y_n)
        assert_equal(reg.n_iter_, None)
def test_ridgecv_sample_weight():
    """Check RidgeCV with sample weights against GridSearchCV over Ridge.

    Both must select the same alpha and give the same coefficients, for
    n_samples > n_features and the opposite (different algorithms are used
    in the two regimes).
    """
    rng = np.random.RandomState(0)
    alphas = (0.1, 1.0, 10.0)

    for n_samples, n_features in ((6, 5), (5, 10)):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        cv = KFold(5)

        ridgecv = RidgeCV(alphas=alphas, cv=cv)
        ridgecv.fit(X, y, sample_weight=sample_weight)

        # Reference: explicit grid search over the same alphas and folds.
        gs = GridSearchCV(Ridge(), {'alpha': alphas}, cv=cv)
        gs.fit(X, y, sample_weight=sample_weight)

        best = gs.best_estimator_
        assert_equal(ridgecv.alpha_, best.alpha)
        assert_array_almost_equal(ridgecv.coef_, best.coef_)
def test_toy_ridge_object():
    """Check Ridge predictions and attribute types on a two-point toy set.

    With ``alpha=0.0`` the fit on ``(1, 1), (2, 2)`` must predict the
    identity on new points. ``coef_`` must be 1D with a ``np.float64``
    intercept for a 1D target, and 2D with an ``np.ndarray`` intercept for
    a two-column target.
    """
    # TODO: test also n_samples > n_features
    X = np.array([[1], [2]])
    Y = np.array([1, 2])
    reg = Ridge(alpha=0.0)
    reg.fit(X, Y)
    X_test = [[1], [2], [3], [4]]
    assert_almost_equal(reg.predict(X_test), [1., 2, 3, 4])

    assert len(reg.coef_.shape) == 1
    assert type(reg.intercept_) == np.float64

    # Refit on two identical target columns.
    Y = np.vstack((Y, Y)).T
    reg.fit(X, Y)

    assert len(reg.coef_.shape) == 2
    assert type(reg.intercept_) == np.ndarray
def test_toy_ridge_object():
    """Check Ridge predictions and attribute types on a two-point toy set.

    With ``alpha=0.0`` the fit on ``(1, 1), (2, 2)`` must predict the
    identity on new points. ``coef_`` must be 1D with a ``np.float64``
    intercept for a 1D target, and 2D with an ``np.ndarray`` intercept for
    a two-column target.
    """
    # TODO: test also n_samples > n_features
    X = np.array([[1], [2]])
    Y = np.array([1, 2])
    clf = Ridge(alpha=0.0)
    clf.fit(X, Y)
    X_test = [[1], [2], [3], [4]]
    assert_almost_equal(clf.predict(X_test), [1., 2, 3, 4])

    assert_equal(len(clf.coef_.shape), 1)
    assert_equal(type(clf.intercept_), np.float64)

    # Refit on two identical target columns.
    Y = np.vstack((Y, Y)).T
    clf.fit(X, Y)

    assert_equal(len(clf.coef_.shape), 2)
    assert_equal(type(clf.intercept_), np.ndarray)
def test_ridge_fit_intercept_sparse():
    """Check intercept fitting on sparse input across solvers.

    ``sag`` and ``sparse_cg`` must fit CSR input without emitting any
    warning and match the dense fit. ``saga`` and ``lsqr`` must raise a
    ``ValueError`` whose message matches ``"In Ridge,"``.
    """
    # Local import keeps this block self-contained.
    import warnings

    X, y = make_regression(n_samples=1000, n_features=2, n_informative=2,
                           bias=10., random_state=42)

    X_csr = sp.csr_matrix(X)

    for solver in ['sag', 'sparse_cg']:
        dense = Ridge(alpha=1., tol=1.e-15, solver=solver, fit_intercept=True)
        sparse = Ridge(alpha=1., tol=1.e-15, solver=solver, fit_intercept=True)
        dense.fit(X, y)
        # Record every warning raised during the sparse fit.
        # pytest.warns(None) is deprecated in pytest 7 and an error in
        # pytest 8.
        with warnings.catch_warnings(record=True) as record:
            warnings.simplefilter("always")
            sparse.fit(X_csr, y)
        assert len(record) == 0
        assert_almost_equal(dense.intercept_, sparse.intercept_)
        assert_array_almost_equal(dense.coef_, sparse.coef_)

    # These solvers are expected to reject sparse X with fit_intercept=True.
    for solver in ['saga', 'lsqr']:
        sparse = Ridge(alpha=1., tol=1.e-15, solver=solver, fit_intercept=True)
        assert_raises_regex(ValueError, "In Ridge,", sparse.fit, X_csr, y)
def test_ridge_fit_intercept_sparse(solver):
    """Check that ``solver`` fits an intercept on sparse X without warnings.

    Parameters
    ----------
    solver : str
        Value forwarded to ``Ridge(solver=...)`` for the sparse fit.
    """
    # Local import keeps this block self-contained.
    import warnings

    X, y = _make_sparse_offset_regression(n_features=20, random_state=0)
    X_csr = sp.csr_matrix(X)

    # for now only sparse_cg can correctly fit an intercept with sparse X with
    # default tol and max_iter.
    # sag is tested separately in test_ridge_fit_intercept_sparse_sag
    # because it requires more iterations and should raise a warning if default
    # max_iter is used.
    # other solvers raise an exception, as checked in
    # test_ridge_fit_intercept_sparse_error
    #
    # "auto" should switch to "sparse_cg" when X is sparse
    # so the reference we use for both ("auto" and "sparse_cg") is
    # Ridge(solver="sparse_cg"), fitted using the dense representation (note
    # that "sparse_cg" can fit sparse or dense data)
    dense_ridge = Ridge(solver='sparse_cg')
    sparse_ridge = Ridge(solver=solver)
    dense_ridge.fit(X, y)
    # Record every warning raised during the sparse fit.
    # pytest.warns(None) is deprecated in pytest 7 and an error in pytest 8.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        sparse_ridge.fit(X_csr, y)
    assert len(record) == 0
    assert np.allclose(dense_ridge.intercept_, sparse_ridge.intercept_)
    assert np.allclose(dense_ridge.coef_, sparse_ridge.coef_)
def test_ridge_shapes():
    """Check the shapes of ``coef_`` and ``intercept_`` for 1D/2D targets.

    Covers a 1D target, a single-column 2D target and a two-column target.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 5, 10
    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples)
    Y1 = y[:, np.newaxis]
    Y = np.c_[y, 1 + y]

    ridge = Ridge()

    # Expected (coef_ shape, intercept_ shape) for each target, in order.
    targets = (y, Y1, Y)
    expected_shapes = (
        ((n_features,), ()),
        ((1, n_features), (1, )),
        ((2, n_features), (2, )),
    )
    for target, (coef_shape, intercept_shape) in zip(targets,
                                                     expected_shapes):
        ridge.fit(X, target)
        assert_equal(ridge.coef_.shape, coef_shape)
        assert_equal(ridge.intercept_.shape, intercept_shape)
def eval_aggr_shifts(X, y, ignore_rows):
    """Leave-one-out evaluation of a Ridge model fitted on logit targets.

    Rows listed in ``ignore_rows`` are dropped first. Then, for every
    remaining row, a Ridge model is fitted on the logit of the other rows'
    clipped targets and used to predict the held-out row. Finally a model
    is fitted on all remaining rows with the untransformed ``y``.

    Parameters
    ----------
    X, y : array-like
        Features and targets; sliced with ``[:i]`` / ``[i+1:]`` and passed
        to ``np.concatenate``.
    ignore_rows : iterable of int
        Row positions to drop. Rows are removed one at a time, so each
        index refers to the arrays as left by the previous removal.

    Returns
    -------
    tuple
        ``(pred, real, coef)``: the held-out predictions mapped back through
        the logistic function, the corresponding original targets, and
        ``coef_`` of the final model.
    """
    eps = 1e-6

    def clip(val):
        # Keep the value inside [eps, 1 - eps] so the logit stays finite.
        return max(eps, min(1 - eps, val))

    pred = []
    real = []

    for row in ignore_rows:
        X = np.concatenate((X[:row], X[row+1:]))
        y = np.concatenate((y[:row], y[row+1:]))

    n = X.shape[0]
    for inst_n in range(n):
        x_i, y_i = X[inst_n], y[inst_n]

        # Training set: everything except the held-out row.
        X_train = np.concatenate((X[:inst_n], X[inst_n+1:]))
        y_train = np.concatenate((y[:inst_n], y[inst_n+1:]))

        # Logit transform of the clipped training targets.
        y_train = np.array([clip(val) for val in y_train])
        y_train = np.log(y_train / (1 - y_train))

        model = Ridge(alpha=.2, fit_intercept=True, normalize=True)
        #model = Lasso(alpha=.001, fit_intercept=True, normalize=True)
        model.fit(X_train, y_train)

        y_hat = model.predict(x_i.reshape(1, -1))[0]

        # Logit of the held-out target, used only for the error printout.
        y_i1 = clip(y_i)
        y_i1 = np.log(y_i1 / (1 - y_i1))
        print('inst: ' + str(inst_n) + ', prediction: ' + str(y_hat) + ', err: ' + str(y_hat - y_i1))

        # Map the prediction back through the logistic function.
        pred.append(1 / (1 + exp(-y_hat)))
        real.append(y_i)

    # Final model on all remaining rows, with the untransformed targets.
    model = Ridge(alpha=.2, fit_intercept=True, normalize=True)
    model.fit(X, y)

    return pred, real, model.coef_
def test_raises_value_error_if_sample_weights_greater_than_1d():
    """Sample weights must be either scalar or 1D."""
    error_text = "Sample weights must be 1D array or scalar"
    problem_shapes = zip([2, 3], [3, 2])

    rng = np.random.RandomState(42)

    for n_samples, n_features in problem_shapes:
        X = rng.randn(n_samples, n_features)
        y = rng.randn(n_samples)
        ok_vector = rng.randn(n_samples) ** 2 + 1

        ridge = Ridge(alpha=1)

        # Make sure the valid weights actually work.
        ridge.fit(X, y, ok_vector)
        ridge.fit(X, y, 1.)
        ridge.fit(X, y, 2.)

        # Column- and row-shaped 2D views of the same weights.
        as_column = ok_vector[:, np.newaxis]
        as_row = ok_vector[np.newaxis, :]

        def fit_with_column():
            ridge.fit(X, y, as_column)

        def fit_with_row():
            ridge.fit(X, y, as_row)

        for failing_fit in (fit_with_column, fit_with_row):
            assert_raise_message(ValueError, error_text, failing_fit)
def _test_ridge_diabetes(filter_):
    """Fit an intercept-free Ridge on the diabetes data and return its score.

    Parameters
    ----------
    filter_ : callable
        Applied to ``X_diabetes`` before the ``fit`` and ``score`` calls.

    Returns
    -------
    The training score rounded to five decimals.
    """
    ridge = Ridge(fit_intercept=False)
    ridge.fit(filter_(X_diabetes), y_diabetes)
    training_score = ridge.score(filter_(X_diabetes), y_diabetes)
    return np.round(training_score, 5)
Xtest = X Utrain = U Utest = U else: raise Exception('Train size must be in (0,1]') dad = DaDControl() dad.learn(Xtrain, Utrain, learner, iters, Xtest, Utest, verbose=False) print(' DaD (iters:{:d}). Initial Err: {:.4g}, Best: {:.4g}'.format( iters, dad.initial_test_err, dad.min_test_error)) return dad if __name__ == "__main__": print('Defining the learner') learner = DynamicsControlDeltaWrapper(Ridge(alpha=1e-4, fit_intercept=True)) NUM_EPISODES = 50 T = 50 print('Generating train data') policy = RandomLinearPolicy(SYSTEM.state_dim(), SYSTEM.control_dim()) Xtrain, Utrain = run_episodes(policy, NUM_EPISODES, T) print('Generating test data') Xtest, Utest = run_episodes(policy, NUM_EPISODES, T) print('\nLearning dynamics') iters = 25 dad = optimize_learner_dad(learner, Xtrain, Utrain, iters, train_size=0.5) _, dad_err = dad.test(Xtest, Utest, dad.min_test_error_model)
def test_ridge():
    """Ridge regression convergence test using the training score.

    Each solver is fitted on a problem with more samples than features
    (score above 0.47) and one with more features than samples (score
    above 0.9). The cholesky solver is additionally refitted with unit
    sample weights.
    """
    # TODO: for this test to be robust, we should use a dataset instead
    # of np.random.
    rng = np.random.RandomState(0)
    alpha = 1.0

    # ((n_samples, n_features), minimum training score)
    scenarios = (
        ((6, 5), 0.47),   # more samples than features
        ((5, 10), 0.9),   # more features than samples
    )

    for solver in ("svd", "sparse_cg", "cholesky", "lsqr"):
        for (n_samples, n_features), min_score in scenarios:
            y = rng.randn(n_samples)
            X = rng.randn(n_samples, n_features)

            ridge = Ridge(alpha=alpha, solver=solver)
            ridge.fit(X, y)

            # The coefficient shape is only checked in the first scenario.
            if n_samples > n_features:
                assert_equal(ridge.coef_.shape, (X.shape[1], ))
            assert_greater(ridge.score(X, y), min_score)

            if solver == "cholesky":
                # Currently the only solver to support sample_weight.
                ridge.fit(X, y, sample_weight=np.ones(n_samples))
                assert_greater(ridge.score(X, y), min_score)
def test_ridge():
    """Ridge regression convergence test using the training score.

    Each solver is fitted, with and without unit sample weights, on a
    problem with more samples than features (score above 0.47) and one with
    more features than samples (score above 0.9).

    TODO: for this test to be robust, we should use a dataset instead
    of np.random.
    """
    alpha = 1.0

    # ((n_samples, n_features), minimum training score)
    scenarios = (
        ((6, 5), 0.47),   # more samples than features
        ((5, 10), 0.9),   # more features than samples
    )

    for solver in ("sparse_cg", "dense_cholesky", "lsqr"):
        for (n_samples, n_features), min_score in scenarios:
            y = rng.randn(n_samples)
            X = rng.randn(n_samples, n_features)

            ridge = Ridge(alpha=alpha, solver=solver)
            ridge.fit(X, y)

            # The coefficient shape is only checked in the first scenario.
            if n_samples > n_features:
                assert_equal(ridge.coef_.shape, (X.shape[1], ))
            assert_greater(ridge.score(X, y), min_score)

            # Unit sample weights must reach the same threshold.
            ridge.fit(X, y, sample_weight=np.ones(n_samples))
            assert_greater(ridge.score(X, y), min_score)
def test_sparse_cg_max_iter():
    """Fit the sparse_cg solver with ``max_iter=1`` and check coef_ length.

    ``coef_`` must have one entry per column of ``X_diabetes``.
    """
    n_features = X_diabetes.shape[1]

    reg = Ridge(solver="sparse_cg", max_iter=1)
    reg.fit(X_diabetes, y_diabetes)

    assert_equal(reg.coef_.shape[0], n_features)
rcv.fit(X,y); #print('rcv score = ', rcv.score(X,y)); print('alpha selected from cv is ', rcv.alpha_); # Construct an estimator using the best alpha and rank features from sklearn.cross_validation import cross_val_score, ShuffleSplit def rankfeatures(X,Y,rf,names): scores = [] for i in range(X.shape[1]): score = cross_val_score(rf, X[:, i:i+1], Y, scoring="r2" ,cv=ShuffleSplit(len(X), 20, .2)); scores.append((np.mean(score),names[i])); # for i in (sorted(scores, reverse=True)): # print(i[1], ' ', round(i[0],2)); return sorted(scores); rf = Ridge(alpha=rcv.alpha_); scores = rankfeatures(X,y,rf,features); plotscores(scores,rf); # Model Selection def plotfit(model,Xtest,ytest,c = 'red', title = 'Fit model'): y_sm = np.array(model.predict(Xtest)); x_sm = np.array(list(range(1,len(ytest)+1))); x_smooth = np.linspace(x_sm.min(), x_sm.max(), 200) y_smooth = spline(x_sm, y_sm, x_smooth); plt.plot(x_smooth, y_smooth, color=c, linewidth=3) plt.scatter(x_sm, ytest, color='black') plt.xlabel('Samples') plt.ylabel('Returns') plt.title(title)
'PLSSVD':PLSSVD(), 'PassiveAggressiveClassifier':PassiveAggressiveClassifier(), 'PassiveAggressiveRegressor':PassiveAggressiveRegressor(), 'Perceptron':Perceptron(), 'ProjectedGradientNMF':ProjectedGradientNMF(), 'QuadraticDiscriminantAnalysis':QuadraticDiscriminantAnalysis(), 'RANSACRegressor':RANSACRegressor(), 'RBFSampler':RBFSampler(), 'RadiusNeighborsClassifier':RadiusNeighborsClassifier(), 'RadiusNeighborsRegressor':RadiusNeighborsRegressor(), 'RandomForestClassifier':RandomForestClassifier(), 'RandomForestRegressor':RandomForestRegressor(), 'RandomizedLasso':RandomizedLasso(), 'RandomizedLogisticRegression':RandomizedLogisticRegression(), 'RandomizedPCA':RandomizedPCA(), 'Ridge':Ridge(), 'RidgeCV':RidgeCV(), 'RidgeClassifier':RidgeClassifier(), 'RidgeClassifierCV':RidgeClassifierCV(), 'RobustScaler':RobustScaler(), 'SGDClassifier':SGDClassifier(), 'SGDRegressor':SGDRegressor(), 'SVC':SVC(), 'SVR':SVR(), 'SelectFdr':SelectFdr(), 'SelectFpr':SelectFpr(), 'SelectFwe':SelectFwe(), 'SelectKBest':SelectKBest(), 'SelectPercentile':SelectPercentile(), 'ShrunkCovariance':ShrunkCovariance(), 'SkewedChi2Sampler':SkewedChi2Sampler(),
weights=[1.01, 1.01]), ['predict'], create_weird_classification_problem_1()), (GradientBoostingClassifier(max_depth=10, n_estimators=10), ['predict_proba', 'predict'], create_weird_classification_problem_1()), (LogisticRegression(), ['predict_proba', 'predict'], create_weird_classification_problem_1()), (IsotonicRegression(out_of_bounds='clip'), ['predict'], create_isotonic_regression_problem_1()), (Earth(), ['predict', 'transform'], create_regression_problem_1()), (Earth(allow_missing=True), ['predict', 'transform'], create_regression_problem_with_missingness_1()), (ElasticNet(), ['predict'], create_regression_problem_1()), (ElasticNetCV(), ['predict'], create_regression_problem_1()), (LassoCV(), ['predict'], create_regression_problem_1()), (Ridge(), ['predict'], create_regression_problem_1()), (RidgeCV(), ['predict'], create_regression_problem_1()), (SGDRegressor(), ['predict'], create_regression_problem_1()), (Lasso(), ['predict'], create_regression_problem_1()), (Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]), ['predict', 'predict_proba'], create_weird_classification_problem_1()), (FeatureUnion([('earth', Earth()), ('earth2', Earth(max_degree=2))], transformer_weights={ 'earth': 1, 'earth2': 2 }), ['transform'], create_weird_classification_problem_1()), (RandomForestRegressor(), ['predict'], create_regression_problem_1()), (CalibratedClassifierCV(LogisticRegression(), 'isotonic'), ['predict_proba'], create_weird_classification_problem_1()), (AdaBoostRegressor(), ['predict'], create_regression_problem_1()),
###########################################################################
#
# REGRESSION METHODS BENCHMARK
#
from sklearn.svm import SVR
# Public import paths: the private sklearn.ensemble.forest and
# sklearn.linear_model.ridge modules are not available in recent
# scikit-learn releases.
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
#from sklearn.linear_model.stochastic_gradient import SGDRegressor

# Each entry pairs an estimator with the label passed to the helpers below.
#(SVR(gamma='scale', C=1.0, epsilon=0.2),"SVR2"),
models = [(SVR(kernel='linear',degree=3),"SVR"),
          (RandomForestRegressor(max_depth=2, random_state=0,n_estimators=100),"RFR"),
          (Ridge(alpha=1.0),"RIDGE"),
          (Lasso(alpha=0.1),"LASSO"),
          (ElasticNet(alpha=1.0),"ElasiticNet")]
#clf_sgd = SGDRegressor(max_iter=5)

#
# PCA DATA
X_train, X_test, y_train, y_test = train_test_split(features_X, y, test_size=0.2, random_state=42)

# Fit and evaluate every model on the same split, then plot its results.
for model in models:
    estimator, label = model
    yy, err = compute_predict(estimator, label, X_train, X_test, y_train, y_test)
    plotRegressionModelResults(label, y_test, yy, err)

############################################################################
#
# XGBOOST TESTS
#
'instance': SGDRegressor(penalty='elasticnet', alpha=0.01, l1_ratio=0.25, fit_intercept=True, tol=1e-4), 'complexity_label': 'non-zero coefficients', 'complexity_computer': lambda clf: np.count_nonzero(clf.coef_)}, {'name': 'RandomForest', 'instance': RandomForestRegressor(n_estimators=100), 'complexity_label': 'estimators', 'complexity_computer': lambda clf: clf.n_estimators}, {'name': 'SVR', 'instance': SVR(kernel='rbf'), 'complexity_label': 'support vectors', 'complexity_computer': lambda clf: len(clf.support_vectors_)}, ] } benchmark(configuration) # benchmark n_features influence on prediction speed percentile = 90 percentiles = n_feature_influence({'ridge': Ridge()}, configuration['n_train'], configuration['n_test'], [100, 250, 500], percentile) plot_n_features_influence(percentiles, percentile) # benchmark throughput throughputs = benchmark_throughputs(configuration) plot_benchmark_throughput(throughputs, configuration) stop_time = time.time() print("example run in %.2fs" % (stop_time - start_time))
def test_ridge_sparse_svd():
    # Fitting Ridge with solver='svd' on a sparse (CSC) matrix is expected
    # to raise a TypeError.
    n_samples, n_features = 100, 10
    sparse_X = sp.csc_matrix(rng.rand(n_samples, n_features))
    targets = rng.rand(n_samples)
    svd_ridge = Ridge(solver='svd')
    assert_raises(TypeError, svd_ridge.fit, sparse_X, targets)
model2_train1 += virtual_test1 print "now saving the result" ff = open('virtual_train_data.json', 'w') ff.write(json.dumps([model2_train0, model2_train1])) ff.close() if sys.argv[1] == "second": ff = open('virtual_train_data.json', 'r') model2_train0, model2_train1 = json.loads(ff.read()) ff.close() print "opened train0 and train1 with each length", len(model2_train0), len(model2_train1) print model2_train0[0] print model2_train1[0] ff = open('intermediate_result.json', 'r') model2_test0, _ = json.loads(ff.read()) print model2_test0[0] model2 = Ridge() print "start fitting 2nd model" model2.fit(model2_train0, model2_train1) print "start predicting" predictions=model2.predict(model2_test0) print "saving the predicted result into the file" f = open('result.csv', 'w') f.write("ID;COTIS\n"); for ind, prd in enumerate(predictions): f.write(my_ids[ind] + ';' + str(prd) + '\n') f.close() print "all tasks completed"
def test_ridge():
    """Check that Ridge reaches a decent training score on random data.

    Two regimes are exercised (more samples than features, then more
    features than samples), each without and with unit sample weights.

    TODO: for this test to be robust, we should use a dataset instead
    of np.random.
    """
    penalty = 1.0

    def fit_and_score(estimator, X, y, **fit_kwargs):
        # Fit on (X, y) and return the score on the same data.
        estimator.fit(X, y, **fit_kwargs)
        return estimator.score(X, y)

    # Regime 1: n_samples > n_features
    n_samples, n_features = 6, 5
    y = np.random.randn(n_samples)
    X = np.random.randn(n_samples, n_features)

    model = Ridge(alpha=penalty)
    train_score = fit_and_score(model, X, y)
    assert_equal(model.coef_.shape, (X.shape[1], ))
    assert_true(train_score > 0.5)

    weighted_score = fit_and_score(model, X, y,
                                   sample_weight=np.ones(n_samples))
    assert_true(weighted_score > 0.5)

    # Regime 2: n_features > n_samples
    n_samples, n_features = 5, 10
    y = np.random.randn(n_samples)
    X = np.random.randn(n_samples, n_features)

    model = Ridge(alpha=penalty)
    train_score = fit_and_score(model, X, y)
    assert_true(train_score > .9)

    weighted_score = fit_and_score(model, X, y,
                                   sample_weight=np.ones(n_samples))
    assert_true(weighted_score > 0.9)
from helper_functions import create_X, create_y_train, train_model, predict, score >>>>>>> 0e453a6a82a8c1a46c61f3419a174391e7c7affd train = pd.read_csv('data/Train.csv', parse_dates=['saledate']) test = pd.read_csv('data/Test.csv', parse_dates=['saledate']) X_train = create_X(train) X_test = create_X(test) y_train = create_y_train(train) <<<<<<< HEAD X_train_normalized, X_test_normalized = normalize_X(X_train, X_test) model_linear = train_model(X_train, y_train, LinearRegression()) model_ridge = train_model(X_train_normalized, y_train, Ridge()) model_lasso = train_model(X_train_normalized, y_train, Lasso(alpha=0.00005, max_iter=120000)) submit_linear = predict(model_linear, test, X_test, 'model_lin') submit_ridge = predict(model_ridge, test, X_test_normalized, 'model_rid') submit_lasso = predict(model_lasso, test, X_test_normalized, 'model_las') y_test = pd.read_csv('data/do_not_open/test_soln.csv') print('Linear: ', score(submit_linear, y_test), '; Ridge: ', score(submit_ridge, y_test), '; Lasso: ', score(submit_lasso, y_test)) # Linear: 0.40826129534246886 ; Ridge: 0.40822991882415727 ; Lasso: 0.40834486305959367 # Pick Ridge ======= model = train_model(X_train, y_train) submit = predict(model, test, X_test, 'model_1')
# Bagged Ridge: fit many models on random subsamples of the training set and
# combine their test predictions with a trimmed mean.
train0 = extract_predictor(train_dataset, True)
train1 = extract_target(train_dataset)
test0 = extract_predictor(test_dataset, False)

N_RUNS = 1000            # number of subsampled models
DROP_PROBABILITY = 0.4   # chance that a training sample is left out of a run
TRIM = N_RUNS // 10      # predictions discarded at each end of the sorted runs

results = []
for cnt in range(N_RUNS):
    # One random draw per training sample, in order; a sample is kept when
    # the draw is not below DROP_PROBABILITY.
    kept = [(features, target)
            for features, target in zip(train0, train1)
            if random.random() >= DROP_PROBABILITY]
    projected0 = [features for features, _ in kept]
    projected1 = [target for _, target in kept]

    # Single pre-formatted argument so the output is the same whether print
    # is a statement (Python 2) or a function (Python 3).
    print("now fitting the model %s with len %s" % (cnt, len(projected0)))
    model = Ridge()
    model.fit(projected0, projected1)
    predictions = model.predict(test0)
    results.append(list(predictions))

# For every test sample, average the per-run predictions after dropping the
# TRIM smallest and TRIM largest values.
final_result = []
for cand in zip(*results):
    trimmed = sorted(cand)[TRIM:-TRIM]
    final_result.append(sum(trimmed) * 1.0 / (len(cand) - 2 * TRIM))

#predictions=model.predict(valid_dataset)
#Evaluate the quality of the prediction
#print sklearn.metrics.mean_absolute_error(predictions,valid_target)
def _test_ridge_loo(filter_):
    # Leave-one-out checks for ridge regression.  filter_ is applied to
    # X_diabetes before every fit/predict of the CV estimators so the same
    # test can work with both dense and sparse matrices.
    # Returns a one-element list holding the selected alpha.
    n_samples = X_diabetes.shape[0]

    gcv = _RidgeGCV(fit_intercept=False)
    plain_ridge = Ridge(alpha=1.0, fit_intercept=False)

    # efficient leave-one-out (generalized cross-validation)
    gcv_decomp = gcv._pre_compute(X_diabetes, y_diabetes)
    errors, _ = gcv._errors(1.0, y_diabetes, *gcv_decomp)
    values, _ = gcv._values(1.0, y_diabetes, *gcv_decomp)

    # reference: refit the plain ridge with each sample held out in turn
    loo_errors, loo_values = [], []
    for held_out in range(n_samples):
        kept = np.arange(n_samples) != held_out
        plain_ridge.fit(X_diabetes[kept], y_diabetes[kept])
        predicted = plain_ridge.predict([X_diabetes[held_out]])[0]
        loo_errors.append((y_diabetes[held_out] - predicted) ** 2)
        loo_values.append(predicted)

    # efficient and brute-force LOO must agree
    assert_almost_equal(errors, loo_errors)
    assert_almost_equal(values, loo_values)

    # efficient leave-one-out again, through the SVD variation
    svd_decomp = gcv._pre_compute_svd(X_diabetes, y_diabetes)
    svd_errors, _ = gcv._errors_svd(plain_ridge.alpha, y_diabetes,
                                    *svd_decomp)
    svd_values, _ = gcv._values_svd(plain_ridge.alpha, y_diabetes,
                                    *svd_decomp)

    # both efficient variants must agree
    assert_almost_equal(errors, svd_errors)
    assert_almost_equal(values, svd_values)

    # alpha selected by the GCV estimator: the reference for what follows
    gcv.fit(filter_(X_diabetes), y_diabetes)
    best_alpha = gcv.alpha_

    # same alpha when a loss function is wrapped into a scorer
    loss_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    cv_with_loss = RidgeCV(fit_intercept=False, scoring=loss_scorer)
    ignore_warnings(cv_with_loss.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_loss.alpha_, best_alpha)

    # same alpha with a custom score function
    def negated_mse(x, y):
        return -mean_squared_error(x, y)

    score_scorer = make_scorer(negated_mse)
    cv_with_score = RidgeCV(fit_intercept=False, scoring=score_scorer)
    ignore_warnings(cv_with_score.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_score.alpha_, best_alpha)

    # same alpha with a scorer looked up by name
    named_scorer = get_scorer('mean_squared_error')
    cv_with_named = RidgeCV(fit_intercept=False, scoring=named_scorer)
    cv_with_named.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_named.alpha_, best_alpha)

    # same alpha when every sample gets a weight of one
    gcv.fit(filter_(X_diabetes), y_diabetes,
            sample_weight=np.ones(n_samples))
    assert_equal(gcv.alpha_, best_alpha)

    # several responses: duplicating the target column must give the
    # single-target predictions in both columns
    Y = np.vstack((y_diabetes, y_diabetes)).T

    gcv.fit(filter_(X_diabetes), Y)
    Y_pred = gcv.predict(filter_(X_diabetes))

    gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = gcv.predict(filter_(X_diabetes))

    expected = np.vstack((y_pred, y_pred)).T
    assert_array_almost_equal(expected, Y_pred, decimal=5)

    return [best_alpha]
def test_ridge(solver):
    # Ridge regression convergence test using score, run for the given
    # solver on two random problems (tall, then wide).
    # TODO: for this test to be robust, we should use a dataset instead
    # of np.random.
    rng = np.random.RandomState(0)
    alpha = 1.0
    # Currently the only solvers to support sample_weight.
    weight_aware = solver in ("cholesky", "sag")

    def draw_problem(n_samples, n_features):
        # The target is drawn before the design matrix.
        target = rng.randn(n_samples)
        design = rng.randn(n_samples, n_features)
        return design, target

    # With more samples than features
    X, y = draw_problem(6, 5)
    ridge = Ridge(alpha=alpha, solver=solver)
    ridge.fit(X, y)
    assert ridge.coef_.shape == (X.shape[1], )
    assert ridge.score(X, y) > 0.47

    if weight_aware:
        ridge.fit(X, y, sample_weight=np.ones(X.shape[0]))
        assert ridge.score(X, y) > 0.47

    # With more features than samples
    X, y = draw_problem(5, 10)
    ridge = Ridge(alpha=alpha, solver=solver)
    ridge.fit(X, y)
    assert ridge.score(X, y) > .9

    if weight_aware:
        ridge.fit(X, y, sample_weight=np.ones(X.shape[0]))
        assert ridge.score(X, y) > 0.9
NAIVE_BAYS = GaussianNB()
K_N_N = KNeighborsClassifier()
SUPPORT_VECTOR = svm.SVC(kernel="linear")

# Ensemble classifiers
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
GRADIENT_BOOST_CL = GradientBoostingClassifier(n_estimators=100)
ADA_BOOST = AdaBoostClassifier(n_estimators=100)
EXTRA_TREE = ExtraTreesClassifier(n_estimators=100)

# Regressors
GRADIENT_BOOST_RG = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
LINEAR_RG = LinearRegression()
RIDGE_RG = Ridge()
LASSO_RG = Lasso()
SVR_RG = SVR()


def getClassifierMap():
    # Map each classifier name to the shared module-level estimator instance.
    CLASSIFIER_MAP = {
        "DECISION_TREE": DECISION_TREE,
        "LOGISTIC_REGRESSION": LOGISTIC_REGRESSION,
        "NAIVE_BAYS": NAIVE_BAYS,
        "K_N_N": K_N_N,
        "SUPPORT_VECTOR": SUPPORT_VECTOR,
        "RANDOM_FOREST": RANDOM_FOREST,
        "GRADIENT_BOOST": GRADIENT_BOOST_CL,
        # Was mapped to GRADIENT_BOOST_CL, so asking for "ADA_BOOST" silently
        # returned the gradient-boosting classifier.
        "ADA_BOOST": ADA_BOOST,
        "EXTRA_TREE": EXTRA_TREE
    }
def _test_ridge_loo(filter_):
    # Leave-one-out checks for ridge regression.  filter_ is applied to
    # X_diabetes before every fit/predict of the CV estimators so the same
    # test can work with both dense and sparse matrices.
    # Returns a one-element list holding the selected alpha.
    n_samples = X_diabetes.shape[0]

    gcv = _RidgeGCV(fit_intercept=False)
    plain_ridge = Ridge(alpha=1.0, fit_intercept=False)

    # efficient leave-one-out (generalized cross-validation)
    gcv_decomp = gcv._pre_compute(X_diabetes, y_diabetes)
    errors, _ = gcv._errors(1.0, y_diabetes, *gcv_decomp)
    values, _ = gcv._values(1.0, y_diabetes, *gcv_decomp)

    # reference: refit the plain ridge with each sample held out in turn
    loo_errors, loo_values = [], []
    for held_out in range(n_samples):
        kept = np.arange(n_samples) != held_out
        plain_ridge.fit(X_diabetes[kept], y_diabetes[kept])
        predicted = plain_ridge.predict([X_diabetes[held_out]])[0]
        loo_errors.append((y_diabetes[held_out] - predicted) ** 2)
        loo_values.append(predicted)

    # efficient and brute-force LOO must agree
    assert_almost_equal(errors, loo_errors)
    assert_almost_equal(values, loo_values)

    # efficient leave-one-out again, through the SVD variation
    svd_decomp = gcv._pre_compute_svd(X_diabetes, y_diabetes)
    svd_errors, _ = gcv._errors_svd(plain_ridge.alpha, y_diabetes,
                                    *svd_decomp)
    svd_values, _ = gcv._values_svd(plain_ridge.alpha, y_diabetes,
                                    *svd_decomp)

    # both efficient variants must agree
    assert_almost_equal(errors, svd_errors)
    assert_almost_equal(values, svd_values)

    # alpha selected by the GCV estimator: the reference for what follows
    gcv.fit(filter_(X_diabetes), y_diabetes)
    best_alpha = gcv.alpha_

    # same alpha when a loss function is wrapped into a scorer
    loss_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    cv_with_loss = RidgeCV(fit_intercept=False, scoring=loss_scorer)
    ignore_warnings(cv_with_loss.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_loss.alpha_, best_alpha)

    # same alpha with a custom score function
    def negated_mse(x, y):
        return -mean_squared_error(x, y)

    score_scorer = make_scorer(negated_mse)
    cv_with_score = RidgeCV(fit_intercept=False, scoring=score_scorer)
    ignore_warnings(cv_with_score.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_score.alpha_, best_alpha)

    # same alpha with a scorer looked up by name
    named_scorer = get_scorer('mean_squared_error')
    cv_with_named = RidgeCV(fit_intercept=False, scoring=named_scorer)
    cv_with_named.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(cv_with_named.alpha_, best_alpha)

    # same alpha when every sample gets a weight of one
    gcv.fit(filter_(X_diabetes), y_diabetes,
            sample_weight=np.ones(n_samples))
    assert_equal(gcv.alpha_, best_alpha)

    # several responses: duplicating the target column must give the
    # single-target predictions in both columns
    Y = np.vstack((y_diabetes, y_diabetes)).T

    gcv.fit(filter_(X_diabetes), Y)
    Y_pred = gcv.predict(filter_(X_diabetes))

    gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = gcv.predict(filter_(X_diabetes))

    expected = np.vstack((y_pred, y_pred)).T
    assert_array_almost_equal(expected, Y_pred, decimal=5)

    return [best_alpha]
def trainModel(param,feat_folder,feat_name):
    """Train and evaluate one model configuration over two CV runs.

    For run 1 and run 2 the train/test files
    ``<feat_folder>/run<k>/{train,test}.svm.txt`` are loaded, a model chosen
    by ``param['task']`` is fitted on the training part, and its predictions
    on the test part are scored with ``accuracy_model``.

    Parameters
    ----------
    param : mapping
        Must contain ``'task'`` (one of ``'regression'``, ``'clf_skl_lr'``,
        ``'reg_skl_rf'``, ``'reg_skl_etr'``, ``'reg_skl_gbm'``,
        ``'reg_skl_ridge'``) plus the hyper-parameters read by that branch.
    feat_folder : str
        Folder holding the per-run svmlight files.
    feat_name
        Not used by this function.

    Returns
    -------
    dict
        ``'loss'`` is the negated mean accuracy over the runs,
        ``'attachments'`` carries the standard deviation under ``'std'``,
        and ``'status'`` is ``STATUS_OK``.

    Raises
    ------
    ValueError
        If ``param['task']`` is not one of the supported tasks.

    Notes
    -----
    ``train`` is not defined here; it is read from the enclosing module and
    is indexed with ``.iloc`` and ``['cid']`` -- presumably a pandas
    DataFrame, confirm against the caller.
    """
    #read data from folder
    print('now we read data from folder:%s'%(feat_folder))
    #start cv
    print('now we need to generate cross_validation')
    accuracy_cv = []
    for i in range(0,2):
        run = i + 1
        print('this is the run:%d cross-validation'%(run))
        testIndex = loadCVIndex("%s/test.run%d.txt"%("../data/feat/combine",run))
        train_path = "%s/run%d/train.svm.txt"%(feat_folder,run)
        test_path = "%s/run%d/test.svm.txt"%(feat_folder,run)
        task = param['task']

        #if we use xgboost to train model ,we need to use svmlib format
        if task == 'regression':
            #with xgb we will dump the file with CV,and we will read data
            train_data = xgb.DMatrix(train_path)
            valid_data = xgb.DMatrix(test_path)
            watchlist = [(train_data,'train'),(valid_data,'valid')]
            bst = xgb.train(param,train_data,int(param['num_round']),watchlist)
            pred = bst.predict(valid_data)

        elif task == 'clf_skl_lr':
            train_data,train_label = load_svmlight_file(train_path)
            test_data,test_label = load_svmlight_file(test_path)
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            clf = LogisticRegression()
            clf.fit(train_data,train_label)
            pred = clf.predict(test_data)

        elif task == "reg_skl_rf":
            ## regression with sklearn random forest regressor
            train_data,train_label = load_svmlight_file(train_path)
            test_data,test_label = load_svmlight_file(test_path)
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            # Fit on the training labels (the test labels were used here
            # before, which mismatches the training rows and leaks the
            # evaluation targets).
            rf.fit(train_data,train_label)
            pred = rf.predict(test_data)

        elif task == "reg_skl_etr":
            ## regression with sklearn extra trees regressor
            train_data,train_label = load_svmlight_file(train_path)
            test_data,test_label = load_svmlight_file(test_path)
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            # Same correction as for the random forest: train labels.
            etr.fit(train_data,train_label)
            pred = etr.predict(test_data)

        elif task == 'reg_skl_gbm':
            train_data,train_label = load_svmlight_file(train_path)
            test_data,test_label = load_svmlight_file(test_path)
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            # The former feat_names.remove('cid') call was dropped: the name
            # is not defined in this function and the svmlight data does not
            # use a feature-name list.
            gbm.fit(train_data,train_label)
            pred = gbm.predict(test_data)

        elif task == 'reg_skl_ridge':
            train_data,train_label = load_svmlight_file(train_path)
            test_data,test_label = load_svmlight_file(test_path)
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data,train_label)
            predraw = ridge.predict(test_data)
            print(predraw)
            # argsort twice turns the raw predictions into their ranks.
            predrank = predraw.argsort().argsort()
            trainIndex = loadCVIndex("%s/train.run%d.txt"%("../data/feat/combine",run))
            cdf = creatCDF(train, trainIndex)
            pred = getScore(predrank,cdf)
            print(pred)

        else:
            # Without this branch an unknown task only surfaced later as a
            # NameError on pred.
            raise ValueError("unknown task: %s"%(task))

        #now we use the the accuracy to limit our model
        acc = accuracy_model(pred,train.iloc[testIndex]['cid'])
        print("the model accurary:%s"%(acc))
        accuracy_cv.append(acc)

    #here we will count the
    accuracy_cv_mean = np.mean(accuracy_cv)
    accuracy_cv_std = np.std(accuracy_cv)
    print('the accuracy for %.6f'%(accuracy_cv_mean))
    return {'loss':-accuracy_cv_mean,'attachments':{'std':accuracy_cv_std},'status': STATUS_OK}
}, { 'name': 'RandomForest', 'instance': RandomForestRegressor(), 'complexity_label': 'estimators', 'complexity_computer': lambda clf: clf.n_estimators }, { 'name': 'SVR', 'instance': SVR(kernel='rbf'), 'complexity_label': 'support vectors', 'complexity_computer': lambda clf: len(clf.support_vectors_) }, ] } benchmark(configuration) # benchmark n_features influence on prediction speed percentile = 90 percentiles = n_feature_influence({'ridge': Ridge()}, configuration['n_train'], configuration['n_test'], [100, 250, 500], percentile) plot_n_features_influence(percentiles, percentile) # benchmark throughput throughputs = benchmark_throughputs(configuration) plot_benchmark_throughput(throughputs, configuration) stop_time = time.time() print("example run in %.2fs" % (stop_time - start_time))
def exercise_one():
    """Fit ridge regression by gradient descent and compare it with sklearn.

    Downloads the Hitters data, standardizes predictors and target, runs a
    hand-written gradient descent on the ridge objective, fits sklearn's
    ``Ridge`` with the same penalty, and plots/prints the comparison.  A
    second pass tries several step sizes and keeps the run that reaches the
    lowest objective value.  Returns None.
    """
    hitters = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Hitters.csv', sep=',', header=0).dropna()
    # Salary is the target, so it must not also be a predictor: leaving it in
    # the design matrix lets the model read the answer off its own input.
    X = pd.get_dummies(hitters.drop('Salary', axis=1), drop_first=True)
    y = hitters.Salary

    # standardize and split the data
    x_scalar = StandardScaler().fit(X)
    y_scalar = StandardScaler().fit(y.values.reshape(-1, 1))
    X = x_scalar.transform(X)
    y = y_scalar.transform(y.values.reshape((-1, 1))).reshape((-1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    lam = 0.1  # ridge penalty shared by the hand-written solver and sklearn

    def objective(x, y, beta, lam):
        # F(beta) = (||y - x beta||^2 + lam * ||beta||^2) / n.
        # The square applies to the norm of the residual (not to the
        # residual entries before taking the norm), and the 1/n scale is the
        # one whose gradient is compute_grad below.
        return (norm(y - x @ beta) ** 2 + lam * norm(beta) ** 2) / len(y)

    def compute_grad(x, y, beta, lam):
        # Gradient of objective with respect to beta.
        return 2 / len(y) * (x.T @ x @ beta + lam * beta - x.T @ y)

    def grad_descent(x, y, lam, eta, max_iter):
        # Fixed-step gradient descent from beta = 0; returns the objective
        # value after every update and the final beta.
        beta = np.zeros(x.shape[1])
        history = []
        for _ in range(max_iter):
            beta = beta - eta * compute_grad(x, y, beta, lam)
            history.append(objective(x, y, beta, lam))
        return history, beta

    fx, bt = grad_descent(X_train, y_train, lam=lam, eta=0.1, max_iter=1000)

    # compare with sklearn's ridge
    sk_ridge = Ridge(alpha=lam, max_iter=1000, solver='saga').fit(X_train, y_train)

    # plot the object function vs iteration number
    plt.plot(fx)
    plt.title("Objective function rapidly decreases")
    plt.xlabel("iterations (t)")
    plt.ylabel(r'$F(\beta)$')

    # difference in objective values between sklearn's and my own descent
    # (printed: a bare expression inside a function is silently discarded)
    print(objective(X_train, y_train, bt, lam) - objective(X_train, y_train, sk_ridge.coef_, lam))

    # visualize the comparison
    def visualize(betas, sklb):
        # Left panel: both coefficient vectors side by side; right panel:
        # their element-wise difference.
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 4.5))

        plt.subplot(ax1)
        plt.bar(np.arange(len(betas)), betas, width=.5)
        plt.bar(np.arange(len(betas))+0.5, sklb, width=.5)
        plt.ylabel(r'$\beta$')
        plt.xlabel(r'$\beta_i$')
        plt.axis([0,20,-.05,max(max(betas), max(sklb))+0.1])
        plt.xticks(np.arange(0, 20, step=2))

        plt.subplot(ax2)
        plt.bar(np.arange(len(betas)), (betas-sklb))
        plt.ylabel(r'$\Delta \, \beta$')
        plt.xlabel(r'$\beta_i$')
        plt.xticks(np.arange(0, 20, step=2))

        st = plt.suptitle(r'Resulting $\beta$ values are quite similar', fontsize=16)
        st.set_y(.95)
        fig.subplots_adjust(wspace=.3, top=.85)

    visualize(bt, sk_ridge.coef_)

    # Repeat the descent for step sizes 10**-n, n spread over [1, 5]; each
    # entry is (eta, (objective history, final beta)).
    runs = [(1/10**n, grad_descent(X_train, y_train, lam=lam, eta=1/10**n, max_iter=1000))
            for n in np.linspace(1, 5, 10)]

    plt.clf()
    for _, (history, _) in runs:
        plt.plot(history)
    plt.title("Objective values per iteration")

    # keep the coefficients of the run that reached the lowest objective
    best_idx = np.argmin([min(history) for _, (history, _) in runs])
    bt = runs[best_idx][1][1]
    visualize(bt, sk_ridge.coef_)

    # difference in objective values between sklearn's and my own descent
    print(objective(X_train, y_train, bt, lam) - objective(X_train, y_train, sk_ridge.coef_, lam))
def test_ridge():
    """Check that Ridge reaches a decent training score on random data.

    Each scenario is fitted twice: without sample weights and with a
    weight of one for every sample.

    TODO: for this test to be robust, we should use a dataset instead
    of np.random.
    """
    alpha = 1.0

    # ((n_samples, n_features), minimum score, check the coef_ shape?)
    scenarios = (
        ((6, 5), 0.5, True),     # more samples than features
        ((5, 10), 0.9, False),   # more features than samples
    )

    for (n_samples, n_features), min_score, check_shape in scenarios:
        y = np.random.randn(n_samples)
        X = np.random.randn(n_samples, n_features)

        ridge = Ridge(alpha=alpha)
        ridge.fit(X, y)
        if check_shape:
            assert_equal(ridge.coef_.shape, (X.shape[1],))
        assert ridge.score(X, y) > min_score

        ridge.fit(X, y, sample_weight=np.ones(n_samples))
        assert ridge.score(X, y) > min_score