def make_sparse_low_rank(n_dim_obs=3, n_dim_lat=2, T=10, epsilon=1e-3,
                         n_samples=50, **kwargs):
    """Generate a time series of sparse plus low-rank matrices."""
    from sklearn.datasets import make_sparse_spd_matrix, make_low_rank_matrix

    K = make_sparse_spd_matrix(n_dim_obs)
    L = make_low_rank_matrix(n_dim_obs, n_dim_obs, effective_rank=n_dim_lat)

    Ks = [K]
    Ls = [L]
    Kobs = [K - L]

    for i in range(1, T):
        K = K + make_sparse_spd_matrix(n_dim_obs)
        L = L + make_low_rank_matrix(
            n_dim_obs, n_dim_obs, effective_rank=n_dim_lat)
        # assert is_pos_def(K - L)
        # assert is_pos_semidef(L)
        Ks.append(K)
        Ls.append(L)
        Kobs.append(K - L)
    return Ks, Kobs, Ls

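# A minimal usage sketch for make_sparse_low_rank above. The eigenvalue
# check stands in for the is_pos_def helper referenced by the
# commented-out asserts; it is an assumption added here, not part of the
# original code.
import numpy as np

Ks, Kobs, Ls = make_sparse_low_rank(n_dim_obs=5, n_dim_lat=2, T=4)
for K in Ks:
    assert K.shape == (5, 5)
    # K is a sum of sparse SPD matrices, hence symmetric positive definite;
    # L is only a low-rank (not SPD) term, so K - L carries no such guarantee.
    assert np.all(np.linalg.eigvalsh(K) > 0)
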
def test_singular_values(svd_solver):
    # Check that the IncrementalPCA output has the correct singular values
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(
        n_samples, n_features, tail_strength=0.0, effective_rank=10,
        random_state=rng
    )
    X = da.from_array(X, chunks=[200, -1])

    pca = PCA(n_components=10, svd_solver=svd_solver, random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100,
                          svd_solver=svd_solver).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    assert_array_almost_equal(
        np.sum(pca.singular_values_ ** 2.0),
        np.linalg.norm(X_pca, "fro") ** 2.0, 12
    )
    assert_array_almost_equal(
        np.sum(ipca.singular_values_ ** 2.0),
        np.linalg.norm(X_ipca, "fro") ** 2.0, 2
    )

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(
        pca.singular_values_, np.sqrt(np.sum(X_pca ** 2.0, axis=0)), 12
    )
    assert_array_almost_equal(
        ipca.singular_values_, np.sqrt(np.sum(X_ipca ** 2.0, axis=0)), 2
    )

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(
        n_samples, n_features, tail_strength=0.0, effective_rank=3,
        random_state=rng
    )
    X = da.from_array(X, chunks=[4, -1])

    pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100,
                          svd_solver=svd_solver)
    X_pca = pca.fit_transform(X)
    X_pca /= np.sqrt(np.sum(X_pca ** 2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    X_hat = da.from_array(X_hat, chunks=(4, -1))
    ipca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)

def test_singular_values():
    # Check that the IncrementalPCA output has the correct singular values
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(n_samples, n_features,
                                      tail_strength=0.0, effective_rank=10,
                                      random_state=rng)

    pca = PCA(n_components=10, svd_solver='full', random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    assert_array_almost_equal(np.sum(pca.singular_values_**2.0),
                              np.linalg.norm(X_pca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(ipca.singular_values_**2.0),
                              np.linalg.norm(X_ipca, "fro")**2.0, 2)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(pca.singular_values_,
                              np.sqrt(np.sum(X_pca**2.0, axis=0)), 12)
    assert_array_almost_equal(ipca.singular_values_,
                              np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(n_samples, n_features,
                                      tail_strength=0.0, effective_rank=3,
                                      random_state=rng)

    pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100)
    X_pca = pca.fit_transform(X)
    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    ipca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)

def test_shaping_3_values(eng):
    svd = lambda x: SVD(k=3, method='direct', seed=0).fit(x)

    # baseline: ndarray (local) or BoltArray (spark)
    x = make_low_rank_matrix(n_samples=10, n_features=10, random_state=0)
    x = series.fromarray(x, engine=eng).values
    u, s, v = svd(x)

    # simple series
    x1 = series.fromarray(x)
    u1, s1, v1 = svd(x1)
    assert allclose(u, u1)
    assert allclose(s, s1)
    assert allclose(v, v1)

    # series with multiple dimensions
    x1 = series.fromarray(x.reshape(2, 5, 10))
    u1, s1, v1 = svd(x1)
    u1 = u1.reshape(10, 3)
    assert allclose(u, u1)
    assert allclose(s, s1)
    assert allclose(v, v1)

    # images (must have multiple dimensions)
    x1 = images.fromarray(x.reshape(10, 2, 5))
    u1, s1, v1 = svd(x1)
    v1 = v1.reshape(3, 10)
    assert allclose(u, u1)
    assert allclose(s, s1)
    assert allclose(v, v1)

def test_shaping_2_values(eng):
    pca = lambda x: PCA(k=3, svd_method='direct', seed=0).fit(x)

    # baseline: ndarray (local) or BoltArray (spark)
    x = make_low_rank_matrix(n_samples=10, n_features=10, random_state=0)
    x = series.fromarray(x, engine=eng).values
    t, w = pca(x)

    # simple series
    x1 = series.fromarray(x)
    t1, w1 = pca(x1)
    assert allclose(t, t1)
    assert allclose(w, w1)

    # series with multiple dimensions
    x1 = series.fromarray(x.reshape(2, 5, 10))
    t1, w1 = pca(x1)
    t1 = t1.reshape(10, 3)
    assert allclose(t, t1)
    assert allclose(w, w1)

    # images (must have multiple dimensions)
    x1 = images.fromarray(x.reshape(10, 2, 5))
    t1, w1 = pca(x1)
    w1 = w1.reshape(3, 10)
    assert allclose(t, t1)
    assert allclose(w, w1)

def test_randomized_svd_infinite_rank():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without 'low_rank component': just regularly but slowly
    # decreasing singular values: the rank of the data matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)
    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate method
        # without the iterated power method
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer)

        # the approximation does not tolerate the noise:
        assert np.abs(s[:k] - sa).max() > 0.1

        # compute the singular values of X using the fast approximate method
        # with iterated power method
        _, sap, _ = randomized_svd(X, k, n_iter=5,
                                   power_iteration_normalizer=normalizer)

        # the iterated power method is still managing to get most of the
        # structure at the requested rank
        assert_almost_equal(s[:k], sap, decimal=3)

def test_randomized_svd_transpose_consistency():
    # Check that transposing the design matrix has limited impact
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False,
                                random_state=0)
    U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True,
                                random_state=0)
    U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto',
                                random_state=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)
    assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(s2, s3)

def test_whitening(svd_solver):
    # Test that PCA and IncrementalPCA transforms match to sign flip.
    X = datasets.make_low_rank_matrix(1000, 10, tail_strength=0.0,
                                      effective_rank=2, random_state=1999)
    X = da.from_array(X, chunks=[200, -1])
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 9]:
        pca = PCA(whiten=True, n_components=nc,
                  svd_solver=svd_solver).fit(X.compute())
        ipca = IncrementalPCA(whiten=True, n_components=nc, batch_size=250,
                              svd_solver=svd_solver).fit(X)

        Xt_pca = pca.transform(X)
        Xt_ipca = ipca.transform(X)
        assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)

        Xinv_ipca = ipca.inverse_transform(Xt_ipca)
        Xinv_pca = pca.inverse_transform(Xt_pca)
        assert_almost_equal(X.compute(), Xinv_ipca, decimal=prec)
        assert_almost_equal(X.compute(), Xinv_pca, decimal=prec)
        assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)

def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalizer='none' diverges for
    # large number of power iterations on this dataset
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50

    # Check that it diverges with many (non-normalized) power iterations
    U, s, Vt = randomized_svd(X, n_components, n_iter=2,
                              power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(Vt))
    error_2 = linalg.norm(A, ord='fro')
    U, s, Vt = randomized_svd(X, n_components, n_iter=20,
                              power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(Vt))
    error_20 = linalg.norm(A, ord='fro')
    assert np.abs(error_2 - error_20) > 100

    for normalizer in ['LU', 'QR', 'auto']:
        U, s, Vt = randomized_svd(X, n_components, n_iter=2,
                                  power_iteration_normalizer=normalizer,
                                  random_state=0)
        A = X - U.dot(np.diag(s).dot(Vt))
        error_2 = linalg.norm(A, ord='fro')

        for i in [5, 10, 50]:
            U, s, Vt = randomized_svd(X, n_components, n_iter=i,
                                      power_iteration_normalizer=normalizer,
                                      random_state=0)
            A = X - U.dot(np.diag(s).dot(Vt))
            error = linalg.norm(A, ord='fro')
            assert 15 > np.abs(error_2 - error)

def test_poisson():
    # For Poisson distributed target, Poisson loss should give better results
    # than least squares measured in Poisson deviance as metric.
    rng = np.random.RandomState(42)
    n_train, n_test, n_features = 500, 100, 100
    X = make_low_rank_matrix(n_samples=n_train + n_test,
                             n_features=n_features, random_state=rng)
    # We create a log-linear Poisson model and downscale coef as it will get
    # exponentiated.
    coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
    y = rng.poisson(lam=np.exp(X @ coef))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_test, random_state=rng)

    gbdt_pois = HistGradientBoostingRegressor(loss="poisson",
                                              random_state=rng)
    gbdt_ls = HistGradientBoostingRegressor(loss="squared_error",
                                            random_state=rng)
    gbdt_pois.fit(X_train, y_train)
    gbdt_ls.fit(X_train, y_train)
    dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)

    for X, y in [(X_train, y_train), (X_test, y_test)]:
        metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X))
        # squared_error might produce non-positive predictions => clip
        metric_ls = mean_poisson_deviance(
            y, np.clip(gbdt_ls.predict(X), 1e-15, None))
        metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
        assert metric_pois < metric_ls
        assert metric_pois < metric_dummy

def test_randomized_svd_low_rank_with_noise():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X with structured approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.1,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate
        # method without the iterated power method
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer,
                                  random_state=0)

        # the approximation does not tolerate the noise:
        assert np.abs(s[:k] - sa).max() > 0.01

        # compute the singular values of X using the fast approximate
        # method with iterated power method
        _, sap, _ = randomized_svd(X, k,
                                   power_iteration_normalizer=normalizer,
                                   random_state=0)

        # the iterated power method is helping getting rid of the noise:
        assert_almost_equal(s[:k], sap, decimal=3)

def test_make_low_rank_matrix():
    X = make_low_rank_matrix(n_samples=50, n_features=25, effective_rank=5,
                             tail_strength=0.01, random_state=0)

    assert_equal(X.shape, (50, 25), "X shape mismatch")

    from numpy.linalg import svd
    u, s, v = svd(X)
    assert_less(sum(s) - 5, 0.1, "X rank is not approximately 5")

def make_regression(n_samples=100, n_features=50, effective_rank=10,
                    tail_strength=0.5):
    """Make a synthetic regression problem from a low-rank design matrix
    whose eigenspectrum has the given effective rank plus a tail.

    The generated matrix is split to produce a train and a test set; the
    last column serves as the regression target.
    """
    X0 = make_low_rank_matrix(n_samples=2 * n_samples,
                              n_features=n_features + 1,
                              effective_rank=effective_rank,
                              tail_strength=tail_strength)
    X0 -= np.sum(X0, axis=0)  # subtract each column's sum
    X_train, X_test = X0[:n_samples, :n_features], X0[n_samples:, :n_features]
    y_train, y_test = X0[:n_samples, n_features], X0[n_samples:, n_features]
    return X_train, y_train, X_test, y_test

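# A minimal usage sketch for make_regression above. The shapes follow
# directly from the splitting logic; nothing here is part of the
# original helper.
X_train, y_train, X_test, y_test = make_regression(n_samples=100,
                                                   n_features=50)
assert X_train.shape == (100, 50) and X_test.shape == (100, 50)
assert y_train.shape == (100,) and y_test.shape == (100,)
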
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == "lfw_people":
        X = fetch_lfw_people().data
    elif dataset_name == "20newsgroups":
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == "olivetti_faces":
        X = fetch_olivetti_faces().data
    elif dataset_name == "rcv1":
        X = fetch_rcv1().data
    elif dataset_name == "CIFAR":
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [
            unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
            for i in range(5)
        ]
        X = np.vstack(X1)
        del X1
    elif dataset_name == "SVHN":
        if handle_missing_dataset(SVHN_FOLDER) == "skip":
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)["X"]
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == "low rank matrix":
        X = make_low_rank_matrix(
            n_samples=500,
            n_features=int(1e4),
            effective_rank=100,
            tail_strength=0.5,
            random_state=random_state,
        )
    elif dataset_name == "uncorrelated matrix":
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == "big sparse matrix":
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name, parser="auto").data
    return X

def test_fastica_eigh_low_rank_warning(global_random_seed):
    """Test FastICA eigh solver raises warning for low-rank data."""
    rng = np.random.RandomState(global_random_seed)
    X = make_low_rank_matrix(n_samples=10, n_features=10, random_state=rng,
                             effective_rank=2)
    ica = FastICA(random_state=0, whiten="unit-variance",
                  whiten_solver="eigh")
    msg = "There are some small singular values"
    with pytest.warns(UserWarning, match=msg):
        ica.fit(X)

def random_X_y_coef(linear_model_loss, n_samples, n_features,
                    coef_bound=(-2, 2), seed=42):
    """Randomly generate y, X and coef in a valid range."""
    rng = np.random.RandomState(seed)
    n_dof = n_features + linear_model_loss.fit_intercept
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        random_state=rng,
    )

    if linear_model_loss.base_loss.is_multiclass:
        n_classes = linear_model_loss.base_loss.n_classes
        coef = np.empty((n_classes, n_dof))
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_classes * n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:, :-1].T + coef[:, -1]
        else:
            raw_prediction = X @ coef.T
        proba = linear_model_loss.base_loss.link.inverse(raw_prediction)

        # y = rng.choice(np.arange(n_classes), p=proba) does not work.
        # See https://stackoverflow.com/a/34190035/16761084
        def choice_vectorized(items, p):
            s = p.cumsum(axis=1)
            r = rng.rand(p.shape[0])[:, None]
            k = (s < r).sum(axis=1)
            return items[k]

        y = choice_vectorized(np.arange(n_classes),
                              p=proba).astype(np.float64)
    else:
        coef = np.empty((n_dof,))
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:-1] + coef[-1]
        else:
            raw_prediction = X @ coef
        y = linear_model_loss.base_loss.link.inverse(
            raw_prediction + rng.uniform(low=-1, high=1, size=n_samples))

    return X, y, coef

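# A standalone sketch of the choice_vectorized trick used above: drawing
# one categorical sample per row of a probability matrix without a Python
# loop, via row-wise CDFs. The frequency check is an illustration added
# here, not part of the original helper.
import numpy as np

rng = np.random.RandomState(0)
p = np.tile([0.2, 0.3, 0.5], (100000, 1))  # same distribution in each row
s = p.cumsum(axis=1)                       # row-wise CDF
r = rng.rand(p.shape[0])[:, None]          # one uniform draw per row
k = (s < r).sum(axis=1)                    # index of the selected category
freq = np.bincount(k, minlength=3) / p.shape[0]
assert np.allclose(freq, [0.2, 0.3, 0.5], atol=0.01)
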
def test_randomized_svd_sparse_warnings():
    # randomized_svd throws a warning for lil and dok matrix
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(50, 20, effective_rank=10, random_state=rng)
    n_components = 5
    for cls in (sparse.lil_matrix, sparse.dok_matrix):
        X = cls(X)
        assert_warns_message(
            sparse.SparseEfficiencyWarning,
            "Calculating SVD of a {} is expensive. "
            "csr_matrix is more efficient.".format(cls.__name__),
            randomized_svd, X, n_components, n_iter=1,
            power_iteration_normalizer='none')

def test_svd(eng):
    x_local = make_low_rank_matrix(n_samples=10, n_features=50,
                                   random_state=0)
    x = fromarray(x_local.reshape(10, 10, 5), engine=eng)
    x.cache()
    x.count()

    u1, s1, v1 = randomized_svd(x_local, n_components=2, random_state=0)

    u2, v2, s2 = getSVD(x, k=2, getComponents=True, getS=True)
    assert u1.shape == u2.shape
    assert s1.shape == s2.shape
    assert v1.shape == (2, 50)
    assert v2.shape == (2, 10, 5)

    u2, v2, s2 = getSVD(x, k=2, getComponents=True, getS=True,
                        normalization='nanmean')
    assert u1.shape == u2.shape
    assert s1.shape == s2.shape
    assert v1.shape == (2, 50)
    assert v2.shape == (2, 10, 5)

    u2, v2, s2 = getSVD(x, k=2, getComponents=True, getS=True,
                        normalization='zscore')
    assert u1.shape == u2.shape
    assert s1.shape == s2.shape
    assert v1.shape == (2, 50)
    assert v2.shape == (2, 10, 5)

    u2, v2, s2 = getSVD(x, k=2, getComponents=True, getS=True,
                        normalization=None)
    assert u1.shape == u2.shape
    assert s1.shape == s2.shape
    assert v1.shape == (2, 50)
    assert v2.shape == (2, 10, 5)

    with pytest.raises(ValueError) as ex:
        u2, v2, s2 = getSVD(x, k=2, getComponents=True, getS=True,
                            normalization='error')
    assert 'Normalization should be one of' in str(ex.value)

def test_explained_variances():
    # Test that PCA and IncrementalPCA calculations match
    X = datasets.make_low_rank_matrix(1000, 100, tail_strength=0.,
                                      effective_rank=10, random_state=1999)
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 99]:
        pca = PCA(n_components=nc).fit(X)
        ipca = IncrementalPCA(n_components=nc, batch_size=100).fit(X)
        assert_almost_equal(pca.explained_variance_,
                            ipca.explained_variance_, decimal=prec)
        assert_almost_equal(pca.explained_variance_ratio_,
                            ipca.explained_variance_ratio_, decimal=prec)
        assert_almost_equal(pca.noise_variance_,
                            ipca.noise_variance_, decimal=prec)

def test_svd(eng):
    x = make_low_rank_matrix(n_samples=10, n_features=5, random_state=0)
    x = fromarray(x, engine=eng)

    from sklearn.utils.extmath import randomized_svd
    u1, s1, v1 = randomized_svd(x.toarray(), n_components=2, random_state=0)

    u2, s2, v2 = SVD(k=2, method='direct').fit(x)
    assert allclose_sign(u1, u2)
    assert allclose(s1, s2)
    assert allclose_sign(v1.T, v2.T)

    u2, s2, v2 = SVD(k=2, method='em', max_iter=100, seed=0).fit(x)
    tol = 1e-1
    assert allclose_sign(u1, u2, atol=tol)
    assert allclose(s1, s2, atol=tol)
    assert allclose_sign(v1.T, v2.T, atol=tol)

def test_pca(eng):
    x = make_low_rank_matrix(n_samples=10, n_features=5, random_state=0)
    x = fromarray(x, engine=eng)

    from sklearn.decomposition import PCA as skPCA
    pca = skPCA(n_components=2)
    t1 = pca.fit_transform(x.toarray())
    w1_T = pca.components_

    t2, w2_T = PCA(k=2, svd_method='direct').fit(x)
    assert allclose_sign(w1_T.T, w2_T.T)
    assert allclose_sign(t1, t2)

    t2, w2_T = PCA(k=2, svd_method='em', max_iter=100, seed=0).fit(x)
    tol = 1e-1
    assert allclose_sign(w1_T.T, w2_T.T, atol=tol)
    assert allclose_sign(t1, t2, atol=tol)

def bench_b(power_list):
    n_samples, n_features = 1000, 10000
    data_params = {
        "n_samples": n_samples,
        "n_features": n_features,
        "tail_strength": 0.7,
        "random_state": random_state,
    }
    dataset_name = "low rank matrix %d x %d" % (n_samples, n_features)
    ranks = [10, 50, 100]

    if enable_spectral_norm:
        all_spectral = defaultdict(list)
    all_frobenius = defaultdict(list)
    for rank in ranks:
        X = make_low_rank_matrix(effective_rank=rank, **data_params)
        if enable_spectral_norm:
            X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0)
        X_fro_norm = norm_diff(X, norm="fro", msg=False)

        for n_comp in [int(rank / 2), rank, rank * 2]:
            label = "rank=%d, n_comp=%d" % (rank, n_comp)
            print(label)
            for pi in power_list:
                U, s, V, _ = svd_timing(
                    X,
                    n_comp,
                    n_iter=pi,
                    n_oversamples=2,
                    power_iteration_normalizer="LU",
                )
                if enable_spectral_norm:
                    A = U.dot(np.diag(s).dot(V))
                    all_spectral[label].append(
                        norm_diff(X - A, norm=2, random_state=0)
                        / X_spectral_norm)
                f = scalable_frobenius_norm_discrepancy(X, U, s, V)
                all_frobenius[label].append(f / X_fro_norm)

    if enable_spectral_norm:
        title = "%s: spectral norm diff vs n power iteration" % (dataset_name)
        plot_power_iter_vs_s(power_list, all_spectral, title)
    title = "%s: Frobenius norm diff vs n power iteration" % (dataset_name)
    plot_power_iter_vs_s(power_list, all_frobenius, title)

def test_whitening():
    """Test that PCA and IncrementalPCA transforms match to sign flip."""
    X = datasets.make_low_rank_matrix(1000, 10, tail_strength=0.,
                                      effective_rank=2, random_state=1999)
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 9]:
        pca = PCA(whiten=True, n_components=nc).fit(X)
        ipca = IncrementalPCA(whiten=True, n_components=nc,
                              batch_size=250).fit(X)

        Xt_pca = pca.transform(X)
        Xt_ipca = ipca.transform(X)
        assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)

        Xinv_ipca = ipca.inverse_transform(Xt_ipca)
        Xinv_pca = pca.inverse_transform(Xt_pca)
        assert_almost_equal(X, Xinv_ipca, decimal=prec)
        assert_almost_equal(X, Xinv_pca, decimal=prec)
        assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)

def fetch_low_rank_matrix(self, effective_rank=None, tail_strength=None):
    """Generate synthetic data with sklearn's make_low_rank_matrix.

    Uses self.n and self.d for the shape; effective_rank and tail_strength
    default to the instance attributes but can be overridden per call.
    """
    if effective_rank is None:
        eff_rank = self.effective_rank
    else:
        eff_rank = effective_rank
    if tail_strength is None:
        t_strength = self.tail_strength
    else:
        t_strength = tail_strength

    X = make_low_rank_matrix(n_samples=self.n, n_features=self.d,
                             effective_rank=eff_rank,
                             tail_strength=t_strength,
                             random_state=self.rng)
    return X

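# A hedged usage sketch for fetch_low_rank_matrix. The enclosing class is
# not shown in this snippet, so DataHolder below is a hypothetical
# stand-in providing only the attributes the method reads
# (n, d, effective_rank, tail_strength, rng).
import numpy as np
from sklearn.datasets import make_low_rank_matrix


class DataHolder:
    def __init__(self, n=200, d=30, effective_rank=5, tail_strength=0.5,
                 seed=0):
        self.n = n
        self.d = d
        self.effective_rank = effective_rank
        self.tail_strength = tail_strength
        self.rng = np.random.RandomState(seed)


# attach the method defined above to the hypothetical holder class
DataHolder.fetch_low_rank_matrix = fetch_low_rank_matrix

holder = DataHolder()
X = holder.fetch_low_rank_matrix()                     # instance defaults
X2 = holder.fetch_low_rank_matrix(effective_rank=10)   # per-call override
assert X.shape == (200, 30) and X2.shape == (200, 30)
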
def compute_bench(samples_range, features_range, n_iter=3, rank=50):
    it = 0
    results = defaultdict(lambda: [])
    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = make_low_rank_matrix(n_samples, n_features,
                                     effective_rank=rank,
                                     tail_strength=0.2)

            gc.collect()
            print("benchmarking scipy svd: ")
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=0")
            tstart = time()
            randomized_svd(X, rank, n_iter=0)
            results['scikit-learn randomized_svd (n_iter=0)'].append(
                time() - tstart)

            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=%d "
                  % n_iter)
            tstart = time()
            randomized_svd(X, rank, n_iter=n_iter)
            results['scikit-learn randomized_svd (n_iter=%d)'
                    % n_iter].append(time() - tstart)

    return results

def check_randomized_svd_low_rank(dtype):
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    decimal = 5 if dtype == np.float32 else 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0,
                             random_state=0).astype(dtype, copy=False)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # Convert the singular values to the specific dtype
    U = U.astype(dtype, copy=False)
    s = s.astype(dtype, copy=False)
    V = V.astype(dtype, copy=False)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(X, k,
                                    power_iteration_normalizer=normalizer,
                                    random_state=0)

        # If the input dtype is float, then the output dtype is float of the
        # same bit size (f32 is not upcast to f64)
        # But if the input dtype is int, the output dtype is float64
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype == np.float64
            assert sa.dtype == np.float64
            assert Va.dtype == np.float64

        assert Ua.shape == (n_samples, k)
        assert sa.shape == (k,)
        assert Va.shape == (k, n_features)

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va),
                            decimal=decimal)

        # check the sparse matrix representation
        X = sparse.csr_matrix(X)

        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)

        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype.kind == 'f'
            assert sa.dtype.kind == 'f'
            assert Va.dtype.kind == 'f'

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)

from sklearn import datasets
import matplotlib.pyplot as plt

# make_low_rank_matrix data
X = datasets.make_low_rank_matrix(n_samples=100, n_features=2,
                                  effective_rank=2, tail_strength=0.5,
                                  random_state=None)
print(X)
plt.scatter(X[:, 0], X[:, 1])
plt.show()

def glm_dataset(global_random_seed, request):
    """Dataset with GLM solutions, well conditioned X.

    This is inspired by ols_ridge_dataset in test_ridge.py.

    The construction is based on the SVD decomposition of X = U S V'.

    Parameters
    ----------
    type : {"long", "wide"}
        If "long", then n_samples > n_features.
        If "wide", then n_features > n_samples.
    model : a GLM model

    For "wide", we return the minimum norm solution:

        min ||w||_2 subject to w = argmin deviance(X, y, w)

    Note that the deviance is always minimized if y = inverse_link(X w) is
    possible to achieve, which it is in the wide data case. Therefore, we can
    construct the solution with minimum norm like (wide) OLS:

        min ||w||_2 subject to link(y) = raw_prediction = X w

    Returns
    -------
    model : GLM model
    X : ndarray
        Last column of 1, i.e. intercept.
    y : ndarray
    coef_unpenalized : ndarray
        Minimum norm solutions, i.e. min sum(loss(w)) (with minimum ||w||_2
        in case of ambiguity). Last coefficient is intercept.
    coef_penalized : ndarray
        GLM solution with alpha=l2_reg_strength=1, i.e.
        min 1/n * sum(loss) + ||w[:-1]||_2^2. Last coefficient is intercept.
    l2_reg_strength : float
        Always equal to 1.
    """
    data_type, model = request.param
    # Make larger dim more than double as big as the smaller one.
    # This helps when constructing singular matrices like (X, X).
    if data_type == "long":
        n_samples, n_features = 12, 4
    else:
        n_samples, n_features = 4, 12
    k = min(n_samples, n_features)
    rng = np.random.RandomState(global_random_seed)
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=k,
        tail_strength=0.1,
        random_state=rng,
    )
    X[:, -1] = 1  # last column acts as intercept
    U, s, Vt = linalg.svd(X, full_matrices=False)
    assert np.all(s > 1e-3)  # to be sure
    assert np.max(s) / np.min(s) < 100  # condition number of X

    if data_type == "long":
        coef_unpenalized = rng.uniform(low=1, high=3, size=n_features)
        coef_unpenalized *= rng.choice([-1, 1], size=n_features)
        raw_prediction = X @ coef_unpenalized
    else:
        raw_prediction = rng.uniform(low=-3, high=3, size=n_samples)
        # minimum norm solution min ||w||_2 such that raw_prediction = X w:
        # w = X'(XX')^-1 raw_prediction = V s^-1 U' raw_prediction
        coef_unpenalized = Vt.T @ np.diag(1 / s) @ U.T @ raw_prediction

    linear_loss = LinearModelLoss(base_loss=model._get_loss(),
                                  fit_intercept=True)
    sw = np.full(shape=n_samples, fill_value=1 / n_samples)
    y = linear_loss.base_loss.link.inverse(raw_prediction)

    # Add penalty l2_reg_strength * ||coef||_2^2 for l2_reg_strength=1 and
    # solve with optimizer. Note that the problem is well conditioned such
    # that we get accurate results.
    l2_reg_strength = 1
    fun = partial(
        linear_loss.loss,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    grad = partial(
        linear_loss.gradient,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    coef_penalized_with_intercept = _special_minimize(
        fun, grad, coef_unpenalized, tol_NM=1e-6, tol=1e-14
    )

    linear_loss = LinearModelLoss(base_loss=model._get_loss(),
                                  fit_intercept=False)
    fun = partial(
        linear_loss.loss,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    grad = partial(
        linear_loss.gradient,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    coef_penalized_without_intercept = _special_minimize(
        fun, grad, coef_unpenalized[:-1], tol_NM=1e-6, tol=1e-14
    )

    # To be sure
    assert np.linalg.norm(coef_penalized_with_intercept) < np.linalg.norm(
        coef_unpenalized)

    return (
        model,
        X,
        y,
        coef_unpenalized,
        coef_penalized_with_intercept,
        coef_penalized_without_intercept,
        l2_reg_strength,
    )

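# A standalone check of the minimum-norm identity used in glm_dataset:
# for wide X with full row rank and thin SVD X = U S Vt, the solution of
# min ||w||_2 subject to X w = b is w = Vt.T @ diag(1/s) @ U.T @ b, i.e.
# the Moore-Penrose pseudoinverse applied to b. Shapes are illustrative.
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
X = rng.randn(4, 12)                          # wide: n_samples < n_features
b = rng.randn(4)
U, s, Vt = linalg.svd(X, full_matrices=False)
w = Vt.T @ np.diag(1 / s) @ U.T @ b
assert np.allclose(X @ w, b)                  # interpolates exactly
assert np.allclose(w, np.linalg.pinv(X) @ b)  # matches the pseudoinverse
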
ax = fig.add_subplot(111)
ax.scatter(xcord1, ycord1, s=30, c='blue', marker='s')
ax.scatter(xcord2, ycord2, s=30, c='red')
x = arange(-3.0, 3.0, 0.1)
y = (-weights[0] - weights[1] * x) / weights[2]
ax.plot(x, y)
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()


def main():
    dataMat, labelMat = loadDataSet()
    weights = gradAscent(dataMat, labelMat).getA()
    plotBestFit(weights)


if __name__ == '__main__':
    main()

X = datasets.make_low_rank_matrix(n_samples=100, n_features=100,
                                  effective_rank=10, random_state=None)
Y = datasets.make_low_rank_matrix(n_samples=100, n_features=1,
                                  effective_rank=10, random_state=None)
print(X)
plt.plot(X, Y)
plt.show()

color="darkorange", lw=lw) plt.semilogx(param_range, test_scores_mean, label="Cross-validation score", color="navy", lw=lw) plt.fill_between(param_range, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.2, color="navy", lw=lw) plt.legend(loc="best") plt.savefig('figures/syn_best_alpha_{}.png'.format(method_name)) plt.clf() np.random.seed(0) n_samples = 200 n_features = 500 e_rank = 30 X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features + 1, effective_rank=e_rank, tail_strength=0.5) X, y = X[:,:-1], X[:,-1] print('data shape:', X.shape) methods = [("Ridge", Ridge(), "alpha", np.logspace(-3, 0, 20)), ("LASSO", Lasso(), "alpha", np.logspace(-3, 0, 20)), ("Echo", er.EchoRegression(), 'alpha', np.logspace(-1, 4, 20)), ("OLS", LinearRegression(), "fit_intercept", [True])] for method_name, method, param_name, param_range in methods: train_scores, test_scores = validation_curve(method, X, y, cv=10, scoring='neg_mean_squared_error', param_name=param_name, param_range=param_range)
def make_varratio_exercise():
    return make_low_rank_matrix(tail_strength=0.1)

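# A brief illustration of what make_varratio_exercise returns: with the
# default 100 x 100 shape and tail_strength=0.1, most of the variance sits
# in the first ~10 singular values (the default effective_rank). The 0.8
# threshold is an illustrative bound added here, not a guarantee from the
# original code.
import numpy as np
from sklearn.datasets import make_low_rank_matrix

X = make_low_rank_matrix(tail_strength=0.1, random_state=0)
s = np.linalg.svd(X, compute_uv=False)
varratio = s ** 2 / np.sum(s ** 2)
assert varratio[:10].sum() > 0.8  # bulk of the variance in the top components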