Example #1
def test_svd_compressed_deterministic():
    m, n = 30, 25
    x = da.random.RandomState(1234).random_sample(size=(m, n), chunks=(5, 5))
    u, s, vt = svd_compressed(x, 3, seed=1234)
    u2, s2, vt2 = svd_compressed(x, 3, seed=1234)

    assert all(da.compute((u == u2).all(), (s == s2).all(), (vt == vt2).all()))
Example #2
def test_svd_compressed():
    m, n = 2000, 250
    r = 10
    np.random.seed(4321)
    mat1 = np.random.randn(m, r)
    mat2 = np.random.randn(r, n)
    mat = mat1.dot(mat2)
    data = da.from_array(mat, chunks=(500, 50))

    u, s, vt = svd_compressed(data, r, seed=4321, n_power_iter=2)

    usvt = da.dot(u, da.dot(da.diag(s), vt))

    tol = 0.2
    assert_eq(da.linalg.norm(usvt), np.linalg.norm(mat), rtol=tol,
              atol=tol)  # average accuracy check

    u = u[:, :r]
    s = s[:r]
    vt = vt[:r, :]

    s_exact = np.linalg.svd(mat)[1]
    s_exact = s_exact[:r]

    assert_eq(np.eye(r, r), da.dot(u.T, u))  # u must be orthonormal
    assert_eq(np.eye(r, r), da.dot(vt, vt.T))  # v must be orthonormal
    assert_eq(s, s_exact)  # s must contain the singular values
Example #3
def test_svd_compressed(iterator):
    m, n = 100, 50
    r = 5
    a = da.random.random((m, n), chunks=(m, n))

    # calculate approximation and true singular values
    u, s, vt = svd_compressed(a,
                              2 * r,
                              iterator=iterator[0],
                              n_power_iter=iterator[1],
                              seed=4321)  # worst case
    s_true = scipy.linalg.svd(a.compute(), compute_uv=False)

    # compute the difference with original matrix
    norm = scipy.linalg.norm((a - (u[:, :r] * s[:r]) @ vt[:r, :]).compute(), 2)

    # ||a-a_hat||_2 <= (1+tol)s_{k+1}: based on eq. 1.10/1.11:
    # Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp.
    # "Finding structure with randomness: Probabilistic algorithms for constructing
    # approximate matrix decompositions." SIAM review 53.2 (2011): 217-288.
    frac = norm / s_true[r + 1] - 1
    # Tolerance determined via simulation to be slightly above max norm of difference matrix in 10k samples.
    # See https://github.com/dask/dask/pull/6799#issuecomment-726631175 for more details.
    tol = 0.4
    assert frac < tol

    assert_eq(np.eye(r, r), da.dot(u[:, :r].T,
                                   u[:, :r]))  # u must be orthonormal
    assert_eq(np.eye(r, r), da.dot(vt[:r, :],
                                   vt[:r, :].T))  # v must be orthonormal
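The check above rearranges the bound quoted in the comments (Halko, Martinsson & Tropp, 2011). Writing â for the rank-r reconstruction (u[:, :r] * s[:r]) @ vt[:r, :] and σ for the reference singular value taken from s_true (the index is kept exactly as in the test), the bound

\[
\lVert a - \hat{a} \rVert_2 \le (1 + \mathrm{tol})\,\sigma
\qquad\Longleftrightarrow\qquad
\frac{\lVert a - \hat{a} \rVert_2}{\sigma} - 1 \le \mathrm{tol}
\]

is exactly what `frac < tol` asserts, with tol = 0.4 calibrated by the simulation linked in the comment.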
Example #4
def test_svd_compressed_shapes(m, n, k, chunks):
    x = da.random.random(size=(m, n), chunks=chunks)
    u, s, v = svd_compressed(x, k=k, n_power_iter=1, compute=True, seed=1)
    u, s, v = da.compute(u, s, v)
    r = min(m, n, k)
    assert u.shape == (m, r)
    assert s.shape == (r, )
    assert v.shape == (r, n)
Example #5
    def fit(self, A):

        # Wrap plain (NumPy) input in a single-chunk dask array
        if not hasattr(A, 'dask'):
            A = da.from_array(A, A.shape)

        # Compute a truncated, randomized SVD and keep only the right
        # singular vectors as the fitted components
        n_comps = self.svd_kwargs.pop('n_components')
        _, _, vt = svd_compressed(A, n_comps, **self.svd_kwargs)

        self.components_ = vt

        return self
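For context, a minimal self-contained sketch that wraps the fit() method above in a small transformer class. The class name CompressedSVD and its constructor are assumptions made for illustration; only the fit body comes from the example. Note that fit pops 'n_components', so the stored kwargs are consumed on the first call.

import numpy as np
import dask.array as da
from dask.array.linalg import svd_compressed

class CompressedSVD:
    # Hypothetical wrapper: stores keyword arguments for svd_compressed,
    # which must include an 'n_components' entry.
    def __init__(self, **svd_kwargs):
        self.svd_kwargs = svd_kwargs

    def fit(self, A):
        if not hasattr(A, 'dask'):
            A = da.from_array(A, A.shape)
        n_comps = self.svd_kwargs.pop('n_components')
        _, _, vt = svd_compressed(A, n_comps, **self.svd_kwargs)
        self.components_ = vt
        return self

model = CompressedSVD(n_components=3, seed=0).fit(np.random.randn(100, 20))
print(model.components_.shape)  # (3, 20): top right singular vectors, as a lazy dask array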
Example #6
def test_svd_compressed():
    m, n = 300, 250
    r = 10
    np.random.seed(1234)
    mat1 = np.random.randn(m, r)
    mat2 = np.random.randn(r, n)
    mat = mat1.dot(mat2)
    data = from_array(mat, chunks=(50, 50))

    n_iter = 6
    for i in range(n_iter):
        u, s, vt = svd_compressed(data, r)
        u = np.array(u)
        s = np.array(s)
        vt = np.array(vt)
        if i == 0:
            usvt = np.dot(u, np.dot(np.diag(s), vt))
        else:
            usvt += np.dot(u, np.dot(np.diag(s), vt))
    usvt /= n_iter

    tol = 2e-1
    assert np.allclose(np.linalg.norm(usvt),
                       np.linalg.norm(mat),
                       rtol=tol,
                       atol=tol)  # average accuracy check

    u, s, vt = svd_compressed(data, r)
    u = np.array(u)[:, :r]
    s = np.array(s)[:r]
    vt = np.array(vt)[:r, :]

    s_exact = np.linalg.svd(mat)[1]
    s_exact = s_exact[:r]

    assert np.allclose(np.eye(r, r), np.dot(u.T, u))  # u must be orthonormal
    assert np.allclose(np.eye(r, r), np.dot(vt, vt.T))  # v must be orthonormal
    assert np.allclose(s, s_exact)  # s must contain the singular values
Example #7
def test_svd_compressed_dtype_preservation(input_dtype, output_dtype):
    x = da.random.random((50, 50), chunks=(50, 50)).astype(input_dtype)
    u, s, vt = svd_compressed(x, 1, seed=4321)
    assert u.dtype == s.dtype == vt.dtype == output_dtype
Example #8
    def partial_fit(self, X, y=None, check_input=True):
        """Incremental fit with X. All of X is processed as a single batch.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.
        check_input : bool
            Run check_array on X.

        y : Ignored

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        if check_input:
            if sparse.issparse(X):
                raise TypeError(
                    "IncrementalPCA.partial_fit does not support "
                    "sparse input. Either convert data to dense "
                    "or use IncrementalPCA.fit to do so in batches.")
            X = check_array(
                X,
                copy=self.copy,
                dtype=[np.float64, np.float32],
                accept_multiple_blocks=True,
            )
        n_samples, n_features = X.shape
        if not hasattr(self, "components_"):
            self.components_ = None

        if self.n_components is None:
            if self.components_ is None:
                self.n_components_ = min(n_samples, n_features)
            else:
                self.n_components_ = self.components_.shape[0]
        elif not 1 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d, need "
                             "more rows than columns for IncrementalPCA "
                             "processing" % (self.n_components, n_features))
        elif not self.n_components <= n_samples:
            raise ValueError("n_components=%r must be less or equal to "
                             "the batch number of samples "
                             "%d." % (self.n_components, n_samples))
        else:
            self.n_components_ = self.n_components

        if (self.components_ is not None) and (self.components_.shape[0] !=
                                               self.n_components_):
            raise ValueError("Number of input features has changed from %i "
                             "to %i between calls to partial_fit! Try "
                             "setting n_components to a fixed value." %
                             (self.components_.shape[0], self.n_components_))

        # This is the first partial_fit
        if not hasattr(self, "n_samples_seen_"):
            self.n_samples_seen_ = 0
            self.mean_ = 0.0
            self.var_ = 0.0

        # Update stats - they are 0 if this is the first step
        # The next line is equivalent with np.repeat(self.n_samples_seen_, X.shape[1]),
        # which dask-array does not support
        last_sample_count = np.tile(np.expand_dims(self.n_samples_seen_, 0),
                                    X.shape[1])
        col_mean, col_var, n_total_samples = _incremental_mean_and_var(
            X,
            last_mean=self.mean_,
            last_variance=self.var_,
            last_sample_count=last_sample_count,
        )
        n_total_samples = da.compute(n_total_samples[0])[0]

        # Whitening
        if self.n_samples_seen_ == 0:
            # If it is the first step, simply whiten X
            X -= col_mean
        else:
            col_batch_mean = np.mean(X, axis=0)
            X -= col_batch_mean
            # Build matrix of combined previous basis and new data
            mean_correction = np.sqrt(
                (self.n_samples_seen_ * n_samples) /
                n_total_samples) * (self.mean_ - col_batch_mean)
            X = np.vstack((
                self.singular_values_.reshape((-1, 1)) * self.components_,
                X,
                mean_correction,
            ))

        # The following part is modified so that it can fit to large dask-array
        solver = self._get_solver(X, self.n_components_)
        if solver in {"full", "tsqr"}:
            U, S, V = linalg.svd(X)
            # manually implement full_matrix=False
            if V.shape[0] > len(S):
                V = V[:len(S)]
            if U.shape[1] > len(S):
                U = U[:, :len(S)]
        else:
            # randomized
            random_state = check_random_state(self.random_state)
            seed = draw_seed(random_state, np.iinfo("int32").max)
            n_power_iter = self.iterated_power
            U, S, V = linalg.svd_compressed(X,
                                            self.n_components_,
                                            n_power_iter=n_power_iter,
                                            seed=seed)
        U, V = svd_flip(U, V)
        explained_variance = S**2 / (n_total_samples - 1)
        components, singular_values = V, S

        # The following part is also updated for randomized solver,
        # which computes only a limited number of the singular values
        total_var = np.sum(col_var)
        explained_variance_ratio = (explained_variance / total_var *
                                    ((n_total_samples - 1) / n_total_samples))

        actual_rank = min(n_features, n_total_samples)
        if self.n_components_ < actual_rank:
            if solver == "randomized":
                noise_variance = (total_var - explained_variance.sum()) / (
                    actual_rank - self.n_components_)
            else:
                noise_variance = da.mean(
                    explained_variance[self.n_components_:])
        else:
            noise_variance = 0.0

        self.n_samples_seen_ = n_total_samples

        try:
            (
                self.n_samples_,
                self.mean_,
                self.var_,
                self.n_features_,
                self.components_,
                self.explained_variance_,
                self.explained_variance_ratio_,
                self.singular_values_,
                self.noise_variance_,
            ) = compute(
                n_samples,
                col_mean,
                col_var,
                n_features,
                components[:self.n_components_],
                explained_variance[:self.n_components_],
                explained_variance_ratio[:self.n_components_],
                singular_values[:self.n_components_],
                noise_variance,
            )
        except ValueError as e:
            if np.isnan([n_samples, n_features]).any():
                msg = (
                    "Computation of the SVD raised an error. It is possible "
                    "n_components is too large. i.e., "
                    "`n_components > np.nanmin(X.shape) = "
                    "np.nanmin({})`\n\n"
                    "A possible resolution to this error is to ensure that "
                    "n_components <= min(n_samples, n_features)")
                raise ValueError(msg.format(X.shape)) from e
            raise e

        if len(self.singular_values_) < self.n_components_:
            self.n_components_ = len(self.singular_values_)
            msg = (
                "n_components={n} is larger than the number of singular values"
                " ({s}) (note: PCA has attributes as if n_components == {s})")
            raise ValueError(
                msg.format(n=self.n_components_, s=len(self.singular_values_)))

        return self
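For reference, a minimal usage sketch of this method. It reads like a dask-ml style IncrementalPCA.partial_fit, so the import path and constructor below are assumptions; only the behaviour described in the docstring above is taken from the example.

import dask.array as da
from dask_ml.decomposition import IncrementalPCA  # assumed import path

X = da.random.random((10_000, 20), chunks=(1_000, 20))

ipca = IncrementalPCA(n_components=5)
# Feed one row-block at a time; each call updates mean_, var_, n_samples_seen_,
# components_ and the explained-variance attributes incrementally.
for i in range(X.numblocks[0]):
    ipca.partial_fit(X.blocks[i])

print(ipca.components_.shape)           # (5, 20)
print(ipca.explained_variance_ratio_)   # ratio per retained component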