Example #1
def test_incremental_variance_ddof():
    """Test that degrees of freedom parameter for calculations are correct."""
    rng = np.random.RandomState(1999)
    X = rng.randn(50, 10)
    n_samples, n_features = X.shape
    for batch_size in [11, 20, 37]:
        steps = np.arange(0, X.shape[0], batch_size)
        if steps[-1] != X.shape[0]:
            steps = np.hstack([steps, n_samples])

        for i, j in zip(steps[:-1], steps[1:]):
            batch = X[i:j, :]
            if i == 0:
                incremental_means = batch.mean(axis=0)
                incremental_variances = batch.var(axis=0)
                # Assign this twice so that the test logic is consistent
                incremental_count = batch.shape[0]
                sample_count = batch.shape[0]
            else:
                result = _batch_mean_variance_update(batch, incremental_means,
                                                     incremental_variances,
                                                     sample_count)
                (incremental_means, incremental_variances,
                 incremental_count) = result
                sample_count += batch.shape[0]

            calculated_means = np.mean(X[:j], axis=0)
            calculated_variances = np.var(X[:j], axis=0)
            assert_almost_equal(incremental_means, calculated_means, 6)
            assert_almost_equal(incremental_variances, calculated_variances, 6)
            assert_equal(incremental_count, sample_count)
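These tests appear to come from an older scikit-learn, where `_batch_mean_variance_update` lived in `sklearn.utils.extmath` (it was later renamed `_incremental_mean_and_var`); the snippets also presuppose `import numpy as np` and assertion helpers such as `numpy.testing.assert_almost_equal` / `assert_equal`. As a minimal sketch of the contract the test exercises — combining running per-column statistics with a new batch, assuming population variance (`ddof=0`, matching `np.var`'s default) — the update can be written as:

import numpy as np

def batch_mean_variance_update(X, old_mean, old_variance, old_sample_count):
    """Sketch of the incremental mean/variance combine (Youngs & Cramer style)."""
    new_sample_count = X.shape[0]
    updated_sample_count = old_sample_count + new_sample_count

    new_mean = X.mean(axis=0)
    # Weighted average of the two block means.
    updated_mean = (old_sample_count * old_mean
                    + new_sample_count * new_mean) / updated_sample_count

    # Combine the sums of squared deviations of both blocks, plus a
    # correction term for the shift between the block means.
    old_ssd = old_variance * old_sample_count
    new_ssd = X.var(axis=0) * new_sample_count
    correction = (old_sample_count * new_sample_count / updated_sample_count
                  * (old_mean - new_mean) ** 2)
    updated_variance = (old_ssd + new_ssd + correction) / updated_sample_count

    return updated_mean, updated_variance, updated_sample_count

Substituting this sketch for `_batch_mean_variance_update` should make the test above self-contained.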
Example #2
def test_incremental_variance_ddof():
    # Test that the degrees of freedom parameter for the calculations is correct.
    rng = np.random.RandomState(1999)
    X = rng.randn(50, 10)
    n_samples, n_features = X.shape
    for batch_size in [11, 20, 37]:
        steps = np.arange(0, X.shape[0], batch_size)
        if steps[-1] != X.shape[0]:
            steps = np.hstack([steps, n_samples])

        for i, j in zip(steps[:-1], steps[1:]):
            batch = X[i:j, :]
            if i == 0:
                incremental_means = batch.mean(axis=0)
                incremental_variances = batch.var(axis=0)
                # Assign this twice so that the test logic is consistent
                incremental_count = batch.shape[0]
                sample_count = batch.shape[0]
            else:
                result = _batch_mean_variance_update(
                    batch, incremental_means, incremental_variances,
                    sample_count)
                (incremental_means, incremental_variances,
                 incremental_count) = result
                sample_count += batch.shape[0]

            calculated_means = np.mean(X[:j], axis=0)
            calculated_variances = np.var(X[:j], axis=0)
            assert_almost_equal(incremental_means, calculated_means, 6)
            assert_almost_equal(incremental_variances,
                                calculated_variances, 6)
            assert_equal(incremental_count, sample_count)
Example #3
def test_incremental_variance_update_formulas():
    """Test Youngs and Cramer incremental variance formulas."""
    # Doggie data from http://www.mathsisfun.com/data/standard-deviation.html
    A = np.array([[600, 470, 170, 430, 300], [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300], [600, 470, 170, 430, 300]]).T
    idx = 2
    X1 = A[:idx, :]
    X2 = A[idx:, :]

    old_means = X1.mean(axis=0)
    old_variances = X1.var(axis=0)
    old_sample_count = X1.shape[0]
    final_means, final_variances, final_count = _batch_mean_variance_update(
        X2, old_means, old_variances, old_sample_count)
    assert_almost_equal(final_means, A.mean(axis=0), 6)
    assert_almost_equal(final_variances, A.var(axis=0), 6)
    assert_almost_equal(final_count, A.shape[0])
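In symbols, the combine step this test verifies is the Youngs & Cramer (equivalently, Chan et al.) formula: with running statistics (mu_1, sigma_1^2, n_1) from X1 and batch statistics (mu_2, sigma_2^2, n_2) from X2,

    \mu = \frac{n_1\,\mu_1 + n_2\,\mu_2}{n_1 + n_2},
    \qquad
    \sigma^2 = \frac{n_1\,\sigma_1^2 + n_2\,\sigma_2^2
        + \frac{n_1 n_2}{n_1 + n_2}\,(\mu_1 - \mu_2)^2}{n_1 + n_2}

which the assertions check against `np.mean` and `np.var` computed over all of A at once.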
Example #4
def test_incremental_variance_update_formulas():
    # Test Youngs and Cramer incremental variance formulas.
    # Doggie data from http://www.mathsisfun.com/data/standard-deviation.html
    A = np.array([[600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300],
                  [600, 470, 170, 430, 300]]).T
    idx = 2
    X1 = A[:idx, :]
    X2 = A[idx:, :]

    old_means = X1.mean(axis=0)
    old_variances = X1.var(axis=0)
    old_sample_count = X1.shape[0]
    final_means, final_variances, final_count = _batch_mean_variance_update(
        X2, old_means, old_variances, old_sample_count)
    assert_almost_equal(final_means, A.mean(axis=0), 6)
    assert_almost_equal(final_variances, A.var(axis=0), 6)
    assert_almost_equal(final_count, A.shape[0])
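Example #5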
    def partial_fit(self, X, y=None):
        """Incremental fit with X. All of X is processed as a single batch.
        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.
        Returns
        -------
        self: object
            Returns the instance itself.
        """
        # X = check_array(X, copy=self.copy, dtype=np.float)  # --- ADJUSTED
        X = np.asarray(X)  # --- ADJUSTED (no copy: X is centered in place below)
        n_samples, n_features = X.shape
        if not hasattr(self, 'components_'):
            self.components_ = None

        if self.n_components is None:
            self.n_components_ = n_features
        elif not 1 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d, need "
                             "more rows than columns for IncrementalPCA "
                             "processing" % (self.n_components, n_features))
        else:
            self.n_components_ = self.n_components

        if (self.components_ is not None) and (self.components_.shape[0]
                                               != self.n_components_):
            raise ValueError("Number of input features has changed from %i "
                             "to %i between calls to partial_fit! Try "
                             "setting n_components to a fixed value." % (
                                 self.components_.shape[0], self.n_components_))

        if self.components_ is None:
            # This is the first pass through partial_fit
            self.n_samples_seen_ = 0
            col_var = X.var(axis=0)
            col_mean = X.mean(axis=0)
            X -= col_mean
            U, S, V = linalg.svd(X, full_matrices=False)
            U, V = svd_flip(U, V, u_based_decision=False)
            explained_variance = S ** 2 / n_samples
            explained_variance_ratio = S ** 2 / np.sum(col_var *
                                                       n_samples)
        else:
            col_batch_mean = X.mean(axis=0)
            col_mean, col_var, n_total_samples = _batch_mean_variance_update(
                X, self.mean_, self.var_, self.n_samples_seen_)
            X -= col_batch_mean
            # Build matrix of combined previous basis and new data
            mean_correction = np.sqrt((self.n_samples_seen_ * n_samples) /
                                      n_total_samples) * (self.mean_ -
                                                          col_batch_mean)
            X_combined = np.vstack((self.singular_values_.reshape((-1, 1)) *
                                    self.components_, X,
                                    mean_correction))
            U, S, V = linalg.svd(X_combined, full_matrices=False)
            U, V = svd_flip(U, V, u_based_decision=False)
            explained_variance = S ** 2 / n_total_samples
            explained_variance_ratio = S ** 2 / np.sum(col_var *
                                                       n_total_samples)
        self.n_samples_seen_ += n_samples
        self.components_ = V[:self.n_components_]
        self.singular_values_ = S[:self.n_components_]
        self.mean_ = col_mean
        self.var_ = col_var
        self.explained_variance_ = explained_variance[:self.n_components_]
        self.explained_variance_ratio_ = \
            explained_variance_ratio[:self.n_components_]
        # if self.n_components_ < n_features:          # --- ADJUSTED
        #     self.noise_variance_ = \
        #         explained_variance[self.n_components_:].mean()
        # else:
        #     self.noise_variance_ = 0.

        return self
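The key step in the incremental branch is the vertical stack fed to the SVD: the previous basis rescaled by its singular values, the newly centered batch, and a mean-correction row that accounts for the shift between the running mean and the batch mean (in the style of the sequential Karhunen-Loeve update of Ross et al., the reference scikit-learn's IncrementalPCA cites):

    \tilde{X} =
    \begin{pmatrix}
        \operatorname{diag}(S)\, V \\
        X - \mu_{\mathrm{batch}} \\
        \sqrt{\frac{n_{\mathrm{seen}}\, n_{\mathrm{batch}}}{n_{\mathrm{total}}}}
        \,(\mu_{\mathrm{seen}} - \mu_{\mathrm{batch}})
    \end{pmatrix}

Here S and V are `self.singular_values_` and `self.components_` from the previous call, so a single SVD of this stack yields the updated basis without revisiting earlier samples.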
Example #6
    def partial_fit(self, X, y=None):
        """Incremental fit with X. All of X is processed as a single batch.
        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.
        Returns
        -------
        self: object
            Returns the instance itself.
        """
        # X = check_array(X, copy=self.copy, dtype=np.float)  # --- ADJUSTED
        X = np.asarray(X)  # --- ADJUSTED (no copy: X is centered in place below)
        n_samples, n_features = X.shape
        if not hasattr(self, 'components_'):
            self.components_ = None

        if self.n_components is None:
            self.n_components_ = n_features
        elif not 1 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d, need "
                             "more rows than columns for IncrementalPCA "
                             "processing" % (self.n_components, n_features))
        else:
            self.n_components_ = self.n_components

        if (self.components_ is not None) and (self.components_.shape[0] !=
                                               self.n_components_):
            raise ValueError("Number of input features has changed from %i "
                             "to %i between calls to partial_fit! Try "
                             "setting n_components to a fixed value." %
                             (self.components_.shape[0], self.n_components_))

        if self.components_ is None:
            # This is the first pass through partial_fit
            self.n_samples_seen_ = 0
            col_var = X.var(axis=0)
            col_mean = X.mean(axis=0)
            X -= col_mean
            U, S, V = linalg.svd(X, full_matrices=False)
            U, V = svd_flip(U, V, u_based_decision=False)
            explained_variance = S**2 / n_samples
            explained_variance_ratio = S**2 / np.sum(col_var * n_samples)
        else:
            col_batch_mean = X.mean(axis=0)
            col_mean, col_var, n_total_samples = _batch_mean_variance_update(
                X, self.mean_, self.var_, self.n_samples_seen_)
            X -= col_batch_mean
            # Build matrix of combined previous basis and new data
            mean_correction = np.sqrt(
                (self.n_samples_seen_ * n_samples) /
                n_total_samples) * (self.mean_ - col_batch_mean)
            X_combined = np.vstack((self.singular_values_.reshape(
                (-1, 1)) * self.components_, X, mean_correction))
            U, S, V = linalg.svd(X_combined, full_matrices=False)
            U, V = svd_flip(U, V, u_based_decision=False)
            explained_variance = S**2 / n_total_samples
            explained_variance_ratio = S**2 / np.sum(col_var * n_total_samples)
        self.n_samples_seen_ += n_samples
        self.components_ = V[:self.n_components_]
        self.singular_values_ = S[:self.n_components_]
        self.mean_ = col_mean
        self.var_ = col_var
        self.explained_variance_ = explained_variance[:self.n_components_]
        self.explained_variance_ratio_ = \
            explained_variance_ratio[:self.n_components_]
        # if self.n_components_ < n_features:          # --- ADJUSTED
        #     self.noise_variance_ = \
        #         explained_variance[self.n_components_:].mean()
        # else:
        #     self.noise_variance_ = 0.

        return self
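Apart from the spots marked --- ADJUSTED (input validation and the `noise_variance_` bookkeeping are stripped), this method mirrors scikit-learn's `IncrementalPCA.partial_fit`, so the upstream class can illustrate the intended mini-batch usage:

import numpy as np
from sklearn.decomposition import IncrementalPCA

rng = np.random.RandomState(1999)
X = rng.randn(500, 10)

ipca = IncrementalPCA(n_components=3)
# Each call folds one batch into the running mean/variance and the
# truncated SVD basis, exactly as in the partial_fit above.
for batch in np.array_split(X, 10):
    ipca.partial_fit(batch)

print(ipca.components_.shape)                # (3, 10)
print(ipca.explained_variance_ratio_.sum())  # fraction of variance retained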