def test_mean_var_sparse():
    from itertools import product

    import numpy as np
    import scanpy as sc
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import mean_variance_axis

    csr64 = sp.random(10000, 1000, format="csr", dtype=np.float64)
    csc64 = csr64.tocsc()

    # Test that we're equivalent for 64 bit
    for mtx, ax in product((csr64, csc64), (0, 1)):
        scm, scv = sc.pp._utils._get_mean_var(mtx, axis=ax)
        skm, skv = mean_variance_axis(mtx, ax)
        skv *= (mtx.shape[ax] / (mtx.shape[ax] - 1))

        assert np.allclose(scm, skm)
        assert np.allclose(scv, skv)

    csr32 = csr64.astype(np.float32)
    csc32 = csc64.astype(np.float32)

    # Test whether ours is more accurate for 32 bit
    for mtx32, mtx64 in [(csc32, csc64), (csr32, csr64)]:
        scm32, scv32 = sc.pp._utils._get_mean_var(mtx32)
        scm64, scv64 = sc.pp._utils._get_mean_var(mtx64)
        skm32, skv32 = mean_variance_axis(mtx32, 0)
        skm64, skv64 = mean_variance_axis(mtx64, 0)
        skv32 *= (mtx32.shape[0] / (mtx32.shape[0] - 1))
        skv64 *= (mtx64.shape[0] / (mtx64.shape[0] - 1))

        m_resid_sc = np.mean(np.abs(scm64 - scm32))
        m_resid_sk = np.mean(np.abs(skm64 - skm32))
        v_resid_sc = np.mean(np.abs(scv64 - scv32))
        v_resid_sk = np.mean(np.abs(skv64 - skv32))

        assert m_resid_sc < m_resid_sk
        assert v_resid_sc < v_resid_sk
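
A note on the correction factor used above: mean_variance_axis returns the
population variance (denominator n), while scanpy's _get_mean_var returns the
unbiased sample variance (denominator n - 1), so the test rescales the sklearn
result before comparing. A minimal sketch of that relationship (sizes and
names here are illustrative, not taken from the test):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

X = sp.random(100, 5, density=0.3, format="csr", random_state=0)
_, pop_var = mean_variance_axis(X, axis=0)   # denominator n
n = X.shape[0]
sample_var = pop_var * n / (n - 1)           # Bessel's correction
assert np.allclose(sample_var, np.var(X.toarray(), axis=0, ddof=1))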
Example #2
def test_mean_variance_axis1():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0
    X_csr = sp.csr_matrix(X_lil)

    X_means, X_vars = mean_variance_axis(X_csr, axis=1)
    assert_array_almost_equal(X_means, np.mean(X, axis=1))
    assert_array_almost_equal(X_vars, np.var(X, axis=1))

    X_csc = sp.csc_matrix(X_lil)
    X_means, X_vars = mean_variance_axis(X_csc, axis=1)

    assert_array_almost_equal(X_means, np.mean(X, axis=1))
    assert_array_almost_equal(X_vars, np.var(X, axis=1))
    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)

    X = X.astype(np.float32)
    X_csr = X_csr.astype(np.float32)
    X_csc = X_csc.astype(np.float32)
    X_means, X_vars = mean_variance_axis(X_csr, axis=1)
    assert_array_almost_equal(X_means, np.mean(X, axis=1))
    assert_array_almost_equal(X_vars, np.var(X, axis=1))
    X_means, X_vars = mean_variance_axis(X_csc, axis=1)
    assert_array_almost_equal(X_means, np.mean(X, axis=1))
    assert_array_almost_equal(X_vars, np.var(X, axis=1))
    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)
def test_mean_variance_axis1():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0

    with pytest.raises(TypeError):
        mean_variance_axis(X_lil, axis=1)

    X_csr = sp.csr_matrix(X_lil)
    X_csc = sp.csc_matrix(X_lil)

    expected_dtypes = [(np.float32, np.float32), (np.float64, np.float64),
                       (np.int32, np.float64), (np.int64, np.float64)]

    for input_dtype, output_dtype in expected_dtypes:
        X_test = X.astype(input_dtype)
        for X_sparse in (X_csr, X_csc):
            X_sparse = X_sparse.astype(input_dtype)
            X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
            assert X_means.dtype == output_dtype
            assert X_vars.dtype == output_dtype
            assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
Example #5
def nanmean(x, axis=None):
    """ Equivalent of np.nanmean that supports sparse or dense matrices. """
    if not sp.issparse(x):
        means = np.nanmean(x, axis=axis)
    elif axis is None:
        means, _ = mean_variance_axis(x, axis=0)
        means = np.nanmean(means)
    else:
        means, _ = mean_variance_axis(x, axis=axis)

    return means
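
One caveat for the sparse axis=None branch above: mean_variance_axis ignores
NaNs per column (see the precision test near the end of this collection), so
np.nanmean over the per-column means weights every column equally even when
columns contain different numbers of NaNs, which can differ from the true
grand nanmean. A quick illustration, assuming a small CSR matrix:

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

x = sp.csr_matrix(np.array([[1.0, 10.0], [3.0, np.nan]]))
col_means, _ = mean_variance_axis(x, axis=0)  # NaNs ignored per column
print(np.mean(col_means))                     # (2.0 + 10.0) / 2 = 6.0
print(np.nanmean(x.toarray()))                # (1 + 10 + 3) / 3 ~= 4.67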
def high_mem_mean_and_std(loom_file,
                          layer,
                          axis=None,
                          valid_ca=None,
                          valid_ra=None):
    """
    Calculates mean and standard deviation in a high memory fashion

    Args:
        loom_file (str): Path to loom file containing mC/C counts
        layer (str): Layer containing mC/C counts
        axis (int): Axis to calculate mean and standard deviation
            None: values are for entire layer
            0: Statistics are for cells
            1: Statistics are for features
        valid_ca (str): Optional, only use cells specified by valid_ca
        valid_ra (str): Optional, only use features specified by valid_ra
    """
    # Get valid indices
    row_idx = get_attr_index(loom_file=loom_file,
                             attr=valid_ra,
                             columns=False,
                             as_bool=False,
                             inverse=False)
    col_idx = get_attr_index(loom_file=loom_file,
                             attr=valid_ca,
                             columns=True,
                             as_bool=False,
                             inverse=False)
    # Get data
    with loompy.connect(loom_file, mode='r') as ds:
        dat = ds.layers[layer].sparse(row_idx, col_idx)
    # Get mean and variance
    if axis == 0:
        my_mean, my_var = sparsefuncs.mean_variance_axis(dat.tocsc(), axis=0)
        my_std = np.sqrt(my_var)
    elif axis == 1:
        my_mean, my_var = sparsefuncs.mean_variance_axis(dat.tocsr(), axis=1)
        my_std = np.sqrt(my_var)
    elif axis is None:
        my_mean = dat.tocsr().mean(axis=None)
        sqrd = dat.copy()
        sqrd.data **= 2
        my_var = sqrd.sum(axis=None) / (sqrd.shape[0] *
                                        sqrd.shape[1]) - my_mean**2
        my_std = np.sqrt(my_var)
    else:
        raise ValueError('Unsupported axis value ({})'.format(axis))
    return my_mean, my_std
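
The axis=None branch above relies on the identity var(x) = E[x^2] - E[x]^2,
computed by squaring the stored sparse values in place. A standalone sketch of
just that step, checked against the dense result (sizes are illustrative):

import numpy as np
import scipy.sparse as sp

dat = sp.random(50, 20, density=0.2, format="csr", random_state=0)
my_mean = dat.mean(axis=None)
sqrd = dat.copy()
sqrd.data **= 2
my_var = sqrd.sum(axis=None) / (sqrd.shape[0] * sqrd.shape[1]) - my_mean ** 2
assert np.isclose(my_var, np.var(dat.toarray()))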
Example #8
def _get_mu_std(sam3, sam1, sam2, knn=False):
    g1, g2 = ut.extract_annotation(sam3.adata.var_names, 0, ";"), ut.extract_annotation(
        sam3.adata.var_names, 1, ";"
    )
    if knn:
        mu1, var1 = sf.mean_variance_axis(sam1.adata[:, g1].layers["X_knn_avg"], axis=0)
        mu2, var2 = sf.mean_variance_axis(sam2.adata[:, g2].layers["X_knn_avg"], axis=0)
    else:
        mu1, var1 = sf.mean_variance_axis(sam1.adata[:, g1].X, axis=0)
        mu2, var2 = sf.mean_variance_axis(sam2.adata[:, g2].X, axis=0)
    var1[var1 == 0] = 1
    var2[var2 == 0] = 1
    var1 = var1 ** 0.5  # convert variances to standard deviations
    var2 = var2 ** 0.5
    return mu1, var1, mu2, var2
Example #9
    def fit(self, X, y=None):
        """Don't trust the documentation of this module!

        Compute the mean and std to be used for later scaling.

        Parameters
        ----------
        X : array-like or CSR matrix with shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.
        """
        X = check_array(X,
                        copy=self.copy,
                        accept_sparse="csc",
                        dtype=np.float32,
                        ensure_2d=False)
        if sparse.issparse(X):
            if self.center_sparse:
                means = []
                vars = []

                # This only works for csc matrices...
                for i in range(X.shape[1]):
                    if X.indptr[i] == X.indptr[i + 1]:
                        means.append(0)
                        vars.append(1)
                    else:
                        vars.append(X.data[X.indptr[i]:X.indptr[i + 1]].var())
                        # If the variance is 0, set all occurrences of this
                        # feature to 1
                        means.append(X.data[X.indptr[i]:X.indptr[i +
                                                                 1]].mean())
                        if 0.0000001 >= vars[-1] >= -0.0000001:
                            means[-1] -= 1

                self.std_ = np.sqrt(np.array(vars))
                self.std_[np.array(vars) == 0.0] = 1.0
                self.mean_ = np.array(means)

                return self
            elif self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
            else:
                self.mean_ = None

            if self.with_std:
                var = mean_variance_axis(X, axis=0)[1]
                self.std_ = np.sqrt(var)
                self.std_[var == 0.0] = 1.0
            else:
                self.std_ = None
            return self
        else:
            self.mean_, self.std_ = _mean_and_std(X,
                                                  axis=0,
                                                  with_mean=self.with_mean,
                                                  with_std=self.with_std)
            return self
Example #10
    def _centre_scale(self, views: Iterable[np.ndarray]):
        """
        Removes the mean of the training data and standardizes for each view and stores mean and standard deviation during training

        :param views: list/tuple of numpy arrays or array likes with the same number of rows (samples)
        :return: train_views: the demeaned numpy arrays to be used to fit the model
        """

        self.view_means = []
        self.view_stds = []
        transformed_views = []
        for view in views:
            if issparse(view):
                view_mean, view_var = mean_variance_axis(view, axis=0)
                # mean_variance_axis returns the variance; take the square
                # root (guarding zero-variance features) before scaling.
                view_std = np.sqrt(view_var)
                view_std[view_std == 0.0] = 1.0
                self.view_means.append(view_mean)
                self.view_stds.append(view_std)
                # NOTE: subtracting the mean densifies the sparse view.
                view = view - self.view_means[-1]
                view = view / self.view_stds[-1]
            else:
                if self.centre:
                    view_mean = view.mean(axis=0)
                    self.view_means.append(view_mean)
                    view = view - self.view_means[-1]
                if self.scale:
                    view_std = view.std(axis=0, ddof=1)
                    view_std[view_std == 0.0] = 1.0
                    self.view_stds.append(view_std)
                    view = view / self.view_stds[-1]
            transformed_views.append(view)
        return transformed_views
Example #11
def run_rpca(data, scale=False, max_value=10.0, nPC=50, random_state=0):
    """ smooth outliers, then no center/scale data """
    start = time.time()

    # Smooth out outliers
    means, variances = mean_variance_axis(data.X, axis=0)
    stds = np.sqrt(variances * (data.X.shape[0] /
                                (data.X.shape[0] - 1)))  # make it unbiased
    assert (stds == 0.0).sum() == 0

    data_new = (data.X.data - means[data.X.indices]) / stds[data.X.indices]
    outliers = data_new > max_value
    data.X.data[outliers] = max_value * stds[data.X.indices[outliers]] + means[
        data.X.indices[outliers]]

    if scale:
        data.X.data /= stds[data.X.indices]

    U, S, VT = randomized_svd(data.X,
                              n_components=nPC,
                              random_state=random_state)
    data.obsm['X_rpca'] = U * S

    end = time.time()
    print("RPCA is done. Time spent = {:.2f}s.".format(end - start))
Example #12
    def fit(self, X, y=None):
        """Don't trust the documentation of this module!

        Compute the mean and std to be used for later scaling.

        Parameters
        ----------
        X : array-like or CSR matrix with shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.
        """
        X = check_array(X, copy=self.copy, accept_sparse="csc",
                        ensure_2d=False)
        if warn_if_not_float(X, estimator=self):
            # Costly conversion, but otherwise the pipeline will break:
            # https://github.com/scikit-learn/scikit-learn/issues/1709
            X = X.astype(np.float32)
        if sparse.issparse(X):
            if self.center_sparse:
                means = []
                vars = []

                # This only works for csc matrices...
                for i in range(X.shape[1]):
                    if X.indptr[i] == X.indptr[i + 1]:
                        means.append(0)
                        vars.append(1)
                    else:
                        vars.append(
                            X.data[X.indptr[i]:X.indptr[i + 1]].var())
                        # If the variance is 0, set all occurrences of this
                        # feature to 1
                        means.append(
                            X.data[X.indptr[i]:X.indptr[i + 1]].mean())
                        if 0.0000001 >= vars[-1] >= -0.0000001:
                            means[-1] -= 1

                self.std_ = np.sqrt(np.array(vars))
                self.std_[np.array(vars) == 0.0] = 1.0
                self.mean_ = np.array(means)

                return self
            elif self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
            else:
                self.mean_ = None

            if self.with_std:
                var = mean_variance_axis(X, axis=0)[1]
                self.std_ = np.sqrt(var)
                self.std_[var == 0.0] = 1.0
            else:
                self.std_ = None
            return self
        else:
            self.mean_, self.std_ = _mean_and_std(
                X, axis=0, with_mean=self.with_mean, with_std=self.with_std)
            return self
Example #13
    def fit_transform(self, X, y=None):
        X = self._validate_data(X,
                                accept_sparse=['csr', 'csc'],
                                ensure_min_features=2)
        # "arpack" algo
        U, Sigma, VT = svds(X, k=self.n_components)
        # svds doesn't abide by scipy.linalg.svd/randomized_svd
        # conventions, so reverse its outputs.
        Sigma = Sigma[::-1]
        U, VT = svd_flip(U[:, ::-1], VT[::-1])

        # Store:
        # left singular vectors: terms
        self.U_ = U
        # right singular vectors: documents
        self.V_ = VT.T
        # singular values
        self.sigma_ = Sigma

        # Calculate explained variance & explained variance ratio
        X_transformed = U * Sigma
        self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)
        if sp.issparse(X):
            _, full_var = mean_variance_axis(X, axis=0)
            full_var = full_var.sum()
        else:
            full_var = np.var(X, axis=0).sum()
        self.explained_variance_ratio_ = exp_var / full_var

        return X_transformed
Example #14
def test_scale_function_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)

    X_scaled = scale(X, with_mean=False)
    assert_false(np.any(np.isnan(X_scaled)))

    X_csr_scaled = scale(X_csr, with_mean=False)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    # test csc has same outcome
    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())

    # raises value error on axis != 0
    assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)

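    # After scaling to unit variance, the variances returned by
    # mean_variance_axis below match X_scaled.std(axis=0) numerically
    # (each entry is 0 or 1), which is why the "std" name compares cleanly.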
    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
Example #16
def sparse_center_data(X, y, fit_intercept, normalize=False):
    """
    Compute information needed to center data to have mean zero along
    axis 0. Be aware that X will not be centered since it would break
    the sparsity, but will be normalized if asked so.
    """
    if fit_intercept:
        # we might require not to change the csr matrix sometimes
        # store a copy if normalize is True.
        # Change dtype to float64 since mean_variance_axis accepts
        # it that way.
        if sp.isspmatrix(X) and X.getformat() == 'csr':
            X = sp.csr_matrix(X, copy=normalize, dtype=np.float64)
        else:
            X = sp.csc_matrix(X, copy=normalize, dtype=np.float64)

        X_offset, X_var = mean_variance_axis(X, axis=0)
        if normalize:
            # transform variance to std in-place
            X_var *= X.shape[0]
            X_std = np.sqrt(X_var, X_var)
            del X_var
            X_std[X_std == 0] = 1
            inplace_column_scale(X, 1. / X_std)
        else:
            X_std = np.ones(X.shape[1])
        y_offset = y.mean(axis=0)
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1])
        X_std = np.ones(X.shape[1])
        y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_std
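
In the normalize branch above, multiplying the per-column variance by
X.shape[0] turns it into the sum of squared deviations, so the in-place
np.sqrt(X_var, X_var) yields the centered column norms used for scaling. A
quick numeric check against a dense reference (illustrative sizes):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

X = sp.random(30, 4, density=0.5, format="csc", dtype=np.float64,
              random_state=0)
_, X_var = mean_variance_axis(X, axis=0)
X_var *= X.shape[0]
X_std = np.sqrt(X_var)
Xd = X.toarray()
assert np.allclose(X_std, np.linalg.norm(Xd - Xd.mean(axis=0), axis=0))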
Example #17
    def _display_dataset(self, dataset):
        eps = 0.00001
        linewidth = dataset.linewidth
        delta = self.max_value - self.min_value
        density_delta = 1.2 * delta
        if delta > 0:
            x = np.arange(self.min_value - 0.1 * delta,
                          self.max_value + 0.1 * delta,
                          density_delta / self.num_points)
        else:
            x = np.array([self.min_value - 2 * eps, self.max_value + 2 * eps])
        if isinstance(dataset.values, spmatrix):
            variance = mean_variance_axis(dataset.values, axis=0)[1]
        else:
            variance = np.var(dataset.values)
        if variance < eps:
            linewidth += 2
            mean = np.mean(dataset.values)
            x = np.sort(np.append(x, [mean, mean - eps, mean + eps]))
            density = [1 if v == mean else 0 for v in x]
        else:
            self.kde.fit(dataset.values)
            x_density = [[y] for y in x]
            # kde.score_samples returns the 'log' of the density
            log_density = self.kde.score_samples(x_density).tolist()
            density = list(map(math.exp, log_density))
        self.ax.plot(x,
                     density,
                     label=dataset.label,
                     color=dataset.color,
                     linewidth=linewidth,
                     linestyle=dataset.linestyle)
Example #18
    def compute_scoring_func(self, func):
        if func == 'variance':
            features = self.instances.features.get_values()
            annotations = self.instances.annotations.get_labels()
            if isinstance(features, spmatrix):
                variance = mean_variance_axis(features, axis=0)[1]
            else:
                variance = features.var(axis=0)
            return variance, None

        features = self.annotated_instances.features.get_values()
        annotations = self.annotated_instances.annotations.get_supervision(
            self.multiclass)
        if func == 'f_classif':
            return f_classif(features, annotations)
        elif func == 'mutual_info_classif':
            if isinstance(features, spmatrix):
                discrete_indexes = True
            else:
                features_types = self.instances.features.info.types
                discrete_indexes = [
                    i for i, t in enumerate(features_types)
                    if t == FeatureType.binary
                ]
                if not discrete_indexes:
                    discrete_indexes = False
            return (mutual_info_classif(features,
                                        annotations,
                                        discrete_features=discrete_indexes),
                    None)
        elif func == 'chi2':
            return chi2(features, annotations)
        else:
            assert (False)
def _tolerance(X, tol):
    """Return a tolerance which is independent of the dataset"""
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
    else:
        variances = np.var(X, axis=0)
    return np.mean(variances) * tol
def test_mean_variance_axis1():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0

    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)

    X_csr = sp.csr_matrix(X_lil)
    X_csc = sp.csc_matrix(X_lil)

    expected_dtypes = [(np.float32, np.float32),
                       (np.float64, np.float64),
                       (np.int32, np.float64),
                       (np.int64, np.float64)]

    for input_dtype, output_dtype in expected_dtypes:
        X_test = X.astype(input_dtype)
        for X_sparse in (X_csr, X_csc):
            X_sparse = X_sparse.astype(input_dtype)
            X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
            assert_equal(X_means.dtype, output_dtype)
            assert_equal(X_vars.dtype, output_dtype)
            assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
Example #21
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

    assert_raises(ValueError, StandardScaler().fit, X_csr)

    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    scaler = StandardScaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
    X_csc_scaled = scaler_csr.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
    def mapper(X):
        """Calculate statistics for every numpy or scipy block."""
        X = check_array(X, ('csr', 'csc'), dtype=np.float64)
        if hasattr(X, "toarray"):   # sparse matrix
            mean, var = mean_variance_axis(X, axis=0)
        else:
            mean, var = np.mean(X, axis=0), np.var(X, axis=0)
        return X.shape[0], mean, var
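
The (n_samples, mean, var) triple returned per block is exactly what is needed
to merge statistics across blocks with the pairwise update of Chan et al.; the
matching reducer is not shown in the snippet, but a minimal merge of two
blocks could look like this (an illustrative helper, not part of the original
code):

import numpy as np

def combine(n_a, mean_a, var_a, n_b, mean_b, var_b):
    """Merge per-block means and population variances (ddof=0)."""
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    # Combine sums of squared deviations, then renormalize.
    m2 = var_a * n_a + var_b * n_b + delta ** 2 * n_a * n_b / n
    return n, mean, m2 / n

X = np.random.RandomState(0).randn(10, 3)
n, mean, var = combine(6, X[:6].mean(0), X[:6].var(0),
                       4, X[6:].mean(0), X[6:].var(0))
assert np.allclose(mean, X.mean(0)) and np.allclose(var, X.var(0))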
Example #25
    def fit(self, X, y=None):
        if sparse.issparse(X):
            var = mean_variance_axis(X, axis=0)[1]
            deviations = np.sqrt(var)
        else:
            deviations = np.std(X, axis=0)
        self.passed_idx = deviations > self.threshold
        return self
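
For context, the fit above is a standard-deviation variant of sklearn's
VarianceThreshold: features pass when their std exceeds the threshold. A
functional sketch of the same selection (std_mask is an illustrative helper,
not part of the original class):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

def std_mask(X, threshold):
    if sp.issparse(X):
        deviations = np.sqrt(mean_variance_axis(X, axis=0)[1])
    else:
        deviations = np.std(X, axis=0)
    return deviations > threshold

X = sp.random(50, 8, density=0.4, format="csr", random_state=1)
print(std_mask(X, 0.1))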
Example #26
def _pca_with_sparse(X, npcs, solver='arpack', mu=None, seed=0):
    random_state = check_random_state(seed)
    np.random.set_state(random_state.get_state())
    random_init = np.random.rand(np.min(X.shape))
    X = check_array(X, accept_sparse=['csr', 'csc'])

    if mu is None:
        mu = X.mean(0).A.flatten()[None, :]
    mdot = mu.dot
    mmat = mdot
    mhdot = mu.T.dot
    mhmat = mu.T.dot
    Xdot = X.dot
    Xmat = Xdot
    XHdot = X.T.conj().dot
    XHmat = XHdot
    ones = np.ones(X.shape[0])[None, :].dot

    def matvec(x):
        return Xdot(x) - mdot(x)

    def matmat(x):
        return Xmat(x) - mmat(x)

    def rmatvec(x):
        return XHdot(x) - mhdot(ones(x))

    def rmatmat(x):
        return XHmat(x) - mhmat(ones(x))

    XL = sp.sparse.linalg.LinearOperator(
        matvec=matvec,
        dtype=X.dtype,
        matmat=matmat,
        shape=X.shape,
        rmatvec=rmatvec,
        rmatmat=rmatmat,
    )

    u, s, v = sp.sparse.linalg.svds(XL, solver=solver, k=npcs, v0=random_init)
    u, v = svd_flip(u, v)
    idx = np.argsort(-s)
    v = v[idx, :]

    X_pca = (u * s)[:, idx]
    ev = s[idx]**2 / (X.shape[0] - 1)

    total_var = sf.mean_variance_axis(X, axis=0)[1].sum()
    ev_ratio = ev / total_var

    output = {
        'X_pca': X_pca,
        'variance': ev,
        'variance_ratio': ev_ratio,
        'components': v,
    }
    return output
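
The LinearOperator above performs implicit mean-centering: matvec computes
(X - 1 * mu) @ x without ever materializing the dense centered matrix, because
mu.dot(x) supplies the column-mean correction as a broadcastable term. A
compact check of that identity on a small matrix (illustrative sizes):

import numpy as np
import scipy.sparse

X = scipy.sparse.random(20, 5, density=0.4, format="csr", random_state=0)
mu = np.asarray(X.mean(axis=0))        # shape (1, 5)
x = np.random.RandomState(0).rand(5)

implicit = X.dot(x) - mu.dot(x)        # the operator's matvec
explicit = (X.toarray() - mu).dot(x)   # dense centered product
assert np.allclose(implicit, explicit)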
def test_incr_mean_variance_axis_weighted_axis1(Xw, X, weights,
                                                sparse_constructor, dtype):
    axis = 1
    Xw_sparse = sparse_constructor(Xw).astype(dtype)
    X_sparse = sparse_constructor(X).astype(dtype)

    last_mean = np.zeros(np.shape(Xw)[0], dtype=dtype)
    last_var = np.zeros_like(last_mean, dtype=dtype)
    last_n = np.zeros_like(last_mean, dtype=np.int64)
    means0, vars0, n_incr0 = incr_mean_variance_axis(X=X_sparse,
                                                     axis=axis,
                                                     last_mean=last_mean,
                                                     last_var=last_var,
                                                     last_n=last_n,
                                                     weights=None)

    means_w0, vars_w0, n_incr_w0 = incr_mean_variance_axis(X=Xw_sparse,
                                                           axis=axis,
                                                           last_mean=last_mean,
                                                           last_var=last_var,
                                                           last_n=last_n,
                                                           weights=weights)

    assert means_w0.dtype == dtype
    assert vars_w0.dtype == dtype
    assert n_incr_w0.dtype == dtype

    means_simple, vars_simple = mean_variance_axis(X=X_sparse, axis=axis)

    assert_array_almost_equal(means0, means_w0)
    assert_array_almost_equal(means0, means_simple)
    assert_array_almost_equal(vars0, vars_w0)
    assert_array_almost_equal(vars0, vars_simple)
    assert_array_almost_equal(n_incr0, n_incr_w0)

    # check second round for incremental
    means1, vars1, n_incr1 = incr_mean_variance_axis(X=X_sparse,
                                                     axis=axis,
                                                     last_mean=means0,
                                                     last_var=vars0,
                                                     last_n=n_incr0,
                                                     weights=None)

    means_w1, vars_w1, n_incr_w1 = incr_mean_variance_axis(X=Xw_sparse,
                                                           axis=axis,
                                                           last_mean=means_w0,
                                                           last_var=vars_w0,
                                                           last_n=n_incr_w0,
                                                           weights=weights)

    assert_array_almost_equal(means1, means_w1)
    assert_array_almost_equal(vars1, vars_w1)
    assert_array_almost_equal(n_incr1, n_incr_w1)

    assert means_w1.dtype == dtype
    assert vars_w1.dtype == dtype
    assert n_incr_w1.dtype == dtype
def _tolerance(X, rtol):
    """Compute absolute tolerance from the relative tolerance"""
    if rtol == 0.0:
        return rtol
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
        mean_var = np.mean(variances)
    else:
        mean_var = np.var(X, axis=0).mean()
    return mean_var * rtol
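
Both _tolerance helpers convert a relative tolerance into an absolute one by
scaling it with the mean per-feature variance, so the resulting convergence
threshold tracks the overall scale of the data. A quick check of that scaling
behavior (illustrative values):

import numpy as np

X = np.random.RandomState(0).randn(100, 3)
rtol = 1e-4
tol = np.var(X, axis=0).mean() * rtol
tol_rescaled = np.var(10 * X, axis=0).mean() * rtol
assert np.isclose(tol_rescaled, 100 * tol)  # variance scales quadratically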
Example #29
    def partial_fit(self, X, y=None):
        """Online computation of mean and std on X for later scaling.
        All of X is processed as a single batch. This is intended for cases
        when `fit` is not feasible due to very large number of `n_samples`
        or because X is read from a continuous stream.

        The algorithm for incremental mean and std is given in Equation 1.5a,b
        in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms
        for computing the sample variance: Analysis and recommendations."
        The American Statistician 37.3 (1983): 242-247.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.

        y: Passthrough for ``Pipeline`` compatibility.
        """
        X = check_array(X,
                        accept_sparse=('csr', 'csc'),
                        copy=self.copy,
                        ensure_2d=False,
                        warn_on_dtype=True,
                        estimator=self,
                        dtype=FLOAT_DTYPES)

        # Even in the case of `with_mean=False`, we update the mean anyway
        # This is needed for the incremental computation of the var
        # See incr_mean_variance_axis and _incremental_mean_variance_axis

        if not sparse.issparse(X):
            return super(SparseScaler, self).partial_fit(X)

        if self.with_std:
            # First pass
            if not hasattr(self, 'n_samples_seen_'):
                self.mean_, self.var_ = mean_variance_axis(X, axis=0)
                n = X.shape[0]
                self.n_samples_seen_ = n
            # Next passes
            else:
                self.mean_, self.var_, self.n_samples_seen_ = \
                    incr_mean_variance_axis(X, axis=0,
                                            last_mean=self.mean_,
                                            last_var=self.var_,
                                            last_n=self.n_samples_seen_)

        if self.with_std:
            self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
        else:
            self.scale_ = None

        return self
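
The two branches above follow sklearn's incremental pattern: mean_variance_axis
on the first batch, then incr_mean_variance_axis to fold in later batches.
Streaming the data in two chunks should reproduce the single-pass statistics;
a minimal sketch, assuming float64 CSR input and a recent sklearn whose
incr_mean_variance_axis takes keyword arguments:

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import (mean_variance_axis,
                                       incr_mean_variance_axis)

X = sp.random(100, 6, density=0.3, format="csr", dtype=np.float64,
              random_state=0)
A, B = X[:60], X[60:]

mean, var = mean_variance_axis(A, axis=0)            # first batch
n = np.full(X.shape[1], A.shape[0], dtype=np.int64)  # samples seen per feature
mean, var, n = incr_mean_variance_axis(B, axis=0, last_mean=mean,
                                       last_var=var, last_n=n)

mean_full, var_full = mean_variance_axis(X, axis=0)
assert np.allclose(mean, mean_full) and np.allclose(var, var_full)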
Example #30
    def fit_transform(self, X, y=None):
        """ Fit LSI model to X and perform dimensionality reduction on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training data.

        Returns
        -------

        X_new : array, shape (n_samples, n_components)
            Reduced version of X. This will always be a dense array.
        """
        X = as_float_array(X, copy=False)
        random_state = check_random_state(self.random_state)

        # If sparse and not csr or csc, convert to csr
        if sp.issparse(X) and X.getformat() not in ["csr", "csc"]:
            X = X.tocsr()

        if self.algorithm == "arpack":
            U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            Sigma = Sigma[::-1]
            U, VT = svd_flip(U[:, ::-1], VT[::-1])

        elif self.algorithm == "randomized":
            k = self.n_components
            n_features = X.shape[1]
            if k >= n_features:
                raise ValueError("n_components must be < n_features;"
                                 " got %d >= %d" % (k, n_features))
            U, Sigma, VT = randomized_svd(X,
                                          self.n_components,
                                          n_iter=self.n_iter,
                                          random_state=random_state)
        else:
            raise ValueError("unknown algorithm %r" % self.algorithm)

        self.components_ = VT
        self.Sigma = Sigma[:self.n_components]

        # Calculate explained variance & explained variance ratio
        X_transformed = np.dot(U, np.diag(Sigma))
        self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)
        if sp.issparse(X):
            _, full_var = mean_variance_axis(X, axis=0)
            full_var = full_var.sum()
        else:
            full_var = np.var(X, axis=0).sum()
        self.explained_variance_ratio_ = exp_var / full_var
        return X_transformed
Example #31
def truncated_svd(x, n_components):
    u, sigma, v_t = randomized_svd(x, n_components, n_iter=5, random_state=42)
    x_trans = np.dot(u, np.diag(sigma))
    loading = np.transpose(v_t)
    exp_var = np.var(x_trans, axis=0)
    _, full_var = mean_variance_axis(x, axis=0)
    full_var = full_var.sum()
    exp_var_ratio = exp_var / full_var
    # exp_var_ratio = sigma / u.shape[0]
    #     svd = TruncatedSVD(n_components=n_components)
    #     xReduced = svd.fit_transform(patMat)
    return x_trans, loading, exp_var_ratio
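
Both SVD snippets compute the explained-variance ratio by dividing the
variance of the projected coordinates by the total per-feature variance of X,
which for sparse input comes from mean_variance_axis (the data is never
centered). A sanity check that the sparse total matches the dense one
(illustrative sizes):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.extmath import randomized_svd
from sklearn.utils.sparsefuncs import mean_variance_axis

x = sp.random(200, 30, density=0.2, format="csr", random_state=42)
u, sigma, vt = randomized_svd(x, n_components=5, n_iter=5, random_state=42)
exp_var = np.var(u * sigma, axis=0)
_, full_var = mean_variance_axis(x, axis=0)
ratio = exp_var / full_var.sum()

assert np.isclose(full_var.sum(), np.var(x.toarray(), axis=0).sum())
print(ratio, ratio.sum())  # top-5 components' share of the total variance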
def test_mean_variance_illegal_axis():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)
    with pytest.raises(ValueError):
        mean_variance_axis(X_csr, axis=-3)
    with pytest.raises(ValueError):
        mean_variance_axis(X_csr, axis=2)
    with pytest.raises(ValueError):
        mean_variance_axis(X_csr, axis=-1)

    with pytest.raises(ValueError):
        incr_mean_variance_axis(X_csr,
                                axis=-3,
                                last_mean=None,
                                last_var=None,
                                last_n=None)

    with pytest.raises(ValueError):
        incr_mean_variance_axis(X_csr,
                                axis=2,
                                last_mean=None,
                                last_var=None,
                                last_n=None)

    with pytest.raises(ValueError):
        incr_mean_variance_axis(X_csr,
                                axis=-1,
                                last_mean=None,
                                last_var=None,
                                last_n=None)
Example #33
def _sparse_sub_standardize(X, mu, var, rows=False):
    x, y = X.nonzero()
    if not rows:
        Xs = X.copy()
        Xs.data[:] = (X.data - mu[y]) / var[y]
    else:
        mu, var = sf.mean_variance_axis(X, axis=1)
        var = var ** 0.5
        var[var == 0] = 1
        Xs = X.copy()
        Xs.data[:] = (X.data - mu[x]) / var[x]
    Xs.data[Xs.data < 0] = 0
    Xs.eliminate_zeros()
    return Xs
Example #34
def test_mean_variance_axis0_precision(dtype, sparse_constructor):
    # Check that there's no big loss of precision when the real variance is
    # exactly 0. (#19766)
    rng = np.random.RandomState(0)
    X = np.full(fill_value=100.0, shape=(1000, 1), dtype=dtype)
    # Add some missing records which should be ignored:
    missing_indices = rng.choice(np.arange(X.shape[0]), 10, replace=False)
    X[missing_indices, 0] = np.nan
    X = sparse_constructor(X)

    # Random positive weights:
    sample_weight = rng.rand(X.shape[0]).astype(dtype)

    _, var = mean_variance_axis(X, weights=sample_weight, axis=0)

    assert var < np.finfo(dtype).eps
Example #35
    def fit(self, X, y=None):
        """Fits LSA model on training data X."""
        (host, port) = FrovedisServer.getServerInstance()
        if self.algorithm != "arpack":
            raise ValueError("algorithm: currently Frovedis supports only " \
                              + "arpack!")
        if isinstance(X, FrovedisCRSMatrix):
            self.var_sum = None
        elif isinstance(X, FrovedisRowmajorMatrix):
            to_sample = False  # ddof = 0 in np.var(...)
            isdense = True
            self.var_sum = compute_var_sum(host, port, X.get(), to_sample,
                                           isdense, X.get_dtype())
            excpt = check_server_exception()
            if excpt["status"]:
                raise RuntimeError(excpt["info"])
        elif issparse(X):
            try:
                from sklearn.utils.sparsefuncs import mean_variance_axis
                _, full_var = mean_variance_axis(X, axis=0)
                self.var_sum = full_var.sum()
            except ImportError:  # for systems without sklearn
                self.var_sum = None
        else:
            self.var_sum = np.var(X, axis=0).sum()
        # if X is not a sparse data, it would be converted as rowmajor matrix
        inp_data = FrovedisFeatureData(X, \
                     caller = "[" + self.__class__.__name__ + "] fit: ",\
                     dense_kind='rowmajor', densify=False)
        X = inp_data.get()
        x_dtype = inp_data.get_dtype()
        x_itype = inp_data.get_itype()
        dense = inp_data.is_dense()
        self.__mdtype = x_dtype
        if dense and self.use_shrink:
            raise ValueError("fit: use_shrink is applicable only for " \
                             + "sparse data!")

        res = compute_truncated_svd(host, port, X.get(), self.n_components,
                                    x_dtype, x_itype, dense, self.use_shrink)
        excpt = check_server_exception()
        if excpt["status"]:
            raise RuntimeError(excpt["info"])
        self.svd_res_ = svdResult(res, TypeUtil.to_numpy_dtype(x_dtype))
        return self
Example #36
    def fit(self, X, y=None):
        """Learn empirical variances from X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Sample vectors from which to compute variances.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self
        """
        X = check_array(X, ('csr', 'csc'), dtype=np.float64)

        if hasattr(X, "toarray"):   # sparse matrix
            _, self.variances_ = mean_variance_axis(X, axis=0)
        else:
            self.variances_ = np.var(X, axis=0)

        return self
Example #38
def test_incr_mean_variance_axis():
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        data_chunks = [
            rng.randint(0, 2, size=n_features) for i in range(n_samples)
        ]

        # default params for incr_mean_variance
        last_mean = np.zeros(n_features)
        last_var = np.zeros_like(last_mean)
        last_n = 0

        # Test errors
        X = np.array(data_chunks[0])
        X = np.atleast_2d(X)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        assert_raises(TypeError, incr_mean_variance_axis, axis, last_mean,
                      last_var, last_n)
        assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis,
                      last_mean, last_var, last_n)

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)  # X.shape[axis] picks # samples

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_csc = sp.csc_matrix(X_lil)

        expected_dtypes = [(np.float32, np.float32), (np.float64, np.float64),
                           (np.int32, np.float64), (np.int64, np.float64)]

        for input_dtype, output_dtype in expected_dtypes:
            for X_sparse in (X_csr, X_csc):
                X_sparse = X_sparse.astype(input_dtype)
                X_means, X_vars = mean_variance_axis(X_sparse, axis)
                X_means_incr, X_vars_incr, n_incr = \
                    incr_mean_variance_axis(X_sparse, axis, last_mean,
                                            last_var, last_n)
                assert_equal(X_means_incr.dtype, output_dtype)
                assert_equal(X_vars_incr.dtype, output_dtype)
                assert_array_almost_equal(X_means, X_means_incr)
                assert_array_almost_equal(X_vars, X_vars_incr)
                assert_equal(X.shape[axis], n_incr)
Example #39
def test_incr_mean_variance_axis():
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        data_chunks = [rng.randint(0, 2, size=n_features)
                       for i in range(n_samples)]

        # default params for incr_mean_variance
        last_mean = np.zeros(n_features)
        last_var = np.zeros_like(last_mean)
        last_n = 0

        # Test errors
        X = np.array(data_chunks[0])
        X = np.atleast_2d(X)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis,
                      last_mean, last_var, last_n)

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)  # X.shape[axis] picks # samples

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # All data but as float
        X = X.astype(np.float32)
        X_csr = X_csr.astype(np.float32)
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        X_csc = X_csc.astype(np.float32)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)
def test_incr_mean_variance_axis():
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        data_chunks = [rng.randint(0, 2, size=n_features)
                       for i in range(n_samples)]

        # default params for incr_mean_variance
        last_mean = np.zeros(n_features)
        last_var = np.zeros_like(last_mean)
        last_n = np.zeros_like(last_mean, dtype=np.int64)

        # Test errors
        X = np.array(data_chunks[0])
        X = np.atleast_2d(X)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis,
                      last_mean, last_var, last_n)

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)  # X.shape[axis] picks # samples

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_csc = sp.csc_matrix(X_lil)

        expected_dtypes = [(np.float32, np.float32),
                           (np.float64, np.float64),
                           (np.int32, np.float64),
                           (np.int64, np.float64)]

        for input_dtype, output_dtype in expected_dtypes:
            for X_sparse in (X_csr, X_csc):
                X_sparse = X_sparse.astype(input_dtype)
                last_mean = last_mean.astype(output_dtype)
                last_var = last_var.astype(output_dtype)
                X_means, X_vars = mean_variance_axis(X_sparse, axis)
                X_means_incr, X_vars_incr, n_incr = \
                    incr_mean_variance_axis(X_sparse, axis, last_mean,
                                            last_var, last_n)
                assert_equal(X_means_incr.dtype, output_dtype)
                assert_equal(X_vars_incr.dtype, output_dtype)
                assert_array_almost_equal(X_means, X_means_incr)
                assert_array_almost_equal(X_vars, X_vars_incr)
                assert_equal(X.shape[axis], n_incr)
Example #41
def test_scaler_int():
    # test that scaler converts integer input to floating
    # for both sparse and dense matrices
    rng = np.random.RandomState(42)
    X = rng.randint(20, size=(4, 5))
    X[:, 0] = 0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        scaler = StandardScaler(with_mean=False).fit(X)
        X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
        X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
        X_csc_scaled = scaler_csr.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0),
        [0., 1.109, 1.856, 21., 1.559], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(
        X_csr_scaled.astype(np.float64), 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)