def test_mean_var_sparse():
    from sklearn.utils.sparsefuncs import mean_variance_axis

    csr64 = sp.random(10000, 1000, format="csr", dtype=np.float64)
    csc64 = csr64.tocsc()

    # Test that we're equivalent for 64 bit
    for mtx, ax in product((csr64, csc64), (0, 1)):
        scm, scv = sc.pp._utils._get_mean_var(mtx, axis=ax)
        skm, skv = mean_variance_axis(mtx, ax)
        skv *= mtx.shape[ax] / (mtx.shape[ax] - 1)

        assert np.allclose(scm, skm)
        assert np.allclose(scv, skv)

    csr32 = csr64.astype(np.float32)
    csc32 = csc64.astype(np.float32)

    # Test whether ours is more accurate for 32 bit
    for mtx32, mtx64 in [(csc32, csc64), (csr32, csr64)]:
        scm32, scv32 = sc.pp._utils._get_mean_var(mtx32)
        scm64, scv64 = sc.pp._utils._get_mean_var(mtx64)
        skm32, skv32 = mean_variance_axis(mtx32, 0)
        skm64, skv64 = mean_variance_axis(mtx64, 0)
        # Bessel's correction: mean_variance_axis returns the population
        # variance, _get_mean_var the sample variance.
        skv32 *= mtx32.shape[0] / (mtx32.shape[0] - 1)
        skv64 *= mtx64.shape[0] / (mtx64.shape[0] - 1)

        m_resid_sc = np.mean(np.abs(scm64 - scm32))
        m_resid_sk = np.mean(np.abs(skm64 - skm32))
        v_resid_sc = np.mean(np.abs(scv64 - scv32))
        v_resid_sk = np.mean(np.abs(skv64 - skv32))

        assert m_resid_sc < m_resid_sk
        assert v_resid_sc < v_resid_sk
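The n / (n - 1) factor above converts the population variance (ddof=0) that sklearn's mean_variance_axis returns into a sample variance. A minimal sketch of that conversion on a small sparse matrix (the values are arbitrary):

    import numpy as np
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import mean_variance_axis

    X = sp.csr_matrix(np.array([[0.0, 1.0], [2.0, 0.0], [4.0, 3.0]]))
    n = X.shape[0]
    mean, pop_var = mean_variance_axis(X, axis=0)   # ddof=0 variance
    sample_var = pop_var * n / (n - 1)              # Bessel's correction
    assert np.allclose(sample_var, np.var(X.toarray(), axis=0, ddof=1))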
def test_mean_variance_axis1():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0
    X_csr = sp.csr_matrix(X_lil)

    X_means, X_vars = mean_variance_axis(X_csr, axis=1)
    assert_array_almost_equal(X_means, np.mean(X, axis=1))
    assert_array_almost_equal(X_vars, np.var(X, axis=1))

    X_csc = sp.csc_matrix(X_lil)
    X_means, X_vars = mean_variance_axis(X_csc, axis=1)
    assert_array_almost_equal(X_means, np.mean(X, axis=1))
    assert_array_almost_equal(X_vars, np.var(X, axis=1))
    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)

    X = X.astype(np.float32)
    X_csr = X_csr.astype(np.float32)
    X_csc = X_csc.astype(np.float32)
    X_means, X_vars = mean_variance_axis(X_csr, axis=1)
    assert_array_almost_equal(X_means, np.mean(X, axis=1))
    assert_array_almost_equal(X_vars, np.var(X, axis=1))
    X_means, X_vars = mean_variance_axis(X_csc, axis=1)
    assert_array_almost_equal(X_means, np.mean(X, axis=1))
    assert_array_almost_equal(X_vars, np.var(X, axis=1))
    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)
def test_mean_variance_axis1():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0

    with pytest.raises(TypeError):
        mean_variance_axis(X_lil, axis=1)

    X_csr = sp.csr_matrix(X_lil)
    X_csc = sp.csc_matrix(X_lil)

    expected_dtypes = [(np.float32, np.float32),
                       (np.float64, np.float64),
                       (np.int32, np.float64),
                       (np.int64, np.float64)]

    for input_dtype, output_dtype in expected_dtypes:
        X_test = X.astype(input_dtype)
        for X_sparse in (X_csr, X_csc):
            X_sparse = X_sparse.astype(input_dtype)
            X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
            assert X_means.dtype == output_dtype
            assert X_vars.dtype == output_dtype
            assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
def nanmean(x, axis=None):
    """Equivalent of np.nanmean that supports sparse or dense matrices."""
    if not sp.issparse(x):
        means = np.nanmean(x, axis=axis)
    elif axis is None:
        # Every column has the same number of rows, so the mean of the
        # per-column means equals the global mean.
        means, _ = mean_variance_axis(x, axis=0)
        means = np.nanmean(means)
    else:
        means, _ = mean_variance_axis(x, axis=axis)
    return means
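A quick sanity check of the axis=None branch above (a sketch with arbitrary random data): because every column has the same length, averaging the per-column means reproduces the global mean.

    import numpy as np
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import mean_variance_axis

    x = sp.random(50, 20, density=0.3, format="csr", random_state=0)
    col_means, _ = mean_variance_axis(x, axis=0)
    assert np.isclose(col_means.mean(), x.toarray().mean())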
def high_mem_mean_and_std(loom_file,
                          layer,
                          axis=None,
                          valid_ca=None,
                          valid_ra=None):
    """
    Calculates mean and standard deviation in a high-memory fashion

    Args:
        loom_file (str): Path to loom file containing mC/C counts
        layer (str): Layer containing mC/C counts
        axis (int): Axis to calculate mean and standard deviation
            None: Statistics are for the entire layer
            0: Statistics are for cells
            1: Statistics are for features
        valid_ca (str): Optional, only use cells specified by valid_ca
        valid_ra (str): Optional, only use features specified by valid_ra
    """
    # Get valid indices
    row_idx = get_attr_index(loom_file=loom_file,
                             attr=valid_ra,
                             columns=False,
                             as_bool=False,
                             inverse=False)
    col_idx = get_attr_index(loom_file=loom_file,
                             attr=valid_ca,
                             columns=True,
                             as_bool=False,
                             inverse=False)
    # Get data
    with loompy.connect(loom_file, mode='r') as ds:
        dat = ds.layers[layer].sparse(row_idx, col_idx)
    # Get mean and variance
    if axis == 0:
        my_mean, my_var = sparsefuncs.mean_variance_axis(dat.tocsc(), axis=0)
        my_std = np.sqrt(my_var)
    elif axis == 1:
        my_mean, my_var = sparsefuncs.mean_variance_axis(dat.tocsr(), axis=1)
        my_std = np.sqrt(my_var)
    elif axis is None:
        # Population variance over the whole layer: E[X^2] - (E[X])^2
        my_mean = dat.tocsr().mean(axis=None)
        sqrd = dat.copy()
        sqrd.data **= 2
        my_var = sqrd.sum(axis=None) / (sqrd.shape[0] * sqrd.shape[1]) - my_mean ** 2
        my_std = np.sqrt(my_var)
    else:
        raise ValueError('Unsupported axis value ({})'.format(axis))
    return my_mean, my_std
def _get_mu_std(sam3, sam1, sam2, knn=False):
    g1 = ut.extract_annotation(sam3.adata.var_names, 0, ";")
    g2 = ut.extract_annotation(sam3.adata.var_names, 1, ";")
    if knn:
        mu1, var1 = sf.mean_variance_axis(sam1.adata[:, g1].layers["X_knn_avg"], axis=0)
        mu2, var2 = sf.mean_variance_axis(sam2.adata[:, g2].layers["X_knn_avg"], axis=0)
    else:
        mu1, var1 = sf.mean_variance_axis(sam1.adata[:, g1].X, axis=0)
        mu2, var2 = sf.mean_variance_axis(sam2.adata[:, g2].X, axis=0)
    # Guard against zero variance, then convert variance to standard deviation
    var1[var1 == 0] = 1
    var2[var2 == 0] = 1
    var1 = var1 ** 0.5
    var2 = var2 ** 0.5
    return mu1, var1, mu2, var2
def fit(self, X, y=None):
    """Don't trust the documentation of this module!

    Compute the mean and std to be used for later scaling.

    Parameters
    ----------
    X : array-like or CSR matrix with shape [n_samples, n_features]
        The data used to compute the mean and standard deviation
        used for later scaling along the features axis.
    """
    X = check_array(X, copy=self.copy, accept_sparse="csc",
                    dtype=np.float32, ensure_2d=False)
    if sparse.issparse(X):
        if self.center_sparse:
            means = []
            vars = []

            # This only works for csc matrices...
            for i in range(X.shape[1]):
                if X.indptr[i] == X.indptr[i + 1]:
                    means.append(0)
                    vars.append(1)
                else:
                    vars.append(X.data[X.indptr[i]:X.indptr[i + 1]].var())
                    # If the variance is 0, set all occurrences of this
                    # feature to 1
                    means.append(X.data[X.indptr[i]:X.indptr[i + 1]].mean())
                    if 0.0000001 >= vars[-1] >= -0.0000001:
                        means[-1] -= 1

            self.std_ = np.sqrt(np.array(vars))
            self.std_[np.array(vars) == 0.0] = 1.0
            self.mean_ = np.array(means)

            return self
        elif self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")
        else:
            self.mean_ = None

        if self.with_std:
            var = mean_variance_axis(X, axis=0)[1]
            self.std_ = np.sqrt(var)
            self.std_[var == 0.0] = 1.0
        else:
            self.std_ = None

        return self
    else:
        self.mean_, self.std_ = _mean_and_std(X, axis=0,
                                              with_mean=self.with_mean,
                                              with_std=self.with_std)
        return self
def _centre_scale(self, views: Iterable[np.ndarray]):
    """
    Removes the mean of the training data and standardizes each view,
    storing the mean and standard deviation computed during training

    :param views: list/tuple of numpy arrays or array likes with the same
        number of rows (samples)
    :return: train_views: the demeaned numpy arrays to be used to fit the model
    """
    self.view_means = []
    self.view_stds = []
    transformed_views = []
    for view in views:
        if issparse(view):
            # mean_variance_axis returns the variance; convert it to a
            # standard deviation before scaling.
            view_mean, view_var = mean_variance_axis(view, axis=0)
            view_std = np.sqrt(view_var)
            view_std[view_std == 0.0] = 1.0
            self.view_means.append(view_mean)
            self.view_stds.append(view_std)
            view = view - self.view_means[-1]
            view = view / self.view_stds[-1]
        else:
            if self.centre:
                view_mean = view.mean(axis=0)
                self.view_means.append(view_mean)
                view = view - self.view_means[-1]
            if self.scale:
                view_std = view.std(axis=0, ddof=1)
                view_std[view_std == 0.0] = 1.0
                self.view_stds.append(view_std)
                view = view / self.view_stds[-1]
        transformed_views.append(view)
    return transformed_views
def run_rpca(data, scale=False, max_value=10.0, nPC=50, random_state=0):
    """Smooth out outliers, then run randomized PCA without centering or
    scaling the data."""
    start = time.time()

    # Smooth out outliers
    means, variances = mean_variance_axis(data.X, axis=0)
    stds = np.sqrt(variances * (data.X.shape[0] / (data.X.shape[0] - 1)))  # make it unbiased
    assert (stds == 0.0).sum() == 0
    data_new = (data.X.data - means[data.X.indices]) / stds[data.X.indices]
    outliers = data_new > max_value
    data.X.data[outliers] = (max_value * stds[data.X.indices[outliers]]
                             + means[data.X.indices[outliers]])

    if scale:
        data.X.data /= stds[data.X.indices]

    U, S, VT = randomized_svd(data.X, n_components=nPC,
                              random_state=random_state)
    data.obsm['X_rpca'] = U * S

    end = time.time()
    print("RPCA is done. Time spent = {:.2f}s.".format(end - start))
def fit(self, X, y=None):
    """Don't trust the documentation of this module!

    Compute the mean and std to be used for later scaling.

    Parameters
    ----------
    X : array-like or CSR matrix with shape [n_samples, n_features]
        The data used to compute the mean and standard deviation
        used for later scaling along the features axis.
    """
    X = check_array(X, copy=self.copy, accept_sparse="csc",
                    ensure_2d=False)
    if warn_if_not_float(X, estimator=self):
        # Costly conversion, but otherwise the pipeline will break:
        # https://github.com/scikit-learn/scikit-learn/issues/1709
        X = X.astype(np.float32)
    if sparse.issparse(X):
        if self.center_sparse:
            means = []
            vars = []

            # This only works for csc matrices...
            for i in range(X.shape[1]):
                if X.indptr[i] == X.indptr[i + 1]:
                    means.append(0)
                    vars.append(1)
                else:
                    vars.append(X.data[X.indptr[i]:X.indptr[i + 1]].var())
                    # If the variance is 0, set all occurrences of this
                    # feature to 1
                    means.append(X.data[X.indptr[i]:X.indptr[i + 1]].mean())
                    if 0.0000001 >= vars[-1] >= -0.0000001:
                        means[-1] -= 1

            self.std_ = np.sqrt(np.array(vars))
            self.std_[np.array(vars) == 0.0] = 1.0
            self.mean_ = np.array(means)

            return self
        elif self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")
        else:
            self.mean_ = None

        if self.with_std:
            var = mean_variance_axis(X, axis=0)[1]
            self.std_ = np.sqrt(var)
            self.std_[var == 0.0] = 1.0
        else:
            self.std_ = None

        return self
    else:
        self.mean_, self.std_ = _mean_and_std(
            X, axis=0, with_mean=self.with_mean, with_std=self.with_std)
        return self
def fit_transform(self, X, y=None):
    X = self._validate_data(X, accept_sparse=['csr', 'csc'],
                            ensure_min_features=2)

    # "arpack" algorithm
    U, Sigma, VT = svds(X, k=self.n_components)
    # svds doesn't abide by scipy.linalg.svd/randomized_svd
    # conventions, so reverse its outputs.
    Sigma = Sigma[::-1]
    U, VT = svd_flip(U[:, ::-1], VT[::-1])

    # Store:
    # left singular vectors: terms
    self.U_ = U
    # right singular vectors: documents
    self.V_ = VT.T
    # singular values
    self.sigma_ = Sigma

    # Calculate explained variance & explained variance ratio
    X_transformed = U * Sigma
    self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)
    if sp.issparse(X):
        _, full_var = mean_variance_axis(X, axis=0)
        full_var = full_var.sum()
    else:
        full_var = np.var(X, axis=0).sum()
    self.explained_variance_ratio_ = exp_var / full_var
    return X_transformed
def test_scale_function_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)

    X_scaled = scale(X, with_mean=False)
    assert_false(np.any(np.isnan(X_scaled)))

    X_csr_scaled = scale(X_csr, with_mean=False)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    # test csc has same outcome
    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())

    # raises value error on axis != 0
    assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)

    # mean_variance_axis returns the variance, which equals the std here
    # because each column's std after scaling is either 0 or 1.
    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
def sparse_center_data(X, y, fit_intercept, normalize=False):
    """
    Compute information needed to center data to have mean zero along
    axis 0. Be aware that X will not be centered since it would break
    the sparsity, but will be normalized if asked so.
    """
    if fit_intercept:
        # we might require not to change the csr matrix sometimes,
        # so store a copy if normalize is True.
        # Change dtype to float64, which mean_variance_axis requires.
        if sp.isspmatrix(X) and X.getformat() == 'csr':
            X = sp.csr_matrix(X, copy=normalize, dtype=np.float64)
        else:
            X = sp.csc_matrix(X, copy=normalize, dtype=np.float64)

        X_offset, X_var = mean_variance_axis(X, axis=0)
        if normalize:
            # transform variance to the scaling factor in-place;
            # sqrt(n * var) is the L2 norm of the centered column
            X_var *= X.shape[0]
            X_std = np.sqrt(X_var, X_var)
            del X_var
            X_std[X_std == 0] = 1
            inplace_column_scale(X, 1. / X_std)
        else:
            X_std = np.ones(X.shape[1])
        y_offset = y.mean(axis=0)
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1])
        X_std = np.ones(X.shape[1])
        y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_std
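The in-place rescaling above relies on the identity n * Var(x) = sum((x_i - mean)^2), so sqrt(n * var) is the L2 norm each column would have after centering. A small check of that identity (values arbitrary):

    import numpy as np

    x = np.array([1.0, 0.0, 3.0, 0.0, 2.0])
    n = x.shape[0]
    pop_var = x.var()  # ddof=0, matching what mean_variance_axis returns
    centered_norm = np.linalg.norm(x - x.mean())
    assert np.isclose(np.sqrt(n * pop_var), centered_norm)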
def _display_dataset(self, dataset):
    eps = 0.00001
    linewidth = dataset.linewidth
    delta = self.max_value - self.min_value
    density_delta = 1.2 * delta
    if delta > 0:
        x = np.arange(self.min_value - 0.1 * delta,
                      self.max_value + 0.1 * delta,
                      density_delta / self.num_points)
    else:
        x = np.array([self.min_value - 2 * eps, self.max_value + 2 * eps])
    if isinstance(dataset.values, spmatrix):
        variance = mean_variance_axis(dataset.values, axis=0)[1]
    else:
        variance = np.var(dataset.values)
    if variance < eps:
        linewidth += 2
        mean = np.mean(dataset.values)
        x = np.sort(np.append(x, [mean, mean - eps, mean + eps]))
        density = [1 if v == mean else 0 for v in x]
    else:
        self.kde.fit(dataset.values)
        x_density = [[y] for y in x]
        # kde.score_samples returns the 'log' of the density
        log_density = self.kde.score_samples(x_density).tolist()
        density = list(map(math.exp, log_density))
    self.ax.plot(x, density,
                 label=dataset.label,
                 color=dataset.color,
                 linewidth=linewidth,
                 linestyle=dataset.linestyle)
def compute_scoring_func(self, func):
    if func == 'variance':
        features = self.instances.features.get_values()
        annotations = self.instances.annotations.get_labels()
        if isinstance(features, spmatrix):
            variance = mean_variance_axis(features, axis=0)[1]
        else:
            variance = features.var(axis=0)
        return variance, None

    features = self.annotated_instances.features.get_values()
    annotations = self.annotated_instances.annotations.get_supervision(
        self.multiclass)
    if func == 'f_classif':
        return f_classif(features, annotations)
    elif func == 'mutual_info_classif':
        if isinstance(features, spmatrix):
            discrete_indexes = True
        else:
            features_types = self.instances.features.info.types
            discrete_indexes = [i for i, t in enumerate(features_types)
                                if t == FeatureType.binary]
            if not discrete_indexes:
                discrete_indexes = False
        return (mutual_info_classif(features, annotations,
                                    discrete_features=discrete_indexes),
                None)
    elif func == 'chi2':
        return chi2(features, annotations)
    else:
        assert False
def _tolerance(X, tol):
    """Scale tol by the mean feature variance, so the resulting tolerance
    is independent of the dataset's scale."""
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
    else:
        variances = np.var(X, axis=0)
    return np.mean(variances) * tol
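A minimal sketch of why this relative tolerance is scale-free (the helper name below is a hypothetical stand-in, and the data are arbitrary): multiplying the data by 10 multiplies the absolute tolerance by 100, since variance scales quadratically, so a convergence criterion built on it behaves the same in any units.

    import numpy as np

    def relative_tolerance(X, tol):  # hypothetical stand-in for _tolerance
        return np.mean(np.var(X, axis=0)) * tol

    rng = np.random.RandomState(0)
    X = rng.randn(100, 3)
    assert np.isclose(relative_tolerance(10 * X, 1e-4),
                      100 * relative_tolerance(X, 1e-4))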
def test_mean_variance_axis1():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0
    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)

    X_csr = sp.csr_matrix(X_lil)
    X_csc = sp.csc_matrix(X_lil)

    expected_dtypes = [(np.float32, np.float32),
                       (np.float64, np.float64),
                       (np.int32, np.float64),
                       (np.int64, np.float64)]

    for input_dtype, output_dtype in expected_dtypes:
        X_test = X.astype(input_dtype)
        for X_sparse in (X_csr, X_csc):
            X_sparse = X_sparse.astype(input_dtype)
            X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
            assert_equal(X_means.dtype, output_dtype)
            assert_equal(X_vars.dtype, output_dtype)
            assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

    assert_raises(ValueError, StandardScaler().fit, X_csr)

    null_transform = StandardScaler(with_mean=False, with_std=False,
                                    copy=True)
    X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    scaler = StandardScaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
    X_csc_scaled = scaler_csr.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
def mapper(X):
    """Calculate per-feature statistics for a single numpy or scipy block."""
    X = check_array(X, ('csr', 'csc'), dtype=np.float64)
    if hasattr(X, "toarray"):  # sparse matrix
        mean, var = mean_variance_axis(X, axis=0)
    else:
        mean, var = np.mean(X, axis=0), np.var(X, axis=0)
    return X.shape[0], mean, var
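The per-block (n, mean, var) triples returned above can be merged with the standard pooled-moments identities. A sketch of a hypothetical reducer (the function name and two-block signature are assumptions, not part of the original pipeline):

    import numpy as np

    def reducer(a, b):
        """Combine two (n, mean, var) triples into one via pooled moments."""
        n_a, mean_a, var_a = a
        n_b, mean_b, var_b = b
        n = n_a + n_b
        delta = mean_b - mean_a
        mean = mean_a + delta * (n_b / n)
        # Pooled population variance: weighted within-block variances plus
        # a between-block correction term.
        var = (n_a * var_a + n_b * var_b) / n + (n_a * n_b) * (delta / n) ** 2
        return n, mean, var

    # Quick check against single-pass statistics on arbitrary data:
    rng = np.random.RandomState(0)
    X1, X2 = rng.randn(40, 3), rng.randn(60, 3)
    n, m, v = reducer((40, X1.mean(0), X1.var(0)),
                      (60, X2.mean(0), X2.var(0)))
    X = np.vstack([X1, X2])
    assert np.allclose(m, X.mean(0)) and np.allclose(v, X.var(0))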
def fit(self, X, y=None):
    if sparse.issparse(X):
        var = mean_variance_axis(X, axis=0)[1]
        deviations = np.sqrt(var)
    else:
        deviations = np.std(X, axis=0)
    self.passed_idx = deviations > self.threshold
    return self
def _pca_with_sparse(X, npcs, solver='arpack', mu=None, seed=0):
    random_state = check_random_state(seed)
    np.random.set_state(random_state.get_state())
    random_init = np.random.rand(np.min(X.shape))
    X = check_array(X, accept_sparse=['csr', 'csc'])

    if mu is None:
        mu = X.mean(0).A.flatten()[None, :]

    # Implicitly centered operator (X - 1 @ mu) without densifying X
    mdot = mu.dot
    mmat = mdot
    mhdot = mu.T.dot
    mhmat = mu.T.dot
    Xdot = X.dot
    Xmat = Xdot
    XHdot = X.T.conj().dot
    XHmat = XHdot
    ones = np.ones(X.shape[0])[None, :].dot

    def matvec(x):
        return Xdot(x) - mdot(x)

    def matmat(x):
        return Xmat(x) - mmat(x)

    def rmatvec(x):
        return XHdot(x) - mhdot(ones(x))

    def rmatmat(x):
        return XHmat(x) - mhmat(ones(x))

    XL = sp.sparse.linalg.LinearOperator(
        matvec=matvec,
        dtype=X.dtype,
        matmat=matmat,
        shape=X.shape,
        rmatvec=rmatvec,
        rmatmat=rmatmat,
    )

    u, s, v = sp.sparse.linalg.svds(XL, solver=solver, k=npcs, v0=random_init)
    u, v = svd_flip(u, v)
    idx = np.argsort(-s)
    v = v[idx, :]
    X_pca = (u * s)[:, idx]
    ev = s[idx] ** 2 / (X.shape[0] - 1)

    total_var = sf.mean_variance_axis(X, axis=0)[1].sum()
    ev_ratio = ev / total_var

    output = {
        'X_pca': X_pca,
        'variance': ev,
        'variance_ratio': ev_ratio,
        'components': v,
    }
    return output
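A quick sketch (on a small arbitrary random matrix) confirming that an implicitly centered operator like the one above matches explicit dense centering:

    import numpy as np
    import scipy.sparse as sp
    import scipy.sparse.linalg as spla

    rng = np.random.RandomState(0)
    X = sp.random(30, 10, density=0.3, format="csr", random_state=rng)
    mu = np.asarray(X.mean(0)).flatten()[None, :]
    ones = np.ones(X.shape[0])[None, :]

    XL = spla.LinearOperator(
        shape=X.shape,
        dtype=X.dtype,
        matvec=lambda v: X.dot(v) - mu.dot(v),
        rmatvec=lambda v: X.T.dot(v) - mu.T.dot(ones.dot(v)),
    )

    v = rng.randn(10)
    dense_centered = X.toarray() - mu
    assert np.allclose(XL.matvec(v), dense_centered @ v)
    assert np.allclose(XL.rmatvec(np.ones(30)), dense_centered.T @ np.ones(30))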
def test_incr_mean_variance_axis_weighted_axis1(Xw, X, weights,
                                                sparse_constructor, dtype):
    axis = 1
    Xw_sparse = sparse_constructor(Xw).astype(dtype)
    X_sparse = sparse_constructor(X).astype(dtype)

    last_mean = np.zeros(np.shape(Xw)[0], dtype=dtype)
    last_var = np.zeros_like(last_mean, dtype=dtype)
    last_n = np.zeros_like(last_mean, dtype=np.int64)

    means0, vars0, n_incr0 = incr_mean_variance_axis(
        X=X_sparse, axis=axis, last_mean=last_mean, last_var=last_var,
        last_n=last_n, weights=None)
    means_w0, vars_w0, n_incr_w0 = incr_mean_variance_axis(
        X=Xw_sparse, axis=axis, last_mean=last_mean, last_var=last_var,
        last_n=last_n, weights=weights)

    assert means_w0.dtype == dtype
    assert vars_w0.dtype == dtype
    assert n_incr_w0.dtype == dtype

    means_simple, vars_simple = mean_variance_axis(X=X_sparse, axis=axis)

    assert_array_almost_equal(means0, means_w0)
    assert_array_almost_equal(means0, means_simple)
    assert_array_almost_equal(vars0, vars_w0)
    assert_array_almost_equal(vars0, vars_simple)
    assert_array_almost_equal(n_incr0, n_incr_w0)

    # check second round for incremental
    means1, vars1, n_incr1 = incr_mean_variance_axis(
        X=X_sparse, axis=axis, last_mean=means0, last_var=vars0,
        last_n=n_incr0, weights=None)
    means_w1, vars_w1, n_incr_w1 = incr_mean_variance_axis(
        X=Xw_sparse, axis=axis, last_mean=means_w0, last_var=vars_w0,
        last_n=n_incr_w0, weights=weights)

    assert_array_almost_equal(means1, means_w1)
    assert_array_almost_equal(vars1, vars_w1)
    assert_array_almost_equal(n_incr1, n_incr_w1)

    assert means_w1.dtype == dtype
    assert vars_w1.dtype == dtype
    assert n_incr_w1.dtype == dtype
def _tolerance(X, rtol):
    """Compute the absolute tolerance from the relative tolerance"""
    if rtol == 0.0:
        return rtol
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
        mean_var = np.mean(variances)
    else:
        mean_var = np.var(X, axis=0).mean()
    return mean_var * rtol
def partial_fit(self, X, y=None):
    """Online computation of mean and std on X for later scaling.

    All of X is processed as a single batch. This is intended for cases
    when `fit` is not feasible due to a very large number of `n_samples`
    or because X is read from a continuous stream.

    The algorithm for incremental mean and std is given in Equation 1.5a,b
    of Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms
    for computing the sample variance: Analysis and recommendations."
    The American Statistician 37.3 (1983): 242-247.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        The data used to compute the mean and standard deviation
        used for later scaling along the features axis.

    y : Passthrough for ``Pipeline`` compatibility.
    """
    X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
                    ensure_2d=False, warn_on_dtype=True,
                    estimator=self, dtype=FLOAT_DTYPES)

    # Even in the case of `with_mean=False`, we update the mean anyway.
    # This is needed for the incremental computation of the var
    # See incr_mean_variance_axis and _incremental_mean_variance_axis
    if not sparse.issparse(X):
        return super(SparseScaler, self).partial_fit(X)

    if self.with_std:
        # First pass
        if not hasattr(self, 'n_samples_seen_'):
            self.mean_, self.var_ = mean_variance_axis(X, axis=0)
            n = X.shape[0]
            self.n_samples_seen_ = n
        # Next passes
        else:
            self.mean_, self.var_, self.n_samples_seen_ = \
                incr_mean_variance_axis(X, axis=0,
                                        last_mean=self.mean_,
                                        last_var=self.var_,
                                        last_n=self.n_samples_seen_)

    if self.with_std:
        self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
    else:
        self.scale_ = None

    return self
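A sketch of the two-pass behaviour this implements, using sklearn's sparsefuncs directly (matrix values arbitrary): feeding two batches through incr_mean_variance_axis reproduces the single-pass statistics.

    import numpy as np
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import (incr_mean_variance_axis,
                                           mean_variance_axis)

    rng = np.random.RandomState(0)
    X = sp.random(100, 5, density=0.4, format="csr", random_state=rng)
    X1, X2 = X[:60], X[60:]

    mean, var = mean_variance_axis(X1, axis=0)      # first batch
    n = np.full(5, X1.shape[0], dtype=np.int64)
    mean, var, n = incr_mean_variance_axis(X2, axis=0, last_mean=mean,
                                           last_var=var, last_n=n)

    full_mean, full_var = mean_variance_axis(X, axis=0)
    assert np.allclose(mean, full_mean) and np.allclose(var, full_var)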
def fit_transform(self, X, y=None):
    """Fit LSI model to X and perform dimensionality reduction on X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    Returns
    -------
    X_new : array, shape (n_samples, n_components)
        Reduced version of X. This will always be a dense array.
    """
    X = as_float_array(X, copy=False)
    random_state = check_random_state(self.random_state)

    # If sparse and not csr or csc, convert to csr
    if sp.issparse(X) and X.getformat() not in ["csr", "csc"]:
        X = X.tocsr()

    if self.algorithm == "arpack":
        U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)
        # svds doesn't abide by scipy.linalg.svd/randomized_svd
        # conventions, so reverse its outputs.
        Sigma = Sigma[::-1]
        U, VT = svd_flip(U[:, ::-1], VT[::-1])
    elif self.algorithm == "randomized":
        k = self.n_components
        n_features = X.shape[1]
        if k >= n_features:
            raise ValueError("n_components must be < n_features;"
                             " got %d >= %d" % (k, n_features))
        U, Sigma, VT = randomized_svd(X, self.n_components,
                                      n_iter=self.n_iter,
                                      random_state=random_state)
    else:
        raise ValueError("unknown algorithm %r" % self.algorithm)

    self.components_ = VT
    self.Sigma = Sigma[:self.n_components]

    # Calculate explained variance & explained variance ratio
    X_transformed = np.dot(U, np.diag(Sigma))
    self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)
    if sp.issparse(X):
        _, full_var = mean_variance_axis(X, axis=0)
        full_var = full_var.sum()
    else:
        full_var = np.var(X, axis=0).sum()
    self.explained_variance_ratio_ = exp_var / full_var
    return X_transformed
def truncated_svd(x, n_components):
    u, sigma, v_t = randomized_svd(x, n_components, n_iter=5, random_state=42)
    x_trans = np.dot(u, np.diag(sigma))
    loading = np.transpose(v_t)
    exp_var = np.var(x_trans, axis=0)
    _, full_var = mean_variance_axis(x, axis=0)
    full_var = full_var.sum()
    exp_var_ratio = exp_var / full_var
    # exp_var_ratio = sigma / u.shape[0]
    # svd = TruncatedSVD(n_components=n_components)
    # xReduced = svd.fit_transform(patMat)
    return x_trans, loading, exp_var_ratio
def test_mean_variance_illegal_axis():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)

    with pytest.raises(ValueError):
        mean_variance_axis(X_csr, axis=-3)
    with pytest.raises(ValueError):
        mean_variance_axis(X_csr, axis=2)
    with pytest.raises(ValueError):
        mean_variance_axis(X_csr, axis=-1)
    with pytest.raises(ValueError):
        incr_mean_variance_axis(X_csr, axis=-3, last_mean=None,
                                last_var=None, last_n=None)
    with pytest.raises(ValueError):
        incr_mean_variance_axis(X_csr, axis=2, last_mean=None,
                                last_var=None, last_n=None)
    with pytest.raises(ValueError):
        incr_mean_variance_axis(X_csr, axis=-1, last_mean=None,
                                last_var=None, last_n=None)
def _sparse_sub_standardize(X, mu, var, rows=False):
    # Note: `var` is expected to hold standard deviations here.
    x, y = X.nonzero()
    if not rows:
        Xs = X.copy()
        Xs.data[:] = (X.data - mu[y]) / var[y]
    else:
        # Recompute per-row statistics and convert variance to std
        mu, var = sf.mean_variance_axis(X, axis=1)
        var = var ** 0.5
        var[var == 0] = 1
        Xs = X.copy()
        Xs.data[:] = (X.data - mu[x]) / var[x]
    Xs.data[Xs.data < 0] = 0
    Xs.eliminate_zeros()
    return Xs
def test_mean_variance_axis0_precision(dtype, sparse_constructor):
    # Check that there's no big loss of precision when the real variance is
    # exactly 0. (#19766)
    rng = np.random.RandomState(0)
    X = np.full(fill_value=100.0, shape=(1000, 1), dtype=dtype)
    # Add some missing records which should be ignored:
    missing_indices = rng.choice(np.arange(X.shape[0]), 10, replace=False)
    X[missing_indices, 0] = np.nan
    X = sparse_constructor(X)
    # Random positive weights:
    sample_weight = rng.rand(X.shape[0]).astype(dtype)
    _, var = mean_variance_axis(X, weights=sample_weight, axis=0)
    assert var < np.finfo(dtype).eps
def fit(self, X, y=None):
    """Fits LSA model on training data X."""
    (host, port) = FrovedisServer.getServerInstance()
    if self.algorithm != "arpack":
        raise ValueError("algorithm: currently Frovedis supports only "
                         "arpack!")
    if isinstance(X, FrovedisCRSMatrix):
        self.var_sum = None
    elif isinstance(X, FrovedisRowmajorMatrix):
        to_sample = False  # ddof = 0 in np.var(...)
        isdense = True
        self.var_sum = compute_var_sum(host, port, X.get(), to_sample,
                                       isdense, X.get_dtype())
        excpt = check_server_exception()
        if excpt["status"]:
            raise RuntimeError(excpt["info"])
    elif issparse(X):
        try:
            from sklearn.utils.sparsefuncs import mean_variance_axis
            _, full_var = mean_variance_axis(X, axis=0)
            self.var_sum = full_var.sum()
        except ImportError:  # for systems without sklearn
            self.var_sum = None
    else:
        self.var_sum = np.var(X, axis=0).sum()
    # if X is not sparse data, it is converted to a rowmajor matrix
    inp_data = FrovedisFeatureData(
        X, caller="[" + self.__class__.__name__ + "] fit: ",
        dense_kind='rowmajor', densify=False)
    X = inp_data.get()
    x_dtype = inp_data.get_dtype()
    x_itype = inp_data.get_itype()
    dense = inp_data.is_dense()
    self.__mdtype = x_dtype
    if dense and self.use_shrink:
        raise ValueError("fit: use_shrink is applicable only for "
                         "sparse data!")
    res = compute_truncated_svd(host, port, X.get(), self.n_components,
                                x_dtype, x_itype, dense, self.use_shrink)
    excpt = check_server_exception()
    if excpt["status"]:
        raise RuntimeError(excpt["info"])
    self.svd_res_ = svdResult(res, TypeUtil.to_numpy_dtype(x_dtype))
    return self
def fit(self, X, y=None):
    """Learn empirical variances from X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Sample vectors from which to compute variances.

    y : any
        Ignored. This parameter exists only for compatibility with
        sklearn.pipeline.Pipeline.

    Returns
    -------
    self
    """
    X = check_array(X, ('csr', 'csc'), dtype=np.float64)

    if hasattr(X, "toarray"):  # sparse matrix
        _, self.variances_ = mean_variance_axis(X, axis=0)
    else:
        self.variances_ = np.var(X, axis=0)

    return self
def test_incr_mean_variance_axis():
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        data_chunks = [rng.randint(0, 2, size=n_features)
                       for i in range(n_samples)]

        # default params for incr_mean_variance
        last_mean = np.zeros(n_features)
        last_var = np.zeros_like(last_mean)
        last_n = 0

        # Test errors
        X = np.array(data_chunks[0])
        X = np.atleast_2d(X)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis,
                      last_mean, last_var, last_n)

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)  # X.shape[axis] picks # samples

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_csc = sp.csc_matrix(X_lil)
        expected_dtypes = [(np.float32, np.float32),
                           (np.float64, np.float64),
                           (np.int32, np.float64),
                           (np.int64, np.float64)]
        for input_dtype, output_dtype in expected_dtypes:
            for X_sparse in (X_csr, X_csc):
                X_sparse = X_sparse.astype(input_dtype)
                X_means, X_vars = mean_variance_axis(X_sparse, axis)
                X_means_incr, X_vars_incr, n_incr = \
                    incr_mean_variance_axis(X_sparse, axis,
                                            last_mean, last_var, last_n)
                assert_equal(X_means_incr.dtype, output_dtype)
                assert_equal(X_vars_incr.dtype, output_dtype)
                assert_array_almost_equal(X_means, X_means_incr)
                assert_array_almost_equal(X_vars, X_vars_incr)
                assert_equal(X.shape[axis], n_incr)
def test_incr_mean_variance_axis():
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        data_chunks = [rng.randint(0, 2, size=n_features)
                       for i in range(n_samples)]

        # default params for incr_mean_variance
        last_mean = np.zeros(n_features)
        last_var = np.zeros_like(last_mean)
        last_n = 0

        # Test errors
        X = np.array(data_chunks[0])
        X = np.atleast_2d(X)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis,
                      last_mean, last_var, last_n)

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)  # X.shape[axis] picks # samples

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # All data but as float
        X = X.astype(np.float32)
        X_csr = X_csr.astype(np.float32)
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        X_csc = X_csc.astype(np.float32)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)
def test_incr_mean_variance_axis():
    for axis in [0, 1]:
        rng = np.random.RandomState(0)
        n_features = 50
        n_samples = 10
        data_chunks = [rng.randint(0, 2, size=n_features)
                       for i in range(n_samples)]

        # default params for incr_mean_variance
        last_mean = np.zeros(n_features)
        last_var = np.zeros_like(last_mean)
        last_n = np.zeros_like(last_mean, dtype=np.int64)

        # Test errors
        X = np.array(data_chunks[0])
        X = np.atleast_2d(X)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        assert_raises(TypeError, incr_mean_variance_axis, axis,
                      last_mean, last_var, last_n)
        assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis,
                      last_mean, last_var, last_n)

        # Test _incr_mean_and_var with a 1 row input
        X_means, X_vars = mean_variance_axis(X_csr, axis)
        X_means_incr, X_vars_incr, n_incr = \
            incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)  # X.shape[axis] picks # samples

        X_csc = sp.csc_matrix(X_lil)
        X_means, X_vars = mean_variance_axis(X_csc, axis)
        assert_array_almost_equal(X_means, X_means_incr)
        assert_array_almost_equal(X_vars, X_vars_incr)
        assert_equal(X.shape[axis], n_incr)

        # Test _incremental_mean_and_var with whole data
        X = np.vstack(data_chunks)
        X_lil = sp.lil_matrix(X)
        X_csr = sp.csr_matrix(X_lil)
        X_csc = sp.csc_matrix(X_lil)
        expected_dtypes = [(np.float32, np.float32),
                           (np.float64, np.float64),
                           (np.int32, np.float64),
                           (np.int64, np.float64)]
        for input_dtype, output_dtype in expected_dtypes:
            for X_sparse in (X_csr, X_csc):
                X_sparse = X_sparse.astype(input_dtype)
                last_mean = last_mean.astype(output_dtype)
                last_var = last_var.astype(output_dtype)
                X_means, X_vars = mean_variance_axis(X_sparse, axis)
                X_means_incr, X_vars_incr, n_incr = \
                    incr_mean_variance_axis(X_sparse, axis,
                                            last_mean, last_var, last_n)
                assert_equal(X_means_incr.dtype, output_dtype)
                assert_equal(X_vars_incr.dtype, output_dtype)
                assert_array_almost_equal(X_means, X_means_incr)
                assert_array_almost_equal(X_vars, X_vars_incr)
                assert_equal(X.shape[axis], n_incr)
def test_scaler_int():
    # test that scaler converts integer input to floating
    # for both sparse and dense matrices
    rng = np.random.RandomState(42)
    X = rng.randint(20, size=(4, 5))
    X[:, 0] = 0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

    null_transform = StandardScaler(with_mean=False, with_std=False,
                                    copy=True)
    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        scaler = StandardScaler(with_mean=False).fit(X)
        X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
        X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
        X_csc_scaled = scaler_csr.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., 1.109, 1.856, 21., 1.559], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(
        X_csr_scaled.astype(np.float64), 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)