def __compute_statistics(self):
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array(
        [var.name.lower() for var in self.variables])
    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=coefficient_of_variation,
    )
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
        default_val=len(matrices[0]) if matrices else 0)
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )

    # Since scipy apparently can't do mode on sparse matrices, cast it to
    # dense. This can be very inefficient for large matrices, and should
    # be changed
    def __mode(x, *args, **kwargs):
        if sp.issparse(x):
            x = x.todense(order="C")
        # return ss.mode(x, *args, **kwargs)[0]
        # Temporary replacement for scipy
        return ut.nanmode(x, *args, **kwargs)[0]

    self._center = self.__compute_stat(
        matrices,
        discrete_f=None,
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
    self._median = self.__compute_stat(
        matrices,
        discrete_f=lambda x: __mode(x, axis=0),
        continuous_f=lambda x: ut.nanmedian(x, axis=0),
        time_f=lambda x: ut.nanmedian(x, axis=0),
    )
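# Illustrative sketch, not part of the snippet above: the `__mode` helper
# densifies sparse input before taking the mode because scipy.stats.mode
# does not accept sparse matrices. The demo matrix below is made up.
import numpy as np
import scipy.sparse as sp
import scipy.stats as ss

x = sp.csr_matrix(np.array([[0., 1.], [0., 1.], [2., 1.]]))
dense = np.asarray(x.todense(order="C"))
column_modes = ss.mode(dense, axis=0)[0]  # most frequent value per column: 0. and 1.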
def __compute_statistics(self):
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array(
        [var.name.lower() for var in self.variables])
    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
    )
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )

    # Since scipy apparently can't do mode on sparse matrices, cast it to
    # dense. This can be very inefficient for large matrices, and should
    # be changed
    def __mode(x, *args, **kwargs):
        if sp.issparse(x):
            x = x.todense(order="C")
        # return ss.mode(x, *args, **kwargs)[0]
        return ut.nanmode(x, *args, **kwargs)[0]  # Temporary replacement for scipy

    self._center = self.__compute_stat(
        matrices,
        discrete_f=lambda x: __mode(x, axis=0),
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
def _init_feature_marker_values(self):
    self.feature_marker_values = []
    cls_index = self.target_class_index
    instances = Table(self.domain, self.instances) \
        if self.instances else None
    values = []
    for i, attr in enumerate(self.domain.attributes):
        value, feature_val = 0, None
        if len(self.log_reg_coeffs):
            if attr.is_discrete:
                ind, n = unique(self.data.X[:, i], return_counts=True)
                feature_val = np.nan_to_num(ind[np.argmax(n)])
            else:
                feature_val = nanmean(self.data.X[:, i])

        # If data is provided on a separate signal, use the first data
        # instance to position the points instead of the mean
        inst_in_dom = instances and attr in instances.domain
        if inst_in_dom and not np.isnan(instances[0][attr]):
            feature_val = instances[0][attr]

        if feature_val is not None:
            value = (self.points[i][cls_index][int(feature_val)]
                     if attr.is_discrete
                     else self.log_reg_coeffs_orig[i][cls_index][0] *
                     feature_val)
        values.append(value)
    self.feature_marker_values = np.asarray(values)
def _get_colors(self):
    """Compute colors for different kinds of histograms."""
    if self.target_var and self.target_var.is_discrete:
        colors = [[QColor(*color) for color in self.target_var.colors]] \
            * self.n_bins

    elif self.target_var and self.target_var.is_continuous:
        palette = ContinuousPaletteGenerator(*self.target_var.colors)

        bins = np.arange(self.n_bins)[:, np.newaxis]
        edges = self.edges if self.attribute.is_discrete else self.edges[1:-1]
        # Need to digitize on `right` here so the samples will be assigned
        # to the correct bin for coloring
        bin_indices = ut.digitize(self.x, bins=edges, right=True)
        mask = bin_indices == bins

        colors = []
        for bin_idx in range(self.n_bins):
            biny = self.y[mask[bin_idx]]
            if np.isfinite(biny).any():
                mean = ut.nanmean(biny) / ut.nanmax(self.y)
            else:
                mean = 0  # bin is empty, color does not matter
            colors.append([palette[mean]])

    else:
        colors = [[QColor('#ccc')]] * self.n_bins

    return colors
def __call__(self, data: Table) -> Table:
    n_groups = min(self.n_groups, len(data.domain.attributes))
    mean = ut.nanmean(data.X, axis=0)
    variance = ut.nanvar(data.X, axis=0)
    percentiles = [percentileofscore(mean, m) for m in mean]
    _, bins = np.histogram(percentiles, n_groups)
    bin_indices = np.digitize(percentiles, bins, True)
    # The right limit is treated differently in histogram and digitize;
    # see https://github.com/numpy/numpy/issues/4217
    bin_indices[bin_indices == 0] = 1

    zscores = np.zeros_like(mean)
    for group in range(n_groups):
        group_indices, = np.where(bin_indices == group + 1)
        if self.method == SelectMostVariableGenes.Dispersion:
            group_mean = mean[group_indices]
            group_scores = np.divide(
                variance[group_indices], group_mean,
                out=np.zeros_like(group_mean), where=group_mean != 0)
        elif self.method == SelectMostVariableGenes.Variance:
            group_scores = variance[group_indices]
        elif self.method == SelectMostVariableGenes.Mean:
            group_scores = mean[group_indices]
        with np.errstate(invalid="ignore"):
            zscores[group_indices] = zscore(group_scores)

    indices = np.argsort(np.nan_to_num(zscores))[-self.n_genes:]
    return self._filter_columns(data, indices)
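# A minimal sketch of the np.histogram / np.digitize edge case the snippet
# above works around (numpy issue #4217): with right=True the left-most
# value is assigned to "bin 0", which np.histogram would have counted in
# the first bin, hence the clamp to 1. The demo values are made up.
import numpy as np

values = np.array([0., 1., 2., 3., 4.])
_, edges = np.histogram(values, bins=2)       # edges: [0., 2., 4.]
idx = np.digitize(values, edges, right=True)  # [0, 1, 1, 2, 2]
idx[idx == 0] = 1                             # fold the stray edge into bin 1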
def _get_colors(self):
    """Compute colors for different kinds of histograms."""
    target = self.target_var
    if target and target.is_discrete:
        colors = [list(target.palette)[:len(target.values)]] * self.n_bins

    elif self.target_var and self.target_var.is_continuous:
        palette = self.target_var.palette

        bins = np.arange(self.n_bins)[:, np.newaxis]
        edges = self.edges if self.attribute.is_discrete else self.edges[1:-1]
        bin_indices = ut.digitize(self.x, bins=edges)
        mask = bin_indices == bins

        colors = []
        for bin_idx in range(self.n_bins):
            biny = self.y[mask[bin_idx]]
            if np.isfinite(biny).any():
                mean = ut.nanmean(biny) / ut.nanmax(self.y)
            else:
                mean = 0  # bin is empty, color does not matter
            colors.append([palette.value_to_qcolor(mean)])

    else:
        colors = [[QColor('#ccc')]] * self.n_bins

    return colors
def fit_rows(self, attributes, x, n_vals):
    """Return a model for cosine distances
    with stored means for imputation
    """
    discrete = n_vals > 0
    x = self.discrete_to_indicators(x, discrete)
    means = util.nanmean(x, axis=0)
    means = np.nan_to_num(means)
    return self.CosineModel(attributes, self.axis, self.impute,
                            discrete, means)
def __compute_statistics(self):
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array(
        [var.name.lower() for var in self.variables])
    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
    )
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )
    self._center = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ss.mode(x)[0],
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
def detection(self, table: AnyArray) -> Tuple[np.ndarray, np.ndarray]:
    with np.errstate(invalid="ignore"):  # comparison can include nans
        mask = table > self.threshold

    if sp.issparse(table):
        A = table.copy()
        np.log2(A.data, out=A.data)
    else:
        A = np.ma.log2(table)  # avoid log2(0)
        A.mask = False

    detection_rate = ut.nanmean(mask, axis=0)
    zero_rate = 1 - detection_rate
    detected = detection_rate > 0
    detected_mean = ut.nanmean(A[:, detected], axis=0)
    mean_expr = np.full_like(zero_rate, fill_value=np.nan)
    mean_expr[detected] = detected_mean / detection_rate[detected]

    low_detection = np.array(np.sum(mask, axis=0)).squeeze()
    zero_rate[low_detection < self.at_least] = np.nan
    mean_expr[low_detection < self.at_least] = np.nan

    return zero_rate, mean_expr
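# Side note on the dense branch above: np.ma.log2 masks entries outside
# the function's domain (such as zeros) instead of emitting warnings or
# NaNs; the snippet then clears the mask before the nan-aware reductions.
# A tiny demo with made-up values:
import numpy as np

a = np.ma.log2(np.array([1.0, 0.0, 4.0]))
# a is a masked array [0.0, --, 2.0]; the zero entry is masked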
def get_continuous_stats(self, column):
    """
    Return the mean, the variance and the distance between pairs of
    missing values for the given column.

    The method is called by the inherited `fit_rows` to construct a
    row-distance model.
    """
    mean = util.nanmean(column)
    var = util.nanvar(column)
    if self.normalize:
        if var == 0:
            return None
        dist_missing2_cont = 1
    else:
        dist_missing2_cont = 2 * var
    if np.isnan(dist_missing2_cont):
        dist_missing2_cont = 0
    return mean, var, dist_missing2_cont
def __init__(self, data, indices, color, graph):
    self.x_data = np.arange(1, data.X.shape[1] + 1)
    self.y_data = data.X
    self.indices = indices
    self.ids = data.ids
    self.color = color
    self.graph = graph
    self.profiles_added = False
    self.sub_profiles_added = False
    self.range_added = False
    self.mean_added = False
    self.error_bar_added = False
    self.graph_items = []
    self.__mean = nanmean(self.y_data, axis=0)
    self.__create_curves()
def _get_colors(self):
    """Compute colors for different kinds of histograms."""
    if self.target_var and self.target_var.is_discrete:
        colors = [[QColor(*color) for color in self.target_var.colors]] \
            * self.n_bins

    elif self.target_var and self.target_var.is_continuous:
        palette = ContinuousPaletteGenerator(*self.target_var.colors)

        bins = np.arange(self.n_bins)[:, np.newaxis]
        edges = self.edges if self.attribute.is_discrete else self.edges[1:-1]
        bin_indices = ut.digitize(self.x, bins=edges)
        mask = bin_indices == bins

        colors = []
        for bin_idx in range(self.n_bins):
            mean = ut.nanmean(self.y[mask[bin_idx]], axis=0) / self.y.max()
            colors.append([palette[mean]])

    else:
        colors = [[QColor('#ccc')]] * self.n_bins

    return colors
def test_axis_1(self, array):
    np.testing.assert_almost_equal(
        np.nanmean(self.x, axis=1),
        nanmean(array(self.x), axis=1)
    )
def coefficient_of_variation(x: np.ndarray) -> np.ndarray:
    mu = ut.nanmean(x, axis=0)
    mask = ~np.isclose(mu, 0, atol=1e-12)
    result = np.full_like(mu, fill_value=np.inf)
    result[mask] = np.sqrt(ut.nanvar(x, axis=0)[mask]) / mu[mask]
    return result
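# A quick usage sketch for coefficient_of_variation with made-up data,
# assuming ut.nanvar follows numpy's default ddof=0: columns with a
# (near-)zero mean come back as inf instead of raising a division warning.
import numpy as np

x = np.array([[1.0, 0.0],
              [3.0, 0.0]])
cv = coefficient_of_variation(x)
# cv is approximately [0.5, inf]: std/mean = 1/2 for the first column;
# the second column's mean is 0, so its entry stays at the inf fill value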
def __compute_statistics(self):
    # We will compute statistics over all data at once
    matrices = [self._data.X, self._data._Y, self._data.metas]
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = zip(
        [self._domain.attributes, self._domain.class_vars,
         self._domain.metas],
        matrices)
    # Filter out any matrices with size 0; filtering the zipped pairs
    # eliminates the corresponding variables in a single swoop
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    def _apply_to_types(attrs_x_pair, discrete_f=None, continuous_f=None,
                        time_f=None, string_f=None, default_val=np.nan):
        """Apply functions to variable types, e.g. `discrete_f` to
        discrete variables. The default value is returned if there is
        no function defined for a specific variable type."""
        attrs, x = attrs_x_pair
        result = np.full(len(attrs), default_val)
        disc_var_idx, cont_var_idx, time_var_idx, str_var_idx = \
            self._attr_indices(attrs)
        if discrete_f and x[:, disc_var_idx].size:
            result[disc_var_idx] = discrete_f(
                x[:, disc_var_idx].astype(np.float64))
        if continuous_f and x[:, cont_var_idx].size:
            result[cont_var_idx] = continuous_f(
                x[:, cont_var_idx].astype(np.float64))
        if time_f and x[:, time_var_idx].size:
            result[time_var_idx] = time_f(
                x[:, time_var_idx].astype(np.float64))
        if string_f and x[:, str_var_idx].size:
            result[str_var_idx] = string_f(
                x[:, str_var_idx].astype(object))
        return result

    self._variable_types = [type(var) for var in self._attributes]
    self._variable_names = [var.name.lower() for var in self._attributes]

    # Compute the center
    _center = partial(
        _apply_to_types,
        discrete_f=lambda x: ss.mode(x)[0],
        continuous_f=lambda x: ut.nanmean(x, axis=0),
    )
    self._center = np.hstack(map(_center, matrices))

    # Compute the dispersion
    def _entropy(x):
        p = [ut.bincount(row)[0] for row in x.T]
        p = [pk / np.sum(pk) for pk in p]
        return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)

    _dispersion = partial(
        _apply_to_types,
        discrete_f=lambda x: _entropy(x),
        continuous_f=lambda x: ut.nanvar(x, axis=0),
    )
    self._dispersion = np.hstack(map(_dispersion, matrices))

    # Compute maximum values
    _max = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
    )
    self._max = np.hstack(map(_max, matrices))

    # Compute minimum values
    _min = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._min = np.hstack(map(_min, matrices))

    # Compute the number of missing values
    _missing = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    self._missing = np.hstack(map(_missing, matrices))
def test_axis_none(self, array):
    np.testing.assert_almost_equal(
        np.nanmean(self.x),
        nanmean(array(self.x))
    )
def test_nanmean(self):
    for X in self.data:
        X_sparse = csr_matrix(X)
        np.testing.assert_array_equal(
            nanmean(X_sparse),
            np.nanmean(X))
def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
                   flip_sign=True, random_state=0):
    """Compute the randomized PCA decomposition of a given matrix.

    This method differs from the scikit-learn implementation in that it
    supports and handles sparse matrices well.
    """
    if n_iter == "auto":
        # If the number of iterations is not explicitly specified, adjust
        # n_iter. 7 was found to be a good compromise for PCA; see
        # sklearn #5299
        n_iter = 7 if n_components < .1 * min(A.shape) else 4

    n_samples, n_features = A.shape

    # Normalize the seed into a RandomState instance so that the
    # `random_state.normal` calls below also work for integer seeds
    random_state = check_random_state(random_state)

    c = np.atleast_2d(ut.nanmean(A, axis=0))

    if n_samples >= n_features:
        Q = random_state.normal(size=(n_features, n_components + n_oversamples))
        if A.dtype.kind == "f":
            Q = Q.astype(A.dtype, copy=False)
        Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q, _ = lu(Q, permute_l=True)
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q, _ = lu(Q, permute_l=True)

        Q, _ = qr(Q, mode="economic")

        QA = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
        R, s, V = svd(QA.T, full_matrices=False)
        U = Q.dot(R)

    else:  # n_features > n_samples
        Q = random_state.normal(size=(n_samples, n_components + n_oversamples))
        if A.dtype.kind == "f":
            Q = Q.astype(A.dtype, copy=False)
        Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q, _ = lu(Q, permute_l=True)
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q, _ = lu(Q, permute_l=True)

        Q, _ = qr(Q, mode="economic")

        QA = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
        U, s, R = svd(QA, full_matrices=False)
        V = R.dot(Q.T)

    if flip_sign:
        U, V = svd_flip(U, V)

    return U[:, :n_components], s[:n_components], V[:n_components, :]
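# A hedged usage sketch for randomized_pca above; the shapes and data are
# made up, and sklearn.utils.check_random_state (used above) accepts an
# int seed, a RandomState instance, or None.
import numpy as np
import scipy.sparse as sp

rng = np.random.RandomState(42)
A = sp.random(100, 50, density=0.1, format="csr", random_state=rng)
U, s, V = randomized_pca(A, n_components=5, random_state=rng)
# U: (100, 5), s: (5,), V: (5, 50); a truncated PCA of the implicitly
# mean-centered matrix, computed without ever densifying A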