def normalize(self, table: AnyArray, group_col: np.ndarray) -> AnyArray:
    """Normalize rows of ``table`` by the total signal of their group.

    Every row is divided by the sum of all values in its group, then the
    whole table is rescaled by a common factor: the smallest per-group
    median of row sums (Median method) or a fixed 1e6 (CPM-style).

    :param table: dense ndarray or scipy sparse matrix (rows = samples).
    :param group_col: non-negative integer group id for every row
        (``np.bincount`` requires non-negative integers).
    :return: normalized table of the same kind as the input.
    """
    # Per-group totals, indexed by group id.
    group_sums = np.bincount(group_col, ut.nansum(table, axis=1))
    group_sums[group_sums == 0] = 1  # avoid division by zero errors

    # BUGFIX: use a float buffer -- np.zeros_like(group_col) inherits the
    # (possibly integer) dtype of group_col and would truncate the sums.
    group_sums_row = np.zeros_like(group_col, dtype=float)
    medians = []
    row_sums = ut.nansum(table, axis=1)
    for value in np.unique(group_col):
        # BUGFIX: look the total up by group id instead of zipping
        # np.unique with the bincount output positionally -- the two
        # disagree when some id in 0..max(group_col) has no rows.
        group_sum = group_sums[int(value)]
        mask = group_col == value
        group_sums_row[mask] = group_sum
        if self.method == NormalizeGroups.Median:
            medians.append(np.nanmedian(row_sums[mask]))

    if self.method == NormalizeGroups.Median:
        factor = np.min(medians)
    else:
        factor = 1e6

    if sp.issparse(table):
        table = sp.diags(1 / group_sums_row) @ table
    else:
        table = table / group_sums_row[:, None]
    table *= factor
    return table
def transform(self, data):
    """
    Transform data based on inferred parameters.

    :param data: Data table with expression values as counts.
        Columns are genes and rows are cells.
    :return: Data table with normalized values.
    """
    # Result in expected number of reads
    Xeq = data.X.copy()
    n = Xeq.shape[0]

    # Normalize cell profiles
    if self.normalize_cells:
        # Each cell is normalized independently by default
        if sp.isspmatrix(Xeq):
            # BUGFIX: sparse .sum(axis=1) returns an (n, 1) np.matrix;
            # dividing a (n,) vector by it broadcasts to (n, n) and
            # breaks the dia_matrix construction below. Flatten to 1-D.
            rs = np.asarray(Xeq.sum(axis=1)).ravel().astype(float)
        else:
            rs = nansum(Xeq, axis=1).astype(float)
        rs[rs == 0] = 1.0  # empty cells get factor target_row_mean / 1
        rsm = np.ones((n, ), dtype=float) * self.target_row_mean
        factors = rsm / rs
        # Override with library size factor, if provided. Else, each row is
        # treated as a separate group
        if self.equalize_var is not None:
            vals = np.array(
                list(map(lambda lib: self.size_factors.get(lib, np.nan),
                         data.get_column_view(self.equalize_var)[0])))
            inxs = np.logical_not(np.isnan(vals))
            factors[inxs] = vals[inxs]
        # Diagonal scaling keeps the (possibly sparse) structure of Xeq.
        Xd = sp.dia_matrix((factors.ravel(), 0), shape=(n, n), dtype=float)
        Xeq = Xd.dot(Xeq)

    # Log transform log(1 + x)
    if self.log_base is not None:
        if sp.isspmatrix(Xeq):
            Xeq = Xeq.log1p() / np.log(self.log_base)
        else:
            Xeq = np.log(1 + Xeq) / np.log(self.log_base)

    # Binary transform;
    # potential change to sparsity structure;
    if self.bin_thresh is not None:
        if sp.isspmatrix(Xeq):
            Xeq.data = (Xeq.data > self.bin_thresh).astype(int)
            Xeq.eliminate_zeros()
        else:
            # Cast to int for consistency with the sparse branch,
            # which stores 0/1 integers rather than booleans.
            Xeq = (Xeq > self.bin_thresh).astype(int)

    # Preserve sparsity
    X_new = Xeq.tocsr() if sp.isspmatrix(Xeq) else Xeq
    data_new = Table.from_numpy(domain=data.domain,
                                X=X_new,
                                Y=data.Y,
                                W=data.W,
                                metas=data.metas)
    return data_new
def fit(self, X, Y=None):
    """
    Infer row normalization parameters from the data.

    Sets ``self.target_row_mean`` and, when ``Y`` is given, fills
    ``self.size_factors`` with one scaling factor per library.

    :param X: Continuous data matrix.
    :param Y: Grouping values (library label per row), or None.
    :return: None
    """
    # Equalize based on read depth per library / match mean read count
    # per cell. Must not store indices.
    if Y is None:
        # No grouping: target the median read depth over all rows.
        self.target_row_mean = nanmedian(nansum(X, axis=1))
        return

    # Median read depth per library.
    lib_sizes = {}
    for lib in set(Y):
        rows = np.where(Y == lib)[0]
        lib_sizes[lib] = nanmedian(nansum(X[rows, :], axis=1))

    # Scale every library down to the shallowest one.
    self.target_row_mean = min(lib_sizes.values())
    for lib, size in lib_sizes.items():
        self.size_factors[lib] = self.target_row_mean / size
def normalize(self, table: AnyArray) -> AnyArray:
    """Normalize every row of ``table`` by its own total.

    Each row is divided by its sum, then the table is rescaled by a
    common factor: the median row sum (Median method) or a fixed 1e6
    (CPM-style).
    """
    totals = ut.nansum(table, axis=1)
    totals[totals == 0] = 1  # avoid division by zero errors
    scale = (np.nanmedian(totals)
             if self.method == NormalizeSamples.Median else 1e6)
    if sp.issparse(table):
        result = sp.diags(1 / totals) @ table
    else:
        result = table / totals[:, None]
    result *= scale
    return result
def test_nansum(self, array): for X in self.data: X_sparse = array(X) np.testing.assert_array_equal(nansum(X_sparse), np.nansum(X))
def test_nansum(self, array): for X in self.data: X_sparse = array(X) np.testing.assert_array_equal( nansum(X_sparse), np.nansum(X))
class Pivot:
    """Compute a group-by table and pivot tables for an Orange ``Table``.

    Rows are grouped by ``row_var`` and (optionally) ``col_var``; the
    selected aggregation functions are evaluated per group. Results are
    exposed as a flat "group table" and as pivot tables with horizontal,
    vertical and grand totals. Already-computed aggregation columns are
    remembered (``_indepen_agg_done`` / ``_depen_agg_done``) so that
    ``update_group_table`` / ``update_pivot_table`` can reuse them.
    """
    Functions = AggregationFunctionsEnum
    (Count, Count_defined, Sum, Mean, Min, Max,
     Mode, Median, Var, Majority) = Functions

    # Which aggregations apply where:
    AutonomousFunctions = (Count, )      # need no value variable
    AnyVarFunctions = (Count_defined, )  # applicable to any variable
    ContVarFunctions = (Sum, Mean, Min, Max, Mode, Median, Var)
    DiscVarFunctions = (Majority, )
    TimeVarFunctions = (Mean, Min, Max, Mode, Median)
    # Aggregations whose result is a plain float even for time variables.
    FloatFunctions = (Count, Count_defined, Sum, Var)

    class Tables:
        """Bundle of the main table plus its three margin-total tables."""
        table = None  # type: Table
        total_h = None  # type: Table
        total_v = None  # type: Table
        total = None  # type: Table

        def __call__(self):
            # Convenience unpacking: (table, total_h, total_v, total).
            return self.table, self.total_h, self.total_v, self.total

    def __init__(self, table: Table, agg_funs: Iterable[Functions],
                 row_var: Variable, col_var: Variable = None,
                 val_var: Variable = None):
        """Validate the variables, cache group codes and build all tables.

        :param table: source data table.
        :param agg_funs: aggregation functions to compute.
        :param row_var: variable whose values form the pivot rows.
        :param col_var: discrete variable for the pivot columns;
            defaults to ``row_var`` (single-variable grouping).
        :param val_var: variable whose values are aggregated.
        :raises TypeError: if row/column variables have unsupported types.
        """
        self._group_tables = self.Tables()
        self._pivot_tables = self.Tables()
        self._table = table
        self._row_var = row_var
        self._col_var = col_var if col_var else row_var
        self.renamed = []  # names changed to avoid duplicates
        if not table:
            return
        if not self._row_var.is_primitive():
            raise TypeError("Row variable should be DiscreteVariable"
                            " or ContinuousVariable")
        if self._col_var and not self._col_var.is_discrete:
            raise TypeError("Column variable should be DiscreteVariable")

        # NOTE(review): np.float is removed in NumPy >= 1.24 -- this code
        # presumably targets an older NumPy; confirm before upgrading.
        self._row_var_col = table.get_column_view(row_var)[0].astype(np.float)
        self._col_var_col = table.get_column_view(self._col_var)[0].astype(
            np.float)
        self._row_var_groups = nanunique(self._row_var_col)
        self._col_var_groups = nanunique(self._col_var_col)

        self._total_var = DiscreteVariable("Total", values=("total", ))
        self._current_agg_functions = sorted(agg_funs)
        # Cached column indices of computed aggregations in the group table.
        self._indepen_agg_done = {}  # type: Dict[Functions, int]
        self._depen_agg_done = {}  # type: Dict[Functions, Dict[Variable, int]]

        self._initialize(agg_funs, val_var)

    @property
    def group_table(self) -> Table:
        """Group table restricted to the currently selected aggregations."""
        table = self._group_tables.table
        if not table or len(table) == 0:
            return None
        # Leading columns hold the group values: one when row and column
        # variables coincide, two otherwise.
        indices = [0, 1] if not self.single_var_grouping else [0]
        for f in self._current_agg_functions:
            if f in self._indepen_agg_done:
                indices.append(self._indepen_agg_done[f])
        for v in self._table.domain.variables + self._table.domain.metas:
            for f in self._current_agg_functions:
                if f in self._depen_agg_done and v in self._depen_agg_done[f]:
                    indices.append(self._depen_agg_done[f][v])
        return table[:, indices]

    @property
    def pivot_table(self) -> Table:
        return self._pivot_tables.table

    @property
    def pivot_total_h(self) -> Table:
        return self._pivot_tables.total_h

    @property
    def pivot_total_v(self) -> Table:
        return self._pivot_tables.total_v

    @property
    def pivot_total(self) -> Table:
        return self._pivot_tables.total

    @property
    def pivot_tables(self) -> Table:
        # Returns the (table, total_h, total_v, total) tuple via __call__.
        return self._pivot_tables()

    @property
    def single_var_grouping(self) -> bool:
        """True when rows and columns are grouped by the same variable."""
        return self._row_var is self._col_var

    def update_group_table(self, agg_funs: Iterable[Functions],
                           val_var: Variable = None):
        """Recompute the group table, reusing all cached aggregations."""
        if not self._group_tables:
            return
        self._current_agg_functions = sorted(agg_funs)
        # Keep previously computed aggregations so their columns stay
        # available for reuse.
        agg_funs = set(self._indepen_agg_done.keys()) | \
            set(self._depen_agg_done.keys()) | set(agg_funs)
        self._initialize(sorted(agg_funs), val_var)

    def _initialize(self, agg_funs, val_var):
        # Full pipeline: split aggregations, build group tables, record
        # their column positions, then derive the pivot tables.
        var_indep_funs, var_dep_funs = self.__group_aggregations(agg_funs)
        self._create_group_tables(var_indep_funs, var_dep_funs)
        self.__reference_aggregations(var_indep_funs, var_dep_funs)
        self._create_pivot_tables(val_var)

    def __group_aggregations(self, agg_funs):
        """Split functions into variable-independent and (var, fun) pairs."""
        auto_funcs = self.AutonomousFunctions
        var_indep_funs = [fun for fun in agg_funs if fun in auto_funcs]
        var_dep_funs = []
        attrs = self._table.domain.variables + self._table.domain.metas
        prod = product(filter_visible(attrs),
                       [fun for fun in agg_funs if fun not in auto_funcs])
        for var, fun in prod:
            if self.__include_aggregation(fun, var):
                var_dep_funs.append((var, fun))
        return var_indep_funs, var_dep_funs

    def __include_aggregation(self, fun, var):
        # Is `fun` applicable to a variable of `var`'s type?
        return fun in self.ContVarFunctions and var.is_continuous or \
            fun in self.DiscVarFunctions and var.is_discrete or \
            fun in self.AnyVarFunctions

    def __reference_aggregations(self, var_indep_funs, var_dep_funs):
        # Record the group-table column index of every aggregation so
        # later updates can copy columns instead of recomputing them.
        self._indepen_agg_done = {}
        self._depen_agg_done = defaultdict(dict)
        # Offset past the leading group-value column(s).
        i = 1 - int(bool(self.single_var_grouping))
        for i, fun in enumerate(var_indep_funs, i + 1):
            self._indepen_agg_done[fun] = i
        for j, (var, fun) in enumerate(var_dep_funs, i + 1):
            self._depen_agg_done[fun].update({var: j})

    def _create_group_tables(self, var_indep_funs, var_dep_funs):
        """Build the group table and its three margin-total tables."""
        attrs = [ContinuousVariable(f"({str(fun).lower()})")
                 for fun in var_indep_funs]
        for var, fun in var_dep_funs:
            name = f"{var.name} ({str(fun).lower()})"
            if fun in self.DiscVarFunctions:
                attrs.append(DiscreteVariable(name, var.values))
            else:
                # Time results keep a TimeVariable only for aggregations
                # that yield a timestamp; counts/variances become floats.
                if isinstance(var, TimeVariable) and \
                        fun in self.TimeVarFunctions:
                    attrs.append(TimeVariable(name, have_date=var.have_date,
                                              have_time=var.have_time))
                else:
                    attrs.append(ContinuousVariable(name))
        args = (var_indep_funs, var_dep_funs, attrs)
        for t, var in (("table", None), ("total_h", self._col_var),
                       ("total_v", self._row_var), ("total", self._total_var)):
            setattr(self._group_tables, t, self.__get_group_table(var, *args))

    def __get_group_table(self, var, var_indep_funs, var_dep_funs, attrs):
        """Aggregate row subsets for one table kind (main or a total).

        `var` selects which table is being built; each branch fixes the
        leading variables, the group-value combinations and the function
        that extracts the matching sub-table (rows with NaN group values
        are excluded from totals).
        """
        if var is self._total_var:
            group_tab = self._group_tables.total
            offset = int(bool(not self.single_var_grouping))
            leading_vars = [self._total_var]
            combs = np.array([[0]])
            sub_table_getter = lambda x: \
                self._table[np.where((~np.isnan(self._row_var_col))
                                     & (~np.isnan(self._col_var_col)))[0]]
        elif var is self._row_var or self.single_var_grouping:
            group_tab = self._group_tables.total_v
            offset = int(bool(not self.single_var_grouping))
            leading_vars = [self._row_var]
            combs = self._row_var_groups[:, None]
            sub_table_getter = lambda x: \
                self._table[np.where((~np.isnan(self._col_var_col))
                                     & (self._row_var_col == x[0]))[0]]
        elif var is self._col_var:
            group_tab = self._group_tables.total_h
            offset = int(bool(not self.single_var_grouping))
            leading_vars = [self._col_var]
            combs = self._col_var_groups[:, None]
            sub_table_getter = lambda x: \
                self._table[np.where((~np.isnan(self._row_var_col))
                                     & (self._col_var_col == x[0]))[0]]
        else:
            # Main group table: one row per (row value, column value) pair.
            group_tab = self._group_tables.table
            offset = 0
            leading_vars = [self._row_var, self._col_var]
            combs = np.array(list(product(self._row_var_groups,
                                          self._col_var_groups)))
            sub_table_getter = lambda x: \
                self._table[np.where((self._row_var_col == x[0])
                                     & (self._col_var_col == x[1]))[0]]

        if not combs.shape[0]:
            return None

        n = len(var_indep_funs) + len(var_dep_funs)
        X = np.zeros((len(combs), n), dtype=float)
        for i, comb in enumerate(combs):
            sub_table = sub_table_getter(comb)
            # j = -1 so that an empty var_indep_funs still yields a valid
            # start (j + 1 == 0) for the dependent enumeration below.
            j = -1
            for j, fun in enumerate(var_indep_funs):
                if fun in self._indepen_agg_done:
                    # TODO - optimize - after this line is executed,
                    # the whole column is already set
                    X[:, j] = group_tab.X[
                        :, self._indepen_agg_done[fun] - offset]
                else:
                    X[i, j] = fun(sub_table)
            for k, (v, fun) in enumerate(var_dep_funs, j + 1):
                if fun in self._depen_agg_done:
                    # Copy the previously computed column.
                    X[:, k] = group_tab.X[
                        :, self._depen_agg_done[fun][v] - offset]
                else:
                    X[i, k] = fun(sub_table.get_column_view(v)[0])

        # rename leading vars (seems the easiest) if needed
        current = [var.name for var in attrs]
        uniq_leading_vars = []
        for v in leading_vars:
            uniq = get_unique_names(current, v.name)
            if uniq != v.name:
                self.renamed.append(v.name)
                v = v.copy(name=uniq)
            uniq_leading_vars.append(v)
            current.append(uniq)

        return Table(Domain(uniq_leading_vars + attrs), np.hstack((combs, X)))

    def update_pivot_table(self, val_var: Variable):
        """Rebuild only the pivot tables for a new value variable."""
        self._create_pivot_tables(val_var)

    def _create_pivot_tables(self, val_var):
        """Derive the four pivot tables from the cached group tables."""
        if not self._group_tables.table:
            self._pivot_tables = self.Tables()
            return

        # Keep only the aggregations applicable to val_var (autonomous
        # functions are always applicable).
        agg_funs = [fun for fun in self._current_agg_functions
                    if fun in self.AutonomousFunctions or val_var
                    and self.__include_aggregation(fun, val_var)]
        X, X_h, X_v, X_t = self.__get_pivot_tab_x(val_var, agg_funs)
        dom, dom_h, dom_v, dom_t = self.__get_pivot_tab_domain(
            val_var, X, X_h, X_v, X_t, agg_funs)
        for t, d, x in (("table", dom, X), ("total_h", dom_h, X_h),
                        ("total_v", dom_v, X_v), ("total", dom_t, X_t)):
            setattr(self._pivot_tables, t, Table(d, x))

    # pylint: disable=invalid-name
    def __get_pivot_tab_domain(self, val_var, X, X_h, X_v, X_t, agg_funs):
        """Build domains for the four pivot matrices, renaming duplicates."""
        def map_values(index, _X):
            # Replace the string values in column `index` with their
            # ordinal codes in place; return the value list.
            values = np.unique(_X[:, index])
            values = np.delete(values, np.where(values == "nan")[0])
            for j, value in enumerate(values):
                _X[:, index][_X[:, index] == value] = j
            return values

        create_time_var = \
            isinstance(val_var, TimeVariable) and \
            all(fun in self.TimeVarFunctions for fun in agg_funs)
        create_cont_var = \
            not val_var or val_var.is_continuous and \
            (not isinstance(val_var, TimeVariable) or
             all(fun in self.FloatFunctions for fun in agg_funs))

        vals = np.array(self._col_var.values)[
            self._col_var_groups.astype(int)]
        if create_time_var:
            kwargs = {"have_date": val_var.have_date,
                      "have_time": val_var.have_time}
            attrs = [[TimeVariable(f"{v}", **kwargs) for v in vals]] * 2
            attrs.extend([[TimeVariable("Total", **kwargs)]] * 2)
        elif create_cont_var:
            attrs = [[ContinuousVariable(f"{v}", 1) for v in vals]] * 2
            attrs.extend([[ContinuousVariable("Total", 1)]] * 2)
        else:
            attrs = []
            # First two pivot columns are row value and aggregation id,
            # hence the enumerate offset of 2.
            for x in (X, X_h):
                attrs.append([DiscreteVariable(f"{v}", map_values(i, x))
                              for i, v in enumerate(vals, 2)])
            for x in (X_v, X_t):
                attrs.append([DiscreteVariable("Total", map_values(0, x))])

        row_var_h = DiscreteVariable(self._row_var.name, values=["Total"])
        aggr_attr = DiscreteVariable('Aggregate', [str(f) for f in agg_funs])

        same_row_col = self._col_var is self._row_var

        # De-duplicate names among the leading variables and the columns.
        extra_vars = [self._row_var, aggr_attr]
        uniq_a = get_unique_names_duplicates([v.name for v in extra_vars]
                                             + [atr.name for atr in attrs[0]])
        for (idx, var), u in zip(enumerate(chain(extra_vars, attrs[0])),
                                 uniq_a):
            if var.name == u:
                continue
            if idx == 0:
                self.renamed.append(self._row_var.name)
                self._row_var = self._row_var.copy(name=u)
                if same_row_col:
                    self._col_var = self._row_var
                row_var_h = row_var_h.copy(name=u)
            elif idx == 1:
                self.renamed.append(aggr_attr.name)
                aggr_attr = aggr_attr.copy(name=u)
            else:
                self.renamed.append(var.name)
                attrs[0][idx - 2] = var.copy(name=u)
                attrs[1][idx - 2] = var.copy(name=u)

        if same_row_col:
            vals = tuple(v.name for v in attrs[0])
            self._row_var.make(self._row_var.name, values=vals)
            vals = tuple(v.name for v in attrs[2])
            row_var_h.make(row_var_h.name, vals)

        return (Domain([self._row_var, aggr_attr] + attrs[0]),
                Domain([row_var_h, aggr_attr] + attrs[1]),
                Domain(attrs[2]),
                Domain(attrs[3]))

    def __get_pivot_tab_x(self, val_var, agg_funs):
        """Assemble the pivot data matrices from the group tables.

        Rows of X interleave aggregation functions: row r of group values
        occupies rows r*n_fun .. r*n_fun+n_fun-1, one per function.
        """
        gt = self._group_tables
        n_fun = len(agg_funs)
        n_rows, n_cols = len(self._row_var_groups), len(self._col_var_groups)
        is_float_type = not val_var or val_var.is_continuous
        if isinstance(val_var, TimeVariable):
            is_float_type = \
                all(fun in self.TimeVarFunctions for fun in agg_funs) or \
                all(fun in self.FloatFunctions for fun in agg_funs)

        kwargs = {"fill_value": np.nan, "dtype": float} if is_float_type \
            else {"fill_value": "", "dtype": object}
        X = np.full((n_rows * n_fun, 2 + n_cols), **kwargs)
        X_h = np.full((n_fun, 2 + n_cols), **kwargs)
        X_v = np.full((n_rows * n_fun, 1), **kwargs)
        X_t = np.full((n_fun, 1), **kwargs)
        for i, fun in enumerate(agg_funs):
            args = (val_var, fun, is_float_type)
            X[i::n_fun, 2:] = self.__rows_for_function(n_rows, n_cols, *args)
            X[i::n_fun, :2] = np.array([[row_val, agg_funs.index(fun)]
                                        for row_val in self._row_var_groups])
            X_h[i, :2] = 0, agg_funs.index(fun)
            X_h[i, 2:] = self.__total_for_function(gt.total_h, *args)
            X_v[i::n_fun, 0] = self.__total_for_function(gt.total_v, *args)
            X_t[i] = self.__total_for_function(gt.total, *args)
        return X, X_h, X_v, X_t

    def __total_for_function(self, group_tab, val_var, fun, is_float_type):
        # Look up the cached column for `fun` (for the value variable if
        # variable-dependent) and adjust for the leading-column offset.
        ref = self._indepen_agg_done.get(fun, None) \
            or self._depen_agg_done[fun][val_var]
        ref -= int(bool(not self.single_var_grouping))
        return self.__check_continuous(val_var, group_tab.X[:, ref],
                                       fun, is_float_type)

    def __rows_for_function(self, n_rows, n_cols, val_var, fun,
                            is_float_type):
        # Reshape the cached aggregation column into the pivot grid.
        ref = self._indepen_agg_done.get(fun, None) \
            or self._depen_agg_done[fun][val_var]
        column = self._group_tables.table.X[:, ref]
        if self.single_var_grouping:
            # Off-diagonal cells correspond to empty groups; fill them
            # with the aggregation of an empty array.
            rows = np.full((n_rows, n_cols), fun(np.array([]), ),
                           dtype=float)
            rows[np.diag_indices_from(rows)] = column
        else:
            rows = column.reshape(n_rows, n_cols)
        return self.__check_continuous(val_var, rows, fun, is_float_type)

    def __check_continuous(self, val_var, column, fun, is_float_type):
        """Convert numeric codes to display strings where required."""
        if val_var and not val_var.is_continuous:
            column = column.astype(str)
            if fun in self.DiscVarFunctions:
                # Map float-encoded codes back to the discrete values.
                for j, val in enumerate(val_var.values):
                    column[column == str(float(j))] = val
        elif isinstance(val_var, TimeVariable) and not is_float_type:
            shape = column.shape
            column = column.flatten()
            column_ = column.astype(str)
            if fun in self.TimeVarFunctions:
                for i in range(column.shape[0]):
                    if not np.isnan(column[i]):
                        column_[i] = val_var.repr_val(column[i])
            return column_.reshape(shape)
        return column

    @staticmethod
    def count_defined(x):
        """Number of defined (non-missing) entries per column of ``x``."""
        if x.shape[0] == 0:
            return 0
        if x.size and np.issubdtype(x.dtype, np.number) and \
                not sp.issparse(x):
            nans = np.isnan(x).sum(axis=0)
        elif sp.issparse(x) and x.size:
            # NOTE(review): this counts stored non-zero entries per
            # column, not NaNs -- presumably missing values are encoded
            # as explicit entries in this sparse format; confirm.
            nans = np.bincount(x.nonzero()[1], minlength=x.shape[1])
            x = x.tocsc()
        else:
            # Non-numeric columns: missing is "nan" or the empty string.
            x_str = x.astype(str)
            nans = ((x_str == "nan") | (x_str == "")).sum(axis=0) \
                if x.size else np.zeros(x.shape[1])
        return x.shape[0] - nans

    @staticmethod
    def stat(x, f):
        # Apply a NaN-aware statistic columnwise; NaN on empty input.
        return f(x.astype(np.float), axis=0) if x.shape[0] > 0 else np.nan

    @staticmethod
    def mode(x):
        return Pivot.stat(x, nanmode).mode if x.shape[0] > 0 else np.nan

    @staticmethod
    def majority(x):
        if x.shape[0] == 0:
            return np.nan
        counts = bincount(x)[0]
        return np.argmax(counts) if counts.shape[0] else np.nan

    # Bind the concrete implementations to the enum members. Each .func
    # receives a tuple; only its first element (the data array) is used.
    Count.func = lambda x: len(x[0])
    Count_defined.func = lambda x: Pivot.count_defined(x[0])
    Sum.func = lambda x: nansum(x[0], axis=0) if x[0].shape[0] > 0 else 0
    Mean.func = lambda x: Pivot.stat(x[0], nanmean)
    Min.func = lambda x: Pivot.stat(x[0], nanmin)
    Max.func = lambda x: Pivot.stat(x[0], nanmax)
    Median.func = lambda x: Pivot.stat(x[0], nanmedian)
    Mode.func = lambda x: Pivot.mode(x[0])
    Var.func = lambda x: Pivot.stat(x[0], nanvar)
    Majority.func = lambda x: Pivot.majority(x[0])