def sort_indices(compressed: CompressedMatrix) -> None:
    """
    Ensure the indices are sorted in each row/column.
    """
    with utm.timed_step("sparse.sort_indices"):
        with unfrozen(compressed):
            utm.timed_parameters(before=compressed.nnz)
            compressed.sort_indices()
            utm.timed_parameters(after=compressed.nnz)

def eliminate_zeros(compressed: CompressedMatrix) -> None:
    """
    Eliminate zeros in a compressed matrix.
    """
    with utm.timed_step("sparse.eliminate_zeros"):
        with unfrozen(compressed):
            utm.timed_parameters(before=compressed.nnz)
            compressed.eliminate_zeros()
            utm.timed_parameters(after=compressed.nnz)

def sum_duplicates(compressed: CompressedMatrix) -> None:
    """
    Sum duplicate entries in a compressed matrix, so each position is stored at most once.
    """
    with utm.timed_step("sparse.sum_duplicates"):
        with unfrozen(compressed):
            utm.timed_parameters(before=compressed.nnz)
            compressed.sum_duplicates()
            utm.timed_parameters(after=compressed.nnz)

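# A minimal usage sketch for the three maintenance wrappers above. Hedged: this
# assumes a plain ``scipy.sparse.csr_matrix`` satisfies ``CompressedMatrix`` and
# is not frozen, so ``unfrozen`` has nothing to undo:
#
#     import numpy as np
#     from scipy import sparse as sp
#
#     csr = sp.csr_matrix(
#         (np.array([1.0, 0.0, 2.0]),    # data (note the explicit stored zero)
#          np.array([2, 0, 0]),          # column indices (unsorted in row 0)
#          np.array([0, 2, 3])),         # row pointers
#         shape=(2, 3))
#     sort_indices(csr)      # sorts the column indices within each row
#     eliminate_zeros(csr)   # drops the stored 0.0 entry, shrinking nnz
#     sum_duplicates(csr)    # merges any repeated (row, column) entries
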
def to_proper_matrix(matrix: Matrix, *, default_layout: str = "row_major") -> ProperMatrix:
    """
    Given some 2D ``matrix``, return it in a :py:const:`ProperMatrix` format we can safely process.

    If the data is in some strange sparse format, use ``default_layout`` (default: {default_layout})
    to decide whether to return it in ``row_major`` (CSR) or ``column_major`` (CSC) layout.
    """
    if matrix.ndim != 2:
        raise ValueError(f"data is {matrix.ndim}-dimensional, expected 2-dimensional")

    if default_layout not in LAYOUT_OF_AXIS:
        raise ValueError(f"invalid default layout: {default_layout}")

    frame = maybe_pandas_frame(matrix)
    if frame is not None:
        matrix = frame.values
        if isinstance(matrix, pd.core.arrays.categorical.Categorical):
            matrix = np.array(matrix)

    compressed = maybe_compressed_matrix(matrix)
    if compressed is not None:
        return compressed

    sparse = maybe_sparse_matrix(matrix)
    if sparse is not None:
        if default_layout == "column_major":
            with utm.timed_step("matrix.tocsc"):
                utm.timed_parameters(results=sparse.shape[1], elements=sparse.nnz / sparse.shape[1])
                return sparse.tocsc()
        with utm.timed_step("matrix.tocsr"):
            utm.timed_parameters(results=sparse.shape[0], elements=sparse.nnz / sparse.shape[0])
            return sparse.tocsr()

    dense = maybe_numpy_matrix(matrix)
    if dense is None:
        dense = np.asarray(matrix)

    return dense

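# Usage sketch for ``to_proper_matrix``. Hedged: assumes COO counts as one of
# the "strange" sparse formats (not compressed), so it is converted according
# to ``default_layout``, while dense numpy data passes through unchanged:
#
#     coo = sp.coo_matrix(np.eye(3))
#     csr = to_proper_matrix(coo)                                 # row_major (CSR)
#     csc = to_proper_matrix(coo, default_layout="column_major")  # column_major (CSC)
#     dense = to_proper_matrix(np.zeros((2, 2)))                  # returned as-is
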
def copy_adata(
    adata: AnnData, *, name: Optional[str] = None, share_derived: bool = True, top_level: bool = True
) -> AnnData:
    """
    Return a copy of some annotated ``adata``.

    If ``name`` is not specified, the data will be unnamed. Otherwise, if it starts with a ``.``,
    it will be appended to the current name (if any). Otherwise, ``name`` is the new name.

    If ``share_derived`` is ``True`` (the default), then the copy will share the derived data cache,
    which contains specific layout variants of matrix data and sums of columns/rows of matrix data.
    Do not use this if you intend to modify the copy in-place.

    .. note::

        In general we assume annotated data is **not** modified in-place, but it might make sense to
        create a copy (**not** sharing derived data), modify it immediately (before accessing data in
        a specific layout), and then proceed to process it without further modifications.
    """
    with utm.timed_step("adata.copy"):
        bdata = adata.copy()

    set_name(bdata, name)
    if hasattr(bdata, "__is_top_level__"):
        delattr(bdata, "__is_top_level__")

    if share_derived:
        if not hasattr(adata, "__derived__"):
            setattr(adata, "__derived__", {})
        setattr(bdata, "__derived__", getattr(adata, "__derived__"))
    else:
        if hasattr(bdata, "__derived__"):
            delattr(bdata, "__derived__")

    if top_level:
        utl.top_level(bdata)

    return bdata

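# Usage sketch for ``copy_adata``. Hedged: assumes ``adata`` is an existing
# ``AnnData`` whose current name is "full":
#
#     bdata = copy_adata(adata, name=".clean")        # named "full.clean", shares cache
#     cdata = copy_adata(adata, share_derived=False)  # private cache; safe to modify
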
def to_numpy_matrix(
    matrix: Matrix,
    *,
    default_layout: str = "row_major",
    copy: bool = False,
    only_extract: bool = False,
) -> NumpyMatrix:
    """
    Convert any :py:const:`Matrix` to a dense 2-dimensional :py:const:`NumpyMatrix`.

    If ``copy`` (default: {copy}), a copy of the data is returned even if no conversion needed to
    be done.

    If ``only_extract`` (default: {only_extract}), then assert this only extracts the data inside
    some pandas data.

    If the data is in some strange sparse format, use ``default_layout`` (default: {default_layout})
    to decide whether to return it in ``row_major`` (CSR) or ``column_major`` (CSC) layout.
    """
    assert default_layout in ("row_major", "column_major")

    sparse = maybe_sparse_matrix(matrix)
    if sparse is not None:
        assert not only_extract
        with utm.timed_step("sparse.toarray"):
            utm.timed_parameters(results=sparse.shape[0], elements=sparse.shape[1])
            layout = matrix_layout(sparse) or default_layout
            if layout == "row_major":
                order = "C"
            else:
                order = "F"
            dense = sparse.toarray(order=order)
    else:
        dense = mustbe_numpy_matrix(to_proper_matrix(matrix))

    if copy and id(dense) == id(matrix):
        dense = np.copy(dense)

    return mustbe_numpy_matrix(dense)

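# Usage sketch for ``to_numpy_matrix``. Hedged: a CSR input should come back as
# a C-ordered (row-major) dense array, and ``copy=True`` guarantees the result
# never aliases the input:
#
#     dense = to_numpy_matrix(csr)                # sparse -> dense, order="C"
#     same = to_numpy_matrix(dense)               # no conversion, same object
#     fresh = to_numpy_matrix(dense, copy=True)   # forced private copy
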
def slice(  # pylint: disable=redefined-builtin,too-many-branches,too-many-statements
    adata: AnnData,
    *,
    name: Optional[str] = None,
    obs: Optional[utt.Vector] = None,
    vars: Optional[utt.Vector] = None,
    track_obs: Optional[str] = None,
    track_var: Optional[str] = None,
    share_derived: bool = True,
    top_level: bool = True,
) -> AnnData:
    """
    Return new annotated data which includes a subset of the full ``adata``.

    If ``name`` is not specified, the data will be unnamed. Otherwise, if it starts with a ``.``,
    it will be appended to the current name (if any). Otherwise, ``name`` is the new name.

    If ``obs`` and/or ``vars`` are specified, they should be set to either a boolean mask or a
    collection of indices to include in the data slice. In the case of an indices array, it is
    assumed the indices are unique and sorted, that is, that their effect is similar to a mask.

    If ``track_obs`` and/or ``track_var`` are specified, the result slice will include a
    per-observation and/or per-variable annotation containing the indices of the sliced elements
    in the original full data.

    If the slice happens to be the full original data, then this becomes equivalent to
    :py:func:`copy_adata`, and by default this will ``share_derived`` (share the derived data
    cache).
    """
    assert "__x__" not in adata.layers

    is_same_obs: Optional[bool] = None
    if obs is None:
        obs = range(adata.n_obs)
        is_same_obs = True
    else:
        assert 0 < len(obs) <= adata.n_obs
        if len(obs) < adata.n_obs:
            is_same_obs = False

        obs = utt.to_numpy_vector(obs)
        if obs.dtype == "bool":
            assert obs.size == adata.n_obs
            assert np.any(obs)
            is_same_obs = bool(np.all(obs))

    is_same_vars: Optional[bool] = None
    if vars is None:
        vars = range(adata.n_vars)
        is_same_vars = True
    else:
        assert 0 < len(vars) <= adata.n_vars
        if len(vars) < adata.n_vars:
            is_same_vars = False

        vars = utt.to_numpy_vector(vars)
        if vars.dtype == "bool":
            assert vars.size == adata.n_vars
            assert np.any(vars)
            is_same_vars = bool(np.all(vars))

    if is_same_obs and is_same_vars:
        bdata = copy_adata(adata, name=name, share_derived=share_derived, top_level=top_level)

    else:
        if is_same_vars:
            replaced = _replace_with_layout(adata, "row_major")
        elif is_same_obs:
            replaced = _replace_with_layout(adata, "column_major")
        else:
            replaced = {}

        try:
            with utm.timed_step("adata.slice"):
                bdata = adata[obs, vars].copy()
        finally:
            _replace_back(adata, replaced)

        set_name(bdata, name)
        if hasattr(bdata, "__is_top_level__"):
            delattr(bdata, "__is_top_level__")
        if hasattr(bdata, "__derived__"):
            delattr(bdata, "__derived__")

        if share_derived and (is_same_obs or is_same_obs is None) and (is_same_vars or is_same_vars is None):
            if is_same_obs is None:
                is_same_obs = bdata.n_obs == adata.n_obs and bool(np.all(bdata.obs_names == adata.obs_names))
            if is_same_vars is None:
                is_same_vars = bdata.n_vars == adata.n_vars and bool(np.all(bdata.var_names == adata.var_names))
            if is_same_obs and is_same_vars:
                if not hasattr(adata, "__derived__"):
                    setattr(adata, "__derived__", {})
                setattr(bdata, "__derived__", getattr(adata, "__derived__"))

        if top_level:
            utl.top_level(bdata)

    if utl.logging_calc():
        utl.log_calc(f'slice {get_name(adata, "unnamed")} into {get_name(bdata, "unnamed")} shape {bdata.shape}')

    #

    if track_obs is not None:
        set_o_data(bdata, track_obs, np.arange(adata.n_obs)[obs])
    if track_var is not None:
        set_v_data(bdata, track_var, np.arange(adata.n_vars)[vars])

    return bdata

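# Usage sketch for ``slice``. Hedged: ``adata.obs["selected"]`` is a hypothetical
# boolean per-observation annotation:
#
#     mask = utt.to_numpy_vector(adata.obs["selected"])
#     sdata = slice(adata, name=".selected", obs=mask, track_obs="full_index")
#     # sdata.obs["full_index"] maps each kept observation back to its index in adata.
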
def parallel_map(
    function: Callable[[int], T],
    invocations: int,
    *,
    max_processors: int = 0,
) -> Iterable[T]:
    """
    Execute ``function``, in parallel, ``invocations`` times. Each invocation is given the
    invocation's index as its single argument.

    For our simple pipelines, only the main process is allowed to execute functions in parallel
    processes, that is, we do not support nested ``parallel_map`` calls.

    This uses :py:func:`get_processors_count` processes. If ``max_processors`` (default:
    {max_processors}) is zero, use all available processors. Otherwise, further reduce the number
    of processes used to at most the specified value.

    If this ends up using a single process, run the function serially. Otherwise, fork new
    processes to execute the function invocations (using
    ``multiprocessing.get_context('fork').Pool.map``).

    The downside is that this is slow, and you need to set up **mutable** shared memory (e.g. for
    large results) in advance. The upside is that each of these processes starts with a shared
    memory copy(-on-write) of the full Python state, that is, all the inputs for the function are
    available "for free".

    .. todo::

        It is currently only possible to invoke :py:func:`parallel_map` from the main application
        thread (that is, it does not nest).
    """
    assert function.__is_timed__  # type: ignore

    global IS_MAIN_PROCESS
    assert IS_MAIN_PROCESS

    global PROCESSES_COUNT
    PROCESSES_COUNT = min(PROCESSORS_COUNT, invocations)
    if max_processors != 0:
        assert max_processors > 0
        PROCESSES_COUNT = min(PROCESSES_COUNT, max_processors)

    if PROCESSES_COUNT == 1:
        return [function(index) for index in range(invocations)]

    NEXT_PROCESS_INDEX.value = 0  # type: ignore

    global PARALLEL_FUNCTION
    assert PARALLEL_FUNCTION is None

    global MAP_INDEX
    MAP_INDEX += 1

    PARALLEL_FUNCTION = function
    IS_MAIN_PROCESS = None

    try:
        utm.flush_timing()
        with utm.timed_step("parallel_map"):
            utm.timed_parameters(index=MAP_INDEX, processes=PROCESSES_COUNT)
            with get_context("fork").Pool(PROCESSES_COUNT) as pool:
                return pool.map(_invocation, range(invocations))
    finally:
        IS_MAIN_PROCESS = True
        PARALLEL_FUNCTION = None

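# Usage sketch for ``parallel_map``. Hedged: assumes the package's timing
# decorator (``utm.timed_call``) is what sets the ``__is_timed__`` flag asserted
# above. Inputs reach the workers via fork copy-on-write; anything large the
# workers must *write* needs pre-allocated shared memory rather than return
# values:
#
#     @utm.timed_call()
#     def compute_one(index: int) -> float:
#         return float(index) ** 0.5
#
#     results = list(parallel_map(compute_one, 100, max_processors=4))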