Example 1
def sort_indices(compressed: CompressedMatrix) -> None:
    """
    Ensure the indices are sorted in each row/column.
    """
    with utm.timed_step("sparse.sort_indices"):
        with unfrozen(compressed):
            utm.timed_parameters(before=compressed.nnz)
            compressed.sort_indices()
            utm.timed_parameters(after=compressed.nnz)
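For context, here is a minimal standalone sketch of the underlying ``scipy.sparse`` call the wrapper times; the ``CompressedMatrix`` type, the ``unfrozen`` context manager, and the ``utm`` timing module are package-specific and omitted here:

import numpy as np
from scipy.sparse import csr_matrix

# Build a 2x3 CSR matrix whose column indices are out of order within row 0.
data = np.array([1.0, 2.0, 3.0])
indices = np.array([2, 0, 1])
indptr = np.array([0, 2, 3])
matrix = csr_matrix((data, indices, indptr), shape=(2, 3))

assert not matrix.has_sorted_indices
matrix.sort_indices()  # in-place, the same call the wrapper performs
assert matrix.has_sorted_indices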
Example 2
def eliminate_zeros(compressed: CompressedMatrix) -> None:
    """
    Eliminate explicitly stored zeros in a compressed matrix.
    """
    with utm.timed_step("sparse.eliminate_zeros"):
        with unfrozen(compressed):
            utm.timed_parameters(before=compressed.nnz)
            compressed.eliminate_zeros()
            utm.timed_parameters(after=compressed.nnz)
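Again for context, a standalone sketch of the effect on ``nnz``, using a plain ``scipy.sparse`` matrix in place of the package's ``CompressedMatrix``:

import numpy as np
from scipy.sparse import csr_matrix

matrix = csr_matrix(np.array([[0.0, 1.0], [2.0, 0.0]]))
matrix.data[0] = 0.0  # turn a stored entry into an explicitly stored zero
assert matrix.nnz == 2  # nnz counts stored entries, including explicit zeros
matrix.eliminate_zeros()  # in-place, the same call the wrapper performs
assert matrix.nnz == 1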
Example 3
def sum_duplicates(compressed: CompressedMatrix) -> None:
    """
    Sum duplicate entries in a compressed matrix.
    """
    with utm.timed_step("sparse.sum_duplicates"):
        with unfrozen(compressed):
            utm.timed_parameters(before=compressed.nnz)
            compressed.sum_duplicates()
            utm.timed_parameters(after=compressed.nnz)
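A standalone sketch of the merging behavior, again using plain ``scipy.sparse`` in place of the package's ``CompressedMatrix``:

import numpy as np
from scipy.sparse import csr_matrix

# Row 0 stores two separate entries at the same (row, column) position.
data = np.array([1.0, 2.0])
indices = np.array([0, 0])
indptr = np.array([0, 2, 2])
matrix = csr_matrix((data, indices, indptr), shape=(2, 1))

assert matrix.nnz == 2
matrix.sum_duplicates()  # merges the duplicates, summing their values
assert matrix.nnz == 1
assert matrix[0, 0] == 3.0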
Example 4
def to_proper_matrix(matrix: Matrix, *, default_layout: str = "row_major") -> ProperMatrix:
    """
    Given some 2D ``matrix``, return it in a :py:const:`ProperMatrix` format we can safely process.

    If the data is in some strange sparse format, use ``default_layout`` (default: {default_layout})
    to decide whether to return it in ``row_major`` (CSR) or ``column_major`` (CSC) layout.
    """
    if matrix.ndim != 2:
        raise ValueError(f"data is {matrix.ndim}-dimensional, " "expected 2-dimensional")

    if default_layout not in LAYOUT_OF_AXIS:
        raise ValueError(f"invalid default layout: {default_layout}")

    frame = maybe_pandas_frame(matrix)
    if frame is not None:
        matrix = frame.values
        if isinstance(matrix, pd.core.arrays.categorical.Categorical):
            matrix = np.array(matrix)

    compressed = maybe_compressed_matrix(matrix)
    if compressed is not None:
        return compressed

    sparse = maybe_sparse_matrix(matrix)
    if sparse is not None:
        if default_layout == "column_major":
            with utm.timed_step("matrix.tocsc"):
                utm.timed_parameters(results=sparse.shape[1], elements=sparse.nnz / sparse.shape[1])
                return sparse.tocsc()

        with utm.timed_step("matrix.tocsr"):
            utm.timed_parameters(results=sparse.shape[0], elements=sparse.nnz / sparse.shape[0])
            return sparse.tocsr()

    dense = maybe_numpy_matrix(matrix)
    if dense is None:
        dense = np.asarray(matrix)

    return dense
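To illustrate the "strange sparse format" branch, the sketch below shows the ``tocsr``/``tocsc`` conversions that ``default_layout`` selects between, applied to a plain COO matrix:

import numpy as np
from scipy.sparse import coo_matrix

# A COO matrix is neither row-major (CSR) nor column-major (CSC).
coo = coo_matrix(
    (np.array([1.0, 2.0]), (np.array([0, 1]), np.array([1, 0]))),
    shape=(2, 2),
)
assert coo.format == "coo"

csr = coo.tocsr()  # the default_layout="row_major" path
csc = coo.tocsc()  # the default_layout="column_major" path
assert csr.format == "csr"
assert csc.format == "csc"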
Example 5
def copy_adata(adata: AnnData,
               *,
               name: Optional[str] = None,
               share_derived: bool = True,
               top_level: bool = True) -> AnnData:
    """
    Return a copy of some annotated ``adata``.

    If ``name`` is not specified, the data will be unnamed. Otherwise, if it starts with a ``.``, it
    will be appended to the current name (if any). Otherwise, ``name`` is the new name.

    If ``share_derived`` is ``True`` (the default), then the copy will share the derived data cache,
    which contains specific layout variants of matrix data and sums of columns/rows of matrix data.
    Only use this if you do **not** intend to modify the copy in-place.

    .. note::

        In general we assume annotated data is **not** modified in-place, but it might make sense to
        create a copy (**not** sharing derived data), modify it immediately (before accessing data
        in a specific layout), and then proceed to process it without further modifications.
    """
    with utm.timed_step("adata.copy"):
        bdata = adata.copy()

    set_name(bdata, name)

    if hasattr(bdata, "__is_top_level__"):
        delattr(bdata, "__is_top_level__")

    if share_derived:
        if not hasattr(adata, "__derived__"):
            setattr(adata, "__derived__", {})
        setattr(bdata, "__derived__", getattr(adata, "__derived__"))
    else:
        if hasattr(bdata, "__derived__"):
            delattr(bdata, "__derived__")

    if top_level:
        utl.top_level(bdata)
    return bdata
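The derived-cache sharing boils down to both objects referencing the same dictionary; a minimal standalone sketch, with an illustrative stand-in class instead of ``AnnData``:

class Data:  # stand-in for AnnData, for illustration only
    pass

adata = Data()
bdata = Data()

# share_derived=True: both objects reference the *same* cache dictionary, so a
# layout variant computed through one copy is visible through the other.
setattr(adata, "__derived__", {})
setattr(bdata, "__derived__", getattr(adata, "__derived__"))
getattr(adata, "__derived__")["row_major"] = "cached layout variant"
assert getattr(bdata, "__derived__")["row_major"] == "cached layout variant"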
Example 6
def to_numpy_matrix(
    matrix: Matrix,
    *,
    default_layout: str = "row_major",
    copy: bool = False,
    only_extract: bool = False,
) -> NumpyMatrix:
    """
    Convert any :py:const:`Matrix` to a dense 2-dimensional :py:const:`NumpyMatrix`.

    If ``copy`` (default: {copy}), a copy of the data is returned even if no conversion needed to be
    done.

    If ``only_extract`` (default: {only_extract}), then assert that this call merely extracts the
    data wrapped inside some pandas data, without performing any conversion.

    If the data is in some strange sparse format, use ``default_layout`` (default: {default_layout})
    to decide whether to return it in ``row_major`` (CSR) or ``column_major`` (CSC) layout.
    """
    assert default_layout in ("row_major", "column_major")

    sparse = maybe_sparse_matrix(matrix)
    if sparse is not None:
        assert not only_extract
        with utm.timed_step("sparse.toarray"):
            utm.timed_parameters(results=sparse.shape[0], elements=sparse.shape[1])
            layout = matrix_layout(sparse) or default_layout
            if layout == "row_major":
                order = "C"
            else:
                order = "F"
            dense = sparse.toarray(order=order)
    else:
        dense = mustbe_numpy_matrix(to_proper_matrix(matrix))

    if copy and id(dense) == id(matrix):
        dense = np.copy(dense)

    return mustbe_numpy_matrix(dense)
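A standalone sketch of the ``order="C"``/``order="F"`` distinction in the sparse branch, using plain ``scipy.sparse``:

import numpy as np
from scipy.sparse import csr_matrix

sparse = csr_matrix(np.arange(6, dtype=float).reshape(2, 3))

row_major = sparse.toarray(order="C")  # the layout == "row_major" path
column_major = sparse.toarray(order="F")  # the layout == "column_major" path

assert row_major.flags["C_CONTIGUOUS"]
assert column_major.flags["F_CONTIGUOUS"]
assert np.array_equal(row_major, column_major)  # same values, different memory order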
Example 7
def slice(  # pylint: disable=redefined-builtin,too-many-branches,too-many-statements
    adata: AnnData,
    *,
    name: Optional[str] = None,
    obs: utt.Vector = None,
    vars: utt.Vector = None,
    track_obs: Optional[str] = None,
    track_var: Optional[str] = None,
    share_derived: bool = True,
    top_level: bool = True,
) -> AnnData:
    """
    Return new annotated data which includes a subset of the full ``adata``.

    If ``name`` is not specified, the data will be unnamed. Otherwise, if it starts with a ``.``, it
    will be appended to the current name (if any). Otherwise, ``name`` is the new name.

    If ``obs`` and/or ``vars`` are specified, they should be set to either a boolean mask or a
    collection of indices to include in the data slice. In the case of an indices array, the
    indices are assumed to be unique and sorted, that is, their effect is similar to that of a mask.

    If ``track_obs`` and/or ``track_var`` are specified, the result slice will include a
    per-observation and/or per-variable annotation containing the indices of the sliced elements in
    the original full data.

    If the slice happens to be the full original data, then this becomes equivalent to
    :py:func:`copy_adata`, and by default this will ``share_derived`` (share the derived data
    cache).
    """
    assert "__x__" not in adata.layers

    is_same_obs: Optional[bool] = None
    if obs is None:
        obs = range(adata.n_obs)
        is_same_obs = True
    else:
        assert 0 < len(obs) <= adata.n_obs
        if len(obs) < adata.n_obs:
            is_same_obs = False

        obs = utt.to_numpy_vector(obs)
        if obs.dtype == "bool":
            assert obs.size == adata.n_obs
            assert np.any(obs)
            is_same_obs = bool(np.all(obs))

    is_same_vars: Optional[bool] = None
    if vars is None:
        vars = range(adata.n_vars)
        is_same_vars = True
    else:
        assert 0 < len(vars) <= adata.n_vars
        if len(vars) < adata.n_vars:
            is_same_vars = False

        vars = utt.to_numpy_vector(vars)
        if vars.dtype == "bool":
            assert vars.size == adata.n_vars
            assert np.any(vars)
            is_same_vars = bool(np.all(vars))

    if is_same_obs and is_same_vars:
        bdata = copy_adata(adata,
                           name=name,
                           share_derived=share_derived,
                           top_level=top_level)

    else:
        if is_same_vars:
            replaced = _replace_with_layout(adata, "row_major")
        elif is_same_obs:
            replaced = _replace_with_layout(adata, "column_major")
        else:
            replaced = {}

        try:
            with utm.timed_step("adata.slice"):
                bdata = adata[obs, vars].copy()
        finally:
            _replace_back(adata, replaced)

        set_name(bdata, name)

        if hasattr(bdata, "__is_top_level__"):
            delattr(bdata, "__is_top_level__")
        if hasattr(bdata, "__derived__"):
            delattr(bdata, "__derived__")

        if share_derived and (is_same_obs or is_same_obs is None) and (
                is_same_vars or is_same_vars is None):

            if is_same_obs is None:
                is_same_obs = bdata.n_obs == adata.n_obs and bool(
                    np.all(bdata.obs_names == adata.obs_names))

            if is_same_vars is None:
                is_same_vars = bdata.n_vars == adata.n_vars and bool(
                    np.all(bdata.var_names == adata.var_names))

            if is_same_obs and is_same_vars:
                if not hasattr(adata, "__derived__"):
                    setattr(adata, "__derived__", {})
                setattr(bdata, "__derived__", getattr(adata, "__derived__"))

        if top_level:
            utl.top_level(bdata)

    if utl.logging_calc():
        utl.log_calc(
            f'slice {get_name(adata, "unnamed")} into {get_name(bdata, "unnamed")} shape {bdata.shape}'
        )

    if track_obs is not None:
        set_o_data(bdata, track_obs, np.arange(adata.n_obs)[obs])

    if track_var is not None:
        set_v_data(bdata, track_var, np.arange(adata.n_vars)[vars])

    return bdata
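To illustrate the mask-versus-indices equivalence the docstring describes, a small standalone sketch using ``anndata`` slicing directly, without the package's wrapper:

import numpy as np
from anndata import AnnData

adata = AnnData(np.arange(20, dtype=np.float32).reshape(4, 5))

# A boolean mask and the matching sorted, unique indices select the same slice.
mask = np.array([True, False, True, False])
indices = np.array([0, 2])
assert np.array_equal(adata[mask, :].X, adata[indices, :].X)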
Example 8
def parallel_map(
    function: Callable[[int], T],
    invocations: int,
    *,
    max_processors: int = 0,
) -> Iterable[T]:
    """
    Execute ``function``, in parallel, ``invocations`` times. Each invocation is given the
    invocation's index as its single argument.

    For our simple pipelines, only the main process is allowed to execute functions in parallel
    processes, that is, we do not support nested ``parallel_map`` calls.

    This uses :py:func:`get_processors_count` processes. If ``max_processors`` (default:
    {max_processors}) is zero, use all available processors. Otherwise, cap the number of
    processes at the specified value.

    If this ends up using a single process, the function is run serially. Otherwise, new processes
    are forked to execute the function invocations (using
    ``multiprocessing.get_context('fork').Pool.map``).

    The downside is that this is slow, and you need to set up **mutable** shared memory (e.g. for
    large results) in advance. The upside is that each of these processes starts with a shared
    memory copy(-on-write) of the full Python state, that is, all the inputs for the function are
    available "for free".

    .. todo::

        It is currently only possible to invoke :py:func:`parallel_map` from the main application
        thread (that is, it does not nest).
    """
    assert function.__is_timed__  # type: ignore

    global IS_MAIN_PROCESS
    assert IS_MAIN_PROCESS

    global PROCESSES_COUNT
    PROCESSES_COUNT = min(PROCESSORS_COUNT, invocations)
    if max_processors != 0:
        assert max_processors > 0
        PROCESSES_COUNT = min(PROCESSES_COUNT, max_processors)

    if PROCESSES_COUNT == 1:
        return [function(index) for index in range(invocations)]

    NEXT_PROCESS_INDEX.value = 0  # type: ignore

    global PARALLEL_FUNCTION
    assert PARALLEL_FUNCTION is None

    global MAP_INDEX
    MAP_INDEX += 1

    PARALLEL_FUNCTION = function
    IS_MAIN_PROCESS = None
    try:
        utm.flush_timing()
        with utm.timed_step("parallel_map"):
            utm.timed_parameters(index=MAP_INDEX, processes=PROCESSES_COUNT)
            with get_context("fork").Pool(PROCESSES_COUNT) as pool:
                return pool.map(_invocation, range(invocations))
    finally:
        IS_MAIN_PROCESS = True
        PARALLEL_FUNCTION = None
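A minimal standalone sketch of the fork-pool pattern described above (Unix-only, since it relies on the ``fork`` start method); ``BIG_INPUT`` and ``invocation`` are illustrative names, not part of the package:

from multiprocessing import get_context

# Large read-only state, visible to forked workers copy-on-write, "for free".
BIG_INPUT = list(range(1_000_000))

def invocation(index: int) -> int:
    return BIG_INPUT[index] * 2  # reads the shared state without pickling it

if __name__ == "__main__":
    with get_context("fork").Pool(4) as pool:
        results = pool.map(invocation, range(8))
    assert results == [index * 2 for index in range(8)]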