def test_merge(self):
        """Merge two blocks and check placement and values of the result."""
        columns = Index(['e', 'a', 'b', 'd', 'f'])

        left_vals = randn(2, 10)
        right_vals = randn(2, 10)

        # Block placements are looked up by label in ``columns``.
        left_locs = columns.get_indexer(['e', 'b'])
        right_locs = columns.get_indexer(['a', 'd'])

        left = make_block(left_vals, left_locs)
        right = make_block(right_vals, right_locs)
        combined = left.merge(right)

        assert_almost_equal(combined.mgr_locs, [0, 1, 2, 3])
        # Rows 0 and 2 are expected to carry the first block's data,
        # rows 1 and 3 the second block's data.
        assert_almost_equal(combined.values[[0, 2]], left_vals)
        assert_almost_equal(combined.values[[1, 3]], right_vals)
def test_get_indexer():
    """Check ``get_indexer`` on a two-level MultiIndex.

    Covers exact lookups, the ``pad``/``ffill`` and ``backfill``/``bfill``
    fill methods (including reversed targets), non-MultiIndex targets, and
    the error raised when the index being searched contains duplicates.
    """
    major_axis = Index(lrange(4))
    minor_axis = Index(lrange(2))

    major_labels = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp)
    minor_labels = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp)

    index = MultiIndex(levels=[major_axis, minor_axis],
                       labels=[major_labels, minor_labels])
    # idx1 is the first five entries; idx2 picks entries 1, 3 and 5.
    idx1 = index[:5]
    idx2 = index[[1, 3, 5]]

    # exact match: entry 5 is not part of idx1, hence -1
    r1 = idx1.get_indexer(idx2)
    assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp))

    # forward fill
    r1 = idx2.get_indexer(idx1, method='pad')
    e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp)
    assert_almost_equal(r1, e1)

    r2 = idx2.get_indexer(idx1[::-1], method='pad')
    assert_almost_equal(r2, e1[::-1])

    # 'ffill' must behave as an alias of 'pad'
    rffill1 = idx2.get_indexer(idx1, method='ffill')
    assert_almost_equal(r1, rffill1)

    # backward fill
    r1 = idx2.get_indexer(idx1, method='backfill')
    e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp)
    assert_almost_equal(r1, e1)

    r2 = idx2.get_indexer(idx1[::-1], method='backfill')
    assert_almost_equal(r2, e1[::-1])

    # 'bfill' must behave as an alias of 'backfill'
    rbfill1 = idx2.get_indexer(idx1, method='bfill')
    assert_almost_equal(r1, rbfill1)

    # pass non-MultiIndex
    r1 = idx1.get_indexer(idx2.values)
    rexp1 = idx1.get_indexer(idx2)
    assert_almost_equal(r1, rexp1)

    r1 = idx1.get_indexer([1, 2, 3])
    assert (r1 == [-1, -1, -1]).all()

    # create index with duplicates
    idx1 = Index(lrange(10) + lrange(10))
    idx2 = Index(lrange(20))

    msg = "Reindexing only valid with uniquely valued Index objects"
    with tm.assert_raises_regex(InvalidIndexError, msg):
        # The context manager previously had no body, so the duplicate
        # case was never exercised (and the file did not parse).
        idx1.get_indexer(idx2)
    def test_get_indexer_strings_raises(self):
        """Inexact ``get_indexer`` lookups on a string Index raise TypeError.

        Each call asks for a distance-based lookup on string labels; the
        expected error message is the one produced by subtracting two
        ``str`` objects.
        """
        index = Index(["b", "c"])
        target = ["a", "b", "c", "d"]

        msg = r"unsupported operand type\(s\) for -: 'str' and 'str'"
        with pytest.raises(TypeError, match=msg):
            index.get_indexer(target, method="nearest")

        with pytest.raises(TypeError, match=msg):
            index.get_indexer(target, method="pad", tolerance=2)

        # ``tolerance`` only applies together with a fill method; without
        # ``method`` the call is rejected before any label arithmetic
        # happens, so the TypeError asserted here would never be raised.
        with pytest.raises(TypeError, match=msg):
            index.get_indexer(target, method="pad",
                              tolerance=[2, 2, 2, 2])
def _gen_query_anndata(
    data: Union[MultimodalData, UnimodalData],
    ref_features: pd.Index,
    obs_columns: Optional[List[str]] = None,
    matkey: str = "counts",
) -> anndata.AnnData:
    """ Generate a new query Anndata object for scvitools

    The count matrix of ``data`` is re-expressed over ``ref_features`` so
    that the resulting object has exactly one column per reference feature.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.
    ref_features: ``pd.Index``
        A pandas index of reference feature names
    obs_columns: ``List[str]``
        A list of obs keys that should be included in the new anndata.
        If ``None`` or empty, the full ``data.obs`` is used.
    matkey: ``str``, optional, default: ``"counts"``
        Matrix key for the raw count

    Returns
    -------
    An AnnData object.
    """
    mat = data.get_matrix(matkey)
    # Without the ``else`` the column selection was always overwritten by
    # the full obs table, silently ignoring ``obs_columns``.
    if obs_columns:
        obs_field = data.obs[obs_columns]
    else:
        obs_field = data.obs
    var_field = pd.DataFrame(index=ref_features)

    # Position of every feature of ``data`` inside ``ref_features``
    # (-1 when the feature is not part of the reference).
    indexer = ref_features.get_indexer(data.var_names)
    # Number of stored entries whose column maps to a reference feature.
    new_size = (indexer[mat.indices] >= 0).sum()
    # NOTE(review): the first and last arguments of this call were missing
    # in the source; ``mat.data`` and ``new_size`` are assumed from the CSR
    # triple and the otherwise unused ``new_size`` -- confirm against the
    # signature of ``_select_csr``.
    data_new, indices_new, indptr_new = _select_csr(mat.data, mat.indices,
                                                    mat.indptr, indexer,
                                                    new_size)
    X = csr_matrix((data_new, indices_new, indptr_new),
                   shape=(mat.shape[0], ref_features.size))

    return anndata.AnnData(X=X, obs=obs_field, var=var_field)
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    """
    Combine list-like of Categorical-like, unioning categories. All
    categories must have the same dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order: boolean, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

        .. versionadded:: 0.20.0

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical, CategoricalIndex, Series

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        # Reduce CategoricalIndex / Series inputs to their underlying values.
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        if isinstance(x, Categorical):
            return x
        raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
               for other in to_union[1:]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([c.codes for c in to_union])

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with "
                            "ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # Map each old code to the position of its category after sorting.
            indexer = categories.get_indexer(first.categories)
            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        new_codes = []
        for c in to_union:
            if len(c.categories) > 0:
                # Translate this input's codes into codes of the union.
                indexer = categories.get_indexer(c.categories)
                new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
            else:
                # must be all NaN
                new_codes.append(c.codes)
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        raise TypeError('Categorical.ordered must be the same')

    if ignore_order:
        ordered = False

    # NOTE(review): the end of this call was cut off in the source.
    # ``fastpath=True`` is assumed because ``new_codes`` holds codes rather
    # than values -- confirm against the Categorical constructor in use.
    return Categorical(new_codes, categories=categories, ordered=ordered,
                       fastpath=True)
    def test_get_indexer_nearest(self, method, tolerance, indexer, expected):
        """Compare ``get_indexer`` on a 0..9 integer Index with ``expected``.

        ``method``, ``tolerance``, ``indexer`` and ``expected`` are supplied
        by the caller; the result must equal ``expected`` as an ``intp``
        array.
        """
        base = Index(np.arange(10))

        result = base.get_indexer(indexer, method=method, tolerance=tolerance)
        wanted = np.array(expected, dtype=np.intp)
        tm.assert_numpy_array_equal(result, wanted)
# Example #10
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    """
    Combine list-like of Categorical-like, unioning categories. All
    categories must have the same dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order: boolean, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

        .. versionadded:: 0.20.0

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----

    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------

    >>> from pandas.api.types import union_categoricals

    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> union_categoricals([a, b], sort_categories=True)
    [b, c, a, b]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    [a, b, a, b, a]
    Categories (2, object): [a < b]

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    [a, b, c, c, b, a]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]
    """
    from pandas import Index, Categorical, CategoricalIndex, Series
    # Imported once here instead of on every loop iteration below.
    from pandas.core.algorithms import take_1d

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        # Reduce CategoricalIndex / Series inputs to their underlying values.
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        if isinstance(x, Categorical):
            return x
        raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(
            is_dtype_equal(other.categories.dtype, first.categories.dtype)
            for other in to_union[1:]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([c.codes for c in to_union])

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with "
                            "ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # Map each old code to the position of its category after sorting.
            indexer = categories.get_indexer(first.categories)
            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        new_codes = []
        for c in to_union:
            if len(c.categories) > 0:
                # Translate this input's codes into codes of the union.
                indexer = categories.get_indexer(c.categories)
                new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
            else:
                # must be all NaN
                new_codes.append(c.codes)
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        raise TypeError('Categorical.ordered must be the same')

    if ignore_order:
        ordered = False

    # NOTE(review): the end of this call was cut off in the source.
    # ``fastpath=True`` is assumed because ``new_codes`` holds codes rather
    # than values -- confirm against the Categorical constructor in use.
    return Categorical(new_codes,
                       categories=categories,
                       ordered=ordered,
                       fastpath=True)
def gen_reindexer(new_var: pd.Index, cur_var: pd.Index, *, fill_value=0):
    """
    Given a new set of var_names, and a current set, generates a function which will reindex
    a matrix to be aligned with the new set.

    Columns of ``cur_var`` that do not occur in ``new_var`` are dropped.
    Columns of ``new_var`` that do not occur in ``cur_var`` are left at zero
    unless a non-zero ``fill_value`` is given, in which case they are set to
    that value.

    Parameters
    ----------
    new_var
        Labels the returned matrix should be aligned to.
    cur_var
        Labels of the columns of the matrix that will be passed in.
    fill_value
        Default value for columns only present in ``new_var``. Can be
        overridden per call of the returned function.

    Returns
    -------
    A function ``reindexer(X, fill_value=fill_value)`` returning ``X`` with
    its columns rearranged to follow ``new_var``.

    Usage
    -----

    >>> a = AnnData(sparse.eye(3), var=pd.DataFrame(index=list("abc")))
    >>> b = AnnData(sparse.eye(2), var=pd.DataFrame(index=list("ba")))
    >>> reindexer = gen_reindexer(a.var_names, b.var_names)
    >>> sparse.vstack([a.X, reindexer(b.X)]).toarray()
    array([[1., 0., 0.],
           [0., 1., 0.],
           [0., 0., 1.],
           [0., 1., 0.],
           [1., 0., 0.]], dtype=float32)
    >>> reindexer_nan = gen_reindexer(a.var_names, b.var_names, fill_value=np.nan)
    >>> sparse.vstack([a.X, reindexer_nan(b.X)]).toarray()
    array([[ 1.,  0.,  0.],
           [ 0.,  1.,  0.],
           [ 0.,  0.,  1.],
           [ 0.,  1., nan],
           [ 1.,  0., nan]], dtype=float32)
    """
    new_size = len(new_var)
    old_size = len(cur_var)
    # Target column for every current column, -1 when it has none.
    new_pts = new_var.get_indexer(cur_var)
    cur_pts = np.arange(len(new_pts))

    # Keep only the current columns that exist in the new labels.
    mask = new_pts != -1

    new_pts = new_pts[mask]
    cur_pts = cur_pts[mask]

    def reindexer(X, fill_value=fill_value):
        # ``np.can_cast`` rejects plain Python scalars on NumPy >= 2, so the
        # check goes through the smallest dtype able to hold the value,
        # which mirrors the older value-based behaviour.
        if not np.can_cast(np.min_scalar_type(fill_value), X.dtype):
            out_dtype = np.promote_types(np.array(fill_value).dtype, X.dtype)
        else:
            out_dtype = X.dtype

        # Selection matrix with a one at (current column, new column).
        idxmtx = sparse.coo_matrix(
            (np.ones(len(new_pts), dtype=int), (cur_pts, new_pts)),
            shape=(old_size, new_size),
            dtype=out_dtype,
        )
        out = X @ idxmtx

        if fill_value != 0:
            to_fill = new_var.get_indexer(new_var.difference(cur_var))
            if len(to_fill) > 0:
                # More efficient to set columns on csc
                if sparse.issparse(out):
                    out = sparse.csc_matrix(out)
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning)
                    out[:, to_fill] = fill_value

        return out

    return reindexer
 def _process_pd_index(base_idx: pd.Index, index_1d: pd.Index) -> List[int]:
     if index_1d.has_duplicates:
         index_1d = index_1d.drop_duplicates()
     indexer = base_idx.get_indexer(index_1d)
     indexer = indexer[indexer >= 0]
     return indexer
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    Combine list-like of Categorical-like, unioning categories. All
    categories must have the same dtype.

    .. versionadded:: 0.19.0

    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order: boolean, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

        .. versionadded:: 0.20.0

    result : Categorical

        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
        Empty list of categoricals passed


    To learn more about categories, see `link


    >>> from pandas.api.types import union_categoricals

    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> union_categoricals([a, b], sort_categories=True)
    [b, c, a, b]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    [a, b, a, b, a]
    Categories (2, object): [a < b]

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_ordered=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    [a, b, c, c, b, a]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]
    from pandas import Index, Categorical, CategoricalIndex, Series

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        elif isinstance(x, Categorical):
            return x
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
               for other in to_union[1:]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([ for c in to_union])

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with "
                            "ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_1d
            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        new_codes = []
        for c in to_union:
            if len(c.categories) > 0:
                indexer = categories.get_indexer(c.categories)

                from pandas.core.algorithms import take_1d
                new_codes.append(take_1d(indexer,, fill_value=-1))
                # must be all NaN
        new_codes = np.concatenate(new_codes)
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
            raise TypeError('Categorical.ordered must be the same')

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered,