Beispiel #1
0
    def indices(self):
        # we have a list of groupers
        if isinstance(self.grouper, ops.BaseGrouper):
            return self.grouper.indices

        values = Categorical(self.grouper)
        return values._reverse_indexer()
Beispiel #2
0
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        # we have a list of groupers
        if isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.indices

        values = Categorical(self.grouping_vector)
        return values._reverse_indexer()
Beispiel #3
0
    def where(self, cond, other=None):
        if other is None:
            other = self._na_value
        values = np.where(cond, self.values, other)

        from pandas.core.arrays import Categorical
        cat = Categorical(values,
                          categories=self.categories,
                          ordered=self.ordered)
        return self._shallow_copy(cat, **self._get_attributes_dict())
Beispiel #4
0
    def melt_stub(df, stub, i, j, value_vars, sep):
        newdf = melt(df, id_vars=i, value_vars=value_vars,
                     value_name=stub.rstrip(sep), var_name=j)
        newdf[j] = Categorical(newdf[j])
        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")

        # GH17627 Cast numerics suffixes to int/float
        newdf[j] = to_numeric(newdf[j], errors='ignore')

        return newdf.set_index(i + [j])
Beispiel #5
0
def _try_cast(arr, dtype, copy, raise_cast_failure):
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : array-like
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.
    """
    # perf shortcut as this is the most common case
    if isinstance(arr, np.ndarray):
        if maybe_castable(arr) and not copy and dtype is None:
            return arr

    try:
        # GH#15832: Check if we are requesting a numeric dype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            subarr = maybe_cast_to_integer_array(arr, dtype)

        subarr = maybe_cast_to_datetime(arr, dtype)
        # Take care in creating object arrays (but iterators are not
        # supported):
        if is_object_dtype(dtype) and (
                is_list_like(subarr) and
                not (is_iterator(subarr) or isinstance(subarr, np.ndarray))):
            subarr = construct_1d_object_array_from_listlike(subarr)
        elif not is_extension_type(subarr):
            subarr = construct_1d_ndarray_preserving_na(subarr,
                                                        dtype,
                                                        copy=copy)
    except OutOfBoundsDatetime:
        # in case of out of bound datetime64 -> always raise
        raise
    except (ValueError, TypeError):
        if is_categorical_dtype(dtype):
            # We *do* allow casting to categorical, since we know
            # that Categorical is the only array type for 'category'.
            subarr = Categorical(arr, dtype.categories, ordered=dtype._ordered)
        elif is_extension_array_dtype(dtype):
            # create an extension array from its dtype
            array_type = dtype.construct_array_type()._from_sequence
            subarr = array_type(arr, dtype=dtype, copy=copy)
        elif dtype is not None and raise_cast_failure:
            raise
        else:
            subarr = np.array(arr, dtype=object, copy=copy)
    return subarr
Beispiel #6
0
    def get_result(self):
        values, _ = self.get_new_values()
        columns = self.get_new_columns()
        index = self.get_new_index()

        # may need to coerce categoricals here
        if self.is_categorical is not None:
            categories = self.is_categorical.categories
            ordered = self.is_categorical.ordered
            values = [Categorical(values[:, i], categories=categories,
                                  ordered=ordered)
                      for i in range(values.shape[-1])]

        return self.constructor(values, index=index, columns=columns)
Beispiel #7
0
def lexsort_indexer(keys, orders=None, na_position: str = "last"):
    """
    Parameters
    ----------
    na_position : {'first', 'last'}, default 'last'
    """
    from pandas.core.arrays import Categorical

    labels = []
    shape = []
    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    for key, order in zip(keys, orders):

        # we are already a Categorical
        if is_categorical_dtype(key):
            cat = key

        # create the Categorical
        else:
            cat = Categorical(key, ordered=True)

        if na_position not in ["last", "first"]:
            raise ValueError(f"invalid na_position: {na_position}")

        n = len(cat.categories)
        codes = cat.codes.copy()

        mask = cat.codes == -1
        if order:  # ascending
            if na_position == "last":
                codes = np.where(mask, n, codes)
            elif na_position == "first":
                codes += 1
        else:  # not order means descending
            if na_position == "last":
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == "first":
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, shape)
Beispiel #8
0
def lexsort_indexer(keys, orders=None, na_position='last'):
    from pandas.core.arrays import Categorical

    labels = []
    shape = []
    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    for key, order in zip(keys, orders):

        # we are already a Categorical
        if is_categorical_dtype(key):
            c = key

        # create the Categorical
        else:
            c = Categorical(key, ordered=True)

        if na_position not in ['last', 'first']:
            raise ValueError('invalid na_position: {!r}'.format(na_position))

        n = len(c.categories)
        codes = c.codes.copy()

        mask = (c.codes == -1)
        if order:  # ascending
            if na_position == 'last':
                codes = np.where(mask, n, codes)
            elif na_position == 'first':
                codes += 1
        else:  # not order means descending
            if na_position == 'last':
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == 'first':
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, shape)
Beispiel #9
0
    def _create_categorical(self,
                            data,
                            categories=None,
                            ordered=None,
                            dtype=None):
        """
        *this is an internal non-public method*

        create the correct categorical from data and the properties

        Parameters
        ----------
        data : data for new Categorical
        categories : optional categories, defaults to existing
        ordered : optional ordered attribute, defaults to existing
        dtype : CategoricalDtype, defaults to existing

        Returns
        -------
        Categorical
        """
        if (isinstance(data, (ABCSeries, type(self)))
                and is_categorical_dtype(data)):
            data = data.values

        if not isinstance(data, ABCCategorical):
            if ordered is None and dtype is None:
                ordered = False
            from pandas.core.arrays import Categorical
            data = Categorical(data,
                               categories=categories,
                               ordered=ordered,
                               dtype=dtype)
        else:
            if categories is not None:
                data = data.set_categories(categories, ordered=ordered)
            elif ordered is not None and ordered != data.ordered:
                data = data.set_ordered(ordered)
            if isinstance(dtype, CategoricalDtype):
                # we want to silently ignore dtype='category'
                data = data._set_dtype(dtype)
        return data
Beispiel #10
0
def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure):

    # perf shortcut as this is the most common case
    if take_fast_path:
        if maybe_castable(arr) and not copy and dtype is None:
            return arr

    try:
        # GH#15832: Check if we are requesting a numeric dype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            subarr = maybe_cast_to_integer_array(arr, dtype)

        subarr = maybe_cast_to_datetime(arr, dtype)
        # Take care in creating object arrays (but iterators are not
        # supported):
        if is_object_dtype(dtype) and (
                is_list_like(subarr) and
                not (is_iterator(subarr) or isinstance(subarr, np.ndarray))):
            subarr = construct_1d_object_array_from_listlike(subarr)
        elif not is_extension_type(subarr):
            subarr = construct_1d_ndarray_preserving_na(subarr,
                                                        dtype,
                                                        copy=copy)
    except OutOfBoundsDatetime:
        # in case of out of bound datetime64 -> always raise
        raise
    except (ValueError, TypeError):
        if is_categorical_dtype(dtype):
            # We *do* allow casting to categorical, since we know
            # that Categorical is the only array type for 'category'.
            subarr = Categorical(arr, dtype.categories, ordered=dtype.ordered)
        elif is_extension_array_dtype(dtype):
            # create an extension array from its dtype
            array_type = dtype.construct_array_type()._from_sequence
            subarr = array_type(arr, dtype=dtype, copy=copy)
        elif dtype is not None and raise_cast_failure:
            raise
        else:
            subarr = np.array(arr, dtype=object, copy=copy)
    return subarr
Beispiel #11
0
def lexsort_indexer(keys,
                    orders=None,
                    na_position: str = "last",
                    key: Optional[Callable] = None):
    """
    Performs lexical sorting on a set of keys

    Parameters
    ----------
    keys : sequence of arrays
        Sequence of ndarrays to be sorted by the indexer
    orders : boolean or list of booleans, optional
        Determines the sorting order for each element in keys. If a list,
        it must be the same length as keys. This determines whether the
        corresponding element in keys should be sorted in ascending
        (True) or descending (False) order. if bool, applied to all
        elements as above. if None, defaults to True.
    na_position : {'first', 'last'}, default 'last'
        Determines placement of NA elements in the sorted list ("last" or "first")
    key : Callable, optional
        Callable key function applied to every element in keys before sorting

        .. versionadded:: 1.0.0
    """
    from pandas.core.arrays import Categorical

    labels = []
    shape = []
    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    keys = [ensure_key_mapped(k, key) for k in keys]

    for k, order in zip(keys, orders):
        cat = Categorical(k, ordered=True)

        if na_position not in ["last", "first"]:
            raise ValueError(f"invalid na_position: {na_position}")

        n = len(cat.categories)
        codes = cat.codes.copy()

        mask = cat.codes == -1
        if order:  # ascending
            if na_position == "last":
                codes = np.where(mask, n, codes)
            elif na_position == "first":
                codes += 1
        else:  # not order means descending
            if na_position == "last":
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == "first":
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, shape)
Beispiel #12
0
def lexsort_indexer(
    keys, orders=None, na_position: str = "last", key: Callable | None = None
) -> npt.NDArray[np.intp]:
    """
    Performs lexical sorting on a set of keys

    Parameters
    ----------
    keys : sequence of arrays
        Sequence of ndarrays to be sorted by the indexer
    orders : bool or list of booleans, optional
        Determines the sorting order for each element in keys. If a list,
        it must be the same length as keys. This determines whether the
        corresponding element in keys should be sorted in ascending
        (True) or descending (False) order. if bool, applied to all
        elements as above. if None, defaults to True.
    na_position : {'first', 'last'}, default 'last'
        Determines placement of NA elements in the sorted list ("last" or "first")
    key : Callable, optional
        Callable key function applied to every element in keys before sorting

        .. versionadded:: 1.0.0

    Returns
    -------
    np.ndarray[np.intp]
    """
    from pandas.core.arrays import Categorical

    labels = []
    shape = []
    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    keys = [ensure_key_mapped(k, key) for k in keys]

    for k, order in zip(keys, orders):
        with warnings.catch_warnings():
            # TODO(2.0): unnecessary once deprecation is enforced
            # GH#45618 don't issue warning user can't do anything about
            warnings.filterwarnings("ignore", ".*SparseArray.*", category=FutureWarning)

            cat = Categorical(k, ordered=True)

        if na_position not in ["last", "first"]:
            raise ValueError(f"invalid na_position: {na_position}")

        n = len(cat.categories)
        codes = cat.codes.copy()

        mask = cat.codes == -1
        if order:  # ascending
            if na_position == "last":
                codes = np.where(mask, n, codes)
            elif na_position == "first":
                codes += 1
        else:  # not order means descending
            if na_position == "last":
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == "first":
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, tuple(shape))
Beispiel #13
0
def union_categoricals(to_union,
                       sort_categories: bool = False,
                       ignore_order: bool = False) -> Categorical:
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_ordered=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            return x._values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(
            is_dtype_equal(other.categories.dtype, first.categories.dtype)
            for other in to_union[1:]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(
            first._categories_match_up_to_permutation(other)
            for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        all_codes = [
            first._encode_with_my_categories(x)._codes for x in to_union
        ]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError(
                "Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_nd

            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = [
            recode_for_categories(c.codes, c.categories, categories)
            for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        else:
            raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(new_codes,
                       categories=categories,
                       ordered=ordered,
                       fastpath=True)