Ejemplo n.º 1
0
    def copy(self, deep=True):
        """Return a copy of this categorical column.

        Parameters
        ----------
        deep : bool, default True
            When True, the column is first duplicated with
            ``libcudf.copying.copy_column`` and the result is rebuilt from
            the duplicate's base buffers. When False, the result is rebuilt
            from this column's own codes buffer, mask, offset and size.

        Returns
        -------
        The column produced by ``column.build_categorical_column``, carrying
        this column's categories and ``ordered`` flag.
        """
        if not deep:
            own_codes = column.as_column(
                self.codes.base_data, dtype=self.codes.dtype
            )
            return column.build_categorical_column(
                categories=self.dtype.categories,
                codes=own_codes,
                mask=self.base_mask,
                ordered=self.dtype.ordered,
                offset=self.offset,
                size=self.size,
            )

        duplicate = libcudf.copying.copy_column(self)
        duplicate_codes = column.as_column(
            duplicate.base_data, dtype=duplicate.dtype
        )
        return column.build_categorical_column(
            categories=self.dtype.categories,
            codes=duplicate_codes,
            offset=duplicate.offset,
            size=duplicate.size,
            mask=duplicate.base_mask,
            ordered=self.dtype.ordered,
        )
Ejemplo n.º 2
0
 def __init__(self, values, **kwargs):
     """Build the index from *values*.

     Parameters
     ----------
     values : CategoricalColumn, pandas.Series, pandas.Categorical,
              pandas.CategoricalIndex, list or tuple
         * A ``CategoricalColumn`` is passed through unchanged.
         * A categorical ``pandas.Series``, a ``pandas.Categorical`` or a
           ``pandas.CategoricalIndex`` is rebuilt from its codes,
           categories and ``ordered`` flag.
         * A ``list``/``tuple`` becomes a categorical whose categories are
           the distinct values in order of first appearance.
         * Anything else is forwarded to the parent constructor untouched.
     **kwargs
         Passed through ``_setdefault_name`` and then to the parent
         ``__init__``.
     """
     kwargs = _setdefault_name(values, kwargs)
     if isinstance(values, CategoricalColumn):
         # Already in the representation the parent constructor expects.
         pass
     elif isinstance(values, pd.Series) and (
         is_categorical_dtype(values.dtype)
     ):
         codes_data = column.as_column(values.cat.codes.values)
         values = column.build_categorical_column(
             categories=values.cat.categories,
             codes=codes_data,
             ordered=values.cat.ordered,
         )
     elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
         codes_data = column.as_column(values.codes)
         values = column.build_categorical_column(
             categories=values.categories,
             codes=codes_data,
             ordered=values.ordered,
         )
     elif isinstance(values, (list, tuple)):
         # pandas rejects duplicate categories, so passing the raw values
         # as categories fails as soon as any value repeats. De-duplicate
         # while keeping first-appearance order (dicts preserve insertion
         # order).
         distinct = list(dict.fromkeys(values))
         values = column.as_column(
             pd.Categorical(values, categories=distinct)
         )
     super(CategoricalIndex, self).__init__(values, **kwargs)
Ejemplo n.º 3
0
    def _set_categories(
        self,
        current_categories: Any,
        new_categories: Any,
        is_unique: bool = False,
        ordered: bool = False,
    ) -> CategoricalColumn:
        """Return a new CategoricalColumn recoded against *new_categories*.

        Parameters
        ----------
        current_categories
            Categories the existing codes refer to.
        new_categories
            Categories of the result; duplicates are dropped (order kept)
            unless ``is_unique`` is true or they are already unique.
        is_unique : bool, default False
            Skip the uniqueness check on ``new_categories``.
        ordered : bool, default False
            ``ordered`` flag of the result; ``self.ordered`` is used only
            when ``None`` is passed explicitly.

        Notes
        -----
        Assumes ``new_categories`` is the same dtype as the current categories
        """
        old_cat_col = column.as_column(current_categories)
        target_cats = column.as_column(new_categories)

        # Duplicate target categories would make the join below ambiguous.
        # drop_duplicates() is used rather than unique() to keep the order.
        if not is_unique and not target_cats.is_unique:
            deduped = cudf.Series(target_cats).drop_duplicates(
                ignore_index=True
            )
            target_cats = deduped._column

        existing_codes = self.codes
        code_dtype = min_unsigned_type(
            max(len(old_cat_col), len(target_cats))
        )

        # Three lookup frames: category -> new code, category -> old code,
        # and old code -> original row position.
        new_map = cudf.DataFrame(
            {
                "new_codes": column.arange(
                    len(target_cats), dtype=code_dtype
                ),
                "cats": target_cats,
            }
        )
        old_map = cudf.DataFrame(
            {
                "old_codes": column.arange(
                    len(old_cat_col), dtype=code_dtype
                ),
                "cats": old_cat_col,
            }
        )
        rows = cudf.DataFrame(
            {
                "old_codes": existing_codes,
                "order": column.arange(len(existing_codes)),
            }
        )

        # Left joins: old categories with no match in the new set, and rows
        # whose old code has no match, end up without a new code.
        code_map = old_map.merge(new_map, on="cats", how="left")
        recoded = rows.merge(code_map, on="old_codes", how="left")
        # Restore the original row order after the joins.
        recoded = recoded.sort_values(by="order")
        recoded.reset_index(drop=True, inplace=True)

        if ordered is None:
            ordered = self.ordered
        result_codes = recoded["new_codes"]._column

        # The codes column is rebuilt from the raw data buffer and the mask
        # is handed to the categorical column separately.
        bare_codes = column.as_column(
            result_codes.base_data, dtype=result_codes.dtype
        )
        return column.build_categorical_column(
            categories=target_cats,
            codes=bare_codes,
            mask=result_codes.base_mask,
            size=result_codes.size,
            offset=result_codes.offset,
            ordered=ordered,
        )
Ejemplo n.º 4
0
    def find_and_replace(self, to_replace, replacement, all_nan):
        """Return a column with *to_replace* values swapped for *replacement*.

        Both value lists are translated to category codes with
        ``self._encode`` and the substitution is performed on the codes via
        ``libcudf.replace.replace``. The categories and ``ordered`` flag of
        the result are those of this column. ``all_nan`` is not used by
        this implementation.
        """
        codes_col = column.as_column(self.cat().codes)

        def _as_code_column(values):
            # Encode every value, then pack the codes using the codes dtype.
            encoded = [self._encode(item) for item in values]
            return column.as_column(
                np.asarray(encoded, dtype=codes_col.dtype)
            )

        targets = _as_code_column(to_replace)
        substitutes = _as_code_column(replacement)

        swapped = libcudf.replace.replace(codes_col, targets, substitutes)
        swapped_codes = column.as_column(
            swapped.base_data, dtype=swapped.dtype
        )

        return column.build_categorical_column(
            categories=self.dtype.categories,
            codes=swapped_codes,
            mask=swapped.base_mask,
            offset=swapped.offset,
            size=swapped.size,
            ordered=self.dtype.ordered,
        )
Ejemplo n.º 5
0
 def copy(self, deep=True):
     """Return a copy of this categorical column.

     With ``deep=True`` the codes and mask come from the column returned
     by ``libcudf.copying.copy_column(self)``; otherwise this column's own
     ``codes`` and ``mask`` are passed on. Categories and the ``ordered``
     flag are taken from ``self.dtype`` in both cases.
     """
     if deep:
         source = libcudf.copying.copy_column(self)
         codes, mask = source, source.mask
     else:
         codes, mask = self.codes, self.mask

     return column.build_categorical_column(
         categories=self.dtype.categories,
         codes=codes,
         mask=mask,
         ordered=self.dtype.ordered,
     )
Ejemplo n.º 6
0
    def set_categories(self, new_categories, **kwargs):
        """Return a new Series with the categories set to *new_categories*.

        Recognised keyword arguments
        ----------------------------
        ordered
            ``ordered`` flag for the result; defaults to ``self.ordered``.
        rename : bool, default False
            Treat ``new_categories`` as replacement labels for the existing
            categories (pandas behaviour). The existing codes are kept.
            This key is removed from ``kwargs`` before they are forwarded.

        All remaining keyword arguments are forwarded to
        ``_categories_equal``, ``_set_categories`` and
        ``_return_or_inplace``.

        Raises
        ------
        ValueError
            If ``rename`` is true and the number of new categories differs
            from the number of current categories.
        """
        ordered = kwargs.get("ordered", self.ordered)
        rename = kwargs.pop("rename", False)
        new_categories = column.as_column(new_categories)

        if not rename:
            # Only recode when the requested categories actually differ.
            result = self._column
            if not self._categories_equal(new_categories, **kwargs):
                result = self._set_categories(new_categories, **kwargs)
            return self._return_or_inplace(result, **kwargs)

        # Renaming keeps the codes, so the category count must not change.
        if len(new_categories) != len(self._column.categories):
            raise ValueError("new_categories must have the same "
                             "number of items as old categories")
        renamed = column.build_categorical_column(
            new_categories,
            self._column.children[0],
            self._column.mask,
            self._column.size,
            ordered=ordered,
        )
        return self._return_or_inplace(renamed, **kwargs)
Ejemplo n.º 7
0
 def apply_boolean_mask(self, mask):
     """Filter the column with *mask* and rebuild it as a categorical.

     The filtering itself is delegated to the parent class; the result
     supplies both the codes and the mask of the returned column, while
     categories and ``ordered`` come from ``self.dtype``.
     """
     filtered = super().apply_boolean_mask(mask)
     build_args = dict(
         categories=self.dtype.categories,
         codes=filtered,
         mask=filtered.mask,
         ordered=self.dtype.ordered,
     )
     return column.build_categorical_column(**build_args)
Ejemplo n.º 8
0
 def unique(self, method="sort"):
     """Return a categorical column holding the distinct codes.

     ``method`` is forwarded to ``self.as_numerical.unique``. The result
     keeps this column's categories and ``ordered`` flag.
     """
     distinct = self.as_numerical.unique(method=method)
     build_args = dict(
         categories=self.categories,
         codes=distinct,
         mask=distinct.mask,
         ordered=self.ordered,
     )
     return column.build_categorical_column(**build_args)
Ejemplo n.º 9
0
 def sort_by_values(self, ascending=True, na_position="last"):
     """Sort the column by its codes.

     Parameters
     ----------
     ascending : bool, default True
         Forwarded to ``self.as_numerical.sort_by_values``.
     na_position : str, default "last"
         Forwarded to ``self.as_numerical.sort_by_values``.

     Returns
     -------
     tuple
         ``(col, inds)``: the sorted categorical column and the second
         value returned by ``self.as_numerical.sort_by_values``.
     """
     codes, inds = self.as_numerical.sort_by_values(ascending, na_position)
     col = column.build_categorical_column(
         categories=self.dtype.categories,
         codes=codes,
         # Use the mask of the *sorted* codes. ``self.mask`` describes the
         # rows in their original order and would mark the wrong rows as
         # null once the codes have been rearranged.
         mask=codes.mask,
         ordered=self.dtype.ordered,
     )
     return col, inds
Ejemplo n.º 10
0
 def unique(self):
     """Return a categorical column holding the distinct codes.

     The distinct codes come from ``self.as_numerical.unique()``; the
     result is rebuilt from their base data buffer, base mask, offset and
     size, with this column's categories and ``ordered`` flag.
     """
     distinct = self.as_numerical.unique()
     bare_codes = column.as_column(distinct.base_data, dtype=distinct.dtype)
     return column.build_categorical_column(
         categories=self.categories,
         codes=bare_codes,
         mask=distinct.base_mask,
         offset=distinct.offset,
         size=distinct.size,
         ordered=self.ordered,
     )
Ejemplo n.º 11
0
 def normalize_binop_value(self, other):
     """Turn *other* into a categorical column the length of this one.

     ``other`` is encoded with ``self._encode`` and broadcast to
     ``len(self)`` entries of the codes dtype. The result carries this
     column's categories, mask and ``ordered`` flag.
     """
     encoded = self._encode(other)
     broadcast = utils.scalar_broadcast_to(
         encoded, size=len(self), dtype=self.codes.dtype
     )
     return column.build_categorical_column(
         categories=self.dtype.categories,
         codes=column.as_column(broadcast),
         mask=self.mask,
         ordered=self.dtype.ordered,
     )
Ejemplo n.º 12
0
    def _set_categories(self, new_categories, **kwargs):
        """Return a new CategoricalColumn recoded against *new_categories*.

        Recognised keyword arguments
        ----------------------------
        is_unique : bool, default False
            Skip the uniqueness check on ``new_categories``.
        ordered
            ``ordered`` flag of the result; defaults to ``self.ordered``.

        Notes
        -----
        Assumes ``new_categories`` is the same dtype as the current categories
        """

        from cudf import DataFrame, Series

        old_cats = self._column.categories
        target_cats = column.as_column(new_categories)

        # Duplicate target categories would make the join below ambiguous.
        # drop_duplicates() is used rather than unique() to keep the order.
        caller_says_unique = kwargs.get("is_unique", False)
        if not caller_says_unique and not target_cats.is_unique:
            deduped = Series(target_cats).drop_duplicates(ignore_index=True)
            target_cats = deduped._column

        existing_codes = self.codes
        code_dtype = existing_codes.dtype

        # Three lookup frames: category -> new code, category -> old code,
        # and old code -> original row position.
        new_map = DataFrame(
            {
                "new_codes": cupy.arange(len(target_cats), dtype=code_dtype),
                "cats": target_cats,
            }
        )
        old_map = DataFrame(
            {
                "old_codes": cupy.arange(len(old_cats), dtype=code_dtype),
                "cats": old_cats,
            }
        )
        rows = DataFrame(
            {
                "old_codes": existing_codes,
                "order": cupy.arange(len(existing_codes)),
            }
        )

        # Left joins: old categories with no match in the new set, and rows
        # whose old code has no match, end up without a new code.
        code_map = old_map.merge(new_map, on="cats", how="left")
        recoded = rows.merge(code_map, on="old_codes", how="left")
        # Restore the original row order after the joins.
        recoded = recoded.sort_values(by="order")
        recoded.reset_index(drop=True, inplace=True)

        ordered = kwargs.get("ordered", self.ordered)
        result_codes = recoded["new_codes"]._column

        # The codes column is rebuilt from the raw data buffer and the mask
        # is handed to the categorical column separately.
        bare_codes = column.as_column(
            result_codes.base_data, dtype=result_codes.dtype
        )
        return column.build_categorical_column(
            categories=target_cats,
            codes=bare_codes,
            mask=result_codes.base_mask,
            size=result_codes.size,
            offset=result_codes.offset,
            ordered=ordered,
        )
Ejemplo n.º 13
0
    def fillna(
        self, fill_value: Any = None, method: Any = None, dtype: Dtype = None
    ) -> CategoricalColumn:
        """Fill null values with *fill_value*.

        Parameters
        ----------
        fill_value
            A scalar or a column-like. A scalar equal to
            ``self.default_na_value()`` is cast to the codes dtype as is;
            any other scalar is first encoded with ``self._encode``. A
            column-like is recoded against this column's categories and
            its codes are cast to this column's codes dtype.
        method
            Forwarded to the parent class ``fillna``.
        dtype
            Not used by this implementation.

        Returns
        -------
        ``self`` when the column is not nullable, otherwise a column
        rebuilt from the parent class result with this column's categories
        and ``ordered`` flag.

        Raises
        ------
        ValueError
            If encoding a scalar raises ``ValueError``, or if a categorical
            ``fill_value`` has a dtype different from this column's.
        """
        if not self.nullable:
            return self

        if fill_value is not None and np.isscalar(fill_value):
            if fill_value == self.default_na_value():
                fill_value = self.codes.dtype.type(fill_value)
            else:
                try:
                    encoded = self._encode(fill_value)
                    fill_value = self.codes.dtype.type(encoded)
                except ValueError as err:
                    err_msg = "fill value must be in categories"
                    raise ValueError(err_msg) from err
        elif fill_value is not None:
            filler = column.as_column(fill_value, nan_as_null=False)
            dtype_mismatch = (
                isinstance(filler, CategoricalColumn)
                and self.dtype != filler.dtype
            )
            if dtype_mismatch:
                raise ValueError(
                    "Cannot set a Categorical with another, "
                    "without identical categories"
                )
            # TODO: only required if fill_value has a subset of the
            # categories:
            recoded = filler.cat()._set_categories(
                filler.cat().categories,
                self.categories,
                is_unique=True,
            )
            fill_value = column.as_column(recoded.codes).astype(
                self.codes.dtype
            )

        filled = super().fillna(value=fill_value, method=method)
        filled_codes = column.as_column(filled.base_data, dtype=filled.dtype)

        return column.build_categorical_column(
            categories=self.dtype.categories._values,
            codes=filled_codes,
            offset=filled.offset,
            size=filled.size,
            mask=filled.base_mask,
            ordered=self.dtype.ordered,
        )
Ejemplo n.º 14
0
 def sort_by_values(
     self, ascending: bool = True, na_position="last"
 ) -> Tuple[CategoricalColumn, NumericalColumn]:
     """Sort the column by its codes.

     Both arguments are forwarded to ``self.as_numerical.sort_by_values``.
     Returns ``(col, inds)``: a categorical column rebuilt from the sorted
     codes (data buffer, base mask and size) and the second value that
     call returned.
     """
     sorted_codes, inds = self.as_numerical.sort_by_values(
         ascending, na_position
     )
     bare_codes = column.as_column(
         sorted_codes.base_data, dtype=sorted_codes.dtype
     )
     sorted_col = column.build_categorical_column(
         categories=self.dtype.categories._values,
         codes=bare_codes,
         mask=sorted_codes.base_mask,
         size=sorted_codes.size,
         ordered=self.dtype.ordered,
     )
     return sorted_col, inds
Ejemplo n.º 15
0
    def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
        """Attach the metadata of *dtype* to this column.

        For a ``CategoricalDtype`` this column's data is used as the codes
        of a new categorical column with the dtype's categories and
        ``ordered`` flag; mask, size, offset and null count are carried
        over. For any other dtype the column is returned unchanged.
        """
        if not isinstance(dtype, CategoricalDtype):
            return self

        codes_col = build_column(self.base_data, dtype=self.dtype)
        return column.build_categorical_column(
            categories=dtype.categories._values,
            codes=codes_col,
            mask=self.base_mask,
            ordered=dtype.ordered,
            size=self.size,
            offset=self.offset,
            null_count=self.null_count,
        )
Ejemplo n.º 16
0
    def as_unordered(self, inplace=False):
        """Mark the categorical as unordered.

        With ``inplace=True`` the parent's dtype is flagged as unordered
        and ``None`` is returned. Otherwise a new Series is returned, built
        from the parent's categories, codes and mask with
        ``ordered=False``.
        """
        if inplace:
            self._parent.dtype.ordered = False
            return None

        from cudf import Series

        source = self._parent
        unordered_col = column.build_categorical_column(
            categories=source.dtype.categories,
            codes=source.codes,
            mask=source.mask,
            ordered=False,
        )
        return Series(unordered_col)
Ejemplo n.º 17
0
    def normalize_binop_value(self, other):
        """Turn *other* into a categorical column the length of this one.

        A zero-dimensional NumPy array is unwrapped to its scalar first.
        The value is then encoded with ``self._encode`` and broadcast to
        ``len(self)`` entries of the codes dtype. The result carries this
        column's categories, base mask and ``ordered`` flag.
        """
        is_zero_dim_array = isinstance(other, np.ndarray) and other.ndim == 0
        if is_zero_dim_array:
            other = other.item()

        encoded = self._encode(other)
        broadcast = cudf.utils.utils.scalar_broadcast_to(
            encoded, size=len(self), dtype=self.codes.dtype
        )
        return column.build_categorical_column(
            categories=self.dtype.categories,
            codes=column.as_column(broadcast),
            mask=self.base_mask,
            ordered=self.dtype.ordered,
        )
Ejemplo n.º 18
0
def _create_empty_categorical_column(categorical_column, dtype):
    """Build a categorical column whose codes are all the NA sentinel.

    Every code is ``categorical_column.default_na_value()``, broadcast to
    the size of ``categorical_column``. Offset, size and base mask are
    copied from ``categorical_column``; categories and the ``ordered``
    flag come from ``dtype``.
    """
    sentinel_codes = cudf.utils.utils.scalar_broadcast_to(
        categorical_column.default_na_value(),
        categorical_column.size,
        np.dtype(categorical_column.cat().codes),
    )
    return column.build_categorical_column(
        categories=dtype.categories,
        codes=column.as_column(sentinel_codes),
        offset=categorical_column.offset,
        size=categorical_column.size,
        mask=categorical_column.base_mask,
        ordered=dtype.ordered,
    )
Ejemplo n.º 19
0
    def as_ordered(self, **kwargs):
        """Return the categorical as an ordered Series.

        ``kwargs`` may contain ``inplace`` (default False). When this
        categorical is not already ordered, ``self._set_categories`` is
        called with the current categories and ``ordered=True`` plus the
        given keyword arguments.

        A Series is returned unless the intermediate result is ``None``
        (``inplace`` true and either the categorical is already ordered or
        ``_set_categories`` returned ``None``). The returned Series is
        built from the parent's categories, codes and mask with
        ``ordered=True``.
        """
        in_place = kwargs.get("inplace", False)
        outcome = None if in_place else self._parent

        if not self.ordered:
            kwargs["ordered"] = True
            outcome = self._set_categories(self.categories, **kwargs)

        if outcome is None:
            return None

        from cudf import Series

        source = self._parent
        ordered_col = column.build_categorical_column(
            categories=source.dtype.categories,
            codes=source.cat().codes,
            mask=source.mask,
            ordered=True,
        )
        return Series(ordered_col)
Ejemplo n.º 20
0
def indices_from_labels(obj, labels):
    """Look up the integer positions in *obj* of the given *labels*.

    ``labels`` is converted to a column and cast to match ``obj.index``
    (for a categorical index, its codes are cast to the index's codes
    dtype). It then becomes the index of an empty frame that is joined
    against a frame mapping ``obj.index`` to ``0..len(obj)-1``; the
    resulting position column is returned.
    """
    from cudf.core.column import column

    labels = column.as_column(labels)

    if not is_categorical_dtype(obj.index):
        labels = labels.astype(obj.index.dtype)
    else:
        labels = labels.astype("category")
        index_code_dtype = obj.index._values.codes.dtype
        recast_codes = labels.codes.astype(index_code_dtype)
        labels = column.build_categorical_column(
            categories=labels.dtype.categories,
            codes=recast_codes,
            ordered=labels.dtype.ordered,
        )

    wanted = cudf.DataFrame({}, index=labels)
    positions = cudf.DataFrame(
        {"_": cupy.arange(len(obj))}, index=obj.index
    )
    joined = wanted.join(positions)
    return joined["_"]
Ejemplo n.º 21
0
    def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series:
        """Convert the column to a categorical ``pandas.Series``.

        When the categories are floating point, the column is first
        rebuilt with a mask derived from ``self.notnull()``. Codes are cast
        to a signed dtype and nulls become ``-1`` before being handed to
        ``pd.Categorical.from_codes``; null/NaN categories are dropped.

        Parameters
        ----------
        index : pd.Index, optional
            Index of the returned Series.
        **kwargs
            Accepted but not used by this implementation.
        """
        source = self
        if self.categories.dtype.kind == "f":
            refreshed_mask = bools_to_mask(self.notnull())
            source = column.build_categorical_column(
                categories=self.categories,
                codes=column.as_column(self.codes, dtype=self.codes.dtype),
                mask=refreshed_mask,
                ordered=self.dtype.ordered,
                size=self.codes.size,
            )

        signed_dtype = min_signed_type(len(source.categories))
        signed_codes = source.cat().codes.astype(signed_dtype)
        host_codes = signed_codes.fillna(-1).to_array()
        host_categories = source.categories.dropna(drop_nan=True).to_pandas()

        categorical = pd.Categorical.from_codes(
            host_codes, categories=host_categories, ordered=source.ordered
        )
        return pd.Series(categorical, index=index)
Ejemplo n.º 22
0
    def fillna(self, fill_value):
        """Fill null values with *fill_value*.

        A scalar equal to ``self.default_na_value()`` is cast to the codes
        dtype as is; any other scalar is first encoded with
        ``self._encode``. A non-scalar is converted to a column, recoded
        against this column's categories, and its codes are cast to this
        column's codes dtype.

        Returns ``self`` when the column is not nullable; otherwise a
        column built without a mask from the ``replace_nulls`` result.

        Raises
        ------
        ValueError
            If encoding a scalar ``fill_value`` raises ``ValueError``.
        """
        if not self.nullable:
            return self

        if not np.isscalar(fill_value):
            filler = column.as_column(fill_value, nan_as_null=False)
            # TODO: only required if fill_value has a subset of the categories:
            recoded = filler.cat()._set_categories(
                self.categories, is_unique=True
            )
            fill_value = column.as_column(recoded.codes).astype(
                self.codes.dtype
            )
        elif fill_value == self.default_na_value():
            fill_value = self.codes.dtype.type(fill_value)
        else:
            try:
                encoded = self._encode(fill_value)
                fill_value = self.codes.dtype.type(encoded)
            except ValueError as err:
                err_msg = "fill value must be in categories"
                raise ValueError(err_msg) from err

        filled = libcudf.replace.replace_nulls(self, fill_value)
        filled_codes = column.as_column(filled.base_data, dtype=filled.dtype)

        return column.build_categorical_column(
            categories=self.dtype.categories,
            codes=filled_codes,
            offset=filled.offset,
            size=filled.size,
            mask=None,
            ordered=self.dtype.ordered,
        )
Ejemplo n.º 23
0
def pandas_categorical_as_column(categorical, codes=None):
    """Create a CategoricalColumn from a pandas.Categorical.

    If ``codes`` is given it is used instead of ``categorical.codes``.
    Codes equal to ``-1`` are treated as missing: when any are present, a
    mask is built from the validity flags and attached to the result.
    """
    code_col = column.as_column(
        categorical.codes if codes is None else codes
    )
    is_valid = code_col != -1

    null_mask = None
    if not np.all(is_valid):
        null_mask = Buffer(cudautils.compact_mask_bytes(is_valid))

    return column.build_categorical_column(
        categories=categorical.categories,
        codes=code_col,
        mask=null_mask,
        ordered=categorical.ordered,
    )
Ejemplo n.º 24
0
def pandas_categorical_as_column(categorical, codes=None):
    """Create a CategoricalColumn from a pandas.Categorical.

    If ``codes`` is given it is used instead of ``categorical.codes``.
    Codes equal to ``-1`` (in the codes dtype) are treated as missing:
    when any are present, a mask is built with ``bools_to_mask`` and
    attached to the result.
    """
    code_col = column.as_column(
        categorical.codes if codes is None else codes
    )
    missing_marker = code_col.dtype.type(-1)
    is_valid = code_col != missing_marker

    null_mask = None if is_valid.all() else bools_to_mask(is_valid)
    bare_codes = column.as_column(code_col.base_data, dtype=code_col.dtype)

    return column.build_categorical_column(
        categories=categorical.categories,
        codes=bare_codes,
        size=code_col.size,
        mask=null_mask,
        ordered=categorical.ordered,
    )
Ejemplo n.º 25
0
def indices_from_labels(obj, labels):
    """Look up the integer positions in *obj* of the given *labels*.

    Unless ``labels`` is a ``cudf.MultiIndex`` it is converted to a column
    and cast to match ``obj.index`` (for a categorical index, its codes
    are cast to the index's codes dtype). The labels are then joined
    against a frame mapping ``obj.index`` to ``0..len(obj)-1`` and the
    matching positions are returned in the order the labels were given.
    """
    from cudf.core.column import column

    if not isinstance(labels, cudf.MultiIndex):
        labels = column.as_column(labels)

        if not is_categorical_dtype(obj.index):
            labels = labels.astype(obj.index.dtype)
        else:
            labels = labels.astype("category")
            index_code_dtype = obj.index._values.codes.dtype
            recast_codes = labels.codes.astype(index_code_dtype)
            labels = column.build_categorical_column(
                categories=labels.dtype.categories,
                codes=recast_codes,
                ordered=labels.dtype.ordered,
            )

    # The join does not promise to keep the left-hand ordering, so the
    # original label positions are stored in column "__" and used to sort
    # the joined frame afterwards.
    wanted = cudf.DataFrame(
        {"__": column.arange(len(labels))}, index=labels
    )
    positions = cudf.DataFrame(
        {"_": column.arange(len(obj))}, index=obj.index
    )
    joined = wanted.join(positions)
    return joined.sort_values("__")["_"]
Ejemplo n.º 26
0
    def find_and_replace(self, to_replace, replacement, all_nan):
        """
        Return col with *to_replace* replaced with *replacement*.

        The replacement is applied to the category labels first, producing
        a new set of categories, and the existing codes are then remapped
        from their old integers to the integers of the new categories.

        Parameters
        ----------
        to_replace
            Category values to be replaced.
        replacement
            Values to substitute, matched up with ``to_replace`` by the
            ``replace`` calls below.
        all_nan
            Not referenced in this method body.

        Returns
        -------
        The column produced by ``column.build_categorical_column`` with the
        new categories, the remapped codes and this column's ``ordered``
        flag.
        """

        # create a dataframe containing the pre-replacement categories
        # and a copy of them to work with. The index of this dataframe
        # represents the original ints that map to the categories
        old_cats = cudf.DataFrame()
        old_cats["cats"] = column.as_column(self.dtype.categories)
        # Copied before "cats_replace" is added, so new_cats only has "cats"
        new_cats = old_cats.copy(deep=True)

        # Create a column with the appropriate labels replaced
        old_cats["cats_replace"] = old_cats["cats"].replace(
            to_replace, replacement)

        # Construct the new categorical labels
        # If a category is being replaced by an existing one, we
        # want to map it to None. If it's totally new, we want to
        # map it to the new label it is to be replaced by
        dtype_replace = cudf.Series(replacement)
        dtype_replace[dtype_replace.isin(old_cats["cats"])] = None
        new_cats["cats"] = new_cats["cats"].replace(to_replace, dtype_replace)

        # anything we mapped to None, we want to now filter out since
        # those categories don't exist anymore
        # Resetting the index creates a column 'index' that associates
        # the original integers to the new labels
        bmask = new_cats["cats"]._column.notna()
        new_cats = cudf.DataFrame({
            "cats":
            new_cats["cats"]._column.apply_boolean_mask(bmask)
        }).reset_index()

        # old_cats contains replaced categories and the ints that
        # previously mapped to those categories and the index of
        # new_cats is a RangeIndex that contains the new ints
        catmap = old_cats.merge(new_cats,
                                left_on="cats_replace",
                                right_on="cats",
                                how="inner")

        # The index of this frame is now the old ints, but the column
        # named 'index', which came from the filtered categories,
        # contains the new ints that we need to map to
        to_replace_col = column.as_column(catmap.index).astype(
            self.cat().codes.dtype)
        replacement_col = catmap["index"]._column.astype(
            self.cat().codes.dtype)

        # Apply the old-int -> new-int mapping to the codes themselves
        replaced = column.as_column(self.cat().codes)
        output = libcudf.replace.replace(replaced, to_replace_col,
                                         replacement_col)

        return column.build_categorical_column(
            categories=new_cats["cats"],
            codes=column.as_column(output.base_data, dtype=output.dtype),
            mask=output.base_mask,
            offset=output.offset,
            size=output.size,
            ordered=self.dtype.ordered,
        )
Ejemplo n.º 27
0
    def set_categories(
        self,
        new_categories,
        ordered=None,
        rename=False,
        inplace=False,
    ):
        """
        Set the categories to the specified new_categories.

        `new_categories` can include new categories (which
        will result in unused categories) or remove old categories
        (which results in values set to null). If `rename==True`,
        the categories are simply renamed: the existing codes are
        kept and `new_categories` must contain exactly as many
        items as the old categories.

        This method can be used to perform more than one action
        of adding, removing, and reordering simultaneously and
        is therefore faster than performing the individual steps
        via the more specialised methods.

        On the other hand, this method does not do checks
        (e.g., whether the old categories are included in the
        new categories on a reorder), which can result in
        surprising changes.

        Parameters
        ----------

        new_categories : list-like
            The categories in new order.

        ordered : bool, default None
            Whether or not the categorical is treated as
            an ordered categorical. If not given, do
            not change the ordered information.

        rename : bool, default False
            Whether or not the `new_categories` should be
            considered as a rename of the old categories
            or as reordered categories.

        inplace : bool, default False
            Whether or not to reorder the categories in-place
            or return a copy of this categorical with
            reordered categories.

        Returns
        -------
        cat
            Categorical with reordered categories
            or None if inplace.

        Raises
        ------
        ValueError
            If `rename` is true and `new_categories` does not have
            the same number of items as the old categories.

        Examples
        --------
        >>> import cudf
        >>> s = cudf.Series([1, 1, 2, 10, 2, 10], dtype='category')
        >>> s
        0     1
        1     1
        2     2
        3    10
        4     2
        5    10
        dtype: category
        Categories (3, int64): [1, 2, 10]
        >>> s.cat.set_categories([1, 10])
        0      1
        1      1
        2   null
        3     10
        4   null
        5     10
        dtype: category
        Categories (2, int64): [1, 10]
        >>> s.cat.set_categories([1, 10], inplace=True)
        >>> s
        0      1
        1      1
        2   null
        3     10
        4   null
        5     10
        dtype: category
        Categories (2, int64): [1, 10]
        """
        ordered = self.ordered if ordered is None else ordered
        new_categories = column.as_column(new_categories)

        # A categorical input contributes its categories, not its values.
        if isinstance(new_categories, CategoricalColumn):
            new_categories = new_categories.categories

        if rename:
            # when called with rename=True, the pandas behavior is
            # to replace the current category values with the new
            # categories, so the number of categories must not change.
            if len(new_categories) != len(self._column.categories):
                raise ValueError("new_categories must have the same "
                                 "number of items as old categories")

            renamed = column.build_categorical_column(
                categories=new_categories,
                codes=self._column.base_children[0],
                mask=self._column.base_mask,
                size=self._column.size,
                offset=self._column.offset,
                ordered=ordered,
            )
            return self._return_or_inplace(renamed, inplace=inplace)

        result = self._column
        if type(result.categories) is not type(new_categories):
            # If both categories are of different Column types,
            # return a column full of Nulls.
            result = _create_empty_categorical_column(
                self._column,
                CategoricalDtype(categories=new_categories,
                                 ordered=ordered),
            )
        elif (not self._categories_equal(new_categories, ordered=ordered)
              or not self.ordered == ordered):
            # Recode only when the categories or the ordered flag change.
            result = self._set_categories(
                self._column.categories,
                new_categories,
                ordered=ordered,
            )
        return self._return_or_inplace(result, inplace=inplace)