Ejemplo n.º 1
0
 def _compute_drop_idx(self):
     """Helper to compute indices to drop from category to drop"""
     if self.drop is None:
         return None
     elif isinstance(self.drop, str) and self.drop == 'first':
         return {feature: 0 for feature in self._encoders.keys()}
     elif isinstance(self.drop, (dict, list)):
         if isinstance(self.drop, list):
             self.drop = dict(zip(range(len(self.drop)), self.drop))
         if len(self.drop.keys()) != len(self._encoders):
             msg = ("`drop` should have as many columns as the number "
                    "of features ({}), got {}")
             raise ValueError(msg.format(len(self._encoders),
                                         len(self.drop.keys())))
         drop_idx = dict()
         for feature in self.drop.keys():
             self.drop[feature] = Series(self.drop[feature])
             if len(self.drop[feature]) != 1:
                 msg = ("Trying to drop multiple values for feature {}, "
                        "this is not supported.").format(feature)
                 raise ValueError(msg)
             cats = self._encoders[feature].classes_
             if not self.drop[feature].isin(cats).all():
                 msg = ("Some categories for feature {} were supposed "
                        "to be dropped, but were not found in the encoder "
                        "categories.".format(feature))
                 raise ValueError(msg)
             cats = Series(cats)
             idx = cats.isin(self.drop[feature])
             drop_idx[feature] = cp.asarray(cats[idx].index)
         return drop_idx
     else:
         msg = ("Wrong input for parameter `drop`. Expected "
                "'first', None or a dict, got {}")
         raise ValueError(msg.format(type(self.drop)))
Ejemplo n.º 2
0
    def remove_categories(self, removals, **kwargs):
        """
        Remove the given categories from the current category list.

        Parameters
        ----------
        removals : category or list-like of category
            Categories to remove; each must already be present in
            ``self.categories``.
        **kwargs
            Forwarded unchanged to ``self.set_categories``.

        Returns
        -------
        The result of ``self.set_categories`` called with the categories
        that remain after the removal.

        Raises
        ------
        ValueError
            If any entry of `removals` is not an existing category.
        """
        from cudf import Series

        current = self.categories.to_series()
        # Cast to the categories' dtype so membership tests compare
        # like with like.
        to_remove = Series(removals, dtype=current.dtype)
        present = to_remove.isin(current)

        if present.all():
            remaining = current[~current.isin(to_remove)]
            return self.set_categories(remaining, **kwargs)

        # At least one removal is unknown: raise to match Pandas behavior.
        missing = to_remove[~present].to_array()
        raise ValueError(
            "removals must all be in old categories: {}".format(missing)
        )
Ejemplo n.º 3
0
    def remove_categories(self, removals, **kwargs):
        """
        Drop the given categories from this categorical.

        Every entry of `removals` must be one of the existing
        categories. Values that belonged to a removed category
        become null.

        Parameters
        ----------

        removals : category or list-like of category
            The categories to remove.

        inplace : bool, default False
            Passed through ``**kwargs``. Whether to remove the
            categories inplace or return a copy of this
            categorical with the categories removed.

        Returns
        -------
        cat
            Categorical with removed categories or None
            if inplace.

        Raises
        ------
        ValueError
            If any entry of `removals` is not an existing category.

        Examples
        --------
        >>> import cudf
        >>> s = cudf.Series([10, 1, 1, 2, 10, 2, 10], dtype="category")
        >>> s
        0    10
        1     1
        2     1
        3     2
        4    10
        5     2
        6    10
        dtype: category
        Categories (3, int64): [1, 2, 10]
        >>> s.cat.remove_categories([1])
        0     10
        1   null
        2   null
        3      2
        4     10
        5      2
        6     10
        dtype: category
        Categories (2, int64): [2, 10]
        >>> s
        0    10
        1     1
        2     1
        3     2
        4    10
        5     2
        6    10
        dtype: category
        Categories (3, int64): [1, 2, 10]
        >>> s.cat.remove_categories([10], inplace=True)
        >>> s
        0   null
        1      1
        2      1
        3      2
        4   null
        5      2
        6   null
        dtype: category
        Categories (2, int64): [1, 2]
        """
        from cudf import Series

        current = self.categories.to_series()
        # Cast to the categories' dtype so membership tests compare
        # like with like.
        to_remove = Series(removals, dtype=current.dtype)
        present = to_remove.isin(current)

        # Unknown removals are an error, matching Pandas behavior.
        if not present.all():
            missing = to_remove[~present].to_array()
            raise ValueError(
                "removals must all be in old categories: {}".format(missing)
            )

        keep_mask = ~current.isin(to_remove)
        remaining = current[keep_mask]._column

        # Start from the existing column and only rebuild it when the
        # category set actually changes.
        result = self._column
        unchanged = self._categories_equal(remaining, **kwargs)
        if not unchanged:
            result = self._set_categories(remaining, **kwargs)

        return self._return_or_inplace(result, **kwargs)