Example #1
0
    def from_array(cls, data):
        """
        Build a Categorical from a single array-like by factorizing it.

        A sorted factorization is attempted first; if that raises TypeError
        the call is repeated with ``sort=False``.  The result's name is
        ``data.name`` when that attribute exists, otherwise None.
        """
        try:
            codes, uniques, _ = factorize(data, sort=True)
        except TypeError:
            # sorting failed -> keep the unsorted uniques
            codes, uniques, _ = factorize(data, sort=False)

        result_name = getattr(data, "name", None)
        return Categorical(codes, uniques, name=result_name)
Example #2
0
    def test_datelike(self):
        # Factorize datetime64 and Period Series, with and without sorting.

        # M8
        later = pd.Timestamp('20130101 09:00:00.00004')
        earlier = pd.Timestamp('20130101')
        ser = Series([later, later, later, earlier, earlier, later])

        codes, cats = algos.factorize(ser)
        expected = np.array([0, 0, 0, 1, 1, 0], dtype=np.int64)
        self.assert_numpy_array_equal(codes, expected)
        expected = np.array([later.value, earlier.value], dtype='M8[ns]')
        self.assert_numpy_array_equal(cats, expected)

        # with sort=True the earlier timestamp is expected first
        codes, cats = algos.factorize(ser, sort=True)
        expected = np.array([1, 1, 1, 0, 0, 1], dtype=np.int64)
        self.assert_numpy_array_equal(codes, expected)
        expected = np.array([earlier.value, later.value], dtype='M8[ns]')
        self.assert_numpy_array_equal(cats, expected)

        # period
        feb = pd.Period('201302', freq='M')
        mar = pd.Period('201303', freq='M')
        ser = Series([feb, feb, feb, mar, mar, feb])

        # periods are not 'sorted' as they are converted back into an index,
        # so both calls are expected to give the same result
        for kwargs in ({}, {'sort': True}):
            codes, cats = algos.factorize(ser, **kwargs)
            expected = np.array([0, 0, 0, 1, 1, 0], dtype=np.int64)
            self.assert_numpy_array_equal(codes, expected)
            self.assert_numpy_array_equal(cats, pd.PeriodIndex([feb, mar]))
Example #3
0
    def from_array(cls, data):
        """
        Build a Factor from an array-like by factorizing it.

        Sorted factorization is tried first; on TypeError the call is
        repeated with ``sort=False``.
        """
        try:
            codes, uniques, _ = factorize(data, sort=True)
        except TypeError:
            # sorting failed -> keep the unsorted uniques
            codes, uniques, _ = factorize(data, sort=False)

        return Factor(codes, uniques)
Example #4
0
    def from_array(cls, data):
        """
        Build a Factor from an array-like by factorizing it.

        ``factorize`` is imported inside the function.  Sorted factorization
        is tried first; on TypeError the call is repeated with
        ``sort=False``.
        """
        from pandas.core.algorithms import factorize

        try:
            codes, uniques, _ = factorize(data, sort=True)
        except TypeError:
            # sorting failed -> keep the unsorted uniques
            codes, uniques, _ = factorize(data, sort=False)

        return Factor(codes, uniques)
    def from_array(cls, data):
        """
        Build a Categorical from a single array-like.

        An Index that exposes ``factorize`` is factorized through its own
        method; anything else goes through the module-level ``factorize``,
        sorted first and unsorted if sorting raises TypeError.
        """
        use_own_method = isinstance(data, Index) and hasattr(data, 'factorize')
        if use_own_method:
            codes, uniques = data.factorize()
        else:
            try:
                codes, uniques = factorize(data, sort=True)
            except TypeError:
                codes, uniques = factorize(data, sort=False)

        result_name = getattr(data, 'name', None)
        return Categorical(codes, uniques, name=result_name)
    def __init__(self, labels, levels=None, name=None):
        """
        Store labels, levels and name on the instance.

        When ``levels`` is None, ``labels`` is treated as raw values: it is
        factorized (sorted first, unsorted on TypeError) to obtain both
        labels and levels, and a missing ``name`` defaults to the raw
        values' ``name`` attribute if there is one.
        """
        if levels is None:
            raw = labels
            if name is None:
                name = getattr(raw, 'name', None)
            try:
                labels, levels = factorize(raw, sort=True)
            except TypeError:
                labels, levels = factorize(raw, sort=False)

        self.labels = labels
        self.levels = levels
        self.name = name
Example #7
0
    def test_mixed(self):
        # Factorize an object Series mixing strings, floats and NaN.

        # doc example reshaping.rst
        x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])

        # (factorize kwargs, expected labels, expected uniques)
        cases = [
            ({}, [0, 0, -1, 1, 2, 3], ['A', 'B', 3.14, np.inf]),
            ({'sort': True}, [2, 2, -1, 3, 0, 1], [3.14, np.inf, 'A', 'B']),
        ]
        for kwargs, exp_labels, exp_uniques in cases:
            labels, uniques = algos.factorize(x, **kwargs)
            self.assert_numpy_array_equal(
                labels, np.array(exp_labels, dtype=np.int64))
            self.assert_numpy_array_equal(
                uniques, np.array(exp_uniques, dtype=object))
Example #8
0
    def __new__(cls, data):
        """
        Factorize ``data`` and return the labels viewed as a Factor.

        Sorted factorization is tried first and repeated unsorted on
        TypeError.  The uniques are attached to the returned object as
        ``levels`` after passing through ``_ensure_index``.
        """
        from pandas.core.index import _ensure_index
        from pandas.core.algorithms import factorize

        try:
            codes, uniques, _ = factorize(data, sort=True)
        except TypeError:
            codes, uniques, _ = factorize(data, sort=False)

        factor = codes.view(Factor)
        factor.levels = _ensure_index(uniques)
        return factor
Example #9
0
    def __init__(self, labels, levels=None, name=None):
        """
        Store labels, levels and name on the instance.

        When ``levels`` is None, ``labels`` is treated as raw values and is
        factorized: through its own ``factorize`` method if it is an Index
        exposing one, otherwise through the module-level ``factorize``
        (sorted first, unsorted on TypeError).  A missing ``name`` defaults
        to the raw values' ``name`` attribute if there is one.
        """
        if levels is None:
            raw = labels
            if name is None:
                name = getattr(raw, 'name', None)

            if not (isinstance(raw, Index) and hasattr(raw, 'factorize')):
                try:
                    labels, levels = factorize(raw, sort=True)
                except TypeError:
                    labels, levels = factorize(raw, sort=False)
            else:
                labels, levels = raw.factorize()

        self.labels = labels
        self.levels = levels
        self.name = name
Example #10
0
    def __init__(self, labels, levels=None, name=None):
        """
        Store labels, levels and name on the instance.

        When ``levels`` is None, ``labels`` is treated as raw values and is
        factorized: through its own ``factorize`` method if it is an Index
        exposing one, otherwise through the module-level ``factorize``
        (sorted first, unsorted on TypeError).  A missing ``name`` defaults
        to the raw values' ``name`` attribute if there is one.
        """
        if levels is None:
            raw = labels
            if name is None:
                name = getattr(raw, 'name', None)

            if not (isinstance(raw, Index) and hasattr(raw, 'factorize')):
                try:
                    labels, levels = factorize(raw, sort=True)
                except TypeError:
                    labels, levels = factorize(raw, sort=False)
            else:
                labels, levels = raw.factorize()

        self.labels = labels
        self.levels = levels
        self.name = name
Example #11
0
    def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
        """
        Return the group codes together with the unique group values.

        Three cases are handled: a categorical grouper (its own codes are
        returned with a Categorical of the relevant categories), a
        ``BaseGrouper`` (its precomputed codes/result are returned), and
        anything else (factorized via ``algorithms.factorize``).
        """
        if self._passed_categorical:
            # build the uniques from the categorical grouper, preserving
            # its categories / ordered attributes
            cat = self.grouper
            categories = cat.categories

            if not self.observed:
                # every category is a group, observed or not
                ucodes = np.arange(len(categories))
            else:
                ucodes = algorithms.unique1d(cat.codes)
                ucodes = ucodes[ucodes != -1]  # drop the -1 code
                if self.sort or cat.ordered:
                    ucodes = np.sort(ucodes)

            uniques = Categorical.from_codes(
                codes=ucodes, categories=categories, ordered=cat.ordered
            )
            return cat.codes, uniques

        if isinstance(self.grouper, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouper.codes_info
            uniques = self.grouper.result_arraylike
            return codes, uniques

        # GH35667, replace dropna=False with na_sentinel=None
        na_sentinel = -1 if self.dropna else None
        codes, uniques = algorithms.factorize(
            self.grouper, sort=self.sort, na_sentinel=na_sentinel
        )
        return codes, uniques
Example #12
0
    def test_mixed(self):
        # Factorize an object Series mixing strings, floats and NaN.

        # doc example reshaping.rst
        values = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])

        codes, cats = algos.factorize(values)
        expected = np.array([0, 0, -1, 1, 2, 3], dtype=np.int64)
        self.assert_numpy_array_equal(codes, expected)
        expected = np.array(['A', 'B', 3.14, np.inf], dtype=object)
        self.assert_numpy_array_equal(cats, expected)

        codes, cats = algos.factorize(values, sort=True)
        expected = np.array([2, 2, -1, 3, 0, 1], dtype=np.int64)
        self.assert_numpy_array_equal(codes, expected)
        expected = np.array([3.14, np.inf, 'A', 'B'], dtype=object)
        self.assert_numpy_array_equal(cats, expected)
Example #13
0
    def test_mixed(self):
        # Factorize an object Series mixing strings, floats and NaN; the
        # uniques are compared as an Index.

        # doc example reshaping.rst
        x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])

        # (factorize kwargs, expected labels, expected uniques)
        cases = [
            ({}, [0, 0, -1, 1, 2, 3], ['A', 'B', 3.14, np.inf]),
            ({'sort': True}, [2, 2, -1, 3, 0, 1], [3.14, np.inf, 'A', 'B']),
        ]
        for kwargs, exp_labels, exp_uniques in cases:
            labels, uniques = algos.factorize(x, **kwargs)
            self.assert_numpy_array_equal(
                labels, np.array(exp_labels, dtype=np.int_))
            tm.assert_index_equal(uniques, pd.Index(exp_uniques))
Example #14
0
    def test_mixed(self):
        # Factorize an object Series mixing strings, floats and NaN; the
        # uniques are compared as an Index.

        # doc example reshaping.rst
        values = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])

        codes, cats = algos.factorize(values)
        expected_codes = np.array([0, 0, -1, 1, 2, 3], dtype=np.int_)
        self.assert_numpy_array_equal(codes, expected_codes)
        expected_cats = pd.Index(['A', 'B', 3.14, np.inf])
        tm.assert_index_equal(cats, expected_cats)

        codes, cats = algos.factorize(values, sort=True)
        expected_codes = np.array([2, 2, -1, 3, 0, 1], dtype=np.int_)
        self.assert_numpy_array_equal(codes, expected_codes)
        expected_cats = pd.Index([3.14, np.inf, 'A', 'B'])
        tm.assert_index_equal(cats, expected_cats)
Example #15
0
 def factorize(self):
     """
     Factorize ``self.values`` and box the uniques as a PeriodIndex.

     Returns the labels together with a PeriodIndex built from the unique
     ordinals using this object's ``freq``.
     """
     from pandas.core.algorithms import factorize
     codes, ordinals = factorize(self.values)
     boxed = PeriodIndex(ordinal=ordinals, freq=self.freq)
     return codes, boxed
Example #16
0
 def factorize(self):
     """
     Factorize ``self.values`` and box the uniques as a PeriodIndex.

     The third value returned by ``factorize`` is discarded.  Returns the
     labels together with a PeriodIndex built from the unique ordinals
     using this object's ``freq``.
     """
     from pandas.core.algorithms import factorize
     codes, ordinals, _counts = factorize(self.values)
     boxed = PeriodIndex(ordinal=ordinals, freq=self.freq)
     return codes, boxed
Example #17
0
 def factorize(
     self,
     sort: bool = False,
     na_sentinel: int | lib.NoDefault = lib.no_default,
     use_na_sentinel: bool | lib.NoDefault = lib.no_default,
 ):
     """
     Delegate to ``algorithms.factorize`` with this object as the values.

     All three options are forwarded unchanged as keyword arguments.
     """
     options = {
         "sort": sort,
         "na_sentinel": na_sentinel,
         "use_na_sentinel": use_na_sentinel,
     }
     return algorithms.factorize(self, **options)
Example #18
0
 def factorize(self, na_sentinel=-1):
     """
     Factorize the dense form of this array; uniques come back sparse.

     Returns the codes from ``algos.factorize`` and a SparseArray of the
     uniques carrying this array's dtype.
     """
     # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA], which is
     # the opposite sparsity from what Sparse would want
     # (ExtensionArray.factorize -> Tuple[EA, EA]). Since the codes must be
     # a dense array anyway, an efficient sparse factorize is not attempted.
     dense = np.asarray(self)
     codes, dense_uniques = algos.factorize(dense, na_sentinel=na_sentinel)
     sparse_uniques = SparseArray(dense_uniques, dtype=self.dtype)
     return codes, sparse_uniques
Example #19
0
    def from_array(cls, data):
        """
        Construct a Categorical from one array-like object.

        An Index exposing ``factorize`` is factorized through its own
        method; any other input goes through the module-level
        ``factorize``, sorted first and unsorted if sorting raises
        TypeError.  The result takes ``data.name`` as its name when present.

        Parameters
        ----------
        data : array-like
            Can be an Index or array-like. The levels are assumed to be
            the unique values of `data`.
        """
        use_own_method = isinstance(data, Index) and hasattr(data, 'factorize')
        if use_own_method:
            codes, uniques = data.factorize()
        else:
            try:
                codes, uniques = factorize(data, sort=True)
            except TypeError:
                codes, uniques = factorize(data, sort=False)

        result_name = getattr(data, 'name', None)
        return Categorical(codes, uniques, name=result_name)
Example #20
0
 def _make_labels(self):
     """
     Factorize the grouper and cache labels, group index and counts.

     Raises
     ------
     Exception
         If ``self._was_factor`` is set.
     """
     if self._was_factor:  # pragma: no cover
         raise Exception('Should not call this method grouping by level')

     codes, values, counts = algos.factorize(self.grouper,
                                             sort=self.sort)
     group_index = Index(values, name=self.name)
     self._labels = codes
     self._group_index = group_index
     self._counts = counts
Example #21
0
 def _make_labels(self):
     """
     Factorize the grouper and cache labels, group index and counts.

     Raises
     ------
     Exception
         If ``self._was_factor`` is set.
     """
     if self._was_factor:  # pragma: no cover
         raise Exception('Should not call this method grouping by level')

     codes, values, counts = algos.factorize(self.grouper,
                                             sort=self.sort)
     group_index = Index(values, name=self.name)
     self._labels = codes
     self._group_index = group_index
     self._counts = counts
Example #22
0
    def from_array(cls, data):
        """
        Construct a Categorical from one array-like object.

        An Index exposing ``factorize`` is factorized through its own
        method; any other input goes through the module-level
        ``factorize``, sorted first and unsorted if sorting raises
        TypeError.  The result takes ``data.name`` as its name when present.

        Parameters
        ----------
        data : array-like
            Can be an Index or array-like. The levels are assumed to be
            the unique values of `data`.
        """
        use_own_method = isinstance(data, Index) and hasattr(data, "factorize")
        if use_own_method:
            codes, uniques = data.factorize()
        else:
            try:
                codes, uniques = factorize(data, sort=True)
            except TypeError:
                codes, uniques = factorize(data, sort=False)

        result_name = getattr(data, "name", None)
        return Categorical(codes, uniques, name=result_name)
Example #23
0
 def _make_labels(self):
     """
     Compute and cache ``_labels`` and ``_group_index`` if either is unset.

     A ``BaseGrouper`` supplies both directly; any other grouper is
     factorized and its uniques wrapped in an Index named ``self.name``.
     """
     if self._labels is not None and self._group_index is not None:
         # both already cached
         return

     if isinstance(self.grouper, BaseGrouper):
         # we have a list of groupers
         codes = self.grouper.label_info
         group_index = self.grouper.result_index
     else:
         codes, values = algorithms.factorize(self.grouper, sort=self.sort)
         group_index = Index(values, name=self.name)

     self._labels = codes
     self._group_index = group_index
Example #24
0
 def _make_codes(self) -> None:
     """
     Compute and cache ``_codes`` and ``_group_index`` if either is unset.

     A ``BaseGrouper`` supplies both directly; any other grouper is
     factorized and its uniques wrapped in an Index named ``self.name``.
     """
     if self._codes is not None and self._group_index is not None:
         # both already cached
         return

     if isinstance(self.grouper, ops.BaseGrouper):
         # we have a list of groupers
         group_codes = self.grouper.codes_info
         group_index = self.grouper.result_index
     else:
         group_codes, values = algorithms.factorize(self.grouper, sort=self.sort)
         group_index = Index(values, name=self.name)

     self._codes = group_codes
     self._group_index = group_index
Example #25
0
 def _make_labels(self):
     """
     Compute and cache ``_labels`` and ``_group_index`` if either is unset.

     A ``BaseGrouper`` supplies both directly; any other grouper is
     factorized and its uniques wrapped in an Index named ``self.name``.
     """
     if self._labels is not None and self._group_index is not None:
         # both already cached
         return

     if isinstance(self.grouper, BaseGrouper):
         # we have a list of groupers
         codes = self.grouper.label_info
         group_index = self.grouper.result_index
     else:
         codes, values = algorithms.factorize(
             self.grouper, sort=self.sort)
         group_index = Index(values, name=self.name)

     self._labels = codes
     self._group_index = group_index
Example #26
0
    def indices(self):
        """
        Map each group label to the integer positions belonging to it.

        A ``BaseGrouper`` answers with its own ``indices``.  Otherwise the
        grouper is factorized (honouring ``self.sort``) and a dict
        ``{group label: positions}`` is built in the order of the uniques.
        """
        grouper = self.grouper
        # we have a list of groupers
        if isinstance(grouper, ops.BaseGrouper):
            return grouper.indices

        codes, uniques = algorithms.factorize(grouper, sort=self.sort)

        positions = {}
        for code, category in enumerate(Index(uniques)):
            positions[category] = np.flatnonzero(codes == code)
        return positions
Example #27
0
    def test_basic(self):
        # Factorize plain lists of strings, ints and floats, with and
        # without sorting, and compare labels/uniques to expected arrays.

        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
                                           'a', 'c', 'c', 'c'])
        # NOTE(review): the labels check for the unsorted string case was
        # already disabled; the reason is not visible here, so it stays off.
        # self.assert_numpy_array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array(['a','b','c'], dtype=object))

        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
                                           'a', 'c', 'c', 'c'], sort=True)
        self.assert_numpy_array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array(['a','b','c'], dtype=object))

        labels, uniques = algos.factorize(list(reversed(range(5))))
        self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([ 4, 3, 2, 1, 0],dtype=np.int64))

        labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
        self.assert_numpy_array_equal(labels, np.array([ 4, 3, 2, 1, 0],dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([0, 1, 2, 3, 4], dtype=np.int64))

        # Float input: labels are integer codes and uniques keep the float
        # values, matching the sorted float case below.  The expected
        # dtypes used to be swapped (float64 labels / int64 uniques).
        labels, uniques = algos.factorize(list(reversed(np.arange(5.))))
        self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([4., 3., 2., 1., 0.], dtype=np.float64))

        labels, uniques = algos.factorize(list(reversed(np.arange(5.))), sort=True)
        self.assert_numpy_array_equal(labels, np.array([ 4, 3, 2, 1, 0],dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([0., 1., 2., 3., 4.], dtype=np.float64))
Example #28
0
    def test_basic(self):
        # Factorize plain lists of strings, ints and floats, with and
        # without sorting, and compare labels/uniques to expected arrays.

        labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"])
        # NOTE(review): the labels check for the unsorted string case was
        # already disabled; the reason is not visible here, so it stays off.
        # self.assert_numpy_array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))

        labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"], sort=True)
        self.assert_numpy_array_equal(labels, np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))

        labels, uniques = algos.factorize(list(reversed(range(5))))
        self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([4, 3, 2, 1, 0], dtype=np.int64))

        labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
        self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([0, 1, 2, 3, 4], dtype=np.int64))

        # Float input: labels are integer codes and uniques keep the float
        # values, matching the sorted float case below.  The expected
        # dtypes used to be swapped (float64 labels / int64 uniques).
        labels, uniques = algos.factorize(list(reversed(np.arange(5.0))))
        self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64))

        labels, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True)
        self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64))
Example #29
0
    def test_basic(self):
        # Factorize plain lists of strings, ints and floats, with and
        # without sorting, and compare labels/uniques to expected arrays.

        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
                                           'a', 'c', 'c', 'c'])
        # NOTE(review): the labels check for the unsorted string case was
        # already disabled; the reason is not visible here, so it stays off.
        # self.assert_numpy_array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array(['a','b','c'], dtype=object))

        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
                                           'a', 'c', 'c', 'c'], sort=True)
        self.assert_numpy_array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array(['a','b','c'], dtype=object))

        labels, uniques = algos.factorize(list(reversed(range(5))))
        self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([ 4, 3, 2, 1, 0],dtype=np.int64))

        labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
        self.assert_numpy_array_equal(labels, np.array([ 4, 3, 2, 1, 0],dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([0, 1, 2, 3, 4], dtype=np.int64))

        # Float input: labels are integer codes and uniques keep the float
        # values, matching the sorted float case below.  The expected
        # dtypes used to be swapped (float64 labels / int64 uniques).
        labels, uniques = algos.factorize(list(reversed(np.arange(5.))))
        self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([4., 3., 2., 1., 0.], dtype=np.float64))

        labels, uniques = algos.factorize(list(reversed(np.arange(5.))), sort=True)
        self.assert_numpy_array_equal(labels, np.array([ 4, 3, 2, 1, 0],dtype=np.int64))
        self.assert_numpy_array_equal(uniques, np.array([0., 1., 2., 3., 4.], dtype=np.float64))
Example #30
0
    def factorize(self,
                  na_sentinel: int = -1) -> Tuple[np.ndarray, "RLEArray"]:
        """
        Factorize the array by working on ``self._data`` directly.

        The module-level ``factorize`` is applied to ``self._data``; the
        uniques are rebuilt through ``_from_factorized`` and the codes are
        expanded with ``decompress`` using ``self._positions``.
        """
        # optimized version of `ExtensionArray.factorize`:
        #   1. use the compressed data instead of `_values_for_factorize`, so nothing is decompressed up front
        #   2. hand the compressed data to `factorize` (not `_factorize_array`, which does not handle NA values
        #      nicely)
        #   3. decompress the resulting codes
        compressed_codes, raw_uniques = factorize(self._data, na_sentinel=na_sentinel)

        boxed_uniques = self._from_factorized(raw_uniques, self)
        full_codes = decompress(compressed_codes, self._positions)
        return full_codes, boxed_uniques
Example #31
0
    def _get_compressed_labels(self):
        """
        Return compressed group labels and the observed group identifiers.

        When overflow is possible the per-grouping labels are zipped into
        tuples and factorized (reordered by uniques if ``self.sort``).
        Otherwise a group index is compressed via ``_compress_group_index``.
        """
        all_labels = [ping.labels for ping in self.groupings]

        if not self._overflow_possible:
            if len(all_labels) > 1:
                group_index = get_group_index(all_labels, self.shape)
            else:
                # single grouping: its labels already are the group index
                group_index = all_labels[0]
            comp_ids, obs_group_ids = _compress_group_index(group_index)
            return comp_ids, obs_group_ids

        zipped = lib.fast_zip(all_labels)
        labs, uniques, _ = algos.factorize(zipped)

        if self.sort:
            uniques, labs = _reorder_by_uniques(uniques, labs)

        return labs, uniques
Example #32
0
 def _make_codes(self) -> None:
     """
     Compute and cache ``_codes`` and ``_group_index`` if either is unset.

     A ``BaseGrouper`` supplies both directly.  Any other grouper is
     factorized, with ``na_sentinel`` set to -1 when ``self.dropna`` is
     truthy and None otherwise, and its uniques are wrapped in an Index
     named ``self.name``.
     """
     if self._codes is not None and self._group_index is not None:
         # both already cached
         return

     if isinstance(self.grouper, ops.BaseGrouper):
         # we have a list of groupers
         group_codes = self.grouper.codes_info
         group_index = self.grouper.result_index
     else:
         # GH35667, replace dropna=False with na_sentinel=None
         na_sentinel = -1 if self.dropna else None
         group_codes, values = algorithms.factorize(self.grouper,
                                                    sort=self.sort,
                                                    na_sentinel=na_sentinel)
         group_index = Index(values, name=self.name)

     self._codes = group_codes
     self._group_index = group_index
Example #33
0
    def factorize(self, sort=False, na_sentinel=-1):
        """
        Encode the object as an enumerated type or categorical variable.

        Delegates to ``pandas.core.algorithms.factorize`` with this object
        as the values.

        Parameters
        ----------
        sort : boolean, default False
            Sort by values
        na_sentinel : int, default -1
            Value to mark "not found"

        Returns
        -------
        labels : the indexer to the original array
        uniques : the unique Index
        """
        from pandas.core.algorithms import factorize as _factorize

        result = _factorize(self, sort=sort, na_sentinel=na_sentinel)
        return result
Example #34
0
def _levels_to_axis(
    ss,
    levels: tuple[int] | list[int],
    valid_ilocs: npt.NDArray[np.intp],
    sort_labels: bool = False,
) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
    """
    For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
    where `ax_coords` are the coordinates along one of the two axes of the
    destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
    which correspond to these coordinates.

    Parameters
    ----------
    ss : Series
    levels : tuple/list
    valid_ilocs : numpy.ndarray
        Array of integer positions of valid values for the sparse matrix in ss.
    sort_labels : bool, default False
        Sort the axis labels before forming the sparse matrix. When `levels`
        refers to a single level, set to True for a faster execution.

    Returns
    -------
    ax_coords : numpy.ndarray (axis coordinates)
    ax_labels : list (axis labels)
    """
    # Since the labels are sorted in `Index.levels`, when we wish to sort and
    # there is only one level of the MultiIndex for this axis, the desired
    # output can be obtained in the following simpler, more efficient way.
    if sort_labels and len(levels) == 1:
        ax_coords = ss.index.codes[levels[0]][valid_ilocs]
        ax_labels = ss.index.levels[levels[0]]

    else:
        levels_values = lib.fast_zip(
            [ss.index.get_level_values(lvl).values for lvl in levels]
        )
        codes, ax_labels = factorize(levels_values, sort=sort_labels)
        ax_coords = codes[valid_ilocs]

    ax_labels = ax_labels.tolist()
    return ax_coords, ax_labels
Example #35
0
    def test_datelike(self):
        # Factorize datetime64, Period and timedelta Series, with and
        # without sorting.

        # M8
        later = pd.Timestamp('20130101 09:00:00.00004')
        earlier = pd.Timestamp('20130101')
        ser = Series([later, later, later, earlier, earlier, later])

        codes, cats = algos.factorize(ser)
        expected = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_)
        self.assert_numpy_array_equal(codes, expected)
        expected = pd.DatetimeIndex([later, earlier])
        self.assert_index_equal(cats, expected)

        # with sort=True the earlier timestamp is expected first
        codes, cats = algos.factorize(ser, sort=True)
        expected = np.array([1, 1, 1, 0, 0, 1], dtype=np.int_)
        self.assert_numpy_array_equal(codes, expected)
        expected = pd.DatetimeIndex([earlier, later])
        self.assert_index_equal(cats, expected)

        # period
        feb = pd.Period('201302', freq='M')
        mar = pd.Period('201303', freq='M')
        ser = Series([feb, feb, feb, mar, mar, feb])

        # periods are not 'sorted' as they are converted back into an index,
        # so both calls are expected to give the same result
        for kwargs in ({}, {'sort': True}):
            codes, cats = algos.factorize(ser, **kwargs)
            expected = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_)
            self.assert_numpy_array_equal(codes, expected)
            self.assert_index_equal(cats, pd.PeriodIndex([feb, mar]))

        # GH 5986
        longer = pd.to_timedelta('1 day 1 min')
        shorter = pd.to_timedelta('1 day')
        ser = Series([longer, shorter, longer, longer,
                      shorter, shorter, longer])

        codes, cats = algos.factorize(ser)
        expected = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.int_)
        self.assert_numpy_array_equal(codes, expected)
        self.assert_index_equal(cats, pd.to_timedelta([longer, shorter]))

        codes, cats = algos.factorize(ser, sort=True)
        expected = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.int_)
        self.assert_numpy_array_equal(codes, expected)
        self.assert_index_equal(cats, pd.to_timedelta([shorter, longer]))
Example #36
0
    def test_datelike(self):
        # Factorize datetime64, Period and timedelta Series, with and
        # without sorting.

        # M8
        later = pd.Timestamp('20130101 09:00:00.00004')
        earlier = pd.Timestamp('20130101')
        ser = Series([later, later, later, earlier, earlier, later])

        codes, cats = algos.factorize(ser)
        expected = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_)
        self.assert_numpy_array_equal(codes, expected)
        expected = pd.DatetimeIndex([later, earlier])
        self.assert_index_equal(cats, expected)

        # with sort=True the earlier timestamp is expected first
        codes, cats = algos.factorize(ser, sort=True)
        expected = np.array([1, 1, 1, 0, 0, 1], dtype=np.int_)
        self.assert_numpy_array_equal(codes, expected)
        expected = pd.DatetimeIndex([earlier, later])
        self.assert_index_equal(cats, expected)

        # period
        feb = pd.Period('201302', freq='M')
        mar = pd.Period('201303', freq='M')
        ser = Series([feb, feb, feb, mar, mar, feb])

        # periods are not 'sorted' as they are converted back into an index,
        # so both calls are expected to give the same result
        for kwargs in ({}, {'sort': True}):
            codes, cats = algos.factorize(ser, **kwargs)
            expected = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_)
            self.assert_numpy_array_equal(codes, expected)
            self.assert_index_equal(cats, pd.PeriodIndex([feb, mar]))

        # GH 5986
        longer = pd.to_timedelta('1 day 1 min')
        shorter = pd.to_timedelta('1 day')
        ser = Series([longer, shorter, longer, longer,
                      shorter, shorter, longer])

        codes, cats = algos.factorize(ser)
        expected = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.int_)
        self.assert_numpy_array_equal(codes, expected)
        self.assert_index_equal(cats, pd.to_timedelta([longer, shorter]))

        codes, cats = algos.factorize(ser, sort=True)
        expected = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.int_)
        self.assert_numpy_array_equal(codes, expected)
        self.assert_index_equal(cats, pd.to_timedelta([shorter, longer]))
Example #37
0
    def _codes_and_uniques(
            self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
        """
        Return the group codes together with the unique group values.

        Three cases are handled, depending on the grouping vector:

        * a categorical was passed: its own codes are returned with a
          Categorical built from the relevant category codes;
        * a ``BaseGrouper``: its ``codes_info`` and the values of its
          ``result_index`` are returned;
        * anything else: the vector is factorized with
          ``algorithms.factorize``.
        """
        if self._passed_categorical:
            # build a Categorical of the group values out of the cat grouper,
            # preserving the categories / ordered attributes
            cat = self.grouping_vector
            categories = cat.categories

            if self._observed:
                # keep only codes that actually occur, dropping the -1 code
                ucodes = algorithms.unique1d(cat.codes)
                ucodes = ucodes[ucodes != -1]
                if self._sort or cat.ordered:
                    ucodes = np.sort(ucodes)
            else:
                # one code per category, whether it occurs or not
                ucodes = np.arange(len(categories))

            uniques = Categorical.from_codes(codes=ucodes,
                                             categories=categories,
                                             ordered=cat.ordered)
            return cat.codes, uniques

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouping_vector.codes_info
            # error: Incompatible types in assignment (expression has type "Union
            # [ExtensionArray, ndarray[Any, Any]]", variable has type "Categorical")
            uniques = (
                self.grouping_vector.result_index.
                _values  # type: ignore[assignment]
            )
        else:
            # GH35667, replace dropna=False with use_na_sentinel=False
            # error: Incompatible types in assignment (expression has type "Union[
            # ndarray[Any, Any], Index]", variable has type "Categorical")
            codes, uniques = algorithms.factorize(  # type: ignore[assignment]
                self.grouping_vector,
                sort=self._sort,
                use_na_sentinel=self._dropna)
        return codes, uniques
Example #38
0
def level_styler(linestyle=None, color=None, marker=None):
    """
    Map categorical values to persistent plot styles.

    This is useful for categorical plotting, when you want to distinguish
    groups of line plots by their style.  Each argument that is not None is
    factorized, and every distinct value is assigned one entry of the
    matching ``STYLES`` sequence, cycling back to the start when there are
    more distinct values than entries.

    Parameters
    ----------
    linestyle, color, marker : array-like, optional
        Category values, one per plotted line.  An argument is used only if
        its name is a key of ``STYLES`` (presumably all three are; confirm
        against the ``STYLES`` definition).

    Returns
    -------
    list of dict
        One dict per position, mapping each used style name to the chosen
        style value.  The list stops at the shortest argument.
    """
    # Snapshot the arguments by name; this must stay the first statement so
    # that no other locals leak into the lookup table.
    requested = locals().copy()

    styles = OrderedDict()

    for k, SC in STYLES.items():
        vals = requested.get(k, None)
        if vals is None:
            continue
        labels, _ = factorize(vals)
        labels = labels % len(SC)  # cycle back to start
        styles[k] = np.take(SC, labels)

    keys = styles.keys()
    # Built-in zip instead of itertools.izip: izip does not exist on
    # Python 3, and the result is materialized into a list either way.
    return [dict(zip(keys, st)) for st in zip(*styles.values())]
Example #39
0
def level_styler(linestyle=None, color=None, marker=None):
    """
    Map categorical values onto persistent plot styles.

    This function is useful for categorical plotting. Based on certain
    categories, it will return styles that are persistent, which helps when
    you want to distinguish groups of line plots by their style.

    Parameters
    ----------
    linestyle, color, marker : list-like, optional
        Categorical values, one entry per line to be drawn. Each argument that
        is not None is factorized and its integer labels are used to pick
        entries from the matching sequence in the module-level ``STYLES``
        mapping. Arguments left as None are ignored.

    Returns
    -------
    list of dict
        One dict per position, keyed by the style names that were supplied,
        e.g. ``[{'color': ..., 'marker': ...}, ...]``. The list is empty when
        no argument was supplied.
    """
    # Explicit mapping instead of ``locals()`` so that adding a local variable
    # later can never leak into the lookup (and no builtin is shadowed).
    requested = {"linestyle": linestyle, "color": color, "marker": marker}

    styles = OrderedDict()

    for style_name, choices in STYLES.items():
        vals = requested.get(style_name)
        if vals is None:
            continue
        labels, _ = factorize(vals)
        labels = labels % len(choices)  # cycle back to start
        styles[style_name] = np.take(choices, labels)

    keys = list(styles)
    # The builtin ``zip`` works on both Python 2 and 3; ``itertools.izip``
    # was removed in Python 3.
    return [dict(zip(keys, combo)) for combo in zip(*styles.values())]
Example #40
0
 def f(vals):
     """Factorize ``vals``; return the codes cast to ``"i8"`` and the unique count."""
     # The size hint follows len(df) but is capped at SIZE_HINT_LIMIT.
     hint = min(len(df), SIZE_HINT_LIMIT)
     codes, uniques = _algorithms.factorize(vals, size_hint=hint)
     return codes.astype("i8", copy=False), len(uniques)
Example #41
0
 def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1):
     """Delegate to ``algorithms.factorize``, forwarding ``sort`` and ``na_sentinel``."""
     options = {"sort": sort, "na_sentinel": na_sentinel}
     return algorithms.factorize(self, **options)
Example #42
0
    def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False,
                 levels=None):
        """
        Build the categorical from ``values``.

        Parameters
        ----------
        values : list-like, Index, Series or Categorical
            The data to encode. With ``fastpath=True`` it is stored directly
            as the codes instead.
        categories : list-like, optional
            The allowed categories. When None they are inferred by
            factorizing ``values`` (sorted if possible), or copied from
            ``values`` when it already has a categorical dtype.
        ordered : bool, optional
            Whether the categories are ordered. When None it is taken from a
            categorical input, set to True if the values could be sorted or
            if ``categories`` were given, and falls back to False otherwise.
        name : object, optional
            Name to store; defaults to ``values.name`` when that exists.
        fastpath : bool, default False
            Store the arguments as given and skip all inference below.
        levels : list-like, optional
            Deprecated alias of ``categories``.

        Raises
        ------
        ValueError
            If both ``categories`` and the deprecated ``levels`` are passed.
        TypeError
            If ``ordered`` is truthy, no categories were given and the values
            cannot be sorted.

        Warns
        -----
        FutureWarning
            If ``levels`` is passed.
        RuntimeWarning
            If integer ``values`` are combined with non-integer categories, or
            none of the integer ``values`` is found in ``categories`` (both
            hint at old-style "codes + levels" usage).
        """

        if fastpath:
            # fast path: store the arguments as given, no inference
            self._codes = values
            self.name = name
            self.categories = categories
            self.ordered = ordered
            return

        if name is None:
            name = getattr(values, 'name', None)

        # TODO: Remove after deprecation period in 2017/ after 0.18
        if levels is not None:
            warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead",
                 FutureWarning)
            if categories is None:
                categories = levels
            else:
                raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', "
                                 "use only 'categories'")

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            # inherit whatever the caller did not override explicitly
            if categories is None:
                categories = cat.categories
            if ordered is None:
                ordered = cat.ordered
            values = values.__array__()

        elif isinstance(values, Index):
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values, convert_dates=True)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the category
                # assignment step.
                dtype = 'object' if isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if categories is None:
            try:
                codes, categories = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                codes, categories = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying categories
                    raise TypeError("'values' is not ordered, please explicitly specify the "
                                    "categories order by passing in a categories argument.")
        else:
            # there were two ways if categories are present
            # - the old one, where each value is a int pointer to the levels array -> not anymore
            #   possible, but code outside of pandas could call us like that, so make some checks
            # - the new one, where each value is also in the categories array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            categories = self._validate_categories(categories)

            codes = _get_codes_for_values(values, categories)

            # TODO: check for old style usage. These warnings should be removed after 0.18/ in 2016
            if com.is_integer_dtype(values) and not com.is_integer_dtype(categories):
                warn("Values and categories have different dtypes. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

            if com.is_integer_dtype(values) and (codes == -1).all():
                warn("None of the categories were found in values. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

            # if we got categories, we can assume that the order is intended
            # if ordered is unspecified
            if ordered is None:
                ordered = True

        # ordered can still be None here (unsortable values, no categories given)
        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.categories = categories
        self.name = name
Example #43
0
 def factorize(self, sort=False, na_sentinel=-1):
     """Return ``algorithms.factorize`` applied to this object with the given options."""
     result = algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
     return result
Example #44
0
    def test_warn(self):
        """Calling ``factorize`` with the ``order`` keyword is expected to warn."""
        series = Series([1, 2, 3])
        expect_future_warning = tm.assert_produces_warning(FutureWarning)
        with expect_future_warning:
            algos.factorize(series, order='A')
Example #45
0
    def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False,
                 levels=None):
        """
        Build the categorical from ``values``.

        Parameters
        ----------
        values : list-like, Index, Series or Categorical
            The data to encode. With ``fastpath=True`` it is stored directly
            as the codes instead.
        categories : list-like, optional
            The allowed categories. When None they are inferred by
            factorizing ``values`` (sorted if possible), or copied from
            ``values`` when it already has a categorical dtype.
        ordered : bool, optional
            Whether the categories are ordered. When None it is taken from a
            categorical input, set to True if the values could be sorted or
            if ``categories`` were given, and falls back to False otherwise.
        name : object, optional
            Name to store; defaults to ``values.name`` when that exists.
        fastpath : bool, default False
            Store the arguments as given and skip all inference below.
        levels : list-like, optional
            Deprecated alias of ``categories``.

        Raises
        ------
        ValueError
            If both ``categories`` and the deprecated ``levels`` are passed.
        TypeError
            If ``ordered`` is truthy, no categories were given and the values
            cannot be sorted.

        Warns
        -----
        FutureWarning
            If ``levels`` is passed.
        RuntimeWarning
            If integer ``values`` are combined with non-integer categories, or
            none of the integer ``values`` is found in ``categories`` (both
            hint at old-style "codes + levels" usage).
        """

        if fastpath:
            # fast path: store the arguments as given, no inference
            self._codes = values
            self.name = name
            self.categories = categories
            self.ordered = ordered
            return

        if name is None:
            name = getattr(values, 'name', None)

        # TODO: Remove after deprecation period in 2017/ after 0.18
        if levels is not None:
            warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead",
                 FutureWarning)
            if categories is None:
                categories = levels
            else:
                raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', "
                                 "use only 'categories'")

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            # inherit whatever the caller did not override explicitly
            if categories is None:
                categories = cat.categories
            if ordered is None:
                ordered = cat.ordered
            values = values.__array__()

        elif isinstance(values, Index):
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values, convert_dates=True)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the category
                # assignment step.
                dtype = 'object' if isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if categories is None:
            try:
                codes, categories = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                codes, categories = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying categories
                    raise TypeError("'values' is not ordered, please explicitly specify the "
                                    "categories order by passing in a categories argument.")
        else:
            # there were two ways if categories are present
            # - the old one, where each value is a int pointer to the levels array -> not anymore
            #   possible, but code outside of pandas could call us like that, so make some checks
            # - the new one, where each value is also in the categories array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            categories = self._validate_categories(categories)

            codes = _get_codes_for_values(values, categories)

            # TODO: check for old style usage. These warnings should be removed after 0.18/ in 2016
            if com.is_integer_dtype(values) and not com.is_integer_dtype(categories):
                warn("Values and categories have different dtypes. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

            if com.is_integer_dtype(values) and (codes == -1).all():
                warn("None of the categories were found in values. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

            # if we got categories, we can assume that the order is intended
            # if ordered is unspecified
            if ordered is None:
                ordered = True

        # ordered can still be None here (unsortable values, no categories given)
        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.categories = categories
        self.name = name
Example #46
0
 def factorize(self, sort: bool = False, na_sentinel: int | None = -1):
     """Forward this object, ``sort`` and ``na_sentinel`` to ``algorithms.factorize``."""
     forwarded = dict(sort=sort, na_sentinel=na_sentinel)
     return algorithms.factorize(self, **forwarded)
Example #47
0
    def test_warn(self):
        """Calling ``factorize`` with the ``order`` keyword is expected to warn."""
        series = Series([1, 2, 3])
        expect_future_warning = tm.assert_produces_warning(FutureWarning)
        with expect_future_warning:
            algos.factorize(series, order='A')
Example #48
0
    def __init__(self,
                 values,
                 levels=None,
                 ordered=None,
                 name=None,
                 fastpath=False,
                 compat=False):
        """
        Build the categorical from ``values``.

        Parameters
        ----------
        values : list-like, Index, Series or Categorical
            The data to encode. With ``fastpath=True`` it is stored directly
            as the codes instead.
        levels : list-like, optional
            The allowed levels. When None they are inferred by factorizing
            ``values`` (sorted if possible), or copied from ``values`` when it
            already has a categorical dtype.
        ordered : bool, optional
            Whether the levels are ordered. When None it is taken from a
            categorical input, set to True if the values could be sorted or
            if codes were looked up against the given ``levels``, and falls
            back to False otherwise.
        name : object, optional
            Name to store; defaults to ``values.name`` when that exists.
        fastpath : bool, default False
            Store the arguments as given and skip all inference below.
        compat : bool, default False
            Deprecated. When True, ``levels`` is given and ``values`` is an
            integer array with every entry in ``[-1, len(levels))``, the
            values are used directly as codes and a FutureWarning is emitted.

        Raises
        ------
        TypeError
            If ``ordered`` is truthy, no levels were given and the values
            cannot be sorted.
        """

        if fastpath:
            # fast path
            self._codes = values
            self.name = name
            self.levels = levels
            self.ordered = ordered
            return

        if name is None:
            name = getattr(values, 'name', None)

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            # inherit whatever the caller did not override explicitly
            if levels is None:
                levels = cat.levels
            if ordered is None:
                ordered = cat.ordered
            values = values.__array__()

        elif isinstance(values, Index):
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values,
                                                         convert_dates=True)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the level
                # assignment step.
                dtype = 'object' if isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if levels is None:
            try:
                codes, levels = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                codes, levels = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying levels
                    raise TypeError(
                        "'values' is not ordered, please explicitly specify the level "
                        "order by passing in a level argument.")
        else:
            # there are two ways if levels are present
            # the old one, where each value is a int pointer to the levels array
            # the new one, where each value is also in the level array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            levels = self._validate_levels(levels)

            # There can be two ways: the old which passed in codes and levels directly
            # and values have to be inferred and the new one, which passes in values and levels
            # and _codes have to be inferred.

            # min and max can be higher and lower if not all levels are in the values
            if compat and (com.is_integer_dtype(values) and
                           (np.min(values) >= -1) and
                           (np.max(values) < len(levels))):
                warn(
                    "Using 'values' as codes is deprecated.\n"
                    "'Categorical(... , compat=True)' is only there for historical reasons and "
                    "should not be used in new code!\n"
                    "See https://github.com/pydata/pandas/pull/7217",
                    FutureWarning)
                # old-style call: the integers already are the codes
                codes = values
            else:
                codes = _get_codes_for_values(values, levels)

                # if we got levels, we can assume that the order is intended
                # if ordered is unspecified
                if ordered is None:
                    ordered = True

        # ordered can still be None here; store False in that case
        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.levels = levels
        self.name = name
Example #49
0
 def factorize(self, sort=False, na_sentinel=-1):
     """Return ``algorithms.factorize`` applied to this object with the given options."""
     result = algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
     return result
Example #50
0
    def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False):
        """
        Build the categorical from ``values``.

        Parameters
        ----------
        values : list-like, Index, Series or Categorical
            The data to encode. With ``fastpath=True`` it is stored directly
            as the codes instead.
        levels : list-like, optional
            The allowed levels. When None they are inferred by factorizing
            ``values`` (sorted if possible), or copied from ``values`` when it
            already has a categorical dtype.
        ordered : bool, optional
            Whether the levels are ordered. When None it is taken from a
            categorical input, set to True if the values could be sorted or
            if codes were looked up against the given ``levels``, and falls
            back to False otherwise.
        name : object, optional
            Name to store; defaults to ``values.name`` when that exists.
        fastpath : bool, default False
            Store the arguments as given and skip all inference below.
        compat : bool, default False
            Deprecated. When True, ``levels`` is given and ``values`` is an
            integer array with every entry in ``[-1, len(levels))``, the
            values are used directly as codes and a FutureWarning is emitted.

        Raises
        ------
        TypeError
            If ``ordered`` is truthy, no levels were given and the values
            cannot be sorted.
        """

        if fastpath:
            # fast path
            self._codes = values
            self.name = name
            self.levels = levels
            self.ordered = ordered
            return

        if name is None:
            name = getattr(values, 'name', None)

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            # inherit whatever the caller did not override explicitly
            if levels is None:
                levels = cat.levels
            if ordered is None:
                ordered = cat.ordered
            values = values.__array__()

        elif isinstance(values, Index):
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the level
                # assignment step.
                dtype = 'object' if com.isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if levels is None:
            try:
                codes, levels = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                codes, levels = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying levels
                    raise TypeError("'values' is not ordered, please explicitly specify the level "
                                    "order by passing in a level argument.")
        else:
            # there are two ways if levels are present
            # the old one, where each value is a int pointer to the levels array
            # the new one, where each value is also in the level array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            levels = self._validate_levels(levels)

            # There can be two ways: the old which passed in codes and levels directly
            # and values have to be inferred and the new one, which passes in values and levels
            # and _codes have to be inferred.

            # min and max can be higher and lower if not all levels are in the values
            if compat and (com.is_integer_dtype(values) and
                               (np.min(values) >= -1) and (np.max(values) < len(levels))):
                warn("Using 'values' as codes is deprecated.\n"
                     "'Categorical(... , compat=True)' is only there for historical reasons and "
                     "should not be used in new code!\n"
                     "See https://github.com/pydata/pandas/pull/7217", FutureWarning)
                # old-style call: the integers already are the codes
                codes = values
            else:
                codes = _get_codes_for_values(values, levels)

                # if we got levels, we can assume that the order is intended
                # if ordered is unspecified
                if ordered is None:
                    ordered = True

        # ordered can still be None here; store False in that case
        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.levels = levels
        self.name = name