Example #1
 def __init__(self, keys : List[Union[pdarray,np.int64,Strings]], 
             assume_sorted : bool=False, hash_strings : bool=True) -> None:
     self.logger = getArkoudaLogger(name=self.__class__.__name__)
     self.assume_sorted = assume_sorted
     self.hash_strings = hash_strings
     self.keys = keys
     if isinstance(keys, pdarray):
         self.nkeys = 1
         self.size = keys.size
         if assume_sorted:
             self.permutation = arange(self.size)
         else:
             self.permutation = argsort(keys)
     # for Strings or Categorical
     elif hasattr(keys, "group"):
         self.nkeys = 1
         self.size = keys.size
         if assume_sorted:
             self.permutation = arange(self.size)
         else:
             self.permutation = keys.group()
     else:
         self.nkeys = len(keys)
         self.size = keys[0].size
         for k in keys:
             if k.size != self.size:
                 raise ValueError("Key arrays must all be same size")
         if assume_sorted:
             self.permutation = arange(self.size)
         else:
             self.permutation = coargsort(keys)
         
     # self.permuted_keys = self.keys[self.permutation]
     self.find_segments()       
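A minimal usage sketch for this constructor (hypothetical data; assumes the usual import arkouda as ak and a reachable arkouda server):

import arkouda as ak
ak.connect()  # assumes a running arkouda server

# Single int64 key: the permutation comes from argsort(keys)
keys = ak.array([2, 0, 2, 1, 0])
g = ak.GroupBy(keys)
# keys[g.permutation] gathers equal keys into contiguous segments:
# array([0, 0, 1, 2, 2])

# Multiple keys: the permutation comes from coargsort, and every
# key array must have the same length
g2 = ak.GroupBy([keys, ak.array([1, 1, 0, 0, 1])])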
Example #2
 def __init__(self, values, **kwargs):
     if 'codes' in kwargs and 'categories' in kwargs:
         # This initialization is called by Categorical.from_codes()
         # The values arg is ignored
         self.codes = kwargs['codes']
         self.categories = kwargs['categories']            
         if 'permutation' in kwargs:
             self.permutation = kwargs['permutation']
         if 'segments' in kwargs:
             self.segments = kwargs['segments']
     else:
         # Typical initialization, called with values
         if not isinstance(values, Strings):
             raise ValueError("Categorical: inputs other than Strings not yet supported")
         g = GroupBy(values)
         self.categories = g.unique_keys
         self.codes = zeros(values.size, dtype=int64)
         self.codes[g.permutation] = g.broadcast(arange(self.categories.size))
         self.permutation = g.permutation
         self.segments = g.segments
     # Always set these values
     self.size = self.codes.size
     self.nlevels = self.categories.size
     self.ndim = self.codes.ndim
     self.shape = self.codes.shape
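A short sketch of the typical initialization path (hypothetical data; assumes a running arkouda server and that ak.array accepts a list of Python strings, as in recent arkouda releases):

import arkouda as ak
ak.connect()

values = ak.array(["b", "a", "b", "c", "a"])  # a Strings object
cat = ak.Categorical(values)
# cat.categories holds the unique strings and cat.codes[i] indexes
# into them, so cat.categories[cat.codes] reproduces the original values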
Example #3
 def sort(self):
     __doc__ = sort.__doc__
     idxperm = argsort(self.categories)
     inverse = zeros_like(idxperm)
     inverse[idxperm] = arange(idxperm.size)
     newvals = inverse[self.codes]
     return Categorical.from_codes(newvals, self.categories[idxperm])
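The remapping trick above is easiest to see in plain NumPy (hypothetical categories and codes):

import numpy as np

categories = np.array(["pear", "apple", "plum"])  # stored order
codes = np.array([0, 2, 1, 0])                    # pear, plum, apple, pear

idxperm = np.argsort(categories)            # [1, 0, 2]: apple, pear, plum
inverse = np.zeros_like(idxperm)
inverse[idxperm] = np.arange(idxperm.size)  # maps old position -> new position
newvals = inverse[codes]                    # remapped codes: [1, 2, 0, 1]

# The decoded values are unchanged; only the category order differs
assert (categories[idxperm][newvals] == categories[codes]).all()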
Example #4
 def argsort(self):
     __doc__ = argsort.__doc__
     idxperm = argsort(self.categories)
     inverse = zeros_like(idxperm)
     inverse[idxperm] = arange(idxperm.size)
     newvals = inverse[self.codes]
     return argsort(newvals)
Example #5
 def __init__(self, values, **kwargs) -> None:
     self.logger = getArkoudaLogger(name=__class__.__name__)  # type: ignore
     if 'codes' in kwargs and 'categories' in kwargs:
         # This initialization is called by Categorical.from_codes()
         # The values arg is ignored
         self.codes = kwargs['codes']
         self.categories = kwargs['categories']
         if 'permutation' in kwargs:
             self.permutation = cast(pdarray, kwargs['permutation'])
         if 'segments' in kwargs:
             self.segments = cast(pdarray, kwargs['segments'])
     else:
         # Typical initialization, called with values
         if not isinstance(values, Strings):
             raise ValueError(("Categorical: inputs other than " +
                               "Strings not yet supported"))
         g = GroupBy(values)
         self.categories = g.unique_keys
         self.codes = g.broadcast(arange(self.categories.size),
                                  permute=True)
         self.permutation = cast(pdarray, g.permutation)
         self.segments = g.segments
     # Always set these values
     self.size: int_scalars = self.codes.size
     self.nlevels = self.categories.size
     self.ndim = self.codes.ndim
     self.shape = self.codes.shape
     self.name: Optional[str] = None
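The difference between this version and Example #2 is where the scatter back to original order happens. A NumPy sketch of the two styles (hypothetical permutation and segments):

import numpy as np

permutation = np.array([1, 4, 0, 2, 3])  # gathers equal values together
segments = np.array([0, 2, 4])           # group start offsets, grouped order
group_codes = np.arange(3)               # one code per group

# broadcast(values): repeat each group's value across its segment,
# leaving the result in grouped order
sizes = np.diff(np.append(segments, permutation.size))
grouped = np.repeat(group_codes, sizes)  # [0, 0, 1, 1, 2]

# Example #2 style: scatter back to original order by hand
codes = np.zeros(permutation.size, dtype=np.int64)
codes[permutation] = grouped

# Example #5 style: broadcast(..., permute=True) performs that same
# scatter internally, so no explicit indexed assignment is needed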
Example #6
    def concatenate(self, others : List[Categorical], ordered : bool=True) -> Categorical:
        """
        Merge this Categorical with other Categorical objects,
        concatenating the arrays and synchronizing the categories.

        Parameters
        ----------
        others : List[Categorical]
            The Categorical arrays to concatenate and merge with this one
        ordered : bool
            If True (default), the arrays will be appended in the
            order given. If False, array data may be interleaved
            in blocks, which can greatly improve performance but
            results in non-deterministic ordering of elements.

        Returns
        -------
        Categorical 
            The merged Categorical object
            
        Raises
        ------
        TypeError
            Raised if any of the other objects are not Categorical objects

        Notes
        -----
        This operation can be expensive -- slower than concatenating Strings.
        """
        if isinstance(others, Categorical):
            others = [others]
        elif len(others) < 1:
            return self
        samecategories = True
        for c in others:
            if not isinstance(c, Categorical):
                raise TypeError(("Categorical: can only merge/concatenate " +
                                "with other Categoricals"))
            if (self.categories.size != c.categories.size) or not \
                                    (self.categories == c.categories).all():
                samecategories = False
        if samecategories:
            newvals = cast(pdarray, concatenate([self.codes] + [o.codes for o in others], ordered=ordered))
            return Categorical.from_codes(newvals, self.categories)
        else:
            g = GroupBy(concatenate([self.categories] + \
                                       [o.categories for o in others],
                                       ordered=False))
            newidx = g.unique_keys
            wherediditgo = zeros(newidx.size, dtype=akint64)
            wherediditgo[g.permutation] = arange(newidx.size)
            idxsizes = np.array([self.categories.size] + \
                                [o.categories.size for o in others])
            idxoffsets = np.cumsum(idxsizes) - idxsizes
            oldvals = concatenate([c + off for c, off in \
                                   zip([self.codes] + [o.codes for o in others], idxoffsets)],
                                  ordered=ordered)
            newvals = wherediditgo[oldvals]
            return Categorical.from_codes(newvals, newidx)
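A usage sketch (hypothetical data; assumes a running arkouda server):

import arkouda as ak
ak.connect()

a = ak.Categorical(ak.array(["x", "y", "x"]))
b = ak.Categorical(ak.array(["y", "z"]))
# The category sets differ, so the slow path runs: the categories are
# unioned and both code arrays are remapped before concatenation
c = a.concatenate([b])
# c.size == 5 and c.categories covers the union of "x", "y", "z"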
Example #7
    def __init__(self,
                 keys: Union[pdarray, Strings, 'Categorical',
                             List[Union[pdarray, np.int64, Strings]]],
                 assume_sorted: bool = False,
                 hash_strings: bool = True) -> None:
        from arkouda.categorical import Categorical
        self.logger = getArkoudaLogger(name=self.__class__.__name__)
        self.assume_sorted = assume_sorted
        self.hash_strings = hash_strings
        self.keys: Union[pdarray, Strings, Categorical]

        if isinstance(keys, pdarray):
            if keys.dtype != int64:
                raise TypeError(
                    'GroupBy only supports pdarrays with a dtype int64')
            self.keys = cast(pdarray, keys)
            self.nkeys = 1
            self.size = cast(int, keys.size)
            if assume_sorted:
                self.permutation = cast(pdarray, arange(self.size))
            else:
                self.permutation = cast(pdarray, argsort(keys))
        elif hasattr(keys, "group"):  # for Strings or Categorical
            self.nkeys = 1
            self.keys = cast(Union[Strings, Categorical], keys)
            self.size = cast(int, self.keys.size)  # type: ignore
            if assume_sorted:
                self.permutation = cast(pdarray, arange(self.size))
            else:
                self.permutation = cast(Union[Strings, Categorical],
                                        keys).group()
        else:
            self.keys = cast(Union[pdarray, Strings, Categorical], keys)
            self.nkeys = len(keys)
            self.size = cast(int, keys[0].size)  # type: ignore
            for k in keys:
                if k.size != self.size:
                    raise ValueError("Key arrays must all be same size")
            if assume_sorted:
                self.permutation = cast(pdarray, arange(self.size))
            else:
                self.permutation = cast(
                    pdarray, coargsort(cast(Sequence[pdarray], keys)))

        # self.permuted_keys = self.keys[self.permutation]
        self.find_segments()
Example #8
 def unique(self) -> Categorical:
     #__doc__ = unique.__doc__
      reset_cat = self if self.uses_all_categories else self.reset_categories()
     return Categorical.from_codes(
         arange(reset_cat.categories.size),
         reset_cat.categories,
         uses_all_categories=reset_cat.uses_all_categories)
Example #9
 def reset_categories(self):
     """
     Recompute the category labels, discarding any unused labels. This method
     is often useful after slicing or indexing a Categorical array, when the
     resulting array only contains a subset of the original categories. In
     this case, eliminating unused categories can speed up other operations.
     """
     g = GroupBy(self.codes)
     idx = self.categories[g.unique_keys]
     newvals = zeros(self.codes.size, int64)
     newvals[g.permutation] = g.broadcast(arange(idx.size))
     return Categorical.from_codes(newvals, idx, permutation=g.permutation, segments=g.segments)
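A usage sketch (hypothetical data; assumes a running arkouda server):

import arkouda as ak
ak.connect()

cat = ak.Categorical(ak.array(["a", "b", "c", "a"]))
sub = cat[ak.array([0, 3])]   # keeps only the "a" values...
# ...but sub.categories may still contain the unused "b" and "c"
trimmed = sub.reset_categories()
# trimmed.categories now holds only labels that actually occur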
Example #10
    def merge(self, others : List[Categorical]) -> Categorical:
        """
        Merge this Categorical with other Categorical objects,
        concatenating the arrays and synchronizing the categories.

        Parameters
        ----------
        others : List[Categorical]
            The Categorical arrays to concatenate and merge with this one

        Returns
        -------
        Categorical 
            The merged Categorical object
            
        Raises
        ------
        TypeError
            Raised if any of the other objects are not Categorical objects

        Notes
        -----
        This operation can be expensive -- slower than concatenating Strings.
        """
        if isinstance(others, Categorical):
            others = [others]
        elif len(others) < 1:
            return self
        samecategories = True
        for c in others:
            if not isinstance(c, Categorical):
                raise TypeError(("Categorical: can only merge/concatenate " +
                                "with other Categoricals"))
            if (self.categories.size != c.categories.size) or not \
                                    (self.categories == c.categories).all():
                samecategories = False
        if samecategories:
            newvals = cast(pdarray, concatenate([self.codes] + [o.codes for o in others]))
            return Categorical.from_codes(newvals, self.categories)
        else:
            g = GroupBy(concatenate([self.categories] + \
                                       [o.categories for o in others]))
            newidx = g.unique_keys
            wherediditgo = zeros(newidx.size, dtype=akint64)
            wherediditgo[g.permutation] = arange(newidx.size)
            idxsizes = np.array([self.categories.size] + \
                                [o.categories.size for o in others])
            idxoffsets = np.cumsum(idxsizes) - idxsizes
            oldvals = concatenate([c + off for c, off in \
                                   zip([self.codes] + [o.codes for o in others], idxoffsets)])
            newvals = wherediditgo[oldvals]
            return Categorical.from_codes(newvals, newidx)
Example #11
 def __init__(self, keys, assume_sorted=False, hash_strings=True):
     self.assume_sorted = assume_sorted
     self.hash_strings = hash_strings
     self.per_locale = False
     self.keys = keys
     if isinstance(keys, pdarray):
         self.nkeys = 1
         self.size = keys.size
         if assume_sorted:
             self.permutation = arange(self.size)
         elif self.per_locale:
             self.permutation = local_argsort(keys)
         else:
             self.permutation = argsort(keys)
     # for Strings or Categorical
     elif hasattr(keys, "group"):
         self.nkeys = 1
         self.size = keys.size
         if assume_sorted:
             self.permutation = arange(self.size)
         elif self.per_locale:
             raise ValueError("per-locale groupby not supported on Strings or Categorical")
         else:
             self.permutation = keys.group()
     else:
         self.nkeys = len(keys)
         self.size = keys[0].size
         for k in keys:
             if k.size != self.size:
                 raise ValueError("Key arrays must all be same size")
         if assume_sorted:
             self.permutation = arange(self.size)
         else:
             self.permutation = coargsort(keys)
         
     # self.permuted_keys = self.keys[self.permutation]
     self.find_segments()
Example #12
 def reset_categories(self) -> Categorical:
     """
     Recompute the category labels, discarding any unused labels. This
     method is often useful after slicing or indexing a Categorical array, 
     when the resulting array only contains a subset of the original 
     categories. In this case, eliminating unused categories can speed up 
     other operations.
     
     Returns
     -------
     Categorical
         A Categorical object generated from the current instance
     """
     g = GroupBy(self.codes)
     idx = self.categories[g.unique_keys]
     newvals = g.broadcast(arange(idx.size), permute=True)
     return Categorical.from_codes(newvals,
                                   idx,
                                   permutation=g.permutation,
                                   segments=g.segments)
Example #13
 def unique(self):
     __doc__ = unique.__doc__
     return Categorical.from_codes(arange(self.categories.size), self.categories)
Example #14
    def _binop(self, other: Union[Categorical, str_scalars],
               op: str_scalars) -> pdarray:
        """
        Executes the requested binary operation on this Categorical instance
        and returns the result as a pdarray.

        Parameters
        ----------
        other : Union[Categorical,str_scalars]
            the other operand: a Categorical object or a string scalar
        op : str_scalars
            name of the binary operation to be performed 
      
        Returns
        -------
        pdarray
            encapsulating the results of the requested binop      

        Raises
        ------
        NotImplementedError
            Raised if the op is not in the self.BinOps set, or if the other
            operand is neither a Categorical nor a string scalar
        ValueError
            Raised if the sizes of this and the other instance don't match
        RuntimeError
            Raised if a server-side error is thrown while executing the
            binary operation
        """
        if op not in self.BinOps:
            raise NotImplementedError("Categorical: unsupported operator: {}".\
                                      format(op))
        if np.isscalar(other) and resolve_scalar_dtype(other) == "str":
            idxresult = self.categories._binop(other, op)
            return idxresult[self.codes]
        if self.size != cast(Categorical, other).size:
            raise ValueError("Categorical {}: size mismatch {} {}".\
                             format(op, self.size, cast(Categorical,other).size))
        if isinstance(other, Categorical):
            if (self.categories.size
                    == other.categories.size) and (self.categories
                                                   == other.categories).all():
                # Because categories are identical, codes can be compared directly
                return self.codes._binop(other.codes, op)
            else:
                # Remap both codes to the union of categories
                union = unique(
                    concatenate((self.categories, other.categories),
                                ordered=False))
                newinds = arange(union.size)
                # Inds of self.categories in unioned categories
                selfnewinds = newinds[in1d(union, self.categories)]
                # Need a permutation and segments to broadcast new codes
                if self.permutation is None or self.segments is None:
                    g = GroupBy(self.codes)
                    self.permutation = g.permutation
                    self.segments = g.segments
                # Form new codes by broadcasting new indices for unioned categories
                selfnewcodes = broadcast(self.segments, selfnewinds, self.size,
                                         self.permutation)
                # Repeat for other
                othernewinds = newinds[in1d(union, other.categories)]
                if other.permutation is None or other.segments is None:
                    g = GroupBy(other.codes)
                    other.permutation = g.permutation
                    other.segments = g.segments
                othernewcodes = broadcast(other.segments, othernewinds,
                                          other.size, other.permutation)
                # selfnewcodes and othernewcodes now refer to same unioned categories
                # and can be compared directly
                return selfnewcodes._binop(othernewcodes, op)
        else:
            raise NotImplementedError(
                ("Operations between Categorical and " +
                 "non-Categorical not yet implemented. " +
                 "Consider converting operands to Categorical."))
Example #15
    def nunique(self, values: groupable) -> Tuple[groupable, pdarray]:
        """
        Using the permutation stored in the GroupBy instance, group another
        array of values and return the number of unique values in each group. 

        Parameters
        ----------
        values : groupable
            The values to group and count unique values within each group

        Returns
        -------
        unique_keys : groupable
            The unique keys, in grouped order
        group_nunique : pdarray
            Number of unique values per unique key in the GroupBy instance
            
        Raises
        ------
        TypeError
            Raised if the dtype of any values array does not support the
            nunique method
        ValueError
            Raised if the key array size does not match the values size or
            if the operator is not in the GroupBy.Reductions array
        RuntimeError
            Raised if nunique is not supported for the values dtype
            
        Examples
        --------
        >>> data = ak.array([3, 4, 3, 1, 1, 4, 3, 4, 1, 4])
        >>> data
        array([3, 4, 3, 1, 1, 4, 3, 4, 1, 4])
        >>> labels = ak.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4])
        >>> labels
        array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4])
        >>> g = ak.GroupBy(labels)
        >>> g.keys
        array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4])
        >>> g.nunique(data)
        (array([1, 2, 3, 4]), array([2, 2, 3, 1]))
        #    Group (1,1,1) has values [3,4,3] -> there are 2 unique values 3&4
        #    Group (2,2,2) has values [1,1,4] -> 2 unique values 1&4
        #    Group (3,3,3) has values [3,4,1] -> 3 unique values
        #    Group (4) has values [4] -> 1 unique value
        """
        # TO DO: defer to self.aggregate once logic is ported over to Chapel
        # return self.aggregate(values, "nunique")

        ukidx = self.broadcast(arange(self.ngroups), permute=True)
        # Test if values is single array, i.e. either pdarray, Strings,
        # or Categorical (the last two have a .group() method).
        # Can't directly test Categorical due to circular import.
        if isinstance(values, pdarray):
            if cast(pdarray, values).dtype != int64:
                raise TypeError("nunique unsupported for this dtype")
            togroup = [ukidx, values]
        elif hasattr(values, "group"):
            togroup = [ukidx, values]
        else:
            for v in values:
                if isinstance(v, pdarray) and cast(pdarray, v).dtype != int64:
                    raise TypeError("nunique unsupported for this dtype")
            togroup = [ukidx] + list(values)
        # Find unique pairs of (key, val)
        g = GroupBy(togroup)
        # Group unique pairs again by original key
        g2 = GroupBy(g.unique_keys[0], assume_sorted=True)
        # Count number of unique values per key
        _, nuniq = g2.count()
        # Re-join unique counts with original keys (sorting guarantees same order)
        return self.unique_keys, nuniq
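The pair-grouping idea is easy to check with plain NumPy (data taken from the docstring example above):

import numpy as np

labels = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4])
data   = np.array([3, 4, 3, 1, 1, 4, 3, 4, 1, 4])

# Step 1: deduplicate (key, value) pairs
pairs = np.unique(np.stack([labels, data], axis=1), axis=0)

# Step 2: count the surviving pairs per key -> unique values per group
keys, counts = np.unique(pairs[:, 0], return_counts=True)
print(keys)    # [1 2 3 4]
print(counts)  # [2 2 3 1]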