def __init__(self, keys: List[Union[pdarray, np.int64, Strings]],
             assume_sorted: bool = False, hash_strings: bool = True) -> None:
    self.logger = getArkoudaLogger(name=self.__class__.__name__)
    self.assume_sorted = assume_sorted
    self.hash_strings = hash_strings
    self.keys = keys
    if isinstance(keys, pdarray):
        self.nkeys = 1
        self.size = keys.size
        if assume_sorted:
            self.permutation = arange(self.size)
        else:
            self.permutation = argsort(keys)
    # for Strings or Categorical
    elif hasattr(keys, "group"):
        self.nkeys = 1
        self.size = keys.size
        if assume_sorted:
            self.permutation = arange(self.size)
        else:
            self.permutation = keys.group()
    else:
        self.nkeys = len(keys)
        self.size = keys[0].size
        for k in keys:
            if k.size != self.size:
                raise ValueError("Key arrays must all be same size")
        if assume_sorted:
            self.permutation = arange(self.size)
        else:
            self.permutation = coargsort(keys)
    # self.permuted_keys = self.keys[self.permutation]
    self.find_segments()
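# Hedged usage sketch (not part of the class): how this constructor is
# typically exercised through the public arkouda API. Assumes a running,
# reachable arkouda_server; ak.connect, ak.array, and ak.GroupBy are
# standard arkouda calls.
#
# import arkouda as ak
# ak.connect()                        # attach to a running arkouda_server
# keys = ak.array([1, 0, 1, 2, 0])    # single int64 pdarray key
# g = ak.GroupBy(keys)                # permutation comes from argsort(keys)
# # g.unique_keys -> array([0, 1, 2])
# g2 = ak.GroupBy([keys, keys % 2])   # multiple keys take the coargsort path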
def __init__(self, values, **kwargs):
    if 'codes' in kwargs and 'categories' in kwargs:
        # This initialization is called by Categorical.from_codes()
        # The values arg is ignored
        self.codes = kwargs['codes']
        self.categories = kwargs['categories']
        if 'permutation' in kwargs:
            self.permutation = kwargs['permutation']
        if 'segments' in kwargs:
            self.segments = kwargs['segments']
    else:
        # Typical initialization, called with values
        if not isinstance(values, Strings):
            raise ValueError("Categorical: inputs other than Strings not yet supported")
        g = GroupBy(values)
        self.categories = g.unique_keys
        self.codes = zeros(values.size, dtype=int64)
        self.codes[g.permutation] = g.broadcast(arange(self.categories.size))
        self.permutation = g.permutation
        self.segments = g.segments
    # Always set these values
    self.size = self.codes.size
    self.nlevels = self.categories.size
    self.ndim = self.codes.ndim
    self.shape = self.codes.shape
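# Hedged sketch of the typical construction path above: a Categorical built
# from a Strings array is factored into (categories, codes). Assumes a
# connected arkouda client; ak.array on a list of str yields Strings.
#
# import arkouda as ak
# s = ak.array(['low', 'high', 'low', 'mid', 'high'])
# c = ak.Categorical(s)
# # c.categories holds the unique strings; c.codes holds int64 indices
# # into them, so c.categories[c.codes] reconstructs the original values.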
def sort(self):
    __doc__ = sort.__doc__
    # Remap codes to follow sorted category order: build the inverse of the
    # category sort permutation, then gather it through the existing codes.
    idxperm = argsort(self.categories)
    inverse = zeros_like(idxperm)
    inverse[idxperm] = arange(idxperm.size)
    newvals = inverse[self.codes]
    return Categorical.from_codes(newvals, self.categories[idxperm])
def argsort(self):
    __doc__ = argsort.__doc__
    # Same inverse-permutation remapping as sort(), but return the sort
    # order of the remapped codes rather than a new Categorical.
    idxperm = argsort(self.categories)
    inverse = zeros_like(idxperm)
    inverse[idxperm] = arange(idxperm.size)
    newvals = inverse[self.codes]
    return argsort(newvals)
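# Worked sketch of the inverse-permutation trick shared by sort() and
# argsort() above, using plain numpy as an illustrative stand-in (an
# assumption for clarity, not arkouda code):
#
# import numpy as np
# categories = np.array(['c', 'a', 'b'])
# codes = np.array([0, 2, 1, 0])              # 'c', 'b', 'a', 'c'
# idxperm = np.argsort(categories)            # [1, 2, 0] -> 'a', 'b', 'c'
# inverse = np.zeros_like(idxperm)
# inverse[idxperm] = np.arange(idxperm.size)  # maps old code -> new code
# newvals = inverse[codes]                    # [2, 1, 0, 2]
# # categories[idxperm][newvals] equals categories[codes]: the values are
# # preserved while codes now follow lexicographic category order.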
def __init__(self, values, **kwargs) -> None:
    self.logger = getArkoudaLogger(name=__class__.__name__)  # type: ignore
    if 'codes' in kwargs and 'categories' in kwargs:
        # This initialization is called by Categorical.from_codes()
        # The values arg is ignored
        self.codes = kwargs['codes']
        self.categories = kwargs['categories']
        if 'permutation' in kwargs:
            self.permutation = cast(pdarray, kwargs['permutation'])
        if 'segments' in kwargs:
            self.segments = cast(pdarray, kwargs['segments'])
    else:
        # Typical initialization, called with values
        if not isinstance(values, Strings):
            raise ValueError(("Categorical: inputs other than " +
                              "Strings not yet supported"))
        g = GroupBy(values)
        self.categories = g.unique_keys
        self.codes = g.broadcast(arange(self.categories.size), permute=True)
        self.permutation = cast(pdarray, g.permutation)
        self.segments = g.segments
    # Always set these values
    self.size: int_scalars = self.codes.size
    self.nlevels = self.categories.size
    self.ndim = self.codes.ndim
    self.shape = self.codes.shape
    self.name: Optional[str] = None
def concatenate(self, others: List[Categorical], ordered: bool = True) -> Categorical:
    """
    Merge this Categorical with other Categorical objects in the array,
    concatenating the arrays and synchronizing the categories.

    Parameters
    ----------
    others : List[Categorical]
        The Categorical arrays to concatenate and merge with this one
    ordered : bool
        If True (default), the arrays will be appended in the order given.
        If False, array data may be interleaved in blocks, which can greatly
        improve performance but results in non-deterministic ordering of
        elements.

    Returns
    -------
    Categorical
        The merged Categorical object

    Raises
    ------
    TypeError
        Raised if any of the others array objects are not Categorical objects

    Notes
    -----
    This operation can be expensive -- slower than concatenating Strings.
    """
    if isinstance(others, Categorical):
        others = [others]
    elif len(others) < 1:
        return self
    samecategories = True
    for c in others:
        if not isinstance(c, Categorical):
            raise TypeError(("Categorical: can only merge/concatenate " +
                             "with other Categoricals"))
        if (self.categories.size != c.categories.size) or not \
                (self.categories == c.categories).all():
            samecategories = False
    if samecategories:
        newvals = cast(pdarray,
                       concatenate([self.codes] + [o.codes for o in others],
                                   ordered=ordered))
        return Categorical.from_codes(newvals, self.categories)
    else:
        g = GroupBy(concatenate([self.categories] +
                                [o.categories for o in others],
                                ordered=False))
        newidx = g.unique_keys
        wherediditgo = zeros(newidx.size, dtype=akint64)
        wherediditgo[g.permutation] = arange(newidx.size)
        idxsizes = np.array([self.categories.size] +
                            [o.categories.size for o in others])
        idxoffsets = np.cumsum(idxsizes) - idxsizes
        oldvals = concatenate([c + off for c, off in
                               zip([self.codes] + [o.codes for o in others],
                                   idxoffsets)],
                              ordered=ordered)
        newvals = wherediditgo[oldvals]
        return Categorical.from_codes(newvals, newidx)
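# Hedged usage sketch: concatenating two Categoricals whose category sets
# differ takes the slow path above (union the categories, then remap each
# side's codes through the union). Assumes a connected arkouda client.
#
# import arkouda as ak
# a = ak.Categorical(ak.array(['x', 'y', 'x']))
# b = ak.Categorical(ak.array(['y', 'z']))
# merged = a.concatenate([b])
# # merged.size == 5; merged.categories is the union {'x', 'y', 'z'}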
def __init__(self, keys: Union[pdarray, Strings, 'Categorical',
                               List[Union[pdarray, np.int64, Strings]]],
             assume_sorted: bool = False, hash_strings: bool = True) -> None:
    from arkouda.categorical import Categorical
    self.logger = getArkoudaLogger(name=self.__class__.__name__)
    self.assume_sorted = assume_sorted
    self.hash_strings = hash_strings
    self.keys: Union[pdarray, Strings, Categorical]
    if isinstance(keys, pdarray):
        if keys.dtype != int64:
            raise TypeError('GroupBy only supports pdarrays with a dtype int64')
        self.keys = cast(pdarray, keys)
        self.nkeys = 1
        self.size = cast(int, keys.size)
        if assume_sorted:
            self.permutation = cast(pdarray, arange(self.size))
        else:
            self.permutation = cast(pdarray, argsort(keys))
    elif hasattr(keys, "group"):  # for Strings or Categorical
        self.nkeys = 1
        self.keys = cast(Union[Strings, Categorical], keys)
        self.size = cast(int, self.keys.size)  # type: ignore
        if assume_sorted:
            self.permutation = cast(pdarray, arange(self.size))
        else:
            self.permutation = cast(Union[Strings, Categorical], keys).group()
    else:
        self.keys = cast(Union[pdarray, Strings, Categorical], keys)
        self.nkeys = len(keys)
        self.size = cast(int, keys[0].size)  # type: ignore
        for k in keys:
            if k.size != self.size:
                raise ValueError("Key arrays must all be same size")
        if assume_sorted:
            self.permutation = cast(pdarray, arange(self.size))
        else:
            self.permutation = cast(pdarray,
                                    coargsort(cast(Sequence[pdarray], keys)))
    # self.permuted_keys = self.keys[self.permutation]
    self.find_segments()
def unique(self) -> Categorical:
    # __doc__ = unique.__doc__
    reset_cat = self if self.uses_all_categories else self.reset_categories()
    return Categorical.from_codes(arange(reset_cat.categories.size),
                                  reset_cat.categories,
                                  uses_all_categories=reset_cat.uses_all_categories)
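# Hedged sketch: unique() returns one element per used category; when
# uses_all_categories is False, reset_categories() first discards labels
# no element references. Assumes a connected arkouda client.
#
# import arkouda as ak
# c = ak.Categorical(ak.array(['a', 'b', 'a']))
# u = c.unique()
# # u.size == 2, one element per distinct category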
def reset_categories(self):
    """
    Recompute the category labels, discarding any unused labels.

    This method is often useful after slicing or indexing a Categorical
    array, when the resulting array only contains a subset of the original
    categories. In this case, eliminating unused categories can speed up
    other operations.
    """
    g = GroupBy(self.codes)
    idx = self.categories[g.unique_keys]
    newvals = zeros(self.codes.size, int64)
    newvals[g.permutation] = g.broadcast(arange(idx.size))
    return Categorical.from_codes(newvals, idx,
                                  permutation=g.permutation,
                                  segments=g.segments)
def merge(self, others: List[Categorical]) -> Categorical:
    """
    Merge this Categorical with other Categorical objects in the array,
    concatenating the arrays and synchronizing the categories.

    Parameters
    ----------
    others : List[Categorical]
        The Categorical arrays to concatenate and merge with this one

    Returns
    -------
    Categorical
        The merged Categorical object

    Raises
    ------
    TypeError
        Raised if any of the others array objects are not Categorical objects

    Notes
    -----
    This operation can be expensive -- slower than concatenating Strings.
    """
    if isinstance(others, Categorical):
        others = [others]
    elif len(others) < 1:
        return self
    samecategories = True
    for c in others:
        if not isinstance(c, Categorical):
            raise TypeError(("Categorical: can only merge/concatenate " +
                             "with other Categoricals"))
        if (self.categories.size != c.categories.size) or not \
                (self.categories == c.categories).all():
            samecategories = False
    if samecategories:
        newvals = cast(pdarray,
                       concatenate([self.codes] + [o.codes for o in others]))
        return Categorical.from_codes(newvals, self.categories)
    else:
        g = GroupBy(concatenate([self.categories] +
                                [o.categories for o in others]))
        newidx = g.unique_keys
        wherediditgo = zeros(newidx.size, dtype=akint64)
        wherediditgo[g.permutation] = arange(newidx.size)
        idxsizes = np.array([self.categories.size] +
                            [o.categories.size for o in others])
        idxoffsets = np.cumsum(idxsizes) - idxsizes
        # The zipped elements are already codes pdarrays, so offset them
        # directly (the original `c.codes + off` would raise AttributeError).
        oldvals = concatenate([c + off for c, off in
                               zip([self.codes] + [o.codes for o in others],
                                   idxoffsets)])
        newvals = wherediditgo[oldvals]
        return Categorical.from_codes(newvals, newidx)
def __init__(self, keys, assume_sorted=False, hash_strings=True):
    self.assume_sorted = assume_sorted
    self.hash_strings = hash_strings
    self.per_locale = False
    self.keys = keys
    if isinstance(keys, pdarray):
        self.nkeys = 1
        self.size = keys.size
        if assume_sorted:
            self.permutation = arange(self.size)
        elif self.per_locale:
            self.permutation = local_argsort(keys)
        else:
            self.permutation = argsort(keys)
    # for Strings or Categorical
    elif hasattr(keys, "group"):
        self.nkeys = 1
        self.size = keys.size
        if assume_sorted:
            self.permutation = arange(self.size)
        elif self.per_locale:
            raise ValueError("per-locale groupby not supported on Strings or Categorical")
        else:
            self.permutation = keys.group()
    else:
        self.nkeys = len(keys)
        self.size = keys[0].size
        for k in keys:
            if k.size != self.size:
                raise ValueError("Key arrays must all be same size")
        if assume_sorted:
            self.permutation = arange(self.size)
        else:
            self.permutation = coargsort(keys)
    # self.permuted_keys = self.keys[self.permutation]
    self.find_segments()
def reset_categories(self) -> Categorical:
    """
    Recompute the category labels, discarding any unused labels.

    This method is often useful after slicing or indexing a Categorical
    array, when the resulting array only contains a subset of the original
    categories. In this case, eliminating unused categories can speed up
    other operations.

    Returns
    -------
    Categorical
        A Categorical object generated from the current instance
    """
    g = GroupBy(self.codes)
    idx = self.categories[g.unique_keys]
    newvals = g.broadcast(arange(idx.size), permute=True)
    return Categorical.from_codes(newvals, idx,
                                  permutation=g.permutation,
                                  segments=g.segments)
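# Hedged sketch: after slicing, a Categorical can carry labels that no
# element uses; reset_categories() rebuilds the codes against only the
# used labels. Assumes a connected arkouda client and that Categorical
# supports slice indexing (standard in arkouda).
#
# import arkouda as ak
# c = ak.Categorical(ak.array(['a', 'b', 'c', 'a']))
# head = c[0:2]                      # values 'a', 'b'; still 3 categories
# compact = head.reset_categories()  # categories shrink to {'a', 'b'}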
def unique(self):
    __doc__ = unique.__doc__
    return Categorical.from_codes(arange(self.categories.size),
                                  self.categories)
def _binop(self, other: Union[Categorical, str_scalars],
           op: str_scalars) -> pdarray:
    """
    Executes the requested binop on this Categorical instance and returns
    the results within a pdarray object.

    Parameters
    ----------
    other : Union[Categorical, str_scalars]
        the other object is a Categorical object or string scalar
    op : str_scalars
        name of the binary operation to be performed

    Returns
    -------
    pdarray
        encapsulating the results of the requested binop

    Raises
    ------
    ValueError
        Raised if (1) the op is not in the self.BinOps set, or (2) if the
        sizes of this and the other instance don't match
    RuntimeError
        Raised if a server-side error is thrown while executing the
        binary operation
    """
    if op not in self.BinOps:
        raise NotImplementedError("Categorical: unsupported operator: {}".format(op))
    if np.isscalar(other) and resolve_scalar_dtype(other) == "str":
        idxresult = self.categories._binop(other, op)
        return idxresult[self.codes]
    if self.size != cast(Categorical, other).size:
        raise ValueError("Categorical {}: size mismatch {} {}".format(
            op, self.size, cast(Categorical, other).size))
    if isinstance(other, Categorical):
        if (self.categories.size == other.categories.size) and \
                (self.categories == other.categories).all():
            # Because categories are identical, codes can be compared directly
            return self.codes._binop(other.codes, op)
        else:
            # Remap both codes to the union of categories
            union = unique(concatenate((self.categories, other.categories),
                                       ordered=False))
            newinds = arange(union.size)
            # Inds of self.categories in unioned categories
            selfnewinds = newinds[in1d(union, self.categories)]
            # Need a permutation and segments to broadcast new codes
            if self.permutation is None or self.segments is None:
                g = GroupBy(self.codes)
                self.permutation = g.permutation
                self.segments = g.segments
            # Form new codes by broadcasting new indices for unioned categories
            selfnewcodes = broadcast(self.segments, selfnewinds,
                                     self.size, self.permutation)
            # Repeat for other
            othernewinds = newinds[in1d(union, other.categories)]
            if other.permutation is None or other.segments is None:
                g = GroupBy(other.codes)
                other.permutation = g.permutation
                other.segments = g.segments
            othernewcodes = broadcast(other.segments, othernewinds,
                                      other.size, other.permutation)
            # selfnewcodes and othernewcodes now refer to the same unioned
            # categories and can be compared directly
            return selfnewcodes._binop(othernewcodes, op)
    else:
        raise NotImplementedError(("Operations between Categorical and " +
                                   "non-Categorical not yet implemented. " +
                                   "Consider converting operands to Categorical."))
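# Hedged usage sketch: '==' between Categoricals with different category
# sets goes through the union/remap branch above and yields a bool pdarray.
# Assumes a connected arkouda client.
#
# import arkouda as ak
# a = ak.Categorical(ak.array(['x', 'y', 'z']))
# b = ak.Categorical(ak.array(['x', 'z', 'z']))
# a == b    # -> array([True, False, True])
# a == 'y'  # scalar string: compared against categories, then gathered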
def nunique(self, values: groupable) -> Tuple[groupable, pdarray]:
    """
    Using the permutation stored in the GroupBy instance, group another
    array of values and return the number of unique values in each group.

    Parameters
    ----------
    values : pdarray, int64
        The values to group and find unique values

    Returns
    -------
    unique_keys : groupable
        The unique keys, in grouped order
    group_nunique : pdarray
        Number of unique values per unique key in the GroupBy instance

    Raises
    ------
    TypeError
        Raised if the dtype(s) of values array(s) does/do not support
        the nunique method
    ValueError
        Raised if the key array size does not match the values size or
        if the operator is not in the GroupBy.Reductions array
    RuntimeError
        Raised if nunique is not supported for the values dtype

    Examples
    --------
    >>> data = ak.array([3, 4, 3, 1, 1, 4, 3, 4, 1, 4])
    >>> data
    array([3, 4, 3, 1, 1, 4, 3, 4, 1, 4])
    >>> labels = ak.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4])
    >>> labels
    array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4])
    >>> g = ak.GroupBy(labels)
    >>> g.keys
    array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4])
    >>> g.nunique(data)
    (array([1, 2, 3, 4]), array([2, 2, 3, 1]))
    # Group (1,1,1) has values [3,4,3] -> 2 unique values (3 and 4)
    # Group (2,2,2) has values [1,1,4] -> 2 unique values (1 and 4)
    # Group (3,3,3) has values [3,4,1] -> 3 unique values
    # Group (4) has values [4] -> 1 unique value
    """
    # TO DO: defer to self.aggregate once logic is ported over to Chapel
    # return self.aggregate(values, "nunique")
    ukidx = self.broadcast(arange(self.ngroups), permute=True)
    # Test if values is a single array, i.e. either pdarray, Strings,
    # or Categorical (the last two have a .group() method).
    # Can't directly test Categorical due to circular import.
    if isinstance(values, pdarray):
        if cast(pdarray, values).dtype != int64:
            raise TypeError("nunique unsupported for this dtype")
        togroup = [ukidx, values]
    elif hasattr(values, "group"):
        togroup = [ukidx, values]
    else:
        # Check each element of the list (the original tested the enclosing
        # list instead of v, making the dtype check a no-op)
        for v in values:
            if isinstance(v, pdarray) and cast(pdarray, v).dtype != int64:
                raise TypeError("nunique unsupported for this dtype")
        togroup = [ukidx] + list(values)
    # Find unique pairs of (key, val)
    g = GroupBy(togroup)
    # Group unique pairs again by original key
    g2 = GroupBy(g.unique_keys[0], assume_sorted=True)
    # Count number of unique values per key
    _, nuniq = g2.count()
    # Re-join unique counts with original keys (sorting guarantees same order)
    return self.unique_keys, nuniq
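# Worked sketch of the algorithm above in plain Python (an illustrative
# stand-in, not arkouda code): count distinct values per key by first
# deduplicating (key, value) pairs, then counting surviving pairs per key.
#
# from collections import Counter
# keys = [1, 1, 1, 2, 2, 2, 3, 3, 3, 4]
# vals = [3, 4, 3, 1, 1, 4, 3, 4, 1, 4]
# pairs = set(zip(keys, vals))           # GroupBy #1: unique (key, value)
# counts = Counter(k for k, _ in pairs)  # GroupBy #2: count pairs per key
# # sorted(counts.items()) -> [(1, 2), (2, 2), (3, 3), (4, 1)],
# # matching the docstring example.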