def concatenate(self, others : List[Categorical], ordered : bool=True) -> Categorical:
    """
    Merge this Categorical with other Categorical objects, concatenating
    the arrays and synchronizing the categories.

    Parameters
    ----------
    others : List[Categorical]
        The Categorical arrays to concatenate and merge with this one.
        A single Categorical is also accepted and is treated as a
        one-element list.
    ordered : bool
        If True (default), the arrays will be appended in the order given.
        If False, array data may be interleaved in blocks, which can greatly
        improve performance but results in non-deterministic ordering of
        elements.

    Returns
    -------
    Categorical
        The merged Categorical object. If others is empty, this object
        itself is returned.

    Raises
    ------
    TypeError
        Raised if any others array objects are not Categorical objects

    Notes
    -----
    This operation can be expensive -- slower than concatenating Strings.
    """
    if isinstance(others, Categorical):
        others = [others]
    elif len(others) < 1:
        return self

    # Every element is type-checked, so the loop cannot stop at the first
    # category mismatch.
    samecategories = True
    for c in others:
        if not isinstance(c, Categorical):
            raise TypeError(("Categorical: can only merge/concatenate " +
                             "with other Categoricals"))
        if (self.categories.size != c.categories.size) or not \
                (self.categories == c.categories).all():
            samecategories = False

    allcodes = [self.codes] + [o.codes for o in others]
    if samecategories:
        # Identical categories: the codes already share one meaning and can
        # be concatenated as-is.
        newvals = cast(pdarray, concatenate(allcodes, ordered=ordered))
        return Categorical.from_codes(newvals, self.categories)

    allcategories = [self.categories] + [o.categories for o in others]
    # The offset arithmetic below assumes that the categories of input k
    # occupy the contiguous positions starting at idxoffsets[k] in the
    # concatenated category array. That only holds for an in-order
    # concatenation, so the categories are always concatenated with
    # ordered=True; the caller's `ordered` choice applies to the codes only.
    g = GroupBy(concatenate(allcategories, ordered=True))
    newidx = g.unique_keys
    # NOTE(review): g.permutation presumably has one entry per concatenated
    # category, while arange(newidx.size) has one entry per unique category.
    # Those lengths differ whenever the inputs share a category -- confirm
    # that this scatter is valid in that case.
    wherediditgo = zeros(newidx.size, dtype=akint64)
    wherediditgo[g.permutation] = arange(newidx.size)
    idxsizes = np.array([cats.size for cats in allcategories])
    # Exclusive prefix sum: start position of each input's categories
    idxoffsets = np.cumsum(idxsizes) - idxsizes
    # Shift each input's codes so they index into the concatenated categories
    oldvals = concatenate([c + off for c, off in zip(allcodes, idxoffsets)],
                          ordered=ordered)
    newvals = wherediditgo[oldvals]
    return Categorical.from_codes(newvals, newidx)
def merge(self, others : List[Categorical]) -> Categorical:
    """
    Merge this Categorical with other Categorical objects, concatenating
    the arrays and synchronizing the categories.

    Parameters
    ----------
    others : List[Categorical]
        The Categorical arrays to concatenate and merge with this one.
        A single Categorical is also accepted and is treated as a
        one-element list.

    Returns
    -------
    Categorical
        The merged Categorical object. If others is empty, this object
        itself is returned.

    Raises
    ------
    TypeError
        Raised if any others array objects are not Categorical objects

    Notes
    -----
    This operation can be expensive -- slower than concatenating Strings.
    """
    if isinstance(others, Categorical):
        others = [others]
    elif len(others) < 1:
        return self

    # Every element is type-checked, so the loop cannot stop at the first
    # category mismatch.
    samecategories = True
    for c in others:
        if not isinstance(c, Categorical):
            raise TypeError(("Categorical: can only merge/concatenate " +
                             "with other Categoricals"))
        if (self.categories.size != c.categories.size) or not \
                (self.categories == c.categories).all():
            samecategories = False

    allcodes = [self.codes] + [o.codes for o in others]
    if samecategories:
        # Identical categories: the codes already share one meaning and can
        # be concatenated as-is.
        newvals = cast(pdarray, concatenate(allcodes))
        return Categorical.from_codes(newvals, self.categories)

    allcategories = [self.categories] + [o.categories for o in others]
    g = GroupBy(concatenate(allcategories))
    newidx = g.unique_keys
    # NOTE(review): g.permutation presumably has one entry per concatenated
    # category, while arange(newidx.size) has one entry per unique category.
    # Those lengths differ whenever the inputs share a category -- confirm
    # that this scatter is valid in that case.
    wherediditgo = zeros(newidx.size, dtype=akint64)
    wherediditgo[g.permutation] = arange(newidx.size)
    idxsizes = np.array([cats.size for cats in allcategories])
    # Exclusive prefix sum: start position of each input's categories
    idxoffsets = np.cumsum(idxsizes) - idxsizes
    # Shift each input's codes so they index into the concatenated
    # categories. The items iterated here are already codes arrays, so the
    # offset is added to them directly.
    oldvals = concatenate([c + off for c, off in zip(allcodes, idxoffsets)])
    newvals = wherediditgo[oldvals]
    return Categorical.from_codes(newvals, newidx)
def broadcast(self, values : pdarray) -> pdarray:
    """
    Replicate one value per group across that group's whole segment.

    Parameters
    ----------
    values : pdarray
        One value for each group (segment)

    Returns
    -------
    pdarray
        An array of length ``self.size`` holding the broadcast values

    Raises
    ------
    TypeError
        Raised if values is not a pdarray object
    ValueError
        Raised if the values array does not have one value per segment

    Notes
    -----
    This function is a sparse analog of ``np.broadcast``. If a
    GroupBy object represents a sparse matrix (tensor), then
    this function takes a (dense) column vector and replicates
    each value to the non-zero elements in the corresponding row.

    The returned array is in permuted (grouped) order. To get
    back to the order of the array on which GroupBy was called,
    the user must invert the permutation (see below).

    Examples
    --------
    >>> a = ak.array([0, 1, 0, 1, 0])
    >>> values = ak.array([3, 5])
    >>> g = ak.GroupBy(a)
    # Result is in grouped order
    >>> g.broadcast(values)
    array([3, 3, 3, 5, 5])
    >>> b = ak.zeros_like(a)
    # Result is in original order
    >>> b[g.permutation] = g.broadcast(values)
    >>> b
    array([3, 5, 3, 5, 3])
    """
    if not isinstance(values, pdarray):
        raise TypeError("Vals must be pdarray")
    if values.size != self.segments.size:
        raise ValueError("Must have one value per segment")

    filled = zeros(self.size, values.dtype)
    if values.size != 0:
        # Store, at each position listed in self.segments, the change from
        # the previous group's value (the first group's value itself for the
        # first entry); a running sum then carries each group's value
        # forward until the next stored change.
        # (assumes self.segments holds each group's start offset -- TODO confirm)
        leading = array([values[0]])
        steps = values[1:] - values[:-1]
        filled[self.segments] = concatenate((leading, steps))
        return cumsum(filled)
    return filled
def broadcast(self, values : pdarray) -> pdarray:
    """
    Fill each group's segment with a constant value.

    Parameters
    ----------
    values : pdarray
        The values to put in each group's segment. A boolean array is
        converted to integers (via ``1*values``) before broadcasting.

    Returns
    -------
    pdarray
        The broadcast values

    Raises
    ------
    TypeError
        Raised if value is not a pdarray object
    ValueError
        Raised if the values array does not have one
        value per segment

    Notes
    -----
    This function is a sparse analog of ``np.broadcast``. If a
    GroupBy object represents a sparse matrix (tensor), then
    this function takes a (dense) column vector and replicates
    each value to the non-zero elements in the corresponding row.

    The returned array is in permuted (grouped) order. To get
    back to the order of the array on which GroupBy was called,
    the user must invert the permutation (see below).

    Examples
    --------
    >>> a = ak.array([0, 1, 0, 1, 0])
    >>> values = ak.array([3, 5])
    >>> g = ak.GroupBy(a)
    # Result is in grouped order
    >>> g.broadcast(values)
    array([3, 3, 3, 5, 5])
    >>> b = ak.zeros_like(a)
    # Result is in original order
    >>> b[g.permutation] = g.broadcast(values)
    >>> b
    array([3, 5, 3, 5, 3])

    >>> a = ak.randint(1,5,10)
    >>> a
    array([3, 1, 4, 4, 4, 1, 3, 3, 2, 2])
    >>> g = ak.GroupBy(a)
    >>> keys,counts = g.count()
    >>> g.broadcast(counts > 2)
    array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> g.broadcast(counts == 3)
    array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> g.broadcast(counts < 4)
    array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    """
    if not isinstance(values, pdarray):
        raise TypeError("Vals must be pdarray")
    # If values is a boolean array, convert it to an int64 array, which is
    # needed for now because Arkouda does not support broadcasting of
    # boolean arrays. The builtin `bool` is used for the comparison because
    # the `np.bool` alias has been removed from recent NumPy releases.
    if values.dtype == bool:
        values = 1*values
    if values.size != self.segments.size:
        raise ValueError("Must have one value per segment")
    temp = zeros(self.size, values.dtype)
    if values.size == 0:
        return temp
    # Store, at each position listed in self.segments, the change from the
    # previous group's value (the first group's value itself for the first
    # entry); the running sum then carries each group's value forward until
    # the next stored change.
    diffs = concatenate((array([values[0]]), values[1:] - values[:-1]))
    temp[self.segments] = diffs
    return cumsum(temp)
def _binop(self, other: Union[Categorical, str_scalars], op: str_scalars) -> pdarray:
    """
    Executes the requested binop on this Categorical instance and returns
    the results within a pdarray object.

    Parameters
    ----------
    other : Union[Categorical,str_scalars]
        the other object is a Categorical object or string scalar
    op : str_scalars
        name of the binary operation to be performed

    Returns
    -------
    pdarray
        encapsulating the results of the requested binop

    Raises
    ------
    NotImplementedError
        Raised if (1) the op is not in the self.BinOps set, or (2) other
        is neither a string scalar nor a Categorical
    ValueError
        Raised if the sizes of this and the other Categorical don't match
    RuntimeError
        Raised if a server-side error is thrown while executing the
        binary operation
    """
    if op not in self.BinOps:
        raise NotImplementedError("Categorical: unsupported operator: {}".
                                  format(op))
    if np.isscalar(other) and resolve_scalar_dtype(other) == "str":
        # Apply the op once per category, then expand the per-category
        # result to one entry per element by indexing with the codes.
        idxresult = self.categories._binop(other, op)
        return idxresult[self.codes]
    # Reject unsupported operand types before touching `other.size`, so an
    # operand without a `size` attribute reports the intended error rather
    # than an AttributeError.
    if not isinstance(other, Categorical):
        raise NotImplementedError(
            ("Operations between Categorical and " +
             "non-Categorical not yet implemented. " +
             "Consider converting operands to Categorical."))
    if self.size != other.size:
        raise ValueError("Categorical {}: size mismatch {} {}".
                         format(op, self.size, other.size))
    if (self.categories.size == other.categories.size) and \
            (self.categories == other.categories).all():
        # Because categories are identical, codes can be compared directly
        return self.codes._binop(other.codes, op)

    # Remap both codes to the union of categories
    union = unique(concatenate((self.categories, other.categories),
                               ordered=False))
    newinds = arange(union.size)

    def codes_in_union(cat: Categorical) -> pdarray:
        # Re-express cat's codes as indices into the unioned categories.
        # Inds of cat.categories in unioned categories
        catnewinds = newinds[in1d(union, cat.categories)]
        # Need a permutation and segments to broadcast new codes; they are
        # computed once and cached on the instance.
        if cat.permutation is None or cat.segments is None:
            g = GroupBy(cat.codes)
            cat.permutation = g.permutation
            cat.segments = g.segments
        # Form new codes by broadcasting new indices for unioned categories
        return broadcast(cat.segments, catnewinds, cat.size, cat.permutation)

    selfnewcodes = codes_in_union(self)
    othernewcodes = codes_in_union(other)
    # selfnewcodes and othernewcodes now refer to same unioned categories
    # and can be compared directly
    return selfnewcodes._binop(othernewcodes, op)