def sort(pda):
    """
    Return a sorted copy of the array.

    Parameters
    ----------
    pda : pdarray
        The array to sort (int64 or float64)

    Returns
    -------
    pdarray, int64 or float64
        The sorted copy of pda. An empty input yields an empty array with
        the same dtype as pda.

    Raises
    ------
    TypeError
        Raised if pda is not a pdarray

    Notes
    -----
    Uses a least-significant-digit radix sort, which is stable and resilient
    to non-uniformity in data but communication intensive.

    Examples
    --------
    >>> a = ak.randint(0, 10, 10)
    >>> sorted = ak.sort(a)
    >>> sorted
    array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9])
    """
    if not isinstance(pda, pdarray):
        raise TypeError("must be pdarray {}".format(pda))
    if pda.size == 0:
        # Nothing to sort, so no request is sent. The result keeps the
        # input dtype so that it is still a "copy" of pda (a float64 input
        # must not come back as int64).
        return zeros(0, dtype=pda.dtype)
    repMsg = generic_msg("sort {}".format(pda.name))
    return create_pdarray(repMsg)
def __init__(self, values, **kwargs):
    """
    Build the object either from precomputed pieces or from a Strings array.

    Parameters
    ----------
    values : Strings
        The values to categorize. Ignored when both ``codes`` and
        ``categories`` are passed as keyword arguments.
    **kwargs
        ``codes`` and ``categories`` : when both are present they are
        stored directly (the path used by ``Categorical.from_codes()``).
        ``permutation`` and ``segments`` : optional, and only consulted on
        that same path.

    Raises
    ------
    ValueError
        Raised on the values path if values is not a Strings object

    Notes
    -----
    On the codes/categories path ``self.permutation`` and ``self.segments``
    are only assigned when they were passed in; otherwise those attributes
    are left unset by this method.
    """
    if 'codes' in kwargs and 'categories' in kwargs:
        # Direct initialization: trust the caller's pieces, ignore values.
        self.codes = kwargs['codes']
        self.categories = kwargs['categories']
        for optional in ('permutation', 'segments'):
            if optional in kwargs:
                setattr(self, optional, kwargs[optional])
    else:
        # Typical initialization: derive everything by grouping the values.
        if not isinstance(values, Strings):
            raise ValueError("Categorical: inputs other than Strings not yet supported")
        grouping = GroupBy(values)
        self.categories = grouping.unique_keys
        self.codes = zeros(values.size, dtype=int64)
        # One category index per element, assigned through the permutation
        # so that codes line up with the original order of values.
        self.codes[grouping.permutation] = grouping.broadcast(
            arange(self.categories.size))
        self.permutation = grouping.permutation
        self.segments = grouping.segments

    # Summary attributes are always derived from codes/categories.
    self.size = self.codes.size
    self.nlevels = self.categories.size
    self.ndim = self.codes.ndim
    self.shape = self.codes.shape
def concatenate(arrays):
    """
    Concatenate an iterable of ``pdarray`` objects into one ``pdarray``.

    Parameters
    ----------
    arrays : iterable of ``pdarray``
        The arrays to concatenate. Must all have same dtype. Any iterable
        is accepted, including generators.

    Returns
    -------
    pdarray
        Single array containing all values, in original order. If the
        total size is zero, an empty array is returned without sending a
        request; it has the common dtype of the inputs, or int64 when no
        arrays were given.

    Raises
    ------
    ValueError
        Raised if any element is not a pdarray, or if the dtypes differ

    Examples
    --------
    >>> ak.concatenate([ak.array([1, 2, 3]), ak.array([4, 5, 6])])
    array([1, 2, 3, 4, 5, 6])
    """
    # Materialize once: the input is walked for validation and again to
    # build the request, and a generator has no len() and would already be
    # exhausted by the second pass.
    arrays = list(arrays)
    size = 0
    dtype = None
    for a in arrays:
        if not isinstance(a, pdarray):
            raise ValueError("Argument must be an iterable of pdarrays")
        # Identity test on purpose: ``dtype == None`` delegates to the
        # dtype's own __eq__, which is not guaranteed to mean "unset".
        if dtype is None:
            dtype = a.dtype
        elif dtype != a.dtype:
            raise ValueError("All pdarrays must have same dtype")
        size += a.size
    if size == 0:
        return zeros(0, dtype=int64 if dtype is None else dtype)
    repMsg = generic_msg("concatenate {} {}".format(
        len(arrays), ' '.join([a.name for a in arrays])))
    return create_pdarray(repMsg)
def concatenate(self, others : List[Categorical], ordered : bool=True) -> Categorical:
    """
    Merge this Categorical with other Categorical objects in the array,
    concatenating the arrays and synchronizing the categories.

    Parameters
    ----------
    others : List[Categorical]
        The Categorical arrays to concatenate and merge with this one.
        A single Categorical is also accepted.
    ordered : bool
        If True (default), the arrays will be appended in the
        order given. If False, array data may be interleaved
        in blocks, which can greatly improve performance but
        results in non-deterministic ordering of elements.

    Returns
    -------
    Categorical
        The merged Categorical object. ``self`` is returned unchanged when
        others is empty.

    Raises
    ------
    TypeError
        Raised if any others array objects are not Categorical objects

    Notes
    -----
    This operation can be expensive -- slower than concatenating Strings.
    """
    if isinstance(others, Categorical):
        others = [others]
    elif len(others) < 1:
        return self
    samecategories = True
    for c in others:
        if not isinstance(c, Categorical):
            raise TypeError(("Categorical: can only merge/concatenate " +
                             "with other Categoricals"))
        # Once one mismatch is known, further element-wise comparisons
        # cannot change the outcome; only the type check still matters.
        if samecategories and (
                (self.categories.size != c.categories.size) or not
                (self.categories == c.categories).all()):
            samecategories = False
    allcodes = [self.codes] + [o.codes for o in others]
    if samecategories:
        # Codes already refer to the same labels; just append them.
        newvals = cast(pdarray, concatenate(allcodes, ordered=ordered))
        return Categorical.from_codes(newvals, self.categories)

    allcats = [self.categories] + [o.categories for o in others]
    # The categories must be concatenated in order: the offsets computed
    # below assume each operand's labels occupy one contiguous run.
    g = GroupBy(concatenate(allcats, ordered=True))
    newidx = g.unique_keys
    # Lookup table from "position in the concatenated categories" to
    # "index in the merged categories". It needs one slot per concatenated
    # label (not per unique label), and every member of a group must get
    # that group's index, hence the broadcast.
    wherediditgo = zeros(g.permutation.size, dtype=akint64)
    wherediditgo[g.permutation] = g.broadcast(arange(newidx.size))
    idxsizes = np.array([cats.size for cats in allcats])
    idxoffsets = np.cumsum(idxsizes) - idxsizes
    # Shift each operand's codes so they index into the concatenated
    # categories, then translate them through the lookup table.
    oldvals = concatenate([codes + off for codes, off in
                           zip(allcodes, idxoffsets)], ordered=ordered)
    newvals = wherediditgo[oldvals]
    return Categorical.from_codes(newvals, newidx)
def broadcast(self, values : pdarray) -> pdarray:
    """
    Fill each group's segment with a constant value.

    Parameters
    ----------
    values : pdarray
        The values to put in each group's segment

    Returns
    -------
    pdarray
        The broadcast values

    Raises
    ------
    TypeError
        Raised if value is not a pdarray object
    ValueError
        Raised if the values array does not have one
        value per segment

    Notes
    -----
    This function is a sparse analog of ``np.broadcast``. If a
    GroupBy object represents a sparse matrix (tensor), then
    this function takes a (dense) column vector and replicates
    each value to the non-zero elements in the corresponding row.

    The returned array is in permuted (grouped) order. To get
    back to the order of the array on which GroupBy was called,
    the user must invert the permutation (see below).

    Examples
    --------
    >>> a = ak.array([0, 1, 0, 1, 0])
    >>> values = ak.array([3, 5])
    >>> g = ak.GroupBy(a)
    # Result is in grouped order
    >>> g.broadcast(values)
    array([3, 3, 3, 5, 5])
    >>> b = ak.zeros_like(a)
    # Result is in original order
    >>> b[g.permutation] = g.broadcast(values)
    >>> b
    array([3, 5, 3, 5, 3])
    """
    if not isinstance(values, pdarray):
        raise TypeError("Vals must be pdarray")
    if values.size != self.segments.size:
        raise ValueError("Must have one value per segment")
    expanded = zeros(self.size, values.dtype)
    if values.size == 0:
        return expanded
    # Write the first value and then the value-to-value deltas at the
    # positions listed in self.segments; a running sum over the otherwise
    # zero array then repeats each value until the next written position.
    leading = array([values[0]])
    steps = values[1:] - values[:-1]
    expanded[self.segments] = concatenate((leading, steps))
    return cumsum(expanded)
def local_argsort(pda):
    """
    Issue a ``localArgsort`` request for the array and wrap the reply.

    Parameters
    ----------
    pda : pdarray
        The array to pass to the ``localArgsort`` command

    Returns
    -------
    pdarray
        The array built from the reply. An empty input returns an empty
        int64 array without sending a request.

    Raises
    ------
    TypeError
        Raised if pda is not a pdarray

    Notes
    -----
    NOTE(review): the exact semantics of ``localArgsort`` are not visible
    here; presumably a per-locale argsort -- confirm against the server.
    """
    if not isinstance(pda, pdarray):
        raise TypeError("must be pdarray {}".format(pda))
    if pda.size == 0:
        return zeros(0, dtype=int64)
    reply = generic_msg("localArgsort {}".format(pda.name))
    return create_pdarray(reply)
def argsort(
    pda: Union[pdarray, Strings, 'Categorical'],
    algorithm: SortingAlgorithm = SortingAlgorithm.RadixSortLSD
) -> pdarray:  # type: ignore
    """
    Return the permutation that sorts the array.

    Parameters
    ----------
    pda : pdarray or Strings or Categorical
        The array to sort (int64 or float64)
    algorithm : SortingAlgorithm
        The algorithm whose ``name`` is sent with the request (default
        ``SortingAlgorithm.RadixSortLSD``). It is not forwarded when the
        call is delegated to the argument's own ``argsort`` method.

    Returns
    -------
    pdarray, int64
        The indices such that ``pda[indices]`` is sorted

    Raises
    ------
    TypeError
        Presumably raised by ``check_type`` if the parameter is other than
        a pdarray, Strings, or Categorical -- confirm against check_type

    See Also
    --------
    coargsort

    Notes
    -----
    The default algorithm is a least-significant-digit radix sort, which
    is stable and resilient to non-uniformity in data but communication
    intensive.

    Examples
    --------
    >>> a = ak.randint(0, 10, 10)
    >>> perm = ak.argsort(a)
    >>> a[perm]
    array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9])
    """
    # Imported here rather than at module level (presumably to avoid a
    # circular import with arkouda.categorical).
    from arkouda.categorical import Categorical
    accepted = Union[pdarray, Strings, Categorical]
    check_type(argname='argsort', value=pda, expected_type=accepted)
    # Anything that provides its own argsort handles the request itself.
    if hasattr(pda, "argsort"):
        return cast(Categorical, pda).argsort()
    if pda.size == 0:
        return zeros(0, dtype=int64)
    # Strings are addressed through their entry name plus a fixed
    # placeholder component; everything else by its own name.
    target = ('{}+{}'.format(pda.entry.name, "legacy_placeholder")
              if isinstance(pda, Strings) else pda.name)
    request_args = "{} {} {}".format(algorithm.name, pda.objtype, target)
    reply = generic_msg(cmd="argsort", args=request_args)
    return create_pdarray(cast(str, reply))
def reset_categories(self):
    """
    Recompute the category labels, discarding any unused labels.

    This method is often useful after slicing or indexing a Categorical
    array, when the resulting array only contains a subset of the
    original categories. In this case, eliminating unused categories can
    speed up other operations.

    Returns
    -------
    Categorical
        A new object built with ``Categorical.from_codes`` from the
        renumbered codes and the labels still in use; the grouping's
        permutation and segments are passed along with it.
    """
    grouped = GroupBy(self.codes)
    # Only codes that actually occur survive as labels.
    used_labels = self.categories[grouped.unique_keys]
    renumbered = zeros(self.codes.size, int64)
    # One new index per element, assigned through the permutation so the
    # result lines up with the order of self.codes.
    renumbered[grouped.permutation] = grouped.broadcast(
        arange(used_labels.size))
    return Categorical.from_codes(
        renumbered,
        used_labels,
        permutation=grouped.permutation,
        segments=grouped.segments,
    )
def merge(self, others : List[Categorical]) -> Categorical:
    """
    Merge this Categorical with other Categorical objects in the array,
    concatenating the arrays and synchronizing the categories.

    Parameters
    ----------
    others : List[Categorical]
        The Categorical arrays to concatenate and merge with this one.
        A single Categorical is also accepted.

    Returns
    -------
    Categorical
        The merged Categorical object. ``self`` is returned unchanged when
        others is empty.

    Raises
    ------
    TypeError
        Raised if any others array objects are not Categorical objects

    Notes
    -----
    This operation can be expensive -- slower than concatenating Strings.
    """
    if isinstance(others, Categorical):
        others = [others]
    elif len(others) < 1:
        return self
    samecategories = True
    for c in others:
        if not isinstance(c, Categorical):
            raise TypeError(("Categorical: can only merge/concatenate " +
                             "with other Categoricals"))
        # Once one mismatch is known, further element-wise comparisons
        # cannot change the outcome; only the type check still matters.
        if samecategories and (
                (self.categories.size != c.categories.size) or not
                (self.categories == c.categories).all()):
            samecategories = False
    allcodes = [self.codes] + [o.codes for o in others]
    if samecategories:
        # Codes already refer to the same labels; just append them.
        newvals = cast(pdarray, concatenate(allcodes))
        return Categorical.from_codes(newvals, self.categories)

    allcats = [self.categories] + [o.categories for o in others]
    g = GroupBy(concatenate(allcats))
    newidx = g.unique_keys
    # Lookup table from "position in the concatenated categories" to
    # "index in the merged categories". It needs one slot per concatenated
    # label (not per unique label), and every member of a group must get
    # that group's index, hence the broadcast.
    wherediditgo = zeros(g.permutation.size, dtype=akint64)
    wherediditgo[g.permutation] = g.broadcast(arange(newidx.size))
    idxsizes = np.array([cats.size for cats in allcats])
    idxoffsets = np.cumsum(idxsizes) - idxsizes
    # The items being shifted are the code arrays themselves, so the
    # offset is added to them directly.
    oldvals = concatenate([codes + off for codes, off in
                           zip(allcodes, idxoffsets)])
    newvals = wherediditgo[oldvals]
    return Categorical.from_codes(newvals, newidx)
def sort(
    pda: pdarray,
    algorithm: SortingAlgorithm = SortingAlgorithm.RadixSortLSD
) -> pdarray:
    """
    Return a sorted copy of the array. Only sorts numeric arrays;
    for Strings, use argsort.

    Parameters
    ----------
    pda : pdarray
        The array to sort (int64 or float64)
    algorithm : SortingAlgorithm
        The algorithm whose ``name`` is sent with the request (default
        ``SortingAlgorithm.RadixSortLSD``)

    Returns
    -------
    pdarray, int64 or float64
        The sorted copy of pda. An empty input yields an empty array with
        the same dtype as pda.

    Raises
    ------
    ValueError
        Raised if sort attempted on a non-empty pdarray with an
        unsupported dtype such as bool

    See Also
    --------
    argsort

    Notes
    -----
    The default algorithm is a least-significant-digit radix sort, which
    is stable and resilient to non-uniformity in data but communication
    intensive.

    No isinstance check is performed in this body; NOTE(review): type
    enforcement, if any, happens outside it -- confirm.

    Examples
    --------
    >>> a = ak.randint(0, 10, 10)
    >>> sorted = ak.sort(a)
    >>> sorted
    array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9])
    """
    if pda.size == 0:
        # Nothing to sort, so no request is sent. The result keeps the
        # input dtype so that it is still a "copy" of pda (a float64 input
        # must not come back as int64).
        return zeros(0, dtype=pda.dtype)
    if pda.dtype not in numeric_dtypes:
        raise ValueError("ak.sort supports float64 or int64, not {}".format(
            pda.dtype))
    repMsg = generic_msg(cmd="sort",
                         args="{} {}".format(algorithm.name, pda.name))
    return create_pdarray(cast(str, repMsg))
def coargsort(arrays):
    """
    Return the permutation that sorts the rows (left-to-right), if the
    input arrays are treated as columns.

    Parameters
    ----------
    arrays : iterable of pdarray
        The columns (int64 or float64) to sort by row. Any iterable is
        accepted, including generators.

    Returns
    -------
    pdarray, int64
        The indices that permute the rows to sorted order

    Raises
    ------
    ValueError
        Raised if any element is not a pdarray, or if the arrays do not
        all have the same size

    See Also
    --------
    argsort

    Notes
    -----
    Uses a least-significant-digit radix sort, which is stable and
    resilient to non-uniformity in data but communication intensive.
    Starts with the last array and moves forward.

    Examples
    --------
    >>> a = ak.array([0, 1, 0, 1])
    >>> b = ak.array([1, 1, 0, 0])
    >>> perm = ak.coargsort([a, b])
    >>> perm
    array([2, 0, 3, 1])
    >>> a[perm]
    array([0, 0, 1, 1])
    >>> b[perm]
    array([0, 1, 0, 1])
    """
    # Materialize once: the input is walked for validation and again to
    # build the request, and a generator has no len() and would already be
    # exhausted by the second pass.
    arrays = list(arrays)
    size = -1  # -1 means "no column seen yet"
    for a in arrays:
        if not isinstance(a, pdarray):
            raise ValueError("Argument must be an iterable of pdarrays")
        if size == -1:
            size = a.size
        elif size != a.size:
            raise ValueError("All pdarrays must have same size")
    if size == 0:
        # Zero rows: the permutation is empty, no request needed.
        return zeros(0, dtype=int64)
    repMsg = generic_msg("coargsort {} {}".format(
        len(arrays), ' '.join([a.name for a in arrays])))
    return create_pdarray(repMsg)
def argsort(pda: Union[pdarray, Strings, 'Categorical']) -> pdarray:
    """
    Return the permutation that sorts the array.

    Parameters
    ----------
    pda : pdarray or Strings or Categorical
        The array to sort (int64 or float64)

    Returns
    -------
    pdarray, int64
        The indices such that ``pda[indices]`` is sorted

    See Also
    --------
    coargsort

    Notes
    -----
    Uses a least-significant-digit radix sort, which is stable and
    resilient to non-uniformity in data but communication intensive.

    No isinstance check is performed in this body; NOTE(review): type
    enforcement, if any, happens outside it -- confirm.

    Examples
    --------
    >>> a = ak.randint(0, 10, 10)
    >>> perm = ak.argsort(a)
    >>> a[perm]
    array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9])
    """
    # Anything that provides its own argsort handles the request itself.
    if hasattr(pda, "argsort"):
        return pda.argsort()
    if pda.size == 0:
        return zeros(0, dtype=int64)
    # Strings are addressed by their offsets and bytes component names
    # joined with '+'; everything else by its own name.
    target = ('{}+{}'.format(pda.offsets.name, pda.bytes.name)
              if isinstance(pda, Strings) else pda.name)
    reply = generic_msg("argsort {} {}".format(pda.objtype, target))
    return create_pdarray(reply)
def sort(pda: pdarray) -> pdarray:
    """
    Return a sorted copy of the array. Only sorts numeric arrays;
    for Strings, use argsort.

    Parameters
    ----------
    pda : pdarray
        The array to sort (int64 or float64)

    Returns
    -------
    pdarray, int64 or float64
        The sorted copy of pda. An empty input yields an empty array with
        the same dtype as pda.

    Raises
    ------
    RuntimeError
        Documented for unsupported dtypes; not raised in this body, so
        presumably propagated from ``generic_msg`` -- confirm

    See Also
    --------
    argsort

    Notes
    -----
    Uses a least-significant-digit radix sort, which is stable and
    resilient to non-uniformity in data but communication intensive.

    No isinstance check is performed in this body; NOTE(review): type
    enforcement, if any, happens outside it -- confirm.

    Examples
    --------
    >>> a = ak.randint(0, 10, 10)
    >>> sorted = ak.sort(a)
    >>> sorted
    array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9])
    """
    if pda.size == 0:
        # Nothing to sort, so no request is sent. The result keeps the
        # input dtype so that it is still a "copy" of pda (a float64 input
        # must not come back as int64).
        return zeros(0, dtype=pda.dtype)
    repMsg = generic_msg("sort {}".format(pda.name))
    return create_pdarray(repMsg)
def argsort(pda):
    """
    Return the permutation that sorts the array.

    Parameters
    ----------
    pda : pdarray
        The array to sort (int64 or float64)

    Returns
    -------
    pdarray, int64
        The indices such that ``pda[indices]`` is sorted. An empty input
        returns an empty int64 array without sending a request.

    Raises
    ------
    TypeError
        Raised if pda is not a pdarray

    See Also
    --------
    coargsort

    Notes
    -----
    Uses a least-significant-digit radix sort, which is stable and
    resilient to non-uniformity in data but communication intensive.

    Examples
    --------
    >>> a = ak.randint(0, 10, 10)
    >>> perm = ak.argsort(a)
    >>> a[perm]
    array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9])
    """
    if not isinstance(pda, pdarray):
        raise TypeError("must be pdarray {}".format(pda))
    if pda.size == 0:
        return zeros(0, dtype=int64)
    reply = generic_msg("argsort {}".format(pda.name))
    return create_pdarray(reply)
def coargsort(arrays: Sequence[Union[Strings, pdarray, 'Categorical']]) -> pdarray:  # type: ignore
    """
    Return the permutation that groups the rows (left-to-right), if the
    input arrays are treated as columns. The permutation sorts numeric
    columns, but not strings/Categoricals -- strings/Categoricals are
    grouped, but not ordered.

    Parameters
    ----------
    arrays : Sequence[Union[Strings, pdarray, Categorical]]
        The columns (int64, float64, Strings, or Categorical) to sort by row

    Returns
    -------
    pdarray, int64
        The indices that permute the rows to grouped order

    Raises
    ------
    ValueError
        Raised if the pdarrays are not of the same size or if the parameter
        is not an Iterable containing pdarrays, Strings, or Categoricals

    See Also
    --------
    argsort

    Notes
    -----
    Uses a least-significant-digit radix sort, which is stable and
    resilient to non-uniformity in data but communication intensive.
    Starts with the last array and moves forward. This sort operates
    directly on numeric types, but for Strings, it operates on a hash.
    Thus, while grouping of equivalent strings is guaranteed, lexicographic
    ordering of the groups is not. For Categoricals, coargsort sorts based
    on Categorical.codes which guarantees grouping of equivalent categories
    but not lexicographic ordering of those groups.

    Examples
    --------
    >>> a = ak.array([0, 1, 0, 1])
    >>> b = ak.array([1, 1, 0, 0])
    >>> perm = ak.coargsort([a, b])
    >>> perm
    array([2, 0, 3, 1])
    >>> a[perm]
    array([0, 0, 1, 1])
    >>> b[perm]
    array([0, 1, 0, 1])
    """
    # Imported here rather than at module level (presumably to avoid a
    # circular import with arkouda.categorical).
    from arkouda.categorical import Categorical
    check_type(argname='coargsort', value=arrays,
               expected_type=Sequence[Union[pdarray, Strings, Categorical]])
    size = -1  # -1 means "no column seen yet"
    anames = []
    atypes = []
    for a in arrays:
        if isinstance(a, (pdarray, Strings)):
            anames.append('+'.join(a._list_component_names()))
            atypes.append(a.objtype)
        elif isinstance(a, Categorical):
            # Categoricals are represented by their codes array.
            anames.append(a.codes.name)
            atypes.append(a.objtype)
        else:
            raise ValueError("Argument must be an iterable of pdarrays, Strings, or Categoricals")
        if size == -1:
            size = a.size
        elif size != a.size:
            raise ValueError("All pdarrays, Strings, or Categoricals must be of the same size")
    if size == 0:
        # Zero rows: the permutation is empty, no request needed.
        return zeros(0, dtype=int64)
    # The count is formatted with plain '{}': the 'n' presentation type is
    # locale-aware and may insert grouping separators, which must never
    # appear in a request message.
    repMsg = generic_msg(cmd="coargsort",
                         args="{} {} {}".format(len(arrays),
                                                ' '.join(anames),
                                                ' '.join(atypes)))
    return create_pdarray(cast(str, repMsg))
def coargsort(arrays: Iterable[Union[Strings, pdarray]]) -> pdarray:
    """
    Return the permutation that groups the rows (left-to-right), if the
    input arrays are treated as columns. The permutation sorts numeric
    columns, but not strings -- strings are grouped, but not ordered.

    Parameters
    ----------
    arrays : iterable of pdarray or Strings
        The columns (int64, float64, or Strings) to sort by row. Any
        iterable is accepted, including generators.

    Returns
    -------
    pdarray, int64
        The indices that permute the rows to grouped order

    Raises
    ------
    ValueError
        Raised if the pdarrays are not of the same size or if the parameter
        is not an Iterable containing pdarrays or Strings

    See Also
    --------
    argsort

    Notes
    -----
    Uses a least-significant-digit radix sort, which is stable and
    resilient to non-uniformity in data but communication intensive.
    Starts with the last array and moves forward. This sort operates
    directly on numeric types, but for Strings, it operates on a hash.
    Thus, while grouping of equivalent strings is guaranteed, lexicographic
    ordering of the groups is not.

    Examples
    --------
    >>> a = ak.array([0, 1, 0, 1])
    >>> b = ak.array([1, 1, 0, 0])
    >>> perm = ak.coargsort([a, b])
    >>> perm
    array([2, 0, 3, 1])
    >>> a[perm]
    array([0, 0, 1, 1])
    >>> b[perm]
    array([0, 1, 0, 1])
    """
    # Materialize once: the annotation allows any Iterable, but len() is
    # needed for the request and a generator would already be exhausted by
    # the validation loop.
    arrays = list(arrays)
    size = -1  # -1 means "no column seen yet"
    anames = []
    atypes = []
    for a in arrays:
        if isinstance(a, Strings):
            # Strings are addressed by their offsets and bytes components.
            anames.append('{}+{}'.format(a.offsets.name, a.bytes.name))
            atypes.append(a.objtype)
        elif isinstance(a, pdarray):
            anames.append(a.name)
            atypes.append('pdarray')
        else:
            raise ValueError(
                "Argument must be an iterable of pdarrays or Strings")
        if size == -1:
            size = a.size
        elif size != a.size:
            raise ValueError(
                "All pdarrays or Strings must be of the same size")
    if size == 0:
        # Zero rows: the permutation is empty, no request needed.
        return zeros(0, dtype=int64)
    cmd = "coargsort"
    # The count is formatted with plain '{}': the 'n' presentation type is
    # locale-aware and may insert grouping separators, which must never
    # appear in a request message.
    reqMsg = "{} {} {} {}".format(cmd,
                                  len(arrays),
                                  ' '.join(anames),
                                  ' '.join(atypes))
    repMsg = generic_msg(reqMsg)
    return create_pdarray(repMsg)
def in1d(
    pda1: Union[pdarray, Strings, 'Categorical'],
    pda2: Union[pdarray, Strings, 'Categorical'],  #type: ignore
    invert: bool = False
) -> pdarray:  #type: ignore
    """
    Test whether each element of a 1-D array is also present in a second
    array.

    Returns a boolean array the same length as `pda1` that is True
    where an element of `pda1` is in `pda2` and False otherwise.

    Parameters
    ----------
    pda1 : pdarray or Strings or Categorical
        Input array.
    pda2 : pdarray or Strings or Categorical
        The values against which to test each value of `pda1`. Must be the
        same type as `pda1`.
    invert : bool, optional
        If True, the values in the returned array are inverted (that is,
        False where an element of `pda1` is in `pda2` and True otherwise).
        Default is False. ``ak.in1d(a, b, invert=True)`` is equivalent
        to (but is faster than) ``~ak.in1d(a, b)``.

    Returns
    -------
    pdarray, bool
        The values `pda1[in1d]` are in `pda2`.

    Raises
    ------
    TypeError
        Raised if pda1 and pda2 are not a supported combination of
        pdarray, Strings, or Categorical objects
    RuntimeError
        Documented for unsupported dtypes; not raised in this body, so
        presumably propagated from ``generic_msg`` -- confirm

    See Also
    --------
    unique, intersect1d, union1d

    Notes
    -----
    `in1d` can be considered as an element-wise function version of the
    python keyword `in`, for 1-D sequences. ``in1d(a, b)`` is logically
    equivalent to ``ak.array([item in b for item in a])``, but is much
    faster and scales to arbitrarily large ``a``.

    ak.in1d is not supported for bool or float64 pdarrays

    Examples
    --------
    >>> ak.in1d(ak.array([-1, 0, 1]), ak.array([-2, 0, 2]))
    array([False, True, False])

    >>> ak.in1d(ak.array(['one','two']),ak.array(['two', 'three','four','five']))
    array([False, True])
    """
    # Imported here rather than at module level (presumably to avoid a
    # circular import with arkouda.categorical).
    from arkouda.categorical import Categorical as Categorical_
    # While isinstance(thing, type) can be called on a tuple of types,
    # this causes an issue with mypy for unknown reasons, hence one call
    # per type.
    supported1 = isinstance(pda1, pdarray) or isinstance(pda1, Strings) \
        or isinstance(pda1, Categorical_)
    supported2 = isinstance(pda2, pdarray) or isinstance(pda2, Strings) \
        or isinstance(pda2, Categorical_)
    # The empty-input shortcuts only apply to supported types; anything
    # else falls through to the TypeError at the bottom.
    if supported1 and supported2:
        if pda1.size == 0:
            return zeros(0, dtype=bool)
        if pda2.size == 0:
            # Nothing is a member of an empty array, so the answer is all
            # False -- or all True when the caller asked for the inverse.
            nothing_found = zeros(pda1.size, dtype=bool)
            return ~nothing_found if invert else nothing_found
    if hasattr(pda1, 'categories'):
        # The delegated call takes no invert flag here, so the inversion
        # is applied to its result.
        found = cast(Categorical_, pda1).in1d(pda2)
        return ~found if invert else found
    elif isinstance(pda1, pdarray) and isinstance(pda2, pdarray):
        repMsg = generic_msg(cmd="in1d",
                             args="{} {} {}".format(pda1.name,
                                                    pda2.name,
                                                    invert))
        return create_pdarray(repMsg)
    elif isinstance(pda1, Strings) and isinstance(pda2, Strings):
        repMsg = generic_msg(cmd="segmentedIn1d",
                             args="{} {} {} {} {} {} {}".format(
                                 pda1.objtype,
                                 pda1.entry.name,
                                 "legacy_placeholder",
                                 pda2.objtype,
                                 pda2.entry.name,
                                 "legacy_placeholder",
                                 invert))
        return create_pdarray(cast(str, repMsg))
    else:
        raise TypeError(
            'Both pda1 and pda2 must be pdarray, Strings, or Categorical')
def broadcast(self, values : pdarray) -> pdarray:
    """
    Fill each group's segment with a constant value.

    Parameters
    ----------
    values : pdarray
        The values to put in each group's segment

    Returns
    -------
    pdarray
        The broadcast values. Boolean input is converted with
        ``1*values`` first, so the result is integer-valued (0/1).

    Raises
    ------
    TypeError
        Raised if value is not a pdarray object
    ValueError
        Raised if the values array does not have one
        value per segment

    Notes
    -----
    This function is a sparse analog of ``np.broadcast``. If a
    GroupBy object represents a sparse matrix (tensor), then
    this function takes a (dense) column vector and replicates
    each value to the non-zero elements in the corresponding row.

    The returned array is in permuted (grouped) order. To get
    back to the order of the array on which GroupBy was called,
    the user must invert the permutation (see below).

    Examples
    --------
    >>> a = ak.array([0, 1, 0, 1, 0])
    >>> values = ak.array([3, 5])
    >>> g = ak.GroupBy(a)
    # Result is in grouped order
    >>> g.broadcast(values)
    array([3, 3, 3, 5, 5])
    >>> b = ak.zeros_like(a)
    # Result is in original order
    >>> b[g.permutation] = g.broadcast(values)
    >>> b
    array([3, 5, 3, 5, 3])

    >>> a = ak.randint(1,5,10)
    >>> a
    array([3, 1, 4, 4, 4, 1, 3, 3, 2, 2])
    >>> g = ak.GroupBy(a)
    >>> keys,counts = g.count()
    >>> g.broadcast(counts > 2)
    array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> g.broadcast(counts == 3)
    array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    >>> g.broadcast(counts < 4)
    array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    """
    if not isinstance(values, pdarray):
        raise TypeError("Vals must be pdarray")
    # If values is a boolean array, convert it to an int64 array, which is
    # needed for now because Arkouda does not support broadcasting of
    # boolean arrays. The comparison uses np.bool_ because the bare
    # ``np.bool`` alias is not available in every NumPy release.
    if values.dtype == np.bool_:
        values = 1*values
    if values.size != self.segments.size:
        raise ValueError("Must have one value per segment")
    temp = zeros(self.size, values.dtype)
    if values.size == 0:
        return temp
    # Write the first value and then the value-to-value deltas at the
    # positions listed in self.segments; a running sum over the otherwise
    # zero array then repeats each value until the next written position.
    diffs = concatenate((array([values[0]]), values[1:] - values[:-1]))
    temp[self.segments] = diffs
    return cumsum(temp)