def argsort(self):
    """
    Return the permutation that orders the elements by category value.

    The categories are ranked with the module-level ``argsort``; each
    element's code is then replaced by the rank of its category, and the
    permutation that sorts those ranks is returned.

    Returns
    -------
    pdarray
        The result of the module-level ``argsort`` applied to the
        per-element category ranks.

    See Also
    --------
    argsort
    """
    # Permutation that puts the categories themselves in sorted order.
    idxperm = argsort(self.categories)
    # Invert it: inverse[i] is the sorted rank of category i.
    inverse = zeros_like(idxperm)
    inverse[idxperm] = arange(idxperm.size)
    # Translate every element's code into its category's sorted rank.
    newvals = inverse[self.codes]
    return argsort(newvals)
def sort(self):
    """
    Return a Categorical whose categories are in sorted order.

    The categories are reordered with the module-level ``argsort`` and
    every code is remapped to the new position of its category. The codes
    are looked up position-by-position (``inverse[self.codes]``), so only
    the labelling changes; no permutation is applied to the elements here.

    Returns
    -------
    Categorical
        Built with ``Categorical.from_codes`` from the remapped codes and
        the sorted categories.

    See Also
    --------
    argsort
    """
    # Permutation that puts the categories in sorted order.
    idxperm = argsort(self.categories)
    # Invert it: inverse[i] is the new index of old category i.
    inverse = zeros_like(idxperm)
    inverse[idxperm] = arange(idxperm.size)
    # Re-express each element's code against the sorted category list.
    newvals = inverse[self.codes]
    return Categorical.from_codes(newvals, self.categories[idxperm])
def group(self):
    """
    Return a permutation that places equal categories next to each other.

    After applying the permutation, every occurrence of a given category
    sits in a single contiguous block. The blocks themselves are not
    guaranteed to appear in any particular order.

    Returns
    -------
    pdarray
        The permutation that groups the array by value

    See Also
    --------
    GroupBy, unique

    Notes
    -----
    This method is faster than the corresponding Strings method. When a
    permutation is already cached on the instance (the case for a
    Categorical created from a Strings object) it is returned directly.
    Otherwise, e.g. after from_codes(), the codes are argsorted, which is
    still faster than Strings.group() because dense integer values are
    sorted rather than 128-bit hash values.
    """
    cached = self.permutation
    if cached is not None:
        # Reuse the permutation computed when the object was built.
        return cached
    return argsort(self.codes)
def intersect1d(pda1: pdarray, pda2: pdarray,
                assume_unique: bool = False) -> pdarray:
    """
    Compute the set intersection of two arrays.

    The result holds the sorted, unique values present in both inputs.

    Parameters
    ----------
    pda1 : pdarray
        Input array
    pda2 : pdarray
        Input array
    assume_unique : bool
        If True, both inputs are taken to be unique already, which skips
        the de-duplication step. Default is False.

    Returns
    -------
    pdarray
        Sorted 1D array of common and unique elements. If either input is
        empty, that empty input is returned unchanged.

    Raises
    ------
    TypeError
        Raised if either pda1 or pda2 is not a pdarray
        (NOTE(review): not checked in this function body - presumably
        enforced elsewhere, confirm)
    RuntimeError
        Raised if the dtype of either pdarray is not supported

    See Also
    --------
    unique, union1d

    Notes
    -----
    ak.intersect1d is not supported for bool or float64 pdarrays

    Examples
    --------
    >>> a = ak.array([1, 3, 4, 3])
    >>> b = ak.array([3, 1, 2, 1])
    >>> ak.intersect1d(a, b)
    array([1, 3])
    """
    # An empty operand means an empty intersection; hand that operand back.
    if pda1.size == 0:
        return pda1
    if pda2.size == 0:
        return pda2

    # Two integer arrays are intersected by the server in one request.
    if pda1.dtype == int and pda2.dtype == int:
        request_args = "{} {} {}".format(pda1.name, pda2.name, assume_unique)
        reply = generic_msg(cmd="intersect1d", args=request_args)
        return create_pdarray(cast(str, reply))

    # Generic path: after de-duplication a value common to both inputs
    # occurs exactly twice in the sorted concatenation.
    if not assume_unique:
        pda1 = unique(pda1)
        pda2 = unique(pda2)
    merged = concatenate((pda1, pda2), ordered=False)
    merged = merged[argsort(merged)]
    repeats_next = merged[1:] == merged[:-1]
    return merged[:-1][repeats_next]
def __init__(self, keys):
    """
    Build the grouping permutation for ``keys`` and locate its segments.

    Parameters
    ----------
    keys : pdarray, Strings, or sequence of arrays
        A single pdarray is argsorted, a Strings object is grouped via its
        own ``group()`` method, and anything else is treated as a sequence
        of equally sized key arrays and passed to ``coargsort``.

    Raises
    ------
    ValueError
        If the key arrays in a sequence differ in size, or if per-locale
        grouping is requested for Strings keys.

    Notes
    -----
    ``per_locale`` is hard-coded to False here, so the per-locale paths
    below are not taken by this constructor as written.
    """
    self.per_locale = False
    self.keys = keys

    if isinstance(keys, pdarray):
        self.nkeys = 1
        self.size = keys.size
        sorter = local_argsort if self.per_locale else argsort
        self.permutation = sorter(keys)
    elif isinstance(keys, Strings):
        self.nkeys = 1
        self.size = keys.size
        if self.per_locale:
            raise ValueError("per-locale groupby not supported on strings")
        self.permutation = keys.group()
    else:
        self.nkeys = len(keys)
        # The first key array sets the size every other one must match.
        self.size = keys[0].size
        if any(k.size != self.size for k in keys):
            raise ValueError("Key arrays must all be same size")
        self.permutation = coargsort(keys)

    self.find_segments()
def __init__(self, keys: List[Union[pdarray, np.int64, Strings]],
             assume_sorted: bool = False,
             hash_strings: bool = True) -> None:
    """
    Build the grouping permutation for ``keys`` and locate its segments.

    Parameters
    ----------
    keys : pdarray, object with a ``group`` method, or list of arrays
        A single pdarray is argsorted; an object exposing ``group()``
        (Strings or Categorical) is grouped through that method; anything
        else is treated as a sequence of equally sized key arrays and
        passed to ``coargsort``.
    assume_sorted : bool
        If True, no sorting is performed and the identity permutation
        ``arange(size)`` is used. Default is False.
    hash_strings : bool
        Stored on the instance; not otherwise used by this constructor.

    Raises
    ------
    ValueError
        If the key arrays in a sequence differ in size.
    """
    self.logger = getArkoudaLogger(name=self.__class__.__name__)
    self.assume_sorted = assume_sorted
    self.hash_strings = hash_strings
    self.keys = keys

    if isinstance(keys, pdarray):
        self.nkeys = 1
        self.size = keys.size
        self.permutation = (arange(self.size) if assume_sorted
                            else argsort(keys))
    elif hasattr(keys, "group"):
        # Strings and Categorical provide their own grouping permutation.
        self.nkeys = 1
        self.size = keys.size
        self.permutation = (arange(self.size) if assume_sorted
                            else keys.group())
    else:
        self.nkeys = len(keys)
        # The first key array sets the size every other one must match.
        self.size = keys[0].size
        if any(k.size != self.size for k in keys):
            raise ValueError("Key arrays must all be same size")
        self.permutation = (arange(self.size) if assume_sorted
                            else coargsort(keys))

    self.find_segments()
def setxor1d(pda1: pdarray, pda2: pdarray,
             assume_unique: bool = False) -> pdarray:
    """
    Find the set exclusive-or (symmetric difference) of two arrays.

    Return the sorted, unique values that are in only one (not both) of the
    input arrays.

    Parameters
    ----------
    pda1 : pdarray
        Input array.
    pda2 : pdarray
        Input array.
    assume_unique : bool
        If True, the input arrays are both assumed to be unique, which
        can speed up the calculation.  Default is False.

    Returns
    -------
    pdarray
        Sorted 1D array of unique values that are in only one of the input
        arrays.

    Raises
    ------
    TypeError
        Raised if either pda1 or pda2 is not a pdarray
        (NOTE(review): not checked in this function body - presumably
        enforced elsewhere, confirm)
    RuntimeError
        Raised if the dtype of either pdarray is not supported

    Notes
    -----
    ak.setxor1d is not supported for bool or float64 pdarrays

    When one input is empty the result is the other input's values,
    de-duplicated (unless ``assume_unique``) and sorted, so the
    sorted/unique contract also holds in that case. When both inputs are
    empty, ``pda2`` is returned unchanged.

    Examples
    --------
    >>> a = ak.array([1, 2, 3, 2, 4])
    >>> b = ak.array([2, 3, 5, 7, 5])
    >>> ak.setxor1d(a,b)
    array([1, 4, 5, 7])
    """
    if pda1.size == 0 or pda2.size == 0:
        # XOR with the empty set leaves the other operand's set of values.
        remaining = pda2 if pda1.size == 0 else pda1
        if remaining.size == 0:
            return remaining  # both inputs empty: nothing to normalise
        # Returning the operand untouched could leak duplicates or an
        # unsorted order, so normalise it like the general path does.
        if not assume_unique:
            remaining = cast(pdarray, unique(remaining))
        return remaining[argsort(remaining)]

    # Two integer arrays are handled by the server in one request.
    if pda1.dtype == int and pda2.dtype == int:
        repMsg = generic_msg(cmd="setxor1d", args="{} {} {}".
                             format(pda1.name, pda2.name, assume_unique))
        return create_pdarray(cast(str, repMsg))

    if not assume_unique:
        pda1 = cast(pdarray, unique(pda1))
        pda2 = cast(pdarray, unique(pda2))
    aux = concatenate((pda1, pda2), ordered=False)
    aux_sort_indices = argsort(aux)
    aux = aux[aux_sort_indices]
    # flag[i] is True where aux[i] differs from aux[i-1]; the True sentinels
    # at both ends let the first and last elements be tested the same way.
    flag = concatenate((array([True]), aux[1:] != aux[:-1], array([True])))
    # Keep values that differ from both neighbours, i.e. occur exactly once.
    return aux[flag[1:] & flag[:-1]]
def intersect1d(pda1, pda2, assume_unique=False):
    """
    Compute the set intersection of two arrays.

    The result holds the sorted, unique values present in both inputs.

    Parameters
    ----------
    pda1 : pdarray
        Input array
    pda2 : pdarray
        Input array
    assume_unique : bool
        If True, both inputs are taken to be unique already, which skips
        the de-duplication step. Default is False.

    Returns
    -------
    pdarray
        Sorted 1D array of common and unique elements. If either input is
        empty, that empty input is returned unchanged.

    Raises
    ------
    TypeError
        If either pda1 or pda2 is not a pdarray.

    See Also
    --------
    unique, union1d

    Examples
    --------
    >>> a = ak.array([1, 3, 4, 3])
    >>> b = ak.array([3, 1, 2, 1])
    >>> ak.intersect1d(a, b)
    array([1, 3])
    """
    both_pdarrays = isinstance(pda1, pdarray) and isinstance(pda2, pdarray)
    if not both_pdarrays:
        raise TypeError("must be pdarray {} or {}".format(pda1,pda2))

    # An empty operand means an empty intersection; hand that operand back.
    for operand in (pda1, pda2):
        if operand.size == 0:
            return operand

    # Two integer arrays are intersected by the server in one request.
    if pda1.dtype == int and pda2.dtype == int:
        request = "intersect1d {} {} {}".format(pda1.name, pda2.name, assume_unique)
        return create_pdarray(generic_msg(request))

    # Generic path: after de-duplication a value common to both inputs
    # occurs exactly twice in the sorted concatenation.
    if not assume_unique:
        pda1 = unique(pda1)
        pda2 = unique(pda2)
    merged = concatenate((pda1, pda2))
    merged = merged[argsort(merged)]
    repeats_next = merged[1:] == merged[:-1]
    return merged[:-1][repeats_next]
def setxor1d(pda1, pda2, assume_unique=False):
    """
    Find the set exclusive-or (symmetric difference) of two arrays.

    Return the sorted, unique values that are in only one (not both) of the
    input arrays.

    Parameters
    ----------
    pda1 : pdarray
        Input array.
    pda2 : pdarray
        Input array.
    assume_unique : bool
        If True, the input arrays are both assumed to be unique, which
        can speed up the calculation.  Default is False.

    Returns
    -------
    pdarray
        Sorted 1D array of unique values that are in only one of the input
        arrays.

    Raises
    ------
    TypeError
        If either pda1 or pda2 is not a pdarray.

    Notes
    -----
    When one input is empty the result is the other input's values,
    de-duplicated (unless ``assume_unique``) and sorted, so the
    sorted/unique contract also holds in that case. When both inputs are
    empty, ``pda2`` is returned unchanged.

    Examples
    --------
    >>> a = ak.array([1, 2, 3, 2, 4])
    >>> b = ak.array([2, 3, 5, 7, 5])
    >>> ak.setxor1d(a,b)
    array([1, 4, 5, 7])
    """
    if not (isinstance(pda1, pdarray) and isinstance(pda2, pdarray)):
        raise TypeError("must be pdarray {} or {}".format(pda1,pda2))

    if pda1.size == 0 or pda2.size == 0:
        # XOR with the empty set leaves the other operand's set of values.
        remaining = pda2 if pda1.size == 0 else pda1
        if remaining.size == 0:
            return remaining  # both inputs empty: nothing to normalise
        # Returning the operand untouched could leak duplicates or an
        # unsorted order, so normalise it like the general path does.
        if not assume_unique:
            remaining = unique(remaining)
        return remaining[argsort(remaining)]

    # Two integer arrays are handled by the server in one request.
    if pda1.dtype == int and pda2.dtype == int:
        repMsg = generic_msg("setxor1d {} {} {}".format(pda1.name, pda2.name, assume_unique))
        return create_pdarray(repMsg)

    if not assume_unique:
        pda1 = unique(pda1)
        pda2 = unique(pda2)
    aux = concatenate((pda1, pda2))
    aux_sort_indices = argsort(aux)
    aux = aux[aux_sort_indices]
    # flag[i] is True where aux[i] differs from aux[i-1]; the True sentinels
    # at both ends let the first and last elements be tested the same way.
    flag = concatenate((array([True]), aux[1:] != aux[:-1], array([True])))
    # Keep values that differ from both neighbours, i.e. occur exactly once.
    return aux[flag[1:] & flag[:-1]]
def __init__(self, keys: Union[pdarray, Strings, 'Categorical',
                               List[Union[pdarray, np.int64, Strings]]],
             assume_sorted: bool = False,
             hash_strings: bool = True) -> None:
    """
    Build the grouping permutation for ``keys`` and locate its segments.

    Parameters
    ----------
    keys : pdarray, Strings, Categorical, or list of arrays
        A single int64 pdarray is argsorted; an object exposing
        ``group()`` (Strings or Categorical) is grouped through that
        method; anything else is treated as a sequence of equally sized
        key arrays and passed to ``coargsort``.
    assume_sorted : bool
        If True, no sorting is performed and the identity permutation
        ``arange(size)`` is used. Default is False.
    hash_strings : bool
        Stored on the instance; not otherwise used by this constructor.

    Raises
    ------
    TypeError
        If ``keys`` is a single pdarray whose dtype is not int64.
    ValueError
        If the key arrays in a sequence differ in size.
    """
    from arkouda.categorical import Categorical

    self.logger = getArkoudaLogger(name=self.__class__.__name__)
    self.assume_sorted = assume_sorted
    self.hash_strings = hash_strings
    self.keys: Union[pdarray, Strings, Categorical]

    if isinstance(keys, pdarray):
        if keys.dtype != int64:
            raise TypeError(
                'GroupBy only supports pdarrays with a dtype int64')
        self.keys = cast(pdarray, keys)
        self.nkeys = 1
        self.size = cast(int, keys.size)
        self.permutation = cast(
            pdarray,
            arange(self.size) if assume_sorted else argsort(keys))
    elif hasattr(keys, "group"):
        # Strings and Categorical provide their own grouping permutation.
        self.nkeys = 1
        groupable = cast(Union[Strings, Categorical], keys)
        self.keys = groupable
        self.size = cast(int, self.keys.size)  # type: ignore
        if assume_sorted:
            self.permutation = cast(pdarray, arange(self.size))
        else:
            self.permutation = groupable.group()
    else:
        self.keys = cast(Union[pdarray, Strings, Categorical], keys)
        self.nkeys = len(keys)
        # The first key array sets the size every other one must match.
        self.size = cast(int, keys[0].size)  # type: ignore
        if any(k.size != self.size for k in keys):
            raise ValueError("Key arrays must all be same size")
        self.permutation = cast(
            pdarray,
            arange(self.size) if assume_sorted
            else coargsort(cast(Sequence[pdarray], keys)))

    self.find_segments()
def __init__(self, keys, assume_sorted=False, hash_strings=True):
    """
    Build the grouping permutation for ``keys`` and locate its segments.

    Parameters
    ----------
    keys : pdarray, object with a ``group`` method, or sequence of arrays
        A single pdarray is argsorted; an object exposing ``group()``
        (Strings or Categorical) is grouped through that method; anything
        else is treated as a sequence of equally sized key arrays and
        passed to ``coargsort``.
    assume_sorted : bool
        If True, no sorting is performed and the identity permutation
        ``arange(size)`` is used. Default is False.
    hash_strings : bool
        Stored on the instance; not otherwise used by this constructor.

    Raises
    ------
    ValueError
        If the key arrays in a sequence differ in size, or if per-locale
        grouping is requested for Strings or Categorical keys that are
        not assumed sorted.

    Notes
    -----
    ``per_locale`` is hard-coded to False here, so the per-locale paths
    below are not taken by this constructor as written.
    """
    self.assume_sorted = assume_sorted
    self.hash_strings = hash_strings
    self.per_locale = False
    self.keys = keys

    if isinstance(keys, pdarray):
        self.nkeys = 1
        self.size = keys.size
        if assume_sorted:
            self.permutation = arange(self.size)
        else:
            sorter = local_argsort if self.per_locale else argsort
            self.permutation = sorter(keys)
    elif hasattr(keys, "group"):
        # Strings and Categorical provide their own grouping permutation.
        self.nkeys = 1
        self.size = keys.size
        if assume_sorted:
            self.permutation = arange(self.size)
        else:
            if self.per_locale:
                raise ValueError("per-locale groupby not supported on Strings or Categorical")
            self.permutation = keys.group()
    else:
        self.nkeys = len(keys)
        # The first key array sets the size every other one must match.
        self.size = keys[0].size
        if any(k.size != self.size for k in keys):
            raise ValueError("Key arrays must all be same size")
        self.permutation = (arange(self.size) if assume_sorted
                            else coargsort(keys))

    self.find_segments()