예제 #1
0
 def argsort(self):
     """
     Return the permutation that orders the elements by category label.

     The categories are ranked with the module-level ``argsort``; every code
     is then replaced by the rank of its category, and the permutation that
     sorts those ranks is returned.

     Returns
     -------
     The result of ``argsort`` applied to the per-element category ranks,
     i.e. indices that arrange the elements in sorted category order.
     """
     # The method name lives in the class namespace, so ``argsort`` below
     # resolves to the module-level function rather than to this method.
     idxperm = argsort(self.categories)
     # Invert the permutation: inverse[i] is the sorted rank of category i.
     inverse = zeros_like(idxperm)
     inverse[idxperm] = arange(idxperm.size)
     # Rank of each element's category, still in element order.
     newvals = inverse[self.codes]
     return argsort(newvals)
예제 #2
0
 def sort(self):
     """
     Return a sorted copy of the array, ordered by category label.

     The categories are ranked with ``argsort`` and every code is replaced by
     the rank of its category.  Those ranks are then put in ascending order
     so that the returned object holds the elements in sorted order, matching
     what indexing with the permutation from ``self.argsort()`` would give.

     Returns
     -------
     The object built by ``Categorical.from_codes`` from the ordered ranks
     and the sorted categories.
     """
     idxperm = argsort(self.categories)
     # Invert the permutation: inverse[i] is the sorted rank of category i.
     inverse = zeros_like(idxperm)
     inverse[idxperm] = arange(idxperm.size)
     # Rank of each element's category, still in element order.
     newvals = inverse[self.codes]
     # Re-coding alone leaves every element where it was; the ranks must be
     # ordered as well for the result to actually be sorted.
     sorted_codes = newvals[argsort(newvals)]
     return Categorical.from_codes(sorted_codes, self.categories[idxperm])
예제 #3
0
    def group(self):
        """
        Return a permutation that places equal categories next to each other.

        After applying the permutation, every category occupies a single
        contiguous block. The relative order of the blocks is not guaranteed.

        Returns
        -------
        pdarray
            The permutation that groups the array by value

        See Also
        --------
        GroupBy, unique

        Notes
        -----
        When a permutation is already stored on the instance
        (``self.permutation`` is not None) it is returned as is; otherwise
        the permutation is obtained by calling ``argsort`` on the codes.
        According to the original documentation this is faster than
        Strings.group(): a Categorical built from a Strings object reuses
        its cached permutation, and one built with from_codes() sorts dense
        integer values rather than 128-bit hash values.
        """
        cached = self.permutation
        return argsort(self.codes) if cached is None else cached
예제 #4
0
def intersect1d(pda1: pdarray,
                pda2: pdarray,
                assume_unique: bool = False) -> pdarray:
    """
    Compute the set intersection of two arrays.

    The result holds the sorted, unique values present in both inputs.

    Parameters
    ----------
    pda1 : pdarray
        First input array
    pda2 : pdarray
        Second input array
    assume_unique : bool
        When True, both inputs are taken to be free of duplicates already
        and the de-duplication step is skipped.  Default is False.

    Returns
    -------
    pdarray
        Sorted 1D array of the values common to both inputs.  If either
        input is empty, that empty input is returned unchanged.

    Raises
    ------
    TypeError
        Raised if either pda1 or pda2 is not a pdarray (the check itself is
        not part of this function body)
    RuntimeError
        Raised if the dtype of either pdarray is not supported

    See Also
    --------
    unique, union1d

    Notes
    -----
    ak.intersect1d is not supported for bool or float64 pdarrays

    Examples
    --------
    >>> ak.intersect1d(ak.array([1, 3, 4, 3]), ak.array([3, 1, 2, 1]))
    array([1, 3])
    """
    # An empty operand means an empty intersection; hand it back unchanged.
    for operand in (pda1, pda2):
        if operand.size == 0:
            return operand

    if pda1.dtype == int and pda2.dtype == int:
        # Two integer arrays are intersected by a single server command.
        message_args = "{} {} {}".format(pda1.name, pda2.name, assume_unique)
        reply = generic_msg(cmd="intersect1d", args=message_args)
        return create_pdarray(cast(str, reply))

    if not assume_unique:
        pda1 = unique(pda1)
        pda2 = unique(pda2)

    merged = concatenate((pda1, pda2), ordered=False)
    merged = merged[argsort(merged)]
    # With duplicate-free inputs, a shared value appears as an adjacent
    # equal pair once the merged array is sorted; keep the first of each.
    leading = merged[:-1]
    return leading[merged[1:] == leading]
예제 #5
0
    def __init__(self, keys):
        """
        Store the keys, compute the grouping permutation and find segments.

        Parameters
        ----------
        keys : pdarray, Strings, or sequence of arrays
            A single pdarray is ordered with ``argsort``, a Strings object
            with its own ``group()`` method, and anything else is treated as
            a sequence of equally sized key arrays ordered with ``coargsort``.

        Raises
        ------
        ValueError
            Raised if the key arrays of a multi-key grouping differ in size,
            or if per-locale grouping is requested for Strings keys
        """
        # Per-locale grouping is switched off unconditionally here, so the
        # per-locale branches below are currently never taken.
        self.per_locale = False
        self.keys = keys

        if isinstance(keys, (pdarray, Strings)):
            self.nkeys = 1
            self.size = keys.size
            if isinstance(keys, pdarray):
                sorter = local_argsort if self.per_locale else argsort
                self.permutation = sorter(keys)
            elif self.per_locale:
                raise ValueError("per-locale groupby not supported on strings")
            else:
                self.permutation = keys.group()
        else:
            self.nkeys = len(keys)
            self.size = keys[0].size
            if any(k.size != self.size for k in keys):
                raise ValueError("Key arrays must all be same size")
            self.permutation = coargsort(keys)

        self.find_segments()
예제 #6
0
 def __init__(self, keys : List[Union[pdarray,np.int64,Strings]],
             assume_sorted : bool=False, hash_strings : bool=True) -> None:
     """
     Store the keys, compute the grouping permutation and find segments.

     Parameters
     ----------
     keys
         A single pdarray, an object exposing ``group()`` (Strings or
         Categorical), or a sequence of equally sized key arrays.
     assume_sorted : bool
         When True the keys are taken to be ordered already and the
         identity permutation ``arange(size)`` is used instead of sorting.
     hash_strings : bool
         Stored on the instance; not otherwise used in this method.

     Raises
     ------
     ValueError
         Raised if the key arrays of a multi-key grouping differ in size
     """
     self.logger = getArkoudaLogger(name=self.__class__.__name__)
     self.assume_sorted = assume_sorted
     self.hash_strings = hash_strings
     self.keys = keys

     is_pdarray = isinstance(keys, pdarray)
     # Strings and Categorical are recognised by their ``group`` method.
     is_single_key = is_pdarray or hasattr(keys, "group")

     # Step 1: record how many keys there are and how long they are.
     if is_single_key:
         self.nkeys = 1
         self.size = keys.size
     else:
         self.nkeys = len(keys)
         self.size = keys[0].size
         if any(k.size != self.size for k in keys):
             raise ValueError("Key arrays must all be same size")

     # Step 2: pick the permutation that brings equal keys together.
     if assume_sorted:
         self.permutation = arange(self.size)
     elif is_pdarray:
         self.permutation = argsort(keys)
     elif is_single_key:
         self.permutation = keys.group()
     else:
         self.permutation = coargsort(keys)

     self.find_segments()
예제 #7
0
def setxor1d(pda1: pdarray,
             pda2: pdarray,
             assume_unique: bool = False) -> pdarray:
    """
    Find the set exclusive-or (symmetric difference) of two arrays.

    Return the sorted, unique values that are in only one (not both) of the
    input arrays.

    Parameters
    ----------
    pda1 : pdarray
        Input array.
    pda2 : pdarray
        Input array.
    assume_unique : bool
        If True, the input arrays are both assumed to be unique, which
        can speed up the calculation.  Default is False.

    Returns
    -------
    pdarray
        Sorted 1D array of unique values that are in only one of the input
        arrays.  When one input is empty this is the sorted, de-duplicated
        content of the other input; when both are empty, pda2 is returned
        unchanged.

    Raises
    ------
    TypeError
        Raised if either pda1 or pda2 is not a pdarray (the check itself is
        not part of this function body)
    RuntimeError
        Raised if the dtype of either pdarray is not supported

    Notes
    -----
    ak.setxor1d is not supported for bool or float64 pdarrays

    Examples
    --------
    >>> a = ak.array([1, 2, 3, 2, 4])
    >>> b = ak.array([2, 3, 5, 7, 5])
    >>> ak.setxor1d(a,b)
    array([1, 4, 5, 7])
    """
    if pda1.size == 0 or pda2.size == 0:
        # The symmetric difference with an empty array is the value set of
        # the other array.  It must still honour the sorted/unique contract,
        # so the other input cannot simply be handed back as-is.
        remaining = pda2 if pda1.size == 0 else pda1
        if remaining.size == 0:
            return remaining  # both inputs empty: nothing to order
        if not assume_unique:
            remaining = cast(pdarray, unique(remaining))
        return remaining[argsort(remaining)]
    if pda1.dtype == int and pda2.dtype == int:
        # Two integer arrays are handled by a single server command.
        repMsg = generic_msg(cmd="setxor1d", args="{} {} {}".
                             format(pda1.name, pda2.name, assume_unique))
        return create_pdarray(cast(str, repMsg))
    if not assume_unique:
        pda1 = cast(pdarray, unique(pda1))
        pda2 = cast(pdarray, unique(pda2))
    aux = concatenate((pda1, pda2), ordered=False)
    aux_sort_indices = argsort(aux)
    aux = aux[aux_sort_indices]
    # flag[i] is True where sorted element i differs from its predecessor;
    # the True sentinels at both ends let the first and last element qualify.
    flag = concatenate((array([True]), aux[1:] != aux[:-1], array([True])))
    # Keep the values that differ from both neighbours, i.e. occur once.
    return aux[flag[1:] & flag[:-1]]
예제 #8
0
def intersect1d(pda1, pda2, assume_unique=False):
    """
    Compute the set intersection of two arrays.

    The result holds the sorted, unique values present in both inputs.

    Parameters
    ----------
    pda1 : pdarray
        First input array
    pda2 : pdarray
        Second input array
    assume_unique : bool
        When True, both inputs are taken to be free of duplicates already
        and the de-duplication step is skipped.  Default is False.

    Returns
    -------
    pdarray
        Sorted 1D array of the values common to both inputs.  If either
        input is empty, that empty input is returned unchanged.

    Raises
    ------
    TypeError
        Raised if either pda1 or pda2 is not a pdarray

    See Also
    --------
    unique, union1d

    Examples
    --------
    >>> ak.intersect1d(ak.array([1, 3, 4, 3]), ak.array([3, 1, 2, 1]))
    array([1, 3])
    """
    if not isinstance(pda1, pdarray) or not isinstance(pda2, pdarray):
        raise TypeError("must be pdarray {} or {}".format(pda1,pda2))

    # An empty operand means an empty intersection; hand it back unchanged.
    for operand in (pda1, pda2):
        if operand.size == 0:
            return operand

    if pda1.dtype == int and pda2.dtype == int:
        # Two integer arrays are intersected by a single server command.
        request = "intersect1d {} {} {}".format(pda1.name, pda2.name, assume_unique)
        return create_pdarray(generic_msg(request))

    if not assume_unique:
        pda1 = unique(pda1)
        pda2 = unique(pda2)

    merged = concatenate((pda1, pda2))
    merged = merged[argsort(merged)]
    # With duplicate-free inputs, a shared value appears as an adjacent
    # equal pair once the merged array is sorted; keep the first of each.
    leading = merged[:-1]
    return leading[merged[1:] == leading]
예제 #9
0
def setxor1d(pda1, pda2, assume_unique=False):
    """
    Find the set exclusive-or (symmetric difference) of two arrays.

    Return the sorted, unique values that are in only one (not both) of the
    input arrays.

    Parameters
    ----------
    pda1 : pdarray
        Input array.
    pda2 : pdarray
        Input array.
    assume_unique : bool
        If True, the input arrays are both assumed to be unique, which
        can speed up the calculation.  Default is False.

    Returns
    -------
    pdarray
        Sorted 1D array of unique values that are in only one of the input
        arrays.  When one input is empty this is the sorted, de-duplicated
        content of the other input; when both are empty, pda2 is returned
        unchanged.

    Raises
    ------
    TypeError
        Raised if either pda1 or pda2 is not a pdarray

    Examples
    --------
    >>> a = ak.array([1, 2, 3, 2, 4])
    >>> b = ak.array([2, 3, 5, 7, 5])
    >>> ak.setxor1d(a,b)
    array([1, 4, 5, 7])
    """
    if not (isinstance(pda1, pdarray) and isinstance(pda2, pdarray)):
        raise TypeError("must be pdarray {} or {}".format(pda1,pda2))
    if pda1.size == 0 or pda2.size == 0:
        # The symmetric difference with an empty array is the value set of
        # the other array.  It must still honour the sorted/unique contract,
        # so the other input cannot simply be handed back as-is.
        remaining = pda2 if pda1.size == 0 else pda1
        if remaining.size == 0:
            return remaining  # both inputs empty: nothing to order
        if not assume_unique:
            remaining = unique(remaining)
        return remaining[argsort(remaining)]
    if pda1.dtype == int and pda2.dtype == int:
        # Two integer arrays are handled by a single server command.
        repMsg = generic_msg("setxor1d {} {} {}".format(pda1.name, pda2.name, assume_unique))
        return create_pdarray(repMsg)
    if not assume_unique:
        pda1 = unique(pda1)
        pda2 = unique(pda2)
    aux = concatenate((pda1, pda2))
    aux_sort_indices = argsort(aux)
    aux = aux[aux_sort_indices]
    # flag[i] is True where sorted element i differs from its predecessor;
    # the True sentinels at both ends let the first and last element qualify.
    flag = concatenate((array([True]), aux[1:] != aux[:-1], array([True])))
    # Keep the values that differ from both neighbours, i.e. occur once.
    return aux[flag[1:] & flag[:-1]]
예제 #10
0
    def __init__(self,
                 keys: Union[pdarray, Strings, 'Categorical',
                             List[Union[pdarray, np.int64, Strings]]],
                 assume_sorted: bool = False,
                 hash_strings: bool = True) -> None:
        """
        Store the keys, compute the grouping permutation and find segments.

        Parameters
        ----------
        keys
            A single int64 pdarray, an object exposing ``group()`` (Strings
            or Categorical), or a sequence of equally sized key arrays.
        assume_sorted : bool
            When True the keys are taken to be ordered already and the
            identity permutation ``arange(size)`` is used instead of sorting.
        hash_strings : bool
            Stored on the instance; not otherwise used in this method.

        Raises
        ------
        TypeError
            Raised if a single pdarray key does not have dtype int64
        ValueError
            Raised if the key arrays of a multi-key grouping differ in size
        """
        # Imported inside the method rather than at module level
        # (presumably to avoid a circular import -- confirm).
        from arkouda.categorical import Categorical
        self.logger = getArkoudaLogger(name=self.__class__.__name__)
        self.assume_sorted = assume_sorted
        self.hash_strings = hash_strings
        self.keys: Union[pdarray, Strings, Categorical]

        # Each branch records the key metadata and, unless the caller vouches
        # for sorted input, the permutation produced by the matching sorter.
        if isinstance(keys, pdarray):
            if keys.dtype != int64:
                raise TypeError(
                    'GroupBy only supports pdarrays with a dtype int64')
            self.keys = cast(pdarray, keys)
            self.nkeys = 1
            self.size = cast(int, keys.size)
            if not assume_sorted:
                self.permutation = cast(pdarray, argsort(keys))
        elif hasattr(keys, "group"):  # for Strings or Categorical
            groupable = cast(Union[Strings, Categorical], keys)
            self.nkeys = 1
            self.keys = groupable
            self.size = cast(int, groupable.size)  # type: ignore
            if not assume_sorted:
                self.permutation = groupable.group()
        else:
            self.keys = cast(Union[pdarray, Strings, Categorical], keys)
            self.nkeys = len(keys)
            self.size = cast(int, keys[0].size)  # type: ignore
            if any(k.size != self.size for k in keys):
                raise ValueError("Key arrays must all be same size")
            if not assume_sorted:
                key_arrays = cast(Sequence[pdarray], keys)
                self.permutation = cast(pdarray, coargsort(key_arrays))

        if assume_sorted:
            # Already-ordered keys need no rearranging: identity permutation.
            self.permutation = cast(pdarray, arange(self.size))

        self.find_segments()
예제 #11
0
 def __init__(self, keys, assume_sorted=False, hash_strings=True):
     """
     Store the keys, compute the grouping permutation and find segments.

     Parameters
     ----------
     keys
         A single pdarray, an object exposing ``group()`` (Strings or
         Categorical), or a sequence of equally sized key arrays.
     assume_sorted : bool
         When True the keys are taken to be ordered already and the
         identity permutation ``arange(size)`` is used instead of sorting.
     hash_strings : bool
         Stored on the instance; not otherwise used in this method.

     Raises
     ------
     ValueError
         Raised if the key arrays of a multi-key grouping differ in size,
         or if per-locale grouping is requested for Strings/Categorical keys
     """
     self.assume_sorted = assume_sorted
     self.hash_strings = hash_strings
     # Per-locale grouping is switched off unconditionally here, so the
     # per-locale branches below are currently never taken.
     self.per_locale = False
     self.keys = keys

     is_pdarray = isinstance(keys, pdarray)
     # Strings and Categorical are recognised by their ``group`` method.
     is_single_key = is_pdarray or hasattr(keys, "group")

     # Step 1: record how many keys there are and how long they are.
     if is_single_key:
         self.nkeys = 1
         self.size = keys.size
     else:
         self.nkeys = len(keys)
         self.size = keys[0].size
         if any(k.size != self.size for k in keys):
             raise ValueError("Key arrays must all be same size")

     # Step 2: pick the permutation that brings equal keys together.
     if assume_sorted:
         self.permutation = arange(self.size)
     elif is_pdarray:
         sorter = local_argsort if self.per_locale else argsort
         self.permutation = sorter(keys)
     elif is_single_key:
         if self.per_locale:
             raise ValueError("per-locale groupby not supported on Strings or Categorical")
         self.permutation = keys.group()
     else:
         self.permutation = coargsort(keys)

     self.find_segments()