Exemple #1
0
def zeros(size: int, dtype: type = np.float64) -> pdarray:
    """
    Build a server-side pdarray in which every element is zero.

    Parameters
    ----------
    size : int
        Number of elements (only rank-1 arrays are supported)
    dtype : {float64, int64, bool}
        Element type of the new array; float64 when omitted

    Returns
    -------
    pdarray
        Array of zeros with the requested size and dtype

    Raises
    ------
    TypeError
        Raised if size is not a scalar (as judged by ``np.isscalar``) or
        if the normalized dtype name is not one of the supported
        numeric dtypes.

    See Also
    --------
    ones, zeros_like

    Examples
    --------
    >>> ak.zeros(5, dtype=ak.int64)
    array([0, 0, 0, 0, 0])

    >>> ak.zeros(5, dtype=ak.float64)
    array([0, 0, 0, 0, 0])

    >>> ak.zeros(5, dtype=ak.bool)
    array([False, False, False, False, False])
    """
    if not np.isscalar(size):
        offending = size.__class__.__name__
        raise TypeError("size must be a scalar, not {}".format(offending))
    # Normalize whatever the caller passed into the library's dtype object
    normalized = akdtype(dtype)
    dtype_name = cast(np.dtype, normalized).name
    if dtype_name not in numericDTypes:
        raise TypeError("unsupported dtype {}".format(normalized))
    request = "create {} {}".format(dtype_name, size)
    return create_pdarray(generic_msg(request))
Exemple #2
0
def ones(size: int, dtype: type = float64) -> pdarray:
    """
    Create a pdarray filled with ones.

    The array is created on the server and then filled with the value 1
    through ``pdarray.fill``.

    Parameters
    ----------
    size : int
        Size of the array (only rank-1 arrays supported)
    dtype : {float64, int64, bool}
        Resulting array type, default float64

    Returns
    -------
    pdarray
        Ones of the requested size and dtype

    Raises
    ------
    TypeError
        Raised if size is not a scalar (as judged by ``np.isscalar``) or
        if the supplied dtype is not supported.

    See Also
    --------
    zeros, ones_like

    Examples
    --------
    >>> ak.ones(5, dtype=ak.int64)
    array([1, 1, 1, 1, 1])
    >>> ak.ones(5, dtype=ak.float64)
    array([1, 1, 1, 1, 1])
    >>> ak.ones(5, dtype=ak.bool)
    array([True, True, True, True, True])
    """
    if not np.isscalar(size):
        raise TypeError("size must be a scalar, not {}".format(
            size.__class__.__name__))
    dtype = akdtype(dtype)  # normalize dtype
    # Reject anything outside the supported numeric dtypes up front
    if dtype.name not in numericDTypes:
        raise TypeError("unsupported dtype {}".format(dtype))
    # NOTE: the former `kind, itemsize = translate_np_dtype(dtype)` was
    # dropped: neither value was ever used and the dtype is already
    # validated above, matching the flow of zeros().
    repMsg = generic_msg("create {} {}".format(dtype.name, size))
    a = create_pdarray(repMsg)
    a.fill(1)
    return a
Exemple #3
0
    def stick(self, other, delimiter="", toLeft=False):
        """
        Concatenate, element by element, the strings of another array
        onto one end of this array's strings, with an optional delimiter
        placed between the two pieces.

        Parameters
        ----------
        other : Strings
            The strings to join onto self's strings
        delimiter : str
            String inserted between self and other; a ``bytes`` value is
            decoded to ``str`` first
        toLeft : bool
            If true, join other strings to the left of self. By default,
            other is joined to the right of self.

        Returns
        -------
        Strings
            The array of joined strings

        Raises
        ------
        TypeError
            Raised if other is not a Strings object or if delimiter is
            neither str nor bytes

        See Also
        --------
        lstick, peel, rpeel

        Examples
        --------
        >>> s = ak.array(['a', 'c', 'e'])
        >>> t = ak.array(['b', 'd', 'f'])
        >>> s.stick(t, delimiter='.')
        array(['a.b', 'c.d', 'e.f'])
        """
        if not isinstance(other, Strings):
            other_type = type(other)
            raise TypeError(
                "stick: not supported between Strings and {}".format(
                    other_type))
        if isinstance(delimiter, bytes):
            delimiter = delimiter.decode()
        if not isinstance(delimiter, str):
            delim_type = type(delimiter)
            raise TypeError(
                "Delimiter must be a string, not {}".format(delim_type))
        # The delimiter travels as a one-element JSON list (it may be empty
        # or contain spaces, which the space-separated message could not
        # carry verbatim).
        fields = (
            "stick",
            self.objtype, self.offsets.name, self.bytes.name,
            other.objtype, other.offsets.name, other.bytes.name,
            NUMBER_FORMAT_STRINGS['bool'].format(toLeft),
            json.dumps([delimiter]),
        )
        request = "segmentedBinopvv {} {} {} {} {} {} {} {} {}".format(
            *fields)
        reply = generic_msg(request)
        return Strings(*reply.split('+'))
Exemple #4
0
def histogram(pda, bins=10):
    """
    Count how many values of an array fall into each of a number of
    evenly spaced bins spanning the array's range.

    Parameters
    ----------
    pda : pdarray
        The values to histogram

    bins : int
        The number of equal-size bins to use (default: 10)

    Returns
    -------
    pdarray
        The number of values present in each bin

    Raises
    ------
    TypeError
        Raised if pda is not a pdarray or bins is not an int

    See Also
    --------
    value_counts

    Notes
    -----
    The bins are evenly spaced in the interval [pda.min(), pda.max()].
    Currently, the user must re-compute the bin edges, e.g. with
    np.linspace (see below) in order to plot the histogram.

    Examples
    --------
    >>> A = ak.arange(0, 10, 1)
    >>> nbins = 3
    >>> h = ak.histogram(A, bins=nbins)
    >>> h
    array([3, 3, 4])
    # Recreate the bin edges in NumPy
    >>> binEdges = np.linspace(A.min(), A.max(), nbins+1)
    >>> binEdges
    array([0., 3., 6., 9.])
    # To plot, use only the left edges, and export the histogram to NumPy
    >>> plt.plot(binEdges[:-1], h.to_ndarray())
    """
    # Guard clause: reject bad argument types before contacting the server
    if not (isinstance(pda, pdarray) and isinstance(bins, int)):
        raise TypeError("must be pdarray {} and bins must be an int {}".format(
            pda, bins))
    request = "histogram {} {}".format(pda.name, bins)
    return create_pdarray(generic_msg(request))
Exemple #5
0
def suffix_array(strings: Strings) -> SArrays:
    """
    Return the suffix arrays of the given strings. Each suffix array has
    the same size/shape as the string it was computed from.

    A simple example: for the string "banana$" the suffixes are

        s[0]="banana$"
        s[1]="anana$"
        s[2]="nana$"
        s[3]="ana$"
        s[4]="na$"
        s[5]="a$"
        s[6]="$"

    The suffix array of "banana$" is the array of the indices of those
    suffixes once they are sorted:

        s[6]="$"
        s[5]="a$"
        s[3]="ana$"
        s[1]="anana$"
        s[0]="banana$"
        s[4]="na$"
        s[2]="nana$"

    so sa=[6,5,3,1,0,4,2]

    Parameters
    ----------
    strings : Strings
        The strings whose suffix arrays are requested

    Returns
    -------
    SArrays
        The suffix arrays of the given strings

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error in executing the request
        or in creating the object encapsulating the return message
    """
    request = "segmentedSuffixAry {} {} {}".format(
        strings.objtype, strings.offsets.name, strings.bytes.name)
    # The reply is split on '+' and each part becomes one positional
    # constructor argument of SArrays.
    reply_parts = generic_msg(request).split('+')
    return SArrays(*reply_parts)
def random_strings_uniform(minlen: int,
                           maxlen: int,
                           size: int,
                           characters: str = 'uppercase',
                           seed: Union[None, int] = None) -> Strings:
    """
    Produce random strings whose lengths are uniformly distributed
    between minlen and maxlen and whose characters come from a chosen
    character set.

    Parameters
    ----------
    minlen : int
        The minimum allowed length of string
    maxlen : int
        The maximum allowed length of string
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from
    seed : int, optional
        Forwarded verbatim as the last field of the server request
        (``None`` when omitted)

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        Raised if minlen < 0, maxlen < minlen, or size < 0

    See Also
    --------
    random_strings_lognormal, randint
    """
    bad_arguments = minlen < 0 or maxlen < minlen or size < 0
    if bad_arguments:
        raise ValueError(
            "Incompatible arguments: minlen < 0, maxlen < minlen, "
            "or size < 0")
    int_fmt = NUMBER_FORMAT_STRINGS['int64']
    fields = (
        int_fmt.format(size),
        "uniform",
        characters,
        int_fmt.format(minlen),
        int_fmt.format(maxlen),
        seed,
    )
    request = "randomStrings {} {} {} {} {} {}".format(*fields)
    reply = cast(str, generic_msg(request))
    return Strings(*reply.split('+'))
Exemple #7
0
def unique(pda, return_counts=False):
    """
    Find the unique elements of an array.

    Returns the sorted unique elements of an array. Optionally also
    returns how many times each unique value occurs in the input.

    Parameters
    ----------
    pda : pdarray
        Input array.
    return_counts : bool, optional
        If True, also return the number of times each unique item appears
        in `pda`.

    Returns
    -------
    unique : pdarray
        The sorted unique values.
    unique_counts : pdarray, optional
        The number of times each of the unique values comes up in the
        original array. Only provided if `return_counts` is True.

    Raises
    ------
    TypeError
        Raised if `pda` is not a pdarray

    Notes
    -----
    Internally, this function checks to see whether `pda` is sorted and,
    if so, whether it is already unique. This step can save considerable
    computation. Otherwise, this function will sort `pda`.

    Examples
    --------
    >>> A = ak.array([3, 2, 1, 1, 2, 3])
    >>> ak.unique(A)
    array([1, 2, 3])
    """
    if not isinstance(pda, pdarray):
        raise TypeError("must be pdarray {}".format(pda))
    reply = generic_msg("unique {} {}".format(pda.name, return_counts))
    if not return_counts:
        return create_pdarray(reply)
    # With counts requested the reply carries two '+'-separated array
    # descriptors: the values first, then the counts.
    parts = reply.split("+")
    if verbose: print(parts)
    return create_pdarray(parts[0]), create_pdarray(parts[1])
Exemple #8
0
def linspace(start: Union[float, int], stop: Union[float, int],
             length: int) -> pdarray:
    """
    Create a pdarray of linearly-spaced floats in a closed interval.

    Parameters
    ----------
    start : int or float
        Start of interval (inclusive)
    stop : int or float
        End of interval (inclusive)
    length : int
        Number of points

    Returns
    -------
    pdarray, float64
        Array of evenly spaced float values along the interval

    Raises
    ------
    TypeError
        Raised if start or stop is not a float or int or if length is
        not an int

    See Also
    --------
    arange

    Notes
    -----
    If start is greater than stop, the pdarray values are generated
    in descending order.

    Examples
    --------
    >>> ak.linspace(0, 1, 5)
    array([0, 0.25, 0.5, 0.75, 1])

    >>> ak.linspace(start=1, stop=0, length=5)
    array([1, 0.75, 0.5, 0.25, 0])

    >>> ak.linspace(start=-5, stop=0, length=5)
    array([-5, -3.75, -2.5, -1.25, 0])
    """
    # Function-scope import keeps the block self-contained. NumPy scalar
    # types register with these ABCs, so np.int64 / np.float64 are accepted.
    from numbers import Integral, Real

    # Enforce the documented TypeError contract before building the
    # request; previously any object was formatted into the message.
    if not isinstance(start, Real) or not isinstance(stop, Real):
        raise TypeError(
            "start and stop must be int or float, not {} and {}".format(
                start.__class__.__name__, stop.__class__.__name__))
    if not isinstance(length, Integral):
        raise TypeError("length must be an int, not {}".format(
            length.__class__.__name__))
    repMsg = generic_msg("linspace {} {} {}".format(start, stop, length))
    return create_pdarray(repMsg)
Exemple #9
0
def sort(pda: pdarray) -> pdarray:
    """
    Return a sorted copy of the array. Only sorts numeric arrays;
    for Strings, use argsort.

    Parameters
    ----------
    pda : pdarray
        The array to sort (int64 or float64)

    Returns
    -------
    pdarray, int64 or float64
        The sorted copy of pda. An empty input yields an empty array of
        the same dtype as pda.

    Raises
    ------
    TypeError
        Raised if the parameter is not a pdarray
    ValueError
        Raised if sort attempted on a non-empty pdarray with an
        unsupported dtype such as bool

    See Also
    --------
    argsort

    Notes
    -----
    Uses a least-significant-digit radix sort, which is stable and resilient
    to non-uniformity in data but communication intensive.

    Examples
    --------
    >>> a = ak.randint(0, 10, 10)
    >>> sorted_a = ak.sort(a)
    >>> sorted_a
    array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9])
    """
    if pda.size == 0:
        # Nothing to sort. Preserve the input dtype: this used to return
        # an int64 array even when pda was float64.
        return zeros(0, dtype=pda.dtype)
    if pda.dtype not in numeric_dtypes:
        raise ValueError("ak.sort supports float64 or int64, not {}".format(
            pda.dtype))
    repMsg = generic_msg(cmd="sort", args="{}".format(pda.name))
    return create_pdarray(cast(str, repMsg))
Exemple #10
0
    def flatten(self,
                delimiter: str,
                return_segments: bool = False) -> Union[Strings, Tuple]:
        """Split delimiter-joined strings and return all pieces as one
        flat array.

        Parameters
        ----------
        delimiter : str
            Characters used to split strings into substrings
        return_segments : bool
            If True, also return mapping of original strings to first
            substring in return array.

        Returns
        -------
        Strings
            Flattened substrings with delimiters removed
        pdarray, int64 (optional)
            For each original string, the index of first corresponding
            substring in the return array

        See Also
        --------
        peel, rpeel

        Examples
        --------
        >>> orig = ak.array(['one|two', 'three|four|five', 'six'])
        >>> orig.flatten('|')
        array(['one', 'two', 'three', 'four', 'five', 'six'])
        >>> flat, map = orig.flatten('|', return_segments=True)
        >>> map
        array([0, 2, 5])
        """
        # The delimiter is shipped as a one-element JSON list.
        request = "segmentedFlatten {}+{} {} {} {}".format(
            self.offsets.name, self.bytes.name, self.objtype,
            return_segments, json.dumps([delimiter]))
        reply = cast(str, generic_msg(request))
        # Two descriptors build the Strings result; a third one, present
        # only when segments were requested, describes the segment array.
        pieces = reply.split('+', maxsplit=2 if return_segments else 1)
        flat = Strings(pieces[0], pieces[1])
        if return_segments:
            return flat, create_pdarray(pieces[2])
        return flat
Exemple #11
0
    def _binop(self, other: Union[SArrays, np.int_], op: str) -> pdarray:
        """
        Run the requested binary operation between this SArrays instance
        and either another segmented array or an integer scalar, and wrap
        the server's answer in a pdarray.

        Parameters
        ----------
        other : SArrays or int scalar
            the right-hand operand
        op : str
            name of the binary operation to be performed

        Returns
        -------
        pdarray
            encapsulating the results of the requested binop

        Raises
        ------
        ValueError
            Raised if (1) the op is not in the self.BinOps set, or (2) if
            the sizes of this and the other instance don't match, or
            (3) the other object is neither a supported array type nor
            an int scalar
        RuntimeError
            Raised if a server-side error is thrown while executing the
            binary operation
        """
        if op not in self.BinOps:
            raise ValueError("SArrays: unsupported operator: {}".format(op))
        # NOTE(review): the array branch tests for Strings although the
        # signature and docs talk about SArrays - confirm this is intended.
        if isinstance(other, Strings):
            if self.size != other.size:
                raise ValueError("SArrays: size mismatch {} {}".format(
                    self.size, other.size))
            fields = (op,
                      self.objtype, self.offsets.name, self.bytes.name,
                      other.objtype, other.offsets.name, other.bytes.name)
            request = "segmentedBinopvvInt {} {} {} {} {} {} {}".format(
                *fields)
        elif resolve_scalar_dtype(other) == 'int':
            # Scalar operand is sent as a one-element JSON list.
            fields = (op,
                      self.objtype, self.offsets.name, self.bytes.name,
                      self.objtype, json.dumps([other]))
            request = "segmentedBinopvsInt {} {} {} {} {} {}".format(*fields)
        else:
            raise ValueError(
                "SArrays: {} not supported between SArrays and {}".format(
                    op, other.__class__.__name__))
        reply = generic_msg(request)
        return create_pdarray(cast(str, reply))
Exemple #12
0
def argmaxk(pda, k):
    """
    Find the indices of the `k` maximum values of an array.

    Parameters
    ----------
    pda : pdarray
        Input array.
    k : integer
        The desired count of maximum values to be returned by the output.

    Returns
    -------
    pdarray, int
        The indices of the maximum `k` values from pda. When `k` is 0 an
        empty Python list is returned instead.

    Raises
    ------
    TypeError
        Raised if `pda` is not a pdarray, or if it is empty while `k` is
        non-zero

    Notes
    -----
    Judging by the example below, this call is equivalent in value to:

        ak.argsort(a)[-k:]

    and generally outperforms this operation.

    This reduction will see a significant drop in performance as `k` grows
    beyond a certain value. This value is system dependent, but generally
    about a `k` of 5 million is where performance degradation has been
    observed.

    Examples
    --------
    >>> A = ak.array([10,5,1,3,7,2,9,0])
    >>> ak.argmaxk(A, 3)
    array([4, 6, 0])
    """
    if not isinstance(pda, pdarray):
        raise TypeError("must be pdarray {}".format(pda))
    if k == 0:
        return []
    if pda.size == 0:
        raise TypeError("must be a non-empty pdarray {} of type int or float".format(pda))
    # The trailing True is sent as the last field of the "maxk" request;
    # presumably it selects indices rather than values - confirm server-side.
    request = "maxk {} {} {}".format(pda.name, k, True)
    return create_pdarray(generic_msg(request))
Exemple #13
0
    def get_lengths(self) -> pdarray:
        """
        Compute, for every string in the array, its length.

        Returns
        -------
        pdarray, int
            The length of each string

        Raises
        ------
        RuntimeError
            Raised if there is a server-side error thrown
        """
        request = "segmentLengths {} {} {}".format(
            self.objtype, self.offsets.name, self.bytes.name)
        reply = generic_msg(request)
        return create_pdarray(reply)
Exemple #14
0
def setdiff1d(pda1, pda2, assume_unique=False):
    """
    Find the set difference of two arrays.

    Return the sorted, unique values in `pda1` that are not in `pda2`.

    Parameters
    ----------
    pda1 : pdarray
        Input array.
    pda2 : pdarray
        Input comparison array.
    assume_unique : bool
        If True, the input arrays are both assumed to be unique, which
        can speed up the calculation.  Default is False.

    Returns
    -------
    pdarray
        Sorted 1D array of values in `pda1` that are not in `pda2`.
        If either input is empty, `pda1` itself is returned unchanged.

    Raises
    ------
    TypeError
        Raised if either argument is not a pdarray

    See Also
    --------
    unique, setxor1d

    Examples
    --------
    >>> a = ak.array([1, 2, 3, 2, 4, 1])
    >>> b = ak.array([3, 4, 5, 6])
    >>> ak.setdiff1d(a, b)
    array([1, 2])
    """
    if not (isinstance(pda1, pdarray) and isinstance(pda2, pdarray)):
        raise TypeError("must be pdarray {} or {}".format(pda1,pda2))
    # Empty pda1: nothing to keep. Empty pda2: nothing to remove.
    # Either way the original pda1 is handed back.
    if pda1.size == 0 or pda2.size == 0:
        return pda1
    if pda1.dtype == int and pda2.dtype == int:
        # Integer inputs are handled by a dedicated server command
        request = "setdiff1d {} {} {}".format(
            pda1.name, pda2.name, assume_unique)
        return create_pdarray(generic_msg(request))
    # Generic path: de-duplicate unless told not to, then keep the
    # elements of the left side that do not occur on the right.
    if assume_unique:
        left, right = pda1, pda2
    else:
        left = unique(pda1)
        right = unique(pda2)
    return left[in1d(left, right, invert=True)]
Exemple #15
0
def argsort(pda: Union[pdarray, Strings, 'Categorical']) -> pdarray:
    """
    Compute the permutation that puts the array in sorted order.

    Parameters
    ----------
    pda : pdarray or Strings or Categorical
        The array to sort (int64 or float64)

    Returns
    -------
    pdarray, int64
        The indices such that ``pda[indices]`` is sorted

    Raises
    ------
    TypeError
        Raised if the parameter is other than a pdarray or Strings

    See Also
    --------
    coargsort

    Notes
    -----
    Uses a least-significant-digit radix sort, which is stable and
    resilient to non-uniformity in data but communication intensive.

    Any object exposing its own ``argsort`` method is sorted by
    delegating to that method.

    Examples
    --------
    >>> a = ak.randint(0, 10, 10)
    >>> perm = ak.argsort(a)
    >>> a[perm]
    array([0, 1, 1, 3, 4, 5, 7, 8, 8, 9])
    """
    if hasattr(pda, "argsort"):
        return pda.argsort()
    if pda.size == 0:
        # The permutation of an empty array is an empty int64 array
        return zeros(0, dtype=int64)
    # Strings are identified by their offsets and bytes entries joined
    # with '+'; every other array by its single name.
    name = ('{}+{}'.format(pda.offsets.name, pda.bytes.name)
            if isinstance(pda, Strings) else pda.name)
    request = "argsort {} {}".format(pda.objtype, name)
    return create_pdarray(generic_msg(request))
Exemple #16
0
 def split(self, maxsplit: int = 0, return_segments: bool = False):
     """
     Split each string at the occurrences of this object's pattern.

     Parameters
     ----------
     maxsplit : int
         If nonzero, at most maxsplit splits occur
     return_segments : bool
         If True, a pdarray built from the third part of the server
         reply is returned alongside the Strings result

     Returns
     -------
     Strings, or (Strings, pdarray) when return_segments is True
     """
     # Imported here rather than at module level, as in the original
     # (presumably to avoid a circular import - confirm).
     from arkouda.strings import Strings
     request_args = "{} {} {} {} {} {}".format(
         self.objtype,
         self.parent_entry_name,
         "legacy_placeholder",
         maxsplit,
         return_segments,
         json.dumps([self.pattern]))
     reply = cast(str, generic_msg(cmd="segmentedSplit", args=request_args))
     if not return_segments:
         return Strings.from_return_msg(reply)
     # First two '+'-separated parts describe the Strings; the rest is the
     # segments array descriptor.
     parts = reply.split('+', maxsplit=2)
     strings_msg = "+".join(parts[0:2])
     return Strings.from_return_msg(strings_msg), create_pdarray(parts[2])
Exemple #17
0
def standard_normal(size: Union[int, np.int64],
                    seed: Union[None, Union[int, np.int64]] = None) -> pdarray:
    """
    Draw real numbers from the standard normal distribution.

    Parameters
    ----------
    size : Union[int,np.int64]
        The number of samples to draw (size of the returned array)
    seed : Union[int,np.int64]
        Value used to initialize the random number generator

    Returns
    -------
    pdarray, float64
        The array of random numbers

    Raises
    ------
    TypeError
        Raised if size is not an int
    ValueError
        Raised if size < 0

    See Also
    --------
    randint

    Notes
    -----
    For random samples from :math:`N(\\mu, \\sigma^2)`, use:

    ``(sigma * standard_normal(size)) + mu``

    Examples
    --------
    >>> ak.standard_normal(3,1)
    array([-0.68586185091150265, 1.1723810583573375, 0.567584107142031])
    """
    if size < 0:
        # A size of 0 is accepted, so the message states ">= 0"
        # (it used to claim "> 0", contradicting the check).
        raise ValueError("The size parameter must be >= 0")
    msg = "randomNormal {} {}".format(
        NUMBER_FORMAT_STRINGS['int64'].format(size), seed)
    return create_pdarray(generic_msg(msg))
Exemple #18
0
def suffix_array(filename: str) -> SArrays:
    """
    Return the suffix array of the content of the named file, treated
    as one string. Mainly used for testing correctness and performance.

    A simple example: for the string "banana$" the suffixes are

        s[0]="banana$"
        s[1]="anana$"
        s[2]="nana$"
        s[3]="ana$"
        s[4]="na$"
        s[5]="a$"
        s[6]="$"

    The suffix array of "banana$" is the array of the indices of those
    suffixes once they are sorted:

        s[6]="$"
        s[5]="a$"
        s[3]="ana$"
        s[1]="anana$"
        s[0]="banana$"
        s[4]="na$"
        s[2]="nana$"

    so sa=[6,5,3,1,0,4,2]

    Parameters
    ----------
    filename : str
        Name of the file, sent verbatim to the server

    Returns
    -------
    SArrays
        The suffix array of the file's content

    Raises
    ------
    RuntimeError
        Raised if there is a server-side error in executing the request
        or in creating the object encapsulating the return message
    """
    request = "segmentedSAFile {}".format(filename)
    reply_parts = generic_msg(request).split('+')
    return SArrays(*reply_parts)
Exemple #19
0
def attach_pda(user_defined_name: str) -> pdarray:
    """
    Obtain a pdarray bound to a name that was previously registered in
    the arkouda server with register_pda().

    Parameters
    ----------
    user_defined_name : str
        user defined name which array was registered under

    Returns
    -------
    pdarray
        pdarray which points to pdarray registered with user defined
        name in the arkouda server

    Raises
    ------
    TypeError
        Raised if user_defined_name is not a str

    See also
    --------
    register_pda, unregister_pda

    Notes
    -----
    Registered names/pdarrays in the server are immune to deletion
    until they are unregistered.

    Examples
    --------
    >>> a = zeros(100)
    >>> r_pda = ak.register_pda(a, "my_zeros")
    >>> # potentially disconnect from server and reconnect to server
    >>> b = ak.attach_pda("my_zeros")
    >>> # ...other work...
    >>> ak.unregister_pda(b)
    """
    name_is_str = isinstance(user_defined_name, str)
    if not name_is_str:
        raise TypeError("user_defined_name must be a str")

    request = "attach {}".format(user_defined_name)
    return create_pdarray(generic_msg(request))
Exemple #20
0
    def aggregate(self, values, operator):
        """
        Group another array of values with the permutation stored in this
        GroupBy instance and reduce each group's values with the given
        operator.

        Parameters
        ----------
        values : pdarray
            The values to group and reduce
        operator: str
            The name of the reduction operator to use

        Returns
        -------
        unique_keys : (list of) pdarray or Strings
            The unique keys, in grouped order
        aggregates : pdarray
            One aggregate value per unique key in the GroupBy instance

        Raises
        ------
        TypeError
            Raised if values is not a pdarray
        ValueError
            Raised if values differs in size from the GroupBy keys or if
            the operator is not in self.Reductions
        """
        if not isinstance(values, pdarray):
            raise TypeError("<values> must be a pdarray")
        if values.size != self.size:
            raise ValueError("Attempt to group array using key array of different length")
        if operator not in self.Reductions:
            raise ValueError("Unsupported reduction: {}\nMust be one of {}".format(operator, self.Reductions))
        # When the keys were declared pre-sorted the values are already in
        # grouped order; otherwise apply the stored permutation first.
        grouped = values if self.assume_sorted else values[self.permutation]
        command = "segmentedLocalRdx" if self.per_locale else "segmentedReduction"
        request = "{} {} {} {}".format(
            command, grouped.name, self.segments.name, operator)
        reply = generic_msg(request)
        if verbose: print(reply)
        if operator.startswith('arg'):
            # 'arg*' reductions yield positions in the grouped values;
            # index the permutation with them before returning.
            return self.unique_keys, self.permutation[create_pdarray(reply)]
        return self.unique_keys, create_pdarray(reply)
Exemple #21
0
def union1d(pda1: pdarray, pda2: pdarray) -> pdarray:
    """
    Find the union of two arrays.

    Return the unique, sorted array of values that are in either
    of the two input arrays.

    Parameters
    ----------
    pda1 : pdarray
        Input array
    pda2 : pdarray
        Input array

    Returns
    -------
    pdarray
        Unique, sorted union of the input arrays. If one input is
        empty, the other input is returned as-is.

    Raises
    ------
    TypeError
        Raised if either pda1 or pda2 is not a pdarray

    See Also
    --------
    intersect1d, unique

    Examples
    --------
    >>> ak.union1d(ak.array([-1, 0, 1]), ak.array([-2, 0, 2]))
    array([-2, -1,  0,  1,  2])
    """
    # An empty side contributes nothing: the union is the other side
    if pda1.size == 0:
        return pda2
    if pda2.size == 0:
        return pda1
    both_int = pda1.dtype == int and pda2.dtype == int
    if both_int:
        # Integer inputs are handled by a dedicated server command
        request = "union1d {} {}".format(pda1.name, pda2.name)
        return create_pdarray(generic_msg(request))
    # Generic path: de-duplicate each side, concatenate, de-duplicate again
    combined = concatenate((unique(pda1), unique(pda2)))
    return unique(combined)
Exemple #22
0
def abs(pda: pdarray) -> pdarray:
    """
    Compute the absolute value of every element of the array.

    Parameters
    ----------
    pda : pdarray

    Returns
    -------
    pdarray
        A pdarray containing absolute values of the input array elements

    Raises
    ------
    TypeError
        Raised if the parameter is not a pdarray
    """
    request = "efunc {} {}".format("abs", pda.name)
    reply = generic_msg(request)
    return create_pdarray(reply)
Exemple #23
0
 def find_segments(self):
     """
     Ask the server for the group segments and the unique-key indices
     and store the results on this instance.

     Sets ``self.segments``, ``self.unique_key_indices`` and
     ``self.unique_keys`` (a single indexed key array when ``nkeys`` is 1,
     otherwise a list with one indexed array per key).
     """
     command = "findLocalSegments" if self.per_locale else "findSegments"
     single_key = self.nkeys == 1
     if single_key:
         key_names = self.keys.name
     else:
         key_names = ' '.join([key.name for key in self.keys])
     request = "{} {} {:n} {:n} {}".format(
         command, self.permutation.name, self.nkeys, self.size, key_names)
     # The reply holds exactly two '+'-separated descriptors:
     # segments first, then unique-key indices.
     seg_attr, uniq_attr = generic_msg(request).split("+")
     if verbose: print(seg_attr, uniq_attr)
     self.segments = create_pdarray(seg_attr)
     self.unique_key_indices = create_pdarray(uniq_attr)
     if single_key:
         self.unique_keys = self.keys[self.unique_key_indices]
     else:
         self.unique_keys = [key[self.unique_key_indices]
                             for key in self.keys]
0
    def group(self):
        """
        Compute the permutation that groups the array so that equal
        strings sit next to each other. This permutation does NOT sort
        the strings: every occurrence of a given string lands in one
        contiguous block of the permuted array, but the blocks themselves
        are not necessarily ordered.

        Returns
        -------
        pdarray
            The permutation that groups the array by value

        See Also
        --------
        GroupBy, unique
        """
        request = "segmentedGroup {} {} {}".format(
            self.objtype, self.offsets.name, self.bytes.name)
        return create_pdarray(generic_msg(request))
Exemple #25
0
 def binop(self, other, op):
     """
     Run a binary operation between this Strings object and either
     another Strings object or a scalar string, returning the server's
     result as a pdarray.

     Parameters
     ----------
     other : Strings or str scalar
         Right-hand operand
     op : str
         Name of the operator; must be a member of self.BinOps

     Returns
     -------
     pdarray
         Built from the server reply

     Raises
     ------
     ValueError
         Raised if op is unsupported, if two Strings differ in size, or
         if other is neither a Strings object nor a str scalar
     """
     if op not in self.BinOps:
         raise ValueError("Strings: unsupported operator: {}".format(op))
     if isinstance(other, Strings):
         if self.size != other.size:
             raise ValueError("Strings: size mismatch {} {}".format(
                 self.size, other.size))
         fields = (op,
                   self.objtype, self.offsets.name, self.bytes.name,
                   other.objtype, other.offsets.name, other.bytes.name)
         request = "segmentedBinopvv {} {} {} {} {} {} {}".format(*fields)
     elif resolve_scalar_dtype(other) == 'str':
         # Scalar operand is appended to the message as-is
         fields = (op,
                   self.objtype, self.offsets.name, self.bytes.name,
                   self.objtype, other)
         request = "segmentedBinopvs {} {} {} {} {} {}".format(*fields)
     else:
         raise ValueError(
             "Strings: {} not supported between Strings and {}".format(
                 op, type(other)))
     return create_pdarray(generic_msg(request))
Exemple #26
0
def in1d(pda1, pda2, invert=False):
    """
    Test whether each element of a 1-D array is also present in a second
    array.

    Returns a boolean array the same length as `pda1` that is True
    where an element of `pda1` is in `pda2` and False otherwise.

    Parameters
    ----------
    pda1 : pdarray
        Input array.
    pda2 : pdarray
        The values against which to test each value of `pda1`.
    invert : bool, optional
        If True, the values in the returned array are inverted (that is,
        False where an element of `pda1` is in `pda2` and True otherwise).
        Default is False. ``ak.in1d(a, b, invert=True)`` is equivalent
        to (but is faster than) ``~ak.in1d(a, b)``.

    Returns
    -------
    pdarray, bool
        The values `pda1[in1d]` are in `pda2`.

    Raises
    ------
    TypeError
        Raised if either argument is not a pdarray

    See Also
    --------
    unique, intersect1d, union1d

    Notes
    -----
    `in1d` can be considered as an element-wise function version of the
    python keyword `in`, for 1-D sequences. ``in1d(a, b)`` is logically
    equivalent to ``ak.array([item in b for item in a])``, but is much
    faster and scales to arbitrarily large ``a``.
    """
    if not (isinstance(pda1, pdarray) and isinstance(pda2, pdarray)):
        raise TypeError("must be pdarray {} or {}".format(pda1, pda2))
    request = "in1d {} {} {}".format(pda1.name, pda2.name, invert)
    return create_pdarray(generic_msg(request))
Exemple #27
0
def random_strings_lognormal(logmean, logstd, size, characters='uppercase'):
    r"""
    Generate random strings with log-normally distributed lengths and
    with characters drawn from a specified set.

    Parameters
    ----------
    logmean : float
        The log-mean of the length distribution
    logstd : float
        The log-standard-deviation of the length distribution
    size : int
        The number of strings to generate
    characters : (uppercase, lowercase, numeric, printable, binary)
        The set of characters to draw from

    Returns
    -------
    Strings
        The array of random strings

    Raises
    ------
    ValueError
        Raised if logstd <= 0 or size < 0

    See Also
    --------
    random_strings_uniform, randint

    Notes
    -----
    The lengths of the generated strings are distributed
    :math:`Lognormal(\mu, \sigma^2)`, with :math:`\mu = logmean` and
    :math:`\sigma = logstd`. Thus, the strings will have an average length
    of :math:`exp(\mu + 0.5*\sigma^2)`, a minimum length of zero, and
    a heavy tail towards longer strings.
    """
    # The docstring is a raw string so the LaTeX backslashes (\mu, \sigma)
    # are not parsed as (invalid) escape sequences.
    if logstd <= 0 or size < 0:
        raise ValueError("Incompatible arguments")
    msg = "randomStrings {} {} {} {} {}".format(
        NUMBER_FORMAT_STRINGS['int64'].format(size),
        "lognormal",
        characters,
        NUMBER_FORMAT_STRINGS['float64'].format(logmean),
        NUMBER_FORMAT_STRINGS['float64'].format(logstd))
    repMsg = generic_msg(msg)
    return Strings(*(repMsg.split('+')))
Exemple #28
0
def sin(pda: pdarray) -> pdarray:
    """
    Compute the sine of every element of the array.

    Parameters
    ----------
    pda : pdarray

    Returns
    -------
    pdarray
        A pdarray containing sin for each element
        of the original pdarray

    Raises
    ------
    TypeError
        Raised if the parameter is not a pdarray
    """
    request_args = "{} {}".format("sin", pda.name)
    reply = generic_msg(cmd="efunc", args=request_args)
    return create_pdarray(type_cast(str, reply))
Exemple #29
0
def cos(pda: pdarray) -> pdarray:
    """
    Compute the cosine of every element of the array.

    Parameters
    ----------
    pda : pdarray

    Returns
    -------
    pdarray
        A pdarray containing cosine for each element
        of the original pdarray

    Raises
    ------
    TypeError
        Raised if the parameter is not a pdarray
    """
    request = "efunc {} {}".format("cos", pda.name)
    reply = generic_msg(request)
    return create_pdarray(reply)
Exemple #30
0
    def hash(self):
        """
        Compute a 128-bit hash of each string.

        Returns
        -------
        (pdarray, pdarray)
            A pair of int64 pdarrays. The ith hash value is the
            concatenation of the ith values from each array.

        Notes
        -----
        The implementation uses SipHash128, a fast and balanced hash
        function (used by Python for dictionaries and sets). For realistic
        numbers of strings (up to about 10**15), the probability of a
        collision between two 128-bit hash values is negligible.
        """
        request = "segmentedHash {} {} {}".format(
            self.objtype, self.offsets.name, self.bytes.name)
        # The reply must contain exactly two '+'-separated array descriptors
        first_half, second_half = generic_msg(request).split('+')
        return create_pdarray(first_half), create_pdarray(second_half)