Esempio n. 1
0
    def argsort(self, key, ascending=True):
        """
        Return the permutation that sorts the dataframe by `key`.

        Parameters
        ----------
        key : str
            The key to sort on.
        ascending : bool
            If True (default), return the permutation produced by
            `ak.argsort` on the column. If False, return a permutation
            that puts the column in the reverse order.

        Returns
        -------
        ak.pdarray
            The permutation array that sorts the data on `key`. An empty
            int64 array is returned when the dataframe is empty.
        """
        if self._empty:
            return ak.array([], dtype=ak.int64)

        column = self[key]
        if ascending:
            return ak.argsort(column)

        negatable = (isinstance(column, ak.pdarray)
                     and column.dtype in (ak.int64, ak.float64))
        if negatable:
            # Sorting the negated int64/float64 values reverses the order.
            return ak.argsort(-column)

        # Anything else cannot be negated: sort normally, then read the
        # permutation back to front through a descending index array.
        forward = ak.argsort(column)
        return forward[ak.arange(self.size - 1, -1, -1)]
Esempio n. 2
0
 def argsort(self, ascending=True):
     """
     Return the permutation that sorts `self.index`.

     Parameters
     ----------
     ascending : bool
         If True (default), return `ak.argsort` of the index. If False,
         return a permutation that puts the index in the reverse order.

     Returns
     -------
     The permutation array computed from `self.index`.
     """
     labels = self.index
     if ascending:
         return ak.argsort(labels)

     if isinstance(labels, ak.pdarray) and labels.dtype in (ak.int64,
                                                           ak.float64):
         # Sorting the negated int64/float64 values reverses the order.
         return ak.argsort(-labels)

     # Other index types: sort normally, then reverse the permutation with
     # an explicit descending index array.
     forward = ak.argsort(labels)
     return forward[ak.arange(labels.size - 1, -1, -1)]
Esempio n. 3
0
def time_ak_argsort(N_per_locale, trials, dtype, seed):
    """
    Time `ak.argsort` on a randomly generated array and print the results.

    Parameters
    ----------
    N_per_locale : int
        Number of elements per locale; the array length is this value
        multiplied by the server's ``numLocales`` configuration entry.
    trials : int
        Number of timed repetitions; must be at least 1.
    dtype : str
        One of 'int64', 'float64' or 'str'.
    seed
        Seed forwarded to the random array generators.

    Raises
    ------
    ValueError
        If `dtype` is not one of the supported names or `trials` is less
        than 1.
    AssertionError
        If the argsort result does not sort a numeric array.
    """
    # Validate before contacting the server so that a bad argument fails
    # with a clear message instead of an unbound-variable or division error.
    if dtype not in ('int64', 'float64', 'str'):
        raise ValueError("Unsupported dtype: {!r}".format(dtype))
    if trials < 1:
        raise ValueError("trials must be at least 1, got {!r}".format(trials))

    print(">>> arkouda {} argsort".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
        nbytes = a.size * a.itemsize
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        nbytes = a.size * a.itemsize
    else:  # 'str'
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
        nbytes = a.nbytes * a.entry.itemsize

    timings = []
    for _ in range(trials):
        start = time.time()
        perm = ak.argsort(a)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    # Sortedness is only verified for the numeric dtypes.
    if dtype in ('int64', 'float64'):
        assert ak.is_sorted(a[perm])
    print("Average time = {:.4f} sec".format(tavg))
    bytes_per_sec = nbytes / tavg
    print("Average rate = {:.4f} GiB/sec".format(bytes_per_sec / 2**30))
Esempio n. 4
0
def time_ak_argsort(N_per_locale, trials, dtype, scale_by_locales):
    """
    Time `ak.argsort` on a randomly generated array and print the results.

    Parameters
    ----------
    N_per_locale : int
        Number of elements per locale (or in total, see `scale_by_locales`).
    trials : int
        Number of timed repetitions; must be at least 1.
    dtype : str
        Either 'int64' or 'float64'.
    scale_by_locales : bool
        If true, the array length is `N_per_locale` multiplied by the
        server's ``numLocales`` configuration entry; otherwise it is
        `N_per_locale` itself.

    Raises
    ------
    ValueError
        If `dtype` is not one of the supported names or `trials` is less
        than 1.
    AssertionError
        If the argsort result does not sort the array.
    """
    # Validate before contacting the server so that a bad argument fails
    # with a clear message instead of an unbound-variable or division error.
    if dtype not in ('int64', 'float64'):
        raise ValueError("Unsupported dtype: {!r}".format(dtype))
    if trials < 1:
        raise ValueError("trials must be at least 1, got {!r}".format(trials))

    print(">>> arkouda argsort")
    cfg = ak.get_config()
    if scale_by_locales:
        N = N_per_locale * cfg["numLocales"]
    else:
        N = N_per_locale
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    else:  # 'float64'
        a = ak.randint(0, 1, N, dtype=ak.float64)

    timings = []
    for _ in range(trials):
        start = time.time()
        perm = ak.argsort(a)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    assert ak.is_sorted(a[perm])
    print("Average time = {:.4f} sec".format(tavg))
    bytes_per_sec = (a.size * a.itemsize) / tavg
    print("Average rate = {:.4f} GiB/sec".format(bytes_per_sec/2**30))
Esempio n. 5
0
def check_correctness(dtype, seed):
    """
    Check that `ak.GroupBy` gives the same unique keys and segments after
    the input arrays have been shuffled by a common random permutation.

    The arrays come from ``generate_arrays(1000, 2, dtype, seed)``.

    Raises
    ------
    AssertionError
        If any unique-key array or the segments differ between the
        original and the shuffled grouping.
    """
    arrays, _ = generate_arrays(1000, 2, dtype, seed)
    reference = ak.GroupBy(arrays)
    # Argsorting random integers yields a random permutation of the rows.
    shuffle = ak.argsort(ak.randint(0, 2**32, arrays[0].size))
    shuffled = ak.GroupBy([column[shuffle] for column in arrays])
    for keys, shuffled_keys in zip(reference.unique_keys,
                                   shuffled.unique_keys):
        assert (keys == shuffled_keys).all()
    assert (reference.segments == shuffled.segments).all()
Esempio n. 6
0
def check_correctness(dtype):
    """
    Check that `ak.argsort` sorts a random array of 10**4 elements.

    Parameters
    ----------
    dtype : str
        Either 'int64' or 'float64'.

    Raises
    ------
    ValueError
        If `dtype` is not one of the supported names.
    AssertionError
        If the permuted array is not sorted.
    """
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64)
    else:
        # Without this branch `a` would be unbound below.
        raise ValueError("Unsupported dtype: {!r}".format(dtype))

    perm = ak.argsort(a)
    assert ak.is_sorted(a[perm])
Esempio n. 7
0
    def sort_values(self, ascending=True):
        """ Sort the series by its values

        Parameters
        ----------
        ascending : bool
            If True (default), order by `ak.argsort` of the values;
            otherwise use the reverse order.

        Returns
        -------
        A new Series built from the reordered index and values
        """
        values = self.values
        if ascending:
            order = ak.argsort(values)
        elif isinstance(values, ak.pdarray) and values.dtype in (ak.int64, ak.float64):
            # Numeric values: sorting the negation reverses the order
            order = ak.argsort(-values)
        else:
            # Non-numeric values: index the ascending permutation with a
            # descending arange, because reverse slicing is not supported
            backwards = ak.arange(values.size - 1, -1, -1)
            order = ak.argsort(values)[backwards]
        return Series((self.index[order], self.values[order]))
Esempio n. 8
0
def check_correctness(dtype, seed):
    """
    Run `ak.argsort` on a random array of 10**4 elements and, for numeric
    dtypes, check that the resulting permutation sorts the array.

    Parameters
    ----------
    dtype : str
        One of 'int64', 'float64' or 'str'.
    seed
        Seed forwarded to the random array generators.

    Raises
    ------
    ValueError
        If `dtype` is not one of the supported names.
    AssertionError
        If a numeric array is not sorted after applying the permutation.
    """
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
    elif dtype == 'str':
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
    else:
        # Without this branch `a` would be unbound below.
        raise ValueError("Unsupported dtype: {!r}".format(dtype))

    perm = ak.argsort(a)
    # Sortedness is only verified for the numeric dtypes.
    if dtype in ('int64', 'float64'):
        assert ak.is_sorted(a[perm])
Esempio n. 9
0
def check_argsort(N):
    """
    Compare `ak.argsort` with `np.argsort` on a reversed arange of length N.

    Both libraries sort the same descending sequence through their own
    argsort; the element-wise comparison of the two sorted results is
    reduced with `.all()` and handed to `pass_fail`, whose return value is
    returned unchanged.
    """
    # NumPy reference: reverse 0..N-1, then sort it back via argsort
    np_descending = np.arange(N)[::-1]
    np_sorted = np_descending[np.argsort(np_descending)]
    # arkouda: the same steps on the server-side array
    ak_descending = ak.arange(N)[::-1]
    ak_sorted = ak_descending[ak.argsort(ak_descending)]
    agreement = np_sorted == ak_sorted.to_ndarray()
    return pass_fail(agreement.all())
Esempio n. 10
0
def gen_rmat_edges(lgNv, Ne_per_v, p, perm=False):
    """
    Generate a random edge list one bit at a time and return it sorted.

    Parameters
    ----------
    lgNv : int
        Base-2 logarithm of the number of vertices (Nv = 2**lgNv).
    Ne_per_v : int
        Number of edges per vertex (Ne = Ne_per_v * Nv).
    p : float
        Probability `a`; the remaining probability is split evenly
        between `b`, `c` and `d`.
    perm : bool
        If True, rename the vertices through a random permutation after
        the edges have been sorted.

    Returns
    -------
    tuple
        Pair of arrays (ii, jj) holding the first and second vertex of
        every edge.

    Notes
    -----
    NOTE(review): the loop adds lgNv - 1 bits to ids that start at 1, so
    the generated ids lie in [1, 2**(lgNv - 1)] - confirm that not
    covering all Nv vertices is intended.
    """
    num_vertices = 2**lgNv
    num_edges = Ne_per_v * num_vertices

    # probabilities
    prob_a = p
    prob_b = (1.0 - prob_a) / 3.0
    prob_c = prob_b
    prob_d = prob_b

    # quantities used in the edge generation loop
    ab = prob_a + prob_b
    c_norm = prob_c / (prob_c + prob_d)
    a_norm = prob_a / (prob_a + prob_b)

    # every edge starts at vertex 1 on both ends
    src = ak.ones(num_edges, dtype=ak.int64)
    dst = ak.ones(num_edges, dtype=ak.int64)

    # generate edges: each pass decides one more bit of both endpoints
    for bit in range(lgNv - 1):
        weight = 2**bit
        src_bit = (ak.randint(0, 1, num_edges, dtype=ak.float64) > ab)
        dst_draw = ak.randint(0, 1, num_edges, dtype=ak.float64)
        dst_bit = (dst_draw > (c_norm * src_bit + a_norm * (~src_bit)))
        src = src + (weight * src_bit)
        dst = dst + (weight * dst_bit)

    # sort all edges on both vertices using coargsort
    order = ak.coargsort((src, dst))
    src = src[order]
    dst = dst[order]

    if perm:
        # random permutation supplying the new vertex numbers (names)
        relabel = ak.argsort(ak.randint(0, 1, num_vertices, dtype=ak.float64))
        src = relabel[src]
        dst = relabel[dst]

    # maybe: remove edges which are self-loops???
    return (src, dst)
Esempio n. 11
0
def do_argsort(data, algo):
    """
    Sort `data` with the requested algorithm and return the result.

    A single `ak.pdarray` or `ak.Strings` is passed to `ak.argsort`;
    anything else is passed to `ak.coargsort`. `algo` is forwarded as the
    second positional argument in both cases.
    """
    is_single_array = isinstance(data, (ak.pdarray, ak.Strings))
    sorter = ak.argsort if is_single_array else ak.coargsort
    return sorter(data, algo)
Esempio n. 12
0
    akwords = ak.array(more_words)
    matches = ak.in1d(strings, akwords)
    catmatches = ak.in1d(cat, akwords)
    assert ((matches == catmatches).all())
    # Every word in matches should be in the target set
    for word in strings[matches]:
        assert (word in more_words)
    # Exhaustively find all matches to make sure we didn't miss any
    inds = ak.zeros(strings.size, dtype=ak.bool)
    for word in more_words:
        inds |= (strings == word)
    assert ((inds == matches).all())
    print("in1d and iter passed")

    # argsort
    akperm = ak.argsort(strings)
    aksorted = strings[akperm].to_ndarray()
    npsorted = np.sort(test_strings)
    assert ((aksorted == npsorted).all())
    catperm = ak.argsort(cat)
    catsorted = cat[catperm].to_ndarray()
    assert ((catsorted == npsorted).all())
    print("argsort passed")

    # unique
    akuniq = ak.unique(strings)
    catuniq = ak.unique(cat)
    akset = set(akuniq.to_ndarray())
    catset = set(catuniq.to_ndarray())
    assert (akset == catset)
    # There should be no duplicates
Esempio n. 13
0
def search_intervals(vals, intervals, assume_unique=False):
    """
    Given an array of query vals and non-overlapping, half-open (pythonic)
    intervals, return the index of the interval containing each query value,
    or -1 if not present in any interval.

    Parameters
    ----------
    vals : pdarray(int, float)
        Values to search for in intervals
    intervals : 2-tuple of pdarrays
        Non-overlapping, half-open intervals, as a tuple of
        (lower_bounds_inclusive, upper_bounds_exclusive)
    assume_unique : bool
        If True, assume query vals are unique and skip the GroupBy
        de-duplication step. Default: False.

    Returns
    -------
    idx : pdarray(int64)
        Index of interval containing each query value, or -1 if not found

    Raises
    ------
    ValueError
        If `intervals` does not have exactly two elements, if the bound
        arrays differ in size, if an upper bound (after conversion to a
        closed interval) is below its lower bound, if the lower bounds are
        not sorted, or if consecutive intervals overlap.
    TypeError
        If `vals` or either bound array is not an int64/float64 pdarray.

    Notes
    -----
    The return idx satisfies the following condition:
        present = idx > -1
        ((intervals[0][idx[present]] <= vals[present]) & (intervals[1][idx[present]] > vals[present])).all()

    NOTE(review): upper bounds are made inclusive by subtracting 1, which
    matches the half-open interval only for integer bounds; for float64
    bounds this shortens each interval by 1.0 - confirm intended.
    """
    if len(intervals) != 2:
        raise ValueError(
            "intervals must be 2-tuple of (lower_bound_inclusive, upper_bounds_exclusive)"
        )

    # Reject anything that is not an int64 or float64 pdarray
    def check_numeric(x):
        if not (isinstance(x, ak.pdarray)
                and x.dtype in (ak.int64, ak.float64)):
            raise TypeError("arguments must be numeric arrays")

    check_numeric(vals)
    check_numeric(intervals[0])
    check_numeric(intervals[1])
    low = intervals[0]
    # Convert to closed (inclusive) intervals
    high = intervals[1] - 1
    if low.size != high.size:
        raise ValueError("Lower and upper bound arrays must be same size")
    if not (high >= low).all():
        raise ValueError("Upper bounds must be greater than lower bounds")
    if not ak.is_sorted(low):
        raise ValueError("Intervals must be sorted in ascending order")
    # Each lower bound must lie strictly above the previous inclusive upper bound
    if not (low[1:] > high[:-1]).all():
        raise ValueError("Intervals must be non-overlapping")
    if assume_unique:
        uvals = vals
    else:
        # Search each distinct value once; results are spread back to every
        # occurrence at the end via g.broadcast
        g = ak.GroupBy(vals)
        uvals = g.unique_keys
    # Index of interval containing each unique value (initialized to -1: not found)
    containinginterval = -ak.ones(uvals.size, dtype=ak.int64)
    # Sort bounds and query values together; positions in concat are
    # [0, low.size) lower bounds, [low.size, boundary) values, [boundary, end) upper bounds
    # NOTE(review): a value equal to a bound is only matched if ak.argsort
    # keeps equal keys in concatenation order (low, vals, high) - confirm
    concat = ak.concatenate((low, uvals, high))
    perm = ak.argsort(concat)
    # iperm is the indices of the original values in the sorted array
    iperm = ak.argsort(perm)  # aku.invert_permutation(perm)
    boundary = uvals.size + low.size
    # indices of the lower bounds in the sorted array
    starts = iperm[:low.size]
    # indices of the upper bounds in the sorted array
    ends = iperm[boundary:]
    # which lower/upper bound pairs have any indices between them?
    valid = (ends > starts + 1)
    if valid.sum() > 0:
        # pranges is all the indices in sorted array that fall between a lower and an upper bound
        segs, pranges = gen_ranges(starts[valid] + 1, ends[valid])
        # matches are the indices of those items in the original array
        matches = perm[pranges]
        # integer indices of each interval containing a hit
        hitidx = ak.arange(valid.size)[valid]
        # broadcast interval index out to matches
        matchintervalidx = ak.broadcast(segs, hitidx, matches.size)
        # make sure not to include any of the bounds themselves
        validmatch = (matches >= low.size) & (matches < boundary)
        # indices of unique values found (translated from concat keys)
        uvalidx = matches[validmatch] - low.size
        # set index of containing interval for uvals that were found
        containinginterval[uvalidx] = matchintervalidx[validmatch]
    if assume_unique:
        res = containinginterval
    else:
        # presumably maps the per-unique-value result back onto the original
        # order of vals - verify against GroupBy.broadcast
        res = g.broadcast(containinginterval, permute=True)
    return res