Esempio n. 1
0
def check_int_float(N):
    """
    Check ak.coargsort on a mix of float64 and int64 key arrays.

    Generates one float64 and one int64 random array of length N, co-sorts
    them in both key orders, and asserts that the leading key is sorted
    under the returned permutation.

    Parameters
    ----------
    N : int
        Number of elements in each generated array
    """
    float_keys = ak.randint(0, 2**63, N, dtype=ak.float64)
    int_keys = ak.randint(0, 2**63, N, dtype=ak.int64)

    # Whichever array is listed first is the one checked for sortedness
    for primary, secondary in ((float_keys, int_keys), (int_keys, float_keys)):
        order = ak.coargsort([primary, secondary])
        assert ak.is_sorted(primary[order])
Esempio n. 2
0
def check_correctness(dtype):
    """
    Check that ak.coargsort orders a random array paired with a zeros array.

    A random array and an all-zeros array of the same dtype are co-sorted
    in both key orders; in each case the random array must come out sorted
    under the returned permutation.

    Parameters
    ----------
    dtype : str
        Either 'int64' or 'float64'

    Raises
    ------
    ValueError
        If dtype is not one of the supported values
    """
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64)
    else:
        # Without this branch an unknown dtype surfaced later as an
        # UnboundLocalError on `a`, which hides the real problem.
        raise ValueError("Unsupported dtype: {!r}".format(dtype))
    z = ak.zeros(N, dtype=dtype)

    perm = ak.coargsort([a, z])
    assert ak.is_sorted(a[perm])
    # With the constant array as the leading key, ordering is decided by `a`
    perm = ak.coargsort([z, a])
    assert ak.is_sorted(a[perm])
Esempio n. 3
0
def time_ak_argsort(N_per_locale, trials, dtype, seed):
    """
    Benchmark ak.argsort on a randomly generated array and print the results.

    Parameters
    ----------
    N_per_locale : int
        Elements per locale; the total size is this value multiplied by the
        "numLocales" entry of ak.get_config()
    trials : int
        Number of timed ak.argsort calls to average over; must be >= 1
    dtype : str
        One of 'int64', 'float64' or 'str'
    seed : int or None
        Seed forwarded to the random array generator

    Raises
    ------
    ValueError
        If dtype is unsupported or trials is less than 1
    """
    # Validate before doing any work: previously a bad dtype ended in an
    # UnboundLocalError and trials == 0 in a ZeroDivisionError, both only
    # after the input array had already been generated.
    if dtype not in ('int64', 'float64', 'str'):
        raise ValueError("Unsupported dtype: {!r}".format(dtype))
    if trials < 1:
        raise ValueError("trials must be at least 1, got {!r}".format(trials))

    print(">>> arkouda {} argsort".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
        nbytes = a.size * a.itemsize
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        nbytes = a.size * a.itemsize
    else:
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
        nbytes = a.nbytes * a.entry.itemsize

    timings = []
    for _ in range(trials):
        # perf_counter is monotonic, so system clock adjustments cannot
        # produce negative or skewed intervals
        start = time.perf_counter()
        perm = ak.argsort(a)
        end = time.perf_counter()
        timings.append(end - start)
    tavg = sum(timings) / trials

    # Sortedness is only verified for the numeric dtypes
    if dtype in ('int64', 'float64'):
        assert ak.is_sorted(a[perm])
    print("Average time = {:.4f} sec".format(tavg))
    bytes_per_sec = nbytes / tavg
    print("Average rate = {:.4f} GiB/sec".format(bytes_per_sec / 2**30))
Esempio n. 4
0
def time_ak_coargsort(N_per_locale, trials, dtype, seed):
    """
    Benchmark ak.coargsort for several key-array counts and print the results.

    For each count in (1, 2, 8, 16) the total problem size N is split evenly
    (integer division) across that many random arrays, which are then
    co-sorted `trials` times.

    Parameters
    ----------
    N_per_locale : int
        Elements per locale; the total size is this value multiplied by the
        "numLocales" entry of ak.get_config()
    trials : int
        Number of timed ak.coargsort calls to average over; must be >= 1
    dtype : str
        One of 'int64', 'float64' or 'str'
    seed : int or None
        Base seed; array k is generated with seed + k. None leaves every
        array unseeded.

    Raises
    ------
    ValueError
        If dtype is unsupported or trials is less than 1
    """
    # Validate before doing any work: previously a bad dtype ended in an
    # UnboundLocalError and trials == 0 in a ZeroDivisionError, both only
    # after the input arrays had already been generated.
    if dtype not in ('int64', 'float64', 'str'):
        raise ValueError("Unsupported dtype: {!r}".format(dtype))
    if trials < 1:
        raise ValueError("trials must be at least 1, got {!r}".format(trials))

    print(">>> arkouda {} coargsort".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    for numArrays in (1, 2, 8, 16):
        if seed is None:
            seeds = [None for _ in range(numArrays)]
        else:
            # Distinct seed per array so the keys are not identical copies
            seeds = [seed + i for i in range(numArrays)]
        if dtype == 'int64':
            arrs = [ak.randint(0, 2**32, N//numArrays, seed=s) for s in seeds]
            nbytes = sum(a.size * a.itemsize for a in arrs)
        elif dtype == 'float64':
            arrs = [ak.randint(0, 1, N//numArrays, dtype=ak.float64, seed=s) for s in seeds]
            nbytes = sum(a.size * a.itemsize for a in arrs)
        else:
            arrs = [ak.random_strings_uniform(1, 8, N//numArrays, seed=s) for s in seeds]
            nbytes = sum(a.bytes.size * a.bytes.itemsize for a in arrs)

        timings = []
        for _ in range(trials):
            # perf_counter is monotonic, so system clock adjustments cannot
            # produce negative or skewed intervals
            start = time.perf_counter()
            perm = ak.coargsort(arrs)
            end = time.perf_counter()
            timings.append(end - start)
        tavg = sum(timings) / trials

        a = arrs[0][perm]
        # Sortedness of the leading key is only verified for numeric dtypes
        if dtype in ('int64', 'float64'):
            assert ak.is_sorted(a)
        print("{}-array Average time = {:.4f} sec".format(numArrays, tavg))
        bytes_per_sec = nbytes / tavg
        print("{}-array Average rate = {:.4f} GiB/sec".format(numArrays, bytes_per_sec/2**30))
Esempio n. 5
0
def time_ak_argsort(N_per_locale, trials, dtype, scale_by_locales):
    """
    Benchmark ak.argsort on a randomly generated array and print the results.

    Parameters
    ----------
    N_per_locale : int
        Base problem size
    trials : int
        Number of timed ak.argsort calls to average over; must be >= 1
    dtype : str
        Either 'int64' or 'float64'
    scale_by_locales : bool
        If true, the problem size is N_per_locale multiplied by the
        "numLocales" entry of ak.get_config(); otherwise N_per_locale is
        used as the total size.

    Raises
    ------
    ValueError
        If dtype is unsupported or trials is less than 1
    """
    # Validate before doing any work: previously a bad dtype ended in an
    # UnboundLocalError and trials == 0 in a ZeroDivisionError, both only
    # after the input array had already been generated.
    if dtype not in ('int64', 'float64'):
        raise ValueError("Unsupported dtype: {!r}".format(dtype))
    if trials < 1:
        raise ValueError("trials must be at least 1, got {!r}".format(trials))

    print(">>> arkouda argsort")
    cfg = ak.get_config()
    if scale_by_locales:
        N = N_per_locale * cfg["numLocales"]
    else:
        N = N_per_locale
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    else:
        a = ak.randint(0, 1, N, dtype=ak.float64)

    timings = []
    for _ in range(trials):
        # perf_counter is monotonic, so system clock adjustments cannot
        # produce negative or skewed intervals
        start = time.perf_counter()
        perm = ak.argsort(a)
        end = time.perf_counter()
        timings.append(end - start)
    tavg = sum(timings) / trials

    assert ak.is_sorted(a[perm])
    print("Average time = {:.4f} sec".format(tavg))
    bytes_per_sec = (a.size * a.itemsize) / tavg
    print("Average rate = {:.4f} GiB/sec".format(bytes_per_sec/2**30))
Esempio n. 6
0
def time_ak_coargsort(N_per_locale, trials, dtype):
    """
    Benchmark ak.coargsort for several key-array counts and print the results.

    For each count in (1, 2, 8, 16) the total problem size N is split evenly
    (integer division) across that many random arrays, which are then
    co-sorted `trials` times.

    Parameters
    ----------
    N_per_locale : int
        Elements per locale; the total size is this value multiplied by the
        "numLocales" entry of ak.get_config()
    trials : int
        Number of timed ak.coargsort calls to average over; must be >= 1
    dtype : str
        Either 'int64' or 'float64'

    Raises
    ------
    ValueError
        If dtype is unsupported or trials is less than 1
    """
    # Validate before doing any work: previously a bad dtype ended in an
    # UnboundLocalError and trials == 0 in a ZeroDivisionError, both only
    # after the input arrays had already been generated.
    if dtype not in ('int64', 'float64'):
        raise ValueError("Unsupported dtype: {!r}".format(dtype))
    if trials < 1:
        raise ValueError("trials must be at least 1, got {!r}".format(trials))

    print(">>> arkouda coargsort")
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    for numArrays in (1, 2, 8, 16):
        if dtype == 'int64':
            arrs = [
                ak.randint(0, 2**32, N // numArrays) for _ in range(numArrays)
            ]
        else:
            arrs = [
                ak.randint(0, 1, N // numArrays, dtype=ak.float64)
                for _ in range(numArrays)
            ]

        timings = []
        for _ in range(trials):
            # perf_counter is monotonic, so system clock adjustments cannot
            # produce negative or skewed intervals
            start = time.perf_counter()
            perm = ak.coargsort(arrs)
            end = time.perf_counter()
            timings.append(end - start)
        tavg = sum(timings) / trials

        # The leading key must be sorted under the returned permutation
        leading = arrs[0][perm]
        assert ak.is_sorted(leading)
        print("{}-array Average time = {:.4f} sec".format(numArrays, tavg))
        bytes_per_sec = sum(arr.size * arr.itemsize for arr in arrs) / tavg
        print("{}-array Average rate = {:.4f} GiB/sec".format(
            numArrays, bytes_per_sec / 2**30))
Esempio n. 7
0
def check_correctness(dtype, seed):
    """
    Check that ak.coargsort orders a random array paired with a constant array.

    A random array and a constant (zeros-derived) array are co-sorted in
    both key orders. For the numeric dtypes the random array must come out
    sorted under the returned permutation; for 'str' the calls are only
    exercised, not verified.

    Parameters
    ----------
    dtype : str
        One of 'int64', 'float64' or 'str'
    seed : int or None
        Seed forwarded to the random array generator

    Raises
    ------
    ValueError
        If dtype is not one of the supported values
    """
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
        z = ak.zeros(N, dtype=dtype)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        z = ak.zeros(N, dtype=dtype)
    elif dtype == 'str':
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
        z = ak.cast(ak.zeros(N), 'str')
    else:
        # Without this branch an unknown dtype surfaced later as an
        # UnboundLocalError on `a`, which hides the real problem.
        raise ValueError("Unsupported dtype: {!r}".format(dtype))

    numeric = dtype in ('int64', 'float64')
    perm = ak.coargsort([a, z])
    if numeric:
        assert ak.is_sorted(a[perm])
    perm = ak.coargsort([z, a])
    if numeric:
        assert ak.is_sorted(a[perm])
Esempio n. 8
0
def check_correctness(dtype):
    """
    Check that ak.argsort returns a permutation that sorts a random array.

    Parameters
    ----------
    dtype : str
        Either 'int64' or 'float64'

    Raises
    ------
    ValueError
        If dtype is not one of the supported values
    """
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64)
    else:
        # Without this branch an unknown dtype surfaced later as an
        # UnboundLocalError on `a`, which hides the real problem.
        raise ValueError("Unsupported dtype: {!r}".format(dtype))

    perm = ak.argsort(a)
    assert ak.is_sorted(a[perm])
Esempio n. 9
0
def check_float(N):
    """
    Check ak.coargsort on float64 keys, including negatives and constants.

    Three float64 arrays of length N are built: one drawn with bounds
    (0, 1), one drawn with bounds (-1, 1), and one of all zeros. Several
    key combinations are co-sorted, and for each the array expected to
    determine the ordering is asserted to be sorted.

    Parameters
    ----------
    N : int
        Number of elements in each generated array
    """
    a = ak.randint(0, 1, N, dtype=ak.float64)
    n = ak.randint(-1, 1, N, dtype=ak.float64)
    z = ak.zeros(N, dtype=ak.float64)

    # (key arrays passed to coargsort, array that must end up sorted)
    cases = (
        ([a], a),
        ([a, n], a),
        ([n, a], n),
        ([z, a], a),
        ([z, n], n),
    )
    for keys, decisive in cases:
        order = ak.coargsort(keys)
        assert ak.is_sorted(decisive[order])
Esempio n. 10
0
def check_correctness(dtype, seed):
    """
    Check that ak.argsort returns a permutation that sorts a random array.

    For 'str' the sort is only exercised; sortedness is verified for the
    numeric dtypes only.

    Parameters
    ----------
    dtype : str
        One of 'int64', 'float64' or 'str'
    seed : int or None
        Seed forwarded to the random array generator

    Raises
    ------
    ValueError
        If dtype is not one of the supported values
    """
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
    elif dtype == 'str':
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
    else:
        # Without this branch an unknown dtype surfaced later as an
        # UnboundLocalError on `a`, which hides the real problem.
        raise ValueError("Unsupported dtype: {!r}".format(dtype))

    perm = ak.argsort(a)
    if dtype in ('int64', 'float64'):
        assert ak.is_sorted(a[perm])
Esempio n. 11
0
def check_sorted(s):
    """
    Report whether `s` is sorted.

    A single ak.pdarray or ak.Strings is checked with ak.is_sorted; any
    other input is handed to is_cosorted.
    """
    is_single_array = isinstance(s, (ak.pdarray, ak.Strings))
    return ak.is_sorted(s) if is_single_array else is_cosorted(s)
Esempio n. 12
0
def check_int(N):
    """
    Check ak.coargsort on integer keys of several magnitudes.

    Random arrays are generated with bounds of 2**16, 2**32 and 2**64
    (plus signed variants and an all-zeros array). Various key
    combinations are co-sorted, and for each the array expected to decide
    the ordering is asserted to be sorted under the returned permutation.

    Parameters
    ----------
    N : int
        Number of elements in each generated array
    """
    from itertools import permutations

    def expect_sorted(keys, decisive):
        # Co-sort `keys` and require `decisive` to be ordered by the result
        order = ak.coargsort(keys)
        assert ak.is_sorted(decisive[order])

    z = ak.zeros(N, dtype=ak.int64)

    # Keys drawn with a 2**16 bound
    a2 = ak.randint(0, 2**16, N)
    b2 = ak.randint(0, 2**16, N)
    c2 = ak.randint(0, 2**16, N)
    d2 = ak.randint(0, 2**16, N)
    n2 = ak.randint(-(2**15), 2**15, N)

    expect_sorted([a2], a2)
    expect_sorted([n2], n2)
    expect_sorted([a2, b2, c2, d2], a2)
    # Leading constant keys push the decision to the first varying key
    expect_sorted([z, b2, c2, d2], b2)
    expect_sorted([z, z, c2, d2], c2)
    expect_sorted([z, z, z, d2], d2)

    # Keys drawn with a 2**32 bound
    a4 = ak.randint(0, 2**32, N)
    b4 = ak.randint(0, 2**32, N)
    n4 = ak.randint(-(2**31), 2**31, N)

    expect_sorted([a4], a4)
    expect_sorted([n4], n4)
    expect_sorted([a4, b4], a4)
    expect_sorted([b4, a4], b4)

    # Keys drawn with a 2**64 bound
    a8 = ak.randint(0, 2**64, N)
    b8 = ak.randint(0, 2**64, N)
    n8 = ak.randint(-(2**63), 2**64, N)

    expect_sorted([a8], a8)
    expect_sorted([n8], n8)
    expect_sorted([b8, a8], b8)

    # Every ordering of mixed-magnitude keys; each ordering is passed as a tuple
    for ordering in permutations([a2, a4, a8]):
        expect_sorted(ordering, ordering[0])
Esempio n. 13
0
def check_large(N):
    """
    Check ak.coargsort on ten random key arrays.

    Parameters
    ----------
    N : int
        Number of elements in each generated array
    """
    num_keys = 10
    keys = [ak.randint(0, 2**63, N) for _ in range(num_keys)]
    order = ak.coargsort(keys)
    leading = keys[0]
    assert ak.is_sorted(leading[order])
Esempio n. 14
0
    def __init__(self,
                 segments,
                 values,
                 copy=False,
                 lengths=None,
                 grouping=None):
        """
        An array of variable-length arrays, also called a skyline array or ragged array.

        Parameters
        ----------
        segments : pdarray, int64
            Start index of each sub-array in the flattened values array
        values : pdarray
            The flattened values of all sub-arrays
        copy : bool
            If True, make a copy of the input arrays; otherwise, just store a reference.
        lengths : optional
            Precomputed sub-array lengths. When None they are obtained from
            self._get_lengths(). Not user-facing.
        grouping : optional
            Precomputed grouping of values by sub-array. When None an
            ak.GroupBy is built here. Not user-facing.

        Raises
        ------
        TypeError
            If segments is not an int64 pdarray
        ValueError
            If segments is not sorted and unique, does not start at zero,
            contains an index >= values.size, or is empty while values is not

        Notes
        -----
        Keyword args 'lengths' and 'grouping' are not user-facing. They are used by the
        attach method.
        """
        # --- validation (order matters: later checks assume earlier ones passed)
        if not isinstance(segments, ak.pdarray) or segments.dtype != ak.int64:
            raise TypeError("Segments must be int64 pdarray")
        if not ak.is_sorted(segments) or (ak.unique(segments).size !=
                                          segments.size):
            raise ValueError("Segments must be unique and in sorted order")
        if segments.size == 0:
            if values.size > 0:
                raise ValueError(
                    "Cannot have non-empty values with empty segments")
        elif segments.min() != 0 or segments.max() >= values.size:
            raise ValueError(
                "Segments must start at zero and be less than values.size")

        # --- storage
        self.segments, self.values = (
            (segments[:], values[:]) if copy else (segments, values))
        self.size = segments.size
        self.valsize = values.size
        self.lengths = self._get_lengths() if lengths is None else lengths
        self.dtype = values.dtype

        # --- grouping
        if grouping is not None:
            self.grouping = grouping
        elif self.size == 0:
            self.grouping = ak.GroupBy(ak.zeros(0, dtype=ak.int64))
        else:
            # Label every value with the index of its sub-array so that each
            # sub-array forms one group for grouped aggregations
            segment_ids = ak.broadcast(self.segments, ak.arange(self.size),
                                       self.valsize)
            self.grouping = ak.GroupBy(segment_ids)
Esempio n. 15
0
def search_intervals(vals, intervals, assume_unique=False):
    """
    Given an array of query vals and non-overlapping, half-open (pythonic)
    intervals, return the index of the interval containing each query value,
    or -1 if not present in any interval.

    Parameters
    ----------
    vals : pdarray(int, float)
        Values to search for in intervals
    intervals : 2-tuple of pdarrays
        Non-overlapping, half-open intervals, as a tuple of
        (lower_bounds_inclusive, upper_bounds_exclusive)
    assume_unique : bool
        If True, assume query vals are unique. Default: False.

    Returns
    -------
    idx : pdarray(int64)
        Index of interval containing each query value, or -1 if not found

    Raises
    ------
    ValueError
        If intervals does not have exactly two elements, if the bound arrays
        differ in size, if any (upper bound - 1) is below its lower bound,
        if the lower bounds are not sorted, or if consecutive intervals overlap
    TypeError
        If vals or either bound array is not a pdarray of dtype int64 or float64

    Notes
    -----
    The return idx satisfies the following condition:
        present = idx > -1
        ((intervals[0][idx[present]] <= vals[present]) & (intervals[1][idx[present]] > vals[present])).all()

    The half-open intervals are converted to closed ones by subtracting 1
    from the upper bounds.
    NOTE(review): that conversion is exact only for integer data; with
    float64 bounds a value in (upper - 1, upper) would apparently be treated
    as outside the interval -- confirm whether float inputs are intended.
    """
    if len(intervals) != 2:
        raise ValueError(
            "intervals must be 2-tuple of (lower_bound_inclusive, upper_bounds_exclusive)"
        )

    def check_numeric(x):
        # Reject anything that is not an int64/float64 pdarray
        if not (isinstance(x, ak.pdarray)
                and x.dtype in (ak.int64, ak.float64)):
            raise TypeError("arguments must be numeric arrays")

    check_numeric(vals)
    check_numeric(intervals[0])
    check_numeric(intervals[1])
    low = intervals[0]
    # Convert to closed (inclusive) intervals
    high = intervals[1] - 1
    if low.size != high.size:
        raise ValueError("Lower and upper bound arrays must be same size")
    # After the -1 shift, high >= low means the original interval is non-empty
    # (for integer bounds)
    if not (high >= low).all():
        raise ValueError("Upper bounds must be greater than lower bounds")
    if not ak.is_sorted(low):
        raise ValueError("Intervals must be sorted in ascending order")
    # Each interval must start strictly after the previous one ends
    if not (low[1:] > high[:-1]).all():
        raise ValueError("Intervals must be non-overlapping")
    if assume_unique:
        uvals = vals
    else:
        # Search only the distinct query values; results are broadcast back
        # to the original positions at the end
        g = ak.GroupBy(vals)
        uvals = g.unique_keys
    # Index of interval containing each unique value (initialized to -1: not found)
    containinginterval = -ak.ones(uvals.size, dtype=ak.int64)
    # Layout of concat: [0, low.size) lower bounds,
    # [low.size, boundary) query values, [boundary, end) upper bounds
    # NOTE(review): values equal to a bound are only matched if argsort keeps
    # equal keys in concat order (low, vals, high) -- presumably it is
    # stable; confirm.
    concat = ak.concatenate((low, uvals, high))
    perm = ak.argsort(concat)
    # iperm is the indices of the original values in the sorted array
    iperm = ak.argsort(perm)  # aku.invert_permutation(perm)
    boundary = uvals.size + low.size
    # indices of the lower bounds in the sorted array
    starts = iperm[:low.size]
    # indices of the upper bounds in the sorted array
    ends = iperm[boundary:]
    # which lower/upper bound pairs have any indices between them?
    valid = (ends > starts + 1)
    if valid.sum() > 0:
        # pranges is all the indices in sorted array that fall between a lower and an upper bound
        segs, pranges = gen_ranges(starts[valid] + 1, ends[valid])
        # matches are the indices of those items in the original array
        matches = perm[pranges]
        # integer indices of each interval containing a hit
        hitidx = ak.arange(valid.size)[valid]
        # broadcast interval index out to matches
        matchintervalidx = ak.broadcast(segs, hitidx, matches.size)
        # make sure not to include any of the bounds themselves
        validmatch = (matches >= low.size) & (matches < boundary)
        # indices of unique values found (translated from concat keys)
        uvalidx = matches[validmatch] - low.size
        # set index of containing interval for uvals that were found
        containinginterval[uvalidx] = matchintervalidx[validmatch]
    if assume_unique:
        res = containinginterval
    else:
        # Expand the per-unique-value result back to one entry per query value
        res = g.broadcast(containinginterval, permute=True)
    return res