def check_int_float(N):
    """
    Check coargsort on a mixed float64/int64 pair of keys, in both key orders.

    Parameters
    ----------
    N : int
        Number of elements in each randomly generated array.

    Raises
    ------
    AssertionError
        If the leading key is not sorted after applying the permutation
        returned by ``ak.coargsort``.
    """
    floats = ak.randint(0, 2**63, N, dtype=ak.float64)
    ints = ak.randint(0, 2**63, N, dtype=ak.int64)

    # Whichever array is listed first is the one that must come out sorted.
    for leading, trailing in ((floats, ints), (ints, floats)):
        order = ak.coargsort([leading, trailing])
        assert ak.is_sorted(leading[order])
def check_correctness(dtype):
    """
    Check that coargsort orders a random array correctly next to a constant key.

    A random array ``a`` is co-sorted with an all-zeros array ``z`` in both
    key orders; in either case ``a`` must end up sorted.

    Parameters
    ----------
    dtype : str
        Element type of the test data: 'int64' or 'float64'.

    Raises
    ------
    ValueError
        If ``dtype`` is not one of the supported values.
    AssertionError
        If ``a`` is not sorted after applying the returned permutation.
    """
    # Fail early with a clear message; otherwise an unknown dtype would leave
    # `a` and `z` unbound and surface later as a NameError.
    if dtype not in ('int64', 'float64'):
        raise ValueError(
            "Unsupported dtype {!r}; expected 'int64' or 'float64'".format(dtype))

    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    else:
        a = ak.randint(0, 1, N, dtype=ak.float64)
    z = ak.zeros(N, dtype=dtype)

    perm = ak.coargsort([a, z])
    assert ak.is_sorted(a[perm])
    # With the constant key first, the ordering is decided by `a` alone.
    perm = ak.coargsort([z, a])
    assert ak.is_sorted(a[perm])
def time_ak_argsort(N_per_locale, trials, dtype, seed):
    """
    Time ``ak.argsort`` on random data and print the average time and rate.

    Parameters
    ----------
    N_per_locale : int
        Problem size per locale; multiplied by the server's ``numLocales``.
    trials : int
        Number of timed repetitions (must be at least 1).
    dtype : str
        Element type of the data: 'int64', 'float64' or 'str'.
    seed : int or None
        Seed passed through to the random generators.

    Raises
    ------
    ValueError
        If ``dtype`` is unsupported or ``trials`` is less than 1.
    AssertionError
        If the permuted numeric array is not sorted.
    """
    # Validate before doing any server work; previously an unknown dtype left
    # `a`/`nbytes` unbound and zero trials ended in a ZeroDivisionError.
    if dtype not in ('int64', 'float64', 'str'):
        raise ValueError(
            "Unsupported dtype {!r}; expected 'int64', 'float64' or 'str'".format(dtype))
    if trials < 1:
        raise ValueError("trials must be at least 1, got {!r}".format(trials))

    print(">>> arkouda {} argsort".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
        nbytes = a.size * a.itemsize
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        nbytes = a.size * a.itemsize
    else:
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
        # NOTE(review): byte count for strings is taken from the Strings
        # object's own attributes; confirm these match the arkouda version in use.
        nbytes = a.nbytes * a.entry.itemsize

    timings = []
    for _ in range(trials):
        start = time.time()
        perm = ak.argsort(a)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    # Sortedness is only verified for the numeric dtypes.
    if dtype in ('int64', 'float64'):
        assert ak.is_sorted(a[perm])
    print("Average time = {:.4f} sec".format(tavg))
    bytes_per_sec = nbytes / tavg
    print("Average rate = {:.4f} GiB/sec".format(bytes_per_sec / 2**30))
def time_ak_coargsort(N_per_locale, trials, dtype, seed):
    """
    Time ``ak.coargsort`` for 1, 2, 8 and 16 key arrays and print the results.

    The total element count ``N`` is split evenly across the key arrays, so
    each array holds ``N // numArrays`` elements.

    Parameters
    ----------
    N_per_locale : int
        Problem size per locale; multiplied by the server's ``numLocales``.
    trials : int
        Number of timed repetitions per configuration (must be at least 1).
    dtype : str
        Element type of the data: 'int64', 'float64' or 'str'.
    seed : int or None
        Base seed. When not None, array ``k`` is generated with ``seed + k``.

    Raises
    ------
    ValueError
        If ``dtype`` is unsupported or ``trials`` is less than 1.
    AssertionError
        If the first numeric key is not sorted after permutation.
    """
    # Validate before doing any server work; previously an unknown dtype left
    # `arrs`/`nbytes` unbound and zero trials ended in a ZeroDivisionError.
    if dtype not in ('int64', 'float64', 'str'):
        raise ValueError(
            "Unsupported dtype {!r}; expected 'int64', 'float64' or 'str'".format(dtype))
    if trials < 1:
        raise ValueError("trials must be at least 1, got {!r}".format(trials))

    print(">>> arkouda {} coargsort".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    for numArrays in (1, 2, 8, 16):
        # Distinct seeds so that the key arrays differ from one another.
        if seed is None:
            seeds = [None] * numArrays
        else:
            seeds = [seed + k for k in range(numArrays)]

        if dtype == 'int64':
            arrs = [ak.randint(0, 2**32, N//numArrays, seed=s) for s in seeds]
            nbytes = sum(arr.size * arr.itemsize for arr in arrs)
        elif dtype == 'float64':
            arrs = [ak.randint(0, 1, N//numArrays, dtype=ak.float64, seed=s) for s in seeds]
            nbytes = sum(arr.size * arr.itemsize for arr in arrs)
        else:
            arrs = [ak.random_strings_uniform(1, 8, N//numArrays, seed=s) for s in seeds]
            nbytes = sum(arr.bytes.size * arr.bytes.itemsize for arr in arrs)

        timings = []
        for _ in range(trials):
            start = time.time()
            perm = ak.coargsort(arrs)
            end = time.time()
            timings.append(end - start)
        tavg = sum(timings) / trials

        a = arrs[0][perm]
        # Sortedness is only verified for the numeric dtypes.
        if dtype in ('int64', 'float64'):
            assert ak.is_sorted(a)
        print("{}-array Average time = {:.4f} sec".format(numArrays, tavg))
        bytes_per_sec = nbytes / tavg
        print("{}-array Average rate = {:.4f} GiB/sec".format(numArrays, bytes_per_sec/2**30))
def time_ak_argsort(N_per_locale, trials, dtype, scale_by_locales):
    """
    Time ``ak.argsort`` on random data and print the average time and rate.

    Parameters
    ----------
    N_per_locale : int
        Problem size; used as-is, or multiplied by the server's
        ``numLocales`` when ``scale_by_locales`` is true.
    trials : int
        Number of timed repetitions (must be at least 1).
    dtype : str
        Element type of the data: 'int64' or 'float64'.
    scale_by_locales : bool
        Whether to scale the problem size by the number of locales.

    Raises
    ------
    ValueError
        If ``dtype`` is unsupported or ``trials`` is less than 1.
    AssertionError
        If the permuted array is not sorted.
    """
    # Validate before doing any server work; previously an unknown dtype left
    # `a` unbound and zero trials ended in a ZeroDivisionError.
    if dtype not in ('int64', 'float64'):
        raise ValueError(
            "Unsupported dtype {!r}; expected 'int64' or 'float64'".format(dtype))
    if trials < 1:
        raise ValueError("trials must be at least 1, got {!r}".format(trials))

    print(">>> arkouda argsort")
    cfg = ak.get_config()
    if scale_by_locales:
        N = N_per_locale * cfg["numLocales"]
    else:
        N = N_per_locale
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    else:
        a = ak.randint(0, 1, N, dtype=ak.float64)

    timings = []
    for _ in range(trials):
        start = time.time()
        perm = ak.argsort(a)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    assert ak.is_sorted(a[perm])
    print("Average time = {:.4f} sec".format(tavg))
    bytes_per_sec = (a.size * a.itemsize) / tavg
    print("Average rate = {:.4f} GiB/sec".format(bytes_per_sec/2**30))
def time_ak_coargsort(N_per_locale, trials, dtype):
    """
    Time ``ak.coargsort`` for 1, 2, 8 and 16 key arrays and print the results.

    The total element count ``N`` is split evenly across the key arrays, so
    each array holds ``N // numArrays`` elements.

    Parameters
    ----------
    N_per_locale : int
        Problem size per locale; multiplied by the server's ``numLocales``.
    trials : int
        Number of timed repetitions per configuration (must be at least 1).
    dtype : str
        Element type of the data: 'int64' or 'float64'.

    Raises
    ------
    ValueError
        If ``dtype`` is unsupported or ``trials`` is less than 1.
    AssertionError
        If the first key is not sorted after permutation.
    """
    # Validate before doing any server work; previously an unknown dtype left
    # `arrs` unbound and zero trials ended in a ZeroDivisionError.
    if dtype not in ('int64', 'float64'):
        raise ValueError(
            "Unsupported dtype {!r}; expected 'int64' or 'float64'".format(dtype))
    if trials < 1:
        raise ValueError("trials must be at least 1, got {!r}".format(trials))

    print(">>> arkouda coargsort")
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    for numArrays in (1, 2, 8, 16):
        if dtype == 'int64':
            arrs = [
                ak.randint(0, 2**32, N // numArrays)
                for _ in range(numArrays)
            ]
        else:
            arrs = [
                ak.randint(0, 1, N // numArrays, dtype=ak.float64)
                for _ in range(numArrays)
            ]

        timings = []
        for _ in range(trials):
            start = time.time()
            perm = ak.coargsort(arrs)
            end = time.time()
            timings.append(end - start)
        tavg = sum(timings) / trials

        # The first array is the primary key, so it must come out sorted.
        a = arrs[0][perm]
        assert ak.is_sorted(a)
        print("{}-array Average time = {:.4f} sec".format(numArrays, tavg))
        bytes_per_sec = sum(arr.size * arr.itemsize for arr in arrs) / tavg
        print("{}-array Average rate = {:.4f} GiB/sec".format(
            numArrays, bytes_per_sec / 2**30))
def check_correctness(dtype, seed):
    """
    Check that coargsort orders a random array correctly next to a constant key.

    A random array ``a`` is co-sorted with a constant array ``z`` in both key
    orders. For the numeric dtypes, ``a`` must end up sorted; for 'str' the
    calls are only exercised, not verified.

    Parameters
    ----------
    dtype : str
        Element type of the test data: 'int64', 'float64' or 'str'.
    seed : int or None
        Seed passed through to the random generators.

    Raises
    ------
    ValueError
        If ``dtype`` is not one of the supported values.
    AssertionError
        If a numeric ``a`` is not sorted after applying the permutation.
    """
    # Fail early with a clear message; otherwise an unknown dtype would leave
    # `a` and `z` unbound and surface later as a NameError.
    if dtype not in ('int64', 'float64', 'str'):
        raise ValueError(
            "Unsupported dtype {!r}; expected 'int64', 'float64' or 'str'".format(dtype))

    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
        z = ak.zeros(N, dtype=dtype)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        z = ak.zeros(N, dtype=dtype)
    else:
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
        z = ak.cast(ak.zeros(N), 'str')

    verify = dtype in ('int64', 'float64')
    perm = ak.coargsort([a, z])
    if verify:
        assert ak.is_sorted(a[perm])
    # With the constant key first, the ordering is decided by `a` alone.
    perm = ak.coargsort([z, a])
    if verify:
        assert ak.is_sorted(a[perm])
def check_correctness(dtype):
    """
    Check that ``ak.argsort`` returns a sorting permutation for random data.

    Parameters
    ----------
    dtype : str
        Element type of the test data: 'int64' or 'float64'.

    Raises
    ------
    ValueError
        If ``dtype`` is not one of the supported values.
    AssertionError
        If the permuted array is not sorted.
    """
    # Fail early with a clear message; otherwise an unknown dtype would leave
    # `a` unbound and surface later as a NameError.
    if dtype not in ('int64', 'float64'):
        raise ValueError(
            "Unsupported dtype {!r}; expected 'int64' or 'float64'".format(dtype))

    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    else:
        a = ak.randint(0, 1, N, dtype=ak.float64)

    perm = ak.argsort(a)
    assert ak.is_sorted(a[perm])
def check_float(N):
    """
    Check coargsort on float64 keys: non-negative, signed and constant.

    Parameters
    ----------
    N : int
        Number of elements in each generated array.

    Raises
    ------
    AssertionError
        If the expected key is not sorted after applying the permutation
        returned by ``ak.coargsort``.
    """
    a = ak.randint(0, 1, N, dtype=ak.float64)
    n = ak.randint(-1, 1, N, dtype=ak.float64)
    z = ak.zeros(N, dtype=ak.float64)

    # (keys handed to coargsort, array that must come out sorted).
    # When the leading key is the constant `z`, the next key decides the order.
    cases = (
        ([a], a),
        ([a, n], a),
        ([n, a], n),
        ([z, a], a),
        ([z, n], n),
    )
    for keys, expected_sorted in cases:
        order = ak.coargsort(keys)
        assert ak.is_sorted(expected_sorted[order])
def check_correctness(dtype, seed):
    """
    Check that ``ak.argsort`` returns a sorting permutation for random data.

    For 'str' the sort is only exercised; sortedness is verified for the
    numeric dtypes only.

    Parameters
    ----------
    dtype : str
        Element type of the test data: 'int64', 'float64' or 'str'.
    seed : int or None
        Seed passed through to the random generators.

    Raises
    ------
    ValueError
        If ``dtype`` is not one of the supported values.
    AssertionError
        If a numeric array is not sorted after applying the permutation.
    """
    # Fail early with a clear message; otherwise an unknown dtype would leave
    # `a` unbound and surface later as a NameError.
    if dtype not in ('int64', 'float64', 'str'):
        raise ValueError(
            "Unsupported dtype {!r}; expected 'int64', 'float64' or 'str'".format(dtype))

    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
    else:
        a = ak.random_strings_uniform(1, 16, N, seed=seed)

    perm = ak.argsort(a)
    if dtype in ('int64', 'float64'):
        assert ak.is_sorted(a[perm])
def check_sorted(s):
    """
    Report whether ``s`` is sorted.

    A single ``ak.pdarray`` or ``ak.Strings`` is checked with
    ``ak.is_sorted``; anything else is handed to ``is_cosorted``.

    Parameters
    ----------
    s : pdarray, Strings, or other
        The object to test.

    Returns
    -------
    The result of ``ak.is_sorted(s)`` or ``is_cosorted(s)``.
    """
    single_array = isinstance(s, (ak.pdarray, ak.Strings))
    return ak.is_sorted(s) if single_array else is_cosorted(s)
def check_int(N):
    """
    Check coargsort on int64 keys drawn from 16-, 32- and 64-bit value ranges.

    Covers single keys, signed keys, multiple keys, leading constant keys and
    every ordering of keys with mixed value ranges.

    Parameters
    ----------
    N : int
        Number of elements in each generated array.

    Raises
    ------
    AssertionError
        If the expected key is not sorted after applying the permutation
        returned by ``ak.coargsort``.
    """
    from itertools import permutations

    def assert_sorted_by(keys, expected_sorted):
        # Co-sort `keys` and require `expected_sorted` to be ordered afterwards.
        order = ak.coargsort(keys)
        assert ak.is_sorted(expected_sorted[order])

    # Values that fit in 16 bits.
    z = ak.zeros(N, dtype=ak.int64)
    a2 = ak.randint(0, 2**16, N)
    b2 = ak.randint(0, 2**16, N)
    c2 = ak.randint(0, 2**16, N)
    d2 = ak.randint(0, 2**16, N)
    n2 = ak.randint(-(2**15), 2**15, N)
    assert_sorted_by([a2], a2)
    assert_sorted_by([n2], n2)
    assert_sorted_by([a2, b2, c2, d2], a2)
    # Leading constant keys: the first non-constant key decides the order.
    assert_sorted_by([z, b2, c2, d2], b2)
    assert_sorted_by([z, z, c2, d2], c2)
    assert_sorted_by([z, z, z, d2], d2)

    # Values that fit in 32 bits.
    a4 = ak.randint(0, 2**32, N)
    b4 = ak.randint(0, 2**32, N)
    n4 = ak.randint(-(2**31), 2**31, N)
    assert_sorted_by([a4], a4)
    assert_sorted_by([n4], n4)
    assert_sorted_by([a4, b4], a4)
    assert_sorted_by([b4, a4], b4)

    # Values spanning the 64-bit range.
    # NOTE(review): an upper bound of 2**64 exceeds the int64 maximum;
    # confirm how ak.randint treats it.
    a8 = ak.randint(0, 2**64, N)
    b8 = ak.randint(0, 2**64, N)
    n8 = ak.randint(-(2**63), 2**64, N)
    assert_sorted_by([a8], a8)
    assert_sorted_by([n8], n8)
    assert_sorted_by([b8, a8], b8)

    # Every ordering of the mixed-range keys; the first key must be sorted.
    for ordering in permutations([a2, a4, a8]):
        assert_sorted_by(ordering, ordering[0])
def check_large(N):
    """
    Check coargsort with ten int64 key arrays.

    Parameters
    ----------
    N : int
        Number of elements in each generated array.

    Raises
    ------
    AssertionError
        If the first key is not sorted after applying the permutation.
    """
    keys = []
    for _ in range(10):
        keys.append(ak.randint(0, 2**63, N))
    order = ak.coargsort(keys)
    primary = keys[0]
    assert ak.is_sorted(primary[order])
def __init__(self, segments, values, copy=False, lengths=None, grouping=None):
    """
    Build an array of variable-length sub-arrays (a "skyline" or ragged array).

    Parameters
    ----------
    segments : pdarray, int64
        Start index of each sub-array within the flattened ``values`` array.
        Must be sorted, unique and, when non-empty, start at zero.
    values : pdarray
        Flattened values of all sub-arrays.
    copy : bool
        If True, store slices (``[:]``) of the inputs; otherwise store
        references to the arrays passed in.
    lengths : optional
        Precomputed sub-array lengths. Not user-facing; used by the attach
        method.
    grouping : optional
        Precomputed grouping object. Not user-facing; used by the attach
        method.

    Raises
    ------
    TypeError
        If ``segments`` is not an int64 pdarray.
    ValueError
        If ``segments`` is unsorted or has duplicates, does not start at
        zero, points at or past the end of ``values``, or is empty while
        ``values`` is not.
    """
    wrong_type = not isinstance(segments, ak.pdarray) or segments.dtype != ak.int64
    if wrong_type:
        raise TypeError("Segments must be int64 pdarray")

    # Sortedness is tested first; uniqueness is only computed if that passes.
    ordering_error = "Segments must be unique and in sorted order"
    if not ak.is_sorted(segments):
        raise ValueError(ordering_error)
    if ak.unique(segments).size != segments.size:
        raise ValueError(ordering_error)

    has_segments = segments.size > 0
    if has_segments and (segments.min() != 0 or segments.max() >= values.size):
        raise ValueError(
            "Segments must start at zero and be less than values.size")
    if not has_segments and values.size > 0:
        raise ValueError(
            "Cannot have non-empty values with empty segments")

    self.segments, self.values = (
        (segments[:], values[:]) if copy else (segments, values)
    )
    self.size = segments.size
    self.valsize = values.size
    # Computed only after segments/values/sizes are stored on the instance.
    self.lengths = self._get_lengths() if lengths is None else lengths
    self.dtype = values.dtype

    if grouping is not None:
        self.grouping = grouping
    elif self.size == 0:
        self.grouping = ak.GroupBy(ak.zeros(0, dtype=ak.int64))
    else:
        # Treat each sub-array as a group, for grouped aggregations
        sub_array_ids = ak.broadcast(
            self.segments, ak.arange(self.size), self.valsize)
        self.grouping = ak.GroupBy(sub_array_ids)
def search_intervals(vals, intervals, assume_unique=False):
    """
    Given an array of query vals and non-overlapping, half-open (pythonic)
    intervals, return the index of the interval containing each query value,
    or -1 if not present in any interval.

    Parameters
    ----------
    vals : pdarray(int, float)
        Values to search for in intervals
    intervals : 2-tuple of pdarrays
        Non-overlapping, half-open intervals, as a tuple of
        (lower_bounds_inclusive, upper_bounds_exclusive). The lower bounds
        must be sorted in ascending order.
    assume_unique : bool
        If True, assume query vals are unique and skip the grouping step.
        Default: False.

    Returns
    -------
    idx : pdarray(int64)
        Index of interval containing each query value, or -1 if not found

    Raises
    ------
    ValueError
        If intervals does not have exactly two elements, the bound arrays
        differ in size, an upper bound is not greater than its lower bound,
        the lower bounds are unsorted, or the intervals overlap.
    TypeError
        If vals or either bound array is not an int64 or float64 pdarray.

    Notes
    -----
    The return idx satisfies the following condition:
    present = idx > -1
    ((intervals[0][idx[present]] <= vals[present]) &
     (intervals[1][idx[present]] > vals[present])).all()
    """
    if len(intervals) != 2:
        raise ValueError(
            "intervals must be 2-tuple of (lower_bound_inclusive, upper_bounds_exclusive)"
        )

    # Reject anything that is not an int64/float64 pdarray.
    def check_numeric(x):
        if not (isinstance(x, ak.pdarray) and x.dtype in (ak.int64, ak.float64)):
            raise TypeError("arguments must be numeric arrays")

    check_numeric(vals)
    check_numeric(intervals[0])
    check_numeric(intervals[1])
    low = intervals[0]
    # Convert to closed (inclusive) intervals
    # NOTE(review): subtracting 1 gives the inclusive upper bound for integer
    # data only; float64 bounds are accepted above but would be narrowed by
    # 1.0 here -- confirm the intended behavior for floats.
    high = intervals[1] - 1
    if low.size != high.size:
        raise ValueError("Lower and upper bound arrays must be same size")
    if not (high >= low).all():
        raise ValueError("Upper bounds must be greater than lower bounds")
    if not ak.is_sorted(low):
        raise ValueError("Intervals must be sorted in ascending order")
    # Each interval must begin after the previous one's inclusive end.
    if not (low[1:] > high[:-1]).all():
        raise ValueError("Intervals must be non-overlapping")
    if assume_unique:
        uvals = vals
    else:
        # Search only the distinct values; results are broadcast back at the end.
        g = ak.GroupBy(vals)
        uvals = g.unique_keys
    # Index of interval containing each unique value (initialized to -1: not found)
    containinginterval = -ak.ones(uvals.size, dtype=ak.int64)
    # Sort bounds and query values together. Layout of concat:
    # [0, low.size) lower bounds, [low.size, boundary) values, [boundary, ...) upper bounds.
    # NOTE(review): a value equal to a bound only lands between that
    # interval's bounds if equal keys keep their concat order (stable sort)
    # -- TODO confirm for ak.argsort.
    concat = ak.concatenate((low, uvals, high))
    perm = ak.argsort(concat)
    # iperm is the indices of the original values in the sorted array
    iperm = ak.argsort(perm)  # aku.invert_permutation(perm)
    boundary = uvals.size + low.size
    # indices of the lower bounds in the sorted array
    starts = iperm[:low.size]
    # indices of the upper bounds in the sorted array
    ends = iperm[boundary:]
    # which lower/upper bound pairs have any indices between them?
    valid = (ends > starts + 1)
    if valid.sum() > 0:
        # pranges is all the indices in sorted array that fall between a lower and an upper bound
        # (gen_ranges is defined elsewhere; segs is presumably the start
        # offset of each range within pranges -- verify against gen_ranges)
        segs, pranges = gen_ranges(starts[valid] + 1, ends[valid])
        # matches are the indices of those items in the original array
        matches = perm[pranges]
        # integer indices of each interval containing a hit
        hitidx = ak.arange(valid.size)[valid]
        # broadcast interval index out to matches
        matchintervalidx = ak.broadcast(segs, hitidx, matches.size)
        # make sure not to include any of the bounds themselves
        validmatch = (matches >= low.size) & (matches < boundary)
        # indices of unique values found (translated from concat keys)
        uvalidx = matches[validmatch] - low.size
        # set index of containing interval for uvals that were found
        containinginterval[uvalidx] = matchintervalidx[validmatch]
    if assume_unique:
        res = containinginterval
    else:
        # Expand per-unique-value results back to the original order of vals.
        res = g.broadcast(containinginterval, permute=True)
    return res