def remove_repeats(self, return_multiplicity=False):
    """
    Collapse each run of equal, adjacent values inside a sub-array
    down to one value.

    Parameters
    ----------
    return_multiplicity : bool
        If True, additionally return the length of every collapsed run.

    Returns
    -------
    norepeats : SegArray
        Sub-arrays in which every run of repeated values has been
        replaced by a single value
    multiplicity : SegArray
        Only returned when return_multiplicity=True. Holds, for each
        value of norepeats, how many times it was repeated in the
        original SegArray.
    """
    flat = self.values
    # A value is a repeat when it equals its predecessor...
    repeated = ak.zeros(flat.size, dtype=ak.bool)
    repeated[1:] = flat[:-1] == flat[1:]
    # ...unless it is the first value of its sub-array.
    repeated[self.segments] = False
    keep = ~repeated
    kept_vals = flat[keep]
    # Number of surviving values per sub-array -> new segment offsets
    kept_per_seg = self.grouping.sum(keep)[1]
    new_segs = ak.cumsum(kept_per_seg) - kept_per_seg
    condensed = SegArray(new_segs, kept_vals)
    if not return_multiplicity:
        return condensed
    # Run length = distance from one kept position to the next one
    # (or to the end of the values for the final run).
    kept_pos = ak.arange(self.valsize)[keep]
    run_lengths = ak.zeros(kept_vals.size, dtype=ak.int64)
    run_lengths[:-1] = kept_pos[1:] - kept_pos[:-1]
    run_lengths[-1] = self.valsize - kept_pos[-1]
    return condensed, SegArray(new_segs, run_lengths)
def test_plus_minus(self):
    """
    Check result types and values of + and - between Datetime and
    Timedelta vectors and scalars.

    Uses the fixture attributes dtvec1, dtvec2, dtscalar, tdvec1 and
    onesecond, which are set up outside this method.
    """
    # Datetime + Datetime not supported
    with self.assertRaises(TypeError):
        self.dtvec1 + self.dtvec2
    # Datetime slice -> Datetime
    leading = self.dtvec1[1:]
    trailing = self.dtvec1[:-1]
    self.assertIsInstance(leading, ak.Datetime)
    self.assertIsInstance(trailing, ak.Datetime)
    # Datetime - Datetime -> Timedelta
    diff = leading - trailing
    self.assertIsInstance(diff, ak.Timedelta)
    self.assertTrue((diff == self.onesecond).all())
    # Datetime - DatetimeScalar -> Timedelta
    diff = self.dtvec1 - self.dtscalar
    trange = ak.timedelta_range(start=0, periods=100, freq='s')
    self.assertIsInstance(diff, ak.Timedelta)
    self.assertTrue((diff == trange).all())
    # DatetimeScalar - Datetime -> Timedelta
    diff = self.dtscalar - self.dtvec1
    self.assertIsInstance(diff, ak.Timedelta)
    self.assertTrue((diff == (-trange)).all())
    # Datetime + TimedeltaScalar -> Datetime
    t = (trailing + self.onesecond)
    self.assertIsInstance(t, ak.Datetime)
    self.assertTrue((t == leading).all())
    # TimedeltaScalar + Datetime -> Datetime
    t = (self.onesecond + trailing)
    self.assertIsInstance(t, ak.Datetime)
    self.assertTrue((t == leading).all())
    # Datetime - TimedeltaScalar -> Datetime
    t = leading - self.onesecond
    self.assertIsInstance(t, ak.Datetime)
    self.assertTrue((t == trailing).all())
    # Datetime + Timedelta -> Datetime
    t = (trailing + self.tdvec1[1:])
    self.assertIsInstance(t, ak.Datetime)
    self.assertTrue((t == leading).all())
    # Timedelta + Datetime -> Datetime
    t = (self.tdvec1[1:] + trailing)
    self.assertIsInstance(t, ak.Datetime)
    self.assertTrue((t == leading).all())
    # Datetime - Timedelta -> Datetime
    t = (leading - self.tdvec1[1:])
    self.assertIsInstance(t, ak.Datetime)
    # Inverse of the "Datetime + Timedelta" case above, so the values
    # must land back on the trailing slice.
    self.assertTrue((t == trailing).all())
    # Timedelta + Timedelta -> Timedelta
    t = self.tdvec1 + self.tdvec1
    self.assertIsInstance(t, ak.Timedelta)
    self.assertTrue((t == ak.Timedelta(2*ak.ones(100, dtype=ak.int64), unit='s')).all())
    # Timedelta + TimedeltaScalar -> Timedelta
    t = self.tdvec1 + self.onesecond
    self.assertIsInstance(t, ak.Timedelta)
    self.assertTrue((t == ak.Timedelta(2*ak.ones(100, dtype=ak.int64), unit='s')).all())
    # Timedelta - Timedelta -> Timedelta
    t = self.tdvec1 - self.tdvec1
    self.assertIsInstance(t, ak.Timedelta)
    self.assertTrue((t == ak.Timedelta(ak.zeros(100, dtype=ak.int64), unit='s')).all())
    # Timedelta - TimedeltaScalar -> Timedelta
    t = self.tdvec1 - self.onesecond
    self.assertIsInstance(t, ak.Timedelta)
    self.assertTrue((t == ak.Timedelta(ak.zeros(100, dtype=ak.int64), unit='s')).all())
def gen_ranges(starts, ends):
    """
    Build the concatenation of the contiguous integer ranges
    [starts[i], ends[i]) as one flat array plus segment offsets.

    Parameters
    ----------
    starts : pdarray, int64
        First value of each range
    ends : pdarray, int64
        One past the last value of each range

    Returns
    -------
    segments : pdarray, int64
        Offset in the flat array at which each range begins
    ranges : pdarray, int64
        All ranges laid end to end

    Raises
    ------
    ValueError
        If starts and ends differ in size, or any range is empty
        (end not greater than start).
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same size")
    if starts.size == 0:
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    lengths = ends - starts
    if not (lengths > 0).all():
        raise ValueError("all ends must be greater than starts")
    offsets = ak.cumsum(lengths) - lengths
    # Start from a step of +1 everywhere; a cumulative sum of the steps
    # then counts upward inside each range.
    steps = ak.ones(lengths.sum(), dtype=ak.int64)
    # At each range boundary, replace the step with the jump from the last
    # value of the previous range to the first value of this one.
    first = ak.array([starts[0]])
    jumps = starts[1:] - starts[:-1] - lengths[:-1] + 1
    steps[offsets] = ak.concatenate((first, jumps))
    return offsets, ak.cumsum(steps)
def gen_ranges(starts, ends, stride=1):
    """
    Generate a segmented array of variable-length, evenly strided ranges
    between pairs of start- and end-points.

    Range i holds starts[i], starts[i] + stride, ... up to but excluding
    ends[i], i.e. the same values as Python's range(start, end, stride).

    Parameters
    ----------
    starts : pdarray, int64
        The start value of each range
    ends : pdarray, int64
        The end value (exclusive) of each range
    stride: int
        Difference between successive elements of each range; must be
        non-zero

    Returns
    -------
    segments : pdarray, int64
        The starting index of each range in the resulting array
    ranges : pdarray, int64
        The actual ranges, flattened into a single array

    Raises
    ------
    ValueError
        If stride is zero, starts and ends differ in size, or any range
        would be empty.
    """
    if stride == 0:
        raise ValueError("stride must be non-zero")
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same length")
    if starts.size == 0:
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Element count is ceil((end - start) / stride) because the end is
    # exclusive. Rounding is done by biasing the numerator so that, for
    # non-empty ranges, numerator and stride share a sign and the result
    # does not depend on floor-vs-truncate division semantics.
    bias = stride - 1 if stride > 0 else stride + 1
    lengths = (ends - starts + bias) // stride
    if not (lengths > 0).all():
        raise ValueError("all ranges must contain at least one element")
    segs = ak.cumsum(lengths) - lengths
    totlen = lengths.sum()
    # Inside a range every step equals stride (not 1); the cumulative sum
    # below turns the steps into values.
    slices = ak.ones(totlen, dtype=ak.int64) * stride
    # At each range boundary, step from the last value of the previous
    # range, starts[k-1] + (lengths[k-1] - 1) * stride, to starts[k].
    diffs = ak.concatenate(
        (ak.array([starts[0]]),
         starts[1:] - starts[:-1] - (lengths[:-1] - 1) * stride))
    slices[segs] = diffs
    return segs, ak.cumsum(slices)
def check_correctness(dtype):
    """
    Assert that ak.coargsort sorts a random column correctly when it is
    paired with an all-zero column, in either key order.

    dtype is expected to be 'int64' or 'float64'; no other value is
    handled.
    """
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
        z = ak.zeros(N, dtype=dtype)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64)
        z = ak.zeros(N, dtype=dtype)

    # The zero column carries no ordering information, so in both key
    # orders the permutation must leave `a` sorted.
    for key_order in ([a, z], [z, a]):
        order = ak.coargsort(key_order)
        assert ak.is_sorted(a[order])
def check_correctness(dtype, random, seed):
    """
    Assert that an arkouda scatter (akc[aki] = akv) produces the same
    array as the equivalent NumPy scatter.

    Parameters
    ----------
    dtype : str
        Value dtype; random values are supported for 'int64', 'float64'
        and 'bool'
    random : bool
        If True (or if seed is given), scatter random values; otherwise
        scatter ones
    seed : int or None
        Seed for NumPy's global random generator

    Raises
    ------
    ValueError
        If random values are requested for an unsupported dtype.
    AssertionError
        If the arkouda and NumPy results differ.
    """
    Ni = 10**4
    Nv = 10**4
    if seed is not None:
        np.random.seed(seed)
    # make indices unique
    # if indices are non-unique, results of unordered scatter are variable
    npi = np.arange(Ni)
    np.random.shuffle(npi)
    npc = np.zeros(Nv, dtype=dtype)
    aki = ak.array(npi)
    akc = ak.zeros(Nv, dtype=dtype)
    if random or seed is not None:
        if dtype == 'int64':
            npv = np.random.randint(0, 2**32, Ni)
        elif dtype == 'float64':
            npv = np.random.random(Ni)
        elif dtype == 'bool':
            # The upper bound is exclusive, so it must be 2 to produce both
            # False and True. np.bool_ is used because the np.bool alias is
            # missing from some NumPy releases.
            npv = np.random.randint(0, 2, Ni, dtype=np.bool_)
        else:
            raise ValueError(
                "Random values not supported for dtype {}".format(dtype))
    else:
        npv = np.ones(Ni, dtype=dtype)
    akv = ak.array(npv)
    npc[npi] = npv
    akc[aki] = akv
    assert np.allclose(npc, akc.to_ndarray())
def time_ak_scatter(isize, vsize, trials, dtype, random):
    """
    Time the arkouda scatter c[i] = v and print the average time and rate.

    isize and vsize are per-locale sizes; both are multiplied by the
    server's numLocales. The index vector is always random. The values are
    random when `random` is true (for 'int64' and 'float64') and ones
    otherwise. The scatter is repeated `trials` times.
    """
    print(">>> arkouda scatter")
    cfg = ak.get_config()
    nlocales = cfg["numLocales"]
    Ni = isize * nlocales
    Nv = vsize * nlocales
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(
        nlocales, Ni, Nv))
    # Index vector is always random
    i = ak.randint(0, Nv, Ni)
    c = ak.zeros(Nv, dtype=dtype)
    if not random:
        v = ak.ones(Ni, dtype=dtype)
    elif dtype == 'int64':
        v = ak.randint(0, 2**32, Ni)
    elif dtype == 'float64':
        v = ak.randint(0, 1, Ni, dtype=ak.float64)

    timings = []
    for _ in range(trials):
        began = time.time()
        c[i] = v
        finished = time.time()
        timings.append(finished - began)
    tavg = sum(timings) / trials

    print("Average time = {:.4f} sec".format(tavg))
    # The rate counts three arrays' worth of index-sized data per scatter.
    bytes_per_sec = (i.size * i.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
def append_single(self, x, prepend=False):
    '''
    Add one value to the end (or the front) of every sub-array.

    Parameters
    ----------
    x : pdarray or scalar
        Value(s) to add. An array must have one entry per sub-array and
        match the type and dtype of this SegArray's values.
    prepend : bool
        If True, place the new value at the front of each sub-array
        instead of at the end.

    Returns
    -------
    SegArray
        New SegArray in which each sub-array is one element longer

    Raises
    ------
    ValueError
        If x has a size that differs from the number of sub-arrays.
    TypeError
        If x is an array whose type or dtype does not match the values.
    '''
    if hasattr(x, 'size'):
        if x.size != self.size:
            raise ValueError(
                'Argument must be scalar or same size as SegArray')
        if type(x) != type(self.values) or x.dtype != self.dtype:
            raise TypeError(
                'Argument type must match value type of SegArray')
    grown = self.lengths + 1
    starts = ak.cumsum(grown) - grown
    merged = ak.zeros(grown.sum(), dtype=self.dtype)
    # The new value goes into the first or the last slot of each sub-array
    slot = starts if prepend else starts + grown - 1
    merged[slot] = x
    # Each original value moves right by the index of its sub-array (one
    # inserted slot per preceding sub-array), plus one more when the new
    # value sits in front of it.
    dest = ak.arange(self.valsize) + self.grouping.broadcast(
        ak.arange(self.size), permute=True)
    if prepend:
        dest += 1
    merged[dest] = self.values
    return SegArray(starts, merged)
def from_multi_array(cls, m):
    """
    Construct a SegArray from a list of columns. This essentially
    transposes the input, resulting in an array of rows.

    Parameters
    ----------
    m : pdarray or list of pdarray
        List of columns, the rows of which will form the sub-arrays of
        the output. A single pdarray is treated as one column.

    Returns
    -------
    SegArray
        Array of rows of input

    Raises
    ------
    ValueError
        If the columns do not all share one length and one dtype
        (this includes an empty list of columns).
    """
    # Normalise to a list of columns so a lone pdarray is copied with a
    # single strided assignment rather than one assignment per element.
    if isinstance(m, ak.pdarray):
        columns = [m]
    else:
        columns = list(m)
    sizes = set(col.size for col in columns)
    if len(sizes) != 1:
        raise ValueError("All columns must have same length")
    dtypes = set(col.dtype for col in columns)
    if len(dtypes) != 1:
        raise ValueError("All columns must have same dtype")
    size = sizes.pop()
    dtype = dtypes.pop()
    n = len(columns)
    # Row i occupies positions [i*n, (i+1)*n) of the flat values
    newsegs = ak.arange(size) * n
    newvals = ak.zeros(size * n, dtype=dtype)
    for j, col in enumerate(columns):
        # Column j supplies the j-th entry of every row
        newvals[j::n] = col
    return cls(newsegs, newvals)
def compare_strategies(length, ncat, op, dtype):
    """
    Time GroupBy construction and aggregation with the second GroupBy
    argument set to False ("global") and True ("local"), print the
    timings, and report whether the two strategies agree.

    Returns the tuple (global groupby time, global reduce time,
    local groupby time, local reduce time) in seconds.
    """
    def timed(label, thunk):
        # Print the label, run the step, print and return its duration
        print(label, end=' ')
        began = time()
        outcome = thunk()
        elapsed = time() - began
        print(elapsed)
        return outcome, elapsed

    keys = ak.randint(0, ncat, length)
    if dtype == 'int64':
        vals = ak.randint(0, length // ncat, length)
    elif dtype == 'bool':
        vals = ak.zeros(length, dtype='bool')
        for i in np.random.randint(0, length, ncat // 2):
            vals[i] = True
    else:
        vals = ak.linspace(-1, 1, length)

    gg, ggtime = timed("Global groupby", lambda: ak.GroupBy(keys, False))
    (gk, gv), grtime = timed("Global reduce",
                             lambda: gg.aggregate(vals, op))
    lg, lgtime = timed("Local groupby", lambda: ak.GroupBy(keys, True))
    (lk, lv), lrtime = timed("Local reduce",
                             lambda: lg.aggregate(vals, op))

    print(f"Keys match? {(gk == lk).all()}")
    print(f"Absolute diff of vals = {ak.abs(gv - lv).sum()}")
    return ggtime, grtime, lgtime, lrtime
def expand(vals, segs, size):
    """
    Spread one value per segment across every position of that segment.

    Does the same job as ak.GroupBy.broadcast(vals), but takes the segment
    offsets and the output size explicitly.

    Parameters
    ----------
    vals : pdarray
        One value per segment
    segs : pdarray
        Start index of each segment
    size : int
        Length of the output array

    Returns
    -------
    pdarray
        Array of length `size` in which each segment is filled with its
        value

    Raises
    ------
    ValueError
        If vals and segs differ in size, size is too small for the
        segments, or segs does not start at zero and strictly increase.
    """
    if vals.size != segs.size:
        raise ValueError("vals and segs must have same size")
    if vals.size == 0:
        return ak.array([])
    if size < segs.size or size <= segs.max():
        raise ValueError("Total size cannot be less than max segment")
    increasing = (segs[:-1] < segs[1:]).all()
    if segs[0] != 0 or not increasing:
        raise ValueError(
            "segs must start at zero and be monotonically increasing")
    # Place the change in value at each segment start; a running sum then
    # reproduces the value over the whole segment.
    impulses = ak.zeros(size, dtype=vals.dtype)
    head = ak.array([vals[0]])
    impulses[segs] = ak.concatenate((head, vals[1:] - vals[:-1]))
    return ak.cumsum(impulses)
def interval_lookup(keys, values, arguments, fillvalue=-1):
    '''
    Evaluate a piecewise-constant function, defined on non-overlapping
    intervals, at each argument.

    Parameters
    ----------
    keys : 2-tuple of pdarray
        Non-overlapping, half-open intervals given as
        (lower_bounds_inclusive, upper_bounds_exclusive)
    values : pdarray
        Function value for each interval in keys
    arguments : pdarray
        Points at which to evaluate the function
    fillvalue : scalar
        Result for arguments that fall in no interval. If None, the
        result is left at zero for those arguments.

    Returns
    -------
    pdarray
        For each argument, the value of the interval containing it, or
        fillvalue when no interval contains it.
    '''
    hit = search_intervals(arguments, keys, assume_unique=True)
    out = ak.zeros(arguments.size, dtype=values.dtype)
    if fillvalue is not None:
        out.fill(fillvalue)
    # Indices greater than -1 are treated as matches
    matched = hit > -1
    out[matched] = values[hit[matched]]
    return out
def _get_lengths(self):
    """
    Compute the length of every sub-array from the segment offsets and
    the total number of values.
    """
    if self.size == 0:
        return ak.zeros(0, dtype=ak.int64)
    total = ak.array([self.valsize])
    if self.size == 1:
        # A single sub-array owns all the values
        return total
    # Each sub-array ends where the next begins; the last ends at valsize
    ends = ak.concatenate((self.segments[1:], total))
    return ends - self.segments
def create_ak_array(N, op, dtype, seed):
    """
    Create an arkouda array of length N using the named constructor.

    Parameters
    ----------
    N : int
        Length of the array
    op : str
        One of 'zeros', 'ones' or 'randint'
    dtype
        dtype passed through to the arkouda constructor
    seed
        Seed passed to ak.randint; ignored by the other constructors

    Returns
    -------
    The array produced by the chosen arkouda constructor.

    Raises
    ------
    ValueError
        If op is not one of the supported names.
    """
    if op == 'zeros':
        a = ak.zeros(N, dtype=dtype)
    elif op == 'ones':
        a = ak.ones(N, dtype=dtype)
    elif op == 'randint':
        a = ak.randint(0, 2**32, N, dtype=dtype, seed=seed)
    else:
        # Fail with a clear message instead of an unbound-variable error
        raise ValueError(
            "Unsupported op {!r}; expected 'zeros', 'ones' or 'randint'"
            .format(op))
    return a
def check_zeros(N):
    """
    Check that ak.zeros(N) equals a NumPy zeros array of length N after
    it is transferred to arkouda; returns the result of pass_fail.
    """
    # create np version (sized by N rather than a fixed length)
    a = ak.array(np.zeros(N))
    # create ak version
    b = ak.zeros(N)
    c = a == b
    return pass_fail(c.all())
def check_zeros(N):
    """
    Check that ak.zeros(N), pulled back to NumPy, equals np.zeros(N);
    returns the result of pass_fail.
    """
    expected = np.zeros(N)
    actual = ak.zeros(N)
    matches = expected == actual.to_ndarray()
    return pass_fail(matches.all())
def check_correctness(dtype, seed):
    """
    Run ak.coargsort on a random column paired with a constant column, in
    both key orders. For 'int64' and 'float64' the random column is
    asserted to be sorted by the resulting permutation; for 'str' the
    sort is only exercised.
    """
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
        z = ak.zeros(N, dtype=dtype)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        z = ak.zeros(N, dtype=dtype)
    elif dtype == 'str':
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
        z = ak.cast(ak.zeros(N), 'str')

    verify_order = dtype in ('int64', 'float64')
    for key_order in ([a, z], [z, a]):
        order = ak.coargsort(key_order)
        if verify_order:
            assert ak.is_sorted(a[order])
def generate_arrays(length, nkeys, nvals, dtype='int64'):
    """
    Produce a (keys, vals) pair of arkouda arrays of the given length.

    keys are random integers in [0, nkeys). vals depend on dtype:
    'int64' gives random integers in [0, nvals); 'bool' gives an
    all-False array with nkeys // 2 randomly chosen positions set to
    True; anything else gives ak.linspace(-1, 1, length).
    """
    keys = ak.randint(0, nkeys, length)
    if dtype == 'int64':
        return keys, ak.randint(0, nvals, length)
    if dtype != 'bool':
        return keys, ak.linspace(-1, 1, length)
    flags = ak.zeros(length, dtype='bool')
    # Positions may repeat, so at most nkeys // 2 entries become True
    for pos in np.random.randint(0, length, nkeys // 2):
        flags[pos] = True
    return keys, flags
def __eq__(self, other):
    """
    Element-wise equality of sub-arrays: returns a bool array with one
    entry per sub-array. Sub-arrays whose lengths differ compare False.
    Returns NotImplemented when other is not a SegArray.
    """
    if not isinstance(other, SegArray):
        return NotImplemented
    result = ak.zeros(self.size, dtype=ak.bool)
    same_len = self.lengths == other.lengths
    if same_len.sum() <= 0:
        # No pair of sub-arrays can be equal
        return result
    # Only sub-arrays of matching length are compared value by value
    mine = self[same_len]
    theirs = other[same_len]
    # NOTE(review): the reduction is called on self, while its argument
    # has one entry per value of the length-filtered SegArray -- confirm
    # self.all handles this when some lengths differ.
    result[same_len] = self.all(mine.values == theirs.values)
    return result
def IP_like(N):
    '''
    Yield a two-column int64 data set resembling a 90/10 mix of IPv4 and
    IPv6 addresses.

    A pool of N // 20 unique "addresses" is generated and then sampled
    with replacement N // 2 times. Yields a single
    (label, (column1, column2)) pair.
    '''
    multiplicity = 10
    pool_size = N // (2 * multiplicity)
    # First generate unique addresses, then sample with replacement
    low_words = ak.zeros(pool_size, dtype=ak.int64)
    high_words = ak.zeros(pool_size, dtype=ak.int64)
    is_v4 = ak.uniform(pool_size) < 0.9
    count_v4 = is_v4.sum()
    is_v6 = ~is_v4
    count_v6 = is_v4.size - count_v4
    # v4-like rows use only the first column, with values below 2**32;
    # v6-like rows draw both columns from the given 64-bit bounds.
    low_words[is_v4] = ak.randint(0, 2**32, count_v4)
    low_words[is_v6] = ak.randint(-2**63, 2**63, count_v6)
    high_words[is_v6] = ak.randint(-2**63, 2**63, count_v6)
    picks = ak.randint(0, pool_size, N // 2)
    yield 'IP-like 2*int64', (low_words[picks], high_words[picks])
def in1dmulti(a, b, assume_unique=False):
    """
    Multi-column version of ak.in1d: for each row of a, test whether the
    same row occurs in b.

    Parameters
    ----------
    a : list of pdarrays
        Columns whose rows are tested for membership in b
    b : list of pdarrays
        Columns whose rows form the set to test against
    assume_unique : bool
        If true, the rows of a and of b are taken to be unique already
        and are used as given. Otherwise both are reduced to their unique
        rows with ak.GroupBy first.

    Returns
    -------
    pdarray, bool
        True for each row of a that is also a row of b

    Notes:
        Only works for pdarrays of int64 dtype, Strings, or Categorical
    """
    if assume_unique:
        rows_a = a
        rows_b = b
    else:
        by_a = ak.GroupBy(a)
        rows_a = by_a.unique_keys
        by_b = ak.GroupBy(b)
        rows_b = by_b.unique_keys
    # Stack the unique rows of a on top of those of b. A row that shows
    # up exactly twice in the stack is present on both sides.
    stacked = [ak.concatenate(pair) for pair in zip(rows_a, rows_b)]
    by_row = ak.GroupBy(stacked)
    _, counts = by_row.count()
    in_both = ak.zeros(stacked[0].size, dtype=ak.bool)
    in_both[by_row.permutation] = (by_row.broadcast(1 * (counts == 2)) == 1)
    if assume_unique:
        # Leading entries of the stack are the rows of a
        return in_both[:a[0].size]
    # Map the per-unique-row answer back onto every original row of a
    answer = ak.zeros(a[0].size, dtype=ak.bool)
    answer[by_a.permutation] = (
        by_a.broadcast(1 * in_both[:rows_a[0].size]) == 1)
    return answer
def check_float(N):
    """
    Assert that ak.coargsort orders float64 columns correctly: in every
    case the leading non-constant key must come out sorted.
    """
    a = ak.randint(0, 1, N, dtype=ak.float64)
    n = ak.randint(-1, 1, N, dtype=ak.float64)
    z = ak.zeros(N, dtype=ak.float64)

    # (sort keys, column that must be sorted by the permutation)
    cases = (
        ([a], a),
        ([a, n], a),
        ([n, a], n),
        ([z, a], a),
        ([z, n], n),
    )
    for sort_keys, leading in cases:
        order = ak.coargsort(sort_keys)
        assert ak.is_sorted(leading[order])
def get_jth(self, j, return_origins=True, compressed=False, default=0):
    """
    Pick the element at position j from every sub-array that has one.

    Parameters
    ----------
    j : int
        Position to read in each sub-array. A negative j counts back
        from the end of each sub-array.
    return_origins : bool
        If True, also return a Boolean mask of the sub-arrays for which
        j is in bounds
    compressed : bool
        If False, the result has one entry per sub-array, holding
        `default` where j is out of bounds. If True, the result only has
        entries for sub-arrays where j is in bounds.
    default : scalar
        Fill value used where j is out of bounds (compressed=False only)

    Returns
    -------
    val : pdarray
        The selected values, padded with `default` or compressed as
        described above
    origin_indices : pdarray, bool
        Only when return_origins=True: True where j is in bounds for the
        sub-array.
    """
    in_bounds, offset = self._normalize_index(j)
    # Flat positions of the j-th element, for sub-arrays that have one
    picks = (self.segments + offset)[in_bounds]
    if compressed:
        selected = self.values[picks]
    else:
        selected = ak.zeros(self.size, dtype=self.dtype) + default
        selected[in_bounds] = self.values[picks]
    if not return_origins:
        return selected
    return selected, in_bounds
def expand(size, segs, vals):
    """
    Fill an array of length `size` so that each segment carries its value.

    Parameters
    ----------
    size : int
        Length of the output array (passed to ak.zeros)
    segs : ak.pdarray
        Index at which each segment starts
    vals : ak.pdarray
        Value for each segment

    Returns
    -------
    pdarray
        The expanded array.
    """
    # Write the change in value at each segment start; the running sum
    # then holds each value until the next segment begins.
    impulses = ak.zeros(size, vals.dtype)
    head = ak.array([vals[0]])
    steps = vals[1:] - vals[:-1]
    impulses[segs] = ak.concatenate((head, steps))
    return ak.cumsum(impulses)
def _convert_strings(self, s):
    '''
    Turn strings of field names into int64 bit vectors, setting bit
    `shift` for every name (paired with its shift via self.names and
    self.shifts) that is present in the string.
    '''
    # Initialize to zero
    values = ak.zeros(s.size, dtype=ak.int64)
    if self.separator == '':
        # With an empty separator, field names are guaranteed to be single
        # characters, so a substring test is enough.
        def has_field(name):
            return s.contains(name)
    else:
        # Split every string on the separator and remember which original
        # string each piece came from.
        sf, segs = s.flatten(self.separator, return_segments=True)
        orig = ak.broadcast(segs, ak.arange(segs.size), sf.size)
        g = ak.GroupBy(orig)

        def has_field(name):
            # True where any piece of the originating string equals name
            return g.any(sf == name)[1]

    for name, shift in zip(self.names, self.shifts):
        values = values | ak.where(has_field(name), 1 << shift, 0)
    return values
def check_int(N):
    """
    Assert that ak.coargsort orders integer columns correctly for value
    ranges of roughly 2, 4 and 8 bytes, alone and in combination, and
    with constant leading columns.
    """
    from itertools import permutations

    def assert_sorted_by(sort_keys, leading):
        # `leading` is the first non-constant key, so it must be sorted
        order = ak.coargsort(sort_keys)
        assert ak.is_sorted(leading[order])

    z = ak.zeros(N, dtype=ak.int64)

    a2 = ak.randint(0, 2**16, N)
    b2 = ak.randint(0, 2**16, N)
    c2 = ak.randint(0, 2**16, N)
    d2 = ak.randint(0, 2**16, N)
    n2 = ak.randint(-(2**15), 2**15, N)
    assert_sorted_by([a2], a2)
    assert_sorted_by([n2], n2)
    assert_sorted_by([a2, b2, c2, d2], a2)
    assert_sorted_by([z, b2, c2, d2], b2)
    assert_sorted_by([z, z, c2, d2], c2)
    assert_sorted_by([z, z, z, d2], d2)

    a4 = ak.randint(0, 2**32, N)
    b4 = ak.randint(0, 2**32, N)
    n4 = ak.randint(-(2**31), 2**31, N)
    assert_sorted_by([a4], a4)
    assert_sorted_by([n4], n4)
    assert_sorted_by([a4, b4], a4)
    assert_sorted_by([b4, a4], b4)

    a8 = ak.randint(0, 2**64, N)
    b8 = ak.randint(0, 2**64, N)
    n8 = ak.randint(-(2**63), 2**64, N)
    assert_sorted_by([a8], a8)
    assert_sorted_by([n8], n8)
    assert_sorted_by([b8, a8], b8)

    # Mixed widths, in every key order
    for combo in permutations([a2, a4, a8]):
        assert_sorted_by(combo, combo[0])
def inner_join(left, right, wherefunc=None, whereargs=None):
    '''Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If wherefunc does not take exactly two parameters, whereargs is
        not a 2-tuple of arrays sized like left and right, or wherefunc
        raises when evaluated on a small sample of whereargs.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`

    '''
    from inspect import signature
    # At most 5 elements are used to smoke-test wherefunc below
    sample = min((left.size, right.size, 5))
    if wherefunc is not None:
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        # Evaluate wherefunc on a small sample so a broken function is
        # reported before the expensive join work starts.
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Need dense 0-up right index, to filter out left not in right
    # NOTE(review): presumably denseLeft only covers the left items flagged
    # in keep -- confirm against right_align.
    keep, (denseLeft, denseRight) = right_align(left, right)
    if keep.sum() == 0:
        # Intersection is empty
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Convert the boolean mask into the indices of the kept left items
    keep = ak.arange(keep.size)[keep]
    # GroupBy right
    byRight = ak.GroupBy(denseRight)
    # Get segment boundaries (starts, ends) of right for each left item
    rightSegs = ak.concatenate((byRight.segments, ak.array([denseRight.size])))
    starts = rightSegs[denseLeft]
    ends = rightSegs[denseLeft+1]
    # Total number of candidate pairs (only used by the debug print below)
    fullSize = (ends - starts).sum()
    # print(f"{left.size+right.size:,} input rows --> {fullSize:,} joins ({fullSize/(left.size+right.size):.1f} x) ")
    # gen_ranges for gather of right items
    fullSegs, ranges = gen_ranges(starts, ends)
    # Evaluate where clause
    if wherefunc is None:
        filtRanges = ranges
        filtSegs = fullSegs
        keep12 = keep
    else:
        # Gather right whereargs
        rightWhere = whereargs[1][byRight.permutation][ranges]
        # Expand left whereargs
        leftWhere = ak.broadcast(fullSegs, whereargs[0][keep], ranges.size)
        # Evaluate wherefunc and filter ranges, recompute segments
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        filtRanges = ranges[whereSatisfied]
        # Exclusive prefix count of satisfied pairs: sampled at the old
        # segment starts it gives each segment's start after filtering.
        scan = ak.cumsum(whereSatisfied) - whereSatisfied
        filtSegsWithZeros = scan[fullSegs]
        filtSegSizes = ak.concatenate((filtSegsWithZeros[1:] - filtSegsWithZeros[:-1], ak.array([whereSatisfied.sum() - filtSegsWithZeros[-1]])))
        # Drop left items for which no pair survived the filter
        keep2 = (filtSegSizes > 0)
        filtSegs = filtSegsWithZeros[keep2]
        keep12 = keep[keep2]
    # Gather right inds and expand left inds
    rightInds = byRight.permutation[filtRanges]
    leftInds = ak.broadcast(filtSegs, ak.arange(left.size)[keep12], filtRanges.size)
    return leftInds, rightInds
def __init__(self, segments, values, copy=False, lengths=None, grouping=None):
    """
    Build an array whose elements are variable-length arrays (also known
    as a ragged or skyline array).

    Parameters
    ----------
    segments : pdarray, int64
        Offset into values at which each sub-array starts
    values : pdarray
        All sub-arrays, flattened end to end
    copy : bool
        If True, store copies of segments and values; otherwise keep
        references to the arrays passed in.

    Raises
    ------
    TypeError
        If segments is not an int64 pdarray.
    ValueError
        If segments is not sorted and unique, does not start at zero,
        reaches past the end of values, or is empty while values is not.

    Notes
    -----
    Keyword args 'lengths' and 'grouping' are not user-facing. They are
    used by the attach method.
    """
    if not isinstance(segments, ak.pdarray) or segments.dtype != ak.int64:
        raise TypeError("Segments must be int64 pdarray")
    if not ak.is_sorted(segments) or (ak.unique(segments).size != segments.size):
        raise ValueError("Segments must be unique and in sorted order")
    if segments.size > 0:
        if segments.min() != 0 or segments.max() >= values.size:
            raise ValueError(
                "Segments must start at zero and be less than values.size")
    elif values.size > 0:
        raise ValueError(
            "Cannot have non-empty values with empty segments")

    self.segments = segments[:] if copy else segments
    self.values = values[:] if copy else values
    self.size = segments.size
    self.valsize = values.size
    # Precomputed lengths are used as given; otherwise derive them
    self.lengths = self._get_lengths() if lengths is None else lengths
    self.dtype = values.dtype

    if grouping is not None:
        self.grouping = grouping
    elif self.size == 0:
        self.grouping = ak.GroupBy(ak.zeros(0, dtype=ak.int64))
    else:
        # Treat each sub-array as a group, for grouped aggregations:
        # label every value with the index of its sub-array.
        owner = ak.broadcast(
            self.segments, ak.arange(self.size), self.valsize)
        self.grouping = ak.GroupBy(owner)
def inner_join2(left, right, wherefunc=None, whereargs=None, forceDense=False):
    '''Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc
    forceDense : bool
        If True, always remap the joined values to dense 0-up codes
        before counting. Otherwise the remap only happens when the
        largest shared value exceeds 3 * (left.size + right.size).

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If left or right is not an int64 pdarray, wherefunc does not
        take exactly two parameters, whereargs is not a 2-tuple of
        arrays sized like left and right, or wherefunc raises when
        evaluated on a small sample of whereargs.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`

    '''
    if not isinstance(left, ak.pdarray) or left.dtype != ak.int64 or not isinstance(right, ak.pdarray) or right.dtype != ak.int64:
        raise ValueError("left and right must be pdarray(int64)")
    if wherefunc is not None:
        from inspect import signature
        # At most 5 elements are used to smoke-test wherefunc below
        sample = min((left.size, right.size, 5))
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        # Evaluate wherefunc on a small sample so a broken function is
        # reported before the expensive join work starts.
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Only join on intersection
    inter = ak.intersect1d(left, right)
    # Indices of left values present in intersection
    leftInds = ak.arange(left.size)[ak.in1d(left, inter)]
    # Left vals in intersection
    leftFilt = left[leftInds]
    # Indices of right vals present in inter
    rightInds = ak.arange(right.size)[ak.in1d(right, inter)]
    # Right vals in inter
    rightFilt = right[rightInds]
    byLeft = ak.GroupBy(leftFilt)
    byRight = ak.GroupBy(rightFilt)
    # NOTE(review): no early return for an empty intersection -- confirm
    # inter.max() is safe when left and right share no values.
    maxVal = inter.max()
    # Values are used directly as indices into count arrays of size
    # maxVal + 1 unless they are large relative to the input (or dense
    # codes are forced), in which case they are remapped first.
    if forceDense or maxVal > 3*(left.size + right.size):
        # Remap intersection to dense, 0-up codes
        # Replace left values with dense codes
        uniqLeftVals = byLeft.unique_keys
        uniqLeftCodes = ak.arange(inter.size)[ak.in1d(inter, uniqLeftVals)]
        leftCodes = ak.zeros_like(leftFilt) - 1
        leftCodes[byLeft.permutation] = byLeft.broadcast(uniqLeftCodes, permute=False)
        # Replace right values with dense codes
        uniqRightVals = byRight.unique_keys
        uniqRightCodes = ak.arange(inter.size)[ak.in1d(inter, uniqRightVals)]
        rightCodes = ak.zeros_like(rightFilt) - 1
        rightCodes[byRight.permutation] = byRight.broadcast(uniqRightCodes, permute=False)
        countSize = inter.size
    else:
        uniqLeftCodes = byLeft.unique_keys
        uniqRightCodes = byRight.unique_keys
        leftCodes = leftFilt
        rightCodes = rightFilt
        countSize = maxVal + 1
    # Expand indices to product domain
    # First count occurrences of each code in left and right
    leftCounts = ak.zeros(countSize, dtype=ak.int64)
    leftCounts[uniqLeftCodes] = byLeft.count()[1]
    rightCounts = ak.zeros(countSize, dtype=ak.int64)
    rightCounts[uniqRightCodes] = byRight.count()[1]
    # Repeat each left index as many times as that code occurs in right
    prodLeft = rightCounts[leftCodes]
    leftFullInds = ak.broadcast(ak.cumsum(prodLeft)-prodLeft, leftInds, prodLeft.sum())
    # Likewise repeat each right index once per occurrence in left
    prodRight = leftCounts[rightCodes]
    rightFullInds = ak.broadcast(ak.cumsum(prodRight)-prodRight, rightInds, prodRight.sum())
    # NOTE(review): both expansions follow their own input order, so
    # position k of leftFullInds and rightFullInds looks like it need not
    # refer to the same joined value (e.g. left=[1, 2], right=[2, 1]) --
    # confirm the pairing guarantee stated in the docstring.
    # Evaluate where clause
    if wherefunc is None:
        return leftFullInds, rightFullInds
    else:
        # Gather whereargs
        leftWhere = whereargs[0][leftFullInds]
        rightWhere = whereargs[1][rightFullInds]
        # Evaluate wherefunc and filter ranges, recompute segments
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        return leftFullInds[whereSatisfied], rightFullInds[whereSatisfied]
def concat(cls, x, axis=0, ordered=True):
    """
    Concatenate a sequence of SegArrays

    Parameters
    ----------
    x : sequence of SegArray
        The SegArrays to concatenate
    axis : 0 or 1
        Select vertical (0) or horizontal (1) concatenation. If axis=1, all
        SegArrays must have same size.
    ordered : bool
        Must be True. This option is present for compatibility only, because unordered
        concatenation is not yet supported.

    Returns
    -------
    SegArray
        The input arrays joined into one SegArray. NotImplemented is
        returned instead if any element of x is not an instance of cls.
        For axis=1 with zero-size inputs, the first input is returned
        as is.

    Raises
    ------
    ValueError
        If ordered is False, x is empty, the dtypes differ, the sizes
        differ (axis=1 only), or axis is neither 0 nor 1.
    """
    if not ordered:
        raise ValueError(
            "Unordered concatenation not yet supported on SegArray; use ordered=True."
        )
    if len(x) == 0:
        raise ValueError("Empty sequence passed to concat")
    for xi in x:
        if not isinstance(xi, cls):
            return NotImplemented
    if len(set(xi.dtype for xi in x)) != 1:
        raise ValueError(
            "SegArrays must all have same dtype to concatenate")
    if axis == 0:
        # Vertical: the sub-arrays of each input follow those of the
        # previous input.
        ctr = 0
        segs = []
        vals = []
        for xi in x:
            # Segment offsets need to be raised by length of previous values
            segs.append(xi.segments + ctr)
            ctr += xi.valsize
            # Values can just be concatenated
            vals.append(xi.values)
        return cls(ak.concatenate(segs), ak.concatenate(vals))
    elif axis == 1:
        # Horizontal: the i-th sub-arrays of all inputs are joined end
        # to end.
        sizes = set(xi.size for xi in x)
        if len(sizes) != 1:
            raise ValueError(
                "SegArrays must all have same size to concatenate with axis=1"
            )
        if sizes.pop() == 0:
            return x[0]
        dt = list(x)[0].dtype
        # Length of each joined sub-array is the sum of the input lengths
        newlens = sum(xi.lengths for xi in x)
        newsegs = ak.cumsum(newlens) - newlens
        # Ignore sub-arrays that are empty in all arrays
        # (the final sub-array is always kept by this mask)
        nonzero = ak.concatenate(
            (newsegs[:-1] < newsegs[1:], ak.array([True])))
        # nzsegs tracks, per kept sub-array, where the current input's
        # values begin in the output; it advances after each input.
        nzsegs = newsegs[nonzero]
        newvals = ak.zeros(newlens.sum(), dtype=dt)
        for xi in x:
            # Set up fromself for a scan, so that it steps up at the start of a segment
            # from the current array, and steps back down at the end
            fromself = ak.zeros(newvals.size + 1, dtype=ak.int64)
            fromself[nzsegs] += 1
            nzlens = xi.lengths[nonzero]
            fromself[nzsegs + nzlens] -= 1
            # After the scan, positions equal to 1 belong to this input
            fromself = (ak.cumsum(fromself[:-1]) == 1)
            newvals[fromself] = xi.values
            # The next input's values start right after this input's
            nzsegs += nzlens
        return cls(newsegs, newvals, copy=False)
    else:
        raise ValueError(
            "Supported values for axis are 0 (vertical concat) or 1 (horizontal concat)"
        )