def gen_ranges(starts, ends, stride=1):
    """
    Generate a segmented array of variable-length, contiguous ranges
    between pairs of start- and end-points.

    Each range follows the semantics of Python's ``range(start, end, stride)``:
    it contains ``start, start + stride, ...`` up to, but not including, ``end``.

    Parameters
    ----------
    starts : pdarray, int64
        The start value of each range
    ends : pdarray, int64
        The end value (exclusive) of each range
    stride : int
        Difference between successive elements of each range. May be
        negative, but not zero.

    Returns
    -------
    segments : pdarray, int64
        The starting index of each range in the resulting array
    ranges : pdarray, int64
        The actual ranges, flattened into a single array

    Raises
    ------
    ValueError
        If starts and ends differ in size, if stride is zero, or if any end
        lies on the wrong side of its start for the sign of stride.

    Notes
    -----
    A range whose start equals its end is empty: it contributes no values and
    its segment offset equals that of the following range.
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same length")
    if starts.size == 0:
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    if stride == 0:
        raise ValueError("stride must be nonzero")
    span = ends - starts
    if stride > 0:
        sign = 1
        right_way = span >= 0
    else:
        sign = -1
        right_way = span <= 0
    if not right_way.all():
        raise ValueError(
            "each end must lie in the direction of stride from its start")
    # Ceiling division written so that numerator and divisor always share a
    # sign; a partial final step still contributes an element.
    lengths = (span + (stride - sign)) // stride
    segs = ak.cumsum(lengths) - lengths
    totlen = lengths.sum()
    if totlen == 0:
        return segs, ak.zeros(0, dtype=ak.int64)
    # Empty ranges share their offset with the next range; leaving them out
    # of the scatter keeps the scatter indices unique and in bounds.
    nonempty = lengths > 0
    ne_starts = starts[nonempty]
    ne_lengths = lengths[nonempty]
    # Within a range every step is `stride`; at the first slot of a range the
    # step is replaced by the jump from the last value of the previous range.
    slices = ak.ones(totlen, dtype=ak.int64) * stride
    diffs = ak.concatenate(
        (ak.array([ne_starts[0]]),
         ne_starts[1:] - ne_starts[:-1] - (ne_lengths[:-1] - 1) * stride))
    slices[segs[nonempty]] = diffs
    return segs, ak.cumsum(slices)
def gen_ranges(starts, ends):
    """
    Generate a segmented array of variable-length, contiguous ranges
    between pairs of start- and end-points.

    Parameters
    ----------
    starts : pdarray, int64
        The start value of each range
    ends : pdarray, int64
        The end value (exclusive) of each range

    Returns
    -------
    segments : pdarray, int64
        The starting index of each range in the resulting array
    ranges : pdarray, int64
        The actual ranges, flattened into a single array

    Raises
    ------
    ValueError
        If starts and ends differ in size, or if any end is not strictly
        greater than its start.

    Notes
    -----
    Empty input yields two empty int64 arrays.
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same size")
    if starts.size == 0:
        # Nothing to generate; the code below would index starts[0]
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    lengths = ends - starts
    if not (lengths > 0).all():
        raise ValueError("all ends must be greater than starts")
    segs = ak.cumsum(lengths) - lengths
    totlen = lengths.sum()
    # Steps of 1 everywhere, except at the first slot of each range, where
    # the step is the jump from the last value of the previous range.
    slices = ak.ones(totlen, dtype=ak.int64)
    diffs = ak.concatenate(
        (ak.array([starts[0]]),
         starts[1:] - starts[:-1] - lengths[:-1] + 1))
    slices[segs] = diffs
    return segs, ak.cumsum(slices)
def expand(vals, segs, size):
    """
    Broadcast per-segment values to a segmented array. Equivalent to
    ak.GroupBy.broadcast(vals) but accepts explicit segments and size
    arguments.

    Parameters
    ----------
    vals : pdarray
        Values (one per segment) to broadcast over segments. Must support
        subtraction, because the result is built from successive differences.
    segs : pdarray
        Start indices of segments
    size : int
        Total size of result array

    Returns
    -------
    pdarray
        Values broadcasted out to segments. When vals is empty the result is
        an empty array with the dtype of vals.

    Raises
    ------
    ValueError
        If vals and segs differ in size, if size is too small to hold every
        segment, or if segs does not start at zero and increase strictly.
    """
    if vals.size != segs.size:
        raise ValueError("vals and segs must have same size")
    if vals.size == 0:
        # Keep the caller's dtype rather than the default of ak.array([])
        return ak.zeros(0, dtype=vals.dtype)
    if size < segs.size or size <= segs.max():
        raise ValueError("Total size cannot be less than max segment")
    if segs[0] != 0 or not (segs[:-1] < segs[1:]).all():
        raise ValueError(
            "segs must start at zero and be monotonically increasing")
    # Place the change in value at each segment start; a running sum then
    # reproduces each segment's value across the whole segment.
    temp = ak.zeros(size, dtype=vals.dtype)
    diffs = ak.concatenate((ak.array([vals[0]]), vals[1:] - vals[:-1]))
    temp[segs] = diffs
    return ak.cumsum(temp)
def remove_repeats(self, return_multiplicity=False):
    """
    Collapse every run of consecutive equal values inside a sub-array into
    a single value.

    Parameters
    ----------
    return_multiplicity : bool
        If True, also return the length of the run that each surviving
        value replaced.

    Returns
    -------
    norepeats : SegArray
        Sub-arrays with runs of repeated values replaced with single value
    multiplicity : SegArray
        Only returned when return_multiplicity=True. Has the same segments
        as norepeats; each entry is the number of times the corresponding
        value was repeated in the original SegArray.
    """
    # Flag each value that equals its predecessor in the flattened values
    repeated = ak.zeros(self.values.size, dtype=ak.bool)
    repeated[1:] = self.values[:-1] == self.values[1:]
    # The first value of a sub-array never continues a run that began in
    # the previous sub-array
    repeated[self.segments] = False
    survivors = ~repeated
    kept_vals = self.values[survivors]
    # Count of surviving values per sub-array -> new segment offsets
    kept_counts = self.grouping.sum(survivors)[1]
    kept_segs = ak.cumsum(kept_counts) - kept_counts
    condensed = SegArray(kept_segs, kept_vals)
    if not return_multiplicity:
        return condensed
    # A run's length is the distance from its first position to the first
    # position of the next run (or to the end of the values for the last)
    run_starts = ak.arange(self.valsize)[survivors]
    run_lengths = ak.zeros(kept_vals.size, dtype=ak.int64)
    run_lengths[:-1] = run_starts[1:] - run_starts[:-1]
    run_lengths[-1] = self.valsize - run_starts[-1]
    return condensed, SegArray(kept_segs, run_lengths)
def append_single(self, x, prepend=False):
    '''
    Add one value to every sub-array, at its end or at its beginning.

    Parameters
    ----------
    x : pdarray or scalar
        Value to add. A scalar is added to every sub-array; an array must
        hold exactly one value per sub-array.
    prepend : bool
        If True, place the new value before the existing values of each
        sub-array instead of after them.

    Returns
    -------
    SegArray
        Copy of original SegArray with one extra value in each sub-array

    Raises
    ------
    ValueError
        If x has a size that differs from the number of sub-arrays
    TypeError
        If x has a size and its type or dtype differs from that of the
        SegArray's values
    '''
    if hasattr(x, 'size'):
        if x.size != self.size:
            raise ValueError(
                'Argument must be scalar or same size as SegArray')
        if type(x) != type(self.values) or x.dtype != self.dtype:
            raise TypeError(
                'Argument type must match value type of SegArray')
    grown_lens = self.lengths + 1
    grown_segs = ak.cumsum(grown_lens) - grown_lens
    out_vals = ak.zeros(grown_lens.sum(), dtype=self.dtype)
    # Slot that receives the new value: first or last of each sub-array
    new_slot = grown_segs if prepend else grown_segs + grown_lens - 1
    out_vals[new_slot] = x
    # An existing value moves right by the index of its sub-array (one
    # inserted value per earlier sub-array), plus one more when prepending
    dest = ak.arange(self.valsize) + self.grouping.broadcast(
        ak.arange(self.size), permute=True)
    if prepend:
        dest = dest + 1
    out_vals[dest] = self.values
    return SegArray(grown_segs, out_vals)
def expand(size, segs, vals):
    """
    Expand an array with values placed into the indicated segments.

    Parameters
    ----------
    size : int
        The size of the array to be expanded
    segs : ak.pdarray
        The indices where the values should be placed
    vals : ak.pdarray
        The values to be placed in each segment. Must support subtraction,
        because the result is built from successive differences.

    Returns
    -------
    pdarray
        The expanded array. When vals is empty, an all-zero array of length
        size with the dtype of vals.
    """
    temp = ak.zeros(size, vals.dtype)
    if vals.size == 0:
        # No segments to fill; indexing vals[0] below would fail
        return temp
    # Place the change in value at each segment start; a running sum then
    # carries each value forward until the next segment begins.
    diffs = ak.concatenate((ak.array([vals[0]]), vals[1:] - vals[:-1]))
    temp[segs] = diffs
    return ak.cumsum(temp)
def inner_join(left, right, wherefunc=None, whereargs=None):
    '''Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc. Required when wherefunc is
        given; the first must match left in size, the second right.

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If wherefunc does not take exactly two arguments, if whereargs is
        missing or mis-sized, or if wherefunc fails on a small sample of
        whereargs.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`

    When no left value is kept by right_align, two empty int64 arrays are
    returned.
    '''
    if wherefunc is not None:
        from inspect import signature
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError(
                "wherefunc must be a function that accepts exactly two arguments"
            )
        if whereargs is None or len(whereargs) != 2:
            raise ValueError(
                "whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError(
                "Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError(
                "Right whereargs must be same size as right join values")
        # Dry-run on a few rows so a broken wherefunc fails before the
        # expensive part of the join
        sample = min((left.size, right.size, 5))
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Need dense 0-up right index, to filter out left not in right
    keep, (denseLeft, denseRight) = right_align(left, right)
    # Turn the boolean mask into the indices of the kept left items
    keep = ak.arange(keep.size)[keep]
    if keep.size == 0:
        # Nothing joins; the steps below index the first/last element of
        # arrays that would be empty
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # GroupBy right
    byRight = ak.GroupBy(denseRight)
    # Get segment boundaries (starts, ends) of right for each left item
    rightSegs = ak.concatenate((byRight.segments,
                                ak.array([denseRight.size])))
    starts = rightSegs[denseLeft]
    ends = rightSegs[denseLeft + 1]
    # gen_ranges for gather of right items
    fullSegs, ranges = gen_ranges(starts, ends)
    # Evaluate where clause
    if wherefunc is None:
        filtRanges = ranges
        filtSegs = fullSegs
        keep12 = keep
    else:
        # Gather right whereargs
        rightWhere = whereargs[1][byRight.permutation][ranges]
        # Expand left whereargs
        leftWhere = expand(whereargs[0][keep], fullSegs, ranges.size)
        # Evaluate wherefunc and filter ranges, recompute segments
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        filtRanges = ranges[whereSatisfied]
        # Exclusive running count of satisfied pairs; sampled at the old
        # segment starts it gives the new segment starts
        scan = ak.cumsum(whereSatisfied) - whereSatisfied
        filtSegsWithZeros = scan[fullSegs]
        filtSegSizes = ak.concatenate(
            (filtSegsWithZeros[1:] - filtSegsWithZeros[:-1],
             ak.array([whereSatisfied.sum() - filtSegsWithZeros[-1]])))
        # Drop left items whose pairs were all filtered out
        keep2 = (filtSegSizes > 0)
        filtSegs = filtSegsWithZeros[keep2]
        keep12 = keep[keep2]
    # Gather right inds and expand left inds
    rightInds = byRight.permutation[filtRanges]
    leftInds = expand(ak.arange(left.size)[keep12], filtSegs,
                      filtRanges.size)
    return leftInds, rightInds
def concat(cls, x, axis=0, ordered=True):
    """
    Concatenate a sequence of SegArrays

    Parameters
    ----------
    x : sequence of SegArray
        The SegArrays to concatenate
    axis : 0 or 1
        Select vertical (0) or horizontal (1) concatenation. If axis=1, all
        SegArrays must have same size.
    ordered : bool
        Must be True. This option is present for compatibility only, because
        unordered concatenation is not yet supported.

    Returns
    -------
    SegArray
        The input arrays joined into one SegArray. NotImplemented is
        returned instead when any element of x is not an instance of cls.

    Raises
    ------
    ValueError
        If ordered is False, x is empty, the dtypes differ, the sizes differ
        with axis=1, or axis is neither 0 nor 1.
    """
    if not ordered:
        raise ValueError(
            "Unordered concatenation not yet supported on SegArray; use ordered=True."
        )
    if len(x) == 0:
        raise ValueError("Empty sequence passed to concat")
    for xi in x:
        if not isinstance(xi, cls):
            return NotImplemented
    if len({xi.dtype for xi in x}) != 1:
        raise ValueError(
            "SegArrays must all have same dtype to concatenate")
    if axis == 0:
        ctr = 0
        segs = []
        vals = []
        for xi in x:
            # Segment offsets need to be raised by length of previous values
            segs.append(xi.segments + ctr)
            ctr += xi.valsize
            # Values can just be concatenated
            vals.append(xi.values)
        return cls(ak.concatenate(segs), ak.concatenate(vals))
    elif axis == 1:
        sizes = {xi.size for xi in x}
        if len(sizes) != 1:
            raise ValueError(
                "SegArrays must all have same size to concatenate with axis=1"
            )
        if sizes.pop() == 0:
            return x[0]
        dt = x[0].dtype
        newlens = sum(xi.lengths for xi in x)
        newsegs = ak.cumsum(newlens) - newlens
        newvals = ak.zeros(newlens.sum(), dtype=dt)
        # Next unfilled slot of every output sub-array
        cursor = newsegs
        for xi in x:
            if xi.valsize == 0:
                # Contributes nothing and does not move the cursor
                continue
            lens = xi.lengths
            # Only sub-arrays that are non-empty in this input take part.
            # Their begin and end slots are strictly increasing, so the
            # scatters below never see a duplicate index; a fancy-indexed
            # in-place update would not accumulate duplicates.
            filled = lens > 0
            begin = cursor[filled]
            end = begin + lens[filled]
            # Step up where this input's values begin and back down where
            # they end; the running sum is 1 exactly on its slots. The
            # extra trailing element absorbs an end that falls one past
            # the last slot.
            steps = ak.zeros(newvals.size + 1, dtype=ak.int64)
            steps[begin] = 1
            steps[end] -= 1
            fromself = (ak.cumsum(steps[:-1]) == 1)
            newvals[fromself] = xi.values
            cursor = cursor + lens
        return cls(newsegs, newvals, copy=False)
    else:
        raise ValueError(
            "Supported values for axis are 0 (vertical concat) or 1 (horizontal concat)"
        )
def inner_join2(left, right, wherefunc=None, whereargs=None, forceDense=False):
    '''Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc
    forceDense : bool
        If True, always remap the joined values to dense 0-up codes before
        building the per-value lookup tables. Otherwise the remapping is
        done only when a shared value is negative or the largest shared
        value exceeds 3 * (left.size + right.size); in all other cases the
        values themselves are used as table indices.

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If left or right is not a pdarray(int64), if wherefunc does not take
        exactly two arguments, if whereargs is missing or mis-sized, or if
        wherefunc fails on a small sample of whereargs.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`

    Pairs are emitted in ascending order of left index. When left and right
    share no value, two empty int64 arrays are returned.
    '''
    if (not isinstance(left, ak.pdarray) or left.dtype != ak.int64
            or not isinstance(right, ak.pdarray) or right.dtype != ak.int64):
        raise ValueError("left and right must be pdarray(int64)")
    if wherefunc is not None:
        from inspect import signature
        sample = min((left.size, right.size, 5))
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Only join on intersection
    inter = ak.intersect1d(left, right)
    if inter.size == 0:
        # No shared values: nothing joins, and inter.max() below would fail
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Indices of left values present in intersection
    leftInds = ak.arange(left.size)[ak.in1d(left, inter)]
    # Left vals in intersection
    leftFilt = left[leftInds]
    # Indices of right vals present in inter
    rightInds = ak.arange(right.size)[ak.in1d(right, inter)]
    # Right vals in inter
    rightFilt = right[rightInds]
    byRight = ak.GroupBy(rightFilt)
    maxVal = inter.max()
    # Raw values can index the lookup tables only when they are
    # non-negative and not much larger than the inputs
    if forceDense or inter.min() < 0 or maxVal > 3*(left.size + right.size):
        # Remap intersection to dense, 0-up codes
        # Replace left values with dense codes
        byLeft = ak.GroupBy(leftFilt)
        uniqLeftCodes = ak.arange(inter.size)[ak.in1d(inter, byLeft.unique_keys)]
        leftCodes = ak.zeros_like(leftFilt) - 1
        leftCodes[byLeft.permutation] = byLeft.broadcast(uniqLeftCodes, permute=False)
        # Dense codes of the unique right values
        uniqRightCodes = ak.arange(inter.size)[ak.in1d(inter, byRight.unique_keys)]
        countSize = inter.size
    else:
        uniqRightCodes = byRight.unique_keys
        leftCodes = leftFilt
        countSize = maxVal + 1
    # Per-code lookup tables into right's grouped order: how many right
    # items carry the code, and where their run starts
    rightCounts = ak.zeros(countSize, dtype=ak.int64)
    rightCounts[uniqRightCodes] = byRight.count()[1]
    rightStarts = ak.zeros(countSize, dtype=ak.int64)
    rightStarts[uniqRightCodes] = byRight.segments
    # Each left item is paired with every right item of the same code, so
    # it is repeated once per matching right item (always at least one)
    prodLeft = rightCounts[leftCodes]
    total = prodLeft.sum()
    outSegs = ak.cumsum(prodLeft) - prodLeft
    leftFullInds = ak.broadcast(outSegs, leftInds, total)
    # Position of each output row within its left item's block of pairs
    withinBlock = ak.arange(total) - ak.broadcast(outSegs, outSegs, total)
    # Walk through the matching run of right's grouped order, then map back
    # to original right indices so row k of both outputs is a real pair
    rightPos = ak.broadcast(outSegs, rightStarts[leftCodes], total) + withinBlock
    rightFullInds = rightInds[byRight.permutation[rightPos]]
    # Evaluate where clause
    if wherefunc is None:
        return leftFullInds, rightFullInds
    # Gather whereargs
    leftWhere = whereargs[0][leftFullInds]
    rightWhere = whereargs[1][rightFullInds]
    # Evaluate wherefunc and keep only the satisfying pairs
    whereSatisfied = wherefunc(leftWhere, rightWhere)
    return leftFullInds[whereSatisfied], rightFullInds[whereSatisfied]
# Demo script: prints reductions and scans side by side for `np` and `ak`
# arrays so the two can be compared by eye.
# NOTE(review): `ak` and `np` are not defined in this span — presumably
# arkouda and numpy imported earlier; confirm.

# Product of the same 10-point linspace computed through both libraries
a = ak.linspace(1,10,10)
b = np.linspace(1,10,10)
print(ak.prod(a) == np.prod(b),ak.prod(a),np.prod(b),a.prod(),b.prod())

# NOTE(review): sets attribute `v` on `ak`; presumably a verbosity switch — confirm
ak.v = False
# np reference: sum/prod/cumsum/cumprod of boolean masks over 0..19
a = np.arange(0,20,1)
b = a<10
print(b,np.sum(b),b.sum(),np.prod(b),b.prod(),np.cumsum(b),np.cumprod(b))
print()
b = a<5
print(b,np.sum(b),b.sum(),np.prod(b),b.prod(),np.cumsum(b),np.cumprod(b))
print()
# The same masks and operations through ak
a = ak.arange(0,20,1)
b = a<10
print(b,ak.sum(b),b.sum(),ak.prod(b),b.prod(),ak.cumsum(b),ak.cumprod(b))
b = a<5
print(b,ak.sum(b),b.sum(),ak.prod(b),b.prod(),ak.cumsum(b),ak.cumprod(b))

ak.v = False
# Index an array with its own reversed slice
a = ak.arange(0,10,1)
iv = a[::-1]
print(a,iv,a[iv])

# Same statements as the previous group, repeated verbatim
ak.v = False
a = ak.arange(0,10,1)
iv = a[::-1]
print(a,iv,a[iv])

ak.v = False
# NOTE(review): `a` is reassigned here but not used within this span
a = ak.linspace(0,9,10)