Esempio n. 1
0
def gen_ranges(starts, ends, stride=1):
    """ Generate a segmented array of variable-length, strided ranges between pairs of start- and end-points.

    Range ``i`` holds ``starts[i], starts[i] + stride, ...`` up to but
    excluding ``ends[i]`` (the same convention as Python's ``range``).
    A range whose end is not reachable from its start in the direction of
    ``stride`` is empty.

    Parameters
    ----------
    starts : pdarray, int64
        The start value of each range
    ends : pdarray, int64
        The end value (exclusive) of each range
    stride: int
        Difference between successive elements of each range. Must be nonzero.

    Returns
    -------
    segments : pdarray, int64
        The starting index of each range in the resulting array
    ranges : pdarray, int64
        The actual ranges, flattened into a single array

    Raises
    ------
    ValueError
        If starts and ends differ in size, or if stride is zero
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same length")
    if stride == 0:
        raise ValueError("stride must be nonzero")
    if starts.size == 0:
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Ceiling division: a span that is not an exact multiple of the stride
    # still contains its first element (e.g. 0..5 step 2 -> 0, 2, 4).
    round_up = stride - 1 if stride > 0 else stride + 1
    lengths = (ends - starts + round_up) // stride
    # Ranges pointing the wrong way are empty, not negative-length
    lengths = ak.where(lengths > 0, lengths, 0)
    segs = ak.cumsum(lengths) - lengths
    totlen = lengths.sum()
    if totlen == 0:
        return segs, ak.zeros(0, dtype=ak.int64)
    # Empty ranges share their offset with the following range (or point one
    # past the end), so only non-empty ranges may take part in the scatter.
    nonempty = lengths > 0
    ne_starts = starts[nonempty]
    ne_lengths = lengths[nonempty]
    # Within a range every element is one stride past its predecessor...
    steps = ak.ones(totlen, dtype=ak.int64) * stride
    # ...and the first element of a range jumps from the last element of the
    # previous non-empty range to the new start.
    jumps = ak.concatenate(
        (ak.array([ne_starts[0]]),
         ne_starts[1:] - ne_starts[:-1] - (ne_lengths[:-1] - 1) * stride))
    steps[segs[nonempty]] = jumps
    return segs, ak.cumsum(steps)
Esempio n. 2
0
def gen_ranges(starts, ends):
    """ Generate a segmented array of variable-length, contiguous 
    ranges between pairs of start- and end-points.

    Parameters
    ----------
    starts : pdarray, int64
        The start value of each range
    ends : pdarray, int64
        The end value (exclusive) of each range

    Returns
    -------
    segments : pdarray, int64
        The starting index of each range in the resulting array
    ranges : pdarray, int64
        The actual ranges, flattened into a single array

    Raises
    ------
    ValueError
        If starts and ends differ in size, or any range is empty
        (end not strictly greater than start)
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same size")
    # No ranges requested: nothing to index with starts[0] below
    if starts.size == 0:
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    lengths = ends - starts
    if not (lengths > 0).all():
        raise ValueError("all ends must be greater than starts")
    segs = ak.cumsum(lengths) - lengths
    totlen = lengths.sum()
    # Start from all-ones increments, then overwrite the first slot of every
    # range with the jump from the previous range's last value to the new
    # start; a running sum then reproduces every range.
    slices = ak.ones(totlen, dtype=ak.int64)
    diffs = ak.concatenate(
        (ak.array([starts[0]]), starts[1:] - starts[:-1] - lengths[:-1] + 1))
    slices[segs] = diffs
    return segs, ak.cumsum(slices)
Esempio n. 3
0
def expand(vals, segs, size):
    """ Broadcast per-segment values to a segmented array. Equivalent 
    to ak.GroupBy.broadcast(vals) but accepts explicit segments and 
    size arguments.

    Parameters
    ----------
    vals : pdarray
        Values (one per segment) to broadcast over segments
    segs : pdarray
        Start indices of segments
    size : int
        Total size of result array

    Returns
    -------
    pdarray
        Values broadcasted out to segments. When there are no segments the
        result is an empty array with the same dtype as vals.

    Raises
    ------
    ValueError
        If vals and segs differ in size, if size is too small to hold the
        segments, or if segs does not start at zero and strictly increase
    """
    if vals.size != segs.size:
        raise ValueError("vals and segs must have same size")
    if vals.size == 0:
        # Keep the caller's dtype; an untyped empty array would not
        return ak.zeros(0, dtype=vals.dtype)
    if size < segs.size or size <= segs.max():
        raise ValueError("Total size cannot be less than max segment")
    if segs[0] != 0 or not (segs[:-1] < segs[1:]).all():
        raise ValueError(
            "segs must start at zero and be monotonically increasing")
    # Place the change in value at each segment start; the running sum then
    # holds each value constant across its segment.
    temp = ak.zeros(size, dtype=vals.dtype)
    diffs = ak.concatenate((ak.array([vals[0]]), vals[1:] - vals[:-1]))
    temp[segs] = diffs
    return ak.cumsum(temp)
Esempio n. 4
0
    def remove_repeats(self, return_multiplicity=False):
        """
        Collapse every run of consecutive equal values inside a sub-array
        down to one occurrence.

        Parameters
        ----------
        return_multiplicity : bool
            If True, also return the length of each collapsed run.

        Returns
        -------
        norepeats : SegArray
            Sub-arrays in which each run of repeated values appears once
        multiplicity : SegArray
            Only returned when return_multiplicity=True. Same layout as
            norepeats; each entry is the number of times the matching value
            was repeated in the original SegArray.
        """
        # A value is a repeat when it equals its predecessor...
        dup = ak.zeros(self.values.size, dtype=ak.bool)
        dup[1:] = self.values[:-1] == self.values[1:]
        # ...unless it opens a sub-array, where the predecessor belongs to
        # a different sub-array.
        dup[self.segments] = False
        keep = ~dup
        kept_vals = self.values[keep]
        # Survivors per sub-array, turned into start offsets
        # (presumably self.grouping groups values by sub-array -- confirm)
        kept_counts = self.grouping.sum(keep)[1]
        kept_segs = ak.cumsum(kept_counts) - kept_counts
        norepeats = SegArray(kept_segs, kept_vals)
        if not return_multiplicity:
            return norepeats
        # Run length = distance from one survivor to the next; the final
        # run extends to the end of the values array.
        kept_pos = ak.arange(self.valsize)[keep]
        run_lens = ak.zeros(kept_vals.size, dtype=ak.int64)
        run_lens[:-1] = kept_pos[1:] - kept_pos[:-1]
        run_lens[-1] = self.valsize - kept_pos[-1]
        return norepeats, SegArray(kept_segs, run_lens)
Esempio n. 5
0
    def append_single(self, x, prepend=False):
        """
        Add one value to every sub-array, after its last element by default
        or before its first element when prepend is set.

        Parameters
        ----------
        x : pdarray or scalar
            Value(s) to insert. An argument with a ``size`` attribute must
            supply exactly one value per sub-array and match both the type
            of this SegArray's values and its dtype.
        prepend : bool
            If True, insert at the front of each sub-array instead of the end.

        Returns
        -------
        SegArray
            Copy of original SegArray with every sub-array one element longer

        Raises
        ------
        ValueError
            If x has a size that differs from the number of sub-arrays
        TypeError
            If x is array-like but its type or dtype does not match
        """
        if hasattr(x, 'size'):
            if x.size != self.size:
                raise ValueError(
                    'Argument must be scalar or same size as SegArray')
            if type(x) != type(self.values) or x.dtype != self.dtype:
                raise TypeError(
                    'Argument type must match value type of SegArray')
        grown_lens = self.lengths + 1
        grown_segs = ak.cumsum(grown_lens) - grown_lens
        out = ak.zeros(grown_lens.sum(), dtype=self.dtype)
        # Slot reserved for the inserted value: first or last position of
        # each enlarged sub-array.
        if prepend:
            slot = grown_segs
        else:
            slot = grown_segs + grown_lens - 1
        out[slot] = x
        # Each original value shifts right by the number of values inserted
        # ahead of it (presumably grouping.broadcast spreads each sub-array's
        # index across its values -- confirm against SegArray.grouping).
        dest = ak.arange(self.valsize) + self.grouping.broadcast(
            ak.arange(self.size), permute=True)
        if prepend:
            dest += 1
        out[dest] = self.values
        return SegArray(grown_segs, out)
Esempio n. 6
0
def expand(size, segs, vals):
    """ Expand an array with values placed into the indicated segments.

    Element ``vals[i]`` fills the result from index ``segs[i]`` up to the
    start of the next segment (or the end of the array).

    Parameters
    ----------
    size : int
        The size of the array to be expanded
    segs : ak.pdarray
        The indices where each segment starts
    vals : ak.pdarray
        The values to be placed in each segment

    Returns
    -------
    pdarray
        The expanded array. All zeros when vals is empty.

    """
    temp = ak.zeros(size, vals.dtype)
    # With no segments there is nothing to scatter, and vals[0] below
    # would be out of range.
    if vals.size == 0:
        return temp
    # Store the change in value at each segment start; the running sum
    # then repeats each value across its segment.
    diffs = ak.concatenate((ak.array([vals[0]]), vals[1:] - vals[:-1]))
    temp[segs] = diffs
    return ak.cumsum(temp)
Esempio n. 7
0
def inner_join(left, right, wherefunc=None, whereargs=None):
    '''Perform inner join on values in <left> and <right>, 
    using conditions defined by <wherefunc> evaluated on 
    <whereargs>, returning indices of left-right pairs. 

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns 
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc
        
    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If wherefunc does not take exactly two arguments, whereargs is not
        a 2-tuple of arrays sized like left and right, or wherefunc fails
        on a small sample of whereargs
        
    Notes
    -----
    The return values satisfy the following assertions
    
    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`

    When no pair qualifies, two empty int64 arrays are returned.
        
    '''
    from inspect import signature
    if wherefunc is not None:
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError(
                "wherefunc must be a function that accepts exactly two arguments"
            )
        if whereargs is None or len(whereargs) != 2:
            raise ValueError(
                "whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError(
                "Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError(
                "Right whereargs must be same size as right join values")
        # Dry-run on a handful of rows so a broken wherefunc fails fast,
        # before the expensive join work
        sample = min((left.size, right.size, 5))
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e

    # Need dense 0-up right index, to filter out left not in right
    keep, (denseLeft, denseRight) = right_align(left, right)
    # Convert the mask of retained left entries into their positions
    keep = ak.arange(keep.size)[keep]
    if keep.size == 0:
        # No left value occurs in right: the join is empty, and the code
        # below would index into empty arrays.
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # GroupBy right
    byRight = ak.GroupBy(denseRight)
    # Get segment boundaries (starts, ends) of right for each left item
    rightSegs = ak.concatenate((byRight.segments, ak.array([denseRight.size])))
    starts = rightSegs[denseLeft]
    ends = rightSegs[denseLeft + 1]
    # gen_ranges for gather of right items
    fullSegs, ranges = gen_ranges(starts, ends)
    # Evaluate where clause
    if wherefunc is None:
        filtRanges = ranges
        filtSegs = fullSegs
        keep12 = keep
    else:
        # Gather right whereargs
        rightWhere = whereargs[1][byRight.permutation][ranges]
        # Expand left whereargs
        leftWhere = expand(whereargs[0][keep], fullSegs, ranges.size)
        # Evaluate wherefunc and filter ranges, recompute segments
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        filtRanges = ranges[whereSatisfied]
        # Exclusive scan: number of surviving pairs before each position
        scan = ak.cumsum(whereSatisfied) - whereSatisfied
        filtSegsWithZeros = scan[fullSegs]
        filtSegSizes = ak.concatenate(
            (filtSegsWithZeros[1:] - filtSegsWithZeros[:-1],
             ak.array([whereSatisfied.sum() - filtSegsWithZeros[-1]])))
        # Drop left items whose every candidate pair was filtered out
        keep2 = (filtSegSizes > 0)
        filtSegs = filtSegsWithZeros[keep2]
        keep12 = keep[keep2]
    if filtRanges.size == 0:
        # wherefunc rejected every candidate pair
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Gather right inds and expand left inds
    rightInds = byRight.permutation[filtRanges]
    leftInds = expand(ak.arange(left.size)[keep12], filtSegs, filtRanges.size)
    return leftInds, rightInds
Esempio n. 8
0
    def concat(cls, x, axis=0, ordered=True):
        """
        Concatenate a sequence of SegArrays

        Parameters
        ----------
        x : sequence of SegArray
            The SegArrays to concatenate
        axis : 0 or 1
            Select vertical (0) or horizontal (1) concatenation. If axis=1, all
            SegArrays must have same size.
        ordered : bool
            Must be True. This option is present for compatibility only, because unordered
            concatenation is not yet supported.

        Returns
        -------
        SegArray
            The input arrays joined into one SegArray. NotImplemented is
            returned instead if any element of x is not an instance of cls.

        Raises
        ------
        ValueError
            If ordered is False, x is empty, dtypes differ, sizes differ
            with axis=1, or axis is neither 0 nor 1
        """
        if not ordered:
            raise ValueError(
                "Unordered concatenation not yet supported on SegArray; use ordered=True."
            )
        if len(x) == 0:
            raise ValueError("Empty sequence passed to concat")
        for xi in x:
            if not isinstance(xi, cls):
                return NotImplemented
        if len(set(xi.dtype for xi in x)) != 1:
            raise ValueError(
                "SegArrays must all have same dtype to concatenate")
        if axis == 0:
            ctr = 0
            segs = []
            vals = []
            for xi in x:
                # Segment offsets need to be raised by length of previous values
                segs.append(xi.segments + ctr)
                ctr += xi.valsize
                # Values can just be concatenated
                vals.append(xi.values)
            return cls(ak.concatenate(segs), ak.concatenate(vals))
        elif axis == 1:
            sizes = set(xi.size for xi in x)
            if len(sizes) != 1:
                raise ValueError(
                    "SegArrays must all have same size to concatenate with axis=1"
                )
            if sizes.pop() == 0:
                return x[0]
            dt = x[0].dtype
            newlens = sum(xi.lengths for xi in x)
            newsegs = ak.cumsum(newlens) - newlens
            # Ignore sub-arrays that are empty in all arrays
            nonzero = ak.concatenate(
                (newsegs[:-1] < newsegs[1:], ak.array([True])))
            # Next free write position inside each retained sub-array
            nzsegs = newsegs[nonzero]
            newvals = ak.zeros(newlens.sum(), dtype=dt)
            for xi in x:
                if xi.valsize == 0:
                    # Nothing to place and no write position advances
                    continue
                nzlens = xi.lengths[nonzero]
                # Only sub-arrays to which xi actually contributes may mark
                # the scan. A zero-length contribution has its end index
                # equal to its start index, which can coincide with the end
                # index of the preceding sub-array; `a[idx] -= 1` is a
                # gather/subtract/scatter, so a duplicated index would be
                # decremented only once and the mask would stay on too long.
                contributes = nzlens > 0
                # Set up fromself for a scan, so that it steps up at the start of a segment
                # from the current array, and steps back down at the end
                fromself = ak.zeros(newvals.size + 1, dtype=ak.int64)
                fromself[nzsegs[contributes]] += 1
                fromself[(nzsegs + nzlens)[contributes]] -= 1
                fromself = (ak.cumsum(fromself[:-1]) == 1)
                newvals[fromself] = xi.values
                nzsegs += nzlens
            return cls(newsegs, newvals, copy=False)
        else:
            raise ValueError(
                "Supported values for axis are 0 (vertical concat) or 1 (horizontal concat)"
            )
Esempio n. 9
0
def inner_join2(left, right, wherefunc=None, whereargs=None, forceDense=False):
    '''Perform inner join on values in <left> and <right>, 
    using conditions defined by <wherefunc> evaluated on 
    <whereargs>, returning indices of left-right pairs. 

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns 
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc
    forceDense : bool
        If True, always remap join values to dense 0-up codes before
        building the per-value lookup tables. Otherwise the remap is only
        done when values are negative or large relative to the input sizes.
        
    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If left or right is not pdarray(int64), or the wherefunc/whereargs
        validation fails
        
    Notes
    -----
    The return values satisfy the following assertions
    
    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`

    Pairs are ordered by left index. When no pair qualifies, two empty
    int64 arrays are returned.
        
    '''
    if (not isinstance(left, ak.pdarray) or left.dtype != ak.int64
            or not isinstance(right, ak.pdarray) or right.dtype != ak.int64):
        raise ValueError("left and right must be pdarray(int64)")
    if wherefunc is not None:
        from inspect import signature
        sample = min((left.size, right.size, 5))
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        # Dry-run on a few rows so a broken wherefunc fails before the join
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Only join on intersection
    inter = ak.intersect1d(left, right)
    if inter.size == 0:
        # No shared values: empty join (and inter.max() would be meaningless)
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Indices of left values present in intersection
    leftInds = ak.arange(left.size)[ak.in1d(left, inter)]
    # Left vals in intersection
    leftFilt = left[leftInds]
    # Indices of right vals present in inter
    rightInds = ak.arange(right.size)[ak.in1d(right, inter)]
    # Right vals in inter
    rightFilt = right[rightInds]
    byRight = ak.GroupBy(rightFilt)
    maxVal = inter.max()
    # Values index the lookup tables directly in the sparse path, so they
    # must be non-negative and not too large for that to be usable.
    if forceDense or inter.min() < 0 or maxVal > 3*(left.size + right.size):
        # Remap intersection to dense, 0-up codes
        # Replace left values with dense codes
        byLeft = ak.GroupBy(leftFilt)
        uniqLeftCodes = ak.arange(inter.size)[ak.in1d(inter, byLeft.unique_keys)]
        leftCodes = ak.zeros_like(leftFilt) - 1
        leftCodes[byLeft.permutation] = byLeft.broadcast(uniqLeftCodes, permute=False)
        # Dense code of each unique right value
        uniqRightCodes = ak.arange(inter.size)[ak.in1d(inter, byRight.unique_keys)]
        tableSize = inter.size
    else:
        leftCodes = leftFilt
        uniqRightCodes = byRight.unique_keys
        tableSize = maxVal + 1
    # Per-code lookup tables describing the right-hand groups: how many right
    # rows carry the code, and where that group starts in grouped order
    rightCounts = ak.zeros(tableSize, dtype=ak.int64)
    rightCounts[uniqRightCodes] = byRight.count()[1]
    rightStarts = ak.zeros(tableSize, dtype=ak.int64)
    rightStarts[uniqRightCodes] = byRight.segments
    # Each left row pairs with every right row in its code's group. Every
    # count is >= 1 because left was filtered to the intersection.
    pairsPerLeft = rightCounts[leftCodes]
    total = pairsPerLeft.sum()
    outSegs = ak.cumsum(pairsPerLeft) - pairsPerLeft
    # Repeat each left index once per matching right row
    leftFullInds = ak.broadcast(outSegs, leftInds, total)
    # For the k-th repeat of a left row, take the k-th member of the matching
    # right group. (Expanding the right indices independently of the left
    # ones, as was done previously, does not line the two sides up as pairs.)
    offsetInGroup = ak.arange(total) - ak.broadcast(outSegs, outSegs, total)
    rightPos = ak.broadcast(outSegs, rightStarts[leftCodes], total) + offsetInGroup
    rightFullInds = rightInds[byRight.permutation[rightPos]]
    # Evaluate where clause
    if wherefunc is None:
        return leftFullInds, rightFullInds
    else:
        # Gather whereargs
        leftWhere = whereargs[0][leftFullInds]
        rightWhere = whereargs[1][rightFullInds]
        # Evaluate wherefunc and keep only the satisfying pairs
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        return leftFullInds[whereSatisfied], rightFullInds[whereSatisfied]
Esempio n. 10
0
# Ad-hoc comparison script: prints the results of matching reductions and
# scans from `ak` and `np` side by side for visual inspection.

# Product of the same linspace built with each library
a = ak.linspace(1,10,10)
b = np.linspace(1,10,10)
print(ak.prod(a) == np.prod(b),ak.prod(a),np.prod(b),a.prod(),b.prod())

# NOTE(review): presumably a verbosity switch on the ak module -- confirm
ak.v = False

# Reference output: sum/prod/cumsum/cumprod of boolean masks with numpy
a = np.arange(0,20,1)
b = a<10
print(b,np.sum(b),b.sum(),np.prod(b),b.prod(),np.cumsum(b),np.cumprod(b))
print()
b = a<5
print(b,np.sum(b),b.sum(),np.prod(b),b.prod(),np.cumsum(b),np.cumprod(b))
print()
# Same masks and operations through ak
a = ak.arange(0,20,1)
b = a<10
print(b,ak.sum(b),b.sum(),ak.prod(b),b.prod(),ak.cumsum(b),ak.cumprod(b))
b = a<5
print(b,ak.sum(b),b.sum(),ak.prod(b),b.prod(),ak.cumsum(b),ak.cumprod(b))

# Index an array by its own reversal (gather with an integer index array)
ak.v = False
a = ak.arange(0,10,1)
iv = a[::-1]
print(a,iv,a[iv])

# Repeat of the reversal-gather check above
ak.v = False
a = ak.arange(0,10,1)
iv = a[::-1]
print(a,iv,a[iv])

ak.v = False
a = ak.linspace(0,9,10)