Esempio n. 1
    def remove_repeats(self, return_multiplicity=False):
        """
        Condense sequences of repeated values within a sub-array to a single value.

        Parameters
        ----------
        return_multiplicity : bool
            If True, also return the number of times each value was repeated.

        Returns
        -------
        norepeats : SegArray
            Sub-arrays with runs of repeated values replaced with single value
        multiplicity : SegArray
            If return_multiplicity=True, this array contains the number of times
            each value in the returned SegArray was repeated in the original SegArray.
        """
        # Flag every flattened value that equals its immediate predecessor.
        isrepeat = ak.zeros(self.values.size, dtype=ak.bool)
        isrepeat[1:] = self.values[:-1] == self.values[1:]
        # The first value of a sub-array is never a repeat, even when it equals
        # the last value of the preceding sub-array.
        isrepeat[self.segments] = False
        truepaths = self.values[~isrepeat]
        # Surviving values per sub-array; an exclusive scan gives the new offsets.
        nhops = self.grouping.sum(~isrepeat)[1]
        truesegs = ak.cumsum(nhops) - nhops
        norepeats = SegArray(truesegs, truepaths)
        if not return_multiplicity:
            return norepeats
        # A run's length is the distance from its first index to the first
        # index of the next run (or to the end of the values for the last run).
        truehopinds = ak.arange(self.valsize)[~isrepeat]
        multiplicity = ak.zeros(truepaths.size, dtype=ak.int64)
        if truepaths.size > 0:
            multiplicity[:-1] = truehopinds[1:] - truehopinds[:-1]
            multiplicity[-1] = self.valsize - truehopinds[-1]
        return norepeats, SegArray(truesegs, multiplicity)
Esempio n. 2
 def test_plus_minus(self):
     """Check result types and values of + and - between Datetime and Timedelta operands."""
     # Datetime + Datetime not supported
     with self.assertRaises(TypeError):
         self.dtvec1 + self.dtvec2
     # Datetime slice -> Datetime
     leading = self.dtvec1[1:]
     trailing = self.dtvec1[:-1]
     self.assertIsInstance(leading, ak.Datetime)
     self.assertIsInstance(trailing, ak.Datetime)
     # Datetime - Datetime -> Timedelta
     diff = leading - trailing
     self.assertIsInstance(diff, ak.Timedelta)
     self.assertTrue((diff == self.onesecond).all())
     # Datetime - DatetimeScalar -> Timedelta
     diff = self.dtvec1 - self.dtscalar
     trange = ak.timedelta_range(start=0, periods=100, freq='s')
     self.assertIsInstance(diff, ak.Timedelta)
     self.assertTrue((diff == trange).all())
     # DatetimeScalar - Datetime -> Timedelta
     diff = self.dtscalar - self.dtvec1
     self.assertIsInstance(diff, ak.Timedelta)
     self.assertTrue((diff == (-trange)).all())
     # Datetime + TimedeltaScalar -> Datetime
     t = (trailing + self.onesecond)
     self.assertIsInstance(t, ak.Datetime)
     self.assertTrue((t == leading).all())
     # TimedeltaScalar + Datetime -> Datetime
     t = (self.onesecond + trailing)
     self.assertIsInstance(t, ak.Datetime)
     self.assertTrue((t == leading).all())
     # Datetime - TimedeltaScalar -> Datetime
     t = leading - self.onesecond
     self.assertIsInstance(t, ak.Datetime)
     self.assertTrue((t == trailing).all())
     # Datetime + Timedelta -> Datetime
     t = (trailing + self.tdvec1[1:])
     self.assertIsInstance(t, ak.Datetime)
     self.assertTrue((t == leading).all())
     # Timedelta + Datetime -> Datetime
     t = (self.tdvec1[1:] + trailing)
     self.assertIsInstance(t, ak.Datetime)
     self.assertTrue((t == leading).all())
     # Datetime - Timedelta -> Datetime
     t = (leading - self.tdvec1[1:])
     self.assertIsInstance(t, ak.Datetime)
     # Inverse of the "Datetime + Timedelta" case above, so the values must
     # land back on the trailing slice.
     self.assertTrue((t == trailing).all())
     # Timedelta + Timedelta -> Timedelta
     t = self.tdvec1 + self.tdvec1
     self.assertIsInstance(t, ak.Timedelta)
     self.assertTrue((t == ak.Timedelta(2*ak.ones(100, dtype=ak.int64), unit='s')).all())
     # Timedelta + TimedeltaScalar -> Timedelta
     t = self.tdvec1 + self.onesecond
     self.assertIsInstance(t, ak.Timedelta)
     self.assertTrue((t == ak.Timedelta(2*ak.ones(100, dtype=ak.int64), unit='s')).all())
     # Timedelta - Timedelta -> Timedelta
     t = self.tdvec1 - self.tdvec1
     self.assertIsInstance(t, ak.Timedelta)
     self.assertTrue((t == ak.Timedelta(ak.zeros(100, dtype=ak.int64), unit='s')).all())
     # Timedelta - TimedeltaScalar -> Timedelta
     t = self.tdvec1 - self.onesecond
     self.assertIsInstance(t, ak.Timedelta)
     self.assertTrue((t == ak.Timedelta(ak.zeros(100, dtype=ak.int64), unit='s')).all())
Esempio n. 3
def gen_ranges(starts, ends):
    """ Generate a segmented array of variable-length, contiguous
    ranges between pairs of start- and end-points.

    Parameters
    ----------
    starts : pdarray, int64
        The start value of each range
    ends : pdarray, int64
        The end value (exclusive) of each range

    Returns
    -------
    segments : pdarray, int64
        The starting index of each range in the resulting array
    ranges : pdarray, int64
        The actual ranges, flattened into a single array

    Raises
    ------
    ValueError
        If starts and ends differ in size, or any end is not greater than
        its start.
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same size")
    if starts.size == 0:
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    lengths = ends - starts
    if not (lengths > 0).all():
        raise ValueError("all ends must be greater than starts")
    # Exclusive scan of the lengths gives each range's offset in the output.
    segs = ak.cumsum(lengths) - lengths
    totlen = lengths.sum()
    # Build an array of increments whose running sum is the flattened ranges:
    # +1 inside a range, and at each range boundary the jump from the last
    # value of the previous range to the start of the next one.
    slices = ak.ones(totlen, dtype=ak.int64)
    diffs = ak.concatenate((ak.array([starts[0]]),
                            starts[1:] - starts[:-1] - lengths[:-1] + 1))
    slices[segs] = diffs
    return segs, ak.cumsum(slices)
Esempio n. 4
def gen_ranges(starts, ends, stride=1):
    """ Generate a segmented array of variable-length, contiguous ranges between pairs of start- and end-points.

    Parameters
    ----------
    starts : pdarray, int64
        The start value of each range
    ends : pdarray, int64
        The end value (exclusive) of each range
    stride: int
        Difference between successive elements of each range

    Returns
    -------
    segments : pdarray, int64
        The starting index of each range in the resulting array
    ranges : pdarray, int64
        The actual ranges, flattened into a single array

    Raises
    ------
    ValueError
        If starts and ends differ in size.

    Notes
    -----
    Each range holds ``(end - start) // stride`` values.
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same length")
    if starts.size == 0:
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    lengths = (ends - starts) // stride
    # Exclusive scan of the lengths gives each range's offset in the output.
    segs = ak.cumsum(lengths) - lengths
    totlen = lengths.sum()
    # Increments whose running sum is the flattened ranges: +stride inside a
    # range (a plain +1 would only be right for stride == 1), and at each
    # range boundary the jump from the previous range's last value to the
    # next range's start.
    slices = ak.ones(totlen, dtype=ak.int64) * stride
    diffs = ak.concatenate(
        (ak.array([starts[0]]),
         starts[1:] - starts[:-1] - (lengths[:-1] - 1) * stride))
    slices[segs] = diffs
    return segs, ak.cumsum(slices)
Esempio n. 5
def check_correctness(dtype):
    """Assert that coargsort sorts random data when paired with an all-zero key.

    Parameters
    ----------
    dtype : str
        Either 'int64' or 'float64'.

    Raises
    ------
    ValueError
        If dtype is not one of the supported names.
    AssertionError
        If the data is not sorted after applying the returned permutation.
    """
    # Reject unknown dtypes up front instead of failing later on an unbound name.
    if dtype not in ('int64', 'float64'):
        raise ValueError(f"unsupported dtype: {dtype!r}")
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    else:
        a = ak.randint(0, 1, N, dtype=ak.float64)
    z = ak.zeros(N, dtype=dtype)

    # The zero key carries no ordering information, so `a` must end up sorted
    # whether it is the primary or the secondary key.
    perm = ak.coargsort([a, z])
    assert ak.is_sorted(a[perm])
    perm = ak.coargsort([z, a])
    assert ak.is_sorted(a[perm])
Esempio n. 6
def check_correctness(dtype, random, seed):
    """Assert that an arkouda scatter matches the equivalent NumPy scatter.

    Parameters
    ----------
    dtype : str
        Element type of the values; 'int64', 'float64' or 'bool' when random
        values are requested.
    random : bool
        If True, scatter random values; otherwise scatter ones (unless a seed
        is given, which also selects random values).
    seed : int or None
        If not None, seeds NumPy's global random generator.

    Raises
    ------
    ValueError
        If random values are requested for an unsupported dtype.
    AssertionError
        If the arkouda result differs from the NumPy result.
    """
    Ni = 10**4
    Nv = 10**4
    if seed is not None:
        np.random.seed(seed)
    if random or seed is not None:
        if dtype == 'int64':
            npv = np.random.randint(0, 2**32, Ni, dtype=np.int64)
        elif dtype == 'float64':
            npv = np.random.random(Ni)
        elif dtype == 'bool':
            # Upper bound is exclusive: (0, 2) yields both False and True.
            npv = np.random.randint(0, 2, Ni, dtype=bool)
        else:
            raise ValueError(f"unsupported dtype for random values: {dtype!r}")
    else:
        npv = np.ones(Ni, dtype=dtype)
    # make indices unique
    # if indices are non-unique, results of unordered scatter are variable
    npi = np.arange(Ni)
    npc = np.zeros(Nv, dtype=dtype)
    aki = ak.array(npi)
    akc = ak.zeros(Nv, dtype=dtype)
    akv = ak.array(npv)
    npc[npi] = npv
    akc[aki] = akv
    assert np.allclose(npc, akc.to_ndarray())
Esempio n. 7
def time_ak_scatter(isize, vsize, trials, dtype, random):
    """Time `c[i] = v` on arkouda arrays and print the average time and rate.

    Parameters
    ----------
    isize : int
        Number of indices per locale.
    vsize : int
        Size of the destination array per locale.
    trials : int
        Number of timed repetitions.
    dtype : str
        Element type of the destination and the values.
    random : bool
        If True, scatter random values ('int64' or 'float64' only);
        otherwise scatter ones.

    Raises
    ------
    ValueError
        If random values are requested for an unsupported dtype.
    """
    print(">>> arkouda scatter")
    cfg = ak.get_config()
    Ni = isize * cfg["numLocales"]
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(
        cfg["numLocales"], Ni, Nv))
    # Index vector is always random
    i = ak.randint(0, Nv, Ni)
    c = ak.zeros(Nv, dtype=dtype)
    if random:
        if dtype == 'int64':
            v = ak.randint(0, 2**32, Ni)
        elif dtype == 'float64':
            v = ak.randint(0, 1, Ni, dtype=ak.float64)
        else:
            raise ValueError(f"unsupported dtype for random values: {dtype!r}")
    else:
        v = ak.ones(Ni, dtype=dtype)

    timings = []
    for _ in range(trials):
        start = time.time()
        c[i] = v
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    print("Average time = {:.4f} sec".format(tavg))
    # The rate counts three arrays' worth of index-sized elements per scatter.
    bytes_per_sec = (i.size * i.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
Esempio n. 8
    def append_single(self, x, prepend=False):
        """
        Append a single value to each sub-array.

        Parameters
        ----------
        x : pdarray or scalar
            Single value to append to each sub-array
        prepend : bool
            If True, place the value at the start of each sub-array instead
            of at the end.

        Returns
        -------
        SegArray
            Copy of original SegArray with values from x appended to each sub-array

        Raises
        ------
        ValueError
            If x has a size that differs from the number of sub-arrays.
        TypeError
            If x is an array whose type or dtype differs from the values.
        """
        if hasattr(x, 'size'):
            if x.size != self.size:
                raise ValueError(
                    'Argument must be scalar or same size as SegArray')
            if type(x) != type(self.values) or x.dtype != self.dtype:
                raise TypeError(
                    'Argument type must match value type of SegArray')
        # Every sub-array grows by exactly one slot.
        newlens = self.lengths + 1
        newsegs = ak.cumsum(newlens) - newlens
        newvals = ak.zeros(newlens.sum(), dtype=self.dtype)
        # Slot that receives x: the first slot of each sub-array when
        # prepending, otherwise the last.
        if prepend:
            xscatter = newsegs
        else:
            xscatter = newsegs + newlens - 1
        newvals[xscatter] = x
        # Each original value shifts right by the index of its sub-array
        # (one inserted slot per preceding sub-array) ...
        origscatter = ak.arange(self.valsize) + self.grouping.broadcast(
            ak.arange(self.size), permute=True)
        # ... plus one more when the inserted slot precedes it in its own sub-array.
        if prepend:
            origscatter += 1
        newvals[origscatter] = self.values
        return SegArray(newsegs, newvals)
Esempio n. 9
 @classmethod
 def from_multi_array(cls, m):
     """
     Construct a SegArray from a list of columns. This essentially transposes the input,
     resulting in an array of rows.

     Parameters
     ----------
     m : list of pdarray
         List of columns, the rows of which will form the sub-arrays of the output.
         A single pdarray is treated as one column.

     Returns
     -------
     SegArray
         Array of rows of input

     Raises
     ------
     ValueError
         If the columns differ in length or in dtype.
     """
     # Treat a lone array as a one-column input so that a single code path
     # (one strided scatter per column) handles both cases.
     cols = [m] if isinstance(m, ak.pdarray) else m
     sizes = set(col.size for col in cols)
     if len(sizes) != 1:
         raise ValueError("All columns must have same length")
     size = sizes.pop()
     n = len(cols)
     dtypes = set(col.dtype for col in cols)
     if len(dtypes) != 1:
         raise ValueError("All columns must have same dtype")
     dtype = dtypes.pop()
     # Row i occupies the n consecutive slots starting at i * n.
     newsegs = ak.arange(size) * n
     newvals = ak.zeros(size * n, dtype=dtype)
     for j, col in enumerate(cols):
         # Column j supplies the j-th element of every row.
         newvals[j::n] = col
     return cls(newsegs, newvals)
def compare_strategies(length, ncat, op, dtype):
    """Time GroupBy construction and aggregation for both GroupBy strategies.

    Parameters
    ----------
    length : int
        Number of keys and values to generate.
    ncat : int
        Exclusive upper bound of the random integer keys.
    op : str
        Aggregation operator passed to GroupBy.aggregate.
    dtype : str
        'int64' for random integers, 'bool' for a mostly-False mask; any
        other value produces evenly spaced floats in [-1, 1].

    Returns
    -------
    tuple of float
        Elapsed seconds for (global groupby, global reduce, local groupby,
        local reduce).
    """
    keys = ak.randint(0, ncat, length)
    if dtype == 'int64':
        vals = ak.randint(0, length // ncat, length)
    elif dtype == 'bool':
        vals = ak.zeros(length, dtype='bool')
        for i in np.random.randint(0, length, ncat // 2):
            vals[i] = True
    else:
        vals = ak.linspace(-1, 1, length)
    print("Global groupby", end=' ')
    start = time()
    gg = ak.GroupBy(keys, False)
    ggtime = time() - start
    print("Global reduce", end=' ')
    start = time()
    gk, gv = gg.aggregate(vals, op)
    grtime = time() - start
    print("Local groupby", end=' ')
    start = time()
    lg = ak.GroupBy(keys, True)
    lgtime = time() - start
    print("Local reduce", end=' ')
    start = time()
    lk, lv = lg.aggregate(vals, op)
    lrtime = time() - start
    # Both strategies should agree on the keys and the aggregated values.
    print(f"Keys match? {(gk == lk).all()}")
    print(f"Absolute diff of vals = {ak.abs(gv - lv).sum()}")
    return ggtime, grtime, lgtime, lrtime
Esempio n. 11
def expand(vals, segs, size):
    """ Broadcast per-segment values to a segmented array. Equivalent
    to ak.GroupBy.broadcast(vals) but accepts explicit segments and
    size arguments.

    Parameters
    ----------
    vals : pdarray
        Values (one per segment) to broadcast over segments
    segs : pdarray
        Start indices of segments
    size : int
        Total size of result array

    Returns
    -------
    pdarray
        Values broadcasted out to segments

    Raises
    ------
    ValueError
        If vals and segs differ in size, if size is too small to hold every
        segment, or if segs does not start at zero and increase strictly.
    """
    if vals.size != segs.size:
        raise ValueError("vals and segs must have same size")
    if vals.size == 0:
        return ak.array([])
    if size < segs.size or size <= segs.max():
        raise ValueError("Total size cannot be less than max segment")
    if segs[0] != 0 or not (segs[:-1] < segs[1:]).all():
        raise ValueError(
            "segs must start at zero and be monotonically increasing")
    # Place the change in value at each segment start; the running sum then
    # holds each segment's value across the whole segment.
    temp = ak.zeros(size, dtype=vals.dtype)
    diffs = ak.concatenate((ak.array([vals[0]]), vals[1:] - vals[:-1]))
    temp[segs] = diffs
    return ak.cumsum(temp)
Esempio n. 12
def interval_lookup(keys, values, arguments, fillvalue=-1):
    """
    Apply a function defined over non-overlapping intervals to
    an array of arguments.

    Parameters
    ----------
    keys : 2-tuple of pdarray
        Tuple of non-overlapping, half-open intervals expressed
        as (lower_bounds_inclusive, upper_bounds_exclusive)
    values : pdarray
        Function value to return for each entry in keys.
    arguments : pdarray
        Arguments to the function
    fillvalue : scalar
        Default value to return when argument is not in any interval.
        If None, unmatched arguments keep the zero the result is
        initialized with.

    Returns
    -------
    pdarray
        Value of function corresponding to the keys interval
        containing each argument, or fillvalue if argument not
        in any interval.
    """
    idx = search_intervals(arguments, keys, assume_unique=True)
    res = ak.zeros(arguments.size, dtype=values.dtype)
    if fillvalue is not None:
        # Pre-load the default; matched positions are overwritten below.
        res.fill(fillvalue)
    # Positions with idx > -1 are treated as matches.
    found = idx > -1
    res[found] = values[idx[found]]
    return res
Esempio n. 13
 def _get_lengths(self):
     """Return the length of every sub-array, derived from the segment offsets."""
     if self.size == 0:
         return ak.zeros(0, dtype=ak.int64)
     elif self.size == 1:
         # A single sub-array owns all of the values.
         return ak.array([self.valsize])
     else:
         # Length = next segment's start (or the total value count for the
         # last sub-array) minus this segment's start.
         return ak.concatenate(
             (self.segments[1:], ak.array([self.valsize]))) - self.segments
Esempio n. 14
def create_ak_array(N, op, dtype, seed):
    """Create an arkouda array of N elements using the named constructor.

    Parameters
    ----------
    N : int
        Number of elements.
    op : str
        One of 'zeros', 'ones' or 'randint'.
    dtype
        Element type passed through to the constructor.
    seed
        Seed passed to ak.randint; ignored by the other constructors.

    Raises
    ------
    ValueError
        If op is not one of the supported names.
    """
    if op == 'zeros':
        return ak.zeros(N, dtype=dtype)
    if op == 'ones':
        return ak.ones(N, dtype=dtype)
    if op == 'randint':
        return ak.randint(0, 2**32, N, dtype=dtype, seed=seed)
    # Fail with a clear message instead of an unbound-variable error.
    raise ValueError(f"unsupported op: {op!r}")
Esempio n. 15
def check_zeros(N):
    """Compare ak.zeros(N) with a NumPy zeros array of the same size converted to arkouda.

    Returns whatever pass_fail returns for the element-wise comparison.
    """
    # create np version (sized by N rather than a hard-coded 10, so the
    # caller's argument is actually exercised)
    a = ak.array(np.zeros(N))
    # create ak version
    b = ak.zeros(N)
    c = a == b
    return pass_fail(c.all())
Esempio n. 16
def check_zeros(N):
    """Compare NumPy zeros of size N with ak.zeros(N) pulled back into NumPy.

    Returns whatever pass_fail returns for the element-wise comparison.
    """
    # Reference array built directly in NumPy.
    expected = np.zeros(N)
    # Server-side array, converted so the comparison happens in NumPy.
    actual = ak.zeros(N).to_ndarray()
    matches = expected == actual
    return pass_fail(matches.all())
Esempio n. 17
def check_correctness(dtype, seed):
    """Run coargsort on random data paired with a constant key.

    Parameters
    ----------
    dtype : str
        One of 'int64', 'float64' or 'str'.
    seed
        Seed passed to the random generators.

    Raises
    ------
    ValueError
        If dtype is not one of the supported names.
    AssertionError
        If numeric data is not sorted after applying the permutation.
    """
    # Reject unknown dtypes up front instead of failing later on an unbound name.
    if dtype not in ('int64', 'float64', 'str'):
        raise ValueError(f"unsupported dtype: {dtype!r}")
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
        z = ak.zeros(N, dtype=dtype)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        z = ak.zeros(N, dtype=dtype)
    else:
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
        z = ak.cast(ak.zeros(N), 'str')

    # Sortedness is only asserted for numeric data; for strings the calls
    # are merely exercised.
    numeric = dtype in ('int64', 'float64')
    perm = ak.coargsort([a, z])
    if numeric:
        assert ak.is_sorted(a[perm])
    perm = ak.coargsort([z, a])
    if numeric:
        assert ak.is_sorted(a[perm])
Esempio n. 18
def generate_arrays(length, nkeys, nvals, dtype='int64'):
    """Generate random integer keys and matching values of the requested kind.

    Parameters
    ----------
    length : int
        Number of keys and values.
    nkeys : int
        Exclusive upper bound of the random keys.
    nvals : int
        Exclusive upper bound of the values when dtype is 'int64'.
    dtype : str
        'int64' for random integers, 'bool' for a mostly-False mask; any
        other value produces evenly spaced floats in [-1, 1].

    Returns
    -------
    keys, vals : pdarray
    """
    keys = ak.randint(0, nkeys, length)
    if dtype == 'int64':
        vals = ak.randint(0, nvals, length)
    elif dtype == 'bool':
        vals = ak.zeros(length, dtype='bool')
        # Set a handful of random positions to True.
        for i in np.random.randint(0, length, nkeys // 2):
            vals[i] = True
    else:
        vals = ak.linspace(-1, 1, length)
    return keys, vals
Esempio n. 19
 def __eq__(self, other):
     """Element-wise equality of sub-arrays.

     Returns NotImplemented for a non-SegArray operand; otherwise a boolean
     array with one entry per sub-array. Sub-arrays of differing length
     compare as False.
     """
     if not isinstance(other, SegArray):
         return NotImplemented
     eq = ak.zeros(self.size, dtype=ak.bool)
     # Only sub-arrays of equal length can be equal.
     leneq = self.lengths == other.lengths
     if leneq.sum() > 0:
         selfcmp = self[leneq]
         othercmp = other[leneq]
         # The compared values follow selfcmp's layout, so the per-sub-array
         # reduction must run on selfcmp; reducing on self would apply the
         # segments of the unfiltered array to a differently sized input.
         intersection = selfcmp.all(selfcmp.values == othercmp.values)
         eq[leneq] = intersection
     return eq
Esempio n. 20
def IP_like(N):
    """
    Data like a 90/10 mix of IPv4 and IPv6 addresses

    Yields a single ``(label, (IP1, IP2))`` pair where IP1 and IP2 are int64
    arrays of length ``N // 2``.
    """
    multiplicity = 10
    nunique = N // (2 * multiplicity)
    # First generate unique addresses, then sample with replacement
    u1 = ak.zeros(nunique, dtype=ak.int64)
    u2 = ak.zeros(nunique, dtype=ak.int64)
    v4 = ak.uniform(nunique) < 0.9
    n4 = v4.sum()
    v6 = ~v4
    n6 = v4.size - n4
    # v4-like rows: a 32-bit value in the first word, second word left at zero.
    u1[v4] = ak.randint(0, 2**32, n4)
    # v6-like rows: both words drawn from the full int64 range.
    u1[v6] = ak.randint(-2**63, 2**63, n6)
    u2[v6] = ak.randint(-2**63, 2**63, n6)
    sample = ak.randint(0, nunique, N // 2)
    IP1 = u1[sample]
    IP2 = u2[sample]
    yield 'IP-like 2*int64', (IP1, IP2)
Esempio n. 21
def in1dmulti(a, b, assume_unique=False):
    """ The multi-level analog of ak.in1d -- test membership of rows of a in the set of rows of b.

    Parameters
    ----------
    a : list of pdarrays
        Rows are elements for which to test membership in b
    b : list of pdarrays
        Rows are elements of the set in which to test membership
    assume_unique : bool
        If true, assume rows of a and b are each unique and sorted. By default, sort and unique them explicitly.

    Returns
    -------
    pdarray, bool
        True for each row in a that is contained in b

    Notes
    -----
        Only works for pdarrays of int64 dtype, Strings, or Categorical
    """
    if not assume_unique:
        ag = ak.GroupBy(a)
        ua = ag.unique_keys
        bg = ak.GroupBy(b)
        ub = bg.unique_keys
    else:
        ua = a
        ub = b
    # With both sides unique, a row that occurs twice in the concatenation
    # must be present in both a and b.
    c = [ak.concatenate(x) for x in zip(ua, ub)]
    g = ak.GroupBy(c)
    k, ct = g.count()
    truth = ak.zeros(c[0].size, dtype=ak.bool)
    truth[g.permutation] = (g.broadcast(1 * (ct == 2)) == 1)
    if assume_unique:
        # The leading entries correspond to the rows of a.
        return truth[:a[0].size]
    else:
        # Map the per-unique-row answer back onto the original rows of a.
        truth2 = ak.zeros(a[0].size, dtype=ak.bool)
        truth2[ag.permutation] = (ag.broadcast(1 * truth[:ua[0].size]) == 1)
        return truth2
Esempio n. 22
def check_float(N):
    """Assert that coargsort orders float64 keys correctly.

    Covers a single key, two random keys in either order, and an all-zero
    leading key followed by a random key.
    """
    unit = ak.randint(0, 1, N, dtype=ak.float64)
    signed = ak.randint(-1, 1, N, dtype=ak.float64)
    zero = ak.zeros(N, dtype=ak.float64)

    # (keys handed to coargsort, array that must come out sorted)
    cases = (
        ([unit], unit),
        ([unit, signed], unit),
        ([signed, unit], signed),
        ([zero, unit], unit),
        ([zero, signed], signed),
    )
    for keys, leader in cases:
        order = ak.coargsort(keys)
        assert ak.is_sorted(leader[order])
Esempio n. 23
    def get_jth(self, j, return_origins=True, compressed=False, default=0):
        """
        Select the j-th element of each sub-array, where possible.

        Parameters
        ----------
        j : int
            The index of the value to get from each sub-array. If j is negative,
            it counts backwards from the end of each sub-array.
        return_origins : bool
            If True, return a logical index indicating where j is in bounds
        compressed : bool
            If False, return array is same size as self, with default value
            where j is out of bounds. If True, the return array only contains
            values where j is in bounds.
        default : scalar
            When compressed=False, the value to return when j is out of bounds
            for the sub-array

        Returns
        -------
        val : pdarray
            compressed=False: The j-th value of each sub-array where j is in
            bounds and the default value where j is out of bounds.
            compressed=True: The j-th values of only the sub-arrays where j is
            in bounds
        origin_indices : pdarray, bool
            A Boolean array that is True where j is in bounds for the sub-array.
            Only returned when return_origins=True.
        """
        longenough, newj = self._normalize_index(j)
        # Flat positions of the requested element, for in-bounds sub-arrays only.
        ind = (self.segments + newj)[longenough]
        if compressed:
            res = self.values[ind]
        else:
            # Start from the default everywhere, then overwrite the in-bounds slots.
            res = ak.zeros(self.size, dtype=self.dtype) + default
            res[longenough] = self.values[ind]
        if return_origins:
            return res, longenough
        else:
            return res
Esempio n. 24
def expand(size, segs, vals):
    """ Expand an array with values placed into the indicated segments.

    Parameters
    ----------
    size : int
        The size of the array to be expanded
    segs : ak.pdarray
        The indices where the values should be placed
    vals : ak.pdarray
        The values to be placed in each segment

    Returns
    -------
    ak.pdarray
        The expanded array.
    """
    # Place the change in value at each segment start; the running sum then
    # carries each value forward until the next segment begins.
    temp = ak.zeros(size, vals.dtype)
    diffs = ak.concatenate((ak.array([vals[0]]), vals[1:] - vals[:-1]))
    temp[segs] = diffs
    return ak.cumsum(temp)
Esempio n. 25
 def _convert_strings(self, s):
     """
     Convert string field names to binary vectors.

     Each name in self.names that is present in a string sets the bit at the
     corresponding position from self.shifts in that string's int64 result.
     """
     # Initialize to zero
     values = ak.zeros(s.size, dtype=ak.int64)
     if self.separator == '':
         # When separator is empty, field names are guaranteed to be single characters
         for name, shift in zip(self.names, self.shifts):
             # Check if name exists in each string
             bit = s.contains(name)
             values = values | ak.where(bit, 1 << shift, 0)
     else:
         # When separator is non-empty, split on it
         sf, segs = s.flatten(self.separator, return_segments=True)
         # Create a grouping to map split fields back to originating string
         orig = ak.broadcast(segs, ak.arange(segs.size), sf.size)
         g = ak.GroupBy(orig)
         for name, shift in zip(self.names, self.shifts):
             # Check if name matches one of the split fields from originating string
             bit = g.any(sf == name)[1]
             values = values | ak.where(bit, 1 << shift, 0)
     return values
Esempio n. 26
def check_int(N):
    """Assert that coargsort orders int64 keys drawn from several value ranges.

    Exercises single keys, multiple keys, all-zero leading keys, and every
    ordering of three keys drawn from different ranges.
    """
    from itertools import permutations

    def expect_sorted(keys, leader):
        # Sorting by `keys` must leave `leader` in sorted order.
        order = ak.coargsort(keys)
        assert ak.is_sorted(leader[order])

    zero = ak.zeros(N, dtype=ak.int64)

    # Values that fit in two bytes.
    a2 = ak.randint(0, 2**16, N)
    b2 = ak.randint(0, 2**16, N)
    c2 = ak.randint(0, 2**16, N)
    d2 = ak.randint(0, 2**16, N)
    n2 = ak.randint(-(2**15), 2**15, N)
    for keys, leader in (
        ([a2], a2),
        ([n2], n2),
        ([a2, b2, c2, d2], a2),
        ([zero, b2, c2, d2], b2),
        ([zero, zero, c2, d2], c2),
        ([zero, zero, zero, d2], d2),
    ):
        expect_sorted(keys, leader)

    # Values that fit in four bytes.
    a4 = ak.randint(0, 2**32, N)
    b4 = ak.randint(0, 2**32, N)
    n4 = ak.randint(-(2**31), 2**31, N)
    for keys, leader in (
        ([a4], a4),
        ([n4], n4),
        ([a4, b4], a4),
        ([b4, a4], b4),
    ):
        expect_sorted(keys, leader)

    # Bounds spanning the eight-byte range.
    a8 = ak.randint(0, 2**64, N)
    b8 = ak.randint(0, 2**64, N)
    n8 = ak.randint(-(2**63), 2**64, N)
    for keys, leader in (
        ([a8], a8),
        ([n8], n8),
        ([b8, a8], b8),
    ):
        expect_sorted(keys, leader)

    # Mixed ranges, in every order; the first key must end up sorted.
    for combo in permutations([a2, a4, a8]):
        expect_sorted(combo, combo[0])
Esempio n. 27
def inner_join(left, right, wherefunc=None, whereargs=None):
    '''Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If wherefunc does not take exactly two arguments, whereargs is not a
        2-tuple of arrays sized like left and right, or wherefunc fails on a
        small sample of whereargs.

    Notes
    -----
    The return values satisfy the following assertions
    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`
    '''
    from inspect import signature
    sample = min((left.size, right.size, 5))
    if wherefunc is not None:
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        # Dry-run on a few elements so a broken wherefunc fails before the join runs.
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e

    # Need dense 0-up right index, to filter out left not in right
    keep, (denseLeft, denseRight) = right_align(left, right)
    if keep.sum() == 0:
        # Intersection is empty
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    keep = ak.arange(keep.size)[keep]
    # GroupBy right
    byRight = ak.GroupBy(denseRight)
    # Get segment boundaries (starts, ends) of right for each left item
    rightSegs = ak.concatenate((byRight.segments, ak.array([denseRight.size])))
    starts = rightSegs[denseLeft]
    ends = rightSegs[denseLeft+1]
    # gen_ranges for gather of right items
    fullSegs, ranges = gen_ranges(starts, ends)
    # Evaluate where clause
    if wherefunc is None:
        filtRanges = ranges
        filtSegs = fullSegs
        keep12 = keep
    else:
        # Gather right whereargs
        rightWhere = whereargs[1][byRight.permutation][ranges]
        # Expand left whereargs
        leftWhere = ak.broadcast(fullSegs, whereargs[0][keep], ranges.size)
        # Evaluate wherefunc and filter ranges, recompute segments
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        filtRanges = ranges[whereSatisfied]
        # Exclusive scan: the number of surviving pairs before each position.
        scan = ak.cumsum(whereSatisfied) - whereSatisfied
        filtSegsWithZeros = scan[fullSegs]
        filtSegSizes = ak.concatenate((filtSegsWithZeros[1:] - filtSegsWithZeros[:-1],
                                       ak.array([whereSatisfied.sum() - filtSegsWithZeros[-1]])))
        # Drop left items whose matches were all filtered out.
        keep2 = (filtSegSizes > 0)
        filtSegs = filtSegsWithZeros[keep2]
        keep12 = keep[keep2]
    # Gather right inds and expand left inds
    rightInds = byRight.permutation[filtRanges]
    leftInds = ak.broadcast(filtSegs, ak.arange(left.size)[keep12], filtRanges.size)
    return leftInds, rightInds
Esempio n. 28
    def __init__(self,
                 segments,
                 values,
                 copy=False,
                 lengths=None,
                 grouping=None):
        """
        An array of variable-length arrays, also called a skyline array or ragged array.

        Parameters
        ----------
        segments : pdarray, int64
            Start index of each sub-array in the flattened values array
        values : pdarray
            The flattened values of all sub-arrays
        copy : bool
            If True, make a copy of the input arrays; otherwise, just store a reference.

        Returns
        -------
        SegArray
            Data structure representing an array whose elements are variable-length arrays.

        Raises
        ------
        TypeError
            If segments is not an int64 pdarray.
        ValueError
            If segments is not sorted and unique, does not start at zero, points
            past the end of values, or is empty while values is not.

        Notes
        -----
        Keyword args 'lengths' and 'grouping' are not user-facing. They are used by the
        attach method.
        """
        # NOTE(review): the parameter list and the copy=False default were
        # reconstructed from the docstring and the names used below - confirm
        # against callers.
        if not isinstance(segments, ak.pdarray) or segments.dtype != ak.int64:
            raise TypeError("Segments must be int64 pdarray")
        if not ak.is_sorted(segments) or (ak.unique(segments).size !=
                                          segments.size):
            raise ValueError("Segments must be unique and in sorted order")
        if segments.size > 0:
            if segments.min() != 0 or segments.max() >= values.size:
                raise ValueError(
                    "Segments must start at zero and be less than values.size")
        elif values.size > 0:
            raise ValueError(
                "Cannot have non-empty values with empty segments")
        if copy:
            self.segments = segments[:]
            self.values = values[:]
        else:
            self.segments = segments
            self.values = values
        self.size = segments.size
        self.valsize = values.size
        if lengths is None:
            self.lengths = self._get_lengths()
        else:
            self.lengths = lengths
        self.dtype = values.dtype
        if grouping is None:
            if self.size == 0:
                self.grouping = ak.GroupBy(ak.zeros(0, dtype=ak.int64))
            else:
                # Treat each sub-array as a group, for grouped aggregations
                self.grouping = ak.GroupBy(
                    ak.broadcast(self.segments, ak.arange(self.size),
                                 self.valsize))
        else:
            self.grouping = grouping
Esempio n. 29
def inner_join2(left, right, wherefunc=None, whereargs=None, forceDense=False):
    '''Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc
    forceDense : bool
        If True, always remap the joined values to dense, 0-up codes
        before counting, regardless of how large the values are.

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If left or right is not an int64 pdarray, if wherefunc does not
        accept exactly two arguments, if whereargs is not a 2-tuple whose
        sizes match left and right, or if wherefunc fails on a small
        sample of whereargs.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`
    '''
    if not isinstance(left, ak.pdarray) or left.dtype != ak.int64 or not isinstance(right, ak.pdarray) or right.dtype != ak.int64:
        raise ValueError("left and right must be pdarray(int64)")
    if wherefunc is not None:
        from inspect import signature
        sample = min((left.size, right.size, 5))
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        # Dry-run wherefunc on a few elements so a broken predicate fails
        # before the expensive join work is done
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Only join on intersection
    inter = ak.intersect1d(left, right)
    if inter.size == 0:
        # Nothing in common: no pairs. Return early so the max() reduction and
        # the GroupBy calls below never run on empty arrays.
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Indices of left values present in intersection
    leftInds = ak.arange(left.size)[ak.in1d(left, inter)]
    # Left vals in intersection
    leftFilt = left[leftInds]
    # Indices of right vals present in inter
    rightInds = ak.arange(right.size)[ak.in1d(right, inter)]
    # Right vals in inter
    rightFilt = right[rightInds]
    byLeft = ak.GroupBy(leftFilt)
    byRight = ak.GroupBy(rightFilt)
    maxVal = inter.max()
    if forceDense or maxVal > 3*(left.size + right.size):
        # Remap intersection to dense, 0-up codes so the count arrays below
        # are sized by the number of distinct values, not by the largest value
        # Replace left values with dense codes
        uniqLeftVals = byLeft.unique_keys
        uniqLeftCodes = ak.arange(inter.size)[ak.in1d(inter, uniqLeftVals)]
        leftCodes = ak.zeros_like(leftFilt) - 1
        leftCodes[byLeft.permutation] = byLeft.broadcast(uniqLeftCodes, permute=False)
        # Replace right values with dense codes
        uniqRightVals = byRight.unique_keys
        uniqRightCodes = ak.arange(inter.size)[ak.in1d(inter, uniqRightVals)]
        rightCodes = ak.zeros_like(rightFilt) - 1
        rightCodes[byRight.permutation] = byRight.broadcast(uniqRightCodes, permute=False)
        countSize = inter.size
    else:
        # Values are small enough to index the count arrays directly
        uniqLeftCodes = byLeft.unique_keys
        uniqRightCodes = byRight.unique_keys
        leftCodes = leftFilt
        rightCodes = rightFilt
        countSize = maxVal + 1
    # Expand indices to product domain
    # First count occurrences of each code in left and right
    leftCounts = ak.zeros(countSize, dtype=ak.int64)
    leftCounts[uniqLeftCodes] = byLeft.count()[1]
    rightCounts = ak.zeros(countSize, dtype=ak.int64)
    rightCounts[uniqRightCodes] = byRight.count()[1]
    # Repeat each left index as many times as that code occurs in right
    prodLeft = rightCounts[leftCodes]
    leftFullInds = ak.broadcast(ak.cumsum(prodLeft)-prodLeft, leftInds, prodLeft.sum())
    # Likewise repeat each right index once per matching left occurrence
    prodRight = leftCounts[rightCodes]
    rightFullInds = ak.broadcast(ak.cumsum(prodRight)-prodRight, rightInds, prodRight.sum())
    # Evaluate where clause
    if wherefunc is None:
        return leftFullInds, rightFullInds
    else:
        # Gather whereargs
        leftWhere = whereargs[0][leftFullInds]
        rightWhere = whereargs[1][rightFullInds]
        # Evaluate wherefunc and keep only the pairs that satisfy it
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        return leftFullInds[whereSatisfied], rightFullInds[whereSatisfied]
Esempio n. 30
    @classmethod
    def concat(cls, x, axis=0, ordered=True):
        """
        Concatenate a sequence of SegArrays

        Parameters
        ----------
        x : sequence of SegArray
            The SegArrays to concatenate
        axis : 0 or 1
            Select vertical (0) or horizontal (1) concatenation. If axis=1, all
            SegArrays must have same size.
        ordered : bool
            Must be True. This option is present for compatibility only, because unordered
            concatenation is not yet supported.

        Returns
        -------
        SegArray
            The input arrays joined into one SegArray

        Raises
        ------
        ValueError
            If ordered is False, x is empty, the dtypes differ, the sizes
            differ with axis=1, or axis is neither 0 nor 1.

        Notes
        -----
        Returns NotImplemented if any element of x is not an instance of
        this class.
        """
        if not ordered:
            raise ValueError(
                "Unordered concatenation not yet supported on SegArray; use ordered=True."
            )
        if len(x) == 0:
            raise ValueError("Empty sequence passed to concat")
        for xi in x:
            if not isinstance(xi, cls):
                return NotImplemented
        if len(set(xi.dtype for xi in x)) != 1:
            raise ValueError(
                "SegArrays must all have same dtype to concatenate")
        if axis == 0:
            ctr = 0
            segs = []
            vals = []
            for xi in x:
                # Segment offsets need to be raised by length of previous values
                segs.append(xi.segments + ctr)
                ctr += xi.valsize
                # Values can just be concatenated
                vals.append(xi.values)
            return cls(ak.concatenate(segs), ak.concatenate(vals))
        elif axis == 1:
            sizes = set(xi.size for xi in x)
            if len(sizes) != 1:
                raise ValueError(
                    "SegArrays must all have same size to concatenate with axis=1"
                )
            if sizes.pop() == 0:
                return x[0]
            dt = list(x)[0].dtype
            # Row i of the result holds row i of every input, back to back
            newlens = sum(xi.lengths for xi in x)
            newsegs = ak.cumsum(newlens) - newlens
            # Ignore sub-arrays that are empty in all arrays
            nonzero = ak.concatenate(
                (newsegs[:-1] < newsegs[1:], ak.array([True])))
            nzsegs = newsegs[nonzero]
            newvals = ak.zeros(newlens.sum(), dtype=dt)
            for xi in x:
                # Set up fromself for a scan, so that it steps up at the start of a segment
                # from the current array, and steps back down at the end
                fromself = ak.zeros(newvals.size + 1, dtype=ak.int64)
                fromself[nzsegs] += 1
                nzlens = xi.lengths[nonzero]
                fromself[nzsegs + nzlens] -= 1
                # After the scan, positions belonging to this input's values are 1
                fromself = (ak.cumsum(fromself[:-1]) == 1)
                newvals[fromself] = xi.values
                # Advance the write cursor of each row past what was just filled
                nzsegs += nzlens
            return cls(newsegs, newvals, copy=False)
        else:
            raise ValueError(
                "Supported values for axis are 0 (vertical concat) or 1 (horizontal concat)"
            )