Ejemplo n.º 1
0
    def remove_repeats(self, return_multiplicity=False):
        """
        Collapse every run of consecutive equal values inside a sub-array
        into a single occurrence of that value.

        Parameters
        ----------
        return_multiplicity : bool
            When True, additionally return the length of every collapsed run.

        Returns
        -------
        norepeats : SegArray
            Sub-arrays in which each run of repeated values has been replaced
            by one value
        multiplicity : SegArray
            Only returned when return_multiplicity=True. For each value of the
            returned SegArray, the number of times it was repeated in the
            original SegArray.
        """
        # A value is a repeat when it equals its predecessor in the flat
        # values array ...
        same_as_prev = ak.zeros(self.values.size, dtype=ak.bool)
        same_as_prev[1:] = self.values[:-1] == self.values[1:]
        # ... except that the first value of a sub-array is never a repeat.
        same_as_prev[self.segments] = False
        keep = ~same_as_prev
        kept_values = self.values[keep]
        # Count surviving values per sub-array, then turn the counts into
        # start offsets of the condensed sub-arrays.
        kept_counts = self.grouping.sum(keep)[1]
        new_segments = ak.cumsum(kept_counts) - kept_counts
        norepeats = SegArray(new_segments, kept_values)
        if not return_multiplicity:
            return norepeats
        # The length of a run is the distance from its first index to the
        # first index of the next run (or to the end of the values array).
        run_starts = ak.arange(self.valsize)[keep]
        run_lengths = ak.zeros(kept_values.size, dtype=ak.int64)
        run_lengths[:-1] = run_starts[1:] - run_starts[:-1]
        run_lengths[-1] = self.valsize - run_starts[-1]
        return norepeats, SegArray(new_segments, run_lengths)
Ejemplo n.º 2
0
 def test_plus_minus(self):
     """Check result types and values of + and - between Datetime,
     Timedelta and their scalar counterparts.

     Relies on the fixtures self.dtvec1, self.dtvec2, self.dtscalar,
     self.tdvec1 and self.onesecond being set up elsewhere in the test class.
     """
     # Datetime + Datetime not supported
     with self.assertRaises(TypeError):
         self.dtvec1 + self.dtvec2
     # Datetime slice -> Datetime
     leading = self.dtvec1[1:]
     trailing = self.dtvec1[:-1]
     self.assertTrue(isinstance(leading, ak.Datetime) and isinstance(trailing, ak.Datetime))
     # Datetime - Datetime -> Timedelta
     diff = leading - trailing
     self.assertTrue(isinstance(diff, ak.Timedelta))
     self.assertTrue((diff == self.onesecond).all())
     # Datetime - DatetimeScalar -> Timedelta
     diff = self.dtvec1 - self.dtscalar
     trange = ak.timedelta_range(start=0, periods=100, freq='s')
     self.assertTrue(isinstance(diff, ak.Timedelta))
     self.assertTrue((diff == trange).all())
     # DatetimeScalar - Datetime -> Timedelta
     diff = self.dtscalar - self.dtvec1
     self.assertTrue(isinstance(diff, ak.Timedelta))
     self.assertTrue((diff == (-trange)).all())
     # Datetime + TimedeltaScalar -> Datetime
     t = (trailing + self.onesecond)
     self.assertTrue(isinstance(t, ak.Datetime))
     self.assertTrue((t == leading).all())
     # TimedeltaScalar + Datetime -> Datetime
     t = (self.onesecond + trailing)
     self.assertTrue(isinstance(t, ak.Datetime))
     self.assertTrue((t == leading).all())
     # Datetime - TimedeltaScalar -> Datetime
     t = leading - self.onesecond
     self.assertTrue(isinstance(t, ak.Datetime))
     self.assertTrue((t == trailing).all())
     # Datetime + Timedelta -> Datetime
     t = (trailing + self.tdvec1[1:])
     self.assertTrue(isinstance(t, ak.Datetime))
     self.assertTrue((t == leading).all())
     # Timedelta + Datetime -> Datetime
     t = (self.tdvec1[1:] + trailing)
     self.assertTrue(isinstance(t, ak.Datetime))
     self.assertTrue((t == leading).all())
     # Datetime - Timedelta -> Datetime
     t = (leading - self.tdvec1[1:])
     self.assertTrue(isinstance(t, ak.Datetime))
     # Inverse of the "Datetime + Timedelta" case above, so the value must
     # be checked as well as the type.
     self.assertTrue((t == trailing).all())
     # Timedelta + Timedelta -> Timedelta
     t = self.tdvec1 + self.tdvec1
     self.assertTrue(isinstance(t, ak.Timedelta))
     self.assertTrue((t == ak.Timedelta(2*ak.ones(100, dtype=ak.int64), unit='s')).all())
     # Timedelta + TimedeltaScalar -> Timedelta
     t = self.tdvec1 + self.onesecond
     self.assertTrue(isinstance(t, ak.Timedelta))
     self.assertTrue((t == ak.Timedelta(2*ak.ones(100, dtype=ak.int64), unit='s')).all())
     # Timedelta - Timedelta -> Timedelta
     t = self.tdvec1 - self.tdvec1
     self.assertTrue(isinstance(t, ak.Timedelta))
     self.assertTrue((t == ak.Timedelta(ak.zeros(100, dtype=ak.int64), unit='s')).all())
     # Timedelta - TimedeltaScalar -> Timedelta
     t = self.tdvec1 - self.onesecond
     self.assertTrue(isinstance(t, ak.Timedelta))
     self.assertTrue((t == ak.Timedelta(ak.zeros(100, dtype=ak.int64), unit='s')).all())
Ejemplo n.º 3
0
def gen_ranges(starts, ends):
    """ Build the flattened concatenation of the contiguous integer ranges
    [starts[i], ends[i]) together with the offset at which each range begins.

    Parameters
    ----------
    starts : pdarray, int64
        First value of each range
    ends : pdarray, int64
        End value (exclusive) of each range

    Returns
    -------
    segments : pdarray, int64
        Offset of the first element of each range in the flattened result
    ranges : pdarray, int64
        All ranges, laid out back to back in a single array

    Raises
    ------
    ValueError
        If starts and ends differ in size, or if any range is empty
        (end not greater than start).
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same size")
    if starts.size == 0:
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    lengths = ends - starts
    if not (lengths > 0).all():
        raise ValueError("all ends must be greater than starts")
    offsets = ak.cumsum(lengths) - lengths
    # Every element is its predecessor plus one ...
    steps = ak.ones(lengths.sum(), dtype=ak.int64)
    # ... except at the first element of a range, where the step jumps from
    # the last value of the previous range to the start of this one.
    jumps = starts[1:] - starts[:-1] - lengths[:-1] + 1
    steps[offsets] = ak.concatenate((ak.array([starts[0]]), jumps))
    return offsets, ak.cumsum(steps)
Ejemplo n.º 4
0
def gen_ranges(starts, ends, stride=1):
    """ Generate a segmented array of variable-length, strided ranges between pairs of start- and end-points.

    Parameters
    ----------
    starts : pdarray, int64
        The start value of each range
    ends : pdarray, int64
        The end value (exclusive) of each range
    stride: int
        Difference between successive elements of each range. May be
        negative, in which case each end is expected to lie below its start.

    Returns
    -------
    segments : pdarray, int64
        The starting index of each range in the resulting array
    ranges : pdarray, int64
        The actual ranges, flattened into a single array

    Raises
    ------
    ValueError
        If starts and ends differ in size, or if stride is zero.

    Notes
    -----
    A range that contains no element makes two entries of ``segments``
    coincide, so the scatter of range starts below would write two values to
    one position. Callers should therefore pass only non-empty ranges.
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same length")
    if starts.size == 0:
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    if stride == 0:
        raise ValueError("stride must be non-zero")
    # Number of elements is ceil(span / |stride|): the end is exclusive, so a
    # partial final step still contributes an element. Work on non-negative
    # operands so the result does not depend on how negatives are floored.
    step = abs(stride)
    span = (ends - starts) if stride > 0 else (starts - ends)
    lengths = (span + (step - 1)) // step
    segs = ak.cumsum(lengths) - lengths
    totlen = lengths.sum()
    # Inside a range, each element is its predecessor plus stride.
    slices = ak.zeros(totlen, dtype=ak.int64) + stride
    # At the first element of a range, jump from the last value of the
    # previous range (start + (length - 1) * stride) to the new start.
    diffs = ak.concatenate(
        (ak.array([starts[0]]),
         starts[1:] - starts[:-1] - (lengths[:-1] - 1) * stride))
    slices[segs] = diffs
    return segs, ak.cumsum(slices)
Ejemplo n.º 5
0
def check_correctness(dtype):
    """Check that ak.coargsort orders a random key correctly when it is
    paired with an all-zero key, in either key position.

    Parameters
    ----------
    dtype : str
        Either 'int64' or 'float64'.

    Raises
    ------
    ValueError
        If dtype is not one of the supported names.
    AssertionError
        If the permutation returned by coargsort does not sort the random key.
    """
    # Fail early with a clear message instead of an UnboundLocalError below.
    if dtype not in ('int64', 'float64'):
        raise ValueError("unsupported dtype: {!r}".format(dtype))
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    else:
        a = ak.randint(0, 1, N, dtype=ak.float64)
    z = ak.zeros(N, dtype=dtype)

    # The zero key carries no ordering information, so the result must be
    # sorted by the random key regardless of which position it occupies.
    perm = ak.coargsort([a, z])
    assert ak.is_sorted(a[perm])
    perm = ak.coargsort([z, a])
    assert ak.is_sorted(a[perm])
Ejemplo n.º 6
0
def check_correctness(dtype, random, seed):
    """Check that an arkouda scatter (``c[i] = v``) matches the NumPy result.

    Parameters
    ----------
    dtype : str
        'int64', 'float64' or 'bool' when random values are used; any NumPy
        dtype name otherwise.
    random : bool
        If True, scatter random values; otherwise scatter ones. Random values
        are also used whenever a seed is given.
    seed : int or None
        Seed for NumPy's global random generator; None leaves it untouched.

    Raises
    ------
    ValueError
        If random values are requested for an unsupported dtype.
    AssertionError
        If the arkouda and NumPy results differ.
    """
    Ni = 10**4
    Nv = 10**4
    if seed is not None:
        np.random.seed(seed)
    # make indices unique
    # if indices are non-unique, results of unordered scatter are variable
    npi = np.arange(Ni)
    np.random.shuffle(npi)
    if random or seed is not None:
        if dtype == 'int64':
            # Explicit dtype: the platform default integer may be 32-bit,
            # which cannot hold the upper bound.
            npv = np.random.randint(0, 2**32, Ni, dtype=np.int64)
        elif dtype == 'float64':
            npv = np.random.random(Ni)
        elif dtype == 'bool':
            # The upper bound is exclusive, so it must be 2 to produce both
            # False and True.
            npv = np.random.randint(0, 2, Ni, dtype=np.bool_)
        else:
            raise ValueError(
                "unsupported dtype for random values: {!r}".format(dtype))
    else:
        npv = np.ones(Ni, dtype=dtype)
    npc = np.zeros(Nv, dtype=dtype)
    aki = ak.array(npi)
    akc = ak.zeros(Nv, dtype=dtype)
    akv = ak.array(npv)
    npc[npi] = npv
    akc[aki] = akv
    assert np.allclose(npc, akc.to_ndarray())
Ejemplo n.º 7
0
def time_ak_scatter(isize, vsize, trials, dtype, random):
    """Time the arkouda scatter ``c[i] = v`` and print the average time and
    transfer rate.

    Parameters
    ----------
    isize : int
        Number of indices per locale
    vsize : int
        Number of destination values per locale
    trials : int
        How many times the scatter is repeated
    dtype : str
        Element type of the values; 'int64' or 'float64' when random is True
    random : bool
        If True scatter random values, otherwise ones
    """
    print(">>> arkouda scatter")
    num_locales = ak.get_config()["numLocales"]
    Ni = isize * num_locales
    Nv = vsize * num_locales
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(
        num_locales, Ni, Nv))
    # Index vector is always random
    i = ak.randint(0, Nv, Ni)
    c = ak.zeros(Nv, dtype=dtype)
    if not random:
        v = ak.ones(Ni, dtype=dtype)
    elif dtype == 'int64':
        v = ak.randint(0, 2**32, Ni)
    elif dtype == 'float64':
        v = ak.randint(0, 1, Ni, dtype=ak.float64)

    elapsed = []
    for _ in range(trials):
        began = time.time()
        c[i] = v
        finished = time.time()
        elapsed.append(finished - began)
    tavg = sum(elapsed) / trials

    print("Average time = {:.4f} sec".format(tavg))
    # Rate is reported as three index-sized arrays' worth of bytes per scatter
    bytes_per_sec = (i.size * i.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
Ejemplo n.º 8
0
    def append_single(self, x, prepend=False):
        '''
        Add one value to the end (or the beginning) of every sub-array.

        Parameters
        ----------
        x : pdarray or scalar
            Value to add. Anything with a ``size`` attribute must have one
            entry per sub-array and match the type and dtype of the values.
        prepend : bool
            If True, put the new value in front of each sub-array instead of
            after it.

        Returns
        -------
        SegArray
            New SegArray in which every sub-array is one element longer

        Raises
        ------
        ValueError
            If x has a size different from the number of sub-arrays
        TypeError
            If x differs in type or dtype from the values of this SegArray
        '''
        if hasattr(x, 'size'):
            if x.size != self.size:
                raise ValueError(
                    'Argument must be scalar or same size as SegArray')
            if type(x) != type(self.values) or x.dtype != self.dtype:
                raise TypeError(
                    'Argument type must match value type of SegArray')
        grown_lens = self.lengths + 1
        grown_segs = ak.cumsum(grown_lens) - grown_lens
        out = ak.zeros(grown_lens.sum(), dtype=self.dtype)
        # Position of the added value: first or last slot of each sub-array
        new_slot = grown_segs if prepend else grown_segs + grown_lens - 1
        out[new_slot] = x
        # Each original value moves right by the index of its sub-array,
        # i.e. by the number of values added in the sub-arrays before it ...
        shift = self.grouping.broadcast(ak.arange(self.size), permute=True)
        old_slot = ak.arange(self.valsize) + shift
        if prepend:
            # ... plus one more when its own sub-array gains a leading value.
            old_slot += 1
        out[old_slot] = self.values
        return SegArray(grown_segs, out)
Ejemplo n.º 9
0
 def from_multi_array(cls, m):
     """
     Construct a SegArray from a list of columns. This essentially transposes the input,
     resulting in an array of rows.

     Parameters
     ----------
     m : list of pdarray, or a single pdarray
         List of columns, the rows of which will form the sub-arrays of the output.
         A single pdarray is treated as one column, giving sub-arrays of length one.

     Returns
     -------
     SegArray
         Array of rows of input

     Raises
     ------
     ValueError
         If the columns do not all share one length and one dtype
     """
     # Wrap a lone pdarray so that it is copied as one column below, rather
     # than being indexed element by element.
     columns = [m] if isinstance(m, ak.pdarray) else m
     sizes = set(col.size for col in columns)
     if len(sizes) != 1:
         raise ValueError("All columns must have same length")
     size = sizes.pop()
     dtypes = set(col.dtype for col in columns)
     if len(dtypes) != 1:
         raise ValueError("All columns must have same dtype")
     dtype = dtypes.pop()
     ncols = len(columns)
     newsegs = ak.arange(size) * ncols
     newvals = ak.zeros(size * ncols, dtype=dtype)
     # Column j supplies element j of every row, i.e. every ncols-th value
     for j, col in enumerate(columns):
         newvals[j::ncols] = col
     return cls(newsegs, newvals)
Ejemplo n.º 10
0
def compare_strategies(length, ncat, op, dtype):
    """Time GroupBy construction and aggregation with the second GroupBy
    argument set to False ("global") and to True ("local"), printing each
    timing and whether the two strategies agree.

    Parameters
    ----------
    length : int
        Number of key/value pairs
    ncat : int
        Upper bound (exclusive) of the random keys
    op : str
        Aggregation operator passed to GroupBy.aggregate
    dtype : str
        'int64' or 'bool'; anything else produces linearly spaced values

    Returns
    -------
    tuple of float
        (global groupby, global reduce, local groupby, local reduce) seconds
    """
    keys = ak.randint(0, ncat, length)
    if dtype == 'int64':
        vals = ak.randint(0, length // ncat, length)
    elif dtype == 'bool':
        vals = ak.zeros(length, dtype='bool')
        for pos in np.random.randint(0, length, ncat // 2):
            vals[pos] = True
    else:
        vals = ak.linspace(-1, 1, length)

    def timed(label, action):
        # Print the label, run the action, print and return its duration
        print(label, end=' ')
        began = time()
        outcome = action()
        duration = time() - began
        print(duration)
        return outcome, duration

    gg, ggtime = timed("Global groupby", lambda: ak.GroupBy(keys, False))
    (gk, gv), grtime = timed("Global reduce", lambda: gg.aggregate(vals, op))
    lg, lgtime = timed("Local groupby", lambda: ak.GroupBy(keys, True))
    (lk, lv), lrtime = timed("Local reduce", lambda: lg.aggregate(vals, op))
    print(f"Keys match? {(gk == lk).all()}")
    print(f"Absolute diff of vals = {ak.abs(gv - lv).sum()}")
    return ggtime, grtime, lgtime, lrtime
Ejemplo n.º 11
0
def expand(vals, segs, size):
    """ Broadcast per-segment values to a segmented array. Equivalent
    to ak.GroupBy.broadcast(vals) but accepts explicit segments and
    size arguments.

    Parameters
    ----------
    vals : pdarray
        Values (one per segment) to broadcast over segments
    segs : pdarray
        Start indices of segments
    size : int
        Total size of result array

    Returns
    -------
    pdarray
        Values broadcasted out to segments. When there are no segments the
        result is an empty array with the dtype of vals.

    Raises
    ------
    ValueError
        If vals and segs differ in size, if size is too small to hold the
        last segment, or if segs does not start at zero or is not strictly
        increasing.
    """
    if vals.size != segs.size:
        raise ValueError("vals and segs must have same size")
    if vals.size == 0:
        # Keep the dtype of the input; ak.array([]) would not carry it over.
        return ak.zeros(0, dtype=vals.dtype)
    if size < segs.size or size <= segs.max():
        raise ValueError("Total size cannot be less than max segment")
    if segs[0] != 0 or not (segs[:-1] < segs[1:]).all():
        raise ValueError(
            "segs must start at zero and be monotonically increasing")
    # Place the change in value at each segment start; the running sum then
    # reproduces each value across the whole of its segment.
    temp = ak.zeros(size, dtype=vals.dtype)
    diffs = ak.concatenate((ak.array([vals[0]]), vals[1:] - vals[:-1]))
    temp[segs] = diffs
    return ak.cumsum(temp)
Ejemplo n.º 12
0
def interval_lookup(keys, values, arguments, fillvalue=-1):
    '''
    Evaluate a piecewise-constant function, given as one value per
    non-overlapping interval, at an array of arguments.

    Parameters
    ----------
    keys : 2-tuple of pdarray
        Non-overlapping, half-open intervals given as
        (lower_bounds_inclusive, upper_bounds_exclusive)
    values : pdarray
        The function value belonging to each interval in keys
    arguments : pdarray
        Points at which to evaluate the function
    fillvalue : scalar
        Result for arguments that fall in no interval. With None the
        result keeps its zero initialisation at those positions.

    Returns
    -------
    pdarray
        For each argument, the value of the interval containing it, or
        fillvalue when no interval contains it.
    '''
    interval_of = search_intervals(arguments, keys, assume_unique=True)
    # An index greater than -1 is treated as a hit
    matched = interval_of > -1
    out = ak.zeros(arguments.size, dtype=values.dtype)
    if fillvalue is not None:
        out.fill(fillvalue)
    out[matched] = values[interval_of[matched]]
    return out
Ejemplo n.º 13
0
 def _get_lengths(self):
     """Return the length of every sub-array as an int64 pdarray.

     A sub-array extends from its own start offset to the start offset of
     the next sub-array, or to the end of the values for the last one.
     """
     if self.size == 0:
         return ak.zeros(0, dtype=ak.int64)
     end_of_values = ak.array([self.valsize])
     if self.size == 1:
         # The only sub-array spans all values
         return end_of_values
     next_starts = ak.concatenate((self.segments[1:], end_of_values))
     return next_starts - self.segments
Ejemplo n.º 14
0
def create_ak_array(N, op, dtype, seed):
    """Create an arkouda array of N elements with the requested constructor.

    Parameters
    ----------
    N : int
        Number of elements
    op : str
        'zeros', 'ones' or 'randint'
    dtype
        Element type passed through to the constructor
    seed
        Seed passed to ak.randint; not used by 'zeros' and 'ones'

    Returns
    -------
    The array returned by the selected arkouda constructor

    Raises
    ------
    ValueError
        If op is not one of the supported names
    """
    if op == 'zeros':
        return ak.zeros(N, dtype=dtype)
    if op == 'ones':
        return ak.ones(N, dtype=dtype)
    if op == 'randint':
        return ak.randint(0, 2**32, N, dtype=dtype, seed=seed)
    # Previously an unknown op fell through to an unbound local variable.
    raise ValueError("unsupported op: {!r}".format(op))
Ejemplo n.º 15
0
def check_zeros(N):
    """Check that ak.zeros(N) equals a NumPy zeros array of the same size
    that has been transferred to arkouda.

    Parameters
    ----------
    N : int
        Number of elements to create and compare

    Returns
    -------
    The result of pass_fail applied to the element-wise comparison
    """
    # create np version and transfer it (size was hard-coded before; use N)
    a = ak.array(np.zeros(N))
    # create ak version
    b = ak.zeros(N)
    c = a == b
    return pass_fail(c.all())
Ejemplo n.º 16
0
def check_zeros(N):
    """Check that ak.zeros(N), pulled back as an ndarray, equals np.zeros(N).

    Parameters
    ----------
    N : int
        Number of elements to create and compare

    Returns
    -------
    The result of pass_fail applied to the element-wise comparison
    """
    expected = np.zeros(N)
    actual = ak.zeros(N).to_ndarray()
    return pass_fail((expected == actual).all())
Ejemplo n.º 17
0
def check_correctness(dtype, seed):
    """Run ak.coargsort on a random key paired with a constant key, in both
    key orders, and verify the ordering for numeric dtypes.

    Parameters
    ----------
    dtype : str
        'int64', 'float64' or 'str'
    seed
        Seed passed to the random array generators

    Raises
    ------
    ValueError
        If dtype is not one of the supported names
    AssertionError
        If, for a numeric dtype, the permutation does not sort the random key

    Notes
    -----
    For 'str' the sort is only executed; its result is not checked.
    """
    # Fail early with a clear message instead of an UnboundLocalError below.
    if dtype not in ('int64', 'float64', 'str'):
        raise ValueError("unsupported dtype: {!r}".format(dtype))
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
        z = ak.zeros(N, dtype=dtype)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        z = ak.zeros(N, dtype=dtype)
    else:
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
        z = ak.cast(ak.zeros(N), 'str')

    numeric = dtype in ('int64', 'float64')
    perm = ak.coargsort([a, z])
    if numeric:
        assert ak.is_sorted(a[perm])
    perm = ak.coargsort([z, a])
    if numeric:
        assert ak.is_sorted(a[perm])
Ejemplo n.º 18
0
def generate_arrays(length, nkeys, nvals, dtype='int64'):
    """Create a random key array and a matching value array.

    Parameters
    ----------
    length : int
        Number of elements in both arrays
    nkeys : int
        Upper bound (exclusive) of the random keys
    nvals : int
        Upper bound (exclusive) of the random values; used for 'int64' only
    dtype : str
        'int64' gives random integers, 'bool' gives mostly-False flags with
        up to nkeys // 2 random positions set, anything else gives values
        linearly spaced over [-1, 1]

    Returns
    -------
    keys, vals
        The two generated arrays
    """
    keys = ak.randint(0, nkeys, length)
    if dtype == 'int64':
        return keys, ak.randint(0, nvals, length)
    if dtype == 'bool':
        flags = ak.zeros(length, dtype='bool')
        for pos in np.random.randint(0, length, nkeys // 2):
            flags[pos] = True
        return keys, flags
    return keys, ak.linspace(-1, 1, length)
Ejemplo n.º 19
0
 def __eq__(self, other):
     """Compare sub-arrays element-wise with those of another SegArray.

     Returns a boolean array with one entry per sub-array, True where the
     sub-arrays have the same length and all their values are equal.
     Returns NotImplemented when other is not a SegArray.
     """
     if not isinstance(other, SegArray):
         return NotImplemented
     eq = ak.zeros(self.size, dtype=ak.bool)
     # Only sub-arrays of equal length can be equal; the rest stay False.
     leneq = self.lengths == other.lengths
     if leneq.sum() > 0:
         selfcmp = self[leneq]
         othercmp = other[leneq]
         # Reduce with the filtered SegArray: the compared values follow its
         # layout, not that of self, whenever some lengths differ.
         # NOTE(review): assumes all(x) reduces x per sub-array of the
         # receiver; it is defined outside this block - confirm.
         intersection = selfcmp.all(selfcmp.values == othercmp.values)
         eq[leneq] = intersection
     return eq
Ejemplo n.º 20
0
def IP_like(N):
    '''
    Yield one labelled pair of int64 arrays resembling a 90/10 mix of
    IPv4 and IPv6 addresses.

    IPv4-like rows carry a 32-bit value in the first array and zero in the
    second; IPv6-like rows carry full-range values in both. Each array has
    N // 2 elements sampled with replacement from N // 20 unique addresses.
    '''
    multiplicity = 10
    nunique = N // (2 * multiplicity)
    # First generate unique addresses, then sample with replacement
    word1 = ak.zeros(nunique, dtype=ak.int64)
    word2 = ak.zeros(nunique, dtype=ak.int64)
    is_v4 = ak.uniform(nunique) < 0.9
    count_v4 = is_v4.sum()
    is_v6 = ~is_v4
    count_v6 = is_v4.size - count_v4
    word1[is_v4] = ak.randint(0, 2**32, count_v4)
    word1[is_v6] = ak.randint(-2**63, 2**63, count_v6)
    word2[is_v6] = ak.randint(-2**63, 2**63, count_v6)
    picks = ak.randint(0, nunique, N // 2)
    yield 'IP-like 2*int64', (word1[picks], word2[picks])
Ejemplo n.º 21
0
def in1dmulti(a, b, assume_unique=False):
    """ Multi-column counterpart of ak.in1d: for every row of a, report whether it also occurs as a row of b.

    Parameters
    ----------
    a : list of pdarrays
        Columns whose rows are tested for membership in b
    b : list of pdarrays
        Columns whose rows form the set to test against
    assume_unique : bool
        If true, assume rows of a and b are each unique and sorted. By default, sort and unique them explicitly.

    Returns
    -------
    pdarray, bool
        True for each row in a that is contained in b

    Notes:
        Only works for pdarrays of int64 dtype, Strings, or Categorical
    """
    if assume_unique:
        ua, ub = a, b
    else:
        ag = ak.GroupBy(a)
        ua = ag.unique_keys
        bg = ak.GroupBy(b)
        ub = bg.unique_keys
    # Stack the unique rows of a on top of those of b; a row that shows up
    # twice in the stack is present in both inputs.
    stacked = [ak.concatenate(pair) for pair in zip(ua, ub)]
    g = ak.GroupBy(stacked)
    _, counts = g.count()
    in_both = ak.zeros(stacked[0].size, dtype=ak.bool)
    in_both[g.permutation] = (g.broadcast(1 * (counts == 2)) == 1)
    if assume_unique:
        # The leading part of the stack corresponds to the rows of a
        return in_both[:a[0].size]
    # Map the answer for each unique row of a back onto all original rows
    result = ak.zeros(a[0].size, dtype=ak.bool)
    result[ag.permutation] = (ag.broadcast(1 * in_both[:ua[0].size]) == 1)
    return result
Ejemplo n.º 22
0
def check_float(N):
    """Check ak.coargsort on float64 keys, alone and combined with other
    random or all-zero keys.

    Parameters
    ----------
    N : int
        Number of elements in each key array

    Raises
    ------
    AssertionError
        If a returned permutation does not sort the expected key
    """
    a = ak.randint(0, 1, N, dtype=ak.float64)
    n = ak.randint(-1, 1, N, dtype=ak.float64)
    z = ak.zeros(N, dtype=ak.float64)

    # (keys handed to coargsort, key that must come out sorted)
    cases = (
        ([a], a),
        ([a, n], a),
        ([n, a], n),
        ([z, a], a),
        ([z, n], n),
    )
    for sort_keys, expected_sorted in cases:
        perm = ak.coargsort(sort_keys)
        assert ak.is_sorted(expected_sorted[perm])
Ejemplo n.º 23
0
    def get_jth(self, j, return_origins=True, compressed=False, default=0):
        """
        Pick the element at position j from every sub-array that has one.

        Parameters
        ----------
        j : int
            Position of the value to take from each sub-array. A negative j
            counts backwards from the end of each sub-array.
        return_origins : bool
            If True, also return a logical index marking the sub-arrays for
            which j is in bounds
        compressed : bool
            If False, the result has one entry per sub-array and holds the
            default value where j is out of bounds. If True, the result holds
            only the values of sub-arrays where j is in bounds.
        default : scalar
            With compressed=False, the value reported for sub-arrays where j
            is out of bounds

        Returns
        -------
        val : pdarray
            compressed=False: the j-th value of each sub-array where j is in
            bounds, and the default value elsewhere.
            compressed=True: the j-th values of only those sub-arrays where j
            is in bounds
        origin_indices : pdarray, bool
            Only returned when return_origins=True. True where j is in bounds
            for the sub-array.
        """
        in_bounds, position = self._normalize_index(j)
        # Flat index of the requested element for the sub-arrays that have it
        flat_index = (self.segments + position)[in_bounds]
        picked = self.values[flat_index]
        if compressed:
            result = picked
        else:
            result = ak.zeros(self.size, dtype=self.dtype) + default
            result[in_bounds] = picked
        return (result, in_bounds) if return_origins else result
Ejemplo n.º 24
0
def expand(size, segs, vals):
    """ Expand an array with values placed into the indicated segments.

    Parameters
    ----------
    size : int
        The size of the array to be expanded
    segs : ak.pdarray
        The indices where the values should be placed, i.e. where each
        segment starts
    vals : ak.pdarray
        The values to be placed in each segment

    Returns
    -------
    pdarray
        The expanded array: each value repeated from its segment start up to
        the next segment start. Positions before the first segment start are
        zero, as is the whole array when vals is empty.

    """
    temp = ak.zeros(size, vals.dtype)
    if vals.size == 0:
        # Nothing to place; avoids indexing vals[0] on an empty array.
        return temp
    # Store the change in value at each segment start; the running sum then
    # carries each value forward until the next segment begins.
    diffs = ak.concatenate((ak.array([vals[0]]), vals[1:] - vals[:-1]))
    temp[segs] = diffs
    return ak.cumsum(temp)
Ejemplo n.º 25
0
 def _convert_strings(self, s):
     '''
     Turn strings of field names into int64 bit vectors, setting the bit
     given by each field's shift when that field is present.
     '''
     # Initialize to zero
     values = ak.zeros(s.size, dtype=ak.int64)
     if self.separator == '':
         # When separator is empty, field names are guaranteed to be single
         # characters, so a substring test detects them
         has_field = s.contains
     else:
         # When separator is non-empty, split on it
         sf, segs = s.flatten(self.separator, return_segments=True)
         # Create a grouping to map split fields back to originating string
         orig = ak.broadcast(segs, ak.arange(segs.size), sf.size)
         g = ak.GroupBy(orig)

         def has_field(name):
             # Does any split field of the originating string equal name?
             return g.any(sf == name)[1]

     for name, shift in zip(self.names, self.shifts):
         values = values | ak.where(has_field(name), 1 << shift, 0)
     return values
Ejemplo n.º 26
0
def check_int(N):
    """Check ak.coargsort on int64 keys of several magnitudes, alone, in
    combination, and behind all-zero leading keys.

    Parameters
    ----------
    N : int
        Number of elements in each key array

    Raises
    ------
    AssertionError
        If a returned permutation does not sort the expected key
    """
    def assert_sorts(sort_keys, expected_sorted):
        # Sort by sort_keys and require expected_sorted to come out ordered
        perm = ak.coargsort(sort_keys)
        assert ak.is_sorted(expected_sorted[perm])

    z = ak.zeros(N, dtype=ak.int64)

    # Keys bounded by 2**16
    a2 = ak.randint(0, 2**16, N)
    b2 = ak.randint(0, 2**16, N)
    c2 = ak.randint(0, 2**16, N)
    d2 = ak.randint(0, 2**16, N)
    n2 = ak.randint(-(2**15), 2**15, N)

    assert_sorts([a2], a2)
    assert_sorts([n2], n2)
    assert_sorts([a2, b2, c2, d2], a2)
    # Zero keys carry no order, so the first non-zero key decides
    assert_sorts([z, b2, c2, d2], b2)
    assert_sorts([z, z, c2, d2], c2)
    assert_sorts([z, z, z, d2], d2)

    # Keys bounded by 2**32
    a4 = ak.randint(0, 2**32, N)
    b4 = ak.randint(0, 2**32, N)
    n4 = ak.randint(-(2**31), 2**31, N)

    assert_sorts([a4], a4)
    assert_sorts([n4], n4)
    assert_sorts([a4, b4], a4)
    assert_sorts([b4, a4], b4)

    # Keys bounded by 2**64
    a8 = ak.randint(0, 2**64, N)
    b8 = ak.randint(0, 2**64, N)
    n8 = ak.randint(-(2**63), 2**64, N)

    assert_sorts([a8], a8)
    assert_sorts([n8], n8)
    assert_sorts([b8, a8], b8)

    from itertools import permutations

    # Every ordering of keys with different magnitudes
    for key_order in permutations([a2, a4, a8]):
        assert_sorts(key_order, key_order[0])
Ejemplo n.º 27
0
def inner_join(left, right, wherefunc=None, whereargs=None):
    '''Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc. Required when
        wherefunc is given; the first must have the size of left
        and the second the size of right.

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If wherefunc does not take exactly two parameters, if
        whereargs is missing, is not of length two or has arrays of
        the wrong size, or if a trial call of wherefunc on the
        leading elements of whereargs raises.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`

    '''
    from inspect import signature
    # Number of leading elements (at most 5) used for the trial call below
    sample = min((left.size, right.size, 5))
    if wherefunc is not None:
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        # Trial evaluation on a small sample so that a broken wherefunc is
        # reported before the join itself is computed
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e

    # Need dense 0-up right index, to filter out left not in right
    # NOTE(review): right_align is defined elsewhere; from its use here, keep
    # looks like a boolean mask over left and denseLeft/denseRight like
    # 0-up codes for the retained left items and for right - confirm.
    keep, (denseLeft, denseRight) = right_align(left, right)
    if keep.sum() == 0:
        # Intersection is empty
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Turn the mask into the positions it selects
    keep = ak.arange(keep.size)[keep]
    # GroupBy right
    byRight = ak.GroupBy(denseRight)
    # Get segment boundaries (starts, ends) of right for each left item
    # (the total size is appended so that the last group also has an end)
    rightSegs = ak.concatenate((byRight.segments, ak.array([denseRight.size])))
    starts = rightSegs[denseLeft]
    ends = rightSegs[denseLeft+1]
    # Total number of candidate pairs; only used by the disabled print below
    fullSize = (ends - starts).sum()
    # print(f"{left.size+right.size:,} input rows --> {fullSize:,} joins ({fullSize/(left.size+right.size):.1f} x) ")
    # gen_ranges for gather of right items
    fullSegs, ranges = gen_ranges(starts, ends)
    # Evaluate where clause
    if wherefunc is None:
        filtRanges = ranges
        filtSegs = fullSegs
        keep12 = keep
    else:
        # Gather right whereargs
        rightWhere = whereargs[1][byRight.permutation][ranges]
        # Expand left whereargs
        leftWhere = ak.broadcast(fullSegs, whereargs[0][keep], ranges.size)
        # Evaluate wherefunc and filter ranges, recompute segments
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        filtRanges = ranges[whereSatisfied]
        # Exclusive running count of satisfied pairs: the position each
        # segment start takes after filtering
        scan = ak.cumsum(whereSatisfied) - whereSatisfied
        filtSegsWithZeros = scan[fullSegs]    
        # Surviving pairs per segment; the last one runs to the total count
        filtSegSizes = ak.concatenate((filtSegsWithZeros[1:] - filtSegsWithZeros[:-1], 
                                       ak.array([whereSatisfied.sum() - filtSegsWithZeros[-1]])))
        # Drop segments (left items) that lost all of their pairs
        keep2 = (filtSegSizes > 0)
        filtSegs = filtSegsWithZeros[keep2]
        keep12 = keep[keep2]
    # Gather right inds and expand left inds
    rightInds = byRight.permutation[filtRanges]
    leftInds = ak.broadcast(filtSegs, ak.arange(left.size)[keep12], filtRanges.size)
    return leftInds, rightInds
Ejemplo n.º 28
0
    def __init__(self, segments, values, copy=False, lengths=None,
                 grouping=None):
        """
        An array of variable-length arrays, also called a skyline array or ragged array.

        Parameters
        ----------
        segments : pdarray, int64
            Start index of each sub-array in the flattened values array
        values : pdarray
            The flattened values of all sub-arrays
        copy : bool
            If True, store copies of the input arrays; otherwise, store
            references to them.
        lengths : optional
            Precomputed sub-array lengths; computed from segments when None
        grouping : optional
            Precomputed grouping of values by sub-array; built when None

        Raises
        ------
        TypeError
            If segments is not an int64 pdarray
        ValueError
            If segments is not sorted and unique, does not start at zero,
            reaches values.size or beyond, or is empty while values is not

        Notes
        -----
        Keyword args 'lengths' and 'grouping' are not user-facing. They are used by the
        attach method.
        """
        if not isinstance(segments, ak.pdarray) or segments.dtype != ak.int64:
            raise TypeError("Segments must be int64 pdarray")
        strictly_sorted = (ak.is_sorted(segments)
                           and ak.unique(segments).size == segments.size)
        if not strictly_sorted:
            raise ValueError("Segments must be unique and in sorted order")
        if segments.size == 0:
            if values.size > 0:
                raise ValueError(
                    "Cannot have non-empty values with empty segments")
        elif segments.min() != 0 or segments.max() >= values.size:
            raise ValueError(
                "Segments must start at zero and be less than values.size")
        self.segments = segments[:] if copy else segments
        self.values = values[:] if copy else values
        self.size = segments.size
        self.valsize = values.size
        self.lengths = self._get_lengths() if lengths is None else lengths
        self.dtype = values.dtype
        if grouping is not None:
            self.grouping = grouping
        elif self.size == 0:
            self.grouping = ak.GroupBy(ak.zeros(0, dtype=ak.int64))
        else:
            # Treat each sub-array as a group, for grouped aggregations:
            # label every value with the index of its sub-array
            labels = ak.broadcast(self.segments, ak.arange(self.size),
                                  self.valsize)
            self.grouping = ak.GroupBy(labels)
Ejemplo n.º 29
0
def inner_join2(left, right, wherefunc=None, whereargs=None, forceDense=False):
    '''Perform inner join on values in <left> and <right>,
    using conditions defined by <wherefunc> evaluated on
    <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc. Required when wherefunc
        is given; the first must have the same size as <left> and the
        second the same size as <right>.
    forceDense : bool, optional
        If True, always remap the shared values to dense, 0-up codes
        before counting occurrences. If False (default), the remap is
        only performed when the shared values cannot be used directly
        as indices: when any shared value is negative, or when the
        largest shared value exceeds 3*(left.size + right.size).

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If left or right is not a pdarray(int64), if wherefunc does not
        accept exactly two arguments, if whereargs is missing or has the
        wrong shape, or if wherefunc fails on a small sample of whereargs.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`

    If left and right share no values, two empty int64 arrays are returned.
    '''
    leftOk = isinstance(left, ak.pdarray) and left.dtype == ak.int64
    rightOk = isinstance(right, ak.pdarray) and right.dtype == ak.int64
    if not leftOk or not rightOk:
        raise ValueError("left and right must be pdarray(int64)")
    if wherefunc is not None:
        from inspect import signature
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        # Dry-run wherefunc on a few leading elements so that a broken
        # function is reported before the (potentially large) join is built
        sample = min((left.size, right.size, 5))
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Only join on intersection
    inter = ak.intersect1d(left, right)
    if inter.size == 0:
        # Nothing to join: return empty index arrays instead of taking
        # max() of, and grouping, empty arrays below
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Indices of left values present in intersection
    leftInds = ak.arange(left.size)[ak.in1d(left, inter)]
    # Left vals in intersection
    leftFilt = left[leftInds]
    # Indices of right vals present in inter
    rightInds = ak.arange(right.size)[ak.in1d(right, inter)]
    # Right vals in inter
    rightFilt = right[rightInds]
    byLeft = ak.GroupBy(leftFilt)
    byRight = ak.GroupBy(rightFilt)
    minVal = inter.min()
    maxVal = inter.max()
    # The sparse path indexes count arrays of size maxVal + 1 directly by
    # value, so it needs every shared value to lie in [0, maxVal] and is only
    # worthwhile when maxVal is not much larger than the inputs themselves.
    if forceDense or minVal < 0 or maxVal > 3*(left.size + right.size):
        # Remap intersection to dense, 0-up codes
        # Replace left values with dense codes
        uniqLeftVals = byLeft.unique_keys
        uniqLeftCodes = ak.arange(inter.size)[ak.in1d(inter, uniqLeftVals)]
        leftCodes = ak.zeros_like(leftFilt) - 1
        leftCodes[byLeft.permutation] = byLeft.broadcast(uniqLeftCodes, permute=False)
        # Replace right values with dense codes
        uniqRightVals = byRight.unique_keys
        uniqRightCodes = ak.arange(inter.size)[ak.in1d(inter, uniqRightVals)]
        rightCodes = ak.zeros_like(rightFilt) - 1
        rightCodes[byRight.permutation] = byRight.broadcast(uniqRightCodes, permute=False)
        countSize = inter.size
    else:
        # Values are small and non-negative: use them as their own codes
        uniqLeftCodes = byLeft.unique_keys
        uniqRightCodes = byRight.unique_keys
        leftCodes = leftFilt
        rightCodes = rightFilt
        countSize = maxVal + 1
    # Expand indices to product domain
    # First count occurrences of each code in left and right
    leftCounts = ak.zeros(countSize, dtype=ak.int64)
    leftCounts[uniqLeftCodes] = byLeft.count()[1]
    rightCounts = ak.zeros(countSize, dtype=ak.int64)
    rightCounts[uniqRightCodes] = byRight.count()[1]
    # Repeat each left index as many times as that code occurs in right
    prodLeft = rightCounts[leftCodes]
    leftFullInds = ak.broadcast(ak.cumsum(prodLeft)-prodLeft, leftInds, prodLeft.sum())
    # Repeat each right index as many times as that code occurs in left
    prodRight = leftCounts[rightCodes]
    rightFullInds = ak.broadcast(ak.cumsum(prodRight)-prodRight, rightInds, prodRight.sum())
    # Evaluate where clause
    if wherefunc is None:
        return leftFullInds, rightFullInds
    # Gather whereargs at the candidate pairs, then keep only the pairs
    # for which wherefunc holds
    leftWhere = whereargs[0][leftFullInds]
    rightWhere = whereargs[1][rightFullInds]
    whereSatisfied = wherefunc(leftWhere, rightWhere)
    return leftFullInds[whereSatisfied], rightFullInds[whereSatisfied]
Ejemplo n.º 30
0
    def concat(cls, x, axis=0, ordered=True):
        """
        Join several SegArrays into a single SegArray.

        Parameters
        ----------
        x : sequence of SegArray
            The SegArrays to join. Must be non-empty and share one dtype.
        axis : 0 or 1
            0 stacks the inputs vertically (the sub-arrays of each input
            follow those of the previous one). 1 joins horizontally (the
            i-th sub-arrays of all inputs are joined into one sub-array),
            which requires every input to have the same size.
        ordered : bool
            Must be True. Accepted for compatibility only, because unordered
            concatenation is not yet supported.

        Returns
        -------
        SegArray
            The joined result. NotImplemented is returned instead if any
            element of x is not an instance of cls. With axis=1 and inputs
            of size zero, the first input is returned as-is.

        Raises
        ------
        ValueError
            If ordered is False, x is empty, dtypes differ, sizes differ
            with axis=1, or axis is neither 0 nor 1.
        """
        if not ordered:
            raise ValueError(
                "Unordered concatenation not yet supported on SegArray; use ordered=True."
            )
        if len(x) == 0:
            raise ValueError("Empty sequence passed to concat")
        if not all(isinstance(arr, cls) for arr in x):
            return NotImplemented
        if len({arr.dtype for arr in x}) != 1:
            raise ValueError(
                "SegArrays must all have same dtype to concatenate")

        if axis == 0:
            # Vertical: shift each input's segment offsets by the number of
            # values that precede it; the values themselves are just chained.
            offset = 0
            shifted_segments = []
            for arr in x:
                shifted_segments.append(arr.segments + offset)
                offset += arr.valsize
            all_values = [arr.values for arr in x]
            return cls(ak.concatenate(shifted_segments),
                       ak.concatenate(all_values))

        if axis != 1:
            raise ValueError(
                "Supported values for axis are 0 (vertical concat) or 1 (horizontal concat)"
            )

        # Horizontal: every input must contribute the same number of sub-arrays
        distinct_sizes = {arr.size for arr in x}
        if len(distinct_sizes) != 1:
            raise ValueError(
                "SegArrays must all have same size to concatenate with axis=1"
            )
        if distinct_sizes.pop() == 0:
            return x[0]

        out_dtype = list(x)[0].dtype
        # Length of each output sub-array is the sum of the input lengths
        combined_lens = sum(arr.lengths for arr in x)
        combined_segs = ak.cumsum(combined_lens) - combined_lens
        # Mask of output sub-arrays with positive combined length; the final
        # sub-array is always kept
        has_data = ak.concatenate(
            (combined_segs[:-1] < combined_segs[1:], ak.array([True])))
        # Write position, per kept sub-array, for the next input's values
        cursor = combined_segs[has_data]
        combined_vals = ak.zeros(combined_lens.sum(), dtype=out_dtype)
        for arr in x:
            # Build a +1/-1 marker array whose running sum equals 1 exactly on
            # the output slots that belong to this input
            marker = ak.zeros(combined_vals.size + 1, dtype=ak.int64)
            marker[cursor] += 1
            seg_lens = arr.lengths[has_data]
            marker[cursor + seg_lens] -= 1
            owned = (ak.cumsum(marker[:-1]) == 1)
            combined_vals[owned] = arr.values
            # Advance past what this input wrote, ready for the next input
            cursor += seg_lens
        return cls(combined_segs, combined_vals, copy=False)