Example #1
def check_correctness(dtype, seed):
    arrays, totalbytes = generate_arrays(1000, 2, dtype, seed)
    g = ak.GroupBy(arrays)
    perm = ak.argsort(ak.randint(0, 2**32, arrays[0].size))
    g2 = ak.GroupBy([a[perm] for a in arrays])
    assert all((uk == uk2).all() for uk, uk2 in zip(g.unique_keys, g2.unique_keys))
    assert (g.segments == g2.segments).all()
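This test exercises the fact that a GroupBy's unique_keys and segments depend only on the key values, not on their order. A minimal standalone sketch of the same invariant (assumes arkouda is installed and a server is running; the key distribution is illustrative):

import arkouda as ak

ak.connect()  # connect to a running arkouda_server (default host/port)
keys = ak.randint(0, 10, 1000)                        # 1,000 random keys in [0, 10)
perm = ak.argsort(ak.randint(0, 2**32, keys.size))    # a random permutation
g1 = ak.GroupBy(keys)
g2 = ak.GroupBy(keys[perm])
# Same unique keys and same segment offsets, regardless of input order
assert (g1.unique_keys == g2.unique_keys).all()
assert (g1.segments == g2.segments).all()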
Example #2
def compare_strategies(length, ncat, op, dtype):
    keys = ak.randint(0, ncat, length)
    if dtype == 'int64':
        vals = ak.randint(0, length // ncat, length)
    elif dtype == 'bool':
        vals = ak.zeros(length, dtype='bool')
        for i in np.random.randint(0, length, ncat // 2):
            vals[i] = True
    else:
        vals = ak.linspace(-1, 1, length)
    print("Global groupby", end=' ')
    start = time()
    gg = ak.GroupBy(keys, False)
    ggtime = time() - start
    print(ggtime)
    print("Global reduce", end=' ')
    start = time()
    gk, gv = gg.aggregate(vals, op)
    grtime = time() - start
    print(grtime)
    print("Local groupby", end=' ')
    start = time()
    lg = ak.GroupBy(keys, True)
    lgtime = time() - start
    print(lgtime)
    print("Local reduce", end=' ')
    start = time()
    lk, lv = lg.aggregate(vals, op)
    lrtime = time() - start
    print(lrtime)
    print(f"Keys match? {(gk == lk).all()}")
    print(f"Absolute diff of vals = {ak.abs(gv - lv).sum()}")
    return ggtime, grtime, lgtime, lrtime
Example #3
File: util.py Project: Bears-R-Us/arkouda
def most_common(g, values):
    '''Find the most common value for each key in a GroupBy object.
    
    Parameters
    ----------
    g : ak.GroupBy
        Grouping of keys
    values : array-like
        Values in which to find the most common entry per group

    Returns
    -------
    most_common_values : array-like
        The most common value in each group, one per unique key, in the
        order of g.unique_keys
    '''
    # Give each row the integer index of its group
    keyidx = g.broadcast(ak.arange(g.unique_keys[0].size), permute=True)
    # Pair values with group indices and group by (key index, value)
    bykeyval = ak.GroupBy([keyidx, values])
    # Count the number of records for each (key index, value) pair
    (ki, uval), count = bykeyval.count()
    # Group by key index alone (already sorted by the previous GroupBy)
    bykey = ak.GroupBy(ki, assume_sorted=True)
    # Find the index of the most frequent value for each key
    _, topidx = bykey.argmax(count)
    # Gather the most frequent values
    return uval[topidx]
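A hypothetical usage sketch for most_common (the toy arrays are illustrative; note that the grouping must be built from a list of key arrays so that unique_keys is indexable):

k1 = ak.array([0, 0, 0, 1, 1, 1, 1])
k2 = ak.array([9, 9, 9, 9, 9, 9, 9])
vals = ak.array([5, 5, 7, 3, 6, 6, 6])
g = ak.GroupBy([k1, k2])
print(most_common(g, vals))   # expected: [5, 6], the modal value for each (k1, k2) group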
Example #4
def run_test(levels, verbose=False):
    d = make_arrays()
    df = pd.DataFrame(d)
    akdf = {k: ak.array(v) for k, v in d.items()}
    if levels == 1:
        akg = ak.GroupBy(akdf['keys'])
        keyname = 'keys'
    elif levels == 2:
        akg = ak.GroupBy([akdf['keys'], akdf['keys2']])
        keyname = ['keys', 'keys2']
    tests = 0
    failures = 0
    not_impl = 0
    if verbose: print(f"Doing .count()")
    tests += 1
    pdkeys, pdvals = groupby_to_arrays(df, keyname, 'int64', 'count', levels)
    akkeys, akvals = akg.count()
    akvals = akvals.to_ndarray()
    failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
    for vname in ('int64', 'float64', 'bool'):
        for op in ak.GroupBy.Reductions:
            if verbose: print(f"\nDoing aggregate({vname}, {op})")
            tests += 1
            do_check = True
            try:
                pdkeys, pdvals = groupby_to_arrays(df, keyname, vname, op,
                                                   levels)
            except Exception as E:
                if verbose: print("Pandas does not implement")
                do_check = False
            try:
                akkeys, akvals = akg.aggregate(akdf[vname], op)
                akvals = akvals.to_ndarray()
            except RuntimeError as E:
                if verbose: print("Arkouda error: ", E)
                not_impl += 1
                do_check = False
                continue
            if not do_check:
                continue
            if op.startswith('arg'):
                pdextrema = df[vname][pdvals]
                akextrema = akdf[vname][ak.array(akvals)].to_ndarray()
                if not np.allclose(pdextrema, akextrema):
                    print("Different argmin/argmax: Arkouda failed to find an extremum")
                    print("pd: ", pdextrema)
                    print("ak: ", akextrema)
                    failures += 1
            else:
                failures += compare_keys(pdkeys, akkeys, levels, pdvals,
                                         akvals)
    print(
        f"{tests - failures - not_impl} / {tests - not_impl} passed, {failures} errors, {not_impl} not implemented"
    )
    return failures
Example #5
File: join.py Project: Bears-R-Us/arkouda
def compute_join_size(a, b):
    '''Compute the internal size of a hypothetical join between a and b. Returns
    both the number of elements and number of bytes required for the join.
    '''
    bya = ak.GroupBy(a)
    ua, asize = bya.count()
    byb = ak.GroupBy(b)
    ub, bsize = byb.count()
    afact = asize[ak.in1d(ua, ub)]
    bfact = bsize[ak.in1d(ub, ua)]
    nelem = (afact*bfact).sum()
    nbytes = 3*8*nelem
    return nelem, nbytes
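A small usage sketch for compute_join_size (illustrative sizes; assumes a connected client):

a = ak.randint(0, 100, 10000)
b = ak.randint(0, 100, 5000)
nelem, nbytes = compute_join_size(a, b)
print("join would materialize {:,} pairs (~{:.2f} GiB internally)".format(
    nelem, nbytes / 2**30))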
Example #6
    def GroupBy(self, keys, use_series=False):
        """
        Group the dataframe by a column or a list of columns.

        Parameters
        ----------
        keys : string or list
            An (ordered) list of column names or a single string to group by.
        use_series : bool, default False
            If True, return an akutil.GroupBy object; otherwise, return an
            arkouda GroupBy object.

        Returns
        -------
        GroupBy
            Either an akutil GroupBy or an arkouda GroupBy object.

        See Also
        --------
        arkouda.GroupBy
        """

        self.update_size()
        if isinstance(keys, str):
            cols = self.data[keys]
        elif not isinstance(keys, list):
            raise TypeError(
                "keys must be a column name or a list of column names")
        elif len(keys) == 1:
            cols = self.data[keys[0]]
        else:
            cols = [self.data[col] for col in keys]
        gb = ak.GroupBy(cols)
        if use_series:
            gb = GroupBy(gb, self)
        return gb
Example #7
    def drop_duplicates(self, subset=None, keep='first'):
        """
        Drops duplicated rows and returns the resulting DataFrame.

        If a subset of the columns is provided, then only one instance of each
        duplicated row will be returned (keep determines which row).

        Parameters
        ----------
        subset : iterable of str, optional
            Column names to use for deduplication.
        keep : {'first', 'last'}, default 'first'
            Determines which duplicates (if any) to keep.

        Returns
        -------
        DataFrame
            DataFrame with duplicates removed.
        """
        if self._empty:
            return self

        if not subset:
            subset = self._columns[1:]

        if len(subset) == 1:
            if subset[0] not in self.data:
                raise KeyError("{} is not a column in the DataFrame.".format(
                    subset[0]))
            _ = ak.GroupBy(self.data[subset[0]])

        else:
            for col in subset:
                if col not in self.data:
                    raise KeyError(
                        "{} is not a column in the DataFrame.".format(col))

            _ = ak.GroupBy([self.data[col] for col in subset])

        if keep == 'last':
            _segment_ends = ak.concatenate(
                [_.segments[1:] - 1,
                 ak.array([_.permutation.size - 1])])
            return self[_.permutation[_segment_ends]]
        else:
            return self[_.permutation[_.segments]]
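A hypothetical usage sketch, assuming the akutil DataFrame class these methods belong to (aliased as aku elsewhere in these examples):

df = aku.DataFrame({'a': ak.array([1, 1, 2, 2, 3]),
                    'b': ak.array([9, 9, 8, 7, 6])})
deduped = df.drop_duplicates(subset=['a', 'b'])   # keep='first' retains one (a, b)=(1, 9) row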
Example #8
File: index.py Project: Bears-R-Us/arkouda
    def _merge(self, other):
        self._check_types(other)

        idx = [
            aku.concatenate([ix1, ix2], ordered=False)
            for ix1, ix2 in zip(self.index, other.index)
        ]

        return MultiIndex(ak.GroupBy(idx).unique_keys)
Example #9
File: index.py Project: Bears-R-Us/arkouda
    def _merge_all(self, array):

        idx = self.index

        for other in array:
            self._check_types(other)
            idx = [
                aku.concatenate([ix1, ix2], ordered=False)
                for ix1, ix2 in zip(idx, other.index)
            ]

        return MultiIndex(ak.GroupBy(idx).unique_keys)
Example #10
def in1dmulti(a, b, assume_unique=False):
    """ The multi-level analog of ak.in1d -- test membership of rows of a in the set of rows of b.

    Parameters
    ----------
    a : list of pdarrays
        Rows are elements for which to test membership in b
    b : list of pdarrays
        Rows are elements of the set in which to test membership
    assume_unique : bool
        If True, assume rows of a and b are each unique and sorted. By default,
        sort and unique them explicitly.

    Returns
    -------
    pdarray, bool
        True for each row in a that is contained in b

    Notes
    -----
    Only works for pdarrays of int64 dtype, Strings, or Categorical
    """
    if not assume_unique:
        ag = ak.GroupBy(a)
        ua = ag.unique_keys
        bg = ak.GroupBy(b)
        ub = bg.unique_keys
    else:
        ua = a
        ub = b
    c = [ak.concatenate(x) for x in zip(ua, ub)]
    g = ak.GroupBy(c)
    k, ct = g.count()
    truth = ak.zeros(c[0].size, dtype=ak.bool)
    truth[g.permutation] = (g.broadcast(1 * (ct == 2)) == 1)
    if assume_unique:
        return truth[:a[0].size]
    else:
        truth2 = ak.zeros(a[0].size, dtype=ak.bool)
        truth2[ag.permutation] = (ag.broadcast(1 * truth[:ua[0].size]) == 1)
        return truth2
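A small usage sketch for in1dmulti, treating each pair of aligned elements as a row (values are illustrative):

# Rows of a: (1, 9), (2, 8), (3, 7); rows of b: (2, 8), (4, 6)
a = [ak.array([1, 2, 3]), ak.array([9, 8, 7])]
b = [ak.array([2, 4]), ak.array([8, 6])]
print(in1dmulti(a, b))   # expected: [False, True, False]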
Example #11
    def unique(self, x=None):
        '''
        Return sub-arrays of unique values.
        
        Parameters
        ----------
        x : pdarray
            The values in which to find per-group unique elements. By default,
            the values of this SegArray's sub-arrays.

        Returns
        -------
        SegArray
            Same number of sub-arrays as original SegArray, but elements in sub-array 
            are unique and in sorted order.
        '''
        if x is None:
            x = self.values
        keyidx = self.grouping.broadcast(ak.arange(self.size), permute=True)
        ukey, uval = ak.GroupBy([keyidx, x]).unique_keys
        g = ak.GroupBy(ukey, assume_sorted=True)
        _, lengths = g.count()
        return SegArray(g.segments, uval, grouping=g, lengths=lengths)
Example #12
def check_correctness():
    keys = ak.arange(1000) % 10
    ones = ak.ones_like(keys)
    g = ak.GroupBy(keys)
    # Make sure keys are correct
    assert (g.unique_keys == ak.arange(10)).all()
    # Check value of sums
    assert (g.sum(ones)[1] == 100).all()
    # For other ops, just run them and make sure they return the right size vector
    for op in ak.GroupBy.Reductions:
        if op in BOOLOPS:
            res = g.aggregate((ones == 1), op)[1]
        else:
            res = g.aggregate(ones, op)[1]
        assert (res.size == g.unique_keys.size)
Example #13
def zero_up(vals):
    """ Map an array of sparse values to 0-up indices.
    Parameters
    ----------
    vals : pdarray
        Array to map to dense index

    Returns
    -------
    aligned : pdarray
        Array with values replaced by 0-up indices
    """
    g = ak.GroupBy(vals)
    uniqueInds = ak.arange(g.unique_keys.size)
    idinds = g.broadcast(uniqueInds, permute=True)
    return idinds
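A short usage sketch; since GroupBy sorts its keys, the 0-up indices follow the sorted order of the distinct values:

vals = ak.array([1000, 7, 1000, 42, 7])
print(zero_up(vals))   # expected: [2, 0, 2, 1, 0]  (7 -> 0, 42 -> 1, 1000 -> 2)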
Example #14
    def select_clusters(self):
        print("Computing Selection and Stability.")
        # Perhaps keep track of a "final clusters" array, that we update as we
        # work through this function.
        self.selection_data['selected'] = ak.ones(self.selection_data.size, dtype=ak.bool)
        byparent = ak.GroupBy(self.selection_data['parent'])
        uk = byparent.unique_keys
        for p in tqdm(uk[1:]):
            children = self.selection_data['index'][self.selection_data['parent'] == p]
            c_stab = (self.selection_data['stability'][children]).sum()
            p_stab = self.selection_data['stability'][p]
            if c_stab >= p_stab:
                self.selection_data['stability'][p] = c_stab
                self.selection_data['selected'][p] = False
            else:
                self.deselect_children(node=p)
        print("Selection and Stability computation is complete!")
Example #15
File: util.py Project: Bears-R-Us/arkouda
def enrich_inplace(data, keynames, aggregations, **kwargs):
    # TO DO: validate reductions and values
    try:
        keys = data[keynames]
    except (KeyError, TypeError):
        keys = [data[k] for k in keynames]
    g = ak.GroupBy(keys, **kwargs)
    for resname, (reduction, values) in aggregations.items():
        try:
            values = data[values]
        except (KeyError, TypeError):
            pass
        if reduction == 'count':
            pergroupval = g.count()[1]
        else:
            pergroupval = g.aggregate(values, reduction)[1]
        data[resname] = g.broadcast(pergroupval, permute=True)
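A hypothetical call sketching the expected shape of the aggregations argument (a result column name mapped to a (reduction, values) pair; the column names here are illustrative):

data = {'dept': ak.randint(0, 4, 1000),
        'salary': ak.randint(30000, 200000, 1000)}
enrich_inplace(data, 'dept', {
    'dept_count': ('count', None),           # per-row size of the row's dept group
    'dept_mean_salary': ('mean', 'salary'),  # per-row mean salary of the row's dept
})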
Example #16
def time_ak_groupby(N_per_locale, trials, dtype, seed):
    print(">>> arkouda groupby")
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    for numArrays in (1, 2, 8, 16):
        arrays, totalbytes = generate_arrays(N, numArrays, dtype, seed)
        timings = []
        for i in range(trials):
            start = time.time()
            g = ak.GroupBy(arrays)
            end = time.time()
            timings.append(end - start)
        tavg = sum(timings) / trials
        print("{}-array Average time = {:.4f} sec".format(numArrays, tavg))
        bytes_per_sec = totalbytes / tavg
        print("{}-array Average rate = {:.4f} GiB/sec".format(
            numArrays, bytes_per_sec / 2**30))
Example #17
    def _convert_strings(self, s):
        '''
        Convert string field names to binary vectors.
        '''
        # Initialize to zero
        values = ak.zeros(s.size, dtype=ak.int64)
        if self.separator == '':
            # When separator is empty, field names are guaranteed to be single characters
            for name, shift in zip(self.names, self.shifts):
                # Check if name exists in each string
                bit = s.contains(name)
                values = values | ak.where(bit, 1 << shift, 0)
        else:
            # When separator is non-empty, split on it
            sf, segs = s.flatten(self.separator, return_segments=True)
            # Create a grouping to map split fields back to originating string
            orig = ak.broadcast(segs, ak.arange(segs.size), sf.size)
            g = ak.GroupBy(orig)
            for name, shift in zip(self.names, self.shifts):
                # Check if name matches one of the split fields from originating string
                bit = g.any(sf == name)[1]
                values = values | ak.where(bit, 1 << shift, 0)
        return values
Example #18
def time_ak_aggregate(N_per_locale, trials, seed):
    print(">>> arkouda aggregate")
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    keys, intvals, boolvals = generate_arrays(N, seed)
    g = ak.GroupBy(keys, assume_sorted=True)
    for op in ak.GroupBy.Reductions:
        if op in BOOLOPS:
            v = boolvals
        else:
            v = intvals
        totalbytes = v.size * v.itemsize
        timings = []
        for i in range(trials):
            start = time.time()
            res = g.aggregate(v, op)[1]
            end = time.time()
            timings.append(end - start)
        tavg = sum(timings) / trials
        print("Aggregate {} Average time = {:.4f} sec".format(op, tavg))
        bytes_per_sec = totalbytes / tavg
        print("Aggregate {} Average rate = {:.4f} GiB/sec".format(
            op, bytes_per_sec / 2**30))
Example #19
File: join.py Project: zhihuidu/arkouda
def inner_join(left, right, wherefunc=None, whereargs=None):
    '''Perform inner join on values in <left> and <right>, 
    using conditions defined by <wherefunc> evaluated on 
    <whereargs>, returning indices of left-right pairs. 

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns 
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc
        
    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition
        
    Notes
    -----
    The return values satisfy the following assertions
    
    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`
        
    '''
    from inspect import signature
    sample = min((left.size, right.size, 5))
    if wherefunc is not None:
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError(
                "wherefunc must be a function that accepts exactly two arguments"
            )
        if whereargs is None or len(whereargs) != 2:
            raise ValueError(
                "whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError(
                "Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError(
                "Right whereargs must be same size as right join values")
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e

    # Need dense 0-up right index, to filter out left not in right
    keep, (denseLeft, denseRight) = right_align(left, right)
    keep = ak.arange(keep.size)[keep]
    # GroupBy right
    byRight = ak.GroupBy(denseRight)
    # Get segment boundaries (starts, ends) of right for each left item
    rightSegs = ak.concatenate((byRight.segments, ak.array([denseRight.size])))
    starts = rightSegs[denseLeft]
    ends = rightSegs[denseLeft + 1]
    fullSize = (ends - starts).sum()
    # print(f"{left.size+right.size:,} input rows --> {fullSize:,} joins ({fullSize/(left.size+right.size):.1f} x) ")
    # gen_ranges for gather of right items
    fullSegs, ranges = gen_ranges(starts, ends)
    # Evaluate where clause
    if wherefunc is None:
        filtRanges = ranges
        filtSegs = fullSegs
        keep12 = keep
    else:
        # Gather right whereargs
        rightWhere = whereargs[1][byRight.permutation][ranges]
        # Expand left whereargs
        leftWhere = expand(whereargs[0][keep], fullSegs, ranges.size)
        # Evaluate wherefunc and filter ranges, recompute segments
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        filtRanges = ranges[whereSatisfied]
        scan = ak.cumsum(whereSatisfied) - whereSatisfied
        filtSegsWithZeros = scan[fullSegs]
        filtSegSizes = ak.concatenate(
            (filtSegsWithZeros[1:] - filtSegsWithZeros[:-1],
             ak.array([whereSatisfied.sum() - filtSegsWithZeros[-1]])))
        keep2 = (filtSegSizes > 0)
        filtSegs = filtSegsWithZeros[keep2]
        keep12 = keep[keep2]
    # Gather right inds and expand left inds
    rightInds = byRight.permutation[filtRanges]
    leftInds = expand(ak.arange(left.size)[keep12], filtSegs, filtRanges.size)
    return leftInds, rightInds
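A usage sketch for inner_join (assumes the helpers right_align, gen_ranges, and expand from the same module; the arrays are illustrative):

left = ak.randint(0, 10, 1000)
right = ak.randint(0, 10, 2000)
lpay = ak.randint(0, 100, 1000)    # per-row payloads used in the where clause
rpay = ak.randint(0, 100, 2000)
li, ri = inner_join(left, right,
                    wherefunc=lambda x, y: x < y,
                    whereargs=(lpay, rpay))
# Join keys match and the where clause holds for every returned pair
assert (left[li] == right[ri]).all()
assert (lpay[li] < rpay[ri]).all()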
Example #20
    def test_groupby(self):
        g = ak.GroupBy([self.dtvec1, self.tdvec1])
        self.assertTrue(isinstance(g.unique_keys[0], ak.Datetime))
        self.assertTrue(isinstance(g.unique_keys[1], ak.Timedelta))
        self.assertTrue(g.unique_keys[0].is_sorted())
Example #21
def run_test(levels):
    d = make_arrays()
    df = pd.DataFrame(d)
    akdf = {k:ak.array(v) for k, v in d.items()}
    if levels == 1:
        akg = ak.GroupBy(akdf['keys'])
        keyname = 'keys'
    elif levels == 2:
        akg = ak.GroupBy([akdf['keys'], akdf['keys2']])
        keyname = ['keys', 'keys2']
    tests = 0
    failures = 0
    not_impl = 0
    print(f"Doing .count()")
    tests += 1
    pdkeys, pdvals = groupby_to_arrays(df, keyname, 'int64', 'count', levels)
    # print("Pandas:")
    # print(pdkeys)
    # print(pdvals)
    akkeys, akvals = akg.count()
    # akkeys = akkeys.to_ndarray()
    akvals = akvals.to_ndarray()
    # print("Arkouda:")
    # print(akkeys)
    # print(akvals)
    # if not np.allclose(pdkeys, akkeys):
    #     print(f"Different keys")
    #     failures += 1
    failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
    # elif not np.allclose(pdvals, akvals):
    #     print(f"Different values (abs diff = {np.abs(pdvals - akvals).sum()})")
    #     failures += 1
    for vname in ('int64', 'float64', 'bool'):
        for op in ak.GroupBy.Reductions:
            print(f"\nDoing aggregate({vname}, {op})")
            tests += 1
            do_check = True
            try:
                pdkeys, pdvals = groupby_to_arrays(df, keyname, vname, op, levels)
            except Exception as E:
                print("Pandas does not implement")
                do_check = False
            try:
                akkeys, akvals = akg.aggregate(akdf[vname], op)
                akvals = akvals.to_ndarray()
            except RuntimeError as E:
                print("Arkouda error: ", E)
                not_impl += 1
                do_check = False
                continue
            if not do_check:
                continue
            if op.startswith('arg'):
                pdextrema = df[vname][pdvals]
                akextrema = akdf[vname][ak.array(akvals)].to_ndarray()
                if not np.allclose(pdextrema, akextrema):
                    print(f"Different argmin/argmax: Arkouda failed to find an extremum")
                    print("pd: ", pdextrema)
                    print("ak: ", akextrema)
                    failures += 1
            else:
                # if not np.allclose(pdkeys, akkeys):
                #     print(f"Different keys")
                #     failures += 1
                failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
                # elif not np.allclose(pdvals, akvals):
                #     print(f"Different values (abs diff = {np.where(np.isfinite(pdvals) & np.isfinite(akvals), np.abs(pdvals - akvals), 0).sum()})")
                #     failures += 1
    print(f"\n{failures} failures in {tests} tests ({not_impl} not implemented)")
Example #22
                              args.prob,
                              perm=args.perm)

    print("ii = ", (ii.size, ii))
    print("ii(min,max) = ", (ii.min(), ii.max()))
    print("jj = ", (jj.size, jj))
    print("jj(min,max) = ", (jj.min(), jj.max()))

    nda_ii = ii.to_ndarray()  # convert to ndarray for plotting
    nda_jj = jj.to_ndarray()  # convert to ndarray for plotting
    plt.scatter(nda_ii, nda_jj)
    plt.show()

    df = {"ii": ii, "jj": jj}

    grps = ak.GroupBy(ii)
    ukeys, cts = grps.count()
    print("counts", (cts.min(), cts.max()))
    nBins = ak.max(cts)
    nda_cts = cts.to_ndarray()  # convert to ndarray for plotting
    plt.hist(nda_cts, bins=nBins)
    plt.yscale('log')
    plt.show()

    ukeys, nu = grps.nunique(jj)
    print("nunique", (nu.min(), nu.max()))
    nBins = nu.max()
    nda_nu = nu.to_ndarray()  # convert to ndarray for plotting
    plt.hist(nda_nu, bins=nBins)
    plt.yscale('log')
    plt.show()
Example #23
def in1dmulti(a, b, assume_unique=False, symmetric=False):
    """ The multi-level analog of ak.in1d -- test membership of rows of a in the set of rows of b.

    Parameters
    ----------
    a : list of pdarrays
        Rows are elements for which to test membership in b
    b : list of pdarrays
        Rows are elements of the set in which to test membership
    assume_unique : bool
        If True, assume rows of a and b are each unique and sorted. By default,
        sort and unique them explicitly.
    symmetric : bool
        If True, also return the symmetric result: membership of rows of b in a.

    Returns
    -------
    pdarray, bool
        True for each row in a that is contained in b. If symmetric=True, a
        second pdarray is also returned, True for each row of b contained in a.

    Notes
    -----
    Only works for pdarrays of int64 dtype, Strings, or Categorical
    """
    if isinstance(a, (ak.pdarray, ak.Strings, ak.Categorical)):
        if type(a) != type(b):
            raise TypeError("Arguments must have same type")
        if symmetric:
            return ak.in1d(a, b), ak.in1d(b, a)
        else:
            return ak.in1d(a, b)
    atypes = np.array([ai.dtype for ai in a])
    btypes = np.array([bi.dtype for bi in b])
    if not (atypes == btypes).all():
        raise TypeError("Array dtypes of arguments must match")
    if not assume_unique:
        ag = ak.GroupBy(a)
        ua = ag.unique_keys
        bg = ak.GroupBy(b)
        ub = bg.unique_keys
    else:
        ua = a
        ub = b
    # Key for deinterleaving result
    isa = ak.concatenate(
        (ak.ones(ua[0].size, dtype=ak.bool), ak.zeros(ub[0].size,
                                                      dtype=ak.bool)),
        ordered=False)
    c = [ak.concatenate(x, ordered=False) for x in zip(ua, ub)]
    g = ak.GroupBy(c)
    k, ct = g.count()
    if assume_unique:
        # need to verify uniqueness, otherwise answer will be wrong
        if (g.sum(isa)[1] > 1).any():
            raise NonUniqueError(
                "Called with assume_unique=True, but first argument is not unique"
            )
        if (g.sum(~isa)[1] > 1).any():
            raise NonUniqueError(
                "Called with assume_unique=True, but second argument is not unique"
            )
    # Where value appears twice, it is present in both a and b
    # truth = answer in c domain
    truth = g.broadcast(ct == 2, permute=True)
    if assume_unique:
        # Deinterleave truth into a and b domains
        if symmetric:
            return truth[isa], truth[~isa]
        else:
            return truth[isa]
    else:
        # If didn't start unique, first need to deinterleave into ua domain,
        # then broadcast to a domain
        atruth = ag.broadcast(truth[isa], permute=True)
        if symmetric:
            btruth = bg.broadcast(truth[~isa], permute=True)
            return atruth, btruth
        else:
            return atruth
Example #24
    def add(self, b):
        index = self.index.concat(b.index).index
        values = ak.concatenate([self.values, b.values], ordered=False)
        return Series(ak.GroupBy(index).sum(values))
Example #25
    def __init__(self,
                 segments,
                 values,
                 copy=False,
                 lengths=None,
                 grouping=None):
        """
        An array of variable-length arrays, also called a skyline array or ragged array.

        Parameters
        ----------
        segments : pdarray, int64
            Start index of each sub-array in the flattened values array
        values : pdarray
            The flattened values of all sub-arrays
        copy : bool
            If True, make a copy of the input arrays; otherwise, just store a reference.

        Returns
        -------
        SegArray
            Data structure representing an array whose elements are variable-length arrays.

        Notes
        -----
        Keyword args 'lengths' and 'grouping' are not user-facing. They are used by the
        attach method.
        """
        if not isinstance(segments, ak.pdarray) or segments.dtype != ak.int64:
            raise TypeError("Segments must be int64 pdarray")
        if not ak.is_sorted(segments) or (ak.unique(segments).size !=
                                          segments.size):
            raise ValueError("Segments must be unique and in sorted order")
        if segments.size > 0:
            if segments.min() != 0 or segments.max() >= values.size:
                raise ValueError(
                    "Segments must start at zero and be less than values.size")
        elif values.size > 0:
            raise ValueError(
                "Cannot have non-empty values with empty segments")
        if copy:
            self.segments = segments[:]
            self.values = values[:]
        else:
            self.segments = segments
            self.values = values
        self.size = segments.size
        self.valsize = values.size
        if lengths is None:
            self.lengths = self._get_lengths()
        else:
            self.lengths = lengths
        self.dtype = values.dtype
        if grouping is None:
            if self.size == 0:
                self.grouping = ak.GroupBy(ak.zeros(0, dtype=ak.int64))
            else:
                # Treat each sub-array as a group, for grouped aggregations
                self.grouping = ak.GroupBy(
                    ak.broadcast(self.segments, ak.arange(self.size),
                                 self.valsize))
        else:
            self.grouping = grouping
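A small construction sketch for SegArray (illustrative values):

segs = ak.array([0, 3, 5])                 # sub-arrays start at offsets 0, 3, 5
vals = ak.array([1, 1, 2, 3, 3, 3, 4])     # flattened values: [1,1,2] [3,3] [3,4]
sa = SegArray(segs, vals)
print(sa.size, sa.valsize)                 # expected: 3 7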
Example #26
def intersect(a, b, positions=True, unique=False):
    """
    Find the intersection of two arkouda arrays.

    This function can be especially useful when `positions=True` so
    that the caller gets the indices of values present in both arrays.

    Parameters
    ----------
    a : ak.Strings or ak.pdarray
        An array of values (integers or strings)

    b : ak.Strings or ak.pdarray
        An array of values (integers or strings)

    positions : bool (default=True)
        Return tuple of boolean pdarrays that indicate positions in a and b
        where the values are in the intersection.

    unique : bool (default=False)
        If the number of distinct values in `a` (and `b`) is equal to the size of
        `a` (and `b`), there is a more efficient method to compute the intersection.

    Returns
    -------
    (ak.pdarray, ak.pdarray)
        When positions=True, boolean masks over `a` and `b` marking the elements
        that occur at least once in both arrays; when positions=False, the
        intersecting values themselves.

    # To ensure compatibility with all types of arrays:
    if (isinstance(a, ak.pdarray) and isinstance(b, ak.pdarray)):
        intx = ak.intersect1d(a, b)
        if not positions:
            return intx
        else:
            maska = ak.in1d(a, intx)
            maskb = ak.in1d(b, intx)
            return (maska, maskb)

    # It takes more effort to do this with ak.Strings arrays.
    elif (isinstance(a, ak.Strings) and isinstance(b, ak.Strings)):

        # Hash the two arrays first
        hash_a00, hash_a01 = a.hash()
        hash_b00, hash_b01 = b.hash()

        # a and b do not have duplicate entries, so the hashes are distinct
        if unique:
            hash0 = ak.concatenate([hash_a00, hash_b00])
            hash1 = ak.concatenate([hash_a01, hash_b01])

            # Group by the unique hashes
            gb = ak.GroupBy([hash0, hash1])
            val, cnt = gb.count()

            # Hash counts, in groupby order
            counts = gb.broadcast(cnt, permute=False)

            # Same, in original order
            tmp = counts[:]
            counts[gb.permutation] = tmp
            del tmp

            # Masks
            maska = (counts > 1)[:a.size]
            maskb = (counts > 1)[a.size:]

            # The intersection for each array of hash values
            if positions:
                return (maska, maskb)
            else:
                return a[maska]

        # a and b may have duplicate entries, so get the unique hash values
        else:
            gba = ak.GroupBy([hash_a00, hash_a01])
            gbb = ak.GroupBy([hash_b00, hash_b01])

            # Take the unique keys as the hash we'll work with
            a0, a1 = gba.unique_keys
            b0, b1 = gbb.unique_keys
            hash0 = ak.concatenate([a0, b0])
            hash1 = ak.concatenate([a1, b1])

            # Group by the unique hashes
            gb = ak.GroupBy([hash0, hash1])
            val, cnt = gb.count()

            # Hash counts, in groupby order
            counts = gb.broadcast(cnt, permute=False)

            # Restore the original order
            tmp = counts[:]
            counts[gb.permutation] = tmp
            del tmp

            # Broadcast back up one more level
            countsa = counts[:a0.size]
            countsb = counts[a0.size:]
            counts2a = gba.broadcast(countsa, permute=False)
            counts2b = gbb.broadcast(countsb, permute=False)

            # Restore the original orders
            tmp = counts2a[:]
            counts2a[gba.permutation] = tmp
            del tmp
            tmp = counts2b[:]
            counts2b[gbb.permutation] = tmp
            del tmp

            # Masks
            maska = (counts2a > 1)
            maskb = (counts2b > 1)

            # The intersection for each array of hash values
            if positions:
                return (maska, maskb)
            else:
                return a[maska]
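A usage sketch for intersect with Strings inputs (the random strings are illustrative; assumes a connected client):

a = ak.random_strings_uniform(1, 3, 1000)
b = ak.random_strings_uniform(1, 3, 1000)
maska, maskb = intersect(a, b, positions=True)
common_in_a = a[maska]     # elements of a that also occur somewhere in b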
Example #27
File: join.py Project: Bears-R-Us/arkouda
def inner_join2(left, right, wherefunc=None, whereargs=None, forceDense=False):
    '''Perform inner join on values in <left> and <right>, 
    using conditions defined by <wherefunc> evaluated on 
    <whereargs>, returning indices of left-right pairs. 

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns 
        a pdarray(bool) used to filter the join. Results for
        which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc
        
    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition
        
    Notes
    -----
    The return values satisfy the following assertions
    
    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`
        
    '''
    if not isinstance(left, ak.pdarray) or left.dtype != ak.int64 or not isinstance(right, ak.pdarray) or right.dtype != ak.int64:
        raise ValueError("left and right must be pdarray(int64)")
    if wherefunc is not None:
        from inspect import signature
        sample = min((left.size, right.size, 5))
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Only join on intersection
    inter = ak.intersect1d(left, right)
    # Indices of left values present in intersection
    leftInds = ak.arange(left.size)[ak.in1d(left, inter)]
    # Left vals in intersection
    leftFilt = left[leftInds]
    # Indices of right vals present in inter
    rightInds = ak.arange(right.size)[ak.in1d(right, inter)]
    # Right vals in inter
    rightFilt = right[rightInds]
    byLeft = ak.GroupBy(leftFilt)
    byRight = ak.GroupBy(rightFilt)
    maxVal = inter.max()
    if forceDense or maxVal > 3*(left.size + right.size):
        # Remap intersection to dense, 0-up codes
        # Replace left values with dense codes
        uniqLeftVals = byLeft.unique_keys
        uniqLeftCodes = ak.arange(inter.size)[ak.in1d(inter, uniqLeftVals)]
        leftCodes = ak.zeros_like(leftFilt) - 1
        leftCodes[byLeft.permutation] = byLeft.broadcast(uniqLeftCodes, permute=False)
        # Replace right values with dense codes
        uniqRightVals = byRight.unique_keys
        uniqRightCodes = ak.arange(inter.size)[ak.in1d(inter, uniqRightVals)]
        rightCodes = ak.zeros_like(rightFilt) - 1
        rightCodes[byRight.permutation] = byRight.broadcast(uniqRightCodes, permute=False)
        countSize = inter.size
    else:
        uniqLeftCodes = byLeft.unique_keys
        uniqRightCodes = byRight.unique_keys
        leftCodes = leftFilt
        rightCodes = rightFilt
        countSize = maxVal + 1
    # Expand indices to product domain
    # First count occurrences of each code in left and right
    leftCounts = ak.zeros(countSize, dtype=ak.int64)
    leftCounts[uniqLeftCodes] = byLeft.count()[1]
    rightCounts = ak.zeros(countSize, dtype=ak.int64)
    rightCounts[uniqRightCodes] = byRight.count()[1]
    # Repeat each left index as many times as that code occurs in right
    prodLeft = rightCounts[leftCodes]
    leftFullInds = ak.broadcast(ak.cumsum(prodLeft)-prodLeft, leftInds, prodLeft.sum())
    prodRight = leftCounts[rightCodes]
    rightFullInds = ak.broadcast(ak.cumsum(prodRight)-prodRight, rightInds, prodRight.sum())
    # Evaluate where clause
    if wherefunc is None:
        return leftFullInds, rightFullInds
    else:
        # Gather whereargs
        leftWhere = whereargs[0][leftFullInds]
        rightWhere = whereargs[1][rightFullInds]
        # Evaluate wherefunc and filter ranges, recompute segments
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        return leftFullInds[whereSatisfied], rightFullInds[whereSatisfied]
Example #28
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 7:
        print(
            f"Usage: {sys.argv[0]} <server> <port> <strategy (0=global, 1=perLocale)> <length> <num_keys> <num_vals>"
        )
        sys.exit()
    per_locale = (sys.argv[3] == '1')
    print("per_locale = ", per_locale)
    length = int(sys.argv[4])
    print("length     = ", length)
    nkeys = int(sys.argv[5])
    print("nkeys      = ", nkeys)
    nvals = int(sys.argv[6])
    print("nvals      = ", nvals)
    ak.connect(sys.argv[1], int(sys.argv[2]))
    print("Generating keys and vals...")
    start = time()
    keys, vals = generate_arrays(length, nkeys, nvals)
    print(f"{time() - start:.2f} seconds", end="\n\n")
    print("GroupBy...")
    start = time()
    g = ak.GroupBy(keys, per_locale)
    print(f"{time() - start:.2f} seconds", end="\n\n")
    for op in OPERATORS:
        print(f"Aggregate('{op}') ...")
        start = time()
        uk, rv = g.aggregate(vals, op)
        print(f"{time() - start:.2f} seconds", end="\n\n")
    sys.exit()
Example #29
    def cluster(self, min_cluster_size=5):
        cluster_data = {}
        last_level_delta = self.level_data[0].delta

        # Initial setup; all levels are the same size
        num_nodes = self.level_data[0].size

        # This dataframe holds extraction data
        selection_data = aku.DataFrame({
                'stability': ak.zeros(1, dtype=ak.float64),
                'parent': ak.zeros(1, dtype=ak.int64),
            })

        # Create an initial cluster dataframe
        labels = ak.arange(num_nodes)
        sizes = ak.ones(num_nodes, dtype=ak.int64)
        stability = ak.zeros(num_nodes, dtype=ak.float64)
        selected = ak.zeros(num_nodes, dtype=ak.bool)

        df = aku.DataFrame({
            'cc':self.level_data[0].cc,
            'labels':labels,
            'sizes':sizes,
            'stability':stability,
        })
        # The result should have all the same keys as the deltas
        cluster_data[self.level_data[0].delta] = df

        # We don't start with the level 0, it gets passed through as is.
        for level in tqdm(self.level_data[1:]):
            bylevel = ak.GroupBy(level.cc)
            perm = bylevel.permutation
            # Save for later analysis
            old_labels = labels[:]
            # Count number of nodes in each group
            _,c = bylevel.count()
            # Find the most negative (largest-magnitude) label value in each group
            _, max_group_labels = bylevel.aggregate(labels, 'min')
            # Find maximum of existing cluster sizes from last iteration.
            _, max_group_size = bylevel.aggregate(sizes, 'max')
            # Find the maximum stability in each group
            _, max_group_stability = bylevel.aggregate(stability, 'max')
            # Find the number of sub-clusters in each group for purposes of creating new cluster labels
            clusters_and_zeros = ak.where(labels < 0, labels, 0)
            _, num_unique_labels = bylevel.aggregate(clusters_and_zeros, 'nunique')
            _, min_group_label = bylevel.aggregate(labels, 'max')
            num_sub_clusters = num_unique_labels - ak.where(min_group_label >= 0, 1, 0)

            # Update sizes
            count_bc = bylevel.broadcast(c, permute=False)
            sizes = ak.zeros(num_nodes, dtype=ak.int64)
            sizes[perm] = count_bc

            # Update labels to max (negative) in group
            labels_bc = bylevel.broadcast(max_group_labels, permute=False)
            labels = ak.zeros(num_nodes, dtype=ak.int64)
            labels[perm] = labels_bc

            # Update stability
            stability_bc = bylevel.broadcast(max_group_stability, permute=False)
            stability = ak.zeros(num_nodes, dtype=ak.float64)
            stability[perm] = stability_bc

            # Create and update labels as needed, baseline size is 1
            # Only need to test if there are at least two cluster labels in a group.
            new_clusters_join = (num_sub_clusters > 1)
            new_clusters_form = ((c >= min_cluster_size) & (max_group_labels >= 0))
            condition = (new_clusters_join | new_clusters_form)
            num_new_labels = int(condition.sum())

            new_labels_positioned = ak.zeros(c.size, dtype=np.int64)
            if num_new_labels > 0:
                # Set up selection_data 
                mn = abs(int(labels.min()))
                new_label_values = ak.arange(mn+1, mn+num_new_labels+1, 1) * (-1)
                new_labels_positioned = ak.zeros(c.size, dtype=np.int64)
                new_labels_positioned[condition] = new_label_values

                # Update selection_data
                update_df = aku.DataFrame({
                    'parent': ak.zeros(num_new_labels, dtype=ak.int64),
                    'stability': ak.zeros(num_new_labels, dtype=ak.float64),
                })
                selection_data.append(update_df)

                # Update the labels
                labels_bc = bylevel.broadcast(new_labels_positioned, permute=False)
                new_labels = ak.zeros(num_nodes, dtype=ak.int64)
                new_labels[perm] = labels_bc
                tmp = ak.where(new_labels < 0, new_labels, labels)
                labels = tmp

                # When clusters become absorbed into new clusters, add their parent labels and update stability
                mask = ((labels < 0) & (old_labels < 0) & (labels < old_labels))
                if mask.sum() > 0:
                    t1 = old_labels[mask]
                    t2 = labels[mask]
                    t3 = stability[mask]
                    bychangedlabels = ak.GroupBy([t1, t2])
                    [old,new] = bychangedlabels.unique_keys
                    # I don't remember the purpose of this line, but it's never used.
                    #stabby = t3[aku.invert_permutation(bychangedlabels.permutation)][bychangedlabels.segments]
                    selection_data['parent'][-1 * old] = -1 * new

            # Set new cluster stability to 0
            new_label_bc = bylevel.broadcast(new_labels_positioned, permute=False)
            tmp = ak.zeros(labels.size, dtype=np.int64)
            tmp[perm] = new_label_bc
            stability[tmp < 0] = 0

            # Update stability
            added_stability = sizes / (level.delta - last_level_delta)
            last_level_delta = level.delta
            tmp = ak.where(sizes >= min_cluster_size, stability + added_stability, stability)
            stability = tmp

            # Save this information after processing
            df = aku.DataFrame({
                'cc':level.cc,
                'labels':labels,
                'sizes':sizes,
                'stability':stability,
            })
            cluster_data[level.delta] = df

            # Update cluster selection information
            bylabel = ak.GroupBy(labels)
            keys = labels[bylabel.permutation][bylabel.segments]
            stab = stability[bylabel.permutation][bylabel.segments]
            indx = (keys[keys < 0])*(-1)
            vals = stab[keys < 0]
            selection_data['stability'][indx] = vals

        # Set up data for next steps
        self.cluster_data = cluster_data
        self.selection_data = selection_data

        # Select and extract
        self.select_clusters()
        self.extract_clusters()

        print("Clustering is complete!")

        return self.extracted_clusters
Example #30
    # unique
    akuniq = ak.unique(strings)
    catuniq = ak.unique(cat)
    akset = set(akuniq.to_ndarray())
    catset = set(catuniq.to_ndarray())
    assert (akset == catset)
    # There should be no duplicates
    assert (akuniq.size == len(akset))
    npset = set(np.unique(test_strings))
    # When converted to a set, should agree with numpy
    assert (akset == npset)
    print("unique passed")

    # groupby
    g = ak.GroupBy(strings)
    gc = ak.GroupBy(cat)
    # Unique keys should be same result as ak.unique
    assert (akset == set(g.unique_keys.to_ndarray()))
    assert (akset == set(gc.unique_keys.to_ndarray()))
    assert ((gc.permutation == g.permutation).all())
    permStrings = strings[g.permutation]
    # Check each group individually
    lengths = np.diff(np.hstack((g.segments.to_ndarray(), np.array([g.size]))))
    for uk, s, l in zip(g.unique_keys, g.segments, lengths):
        # All values in group should equal key
        assert ((permStrings[s:s + l] == uk).all())
        # Key should not appear anywhere outside of group
        assert (not (permStrings[:s] == uk).any())
        assert (not (permStrings[s + l:] == uk).any())
    print("groupby passed")