def check_correctness(dtype, seed):
    arrays, totalbytes = generate_arrays(1000, 2, dtype, seed)
    g = ak.GroupBy(arrays)
    perm = ak.argsort(ak.randint(0, 2**32, arrays[0].size))
    g2 = ak.GroupBy([a[perm] for a in arrays])
    assert all((uk == uk2).all() for uk, uk2 in zip(g.unique_keys, g2.unique_keys))
    assert (g.segments == g2.segments).all()
def compare_strategies(length, ncat, op, dtype):
    keys = ak.randint(0, ncat, length)
    if dtype == 'int64':
        vals = ak.randint(0, length // ncat, length)
    elif dtype == 'bool':
        vals = ak.zeros(length, dtype='bool')
        for i in np.random.randint(0, length, ncat // 2):
            vals[i] = True
    else:
        vals = ak.linspace(-1, 1, length)
    print("Global groupby", end=' ')
    start = time()
    gg = ak.GroupBy(keys, False)
    ggtime = time() - start
    print(ggtime)
    print("Global reduce", end=' ')
    start = time()
    gk, gv = gg.aggregate(vals, op)
    grtime = time() - start
    print(grtime)
    print("Local groupby", end=' ')
    start = time()
    lg = ak.GroupBy(keys, True)
    lgtime = time() - start
    print(lgtime)
    print("Local reduce", end=' ')
    start = time()
    lk, lv = lg.aggregate(vals, op)
    lrtime = time() - start
    print(lrtime)
    print(f"Keys match? {(gk == lk).all()}")
    print(f"Absolute diff of vals = {ak.abs(gv - lv).sum()}")
    return ggtime, grtime, lgtime, lrtime
def most_common(g, values):
    '''Find the most common value for each key in a GroupBy object.

    Parameters
    ----------
    g : ak.GroupBy
        Grouping of keys
    values : array-like
        Values in which to find the most common element of each group

    Returns
    -------
    most_common_values : array-like
        The most common value for each key
    '''
    # Give each key an integer index
    keyidx = g.broadcast(ak.arange(g.unique_keys[0].size), permute=True)
    # Attach values and group by (key, val)
    bykeyval = ak.GroupBy([keyidx, values])
    # Count number of records for each (key, val)
    (ki, uval), count = bykeyval.count()
    # Group by key again, ignoring the value
    bykey = ak.GroupBy(ki, assume_sorted=True)
    # Find the index of the most frequent value for each key
    _, topidx = bykey.argmax(count)
    # Gather the most frequent values
    return uval[topidx]
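# Illustrative usage sketch for most_common (not part of the original source). It
# assumes a running arkouda server reachable via ak.connect(), and uses a multi-key
# GroupBy since most_common indexes g.unique_keys[0]. The names dept/site/vals and
# the literal values are hypothetical.
import arkouda as ak

ak.connect()
dept = ak.array([0, 0, 0, 1, 1, 1])
site = ak.array([7, 7, 7, 7, 7, 7])
vals = ak.array([5, 5, 7, 3, 3, 4])
g = ak.GroupBy([dept, site])          # unique_keys is a list of arrays
print(most_common(g, vals))           # expected: [5 3], the most frequent value per group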
def run_test(levels, verbose=False):
    d = make_arrays()
    df = pd.DataFrame(d)
    akdf = {k: ak.array(v) for k, v in d.items()}
    if levels == 1:
        akg = ak.GroupBy(akdf['keys'])
        keyname = 'keys'
    elif levels == 2:
        akg = ak.GroupBy([akdf['keys'], akdf['keys2']])
        keyname = ['keys', 'keys2']
    tests = 0
    failures = 0
    not_impl = 0
    if verbose:
        print("Doing .count()")
    tests += 1
    pdkeys, pdvals = groupby_to_arrays(df, keyname, 'int64', 'count', levels)
    akkeys, akvals = akg.count()
    akvals = akvals.to_ndarray()
    failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
    for vname in ('int64', 'float64', 'bool'):
        for op in ak.GroupBy.Reductions:
            if verbose:
                print(f"\nDoing aggregate({vname}, {op})")
            tests += 1
            do_check = True
            try:
                pdkeys, pdvals = groupby_to_arrays(df, keyname, vname, op, levels)
            except Exception as E:
                if verbose:
                    print("Pandas does not implement")
                do_check = False
            try:
                akkeys, akvals = akg.aggregate(akdf[vname], op)
                akvals = akvals.to_ndarray()
            except RuntimeError as E:
                if verbose:
                    print("Arkouda error: ", E)
                not_impl += 1
                do_check = False
                continue
            if not do_check:
                continue
            if op.startswith('arg'):
                pdextrema = df[vname][pdvals]
                akextrema = akdf[vname][ak.array(akvals)].to_ndarray()
                if not np.allclose(pdextrema, akextrema):
                    print("Different argmin/argmax: Arkouda failed to find an extremum")
                    print("pd: ", pdextrema)
                    print("ak: ", akextrema)
                    failures += 1
            else:
                failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
    print(f"{tests - failures - not_impl} / {tests - not_impl} passed, {failures} errors, {not_impl} not implemented")
    return failures
def compute_join_size(a, b):
    '''Compute the internal size of a hypothetical join between a and b.
    Returns both the number of elements and number of bytes required for the join.
    '''
    bya = ak.GroupBy(a)
    ua, asize = bya.count()
    byb = ak.GroupBy(b)
    ub, bsize = byb.count()
    afact = asize[ak.in1d(ua, ub)]
    bfact = bsize[ak.in1d(ub, ua)]
    nelem = (afact * bfact).sum()
    nbytes = 3 * 8 * nelem
    return nelem, nbytes
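# Quick worked example of compute_join_size (a sketch, assuming a running arkouda
# server). Values 1 and 2 occur in both arrays, contributing 2*1 + 1*2 = 4 joined rows.
import arkouda as ak

ak.connect()
a = ak.array([1, 1, 2, 3])
b = ak.array([1, 2, 2, 4])
nelem, nbytes = compute_join_size(a, b)
print(nelem, nbytes)   # expected: 4 96 (3 int64 columns * 8 bytes * 4 rows)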
def GroupBy(self, keys, use_series=False):
    """
    Group the dataframe by a column or a list of columns.

    Parameters
    ----------
    keys : string or list
        An (ordered) list of column names or a single string to group by.
    use_series : bool
        If True, returns an akutil.GroupBy object. Otherwise an arkouda GroupBy object.

    Returns
    -------
    GroupBy
        Either an akutil GroupBy or an arkouda GroupBy object.

    See Also
    --------
    arkouda.GroupBy
    """
    self.update_size()

    if isinstance(keys, str):
        cols = self.data[keys]
    elif not isinstance(keys, list):
        raise TypeError("keys must be a column name or a list of column names")
    elif len(keys) == 1:
        cols = self.data[keys[0]]
    else:
        cols = [self.data[col] for col in keys]
    gb = ak.GroupBy(cols)
    if use_series:
        gb = GroupBy(gb, self)
    return gb
def drop_duplicates(self, subset=None, keep='first'):
    """
    Drops duplicated rows and returns the resulting DataFrame.

    If a subset of the columns is provided, then only one instance of each
    duplicated row will be returned (keep determines which row).

    Parameters
    ----------
    subset : Iterable
        Iterable of column names to use to dedupe.
    keep : {'first', 'last'}, default 'first'
        Determines which duplicates (if any) to keep.

    Returns
    -------
    DataFrame
        DataFrame with duplicates removed.
    """
    if self._empty:
        return self

    if not subset:
        subset = self._columns[1:]

    if len(subset) == 1:
        if subset[0] not in self.data:
            raise KeyError("{} is not a column in the DataFrame.".format(subset[0]))
        _ = ak.GroupBy(self.data[subset[0]])

    else:
        for col in subset:
            if col not in self.data:
                raise KeyError("{} is not a column in the DataFrame.".format(col))
        _ = ak.GroupBy([self.data[col] for col in subset])

    if keep == 'last':
        _segment_ends = ak.concatenate([_.segments[1:] - 1, ak.array([_.permutation.size - 1])])
        return self[_.permutation[_segment_ends]]
    else:
        return self[_.permutation[_.segments]]
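# Hypothetical usage of drop_duplicates (assumes akutil's DataFrame class, imported
# here as aku, and a running arkouda server). The first two rows are identical, so
# only one copy of that row is kept.
import arkouda as ak
import akutil as aku

ak.connect()
df = aku.DataFrame({'a': ak.array([1, 1, 2]), 'b': ak.array([5, 5, 6])})
deduped = df.drop_duplicates()
print(deduped)   # expected: two rows, (1, 5) and (2, 6)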
def _merge(self, other):
    self._check_types(other)
    idx = [
        aku.concatenate([ix1, ix2], ordered=False)
        for ix1, ix2 in zip(self.index, other.index)
    ]
    return MultiIndex(ak.GroupBy(idx).unique_keys)
def _merge_all(self, array):
    idx = self.index
    for other in array:
        self._check_types(other)
        idx = [
            aku.concatenate([ix1, ix2], ordered=False)
            for ix1, ix2 in zip(idx, other.index)
        ]
    return MultiIndex(ak.GroupBy(idx).unique_keys)
def in1dmulti(a, b, assume_unique=False):
    """
    The multi-level analog of ak.in1d -- test membership of rows of a in the set of rows of b.

    Parameters
    ----------
    a : list of pdarrays
        Rows are elements for which to test membership in b
    b : list of pdarrays
        Rows are elements of the set in which to test membership
    assume_unique : bool
        If true, assume rows of a and b are each unique and sorted. By default,
        sort and unique them explicitly.

    Returns
    -------
    pdarray, bool
        True for each row in a that is contained in b

    Notes
    -----
    Only works for pdarrays of int64 dtype, Strings, or Categorical
    """
    if not assume_unique:
        ag = ak.GroupBy(a)
        ua = ag.unique_keys
        bg = ak.GroupBy(b)
        ub = bg.unique_keys
    else:
        ua = a
        ub = b
    c = [ak.concatenate(x) for x in zip(ua, ub)]
    g = ak.GroupBy(c)
    k, ct = g.count()
    truth = ak.zeros(c[0].size, dtype=ak.bool)
    truth[g.permutation] = (g.broadcast(1 * (ct == 2)) == 1)
    if assume_unique:
        return truth[:a[0].size]
    else:
        truth2 = ak.zeros(a[0].size, dtype=ak.bool)
        truth2[ag.permutation] = (ag.broadcast(1 * truth[:ua[0].size]) == 1)
        return truth2
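# Small sketch of in1dmulti on two-column int64 keys (assumes a running arkouda
# server; values are hypothetical). Rows of a are (1,10), (2,20), (3,30); only
# (3,30) also appears among the rows of b.
import arkouda as ak

ak.connect()
a = [ak.array([1, 2, 3]), ak.array([10, 20, 30])]
b = [ak.array([3, 1]), ak.array([30, 99])]
print(in1dmulti(a, b))   # expected: [False False True]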
def unique(self, x=None):
    '''
    Return sub-arrays of unique values.

    Parameters
    ----------
    x : pdarray
        The values to unique, per group. By default, the values of this
        SegArray's sub-arrays.

    Returns
    -------
    SegArray
        Same number of sub-arrays as original SegArray, but elements in each
        sub-array are unique and in sorted order.
    '''
    if x is None:
        x = self.values
    keyidx = self.grouping.broadcast(ak.arange(self.size), permute=True)
    ukey, uval = ak.GroupBy([keyidx, x]).unique_keys
    g = ak.GroupBy(ukey, assume_sorted=True)
    _, lengths = g.count()
    return SegArray(g.segments, uval, grouping=g, lengths=lengths)
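# Illustrative use of SegArray.unique (a sketch, assuming a running arkouda server
# and the SegArray constructor shown later in this section): sub-arrays [3, 1, 3]
# and [2, 2] reduce to sorted unique sub-arrays [1, 3] and [2].
import arkouda as ak

ak.connect()
sa = SegArray(ak.array([0, 3]), ak.array([3, 1, 3, 2, 2]))
u = sa.unique()
print(u.lengths)   # expected: [2 1]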
def check_correctness():
    keys = ak.arange(1000) % 10
    ones = ak.ones_like(keys)
    g = ak.GroupBy(keys)
    # Make sure keys are correct
    assert (g.unique_keys == ak.arange(10)).all()
    # Check value of sums
    assert (g.sum(ones)[1] == 100).all()
    # For other ops, just run them and make sure they return the right size vector
    for op in ak.GroupBy.Reductions:
        if op in BOOLOPS:
            res = g.aggregate((ones == 1), op)[1]
        else:
            res = g.aggregate(ones, op)[1]
        assert (res.size == g.unique_keys.size)
def zero_up(vals):
    """
    Map an array of sparse values to 0-up indices.

    Parameters
    ----------
    vals : pdarray
        Array to map to dense index

    Returns
    -------
    aligned : pdarray
        Array with values replaced by 0-up indices
    """
    g = ak.GroupBy(vals)
    uniqueInds = ak.arange(g.unique_keys.size)
    idinds = g.broadcast(uniqueInds, permute=True)
    return idinds
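# Tiny example of zero_up (assumes a running arkouda server): sparse values are
# replaced by dense 0-up codes, preserving which positions share a value.
import arkouda as ak

ak.connect()
vals = ak.array([100, 7, 100, 42])
print(zero_up(vals))   # expected: [2 0 2 1]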
def select_clusters(self):
    print("Computing Selection and Stability.")
    # Perhaps keep track of a "final clusters" array, that we update as we
    # work through this function.
    self.selection_data['selected'] = ak.ones(self.selection_data.size, dtype=ak.bool)
    byparent = ak.GroupBy(self.selection_data['parent'])
    uk = byparent.unique_keys
    for p in tqdm(uk[1:]):
        children = self.selection_data['index'][self.selection_data['parent'] == p]
        c_stab = (self.selection_data['stability'][children]).sum()
        p_stab = self.selection_data['stability'][p]
        if c_stab >= p_stab:
            self.selection_data['stability'][p] = c_stab
            self.selection_data['selected'][p] = False
        else:
            self.deselect_children(node=p)
    print("Selection and Stability computation is complete!")
def enrich_inplace(data, keynames, aggregations, **kwargs):
    # TO DO: validate reductions and values
    try:
        keys = data[keynames]
    except (KeyError, TypeError):
        keys = [data[k] for k in keynames]
    g = ak.GroupBy(keys, **kwargs)
    for resname, (reduction, values) in aggregations.items():
        try:
            values = data[values]
        except (KeyError, TypeError):
            pass
        if reduction == 'count':
            pergroupval = g.count()[1]
        else:
            pergroupval = g.aggregate(values, reduction)[1]
        data[resname] = g.broadcast(pergroupval, permute=True)
def time_ak_groupby(N_per_locale, trials, dtype, seed):
    print(">>> arkouda groupby")
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    for numArrays in (1, 2, 8, 16):
        arrays, totalbytes = generate_arrays(N, numArrays, dtype, seed)
        timings = []
        for i in range(trials):
            start = time.time()
            g = ak.GroupBy(arrays)
            end = time.time()
            timings.append(end - start)
        tavg = sum(timings) / trials
        print("{}-array Average time = {:.4f} sec".format(numArrays, tavg))
        bytes_per_sec = totalbytes / tavg
        print("{}-array Average rate = {:.4f} GiB/sec".format(numArrays, bytes_per_sec / 2**30))
def _convert_strings(self, s):
    '''
    Convert string field names to binary vectors.
    '''
    # Initialize to zero
    values = ak.zeros(s.size, dtype=ak.int64)
    if self.separator == '':
        # When separator is empty, field names are guaranteed to be single characters
        for name, shift in zip(self.names, self.shifts):
            # Check if name exists in each string
            bit = s.contains(name)
            values = values | ak.where(bit, 1 << shift, 0)
    else:
        # When separator is non-empty, split on it
        sf, segs = s.flatten(self.separator, return_segments=True)
        # Create a grouping to map split fields back to originating string
        orig = ak.broadcast(segs, ak.arange(segs.size), sf.size)
        g = ak.GroupBy(orig)
        for name, shift in zip(self.names, self.shifts):
            # Check if name matches one of the split fields from originating string
            bit = g.any(sf == name)[1]
            values = values | ak.where(bit, 1 << shift, 0)
    return values
def time_ak_aggregate(N_per_locale, trials, seed):
    print(">>> arkouda aggregate")
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    keys, intvals, boolvals = generate_arrays(N, seed)
    g = ak.GroupBy(keys, assume_sorted=True)
    for op in ak.GroupBy.Reductions:
        if op in BOOLOPS:
            v = boolvals
        else:
            v = intvals
        totalbytes = v.size * v.itemsize
        timings = []
        for i in range(trials):
            start = time.time()
            res = g.aggregate(v, op)[1]
            end = time.time()
            timings.append(end - start)
        tavg = sum(timings) / trials
        print("Aggregate {} Average time = {:.4f} sec".format(op, tavg))
        bytes_per_sec = totalbytes / tavg
        print("Aggregate {} Average rate = {:.4f} GiB/sec".format(op, bytes_per_sec / 2**30))
def inner_join(left, right, wherefunc=None, whereargs=None):
    '''Perform inner join on values in <left> and <right>, using conditions defined
    by <wherefunc> evaluated on <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns a pdarray(bool)
        used to filter the join. Results for which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`
    '''
    from inspect import signature
    sample = min((left.size, right.size, 5))
    if wherefunc is not None:
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Need dense 0-up right index, to filter out left not in right
    keep, (denseLeft, denseRight) = right_align(left, right)
    keep = ak.arange(keep.size)[keep]
    # GroupBy right
    byRight = ak.GroupBy(denseRight)
    # Get segment boundaries (starts, ends) of right for each left item
    rightSegs = ak.concatenate((byRight.segments, ak.array([denseRight.size])))
    starts = rightSegs[denseLeft]
    ends = rightSegs[denseLeft + 1]
    fullSize = (ends - starts).sum()
    # print(f"{left.size+right.size:,} input rows --> {fullSize:,} joins ({fullSize/(left.size+right.size):.1f} x) ")
    # gen_ranges for gather of right items
    fullSegs, ranges = gen_ranges(starts, ends)
    # Evaluate where clause
    if wherefunc is None:
        filtRanges = ranges
        filtSegs = fullSegs
        keep12 = keep
    else:
        # Gather right whereargs
        rightWhere = whereargs[1][byRight.permutation][ranges]
        # Expand left whereargs
        leftWhere = expand(whereargs[0][keep], fullSegs, ranges.size)
        # Evaluate wherefunc and filter ranges, recompute segments
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        filtRanges = ranges[whereSatisfied]
        scan = ak.cumsum(whereSatisfied) - whereSatisfied
        filtSegsWithZeros = scan[fullSegs]
        filtSegSizes = ak.concatenate((
            filtSegsWithZeros[1:] - filtSegsWithZeros[:-1],
            ak.array([whereSatisfied.sum() - filtSegsWithZeros[-1]])
        ))
        keep2 = (filtSegSizes > 0)
        filtSegs = filtSegsWithZeros[keep2]
        keep12 = keep[keep2]
    # Gather right inds and expand left inds
    rightInds = byRight.permutation[filtRanges]
    leftInds = expand(ak.arange(left.size)[keep12], filtSegs, filtRanges.size)
    return leftInds, rightInds
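# Hedged usage sketch for inner_join (assumes a running arkouda server and that the
# helper functions it references -- right_align, gen_ranges, expand -- are available).
# The exact ordering of the returned indices is an implementation detail, so only the
# invariant documented in the docstring is checked here.
import arkouda as ak

ak.connect()
left = ak.array([0, 1, 2, 2])
right = ak.array([2, 2, 3, 1])
leftInds, rightInds = inner_join(left, right)
assert (left[leftInds] == right[rightInds]).all()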
def test_groupby(self):
    g = ak.GroupBy([self.dtvec1, self.tdvec1])
    self.assertTrue(isinstance(g.unique_keys[0], ak.Datetime))
    self.assertTrue(isinstance(g.unique_keys[1], ak.Timedelta))
    self.assertTrue(g.unique_keys[0].is_sorted())
def run_test(levels):
    d = make_arrays()
    df = pd.DataFrame(d)
    akdf = {k: ak.array(v) for k, v in d.items()}
    if levels == 1:
        akg = ak.GroupBy(akdf['keys'])
        keyname = 'keys'
    elif levels == 2:
        akg = ak.GroupBy([akdf['keys'], akdf['keys2']])
        keyname = ['keys', 'keys2']
    tests = 0
    failures = 0
    not_impl = 0
    print("Doing .count()")
    tests += 1
    pdkeys, pdvals = groupby_to_arrays(df, keyname, 'int64', 'count', levels)
    # print("Pandas:")
    # print(pdkeys)
    # print(pdvals)
    akkeys, akvals = akg.count()
    # akkeys = akkeys.to_ndarray()
    akvals = akvals.to_ndarray()
    # print("Arkouda:")
    # print(akkeys)
    # print(akvals)
    # if not np.allclose(pdkeys, akkeys):
    #     print(f"Different keys")
    #     failures += 1
    failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
    # elif not np.allclose(pdvals, akvals):
    #     print(f"Different values (abs diff = {np.abs(pdvals - akvals).sum()})")
    #     failures += 1
    for vname in ('int64', 'float64', 'bool'):
        for op in ak.GroupBy.Reductions:
            print(f"\nDoing aggregate({vname}, {op})")
            tests += 1
            do_check = True
            try:
                pdkeys, pdvals = groupby_to_arrays(df, keyname, vname, op, levels)
                # print("Pandas:")
                # print(pdkeys)
                # print(pdvals)
            except Exception as E:
                print("Pandas does not implement")
                do_check = False
            try:
                akkeys, akvals = akg.aggregate(akdf[vname], op)
                # akkeys = akkeys.to_ndarray()
                akvals = akvals.to_ndarray()
                # print("Arkouda:")
                # print(akkeys)
                # print(akvals)
            except RuntimeError as E:
                print("Arkouda error: ", E)
                not_impl += 1
                do_check = False
                continue
            if not do_check:
                continue
            if op.startswith('arg'):
                pdextrema = df[vname][pdvals]
                akextrema = akdf[vname][ak.array(akvals)].to_ndarray()
                if not np.allclose(pdextrema, akextrema):
                    print("Different argmin/argmax: Arkouda failed to find an extremum")
                    print("pd: ", pdextrema)
                    print("ak: ", akextrema)
                    failures += 1
            else:
                # if not np.allclose(pdkeys, akkeys):
                #     print(f"Different keys")
                #     failures += 1
                failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
                # elif not np.allclose(pdvals, akvals):
                #     print(f"Different values (abs diff = {np.where(np.isfinite(pdvals) & np.isfinite(akvals), np.abs(pdvals - akvals), 0).sum()})")
                #     failures += 1
    print(f"\n{failures} failures in {tests} tests ({not_impl} not implemented)")
args.prob, perm=args.perm)
print("ii = ", (ii.size, ii))
print("ii(min,max) = ", (ii.min(), ii.max()))
print("jj = ", (jj.size, jj))
print("jj(min,max) = ", (jj.min(), jj.max()))

nda_ii = ii.to_ndarray()  # convert to ndarray for plotting
nda_jj = jj.to_ndarray()  # convert to ndarray for plotting
plt.scatter(nda_ii, nda_jj)
plt.show()

df = {"ii": ii, "jj": jj}

grps = ak.GroupBy(ii)
ukeys, cts = grps.count()
print("counts", (cts.min(), cts.max()))
nBins = ak.max(cts)
nda_cts = cts.to_ndarray()  # convert to ndarray for plotting
plt.hist(nda_cts, bins=nBins)
plt.yscale('log')
plt.show()

ukeys, nu = grps.nunique(jj)
print("nunique", (nu.min(), nu.max()))
nBins = nu.max()
nda_nu = nu.to_ndarray()  # convert to ndarray for plotting
plt.hist(nda_nu, bins=nBins)
plt.yscale('log')
plt.show()
def in1dmulti(a, b, assume_unique=False, symmetric=False):
    """
    The multi-level analog of ak.in1d -- test membership of rows of a in the set of rows of b.

    Parameters
    ----------
    a : list of pdarrays
        Rows are elements for which to test membership in b
    b : list of pdarrays
        Rows are elements of the set in which to test membership
    assume_unique : bool
        If true, assume rows of a and b are each unique and sorted. By default,
        sort and unique them explicitly.
    symmetric : bool
        If true, return membership in both directions: rows of a in b, and rows of b in a.

    Returns
    -------
    pdarray, bool
        True for each row in a that is contained in b

    Notes
    -----
    Only works for pdarrays of int64 dtype, Strings, or Categorical
    """
    if isinstance(a, (ak.pdarray, ak.Strings, ak.Categorical)):
        if type(a) != type(b):
            raise TypeError("Arguments must have same type")
        if symmetric:
            return ak.in1d(a, b), ak.in1d(b, a)
        else:
            return ak.in1d(a, b)
    atypes = np.array([ai.dtype for ai in a])
    btypes = np.array([bi.dtype for bi in b])
    if not (atypes == btypes).all():
        raise TypeError("Array dtypes of arguments must match")
    if not assume_unique:
        ag = ak.GroupBy(a)
        ua = ag.unique_keys
        bg = ak.GroupBy(b)
        ub = bg.unique_keys
    else:
        ua = a
        ub = b
    # Key for deinterleaving result
    isa = ak.concatenate(
        (ak.ones(ua[0].size, dtype=ak.bool), ak.zeros(ub[0].size, dtype=ak.bool)),
        ordered=False)
    c = [ak.concatenate(x, ordered=False) for x in zip(ua, ub)]
    g = ak.GroupBy(c)
    k, ct = g.count()
    if assume_unique:
        # need to verify uniqueness, otherwise answer will be wrong
        if (g.sum(isa)[1] > 1).any():
            raise NonUniqueError("Called with assume_unique=True, but first argument is not unique")
        if (g.sum(~isa)[1] > 1).any():
            raise NonUniqueError("Called with assume_unique=True, but second argument is not unique")
    # Where value appears twice, it is present in both a and b
    # truth = answer in c domain
    truth = g.broadcast(ct == 2, permute=True)
    if assume_unique:
        # Deinterleave truth into a and b domains
        if symmetric:
            return truth[isa], truth[~isa]
        else:
            return truth[isa]
    else:
        # If didn't start unique, first need to deinterleave into ua domain,
        # then broadcast to a domain
        atruth = ag.broadcast(truth[isa], permute=True)
        if symmetric:
            btruth = bg.broadcast(truth[~isa], permute=True)
            return atruth, btruth
        else:
            return atruth
def add(self, b):
    index = self.index.concat(b.index).index
    values = ak.concatenate([self.values, b.values], ordered=False)
    return Series(ak.GroupBy(index).sum(values))
def __init__(self, segments, values, copy=False, lengths=None, grouping=None):
    """
    An array of variable-length arrays, also called a skyline array or ragged array.

    Parameters
    ----------
    segments : pdarray, int64
        Start index of each sub-array in the flattened values array
    values : pdarray
        The flattened values of all sub-arrays
    copy : bool
        If True, make a copy of the input arrays; otherwise, just store a reference.

    Returns
    -------
    SegArray
        Data structure representing an array whose elements are variable-length arrays.

    Notes
    -----
    Keyword args 'lengths' and 'grouping' are not user-facing. They are used by
    the attach method.
    """
    if not isinstance(segments, ak.pdarray) or segments.dtype != ak.int64:
        raise TypeError("Segments must be int64 pdarray")
    if not ak.is_sorted(segments) or (ak.unique(segments).size != segments.size):
        raise ValueError("Segments must be unique and in sorted order")
    if segments.size > 0:
        if segments.min() != 0 or segments.max() >= values.size:
            raise ValueError("Segments must start at zero and be less than values.size")
    elif values.size > 0:
        raise ValueError("Cannot have non-empty values with empty segments")
    if copy:
        self.segments = segments[:]
        self.values = values[:]
    else:
        self.segments = segments
        self.values = values
    self.size = segments.size
    self.valsize = values.size
    if lengths is None:
        self.lengths = self._get_lengths()
    else:
        self.lengths = lengths
    self.dtype = values.dtype
    if grouping is None:
        if self.size == 0:
            self.grouping = ak.GroupBy(ak.zeros(0, dtype=ak.int64))
        else:
            # Treat each sub-array as a group, for grouped aggregations
            self.grouping = ak.GroupBy(
                ak.broadcast(self.segments, ak.arange(self.size), self.valsize))
    else:
        self.grouping = grouping
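# Constructing a SegArray directly from segments and flattened values (a sketch,
# assuming a running arkouda server). The sub-arrays here are [1, 2], [3], and [4, 5, 6].
import arkouda as ak

ak.connect()
segments = ak.array([0, 2, 3])
values = ak.array([1, 2, 3, 4, 5, 6])
sa = SegArray(segments, values)
print(sa.lengths)   # expected: [2 1 3]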
def intersect(a, b, positions=True, unique=False):
    """
    Find the intersection of two arkouda arrays.

    This function can be especially useful when `positions=True` so
    that the caller gets the indices of values present in both arrays.

    Parameters
    ----------
    a : ak.Strings or ak.pdarray
        An array of strings or int64 values
    b : ak.Strings or ak.pdarray
        An array of strings or int64 values
    positions : bool (default=True)
        Return tuple of boolean pdarrays that indicate positions in a and b
        where the values are in the intersection.
    unique : bool (default=False)
        If the number of distinct values in `a` (and `b`) is equal to the size of
        `a` (and `b`), there is a more efficient method to compute the intersection.

    Returns
    -------
    (ak.pdarray, ak.pdarray)
        The indices of `a` and `b` where any element occurs at least once in both arrays.
    """
    # To ensure compatibility with all types of arrays:
    if isinstance(a, ak.pdarray) and isinstance(b, ak.pdarray):
        intx = ak.intersect1d(a, b)
        if not positions:
            return intx
        else:
            maska = ak.in1d(a, intx)
            maskb = ak.in1d(b, intx)
            return (maska, maskb)

    # It takes more effort to do this with ak.Strings arrays.
    elif isinstance(a, ak.Strings) and isinstance(b, ak.Strings):
        # Hash the two arrays first
        hash_a00, hash_a01 = a.hash()
        hash_b00, hash_b01 = b.hash()

        # a and b do not have duplicate entries, so the hashes are distinct
        if unique:
            hash0 = ak.concatenate([hash_a00, hash_b00])
            hash1 = ak.concatenate([hash_a01, hash_b01])

            # Group by the unique hashes
            gb = ak.GroupBy([hash0, hash1])
            val, cnt = gb.count()

            # Hash counts, in groupby order
            counts = gb.broadcast(cnt, permute=False)

            # Same, in original order
            tmp = counts[:]
            counts[gb.permutation] = tmp
            del tmp

            # Masks
            maska = (counts > 1)[:a.size]
            maskb = (counts > 1)[a.size:]

            # The intersection for each array of hash values
            if positions:
                return (maska, maskb)
            else:
                return a[maska]

        # a and b may have duplicate entries, so get the unique hash values
        else:
            gba = ak.GroupBy([hash_a00, hash_a01])
            gbb = ak.GroupBy([hash_b00, hash_b01])

            # Take the unique keys as the hash we'll work with
            a0, a1 = gba.unique_keys
            b0, b1 = gbb.unique_keys
            hash0 = ak.concatenate([a0, b0])
            hash1 = ak.concatenate([a1, b1])

            # Group by the unique hashes
            gb = ak.GroupBy([hash0, hash1])
            val, cnt = gb.count()

            # Hash counts, in groupby order
            counts = gb.broadcast(cnt, permute=False)

            # Restore the original order
            tmp = counts[:]
            counts[gb.permutation] = tmp
            del tmp

            # Broadcast back up one more level
            countsa = counts[:a0.size]
            countsb = counts[a0.size:]
            counts2a = gba.broadcast(countsa, permute=False)
            counts2b = gbb.broadcast(countsb, permute=False)

            # Restore the original orders
            tmp = counts2a[:]
            counts2a[gba.permutation] = tmp
            del tmp
            tmp = counts2b[:]
            counts2b[gbb.permutation] = tmp
            del tmp

            # Masks
            maska = (counts2a > 1)
            maskb = (counts2b > 1)

            # The intersection for each array of hash values
            if positions:
                return (maska, maskb)
            else:
                return a[maska]
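# Sketch of intersect with int64 pdarrays (assumes a running arkouda server); this
# exercises the ak.intersect1d branch above. Strings inputs instead take the
# hash-based GroupBy path.
import arkouda as ak

ak.connect()
a = ak.array([1, 2, 1, 3])
b = ak.array([1, 4])
maska, maskb = intersect(a, b)
print(maska)   # expected: [True False True False]
print(maskb)   # expected: [True False]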
def inner_join2(left, right, wherefunc=None, whereargs=None, forceDense=False):
    '''Perform inner join on values in <left> and <right>, using conditions defined
    by <wherefunc> evaluated on <whereargs>, returning indices of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns a pdarray(bool)
        used to filter the join. Results for which wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`
    '''
    if not isinstance(left, ak.pdarray) or left.dtype != ak.int64 \
       or not isinstance(right, ak.pdarray) or right.dtype != ak.int64:
        raise ValueError("left and right must be pdarray(int64)")
    if wherefunc is not None:
        from inspect import signature
        sample = min((left.size, right.size, 5))
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError("wherefunc must be a function that accepts exactly two arguments")
        if whereargs is None or len(whereargs) != 2:
            raise ValueError("whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError("Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError("Right whereargs must be same size as right join values")
        try:
            _ = wherefunc(whereargs[0][:sample], whereargs[1][:sample])
        except Exception as e:
            raise ValueError("Error evaluating wherefunc") from e
    # Only join on intersection
    inter = ak.intersect1d(left, right)
    # Indices of left values present in intersection
    leftInds = ak.arange(left.size)[ak.in1d(left, inter)]
    # Left vals in intersection
    leftFilt = left[leftInds]
    # Indices of right vals present in inter
    rightInds = ak.arange(right.size)[ak.in1d(right, inter)]
    # Right vals in inter
    rightFilt = right[rightInds]
    byLeft = ak.GroupBy(leftFilt)
    byRight = ak.GroupBy(rightFilt)
    maxVal = inter.max()
    if forceDense or maxVal > 3 * (left.size + right.size):
        # Remap intersection to dense, 0-up codes
        # Replace left values with dense codes
        uniqLeftVals = byLeft.unique_keys
        uniqLeftCodes = ak.arange(inter.size)[ak.in1d(inter, uniqLeftVals)]
        leftCodes = ak.zeros_like(leftFilt) - 1
        leftCodes[byLeft.permutation] = byLeft.broadcast(uniqLeftCodes, permute=False)
        # Replace right values with dense codes
        uniqRightVals = byRight.unique_keys
        uniqRightCodes = ak.arange(inter.size)[ak.in1d(inter, uniqRightVals)]
        rightCodes = ak.zeros_like(rightFilt) - 1
        rightCodes[byRight.permutation] = byRight.broadcast(uniqRightCodes, permute=False)
        countSize = inter.size
    else:
        uniqLeftCodes = byLeft.unique_keys
        uniqRightCodes = byRight.unique_keys
        leftCodes = leftFilt
        rightCodes = rightFilt
        countSize = maxVal + 1
    # Expand indices to product domain
    # First count occurrences of each code in left and right
    leftCounts = ak.zeros(countSize, dtype=ak.int64)
    leftCounts[uniqLeftCodes] = byLeft.count()[1]
    rightCounts = ak.zeros(countSize, dtype=ak.int64)
    rightCounts[uniqRightCodes] = byRight.count()[1]
    # Repeat each left index as many times as that code occurs in right
    prodLeft = rightCounts[leftCodes]
    leftFullInds = ak.broadcast(ak.cumsum(prodLeft) - prodLeft, leftInds, prodLeft.sum())
    prodRight = leftCounts[rightCodes]
    rightFullInds = ak.broadcast(ak.cumsum(prodRight) - prodRight, rightInds, prodRight.sum())
    # Evaluate where clause
    if wherefunc is None:
        return leftFullInds, rightFullInds
    else:
        # Gather whereargs
        leftWhere = whereargs[0][leftFullInds]
        rightWhere = whereargs[1][rightFullInds]
        # Evaluate wherefunc and filter ranges, recompute segments
        whereSatisfied = wherefunc(leftWhere, rightWhere)
        return leftFullInds[whereSatisfied], rightFullInds[whereSatisfied]
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 7:
        print(f"Usage: {sys.argv[0]} <server> <port> <strategy (0=global, 1=perLocale)> <length> <num_keys> <num_vals>")
        sys.exit()
    per_locale = (sys.argv[3] == '1')
    print("per_locale = ", per_locale)
    length = int(sys.argv[4])
    print("length = ", length)
    nkeys = int(sys.argv[5])
    print("nkeys = ", nkeys)
    nvals = int(sys.argv[6])
    print("nvals = ", nvals)
    ak.connect(sys.argv[1], int(sys.argv[2]))
    print("Generating keys and vals...")
    start = time()
    keys, vals = generate_arrays(length, nkeys, nvals)
    print(f"{time() - start:.2f} seconds", end="\n\n")
    print("GroupBy...")
    start = time()
    g = ak.GroupBy(keys, per_locale)
    print(f"{time() - start:.2f} seconds", end="\n\n")
    for op in OPERATORS:
        print(f"Aggregate('{op}') ...")
        start = time()
        uk, rv = g.aggregate(vals, op)
        print(f"{time() - start:.2f} seconds", end="\n\n")
    sys.exit()
def cluster(self, min_cluster_size=5):
    cluster_data = {}
    last_level_delta = self.level_data[0].delta
    # Initial setup; all levels are the same size
    num_nodes = self.level_data[0].size
    # This dataframe holds extraction data
    selection_data = aku.DataFrame({
        'stability': ak.zeros(1, dtype=ak.float64),
        'parent': ak.zeros(1, dtype=ak.int64),
    })
    # Create an initial cluster dataframe
    labels = ak.arange(num_nodes)
    sizes = ak.ones(num_nodes, dtype=ak.int64)
    stability = ak.zeros(num_nodes, dtype=ak.float64)
    selected = ak.zeros(num_nodes, dtype=ak.bool)
    df = aku.DataFrame({
        'cc': self.level_data[0].cc,
        'labels': labels,
        'sizes': sizes,
        'stability': stability,
    })
    # The result should have all the same keys as the deltas
    cluster_data[self.level_data[0].delta] = df

    # We don't start with level 0, it gets passed through as is.
    for level in tqdm(self.level_data[1:]):
        bylevel = ak.GroupBy(level.cc)
        perm = bylevel.permutation
        # Save for later analysis
        old_labels = labels[:]
        # Count number of nodes in each group
        _, c = bylevel.count()
        # Find largest (negative) label value in each group
        _, max_group_labels = bylevel.aggregate(labels, 'min')
        # Find maximum of existing cluster sizes from last iteration.
        _, max_group_size = bylevel.aggregate(sizes, 'max')
        # Find the maximum stability in each group
        _, max_group_stability = bylevel.aggregate(stability, 'max')
        # Find the number of sub-clusters in each group for purposes of creating new cluster labels
        clusters_and_zeros = ak.where(labels < 0, labels, 0)
        _, num_unique_labels = bylevel.aggregate(clusters_and_zeros, 'nunique')
        _, min_group_label = bylevel.aggregate(labels, 'max')
        num_sub_clusters = num_unique_labels - ak.where(min_group_label >= 0, 1, 0)

        # Update sizes
        count_bc = bylevel.broadcast(c, permute=False)
        sizes = ak.zeros(num_nodes, dtype=ak.int64)
        sizes[perm] = count_bc

        # Update labels to max (negative) in group
        labels_bc = bylevel.broadcast(max_group_labels, permute=False)
        labels = ak.zeros(num_nodes, dtype=ak.int64)
        labels[perm] = labels_bc

        # Update stability
        stability_bc = bylevel.broadcast(max_group_stability, permute=False)
        stability = ak.zeros(num_nodes, dtype=ak.float64)
        stability[perm] = stability_bc

        # Create and update labels as needed, baseline size is 1
        # Only need to test if there are at least two cluster labels in a group.
        new_clusters_join = (num_sub_clusters > 1)
        new_clusters_form = ((c >= min_cluster_size) & (max_group_labels >= 0))
        condition = (new_clusters_join | new_clusters_form)
        num_new_labels = int(condition.sum())
        new_labels_positioned = ak.zeros(c.size, dtype=np.int64)

        if num_new_labels > 0:
            # Set up selection_data
            mn = abs(int(labels.min()))
            new_label_values = ak.arange(mn + 1, mn + num_new_labels + 1, 1) * (-1)
            new_labels_positioned = ak.zeros(c.size, dtype=np.int64)
            new_labels_positioned[condition] = new_label_values

            # Update selection_data
            update_df = aku.DataFrame({
                'parent': ak.zeros(num_new_labels, dtype=ak.int64),
                'stability': ak.zeros(num_new_labels, dtype=ak.float64),
            })
            selection_data.append(update_df)

            # Update the labels
            labels_bc = bylevel.broadcast(new_labels_positioned, permute=False)
            new_labels = ak.zeros(num_nodes, dtype=ak.int64)
            new_labels[perm] = labels_bc
            tmp = ak.where(new_labels < 0, new_labels, labels)
            labels = tmp

            # When clusters become absorbed into new clusters, add their parent labels and update stability
            mask = ((labels < 0) & (old_labels < 0) & (labels < old_labels))
            if mask.sum() > 0:
                t1 = old_labels[mask]
                t2 = labels[mask]
                t3 = stability[mask]
                bychangedlabels = ak.GroupBy([t1, t2])
                [old, new] = bychangedlabels.unique_keys
                # I don't remember the purpose of this line, but it's never used.
                #stabby = t3[aku.invert_permutation(bychangedlabels.permutation)][bychangedlabels.segments]
                selection_data['parent'][-1 * old] = -1 * new

            # Set new cluster stability to 0
            new_label_bc = bylevel.broadcast(new_labels_positioned, permute=False)
            tmp = ak.zeros(labels.size, dtype=np.int64)
            tmp[perm] = new_label_bc
            stability[tmp < 0] = 0

        # Update stability
        added_stability = sizes / (level.delta - last_level_delta)
        last_level_delta = level.delta
        tmp = ak.where(sizes >= min_cluster_size, stability + added_stability, stability)
        stability = tmp

        # Save this information after processing
        df = aku.DataFrame({
            'cc': level.cc,
            'labels': labels,
            'sizes': sizes,
            'stability': stability,
        })
        cluster_data[level.delta] = df

        # Update cluster selection information
        bylabel = ak.GroupBy(labels)
        keys = labels[bylabel.permutation][bylabel.segments]
        stab = stability[bylabel.permutation][bylabel.segments]
        indx = (keys[keys < 0]) * (-1)
        vals = stab[keys < 0]
        selection_data['stability'][indx] = vals

    # Set up data for next steps
    self.cluster_data = cluster_data
    self.selection_data = selection_data
    # Select and extract
    self.select_clusters()
    self.extract_clusters()
    print("Clustering is complete!")
    return self.extracted_clusters
# unique
akuniq = ak.unique(strings)
catuniq = ak.unique(cat)
akset = set(akuniq.to_ndarray())
catset = set(catuniq.to_ndarray())
assert (akset == catset)
# There should be no duplicates
assert (akuniq.size == len(akset))
npset = set(np.unique(test_strings))
# When converted to a set, should agree with numpy
assert (akset == npset)
print("unique passed")

# groupby
g = ak.GroupBy(strings)
gc = ak.GroupBy(cat)
# Unique keys should be same result as ak.unique
assert (akset == set(g.unique_keys.to_ndarray()))
assert (akset == set(gc.unique_keys.to_ndarray()))
assert ((gc.permutation == g.permutation).all())
permStrings = strings[g.permutation]
# Check each group individually
lengths = np.diff(np.hstack((g.segments.to_ndarray(), np.array([g.size]))))
for uk, s, l in zip(g.unique_keys, g.segments, lengths):
    # All values in group should equal key
    assert ((permStrings[s:s + l] == uk).all())
    # Key should not appear anywhere outside of group
    assert (not (permStrings[:s] == uk).any())
    assert (not (permStrings[s + l:] == uk).any())
print("groupby passed")