def time_ak_in1d(size, trials):
    print(">>> arkouda int64 in1d")
    cfg = ak.get_config()
    N = size * cfg["numLocales"]
    a = ak.arange(N) % LARGE
    for regime, bsize in zip(('Medium', 'Large'), (MEDIUM, LARGE)):
        print("{} regime: numLocales = {} a.size = {:,} b.size = {:,}".format(
            regime, cfg["numLocales"], N, bsize))
        b = ak.arange(bsize)
        expected_misses = (LARGE - bsize) * (a.size // LARGE) + max(
            (0, (a.size % LARGE) - bsize))
        timings = []
        for _ in range(trials):
            start = time.time()
            c = ak.in1d(a, b)
            end = time.time()
            timings.append(end - start)
        assert (c.size - c.sum()) == expected_misses, "Incorrect result"
        tavg = sum(timings) / trials
        print("{} average time = {:.4f} sec".format(regime, tavg))
        bytes_per_sec = (a.size * a.itemsize + b.size * b.itemsize) / tavg
        print("{} average rate = {:.2f} GiB/sec".format(
            regime, bytes_per_sec / 2**30))
def check_correctness():
    asize = 10**4
    bsize = 10**3
    a = ak.arange(asize)
    b = ak.arange(bsize)
    c = ak.in1d(a, b)
    assert c.sum() == bsize, "Incorrect result"
def append_single(self, x, prepend=False):
    '''
    Append a single value to each sub-array.

    Parameters
    ----------
    x : pdarray or scalar
        Single value to append to each sub-array
    prepend : bool
        If True, prepend the value to each sub-array instead of appending it

    Returns
    -------
    SegArray
        Copy of original SegArray with values from x appended to each sub-array
    '''
    if hasattr(x, 'size'):
        if x.size != self.size:
            raise ValueError(
                'Argument must be scalar or same size as SegArray')
        if type(x) != type(self.values) or x.dtype != self.dtype:
            raise TypeError(
                'Argument type must match value type of SegArray')
    newlens = self.lengths + 1
    newsegs = ak.cumsum(newlens) - newlens
    newvals = ak.zeros(newlens.sum(), dtype=self.dtype)
    if prepend:
        lastscatter = newsegs
    else:
        lastscatter = newsegs + newlens - 1
    newvals[lastscatter] = x
    origscatter = ak.arange(self.valsize) + self.grouping.broadcast(
        ak.arange(self.size), permute=True)
    if prepend:
        origscatter += 1
    newvals[origscatter] = self.values
    return SegArray(newsegs, newvals)
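# Illustrative usage sketch (not from the original source), assuming a SegArray is
# constructed from segment offsets and a flat values array as in the code above;
# append_single adds one element to every sub-array with a single scatter:
#
#     segs = ak.array([0, 3, 5])                    # sub-arrays: [0 1 2], [3 4], [5 6 7]
#     vals = ak.arange(8)
#     sa = SegArray(segs, vals)
#     tail = sa.append_single(-1)                   # every sub-array now ends with -1
#     head = sa.append_single(-1, prepend=True)     # every sub-array now starts with -1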
def check_correctness():
    N = 10**4
    thirds = [ak.cast(ak.arange(i, N*3, 3), 'str') for i in range(3)]
    thickrange = thirds[0].stick(thirds[1], delimiter='_').stick(
        thirds[2], delimiter='_')
    answer = ak.cast(ak.arange(N*3), 'str')
    assert (thickrange.flatten('_') == answer).all()
    assert (thickrange.flatten('_', regex=True) == answer).all()
    assert (thickrange.flatten('_+', regex=True) == answer).all()
def check_set_integer_iv(N):
    # create np version
    a = np.arange(N)
    iv = np.arange(N // 2)
    a[iv] = iv * 10
    # create ak version
    b = ak.arange(N)
    iv = ak.arange(N // 2)
    b[iv] = iv * 10
    # print(a,b)
    c = a == b.to_ndarray()
    # print(type(c),c)
    return pass_fail(c.all())
def check_set_integer_iv_value(N):
    # create np version
    a = np.arange(N)
    iv = np.arange(N // 2)
    a[iv] = -1
    a = ak.array(a)
    # create ak version
    b = ak.arange(N)
    iv = ak.arange(N // 2)
    b[iv] = -1
    # print(a,b)
    c = a == b
    # print(type(c),c)
    return pass_fail(c.all())
def check_correctness():
    keys = ak.arange(1000) % 10
    ones = ak.ones_like(keys)
    g = ak.GroupBy(keys)
    # Make sure keys are correct
    assert (g.unique_keys == ak.arange(10)).all()
    # Check value of sums
    assert (g.sum(ones)[1] == 100).all()
    # For other ops, just run them and make sure they return the right size vector
    for op in ak.GroupBy.Reductions:
        if op in BOOLOPS:
            res = g.aggregate((ones == 1), op)[1]
        else:
            res = g.aggregate(ones, op)[1]
        assert (res.size == g.unique_keys.size)
def most_common(g, values):
    '''
    Find the most common value for each key in a GroupBy object.

    Parameters
    ----------
    g : ak.GroupBy
        Grouping of keys
    values : array-like
        Values in which to find most common

    Returns
    -------
    most_common_values : array-like
        The most common value for each key, aligned with g.unique_keys
    '''
    # Give each key an integer index
    keyidx = g.broadcast(ak.arange(g.unique_keys[0].size), permute=True)
    # Annex values and group by (key, val)
    bykeyval = ak.GroupBy([keyidx, values])
    # Count number of records for each (key, val)
    (ki, uval), count = bykeyval.count()
    # Group out value
    bykey = ak.GroupBy(ki, assume_sorted=True)
    # Find the index of the most frequent value for each key
    _, topidx = bykey.argmax(count)
    # Gather the most frequent values
    return uval[topidx]
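# Illustrative usage sketch (not from the original source). Because the function
# indexes g.unique_keys[0], it expects a GroupBy built from a *list* of key arrays,
# so a single key is wrapped in a list here:
#
#     keys = ak.array([0, 0, 0, 1, 1, 2])
#     vals = ak.array([7, 7, 9, 3, 3, 5])
#     g = ak.GroupBy([keys])
#     mc = most_common(g, vals)   # expected [7, 3, 5], aligned with unique keys [0, 1, 2]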
def argsort(self, key, ascending=True):
    """
    Return the permutation that sorts the dataframe by `key`.

    Parameters
    ----------
    key : str
        The key to sort on.
    ascending : bool
        If True (default), sort in ascending order; otherwise descending.

    Returns
    -------
    ak.pdarray
        The permutation array that sorts the data on `key`.
    """
    if self._empty:
        return ak.array([], dtype=ak.int64)
    if ascending:
        return ak.argsort(self[key])
    else:
        if isinstance(self[key], ak.pdarray) and self[key].dtype in (
                ak.int64, ak.float64):
            return ak.argsort(-self[key])
        else:
            return ak.argsort(self[key])[ak.arange(self.size - 1, -1, -1)]
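# Illustrative usage sketch (not from the original source; the dataframe `df` and
# column name 'rank' are hypothetical): the returned permutation can be applied back
# to a column with fancy indexing to materialize the sorted order.
#
#     perm = df.argsort('rank', ascending=False)
#     ranked = df['rank'][perm]   # column values in descending order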
def time_ak_reduce(N_per_locale, trials, dtype, random):
    print(">>> arkouda reduce")
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    if random:
        if dtype == 'int64':
            a = ak.randint(0, 2**32, N)
        elif dtype == 'float64':
            a = ak.randint(0, 1, N, dtype=ak.float64)
    else:
        a = ak.arange(0, N, 1)
        if dtype == 'float64':
            a = 1.0 * a

    timings = {op: [] for op in OPS}
    results = {}
    for i in range(trials):
        for op in timings.keys():
            fxn = getattr(a, op)
            start = time.time()
            r = fxn()
            end = time.time()
            timings[op].append(end - start)
            results[op] = r
    tavg = {op: sum(t) / trials for op, t in timings.items()}

    for op, t in tavg.items():
        print("{} = {}".format(op, results[op]))
        print(" Average time = {:.4f} sec".format(t))
        bytes_per_sec = (a.size * a.itemsize) / t
        print(" Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
def get_ngrams(self, n, return_origins=True):
    """
    Return all n-grams from all sub-arrays.

    Parameters
    ----------
    n : int
        Length of n-gram
    return_origins : bool
        If True, return an int64 array indicating which sub-array each
        returned n-gram came from.

    Returns
    -------
    ngrams : list of pdarray
        An n-long list of pdarrays, essentially a table where each row is an n-gram.
    origin_indices : pdarray, int
        The index of the sub-array from which the corresponding n-gram originated
    """
    ngrams = []
    notsegstart = ak.ones(self.valsize, dtype=ak.bool)
    notsegstart[self.segments] = False
    valid = ak.ones(self.valsize - n + 1, dtype=ak.bool)
    for i in range(n):
        end = self.valsize - n + i + 1
        ngrams.append(self.values[i:end])
        if i > 0:
            valid &= notsegstart[i:end]
    ngrams = [char[valid] for char in ngrams]
    if return_origins:
        origin_indices = self.grouping.broadcast(
            ak.arange(self.size), permute=True)[:valid.size][valid]
        return ngrams, origin_indices
    else:
        return ngrams
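# Illustrative usage sketch (not from the original source): bigrams never span a
# sub-array boundary because positions that start a segment are masked out of `valid`.
#
#     segs = ak.array([0, 3])                  # sub-arrays: [1 2 3], [4 5]
#     vals = ak.array([1, 2, 3, 4, 5])
#     sa = SegArray(segs, vals)
#     (left, right), origins = sa.get_ngrams(2)
#     # left = [1, 2, 4], right = [2, 3, 5], origins = [0, 0, 1]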
def time_ak_scan(N_per_locale, trials, dtype, random, seed):
    print(">>> arkouda {} scan".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    if random or seed is not None:
        if dtype == 'int64':
            a = ak.randint(1, N, N, seed=seed)
        elif dtype == 'float64':
            a = ak.uniform(N, seed=seed) + 0.5
    else:
        a = ak.arange(1, N, 1)
        if dtype == 'float64':
            a = 1.0 * a

    timings = {op: [] for op in OPS}
    final_values = {}
    for i in range(trials):
        for op in timings.keys():
            fxn = getattr(ak, op)
            start = time.time()
            r = fxn(a)
            end = time.time()
            timings[op].append(end - start)
            final_values[op] = r[r.size - 1]
    tavg = {op: sum(t) / trials for op, t in timings.items()}

    for op, t in tavg.items():
        print("{}, final value = {}".format(op, final_values[op]))
        print(" {} Average time = {:.4f} sec".format(op, t))
        bytes_per_sec = (a.size * a.itemsize * 2) / t
        print(" {} Average rate = {:.2f} GiB/sec".format(op, bytes_per_sec / 2**30))
def remove_repeats(self, return_multiplicity=False):
    """
    Condense sequences of repeated values within a sub-array to a single value.

    Parameters
    ----------
    return_multiplicity : bool
        If True, also return the number of times each value was repeated.

    Returns
    -------
    norepeats : SegArray
        Sub-arrays with runs of repeated values replaced with single value
    multiplicity : SegArray
        If return_multiplicity=True, this array contains the number of times
        each value in the returned SegArray was repeated in the original SegArray.
    """
    isrepeat = ak.zeros(self.values.size, dtype=ak.bool)
    isrepeat[1:] = self.values[:-1] == self.values[1:]
    isrepeat[self.segments] = False
    truepaths = self.values[~isrepeat]
    nhops = self.grouping.sum(~isrepeat)[1]
    # truehops = ak.cumsum(~isrepeat)
    # nhops = ak.concatenate((truehops[self.segments[1:]], ak.array([truehops.sum()+1]))) - truehops[self.segments]
    truesegs = ak.cumsum(nhops) - nhops
    norepeats = SegArray(truesegs, truepaths)
    if return_multiplicity:
        truehopinds = ak.arange(self.valsize)[~isrepeat]
        multiplicity = ak.zeros(truepaths.size, dtype=ak.int64)
        multiplicity[:-1] = truehopinds[1:] - truehopinds[:-1]
        multiplicity[-1] = self.valsize - truehopinds[-1]
        return norepeats, SegArray(truesegs, multiplicity)
    else:
        return norepeats
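# Illustrative usage sketch (not from the original source): runs of equal adjacent
# values within each sub-array collapse to one value, and the optional multiplicity
# SegArray records the run lengths.
#
#     segs = ak.array([0, 4])                          # sub-arrays: [1 1 2 2], [2 3 3 3]
#     vals = ak.array([1, 1, 2, 2, 2, 3, 3, 3])
#     sa = SegArray(segs, vals)
#     norep, mult = sa.remove_repeats(return_multiplicity=True)
#     # norep sub-arrays: [1 2], [2 3]; mult sub-arrays: [2 2], [1 3]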
def from_multi_array(cls, m):
    """
    Construct a SegArray from a list of columns. This essentially transposes
    the input, resulting in an array of rows.

    Parameters
    ----------
    m : list of pdarray
        List of columns, the rows of which will form the sub-arrays of the output

    Returns
    -------
    SegArray
        Array of rows of input
    """
    if isinstance(m, ak.pdarray):
        size = m.size
        n = 1
        dtype = m.dtype
    else:
        s = set(mi.size for mi in m)
        if len(s) != 1:
            raise ValueError("All columns must have same length")
        size = s.pop()
        n = len(m)
        d = set(mi.dtype for mi in m)
        if len(d) != 1:
            raise ValueError("All columns must have same dtype")
        dtype = d.pop()
    newsegs = ak.arange(size) * n
    newvals = ak.zeros(size * n, dtype=dtype)
    for j in range(len(m)):
        newvals[j::len(m)] = m[j]
    return cls(newsegs, newvals)
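# Illustrative usage sketch (not from the original source), assuming the function is
# bound as a classmethod on SegArray as its `cls` parameter suggests: a list of
# equal-length columns is transposed into one sub-array per row.
#
#     col0 = ak.array([1, 2, 3])
#     col1 = ak.array([10, 20, 30])
#     sa = SegArray.from_multi_array([col0, col1])
#     # sub-arrays: [1 10], [2 20], [3 30]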
def check_get_integer_idx(N):
    # create np version
    a = np.arange(N)
    v1 = a[N // 2]
    # create ak version
    b = ak.arange(N)
    v2 = b[N // 2]
    return pass_fail(v1 == v2)
def check_arange(N):
    # create np version
    a = ak.array(np.arange(N))
    # create ak version
    b = ak.arange(N)
    # print(a,b)
    c = a == b
    # print(type(c),c)
    return pass_fail(c.all())
def time_flatten(N, trials):
    print(">>> arkouda flatten")
    cfg = ak.get_config()
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    thirds = [ak.cast(ak.arange(i, N*3, 3), 'str') for i in range(3)]
    thickrange = thirds[0].stick(thirds[1], delimiter='_').stick(
        thirds[2], delimiter='_')
    nbytes = thickrange.nbytes * thickrange.entry.itemsize

    non_regex_times = []
    regex_literal_times = []
    regex_pattern_times = []
    for i in range(trials):
        start = time.time()
        non_regex = thickrange.flatten('_')
        end = time.time()
        non_regex_times.append(end - start)

        start = time.time()
        regex_literal = thickrange.flatten('_', regex=True)
        end = time.time()
        regex_literal_times.append(end - start)

        start = time.time()
        regex_pattern = thickrange.flatten('_+', regex=True)
        end = time.time()
        regex_pattern_times.append(end - start)

    avg_non_regex = sum(non_regex_times) / trials
    avg_regex_literal = sum(regex_literal_times) / trials
    avg_regex_pattern = sum(regex_pattern_times) / trials

    answer = ak.cast(ak.arange(N*3), 'str')
    assert (non_regex == answer).all()
    assert (regex_literal == answer).all()
    assert (regex_pattern == answer).all()

    print("non-regex flatten with literal delimiter Average time = {:.4f} sec".format(avg_non_regex))
    print("regex flatten with literal delimiter Average time = {:.4f} sec".format(avg_regex_literal))
    print("regex flatten with pattern delimiter Average time = {:.4f} sec".format(avg_regex_pattern))
    print("non-regex flatten with literal delimiter Average rate = {:.4f} GiB/sec".format(nbytes/2**30/avg_non_regex))
    print("regex flatten with literal delimiter Average rate = {:.4f} GiB/sec".format(nbytes/2**30/avg_regex_literal))
    print("regex flatten with pattern delimiter Average rate = {:.4f} GiB/sec".format(nbytes/2**30/avg_regex_pattern))
def refinement(N):
    '''
    Coargsort of two arrays, where the first is already sorted
    but has many repeated values
    '''
    groupsize = 100
    a = ak.arange(N // 2) // groupsize
    factor = 2**32 // a.max()
    a *= factor
    b = ak.randint(0, 2**32, N // 2)
    yield 'refinement int64', (a, b)
def reset_index(self, size=False):
    """
    Set the index to an integer range.

    Useful if this dataframe is the result of a slice operation from another
    dataframe, or if you have permuted the rows and no longer need to keep
    that ordering on the rows.

    Parameters
    ----------
    size : int
        If size is passed, do not attempt to determine size based on
        existing column sizes. Assume caller handles consistency correctly.
    """
    if not size:
        self.update_size()
        self.data['index'] = ak.arange(0, self._size)
    else:
        self.data['index'] = ak.arange(size)
def check_set_bool_iv(N):
    # create np version
    a = np.arange(N)
    a[a < N // 2] = a[:N // 2] * -1
    # create ak version
    b = ak.arange(N)
    b[b < N // 2] = b[:N // 2] * -1
    # print(a,b)
    c = a == b.to_ndarray()
    # print(type(c),c)
    return pass_fail(c.all())
def __setitem__(self, key, value):
    self.update_size()

    # If this is the first column added, we must create an index column.
    add_index = False
    if self._empty:
        add_index = True

    # Set a single row in the dataframe using a dict of values
    if type(key) == int:
        for k in self.data.keys():
            if isinstance(self.data[k], ak.Strings):
                raise ValueError(
                    "This DataFrame has a column of type ak.Strings;"
                    " so this DataFrame is immutable. This feature could change"
                    " if arkouda supports mutable Strings in the future.")
        if self._empty:
            raise ValueError(
                "Initial data must be dict of arkouda arrays.")
        elif not isinstance(value, (dict, UserDict)):
            raise ValueError("Expected dict or Row type.")
        elif key >= self._size:
            raise KeyError("The row index is out of range.")
        else:
            for k, v in value.items():
                if k == 'index':
                    continue
                self[k][key] = v

    # Set a single column in the dataframe using an arkouda array
    elif type(key) == str:
        if not isinstance(value, self.COLUMN_CLASSES):
            raise ValueError(
                f"Column must be one of {self.COLUMN_CLASSES}.")
        elif self._size is not None and self._size != value.size:
            raise ValueError(
                "Expected size {} but received size {}.".format(
                    self.size, value.size))
        else:
            self._empty = False
            UserDict.__setitem__(self, key, value)
            # Update the index values
            if key not in self._columns:
                self._columns.append(key)

    # Do nothing and return if there's no valid data
    else:
        raise ValueError("No valid data received.")

    # Update the dataframe indices and metadata.
    if add_index:
        self.update_size()
        self.data['index'] = ak.arange(0, self._size, 1)
def check_set_bool_iv_value(N):
    # create np version
    a = np.arange(N)
    a[a < N // 2] = -1
    a = ak.array(a)
    # create ak version
    b = ak.arange(N)
    b[b < N // 2] = -1
    # print(a,b)
    c = a == b
    # print(type(c),c)
    return pass_fail(c.all())
def check_get_bool_iv(N):
    # create np version
    a = np.arange(N)
    a = a[a < N // 2]
    a = ak.array(a)
    # create ak version
    b = ak.arange(N)
    b = b[b < N // 2]
    # print(a,b)
    c = a == b
    # print(type(c),c)
    return pass_fail(c.all())
def check_set_integer_idx(N):
    # create np version
    a = np.arange(N)
    a[N // 2] = -1
    a[-1] = -1
    v1 = a[N // 2]
    # create ak version
    b = ak.arange(N)
    b[N // 2] = -1
    b[-1] = -1
    v2 = b[N // 2]
    return pass_fail(v1 == v2) and pass_fail(a[-1] == b[-1])
def check_bool(N):
    a = ak.arange(N)
    b = ak.ones(N)
    try:
        c = a and b
        # No exception raised: truth-value of a multi-element pdarray should be
        # ambiguous, so reaching this line means the check failed.
        correct = False
    except ValueError:
        correct = True
    except:
        correct = False
    d = ak.array([1])
    correct = correct and (d and 5)
    return pass_fail(correct)
def argsort(self, ascending=True):
    if not ascending:
        if isinstance(self.index, ak.pdarray) and self.index.dtype in (
                ak.int64, ak.float64):
            i = ak.argsort(-self.index)
        else:
            i = ak.argsort(self.index)[ak.arange(self.index.size - 1, -1, -1)]
    else:
        i = ak.argsort(self.index)
    return i
def block_sorted(N):
    '''
    The concatenation of two sorted arrays of unequal length
    The interleaving of two sorted arrays of unequal length
    Most often occurs in array setops, where two arrays are uniqued (via sorting),
    then concatenated and sorted
    '''
    splitpoint = 0.4
    Na = int(splitpoint * N)
    Nb = N - Na
    # Construct a and b such that:
    # 1) Values overlap
    # 2) a and b are sorted
    a = ak.arange(Na)
    b = ak.arange(Nb)
    c = ak.concatenate((a, b), ordered=True)
    yield 'block-sorted concat int64', c
    ci = ak.concatenate((a, b), ordered=False)
    yield 'block-sorted interleaved int64', ci
def check_sort(N):
    # create np version
    a = np.arange(N)
    a = a[::-1]
    a = np.sort(a)
    # create ak version
    b = ak.arange(N)
    b = b[::-1]
    b = ak.sort(b)
    # print(a,b)
    c = a == b.to_ndarray()
    # print(type(c),c)
    return pass_fail(c.all())
def check_coargsort(N):
    # create np version
    a = np.arange(N)
    a = a[::-1]
    iv = np.lexsort([a, a])
    a = a[iv]
    # create ak version
    b = ak.arange(N)
    b = b[::-1]
    iv = ak.coargsort([b, b])
    b = b[iv]
    # print(a,b)
    c = a == b.to_ndarray()
    # print(type(c),c)
    return pass_fail(c.all())
def in1d_intervals(vals, intervals, symmetric=False, assume_unique=False):
    """
    Test each value for membership in *any* of a set of half-open (pythonic) intervals.

    Parameters
    ----------
    vals : pdarray(int, float)
        Values to test for membership in intervals
    intervals : 2-tuple of pdarrays
        Non-overlapping, half-open intervals, as a tuple of
        (lower_bounds_inclusive, upper_bounds_exclusive)
    symmetric : bool
        If True, also return boolean pdarray indicating which intervals
        contained one or more query values.
    assume_unique : bool
        Passed through to search_intervals when locating each value's
        containing interval.

    Returns
    -------
    pdarray(bool)
        Array of same length as <vals>, True if corresponding value is
        included in any of the half-open ranges [low[i], high[i])
    pdarray(bool) (if symmetric=True)
        Array of same length as number of intervals, True if corresponding
        interval contains any of the values in <vals>.

    Notes
    -----
    First return array is equivalent to the following:
        ((vals >= intervals[0][0]) & (vals < intervals[1][0])) |
        ((vals >= intervals[0][1]) & (vals < intervals[1][1])) |
        ...
        ((vals >= intervals[0][-1]) & (vals < intervals[1][-1]))
    But much faster when testing many ranges.

    Second (optional) return array is equivalent to:
        ((intervals[0] <= vals[0]) & (intervals[1] > vals[0])) |
        ((intervals[0] <= vals[1]) & (intervals[1] > vals[1])) |
        ...
        ((intervals[0] <= vals[-1]) & (intervals[1] > vals[-1]))
    But much faster when vals is non-trivial size.
    """
    idx = search_intervals(vals, intervals, assume_unique=assume_unique)
    found = idx > -1
    if symmetric:
        containresult = ak.in1d(ak.arange(intervals[0].size), idx)
        return found, containresult
    else:
        return found
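# Illustrative usage sketch (not from the original source; requires search_intervals
# to be importable alongside this function): membership of values in two half-open
# intervals, with symmetric=True also reporting which intervals were hit.
#
#     vals = ak.array([1, 5, 12])
#     lows = ak.array([0, 10])
#     highs = ak.array([3, 20])
#     hit, interval_hit = in1d_intervals(vals, (lows, highs), symmetric=True)
#     # hit = [True, False, True]; interval_hit = [True, True]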