def check_set_bool_iv(N):
    """Compare boolean-mask assignment in NumPy against arkouda.

    The first ``N // 2`` entries of ``arange(N)`` are negated through a
    boolean mask, once with NumPy and once with arkouda, and the two
    results are compared element-wise.

    Parameters
    ----------
    N : int
        Length of the arrays to build.

    Returns
    -------
    The value produced by ``pass_fail`` for "all elements equal".
    """
    half = N // 2

    # Reference result, computed locally and then shipped to the server.
    reference = np.arange(N)
    reference[reference < half] = reference[:half] * -1
    expected = ak.array(reference)

    # Same assignment carried out directly on an arkouda array.
    actual = ak.arange(N)
    actual[actual < half] = actual[:half] * -1

    agreement = expected == actual
    return pass_fail(agreement.all())
def check_get_slice(N):
    """Compare strided slice reads (``[::2]``) in NumPy against arkouda.

    Parameters
    ----------
    N : int
        Length of the all-ones arrays that get sliced.

    Returns
    -------
    The value produced by ``pass_fail`` for "all elements equal".
    """
    # Reference: slice with NumPy, then transfer the result.
    expected = ak.array(np.ones(N)[::2])

    # Slice performed on the arkouda array itself.
    actual = ak.ones(N)[::2]

    agreement = expected == actual
    return pass_fail(agreement.all())
def check_correctness(dtype, random):
    """Assert that an arkouda scatter matches the equivalent NumPy scatter.

    Parameters
    ----------
    dtype : str
        Element type of the target/value arrays; the random path only
        generates values for ``'int64'`` and ``'float64'``.
    random : bool
        If true, scatter random values; otherwise scatter ones.

    Raises
    ------
    AssertionError
        If the arkouda result is not close to the NumPy result.
    """
    num_indices = 10**4
    num_values = 10**4

    # The indices are a shuffled arange and therefore unique. With
    # repeated indices the result of an unordered scatter would vary.
    np_index = np.arange(num_indices)
    np.random.shuffle(np_index)
    np_target = np.zeros(num_values, dtype=dtype)
    ak_index = ak.array(np_index)
    ak_target = ak.zeros(num_values, dtype=dtype)

    if not random:
        np_values = np.ones(num_indices, dtype=dtype)
    elif dtype == 'int64':
        np_values = np.random.randint(0, 2**32, num_indices)
    elif dtype == 'float64':
        np_values = np.random.random(num_indices)
    ak_values = ak.array(np_values)

    np_target[np_index] = np_values
    ak_target[ak_index] = ak_values
    assert np.allclose(np_target, ak_target.to_ndarray())
def check_bool(N):
    """Check truth-value semantics of arkouda arrays.

    Two things are verified:

    * evaluating ``a and b`` on the arrays built from ``N`` must raise
      ``ValueError``;
    * a one-element array must be usable in a boolean expression
      (``d and 5``).

    Parameters
    ----------
    N : int
        Length of the arrays used for the ``ValueError`` check.

    Returns
    -------
    The value produced by ``pass_fail`` for the combined outcome.
    """
    a = ak.arange(N)
    b = ak.ones(N)
    try:
        _ = a and b
    except ValueError:
        correct = True
    except Exception:
        # Any other error type is the wrong behaviour for this check.
        correct = False
    else:
        # No exception at all is also a failure. Without this branch
        # `correct` would be unbound and the function would crash below.
        correct = False
    d = ak.array([1])
    correct = correct and (d and 5)
    return pass_fail(correct)
def check_set_slice_value(N):
    """Compare scalar assignment to a strided slice in NumPy and arkouda.

    Every second element of an all-ones array is set to ``-1`` with both
    libraries and the results are compared element-wise.

    Parameters
    ----------
    N : int
        Length of the arrays to build.

    Returns
    -------
    The value produced by ``pass_fail`` for "all elements equal".
    """
    # Reference result built locally.
    reference = np.ones(N)
    reference[::2] = -1
    expected = ak.array(reference)

    # Same assignment on the arkouda side.
    actual = ak.ones(N)
    actual[::2] = -1

    agreement = expected == actual
    return pass_fail(agreement.all())
def locate(self,key):
    """Look up values by index label.

    Accepted keys are a scalar, a list/tuple of scalars, a list of lists
    (for a MultiIndex), an arkouda array, or a Series. Plain Python
    sequences are converted to arkouda arrays before the lookup.

    When a Series is passed, its values are used as the lookup key and
    its own index labels are kept on the result.

    Returns
    -------
    A Series containing the values corresponding to the key.
    """
    if isinstance(key, Series):
        # Special case: keep the key's labels, look up by its values.
        labels = key.index
        wanted = key.values
        found = aku.lookup(self.index.index, self.values, wanted)
        return Series((labels, found))

    pdarray = ak.pdarrayclass.pdarray
    if isinstance(key, pdarray):
        selector = self.index.lookup(key)
    elif type(key) in (list, tuple):
        head = key[0]
        if isinstance(head, (list, tuple)):
            # Nested sequence: convert unless it already holds pdarrays.
            if not isinstance(head[0], pdarray):
                key = [ak.array(column) for column in np.array(key).T.copy()]
        elif not isinstance(head, pdarray):
            # Flat sequence of scalars.
            key = ak.array(key)
        # Otherwise: already a sequence of pdarrays, used unchanged.
        selector = self.index.lookup(key)
    else:
        # Scalar label.
        selector = self.index == key

    return Series((self.index[selector], self.values[selector]))
def check_correctness(dtype, random, seed):
    """Assert that an array survives a NumPy -> arkouda -> NumPy round trip.

    Parameters
    ----------
    dtype : str
        ``'int64'`` or ``'float64'``; selects how the test data is drawn.
    random : bool
        Not used by this function.
    seed : int or None
        If not None, seeds NumPy's global random generator first.

    Raises
    ------
    AssertionError
        If the array read back from arkouda differs from the original.
    """
    N = 10**4
    if seed is not None:
        np.random.seed(seed)

    if dtype == 'int64':
        original = np.random.randint(1, N, N)
    elif dtype == 'float64':
        original = np.random.random(N) + 0.5

    round_tripped = ak.array(original).to_ndarray()
    assert np.allclose(original, round_tripped)
def check_sort(N):
    """Compare sorting of a reversed ``arange(N)`` in NumPy and arkouda.

    Parameters
    ----------
    N : int
        Length of the arrays to build.

    Returns
    -------
    The value produced by ``pass_fail`` for "all elements equal".
    """
    # Reference: reverse and sort locally, then transfer.
    expected = ak.array(np.sort(np.arange(N)[::-1]))

    # Reverse and sort on the arkouda side.
    actual = ak.sort(ak.arange(N)[::-1])

    agreement = expected == actual
    return pass_fail(agreement.all())
def check_set_integer_iv(N):
    """Compare integer-index-vector assignment in NumPy and arkouda.

    Positions ``0 .. N // 2 - 1`` of ``arange(N)`` are overwritten with
    ten times their index, using an integer index vector.

    Parameters
    ----------
    N : int
        Length of the arrays to build.

    Returns
    -------
    The value produced by ``pass_fail`` for "all elements equal".
    """
    half = N // 2

    # Reference result built locally.
    reference = np.arange(N)
    np_positions = np.arange(half)
    reference[np_positions] = np_positions * 10
    expected = ak.array(reference)

    # Same scatter on the arkouda side.
    actual = ak.arange(N)
    ak_positions = ak.arange(half)
    actual[ak_positions] = ak_positions * 10

    agreement = expected == actual
    return pass_fail(agreement.all())
def conn_comp(src, dst, printCComp=False, printLayers=False):
    """Collect the connected components reachable from the vertices in ``src``.

    Repeatedly runs ``bfs`` from the first still-unvisited vertex and
    removes everything that search visited, until no vertex is left.

    Parameters
    ----------
    src, dst
        Edge list arrays, passed through unchanged to ``bfs``.
    printCComp : bool
        If true, print the visited/unvisited sets as the search proceeds.
    printLayers : bool
        Forwarded to ``bfs``.

    Returns
    -------
    list
        One entry per component: the ``visited`` array returned by ``bfs``.
    """
    remaining = ak.unique(src)
    if printCComp:
        print("unvisited size = ", remaining.size, remaining)

    components = []
    while remaining.size > 0:
        # Representative vertex: the first remaining one (the lowest id,
        # assuming ak.unique / ak.setdiff1d return sorted output).
        seed_vertex = remaining[0]
        _layers, visited = bfs(src, dst, ak.array([seed_vertex]), printLayers)
        components.append(visited)
        # Drop everything this search reached.
        remaining = ak.setdiff1d(remaining, visited)
        if printCComp:
            print(" visited size = ", visited.size, visited)
            print("unvisited size = ", remaining.size, remaining)
    return components
def drop_duplicates(self, subset=None, keep='first'):
    """
    Drops duplicated rows and returns resulting DataFrame.

    If a subset of the columns are provided then only one instance of each
    duplicated row will be returned (keep determines which row).

    Parameters
    ----------
    subset : Iterable of column names to use to dedupe. When omitted or
        empty, every column except the first entry of ``self._columns``
        is used.
    keep : {'first', 'last'}, default 'first'
        Determines which duplicates (if any) to keep. Any value other
        than 'last' is treated as 'first'.

    Returns
    -------
    DataFrame
        DataFrame with duplicates removed. An empty DataFrame is
        returned as-is.

    Raises
    ------
    KeyError
        If a name in ``subset`` is not a column; the message names the
        offending column.
    """
    if self._empty:
        return self

    if not subset:
        subset = self._columns[1:]

    # Validate every requested column up front and report the one that is
    # actually missing (not merely the first entry of `subset`).
    for col in subset:
        if col not in self.data:
            raise KeyError(
                "{} is not a column in the DataFrame.".format(col))

    if len(subset) == 1:
        # Single key: GroupBy is given the array itself, not a list.
        grouping = ak.GroupBy(self.data[subset[0]])
    else:
        grouping = ak.GroupBy([self.data[col] for col in subset])

    if keep == 'last':
        # Last row of a group sits just before the next group's start;
        # the final group ends at the end of the permutation.
        segment_ends = ak.concatenate(
            [grouping.segments[1:] - 1,
             ak.array([grouping.permutation.size - 1])])
        return self[grouping.permutation[segment_ends]]
    return self[grouping.permutation[grouping.segments]]
def sample(self, n=5):
    """
    Return a random sample of `n` rows.

    The size is refreshed via ``update_size()`` first. If the DataFrame
    has `n` rows or fewer, the DataFrame itself is returned.

    Parameters
    ----------
    n : int (default=5)
        Number of rows to return.

    Returns
    -------
    akutil.DataFrame
        The sampled `n` rows of the DataFrame.
    """
    self.update_size()
    if self._size > n:
        # Draw n distinct row positions without replacement.
        chosen_rows = random.sample(range(self._size), n)
        return self[ak.array(chosen_rows)]
    return self
def __init__(self, ar_tuple=None,data=None, index=None):
    """Build the series from an ``(index, values)`` pair or from ``data``.

    Parameters
    ----------
    ar_tuple : 2-sequence, optional
        ``ar_tuple[0]`` is handed to ``aku.Index.factory`` and
        ``ar_tuple[1]`` is stored as the values. Takes precedence over
        ``data`` / ``index`` when given.
    data : optional
        Values; converted with ``ak.array`` unless already a pdarray.
    index : optional
        Index for ``data``; defaults to ``ak.arange(data.size)``.

    Raises
    ------
    TypeError
        If both ``ar_tuple`` and ``data`` are None.
    ValueError
        If the index and the values differ in size.
    """
    if ar_tuple is None and data is None:
        raise TypeError("ar_tuple and data cannot both be null")

    if ar_tuple is not None:
        self.index = aku.Index.factory(ar_tuple[0])
        self.values = ar_tuple[1]
    else:
        if not isinstance(data, ak.pdarrayclass.pdarray):
            data = ak.array(data)
        self.values = data
        if index is None:
            index = ak.arange(data.size)
        self.index = aku.Index.factory(index)

    if self.index.size != self.values.size:
        raise ValueError("Index and data must have same length")
    self.size = self.index.size
def check_correctness(dtype, random):
    """Assert that every reduction in ``OPS`` agrees between NumPy and arkouda.

    Parameters
    ----------
    dtype : str
        ``'int64'`` or ``'float64'``; selects how the input is built.
    random : bool
        If true, use random input; otherwise use a deterministic range.

    Raises
    ------
    AssertionError
        If a reduction result differs between the two libraries.
    """
    N = 10**4
    if random and dtype == 'int64':
        data = np.random.randint(0, 2**32, N)
    elif random and dtype == 'float64':
        data = np.random.random(N)
    elif not random and dtype == 'int64':
        data = np.arange(0, N, 1, dtype=dtype)
    elif not random and dtype == 'float64':
        data = np.arange(1, 1 + 1 / N, (1 / N) / N, dtype=dtype)

    for op in OPS:
        # A fresh server-side copy is made for each reduction.
        remote = ak.array(data)
        np_result = getattr(data, op)()
        ak_result = getattr(remote, op)()
        assert np.isclose(np_result, ak_result)
def time_ak_read(N_per_locale, numfiles, trials, dtype, path, seed, parquet):
    """Time reading files matching ``path + '*'`` and print the average.

    Parameters
    ----------
    N_per_locale : int
        Elements per locale; only used for the printed total ``N``.
    numfiles : int
        Only used in the printed header.
    trials : int
        Number of timed reads.
    dtype : str
        Only used in the printed header.
    path : str
        Path prefix; ``'*'`` is appended to form the read pattern.
    seed
        Not used by this function.
    parquet : bool
        If true read with ``ak.read_parquet``, otherwise ``ak.read_all``.
    """
    print(">>> arkouda {} read".format(dtype))
    cfg = ak.get_config()
    total_elems = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}, filesPerLoc = {}".format(
        cfg["numLocales"], total_elems, numfiles))

    loaded = ak.array([])
    elapsed = []
    for _ in range(trials):
        began = time.time()
        if parquet:
            loaded = ak.read_parquet(path + '*')
        else:
            loaded = ak.read_all(path + '*')
        finished = time.time()
        elapsed.append(finished - began)

    mean_seconds = sum(elapsed) / trials
    print("read Average time = {:.4f} sec".format(mean_seconds))

    # Throughput is based on the array returned by the last read.
    num_bytes = loaded.size * loaded.itemsize
    print("read Average rate = {:.2f} GiB/sec".format(
        num_bytes / 2**30 / mean_seconds))
def expand(size, segs, vals):
    """
    Expand an array with values placed into the indicated segments.

    The first value and the successive differences of `vals` are written
    at positions `segs` of a zero array, which is then cumulatively
    summed. Assuming `segs` is increasing, each position from ``segs[i]``
    up to the next segment start therefore carries ``vals[i]``.

    Parameters
    ----------
    size : int
        Length of the expanded array (passed to ``ak.zeros``)
    segs : ak.pdarray
        The indices where the values should be placed
    vals : ak.pdarray
        The values to be placed in each segment

    Returns
    -------
    pdarray
        The expanded array.
    """
    steps = ak.zeros(size, vals.dtype)
    leading = ak.array([vals[0]])
    increments = vals[1:] - vals[:-1]
    steps[segs] = ak.concatenate((leading, increments))
    return ak.cumsum(steps)
def coargsort(self, keys, ascending=True):
    """
    Return the permutation that sorts the dataframe by `keys`.

    Parameters
    ----------
    keys : list
        The keys to sort on.
    ascending : bool (default=True)
        If false, the ascending permutation is returned in reverse order.

    Returns
    -------
    ak.pdarray
        The permutation array that sorts the data on `keys`. An empty
        int64 array when the dataframe is empty.
    """
    if self._empty:
        return ak.array([], dtype=ak.int64)

    columns = [self[key] for key in keys]
    order = ak.coargsort(columns)
    if ascending:
        return order
    # Descending: index the ascending permutation back to front.
    return order[ak.arange(self.size - 1, -1, -1)]
def concat(cls, x, axis=0, ordered=True):
    """
    Concatenate a sequence of SegArrays

    Parameters
    ----------
    x : sequence of SegArray
        The SegArrays to concatenate
    axis : 0 or 1
        Select vertical (0) or horizontal (1) concatenation. If axis=1,
        all SegArrays must have same size.
    ordered : bool
        Must be True. This option is present for compatibility only,
        because unordered concatenation is not yet supported.

    Returns
    -------
    SegArray
        The input arrays joined into one SegArray. ``NotImplemented`` is
        returned if any element of `x` is not an instance of `cls`.

    Raises
    ------
    ValueError
        If `ordered` is false, `x` is empty, the dtypes differ, the sizes
        differ for axis=1, or `axis` is neither 0 nor 1.
    """
    if not ordered:
        raise ValueError(
            "Unordered concatenation not yet supported on SegArray; use ordered=True."
        )
    if len(x) == 0:
        raise ValueError("Empty sequence passed to concat")
    if not all(isinstance(item, cls) for item in x):
        return NotImplemented
    if len({item.dtype for item in x}) != 1:
        raise ValueError(
            "SegArrays must all have same dtype to concatenate")

    if axis == 0:
        offset = 0
        shifted_segments = []
        value_parts = []
        for item in x:
            # Segment offsets need to be raised by length of previous values
            shifted_segments.append(item.segments + offset)
            offset += item.valsize
            # Values can just be concatenated
            value_parts.append(item.values)
        return cls(ak.concatenate(shifted_segments),
                   ak.concatenate(value_parts))

    if axis != 1:
        raise ValueError(
            "Supported values for axis are 0 (vertical concat) or 1 (horizontal concat)"
        )

    # Horizontal concatenation: row i of the result is the rows i of all
    # inputs joined in order.
    distinct_sizes = {item.size for item in x}
    if len(distinct_sizes) != 1:
        raise ValueError(
            "SegArrays must all have same size to concatenate with axis=1"
        )
    if distinct_sizes.pop() == 0:
        return x[0]

    value_dtype = list(x)[0].dtype
    joined_lengths = sum(item.lengths for item in x)
    joined_segments = ak.cumsum(joined_lengths) - joined_lengths
    # Ignore sub-arrays that are empty in all arrays
    has_values = ak.concatenate(
        (joined_segments[:-1] < joined_segments[1:], ak.array([True])))
    write_starts = joined_segments[has_values]
    joined_values = ak.zeros(joined_lengths.sum(), dtype=value_dtype)
    for item in x:
        # Build a scan input that steps up at the start of each slot
        # owned by the current array and back down at its end; the
        # cumulative sum then marks exactly those slots.
        owner = ak.zeros(joined_values.size + 1, dtype=ak.int64)
        owner[write_starts] += 1
        item_lengths = item.lengths[has_values]
        owner[write_starts + item_lengths] -= 1
        owner = (ak.cumsum(owner[:-1]) == 1)
        joined_values[owner] = item.values
        # The next array's values begin where this one's ended.
        write_starts += item_lengths
    return cls(joined_segments, joined_values, copy=False)
description="Example of cosine distance/similarity in arkouda") parser.add_argument('--server', default="localhost", help='server/Hostname of arkouda server') parser.add_argument('--port', type=int, default=5555, help='Port of arkouda server') args = parser.parse_args() ak.v = False ak.connect(server=args.server, port=args.port) u1 = [1, 0, 0] v1 = [0, 1, 0] d1 = ak_cos_dist(ak.array(u1), ak.array(v1)) print("d1 = ", d1) # d1 should be 1.0 assert (np.allclose(d1, distance.cosine(u1, v1))) u2 = [100, 0, 0] d2 = ak_cos_dist(ak.array(u2), ak.array(v1)) print("d2 = ", d2) # d2 should be 1.0 assert (np.allclose(d2, distance.cosine(u2, v1))) u3 = [1, 1, 0] d3 = ak_cos_dist(ak.array(u3), ak.array(v1)) print("d3 = ", d3) # d3 should be 0.29289321881345254 assert (np.allclose(d3, distance.cosine(u3, v1)))
# Small smoke script: connects to an arkouda server and exercises mixed
# arkouda/NumPy arithmetic plus several forms of slice assignment.
import numpy as np
import math
import gc
import sys
import arkouda as ak

ak.v = False
# Optional command line: <server> <port>.
# NOTE(review): the guard only checks that one argument is present but
# sys.argv[2] is read as well — confirm both are always supplied together.
if len(sys.argv) > 1:
    ak.connect(server=sys.argv[1], port=sys.argv[2])
else:
    ak.connect()

# Add an arkouda range to an array created from NumPy data, then pull the
# result back as a NumPy array.
a = ak.arange(0, 10, 1)
b = np.linspace(10, 20, 10)
c = ak.array(b)
d = a + c
e = d.to_ndarray()

# Assign a scalar to every second element.
a = ak.ones(10)
a[::2] = 0
print(a)

# Assign a 5-element array to the odd positions of a 10-element array.
a = ak.ones(10)
b = ak.zeros(5)
a[1::2] = b
print(a)

# Whole-array slice assignment across dtypes.
a = ak.zeros(10) # float64
b = ak.arange(0,10,1) # int64
a[:] = b # cast b to float64
return all(x == y for x, y in zip(a, b)) errors = False if __name__ == '__main__': if len(sys.argv) > 1: ak.connect(server=sys.argv[1], port=sys.argv[2]) else: ak.connect() with open(__file__, 'r') as f: base_words = np.array(f.read().split()) test_strings = np.random.choice(base_words, N, replace=True) strings = ak.array(test_strings) cat = ak.Categorical(strings) print("strings =", strings) print("categorical =", cat) # int index assert (strings[N // 3] == test_strings[N // 3]) assert (cat[N // 3] == test_strings[N // 3]) print("int index passed") # slice assert (compare_strings(strings[N // 4:N // 3].to_ndarray(), test_strings[N // 4:N // 3])) assert (compare_strings(cat[N // 4:N // 3].to_ndarray(), test_strings[N // 4:N // 3])) print("slice passed")
def run_test(levels):
    """Compare arkouda GroupBy results with pandas and print a summary.

    Runs ``count`` and then every reduction in ``ak.GroupBy.Reductions``
    over the 'int64', 'float64' and 'bool' columns of the data returned
    by ``make_arrays``.

    Parameters
    ----------
    levels : int
        1 groups by 'keys'; 2 groups by 'keys' and 'keys2'.
    """
    raw = make_arrays()
    frame = pd.DataFrame(raw)
    ak_columns = {name: ak.array(column) for name, column in raw.items()}
    if levels == 1:
        grouped = ak.GroupBy(ak_columns['keys'])
        key_names = 'keys'
    elif levels == 2:
        grouped = ak.GroupBy([ak_columns['keys'], ak_columns['keys2']])
        key_names = ['keys', 'keys2']

    num_tests = 0
    num_failures = 0
    num_not_impl = 0

    print(f"Doing .count()")
    num_tests += 1
    pd_keys, pd_vals = groupby_to_arrays(frame, key_names, 'int64', 'count', levels)
    ak_keys, ak_vals = grouped.count()
    ak_vals = ak_vals.to_ndarray()
    num_failures += compare_keys(pd_keys, ak_keys, levels, pd_vals, ak_vals)

    for vname in ('int64', 'float64', 'bool'):
        for op in ak.GroupBy.Reductions:
            print(f"\nDoing aggregate({vname}, {op})")
            num_tests += 1

            # Pandas may lack the reduction; arkouda is still exercised.
            pandas_ok = True
            try:
                pd_keys, pd_vals = groupby_to_arrays(frame, key_names, vname, op, levels)
            except Exception:
                print("Pandas does not implement")
                pandas_ok = False

            try:
                ak_keys, ak_vals = grouped.aggregate(ak_columns[vname], op)
                ak_vals = ak_vals.to_ndarray()
            except RuntimeError as err:
                print("Arkouda error: ", err)
                num_not_impl += 1
                continue

            if not pandas_ok:
                continue

            if not op.startswith('arg'):
                num_failures += compare_keys(pd_keys, ak_keys, levels, pd_vals, ak_vals)
                continue

            # argmin/argmax: compare the values found at the returned
            # positions rather than the positions themselves.
            pd_extrema = frame[vname][pd_vals]
            ak_extrema = ak_columns[vname][ak.array(ak_vals)].to_ndarray()
            if not np.allclose(pd_extrema, ak_extrema):
                print(f"Different argmin/argmax: Arkouda failed to find an extremum")
                print("pd: ", pd_extrema)
                print("ak: ", ak_extrema)
                num_failures += 1

    print(f"\n{num_failures} failures in {num_tests} tests ({num_not_impl} not implemented)")
def inner_join(left, right, wherefunc=None, whereargs=None):
    '''Perform inner join on values in <left> and <right>, optionally
    filtered by <wherefunc> evaluated on <whereargs>, returning indices
    of left-right pairs.

    Parameters
    ----------
    left : pdarray(int64)
        The left values to join
    right : pdarray(int64)
        The right values to join
    wherefunc : function, optional
        Function that takes two pdarray arguments and returns a
        pdarray(bool) used to filter the join. Results for which
        wherefunc is False will be dropped.
    whereargs : 2-tuple of pdarray
        The two pdarray arguments to wherefunc

    Returns
    -------
    leftInds : pdarray(int64)
        The left indices of pairs that meet the join condition
    rightInds : pdarray(int64)
        The right indices of pairs that meet the join condition

    Raises
    ------
    ValueError
        If wherefunc does not take exactly two parameters, whereargs is
        not a 2-tuple sized like left/right, or a trial evaluation of
        wherefunc on a small sample raises.

    Notes
    -----
    The return values satisfy the following assertions

    `assert (left[leftInds] == right[rightInds]).all()`
    `assert wherefunc(whereargs[0][leftInds], whereargs[1][rightInds]).all()`
    '''
    from inspect import signature

    probe = min((left.size, right.size, 5))
    filtered = wherefunc is not None
    if filtered:
        if len(signature(wherefunc).parameters) != 2:
            raise ValueError(
                "wherefunc must be a function that accepts exactly two arguments"
            )
        if whereargs is None or len(whereargs) != 2:
            raise ValueError(
                "whereargs must be a 2-tuple with left and right arg arrays")
        if whereargs[0].size != left.size:
            raise ValueError(
                "Left whereargs must be same size as left join values")
        if whereargs[1].size != right.size:
            raise ValueError(
                "Right whereargs must be same size as right join values")
        # Dry run on a few elements so a broken wherefunc fails early.
        try:
            wherefunc(whereargs[0][:probe], whereargs[1][:probe])
        except Exception as err:
            raise ValueError("Error evaluating wherefunc") from err

    # Need dense 0-up right index, to filter out left not in right
    in_right, (dense_left, dense_right) = right_align(left, right)
    kept_left = ak.arange(in_right.size)[in_right]

    by_right = ak.GroupBy(dense_right)
    # Segment boundaries (starts, ends) of right for each left item
    right_bounds = ak.concatenate(
        (by_right.segments, ak.array([dense_right.size])))
    seg_starts = right_bounds[dense_left]
    seg_ends = right_bounds[dense_left + 1]
    # Total number of candidate pairs; currently not used further.
    _full_size = (seg_ends - seg_starts).sum()

    # gen_ranges for gather of right items
    all_segs, all_ranges = gen_ranges(seg_starts, seg_ends)

    if not filtered:
        final_ranges = all_ranges
        final_segs = all_segs
        final_left = kept_left
    else:
        # Gather right whereargs
        right_where = whereargs[1][by_right.permutation][all_ranges]
        # Expand left whereargs
        # NOTE(review): arguments are passed as (values, segments, size);
        # confirm this matches the signature of the `expand` in scope.
        left_where = expand(whereargs[0][kept_left], all_segs, all_ranges.size)
        # Evaluate wherefunc and filter ranges, recompute segments
        satisfied = wherefunc(left_where, right_where)
        final_ranges = all_ranges[satisfied]
        exclusive_scan = ak.cumsum(satisfied) - satisfied
        segs_with_empties = exclusive_scan[all_segs]
        seg_sizes = ak.concatenate(
            (segs_with_empties[1:] - segs_with_empties[:-1],
             ak.array([satisfied.sum() - segs_with_empties[-1]])))
        non_empty = (seg_sizes > 0)
        final_segs = segs_with_empties[non_empty]
        final_left = kept_left[non_empty]

    # Gather right inds and expand left inds
    rightInds = by_right.permutation[final_ranges]
    leftInds = expand(ak.arange(left.size)[final_left], final_segs,
                      final_ranges.size)
    return leftInds, rightInds
times=times, includeDelimiter=inc, keepPartial=part) triples = [s.rpartition(delim) for s in test_strings] for i in range(times - 1): triples = [rslide(t, delim) for t in triples] ltest, rtest = rmunge(triples, inc, part) assert ((ltest == ls.to_ndarray()).all() and (rtest == rs.to_ndarray()).all()) print("peel passed") # stick test_strings2 = np.random.choice(base_words, N, replace=True) strings2 = ak.array(test_strings2) stuck = strings.stick(strings2, delimiter=delim).to_ndarray() tstuck = np.array( [delim.join((a, b)) for a, b in zip(test_strings, test_strings2)]) assert ((stuck == tstuck).all()) assert ((strings + strings2) == strings.stick(strings2, delimiter="")).all() lstuck = strings.lstick(strings2, delimiter=delim).to_ndarray() tlstuck = np.array( [delim.join((b, a)) for a, b in zip(test_strings, test_strings2)]) assert ((lstuck == tlstuck).all()) assert ((strings2 + strings) == strings.lstick(strings2, delimiter="")).all() print("stick passed")