def test_plus_minus(self):
    """Exercise +/- between Datetime/Timedelta vectors and pandas scalars,
    checking both the result type and the result values for each case."""
    # Datetime + Datetime not supported
    with self.assertRaises(TypeError):
        self.dtvec1 + self.dtvec2
    # Datetime slice -> Datetime
    leading = self.dtvec1[1:]
    trailing = self.dtvec1[:-1]
    self.assertTrue(isinstance(leading, ak.Datetime) and isinstance(trailing, ak.Datetime))
    # Datetime - Datetime -> Timedelta
    diff = leading - trailing
    self.assertTrue(isinstance(diff, ak.Timedelta))
    self.assertTrue((diff == self.onesecond).all())
    # Datetime - DatetimeScalar -> Timedelta
    diff = self.dtvec1 - self.dtscalar
    trange = ak.timedelta_range(start=0, periods=100, freq='s')
    self.assertTrue(isinstance(diff, ak.Timedelta))
    self.assertTrue((diff == trange).all())
    # DatetimeScalar - Datetime -> Timedelta
    diff = self.dtscalar - self.dtvec1
    self.assertTrue(isinstance(diff, ak.Timedelta))
    self.assertTrue((diff == (-trange)).all())
    # Datetime + TimedeltaScalar -> Datetime
    t = (trailing + self.onesecond)
    self.assertTrue(isinstance(t, ak.Datetime))
    self.assertTrue((t == leading).all())
    # TimedeltaScalar + Datetime -> Datetime
    t = (self.onesecond + trailing)
    self.assertTrue(isinstance(t, ak.Datetime))
    self.assertTrue((t == leading).all())
    # Datetime - TimedeltaScalar -> Datetime
    t = leading - self.onesecond
    self.assertTrue(isinstance(t, ak.Datetime))
    self.assertTrue((t == trailing).all())
    # Datetime + Timedelta -> Datetime
    t = (trailing + self.tdvec1[1:])
    self.assertTrue(isinstance(t, ak.Datetime))
    self.assertTrue((t == leading).all())
    # Timedelta + Datetime -> Datetime
    t = (self.tdvec1[1:] + trailing)
    self.assertTrue(isinstance(t, ak.Datetime))
    self.assertTrue((t == leading).all())
    # Datetime - Timedelta -> Datetime
    t = (leading - self.tdvec1[1:])
    self.assertTrue(isinstance(t, ak.Datetime))
    # BUG FIX: this case previously checked only the type; every sibling
    # case also verifies the values, so assert the expected equality too.
    self.assertTrue((t == trailing).all())
    # Timedelta + Timedelta -> Timedelta
    t = self.tdvec1 + self.tdvec1
    self.assertTrue(isinstance(t, ak.Timedelta))
    self.assertTrue((t == ak.Timedelta(2*ak.ones(100, dtype=ak.int64), unit='s')).all())
    # Timedelta + TimedeltaScalar -> Timedelta
    t = self.tdvec1 + self.onesecond
    self.assertTrue(isinstance(t, ak.Timedelta))
    self.assertTrue((t == ak.Timedelta(2*ak.ones(100, dtype=ak.int64), unit='s')).all())
    # Timedelta - Timedelta -> Timedelta
    t = self.tdvec1 - self.tdvec1
    self.assertTrue(isinstance(t, ak.Timedelta))
    self.assertTrue((t == ak.Timedelta(ak.zeros(100, dtype=ak.int64), unit='s')).all())
    # Timedelta - TimedeltaScalar -> Timedelta
    t = self.tdvec1 - self.onesecond
    self.assertTrue(isinstance(t, ak.Timedelta))
    self.assertTrue((t == ak.Timedelta(ak.zeros(100, dtype=ak.int64), unit='s')).all())
def rmat(size):
    ''' RMAT-generated edges (coargsort of two vertex arrays) '''
    # Each edge uses two elements, so the edge count is half the size.
    num_edges = size // 2
    avg_degree = 10
    # log2 of the vertex count implied by the target average degree
    lg_nv = int(np.log2(num_edges / avg_degree))
    # RMAT quadrant probabilities: one small corner, the rest split evenly
    a = 0.01
    b = c = d = (1.0 - a) / 3.0
    # quantities reused every iteration of the edge-generation loop
    ab = a + b
    c_norm = c / (c + d)
    a_norm = a / (a + b)
    # start all edges at vertex 1, then refine one bit per level
    src = ak.ones(num_edges, dtype=ak.int64)
    dst = ak.ones(num_edges, dtype=ak.int64)
    for bit in range(1, lg_nv):
        src_bit = (ak.uniform(num_edges) > ab)
        dst_bit = (ak.uniform(num_edges) > (c_norm * src_bit + a_norm * (~src_bit)))
        src = src + ((2**(bit - 1)) * src_bit)
        dst = dst + ((2**(bit - 1)) * dst_bit)
    yield 'RMAT int64', (src, dst)
def get_ngrams(self, n, return_origins=True):
    """
    Return all n-grams from all sub-arrays.

    Parameters
    ----------
    n : int
        Length of n-gram
    return_origins : bool
        If True, return an int64 array indicating which sub-array
        each returned n-gram came from.

    Returns
    -------
    ngrams : list of pdarray
        An n-long list of pdarrays, essentially a table where each row is an n-gram.
    origin_indices : pdarray, int
        The index of the sub-array from which the corresponding n-gram originated
    """
    ngrams = []
    # Mask that is False exactly at each sub-array's first position, so we
    # can reject n-grams that would straddle a sub-array boundary.
    notsegstart = ak.ones(self.valsize, dtype=ak.bool)
    notsegstart[self.segments] = False
    # One candidate n-gram starts at every position with n-1 successors.
    valid = ak.ones(self.valsize - n + 1, dtype=ak.bool)
    for i in range(n):
        # Column i of the n-gram table: values shifted right by i.
        end = self.valsize - n + i + 1
        ngrams.append(self.values[i:end])
        if i > 0:
            # Positions 1..n-1 of a valid n-gram must not be segment
            # starts, or the n-gram would cross into the next sub-array.
            valid &= notsegstart[i:end]
    # Drop the rows of every column that correspond to invalid n-grams.
    ngrams = [char[valid] for char in ngrams]
    if return_origins:
        # Broadcast each sub-array's index over its value positions, then
        # pick the entry at each valid n-gram's starting position.
        origin_indices = self.grouping.broadcast(ak.arange(self.size), permute=True)[:valid.size][valid]
        return ngrams, origin_indices
    else:
        return ngrams
def time_ak_stream(N_per_locale, trials, alpha, dtype, random):
    """Benchmark the STREAM triad (c = a + b * alpha) on the arkouda server."""
    print(">>> arkouda stream")
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    # Build operand vectors: constant ones, or random per dtype.
    if not random:
        a = ak.ones(N, dtype=dtype)
        b = ak.ones(N, dtype=dtype)
    elif dtype == 'int64':
        a = ak.randint(0, 2**32, N)
        b = ak.randint(0, 2**32, N)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64)
        b = ak.randint(0, 1, N, dtype=ak.float64)
    timings = []
    for _ in range(trials):
        start = time.time()
        c = a + b * alpha
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials
    print("Average time = {:.4f} sec".format(tavg))
    # Triad touches three vectors of c.size elements each.
    bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
def check_equal(pdunit, akunit):
    """Compare a 1-unit pandas Timestamp and Timedelta with the arkouda
    equivalents built from the given unit strings, logging any mismatch
    instead of failing outright."""
    log = logging.getLogger()
    msg = "pandas {} ({}) != arkouda {} ({})"
    # Timestamp comparison
    pdval = pd.Timestamp(1, unit=pdunit)
    akval = ak.Datetime(ak.ones(10, dtype=ak.int64), unit=akunit)[0]
    try:
        self.assertEqual(pdval, akval)
    except AssertionError:
        log.error(msg.format(pdunit, pdval, akunit, akval))
    # Timedelta comparison
    pdval = pd.Timedelta(1, unit=pdunit)
    akval = ak.Timedelta(ak.ones(10, dtype=ak.int64), unit=akunit)[0]
    try:
        self.assertEqual(pdval, akval)
    except AssertionError:
        log.error(msg.format(pdunit, pdval, akunit, akval))
def time_ak_scatter(isize, vsize, trials, dtype, random):
    """Benchmark scatter assignment (c[i] = v) on the arkouda server."""
    print(">>> arkouda scatter")
    cfg = ak.get_config()
    Ni = isize * cfg["numLocales"]
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(
        cfg["numLocales"], Ni, Nv))
    # Index vector is always random
    i = ak.randint(0, Nv, Ni)
    c = ak.zeros(Nv, dtype=dtype)
    # Values: constant ones, or random per dtype.
    if not random:
        v = ak.ones(Ni, dtype=dtype)
    elif dtype == 'int64':
        v = ak.randint(0, 2**32, Ni)
    elif dtype == 'float64':
        v = ak.randint(0, 1, Ni, dtype=ak.float64)
    timings = []
    for _ in range(trials):
        start = time.time()
        c[i] = v
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials
    print("Average time = {:.4f} sec".format(tavg))
    # index read + value read + destination write per scattered element
    bytes_per_sec = (i.size * i.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
def __eq__(self, v):
    """
    Compare this MultiIndex elementwise against a list/tuple of values,
    one entry per index level.

    Parameters
    ----------
    v : list or tuple
        One value (scalar or pdarray) per level of the index.

    Returns
    -------
    pdarray, bool
        True at positions where every level equals the corresponding
        entry of v.

    Raises
    ------
    TypeError
        If v is not a list or tuple.
    """
    # isinstance instead of exact type comparison, so list/tuple
    # subclasses are accepted as well.
    if not isinstance(v, (list, tuple)):
        raise TypeError("Cannot compare MultiIndex to a scalar")
    retval = ak.ones(len(self), dtype=ak.bool)
    # NOTE(review): zip truncates silently if len(v) differs from the
    # number of levels — presumably callers pass one value per level;
    # confirm before tightening.
    for a, b in zip(self.index, v):
        retval &= (a == b)
    return retval
def gen_ranges(starts, ends, stride=1):
    """
    Generate a segmented array of variable-length, contiguous
    ranges between pairs of start- and end-points.

    Parameters
    ----------
    starts : pdarray, int64
        The start value of each range
    ends : pdarray, int64
        The end value (exclusive) of each range
    stride: int
        Difference between successive elements of each range

    Returns
    -------
    segments : pdarray, int64
        The starting index of each range in the resulting array
    ranges : pdarray, int64
        The actual ranges, flattened into a single array
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same length")
    if starts.size == 0:
        return ak.zeros(0, dtype=ak.int64), ak.zeros(0, dtype=ak.int64)
    # Number of elements contributed by each range
    lengths = (ends - starts) // stride
    # Offset of each range within the flattened output
    segs = ak.cumsum(lengths) - lengths
    totlen = lengths.sum()
    # BUG FIX: the step array was filled with 1s regardless of stride, so
    # cumsum produced unit-step ranges for any stride != 1, even though
    # the boundary jumps below already account for the stride. Fill with
    # the stride instead (identical behavior for the default stride=1).
    slices = ak.ones(totlen, dtype=ak.int64) * stride
    # At each range boundary, overwrite the step with the jump from the
    # previous range's last element to the next range's start.
    diffs = ak.concatenate(
        (ak.array([starts[0]]), starts[1:] - starts[:-1] - (lengths[:-1] - 1) * stride))
    slices[segs] = diffs
    return segs, ak.cumsum(slices)
def gen_ranges(starts, ends):
    """
    Generate a segmented array of variable-length, contiguous ranges
    between pairs of start- and end-points.

    Parameters
    ----------
    starts : pdarray, int64
        The start value of each range
    ends : pdarray, int64
        The end value (exclusive) of each range

    Returns
    -------
    segments : pdarray, int64
        The starting index of each range in the resulting array
    ranges : pdarray, int64
        The actual ranges, flattened into a single array
    """
    if starts.size != ends.size:
        raise ValueError("starts and ends must be same size")
    if not ((ends - starts) > 0).all():
        raise ValueError("all ends must be greater than starts")
    lengths = ends - starts
    # Offset of each range within the flattened output
    segments = ak.cumsum(lengths) - lengths
    total = lengths.sum()
    # Unit steps everywhere; then at each range boundary overwrite the
    # step with the jump from the previous range's last value to the next
    # range's start, so a single cumsum reconstructs all ranges at once.
    steps = ak.ones(total, dtype=ak.int64)
    boundary_jumps = ak.concatenate(
        (ak.array([starts[0]]), starts[1:] - starts[:-1] - lengths[:-1] + 1))
    steps[segments] = boundary_jumps
    return segments, ak.cumsum(steps)
def setUp(self):
    # Shared fixtures for the Datetime/Timedelta arithmetic tests.
    ArkoudaTest.setUp(self)
    # 100 consecutive seconds starting at noon, built two equivalent ways
    # (arkouda-native range vs. wrapping a pandas DatetimeIndex).
    self.dtvec1 = ak.date_range(start='2021-01-01 12:00:00', periods=100, freq='s')
    self.dtvec2 = ak.Datetime(pd.date_range('2021-01-01 12:00:00', periods=100, freq='s'))
    # Scalar matching the first element of dtvec1
    self.dtscalar = pd.Timestamp('2021-01-01 12:00:00')
    # 100 one-second deltas, built two equivalent ways
    self.tdvec1 = ak.timedelta_range(start='1 second', end='1 second', periods=100)
    self.tdvec2 = ak.Timedelta(ak.ones(100, dtype=ak.int64), unit='s')
    # Scalar one-second delta for vector-scalar arithmetic
    self.onesecond = pd.Timedelta(1, unit='s')
def create_ak_array(N, op, dtype, seed):
    """
    Create an arkouda array of length N using the named constructor.

    Parameters
    ----------
    N : int
        Number of elements
    op : str
        One of 'zeros', 'ones', or 'randint'
    dtype
        Element dtype passed to the constructor
    seed : int
        Seed for 'randint' (ignored by the other ops)

    Returns
    -------
    pdarray

    Raises
    ------
    ValueError
        If op is not a recognized constructor name.
    """
    if op == 'zeros':
        return ak.zeros(N, dtype=dtype)
    elif op == 'ones':
        return ak.ones(N, dtype=dtype)
    elif op == 'randint':
        return ak.randint(0, 2**32, N, dtype=dtype, seed=seed)
    # BUG FIX: an unrecognized op previously fell through to `return a`
    # with `a` unbound, raising UnboundLocalError instead of a clear error.
    raise ValueError("unknown array creation op: {}".format(op))
def is_cosorted(data):
    """Return True if the arrays in `data`, compared column-by-column from
    left to right, are in lexicographic (co-sorted) order."""
    # Fold right-to-left: adjacent rows are in order iff the current
    # column strictly increases, or ties and the columns to its right
    # (already folded into `in_order`) are in order:
    # (x[1:] > x[:-1]) | ((x[1:] == x[:-1]) & in_order)
    in_order = ak.ones(data[0].size - 1, dtype=ak.bool)
    for column in reversed(data):
        increasing = column[1:] > column[:-1]
        tied = column[1:] == column[:-1]
        in_order = increasing | (tied & in_order)
    return in_order.all()
def testPdArraySubtractNumpyInt():
    """Subtracting an np.int64 from a pdarray (and the reflected form)
    yields a pdarray with the expected values."""
    ones = ak.ones(100)
    diff = ones - np.int64(2)
    assert isinstance(diff, ak.pdarray)
    assert np.float64(-1) == diff[0]
    diff = np.int64(2) - ones
    assert isinstance(diff, ak.pdarray)
    assert np.float64(1) == diff[0]
def check_ones(N):
    """Verify ak.ones matches an np.ones array transferred to the server."""
    expected = ak.array(np.ones(N))
    actual = ak.ones(N)
    return pass_fail((expected == actual).all())
def testPdArraySubtractInt():
    """Subtracting a Python int from a pdarray (and the reflected form)
    yields a pdarray with the expected values."""
    ones = ak.ones(100)
    diff = ones - 2
    assert isinstance(diff, ak.pdarray)
    assert np.float64(-1) == diff[0]
    diff = 2 - ones
    assert isinstance(diff, ak.pdarray)
    assert np.float64(1) == diff[0]
def testPdArrayMultInt():
    """Multiplying a pdarray by a Python int (and the reflected form)
    yields a pdarray with the expected values."""
    ones = ak.ones(100)
    prod = ones * 5
    assert isinstance(prod, ak.pdarray)
    assert np.float64(5) == prod[0]
    prod = 5 * ones
    assert isinstance(prod, ak.pdarray)
    assert np.float64(5) == prod[0]
def testPdArrayMultNumpyInt():
    """Multiplying a pdarray by an np.int64 (and the reflected form)
    yields a pdarray with the expected values."""
    ones = ak.ones(100)
    prod = ones * np.int64(5)
    assert isinstance(prod, ak.pdarray)
    assert np.float64(5) == prod[0]
    prod = np.int64(5) * ones
    assert isinstance(prod, ak.pdarray)
    assert np.float64(5) == prod[0]
def testPdArrayDivideInt():
    """Multiply-then-divide a pdarray by Python ints in both operand
    orders; result is a pdarray with the expected values."""
    ones = ak.ones(100)
    quot = ones * 15 / 3
    assert isinstance(quot, ak.pdarray)
    assert np.float64(5) == quot[0]
    quot = 15 * ones / 3
    assert isinstance(quot, ak.pdarray)
    assert np.float64(5) == quot[0]
def testPdArrayDivideNumpyInt():
    """Multiply-then-divide a pdarray using an np.int64 factor in both
    operand orders; result is a pdarray with the expected values."""
    ones = ak.ones(100)
    quot = ones * np.int64(15) / 3
    assert isinstance(quot, ak.pdarray)
    assert np.float64(5) == quot[0]
    quot = np.int64(15) * ones / 3
    assert isinstance(quot, ak.pdarray)
    assert np.float64(5) == quot[0]
def check_set_slice(N):
    """Negate every other element via strided slice assignment in both
    numpy and arkouda, then compare the results client-side."""
    expected = np.ones(N)
    expected[::2] = expected[::2] * -1
    actual = ak.ones(N)
    actual[::2] = actual[::2] * -1
    return pass_fail((expected == actual.to_ndarray()).all())
def testPdArrayAddInt():
    """Adding a Python int to a pdarray (and the reflected form)
    yields a pdarray with the expected values."""
    ones = ak.ones(100)
    total = ones + 1
    assert isinstance(total, ak.pdarray)
    assert np.float64(2) == total[0]
    total = 1 + ones
    assert isinstance(total, ak.pdarray)
    assert np.float64(2) == total[0]
def testPdArrayAddNumpyInt():
    """Adding an np.int64 to a pdarray (and the reflected form)
    yields a pdarray with the expected values."""
    ones = ak.ones(100)
    total = ones + np.int64(1)
    assert isinstance(total, ak.pdarray)
    assert np.float64(2) == total[0]
    total = np.int64(1) + ones
    assert isinstance(total, ak.pdarray)
    assert np.float64(2) == total[0]
def gen_rmat_edges(lgNv, Ne_per_v, p, perm=False):
    """
    Generate a co-sorted RMAT edge list.

    Parameters
    ----------
    lgNv : int
        log2 of the number of vertices
    Ne_per_v : int
        Number of edges per vertex
    p : float
        Probability of the 'a' RMAT quadrant; the remainder is split
        evenly among the other three quadrants
    perm : bool
        If True, randomly renumber (rename) the vertices afterwards

    Returns
    -------
    (ii, jj) : pair of pdarray, int64
        Edge endpoint arrays, sorted by (ii, jj)
    """
    # number of vertices
    Nv = 2**lgNv
    # number of edges
    Ne = Ne_per_v * Nv
    # probabilities
    a = p
    b = (1.0 - a) / 3.0
    c = b
    d = b
    # init edge arrays: every edge starts at vertex 1
    ii = ak.ones(Ne, dtype=ak.int64)
    jj = ak.ones(Ne, dtype=ak.int64)
    # quantites to use in edge generation loop
    ab = a + b
    c_norm = c / (c + d)
    a_norm = a / (a + b)
    # generate edges: pick one vertex bit per level of the RMAT recursion
    for ib in range(1, lgNv):
        ii_bit = (ak.randint(0, 1, Ne, dtype=ak.float64) > ab)
        jj_bit = (ak.randint(0, 1, Ne, dtype=ak.float64) > (c_norm * ii_bit + a_norm * (~ii_bit)))
        ii = ii + ((2**(ib - 1)) * ii_bit)
        jj = jj + ((2**(ib - 1)) * jj_bit)
    # sort all based on ii and jj using coargsort
    # all edges should be sorted based on both vertices of the edge
    iv = ak.coargsort((ii, jj))
    # permute into sorted order
    ii = ii[iv]  # permute first vertex into sorted order
    jj = jj[iv]  # permute second vertex into sorted order
    # to permute/rename vertices
    if perm:
        # generate permutation for new vertex numbers(names)
        ir = ak.argsort(ak.randint(0, 1, Nv, dtype=ak.float64))
        # renumber(rename) vertices
        ii = ir[ii]  # rename first vertex
        jj = ir[jj]  # rename second vertex
    #
    # maybe: remove edges which are self-loops???
    #
    # return pair of pdarrays
    return (ii, jj)
def check_bool(N):
    """
    Verify pdarray truth-value semantics: truth-testing a multi-element
    pdarray must raise ValueError, while a single-element pdarray is
    usable in boolean context.

    Returns the result of pass_fail on the combined outcome.
    """
    a = ak.arange(N)
    b = ak.ones(N)
    # BUG FIX: `correct` was only assigned inside the except clauses, so
    # if `a and b` ever succeeded (no exception) the name was unbound at
    # the `correct = correct and ...` line below. Initialize it to False,
    # which is also the right verdict for that no-exception case.
    correct = False
    try:
        c = a and b
    except ValueError:
        # Expected: ambiguous truth value of a multi-element pdarray
        correct = True
    except Exception:
        # Wrong exception type
        correct = False
    # A one-element pdarray must be truthy
    d = ak.array([1])
    correct = correct and (d and 5)
    return pass_fail(correct)
def check_set_slice(N):
    """Negate every other element via strided slice assignment in numpy
    and arkouda, then compare the results server-side."""
    reference = np.ones(N)
    reference[::2] = reference[::2] * -1
    reference = ak.array(reference)
    candidate = ak.ones(N)
    candidate[::2] = candidate[::2] * -1
    return pass_fail((reference == candidate).all())
def time_ak_gather(isize, vsize, trials, dtype, random):
    """
    Benchmark gather (c = v[i]) on the arkouda server.

    Parameters
    ----------
    isize, vsize : int
        Per-locale sizes of the index and value arrays
    trials : int
        Number of timed repetitions
    dtype : str
        Element type of the value array (e.g. 'int64', 'float64', 'str')
    random : bool
        If True, fill the value array with random data
    """
    print(">>> arkouda gather")
    cfg = ak.get_config()
    Ni = isize * cfg["numLocales"]
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(cfg["numLocales"], Ni, Nv))
    # Index vector is always random
    i = ak.randint(0, Nv, Ni)
    if random:
        if dtype == 'int64':
            v = ak.randint(0, 2**32, Nv)
        elif dtype == 'float64':
            v = ak.randint(0, 1, Nv, dtype=ak.float64)
        elif dtype == 'str':
            v = ak.random_strings_uniform(1, 16, Nv)
    else:
        if dtype == 'str':
            v = ak.random_strings_uniform(1, 16, Nv)
        else:
            v = ak.ones(Nv, dtype=dtype)
    # BUG FIX: removed leftover debug prints. They accessed Strings-only
    # attributes (v.offsets, v.bytes), which raise AttributeError for
    # non-str dtypes, and the per-trial prints (including an extra
    # untimed v[i] gather) cluttered the benchmark loop.
    timings = []
    for _ in range(trials):
        start = time.time()
        c = v[i]
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials
    print("Average time = {:.4f} sec".format(tavg))
    if dtype == 'str':
        # Strings transfer an offsets array plus the byte payload.
        offsets_transferred = 3 * c.offsets.size * c.offsets.itemsize
        bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size)
        bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg
    else:
        bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30))
def select_clusters(self): print("Computing Selection and Stability.") # Perhaps keep track of a "final clusters" array, that we update as we # work through this function. self.selection_data['selected'] = ak.ones(self.selection_data.size, dtype=ak.bool) byparent = ak.GroupBy(self.selection_data['parent']) uk = byparent.unique_keys for p in tqdm(uk[1:]): children = self.selection_data['index'][self.selection_data['parent'] == p] c_stab = (self.selection_data['stability'][children]).sum() p_stab = self.selection_data['stability'][p] if c_stab >= p_stab: self.selection_data['stability'][p] = c_stab self.selection_data['selected'][p] = False else: self.deselect_children(node=p) print("Selection and Stability computation is complete!")
def time_ak_gather(isize, vsize, trials, dtype, random, seed):
    """Benchmark gather (c = v[i]) on the arkouda server for one dtype."""
    print(">>> arkouda {} gather".format(dtype))
    cfg = ak.get_config()
    Ni = isize * cfg["numLocales"]
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(cfg["numLocales"], Ni, Nv))
    # Index vector is always random
    i = ak.randint(0, Nv, Ni, seed=seed)
    # Advance the seed so the value array differs from the index array.
    if seed is not None:
        seed += 1
    # Build the value array: randomized when requested or when seeded,
    # otherwise deterministic per dtype.
    if random or seed is not None:
        if dtype == 'int64':
            v = ak.randint(0, 2**32, Nv, seed=seed)
        elif dtype == 'float64':
            v = ak.randint(0, 1, Nv, dtype=ak.float64, seed=seed)
        elif dtype == 'bool':
            v = ak.randint(0, 1, Nv, dtype=ak.bool, seed=seed)
        elif dtype == 'str':
            v = ak.random_strings_uniform(1, 16, Nv, seed=seed)
    elif dtype == 'str':
        v = ak.cast(ak.arange(Nv), 'str')
    else:
        v = ak.ones(Nv, dtype=dtype)
    times = []
    for _ in range(trials):
        t0 = time.time()
        c = v[i]
        times.append(time.time() - t0)
    tavg = sum(times) / trials
    print("Average time = {:.4f} sec".format(tavg))
    if dtype == 'str':
        # Strings transfer an offsets array plus the byte payload.
        offsets_transferred = 3 * c.offsets.size * c.offsets.itemsize
        bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size)
        bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg
    else:
        bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30))
#!/usr/bin/env python3 import importlib import numpy as np import math import gc import sys import arkouda as ak ak.verbose = False if len(sys.argv) > 1: ak.connect(server=sys.argv[1], port=sys.argv[2]) else: ak.connect() N = 10**9 a = ak.ones(N, dtype='int64') b = ak.ones(N, dtype='int64') print(a, b) c = a + b d = a - b print(c) print(d) ak.shutdown()
import gc
import sys
import arkouda as ak

# Sanity-check script for join_on_eq_with_dt against a running server.
print(">>> Sanity checks on the arkouda_server")

ak.verbose = False
# Optional server location: script.py <server> <port>
if len(sys.argv) > 1:
    ak.connect(server=sys.argv[1], port=sys.argv[2])
else:
    ak.connect()

N = 1000
# BUG FIX: this script never imports numpy, so `dtype=np.int64` raised a
# NameError; use arkouda's own int64 dtype instead.
a1 = ak.ones(N, dtype=ak.int64)
a2 = ak.arange(0, N, 1)
t1 = a1
t2 = a1 * 10
dt = 10

# should get N*N answers
I, J = ak.join_on_eq_with_dt(a1, a1, a1, a1, dt, "true_dt", result_limit=N*N)
print(I, J)
if (I.size == N*N) and (J.size == N*N):
    print("passed!")
else:
    print("failed!")

# should get N answers
I, J = ak.join_on_eq_with_dt(a2, a1, t1, t2, dt, "true_dt")