def check_correctness():
    """Assert that every flatten variant recovers the full integer sequence.

    Three string arrays holding the stringified values ``i, i+3, i+6, ...``
    (for ``i`` in 0..2) are stuck together element-wise with ``'_'`` as the
    delimiter.  Flattening the result on that delimiter -- as a plain
    literal, as a regex literal, and as the regex pattern ``'_+'`` -- is
    asserted to equal the stringified ``arange(3 * 10**4)``.
    """
    count = 10**4

    pieces = []
    for offset in range(3):
        pieces.append(ak.cast(ak.arange(offset, count*3, 3), 'str'))

    joined = pieces[0].stick(pieces[1], delimiter='_')
    joined = joined.stick(pieces[2], delimiter='_')

    expected = ak.cast(ak.arange(count*3), 'str')

    # (delimiter, extra keyword arguments) for each flatten flavour, checked
    # in the order: literal, regex literal, regex pattern.
    flavours = (
        ('_', {}),
        ('_', {'regex': True}),
        ('_+', {'regex': True}),
    )
    for delim, options in flavours:
        assert (joined.flatten(delim, **options) == expected).all()
def time_flatten(N, trials):
    """Time three flavours of ``flatten`` and print average time and rate.

    Builds a string array by sticking together three stringified strided
    ranges with ``'_'``, then for each of ``trials`` rounds times, in order:
    a non-regex flatten on ``'_'``, a regex flatten on ``'_'``, and a regex
    flatten on ``'_+'``.  The result of the last round of each flavour is
    asserted to equal the stringified ``arange(N * 3)``.

    Parameters
    ----------
    N : int
        Length of each of the three strided ranges.
    trials : int
        Number of timed rounds; the per-flavour averages divide by this
        value.
    """
    print(">>> arkouda flatten")
    config = ak.get_config()
    print("numLocales = {}, N = {:,}".format(config["numLocales"], N))

    pieces = [ak.cast(ak.arange(offset, N*3, 3), 'str') for offset in range(3)]
    joined = pieces[0].stick(pieces[1], delimiter='_')
    joined = joined.stick(pieces[2], delimiter='_')
    nbytes = joined.nbytes * joined.entry.itemsize

    # One entry per flavour: (delimiter, extra keyword arguments).
    calls = (
        ('_', {}),
        ('_', {'regex': True}),
        ('_+', {'regex': True}),
    )
    time_templates = (
        "non-regex flatten with literal delimiter Average time = {:.4f} sec",
        "regex flatten with literal delimiter Average time = {:.4f} sec",
        "regex flatten with pattern delimiter Average time = {:.4f} sec",
    )
    rate_templates = (
        "non-regex flatten with literal delimiter Average rate = {:.4f} GiB/sec",
        "regex flatten with literal delimiter Average rate = {:.4f} GiB/sec",
        "regex flatten with pattern delimiter Average rate = {:.4f} GiB/sec",
    )

    elapsed = [[] for _ in calls]
    results = [None for _ in calls]
    for _ in range(trials):
        for slot, (delim, options) in enumerate(calls):
            began = time.time()
            results[slot] = joined.flatten(delim, **options)
            finished = time.time()
            elapsed[slot].append(finished - began)

    averages = [sum(samples) / trials for samples in elapsed]

    # Validate the output of the final round of every flavour.
    expected = ak.cast(ak.arange(N*3), 'str')
    for flattened in results:
        assert (flattened == expected).all()

    for template, mean in zip(time_templates, averages):
        print(template.format(mean))
    for template, mean in zip(rate_templates, averages):
        print(template.format(nbytes/2**30/mean))
def power_law(N):
    """
    Power law distributed (alpha = 2.5) reals and integers in (1, 2**32)

    Generator yielding two ``(label, data)`` pairs: first the float data
    labelled ``'power-law float64'``, then the same data cast to int64 and
    labelled ``'power-law int64'``.  The cast is only performed once the
    second item is requested.
    """
    exponent = -2.5  # power law exponent, between -2 and -3
    upper = 2**32  # upper bound
    shifted = exponent + 1

    draws = ak.uniform(N)
    scale = upper**shifted - 1
    reals = (scale * draws + 1)**(1 / shifted)
    yield 'power-law float64', reals

    yield 'power-law int64', ak.cast(reals, ak.int64)
def check_correctness(dtype, seed):
    """Run ``coargsort`` on a random key paired with a constant key.

    A random array of 10**4 values of the requested ``dtype`` is co-sorted
    with an all-zero companion array, once with the random array first and
    once with it second.  For ``'int64'`` and ``'float64'`` the random array
    reordered by the returned permutation is asserted to be sorted; for
    ``'str'`` the sorts are executed but no ordering assertion is made.

    Parameters
    ----------
    dtype : str
        One of ``'int64'``, ``'float64'`` or ``'str'``.
    seed : int or None
        Seed forwarded to the random generators.
    """
    size = 10**4

    if dtype == 'int64':
        keys = ak.randint(0, 2**32, size, seed=seed)
        zeros = ak.zeros(size, dtype=dtype)
    elif dtype == 'float64':
        keys = ak.randint(0, 1, size, dtype=ak.float64, seed=seed)
        zeros = ak.zeros(size, dtype=dtype)
    elif dtype == 'str':
        keys = ak.random_strings_uniform(1, 16, size, seed=seed)
        zeros = ak.cast(ak.zeros(size), 'str')

    # Exercise both key orders: random key primary, then constant key primary.
    for ordering in ([keys, zeros], [zeros, keys]):
        perm = ak.coargsort(ordering)
        if dtype in ('int64', 'float64'):
            assert ak.is_sorted(keys[perm])
def time_ak_gather(isize, vsize, trials, dtype, random, seed):
    """Time ``values[indices]`` gathers and print the average time and rate.

    Parameters
    ----------
    isize : int
        Number of indices per locale; multiplied by ``numLocales``.
    vsize : int
        Number of values per locale; multiplied by ``numLocales``.
    trials : int
        Number of timed gathers; the average divides by this value.
    dtype : str
        ``'int64'``, ``'float64'``, ``'bool'`` or ``'str'``.
    random : bool
        When truthy, or whenever ``seed`` is given, the values are randomly
        generated; otherwise they are ones (or stringified ``arange`` for
        ``'str'``).
    seed : int or None
        Seed for the index vector; the values use ``seed + 1``.
    """
    print(">>> arkouda {} gather".format(dtype))
    config = ak.get_config()
    locales = config["numLocales"]
    num_indices = isize * locales
    num_values = vsize * locales
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(locales, num_indices, num_values))

    # Index vector is always random
    indices = ak.randint(0, num_values, num_indices, seed=seed)

    # The values get a different seed than the indices when one was supplied.
    value_seed = seed if seed is None else seed + 1

    if not random and value_seed is None:
        # Deterministic values
        if dtype == 'str':
            values = ak.cast(ak.arange(num_values), 'str')
        else:
            values = ak.ones(num_values, dtype=dtype)
    elif dtype == 'int64':
        values = ak.randint(0, 2**32, num_values, seed=value_seed)
    elif dtype == 'float64':
        values = ak.randint(0, 1, num_values, dtype=ak.float64, seed=value_seed)
    elif dtype == 'bool':
        values = ak.randint(0, 1, num_values, dtype=ak.bool, seed=value_seed)
    elif dtype == 'str':
        values = ak.random_strings_uniform(1, 16, num_values, seed=value_seed)

    durations = []
    for _ in range(trials):
        tic = time.time()
        gathered = values[indices]
        toc = time.time()
        durations.append(toc - tic)
    mean_time = sum(durations) / trials

    print("Average time = {:.4f} sec".format(mean_time))

    if dtype == 'str':
        # Offsets are counted four times in total (3x plus 1x) and the raw
        # string bytes twice, using the result of the last gather.
        offset_bytes = gathered.offsets.size * gathered.offsets.itemsize
        moved = 3 * offset_bytes + offset_bytes + 2 * gathered.bytes.size
        bytes_per_sec = moved / mean_time
    else:
        bytes_per_sec = (gathered.size * gathered.itemsize * 3) / mean_time
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30))