def check_correctness(seed):
    """Check that substring search finds a known delimiter in every string.

    Builds 10**4 strings by joining two random string arrays with the
    literal ``'1 string 1'`` and asserts that a plain search, a regex search
    for the same literal, and a regex digit pattern each match every element.

    Parameters
    ----------
    seed : int or None
        Seed forwarded to both random string generators.
    """
    num_strings = 10**4
    prefix = ak.random_strings_uniform(minlen=1, maxlen=8, size=num_strings, seed=seed)
    suffix = ak.random_strings_uniform(minlen=1, maxlen=8, size=num_strings, seed=seed)
    # Every element looks like "<random>1 string 1<random>".
    test_substring = prefix.stick(suffix, delimiter='1 string 1')

    searches = (
        ('1 string 1', {}),
        ('1 string 1', {'regex': True}),
        ('\\d string \\d', {'regex': True}),
    )
    for pattern, options in searches:
        assert test_substring.contains(pattern, **options).all()
def time_substring_search(N, trials, seed):
    """Benchmark ``Strings.contains`` for literal and regex searches.

    Builds ``N`` strings that all contain ``'1 string 1'``, then times three
    variants per trial (plain literal, literal as a regex, digit-pattern
    regex) and prints the average time and throughput of each.

    Parameters
    ----------
    N : int
        Number of strings to search.
    trials : int
        Number of timed repetitions of each search variant.
    seed : int or None
        Seed forwarded to both random string generators.
    """
    print(">>> arkouda substring search")
    config = ak.get_config()
    print("numLocales = {}, N = {:,}".format(config["numLocales"], N))

    prefix = ak.random_strings_uniform(minlen=1, maxlen=8, size=N, seed=seed)
    suffix = ak.random_strings_uniform(minlen=1, maxlen=8, size=N, seed=seed)
    # Every element looks like "<random>1 string 1<random>".
    test_substring = prefix.stick(suffix, delimiter='1 string 1')
    nbytes = test_substring.nbytes * test_substring.entry.itemsize

    def timed_contains(*args, **kwargs):
        # Run one search and return (result, wall-clock seconds).
        began = time.time()
        hits = test_substring.contains(*args, **kwargs)
        finished = time.time()
        return hits, finished - began

    non_regex_times, regex_literal_times, regex_pattern_times = [], [], []
    for _ in range(trials):
        non_regex, elapsed = timed_contains('1 string 1')
        non_regex_times.append(elapsed)

        regex_literal, elapsed = timed_contains('1 string 1', regex=True)
        regex_literal_times.append(elapsed)

        regex_pattern, elapsed = timed_contains('\\d string \\d', regex=True)
        regex_pattern_times.append(elapsed)

    avg_non_regex = sum(non_regex_times) / trials
    avg_regex_literal = sum(regex_literal_times) / trials
    avg_regex_pattern = sum(regex_pattern_times) / trials

    # Results of the final trial must match every string.
    assert non_regex.all()
    assert regex_literal.all()
    assert regex_pattern.all()

    gib = nbytes / 2**30
    print("non-regex with literal substring Average time = {:.4f} sec".format(avg_non_regex))
    print("regex with literal substring Average time = {:.4f} sec".format(avg_regex_literal))
    print("regex with pattern Average time = {:.4f} sec".format(avg_regex_pattern))
    print("non-regex with literal substring Average rate = {:.4f} GiB/sec".format(gib / avg_non_regex))
    print("regex with literal substring Average rate = {:.4f} GiB/sec".format(gib / avg_regex_literal))
    print("regex with pattern Average rate = {:.4f} GiB/sec".format(gib / avg_regex_pattern))
def time_ak_gather(isize, vsize, trials, dtype, random):
    """Benchmark the gather ``v[i]`` while printing verbose diagnostics.

    Parameters
    ----------
    isize : int
        Number of indices per locale.
    vsize : int
        Number of values per locale.
    trials : int
        Number of timed gathers.
    dtype : str
        ``'int64'``, ``'float64'`` or ``'str'`` when ``random`` is true;
        otherwise ``'str'`` or any dtype accepted by ``ak.ones``.
    random : bool
        Gather from random values rather than constant ones (string values
        are random either way).

    Raises
    ------
    ValueError
        If ``random`` is true and ``dtype`` is not one of the supported names.
    """
    print(">>> arkouda gather")
    cfg = ak.get_config()
    Ni = isize * cfg["numLocales"]
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(cfg["numLocales"], Ni, Nv))
    is_str = dtype == 'str'

    # Index vector is always random
    i = ak.randint(0, Nv, Ni)
    if random:
        if dtype == 'int64':
            v = ak.randint(0, 2**32, Nv)
        elif dtype == 'float64':
            v = ak.randint(0, 1, Nv, dtype=ak.float64)
        elif is_str:
            v = ak.random_strings_uniform(1, 16, Nv)
        else:
            # Previously fell through and failed later on an unbound ``v``.
            raise ValueError("unsupported dtype for random gather: {}".format(dtype))
    elif is_str:
        v = ak.random_strings_uniform(1, 16, Nv)
    else:
        v = ak.ones(Nv, dtype=dtype)

    # Diagnostics. ``offsets`` and ``bytes`` exist only on string arrays, so
    # those lines are skipped for numeric dtypes instead of raising
    # AttributeError before the benchmark even starts.
    print("v={}".format(v))
    if is_str:
        print("v.offsets={}".format(v.offsets))
    print("v.nbytes={}".format(v.nbytes))
    print("v[1]={}".format(v[1]))
    print("In Gather size={}".format(v.size))
    print("In Gather nbytes={}".format(v.nbytes))
    print("In Gather ndim={}".format(v.ndim))
    print("In Gather shape={}".format(v.shape))
    if is_str:
        print("In Gather offsets name ={}".format(v.offsets.name))
        print("In Gather offsets size={}".format(v.offsets.size))
        print("In Gather bytes name ={}".format(v.bytes.name))
        print("In Gather bytes size={}".format(v.bytes.size))

    timings = []
    for _ in range(trials):
        print("In Gather loop i={}".format(i))
        # NOTE: this diagnostic performs an extra, untimed gather.
        print("In Gather v[i]={}".format(v[i]))
        start = time.time()
        c = v[i]
        end = time.time()
        print("In Gather loop c={}".format(c))
        timings.append(end - start)
    tavg = sum(timings) / trials

    print("Average time = {:.4f} sec".format(tavg))
    if is_str:
        offsets_transferred = 3 * c.offsets.size * c.offsets.itemsize
        bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size)
        bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg
    else:
        bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30))
def time_ak_sa(vsize, trials, dtype):
    """Benchmark ``ak.suffix_array`` on random strings and report the rate.

    Parameters
    ----------
    vsize : int
        Number of strings per locale; also the number of leading
        string/suffix-array pairs echoed to stdout.
    trials : int
        Number of timed suffix-array constructions.
    dtype : str
        Selects the throughput formula: ``'str'`` uses the offsets/bytes
        components of the result, anything else uses ``size * itemsize``.
    """
    print(">>> arkouda suffix array")
    cfg = ak.get_config()
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {}, num of strings = {:,}".format(
        cfg["numLocales"], Nv))

    v = ak.random_strings_uniform(1, 16, Nv)
    c = ak.suffix_array(v)
    print("size of suffix array={}".format(c.bytes.size))

    # Echo the first ``vsize`` inputs with their suffix arrays for inspection.
    for k in range(vsize):
        print("the {} th random tring ={}".format(k, v[k]))
        print("the {} th suffix array ={}".format(k, c[k]))
        print("")

    timings = []
    for _ in range(trials):
        start = time.time()
        ak.suffix_array(v)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    print("Average time = {:.4f} sec".format(tavg))
    if dtype == 'str':
        offsets_transferred = 3 * c.offsets.size * c.offsets.itemsize
        bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size)
        bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg
    else:
        bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    # The rate was computed but never reported; print it like the other
    # benchmarks in this file do.
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
def time_ak_argsort(N_per_locale, trials, dtype, seed):
    """Benchmark ``ak.argsort`` on a random array of the requested dtype.

    Parameters
    ----------
    N_per_locale : int
        Array length per locale; the total is scaled by the locale count.
    trials : int
        Number of timed sorts.
    dtype : str
        ``'int64'``, ``'float64'`` or ``'str'``.
    seed : int or None
        Seed forwarded to the random generator.
    """
    print(">>> arkouda {} argsort".format(dtype))
    config = ak.get_config()
    N = N_per_locale * config["numLocales"]
    print("numLocales = {}, N = {:,}".format(config["numLocales"], N))

    numeric = dtype in ('int64', 'float64')
    if numeric:
        if dtype == 'int64':
            a = ak.randint(0, 2**32, N, seed=seed)
        else:
            a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        nbytes = a.size * a.itemsize
    elif dtype == 'str':
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
        nbytes = a.nbytes * a.entry.itemsize

    elapsed = []
    for _ in range(trials):
        began = time.time()
        perm = ak.argsort(a)
        finished = time.time()
        elapsed.append(finished - began)
    tavg = sum(elapsed) / trials

    # Sortedness is only verified for numeric inputs.
    if numeric:
        assert ak.is_sorted(a[perm])
    print("Average time = {:.4f} sec".format(tavg))
    rate = nbytes / tavg
    print("Average rate = {:.4f} GiB/sec".format(rate / 2**30))
def time_ak_coargsort(N_per_locale, trials, dtype, seed):
    """Benchmark ``ak.coargsort`` over 1, 2, 8 and 16 key arrays.

    The total element count is held constant: each of the ``numArrays`` keys
    has ``N // numArrays`` elements.

    Parameters
    ----------
    N_per_locale : int
        Total element count per locale, split across the key arrays.
    trials : int
        Number of timed sorts per array count.
    dtype : str
        ``'int64'``, ``'float64'`` or ``'str'``.
    seed : int or None
        Base seed; key ``k`` uses ``seed + k``. ``None`` leaves every key
        unseeded.
    """
    print(">>> arkouda {} coargsort".format(dtype))
    config = ak.get_config()
    N = N_per_locale * config["numLocales"]
    print("numLocales = {}, N = {:,}".format(config["numLocales"], N))

    numeric = dtype in ('int64', 'float64')
    for numArrays in (1, 2, 8, 16):
        if seed is None:
            seeds = [None] * numArrays
        else:
            seeds = [seed + offset for offset in range(numArrays)]

        if dtype == 'int64':
            arrs = [ak.randint(0, 2**32, N//numArrays, seed=s) for s in seeds]
            nbytes = sum(arr.size * arr.itemsize for arr in arrs)
        elif dtype == 'float64':
            arrs = [ak.randint(0, 1, N//numArrays, dtype=ak.float64, seed=s) for s in seeds]
            nbytes = sum(arr.size * arr.itemsize for arr in arrs)
        elif dtype == 'str':
            arrs = [ak.random_strings_uniform(1, 8, N//numArrays, seed=s) for s in seeds]
            nbytes = sum(arr.bytes.size * arr.bytes.itemsize for arr in arrs)

        elapsed = []
        for _ in range(trials):
            began = time.time()
            perm = ak.coargsort(arrs)
            finished = time.time()
            elapsed.append(finished - began)
        tavg = sum(elapsed) / trials

        # The primary key is always permuted; sortedness is only checked
        # for numeric dtypes.
        primary_sorted = arrs[0][perm]
        if numeric:
            assert ak.is_sorted(primary_sorted)
        print("{}-array Average time = {:.4f} sec".format(numArrays, tavg))
        rate = nbytes / tavg
        print("{}-array Average rate = {:.4f} GiB/sec".format(numArrays, rate/2**30))
def check_correctness(dtype, seed):
    """Argsort a 10**4-element random array; verify order for numeric dtypes.

    Parameters
    ----------
    dtype : str
        ``'int64'``, ``'float64'`` or ``'str'``.
    seed : int or None
        Seed forwarded to the random generator.
    """
    length = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, length, seed=seed)
    elif dtype == 'float64':
        a = ak.randint(0, 1, length, dtype=ak.float64, seed=seed)
    elif dtype == 'str':
        a = ak.random_strings_uniform(1, 16, length, seed=seed)

    perm = ak.argsort(a)
    # Strings are sorted too, but only numeric results are checked.
    if dtype not in ('int64', 'float64'):
        return
    assert ak.is_sorted(a[perm])
def time_ak_in1d(size, trials):
    """Benchmark ``ak.in1d`` on strings for the medium and large regimes.

    Parameters
    ----------
    size : int
        Length of the searched array per locale.
    trials : int
        Number of timed membership tests per regime.
    """
    print(">>> arkouda string in1d")
    config = ak.get_config()
    N = size * config["numLocales"]
    a = ak.random_strings_uniform(1, MAXSTRLEN, N)

    for regime, bsize in (('Medium', MEDIUM), ('Large', LARGE)):
        print(
            "{} regime: numLocales = {} a.size = {:,} b.size = {:,}".format(
                regime, config["numLocales"], N, bsize))
        b = ak.random_strings_uniform(1, MAXSTRLEN, bsize)

        elapsed = []
        for _ in range(trials):
            began = time.time()
            ak.in1d(a, b)
            finished = time.time()
            elapsed.append(finished - began)
        tavg = sum(elapsed) / trials

        print("{} average time = {:.4f} sec".format(regime, tavg))
        # Counts 8 bytes per element plus the reported nbytes of each operand.
        total_bytes = a.size * 8 + a.nbytes + b.size * 8 + b.nbytes
        bytes_per_sec = total_bytes / tavg
        print("{} average rate = {:.2f} GiB/sec".format(
            regime, bytes_per_sec / 2**30))
def check_correctness(vsize, strlen, trials, dtype):
    """Compare server-side suffix arrays against the local ``suffixArray``.

    Parameters
    ----------
    vsize : int
        Number of random strings to generate and check.
    strlen : int
        Upper length bound passed to the random string generator.
    trials, dtype
        Unused by this function.
    """
    strings = ak.random_strings_uniform(1, strlen, vsize)
    server_suffix_arrays = ak.suffix_array(strings)
    for idx in range(vsize):
        # Reference result computed locally for this one string.
        expected = suffixArray(strings[idx])
        actual = server_suffix_arrays[idx]
        assert expected == actual
def generate_arrays(N, numArrays, dtype, seed):
    """Create ``numArrays`` random arrays and total their byte sizes.

    Parameters
    ----------
    N : int
        Combined element count; each array gets ``N // numArrays`` elements.
    numArrays : int
        Number of arrays to generate.
    dtype : str
        ``'int64'`` for all-integer arrays, ``'mixed'`` for integers at even
        positions and strings at odd ones; anything else yields strings.
    seed : int or None
        Seed for the first array, incremented per array when not ``None``.

    Returns
    -------
    tuple
        ``(arrays, totalbytes)``. ``arrays`` is the bare array, not a list,
        when ``numArrays == 1``.
    """
    arrays = []
    totalbytes = 0
    for position in range(numArrays):
        wants_ints = dtype == 'int64' or (position % 2 == 0 and dtype == 'mixed')
        if wants_ints:
            arr = ak.randint(0, 2**32, N//numArrays, seed=seed)
            arrays.append(arr)
            totalbytes += arr.size * arr.itemsize
        else:
            arr = ak.random_strings_uniform(1, 16, N//numArrays, seed=seed)
            arrays.append(arr)
            totalbytes += (arr.nbytes * arr.entry.itemsize)
        # Give each subsequent array a distinct seed.
        if seed is not None:
            seed += 1
    result = arrays[0] if numArrays == 1 else arrays
    return result, totalbytes
def check_correctness(dtype, seed):
    """Check ``ak.coargsort`` with a constant-zero key in either position.

    A random key ``a`` is paired with an all-zero key ``z`` as ``[a, z]`` and
    then ``[z, a]``. For numeric dtypes each resulting permutation must sort
    ``a``.

    Parameters
    ----------
    dtype : str
        ``'int64'``, ``'float64'`` or ``'str'``.
    seed : int or None
        Seed forwarded to the random generator.
    """
    length = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, length, seed=seed)
        z = ak.zeros(length, dtype=dtype)
    elif dtype == 'float64':
        a = ak.randint(0, 1, length, dtype=ak.float64, seed=seed)
        z = ak.zeros(length, dtype=dtype)
    elif dtype == 'str':
        a = ak.random_strings_uniform(1, 16, length, seed=seed)
        z = ak.cast(ak.zeros(length), 'str')

    numeric = dtype in ('int64', 'float64')
    for keys in ([a, z], [z, a]):
        perm = ak.coargsort(keys)
        if numeric:
            assert ak.is_sorted(a[perm])
def time_ak_gather(isize, vsize, trials, dtype, random, seed):
    """Benchmark the gather ``v[i]`` with a random index vector.

    Parameters
    ----------
    isize : int
        Number of indices per locale.
    vsize : int
        Number of values per locale.
    trials : int
        Number of timed gathers.
    dtype : str
        ``'int64'``, ``'float64'``, ``'bool'`` or ``'str'``.
    random : bool
        Use random values; random values are also used whenever a seed
        is given.
    seed : int or None
        Seed for the index vector; the values use ``seed + 1``.
    """
    print(">>> arkouda {} gather".format(dtype))
    config = ak.get_config()
    Ni = isize * config["numLocales"]
    Nv = vsize * config["numLocales"]
    print("numLocales = {}, num_indices = {:,} ; num_values = {:,}".format(config["numLocales"], Ni, Nv))

    # Index vector is always random
    i = ak.randint(0, Nv, Ni, seed=seed)
    if seed is not None:
        seed += 1

    is_str = dtype == 'str'
    if random or seed is not None:
        if dtype == 'int64':
            v = ak.randint(0, 2**32, Nv, seed=seed)
        elif dtype == 'float64':
            v = ak.randint(0, 1, Nv, dtype=ak.float64, seed=seed)
        elif dtype == 'bool':
            v = ak.randint(0, 1, Nv, dtype=ak.bool, seed=seed)
        elif is_str:
            v = ak.random_strings_uniform(1, 16, Nv, seed=seed)
    elif is_str:
        v = ak.cast(ak.arange(Nv), 'str')
    else:
        v = ak.ones(Nv, dtype=dtype)

    elapsed = []
    for _ in range(trials):
        began = time.time()
        c = v[i]
        finished = time.time()
        elapsed.append(finished - began)
    tavg = sum(elapsed) / trials

    print("Average time = {:.4f} sec".format(tavg))
    if is_str:
        offset_bytes = c.offsets.size * c.offsets.itemsize
        # Same total as the usual 3x offsets plus (offsets + 2x string bytes).
        moved = 3 * offset_bytes + (offset_bytes + 2 * c.bytes.size)
        bytes_per_sec = moved / tavg
    else:
        bytes_per_sec = (c.size * c.itemsize * 3) / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30))
def time_ak_sa(vsize, strlen, trials, dtype):
    """Benchmark ``ak.suffix_array`` on random strings of bounded length.

    Parameters
    ----------
    vsize : int
        Number of strings per locale; also the number of leading
        string/suffix-array pairs echoed to stdout.
    strlen : int
        Upper length bound passed to the random string generator.
    trials : int
        Number of timed suffix-array constructions.
    dtype : str
        Must be ``'str'``. Any other value prints "Wrong data type" and
        returns without running the benchmark.
    """
    print(">>> arkouda suffix array")
    cfg = ak.get_config()
    Nv = vsize * cfg["numLocales"]
    print("numLocales = {}, num of strings = {:,}".format(
        cfg["numLocales"], Nv))

    if dtype != 'str':
        # Previously execution continued and crashed on the unbound ``v``;
        # stop here once the problem has been reported.
        print("Wrong data type")
        return

    v = ak.random_strings_uniform(1, strlen, Nv)
    c = ak.suffix_array(v)

    print("All the random strings are as follows")
    for k in range(vsize):
        print("the {} th random tring ={}".format(k, v[k]))
        print("the {} th suffix array ={}".format(k, c[k]))
        print("")

    timings = []
    for _ in range(trials):
        start = time.time()
        c = ak.suffix_array(v)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    print("Average time = {:.4f} sec".format(tavg))
    # The rate counts only the suffix-array payload: number of entries times
    # the offsets item size. The zero-weighted terms of the original formula
    # contributed nothing and are dropped.
    bytes_transferred = c.bytes.size * c.offsets.itemsize
    bytes_per_sec = bytes_transferred / tavg
    print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
# Module-level failure flag; nothing in this section sets it.
errors = False
if __name__ == '__main__':
    # Optional command-line arguments: <server> [<port>].
    # The old ``len(sys.argv) > 1`` test read ``sys.argv[2]`` unconditionally
    # and raised IndexError when only a server name was supplied.
    cli_args = sys.argv[1:]
    if len(cli_args) >= 2:
        ak.connect(server=cli_args[0], port=cli_args[1])
    elif cli_args:
        ak.connect(server=cli_args[0])
    else:
        ak.connect()

    # Alternative data source, kept for reference: words from this file.
    # with open(__file__, 'r') as f:
    #     base_words = np.array(f.read().split())
    # test_strings = np.random.choice(base_words, N, replace=True)
    # strings = ak.array(test_strings)

    # Build a vocabulary from two differently distributed random string sets.
    base_words1 = ak.random_strings_uniform(0, 10, UNIQUE, characters='printable')
    base_words2 = ak.random_strings_lognormal(2, 0.25, UNIQUE, characters='printable')
    base_words = ak.concatenate((base_words1, base_words2))
    np_base_words = np.hstack(
        (base_words1.to_ndarray(), base_words2.to_ndarray()))
    # Server-side concatenation must agree with the NumPy equivalent.
    assert (compare_strings(base_words.to_ndarray(), np_base_words))

    # Draw N strings with replacement from the vocabulary.
    choices = ak.randint(0, base_words.size, N)
    strings = base_words[choices]
    test_strings = strings.to_ndarray()
    cat = ak.Categorical(strings)
    print("strings =", strings)
    print("categorical =", cat)