def argsort(self, key, ascending=True):
    """
    Compute the permutation that orders the dataframe by one column.

    Parameters
    ----------
    key : str
        The key (column) to sort on.
    ascending : bool
        If True (default), the permutation sorts `key` in ascending order;
        otherwise it sorts in descending order.

    Returns
    -------
    ak.pdarray
        The permutation array that sorts the data on `key`. An empty int64
        array is returned when the dataframe is flagged as empty.
    """
    if self._empty:
        return ak.array([], dtype=ak.int64)

    column = self[key]
    if ascending:
        return ak.argsort(column)

    numeric = isinstance(column, ak.pdarray) and column.dtype in (ak.int64, ak.float64)
    if numeric:
        # Negating numeric values flips their ordering
        return ak.argsort(-column)

    # Non-numeric column: sort ascending, then read the permutation back to front
    forward = ak.argsort(column)
    return forward[ak.arange(self.size - 1, -1, -1)]
def argsort(self, ascending=True):
    """
    Compute the permutation that sorts `self.index`.

    Parameters
    ----------
    ascending : bool
        If True (default), sort in ascending order; otherwise descending.

    Returns
    -------
    The result of `ak.argsort` on the index, adjusted for descending order
    when requested.
    """
    keys = self.index
    if ascending:
        return ak.argsort(keys)

    if isinstance(keys, ak.pdarray) and keys.dtype in (ak.int64, ak.float64):
        # Numeric index: negation reverses the sort order
        return ak.argsort(-keys)

    # Non-numeric index: sort ascending, then index with a descending arange
    forward = ak.argsort(keys)
    return forward[ak.arange(keys.size - 1, -1, -1)]
def time_ak_argsort(N_per_locale, trials, dtype, seed):
    """
    Time `ak.argsort` on a randomly generated array and print the results.

    Parameters
    ----------
    N_per_locale : int
        Number of elements per locale; the total problem size is this value
        multiplied by the server's ``numLocales``.
    trials : int
        Number of timed `ak.argsort` calls to average over.
    dtype : str
        One of ``'int64'``, ``'float64'`` or ``'str'``.
    seed
        Seed forwarded to the random array generators.

    Raises
    ------
    ValueError
        If `dtype` is not one of the supported names.
    """
    # Fail fast, before talking to the server: an unknown dtype would
    # otherwise leave `a` and `nbytes` unbound and crash further down.
    if dtype not in ('int64', 'float64', 'str'):
        raise ValueError("unsupported dtype: {!r}".format(dtype))

    print(">>> arkouda {} argsort".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
        nbytes = a.size * a.itemsize
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
        nbytes = a.size * a.itemsize
    elif dtype == 'str':
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
        nbytes = a.nbytes * a.entry.itemsize

    timings = []
    for _ in range(trials):
        start = time.time()
        perm = ak.argsort(a)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    # Sortedness is only verified for the numeric dtypes
    if dtype in ('int64', 'float64'):
        assert ak.is_sorted(a[perm])
    print("Average time = {:.4f} sec".format(tavg))
    bytes_per_sec = nbytes / tavg
    print("Average rate = {:.4f} GiB/sec".format(bytes_per_sec / 2**30))
def time_ak_argsort(N_per_locale, trials, dtype, scale_by_locales):
    """
    Time `ak.argsort` on a randomly generated numeric array and print the results.

    Parameters
    ----------
    N_per_locale : int
        Number of elements per locale, or the total size when
        `scale_by_locales` is false.
    trials : int
        Number of timed `ak.argsort` calls to average over.
    dtype : str
        Either ``'int64'`` or ``'float64'``.
    scale_by_locales : bool
        If true, the problem size is `N_per_locale` multiplied by the
        server's ``numLocales``; otherwise it is `N_per_locale` itself.

    Raises
    ------
    ValueError
        If `dtype` is not one of the supported names.
    """
    # Fail fast, before talking to the server: an unknown dtype would
    # otherwise leave `a` unbound and crash inside the timing loop.
    if dtype not in ('int64', 'float64'):
        raise ValueError("unsupported dtype: {!r}".format(dtype))

    print(">>> arkouda argsort")
    cfg = ak.get_config()
    if scale_by_locales:
        N = N_per_locale * cfg["numLocales"]
    else:
        N = N_per_locale
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64)

    timings = []
    for _ in range(trials):
        start = time.time()
        perm = ak.argsort(a)
        end = time.time()
        timings.append(end - start)
    tavg = sum(timings) / trials

    assert ak.is_sorted(a[perm])
    print("Average time = {:.4f} sec".format(tavg))
    bytes_per_sec = (a.size * a.itemsize) / tavg
    print("Average rate = {:.4f} GiB/sec".format(bytes_per_sec/2**30))
def check_correctness(dtype, seed):
    """
    Assert that GroupBy gives the same keys and segments after shuffling the input.

    Arrays come from `generate_arrays(1000, 2, dtype, seed)`. They are grouped
    once as generated and once after applying a random permutation; the
    unique keys and the segments of both groupings must match.
    """
    arrays, _ = generate_arrays(1000, 2, dtype, seed)
    baseline = ak.GroupBy(arrays)

    # A random permutation obtained by argsorting random integers
    shuffle = ak.argsort(ak.randint(0, 2**32, arrays[0].size))
    shuffled = ak.GroupBy([arr[shuffle] for arr in arrays])

    assert all(
        (expected == actual).all()
        for expected, actual in zip(baseline.unique_keys, shuffled.unique_keys)
    )
    assert (baseline.segments == shuffled.segments).all()
def check_correctness(dtype):
    """
    Assert that `ak.argsort` sorts a random array of 10**4 elements.

    Parameters
    ----------
    dtype : str
        Either ``'int64'`` or ``'float64'``.

    Raises
    ------
    ValueError
        If `dtype` is not one of the supported names.
    AssertionError
        If the array permuted by the argsort result is not sorted.
    """
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64)
    else:
        # Previously fell through and failed later on the unbound name `a`
        raise ValueError("unsupported dtype: {!r}".format(dtype))

    perm = ak.argsort(a)
    assert ak.is_sorted(a[perm])
def sort_values(self, ascending=True):
    """
    Sort the series by its values.

    Parameters
    ----------
    ascending : bool
        If True (default), sort smallest to largest; otherwise largest to
        smallest.

    Returns
    -------
    A new Series whose index and values are reordered by the sort.
    """
    values = self.values
    if ascending:
        order = ak.argsort(values)
    elif isinstance(values, ak.pdarray) and values.dtype in (ak.int64, ak.float64):
        # Numeric values: negation reverses the sort order
        order = ak.argsort(-values)
    else:
        # Non-numeric values: reverse slicing is not supported, so index the
        # ascending permutation with a descending arange instead
        order = ak.argsort(values)[ak.arange(values.size - 1, -1, -1)]
    return Series((self.index[order], values[order]))
def check_correctness(dtype, seed):
    """
    Run `ak.argsort` on a random array of 10**4 elements and check the result.

    Sortedness of the permuted array is asserted only for the numeric
    dtypes; for ``'str'`` the argsort is executed but not verified.

    Parameters
    ----------
    dtype : str
        One of ``'int64'``, ``'float64'`` or ``'str'``.
    seed
        Seed forwarded to the random array generators.

    Raises
    ------
    ValueError
        If `dtype` is not one of the supported names.
    AssertionError
        If a numeric array permuted by the argsort result is not sorted.
    """
    N = 10**4
    if dtype == 'int64':
        a = ak.randint(0, 2**32, N, seed=seed)
    elif dtype == 'float64':
        a = ak.randint(0, 1, N, dtype=ak.float64, seed=seed)
    elif dtype == 'str':
        a = ak.random_strings_uniform(1, 16, N, seed=seed)
    else:
        # Previously fell through and failed later on the unbound name `a`
        raise ValueError("unsupported dtype: {!r}".format(dtype))

    perm = ak.argsort(a)
    if dtype in ('int64', 'float64'):
        assert ak.is_sorted(a[perm])
def check_argsort(N):
    """
    Compare `ak.argsort` with `np.argsort` on a reversed range of length N.

    Both libraries build ``arange(N)`` reversed and reorder it through their
    own argsort; the element-wise comparison of the two results is reduced
    with ``all()`` and handed to `pass_fail`.
    """
    # NumPy reference
    np_reversed = np.arange(N)[::-1]
    np_sorted = np_reversed[np.argsort(np_reversed)]

    # arkouda counterpart
    ak_reversed = ak.arange(N)[::-1]
    ak_sorted = ak_reversed[ak.argsort(ak_reversed)]

    agree = np_sorted == ak_sorted.to_ndarray()
    return pass_fail(agree.all())
def gen_rmat_edges(lgNv, Ne_per_v, p, perm=False):
    """
    Generate a random edge list as a pair of arkouda arrays.

    NOTE(review): the name and the a/b/c/d probabilities suggest an R-MAT
    style generator -- confirm against the caller.

    Parameters
    ----------
    lgNv : int
        Base-2 logarithm of the number of vertices (``Nv = 2**lgNv``).
    Ne_per_v : int
        Edges per vertex; ``Ne_per_v * Nv`` edges are generated in total.
    p : number
        Probability ``a``; the remaining ``1 - p`` is split evenly across
        ``b``, ``c`` and ``d``.
    perm : bool
        If True, vertex numbers are renamed through a random permutation
        after the edges have been sorted.

    Returns
    -------
    tuple
        ``(ii, jj)``: the first and second vertex of every edge.
    """
    # number of vertices
    Nv = 2**lgNv
    # number of edges
    Ne = Ne_per_v * Nv
    # probabilities
    a = p
    b = (1.0 - a) / 3.0
    c = b
    d = b
    # init edge arrays (all ones, so vertex numbers start at 1)
    ii = ak.ones(Ne, dtype=ak.int64)
    jj = ak.ones(Ne, dtype=ak.int64)
    # quantities to use in edge generation loop
    ab = a + b
    c_norm = c / (c + d)
    a_norm = a / (a + b)
    # generate edges, one bit of each vertex number per iteration
    # NOTE(review): the loop sets bits 0..lgNv-2 only, so vertex numbers stay
    # within 1..2**(lgNv-1) -- confirm this is intended
    for ib in range(1, lgNv):
        ii_bit = (ak.randint(0, 1, Ne, dtype=ak.float64) > ab)
        # threshold for the jj bit is c_norm where ii_bit is set, a_norm elsewhere
        jj_bit = (ak.randint(0, 1, Ne, dtype=ak.float64) > (c_norm * ii_bit + a_norm * (~ii_bit)))
        ii = ii + ((2**(ib - 1)) * ii_bit)
        jj = jj + ((2**(ib - 1)) * jj_bit)
    # sort all based on ii and jj using coargsort
    # all edges should be sorted based on both vertices of the edge
    iv = ak.coargsort((ii, jj))
    # permute into sorted order
    ii = ii[iv]  # permute first vertex into sorted order
    jj = jj[iv]  # permute second vertex into sorted order
    # to permute/rename vertices
    if perm:
        # generate permutation for new vertex numbers (names)
        ir = ak.argsort(ak.randint(0, 1, Nv, dtype=ak.float64))
        # renumber (rename) vertices
        ii = ir[ii]  # rename first vertex
        jj = ir[jj]  # rename second vertex
    #
    # maybe: remove edges which are self-loops???
    #
    # return pair of pdarrays
    return (ii, jj)
def do_argsort(data, algo):
    """
    Argsort `data` with the given algorithm.

    A single `ak.pdarray` or `ak.Strings` is passed to `ak.argsort`;
    anything else is passed to `ak.coargsort`.
    """
    is_single_array = isinstance(data, (ak.pdarray, ak.Strings))
    sorter = ak.argsort if is_single_array else ak.coargsort
    return sorter(data, algo)
# NOTE(review): fragment of a larger test; `strings`, `cat`, `more_words` and
# `test_strings` are defined before this point -- presumably the same words as
# an arkouda Strings, a categorical, and plain Python/NumPy data; confirm.

# in1d: membership in `more_words` must agree between `strings` and `cat`
akwords = ak.array(more_words)
matches = ak.in1d(strings, akwords)
catmatches = ak.in1d(cat, akwords)
assert ((matches == catmatches).all())
# Every word in matches should be in the target set
for word in strings[matches]:
    assert (word in more_words)
# Exhaustively find all matches to make sure we didn't miss any
inds = ak.zeros(strings.size, dtype=ak.bool)
for word in more_words:
    inds |= (strings == word)
assert ((inds == matches).all())
print("in1d and iter passed")

# argsort: both representations must sort to the same order as np.sort
akperm = ak.argsort(strings)
aksorted = strings[akperm].to_ndarray()
npsorted = np.sort(test_strings)
assert ((aksorted == npsorted).all())
catperm = ak.argsort(cat)
catsorted = cat[catperm].to_ndarray()
assert ((catsorted == npsorted).all())
print("argsort passed")

# unique: both representations must produce the same set of unique words
akuniq = ak.unique(strings)
catuniq = ak.unique(cat)
akset = set(akuniq.to_ndarray())
catset = set(catuniq.to_ndarray())
assert (akset == catset)
# There should be no duplicates
def search_intervals(vals, intervals, assume_unique=False):
    """
    Given an array of query vals and non-overlapping, half-open (pythonic)
    intervals, return the index of the interval containing each query value,
    or -1 if not present in any interval.

    Parameters
    ----------
    vals : pdarray(int, float)
        Values to search for in intervals
    intervals : 2-tuple of pdarrays
        Non-overlapping, half-open intervals, as a tuple of
        (lower_bounds_inclusive, upper_bounds_exclusive)
    assume_unique : bool
        If True, assume query vals are unique. Default: False.

    Returns
    -------
    idx : pdarray(int64)
        Index of interval containing each query value, or -1 if not found

    Raises
    ------
    ValueError
        If `intervals` does not have length 2, the bound arrays differ in
        size, an upper bound (after conversion to inclusive) is below its
        lower bound, the lower bounds are not sorted ascending, or the
        intervals overlap.
    TypeError
        If `vals` or either bound array is not an int64/float64 pdarray.

    Notes
    -----
    The return idx satisfies the following condition:
        present = idx > -1
        ((intervals[0][idx[present]] <= vals[present]) &
         (intervals[1][idx[present]] > vals[present])).all()
    """
    if len(intervals) != 2:
        raise ValueError(
            "intervals must be 2-tuple of (lower_bound_inclusive, upper_bounds_exclusive)"
        )

    def check_numeric(x):
        # Reject anything that is not an int64 or float64 pdarray
        if not (isinstance(x, ak.pdarray) and x.dtype in (ak.int64, ak.float64)):
            raise TypeError("arguments must be numeric arrays")

    check_numeric(vals)
    check_numeric(intervals[0])
    check_numeric(intervals[1])
    low = intervals[0]
    # Convert to closed (inclusive) intervals
    # NOTE(review): subtracting 1 gives the inclusive upper bound only for
    # integer data; float64 bounds pass check_numeric, but values strictly
    # between (upper - 1) and upper would sort after `high` -- confirm
    high = intervals[1] - 1
    if low.size != high.size:
        raise ValueError("Lower and upper bound arrays must be same size")
    if not (high >= low).all():
        raise ValueError("Upper bounds must be greater than lower bounds")
    if not ak.is_sorted(low):
        raise ValueError("Intervals must be sorted in ascending order")
    if not (low[1:] > high[:-1]).all():
        raise ValueError("Intervals must be non-overlapping")
    if assume_unique:
        uvals = vals
    else:
        # Search only the unique query values; results are broadcast back at the end
        g = ak.GroupBy(vals)
        uvals = g.unique_keys
    # Index of interval containing each unique value (initialized to -1: not found)
    containinginterval = -ak.ones(uvals.size, dtype=ak.int64)
    # Sort bounds and query values together; layout of concat is
    # [lower bounds | unique values | upper bounds]
    concat = ak.concatenate((low, uvals, high))
    perm = ak.argsort(concat)
    # iperm is the indices of the original values in the sorted array
    iperm = ak.argsort(perm)  # aku.invert_permutation(perm)
    # first position in concat that belongs to the upper bounds
    boundary = uvals.size + low.size
    # indices of the lower bounds in the sorted array
    starts = iperm[:low.size]
    # indices of the upper bounds in the sorted array
    ends = iperm[boundary:]
    # which lower/upper bound pairs have any indices between them?
    valid = (ends > starts + 1)
    if valid.sum() > 0:
        # pranges is all the indices in sorted array that fall between a lower and an upper bound
        segs, pranges = gen_ranges(starts[valid] + 1, ends[valid])
        # matches are the indices of those items in the original array
        matches = perm[pranges]
        # integer indices of each interval containing a hit
        hitidx = ak.arange(valid.size)[valid]
        # broadcast interval index out to matches
        matchintervalidx = ak.broadcast(segs, hitidx, matches.size)
        # make sure not to include any of the bounds themselves
        validmatch = (matches >= low.size) & (matches < boundary)
        # indices of unique values found (translated from concat keys)
        uvalidx = matches[validmatch] - low.size
        # set index of containing interval for uvals that were found
        containinginterval[uvalidx] = matchintervalidx[validmatch]
    if assume_unique:
        res = containinginterval
    else:
        # presumably maps the per-unique-value result back onto every
        # original query value in its original order -- verify GroupBy.broadcast
        res = g.broadcast(containinginterval, permute=True)
    return res