Code example #1
def time_ak_in1d(size, trials):
    print(">>> arkouda int64 in1d")
    cfg = ak.get_config()
    N = size * cfg["numLocales"]
    a = ak.arange(N) % LARGE

    for regime, bsize in zip(('Medium', 'Large'), (MEDIUM, LARGE)):
        print(
            "{} regime: numLocales = {}  a.size = {:,}  b.size = {:,}".format(
                regime, cfg["numLocales"], N, bsize))
        b = ak.arange(bsize)
        expected_misses = (LARGE - bsize) * (a.size // LARGE) + max(
            (0, (a.size % LARGE) - bsize))
        timings = []
        for _ in range(trials):
            start = time.time()
            c = ak.in1d(a, b)
            end = time.time()
            timings.append(end - start)
            assert (c.size - c.sum()) == expected_misses, "Incorrect result"
        tavg = sum(timings) / trials
        print("{} average time = {:.4f} sec".format(regime, tavg))
        bytes_per_sec = (a.size * a.itemsize + b.size * b.itemsize) / tavg
        print("{} average rate = {:.2f} GiB/sec".format(
            regime, bytes_per_sec / 2**30))
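The driver above relies on module-level constants MEDIUM and LARGE that are not shown in this excerpt. A minimal, hypothetical harness sketch (the constant values and problem size below are illustrative assumptions, not the project's actual settings):

import time
import arkouda as ak

MEDIUM = 10**6   # hypothetical size of the medium lookup set
LARGE = 10**8    # hypothetical size of the large lookup set (also the modulus applied to a)

if __name__ == '__main__':
    ak.connect()   # assumes an arkouda server is already running
    time_ak_in1d(size=10**8, trials=3)
    ak.disconnect()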
Code example #2
def check_correctness():
    asize = 10**4
    bsize = 10**3
    a = ak.arange(asize)
    b = ak.arange(bsize)
    c = ak.in1d(a, b)
    assert c.sum() == bsize, "Incorrect result"
Code example #3
File: segarray.py Project: Bears-R-Us/arkouda
    def append_single(self, x, prepend=False):
        '''
        Append a single value to each sub-array.

        Parameters
        ----------
        x : pdarray or scalar
            Single value to append to each sub-array
        prepend : bool
            If True, prepend the value to each sub-array instead of appending

        Returns
        -------
        SegArray
            Copy of original SegArray with values from x appended to each sub-array
        '''
        if hasattr(x, 'size'):
            if x.size != self.size:
                raise ValueError(
                    'Argument must be scalar or same size as SegArray')
            if type(x) != type(self.values) or x.dtype != self.dtype:
                raise TypeError(
                    'Argument type must match value type of SegArray')
        newlens = self.lengths + 1
        newsegs = ak.cumsum(newlens) - newlens
        newvals = ak.zeros(newlens.sum(), dtype=self.dtype)
        if prepend:
            lastscatter = newsegs
        else:
            lastscatter = newsegs + newlens - 1
        newvals[lastscatter] = x
        origscatter = ak.arange(self.valsize) + self.grouping.broadcast(
            ak.arange(self.size), permute=True)
        if prepend:
            origscatter += 1
        newvals[origscatter] = self.values
        return SegArray(newsegs, newvals)
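A short usage sketch for append_single (an assumption that SegArray is exposed as ak.SegArray and that an arkouda server is running):

import arkouda as ak
ak.connect()

segs = ak.array([0, 3, 5])
vals = ak.array([1, 2, 3, 4, 5, 6])
sa = ak.SegArray(segs, vals)      # sub-arrays: [1, 2, 3], [4, 5], [6]

appended = sa.append_single(99)   # sub-arrays: [1, 2, 3, 99], [4, 5, 99], [6, 99]
prepended = sa.append_single(0, prepend=True)   # sub-arrays: [0, 1, 2, 3], [0, 4, 5], [0, 6]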
Code example #4
def check_correctness():
    N = 10**4

    thirds = [ak.cast(ak.arange(i, N*3, 3), 'str') for i in range(3)]
    thickrange = thirds[0].stick(thirds[1], delimiter='_').stick(thirds[2], delimiter='_')

    answer = ak.cast(ak.arange(N*3), 'str')
    assert (thickrange.flatten('_') == answer).all()
    assert (thickrange.flatten('_', regex=True) == answer).all()
    assert (thickrange.flatten('_+', regex=True) == answer).all()
Code example #5
File: check.py Project: ZhuangZzzi/arkouda
def check_set_integer_iv(N):
    # create np version
    a = np.arange(N)
    iv = np.arange(N // 2)
    a[iv] = iv * 10
    # create ak version
    b = ak.arange(N)
    iv = ak.arange(N // 2)
    b[iv] = iv * 10
    # print(a,b)
    c = a == b.to_ndarray()
    # print(type(c),c)
    return pass_fail(c.all())
Code example #6
def check_set_integer_iv_value(N):
    # create np version
    a = np.arange(N)
    iv = np.arange(N // 2)
    a[iv] = -1
    a = ak.array(a)
    # create ak version
    b = ak.arange(N)
    iv = ak.arange(N // 2)
    b[iv] = -1
    # print(a,b)
    c = a == b
    # print(type(c),c)
    return pass_fail(c.all())
Code example #7
def check_correctness():
    keys = ak.arange(1000) % 10
    ones = ak.ones_like(keys)
    g = ak.GroupBy(keys)
    # Make sure keys are correct
    assert (g.unique_keys == ak.arange(10)).all()
    # Check value of sums
    assert (g.sum(ones)[1] == 100).all()
    # For other ops, just run them and make sure they return the right size vector
    for op in ak.GroupBy.Reductions:
        if op in BOOLOPS:
            res = g.aggregate((ones == 1), op)[1]
        else:
            res = g.aggregate(ones, op)[1]
        assert (res.size == g.unique_keys.size)
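BOOLOPS is defined elsewhere in this test script; a plausible sketch of what it contains (an assumption: the aggregations that only accept boolean input):

BOOLOPS = ('any', 'all')   # assumed definition; 'any' and 'all' require a boolean pdarray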
Code example #8
File: util.py Project: Bears-R-Us/arkouda
def most_common(g, values):
    '''Find the most common value for each key in a GroupBy object.
    
    Parameters
    ----------
    g : ak.GroupBy
        Grouping of keys
    values : array-like
        Values in which to find most common

    Returns
    -------
    most_common_values : array-like
        The most common value for each key, in the order of g.unique_keys
    '''
    # Give each key an integer index
    keyidx = g.broadcast(ak.arange(g.unique_keys[0].size), permute=True)
    # Annex values and group by (key, val)
    bykeyval = ak.GroupBy([keyidx, values])
    # Count number of records for each (key, val)
    (ki, uval), count = bykeyval.count()
    # Regroup by the key index alone, dropping the value
    bykey = ak.GroupBy(ki, assume_sorted=True)
    # Find the index of the most frequent value for each key
    _, topidx = bykey.argmax(count)
    # Gather the most frequent values
    return uval[topidx]
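A small usage sketch with toy data (note the helper indexes g.unique_keys[0], so the GroupBy should be built from a list of key arrays; assumes a running arkouda server):

import arkouda as ak
ak.connect()

keys = ak.array([0, 0, 0, 1, 1])
vals = ak.array([7, 7, 8, 9, 9])
g = ak.GroupBy([keys])        # list-of-keys form, so unique_keys[0] is an array
print(most_common(g, vals))   # expected: [7 9] -- one most-common value per unique key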
Code example #9
    def argsort(self, key, ascending=True):
        """
        Return the permutation that sorts the dataframe by `key`.

        Parameters
        ----------
        key : str
            The key to sort on.
        ascending : bool
            If False, return the permutation for a descending sort.

        Returns
        -------
        ak.pdarray
            The permutation array that sorts the data on `key`.
        """

        if self._empty:
            return ak.array([], dtype=ak.int64)
        if ascending:
            return ak.argsort(self[key])
        else:
            if isinstance(
                    self[key],
                    ak.pdarray) and self[key].dtype in (ak.int64, ak.float64):
                return ak.argsort(-self[key])
            else:
                return ak.argsort(self[key])[ak.arange(self.size - 1, -1, -1)]
Code example #10
File: reduce.py Project: cdrickett/arkouda
def time_ak_reduce(N_per_locale, trials, dtype, random):
    print(">>> arkouda reduce")
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    if random:
        if dtype == 'int64':
            a = ak.randint(0, 2**32, N)
        elif dtype == 'float64':
            a = ak.randint(0, 1, N, dtype=ak.float64)
    else:
        a = ak.arange(0, N, 1)
        if dtype == 'float64':
            a = 1.0 * a

    timings = {op: [] for op in OPS}
    results = {}
    for i in range(trials):
        for op in timings.keys():
            fxn = getattr(a, op)
            start = time.time()
            r = fxn()
            end = time.time()
            timings[op].append(end - start)
            results[op] = r
    tavg = {op: sum(t) / trials for op, t in timings.items()}

    for op, t in tavg.items():
        print("{} = {}".format(op, results[op]))
        print("  Average time = {:.4f} sec".format(t))
        bytes_per_sec = (a.size * a.itemsize) / t
        print("  Average rate = {:.2f} GiB/sec".format(bytes_per_sec / 2**30))
Code example #11
File: segarray.py Project: Bears-R-Us/arkouda
    def get_ngrams(self, n, return_origins=True):
        """
        Return all n-grams from all sub-arrays.

        Parameters
        ----------
        n : int
            Length of n-gram
        return_origins : bool
            If True, return an int64 array indicating which sub-array 
            each returned n-gram came from.
        
        Returns
        -------
        ngrams : list of pdarray
            An n-long list of pdarrays, essentially a table where each row is an n-gram.
        origin_indices : pdarray, int
            The index of the sub-array from which the corresponding n-gram originated
        """
        ngrams = []
        notsegstart = ak.ones(self.valsize, dtype=ak.bool)
        notsegstart[self.segments] = False
        valid = ak.ones(self.valsize - n + 1, dtype=ak.bool)
        for i in range(n):
            end = self.valsize - n + i + 1
            ngrams.append(self.values[i:end])
            if i > 0:
                valid &= notsegstart[i:end]
        ngrams = [char[valid] for char in ngrams]
        if return_origins:
            origin_indices = self.grouping.broadcast(
                ak.arange(self.size), permute=True)[:valid.size][valid]
            return ngrams, origin_indices
        else:
            return ngrams
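A usage sketch for get_ngrams on a small SegArray (assumes ak.SegArray and a running arkouda server):

import arkouda as ak
ak.connect()

sa = ak.SegArray(ak.array([0, 4]), ak.array([1, 2, 3, 4, 5, 6, 7]))
# sub-arrays: [1, 2, 3, 4] and [5, 6, 7]
ngrams, origins = sa.get_ngrams(2)
# zipping ngrams[0] with ngrams[1] gives the bigrams (1,2), (2,3), (3,4), (5,6), (6,7)
# origins: [0, 0, 0, 1, 1] -- the sub-array each bigram came from; no bigram spans the boundary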
Code example #12
def time_ak_scan(N_per_locale, trials, dtype, random, seed):
    print(">>> arkouda {} scan".format(dtype))
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
    if random or args.seed is not None:
        if dtype == 'int64':
            a = ak.randint(1, N, N, seed=seed)
        elif dtype == 'float64':
            a = ak.uniform(N, seed=seed) + 0.5
    else:
        a = ak.arange(1, N, 1)
        if dtype == 'float64':
            a = 1.0 * a
     
    timings = {op: [] for op in OPS}
    final_values = {}
    for i in range(trials):
        for op in timings.keys():
            fxn = getattr(ak, op)
            start = time.time()
            r = fxn(a)
            end = time.time()
            timings[op].append(end - start)
            final_values[op] = r[r.size-1]
    tavg = {op: sum(t) / trials for op, t in timings.items()}

    for op, t in tavg.items():
        print("{}, final value = {}".format(op, final_values[op]))
        print("  {} Average time = {:.4f} sec".format(op, t))
        bytes_per_sec = (a.size * a.itemsize * 2) / t
        print("  {} Average rate = {:.2f} GiB/sec".format(op, bytes_per_sec/2**30))
Code example #13
File: segarray.py Project: Bears-R-Us/arkouda
    def remove_repeats(self, return_multiplicity=False):
        """
        Condense sequences of repeated values within a sub-array to a single value.

        Parameters
        ----------
        return_multiplicity : bool
            If True, also return the number of times each value was repeated.

        Returns
        -------
        norepeats : SegArray
            Sub-arrays with runs of repeated values replaced with single value
        multiplicity : SegArray
            If return_multiplicity=True, this array contains the number of times
            each value in the returned SegArray was repeated in the original SegArray.
        """
        isrepeat = ak.zeros(self.values.size, dtype=ak.bool)
        isrepeat[1:] = self.values[:-1] == self.values[1:]
        isrepeat[self.segments] = False
        truepaths = self.values[~isrepeat]
        nhops = self.grouping.sum(~isrepeat)[1]
        # truehops = ak.cumsum(~isrepeat)
        # nhops = ak.concatenate((truehops[self.segments[1:]], ak.array([truehops.sum()+1]))) - truehops[self.segments]
        truesegs = ak.cumsum(nhops) - nhops
        norepeats = SegArray(truesegs, truepaths)
        if return_multiplicity:
            truehopinds = ak.arange(self.valsize)[~isrepeat]
            multiplicity = ak.zeros(truepaths.size, dtype=ak.int64)
            multiplicity[:-1] = truehopinds[1:] - truehopinds[:-1]
            multiplicity[-1] = self.valsize - truehopinds[-1]
            return norepeats, SegArray(truesegs, multiplicity)
        else:
            return norepeats
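A usage sketch for remove_repeats (assumes ak.SegArray and a running arkouda server):

import arkouda as ak
ak.connect()

sa = ak.SegArray(ak.array([0, 5]), ak.array([1, 1, 2, 2, 2, 3, 3, 4]))
# sub-arrays: [1, 1, 2, 2, 2] and [3, 3, 4]
norepeats, mult = sa.remove_repeats(return_multiplicity=True)
# norepeats sub-arrays: [1, 2] and [3, 4]
# mult sub-arrays: [2, 3] and [2, 1] -- run lengths of the condensed values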
Code example #14
File: segarray.py Project: Bears-R-Us/arkouda
    def from_multi_array(cls, m):
        """
        Construct a SegArray from a list of columns. This essentially transposes the input,
        resulting in an array of rows.

        Parameters
        ----------
        m : list of pdarray
            List of columns, the rows of which will form the sub-arrays of the output

        Returns
        -------
        SegArray
            Array of rows of input
        """
        if isinstance(m, ak.pdarray):
            size = m.size
            n = 1
            dtype = m.dtype
        else:
            s = set(mi.size for mi in m)
            if len(s) != 1:
                raise ValueError("All columns must have same length")
            size = s.pop()
            n = len(m)
            d = set(mi.dtype for mi in m)
            if len(d) != 1:
                raise ValueError("All columns must have same dtype")
            dtype = d.pop()
        newsegs = ak.arange(size) * n
        newvals = ak.zeros(size * n, dtype=dtype)
        for j in range(len(m)):
            newvals[j::len(m)] = m[j]
        return cls(newsegs, newvals)
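A usage sketch for from_multi_array (an assumption that it is registered as a classmethod on ak.SegArray, as in the upstream project):

import arkouda as ak
ak.connect()

cols = [ak.array([1, 2, 3]), ak.array([10, 20, 30])]
sa = ak.SegArray.from_multi_array(cols)
# the rows become the sub-arrays: [1, 10], [2, 20], [3, 30]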
Code example #15
def check_get_integer_idx(N):
    # create np version
    a = np.arange(N)
    v1 = a[N // 2]
    # create ak version
    b = ak.arange(N)
    v2 = b[N // 2]
    return pass_fail(v1 == v2)
Code example #16
def check_arange(N):
    # create np version
    a = ak.array(np.arange(N))
    # create ak version
    b = ak.arange(N)
    # print(a,b)
    c = a == b
    # print(type(c),c)
    return pass_fail(c.all())
Code example #17
def time_flatten(N, trials):
    print(">>> arkouda flatten")
    cfg = ak.get_config()
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    thirds = [ak.cast(ak.arange(i, N*3, 3), 'str') for i in range(3)]
    thickrange = thirds[0].stick(thirds[1], delimiter='_').stick(thirds[2], delimiter='_')
    nbytes = thickrange.nbytes * thickrange.entry.itemsize

    non_regex_times = []
    regex_literal_times = []
    regex_pattern_times = []
    for i in range(trials):
        start = time.time()
        non_regex = thickrange.flatten('_')
        end = time.time()
        non_regex_times.append(end - start)

        start = time.time()
        regex_literal = thickrange.flatten('_', regex=True)
        end = time.time()
        regex_literal_times.append(end - start)

        start = time.time()
        regex_pattern = thickrange.flatten('_+', regex=True)
        end = time.time()
        regex_pattern_times.append(end - start)

    avg_non_regex = sum(non_regex_times) / trials
    avg_regex_literal = sum(regex_literal_times) / trials
    avg_regex_pattern = sum(regex_pattern_times) / trials

    answer = ak.cast(ak.arange(N*3), 'str')
    assert (non_regex == answer).all()
    assert (regex_literal == answer).all()
    assert (regex_pattern == answer).all()

    print("non-regex flatten with literal delimiter Average time = {:.4f} sec".format(avg_non_regex))
    print("regex flatten with literal delimiter Average time = {:.4f} sec".format(avg_regex_literal))
    print("regex flatten with pattern delimiter Average time = {:.4f} sec".format(avg_regex_pattern))

    print("non-regex flatten with literal delimiter Average rate = {:.4f} GiB/sec".format(nbytes/2**30/avg_non_regex))
    print("regex flatten with literal delimiter Average rate = {:.4f} GiB/sec".format(nbytes/2**30/avg_regex_literal))
    print("regex flatten with pattern delimiter Average rate = {:.4f} GiB/sec".format(nbytes/2**30/avg_regex_pattern))
Code example #18
File: sort-cases.py Project: Bears-R-Us/arkouda
def refinement(N):
    '''
    Coargsort of two arrays, where the first is already sorted
    but has many repeated values
    '''
    groupsize = 100
    a = ak.arange(N // 2) // groupsize
    factor = 2**32 // a.max()
    a *= factor
    b = ak.randint(0, 2**32, N // 2)
    yield 'refinement int64', (a, b)
Code example #19
    def reset_index(self, size=False):
        """
        Set the index to an integer range.

        Useful if this dataframe is the result of a slice operation from
        another dataframe, or if you have permuted the rows and no longer need
        to keep that ordering on the rows.

        Parameters
        ----------
        size : int
            If size is passed, do not attempt to determine size based on
            existing column sizes. Assume caller handles consistency correctly.
        """

        if not size:
            self.update_size()
            self.data['index'] = ak.arange(0, self._size)
        else:
            self.data['index'] = ak.arange(size)
Code example #20
File: check.py Project: ZhuangZzzi/arkouda
def check_set_bool_iv(N):
    # create np version
    a = np.arange(N)
    a[a < N // 2] = a[:N // 2] * -1
    # create ak version
    b = ak.arange(N)
    b[b < N // 2] = b[:N // 2] * -1
    # print(a,b)
    c = a == b.to_ndarray()
    # print(type(c),c)
    return pass_fail(c.all())
Code example #21
    def __setitem__(self, key, value):
        self.update_size()

        # If this is the first column added, we must create an index column.
        add_index = False
        if self._empty:
            add_index = True

        # Set a single row in the dataframe using a dict of values
        if type(key) == int:
            for k in self.data.keys():
                if isinstance(self.data[k], ak.Strings):
                    raise ValueError(
                        "This DataFrame has a column of type ak.Strings;"
                        " so this DataFrame is immutable. This feature could change"
                        " if arkouda supports mutable Strings in the future.")
            if self._empty:
                raise ValueError(
                    "Initial data must be dict of arkouda arrays.")
            elif not isinstance(value, (dict, UserDict)):
                raise ValueError("Expected dict or Row type.")
            elif key >= self._size:
                raise KeyError("The row index is out of range.")
            else:
                for k, v in value.items():
                    if k == 'index':
                        continue
                    self[k][key] = v

        # Set a single column in the dataframe using an arkouda array
        elif type(key) == str:
            if not isinstance(value, self.COLUMN_CLASSES):
                raise ValueError(
                    f"Column must be one of {self.COLUMN_CLASSES}.")
            elif self._size is not None and self._size != value.size:
                raise ValueError(
                    "Expected size {} but received size {}.".format(
                        self.size, value.size))
            else:
                self._empty = False
                UserDict.__setitem__(self, key, value)
                # Update the index values
                if key not in self._columns:
                    self._columns.append(key)

        # Otherwise the key type is unsupported
        else:
            raise ValueError("No valid data received.")

        # Update the dataframe indices and metadata.
        if add_index:
            self.update_size()
            self.data['index'] = ak.arange(0, self._size, 1)
Code example #22
def check_set_bool_iv_value(N):
    # create np version
    a = np.arange(N)
    a[a < N // 2] = -1
    a = ak.array(a)
    # create ak version
    b = ak.arange(N)
    b[b < N // 2] = -1
    # print(a,b)
    c = a == b
    # print(type(c),c)
    return pass_fail(c.all())
Code example #23
def check_get_bool_iv(N):
    # create np version
    a = np.arange(N)
    a = a[a < N // 2]
    a = ak.array(a)
    # create ak version
    b = ak.arange(N)
    b = b[b < N // 2]
    # print(a,b)
    c = a == b
    # print(type(c),c)
    return pass_fail(c.all())
Code example #24
File: check.py Project: ZhuangZzzi/arkouda
def check_set_integer_idx(N):
    # create np version
    a = np.arange(N)
    a[N // 2] = -1
    a[-1] = -1
    v1 = a[N // 2]
    # create ak version
    b = ak.arange(N)
    b[N // 2] = -1
    b[-1] = -1
    v2 = b[N // 2]
    return pass_fail(v1 == v2) and pass_fail(a[-1] == b[-1])
Code example #25
File: check.py Project: ZhuangZzzi/arkouda
def check_bool(N):
    a = ak.arange(N)
    b = ak.ones(N)
    correct = False
    try:
        # Using a multi-element pdarray in a boolean context should raise ValueError
        c = a and b
    except ValueError:
        correct = True
    except Exception:
        correct = False
    # A single-element pdarray should be usable in a boolean context
    d = ak.array([1])
    correct = correct and (d and 5)
    return pass_fail(correct)
Code example #26
File: index.py Project: Bears-R-Us/arkouda
    def argsort(self, ascending=True):
        if not ascending:
            if isinstance(
                    self.index,
                    ak.pdarray) and self.index.dtype in (ak.int64, ak.float64):
                i = ak.argsort(-self.index)
            else:
                i = ak.argsort(self.index)[ak.arange(self.index.size - 1, -1,
                                                     -1)]
        else:
            i = ak.argsort(self.index)
        return i
Code example #27
File: sort-cases.py Project: Bears-R-Us/arkouda
def block_sorted(N):
    '''
    The concatenation of two sorted arrays of unequal length
    The interleaving of two sorted arrays of unequal length

    Most often occurs in array setops, where two arrays are
    uniqued (via sorting), then concatenated and sorted
    '''
    splitpoint = 0.4
    Na = int(splitpoint * N)
    Nb = N - Na
    # Construct a and b such that:
    #   1) Values overlap
    #   2) a and b are sorted
    a = ak.arange(Na)
    b = ak.arange(Nb)
    c = ak.concatenate((a, b), ordered=True)
    yield 'block-sorted concat int64', c

    ci = ak.concatenate((a, b), ordered=False)
    yield 'block-sorted interleaved int64', ci
Code example #28
File: check.py Project: ZhuangZzzi/arkouda
def check_sort(N):
    # create np version
    a = np.arange(N)
    a = a[::-1]
    a = np.sort(a)
    # create ak version
    b = ak.arange(N)
    b = b[::-1]
    b = ak.sort(b)
    # print(a,b)
    c = a == b.to_ndarray()
    # print(type(c),c)
    return pass_fail(c.all())
Code example #29
File: check.py Project: ZhuangZzzi/arkouda
def check_coargsort(N):
    # create np version
    a = np.arange(N)
    a = a[::-1]
    iv = np.lexsort([a, a])
    a = a[iv]
    # create ak version
    b = ak.arange(N)
    b = b[::-1]
    iv = ak.coargsort([b, b])
    b = b[iv]
    # print(a,b)
    c = a == b.to_ndarray()
    # print(type(c),c)
    return pass_fail(c.all())
Code example #30
File: alignment.py Project: Bears-R-Us/arkouda
def in1d_intervals(vals, intervals, symmetric=False, assume_unique=False):
    """
    Test each value for membership in *any* of a set of half-open (pythonic) 
    intervals.

    Parameters
    ----------
    vals : pdarray(int, float)
        Values to test for membership in intervals
    intervals : 2-tuple of pdarrays
        Non-overlapping, half-open intervals, as a tuple of 
        (lower_bounds_inclusive, upper_bounds_exclusive)
    symmetric : bool
        If True, also return boolean pdarray indicating which intervals
        contained one or more query values.
        
    Returns
    -------
    pdarray(bool)
        Array of same length as <vals>, True if corresponding value is
        included in any of the half-open intervals [low[i], high[i]).
    pdarray(bool) (if symmetric=True)    
        Array of same length as number of intervals, True if corresponding
        interval contains any of the values in <vals>.

    Notes
    -----
    First return array is equivalent to the following:
        ((vals >= intervals[0][0]) & (vals < intervals[1][0])) | 
        ((vals >= intervals[0][1]) & (vals < intervals[1][1])) | 
        ...
        ((vals >= intervals[0][-1]) & (vals < intervals[1][-1]))
    But much faster when testing many ranges.
    
    Second (optional) return array is equivalent to:
        ((intervals[0] <= vals[0]) & (intervals[1] > vals[0])) |
        ((intervals[0] <= vals[1]) & (intervals[1] > vals[1])) |
        ...
        ((intervals[0] <= vals[-1]) & (intervals[1] > vals[-1]))
    But much faster when vals is non-trivial size.
    """
    idx = search_intervals(vals, intervals, assume_unique=assume_unique)
    found = idx > -1
    if symmetric:
        containresult = ak.in1d(ak.arange(intervals[0].size), idx)
        return found, containresult
    else:
        return found
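A usage sketch with toy intervals (assumes search_intervals from the same module behaves as described above, and a running arkouda server):

import arkouda as ak
ak.connect()

vals = ak.array([1, 5, 12])
lows = ak.array([0, 10])    # inclusive lower bounds
highs = ak.array([3, 15])   # exclusive upper bounds
print(in1d_intervals(vals, (lows, highs)))
# expected: [True False True] -- 1 falls in [0, 3), 5 falls in neither interval, 12 falls in [10, 15)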