Exemple #1
0
def compare_strategies(length, ncat, op, dtype):
    """Time global vs. local GroupBy construction and aggregation.

    Builds ``length`` random keys drawn from ``ncat`` categories plus a
    values array of the requested ``dtype``, then runs ``ak.GroupBy``
    twice — with the second positional flag False and True (presumably
    global vs. per-locale strategy; confirm against ak.GroupBy) — timing
    the groupby and the subsequent ``aggregate(vals, op)`` for each, and
    finally printing whether the two strategies agree.

    Returns a 4-tuple:
        (global groupby time, global reduce time,
         local groupby time, local reduce time)
    """
    keys = ak.randint(0, ncat, length)
    if dtype == 'int64':
        vals = ak.randint(0, length//ncat, length)
    elif dtype == 'bool':
        # Sparse boolean values: flip roughly ncat//2 random positions on.
        vals = ak.zeros(length, dtype='bool')
        for pos in np.random.randint(0, length, ncat//2):
            vals[pos] = True
    else:
        vals = ak.linspace(-1, 1, length)

    def timed(label, thunk):
        # Print "<label> <seconds>", returning (elapsed, thunk result).
        print(label, end=' ')
        before = time()
        result = thunk()
        elapsed = time() - before
        print(elapsed)
        return elapsed, result

    ggtime, gg = timed("Global groupby", lambda: ak.GroupBy(keys, False))
    grtime, (gk, gv) = timed("Global reduce", lambda: gg.aggregate(vals, op))
    lgtime, lg = timed("Local groupby", lambda: ak.GroupBy(keys, True))
    lrtime, (lk, lv) = timed("Local reduce", lambda: lg.aggregate(vals, op))

    # Both strategies should produce identical keys and values.
    print(f"Keys match? {(gk == lk).all()}")
    print(f"Absolute diff of vals = {ak.abs(gv - lv).sum()}")
    return ggtime, grtime, lgtime, lrtime
Exemple #2
0
    def test_multi_level_categorical(self):
        """GroupBy.nunique agrees across Strings/Categorical two-key groupings."""
        strvals = ak.array(['a', 'b', 'a', 'b', 'c'])
        cat = ak.Categorical(strvals)
        cat_from_codes = ak.Categorical.from_codes(
            codes=ak.array([0, 1, 0, 1, 2]),
            categories=ak.array(['a', 'b', 'c']))
        idx = ak.arange(strvals.size)
        expected = {('a', 'a'): 2, ('b', 'b'): 2, ('c', 'c'): 1}

        # Every grouping below uses the same underlying labels, so each must
        # produce the same (key-pair -> nunique) mapping.
        key_pairs = (
            [strvals, strvals],         # list of 2 strings
            [cat, cat_from_codes],      # list of 2 cats (one from_codes)
            [cat_from_codes, strvals],  # one cat (from_codes) and one string
        )
        for pair in key_pairs:
            labels, values = ak.GroupBy(pair).nunique(idx)
            self.assertDictEqual(expected, to_tuple_dict(labels, values))
Exemple #3
0
def run_test(levels, verbose=False):
    '''
    Run ak.GroupBy and every ak.GroupBy.Reductions operator on a randomized
    set of arrays, comparing each result against the equivalent pandas
    groupby via groupby_to_arrays/compare_keys.

    :param levels: number of key levels to group on; must be 1 or 2
    :param verbose: if True, print per-operation progress and skip reasons
    :return: number of failed comparisons
    :raise ValueError: if levels is not 1 or 2
    '''
    d = make_arrays()
    df = pd.DataFrame(d)
    akdf = {k: ak.array(v) for k, v in d.items()}

    if levels == 1:
        akg = ak.GroupBy(akdf['keys'])
        keyname = 'keys'
    elif levels == 2:
        akg = ak.GroupBy([akdf['keys'], akdf['keys2']])
        keyname = ['keys', 'keys2']
    else:
        # Previously any other value left akg/keyname unbound and surfaced
        # later as a confusing NameError; fail fast instead.
        raise ValueError(f"levels must be 1 or 2, got {levels!r}")
    tests = 0
    failures = 0
    not_impl = 0
    if verbose: print("Doing .count()")
    tests += 1
    pdkeys, pdvals = groupby_to_arrays(df, keyname, 'int64', 'count', levels)
    akkeys, akvals = akg.count()
    akvals = akvals.to_ndarray()
    failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
    for vname in ('int64', 'float64', 'bool'):
        for op in ak.GroupBy.Reductions:
            if verbose: print(f"\nDoing aggregate({vname}, {op})")
            tests += 1
            do_check = True
            try:
                pdkeys, pdvals = groupby_to_arrays(df, keyname, vname, op, levels)
            except Exception:
                # pandas lacks this reduction; still exercise the arkouda side.
                if verbose: print("Pandas does not implement")
                do_check = False
            try:
                akkeys, akvals = akg.aggregate(akdf[vname], op)
                akvals = akvals.to_ndarray()
            except RuntimeError as E:
                if verbose: print("Arkouda error: ", E)
                not_impl += 1
                continue
            if not do_check:
                continue
            if op.startswith('arg'):
                # argmin/argmax indices may differ legitimately on ties, so
                # compare the extrema the indices point at, not the indices.
                pdextrema = df[vname][pdvals]
                akextrema = akdf[vname][ak.array(akvals)].to_ndarray()
                if not np.allclose(pdextrema, akextrema):
                    print("Different argmin/argmax: Arkouda failed to find an extremum")
                    print("pd: ", pdextrema)
                    print("ak: ", akextrema)
                    failures += 1
            else:
                failures += compare_keys(pdkeys, akkeys, levels, pdvals, akvals)
    print(f"{tests - failures - not_impl} / {tests - not_impl} passed, {failures} errors, {not_impl} not implemented")
    return failures
Exemple #4
0
    def setUp(self):
        """Build per-test fixtures: random bool/float arrays and an int GroupBy."""
        ArkoudaTest.setUp(self)

        # NOTE(review): the two randint calls are unseeded; their relative
        # order is kept as-is to avoid perturbing the server RNG stream.
        self.bvalues = ak.randint(0, 1, 10, dtype=bool)   # random bool pdarray
        self.fvalues = ak.randint(0, 1, 10, dtype=float)  # random float pdarray
        self.ivalues = ak.array([4, 1, 3, 2, 2, 2, 5, 5, 2, 3])
        # GroupBy over the fixed int values, used by dtype-validation tests.
        self.igb = ak.GroupBy(self.ivalues)
Exemple #5
0
 def test_zero_length_groupby(self):
     """
     GroupBy boundary condition on a zero-length pdarray, see Issue #900:
     building the GroupBy and touching its segments must not raise.
     """
     empty_gb = ak.GroupBy(ak.zeros(0, dtype=ak.int64))
     # Force materialization of segments; if this access is removed the
     # test no longer exercises the Issue #900 failure path.
     _ = str(empty_gb.segments)
Exemple #6
0
def run_test_groupby(strings, cat, akset):
    """Check GroupBy over Strings and Categorical against a python set.

    Asserts that unique_keys equals akset for both groupbys, that the two
    permutations agree, and that every segment contains exactly (and only)
    occurrences of its own key.
    """
    str_gb = ak.GroupBy(strings)
    cat_gb = ak.GroupBy(cat)
    # Unique keys should be same result as ak.unique
    assert akset == set(str_gb.unique_keys.to_ndarray())
    assert akset == set(cat_gb.unique_keys.to_ndarray())
    assert (cat_gb.permutation == str_gb.permutation).all()
    permuted = strings[str_gb.permutation].to_ndarray()
    # Segment lengths: difference of consecutive starts, with the total
    # size appended so the final segment gets a length too.
    starts = str_gb.segments.to_ndarray()
    seg_lens = np.diff(np.hstack((starts, np.array([str_gb.size]))))
    for key, start, seg_len in zip(str_gb.unique_keys.to_ndarray(),
                                   starts, seg_lens):
        stop = start + seg_len
        # All values in group should equal key
        assert (permuted[start:stop] == key).all()
        # Key should not appear anywhere outside of group
        assert not (permuted[:start] == key).any()
        assert not (permuted[stop:] == key).any()
Exemple #7
0
    def test_count(self):
        """GroupBy.count returns sorted unique keys and per-key counts."""
        data = ak.array([4, 1, 3, 2, 2, 2, 5, 5, 2, 3])
        keys, counts = ak.GroupBy(data).count()

        expected_keys = np.array([1, 2, 3, 4, 5])
        expected_counts = np.array([1, 4, 2, 1, 2])
        self.assertTrue((expected_keys == keys.to_ndarray()).all())
        self.assertTrue((expected_counts == counts.to_ndarray()).all())
Exemple #8
0
    def test_error_handling(self):
        """GroupBy.broadcast rejects a plain list with a TypeError."""
        raw = make_arrays()
        ak_cols = {name: ak.array(col) for name, col in raw.items()}
        grouping = ak.GroupBy([ak_cols['keys'], ak_cols['keys2']])

        with self.assertRaises(TypeError) as cm:
            grouping.broadcast([])
        expected_msg = ('type of argument "values" must be '
                        'arkouda.pdarrayclass.pdarray; got list instead')
        self.assertEqual(expected_msg, cm.exception.args[0])
Exemple #9
0
 def test_nunique_types(self):
     """GroupBy.nunique works for every key/value type combination."""
     string = ak.array(['a', 'b', 'a', 'b', 'c'])
     cat = ak.Categorical(string)
     ints = ak.array([5, 3, 5, 3, 1])
     # Each of the 3 groups ('a','b','c') maps to exactly one distinct value.
     expected = ak.array([1, 1, 1])
     # Try GroupBy.nunique with every combination of types, including mixed
     candidates = (string, cat, ints, (string, cat, ints))
     for grouping_key in candidates:
         grouped = ak.GroupBy(grouping_key)
         for values in candidates:
             _, nuniq = grouped.nunique(values)
             self.assertTrue((nuniq == expected).all())
Exemple #10
0
    def test_aggregate_strings(self):
        """nunique on a Strings-keyed GroupBy yields one count per label."""
        words = ak.array(['a', 'b', 'a', 'b', 'c'])
        positions = ak.arange(words.size)
        labels, values = ak.GroupBy(words).nunique(positions)

        expected = {'a': 2, 'b': 2, 'c': 1}
        # Pair each unique label with its nunique count.
        actual = dict(zip(labels.to_ndarray(), values.to_ndarray()))

        self.assertDictEqual(expected, actual)
Exemple #11
0
def run_test(verbose=True):
    '''
    Run ak.GroupBy and the OPS aggregations (mean, min, max, sum)
    on a randomized set of arrays including nan values, comparing each
    result against the equivalent pandas groupby.

    :param verbose: if True, print details for unimplemented ops/errors
    :return: number of failed comparisons
    '''

    d = make_arrays()
    df = pd.DataFrame(d)
    akdf = {k: ak.array(v) for k, v in d.items()}

    akg = ak.GroupBy(akdf['keys'])
    keyname = 'keys'

    tests = 0
    failures = 0
    not_impl = 0

    # Baseline count aggregation.
    # NOTE(review): these count results are never compared against pandas;
    # kept as-is to preserve the original test accounting.
    tests += 1
    pdkeys, pdvals = groupby_to_arrays(df, keyname, 'float64', 'count')
    akkeys, akvals = akg.count()
    akvals = akvals.to_ndarray()

    for op in OPS:
        tests += 1

        do_check = True
        try:
            pdkeys, pdvals = groupby_to_arrays(df, keyname, 'float64', op)
        except Exception:
            # pandas lacks this reduction; still exercise the arkouda side.
            if verbose: print("Pandas does not implement")
            do_check = False
        try:
            # Third argument True — presumably skipna; confirm against
            # ak.GroupBy.aggregate's signature.
            akkeys, akvals = akg.aggregate(akdf['float64'], op, True)
            akvals = akvals.to_ndarray()
        except RuntimeError as E:
            if verbose: print("Arkouda error: ", E)
            not_impl += 1
            continue
        if not do_check:
            continue

        # Clear out any nans to match ak implementation (vectorized mask
        # assignment replaces the original per-element python loop).
        pdvals[np.isnan(pdvals)] = 0.0
        failures += compare_keys(pdkeys, akkeys, pdvals, akvals)

    return failures
Exemple #12
0
 def testPrecision(self):
     """
     Regression test for https://github.com/Bears-R-Us/arkouda/issues/964:
     grouped sum was exacerbating floating point errors. Verifies the fix
     by checking that grouped means over equal int and float data agree.
     """
     n = 10**6
     ngroups = n // 10
     # Upper bound keeps per-group int sums inside int64 range.
     upper = 2**63 // n
     groupnum = ak.randint(0, ngroups, n, seed=1)
     intval = ak.randint(0, upper, n, seed=2)
     floatval = ak.cast(intval, ak.float64)
     grouped = ak.GroupBy(groupnum)
     _, intmean = grouped.mean(intval)
     _, floatmean = grouped.mean(floatval)
     mse = ak.mean((intmean - floatmean) ** 2)
     self.assertTrue(np.isclose(mse, 0.0))
Exemple #13
0
    def test_broadcast_booleans(self):
        """broadcast expands a per-group boolean back over all elements.

        Bug fix: the three broadcast checks previously passed an
        (expected, actual) TUPLE to assertTrue -- a non-empty tuple is
        always truthy, so those assertions could never fail. They are now
        real element-wise comparisons. Expected arrays are in the
        permuted (grouped) order that broadcast produces.
        """
        values = ak.array([4, 1, 3, 2, 2, 2, 5, 5, 2, 3])
        gb = ak.GroupBy(values)
        keys, counts = gb.count()

        self.assertTrue((np.array([1, 4, 2, 1, 2]) == counts.to_ndarray()).all())
        self.assertTrue((np.array([1, 2, 3, 4, 5]) == keys.to_ndarray()).all())

        # Only key 2 has count > 2 (count 4): positions 1-4 in grouped order.
        results = gb.broadcast(counts > 2)
        self.assertTrue(
            (np.array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0]) == results.to_ndarray()).all())

        # Keys 3 and 5 have count == 2.
        results = gb.broadcast(counts == 2)
        self.assertTrue(
            (np.array([0, 0, 0, 0, 0, 1, 1, 0, 1, 1]) == results.to_ndarray()).all())

        # Every key except key 2 has count < 4.
        results = gb.broadcast(counts < 4)
        self.assertTrue(
            (np.array([1, 0, 0, 0, 0, 1, 1, 1, 1, 1]) == results.to_ndarray()).all())
Exemple #14
0
    def test_error_handling(self):
        """Every GroupBy entry point rejects unsupported inputs with TypeError."""
        d = make_arrays()
        akdf = {k: ak.array(v) for k, v in d.items()}
        gb = ak.GroupBy([akdf['keys'], akdf['keys2']])

        groupby_msg = 'GroupBy only supports pdarrays with a dtype int64'
        broadcast_msg = ('type of argument "values" must be '
                         'arkouda.pdarrayclass.pdarray; got list instead')
        nunique_msg = 'the pdarray dtype must be int64'
        any_msg = 'any is only supported for pdarrays of dtype bool'
        all_msg = 'all is only supported for pdarrays of dtype bool'

        # Each entry: (callable that must raise TypeError, exact message).
        # Order matches the original one-stanza-per-check layout.
        cases = [
            (lambda: ak.GroupBy(self.bvalues), groupby_msg),
            (lambda: ak.GroupBy(self.fvalues), groupby_msg),
            (lambda: gb.broadcast([]), broadcast_msg),
            (lambda: self.igb.nunique(ak.randint(0, 1, 10, dtype=bool)),
             nunique_msg),
            (lambda: self.igb.nunique(ak.randint(0, 1, 10, dtype=float64)),
             nunique_msg),
            (lambda: self.igb.any(ak.randint(0, 1, 10, dtype=float64)),
             any_msg),
            (lambda: self.igb.any(ak.randint(0, 1, 10, dtype=int64)),
             any_msg),
            (lambda: self.igb.all(ak.randint(0, 1, 10, dtype=float64)),
             all_msg),
            (lambda: self.igb.all(ak.randint(0, 1, 10, dtype=int64)),
             all_msg),
            (lambda: self.igb.min(ak.randint(0, 1, 10, dtype=bool)),
             'min is only supported for pdarrays of dtype float64 and int64'),
            (lambda: self.igb.max(ak.randint(0, 1, 10, dtype=bool)),
             'max is only supported for pdarrays of dtype float64 and int64'),
            (lambda: self.igb.argmin(ak.randint(0, 1, 10, dtype=bool)),
             'argmin is only supported for pdarrays of dtype float64 and int64'),
            (lambda: self.igb.argmax(ak.randint(0, 1, 10, dtype=bool)),
             'argmax is only supported for pdarrays of dtype float64 and int64'),
        ]
        for trigger, expected_msg in cases:
            with self.assertRaises(TypeError) as cm:
                trigger()
            self.assertEqual(expected_msg, cm.exception.args[0])
Exemple #15
0
        assert (word in more_words)
    # Exhaustively find all matches to make sure we didn't miss any
    inds = ak.zeros(strings.size, dtype=ak.bool)
    for word in more_words:
        inds |= (strings == word)
    assert ((inds == matches).all())
    print("in1d and iter passed")

    # argsort
    test_argsort(strings, test_strings, cat)

    # unique
    test_unique(strings, test_strings, cat)

    # groupby: Strings and Categorical groupbys must agree with each other
    g = ak.GroupBy(strings)
    gc = ak.GroupBy(cat)
    # Unique keys should be same result as ak.unique
    assert (akset == set(g.unique_keys.to_ndarray()))
    assert (akset == set(gc.unique_keys.to_ndarray()))
    assert ((gc.permutation == g.permutation).all())
    permStrings = strings[g.permutation]
    # Check each group individually
    # Segment lengths: diff of segment starts with the total size appended
    lengths = np.diff(np.hstack((g.segments.to_ndarray(), np.array([g.size]))))
    # NOTE(review): this zip iterates g.unique_keys / g.segments server-side
    # objects directly rather than via to_ndarray() -- confirm that
    # iteration over these types is intended here.
    for uk, s, l in zip(g.unique_keys, g.segments, lengths):
        # All values in group should equal key
        assert ((permStrings[s:s + l] == uk).all())
        # Key should not appear anywhere outside of group
        assert (not (permStrings[:s] == uk).any())
        assert (not (permStrings[s + l:] == uk).any())
    print("groupby passed")
Exemple #16
0
 def test_type_failure_multilevel_groupby_aggregate(self):
     """Regression smoke test: multilevel groupby min must not raise."""
     # just checking no error occurs with hotfix for Issue 858
     level_keys = [ak.randint(0, 10, 100), ak.randint(0, 10, 100)]
     grouped = ak.GroupBy(level_keys)
     grouped.min(ak.randint(0, 10, 100))
Exemple #17
0
if __name__ == '__main__':
    import sys

    # CLI: server port strategy length num_keys num_vals
    if len(sys.argv) != 7:
        print(
            f"Usage: {sys.argv[0]} <server> <port> <strategy (0=global, 1=perLocale)> <length> <num_keys> <num_vals>"
        )
        sys.exit()
    per_locale = sys.argv[3] == '1'
    print("per_locale = ", per_locale)
    length = int(sys.argv[4])
    print("length     = ", length)
    nkeys = int(sys.argv[5])
    print("nkeys      = ", nkeys)
    nvals = int(sys.argv[6])
    print("nvals      = ", nvals)
    ak.connect(sys.argv[1], int(sys.argv[2]))

    def report_elapsed(t0):
        # Print seconds since t0 followed by a blank separator line.
        print(f"{time() - t0:.2f} seconds", end="\n\n")

    print("Generating keys and vals...")
    t0 = time()
    keys, vals = generate_arrays(length, nkeys, nvals)
    report_elapsed(t0)
    print("GroupBy...")
    t0 = time()
    g = ak.GroupBy(keys, per_locale)
    report_elapsed(t0)
    for op in OPERATORS:
        print(f"Aggregate('{op}') ...")
        t0 = time()
        uk, rv = g.aggregate(vals, op)
        report_elapsed(t0)
    sys.exit()