def test_qcut(self):
    """Check the bin codes, categories and return types produced by ``qcut``.

    Covers integer bin counts, explicit quantile lists, and the ``labels``
    argument given as default / ``True`` / ``None`` / a list / ``False``.
    """
    # Integer bin counts.
    by_count = qcut(arange(10), 3)
    self.assertTrue(
        sum(by_count._np - FA([2, 2, 2, 2, 3, 3, 3, 4, 4, 4])) == 0)
    by_count = qcut(arange(11), 3)
    self.assertTrue(
        sum(by_count._np - FA([2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4])) == 0)
    by_count = qcut(range(5), 3, labels=["good", "medium", "bad"])
    self.assertTrue(sum(by_count._np - FA([2, 2, 3, 4, 4])) == 0)

    # Explicit quantile lists: each case maps a row position to the code
    # expected at that position.
    quantile_cases = [
        ([0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95],
         {0: 1, 5: 2, 94: 7, 95: 1}),
        ([0.00, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95],
         {0: 2, 5: 2, 94: 7, 95: 1}),
    ]
    for quantiles, code_at in quantile_cases:
        by_quantile = qcut(arange(100.0), quantiles)
        for position, code in code_at.items():
            self.assertTrue(by_quantile._np[position] == code)

    # Omitting labels, labels=True and labels=None must all yield the same
    # Categorical with generated range labels.
    generated_labels = [
        b'Clipped', b'0.0->1.0', b'1.0->2.0', b'2.0->3.0', b'3.0->4.0'
    ]
    for label_kwargs in ({}, {'labels': True}, {'labels': None}):
        quartiles = qcut(range(5), 4, **label_kwargs)
        self.assertIsInstance(quartiles, Categorical)
        self.assertTrue(sum(quartiles._np - FA([2, 2, 3, 4, 5])) == 0)
        self.assertTrue((quartiles.category_array == generated_labels).all())

    # Caller-supplied labels.
    named = qcut(range(5), 3, labels=["good", "medium", "bad"])
    self.assertIsInstance(named, Categorical)
    self.assertTrue(sum(named._np - FA([2, 2, 3, 4, 4])) == 0)
    matches = [
        (got == want) for (got, want) in zip(
            named.expand_array.astype('U'),
            ['good', 'good', 'medium', 'bad', 'bad'],
        )
    ]
    self.assertTrue(np.array(matches).all())
    self.assertTrue(
        (named.category_array == [[b'Clipped', b'good', b'medium',
                                   b'bad']]).all())

    # labels=False returns the raw codes as a FastArray.
    raw_codes = qcut(range(5), 4, labels=False)
    self.assertIsInstance(raw_codes, FastArray)
    self.assertTrue(sum(raw_codes._np - FA([2, 2, 3, 4, 5])) == 0)
def test_ops(self):
    """Group a 300000-row Dataset by a three-valued key and run reductions.

    The mean of the float column must equal the mean of the matching integer
    column; ``median``, ``trimbr`` and ``nanmedian`` are only called, their
    results are not inspected.
    """
    columns = {
        'test': arange(300000) % 3,
        'test2': arange(300000.0),
        'test2i': arange(300000),
        'test3': arange(300000) % 3,
    }
    grouped = Dataset(columns).groupby('test')

    means = grouped.mean()
    for group in (0, 1):
        self.assertTrue(means.test2[group] == means.test2i[group])
    # 'test3' repeats the key values, so the second group's mean is 1.0.
    self.assertTrue(means.test3[1] == 1.0)

    # Smoke calls only.
    grouped.median()
    grouped.trimbr()
    grouped.nanmedian()
def test_cumcount_vs_gb(self):
    """Compare ``cumcount`` from a Dataset groupby with ``Categorical.cumcount``.

    Also checks that with a boolean ``filter`` the rows where the filter is
    True are not NaN and the remaining rows are NaN.
    """
    nrows = 50
    keys = np.random.choice(['a', 'b', 'c', 'd', 'e'], nrows)
    ds = Dataset({
        'keycol': keys,
        'col1': arange(nrows),
        'col2': arange(nrows),
    })

    from_groupby = ds.gb('keycol').cumcount()
    cat = Categorical(ds.keycol)
    from_categorical = cat.cumcount()
    assert sum(from_groupby - from_categorical) == 0

    # Filter that is True on odd row positions.
    odd_rows = logical(arange(nrows) % 2)
    filtered = cat.cumcount(filter=odd_rows)
    assert bool(np.all(isnotnan(filtered[odd_rows])))
    assert bool(np.all(isnan(filtered[~odd_rows])))
def test_extract_groups(self):
    """Extract the data elements that belong to a masked subset of groups."""
    ## Case 1: Basic operation.
    # Key data in which the value k appears k times, for k = 1..6.
    keys = rt.FA([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6])
    grouping = rt.Grouping(keys)

    # A data array with one element per key element.
    values = rt.arange(len(keys))

    # Boolean mask with one entry per ncountgroup entry, True at even positions.
    n_entries = len(grouping.ncountgroup)
    even_entries = rt.arange(n_entries) % 2 == 0

    # Pull out the data elements that fall in the selected groups.
    extracted = Grouping.extract_groups(
        even_entries,
        values,
        grouping.ncountgroup,
        grouping.ifirstgroup,
    )
    wanted = rt.FA([1, 2, 6, 7, 8, 9, 15, 16, 17, 18, 19, 20])
    assert_array_equal(wanted, extracted)
def test_searchsorted(self):
    """Check that ``rt.searchsorted`` agrees with ``np.searchsorted``.

    The values being looked up include ``-inf``, ``inf``, ``nan`` and numbers
    outside the range of the sorted array. Both ``side='left'`` and
    ``side='right'`` are compared against NumPy's result.
    """
    sorted_values = rt.arange(10.0)
    needles = rt.arange(20.0) / 2
    needles[3] = -np.inf
    needles[7] = np.inf
    needles[5] = np.nan
    needles[2] = 100
    needles[1] = -100

    for side in ('left', 'right'):
        # NumPy is the reference on both sides; comparing rt.searchsorted
        # with itself would pass no matter what it returned.
        expected = np.searchsorted(sorted_values, needles, side=side)
        actual = rt.searchsorted(sorted_values, needles, side=side)
        assert sum(expected - actual) == 0
def test_pad(self):
    """Check per-group ``pad`` / ``backfill`` of NaNs and ``get_group``.

    The 'symbol' key cycles through five symbols, so row 7 shares a group
    with rows 2 and 12, while row 3 has no earlier row in its group.
    """
    arrsize = 100
    numrows = 20
    symbols = ['ZYGO', 'YHOO', 'FB', 'GOOG', 'IBM']

    ds = Dataset({'time': arange(arrsize * 1.0)})
    ds.data = np.random.randint(numrows, size=arrsize)
    ds.data2 = np.random.randint(numrows, size=arrsize)
    ds.symbol2 = Cat(1 + ds.data, list('ABCDEFGHIJKLMNOPQRST'))
    ds.symbol = Cat(1 + arange(arrsize) % len(symbols), symbols)
    ds.time[[3, 4, 7]] = nan

    padded = ds.gb('symbol').pad()
    # Row 7 is filled from row 2 of the same group.
    self.assertTrue(padded.time[7] == 2.00)
    # Row 3 has nothing before it in its group; NaN != NaN confirms it stays NaN.
    self.assertTrue(padded.time[3] != padded.time[3])

    backfilled = ds.gb('symbol').backfill()
    # Row 7 is filled from row 12 of the same group.
    self.assertTrue(backfilled.time[7] == 12.00)

    # A single group can be pulled out by its key.
    yhoo_rows = ds.gb('symbol').get_group('YHOO')
    self.assertTrue(np.all(yhoo_rows.symbol == 'YHOO'))
def test_iter(self):
    """Iterate a groupby and check each yielded key and its row indices.

    Groups are expected in order of first appearance of the key.
    """
    expected_keys = FastArray(['e', 'd', 'b', 'c', 'a'])
    expected_rows = [[0, 1, 4, 7], [2, 9], [3], [5, 6], [8]]

    keys = FastArray(['e', 'e', 'd', 'b', 'e', 'c', 'c', 'e', 'a', 'd'])
    ds = Dataset({'keycol': keys, 'idxcol': arange(10)})
    grouped = ds.gb('keycol')

    for position, item in enumerate(grouped):
        group_key = item[0]
        group_ds = item[1]
        self.assertEqual(group_key, expected_keys[position])
        rows_match = np.all(group_ds.idxcol == expected_rows[position])
        self.assertTrue(bool(rows_match))
def test_shift(self):
    """
    Test that Categorical.shift shifts the values in an array or Dataset *per group*.
    """
    # Each case pairs the values to shift with the values expected at
    # positions 1 and 2 of the first returned column.
    cases = (
        (arange(4), (0, 1)),
        ([5, 6, 7, 8], (5, 6)),
    )
    for values, (want_at_1, want_at_2) in cases:
        shifted = rt.Cat([1, 1, 1, 2]).shift(values, window=1)[0]
        assert shifted[1] == want_at_1
        assert shifted[2] == want_at_2
def test_apply_nonreduce(self):
    """Exercise ``apply_reduce`` / ``apply_nonreduce`` on a Categorical and an accum2.

    The only asserted property is that an element-wise two-argument function
    passed to ``apply_nonreduce`` gives the same result as calling it directly
    on the whole columns; the other calls are smoke tests.
    """
    arrsize = 200
    numrows = 7
    # Five ticker-like names followed by the strings '6' .. '18'.
    symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
    symbols += [str(number) for number in range(6, 19)]

    ds = rt.Dataset({'time': rt.arange(arrsize * 1.0)})
    ds.data = arange(arrsize) % numrows
    ds.data2 = (arange(arrsize) + 3) % numrows
    ds.symbol = rt.Cat(1 + rt.arange(arrsize) % len(symbols), symbols)

    # Smoke call: two-argument reduction on the Categorical itself.
    ds.symbol.apply_reduce(
        lambda x, y: np.sum(np.minimum(x, y)),
        (ds.data, ds.data),
    )

    accum = ds.accum2('symbol', 'data')
    cumulative = accum.apply_nonreduce(np.cumsum)

    # Smoke call: two-argument reduction over the cumulative columns.
    accum.apply_reduce(
        lambda x, y: np.sum(np.maximum(x, y)),
        (cumulative.data, cumulative.data2),
    )

    direct = np.maximum(cumulative.data, cumulative.data2)
    via_apply = accum.apply_nonreduce(
        lambda x, y: np.maximum(x, y),
        (cumulative.data, cumulative.data2),
    )[0]
    self.assertTrue(np.all(direct == via_apply))
def test_transform(self):
    """Check ``transform=True`` sums and per-group ``diff``.

    The groupby sum must match the Categorical sum (with and without
    ``showfilter``), and ``apply_nonreduce`` with ``FastArray.diff`` must
    match ``diff`` on the groupby.
    """
    arrsize = 200
    numrows = 7
    symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']

    ds = Dataset({'time': arange(arrsize * 1.0)})
    ds.data = np.random.randint(numrows, size=arrsize)
    ds.data2 = np.random.randint(numrows, size=arrsize)
    ds.symbol = Cat(1 + arange(arrsize) % len(symbols), symbols)

    # The groupby keys are not returned with transform=True, so only the
    # summed column is compared, not the symbol column.
    gb_sum = ds.gb('symbol')['data'].sum(transform=True)

    cat_sum = ds.symbol.sum(ds.data, transform=True)
    self.assertTrue(np.all(gb_sum[0] == cat_sum[0]))

    # Same comparison with showfilter enabled.
    cat_sum = ds.symbol.sum(ds.data, showfilter=True, transform=True)
    self.assertTrue(np.all(gb_sum[0] == cat_sum[0]))

    # diff through apply_nonreduce versus the groupby diff method.
    diff_via_apply = ds.gb('symbol').apply_nonreduce(TypeRegister.FastArray.diff)
    diff_direct = ds.gb('symbol').diff()
    self.assertTrue(diff_via_apply.equals(diff_direct))
def test_gb_labels_enum(self):
    """Label columns for a mapped (enum-style) Categorical must agree.

    The labels from ``Categorical.count`` and from ``Dataset.gbu(...).count``
    must have the same dtype character and the same values, so the groupby
    keys show up as strings rather than integer codes.
    """
    codes = [10, 10, 10, 20, 30, 20, 10, 20, 20]
    name_to_code = {'a': 30, 'b': 20, 'c': 10}
    cat = Categorical(codes, name_to_code)

    cat_counts = cat.count()
    cat_labels = cat_counts[cat_counts.label_get_names()][0]

    ds = Dataset({'catcol': cat, 'data': arange(9)})
    ds_counts = ds.gbu('catcol').count()
    ds_labels = ds_counts[ds_counts.label_get_names()][0]

    assert cat_labels.dtype.char == ds_labels.dtype.char
    assert bool(np.all(cat_labels == ds_labels))
def test_extract_groups_all_groups_off(self):
    """Test for Grouping.extract_groups() when given a condition mask with all values set to False."""
    # Create a grouping from some data.
    key_data1 = rt.FA([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6])
    g1 = rt.Grouping(key_data1)

    # Create another data array the same length as the key for the Grouping.
    data1 = rt.arange(len(key_data1))

    # Create a condition mask with all values set to False.
    # The builtin ``bool`` is used as the dtype: the ``np.bool`` alias was
    # deprecated in NumPy 1.20 and removed in 1.24.
    group_mask1 = rt.zeros(len(g1.ncountgroup), dtype=bool)

    # Extract elements from the data array with all groups masked out -- i.e. we're trying
    # to select data from none of the groups.
    result1 = Grouping.extract_groups(group_mask1, data1, g1.ncountgroup, g1.ifirstgroup)
    assert_array_equal(rt.FA([]), result1)
'min', 'max', 'var', 'std', 'nansum', 'nanmean', 'nanmin', 'nanmax', 'nanvar', 'nanstd', ] gb_funcs_L2 = ['first', 'last', 'median', 'mode', 'nanmedian'] gb_funcs_L3 = ['cumsum', 'cumprod'] all_gb_ops = gb_funcs_L1 + gb_funcs_L2 + gb_funcs_L3 even_filter = logical(arange(30) % 2) d_filter = str_fa != b'd' d_filter_results = [ 0, np.nan, np.nan, np.nan, np.nan, np.nan, 0, np.nan, np.nan, np.nan, np.nan, np.nan,
def test_cut(self):
    """Check the bin codes, categories and return types produced by ``cut``.

    Covers integer bin counts, explicit bin edges, the ``labels`` argument in
    its various forms, ``retbins=True``, a strided (non-contiguous) input and
    bin edges that include infinities.
    """
    # Integer bin counts on int and float input.
    for data in (arange(10), arange(10.0)):
        binned = cut(data, 3)
        self.assertTrue(
            sum(binned._np - FA([1, 1, 1, 1, 2, 2, 2, 3, 3, 3])) == 0)
    binned = cut(arange(11), 3)
    self.assertTrue(
        sum(binned._np - FA([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3])) == 0)

    # Explicit bin edges, without and with caller-supplied labels.
    binned = cut(FA([2, 4, 6, 8, 10]), FA([0, 2, 4, 6, 8, 10]))
    self.assertTrue(sum(binned._np - FA([1, 2, 3, 4, 5])) == 0)
    binned = cut(
        FA([2, 4, 6, 8, 10]),
        FA([0, 2, 4, 6, 8, 10]),
        labels=['a', 'b', 'c', 'd', 'e'],
    )
    self.assertTrue(sum(binned._np - FA([1, 2, 3, 4, 5])) == 0)

    sample = np.array([1, 7, 5, 4, 6, 3])
    range_labels = FA([b'1.0->3.0', b'3.0->5.0', b'5.0->7.0'])

    # Omitting labels, labels=True and labels=None must all yield the same
    # Categorical with generated range labels.
    for label_kwargs in ({}, {'labels': True}, {'labels': None}):
        binned = cut(sample, 3, **label_kwargs)
        self.assertIsInstance(binned, Categorical)
        self.assertTrue(sum(binned._np - FA([1, 3, 2, 2, 3, 1])) == 0)
        self.assertTrue((binned.category_array == range_labels).all())

    # labels=False returns the raw codes as a FastArray.
    raw_codes = cut(sample, 3, labels=False)
    self.assertIsInstance(raw_codes, FastArray)
    self.assertTrue(sum(raw_codes._np - FA([1, 3, 2, 2, 3, 1])) == 0)

    # retbins=True additionally returns the bin edges.
    binned, edges = cut(sample, 3, retbins=True)
    self.assertIsInstance(binned, Categorical)
    self.assertIsInstance(edges, np.ndarray)
    self.assertTrue(sum(binned._np - FA([1, 3, 2, 2, 3, 1])) == 0)
    self.assertTrue((binned.category_array == range_labels).all())
    self.assertTrue(sum(edges - FA([1.0, 3.0, 5.0, 7.0])) == 0)

    # Caller-supplied labels become the categories.
    named_labels = ["bad", "medium", "good"]
    binned = cut(sample, 3, labels=named_labels)
    self.assertIsInstance(binned, Categorical)
    self.assertTrue(sum(binned._np - FA([1, 3, 2, 2, 3, 1])) == 0)
    self.assertTrue((binned.category_array == named_labels).all())

    # contiguous test: the input is a column slice of a 2x2 array.
    grid = arange(4).reshape(2, 2)
    edges = [-0.5, 0.5, 1.5, 2.5, 3.5]
    binned = cut(grid[:, 1], edges)
    edge_labels = FastArray(
        [b'-0.5->0.5', b'0.5->1.5', b'1.5->2.5', b'2.5->3.5'])
    self.assertTrue((binned.category_array == edge_labels).all())

    # inf upcast test: integer data with infinite outer bin edges.
    int_data = np.array([0, 1, 10, 100, 5])
    edges = [-np.inf, 2, 11, 50, np.inf]
    binned = cut(int_data, edges)
    self.assertTrue((binned._fa == FA([1, 1, 2, 4, 2])).all())
["alpha", "beta", "gamma", "delta", "epsilon", "zeta"]) }), "ds_beta": Dataset({ k: list(range(i * 10, (i + 1) * 10)) for i, k in enumerate( ["eta", "theta", "iota", "kappa", "lambada", "mu"]) }), }), Struct({ "alpha": 1, "beta": [2, 3], "gamma": ['2', '3'], "delta": arange(10), "epsilon": Struct({ "theta": Struct({ "kappa": 3, "zeta": 4, }), "iota": 2, }), }), ] @contextmanager def greedy_completion(): ip = get_ipython()