def test_apply(self):
    """Smoke-test Accum2 reductions with and without ``filter``/``showfilter``.

    No results are asserted; the test only checks that every call below
    completes without raising.
    """
    arrsize = 200
    numrows = 7
    symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']

    ds = Dataset({'time': arange(arrsize * 1.0)})
    ds.data = np.random.randint(numrows, size=arrsize)
    ds.data2 = np.random.randint(numrows, size=arrsize)
    ds.symbol = Cat(1 + arange(arrsize) % len(symbols), symbols)

    # Built-in reductions.
    ds.accum2('symbol', 'data').sum(ds.data2)
    ds.accum2('symbol', 'data').sum(ds.data2, showfilter=True)
    ds.accum2('symbol', 'data').median(ds.data2, showfilter=True)
    ds.accum2('symbol', 'data').median(ds.data2, showfilter=False)

    # User-supplied reduction.
    ds.accum2('symbol', 'data').apply_reduce(np.median, ds.data2, showfilter=True)
    ds.accum2('symbol', 'data').apply_reduce(np.median, ds.data2, showfilter=False)

    # Boolean mask selecting every other row. Its length is derived from
    # arrsize so it always matches the dataset length.
    f = logical(arange(arrsize) % 2)
    ds.accum2('symbol', 'data').apply_reduce(np.median, ds.data2, filter=f)
    ds.accum2('symbol', 'data').apply_reduce(np.median, ds.data2, filter=f, showfilter=True)
    ds.accum2('symbol', 'data').median(ds.data2, filter=f, showfilter=True)
def test_simple_cats(self):
    """Sum over two five-element single-key categoricals without a filter.

    Expects seven columns in the result and, for column ``i``, the ``i``-th
    input value at row ``i``.
    """
    values = arange(1, 6) * 10
    colnames = FastArray(['a', 'b', 'c', 'd', 'e'])
    col_cat = Categorical(colnames)
    row_cat = Categorical(arange(5))

    # no filter
    accum = Accum2(row_cat, col_cat)
    result = accum.sum(values)
    self.assertEqual(result._ncols, 7)

    for pos, name in enumerate(colnames):
        column = result[name]
        self.assertEqual(column[pos], values[pos])
def test_simple_cats_filter_accum(self):
    """Sum over two single-key categoricals with ``showfilter=True`` on Accum2.

    Expects eight columns in the result and, for column ``i``, the ``i``-th
    input value at row ``i + 1`` (presumably because the filtered row comes
    first -- confirm against Accum2).
    """
    values = arange(1, 6) * 10
    colnames = FastArray(['a', 'b', 'c', 'd', 'e'])
    col_cat = Categorical(colnames)
    row_cat = Categorical(arange(5))

    # filtered accum object
    accum = Accum2(row_cat, col_cat, showfilter=True)
    result = accum.sum(values)
    self.assertEqual(result._ncols, 8)

    for pos, name in enumerate(colnames):
        column = result[name]
        self.assertEqual(column[pos + 1], values[pos])
def test_accum2_median(self):
    """Check Accum2.median's footer and summary column against direct medians.

    The footer entry at ``i + 1`` must equal the median of ``time`` over the
    rows of symbol ``i``; the summary column entry ``k`` must equal the
    median of ``time`` over the rows where ``data == k``.
    """
    ds = Dataset({'time': arange(200.0)})
    ds.data = np.random.randint(7, size=200)
    ds.data2 = np.random.randint(7, size=200)
    symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
    ds.symbol = Cat(1 + arange(200) % 5, symbols)

    medians = Accum2(ds.data, ds.symbol).median(ds.time)
    summary_name = medians.summary_get_names()[0]
    totalcol = medians[summary_name]
    footer = medians.footer_get_values()['Median']

    # Per-symbol medians live in the footer, offset by one.
    for pos, sym in enumerate(symbols):
        expected = ds[ds.symbol == sym, :].time.median()
        self.assertEqual(footer[pos + 1], expected)

    # Per-data-value medians live in the summary column.
    for key in range(7):
        expected = ds[ds.data == key, :].time.median()
        self.assertEqual(totalcol[key], expected)
def test_col_moves(self):
    """Exercise Struct.col_move_to_front / col_move_to_back / col_move.

    Columns are single letters 'a'..'p'; after each group of moves the
    column order is compared with the expected letter sequence.
    """
    columns = {}
    for pos, letter in enumerate('abcdefghijklmnop'):
        columns[letter] = list(range(pos * 10, pos * 10 + 10))
    st = Struct(columns)

    def assert_order(expected):
        # Iterating the Struct yields its column names in order.
        self.assertEqual(list(st), list(expected))

    # Move by integer position, then undo.
    st.col_move_to_front(1)
    assert_order('bacdefghijklmnop')
    st.col_move_to_front(1)

    st.col_move_to_back(14)
    assert_order('abcdefghijklmnpo')
    st.col_move_to_back(14)

    # An index array longer than the number of columns is rejected.
    with self.assertRaises(ValueError):
        st.col_move_to_front(arange(20))

    # Move by lists of names.
    st.col_move_to_back(list('dgh'))
    assert_order('abcefijklmnopdgh')

    st.col_move_to_front(list('gpha'))
    assert_order('gphabcefijklmnod')

    st.col_move(list('cim'), list('hfo'))
    assert_order('cimgpabejklndhfo')

    # Move by dict key and by single name; an unknown name only warns.
    st.col_move_to_front({'g': 1})
    st.col_move_to_front('h')
    with self.assertWarns(UserWarning):
        st.col_move_to_front('q')
    assert_order('hgcimpabejklndfo')

    st.col_move_to_back({'g': 1})
    st.col_move_to_back('h')
    with self.assertWarns(UserWarning):
        st.col_move_to_back('q')
    assert_order('cimpabejklndfogh')
def test_qcut(self):
    """Check the bin codes produced by ``qcut`` (and one labelled ``cut``).

    Codes are compared element by element. The previous check,
    ``sum(codes - expected) == 0``, could pass when individual errors
    cancelled each other out.
    """
    c = qcut(arange(10), 3)
    self.assertEqual(c._np.tolist(), [2, 2, 2, 2, 3, 3, 3, 4, 4, 4])

    c = qcut(arange(11), 3)
    self.assertEqual(c._np.tolist(), [2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4])

    c = qcut(range(5), 3, labels=["good", "medium", "bad"])
    self.assertEqual(c._np.tolist(), [2, 2, 3, 4, 4])

    c = cut(
        FA([2, 4, 6, 8, 10]),
        FA([0, 2, 4, 6, 8, 10]),
        labels=['a', 'b', 'c', 'd', 'e'],
    )
    # This result was previously computed but never checked; the expected
    # codes are the ones the same call is asserted to produce in test_accum2.
    self.assertEqual(c._np.tolist(), [1, 2, 3, 4, 5])
def test_multikey_cats_filter_accum_ordered(self):
    """Sum over multikey categoricals with ``showfilter=True``.

    Runs once with default categoricals and once with ``ordered=True``.
    Result columns are looked up by the string form of each
    ``(str, int)`` key pair, and the value is expected at row ``i + 1``.
    """
    unsorted_str = FastArray(['c', 'e', 'b', 'd', 'a'])
    ints = arange(1, 6) * 10
    data = np.random.rand(5) * 10

    # unsorted filter accum object
    c1 = Categorical([unsorted_str, ints])
    c2 = Categorical([unsorted_str, ints])
    result = Accum2(c2, c1).sum(data, showfilter=True)
    self.assertEqual(result._ncols, 9)
    for pos, raw_key in enumerate(unsorted_str):
        text_key = raw_key.decode()
        num_key = ints[pos]
        full_colname = f"('{text_key}', {num_key!s})"
        column = result[full_colname]
        self.assertEqual(column[pos + 1], data[pos])

    # sorted filter accum object
    sortidx = np.argsort(unsorted_str)
    sorted_str = unsorted_str[sortidx]
    sorted_ints = ints[sortidx]
    sorted_data = data[sortidx]

    c1 = Categorical([unsorted_str, ints], ordered=True)
    c2 = Categorical([unsorted_str, ints], ordered=True)
    result = Accum2(c2, c1).sum(data, showfilter=True)
    self.assertEqual(result._ncols, 9)
    for pos, raw_key in enumerate(sorted_str):
        text_key = raw_key.decode()
        num_key = sorted_ints[pos]
        full_colname = f"('{text_key}', {num_key!s})"
        column = result[full_colname]
        self.assertEqual(column[pos + 1], sorted_data[pos])
def test_ismember_categorical_numeric(self):
    """Check ``ismember`` between a numeric Categorical and a FastArray.

    For both integer and float categories:
      * every element except the last (value 4) is found in ``[1, 2, 3]``;
      * the returned index matches the tiled ``[0, 1, 2]`` pattern and is
        NaN/invalid for the last element;
      * comparing against a string array raises ``TypeError``.
    Finally, a multikey Categorical against a string array must also raise
    ``TypeError``.
    """
    numeric_cases = (
        ([1, 2, 3, 1, 2, 3, 1, 2, 4], np.int64),
        ([1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 4.0], np.float64),
    )
    for values, dtype in numeric_cases:
        c = Categorical(values)
        f = FastArray([1, 2, 3], dtype=dtype)
        b, idx = ismember(c, f)
        self.assertTrue(bool(np.all(b[:-1])))
        # The original passed ``False`` as a second argument, which
        # assertFalse treats as the failure message, not an expected value.
        self.assertFalse(b[-1])
        self.assertTrue(bool(np.all(idx[:-1] == tile(FA([0, 1, 2]), 3)[:-1])))
        self.assertTrue(idx.isnan()[-1])

        f = FastArray(['a', 'b', 'c'])
        with pytest.raises(TypeError):
            b, idx = ismember(c, f)

    # Multikey categorical vs. the string array left in ``f`` by the loop.
    c = Categorical([np.random.choice(['a', 'b', 'c'], 10), arange(10)])
    with pytest.raises(TypeError):
        b, idx = ismember(c, f)
def test_col_ctor_02(self):
    """Build a Struct whose keys are 'True', 'False' and 'None'.

    Construction is expected to emit a ``UserWarning`` while still keeping
    the keys, in order, and their values.
    """
    inv_keys = ['True', 'False', 'None']
    arr = arange(5)
    # Every key maps to the same array object.
    inv_dict = dict.fromkeys(inv_keys, arr)

    with self.assertWarns(UserWarning):
        st = Struct(inv_dict)

    keys_match = inv_keys == list(st)
    self.assertTrue(bool(np.all(keys_match)))
    for key in inv_keys:
        values_match = st[key] == arr
        self.assertTrue(bool(np.all(values_match)))
def test_accum2(self):
    """Check the bin codes produced by ``cut`` for several inputs.

    Despite its name, this test only exercises ``cut``. Codes are compared
    element by element; the previous check, ``sum(codes - expected) == 0``,
    could pass when individual errors cancelled each other out.
    """
    # Integer and float inputs split into three bins.
    c = cut(arange(10), 3)
    self.assertEqual(c._np.tolist(), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    c = cut(arange(10.0), 3)
    self.assertEqual(c._np.tolist(), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    c = cut(arange(11), 3)
    self.assertEqual(c._np.tolist(), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3])

    # Explicit bin edges, without and with labels.
    c = cut(FA([2, 4, 6, 8, 10]), FA([0, 2, 4, 6, 8, 10]))
    self.assertEqual(c._np.tolist(), [1, 2, 3, 4, 5])

    c = cut(
        FA([2, 4, 6, 8, 10]),
        FA([0, 2, 4, 6, 8, 10]),
        labels=['a', 'b', 'c', 'd', 'e'],
    )
    self.assertEqual(c._np.tolist(), [1, 2, 3, 4, 5])
def test_concat(self):
    """Concatenate two Structs and check item types, contents and key order."""

    def build_struct():
        # A Dataset of five random columns, an int array and a Categorical.
        return Struct({
            'ds': TypeRegister.Dataset(
                {f'col_{i}': np.random.rand(5) for i in range(5)}),
            'arr': arange(5),
            'cat': TypeRegister.Categorical(['a', 'a', 'b', 'c', 'a']),
        })

    st1 = build_struct()
    st2 = build_struct()
    result = Struct.concat_structs([st1, st2])

    # Each item keeps its container type.
    self.assertTrue(isinstance(result.ds, TypeRegister.Dataset))
    self.assertTrue(isinstance(result.arr, TypeRegister.FastArray))
    self.assertTrue(isinstance(result.cat, TypeRegister.Categorical))

    # Contents are stacked in input order.
    correct_arr = np.hstack([st1.arr, st2.arr])
    self.assertTrue(bool(np.all(correct_arr == result.arr)))

    correct_cat = np.array(
        ['a', 'a', 'b', 'c', 'a', 'a', 'a', 'b', 'c', 'a'])
    self.assertTrue(bool(np.all(correct_cat == result.cat)))

    for colname in st1.ds:
        stacked = np.hstack([st1.ds[colname], st2.ds[colname]])
        self.assertTrue(bool(np.all(stacked == result.ds[colname])))

    # Key order follows the first input.
    order = list(st1.keys())
    result = list(result.keys())
    self.assertTrue(bool(np.all(order == result)))
def test_accum2_nanmedian_with_filter(self):
    """Check Accum2.nanmedian with a row filter, for two different filters.

    The same Accum2 object is reused for both filters. After each call the
    test verifies that the grouping column was not modified and that the
    footer and summary column match medians computed directly.
    """
    ds = Dataset({'time': arange(200.0)})
    ds.data = np.random.randint(7, size=200)
    ds.data2 = np.random.randint(7, size=200)
    symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
    # N.B. make a copy here for testing
    symbol_categorical = Cat(1 + arange(200) % 5, symbols)
    # N.B. Categorical.copy and Categorical constructor doesn't do deep copy?!
    ds.symbol = Cat(1 + arange(200) % 5, symbols)

    ac = Accum2(ds.data, ds.symbol)

    for chosen_symbols in (['AMZN', 'AAPL'], ['IBM', 'FB']):
        filt = symbol_categorical.isin(chosen_symbols)
        stat = ac.nanmedian(ds.time, filter=filt)
        totalcol = stat[stat.summary_get_names()[0]]
        footer = stat.footer_get_values()['Median']

        # Make sure we don't change the input data
        self.assertFalse(rt.any(ds.symbol._fa == 0))

        for sym in chosen_symbols:
            expected = rt.nanmedian(ds[symbol_categorical == sym, :].time)
            pos = rt.where(symbol_categorical.category_array == sym)[0].item()
            self.assertEqual(footer[pos + 1], expected)

        for key in range(7):
            expected = rt.nanmedian(ds[(ds.data == key) & filt, :].time)
            self.assertEqual(totalcol[key], expected)
def test_apply_nonreduce(self):
    """Exercise apply_reduce / apply_nonreduce with multi-argument callables.

    Only the final comparison is asserted: an element-wise maximum computed
    through ``apply_nonreduce`` must equal ``np.maximum`` applied directly.
    The earlier calls are only checked for not raising.
    """
    arrsize = 200
    numrows = 7
    ds = rt.Dataset({'time': rt.arange(arrsize * 1.0)})
    ds.data = arange(arrsize) % numrows
    ds.data2 = (arange(arrsize) + 3) % numrows

    symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
    symbols += [str(n) for n in range(6, 19)]  # '6' .. '18'
    ds.symbol = rt.Cat(1 + rt.arange(arrsize) % len(symbols), symbols)

    # Two-argument reduction on a Categorical; the result is not inspected.
    ds.symbol.apply_reduce(
        lambda left, right: np.sum(np.minimum(left, right)),
        (ds.data, ds.data),
    )

    ac = ds.accum2('symbol', 'data')
    newds = ac.apply_nonreduce(np.cumsum)

    # Two-argument reduction through Accum2; the result is not inspected.
    ac.apply_reduce(
        lambda left, right: np.sum(np.maximum(left, right)),
        (newds.data, newds.data2),
    )

    expected = np.maximum(newds.data, newds.data2)
    nonreduced = ac.apply_nonreduce(
        lambda left, right: np.maximum(left, right),
        (newds.data, newds.data2),
    )
    actual = nonreduced[0]
    self.assertTrue(np.all(expected == actual))
def test_showfilter_label_subclass(self):
    """With ``showfilter=True`` the row labels keep their array subclass.

    For Date, DateTimeNano, DateSpan and TimeSpan keys, ``result.YLabel``
    must be an instance of the same class and its first element must be
    NaN/invalid.
    """
    # Factories keep each label array's construction next to its use, in
    # the same order as a straight-line version of this test.
    label_cases = (
        (lambda: Date.range('20190201', '20190210'), Date),
        (lambda: DateTimeNano.random(10), DateTimeNano),
        (lambda: DateSpan(arange(10, 20)), DateSpan),
        (lambda: TimeSpan(np.random.rand(10) * 10_000_000_000), TimeSpan),
    )

    for make_labels, label_type in label_cases:
        d = make_labels()
        c = Categorical(d)
        c2 = Categorical(arange(10))
        result = Accum2(c, c2).count(showfilter=True)
        self.assertTrue(isinstance(result.YLabel, label_type))
        self.assertTrue(result.YLabel.isnan()[0])
def test_ismember_categorical():
    """``ismember`` on Categoricals must agree with their string arrays.

    Checked in both directions for every combination of base index, then a
    Categorical of strings against an integer array must raise ``TypeError``.
    """

    def assert_same_as_strings(c, d):
        # Compare the Categorical path with the plain string-array path,
        # first as ismember(c, d) and then as ismember(d, c).
        cs, ds = c.as_string_array, d.as_string_array

        b, f = ismember(c, d)
        bs, fs = ismember(cs, ds)
        assert_array_equal(b, bs)
        assert_array_equal(int8(f), fs)

        b, f = ismember(d, c)
        bs, fs = ismember(ds, cs)
        assert_array_equal(b, bs)
        assert_array_equal(int8(f), fs)

    for b_index_c in [0, 1]:
        for b_index_d in [0, 1]:
            # string values, both base indices
            c = TypeRegister.Categorical(
                np.random.choice(['a', 'b', 'c', 'd', 'e', 'f'], 15),
                base_index=b_index_c,
            )
            d = TypeRegister.Categorical(
                np.random.choice(['a', 'b', 'c'], 10),
                base_index=b_index_d,
            )
            assert_same_as_strings(c, d)

            # codes, string values, both base indices
            c = TypeRegister.Categorical(
                np.random.choice(['a', 'b', 'c', 'd', 'e', 'f'], 15),
                base_index=b_index_c,
            )
            d = TypeRegister.Categorical(
                np.random.choice(['a', 'b', 'c'], 10),
                ['a', 'b', 'c'],
                base_index=1,
            )
            assert_same_as_strings(c, d)

    c = Categorical(np.random.choice(['a', 'b', 'c'], 15))
    with pytest.raises(TypeError):
        b, idx = ismember(c, arange(3))
def test_ismember_align_multikey():
    """Multikey ``ismember`` with mixed key dtypes.

    Compatible combinations must report the first three rows as found, with
    indices 0, 1, 2 and the int8 invalid value elsewhere. Incompatible
    combinations must raise ``TypeError``.
    """
    correct_bool = FastArray([True, True, True, False, False])
    correct_idx = FastArray([0, 1, 2, int8.inv, int8.inv], dtype=np.int8)

    def abc_u4():
        return FastArray(['a', 'b', 'c'], dtype='U4', unicode=True)

    def abcde_u5():
        return FastArray(['a', 'b', 'c', 'd', 'e'], dtype='U5', unicode=True)

    def unicode_categorical_keys():
        return [arange(3), Categorical(abc_u4(), unicode=True)]

    def assert_aligned(left_keys, right_keys):
        found, index = ismember(left_keys, right_keys)
        assert_array_equal(found, correct_bool)
        # NOTE: flip to numpy because FastArray is sentinel-aware
        assert_array_equal(index._np, correct_idx._np)

    # bytes / unicode both upcast
    a_keys = [arange(5), FastArray([b'a', b'b', b'c', b'd', b'e'], dtype='S5')]
    b_keys = [arange(3), abc_u4()]
    assert_aligned(a_keys, b_keys)
    # The caller's bytes array must still be bytes afterwards.
    assert a_keys[1].dtype.char == 'S'

    # bytes / Categorical unicode
    a_keys = [arange(5), FastArray(['a', 'b', 'c', 'd', 'e'], dtype='S5')]
    b_keys = unicode_categorical_keys()
    assert_aligned(a_keys, b_keys)

    # unicode / Categorical
    a_keys = [arange(5), abcde_u5()]
    b_keys = unicode_categorical_keys()
    assert_aligned(a_keys, b_keys)

    # different numeric types
    a_keys = [arange(5, dtype=np.float64), abcde_u5()]
    b_keys = unicode_categorical_keys()
    assert_aligned(a_keys, b_keys)

    # string / non-string
    a_keys = [arange(5).astype('S'), abcde_u5()]
    b_keys = unicode_categorical_keys()
    with pytest.raises(TypeError):
        ismember(a_keys, b_keys)

    # multikey categorical, no expand array
    a_keys = [arange(5).astype('S'), abcde_u5()]
    b_keys = [
        arange(3),
        Categorical([abc_u4(), arange(3)], unicode=True),
    ]
    with pytest.raises(TypeError):
        ismember(a_keys, b_keys)
    with pytest.raises(TypeError):
        ismember(b_keys, a_keys)

    # unsupported object array
    a_keys = [
        arange(5).astype('O'),
        FastArray(['a', 'b', 'c', 'd', 'e'], dtype='S5'),
    ]
    b_keys = [arange(3), abc_u4()]
    with pytest.raises(TypeError):
        ismember(a_keys, b_keys)
def test_ismember_int_edges():
    """Run ``ismember`` on integer ranges of several sizes.

    Nothing is asserted about the values; the call must complete and return
    exactly two results.
    """
    # hit thresholds for a previous bug
    for a_size in (127, 129, 254, 256):
        haystack = arange(a_size)
        for b_size in range(129):
            _found, _index = ismember(haystack, arange(b_size))
def test_tree(self):
    """Sanity check that .tree() at least returns something, even for empty Struct."""
    s = Struct()
    empty_tree = s.tree()
    self.assertIsInstance(empty_tree, DisplayString)

    # Same check once the Struct holds a Dataset.
    s['foo'] = Dataset({'bar': arange(5)})
    filled_tree = s.tree()
    self.assertIsInstance(filled_tree, DisplayString)
def test_dataset_accum2(self):
    """Call accum2 from a Dataset and sum with a row filter.

    ``data`` is ``test // 2``, so ``data == 3`` keeps the rows with test
    values 6 and 7; their sum, 13, is expected at index 3 of the summary
    column.
    """
    # test from accum2 off dataset and with a filter
    ds = Dataset({'test': arange(10), 'data': arange(10) // 2})
    accum = ds.accum2('data', 'test')
    x = accum.sum(ds.test, filter=ds.data == 3)

    totalcol = x.summary_get_names()[0]
    summary = x[totalcol]
    self.assertEqual(summary[3], 13)