def test_sample(self):
    """Check ``sample`` on a Dataset and a FastArray after seeding ``np.random``.

    Covers a boolean mask, a request larger than the number of selectable
    rows, no filter at all, and an integer (fancy) index filter.
    """
    # Dataset.sample with a boolean mask
    table = rt.Dataset({'num': [1, 2, 3, 4, 5], 'str': ['ab', 'bc', 'cd', 'de', 'ef']})
    np.random.seed(1)
    picked_rows = table.sample(3, rt.FA([True, True, True, False, True]))
    wanted_rows = rt.Dataset({'num': [1, 3, 5], 'str': ['ab', 'cd', 'ef']})
    assert (wanted_rows == picked_rows).all(axis=None)

    # FastArray.sample with a boolean mask
    fa = rt.FA([1, 2, 3, 4, 5])
    np.random.seed(1)
    picked = fa.sample(2, rt.FA([False, True, True, False, True]))
    wanted = rt.FA([2, 5])
    assert (wanted == picked).all(axis=None)

    # Overflow: ask for more elements than the mask lets through
    # (no re-seeding here, as in the other cases).
    picked = fa.sample(10, rt.FA([False, True, False, False, True]))
    wanted = rt.FA([2, 5])
    assert (wanted == picked).all(axis=None)

    # No filter
    np.random.seed(1)
    picked = fa.sample(2)
    wanted = rt.FA([2, 3])
    assert (wanted == picked).all(axis=None)

    # Fancy (integer) index filter
    np.random.seed(1)
    picked = fa.sample(2, rt.FA([1, 3, 4]))
    wanted = rt.FA([2, 5])
    assert (wanted == picked).all(axis=None)
def test_sample(self):
    """Check ``sample`` on a Dataset and a FastArray using an explicit ``seed``.

    Covers a boolean mask, a request larger than the number of selectable
    rows, no filter at all, and an integer (fancy) index filter.
    """
    # Dataset.sample with a boolean mask
    table = rt.Dataset({'num': [1, 2, 3, 4, 5], 'str': ['ab', 'bc', 'cd', 'de', 'ef']})
    picked_rows = table.sample(3, rt.FA([True, True, True, False, True]), seed=1)
    wanted_rows = rt.Dataset({'num': [1, 2, 5], 'str': ['ab', 'bc', 'ef']})
    assert picked_rows.keys() == wanted_rows.keys()
    for col_name in wanted_rows.keys():
        assert_array_equal(
            wanted_rows[col_name],
            picked_rows[col_name],
            err_msg=f"Column '{col_name}' differs.",
        )

    # FastArray.sample with a boolean mask
    fa = rt.FA([1, 2, 3, 4, 5])
    picked = fa.sample(2, rt.FA([False, True, True, False, True]), seed=1)
    assert_array_equal(rt.FA([2, 3]), picked)

    # Overflow: ask for more elements than the mask lets through
    picked = fa.sample(10, rt.FA([False, True, False, False, True]), seed=1)
    assert_array_equal(rt.FA([2, 5]), picked)

    # No filter
    picked = fa.sample(2, seed=1)
    assert_array_equal(rt.FA([2, 3]), picked)

    # Fancy (integer) index filter
    picked = fa.sample(2, rt.FA([1, 3, 4]), seed=1)
    assert_array_equal(rt.FA([2, 4]), picked)
def get_doctest_dataset_data():
    """Build four small named Datasets and return them in a dict.

    Returns:
        dict mapping ``'ds_simple_1'``, ``'ds_simple_2'``, ``'ds_complex_1'``
        and ``'ds_complex_2'`` (in that order) to freshly built ``rt.Dataset``
        objects.
    """
    simple_1 = rt.Dataset({
        'A': [0, 1, 6, 7],
        'B': [1.2, 3.1, 9.6, 21],
    })
    simple_2 = rt.Dataset({
        'X': [0, 1, 6, 9],
        'C': [2.4, 6.2, 19.2, 53],
    })
    complex_1 = rt.Dataset({
        'A': [0, 6, 9, 11],
        'B': ['Q', 'R', 'S', 'T'],
        'C': [2.4, 6.2, 19.2, 25.9],
    })
    complex_2 = rt.Dataset({
        'A': [0, 1, 6, 10],
        'B': ['Q', 'R', 'R', 'T'],
        'E': [1.5, 3.75, 11.2, 13.1],
    })
    return {
        'ds_simple_1': simple_1,
        'ds_simple_2': simple_2,
        'ds_complex_1': complex_1,
        'ds_complex_2': complex_2,
    }
class TestPyarrowConvertDataset:
    """Round-trip checks between ``rt.Dataset`` and its arrow representation."""

    @pytest.mark.parametrize(
        ('rt_dset',),
        [
            pytest.param(rt.Dataset({}), id='empty'),
            pytest.param(
                rt.Dataset({
                    'ink_capacity': rt.FA([15, 10, 15, 25, 10, 15, 25, 15]),
                    'purchase_date': rt.Date([
                        '2019-06-19', '2019-06-19', '2020-01-15', '2020-05-22',
                        '2020-02-10', '2020-02-10', '2020-03-17', '2020-03-17',
                    ]),
                    'country_code': rt.Categorical(
                        # Country codes -- adapted from TestCategorical.test_hstack_fails_for_different_mode_cats.
                        [36, 36, 344, 840, 840, 124, 36, 484],
                        {
                            'IRL': 372,
                            'USA': 840,
                            'AUS': 36,
                            'HKG': 344,
                            'JPN': 392,
                            'MEX': 484,
                            'KHM': 116,
                            'THA': 764,
                            'JAM': 388,
                            'ARM': 51,
                        },
                        ordered=True,
                    ),
                })
            ),
        ],
    )
    def test_roundtrip_rt_pa_rt(self, rt_dset: rt.Dataset) -> None:
        """Convert a Dataset with ``to_arrow`` and back with ``from_arrow``; columns must match."""
        arrow_table = rt_dset.to_arrow()
        restored = rt.Dataset.from_arrow(arrow_table, zero_copy_only=False)

        assert rt_dset.keys() == restored.keys()
        for column in rt_dset.keys():
            # relaxed_cat_check=True: this test targets dataset-level behaviour,
            # not the specific details of Categorical conversion.
            assert_array_or_cat_equal(
                rt_dset[column], restored[column], relaxed_cat_check=True
            )
def test_accum_cols_multikey(self):
    """Check ``rt.accum_cols`` grouped by a two-key Categorical, including the 'Total' footer."""
    num_rows = 12
    half, quarter = num_rows // 2, num_rows // 4
    data = rt.Dataset({
        'Symb': rt.Cat(['A', 'B'] * half),
        'Exch': rt.Cat(['X', 'Y', 'Y', 'X'] * quarter),
        'Count': rt.full(num_rows, 1.0),
        'PlusMinus': [1.0, -1.0] * half,
    })
    data.MultiKeyCat = rt.Cat([data.Symb, data.Exch])

    accum = rt.accum_cols(
        data.MultiKeyCat,
        [data.Count, data.PlusMinus],
        ['Count', 'PlusMinus'],
    )

    accum_expected = rt.Dataset({
        'Symb': ['A', 'B', 'A', 'B'],
        'Exch': ['X', 'Y', 'Y', 'X'],
        'Count': [3.0, 3.0, 3.0, 3.0],
        'PlusMinus': [3.0, -3.0, 3.0, -3.0],
    })
    footer = {'Exch': 'Total', 'Count': 12.0, 'PlusMinus': 0.0}
    accum_expected.footer_set_values('Total', footer)

    matches = (accum == accum_expected).all(axis=None)
    self.assertTrue(matches)
def test_aggs_var_symb_0_25_ncols_5(self):
    """Compare ``Categorical.var`` with pandas ``groupby().var()`` column by column."""
    fixture = categorical_base(5, 0.25, "var")
    grouping = rt.Categorical(
        values=fixture.bin_ids,
        categories=fixture.keys,
        base_index=default_base_index,
    )
    rt_result = grouping.var(rt.Dataset(fixture.data))
    pd_result = pd.DataFrame(fixture.data).groupby(fixture.bin_ids).var()

    # NaNs are stripped from both sides before comparing.
    for column, _ in fixture.data.items():
        safe_assert(remove_nan(pd_result[column]), remove_nan(rt_result[column]))
def test_aggs_mean_symb_0_40_ncols_6(self):
    """Compare ``Categorical.mean`` with pandas ``groupby().mean()`` column by column."""
    fixture = categorical_base(6, 0.40, "mean")
    grouping = rt.Categorical(
        values=fixture.bin_ids,
        categories=fixture.keys,
        base_index=default_base_index,
    )
    rt_result = grouping.mean(rt.Dataset(fixture.data))
    pd_result = pd.DataFrame(fixture.data).groupby(fixture.bin_ids).mean()

    # NaNs are stripped from both sides before comparing.
    for column, _ in fixture.data.items():
        safe_assert(remove_nan(pd_result[column]), remove_nan(rt_result[column]))
def test_aggs_sum_symb_0_10_ncols_7(self):
    """Compare ``Categorical.sum`` with pandas ``groupby().sum()`` column by column."""
    fixture = categorical_base(7, 0.10, "sum")
    grouping = rt.Categorical(
        values=fixture.bin_ids,
        categories=fixture.keys,
        base_index=default_base_index,
    )
    rt_result = grouping.sum(rt.Dataset(fixture.data))
    pd_result = pd.DataFrame(fixture.data).groupby(fixture.bin_ids).sum()

    # NaNs are stripped from both sides before comparing.
    for column, _ in fixture.data.items():
        safe_assert(remove_nan(pd_result[column]), remove_nan(rt_result[column]))
class TestHStackAny:
    """Tests for the rt.hstack_any (a.k.a. rt.stack_rows) function."""

    # Shared inputs; built once at class-definition time and reused by the
    # parametrize list below.
    _fa1 = rt.FastArray([100, 200])
    _fa2 = rt.FastArray([111, 222])
    _dtn1 = rt.DateTimeNano('2021-10-12 01:02:03', from_tz='UTC')
    _dtn2 = rt.DateTimeNano('1980-03-04 13:14:15', from_tz='UTC')
    _ts1 = _dtn1 - _dtn2
    _ts2 = _dtn2 - _dtn1
    _ds1 = rt.Dataset({'a': 11})
    _ds2 = rt.Dataset({'b': 22})
    _pds1 = rt.PDataset(_ds1)
    _pds2 = rt.PDataset(_ds2)

    @pytest.mark.parametrize(
        "inputs,expected",
        [
            # The 'DateTimeNano,DateTimeNano' case used to be listed twice with
            # the same id; the redundant copy has been removed.
            pytest.param([_fa1, _fa2], rt.FastArray, id='FastArray,FastArray'),
            pytest.param([_dtn1, _dtn2], rt.DateTimeNano, id='DateTimeNano,DateTimeNano'),
            pytest.param([_ts1, _ts2], rt.TimeSpan, id='TimeSpan,TimeSpan'),
            pytest.param([_ds1, _ds2], rt.Dataset, id='Dataset,Dataset'),
            pytest.param([_pds1, _pds2], None, id='PDataset,PDataset'),  # notyet
            pytest.param([_dtn1, _ts2], None, id='DateTimeNano,TimeSpan'),  # neither is base
            pytest.param([_fa1, _dtn2], rt.FastArray, id='FastArray,DateTimeNano'),
            pytest.param([_ts1, _fa2], rt.FastArray, id='TimeSpan,FastArray'),
            pytest.param([_ds1, _pds2], rt.Dataset, id='Dataset,PDataset'),
            pytest.param([_pds1, _ds2], rt.Dataset, id='PDataset,Dataset'),
            pytest.param([_fa1, _ds2], None, id='FastArray,Dataset'),
        ],
    )
    def test_hstack_any(self, inputs, expected):
        """Stack ``inputs`` and check the exact result type.

        Args:
            inputs: list of objects handed to ``rt.hstack_any``.
            expected: the exact type the result must have, or ``None`` when the
                call is expected to raise.
        """
        if expected is None:
            with pytest.raises(Exception):
                rt.hstack_any(inputs)
            return

        result = rt.hstack_any(inputs)
        # Exact type match on purpose: a subclass instance must not pass where
        # the base type is expected.
        assert type(result) is expected, f"got type {type(result)}"
def test_accum_cols_noncat(self):
    """Check ``rt.accum_cols`` when the grouping key is a plain FastArray rather than a Categorical."""
    num_rows = 10
    group_key = rt.FA([0, 1] * (num_rows // 2))
    ones = rt.full(num_rows, 1.0)

    accum = rt.accum_cols(group_key, ones)

    accum_expected = rt.Dataset({'YLabel': [0, 1], 'col0': [5.0, 5.0]})
    footer = {'YLabel': 'Total', 'col0': 10.0}
    accum_expected.footer_set_values('Total', footer)

    matches = (accum == accum_expected).all(axis=None)
    self.assertTrue(matches)
def test_alignmk(self):
    """Check ``alignmk`` output for Categorical keys and then for char-array keys."""
    left = rt.Dataset()
    left['Time'] = [0, 1, 4, 6, 8, 9, 11, 16, 19, 30]
    left['Px'] = [10, 12, 15, 11, 10, 9, 13, 7, 9, 10]

    right = rt.Dataset()
    right['Time'] = [0, 0, 5, 7, 8, 10, 12, 15, 17, 20]
    right['Vols'] = [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]

    expected_idx = [0, 0, 0, 2, 4, 4, 4, 6, 8, 8]

    # First pass uses Categorical keys, second pass uses char array keys;
    # the same alignment is expected for both.
    for make_keys in (rt.Categorical, rt.FastArray):
        left['Ticker'] = make_keys(['Test'] * 10)
        right['Ticker'] = make_keys(['Test', 'Blah'] * 5)
        aligned = alignmk(left.Ticker, right.Ticker, left.Time, right.Time)
        assert_array_equal(aligned, rt.FastArray(expected_idx))
def test_save_load_dataset_array(self, arr, tmpdir):
    """Save and reload a one-column Dataset, first holding ``arr`` and then ``rt.FA(arr)``."""
    # Test #1: save and load of ndarray within Dataset
    arr_key = name(arr)
    fn = str(tmpdir.join(arr_key))
    ds = rt.Dataset({arr_key: arr})
    ds.save(fn)
    ds2 = rt.Dataset.load(fn)
    assert_save_load(ds2, ds)
    assert_array_equal_(ds2[arr_key], ds[arr_key])

    # Test #2: save and load of FastArray derived from ndarray within Dataset
    f_arr = rt.FA(arr)
    f_arr_key = name(f_arr)
    fn = str(tmpdir.join(f_arr_key))
    ds = rt.Dataset({f_arr_key: f_arr})
    ds.save(fn)
    ds2 = rt.Dataset.load(fn)
    assert_save_load(ds2, ds)
    assert_array_equal_(ds[f_arr_key], ds2[f_arr_key])
def test_accum_cols(self):
    """Check ``rt.accum_cols`` grouped by a single Categorical, including the 'Total' footer."""
    num_rows = 10
    half = num_rows // 2
    data = rt.Dataset({
        'Symb': rt.Cat(['A', 'B'] * half),
        'Count': rt.full(num_rows, 1.0),
        # Added to handle edge case of zero footer
        'PlusMinus': [1.0, -1.0] * half,
    })

    accum = rt.accum_cols(
        data.Symb,
        [data.Count, data.PlusMinus],
        ['Count', 'PlusMinus'],
    )

    accum_expected = rt.Dataset({
        'Symb': ['A', 'B'],
        'Count': [5.0, 5.0],
        'PlusMinus': [5.0, -5.0],
    })
    footer = {'Symb': 'Total', 'Count': 10.0, 'PlusMinus': 0.0}
    accum_expected.footer_set_values('Total', footer)

    matches = (accum == accum_expected).all(axis=None)
    self.assertTrue(matches)
def test_advanced_multikey(self):
    """Compare riptable and pandas multi-key group-by sums for 5 down to 1 key columns.

    Random key and value columns are generated once; each pass of the loop
    groups by the first ``numb_kvs`` key columns and sums the value columns.
    """
    # data generation code
    alpha = 'Q W E R T Y U I O P A S D F G H J K L Z X C V B N M'.split(' ')
    digits = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
    sz = 200
    # can't be more than 13: alpha supplies both the key and the value column
    # names, and the value names start at index numb_kvs.
    numb_kvs = 5

    # 2d arrays of keys/values. Each row must be its own list: the previous
    # `[[0] * sz] * numb_kvs` repeated a reference to ONE inner list, so every
    # column ended up holding identical data.
    vals = [[0] * sz for _ in range(numb_kvs)]
    keys = [[''] * sz for _ in range(numb_kvs)]

    # random initialization for them
    for n in range(numb_kvs):
        for i in range(sz):
            vals[n][i] = digits[rand.randint(0, 1000) % len(digits)]
            keys[n][i] = alpha[rand.randint(0, 1000) % len(alpha)]

    # multi key hash for numbkeys 1:numb_kvs
    while numb_kvs > 0:
        # create the data map: first numb_kvs letters name the key columns,
        # the next numb_kvs letters name the value columns.
        data = {}
        for n in range(numb_kvs):
            data[alpha[n]] = keys[n]
            data[alpha[n + numb_kvs]] = vals[n]

        key_cols = alpha[0:numb_kvs]
        val_cols = alpha[numb_kvs : numb_kvs * 2]

        mset = rt.Dataset(data)
        s_group = rt.GroupBy(mset, keys=key_cols).sum()

        df2 = pd.DataFrame(data)
        p_group = df2.groupby(key_cols).sum()

        # NOTE(review): list() over a pandas DataFrame yields its column
        # labels, so this comparison presumably checks the resulting column
        # names rather than the summed values -- confirm whether a per-column
        # value comparison was intended.
        pandas_ = list(p_group[val_cols])
        sfw_ = list(s_group[val_cols])
        assert pandas_ == sfw_

        numb_kvs -= 1
def inner(cat):
    """Round-trip ``cat`` through SDS, standalone and then as a Dataset column.

    Uses ``tmpdir`` from the enclosing scope; both saves go to the same path.
    """
    column = name(cat)
    fn = str(tmpdir.join(column))

    # Test #1: save and load Categorical
    save_sds(fn, cat)
    cat2 = load_sds(fn)
    assert_save_load(cat2, cat)
    # NOTE(review): relies on `==` between the two objects having an
    # unambiguous truth value -- confirm for multi-element inputs.
    assert cat == cat2

    # Test #2: save and load Categorical from within Dataset
    ds = rt.Dataset({column: cat})
    ds.save(fn)
    ds2 = rt.Dataset.load(fn)
    assert_save_load(ds2, ds)
    assert ds[column] == ds2[column]
def test_single_col_groupby_tests(self):
    """Compare single-key group-by results between pandas and riptable.

    For every dtype in ``type_list`` and every aggregation name in
    ``functions_str``, the aggregated 'Vs' column (or riptable's 'Count'
    column for ``count``) is compared with ``safe_equal``.
    """
    Values = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
    Keys = ['a', 'b', 'c', 'a', 'b', 'c', 'd', 'e', 'f']
    key = 'Ks'
    val = 'Vs'

    for type_ in type_list:
        data = {'Vs': rt.FastArray(Values, dtype=type_), 'Ks': Keys}
        pd_data = pd.DataFrame(data)
        sfw_data = rt.Dataset(data)
        pd_gb = pd_data.groupby(key)
        sfw_gb = sfw_data.groupby(key)

        # Depends only on the dtype, so compute it once per dtype.
        is_integer_subtype = np.issubdtype(type_, np.integer)

        for func_name in functions_str:
            pd_out = getattr(pd_gb, func_name)()
            sfw_out = getattr(sfw_gb, func_name)()

            pd_col = pd_out[val]._values
            # riptable reports counts in a dedicated 'Count' column.
            sfw_col = sfw_out['Count'] if func_name == 'count' else sfw_out[val]

            # NOTE(review): this gate is kept exactly as it was -- a mismatch
            # is only reported for 'median' on non-integer dtypes. The former
            # flag was named `is_median` but held `name != 'median'`, so the
            # condition looks inverted; confirm the intended rule before
            # tightening it.
            reportable = (not is_integer_subtype) and func_name == 'median'
            if not safe_equal(pd_col, sfw_col) and reportable:
                # Diagnostics travel with the failure instead of being printed
                # ahead of a bare assertTrue(False).
                self.fail(
                    f'data_type_t = {type_}\n'
                    f'function = {func_name}\n'
                    f'pandas output = {pd_col}\n'
                    f'sfw output = {sfw_col}'
                )
def test_apply_nonreduce(self):
    """Check that ``apply_nonreduce`` with an element-wise maximum matches ``np.maximum``.

    Also calls ``apply_reduce`` on the Categorical and on the ``accum2``
    result; their return values are not inspected.
    """
    arrsize = 200
    numrows = 7
    # Five tickers followed by the strings '6' through '18'.
    symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM'] + [str(k) for k in range(6, 19)]

    table = rt.Dataset({'time': rt.arange(arrsize * 1.0)})
    table.data = arange(arrsize) % numrows
    table.data2 = (arange(arrsize) + 3) % numrows
    table.symbol = rt.Cat(1 + rt.arange(arrsize) % len(symbols), symbols)

    # Result intentionally unused.
    table.symbol.apply_reduce(
        lambda x, y: np.sum(np.minimum(x, y)), (table.data, table.data)
    )

    grouped = table.accum2('symbol', 'data')
    cumulative = grouped.apply_nonreduce(np.cumsum)

    # Result intentionally unused.
    grouped.apply_reduce(
        lambda x, y: np.sum(np.maximum(x, y)), (cumulative.data, cumulative.data2)
    )

    direct = np.maximum(cumulative.data, cumulative.data2)
    via_apply = grouped.apply_nonreduce(
        lambda x, y: np.maximum(x, y), (cumulative.data, cumulative.data2)
    )[0]
    self.assertTrue(np.all(direct == via_apply))
def test_groupby_categorical_sort(self):
    """
    Group by a categorical column and check each group's mean lines up with
    the value originally paired with that category.
    """
    cats = ['z', 'y', 'x', 'w', 'a', 'b', 'c', 'd']
    vals = [0, 1, 2, 3, 4, 5, 6, 7]
    expected = dict(zip(cats, vals))
    n_cats = len(cats)
    positions = [row % n_cats for row in range(100)]

    ds = rt.Dataset()
    ds["Cat"] = rt.Categorical([cats[p] for p in positions])
    # two identical columns
    ds["Value1"] = [vals[p] for p in positions]
    ds["Value2"] = [vals[p] for p in positions]

    grp = ds.groupby("Cat").mean()
    grp["Expected"] = [expected[label] for label in grp.Cat.astype('U')]

    # Total absolute deviation across both value columns.
    diff = rt.sum(rt.abs(grp.Expected - grp.Value1))
    diff += rt.sum(rt.abs(grp.Expected - grp.Value2))
    assert diff <= 1e-9
def test_stack_save_load(self, dataframe, stack_count, tmpdir, stack):
    """Save one Dataset, then load it 0..stack_count-1 times at once via ``load_sds``.

    With ``stack`` truthy, an empty file list must raise ``ValueError`` and a
    non-empty one must yield a PDataset with ``i`` times the rows. Otherwise an
    empty list yields ``None`` and a non-empty one yields a list.
    """

    def assert_stack_equal(pds, ds, num_stack=1):
        # The loaded object must be a distinct object from the saved one.
        assert (
            pds is not ds
        ), f"Identity of saved {name(ds)} should be different from the loaded {name(ds)}."
        assert isinstance(pds, rt.PDataset), f"got type {type(pds)}"
        stacked_shape = (num_stack * ds.shape[0], ds.shape[1])
        assert (
            pds.shape == stacked_shape
        ), f"Shapes should be the same.\n{name(ds)}\n{repr(ds)}\n{name(pds)}\n{pds}"
        # TODO consider stacking
        # for f_arr1, f_arr2 in zip(pds.values(), ds.values()):
        #     assert_array_equal_(f_arr2._np, f_arr1._np)

    fn = str(tmpdir.join(name(dataframe)))
    ds = rt.Dataset(dataframe)
    save_sds(fn, ds)

    for i in range(stack_count):
        paths = [fn] * i

        if i > 0:
            # expectations for n+1 input where n is a positive nonzero integer
            pds = load_sds(paths, stack=stack)
            if stack:
                assert_stack_equal(pds, ds, num_stack=i)
            else:
                # handle expectations for non-stacked load
                assert isinstance(pds, list), f"got type {type(pds)}"
            continue

        # expectations for empty input
        if stack:
            with pytest.raises(ValueError):
                _ = load_sds(paths, stack=stack)
        else:
            pds = load_sds(paths, stack=stack)
            assert pds is None, f"got type {type(pds)}"
def test_multkey(self):
    """Compare a two-key group-by sum between riptable and pandas on random data."""
    letters = 'Q W E R T Y U I O P A S D F G H J K L Z X C V B N M'.split(' ')
    digits = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
    sz = 4000

    numbers, keys1, keys2 = [], [], []
    # One pass per row; the draw order (number, key1, key2) is kept as-is.
    for _ in range(sz):
        numbers.append(digits[rand.randint(0, 1000) % len(digits)])
        keys1.append(letters[rand.randint(0, 1000) % len(letters)])
        keys2.append(letters[rand.randint(0, 1000) % len(letters)])

    # Constructed but not used further below.
    rt.FastArray(numbers)

    data = {'k1': keys1, 'k2': keys2, 'beta': numbers}
    group_keys = ['k1', 'k2']

    # riptable side
    s_group = rt.GroupBy(rt.Dataset(data), keys=group_keys).sum()
    # pandas side
    p_group = pd.DataFrame(data).groupby(group_keys).sum()

    # compare the summed column
    pandas_sums = list(p_group['beta'])
    sfw_sums = list(s_group['beta'])
    assert pandas_sums == sfw_sums
def numpy_array_to_dataset(inarray: numpy.ndarray, columns=None):
    """Wrap ``inarray`` in an ``rt.Dataset``.

    The array is first passed through ``numpy_array_to_dict`` and the resulting
    mapping is handed to the Dataset constructor.

    Args:
        inarray: the array to convert.
        columns: forwarded unchanged to ``numpy_array_to_dict`` (default ``None``).

    Returns:
        The newly constructed ``rt.Dataset``.
    """
    column_map = numpy_array_to_dict(inarray, columns=columns)
    return rt.Dataset(column_map)
def test_accum_ratiop(self):
    """Check ``rt.accum_ratiop`` for an invalid ``norm_by`` and for ``'T'`` and ``'c'``."""
    num_rows = 12
    repeats = num_rows // 4
    data = rt.Dataset({
        'Symb': rt.Cat(['A', 'A', 'A', 'B'] * repeats),
        'Exch': rt.Cat(['Z', 'Z', 'X', 'X'] * repeats),
        'Count': rt.full(num_rows, 1.0),
    })

    # Invalid input
    with self.assertRaises(
        ValueError, msg=f'Failed to raise an error when passing invalid norm_by arg'
    ):
        rt.accum_ratiop(data.Symb, data.Exch, data.Count, norm_by='z')

    # Each case: (norm_by, expected X column, expected Z column,
    #             X/Z entries of the 'TotalRatio' footer).
    cases = [
        # Ratio within total
        ('T', [25.0, 25.0], [50.0, 0.0], 50.0),
        # Ratio within columns
        ('c', [50.0, 50.0], [100.0, 0.0], 100.0),
    ]
    for norm_by, x_ratios, z_ratios, footer_ratio in cases:
        accum = rt.accum_ratiop(data.Symb, data.Exch, data.Count, norm_by=norm_by)

        accum_expected = rt.Dataset({
            'Symb': ['A', 'B'],
            'X': x_ratios,
            'Z': z_ratios,
            'TotalRatio': [75.0, 25.0],
            'Total': [9.0, 3.0],
        })
        accum_expected.footer_set_values(
            'TotalRatio',
            {
                'Symb': 'TotalRatio',
                'X': footer_ratio,
                'Z': footer_ratio,
                'TotalRatio': 100.0,
            },
        )
        accum_expected.footer_set_values(
            'Total',
            {'Symb': 'Total', 'X': 6.0, 'Z': 6.0, 'Total': 12.0},
        )

        self.assertTrue((accum == accum_expected).all(axis=None))
def test_multi_col_groupby_tests(self, numb_keys_and_values=5, numb_rows=20):
    """Compare multi-key group-by results between pandas and riptable.

    Args:
        numb_keys_and_values: how many key columns and how many value columns
            to generate (at most the number of predefined column names).
        numb_rows: number of rows of random data per column.
    """
    col_val_names = ['alpha', 'beta', 'gamma', 'sigma', 'zeta']
    col_key_names = ['lions', 'tigers', 'bears', 'oh', 'my']
    MAX_LENGTH = min(len(col_val_names), len(col_key_names))
    assert numb_keys_and_values <= MAX_LENGTH

    # Only the first numb_keys_and_values names of each list are in play.
    key_cols = col_key_names[0:numb_keys_and_values]
    val_cols = col_val_names[0:numb_keys_and_values]

    for type_ in type_list:
        vals = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
        keys = 'a b c d e f g'.split(' ')

        vs = []
        ks = []
        for _ in range(numb_keys_and_values):
            vs.append([vals[rand.randint(0, len(vals) - 1)] for _ in range(numb_rows)])
            ks.append([keys[rand.randint(0, len(keys) - 1)] for _ in range(numb_rows)])

        data = {}
        for i in range(numb_keys_and_values):
            data[col_val_names[i]] = rt.FastArray(vs[i], dtype=type_)
            # NOTE(review): the key columns are filled from `vs`, not from the
            # string keys generated in `ks` (which stay unused). Kept as-is;
            # confirm whether string keys were intended here.
            data[col_key_names[i]] = rt.FastArray(vs[i], dtype=type_)

        pd_data = pd.DataFrame(data)
        sfw_data = rt.Dataset(data)
        pd_gb = pd_data.groupby(key_cols)
        sfw_gb = sfw_data.groupby(key_cols)

        is_integer_subtype = np.issubdtype(type_, np.integer)

        for func_name in functions_str:
            pd_out = getattr(pd_gb, func_name)()
            sfw_out = getattr(sfw_gb, func_name)()

            if func_name == 'count':
                # only compare one column for count
                pd_col = pd_out['alpha']
                sfw_col = sfw_out.Count
                if not safe_equal(pd_col, sfw_col):
                    self.fail(
                        f'function = {func_name}\n'
                        f'pandas output = {pd_col}\n'
                        f'sfw output = {sfw_col}'
                    )
                continue

            # Iterate only the value columns that were actually created; using
            # the full name list raised KeyError for numb_keys_and_values < 5.
            for val_name in val_cols:
                # extract array from pandas series
                pd_col = pd_out[val_name]._values
                sfw_col = sfw_out[val_name]

                # NOTE(review): gate kept exactly as it was -- a mismatch is
                # only reported for 'median' on non-integer dtypes. The old
                # flag was named `is_median` but held `name != 'median'`;
                # confirm the intended rule.
                reportable = (not is_integer_subtype) and func_name == 'median'
                if not safe_equal(pd_col, sfw_col) and reportable:
                    self.fail(
                        f'function = {func_name}\n'
                        f'column = {val_name}\n'
                        f'pandas output = {pd_col}\n'
                        f'sfw output = {sfw_col}'
                    )
def test_save_load_datasets(self, dataframe, tmpdir):
    """Round-trip a Dataset, and a Multiset holding copies of it, through SDS.

    Args:
        dataframe: the pandas DataFrame the Dataset is built from.
        tmpdir: directory object whose ``join`` builds the target file paths.
    """
    # Test #1: save and load of DataFrame
    # Not exercised: save_sds() rejects a pandas DataFrame with
    # "TypeError: save_sds() can only save Structs, Datasets, or single arrays."
    fn = str(tmpdir.join(name(dataframe)))

    # Test #2: save and load of Dataset created from DataFrame
    dataset = rt.Dataset(dataframe)
    save_sds(fn, dataset)
    dataset2 = load_sds(fn)
    assert_save_load(dataset2, dataset)
    for f_arr1, f_arr2 in zip(dataset.values(), dataset2.values()):
        assert_array_equal_(f_arr2._np, f_arr1._np)

    # Test #3: save and load nested Dataset within a Multiset
    # This also tests that shallow and deep copies that are saved and loaded from SDS
    # are both unique objects with the same size footprint.
    multiset = rt.Multiset()
    shallow_copy_name, deep_copy_name = "dataset_shallow_copy", "dataset_deep_copy"
    dataset_shallow_copy, dataset_deep_copy = (
        dataset.copy(deep=False),
        dataset.copy(deep=True),
    )
    multiset[shallow_copy_name], multiset[deep_copy_name] = (
        dataset_shallow_copy,
        dataset_deep_copy,
    )

    fn = str(tmpdir.join(name(multiset)))
    save_sds(fn, multiset)
    multiset2 = load_sds(fn)
    assert_save_load(multiset2, multiset)

    # Shallow copy assertions first, then deep copy assertions.
    for copy_name in (shallow_copy_name, deep_copy_name):
        assert id(multiset[copy_name]) != id(
            multiset2[copy_name]
        ), f"Identity of saved object should be different from the loaded object."
        for f_arr1, f_arr2 in zip(multiset[copy_name].values(), multiset2[copy_name].values()):
            # Convert these to ndarrays so we don't need to consider Riptable invalid checks.
            # This test is concerned with ensuring the same data is loaded as saved.
            assert_save_load(f_arr2, f_arr1)
            # Compare loaded (f_arr2) against saved (f_arr1); this used to
            # compare f_arr2 with itself, which can never fail.
            assert_array_equal_(f_arr2._np, f_arr1._np)
def test_meta(self):
    """Check ``info()`` output and ``apply_schema()`` results on nested containers.

    ``sys.stdout`` is swapped for a ``StringIO`` so that what ``print(x.info())``
    writes can be compared against expected text, before and after schemas are
    applied to a Struct, a nested Struct, a Dataset and a single column.

    NOTE(review): ``sys.stdout`` is only restored by the last statement; a
    failing assertion earlier leaves it redirected. Consider try/finally or
    ``contextlib.redirect_stdout``.

    NOTE(review): the triple-quoted ``target_output`` literals are reproduced
    exactly as found here; their internal line breaks and column padding look
    collapsed -- TODO confirm against the authoritative copy of this file.
    """
    st = rt.Struct({
        'a': rt.Dataset({
            'col1': rt.FastArray([1, 2]).astype(np.int32),
            'col2': rt.FastArray([3, 4]).astype(np.int32),
            'col4': rt.FastArray([5, 6]).astype(np.int32),
        }),
        'b': rt.FastArray([3, 4]).astype(np.int32),
    })
    # Capture everything printed from here on.
    out = StringIO()
    orig_stdout = sys.stdout
    sys.stdout = out
    # info() output before any schema has been applied.
    print(st.info())
    output = out.getvalue()
    target_output = '''\x1b[1;36mDescription: \x1b[00m<no description> \x1b[1;36mSteward: \x1b[00m<no steward> \x1b[1;36mType: \x1b[00mStruct \x1b[1;36mContents:\x1b[00m \x1b[1;36mType Name Description Steward \x1b[00m \x1b[1;36m------- ---- -------------------------------------------------- ------------\x1b[00m Dataset \x1b[1;32ma \x1b[00m <no description> <no steward> int32 \x1b[1;32mb \x1b[00m <no description> <no steward> '''
    self.assertEqual(output, target_output)
    # Apply a top-level-only schema to `st`, then nest `st` inside a new Struct.
    schema = {'Description': 'This is a structure', 'Steward': 'Nick'}
    st.apply_schema(schema)
    st2 = rt.Struct({
        'This': st,
        'That': np.array([1, 2]).astype(np.int32)
    })
    # Fresh buffer for each captured print.
    out = StringIO()
    sys.stdout = out
    print(st2.info())
    output = out.getvalue()
    target_output = '''\x1b[1;36mDescription: \x1b[00m<no description> \x1b[1;36mSteward: \x1b[00m<no steward> \x1b[1;36mType: \x1b[00mStruct \x1b[1;36mContents:\x1b[00m \x1b[1;36mType Name Description Steward \x1b[00m \x1b[1;36m------ ---- -------------------------------------------------- ------------\x1b[00m Struct \x1b[1;32mThis\x1b[00m This is a structure Nick int32 \x1b[1;32mThat\x1b[00m <no description> <no steward> '''
    self.assertEqual(output, target_output)
    # Full nested schema. It deliberately disagrees with the data: a wrong
    # 'Type' for 'This' and for 'col2', a 'col3' entry that has no column,
    # and no entry for the existing 'col4'.
    schema = {
        'Description': 'This is a structure',
        'Steward': 'Nick',
        'Type': 'Struct',
        'Contents': {
            'This': {
                'Description': 'This is a nested structure',
                'Steward': 'Bob',
                'Type': 'AttackHelicoptor',
                'Contents': {
                    'a': {
                        'Description': 'A description for a',
                        'Steward': 'Fred',
                        'Contents': {
                            'col1': {
                                'Description': 'This describes column 1',
                                'Steward': 'Jay',
                                'Type': 'int32',
                            },
                            'col2': {
                                'Description': 'This describes column 2',
                                'Steward': 'Alex',
                                'Type': 'float32',
                            },
                            'col3': {
                                'Description': 'This column is not there',
                                'Steward': 'Ben',
                            },
                        },
                    },
                    'b': {
                        'Description': 'A descriptiion for b',
                        'Steward': 'George',
                    },
                },
            },
            'That': {
                'Description': 'This is an array',
                'Steward': 'Willy'
            },
        },
    }
    res = st2.apply_schema(schema)
    # Expected return value of apply_schema: a nested dict of the mismatches.
    res_c = {
        'This': {
            'Type Mismatch': 'Type Struct does not match schema type AttackHelicoptor',
            'a': {
                'col2': {
                    'Type Mismatch': 'Type int32 does not match schema type float32'
                },
                'Extra Column': 'col4',
                'Missing Column': 'col3',
            },
        }
    }
    self.assertEqual(res, res_c)
    # info() at the top level after the nested schema was applied.
    out = StringIO()
    sys.stdout = out
    print(st2.info())
    output = out.getvalue()
    target_output = '''\x1b[1;36mDescription: \x1b[00mThis is a structure \x1b[1;36mSteward: \x1b[00mNick \x1b[1;36mType: \x1b[00mStruct \x1b[1;36mContents:\x1b[00m \x1b[1;36mType Name Description Steward \x1b[00m \x1b[1;36m------ ---- -------------------------------------------------- ------------\x1b[00m Struct \x1b[1;32mThis\x1b[00m This is a nested structure Bob int32 \x1b[1;32mThat\x1b[00m This is an array Willy '''
    self.assertEqual(output, target_output)
    # One level down: the nested Struct.
    out = StringIO()
    sys.stdout = out
    print(st2.This.info())
    output = out.getvalue()
    target_output = '''\x1b[1;36mDescription: \x1b[00mThis is a nested structure \x1b[1;36mSteward: \x1b[00mBob \x1b[1;36mType: \x1b[00mStruct \x1b[1;36mContents:\x1b[00m \x1b[1;36mType Name Description Steward \x1b[00m \x1b[1;36m------- ---- -------------------------------------------------- ------------\x1b[00m Dataset \x1b[1;32ma \x1b[00m A description for a Fred int32 \x1b[1;32mb \x1b[00m A descriptiion for b George '''
    self.assertEqual(output, target_output)
    # Two levels down: the Dataset.
    out = StringIO()
    sys.stdout = out
    print(st2.This.a.info())
    output = out.getvalue()
    target_output = '''\x1b[1;36mDescription: \x1b[00mA description for a \x1b[1;36mSteward: \x1b[00mFred \x1b[1;36mType: \x1b[00mDataset \x1b[1;36mContents:\x1b[00m \x1b[1;36mType Name Description Steward \x1b[00m \x1b[1;36m----- ---- -------------------------------------------------- ------------\x1b[00m 
int32 \x1b[1;32mcol1\x1b[00m This describes column 1 Jay int32 \x1b[1;32mcol2\x1b[00m This describes column 2 Alex int32 \x1b[1;32mcol4\x1b[00m <no description> <no steward> '''
    self.assertEqual(output, target_output)
    # Three levels down: a single column.
    out = StringIO()
    sys.stdout = out
    print(st2.This.a.col1.info())
    output = out.getvalue()
    target_output = '''\x1b[1;36mDescription: \x1b[00mThis describes column 1 \x1b[1;36mSteward: \x1b[00mJay \x1b[1;36mType: \x1b[00mint32 '''
    self.assertEqual(output, target_output)
    # Put the real stdout back.
    sys.stdout = orig_stdout