def test_ddof_default(self):
    '''
    FastArray differs from numpy when calculating var, nanvar, std, and nanstd.
    We set ddof's default to 1 instead of 0.
    '''
    arr = FA([1, 2])
    func_names = ['std', 'var', 'nanstd', 'nanvar']
    for name in func_names:
        # make sure the default differs from numpy's
        func = arr.__getattribute__(name)
        fa_default = func()
        np_default = func(ddof=0)
        self.assertNotAlmostEqual(
            fa_default,
            np_default,
            msg=f"Failed to set ddof to 1 for reduce function {name}",
        )
        # make sure ddof is 1 when sent to the numpy reduce
        sent_to_numpy = func(keepdims=0)
        self.assertAlmostEqual(
            fa_default,
            sent_to_numpy,
            msg=f"Failed to set ddof to 1 before passing to numpy for function {name}",
        )
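# A minimal sketch (not part of the test suite) of the ddof contrast described
# above; the values follow from the variance formula on [1, 2], whose sum of
# squared deviations is 0.5.
def _demo_ddof_default():
    arr = FA([1, 2])
    assert arr.var() == 0.5         # FastArray default ddof=1: 0.5 / (2 - 1)
    assert arr.var(ddof=0) == 0.25  # numpy default ddof=0:     0.5 / 2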
def test_rank(self):
    a = FA([nan, 2, 2, 3])
    b = a.nanrankdata()
    assert_array_equal(b, [nan, 1.5, 1.5, 3.0])
    a = FA([0, 2, 2, 3])
    b = a.rankdata()
    assert_array_equal(b, [1, 2.5, 2.5, 4])
def test_np_vs_member(self):
    '''
    Check to make sure the result is the same no matter how the ufunc is accessed.
    '''
    import builtins

    func_names = ['min', 'max', 'sum']
    arr = FA([1, 2, 3, 4, 5])
    num_types = int_types[1:] + float_types
    for dt in num_types:
        arr = arr.astype(dt)
        # print(dt)
        for name in func_names:
            member_func = None
            np_func = None
            builtin_func = None
            results = []
            member_func = arr.__getattribute__(name)
            results.append(member_func())
            if hasattr(np, name):
                np_func = np.__getattribute__(name)
                results.append(np_func(arr))
            if hasattr(builtins, name):
                builtin_func = builtins.__getattribute__(name)
                results.append(builtin_func(arr))
            self.assertEqual(
                len(set(results)),
                1,
                msg=f"Results did not match for datatype {dt} and function {name}. "
                f"Fastarray: {member_func} Numpy: {np_func} Builtin: {builtin_func}",
            )
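# A minimal sketch of the three access paths compared above: the member
# function, the numpy ufunc, and the python builtin should all agree.
def _demo_access_paths():
    arr = FA([1, 2, 3])
    assert arr.sum() == np.sum(arr) == sum(arr) == 6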
def test_shift(self):
    '''
    Check to make sure FastArray's shift mimics pandas shift - not numpy roll.
    '''
    arr0 = FA([1, 2, 3, 4, 5])
    all_ints = int_types  # includes bool, which is remapped to np.bool_ below
    shift_dict = {
        tuple(float_types): np.nan,
        tuple([np.str_]): '',
        tuple([np.bytes_]): b'',
    }
    for tp in all_ints:
        if tp is bool:
            tp = np.bool_
        shift_dict[(tp,)] = INVALID_DICT[tp(1).dtype.num]
    for dtypes, invalid in shift_dict.items():
        for dt in dtypes:
            arr = arr0.astype(dt)
            pos_shift = arr.shift(1)
            neg_shift = arr.shift(-1)
            # NaN compares unequal to itself, so invalid != invalid detects the float case
            if invalid != invalid:
                self.assertNotEqual(
                    pos_shift[0],
                    pos_shift[0],
                    msg=f"Positive shift on datatype {dt} did not fill with {invalid}.",
                )
                self.assertNotEqual(
                    neg_shift[-1],
                    neg_shift[-1],
                    msg=f"Negative shift on datatype {dt} did not fill with {invalid}.",
                )
            else:
                self.assertEqual(
                    pos_shift[0],
                    invalid,
                    msg=f"Positive shift on datatype {dt} did not fill with {invalid}.",
                )
                self.assertEqual(
                    neg_shift[-1],
                    invalid,
                    msg=f"Negative shift on datatype {dt} did not fill with {invalid}.",
                )
            self.assertEqual(
                pos_shift[1],
                arr[0],
                msg=f"Positive shift on datatype {dt} did not shift existing values to the correct place.",
            )
            self.assertEqual(
                neg_shift[0],
                arr[1],
                msg=f"Negative shift on datatype {dt} did not shift existing values to the correct place.",
            )
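# A minimal sketch of the pandas-vs-numpy contrast tested above: shift fills
# the vacated slot with an invalid value instead of wrapping like np.roll.
def _demo_shift_vs_roll():
    shifted = FA([1, 2, 3]).shift(1)           # -> [invalid, 1, 2] (fill)
    rolled = np.roll(np.array([1, 2, 3]), 1)   # -> [3, 1, 2]       (wraparound)
    assert shifted[1] == 1 and rolled[0] == 3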
def test_partition(self):
    a = FA([1, 0, 3, 4, 2])
    b = a.partition2(kth=2)
    assert_array_equal(b, [1, 0, 2, 4, 3])
    a = FA([10, 0, 30, 40, 20])
    b = a.argpartition2(kth=2)
    assert_array_equal(b, [0, 1, 4, 3, 2])
def test_iter(self):
    correct_keys = FastArray(['e', 'd', 'b', 'c', 'a'])
    correct_idx = [[0, 1, 4, 7], [2, 9], [3], [5, 6], [8]]
    str_arr = FastArray(['e', 'e', 'd', 'b', 'e', 'c', 'c', 'e', 'a', 'd'])
    gb = Dataset({'keycol': str_arr, 'idxcol': arange(10)})
    gb = gb.gb('keycol')
    for i, tup in enumerate(gb):
        self.assertEqual(tup[0], correct_keys[i])
        self.assertTrue(bool(np.all(tup[1].idxcol == correct_idx[i])))
def test_gb_categoricals(self):
    codes = [1, 44, 44, 133, 75, 75, 75, 1]
    stringlist = ['a', 'b', 'c', 'd', 'e', 'e', 'f', 'g']
    c1 = Categorical(codes, LikertDecision, sort_gb=True)
    c2 = Categorical(stringlist)
    d = {'nums': np.arange(8)}

    # from enum only
    d_enum = d.copy()
    d_enum['cat_from_enum'] = c1
    ds_enum = Dataset(d_enum)
    enum_result = ds_enum.gb('cat_from_enum').sum()
    correct = FastArray([3, 15, 3, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, enum_result.nums),
        msg=f"Incorrect sum when grouping by enum categorical.\nExpected {correct}\nActual {enum_result.nums}",
    )

    # from list only
    d_list = d.copy()
    d_list['cat_from_list'] = c2
    ds_list = Dataset(d_list)
    list_result = ds_list.gb('cat_from_list').sum()
    correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, list_result.nums),
        msg=f"Incorrect sum when grouping by list categorical.",
    )

    d_both = d_enum.copy()
    d_both['cat_from_list'] = c2
    ds_both = Dataset(d_both)

    # by enum, list
    result = ds_both.gb(['cat_from_enum', 'cat_from_list']).sum()
    num_result = result.nums
    correct = FastArray([0, 7, 1, 2, 9, 6, 3], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, num_result),
        msg=f"Incorrect sum when grouping by enum, list categoricals.",
    )

    # by list, enum
    result = ds_both.gb(['cat_from_list', 'cat_from_enum']).sum()
    num_result = result.nums
    correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
    self.assertTrue(
        self.array_equal(correct, num_result),
        msg=f"Incorrect sum when grouping by list, enum categoricals.",
    )
def test_dtype_int_reduce(self):
    '''
    If the result of a reduce function is an integer, it will be cast to int64.
    If the input is uint64, the output will also be uint64.
    '''
    func_names = ['nansum']
    signed_arr = FA(num_list, dtype=np.int32)
    unsigned_arr = FA(num_list, dtype=np.uint64)
    for name in func_names:
        s_func = signed_arr.__getattribute__(name)
        us_func = unsigned_arr.__getattribute__(name)
        s_dt = s_func().dtype
        us_dt = us_func().dtype
        self.assertEqual(s_dt, np.int64)
        self.assertEqual(us_dt, np.uint64)
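# A minimal sketch of the casting rule described above: integer reductions
# widen to int64, except for uint64 input, which stays uint64.
def _demo_int_reduce_cast():
    assert FA([1, 2], dtype=np.int32).nansum().dtype == np.int64
    assert FA([1, 2], dtype=np.uint64).nansum().dtype == np.uint64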
def test_dtype_reduce(self):
    '''
    If a dtype is passed to a reduce function, make sure the result is the correct dtype.
    '''
    dt = np.int8
    arr = FA([1, 2])
    func_names = ['std', 'var', 'nanstd', 'nanvar']
    for name in func_names:
        func = arr.__getattribute__(name)
        result = func(dtype=dt)
        self.assertEqual(
            dt,
            result.dtype,
            msg=f"Dtypes did not match for func {name}. {dt} was the keyword, the output was {result.dtype}",
        )
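# A minimal sketch of the dtype keyword tested above: the reduction result
# adopts the requested dtype.
def _demo_dtype_keyword():
    result = FA([1, 2]).var(dtype=np.int8)
    assert result.dtype == np.int8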
def test_single_key_string_count(self):
    correct_counts = FastArray([4, 5, 9, 6, 6])

    # for sorting/count bug fix 8/21/2018
    c_make_unique = Categorical(str_fa)
    result_counts = c_make_unique.count().Count
    match = bool(np.all(result_counts == correct_counts))
    assert match

    c_from_codes = Categorical(sorted_codes, complete_unique_cats, base_index=0)
    result_counts = c_from_codes.count().Count
    match = bool(np.all(result_counts == correct_counts))
    assert match

    c_from_codes_unsorted = Categorical(sorted_codes, unsorted_unique_cats, base_index=0)
    result_counts = c_from_codes_unsorted.count().Count
    match = bool(np.all(result_counts == correct_counts))
    assert match

    # 8/24/2018 SJK - default name for groupby key columns might change, so selected this by index.
    # Also, in most cases (save intenum/dict) categorical groupby no longer returns a categorical.
    result_keys = c_from_codes_unsorted.count()[1]
    match = bool(np.all(result_keys == unsorted_unique_cats))
    assert match, f"Result: {result_keys} Expected: {unsorted_unique_cats}"
def test_display_properties(self):
    '''
    FastArrays of default types have default item formatting for display (see Utils.rt_display_properties).
    This checks to see that the correct item format is being returned from a FastArray.
    '''
    f = FA(num_list, dtype=np.int32)
    item_format, convert_func = f.display_query_properties()
    self.assertEqual(
        item_format.length,
        DisplayLength.Short,
        msg=f"Incorrect length for item format.",
    )
    self.assertEqual(item_format.justification, DisplayJustification.Right)
    # self.assertEqual(item_format.invalid, None)
    self.assertEqual(item_format.can_have_spaces, False)
    self.assertEqual(item_format.color, DisplayColumnColors.Default)
    self.assertEqual(convert_func.__name__, 'convertInt')
def test_string_compare(self):
    '''
    FastArray currently does not support bytestring array comparison with ufuncs
    (numpy also prints notimplemented). However, the operators <=, <, ==, !=, >, >=
    will return the correct result (a boolean array).
    '''
    f_arr = FA(['a', 'b', 'c'])
    invalid_funcs = [
        np.less_equal,
        np.less,
        np.equal,
        np.not_equal,
        np.greater,
        np.greater_equal,
    ]
    valid_func_names = ['__ne__', '__eq__', '__ge__', '__gt__', '__le__', '__lt__']
    correct_results = [
        [False, False, False],
        [True, True, True],
        [True, True, True],
        [False, False, False],
        [True, True, True],
        [False, False, False],
    ]
    correct_dict = dict(zip(valid_func_names, correct_results))

    # ufunc comparisons will not work for strings (should we implement this on our own?)
    for func in invalid_funcs:
        with self.assertRaises(
            TypeError, msg=f"String comparison did not raise TypeError for {func}"
        ):
            result = func(f_arr, f_arr)

    # strings need to be compared this way
    for f_name in valid_func_names:
        func = f_arr.__getattribute__(f_name)
        result = func(f_arr)
        correct = correct_dict[f_name]
        for i in range(len(result)):
            self.assertEqual(
                result[i],
                correct[i],
                msg=f"String comparison failed for function {f_name}",
            )
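# A minimal sketch of the asymmetry tested above: the operator form works on
# bytestring arrays, while the equivalent ufunc raises.
def _demo_string_compare():
    f_arr = FA(['a', 'b', 'c'])
    assert (f_arr == f_arr).all()    # operator path returns a boolean array
    try:
        np.equal(f_arr, f_arr)       # ufunc path is not implemented for strings
    except TypeError:
        pass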
def test_multikey_count(self):
    mk_list = [str_fa.copy(), int_fa.copy(), str_fa.copy(), int_fa.copy()]
    c_multi = Categorical(mk_list)
    result_counts = c_multi.count().Count
    correct_counts = FastArray([6, 5, 1, 2, 3, 2, 2, 4, 2, 2, 1])
    all_correct = bool(np.all(result_counts == correct_counts))
    assert all_correct, (
        f"Incorrect result for multikey count for 4 keys. {result_counts} vs. {correct_counts}"
    )
def test_total_sizes(self):
    st = Struct({
        'a': Dataset({
            # 10x int32 => 40B
            'A': range(10),
            # 10x int32 => 40B
            'B': range(10, 20),
        }),
        'b': Struct({
            # 1x int32 => 4B
            'C': 0,
            # 1x int32 => 4B
            'D': 1,
            # 1x int32 => 4B
            'E': 2,
        }),
        # 5x int32 => 20B
        'c': FastArray(np.arange(5)),
        # 5x int32 => 20B
        'd': np.arange(5, 10),
        # ???
        'e': ['abc', 'def', 'ghi'],
        'f': {
            # 1x int32 => 4B
            'q': 1,
            # 1x int32 => 4B
            'r': 2,
        },
        # 1x float64 => 8B
        'g': 3.14,
        # 1x int32 => 4B
        'h': 84,
        # ???
        'i': None,
        # ???
        'j': slice(None),
    })

    # Create some duplicated/aliased data within the struct.
    st.z = st.c

    # Calculate the sizes of the Struct's data in bytes.
    (physical, logical) = st.total_sizes

    # For now, we only check that the logical size is larger than the physical size
    # (due to the presence of aliased array(s) somewhere within the Struct).
    # TODO: Strengthen this test by checking the actual computed sizes to make sure they're correct.
    self.assertLess(
        physical, logical, "The physical size is not less than the logical size."
    )
def test_rt_pa_str(self, rt_farr: rt.FastArray) -> None:
    """Test that per-element string lengths survive conversion from rt.FastArray to pyarrow.Array."""
    result_pa_arr = rt_farr.to_arrow()
    result_list = result_pa_arr.to_pylist()
    assert len(rt_farr) == len(result_list)
    for i in range(len(rt_farr)):
        str_farr = rt_farr[i]
        str_result = result_list[i]
        assert len(str_farr) == len(str_result)
def test_categorical_numeric_array_key_completion(self):
    ip = get_ipython()
    complete = ip.Completer.complete
    lst = [1, 44, 44, 133, 75]  # type: List[int]
    ip.user_ns["cat"] = Categorical(FastArray(lst))
    _, matches = complete(line_buffer="cat['")
    expected = [str(i) for i in lst]
    for c in expected:
        self.assertIn(c, matches)
def test_categorical_string_array_key_completion(self):
    ip = get_ipython()
    complete = ip.Completer.complete
    lst = ['a', 'b', 'c', 'c', 'd', 'a', 'b']  # type: List[str]
    ip.user_ns["cat"] = Categorical(FastArray(lst), ordered=True, base_index=1, filter=None)
    _, matches = complete(line_buffer="cat['")
    for s in lst:
        self.assertIn(s, matches)
def test_roundtrip_rt_pa_rt(self, rt_farr: rt.FastArray) -> None:
    """Test round-tripping from rt.FastArray to pyarrow.Array and back."""
    result_pa_arr = rt_farr.to_arrow()
    result_farr = rt.FastArray.from_arrow(result_pa_arr, zero_copy_only=False)
    assert_array_equal(rt_farr, result_farr)
def test_reductions(self):
    message_types = [
        'CREATE', 'RUN', 'CREATE', 'RUN', 'RUN', 'RUN',
        'RUN', 'CANCEL', 'RUN', 'RUN', 'RUN', 'CANCEL',
    ]
    order_ids = [1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1]
    seconds = [50, 70, 72, 75, 90, 88, 95, 97, 98, 115, 116, 120]
    shares = [0, 200, 0, 500, 100, 400, 100, 0, 300, 150, 150, 0]
    d2 = dict(
        message_type=message_types,
        order_id=order_ids,
        second=seconds,
        shares=shares,
    )
    dat = Dataset(d2)
    dat = dat[['order_id', 'message_type', 'second', 'shares']]

    # Numeric reduction
    dsr = dat.groupby('order_id').sum()
    self.assertEqual(dsr.shape, (2, 3))
    self.assertArrayEqual(dsr.order_id, [1, 2])
    self.assertArrayEqual(dsr.second, [410, 676])
    self.assertArrayEqual(dsr.shares, [800, 1100])

    # Numeric reduction with all columns returned
    dsr = dat.groupby('order_id', return_all=True).sum()
    self.assertEqual(dsr.shape, (2, 4))
    self.assertEqual(dsr.keys()[1], 'message_type')

    # Order-based reduction
    dsr = dat.groupby('order_id').first()
    self.assertEqual(dsr.shape, (2, 4))
    self.assertArrayEqual(dsr.order_id, [1, 2])
    self.assertArrayEqual(dsr.message_type, ['CREATE', 'CREATE'])
    self.assertArrayEqual(dsr.second, [50, 72])
    self.assertArrayEqual(dsr.shares, [0, 0])

    # Order-based reduction, which returns all columns regardless
    dsr = dat.groupby('order_id', return_all=True).first()
    self.assertEqual(dsr.shape, (2, 4))

    # Order-based reduction with multiple keys
    dsr = dat.groupby(['order_id', 'message_type']).first()
    self.assertEqual(dsr.shape, (6, 4))
    self.assertArrayEqual(dsr.order_id, [1, 1, 1, 2, 2, 2])
    self.assertArrayEqual(
        dsr.message_type, ['CANCEL', 'CREATE', 'RUN', 'CANCEL', 'CREATE', 'RUN']
    )
    self.assertArrayEqual(dsr.second, [120, 50, 70, 97, 72, 90])
    self.assertArrayEqual(dsr.shares, [0, 0, 200, 0, 0, 100])

    # On a subset of columns
    gb = dat.groupby('order_id')
    dsr = gb['shares'].sum()
    self.assertEqual(dsr.shape, (2, 2))
    self.assertArrayEqual(dsr.shares, [800, 1100])

    # Accumulating function
    dsr = dat.groupby('order_id').cumsum()
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(
        dsr.shares, [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800]
    )

    # return_all has no effect with accumulating functions
    # 8/23/2018 SJK - changed behavior so return all shows the keys
    dsr = dat.groupby('order_id', return_all=True).cumsum()
    self.assertEqual(dsr.shape, (12, 3))

    # Add cum_shares back to a dataset
    dat['cum_shares'] = dat.groupby('order_id').shares.cumsum().shares
    self.assertEqual(dat.shape, (12, 5))
    self.assertArrayEqual(dat.cum_shares, gb.shares.cumsum().shares)

    # On a subset of columns
    dsr = dat.groupby('order_id')[['shares', 'second']].cumsum()
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(
        dsr.shares, [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800]
    )
    self.assertArrayEqual(
        dsr.second, [50, 120, 72, 195, 162, 250, 290, 347, 445, 560, 676, 410]
    )

    # On a subset of columns with a filter
    f = FastArray([
        True, False, True, False, True, False,
        True, False, True, False, True, False,
    ])
    dsr = dat.groupby('order_id')[['shares', 'second']].cumsum(filter=f)
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(
        dsr.shares, [0, 0, 0, 0, 100, 100, 100, 100, 400, 400, 550, 100]
    )
    self.assertArrayEqual(
        dsr.second, [50, 50, 72, 50, 162, 162, 145, 162, 260, 260, 376, 145]
    )

    # On shares and second with filter at groupby construction
    dsr = dat.groupby('order_id', filter=f)[['shares', 'second']].cumsum()
    inv = INVALID_DICT[dsr.shares[0].dtype.num]
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(
        dsr.shares, [0, inv, 0, inv, 100, inv, 100, inv, 400, inv, 550, inv]
    )
    self.assertArrayEqual(
        dsr.second, [50, inv, 72, inv, 162, inv, 145, inv, 260, inv, 376, inv]
    )

    # Using agg function
    dsr = gb[['second', 'shares']].agg(['sum', 'mean'])
    self.assertEqual(dsr.shape, (2, 2))
    self.assertArrayEqual(dsr.Sum.second, [410, 676])
    self.assertArrayEqual(dsr.Sum.shares, [800, 1100])
    self.assertArrayAlmostEqual(dsr.Mean.second, [82.00, 96.57], places=2)
    self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14], places=2)

    # Check for issue when bracket indexing on groupby
    with open(os.devnull, 'w') as f:
        print(gb, file=f)
    dsr = gb[['second', 'shares']].agg(['sum', 'mean'])

    # Using different functions on different columns
    dsr = gb.agg({'second': 'sum', 'shares': ['max', 'mean']})
    self.assertEqual(dsr.shape, (2, 3))
    self.assertArrayEqual(dsr.Sum.second, [410, 676])
    self.assertArrayEqual(dsr.Max.shares, [500, 400])
    self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14], places=2)

    # Using numpy functions
    dsr = gb.agg({'second': np.sum, 'shares': [np.max, np.mean]})
    self.assertEqual(dsr.shape, (2, 3))
    self.assertArrayEqual(dsr.Sum.second, [410, 676])
    self.assertArrayEqual(dsr.Max.shares, [500, 400])
    self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14], places=2)

    # Alternate way to add to multiset
    gb = dat.groupby('order_id')
    ms = gb[['shares']].agg(['max', 'mean'])
    ms.Sum = gb[['second']].sum()
    self.assertEqual(ms.shape, (2, 3))
    self.assertArrayEqual(ms.Sum.second, [410, 676])
    self.assertArrayEqual(ms.Max.shares, [500, 400])
    self.assertArrayAlmostEqual(ms.Mean.shares, [160.00, 157.14], places=2)
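# A minimal sketch (hypothetical data, not from the test above) of the basic
# pattern exercised throughout test_reductions: group on a key column, reduce.
def _demo_groupby_sum():
    ds = Dataset({'order_id': [1, 2, 1], 'shares': [10, 20, 30]})
    res = ds.groupby('order_id').sum()
    assert list(res.shares) == [40, 20]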
_GREEDY_JEDI_CONFIG_TABLE = [
    GreedyJediConfigType.NEITHER,
    GreedyJediConfigType.GREEDY,
    GreedyJediConfigType.JEDI,
    GreedyJediConfigType.GREEDY_JEDI,
]

CODES = [1, 44, 44, 133, 75]

# Todo - put this in a function generator that can get data based on data type.
# Add data entries to _RT_DATA_TABLE in an append-only way since tests depend on ordering.
# Tests should not depend on ordering of _RT_DATA_TABLE, see above work item.
_RT_DATA_TABLE = [
    Categorical(
        FastArray(['a', 'b', 'c', 'c', 'd', 'a', 'b']),
        ordered=True,
        base_index=1,
        filter=None,
    ),
    Categorical(CODES, LikertDecision),
    Categorical(CODES, decision_dict),
    Categorical(['b', 'a', 'a', 'c', 'a', 'b'], ['b', 'a', 'c', 'e'], sort_gb=True),
    Dataset({
        _k: list(range(_i * 10, (_i + 1) * 10))
        for _i, _k in enumerate([
            "alpha",
            "beta",
            "gamma",
            "delta",
    # Test #3: The hstacked multiplicity of categories should be equivalent to the multiplicity of aggregate categories.
    # Test (2) is a subset of this equality check, but remains for clarity reasons when investigating failures.
    assert expected_counts == actual_counts, (
        f"The hstacked multiplicity of categories should be equivalent to the multiplicity of aggregate categories\n"
        + msg
        + f"actual {actual_counts}\nexpected {expected_counts}"
    )


@pytest.mark.xfail(reason="RIP-375 - Categorical unsupported dtypes")
@pytest.mark.skipif(is_running_in_teamcity(), reason="Please remove alongside xfail removal.")
@pytest.mark.parametrize(
    "data",
    [
        # ValueError: BuildArrayInfo array has bad dtype of 21
        FastArray(["1970"], dtype="datetime64[Y]"),
        # ValueError: BuildArrayInfo array has bad dtype of 22
        FastArray([0], dtype="timedelta64[Y]"),
    ],
)
def test_falsifying_categorical_ctor(data):
    Categorical(data)


@pytest.mark.skipif(
    True,
    reason="RIP-452: Multikey Categorical isin is consistent with its single key isin alternative",
)
def test_multikey_categorical_isin():
    # See Python/core/riptable/tests/test_categorical.py test_multikey_categorical_isin as an example
def test_replace(self):
    a = FA([0, 2, 2, 3])
    b = a.replace(2, 1)
    assert_array_equal(b, [0, 1, 1, 3])
def test_push(self):
    a = FA([5, nan, nan, 6, nan])
    b = a.push()
    assert_array_equal(b, [5, 5, 5, 6, 6])
def test_map(self):
    a = FA([1, 1, 1, 2, 2, 2])
    d = {1: 10, 2: 20}
    c = a.map(d)
    assert_array_equal(c, [10, 10, 10, 20, 20, 20])
def test_ledger(self):
    self.maxDiff = None
    sio = StringIO()
    with redirectStdoutCtx(sio):
        FA._V1()
        FA._V2()
        FA._OFF()
        FA._ON()
        FA._TOFF()
        FA._TON()
        FA._LON()
        FA._ROFF()
        FA._RON()
        FA._RDUMP()
        FA._LOFF()
        FA._LDUMP()
        FA._LCLEAR()
        FA._GCNOW()
        FA._GCSET(100)
        FA.Verbose = 3
        FA._V1()
        a = FA([1, 2, 3])
        b = a.copy()
        a += a
        del a
        a = np.arange(10000)
        a = FA(a)
        a = a + a
        a = a + a
        b = a._np
        del b
        del a
        FA._V0()
def test_cut(self):
    c = cut(arange(10), 3)
    self.assertTrue(sum(c._np - FA([1, 1, 1, 1, 2, 2, 2, 3, 3, 3])) == 0)

    c = cut(arange(10.0), 3)
    self.assertTrue(sum(c._np - FA([1, 1, 1, 1, 2, 2, 2, 3, 3, 3])) == 0)

    c = cut(arange(11), 3)
    self.assertTrue(sum(c._np - FA([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3])) == 0)

    c = cut(FA([2, 4, 6, 8, 10]), FA([0, 2, 4, 6, 8, 10]))
    self.assertTrue(sum(c._np - FA([1, 2, 3, 4, 5])) == 0)

    c = cut(
        FA([2, 4, 6, 8, 10]),
        FA([0, 2, 4, 6, 8, 10]),
        labels=['a', 'b', 'c', 'd', 'e'],
    )
    self.assertTrue(sum(c._np - FA([1, 2, 3, 4, 5])) == 0)

    a = np.array([1, 7, 5, 4, 6, 3])
    l = FA([b'1.0->3.0', b'3.0->5.0', b'5.0->7.0'])

    c = cut(a, 3)
    self.assertIsInstance(c, Categorical)
    self.assertTrue(sum(c._np - FA([1, 3, 2, 2, 3, 1])) == 0)
    self.assertTrue((c.category_array == l).all())

    c = cut(a, 3, labels=True)
    self.assertIsInstance(c, Categorical)
    self.assertTrue(sum(c._np - FA([1, 3, 2, 2, 3, 1])) == 0)
    self.assertTrue((c.category_array == l).all())

    c = cut(a, 3, labels=None)
    self.assertIsInstance(c, Categorical)
    self.assertTrue(sum(c._np - FA([1, 3, 2, 2, 3, 1])) == 0)
    self.assertTrue((c.category_array == l).all())

    c = cut(a, 3, labels=False)
    self.assertIsInstance(c, FastArray)
    self.assertTrue(sum(c._np - FA([1, 3, 2, 2, 3, 1])) == 0)

    c, b = cut(a, 3, retbins=True)
    self.assertIsInstance(c, Categorical)
    self.assertIsInstance(b, np.ndarray)
    self.assertTrue(sum(c._np - FA([1, 3, 2, 2, 3, 1])) == 0)
    self.assertTrue((c.category_array == l).all())
    self.assertTrue(sum(b - FA([1.0, 3.0, 5.0, 7.0])) == 0)

    l = ["bad", "medium", "good"]
    c = cut(a, 3, labels=l)
    self.assertIsInstance(c, Categorical)
    self.assertTrue(sum(c._np - FA([1, 3, 2, 2, 3, 1])) == 0)
    self.assertTrue((c.category_array == l).all())

    # contiguous test
    x = arange(4).reshape(2, 2)
    knots = [-0.5, 0.5, 1.5, 2.5, 3.5]
    c = cut(x[:, 1], knots)
    l = FastArray([b'-0.5->0.5', b'0.5->1.5', b'1.5->2.5', b'2.5->3.5'])
    self.assertTrue((c.category_array == l).all())

    # inf upcast test
    x = np.array([0, 1, 10, 100, 5])
    knots = [-np.inf, 2, 11, 50, np.inf]
    c = cut(x, knots)
    self.assertTrue((c._fa == FA([1, 1, 2, 4, 2])).all())
def test_inplace_int_float(self):
    '''
    Unlike numpy arrays, FastArray allows inplace operations between integer
    arrays and floating point scalars. The datatype of the array will remain
    the same. However, division is currently not supported.
    Potential BUG: The floor division operator does not raise an error.
    '''
    nums = [1, 2, 3, 4, 5]
    for dt in int_types[1:]:  # don't include bool
        arr = FA(nums, dtype=dt)
        for dtf in float_types:
            scalar = dtf(1)
            arr += scalar
            self.assertEqual(
                arr.dtype,
                dt,
                msg=f"Result datatype {arr.dtype} did not match original datatype {dt} after += operation",
            )
            arr -= scalar
            self.assertEqual(
                arr.dtype,
                dt,
                msg=f"Result datatype {arr.dtype} did not match original datatype {dt} after -= operation",
            )
            arr *= scalar
            self.assertEqual(
                arr.dtype,
                dt,
                msg=f"Result datatype {arr.dtype} did not match original datatype {dt} after *= operation",
            )
            arr /= scalar
            self.assertEqual(
                arr.dtype,
                dt,
                msg=f"Result datatype {arr.dtype} did not match original datatype {dt} after /= operation",
            )
            arr //= scalar
            self.assertEqual(
                arr.dtype,
                dt,
                msg=f"Result datatype {arr.dtype} did not match original datatype {dt} after //= operation",
            )

    for dt in int_types[1:]:  # don't include bool
        arr = FA(nums, dtype=dt)
        for dtf in float_types:
            arr2 = arr.astype(dtf)
            arr += arr2
            self.assertEqual(
                arr.dtype,
                dt,
                msg=f"Result datatype {arr.dtype} did not match original datatype {dt} after += operation",
            )
            arr -= arr2
            self.assertEqual(
                arr.dtype,
                dt,
                msg=f"Result datatype {arr.dtype} did not match original datatype {dt} after -= operation",
            )
            arr *= arr2
            self.assertEqual(
                arr.dtype,
                dt,
                msg=f"Result datatype {arr.dtype} did not match original datatype {dt} after *= operation",
            )
            arr /= arr2
            self.assertEqual(
                arr.dtype,
                dt,
                msg=f"Result datatype {arr.dtype} did not match original datatype {dt} after /= operation",
            )
            arr //= arr2
            self.assertEqual(
                arr.dtype,
                dt,
                msg=f"Result datatype {arr.dtype} did not match original datatype {dt} after //= operation",
            )
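# A minimal sketch of the dtype-preserving inplace rule exercised above
# (numpy itself raises on an inplace true division of an integer array);
# the /= step mirrors what the test does, docstring caveat notwithstanding.
def _demo_inplace_int_float():
    arr = FA([1, 2, 3], dtype=np.int32)
    arr += np.float64(1.5)  # stays int32
    arr /= np.float64(2.0)  # stays int32, unlike numpy
    assert arr.dtype == np.int32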
str_fa = FastArray([
    'c', 'e', 'e', 'd', 'c', 'b', 'd', 'c', 'a', 'b',
    'd', 'e', 'c', 'a', 'e', 'd', 'b', 'a', 'b', 'c',
    'd', 'b', 'e', 'c', 'c', 'd', 'e', 'c', 'a', 'c',
])
def _test_output(self):
    if sys.platform != 'win32':
        intt_name = b'int64'
    else:
        intt_name = b'int32'

    class Dataset1(Struct):
        pass  # dummy for testing, mimics behavior of real Dataset

    st = Struct(
        {
            'a': Dataset1({'A': range(10), 'B': range(10, 20)}),
            'b': Struct({'C': 0, 'D': 1, 'E': 2}),
            'c': FastArray(np.arange(5)),
            'd': np.arange(5, 10),
            'e': ['abc', 'def', 'ghi'],
            'f': {'q': 1, 'r': 2},
            'g': 3.14,
            'h': 84,
            'i': None,
            'j': slice(None),
        }
    )
    headers, spec = st.get_table_data()
    self.assertEqual(len(headers), 1)
    self.assertEqual(
        [hd.col_name for hd in headers[0]], ['Name', 'Type', 'Rows', '0', '1', '2']
    )
    self.assertEqual(
        [_r.tolist() for _r in spec],
        [
            [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j'],
            [
                b'Dataset1',
                b'Struct',
                intt_name,
                intt_name,
                b'list',
                b'dict',
                b'float',
                b'int',
                b'NoneType',
                b'slice',
            ],
            [b'2', b'3', b'5', b'5', b'3', b'2', b'0', b'0', b'0', b'0'],
            [b'A', b'', b'0', b'5', b'', b'', b'3.14', b'84', b'', b''],
            [b'B', b'', b'1', b'6', b'', b'', b'', b'', b'', b''],
            [b'', b'', b'2', b'7', b'', b'', b'', b'', b'', b''],
        ],
    )
    self.assertEqual(
        str(st),
        f'''# Name Type     Rows    0 1 2
- ---- -------- ---- ---- - -
0 a    Dataset1    2    A B
1 b    Struct      3
2 c    {intt_name.decode()}       5    0 1 2
3 d    {intt_name.decode()}       5    5 6 7
4 e    list        3
5 f    dict        2
6 g    float       0 3.14
7 h    int         0   84
8 i    NoneType    0
9 j    slice       0
''',
    )
    self.assertEqual(Struct._sizeof_fmt(128), '128.0 B')
    tsize = 1280
    for unit in ['K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']:
        self.assertEqual(Struct._sizeof_fmt(tsize), f'1.2 {unit}B')
        tsize *= 1024
    self.assertEqual(st._last_row_stats(), '[10 columns]')
def test_compare(self):
    '''
    Compares FastArray results to numpy results for binary comparison and logical ufuncs.
    All results will be boolean arrays.
    There is a difference between calling a numpy ufunc and using a comparison operator,
    so the operators need to be checked separately.
    '''
    basic_types = [np.int32, np.int64, np.float32, np.float64]
    numeric_types = int_types + float_types
    comparison_ufuncs = [
        np.less_equal,
        np.less,
        np.equal,
        np.not_equal,
        np.greater,
        np.greater_equal,
    ]
    logical_ufuncs = [np.logical_and, np.logical_xor, np.logical_or]
    comparison_operators = [
        '__ne__',
        '__eq__',
        '__ge__',
        '__gt__',
        '__le__',
        '__lt__',
    ]
    all_funcs = comparison_ufuncs + logical_ufuncs
    for dt1 in numeric_types:
        for dt2 in numeric_types:
            fa_arr1 = FA(num_list, dtype=dt1)
            fa_arr2 = FA(list(reversed(num_list)), dtype=dt2)
            np_arr1 = np.array(num_list, dtype=dt1)
            np_arr2 = np.array(list(reversed(num_list)), dtype=dt2)
            for func in all_funcs:
                fa_result = func(fa_arr1, fa_arr2)
                np_result = func(np_arr1, np_arr2)
                # check that result lengths are the same
                self.assertEqual(
                    len(fa_result),
                    len(np_result),
                    msg=f"Result sizes did not match for {func} with dtypes {dt1} {dt2}",
                )
                # compare each result item
                arr_size = len(fa_result)
                for i in range(arr_size):
                    self.assertEqual(
                        fa_result[i],
                        np_result[i],
                        msg=f"Comparison result did not match for {func} with dtypes {dt1} {dt2}",
                    )
            for f_name in comparison_operators:
                fa_func = fa_arr1.__getattribute__(f_name)
                np_func = np_arr1.__getattribute__(f_name)
                fa_result = fa_func(fa_arr2)
                np_result = np_func(np_arr2)
                # check that result lengths are the same
                self.assertEqual(
                    len(fa_result),
                    len(np_result),
                    msg=f"Result sizes did not match for operator {f_name} with dtypes {dt1} {dt2}",
                )
                # compare each result item
                arr_size = len(fa_result)
                for i in range(arr_size):
                    self.assertEqual(
                        fa_result[i],
                        np_result[i],
                        msg=f"Comparison operator {f_name} failed with dtypes {dt1} {dt2}",
                    )
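# A minimal sketch of the two code paths compared above: the ufunc call and
# the operator should produce identical boolean arrays.
def _demo_compare_paths():
    a, b = FA([1, 2, 3]), FA([3, 2, 1])
    assert (np.less_equal(a, b) == (a <= b)).all()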