def test_ddof_default(self):
        '''
        FastArray's var, nanvar, std, and nanstd default ``ddof`` to 1, whereas numpy defaults it to 0.
        '''
        values = FA([1, 2])
        for name in ('std', 'var', 'nanstd', 'nanvar'):
            reducer = getattr(values, name)

            # The default result must differ from the explicit ddof=0 (numpy default) result.
            with_default = reducer()
            with_zero_ddof = reducer(ddof=0)
            self.assertNotAlmostEqual(
                with_default,
                with_zero_ddof,
                msg=f"Failed to set ddof to 1 for reduce function {name}",
            )

            # Supplying an extra keyword is expected to route the call to the numpy
            # reduce; ddof must still default to 1 on that path.
            via_numpy = reducer(keepdims=0)
            self.assertAlmostEqual(
                with_default,
                via_numpy,
                msg=f"Failed to set ddof to 1 before passing to numpy for function {name}",
            )
 def test_rank(self):
     """Check nanrankdata and rankdata on small arrays that contain ties."""
     ranked_with_nan = FA([nan, 2, 2, 3]).nanrankdata()
     assert_array_equal(ranked_with_nan, [nan, 1.5, 1.5, 3.0])

     ranked = FA([0, 2, 2, 3]).rankdata()
     assert_array_equal(ranked, [1, 2.5, 2.5, 4])
    def test_np_vs_member(self):
        '''
        Check to make sure the result is the same no matter how the ufunc is accessed.

        For every numeric dtype (all of int_types except the first entry, plus
        float_types) min, max and sum are computed through the FastArray member
        function, the numpy module-level function and, when one exists, the
        Python builtin. All computed values must be identical.
        '''
        # Local import: only this test needs the builtins module.
        import builtins

        func_names = ['min', 'max', 'sum']
        arr = FA([1, 2, 3, 4, 5])
        num_types = int_types[1:] + float_types
        for dt in num_types:
            arr = arr.astype(dt)
            for name in func_names:
                member_func = getattr(arr, name)
                # Either lookup may be missing; None marks "not available".
                np_func = getattr(np, name, None)
                builtin_func = getattr(builtins, name, None)

                results = [member_func()]
                if np_func is not None:
                    results.append(np_func(arr))
                if builtin_func is not None:
                    results.append(builtin_func(arr))

                # Collapsing to a set leaves exactly one element when all agree.
                self.assertEqual(
                    len(set(results)),
                    1,
                    msg=
                    f"Results did not match for datatype {dt} and function {name}. Fastarray: {member_func} Numpy: {np_func} Builtin: {builtin_func} Results: {results}",
                )
    def test_shift(self):
        '''
        FastArray.shift must behave like pandas shift (vacated slots are filled
        with an invalid value), not like numpy roll (values wrap around).
        '''
        base = FA([1, 2, 3, 4, 5])

        # Each key is a tuple of dtypes sharing the same expected fill value.
        fill_by_dtypes = {
            tuple(float_types): np.nan,
            (np.str_, ): '',
            (np.bytes_, ): b'',
        }
        for int_tp in int_types:
            # The Python bool, if present in int_types, is tested as np.bool_.
            if int_tp is bool:
                int_tp = np.bool_
            fill_by_dtypes[(int_tp, )] = INVALID_DICT[int_tp(1).dtype.num]

        for dtypes, invalid in fill_by_dtypes.items():
            # Only NaN compares unequal to itself.
            fill_is_nan = invalid != invalid
            for dt in dtypes:
                arr = base.astype(dt)
                pos_shift = arr.shift(1)
                neg_shift = arr.shift(-1)

                pos_fill_msg = f"Positive shift on datatype {dt} did not fill with {invalid}."
                neg_fill_msg = f"Negative shift on datatype {dt} did not fill with {invalid}."
                if fill_is_nan:
                    # A NaN fill is detected by the slot being unequal to itself.
                    self.assertNotEqual(pos_shift[0], pos_shift[0], msg=pos_fill_msg)
                    self.assertNotEqual(neg_shift[-1], neg_shift[-1], msg=neg_fill_msg)
                else:
                    self.assertEqual(pos_shift[0], invalid, msg=pos_fill_msg)
                    self.assertEqual(neg_shift[-1], invalid, msg=neg_fill_msg)

                # The surviving values must have moved by exactly one position.
                self.assertEqual(
                    pos_shift[1],
                    arr[0],
                    msg=f"Positive shift on datatype {dt} did not shift existing values to the correct place.",
                )
                self.assertEqual(
                    neg_shift[0],
                    arr[1],
                    msg=f"Negative shift on datatype {dt} did not shift existing values to the correct place.",
                )
        def test_partition(self):
            """Check partition2 and argpartition2 with kth=2 on five-element arrays."""
            partitioned = FA([1, 0, 3, 4, 2]).partition2(kth=2)
            assert_array_equal(partitioned, [1, 0, 2, 4, 3])

            positions = FA([10, 0, 30, 40, 20]).argpartition2(kth=2)
            assert_array_equal(positions, [0, 1, 4, 3, 2])
    def test_iter(self):
        """Iterating a groupby yields (key, group) pairs in the expected order with the expected rows."""
        key_column = FastArray(['e', 'e', 'd', 'b', 'e', 'c', 'c', 'e', 'a', 'd'])
        expected_keys = FastArray(['e', 'd', 'b', 'c', 'a'])
        expected_rows = [[0, 1, 4, 7], [2, 9], [3], [5, 6], [8]]

        grouped = Dataset({'keycol': key_column, 'idxcol': arange(10)}).gb('keycol')
        for position, pair in enumerate(grouped):
            group_key = pair[0]
            group_data = pair[1]
            self.assertEqual(group_key, expected_keys[position])
            rows_match = np.all(group_data.idxcol == expected_rows[position])
            self.assertTrue(bool(rows_match))
    def test_gb_categoricals(self):
        """
        Sum a numeric column grouped by an enum-based categorical, by a list-based
        categorical, and by both categoricals together in either key order.
        """
        enum_cat = Categorical([1, 44, 44, 133, 75, 75, 75, 1], LikertDecision, sort_gb=True)
        list_cat = Categorical(['a', 'b', 'c', 'd', 'e', 'e', 'f', 'g'])
        base_columns = {'nums': np.arange(8)}

        # Grouped by the enum categorical only.
        enum_columns = dict(base_columns)
        enum_columns['cat_from_enum'] = enum_cat
        enum_sums = Dataset(enum_columns).gb('cat_from_enum').sum()
        expected = FastArray([3, 15, 3, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(expected, enum_sums.nums),
            msg=f"Incorrect sum when grouping by enum categorical.\nExpected {expected}\nActual {enum_sums.nums}",
        )

        # Grouped by the list categorical only.
        list_columns = dict(base_columns)
        list_columns['cat_from_list'] = list_cat
        list_sums = Dataset(list_columns).gb('cat_from_list').sum()
        expected = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(expected, list_sums.nums),
            msg="Incorrect sum when grouping by list categorical.",
        )

        # Dataset holding both categoricals for the multikey cases.
        both_columns = dict(enum_columns)
        both_columns['cat_from_list'] = list_cat
        both = Dataset(both_columns)

        # Keys ordered enum, list.
        sums = both.gb(['cat_from_enum', 'cat_from_list']).sum().nums
        expected = FastArray([0, 7, 1, 2, 9, 6, 3], dtype=np.int64)
        self.assertTrue(
            self.array_equal(expected, sums),
            msg="Incorrect sum when grouping by enum, list categoricals.",
        )

        # Keys ordered list, enum.
        sums = both.gb(['cat_from_list', 'cat_from_enum']).sum().nums
        expected = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(expected, sums),
            msg="Incorrect sum when grouping by list, enum categoricals.",
        )
 def test_dtype_int_reduce(self):
     '''
     If the result of a reduce function is an integer, it will be cast to int64.
     If the input is uint64, the output will also be uint64.

     The int32 input exercises the signed path and the uint64 input the
     unsigned path.
     '''
     func_names = ['nansum']
     signed_arr = FA(num_list, dtype=np.int32)
     unsigned_arr = FA(num_list, dtype=np.uint64)
     for name in func_names:
         signed_dt = getattr(signed_arr, name)().dtype
         unsigned_dt = getattr(unsigned_arr, name)().dtype
         self.assertEqual(
             signed_dt,
             np.int64,
             msg=f"{name} on int32 input returned {signed_dt}, expected int64",
         )
         self.assertEqual(
             unsigned_dt,
             np.uint64,
             msg=f"{name} on uint64 input returned {unsigned_dt}, expected uint64",
         )
 def test_dtype_reduce(self):
     '''
     A dtype keyword passed to a reduce function must be reflected in the dtype of the result.
     '''
     requested = np.int8
     values = FA([1, 2])
     for name in ('std', 'var', 'nanstd', 'nanvar'):
         outcome = getattr(values, name)(dtype=requested)
         self.assertEqual(
             requested,
             outcome.dtype,
             msg=f"Dtypes did not match for func {name}. {requested} was the keyword, the output was {outcome.dtype}",
         )
Beispiel #10
0
    def test_single_key_string_count(self):
        """
        Per-category counts must be the same whether the Categorical is built from
        the raw strings, from codes with sorted categories, or from codes with
        unsorted categories.
        """
        correct_counts = FastArray([4, 5, 9, 6, 6])

        def counts_match(cat):
            # True when every per-category count equals the expected one.
            return bool(np.all(cat.count().Count == correct_counts))

        # for sorting/count bug fix 8/21/2018
        assert counts_match(Categorical(str_fa))

        assert counts_match(
            Categorical(sorted_codes, complete_unique_cats, base_index=0))

        c_from_codes_unsorted = Categorical(sorted_codes,
                                            unsorted_unique_cats,
                                            base_index=0)
        assert counts_match(c_from_codes_unsorted)

        # 8/24/2018 SJK - default name for groupby key columns might change, so selected this by index
        # also, in most cases (save intenum/dict) categorical groupby no longer returns a categorical
        result_keys = c_from_codes_unsorted.count()[1]
        keys_match = bool(np.all(result_keys == unsorted_unique_cats))
        assert keys_match, f"Result: {result_keys} Expected: {unsorted_unique_cats}"
 def test_display_properties(self):
     '''
     FastArrays of default types carry default item formatting for display
     (see Utils.rt_display_properties). Verify the item format and conversion
     function returned for an int32 FastArray.
     '''
     int_arr = FA(num_list, dtype=np.int32)
     item_format, convert_func = int_arr.display_query_properties()

     self.assertEqual(
         item_format.length,
         DisplayLength.Short,
         msg="Incorrect length for item format.",
     )
     self.assertEqual(item_format.justification, DisplayJustification.Right)
     # item_format.invalid is not checked here.
     self.assertEqual(item_format.can_have_spaces, False)
     self.assertEqual(item_format.color, DisplayColumnColors.Default)

     self.assertEqual(convert_func.__name__, 'convertInt')
    def test_string_compare(self):
        '''
        Comparison ufuncs applied to a string FastArray are expected to raise
        TypeError, while the comparison dunder methods (and therefore the
        operators <=, <, ==, !=, >, >=) return the element-wise boolean result.
        '''
        strings = FA(['a', 'b', 'c'])

        # Expected result of comparing the array with itself, per dunder method.
        expected_by_dunder = {
            '__ne__': [False, False, False],
            '__eq__': [True, True, True],
            '__ge__': [True, True, True],
            '__gt__': [False, False, False],
            '__le__': [True, True, True],
            '__lt__': [False, False, False],
        }
        comparison_ufuncs = (
            np.less_equal,
            np.less,
            np.equal,
            np.not_equal,
            np.greater,
            np.greater_equal,
        )

        # ufunc comparisons will not work for strings (should we implement this on our own?)
        for ufunc in comparison_ufuncs:
            with self.assertRaises(
                    TypeError,
                    msg=f"String comparison did not raise TypeError for {ufunc}"):
                ufunc(strings, strings)

        # strings need to be compared this way
        for dunder, expected in expected_by_dunder.items():
            outcome = getattr(strings, dunder)(strings)
            for i in range(len(outcome)):
                self.assertEqual(
                    outcome[i],
                    expected[i],
                    msg=f"String comparison failed for function {dunder}",
                )
Beispiel #13
0
 def test_multikey_count(self):
     """Count the unique key combinations of a Categorical built from four key arrays."""
     key_arrays = [str_fa.copy(), int_fa.copy(), str_fa.copy(), int_fa.copy()]
     result_counts = Categorical(key_arrays).count().Count
     correct_counts = FastArray([6, 5, 1, 2, 3, 2, 2, 4, 2, 2, 1])
     counts_agree = bool(np.all(result_counts == correct_counts))
     assert counts_agree, (
         f"Incorrect result for multikey count for 4 keys. {result_counts} vs. {correct_counts}")
Beispiel #14
0
    def test_total_sizes(self):
        """
        Build a Struct mixing containers, arrays and scalars, alias one array
        under a second name, and check that total_sizes reports a physical size
        smaller than the logical size.
        """
        # Byte counts in the comments below come from the original notes and
        # assume int32 storage (platform dependent - TODO confirm).
        nested_dataset = Dataset({
            'A': range(10),  # 10x int32 => 40B
            'B': range(10, 20),  # 10x int32 => 40B
        })
        nested_struct = Struct({
            'C': 0,  # 1x int32 => 4B
            'D': 1,  # 1x int32 => 4B
            'E': 2,  # 1x int32 => 4B
        })
        plain_dict = {
            'q': 1,  # 1x int32 => 4B
            'r': 2,  # 1x int32 => 4B
        }

        st = Struct({
            'a': nested_dataset,
            'b': nested_struct,
            'c': FastArray(np.arange(5)),  # 5x int32 => 20B
            'd': np.arange(5, 10),  # 5x int32 => 20B
            'e': ['abc', 'def', 'ghi'],  # ???
            'f': plain_dict,
            'g': 3.14,  # 1x float64 => 8B
            'h': 84,  # 1x int32 => 4B
            'i': None,  # ???
            'j': slice(None),  # ???
        })

        # Alias an existing array so the same data is reachable under two names.
        st.z = st.c

        # Sizes of the Struct's data in bytes.
        physical, logical = st.total_sizes

        # For now, only the ordering is checked: the aliased array should make
        # the logical size exceed the physical size.
        # TODO: Strengthen this test by checking the actual computed sizes to make sure they're correct.
        self.assertLess(
            physical,
            logical,
            msg="The physical size is not less than the logical size.")
 def test_rt_pa_str(self, rt_farr: rt.FastArray) -> None:
     """Convert an rt.FastArray to a pyarrow array and compare element lengths.

     The arrow result is turned into a Python list; the list must have as many
     entries as the source array, and each entry must have the same length as
     the corresponding source element (presumably strings - confirm against
     the fixture).
     """
     converted = rt_farr.to_arrow().to_pylist()
     assert len(rt_farr) == len(converted)
     for idx, converted_item in enumerate(converted):
         source_item = rt_farr[idx]
         assert len(source_item) == len(converted_item)
    def test_categorical_numeric_array_key_completion(self):
        """Completing ``cat['`` must offer every numeric category, rendered as a string."""
        shell = get_ipython()
        codes = [1, 44, 44, 133, 75]  # type: List[int]
        shell.user_ns["cat"] = Categorical(FastArray(codes))

        _, matches = shell.Completer.complete(line_buffer="cat['")
        for candidate in map(str, codes):
            self.assertIn(candidate, matches)
 def test_categorical_string_array_key_completion(self):
     """Completing ``cat['`` must offer every string category of the Categorical."""
     shell = get_ipython()
     labels = ['a', 'b', 'c', 'c', 'd', 'a', 'b']  # type: List[str]
     categorical = Categorical(FastArray(labels),
                               ordered=True,
                               base_index=1,
                               filter=None)
     shell.user_ns["cat"] = categorical

     _, matches = shell.Completer.complete(line_buffer="cat['")
     for label in labels:
         self.assertIn(label, matches)
 def test_roundtrip_rt_pa_rt(self, rt_farr: rt.FastArray) -> None:
     """Round-trip an rt.FastArray through a pyarrow array and check the data is unchanged."""
     round_tripped = rt.FastArray.from_arrow(rt_farr.to_arrow(),
                                             zero_copy_only=False)
     assert_array_equal(rt_farr, round_tripped)
Beispiel #19
0
    def test_reductions(self):
        """
        Exercise groupby reductions on a small order-message Dataset.

        Covers numeric reductions (sum), order-based reductions (first),
        multi-key grouping, column subsets, accumulating functions (cumsum)
        with and without filters, and agg() with strings, dicts and numpy
        functions.
        """
        message_types = [
            'CREATE',
            'RUN',
            'CREATE',
            'RUN',
            'RUN',
            'RUN',
            'RUN',
            'CANCEL',
            'RUN',
            'RUN',
            'RUN',
            'CANCEL',
        ]
        order_ids = [1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1]
        seconds = [50, 70, 72, 75, 90, 88, 95, 97, 98, 115, 116, 120]
        shares = [0, 200, 0, 500, 100, 400, 100, 0, 300, 150, 150, 0]
        d2 = dict(
            message_type=message_types,
            order_id=order_ids,
            second=seconds,
            shares=shares,
        )
        dat = Dataset(d2)
        dat = dat[['order_id', 'message_type', 'second', 'shares']]

        # Numeric reduction
        dsr = dat.groupby('order_id').sum()
        self.assertEqual(dsr.shape, (2, 3))
        self.assertArrayEqual(dsr.order_id, [1, 2])
        self.assertArrayEqual(dsr.second, [410, 676])
        self.assertArrayEqual(dsr.shares, [800, 1100])

        # Numeric reduction with all columns returned
        dsr = dat.groupby('order_id', return_all=True).sum()
        self.assertEqual(dsr.shape, (2, 4))
        self.assertEqual(dsr.keys()[1], 'message_type')

        # Order-based reduction
        dsr = dat.groupby('order_id').first()
        self.assertEqual(dsr.shape, (2, 4))
        self.assertArrayEqual(dsr.order_id, [1, 2])
        self.assertArrayEqual(dsr.message_type, ['CREATE', 'CREATE'])
        self.assertArrayEqual(dsr.second, [50, 72])
        self.assertArrayEqual(dsr.shares, [0, 0])

        # Order-based reduction, which returns all columns regardless
        dsr = dat.groupby('order_id', return_all=True).first()
        self.assertEqual(dsr.shape, (2, 4))

        # Order-based reduction with multiple keys
        dsr = dat.groupby(['order_id', 'message_type']).first()
        self.assertEqual(dsr.shape, (6, 4))
        self.assertArrayEqual(dsr.order_id, [1, 1, 1, 2, 2, 2])
        self.assertArrayEqual(
            dsr.message_type,
            ['CANCEL', 'CREATE', 'RUN', 'CANCEL', 'CREATE', 'RUN'])
        self.assertArrayEqual(dsr.second, [120, 50, 70, 97, 72, 90])
        self.assertArrayEqual(dsr.shares, [0, 0, 200, 0, 0, 100])

        # On a subset of columns
        gb = dat.groupby('order_id')
        dsr = gb['shares'].sum()
        self.assertEqual(dsr.shape, (2, 2))
        self.assertArrayEqual(dsr.shares, [800, 1100])

        # Accumulating function
        dsr = dat.groupby('order_id').cumsum()
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares,
            [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800])

        # return_all has no effect with accumulating functions
        # 8/23/2018 SJK - changed behavior so return all shows the keys
        dsr = dat.groupby('order_id', return_all=True).cumsum()
        self.assertEqual(dsr.shape, (12, 3))

        # Add cum_shares back to a dataset
        dat['cum_shares'] = dat.groupby('order_id').shares.cumsum().shares
        self.assertEqual(dat.shape, (12, 5))
        self.assertArrayEqual(dat.cum_shares, gb.shares.cumsum().shares)

        # On a subset of columns
        dsr = dat.groupby('order_id')[['shares', 'second']].cumsum()
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares,
            [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800])
        self.assertArrayEqual(
            dsr.second,
            [50, 120, 72, 195, 162, 250, 290, 347, 445, 560, 676, 410])

        # On a subset of columns with a filter
        f = FastArray([
            True,
            False,
            True,
            False,
            True,
            False,
            True,
            False,
            True,
            False,
            True,
            False,
        ])
        dsr = dat.groupby('order_id')[['shares', 'second']].cumsum(filter=f)
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares, [0, 0, 0, 0, 100, 100, 100, 100, 400, 400, 550, 100])
        self.assertArrayEqual(
            dsr.second,
            [50, 50, 72, 50, 162, 162, 145, 162, 260, 260, 376, 145])

        # On shares and second with filter at groupby construction
        dsr = dat.groupby('order_id', filter=f)[['shares', 'second']].cumsum()
        inv = INVALID_DICT[dsr.shares[0].dtype.num]
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares,
            [0, inv, 0, inv, 100, inv, 100, inv, 400, inv, 550, inv])
        self.assertArrayEqual(
            dsr.second,
            [50, inv, 72, inv, 162, inv, 145, inv, 260, inv, 376, inv])

        # Using agg function
        dsr = gb[['second', 'shares']].agg(['sum', 'mean'])
        self.assertEqual(dsr.shape, (2, 2))
        self.assertArrayEqual(dsr.Sum.second, [410, 676])
        self.assertArrayEqual(dsr.Sum.shares, [800, 1100])
        self.assertArrayAlmostEqual(dsr.Mean.second, [82.00, 96.57], places=2)
        self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14],
                                    places=2)

        # Check for issue when bracket indexing on groupby.
        # Printing is routed to the null device; the context manager closes the
        # handle even if printing raises, and a dedicated name keeps the filter
        # array `f` intact.
        with open(os.devnull, 'w') as devnull:
            print(gb, file=devnull)
        dsr = gb[['second', 'shares']].agg(['sum', 'mean'])

        # Using different functions on different columns
        dsr = gb.agg({'second': 'sum', 'shares': ['max', 'mean']})
        self.assertEqual(dsr.shape, (2, 3))
        self.assertArrayEqual(dsr.Sum.second, [410, 676])
        self.assertArrayEqual(dsr.Max.shares, [500, 400])
        self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14],
                                    places=2)

        # Using numpy functions
        dsr = gb.agg({'second': np.sum, 'shares': [np.max, np.mean]})
        self.assertEqual(dsr.shape, (2, 3))
        self.assertArrayEqual(dsr.Sum.second, [410, 676])
        self.assertArrayEqual(dsr.Max.shares, [500, 400])
        self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14],
                                    places=2)

        # Alternate way to add to multiset
        gb = dat.groupby('order_id')
        ms = gb[['shares']].agg(['max', 'mean'])
        ms.Sum = gb[['second']].sum()
        self.assertEqual(ms.shape, (2, 3))
        self.assertArrayEqual(ms.Sum.second, [410, 676])
        self.assertArrayEqual(ms.Max.shares, [500, 400])
        self.assertArrayAlmostEqual(ms.Mean.shares, [160.00, 157.14], places=2)
# GreedyJediConfigType settings listed explicitly: neither, greedy only, jedi only, both.
# NOTE(review): presumably used to parametrize the completion tests - confirm against usage.
_GREEDY_JEDI_CONFIG_TABLE = [
    GreedyJediConfigType.NEITHER,
    GreedyJediConfigType.GREEDY,
    GreedyJediConfigType.JEDI,
    GreedyJediConfigType.GREEDY_JEDI,
]

# Integer codes passed to the Categorical fixtures built from LikertDecision and decision_dict.
CODES = [1, 44, 44, 133, 75]

# TODO: replace this table with a generator function that produces data based on data type.
# Until then, add entries to _RT_DATA_TABLE in an append-only way, because some tests currently depend on its ordering.
# Tests should not depend on the ordering of _RT_DATA_TABLE; removing that dependency is part of the TODO above.
_RT_DATA_TABLE = [
    Categorical(
        FastArray(['a', 'b', 'c', 'c', 'd', 'a', 'b']),
        ordered=True,
        base_index=1,
        filter=None,
    ),
    Categorical(CODES, LikertDecision),
    Categorical(CODES, decision_dict),
    Categorical(['b', 'a', 'a', 'c', 'a', 'b'], ['b', 'a', 'c', 'e'],
                sort_gb=True),
    Dataset({
        _k: list(range(_i * 10, (_i + 1) * 10))
        for _i, _k in enumerate([
            "alpha",
            "beta",
            "gamma",
            "delta",
    # Test #3: The hstacked multiplicity of categories should be equivalent to the multiplicity of aggregate categories.
    # Test (2) is a subset of this equality check, but remains for clarity reasons when investigating failures.
    assert expected_counts == actual_counts, (
        f"The hstacked multiplicity of categories should be equivalent to the multiplicity of aggregate categories\n"
        + msg + f"actual {actual_counts}\nexpected {expected_counts}")


@pytest.mark.xfail(reason="RIP-375 - Categorical unsupported dtypes")
@pytest.mark.skipif(is_running_in_teamcity(),
                    reason="Please remove alongside xfail removal.")
@pytest.mark.parametrize(
    "data",
    [
        # ValueError: BuildArrayInfo array has bad dtype of 21
        FastArray(["1970"], dtype="datetime64[Y]"),
        # ValueError: BuildArrayInfo array has bad dtype of 22
        FastArray([0], dtype="timedelta64[Y]"),
    ],
)
def test_falsifying_categorical_ctor(data):
    """Construct a Categorical from each parametrized FastArray.

    Marked xfail (RIP-375): per the notes on the parameters, construction
    raises ValueError for these datetime64 / timedelta64 inputs. The test is
    skipped when is_running_in_teamcity() is true.
    """
    Categorical(data)


@pytest.mark.skipif(
    True,
    reason=
    "RIP-452: Mutikey Categorical isin is consistent with its single key isin alternative"
)
def test_multikey_categorical_isin():
    # See Python/core/riptable/tests/test_categorical.py test_multikey_categorical_isin as an example
 def test_replace(self):
     """replace(2, 1) must turn every 2 into 1 and leave other values alone."""
     replaced = FA([0, 2, 2, 3]).replace(2, 1)
     assert_array_equal(replaced, [0, 1, 1, 3])
 def test_push(self):
     """push() must carry the last valid value forward over NaN entries."""
     pushed = FA([5, nan, nan, 6, nan]).push()
     assert_array_equal(pushed, [5, 5, 5, 6, 6])
 def test_map(self):
     """map() must replace each element with its value from the given dict."""
     replacements = {1: 10, 2: 20}
     mapped = FA([1, 1, 1, 2, 2, 2]).map(replacements)
     assert_array_equal(mapped, [10, 10, 10, 20, 20, 20])
 def test_ledger(self):
     """Smoke-test the FastArray debug/ledger switches.

     Calls each underscore-prefixed FA class-level toggle in turn and then runs
     a few array operations with FA.Verbose set to 3. Nothing is asserted: the
     test passes as long as none of the calls raise.
     """
     self.maxDiff = None
     sio = StringIO()
     # NOTE(review): redirectStdoutCtx presumably captures stdout into `sio`
     # for the duration of the block - confirm against its definition.
     with redirectStdoutCtx(sio):
         FA._V1()
         FA._V2()
         FA._OFF()
         FA._ON()
         FA._TOFF()
         FA._TON()
         FA._LON()
         FA._ROFF()
         FA._RON()
         FA._RDUMP()
         FA._LOFF()
         FA._LDUMP()
         FA._LCLEAR()
         FA._GCNOW()
         FA._GCSET(100)
         FA.Verbose = 3
         FA._V1()
         # Small array: copy, in-place add, delete.
         a = FA([1, 2, 3])
         b = a.copy()
         a += a
         del a
         # Larger array: wrap a numpy array, add twice, take the _np view, delete.
         a = np.arange(10000)
         a = FA(a)
         a = a + a
         a = a + a
         b = a._np
         del b
         del a
         FA._V0()
Beispiel #26
0
    def test_cut(self):
        """
        Exercise cut() with bin counts, explicit bin edges, the different
        ``labels`` settings, ``retbins``, a column slice of a 2-D array and
        infinite bin edges.
        """

        def check_codes(result, expected):
            # Bin indices are compared through the sum of element-wise differences.
            self.assertTrue(sum(result._np - FA(expected)) == 0)

        def check_categories(result, expected):
            self.assertTrue((result.category_array == expected).all())

        # Bins requested by count.
        check_codes(cut(arange(10), 3), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3])
        check_codes(cut(arange(10.0), 3), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3])
        check_codes(cut(arange(11), 3), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3])

        # Explicit bin edges, without and then with custom labels.
        binned = cut(FA([2, 4, 6, 8, 10]), FA([0, 2, 4, 6, 8, 10]))
        check_codes(binned, [1, 2, 3, 4, 5])

        binned = cut(
            FA([2, 4, 6, 8, 10]),
            FA([0, 2, 4, 6, 8, 10]),
            labels=['a', 'b', 'c', 'd', 'e'],
        )
        check_codes(binned, [1, 2, 3, 4, 5])

        data = np.array([1, 7, 5, 4, 6, 3])
        generated_labels = FA([b'1.0->3.0', b'3.0->5.0', b'5.0->7.0'])
        data_codes = [1, 3, 2, 2, 3, 1]

        # labels omitted, True and None all give a Categorical with generated labels.
        for label_kwargs in ({}, {'labels': True}, {'labels': None}):
            binned = cut(data, 3, **label_kwargs)
            self.assertIsInstance(binned, Categorical)
            check_codes(binned, data_codes)
            check_categories(binned, generated_labels)

        # labels=False gives a plain FastArray of bin indices.
        binned = cut(data, 3, labels=False)
        self.assertIsInstance(binned, FastArray)
        check_codes(binned, data_codes)

        # retbins=True additionally returns the bin edges.
        binned, edges = cut(data, 3, retbins=True)
        self.assertIsInstance(binned, Categorical)
        self.assertIsInstance(edges, np.ndarray)
        check_codes(binned, data_codes)
        check_categories(binned, generated_labels)
        self.assertTrue(sum(edges - FA([1.0, 3.0, 5.0, 7.0])) == 0)

        # Caller-supplied label names.
        custom_labels = ["bad", "medium", "good"]
        binned = cut(data, 3, labels=custom_labels)
        self.assertIsInstance(binned, Categorical)
        check_codes(binned, data_codes)
        check_categories(binned, custom_labels)

        # contiguous test
        grid = arange(4).reshape(2, 2)
        knots = [-0.5, 0.5, 1.5, 2.5, 3.5]
        binned = cut(grid[:, 1], knots)
        knot_labels = FastArray(
            [b'-0.5->0.5', b'0.5->1.5', b'1.5->2.5', b'2.5->3.5'])
        check_categories(binned, knot_labels)

        # inf upcast test
        samples = np.array([0, 1, 10, 100, 5])
        knots = [-np.inf, 2, 11, 50, np.inf]
        binned = cut(samples, knots)
        self.assertTrue((binned._fa == FA([1, 1, 2, 4, 2])).all())
    def test_inplace_int_float(self):
        '''
        Unlike numpy arrays, FastArray allows inplace operations between integer
        arrays and floating point operands; the dtype of the array must remain
        the same afterwards.

        Each in-place operator (+=, -=, *=, /=, //=) is applied in turn, first
        with a floating point scalar and then with a floating point array, for
        every integer dtype except the first entry of int_types (bool).

        NOTE(review): the original notes said true division was not supported,
        yet this test has always asserted that /= preserves the dtype - confirm
        the intended behavior.
        Potential BUG: The floor division operator does not raise an error.
        '''
        # Local import: only this test needs the in-place operator functions.
        import operator

        # (symbol used in the failure message, equivalent in-place function)
        inplace_ops = (
            ('+=', operator.iadd),
            ('-=', operator.isub),
            ('*=', operator.imul),
            ('/=', operator.itruediv),
            ('//=', operator.ifloordiv),
        )
        nums = [1, 2, 3, 4, 5]

        def apply_and_check(arr, operand, dt):
            # Apply every in-place operator and verify the dtype after each one.
            for symbol, inplace in inplace_ops:
                # `arr = inplace(arr, x)` is equivalent to `arr <op>= x`.
                arr = inplace(arr, operand)
                self.assertEqual(
                    arr.dtype,
                    dt,
                    msg=
                    f"Result datatype {arr.dtype} did not match original datatype {dt} after {symbol} operation",
                )
            return arr

        # Integer array combined with a floating point scalar.
        for dt in int_types[1:]:  # dont include bool
            arr = FA(nums, dtype=dt)
            for dtf in float_types:
                arr = apply_and_check(arr, dtf(1), dt)

        # Integer array combined with a floating point array of the same values.
        for dt in int_types[1:]:  # dont include bool
            arr = FA(nums, dtype=dt)
            for dtf in float_types:
                arr2 = arr.astype(dtf)
                arr = apply_and_check(arr, arr2, dt)
Beispiel #28
0
# Thirty single-character string labels drawn from 'a'..'e'; list() splits the
# literal into one element per character, preserving order.
str_fa = FastArray(list('ceedcbdcabdecaedbabcdbeccdecac'))
Beispiel #29
0
    def _test_output(self):
        if sys.platform != 'win32':
            intt_name = b'int64'
        else:
            intt_name = b'int32'

        class Dataset1(Struct):
            pass  # dummy for testing, mimics behavior of real Dataset

        st = Struct(
            {
                'a': Dataset1({'A': range(10), 'B': range(10, 20)}),
                'b': Struct({'C': 0, 'D': 1, 'E': 2}),
                'c': FastArray(np.arange(5)),
                'd': np.arange(5, 10),
                'e': ['abc', 'def', 'ghi'],
                'f': {'q': 1, 'r': 2},
                'g': 3.14,
                'h': 84,
                'i': None,
                'j': slice(None),
            }
        )
        headers, spec = st.get_table_data()
        self.assertEqual(len(headers), 1)
        self.assertEqual(
            [hd.col_name for hd in headers[0]], ['Name', 'Type', 'Rows', '0', '1', '2']
        )
        self.assertEqual(
            [_r.tolist() for _r in spec],
            [
                [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j'],
                [
                    b'Dataset1',
                    b'Struct',
                    intt_name,
                    intt_name,
                    b'list',
                    b'dict',
                    b'float',
                    b'int',
                    b'NoneType',
                    b'slice',
                ],
                [b'2', b'3', b'5', b'5', b'3', b'2', b'0', b'0', b'0', b'0'],
                [b'A', b'', b'0', b'5', b'', b'', b'3.14', b'84', b'', b''],
                [b'B', b'', b'1', b'6', b'', b'', b'', b'', b'', b''],
                [b'', b'', b'2', b'7', b'', b'', b'', b'', b'', b''],
            ],
        )
        self.assertEqual(
            str(st),
            f'''#   Name   Type       Rows   0      1   2
-   ----   --------   ----   ----   -   -
0   a      Dataset1   2      A      B
1   b      Struct     3
2   c      {intt_name.decode()}      5      0      1   2
3   d      {intt_name.decode()}      5      5      6   7
4   e      list       3
5   f      dict       2
6   g      float      0      3.14
7   h      int        0      84
8   i      NoneType   0
9   j      slice      0                  ''',
        )
        self.assertEqual(Struct._sizeof_fmt(128), '128.0 B')
        tsize = 1280
        for unit in ['K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y']:
            self.assertEqual(Struct._sizeof_fmt(tsize), f'1.2 {unit}B')
            tsize *= 1024
        self.assertEqual(st._last_row_stats(), '[10 columns]')
    def test_compare(self):
        '''
        Compares FastArray results to numpy results for binary comparison and logical ufuncs.
        All results will be boolean arrays.

        There is a difference between calling a numpy ufunc and using a comparison operator,
        so the operators need to be checked separately.

        Every ordered pair of dtypes from ``int_types + float_types`` is exercised; the first
        operand holds ``num_list`` and the second holds ``num_list`` in reverse order.
        '''
        numeric_types = int_types + float_types
        comparison_ufuncs = [
            np.less_equal,
            np.less,
            np.equal,
            np.not_equal,
            np.greater,
            np.greater_equal,
        ]
        logical_ufuncs = [np.logical_and, np.logical_xor, np.logical_or]
        comparison_operators = [
            '__ne__',
            '__eq__',
            '__ge__',
            '__gt__',
            '__le__',
            '__lt__',
        ]
        all_funcs = comparison_ufuncs + logical_ufuncs
        # Loop-invariant: build the reversed operand data once.
        reversed_nums = list(reversed(num_list))

        def assert_same_result(fa_result, np_result, size_msg, item_msg):
            # The length check runs first, so the element loop below can never
            # index past the end of fa_result.
            self.assertEqual(len(fa_result), len(np_result), msg=size_msg)
            # fa_result is still indexed explicitly, as before, so FastArray
            # item access stays part of what is checked.
            for idx, expected in enumerate(np_result):
                self.assertEqual(fa_result[idx], expected, msg=item_msg)

        for dt1 in numeric_types:
            # The first operands depend only on dt1; nothing below mutates them.
            fa_arr1 = FA(num_list, dtype=dt1)
            np_arr1 = np.array(num_list, dtype=dt1)
            for dt2 in numeric_types:
                fa_arr2 = FA(reversed_nums, dtype=dt2)
                np_arr2 = np.array(reversed_nums, dtype=dt2)

                # ufunc path: the same callable applied to FastArray and ndarray operands.
                for func in all_funcs:
                    assert_same_result(
                        func(fa_arr1, fa_arr2),
                        func(np_arr1, np_arr2),
                        f"Result sizes did not match for {func} with dtypes {dt1} {dt2}",
                        f"Comparison result did not match for {func} with dtypes {dt1} {dt2}",
                    )

                # operator path: the rich-comparison dunder looked up on each array type.
                for f_name in comparison_operators:
                    assert_same_result(
                        getattr(fa_arr1, f_name)(fa_arr2),
                        getattr(np_arr1, f_name)(np_arr2),
                        f"Result sizes did not match for operator {f_name} with dtypes {dt1} {dt2}",
                        f"Comparison operator {f_name} failed with dtypes {dt1} {dt2}",
                    )