コード例 #1
0
ファイル: test_cut.py プロジェクト: neuroradiology/riptable
    def test_qcut(self):
        """Check ``qcut`` bin codes, category labels and return types.

        Covers an integer bin count, explicit quantile lists, and each
        ``labels`` mode (omitted, ``True``, ``None``, custom list, ``False``).
        """
        # Integer bin count over evenly spaced integers.
        binned = qcut(arange(10), 3)
        expected = FA([2, 2, 2, 2, 3, 3, 3, 4, 4, 4])
        self.assertTrue(sum(binned._np - expected) == 0)

        binned = qcut(arange(11), 3)
        expected = FA([2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4])
        self.assertTrue(sum(binned._np - expected) == 0)

        binned = qcut(range(5), 3, labels=["good", "medium", "bad"])
        self.assertTrue(sum(binned._np - FA([2, 2, 3, 4, 4])) == 0)

        # Explicit quantile lists: only the code expected for the first
        # element differs between the two lists.
        quantile_cases = (
            ([0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95], 1),
            ([0.00, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95], 2),
        )
        for quantiles, first_code in quantile_cases:
            binned = qcut(arange(100.0), quantiles)
            self.assertTrue(binned._np[0] == first_code)
            self.assertTrue(binned._np[5] == 2)
            self.assertTrue(binned._np[94] == 7)
            self.assertTrue(binned._np[95] == 1)

        # Omitting ``labels``, passing True and passing None must all yield
        # the same Categorical with the same generated range labels.
        for label_kwargs in ({}, {'labels': True}, {'labels': None}):
            binned = qcut(range(5), 4, **label_kwargs)
            self.assertIsInstance(binned, Categorical)
            self.assertTrue(sum(binned._np - FA([2, 2, 3, 4, 5])) == 0)
            labels_match = binned.category_array == [
                b'Clipped', b'0.0->1.0', b'1.0->2.0', b'2.0->3.0', b'3.0->4.0'
            ]
            self.assertTrue(labels_match.all())

        # Caller-supplied labels.
        binned = qcut(range(5), 3, labels=["good", "medium", "bad"])
        self.assertIsInstance(binned, Categorical)
        self.assertTrue(sum(binned._np - FA([2, 2, 3, 4, 4])) == 0)
        expanded = binned.expand_array.astype('U')
        wanted = ['good', 'good', 'medium', 'bad', 'bad']
        elementwise = np.array([(got == want) for (got, want) in zip(expanded, wanted)])
        self.assertTrue(elementwise.all())
        categories_match = binned.category_array == [
            [b'Clipped', b'good', b'medium', b'bad']
        ]
        self.assertTrue(categories_match.all())

        # labels=False returns the raw codes as a FastArray.
        binned = qcut(range(5), 4, labels=False)
        self.assertIsInstance(binned, FastArray)
        self.assertTrue(sum(binned._np - FA([2, 2, 3, 4, 5])) == 0)
コード例 #2
0
 def test_ops(self):
     """Group a 300000-row Dataset by ``test`` and check ``mean``.

     ``median``, ``trimbr`` and ``nanmedian`` are only invoked; their
     results are not inspected.
     """
     columns = {
         'test': arange(300000) % 3,
         'test2': arange(300000.0),
         'test2i': arange(300000),
         'test3': arange(300000) % 3,
     }
     grouped = Dataset(columns).groupby('test')

     means = grouped.mean()
     # The float and integer copies of the same data must average identically.
     for row in (0, 1):
         self.assertTrue(means.test2[row] == means.test2i[row])
     # ``test3`` duplicates the key column, so its mean in row 1 is 1.
     self.assertTrue(means.test3[1] == 1.0)

     # Smoke calls only: these just have to run without raising.
     grouped.median()
     grouped.trimbr()
     grouped.nanmedian()
コード例 #3
0
    def test_cumcount_vs_gb(self):
        """Check ``cumcount`` agrees between the groupby and Categorical paths.

        Also checks that, with a boolean ``filter``, the kept rows are flagged
        by ``isnotnan`` and the excluded rows by ``isnan``.
        """
        keys = np.random.choice(['a', 'b', 'c', 'd', 'e'], 50)
        ds = Dataset({'keycol': keys, 'col1': arange(50), 'col2': arange(50)})
        from_groupby = ds.gb('keycol').cumcount()

        cat = Categorical(ds.keycol)
        from_categorical = cat.cumcount()
        assert sum(from_groupby - from_categorical) == 0

        # Mask that is True on the odd row positions.
        keep = logical(arange(50) % 2)
        filtered = cat.cumcount(filter=keep)
        kept_are_valid = np.all(isnotnan(filtered[keep]))
        dropped_are_invalid = np.all(isnan(filtered[~keep]))
        assert bool(kept_are_valid)
        assert bool(dropped_are_invalid)
コード例 #4
0
    def test_extract_groups(self):
        """Basic ``Grouping.extract_groups`` case: select data from every other group."""
        # Keys: value k appears k times, for k in 1..6.
        keys = rt.FA([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6])
        grouping = rt.Grouping(keys)

        # Data to extract from; same length as the keys, so each value is its own row index.
        values = rt.arange(len(keys))

        # Boolean mask over the entries of ``ncountgroup`` that is True at even positions.
        group_count = len(grouping.ncountgroup)
        even_groups = rt.arange(group_count) % 2 == 0

        extracted = Grouping.extract_groups(
            even_groups,
            values,
            grouping.ncountgroup,
            grouping.ifirstgroup,
        )

        # The expected rows are exactly those whose key is 2, 4 or 6.
        expected = rt.FA([1, 2, 6, 7, 8, 9, 15, 16, 17, 18, 19, 20])
        assert_array_equal(expected, extracted)
コード例 #5
0
    def test_searchsorted(self):
        """Compare ``rt.searchsorted`` with ``np.searchsorted`` for both ``side`` values.

        The needle array includes -inf, +inf, NaN and values far outside the
        haystack range, so the edge cases are exercised.
        """
        a = rt.arange(10.0)
        b = rt.arange(20.0) / 2
        # Overwrite a few needles with edge-case values.
        b[3] = -np.inf
        b[7] = np.inf
        b[5] = np.nan
        b[2] = 100
        b[1] = -100

        # NumPy is the reference for both sides. The reference for
        # side='right' used to be computed with rt.searchsorted as well,
        # which made that check compare the function with itself.
        for side in ('left', 'right'):
            x1 = np.searchsorted(a, b, side=side)
            x2 = rt.searchsorted(a, b, side=side)
            assert sum(x1 - x2) == 0

        # NOTE(review): the int32 copy is not used in the visible part of this
        # test; presumably further checks follow -- confirm.
        b = b.astype(np.int32)
コード例 #6
0
    def test_pad(self):
        """Check per-group ``pad`` / ``backfill`` of NaNs and ``get_group`` on a groupby."""
        row_count = 100
        code_count = 20
        tickers = ['ZYGO', 'YHOO', 'FB', 'GOOG', 'IBM']

        ds = Dataset({'time': arange(row_count * 1.0)})
        ds.data = np.random.randint(code_count, size=row_count)
        ds.data2 = np.random.randint(code_count, size=row_count)
        ds.symbol2 = Cat(1 + ds.data, list('ABCDEFGHIJKLMNOPQRST'))
        # Symbols cycle through ``tickers``, so rows i and i + 5 share a group.
        ds.symbol = Cat(1 + arange(row_count) % len(tickers), tickers)
        ds.time[[3, 4, 7]] = nan

        forward_filled = ds.gb('symbol').pad()
        # Row 7 takes the value of row 2, the previous row in its group.
        self.assertTrue(forward_filled.time[7] == 2.00)
        # Row 3 has no earlier row in its group and is expected to stay NaN
        # (NaN is the value that compares unequal to itself).
        self.assertTrue(forward_filled.time[3] != forward_filled.time[3])

        back_filled = ds.gb('symbol').backfill()
        # Row 7 takes the value of row 12, the next row in its group.
        self.assertTrue(back_filled.time[7] == 12.00)

        # A single group can be pulled out by its key.
        yhoo_rows = ds.gb('symbol').get_group('YHOO')
        self.assertTrue(np.all(yhoo_rows.symbol == 'YHOO'))
コード例 #7
0
    def test_iter(self):
        """Iterate a groupby and check each yielded key and the rows of its sub-dataset."""
        keys = FastArray(['e', 'e', 'd', 'b', 'e', 'c', 'c', 'e', 'a', 'd'])
        # Expected iteration order, with the row positions belonging to each key.
        expected_keys = FastArray(['e', 'd', 'b', 'c', 'a'])
        expected_rows = [[0, 1, 4, 7], [2, 9], [3], [5, 6], [8]]

        grouped = Dataset({'keycol': keys, 'idxcol': arange(10)}).gb('keycol')
        for position, item in enumerate(grouped):
            group_key = item[0]
            group_data = item[1]
            self.assertEqual(group_key, expected_keys[position])
            rows_match = np.all(group_data.idxcol == expected_rows[position])
            self.assertTrue(bool(rows_match))
コード例 #8
0
    def test_shift(self):
        """
        Check that Categorical.shift moves values by one position *within each group*.
        """
        # Each case: (values to shift, expected result[1], expected result[2]).
        cases = (
            (arange(4), 0, 1),
            ([5, 6, 7, 8], 5, 6),
        )
        for values, expected_second, expected_third in cases:
            shifted = rt.Cat([1, 1, 1, 2]).shift(values, window=1)[0]
            assert shifted[1] == expected_second
            assert shifted[2] == expected_third
コード例 #9
0
    def test_apply_nonreduce(self):
        """Run ``apply_reduce`` / ``apply_nonreduce`` on a Categorical and an ``accum2`` object.

        The only asserted property is that a two-argument ``apply_nonreduce``
        with ``np.maximum`` matches calling ``np.maximum`` directly on the same
        columns.
        """
        row_count = 200
        modulus = 7
        # Five ticker names followed by the strings '6' through '18'.
        symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
        symbols += [str(number) for number in range(6, 19)]

        ds = rt.Dataset({'time': rt.arange(row_count * 1.0)})
        ds.data = arange(row_count) % modulus
        ds.data2 = (arange(row_count) + 3) % modulus
        ds.symbol = rt.Cat(1 + rt.arange(row_count) % len(symbols), symbols)

        # Smoke call: the reduced value is not checked.
        ds.symbol.apply_reduce(
            lambda x, y: np.sum(np.minimum(x, y)),
            (ds.data, ds.data),
        )

        accum = ds.accum2('symbol', 'data')
        cumulative = accum.apply_nonreduce(np.cumsum)
        column_pair = (cumulative.data, cumulative.data2)

        # Smoke call: the reduced value is not checked.
        accum.apply_reduce(lambda x, y: np.sum(np.maximum(x, y)), column_pair)

        direct = np.maximum(cumulative.data, cumulative.data2)
        via_apply = accum.apply_nonreduce(
            lambda x, y: np.maximum(x, y),
            column_pair,
        )[0]
        self.assertTrue(np.all(direct == via_apply))
コード例 #10
0
    def test_transform(self):
        """Check ``transform=True`` sums agree between groupby and Categorical.

        Also checks that ``diff`` on a groupby equals applying
        ``FastArray.diff`` through ``apply_nonreduce``.
        """
        row_count = 200
        value_range = 7
        tickers = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']

        ds = Dataset({'time': arange(row_count * 1.0)})
        ds.data = np.random.randint(value_range, size=row_count)
        ds.data2 = np.random.randint(value_range, size=row_count)
        ds.symbol = Cat(1 + arange(row_count) % len(tickers), tickers)

        from_groupby = ds.gb('symbol')['data'].sum(transform=True)

        # The group keys are not returned when transform=True, so the symbol
        # column is not compared here. The second pass adds showfilter=True.
        for extra_kwargs in ({}, {'showfilter': True}):
            from_categorical = ds.symbol.sum(ds.data, transform=True, **extra_kwargs)
            self.assertTrue(np.all(from_groupby[0] == from_categorical[0]))

        # diff: generic apply_nonreduce path versus the dedicated method.
        generic_diff = ds.gb('symbol').apply_nonreduce(TypeRegister.FastArray.diff)
        builtin_diff = ds.gb('symbol').diff()
        self.assertTrue(generic_diff.equals(builtin_diff))
コード例 #11
0
    def test_gb_labels_enum(self):
        """Check enum-style Categorical keys are labelled the same by both count paths.

        The group labels from ``Categorical.count`` and from
        ``Dataset.gbu(...).count`` must share a dtype kind and compare equal,
        i.e. show the string names rather than the integer codes.
        """
        name_to_code = {'a': 30, 'b': 20, 'c': 10}
        cat = Categorical([10, 10, 10, 20, 30, 20, 10, 20, 20], name_to_code)

        cat_counts = cat.count()
        cat_label_names = cat_counts.label_get_names()
        cat_labels = cat_counts[cat_label_names][0]

        ds_counts = Dataset({'catcol': cat, 'data': arange(9)}).gbu('catcol').count()
        ds_label_names = ds_counts.label_get_names()
        ds_labels = ds_counts[ds_label_names][0]

        assert cat_labels.dtype.char == ds_labels.dtype.char
        assert bool(np.all(cat_labels == ds_labels))
コード例 #12
0
    def test_extract_groups_all_groups_off(self):
        """Test for Grouping.extract_groups() when given a condition mask with all values set to False."""

        # Create a grouping from some data.
        key_data1 = rt.FA([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6])
        g1 = rt.Grouping(key_data1)

        # Create another data array the same length as the key for the Grouping.
        data1 = rt.arange(len(key_data1))

        # Create a condition mask with all values set to False.
        # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
        # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        group_mask1 = rt.zeros(len(g1.ncountgroup), dtype=bool)

        # Extract elements from the data array with all groups masked out -- i.e. we're trying
        # to select data from none of the groups.
        result1 = Grouping.extract_groups(group_mask1, data1, g1.ncountgroup, g1.ifirstgroup)

        # Nothing selected, so the result must be empty.
        assert_array_equal(rt.FA([]), result1)
コード例 #13
0
    'min',
    'max',
    'var',
    'std',
    'nansum',
    'nanmean',
    'nanmin',
    'nanmax',
    'nanvar',
    'nanstd',
]
# Two further lists of groupby operation names; ``gb_funcs_L1`` is defined
# above them.
gb_funcs_L2 = ['first', 'last', 'median', 'mode', 'nanmedian']
gb_funcs_L3 = ['cumsum', 'cumprod']
# Every operation name, in L1, L2, L3 order.
all_gb_ops = gb_funcs_L1 + gb_funcs_L2 + gb_funcs_L3

# Boolean mask of length 30. ``arange(30) % 2`` is 1 at the odd positions, so
# the mask is True there.
# NOTE(review): the name suggests even positions -- confirm the intent.
even_filter = logical(arange(30) % 2)
# True wherever ``str_fa`` (defined outside this view) is not b'd'.
d_filter = str_fa != b'd'

d_filter_results = [
    0,
    np.nan,
    np.nan,
    np.nan,
    np.nan,
    np.nan,
    0,
    np.nan,
    np.nan,
    np.nan,
    np.nan,
    np.nan,
コード例 #14
0
ファイル: test_cut.py プロジェクト: neuroradiology/riptable
    def test_cut(self):
        """Check ``cut`` bin codes, category labels, return types and returned bins.

        Covers an integer bin count, explicit bin edges, each ``labels`` mode,
        ``retbins=True``, a non-contiguous input slice and edges containing
        infinities.
        """
        # Equal-width bins requested by count, for integer and float input.
        count_cases = (
            (arange(10), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
            (arange(10.0), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
            (arange(11), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3]),
        )
        for values, expected_codes in count_cases:
            binned = cut(values, 3)
            self.assertTrue(sum(binned._np - FA(expected_codes)) == 0)

        # Explicit bin edges, without and with caller-supplied labels.
        for label_kwargs in ({}, {'labels': ['a', 'b', 'c', 'd', 'e']}):
            binned = cut(
                FA([2, 4, 6, 8, 10]),
                FA([0, 2, 4, 6, 8, 10]),
                **label_kwargs,
            )
            self.assertTrue(sum(binned._np - FA([1, 2, 3, 4, 5])) == 0)

        samples = np.array([1, 7, 5, 4, 6, 3])
        sample_codes = [1, 3, 2, 2, 3, 1]
        range_labels = FA([b'1.0->3.0', b'3.0->5.0', b'5.0->7.0'])

        # Omitting ``labels``, passing True and passing None must all give a
        # Categorical with the generated range labels.
        for label_kwargs in ({}, {'labels': True}, {'labels': None}):
            binned = cut(samples, 3, **label_kwargs)
            self.assertIsInstance(binned, Categorical)
            self.assertTrue(sum(binned._np - FA(sample_codes)) == 0)
            self.assertTrue((binned.category_array == range_labels).all())

        # labels=False returns the raw codes as a FastArray.
        binned = cut(samples, 3, labels=False)
        self.assertIsInstance(binned, FastArray)
        self.assertTrue(sum(binned._np - FA(sample_codes)) == 0)

        # retbins=True additionally returns the bin edges.
        binned, edges = cut(samples, 3, retbins=True)
        self.assertIsInstance(binned, Categorical)
        self.assertIsInstance(edges, np.ndarray)
        self.assertTrue(sum(binned._np - FA(sample_codes)) == 0)
        self.assertTrue((binned.category_array == range_labels).all())
        self.assertTrue(sum(edges - FA([1.0, 3.0, 5.0, 7.0])) == 0)

        # Caller-supplied string labels.
        named_labels = ["bad", "medium", "good"]
        binned = cut(samples, 3, labels=named_labels)
        self.assertIsInstance(binned, Categorical)
        self.assertTrue(sum(binned._np - FA(sample_codes)) == 0)
        self.assertTrue((binned.category_array == named_labels).all())

        # Contiguity test: the input is a column slice of a 2-D array.
        grid = arange(4).reshape(2, 2)
        edge_list = [-0.5, 0.5, 1.5, 2.5, 3.5]
        binned = cut(grid[:, 1], edge_list)
        expected_labels = FastArray([b'-0.5->0.5', b'0.5->1.5', b'1.5->2.5', b'2.5->3.5'])
        self.assertTrue((binned.category_array == expected_labels).all())

        # Inf upcast test: integer data against edges that include +/-inf.
        int_values = np.array([0, 1, 10, 100, 5])
        edge_list = [-np.inf, 2, 11, 50, np.inf]
        binned = cut(int_values, edge_list)
        self.assertTrue((binned._fa == FA([1, 1, 2, 4, 2])).all())
コード例 #15
0
                ["alpha", "beta", "gamma", "delta", "epsilon", "zeta"])
        }),
        "ds_beta":
        Dataset({
            k: list(range(i * 10, (i + 1) * 10))
            for i, k in enumerate(
                ["eta", "theta", "iota", "kappa", "lambada", "mu"])
        }),
    }),
    Struct({
        "alpha":
        1,
        "beta": [2, 3],
        "gamma": ['2', '3'],
        "delta":
        arange(10),
        "epsilon":
        Struct({
            "theta": Struct({
                "kappa": 3,
                "zeta": 4,
            }),
            "iota": 2,
        }),
    }),
]


@contextmanager
def greedy_completion():
    ip = get_ipython()