Example #1
0
    def testSerializeLocalTrain(self):
        """Check that tiled XGBoost training graphs survive a JSON round trip.

        Graphs are built for the ``self.X``/``self.y`` inputs and for the
        ``self.X_df``/``self.y_series`` inputs; finally a DMatrix built from
        ``multi_output`` data is tiled and must consist of a single chunk.
        """
        def roundtrip_graph(tileable):
            # Serialize the tiled graph and parse it back; only an exception
            # would make this step fail.
            tiled_graph = tileable.build_graph(tiled=True)
            DAG.from_json(tiled_graph.to_json())

        sess = new_session()

        with LocalContext(sess._sess):
            first_matrix = ToDMatrix(data=self.X, label=self.y)()
            roundtrip_graph(XGBTrain(dtrain=first_matrix)())

            frame_matrix = ToDMatrix(data=self.X_df,
                                     label=self.y_series,
                                     output_types=[OutputType.dataframe])()
            roundtrip_graph(XGBTrain(dtrain=frame_matrix)())

            rand_data = mt.random.rand(1000, 10, chunk_size=(1000, 5))
            split_data, split_label = ToDMatrix(data=rand_data,
                                                label=self.y,
                                                multi_output=True)()
            tiled_matrix = ToDMatrix(data=split_data, label=split_label)().tiles()

            self.assertEqual(len(tiled_matrix.chunks), 1)
Example #2
0
    def testPercentileExecution(self):
        """Compare ``percentile`` with ``np.percentile`` for array and tensor ``q``."""
        data = np.random.rand(20, 10)
        percents = np.random.RandomState(0).randint(100, size=11)
        data_t = tensor(data, chunk_size=7)

        # q supplied as a plain numpy array
        outcome = self.executor.execute_tensor(
            percentile(data_t, percents), concat=True)[0]
        expected = np.percentile(data, percents)
        np.testing.assert_array_equal(outcome, expected)

        # q supplied as a tensor: executed inside a LocalContext whose mock
        # session exposes the test case's executor
        percents_t = tensor(percents)
        outer = self

        class MockSession:
            def __init__(self):
                self.executor = outer.executor

        context = LocalContext(MockSession())
        ctx_executor = ExecutorForTest('numpy', storage=context)
        with context:
            lazy = percentile(data_t, percents_t)
            outcome = ctx_executor.execute_tensors([lazy])[0]

            np.testing.assert_array_equal(outcome, expected)
Example #3
0
    def testHistogramExecution(self):
        """Compare the counts returned by ``histogram`` with ``np.histogram``.

        Covers an explicit ``range``, explicit bin edges with weights (ndarray
        and tensor) and, inside a ``LocalContext``, integer / named-estimator
        bins and bin edges given as a list or a tensor.
        """
        rng = np.random.RandomState(0)

        values = rng.randint(10, size=(20,))
        values_t = tensor(values, chunk_size=3)
        weight_values = rng.random(20)
        weights_t = tensor(weight_values, chunk_size=4)

        # range provided
        for bounds in [(0, 10), (3, 11), (3, 7)]:
            counts = histogram(values_t, range=bounds)[0]
            actual = self.executor.execute_tensor(counts)[0]
            wanted = np.histogram(values, range=bounds)[0]
            np.testing.assert_array_equal(actual, wanted)

        # weights as ndarray or tensor; numpy always receives the raw weights
        for weight_arg in (weight_values, weights_t):
            for density in (True, False):
                edges = [1, 4, 6, 9]
                counts = histogram(
                    values_t, bins=edges, weights=weight_arg, density=density)[0]
                actual = self.executor.execute_tensor(counts)[0]
                wanted = np.histogram(
                    values, bins=edges, weights=weight_values, density=density)[0]
                np.testing.assert_almost_equal(actual, wanted)

        outer = self

        class MockSession:
            def __init__(self):
                self.executor = outer.executor

        context = LocalContext(MockSession())
        ctx_executor = ExecutorForTest('numpy', storage=context)
        with context:
            single = rng.randint(10, size=(1,))
            single_t = tensor(single)
            nothing = rng.randint(10, size=(0,))
            nothing_t = tensor(nothing)
            pairs = [(values_t, values), (single_t, single),
                     (nothing_t, nothing), (sort(values_t), values)]
            for t, data in pairs:
                for density in (True, False):
                    estimators = [10, 'stone', 'auto', 'doane', 'fd',
                                  'rice', 'scott', 'sqrt', 'sturges']
                    for bins in estimators:
                        counts = histogram(t, bins=bins, density=density)[0]

                        # for non-empty input, execute_tensor is expected to
                        # raise TilesError while execute_tensors succeeds
                        if data.size > 0:
                            with self.assertRaises(TilesError):
                                ctx_executor.execute_tensor(counts)

                        actual = ctx_executor.execute_tensors([counts])[0]
                        wanted = np.histogram(data, bins=bins, density=density)[0]
                        np.testing.assert_array_equal(actual, wanted)

                    # explicit edges, given as a list and as a chunked tensor
                    edge_specs = [[0, 4, 8], tensor([0, 4, 8], chunk_size=2)]
                    for bins in edge_specs:
                        counts = histogram(t, bins=bins, density=density)[0]
                        actual = ctx_executor.execute_tensors([counts])[0]
                        wanted = np.histogram(data, bins=[0, 4, 8], density=density)[0]
                        np.testing.assert_array_equal(actual, wanted)
Example #4
0
    def setUp(self) -> None:
        """Create a ``LocalContext`` and a numpy test executor, then enter the context."""
        test_case = self

        class MockSession:
            # Resolved lazily through a property because self.executor is only
            # assigned after the context (which needs the session) is built.
            @property
            def executor(self):
                return test_case.executor

        context = LocalContext(MockSession())
        self.ctx = context
        self.executor = ExecutorForTest('numpy', storage=context)
        # NOTE(review): no matching __exit__ is visible in this block; it
        # presumably lives in tearDown - confirm.
        context.__enter__()
Example #5
0
    def testHistogramBinEdgesExecution(self):
        """Compare ``histogram_bin_edges`` with ``np.histogram_bin_edges``."""
        rng = np.random.RandomState(0)

        values = rng.randint(10, size=(20,))
        values_t = tensor(values, chunk_size=3)

        # range provided
        for bounds in [(0, 10), (3, 11), (3, 7)]:
            edges = histogram_bin_edges(values_t, range=bounds)
            actual = self.executor.execute_tensor(edges)[0]
            wanted = np.histogram_bin_edges(values, range=bounds)
            np.testing.assert_array_equal(actual, wanted)

        outer = self

        class MockSession:
            def __init__(self):
                self.executor = outer.executor

        context = LocalContext(MockSession())
        ctx_executor = ExecutorForTest('numpy', storage=context)
        with context:
            single = rng.randint(10, size=(1,))
            single_t = tensor(single)
            nothing = rng.randint(10, size=(0,))
            nothing_t = tensor(nothing)
            pairs = [(values_t, values), (single_t, single),
                     (nothing_t, nothing), (sort(values_t), values)]
            for t, data in pairs:
                estimators = [10, 'stone', 'auto', 'doane', 'fd',
                              'rice', 'scott', 'sqrt', 'sturges']
                for bins in estimators:
                    edges = histogram_bin_edges(t, bins=bins)

                    # for non-empty input, execute_tensor is expected to raise
                    # TilesError while execute_tensors succeeds
                    if data.size > 0:
                        with self.assertRaises(TilesError):
                            ctx_executor.execute_tensor(edges)

                    actual = ctx_executor.execute_tensors([edges])[0]
                    wanted = np.histogram_bin_edges(data, bins=bins)
                    np.testing.assert_array_equal(actual, wanted)

                # explicit edges, given as a list and as a chunked tensor
                edge_specs = [[0, 4, 8], tensor([0, 4, 8], chunk_size=2)]
                for bins in edge_specs:
                    edges = histogram_bin_edges(t, bins=bins)
                    actual = ctx_executor.execute_tensors([edges])[0]
                    wanted = np.histogram_bin_edges(data, bins=[0, 4, 8])
                    np.testing.assert_array_equal(actual, wanted)

            # default bins: the shape is checked in addition to the values
            seq = np.arange(5)
            seq_t = tensor(seq, chunk_size=3)
            edges = histogram_bin_edges(seq_t)
            actual = ctx_executor.execute_tensors([edges])[0]
            wanted = np.histogram_bin_edges(seq)
            self.assertEqual(edges.shape, wanted.shape)
            np.testing.assert_array_equal(actual, wanted)
Example #6
0
    def _create_test_context(cls, executor=None):
        """Build a ``LocalContext`` over a mock session and a new test executor.

        ``executor`` is what the mock session exposes as its ``executor``
        attribute. Returns ``(ctx, new_executor)`` where ``new_executor`` is an
        ``ExecutorForTest('numpy', storage=ctx)``.
        """
        holder = {'executor': executor}

        class MockSession:
            def __init__(self):
                # Read once at instantiation, i.e. before the holder is
                # updated with the new executor below.
                self.executor = holder['executor']

        session = MockSession()
        ctx = LocalContext(session)
        new_executor = ExecutorForTest('numpy', storage=ctx)
        holder['executor'] = new_executor

        return ctx, new_executor
Example #7
0
    def testSeriesQuantileExecution(self):
        """Compare ``Series.quantile`` with pandas for scalar, list and tensor ``q``."""
        pd_series = pd.Series(np.random.rand(10), name='a')
        md_series = Series(pd_series, chunk_size=3)

        def run(tileable):
            # Execute with the test case's executor and return the single result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # q = 0.5, scalar
        actual = run(md_series.quantile())
        self.assertEqual(actual, pd_series.quantile())

        # q is a list
        actual = run(md_series.quantile([0.3, 0.7]))
        pd.testing.assert_series_equal(actual, pd_series.quantile([0.3, 0.7]))

        # test interpolation
        actual = run(md_series.quantile([0.3, 0.7], interpolation='midpoint'))
        pd.testing.assert_series_equal(
            actual, pd_series.quantile([0.3, 0.7], interpolation='midpoint'))

        outer = self

        class MockSession:
            def __init__(self):
                self.executor = outer.executor

        context = LocalContext(MockSession())
        ctx_executor = ExecutorForTest('numpy', storage=context)
        with context:
            # q is a tensor
            q_t = tensor([0.3, 0.7])
            lazy = md_series.quantile(q_t)
            actual = ctx_executor.execute_dataframes([lazy])[0]
            wanted = pd_series.quantile([0.3, 0.7])

            pd.testing.assert_series_equal(actual, wanted)
    def testStoreHDF5Execution(self):
        """Store tensors with ``tohdf5`` and read them back through ``h5py``.

        The target is given as a filename, an open ``h5py.File`` and an
        ``h5py`` dataset; invalid calls must raise ``TypeError`` or
        ``ValueError``.
        """
        raw = np.random.RandomState(0).rand(10, 20)

        group_name = 'test_group'
        dataset_name = 'test_dataset'
        dataset_path = '{}/{}'.format(group_name, dataset_name)

        one_chunk_t = tensor(raw, chunk_size=20)
        multi_chunk_t = tensor(raw, chunk_size=9)

        with self.assertRaises(TypeError):
            tohdf5(object(), multi_chunk_t)

        outer = self

        class MockSession:
            def __init__(self):
                self.executor = outer.executor

        context = LocalContext(MockSession())
        ctx_executor = ExecutorForTest('numpy', storage=context)
        with context:
            with tempfile.TemporaryDirectory() as tmp_dir:
                filename = os.path.join(
                    tmp_dir, 'test_store_{}.hdf5'.format(int(time.time())))

                def assert_stored_equals_raw():
                    # Re-open the file read-only and compare the dataset with raw.
                    with h5py.File(filename, 'r') as stored_file:
                        stored = np.asarray(stored_file[dataset_path])
                        np.testing.assert_array_equal(stored, raw)

                # test 1 chunk
                store_op = tohdf5(filename, one_chunk_t,
                                  group=group_name, dataset=dataset_name)
                ctx_executor.execute_tensor(store_op)
                assert_stored_equals_raw()

                # test filename
                store_op = tohdf5(filename, multi_chunk_t,
                                  group=group_name, dataset=dataset_name)
                ctx_executor.execute_tensor(store_op)

                tiled = get_tiled(store_op)
                second_input = tiled.chunks[0].inputs[1]
                self.assertEqual(type(second_input.op).__name__, 'SuccessorsExclusive')
                self.assertEqual(len(second_input.inputs), 0)

                assert_stored_equals_raw()

                # a filename without group/dataset is rejected
                with self.assertRaises(ValueError):
                    tohdf5(filename, multi_chunk_t)

                # test file
                with h5py.File(filename, 'r') as f:
                    store_op = tohdf5(f, multi_chunk_t,
                                      group=group_name, dataset=dataset_name)

                ctx_executor.execute_tensor(store_op)
                assert_stored_equals_raw()

                # a file object without group/dataset is rejected as well
                with self.assertRaises(ValueError):
                    with h5py.File(filename, 'r') as f:
                        tohdf5(f, multi_chunk_t)

                # test dataset
                with h5py.File(filename, 'r') as f:
                    ds = f[dataset_path]
                    store_op = tohdf5(ds, multi_chunk_t)

                ctx_executor.execute_tensor(store_op)
                assert_stored_equals_raw()
Example #9
0
    def testDataFrameQuantileExecution(self):
        """Compare ``DataFrame.quantile`` with pandas for several argument sets."""
        raw = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [np.random.bytes(10) for _ in range(10)],
                'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
                'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
            },
            index=pd.RangeIndex(1, 11))
        df = DataFrame(raw, chunk_size=3)

        def run(tileable):
            # Execute with the test case's executor and return the single result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        same_series = pd.testing.assert_series_equal
        same_frame = pd.testing.assert_frame_equal

        # (positional args, keyword args, comparison matching the result type)
        cases = [
            ((), {}, same_series),                                     # q = 0.5, axis = 0
            ((), {'axis': 1}, same_series),                            # q = 0.5, axis = 1
            (([0.3, 0.7],), {}, same_frame),                           # q is a list, axis = 0
            (([0.3, 0.7],), {'axis': 1}, same_frame),                  # q is a list, axis = 1
            (([0.3, 0.7],), {'interpolation': 'midpoint'}, same_frame),  # test interpolation
        ]
        for args, kwargs, assert_same in cases:
            actual = run(df.quantile(*args, **kwargs))
            wanted = raw.quantile(*args, **kwargs)
            assert_same(actual, wanted)

        outer = self

        class MockSession:
            def __init__(self):
                self.executor = outer.executor

        context = LocalContext(MockSession())
        ctx_executor = ExecutorForTest('numpy', storage=context)
        with context:
            # q is a tensor
            q_t = tensor([0.3, 0.7])
            lazy = df.quantile(q_t)
            actual = ctx_executor.execute_dataframes([lazy])[0]
            wanted = raw.quantile([0.3, 0.7])

            same_frame(actual, wanted)

        # test numeric_only
        raw2 = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
            },
            index=pd.RangeIndex(1, 11))
        df2 = DataFrame(raw2, chunk_size=3)

        actual = run(df2.quantile([0.3, 0.7], numeric_only=False))
        same_frame(actual, raw2.quantile([0.3, 0.7], numeric_only=False))

        actual = run(df2.quantile(numeric_only=False))
        same_series(actual, raw2.quantile(numeric_only=False))
Example #10
0
    def testQuantileExecution(self):
        """Compare ``quantile`` with ``np.quantile`` over chunkings and options.

        Covers single- and multi-chunk 1-d and 2-d inputs (each with a clean
        and a NaN-containing variant), the ``out`` argument, and ``q`` given as
        a tensor, including the ``ValueError`` expected after ``q[0] = 1.1``.
        """
        def check(t, data, q, **kwargs):
            # Run the tensor quantile and require exact equality with numpy.
            r = quantile(t, q, **kwargs)
            result = self.executor.execute_tensor(r, concat=True)[0]
            expected = np.quantile(data, q, **kwargs)
            np.testing.assert_array_equal(result, expected)

        def with_nans(data):
            # Copy of data with 3 seeded flat positions set to NaN.
            holed = data.copy()
            holed.flat[np.random.RandomState(0).randint(data.size, size=3)] = np.nan
            return holed

        # a scalar q and a vector q, both seeded
        qs = [np.random.RandomState(0).rand(), np.random.RandomState(0).rand(5)]
        no_axis = [{}]  # 1-d cases do not pass ``axis`` at all
        axes_2d = [{'axis': axis} for axis in [None, 0, 1]]

        # (shape, chunk_size, axis keyword variants)
        cases = [
            ((20,), 20, no_axis),          # test 1 chunk, 1-d
            ((20, 10), 20, axes_2d),       # test 1 chunk, 2-d
            ((20,), 3, no_axis),           # test multi chunks, 1-d
            ((20, 10), (3, 4), axes_2d),   # test multi chunk, 2-d
        ]
        for shape, chunk_size, axis_variants in cases:
            raw = np.random.rand(*shape)
            a = tensor(raw, chunk_size=chunk_size)

            # The NaN variant uses the same chunking as the clean one. The
            # multi-chunk 1-d case previously used chunk_size=20 here, so NaN
            # handling across chunks was never exercised in 1-d.
            raw2 = with_nans(raw)
            a2 = tensor(raw2, chunk_size=chunk_size)

            for q in qs:
                for interpolation in INTERPOLATION_TYPES:
                    for keepdims in [True, False]:
                        for axis_kw in axis_variants:
                            kwargs = dict(axis_kw, interpolation=interpolation,
                                          keepdims=keepdims)
                            check(a, raw, q, **kwargs)
                            check(a2, raw2, q, **kwargs)

        # test out, 1 chunk
        raw = np.random.rand(20)
        q = np.random.rand(11)
        a = tensor(raw, chunk_size=20)
        out = empty((5, 11))
        quantile(a, q, out=out)

        result = self.executor.execute_tensor(out, concat=True)[0]
        expected = np.quantile(raw, q, out=np.empty((5, 11)))
        np.testing.assert_array_equal(result, expected)

        # test out, multi chunks
        raw = np.random.rand(20)
        q = np.random.rand(11)
        a = tensor(raw, chunk_size=3)
        out = empty((5, 11))
        quantile(a, q, out=out)

        result = self.executor.execute_tensor(out, concat=True)[0]
        expected = np.quantile(raw, q, out=np.empty((5, 11)))
        np.testing.assert_array_equal(result, expected)

        # test q which is a tensor (reuses the multi-chunk ``a``/``raw`` above)
        q_raw = np.random.RandomState(0).rand(5)
        q = tensor(q_raw, chunk_size=3)

        this = self

        class MockSession:
            def __init__(self):
                self.executor = this.executor

        ctx = LocalContext(MockSession())
        executor = ExecutorForTest('numpy', storage=ctx)
        with ctx:
            r = quantile(a, q, axis=None)

            result = executor.execute_tensors([r])[0]
            expected = np.quantile(raw, q_raw, axis=None)

            np.testing.assert_array_equal(result, expected)

            with self.assertRaises(ValueError):
                q[0] = 1.1
                r = quantile(a, q, axis=None)
                # execute_tensors takes a list of tensors, as in the valid
                # case above; previously the bare tensor was passed.
                _ = executor.execute_tensors([r])[0]
    def testCutExecution(self):
        """Compare ``cut`` with ``pd.cut`` across input kinds, bins and labels."""
        rng = np.random.RandomState(0)
        values = rng.random(15) * 1000
        pd_series = pd.Series(values, index=['i{}'.format(i) for i in range(15)])
        bins = [10, 100, 500]
        intervals = pd.interval_range(10, 500, 3)
        labels = ['a', 'b']

        values_t = tensor(values, chunk_size=4)
        md_series = from_pandas_series(pd_series, chunk_size=4)
        md_intervals = from_pandas_index(intervals, chunk_size=2)

        def run_df(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def run_tensor(tileable):
            return self.executor.execute_tensor(tileable, concat=True)[0]

        def assert_each_equal(actual, wanted):
            # Element-wise comparison, used where the results are categoricals.
            for got, want in zip(actual, wanted):
                np.testing.assert_equal(got, want)

        # cut on Series
        pd.testing.assert_series_equal(
            run_df(cut(md_series, bins)), pd.cut(pd_series, bins))

        cut_r, bins_r = cut(md_series, bins, retbins=True)
        cut_actual = run_df(cut_r)
        bins_actual = run_tensor(bins_r)
        cut_wanted, bins_wanted = pd.cut(pd_series, bins, retbins=True)
        pd.testing.assert_series_equal(cut_actual, cut_wanted)
        np.testing.assert_array_equal(bins_actual, bins_wanted)

        # cut on tensor
        # result and expected is array whose dtype is CategoricalDtype
        actual = run_df(cut(values_t, bins))
        wanted = pd.cut(values, bins)
        self.assertEqual(len(actual), len(wanted))
        assert_each_equal(actual, wanted)

        # one chunk
        lazy = cut(pd_series,
                   tensor(bins, chunk_size=2),
                   right=False,
                   include_lowest=True)
        pd.testing.assert_series_equal(
            run_df(lazy), pd.cut(pd_series, bins, right=False, include_lowest=True))

        # test labels
        # result and expected is array whose dtype is CategoricalDtype
        actual = run_df(cut(values_t, bins, labels=labels))
        wanted = pd.cut(values, bins, labels=labels)
        self.assertEqual(len(actual), len(wanted))
        assert_each_equal(actual, wanted)

        # labels=False, executed as a tensor
        actual = run_tensor(cut(values_t, bins, labels=False))
        wanted = pd.cut(values, bins, labels=False)
        np.testing.assert_array_equal(actual, wanted)

        # test labels which is tensor
        # result and expected is array whose dtype is CategoricalDtype
        labels_t = tensor(['a', 'b'], chunk_size=1)
        actual = run_df(cut(values, bins, labels=labels_t, include_lowest=True))
        wanted = pd.cut(values, bins, labels=labels, include_lowest=True)
        self.assertEqual(len(actual), len(wanted))
        assert_each_equal(actual, wanted)

        # test labels=False
        cut_r, bins_r = cut(values, intervals, labels=False, retbins=True)
        cut_actual = self.executor.execute_tileable(cut_r, concat=True)[0]
        bins_actual = self.executor.execute_tileable(bins_r, concat=True)[0]
        cut_wanted, bins_wanted = pd.cut(values, intervals, labels=False, retbins=True)
        assert_each_equal(cut_actual, cut_wanted)
        pd.testing.assert_index_equal(bins_actual, bins_wanted)

        # test bins which is md.IntervalIndex
        cut_r, bins_r = cut(md_series,
                            md_intervals,
                            labels=tensor(labels, chunk_size=1),
                            retbins=True)
        cut_actual = run_df(cut_r)
        bins_actual = run_df(bins_r)
        cut_wanted, bins_wanted = pd.cut(pd_series, intervals, labels=labels, retbins=True)
        pd.testing.assert_series_equal(cut_actual, cut_wanted)
        pd.testing.assert_index_equal(bins_actual, bins_wanted)

        # test duplicates
        dup_bins = [0, 2, 4, 6, 10, 10]
        cut_r, bins_r = cut(pd_series,
                            dup_bins,
                            labels=False,
                            retbins=True,
                            right=False,
                            duplicates='drop')
        cut_actual = run_df(cut_r)
        bins_actual = run_tensor(bins_r)
        cut_wanted, bins_wanted = pd.cut(pd_series,
                                         dup_bins,
                                         labels=False,
                                         retbins=True,
                                         right=False,
                                         duplicates='drop')
        pd.testing.assert_series_equal(cut_actual, cut_wanted)
        np.testing.assert_array_equal(bins_actual, bins_wanted)

        outer = self

        class MockSession:
            def __init__(self):
                self.executor = outer.executor

        context = LocalContext(MockSession())
        ctx_executor = ExecutorForTest('numpy', storage=context)
        with context:
            # test integer bins
            lazy = cut(md_series, 3)
            actual = ctx_executor.execute_dataframes([lazy])[0]
            pd.testing.assert_series_equal(actual, pd.cut(pd_series, 3))

            cut_r, bins_r = cut(md_series, 3, right=False, retbins=True)
            cut_actual, bins_actual = ctx_executor.execute_dataframes([cut_r, bins_r])
            cut_wanted, bins_wanted = pd.cut(pd_series, 3, right=False, retbins=True)
            pd.testing.assert_series_equal(cut_actual, cut_wanted)
            np.testing.assert_array_equal(bins_actual, bins_wanted)

            # test min max same
            constant = pd.Series([1.1] * 15)
            lazy = cut(constant, 3)
            actual = ctx_executor.execute_dataframes([lazy])[0]
            pd.testing.assert_series_equal(actual, pd.cut(constant, 3))

            # test inf exist
            with_inf = constant.copy()
            with_inf[-1] = np.inf
            with self.assertRaises(ValueError):
                ctx_executor.execute_dataframes([cut(with_inf, 3)])
Example #12
0
    def testRollingAggExecution(self):
        """Compare rolling aggregations on DataFrame and Series with pandas."""
        raw = pd.DataFrame({
            'a': np.random.randint(100, size=(10, )),
            'b': np.random.rand(10),
            'c': np.random.randint(100, size=(10, )),
            'd': ['c' * i for i in np.random.randint(4, size=10)]
        })
        raw.iloc[1, ::4] = np.nan
        col_b = raw.iloc[:, 1]

        def run(tileable, **extra):
            # Execute with the test case's executor and return the single result.
            return self.executor.execute_dataframe(
                tileable, concat=True, **extra)[0]

        same_frame = pd.testing.assert_frame_equal
        same_series = pd.testing.assert_series_equal

        one_chunk_df = md.DataFrame(raw, chunk_size=10)  # 1 chunk
        multi_chunk_df = md.DataFrame(raw, chunk_size=3)  # multiple chunks on each axis
        frame_funcs = ['min', ['max', 'mean'], {'c': ['std'], 'b': ['count', 'min']}]

        # test 1 chunk
        lazy = one_chunk_df.rolling(3).agg(frame_funcs[2])
        same_frame(run(lazy), raw.rolling(3).agg(frame_funcs[2]))

        for window in [2, 5]:
            for center in [True, False]:
                for func in frame_funcs:
                    lazy = multi_chunk_df.rolling(window, center=center).agg(func)
                    actual = run(lazy)
                    wanted = raw.rolling(window, center=center).agg(func)
                    same_frame(actual, wanted)

        # test min_periods and win_type
        lazy = multi_chunk_df.rolling(3, min_periods=1, win_type='triang').agg('sum')
        actual = run(lazy)
        same_frame(actual, raw.rolling(3, min_periods=1, win_type='triang').agg('sum'))

        # test rolling getitem, series
        lazy = multi_chunk_df.rolling(3)['b'].agg('sum')
        actual = run(lazy)
        same_series(actual, raw.rolling(3)['b'].agg('sum'))

        # test rolling getitem, dataframe
        lazy = multi_chunk_df.rolling(3)['c', 'b'].agg('sum')
        actual = run(lazy)
        same_frame(actual, raw.rolling(3)['c', 'b'].agg('sum'))

        # test axis=1
        lazy = multi_chunk_df.rolling(3, axis=1).agg('sum')
        actual = run(lazy, check_nsplits=False)
        same_frame(actual, raw.rolling(3, axis=1).agg('sum'))

        # test window which is offset
        dated = raw.copy()
        dated.reset_index(inplace=True, drop=True)
        dated.index = pd.date_range('2020-2-25', periods=10)

        dated_df = md.DataFrame(dated, chunk_size=3)
        for func in frame_funcs:
            lazy = dated_df.rolling('2d').agg(func)
            actual = run(lazy)
            same_frame(actual, dated.rolling('2d').agg(func))

        series_variants = [md.Series(col_b, chunk_size=10),
                           md.Series(col_b, chunk_size=4)]
        series_funcs = ['min', ['max', 'mean'], {'c': 'std', 'b': 'count'}]

        for md_series in series_variants:
            for window in [2, 3, 5]:
                for center in [True, False]:
                    for func in series_funcs:
                        lazy = md_series.rolling(window, center=center).agg(func)
                        actual = run(lazy)
                        wanted = col_b.rolling(window, center=center).agg(func)
                        # pandas may return a Series or a DataFrame here
                        if isinstance(wanted, pd.Series):
                            same_series(actual, wanted)
                        else:
                            same_frame(actual, wanted)

        outer = self

        class MockSession:
            def __init__(self):
                self.executor = outer.executor

        context = LocalContext(MockSession())
        ctx_executor = ExecutorForTest('numpy', storage=context)
        with context:
            # rolling over a boolean-filtered DataFrame
            filtered_df = md.DataFrame(raw, chunk_size=3)
            filtered_df = filtered_df[filtered_df.a > 0.5]
            lazy = filtered_df.rolling(3).agg('max')

            actual = ctx_executor.execute_dataframes([lazy])[0]
            same_frame(actual, raw[raw.a > 0.5].rolling(3).agg('max'))

            # rolling over a boolean-filtered Series
            filtered_series = md.Series(col_b, chunk_size=3)
            filtered_series = filtered_series[filtered_series > 0.5]
            lazy = filtered_series.rolling(3).agg('max')

            actual = ctx_executor.execute_dataframes([lazy])[0]
            same_series(actual, col_b[col_b > 0.5].rolling(3).agg('max'))

        # test agg functions
        agg_df = md.DataFrame(raw, chunk_size=3)
        plain_funcs = ['count', 'sum', 'mean', 'median', 'min', 'max', 'skew', 'kurt']
        for name in plain_funcs:
            lazy = getattr(agg_df.rolling(4), name)()
            actual = run(lazy)
            same_frame(actual, getattr(raw.rolling(4), name)())
        for name in ['std', 'var']:
            lazy = getattr(agg_df.rolling(4), name)(ddof=0)
            actual = run(lazy)
            same_frame(actual, getattr(raw.rolling(4), name)(ddof=0))