Ejemplo n.º 1
0
        def execute_size(t):
            """Mock-execute ``t`` while recording tensordot size estimates.

            The recording dicts (``chunk_sizes``, ``chunk_nbytes``,
            ``chunk_input_sizes``, ``chunk_input_nbytes``) come from the
            enclosing scope. They are emptied first and then filled, keyed by
            output chunk key, by a temporary size estimator registered for
            ``TensorTensorDot``.
            """
            def _tensordot_size_recorder(ctx, op):
                # Run the stock estimator first so ctx holds the estimate.
                TensorTensorDot.estimate_size(ctx, op)

                out_chunk = op.outputs[0]
                out_key = out_chunk.key
                chunk_sizes[out_key] = ctx[out_key]
                chunk_nbytes[out_key] = out_chunk.nbytes

                # Keyed by the producing op's key, so inputs that share an op
                # key contribute a single entry to each sum.
                size_by_op = {inp.op.key: ctx[inp.key][0] for inp in op.inputs}
                chunk_input_sizes[out_key] = sum(size_by_op.values())
                nbytes_by_op = {inp.op.key: inp.nbytes for inp in op.inputs}
                chunk_input_nbytes[out_key] = sum(nbytes_by_op.values())

            size_executor = ExecutorForTest(
                sync_provider_type=ExecutorForTest.SyncProviderType.MOCK)
            try:
                for recorded in (chunk_sizes, chunk_nbytes,
                                 chunk_input_sizes, chunk_input_nbytes):
                    recorded.clear()
                register(TensorTensorDot,
                         size_estimator=_tensordot_size_recorder)
                size_executor.execute_tensor(t, mock=True)
            finally:
                # Always put the default registration back.
                register_default(TensorTensorDot)
Ejemplo n.º 2
0
    def setUp(self):
        """Register the Mars backend and install a test executor.

        A new session is made the default one; its previous executor is kept
        in ``self._old_executor`` and replaced by an ``ExecutorForTest`` that
        uses the session's own context as storage.
        """
        register_mars_backend()

        self.session = new_session().as_default()
        self._old_executor = self.session._sess._executor

        test_executor = ExecutorForTest(
            'numpy', storage=self.session._sess._context)
        self.executor = test_executor
        self.session._sess._executor = test_executor
Ejemplo n.º 3
0
    def testPercentileExecution(self):
        """Check ``percentile`` against ``np.percentile`` for array and tensor ``q``."""
        data = np.random.rand(20, 10)
        quantiles = np.random.RandomState(0).randint(100, size=11)
        chunked = tensor(data, chunk_size=7)

        # q given as a plain numpy array
        lazy = percentile(chunked, quantiles)
        result = self.executor.execute_tensor(lazy, concat=True)[0]
        expected = np.percentile(data, quantiles)
        np.testing.assert_array_equal(result, expected)

        quantiles_tensor = tensor(quantiles)

        test_case = self

        class MockSession:
            def __init__(self):
                self.executor = test_case.executor

        ctx = LocalContext(MockSession())
        executor = ExecutorForTest('numpy', storage=ctx)
        with ctx:
            # q given as a tensor; run through execute_tensors inside the context
            lazy = percentile(chunked, quantiles_tensor)
            outputs = executor.execute_tensors([lazy])
            result = outputs[0]

            np.testing.assert_array_equal(result, expected)
Ejemplo n.º 4
0
    def testInputTileable(self):
        """Spawn functions that receive a tensor or a DataFrame and check results."""
        def f(t, x):
            scaled = t * x
            return scaled.sum().to_numpy()

        rng = np.random.RandomState(0)
        raw = rng.rand(5, 4)

        column_sums = mt.tensor(raw, chunk_size=3).sum(axis=0)
        s = spawn(f, args=(column_sums, 3))

        sess = new_session()
        sess._sess._executor = ExecutorForTest('numpy', storage=sess._context)

        result = s.execute(session=sess).fetch(session=sess)
        expected = (raw.sum(axis=0) * 3).sum()
        self.assertAlmostEqual(result, expected)

        # DataFrame inputs: one executed directly, one produced by shuffle.
        plain_df = md.DataFrame(raw, chunk_size=3)
        plain_df.execute(session=sess)
        shuffled_df = shuffle(plain_df)
        shuffled_df.execute(session=sess)

        def f2(input_df):
            bonus = input_df.iloc[:, 0].fetch().sum()
            return input_df.sum().to_pandas() + bonus

        for df in (plain_df, shuffled_df):
            s = spawn(f2, args=(df, ))

            result = s.execute(session=sess).fetch(session=sess)
            expected = pd.DataFrame(raw).sum() + raw[:, 0].sum()
            pd.testing.assert_series_equal(result, expected)
Ejemplo n.º 5
0
    def testHistogramExecution(self):
        """Compare the first output of ``histogram`` with ``np.histogram``."""
        rng = np.random.RandomState(0)

        raw = rng.randint(10, size=(20,))
        a = tensor(raw, chunk_size=3)
        raw_weights = rng.random(20)
        weights = tensor(raw_weights, chunk_size=4)

        # range provided
        for lo_hi in ((0, 10), (3, 11), (3, 7)):
            counts = histogram(a, range=lo_hi)[0]
            result = self.executor.execute_tensor(counts)[0]
            expected = np.histogram(raw, range=lo_hi)[0]
            np.testing.assert_array_equal(result, expected)

        # explicit bin edges, weights given as ndarray and as tensor
        for wt in (raw_weights, weights):
            for density in (True, False):
                edges = [1, 4, 6, 9]
                counts = histogram(a, bins=edges, weights=wt, density=density)[0]
                result = self.executor.execute_tensor(counts)[0]
                expected = np.histogram(
                    raw, bins=edges, weights=raw_weights, density=density)[0]
                np.testing.assert_almost_equal(result, expected)

        test_case = self

        class MockSession:
            def __init__(self):
                self.executor = test_case.executor

        # Bin specifications given as a count or an estimator name.
        bin_specs = (10, 'stone', 'auto', 'doane', 'fd',
                     'rice', 'scott', 'sqrt', 'sturges')

        ctx = LocalContext(MockSession())
        executor = ExecutorForTest('numpy', storage=ctx)
        with ctx:
            raw2 = rng.randint(10, size=(1,))
            b = tensor(raw2)
            raw3 = rng.randint(10, size=(0,))
            c = tensor(raw3)
            cases = [(a, raw), (b, raw2), (c, raw3), (sort(a), raw)]
            for t, r in cases:
                for density in (True, False):
                    for bins in bin_specs:
                        hist = histogram(t, bins=bins, density=density)[0]

                        if r.size > 0:
                            # execute_tensor is expected to raise TilesError
                            # for non-empty input; execute_tensors succeeds.
                            with self.assertRaises(TilesError):
                                executor.execute_tensor(hist)

                        result = executor.execute_tensors([hist])[0]
                        expected = np.histogram(r, bins=bins, density=density)[0]
                        np.testing.assert_array_equal(result, expected)

                    # bins as a list and as a chunked tensor
                    for bins in ([0, 4, 8], tensor([0, 4, 8], chunk_size=2)):
                        hist = histogram(t, bins=bins, density=density)[0]
                        result = executor.execute_tensors([hist])[0]
                        expected = np.histogram(r, bins=[0, 4, 8], density=density)[0]
                        np.testing.assert_array_equal(result, expected)
Ejemplo n.º 6
0
    def setUp(self):
        """Load the iris data as a tensor and install a test executor.

        The default session's previous executor is kept in
        ``self._old_executor``.
        """
        iris_data = datasets.load_iris().data
        self.iris = mt.tensor(iris_data)
        # arpack is deliberately left out of the solver list
        self.solver_list = ['full', 'randomized', 'auto']

        self.session = new_session().as_default()
        self._old_executor = self.session._sess._executor

        test_executor = ExecutorForTest(
            'numpy', storage=self.session._sess._context)
        self.executor = test_executor
        self.session._sess._executor = test_executor
Ejemplo n.º 7
0
    def testRandintExecution(self):
        """Check the mock size estimate and the value range of ``random.randint``."""
        mock_executor = ExecutorForTest(
            sync_provider_type=ExecutorForTest.SyncProviderType.MOCK)

        arr = tensor.random.randint(0, 2, size=(10, 30), chunk_size=3)
        size_res = mock_executor.execute_tensor(arr, mock=True)
        # The first element of every mock result is summed and compared
        # with the tensor's nbytes.
        estimated_total = sum(sizes[0] for sizes in size_res)
        self.assertEqual(arr.nbytes, estimated_total)

        res = self.executor.execute_tensor(arr, concat=True)[0]
        self.assertEqual(res.shape, (10, 30))
        self.assertTrue(np.all(res >= 0))
        self.assertTrue(np.all(res < 2))
Ejemplo n.º 8
0
    def setUp(self) -> None:
        """Create a ``LocalContext``-backed test executor and enter the context.

        The mock session exposes ``executor`` as a property, so it resolves
        to whatever ``self.executor`` is at access time.
        """
        test_case = self

        class MockSession:
            @property
            def executor(self):
                return test_case.executor

        ctx = LocalContext(MockSession())
        self.ctx = ctx
        self.executor = ExecutorForTest('numpy', storage=ctx)
        # Entered by hand; the matching exit is not part of this method.
        ctx.__enter__()
Ejemplo n.º 9
0
    def testHistogramBinEdgesExecution(self):
        """Compare ``histogram_bin_edges`` with ``np.histogram_bin_edges``."""
        rng = np.random.RandomState(0)

        raw = rng.randint(10, size=(20,))
        a = tensor(raw, chunk_size=3)

        # range provided
        for lo_hi in ((0, 10), (3, 11), (3, 7)):
            edges = histogram_bin_edges(a, range=lo_hi)
            result = self.executor.execute_tensor(edges)[0]
            expected = np.histogram_bin_edges(raw, range=lo_hi)
            np.testing.assert_array_equal(result, expected)

        test_case = self

        class MockSession:
            def __init__(self):
                self.executor = test_case.executor

        # Bin specifications given as a count or an estimator name.
        bin_specs = (10, 'stone', 'auto', 'doane', 'fd',
                     'rice', 'scott', 'sqrt', 'sturges')

        ctx = LocalContext(MockSession())
        executor = ExecutorForTest('numpy', storage=ctx)
        with ctx:
            raw2 = rng.randint(10, size=(1,))
            b = tensor(raw2)
            raw3 = rng.randint(10, size=(0,))
            c = tensor(raw3)
            cases = [(a, raw), (b, raw2), (c, raw3), (sort(a), raw)]
            for t, r in cases:
                for bins in bin_specs:
                    edges = histogram_bin_edges(t, bins=bins)

                    if r.size > 0:
                        # execute_tensor is expected to raise TilesError for
                        # non-empty input; execute_tensors succeeds.
                        with self.assertRaises(TilesError):
                            executor.execute_tensor(edges)

                    result = executor.execute_tensors([edges])[0]
                    expected = np.histogram_bin_edges(r, bins=bins)
                    np.testing.assert_array_equal(result, expected)

                # bins as a list and as a chunked tensor
                for bins in ([0, 4, 8], tensor([0, 4, 8], chunk_size=2)):
                    edges = histogram_bin_edges(t, bins=bins)
                    result = executor.execute_tensors([edges])[0]
                    expected = np.histogram_bin_edges(r, bins=[0, 4, 8])
                    np.testing.assert_array_equal(result, expected)

            # default bins: the lazy tensor's shape must match numpy's result
            raw = np.arange(5)
            a = tensor(raw, chunk_size=3)
            edges = histogram_bin_edges(a)
            result = executor.execute_tensors([edges])[0]
            expected = np.histogram_bin_edges(raw)
            self.assertEqual(edges.shape, expected.shape)
            np.testing.assert_array_equal(result, expected)
Ejemplo n.º 10
0
    def setUp(self):
        """Create random ``X``/``y`` tensors and install a test executor.

        The default session's previous executor is kept in
        ``self._old_executor``.
        """
        n_rows, n_columns, chunk_size = 1000, 10, 20
        rs = mt.random.RandomState(0)
        self.X = rs.rand(n_rows, n_columns, chunk_size=chunk_size)
        self.y = rs.rand(n_rows, chunk_size=chunk_size)

        self.session = new_session().as_default()
        self._old_executor = self.session._sess._executor

        test_executor = ExecutorForTest(
            'numpy', storage=self.session._sess._context)
        self.executor = test_executor
        self.session._sess._executor = test_executor
Ejemplo n.º 11
0
    def testSparseRandintExecution(self):
        """Check the size estimate, sparsity and value range of sparse ``randint``."""
        mock_executor = ExecutorForTest(
            sync_provider_type=ExecutorForTest.SyncProviderType.MOCK)

        arr = tensor.random.randint(
            1, 2, size=(30, 50), density=.1, chunk_size=10, dtype='f4')
        size_res = mock_executor.execute_tensor(arr, mock=True)
        # The summed estimate is compared with nbytes scaled by the 0.1 density.
        estimated_total = sum(sizes[0] for sizes in size_res)
        self.assertAlmostEqual(arr.nbytes * 0.1, estimated_total)

        res = self.executor.execute_tensor(arr, concat=True)[0]
        self.assertTrue(issparse(res))
        self.assertEqual(res.shape, (30, 50))
        self.assertTrue(np.all(res.data >= 1))
        self.assertTrue(np.all(res.data < 2))
        filled = (res >= 1).toarray().sum()
        self.assertAlmostEqual(filled, 30 * 50 * .1, delta=20)
Ejemplo n.º 12
0
    def setUp(self) -> None:
        """Install a test executor and list the estimator configurations.

        ``self.estimators`` holds ``(class, kwargs)`` pairs for
        ``LabelPropagation`` with an ``'rbf'`` kernel, a ``'knn'`` kernel and
        a callable kernel.
        """
        self.session = new_session().as_default()
        self._old_executor = self.session._sess._executor

        test_executor = ExecutorForTest(
            'numpy', storage=self.session._sess._context)
        self.executor = test_executor
        self.session._sess._executor = test_executor

        rbf_params = {'kernel': 'rbf'}
        knn_params = {'kernel': 'knn', 'n_neighbors': 2}
        callable_params = {
            'kernel': lambda x, y: rbf_kernel(x, y, gamma=20),
        }
        self.estimators = [
            (LabelPropagation, rbf_params),
            (LabelPropagation, knn_params),
            (LabelPropagation, callable_params),
        ]
Ejemplo n.º 13
0
    def testInputTileable(self):
        """Spawn a function that receives a tensor input and check its result."""
        def f(t, x):
            scaled = t * x
            return scaled.sum().to_numpy()

        rng = np.random.RandomState(0)
        raw = rng.rand(5, 4)

        column_sums = mt.tensor(raw, chunk_size=3).sum(axis=0)
        s = spawn(f, args=(column_sums, 3))

        sess = new_session()
        sess._sess._executor = ExecutorForTest('numpy', storage=sess._context)

        result = s.execute(session=sess).fetch(session=sess)
        expected = (raw.sum(axis=0) * 3).sum()
        self.assertAlmostEqual(result, expected)
Ejemplo n.º 14
0
    def setUp(self):
        """Build a small tf-idf-like sparse matrix and install a test executor.

        Sets ``self.X`` (CSR matrix), ``self.Xdense`` (its dense form),
        ``self.n_samples`` and ``self.n_features``. The default session's
        previous executor is kept in ``self._old_executor``.
        """
        # Make an X that looks somewhat like a small tf-idf matrix.
        # XXX newer versions of SciPy >0.16 have scipy.sparse.rand for this.
        shape = 60, 55
        n_samples, n_features = shape
        rng = check_random_state(42)
        # np.prod is used because the np.product alias was removed in NumPy 2.0.
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
        # Negative draws are clipped to 0 before the CSR matrix is built.
        X = sp.csr_matrix(np.maximum(X, 0), dtype=np.float64)
        X.data[:] = 1 + np.log(X.data)
        self.X = X
        # toarray() is the explicit spelling of the `.A` shorthand.
        self.Xdense = X.toarray()
        self.n_samples = n_samples
        self.n_features = n_features

        self.session = new_session().as_default()
        self._old_executor = self.session._sess._executor
        self.executor = self.session._sess._executor = \
            ExecutorForTest('numpy', storage=self.session._sess._context)
Ejemplo n.º 15
0
    def testUnknownShapeInputs(self):
        """Spawn a function on a boolean-mask-indexed tensor and check the result.

        The spawned function asserts that none of the dimensions of the
        tensor it receives is NaN.
        """
        def f(t, x):
            assert all(not np.isnan(s) for s in t.shape)
            scaled = t * x
            return scaled.sum().to_numpy(check_nsplits=False)

        rng = np.random.RandomState(0)
        raw = rng.rand(5, 4)

        source = mt.tensor(raw, chunk_size=3)
        positives = source[source > 0]
        s = spawn(f, args=(positives, 3))

        sess = new_session()
        sess._sess._executor = ExecutorForTest('numpy', storage=sess._context)

        result = s.execute(session=sess).fetch(session=sess)
        expected = (raw[raw > 0] * 3).sum()
        self.assertAlmostEqual(result, expected)
Ejemplo n.º 16
0
    def setUp(self) -> None:
        """Install a test executor and build the fixture tensors.

        Creates a scaled and offset random 2-d tensor, its first row and
        first column as 2-d tensors, list versions of that row and column,
        and the iris data as a tensor.
        """
        self.session = new_session().as_default()
        self._old_executor = self.session._sess._executor

        test_executor = ExecutorForTest(
            'numpy', storage=self.session._sess._context)
        self.executor = test_executor
        self.session._sess._executor = test_executor

        rng = mt.random.RandomState(0)
        n_features = 30
        n_samples = 1000
        self.n_features = n_features
        self.n_samples = n_samples

        offsets = rng.uniform(-1, 1, size=n_features)
        scales = rng.uniform(1, 10, size=n_features)
        X_2d = rng.randn(n_samples, n_features) * scales + offsets
        self.X_2d = X_2d

        X_1row = X_2d[0, :].reshape(1, n_features)
        self.X_1row = X_1row
        X_1col = X_2d[:, 0].reshape(n_samples, 1)
        self.X_1col = X_1col

        self.X_list_1row = X_1row.to_numpy().tolist()
        self.X_list_1col = X_1col.to_numpy().tolist()

        iris_data = load_iris().data
        self.iris = mt.tensor(iris_data)
Ejemplo n.º 17
0
    def setUp(self):
        """Create dense, DataFrame and sparse fixtures and install a test executor.

        ``self.X_sparse`` is built from a random array in which one entry per
        row is set to NaN, converted with ``tosparse(missing=np.nan)``.
        """
        n_rows, n_columns, chunk_size = 1000, 10, 20
        rs = mt.random.RandomState(0)
        self.X = rs.rand(n_rows, n_columns, chunk_size=chunk_size)
        self.y = rs.rand(n_rows, chunk_size=chunk_size)
        self.X_df = md.DataFrame(self.X)

        x_sparse = np.random.rand(n_rows, n_columns)
        # One randomly chosen column per row becomes NaN.
        nan_columns = np.random.randint(n_columns, size=n_rows)
        x_sparse[np.arange(n_rows), nan_columns] = np.nan
        chunked = mt.tensor(x_sparse, chunk_size=chunk_size)
        self.X_sparse = chunked.tosparse(missing=np.nan)

        self.session = new_session().as_default()
        self._old_executor = self.session._sess._executor

        test_executor = ExecutorForTest(
            'numpy', storage=self.session._sess._context)
        self.executor = test_executor
        self.session._sess._executor = test_executor
Ejemplo n.º 18
0
    def testSeriesQuantileExecution(self):
        """Compare ``Series.quantile`` with pandas for scalar, list and tensor ``q``."""
        raw = pd.Series(np.random.rand(10), name='a')
        a = Series(raw, chunk_size=3)

        def run(lazy):
            # Execute on the test-case executor and take the concatenated result.
            return self.executor.execute_dataframe(lazy, concat=True)[0]

        # q = 0.5, scalar
        result = run(a.quantile())
        expected = raw.quantile()
        self.assertEqual(result, expected)

        # q is a list
        q_list = [0.3, 0.7]
        result = run(a.quantile(q_list))
        expected = raw.quantile(q_list)
        pd.testing.assert_series_equal(result, expected)

        # test interpolation
        result = run(a.quantile([0.3, 0.7], interpolation='midpoint'))
        expected = raw.quantile([0.3, 0.7], interpolation='midpoint')
        pd.testing.assert_series_equal(result, expected)

        test_case = self

        class MockSession:
            def __init__(self):
                self.executor = test_case.executor

        ctx = LocalContext(MockSession())
        executor = ExecutorForTest('numpy', storage=ctx)
        with ctx:
            q = tensor([0.3, 0.7])

            # q is a tensor
            lazy = a.quantile(q)
            result = executor.execute_dataframes([lazy])[0]
            expected = raw.quantile([0.3, 0.7])

            pd.testing.assert_series_equal(result, expected)
Ejemplo n.º 19
0
    def testAppendExecution(self):
        """Compare Mars ``append`` on DataFrames and Series with pandas ``append``.

        Plain appends run on ``self.executor``; the ``ignore_index=True`` and
        dict-append cases run on a local executor whose storage is the
        context of a freshly created session.

        NOTE(review): pandas removed ``DataFrame.append``/``Series.append`` in
        2.0, so the expected values here need an older pandas -- confirm the
        supported pandas range.
        """
        executor = ExecutorForTest(storage=new_session().context)

        df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        df2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

        # both frames chunked identically
        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=3)

        adf = mdf1.append(mdf2)
        expected = df1.append(df2)
        result = self.executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        adf = mdf1.append(mdf2, ignore_index=True)
        expected = df1.append(df2, ignore_index=True)
        result = executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # frames chunked with different chunk sizes
        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=2)

        adf = mdf1.append(mdf2)
        expected = df1.append(df2)
        result = self.executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        adf = mdf1.append(mdf2, ignore_index=True)
        expected = df1.append(df2, ignore_index=True)
        result = executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # appending a list of frames
        df3 = pd.DataFrame(np.random.rand(8, 4), columns=list('ABCD'))
        mdf3 = from_pandas(df3, chunk_size=3)
        expected = df1.append([df2, df3])
        adf = mdf1.append([mdf2, mdf3])
        result = self.executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # appending a single row given as a dict
        adf = mdf1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
        expected = df1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
        result = executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # test for series
        series1 = pd.Series(np.random.rand(10,))
        series2 = pd.Series(np.random.rand(10,))

        # both series chunked identically
        mseries1 = series_from_pandas(series1, chunk_size=3)
        mseries2 = series_from_pandas(series2, chunk_size=3)

        aseries = mseries1.append(mseries2)
        expected = series1.append(series2)
        result = self.executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        aseries = mseries1.append(mseries2, ignore_index=True)
        expected = series1.append(series2, ignore_index=True)
        result = executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        # series chunked with different chunk sizes
        mseries1 = series_from_pandas(series1, chunk_size=3)
        mseries2 = series_from_pandas(series2, chunk_size=2)

        aseries = mseries1.append(mseries2)
        expected = series1.append(series2)
        result = self.executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        aseries = mseries1.append(mseries2, ignore_index=True)
        expected = series1.append(series2, ignore_index=True)
        result = executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        # appending a list of series
        series3 = pd.Series(np.random.rand(4,))
        mseries3 = series_from_pandas(series3, chunk_size=2)
        expected = series1.append([series2, series3])
        aseries = mseries1.append([mseries2, mseries3])
        result = self.executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)
Ejemplo n.º 20
0
 def setUp(self):
     """Run the parent ``setUp`` and attach an ``ExecutorForTest`` built with no arguments."""
     super().setUp()
     test_executor = ExecutorForTest()
     self.executor = test_executor
Ejemplo n.º 21
0
    def testSortValuesExecution(self):
        """Compare Mars ``sort_values`` on DataFrames and Series with pandas.

        The whole scenario is repeated for each value written to the
        ``PSRS_DISTINCT_COL`` environment variable: only ``'0'`` on Windows,
        ``'0'`` and ``'1'`` elsewhere.

        NOTE(review): the environment variable is set here but never restored
        inside this method -- confirm cleanup happens elsewhere.
        """
        distinct_opts = ['0'] if sys.platform.lower().startswith('win') else [
            '0', '1'
        ]
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct
            df = pd.DataFrame(np.random.rand(100, 10),
                              columns=['a' + str(i) for i in range(10)])

            # test one chunk
            mdf = DataFrame(df)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a6', 'a7'], ascending=False),
                                                     concat=True)[0]
            expected = df.sort_values(['a6', 'a7'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            # test psrs
            mdf = DataFrame(df, chunk_size=10)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a3', 'a4']),
                                                     concat=True)[0]
            expected = df.sort_values(['a3', 'a4'])

            pd.testing.assert_frame_equal(result, expected)

            # test ascending=False
            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a0', 'a1'], ascending=False),
                                                     concat=True)[0]
            expected = df.sort_values(['a0', 'a1'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a7'], ascending=False),
                                                     concat=True)[0]
            expected = df.sort_values(['a7'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            # test multiindex
            df2 = df.copy(deep=True)
            df2.columns = pd.MultiIndex.from_product(
                [list('AB'), list('CDEFG')])
            mdf = DataFrame(df2, chunk_size=10)

            result = self.executor.execute_dataframe(mdf.sort_values([('A',
                                                                       'C')]),
                                                     concat=True)[0]
            expected = df2.sort_values([('A', 'C')])

            pd.testing.assert_frame_equal(result, expected)

            # test rechunk
            mdf = DataFrame(df, chunk_size=3)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a3', 'a4']),
                                                     concat=True)[0]
            expected = df.sort_values(['a3', 'a4'])

            pd.testing.assert_frame_equal(result, expected)

            # test other types
            raw = pd.DataFrame(
                {
                    'a': np.random.rand(10),
                    'b': np.random.randint(1000, size=10),
                    'c': np.random.rand(10),
                    'd': [np.random.bytes(10) for _ in range(10)],
                    'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                    'f': [pd.Timedelta(f'{i} days') for i in range(10)]
                }, )
            mdf = DataFrame(raw, chunk_size=3)

            # sort by every column in turn
            for label in raw.columns:
                result = self.executor.execute_dataframe(
                    mdf.sort_values(label), concat=True)[0]
                expected = raw.sort_values(label)
                pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a', 'b', 'e'], ascending=False),
                                                     concat=True)[0]
            expected = raw.sort_values(['a', 'b', 'e'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            # test nan
            df = pd.DataFrame({
                'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
                'col2': [2, 1, 9, np.nan, 7, 4],
                'col3': [0, 1, 9, 4, 2, 3],
            })
            mdf = DataFrame(df)
            result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                                     concat=True)[0]
            expected = df.sort_values(['col2'])

            pd.testing.assert_frame_equal(result, expected)

            mdf = DataFrame(df, chunk_size=3)
            result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                                     concat=True)[0]
            expected = df.sort_values(['col2'])

            pd.testing.assert_frame_equal(result, expected)

            # test ignore_index
            # This case runs on a separate executor whose storage is the
            # context of a freshly created session.
            executor = ExecutorForTest(storage=new_session().context)

            df = pd.DataFrame(np.random.rand(10, 3),
                              columns=['a' + str(i) for i in range(3)])

            mdf = DataFrame(df, chunk_size=3)
            result = executor.execute_dataframe(mdf.sort_values(
                ['a0', 'a1'], ignore_index=True),
                                                concat=True)[0]
            # Fallback for when pandas' sort_values rejects ignore_index with
            # a TypeError (originally noted as "for python3.5"); the index is
            # then reset by hand.
            # NOTE(review): the f-strings above already require Python 3.6+,
            # so the python3.5 remark looks stale -- confirm.
            try:  # for python3.5
                expected = df.sort_values(['a0', 'a1'], ignore_index=True)
            except TypeError:
                expected = df.sort_values(['a0', 'a1'])
                expected.index = pd.RangeIndex(len(expected))

            pd.testing.assert_frame_equal(result, expected)

            # test inplace
            mdf = DataFrame(df)
            mdf.sort_values('a0', inplace=True)
            result = self.executor.execute_dataframe(mdf, concat=True)[0]
            df.sort_values('a0', inplace=True)

            pd.testing.assert_frame_equal(result, df)

            # test unknown shape
            df = pd.DataFrame({
                'a': list(range(10)),
                'b': np.random.random(10)
            })
            mdf = DataFrame(df, chunk_size=4)
            # The sort input comes from a boolean filter on the frame.
            filtered = mdf[mdf['a'] > 2]
            result = self.executor.execute_dataframe(
                filtered.sort_values(by='b'), concat=True)[0]

            pd.testing.assert_frame_equal(result,
                                          df[df['a'] > 2].sort_values(by='b'))

            # test Series.sort_values
            raw = pd.Series(np.random.rand(10))
            series = Series(raw)
            result = self.executor.execute_dataframe(series.sort_values(),
                                                     concat=True)[0]
            expected = raw.sort_values()

            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=3)
            result = self.executor.execute_dataframe(series.sort_values(),
                                                     concat=True)[0]
            expected = raw.sort_values()

            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=2)
            result = self.executor.execute_dataframe(
                series.sort_values(ascending=False), concat=True)[0]
            expected = raw.sort_values(ascending=False)

            pd.testing.assert_series_equal(result, expected)
Ejemplo n.º 22
0
 def setUp(self) -> None:
     """Run the parent ``setUp`` and attach an ``ExecutorForTest`` built with ``'numpy'``."""
     super().setUp()
     test_executor = ExecutorForTest('numpy')
     self.executor = test_executor
Ejemplo n.º 23
0
    def testSortIndexExecution(self):
        """Compare Mars ``sort_index`` on DataFrames and Series with pandas.

        Covers row-wise sorting (axis 0), column-wise sorting (axis 1),
        ``ascending=False``, ``inplace=True`` and ``ignore_index=True``. The
        ``ignore_index`` cases run on a local executor whose storage is the
        context of a freshly created session.
        """
        raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

        # single chunk
        mdf = DataFrame(raw)
        result = self.executor.execute_dataframe(mdf.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        # in-place sort
        mdf = DataFrame(raw)
        mdf.sort_index(inplace=True)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        # multiple chunks
        mdf = DataFrame(raw, chunk_size=30)
        result = self.executor.execute_dataframe(mdf.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=20)
        result = self.executor.execute_dataframe(
            mdf.sort_index(ascending=False), concat=True)[0]
        expected = raw.sort_index(ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        executor = ExecutorForTest(storage=new_session().context)

        mdf = DataFrame(raw, chunk_size=10)
        result = executor.execute_dataframe(mdf.sort_index(ignore_index=True),
                                            concat=True)[0]
        # Fallback for pandas versions whose sort_index rejects ignore_index
        # with a TypeError: reset the row labels by hand.
        try:
            expected = raw.sort_index(ignore_index=True)
        except TypeError:
            expected = raw.sort_index()
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test axis=1
        raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))

        mdf = DataFrame(raw)
        result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                                 concat=True)[0]
        expected = raw.sort_index(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                                 concat=True)[0]
        expected = raw.sort_index(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=4)
        result = self.executor.execute_dataframe(mdf.sort_index(
            axis=1, ascending=False),
                                                 concat=True)[0]
        expected = raw.sort_index(axis=1, ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=4)
        executor = ExecutorForTest(storage=new_session().context)

        result = executor.execute_dataframe(mdf.sort_index(axis=1,
                                                           ignore_index=True),
                                            concat=True)[0]
        try:
            expected = raw.sort_index(axis=1, ignore_index=True)
        except TypeError:
            expected = raw.sort_index(axis=1)
            # With axis=1 the relabelled axis is the columns, so the fallback
            # must reset the column labels rather than the row index.
            expected.columns = pd.RangeIndex(expected.shape[1])
        pd.testing.assert_frame_equal(result, expected)

        # test series
        raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

        series = Series(raw)
        result = self.executor.execute_dataframe(series.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=2)
        result = self.executor.execute_dataframe(series.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=3)
        result = self.executor.execute_dataframe(
            series.sort_index(ascending=False), concat=True)[0]
        expected = raw.sort_index(ascending=False)
        pd.testing.assert_series_equal(result, expected)
Ejemplo n.º 24
0
 def setUp(self):
     """Prepare a numpy-backed executor and a small global chunk size.

     The chunk size found in ``options`` beforehand is remembered on
     ``self.old_chunk`` (presumably so a tearDown can put it back; that
     code is not visible here).
     """
     self.executor = ExecutorForTest('numpy')
     # The right-hand side is evaluated first, so the previous value is
     # captured before the option is overwritten with 10.
     self.old_chunk, options.chunk_size = options.chunk_size, 10
Ejemplo n.º 25
0
 def setUp(self) -> None:
     """Open a default session and swap in a numpy ``ExecutorForTest``.

     The session's original executor is kept on ``self._old_executor``;
     the replacement uses the session's ``_context`` as its storage and
     is exposed both on the test case and on the session itself.
     """
     self.session = new_session().as_default()
     self._old_executor = self.session._sess._executor
     replacement = ExecutorForTest(
         'numpy', storage=self.session._sess._context)
     self.executor = replacement
     self.session._sess._executor = replacement
Ejemplo n.º 26
0
    def testStoreHDF5Execution(self):
        """Store a tensor with ``tohdf5`` using filename, file and dataset targets.

        After every successful store the dataset is read back with h5py and
        compared against the source array.  Calls with an unsupported target
        or without group/dataset names are expected to raise.
        """
        raw = np.random.RandomState(0).rand(10, 20)

        group_name = 'test_group'
        dataset_name = 'test_dataset'
        dataset_path = '{}/{}'.format(group_name, dataset_name)

        single_chunk = tensor(raw, chunk_size=20)
        multi_chunk = tensor(raw, chunk_size=9)

        # an arbitrary object is not an accepted store target
        with self.assertRaises(TypeError):
            tohdf5(object(), multi_chunk)

        test_case = self

        class MockSession:
            def __init__(self):
                self.executor = test_case.executor

        ctx = LocalContext(MockSession())
        executor = ExecutorForTest('numpy', storage=ctx)

        def assert_stored(path):
            # read the dataset back and compare it with the source array
            with h5py.File(path, 'r') as stored_file:
                stored = np.asarray(stored_file[dataset_path])
                np.testing.assert_array_equal(stored, raw)

        with ctx, tempfile.TemporaryDirectory() as d:
            filename = os.path.join(
                d, 'test_store_{}.hdf5'.format(int(time.time())))

            # target given as a filename, tensor with a single chunk
            r = tohdf5(filename, single_chunk,
                       group=group_name, dataset=dataset_name)
            executor.execute_tensor(r)
            assert_stored(filename)

            # target given as a filename, tensor with several chunks
            r = tohdf5(filename, multi_chunk,
                       group=group_name, dataset=dataset_name)
            executor.execute_tensor(r)

            tiled = get_tiled(r)
            second_input = tiled.chunks[0].inputs[1]
            self.assertEqual(type(second_input.op).__name__,
                             'SuccessorsExclusive')
            self.assertEqual(len(second_input.inputs), 0)

            assert_stored(filename)

            # a filename target without group/dataset must be rejected
            with self.assertRaises(ValueError):
                tohdf5(filename, multi_chunk)

            # target given as an open h5py file; executed after it is closed
            with h5py.File(filename, 'r') as h5_file:
                r = tohdf5(h5_file, multi_chunk,
                           group=group_name, dataset=dataset_name)
            executor.execute_tensor(r)
            assert_stored(filename)

            # a file target without group/dataset must be rejected as well
            with self.assertRaises(ValueError):
                with h5py.File(filename, 'r') as h5_file:
                    tohdf5(h5_file, multi_chunk)

            # target given as an h5py dataset object
            with h5py.File(filename, 'r') as h5_file:
                ds = h5_file[dataset_path]
                r = tohdf5(ds, multi_chunk)
            executor.execute_tensor(r)
            assert_stored(filename)
Ejemplo n.º 27
0
    def testTensordotExecution(self):
        """Run tensordot / dot / inner on chunked tensors and compare with numpy.

        A mock execution is performed first and both entries of every
        returned size pair are checked to sum up to the result's ``nbytes``.
        """
        mock_executor = ExecutorForTest(
            sync_provider_type=ExecutorForTest.SyncProviderType.MOCK)

        def check_row_halves(chunks, reference):
            # two result chunks are expected, split at row 500
            self.assertEqual(len(chunks), 2)
            self.assertTrue(np.array_equal(chunks[0], reference[:500, :]))
            self.assertTrue(np.array_equal(chunks[1], reference[500:, :]))

        # --- tensordot over two axes, small integer inputs ---
        lhs_raw = np.arange(60).reshape(3, 4, 5)
        rhs_raw = np.arange(24).reshape(4, 3, 2)
        lhs = tensor(lhs_raw, chunk_size=2)
        rhs = tensor(rhs_raw, chunk_size=2)

        axes = ([1, 0], [0, 1])
        c = tensordot(lhs, rhs, axes=axes)
        estimated = mock_executor.execute_tensor(c, mock=True)
        for position in (0, 1):
            self.assertEqual(
                sum(size[position] for size in estimated), c.nbytes)

        res = self.executor.execute_tensor(c)
        expected = np.tensordot(lhs_raw, rhs_raw, axes=axes)
        row_ranges = (slice(None, 2), slice(2, 4), slice(4, None))
        for index, rows in enumerate(row_ranges):
            self.assertTrue(np.array_equal(res[index], expected[rows, :]))

        # --- dot() of two matrices of ones ---
        lhs = ones((1000, 2000), chunk_size=500)
        rhs = ones((2000, 100), chunk_size=500)
        res = self.executor.execute_tensor(dot(lhs, rhs))
        expected = np.dot(np.ones((1000, 2000)), np.ones((2000, 100)))
        check_row_halves(res, expected)

        # --- Tensor.dot with many small chunks ---
        lhs = ones((10, 8), chunk_size=2)
        rhs = ones((8, 10), chunk_size=2)
        res = self.executor.execute_tensor(lhs.dot(rhs))
        self.assertEqual(len(res), 25)
        block_of_eights = np.tile([8], [2, 2])
        for chunk in res:
            self.assertTrue(np.array_equal(chunk, block_of_eights))

        # --- Tensor.dot where each operand is one chunk ---
        lhs = ones((500, 500), chunk_size=500)
        rhs = ones((500, 100), chunk_size=500)
        res = self.executor.execute_tensor(lhs.dot(rhs))
        self.assertTrue(np.array_equal(res[0], np.tile([500], [500, 100])))

        # --- tensordot on random 3-d inputs with unequal chunk sizes ---
        raw_a = np.random.random((100, 200, 50))
        raw_b = np.random.random((200, 10, 100))
        c = tensordot(tensor(raw_a, chunk_size=50),
                      tensor(raw_b, chunk_size=33),
                      axes=((0, 1), (2, 0)))
        res = self.executor.execute_tensor(c, concat=True)
        expected = np.tensordot(raw_a, raw_b, axes=(c.op.a_axes, c.op.b_axes))
        self.assertTrue(np.allclose(res[0], expected))

        # --- inner() of two matrices of ones ---
        lhs = ones((1000, 2000), chunk_size=500)
        rhs = ones((100, 2000), chunk_size=500)
        res = self.executor.execute_tensor(inner(lhs, rhs))
        expected = np.inner(np.ones((1000, 2000)), np.ones((100, 2000)))
        check_row_halves(res, expected)

        # --- Tensor.dot with chunk size 30 on 100x100 inputs, concatenated ---
        lhs = ones((100, 100), chunk_size=30)
        rhs = ones((100, 100), chunk_size=30)
        merged = self.executor.execute_tensor(lhs.dot(rhs), concat=True)[0]
        np.testing.assert_array_equal(merged, np.ones((100, 100)) * 100)
Ejemplo n.º 28
0
 def setUp(self) -> None:
     """Build a numpy test executor, wrap it in a test context and enter it.

     ``_create_test_context`` receives the freshly created executor and
     returns the context together with the executor to use from then on.
     """
     super().setUp()
     self.executor = ExecutorForTest('numpy')
     context, executor = self._create_test_context(self.executor)
     self.ctx = context
     self.executor = executor
     # Entered manually; the matching __exit__ is not visible in this snippet.
     self.ctx.__enter__()
Ejemplo n.º 29
0
 def setUp(self):
     """Create the numpy-backed ``ExecutorForTest`` used by the tests."""
     numpy_executor = ExecutorForTest('numpy')
     self.executor = numpy_executor
Ejemplo n.º 30
0
    def testOptimizedHeadTail(self):
        """Check ``head``/``tail`` on DataFrames built from ``read_csv`` and ``read_sql``.

        Results are compared against the same operation on the pandas
        DataFrame that was written to the CSV file / SQLite table.
        """
        import sqlalchemy as sa

        with tempfile.TemporaryDirectory() as tempdir:
            # separate executor that shares storage with the fixture executor
            executor = ExecutorForTest(storage=self.executor.storage)

            filename = os.path.join(tempdir, 'test_head.csv')
            rs = np.random.RandomState(0)
            pd_df = pd.DataFrame({
                'a':
                rs.randint(1000, size=(100, )).astype(np.int64),
                'b':
                rs.randint(1000, size=(100, )).astype(np.int64),
                'c': ['sss' for _ in range(100)],
                'd': ['eeee' for _ in range(100)]
            })
            pd_df.to_csv(filename, index=False)

            # a third of the file size per chunk, so the CSV is read in
            # several chunks
            size = os.path.getsize(filename)
            chunk_bytes = size / 3

            df = md.read_csv(filename, chunk_bytes=chunk_bytes)

            # test DataFrame.head
            r = df.head(3)

            # NOTE(review): presumably the helper verifies that the CSV data
            # source is executed with a limit of 3 rows -- confirm against
            # _inject_execute_data_source
            with self._inject_execute_data_source(3, DataFrameReadCSV):
                result = executor.execute_tileables([r])[0]
                expected = pd_df.head(3)
                pd.testing.assert_frame_equal(result, expected)

            # test DataFrame.tail
            r = df.tail(3)

            result = executor.execute_tileables([r])[0]
            expected = pd_df.tail(3)
            # indexes are dropped on both sides, only the values are compared
            pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                          expected.reset_index(drop=True))

            # test head more than 1 chunk
            r = df.head(99)

            result = executor.execute_tileables([r])[0]
            result.reset_index(drop=True, inplace=True)
            expected = pd_df.head(99)
            pd.testing.assert_frame_equal(result, expected)

            # test DataFrame.tail more than 1 chunk
            r = df.tail(99)

            result = executor.execute_tileables([r])[0]
            expected = pd_df.tail(99)
            pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                          expected.reset_index(drop=True))

            # same data again, this time served from a SQLite table
            filename = os.path.join(tempdir, 'test_sql.db')
            conn = sa.create_engine('sqlite:///' + filename)
            pd_df.to_sql('test_sql', conn)

            df = md.read_sql('test_sql',
                             conn,
                             index_col='index',
                             chunk_size=20)

            # test DataFrame.head
            r = df.head(3)

            with self._inject_execute_data_source(3, DataFrameReadSQL):
                result = executor.execute_tileables([r])[0]
                # the index read back from SQL is named after the 'index'
                # column; clear the name so it matches the unnamed pandas index
                result.index.name = None
                expected = pd_df.head(3)
                pd.testing.assert_frame_equal(result, expected)