Example #1
0
    def testSeriesQuantileExecution(self):
        """Compare ``Series.quantile`` with pandas for scalar, list and tensor ``q``."""
        pd_series = pd.Series(np.random.rand(10), name='a')
        mars_series = Series(pd_series, chunk_size=3)

        def run(tileable):
            # Execute on the shared test executor and return the merged result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # default q (0.5) yields a scalar
        actual = run(mars_series.quantile())
        self.assertEqual(actual, pd_series.quantile())

        # q given as a list: first with the default interpolation,
        # then with an explicit one
        for extra in ({}, {'interpolation': 'midpoint'}):
            actual = run(mars_series.quantile([0.3, 0.7], **extra))
            wanted = pd_series.quantile([0.3, 0.7], **extra)
            pd.testing.assert_series_equal(actual, wanted)

        outer = self

        class MockSession:
            def __init__(self):
                self.executor = outer.executor

        context = LocalContext(MockSession())
        ctx_executor = ExecutorForTest('numpy', storage=context)
        with context:
            # q given as a tensor
            q_tensor = tensor([0.3, 0.7])
            quantiles = mars_series.quantile(q_tensor)
            actual = ctx_executor.execute_dataframes([quantiles])[0]
            wanted = pd_series.quantile([0.3, 0.7])

            pd.testing.assert_series_equal(actual, wanted)
Example #2
0
    def testDataFrameQuantileExecution(self):
        """Compare ``DataFrame.quantile`` with pandas across axes and ``q`` forms."""
        pdf = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [np.random.bytes(10) for _ in range(10)],
                'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
                'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
            },
            index=pd.RangeIndex(1, 11))
        mdf = DataFrame(pdf, chunk_size=3)

        def run(tileable):
            # Execute on the shared test executor and return the merged result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # default q (0.5) along axis 0 -> series
        actual = run(mdf.quantile())
        pd.testing.assert_series_equal(actual, pdf.quantile())

        # default q (0.5) along axis 1 -> series
        actual = run(mdf.quantile(axis=1))
        pd.testing.assert_series_equal(actual, pdf.quantile(axis=1))

        # list q along axis 0 -> dataframe
        actual = run(mdf.quantile([0.3, 0.7]))
        pd.testing.assert_frame_equal(actual, pdf.quantile([0.3, 0.7]))

        # list q along axis 1 -> dataframe
        actual = run(mdf.quantile([0.3, 0.7], axis=1))
        pd.testing.assert_frame_equal(actual,
                                      pdf.quantile([0.3, 0.7], axis=1))

        # explicit interpolation
        actual = run(mdf.quantile([0.3, 0.7], interpolation='midpoint'))
        wanted = pdf.quantile([0.3, 0.7], interpolation='midpoint')
        pd.testing.assert_frame_equal(actual, wanted)

        outer = self

        class MockSession:
            def __init__(self):
                self.executor = outer.executor

        context = LocalContext(MockSession())
        ctx_executor = ExecutorForTest('numpy', storage=context)
        with context:
            # q given as a tensor
            q_tensor = tensor([0.3, 0.7])
            quantiles = mdf.quantile(q_tensor)
            actual = ctx_executor.execute_dataframes([quantiles])[0]
            wanted = pdf.quantile([0.3, 0.7])

            pd.testing.assert_frame_equal(actual, wanted)

        # numeric_only=False on a frame that mixes numbers and timestamps
        pdf_mixed = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
            },
            index=pd.RangeIndex(1, 11))
        mdf_mixed = DataFrame(pdf_mixed, chunk_size=3)

        actual = run(mdf_mixed.quantile([0.3, 0.7], numeric_only=False))
        wanted = pdf_mixed.quantile([0.3, 0.7], numeric_only=False)
        pd.testing.assert_frame_equal(actual, wanted)

        actual = run(mdf_mixed.quantile(numeric_only=False))
        wanted = pdf_mixed.quantile(numeric_only=False)
        pd.testing.assert_series_equal(actual, wanted)
    def testCutExecution(self):
        """Compare ``cut`` with ``pd.cut`` for series, tensor and raw-array inputs.

        Covers explicit bin edges, interval-index bins, integer bin counts,
        list and tensor labels, ``labels=False``, ``retbins``, ``right``,
        ``include_lowest`` and ``duplicates='drop'``, plus the error raised
        when integer bins are requested for data containing infinity.
        """
        rs = np.random.RandomState(0)
        raw = rs.random(15) * 1000
        s = pd.Series(raw, index=['i{}'.format(i) for i in range(15)])
        bins = [10, 100, 500]
        ii = pd.interval_range(10, 500, 3)
        labels = ['a', 'b']

        t = tensor(raw, chunk_size=4)
        series = from_pandas_series(s, chunk_size=4)
        iii = from_pandas_index(ii, chunk_size=2)

        # cut on Series
        r = cut(series, bins)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(result, pd.cut(s, bins))

        r, b = cut(series, bins, retbins=True)
        r_result = self.executor.execute_dataframe(r, concat=True)[0]
        b_result = self.executor.execute_tensor(b, concat=True)[0]
        r_expected, b_expected = pd.cut(s, bins, retbins=True)
        pd.testing.assert_series_equal(r_result, r_expected)
        np.testing.assert_array_equal(b_result, b_expected)

        # cut on tensor
        r = cut(t, bins)
        # result and expected are arrays whose dtype is CategoricalDtype,
        # so they are compared element by element
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.cut(raw, bins)
        self.assertEqual(len(result), len(expected))
        for actual, wanted in zip(result, expected):
            np.testing.assert_equal(actual, wanted)

        # one chunk
        r = cut(s,
                tensor(bins, chunk_size=2),
                right=False,
                include_lowest=True)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(
            result, pd.cut(s, bins, right=False, include_lowest=True))

        # test labels
        r = cut(t, bins, labels=labels)
        # result and expected are arrays whose dtype is CategoricalDtype
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.cut(raw, bins, labels=labels)
        self.assertEqual(len(result), len(expected))
        for actual, wanted in zip(result, expected):
            np.testing.assert_equal(actual, wanted)

        # labels=False: the result is executed as a tensor and compared
        # as a plain array
        r = cut(t, bins, labels=False)
        result = self.executor.execute_tensor(r, concat=True)[0]
        expected = pd.cut(raw, bins, labels=False)
        np.testing.assert_array_equal(result, expected)

        # test labels which is tensor
        labels_t = tensor(['a', 'b'], chunk_size=1)
        r = cut(raw, bins, labels=labels_t, include_lowest=True)
        # result and expected are arrays whose dtype is CategoricalDtype
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.cut(raw, bins, labels=labels, include_lowest=True)
        self.assertEqual(len(result), len(expected))
        for actual, wanted in zip(result, expected):
            np.testing.assert_equal(actual, wanted)

        # test labels=False with interval-index bins
        r, b = cut(raw, ii, labels=False, retbins=True)
        r_result = self.executor.execute_tileable(r, concat=True)[0]
        b_result = self.executor.execute_tileable(b, concat=True)[0]
        r_expected, b_expected = pd.cut(raw, ii, labels=False, retbins=True)
        for actual, wanted in zip(r_result, r_expected):
            np.testing.assert_equal(actual, wanted)
        pd.testing.assert_index_equal(b_result, b_expected)

        # test bins which is md.IntervalIndex
        r, b = cut(series,
                   iii,
                   labels=tensor(labels, chunk_size=1),
                   retbins=True)
        r_result = self.executor.execute_dataframe(r, concat=True)[0]
        b_result = self.executor.execute_dataframe(b, concat=True)[0]
        r_expected, b_expected = pd.cut(s, ii, labels=labels, retbins=True)
        pd.testing.assert_series_equal(r_result, r_expected)
        pd.testing.assert_index_equal(b_result, b_expected)

        # test duplicates
        bins2 = [0, 2, 4, 6, 10, 10]
        r, b = cut(s,
                   bins2,
                   labels=False,
                   retbins=True,
                   right=False,
                   duplicates='drop')
        r_result = self.executor.execute_dataframe(r, concat=True)[0]
        b_result = self.executor.execute_tensor(b, concat=True)[0]
        r_expected, b_expected = pd.cut(s,
                                        bins2,
                                        labels=False,
                                        retbins=True,
                                        right=False,
                                        duplicates='drop')
        pd.testing.assert_series_equal(r_result, r_expected)
        np.testing.assert_array_equal(b_result, b_expected)

        this = self

        class MockSession:
            def __init__(self):
                self.executor = this.executor

        ctx = LocalContext(MockSession())
        executor = ExecutorForTest('numpy', storage=ctx)
        with ctx:
            # test integer bins
            r = cut(series, 3)
            result = executor.execute_dataframes([r])[0]
            pd.testing.assert_series_equal(result, pd.cut(s, 3))

            r, b = cut(series, 3, right=False, retbins=True)
            r_result, b_result = executor.execute_dataframes([r, b])
            r_expected, b_expected = pd.cut(s, 3, right=False, retbins=True)
            pd.testing.assert_series_equal(r_result, r_expected)
            np.testing.assert_array_equal(b_result, b_expected)

            # test min max same
            s2 = pd.Series([1.1] * 15)
            r = cut(s2, 3)
            result = executor.execute_dataframes([r])[0]
            pd.testing.assert_series_equal(result, pd.cut(s2, 3))

            # test inf exist
            s3 = s2.copy()
            # Positional assignment: on the default integer index, ``s3[-1]``
            # would be treated as the label -1 and append a new row instead
            # of replacing the last element.
            s3.iloc[-1] = np.inf
            with self.assertRaises(ValueError):
                executor.execute_dataframes([cut(s3, 3)])
class Test(TestBase):
    """Execution tests for DataFrame/Series writers (CSV, SQL, vineyard, parquet)."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testToCSVExecution(self):
        """Write a frame and a series to one file and to per-chunk files, then read back."""
        index = pd.RangeIndex(100, 0, -1, name='index')
        raw = pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100)
            },
            index=index)
        df = DataFrame(raw, chunk_size=33)

        with tempfile.TemporaryDirectory() as base_path:
            # DATAFRAME TESTS
            # test one file with dataframe
            path = os.path.join(base_path, 'out.csv')

            r = df.to_csv(path)
            self.executor.execute_dataframe(r)

            result = pd.read_csv(path, dtype=raw.dtypes.to_dict())
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw)

            # test multi files with dataframe
            path = os.path.join(base_path, 'out-*.csv')
            r = df.to_csv(path)
            self.executor.execute_dataframe(r)

            # the test reads back four files, out-0.csv .. out-3.csv
            dfs = [
                pd.read_csv(os.path.join(base_path, f'out-{i}.csv'),
                            dtype=raw.dtypes.to_dict()) for i in range(4)
            ]
            result = pd.concat(dfs, axis=0)
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw)
            pd.testing.assert_frame_equal(dfs[1].set_index('index'),
                                          raw.iloc[33:66])

            # SERIES TESTS
            series = md.Series(raw.col1, chunk_size=33)

            # test one file with series
            path = os.path.join(base_path, 'out.csv')
            r = series.to_csv(path)
            self.executor.execute_dataframe(r)

            result = pd.read_csv(path, dtype=raw.dtypes.to_dict())
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw.col1.to_frame())

            # test multi files with series
            path = os.path.join(base_path, 'out-*.csv')
            r = series.to_csv(path)
            self.executor.execute_dataframe(r)

            dfs = [
                pd.read_csv(os.path.join(base_path, f'out-{i}.csv'),
                            dtype=raw.dtypes.to_dict()) for i in range(4)
            ]
            result = pd.concat(dfs, axis=0)
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw.col1.to_frame())
            pd.testing.assert_frame_equal(dfs[1].set_index('index'),
                                          raw.col1.to_frame().iloc[33:66])

    @unittest.skipIf(sqlalchemy is None, 'sqlalchemy not installed')
    def testToSQL(self):
        """Write a frame and a series to SQLite and compare with what pandas reads back."""
        index = pd.RangeIndex(100, 0, -1, name='index')
        raw = pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100).astype('int64'),
            },
            index=index)

        with tempfile.TemporaryDirectory() as d:
            table_name1 = 'test_table'
            table_name2 = 'test_table2'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')

            engine = sqlalchemy.create_engine(uri)
            try:
                # test write dataframe
                df = DataFrame(raw, chunk_size=33)
                r = df.to_sql(table_name1, con=engine)
                self.executor.execute_dataframe(r)

                # rows are sorted descending to match raw's 100..1 index
                written = pd.read_sql(table_name1, con=engine,
                                      index_col='index') \
                    .sort_index(ascending=False)
                pd.testing.assert_frame_equal(raw, written)

                # test write with existing table
                with self.assertRaises(ValueError):
                    df.to_sql(table_name1, con=uri).execute()

                # test write series
                series = md.Series(raw.col1, chunk_size=33)
                with engine.connect() as conn:
                    r = series.to_sql(table_name2, con=conn)
                    self.executor.execute_dataframe(r)

                written = pd.read_sql(table_name2, con=engine,
                                      index_col='index') \
                    .sort_index(ascending=False)
                pd.testing.assert_frame_equal(raw.col1.to_frame(), written)
            finally:
                # Close pooled connections before the temporary directory
                # (and the database file inside it) is removed.
                engine.dispose()

    @unittest.skipIf(vineyard is None, 'vineyard not installed')
    @mock.patch('webbrowser.open_new_tab', new=lambda *_, **__: True)
    def testToVineyard(self):
        """Round-trip a frame through vineyard under local, cluster and web sessions."""
        def testWithGivenSession(session):
            with option_context(
                {'vineyard.socket': '/tmp/vineyard/vineyard.sock'}):
                df1 = DataFrame(pd.DataFrame(np.arange(12).reshape(3, 4),
                                             columns=['a', 'b', 'c', 'd']),
                                chunk_size=2)
                object_id = df1.to_vineyard().execute(session=session).fetch()
                df2 = md.from_vineyard(object_id)

                df1_value = df1.execute(session=session).fetch()
                df2_value = df2.execute(session=session).fetch()
                # indexes are dropped on both sides before comparing
                pd.testing.assert_frame_equal(df1_value.reset_index(drop=True),
                                              df2_value.reset_index(drop=True))

        with new_session().as_default() as session:
            testWithGivenSession(session)

        with new_cluster(scheduler_n_process=2,
                         worker_n_process=2,
                         shared_memory='20M',
                         web=True) as cluster:
            with new_session(cluster.endpoint).as_default() as session:
                testWithGivenSession(session)

            with new_session(
                    'http://' +
                    cluster._web_endpoint).as_default() as web_session:
                testWithGivenSession(web_session)

    @unittest.skipIf(pa is None, 'pyarrow not installed')
    def testToParquetArrowExecution(self):
        """Write parquet files (plain and partitioned) and read them back."""
        raw = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.arange(100),
            'col3': np.random.choice(['a', 'b', 'c'], (100, )),
        })
        df = DataFrame(raw, chunk_size=33)

        with tempfile.TemporaryDirectory() as base_path:
            # DATAFRAME TESTS
            path = os.path.join(base_path, 'out-*.parquet')
            r = df.to_parquet(path)
            self.executor.execute_dataframe(r)

            read_df = md.read_parquet(path)
            result = self.executor.execute_dataframe(read_df, concat=True)[0]
            result = result.sort_index()
            pd.testing.assert_frame_equal(result, raw)

            # test read_parquet then to_parquet
            read_df = md.read_parquet(path)
            r = read_df.to_parquet(path)
            self.executor.execute_dataframes([r])

            # test partition_cols
            path = os.path.join(base_path, 'out-partitioned')
            r = df.to_parquet(path, partition_cols=['col3'])
            self.executor.execute_dataframe(r)

            read_df = md.read_parquet(path)
            result = self.executor.execute_dataframe(read_df, concat=True)[0]
            # cast the partition column back to object and sort both sides
            # by col1 before comparing
            result['col3'] = result['col3'].astype('object')
            pd.testing.assert_frame_equal(
                result.sort_values('col1').reset_index(drop=True),
                raw.sort_values('col1').reset_index(drop=True))

    @unittest.skipIf(fastparquet is None, 'fastparquet not installed')
    def testToParquetFastParquetExecution(self):
        """Write gzip-compressed parquet with the fastparquet engine (execution only)."""
        raw = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.arange(100),
            'col3': np.random.choice(['a', 'b', 'c'], (100, )),
        })
        df = DataFrame(raw, chunk_size=33)

        with tempfile.TemporaryDirectory() as base_path:
            # test fastparquet
            path = os.path.join(base_path, 'out-fastparquet-*.parquet')
            r = df.to_parquet(path, engine='fastparquet', compression='gzip')
            self.executor.execute_dataframe(r)
Example #5
0
    def testRollingAggExecution(self):
        """Compare rolling aggregations on DataFrame and Series with pandas."""
        pdf = pd.DataFrame({
            'a':
            np.random.randint(100, size=(10, )),
            'b':
            np.random.rand(10),
            'c':
            np.random.randint(100, size=(10, )),
            'd': ['c' * i for i in np.random.randint(4, size=10)]
        })
        pdf.iloc[1, ::4] = np.nan
        pd_col = pdf.iloc[:, 1]

        def run(tileable, **extra):
            # Execute on the shared test executor and return the merged result.
            return self.executor.execute_dataframe(tileable,
                                                   concat=True,
                                                   **extra)[0]

        single_chunk_df = md.DataFrame(pdf, chunk_size=10)
        # several chunks along each axis
        multi_chunk_df = md.DataFrame(pdf, chunk_size=3)
        frame_funcs = [
            'min', ['max', 'mean'], {'c': ['std'], 'b': ['count', 'min']}
        ]

        # single chunk, dict aggregation
        rolled = single_chunk_df.rolling(3).agg(frame_funcs[2])
        actual = run(rolled)
        wanted = pdf.rolling(3).agg(frame_funcs[2])
        pd.testing.assert_frame_equal(actual, wanted)

        for window in [2, 5]:
            for center in [True, False]:
                for func in frame_funcs:
                    rolled = multi_chunk_df.rolling(
                        window, center=center).agg(func)
                    actual = run(rolled)
                    wanted = pdf.rolling(window, center=center).agg(func)
                    pd.testing.assert_frame_equal(actual, wanted)

        # min_periods together with win_type
        rolled = multi_chunk_df.rolling(
            3, min_periods=1, win_type='triang').agg('sum')
        actual = run(rolled)
        wanted = pdf.rolling(3, min_periods=1, win_type='triang').agg('sum')
        pd.testing.assert_frame_equal(actual, wanted)

        # rolling getitem selecting a single column -> series
        rolled = multi_chunk_df.rolling(3)['b'].agg('sum')
        actual = run(rolled)
        wanted = pdf.rolling(3)['b'].agg('sum')
        pd.testing.assert_series_equal(actual, wanted)

        # rolling getitem selecting several columns -> dataframe
        rolled = multi_chunk_df.rolling(3)['c', 'b'].agg('sum')
        actual = run(rolled)
        wanted = pdf.rolling(3)['c', 'b'].agg('sum')
        pd.testing.assert_frame_equal(actual, wanted)

        # rolling along axis=1
        rolled = multi_chunk_df.rolling(3, axis=1).agg('sum')
        actual = run(rolled, check_nsplits=False)
        wanted = pdf.rolling(3, axis=1).agg('sum')
        pd.testing.assert_frame_equal(actual, wanted)

        # offset window on a datetime index
        pdf_dated = pdf.copy()
        pdf_dated.reset_index(inplace=True, drop=True)
        pdf_dated.index = pd.date_range('2020-2-25', periods=10)

        mdf_dated = md.DataFrame(pdf_dated, chunk_size=3)
        for func in frame_funcs:
            actual = run(mdf_dated.rolling('2d').agg(func))
            wanted = pdf_dated.rolling('2d').agg(func)
            pd.testing.assert_frame_equal(actual, wanted)

        mars_series_variants = [
            md.Series(pd_col, chunk_size=10),
            md.Series(pd_col, chunk_size=4)
        ]

        series_funcs = ['min', ['max', 'mean'], {'c': 'std', 'b': 'count'}]

        for mars_series in mars_series_variants:
            for window in [2, 3, 5]:
                for center in [True, False]:
                    for func in series_funcs:
                        rolled = mars_series.rolling(
                            window, center=center).agg(func)
                        actual = run(rolled)
                        wanted = pd_col.rolling(window,
                                                center=center).agg(func)
                        # the pandas result type decides which comparison
                        # helper applies
                        if isinstance(wanted, pd.Series):
                            check = pd.testing.assert_series_equal
                        else:
                            check = pd.testing.assert_frame_equal
                        check(actual, wanted)

        outer = self

        class MockSession:
            def __init__(self):
                self.executor = outer.executor

        context = LocalContext(MockSession())
        ctx_executor = ExecutorForTest('numpy', storage=context)
        with context:
            # rolling over a boolean-filtered frame
            filtered_df = md.DataFrame(pdf, chunk_size=3)
            filtered_df = filtered_df[filtered_df.a > 0.5]
            rolled = filtered_df.rolling(3).agg('max')

            actual = ctx_executor.execute_dataframes([rolled])[0]
            wanted = pdf[pdf.a > 0.5].rolling(3).agg('max')
            pd.testing.assert_frame_equal(actual, wanted)

            # rolling over a boolean-filtered series
            filtered_series = md.Series(pd_col, chunk_size=3)
            filtered_series = filtered_series[filtered_series > 0.5]
            rolled = filtered_series.rolling(3).agg('max')

            actual = ctx_executor.execute_dataframes([rolled])[0]
            wanted = pd_col[pd_col > 0.5].rolling(3).agg('max')
            pd.testing.assert_series_equal(actual, wanted)

        # named aggregation methods on the rolling object
        mdf = md.DataFrame(pdf, chunk_size=3)
        for name in [
                'count', 'sum', 'mean', 'median', 'min', 'max', 'skew', 'kurt'
        ]:
            actual = run(getattr(mdf.rolling(4), name)())
            wanted = getattr(pdf.rolling(4), name)()
            pd.testing.assert_frame_equal(actual, wanted)
        for name in ['std', 'var']:
            actual = run(getattr(mdf.rolling(4), name)(ddof=0))
            wanted = getattr(pdf.rolling(4), name)(ddof=0)
            pd.testing.assert_frame_equal(actual, wanted)
Example #6
0
class Test(TestBase):
    """Tests for column pruning of data-source operands and for fetch behaviour."""

    def setUp(self):
        # Run the base-class fixture setup before creating the executor.
        super().setUp()
        self.executor = ExecutorForTest()

    def testGroupByPruneReadCSV(self):
        """Check that groupby-agg narrows ``read_csv``'s ``usecols``."""
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            df = pd.DataFrame({
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce'),
                'd': list('abaaaddce')
            })
            df.to_csv(file_path, index=False)

            # Use test executor
            mdf = md.read_csv(file_path).groupby('c').agg({'a': 'sum'})
            result = self.executor.execute_dataframe(mdf)[0]
            expected = df.groupby('c').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(result, expected)

            mdf = md.read_csv(file_path).groupby('c').agg({'a': 'sum'})
            expected = df.groupby('c').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
            pd.testing.assert_frame_equal(mdf.fetch(), expected)

            # only the aggregated column and the group key should be read
            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.usecols, ['a', 'c'])

            mdf = md.read_csv(file_path).groupby('c').agg({'b': 'sum'})
            expected = df.groupby('c').agg({'b': 'sum'})
            pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
            pd.testing.assert_frame_equal(mdf.fetch(), expected)

            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.usecols, ['b', 'c'])

            mdf = md.read_csv(file_path).groupby('c').agg({'b': 'sum'}) + 1
            expected = df.groupby('c').agg({'b': 'sum'}) + 1
            pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
            pd.testing.assert_frame_equal(mdf.fetch(), expected)

            # an explicit usecols is narrowed further
            mdf = md.read_csv(file_path,
                              usecols=['a', 'b',
                                       'c']).groupby('c').agg({'b': 'sum'})
            expected = df.groupby('c').agg({'b': 'sum'})
            pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
            pd.testing.assert_frame_equal(mdf.fetch(), expected)
            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.usecols, ['b', 'c'])

            # two aggregations sharing one source
            in_df = md.read_csv(file_path)
            df1 = in_df.groupby('c').agg({'b': 'sum'})
            df2 = in_df.groupby('b').agg({'a': 'sum'})

            dfs = ExecutableTuple((df1, df2))
            results = dfs.execute().fetch()
            expected1 = df.groupby('c').agg({'b': 'sum'})
            expected2 = df.groupby('b').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(results[0], expected1)
            pd.testing.assert_frame_equal(results[1], expected2)

            # the source itself is also requested, so it must come back
            # with every column
            in_df = md.read_csv(file_path)
            df1 = in_df.groupby('c').agg({'b': 'sum'})

            dfs = ExecutableTuple((in_df, df1))
            results = dfs.execute().fetch()
            expected1 = df.groupby('c').agg({'b': 'sum'})
            pd.testing.assert_frame_equal(results[0], df)
            pd.testing.assert_frame_equal(results[1], expected1)

            # with the optimization switched off, usecols stays unset
            with option_context({'optimize_tileable_graph': False}):
                mdf = md.read_csv(file_path).groupby('c').agg({'b': 'sum'})
                expected = df.groupby('c').agg({'b': 'sum'})
                pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
                pd.testing.assert_frame_equal(mdf.fetch(), expected)

                tileable_graph = mdf.build_graph()
                self.assertIsNone(
                    list(tileable_graph.topological_iter())[0].op.usecols)

    def testGroupbyPruneReadParquet(self):
        """Check that groupby/getitem narrows ``read_parquet``'s ``columns``."""
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.parquet')

            df = pd.DataFrame({
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce'),
                'd': list('abaaaddce')
            })
            df.to_parquet(file_path, index=False)

            # Use test executor
            mdf = md.read_parquet(file_path).groupby('c').agg({'a': 'sum'})
            result = self.executor.execute_dataframes([mdf])[0]
            mdf._shape = result.shape
            expected = df.groupby('c').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(result, expected)

            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.columns, ['a', 'c'])

            mdf = md.read_parquet(file_path).groupby(
                'c', as_index=False).c.agg({'cnt': 'count'})
            result = self.executor.execute_dataframes([mdf])[0]
            mdf._shape = result.shape
            expected = df.groupby('c', as_index=False).c.agg({'cnt': 'count'})
            pd.testing.assert_frame_equal(result, expected)

            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.columns, ['c'])

            # test getitem
            mdf = md.read_parquet(file_path)
            df1 = mdf.c.value_counts()
            df2 = mdf.groupby('b')['b'].count()
            results = self.executor.execute_dataframes([df1, df2])
            df1._shape = results[0].shape
            df2._shape = results[1].shape
            expected = df.c.value_counts(), df.groupby('b')['b'].count()
            pd.testing.assert_series_equal(results[0], expected[0])
            pd.testing.assert_series_equal(results[1], expected[1])

            # both consumers share the source: 'b' and 'c' are both kept
            optimized_df = tileable_optimized[df1.data]
            self.assertEqual(optimized_df.inputs[0].inputs[0].op.columns,
                             ['b', 'c'])

    def testPruneReadSQL(self):
        """Select columns from ``read_sql_table`` and compare with pandas."""
        test_df = pd.DataFrame({
            'a':
            np.arange(10).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(10)],
            'c':
            np.random.rand(10),
            'd': [
                datetime.fromtimestamp(time.time() + 3600 * (i - 5))
                for i in range(10)
            ]
        })

        with tempfile.TemporaryDirectory() as d:
            table_name = 'test'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')

            test_df.to_sql(table_name, uri, index=False)

            # test read df with columns
            r = md.read_sql_table('test', uri, chunk_size=4)[['a', 'b']]
            pd.testing.assert_frame_equal(r.to_pandas(), test_df[['a', 'b']])

            # test read series with columns
            r = md.read_sql_table('test', uri, chunk_size=4)['a']
            pd.testing.assert_series_equal(r.to_pandas(), test_df['a'])

    def testExecutedPruning(self):
        """Check that pruning a source does not corrupt later use of that source."""
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            pd_df = pd.DataFrame({
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce'),
                'd': list('abaaaddce')
            })
            pd_df.to_csv(file_path, index=False)

            in_df = md.read_csv(file_path)
            mdf = in_df.groupby('c').agg({'a': 'sum'})

            expected = pd_df.groupby('c').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.usecols, ['a', 'c'])

            # make sure in_df has correct columns
            pd.testing.assert_frame_equal(in_df.to_pandas(), pd_df)

            # skip pruning
            in_df = md.read_csv(file_path)
            df1 = in_df.groupby('d').agg({'b': 'min'})
            df2 = in_df[in_df.d.isin(df1.index)]

            expected1 = pd_df.groupby('d').agg({'b': 'min'})
            expected2 = pd_df[pd_df.d.isin(expected1.index)]

            pd.testing.assert_frame_equal(df2.to_pandas(), expected2)

    def testFetch(self):
        """Check that fetching an executed result does not run ``read_csv`` again."""
        with tempfile.TemporaryDirectory() as tempdir:
            filename = os.path.join(tempdir, 'test_fetch.csv')
            pd_df = pd.DataFrame({
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce'),
                'd': list('abaaaddce')
            })
            pd_df.to_csv(filename, index=False)

            df = md.read_csv(filename)
            df2 = df.groupby('d').agg({'b': 'min'})
            expected = pd_df.groupby('d').agg({'b': 'min'})
            _ = df2.execute()

            def _execute_read_csv(*_):  # pragma: no cover
                raise ValueError('cannot run read_csv again')

            # Register the failing runner before entering ``try`` so that a
            # registration error is reported as-is instead of being masked
            # by the cleanup below.
            register(DataFrameReadCSV, _execute_read_csv)
            try:
                pd.testing.assert_frame_equal(df2.fetch(), expected)
                pd.testing.assert_frame_equal(df2.iloc[:3].fetch(),
                                              expected.iloc[:3])
            finally:
                # Remove the override; tolerate it already being gone.
                Executor._op_runners.pop(DataFrameReadCSV, None)