def testReadSQLUseArrowDtype(self):
        """Reading SQL data with ``use_arrow_dtype=True`` must surface
        ``ArrowStringDtype`` columns, both before and after execution."""
        raw = pd.DataFrame({
            'a': np.arange(10).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(10)],
            'c': np.random.rand(10),
            'd': [datetime.fromtimestamp(time.time() + 3600 * (i - 5))
                  for i in range(10)],
        })

        with tempfile.TemporaryDirectory() as tmp_dir:
            uri = 'sqlite:///' + os.path.join(tmp_dir, 'test.db')
            raw.to_sql('test', uri, index=False)

            # read the whole table back in 4-row chunks
            tiled = md.read_sql_table('test', uri, chunk_size=4,
                                      use_arrow_dtype=True)
            fetched = self.executor.execute_dataframe(tiled, concat=True)[0]
            self.assertIsInstance(tiled.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(fetched.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(arrow_array_to_objects(fetched), raw)

            # read via a raw SQL string (offset-chunked) with date parsing
            tiled = md.read_sql_query('select * from test where c > 0.5', uri,
                                      parse_dates=['d'], chunk_size=4,
                                      incremental_index=True,
                                      use_arrow_dtype=True)
            fetched = self.executor.execute_dataframe(tiled, concat=True)[0]
            self.assertIsInstance(tiled.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(fetched.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(
                arrow_array_to_objects(fetched),
                raw[raw.c > 0.5].reset_index(drop=True))
    def testReadCSVUseArrowDtype(self):
        """``read_csv`` should produce ``ArrowStringDtype`` columns when arrow
        dtypes are requested explicitly, via the global option, or on
        gzip-compressed input."""
        rs = np.random.RandomState(0)
        raw = pd.DataFrame({
            'col1': rs.rand(100),
            'col2': rs.choice(['a' * 2, 'b' * 3, 'c' * 4], (100,)),
            'col3': np.arange(100),
        })

        # explicit keyword argument
        with tempfile.TemporaryDirectory() as tempdir:
            csv_path = os.path.join(tempdir, 'test.csv')
            raw.to_csv(csv_path, index=False)

            expected = pd.read_csv(csv_path)
            tiled = md.read_csv(csv_path, use_arrow_dtype=True)
            fetched = self.executor.execute_dataframe(tiled, concat=True)[0]
            self.assertIsInstance(tiled.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(fetched.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(arrow_array_to_objects(fetched),
                                          expected)

        # enabled through the global option instead of the kwarg
        with tempfile.TemporaryDirectory() as tempdir:
            with option_context({'dataframe.use_arrow_dtype': True}):
                csv_path = os.path.join(tempdir, 'test.csv')
                raw.to_csv(csv_path, index=False)

                expected = pd.read_csv(csv_path)
                tiled = md.read_csv(csv_path)
                fetched = self.executor.execute_dataframe(tiled,
                                                          concat=True)[0]
                self.assertIsInstance(tiled.dtypes.iloc[1],
                                      md.ArrowStringDtype)
                self.assertIsInstance(fetched.dtypes.iloc[1],
                                      md.ArrowStringDtype)
                pd.testing.assert_frame_equal(arrow_array_to_objects(fetched),
                                              expected)

        # gzip-compressed source file
        with tempfile.TemporaryDirectory() as tempdir:
            gz_path = os.path.join(tempdir, 'test.gzip')
            raw.to_csv(gz_path, compression='gzip', index=False)

            expected = pd.read_csv(gz_path, compression='gzip')
            tiled = md.read_csv(gz_path, compression='gzip',
                                use_arrow_dtype=True)
            fetched = self.executor.execute_dataframe(tiled, concat=True)[0]
            self.assertIsInstance(tiled.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(fetched.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(arrow_array_to_objects(fetched),
                                          expected)
    def testReadParquet(self):
        """Exercise ``md.read_parquet``: plain read, row-group chunking with a
        column projection, arrow dtypes, the fastparquet engine, and wildcard
        (multi-file) paths.

        Fix: the temp files are parquet, but were previously written to paths
        named ``test.csv``; renamed to ``test.parquet`` for clarity and for
        consistency with ``test_read_parquet_arrow``.
        """
        test_df = pd.DataFrame({'a': np.arange(10).astype(np.int64, copy=False),
                                'b': [f's{i}' for i in range(10)],
                                'c': np.random.rand(10)})

        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.parquet')
            test_df.to_parquet(file_path)

            df = md.read_parquet(file_path)
            result = self.executor.execute_dataframe(df, concat=True)[0]
            pd.testing.assert_frame_equal(result, test_df)

        # one chunk per row group, with a column projection
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.parquet')
            test_df.to_parquet(file_path, row_group_size=3)

            df = md.read_parquet(file_path, groups_as_chunks=True,
                                 columns=['a', 'b'])
            result = self.executor.execute_dataframe(df, concat=True)[0]
            pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                          test_df[['a', 'b']])

        # arrow-backed string dtype on the chunked read
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.parquet')
            test_df.to_parquet(file_path, row_group_size=5)

            df = md.read_parquet(file_path, groups_as_chunks=True,
                                 use_arrow_dtype=True,
                                 incremental_index=True)
            result = self.executor.execute_dataframe(df, concat=True)[0]
            self.assertIsInstance(df.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(result.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(arrow_array_to_objects(result),
                                          test_df)

        # test fastparquet engine (needs uncompressed data)
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.parquet')
            test_df.to_parquet(file_path, compression=None)

            df = md.read_parquet(file_path, engine='fastparquet')
            result = self.executor.execute_dataframe(df, concat=True)[0]
            pd.testing.assert_frame_equal(result, test_df)

        # test wildcards in path
        with tempfile.TemporaryDirectory() as tempdir:
            df = pd.DataFrame({'a': np.arange(300).astype(np.int64, copy=False),
                               'b': [f's{i}' for i in range(300)],
                               'c': np.random.rand(300)})

            file_paths = [os.path.join(tempdir, f'test{i}.parquet')
                          for i in range(3)]
            df[:100].to_parquet(file_paths[0], row_group_size=50)
            df[100:200].to_parquet(file_paths[1], row_group_size=30)
            df[200:].to_parquet(file_paths[2])

            mdf = md.read_parquet(f'{tempdir}/*.parquet')
            r = self.executor.execute_dataframe(mdf, concat=True)[0]
            # chunk order is not guaranteed, so sort before comparing
            pd.testing.assert_frame_equal(
                df, r.sort_values('a').reset_index(drop=True))

            mdf = md.read_parquet(f'{tempdir}/*.parquet', groups_as_chunks=True)
            r = self.executor.execute_dataframe(mdf, concat=True)[0]
            pd.testing.assert_frame_equal(
                df, r.sort_values('a').reset_index(drop=True))
def test_read_parquet_arrow(setup):
    """Exercise ``md.read_parquet`` under the new execution API: plain read,
    row-group chunking with a column projection, arrow dtypes, and wildcard
    (multi-file) paths.

    Fixes: the first temp file is parquet but was written to a path named
    ``test.csv`` (the later sections already use ``test.parquet``); also
    removed stale commented-out code referencing ``self.executor``, which
    does not exist in this pytest-style function.
    """
    test_df = pd.DataFrame({'a': np.arange(10).astype(np.int64, copy=False),
                            'b': [f's{i}' for i in range(10)],
                            'c': np.random.rand(10)})

    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.parquet')
        test_df.to_parquet(file_path)

        df = md.read_parquet(file_path)
        result = df.execute().fetch()
        pd.testing.assert_frame_equal(result, test_df)

    # one chunk per row group, with a column projection
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.parquet')
        test_df.to_parquet(file_path, row_group_size=3)

        df = md.read_parquet(file_path, groups_as_chunks=True,
                             columns=['a', 'b'])
        result = df.execute().fetch()
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      test_df[['a', 'b']])

    # arrow-backed string dtype on the chunked read
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.parquet')
        test_df.to_parquet(file_path, row_group_size=5)

        df = md.read_parquet(file_path, groups_as_chunks=True,
                             use_arrow_dtype=True,
                             incremental_index=True)
        result = df.execute().fetch()
        assert isinstance(df.dtypes.iloc[1], md.ArrowStringDtype)
        assert isinstance(result.dtypes.iloc[1], md.ArrowStringDtype)
        pd.testing.assert_frame_equal(arrow_array_to_objects(result), test_df)

    # test wildcards in path
    with tempfile.TemporaryDirectory() as tempdir:
        df = pd.DataFrame({'a': np.arange(300).astype(np.int64, copy=False),
                           'b': [f's{i}' for i in range(300)],
                           'c': np.random.rand(300)})

        file_paths = [os.path.join(tempdir, f'test{i}.parquet')
                      for i in range(3)]
        df[:100].to_parquet(file_paths[0], row_group_size=50)
        df[100:200].to_parquet(file_paths[1], row_group_size=30)
        df[200:].to_parquet(file_paths[2])

        mdf = md.read_parquet(f'{tempdir}/*.parquet')
        r = mdf.execute().fetch()
        # chunk order is not guaranteed, so sort before comparing
        pd.testing.assert_frame_equal(
            df, r.sort_values('a').reset_index(drop=True))

        mdf = md.read_parquet(f'{tempdir}/*.parquet', groups_as_chunks=True)
        r = mdf.execute().fetch()
        pd.testing.assert_frame_equal(
            df, r.sort_values('a').reset_index(drop=True))
def test_groupby_apply_with_arrow_dtype(setup):
    """``groupby().apply`` must work when the grouping key column (or the
    grouped series itself) uses the ``Arrow[string]`` extension dtype."""
    raw = pd.DataFrame({'a': [1, 2, 1], 'b': ['a', 'b', 'a']})
    mdf = md.DataFrame(raw)
    mdf['b'] = mdf['b'].astype('Arrow[string]')

    # DataFrame groupby on an arrow-string key column
    applied = mdf.groupby('b').apply(lambda df: df.a.sum())
    fetched = applied.execute().fetch()
    pd.testing.assert_series_equal(
        fetched, raw.groupby('b').apply(lambda df: df.a.sum()))

    # Series groupby where the series itself is arrow-backed
    raw_series = raw['b']
    mseries = md.Series(raw_series).astype('Arrow[string]')
    applied = mseries.groupby(mseries).apply(lambda s: s)
    fetched = applied.execute().fetch()
    pd.testing.assert_series_equal(
        arrow_array_to_objects(fetched),
        raw_series.groupby(raw_series).apply(lambda s: s))
Example #6
0
    def testGroupbyApplyWithArrowDtype(self):
        """``groupby().apply`` must work when the grouping key column (or the
        grouped series itself) uses the ``Arrow[string]`` extension dtype."""
        raw = pd.DataFrame({'a': [1, 2, 1], 'b': ['a', 'b', 'a']})
        mdf = md.DataFrame(raw)
        mdf['b'] = mdf['b'].astype('Arrow[string]')

        # DataFrame groupby on an arrow-string key column
        applied = mdf.groupby('b').apply(lambda df: df.a.sum())
        fetched = self.executor.execute_dataframe(applied, concat=True)[0]
        pd.testing.assert_series_equal(
            fetched, raw.groupby('b').apply(lambda df: df.a.sum()))

        # Series groupby where the series itself is arrow-backed
        raw_series = raw['b']
        mseries = md.Series(raw_series).astype('Arrow[string]')
        applied = mseries.groupby(mseries).apply(lambda s: s)
        fetched = self.executor.execute_dataframe(applied, concat=True)[0]
        pd.testing.assert_series_equal(
            arrow_array_to_objects(fetched),
            raw_series.groupby(raw_series).apply(lambda s: s))