def testReadSQLUseArrowDtype(self):
    """``read_sql_table`` / ``read_sql_query`` should yield arrow-backed
    string columns when ``use_arrow_dtype=True`` is passed."""
    raw = pd.DataFrame({
        'a': np.arange(10).astype(np.int64, copy=False),
        'b': [f's{i}' for i in range(10)],
        'c': np.random.rand(10),
        'd': [datetime.fromtimestamp(time.time() + 3600 * (i - 5))
              for i in range(10)],
    })
    with tempfile.TemporaryDirectory() as d:
        table_name = 'test'
        uri = 'sqlite:///' + os.path.join(d, 'test.db')
        raw.to_sql(table_name, uri, index=False)

        # whole-table read: column 'b' must be arrow-backed on both the
        # tileable's dtypes and the executed result's dtypes
        df = md.read_sql_table('test', uri, chunk_size=4,
                               use_arrow_dtype=True)
        result = self.executor.execute_dataframe(df, concat=True)[0]
        self.assertIsInstance(df.dtypes.iloc[1], md.ArrowStringDtype)
        self.assertIsInstance(result.dtypes.iloc[1], md.ArrowStringDtype)
        pd.testing.assert_frame_equal(arrow_array_to_objects(result), raw)

        # test read with sql string and offset method
        df = md.read_sql_query('select * from test where c > 0.5',
                               uri, parse_dates=['d'], chunk_size=4,
                               incremental_index=True,
                               use_arrow_dtype=True)
        result = self.executor.execute_dataframe(df, concat=True)[0]
        self.assertIsInstance(df.dtypes.iloc[1], md.ArrowStringDtype)
        self.assertIsInstance(result.dtypes.iloc[1], md.ArrowStringDtype)
        pd.testing.assert_frame_equal(
            arrow_array_to_objects(result),
            raw[raw.c > 0.5].reset_index(drop=True))
def testReadCSVUseArrowDtype(self):
    """``read_csv`` should produce arrow-backed string columns via the
    explicit ``use_arrow_dtype`` argument, via the global option, and
    when reading compressed input."""
    rs = np.random.RandomState(0)
    raw = pd.DataFrame({
        'col1': rs.rand(100),
        'col2': rs.choice(['a' * 2, 'b' * 3, 'c' * 4], (100,)),
        'col3': np.arange(100),
    })

    def _check(mdf, expected):
        # column 'col2' must be arrow-backed on the tileable and the result
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        self.assertIsInstance(mdf.dtypes.iloc[1], md.ArrowStringDtype)
        self.assertIsInstance(result.dtypes.iloc[1], md.ArrowStringDtype)
        pd.testing.assert_frame_equal(arrow_array_to_objects(result),
                                      expected)

    # enabled through the explicit keyword argument
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        raw.to_csv(file_path, index=False)
        pdf = pd.read_csv(file_path)
        _check(md.read_csv(file_path, use_arrow_dtype=True), pdf)

    # enabled through the global option
    with tempfile.TemporaryDirectory() as tempdir:
        with option_context({'dataframe.use_arrow_dtype': True}):
            file_path = os.path.join(tempdir, 'test.csv')
            raw.to_csv(file_path, index=False)
            pdf = pd.read_csv(file_path)
            _check(md.read_csv(file_path), pdf)

    # test compression
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.gzip')
        raw.to_csv(file_path, compression='gzip', index=False)
        pdf = pd.read_csv(file_path, compression='gzip')
        _check(md.read_csv(file_path, compression='gzip',
                           use_arrow_dtype=True), pdf)
def testReadParquet(self):
    """Test ``md.read_parquet``: plain read, row groups as chunks with
    column pruning, arrow-backed string dtypes, the fastparquet engine,
    and wildcard paths spanning multiple files.

    Fix: the temporary files used to carry a misleading ``.csv`` suffix
    although they always held parquet data; they are now named
    ``.parquet``, consistent with ``test_read_parquet_arrow``.
    """
    test_df = pd.DataFrame({'a': np.arange(10).astype(np.int64, copy=False),
                            'b': [f's{i}' for i in range(10)],
                            'c': np.random.rand(10)})

    # plain read with the default engine
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.parquet')
        test_df.to_parquet(file_path)

        df = md.read_parquet(file_path)
        result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(result, test_df)

    # read row groups as separate chunks, with column pruning
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.parquet')
        test_df.to_parquet(file_path, row_group_size=3)

        df = md.read_parquet(file_path, groups_as_chunks=True,
                             columns=['a', 'b'])
        result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      test_df[['a', 'b']])

    # arrow-backed string columns
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.parquet')
        test_df.to_parquet(file_path, row_group_size=5)

        df = md.read_parquet(file_path, groups_as_chunks=True,
                             use_arrow_dtype=True, incremental_index=True)
        result = self.executor.execute_dataframe(df, concat=True)[0]
        self.assertIsInstance(df.dtypes.iloc[1], md.ArrowStringDtype)
        self.assertIsInstance(result.dtypes.iloc[1], md.ArrowStringDtype)
        pd.testing.assert_frame_equal(arrow_array_to_objects(result),
                                      test_df)

    # test fastparquet engine
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.parquet')
        # fastparquet needs compression=None here
        test_df.to_parquet(file_path, compression=None)

        df = md.read_parquet(file_path, engine='fastparquet')
        result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(result, test_df)

    # test wildcards in path
    with tempfile.TemporaryDirectory() as tempdir:
        df = pd.DataFrame({'a': np.arange(300).astype(np.int64, copy=False),
                           'b': [f's{i}' for i in range(300)],
                           'c': np.random.rand(300)})
        file_paths = [os.path.join(tempdir, f'test{i}.parquet')
                      for i in range(3)]
        # three files with differing (or default) row-group sizes
        df[:100].to_parquet(file_paths[0], row_group_size=50)
        df[100:200].to_parquet(file_paths[1], row_group_size=30)
        df[200:].to_parquet(file_paths[2])

        mdf = md.read_parquet(f'{tempdir}/*.parquet')
        r = self.executor.execute_dataframe(mdf, concat=True)[0]
        pd.testing.assert_frame_equal(
            df, r.sort_values('a').reset_index(drop=True))

        mdf = md.read_parquet(f'{tempdir}/*.parquet',
                              groups_as_chunks=True)
        r = self.executor.execute_dataframe(mdf, concat=True)[0]
        pd.testing.assert_frame_equal(
            df, r.sort_values('a').reset_index(drop=True))
def test_read_parquet_arrow(setup):
    """Test ``md.read_parquet``: plain read, row groups as chunks with
    column pruning, arrow-backed string dtypes, and wildcard paths.

    Fix: the first temporary file used a misleading ``.csv`` suffix
    although it holds parquet data; it is now ``.parquet`` like the
    other sections of this test.
    """
    test_df = pd.DataFrame({'a': np.arange(10).astype(np.int64, copy=False),
                            'b': [f's{i}' for i in range(10)],
                            'c': np.random.rand(10)})

    # plain read with the default engine
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.parquet')
        test_df.to_parquet(file_path)

        df = md.read_parquet(file_path)
        result = df.execute().fetch()
        pd.testing.assert_frame_equal(result, test_df)
        # NOTE(review): disabled size-estimation check kept for reference —
        # re-enable once the new execution layer exposes mock sizes:
        #   size_res = self.executor.execute_dataframe(df, mock=True)
        #   assert sum(s[0] for s in size_res) > \
        #       test_df.memory_usage(deep=True).sum()

    # read row groups as separate chunks, with column pruning
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.parquet')
        test_df.to_parquet(file_path, row_group_size=3)

        df = md.read_parquet(file_path, groups_as_chunks=True,
                             columns=['a', 'b'])
        result = df.execute().fetch()
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      test_df[['a', 'b']])

    # arrow-backed string columns
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.parquet')
        test_df.to_parquet(file_path, row_group_size=5)

        df = md.read_parquet(file_path, groups_as_chunks=True,
                             use_arrow_dtype=True, incremental_index=True)
        result = df.execute().fetch()
        assert isinstance(df.dtypes.iloc[1], md.ArrowStringDtype)
        assert isinstance(result.dtypes.iloc[1], md.ArrowStringDtype)
        pd.testing.assert_frame_equal(arrow_array_to_objects(result),
                                      test_df)

    # test wildcards in path
    with tempfile.TemporaryDirectory() as tempdir:
        df = pd.DataFrame({'a': np.arange(300).astype(np.int64, copy=False),
                           'b': [f's{i}' for i in range(300)],
                           'c': np.random.rand(300)})
        file_paths = [os.path.join(tempdir, f'test{i}.parquet')
                      for i in range(3)]
        # three files with differing (or default) row-group sizes
        df[:100].to_parquet(file_paths[0], row_group_size=50)
        df[100:200].to_parquet(file_paths[1], row_group_size=30)
        df[200:].to_parquet(file_paths[2])

        mdf = md.read_parquet(f'{tempdir}/*.parquet')
        r = mdf.execute().fetch()
        pd.testing.assert_frame_equal(
            df, r.sort_values('a').reset_index(drop=True))

        mdf = md.read_parquet(f'{tempdir}/*.parquet',
                              groups_as_chunks=True)
        r = mdf.execute().fetch()
        pd.testing.assert_frame_equal(
            df, r.sort_values('a').reset_index(drop=True))
def test_groupby_apply_with_arrow_dtype(setup):
    """``groupby().apply`` should work when the grouping key uses the
    ``Arrow[string]`` extension dtype, for both DataFrame and Series."""
    raw = pd.DataFrame({'a': [1, 2, 1], 'b': ['a', 'b', 'a']})

    # DataFrame groupby keyed by an arrow-typed column
    mdf = md.DataFrame(raw)
    mdf['b'] = mdf['b'].astype('Arrow[string]')
    fetched = mdf.groupby('b').apply(lambda df: df.a.sum()).execute().fetch()
    pd.testing.assert_series_equal(
        fetched, raw.groupby('b').apply(lambda df: df.a.sum()))

    # Series grouped by itself, with an arrow dtype
    raw_series = raw['b']
    mseries = md.Series(raw_series).astype('Arrow[string]')
    fetched = mseries.groupby(mseries).apply(lambda s: s).execute().fetch()
    pd.testing.assert_series_equal(
        arrow_array_to_objects(fetched),
        raw_series.groupby(raw_series).apply(lambda s: s))
def testGroupbyApplyWithArrowDtype(self):
    """``groupby().apply`` should work when the grouping key uses the
    ``Arrow[string]`` extension dtype, for both DataFrame and Series."""
    raw = pd.DataFrame({'a': [1, 2, 1], 'b': ['a', 'b', 'a']})

    # DataFrame groupby keyed by an arrow-typed column
    mdf = md.DataFrame(raw)
    mdf['b'] = mdf['b'].astype('Arrow[string]')
    applied = mdf.groupby('b').apply(lambda df: df.a.sum())
    expected = raw.groupby('b').apply(lambda df: df.a.sum())
    result = self.executor.execute_dataframe(applied, concat=True)[0]
    pd.testing.assert_series_equal(result, expected)

    # Series grouped by itself, with an arrow dtype
    raw_series = raw['b']
    mseries = md.Series(raw_series).astype('Arrow[string]')
    applied = mseries.groupby(mseries).apply(lambda s: s)
    expected = raw_series.groupby(raw_series).apply(lambda s: s)
    result = self.executor.execute_dataframe(applied, concat=True)[0]
    pd.testing.assert_series_equal(arrow_array_to_objects(result), expected)