def testSeriesQuantileExecution(self):
    # Compare quantile() on a Series built with chunk_size=3 against the
    # pandas result computed from the same random 10-element data.
    pd_series = pd.Series(np.random.rand(10), name='a')
    chunked = Series(pd_series, chunk_size=3)
    run = self.executor.execute_dataframe

    # q = 0.5 (no argument given): the result is a scalar
    scalar = run(chunked.quantile(), concat=True)[0]
    self.assertEqual(scalar, pd_series.quantile())

    # q is a list, first without and then with an explicit interpolation
    for extra in ({}, {'interpolation': 'midpoint'}):
        actual = run(chunked.quantile([0.3, 0.7], **extra), concat=True)[0]
        wanted = pd_series.quantile([0.3, 0.7], **extra)
        pd.testing.assert_series_equal(actual, wanted)

    outer = self

    class MockSession:
        # stand-in session that only carries the executor of this test
        def __init__(self):
            self.executor = outer.executor

    context = LocalContext(MockSession())
    ctx_executor = ExecutorForTest('numpy', storage=context)
    with context:
        # q is a tensor
        q_tensor = tensor([0.3, 0.7])
        actual = ctx_executor.execute_dataframes(
            [chunked.quantile(q_tensor)])[0]
        pd.testing.assert_series_equal(
            actual, pd_series.quantile([0.3, 0.7]))
def testDataFrameQuantileExecution(self):
    # Compare DataFrame.quantile() on a frame built with chunk_size=3 against
    # pandas for several combinations of q, axis, interpolation and
    # numeric_only.
    columns = {
        'a': np.random.rand(10),
        'b': np.random.randint(1000, size=10),
        'c': np.random.rand(10),
        'd': [np.random.bytes(10) for _ in range(10)],
        'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
        'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)],
    }
    pd_frame = pd.DataFrame(columns, index=pd.RangeIndex(1, 11))
    chunked = DataFrame(pd_frame, chunk_size=3)

    series_eq = pd.testing.assert_series_equal
    frame_eq = pd.testing.assert_frame_equal

    def q_args(q):
        # fresh positional arguments for every call; None means "no q given"
        return [] if q is None else [list(q)]

    # (q, keyword arguments, comparison): a scalar q (default 0.5) gives a
    # series, a list q gives a dataframe
    cases = (
        (None, {}, series_eq),                                   # axis = 0
        (None, {'axis': 1}, series_eq),                          # axis = 1
        ((0.3, 0.7), {}, frame_eq),                              # axis = 0
        ((0.3, 0.7), {'axis': 1}, frame_eq),                     # axis = 1
        ((0.3, 0.7), {'interpolation': 'midpoint'}, frame_eq),   # interpolation
    )
    for q, kwargs, check in cases:
        actual = self.executor.execute_dataframe(
            chunked.quantile(*q_args(q), **kwargs), concat=True)[0]
        check(actual, pd_frame.quantile(*q_args(q), **kwargs))

    outer = self

    class MockSession:
        # stand-in session that only carries the executor of this test
        def __init__(self):
            self.executor = outer.executor

    context = LocalContext(MockSession())
    ctx_executor = ExecutorForTest('numpy', storage=context)
    with context:
        # q is a tensor
        q_tensor = tensor([0.3, 0.7])
        actual = ctx_executor.execute_dataframes(
            [chunked.quantile(q_tensor)])[0]
        frame_eq(actual, pd_frame.quantile([0.3, 0.7]))

    # test numeric_only
    mixed = pd.DataFrame(
        {
            'a': np.random.rand(10),
            'b': np.random.randint(1000, size=10),
            'c': np.random.rand(10),
            'd': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
        },
        index=pd.RangeIndex(1, 11))
    chunked_mixed = DataFrame(mixed, chunk_size=3)

    actual = self.executor.execute_dataframe(
        chunked_mixed.quantile([0.3, 0.7], numeric_only=False),
        concat=True)[0]
    frame_eq(actual, mixed.quantile([0.3, 0.7], numeric_only=False))

    actual = self.executor.execute_dataframe(
        chunked_mixed.quantile(numeric_only=False), concat=True)[0]
    series_eq(actual, mixed.quantile(numeric_only=False))
def testCutExecution(self):
    # Run cut() on series, tensor, ndarray and index inputs and compare each
    # result with pd.cut applied to the equivalent pandas / NumPy data.

    def check_items(actual, wanted):
        # zip() silently stops at the shorter input, so a truncated result
        # would pass an element-wise loop; compare the lengths first.
        self.assertEqual(len(actual), len(wanted))
        for got, want in zip(actual, wanted):
            np.testing.assert_equal(got, want)

    rs = np.random.RandomState(0)
    raw = rs.random(15) * 1000
    s = pd.Series(raw, index=['i{}'.format(i) for i in range(15)])
    bins = [10, 100, 500]
    ii = pd.interval_range(10, 500, 3)
    labels = ['a', 'b']

    t = tensor(raw, chunk_size=4)
    series = from_pandas_series(s, chunk_size=4)
    iii = from_pandas_index(ii, chunk_size=2)

    # cut on Series
    r = cut(series, bins)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_series_equal(result, pd.cut(s, bins))

    r, b = cut(series, bins, retbins=True)
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_tensor(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, bins, retbins=True)
    pd.testing.assert_series_equal(r_result, r_expected)
    np.testing.assert_array_equal(b_result, b_expected)

    # cut on tensor; result and expected are compared item by item
    r = cut(t, bins)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    check_items(result, pd.cut(raw, bins))

    # pandas series input with the bins given as a tensor
    r = cut(s, tensor(bins, chunk_size=2), right=False, include_lowest=True)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_series_equal(
        result, pd.cut(s, bins, right=False, include_lowest=True))

    # test labels
    r = cut(t, bins, labels=labels)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    check_items(result, pd.cut(raw, bins, labels=labels))

    # labels=False on a tensor, executed through execute_tensor
    r = cut(t, bins, labels=False)
    result = self.executor.execute_tensor(r, concat=True)[0]
    expected = pd.cut(raw, bins, labels=False)
    np.testing.assert_array_equal(result, expected)

    # test labels which is tensor
    labels_t = tensor(['a', 'b'], chunk_size=1)
    r = cut(raw, bins, labels=labels_t, include_lowest=True)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    check_items(result, pd.cut(raw, bins, labels=labels, include_lowest=True))

    # test labels=False with pandas IntervalIndex bins
    r, b = cut(raw, ii, labels=False, retbins=True)
    r_result = self.executor.execute_tileable(r, concat=True)[0]
    b_result = self.executor.execute_tileable(b, concat=True)[0]
    r_expected, b_expected = pd.cut(raw, ii, labels=False, retbins=True)
    check_items(r_result, r_expected)
    pd.testing.assert_index_equal(b_result, b_expected)

    # test bins which is md.IntervalIndex
    r, b = cut(series, iii, labels=tensor(labels, chunk_size=1), retbins=True)
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_dataframe(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, ii, labels=labels, retbins=True)
    pd.testing.assert_series_equal(r_result, r_expected)
    pd.testing.assert_index_equal(b_result, b_expected)

    # test duplicates
    bins2 = [0, 2, 4, 6, 10, 10]
    r, b = cut(s, bins2, labels=False, retbins=True,
               right=False, duplicates='drop')
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_tensor(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, bins2, labels=False, retbins=True,
                                    right=False, duplicates='drop')
    pd.testing.assert_series_equal(r_result, r_expected)
    np.testing.assert_array_equal(b_result, b_expected)

    this = self

    class MockSession:
        # stand-in session that only carries the executor of this test
        def __init__(self):
            self.executor = this.executor

    ctx = LocalContext(MockSession())
    executor = ExecutorForTest('numpy', storage=ctx)
    with ctx:
        # test integer bins
        r = cut(series, 3)
        result = executor.execute_dataframes([r])[0]
        pd.testing.assert_series_equal(result, pd.cut(s, 3))

        r, b = cut(series, 3, right=False, retbins=True)
        r_result, b_result = executor.execute_dataframes([r, b])
        r_expected, b_expected = pd.cut(s, 3, right=False, retbins=True)
        pd.testing.assert_series_equal(r_result, r_expected)
        np.testing.assert_array_equal(b_result, b_expected)

        # test min max same
        s2 = pd.Series([1.1] * 15)
        r = cut(s2, 3)
        result = executor.execute_dataframes([r])[0]
        pd.testing.assert_series_equal(result, pd.cut(s2, 3))

        # test inf exist
        s3 = s2.copy()
        # positional write: s3[-1] would be a label lookup on the default
        # integer index and append a new row instead of replacing the last
        s3.iloc[-1] = np.inf
        with self.assertRaises(ValueError):
            executor.execute_dataframes([cut(s3, 3)])
class Test(TestBase):
    # Execution tests for the to_csv / to_sql / to_vineyard / to_parquet
    # writers, each run through a fresh ExecutorForTest.

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testToCSVExecution(self):
        # Write a 100-row frame (and one of its columns as a series) to a
        # single file and to per-chunk files, then read them back with pandas.
        index = pd.RangeIndex(100, 0, -1, name='index')
        raw = pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100)
            },
            index=index)
        df = DataFrame(raw, chunk_size=33)

        def read_back(csv_path):
            # re-read a written file using the dtypes of the source frame
            return pd.read_csv(csv_path, dtype=raw.dtypes.to_dict())

        with tempfile.TemporaryDirectory() as base_path:

            def read_chunk_files():
                # chunk_size=33 over 100 rows gives the 4 files out-0..out-3
                return [
                    read_back(os.path.join(base_path, f'out-{i}.csv'))
                    for i in range(4)
                ]

            # DATAFRAME TESTS
            # test one file with dataframe
            path = os.path.join(base_path, 'out.csv')
            self.executor.execute_dataframe(df.to_csv(path))

            result = read_back(path).set_index('index')
            pd.testing.assert_frame_equal(result, raw)

            # test multi files with dataframe
            path = os.path.join(base_path, 'out-*.csv')
            self.executor.execute_dataframe(df.to_csv(path))

            dfs = read_chunk_files()
            result = pd.concat(dfs, axis=0).set_index('index')
            pd.testing.assert_frame_equal(result, raw)
            pd.testing.assert_frame_equal(dfs[1].set_index('index'),
                                          raw.iloc[33:66])

            # SERIES TESTS
            series = md.Series(raw.col1, chunk_size=33)
            expected_frame = raw.col1.to_frame()

            # test one file with series
            path = os.path.join(base_path, 'out.csv')
            self.executor.execute_dataframe(series.to_csv(path))

            result = read_back(path).set_index('index')
            pd.testing.assert_frame_equal(result, expected_frame)

            # test multi files with series
            path = os.path.join(base_path, 'out-*.csv')
            self.executor.execute_dataframe(series.to_csv(path))

            dfs = read_chunk_files()
            result = pd.concat(dfs, axis=0).set_index('index')
            pd.testing.assert_frame_equal(result, expected_frame)
            pd.testing.assert_frame_equal(dfs[1].set_index('index'),
                                          expected_frame.iloc[33:66])

    @unittest.skipIf(sqlalchemy is None, 'sqlalchemy not installed')
    def testToSQL(self):
        # Write a frame and a series into a SQLite file in a temporary
        # directory and read both tables back with pandas.
        index = pd.RangeIndex(100, 0, -1, name='index')
        raw = pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100).astype('int64'),
            },
            index=index)

        with tempfile.TemporaryDirectory() as d:
            table_name1 = 'test_table'
            table_name2 = 'test_table2'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')

            engine = sqlalchemy.create_engine(uri)
            try:
                # test write dataframe
                df = DataFrame(raw, chunk_size=33)
                r = df.to_sql(table_name1, con=engine)
                self.executor.execute_dataframe(r)

                written = pd.read_sql(table_name1, con=engine,
                                      index_col='index') \
                    .sort_index(ascending=False)
                pd.testing.assert_frame_equal(raw, written)

                # test write with existing table
                with self.assertRaises(ValueError):
                    df.to_sql(table_name1, con=uri).execute()

                # test write series
                series = md.Series(raw.col1, chunk_size=33)
                with engine.connect() as conn:
                    r = series.to_sql(table_name2, con=conn)
                    self.executor.execute_dataframe(r)

                written = pd.read_sql(table_name2, con=engine,
                                      index_col='index') \
                    .sort_index(ascending=False)
                pd.testing.assert_frame_equal(raw.col1.to_frame(), written)
            finally:
                # close pooled connections before the temporary directory
                # holding the database file is removed
                engine.dispose()

    @unittest.skipIf(vineyard is None, 'vineyard not installed')
    @mock.patch('webbrowser.open_new_tab', new=lambda *_, **__: True)
    def testToVineyard(self):
        # Round-trip a small frame through to_vineyard / from_vineyard with a
        # default session, a cluster session and a web session.
        def testWithGivenSession(session):
            with option_context(
                    {'vineyard.socket': '/tmp/vineyard/vineyard.sock'}):
                df1 = DataFrame(pd.DataFrame(np.arange(12).reshape(3, 4),
                                             columns=['a', 'b', 'c', 'd']),
                                chunk_size=2)
                object_id = df1.to_vineyard().execute(session=session).fetch()
                df2 = md.from_vineyard(object_id)

                df1_value = df1.execute(session=session).fetch()
                df2_value = df2.execute(session=session).fetch()
                # indexes are reset on both sides before comparing
                pd.testing.assert_frame_equal(
                    df1_value.reset_index(drop=True),
                    df2_value.reset_index(drop=True))

        with new_session().as_default() as session:
            testWithGivenSession(session)

        with new_cluster(scheduler_n_process=2, worker_n_process=2,
                         shared_memory='20M', web=True) as cluster:
            with new_session(cluster.endpoint).as_default() as session:
                testWithGivenSession(session)

            with new_session(
                    'http://' + cluster._web_endpoint).as_default() \
                    as web_session:
                testWithGivenSession(web_session)

    @unittest.skipIf(pa is None, 'pyarrow not installed')
    def testToParquetArrowExecution(self):
        # Write per-chunk and partitioned parquet output, then read it back
        # with md.read_parquet and compare with the source frame.
        raw = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.arange(100),
            'col3': np.random.choice(['a', 'b', 'c'], (100, )),
        })
        df = DataFrame(raw, chunk_size=33)

        with tempfile.TemporaryDirectory() as base_path:
            # DATAFRAME TESTS
            path = os.path.join(base_path, 'out-*.parquet')
            r = df.to_parquet(path)
            self.executor.execute_dataframe(r)

            # the written files are read back twice; both reads must match
            for _ in range(2):
                read_df = md.read_parquet(path)
                result = self.executor.execute_dataframe(
                    read_df, concat=True)[0]
                result = result.sort_index()
                pd.testing.assert_frame_equal(result, raw)

            # test read_parquet then to_parquet
            read_df = md.read_parquet(path)
            r = read_df.to_parquet(path)
            self.executor.execute_dataframes([r])

            # test partition_cols
            path = os.path.join(base_path, 'out-partitioned')
            r = df.to_parquet(path, partition_cols=['col3'])
            self.executor.execute_dataframe(r)

            read_df = md.read_parquet(path)
            result = self.executor.execute_dataframe(read_df, concat=True)[0]
            # cast the partition column to object before comparing with raw
            result['col3'] = result['col3'].astype('object')
            pd.testing.assert_frame_equal(
                result.sort_values('col1').reset_index(drop=True),
                raw.sort_values('col1').reset_index(drop=True))

    @unittest.skipIf(fastparquet is None, 'fastparquet not installed')
    def testToParquetFastParquetExecution(self):
        # Only checks that writing with the fastparquet engine and gzip
        # compression executes; nothing is read back.
        raw = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.arange(100),
            'col3': np.random.choice(['a', 'b', 'c'], (100, )),
        })
        df = DataFrame(raw, chunk_size=33)

        with tempfile.TemporaryDirectory() as base_path:
            # test fastparquet
            path = os.path.join(base_path, 'out-fastparquet-*.parquet')
            r = df.to_parquet(path, engine='fastparquet', compression='gzip')
            self.executor.execute_dataframe(r)
def testRollingAggExecution(self):
    # Compare rolling(...).agg(...) and the named rolling reductions on
    # frames and series against pandas for several window / center settings.
    raw = pd.DataFrame({
        'a': np.random.randint(100, size=(10, )),
        'b': np.random.rand(10),
        'c': np.random.randint(100, size=(10, )),
        'd': ['c' * i for i in np.random.randint(4, size=10)]
    })
    # step 4 over the four columns selects only the first one ('a')
    raw.iloc[1, ::4] = np.nan
    s = raw.iloc[:, 1]

    dfs = [
        md.DataFrame(raw, chunk_size=10),  # 1 chunk
        md.DataFrame(raw, chunk_size=3)  # multiple chunks on each axis
    ]
    frame_funcs = [
        'min', ['max', 'mean'], {'c': ['std'], 'b': ['count', 'min']}
    ]

    # test 1 chunk
    df2 = dfs[0].rolling(3).agg(frame_funcs[2])
    result = self.executor.execute_dataframe(df2, concat=True)[0]
    expected = raw.rolling(3).agg(frame_funcs[2])
    pd.testing.assert_frame_equal(result, expected)

    for window in [2, 5]:
        for center in [True, False]:
            for func in frame_funcs:
                df2 = dfs[1].rolling(window, center=center).agg(func)

                result = self.executor.execute_dataframe(
                    df2, concat=True)[0]
                expected = raw.rolling(window, center=center).agg(func)
                pd.testing.assert_frame_equal(result, expected)

    # test min_periods and win_type
    df2 = dfs[1].rolling(3, min_periods=1, win_type='triang').agg('sum')
    result = self.executor.execute_dataframe(df2, concat=True)[0]
    expected = raw.rolling(3, min_periods=1, win_type='triang').agg('sum')
    pd.testing.assert_frame_equal(result, expected)

    # test rolling getitem, series
    df2 = dfs[1].rolling(3)['b'].agg('sum')
    result = self.executor.execute_dataframe(df2, concat=True)[0]
    expected = raw.rolling(3)['b'].agg('sum')
    pd.testing.assert_series_equal(result, expected)

    # test rolling getitem, dataframe
    df2 = dfs[1].rolling(3)['c', 'b'].agg('sum')
    result = self.executor.execute_dataframe(df2, concat=True)[0]
    expected = raw.rolling(3)['c', 'b'].agg('sum')
    pd.testing.assert_frame_equal(result, expected)

    # test axis=1
    df2 = dfs[1].rolling(3, axis=1).agg('sum')
    result = self.executor.execute_dataframe(
        df2, concat=True, check_nsplits=False)[0]
    expected = raw.rolling(3, axis=1).agg('sum')
    pd.testing.assert_frame_equal(result, expected)

    # test window which is offset
    raw2 = raw.copy()
    raw2.reset_index(inplace=True, drop=True)
    raw2.index = pd.date_range('2020-2-25', periods=10)

    df = md.DataFrame(raw2, chunk_size=3)
    for func in frame_funcs:
        df2 = df.rolling('2d').agg(func)

        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = raw2.rolling('2d').agg(func)
        pd.testing.assert_frame_equal(result, expected)

    # series input: one single-chunk and one multi-chunk series; the loop
    # variable gets its own name so the list it iterates is not shadowed
    series_inputs = [md.Series(s, chunk_size=10), md.Series(s, chunk_size=4)]
    series_funcs = ['min', ['max', 'mean'], {'c': 'std', 'b': 'count'}]

    for series_input in series_inputs:
        for window in [2, 3, 5]:
            for center in [True, False]:
                for func in series_funcs:
                    series2 = series_input.rolling(
                        window, center=center).agg(func)

                    result = self.executor.execute_dataframe(
                        series2, concat=True)[0]
                    expected = s.rolling(window, center=center).agg(func)
                    # pandas returns a series or a frame depending on func
                    if isinstance(expected, pd.Series):
                        pd.testing.assert_series_equal(result, expected)
                    else:
                        pd.testing.assert_frame_equal(result, expected)

    this = self

    class MockSession:
        # stand-in session that only carries the executor of this test
        def __init__(self):
            self.executor = this.executor

    ctx = LocalContext(MockSession())
    executor = ExecutorForTest('numpy', storage=ctx)
    with ctx:
        # rolling over a frame that was first filtered by a boolean mask
        df = md.DataFrame(raw, chunk_size=3)
        df = df[df.a > 0.5]
        r = df.rolling(3).agg('max')

        result = executor.execute_dataframes([r])[0]
        expected = raw[raw.a > 0.5].rolling(3).agg('max')
        pd.testing.assert_frame_equal(result, expected)

        # the same for a filtered series
        series = md.Series(s, chunk_size=3)
        series = series[series > 0.5]
        r = series.rolling(3).agg('max')

        result = executor.execute_dataframes([r])[0]
        expected = s[s > 0.5].rolling(3).agg('max')
        pd.testing.assert_series_equal(result, expected)

    # test agg functions
    df = md.DataFrame(raw, chunk_size=3)
    for func in [
            'count', 'sum', 'mean', 'median', 'min', 'max', 'skew', 'kurt'
    ]:
        r = getattr(df.rolling(4), func)()

        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = getattr(raw.rolling(4), func)()
        pd.testing.assert_frame_equal(result, expected)

    for func in ['std', 'var']:
        r = getattr(df.rolling(4), func)(ddof=0)

        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = getattr(raw.rolling(4), func)(ddof=0)
        pd.testing.assert_frame_equal(result, expected)
class Test(TestBase):
    # Tests that check which columns the read operands load (usecols /
    # columns) after tileable-graph optimization, and that results match
    # pandas.

    def setUp(self):
        # run the base-class setup as well before creating the executor
        super().setUp()
        self.executor = ExecutorForTest()

    @staticmethod
    def _sample_frame():
        # small fixed frame written to disk by the file-based tests below
        return pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce'),
            'd': list('abaaaddce')
        })

    def testGroupByPruneReadCSV(self):
        # groupby-agg on top of read_csv: compare with pandas and inspect the
        # usecols recorded on the optimized read operand.
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            df = self._sample_frame()
            df.to_csv(file_path, index=False)

            # Use test executor
            mdf = md.read_csv(file_path).groupby('c').agg({'a': 'sum'})
            result = self.executor.execute_dataframe(mdf)[0]
            expected = df.groupby('c').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(result, expected)

            mdf = md.read_csv(file_path).groupby('c').agg({'a': 'sum'})
            expected = df.groupby('c').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
            pd.testing.assert_frame_equal(mdf.fetch(), expected)

            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.usecols, ['a', 'c'])

            mdf = md.read_csv(file_path).groupby('c').agg({'b': 'sum'})
            expected = df.groupby('c').agg({'b': 'sum'})
            pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
            pd.testing.assert_frame_equal(mdf.fetch(), expected)

            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.usecols, ['b', 'c'])

            # an arithmetic step after the aggregation
            mdf = md.read_csv(file_path).groupby('c').agg({'b': 'sum'}) + 1
            expected = df.groupby('c').agg({'b': 'sum'}) + 1
            pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
            pd.testing.assert_frame_equal(mdf.fetch(), expected)

            # usecols given explicitly by the caller
            mdf = md.read_csv(file_path, usecols=['a', 'b', 'c']) \
                .groupby('c').agg({'b': 'sum'})
            expected = df.groupby('c').agg({'b': 'sum'})
            pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
            pd.testing.assert_frame_equal(mdf.fetch(), expected)

            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.usecols, ['b', 'c'])

            # two aggregations sharing one read_csv input
            in_df = md.read_csv(file_path)
            df1 = in_df.groupby('c').agg({'b': 'sum'})
            df2 = in_df.groupby('b').agg({'a': 'sum'})

            dfs = ExecutableTuple((df1, df2))
            results = dfs.execute().fetch()
            expected1 = df.groupby('c').agg({'b': 'sum'})
            expected2 = df.groupby('b').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(results[0], expected1)
            pd.testing.assert_frame_equal(results[1], expected2)

            # the read_csv input itself is fetched next to its aggregation
            in_df = md.read_csv(file_path)
            df1 = in_df.groupby('c').agg({'b': 'sum'})

            dfs = ExecutableTuple((in_df, df1))
            results = dfs.execute().fetch()
            expected1 = df.groupby('c').agg({'b': 'sum'})
            pd.testing.assert_frame_equal(results[0], df)
            pd.testing.assert_frame_equal(results[1], expected1)

            with option_context({'optimize_tileable_graph': False}):
                mdf = md.read_csv(file_path).groupby('c').agg({'b': 'sum'})
                expected = df.groupby('c').agg({'b': 'sum'})
                pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
                pd.testing.assert_frame_equal(mdf.fetch(), expected)

                tileable_graph = mdf.build_graph()
                first_node = list(tileable_graph.topological_iter())[0]
                self.assertIsNone(first_node.op.usecols)

    def testGroupbyPruneReadParquet(self):
        # Same idea for read_parquet: compare with pandas and inspect the
        # columns recorded on the optimized read operand.
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.parquet')

            df = self._sample_frame()
            df.to_parquet(file_path, index=False)

            # Use test executor
            mdf = md.read_parquet(file_path).groupby('c').agg({'a': 'sum'})
            result = self.executor.execute_dataframes([mdf])[0]
            mdf._shape = result.shape
            expected = df.groupby('c').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(result, expected)

            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.columns, ['a', 'c'])

            mdf = md.read_parquet(file_path).groupby(
                'c', as_index=False).c.agg({'cnt': 'count'})
            result = self.executor.execute_dataframes([mdf])[0]
            mdf._shape = result.shape
            expected = df.groupby('c', as_index=False).c.agg({'cnt': 'count'})
            pd.testing.assert_frame_equal(result, expected)

            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.columns, ['c'])

            # test getitem
            mdf = md.read_parquet(file_path)
            df1 = mdf.c.value_counts()
            df2 = mdf.groupby('b')['b'].count()
            results = self.executor.execute_dataframes([df1, df2])
            df1._shape = results[0].shape
            df2._shape = results[1].shape
            expected = df.c.value_counts(), df.groupby('b')['b'].count()
            pd.testing.assert_series_equal(results[0], expected[0])
            pd.testing.assert_series_equal(results[1], expected[1])

            optimized_df = tileable_optimized[df1.data]
            self.assertEqual(optimized_df.inputs[0].inputs[0].op.columns,
                             ['b', 'c'])

    def testPruneReadSQL(self):
        # Select columns from read_sql_table on a SQLite file and compare
        # with the same selection on the pandas frame that was written.
        test_df = pd.DataFrame({
            'a': np.arange(10).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(10)],
            'c': np.random.rand(10),
            'd': [
                datetime.fromtimestamp(time.time() + 3600 * (i - 5))
                for i in range(10)
            ]
        })

        with tempfile.TemporaryDirectory() as d:
            table_name = 'test'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')

            test_df.to_sql(table_name, uri, index=False)

            # test read df with columns
            r = md.read_sql_table(table_name, uri, chunk_size=4)[['a', 'b']]
            pd.testing.assert_frame_equal(r.to_pandas(), test_df[['a', 'b']])

            # test read series with columns
            r = md.read_sql_table(table_name, uri, chunk_size=4)['a']
            pd.testing.assert_series_equal(r.to_pandas(), test_df['a'])

    def testExecutedPruning(self):
        # After a pruned aggregation has run, the shared read_csv input must
        # still produce the full frame.
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')

            pd_df = self._sample_frame()
            pd_df.to_csv(file_path, index=False)

            in_df = md.read_csv(file_path)
            mdf = in_df.groupby('c').agg({'a': 'sum'})

            expected = pd_df.groupby('c').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(mdf.to_pandas(), expected)
            optimized_df = tileable_optimized[mdf.data]
            self.assertEqual(optimized_df.inputs[0].op.usecols, ['a', 'c'])

            # make sure in_df has correct columns
            pd.testing.assert_frame_equal(in_df.to_pandas(), pd_df)

            # skip pruning
            in_df = md.read_csv(file_path)
            df1 = in_df.groupby('d').agg({'b': 'min'})
            df2 = in_df[in_df.d.isin(df1.index)]

            expected1 = pd_df.groupby('d').agg({'b': 'min'})
            expected2 = pd_df[pd_df.d.isin(expected1.index)]

            pd.testing.assert_frame_equal(df2.to_pandas(), expected2)

    def testFetch(self):
        # After execute(), fetch() must not run read_csv again: a runner that
        # raises is registered for DataFrameReadCSV while fetching.
        with tempfile.TemporaryDirectory() as tempdir:
            filename = os.path.join(tempdir, 'test_fetch.csv')

            pd_df = self._sample_frame()
            pd_df.to_csv(filename, index=False)

            df = md.read_csv(filename)
            df2 = df.groupby('d').agg({'b': 'min'})
            expected = pd_df.groupby('d').agg({'b': 'min'})
            _ = df2.execute()

            def _execute_read_csv(*_):  # pragma: no cover
                raise ValueError('cannot run read_csv again')

            # register before entering try, so the cleanup below only runs
            # once the runner has actually been installed
            register(DataFrameReadCSV, _execute_read_csv)
            try:
                pd.testing.assert_frame_equal(df2.fetch(), expected)
                pd.testing.assert_frame_equal(df2.iloc[:3].fetch(),
                                              expected.iloc[:3])
            finally:
                del Executor._op_runners[DataFrameReadCSV]