class TestIndexReduction(TestBase):
    """Execution tests for reductions (min/max/all/any) over mars Index objects."""

    def setUp(self):
        self.executor = ExecutorForTest()

    def testIndexReduction(self):
        rs = np.random.RandomState(0)
        data = pd.Index(rs.randint(0, 5, (100,)))
        data2 = pd.Index(rs.randint(1, 6, (100,)))

        for method in ['min', 'max', 'all', 'any']:
            # For each raw index, check both the single-chunk layout and a
            # chunked layout against the pandas result.
            for raw in (data, data2):
                for kwargs in ({}, {'chunk_size': 10}):
                    idx = md.Index(raw, **kwargs)
                    reduced = getattr(idx, method)()
                    result = self.executor.execute_dataframe(
                        reduced, concat=True)[0]
                    self.assertEqual(result, getattr(raw, method)())
class TestCustomAggregate(TestBase):
    """Execution tests for user-defined (custom) aggregation callables."""

    def setUp(self):
        self.executor = ExecutorForTest()

    def testDataFrameAggregate(self):
        data = pd.DataFrame(np.random.rand(30, 20))

        # Single-chunk frame: exercise both custom reductions.
        df = md.DataFrame(data)
        for mock_cls in (MockReduction1, MockReduction2):
            result = df.agg(mock_cls())
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(result, concat=True)[0],
                data.agg(mock_cls()))

        # Chunked frame.
        # NOTE(review): the original runs MockReduction2 twice here and never
        # exercises MockReduction1 on a chunked frame -- possibly intentional
        # (MockReduction1 may not support chunked aggregation), possibly a
        # copy-paste slip; behavior preserved as-is.
        df = md.DataFrame(data, chunk_size=5)
        for _ in range(2):
            result = df.agg(MockReduction2())
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(result, concat=True)[0],
                data.agg(MockReduction2()))

    def testSeriesAggregate(self):
        data = pd.Series(np.random.rand(20))

        # Single-chunk series: exercise both custom reductions.
        s = md.Series(data)
        for mock_cls in (MockReduction1, MockReduction2):
            result = s.agg(mock_cls())
            self.assertEqual(
                self.executor.execute_dataframe(result, concat=True)[0],
                data.agg(mock_cls()))

        # Chunked series (same MockReduction2 duplication as above, preserved).
        s = md.Series(data, chunk_size=5)
        for _ in range(2):
            result = s.agg(MockReduction2())
            self.assertAlmostEqual(
                self.executor.execute_dataframe(result, concat=True)[0],
                data.agg(MockReduction2()))
class Test(TestBase):
    """Execution tests for writing dataframes to CSV."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testToCSVExecution(self):
        index = pd.RangeIndex(100, 0, -1, name='index')
        raw = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100,)),
            'col3': np.arange(100),
        }, index=index)
        df = DataFrame(raw, chunk_size=33)

        with tempfile.TemporaryDirectory() as base_path:
            # Case 1: everything written into a single file.
            path = os.path.join(base_path, 'out.csv')
            self.executor.execute_dataframe(df.to_csv(path))

            result = pd.read_csv(path, dtype=raw.dtypes.to_dict())
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw)

            # Case 2: '*' in the path -> one file per chunk
            # (100 rows / chunk_size 33 -> 4 chunks).
            path = os.path.join(base_path, 'out-*.csv')
            self.executor.execute_dataframe(df.to_csv(path))

            dfs = [
                pd.read_csv(os.path.join(base_path, 'out-{}.csv'.format(i)),
                            dtype=raw.dtypes.to_dict())
                for i in range(4)
            ]
            result = pd.concat(dfs, axis=0)
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw)
            # The second chunk must hold exactly rows 33..65 of the source.
            pd.testing.assert_frame_equal(
                dfs[1].set_index('index'), raw.iloc[33:66])
class TestUnary(TestBase):
    """Execution tests for unary dataframe operators."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testAbs(self):
        raw = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)))
        df = from_pandas(raw, chunk_size=5)

        # Builtin abs() should dispatch to the elementwise absolute value.
        result = self.executor.execute_dataframe(abs(df), concat=True)[0]
        expected = raw.abs()
        pd.testing.assert_frame_equal(expected, result)
class TestGPUReduction(TestBase):
    """Execution tests for reductions on GPU-backed (cudf/cupy) data.

    Requires a CUDA environment; GPU results are pulled back with
    ``to_pandas()`` / ``cp.asnumpy()`` before comparing with pandas.
    """

    def setUp(self):
        self.executor = ExecutorForTest()

    def testGPUExecution(self):
        df_raw = pd.DataFrame(np.random.rand(30, 3), columns=list('abc'))
        df = to_gpu(md.DataFrame(df_raw, chunk_size=6))

        r = df.sum()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), df_raw.sum())

        r = df.kurt()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), df_raw.kurt())

        r = df.agg(['sum', 'var'])
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(res.to_pandas(),
                                      df_raw.agg(['sum', 'var']))

        s_raw = pd.Series(np.random.rand(30))
        s = to_gpu(md.Series(s_raw, chunk_size=6))

        r = s.sum()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        self.assertAlmostEqual(res, s_raw.sum())

        r = s.kurt()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        self.assertAlmostEqual(res, s_raw.kurt())

        r = s.agg(['sum', 'var'])
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(),
                                       s_raw.agg(['sum', 'var']))

        # Duplicated values so unique() actually removes something.
        s_raw = pd.Series(np.random.randint(0, 3, size=(30,))
                          * np.random.randint(0, 5, size=(30,)))
        s = to_gpu(md.Series(s_raw, chunk_size=6))

        r = s.unique()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        # BUG FIX: ndarray.sort() sorts in place and returns None, so the
        # original assertion compared None with None and always passed.
        # np.sort() returns a sorted copy, giving an order-independent
        # comparison of the actual unique values.
        np.testing.assert_array_equal(np.sort(cp.asnumpy(res)),
                                      np.sort(s_raw.unique()))
class Test(TestBase): def setUp(self): super().setUp() self.executor = ExecutorForTest() @require_cudf def testToGPUExecution(self): pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1)) df = from_pandas_df(pdf, chunk_size=(13, 21)) cdf = to_gpu(df) res = self.executor.execute_dataframe(cdf, concat=True)[0] self.assertIsInstance(res, cudf.DataFrame) pd.testing.assert_frame_equal(res.to_pandas(), pdf) pseries = pdf.iloc[:, 0] series = from_pandas_series(pseries) cseries = series.to_gpu() res = self.executor.execute_dataframe(cseries, concat=True)[0] self.assertIsInstance(res, cudf.Series) pd.testing.assert_series_equal(res.to_pandas(), pseries) @require_cudf def testToCPUExecution(self): pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1)) df = from_pandas_df(pdf, chunk_size=(13, 21)) cdf = to_gpu(df) df2 = to_cpu(cdf) res = self.executor.execute_dataframe(df2, concat=True)[0] self.assertIsInstance(res, pd.DataFrame) pd.testing.assert_frame_equal(res, pdf) pseries = pdf.iloc[:, 0] series = from_pandas_series(pseries, chunk_size=(13, 21)) cseries = to_gpu(series) series2 = to_cpu(cseries) res = self.executor.execute_dataframe(series2, concat=True)[0] self.assertIsInstance(res, pd.Series) pd.testing.assert_series_equal(res, pseries) def testRechunkExecution(self): data = pd.DataFrame(np.random.rand(8, 10)) df = from_pandas_df(pd.DataFrame(data), chunk_size=3) df2 = df.rechunk((3, 4)) res = self.executor.execute_dataframe(df2, concat=True)[0] pd.testing.assert_frame_equal(data, res) data = pd.DataFrame(np.random.rand(10, 10), index=np.random.randint(-100, 100, size=(10,)), columns=[np.random.bytes(10) for _ in range(10)]) df = from_pandas_df(data) df2 = df.rechunk(5) res = self.executor.execute_dataframe(df2, concat=True)[0] pd.testing.assert_frame_equal(data, res) # test Series rechunk execution. 
data = pd.Series(np.random.rand(10,)) series = from_pandas_series(data) series2 = series.rechunk(3) res = self.executor.execute_dataframe(series2, concat=True)[0] pd.testing.assert_series_equal(data, res) series2 = series.rechunk(1) res = self.executor.execute_dataframe(series2, concat=True)[0] pd.testing.assert_series_equal(data, res) # test index rechunk execution data = pd.Index(np.random.rand(10,)) index = from_pandas_index(data) index2 = index.rechunk(3) res = self.executor.execute_dataframe(index2, concat=True)[0] pd.testing.assert_index_equal(data, res) index2 = index.rechunk(1) res = self.executor.execute_dataframe(index2, concat=True)[0] pd.testing.assert_index_equal(data, res) def testResetIndexExecution(self): data = pd.DataFrame([('bird', 389.0), ('bird', 24.0), ('mammal', 80.5), ('mammal', np.nan)], index=['falcon', 'parrot', 'lion', 'monkey'], columns=('class', 'max_speed')) df = from_pandas_df(data) df2 = df_reset_index(df) result = self.executor.execute_dataframe(df2, concat=True)[0] expected = data.reset_index() pd.testing.assert_frame_equal(result, expected) df = from_pandas_df(data, chunk_size=2) df2 = df_reset_index(df) result = self.executor.execute_dataframe(df2, concat=True)[0] expected = data.reset_index() pd.testing.assert_frame_equal(result, expected) df = from_pandas_df(data, chunk_size=1) df2 = df_reset_index(df, drop=True) result = self.executor.execute_dataframe(df2, concat=True)[0] expected = data.reset_index(drop=True) pd.testing.assert_frame_equal(result, expected) index = pd.MultiIndex.from_tuples([('bird', 'falcon'), ('bird', 'parrot'), ('mammal', 'lion'), ('mammal', 'monkey')], names=['class', 'name']) data = pd.DataFrame([('bird', 389.0), ('bird', 24.0), ('mammal', 80.5), ('mammal', np.nan)], index=index, columns=('type', 'max_speed')) df = from_pandas_df(data, chunk_size=1) df2 = df_reset_index(df, level='class') result = self.executor.execute_dataframe(df2, concat=True)[0] expected = data.reset_index(level='class') 
pd.testing.assert_frame_equal(result, expected) columns = pd.MultiIndex.from_tuples([('speed', 'max'), ('species', 'type')]) data.columns = columns df = from_pandas_df(data, chunk_size=2) df2 = df_reset_index(df, level='class', col_level=1, col_fill='species') result = self.executor.execute_dataframe(df2, concat=True)[0] expected = data.reset_index(level='class', col_level=1, col_fill='species') pd.testing.assert_frame_equal(result, expected) # Test Series s = pd.Series([1, 2, 3, 4], name='foo', index=pd.Index(['a', 'b', 'c', 'd'], name='idx')) series = from_pandas_series(s) s2 = series_reset_index(series, name='bar') result = self.executor.execute_dataframe(s2, concat=True)[0] expected = s.reset_index(name='bar') pd.testing.assert_frame_equal(result, expected) series = from_pandas_series(s, chunk_size=2) s2 = series_reset_index(series, drop=True) result = self.executor.execute_dataframe(s2, concat=True)[0] expected = s.reset_index(drop=True) pd.testing.assert_series_equal(result, expected) # Test Unknown shape sess = new_session() data1 = pd.DataFrame(np.random.rand(10, 3), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9]) df1 = from_pandas_df(data1, chunk_size=5) data2 = pd.DataFrame(np.random.rand(10, 3), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3]) df2 = from_pandas_df(data2, chunk_size=6) df = (df1 + df2).reset_index() result = sess.run(df) pd.testing.assert_index_equal(result.index, pd.RangeIndex(12)) # Inconsistent with Pandas when input dataframe's shape is unknown. 
result = result.sort_values(by=result.columns[0]) expected = (data1 + data2).reset_index() np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy()) data1 = pd.Series(np.random.rand(10,), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9]) series1 = from_pandas_series(data1, chunk_size=3) data2 = pd.Series(np.random.rand(10,), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3]) series2 = from_pandas_series(data2, chunk_size=3) df = (series1 + series2).reset_index() result = sess.run(df) pd.testing.assert_index_equal(result.index, pd.RangeIndex(12)) # Inconsistent with Pandas when input dataframe's shape is unknown. result = result.sort_values(by=result.columns[0]) expected = (data1 + data2).reset_index() np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy()) def testSeriesMapExecution(self): raw = pd.Series(np.arange(10)) s = from_pandas_series(raw, chunk_size=7) with self.assertRaises(ValueError): # cannot infer dtype, the inferred is int, # but actually it is float # just due to nan s.map({5: 10}) r = s.map({5: 10}, dtype=float) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map({5: 10}) pd.testing.assert_series_equal(result, expected) r = s.map({i: 10 + i for i in range(7)}, dtype=float) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map({i: 10 + i for i in range(7)}) pd.testing.assert_series_equal(result, expected) r = s.map({5: 10}, dtype=float, na_action='ignore') result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map({5: 10}, na_action='ignore') pd.testing.assert_series_equal(result, expected) # dtype can be inferred r = s.map({5: 10.}) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map({5: 10.}) pd.testing.assert_series_equal(result, expected) r = s.map(lambda x: x + 1, dtype=int) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map(lambda x: x + 1) pd.testing.assert_series_equal(result, expected) def f(x: int) -> float: 
return x + 1. # dtype can be inferred for function r = s.map(f) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map(lambda x: x + 1.) pd.testing.assert_series_equal(result, expected) # test arg is a md.Series raw2 = pd.Series([10], index=[5]) s2 = from_pandas_series(raw2) r = s.map(s2, dtype=float) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map(raw2) pd.testing.assert_series_equal(result, expected) # test arg is a md.Series, and dtype can be inferred raw2 = pd.Series([10.], index=[5]) s2 = from_pandas_series(raw2) r = s.map(s2) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map(raw2) pd.testing.assert_series_equal(result, expected) # test str raw = pd.Series(['a', 'b', 'c', 'd']) s = from_pandas_series(raw, chunk_size=2) r = s.map({'c': 'e'}) result = self.executor.execute_dataframe(r, concat=True)[0] expected = raw.map({'c': 'e'}) pd.testing.assert_series_equal(result, expected) def testDescribeExecution(self): s_raw = pd.Series(np.random.rand(10)) # test one chunk series = from_pandas_series(s_raw, chunk_size=10) r = series.describe() result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.describe() pd.testing.assert_series_equal(result, expected) r = series.describe(percentiles=[]) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.describe(percentiles=[]) pd.testing.assert_series_equal(result, expected) # test multi chunks series = from_pandas_series(s_raw, chunk_size=3) r = series.describe() result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.describe() pd.testing.assert_series_equal(result, expected) r = series.describe(percentiles=[]) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.describe(percentiles=[]) pd.testing.assert_series_equal(result, expected) df_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('abcd')) df_raw['e'] = np.random.randint(100, size=10) # test 
one chunk df = from_pandas_df(df_raw, chunk_size=10) r = df.describe() result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.describe() pd.testing.assert_frame_equal(result, expected) r = series.describe(percentiles=[], include=np.float64) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.describe(percentiles=[], include=np.float64) pd.testing.assert_series_equal(result, expected) # test multi chunks df = from_pandas_df(df_raw, chunk_size=3) r = df.describe() result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.describe() pd.testing.assert_frame_equal(result, expected) r = df.describe(percentiles=[], include=np.float64) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.describe(percentiles=[], include=np.float64) pd.testing.assert_frame_equal(result, expected) with self.assertRaises(ValueError): df.describe(percentiles=[1.1]) def testDataFrameFillNAExecution(self): df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ')) for _ in range(20): df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99) value_df_raw = pd.DataFrame(np.random.randint(0, 100, (10, 7)).astype(np.float32), columns=list('ABCDEFG')) # test DataFrame single chunk with numeric fill df = from_pandas_df(df_raw) r = df.fillna(1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(1) pd.testing.assert_frame_equal(result, expected) # test DataFrame single chunk with value as single chunk df = from_pandas_df(df_raw) value_df = from_pandas_df(value_df_raw) r = df.fillna(value_df) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(value_df_raw) pd.testing.assert_frame_equal(result, expected) # test chunked with numeric fill df = from_pandas_df(df_raw, chunk_size=3) r = df.fillna(1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(1) 
pd.testing.assert_frame_equal(result, expected) # test inplace tile df = from_pandas_df(df_raw, chunk_size=3) df.fillna(1, inplace=True) result = self.executor.execute_dataframe(df, concat=True)[0] expected = df_raw.fillna(1) pd.testing.assert_frame_equal(result, expected) # test forward fill in axis=0 without limit df = from_pandas_df(df_raw, chunk_size=3) r = df.fillna(method='pad') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(method='pad') pd.testing.assert_frame_equal(result, expected) # test backward fill in axis=0 without limit df = from_pandas_df(df_raw, chunk_size=3) r = df.fillna(method='backfill') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(method='backfill') pd.testing.assert_frame_equal(result, expected) # test forward fill in axis=1 without limit df = from_pandas_df(df_raw, chunk_size=3) r = df.ffill(axis=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.ffill(axis=1) pd.testing.assert_frame_equal(result, expected) # test backward fill in axis=1 without limit df = from_pandas_df(df_raw, chunk_size=3) r = df.bfill(axis=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.bfill(axis=1) pd.testing.assert_frame_equal(result, expected) # test fill with dataframe df = from_pandas_df(df_raw, chunk_size=3) value_df = from_pandas_df(value_df_raw, chunk_size=4) r = df.fillna(value_df) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(value_df_raw) pd.testing.assert_frame_equal(result, expected) # test fill with series value_series_raw = pd.Series(np.random.randint(0, 100, (10,)).astype(np.float32), index=list('ABCDEFGHIJ')) df = from_pandas_df(df_raw, chunk_size=3) value_series = from_pandas_series(value_series_raw, chunk_size=4) r = df.fillna(value_series) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.fillna(value_series_raw) 
pd.testing.assert_frame_equal(result, expected) def testSeriesFillNAExecution(self): series_raw = pd.Series(np.nan, index=range(20)) for _ in range(3): series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99) value_series_raw = pd.Series(np.random.randint(0, 100, (10,)).astype(np.float32)) series = from_pandas_series(series_raw) r = series.fillna(1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(1) pd.testing.assert_series_equal(result, expected) # test DataFrame single chunk with value as single chunk series = from_pandas_series(series_raw) value_series = from_pandas_series(value_series_raw) r = series.fillna(value_series) result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(value_series_raw) pd.testing.assert_series_equal(result, expected) # test chunked with numeric fill series = from_pandas_series(series_raw, chunk_size=3) r = series.fillna(1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(1) pd.testing.assert_series_equal(result, expected) # test inplace tile series = from_pandas_series(series_raw, chunk_size=3) series.fillna(1, inplace=True) result = self.executor.execute_dataframe(series, concat=True)[0] expected = series_raw.fillna(1) pd.testing.assert_series_equal(result, expected) # test forward fill in axis=0 without limit series = from_pandas_series(series_raw, chunk_size=3) r = series.fillna(method='pad') result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(method='pad') pd.testing.assert_series_equal(result, expected) # test backward fill in axis=0 without limit series = from_pandas_series(series_raw, chunk_size=3) r = series.fillna(method='backfill') result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(method='backfill') pd.testing.assert_series_equal(result, expected) # test fill with series series = from_pandas_series(series_raw, chunk_size=3) value_df = 
from_pandas_series(value_series_raw, chunk_size=4) r = series.fillna(value_df) result = self.executor.execute_dataframe(r, concat=True)[0] expected = series_raw.fillna(value_series_raw) pd.testing.assert_series_equal(result, expected) def testDataFrameApplyExecute(self): cols = [chr(ord('A') + i) for i in range(10)] df_raw = pd.DataFrame(dict((c, [i ** 2 for i in range(20)]) for c in cols)) old_chunk_store_limit = options.chunk_store_limit try: options.chunk_store_limit = 20 df = from_pandas_df(df_raw, chunk_size=5) r = df.apply('ffill') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply('ffill') pd.testing.assert_frame_equal(result, expected) r = df.apply(['sum', 'max']) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(['sum', 'max']) pd.testing.assert_frame_equal(result, expected) r = df.apply(np.sqrt) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(np.sqrt) pd.testing.assert_frame_equal(result, expected) r = df.apply(lambda x: pd.Series([1, 2])) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: pd.Series([1, 2])) pd.testing.assert_frame_equal(result, expected) r = df.apply(np.sum, axis='index') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(np.sum, axis='index') pd.testing.assert_series_equal(result, expected) r = df.apply(np.sum, axis='columns') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(np.sum, axis='columns') pd.testing.assert_series_equal(result, expected) r = df.apply(lambda x: [1, 2], axis=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: [1, 2], axis=1) pd.testing.assert_series_equal(result, expected) r = df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: pd.Series([1, 2], 
index=['foo', 'bar']), axis=1) pd.testing.assert_frame_equal(result, expected) r = df.apply(lambda x: [1, 2], axis=1, result_type='expand') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: [1, 2], axis=1, result_type='expand') pd.testing.assert_frame_equal(result, expected) r = df.apply(lambda x: list(range(10)), axis=1, result_type='reduce') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: list(range(10)), axis=1, result_type='reduce') pd.testing.assert_series_equal(result, expected) r = df.apply(lambda x: list(range(10)), axis=1, result_type='broadcast') result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.apply(lambda x: list(range(10)), axis=1, result_type='broadcast') pd.testing.assert_frame_equal(result, expected) finally: options.chunk_store_limit = old_chunk_store_limit def testSeriesApplyExecute(self): idxes = [chr(ord('A') + i) for i in range(20)] s_raw = pd.Series([i ** 2 for i in range(20)], index=idxes) series = from_pandas_series(s_raw, chunk_size=5) r = series.apply('add', args=(1,)) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.apply('add', args=(1,)) pd.testing.assert_series_equal(result, expected) r = series.apply(['sum', 'max']) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.apply(['sum', 'max']) pd.testing.assert_series_equal(result, expected) r = series.apply(np.sqrt) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.apply(np.sqrt) pd.testing.assert_series_equal(result, expected) r = series.apply('sqrt') result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.apply('sqrt') pd.testing.assert_series_equal(result, expected) r = series.apply(lambda x: [x, x + 1], convert_dtype=False) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.apply(lambda x: [x, x + 1], convert_dtype=False) 
pd.testing.assert_series_equal(result, expected) def testTransformExecute(self): cols = [chr(ord('A') + i) for i in range(10)] df_raw = pd.DataFrame(dict((c, [i ** 2 for i in range(20)]) for c in cols)) idx_vals = [chr(ord('A') + i) for i in range(20)] s_raw = pd.Series([i ** 2 for i in range(20)], index=idx_vals) def rename_fn(f, new_name): f.__name__ = new_name return f old_chunk_store_limit = options.chunk_store_limit try: options.chunk_store_limit = 20 # DATAFRAME CASES df = from_pandas_df(df_raw, chunk_size=5) # test transform scenarios on data frames r = df.transform(lambda x: list(range(len(x)))) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.transform(lambda x: list(range(len(x)))) pd.testing.assert_frame_equal(result, expected) r = df.transform(lambda x: list(range(len(x))), axis=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.transform(lambda x: list(range(len(x))), axis=1) pd.testing.assert_frame_equal(result, expected) r = df.transform(['cumsum', 'cummax', lambda x: x + 1]) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.transform(['cumsum', 'cummax', lambda x: x + 1]) pd.testing.assert_frame_equal(result, expected) fn_dict = OrderedDict([ ('A', 'cumsum'), ('D', ['cumsum', 'cummax']), ('F', lambda x: x + 1), ]) r = df.transform(fn_dict) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.transform(fn_dict) pd.testing.assert_frame_equal(result, expected) # test agg scenarios on series r = df.transform(lambda x: x.iloc[:-1], _call_agg=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.agg(lambda x: x.iloc[:-1]) pd.testing.assert_frame_equal(result, expected) r = df.transform(lambda x: x.iloc[:-1], axis=1, _call_agg=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.agg(lambda x: x.iloc[:-1], axis=1) pd.testing.assert_frame_equal(result, expected) fn_list = 
[rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'), lambda x: x.iloc[:-1].reset_index(drop=True)] r = df.transform(fn_list, _call_agg=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.agg(fn_list) pd.testing.assert_frame_equal(result, expected) r = df.transform(lambda x: x.sum(), _call_agg=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.agg(lambda x: x.sum()) pd.testing.assert_series_equal(result, expected) fn_dict = OrderedDict([ ('A', rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1')), ('D', [rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'), lambda x: x.iloc[:-1].reset_index(drop=True)]), ('F', lambda x: x.iloc[:-1].reset_index(drop=True)), ]) r = df.transform(fn_dict, _call_agg=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = df_raw.agg(fn_dict) pd.testing.assert_frame_equal(result, expected) # SERIES CASES series = from_pandas_series(s_raw, chunk_size=5) # test transform scenarios on series r = series.transform(lambda x: x + 1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.transform(lambda x: x + 1) pd.testing.assert_series_equal(result, expected) r = series.transform(['cumsum', lambda x: x + 1]) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_raw.transform(['cumsum', lambda x: x + 1]) pd.testing.assert_frame_equal(result, expected) finally: options.chunk_store_limit = old_chunk_store_limit def testStringMethodExecution(self): s = pd.Series(['s1,s2', 'ef,', 'dd', np.nan]) s2 = pd.concat([s, s, s]) series = from_pandas_series(s, chunk_size=2) series2 = from_pandas_series(s2, chunk_size=2) # test getitem r = series.str[:3] result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str[:3] pd.testing.assert_series_equal(result, expected) # test split, expand=False r = series.str.split(',', n=2) result = self.executor.execute_dataframe(r, concat=True)[0] expected = 
s.str.split(',', n=2) pd.testing.assert_series_equal(result, expected) # test split, expand=True r = series.str.split(',', expand=True, n=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.split(',', expand=True, n=1) pd.testing.assert_frame_equal(result, expected) # test rsplit r = series.str.rsplit(',', expand=True, n=1) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.rsplit(',', expand=True, n=1) pd.testing.assert_frame_equal(result, expected) # test cat all data r = series2.str.cat(sep='/', na_rep='e') result = self.executor.execute_dataframe(r, concat=True)[0] expected = s2.str.cat(sep='/', na_rep='e') self.assertEqual(result, expected) # test cat list r = series.str.cat(['a', 'b', np.nan, 'c']) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.cat(['a', 'b', np.nan, 'c']) pd.testing.assert_series_equal(result, expected) # test cat series r = series.str.cat(series.str.capitalize(), join='outer') result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.cat(s.str.capitalize(), join='outer') pd.testing.assert_series_equal(result, expected) # test extractall r = series.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)") result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)") pd.testing.assert_frame_equal(result, expected) # test extract, expand=False r = series.str.extract(r'[ab](\d)', expand=False) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.extract(r'[ab](\d)', expand=False) pd.testing.assert_series_equal(result, expected) # test extract, expand=True r = series.str.extract(r'[ab](\d)', expand=True) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s.str.extract(r'[ab](\d)', expand=True) pd.testing.assert_frame_equal(result, expected) def testDatetimeMethodExecution(self): # test datetime s = pd.Series([pd.Timestamp('2020-1-1'), 
# NOTE(review): the three statements below are the tail of a datetime-accessor
# test whose opening `s = pd.Series([...` lies before this chunk; kept verbatim.
pd.Timestamp('2020-2-1'), np.nan])
series = from_pandas_series(s, chunk_size=2)

r = series.dt.year
result = self.executor.execute_dataframe(r, concat=True)[0]
expected = s.dt.year
pd.testing.assert_series_equal(result, expected)

r = series.dt.strftime('%m-%d-%Y')
result = self.executor.execute_dataframe(r, concat=True)[0]
expected = s.dt.strftime('%m-%d-%Y')
pd.testing.assert_series_equal(result, expected)

# test timedelta
s = pd.Series([pd.Timedelta('1 days'), pd.Timedelta('3 days'), np.nan])
series = from_pandas_series(s, chunk_size=2)

r = series.dt.days
result = self.executor.execute_dataframe(r, concat=True)[0]
expected = s.dt.days
pd.testing.assert_series_equal(result, expected)

def testSeriesIsin(self):
    """Check Series.isin against pandas for every chunking combination of
    the left Series and the values argument (Series / ndarray / tensor / set)."""
    # one chunk in multiple chunks
    a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    b = pd.Series([2, 1, 9, 3])
    sa = from_pandas_series(a, chunk_size=10)
    sb = from_pandas_series(b, chunk_size=2)

    result = self.executor.execute_dataframe(sa.isin(sb), concat=True)[0]
    expected = a.isin(b)
    pd.testing.assert_series_equal(result, expected)

    # multiple chunk in one chunks
    a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    b = pd.Series([2, 1, 9, 3])
    sa = from_pandas_series(a, chunk_size=2)
    sb = from_pandas_series(b, chunk_size=4)

    result = self.executor.execute_dataframe(sa.isin(sb), concat=True)[0]
    expected = a.isin(b)
    pd.testing.assert_series_equal(result, expected)

    # multiple chunk in multiple chunks
    a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    b = pd.Series([2, 1, 9, 3])
    sa = from_pandas_series(a, chunk_size=2)
    sb = from_pandas_series(b, chunk_size=2)

    result = self.executor.execute_dataframe(sa.isin(sb), concat=True)[0]
    expected = a.isin(b)
    pd.testing.assert_series_equal(result, expected)

    # values given as a raw (non-mars) pandas Series
    a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    b = pd.Series([2, 1, 9, 3])
    sa = from_pandas_series(a, chunk_size=2)

    result = self.executor.execute_dataframe(sa.isin(b), concat=True)[0]
    expected = a.isin(b)
    pd.testing.assert_series_equal(result, expected)

    # values given as a mars tensor built from an ndarray
    a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    b = np.array([2, 1, 9, 3])
    sa = from_pandas_series(a, chunk_size=2)
    sb = tensor(b, chunk_size=3)

    result = self.executor.execute_dataframe(sa.isin(sb), concat=True)[0]
    expected = a.isin(b)
    pd.testing.assert_series_equal(result, expected)

    a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    b = {2, 1, 9, 3}  # set
    sa = from_pandas_series(a, chunk_size=2)

    result = self.executor.execute_dataframe(sa.isin(b), concat=True)[0]
    expected = a.isin(b)
    pd.testing.assert_series_equal(result, expected)

def testCheckNA(self):
    """Check isna()/notna() on chunked DataFrame and Series against pandas,
    using frames/series that are mostly NaN with a few random integers."""
    df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ'))
    for _ in range(20):
        df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99)

    df = from_pandas_df(df_raw, chunk_size=4)

    pd.testing.assert_frame_equal(self.executor.execute_dataframe(df.isna(), concat=True)[0],
                                  df_raw.isna())
    pd.testing.assert_frame_equal(self.executor.execute_dataframe(df.notna(), concat=True)[0],
                                  df_raw.notna())

    series_raw = pd.Series(np.nan, index=range(20))
    for _ in range(3):
        series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)

    series = from_pandas_series(series_raw, chunk_size=4)

    pd.testing.assert_series_equal(self.executor.execute_dataframe(series.isna(), concat=True)[0],
                                   series_raw.isna())
    pd.testing.assert_series_equal(self.executor.execute_dataframe(series.notna(), concat=True)[0],
                                   series_raw.notna())

def testDropNA(self):
    """Check dropna() variants (how=, subset=, inplace=) on chunked frames
    and series against pandas."""
    # dataframe cases
    df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ'))
    for _ in range(30):
        df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99)
    # make sure several rows are fully non-NaN so dropna() keeps something
    for rowid in range(random.randint(1, 5)):
        row = random.randint(0, 19)
        for idx in range(0, 10):
            df_raw.iloc[row, idx] = random.randint(0, 99)

    # only one chunk in columns, can run dropna directly
    r = from_pandas_df(df_raw, chunk_size=(4, 10)).dropna()
    pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                  df_raw.dropna())

    # multiple chunks in columns, count() will be called first
    r = from_pandas_df(df_raw, chunk_size=4).dropna()
    pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                  df_raw.dropna())

    r = from_pandas_df(df_raw, chunk_size=4).dropna(how='all')
    pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                  df_raw.dropna(how='all'))

    r = from_pandas_df(df_raw, chunk_size=4).dropna(subset=list('ABFI'))
    pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                  df_raw.dropna(subset=list('ABFI')))

    r = from_pandas_df(df_raw, chunk_size=4).dropna(how='all', subset=list('BDHJ'))
    pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                  df_raw.dropna(how='all', subset=list('BDHJ')))

    # inplace variant mutates the mars dataframe in place
    r = from_pandas_df(df_raw, chunk_size=4)
    r.dropna(how='all', inplace=True)
    pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                  df_raw.dropna(how='all'))

    # series cases
    series_raw = pd.Series(np.nan, index=range(20))
    for _ in range(10):
        series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)

    r = from_pandas_series(series_raw, chunk_size=4).dropna()
    pd.testing.assert_series_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                   series_raw.dropna())

    r = from_pandas_series(series_raw, chunk_size=4)
    r.dropna(inplace=True)
    pd.testing.assert_series_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                   series_raw.dropna())

def testCutExecution(self):
    """Check cut() against pandas for Series/tensor/ndarray inputs, with
    list / tensor / IntervalIndex bins, labels variants, retbins, and
    duplicate bin edges; integer-bin cases run inside a test context."""
    rs = np.random.RandomState(0)
    raw = rs.random(15) * 1000
    s = pd.Series(raw, index=['i{}'.format(i) for i in range(15)])
    bins = [10, 100, 500]
    ii = pd.interval_range(10, 500, 3)
    labels = ['a', 'b']

    t = tensor(raw, chunk_size=4)
    series = from_pandas_series(s, chunk_size=4)
    iii = from_pandas_index(ii, chunk_size=2)

    # cut on Series
    r = cut(series, bins)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_series_equal(result, pd.cut(s, bins))

    r, b = cut(series, bins, retbins=True)
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_tensor(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, bins, retbins=True)
    pd.testing.assert_series_equal(r_result, r_expected)
    np.testing.assert_array_equal(b_result, b_expected)

    # cut on tensor
    r = cut(t, bins)
    # result and expected is array whose dtype is CategoricalDtype
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = pd.cut(raw, bins)
    self.assertEqual(len(result), len(expected))
    for r, e in zip(result, expected):
        np.testing.assert_equal(r, e)

    # one chunk
    r = cut(s, tensor(bins, chunk_size=2), right=False, include_lowest=True)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    pd.testing.assert_series_equal(result, pd.cut(s, bins, right=False, include_lowest=True))

    # test labels
    r = cut(t, bins, labels=labels)
    # result and expected is array whose dtype is CategoricalDtype
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = pd.cut(raw, bins, labels=labels)
    self.assertEqual(len(result), len(expected))
    for r, e in zip(result, expected):
        np.testing.assert_equal(r, e)

    r = cut(t, bins, labels=False)
    # result and expected is array whose dtype is CategoricalDtype
    result = self.executor.execute_tensor(r, concat=True)[0]
    expected = pd.cut(raw, bins, labels=False)
    np.testing.assert_array_equal(result, expected)

    # test labels which is tensor
    labels_t = tensor(['a', 'b'], chunk_size=1)
    r = cut(raw, bins, labels=labels_t, include_lowest=True)
    # result and expected is array whose dtype is CategoricalDtype
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = pd.cut(raw, bins, labels=labels, include_lowest=True)
    self.assertEqual(len(result), len(expected))
    for r, e in zip(result, expected):
        np.testing.assert_equal(r, e)

    # test labels=False
    r, b = cut(raw, ii, labels=False, retbins=True)
    # result and expected is array whose dtype is CategoricalDtype
    r_result = self.executor.execute_tileable(r, concat=True)[0]
    b_result = self.executor.execute_tileable(b, concat=True)[0]
    r_expected, b_expected = pd.cut(raw, ii, labels=False, retbins=True)
    for r, e in zip(r_result, r_expected):
        np.testing.assert_equal(r, e)
    pd.testing.assert_index_equal(b_result, b_expected)

    # test bins which is md.IntervalIndex
    r, b = cut(series, iii, labels=tensor(labels, chunk_size=1), retbins=True)
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_dataframe(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, ii, labels=labels, retbins=True)
    pd.testing.assert_series_equal(r_result, r_expected)
    pd.testing.assert_index_equal(b_result, b_expected)

    # test duplicates
    bins2 = [0, 2, 4, 6, 10, 10]
    r, b = cut(s, bins2, labels=False, retbins=True, right=False, duplicates='drop')
    r_result = self.executor.execute_dataframe(r, concat=True)[0]
    b_result = self.executor.execute_tensor(b, concat=True)[0]
    r_expected, b_expected = pd.cut(s, bins2, labels=False, retbins=True,
                                    right=False, duplicates='drop')
    pd.testing.assert_series_equal(r_result, r_expected)
    np.testing.assert_array_equal(b_result, b_expected)

    # integer-bin cases need min/max to be computed first, hence the context
    ctx, executor = self._create_test_context(self.executor)
    with ctx:
        # test integer bins
        r = cut(series, 3)
        result = executor.execute_dataframes([r])[0]
        pd.testing.assert_series_equal(result, pd.cut(s, 3))

        r, b = cut(series, 3, right=False, retbins=True)
        r_result, b_result = executor.execute_dataframes([r, b])
        r_expected, b_expected = pd.cut(s, 3, right=False, retbins=True)
        pd.testing.assert_series_equal(r_result, r_expected)
        np.testing.assert_array_equal(b_result, b_expected)

        # test min max same
        s2 = pd.Series([1.1] * 15)
        r = cut(s2, 3)
        result = executor.execute_dataframes([r])[0]
        pd.testing.assert_series_equal(result, pd.cut(s2, 3))

        # test inf exist
        s3 = s2.copy()
        s3[-1] = np.inf
        with self.assertRaises(ValueError):
            executor.execute_dataframes([cut(s3, 3)])

def testShiftExecution(self):
    """Check shift()/tshift() on chunked frames and series against pandas,
    sweeping periods, axis, fill_value and freq; failures re-raise with the
    parameter combination in the message."""
    # test dataframe
    rs = np.random.RandomState(0)
    raw = pd.DataFrame(rs.randint(1000, size=(10, 8)),
                       columns=['col' + str(i + 1) for i in range(8)])
    df = from_pandas_df(raw, chunk_size=5)

    for periods in (2, -2, 6, -6):
        for axis in (0, 1):
            for fill_value in (None, 0, 1.):
                r = df.shift(periods=periods, axis=axis, fill_value=fill_value)

                try:
                    result = self.executor.execute_dataframe(r, concat=True)[0]
                    expected = raw.shift(periods=periods, axis=axis, fill_value=fill_value)
                    pd.testing.assert_frame_equal(result, expected)
                except AssertionError as e:  # pragma: no cover
                    raise AssertionError(
                        'Failed when periods: {}, axis: {}, fill_value: {}'.format(
                            periods, axis, fill_value
                        )) from e

    raw2 = raw.copy()
    raw2.index = pd.date_range('2020-1-1', periods=10)
    raw2.columns = pd.date_range('2020-3-1', periods=8)

    df2 = from_pandas_df(raw2, chunk_size=5)

    # test freq not None
    for periods in (2, -2):
        for axis in (0, 1):
            for fill_value in (None, 0, 1.):
                r = df2.shift(periods=periods, freq='D', axis=axis, fill_value=fill_value)

                try:
                    result = self.executor.execute_dataframe(r, concat=True)[0]
                    expected = raw2.shift(periods=periods, freq='D', axis=axis,
                                          fill_value=fill_value)
                    pd.testing.assert_frame_equal(result, expected)
                except AssertionError as e:  # pragma: no cover
                    raise AssertionError(
                        'Failed when periods: {}, axis: {}, fill_value: {}'.format(
                            periods, axis, fill_value
                        )) from e

    # test tshift
    r = df2.tshift(periods=1)
    result = self.executor.execute_dataframe(r, concat=True)[0]
    expected = raw2.tshift(periods=1)
    pd.testing.assert_frame_equal(result, expected)

    # tshift requires a datetime-like index; `df` has a plain index
    with self.assertRaises(ValueError):
        _ = df.tshift(periods=1)

    # test series
    s = raw.iloc[:, 0]
    series = from_pandas_series(s, chunk_size=5)

    for periods in (0, 2, -2, 6, -6):
        for fill_value in (None, 0, 1.):
            r = series.shift(periods=periods, fill_value=fill_value)

            try:
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = s.shift(periods=periods, fill_value=fill_value)
                pd.testing.assert_series_equal(result, expected)
            except AssertionError as e:  # pragma: no cover
                raise AssertionError(
                    'Failed when periods: {}, fill_value: {}'.format(
                        periods, fill_value
                    )) from e

    s2 = raw2.iloc[:, 0]

    # test freq not None
    series2 = from_pandas_series(s2, chunk_size=5)

    for periods in (2, -2):
        for fill_value in (None, 0, 1.):
            r = series2.shift(periods=periods, freq='D', fill_value=fill_value)

            try:
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = s2.shift(periods=periods, freq='D', fill_value=fill_value)
                pd.testing.assert_series_equal(result, expected)
            except AssertionError as e:  # pragma: no cover
                raise AssertionError(
                    'Failed when periods: {}, fill_value: {}'.format(
                        periods, fill_value
                    )) from e
class Test(TestBase):
    """Groupby execution tests: each case builds a chunked mars frame/series,
    runs a groupby operation, and compares the result with pandas."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest('numpy')
        # keep a test context open for the whole test; closed in tearDown
        self.ctx, self.executor = self._create_test_context(self.executor)
        self.ctx.__enter__()

    def tearDown(self) -> None:
        self.ctx.__exit__(None, None, None)

    def testGroupBy(self):
        """Plain groupby objects: by column, by series, by multiple series,
        by index level and by callable, on frames and series."""
        df1 = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce')
        })
        mdf = md.DataFrame(df1, chunk_size=3)
        grouped = mdf.groupby('b')
        assert_groupby_equal(
            self.executor.execute_dataframe(grouped, concat=True)[0],
            df1.groupby('b'))

        df2 = pd.DataFrame(
            {
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce')
            }, index=['i' + str(i) for i in range(9)])
        mdf = md.DataFrame(df2, chunk_size=3)
        grouped = mdf.groupby('b')
        assert_groupby_equal(
            self.executor.execute_dataframe(grouped, concat=True)[0],
            df2.groupby('b'))

        # test groupby series
        grouped = mdf.groupby(mdf['b'])
        assert_groupby_equal(
            self.executor.execute_dataframe(grouped, concat=True)[0],
            df2.groupby(df2['b']))

        # test groupby multiple series
        grouped = mdf.groupby(by=[mdf['b'], mdf['c']])
        assert_groupby_equal(
            self.executor.execute_dataframe(grouped, concat=True)[0],
            df2.groupby(by=[df2['b'], df2['c']]))

        df3 = pd.DataFrame(
            {
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce')
            },
            index=pd.MultiIndex.from_tuples([(i % 3, 'i' + str(i)) for i in range(9)]))
        mdf = md.DataFrame(df3, chunk_size=3)
        grouped = mdf.groupby(level=0)
        assert_groupby_equal(
            self.executor.execute_dataframe(grouped, concat=True)[0],
            df3.groupby(level=0))

        series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
        ms1 = md.Series(series1, chunk_size=3)
        grouped = ms1.groupby(lambda x: x % 3)
        assert_groupby_equal(
            self.executor.execute_dataframe(grouped, concat=True)[0],
            series1.groupby(lambda x: x % 3))

        # test groupby series
        grouped = ms1.groupby(ms1)
        assert_groupby_equal(
            self.executor.execute_dataframe(grouped, concat=True)[0],
            series1.groupby(series1))

        series2 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3],
                            index=['i' + str(i) for i in range(9)])
        ms2 = md.Series(series2, chunk_size=3)
        grouped = ms2.groupby(lambda x: int(x[1:]) % 3)
        assert_groupby_equal(
            self.executor.execute_dataframe(grouped, concat=True)[0],
            series2.groupby(lambda x: int(x[1:]) % 3))

    def testGroupByGetItem(self):
        """Column selection on groupby objects (`[['a', 'b']]` / `.a`)
        followed by agg / apply / transform / cum operations."""
        df1 = pd.DataFrame(
            {
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce')
            },
            index=pd.MultiIndex.from_tuples([(i % 3, 'i' + str(i)) for i in range(9)]))
        mdf = md.DataFrame(df1, chunk_size=3)

        r = mdf.groupby(level=0)[['a', 'b']]
        assert_groupby_equal(self.executor.execute_dataframe(r, concat=True)[0],
                             df1.groupby(level=0)[['a', 'b']], with_selection=True)

        r = mdf.groupby(level=0)[['a', 'b']].sum(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            df1.groupby(level=0)[['a', 'b']].sum())

        r = mdf.groupby(level=0)[['a', 'b']].apply(lambda x: x + 1)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby(level=0)[['a', 'b']].apply(lambda x: x + 1).sort_index())

        r = mdf.groupby('b')[['a', 'b']]
        assert_groupby_equal(self.executor.execute_dataframe(r, concat=True)[0],
                             df1.groupby('b')[['a', 'b']], with_selection=True)

        r = mdf.groupby('b')[['a', 'c']]
        assert_groupby_equal(self.executor.execute_dataframe(r, concat=True)[0],
                             df1.groupby('b')[['a', 'c']], with_selection=True)

        r = mdf.groupby('b')[['a', 'b']].sum(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            df1.groupby('b')[['a', 'b']].sum())

        r = mdf.groupby('b')[['a', 'b']].agg(['sum', 'count'], method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            df1.groupby('b')[['a', 'b']].agg(['sum', 'count']))

        r = mdf.groupby('b')[['a', 'c']].agg(['sum', 'count'], method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            df1.groupby('b')[['a', 'c']].agg(['sum', 'count']))

        r = mdf.groupby('b')[['a', 'b']].apply(lambda x: x + 1)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby('b')[['a', 'b']].apply(lambda x: x + 1).sort_index())

        r = mdf.groupby('b')[['a', 'b']].transform(lambda x: x + 1)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby('b')[['a', 'b']].transform(lambda x: x + 1).sort_index())

        r = mdf.groupby('b')[['a', 'b']].cumsum()
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby('b')[['a', 'b']].cumsum().sort_index())

        # attribute-style single-column selection
        r = mdf.groupby('b').a
        assert_groupby_equal(self.executor.execute_dataframe(r, concat=True)[0],
                             df1.groupby('b').a, with_selection=True)

        r = mdf.groupby('b').a.sum(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            df1.groupby('b').a.sum())

        r = mdf.groupby('b').a.agg(['sum', 'mean', 'var'], method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            df1.groupby('b').a.agg(['sum', 'mean', 'var']))

        r = mdf.groupby('b').a.apply(lambda x: x + 1)
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby('b').a.apply(lambda x: x + 1).sort_index())

        r = mdf.groupby('b').a.transform(lambda x: x + 1)
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby('b').a.transform(lambda x: x + 1).sort_index())

        r = mdf.groupby('b').a.cumsum()
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby('b').a.cumsum().sort_index())

    def testDataFrameGroupByAgg(self):
        """DataFrame groupby aggregation: every built-in agg under both
        'tree' and 'shuffle' methods, dict/list agg specs, as_index=False,
        cumulative aggs, and the 'auto' method selection."""
        rs = np.random.RandomState(0)
        df1 = pd.DataFrame({
            'a': rs.choice([2, 3, 4], size=(100, )),
            'b': rs.choice([2, 3, 4], size=(100, ))
        })
        mdf = md.DataFrame(df1, chunk_size=3)
        df2 = pd.DataFrame({
            'c1': np.arange(10).astype(np.int64),
            'c2': rs.choice(['a', 'b', 'c'], (10, )),
            'c3': rs.rand(10)
        })
        mdf2 = md.DataFrame(df2, chunk_size=2)

        for method in ['tree', 'shuffle']:
            r0 = mdf2.groupby('c2').agg('size', method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r0, concat=True)[0],
                df2.groupby('c2').agg('size'))

            r1 = mdf.groupby('a').agg('sum', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r1, concat=True)[0],
                df1.groupby('a').agg('sum'))

            r2 = mdf.groupby('b').agg('min', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r2, concat=True)[0],
                df1.groupby('b').agg('min'))

            r1 = mdf2.groupby('c2').agg('prod', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r1, concat=True)[0],
                df2.groupby('c2').agg('prod'))

            r2 = mdf2.groupby('c2').agg('max', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r2, concat=True)[0],
                df2.groupby('c2').agg('max'))

            r3 = mdf2.groupby('c2').agg('count', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby('c2').agg('count'))

            r4 = mdf2.groupby('c2').agg('mean', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r4, concat=True)[0],
                df2.groupby('c2').agg('mean'))

            r5 = mdf2.groupby('c2').agg('var', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r5, concat=True)[0],
                df2.groupby('c2').agg('var'))

            r6 = mdf2.groupby('c2').agg('std', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r6, concat=True)[0],
                df2.groupby('c2').agg('std'))

            agg = ['std', 'mean', 'var', 'max', 'count', 'size']
            r3 = mdf2.groupby('c2').agg(agg, method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby('c2').agg(agg))

            agg = OrderedDict([('c1', ['min', 'mean']), ('c3', 'std')])
            r3 = mdf2.groupby('c2').agg(agg, method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby('c2').agg(agg))

            agg = OrderedDict([('c1', 'min'), ('c3', 'sum')])
            r3 = mdf2.groupby('c2').agg(agg, method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby('c2').agg(agg))

            r3 = mdf2.groupby('c2').agg({'c1': 'min'}, method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby('c2').agg({'c1': 'min'}))

            # test groupby series
            r3 = mdf2.groupby(mdf2['c2']).sum(method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby(df2['c2']).sum())

        # convenience aggregation methods with method='tree'
        r8 = mdf2.groupby('c2').size(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r8, concat=True)[0],
            df2.groupby('c2').size())

        r4 = mdf2.groupby('c2').sum(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r4, concat=True)[0],
            df2.groupby('c2').sum())

        r5 = mdf2.groupby('c2').prod(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r5, concat=True)[0],
            df2.groupby('c2').prod())

        r6 = mdf2.groupby('c2').min(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r6, concat=True)[0],
            df2.groupby('c2').min())

        r7 = mdf2.groupby('c2').max(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r7, concat=True)[0],
            df2.groupby('c2').max())

        r8 = mdf2.groupby('c2').count(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r8, concat=True)[0],
            df2.groupby('c2').count())

        r9 = mdf2.groupby('c2').mean(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r9, concat=True)[0],
            df2.groupby('c2').mean())

        r10 = mdf2.groupby('c2').var(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r10, concat=True)[0],
            df2.groupby('c2').var())

        r11 = mdf2.groupby('c2').std(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r11, concat=True)[0],
            df2.groupby('c2').std())

        # test as_index=False
        r12 = mdf2.groupby('c2', as_index=False).agg('mean', method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r12, concat=True)[0],
            df2.groupby('c2', as_index=False).agg('mean'))
        self.assertFalse(r12.op.groupby_params['as_index'])

        # test as_index=False takes no effect
        r13 = mdf2.groupby(['c1', 'c2'], as_index=False).agg(['mean', 'count'],
                                                             method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r13, concat=True)[0],
            df2.groupby(['c1', 'c2'], as_index=False).agg(['mean', 'count']))
        self.assertTrue(r13.op.groupby_params['as_index'])

        r14 = mdf2.groupby('c2').agg(['cumsum', 'cumcount'], method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r14, concat=True)[0].sort_index(),
            df2.groupby('c2').agg(['cumsum', 'cumcount']).sort_index())

        # test auto method: 'auto' should not introduce shuffle chunks here
        r15 = mdf2.groupby('c2').agg('prod')
        self.assertEqual(r15.op.method, 'auto')
        self.assertTrue(
            all((not isinstance(c.op, ShuffleProxy))
                for c in r15.build_graph(tiled=True)))

    def testSeriesGroupByAgg(self):
        """Series groupby aggregation mirror of testDataFrameGroupByAgg:
        built-in aggs under 'tree'/'shuffle', list specs and cumulative aggs."""
        rs = np.random.RandomState(0)
        series1 = pd.Series(rs.rand(10))
        ms1 = md.Series(series1, chunk_size=3)

        for method in ['tree', 'shuffle']:
            r0 = ms1.groupby(lambda x: x % 2).agg('size', method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r0, concat=True)[0],
                series1.groupby(lambda x: x % 2).agg('size'))

            r1 = ms1.groupby(lambda x: x % 2).agg('sum', method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r1, concat=True)[0],
                series1.groupby(lambda x: x % 2).agg('sum'))

            r2 = ms1.groupby(lambda x: x % 2).agg('min', method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r2, concat=True)[0],
                series1.groupby(lambda x: x % 2).agg('min'))

            r1 = ms1.groupby(lambda x: x % 2).agg('prod', method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r1, concat=True)[0],
                series1.groupby(lambda x: x % 2).agg('prod'))

            r2 = ms1.groupby(lambda x: x % 2).agg('max', method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r2, concat=True)[0],
                series1.groupby(lambda x: x % 2).agg('max'))

            r3 = ms1.groupby(lambda x: x % 2).agg('count', method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                series1.groupby(lambda x: x % 2).agg('count'))

            r4 = ms1.groupby(lambda x: x % 2).agg('mean', method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r4, concat=True)[0],
                series1.groupby(lambda x: x % 2).agg('mean'))

            r5 = ms1.groupby(lambda x: x % 2).agg('var', method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r5, concat=True)[0],
                series1.groupby(lambda x: x % 2).agg('var'))

            r6 = ms1.groupby(lambda x: x % 2).agg('std', method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r6, concat=True)[0],
                series1.groupby(lambda x: x % 2).agg('std'))

            agg = ['std', 'mean', 'var', 'max', 'count', 'size']
            r3 = ms1.groupby(lambda x: x % 2).agg(agg, method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                series1.groupby(lambda x: x % 2).agg(agg))

            # test groupby series
            r3 = ms1.groupby(ms1).sum(method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                series1.groupby(series1).sum())

        # convenience aggregation methods with method='tree'
        r4 = ms1.groupby(lambda x: x % 2).size(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r4, concat=True)[0],
            series1.groupby(lambda x: x % 2).size())

        r4 = ms1.groupby(lambda x: x % 2).sum(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r4, concat=True)[0],
            series1.groupby(lambda x: x % 2).sum())

        r5 = ms1.groupby(lambda x: x % 2).prod(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r5, concat=True)[0],
            series1.groupby(lambda x: x % 2).prod())

        r6 = ms1.groupby(lambda x: x % 2).min(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r6, concat=True)[0],
            series1.groupby(lambda x: x % 2).min())

        r7 = ms1.groupby(lambda x: x % 2).max(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r7, concat=True)[0],
            series1.groupby(lambda x: x % 2).max())

        r8 = ms1.groupby(lambda x: x % 2).count(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r8, concat=True)[0],
            series1.groupby(lambda x: x % 2).count())

        r9 = ms1.groupby(lambda x: x % 2).mean(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r9, concat=True)[0],
            series1.groupby(lambda x: x % 2).mean())

        r10 = ms1.groupby(lambda x: x % 2).var(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r10, concat=True)[0],
            series1.groupby(lambda x: x % 2).var())

        r11 = ms1.groupby(lambda x: x % 2).std(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r11, concat=True)[0],
            series1.groupby(lambda x: x % 2).std())

        r11 = ms1.groupby(lambda x: x % 2).agg(['cumsum', 'cumcount'], method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r11, concat=True)[0].sort_index(),
            series1.groupby(lambda x: x % 2).agg(['cumsum', 'cumcount']).sort_index())

    def testGroupByApply(self):
        """groupby().apply with frame- and series-returning callables; results
        are sort_index()-ed before comparison since chunk order is not fixed."""
        df1 = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce')
        })

        def apply_df(df):
            # mutate a column and drop the last row of each group
            df = df.sort_index()
            df.a += df.b
            if len(df.index) > 0:
                df = df.iloc[:-1, :]
            return df

        def apply_series(s, truncate=True):
            # optionally drop the last element of each group
            s = s.sort_index()
            if truncate and len(s.index) > 0:
                s = s.iloc[:-1]
            return s

        mdf = md.DataFrame(df1, chunk_size=3)

        applied = mdf.groupby('b').apply(apply_df)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(applied, concat=True)[0].sort_index(),
            df1.groupby('b').apply(apply_df).sort_index())

        applied = mdf.groupby('b').apply(lambda df: df.a)
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(applied, concat=True)[0].sort_index(),
            df1.groupby('b').apply(lambda df: df.a).sort_index())

        applied = mdf.groupby('b').apply(lambda df: df.a.sum())
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(applied, concat=True)[0].sort_index(),
            df1.groupby('b').apply(lambda df: df.a.sum()).sort_index())

        series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
        ms1 = md.Series(series1, chunk_size=3)

        applied = ms1.groupby(lambda x: x % 3).apply(apply_series)
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(applied, concat=True)[0].sort_index(),
            series1.groupby(lambda x: x % 3).apply(apply_series).sort_index())

        sindex2 = pd.MultiIndex.from_arrays(
            [list(range(9)), list('ABCDEFGHI')])
        series2 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3], index=sindex2)
        ms2 = md.Series(series2, chunk_size=3)

        applied = ms2.groupby(lambda x: x[0] % 3).apply(apply_series)
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(applied, concat=True)[0].sort_index(),
            series2.groupby(lambda x: x[0] % 3).apply(
                apply_series).sort_index())

    def testGroupByTransform(self):
        """groupby().transform with callables and with agg-style specs routed
        through the internal _call_agg flag (compared against pandas agg)."""
        df1 = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce'),
            'd': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'e': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'f': list('aabaaddce'),
        })

        def transform_series(s, truncate=True):
            # optionally drop the last element and re-number the index
            s = s.sort_index()
            if truncate and len(s.index) > 1:
                s = s.iloc[:-1].reset_index(drop=True)
            return s

        mdf = md.DataFrame(df1, chunk_size=3)

        r = mdf.groupby('b').transform(transform_series, truncate=False)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby('b').transform(transform_series, truncate=False).sort_index())

        r = mdf.groupby('b').transform(['cummax', 'cumsum'], _call_agg=True)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby('b').agg(['cummax', 'cumsum']).sort_index())

        agg_list = ['cummax', 'cumsum']
        r = mdf.groupby('b').transform(agg_list, _call_agg=True)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby('b').agg(agg_list).sort_index())

        agg_dict = OrderedDict([('d', 'cummax'), ('b', 'cumsum')])
        r = mdf.groupby('b').transform(agg_dict, _call_agg=True)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby('b').agg(agg_dict).sort_index())

        agg_list = ['sum', lambda s: s.sum()]
        r = mdf.groupby('b').transform(agg_list, _call_agg=True)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            df1.groupby('b').agg(agg_list).sort_index())

        series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
        ms1 = md.Series(series1, chunk_size=3)

        r = ms1.groupby(lambda x: x % 3).transform(lambda x: x + 1)
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            series1.groupby(lambda x: x % 3).transform(
                lambda x: x + 1).sort_index())

        r = ms1.groupby(lambda x: x % 3).transform('cummax', _call_agg=True)
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            series1.groupby(lambda x: x % 3).agg('cummax').sort_index())

        agg_list = ['cummax', 'cumcount']
        r = ms1.groupby(lambda x: x % 3).transform(agg_list, _call_agg=True)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0].sort_index(),
            series1.groupby(lambda x: x % 3).agg(agg_list).sort_index())

    def testGroupByCum(self):
        """Cumulative groupby operations (cummin/cummax/cumprod/cumsum and
        cumcount) on frames (both axes) and series."""
        df1 = pd.DataFrame({
            'a': [3, 5, 2, 7, 1, 2, 4, 6, 2, 4],
            'b': [8, 3, 4, 1, 8, 2, 2, 2, 2, 3],
            'c': [1, 8, 8, 5, 3, 5, 0, 0, 5, 4]
        })
        mdf = md.DataFrame(df1, chunk_size=3)

        for fun in ['cummin', 'cummax', 'cumprod', 'cumsum']:
            r1 = getattr(mdf.groupby('b'), fun)()
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r1, concat=True)[0].sort_index(),
                getattr(df1.groupby('b'), fun)().sort_index())

            r2 = getattr(mdf.groupby('b'), fun)(axis=1)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r2, concat=True)[0].sort_index(),
                getattr(df1.groupby('b'), fun)(axis=1).sort_index())

        r3 = mdf.groupby('b').cumcount()
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r3, concat=True)[0].sort_index(),
            df1.groupby('b').cumcount().sort_index())

        series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
        ms1 = md.Series(series1, chunk_size=3)

        for fun in ['cummin', 'cummax', 'cumprod', 'cumsum', 'cumcount']:
            r1 = getattr(ms1.groupby(lambda x: x % 2), fun)()
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r1, concat=True)[0].sort_index(),
                getattr(series1.groupby(lambda x: x % 2), fun)().sort_index())
class Test(TestBase):
    """Execution tests for merge/join/append on chunked mars DataFrames.

    Expected values come from running the same operation with plain pandas;
    results are sorted before comparison since chunked execution does not
    preserve pandas' row order (see Note [Index of Merge] below).
    """

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testMerge(self):
        """merge on index / on-columns / mixed index-column keys."""
        df1 = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1,
                           columns=['a', 'b', 'c', 'd', 'e'])
        df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1,
                           columns=['a', 'b', 'x', 'y'])
        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        # Note [Index of Merge]
        #
        # When `left_index` and `right_index` of `merge` is both false, pandas will generate a RangeIndex to
        # the final result dataframe.
        #
        # We chunked the `left` and `right` dataframe, thus every result chunk will have its own RangeIndex.
        # When they are concatenated we don't generate a new RangeIndex for the result, thus we cannot obtain the
        # same index value with pandas. But we guarantee that the content of dataframe is correct.

        # merge on index
        expected0 = df1.merge(df2)
        jdf0 = mdf1.merge(mdf2)
        result0 = self.executor.execute_dataframe(jdf0, concat=True)[0]
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected0, 0),
                                      sort_dataframe_inplace(result0, 0))

        # merge on left index and `right_on`
        expected1 = df1.merge(df2, how='left', right_on='x', left_index=True)
        jdf1 = mdf1.merge(mdf2, how='left', right_on='x', left_index=True)
        result1 = self.executor.execute_dataframe(jdf1, concat=True)[0]
        # re-key on a data column so index differences do not fail the check
        expected1.set_index('a_x', inplace=True)
        result1.set_index('a_x', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected1, 0),
                                      sort_dataframe_inplace(result1, 0))

        # merge on `left_on` and right index
        expected2 = df1.merge(df2, how='right', left_on='a', right_index=True)
        jdf2 = mdf1.merge(mdf2, how='right', left_on='a', right_index=True)
        result2 = self.executor.execute_dataframe(jdf2, concat=True)[0]
        expected2.set_index('a', inplace=True)
        result2.set_index('a', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected2, 0),
                                      sort_dataframe_inplace(result2, 0))

        # merge on `left_on` and `right_on`
        expected3 = df1.merge(df2, how='left', left_on='a', right_on='x')
        jdf3 = mdf1.merge(mdf2, how='left', left_on='a', right_on='x')
        result3 = self.executor.execute_dataframe(jdf3, concat=True)[0]
        expected3.set_index('a_x', inplace=True)
        result3.set_index('a_x', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected3, 0),
                                      sort_dataframe_inplace(result3, 0))

        # merge on `on`
        expected4 = df1.merge(df2, how='right', on='a')
        jdf4 = mdf1.merge(mdf2, how='right', on='a')
        result4 = self.executor.execute_dataframe(jdf4, concat=True)[0]
        expected4.set_index('a', inplace=True)
        result4.set_index('a', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected4, 0),
                                      sort_dataframe_inplace(result4, 0))

        # merge on multiple columns
        expected5 = df1.merge(df2, how='inner', on=['a', 'b'])
        jdf5 = mdf1.merge(mdf2, how='inner', on=['a', 'b'])
        result5 = self.executor.execute_dataframe(jdf5, concat=True)[0]
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected5, 0),
                                      sort_dataframe_inplace(result5, 0))

    def testJoin(self):
        """join on index for every supported `how`; right side has dup index."""
        df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                           index=['a1', 'a2', 'a3'])
        df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]],
                           index=['a1', 'b2', 'b3']) + 1
        # duplicate the right frame so the joined index contains duplicates
        df2 = pd.concat([df2, df2 + 1])
        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        # default `how`
        expected0 = df1.join(df2, lsuffix='l_', rsuffix='r_')
        jdf0 = mdf1.join(mdf2, lsuffix='l_', rsuffix='r_')
        result0 = self.executor.execute_dataframe(jdf0, concat=True)[0]
        pd.testing.assert_frame_equal(expected0.sort_index(), result0.sort_index())

        # how = 'left'
        expected1 = df1.join(df2, how='left', lsuffix='l_', rsuffix='r_')
        jdf1 = mdf1.join(mdf2, how='left', lsuffix='l_', rsuffix='r_')
        result1 = self.executor.execute_dataframe(jdf1, concat=True)[0]
        pd.testing.assert_frame_equal(expected1.sort_index(), result1.sort_index())

        # how = 'right'
        expected2 = df1.join(df2, how='right', lsuffix='l_', rsuffix='r_')
        jdf2 = mdf1.join(mdf2, how='right', lsuffix='l_', rsuffix='r_')
        result2 = self.executor.execute_dataframe(jdf2, concat=True)[0]
        pd.testing.assert_frame_equal(expected2.sort_index(), result2.sort_index())

        # how = 'inner'
        expected3 = df1.join(df2, how='inner', lsuffix='l_', rsuffix='r_')
        jdf3 = mdf1.join(mdf2, how='inner', lsuffix='l_', rsuffix='r_')
        result3 = self.executor.execute_dataframe(jdf3, concat=True)[0]
        pd.testing.assert_frame_equal(expected3.sort_index(), result3.sort_index())

        # how = 'outer'
        expected4 = df1.join(df2, how='outer', lsuffix='l_', rsuffix='r_')
        jdf4 = mdf1.join(mdf2, how='outer', lsuffix='l_', rsuffix='r_')
        result4 = self.executor.execute_dataframe(jdf4, concat=True)[0]
        pd.testing.assert_frame_equal(expected4.sort_index(), result4.sort_index())

    def testJoinOn(self):
        """join keyed on a column (`on=...`) for every supported `how`."""
        df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                           columns=['a1', 'a2', 'a3'])
        df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]],
                           columns=['a1', 'b2', 'b3']) + 1
        df2 = pd.concat([df2, df2 + 1])
        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        expected0 = df1.join(df2, on=None, lsuffix='_l', rsuffix='_r')
        jdf0 = mdf1.join(mdf2, on=None, lsuffix='_l', rsuffix='_r')
        result0 = self.executor.execute_dataframe(jdf0, concat=True)[0]
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected0, 0),
                                      sort_dataframe_inplace(result0, 0))

        expected1 = df1.join(df2, how='left', on='a1', lsuffix='_l', rsuffix='_r')
        jdf1 = mdf1.join(mdf2, how='left', on='a1', lsuffix='_l', rsuffix='_r')
        result1 = self.executor.execute_dataframe(jdf1, concat=True)[0]

        # Note [Columns of Left Join]
        #
        # I believe we have no chance to obtain the entirely same result with pandas here:
        #
        # Look at the following example:
        #
        # >>> df1
        #    a1  a2  a3
        # 0   1   3   3
        # >>> df2
        #    a1  b2  b3
        # 1   2   6   7
        # >>> df3
        #    a1  b2  b3
        # 1   2   6   7
        # 1   2   6   7
        #
        # >>> df1.merge(df2, how='left', left_on='a1', left_index=False, right_index=True)
        #    a1_x  a2  a3  a1_y  b2  b3
        # 0     1   3   3     2   6   7
        # >>> df1.merge(df3, how='left', left_on='a1', left_index=False, right_index=True)
        #    a1  a1_x  a2  a3  a1_y  b2  b3
        # 0   1     1   3   3     2   6   7
        # 0   1     1   3   3     2   6   7
        #
        # Note that the result of `df1.merge(df3)` has an extra column `a` compared to `df1.merge(df2)`.
        # The value of column `a` is the same of `a1_x`, just because `1` occurs twice in index of `df3`.
        # I haven't investigated why pandas has such behaviour...
        #
        # We cannot yield the same result with pandas, because, the `df3` is chunked, then some of the
        # result chunk has 6 columns, others may have 7 columns, when concatenated into one DataFrame
        # some cells of column `a` will have value `NaN`, which is different from the result of pandas.
        #
        # But we can guarantee that other effective columns have absolutely same value with pandas.

        columns_to_compare = jdf1.columns_value.to_pandas()

        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected1[columns_to_compare], 0, 1),
                                      sort_dataframe_inplace(result1[columns_to_compare], 0, 1))

        # Note [Index of Join on EmptyDataFrame]
        #
        # It is tricky that it is non-trivial to get the same `index` result with pandas.
        #
        # Look at the following example:
        #
        # >>> df1
        #     a1  a2  a3
        # 1   4   2   6
        # >>> df2
        #     a1  b2  b3
        # 1   2   6   7
        # 2   8   9  10
        # >>> df3
        # Empty DataFrame
        # Columns: [a1, a2, a3]
        # Index: []
        # >>> df1.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        #       a1_l  a2   a3  a1_r  b2  b3
        # 1.0   4.0   2  6.0     8   9  10
        # NaN   NaN   1  NaN     2   6   7
        # >>> df3.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        #     a1_l  a2  a1_r  b2  b3
        # 1    NaN   1     2   6   7
        # 2    NaN   2     8   9  10
        #
        # When the `left` dataframe is not empty, the mismatched rows in `right` will have index value `NaN`,
        # and the matched rows have index value from `right`. When the `left` dataframe is empty, the mismatched
        # rows have index value from `right`.
        #
        # Since we chunked the `left` dataframe, it is uneasy to obtain the same index value with pandas in the
        # final result dataframe, but we guaranteed that the dataframe content is correct.

        expected2 = df1.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        jdf2 = mdf1.join(mdf2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        result2 = self.executor.execute_dataframe(jdf2, concat=True)[0]
        expected2.set_index('a2', inplace=True)
        result2.set_index('a2', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected2, 0),
                                      sort_dataframe_inplace(result2, 0))

        expected3 = df1.join(df2, how='inner', on='a2', lsuffix='_l', rsuffix='_r')
        jdf3 = mdf1.join(mdf2, how='inner', on='a2', lsuffix='_l', rsuffix='_r')
        result3 = self.executor.execute_dataframe(jdf3, concat=True)[0]
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected3, 0),
                                      sort_dataframe_inplace(result3, 0))

        expected4 = df1.join(df2, how='outer', on='a2', lsuffix='_l', rsuffix='_r')
        jdf4 = mdf1.join(mdf2, how='outer', on='a2', lsuffix='_l', rsuffix='_r')
        result4 = self.executor.execute_dataframe(jdf4, concat=True)[0]
        expected4.set_index('a2', inplace=True)
        result4.set_index('a2', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected4, 0),
                                      sort_dataframe_inplace(result4, 0))

    def testMergeOneChunk(self):
        """merge where one or both operands consist of a single chunk."""
        df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
                            'value': [1, 2, 3, 5]}, index=['a1', 'a2', 'a3', 'a4'])
        df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
                            'value': [5, 6, 7, 8]}, index=['a1', 'a2', 'a3', 'a4'])

        # all have one chunk
        mdf1 = from_pandas(df1)
        mdf2 = from_pandas(df2)

        expected = df1.merge(df2, left_on='lkey', right_on='rkey')
        jdf = mdf1.merge(mdf2, left_on='lkey', right_on='rkey')
        result = self.executor.execute_dataframe(jdf, concat=True)[0]

        pd.testing.assert_frame_equal(expected.sort_values(by=expected.columns[1]).reset_index(drop=True),
                                      result.sort_values(by=result.columns[1]).reset_index(drop=True))

        # left have one chunk
        mdf1 = from_pandas(df1)
        mdf2 = from_pandas(df2, chunk_size=2)

        expected = df1.merge(df2, left_on='lkey', right_on='rkey')
        jdf = mdf1.merge(mdf2, left_on='lkey', right_on='rkey')
        result = self.executor.execute_dataframe(jdf, concat=True)[0]

        pd.testing.assert_frame_equal(expected.sort_values(by=expected.columns[1]).reset_index(drop=True),
                                      result.sort_values(by=result.columns[1]).reset_index(drop=True))

        # right have one chunk
        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2)

        expected = df1.merge(df2, left_on='lkey', right_on='rkey')
        jdf = mdf1.merge(mdf2, left_on='lkey', right_on='rkey')
        result = self.executor.execute_dataframe(jdf, concat=True)[0]

        pd.testing.assert_frame_equal(expected.sort_values(by=expected.columns[1]).reset_index(drop=True),
                                      result.sort_values(by=result.columns[1]).reset_index(drop=True))

    def testAppendExecution(self):
        """append for DataFrame, Series, list and dict inputs."""
        # NOTE(review): a session-context-backed executor is used for the
        # ignore_index cases — presumably the RangeIndex rebuild needs the
        # session context at execution time; confirm against executor impl.
        executor = ExecutorForTest(storage=new_session().context)

        df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        df2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=3)

        adf = mdf1.append(mdf2)
        expected = df1.append(df2)
        result = self.executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        adf = mdf1.append(mdf2, ignore_index=True)
        expected = df1.append(df2, ignore_index=True)
        result = executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # misaligned chunk sizes between the two operands
        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=2)

        adf = mdf1.append(mdf2)
        expected = df1.append(df2)
        result = self.executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        adf = mdf1.append(mdf2, ignore_index=True)
        expected = df1.append(df2, ignore_index=True)
        result = executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # append a list of dataframes
        df3 = pd.DataFrame(np.random.rand(8, 4), columns=list('ABCD'))
        mdf3 = from_pandas(df3, chunk_size=3)
        expected = df1.append([df2, df3])
        adf = mdf1.append([mdf2, mdf3])
        result = self.executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # append a plain dict as a single row
        adf = mdf1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
        expected = df1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
        result = executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # test for series
        series1 = pd.Series(np.random.rand(10,))
        series2 = pd.Series(np.random.rand(10,))

        mseries1 = series_from_pandas(series1, chunk_size=3)
        mseries2 = series_from_pandas(series2, chunk_size=3)

        aseries = mseries1.append(mseries2)
        expected = series1.append(series2)
        result = self.executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        aseries = mseries1.append(mseries2, ignore_index=True)
        expected = series1.append(series2, ignore_index=True)
        result = executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        mseries1 = series_from_pandas(series1, chunk_size=3)
        mseries2 = series_from_pandas(series2, chunk_size=2)

        aseries = mseries1.append(mseries2)
        expected = series1.append(series2)
        result = self.executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        aseries = mseries1.append(mseries2, ignore_index=True)
        expected = series1.append(series2, ignore_index=True)
        result = executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        # append a list of series
        series3 = pd.Series(np.random.rand(4,))
        mseries3 = series_from_pandas(series3, chunk_size=2)
        expected = series1.append([series2, series3])
        aseries = mseries1.append([mseries2, mseries3])
        result = self.executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)
    def testSortIndexExecution(self):
        """sort_index on chunked DataFrame/Series matches pandas.

        Covers: single chunk, several chunk sizes, inplace, descending order,
        ignore_index (skipped gracefully on pandas builds without the kwarg),
        axis=1 column sorting, and Series inputs.
        """
        raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))
        mdf = DataFrame(raw)
        result = self.executor.execute_dataframe(mdf.sort_index(), concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw)
        mdf.sort_index(inplace=True)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=30)
        result = self.executor.execute_dataframe(mdf.sort_index(), concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=20)
        result = self.executor.execute_dataframe(
            mdf.sort_index(ascending=False), concat=True)[0]
        expected = raw.sort_index(ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # ignore_index needs a session-context-backed executor
        executor = ExecutorForTest(storage=new_session().context)

        mdf = DataFrame(raw, chunk_size=10)
        result = executor.execute_dataframe(mdf.sort_index(ignore_index=True),
                                            concat=True)[0]
        try:  # for python3.5: older pandas has no ignore_index kwarg
            expected = raw.sort_index(ignore_index=True)
        except TypeError:
            expected = raw.sort_index()
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test axis=1
        raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))
        mdf = DataFrame(raw)
        result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                                 concat=True)[0]
        expected = raw.sort_index(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                                 concat=True)[0]
        expected = raw.sort_index(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=4)
        result = self.executor.execute_dataframe(mdf.sort_index(
            axis=1, ascending=False), concat=True)[0]
        expected = raw.sort_index(axis=1, ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=4)
        executor = ExecutorForTest(storage=new_session().context)
        result = executor.execute_dataframe(mdf.sort_index(axis=1, ignore_index=True),
                                            concat=True)[0]
        try:  # for python3.5
            expected = raw.sort_index(axis=1, ignore_index=True)
        except TypeError:
            expected = raw.sort_index(axis=1)
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test series
        raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

        series = Series(raw)
        result = self.executor.execute_dataframe(series.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=2)
        result = self.executor.execute_dataframe(series.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=3)
        result = self.executor.execute_dataframe(
            series.sort_index(ascending=False), concat=True)[0]
        expected = raw.sort_index(ascending=False)
        pd.testing.assert_series_equal(result, expected)
class Test(unittest.TestCase):
    """Execution tests for DataFrame/Series dot products on chunked data."""

    def setUp(self) -> None:
        self.executor = ExecutorForTest('numpy')

    def testDotExecution(self):
        """dot / @ across every operand combination matches pandas."""
        df1_raw = pd.DataFrame(np.random.rand(4, 7))
        df2_raw = pd.DataFrame(np.random.rand(7, 5), columns=list('efghi'))
        s1_raw = pd.Series(np.random.rand(7))
        s2_raw = pd.Series(np.random.rand(7))

        # 2-d chunking so splits happen along both axes
        df1 = DataFrame(df1_raw, chunk_size=(3, 2))
        df2 = DataFrame(df2_raw, chunk_size=(3, 4))

        # df.dot(df)
        r = df1.dot(df2)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df1_raw.dot(df2_raw)
        pd.testing.assert_frame_equal(result, expected)

        # test @
        r = df1 @ df2
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df1_raw @ df2_raw
        pd.testing.assert_frame_equal(result, expected)

        series1 = Series(s1_raw, chunk_size=5)

        # df.dot(series)
        r = df1.dot(series1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df1_raw.dot(s1_raw)
        pd.testing.assert_series_equal(result, expected)

        # df.dot(2d_array)
        r = df1.dot(df2_raw.to_numpy())
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df1_raw.dot(df2_raw.to_numpy())
        pd.testing.assert_frame_equal(result, expected)

        # df.dot(1d_array)
        r = df1.dot(s1_raw.to_numpy())
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df1_raw.dot(s1_raw.to_numpy())
        pd.testing.assert_series_equal(result, expected)

        series2 = Series(s2_raw, chunk_size=4)

        # series.dot(series) yields a scalar
        r = series1.dot(series2)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s1_raw.dot(s2_raw)
        self.assertAlmostEqual(result, expected)

        # series.dot(df)
        r = series1.dot(df2)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s1_raw.dot(df2_raw)
        pd.testing.assert_series_equal(result, expected)

        # series.dot(2d_array)
        r = series1.dot(df2_raw.to_numpy())
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s1_raw.dot(df2_raw.to_numpy())
        np.testing.assert_almost_equal(result, expected)

        # series.dot(1d_array)
        r = series1.dot(s2_raw.to_numpy())
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s1_raw.dot(s2_raw.to_numpy())
        self.assertAlmostEqual(result, expected)
class TestReduction(TestBase):
    """Parameterized execution tests for a single reduction method.

    NOTE(review): ``self.func_name`` (the reduction name, e.g. 'sum') and
    ``self.has_min_count`` are not set here — presumably provided by concrete
    subclasses generated elsewhere; confirm against the test factory.
    """

    def setUp(self):
        self.executor = ExecutorForTest()

    def compute(self, data, **kwargs):
        # Apply the reduction under test to either a pandas object or a
        # mars tileable — both expose the method by name.
        return getattr(data, self.func_name)(**kwargs)

    def testSeriesReduction(self):
        """Series reduction over several chunk sizes, NaNs and min_count."""
        data = pd.Series(np.random.randint(0, 8, (10,)),
                         index=[str(i) for i in range(10)], name='a')

        reduction_df1 = self.compute(from_pandas_series(data))
        self.assertAlmostEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_series(data, chunk_size=6))
        self.assertAlmostEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_series(data, chunk_size=3))
        self.assertAlmostEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        reduction_df4 = self.compute(from_pandas_series(data, chunk_size=4),
                                     axis='index')
        self.assertAlmostEqual(
            self.compute(data, axis='index'),
            self.executor.execute_dataframe(reduction_df4, concat=True)[0])

        data = pd.Series(np.random.rand(20), name='a')
        data[0] = 0.1  # make sure not all elements are NAN
        data[data > 0.5] = np.nan

        reduction_df1 = self.compute(from_pandas_series(data, chunk_size=3))
        self.assertAlmostEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        # skipna=False over NaN data must propagate NaN
        reduction_df2 = self.compute(from_pandas_series(data, chunk_size=3),
                                     skipna=False)
        self.assertTrue(
            np.isnan(self.executor.execute_dataframe(reduction_df2, concat=True)[0]))

        if self.has_min_count:
            reduction_df3 = self.compute(from_pandas_series(data, chunk_size=3),
                                         skipna=False, min_count=2)
            self.assertTrue(
                np.isnan(self.executor.execute_dataframe(reduction_df3, concat=True)[0]))

            reduction_df4 = self.compute(from_pandas_series(data, chunk_size=3),
                                         min_count=1)
            self.assertAlmostEqual(
                self.compute(data, min_count=1),
                self.executor.execute_dataframe(reduction_df4, concat=True)[0])

            # min_count larger than the series length forces NaN
            reduction_df5 = self.compute(from_pandas_series(data, chunk_size=3),
                                         min_count=21)
            self.assertTrue(
                np.isnan(self.executor.execute_dataframe(reduction_df5, concat=True)[0]))

    def testDataFrameReduction(self):
        """DataFrame reduction: axes, NaNs, min_count and numeric_only."""
        data = pd.DataFrame(np.random.rand(20, 10))

        reduction_df1 = self.compute(from_pandas_df(data))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_df(data, chunk_size=6),
                                     axis='index', numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        reduction_df4 = self.compute(from_pandas_df(data, chunk_size=3), axis=1)
        pd.testing.assert_series_equal(
            self.compute(data, axis=1),
            self.executor.execute_dataframe(reduction_df4, concat=True)[0])

        # test null
        np_data = np.random.rand(20, 10)
        np_data[np_data > 0.6] = np.nan
        data = pd.DataFrame(np_data)

        reduction_df1 = self.compute(from_pandas_df(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3),
                                     skipna=False)
        pd.testing.assert_series_equal(
            self.compute(data, skipna=False),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        # NOTE(review): this skipna=False block is an exact duplicate of the
        # one above — looks like a copy-paste leftover; candidate for removal.
        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3),
                                     skipna=False)
        pd.testing.assert_series_equal(
            self.compute(data, skipna=False),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        if self.has_min_count:
            reduction_df3 = self.compute(from_pandas_df(data, chunk_size=3),
                                         min_count=15)
            pd.testing.assert_series_equal(
                self.compute(data, min_count=15),
                self.executor.execute_dataframe(reduction_df3, concat=True)[0])

            reduction_df4 = self.compute(from_pandas_df(data, chunk_size=3),
                                         min_count=3)
            pd.testing.assert_series_equal(
                self.compute(data, min_count=3),
                self.executor.execute_dataframe(reduction_df4, concat=True)[0])

            reduction_df5 = self.compute(from_pandas_df(data, chunk_size=3),
                                         axis=1, min_count=3)
            pd.testing.assert_series_equal(
                self.compute(data, axis=1, min_count=3),
                self.executor.execute_dataframe(reduction_df5, concat=True)[0])

            reduction_df5 = self.compute(from_pandas_df(data, chunk_size=3),
                                         axis=1, min_count=8)
            pd.testing.assert_series_equal(
                self.compute(data, axis=1, min_count=8),
                self.executor.execute_dataframe(reduction_df5, concat=True)[0])

        # test numeric_only
        data = pd.DataFrame(np.random.rand(10, 10),
                            index=np.random.randint(-100, 100, size=(10,)),
                            columns=[np.random.bytes(10) for _ in range(10)])

        reduction_df1 = self.compute(from_pandas_df(data, chunk_size=2))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=6),
                                     axis='index', numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_df(data, chunk_size=3),
                                     axis='columns')
        pd.testing.assert_series_equal(
            self.compute(data, axis='columns'),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        # mixed dtypes: numeric_only must drop string columns
        data_dict = dict((str(i), np.random.rand(10)) for i in range(10))
        data_dict['string'] = [str(i) for i in range(10)]
        data_dict['bool'] = np.random.choice([True, False], (10,))
        data = pd.DataFrame(data_dict)

        reduction_df = self.compute(from_pandas_df(data, chunk_size=3),
                                    axis='index', numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df, concat=True)[0])

        # reduction over a lazily-computed frame (sum of two chunked frames)
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             columns=[str(i) for i in range(10)])
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             columns=[str(i) for i in range(10)])
        df = from_pandas_df(data1, chunk_size=5) + from_pandas_df(data2, chunk_size=6)
        reduction_df = self.compute(df)
        pd.testing.assert_series_equal(
            self.compute(data1 + data2).sort_index(),
            self.executor.execute_dataframe(reduction_df, concat=True)[0].sort_index())

    @require_cudf
    @require_cupy
    def testGPUExecution(self):
        """Same reductions on GPU-backed frames/series (needs cudf/cupy)."""
        df_raw = pd.DataFrame(np.random.rand(30, 3), columns=list('abc'))
        df = to_gpu(from_pandas_df(df_raw, chunk_size=6))

        r = df.sum()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), df_raw.sum())

        r = df.kurt()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), df_raw.kurt())

        r = df.agg(['sum', 'var'])
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(res.to_pandas(), df_raw.agg(['sum', 'var']))

        s_raw = pd.Series(np.random.rand(30))
        s = to_gpu(from_pandas_series(s_raw, chunk_size=6))

        r = s.sum()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        self.assertAlmostEqual(res, s_raw.sum())

        r = s.kurt()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        self.assertAlmostEqual(res, s_raw.kurt())

        r = s.agg(['sum', 'var'])
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), s_raw.agg(['sum', 'var']))

        s_raw = pd.Series(np.random.randint(0, 3, size=(30,))
                          * np.random.randint(0, 5, size=(30,)))
        s = to_gpu(from_pandas_series(s_raw, chunk_size=6))

        r = s.unique()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        np.testing.assert_array_equal(cp.asnumpy(res).sort(), s_raw.unique().sort())
class TestCount(TestBase):
    """Verify that ``count`` on mars Series/DataFrames agrees with pandas."""

    def setUp(self):
        self.executor = ExecutorForTest()

    def _run(self, tileable):
        # Execute one tileable and return the concatenated result.
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    def testSeriesCount(self):
        """Series.count skips NaN and agrees with pandas at any chunking."""
        values = np.random.rand(10)
        values[[2, 7, 9]] = np.nan
        data = pd.Series(values)
        expected = data.count()

        # single chunk
        self.assertEqual(self._run(from_pandas_series(data).count()), expected)

        # one element per chunk
        self.assertEqual(
            self._run(from_pandas_series(data, chunk_size=1).count()), expected)

        # several elements per chunk
        self.assertEqual(
            self._run(from_pandas_series(data, chunk_size=3).count()), expected)

    def testDataFrameCount(self):
        """DataFrame.count over both axes, with and without numeric_only."""
        data = pd.DataFrame({
            "Person": ["John", "Myla", "Lewis", "John", "Myla"],
            "Age": [24., np.nan, 21., 33, 26],
            "Single": [False, True, True, True, False]
        })

        # single chunk
        mdf = from_pandas_df(data)
        pd.testing.assert_series_equal(self._run(mdf.count()), data.count())
        pd.testing.assert_series_equal(self._run(mdf.count(axis='columns')),
                                       data.count(axis='columns'))

        # chunked along both axes
        mdf = from_pandas_df(data, chunk_size=2)
        pd.testing.assert_series_equal(self._run(mdf.count()), data.count())
        pd.testing.assert_series_equal(self._run(mdf.count(axis='columns')),
                                       data.count(axis='columns'))

        # numeric_only drops the string column
        mdf = from_pandas_df(data, chunk_size=3)
        pd.testing.assert_series_equal(self._run(mdf.count(numeric_only=True)),
                                       data.count(numeric_only=True))
        pd.testing.assert_series_equal(
            self._run(mdf.count(axis='columns', numeric_only=True)),
            data.count(axis='columns', numeric_only=True))
class TestCount(TestBase):
    """Execution tests for count and nunique against pandas results."""

    def setUp(self):
        self.executor = ExecutorForTest()

    def testSeriesCount(self):
        """Series.count skips NaN and agrees with pandas at any chunking."""
        array = np.random.rand(10)
        array[[2, 7, 9]] = np.nan
        data = pd.Series(array)

        series = from_pandas_series(data)

        result = self.executor.execute_dataframe(series.count(), concat=True)[0]
        expected = data.count()
        self.assertEqual(result, expected)

        series2 = from_pandas_series(data, chunk_size=1)

        result = self.executor.execute_dataframe(series2.count(), concat=True)[0]
        expected = data.count()
        self.assertEqual(result, expected)

        series2 = from_pandas_series(data, chunk_size=3)

        result = self.executor.execute_dataframe(series2.count(), concat=True)[0]
        expected = data.count()
        self.assertEqual(result, expected)

    def testDataFrameCount(self):
        """DataFrame.count over both axes, with and without numeric_only."""
        data = pd.DataFrame({
            "Person": ["John", "Myla", "Lewis", "John", "Myla"],
            "Age": [24., np.nan, 21., 33, 26],
            "Single": [False, True, True, True, False]
        })
        df = from_pandas_df(data)

        result = self.executor.execute_dataframe(df.count(), concat=True)[0]
        expected = data.count()
        pd.testing.assert_series_equal(result, expected)

        result = self.executor.execute_dataframe(df.count(axis='columns'),
                                                 concat=True)[0]
        expected = data.count(axis='columns')
        pd.testing.assert_series_equal(result, expected)

        df2 = from_pandas_df(data, chunk_size=2)

        result = self.executor.execute_dataframe(df2.count(), concat=True)[0]
        expected = data.count()
        pd.testing.assert_series_equal(result, expected)

        result = self.executor.execute_dataframe(df2.count(axis='columns'),
                                                 concat=True)[0]
        expected = data.count(axis='columns')
        pd.testing.assert_series_equal(result, expected)

        df3 = from_pandas_df(data, chunk_size=3)

        result = self.executor.execute_dataframe(df3.count(numeric_only=True),
                                                 concat=True)[0]
        expected = data.count(numeric_only=True)
        pd.testing.assert_series_equal(result, expected)

        result = self.executor.execute_dataframe(df3.count(axis='columns', numeric_only=True),
                                                 concat=True)[0]
        expected = data.count(axis='columns', numeric_only=True)
        pd.testing.assert_series_equal(result, expected)

    def testNunique(self):
        """nunique on Series/DataFrame incl. dropna and axis=1 variants."""
        data1 = pd.Series(np.random.randint(0, 5, size=(20, )))

        series = from_pandas_series(data1)
        result = self.executor.execute_dataframe(series.nunique(), concat=True)[0]
        expected = data1.nunique()
        self.assertEqual(result, expected)

        series = from_pandas_series(data1, chunk_size=6)
        result = self.executor.execute_dataframe(series.nunique(), concat=True)[0]
        expected = data1.nunique()
        self.assertEqual(result, expected)

        # test dropna
        data2 = data1.copy()
        data2[[2, 9, 18]] = np.nan

        series = from_pandas_series(data2)
        result = self.executor.execute_dataframe(series.nunique(), concat=True)[0]
        expected = data2.nunique()
        self.assertEqual(result, expected)

        series = from_pandas_series(data2, chunk_size=3)
        result = self.executor.execute_dataframe(series.nunique(dropna=False),
                                                 concat=True)[0]
        expected = data2.nunique(dropna=False)
        self.assertEqual(result, expected)

        # test dataframe
        data1 = pd.DataFrame(np.random.randint(0, 6, size=(20, 20)),
                             columns=['c' + str(i) for i in range(20)])
        df = from_pandas_df(data1)
        result = self.executor.execute_dataframe(df.nunique(), concat=True)[0]
        expected = data1.nunique()
        pd.testing.assert_series_equal(result, expected)

        df = from_pandas_df(data1, chunk_size=6)
        result = self.executor.execute_dataframe(df.nunique(), concat=True)[0]
        expected = data1.nunique()
        pd.testing.assert_series_equal(result, expected)

        df = from_pandas_df(data1)
        result = self.executor.execute_dataframe(df.nunique(axis=1), concat=True)[0]
        expected = data1.nunique(axis=1)
        pd.testing.assert_series_equal(result, expected)

        df = from_pandas_df(data1, chunk_size=3)
        result = self.executor.execute_dataframe(df.nunique(axis=1), concat=True)[0]
        expected = data1.nunique(axis=1)
        pd.testing.assert_series_equal(result, expected)

        # test dropna
        data2 = data1.copy()
        data2.iloc[[2, 9, 18], [2, 9, 18]] = np.nan

        df = from_pandas_df(data2)
        result = self.executor.execute_dataframe(df.nunique(), concat=True)[0]
        expected = data2.nunique()
        pd.testing.assert_series_equal(result, expected)

        df = from_pandas_df(data2, chunk_size=3)
        result = self.executor.execute_dataframe(df.nunique(dropna=False),
                                                 concat=True)[0]
        expected = data2.nunique(dropna=False)
        pd.testing.assert_series_equal(result, expected)

        # NOTE(review): duplicate of the chunk_size=3 axis=1 check above —
        # looks like a copy-paste leftover; candidate for removal.
        df = from_pandas_df(data1, chunk_size=3)
        result = self.executor.execute_dataframe(df.nunique(axis=1), concat=True)[0]
        expected = data1.nunique(axis=1)
        pd.testing.assert_series_equal(result, expected)
    def testSortValuesExecution(self):
        """Check DataFrame/Series sort_values() against pandas.

        Exercises: one-chunk and chunked (parallel sorting-by-regular-sampling)
        paths, multi-key sorts, ascending=False, rechunking, non-numeric dtypes
        (bytes/Timestamp/Timedelta), NaN keys, ignore_index, inplace, sorting a
        filtered frame with unknown chunk shapes, and Series.sort_values.
        """
        df = pd.DataFrame(np.random.rand(100, 10),
                          columns=['a' + str(i) for i in range(10)])

        # test one chunk
        mdf = DataFrame(df)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'), concat=True)[0]
        expected = df.sort_values('a0')
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(
            ['a6', 'a7'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a6', 'a7'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # test psrs
        mdf = DataFrame(df, chunk_size=10)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'), concat=True)[0]
        expected = df.sort_values('a0')
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(['a3', 'a4']),
                                                 concat=True)[0]
        expected = df.sort_values(['a3', 'a4'])
        pd.testing.assert_frame_equal(result, expected)

        # test ascending=False
        result = self.executor.execute_dataframe(mdf.sort_values(
            ['a0', 'a1'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a0', 'a1'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(
            ['a7'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a7'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # test rechunk
        mdf = DataFrame(df, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'), concat=True)[0]
        expected = df.sort_values('a0')
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(['a3', 'a4']),
                                                 concat=True)[0]
        expected = df.sort_values(['a3', 'a4'])
        pd.testing.assert_frame_equal(result, expected)

        # test other types
        raw = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [np.random.bytes(10) for _ in range(10)],
                'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
                'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
            }, )
        mdf = DataFrame(raw, chunk_size=3)

        # every column dtype must be sortable on its own
        for label in raw.columns:
            result = self.executor.execute_dataframe(mdf.sort_values(label),
                                                     concat=True)[0]
            expected = raw.sort_values(label)
            pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(
            ['a', 'b', 'e'], ascending=False), concat=True)[0]
        expected = raw.sort_values(['a', 'b', 'e'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # test nan
        df = pd.DataFrame({
            'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
            'col2': [2, 1, 9, np.nan, 7, 4],
            'col3': [0, 1, 9, 4, 2, 3],
        })
        mdf = DataFrame(df)
        result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                                 concat=True)[0]
        expected = df.sort_values(['col2'])
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(df, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                                 concat=True)[0]
        expected = df.sort_values(['col2'])
        pd.testing.assert_frame_equal(result, expected)

        # test ignore_index
        executor = ExecutorForTest(storage=new_session().context)

        df = pd.DataFrame(np.random.rand(10, 3),
                          columns=['a' + str(i) for i in range(3)])
        mdf = DataFrame(df, chunk_size=3)
        result = executor.execute_dataframe(mdf.sort_values(['a0', 'a1'],
                                                            ignore_index=True),
                                            concat=True)[0]
        try:  # for python3.5
            expected = df.sort_values(['a0', 'a1'], ignore_index=True)
        except TypeError:
            # older pandas lacks ignore_index — emulate by resetting the index
            expected = df.sort_values(['a0', 'a1'])
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test inplace
        mdf = DataFrame(df)
        mdf.sort_values('a0', inplace=True)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        df.sort_values('a0', inplace=True)
        pd.testing.assert_frame_equal(result, df)

        # test unknown shape: filtering yields chunks of unknown length
        df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
        mdf = DataFrame(df, chunk_size=4)
        filtered = mdf[mdf['a'] > 2]
        result = self.executor.execute_dataframe(filtered.sort_values(by='b'),
                                                 concat=True)[0]
        pd.testing.assert_frame_equal(result, df[df['a'] > 2].sort_values(by='b'))

        # test Series.sort_values
        raw = pd.Series(np.random.rand(10))
        series = Series(raw)
        result = self.executor.execute_dataframe(series.sort_values(),
                                                 concat=True)[0]
        expected = raw.sort_values()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=3)
        result = self.executor.execute_dataframe(series.sort_values(),
                                                 concat=True)[0]
        expected = raw.sort_values()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=2)
        result = self.executor.execute_dataframe(
            series.sort_values(ascending=False), concat=True)[0]
        expected = raw.sort_values(ascending=False)
        pd.testing.assert_series_equal(result, expected)
class Test(TestBase):
    """Execution tests for exporting Mars dataframes: to_csv, to_sql,
    to_vineyard."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testToCSVExecution(self):
        """Round-trip DataFrame/Series through to_csv, both as a single
        output file and as one file per chunk (via a '*' wildcard path)."""
        index = pd.RangeIndex(100, 0, -1, name='index')
        raw = pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100)
            }, index=index)
        df = DataFrame(raw, chunk_size=33)  # 4 row chunks: 33+33+33+1

        with tempfile.TemporaryDirectory() as base_path:
            # DATAFRAME TESTS
            # test one file with dataframe
            path = os.path.join(base_path, 'out.csv')
            r = df.to_csv(path)
            self.executor.execute_dataframe(r)

            result = pd.read_csv(path, dtype=raw.dtypes.to_dict())
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw)

            # test multi files with dataframe: '*' expands to the chunk ordinal
            path = os.path.join(base_path, 'out-*.csv')
            r = df.to_csv(path)
            self.executor.execute_dataframe(r)

            dfs = [
                pd.read_csv(os.path.join(base_path, 'out-{}.csv'.format(i)),
                            dtype=raw.dtypes.to_dict()) for i in range(4)
            ]
            result = pd.concat(dfs, axis=0)
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw)
            # second file must hold exactly the second chunk's rows
            pd.testing.assert_frame_equal(dfs[1].set_index('index'),
                                          raw.iloc[33:66])

            # SERIES TESTS
            series = md.Series(raw.col1, chunk_size=33)

            # test one file with series
            path = os.path.join(base_path, 'out.csv')
            r = series.to_csv(path)
            self.executor.execute_dataframe(r)

            result = pd.read_csv(path, dtype=raw.dtypes.to_dict())
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw.col1.to_frame())

            # test multi files with series
            path = os.path.join(base_path, 'out-*.csv')
            r = series.to_csv(path)
            self.executor.execute_dataframe(r)

            dfs = [
                pd.read_csv(os.path.join(base_path, 'out-{}.csv'.format(i)),
                            dtype=raw.dtypes.to_dict()) for i in range(4)
            ]
            result = pd.concat(dfs, axis=0)
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw.col1.to_frame())
            pd.testing.assert_frame_equal(dfs[1].set_index('index'),
                                          raw.col1.to_frame().iloc[33:66])

    @unittest.skipIf(sqlalchemy is None, 'sqlalchemy not installed')
    def testToSQL(self):
        """Write DataFrame/Series into SQLite via to_sql (engine, uri and
        connection variants) and read them back with pandas."""
        index = pd.RangeIndex(100, 0, -1, name='index')
        raw = pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100).astype('int64'),
            }, index=index)

        with tempfile.TemporaryDirectory() as d:
            table_name1 = 'test_table'
            table_name2 = 'test_table2'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')

            engine = sqlalchemy.create_engine(uri)

            # test write dataframe
            df = DataFrame(raw, chunk_size=33)
            r = df.to_sql(table_name1, con=engine)
            self.executor.execute_dataframe(r)

            # index is descending, so sort to compare against raw
            written = pd.read_sql(table_name1, con=engine, index_col='index') \
                .sort_index(ascending=False)
            pd.testing.assert_frame_equal(raw, written)

            # test write with existing table: default if_exists raises
            with self.assertRaises(ValueError):
                df.to_sql(table_name1, con=uri).execute()

            # test write series
            series = md.Series(raw.col1, chunk_size=33)
            with engine.connect() as conn:
                r = series.to_sql(table_name2, con=conn)
                self.executor.execute_dataframe(r)

            written = pd.read_sql(table_name2, con=engine, index_col='index') \
                .sort_index(ascending=False)
            pd.testing.assert_frame_equal(raw.col1.to_frame(), written)

    @unittest.skipIf(vineyard is None, 'vineyard not installed')
    @mock.patch('webbrowser.open_new_tab', new=lambda *_, **__: True)
    def testToVineyard(self):
        """Round-trip a DataFrame through vineyard shared storage, under a
        local session, a cluster session, and a web session."""
        def testWithGivenSession(session):
            with option_context({'vineyard.socket': '/tmp/vineyard.sock'}):
                df1 = DataFrame(pd.DataFrame(np.arange(12).reshape(3, 4),
                                             columns=['a', 'b', 'c', 'd']),
                                chunk_size=2)
                object_id = df1.to_vineyard().execute(session=session)
                df2 = md.from_vineyard(object_id)

                df1_value = df1.execute(session=session)
                df2_value = df2.execute(session=session)
                # row order after the round trip is not guaranteed, so
                # compare with indexes dropped
                pd.testing.assert_frame_equal(df1_value.reset_index(drop=True),
                                              df2_value.reset_index(drop=True))

        with new_session().as_default() as session:
            testWithGivenSession(session)

        with new_cluster(scheduler_n_process=2, worker_n_process=2,
                         shared_memory='20M', web=True) as cluster:
            with new_session(cluster.endpoint).as_default() as session:
                testWithGivenSession(session)
            with new_session(
                    'http://' + cluster._web_endpoint).as_default() as web_session:
                testWithGivenSession(web_session)
class Test(TestBase):
    """Execution tests for dataframe data sources: from pandas objects,
    from tensors, from structured records, and read_csv."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testFromPandasDataFrameExecution(self):
        """A chunked tileable built from a pandas DataFrame (MultiIndex rows)
        must concat back to the original frame."""
        pdf = pd.DataFrame(np.random.rand(20, 30),
                           index=[np.arange(20), np.arange(20, 0, -1)])
        df = from_pandas_df(pdf, chunk_size=(13, 21))
        result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(pdf, result)

    def testFromPandasSeriesExecution(self):
        """Same round trip for a pandas Series with a MultiIndex."""
        ps = pd.Series(np.random.rand(20),
                       index=[np.arange(20), np.arange(20, 0, -1)], name='a')
        series = from_pandas_series(ps, chunk_size=13)
        result = self.executor.execute_dataframe(series, concat=True)[0]
        pd.testing.assert_series_equal(ps, result)

    def testInitializerExecution(self):
        """md.DataFrame / md.Series constructors accept pandas objects
        directly and behave like the from_pandas_* helpers."""
        pdf = pd.DataFrame(np.random.rand(20, 30),
                           index=[np.arange(20), np.arange(20, 0, -1)])
        df = md.DataFrame(pdf, chunk_size=(15, 10))
        result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(pdf, result)

        ps = pd.Series(np.random.rand(20),
                       index=[np.arange(20), np.arange(20, 0, -1)], name='a')
        series = md.Series(ps, chunk_size=7)
        result = self.executor.execute_dataframe(series, concat=True)[0]
        pd.testing.assert_series_equal(ps, result)

    def testSeriesFromTensor(self):
        """md.Series built from Mars tensors, with and without chunking,
        with an optional name and a tensor-valued index."""
        data = np.random.rand(10)
        series = md.Series(mt.tensor(data), name='a')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(data, name='a'))

        series = md.Series(mt.tensor(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(data))

        series = md.Series(mt.ones((10, ), chunk_size=4))
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(np.ones(10, )))

        # index supplied as a tensor with a chunking that differs from data's
        index_data = np.random.rand(10)
        series = md.Series(mt.tensor(data, chunk_size=3), name='a',
                           index=mt.tensor(index_data, chunk_size=4))
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(data, name='a', index=index_data))

    def testFromTensorExecution(self):
        """dataframe_from_tensor / dataframe_from_1d_tensors with defaults,
        explicit index/columns (including tensor-valued index), 1-d input,
        and identical chunks."""
        tensor = mt.random.rand(10, 10, chunk_size=5)
        df = dataframe_from_tensor(tensor)
        tensor_res = self.executor.execute_tensor(tensor, concat=True)[0]
        pdf_expected = pd.DataFrame(tensor_res)
        df_result = self.executor.execute_dataframe(df, concat=True)[0]
        # default index/columns are RangeIndex over the tensor's shape
        pd.testing.assert_index_equal(df_result.index, pd.RangeIndex(0, 10))
        pd.testing.assert_index_equal(df_result.columns, pd.RangeIndex(0, 10))
        pd.testing.assert_frame_equal(df_result, pdf_expected)

        # test converted with specified index_value and columns
        tensor2 = mt.random.rand(2, 2, chunk_size=1)
        df2 = dataframe_from_tensor(tensor2, index=pd.Index(['a', 'b']),
                                    columns=pd.Index([3, 4]))
        df_result = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_index_equal(df_result.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(df_result.columns, pd.Index([3, 4]))

        # test converted from 1-d tensor
        tensor3 = mt.array([1, 2, 3])
        df3 = dataframe_from_tensor(tensor3)
        result3 = self.executor.execute_dataframe(df3, concat=True)[0]
        pdf_expected = pd.DataFrame(np.array([1, 2, 3]))
        pd.testing.assert_frame_equal(pdf_expected, result3)

        # test converted from identical chunks
        tensor4 = mt.ones((10, 10), chunk_size=3)
        df4 = dataframe_from_tensor(tensor4)
        result4 = self.executor.execute_dataframe(df4, concat=True)[0]
        pdf_expected = pd.DataFrame(
            self.executor.execute_tensor(tensor4, concat=True)[0])
        pd.testing.assert_frame_equal(pdf_expected, result4)

        # from tensor with given index
        tensor5 = mt.ones((10, 10), chunk_size=3)
        df5 = dataframe_from_tensor(tensor5, index=np.arange(0, 20, 2))
        result5 = self.executor.execute_dataframe(df5, concat=True)[0]
        pdf_expected = pd.DataFrame(self.executor.execute_tensor(
            tensor5, concat=True)[0], index=np.arange(0, 20, 2))
        pd.testing.assert_frame_equal(pdf_expected, result5)

        # from tensor with given index that is a tensor
        raw7 = np.random.rand(10, 10)
        tensor7 = mt.tensor(raw7, chunk_size=3)
        index_raw7 = np.random.rand(10)
        index7 = mt.tensor(index_raw7, chunk_size=4)
        df7 = dataframe_from_tensor(tensor7, index=index7)
        result7 = self.executor.execute_dataframe(df7, concat=True)[0]
        pdf_expected = pd.DataFrame(raw7, index=index_raw7)
        pd.testing.assert_frame_equal(pdf_expected, result7)

        # from tensor with given columns
        tensor6 = mt.ones((10, 10), chunk_size=3)
        df6 = dataframe_from_tensor(tensor6, columns=list('abcdefghij'))
        result6 = self.executor.execute_dataframe(df6, concat=True)[0]
        pdf_expected = pd.DataFrame(self.executor.execute_tensor(
            tensor6, concat=True)[0], columns=list('abcdefghij'))
        pd.testing.assert_frame_equal(pdf_expected, result6)

        # from 1d tensors: mixed float / int / string columns
        raws8 = [('a', np.random.rand(8)),
                 ('b', np.random.randint(10, size=8)),
                 ('c', [
                     ''.join(np.random.choice(list(printable), size=6))
                     for _ in range(8)
                 ])]
        tensors8 = [mt.tensor(r[1], chunk_size=3) for r in raws8]
        df8 = dataframe_from_1d_tensors(tensors8,
                                        columns=[r[0] for r in raws8])
        result = self.executor.execute_dataframe(df8, concat=True)[0]
        pdf_expected = pd.DataFrame(OrderedDict(raws8))
        pd.testing.assert_frame_equal(result, pdf_expected)

        # from 1d tensors and specify index with a tensor
        index_raw9 = np.random.rand(8)
        index9 = mt.tensor(index_raw9, chunk_size=4)
        df9 = dataframe_from_1d_tensors(tensors8,
                                        columns=[r[0] for r in raws8],
                                        index=index9)
        result = self.executor.execute_dataframe(df9, concat=True)[0]
        pdf_expected = pd.DataFrame(OrderedDict(raws8), index=index_raw9)
        pd.testing.assert_frame_equal(result, pdf_expected)

    def testFromRecordsExecution(self):
        """from_records on a structured ndarray, given either as a Mars
        tensor or as a plain numpy array."""
        dtype = np.dtype([('x', 'int'), ('y', 'double'), ('z', '<U16')])

        ndarr = np.ones((10, ), dtype=dtype)
        pdf_expected = pd.DataFrame.from_records(ndarr, index=pd.RangeIndex(10))

        # from structured array of mars
        tensor = mt.ones((10, ), dtype=dtype, chunk_size=3)
        df1 = from_records(tensor)
        df1_result = self.executor.execute_dataframe(df1, concat=True)[0]
        pd.testing.assert_frame_equal(df1_result, pdf_expected)

        # from structured array of numpy
        df2 = from_records(ndarr)
        df2_result = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_frame_equal(df2_result, pdf_expected)

    def testReadCSVExecution(self):
        """md.read_csv vs pd.read_csv: basic frame, custom separator,
        missing values, datetime index, gzip compression, a list of files,
        and wildcard paths — each with and without chunk_bytes splitting."""
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path,
                                                               index_col=0,
                                                               chunk_bytes=10),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # test sep
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path, sep=';')

            pdf = pd.read_csv(file_path, sep=';', index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path, sep=';',
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path, sep=';',
                                                               index_col=0,
                                                               chunk_bytes=10),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # test missing value
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            df = pd.DataFrame({
                'c1': [np.nan, 'a', 'b', 'c'],
                'c2': [1, 2, 3, np.nan],
                'c3': [np.nan, np.nan, 3.4, 2.2]
            })
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path,
                                                               index_col=0,
                                                               chunk_bytes=12),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # datetime index
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                }, index=index)
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(
                file_path, index_col=0, chunk_bytes=100), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # test compression
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.gzip')
        try:
            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                }, index=index)
            df.to_csv(file_path, compression='gzip')

            pdf = pd.read_csv(file_path, compression='gzip', index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(
                file_path, compression='gzip', index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(
                file_path, compression='gzip', index_col=0, chunk_bytes='1k'),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # test multiply files
        tempdir = tempfile.mkdtemp()
        try:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, 'test{}.csv'.format(i)) for i in range(3)
            ]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            mdf = self.executor.execute_dataframe(md.read_csv(file_paths,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_paths,
                                                               index_col=0,
                                                               chunk_bytes=50),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # test wildcards in path
        tempdir = tempfile.mkdtemp()
        try:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, 'test{}.csv'.format(i)) for i in range(3)
            ]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            # As we can not guarantee the order in which these files are processed,
            # the result may not keep the original order.
            mdf = self.executor.execute_dataframe(md.read_csv(
                '{}/*.csv'.format(tempdir), index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf.sort_index())

            mdf2 = self.executor.execute_dataframe(md.read_csv(
                '{}/*.csv'.format(tempdir), index_col=0, chunk_bytes=50),
                concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2.sort_index())
        finally:
            shutil.rmtree(tempdir)

    @require_cudf
    def testReadCSVGPUExecution(self):
        """md.read_csv with gpu=True returns cudf frames; compare their
        to_pandas() conversions against pandas."""
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            df = pd.DataFrame({
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100)
            })
            df.to_csv(file_path, index=False)

            pdf = pd.read_csv(file_path)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              gpu=True),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(
                pdf.reset_index(drop=True),
                mdf.to_pandas().reset_index(drop=True))

            mdf2 = self.executor.execute_dataframe(md.read_csv(
                file_path, gpu=True, chunk_bytes=200), concat=True)[0]
            pd.testing.assert_frame_equal(
                pdf.reset_index(drop=True),
                mdf2.to_pandas().reset_index(drop=True))
        finally:
            shutil.rmtree(tempdir)
class Test(unittest.TestCase):
    """Execution tests for Series/DataFrame quantile()."""

    def setUp(self) -> None:
        super().setUp()
        self.executor = ExecutorForTest('numpy')

    def testSeriesQuantileExecution(self):
        """quantile() on a chunked Series: scalar q, list q, interpolation,
        and q given as a Mars tensor (executed under a LocalContext)."""
        raw = pd.Series(np.random.rand(10), name='a')
        a = Series(raw, chunk_size=3)

        # q = 0.5, scalar
        r = a.quantile()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile()
        self.assertEqual(result, expected)

        # q is a list
        r = a.quantile([0.3, 0.7])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile([0.3, 0.7])
        pd.testing.assert_series_equal(result, expected)

        # test interpolation
        r = a.quantile([0.3, 0.7], interpolation='midpoint')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile([0.3, 0.7], interpolation='midpoint')
        pd.testing.assert_series_equal(result, expected)

        this = self  # capture the test case for the nested mock session

        class MockSession:
            def __init__(self):
                self.executor = this.executor

        ctx = LocalContext(MockSession())
        executor = ExecutorForTest('numpy', storage=ctx)
        with ctx:
            q = tensor([0.3, 0.7])

            # q is a tensor
            r = a.quantile(q)
            result = executor.execute_dataframes([r])[0]
            expected = raw.quantile([0.3, 0.7])
            pd.testing.assert_series_equal(result, expected)

    def testDataFrameQuantileExecution(self):
        """quantile() on a chunked DataFrame over mixed dtypes: both axes,
        scalar/list/tensor q, interpolation, and numeric_only=False."""
        raw = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [np.random.bytes(10) for _ in range(10)],
                'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
                'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
            }, index=pd.RangeIndex(1, 11))
        df = DataFrame(raw, chunk_size=3)

        # q = 0.5, axis = 0, series
        r = df.quantile()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile()
        pd.testing.assert_series_equal(result, expected)

        # q = 0.5, axis = 1, series
        r = df.quantile(axis=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile(axis=1)
        pd.testing.assert_series_equal(result, expected)

        # q is a list, axis = 0, dataframe
        r = df.quantile([0.3, 0.7])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile([0.3, 0.7])
        pd.testing.assert_frame_equal(result, expected)

        # q is a list, axis = 1, dataframe
        r = df.quantile([0.3, 0.7], axis=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile([0.3, 0.7], axis=1)
        pd.testing.assert_frame_equal(result, expected)

        # test interpolation
        r = df.quantile([0.3, 0.7], interpolation='midpoint')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile([0.3, 0.7], interpolation='midpoint')
        pd.testing.assert_frame_equal(result, expected)

        this = self

        class MockSession:
            def __init__(self):
                self.executor = this.executor

        ctx = LocalContext(MockSession())
        executor = ExecutorForTest('numpy', storage=ctx)
        with ctx:
            q = tensor([0.3, 0.7])

            # q is a tensor
            r = df.quantile(q)
            result = executor.execute_dataframes([r])[0]
            expected = raw.quantile([0.3, 0.7])
            pd.testing.assert_frame_equal(result, expected)

        # test numeric_only
        raw2 = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
            }, index=pd.RangeIndex(1, 11))
        df2 = DataFrame(raw2, chunk_size=3)

        r = df2.quantile([0.3, 0.7], numeric_only=False)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw2.quantile([0.3, 0.7], numeric_only=False)
        pd.testing.assert_frame_equal(result, expected)

        r = df2.quantile(numeric_only=False)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw2.quantile(numeric_only=False)
        pd.testing.assert_series_equal(result, expected)
class TestReduction(TestBase):
    """Parameterized base for reduction tests.

    Subclasses are expected to provide ``self.func_name`` (the reduction to
    call, e.g. 'sum') and ``self.has_min_count`` (whether the reduction
    accepts a ``min_count`` keyword) — both referenced below but defined
    outside this class.
    """

    def setUp(self):
        self.executor = ExecutorForTest()

    def compute(self, data, **kwargs):
        # dispatch the configured reduction on either a pandas object or a
        # Mars tileable
        return getattr(data, self.func_name)(**kwargs)

    def testSeriesReduction(self):
        """Reduce a Series with various chunk sizes; also NaN data with
        skipna and (optionally) min_count."""
        data = pd.Series(np.random.rand(20),
                         index=[str(i) for i in range(20)], name='a')
        reduction_df1 = self.compute(from_pandas_series(data))
        self.assertEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_series(data, chunk_size=6))
        self.assertAlmostEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_series(data, chunk_size=3))
        self.assertAlmostEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        reduction_df4 = self.compute(from_pandas_series(data, chunk_size=4),
                                     axis='index')
        self.assertAlmostEqual(
            self.compute(data, axis='index'),
            self.executor.execute_dataframe(reduction_df4, concat=True)[0])

        data = pd.Series(np.random.rand(20), name='a')
        data[0] = 0.1  # make sure not all elements are NAN
        data[data > 0.5] = np.nan
        reduction_df1 = self.compute(from_pandas_series(data, chunk_size=3))
        self.assertAlmostEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        # with NaNs present and skipna=False the result must be NaN
        reduction_df2 = self.compute(from_pandas_series(data, chunk_size=3),
                                     skipna=False)
        self.assertTrue(
            np.isnan(
                self.executor.execute_dataframe(reduction_df2, concat=True)[0]))

        if self.has_min_count:
            reduction_df3 = self.compute(from_pandas_series(data, chunk_size=3),
                                         skipna=False, min_count=2)
            self.assertTrue(
                np.isnan(
                    self.executor.execute_dataframe(reduction_df3,
                                                    concat=True)[0]))

            reduction_df4 = self.compute(from_pandas_series(data, chunk_size=3),
                                         min_count=1)
            self.assertAlmostEqual(
                self.compute(data, min_count=1),
                self.executor.execute_dataframe(reduction_df4, concat=True)[0])

            # min_count larger than the series length forces NaN
            reduction_df5 = self.compute(from_pandas_series(data, chunk_size=3),
                                         min_count=21)
            self.assertTrue(
                np.isnan(
                    self.executor.execute_dataframe(reduction_df5,
                                                    concat=True)[0]))

    def testDataFrameReduction(self):
        """Reduce a DataFrame along both axes; also NaN data, min_count
        (when supported) and numeric_only with mixed dtypes."""
        data = pd.DataFrame(np.random.rand(20, 10))
        reduction_df1 = self.compute(from_pandas_df(data))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_df(data, chunk_size=6),
                                     axis='index', numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        reduction_df4 = self.compute(from_pandas_df(data, chunk_size=3), axis=1)
        pd.testing.assert_series_equal(
            self.compute(data, axis=1),
            self.executor.execute_dataframe(reduction_df4, concat=True)[0])

        # test null
        np_data = np.random.rand(20, 10)
        np_data[np_data > 0.6] = np.nan
        data = pd.DataFrame(np_data)

        reduction_df1 = self.compute(from_pandas_df(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3),
                                     skipna=False)
        pd.testing.assert_series_equal(
            self.compute(data, skipna=False),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        # NOTE(review): this repeats the previous skipna=False assertion
        # verbatim — likely redundant; confirm before removing.
        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3),
                                     skipna=False)
        pd.testing.assert_series_equal(
            self.compute(data, skipna=False),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        if self.has_min_count:
            reduction_df3 = self.compute(from_pandas_df(data, chunk_size=3),
                                         min_count=15)
            pd.testing.assert_series_equal(
                self.compute(data, min_count=15),
                self.executor.execute_dataframe(reduction_df3, concat=True)[0])

            reduction_df4 = self.compute(from_pandas_df(data, chunk_size=3),
                                         min_count=3)
            pd.testing.assert_series_equal(
                self.compute(data, min_count=3),
                self.executor.execute_dataframe(reduction_df4, concat=True)[0])

            reduction_df5 = self.compute(from_pandas_df(data, chunk_size=3),
                                         axis=1, min_count=3)
            pd.testing.assert_series_equal(
                self.compute(data, axis=1, min_count=3),
                self.executor.execute_dataframe(reduction_df5, concat=True)[0])

            reduction_df5 = self.compute(from_pandas_df(data, chunk_size=3),
                                         axis=1, min_count=8)
            pd.testing.assert_series_equal(
                self.compute(data, axis=1, min_count=8),
                self.executor.execute_dataframe(reduction_df5, concat=True)[0])

        # test numeric_only
        data = pd.DataFrame(np.random.rand(10, 10),
                            index=np.random.randint(-100, 100, size=(10, )),
                            columns=[np.random.bytes(10) for _ in range(10)])
        reduction_df1 = self.compute(from_pandas_df(data, chunk_size=2))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=6),
                                     axis='index', numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_df(data, chunk_size=3),
                                     axis='columns')
        pd.testing.assert_series_equal(
            self.compute(data, axis='columns'),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        # mixed numeric / string / bool frame: numeric_only must drop the
        # non-numeric columns
        data_dict = dict((str(i), np.random.rand(10)) for i in range(10))
        data_dict['string'] = [str(i) for i in range(10)]
        data_dict['bool'] = np.random.choice([True, False], (10, ))
        data = pd.DataFrame(data_dict)
        reduction_df = self.compute(from_pandas_df(data, chunk_size=3),
                                    axis='index', numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df, concat=True)[0])
class TestCount(TestBase):
    """Execution tests for count / nunique / unique reductions, compared
    against pandas over one-chunk and multi-chunk inputs."""

    def setUp(self):
        self.executor = ExecutorForTest()

    def testSeriesCount(self):
        """Series.count must ignore NaNs regardless of chunking."""
        array = np.random.rand(10)
        array[[2, 7, 9]] = np.nan
        data = pd.Series(array)

        series = from_pandas_series(data)
        result = self.executor.execute_dataframe(series.count(), concat=True)[0]
        expected = data.count()
        self.assertEqual(result, expected)

        series2 = from_pandas_series(data, chunk_size=1)
        result = self.executor.execute_dataframe(series2.count(), concat=True)[0]
        expected = data.count()
        self.assertEqual(result, expected)

        series2 = from_pandas_series(data, chunk_size=3)
        result = self.executor.execute_dataframe(series2.count(), concat=True)[0]
        expected = data.count()
        self.assertEqual(result, expected)

    def testDataFrameCount(self):
        """DataFrame.count over both axes, with and without numeric_only."""
        data = pd.DataFrame({
            "Person": ["John", "Myla", "Lewis", "John", "Myla"],
            "Age": [24., np.nan, 21., 33, 26],
            "Single": [False, True, True, True, False]
        })

        df = from_pandas_df(data)
        result = self.executor.execute_dataframe(df.count(), concat=True)[0]
        expected = data.count()
        pd.testing.assert_series_equal(result, expected)

        result = self.executor.execute_dataframe(df.count(axis='columns'),
                                                 concat=True)[0]
        expected = data.count(axis='columns')
        pd.testing.assert_series_equal(result, expected)

        df2 = from_pandas_df(data, chunk_size=2)
        result = self.executor.execute_dataframe(df2.count(), concat=True)[0]
        expected = data.count()
        pd.testing.assert_series_equal(result, expected)

        result = self.executor.execute_dataframe(df2.count(axis='columns'),
                                                 concat=True)[0]
        expected = data.count(axis='columns')
        pd.testing.assert_series_equal(result, expected)

        df3 = from_pandas_df(data, chunk_size=3)
        result = self.executor.execute_dataframe(df3.count(numeric_only=True),
                                                 concat=True)[0]
        expected = data.count(numeric_only=True)
        pd.testing.assert_series_equal(result, expected)

        result = self.executor.execute_dataframe(
            df3.count(axis='columns', numeric_only=True), concat=True)[0]
        expected = data.count(axis='columns', numeric_only=True)
        pd.testing.assert_series_equal(result, expected)

    def testNunique(self):
        """nunique for Series and DataFrame over both axes and dropna modes."""
        data1 = pd.Series(np.random.randint(0, 5, size=(20, )))

        series = from_pandas_series(data1)
        result = self.executor.execute_dataframe(series.nunique(), concat=True)[0]
        expected = data1.nunique()
        self.assertEqual(result, expected)

        series = from_pandas_series(data1, chunk_size=6)
        result = self.executor.execute_dataframe(series.nunique(), concat=True)[0]
        expected = data1.nunique()
        self.assertEqual(result, expected)

        # test dropna
        data2 = data1.copy()
        data2[[2, 9, 18]] = np.nan

        series = from_pandas_series(data2)
        result = self.executor.execute_dataframe(series.nunique(), concat=True)[0]
        expected = data2.nunique()
        self.assertEqual(result, expected)

        series = from_pandas_series(data2, chunk_size=3)
        result = self.executor.execute_dataframe(series.nunique(dropna=False),
                                                 concat=True)[0]
        expected = data2.nunique(dropna=False)
        self.assertEqual(result, expected)

        # test dataframe
        data1 = pd.DataFrame(np.random.randint(0, 6, size=(20, 20)),
                             columns=['c' + str(i) for i in range(20)])
        df = from_pandas_df(data1)
        result = self.executor.execute_dataframe(df.nunique(), concat=True)[0]
        expected = data1.nunique()
        pd.testing.assert_series_equal(result, expected)

        df = from_pandas_df(data1, chunk_size=6)
        result = self.executor.execute_dataframe(df.nunique(), concat=True)[0]
        expected = data1.nunique()
        pd.testing.assert_series_equal(result, expected)

        df = from_pandas_df(data1)
        result = self.executor.execute_dataframe(df.nunique(axis=1), concat=True)[0]
        expected = data1.nunique(axis=1)
        pd.testing.assert_series_equal(result, expected)

        df = from_pandas_df(data1, chunk_size=3)
        result = self.executor.execute_dataframe(df.nunique(axis=1), concat=True)[0]
        expected = data1.nunique(axis=1)
        pd.testing.assert_series_equal(result, expected)

        # test dropna
        data2 = data1.copy()
        data2.iloc[[2, 9, 18], [2, 9, 18]] = np.nan

        df = from_pandas_df(data2)
        result = self.executor.execute_dataframe(df.nunique(), concat=True)[0]
        expected = data2.nunique()
        pd.testing.assert_series_equal(result, expected)

        df = from_pandas_df(data2, chunk_size=3)
        result = self.executor.execute_dataframe(df.nunique(dropna=False),
                                                 concat=True)[0]
        expected = data2.nunique(dropna=False)
        pd.testing.assert_series_equal(result, expected)
        # NOTE(review): an exact duplicate of the chunked nunique(axis=1)
        # check (already executed above) was removed here.

    def testUseArrowDtypeNUnique(self):
        """nunique with arrow-backed string columns enabled via options."""
        with option_context({'dataframe.use_arrow_dtype': True,
                             'combine_size': 2}):
            rs = np.random.RandomState(0)
            data1 = pd.DataFrame({
                'a': rs.random(10),
                'b': [f's{i}' for i in rs.randint(100, size=10)]
            })
            data1['c'] = data1['b'].copy()
            data1['d'] = data1['b'].copy()
            data1['e'] = data1['b'].copy()

            df = from_pandas_df(data1, chunk_size=(3, 2))
            r = df.nunique(axis=0)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data1.nunique(axis=0)
            pd.testing.assert_series_equal(result, expected)

            r = df.nunique(axis=1)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data1.nunique(axis=1)
            pd.testing.assert_series_equal(result, expected)

    def testUnique(self):
        """Series.unique preserves first-seen order across chunks, including
        for datetime data."""
        data1 = pd.Series(np.random.randint(0, 5, size=(20, )))

        series = from_pandas_series(data1)
        result = self.executor.execute_dataframe(series.unique(), concat=True)[0]
        expected = data1.unique()
        np.testing.assert_array_equal(result, expected)

        series = from_pandas_series(data1, chunk_size=6)
        result = self.executor.execute_dataframe(series.unique(), concat=True)[0]
        expected = data1.unique()
        np.testing.assert_array_equal(result, expected)

        data2 = pd.Series([pd.Timestamp('20200101'), ] * 5
                          + [pd.Timestamp('20200202')]
                          + [pd.Timestamp('20020101')] * 9)
        series = from_pandas_series(data2)
        result = self.executor.execute_dataframe(series.unique(), concat=True)[0]
        expected = data2.unique()
        np.testing.assert_array_equal(result, expected)

        series = from_pandas_series(data2, chunk_size=6)
        result = self.executor.execute_dataframe(series.unique(), concat=True)[0]
        expected = data2.unique()
        np.testing.assert_array_equal(result, expected)
class Test(TestBase):
    """Execution tests for DataFrame/Series construction (from pandas,
    tensors, records) and for ``read_csv`` / ``read_sql_table``."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testFromPandasDataFrameExecution(self):
        """Round-trip a MultiIndex DataFrame through mars chunks."""
        pdf = pd.DataFrame(np.random.rand(20, 30),
                           index=[np.arange(20), np.arange(20, 0, -1)])
        df = from_pandas_df(pdf, chunk_size=(13, 21))
        result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(pdf, result)

    def testFromPandasSeriesExecution(self):
        """Round-trip a MultiIndex Series through mars chunks."""
        ps = pd.Series(np.random.rand(20),
                       index=[np.arange(20), np.arange(20, 0, -1)], name='a')
        series = from_pandas_series(ps, chunk_size=13)
        result = self.executor.execute_dataframe(series, concat=True)[0]
        pd.testing.assert_series_equal(ps, result)

    def testInitializerExecution(self):
        """md.DataFrame / md.Series constructors accept pandas objects."""
        pdf = pd.DataFrame(np.random.rand(20, 30),
                           index=[np.arange(20), np.arange(20, 0, -1)])
        df = md.DataFrame(pdf, chunk_size=(15, 10))
        result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(pdf, result)

        ps = pd.Series(np.random.rand(20),
                       index=[np.arange(20), np.arange(20, 0, -1)], name='a')
        series = md.Series(ps, chunk_size=7)
        result = self.executor.execute_dataframe(series, concat=True)[0]
        pd.testing.assert_series_equal(ps, result)

    def testSeriesFromTensor(self):
        """md.Series built from mars tensors, with and without a tensor index."""
        data = np.random.rand(10)
        series = md.Series(mt.tensor(data), name='a')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(data, name='a'))

        series = md.Series(mt.tensor(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(data))

        series = md.Series(mt.ones((10, ), chunk_size=4))
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(np.ones(10, )))

        # index itself supplied as a (differently chunked) tensor
        index_data = np.random.rand(10)
        series = md.Series(mt.tensor(data, chunk_size=3), name='a',
                           index=mt.tensor(index_data, chunk_size=4))
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(data, name='a', index=index_data))

    def testFromTensorExecution(self):
        """dataframe_from_tensor / dataframe_from_1d_tensors over 1-d and
        2-d tensors, custom indexes/columns, and tensor-valued indexes."""
        tensor = mt.random.rand(10, 10, chunk_size=5)
        df = dataframe_from_tensor(tensor)
        tensor_res = self.executor.execute_tensor(tensor, concat=True)[0]
        pdf_expected = pd.DataFrame(tensor_res)
        df_result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_index_equal(df_result.index, pd.RangeIndex(0, 10))
        pd.testing.assert_index_equal(df_result.columns, pd.RangeIndex(0, 10))
        pd.testing.assert_frame_equal(df_result, pdf_expected)

        # test converted with specified index_value and columns
        tensor2 = mt.random.rand(2, 2, chunk_size=1)
        df2 = dataframe_from_tensor(tensor2, index=pd.Index(['a', 'b']),
                                    columns=pd.Index([3, 4]))
        df_result = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_index_equal(df_result.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(df_result.columns, pd.Index([3, 4]))

        # test converted from 1-d tensor
        tensor3 = mt.array([1, 2, 3])
        df3 = dataframe_from_tensor(tensor3)
        result3 = self.executor.execute_dataframe(df3, concat=True)[0]
        pdf_expected = pd.DataFrame(np.array([1, 2, 3]))
        pd.testing.assert_frame_equal(pdf_expected, result3)

        # test converted from identical chunks
        tensor4 = mt.ones((10, 10), chunk_size=3)
        df4 = dataframe_from_tensor(tensor4)
        result4 = self.executor.execute_dataframe(df4, concat=True)[0]
        pdf_expected = pd.DataFrame(
            self.executor.execute_tensor(tensor4, concat=True)[0])
        pd.testing.assert_frame_equal(pdf_expected, result4)

        # from tensor with given index
        tensor5 = mt.ones((10, 10), chunk_size=3)
        df5 = dataframe_from_tensor(tensor5, index=np.arange(0, 20, 2))
        result5 = self.executor.execute_dataframe(df5, concat=True)[0]
        pdf_expected = pd.DataFrame(
            self.executor.execute_tensor(tensor5, concat=True)[0],
            index=np.arange(0, 20, 2))
        pd.testing.assert_frame_equal(pdf_expected, result5)

        # from tensor with given index that is a tensor
        raw7 = np.random.rand(10, 10)
        tensor7 = mt.tensor(raw7, chunk_size=3)
        index_raw7 = np.random.rand(10)
        index7 = mt.tensor(index_raw7, chunk_size=4)
        df7 = dataframe_from_tensor(tensor7, index=index7)
        result7 = self.executor.execute_dataframe(df7, concat=True)[0]
        pdf_expected = pd.DataFrame(raw7, index=index_raw7)
        pd.testing.assert_frame_equal(pdf_expected, result7)

        # from tensor with given columns
        tensor6 = mt.ones((10, 10), chunk_size=3)
        df6 = dataframe_from_tensor(tensor6, columns=list('abcdefghij'))
        result6 = self.executor.execute_dataframe(df6, concat=True)[0]
        pdf_expected = pd.DataFrame(
            self.executor.execute_tensor(tensor6, concat=True)[0],
            columns=list('abcdefghij'))
        pd.testing.assert_frame_equal(pdf_expected, result6)

        # from 1d tensors
        raws8 = [('a', np.random.rand(8)),
                 ('b', np.random.randint(10, size=8)),
                 ('c', [
                     ''.join(np.random.choice(list(printable), size=6))
                     for _ in range(8)
                 ])]
        tensors8 = [mt.tensor(r[1], chunk_size=3) for r in raws8]
        df8 = dataframe_from_1d_tensors(tensors8,
                                        columns=[r[0] for r in raws8])
        result = self.executor.execute_dataframe(df8, concat=True)[0]
        pdf_expected = pd.DataFrame(OrderedDict(raws8))
        pd.testing.assert_frame_equal(result, pdf_expected)

        # from 1d tensors and specify index with a tensor
        index_raw9 = np.random.rand(8)
        index9 = mt.tensor(index_raw9, chunk_size=4)
        df9 = dataframe_from_1d_tensors(tensors8,
                                        columns=[r[0] for r in raws8],
                                        index=index9)
        result = self.executor.execute_dataframe(df9, concat=True)[0]
        pdf_expected = pd.DataFrame(OrderedDict(raws8), index=index_raw9)
        pd.testing.assert_frame_equal(result, pdf_expected)

    def testFromRecordsExecution(self):
        """from_records over structured arrays (mars tensor and raw numpy)."""
        dtype = np.dtype([('x', 'int'), ('y', 'double'), ('z', '<U16')])
        ndarr = np.ones((10, ), dtype=dtype)
        pdf_expected = pd.DataFrame.from_records(ndarr, index=pd.RangeIndex(10))

        # from structured array of mars
        tensor = mt.ones((10, ), dtype=dtype, chunk_size=3)
        df1 = from_records(tensor)
        df1_result = self.executor.execute_dataframe(df1, concat=True)[0]
        pd.testing.assert_frame_equal(df1_result, pdf_expected)

        # from structured array of numpy
        df2 = from_records(ndarr)
        df2_result = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_frame_equal(df2_result, pdf_expected)

    def testReadCSVExecution(self):
        """md.read_csv vs pandas: separators, NaNs, datetime index,
        gzip compression, multiple files and wildcard paths."""
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            # small chunk_bytes forces multiple chunks
            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0, chunk_bytes=10),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # test sep
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path, sep=';')

            pdf = pd.read_csv(file_path, sep=';', index_col=0)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, sep=';', index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, sep=';', index_col=0, chunk_bytes=10),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # test missing value
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            df = pd.DataFrame({
                'c1': [np.nan, 'a', 'b', 'c'],
                'c2': [1, 2, 3, np.nan],
                'c3': [np.nan, np.nan, 3.4, 2.2]
            })
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0, chunk_bytes=12),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # datetime index
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                },
                index=index)
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0, chunk_bytes=100),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # test compression
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.gzip')
        try:
            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                },
                index=index)
            df.to_csv(file_path, compression='gzip')

            pdf = pd.read_csv(file_path, compression='gzip', index_col=0)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, compression='gzip', index_col=0),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, compression='gzip', index_col=0,
                            chunk_bytes='1k'),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # test multiply files
        tempdir = tempfile.mkdtemp()
        try:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, 'test{}.csv'.format(i)) for i in range(3)
            ]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            mdf = self.executor.execute_dataframe(
                md.read_csv(file_paths, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf)

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_paths, index_col=0, chunk_bytes=50),
                concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2)
        finally:
            shutil.rmtree(tempdir)

        # test wildcards in path
        tempdir = tempfile.mkdtemp()
        try:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, 'test{}.csv'.format(i)) for i in range(3)
            ]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            # As we can not guarantee the order in which these files are processed,
            # the result may not keep the original order.
            mdf = self.executor.execute_dataframe(
                md.read_csv('{}/*.csv'.format(tempdir), index_col=0),
                concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf.sort_index())

            mdf2 = self.executor.execute_dataframe(
                md.read_csv('{}/*.csv'.format(tempdir), index_col=0,
                            chunk_bytes=50),
                concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2.sort_index())
        finally:
            shutil.rmtree(tempdir)

    @require_cudf
    def testReadCSVGPUExecution(self):
        """GPU-backed read_csv; results are converted back with to_pandas."""
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            df = pd.DataFrame({
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100)
            })
            df.to_csv(file_path, index=False)

            pdf = pd.read_csv(file_path)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, gpu=True), concat=True)[0]
            pd.testing.assert_frame_equal(
                pdf.reset_index(drop=True),
                mdf.to_pandas().reset_index(drop=True))

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, gpu=True, chunk_bytes=200),
                concat=True)[0]
            pd.testing.assert_frame_equal(
                pdf.reset_index(drop=True),
                mdf2.to_pandas().reset_index(drop=True))
        finally:
            shutil.rmtree(tempdir)

    def testReadCSVWithoutIndex(self):
        """read_csv with sort_range_index on a file that stores no index."""
        sess = new_session()

        # test csv file without storing index
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path, index=False)

            pdf = pd.read_csv(file_path)
            mdf = sess.run(md.read_csv(file_path, sort_range_index=True))
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = sess.run(
                md.read_csv(file_path, sort_range_index=True, chunk_bytes=10))
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

    def testReadSQLTableExecution(self):
        """md.read_sql_table against a throwaway sqlite database: plain
        table, index_col/columns selection, Table objects and primary keys."""
        import sqlalchemy as sa

        test_df = pd.DataFrame({
            'a': np.arange(10).astype(np.int64, copy=False),
            'b': ['s%d' % i for i in range(10)],
            'c': np.random.rand(10)
        })
        with tempfile.TemporaryDirectory() as d:
            table_name = 'test'
            table_name2 = 'test2'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')
            test_df.to_sql(table_name, uri, index=False)

            r = md.read_sql_table('test', uri, chunk_size=4)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(result, test_df)

            engine = sa.create_engine(uri)
            m = sa.MetaData()
            try:
                # test index_col and columns
                r = md.read_sql_table('test', engine.connect(), chunk_size=4,
                                      index_col='a', columns=['b'])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = test_df.copy(deep=True)
                expected.set_index('a', inplace=True)
                del expected['c']
                pd.testing.assert_frame_equal(result, expected)

                # do not specify chunk_size
                r = md.read_sql_table('test', engine.connect(),
                                      index_col='a', columns=['b'])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                pd.testing.assert_frame_equal(result, expected)

                table = sa.Table(table_name, m, autoload=True,
                                 autoload_with=engine)
                r = md.read_sql_table(
                    table, engine, chunk_size=4,
                    index_col=[table.columns['a'], table.columns['b']],
                    columns=[table.columns['c']])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = test_df.copy(deep=True)
                expected.set_index(['a', 'b'], inplace=True)
                pd.testing.assert_frame_equal(result, expected)

                # test primary key
                sa.Table(table_name2, m,
                         sa.Column('id', sa.Integer, primary_key=True),
                         sa.Column('a', sa.Integer),
                         sa.Column('b', sa.String),
                         sa.Column('c', sa.Float))
                m.create_all(engine)
                test_df = test_df.copy(deep=True)
                test_df.index.name = 'id'
                test_df.to_sql(table_name2, uri, if_exists='append')

                r = md.read_sql_table(table_name2, engine, chunk_size=4,
                                      index_col='id')
                result = self.executor.execute_dataframe(r, concat=True)[0]
                pd.testing.assert_frame_equal(result, test_df)
            finally:
                engine.dispose()
class TestCumReduction(TestBase):
    """Check the cumulative reduction named by ``self.func_name`` against
    pandas for Series and DataFrame inputs over several chunk sizes, axes
    and NaN-skipping modes."""

    def setUp(self):
        self.executor = ExecutorForTest()

    def compute(self, data, **kwargs):
        # dispatch to the cumulative method under test, e.g. data.cumsum(...)
        return getattr(data, self.func_name)(**kwargs)

    def testSeriesCumReduction(self):
        data = pd.Series(np.random.rand(20),
                         index=[str(i) for i in range(20)], name='a')

        reduction_df1 = self.compute(from_pandas_series(data))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_series(data, chunk_size=6))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_series(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        reduction_df4 = self.compute(from_pandas_series(data, chunk_size=4),
                                     axis='index')
        pd.testing.assert_series_equal(
            self.compute(data, axis='index'),
            self.executor.execute_dataframe(reduction_df4, concat=True)[0])

        data = pd.Series(np.random.rand(20), name='a')
        data[0] = 0.1  # make sure not all elements are NAN
        data[data > 0.5] = np.nan

        reduction_df1 = self.compute(from_pandas_series(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_series(data, chunk_size=3),
                                     skipna=False)
        pd.testing.assert_series_equal(
            self.compute(data, skipna=False),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

    def testDataFrameCumReduction(self):
        data = pd.DataFrame(np.random.rand(20, 10))

        reduction_df1 = self.compute(from_pandas_df(data))
        pd.testing.assert_frame_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3))
        pd.testing.assert_frame_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df4 = self.compute(from_pandas_df(data, chunk_size=3), axis=1)
        pd.testing.assert_frame_equal(
            self.compute(data, axis=1),
            self.executor.execute_dataframe(reduction_df4, concat=True)[0])

        # test null
        np_data = np.random.rand(20, 10)
        np_data[np_data > 0.6] = np.nan
        data = pd.DataFrame(np_data)

        reduction_df1 = self.compute(from_pandas_df(data, chunk_size=3))
        pd.testing.assert_frame_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        # NOTE(review): the original ran this identical skipna=False check
        # twice back to back; the exact duplicate has been removed.
        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3),
                                     skipna=False)
        pd.testing.assert_frame_equal(
            self.compute(data, skipna=False),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        # test numeric_only
        data = pd.DataFrame(np.random.rand(10, 10),
                            index=np.random.randint(-100, 100, size=(10, )),
                            columns=[np.random.bytes(10) for _ in range(10)])

        reduction_df1 = self.compute(from_pandas_df(data, chunk_size=2))
        pd.testing.assert_frame_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_df(data, chunk_size=3),
                                     axis='columns')
        pd.testing.assert_frame_equal(
            self.compute(data, axis='columns'),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])
class Test(unittest.TestCase):
    """Execution tests for DataFrame/Series ``ewm(...).agg(...)``."""

    def setUp(self) -> None:
        super().setUp()
        self.executor = ExecutorForTest()

    def testDataFrameEwmAgg(self):
        np.random.seed(0)

        raw = pd.DataFrame({
            'a': np.random.randint(100, size=(10, )),
            'b': np.random.rand(10),
            'c': np.random.randint(100, size=(10, )),
            'd': ['c' * i for i in np.random.randint(4, size=10)]
        })
        # Use .loc instead of chained indexing (raw.b[0:3] = ...): chained
        # assignment raises SettingWithCopyWarning and may silently fail to
        # write under pandas copy-on-write. Labels are the default RangeIndex,
        # so .loc[0:2] covers the same rows as the positional slice [0:3].
        raw.loc[0:2, 'b'] = np.nan
        raw.loc[5:6, 'b'] = np.nan
        raw.loc[9, 'b'] = np.nan

        df = md.DataFrame(raw, chunk_size=(10, 3))
        r = df.ewm(alpha=0.5).agg(['mean'])
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.5).agg(['mean']))

        df = md.DataFrame(raw, chunk_size=(3, 3))

        aggs = ['mean', 'var', 'std']
        for fun_name in aggs:
            r = df.ewm(alpha=0.3).agg(fun_name)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r, concat=True)[0],
                raw.ewm(alpha=0.3).agg(fun_name))

            r = df.ewm(alpha=0.3, ignore_na=True).agg(fun_name)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r, concat=True)[0],
                raw.ewm(alpha=0.3, ignore_na=True).agg(fun_name))

        r = df.ewm(alpha=0.3).agg(['mean'])
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3).agg(['mean']))

        r = df.ewm(alpha=0.3).agg(aggs)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3).agg(aggs))

        agg_dict = {'c': 'mean'}
        r = df.ewm(alpha=0.3).agg(agg_dict)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3).agg(agg_dict))

        agg_dict = OrderedDict([('a', ['mean', 'var']), ('b', 'var')])
        r = df.ewm(alpha=0.3).agg(agg_dict)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3).agg(agg_dict))

        r = df.ewm(alpha=0.3, min_periods=0).agg(aggs)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3, min_periods=0).agg(aggs))

        r = df.ewm(alpha=0.3, min_periods=2).agg(aggs)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3, min_periods=2).agg(aggs))

        agg_dict = OrderedDict([('a', ['mean', 'var']), ('b', 'var'),
                                ('c', 'mean')])
        r = df.ewm(alpha=0.3, min_periods=2).agg(agg_dict)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3, min_periods=2).agg(agg_dict))

    def testSeriesExpandingAgg(self):
        # NOTE(review): this method exercises ewm, not expanding; the name
        # looks like a copy-paste leftover. Kept to avoid changing the
        # discovered test interface — consider renaming to testSeriesEwmAgg.
        raw = pd.Series(np.random.rand(10), name='a')
        raw[:3] = np.nan
        raw[5:10:2] = np.nan

        series = md.Series(raw, chunk_size=10)

        r = series.ewm(alpha=0.3).agg(['mean'])
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3).agg(['mean']))

        r = series.ewm(alpha=0.3).agg('mean')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3).agg('mean'))

        series = md.Series(raw, chunk_size=3)

        aggs = ['mean', 'var', 'std']
        for fun_name in aggs:
            r = series.ewm(alpha=0.3).agg(fun_name)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r, concat=True)[0],
                raw.ewm(alpha=0.3).agg(fun_name))

            r = series.ewm(alpha=0.3, ignore_na=True).agg(fun_name)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r, concat=True)[0],
                raw.ewm(alpha=0.3, ignore_na=True).agg(fun_name))

        r = series.ewm(alpha=0.3).agg(['mean'])
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3).agg(['mean']))

        r = series.ewm(alpha=0.3).agg(aggs)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3).agg(aggs))

        r = series.ewm(alpha=0.3, min_periods=0).agg(aggs)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3, min_periods=0).agg(aggs))

        r = series.ewm(alpha=0.3, min_periods=2).agg(aggs)
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r, concat=True)[0],
            raw.ewm(alpha=0.3, min_periods=2).agg(aggs))
class TestAggregate(TestBase):
    """Verify DataFrame/Series ``agg`` against pandas for single functions,
    lists of functions and per-column function dicts."""

    def setUp(self):
        self.executor = ExecutorForTest()

    def _run(self, tileable):
        # Execute a mars tileable and hand back the concatenated result.
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    def testDataFrameAggregate(self):
        all_aggs = ['sum', 'prod', 'min', 'max', 'count', 'mean', 'var', 'std']
        data = pd.DataFrame(np.random.rand(20, 20))

        # single chunk
        df = from_pandas_df(data)
        pd.testing.assert_frame_equal(self._run(df.agg(all_aggs)),
                                      data.agg(all_aggs))

        df = from_pandas_df(data, chunk_size=3)

        # will redirect to transform
        pd.testing.assert_frame_equal(self._run(df.agg(['cumsum', 'cummax'])),
                                      data.agg(['cumsum', 'cummax']))

        for func in all_aggs:
            pd.testing.assert_series_equal(self._run(df.agg(func)),
                                           data.agg(func))
            pd.testing.assert_series_equal(self._run(df.agg(func, axis=1)),
                                           data.agg(func, axis=1))

        pd.testing.assert_frame_equal(self._run(df.agg(['sum'])),
                                      data.agg(['sum']))
        pd.testing.assert_frame_equal(self._run(df.agg(all_aggs)),
                                      data.agg(all_aggs))
        pd.testing.assert_frame_equal(self._run(df.agg(all_aggs, axis=1)),
                                      data.agg(all_aggs, axis=1))

        # per-column aggregation spec
        spec = {0: ['sum', 'min', 'var'], 9: ['mean', 'var', 'std']}
        pd.testing.assert_frame_equal(self._run(df.agg(spec)), data.agg(spec))

    def testSeriesAggregate(self):
        all_aggs = ['sum', 'prod', 'min', 'max', 'count', 'mean', 'var', 'std']
        data = pd.Series(np.random.rand(20),
                         index=[str(i) for i in range(20)], name='a')

        # single chunk
        series = from_pandas_series(data)
        pd.testing.assert_series_equal(self._run(series.agg(all_aggs)),
                                       data.agg(all_aggs))

        series = from_pandas_series(data, chunk_size=3)
        for func in all_aggs:
            # scalar results compared approximately (float accumulation order
            # differs across chunks)
            self.assertAlmostEqual(self._run(series.agg(func)),
                                   data.agg(func))

        pd.testing.assert_series_equal(self._run(series.agg(all_aggs)),
                                       data.agg(all_aggs))
class Test(unittest.TestCase):
    """Execution tests for sort_values / sort_index on DataFrame and Series."""

    def setUp(self) -> None:
        super().setUp()
        self.executor = ExecutorForTest('numpy')

    def testSortValuesExecution(self):
        df = pd.DataFrame(np.random.rand(100, 10),
                          columns=['a' + str(i) for i in range(10)])

        # test one chunk
        mdf = DataFrame(df)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                 concat=True)[0]
        expected = df.sort_values('a0')
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(
            ['a6', 'a7'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a6', 'a7'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # test psrs
        mdf = DataFrame(df, chunk_size=10)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                 concat=True)[0]
        expected = df.sort_values('a0')
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(['a3', 'a4']),
                                                 concat=True)[0]
        expected = df.sort_values(['a3', 'a4'])
        pd.testing.assert_frame_equal(result, expected)

        # test ascending=False
        result = self.executor.execute_dataframe(mdf.sort_values(
            ['a0', 'a1'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a0', 'a1'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(
            ['a7'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a7'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # test multiindex
        df2 = df.copy(deep=True)
        df2.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
        mdf = DataFrame(df2, chunk_size=10)
        result = self.executor.execute_dataframe(mdf.sort_values([('A', 'C')]),
                                                 concat=True)[0]
        expected = df2.sort_values([('A', 'C')])
        pd.testing.assert_frame_equal(result, expected)

        # test rechunk
        mdf = DataFrame(df, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                 concat=True)[0]
        expected = df.sort_values('a0')
        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(['a3', 'a4']),
                                                 concat=True)[0]
        expected = df.sort_values(['a3', 'a4'])
        pd.testing.assert_frame_equal(result, expected)

        # test other types (ints, bytes, timestamps, timedeltas)
        raw = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [np.random.bytes(10) for _ in range(10)],
                'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                'f': [pd.Timedelta(f'{i} days') for i in range(10)]
            }, )
        mdf = DataFrame(raw, chunk_size=3)
        for label in raw.columns:
            result = self.executor.execute_dataframe(mdf.sort_values(label),
                                                     concat=True)[0]
            expected = raw.sort_values(label)
            pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(
            ['a', 'b', 'e'], ascending=False), concat=True)[0]
        expected = raw.sort_values(['a', 'b', 'e'], ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # test nan
        df = pd.DataFrame({
            'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
            'col2': [2, 1, 9, np.nan, 7, 4],
            'col3': [0, 1, 9, 4, 2, 3],
        })
        mdf = DataFrame(df)
        result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                                 concat=True)[0]
        expected = df.sort_values(['col2'])
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(df, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                                 concat=True)[0]
        expected = df.sort_values(['col2'])
        pd.testing.assert_frame_equal(result, expected)

        # test ignore_index
        executor = ExecutorForTest(storage=new_session().context)
        df = pd.DataFrame(np.random.rand(10, 3),
                          columns=['a' + str(i) for i in range(3)])
        mdf = DataFrame(df, chunk_size=3)
        result = executor.execute_dataframe(mdf.sort_values(['a0', 'a1'],
                                                            ignore_index=True),
                                            concat=True)[0]
        try:  # for python3.5: pandas there lacks ignore_index=
            expected = df.sort_values(['a0', 'a1'], ignore_index=True)
        except TypeError:
            expected = df.sort_values(['a0', 'a1'])
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test inplace
        mdf = DataFrame(df)
        mdf.sort_values('a0', inplace=True)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        df.sort_values('a0', inplace=True)
        pd.testing.assert_frame_equal(result, df)

        # test unknown shape (filtering first hides chunk sizes)
        df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
        mdf = DataFrame(df, chunk_size=4)
        filtered = mdf[mdf['a'] > 2]
        result = self.executor.execute_dataframe(filtered.sort_values(by='b'),
                                                 concat=True)[0]
        pd.testing.assert_frame_equal(result,
                                      df[df['a'] > 2].sort_values(by='b'))

        # test Series.sort_values
        raw = pd.Series(np.random.rand(10))
        series = Series(raw)
        result = self.executor.execute_dataframe(series.sort_values(),
                                                 concat=True)[0]
        expected = raw.sort_values()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=3)
        result = self.executor.execute_dataframe(series.sort_values(),
                                                 concat=True)[0]
        expected = raw.sort_values()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=2)
        result = self.executor.execute_dataframe(
            series.sort_values(ascending=False), concat=True)[0]
        expected = raw.sort_values(ascending=False)
        pd.testing.assert_series_equal(result, expected)

    def testSortIndexExecution(self):
        raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

        # one chunk
        mdf = DataFrame(raw)
        result = self.executor.execute_dataframe(mdf.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        # inplace
        mdf = DataFrame(raw)
        mdf.sort_index(inplace=True)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        # multi-chunk
        mdf = DataFrame(raw, chunk_size=30)
        result = self.executor.execute_dataframe(mdf.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=20)
        result = self.executor.execute_dataframe(
            mdf.sort_index(ascending=False), concat=True)[0]
        expected = raw.sort_index(ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # ignore_index
        executor = ExecutorForTest(storage=new_session().context)
        mdf = DataFrame(raw, chunk_size=10)
        result = executor.execute_dataframe(mdf.sort_index(ignore_index=True),
                                            concat=True)[0]
        try:  # for python3.5: pandas there lacks ignore_index=
            expected = raw.sort_index(ignore_index=True)
        except TypeError:
            expected = raw.sort_index()
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test axis=1
        raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))
        mdf = DataFrame(raw)
        result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                                 concat=True)[0]
        expected = raw.sort_index(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                                 concat=True)[0]
        expected = raw.sort_index(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=4)
        result = self.executor.execute_dataframe(mdf.sort_index(
            axis=1, ascending=False), concat=True)[0]
        expected = raw.sort_index(axis=1, ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=4)
        executor = ExecutorForTest(storage=new_session().context)
        result = executor.execute_dataframe(mdf.sort_index(axis=1,
                                                           ignore_index=True),
                                            concat=True)[0]
        try:  # for python3.5: pandas there lacks ignore_index=
            expected = raw.sort_index(axis=1, ignore_index=True)
        except TypeError:
            expected = raw.sort_index(axis=1)
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test series
        raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))
        series = Series(raw)
        result = self.executor.execute_dataframe(series.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=2)
        result = self.executor.execute_dataframe(series.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=3)
        result = self.executor.execute_dataframe(
            series.sort_index(ascending=False), concat=True)[0]
        expected = raw.sort_index(ascending=False)
        pd.testing.assert_series_equal(result, expected)

    def testArrowStringSortValues(self):
        # sort by a column backed by the arrow string extension dtype
        rs = np.random.RandomState(0)
        raw = pd.DataFrame({
            'a': rs.rand(10),
            'b': [f's{rs.randint(1000)}' for _ in range(10)]
        })
        raw['b'] = raw['b'].astype(ArrowStringDtype())
        mdf = DataFrame(raw, chunk_size=3)

        df = mdf.sort_values(by='b')
        result = self.executor.execute_dataframe(df, concat=True)[0]
        expected = raw.sort_values(by='b')
        pd.testing.assert_frame_equal(result, expected)
class Test(TestBase):
    """Execution tests for ``to_datetime`` over scalars, tensors, Series,
    DataFrame (year/month/day columns) and Index inputs."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testToDatetimeExecution(self):
        # scalar
        r = to_datetime(1490195805, unit='s')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.to_datetime(1490195805, unit='s')
        self.assertEqual(pd.to_datetime(result.item()), expected)

        # test list like (via a mars tensor)
        raw = ['3/11/2000', '3/12/2000', '3/13/2000']
        t = tensor(raw, chunk_size=2)
        r = to_datetime(t, infer_datetime_format=True)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.to_datetime(raw, infer_datetime_format=True)
        pd.testing.assert_index_equal(result, expected)

        # test series
        raw_series = pd.Series(raw)
        s = Series(raw_series, chunk_size=2)
        r = to_datetime(s)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.to_datetime(raw_series)
        pd.testing.assert_series_equal(result, expected)

        # test DataFrame (column-wise assembly of datetimes)
        raw_df = pd.DataFrame({
            'year': [2015, 2016],
            'month': [2, 3],
            'day': [4, 5]
        })
        df = DataFrame(raw_df, chunk_size=(1, 2))
        r = to_datetime(df)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.to_datetime(raw_df)
        pd.testing.assert_series_equal(result, expected)

        # test Index
        raw_index = pd.Index([1, 2, 3])
        s = Index(raw_index, chunk_size=2)
        r = to_datetime(s)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.to_datetime(raw_index)
        pd.testing.assert_index_equal(result, expected)

        # test errors == 'ignore' (out-of-bounds year is passed through)
        raw = ['13000101']
        r = to_datetime(raw, format='%Y%m%d', errors='ignore')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.to_datetime(raw, format='%Y%m%d', errors='ignore')
        pd.testing.assert_index_equal(result, expected)

        # test unit
        r = to_datetime([1490195805], unit='s')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.to_datetime([1490195805], unit='s')
        pd.testing.assert_index_equal(result, expected)

        # test origin
        r = to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01'))
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = pd.to_datetime([1, 2, 3],
                                  unit='D',
                                  origin=pd.Timestamp('1960-01-01'))
        pd.testing.assert_index_equal(result, expected)
class TestBinary(TestBase):
    """Execution tests for binary arithmetic/logical operators under the
    various chunk-alignment strategies (aligned, one-axis shuffle, full
    shuffle, one-chunk, scalar/series operands).

    NOTE(review): ``self.func``, ``self.func_name`` and ``self.rfunc_name``
    are not defined in this class — they are expected to be injected by a
    parameterizing test harness elsewhere in the project; confirm against
    the module that generates these test classes.
    """

    def setUp(self):
        self.executor = ExecutorForTest()

    def to_boolean_if_needed(self, value, split_value=0.5):
        # Bitwise logical operators are only meaningful on booleans, so
        # threshold random float data for those operators; otherwise pass
        # the value through unchanged.
        if self.func_name in ['__and__', '__or__', '__xor__']:
            return value > split_value
        else:
            return value

    def testWithoutShuffleExecution(self):
        if self.func_name in ['__and__', '__or__', '__xor__']:
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        # all the axes are monotonic
        # data1 with index split into [0...4], [5...9],
        # columns [3...7], [8...12]
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=np.arange(3, 13))
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=5)
        # data2 with index split into [6...11], [2, 5],
        # columns [4...9], [10, 13]
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=np.arange(4, 14))
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=6)

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

    def testWithOneShuffleExecution(self):
        if self.func_name in ['__and__', '__or__', '__xor__']:
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        # only 1 axis is monotonic
        # data1 with index split into [0...4], [5...9],
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=5)
        # data2 with index split into [6...11], [2, 5],
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=6)

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # only 1 axis is monotonic
        # data1 with columns split into [0...4], [5...9],
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
                             columns=np.arange(10))
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=5)
        # data2 with columns split into [6...11], [2, 5],
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2],
                             columns=np.arange(11, 1, -1))
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=6)

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

    def testWithAllShuffleExecution(self):
        if self.func_name in ['__and__', '__or__', '__xor__']:
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        # no axis is monotonic
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=6)

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

    def testBothWithOneChunk(self):
        if self.func_name in ['__and__', '__or__', '__xor__']:
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        # only 1 axis is monotonic
        # data1 with index split into [0...4], [5...9],
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=10)
        # data2 with index split into [6...11], [2, 5],
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=10)

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # only 1 axis is monotonic
        # data1 with columns split into [0...4], [5...9],
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
                             columns=np.arange(10))
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=10)
        # data2 with columns split into [6...11], [2, 5],
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2],
                             columns=np.arange(11, 1, -1))
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=10)

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

    def testWithoutShuffleAndWithOneChunk(self):
        if self.func_name in ['__and__', '__or__', '__xor__']:
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        # only 1 axis is monotonic
        # data1 with index split into [0...4], [5...9],
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=(5, 10))
        # data2 with index split into [6...11], [2, 5],
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=(6, 10))

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # only 1 axis is monotonic
        # data1 with columns split into [0...4], [5...9],
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
                             columns=np.arange(10))
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=(10, 5))
        # data2 with columns split into [6...11], [2, 5],
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2],
                             columns=np.arange(11, 1, -1))
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=(10, 6))

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

    def testWithShuffleAndWithOneChunk(self):
        if self.func_name in ['__and__', '__or__', '__xor__']:
            # pandas fails to compute some expected values due to `na`.
            return

        # only 1 axis is monotonic
        # data1 with index split into [0...4], [5...9],
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        # fix: first section previously skipped to_boolean_if_needed unlike
        # every sibling test; it is an identity transform for the operators
        # that reach this point, so behavior for them is unchanged
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=(10, 5))
        # data2 with index split into [6...11], [2, 5],
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=(10, 6))

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # only 1 axis is monotonic
        # data1 with columns split into [0...4], [5...9],
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
                             columns=np.arange(10))
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=(5, 10))
        # data2 with columns split into [6...11], [2, 5],
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2],
                             columns=np.arange(11, 1, -1))
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=(6, 10))

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

    def testSameIndex(self):
        data = pd.DataFrame(np.random.rand(10, 10),
                            index=np.random.randint(0, 2, size=(10, )),
                            columns=['c' + str(i) for i in range(10)])
        data = self.to_boolean_if_needed(data)
        df = from_pandas(data, chunk_size=3)

        # dataframe with itself (duplicate index values)
        df2 = self.func(df, df)
        expected = self.func(data, data)
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # dataframe with a row-series, aligned on columns
        series = from_pandas_series(data.iloc[0], chunk_size=3)
        df3 = self.func(df, series)
        expected = self.func(data, data.iloc[0])
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # dataframe with a column-series, aligned on index
        series = from_pandas_series(data.iloc[:, 0], chunk_size=3)
        df4 = getattr(df, self.func_name)(series, axis=0)
        expected = getattr(data, self.func_name)(data.iloc[:, 0], axis=0)
        result = self.executor.execute_dataframe(df4, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

    def testChained(self):
        data1 = pd.DataFrame(np.random.rand(10, 10))
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=6)

        df3 = self.func(df1, df2)

        data4 = pd.DataFrame(np.random.rand(10, 10))
        # fix: previously to_boolean_if_needed(data1), which discarded the
        # fresh data4 frame and reused data1 for the second operand
        data4 = self.to_boolean_if_needed(data4)
        df4 = from_pandas(data4, chunk_size=6)

        df5 = self.func(df3, df4)

        result = self.executor.execute_dataframe(df5, concat=True)[0]
        expected = self.func(self.func(data1, data2), data4)
        pd.testing.assert_frame_equal(expected, result)

    def testRfunc(self):
        data1 = pd.DataFrame(np.random.rand(10, 10))
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=6)

        # a.__rop__(b) must equal op(b, a)
        df3 = getattr(df1, self.rfunc_name)(df2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        expected = self.func(data2, data1)
        pd.testing.assert_frame_equal(expected, result)

        data3 = pd.DataFrame(np.random.rand(10, 10))
        data3 = self.to_boolean_if_needed(data3)
        df4 = from_pandas(data3, chunk_size=5)
        df5 = getattr(df4, self.rfunc_name)(1)
        # todo check dtypes when pandas reverts its behavior on broadcasting
        check_dtypes = self.func_name not in ('__and__', '__or__', '__xor__')
        result = self.executor.execute_dataframe(df5, concat=True,
                                                 check_dtypes=check_dtypes)[0]
        expected2 = self.func(1, data3)
        pd.testing.assert_frame_equal(expected2, result)

    def testWithMultiForms(self):
        # test multiple forms
        # such as self+other, self.add(other), add(self,other)
        data1 = pd.DataFrame(np.random.rand(10, 10))
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=6)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(self.func(df1, df2),
                                                 concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)
        result = self.executor.execute_dataframe(self.func(df1, df2),
                                                 concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)
        result = self.executor.execute_dataframe(getattr(df1, self.func_name)(df2),
                                                 concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)
        result = self.executor.execute_dataframe(getattr(df1, self.rfunc_name)(df2),
                                                 concat=True)[0]
        pd.testing.assert_frame_equal(self.func(data2, data1), result)

    def testDataframeAndScalar(self):
        if self.func_name in ['__and__', '__or__', '__xor__']:
            # FIXME bitwise logical operators don't support floating point scalars
            return

        # test dataframe and scalar
        pdf = pd.DataFrame(np.random.rand(10, 10))
        pdf = self.to_boolean_if_needed(pdf)
        df = from_pandas(pdf, chunk_size=2)
        expected = self.func(pdf, 1)
        result = self.executor.execute_dataframe(self.func(df, 1),
                                                 concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)
        result2 = self.executor.execute_dataframe(self.func(df, 1),
                                                  concat=True)[0]
        pd.testing.assert_frame_equal(expected, result2)
        result3 = self.executor.execute_dataframe(getattr(df, self.func_name)(1),
                                                  concat=True)[0]
        pd.testing.assert_frame_equal(expected, result3)

        # test scalar and dataframe
        # NOTE(review): result4 repeats the df-op-scalar form; the
        # scalar-op-df form is actually covered by result5/result6 below
        result4 = self.executor.execute_dataframe(self.func(df, 1),
                                                  concat=True)[0]
        pd.testing.assert_frame_equal(expected, result4)

        expected2 = self.func(1, pdf)
        result5 = self.executor.execute_dataframe(self.func(1, df),
                                                  concat=True)[0]
        pd.testing.assert_frame_equal(expected2, result5)
        result6 = self.executor.execute_dataframe(getattr(df, self.rfunc_name)(1),
                                                  concat=True)[0]
        pd.testing.assert_frame_equal(expected2, result6)

    def testWithShuffleOnStringIndex(self):
        if self.func_name in ['__and__', '__or__', '__xor__']:
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        # no axis is monotonic, and the index values are strings.
        data1 = pd.DataFrame(
            np.random.rand(10, 10),
            index=[str(x) for x in [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]],
            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(
            np.random.rand(10, 10),
            index=[str(x) for x in [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]],
            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=6)

        df3 = self.func(df1, df2)

        expected = self.func(data1, data2)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

    def testDataframeAndSeries(self):
        if self.func_name in ['__and__', '__or__', '__xor__']:
            # pandas fails to compute some expected values due to `na`.
            return

        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        data1 = self.to_boolean_if_needed(data1)
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        data2 = self.to_boolean_if_needed(data2)

        s1 = from_pandas_series(data2[1], chunk_size=(6, ))

        # operate on single-column dataframe and series
        df1 = from_pandas(data1[[1]], chunk_size=(5, 5))
        r1 = getattr(df1, self.func_name)(s1, axis='index')

        expected = getattr(data1[[1]], self.func_name)(data2[1], axis='index')
        result = self.executor.execute_dataframe(r1, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # operate on dataframe and series without shuffle
        df2 = from_pandas(data1, chunk_size=(5, 5))
        r2 = getattr(df2, self.func_name)(s1, axis='index')

        expected = getattr(data1, self.func_name)(data2[1], axis='index')
        result = self.executor.execute_dataframe(r2, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # operate on dataframe and series with shuffle
        df3 = from_pandas(data1, chunk_size=(5, 5))
        r3 = getattr(df3, self.func_name)(s1, axis='columns')

        expected = getattr(data1, self.func_name)(data2[1], axis='columns')
        result = self.executor.execute_dataframe(r3, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # test both one chunk, axis=0
        pdf = pd.DataFrame({
            'ca': [1, 3, 2],
            'cb': [360, 180, 2]
        }, index=[1, 2, 3])
        pdf = self.to_boolean_if_needed(pdf)
        df = from_pandas(pdf)
        series = pd.Series([0, 1, 2], index=[1, 2, 3])
        mars_series = from_pandas_series(series)
        result = self.executor.execute_dataframe(getattr(df, self.func_name)(
            mars_series, axis=0), concat=True)[0]
        expected = getattr(pdf, self.func_name)(series, axis=0)
        pd.testing.assert_frame_equal(expected, result)

        # test different number of chunks, axis=0
        pdf = pd.DataFrame({
            'ca': [1, 3, 2],
            'cb': [360, 180, 2]
        }, index=[1, 2, 3])
        pdf = self.to_boolean_if_needed(pdf)
        df = from_pandas(pdf, chunk_size=1)
        series = pd.Series([0, 1, 2], index=[1, 2, 3])
        mars_series = from_pandas_series(series)
        result = self.executor.execute_dataframe(getattr(df, self.func_name)(
            mars_series, axis=0), concat=True)[0]
        expected = getattr(pdf, self.func_name)(series, axis=0)
        pd.testing.assert_frame_equal(expected, result)

        # test with row shuffle, axis=0
        pdf = pd.DataFrame({
            'ca': [1, 3, 2],
            'cb': [360, 180, 2]
        }, index=[2, 1, 3])
        pdf = self.to_boolean_if_needed(pdf)
        df = from_pandas(pdf, chunk_size=1)
        series = pd.Series([0, 1, 2], index=[3, 1, 2])
        mars_series = from_pandas_series(series)
        result = self.executor.execute_dataframe(getattr(df, self.func_name)(
            mars_series, axis=0), concat=True)[0]
        expected = getattr(pdf, self.func_name)(series, axis=0).reindex([3, 1, 2])
        # modify the order of rows
        result = result.reindex(index=[3, 1, 2])
        pd.testing.assert_frame_equal(expected, result)

        # test both one chunk, axis=1
        pdf = pd.DataFrame({
            1: [1, 3, 2],
            2: [360, 180, 2],
            3: [1, 2, 3]
        }, index=['ra', 'rb', 'rc'])
        pdf = self.to_boolean_if_needed(pdf)
        df = from_pandas(pdf)
        series = pd.Series([0, 1, 2], index=[1, 2, 3])
        mars_series = from_pandas_series(series)
        result = self.executor.execute_dataframe(getattr(df, self.func_name)(
            mars_series, axis=1), concat=True)[0]
        expected = getattr(pdf, self.func_name)(series, axis=1)
        pd.testing.assert_frame_equal(expected, result)

        # test different number of chunks, axis=1
        pdf = pd.DataFrame({
            1: [1, 3, 2],
            2: [360, 180, 2],
            3: [1, 2, 3]
        }, index=['ra', 'rb', 'rc'])
        pdf = self.to_boolean_if_needed(pdf)
        df = from_pandas(pdf, chunk_size=1)
        series = pd.Series([0, 1, 2], index=[1, 2, 3])
        mars_series = from_pandas_series(series)
        result = self.executor.execute_dataframe(getattr(df, self.func_name)(
            mars_series, axis=1), concat=True)[0]
        expected = getattr(pdf, self.func_name)(series, axis=1)
        pd.testing.assert_frame_equal(expected, result)

        # test with row shuffle, axis=1
        pdf = pd.DataFrame({
            1: [1, 3, 2],
            3: [1, 2, 3],
            2: [360, 180, 2]
        }, index=['ra', 'rb', 'rc'])
        pdf = self.to_boolean_if_needed(pdf)
        df = from_pandas(pdf, chunk_size=1)
        series = pd.Series([0, 1, 2], index=[3, 1, 2])
        mars_series = from_pandas_series(series)
        result = self.executor.execute_dataframe(getattr(df, self.func_name)(
            mars_series, axis=1), concat=True)[0]
        expected = getattr(pdf, self.func_name)(series, axis=1)
        # modify the order of columns
        result = result[[1, 2, 3]]
        pd.testing.assert_frame_equal(expected, result)

    def testSeries(self):
        # only one chunk
        s1 = pd.Series(np.arange(10) + 1)
        s1 = self.to_boolean_if_needed(s1)
        s2 = pd.Series(np.arange(10) + 1)
        s2 = self.to_boolean_if_needed(s2)
        r = self.func(from_pandas_series(s1, chunk_size=10),
                      from_pandas_series(s2, chunk_size=10))
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = self.func(s1, s2)
        pd.testing.assert_series_equal(expected, result)

        # same index
        s1 = pd.Series(np.arange(10) + 1)
        s1 = self.to_boolean_if_needed(s1)
        s2 = pd.Series(np.arange(10) + 1)
        s2 = self.to_boolean_if_needed(s2)
        r = self.func(from_pandas_series(s1, chunk_size=4),
                      from_pandas_series(s2, chunk_size=6))
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = self.func(s1, s2)
        pd.testing.assert_series_equal(expected, result)

        # no shuffle
        s1 = pd.Series(np.arange(10) + 1, index=range(10))
        s1 = self.to_boolean_if_needed(s1)
        s2 = pd.Series(np.arange(10) + 1, index=range(10, 0, -1))
        s2 = self.to_boolean_if_needed(s2)
        r = self.func(from_pandas_series(s1, chunk_size=4),
                      from_pandas_series(s2, chunk_size=6))
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = self.func(s1, s2)
        pd.testing.assert_series_equal(expected, result)

        # shuffle
        data = (np.arange(10) + 1).astype(np.int64, copy=False)
        s1 = pd.Series(data, index=np.random.permutation(range(10)))
        s1 = self.to_boolean_if_needed(s1)
        s2 = pd.Series(data, index=np.random.permutation(range(10, 0, -1)))
        s2 = self.to_boolean_if_needed(s2)
        r = self.func(from_pandas_series(s1, chunk_size=4),
                      from_pandas_series(s2, chunk_size=6))
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = self.func(s1, s2)
        pd.testing.assert_series_equal(expected, result)

        if self.func_name in ['__and__', '__or__', '__xor__']:
            # bitwise logical operators don't support floating point scalars
            return

        # operate with scalar
        s1 = pd.Series(np.arange(10) + 1,
                       index=np.random.permutation(range(10)))
        s1 = self.to_boolean_if_needed(s1)
        r = self.func(from_pandas_series(s1, chunk_size=4), 4)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = self.func(s1, 4)
        pd.testing.assert_series_equal(expected, result)

        # reverse with scalar
        s1 = pd.Series(np.arange(10) + 1,
                       index=np.random.permutation(range(10)))
        s1 = self.to_boolean_if_needed(s1)
        r = self.func(4, from_pandas_series(s1, chunk_size=4))
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = self.func(4, s1)
        pd.testing.assert_series_equal(expected, result)

    def testWithPlainValue(self):
        if self.func_name in ['__and__', '__or__', '__xor__']:
            # skip tests for bitwise logical operators on plain value.
            return

        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=6)
        s1 = df1[2]

        r = getattr(df1, self.func_name)([1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                                         axis=0)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = getattr(data1, self.func_name)(
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], axis=0)
        pd.testing.assert_frame_equal(expected, result)

        r = getattr(df1, self.func_name)((1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
                                         axis=0)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = getattr(data1, self.func_name)(
            (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), axis=0)
        pd.testing.assert_frame_equal(expected, result)

        r = getattr(s1, self.func_name)([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = getattr(data1[2], self.func_name)(
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        pd.testing.assert_series_equal(expected, result)

        r = getattr(s1, self.func_name)((1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = getattr(data1[2], self.func_name)(
            (1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
        pd.testing.assert_series_equal(expected, result)

        # specify index, not the default range index
        data1 = pd.DataFrame(np.random.rand(10, 7),
                             index=np.arange(5, 15),
                             columns=[4, 1, 3, 2, 5, 6, 7])
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=6)
        s1 = df1[2]

        r = getattr(df1, self.func_name)(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
                                         axis=0)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = getattr(data1, self.func_name)(np.array(
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), axis=0)
        pd.testing.assert_frame_equal(expected, result)

        r = getattr(df1, self.func_name)(from_array(
            np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])), axis=0)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = getattr(data1, self.func_name)(np.array(
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), axis=0)
        pd.testing.assert_frame_equal(expected, result)

        r = getattr(s1, self.func_name)(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = getattr(data1[2], self.func_name)(np.array(
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
        pd.testing.assert_series_equal(expected, result)

        r = getattr(s1, self.func_name)(from_array(
            np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])))
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = getattr(data1[2], self.func_name)(np.array(
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
        pd.testing.assert_series_equal(expected, result)
def testSortValuesExecution(self):
    """Exercise DataFrame/Series ``sort_values`` against pandas across chunk
    layouts, dtypes, NaN values, ``ignore_index``, ``inplace`` and
    unknown-shape inputs.

    The PSRS distinct-column optimization is toggled through the
    ``PSRS_DISTINCT_COL`` environment variable; the whole suite is repeated
    once per setting (only ``'0'`` on Windows).
    """
    distinct_opts = ['0'] if sys.platform.lower().startswith('win') else [
        '0', '1'
    ]
    # FIX: remember any pre-existing value so the environment variable does
    # not leak into tests that run after this one (previously the last
    # value assigned in the loop stayed set process-wide).
    old_distinct = os.environ.get('PSRS_DISTINCT_COL')
    try:
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct
            df = pd.DataFrame(np.random.rand(100, 10),
                              columns=['a' + str(i) for i in range(10)])

            # test one chunk
            mdf = DataFrame(df)
            result = self.executor.execute_dataframe(
                mdf.sort_values('a0'), concat=True)[0]
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(
                mdf.sort_values(['a6', 'a7'], ascending=False),
                concat=True)[0]
            expected = df.sort_values(['a6', 'a7'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test psrs
            mdf = DataFrame(df, chunk_size=10)
            result = self.executor.execute_dataframe(
                mdf.sort_values('a0'), concat=True)[0]
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(
                mdf.sort_values(['a3', 'a4']), concat=True)[0]
            expected = df.sort_values(['a3', 'a4'])
            pd.testing.assert_frame_equal(result, expected)

            # test ascending=False
            result = self.executor.execute_dataframe(
                mdf.sort_values(['a0', 'a1'], ascending=False),
                concat=True)[0]
            expected = df.sort_values(['a0', 'a1'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(
                mdf.sort_values(['a7'], ascending=False), concat=True)[0]
            expected = df.sort_values(['a7'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test multiindex
            df2 = df.copy(deep=True)
            df2.columns = pd.MultiIndex.from_product(
                [list('AB'), list('CDEFG')])
            mdf = DataFrame(df2, chunk_size=10)
            result = self.executor.execute_dataframe(
                mdf.sort_values([('A', 'C')]), concat=True)[0]
            expected = df2.sort_values([('A', 'C')])
            pd.testing.assert_frame_equal(result, expected)

            # test rechunk
            mdf = DataFrame(df, chunk_size=3)
            result = self.executor.execute_dataframe(
                mdf.sort_values('a0'), concat=True)[0]
            expected = df.sort_values('a0')
            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(
                mdf.sort_values(['a3', 'a4']), concat=True)[0]
            expected = df.sort_values(['a3', 'a4'])
            pd.testing.assert_frame_equal(result, expected)

            # test other types
            raw = pd.DataFrame(
                {
                    'a': np.random.rand(10),
                    'b': np.random.randint(1000, size=10),
                    'c': np.random.rand(10),
                    'd': [np.random.bytes(10) for _ in range(10)],
                    'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                    'f': [pd.Timedelta(f'{i} days') for i in range(10)]
                }, )
            mdf = DataFrame(raw, chunk_size=3)
            for label in raw.columns:
                result = self.executor.execute_dataframe(
                    mdf.sort_values(label), concat=True)[0]
                expected = raw.sort_values(label)
                pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(
                mdf.sort_values(['a', 'b', 'e'], ascending=False),
                concat=True)[0]
            expected = raw.sort_values(['a', 'b', 'e'], ascending=False)
            pd.testing.assert_frame_equal(result, expected)

            # test nan
            df = pd.DataFrame({
                'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
                'col2': [2, 1, 9, np.nan, 7, 4],
                'col3': [0, 1, 9, 4, 2, 3],
            })
            mdf = DataFrame(df)
            result = self.executor.execute_dataframe(
                mdf.sort_values(['col2']), concat=True)[0]
            expected = df.sort_values(['col2'])
            pd.testing.assert_frame_equal(result, expected)

            mdf = DataFrame(df, chunk_size=3)
            result = self.executor.execute_dataframe(
                mdf.sort_values(['col2']), concat=True)[0]
            expected = df.sort_values(['col2'])
            pd.testing.assert_frame_equal(result, expected)

            # test ignore_index (needs a session-backed executor)
            executor = ExecutorForTest(storage=new_session().context)
            df = pd.DataFrame(np.random.rand(10, 3),
                              columns=['a' + str(i) for i in range(3)])
            mdf = DataFrame(df, chunk_size=3)
            result = executor.execute_dataframe(
                mdf.sort_values(['a0', 'a1'], ignore_index=True),
                concat=True)[0]
            try:  # for python3.5
                expected = df.sort_values(['a0', 'a1'], ignore_index=True)
            except TypeError:
                # old pandas without ignore_index: emulate it manually
                expected = df.sort_values(['a0', 'a1'])
                expected.index = pd.RangeIndex(len(expected))
            pd.testing.assert_frame_equal(result, expected)

            # test inplace
            mdf = DataFrame(df)
            mdf.sort_values('a0', inplace=True)
            result = self.executor.execute_dataframe(mdf, concat=True)[0]
            df.sort_values('a0', inplace=True)
            pd.testing.assert_frame_equal(result, df)

            # test unknown shape (filter makes chunk sizes unknown)
            df = pd.DataFrame({
                'a': list(range(10)),
                'b': np.random.random(10)
            })
            mdf = DataFrame(df, chunk_size=4)
            filtered = mdf[mdf['a'] > 2]
            result = self.executor.execute_dataframe(
                filtered.sort_values(by='b'), concat=True)[0]
            pd.testing.assert_frame_equal(
                result, df[df['a'] > 2].sort_values(by='b'))

            # test Series.sort_values
            raw = pd.Series(np.random.rand(10))
            series = Series(raw)
            result = self.executor.execute_dataframe(
                series.sort_values(), concat=True)[0]
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=3)
            result = self.executor.execute_dataframe(
                series.sort_values(), concat=True)[0]
            expected = raw.sort_values()
            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=2)
            result = self.executor.execute_dataframe(
                series.sort_values(ascending=False), concat=True)[0]
            expected = raw.sort_values(ascending=False)
            pd.testing.assert_series_equal(result, expected)
    finally:
        # restore the pre-test environment
        if old_distinct is None:
            os.environ.pop('PSRS_DISTINCT_COL', None)
        else:
            os.environ['PSRS_DISTINCT_COL'] = old_distinct
class TestUnary(TestBase):
    """Element-wise unary operator tests for mars DataFrame/Series."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testAbs(self):
        """``abs()`` — both the method and the builtin — must match pandas."""
        raw = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)))
        mdf = from_pandas(raw, chunk_size=5)
        expected = raw.abs()

        actual = self.executor.execute_dataframe(mdf.abs(), concat=True)[0]
        pd.testing.assert_frame_equal(expected, actual)

        # builtin abs() should route through the same operand
        actual = self.executor.execute_dataframe(abs(mdf), concat=True)[0]
        pd.testing.assert_frame_equal(expected, actual)

    def testNot(self):
        """Bitwise-not (``~``) on a boolean frame must match pandas."""
        raw = pd.DataFrame(
            np.random.uniform(low=-1, high=1, size=(10, 10)) > 0)
        mdf = from_pandas(raw, chunk_size=5)
        actual = self.executor.execute_dataframe(~mdf, concat=True)[0]
        pd.testing.assert_frame_equal(~raw, actual)

    def testUfunc(self):
        """Every mars tensor ufunc, and its numpy counterpart dispatched on
        the mars object, must agree with the numpy ufunc applied to the raw
        pandas DataFrame/Series."""
        df_raw = pd.DataFrame(np.random.uniform(size=(10, 10)),
                              index=pd.RangeIndex(9, -1, -1))
        mdf = from_pandas(df_raw, chunk_size=5)

        s_raw = pd.Series(np.random.uniform(size=10),
                          index=pd.RangeIndex(9, -1, -1))
        ms = from_pandas_series(s_raw, chunk_size=5)

        ufunc_pairs = [
            (np.abs, mt.abs), (np.log, mt.log), (np.log2, mt.log2),
            (np.log10, mt.log10), (np.sin, mt.sin), (np.cos, mt.cos),
            (np.tan, mt.tan), (np.sinh, mt.sinh), (np.cosh, mt.cosh),
            (np.tanh, mt.tanh), (np.arcsin, mt.arcsin),
            (np.arccos, mt.arccos), (np.arctan, mt.arctan),
            (np.arcsinh, mt.arcsinh), (np.arccosh, mt.arccosh),
            (np.arctanh, mt.arctanh), (np.radians, mt.radians),
            (np.degrees, mt.degrees), (np.ceil, mt.ceil),
            (np.floor, mt.floor),
            (partial(np.around, decimals=2), partial(mt.around, decimals=2)),
            (np.exp, mt.exp), (np.exp2, mt.exp2), (np.expm1, mt.expm1),
            (np.sqrt, mt.sqrt),
        ]

        def assert_matches(actual, expected):
            # choose the comparison helper from the pandas result type
            if isinstance(expected, pd.DataFrame):
                pd.testing.assert_frame_equal(actual, expected)
            else:
                pd.testing.assert_series_equal(actual, expected)

        for raw, tiled in ((df_raw, mdf), (s_raw, ms)):
            for np_func, mt_func in ufunc_pairs:
                expected = np_func(raw)

                # mars tensor ufunc applied to the mars object
                executed = self.executor.execute_tensor(
                    mt_func(tiled), concat=True)[0]
                assert_matches(executed, expected)

                # numpy ufunc dispatched on the mars object
                executed = self.executor.execute_tensor(
                    np_func(tiled), concat=True)[0]
                assert_matches(executed, expected)
def testAppendExecution(self):
    """``append`` for DataFrame/Series against pandas: aligned and
    misaligned chunk sizes, list-of-frames, dict rows and ignore_index
    (the latter executed through a session-backed executor)."""
    executor = ExecutorForTest(storage=new_session().context)

    raw1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    raw2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

    # identical chunk sizes
    mdf1 = from_pandas(raw1, chunk_size=3)
    mdf2 = from_pandas(raw2, chunk_size=3)

    appended = mdf1.append(mdf2)
    got = self.executor.execute_dataframe(appended, concat=True)[0]
    pd.testing.assert_frame_equal(raw1.append(raw2), got)

    appended = mdf1.append(mdf2, ignore_index=True)
    got = executor.execute_dataframe(appended, concat=True)[0]
    pd.testing.assert_frame_equal(raw1.append(raw2, ignore_index=True), got)

    # mismatched chunk sizes
    mdf1 = from_pandas(raw1, chunk_size=3)
    mdf2 = from_pandas(raw2, chunk_size=2)

    appended = mdf1.append(mdf2)
    got = self.executor.execute_dataframe(appended, concat=True)[0]
    pd.testing.assert_frame_equal(raw1.append(raw2), got)

    appended = mdf1.append(mdf2, ignore_index=True)
    got = executor.execute_dataframe(appended, concat=True)[0]
    pd.testing.assert_frame_equal(raw1.append(raw2, ignore_index=True), got)

    # appending a list of frames
    raw3 = pd.DataFrame(np.random.rand(8, 4), columns=list('ABCD'))
    mdf3 = from_pandas(raw3, chunk_size=3)
    appended = mdf1.append([mdf2, mdf3])
    got = self.executor.execute_dataframe(appended, concat=True)[0]
    pd.testing.assert_frame_equal(raw1.append([raw2, raw3]), got)

    # appending a dict row
    appended = mdf1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
    got = executor.execute_dataframe(appended, concat=True)[0]
    pd.testing.assert_frame_equal(
        raw1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True), got)

    # test for series
    s_raw1 = pd.Series(np.random.rand(10,))
    s_raw2 = pd.Series(np.random.rand(10,))

    mseries1 = series_from_pandas(s_raw1, chunk_size=3)
    mseries2 = series_from_pandas(s_raw2, chunk_size=3)

    appended = mseries1.append(mseries2)
    got = self.executor.execute_dataframe(appended, concat=True)[0]
    pd.testing.assert_series_equal(s_raw1.append(s_raw2), got)

    appended = mseries1.append(mseries2, ignore_index=True)
    got = executor.execute_dataframe(appended, concat=True)[0]
    pd.testing.assert_series_equal(
        s_raw1.append(s_raw2, ignore_index=True), got)

    # mismatched chunk sizes
    mseries1 = series_from_pandas(s_raw1, chunk_size=3)
    mseries2 = series_from_pandas(s_raw2, chunk_size=2)

    appended = mseries1.append(mseries2)
    got = self.executor.execute_dataframe(appended, concat=True)[0]
    pd.testing.assert_series_equal(s_raw1.append(s_raw2), got)

    appended = mseries1.append(mseries2, ignore_index=True)
    got = executor.execute_dataframe(appended, concat=True)[0]
    pd.testing.assert_series_equal(
        s_raw1.append(s_raw2, ignore_index=True), got)

    # appending a list of series
    s_raw3 = pd.Series(np.random.rand(4,))
    mseries3 = series_from_pandas(s_raw3, chunk_size=2)
    appended = mseries1.append([mseries2, mseries3])
    got = self.executor.execute_dataframe(appended, concat=True)[0]
    pd.testing.assert_series_equal(s_raw1.append([s_raw2, s_raw3]), got)
class Test(TestBase):
    """Execution tests for basic mars DataFrame/Series operations:
    GPU/CPU transfer, rechunk, reset_index, map, describe, fillna,
    apply/transform, the string accessor and the datetime accessor."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    @require_cudf
    def testToGPUExecution(self):
        """``to_gpu`` must yield cudf objects equal to the pandas source."""
        pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1))
        df = from_pandas_df(pdf, chunk_size=(13, 21))
        cdf = to_gpu(df)

        res = self.executor.execute_dataframe(cdf, concat=True)[0]
        self.assertIsInstance(res, cudf.DataFrame)
        pd.testing.assert_frame_equal(res.to_pandas(), pdf)

        pseries = pdf.iloc[:, 0]
        series = from_pandas_series(pseries)
        cseries = series.to_gpu()

        res = self.executor.execute_dataframe(cseries, concat=True)[0]
        self.assertIsInstance(res, cudf.Series)
        pd.testing.assert_series_equal(res.to_pandas(), pseries)

    @require_cudf
    def testToCPUExecution(self):
        """A ``to_gpu``/``to_cpu`` round trip must restore pandas objects."""
        pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1))
        df = from_pandas_df(pdf, chunk_size=(13, 21))
        cdf = to_gpu(df)
        df2 = to_cpu(cdf)

        res = self.executor.execute_dataframe(df2, concat=True)[0]
        self.assertIsInstance(res, pd.DataFrame)
        pd.testing.assert_frame_equal(res, pdf)

        pseries = pdf.iloc[:, 0]
        series = from_pandas_series(pseries, chunk_size=(13, 21))
        cseries = to_gpu(series)
        series2 = to_cpu(cseries)

        res = self.executor.execute_dataframe(series2, concat=True)[0]
        self.assertIsInstance(res, pd.Series)
        pd.testing.assert_series_equal(res, pseries)

    def testRechunkExecution(self):
        """``rechunk`` must never change the data, whatever the layout."""
        data = pd.DataFrame(np.random.rand(8, 10))
        # FIX: dropped a redundant pd.DataFrame(data) copy — `data` is
        # already a DataFrame.
        df = from_pandas_df(data, chunk_size=3)
        df2 = df.rechunk((3, 4))
        res = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_frame_equal(data, res)

        # non-default index and opaque (bytes) column labels
        data = pd.DataFrame(np.random.rand(10, 10),
                            index=np.random.randint(-100, 100, size=(10, )),
                            columns=[np.random.bytes(10) for _ in range(10)])
        df = from_pandas_df(data)
        df2 = df.rechunk(5)
        res = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_frame_equal(data, res)

        # test Series rechunk execution.
        data = pd.Series(np.random.rand(10, ))
        series = from_pandas_series(data)
        series2 = series.rechunk(3)
        res = self.executor.execute_dataframe(series2, concat=True)[0]
        pd.testing.assert_series_equal(data, res)

        series2 = series.rechunk(1)
        res = self.executor.execute_dataframe(series2, concat=True)[0]
        pd.testing.assert_series_equal(data, res)

    def testResetIndexExecution(self):
        """``reset_index`` for DataFrame/Series, including MultiIndex
        levels, col_level/col_fill, drop=True and unknown-shape inputs."""
        data = pd.DataFrame([('bird', 389.0), ('bird', 24.0),
                             ('mammal', 80.5), ('mammal', np.nan)],
                            index=['falcon', 'parrot', 'lion', 'monkey'],
                            columns=('class', 'max_speed'))
        df = from_pandas_df(data)
        df2 = df_reset_index(df)
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index()
        pd.testing.assert_frame_equal(result, expected)

        df = from_pandas_df(data, chunk_size=2)
        df2 = df_reset_index(df)
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index()
        pd.testing.assert_frame_equal(result, expected)

        df = from_pandas_df(data, chunk_size=1)
        df2 = df_reset_index(df, drop=True)
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index(drop=True)
        pd.testing.assert_frame_equal(result, expected)

        index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                           ('bird', 'parrot'),
                                           ('mammal', 'lion'),
                                           ('mammal', 'monkey')],
                                          names=['class', 'name'])
        data = pd.DataFrame([('bird', 389.0), ('bird', 24.0),
                             ('mammal', 80.5), ('mammal', np.nan)],
                            index=index,
                            columns=('type', 'max_speed'))
        df = from_pandas_df(data, chunk_size=1)
        df2 = df_reset_index(df, level='class')
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index(level='class')
        pd.testing.assert_frame_equal(result, expected)

        columns = pd.MultiIndex.from_tuples([('speed', 'max'),
                                             ('species', 'type')])
        df = from_pandas_df(data, chunk_size=2)
        df2 = df_reset_index(df, level='class', col_level=1,
                             col_fill='species')
        # NOTE(review): columns are reassigned only after the mars graph is
        # built — presumably from_pandas_df keeps a reference so execution
        # picks up the new labels; confirm before reordering these lines.
        data.columns = columns
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index(level='class', col_level=1,
                                    col_fill='species')
        pd.testing.assert_frame_equal(result, expected)

        # Test Series
        s = pd.Series([1, 2, 3, 4], name='foo',
                      index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))

        series = from_pandas_series(s)
        s2 = series_reset_index(series, name='bar')
        result = self.executor.execute_dataframe(s2, concat=True)[0]
        expected = s.reset_index(name='bar')
        pd.testing.assert_frame_equal(result, expected)

        series = from_pandas_series(s, chunk_size=2)
        s2 = series_reset_index(series, drop=True)
        result = self.executor.execute_dataframe(s2, concat=True)[0]
        expected = s.reset_index(drop=True)
        pd.testing.assert_series_equal(result, expected)

        # Test Unknown shape
        sess = new_session()
        data1 = pd.DataFrame(np.random.rand(10, 3),
                             index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9])
        df1 = from_pandas_df(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 3),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        df2 = from_pandas_df(data2, chunk_size=6)
        df = (df1 + df2).reset_index()
        result = sess.run(df)
        pd.testing.assert_index_equal(result.index, pd.RangeIndex(12))
        # Inconsistent with Pandas when input dataframe's shape is unknown.
        result = result.sort_values(by=result.columns[0])
        expected = (data1 + data2).reset_index()
        np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy())

        data1 = pd.Series(np.random.rand(10, ),
                          index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9])
        series1 = from_pandas_series(data1, chunk_size=3)
        data2 = pd.Series(np.random.rand(10, ),
                          index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series2 = from_pandas_series(data2, chunk_size=3)
        df = (series1 + series2).reset_index()
        result = sess.run(df)
        pd.testing.assert_index_equal(result.index, pd.RangeIndex(12))
        # Inconsistent with Pandas when input dataframe's shape is unknown.
        result = result.sort_values(by=result.columns[0])
        expected = (data1 + data2).reset_index()
        np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy())

    def testSeriesMapExecution(self):
        """``Series.map`` with dict/callable/Series args, explicit and
        inferred dtypes, and na_action."""
        raw = pd.Series(np.arange(10))
        s = from_pandas_series(raw, chunk_size=7)

        with self.assertRaises(ValueError):
            # cannot infer dtype, the inferred is int,
            # but actually it is float
            # just due to nan
            s.map({5: 10})

        r = s.map({5: 10}, dtype=float)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.map({5: 10})
        pd.testing.assert_series_equal(result, expected)

        r = s.map({i: 10 + i for i in range(7)}, dtype=float)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.map({i: 10 + i for i in range(7)})
        pd.testing.assert_series_equal(result, expected)

        r = s.map({5: 10}, dtype=float, na_action='ignore')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.map({5: 10}, na_action='ignore')
        pd.testing.assert_series_equal(result, expected)

        # dtype can be inferred
        r = s.map({5: 10.})
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.map({5: 10.})
        pd.testing.assert_series_equal(result, expected)

        r = s.map(lambda x: x + 1, dtype=int)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.map(lambda x: x + 1)
        pd.testing.assert_series_equal(result, expected)

        def f(x: int) -> float:
            return x + 1.

        # dtype can be inferred for function
        r = s.map(f)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.map(lambda x: x + 1.)
        pd.testing.assert_series_equal(result, expected)

        # test arg is a md.Series
        raw2 = pd.Series([10], index=[5])
        s2 = from_pandas_series(raw2)

        r = s.map(s2, dtype=float)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.map(raw2)
        pd.testing.assert_series_equal(result, expected)

        # test arg is a md.Series, and dtype can be inferred
        raw2 = pd.Series([10.], index=[5])
        s2 = from_pandas_series(raw2)

        r = s.map(s2)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.map(raw2)
        pd.testing.assert_series_equal(result, expected)

        # test str
        raw = pd.Series(['a', 'b', 'c', 'd'])
        s = from_pandas_series(raw, chunk_size=2)

        r = s.map({'c': 'e'})
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.map({'c': 'e'})
        pd.testing.assert_series_equal(result, expected)

    def testDescribeExecution(self):
        """``describe`` for Series/DataFrame, one chunk and multi chunk,
        with percentiles/include options validated against pandas."""
        s_raw = pd.Series(np.random.rand(10))

        # test one chunk
        series = from_pandas_series(s_raw, chunk_size=10)

        r = series.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe()
        pd.testing.assert_series_equal(result, expected)

        r = series.describe(percentiles=[])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe(percentiles=[])
        pd.testing.assert_series_equal(result, expected)

        # test multi chunks
        series = from_pandas_series(s_raw, chunk_size=3)

        r = series.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe()
        pd.testing.assert_series_equal(result, expected)

        r = series.describe(percentiles=[])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe(percentiles=[])
        pd.testing.assert_series_equal(result, expected)

        df_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('abcd'))
        df_raw['e'] = np.random.randint(100, size=10)

        # test one chunk
        df = from_pandas_df(df_raw, chunk_size=10)

        r = df.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe()
        pd.testing.assert_frame_equal(result, expected)

        # FIX: previously this re-tested `series` (already covered above),
        # leaving the one-chunk DataFrame include= path untested; mirror the
        # multi-chunk case below instead.
        r = df.describe(percentiles=[], include=np.float64)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe(percentiles=[], include=np.float64)
        pd.testing.assert_frame_equal(result, expected)

        # test multi chunks
        df = from_pandas_df(df_raw, chunk_size=3)

        r = df.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe()
        pd.testing.assert_frame_equal(result, expected)

        r = df.describe(percentiles=[], include=np.float64)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe(percentiles=[], include=np.float64)
        pd.testing.assert_frame_equal(result, expected)

        # percentiles must lie within [0, 1]
        with self.assertRaises(ValueError):
            df.describe(percentiles=[1.1])

    def testDataFrameFillNAExecution(self):
        """``DataFrame.fillna`` with scalars, ffill/bfill on both axes, and
        DataFrame/Series fill values, one chunk and chunked."""
        df_raw = pd.DataFrame(np.nan, index=range(0, 20),
                              columns=list('ABCDEFGHIJ'))
        for _ in range(20):
            df_raw.iloc[random.randint(0, 19),
                        random.randint(0, 9)] = random.randint(0, 99)

        value_df_raw = pd.DataFrame(
            np.random.randint(0, 100, (10, 7)).astype(np.float32),
            columns=list('ABCDEFG'))

        # test DataFrame single chunk with numeric fill
        df = from_pandas_df(df_raw)
        r = df.fillna(1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.fillna(1)
        pd.testing.assert_frame_equal(result, expected)

        # test DataFrame single chunk with value as single chunk
        df = from_pandas_df(df_raw)
        value_df = from_pandas_df(value_df_raw)
        r = df.fillna(value_df)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.fillna(value_df_raw)
        pd.testing.assert_frame_equal(result, expected)

        # test chunked with numeric fill
        df = from_pandas_df(df_raw, chunk_size=3)
        r = df.fillna(1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.fillna(1)
        pd.testing.assert_frame_equal(result, expected)

        # test inplace tile
        df = from_pandas_df(df_raw, chunk_size=3)
        df.fillna(1, inplace=True)
        result = self.executor.execute_dataframe(df, concat=True)[0]
        expected = df_raw.fillna(1)
        pd.testing.assert_frame_equal(result, expected)

        # test forward fill in axis=0 without limit
        df = from_pandas_df(df_raw, chunk_size=3)
        r = df.fillna(method='pad')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.fillna(method='pad')
        pd.testing.assert_frame_equal(result, expected)

        # test backward fill in axis=0 without limit
        df = from_pandas_df(df_raw, chunk_size=3)
        r = df.fillna(method='backfill')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.fillna(method='backfill')
        pd.testing.assert_frame_equal(result, expected)

        # test forward fill in axis=1 without limit
        df = from_pandas_df(df_raw, chunk_size=3)
        r = df.ffill(axis=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.ffill(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        # test backward fill in axis=1 without limit
        df = from_pandas_df(df_raw, chunk_size=3)
        r = df.bfill(axis=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.bfill(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        # test fill with dataframe
        df = from_pandas_df(df_raw, chunk_size=3)
        value_df = from_pandas_df(value_df_raw, chunk_size=4)
        r = df.fillna(value_df)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.fillna(value_df_raw)
        pd.testing.assert_frame_equal(result, expected)

        # test fill with series
        value_series_raw = pd.Series(
            np.random.randint(0, 100, (10, )).astype(np.float32),
            index=list('ABCDEFGHIJ'))
        df = from_pandas_df(df_raw, chunk_size=3)
        value_series = from_pandas_series(value_series_raw, chunk_size=4)
        r = df.fillna(value_series)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.fillna(value_series_raw)
        pd.testing.assert_frame_equal(result, expected)

    def testSeriesFillNAExecution(self):
        """``Series.fillna`` with scalars, pad/backfill, inplace and a
        Series fill value, one chunk and chunked."""
        series_raw = pd.Series(np.nan, index=range(20))
        for _ in range(3):
            series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)

        value_series_raw = pd.Series(
            np.random.randint(0, 100, (10, )).astype(np.float32))

        series = from_pandas_series(series_raw)
        r = series.fillna(1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = series_raw.fillna(1)
        pd.testing.assert_series_equal(result, expected)

        # test DataFrame single chunk with value as single chunk
        series = from_pandas_series(series_raw)
        value_series = from_pandas_series(value_series_raw)
        r = series.fillna(value_series)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = series_raw.fillna(value_series_raw)
        pd.testing.assert_series_equal(result, expected)

        # test chunked with numeric fill
        series = from_pandas_series(series_raw, chunk_size=3)
        r = series.fillna(1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = series_raw.fillna(1)
        pd.testing.assert_series_equal(result, expected)

        # test inplace tile
        series = from_pandas_series(series_raw, chunk_size=3)
        series.fillna(1, inplace=True)
        result = self.executor.execute_dataframe(series, concat=True)[0]
        expected = series_raw.fillna(1)
        pd.testing.assert_series_equal(result, expected)

        # test forward fill in axis=0 without limit
        series = from_pandas_series(series_raw, chunk_size=3)
        r = series.fillna(method='pad')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = series_raw.fillna(method='pad')
        pd.testing.assert_series_equal(result, expected)

        # test backward fill in axis=0 without limit
        series = from_pandas_series(series_raw, chunk_size=3)
        r = series.fillna(method='backfill')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = series_raw.fillna(method='backfill')
        pd.testing.assert_series_equal(result, expected)

        # test fill with series
        series = from_pandas_series(series_raw, chunk_size=3)
        value_df = from_pandas_series(value_series_raw, chunk_size=4)
        r = series.fillna(value_df)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = series_raw.fillna(value_series_raw)
        pd.testing.assert_series_equal(result, expected)

    def testDataFrameApplyExecute(self):
        """``DataFrame.apply``/``transform`` with string funcs, ufuncs,
        lambdas and all result_type variants; chunk_store_limit is lowered
        to force splitting and restored afterwards."""
        cols = [chr(ord('A') + i) for i in range(10)]
        df_raw = pd.DataFrame(
            dict((c, [i**2 for i in range(20)]) for c in cols))

        old_chunk_store_limit = options.chunk_store_limit
        try:
            options.chunk_store_limit = 20

            df = from_pandas_df(df_raw, chunk_size=5)

            r = df.apply('ffill')
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.apply('ffill')
            pd.testing.assert_frame_equal(result, expected)

            r = df.apply(np.sqrt)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.apply(np.sqrt)
            pd.testing.assert_frame_equal(result, expected)

            r = df.apply(lambda x: pd.Series([1, 2]))
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.apply(lambda x: pd.Series([1, 2]))
            pd.testing.assert_frame_equal(result, expected)

            r = df.apply(np.sum, axis='index')
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.apply(np.sum, axis='index')
            pd.testing.assert_series_equal(result, expected)

            r = df.apply(np.sum, axis='columns')
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.apply(np.sum, axis='columns')
            pd.testing.assert_series_equal(result, expected)

            r = df.apply(lambda x: [1, 2], axis=1)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.apply(lambda x: [1, 2], axis=1)
            pd.testing.assert_series_equal(result, expected)

            r = df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']),
                         axis=1)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.apply(
                lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
            pd.testing.assert_frame_equal(result, expected)

            r = df.apply(lambda x: [1, 2], axis=1, result_type='expand')
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.apply(lambda x: [1, 2], axis=1,
                                    result_type='expand')
            pd.testing.assert_frame_equal(result, expected)

            r = df.apply(lambda x: list(range(10)), axis=1,
                         result_type='reduce')
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.apply(lambda x: list(range(10)), axis=1,
                                    result_type='reduce')
            pd.testing.assert_series_equal(result, expected)

            r = df.apply(lambda x: list(range(10)), axis=1,
                         result_type='broadcast')
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.apply(lambda x: list(range(10)), axis=1,
                                    result_type='broadcast')
            pd.testing.assert_frame_equal(result, expected)

            r = df.transform(lambda x: list(range(len(x))))
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.transform(lambda x: list(range(len(x))))
            pd.testing.assert_frame_equal(result, expected)

            r = df.transform(lambda x: list(range(len(x))), axis=1)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = df_raw.transform(lambda x: list(range(len(x))),
                                        axis=1)
            pd.testing.assert_frame_equal(result, expected)
        finally:
            options.chunk_store_limit = old_chunk_store_limit

    def testSeriesApplyExecute(self):
        """``Series.apply``/``transform`` with string funcs, ufuncs and
        lambdas (incl. convert_dtype=False)."""
        idxes = [chr(ord('A') + i) for i in range(20)]
        s_raw = pd.Series([i**2 for i in range(20)], index=idxes)

        series = from_pandas_series(s_raw, chunk_size=5)

        r = series.apply('add', args=(1, ))
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.apply('add', args=(1, ))
        pd.testing.assert_series_equal(result, expected)

        r = series.apply(np.sqrt)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.apply(np.sqrt)
        pd.testing.assert_series_equal(result, expected)

        r = series.apply('sqrt')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.apply('sqrt')
        pd.testing.assert_series_equal(result, expected)

        r = series.apply(lambda x: [x, x + 1], convert_dtype=False)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.apply(lambda x: [x, x + 1], convert_dtype=False)
        pd.testing.assert_series_equal(result, expected)

        r = series.transform(lambda x: x + 1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.transform(lambda x: x + 1)
        pd.testing.assert_series_equal(result, expected)

    def testStringMethodExecution(self):
        """The ``.str`` accessor: getitem, split/rsplit, cat, extractall
        and extract."""
        s = pd.Series(['s1,s2', 'ef,', 'dd', np.nan])
        s2 = pd.concat([s, s, s])

        series = from_pandas_series(s, chunk_size=2)
        series2 = from_pandas_series(s2, chunk_size=2)

        # test getitem
        r = series.str[:3]
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.str[:3]
        pd.testing.assert_series_equal(result, expected)

        # test split, expand=False
        r = series.str.split(',', n=2)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.str.split(',', n=2)
        pd.testing.assert_series_equal(result, expected)

        # test split, expand=True
        r = series.str.split(',', expand=True, n=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.str.split(',', expand=True, n=1)
        pd.testing.assert_frame_equal(result, expected)

        # test rsplit
        r = series.str.rsplit(',', expand=True, n=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.str.rsplit(',', expand=True, n=1)
        pd.testing.assert_frame_equal(result, expected)

        # test cat all data (reduces to a single scalar string)
        r = series2.str.cat(sep='/', na_rep='e')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s2.str.cat(sep='/', na_rep='e')
        self.assertEqual(result, expected)

        # test cat list
        r = series.str.cat(['a', 'b', np.nan, 'c'])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.str.cat(['a', 'b', np.nan, 'c'])
        pd.testing.assert_series_equal(result, expected)

        # test cat series
        r = series.str.cat(series.str.capitalize(), join='outer')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.str.cat(s.str.capitalize(), join='outer')
        pd.testing.assert_series_equal(result, expected)

        # test extractall
        r = series.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
        pd.testing.assert_frame_equal(result, expected)

        # test extract, expand=False
        r = series.str.extract(r'[ab](\d)', expand=False)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.str.extract(r'[ab](\d)', expand=False)
        pd.testing.assert_series_equal(result, expected)

        # test extract, expand=True
        r = series.str.extract(r'[ab](\d)', expand=True)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.str.extract(r'[ab](\d)', expand=True)
        pd.testing.assert_frame_equal(result, expected)

    def testDatetimeMethodExecution(self):
        """The ``.dt`` accessor for datetime and timedelta series."""
        # test datetime
        s = pd.Series(
            [pd.Timestamp('2020-1-1'), pd.Timestamp('2020-2-1'), np.nan])
        series = from_pandas_series(s, chunk_size=2)

        r = series.dt.year
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.dt.year
        pd.testing.assert_series_equal(result, expected)

        r = series.dt.strftime('%m-%d-%Y')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.dt.strftime('%m-%d-%Y')
        pd.testing.assert_series_equal(result, expected)

        # test timedelta
        s = pd.Series(
            [pd.Timedelta('1 days'), pd.Timedelta('3 days'), np.nan])
        series = from_pandas_series(s, chunk_size=2)

        r = series.dt.days
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s.dt.days
        pd.testing.assert_series_equal(result, expected)