class Test(TestBase):
    """Execution tests for DataFrames/Series built from pandas objects and tensors."""

    def setUp(self):
        super(Test, self).setUp()
        self.executor = Executor()

    def _run_dataframe(self, obj):
        """Execute ``obj`` with ``concat=True`` and return the first result."""
        return self.executor.execute_dataframe(obj, concat=True)[0]

    def testFromPandasDataFrameExecution(self):
        """A frame chunked by ``from_pandas_df`` executes back to the input frame."""
        raw = pd.DataFrame(np.random.rand(20, 30),
                           index=[np.arange(20), np.arange(20, 0, -1)])
        chunked = from_pandas_df(raw, chunk_size=(13, 21))
        pd.testing.assert_frame_equal(raw, self._run_dataframe(chunked))

    def testFromPandasSeriesExecution(self):
        """A series chunked by ``from_pandas_series`` executes back to the input."""
        raw = pd.Series(np.random.rand(20),
                        index=[np.arange(20), np.arange(20, 0, -1)],
                        name='a')
        chunked = from_pandas_series(raw, chunk_size=13)
        pd.testing.assert_series_equal(raw, self._run_dataframe(chunked))

    def testFromTensorExecution(self):
        """Frames created by ``from_tensor`` carry the tensor data and labels."""
        # no labels given: both axes are expected to be RangeIndex(0, 10)
        source = mt.random.rand(10, 10, chunk_size=5)
        frame = from_tensor(source)
        source_values = self.executor.execute_tensor(source, concat=True)[0]
        wanted = pd.DataFrame(source_values)
        got = self._run_dataframe(frame)
        default_labels = pd.RangeIndex(0, 10)
        pd.testing.assert_index_equal(got.index, default_labels)
        pd.testing.assert_index_equal(got.columns, default_labels)
        pd.testing.assert_frame_equal(got, wanted)

        # test converted with specified index_value and columns
        labelled = from_tensor(mt.random.rand(2, 2, chunk_size=1),
                               index=pd.Index(['a', 'b']),
                               columns=pd.Index([3, 4]))
        got = self._run_dataframe(labelled)
        pd.testing.assert_index_equal(got.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(got.columns, pd.Index([3, 4]))

        # test converted from 1-d tensor
        flat_frame = from_tensor(mt.array([1, 2, 3]))
        flat_result = self._run_dataframe(flat_frame)
        wanted = pd.DataFrame(np.array([1, 2, 3]))
        pd.testing.assert_frame_equal(wanted, flat_result)
class Test(TestBase):
    """Execution tests for moving DataFrames/Series with ``to_gpu`` and ``to_cpu``."""

    def setUp(self):
        super(Test, self).setUp()
        self.executor = Executor()

    def _run_dataframe(self, obj):
        """Execute ``obj`` with ``concat=True`` and return the first result."""
        return self.executor.execute_dataframe(obj, concat=True)[0]

    @require_cudf
    def testToGPUExecution(self):
        """``to_gpu`` results are cudf objects holding the original values."""
        host_frame = pd.DataFrame(np.random.rand(20, 30),
                                  index=np.arange(20, 0, -1))
        gpu_frame = to_gpu(from_pandas_df(host_frame, chunk_size=(13, 21)))

        fetched = self._run_dataframe(gpu_frame)
        self.assertIsInstance(fetched, cudf.DataFrame)
        pd.testing.assert_frame_equal(fetched.to_pandas(), host_frame)

        # same check for a series, this time through the ``to_gpu`` method
        host_series = host_frame.iloc[:, 0]
        gpu_series = from_pandas_series(host_series).to_gpu()

        fetched = self._run_dataframe(gpu_series)
        self.assertIsInstance(fetched, cudf.Series)
        pd.testing.assert_series_equal(fetched.to_pandas(), host_series)

    @require_cudf
    def testToCPUExecution(self):
        """A ``to_gpu`` → ``to_cpu`` round trip yields pandas objects again."""
        host_frame = pd.DataFrame(np.random.rand(20, 30),
                                  index=np.arange(20, 0, -1))
        on_gpu = to_gpu(from_pandas_df(host_frame, chunk_size=(13, 21)))
        back_on_cpu = to_cpu(on_gpu)

        fetched = self._run_dataframe(back_on_cpu)
        self.assertIsInstance(fetched, pd.DataFrame)
        pd.testing.assert_frame_equal(fetched, host_frame)

        host_series = host_frame.iloc[:, 0]
        series_on_gpu = to_gpu(from_pandas_series(host_series, chunk_size=(13, 21)))
        series_back = to_cpu(series_on_gpu)

        fetched = self._run_dataframe(series_back)
        self.assertIsInstance(fetched, pd.Series)
        pd.testing.assert_series_equal(fetched, host_series)
class TestUnary(TestBase):
    """Execution tests for unary DataFrame operations."""

    def setUp(self):
        super(TestUnary, self).setUp()
        self.executor = Executor()

    def testAbs(self):
        """``abs`` on a chunked frame matches ``pandas.DataFrame.abs``."""
        # values drawn from [-1, 1) so that both signs are present
        raw = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)))
        chunked = from_pandas(raw, chunk_size=5)

        executed = self.executor.execute_dataframe(abs(chunked), concat=True)[0]
        wanted = raw.abs()

        pd.testing.assert_frame_equal(wanted, executed)
class Test(TestBase):
    """Execution test for a DataFrame created with ``from_pandas``."""

    def setUp(self):
        super(Test, self).setUp()
        self.executor = Executor()

    def testPandasExecution(self):
        """A frame chunked by ``from_pandas`` executes back to the input frame."""
        row_labels = [np.arange(20), np.arange(20, 0, -1)]
        raw = pd.DataFrame(np.random.rand(20, 30), index=row_labels)

        chunked = from_pandas(raw, chunk_size=(13, 21))
        executed = self.executor.execute_dataframe(chunked, concat=True)[0]

        pd.testing.assert_frame_equal(raw, executed)
class Test(TestBase):
    """Execution tests for ``from_pandas_df`` and ``from_pandas_series``."""

    def setUp(self):
        super(Test, self).setUp()
        self.executor = Executor()

    def _fetch(self, obj):
        """Execute ``obj`` with ``concat=True`` and return the first result."""
        return self.executor.execute_dataframe(obj, concat=True)[0]

    def testFromPandasDataFrameExecution(self):
        """Chunking a frame and executing it returns the original frame."""
        labels = [np.arange(20), np.arange(20, 0, -1)]
        original = pd.DataFrame(np.random.rand(20, 30), index=labels)

        executed = self._fetch(from_pandas_df(original, chunk_size=(13, 21)))

        pd.testing.assert_frame_equal(original, executed)

    def testFromPandasSeriesExecution(self):
        """Chunking a series and executing it returns the original series."""
        labels = [np.arange(20), np.arange(20, 0, -1)]
        original = pd.Series(np.random.rand(20), index=labels, name='a')

        executed = self._fetch(from_pandas_series(original, chunk_size=13))

        pd.testing.assert_series_equal(original, executed)
class Test(TestBase):
    """Execution tests for the DataFrame/Series data sources.

    Covers creation from pandas objects, from the ``md.DataFrame`` /
    ``md.Series`` initializers, from tensors and from structured arrays.
    """

    def setUp(self):
        super(Test, self).setUp()
        self.executor = Executor()

    def _run_dataframe(self, obj):
        """Execute a DataFrame/Series with ``concat=True``; return the first result."""
        return self.executor.execute_dataframe(obj, concat=True)[0]

    def _run_tensor(self, tensor):
        """Execute a tensor with ``concat=True``; return the first result."""
        return self.executor.execute_tensor(tensor, concat=True)[0]

    def testFromPandasDataFrameExecution(self):
        """A frame chunked by ``from_pandas_df`` executes back to the input frame."""
        raw = pd.DataFrame(np.random.rand(20, 30),
                           index=[np.arange(20), np.arange(20, 0, -1)])
        chunked = from_pandas_df(raw, chunk_size=(13, 21))
        pd.testing.assert_frame_equal(raw, self._run_dataframe(chunked))

    def testFromPandasSeriesExecution(self):
        """A series chunked by ``from_pandas_series`` executes back to the input."""
        raw = pd.Series(np.random.rand(20),
                        index=[np.arange(20), np.arange(20, 0, -1)],
                        name='a')
        chunked = from_pandas_series(raw, chunk_size=13)
        pd.testing.assert_series_equal(raw, self._run_dataframe(chunked))

    def testInitializerExecution(self):
        """``md.DataFrame`` / ``md.Series`` built from pandas data execute unchanged."""
        raw_frame = pd.DataFrame(np.random.rand(20, 30),
                                 index=[np.arange(20), np.arange(20, 0, -1)])
        frame = md.DataFrame(raw_frame, chunk_size=(15, 10))
        pd.testing.assert_frame_equal(raw_frame, self._run_dataframe(frame))

        raw_series = pd.Series(np.random.rand(20),
                               index=[np.arange(20), np.arange(20, 0, -1)],
                               name='a')
        series = md.Series(raw_series, chunk_size=7)
        pd.testing.assert_series_equal(raw_series, self._run_dataframe(series))

    def testSeriesFromTensor(self):
        """``md.Series`` accepts tensors, with and without a name or chunking."""
        values = np.random.rand(10)

        named = md.Series(mt.tensor(values), name='a')
        pd.testing.assert_series_equal(named.execute(),
                                       pd.Series(values, name='a'))

        chunked = md.Series(mt.tensor(values, chunk_size=3))
        pd.testing.assert_series_equal(chunked.execute(), pd.Series(values))

        all_ones = md.Series(mt.ones((10,), chunk_size=4))
        pd.testing.assert_series_equal(all_ones.execute(),
                                       pd.Series(np.ones(10,)))

    def testFromTensorExecution(self):
        """Frames from ``dataframe_from_tensor`` carry the tensor data and labels."""
        # no labels given: both axes are expected to be RangeIndex(0, 10)
        source = mt.random.rand(10, 10, chunk_size=5)
        frame = dataframe_from_tensor(source)
        wanted = pd.DataFrame(self._run_tensor(source))
        got = self._run_dataframe(frame)
        default_labels = pd.RangeIndex(0, 10)
        pd.testing.assert_index_equal(got.index, default_labels)
        pd.testing.assert_index_equal(got.columns, default_labels)
        pd.testing.assert_frame_equal(got, wanted)

        # test converted with specified index_value and columns
        labelled = dataframe_from_tensor(mt.random.rand(2, 2, chunk_size=1),
                                         index=pd.Index(['a', 'b']),
                                         columns=pd.Index([3, 4]))
        got = self._run_dataframe(labelled)
        pd.testing.assert_index_equal(got.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(got.columns, pd.Index([3, 4]))

        # test converted from 1-d tensor
        flat_frame = dataframe_from_tensor(mt.array([1, 2, 3]))
        flat_result = self._run_dataframe(flat_frame)
        wanted = pd.DataFrame(np.array([1, 2, 3]))
        pd.testing.assert_frame_equal(wanted, flat_result)

        # test converted from identical chunks
        ones = mt.ones((10, 10), chunk_size=3)
        got = self._run_dataframe(dataframe_from_tensor(ones))
        wanted = pd.DataFrame(self._run_tensor(ones))
        pd.testing.assert_frame_equal(wanted, got)

        # from tensor with given index
        ones = mt.ones((10, 10), chunk_size=3)
        got = self._run_dataframe(
            dataframe_from_tensor(ones, index=np.arange(0, 20, 2)))
        wanted = pd.DataFrame(self._run_tensor(ones),
                              index=np.arange(0, 20, 2))
        pd.testing.assert_frame_equal(wanted, got)

        # from tensor with given columns
        ones = mt.ones((10, 10), chunk_size=3)
        got = self._run_dataframe(
            dataframe_from_tensor(ones, columns=list('abcdefghij')))
        wanted = pd.DataFrame(self._run_tensor(ones),
                              columns=list('abcdefghij'))
        pd.testing.assert_frame_equal(wanted, got)

    def testFromRecordsExecution(self):
        """``from_records`` matches ``pd.DataFrame.from_records`` for both inputs."""
        record_type = np.dtype([('x', 'int'), ('y', 'double'), ('z', '<U16')])
        records = np.ones((10,), dtype=record_type)
        wanted = pd.DataFrame.from_records(records, index=pd.RangeIndex(10))

        # from structured array of mars
        mars_records = mt.ones((10,), dtype=record_type, chunk_size=3)
        from_mars = self._run_dataframe(from_records(mars_records))
        pd.testing.assert_frame_equal(from_mars, wanted)

        # from structured array of numpy
        from_numpy = self._run_dataframe(from_records(records))
        pd.testing.assert_frame_equal(from_numpy, wanted)
class TestBinary(TestBase):
    """Shared execution tests for one binary DataFrame/Series operator.

    The tests read ``self.func`` (called with two operands, on both Mars and
    pandas objects) and ``self.func_name`` (a method name resolved with
    ``getattr``). Neither is defined in this class, so they are presumably
    supplied by concrete subclasses -- TODO confirm against the subclasses.
    """

    # Non-monotonic label sets reused wherever an axis must not be sorted.
    _UNSORTED_LEFT = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
    _UNSORTED_RIGHT = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]

    def setUp(self):
        # Run the base fixture first, as the other test classes in this file do.
        super(TestBinary, self).setUp()
        self.executor = Executor()

    @property
    def rfunc_name(self):
        """Name of the reflected method, i.e. ``'r' + self.func_name``."""
        return 'r' + self.func_name

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    def _execute(self, obj):
        """Execute ``obj`` with ``concat=True`` and return the first result."""
        return self.executor.execute_dataframe(obj, concat=True)[0]

    @staticmethod
    def _random_frame(index=None, columns=None):
        """Return a 10x10 frame of uniform random values with the given labels."""
        return pd.DataFrame(np.random.rand(10, 10), index=index, columns=columns)

    def _monotonic_rows_pair(self):
        """Two frames with monotonic row labels and non-monotonic column labels."""
        left = self._random_frame(index=np.arange(10),
                                  columns=self._UNSORTED_LEFT)
        right = self._random_frame(index=np.arange(11, 1, -1),
                                   columns=self._UNSORTED_RIGHT)
        return left, right

    def _monotonic_columns_pair(self):
        """Two frames with non-monotonic row labels and monotonic column labels."""
        left = self._random_frame(index=self._UNSORTED_LEFT,
                                  columns=np.arange(10))
        right = self._random_frame(index=self._UNSORTED_RIGHT,
                                   columns=np.arange(11, 1, -1))
        return left, right

    def _assert_frame_op(self, left, right, left_chunks, right_chunks):
        """Check ``func`` on chunked copies of two frames against pandas."""
        mars_result = self.func(from_pandas(left, chunk_size=left_chunks),
                                from_pandas(right, chunk_size=right_chunks))
        expected = self.func(left, right)
        pd.testing.assert_frame_equal(expected, self._execute(mars_result))

    def _assert_series_op(self, left, right, left_chunks, right_chunks):
        """Check ``func`` on chunked copies of two series against pandas."""
        mars_result = self.func(from_pandas_series(left, chunk_size=left_chunks),
                                from_pandas_series(right, chunk_size=right_chunks))
        expected = self.func(left, right)
        pd.testing.assert_series_equal(expected, self._execute(mars_result))

    def _frame_series_op(self, pdf, series, axis, **from_pandas_kwargs):
        """Return ``(expected, result)`` for ``frame.<func_name>(series, axis=axis)``.

        ``from_pandas_kwargs`` is forwarded to ``from_pandas`` so callers can
        either set ``chunk_size`` or leave it at that function's default.
        """
        mars_frame = from_pandas(pdf, **from_pandas_kwargs)
        mars_series = from_pandas_series(series)
        mars_result = getattr(mars_frame, self.func_name)(mars_series, axis=axis)
        result = self._execute(mars_result)
        expected = getattr(pdf, self.func_name)(series, axis=axis)
        return expected, result

    # ------------------------------------------------------------------
    # frame <op> frame
    # ------------------------------------------------------------------

    def testWithoutShuffleExecution(self):
        # all the axes are monotonic
        # data1: rows 0..9 and columns 3..12, chunked by 5
        data1 = self._random_frame(index=np.arange(10),
                                   columns=np.arange(3, 13))
        # data2: rows 11..2 (descending) and columns 4..13, chunked by 6
        data2 = self._random_frame(index=np.arange(11, 1, -1),
                                   columns=np.arange(4, 14))
        self._assert_frame_op(data1, data2, 5, 6)

    def testWithOneShuffleExecution(self):
        # only the row axis is monotonic
        data1, data2 = self._monotonic_rows_pair()
        self._assert_frame_op(data1, data2, 5, 6)

        # only the column axis is monotonic
        data1, data2 = self._monotonic_columns_pair()
        self._assert_frame_op(data1, data2, 5, 6)

    def testWithAllShuffleExecution(self):
        # no axis is monotonic
        data1 = self._random_frame(index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                                   columns=self._UNSORTED_LEFT)
        data2 = self._random_frame(index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                                   columns=self._UNSORTED_RIGHT)
        self._assert_frame_op(data1, data2, 5, 6)

    def testBothWithOneChunk(self):
        # chunk_size equals the frame size, so each operand is one chunk

        # only the row axis is monotonic
        data1, data2 = self._monotonic_rows_pair()
        self._assert_frame_op(data1, data2, 10, 10)

        # only the column axis is monotonic
        data1, data2 = self._monotonic_columns_pair()
        self._assert_frame_op(data1, data2, 10, 10)

    def testWithoutShuffleAndWithOneChunk(self):
        # split only along the monotonic axis; the other axis is one chunk

        # monotonic rows, split by rows
        data1, data2 = self._monotonic_rows_pair()
        self._assert_frame_op(data1, data2, (5, 10), (6, 10))

        # monotonic columns, split by columns
        data1, data2 = self._monotonic_columns_pair()
        self._assert_frame_op(data1, data2, (10, 5), (10, 6))

    def testWithShuffleAndWithOneChunk(self):
        # split only along the non-monotonic axis; the other axis is one chunk

        # monotonic rows, split by columns
        data1, data2 = self._monotonic_rows_pair()
        self._assert_frame_op(data1, data2, (10, 5), (10, 6))

        # monotonic columns, split by rows
        data1, data2 = self._monotonic_columns_pair()
        self._assert_frame_op(data1, data2, (5, 10), (6, 10))

    def testChained(self):
        """Feeding one result of ``func`` into a second application of ``func``."""
        data1 = self._random_frame()
        df1 = from_pandas(data1, chunk_size=5)
        data2 = self._random_frame()
        df2 = from_pandas(data2, chunk_size=6)
        first = self.func(df1, df2)

        data4 = self._random_frame()
        df4 = from_pandas(data4, chunk_size=6)
        chained = self.func(first, df4)

        result = self._execute(chained)
        expected = self.func(self.func(data1, data2), data4)
        pd.testing.assert_frame_equal(expected, result)

    def testRfunc(self):
        """The reflected method swaps the operands, for frames and scalars."""
        data1 = self._random_frame()
        df1 = from_pandas(data1, chunk_size=5)
        data2 = self._random_frame()
        df2 = from_pandas(data2, chunk_size=6)
        result = self._execute(getattr(df1, self.rfunc_name)(df2))
        expected = self.func(data2, data1)
        pd.testing.assert_frame_equal(expected, result)

        data3 = self._random_frame()
        df3 = from_pandas(data3, chunk_size=5)
        result = self._execute(getattr(df3, self.rfunc_name)(1))
        expected = self.func(1, data3)
        pd.testing.assert_frame_equal(expected, result)

    def testWithMultiForms(self):
        # test the available call forms: self.func(a, b), a.<func_name>(b)
        # and the reflected a.<rfunc_name>(b)
        data1 = self._random_frame()
        df1 = from_pandas(data1, chunk_size=5)
        data2 = self._random_frame()
        df2 = from_pandas(data2, chunk_size=6)

        expected = self.func(data1, data2)
        # The original ran this identical check twice in a row; once is enough.
        result = self._execute(self.func(df1, df2))
        pd.testing.assert_frame_equal(expected, result)

        result = self._execute(getattr(df1, self.func_name)(df2))
        pd.testing.assert_frame_equal(expected, result)

        result = self._execute(getattr(df1, self.rfunc_name)(df2))
        pd.testing.assert_frame_equal(self.func(data2, data1), result)

    def testDataframeAndScalar(self):
        # test dataframe and scalar
        pdf = self._random_frame()
        df = from_pandas(pdf, chunk_size=2)

        expected = self.func(pdf, 1)
        # The original repeated this identical check three times.
        result = self._execute(self.func(df, 1))
        pd.testing.assert_frame_equal(expected, result)
        result = self._execute(getattr(df, self.func_name)(1))
        pd.testing.assert_frame_equal(expected, result)

        # test scalar and dataframe
        expected = self.func(1, pdf)
        result = self._execute(self.func(1, df))
        pd.testing.assert_frame_equal(expected, result)
        result = self._execute(getattr(df, self.rfunc_name)(1))
        pd.testing.assert_frame_equal(expected, result)

    def testWithShuffleOnStringIndex(self):
        # no axis is monotonic, and the index values are strings.
        data1 = self._random_frame(
            index=[str(x) for x in [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]],
            columns=self._UNSORTED_LEFT)
        data2 = self._random_frame(
            index=[str(x) for x in [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]],
            columns=self._UNSORTED_RIGHT)
        self._assert_frame_op(data1, data2, 5, 6)

    # ------------------------------------------------------------------
    # frame <op> series
    # ------------------------------------------------------------------

    def testDataframeAndSeries(self):
        data1, data2 = self._monotonic_rows_pair()
        column = data2[1]
        s1 = from_pandas_series(column, chunk_size=(6,))

        def check(pdf, axis):
            mars_frame = from_pandas(pdf, chunk_size=(5, 5))
            mars_result = getattr(mars_frame, self.func_name)(s1, axis=axis)
            expected = getattr(pdf, self.func_name)(column, axis=axis)
            pd.testing.assert_frame_equal(expected, self._execute(mars_result))

        # operate on single-column dataframe and series
        check(data1[[1]], 'index')
        # operate on dataframe and series without shuffle
        check(data1, 'index')
        # operate on dataframe and series with shuffle
        check(data1, 'columns')

        by_row = {'ca': [1, 3, 2], 'cb': [360, 180, 2]}

        # test both one chunk, axis=0
        pdf = pd.DataFrame(by_row, index=[1, 2, 3])
        series = pd.Series([0, 1, 2], index=[1, 2, 3])
        expected, result = self._frame_series_op(pdf, series, 0)
        pd.testing.assert_frame_equal(expected, result)

        # test different number of chunks, axis=0
        pdf = pd.DataFrame(by_row, index=[1, 2, 3])
        series = pd.Series([0, 1, 2], index=[1, 2, 3])
        expected, result = self._frame_series_op(pdf, series, 0, chunk_size=1)
        pd.testing.assert_frame_equal(expected, result)

        # test with row shuffle, axis=0
        pdf = pd.DataFrame(by_row, index=[2, 1, 3])
        series = pd.Series([0, 1, 2], index=[3, 1, 2])
        expected, result = self._frame_series_op(pdf, series, 0, chunk_size=1)
        # bring both sides to the same row order before comparing
        expected = expected.reindex([3, 1, 2])
        result = result.reindex(index=[3, 1, 2])
        pd.testing.assert_frame_equal(expected, result)

        # test both one chunk, axis=1
        pdf = pd.DataFrame({1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]},
                           index=['ra', 'rb', 'rc'])
        series = pd.Series([0, 1, 2], index=[1, 2, 3])
        expected, result = self._frame_series_op(pdf, series, 1)
        pd.testing.assert_frame_equal(expected, result)

        # test different number of chunks, axis=1
        pdf = pd.DataFrame({1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]},
                           index=['ra', 'rb', 'rc'])
        series = pd.Series([0, 1, 2], index=[1, 2, 3])
        expected, result = self._frame_series_op(pdf, series, 1, chunk_size=1)
        pd.testing.assert_frame_equal(expected, result)

        # test with column shuffle, axis=1
        pdf = pd.DataFrame({1: [1, 3, 2], 3: [1, 2, 3], 2: [360, 180, 2]},
                           index=['ra', 'rb', 'rc'])
        series = pd.Series([0, 1, 2], index=[3, 1, 2])
        expected, result = self._frame_series_op(pdf, series, 1, chunk_size=1)
        # modify the order of columns
        result = result[[1, 2, 3]]
        pd.testing.assert_frame_equal(expected, result)

    # ------------------------------------------------------------------
    # series <op> series / scalar
    # ------------------------------------------------------------------

    def testSeries(self):
        # only one chunk
        s1 = pd.Series(np.arange(10) + 1)
        s2 = pd.Series(np.arange(10) + 1)
        self._assert_series_op(s1, s2, 10, 10)

        # same index
        s1 = pd.Series(np.arange(10) + 1)
        s2 = pd.Series(np.arange(10) + 1)
        self._assert_series_op(s1, s2, 4, 6)

        # no shuffle
        s1 = pd.Series(np.arange(10) + 1, index=range(10))
        s2 = pd.Series(np.arange(10) + 1, index=range(10, 0, -1))
        self._assert_series_op(s1, s2, 4, 6)

        # shuffle
        s1 = pd.Series(np.arange(10) + 1,
                       index=np.random.permutation(range(10)))
        s2 = pd.Series(np.arange(10) + 1,
                       index=np.random.permutation(range(10, 0, -1)))
        self._assert_series_op(s1, s2, 4, 6)

        # operate with scalar
        s1 = pd.Series(np.arange(10) + 1,
                       index=np.random.permutation(range(10)))
        result = self._execute(self.func(from_pandas_series(s1, chunk_size=4), 4))
        pd.testing.assert_series_equal(self.func(s1, 4), result)

        # reverse with scalar
        s1 = pd.Series(np.arange(10) + 1,
                       index=np.random.permutation(range(10)))
        result = self._execute(self.func(4, from_pandas_series(s1, chunk_size=4)))
        pd.testing.assert_series_equal(self.func(4, s1), result)

    # ------------------------------------------------------------------
    # frame / series <op> plain sequence
    # ------------------------------------------------------------------

    def testWithPlainValue(self):
        data1 = self._random_frame(index=np.arange(10),
                                   columns=self._UNSORTED_LEFT)
        df1 = from_pandas(data1, chunk_size=6)
        s1 = df1[2]
        numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

        def operand_pairs():
            # (operand handed to Mars, operand handed to pandas); rebuilt on
            # every call so no operand object is shared between two checks
            return [
                (list(numbers), list(numbers)),
                (tuple(numbers), tuple(numbers)),
                (np.array(numbers), np.array(numbers)),
                (from_array(np.array(numbers)), np.array(numbers)),
            ]

        for mars_value, pandas_value in operand_pairs():
            mars_result = getattr(df1, self.func_name)(mars_value, axis=0)
            expected = getattr(data1, self.func_name)(pandas_value, axis=0)
            pd.testing.assert_frame_equal(expected, self._execute(mars_result))

        for mars_value, pandas_value in operand_pairs():
            mars_result = getattr(s1, self.func_name)(mars_value)
            expected = getattr(data1[2], self.func_name)(pandas_value)
            pd.testing.assert_series_equal(expected, self._execute(mars_result))

    @unittest.expectedFailure
    def testWithPlainValueUnaligned(self):
        # When adding dataframe with a sequence value, pandas treats the sequence
        # as a series using the index_value of the dataframe.
        #
        # In mars we cannot do such things because the index_value is not stored.
        # We also cannot split the sequence using the nsplits of the dataframe since
        # in many cases the shape of the dataframe chunks is np.nan.
        #
        # We record this case as `expectedFailure`.
        data1 = self._random_frame(index=np.arange(10),
                                   columns=self._UNSORTED_LEFT)
        df1 = from_pandas(data1, chunk_size=6)
        numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

        mars_result = getattr(df1, self.func_name)(numbers, axis=1)
        result = self._execute(mars_result)
        # Compare against pandas on the SAME axis. The original used axis=0
        # here, so the comparison could never match and the expected failure
        # said nothing about the axis=1 behaviour described above.
        expected = getattr(data1, self.func_name)(numbers, axis=1)
        pd.testing.assert_frame_equal(expected, result)
class Test(TestBase):
    """Execution tests for ``to_gpu``, ``to_cpu`` and ``rechunk``."""

    def setUp(self):
        super(Test, self).setUp()
        self.executor = Executor()

    def _fetch(self, obj):
        """Execute ``obj`` with ``concat=True`` and return the first result."""
        return self.executor.execute_dataframe(obj, concat=True)[0]

    @require_cudf
    def testToGPUExecution(self):
        """``to_gpu`` results are cudf objects holding the original values."""
        host_frame = pd.DataFrame(np.random.rand(20, 30),
                                  index=np.arange(20, 0, -1))
        gpu_frame = to_gpu(from_pandas_df(host_frame, chunk_size=(13, 21)))

        fetched = self._fetch(gpu_frame)
        self.assertIsInstance(fetched, cudf.DataFrame)
        pd.testing.assert_frame_equal(fetched.to_pandas(), host_frame)

        # same check for a series, this time through the ``to_gpu`` method
        host_series = host_frame.iloc[:, 0]
        gpu_series = from_pandas_series(host_series).to_gpu()

        fetched = self._fetch(gpu_series)
        self.assertIsInstance(fetched, cudf.Series)
        pd.testing.assert_series_equal(fetched.to_pandas(), host_series)

    @require_cudf
    def testToCPUExecution(self):
        """A ``to_gpu`` → ``to_cpu`` round trip yields pandas objects again."""
        host_frame = pd.DataFrame(np.random.rand(20, 30),
                                  index=np.arange(20, 0, -1))
        on_gpu = to_gpu(from_pandas_df(host_frame, chunk_size=(13, 21)))
        back_on_cpu = to_cpu(on_gpu)

        fetched = self._fetch(back_on_cpu)
        self.assertIsInstance(fetched, pd.DataFrame)
        pd.testing.assert_frame_equal(fetched, host_frame)

        host_series = host_frame.iloc[:, 0]
        series_on_gpu = to_gpu(from_pandas_series(host_series, chunk_size=(13, 21)))
        series_back = to_cpu(series_on_gpu)

        fetched = self._fetch(series_back)
        self.assertIsInstance(fetched, pd.Series)
        pd.testing.assert_series_equal(fetched, host_series)

    def testRechunkExecution(self):
        """``rechunk`` leaves the executed data of frames and series unchanged."""
        # frame with default labels, re-split from chunk size 3 to (3, 4)
        plain = pd.DataFrame(np.random.rand(8, 10))
        chunked = from_pandas_df(pd.DataFrame(plain), chunk_size=3)
        rechunked = chunked.rechunk((3, 4))
        pd.testing.assert_frame_equal(plain, self._fetch(rechunked))

        # frame with random integer row labels and random byte-string columns
        values = np.random.rand(10, 10)
        row_labels = np.random.randint(-100, 100, size=(10, ))
        column_labels = [np.random.bytes(10) for _ in range(10)]
        labelled = pd.DataFrame(values, index=row_labels, columns=column_labels)
        rechunked = from_pandas_df(labelled).rechunk(5)
        pd.testing.assert_frame_equal(labelled, self._fetch(rechunked))

        # test Series rechunk execution.
        raw_series = pd.Series(np.random.rand(10, ))
        series = from_pandas_series(raw_series)
        for new_chunk_size in (3, 1):
            rechunked = series.rechunk(new_chunk_size)
            pd.testing.assert_series_equal(raw_series, self._fetch(rechunked))
class Test(TestBase):
    """Execution tests checking DataFrame `add`/`radd`/`abs` against pandas."""

    def setUp(self):
        super(Test, self).setUp()
        self.executor = Executor()

    def _compute(self, mars_obj):
        # Execute with concat=True and hand back the first returned item.
        return self.executor.execute_dataframe(mars_obj, concat=True)[0]

    def _check_add(self, left, right, left_chunks, right_chunks):
        # Chunk both pandas frames, add them with `add` and compare the
        # executed result with the plain pandas sum.
        mars_left = from_pandas(left, chunk_size=left_chunks)
        mars_right = from_pandas(right, chunk_size=right_chunks)
        mars_sum = add(mars_left, mars_right)
        expected = left + right
        actual = self._compute(mars_sum)
        pd.testing.assert_frame_equal(expected, actual)

    @staticmethod
    def _pair_with_monotonic_index():
        # Two frames whose index is monotonic while the columns are not.
        left = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        right = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        return left, right

    @staticmethod
    def _pair_with_monotonic_columns():
        # Two frames whose columns are monotonic while the index is not.
        left = pd.DataFrame(np.random.rand(10, 10),
                            index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
                            columns=np.arange(10))
        right = pd.DataFrame(np.random.rand(10, 10),
                             index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2],
                             columns=np.arange(11, 1, -1))
        return left, right

    def testAddWithoutShuffleExecution(self):
        # all the axes are monotonic
        # left: rows 0..9 and columns 3..12, chunked by 5 on both axes
        left = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                            columns=np.arange(3, 13))
        # right: rows 11..2 (descending) and columns 4..13, chunked by 6
        right = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                             columns=np.arange(4, 14))
        self._check_add(left, right, 5, 6)

    def testAddWithOneShuffleExecution(self):
        # only the index is monotonic
        left, right = self._pair_with_monotonic_index()
        self._check_add(left, right, 5, 6)

        # only the columns are monotonic
        left, right = self._pair_with_monotonic_columns()
        self._check_add(left, right, 5, 6)

    def testAddWithAllShuffleExecution(self):
        # no axis is monotonic
        left = pd.DataFrame(np.random.rand(10, 10),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        right = pd.DataFrame(np.random.rand(10, 10),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        self._check_add(left, right, 5, 6)

    def testAddBothWithOneChunk(self):
        # chunk_size equals the frame size on both operands

        # only the index is monotonic
        left, right = self._pair_with_monotonic_index()
        self._check_add(left, right, 10, 10)

        # only the columns are monotonic
        left, right = self._pair_with_monotonic_columns()
        self._check_add(left, right, 10, 10)

    def testAddWithoutShuffleAndWithOneChunk(self):
        # the monotonic axis is the one being split; the other axis is
        # covered by a chunk size of 10

        # only the index is monotonic
        left, right = self._pair_with_monotonic_index()
        self._check_add(left, right, (5, 10), (6, 10))

        # only the columns are monotonic
        left, right = self._pair_with_monotonic_columns()
        self._check_add(left, right, (10, 5), (10, 6))

    def testAddWithShuffleAndWithOneChunk(self):
        # the non-monotonic axis is the one being split; the monotonic axis
        # is covered by a chunk size of 10

        # only the index is monotonic
        left, right = self._pair_with_monotonic_index()
        self._check_add(left, right, (10, 5), (10, 6))

        # only the columns are monotonic
        left, right = self._pair_with_monotonic_columns()
        self._check_add(left, right, (5, 10), (6, 10))

    def testAddWithAdded(self):
        # the result of one `add` is fed into a second `add`
        first = pd.DataFrame(np.random.rand(10, 10))
        mars_first = from_pandas(first, chunk_size=5)
        second = pd.DataFrame(np.random.rand(10, 10))
        mars_second = from_pandas(second, chunk_size=6)
        partial_sum = add(mars_first, mars_second)

        third = pd.DataFrame(np.random.rand(10, 10))
        mars_third = from_pandas(third, chunk_size=6)
        total = add(partial_sum, mars_third)

        actual = self._compute(total)
        expected = first + second + third
        pd.testing.assert_frame_equal(expected, actual)

    def testRadd(self):
        left = pd.DataFrame(np.random.rand(10, 10))
        mars_left = from_pandas(left, chunk_size=5)
        right = pd.DataFrame(np.random.rand(10, 10))
        mars_right = from_pandas(right, chunk_size=6)

        # NOTE(review): `__radd__` is looked up on the instance and then called
        # with both operands; if it is a bound method it receives `mars_right`
        # as an extra positional argument -- confirm this is intended.
        reflected_add = mars_right.__radd__
        mars_sum = reflected_add(mars_left, mars_right)

        actual = self._compute(mars_sum)
        expected = left + right
        pd.testing.assert_frame_equal(expected, actual)

    def testAddWithMultiForms(self):
        # test multiple forms of add
        # such as self+other, self.add(other), add(self,other)
        left = pd.DataFrame(np.random.rand(10, 10))
        mars_left = from_pandas(left, chunk_size=5)
        right = pd.DataFrame(np.random.rand(10, 10))
        mars_right = from_pandas(right, chunk_size=6)

        expected = left + right

        # Each form is built lazily so that it is constructed right before it
        # is executed, one form at a time.
        spellings = (
            lambda: mars_left + mars_right,
            lambda: add(mars_left, mars_right),
            lambda: mars_left.add(mars_right),
            lambda: mars_left.radd(mars_right),
        )
        for build in spellings:
            actual = self._compute(build())
            pd.testing.assert_frame_equal(expected, actual)

    def testAbs(self):
        # values are drawn from [-1, 1) so both signs are present
        raw = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)))
        mars_raw = from_pandas(raw, chunk_size=5)

        actual = self._compute(abs(mars_raw))
        expected = raw.abs()
        pd.testing.assert_frame_equal(expected, actual)
class Test(TestBase):
    """Execution tests comparing chunked `merge`/`join` results with pandas."""

    def setUp(self):
        super(Test, self).setUp()
        self.executor = Executor()

    def _compute(self, mars_obj):
        # Execute with concat=True and hand back the first returned item.
        return self.executor.execute_dataframe(mars_obj, concat=True)[0]

    def testMerge(self):
        df1 = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1,
                           columns=['a', 'b', 'c', 'd', 'e'])
        df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1,
                           columns=['a', 'b', 'x', 'y'])

        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        # Note [Index of Merge]
        #
        # When `left_index` and `right_index` of `merge` are both false, pandas
        # generates a RangeIndex for the final result dataframe.
        #
        # We chunk the `left` and `right` dataframes, so every result chunk has
        # its own RangeIndex. When the chunks are concatenated we do not
        # generate a new RangeIndex for the result, hence we cannot obtain the
        # same index values as pandas. But we guarantee that the content of the
        # dataframe is correct.

        # Each scenario is (merge keyword arguments, column used as the index
        # before comparing).  `None` means the frames are compared as they are.
        scenarios = [
            # default merge: no key arguments given
            ({}, None),
            # merge on left index and `right_on`
            ({'how': 'left', 'right_on': 'x', 'left_index': True}, 'a_x'),
            # merge on `left_on` and right index
            ({'how': 'right', 'left_on': 'a', 'right_index': True}, 'a'),
            # merge on `left_on` and `right_on`
            ({'how': 'left', 'left_on': 'a', 'right_on': 'x'}, 'a_x'),
            # merge on `on`
            ({'how': 'right', 'on': 'a'}, 'a'),
            # merge on multiple columns
            ({'how': 'inner', 'on': ['a', 'b']}, None),
        ]

        for merge_kwargs, key_column in scenarios:
            expected = df1.merge(df2, **merge_kwargs)
            merged = mdf1.merge(mdf2, **merge_kwargs)
            result = self._compute(merged)

            if key_column is not None:
                expected.set_index(key_column, inplace=True)
                result.set_index(key_column, inplace=True)

            pd.testing.assert_frame_equal(sort_dataframe_inplace(expected, 0),
                                          sort_dataframe_inplace(result, 0))

    def testJoin(self):
        df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                           index=['a1', 'a2', 'a3'])
        df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]],
                           index=['a1', 'b2', 'b3']) + 1
        df2 = pd.concat([df2, df2 + 1])

        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        # The empty dict exercises the default `how`; the others pass it
        # explicitly.
        how_variants = [
            {},
            {'how': 'left'},
            {'how': 'right'},
            {'how': 'inner'},
            {'how': 'outer'},
        ]

        for how_kwargs in how_variants:
            expected = df1.join(df2, lsuffix='l_', rsuffix='r_', **how_kwargs)
            joined = mdf1.join(mdf2, lsuffix='l_', rsuffix='r_', **how_kwargs)
            result = self._compute(joined)
            pd.testing.assert_frame_equal(expected.sort_index(),
                                          result.sort_index())

    def testJoinOn(self):
        df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                           columns=['a1', 'a2', 'a3'])
        df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]],
                           columns=['a1', 'b2', 'b3']) + 1
        df2 = pd.concat([df2, df2 + 1])

        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        def join_both(**join_kwargs):
            # Run the same join with pandas and with the chunked frames and
            # return (pandas result, lazy joined frame, executed result).
            expected = df1.join(df2, lsuffix='_l', rsuffix='_r', **join_kwargs)
            joined = mdf1.join(mdf2, lsuffix='_l', rsuffix='_r', **join_kwargs)
            return expected, joined, self._compute(joined)

        expected0, _, result0 = join_both(on=None)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected0, 0),
                                      sort_dataframe_inplace(result0, 0))

        expected1, jdf1, result1 = join_both(how='left', on='a1')

        # Note [Columns of Left Join]
        #
        # I believe we have no chance to obtain entirely the same result as
        # pandas here:
        #
        # Look at the following example:
        #
        # >>> df1
        #    a1  a2  a3
        # 0   1   3   3
        # >>> df2
        #    a1  b2  b3
        # 1   2   6   7
        # >>> df3
        #    a1  b2  b3
        # 1   2   6   7
        # 1   2   6   7
        #
        # >>> df1.merge(df2, how='left', left_on='a1', left_index=False, right_index=True)
        #    a1_x  a2  a3  a1_y  b2  b3
        # 0     1   3   3     2   6   7
        # >>> df1.merge(df3, how='left', left_on='a1', left_index=False, right_index=True)
        #    a1  a1_x  a2  a3  a1_y  b2  b3
        # 0   1     1   3   3     2   6   7
        # 0   1     1   3   3     2   6   7
        #
        # Note that the result of `df1.merge(df3)` has an extra column `a1`
        # compared to `df1.merge(df2)`. The value of column `a1` is the same as
        # `a1_x`, just because `1` occurs twice in the index of `df3`.
        # I haven't investigated why pandas has such behaviour...
        #
        # We cannot yield the same result as pandas because `df3` is chunked:
        # some of the result chunks have 6 columns while others may have 7, so
        # when they are concatenated into one DataFrame some cells of column
        # `a1` will be `NaN`, which differs from the result of pandas.
        #
        # But we can guarantee that the other effective columns have exactly
        # the same values as pandas.
        columns_to_compare = jdf1.columns_value.to_pandas()
        pd.testing.assert_frame_equal(
            sort_dataframe_inplace(expected1[columns_to_compare], 0, 1),
            sort_dataframe_inplace(result1[columns_to_compare], 0, 1))

        # Note [Index of Join on EmptyDataFrame]
        #
        # It is non-trivial to get the same `index` result as pandas.
        #
        # Look at the following example:
        #
        # >>> df1
        #    a1  a2  a3
        # 1   4   2   6
        # >>> df2
        #    a1  b2  b3
        # 1   2   6   7
        # 2   8   9  10
        # >>> df3
        # Empty DataFrame
        # Columns: [a1, a2, a3]
        # Index: []
        # >>> df1.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        #      a1_l  a2   a3  a1_r  b2  b3
        # 1.0   4.0   2  6.0     8   9  10
        # NaN   NaN   1  NaN     2   6   7
        # >>> df3.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        #   a1_l  a2  a3  a1_r  b2  b3
        # 1  NaN   1 NaN     2   6   7
        # 2  NaN   2 NaN     8   9  10
        #
        # When the `left` dataframe is not empty, the mismatched rows in
        # `right` get index value `NaN` and the matched rows get their index
        # value from `right`. When the `left` dataframe is empty, the
        # mismatched rows get their index value from `right`.
        #
        # Since we chunk the `left` dataframe, it is hard to obtain the same
        # index values as pandas in the final result dataframe, but we
        # guarantee that the dataframe content is correct.
        expected2, _, result2 = join_both(how='right', on='a2')
        expected2.set_index('a2', inplace=True)
        result2.set_index('a2', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected2, 0),
                                      sort_dataframe_inplace(result2, 0))

        expected3, _, result3 = join_both(how='inner', on='a2')
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected3, 0),
                                      sort_dataframe_inplace(result3, 0))

        expected4, _, result4 = join_both(how='outer', on='a2')
        expected4.set_index('a2', inplace=True)
        result4.set_index('a2', inplace=True)
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected4, 0),
                                      sort_dataframe_inplace(result4, 0))
class Test(TestBase):
    """Execution tests for DataFrame/Series data sources (pandas, tensor, records, CSV)."""

    def setUp(self):
        super(Test, self).setUp()
        self.executor = Executor()

    def _compute(self, mars_obj):
        # Execute a DataFrame/Series with concat=True and return the first item.
        return self.executor.execute_dataframe(mars_obj, concat=True)[0]

    def _compute_tensor(self, tensor):
        # Execute a tensor with concat=True and return the first item.
        return self.executor.execute_tensor(tensor, concat=True)[0]

    def _check_csv_round_trip(self, frame, file_name, chunk_bytes, **io_kwargs):
        # Write `frame` to a CSV file in a fresh temporary directory, then
        # compare `pd.read_csv` with `md.read_csv`, the latter both without and
        # with `chunk_bytes`.  `io_kwargs` are passed to the writer and to
        # every reader.  The directory is always removed.
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, file_name)
        try:
            frame.to_csv(file_path, **io_kwargs)

            pdf = pd.read_csv(file_path, index_col=0, **io_kwargs)
            mdf = self._compute(md.read_csv(file_path, index_col=0, **io_kwargs))
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self._compute(md.read_csv(file_path, index_col=0,
                                             chunk_bytes=chunk_bytes, **io_kwargs))
            pd.testing.assert_frame_equal(pdf, mdf2)
        finally:
            shutil.rmtree(tempdir)

    @staticmethod
    def _date_indexed_frame():
        # 100 rows with float, string and integer columns and a daily index.
        index = pd.date_range(start='1/1/2018', periods=100)
        return pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100)
            },
            index=index)

    def testFromPandasDataFrameExecution(self):
        # two-level index built from two arrays
        source = pd.DataFrame(np.random.rand(20, 30),
                              index=[np.arange(20), np.arange(20, 0, -1)])
        chunked = from_pandas_df(source, chunk_size=(13, 21))

        pd.testing.assert_frame_equal(source, self._compute(chunked))

    def testFromPandasSeriesExecution(self):
        source = pd.Series(np.random.rand(20),
                           index=[np.arange(20), np.arange(20, 0, -1)], name='a')
        chunked = from_pandas_series(source, chunk_size=13)

        pd.testing.assert_series_equal(source, self._compute(chunked))

    def testInitializerExecution(self):
        # `md.DataFrame` constructed directly from a pandas DataFrame
        source_frame = pd.DataFrame(np.random.rand(20, 30),
                                    index=[np.arange(20), np.arange(20, 0, -1)])
        mars_frame = md.DataFrame(source_frame, chunk_size=(15, 10))
        pd.testing.assert_frame_equal(source_frame, self._compute(mars_frame))

        # `md.Series` constructed directly from a pandas Series
        source_series = pd.Series(np.random.rand(20),
                                  index=[np.arange(20), np.arange(20, 0, -1)], name='a')
        mars_series = md.Series(source_series, chunk_size=7)
        pd.testing.assert_series_equal(source_series, self._compute(mars_series))

    def testSeriesFromTensor(self):
        raw = np.random.rand(10)

        # named series from an unchunked tensor
        named = md.Series(mt.tensor(raw), name='a')
        pd.testing.assert_series_equal(named.execute(), pd.Series(raw, name='a'))

        # series from a tensor with chunk_size=3
        chunked = md.Series(mt.tensor(raw, chunk_size=3))
        pd.testing.assert_series_equal(chunked.execute(), pd.Series(raw))

        # series from a tensor of ones
        ones = md.Series(mt.ones((10, ), chunk_size=4))
        pd.testing.assert_series_equal(ones.execute(), pd.Series(np.ones(10, )))

    def testFromTensorExecution(self):
        random_tensor = mt.random.rand(10, 10, chunk_size=5)
        random_frame = dataframe_from_tensor(random_tensor)
        tensor_values = self._compute_tensor(random_tensor)
        expected = pd.DataFrame(tensor_values)
        actual = self._compute(random_frame)
        pd.testing.assert_index_equal(actual.index, pd.RangeIndex(0, 10))
        pd.testing.assert_index_equal(actual.columns, pd.RangeIndex(0, 10))
        pd.testing.assert_frame_equal(actual, expected)

        # test converted with specified index_value and columns
        small_tensor = mt.random.rand(2, 2, chunk_size=1)
        labelled_frame = dataframe_from_tensor(small_tensor, index=pd.Index(['a', 'b']),
                                               columns=pd.Index([3, 4]))
        actual = self._compute(labelled_frame)
        pd.testing.assert_index_equal(actual.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(actual.columns, pd.Index([3, 4]))

        # test converted from 1-d tensor
        flat_tensor = mt.array([1, 2, 3])
        flat_frame = dataframe_from_tensor(flat_tensor)
        actual = self._compute(flat_frame)
        expected = pd.DataFrame(np.array([1, 2, 3]))
        pd.testing.assert_frame_equal(expected, actual)

        # test converted from identical chunks
        ones_tensor = mt.ones((10, 10), chunk_size=3)
        ones_frame = dataframe_from_tensor(ones_tensor)
        actual = self._compute(ones_frame)
        expected = pd.DataFrame(self._compute_tensor(ones_tensor))
        pd.testing.assert_frame_equal(expected, actual)

        # from tensor with given index
        ones_tensor = mt.ones((10, 10), chunk_size=3)
        indexed_frame = dataframe_from_tensor(ones_tensor, index=np.arange(0, 20, 2))
        actual = self._compute(indexed_frame)
        expected = pd.DataFrame(self._compute_tensor(ones_tensor),
                                index=np.arange(0, 20, 2))
        pd.testing.assert_frame_equal(expected, actual)

        # from tensor with given columns
        ones_tensor = mt.ones((10, 10), chunk_size=3)
        named_frame = dataframe_from_tensor(ones_tensor, columns=list('abcdefghij'))
        actual = self._compute(named_frame)
        expected = pd.DataFrame(self._compute_tensor(ones_tensor),
                                columns=list('abcdefghij'))
        pd.testing.assert_frame_equal(expected, actual)

    def testFromRecordsExecution(self):
        record_dtype = np.dtype([('x', 'int'), ('y', 'double'), ('z', '<U16')])
        records = np.ones((10, ), dtype=record_dtype)
        expected = pd.DataFrame.from_records(records, index=pd.RangeIndex(10))

        # from structured array of mars
        mars_records = mt.ones((10, ), dtype=record_dtype, chunk_size=3)
        from_mars = self._compute(from_records(mars_records))
        pd.testing.assert_frame_equal(from_mars, expected)

        # from structured array of numpy
        from_numpy = self._compute(from_records(records))
        pd.testing.assert_frame_equal(from_numpy, expected)

    def testReadCSVExecution(self):
        # plain integer frame
        small = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                             columns=['a', 'b', 'c'])
        self._check_csv_round_trip(small, 'test.csv', 10)

        # test sep
        small = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                             columns=['a', 'b', 'c'])
        self._check_csv_round_trip(small, 'test.csv', 10, sep=';')

        # test missing value
        with_gaps = pd.DataFrame({
            'c1': [np.nan, 'a', 'b', 'c'],
            'c2': [1, 2, 3, np.nan],
            'c3': [np.nan, np.nan, 3.4, 2.2]
        })
        self._check_csv_round_trip(with_gaps, 'test.csv', 12)

        # larger frame with a date index
        self._check_csv_round_trip(self._date_indexed_frame(), 'test.csv', 100)

        # test compression
        self._check_csv_round_trip(self._date_indexed_frame(), 'test.gzip', '1k',
                                   compression='gzip')

        # test multiple files
        tempdir = tempfile.mkdtemp()
        try:
            whole = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, 'test{}.csv'.format(i)) for i in range(3)
            ]
            # one file per block of 100 rows
            whole[:100].to_csv(file_paths[0])
            whole[100:200].to_csv(file_paths[1])
            whole[200:].to_csv(file_paths[2])

            mdf = self._compute(md.read_csv(file_paths, index_col=0))
            pd.testing.assert_frame_equal(whole, mdf)

            mdf2 = self._compute(md.read_csv(file_paths, index_col=0, chunk_bytes=50))
            pd.testing.assert_frame_equal(whole, mdf2)
        finally:
            shutil.rmtree(tempdir)