def testNunique(self):
    # Checks the metadata of DataFrame.nunique() and the layout of its tiled
    # result, first with the default axis and then with axis=1.
    raw = pd.DataFrame(np.random.randint(0, 6, size=(20, 10)),
                       columns=['c' + str(i) for i in range(10)])

    # (pandas input, nunique kwargs, result length, expected chunk sizes)
    scenarios = [
        (raw, {}, 10, (3, 3, 3, 1,)),
        (raw.copy(), {'axis': 1}, 20, (3, 3, 3, 3, 3, 3, 2,)),
    ]
    for frame, kwargs, length, splits in scenarios:
        mdf = from_pandas_df(frame, chunk_size=3)
        result = mdf.nunique(**kwargs)

        self.assertEqual(result.shape, (length,))
        self.assertEqual(result.op.output_types[0], OutputType.series)
        self.assertIsInstance(result.op, DataFrameNunique)

        tiled = result.tiles()
        self.assertEqual(tiled.shape, (length,))
        self.assertEqual(len(tiled.chunks), len(splits))
        self.assertEqual(tiled.nsplits, (splits,))
        self.assertEqual(tiled.chunks[0].op.stage, OperandStage.agg)
        self.assertIsInstance(tiled.chunks[0].op, DataFrameAggregate)
def testRechunkExecution(self):
    # Executes rechunked DataFrame, Series and Index objects and checks that
    # the concatenated result equals the pandas object they were built from.
    def fetch(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    raw = pd.DataFrame(np.random.rand(8, 10))
    mdf = from_pandas_df(pd.DataFrame(raw), chunk_size=3)
    pd.testing.assert_frame_equal(raw, fetch(mdf.rechunk((3, 4))))

    raw = pd.DataFrame(np.random.rand(10, 10),
                       index=np.random.randint(-100, 100, size=(10,)),
                       columns=[np.random.bytes(10) for _ in range(10)])
    mdf = from_pandas_df(raw)
    pd.testing.assert_frame_equal(raw, fetch(mdf.rechunk(5)))

    # test Series rechunk execution.
    raw = pd.Series(np.random.rand(10,))
    series = from_pandas_series(raw)
    for size in (3, 1):
        pd.testing.assert_series_equal(raw, fetch(series.rechunk(size)))

    # test index rechunk execution
    raw = pd.Index(np.random.rand(10,))
    index = from_pandas_index(raw)
    for size in (3, 1):
        pd.testing.assert_index_equal(raw, fetch(index.rechunk(size)))
def testDataFrameReduction(self):
    # Checks result metadata and tiled chunk structure for the reduction named
    # by self.func_name (default axis and axis='columns'), and that passing
    # `level` raises NotImplementedError.
    def reduce_frame(frame, chunk_size, **kwargs):
        method = getattr(from_pandas_df(frame, chunk_size=chunk_size), self.func_name)
        return method(**kwargs)

    def check_first_chunk(tiled):
        head = tiled.chunks[0]
        self.assertIsInstance(head.op, DataFrameAggregate)
        self.assertIsInstance(head.inputs[0].op, DataFrameConcat)
        self.assertEqual(len(head.inputs[0].inputs), 2)

    # two columns, string index
    raw = pd.DataFrame({
        'a': list(range(20)),
        'b': list(range(20, 0, -1))
    }, index=[str(i) for i in range(20)])
    reduced = reduce_frame(raw, 3)
    self.assertIsInstance(reduced, Series)
    self.assertIsInstance(reduced.op, self.op)
    self.assertIsInstance(reduced.index_value._index_value, IndexValue.Index)
    self.assertEqual(reduced.shape, (2, ))

    reduced = reduced.tiles()
    self.assertEqual(len(reduced.chunks), 1)
    check_first_chunk(reduced)

    # ten columns, default index
    raw = pd.DataFrame(np.random.rand(20, 10))
    reduced = reduce_frame(raw, 3)
    self.assertIsInstance(reduced, Series)
    self.assertIsInstance(reduced.index_value._index_value, IndexValue.RangeIndex)
    self.assertEqual(reduced.shape, (10, ))

    reduced = reduced.tiles()
    self.assertEqual(len(reduced.chunks), 4)
    self.assertEqual(reduced.nsplits, ((3, 3, 3, 1), ))
    check_first_chunk(reduced)

    # reduction along columns
    raw = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])
    reduced = reduce_frame(raw, 4, axis='columns')
    self.assertEqual(reduced.shape, (20, ))

    reduced = reduced.tiles()
    self.assertEqual(len(reduced.chunks), 5)
    self.assertEqual(reduced.nsplits, ((4, ) * 5, ))
    check_first_chunk(reduced)

    with self.assertRaises(NotImplementedError):
        reduce_frame(raw, 3, level=0, axis=1)
def testDataFrameCount(self):
    # Compares executed DataFrame.count() against pandas for several chunk
    # layouts, along both axes and with numeric_only=True.
    raw = pd.DataFrame({
        "Person": ["John", "Myla", "Lewis", "John", "Myla"],
        "Age": [24., np.nan, 21., 33, 26],
        "Single": [False, True, True, True, False]})

    both_axes = [{}, {'axis': 'columns'}]
    numeric_only = [{'numeric_only': True},
                    {'axis': 'columns', 'numeric_only': True}]
    # (from_pandas_df kwargs, count() kwargs to try on that frame)
    scenarios = [
        ({}, both_axes),
        ({'chunk_size': 2}, both_axes),
        ({'chunk_size': 3}, numeric_only),
    ]
    for source_kwargs, count_kwargs_list in scenarios:
        mdf = from_pandas_df(raw, **source_kwargs)
        for count_kwargs in count_kwargs_list:
            actual = self.executor.execute_dataframe(
                mdf.count(**count_kwargs), concat=True)[0]
            wanted = raw.count(**count_kwargs)
            pd.testing.assert_series_equal(actual, wanted)
def testDataFrameShuffle(self, *_):
    # Runs DataFrame merges on a local cluster, through the cluster session
    # and then through a web session, comparing each result with pandas.
    from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
    from mars.dataframe.merge.merge import merge
    from mars.dataframe.utils import sort_dataframe_inplace

    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True) as cluster:
        session = cluster.session

        left_raw = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1,
                                columns=['a', 'b', 'c', 'd', 'e'])
        right_raw = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1,
                                 columns=['a', 'b', 'x', 'y'])
        left = from_pandas_df(left_raw, chunk_size=2)
        right = from_pandas_df(right_raw, chunk_size=2)

        def check_merges(sess):
            # default merge first, then an explicit inner join on two keys
            for kwargs in ({}, {'how': 'inner', 'on': ['a', 'b']}):
                wanted = left_raw.merge(right_raw, **kwargs)
                actual = sess.run(merge(left, right, **kwargs), timeout=_exec_timeout)
                pd.testing.assert_frame_equal(sort_dataframe_inplace(wanted, 0),
                                              sort_dataframe_inplace(actual, 0))

        check_merges(session)

        web_session = new_session('http://' + cluster._web_endpoint)
        check_merges(web_session)
def testResetIndex(self):
    # Checks shape, dtypes and per-chunk index ranges produced by reset_index
    # on a DataFrame (with and without drop=True) and on a Series.
    raw = pd.DataFrame([('bird', 389.0),
                        ('bird', 24.0),
                        ('mammal', 80.5),
                        ('mammal', np.nan)],
                       index=['falcon', 'parrot', 'lion', 'monkey'],
                       columns=('class', 'max_speed'))

    mdf = df_reset_index(from_pandas_df(raw, chunk_size=2))
    wanted = raw.reset_index()
    self.assertEqual(mdf.shape, (4, 3))
    pd.testing.assert_series_equal(mdf.dtypes, wanted.dtypes)

    tiled = mdf.tiles()
    self.assertEqual(len(tiled.chunks), 2)
    for pos, index_range in enumerate((pd.RangeIndex(2), pd.RangeIndex(2, 4))):
        chunk = tiled.chunks[pos]
        self.assertEqual(chunk.shape, (2, 3))
        pd.testing.assert_index_equal(chunk.index_value.to_pandas(), index_range)
        pd.testing.assert_series_equal(chunk.dtypes, wanted.dtypes)

    mdf = df_reset_index(from_pandas_df(raw, chunk_size=1), drop=True)
    wanted = raw.reset_index(drop=True)
    self.assertEqual(mdf.shape, (4, 2))
    pd.testing.assert_series_equal(mdf.dtypes, wanted.dtypes)

    tiled = mdf.tiles()
    self.assertEqual(len(tiled.chunks), 8)
    for chunk in tiled.chunks:
        row = chunk.index[0]
        col = chunk.index[1]
        self.assertEqual(chunk.shape, (1, 1))
        pd.testing.assert_index_equal(
            chunk.index_value.to_pandas(), pd.RangeIndex(row, row + 1))
        pd.testing.assert_series_equal(chunk.dtypes, wanted.dtypes[col:col + 1])

    # test Series
    series_raw = pd.Series([1, 2, 3, 4], name='foo',
                           index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))
    ms = series_reset_index(from_pandas_series(series_raw, chunk_size=2))
    wanted = series_raw.reset_index()
    self.assertEqual(ms.shape, (4, 2))
    pd.testing.assert_series_equal(ms.dtypes, wanted.dtypes)

    tiled = ms.tiles()
    self.assertEqual(len(tiled.chunks), 2)
    for pos, index_range in enumerate((pd.RangeIndex(2), pd.RangeIndex(2, 4))):
        chunk = tiled.chunks[pos]
        self.assertEqual(chunk.shape, (2, 2))
        pd.testing.assert_index_equal(chunk.index_value.to_pandas(), index_range)
def testDataFrameReduction(self):
    # Checks that the cumulative method named by self.func_name keeps the
    # DataFrame shape and tiles into combine-stage chunks fed by map-stage
    # inputs.
    def check_last_chunk(tiled):
        tail = tiled.chunks[-1]
        self.assertIsInstance(tail.inputs[-1].op, self.op)
        self.assertEqual(tail.inputs[-1].op.stage, OperandStage.map)
        self.assertEqual(len(tail.inputs), 7)

    raw = pd.DataFrame({'a': list(range(20)), 'b': list(range(20, 0, -1))},
                       index=[str(i) for i in range(20)])
    cum = getattr(from_pandas_df(raw, chunk_size=3), self.func_name)()
    self.assertIsInstance(cum, DataFrame)
    self.assertIsInstance(cum.index_value._index_value, IndexValue.Index)
    self.assertEqual(cum.shape, (20, 2))

    cum = cum.tiles()
    self.assertEqual(len(cum.chunks), 7)
    self.assertIsInstance(cum.chunks[0].op, self.op)
    self.assertEqual(cum.chunks[0].op.stage, OperandStage.combine)
    check_last_chunk(cum)

    raw = pd.DataFrame(np.random.rand(20, 10))
    cum = getattr(from_pandas_df(raw, chunk_size=3), self.func_name)()
    self.assertIsInstance(cum, DataFrame)
    self.assertIsInstance(cum.index_value._index_value, IndexValue.RangeIndex)
    self.assertEqual(cum.shape, (20, 10))

    cum = cum.tiles()
    self.assertEqual(len(cum.chunks), 28)
    self.assertEqual(cum.nsplits, ((3, 3, 3, 3, 3, 3, 2), (3, 3, 3, 1)))
    self.assertEqual(cum.chunks[0].op.stage, OperandStage.combine)
    check_last_chunk(cum)
def test_nunique():
    """Check nunique() metadata and tiled layout for the default axis and axis=1."""
    raw = pd.DataFrame(np.random.randint(0, 6, size=(20, 10)),
                       columns=['c' + str(i) for i in range(10)])

    # (pandas input, nunique kwargs, result length, expected chunk sizes)
    scenarios = [
        (raw, {}, 10, (3, 3, 3, 1,)),
        (raw.copy(), {'axis': 1}, 20, (3, 3, 3, 3, 3, 3, 2,)),
    ]
    for frame, kwargs, length, splits in scenarios:
        mdf = from_pandas_df(frame, chunk_size=3)
        result = mdf.nunique(**kwargs)

        assert result.shape == (length,)
        assert result.op.output_types[0] == OutputType.series
        assert isinstance(result.op, DataFrameNunique)

        tiled = tile(result)
        assert tiled.shape == (length,)
        assert len(tiled.chunks) == len(splits)
        assert tiled.nsplits == (splits,)
        assert tiled.chunks[0].op.stage == OperandStage.agg
        assert isinstance(tiled.chunks[0].op, DataFrameAggregate)
def test_cum_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
    """Check metadata and tiled chunk stages of the cumulative method ``func_name``."""
    def check_last_chunk(tiled):
        tail = tiled.chunks[-1]
        assert isinstance(tail.inputs[-1].op, op)
        assert tail.inputs[-1].op.stage == OperandStage.map
        assert len(tail.inputs) == 7

    raw = pd.DataFrame({'a': list(range(20)), 'b': list(range(20, 0, -1))},
                       index=[str(i) for i in range(20)])
    cum = getattr(from_pandas_df(raw, chunk_size=3), func_name)()
    assert isinstance(cum, DataFrame)
    assert isinstance(cum.index_value._index_value, IndexValue.Index)
    assert cum.shape == (20, 2)

    cum = tile(cum)
    assert len(cum.chunks) == 7
    assert isinstance(cum.chunks[0].op, op)
    assert cum.chunks[0].op.stage == OperandStage.combine
    check_last_chunk(cum)

    raw = pd.DataFrame(np.random.rand(20, 10))
    cum = getattr(from_pandas_df(raw, chunk_size=3), func_name)()
    assert isinstance(cum, DataFrame)
    assert isinstance(cum.index_value._index_value, IndexValue.RangeIndex)
    assert cum.shape == (20, 10)

    cum = tile(cum)
    assert len(cum.chunks) == 28
    assert cum.nsplits == ((3, 3, 3, 3, 3, 3, 2), (3, 3, 3, 1))
    assert cum.chunks[0].op.stage == OperandStage.combine
    check_last_chunk(cum)
def testDescribeExecution(self):
    # Compares executed describe() results with pandas for Series and
    # DataFrame inputs, each in a single-chunk and a multi-chunk layout, and
    # checks that an out-of-range percentile raises ValueError.
    def fetch(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    s_raw = pd.Series(np.random.rand(10))

    # chunk_size=10 keeps the 10-element series in one chunk, 3 splits it
    for chunk_size in (10, 3):
        series = from_pandas_series(s_raw, chunk_size=chunk_size)
        for kwargs in ({}, {'percentiles': []}):
            result = fetch(series.describe(**kwargs))
            expected = s_raw.describe(**kwargs)
            pd.testing.assert_series_equal(result, expected)

    df_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('abcd'))
    df_raw['e'] = np.random.randint(100, size=10)

    # test one chunk
    df = from_pandas_df(df_raw, chunk_size=10)

    result = fetch(df.describe())
    expected = df_raw.describe()
    pd.testing.assert_frame_equal(result, expected)

    # `include` passed to a Series describe (still the multi-chunk series)
    result = fetch(series.describe(percentiles=[], include=np.float64))
    expected = s_raw.describe(percentiles=[], include=np.float64)
    pd.testing.assert_series_equal(result, expected)

    # Previously this section only repeated the Series call above, so the
    # single-chunk DataFrame path was never run with `include`.
    result = fetch(df.describe(percentiles=[], include=np.float64))
    expected = df_raw.describe(percentiles=[], include=np.float64)
    pd.testing.assert_frame_equal(result, expected)

    # test multi chunks
    df = from_pandas_df(df_raw, chunk_size=3)

    result = fetch(df.describe())
    expected = df_raw.describe()
    pd.testing.assert_frame_equal(result, expected)

    result = fetch(df.describe(percentiles=[], include=np.float64))
    expected = df_raw.describe(percentiles=[], include=np.float64)
    pd.testing.assert_frame_equal(result, expected)

    with self.assertRaises(ValueError):
        df.describe(percentiles=[1.1])
def testFetchDataFrame(self, *_):
    # Checks on a local cluster that session.fetch() returns the same data as
    # the preceding session.run() for chained additions and a final sum.
    from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
    from mars.dataframe.arithmetic import add

    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True) as cluster:
        session = cluster.session

        def run_then_fetch(tileable, **run_kwargs):
            ran = session.run(tileable, timeout=_exec_timeout, **run_kwargs)
            return ran, session.fetch(tileable)

        raw_a = pd.DataFrame(np.random.rand(10, 10))
        mdf_a = from_pandas_df(raw_a, chunk_size=5)
        raw_b = pd.DataFrame(np.random.rand(10, 10))
        mdf_b = from_pandas_df(raw_b, chunk_size=6)

        first_sum = add(mdf_a, mdf_b)
        ran, fetched = run_then_fetch(first_sum, compose=False)
        pd.testing.assert_frame_equal(ran, fetched)

        raw_c = pd.DataFrame(np.random.rand(10, 10))
        mdf_c = from_pandas_df(raw_c, chunk_size=6)

        second_sum = add(first_sum, mdf_c)
        ran, fetched = run_then_fetch(second_sum, compose=False)
        pd.testing.assert_frame_equal(ran, fetched)

        column_totals = second_sum.sum()
        ran, fetched = run_then_fetch(column_totals)
        pd.testing.assert_series_equal(ran, fetched)
def testDataFrameAggregate(self):
    # Compares executed DataFrame.agg() with pandas for single functions,
    # lists of functions and a per-column dict, on an unchunked frame and on
    # one built with chunk_size=3.
    all_aggs = ['sum', 'prod', 'min', 'max', 'count', 'size',
                'mean', 'var', 'std', 'sem', 'skew', 'kurt']
    raw = pd.DataFrame(np.random.rand(20, 20))

    def fetch(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    def assert_frame_agg(mdf, *args, **kwargs):
        pd.testing.assert_frame_equal(fetch(mdf.agg(*args, **kwargs)),
                                      raw.agg(*args, **kwargs))

    def check_size_and_single_functions(mdf):
        # 'size' is compared via the first element of a non-concat execution
        self.assertEqual(self.executor.execute_dataframe(mdf.agg('size'))[0],
                         raw.agg('size'))
        for func in all_aggs:
            if func == 'size':
                continue
            for kwargs in ({}, {'axis': 1}):
                pd.testing.assert_series_equal(fetch(mdf.agg(func, **kwargs)),
                                               raw.agg(func, **kwargs))

    mdf = from_pandas_df(raw)
    assert_frame_agg(mdf, all_aggs)
    check_size_and_single_functions(mdf)

    mdf = from_pandas_df(raw, chunk_size=3)

    # will redirect to transform
    assert_frame_agg(mdf, ['cumsum', 'cummax'])
    check_size_and_single_functions(mdf)

    assert_frame_agg(mdf, ['sum'])
    assert_frame_agg(mdf, all_aggs)
    assert_frame_agg(mdf, all_aggs, axis=1)

    per_column = {0: ['sum', 'min', 'var'], 9: ['mean', 'var', 'std']}
    pd.testing.assert_frame_equal(fetch(mdf.agg(per_column)),
                                  raw.agg({0: ['sum', 'min', 'var'],
                                           9: ['mean', 'var', 'std']}))
def testDataFrameGraphSerialize(self):
    # Round-trips a graph holding a DataFrame through protobuf and JSON and
    # compares the restored nodes with the originals, first for an untiled
    # DataFrame and then for a tiled one.
    def round_trips(source_graph):
        # lazily yields the protobuf copy first, then the JSON copy
        yield DAG.from_pb(source_graph.to_pb())
        yield DAG.from_json(source_graph.to_json())

    mdf = from_pandas_df(pd.DataFrame(np.random.rand(10, 10),
                                      columns=pd.timedelta_range(start='1 day', periods=10),
                                      index=pd.date_range('2020-1-1', periods=10)))
    graph = mdf.build_graph(tiled=False)

    for restored in round_trips(graph):
        self.assertEqual(len(graph), len(restored))
        before = next(iter(graph))
        after = next(iter(restored))
        # NOTE(review): assertTrue takes its second argument as the failure
        # message, so this only checks truthiness and not the type; kept
        # as-is pending confirmation of the intended assertion.
        self.assertTrue(after.op.outputs[0], ReferenceType)  # make sure outputs are all weak reference
        self.assertBaseEqual(before.op, after.op)
        self.assertEqual(before.shape, after.shape)
        self.assertEqual(sorted(i.key for i in before.inputs),
                         sorted(i.key for i in after.inputs))
        pd.testing.assert_index_equal(after.index_value.to_pandas(),
                                      before.index_value.to_pandas())
        pd.testing.assert_index_equal(after.columns_value.to_pandas(),
                                      before.columns_value.to_pandas())

    # test graph with tiled DataFrame
    tiled = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)), chunk_size=(5, 4)).tiles()
    graph = DAG()
    graph.add_node(tiled)

    for restored in round_trips(graph):
        self.assertEqual(len(graph), len(restored))
        chunks = next(iter(restored)).chunks
        self.assertEqual(len(chunks), 6)
        self.assertIsInstance(chunks[0], DataFrameChunk)
        self.assertEqual(chunks[0].index, tiled.chunks[0].index)
        self.assertBaseEqual(chunks[0].op, tiled.chunks[0].op)
        pd.testing.assert_index_equal(chunks[0].index_value.to_pandas(),
                                      tiled.chunks[0].index_value.to_pandas())
        pd.testing.assert_index_equal(chunks[0].columns_value.to_pandas(),
                                      tiled.chunks[0].columns_value.to_pandas())
def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
    """Check metadata and tiled chunk structure of the reduction ``func_name``.

    Covers the default axis and ``axis='columns'``, and asserts that passing
    ``level`` raises ``NotImplementedError``.
    """
    def reduce_frame(frame, chunk_size, **kwargs):
        method = getattr(from_pandas_df(frame, chunk_size=chunk_size), func_name)
        return method(**kwargs)

    def check_first_chunk(tiled):
        head = tiled.chunks[0]
        assert isinstance(head.op, DataFrameAggregate)
        assert isinstance(head.inputs[0].op, DataFrameConcat)
        assert len(head.inputs[0].inputs) == 2

    # two columns, string index
    raw = pd.DataFrame({
        'a': list(range(20)),
        'b': list(range(20, 0, -1))
    }, index=[str(i) for i in range(20)])
    reduced = reduce_frame(raw, 3)
    assert isinstance(reduced, Series)
    assert isinstance(reduced.op, op)
    assert isinstance(reduced.index_value._index_value, IndexValue.Index)
    assert reduced.shape == (2, )

    reduced = tile(reduced)
    assert len(reduced.chunks) == 1
    check_first_chunk(reduced)

    # ten columns, default index
    raw = pd.DataFrame(np.random.rand(20, 10))
    reduced = reduce_frame(raw, 3)
    assert isinstance(reduced, Series)
    assert isinstance(reduced.index_value._index_value,
                      (IndexValue.RangeIndex, IndexValue.Int64Index))
    assert reduced.shape == (10, )

    reduced = tile(reduced)
    assert len(reduced.chunks) == 4
    assert reduced.nsplits == ((3, 3, 3, 1), )
    check_first_chunk(reduced)

    # reduction along columns
    raw = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])
    reduced = reduce_frame(raw, 4, axis='columns')
    assert reduced.shape == (20, )

    reduced = tile(reduced)
    assert len(reduced.chunks) == 5
    assert reduced.nsplits == ((4, ) * 5, )
    check_first_chunk(reduced)

    with pytest.raises(NotImplementedError):
        reduce_frame(raw, 3, level=0, axis=1)
def testDropNA(self):
    # Checks the tiled structure of dropna() on mostly-NaN DataFrame and
    # Series inputs, and that axis=1 raises NotImplementedError.

    # dataframe cases
    df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ'))
    for _ in range(30):
        # the value is drawn before the position, matching assignment order
        value = random.randint(0, 99)
        df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = value
    # fill a few whole rows with values
    for _ in range(random.randint(1, 5)):
        row = random.randint(0, 19)
        for col in range(0, 10):
            df_raw.iloc[row, col] = random.randint(0, 99)

    # not supporting drop with axis=1
    with self.assertRaises(NotImplementedError):
        from_pandas_df(df_raw).dropna(axis=1)

    # only one chunk in columns, can run dropna directly
    dropped = from_pandas_df(df_raw, chunk_size=(4, 10)).dropna().tiles()
    self.assertEqual(dropped.shape, (np.nan, 10))
    self.assertEqual(dropped.nsplits, ((np.nan, ) * 5, (10, )))
    for chunk in dropped.chunks:
        self.assertIsInstance(chunk.op, type(dropped.op))
        self.assertEqual(len(chunk.inputs), 1)
        self.assertEqual(len(chunk.inputs[0].inputs), 0)
        self.assertEqual(chunk.shape, (np.nan, 10))

    # multiple chunks in columns, count() will be called first
    dropped = from_pandas_df(df_raw, chunk_size=4).dropna().tiles()
    self.assertEqual(dropped.shape, (np.nan, 10))
    self.assertEqual(dropped.nsplits, ((np.nan, ) * 5, (4, 4, 2)))
    for chunk in dropped.chunks:
        self.assertIsInstance(chunk.op, type(dropped.op))
        self.assertEqual(len(chunk.inputs), 2)
        self.assertEqual(len(chunk.inputs[0].inputs), 0)
        self.assertEqual(chunk.inputs[1].op.stage, OperandStage.agg)
        self.assertTrue(np.isnan(chunk.shape[0]))

    # series cases
    series_raw = pd.Series(np.nan, index=range(20))
    for _ in range(10):
        value = random.randint(0, 99)
        series_raw.iloc[random.randint(0, 19)] = value

    dropped = from_pandas_series(series_raw, chunk_size=4).dropna().tiles()
    self.assertEqual(dropped.shape, (np.nan, ))
    self.assertEqual(dropped.nsplits, ((np.nan, ) * 5, ))
    for chunk in dropped.chunks:
        self.assertIsInstance(chunk.op, type(dropped.op))
        self.assertEqual(len(chunk.inputs), 1)
        self.assertEqual(len(chunk.inputs[0].inputs), 0)
        self.assertEqual(chunk.shape, (np.nan, ))
def testDataFrameSum(self):
    # Checks metadata and tiled chunk structure of DataFrame.sum() for the
    # default axis and for axis='columns'.
    def check_first_chunk(tileable):
        head = tileable.chunks[0]
        self.assertIsInstance(head.op, DataFrameSum)
        self.assertIsInstance(head.inputs[0].op, DataFrameConcat)
        self.assertEqual(len(head.inputs[0].inputs), 2)

    raw = pd.DataFrame({
        'a': list(range(20)),
        'b': list(range(20, 0, -1))
    }, index=[str(i) for i in range(20)])
    total = from_pandas_df(raw, chunk_size=3).sum()
    self.assertIsInstance(total, Series)
    self.assertIsInstance(total.index_value._index_value, IndexValue.Index)
    self.assertEqual(total.shape, (2, ))

    # the return value of tiles() is discarded; the assertions below read the
    # chunks from the same object
    total.tiles()
    self.assertEqual(len(total.chunks), 1)
    check_first_chunk(total)

    raw = pd.DataFrame(np.random.rand(20, 10))
    total = from_pandas_df(raw, chunk_size=3).sum()
    self.assertIsInstance(total, Series)
    self.assertIsInstance(total.index_value._index_value, IndexValue.RangeIndex)
    self.assertEqual(total.shape, (10, ))

    total.tiles()
    self.assertEqual(len(total.chunks), 4)
    self.assertEqual(total.nsplits, ((3, 3, 3, 1), ))
    check_first_chunk(total)

    raw = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])
    total = from_pandas_df(raw, chunk_size=4).sum(axis='columns')
    self.assertEqual(total.shape, (20, ))

    total.tiles()
    self.assertEqual(len(total.chunks), 5)
    self.assertEqual(total.nsplits, ((np.nan, ) * 5, ))
    check_first_chunk(total)
def test_drop_duplicates():
    """Check argument validation and automatic method selection of drop_duplicates."""
    def chosen_method(tileable):
        return tile(tileable).chunks[0].op.method

    rs = np.random.RandomState(0)
    raw = pd.DataFrame(rs.randint(1000, size=(20, 7)),
                       columns=['c' + str(i + 1) for i in range(7)])
    raw['c7'] = [f's{j}' for j in range(20)]

    df = from_pandas_df(raw, chunk_size=10)
    with pytest.raises(ValueError):
        df.drop_duplicates(method='unknown')
    with pytest.raises(KeyError):
        df.drop_duplicates(subset='c8')

    # test auto method selection
    assert chosen_method(df.drop_duplicates()) == 'tree'
    # subset size less than chunk_store_limit
    assert chosen_method(df.drop_duplicates(subset=['c1', 'c3'])) == 'subset_tree'
    with option_context({'chunk_store_limit': 5}):
        # subset size greater than chunk_store_limit
        assert chosen_method(df.drop_duplicates(subset=['c1', 'c3'])) == 'tree'
    assert chosen_method(df.drop_duplicates(subset=['c1', 'c7'])) == 'tree'
    assert chosen_method(df['c7'].drop_duplicates()) == 'tree'

    column = df['c7']
    with pytest.raises(ValueError):
        column.drop_duplicates(method='unknown')
def testToCPU(self):
    # Checks that to_cpu(to_gpu(df)) keeps index, columns and dtypes, is not
    # flagged as gpu (before and after tiling), and that to_cpu on the tiled
    # result returns the same object.
    def assert_same_meta(source, target):
        self.assertEqual(source.index_value, target.index_value)
        self.assertEqual(source.columns_value, target.columns_value)
        self.assertFalse(target.op.gpu)
        pd.testing.assert_series_equal(source.dtypes, target.dtypes)

    raw = pd.DataFrame(np.random.rand(10, 10),
                       index=np.random.randint(-100, 100, size=(10, )),
                       columns=[np.random.bytes(10) for _ in range(10)])
    origin = from_pandas_df(raw)
    on_gpu = to_gpu(origin)
    back = to_cpu(on_gpu)

    assert_same_meta(origin, back)

    back = back.tiles()
    origin = get_tiled(origin)

    self.assertEqual(origin.nsplits, back.nsplits)
    assert_same_meta(origin.chunks[0], back.chunks[0])

    self.assertIs(back, to_cpu(back))
def testChunkSerialize(self):
    # Serializes the first chunk of a tiled DataFrame to protobuf and to JSON
    # and checks that the deserialized chunk matches the original.
    def assert_restored(original, restored):
        self.assertEqual(original.index, restored.index)
        self.assertEqual(original.key, restored.key)
        self.assertEqual(original.shape, restored.shape)
        pd.testing.assert_index_equal(restored.index_value.to_pandas(),
                                      original.index_value.to_pandas())
        pd.testing.assert_index_equal(restored.columns_value.to_pandas(),
                                      original.columns_value.to_pandas())

    raw = pd.DataFrame(np.random.rand(10, 10),
                       index=np.random.randint(-100, 100, size=(10,)),
                       columns=[np.random.bytes(10) for _ in range(10)])
    tiled = from_pandas_df(raw).tiles()

    # pb
    chunk = tiled.chunks[0]
    serials = self._pb_serial(chunk)
    op, pb = serials[chunk.op, chunk.data]

    self.assertEqual(tuple(pb.index), chunk.index)
    self.assertEqual(pb.key, chunk.key)
    self.assertEqual(tuple(pb.shape), chunk.shape)
    # the operand type string carries the numeric code after the first dot
    type_code = int(op.type.split('.', 1)[1])
    self.assertEqual(type_code, OperandDef.DATAFRAME_DATA_SOURCE)

    assert_restored(chunk, self._pb_deserial(serials)[chunk.data])

    # json
    chunk = tiled.chunks[0]
    serials = self._json_serial(chunk)
    assert_restored(chunk, self._json_deserial(serials)[chunk.data])
def testDropDuplicates(self):
    # Checks argument validation of drop_duplicates and which method is
    # selected automatically for different subsets.
    def chosen_method(tileable):
        return tileable.tiles().chunks[0].op.method

    rs = np.random.RandomState(0)
    raw = pd.DataFrame(rs.randint(1000, size=(20, 7)),
                       columns=['c' + str(i + 1) for i in range(7)])
    raw['c7'] = ['s{}'.format(j) for j in range(20)]

    df = from_pandas_df(raw, chunk_size=10)
    with self.assertRaises(ValueError):
        df.drop_duplicates(method='unknown')
    with self.assertRaises(KeyError):
        df.drop_duplicates(subset='c8')

    # test auto method selection
    self.assertEqual(chosen_method(df.drop_duplicates()), 'tree')
    # subset size less than chunk_store_limit
    self.assertEqual(chosen_method(df.drop_duplicates(subset=['c1', 'c3'])), 'subset_tree')
    with option_context({'chunk_store_limit': 5}):
        # subset size greater than chunk_store_limit
        self.assertEqual(chosen_method(df.drop_duplicates(subset=['c1', 'c3'])), 'tree')
    self.assertEqual(chosen_method(df.drop_duplicates(subset=['c1', 'c7'])), 'tree')
    self.assertEqual(chosen_method(df['c7'].drop_duplicates()), 'tree')

    column = df['c7']
    with self.assertRaises(ValueError):
        column.drop_duplicates(method='unknown')
def testCheckNA(self):
    # Compares executed isna()/notna() with pandas on a mostly-NaN DataFrame
    # and Series.
    def fetch(tileable):
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ'))
    for _ in range(20):
        # the value is drawn before the position, matching assignment order
        value = random.randint(0, 99)
        df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = value

    mdf = from_pandas_df(df_raw, chunk_size=4)
    pd.testing.assert_frame_equal(fetch(mdf.isna()), df_raw.isna())
    pd.testing.assert_frame_equal(fetch(mdf.notna()), df_raw.notna())

    series_raw = pd.Series(np.nan, index=range(20))
    for _ in range(3):
        value = random.randint(0, 99)
        series_raw.iloc[random.randint(0, 19)] = value

    ms = from_pandas_series(series_raw, chunk_size=4)
    pd.testing.assert_series_equal(fetch(ms.isna()), series_raw.isna())
    pd.testing.assert_series_equal(fetch(ms.notna()), series_raw.notna())
def testUseArrowDtypeNUnique(self):
    # Compares executed nunique() with pandas along both axes while the
    # 'dataframe.use_arrow_dtype' option is on and 'combine_size' is 2.
    options = {
        'dataframe.use_arrow_dtype': True,
        'combine_size': 2
    }
    with option_context(options):
        rs = np.random.RandomState(0)
        raw = pd.DataFrame({
            'a': rs.random(10),
            'b': [f's{i}' for i in rs.randint(100, size=10)]
        })
        # three more string columns copied from 'b'
        for name in ('c', 'd', 'e'):
            raw[name] = raw['b'].copy()

        mdf = from_pandas_df(raw, chunk_size=(3, 2))
        for axis in (0, 1):
            lazy = mdf.nunique(axis=axis)
            result = self.executor.execute_dataframe(lazy, concat=True)[0]
            expected = raw.nunique(axis=axis)
            pd.testing.assert_series_equal(result, expected)
def testDrop(self):
    # Checks the result type of drop() on DataFrame, Series and Index, and
    # the errors raised for a missing column and for a Mars series argument.
    rs = np.random.RandomState(0)
    labels_to_drop = [3, 6, 7]

    # test dataframe drop
    frame_raw = pd.DataFrame(rs.randint(1000, size=(20, 8)),
                             columns=['c' + str(i + 1) for i in range(8)])
    mdf = from_pandas_df(frame_raw, chunk_size=3)

    with self.assertRaises(KeyError):
        mdf.drop(columns=['c9'])
    with self.assertRaises(NotImplementedError):
        mdf.drop(columns=from_pandas_series(pd.Series(['c9'])))

    dropped = mdf.drop(columns=['c2', 'c4', 'c5', 'c6'], index=labels_to_drop)
    self.assertIsInstance(dropped, DATAFRAME_TYPE)

    # test series drop
    series_raw = pd.Series(rs.randint(1000, size=(20, )))
    ms = from_pandas_series(series_raw, chunk_size=3)
    dropped = ms.drop(index=labels_to_drop)
    self.assertIsInstance(dropped, SERIES_TYPE)

    # test index drop
    shuffled = pd.Series(range(20))
    rs.shuffle(shuffled)
    index_raw = pd.Index(shuffled)
    midx = from_pandas_index(index_raw)
    dropped = midx.drop(labels_to_drop)
    self.assertIsInstance(dropped, INDEX_TYPE)
def test_eval_query():
    """Check that invalid ``eval``/``query`` calls raise the expected errors.

    Builds a three-column frame (one column name contains a space) and
    asserts the exception type raised by each malformed or unsupported call.
    """
    rs = np.random.RandomState(0)
    raw = pd.DataFrame({
        'a': rs.rand(100),
        'b': rs.rand(100),
        'c c': rs.rand(100)
    })
    df = from_pandas_df(raw, chunk_size=(10, 2))

    # The expression strings below refer to the local variable ``df`` by
    # name, so that name must stay as it is.
    with pytest.raises(NotImplementedError):
        mars_eval('df.a * 2', engine='numexpr')
    with pytest.raises(NotImplementedError):
        mars_eval('df.a * 2', parser='pandas')
    # A non-string expression is expected to be rejected.
    with pytest.raises(TypeError):
        df.eval(df)
    # NOTE(review): the two triple-quoted literals below may originally have
    # spanned several lines; confirm the intended whitespace.
    with pytest.raises(SyntaxError):
        df.query(""" a + b a + `c c` """)
    with pytest.raises(SyntaxError):
        df.eval(""" def a(): return v a() """)
    # Unterminated backtick quoting.
    with pytest.raises(SyntaxError):
        df.eval("a + `c")
    # 'c' is not a column of the frame (the columns are 'a', 'b' and 'c c').
    with pytest.raises(KeyError):
        df.eval("a + c")
    with pytest.raises(ValueError):
        df.eval("p, q = a + c")
    with pytest.raises(ValueError):
        df.query("p = a + c")
def testFromPandasDataFrameExecution(self):
    """Round-trip a pandas DataFrame through chunked execution unchanged.

    The row index is built from two arrays (ascending and descending).
    """
    values = np.random.rand(20, 30)
    row_levels = [np.arange(20), np.arange(20, 0, -1)]
    pdf = pd.DataFrame(values, index=row_levels)

    df = from_pandas_df(pdf, chunk_size=(13, 21))

    executed = self.executor.execute_dataframe(df, concat=True)
    result = executed[0]
    pd.testing.assert_frame_equal(pdf, result)
def testMemoryUsage(self):
    """Check type, shape, chunk count and stage of tiled ``memory_usage``."""

    def check(tiled, expected_type, expected_shape, chunk_count, stage):
        # Shared assertions on a tiled memory_usage result; ``stage`` is the
        # expected stage of the first chunk's operand (None for no stage).
        self.assertIsInstance(tiled, expected_type)
        self.assertEqual(tiled.shape, expected_shape)
        self.assertEqual(len(tiled.chunks), chunk_count)
        first_stage = tiled.chunks[0].op.stage
        if stage is None:
            self.assertIsNone(first_stage)
        else:
            self.assertEqual(first_stage, stage)

    dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
    data = {t: np.ones(shape=500).astype(t) for t in dtypes}
    raw = pd.DataFrame(data)

    # A single row chunk (500 rows per chunk for 500 rows).
    df = from_pandas_df(raw, chunk_size=(500, 2))
    check(df.memory_usage().tiles(), SERIES_TYPE, (6,), 3, None)

    # Several row chunks (100 rows per chunk).
    df = from_pandas_df(raw, chunk_size=(100, 3))
    check(df.memory_usage(index=True).tiles(),
          SERIES_TYPE, (6,), 2, OperandStage.reduce)
    check(df.memory_usage(index=False).tiles(),
          SERIES_TYPE, (5,), 2, OperandStage.reduce)

    raw = pd.Series(np.ones(shape=500).astype('object'), name='s')

    # Series with the default chunking.
    series = from_pandas_series(raw)
    check(series.memory_usage().tiles(), TENSOR_TYPE, (), 1, None)

    # Series split into 100-element chunks.
    series = from_pandas_series(raw, chunk_size=100)
    check(series.memory_usage().tiles(),
          TENSOR_TYPE, (), 1, OperandStage.reduce)
def testRebalance(self):
    """Check that a tiled ``rebalance`` result has the same op type as its input."""
    values = np.random.rand(10, 3)
    raw = pd.DataFrame(values, columns=list('abc'))
    df = from_pandas_df(raw)

    rebalanced = df.rebalance().tiles()

    expected_op_type = type(df.op)
    self.assertIsInstance(rebalanced.op, expected_op_type)
def test_drop():
    """Check argument validation, index metadata and result types of ``drop``.

    Exercises ``drop`` on a DataFrame, a Series and an Index built from
    seeded random data.
    """
    # test dataframe drop
    rs = np.random.RandomState(0)
    raw = pd.DataFrame(rs.randint(1000, size=(20, 8)),
                       columns=['c' + str(i + 1) for i in range(8)])

    df = from_pandas_df(raw, chunk_size=8)

    # 'c9' is not among the columns c1..c8.
    with pytest.raises(KeyError):
        df.drop(columns=['c9'])
    # Passing a Mars series as the labels is expected to be rejected.
    with pytest.raises(NotImplementedError):
        df.drop(columns=from_pandas_series(pd.Series(['c9'])))

    # Dropping a column must leave the row index metadata untouched.
    r = df.drop(columns=['c1'])
    pd.testing.assert_index_equal(r.index_value.to_pandas(), raw.index)

    # Walk the chunks in order and compare each chunk's index metadata with
    # the matching slice of the raw index.
    tiled = tile(r)
    start = 0
    for c in tiled.chunks:
        raw_index = raw.index[start:start + c.shape[0]]
        start += c.shape[0]
        pd.testing.assert_index_equal(raw_index, c.index_value.to_pandas())

    df = from_pandas_df(raw, chunk_size=3)

    columns = ['c2', 'c4', 'c5', 'c6']
    index = [3, 6, 7]
    r = df.drop(columns=columns, index=index)
    assert isinstance(r, DATAFRAME_TYPE)

    # test series drop
    raw = pd.Series(rs.randint(1000, size=(20, )))
    series = from_pandas_series(raw, chunk_size=3)

    r = series.drop(index=index)
    assert isinstance(r, SERIES_TYPE)

    # test index drop
    # Shuffle a plain ndarray instead of a pandas Series: RandomState.shuffle
    # is documented for ndarrays / mutable sequences and warns on other
    # types.
    shuffled = np.arange(20, dtype=np.int64)
    rs.shuffle(shuffled)
    raw = pd.Index(shuffled)

    idx = from_pandas_index(raw)

    r = idx.drop(index)
    assert isinstance(r, INDEX_TYPE)
def testGPUExecution(self):
    """Compare a column sum computed after ``to_gpu`` with the pandas sum."""
    values = np.random.rand(30, 3)
    pdf = pd.DataFrame(values, columns=list('abc'))
    df = from_pandas_df(pdf, chunk_size=6)

    gpu_sum = to_gpu(df).sum()
    executed = self.executor.execute_dataframe(gpu_sum, concat=True)
    res = executed[0]

    expected = pdf.sum()
    # Convert the executed result with ``to_pandas`` before comparing.
    pd.testing.assert_series_equal(res.to_pandas(), expected)
def test_memory_usage():
    """Check type, shape, chunk count and stage of tiled ``memory_usage``."""

    def check(tiled, expected_type, expected_shape, chunk_count, stage):
        # Shared assertions on a tiled memory_usage result; ``stage`` is the
        # expected stage of the first chunk's operand (None for no stage).
        assert isinstance(tiled, expected_type)
        assert tiled.shape == expected_shape
        assert len(tiled.chunks) == chunk_count
        first_stage = tiled.chunks[0].op.stage
        if stage is None:
            assert first_stage is None
        else:
            assert first_stage == stage

    dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
    data = {t: np.ones(shape=500).astype(t) for t in dtypes}
    raw = pd.DataFrame(data)

    # A single row chunk (500 rows per chunk for 500 rows).
    df = from_pandas_df(raw, chunk_size=(500, 2))
    check(tile(df.memory_usage()), SERIES_TYPE, (6, ), 3, None)

    # Several row chunks (100 rows per chunk).
    df = from_pandas_df(raw, chunk_size=(100, 3))
    check(tile(df.memory_usage(index=True)),
          SERIES_TYPE, (6, ), 2, OperandStage.reduce)
    check(tile(df.memory_usage(index=False)),
          SERIES_TYPE, (5, ), 2, OperandStage.reduce)

    raw = pd.Series(np.ones(shape=500).astype('object'), name='s')

    # Series with the default chunking.
    series = from_pandas_series(raw)
    check(tile(series.memory_usage()), TENSOR_TYPE, (), 1, None)

    # Series split into 100-element chunks.
    series = from_pandas_series(raw, chunk_size=100)
    check(tile(series.memory_usage()),
          TENSOR_TYPE, (), 1, OperandStage.reduce)