Beispiel #1
0
    def testNunique(self):
        # nunique() on a 20x10 frame stored with chunk_size=3 must produce a
        # Series, and tiling it must yield one aggregate-stage chunk per split
        # of the axis that survives the reduction.
        def check(result, shape, splits):
            # Checks on the untiled result.
            self.assertEqual(result.shape, shape)
            self.assertEqual(result.op.output_types[0], OutputType.series)
            self.assertIsInstance(result.op, DataFrameNunique)

            # Checks on the tiled result.
            tiled = result.tiles()
            self.assertEqual(tiled.shape, shape)
            self.assertEqual(len(tiled.chunks), len(splits))
            self.assertEqual(tiled.nsplits, (splits,))
            head_op = tiled.chunks[0].op
            self.assertEqual(head_op.stage, OperandStage.agg)
            self.assertIsInstance(head_op, DataFrameAggregate)

        data = pd.DataFrame(np.random.randint(0, 6, size=(20, 10)),
                            columns=['c' + str(i) for i in range(10)])

        # default axis: one value per column (10 columns -> splits 3, 3, 3, 1)
        check(from_pandas_df(data, chunk_size=3).nunique(),
              (10,), (3, 3, 3, 1,))

        # axis=1: one value per row (20 rows -> six splits of 3 and one of 2)
        check(from_pandas_df(data.copy(), chunk_size=3).nunique(axis=1),
              (20,), (3, 3, 3, 3, 3, 3, 2,))
Beispiel #2
0
    def testRechunkExecution(self):
        # Executing a rechunked DataFrame / Series / Index must give back
        # exactly the pandas object it was created from.
        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # DataFrame with default labels, rechunked to a (3, 4) layout
        frame = pd.DataFrame(np.random.rand(8, 10))
        rechunked = from_pandas_df(pd.DataFrame(frame), chunk_size=3).rechunk((3, 4))
        pd.testing.assert_frame_equal(frame, run(rechunked))

        # DataFrame with random integer index and random bytes column labels
        frame = pd.DataFrame(np.random.rand(10, 10),
                             index=np.random.randint(-100, 100, size=(10,)),
                             columns=[np.random.bytes(10) for _ in range(10)])
        rechunked = from_pandas_df(frame).rechunk(5)
        pd.testing.assert_frame_equal(frame, run(rechunked))

        # test Series rechunk execution.
        raw_series = pd.Series(np.random.rand(10,))
        series = from_pandas_series(raw_series)
        for size in (3, 1):
            pd.testing.assert_series_equal(raw_series, run(series.rechunk(size)))

        # test index rechunk execution
        raw_index = pd.Index(np.random.rand(10,))
        index = from_pandas_index(raw_index)
        for size in (3, 1):
            pd.testing.assert_index_equal(raw_index, run(index.rechunk(size)))
Beispiel #3
0
    def testDataFrameReduction(self):
        # Checks the graph built for the reduction named by self.func_name:
        # the untiled result is a Series, and every tiled chunk is a
        # DataFrameAggregate fed by a DataFrameConcat of two inputs.
        def apply_reduction(raw, chunk_size, **kwargs):
            mars_df = from_pandas_df(raw, chunk_size=chunk_size)
            return getattr(mars_df, self.func_name)(**kwargs)

        def check_first_chunk(tiled):
            head = tiled.chunks[0]
            self.assertIsInstance(head.op, DataFrameAggregate)
            self.assertIsInstance(head.inputs[0].op, DataFrameConcat)
            self.assertEqual(len(head.inputs[0].inputs), 2)

        # two columns, string index
        raw = pd.DataFrame({
            'a': list(range(20)),
            'b': list(range(20, 0, -1))
        }, index=[str(i) for i in range(20)])
        reduced = apply_reduction(raw, 3)

        self.assertIsInstance(reduced, Series)
        self.assertIsInstance(reduced.op, self.op)
        self.assertIsInstance(reduced.index_value._index_value,
                              IndexValue.Index)
        self.assertEqual(reduced.shape, (2, ))

        reduced = reduced.tiles()

        self.assertEqual(len(reduced.chunks), 1)
        check_first_chunk(reduced)

        # ten columns, default labels
        raw = pd.DataFrame(np.random.rand(20, 10))
        reduced = apply_reduction(raw, 3)

        self.assertIsInstance(reduced, Series)
        self.assertIsInstance(reduced.index_value._index_value,
                              IndexValue.RangeIndex)
        self.assertEqual(reduced.shape, (10, ))

        reduced = reduced.tiles()

        self.assertEqual(len(reduced.chunks), 4)
        self.assertEqual(reduced.nsplits, ((3, 3, 3, 1), ))
        check_first_chunk(reduced)

        # reduce along columns
        raw = pd.DataFrame(np.random.rand(20, 20),
                           index=[str(i) for i in range(20)])
        reduced = apply_reduction(raw, 4, axis='columns')

        self.assertEqual(reduced.shape, (20, ))

        reduced = reduced.tiles()

        self.assertEqual(len(reduced.chunks), 5)
        self.assertEqual(reduced.nsplits, ((4, ) * 5, ))
        check_first_chunk(reduced)

        # level= is expected to be rejected
        with self.assertRaises(NotImplementedError):
            apply_reduction(raw, 3, level=0, axis=1)
Beispiel #4
0
    def testDataFrameCount(self):
        # count() executed through the executor must match pandas for the
        # default chunking, chunk_size=2, and (with numeric_only) chunk_size=3.
        raw = pd.DataFrame({
            "Person": ["John", "Myla", "Lewis", "John", "Myla"],
            "Age": [24., np.nan, 21., 33, 26],
            "Single": [False, True, True, True, False]})

        def check(mars_df, **count_opts):
            result = self.executor.execute_dataframe(
                mars_df.count(**count_opts), concat=True)[0]
            expected = raw.count(**count_opts)
            pd.testing.assert_series_equal(result, expected)

        # plain counts along both axes, for two chunk layouts
        for source_opts in ({}, {'chunk_size': 2}):
            frame = from_pandas_df(raw, **source_opts)
            for count_opts in ({}, {'axis': 'columns'}):
                check(frame, **count_opts)

        # numeric_only counts along both axes
        numeric_frame = from_pandas_df(raw, chunk_size=3)
        check(numeric_frame, numeric_only=True)
        check(numeric_frame, axis='columns', numeric_only=True)
Beispiel #5
0
    def testDataFrameShuffle(self, *_):
        # Runs merge() on a local cluster, first through the cluster session
        # and then through a web session, and compares each result with
        # pandas after sorting both frames.
        from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.merge.merge import merge
        from mars.dataframe.utils import sort_dataframe_inplace

        with new_cluster(scheduler_n_process=2, worker_n_process=2,
                         shared_memory='20M', web=True) as cluster:
            session = cluster.session

            left_raw = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1,
                                    columns=['a', 'b', 'c', 'd', 'e'])
            right_raw = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1,
                                     columns=['a', 'b', 'x', 'y'])

            left = from_pandas_df(left_raw, chunk_size=2)
            right = from_pandas_df(right_raw, chunk_size=2)

            def check_merge(sess, **merge_opts):
                expected = left_raw.merge(right_raw, **merge_opts)
                actual = sess.run(merge(left, right, **merge_opts),
                                  timeout=_exec_timeout)
                pd.testing.assert_frame_equal(
                    sort_dataframe_inplace(expected, 0),
                    sort_dataframe_inplace(actual, 0))

            def check_all(sess):
                check_merge(sess)
                check_merge(sess, how='inner', on=['a', 'b'])

            check_all(session)

            # same checks through the HTTP endpoint
            web_session = new_session('http://' + cluster._web_endpoint)
            check_all(web_session)
Beispiel #6
0
    def testResetIndex(self):
        # reset_index on DataFrame and Series: checks shape and dtypes of the
        # untiled result, then the shape and RangeIndex of every tiled chunk.
        raw = pd.DataFrame([('bird', 389.0), ('bird', 24.0), ('mammal', 80.5),
                            ('mammal', np.nan)],
                           index=['falcon', 'parrot', 'lion', 'monkey'],
                           columns=('class', 'max_speed'))

        # keep the old index as a column, two row chunks
        reset = df_reset_index(from_pandas_df(raw, chunk_size=2))
        expected = raw.reset_index()

        self.assertEqual(reset.shape, (4, 3))
        pd.testing.assert_series_equal(reset.dtypes, expected.dtypes)

        tiled = reset.tiles()

        self.assertEqual(len(tiled.chunks), 2)
        row_ranges = (pd.RangeIndex(2), pd.RangeIndex(2, 4))
        for chunk, row_range in zip(tiled.chunks, row_ranges):
            self.assertEqual(chunk.shape, (2, 3))
            pd.testing.assert_index_equal(chunk.index_value.to_pandas(),
                                          row_range)
            pd.testing.assert_series_equal(chunk.dtypes, expected.dtypes)

        # drop the old index, one cell per chunk
        reset = df_reset_index(from_pandas_df(raw, chunk_size=1), drop=True)
        expected = raw.reset_index(drop=True)

        self.assertEqual(reset.shape, (4, 2))
        pd.testing.assert_series_equal(reset.dtypes, expected.dtypes)

        tiled = reset.tiles()

        self.assertEqual(len(tiled.chunks), 8)

        for chunk in tiled.chunks:
            row = chunk.index[0]
            col = chunk.index[1]
            self.assertEqual(chunk.shape, (1, 1))
            pd.testing.assert_index_equal(chunk.index_value.to_pandas(),
                                          pd.RangeIndex(row, row + 1))
            pd.testing.assert_series_equal(chunk.dtypes,
                                           expected.dtypes[col:col + 1])

        # test Series
        series_raw = pd.Series([1, 2, 3, 4],
                               name='foo',
                               index=pd.Index(['a', 'b', 'c', 'd'],
                                              name='idx'))
        reset = series_reset_index(from_pandas_series(series_raw, chunk_size=2))
        expected = series_raw.reset_index()

        self.assertEqual(reset.shape, (4, 2))
        pd.testing.assert_series_equal(reset.dtypes, expected.dtypes)

        tiled = reset.tiles()
        self.assertEqual(len(tiled.chunks), 2)
        row_ranges = (pd.RangeIndex(2), pd.RangeIndex(2, 4))
        for chunk, row_range in zip(tiled.chunks, row_ranges):
            self.assertEqual(chunk.shape, (2, 2))
            pd.testing.assert_index_equal(chunk.index_value.to_pandas(),
                                          row_range)
Beispiel #7
0
    def testDataFrameReduction(self):
        # Checks the graph of the operation named by self.func_name when its
        # result keeps the input's shape: the first chunk is in the combine
        # stage and the last chunk takes seven inputs ending in a map-stage op.
        def build(raw):
            return getattr(from_pandas_df(raw, chunk_size=3), self.func_name)()

        def check_last_chunk(tiled):
            tail = tiled.chunks[-1]
            self.assertIsInstance(tail.inputs[-1].op, self.op)
            self.assertEqual(tail.inputs[-1].op.stage, OperandStage.map)
            self.assertEqual(len(tail.inputs), 7)

        # 20x2 frame with a string index
        raw = pd.DataFrame({'a': list(range(20)), 'b': list(range(20, 0, -1))},
                           index=[str(i) for i in range(20)])
        result = build(raw)

        self.assertIsInstance(result, DataFrame)
        self.assertIsInstance(result.index_value._index_value, IndexValue.Index)
        self.assertEqual(result.shape, (20, 2))

        result = result.tiles()

        self.assertEqual(len(result.chunks), 7)
        self.assertIsInstance(result.chunks[0].op, self.op)
        self.assertEqual(result.chunks[0].op.stage, OperandStage.combine)
        check_last_chunk(result)

        # 20x10 frame with default labels
        raw = pd.DataFrame(np.random.rand(20, 10))
        result = build(raw)

        self.assertIsInstance(result, DataFrame)
        self.assertIsInstance(result.index_value._index_value, IndexValue.RangeIndex)
        self.assertEqual(result.shape, (20, 10))

        result = result.tiles()

        self.assertEqual(len(result.chunks), 28)
        self.assertEqual(result.nsplits, ((3, 3, 3, 3, 3, 3, 2), (3, 3, 3, 1)))
        self.assertEqual(result.chunks[0].op.stage, OperandStage.combine)
        check_last_chunk(result)
Beispiel #8
0
def test_nunique():
    """Check the untiled and tiled graph of ``DataFrame.nunique``.

    A 20x10 frame stored with ``chunk_size=3`` is reduced along each axis; the
    result must be a Series whose tiled chunks are aggregate-stage
    ``DataFrameAggregate`` operands, one per split of the surviving axis.
    """
    def check(result, shape, splits):
        # untiled result
        assert result.shape == shape
        assert result.op.output_types[0] == OutputType.series
        assert isinstance(result.op, DataFrameNunique)

        # tiled result
        tiled = tile(result)
        assert tiled.shape == shape
        assert len(tiled.chunks) == len(splits)
        assert tiled.nsplits == (splits,)
        head_op = tiled.chunks[0].op
        assert head_op.stage == OperandStage.agg
        assert isinstance(head_op, DataFrameAggregate)

    data = pd.DataFrame(np.random.randint(0, 6, size=(20, 10)),
                        columns=['c' + str(i) for i in range(10)])

    # default axis: one value per column
    check(from_pandas_df(data, chunk_size=3).nunique(),
          (10,), (3, 3, 3, 1,))

    # axis=1: one value per row
    check(from_pandas_df(data.copy(), chunk_size=3).nunique(axis=1),
          (20,), (3, 3, 3, 3, 3, 3, 2,))
Beispiel #9
0
def test_cum_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
    """Check the tiled graph of the DataFrame method called ``func_name``.

    ``op`` is the operand class expected on the chunks. ``func_opts`` is
    accepted but not read by this test body.
    """
    def build(raw):
        return getattr(from_pandas_df(raw, chunk_size=3), func_name)()

    def check_last_chunk(tiled):
        tail = tiled.chunks[-1]
        assert isinstance(tail.inputs[-1].op, op)
        assert tail.inputs[-1].op.stage == OperandStage.map
        assert len(tail.inputs) == 7

    # 20x2 frame with a string index
    raw = pd.DataFrame({'a': list(range(20)), 'b': list(range(20, 0, -1))},
                       index=[str(i) for i in range(20)])
    result = build(raw)

    assert isinstance(result, DataFrame)
    assert isinstance(result.index_value._index_value, IndexValue.Index)
    assert result.shape == (20, 2)

    result = tile(result)

    assert len(result.chunks) == 7
    assert isinstance(result.chunks[0].op, op)
    assert result.chunks[0].op.stage == OperandStage.combine
    check_last_chunk(result)

    # 20x10 frame with default labels
    raw = pd.DataFrame(np.random.rand(20, 10))
    result = build(raw)

    assert isinstance(result, DataFrame)
    assert isinstance(result.index_value._index_value, IndexValue.RangeIndex)
    assert result.shape == (20, 10)

    result = tile(result)

    assert len(result.chunks) == 28
    assert result.nsplits == ((3, 3, 3, 3, 3, 3, 2), (3, 3, 3, 1))
    assert result.chunks[0].op.stage == OperandStage.combine
    check_last_chunk(result)
Beispiel #10
0
    def testDescribeExecution(self):
        # describe() executed through the executor must match pandas for
        # Series and DataFrame inputs, stored as one chunk and as several.
        s_raw = pd.Series(np.random.rand(10))

        # test one chunk
        series = from_pandas_series(s_raw, chunk_size=10)

        r = series.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe()
        pd.testing.assert_series_equal(result, expected)

        r = series.describe(percentiles=[])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe(percentiles=[])
        pd.testing.assert_series_equal(result, expected)

        # test multi chunks
        series = from_pandas_series(s_raw, chunk_size=3)

        r = series.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe()
        pd.testing.assert_series_equal(result, expected)

        r = series.describe(percentiles=[])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe(percentiles=[])
        pd.testing.assert_series_equal(result, expected)

        # Series with an explicit `include`; `series` is still the
        # multi-chunk one created above.
        r = series.describe(percentiles=[], include=np.float64)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe(percentiles=[], include=np.float64)
        pd.testing.assert_series_equal(result, expected)

        df_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('abcd'))
        df_raw['e'] = np.random.randint(100, size=10)

        # test one chunk
        df = from_pandas_df(df_raw, chunk_size=10)

        r = df.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe()
        pd.testing.assert_frame_equal(result, expected)

        # Previously this spot re-tested the Series, so the single-chunk
        # DataFrame path with `percentiles`/`include` was never exercised.
        r = df.describe(percentiles=[], include=np.float64)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe(percentiles=[], include=np.float64)
        pd.testing.assert_frame_equal(result, expected)

        # test multi chunks
        df = from_pandas_df(df_raw, chunk_size=3)

        r = df.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe()
        pd.testing.assert_frame_equal(result, expected)

        r = df.describe(percentiles=[], include=np.float64)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe(percentiles=[], include=np.float64)
        pd.testing.assert_frame_equal(result, expected)

        # a percentile of 1.1 must be rejected when the call is made
        with self.assertRaises(ValueError):
            df.describe(percentiles=[1.1])
Beispiel #11
0
    def testFetchDataFrame(self, *_):
        # On a local cluster, session.fetch() of an already-run tileable must
        # return the same data that session.run() returned.
        from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.arithmetic import add

        with new_cluster(scheduler_n_process=2,
                         worker_n_process=2,
                         shared_memory='20M',
                         web=True) as cluster:
            session = cluster.session

            def run_then_fetch(tileable, **run_opts):
                executed = session.run(tileable, timeout=_exec_timeout,
                                       **run_opts)
                fetched = session.fetch(tileable)
                return executed, fetched

            def random_frame(chunk_size):
                raw = pd.DataFrame(np.random.rand(10, 10))
                return from_pandas_df(raw, chunk_size=chunk_size)

            # sum of two frames with different chunk sizes
            first = random_frame(5)
            second = random_frame(6)
            total = add(first, second)

            executed, fetched = run_then_fetch(total, compose=False)
            pd.testing.assert_frame_equal(executed, fetched)

            # add a third frame on top of the already-run sum
            third = random_frame(6)
            bigger_total = add(total, third)

            executed, fetched = run_then_fetch(bigger_total, compose=False)
            pd.testing.assert_frame_equal(executed, fetched)

            # reduce to a Series
            column_sums = bigger_total.sum()
            executed, fetched = run_then_fetch(column_sums)
            pd.testing.assert_series_equal(executed, fetched)
Beispiel #12
0
    def testDataFrameAggregate(self):
        # DataFrame.agg executed through the executor must match pandas for
        # single functions, lists of functions and a per-column dict, with
        # the default chunking and with chunk_size=3.
        all_aggs = ['sum', 'prod', 'min', 'max', 'count', 'size',
                    'mean', 'var', 'std', 'sem', 'skew', 'kurt']
        raw = pd.DataFrame(np.random.rand(20, 20))

        def run_concat(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def check_size(mars_df):
            # 'size' is compared with assertEqual, without concat=True
            result = mars_df.agg('size')
            self.assertEqual(self.executor.execute_dataframe(result)[0],
                             raw.agg('size'))

        def check_each_function(mars_df):
            # every aggregation except 'size', along both axes
            for func in all_aggs:
                if func == 'size':
                    continue
                for agg_opts in ({}, {'axis': 1}):
                    result = mars_df.agg(func, **agg_opts)
                    pd.testing.assert_series_equal(run_concat(result),
                                                   raw.agg(func, **agg_opts))

        # default chunking
        df = from_pandas_df(raw)
        result = df.agg(all_aggs)
        pd.testing.assert_frame_equal(run_concat(result), raw.agg(all_aggs))

        check_size(df)
        check_each_function(df)

        # several chunks
        df = from_pandas_df(raw, chunk_size=3)

        # will redirect to transform
        result = df.agg(['cumsum', 'cummax'])
        pd.testing.assert_frame_equal(run_concat(result),
                                      raw.agg(['cumsum', 'cummax']))

        check_size(df)
        check_each_function(df)

        result = df.agg(['sum'])
        pd.testing.assert_frame_equal(run_concat(result), raw.agg(['sum']))

        result = df.agg(all_aggs)
        pd.testing.assert_frame_equal(run_concat(result), raw.agg(all_aggs))

        result = df.agg(all_aggs, axis=1)
        pd.testing.assert_frame_equal(run_concat(result),
                                      raw.agg(all_aggs, axis=1))

        # different functions per column
        result = df.agg({0: ['sum', 'min', 'var'], 9: ['mean', 'var', 'std']})
        pd.testing.assert_frame_equal(
            run_concat(result),
            raw.agg({0: ['sum', 'min', 'var'], 9: ['mean', 'var', 'std']}))
Beispiel #13
0
    def testDataFrameGraphSerialize(self):
        # Round-trips a graph holding a DataFrame through to_pb()/from_pb()
        # and to_json()/from_json(), first untiled and then tiled, and
        # compares the first node of the restored graph with the original.
        df = from_pandas_df(pd.DataFrame(np.random.rand(10, 10),
                                         columns=pd.timedelta_range(start='1 day', periods=10),
                                         index=pd.date_range('2020-1-1', periods=10)))
        graph = df.build_graph(tiled=False)

        # protobuf round trip of the untiled graph
        pb = graph.to_pb()
        graph2 = DAG.from_pb(pb)
        self.assertEqual(len(graph), len(graph2))
        t = next(iter(graph))
        t2 = next(iter(graph2))
        # NOTE(review): assertTrue's second argument is the failure message, so
        # this only checks that outputs[0] is truthy, not that it is a
        # ReferenceType. Confirm what op.outputs returns before switching to
        # assertIsInstance.
        self.assertTrue(t2.op.outputs[0], ReferenceType)  # make sure outputs are all weak reference
        self.assertBaseEqual(t.op, t2.op)
        self.assertEqual(t.shape, t2.shape)
        self.assertEqual(sorted(i.key for i in t.inputs), sorted(i.key for i in t2.inputs))
        pd.testing.assert_index_equal(t2.index_value.to_pandas(), t.index_value.to_pandas())
        pd.testing.assert_index_equal(t2.columns_value.to_pandas(), t.columns_value.to_pandas())

        # JSON round trip of the untiled graph
        jsn = graph.to_json()
        graph2 = DAG.from_json(jsn)
        self.assertEqual(len(graph), len(graph2))
        t = next(iter(graph))
        t2 = next(iter(graph2))
        # NOTE(review): same caveat as above, this asserts truthiness only.
        self.assertTrue(t2.op.outputs[0], ReferenceType)  # make sure outputs are all weak reference
        self.assertBaseEqual(t.op, t2.op)
        self.assertEqual(t.shape, t2.shape)
        self.assertEqual(sorted(i.key for i in t.inputs), sorted(i.key for i in t2.inputs))
        pd.testing.assert_index_equal(t2.index_value.to_pandas(), t.index_value.to_pandas())
        pd.testing.assert_index_equal(t2.columns_value.to_pandas(), t.columns_value.to_pandas())

        # test graph with tiled DataFrame
        t2 = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)), chunk_size=(5, 4)).tiles()
        graph = DAG()
        graph.add_node(t2)

        # protobuf round trip; the test expects six chunks for a 10x10 frame
        # stored with chunk_size=(5, 4)
        pb = graph.to_pb()
        graph2 = DAG.from_pb(pb)
        self.assertEqual(len(graph), len(graph2))
        chunks = next(iter(graph2)).chunks
        self.assertEqual(len(chunks), 6)
        self.assertIsInstance(chunks[0], DataFrameChunk)
        self.assertEqual(chunks[0].index, t2.chunks[0].index)
        self.assertBaseEqual(chunks[0].op, t2.chunks[0].op)
        pd.testing.assert_index_equal(chunks[0].index_value.to_pandas(), t2.chunks[0].index_value.to_pandas())
        pd.testing.assert_index_equal(chunks[0].columns_value.to_pandas(), t2.chunks[0].columns_value.to_pandas())

        # JSON round trip of the tiled graph, same checks
        jsn = graph.to_json()
        graph2 = DAG.from_json(jsn)
        self.assertEqual(len(graph), len(graph2))
        chunks = next(iter(graph2)).chunks
        self.assertEqual(len(chunks), 6)
        self.assertIsInstance(chunks[0], DataFrameChunk)
        self.assertEqual(chunks[0].index, t2.chunks[0].index)
        self.assertBaseEqual(chunks[0].op, t2.chunks[0].op)
        pd.testing.assert_index_equal(chunks[0].index_value.to_pandas(), t2.chunks[0].index_value.to_pandas())
        pd.testing.assert_index_equal(chunks[0].columns_value.to_pandas(), t2.chunks[0].columns_value.to_pandas())
Beispiel #14
0
def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
    """Check the tiled graph of the DataFrame reduction called ``func_name``.

    ``op`` is the operand class expected on the untiled result. ``func_opts``
    is accepted but not read by this test body.
    """
    def apply_reduction(raw, chunk_size, **kwargs):
        mars_df = from_pandas_df(raw, chunk_size=chunk_size)
        return getattr(mars_df, func_name)(**kwargs)

    def check_first_chunk(tiled):
        head = tiled.chunks[0]
        assert isinstance(head.op, DataFrameAggregate)
        assert isinstance(head.inputs[0].op, DataFrameConcat)
        assert len(head.inputs[0].inputs) == 2

    # two columns, string index
    raw = pd.DataFrame({
        'a': list(range(20)),
        'b': list(range(20, 0, -1))
    }, index=[str(i) for i in range(20)])
    reduced = apply_reduction(raw, 3)

    assert isinstance(reduced, Series)
    assert isinstance(reduced.op, op)
    assert isinstance(reduced.index_value._index_value, IndexValue.Index)
    assert reduced.shape == (2, )

    reduced = tile(reduced)

    assert len(reduced.chunks) == 1
    check_first_chunk(reduced)

    # ten columns, default labels
    raw = pd.DataFrame(np.random.rand(20, 10))
    reduced = apply_reduction(raw, 3)

    assert isinstance(reduced, Series)
    assert isinstance(reduced.index_value._index_value,
                      (IndexValue.RangeIndex, IndexValue.Int64Index))
    assert reduced.shape == (10, )

    reduced = tile(reduced)

    assert len(reduced.chunks) == 4
    assert reduced.nsplits == ((3, 3, 3, 1), )
    check_first_chunk(reduced)

    # reduce along columns
    raw = pd.DataFrame(np.random.rand(20, 20),
                       index=[str(i) for i in range(20)])
    reduced = apply_reduction(raw, 4, axis='columns')

    assert reduced.shape == (20, )

    reduced = tile(reduced)

    assert len(reduced.chunks) == 5
    assert reduced.nsplits == ((4, ) * 5, )
    check_first_chunk(reduced)

    # level= is expected to be rejected
    with pytest.raises(NotImplementedError):
        apply_reduction(raw, 3, level=0, axis=1)
Beispiel #15
0
    def testDropNA(self):
        # Checks the tiled graph of dropna() on DataFrame and Series inputs.
        # The number of surviving rows is unknown before execution, so the
        # row dimension of every shape / nsplits entry is expected to be NaN.
        #
        # NaN never compares equal to itself, so comparing against a tuple
        # containing np.nan only passes through the identity shortcut of
        # tuple comparison. The helpers below test for NaN explicitly.
        def assert_unknown_rows(shape, known_tail):
            self.assertTrue(np.isnan(shape[0]))
            self.assertEqual(tuple(shape[1:]), known_tail)

        def assert_all_nan(splits, count):
            self.assertEqual(len(splits), count)
            self.assertTrue(all(np.isnan(s) for s in splits))

        # dataframe cases
        df_raw = pd.DataFrame(np.nan,
                              index=range(0, 20),
                              columns=list('ABCDEFGHIJ'))
        # scatter 30 random values over the otherwise all-NaN frame
        for _ in range(30):
            df_raw.iloc[random.randint(0, 19),
                        random.randint(0, 9)] = random.randint(0, 99)
        # fill between one and five randomly chosen rows completely
        for _ in range(random.randint(1, 5)):
            row = random.randint(0, 19)
            for idx in range(0, 10):
                df_raw.iloc[row, idx] = random.randint(0, 99)

        # not supporting drop with axis=1
        with self.assertRaises(NotImplementedError):
            from_pandas_df(df_raw).dropna(axis=1)

        # only one chunk in columns, can run dropna directly
        r = from_pandas_df(df_raw, chunk_size=(4, 10)).dropna().tiles()
        assert_unknown_rows(r.shape, (10, ))
        self.assertEqual(len(r.nsplits), 2)
        assert_all_nan(r.nsplits[0], 5)
        self.assertEqual(r.nsplits[1], (10, ))
        for c in r.chunks:
            self.assertIsInstance(c.op, type(r.op))
            self.assertEqual(len(c.inputs), 1)
            self.assertEqual(len(c.inputs[0].inputs), 0)
            assert_unknown_rows(c.shape, (10, ))

        # multiple chunks in columns, count() will be called first
        r = from_pandas_df(df_raw, chunk_size=4).dropna().tiles()
        assert_unknown_rows(r.shape, (10, ))
        self.assertEqual(len(r.nsplits), 2)
        assert_all_nan(r.nsplits[0], 5)
        self.assertEqual(r.nsplits[1], (4, 4, 2))
        for c in r.chunks:
            self.assertIsInstance(c.op, type(r.op))
            self.assertEqual(len(c.inputs), 2)
            self.assertEqual(len(c.inputs[0].inputs), 0)
            self.assertEqual(c.inputs[1].op.stage, OperandStage.agg)
            self.assertTrue(np.isnan(c.shape[0]))

        # series cases
        series_raw = pd.Series(np.nan, index=range(20))
        for _ in range(10):
            series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)

        r = from_pandas_series(series_raw, chunk_size=4).dropna().tiles()
        assert_unknown_rows(r.shape, ())
        self.assertEqual(len(r.nsplits), 1)
        assert_all_nan(r.nsplits[0], 5)
        for c in r.chunks:
            self.assertIsInstance(c.op, type(r.op))
            self.assertEqual(len(c.inputs), 1)
            self.assertEqual(len(c.inputs[0].inputs), 0)
            assert_unknown_rows(c.shape, ())
Beispiel #16
0
    def testDataFrameSum(self):
        # Checks the graph built by DataFrame.sum(): the untiled result is a
        # Series, and every tiled chunk is a DataFrameSum fed by a
        # DataFrameConcat of two inputs.
        #
        # The tiled object is taken from the return value of tiles() rather
        # than relying on tiles() mutating sum_df in place.
        data = pd.DataFrame({
            'a': list(range(20)),
            'b': list(range(20, 0, -1))
        },
                            index=[str(i) for i in range(20)])
        sum_df = from_pandas_df(data, chunk_size=3).sum()

        self.assertIsInstance(sum_df, Series)
        self.assertIsInstance(sum_df.index_value._index_value,
                              IndexValue.Index)
        self.assertEqual(sum_df.shape, (2, ))

        sum_df = sum_df.tiles()

        self.assertEqual(len(sum_df.chunks), 1)
        self.assertIsInstance(sum_df.chunks[0].op, DataFrameSum)
        self.assertIsInstance(sum_df.chunks[0].inputs[0].op, DataFrameConcat)
        self.assertEqual(len(sum_df.chunks[0].inputs[0].inputs), 2)

        data = pd.DataFrame(np.random.rand(20, 10))
        sum_df = from_pandas_df(data, chunk_size=3).sum()

        self.assertIsInstance(sum_df, Series)
        self.assertIsInstance(sum_df.index_value._index_value,
                              IndexValue.RangeIndex)
        self.assertEqual(sum_df.shape, (10, ))

        sum_df = sum_df.tiles()

        self.assertEqual(len(sum_df.chunks), 4)
        self.assertEqual(sum_df.nsplits, ((3, 3, 3, 1), ))
        self.assertIsInstance(sum_df.chunks[0].op, DataFrameSum)
        self.assertIsInstance(sum_df.chunks[0].inputs[0].op, DataFrameConcat)
        self.assertEqual(len(sum_df.chunks[0].inputs[0].inputs), 2)

        data = pd.DataFrame(np.random.rand(20, 20),
                            index=[str(i) for i in range(20)])
        sum_df = from_pandas_df(data, chunk_size=4).sum(axis='columns')

        self.assertEqual(sum_df.shape, (20, ))

        sum_df = sum_df.tiles()

        self.assertEqual(len(sum_df.chunks), 5)
        # The five splits are expected to be NaN. NaN != NaN, so test each
        # entry with np.isnan instead of comparing against a tuple of np.nan,
        # which only passes when the entries are the very same object.
        self.assertEqual(len(sum_df.nsplits), 1)
        self.assertEqual(len(sum_df.nsplits[0]), 5)
        self.assertTrue(all(np.isnan(s) for s in sum_df.nsplits[0]))
        self.assertIsInstance(sum_df.chunks[0].op, DataFrameSum)
        self.assertIsInstance(sum_df.chunks[0].inputs[0].op, DataFrameConcat)
        self.assertEqual(len(sum_df.chunks[0].inputs[0].inputs), 2)
Beispiel #17
0
def test_drop_duplicates():
    """Check argument validation and method selection of ``drop_duplicates``."""

    def picked_method(tileable):
        # Method recorded on the op of the first chunk after tiling.
        return tile(tileable).chunks[0].op.method

    rs = np.random.RandomState(0)
    column_names = ['c' + str(n + 1) for n in range(7)]
    raw = pd.DataFrame(rs.randint(1000, size=(20, 7)), columns=column_names)
    # Replace the integer column 'c7' with string values.
    raw['c7'] = [f's{j}' for j in range(20)]

    df = from_pandas_df(raw, chunk_size=10)

    # Invalid arguments on the DataFrame.
    with pytest.raises(ValueError):
        df.drop_duplicates(method='unknown')
    with pytest.raises(KeyError):
        df.drop_duplicates(subset='c8')

    # test auto method selection
    assert picked_method(df.drop_duplicates()) == 'tree'
    # subset size less than chunk_store_limit
    assert picked_method(
        df.drop_duplicates(subset=['c1', 'c3'])) == 'subset_tree'
    with option_context({'chunk_store_limit': 5}):
        # subset size greater than chunk_store_limit
        assert picked_method(
            df.drop_duplicates(subset=['c1', 'c3'])) == 'tree'
    assert picked_method(df.drop_duplicates(subset=['c1', 'c7'])) == 'tree'
    assert picked_method(df['c7'].drop_duplicates()) == 'tree'

    # Invalid method on a Series.
    s = df['c7']
    with pytest.raises(ValueError):
        s.drop_duplicates(method='unknown')
Beispiel #18
0
    def testToCPU(self):
        """Send a DataFrame through ``to_gpu`` and ``to_cpu`` and compare metadata.

        The index value, columns value and dtypes of the round-tripped
        DataFrame must match the source, both on the tileable and on its
        first chunk after tiling, and the ``gpu`` flag of the op must be
        false.
        """
        values = np.random.rand(10, 10)
        row_labels = np.random.randint(-100, 100, size=(10, ))
        column_labels = [np.random.bytes(10) for _ in range(10)]
        data = pd.DataFrame(values, index=row_labels, columns=column_labels)

        df = from_pandas_df(data)
        df2 = to_cpu(to_gpu(df))

        # Tileable-level metadata.
        self.assertEqual(df.index_value, df2.index_value)
        self.assertEqual(df.columns_value, df2.columns_value)
        self.assertFalse(df2.op.gpu)
        pd.testing.assert_series_equal(df.dtypes, df2.dtypes)

        df2 = df2.tiles()
        df = get_tiled(df)

        # Chunk-level metadata.
        self.assertEqual(df.nsplits, df2.nsplits)
        src_chunk = df.chunks[0]
        cpu_chunk = df2.chunks[0]
        self.assertEqual(src_chunk.index_value, cpu_chunk.index_value)
        self.assertEqual(src_chunk.columns_value, cpu_chunk.columns_value)
        self.assertFalse(cpu_chunk.op.gpu)
        pd.testing.assert_series_equal(src_chunk.dtypes, cpu_chunk.dtypes)

        # Calling to_cpu on the result must hand back the very same object.
        self.assertIs(df2, to_cpu(df2))
Beispiel #19
0
    def testChunkSerialize(self):
        """Round-trip the first chunk of a tiled DataFrame through pb and json."""

        def assert_restored(original, restored):
            # Index, key, shape and both label axes must survive the round trip.
            self.assertEqual(original.index, restored.index)
            self.assertEqual(original.key, restored.key)
            self.assertEqual(original.shape, restored.shape)
            pd.testing.assert_index_equal(restored.index_value.to_pandas(),
                                          original.index_value.to_pandas())
            pd.testing.assert_index_equal(restored.columns_value.to_pandas(),
                                          original.columns_value.to_pandas())

        values = np.random.rand(10, 10)
        row_labels = np.random.randint(-100, 100, size=(10,))
        column_labels = [np.random.bytes(10) for _ in range(10)]
        data = pd.DataFrame(values, index=row_labels, columns=column_labels)
        df = from_pandas_df(data).tiles()

        # pb
        chunk = df.chunks[0]
        serials = self._pb_serial(chunk)
        op, pb = serials[chunk.op, chunk.data]

        self.assertEqual(tuple(pb.index), chunk.index)
        self.assertEqual(pb.key, chunk.key)
        self.assertEqual(tuple(pb.shape), chunk.shape)
        # The part of the serialized op type after the first '.' is compared
        # with the operand definition as an integer.
        type_code = int(op.type.split('.', 1)[1])
        self.assertEqual(type_code, OperandDef.DATAFRAME_DATA_SOURCE)

        assert_restored(chunk, self._pb_deserial(serials)[chunk.data])

        # json
        chunk = df.chunks[0]
        serials = self._json_serial(chunk)

        assert_restored(chunk, self._json_deserial(serials)[chunk.data])
Beispiel #20
0
    def testDropDuplicates(self):
        """Check argument validation and method selection of ``drop_duplicates``."""

        def picked_method(tileable):
            # Method recorded on the op of the first chunk after tiling.
            return tileable.tiles().chunks[0].op.method

        rs = np.random.RandomState(0)
        column_names = ['c' + str(n + 1) for n in range(7)]
        raw = pd.DataFrame(rs.randint(1000, size=(20, 7)),
                           columns=column_names)
        # Replace the integer column 'c7' with string values.
        raw['c7'] = ['s{}'.format(j) for j in range(20)]

        df = from_pandas_df(raw, chunk_size=10)

        # Invalid arguments on the DataFrame.
        with self.assertRaises(ValueError):
            df.drop_duplicates(method='unknown')
        with self.assertRaises(KeyError):
            df.drop_duplicates(subset='c8')

        # test auto method selection
        self.assertEqual(picked_method(df.drop_duplicates()), 'tree')
        # subset size less than chunk_store_limit
        self.assertEqual(
            picked_method(df.drop_duplicates(subset=['c1', 'c3'])),
            'subset_tree')
        with option_context({'chunk_store_limit': 5}):
            # subset size greater than chunk_store_limit
            self.assertEqual(
                picked_method(df.drop_duplicates(subset=['c1', 'c3'])),
                'tree')
        self.assertEqual(
            picked_method(df.drop_duplicates(subset=['c1', 'c7'])), 'tree')
        self.assertEqual(picked_method(df['c7'].drop_duplicates()), 'tree')

        # Invalid method on a Series.
        s = df['c7']
        with self.assertRaises(ValueError):
            s.drop_duplicates(method='unknown')
    def testCheckNA(self):
        """Compare ``isna``/``notna`` of chunked DataFrame and Series with pandas."""

        def executed(tileable):
            # Run the tileable and return the concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # All-NaN 20x10 frame with values written at random positions.
        df_raw = pd.DataFrame(np.nan,
                              index=range(0, 20),
                              columns=list('ABCDEFGHIJ'))
        for _ in range(20):
            # Draw the value first, then the row and the column, which is the
            # order a single subscript assignment evaluates them in.
            value = random.randint(0, 99)
            row = random.randint(0, 19)
            col = random.randint(0, 9)
            df_raw.iloc[row, col] = value

        df = from_pandas_df(df_raw, chunk_size=4)

        frame_isna = executed(df.isna())
        pd.testing.assert_frame_equal(frame_isna, df_raw.isna())
        frame_notna = executed(df.notna())
        pd.testing.assert_frame_equal(frame_notna, df_raw.notna())

        # All-NaN series of length 20 with values written at random positions.
        series_raw = pd.Series(np.nan, index=range(20))
        for _ in range(3):
            value = random.randint(0, 99)
            pos = random.randint(0, 19)
            series_raw.iloc[pos] = value

        series = from_pandas_series(series_raw, chunk_size=4)

        series_isna = executed(series.isna())
        pd.testing.assert_series_equal(series_isna, series_raw.isna())
        series_notna = executed(series.notna())
        pd.testing.assert_series_equal(series_notna, series_raw.notna())
Beispiel #22
0
    def testUseArrowDtypeNUnique(self):
        """Run ``nunique`` on both axes with ``dataframe.use_arrow_dtype`` enabled."""
        options = {'dataframe.use_arrow_dtype': True, 'combine_size': 2}
        with option_context(options):
            rs = np.random.RandomState(0)
            floats = rs.random(10)
            strings = [f's{i}' for i in rs.randint(100, size=10)]
            data1 = pd.DataFrame({'a': floats, 'b': strings})
            # Three more string columns, each a copy of 'b'.
            for copied in ('c', 'd', 'e'):
                data1[copied] = data1['b'].copy()

            df = from_pandas_df(data1, chunk_size=(3, 2))

            # Column-wise first, then row-wise, each compared with pandas.
            for axis in (0, 1):
                r = df.nunique(axis=axis)
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = data1.nunique(axis=axis)
                pd.testing.assert_series_equal(result, expected)
Beispiel #23
0
    def testDrop(self):
        """Check ``drop`` on DataFrame, Series and Index tileables."""
        rs = np.random.RandomState(0)

        # test dataframe drop
        column_names = ['c' + str(n + 1) for n in range(8)]
        raw = pd.DataFrame(rs.randint(1000, size=(20, 8)),
                           columns=column_names)
        df = from_pandas_df(raw, chunk_size=3)

        # A column that is not in the frame.
        with self.assertRaises(KeyError):
            df.drop(columns=['c9'])
        # Column labels passed as a Series tileable.
        with self.assertRaises(NotImplementedError):
            df.drop(columns=from_pandas_series(pd.Series(['c9'])))

        columns = ['c2', 'c4', 'c5', 'c6']
        index = [3, 6, 7]
        dropped_df = df.drop(columns=columns, index=index)
        self.assertIsInstance(dropped_df, DATAFRAME_TYPE)

        # test series drop
        raw = pd.Series(rs.randint(1000, size=(20, )))
        series = from_pandas_series(raw, chunk_size=3)

        dropped_series = series.drop(index=index)
        self.assertIsInstance(dropped_series, SERIES_TYPE)

        # test index drop
        ser = pd.Series(range(20))
        rs.shuffle(ser)
        raw = pd.Index(ser)

        idx = from_pandas_index(raw)

        dropped_index = idx.drop(index)
        self.assertIsInstance(dropped_index, INDEX_TYPE)
Beispiel #24
0
def test_eval_query():
    """Check the exceptions raised for unsupported or invalid eval/query input."""
    rs = np.random.RandomState(0)
    columns = {
        'a': rs.rand(100),
        'b': rs.rand(100),
        'c c': rs.rand(100),
    }
    df = from_pandas_df(pd.DataFrame(columns), chunk_size=(10, 2))

    # Keyword arguments for which mars_eval must raise NotImplementedError.
    for unsupported in ({'engine': 'numexpr'}, {'parser': 'pandas'}):
        with pytest.raises(NotImplementedError):
            mars_eval('df.a * 2', **unsupported)

    # The DataFrame itself passed as the expression.
    with pytest.raises(TypeError):
        df.eval(df)

    # Multi-line inputs.
    with pytest.raises(SyntaxError):
        df.query("""
        a + b
        a + `c c`
        """)
    with pytest.raises(SyntaxError):
        df.eval("""
        def a():
            return v
        a()
        """)

    # Single-line eval expressions paired with the exception each must raise.
    single_line_failures = (
        (SyntaxError, "a + `c"),
        (KeyError, "a + c"),
        (ValueError, "p, q = a + c"),
    )
    for expected_error, expression in single_line_failures:
        with pytest.raises(expected_error):
            df.eval(expression)

    with pytest.raises(ValueError):
        df.query("p = a + c")
Beispiel #25
0
    def testFromPandasDataFrameExecution(self):
        """Execute a DataFrame built from pandas and compare it with the source.

        The source frame is 20x30 and its index is built from two arrays.
        """
        values = np.random.rand(20, 30)
        index_arrays = [np.arange(20), np.arange(20, 0, -1)]
        pdf = pd.DataFrame(values, index=index_arrays)

        chunked = from_pandas_df(pdf, chunk_size=(13, 21))
        fetched = self.executor.execute_dataframe(chunked, concat=True)

        pd.testing.assert_frame_equal(pdf, fetched[0])
Beispiel #26
0
    def testMemoryUsage(self):
        """Check the tiled result of ``memory_usage`` for DataFrame and Series.

        For each chunking the test asserts the result type, its shape, the
        number of result chunks and the stage of the first chunk's op.
        """
        dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
        # One 500-element column of ones per dtype, keyed by the dtype name.
        data = {t: np.ones(shape=500).astype(t) for t in dtypes}
        raw = pd.DataFrame(data)

        # DataFrame, chunk_size=(500, 2), default arguments.
        df = from_pandas_df(raw, chunk_size=(500, 2))
        r = df.memory_usage().tiles()

        self.assertIsInstance(r, SERIES_TYPE)
        self.assertEqual(r.shape, (6,))
        self.assertEqual(len(r.chunks), 3)
        self.assertIsNone(r.chunks[0].op.stage)

        # DataFrame, chunk_size=(100, 3), index included.
        df = from_pandas_df(raw, chunk_size=(100, 3))
        r = df.memory_usage(index=True).tiles()

        self.assertIsInstance(r, SERIES_TYPE)
        self.assertEqual(r.shape, (6,))
        self.assertEqual(len(r.chunks), 2)
        self.assertEqual(r.chunks[0].op.stage, OperandStage.reduce)

        # Same DataFrame, index excluded: one entry fewer in the result.
        r = df.memory_usage(index=False).tiles()

        self.assertIsInstance(r, SERIES_TYPE)
        self.assertEqual(r.shape, (5,))
        self.assertEqual(len(r.chunks), 2)
        self.assertEqual(r.chunks[0].op.stage, OperandStage.reduce)

        raw = pd.Series(np.ones(shape=500).astype('object'), name='s')

        # Series without an explicit chunk size: the result is a 0-d tensor.
        series = from_pandas_series(raw)
        r = series.memory_usage().tiles()

        self.assertIsInstance(r, TENSOR_TYPE)
        self.assertEqual(r.shape, ())
        self.assertEqual(len(r.chunks), 1)
        self.assertIsNone(r.chunks[0].op.stage)

        # Series with chunk_size=100: still one result chunk, now in reduce stage.
        series = from_pandas_series(raw, chunk_size=100)
        r = series.memory_usage().tiles()

        self.assertIsInstance(r, TENSOR_TYPE)
        self.assertEqual(r.shape, ())
        self.assertEqual(len(r.chunks), 1)
        self.assertEqual(r.chunks[0].op.stage, OperandStage.reduce)
Beispiel #27
0
    def testRebalance(self):
        """Check that a tiled, rebalanced DataFrame keeps the op type of its source."""
        raw = pd.DataFrame(np.random.rand(10, 3), columns=list('abc'))
        df = from_pandas_df(raw)

        rebalanced = df.rebalance().tiles()

        expected_op_type = type(df.op)
        self.assertIsInstance(rebalanced.op, expected_op_type)
Beispiel #28
0
def test_drop():
    """Check ``drop`` on DataFrame, Series and Index tileables."""
    rs = np.random.RandomState(0)

    # test dataframe drop
    column_names = ['c' + str(n + 1) for n in range(8)]
    raw = pd.DataFrame(rs.randint(1000, size=(20, 8)), columns=column_names)

    df = from_pandas_df(raw, chunk_size=8)

    # A column that is not in the frame.
    with pytest.raises(KeyError):
        df.drop(columns=['c9'])
    # Column labels passed as a Series tileable.
    with pytest.raises(NotImplementedError):
        df.drop(columns=from_pandas_series(pd.Series(['c9'])))

    # Dropping a column must leave the row index unchanged.
    r = df.drop(columns=['c1'])
    pd.testing.assert_index_equal(r.index_value.to_pandas(), raw.index)

    # Each chunk's index must equal the matching slice of the raw index,
    # walking the chunks in order and advancing by each chunk's row count.
    offset = 0
    for chunk in tile(r).chunks:
        n_rows = chunk.shape[0]
        expected_index = raw.index[offset:offset + n_rows]
        offset += n_rows
        pd.testing.assert_index_equal(expected_index,
                                      chunk.index_value.to_pandas())

    df = from_pandas_df(raw, chunk_size=3)

    columns = ['c2', 'c4', 'c5', 'c6']
    index = [3, 6, 7]
    dropped_df = df.drop(columns=columns, index=index)
    assert isinstance(dropped_df, DATAFRAME_TYPE)

    # test series drop
    raw = pd.Series(rs.randint(1000, size=(20, )))
    series = from_pandas_series(raw, chunk_size=3)

    dropped_series = series.drop(index=index)
    assert isinstance(dropped_series, SERIES_TYPE)

    # test index drop
    ser = pd.Series(range(20))
    rs.shuffle(ser)
    raw = pd.Index(ser)

    idx = from_pandas_index(raw)

    dropped_index = idx.drop(index)
    assert isinstance(dropped_index, INDEX_TYPE)
Beispiel #29
0
    def testGPUExecution(self):
        """Execute ``sum()`` of a DataFrame moved to GPU and compare with pandas."""
        pdf = pd.DataFrame(np.random.rand(30, 3), columns=list('abc'))
        gpu_sum = to_gpu(from_pandas_df(pdf, chunk_size=6)).sum()

        fetched = self.executor.execute_dataframe(gpu_sum, concat=True)[0]

        # The executed result is converted with to_pandas() before comparing.
        pd.testing.assert_series_equal(fetched.to_pandas(), pdf.sum())
Beispiel #30
0
def test_memory_usage():
    """Check the tiled result of ``memory_usage`` for DataFrame and Series.

    For each chunking the test asserts the result type, its shape, the
    number of result chunks and the stage of the first chunk's op.
    """
    dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
    # One 500-element column of ones per dtype, keyed by the dtype name.
    data = {t: np.ones(shape=500).astype(t) for t in dtypes}
    raw = pd.DataFrame(data)

    # DataFrame, chunk_size=(500, 2), default arguments.
    df = from_pandas_df(raw, chunk_size=(500, 2))
    r = tile(df.memory_usage())

    assert isinstance(r, SERIES_TYPE)
    assert r.shape == (6, )
    assert len(r.chunks) == 3
    assert r.chunks[0].op.stage is None

    # DataFrame, chunk_size=(100, 3), index included.
    df = from_pandas_df(raw, chunk_size=(100, 3))
    r = tile(df.memory_usage(index=True))

    assert isinstance(r, SERIES_TYPE)
    assert r.shape == (6, )
    assert len(r.chunks) == 2
    assert r.chunks[0].op.stage == OperandStage.reduce

    # Same DataFrame, index excluded: one entry fewer in the result.
    r = tile(df.memory_usage(index=False))

    assert isinstance(r, SERIES_TYPE)
    assert r.shape == (5, )
    assert len(r.chunks) == 2
    assert r.chunks[0].op.stage == OperandStage.reduce

    raw = pd.Series(np.ones(shape=500).astype('object'), name='s')

    # Series without an explicit chunk size: the result is a 0-d tensor.
    series = from_pandas_series(raw)
    r = tile(series.memory_usage())

    assert isinstance(r, TENSOR_TYPE)
    assert r.shape == ()
    assert len(r.chunks) == 1
    assert r.chunks[0].op.stage is None

    # Series with chunk_size=100: still one result chunk, now in reduce stage.
    series = from_pandas_series(raw, chunk_size=100)
    r = tile(series.memory_usage())

    assert isinstance(r, TENSOR_TYPE)
    assert r.shape == ()
    assert len(r.chunks) == 1
    assert r.chunks[0].op.stage == OperandStage.reduce