class Test(TestBase):
    """Execution tests for DataFrame/Series sources built from pandas objects and tensors."""

    def setUp(self):
        """Run the base-class setup, then create the executor used by every test."""
        super(Test, self).setUp()
        self.executor = Executor()

    def testFromPandasDataFrameExecution(self):
        # Round trip: pandas frame -> from_pandas_df -> execute -> identical pandas frame.
        # A list of two arrays is passed as the index, so pandas builds a two-level index.
        index_levels = [np.arange(20), np.arange(20, 0, -1)]
        raw = pd.DataFrame(np.random.rand(20, 30), index=index_levels)
        mars_df = from_pandas_df(raw, chunk_size=(13, 21))

        executed = self.executor.execute_dataframe(mars_df, concat=True)
        pd.testing.assert_frame_equal(raw, executed[0])

    def testFromPandasSeriesExecution(self):
        # Round trip: pandas series -> from_pandas_series -> execute -> identical series.
        index_levels = [np.arange(20), np.arange(20, 0, -1)]
        raw = pd.Series(np.random.rand(20), index=index_levels, name='a')
        mars_series = from_pandas_series(raw, chunk_size=13)

        executed = self.executor.execute_dataframe(mars_series, concat=True)
        pd.testing.assert_series_equal(raw, executed[0])

    def testFromTensorExecution(self):
        # 2-d tensor without explicit labels: both axes must come back as RangeIndex(0, 10)
        # and the values must match a pandas frame built from the executed tensor.
        source = mt.random.rand(10, 10, chunk_size=5)
        mars_df = from_tensor(source)
        raw_values = self.executor.execute_tensor(source, concat=True)[0]
        wanted = pd.DataFrame(raw_values)
        got = self.executor.execute_dataframe(mars_df, concat=True)[0]
        pd.testing.assert_index_equal(got.index, pd.RangeIndex(0, 10))
        pd.testing.assert_index_equal(got.columns, pd.RangeIndex(0, 10))
        pd.testing.assert_frame_equal(got, wanted)

        # Explicit index and columns must be carried through to the executed frame.
        labelled_source = mt.random.rand(2, 2, chunk_size=1)
        labelled = from_tensor(labelled_source,
                               index=pd.Index(['a', 'b']),
                               columns=pd.Index([3, 4]))
        got = self.executor.execute_dataframe(labelled, concat=True)[0]
        pd.testing.assert_index_equal(got.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(got.columns, pd.Index([3, 4]))

        # A 1-d tensor is compared against pd.DataFrame built from the same 1-d array.
        flat = from_tensor(mt.array([1, 2, 3]))
        flat_got = self.executor.execute_dataframe(flat, concat=True)[0]
        flat_wanted = pd.DataFrame(np.array([1, 2, 3]))
        pd.testing.assert_frame_equal(flat_wanted, flat_got)
Beispiel #2
0
class Test(TestBase):
    """Execution tests for moving DataFrame/Series data with ``to_gpu`` and ``to_cpu``."""

    def setUp(self):
        """Run the base-class setup, then create the executor used by every test."""
        super(Test, self).setUp()
        self.executor = Executor()

    @require_cudf
    def testToGPUExecution(self):
        # Frame: the executed to_gpu result must be a cudf.DataFrame with the original values.
        raw_df = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1))
        mars_df = from_pandas_df(raw_df, chunk_size=(13, 21))
        gpu_df = to_gpu(mars_df)

        fetched = self.executor.execute_dataframe(gpu_df, concat=True)[0]
        self.assertIsInstance(fetched, cudf.DataFrame)
        pd.testing.assert_frame_equal(fetched.to_pandas(), raw_df)

        # Series: same check, this time through the ``to_gpu`` method form.
        raw_series = raw_df.iloc[:, 0]
        mars_series = from_pandas_series(raw_series)
        gpu_series = mars_series.to_gpu()

        fetched = self.executor.execute_dataframe(gpu_series, concat=True)[0]
        self.assertIsInstance(fetched, cudf.Series)
        pd.testing.assert_series_equal(fetched.to_pandas(), raw_series)

    @require_cudf
    def testToCPUExecution(self):
        # Frame: to_gpu followed by to_cpu must give back a pandas frame equal to the input.
        raw_df = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1))
        mars_df = from_pandas_df(raw_df, chunk_size=(13, 21))
        round_tripped_df = to_cpu(to_gpu(mars_df))

        fetched = self.executor.execute_dataframe(round_tripped_df, concat=True)[0]
        self.assertIsInstance(fetched, pd.DataFrame)
        pd.testing.assert_frame_equal(fetched, raw_df)

        # Series: same round trip.
        # NOTE(review): a two-element chunk_size is passed for a 1-d series -- confirm
        # this is intended.
        raw_series = raw_df.iloc[:, 0]
        mars_series = from_pandas_series(raw_series, chunk_size=(13, 21))
        round_tripped_series = to_cpu(to_gpu(mars_series))

        fetched = self.executor.execute_dataframe(round_tripped_series, concat=True)[0]
        self.assertIsInstance(fetched, pd.Series)
        pd.testing.assert_series_equal(fetched, raw_series)
Beispiel #3
0
class TestUnary(TestBase):
    """Execution tests for unary operations on DataFrames."""

    def setUp(self):
        """Run the base-class setup, then create the executor used by every test."""
        super(TestUnary, self).setUp()
        self.executor = Executor()

    def testAbs(self):
        # Values are drawn from [-1, 1) so that abs() actually changes some entries.
        raw = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)))
        mars_df = from_pandas(raw, chunk_size=5)

        executed = self.executor.execute_dataframe(abs(mars_df), concat=True)
        result = executed[0]
        expected = raw.abs()
        pd.testing.assert_frame_equal(expected, result)
Beispiel #4
0
class Test(TestBase):
    """Execution test for a DataFrame created with ``from_pandas``."""

    def setUp(self):
        """Run the base-class setup, then create the executor used by every test."""
        super(Test, self).setUp()
        self.executor = Executor()

    def testPandasExecution(self):
        # Round trip: pandas frame -> from_pandas -> execute -> identical pandas frame.
        # A list of two arrays is passed as the index, so pandas builds a two-level index.
        index_levels = [np.arange(20), np.arange(20, 0, -1)]
        raw = pd.DataFrame(np.random.rand(20, 30), index=index_levels)
        mars_df = from_pandas(raw, chunk_size=(13, 21))

        executed = self.executor.execute_dataframe(mars_df, concat=True)
        pd.testing.assert_frame_equal(raw, executed[0])
Beispiel #5
0
class Test(TestBase):
    """Execution tests for DataFrame/Series objects created from pandas data."""

    def setUp(self):
        """Run the base-class setup, then create the executor used by every test."""
        super(Test, self).setUp()
        self.executor = Executor()

    def testFromPandasDataFrameExecution(self):
        # Round trip: pandas frame -> from_pandas_df -> execute -> identical pandas frame.
        index_levels = [np.arange(20), np.arange(20, 0, -1)]
        raw = pd.DataFrame(np.random.rand(20, 30), index=index_levels)
        mars_df = from_pandas_df(raw, chunk_size=(13, 21))

        executed = self.executor.execute_dataframe(mars_df, concat=True)
        pd.testing.assert_frame_equal(raw, executed[0])

    def testFromPandasSeriesExecution(self):
        # Round trip: pandas series -> from_pandas_series -> execute -> identical series.
        index_levels = [np.arange(20), np.arange(20, 0, -1)]
        raw = pd.Series(np.random.rand(20), index=index_levels, name='a')
        mars_series = from_pandas_series(raw, chunk_size=13)

        executed = self.executor.execute_dataframe(mars_series, concat=True)
        pd.testing.assert_series_equal(raw, executed[0])
class Test(TestBase):
    """Execution tests for DataFrame/Series data sources.

    Covers pandas objects, the ``md.DataFrame``/``md.Series`` initializers,
    tensors and structured records.
    """

    def setUp(self):
        """Run the base-class setup, then create the executor used by every test."""
        super(Test, self).setUp()
        self.executor = Executor()

    def testFromPandasDataFrameExecution(self):
        # Round trip: pandas frame -> from_pandas_df -> execute -> identical pandas frame.
        index_levels = [np.arange(20), np.arange(20, 0, -1)]
        raw = pd.DataFrame(np.random.rand(20, 30), index=index_levels)
        mars_df = from_pandas_df(raw, chunk_size=(13, 21))

        executed = self.executor.execute_dataframe(mars_df, concat=True)
        pd.testing.assert_frame_equal(raw, executed[0])

    def testFromPandasSeriesExecution(self):
        # Round trip: pandas series -> from_pandas_series -> execute -> identical series.
        index_levels = [np.arange(20), np.arange(20, 0, -1)]
        raw = pd.Series(np.random.rand(20), index=index_levels, name='a')
        mars_series = from_pandas_series(raw, chunk_size=13)

        executed = self.executor.execute_dataframe(mars_series, concat=True)
        pd.testing.assert_series_equal(raw, executed[0])

    def testInitializerExecution(self):
        # md.DataFrame given a pandas frame must execute back to that frame.
        frame_levels = [np.arange(20), np.arange(20, 0, -1)]
        raw_df = pd.DataFrame(np.random.rand(20, 30), index=frame_levels)
        mars_df = md.DataFrame(raw_df, chunk_size=(15, 10))
        executed = self.executor.execute_dataframe(mars_df, concat=True)
        pd.testing.assert_frame_equal(raw_df, executed[0])

        # md.Series given a pandas series must execute back to that series.
        series_levels = [np.arange(20), np.arange(20, 0, -1)]
        raw_series = pd.Series(np.random.rand(20), index=series_levels, name='a')
        mars_series = md.Series(raw_series, chunk_size=7)
        executed = self.executor.execute_dataframe(mars_series, concat=True)
        pd.testing.assert_series_equal(raw_series, executed[0])

    def testSeriesFromTensor(self):
        values = np.random.rand(10)

        # Tensor with default chunking, explicit series name.
        named = md.Series(mt.tensor(values), name='a')
        pd.testing.assert_series_equal(named.execute(), pd.Series(values, name='a'))

        # Tensor created with chunk_size=3, no name.
        chunked = md.Series(mt.tensor(values, chunk_size=3))
        pd.testing.assert_series_equal(chunked.execute(), pd.Series(values))

        # Tensor of ones created with chunk_size=4.
        ones = md.Series(mt.ones((10,), chunk_size=4))
        pd.testing.assert_series_equal(ones.execute(), pd.Series(np.ones(10)))

    def testFromTensorExecution(self):
        # 2-d tensor without explicit labels: both axes must come back as RangeIndex(0, 10)
        # and the values must match a pandas frame built from the executed tensor.
        source = mt.random.rand(10, 10, chunk_size=5)
        mars_df = dataframe_from_tensor(source)
        raw_values = self.executor.execute_tensor(source, concat=True)[0]
        wanted = pd.DataFrame(raw_values)
        got = self.executor.execute_dataframe(mars_df, concat=True)[0]
        pd.testing.assert_index_equal(got.index, pd.RangeIndex(0, 10))
        pd.testing.assert_index_equal(got.columns, pd.RangeIndex(0, 10))
        pd.testing.assert_frame_equal(got, wanted)

        # Explicit index and columns must be carried through to the executed frame.
        labelled_source = mt.random.rand(2, 2, chunk_size=1)
        labelled = dataframe_from_tensor(labelled_source,
                                         index=pd.Index(['a', 'b']),
                                         columns=pd.Index([3, 4]))
        got = self.executor.execute_dataframe(labelled, concat=True)[0]
        pd.testing.assert_index_equal(got.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(got.columns, pd.Index([3, 4]))

        # A 1-d tensor is compared against pd.DataFrame built from the same 1-d array.
        flat = dataframe_from_tensor(mt.array([1, 2, 3]))
        flat_got = self.executor.execute_dataframe(flat, concat=True)[0]
        flat_wanted = pd.DataFrame(np.array([1, 2, 3]))
        pd.testing.assert_frame_equal(flat_wanted, flat_got)

        # Tensor of ones, so every chunk holds the same values.
        ones_plain = mt.ones((10, 10), chunk_size=3)
        plain_df = dataframe_from_tensor(ones_plain)
        plain_got = self.executor.execute_dataframe(plain_df, concat=True)[0]
        plain_values = self.executor.execute_tensor(ones_plain, concat=True)[0]
        pd.testing.assert_frame_equal(pd.DataFrame(plain_values), plain_got)

        # Tensor plus a caller-supplied index given as a numpy array.
        ones_indexed = mt.ones((10, 10), chunk_size=3)
        indexed_df = dataframe_from_tensor(ones_indexed, index=np.arange(0, 20, 2))
        indexed_got = self.executor.execute_dataframe(indexed_df, concat=True)[0]
        indexed_values = self.executor.execute_tensor(ones_indexed, concat=True)[0]
        indexed_wanted = pd.DataFrame(indexed_values, index=np.arange(0, 20, 2))
        pd.testing.assert_frame_equal(indexed_wanted, indexed_got)

        # Tensor plus caller-supplied column names given as a list of strings.
        ones_named = mt.ones((10, 10), chunk_size=3)
        named_df = dataframe_from_tensor(ones_named, columns=list('abcdefghij'))
        named_got = self.executor.execute_dataframe(named_df, concat=True)[0]
        named_values = self.executor.execute_tensor(ones_named, concat=True)[0]
        named_wanted = pd.DataFrame(named_values, columns=list('abcdefghij'))
        pd.testing.assert_frame_equal(named_wanted, named_got)

    def testFromRecordsExecution(self):
        # Structured dtype with an int, a double and a fixed-width unicode field.
        record_dtype = np.dtype([('x', 'int'), ('y', 'double'), ('z', '<U16')])

        records = np.ones((10,), dtype=record_dtype)
        wanted = pd.DataFrame.from_records(records, index=pd.RangeIndex(10))

        # Source 1: a structured Mars tensor.
        mars_records = mt.ones((10,), dtype=record_dtype, chunk_size=3)
        from_mars = from_records(mars_records)
        from_mars_got = self.executor.execute_dataframe(from_mars, concat=True)[0]
        pd.testing.assert_frame_equal(from_mars_got, wanted)

        # Source 2: the structured numpy array itself.
        from_numpy = from_records(records)
        from_numpy_got = self.executor.execute_dataframe(from_numpy, concat=True)[0]
        pd.testing.assert_frame_equal(from_numpy_got, wanted)
Beispiel #7
0
class TestBinary(TestBase):
    def setUp(self):
        """Run the base-class setup, then create the executor used by every test."""
        # Chain to TestBase.setUp so that any base-class fixtures are initialised,
        # matching the other TestBase subclasses, which all call it.
        super(TestBinary, self).setUp()
        self.executor = Executor()

    @property
    def rfunc_name(self):
        """Name of the reflected method: ``self.func_name`` prefixed with ``'r'``."""
        reflected_prefix = 'r'
        return reflected_prefix + self.func_name

    def testWithoutShuffleExecution(self):
        # Both operands have monotonic row and column labels.
        # Left operand: rows 0..9 and columns 3..12, both ascending; chunk_size=5.
        lhs_raw = pd.DataFrame(np.random.rand(10, 10),
                               index=np.arange(10),
                               columns=np.arange(3, 13))
        lhs = from_pandas(lhs_raw, chunk_size=5)
        # Right operand: rows 11..2 descending, columns 4..13 ascending; chunk_size=6.
        rhs_raw = pd.DataFrame(np.random.rand(10, 10),
                               index=np.arange(11, 1, -1),
                               columns=np.arange(4, 14))
        rhs = from_pandas(rhs_raw, chunk_size=6)

        combined = self.func(lhs, rhs)

        # The same function applied to the pandas inputs is the reference result.
        reference = self.func(lhs_raw, rhs_raw)
        executed = self.executor.execute_dataframe(combined, concat=True)

        pd.testing.assert_frame_equal(reference, executed[0])

    def testWithOneShuffleExecution(self):
        # Exactly one axis of each operand has monotonic labels; the other is unordered.
        unordered_a = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        unordered_b = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]
        label_sets = (
            # monotonic row labels, unordered column labels
            ((np.arange(10), unordered_a), (np.arange(11, 1, -1), unordered_b)),
            # unordered row labels, monotonic column labels
            ((unordered_a, np.arange(10)), (unordered_b, np.arange(11, 1, -1))),
        )

        for (lhs_index, lhs_columns), (rhs_index, rhs_columns) in label_sets:
            lhs_raw = pd.DataFrame(np.random.rand(10, 10),
                                   index=lhs_index, columns=lhs_columns)
            lhs = from_pandas(lhs_raw, chunk_size=5)
            rhs_raw = pd.DataFrame(np.random.rand(10, 10),
                                   index=rhs_index, columns=rhs_columns)
            rhs = from_pandas(rhs_raw, chunk_size=6)

            combined = self.func(lhs, rhs)

            reference = self.func(lhs_raw, rhs_raw)
            executed = self.executor.execute_dataframe(combined, concat=True)

            pd.testing.assert_frame_equal(reference, executed[0])

    def testWithAllShuffleExecution(self):
        # Neither the row labels nor the column labels of either operand are monotonic.
        lhs_raw = pd.DataFrame(np.random.rand(10, 10),
                               index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                               columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        lhs = from_pandas(lhs_raw, chunk_size=5)
        rhs_raw = pd.DataFrame(np.random.rand(10, 10),
                               index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                               columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        rhs = from_pandas(rhs_raw, chunk_size=6)

        combined = self.func(lhs, rhs)

        reference = self.func(lhs_raw, rhs_raw)
        executed = self.executor.execute_dataframe(combined, concat=True)

        pd.testing.assert_frame_equal(reference, executed[0])

    def testBothWithOneChunk(self):
        # Every operand is created with chunk_size=10, the full length of both axes
        # of the 10x10 frames. Only one axis of each operand has monotonic labels.
        unordered_a = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        unordered_b = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]
        label_sets = (
            # monotonic row labels, unordered column labels
            ((np.arange(10), unordered_a), (np.arange(11, 1, -1), unordered_b)),
            # unordered row labels, monotonic column labels
            ((unordered_a, np.arange(10)), (unordered_b, np.arange(11, 1, -1))),
        )

        for (lhs_index, lhs_columns), (rhs_index, rhs_columns) in label_sets:
            lhs_raw = pd.DataFrame(np.random.rand(10, 10),
                                   index=lhs_index, columns=lhs_columns)
            lhs = from_pandas(lhs_raw, chunk_size=10)
            rhs_raw = pd.DataFrame(np.random.rand(10, 10),
                                   index=rhs_index, columns=rhs_columns)
            rhs = from_pandas(rhs_raw, chunk_size=10)

            combined = self.func(lhs, rhs)

            reference = self.func(lhs_raw, rhs_raw)
            executed = self.executor.execute_dataframe(combined, concat=True)

            pd.testing.assert_frame_equal(reference, executed[0])

    def testWithoutShuffleAndWithOneChunk(self):
        # Chunk sizes below the axis length (5 and 6) are applied to the axis whose
        # labels are monotonic; the unordered axis gets chunk size 10, its full length.
        unordered_a = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        unordered_b = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]
        # Each entry: (index, columns, chunk_size) for the left, then the right operand.
        scenarios = (
            # monotonic rows, chunked along rows
            ((np.arange(10), unordered_a, (5, 10)),
             (np.arange(11, 1, -1), unordered_b, (6, 10))),
            # monotonic columns, chunked along columns
            ((unordered_a, np.arange(10), (10, 5)),
             (unordered_b, np.arange(11, 1, -1), (10, 6))),
        )

        for lhs_spec, rhs_spec in scenarios:
            lhs_index, lhs_columns, lhs_chunks = lhs_spec
            rhs_index, rhs_columns, rhs_chunks = rhs_spec

            lhs_raw = pd.DataFrame(np.random.rand(10, 10),
                                   index=lhs_index, columns=lhs_columns)
            lhs = from_pandas(lhs_raw, chunk_size=lhs_chunks)
            rhs_raw = pd.DataFrame(np.random.rand(10, 10),
                                   index=rhs_index, columns=rhs_columns)
            rhs = from_pandas(rhs_raw, chunk_size=rhs_chunks)

            combined = self.func(lhs, rhs)

            reference = self.func(lhs_raw, rhs_raw)
            executed = self.executor.execute_dataframe(combined, concat=True)

            pd.testing.assert_frame_equal(reference, executed[0])

    def testWithShuffleAndWithOneChunk(self):
        # Chunk sizes below the axis length (5 and 6) are applied to the axis whose
        # labels are unordered; the monotonic axis gets chunk size 10, its full length.
        unordered_a = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        unordered_b = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]
        # Each entry: (index, columns, chunk_size) for the left, then the right operand.
        scenarios = (
            # unordered columns, chunked along columns
            ((np.arange(10), unordered_a, (10, 5)),
             (np.arange(11, 1, -1), unordered_b, (10, 6))),
            # unordered rows, chunked along rows
            ((unordered_a, np.arange(10), (5, 10)),
             (unordered_b, np.arange(11, 1, -1), (6, 10))),
        )

        for lhs_spec, rhs_spec in scenarios:
            lhs_index, lhs_columns, lhs_chunks = lhs_spec
            rhs_index, rhs_columns, rhs_chunks = rhs_spec

            lhs_raw = pd.DataFrame(np.random.rand(10, 10),
                                   index=lhs_index, columns=lhs_columns)
            lhs = from_pandas(lhs_raw, chunk_size=lhs_chunks)
            rhs_raw = pd.DataFrame(np.random.rand(10, 10),
                                   index=rhs_index, columns=rhs_columns)
            rhs = from_pandas(rhs_raw, chunk_size=rhs_chunks)

            combined = self.func(lhs, rhs)

            reference = self.func(lhs_raw, rhs_raw)
            executed = self.executor.execute_dataframe(combined, concat=True)

            pd.testing.assert_frame_equal(reference, executed[0])

    def testChained(self):
        # Applies the function twice: func(func(a, b), c), with differently chunked inputs.
        first_raw = pd.DataFrame(np.random.rand(10, 10))
        first = from_pandas(first_raw, chunk_size=5)
        second_raw = pd.DataFrame(np.random.rand(10, 10))
        second = from_pandas(second_raw, chunk_size=6)

        inner = self.func(first, second)

        third_raw = pd.DataFrame(np.random.rand(10, 10))
        third = from_pandas(third_raw, chunk_size=6)

        outer = self.func(inner, third)

        executed = self.executor.execute_dataframe(outer, concat=True)
        result = executed[0]
        inner_reference = self.func(first_raw, second_raw)
        reference = self.func(inner_reference, third_raw)

        pd.testing.assert_frame_equal(reference, result)

    def testRfunc(self):
        # Reflected method with a frame operand: a.rfunc(b) is compared to func(b, a).
        lhs_raw = pd.DataFrame(np.random.rand(10, 10))
        lhs = from_pandas(lhs_raw, chunk_size=5)
        rhs_raw = pd.DataFrame(np.random.rand(10, 10))
        rhs = from_pandas(rhs_raw, chunk_size=6)
        reflected = getattr(lhs, self.rfunc_name)
        frame_op = reflected(rhs)
        frame_result = self.executor.execute_dataframe(frame_op, concat=True)[0]
        frame_reference = self.func(rhs_raw, lhs_raw)
        pd.testing.assert_frame_equal(frame_reference, frame_result)

        # Reflected method with a scalar operand: a.rfunc(1) is compared to func(1, a).
        scalar_raw = pd.DataFrame(np.random.rand(10, 10))
        scalar_df = from_pandas(scalar_raw, chunk_size=5)
        scalar_op = getattr(scalar_df, self.rfunc_name)(1)
        scalar_result = self.executor.execute_dataframe(scalar_op, concat=True)[0]
        scalar_reference = self.func(1, scalar_raw)
        pd.testing.assert_frame_equal(scalar_reference, scalar_result)

    def testWithMultiForms(self):
        # The same operation is requested through several calling forms and each
        # result is compared with pandas.
        lhs_raw = pd.DataFrame(np.random.rand(10, 10))
        lhs = from_pandas(lhs_raw, chunk_size=5)
        rhs_raw = pd.DataFrame(np.random.rand(10, 10))
        rhs = from_pandas(rhs_raw, chunk_size=6)

        reference = self.func(lhs_raw, rhs_raw)

        # Form 1: self.func applied to the two frames.
        via_func = self.func(lhs, rhs)
        outcome = self.executor.execute_dataframe(via_func, concat=True)[0]
        pd.testing.assert_frame_equal(reference, outcome)

        # NOTE(review): this repeats form 1 exactly; presumably a different calling
        # form (operator vs. module-level function) was intended -- confirm.
        via_func_again = self.func(lhs, rhs)
        outcome = self.executor.execute_dataframe(via_func_again, concat=True)[0]
        pd.testing.assert_frame_equal(reference, outcome)

        # Form 2: the method looked up by name on the left operand.
        via_method = getattr(lhs, self.func_name)(rhs)
        outcome = self.executor.execute_dataframe(via_method, concat=True)[0]
        pd.testing.assert_frame_equal(reference, outcome)

        # Form 3: the reflected method, compared with the swapped pandas call.
        via_reflected = getattr(lhs, self.rfunc_name)(rhs)
        outcome = self.executor.execute_dataframe(via_reflected, concat=True)[0]
        pd.testing.assert_frame_equal(self.func(rhs_raw, lhs_raw), outcome)

    def testDataframeAndScalar(self):
        raw = pd.DataFrame(np.random.rand(10, 10))
        mars_df = from_pandas(raw, chunk_size=2)

        # Frame on the left, scalar on the right.
        frame_first = self.func(raw, 1)
        outcome = self.executor.execute_dataframe(self.func(mars_df, 1), concat=True)[0]
        pd.testing.assert_frame_equal(frame_first, outcome)
        # NOTE(review): identical to the previous check; presumably another calling
        # form was intended -- confirm.
        outcome = self.executor.execute_dataframe(self.func(mars_df, 1), concat=True)[0]
        pd.testing.assert_frame_equal(frame_first, outcome)
        by_name = getattr(mars_df, self.func_name)(1)
        outcome = self.executor.execute_dataframe(by_name, concat=True)[0]
        pd.testing.assert_frame_equal(frame_first, outcome)

        # Scalar-and-frame section. The first check still passes the frame first
        # and is compared with the frame-first expectation.
        outcome = self.executor.execute_dataframe(self.func(mars_df, 1), concat=True)[0]
        pd.testing.assert_frame_equal(frame_first, outcome)

        # Scalar on the left, frame on the right.
        scalar_first = self.func(1, raw)
        outcome = self.executor.execute_dataframe(self.func(1, mars_df), concat=True)[0]
        pd.testing.assert_frame_equal(scalar_first, outcome)

        # The reflected method with a scalar must match the scalar-first expectation.
        reflected = getattr(mars_df, self.rfunc_name)(1)
        outcome = self.executor.execute_dataframe(reflected, concat=True)[0]
        pd.testing.assert_frame_equal(scalar_first, outcome)

    def testWithShuffleOnStringIndex(self):
        # Neither axis is monotonic, and the row labels are strings rather than ints.
        lhs_rows = [str(label) for label in [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]]
        lhs_raw = pd.DataFrame(np.random.rand(10, 10), index=lhs_rows,
                               columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        lhs = from_pandas(lhs_raw, chunk_size=5)
        rhs_rows = [str(label) for label in [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]]
        rhs_raw = pd.DataFrame(np.random.rand(10, 10), index=rhs_rows,
                               columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        rhs = from_pandas(rhs_raw, chunk_size=6)

        combined = self.func(lhs, rhs)

        reference = self.func(lhs_raw, rhs_raw)
        executed = self.executor.execute_dataframe(combined, concat=True)

        pd.testing.assert_frame_equal(reference, executed[0])

    def testDataframeAndSeries(self):
        # Frame-with-series operations along both axes, each compared with pandas.
        lhs_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                               columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        rhs_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                               columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])

        # Column 1 of the second frame is the series operand for the first three checks.
        rhs_column = rhs_raw[1]
        mars_column = from_pandas_series(rhs_column, chunk_size=(6,))

        large_cases = (
            (lhs_raw[[1]], 'index'),   # single-column frame and series
            (lhs_raw, 'index'),        # full frame, aligned on the row labels
            (lhs_raw, 'columns'),      # full frame, aligned on the column labels
        )
        for frame_raw, axis in large_cases:
            mars_frame = from_pandas(frame_raw, chunk_size=(5, 5))
            lazy = getattr(mars_frame, self.func_name)(mars_column, axis=axis)

            expected = getattr(frame_raw, self.func_name)(rhs_column, axis=axis)
            result = self.executor.execute_dataframe(lazy, concat=True)[0]
            pd.testing.assert_frame_equal(expected, result)

        def run_small_case(frame_raw, series_raw, axis, **from_pandas_kw):
            # Executes frame.<func>(series, axis=axis) through Mars and through pandas;
            # returns (pandas result, Mars result).
            mars_frame = from_pandas(frame_raw, **from_pandas_kw)
            mars_series = from_pandas_series(series_raw)
            lazy = getattr(mars_frame, self.func_name)(mars_series, axis=axis)
            got = self.executor.execute_dataframe(lazy, concat=True)[0]
            want = getattr(frame_raw, self.func_name)(series_raw, axis=axis)
            return want, got

        # axis=0, from_pandas / from_pandas_series called without chunk_size
        expected, result = run_small_case(
            pd.DataFrame({'ca': [1, 3, 2], 'cb': [360, 180, 2]}, index=[1, 2, 3]),
            pd.Series([0, 1, 2], index=[1, 2, 3]),
            0)
        pd.testing.assert_frame_equal(expected, result)

        # axis=0, frame created with chunk_size=1, series without chunk_size
        expected, result = run_small_case(
            pd.DataFrame({'ca': [1, 3, 2], 'cb': [360, 180, 2]}, index=[1, 2, 3]),
            pd.Series([0, 1, 2], index=[1, 2, 3]),
            0, chunk_size=1)
        pd.testing.assert_frame_equal(expected, result)

        # axis=0, row labels of frame and series in different orders
        expected, result = run_small_case(
            pd.DataFrame({'ca': [1, 3, 2], 'cb': [360, 180, 2]}, index=[2, 1, 3]),
            pd.Series([0, 1, 2], index=[3, 1, 2]),
            0, chunk_size=1)
        # Both sides are put into the same row order before comparing.
        expected = expected.reindex([3, 1, 2])
        result = result.reindex(index=[3, 1, 2])
        pd.testing.assert_frame_equal(expected, result)

        # axis=1, from_pandas / from_pandas_series called without chunk_size
        expected, result = run_small_case(
            pd.DataFrame({1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]},
                         index=['ra', 'rb', 'rc']),
            pd.Series([0, 1, 2], index=[1, 2, 3]),
            1)
        pd.testing.assert_frame_equal(expected, result)

        # axis=1, frame created with chunk_size=1, series without chunk_size
        expected, result = run_small_case(
            pd.DataFrame({1: [1, 3, 2], 2: [360, 180, 2], 3: [1, 2, 3]},
                         index=['ra', 'rb', 'rc']),
            pd.Series([0, 1, 2], index=[1, 2, 3]),
            1, chunk_size=1)
        pd.testing.assert_frame_equal(expected, result)

        # axis=1, column labels of frame and series in different orders
        expected, result = run_small_case(
            pd.DataFrame({1: [1, 3, 2], 3: [1, 2, 3], 2: [360, 180, 2]},
                         index=['ra', 'rb', 'rc']),
            pd.Series([0, 1, 2], index=[3, 1, 2]),
            1, chunk_size=1)
        # Only the Mars result has its columns reordered before comparing.
        result = result[[1, 2, 3]]
        pd.testing.assert_frame_equal(expected, result)

    def testSeries(self):
        def check_pair(lhs_raw, rhs_raw, lhs_chunk, rhs_chunk):
            # Applies the function to two chunked series and compares with pandas.
            lazy = self.func(from_pandas_series(lhs_raw, chunk_size=lhs_chunk),
                             from_pandas_series(rhs_raw, chunk_size=rhs_chunk))
            got = self.executor.execute_dataframe(lazy, concat=True)[0]
            want = self.func(lhs_raw, rhs_raw)
            pd.testing.assert_series_equal(want, got)

        # chunk_size equals the series length on both sides
        check_pair(pd.Series(np.arange(10) + 1),
                   pd.Series(np.arange(10) + 1),
                   10, 10)

        # identical default index, different chunk sizes
        check_pair(pd.Series(np.arange(10) + 1),
                   pd.Series(np.arange(10) + 1),
                   4, 6)

        # monotonic but different indexes: ascending 0..9 against descending 10..1
        check_pair(pd.Series(np.arange(10) + 1, index=range(10)),
                   pd.Series(np.arange(10) + 1, index=range(10, 0, -1)),
                   4, 6)

        # randomly permuted indexes on both sides
        lhs_permuted = pd.Series(np.arange(10) + 1,
                                 index=np.random.permutation(range(10)))
        rhs_permuted = pd.Series(np.arange(10) + 1,
                                 index=np.random.permutation(range(10, 0, -1)))
        check_pair(lhs_permuted, rhs_permuted, 4, 6)

        # series on the left, scalar on the right
        raw = pd.Series(np.arange(10) + 1, index=np.random.permutation(range(10)))
        lazy = self.func(from_pandas_series(raw, chunk_size=4), 4)
        got = self.executor.execute_dataframe(lazy, concat=True)[0]
        want = self.func(raw, 4)
        pd.testing.assert_series_equal(want, got)

        # scalar on the left, series on the right
        raw = pd.Series(np.arange(10) + 1, index=np.random.permutation(range(10)))
        lazy = self.func(4, from_pandas_series(raw, chunk_size=4))
        got = self.executor.execute_dataframe(lazy, concat=True)[0]
        want = self.func(4, raw)
        pd.testing.assert_series_equal(want, got)

    def testWithPlainValue(self):
        # A frame and a series are combined with plain sequence operands: a list,
        # a tuple, a numpy array and a from_array(...) object.
        frame_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                                 columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        mars_frame = from_pandas(frame_raw, chunk_size=6)
        mars_column = mars_frame[2]

        def operand_pairs():
            # Returns fresh (operand given to Mars, operand given to pandas) pairs.
            # For the from_array case pandas receives the underlying numpy array.
            numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
            return [
                (list(numbers), list(numbers)),
                (tuple(numbers), tuple(numbers)),
                (np.array(numbers), np.array(numbers)),
                (from_array(np.array(numbers)), np.array(numbers)),
            ]

        # Frame with a plain value, always with axis=0.
        for mars_operand, pandas_operand in operand_pairs():
            lazy = getattr(mars_frame, self.func_name)(mars_operand, axis=0)
            result = self.executor.execute_dataframe(lazy, concat=True)[0]
            expected = getattr(frame_raw, self.func_name)(pandas_operand, axis=0)
            pd.testing.assert_frame_equal(expected, result)

        # Series (column 2 of the frame) with a plain value; no axis argument.
        for mars_operand, pandas_operand in operand_pairs():
            lazy = getattr(mars_column, self.func_name)(mars_operand)
            result = self.executor.execute_dataframe(lazy, concat=True)[0]
            expected = getattr(frame_raw[2], self.func_name)(pandas_operand)
            pd.testing.assert_series_equal(expected, result)

    @unittest.expectedFailure
    def testWithPlainValueUnaligned(self):
        # When adding dataframe with a sequence value, pandas treats the sequence
        # as a series using the index_value of the dataframe.
        #
        # In mars we cannot do such things because the index_value is not stored.
        # We also cannot split the sequence using the nsplits of the dataframe since
        # in many cases the shape of the dataframe chunks is np.nan.
        #
        # We record this case as `expectedFailure`.
        data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas(data1, chunk_size=6)

        # The pandas expectation must use the same axis as the Mars call.
        # Otherwise the comparison fails because of the axis mismatch alone and
        # the expected failure says nothing about the limitation described above.
        operand = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        r = getattr(df1, self.func_name)(operand, axis=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = getattr(data1, self.func_name)(operand, axis=1)
        pd.testing.assert_frame_equal(expected, result)
Beispiel #8
0
class Test(TestBase):
    """Execution tests for GPU/CPU transfer and rechunking of DataFrame/Series."""

    def setUp(self):
        """Run the base-class setup, then create the Executor used by the tests."""
        super(Test, self).setUp()
        self.executor = Executor()

    @require_cudf
    def testToGPUExecution(self):
        """`to_gpu` results execute to cudf objects holding the original data."""
        pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1))
        df = from_pandas_df(pdf, chunk_size=(13, 21))
        cdf = to_gpu(df)

        res = self.executor.execute_dataframe(cdf, concat=True)[0]
        self.assertIsInstance(res, cudf.DataFrame)
        pd.testing.assert_frame_equal(res.to_pandas(), pdf)

        # Same check for a Series, this time through the `.to_gpu()` method form.
        pseries = pdf.iloc[:, 0]
        series = from_pandas_series(pseries)
        cseries = series.to_gpu()

        res = self.executor.execute_dataframe(cseries, concat=True)[0]
        self.assertIsInstance(res, cudf.Series)
        pd.testing.assert_series_equal(res.to_pandas(), pseries)

    @require_cudf
    def testToCPUExecution(self):
        """A `to_gpu` followed by `to_cpu` round trip yields the original pandas data."""
        pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1))
        df = from_pandas_df(pdf, chunk_size=(13, 21))
        cdf = to_gpu(df)
        df2 = to_cpu(cdf)

        res = self.executor.execute_dataframe(df2, concat=True)[0]
        self.assertIsInstance(res, pd.DataFrame)
        pd.testing.assert_frame_equal(res, pdf)

        pseries = pdf.iloc[:, 0]
        # A Series has a single axis, so the chunk size is a single number
        # rather than the (rows, columns) pair used for the DataFrame above.
        series = from_pandas_series(pseries, chunk_size=13)
        cseries = to_gpu(series)
        series2 = to_cpu(cseries)

        res = self.executor.execute_dataframe(series2, concat=True)[0]
        self.assertIsInstance(res, pd.Series)
        pd.testing.assert_series_equal(res, pseries)

    def testRechunkExecution(self):
        """Rechunking a DataFrame or Series leaves the executed content unchanged."""
        data = pd.DataFrame(np.random.rand(8, 10))
        df = from_pandas_df(data, chunk_size=3)
        df2 = df.rechunk((3, 4))
        res = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_frame_equal(data, res)

        # Random integer index and random bytes as column labels.
        data = pd.DataFrame(np.random.rand(10, 10),
                            index=np.random.randint(-100, 100, size=(10, )),
                            columns=[np.random.bytes(10) for _ in range(10)])
        df = from_pandas_df(data)
        df2 = df.rechunk(5)
        res = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_frame_equal(data, res)

        # test Series rechunk execution.
        data = pd.Series(np.random.rand(10, ))
        series = from_pandas_series(data)
        series2 = series.rechunk(3)
        res = self.executor.execute_dataframe(series2, concat=True)[0]
        pd.testing.assert_series_equal(data, res)

        series2 = series.rechunk(1)
        res = self.executor.execute_dataframe(series2, concat=True)[0]
        pd.testing.assert_series_equal(data, res)
# Example #9
# 0
class Test(TestBase):
    """Execution tests comparing DataFrame `add`/`radd`/`abs` with pandas results."""

    # Non-monotonic label sets shared by the tests that need an unsorted axis.
    _SHUFFLED_LABELS_1 = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
    _SHUFFLED_LABELS_2 = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]

    def setUp(self):
        """Run the base-class setup, then create the Executor used by the tests."""
        super(Test, self).setUp()
        self.executor = Executor()

    def _assert_add_matches_pandas(self, data1, data2, chunk_size1, chunk_size2):
        """Chunk both pandas frames, `add` them, and compare with `data1 + data2`."""
        df1 = from_pandas(data1, chunk_size=chunk_size1)
        df2 = from_pandas(data2, chunk_size=chunk_size2)
        df3 = add(df1, df2)

        expected = data1 + data2
        result = self.executor.execute_dataframe(df3, concat=True)[0]

        pd.testing.assert_frame_equal(expected, result)

    @classmethod
    def _monotonic_index_frames(cls):
        """Return two 10x10 frames with monotonic row index and shuffled columns."""
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=list(cls._SHUFFLED_LABELS_1))
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=list(cls._SHUFFLED_LABELS_2))
        return data1, data2

    @classmethod
    def _monotonic_columns_frames(cls):
        """Return two 10x10 frames with shuffled row index and monotonic columns."""
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=list(cls._SHUFFLED_LABELS_1),
                             columns=np.arange(10))
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=list(cls._SHUFFLED_LABELS_2),
                             columns=np.arange(11, 1, -1))
        return data1, data2

    def testAddWithoutShuffleExecution(self):
        """Add two frames whose axes are all monotonic.

        data1 (chunk_size=5): index split into [0...4], [5...9],
        columns into [3...7], [8...12].
        data2 (chunk_size=6): index split into [11...6], [5...2],
        columns into [4...9], [10...13].
        """
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(10),
                             columns=np.arange(3, 13))
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1),
                             columns=np.arange(4, 14))
        self._assert_add_matches_pandas(data1, data2, 5, 6)

    def testAddWithOneShuffleExecution(self):
        """Add two frames where only one axis is monotonic, chunked on both axes."""
        # monotonic index, shuffled columns
        data1, data2 = self._monotonic_index_frames()
        self._assert_add_matches_pandas(data1, data2, 5, 6)

        # shuffled index, monotonic columns
        data1, data2 = self._monotonic_columns_frames()
        self._assert_add_matches_pandas(data1, data2, 5, 6)

    def testAddWithAllShuffleExecution(self):
        """Add two frames where no axis is monotonic."""
        data1 = pd.DataFrame(np.random.rand(10, 10),
                             index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                             columns=list(self._SHUFFLED_LABELS_1))
        data2 = pd.DataFrame(np.random.rand(10, 10),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=list(self._SHUFFLED_LABELS_2))
        self._assert_add_matches_pandas(data1, data2, 5, 6)

    def testAddBothWithOneChunk(self):
        """Add two frames where only one axis is monotonic, with chunk_size=10.

        The chunk size equals the 10x10 frame size on both axes.
        """
        # monotonic index, shuffled columns
        data1, data2 = self._monotonic_index_frames()
        self._assert_add_matches_pandas(data1, data2, 10, 10)

        # shuffled index, monotonic columns
        data1, data2 = self._monotonic_columns_frames()
        self._assert_add_matches_pandas(data1, data2, 10, 10)

    def testAddWithoutShuffleAndWithOneChunk(self):
        """Add frames split only along their monotonic axis.

        The non-monotonic axis gets a chunk size of 10, the full axis length.
        """
        # monotonic index is split (5 and 6 rows per chunk); columns are not
        data1, data2 = self._monotonic_index_frames()
        self._assert_add_matches_pandas(data1, data2, (5, 10), (6, 10))

        # monotonic columns are split (5 and 6 columns per chunk); index is not
        data1, data2 = self._monotonic_columns_frames()
        self._assert_add_matches_pandas(data1, data2, (10, 5), (10, 6))

    def testAddWithShuffleAndWithOneChunk(self):
        """Add frames split only along their non-monotonic axis.

        The monotonic axis gets a chunk size of 10, the full axis length.
        """
        # shuffled columns are split (5 and 6 columns per chunk); index is not
        data1, data2 = self._monotonic_index_frames()
        self._assert_add_matches_pandas(data1, data2, (10, 5), (10, 6))

        # shuffled index is split (5 and 6 rows per chunk); columns are not
        data1, data2 = self._monotonic_columns_frames()
        self._assert_add_matches_pandas(data1, data2, (5, 10), (6, 10))

    def testAddWithAdded(self):
        """Feed the result of one `add` into a second `add`."""
        data1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = from_pandas(data2, chunk_size=6)

        df3 = add(df1, df2)

        data4 = pd.DataFrame(np.random.rand(10, 10))
        df4 = from_pandas(data4, chunk_size=6)

        df5 = add(df3, df4)

        result = self.executor.execute_dataframe(df5, concat=True)[0]
        expected = data1 + data2 + data4

        pd.testing.assert_frame_equal(expected, result)

    def testRadd(self):
        """Invoke the reflected add operator directly."""
        data1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = from_pandas(data2, chunk_size=6)
        # `__radd__` is looked up on the right operand and, being a bound
        # method, takes only the left operand -- as Python does for `df1 + df2`.
        df3 = df2.__radd__(df1)
        result = self.executor.execute_dataframe(df3, concat=True)[0]
        expected = data1 + data2
        pd.testing.assert_frame_equal(expected, result)

    def testAddWithMultiForms(self):
        """Check the different spellings of add against the same pandas sum."""
        # test multiple forms of add
        # such as self+other, self.add(other), add(self,other)
        data1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = from_pandas(data2, chunk_size=6)

        expected = data1 + data2
        for summed in (df1 + df2, add(df1, df2), df1.add(df2), df1.radd(df2)):
            result = self.executor.execute_dataframe(summed, concat=True)[0]
            pd.testing.assert_frame_equal(expected, result)

    def testAbs(self):
        """`abs()` on a chunked frame matches `DataFrame.abs()` from pandas."""
        data1 = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)))
        df1 = from_pandas(data1, chunk_size=5)

        result = self.executor.execute_dataframe(abs(df1), concat=True)[0]
        expected = data1.abs()
        pd.testing.assert_frame_equal(expected, result)
# Example #10
# 0
class Test(TestBase):
    """Execution tests comparing DataFrame `merge` and `join` with pandas results."""

    def setUp(self):
        """Run the base-class setup, then create the Executor used by the tests."""
        super(Test, self).setUp()
        self.executor = Executor()

    def testMerge(self):
        """Run `merge` with several key specifications and compare with pandas."""
        left = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1,
                            columns=['a', 'b', 'c', 'd', 'e'])
        right = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1,
                             columns=['a', 'b', 'x', 'y'])

        mars_left = from_pandas(left, chunk_size=2)
        mars_right = from_pandas(right, chunk_size=2)

        # Note [Index of Merge]
        #
        # When `left_index` and `right_index` of `merge` are both false, pandas generates a RangeIndex
        # for the final result dataframe.
        #
        # The `left` and `right` dataframes are chunked, thus every result chunk has its own RangeIndex.
        # When the chunks are concatenated no new RangeIndex is generated for the result, so the index
        # values cannot match pandas. The content of the dataframe is still guaranteed to be correct.
        #
        # For that reason some cases below move a column into the index before comparing.

        # Each entry: (keyword arguments for `merge`, column moved into the index or None).
        merge_cases = [
            # merge without specifying any keys
            ({}, None),
            # merge on left index and `right_on`
            ({'how': 'left', 'right_on': 'x', 'left_index': True}, 'a_x'),
            # merge on `left_on` and right index
            ({'how': 'right', 'left_on': 'a', 'right_index': True}, 'a'),
            # merge on `left_on` and `right_on`
            ({'how': 'left', 'left_on': 'a', 'right_on': 'x'}, 'a_x'),
            # merge on `on`
            ({'how': 'right', 'on': 'a'}, 'a'),
            # merge on multiple columns
            ({'how': 'inner', 'on': ['a', 'b']}, None),
        ]

        for merge_kwargs, index_column in merge_cases:
            expected = left.merge(right, **merge_kwargs)
            merged = mars_left.merge(mars_right, **merge_kwargs)
            result = self.executor.execute_dataframe(merged, concat=True)[0]
            if index_column is not None:
                expected.set_index(index_column, inplace=True)
                result.set_index(index_column, inplace=True)
            pd.testing.assert_frame_equal(sort_dataframe_inplace(expected, 0),
                                          sort_dataframe_inplace(result, 0))

    def testJoin(self):
        """Run index-based `join` for every `how` option and compare with pandas."""
        left = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                            index=['a1', 'a2', 'a3'])
        right = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]],
                             index=['a1', 'b2', 'b3']) + 1
        right = pd.concat([right, right + 1])

        mars_left = from_pandas(left, chunk_size=2)
        mars_right = from_pandas(right, chunk_size=2)

        how_options = [
            {},                  # default `how`
            {'how': 'left'},
            {'how': 'right'},
            {'how': 'inner'},
            {'how': 'outer'},
        ]

        for how_kwargs in how_options:
            expected = left.join(right, lsuffix='l_', rsuffix='r_', **how_kwargs)
            joined = mars_left.join(mars_right, lsuffix='l_', rsuffix='r_',
                                    **how_kwargs)
            result = self.executor.execute_dataframe(joined, concat=True)[0]
            pd.testing.assert_frame_equal(expected.sort_index(),
                                          result.sort_index())

    def testJoinOn(self):
        """Run `join` with the `on` argument and compare with pandas."""
        left = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                            columns=['a1', 'a2', 'a3'])
        right = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]],
                             columns=['a1', 'b2', 'b3']) + 1
        right = pd.concat([right, right + 1])

        mars_left = from_pandas(left, chunk_size=2)
        mars_right = from_pandas(right, chunk_size=2)
        suffixes = {'lsuffix': '_l', 'rsuffix': '_r'}

        # `on=None`
        expected = left.join(right, on=None, **suffixes)
        joined = mars_left.join(mars_right, on=None, **suffixes)
        result = self.executor.execute_dataframe(joined, concat=True)[0]
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected, 0),
                                      sort_dataframe_inplace(result, 0))

        # left join on column `a1`
        expected = left.join(right, how='left', on='a1', **suffixes)
        joined = mars_left.join(mars_right, how='left', on='a1', **suffixes)
        result = self.executor.execute_dataframe(joined, concat=True)[0]

        # Note [Columns of Left Join]
        #
        # I believe we have no chance to obtain exactly the same result as pandas here.
        #
        # Look at the following example:
        #
        # >>> df1
        #     a1  a2  a3
        # 0   1   3   3
        # >>> df2
        #     a1  b2  b3
        # 1   2   6   7
        # >>> df3
        #     a1  b2  b3
        # 1   2   6   7
        # 1   2   6   7
        #
        # >>> df1.merge(df2, how='left', left_on='a1', left_index=False, right_index=True)
        #     a1_x  a2  a3  a1_y  b2  b3
        # 0   1   3   3     2   6   7
        # >>> df1.merge(df3, how='left', left_on='a1', left_index=False, right_index=True)
        #     a1  a1_x  a2  a3  a1_y  b2  b3
        # 0   1     1   3   3     2   6   7
        # 0   1     1   3   3     2   6   7
        #
        # Note that the result of `df1.merge(df3)` has an extra column `a1` compared to `df1.merge(df2)`.
        # The value of column `a1` is the same as `a1_x`, just because `1` occurs twice in the index of
        # `df3`. I haven't investigated why pandas behaves this way...
        #
        # We cannot yield the same result as pandas because `df3` is chunked: some of the result chunks
        # have 6 columns, others may have 7 columns, and when concatenated into one DataFrame some cells
        # of column `a1` will have value `NaN`, which is different from the result of pandas.
        #
        # But we can guarantee that the other effective columns have exactly the same values as pandas.

        columns_to_compare = joined.columns_value.to_pandas()

        pd.testing.assert_frame_equal(
            sort_dataframe_inplace(expected[columns_to_compare], 0, 1),
            sort_dataframe_inplace(result[columns_to_compare], 0, 1))

        # Note [Index of Join on EmptyDataFrame]
        #
        # It is non-trivial to get the same `index` result as pandas.
        #
        # Look at the following example:
        #
        # >>> df1
        #    a1  a2  a3
        # 1   4   2   6
        # >>> df2
        #    a1  b2  b3
        # 1   2   6   7
        # 2   8   9  10
        # >>> df3
        # Empty DataFrame
        # Columns: [a1, a2, a3]
        # Index: []
        # >>> df1.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        #       a1_l  a2   a3  a1_r  b2  b3
        # 1.0   4.0   2  6.0     8   9  10
        # NaN   NaN   1  NaN     2   6   7
        # >>> df3.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        #     a1_l  a2  a3  a1_r  b2  b3
        # 1   NaN   1 NaN     2   6   7
        # 2   NaN   2 NaN     8   9  10
        #
        # When the `left` dataframe is not empty, the mismatched rows in `right` have index value `NaN`,
        # and the matched rows have index value from `right`. When the `left` dataframe is empty, the
        # mismatched rows have index value from `right`.
        #
        # Since the `left` dataframe is chunked, it is not easy to obtain the same index values as pandas
        # in the final result dataframe, but the dataframe content is guaranteed to be correct.

        # Each entry: (`how`, whether column `a2` is moved into the index before comparing).
        remaining_cases = (('right', True), ('inner', False), ('outer', True))

        for how, reindex_on_key in remaining_cases:
            expected = left.join(right, how=how, on='a2', **suffixes)
            joined = mars_left.join(mars_right, how=how, on='a2', **suffixes)
            result = self.executor.execute_dataframe(joined, concat=True)[0]

            if reindex_on_key:
                expected.set_index('a2', inplace=True)
                result.set_index('a2', inplace=True)
            pd.testing.assert_frame_equal(sort_dataframe_inplace(expected, 0),
                                          sort_dataframe_inplace(result, 0))
# Example #11
# 0
class Test(TestBase):
    def setUp(self):
        """Run the base-class setup and create the executor used by the tests."""
        super(Test, self).setUp()
        # Every test method in this class runs its graphs through this instance.
        self.executor = Executor()

    def testFromPandasDataFrameExecution(self):
        """Round-trip a pandas DataFrame through ``from_pandas_df``.

        The frame is 20x30 with a two-level row index; the executed result
        must equal the original pandas object.
        """
        raw = pd.DataFrame(np.random.rand(20, 30),
                           index=[np.arange(20), np.arange(20, 0, -1)])

        mars_df = from_pandas_df(raw, chunk_size=(13, 21))
        executed = self.executor.execute_dataframe(mars_df, concat=True)

        # The first element of the returned sequence is compared to the input.
        pd.testing.assert_frame_equal(raw, executed[0])

    def testFromPandasSeriesExecution(self):
        """Round-trip a pandas Series through ``from_pandas_series``.

        The series has 20 values, a two-level index and the name ``'a'``; the
        executed result must equal the original pandas object.
        """
        raw = pd.Series(np.random.rand(20),
                        index=[np.arange(20), np.arange(20, 0, -1)],
                        name='a')

        mars_series = from_pandas_series(raw, chunk_size=13)
        executed = self.executor.execute_dataframe(mars_series, concat=True)

        # The first element of the returned sequence is compared to the input.
        pd.testing.assert_series_equal(raw, executed[0])

    def testInitializerExecution(self):
        """Build Mars objects with ``md.DataFrame`` / ``md.Series`` initializers.

        Each object is created from a pandas counterpart, executed, and
        compared against that counterpart.
        """
        # DataFrame initializer.
        raw_frame = pd.DataFrame(np.random.rand(20, 30),
                                 index=[np.arange(20), np.arange(20, 0, -1)])
        mars_frame = md.DataFrame(raw_frame, chunk_size=(15, 10))
        frame_out = self.executor.execute_dataframe(mars_frame, concat=True)
        pd.testing.assert_frame_equal(raw_frame, frame_out[0])

        # Series initializer.
        raw_series = pd.Series(np.random.rand(20),
                               index=[np.arange(20), np.arange(20, 0, -1)],
                               name='a')
        mars_series = md.Series(raw_series, chunk_size=7)
        series_out = self.executor.execute_dataframe(mars_series, concat=True)
        pd.testing.assert_series_equal(raw_series, series_out[0])

    def testSeriesFromTensor(self):
        """Create ``md.Series`` objects from Mars tensors and compare with pandas.

        Covers a named series, a series built from a tensor with an explicit
        ``chunk_size``, and a series built from ``mt.ones``.
        """
        values = np.random.rand(10)

        # Named series from a tensor created without an explicit chunk size.
        named = md.Series(mt.tensor(values), name='a')
        named_out = named.execute()
        pd.testing.assert_series_equal(named_out, pd.Series(values, name='a'))

        # Unnamed series from a tensor created with chunk_size=3.
        chunked = md.Series(mt.tensor(values, chunk_size=3))
        chunked_out = chunked.execute()
        pd.testing.assert_series_equal(chunked_out, pd.Series(values))

        # Series from a tensor of ones created with chunk_size=4.
        all_ones = md.Series(mt.ones((10, ), chunk_size=4))
        ones_out = all_ones.execute()
        pd.testing.assert_series_equal(ones_out, pd.Series(np.ones(10, )))

    def testFromTensorExecution(self):
        """Convert Mars tensors to DataFrames with ``dataframe_from_tensor``.

        Sections: default index/columns, explicit index and columns, a 1-d
        tensor, a tensor of ones, a tensor with a given index, and a tensor
        with given columns.
        """
        run_df = self.executor.execute_dataframe
        run_tensor = self.executor.execute_tensor

        # Default conversion: both axes are expected to be RangeIndex(0, 10).
        random_src = mt.random.rand(10, 10, chunk_size=5)
        default_df = dataframe_from_tensor(random_src)
        expected = pd.DataFrame(run_tensor(random_src, concat=True)[0])
        actual = run_df(default_df, concat=True)[0]
        pd.testing.assert_index_equal(actual.index, pd.RangeIndex(0, 10))
        pd.testing.assert_index_equal(actual.columns, pd.RangeIndex(0, 10))
        pd.testing.assert_frame_equal(actual, expected)

        # Explicit index and columns: only the axis labels are checked here.
        labelled_src = mt.random.rand(2, 2, chunk_size=1)
        labelled_df = dataframe_from_tensor(labelled_src,
                                            index=pd.Index(['a', 'b']),
                                            columns=pd.Index([3, 4]))
        actual = run_df(labelled_df, concat=True)[0]
        pd.testing.assert_index_equal(actual.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(actual.columns, pd.Index([3, 4]))

        # 1-d tensor input.
        vector_src = mt.array([1, 2, 3])
        actual = run_df(dataframe_from_tensor(vector_src), concat=True)[0]
        expected = pd.DataFrame(np.array([1, 2, 3]))
        pd.testing.assert_frame_equal(expected, actual)

        # Tensor of ones (the original comment calls this "identical chunks").
        ones_src = mt.ones((10, 10), chunk_size=3)
        actual = run_df(dataframe_from_tensor(ones_src), concat=True)[0]
        expected = pd.DataFrame(run_tensor(ones_src, concat=True)[0])
        pd.testing.assert_frame_equal(expected, actual)

        # Tensor with a caller-supplied index.
        indexed_src = mt.ones((10, 10), chunk_size=3)
        indexed_df = dataframe_from_tensor(indexed_src,
                                           index=np.arange(0, 20, 2))
        actual = run_df(indexed_df, concat=True)[0]
        expected = pd.DataFrame(run_tensor(indexed_src, concat=True)[0],
                                index=np.arange(0, 20, 2))
        pd.testing.assert_frame_equal(expected, actual)

        # Tensor with caller-supplied column labels.
        columned_src = mt.ones((10, 10), chunk_size=3)
        columned_df = dataframe_from_tensor(columned_src,
                                            columns=list('abcdefghij'))
        actual = run_df(columned_df, concat=True)[0]
        expected = pd.DataFrame(run_tensor(columned_src, concat=True)[0],
                                columns=list('abcdefghij'))
        pd.testing.assert_frame_equal(expected, actual)

    def testFromRecordsExecution(self):
        """Build DataFrames from structured arrays with ``from_records``.

        A Mars structured tensor and a numpy structured array are both
        converted and compared with ``pd.DataFrame.from_records``.
        """
        record_dtype = np.dtype([('x', 'int'), ('y', 'double'),
                                 ('z', '<U16')])

        np_records = np.ones((10, ), dtype=record_dtype)
        expected = pd.DataFrame.from_records(np_records,
                                             index=pd.RangeIndex(10))

        # First the Mars structured tensor, then the plain numpy array.
        sources = (
            mt.ones((10, ), dtype=record_dtype, chunk_size=3),
            np_records,
        )
        for source in sources:
            mars_df = from_records(source)
            actual = self.executor.execute_dataframe(mars_df, concat=True)[0]
            pd.testing.assert_frame_equal(actual, expected)

    def testReadCSVExecution(self):
        """Check ``md.read_csv`` against reference frames for several inputs.

        Each section writes data into a fresh temporary directory, reads it
        back through ``md.read_csv`` twice (once without and once with an
        explicit ``chunk_bytes``), compares both results with a reference
        frame, and removes the directory in a ``finally`` clause.
        """
        # test basic integer frame with default separator
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path)

            # pandas' own reader provides the expected result
            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            # NOTE(review): the small chunk_bytes presumably forces the file
            # to be read in several chunks -- confirm against md.read_csv
            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path,
                                                               index_col=0,
                                                               chunk_bytes=10),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        finally:
            shutil.rmtree(tempdir)

        # test sep
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path, sep=';')

            pdf = pd.read_csv(file_path, sep=';', index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              sep=';',
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path,
                                                               sep=';',
                                                               index_col=0,
                                                               chunk_bytes=10),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        finally:
            shutil.rmtree(tempdir)

        # test missing value
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            # NaN appears in a string column and in two numeric columns
            df = pd.DataFrame({
                'c1': [np.nan, 'a', 'b', 'c'],
                'c2': [1, 2, 3, np.nan],
                'c3': [np.nan, np.nan, 3.4, 2.2]
            })
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_path,
                                                               index_col=0,
                                                               chunk_bytes=12),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        finally:
            shutil.rmtree(tempdir)

        # test 100-row frame with a date-range index and mixed column types
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.csv')
        try:
            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                },
                index=index)
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(
                file_path, index_col=0, chunk_bytes=100),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        finally:
            shutil.rmtree(tempdir)

        # test compression
        tempdir = tempfile.mkdtemp()
        file_path = os.path.join(tempdir, 'test.gzip')
        try:
            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                },
                index=index)
            df.to_csv(file_path, compression='gzip')

            # compression is passed explicitly to both readers
            pdf = pd.read_csv(file_path, compression='gzip', index_col=0)
            mdf = self.executor.execute_dataframe(md.read_csv(
                file_path, compression='gzip', index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            # chunk_bytes is given as the string '1k' here rather than an int
            mdf2 = self.executor.execute_dataframe(md.read_csv(
                file_path, compression='gzip', index_col=0, chunk_bytes='1k'),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        finally:
            shutil.rmtree(tempdir)

        # test multiple files
        tempdir = tempfile.mkdtemp()
        try:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, 'test{}.csv'.format(i)) for i in range(3)
            ]
            # split the 300 rows into three consecutive 100-row files
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            # unlike the sections above, the result is compared with the
            # original in-memory frame, not with a pandas.read_csv result
            mdf = self.executor.execute_dataframe(md.read_csv(file_paths,
                                                              index_col=0),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf)

            mdf2 = self.executor.execute_dataframe(md.read_csv(file_paths,
                                                               index_col=0,
                                                               chunk_bytes=50),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2)

        finally:
            shutil.rmtree(tempdir)