コード例 #1
0
    def testAddWithOneShuffleExecution(self):
        """Execute ``add`` on frames where only one axis is monotonic.

        Two scenarios are run back to back: first the index is monotonic and
        the columns are unordered, then the roles of the two axes are swapped.
        In both, the executed result must equal the plain pandas sum.
        """
        scenarios = (
            # monotonic index (left split into [0...4], [5...9]; right into
            # [6...11], [2...5]) combined with unordered columns
            (dict(index=np.arange(10), columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7]),
             dict(index=np.arange(11, 1, -1), columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])),
            # unordered index combined with monotonic columns (same splits,
            # but now along the column axis)
            (dict(index=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7], columns=np.arange(10)),
             dict(index=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2], columns=np.arange(11, 1, -1))),
        )

        for left_axes, right_axes in scenarios:
            left_raw = pd.DataFrame(np.random.rand(10, 10), **left_axes)
            left = from_pandas(left_raw, chunk_size=5)
            right_raw = pd.DataFrame(np.random.rand(10, 10), **right_axes)
            right = from_pandas(right_raw, chunk_size=6)

            total = add(left, right)

            expected = left_raw + right_raw
            result = self.executor.execute_dataframe(total, concat=True)[0]

            pd.testing.assert_frame_equal(expected, result)
コード例 #2
0
ファイル: test_cluster.py プロジェクト: ueshin/mars
    def testFetchDataFrame(self, *_):
        """Check that ``session.fetch`` returns the same data as ``session.run``.

        Covers a first sum, a sum built on top of that sum, and finally a
        ``sum()`` reduction (compared as a series) of the second result.
        """
        from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.arithmetic import add

        cluster_options = dict(scheduler_n_process=2, worker_n_process=2,
                               shared_memory='20M', web=True)
        with new_cluster(**cluster_options) as cluster:
            sess = cluster.session

            left = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)), chunk_size=5)
            right = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)), chunk_size=6)

            # first level: left + right
            first_sum = add(left, right)
            ran = sess.run(first_sum, compose=False, timeout=_exec_timeout)
            fetched = sess.fetch(first_sum)
            pd.testing.assert_frame_equal(ran, fetched)

            # second level: reuse the already-run sum as an operand
            extra = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)), chunk_size=6)
            second_sum = add(first_sum, extra)
            ran = sess.run(second_sum, compose=False, timeout=_exec_timeout)
            fetched = sess.fetch(second_sum)
            pd.testing.assert_frame_equal(ran, fetched)

            # reduction of the second sum, compared as a series
            reduced = second_sum.sum()
            ran = sess.run(reduced, timeout=_exec_timeout)
            fetched = sess.fetch(reduced)
            pd.testing.assert_series_equal(ran, fetched)
コード例 #3
0
ファイル: test_arithmetic.py プロジェクト: zuodh/mars
    def testBothOneChunk(self):
        """Tile ``add`` when no axis is monotonic but each operand is one chunk.

        The result is expected to be a single chunk whose inputs are the
        operands' own chunks.
        """
        left_raw = pd.DataFrame(np.random.rand(10, 10), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                                columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left = from_pandas(left_raw, chunk_size=10)
        right_raw = pd.DataFrame(np.random.rand(10, 10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                                 columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right = from_pandas(right_raw, chunk_size=10)

        total = add(left, right)

        # metadata of the un-tiled result
        expected_columns = (left_raw + right_raw).columns
        pd.testing.assert_index_equal(total.columns.to_pandas(), expected_columns)
        self.assertTrue(total.columns.should_be_monotonic)
        self.assertIsInstance(total.index_value.value, IndexValue.Int64Index)
        self.assertTrue(total.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(total.index_value.to_pandas(), pd.Int64Index([]))
        for operand in (left, right):
            self.assertNotEqual(total.index_value.key, operand.index_value.key)
        # the columns are recorded on the result, so its width is known
        self.assertEqual(total.shape[1], 12)

        total.tiles()

        self.assertEqual(total.chunk_shape, (1, 1))
        left_data = left.chunks[0].data
        right_data = right.chunks[0].data
        for chunk in total.chunks:
            self.assertIsInstance(chunk.op, DataFrameAdd)
            self.assertEqual(len(chunk.inputs), 2)
            # first input is the left operand's chunk, second the right's
            self.assertIs(chunk.inputs[0], left_data)
            self.assertIs(chunk.inputs[1], right_data)
コード例 #4
0
ファイル: test_arithmetic.py プロジェクト: zuodh/mars
    def testAddSelf(self):
        """Tile ``add(df, df)`` and check the result reuses the operand's layout.

        Index and column keys are expected to equal the operand's, and every
        result chunk takes the same operand chunk as both inputs.
        """
        raw = pd.DataFrame(np.random.rand(10, 10), index=np.random.randint(-100, 100, size=(10,)),
                           columns=[np.random.bytes(10) for _ in range(10)])
        frame = from_pandas(raw, chunk_size=3)
        doubled = add(frame, frame)

        # metadata of the un-tiled result
        expected_columns = (raw + raw).columns
        pd.testing.assert_index_equal(doubled.columns.to_pandas(), expected_columns)
        self.assertTrue(doubled.columns.should_be_monotonic)
        self.assertIsInstance(doubled.index_value.value, IndexValue.Int64Index)
        self.assertTrue(doubled.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(doubled.index_value.to_pandas(), pd.Int64Index([]))
        self.assertEqual(doubled.index_value.key, frame.index_value.key)
        self.assertEqual(doubled.columns.key, frame.columns.key)
        self.assertEqual(doubled.shape[1], 10)

        doubled.tiles()

        self.assertEqual(doubled.chunk_shape, frame.chunk_shape)
        for chunk in doubled.chunks:
            self.assertIsInstance(chunk.op, DataFrameAdd)
            self.assertEqual(len(chunk.inputs), 2)
            # both sides must be the operand chunk at the same position
            for side in (0, 1):
                self.assertIs(chunk.inputs[side], frame.cix[chunk.index].data)
コード例 #5
0
ファイル: test_arithmetic.py プロジェクト: zuodh/mars
    def testAddScalar(self):
        """Check result metadata for every spelling of DataFrame + scalar.

        Also checks that adding an object that only defines ``__radd__``
        yields that object's ``__radd__`` result.
        """
        raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                           columns=np.arange(3, 13))
        frame = from_pandas(raw, chunk_size=5)

        variants = [
            add(frame, 1),   # module-level function
            frame.add(1),    # method form
            frame.radd(1),   # reflected method form
            frame + 1,       # operator form
            1 + frame,       # reflected operator form
        ]
        for variant in variants:
            pd.testing.assert_index_equal(variant.columns.to_pandas(), raw.columns)
            self.assertIsInstance(variant.index_value.value, IndexValue.Int64Index)

        # test NotImplemented, use other's radd instead
        class TestRadd:
            def __radd__(self, other):
                return 1

        self.assertEqual(frame + TestRadd(), 1)
コード例 #6
0
    def testAddWithAdded(self):
        """Execute an ``add`` whose left operand is itself the result of an ``add``."""
        raw_a = pd.DataFrame(np.random.rand(10, 10))
        frame_a = from_pandas(raw_a, chunk_size=5)
        raw_b = pd.DataFrame(np.random.rand(10, 10))
        frame_b = from_pandas(raw_b, chunk_size=6)

        inner_sum = add(frame_a, frame_b)

        raw_c = pd.DataFrame(np.random.rand(10, 10))
        frame_c = from_pandas(raw_c, chunk_size=6)

        outer_sum = add(inner_sum, frame_c)

        result = self.executor.execute_dataframe(outer_sum, concat=True)[0]

        pd.testing.assert_frame_equal(raw_a + raw_b + raw_c, result)
コード例 #7
0
    def testAddScalar(self):
        """Execute every spelling of DataFrame + scalar and scalar + DataFrame."""
        raw = pd.DataFrame(np.random.rand(10, 10))
        frame = from_pandas(raw, chunk_size=2)

        def run(tileable):
            # execute and return the single concatenated pandas result
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # test dataframe + scalar
        frame_plus_one = raw + 1
        pd.testing.assert_frame_equal(frame_plus_one, run(add(frame, 1)))
        pd.testing.assert_frame_equal(frame_plus_one, run(frame + 1))
        pd.testing.assert_frame_equal(frame_plus_one, run(frame.add(1)))

        # test scalar + dataframe
        pd.testing.assert_frame_equal(frame_plus_one, run(add(1, frame)))

        one_plus_frame = 1 + raw
        pd.testing.assert_frame_equal(one_plus_frame, run(1 + frame))
        pd.testing.assert_frame_equal(one_plus_frame, run(frame.radd(1)))
コード例 #8
0
    def testDataFrame(self):
        """With ``eager_mode`` enabled, check ``fetch()`` on sources and on their sum."""
        with option_context({'eager_mode': True}):
            from mars.dataframe.arithmetic import add

            raw_left = pd.DataFrame(np.random.rand(10, 10))
            left = from_pandas(raw_left, chunk_size=5)
            pd.testing.assert_frame_equal(left.fetch(), raw_left)

            raw_right = pd.DataFrame(np.random.rand(10, 10))
            right = from_pandas(raw_right, chunk_size=6)
            pd.testing.assert_frame_equal(right.fetch(), raw_right)

            total = add(left, right)
            pd.testing.assert_frame_equal(total.fetch(), raw_left + raw_right)
コード例 #9
0
    def testAddWithAllShuffleExecution(self):
        """Execute ``add`` on two frames where neither axis is monotonic."""
        left_raw = pd.DataFrame(
            np.random.rand(10, 10),
            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
        )
        left = from_pandas(left_raw, chunk_size=5)
        right_raw = pd.DataFrame(
            np.random.rand(10, 10),
            index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2],
        )
        right = from_pandas(right_raw, chunk_size=6)

        total = add(left, right)

        expected = left_raw + right_raw
        result = self.executor.execute_dataframe(total, concat=True)[0]

        pd.testing.assert_frame_equal(expected, result)
コード例 #10
0
    def testAddWithShuffleOnStringIndex(self):
        """Execute ``add`` where no axis is monotonic and index labels are strings."""
        left_labels = list(map(str, [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]))
        left_raw = pd.DataFrame(np.random.rand(10, 10), index=left_labels,
                                columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left = from_pandas(left_raw, chunk_size=5)

        right_labels = list(map(str, [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]))
        right_raw = pd.DataFrame(np.random.rand(10, 10), index=right_labels,
                                 columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right = from_pandas(right_raw, chunk_size=6)

        total = add(left, right)

        expected = left_raw + right_raw
        result = self.executor.execute_dataframe(total, concat=True)[0]

        pd.testing.assert_frame_equal(expected, result)
コード例 #11
0
    def testAddWithMultiForms(self):
        """Execute the different spellings of DataFrame + DataFrame.

        Covers ``self + other``, ``add(self, other)``, ``self.add(other)`` and
        ``self.radd(other)``; each is compared against the same pandas sum.
        """
        left_raw = pd.DataFrame(np.random.rand(10, 10))
        left = from_pandas(left_raw, chunk_size=5)
        right_raw = pd.DataFrame(np.random.rand(10, 10))
        right = from_pandas(right_raw, chunk_size=6)

        def run(tileable):
            # execute and return the single concatenated pandas result
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        expected = left_raw + right_raw
        pd.testing.assert_frame_equal(expected, run(left + right))
        pd.testing.assert_frame_equal(expected, run(add(left, right)))
        pd.testing.assert_frame_equal(expected, run(left.add(right)))
        pd.testing.assert_frame_equal(expected, run(left.radd(right)))
コード例 #12
0
    def testAddWithoutShuffleExecution(self):
        """Execute ``add`` on two frames whose index and columns are all monotonic."""
        # left operand: index split into [0...4], [5...9],
        # columns into [3...7], [8...12]
        left_raw = pd.DataFrame(
            np.random.rand(10, 10),
            index=np.arange(10),
            columns=np.arange(3, 13),
        )
        left = from_pandas(left_raw, chunk_size=5)
        # right operand: descending index split into [6...11], [2...5],
        # columns into [4...9], [10...13]
        right_raw = pd.DataFrame(
            np.random.rand(10, 10),
            index=np.arange(11, 1, -1),
            columns=np.arange(4, 14),
        )
        right = from_pandas(right_raw, chunk_size=6)

        total = add(left, right)

        expected = left_raw + right_raw
        result = self.executor.execute_dataframe(total, concat=True)[0]

        pd.testing.assert_frame_equal(expected, result)
コード例 #13
0
ファイル: test_arithmetic.py プロジェクト: zuodh/mars
    def testAddIdenticalIndexAndColumns(self):
        """Tile ``add`` for two frames built with the same index and columns.

        The result is expected to keep the operands' index key, and each
        result chunk should read the operands' chunks at the same position.
        """
        left_raw = pd.DataFrame(np.random.rand(10, 10),
                                columns=np.arange(3, 13))
        left = from_pandas(left_raw, chunk_size=5)
        right_raw = pd.DataFrame(np.random.rand(10, 10),
                                 columns=np.arange(3, 13))
        right = from_pandas(right_raw, chunk_size=5)

        total = add(left, right)

        # metadata of the un-tiled result
        expected_columns = (left_raw + right_raw).columns
        pd.testing.assert_index_equal(total.columns.to_pandas(), expected_columns)
        self.assertTrue(total.columns.should_be_monotonic)
        self.assertIsInstance(total.index_value.value, IndexValue.RangeIndex)
        self.assertTrue(total.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(total.index_value.to_pandas(), pd.RangeIndex(0, 10))
        for operand in (left, right):
            self.assertEqual(total.index_value.key, operand.index_value.key)
        # the columns are recorded on the result, so the full shape is known
        self.assertEqual(total.shape, (10, 10))

        total.tiles()

        self.assertEqual(total.chunk_shape, (2, 2))
        for chunk in total.chunks:
            left_chunk = left.cix[chunk.index]
            right_chunk = right.cix[chunk.index]

            self.assertIsInstance(chunk.op, DataFrameAdd)
            self.assertEqual(len(chunk.inputs), 2)
            self.assertEqual(chunk.shape, (5, 5))
            self.assertEqual(chunk.index_value.key, left_chunk.index_value.key)
            self.assertEqual(chunk.index_value.key, right_chunk.index_value.key)
            self.assertEqual(chunk.columns.key, left_chunk.columns.key)
            self.assertEqual(chunk.columns.key, right_chunk.columns.key)
            pd.testing.assert_index_equal(chunk.columns.to_pandas(), left_chunk.columns.to_pandas())
            pd.testing.assert_index_equal(chunk.columns.to_pandas(), right_chunk.columns.to_pandas())
            pd.testing.assert_index_equal(chunk.dtypes.index, left_chunk.columns.to_pandas())

            # inputs are the operands' chunks, left first
            self.assertIs(chunk.inputs[0], left_chunk.data)
            self.assertIs(chunk.inputs[1], right_chunk.data)
コード例 #14
0
ファイル: test_arithmetic.py プロジェクト: zuodh/mars
    def testAddWithAllShuffle(self):
        """Tile ``add`` when neither axis of either operand is monotonic.

        Two operand pairs are checked: integer-labelled frames chunked by 5
        and 6 (expected to tile into a 2x2 result), and frames with random
        integer index / random bytes columns chunked by 3 (expected 4x4).
        For every result chunk, both inputs must be align-reduce chunks fed
        by a shuffle proxy whose inputs are align-map chunks over the
        operand's own chunks.
        """
        def check_result_meta(result, left, right, expected_columns, n_columns):
            """Verify the un-tiled result's columns and index metadata."""
            pd.testing.assert_index_equal(result.columns.to_pandas(), expected_columns)
            self.assertTrue(result.columns.should_be_monotonic)
            self.assertIsInstance(result.index_value.value, IndexValue.Int64Index)
            self.assertTrue(result.index_value.should_be_monotonic)
            pd.testing.assert_index_equal(result.index_value.to_pandas(), pd.Int64Index([]))
            self.assertNotEqual(result.index_value.key, left.index_value.key)
            self.assertNotEqual(result.index_value.key, right.index_value.key)
            # columns is recorded, so we can get it
            self.assertEqual(result.shape[1], n_columns)

        def check_aligned_input(chunk, side, source, shuffle_size, proxy_keys):
            """Verify input ``side`` (0 = left, 1 = right) of an add chunk.

            ``source`` is the operand the input was derived from and
            ``shuffle_size`` the expected shuffle size on both axes. The key
            of the shuffle proxy op is collected into ``proxy_keys``.
            """
            reduce_chunk = chunk.inputs[side]
            self.assertIsInstance(reduce_chunk.op, DataFrameIndexAlignReduce)
            proxy_chunk = reduce_chunk.inputs[0]
            expect_dtypes = pd.concat([hash_dtypes(mc.inputs[0].op.data.dtypes, shuffle_size)[chunk.index[1]]
                                       for mc in proxy_chunk.inputs if mc.index[0] == 0])
            pd.testing.assert_series_equal(reduce_chunk.dtypes, expect_dtypes)
            pd.testing.assert_index_equal(reduce_chunk.columns.to_pandas(), reduce_chunk.dtypes.index)
            # NOTE: the original right-side sections re-checked inputs[0] here;
            # the check now targets the side actually under test.
            self.assertIsInstance(reduce_chunk.index_value.to_pandas(), index_type)
            self.assertIsInstance(proxy_chunk.op, DataFrameShuffleProxy)
            proxy_keys.add(proxy_chunk.op.key)
            for map_chunk, source_chunk in zip(proxy_chunk.inputs, source.chunks):
                self.assertIsInstance(map_chunk.op, DataFrameIndexAlignMap)
                self.assertEqual(map_chunk.op.index_shuffle_size, shuffle_size)
                self.assertIsInstance(map_chunk.index_value.to_pandas(), index_type)
                self.assertEqual(map_chunk.op.column_shuffle_size, shuffle_size)
                self.assertIsNotNone(map_chunk.columns)
                shuffle_segments = map_chunk.op.column_shuffle_segments
                expected_shuffle_segments = hash_dtypes(source_chunk.data.dtypes, shuffle_size)
                self.assertEqual(len(shuffle_segments), len(expected_shuffle_segments))
                for ss, ess in zip(shuffle_segments, expected_shuffle_segments):
                    pd.testing.assert_series_equal(ss, ess)
                self.assertIs(map_chunk.inputs[0], source_chunk.data)

        def check_tiled(result, left, right, chunk_shape, shuffle_size):
            """Tile ``result`` and verify every chunk and both of its inputs."""
            result.tiles()

            self.assertEqual(result.chunk_shape, chunk_shape)
            proxy_keys = set()
            for c in result.chunks:
                self.assertIsInstance(c.op, DataFrameAdd)
                self.assertEqual(len(c.inputs), 2)
                check_aligned_input(c, 0, left, shuffle_size, proxy_keys)
                check_aligned_input(c, 1, right, shuffle_size, proxy_keys)

            # the test expects exactly two distinct proxy op keys overall
            self.assertEqual(len(proxy_keys), 2)

        # no axis is monotonic
        data1 = pd.DataFrame(np.random.rand(10, 10), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas(data2, chunk_size=6)

        # index class every aligned chunk is expected to report; as in the
        # original test it is taken from data1 for both operand pairs
        index_type = type(data1.index)

        df3 = add(df1, df2)

        check_result_meta(df3, df1, df2, (data1 + data2).columns, 12)
        check_tiled(df3, df1, df2, (2, 2), 2)

        # second pair: random integer index and random bytes column labels
        data4 = pd.DataFrame(np.random.rand(10, 10), index=np.random.randint(-100, 100, size=(10,)),
                             columns=[np.random.bytes(10) for _ in range(10)])
        df4 = from_pandas(data4, chunk_size=3)

        data5 = pd.DataFrame(np.random.rand(10, 10), index=np.random.randint(-100, 100, size=(10,)),
                             columns=[np.random.bytes(10) for _ in range(10)])
        df5 = from_pandas(data5, chunk_size=3)

        df6 = add(df4, df5)

        check_result_meta(df6, df4, df5, (data4 + data5).columns, 20)
        check_tiled(df6, df4, df5, (4, 4), 4)
コード例 #15
0
    def testMainDataFrameWithoutEtcd(self):
        """Submit DataFrame/Series graphs to a session started without etcd.

        Three ``add`` graphs are run (default labels; monotonic index with
        unordered columns; no monotonic axis) followed by a plain series
        source. Each must finish in ``GraphState.SUCCEEDED`` and the fetched
        result must equal the pandas computation.
        """
        import pandas as pd
        from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.datasource.series import from_pandas as from_pandas_series
        from mars.dataframe.arithmetic import add

        self.start_processes(etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        def run_and_fetch(tileable):
            # submit the tileable's graph, wait for success, return the loaded result
            graph = tileable.build_graph()
            graph_key = uuid.uuid1()
            session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                              graph_key, target_tileables=[tileable.key])

            state = self.wait_for_termination(actor_client, session_ref, graph_key)
            self.assertEqual(state, GraphState.SUCCEEDED)

            return loads(session_ref.fetch_result(graph_key, tileable.key))

        # default labels on both axes
        left_raw = pd.DataFrame(np.random.rand(10, 10))
        left = from_pandas_df(left_raw, chunk_size=5)
        right_raw = pd.DataFrame(np.random.rand(10, 10))
        right = from_pandas_df(right_raw, chunk_size=6)
        pd.testing.assert_frame_equal(left_raw + right_raw, run_and_fetch(add(left, right)))

        # monotonic index, unordered columns, chunked along columns only
        left_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                                columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left = from_pandas_df(left_raw, chunk_size=(10, 5))
        right_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                                 columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right = from_pandas_df(right_raw, chunk_size=(10, 6))
        pd.testing.assert_frame_equal(left_raw + right_raw, run_and_fetch(add(left, right)))

        # neither axis monotonic
        left_raw = pd.DataFrame(np.random.rand(10, 10), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                                columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left = from_pandas_df(left_raw, chunk_size=5)
        right_raw = pd.DataFrame(np.random.rand(10, 10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                                 columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right = from_pandas_df(right_raw, chunk_size=6)
        pd.testing.assert_frame_equal(left_raw + right_raw, run_and_fetch(add(left, right)))

        # a series source on its own
        raw_series = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series = from_pandas_series(raw_series)
        pd.testing.assert_series_equal(raw_series, run_and_fetch(series))
コード例 #16
0
ファイル: test_arithmetic.py プロジェクト: zuodh/mars
    def testWithShuffleAndOneChunk(self):
        """Tile ``add`` when the index needs a shuffle but columns are one chunk.

        Both operands are chunked along rows only (``(5, 10)`` and
        ``(6, 10)``), and the result is expected to tile into a 2x1 layout.
        Each result chunk's inputs must be align-reduce chunks fed by a
        shuffle proxy; the align-map ops below it carry the column min/max
        bounds and no column shuffle size.
        """
        # no axis is monotonic
        data1 = pd.DataFrame(np.random.rand(10, 10), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas(data1, chunk_size=(5, 10))
        data2 = pd.DataFrame(np.random.rand(10, 10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas(data2, chunk_size=(6, 10))

        df3 = add(df1, df2)

        # test df3's index and columns
        pd.testing.assert_index_equal(df3.columns.to_pandas(), (data1 + data2).columns)
        self.assertTrue(df3.columns.should_be_monotonic)
        self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
        self.assertTrue(df3.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([]))
        self.assertNotEqual(df3.index_value.key, df1.index_value.key)
        self.assertNotEqual(df3.index_value.key, df2.index_value.key)
        self.assertEqual(df3.shape[1], 12)  # columns is recorded, so we can get it

        df3.tiles()

        # index class every aligned chunk is expected to report
        index_type = type(data1.index)

        def check_aligned_input(chunk, side, source, proxy_keys):
            """Verify input ``side`` (0 = left, 1 = right) of an add chunk.

            ``source`` is the operand the input was derived from. The key of
            the shuffle proxy op is collected into ``proxy_keys``.
            """
            reduce_chunk = chunk.inputs[side]
            self.assertIsInstance(reduce_chunk.op, DataFrameIndexAlignReduce)
            proxy_chunk = reduce_chunk.inputs[0]
            expect_dtypes = pd.concat([mc.inputs[0].op.data.dtypes
                                       for mc in proxy_chunk.inputs if mc.index[0] == 0])
            pd.testing.assert_series_equal(reduce_chunk.dtypes, expect_dtypes)
            pd.testing.assert_index_equal(reduce_chunk.columns.to_pandas(), reduce_chunk.dtypes.index)
            # NOTE: the original right-side section re-checked inputs[0] here;
            # the check now targets the side actually under test.
            self.assertIsInstance(reduce_chunk.index_value.to_pandas(), index_type)
            self.assertIsInstance(proxy_chunk.op, DataFrameShuffleProxy)
            proxy_keys.add(proxy_chunk.op.key)
            for map_chunk, source_chunk in zip(proxy_chunk.inputs, source.chunks):
                map_op = map_chunk.op
                self.assertIsInstance(map_op, DataFrameIndexAlignMap)
                self.assertEqual(map_op.index_shuffle_size, 2)
                self.assertIsInstance(map_chunk.index_value.to_pandas(), index_type)
                # column bounds are copied from the source chunk's columns
                self.assertEqual(map_op.column_min, source_chunk.columns.min_val)
                self.assertEqual(map_op.column_min_close, source_chunk.columns.min_val_close)
                self.assertEqual(map_op.column_max, source_chunk.columns.max_val)
                self.assertEqual(map_op.column_max_close, source_chunk.columns.max_val_close)
                # assertIsNone's second positional argument is the failure
                # message, so the original's stray ``None`` was dropped
                self.assertIsNone(map_op.column_shuffle_size)
                self.assertIsNotNone(map_chunk.columns)
                self.assertIs(map_chunk.inputs[0], source_chunk.data)

        self.assertEqual(df3.chunk_shape, (2, 1))
        proxy_keys = set()
        for c in df3.chunks:
            self.assertIsInstance(c.op, DataFrameAdd)
            self.assertEqual(len(c.inputs), 2)
            check_aligned_input(c, 0, df1, proxy_keys)
            check_aligned_input(c, 1, df2, proxy_keys)

        # the test expects exactly two distinct proxy op keys overall
        self.assertEqual(len(proxy_keys), 2)
コード例 #17
0
ファイル: test_arithmetic.py プロジェクト: zuodh/mars
    def testAddWithOneShuffle(self):
        """Check the tiled graph of ``add`` when only the index axis is monotonic.

        Both operands have a monotonic index and unordered columns.  The test
        expects every output chunk to read each side through a
        ``DataFrameIndexAlignReduce`` fed by a ``DataFrameShuffleProxy``, with
        rows selected by min/max bounds and columns shuffled into 2 partitions.
        """
        # only 1 axis is monotonic
        # data1 with index split into [0...4], [5...9],
        data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                             columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas(data1, chunk_size=5)
        # data2 has a decreasing index, split into [11...6], [5...2]
        data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas(data2, chunk_size=6)

        df3 = add(df1, df2)

        # test df3's index and columns
        pd.testing.assert_index_equal(df3.columns.to_pandas(), (data1 + data2).columns)
        self.assertTrue(df3.columns.should_be_monotonic)
        self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
        self.assertTrue(df3.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([]))
        self.assertNotEqual(df3.index_value.key, df1.index_value.key)
        self.assertNotEqual(df3.index_value.key, df2.index_value.key)
        self.assertEqual(df3.shape[1], 12)  # columns is recorded, so we can get it

        df3.tiles()

        # (min, min_close, max, max_close) of each row chunk, in ascending order
        data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)]
        data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)]

        # NOTE(review): the boolean flags presumably mean "index is increasing"
        # (True for data1, False for data2) -- confirm against the helper.
        left_index_splits, right_index_splits = split_monotonic_index_min_max(
            data1_index_min_max, True, data2_index_min_max, False)

        left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits)
        right_index_idx_to_original_idx = build_split_idx_to_origin_idx(right_index_splits, False)

        self.assertEqual(df3.chunk_shape, (7, 2))
        for c in df3.chunks:
            self.assertIsInstance(c.op, DataFrameAdd)
            self.assertEqual(len(c.inputs), 2)
            idx = c.index
            # test the left side
            self.assertIsInstance(c.inputs[0].op, DataFrameIndexAlignReduce)
            expect_dtypes = pd.concat([hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]]
                                       for ic in c.inputs[0].inputs[0].inputs])
            pd.testing.assert_series_equal(c.inputs[0].dtypes, expect_dtypes)
            pd.testing.assert_index_equal(c.inputs[0].columns.to_pandas(), c.inputs[0].dtypes.index)
            self.assertIsInstance(c.inputs[0].index_value.to_pandas(), type(data1.index))
            self.assertIsInstance(c.inputs[0].inputs[0].op, DataFrameShuffleProxy)
            left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]]
            left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx]
            # source chunks of df1 that lie in the originating row
            ics = [ic for ic in df1.chunks if ic.index[0] == left_row_idx]
            for j, ci, ic in zip(itertools.count(0), c.inputs[0].inputs[0].inputs, ics):
                self.assertIsInstance(ci.op, DataFrameIndexAlignMap)
                self.assertEqual(ci.index, (idx[0], j))
                self.assertEqual(ci.op.index_min, left_index_min_max[0])
                self.assertEqual(ci.op.index_min_close, left_index_min_max[1])
                self.assertEqual(ci.op.index_max, left_index_min_max[2])
                self.assertEqual(ci.op.index_max_close, left_index_min_max[3])
                self.assertIsInstance(ci.index_value.to_pandas(), type(data1.index))
                # assertEqual, not assertTrue: assertTrue(x, 2) would treat 2 as
                # the failure message and only check that x is truthy
                self.assertEqual(ci.op.column_shuffle_size, 2)
                shuffle_segments = ci.op.column_shuffle_segments
                expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2)
                self.assertEqual(len(shuffle_segments), len(expected_shuffle_segments))
                for ss, ess in zip(shuffle_segments, expected_shuffle_segments):
                    pd.testing.assert_series_equal(ss, ess)
                self.assertIs(ci.inputs[0], ic.data)
            # test the right side
            self.assertIsInstance(c.inputs[1].op, DataFrameIndexAlignReduce)
            expect_dtypes = pd.concat([hash_dtypes(ic.inputs[0].op.data.dtypes, 2)[c.index[1]]
                                       for ic in c.inputs[1].inputs[0].inputs])
            pd.testing.assert_series_equal(c.inputs[1].dtypes, expect_dtypes)
            pd.testing.assert_index_equal(c.inputs[1].columns.to_pandas(), c.inputs[1].dtypes.index)
            self.assertIsInstance(c.inputs[1].index_value.to_pandas(), type(data1.index))
            self.assertIsInstance(c.inputs[1].inputs[0].op, DataFrameShuffleProxy)
            right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]]
            right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx]
            # source chunks of df2 that lie in the originating row
            ics = [ic for ic in df2.chunks if ic.index[0] == right_row_idx]
            for j, ci, ic in zip(itertools.count(0), c.inputs[1].inputs[0].inputs, ics):
                self.assertIsInstance(ci.op, DataFrameIndexAlignMap)
                self.assertEqual(ci.index, (idx[0], j))
                self.assertEqual(ci.op.index_min, right_index_min_max[0])
                self.assertEqual(ci.op.index_min_close, right_index_min_max[1])
                self.assertEqual(ci.op.index_max, right_index_min_max[2])
                self.assertEqual(ci.op.index_max_close, right_index_min_max[3])
                # same fix as on the left side: compare the value, not truthiness
                self.assertEqual(ci.op.column_shuffle_size, 2)
                shuffle_segments = ci.op.column_shuffle_segments
                expected_shuffle_segments = hash_dtypes(ic.data.dtypes, 2)
                self.assertEqual(len(shuffle_segments), len(expected_shuffle_segments))
                for ss, ess in zip(shuffle_segments, expected_shuffle_segments):
                    pd.testing.assert_series_equal(ss, ess)
                self.assertIs(ci.inputs[0], ic.data)

        # make sure shuffle proxies' key are different:
        # one proxy per side per output row, shared by all chunks in that row
        proxy_keys = set()
        for i in range(df3.chunk_shape[0]):
            cs = [c for c in df3.chunks if c.index[0] == i]
            lps = {c.inputs[0].inputs[0].op.key for c in cs}
            self.assertEqual(len(lps), 1)
            proxy_keys.add(lps.pop())
            rps = {c.inputs[1].inputs[0].op.key for c in cs}
            self.assertEqual(len(rps), 1)
            proxy_keys.add(rps.pop())
        self.assertEqual(len(proxy_keys), 2 * df3.chunk_shape[0])
コード例 #18
0
ファイル: test_arithmetic.py プロジェクト: zuodh/mars
    def testWithoutShuffleAndWithOneChunk(self):
        """Check the tiled graph of ``add`` when each operand has one column chunk.

        The index is monotonic and the columns are not, but every operand keeps
        all of its columns in a single chunk.  The test expects each output
        chunk to read both sides through a plain ``DataFrameIndexAlignMap``
        whose column bounds are those of the source chunk.
        """
        # monotonic increasing index, chunked by rows into [0...4], [5...9]
        left_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                                columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = from_pandas(left_raw, chunk_size=(5, 10))
        # monotonic decreasing index, chunked by rows into [11...6], [5...2]
        right_raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                                 columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = from_pandas(right_raw, chunk_size=(6, 10))

        df3 = add(df1, df2)

        # metadata of the result before tiling
        pd.testing.assert_index_equal(df3.columns.to_pandas(), (left_raw + right_raw).columns)
        self.assertTrue(df3.columns.should_be_monotonic)
        self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
        self.assertTrue(df3.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([]))
        for operand in (df1, df2):
            self.assertNotEqual(df3.index_value.key, operand.index_value.key)
        self.assertEqual(df3.shape[1], 12)  # columns is recorded, so we can get it

        df3.tiles()

        # (min, min_close, max, max_close) per row chunk of each operand
        left_row_ranges = [(0, True, 4, True), (5, True, 9, True)]
        right_row_ranges = [(2, True, 5, True), (6, True, 11, True)]

        left_splits, right_splits = split_monotonic_index_min_max(
            left_row_ranges, True, right_row_ranges, False)

        left_lookup = build_split_idx_to_origin_idx(left_splits)
        right_lookup = build_split_idx_to_origin_idx(right_splits, False)

        index_attrs = ('index_min', 'index_min_close', 'index_max', 'index_max_close')
        column_attrs = (('column_min', 'min_val'),
                        ('column_min_close', 'min_val_close'),
                        ('column_max', 'max_val'),
                        ('column_max_close', 'max_val_close'))
        # (input position, tiled operand, raw pandas data, row splits, split lookup)
        sides = ((0, df1, left_raw, left_splits, left_lookup),
                 (1, df2, right_raw, right_splits, right_lookup))

        self.assertEqual(df3.chunk_shape, (7, 1))
        for out_chunk in df3.chunks:
            self.assertIsInstance(out_chunk.op, DataFrameAdd)
            self.assertEqual(len(out_chunk.inputs), 2)
            out_row = out_chunk.index[0]
            # left side first, then right side
            for pos, operand, raw, splits, lookup in sides:
                aligned = out_chunk.inputs[pos]
                self.assertIsInstance(aligned.op, DataFrameIndexAlignMap)
                origin_row, inner_row = lookup[out_row]
                # the single column chunk of the originating row
                origin_chunk = operand.cix[origin_row, 0].data
                self.assertIs(aligned.inputs[0], origin_chunk)
                row_range = splits[origin_row][inner_row]
                for k, attr in enumerate(index_attrs):
                    self.assertEqual(getattr(aligned.op, attr), row_range[k])
                self.assertIsInstance(aligned.index_value.to_pandas(), type(raw.index))
                for op_attr, col_attr in column_attrs:
                    self.assertEqual(getattr(aligned.op, op_attr),
                                     getattr(origin_chunk.columns, col_attr))
                origin_columns = origin_chunk.columns
                pd.testing.assert_index_equal(aligned.columns.to_pandas(), origin_columns.to_pandas())
                pd.testing.assert_index_equal(aligned.dtypes.index, origin_columns.to_pandas())
コード例 #19
0
ファイル: test_cluster.py プロジェクト: ueshin/mars
    def testEagerMode(self, *_):
        """Exercise eager mode against a local cluster and its web endpoint.

        Three phases run inside one ``new_cluster`` context:

        1. tensor operations with the cluster's own default session,
        2. the same tensor operations through a web session,
        3. DataFrame / Series operations through another web session.

        Within ``option_context({'eager_mode': True})`` results are compared
        directly, without explicit ``execute()`` calls.  Outside of it,
        ``fetch()`` on a freshly created tensor is expected to raise
        ``ValueError``.

        NOTE(review): ``*_`` presumably absorbs arguments injected by
        decorators that are not visible here -- confirm.
        """
        with new_cluster(scheduler_n_process=2,
                         worker_n_process=2,
                         shared_memory='20M',
                         web=True) as cluster:

            # phase 1: the default session is the one created by the cluster
            self.assertIsInstance(Session.default_or_local()._sess,
                                  LocalClusterSession)

            with option_context({'eager_mode': True}):
                a_data = np.random.rand(10, 10)

                a = mt.tensor(a_data, chunk_size=3)
                np.testing.assert_array_equal(a, a_data)

                r1 = a + 1
                expected1 = a_data + 1
                np.testing.assert_array_equal(r1, expected1)

                r2 = r1.dot(r1)
                expected2 = expected1.dot(expected1)
                np.testing.assert_array_almost_equal(r2, expected2)

            # eager mode is off again: nothing has executed `a`, so fetch fails
            a = mt.ones((10, 10), chunk_size=3)
            with self.assertRaises(ValueError):
                a.fetch()

            # explicit execution still works
            r = a.dot(a)
            np.testing.assert_array_equal(r.execute(), np.ones((10, 10)) * 10)

            # phase 2: same tensor checks through the cluster's web endpoint
            with new_session('http://' + cluster._web_endpoint).as_default():
                self.assertIsInstance(Session.default_or_local()._sess,
                                      WebSession)

                with option_context({'eager_mode': True}):
                    a_data = np.random.rand(10, 10)

                    a = mt.tensor(a_data, chunk_size=3)
                    np.testing.assert_array_equal(a, a_data)

                    r1 = a + 1
                    expected1 = a_data + 1
                    np.testing.assert_array_equal(r1, expected1)

                    r2 = r1.dot(r1)
                    expected2 = expected1.dot(expected1)
                    np.testing.assert_array_almost_equal(r2, expected2)

                    # NOTE(review): 3 presumably counts one task per eagerly
                    # created tensor above (a, r1, r2) -- confirm
                    web_session = Session.default_or_local()._sess
                    self.assertEqual(web_session.get_task_count(), 3)

                a = mt.ones((10, 10), chunk_size=3)
                with self.assertRaises(ValueError):
                    a.fetch()

                r = a.dot(a)
                np.testing.assert_array_equal(r.execute(),
                                              np.ones((10, 10)) * 10)

            # phase 3: DataFrame and Series in eager mode through a new web session
            with new_session('http://' + cluster._web_endpoint).as_default():
                # imported locally under aliases because both modules expose `from_pandas`
                from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
                from mars.dataframe.datasource.series import from_pandas as from_pandas_series
                from mars.dataframe.arithmetic import add

                self.assertIsInstance(Session.default_or_local()._sess,
                                      WebSession)

                with option_context({'eager_mode': True}):
                    # neither the index nor the columns are sorted
                    data1 = pd.DataFrame(
                        np.random.rand(10, 10),
                        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
                    df1 = from_pandas_df(data1, chunk_size=5)
                    pd.testing.assert_frame_equal(df1.fetch(), data1)

                    data2 = pd.DataFrame(
                        np.random.rand(10, 10),
                        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
                    df2 = from_pandas_df(data2, chunk_size=6)
                    pd.testing.assert_frame_equal(df2.fetch(), data2)

                    df3 = add(df1, df2)
                    pd.testing.assert_frame_equal(df3.fetch(), data1 + data2)

                    s1 = pd.Series(np.random.rand(10),
                                   index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
                    series1 = from_pandas_series(s1)
                    pd.testing.assert_series_equal(series1.fetch(), s1)

                # NOTE(review): 4 presumably counts one task per eager object
                # above (df1, df2, df3, series1) -- confirm
                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 4)
コード例 #20
0
ファイル: test_arithmetic.py プロジェクト: zuodh/mars
    def testAddWithoutShuffle(self):
        """Check the tiled graph of ``add`` when both axes are monotonic.

        With a monotonic index and monotonic columns on both operands, the test
        expects every output chunk to read each side through a plain
        ``DataFrameIndexAlignMap`` whose row and column bounds come from the
        min/max splits computed below; no shuffle is involved.
        """
        # all the axes are monotonic
        # data1 with index split into [0...4], [5...9],
        # columns [3...7], [8...12]
        data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                             columns=np.arange(3, 13))
        df1 = from_pandas(data1, chunk_size=5)
        # data2 has a decreasing index, split into [11...6], [5...2],
        # columns [4...9], [10...13]
        data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                             columns=np.arange(4, 14))
        df2 = from_pandas(data2, chunk_size=6)

        df3 = add(df1, df2)

        # test df3's index and columns
        pd.testing.assert_index_equal(df3.columns.to_pandas(), (data1 + data2).columns)
        self.assertTrue(df3.columns.should_be_monotonic)
        self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
        self.assertTrue(df3.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([]))
        self.assertNotEqual(df3.index_value.key, df1.index_value.key)
        self.assertNotEqual(df3.index_value.key, df2.index_value.key)
        self.assertEqual(df3.shape[1], 11)  # columns is recorded, so we can get it

        df3.tiles()

        # test df3's index and columns after tiling
        pd.testing.assert_index_equal(df3.columns.to_pandas(), (data1 + data2).columns)
        self.assertTrue(df3.columns.should_be_monotonic)
        self.assertIsInstance(df3.index_value.value, IndexValue.Int64Index)
        self.assertTrue(df3.index_value.should_be_monotonic)
        pd.testing.assert_index_equal(df3.index_value.to_pandas(), pd.Int64Index([]))
        self.assertNotEqual(df3.index_value.key, df1.index_value.key)
        self.assertNotEqual(df3.index_value.key, df2.index_value.key)
        self.assertEqual(df3.shape[1], 11)  # columns is recorded, so we can get it

        # (min, min_close, max, max_close) of each chunk along each axis,
        # listed in ascending order
        data1_index_min_max = [(0, True, 4, True), (5, True, 9, True)]
        data1_columns_min_max = [[3, True, 7, True], [8, True, 12, True]]
        data2_index_min_max = [(2, True, 5, True), (6, True, 11, True)]
        data2_columns_min_max = [(4, True, 9, True), (10, True, 13, True)]

        # NOTE(review): the boolean flags presumably mean "axis is increasing"
        # (only data2's index is decreasing) -- confirm against the helper.
        left_index_splits, right_index_splits = split_monotonic_index_min_max(
            data1_index_min_max, True, data2_index_min_max, False)
        left_columns_splits, right_columns_splits = split_monotonic_index_min_max(
            data1_columns_min_max, True, data2_columns_min_max, True)

        left_index_idx_to_original_idx = build_split_idx_to_origin_idx(left_index_splits)
        right_index_idx_to_original_idx = build_split_idx_to_origin_idx(right_index_splits, False)
        left_columns_idx_to_original_idx = build_split_idx_to_origin_idx(left_columns_splits)
        right_columns_idx_to_original_idx = build_split_idx_to_origin_idx(right_columns_splits)

        self.assertEqual(df3.chunk_shape, (7, 7))
        for c in df3.chunks:
            self.assertIsInstance(c.op, DataFrameAdd)
            self.assertEqual(len(c.inputs), 2)
            # position of the output chunk: (row split, column split)
            idx = c.index
            # test the left side
            self.assertIsInstance(c.inputs[0].op, DataFrameIndexAlignMap)
            left_row_idx, left_row_inner_idx = left_index_idx_to_original_idx[idx[0]]
            left_col_idx, left_col_inner_idx = left_columns_idx_to_original_idx[idx[1]]
            expect_df1_input = df1.cix[left_row_idx, left_col_idx].data
            self.assertIs(c.inputs[0].inputs[0], expect_df1_input)
            left_index_min_max = left_index_splits[left_row_idx][left_row_inner_idx]
            self.assertEqual(c.inputs[0].op.index_min, left_index_min_max[0])
            self.assertEqual(c.inputs[0].op.index_min_close, left_index_min_max[1])
            self.assertEqual(c.inputs[0].op.index_max, left_index_min_max[2])
            self.assertEqual(c.inputs[0].op.index_max_close, left_index_min_max[3])
            self.assertIsInstance(c.inputs[0].index_value.to_pandas(), type(data1.index))
            left_column_min_max = left_columns_splits[left_col_idx][left_col_inner_idx]
            self.assertEqual(c.inputs[0].op.column_min, left_column_min_max[0])
            self.assertEqual(c.inputs[0].op.column_min_close, left_column_min_max[1])
            self.assertEqual(c.inputs[0].op.column_max, left_column_min_max[2])
            self.assertEqual(c.inputs[0].op.column_max_close, left_column_min_max[3])
            expect_left_columns = filter_index_value(expect_df1_input.columns, left_column_min_max,
                                                     store_data=True)
            pd.testing.assert_index_equal(c.inputs[0].columns.to_pandas(), expect_left_columns.to_pandas())
            pd.testing.assert_index_equal(c.inputs[0].dtypes.index, expect_left_columns.to_pandas())
            # test the right side
            self.assertIsInstance(c.inputs[1].op, DataFrameIndexAlignMap)
            right_row_idx, right_row_inner_idx = right_index_idx_to_original_idx[idx[0]]
            right_col_idx, right_col_inner_idx = right_columns_idx_to_original_idx[idx[1]]
            expect_df2_input = df2.cix[right_row_idx, right_col_idx].data
            self.assertIs(c.inputs[1].inputs[0], expect_df2_input)
            right_index_min_max = right_index_splits[right_row_idx][right_row_inner_idx]
            self.assertEqual(c.inputs[1].op.index_min, right_index_min_max[0])
            self.assertEqual(c.inputs[1].op.index_min_close, right_index_min_max[1])
            self.assertEqual(c.inputs[1].op.index_max, right_index_min_max[2])
            self.assertEqual(c.inputs[1].op.index_max_close, right_index_min_max[3])
            self.assertIsInstance(c.inputs[1].index_value.to_pandas(), type(data2.index))
            right_column_min_max = right_columns_splits[right_col_idx][right_col_inner_idx]
            self.assertEqual(c.inputs[1].op.column_min, right_column_min_max[0])
            self.assertEqual(c.inputs[1].op.column_min_close, right_column_min_max[1])
            self.assertEqual(c.inputs[1].op.column_max, right_column_min_max[2])
            self.assertEqual(c.inputs[1].op.column_max_close, right_column_min_max[3])
            # filter with the right operand's own column range, i.e. the bounds
            # just asserted on the right op (previously the left range was used)
            expect_right_columns = filter_index_value(expect_df2_input.columns, right_column_min_max,
                                                      store_data=True)
            pd.testing.assert_index_equal(c.inputs[1].columns.to_pandas(), expect_right_columns.to_pandas())
            pd.testing.assert_index_equal(c.inputs[1].dtypes.index, expect_right_columns.to_pandas())