Example #1
 def setUp(self):
     self.iris = mt.tensor(datasets.load_iris().data)
     # solver_list does not include 'arpack'
     self.solver_list = ['full', 'randomized', 'auto']
     self.session = new_session().as_default()
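This setUp appears to come from a PCA test: solver_list would normally be iterated while checking each supported svd_solver. A minimal sketch of such a loop, assuming mars.learn.decomposition.PCA mirrors the scikit-learn API (the test name and n_components are illustrative):

 def testSolvers(self):
     from mars.learn.decomposition import PCA
     for solver in self.solver_list:
         # every solver should reduce iris (150 x 4) to the same shape
         pca = PCA(n_components=2, svd_solver=solver)
         transformed = pca.fit_transform(self.iris)
         self.assertEqual(transformed.shape, (150, 2))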
Example #2
 def setUp(self):
     self.session = new_session().as_default()
     self._old_executor = self.session._sess._executor
     self.executor = self.session._sess._executor = \
         ExecutorForTest('numpy', storage=self.session._sess._context)
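Since setUp stashes the replaced executor in self._old_executor, a matching tearDown presumably restores it. A minimal sketch, inferred from the attributes saved above:

 def tearDown(self):
     # undo the executor patch applied in setUp
     self.session._sess._executor = self._old_executor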
Example #3
 def testLocalTrainDataFrame(self):
     new_session().as_default()
     dtrain = MarsDMatrix(self.X_df, self.y_series)
     booster = train({}, dtrain, num_boost_round=2)
     self.assertIsInstance(booster, Booster)
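The fixtures self.X_df and self.y_series are prepared elsewhere, e.g. in setUp. A hedged sketch of what they might look like, assuming the np/pd/md aliases used throughout these examples:

 def setUp(self):
     # hypothetical fixtures matching the names used above
     rs = np.random.RandomState(0)
     self.X_df = md.DataFrame(pd.DataFrame(rs.rand(100, 4)), chunk_size=30)
     self.y_series = md.Series(pd.Series(rs.randint(0, 2, 100)), chunk_size=30)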
Example #4
    def testWebApi(self):
        service_ep = 'http://127.0.0.1:' + self.web_port
        timeout = 120 if 'CI' in os.environ else -1
        with new_session(service_ep) as sess:
            self.assertEqual(sess.count_workers(), 1)
            a = mt.ones((100, 100), chunk_size=30)
            b = mt.ones((100, 100), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            assert_array_equal(value, np.ones((100, 100)) * 100)

            # check resubmission
            value2 = sess.run(c, timeout=timeout)
            assert_array_equal(value, value2)

            # check when local compression libs are missing
            from mars.serialize import dataserializer
            try:
                a = mt.ones((10, 10), chunk_size=30)
                b = mt.ones((10, 10), chunk_size=30)
                c = a.dot(b)
                value = sess.run(c, timeout=timeout)
                assert_array_equal(value, np.ones((10, 10)) * 10)

                dataserializer.decompressors[dataserializer.CompressType.LZ4] = None
                dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = None
                dataserializer.compress_openers[dataserializer.CompressType.LZ4] = None

                assert_array_equal(sess.fetch(c), np.ones((10, 10)) * 10)
            finally:
                dataserializer.decompressors[dataserializer.CompressType.LZ4] = dataserializer.lz4_decompress
                dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = dataserializer.lz4_decompressobj
                dataserializer.compress_openers[dataserializer.CompressType.LZ4] = dataserializer.lz4_open

            va = np.random.randint(0, 10000, (100, 100))
            vb = np.random.randint(0, 10000, (100, 100))
            a = mt.array(va, chunk_size=30)
            b = mt.array(vb, chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            assert_array_equal(value, va.dot(vb))

            graphs = sess.get_graph_states()

            # make sure status got uploaded
            time.sleep(1.5)

            # check web UI requests
            res = requests.get(service_ep)
            self.assertEqual(res.status_code, 200)

            res = requests.get('%s/scheduler' % (service_ep, ))
            self.assertEqual(res.status_code, 200)
            res = requests.get('%s/scheduler/127.0.0.1:%s' %
                               (service_ep, self.scheduler_port))
            self.assertEqual(res.status_code, 200)

            res = requests.get('%s/worker' % (service_ep, ))
            self.assertEqual(res.status_code, 200)
            res = requests.get('%s/worker/127.0.0.1:%s' %
                               (service_ep, self.worker_port))
            self.assertEqual(res.status_code, 200)
            res = requests.get('%s/worker/127.0.0.1:%s/timeline' %
                               (service_ep, self.worker_port))
            self.assertEqual(res.status_code, 200)

            res = requests.get('%s/session' % (service_ep, ))
            self.assertEqual(res.status_code, 200)
            task_id = next(iter(graphs.keys()))
            res = requests.get('%s/session/%s/graph/%s' %
                               (service_ep, sess._session_id, task_id))
            self.assertEqual(res.status_code, 200)
            res = requests.get('%s/session/%s/graph/%s/running_nodes' %
                               (service_ep, sess._session_id, task_id))
            self.assertEqual(res.status_code, 200)

            from mars.web.task_pages import PROGRESS_APP_NAME
            res = requests.get(
                '%s/%s?session_id=%s&task_id=%s' %
                (service_ep, PROGRESS_APP_NAME, sess._session_id, task_id))
            self.assertEqual(res.status_code, 200)

            from mars.web.worker_pages import TIMELINE_APP_NAME
            res = requests.get(
                '%s/%s?endpoint=127.0.0.1:%s' %
                (service_ep, TIMELINE_APP_NAME, self.worker_port))
            self.assertEqual(res.status_code, 200)

        # make sure all chunks freed when session quits
        from mars.worker.storage import StorageManagerActor
        actor_client = new_client()
        storage_manager_ref = actor_client.actor_ref(
            StorageManagerActor.default_uid(),
            address='127.0.0.1:' + str(self.worker_port))
        self.assertFalse(bool(storage_manager_ref.dump_keys()))
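The try/finally block above blanks out the LZ4 hooks and restores them afterwards; the same pattern can be distilled into a context manager. A hypothetical helper, not part of the original test:

import contextlib

@contextlib.contextmanager
def lz4_disabled():
    # stash the LZ4 hooks, blank them out, restore them on exit
    lz4 = dataserializer.CompressType.LZ4
    saved = (dataserializer.decompressors[lz4],
             dataserializer.decompressobjs[lz4],
             dataserializer.compress_openers[lz4])
    dataserializer.decompressors[lz4] = None
    dataserializer.decompressobjs[lz4] = None
    dataserializer.compress_openers[lz4] = None
    try:
        yield
    finally:
        (dataserializer.decompressors[lz4],
         dataserializer.decompressobjs[lz4],
         dataserializer.compress_openers[lz4]) = saved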
Example #5
    def testMainDataFrameWithoutEtcd(self):
        self.start_processes(etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
        sess = new_session(self.session_manager_ref.address)

        raw1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = md.DataFrame(raw2, chunk_size=6)
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        raw1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=(10, 5))
        raw2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=(10, 6))
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        raw1 = pd.DataFrame(np.random.rand(10, 10), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=6)
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        raw1 = pd.DataFrame(np.random.rand(10, 10))
        raw1[0] = raw1[0].apply(str)
        df1 = md.DataFrame(raw1, chunk_size=5)
        r = df1.sort_values(0)
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1.sort_values(0))

        rs = np.random.RandomState(0)
        raw2 = pd.DataFrame({'a': rs.rand(10),
                             'b': [f's{rs.randint(1000)}' for _ in range(10)]})
        raw2['b'] = raw2['b'].astype(md.ArrowStringDtype())
        mdf = md.DataFrame(raw2, chunk_size=3)
        df2 = mdf.sort_values(by='b')
        result = df2.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = raw2.sort_values(by='b')
        pd.testing.assert_frame_equal(result, expected)

        s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series1 = md.Series(s1, chunk_size=6)
        result = series1.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_series_equal(result, s1)

        data = pd.DataFrame(np.random.rand(10, 5), columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df3 = md.DataFrame(data, chunk_size=4)

        r = df3.reindex(index=mt.arange(10, 1, -1, chunk_size=3))

        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = data.reindex(index=np.arange(10, 1, -1))
        pd.testing.assert_frame_equal(result, expected)

        # test rebalance
        df4 = md.DataFrame(data)

        r = df4.rebalance()

        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, data)
        chunk_metas = sess.get_tileable_chunk_metas(r.key)
        workers = list(set(itertools.chain(*(m.workers for m in chunk_metas.values()))))
        self.assertEqual(len(workers), 2)
Example #6
    def testResetIndexExecution(self):
        data = pd.DataFrame([('bird', 389.0), ('bird', 24.0), ('mammal', 80.5),
                             ('mammal', np.nan)],
                            index=['falcon', 'parrot', 'lion', 'monkey'],
                            columns=('class', 'max_speed'))
        df = from_pandas_df(data)
        df2 = df_reset_index(df)
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index()
        pd.testing.assert_frame_equal(result, expected)

        df = from_pandas_df(data, chunk_size=2)
        df2 = df_reset_index(df)
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index()
        pd.testing.assert_frame_equal(result, expected)

        df = from_pandas_df(data, chunk_size=1)
        df2 = df_reset_index(df, drop=True)
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index(drop=True)
        pd.testing.assert_frame_equal(result, expected)

        index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                           ('bird', 'parrot'),
                                           ('mammal', 'lion'),
                                           ('mammal', 'monkey')],
                                          names=['class', 'name'])
        data = pd.DataFrame([('bird', 389.0), ('bird', 24.0), ('mammal', 80.5),
                             ('mammal', np.nan)],
                            index=index,
                            columns=('type', 'max_speed'))
        df = from_pandas_df(data, chunk_size=1)
        df2 = df_reset_index(df, level='class')
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index(level='class')
        pd.testing.assert_frame_equal(result, expected)

        columns = pd.MultiIndex.from_tuples([('speed', 'max'),
                                             ('species', 'type')])
        data.columns = columns
        df = from_pandas_df(data, chunk_size=2)
        df2 = df_reset_index(df,
                             level='class',
                             col_level=1,
                             col_fill='species')
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index(level='class',
                                    col_level=1,
                                    col_fill='species')
        pd.testing.assert_frame_equal(result, expected)

        # Test Series

        s = pd.Series([1, 2, 3, 4],
                      name='foo',
                      index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))

        series = from_pandas_series(s)
        s2 = series_reset_index(series, name='bar')
        result = self.executor.execute_dataframe(s2, concat=True)[0]
        expected = s.reset_index(name='bar')
        pd.testing.assert_frame_equal(result, expected)

        series = from_pandas_series(s, chunk_size=2)
        s2 = series_reset_index(series, drop=True)
        result = self.executor.execute_dataframe(s2, concat=True)[0]
        expected = s.reset_index(drop=True)
        pd.testing.assert_series_equal(result, expected)

        # Test Unknown shape
        sess = new_session()
        data1 = pd.DataFrame(np.random.rand(10, 3),
                             index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9])
        df1 = from_pandas_df(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 3),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        df2 = from_pandas_df(data2, chunk_size=6)
        df = (df1 + df2).reset_index()
        result = sess.run(df)
        pd.testing.assert_index_equal(result.index, pd.RangeIndex(12))
        # Row order may differ from pandas when the input shape is unknown, so sort before comparing.
        result = result.sort_values(by=result.columns[0])
        expected = (data1 + data2).reset_index()
        np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy())

        data1 = pd.Series(np.random.rand(10, ),
                          index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9])
        series1 = from_pandas_series(data1, chunk_size=3)
        data2 = pd.Series(np.random.rand(10, ),
                          index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series2 = from_pandas_series(data2, chunk_size=3)
        df = (series1 + series2).reset_index()
        result = sess.run(df)
        pd.testing.assert_index_equal(result.index, pd.RangeIndex(12))
        # Row order may differ from pandas when the input shape is unknown, so sort before comparing.
        result = result.sort_values(by=result.columns[0])
        expected = (data1 + data2).reset_index()
        np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy())
Example #7
    def testConcat(self):
        # context-backed executor; some cases below use it instead of self.executor
        executor = ExecutorForTest(storage=new_session().context)

        df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        df2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=3)

        r = concat([mdf1, mdf2])
        expected = pd.concat([df1, df2])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # test different chunk size and ignore_index=True
        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=3)

        r = concat([mdf1, mdf2], ignore_index=True)
        expected = pd.concat([df1, df2], ignore_index=True)
        result = executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # test axis=1
        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=3)

        r = concat([mdf1, mdf2], axis=1)
        expected = pd.concat([df1, df2], axis=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # test multiple dataframes
        r = concat([mdf1, mdf2, mdf1])
        expected = pd.concat([df1, df2, df1])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        df2 = pd.DataFrame(np.random.rand(10, 3), columns=list('ABC'))

        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=3)

        # test join=inner
        r = concat([mdf1, mdf2], join='inner')
        expected = pd.concat([df1, df2], join='inner')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # test for series
        series1 = pd.Series(np.random.rand(10, ))
        series2 = pd.Series(np.random.rand(10, ))

        mseries1 = series_from_pandas(series1, chunk_size=3)
        mseries2 = series_from_pandas(series2, chunk_size=3)

        r = concat([mseries1, mseries2])
        expected = pd.concat([series1, series2])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(result, expected)

        # test different chunk sizes and ignore_index=True
        mseries1 = series_from_pandas(series1, chunk_size=4)
        mseries2 = series_from_pandas(series2, chunk_size=3)

        r = concat([mseries1, mseries2], ignore_index=True)
        expected = pd.concat([series1, series2], ignore_index=True)
        result = executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(result, expected)

        # test axis=1
        mseries1 = series_from_pandas(series1, chunk_size=3)
        mseries2 = series_from_pandas(series2, chunk_size=3)

        r = concat([mseries1, mseries2], axis=1)
        expected = pd.concat([series1, series2], axis=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(result, expected)

        # test merge dataframe and series
        r = concat([mdf1, mseries2], ignore_index=True)
        expected = pd.concat([df1, series2], ignore_index=True)
        result = executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(result, expected)

        # test merge series and dataframe
        r = concat([mseries1, mdf2], ignore_index=True)
        expected = pd.concat([series1, df2], ignore_index=True)
        result = executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(result, expected)

        # test merge dataframe and series, axis=1
        r = concat([mdf1, mseries2], axis=1)
        expected = pd.concat([df1, series2], axis=1)
        result = executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(result, expected)

        # test merge series and dataframe, axis=1
        r = concat([mseries1, mdf2], axis=1)
        expected = pd.concat([series1, df2], axis=1)
        result = executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(result, expected)
Example #8
    def testRemoteFunctionInLocalCluster(self):
        with new_cluster(scheduler_n_process=2,
                         worker_n_process=3,
                         shared_memory='20M',
                         modules=[__name__],
                         web=True) as cluster:
            session = cluster.session

            def f(x):
                return x + 1

            def g(x, y):
                return x * y

            a = mr.spawn(f, 3)
            b = mr.spawn(f, 4)
            c = mr.spawn(g, (a, b))

            r = session.run(c, timeout=_exec_timeout)
            self.assertEqual(r, 20)

            e = mr.spawn(f, mr.spawn(f, 2))

            r = session.run(e, timeout=_exec_timeout)
            self.assertEqual(r, 4)

            session2 = new_session(cluster.endpoint)
            expect_session_id = session2.session_id

            def f2():
                session = Session.default
                assert isinstance(session._sess, ClusterSession)
                assert session._sess.session_id == expect_session_id

                t = mt.ones((3, 2))
                return t.sum().to_numpy()

            self.assertEqual(
                cloudpickle.loads(cloudpickle.dumps(
                    Session.default)).session_id, session.session_id)
            self.assertIsInstance(serialize_function(f2), bytes)

            d = mr.spawn(f2, retry_when_fail=False)

            r = session2.run(d, timeout=_exec_timeout)
            self.assertEqual(r, 6)

            # test input tileable
            def f(t, x):
                return (t * x).sum().to_numpy()

            rs = np.random.RandomState(0)
            raw = rs.rand(5, 4)

            t1 = mt.tensor(raw, chunk_size=3)
            t2 = t1.sum(axis=0)
            s = mr.spawn(f, args=(t2, 3), retry_when_fail=False)

            r = session.run(s, timeout=_exec_timeout)
            expected = (raw.sum(axis=0) * 3).sum()
            self.assertAlmostEqual(r, expected)

            # test named tileable
            session3 = new_session(cluster.endpoint)
            t = mt.ones((10, 10), chunk_size=3)
            session3.run(t, name='t_name')

            def f3():
                import mars.tensor as mt

                s = mt.named_tensor(name='t_name')
                return (s + 1).to_numpy()

            d = mr.spawn(f3, retry_when_fail=False)
            r = session3.run(d, timeout=_exec_timeout)
            np.testing.assert_array_equal(r, np.ones((10, 10)) + 1)
Example #9
 def submitter():
     sess = new_session(self.session_manager_ref.address)
     return tileable.execute(session=sess,
                             timeout=self.timeout).fetch(session=sess)
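submitter opens its own session on every call, which suggests it is meant to be driven from several threads at once. One hypothetical driver (the thread-pool usage is an assumption, not taken from the test):

 def run_concurrently(n=4):
     from concurrent.futures import ThreadPoolExecutor
     # all submissions should fetch the same result
     with ThreadPoolExecutor(n) as pool:
         results = list(pool.map(lambda _: submitter(), range(n)))
     for result in results[1:]:
         np.testing.assert_array_equal(result, results[0])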
Example #10
    def testWebApi(self):
        service_ep = 'http://127.0.0.1:' + self.web_port
        timeout = 120 if 'CI' in os.environ else -1
        with new_session(service_ep) as sess:
            session_id = sess._session_id
            self.assertEqual(sess.count_workers(), 1)

            a = mt.ones((100, 100), chunk_size=30)
            b = mt.ones((100, 100), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            np.testing.assert_array_equal(value, np.ones((100, 100)) * 100)

            # check resubmission
            value2 = sess.run(c, timeout=timeout)
            np.testing.assert_array_equal(value, value2)

            # check when local compression libs are missing
            from mars.serialize import dataserializer
            try:
                a = mt.ones((10, 10), chunk_size=30)
                b = mt.ones((10, 10), chunk_size=30)
                c = a.dot(b)
                value = sess.run(c, timeout=timeout)
                np.testing.assert_array_equal(value, np.ones((10, 10)) * 10)

                dataserializer.decompressors[dataserializer.CompressType.LZ4] = None
                dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = None
                dataserializer.compress_openers[dataserializer.CompressType.LZ4] = None

                np.testing.assert_array_equal(sess.fetch(c), np.ones((10, 10)) * 10)
            finally:
                dataserializer.decompressors[dataserializer.CompressType.LZ4] = dataserializer.lz4_decompress
                dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = dataserializer.lz4_decompressobj
                dataserializer.compress_openers[dataserializer.CompressType.LZ4] = dataserializer.lz4_open

            # check serialization by pickle
            try:
                sess._sess._serial_type = SerialType.PICKLE

                a = mt.ones((10, 10), chunk_size=30)
                b = mt.ones((10, 10), chunk_size=30)
                c = a.dot(b)
                value = sess.run(c, timeout=timeout)
                np.testing.assert_array_equal(value, np.ones((10, 10)) * 10)

                raw = pd.DataFrame(np.random.rand(10, 5), columns=list('ABCDE'),
                                   index=pd.RangeIndex(10, 0, -1))
                data = md.DataFrame(raw).astype({'E': 'arrow_string'})
                ret_data = data.execute(session=sess).fetch(session=sess)
                self.assertEqual(ret_data.dtypes['E'], np.dtype('O'))
                pd.testing.assert_frame_equal(
                    ret_data.astype({'E': 'float'}), raw, check_less_precise=True)

                raw = pd.Series(np.random.rand(10), index=pd.RangeIndex(10, 0, -1),
                                name='r')
                data = md.Series(raw).astype('Arrow[string]')
                ret_data = data.execute(session=sess).fetch(session=sess)
                self.assertEqual(ret_data.dtype, np.dtype('O'))
                pd.testing.assert_series_equal(ret_data.astype('float'), raw)
            finally:
                sess._sess._serial_type = SerialType.ARROW

            va = np.random.randint(0, 10000, (100, 100))
            vb = np.random.randint(0, 10000, (100, 100))
            a = mt.array(va, chunk_size=30)
            b = mt.array(vb, chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            np.testing.assert_array_equal(value, va.dot(vb))

            # test fetch log
            def f():
                print('test')

            r = mr.spawn(f).execute(session=sess, timeout=timeout)
            self.assertEqual(str(r.fetch_log()).strip(), 'test')
            self.assertEqual(str(r.fetch_log(offsets=0)).strip(), 'test')
            self.assertEqual(str(r.fetch_log()).strip(), '')
            self.assertEqual(str(r.fetch_log(offsets='-0.003k', sizes=2)).strip(), 'st')

            graphs = sess.get_graph_states()

            # make sure status got uploaded
            time.sleep(1.5)

            # check web UI requests
            res = requests.get(service_ep)
            self.assertEqual(res.status_code, 200)

            res = requests.get(f'{service_ep}/scheduler')
            self.assertEqual(res.status_code, 200)
            res = requests.get(f'{service_ep}/scheduler/127.0.0.1:{self.scheduler_port}')
            self.assertEqual(res.status_code, 200)

            res = requests.get(f'{service_ep}/worker')
            self.assertEqual(res.status_code, 200)
            res = requests.get(f'{service_ep}/worker/127.0.0.1:{self.worker_port}')
            self.assertEqual(res.status_code, 200)
            res = requests.get(f'{service_ep}/worker/127.0.0.1:{self.worker_port}/timeline')
            self.assertEqual(res.status_code, 200)

            res = requests.get(f'{service_ep}/session')
            self.assertEqual(res.status_code, 200)
            task_id = next(iter(graphs.keys()))
            res = requests.get(f'{service_ep}/session/{session_id}/graph/{task_id}')
            self.assertEqual(res.status_code, 200)
            res = requests.get(f'{service_ep}/session/{session_id}/graph/{task_id}/running_nodes')
            self.assertEqual(res.status_code, 200)

            from mars.web.task_pages import PROGRESS_APP_NAME
            res = requests.get(f'{service_ep}/{PROGRESS_APP_NAME}?session_id={session_id}&task_id={task_id}')
            self.assertEqual(res.status_code, 200)

            from mars.web.worker_pages import TIMELINE_APP_NAME
            res = requests.get(f'{service_ep}/{TIMELINE_APP_NAME}?endpoint=127.0.0.1:{self.worker_port}')
            self.assertEqual(res.status_code, 200)

        # make sure all chunks freed when session quits
        from mars.worker.storage import StorageManagerActor
        actor_client = new_client()
        storage_manager_ref = actor_client.actor_ref(StorageManagerActor.default_uid(),
                                                     address='127.0.0.1:' + str(self.worker_port))
        self.assertSetEqual(set(storage_manager_ref.dump_keys()), set())
Example #11
    def testWebApiException(self):
        def normalize_tbs(tb_lines):
            new_lines = []
            for line in tb_lines:
                first_line = line.splitlines(True)[0]
                new_lines.append(first_line if '.pyx' in first_line else line)
            return new_lines

        service_ep = 'http://127.0.0.1:' + self.web_port

        # query worker info
        res = requests.get(f'{service_ep}/api/worker')
        self.assertEqual(res.status_code, 200)
        self.assertEqual(len(json.loads(res.text)), 1)
        res = requests.get(f'{service_ep}/api/worker?action=count')
        self.assertEqual(res.status_code, 200)
        self.assertEqual(int(res.text), 1)
        res = requests.patch(f'{service_ep}/api/worker?action=count',
                             data=json.dumps(dict(new_scale=2)))
        self.assertEqual(res.status_code, 405)

        # query sessions (should be empty)
        res = requests.get(f'{service_ep}/api/session')
        self.assertEqual(res.status_code, 200)
        self.assertEqual(len(json.loads(res.text)), 0)

        # raise on malformed Python version
        res = requests.post(f'{service_ep}/api/session', dict(pyver='mal.version'))
        self.assertEqual(res.status_code, 400)
        wrong_version = '3.7.4' if sys.version_info[0] < 3 else '2.7.4'
        res = requests.post(f'{service_ep}/api/session', dict(pyver=wrong_version))
        self.assertEqual(res.status_code, 400)

        # use pickle when arrow version does not agree
        pyarrow, arrow_ver = None, None
        pickle_ver = pickle.HIGHEST_PROTOCOL
        try:
            pickle.HIGHEST_PROTOCOL = 2000

            import pyarrow
            arrow_ver = pyarrow.__version__
            pyarrow.__version__ = '2000.0.0'

            with new_session(service_ep, verify_ssl=False) as sess:
                self.assertEqual(sess._sess._serial_type, SerialType.PICKLE)
                self.assertEqual(sess._sess._pickle_protocol, pickle_ver)
        except ImportError:
            pass
        finally:
            pickle.HIGHEST_PROTOCOL = pickle_ver
            if pyarrow:
                pyarrow.__version__ = arrow_ver

        with new_session(service_ep) as sess:
            # Stop non-existing graph should raise an exception
            graph_key = str(uuid.uuid4())
            res = requests.delete(f'{service_ep}/api/session/{sess._session_id}/graph/{graph_key}')
            self.assertEqual(res.status_code, 404)
            resp_json = json.loads(res.text)
            typ, value, tb = pickle.loads(base64.b64decode(resp_json['exc_info']))
            self.assertEqual(typ, ActorNotExist)
            self.assertEqual(normalize_tbs(traceback.format_exception(typ, value, tb)),
                             normalize_tbs(resp_json['exc_info_text']))

            # get graph states of non-existing session should raise an exception
            res = requests.get(f'{service_ep}/api/session/xxxx/graph')
            self.assertEqual(res.status_code, 500)
            resp_json = json.loads(res.text)
            typ, value, tb = pickle.loads(base64.b64decode(resp_json['exc_info']))
            self.assertEqual(typ, KeyError)
            self.assertEqual(normalize_tbs(traceback.format_exception(typ, value, tb)),
                             normalize_tbs(resp_json['exc_info_text']))
Example #12
    def testMutableTensorWrite(self):
        with new_cluster(scheduler_n_process=2,
                         worker_n_process=2,
                         shared_memory='20M') as cluster:
            with new_session(cluster.endpoint) as session:
                mut = session.create_mutable_tensor("test", (4, 5),
                                                    dtype=np.double,
                                                    chunk_size=3)

                # write [1:4, 2], and buffer is not full.
                chunk_records = mut._do_write((slice(1, 4, None), 2), 8)
                self.assertEqual(chunk_records, [])
                chunk_records = mut._do_flush()
                chunk_records_map = dict((k, v) for k, _, v in chunk_records)

                result = chunk_records_map[mut.cix[(0, 0)].key]
                expected = np.array([[5, 8.], [8, 8.]])
                self.assertRecordsEqual(result, expected)

                result = chunk_records_map[mut.cix[(1, 0)].key]
                expected = np.array([[2, 8.]])
                self.assertRecordsEqual(result, expected)

                # write [2:4], and buffer is not full.
                chunk_records = mut._do_write(slice(2, 4, None),
                                              np.arange(10).reshape((2, 5)))
                self.assertEqual(chunk_records, [])
                chunk_records = mut._do_flush()
                chunk_records_map = dict((k, v) for k, _, v in chunk_records)

                result = chunk_records_map[mut.cix[(0, 0)].key]
                expected = np.array([[6, 0.], [7, 1.], [8, 2.]])
                self.assertRecordsEqual(result, expected)

                result = chunk_records_map[mut.cix[(0, 1)].key]
                expected = np.array([[4, 3.], [5, 4.]])
                self.assertRecordsEqual(result, expected)

                result = chunk_records_map[mut.cix[(1, 0)].key]
                expected = np.array([[0, 5.], [1, 6.], [2, 7.]])
                self.assertRecordsEqual(result, expected)

                result = chunk_records_map[mut.cix[(1, 1)].key]
                expected = np.array([[0, 8.], [1, 9.]])
                self.assertRecordsEqual(result, expected)

                # write [1], and buffer is not full.
                chunk_records = mut._do_write(1, np.arange(5))
                self.assertEqual(chunk_records, [])
                chunk_records = mut._do_flush()
                chunk_records_map = dict((k, v) for k, _, v in chunk_records)

                result = chunk_records_map[mut.cix[(0, 0)].key]
                expected = np.array([[3, 0.], [4, 1.], [5, 2.]])
                self.assertRecordsEqual(result, expected)

                result = chunk_records_map[mut.cix[(0, 1)].key]
                expected = np.array([[2, 3.], [3, 4.]])
                self.assertRecordsEqual(result, expected)

                # write [2, [0, 2, 4]] (fancy index), and buffer is not full.
                chunk_records = mut._do_write((2, [0, 2, 4]),
                                              np.array([11, 22, 33]))
                self.assertEqual(chunk_records, [])
                chunk_records = mut._do_flush()
                chunk_records_map = dict((k, v) for k, _, v in chunk_records)

                result = chunk_records_map[mut.cix[(0, 0)].key]
                expected = np.array([[6, 11.], [8, 22.]])
                self.assertRecordsEqual(result, expected)

                result = chunk_records_map[mut.cix[(0, 1)].key]
                expected = np.array([[5, 33.]])
                self.assertRecordsEqual(result, expected)

                # write [:], and the first buffer is full.
                chunk_records = mut._do_write(slice(None, None, None), 999)
                chunk_records_map = dict((k, v) for k, _, v in chunk_records)

                result = chunk_records_map[mut.cix[(0, 0)].key]
                expected = np.array([[0, 999.], [1, 999.], [2, 999.],
                                     [3, 999.], [4, 999.], [5, 999.],
                                     [6, 999.], [7, 999.], [8, 999.]])
                self.assertRecordsEqual(result, expected)

                # check other chunks
                chunk_records = mut._do_flush()
                chunk_records_map = dict((k, v) for k, _, v in chunk_records)

                result = chunk_records_map[mut.cix[(0, 1)].key]
                expected = np.array([[0, 999.], [1, 999.], [2, 999.],
                                     [3, 999.], [4, 999.], [5, 999.]])
                self.assertRecordsEqual(result, expected)

                result = chunk_records_map[mut.cix[(1, 0)].key]
                expected = np.array([[0, 999.], [1, 999.], [2, 999.]])
                self.assertRecordsEqual(result, expected)

                result = chunk_records_map[mut.cix[(1, 1)].key]
                expected = np.array([[0, 999.], [1, 999.]])
                self.assertRecordsEqual(result, expected)
Example #13
 def testRayClusterMode(self):
     with new_session(backend='ray', _load_code_from_local=True).as_default():
         t = mt.random.rand(100, 4, chunk_size=30)
         df = md.DataFrame(t, columns=list('abcd'))
         r = df.describe().execute()
         self.assertEqual(r.shape, (8, 4))
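The ray backend assumes a Ray runtime is already up. A minimal sketch of the surrounding fixture (the ray.init arguments are illustrative):

 def setUp(self):
     import ray
     # start a local Ray runtime for the 'ray' session backend
     ray.init(num_cpus=4)

 def tearDown(self):
     import ray
     ray.shutdown()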
Example #14
 def setUp(self):
     new_session().as_default()
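With a default session registered, tileables can be executed without passing session= explicitly. A minimal sketch of a test built on this setUp:

 def testDefaultSession(self):
     t = mt.ones((3, 3), chunk_size=2)
     # execute()/fetch() pick up the default session created in setUp
     np.testing.assert_array_equal(t.execute().fetch(), np.ones((3, 3)))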
Example #15
    def testRemoteFunctionInLocalCluster(self):
        with new_cluster(scheduler_n_process=2, worker_n_process=3,
                         shared_memory='20M', modules=[__name__], web=True) as cluster:
            session = cluster.session

            def f(x):
                return x + 1

            def g(x, y):
                return x * y

            a = mr.spawn(f, 3)
            b = mr.spawn(f, 4)
            c = mr.spawn(g, (a, b))

            r = session.run(c, timeout=_exec_timeout)
            self.assertEqual(r, 20)

            e = mr.spawn(f, mr.spawn(f, 2))

            r = session.run(e, timeout=_exec_timeout)
            self.assertEqual(r, 4)

            session2 = new_session(cluster.endpoint)
            expect_session_id = session2.session_id

            def f2():
                session = Session.default
                assert isinstance(session._sess, ClusterSession)
                assert session._sess.session_id == expect_session_id

                t = mt.ones((3, 2))
                return t.sum().to_numpy()

            self.assertEqual(cloudpickle.loads(cloudpickle.dumps(Session.default)).session_id,
                             session.session_id)
            self.assertIsInstance(serialize_function(f2), bytes)

            d = mr.spawn(f2, retry_when_fail=False)

            r = session2.run(d, timeout=_exec_timeout)
            self.assertEqual(r, 6)

            # test input tileable
            def f(t, x):
                return (t * x).sum().to_numpy()

            rs = np.random.RandomState(0)
            raw = rs.rand(5, 4)

            t1 = mt.tensor(raw, chunk_size=3)
            t2 = t1.sum(axis=0)
            s = mr.spawn(f, args=(t2, 3), retry_when_fail=False)

            r = session.run(s, timeout=_exec_timeout)
            expected = (raw.sum(axis=0) * 3).sum()
            self.assertAlmostEqual(r, expected)

            # test named tileable
            session3 = new_session(cluster.endpoint)
            t = mt.ones((10, 10), chunk_size=3)
            session3.run(t, name='t_name')

            def f3():
                import mars.tensor as mt

                s = mt.named_tensor(name='t_name')
                return (s + 1).to_numpy()

            d = mr.spawn(f3, retry_when_fail=False)
            r = session3.run(d, timeout=_exec_timeout)
            np.testing.assert_array_equal(r, np.ones((10, 10)) + 1)

            # test tileable that executed
            session4 = new_session(cluster.endpoint)
            df1 = md.DataFrame(raw, chunk_size=3)
            df1 = df1[df1.iloc[:, 0] < 1.5]

            def f4(input_df):
                bonus = input_df.iloc[:, 0].fetch().sum()
                return input_df.sum().to_pandas() + bonus

            d = mr.spawn(f4, args=(df1,), retry_when_fail=False)
            r = session4.run(d, timeout=_exec_timeout)
            expected = pd.DataFrame(raw).sum() + raw[:, 0].sum()
            pd.testing.assert_series_equal(r, expected)

            # test tileable has unknown shape
            session5 = new_session(cluster.endpoint)

            def f5(t, x):
                assert all(not np.isnan(s) for s in t.shape)
                return (t * x).sum().to_numpy()

            rs = np.random.RandomState(0)
            raw = rs.rand(5, 4)

            t1 = mt.tensor(raw, chunk_size=3)
            t2 = t1[t1 < 0.5]
            s = mr.spawn(f5, args=(t2, 3))
            result = session5.run(s, timeout=_exec_timeout)
            expected = (raw[raw < 0.5] * 3).sum()
            self.assertAlmostEqual(result, expected)
Example #16
    def testMainDataFrameWithoutEtcd(self):
        self.start_processes(etcd=False, scheduler_args=['-Dscheduler.aggressive_assign=true'])
        sess = new_session(self.session_manager_ref.address)

        # test binary arithmetic with different indices
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = md.DataFrame(raw2, chunk_size=6)
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        raw1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=(10, 5))
        raw2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=(10, 6))
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        raw1 = pd.DataFrame(np.random.rand(10, 10), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        df1 = md.DataFrame(raw1, chunk_size=5)
        raw2 = pd.DataFrame(np.random.rand(10, 10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                            columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        df2 = md.DataFrame(raw2, chunk_size=6)
        r = df1 + df2
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1 + raw2)

        # test sort_values
        raw1 = pd.DataFrame(np.random.rand(10, 10))
        raw1[0] = raw1[0].apply(str)
        raw1.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
        df1 = md.DataFrame(raw1, chunk_size=5)
        r = df1.sort_values([('A', 'C')])
        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, raw1.sort_values([('A', 'C')]))

        rs = np.random.RandomState(0)
        raw2 = pd.DataFrame({'a': rs.rand(10),
                             'b': [f's{rs.randint(1000)}' for _ in range(10)]})
        raw2['b'] = raw2['b'].astype(md.ArrowStringDtype())
        mdf = md.DataFrame(raw2, chunk_size=4)
        filtered = mdf[mdf['a'] > 0.5]
        df2 = filtered.sort_values(by='b')
        result = df2.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = raw2[raw2['a'] > 0.5].sort_values(by='b')
        pd.testing.assert_frame_equal(result, expected)

        s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series1 = md.Series(s1, chunk_size=6)
        result = series1.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_series_equal(result, s1)

        # test reindex
        data = pd.DataFrame(np.random.rand(10, 5), columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df3 = md.DataFrame(data, chunk_size=4)
        r = df3.reindex(index=mt.arange(10, 1, -1, chunk_size=3))

        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = data.reindex(index=np.arange(10, 1, -1))
        pd.testing.assert_frame_equal(result, expected)

        # test rebalance
        df4 = md.DataFrame(data)
        r = df4.rebalance()

        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        pd.testing.assert_frame_equal(result, data)
        chunk_metas = sess.get_tileable_chunk_metas(r.key)
        workers = list(set(itertools.chain(*(m.workers for m in chunk_metas.values()))))
        self.assertEqual(len(workers), 2)

        # test nunique
        data = pd.DataFrame(np.random.randint(0, 10, (100, 5)),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df5 = md.DataFrame(data, chunk_size=4)
        r = df5.nunique()

        result = r.execute(session=sess, timeout=self.timeout).fetch(session=sess)
        expected = data.nunique()
        pd.testing.assert_series_equal(result, expected)

        # test re-execute df.groupby().agg().sort_values()
        rs = np.random.RandomState(0)
        data = pd.DataFrame({'col1': rs.rand(100), 'col2': rs.randint(10, size=100)})
        df6 = md.DataFrame(data, chunk_size=40)
        grouped = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
            .execute(session=sess, timeout=self.timeout)
        r = grouped.sort_values(by='cnt').head().execute(session=sess, timeout=self.timeout)
        result = r.fetch(session=sess)
        expected = data.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}) \
            .sort_values(by='cnt').head()
        pd.testing.assert_frame_equal(result.reset_index(drop=True), expected.reset_index(drop=True))
        r2 = df6.groupby('col2', as_index=False)['col2'].agg({"cnt": "count"}).sort_values(by='cnt').head() \
            .execute(session=sess, timeout=self.timeout)
        result = r2.fetch(session=sess)
        pd.testing.assert_frame_equal(result.reset_index(drop=True), expected.reset_index(drop=True))

        # test groupby with sample
        src_data_list = []
        sample_count = 10
        for b in range(5):
            data_count = int(np.random.randint(40, 100))
            src_data_list.append(pd.DataFrame({
                'a': np.random.randint(0, 100, size=data_count),
                'b': np.array([b] * data_count),
                'c': np.random.randint(0, 100, size=data_count),
                'd': np.random.randint(0, 100, size=data_count),
            }))
        data = pd.concat(src_data_list)
        shuffle_idx = np.arange(len(data))
        np.random.shuffle(shuffle_idx)
        data = data.iloc[shuffle_idx].reset_index(drop=True)

        df7 = md.DataFrame(data, chunk_size=40)
        sampled = df7.groupby('b').sample(10)
        r = sampled.execute(session=sess, timeout=self.timeout)
        result = r.fetch(session=sess)
        self.assertFalse((result.groupby('b').count() - sample_count).any()[0])
Example #17
    def testClusterSession(self):
        with new_cluster(scheduler_n_process=2,
                         worker_n_process=2,
                         shared_memory='20M',
                         web=True) as cluster:
            sess1 = cluster.session
            sess2 = new_session(cluster.endpoint, session_id=sess1.session_id)

            self.assertNotEqual(sess1, sess2)
            self.assertEqual(sess1.session_id, sess2.session_id)

            session_id = str(uuid.uuid4())
            with self.assertRaises(ValueError) as cm:
                new_session(cluster.endpoint, session_id=session_id)

            expected_msg = "The session with id = %s doesn't exist" % session_id
            self.assertEqual(cm.exception.args[0], expected_msg)

            sess1.close()
            with self.assertRaises(ValueError) as cm:
                new_session(cluster.endpoint, session_id=sess1.session_id)

            expected_msg = "The session with id = %s doesn't exist" % sess1.session_id
            self.assertEqual(cm.exception.args[0], expected_msg)

            web_sess1 = new_session('http://' + cluster._web_endpoint)
            web_sess2 = new_session('http://' + cluster._web_endpoint,
                                    session_id=web_sess1.session_id)

            self.assertNotEqual(web_sess1, web_sess2)
            self.assertEqual(web_sess1.session_id, web_sess2.session_id)

            session_id = str(uuid.uuid4())
            with self.assertRaises(ValueError) as cm:
                new_session('http://' + cluster._web_endpoint,
                            session_id=session_id)

            expected_msg = "The session with id = %s doesn't exist" % session_id
            self.assertEqual(cm.exception.args[0], expected_msg)

            web_sess1.close()
            with self.assertRaises(ValueError) as cm:
                new_session('http://' + cluster._web_endpoint,
                            session_id=web_sess1.session_id)

            expected_msg = "The session with id = %s doesn't exist" % web_sess1.session_id
            self.assertEqual(cm.exception.args[0], expected_msg)
Example #18
    def testFetchLogWithoutEtcd(self):
        # test fetch log
        with tempfile.TemporaryDirectory() as temp_dir:
            self.start_processes(etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'],
                                 scheduler_args=[f'-Dcustom_log_dir={temp_dir}'])
            sess = new_session(self.session_manager_ref.address)

            def f():
                print('test')

            r = spawn(f)
            r.execute(session=sess)

            custom_log_actor = sess._api.actor_client.actor_ref(
                CustomLogMetaActor.default_uid(),
                address=self.cluster_info.get_scheduler(CustomLogMetaActor.default_uid())
            )

            chunk_key_to_log_path = custom_log_actor.get_tileable_op_log_paths(
                sess.session_id, r.op.key)
            paths = list(chunk_key_to_log_path.values())
            self.assertEqual(len(paths), 1)
            log_path = paths[0][1]
            with open(log_path) as f:
                self.assertEqual(f.read().strip(), 'test')

            context = DistributedContext(scheduler_address=self.session_manager_ref.address,
                                         session_id=sess.session_id)
            log_result = context.fetch_tileable_op_logs(r.op.key)
            log = next(iter(log_result.values()))['log']
            self.assertEqual(log.strip(), 'test')

            log = r.fetch_log()
            self.assertEqual(str(log).strip(), 'test')

            # test multiple functions
            def f1(size):
                print('f1' * size)
                sys.stdout.flush()

            fs = ExecutableTuple([spawn(f1, 30), spawn(f1, 40)])
            fs.execute(session=sess)
            log = fs.fetch_log(offsets=20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30)[20:30])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40)[20:30])
            self.assertGreater(len(log[0].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets
            log = fs.fetch_log(offsets=-20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets represented as strings
            log = fs.fetch_log(offsets='-0.02K', sizes='0.01K')
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            def test_nested():
                print('level0')
                fr = spawn(f1, 1)
                fr.execute()
                print(fr.fetch_log())

            r = spawn(test_nested)
            with self.assertRaises(ValueError):
                r.fetch_log()
            r.execute(session=sess)
            log = str(r.fetch_log())
            self.assertIn('level0', log)
            self.assertIn('f1', log)

            df = md.DataFrame(mt.random.rand(10, 3), chunk_size=5)

            def df_func(c):
                print('df func')
                return c

            df2 = df.map_chunk(df_func)
            df2.execute(session=sess)
            log = df2.fetch_log()
            self.assertIn('Chunk op key:', str(log))
            self.assertIn('df func', repr(log))
            self.assertEqual(len(str(df.fetch_log(session=sess))), 0)

            def test_host(rndf):
                rm = spawn(nested, rndf)
                rm.execute()
                print(rm.fetch_log())

            def nested(_rndf):
                print('log_content')

            ds = [spawn(test_host, n, retry_when_fail=False)
                  for n in np.random.rand(4)]
            xtp = ExecutableTuple(ds)
            xtp.execute(session=sess)
            for log in xtp.fetch_log(session=sess):
                self.assertEqual(str(log).strip(), 'log_content')

            def test_threaded():
                import threading

                exc_info = None

                def print_fun():
                    nonlocal exc_info
                    try:
                        print('inner')
                    except:  # noqa: E722  # nosec  # pylint: disable=bare-except
                        exc_info = sys.exc_info()

                print_thread = threading.Thread(target=print_fun)
                print_thread.start()
                print_thread.join()

                if exc_info is not None:
                    raise exc_info[1].with_traceback(exc_info[-1])

                print('after')

            rm = spawn(test_threaded)
            rm.execute(session=sess)
            logs = str(rm.fetch_log(session=sess)).strip()
            self.assertEqual(logs, 'inner\nafter')
Example #19
    def testAppendExecution(self):
        # context-backed executor; the ignore_index cases below run through it
        executor = ExecutorForTest(storage=new_session().context)

        df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        df2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=3)

        adf = mdf1.append(mdf2)
        expected = df1.append(df2)
        result = self.executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        adf = mdf1.append(mdf2, ignore_index=True)
        expected = df1.append(df2, ignore_index=True)
        result = executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=2)

        adf = mdf1.append(mdf2)
        expected = df1.append(df2)
        result = self.executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        adf = mdf1.append(mdf2, ignore_index=True)
        expected = df1.append(df2, ignore_index=True)
        result = executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        df3 = pd.DataFrame(np.random.rand(8, 4), columns=list('ABCD'))
        mdf3 = from_pandas(df3, chunk_size=3)
        expected = df1.append([df2, df3])
        adf = mdf1.append([mdf2, mdf3])
        result = self.executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        adf = mdf1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
        expected = df1.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
        result = executor.execute_dataframe(adf, concat=True)[0]
        pd.testing.assert_frame_equal(expected, result)

        # test for series
        series1 = pd.Series(np.random.rand(10, ))
        series2 = pd.Series(np.random.rand(10, ))

        mseries1 = series_from_pandas(series1, chunk_size=3)
        mseries2 = series_from_pandas(series2, chunk_size=3)

        aseries = mseries1.append(mseries2)
        expected = series1.append(series2)
        result = self.executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        aseries = mseries1.append(mseries2, ignore_index=True)
        expected = series1.append(series2, ignore_index=True)
        result = executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        mseries1 = series_from_pandas(series1, chunk_size=3)
        mseries2 = series_from_pandas(series2, chunk_size=2)

        aseries = mseries1.append(mseries2)
        expected = series1.append(series2)
        result = self.executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        aseries = mseries1.append(mseries2, ignore_index=True)
        expected = series1.append(series2, ignore_index=True)
        result = executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)

        series3 = pd.Series(np.random.rand(4, ))
        mseries3 = series_from_pandas(series3, chunk_size=2)
        expected = series1.append([series2, series3])
        aseries = mseries1.append([mseries2, mseries3])
        result = self.executor.execute_dataframe(aseries, concat=True)[0]
        pd.testing.assert_series_equal(expected, result)
Example No. 20
    def testWebApi(self):
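        # End-to-end exercise of the web session: run and resubmit graphs,
        # fetch results with local LZ4 decompression disabled, then probe the
        # scheduler, worker and task pages of the web UI.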
        service_ep = 'http://127.0.0.1:' + self.web_port
        timeout = 120 if 'CI' in os.environ else -1
        with new_session(service_ep) as sess:
            self.assertEqual(sess.count_workers(), 1)
            a = mt.ones((100, 100), chunk_size=30)
            b = mt.ones((100, 100), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            assert_array_equal(value, np.ones((100, 100)) * 100)

            # check resubmission
            value2 = sess.run(c, timeout=timeout)
            assert_array_equal(value, value2)

            # check when local compression libs are missing
            from mars.serialize import dataserializer
            try:
                a = mt.ones((10, 10), chunk_size=30)
                b = mt.ones((10, 10), chunk_size=30)
                c = a.dot(b)
                value = sess.run(c, timeout=timeout)
                assert_array_equal(value, np.ones((10, 10)) * 10)

                dataserializer.decompressors[
                    dataserializer.CompressType.LZ4] = None
                dataserializer.decompressobjs[
                    dataserializer.CompressType.LZ4] = None
                dataserializer.compress_openers[
                    dataserializer.CompressType.LZ4] = None

                assert_array_equal(sess.fetch(c), np.ones((10, 10)) * 10)
            finally:
                dataserializer.decompressors[
                    dataserializer.CompressType.
                    LZ4] = dataserializer.lz4_decompress
                dataserializer.decompressobjs[
                    dataserializer.CompressType.
                    LZ4] = dataserializer.lz4_decompressobj
                dataserializer.compress_openers[
                    dataserializer.CompressType.LZ4] = dataserializer.lz4_open

            va = np.random.randint(0, 10000, (100, 100))
            vb = np.random.randint(0, 10000, (100, 100))
            a = mt.array(va, chunk_size=30)
            b = mt.array(vb, chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            assert_array_equal(value, va.dot(vb))

            graphs = sess.get_graph_states()

            # check web UI requests
            res = requests.get(service_ep)
            self.assertEqual(res.status_code, 200)

            res = requests.get('%s/task' % (service_ep, ))
            self.assertEqual(res.status_code, 200)

            res = requests.get('%s/scheduler' % (service_ep, ))
            self.assertEqual(res.status_code, 200)
            res = requests.get('%s/scheduler?endpoint=127.0.0.1:%s' %
                               (service_ep, self.scheduler_port))
            self.assertEqual(res.status_code, 200)

            res = requests.get('%s/worker' % (service_ep, ))
            self.assertEqual(res.status_code, 200)
            res = requests.get('%s/worker?endpoint=127.0.0.1:%s' %
                               (service_ep, self.worker_port))
            self.assertEqual(res.status_code, 200)

            res = requests.get('%s/task' % (service_ep, ))
            self.assertEqual(res.status_code, 200)
            task_id = next(iter(graphs.keys()))
            res = requests.get('%s/task?session_id=%s&task_id=%s' %
                               (service_ep, sess._session_id, task_id))
            self.assertEqual(res.status_code, 200)
Example No. 21
    def restart_session(self):
        self._mars_session.close()
        self._mars_session = new_session(
            self._endpoint, req_session=self._req_session).as_default()
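The method above is only a fragment. As a minimal sketch, assuming a hypothetical MarsWebClient wrapper that owns the endpoint and the HTTP session (none of these names appear in the original), it might sit in a class like this:

import requests

from mars.session import new_session


class MarsWebClient:
    """Hypothetical owner of a Mars web session (sketch only)."""

    def __init__(self, endpoint):
        self._endpoint = endpoint
        self._req_session = requests.Session()
        self._mars_session = new_session(
            endpoint, req_session=self._req_session).as_default()

    def restart_session(self):
        # same logic as the fragment above: close the old session and
        # register a fresh one as the default
        self._mars_session.close()
        self._mars_session = new_session(
            self._endpoint, req_session=self._req_session).as_default()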
Example No. 22
    def testRemoteWithoutEtcd(self):
        from mars.scheduler.resource import ResourceActor
        from mars.worker.dispatcher import DispatchActor
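        # Spin up a cluster without etcd and check spawned remote functions:
        # a function returning None, nested Mars object arguments, a function
        # that executes tensors itself, and worker CPU slots freed afterwards.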

        self.start_processes(
            etcd=False,
            modules=['mars.scheduler.tests.integrated.no_prepare_op'])
        sess = new_session(self.session_manager_ref.address)
        resource_ref = sess._api.actor_client.actor_ref(
            ResourceActor.default_uid(),
            address=self.cluster_info.get_scheduler(
                ResourceActor.default_uid()))
        worker_ips = resource_ref.get_worker_endpoints()

        rs = np.random.RandomState(0)
        raw1 = rs.rand(10, 10)
        raw2 = rs.rand(10, 10)

        def f_none(_x):
            return None

        r_none = spawn(f_none, raw1)
        result = r_none.execute(session=sess,
                                timeout=self.timeout).fetch(session=sess)
        self.assertIsNone(result)

        def f1(x):
            return x + 1

        def f2(x, y, z=None):
            return x * y * (z[0] + z[1])

        r1 = spawn(f1, raw1)
        r2 = spawn(f1, raw2)
        r3 = spawn(f2, (r1, r2), {'z': [r1, r2]})
        result = r3.execute(session=sess,
                            timeout=self.timeout).fetch(session=sess)
        expected = (raw1 + 1) * (raw2 + 1) * (raw1 + 1 + raw2 + 1)
        np.testing.assert_allclose(result, expected)

        def f(t, x):
            mul = (t * x).execute()
            return mul.sum().to_numpy()

        rs = np.random.RandomState(0)
        raw = rs.rand(5, 4)

        t1 = mt.tensor(raw, chunk_size=3)
        t2 = t1.sum(axis=0)
        s = spawn(f, args=(t2, 3))

        result = s.execute(session=sess,
                           timeout=self.timeout).fetch(session=sess)
        expected = (raw.sum(axis=0) * 3).sum()
        self.assertAlmostEqual(result, expected)

        time.sleep(1)
        for worker_ip in worker_ips:
            ref = sess._api.actor_client.actor_ref(DispatchActor.default_uid(),
                                                   address=worker_ip)
            self.assertEqual(len(ref.get_slots('cpu')), 1)
Example No. 23
    def testFetch(self):
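        # Chunk results are cached per chunk key: patching a stored chunk
        # changes what later runs and fetches observe, and several tensors
        # can be run and fetched in a single call.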
        sess = new_session()

        arr1 = mt.ones((10, 5), chunk_size=3)

        r1 = sess.run(arr1)
        r2 = sess.run(arr1)
        np.testing.assert_array_equal(r1, r2)

        executor = sess._sess._executor
        executor.chunk_result[get_tiled(arr1).chunks[0].key] = np.ones(
            (3, 3)) * 2
        r3 = sess.run(arr1 + 1)
        np.testing.assert_array_equal(r3[:3, :3], np.ones((3, 3)) * 3)

        # rerun to ensure arr1's chunk results still exist
        r4 = sess.run(arr1 + 1)
        np.testing.assert_array_equal(r4[:3, :3], np.ones((3, 3)) * 3)

        arr2 = mt.ones((10, 5), chunk_size=3)
        r5 = sess.run(arr2)
        np.testing.assert_array_equal(r5[:3, :3], np.ones((3, 3)) * 2)

        r6 = sess.run(arr2 + 1)
        np.testing.assert_array_equal(r6[:3, :3], np.ones((3, 3)) * 3)

        df = md.DataFrame(np.random.rand(10, 2), columns=list('ab'))
        s = df['a'].map(lambda x: np.ones((3, 3)), dtype='object').sum()

        np.testing.assert_array_equal(s.execute().fetch(),
                                      np.ones((3, 3)) * 10)

        # test fetch multiple tensors
        raw = np.random.rand(5, 10)
        arr1 = mt.ones((5, 10), chunk_size=5)
        arr2 = mt.tensor(raw, chunk_size=3)
        arr3 = mt.sum(arr2)

        sess.run(arr1, arr2, arr3)

        fetch1, fetch2, fetch3 = sess.fetch(arr1, arr2, arr3)
        np.testing.assert_array_equal(fetch1, np.ones((5, 10)))
        np.testing.assert_array_equal(fetch2, raw)
        np.testing.assert_almost_equal(fetch3, raw.sum())

        fetch1, fetch2, fetch3 = sess.fetch([arr1, arr2, arr3])
        np.testing.assert_array_equal(fetch1, np.ones((5, 10)))
        np.testing.assert_array_equal(fetch2, raw)
        np.testing.assert_almost_equal(fetch3, raw.sum())

        raw = np.random.rand(5, 10)
        arr = mt.tensor(raw, chunk_size=5)
        s = arr.sum()

        self.assertAlmostEqual(s.execute().fetch(), raw.sum())

        def _execute_ds(*_):  # pragma: no cover
            raise ValueError('cannot run random again')

        try:
            register(ArrayDataSource, _execute_ds)

            self.assertAlmostEqual(s.fetch(), raw.sum())
        finally:
            del Executor._op_runners[ArrayDataSource]
Example No. 24
    def testSortValuesExecution(self):
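        # Run every case with the PSRS distinct-column switch both off and
        # on (Windows only runs with it off).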
        distinct_opts = ['0'] if sys.platform.lower().startswith('win') else ['0', '1']
        for add_distinct in distinct_opts:
            os.environ['PSRS_DISTINCT_COL'] = add_distinct
            df = pd.DataFrame(np.random.rand(100, 10),
                              columns=['a' + str(i) for i in range(10)])

            # test one chunk
            mdf = DataFrame(df)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a6', 'a7'], ascending=False),
                                                     concat=True)[0]
            expected = df.sort_values(['a6', 'a7'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            # test psrs
            mdf = DataFrame(df, chunk_size=10)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a3', 'a4']),
                                                     concat=True)[0]
            expected = df.sort_values(['a3', 'a4'])

            pd.testing.assert_frame_equal(result, expected)

            # test ascending=False
            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a0', 'a1'], ascending=False),
                                                     concat=True)[0]
            expected = df.sort_values(['a0', 'a1'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a7'], ascending=False),
                                                     concat=True)[0]
            expected = df.sort_values(['a7'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            # test multiindex
            df2 = df.copy(deep=True)
            df2.columns = pd.MultiIndex.from_product(
                [list('AB'), list('CDEFG')])
            mdf = DataFrame(df2, chunk_size=10)

            result = self.executor.execute_dataframe(
                mdf.sort_values([('A', 'C')]), concat=True)[0]
            expected = df2.sort_values([('A', 'C')])

            pd.testing.assert_frame_equal(result, expected)

            # test rechunk
            mdf = DataFrame(df, chunk_size=3)
            result = self.executor.execute_dataframe(mdf.sort_values('a0'),
                                                     concat=True)[0]
            expected = df.sort_values('a0')

            pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a3', 'a4']),
                                                     concat=True)[0]
            expected = df.sort_values(['a3', 'a4'])

            pd.testing.assert_frame_equal(result, expected)

            # test other types
            raw = pd.DataFrame(
                {
                    'a': np.random.rand(10),
                    'b': np.random.randint(1000, size=10),
                    'c': np.random.rand(10),
                    'd': [np.random.bytes(10) for _ in range(10)],
                    'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                    'f': [pd.Timedelta(f'{i} days') for i in range(10)]
                }, )
            mdf = DataFrame(raw, chunk_size=3)

            for label in raw.columns:
                result = self.executor.execute_dataframe(
                    mdf.sort_values(label), concat=True)[0]
                expected = raw.sort_values(label)
                pd.testing.assert_frame_equal(result, expected)

            result = self.executor.execute_dataframe(mdf.sort_values(
                ['a', 'b', 'e'], ascending=False),
                                                     concat=True)[0]
            expected = raw.sort_values(['a', 'b', 'e'], ascending=False)

            pd.testing.assert_frame_equal(result, expected)

            # test nan
            df = pd.DataFrame({
                'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
                'col2': [2, 1, 9, np.nan, 7, 4],
                'col3': [0, 1, 9, 4, 2, 3],
            })
            mdf = DataFrame(df)
            result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                                     concat=True)[0]
            expected = df.sort_values(['col2'])

            pd.testing.assert_frame_equal(result, expected)

            mdf = DataFrame(df, chunk_size=3)
            result = self.executor.execute_dataframe(mdf.sort_values(['col2']),
                                                     concat=True)[0]
            expected = df.sort_values(['col2'])

            pd.testing.assert_frame_equal(result, expected)

            # test None (issue #1885)
            df = pd.DataFrame(np.random.rand(1000, 10))

            df[0][df[0] < 0.5] = 'A'
            df[0][df[0] != 'A'] = None

            mdf = DataFrame(df)
            result = self.executor.execute_dataframe(mdf.sort_values([0, 1]),
                                                     concat=True)[0]
            expected = df.sort_values([0, 1])

            pd.testing.assert_frame_equal(result, expected)

            mdf = DataFrame(df, chunk_size=100)
            result = self.executor.execute_dataframe(mdf.sort_values([0, 1]),
                                                     concat=True)[0]
            expected = df.sort_values([0, 1])

            pd.testing.assert_frame_equal(result, expected)

            # test ignore_index
            executor = ExecutorForTest(storage=new_session().context)

            df = pd.DataFrame(np.random.rand(10, 3),
                              columns=['a' + str(i) for i in range(3)])

            mdf = DataFrame(df, chunk_size=3)
            result = executor.execute_dataframe(mdf.sort_values(
                ['a0', 'a1'], ignore_index=True),
                                                concat=True)[0]
            try:  # ignore_index requires pandas >= 1.0
                expected = df.sort_values(['a0', 'a1'], ignore_index=True)
            except TypeError:
                expected = df.sort_values(['a0', 'a1'])
                expected.index = pd.RangeIndex(len(expected))

            pd.testing.assert_frame_equal(result, expected)

            # test inplace
            mdf = DataFrame(df)
            mdf.sort_values('a0', inplace=True)
            result = self.executor.execute_dataframe(mdf, concat=True)[0]
            df.sort_values('a0', inplace=True)

            pd.testing.assert_frame_equal(result, df)

            # test unknown shape
            df = pd.DataFrame({
                'a': list(range(10)),
                'b': np.random.random(10)
            })
            mdf = DataFrame(df, chunk_size=4)
            filtered = mdf[mdf['a'] > 2]
            result = self.executor.execute_dataframe(
                filtered.sort_values(by='b'), concat=True)[0]

            pd.testing.assert_frame_equal(result,
                                          df[df['a'] > 2].sort_values(by='b'))

            # test empty dataframe
            df = pd.DataFrame({
                'a': list(range(10)),
                'b': np.random.random(10)
            })
            mdf = DataFrame(df, chunk_size=4)
            filtered = mdf[mdf['b'] > 100]
            result = self.executor.execute_dataframe(
                filtered.sort_values(by='b'), concat=True)[0]

            pd.testing.assert_frame_equal(
                result, df[df['b'] > 100].sort_values(by='b'))

            # test chunks with zero length
            df = pd.DataFrame({
                'a': list(range(10)),
                'b': np.random.random(10)
            })
            df.iloc[4:8, 1] = 0

            mdf = DataFrame(df, chunk_size=4)
            filtered = mdf[mdf['b'] != 0]
            result = self.executor.execute_dataframe(
                filtered.sort_values(by='b'), concat=True)[0]

            pd.testing.assert_frame_equal(result,
                                          df[df['b'] != 0].sort_values(by='b'))

            # test Series.sort_values
            raw = pd.Series(np.random.rand(10))
            series = Series(raw)
            result = self.executor.execute_dataframe(series.sort_values(),
                                                     concat=True)[0]
            expected = raw.sort_values()

            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=3)
            result = self.executor.execute_dataframe(series.sort_values(),
                                                     concat=True)[0]
            expected = raw.sort_values()

            pd.testing.assert_series_equal(result, expected)

            series = Series(raw, chunk_size=2)
            result = self.executor.execute_dataframe(
                series.sort_values(ascending=False), concat=True)[0]
            expected = raw.sort_values(ascending=False)

            pd.testing.assert_series_equal(result, expected)
Example No. 25
    def testFetchLogWithoutEtcd(self):
        # test fetch log
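        # A custom log dir is passed to the scheduler so stdout of spawned
        # functions can be located on disk and fetched back with fetch_log,
        # including sliced reads via offsets/sizes and nested spawns.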
        with tempfile.TemporaryDirectory() as temp_dir:
            self.start_processes(etcd=False, modules=['mars.scheduler.tests.integrated.no_prepare_op'],
                                 scheduler_args=[f'-Dcustom_log_dir={temp_dir}'])
            sess = new_session(self.session_manager_ref.address)

            def f():
                print('test')

            r = spawn(f)
            r.execute(session=sess)

            custom_log_actor = sess._api.actor_client.actor_ref(
                CustomLogMetaActor.default_uid(),
                address=self.cluster_info.get_scheduler(CustomLogMetaActor.default_uid())
            )

            chunk_key_to_log_path = custom_log_actor.get_tileable_op_log_paths(
                sess.session_id, r.op.key)
            paths = list(chunk_key_to_log_path.values())
            self.assertEqual(len(paths), 1)
            log_path = paths[0][1]
            with open(log_path) as f:
                self.assertEqual(f.read().strip(), 'test')

            context = DistributedContext(scheduler_address=self.session_manager_ref.address,
                                         session_id=sess.session_id)
            log_result = context.fetch_tileable_op_logs(r.op.key)
            log = next(iter(log_result.values()))['log']
            self.assertEqual(log.strip(), 'test')

            log = r.fetch_log()
            self.assertEqual(str(log).strip(), 'test')

            # test multiple functions
            def f1(size):
                print('f1' * size)

            fs = ExecutableTuple([spawn(f1, 30), spawn(f1, 40)])
            fs.execute(session=sess)
            log = fs.fetch_log(offsets=20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30)[20:30])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40)[20:30])
            self.assertGreater(len(log[0].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets
            log = fs.fetch_log(offsets=-20, sizes=10)
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            # test negative offsets which represented in string
            log = fs.fetch_log(offsets='-0.02K', sizes='0.01K')
            self.assertEqual(str(log[0]).strip(), ('f1' * 30 + '\n')[-20:-10])
            self.assertEqual(str(log[1]).strip(), ('f1' * 40 + '\n')[-20:-10])
            self.assertTrue(all(s > 0 for s in log[0].offsets))
            self.assertGreater(len(log[1].offsets), 0)
            self.assertTrue(all(s > 0 for s in log[1].offsets))
            self.assertGreater(len(log[0].chunk_op_keys), 0)

            def test_nested():
                print('level0')
                fr = spawn(f1, 1)
                fr.execute()
                print(fr.fetch_log())

            r = spawn(test_nested)
            with self.assertRaises(ValueError):
                r.fetch_log()
            r.execute(session=sess)
            log = str(r.fetch_log())
            self.assertIn('level0', log)
            self.assertIn('f1', log)

            df = md.DataFrame(mt.random.rand(10, 3), chunk_size=5)

            def df_func(c):
                print('df func')
                return c

            df2 = df.map_chunk(df_func)
            df2.execute(session=sess)
            log = df2.fetch_log()
            self.assertIn('Chunk op key:', str(log))
            self.assertIn('df func', repr(log))
            self.assertEqual(len(str(df.fetch_log(session=sess))), 0)
Example No. 26
    def testSortIndexExecution(self):
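        # sort_index over rows and columns (axis=1), chunked and unchunked,
        # inplace, descending and with ignore_index, plus the Series variant.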
        raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

        mdf = DataFrame(raw)
        result = self.executor.execute_dataframe(mdf.sort_index(), concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw)
        mdf.sort_index(inplace=True)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=30)
        result = self.executor.execute_dataframe(mdf.sort_index(), concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=20)
        result = self.executor.execute_dataframe(mdf.sort_index(ascending=False), concat=True)[0]
        expected = raw.sort_index(ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        executor = ExecutorForTest(storage=new_session().context)

        mdf = DataFrame(raw, chunk_size=10)
        result = executor.execute_dataframe(mdf.sort_index(ignore_index=True), concat=True)[0]
        try:  # ignore_index requires pandas >= 1.0
            expected = raw.sort_index(ignore_index=True)
        except TypeError:
            expected = raw.sort_index()
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test axis=1
        raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))

        mdf = DataFrame(raw)
        result = self.executor.execute_dataframe(mdf.sort_index(axis=1), concat=True)[0]
        expected = raw.sort_index(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_index(axis=1), concat=True)[0]
        expected = raw.sort_index(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=4)
        result = self.executor.execute_dataframe(mdf.sort_index(axis=1, ascending=False), concat=True)[0]
        expected = raw.sort_index(axis=1, ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=4)
        executor = ExecutorForTest(storage=new_session().context)

        result = executor.execute_dataframe(mdf.sort_index(axis=1, ignore_index=True), concat=True)[0]
        try:  # ignore_index requires pandas >= 1.0
            expected = raw.sort_index(axis=1, ignore_index=True)
        except TypeError:
            expected = raw.sort_index(axis=1)
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test series
        raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

        series = Series(raw)
        result = self.executor.execute_dataframe(series.sort_index(), concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=2)
        result = self.executor.execute_dataframe(series.sort_index(), concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=3)
        result = self.executor.execute_dataframe(series.sort_index(ascending=False), concat=True)[0]
        expected = raw.sort_index(ascending=False)
        pd.testing.assert_series_equal(result, expected)
Example No. 27
    def testLocalTrainTensor(self):
        new_session().as_default()
        dtrain = MarsDMatrix(self.X, self.y)
        booster = train({}, dtrain, num_boost_round=2)
        self.assertIsInstance(booster, Booster)
Example No. 28
    def testSortValuesExecution(self):
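        # Condensed variant of the sort_values test above, without the
        # PSRS_DISTINCT_COL environment loop.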
        df = pd.DataFrame(np.random.rand(100, 10), columns=['a' + str(i) for i in range(10)])

        # test one chunk
        mdf = DataFrame(df)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'), concat=True)[0]
        expected = df.sort_values('a0')

        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(['a6', 'a7'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a6', 'a7'], ascending=False)

        pd.testing.assert_frame_equal(result, expected)

        # test psrs
        mdf = DataFrame(df, chunk_size=10)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'), concat=True)[0]
        expected = df.sort_values('a0')

        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(['a3', 'a4']), concat=True)[0]
        expected = df.sort_values(['a3', 'a4'])

        pd.testing.assert_frame_equal(result, expected)

        # test ascending=False
        result = self.executor.execute_dataframe(mdf.sort_values(['a0', 'a1'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a0', 'a1'], ascending=False)

        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(['a7'], ascending=False), concat=True)[0]
        expected = df.sort_values(['a7'], ascending=False)

        pd.testing.assert_frame_equal(result, expected)

        # test rechunk
        mdf = DataFrame(df, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_values('a0'), concat=True)[0]
        expected = df.sort_values('a0')

        pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(['a3', 'a4']), concat=True)[0]
        expected = df.sort_values(['a3', 'a4'])

        pd.testing.assert_frame_equal(result, expected)

        # test other types
        raw = pd.DataFrame({'a': np.random.rand(10),
                            'b': np.random.randint(1000, size=10),
                            'c': np.random.rand(10),
                            'd': [np.random.bytes(10) for _ in range(10)],
                            'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                            'f': [pd.Timedelta(f'{i} days') for i in range(10)]
                            },)
        mdf = DataFrame(raw, chunk_size=3)

        for label in raw.columns:
            result = self.executor.execute_dataframe(mdf.sort_values(label), concat=True)[0]
            expected = raw.sort_values(label)
            pd.testing.assert_frame_equal(result, expected)

        result = self.executor.execute_dataframe(mdf.sort_values(['a', 'b', 'e'], ascending=False), concat=True)[0]
        expected = raw.sort_values(['a', 'b', 'e'], ascending=False)

        pd.testing.assert_frame_equal(result, expected)

        # test nan
        df = pd.DataFrame({
            'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
            'col2': [2, 1, 9, np.nan, 7, 4],
            'col3': [0, 1, 9, 4, 2, 3],
        })
        mdf = DataFrame(df)
        result = self.executor.execute_dataframe(mdf.sort_values(['col2']), concat=True)[0]
        expected = df.sort_values(['col2'])

        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(df, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_values(['col2']), concat=True)[0]
        expected = df.sort_values(['col2'])

        pd.testing.assert_frame_equal(result, expected)

        # test ignore_index
        executor = ExecutorForTest(storage=new_session().context)

        df = pd.DataFrame(np.random.rand(10, 3), columns=['a' + str(i) for i in range(3)])

        mdf = DataFrame(df, chunk_size=3)
        result = executor.execute_dataframe(
            mdf.sort_values(['a0', 'a1'], ignore_index=True), concat=True)[0]
        try:  # ignore_index requires pandas >= 1.0
            expected = df.sort_values(['a0', 'a1'], ignore_index=True)
        except TypeError:
            expected = df.sort_values(['a0', 'a1'])
            expected.index = pd.RangeIndex(len(expected))

        pd.testing.assert_frame_equal(result, expected)

        # test inplace
        mdf = DataFrame(df)
        mdf.sort_values('a0', inplace=True)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        df.sort_values('a0', inplace=True)

        pd.testing.assert_frame_equal(result, df)

        # test unknown shape
        df = pd.DataFrame({'a': list(range(10)),
                           'b': np.random.random(10)})
        mdf = DataFrame(df, chunk_size=4)
        filtered = mdf[mdf['a'] > 2]
        result = self.executor.execute_dataframe(filtered.sort_values(by='b'), concat=True)[0]

        pd.testing.assert_frame_equal(result, df[df['a'] > 2].sort_values(by='b'))

        # test Series.sort_values
        raw = pd.Series(np.random.rand(10))
        series = Series(raw)
        result = self.executor.execute_dataframe(series.sort_values(), concat=True)[0]
        expected = raw.sort_values()

        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=3)
        result = self.executor.execute_dataframe(series.sort_values(), concat=True)[0]
        expected = raw.sort_values()

        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=2)
        result = self.executor.execute_dataframe(series.sort_values(ascending=False), concat=True)[0]
        expected = raw.sort_values(ascending=False)

        pd.testing.assert_series_equal(result, expected)
Example No. 29
    def testEagerMode(self, *_):
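        # With eager mode on, tensors and dataframes are executed as they are
        # created and can be compared directly; with it off, fetch() before
        # execute() must raise ValueError.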
        with new_cluster(scheduler_n_process=2,
                         worker_n_process=2,
                         shared_memory='20M',
                         web=True) as cluster:

            self.assertIsInstance(Session.default_or_local()._sess,
                                  LocalClusterSession)

            with option_context({'eager_mode': True}):
                a_data = np.random.rand(10, 10)

                a = mt.tensor(a_data, chunk_size=3)
                np.testing.assert_array_equal(a, a_data)

                r1 = a + 1
                expected1 = a_data + 1
                np.testing.assert_array_equal(r1, expected1)

                r2 = r1.dot(r1)
                expected2 = expected1.dot(expected1)
                np.testing.assert_array_almost_equal(r2, expected2)

            a = mt.ones((10, 10), chunk_size=3)
            with self.assertRaises(ValueError):
                a.fetch()

            r = a.dot(a)
            np.testing.assert_array_equal(r.execute(), np.ones((10, 10)) * 10)

            with new_session('http://' + cluster._web_endpoint).as_default():
                self.assertIsInstance(Session.default_or_local()._sess,
                                      WebSession)

                with option_context({'eager_mode': True}):
                    a_data = np.random.rand(10, 10)

                    a = mt.tensor(a_data, chunk_size=3)
                    np.testing.assert_array_equal(a, a_data)

                    r1 = a + 1
                    expected1 = a_data + 1
                    np.testing.assert_array_equal(r1, expected1)

                    r2 = r1.dot(r1)
                    expected2 = expected1.dot(expected1)
                    np.testing.assert_array_almost_equal(r2, expected2)

                    web_session = Session.default_or_local()._sess
                    self.assertEqual(web_session.get_task_count(), 3)

                a = mt.ones((10, 10), chunk_size=3)
                with self.assertRaises(ValueError):
                    a.fetch()

                r = a.dot(a)
                np.testing.assert_array_equal(r.execute(),
                                              np.ones((10, 10)) * 10)

            with new_session('http://' + cluster._web_endpoint).as_default():
                from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
                from mars.dataframe.datasource.series import from_pandas as from_pandas_series
                from mars.dataframe.arithmetic import add

                self.assertIsInstance(Session.default_or_local()._sess,
                                      WebSession)

                with option_context({'eager_mode': True}):
                    data1 = pd.DataFrame(
                        np.random.rand(10, 10),
                        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
                    df1 = from_pandas_df(data1, chunk_size=5)
                    pd.testing.assert_frame_equal(df1.fetch(), data1)

                    data2 = pd.DataFrame(
                        np.random.rand(10, 10),
                        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
                    df2 = from_pandas_df(data2, chunk_size=6)
                    pd.testing.assert_frame_equal(df2.fetch(), data2)

                    df3 = add(df1, df2)
                    pd.testing.assert_frame_equal(df3.fetch(), data1 + data2)

                    s1 = pd.Series(np.random.rand(10),
                                   index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
                    series1 = from_pandas_series(s1)
                    pd.testing.assert_series_equal(series1.fetch(), s1)

                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 4)
Example No. 30
    def testLocalClassifier(self):
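        # Full XGBClassifier round trip: fit with eval_set, predict and
        # predict_proba, sample weights in several container types, binary
        # labels, and error paths for bad weights and unknown kwargs.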
        new_session().as_default()

        X, y = self.X, self.y
        y = (y * 10).astype(mt.int32)
        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        classifier.fit(X, y, eval_set=[(X, y)])
        prediction = classifier.predict(X)

        self.assertEqual(prediction.ndim, 1)
        self.assertEqual(prediction.shape[0], len(self.X))

        history = classifier.evals_result()

        self.assertIsInstance(prediction, mt.Tensor)
        self.assertIsInstance(history, dict)

        self.assertEqual(list(history)[0], 'validation_0')
        self.assertEqual(list(history['validation_0'])[0], 'merror')
        self.assertEqual(len(history['validation_0']), 1)
        self.assertEqual(len(history['validation_0']['merror']), 2)

        prob = classifier.predict_proba(X)
        self.assertEqual(prob.shape, X.shape)

        # test dataframe
        X_df = self.X_df
        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        classifier.fit(X_df, y)
        prediction = classifier.predict(X_df)

        self.assertEqual(prediction.ndim, 1)
        self.assertEqual(prediction.shape[0], len(self.X))

        # test weight
        weights = [
            mt.random.rand(X.shape[0]),
            md.Series(mt.random.rand(X.shape[0])),
            md.DataFrame(mt.random.rand(X.shape[0]))
        ]
        y_df = md.DataFrame(self.y)
        for weight in weights:
            classifier = XGBClassifier(verbosity=1, n_estimators=2)
            classifier.fit(X, y_df, sample_weights=weight)
            prediction = classifier.predict(X)

            self.assertEqual(prediction.ndim, 1)
            self.assertEqual(prediction.shape[0], len(self.X))

        # should raise error if weight.ndim > 1
        with self.assertRaises(ValueError):
            XGBClassifier(verbosity=1, n_estimators=2).fit(
                X, y_df, sample_weights=mt.random.rand(1, 1))

        # test binary classifier
        new_y = (self.y > 0.5).astype(mt.int32)
        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        classifier.fit(X, new_y)
        prediction = classifier.predict(X)

        self.assertEqual(prediction.ndim, 1)
        self.assertEqual(prediction.shape[0], len(self.X))

        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        with self.assertRaises(TypeError):
            classifier.fit(X, y, wrong_param=1)
        classifier.fit(X, y)
        with self.assertRaises(TypeError):
            classifier.predict(X, wrong_param=1)