Example #1
    def testChunkBroadcast(self, *_):
        proc_count = 2
        endpoints = ['127.0.0.1:%d' % get_next_port() for _ in range(proc_count)]
        keys = []

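        # route requests for each chunk key to a fixed scheduler endpoint,
        # alternating endpoints by the key's position in ``keys``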
        def _mock_get_scheduler(key):
            return endpoints[keys.index(key[1]) % len(endpoints)]

        ChunkMetaActor.get_scheduler.side_effect = _mock_get_scheduler

        session_id = str(uuid.uuid4())
        with create_actor_pool(n_process=1, backend='gevent', address=endpoints[0]) as pool1:
            pool1.create_actor(ClusterInfoActor, endpoints, uid=ClusterInfoActor.default_name())
            pool1.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

            with create_actor_pool(n_process=1, backend='gevent', address=endpoints[1]) as pool2:
                pool2.create_actor(ClusterInfoActor, endpoints, uid=ClusterInfoActor.default_name())
                pool2.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

                client = new_client()
                ref1 = client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[0])
                ref2 = client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[1])
                local_ref1 = client.actor_ref(LocalChunkMetaActor.default_name(), address=endpoints[0])
                local_ref2 = client.actor_ref(LocalChunkMetaActor.default_name(), address=endpoints[1])

                key1 = str(uuid.uuid4())
                key2 = str(uuid.uuid4())
                keys = [key1, key2]

                ref1.set_chunk_broadcasts(session_id, key1, [endpoints[1]])
                ref1.set_chunk_size(session_id, key1, 512)
                ref1.set_chunk_shape(session_id, key1, (10,) * 2)
                ref1.add_worker(session_id, key1, 'abc')
                ref2.set_chunk_broadcasts(session_id, key2, [endpoints[0]])
                ref2.set_chunk_size(session_id, key2, 512)
                ref1.set_chunk_shape(session_id, key2, (10,) * 2)
                ref2.add_worker(session_id, key2, 'def')
                pool2.sleep(0.1)

                self.assertEqual(local_ref1.get_chunk_meta(session_id, key1).chunk_size, 512)
                self.assertEqual(local_ref1.get_chunk_meta(session_id, key1).chunk_shape, (10,) * 2)
                self.assertEqual(local_ref1.get_chunk_broadcasts(session_id, key1), [endpoints[1]])
                self.assertEqual(local_ref2.get_chunk_meta(session_id, key1).chunk_size, 512)
                self.assertEqual(local_ref2.get_chunk_meta(session_id, key1).chunk_shape, (10,) * 2)
                self.assertEqual(local_ref2.get_chunk_broadcasts(session_id, key2), [endpoints[0]])

                ref1.delete_meta(session_id, key1)
                pool2.sleep(0.1)

                self.assertIsNone(local_ref1.get_chunk_meta(session_id, key1))
                self.assertIsNone(local_ref2.get_chunk_meta(session_id, key1))
                self.assertIsNone(local_ref1.get_chunk_broadcasts(session_id, key1))

                local_ref1.remove_workers_in_session(session_id, ['def'])
                local_ref2.remove_workers_in_session(session_id, ['def'])
                pool2.sleep(0.1)

                self.assertIsNone(local_ref1.get_chunk_meta(session_id, key2))
                self.assertIsNone(local_ref2.get_chunk_meta(session_id, key2))
                self.assertIsNone(local_ref2.get_chunk_broadcasts(session_id, key2))
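The ``side_effect`` routing above can be reduced to a minimal, Mars-free sketch; the endpoints and keys below are illustrative placeholders, not part of the Mars API:

    from unittest import mock

    endpoints = ['127.0.0.1:9001', '127.0.0.1:9002']
    keys = ['key-a', 'key-b']

    # route (session_id, chunk_key) tuples by the chunk key's position in ``keys``
    get_scheduler = mock.Mock()
    get_scheduler.side_effect = \
        lambda key: endpoints[keys.index(key[1]) % len(endpoints)]

    assert get_scheduler(('session', 'key-a')) == endpoints[0]
    assert get_scheduler(('session', 'key-b')) == endpoints[1]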
Example #2
    def wait_scheduler_worker_start(self):
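        # temporarily make gevent treat all exceptions as non-fatal so that
        # transient errors in greenlets do not kill the hub while we poll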
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )

        actor_client = new_client()
        time.sleep(1)
        check_time = time.time()
        while True:
            try:
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_uid(),
                    address='127.0.0.1:' + self.scheduler_port)
                if actor_client.has_actor(resource_ref):
                    break
                else:
                    raise SystemError('Check ResourceActor timeout')
            except:  # noqa: E722
                if time.time() - check_time > 10:
                    raise
                time.sleep(0.1)

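        # the ResourceActor is reachable; now wait until a worker registers,
        # bailing out early if either child process has already exited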
        check_time = time.time()
        while not resource_ref.get_worker_count():
            if self.proc_scheduler.poll() is not None:
                raise SystemError('Scheduler not started. exit code %s' %
                                  self.proc_scheduler.poll())
            if self.proc_worker.poll() is not None:
                raise SystemError('Worker not started. exit code %s' %
                                  self.proc_worker.poll())
            if time.time() - check_time > 20:
                raise SystemError('Check worker count timeout')

            time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors
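The wait loops above follow one pattern: poll, tolerate transient errors, and give up after a deadline. Below is a minimal, standard-library-only sketch of that pattern; ``is_ready``, ``timeout`` and ``interval`` are illustrative names, not Mars API:

    import time

    def wait_until(is_ready, timeout=10.0, interval=0.1):
        """Poll ``is_ready()`` until it returns True or ``timeout`` elapses."""
        deadline = time.time() + timeout
        while True:
            try:
                if is_ready():
                    return
            except Exception:
                # tolerate transient errors while the service is still starting
                if time.time() > deadline:
                    raise
            if time.time() > deadline:
                raise SystemError('Wait for service start timeout')
            time.sleep(interval)

With such a helper, the first loop above would reduce to something like ``wait_until(lambda: actor_client.has_actor(resource_ref))``.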
Example #3
    def start(self):
        from mars.actors import new_client

        super(CupidSchedulerServiceMain, self).start()

        # create process helper on every process
        proc_helper_refs = []
        for proc_id in range(self.pool.cluster_info.n_process):
            uid = 's:%d:mars-process-helper' % proc_id
            actor_ref = self.pool.create_actor(
                CupidSchedulerProcessHelperActor, uid=uid)
            proc_helper_refs.append(actor_ref)

        cupid_scheduler_key, scheduler_keys = self.args.cupid_scheduler_key.split(';')

        if self.args.cupid_scheduler_key:
            self.write_cupid_service_info(cupid_scheduler_key)

        self.wait_for_all_ready(scheduler_keys)
        self.create_scheduler_discoverer()

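        # hand every process helper the environment variables prepared by the
        # Cupid context so each subprocess can open its own channel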
        actor_client = new_client()
        for proc_helper_actor in proc_helper_refs:
            envs = self.cupid_context.prepare_channel()
            proc_helper_ref = actor_client.actor_ref(proc_helper_actor)
            new_envs = dict((env.name, env.value) for env in envs)
            proc_helper_ref.start_channel(new_envs)
Example #4
    def start(self):
        from mars.actors import new_client
        from cupid import context

        self.cupid_context = context()
        self.read_cupid_service_info(self.args.cupid_scheduler_key)
        self.create_scheduler_discoverer()

        super(CupidWorkerServiceMain, self).start()

        actor_client = new_client()
        proc_helpers = self._service._process_helper_actors
        for proc_helper_actor in proc_helpers:
            envs = self.cupid_context.prepare_channel()
            proc_helper_ref = actor_client.actor_ref(proc_helper_actor)
            new_envs = dict((env.name, env.value) for env in envs)
            proc_helper_ref.start_channel(new_envs)
Example #5
    def start(self):
        from mars.actors import new_client
        from cupid import context

        self.cupid_context = context()
        self.read_cupid_service_info(self.args.cupid_scheduler_key)
        self.create_scheduler_discoverer()

        super(CupidWorkerServiceMain, self).start()

        actor_client = new_client()
        proc_helpers = self._service._process_helper_actors
        for proc_helper_actor in proc_helpers:
            logger.info('Start channel for subprocess %s.',
                        proc_helper_actor.uid)
            envs = self.cupid_context.prepare_channel()
            proc_helper_ref = actor_client.actor_ref(proc_helper_actor)
            new_envs = dict((env.name, env.value) for env in envs)
            proc_helper_ref.start_channel(new_envs)
        logger.info('All channel ready, upload worker status now.')
        self._service._status_ref.enable_status_upload(channel_ready=True,
                                                       _tell=True)
Example #6
    def _start_service(self):
        worker_port = self.worker_port = str(get_next_port())
        scheduler_port = self.scheduler_port = str(get_next_port())
        proc_worker = subprocess.Popen([sys.executable, '-m', 'mars.worker',
                                        '-a', '127.0.0.1',
                                        '-p', worker_port,
                                        '--cpu-procs', '2',
                                        '--cache-mem', '10m',
                                        '--schedulers', '127.0.0.1:' + scheduler_port,
                                        '--log-level', 'debug',
                                        '--log-format', 'WOR %(asctime)-15s %(message)s',
                                        '--ignore-avail-mem'])
        proc_scheduler = subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                                           '--nproc', '1',
                                           '-H', '127.0.0.1',
                                           '-p', scheduler_port,
                                           '-Dscheduler.default_cpu_usage=0',
                                           '--log-level', 'debug',
                                           '--log-format', 'SCH %(asctime)-15s %(message)s'])

        self.proc_worker = proc_worker
        self.proc_scheduler = proc_scheduler

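        # poll until the scheduler's ResourceActor responds, giving up after 10 seconds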
        actor_client = new_client()
        time.sleep(1)
        check_time = time.time()
        while True:
            try:
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_uid(), address='127.0.0.1:' + self.scheduler_port)
                if actor_client.has_actor(resource_ref):
                    break
                else:
                    raise SystemError('Check ResourceActor timeout')
            except:  # noqa: E722
                if time.time() - check_time > 10:
                    raise
                time.sleep(0.1)

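        # the scheduler is up; now wait for the worker to register, checking
        # that neither child process has exited prematurely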
        check_time = time.time()
        while not resource_ref.get_worker_count():
            if self.proc_scheduler.poll() is not None:
                raise SystemError('Scheduler not started. exit code %s' % self.proc_scheduler.poll())
            if self.proc_worker.poll() is not None:
                raise SystemError('Worker not started. exit code %s' % self.proc_worker.poll())
            if time.time() - check_time > 30:
                raise SystemError('Check worker count timeout')

            time.sleep(0.1)

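        # start the web service last and wait until its API endpoint answers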
        web_port = self.web_port = str(get_next_port())
        proc_web = subprocess.Popen([sys.executable, '-m', 'mars.web',
                                     '-H', '127.0.0.1',
                                     '--log-level', 'debug',
                                     '--log-format', 'WEB %(asctime)-15s %(message)s',
                                     '-p', web_port,
                                     '-s', '127.0.0.1:' + self.scheduler_port])
        self.proc_web = proc_web

        service_ep = 'http://127.0.0.1:' + self.web_port
        check_time = time.time()
        while True:
            if time.time() - check_time > 30:
                raise SystemError('Wait for service start timeout')
            try:
                resp = requests.get(service_ep + '/api', timeout=1)
            except (requests.ConnectionError, requests.Timeout):
                time.sleep(0.1)
                continue
            if resp.status_code >= 400:
                time.sleep(0.1)
                continue
            break
Example #7
    def testWebApi(self):
        service_ep = 'http://127.0.0.1:' + self.web_port
        timeout = 120 if 'CI' in os.environ else -1
        with new_session(service_ep) as sess:
            self.assertEqual(sess.count_workers(), 1)

            a = mt.ones((100, 100), chunk_size=30)
            b = mt.ones((100, 100), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            assert_array_equal(value, np.ones((100, 100)) * 100)

            # check resubmission
            value2 = sess.run(c, timeout=timeout)
            assert_array_equal(value, value2)

            # check when local compression libs are missing
            from mars.serialize import dataserializer
            try:
                a = mt.ones((10, 10), chunk_size=30)
                b = mt.ones((10, 10), chunk_size=30)
                c = a.dot(b)
                value = sess.run(c, timeout=timeout)
                assert_array_equal(value, np.ones((10, 10)) * 10)

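                # knock out all local LZ4 entry points so fetching the result
                # must succeed without LZ4 support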
                dataserializer.decompressors[dataserializer.CompressType.LZ4] = None
                dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = None
                dataserializer.compress_openers[dataserializer.CompressType.LZ4] = None

                assert_array_equal(sess.fetch(c), np.ones((10, 10)) * 10)
            finally:
                dataserializer.decompressors[dataserializer.CompressType.LZ4] = dataserializer.lz4_decompress
                dataserializer.decompressobjs[dataserializer.CompressType.LZ4] = dataserializer.lz4_decompressobj
                dataserializer.compress_openers[dataserializer.CompressType.LZ4] = dataserializer.lz4_open

            # check serialization by pickle
            try:
                sess._sess._serial_type = SerialType.PICKLE

                a = mt.ones((10, 10), chunk_size=30)
                b = mt.ones((10, 10), chunk_size=30)
                c = a.dot(b)
                value = sess.run(c, timeout=timeout)
                assert_array_equal(value, np.ones((10, 10)) * 10)
            finally:
                sess._sess._serial_type = SerialType.ARROW

            va = np.random.randint(0, 10000, (100, 100))
            vb = np.random.randint(0, 10000, (100, 100))
            a = mt.array(va, chunk_size=30)
            b = mt.array(vb, chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            assert_array_equal(value, va.dot(vb))

            graphs = sess.get_graph_states()

            # make sure status got uploaded
            time.sleep(1.5)

            # check web UI requests
            res = requests.get(service_ep)
            self.assertEqual(res.status_code, 200)

            res = requests.get('%s/scheduler' % (service_ep,))
            self.assertEqual(res.status_code, 200)
            res = requests.get('%s/scheduler/127.0.0.1:%s' % (service_ep, self.scheduler_port))
            self.assertEqual(res.status_code, 200)

            res = requests.get('%s/worker' % (service_ep,))
            self.assertEqual(res.status_code, 200)
            res = requests.get('%s/worker/127.0.0.1:%s' % (service_ep, self.worker_port))
            self.assertEqual(res.status_code, 200)
            res = requests.get('%s/worker/127.0.0.1:%s/timeline' % (service_ep, self.worker_port))
            self.assertEqual(res.status_code, 200)

            res = requests.get('%s/session' % (service_ep,))
            self.assertEqual(res.status_code, 200)
            task_id = next(iter(graphs.keys()))
            res = requests.get('%s/session/%s/graph/%s' % (service_ep, sess._session_id, task_id))
            self.assertEqual(res.status_code, 200)
            res = requests.get('%s/session/%s/graph/%s/running_nodes' % (service_ep, sess._session_id, task_id))
            self.assertEqual(res.status_code, 200)

            from mars.web.task_pages import PROGRESS_APP_NAME
            res = requests.get('%s/%s?session_id=%s&task_id=%s'
                               % (service_ep, PROGRESS_APP_NAME, sess._session_id, task_id))
            self.assertEqual(res.status_code, 200)

            from mars.web.worker_pages import TIMELINE_APP_NAME
            res = requests.get('%s/%s?endpoint=127.0.0.1:%s'
                               % (service_ep, TIMELINE_APP_NAME, self.worker_port))
            self.assertEqual(res.status_code, 200)

        # make sure all chunks freed when session quits
        from mars.worker.storage import StorageManagerActor
        actor_client = new_client()
        storage_manager_ref = actor_client.actor_ref(StorageManagerActor.default_uid(),
                                                     address='127.0.0.1:' + str(self.worker_port))
        self.assertSetEqual(set(storage_manager_ref.dump_keys()), set())
Example #8
    def testWebApi(self):
        service_ep = 'http://127.0.0.1:' + self.web_port
        timeout = 120 if 'CI' in os.environ else -1
        with new_session(service_ep) as sess:
            session_id = sess._session_id
            self.assertEqual(sess.count_workers(), 1)

            a = mt.ones((100, 100), chunk_size=30)
            b = mt.ones((100, 100), chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            np.testing.assert_array_equal(value, np.ones((100, 100)) * 100)

            # check resubmission
            value2 = sess.run(c, timeout=timeout)
            np.testing.assert_array_equal(value, value2)

            # check when local compression libs are missing
            from mars.serialize import dataserializer
            try:
                a = mt.ones((10, 10), chunk_size=30)
                b = mt.ones((10, 10), chunk_size=30)
                c = a.dot(b)
                value = sess.run(c, timeout=timeout)
                np.testing.assert_array_equal(value, np.ones((10, 10)) * 10)

                dataserializer.decompressors[
                    dataserializer.CompressType.LZ4] = None
                dataserializer.decompressobjs[
                    dataserializer.CompressType.LZ4] = None
                dataserializer.compress_openers[
                    dataserializer.CompressType.LZ4] = None

                np.testing.assert_array_equal(sess.fetch(c),
                                              np.ones((10, 10)) * 10)
            finally:
                dataserializer.decompressors[
                    dataserializer.CompressType.LZ4] = dataserializer.lz4_decompress
                dataserializer.decompressobjs[
                    dataserializer.CompressType.LZ4] = dataserializer.lz4_decompressobj
                dataserializer.compress_openers[
                    dataserializer.CompressType.LZ4] = dataserializer.lz4_open

            # check serialization by pickle
            try:
                sess._sess._serial_type = SerialType.PICKLE

                a = mt.ones((10, 10), chunk_size=30)
                b = mt.ones((10, 10), chunk_size=30)
                c = a.dot(b)
                value = sess.run(c, timeout=timeout)
                np.testing.assert_array_equal(value, np.ones((10, 10)) * 10)

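                # arrow_string data should come back as plain object dtype
                # when results are serialized with pickle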
                raw = pd.DataFrame(np.random.rand(10, 5),
                                   columns=list('ABCDE'),
                                   index=pd.RangeIndex(10, 0, -1))
                data = md.DataFrame(raw).astype({'E': 'arrow_string'})
                ret_data = data.execute(session=sess).fetch(session=sess)
                self.assertEqual(ret_data.dtypes['E'], np.dtype('O'))
                pd.testing.assert_frame_equal(ret_data.astype({'E': 'float'}),
                                              raw,
                                              check_less_precise=True)

                raw = pd.Series(np.random.rand(10),
                                index=pd.RangeIndex(10, 0, -1),
                                name='r')
                data = md.Series(raw).astype('Arrow[string]')
                ret_data = data.execute(session=sess).fetch(session=sess)
                self.assertEqual(ret_data.dtype, np.dtype('O'))
                pd.testing.assert_series_equal(ret_data.astype('float'), raw)
            finally:
                sess._sess._serial_type = SerialType.ARROW

            va = np.random.randint(0, 10000, (100, 100))
            vb = np.random.randint(0, 10000, (100, 100))
            a = mt.array(va, chunk_size=30)
            b = mt.array(vb, chunk_size=30)
            c = a.dot(b)
            value = sess.run(c, timeout=timeout)
            np.testing.assert_array_equal(value, va.dot(vb))

            # test fetch log
            def f():
                print('test')

            r = mr.spawn(f).execute(session=sess, timeout=timeout)
            self.assertEqual(str(r.fetch_log()).strip(), 'test')
            self.assertEqual(str(r.fetch_log(offsets=0)).strip(), 'test')
            self.assertEqual(str(r.fetch_log()).strip(), '')
            self.assertEqual(
                str(r.fetch_log(offsets='-0.003k', sizes=2)).strip(), 'st')

            graphs = sess.get_graph_states()

            # make sure status got uploaded
            time.sleep(1.5)

            # check web UI requests
            res = requests.get(service_ep)
            self.assertEqual(res.status_code, 200)

            res = requests.get(f'{service_ep}/scheduler')
            self.assertEqual(res.status_code, 200)
            res = requests.get(
                f'{service_ep}/scheduler/127.0.0.1:{self.scheduler_port}')
            self.assertEqual(res.status_code, 200)

            res = requests.get(f'{service_ep}/worker')
            self.assertEqual(res.status_code, 200)
            res = requests.get(
                f'{service_ep}/worker/127.0.0.1:{self.worker_port}')
            self.assertEqual(res.status_code, 200)
            res = requests.get(
                f'{service_ep}/worker/127.0.0.1:{self.worker_port}/timeline')
            self.assertEqual(res.status_code, 200)

            res = requests.get(f'{service_ep}/session')
            self.assertEqual(res.status_code, 200)
            task_id = next(iter(graphs.keys()))
            res = requests.get(
                f'{service_ep}/session/{session_id}/graph/{task_id}')
            self.assertEqual(res.status_code, 200)
            res = requests.get(
                f'{service_ep}/session/{session_id}/graph/{task_id}/running_nodes'
            )
            self.assertEqual(res.status_code, 200)

            from mars.web.task_pages import PROGRESS_APP_NAME
            res = requests.get(
                f'{service_ep}/{PROGRESS_APP_NAME}?session_id={session_id}&task_id={task_id}'
            )
            self.assertEqual(res.status_code, 200)

            from mars.web.worker_pages import TIMELINE_APP_NAME
            res = requests.get(
                f'{service_ep}/{TIMELINE_APP_NAME}?endpoint=127.0.0.1:{self.worker_port}'
            )
            self.assertEqual(res.status_code, 200)

        # make sure all chunks freed when session quits
        from mars.worker.storage import StorageManagerActor
        actor_client = new_client()
        storage_manager_ref = actor_client.actor_ref(
            StorageManagerActor.default_uid(),
            address='127.0.0.1:' + str(self.worker_port))
        self.assertSetEqual(set(storage_manager_ref.dump_keys()), set())
Example #9
    def testChunkMetaActors(self, *_):
        proc_count = 2
        endpoints = [
            '127.0.0.1:%d' % get_next_port() for _ in range(proc_count)
        ]
        keys = []

        def _mock_get_scheduler(key):
            return endpoints[keys.index(key[1]) % len(endpoints)]

        ChunkMetaClient.get_scheduler.side_effect = _mock_get_scheduler

        session1 = str(uuid.uuid4())
        session2 = str(uuid.uuid4())
        with create_actor_pool(n_process=1,
                               backend='gevent',
                               address=endpoints[0]) as pool1:
            cluster_info1 = pool1.create_actor(
                SchedulerClusterInfoActor,
                endpoints,
                uid=SchedulerClusterInfoActor.default_name())
            pool1.create_actor(ChunkMetaActor,
                               uid=ChunkMetaActor.default_name())

            with create_actor_pool(n_process=1,
                                   backend='gevent',
                                   address=endpoints[1]) as pool2:
                cluster_info2 = pool2.create_actor(
                    SchedulerClusterInfoActor,
                    endpoints,
                    uid=SchedulerClusterInfoActor.default_name())
                pool2.create_actor(ChunkMetaActor,
                                   uid=ChunkMetaActor.default_name())

                actor_client = new_client()
                client1 = ChunkMetaClient(
                    actor_client, actor_client.actor_ref(cluster_info1))
                client2 = ChunkMetaClient(
                    actor_client, actor_client.actor_ref(cluster_info2))

                loc_ref1 = actor_client.actor_ref(
                    ChunkMetaActor.default_name(), address=endpoints[0])
                loc_ref2 = actor_client.actor_ref(
                    ChunkMetaActor.default_name(), address=endpoints[1])

                key1 = (str(uuid.uuid4()), str(uuid.uuid4()))
                key2 = str(uuid.uuid4())
                key3 = str(uuid.uuid4())
                key4 = (str(uuid.uuid4()), str(uuid.uuid4()))
                key5 = str(uuid.uuid4())
                key6 = str(uuid.uuid4())
                keys = [key1, key2, key3, key4, key5, key6]
                client1.set_chunk_size(session1, key1, 512)
                client2.set_chunk_size(session1, key2, 1024)
                client2.set_chunk_size(session2, key3, 1024)

                self.assertEqual(client1.get_chunk_size(session1, key1), 512)
                self.assertEqual(client2.get_chunk_size(session1, key2), 1024)
                self.assertEqual(client1.get_chunk_size(session1, key2), 1024)
                self.assertEqual(client2.get_chunk_size(session1, key1), 512)

                self.assertListEqual(
                    client1.batch_get_chunk_size(session1, [key1, key2]),
                    [512, 1024])
                self.assertListEqual(
                    client2.batch_get_chunk_size(session1, [key1, key2]),
                    [512, 1024])

                client1.set_chunk_shape(session1, key1, (10, ))
                client2.set_chunk_shape(session1, key2, (10, ) * 2)
                client2.set_chunk_shape(session2, key3, (10, ) * 2)

                self.assertEqual(client1.get_chunk_shape(session1, key1),
                                 (10, ))
                self.assertEqual(client2.get_chunk_shape(session1, key2),
                                 (10, ) * 2)
                self.assertEqual(client1.get_chunk_shape(session1, key2),
                                 (10, ) * 2)
                self.assertEqual(client2.get_chunk_shape(session1, key1),
                                 (10, ))

                self.assertListEqual(
                    client1.batch_get_chunk_shape(session1, [key1, key2]),
                    [(10, ), (10, ) * 2])
                self.assertListEqual(
                    client2.batch_get_chunk_shape(session1, [key1, key2]),
                    [(10, ), (10, ) * 2])

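                # a third pool that only hosts cluster info can still resolve
                # chunk metadata through the shared scheduler endpoints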
                mock_endpoint = '127.0.0.1:%d' % get_next_port()
                with create_actor_pool(n_process=1,
                                       backend='gevent',
                                       address=mock_endpoint) as pool3:
                    cluster_info3 = pool3.create_actor(
                        SchedulerClusterInfoActor,
                        endpoints,
                        uid=SchedulerClusterInfoActor.default_name())
                    client3 = ChunkMetaClient(
                        actor_client, actor_client.actor_ref(cluster_info3))
                    self.assertListEqual(
                        client3.batch_get_chunk_shape(session1, [key1, key2]),
                        [(10, ), (10, ) * 2])

                client1.add_worker(session1, key1, 'abc')
                client1.add_worker(session1, key1, 'def')
                client2.add_worker(session1, key2, 'ghi')

                client1.add_worker(session2, key3, 'ghi')

                self.assertEqual(sorted(client1.get_workers(session1, key1)),
                                 sorted(('abc', 'def')))
                self.assertEqual(sorted(client2.get_workers(session1, key2)),
                                 sorted(('ghi', )))

                batch_result = client1.batch_get_workers(
                    session1, [key1, key2])
                self.assertEqual(sorted(batch_result[0]), sorted(
                    ('abc', 'def')))
                self.assertEqual(sorted(batch_result[1]), sorted(('ghi', )))

                affected = []
                for loc_ref in (loc_ref1, loc_ref2):
                    affected.extend(
                        loc_ref.remove_workers_in_session(session2, ['ghi']))
                self.assertEqual(affected, [key3])
                self.assertEqual(sorted(client1.get_workers(session1, key2)),
                                 sorted(('ghi', )))
                self.assertIsNone(client1.get_workers(session2, key3))

                client1.delete_meta(session1, key1)
                self.assertIsNone(client1.get_workers(session1, key1))
                self.assertIsNone(
                    client1.batch_get_chunk_size(session1, [key1, key2])[0])
                self.assertIsNone(
                    client1.batch_get_workers(session1, [key1, key2])[0])

                client2.batch_delete_meta(session1, [key1, key2])
                self.assertIsNone(client1.get_workers(session1, key2))
                self.assertIsNone(
                    client1.batch_get_chunk_size(session1, [key1, key2])[1])
                self.assertIsNone(
                    client1.batch_get_workers(session1, [key1, key2])[1])

                meta4 = WorkerMeta(chunk_size=512,
                                   chunk_shape=(10, ) * 2,
                                   workers=(endpoints[0], ))
                loc_ref2.batch_set_chunk_meta(session1, [key4], [meta4])
                self.assertEqual(
                    loc_ref2.get_chunk_meta(session1, key4).chunk_size, 512)
                self.assertEqual(
                    loc_ref2.get_chunk_meta(session1, key4).chunk_shape,
                    (10, ) * 2)

                meta5 = WorkerMeta(chunk_size=512,
                                   chunk_shape=(10, ) * 2,
                                   workers=(endpoints[0], ))
                meta6 = WorkerMeta(chunk_size=512,
                                   chunk_shape=(10, ) * 2,
                                   workers=(endpoints[0], ))
                client1.batch_set_chunk_meta(session1, [key5, key6],
                                             [meta5, meta6])
                self.assertEqual(
                    loc_ref1.get_chunk_meta(session1, key5).chunk_size, 512)
                self.assertEqual(
                    loc_ref2.get_chunk_meta(session1, key6).chunk_size, 512)
Example #10
    def testChunkBroadcast(self, *_):
        proc_count = 2
        endpoints = ['127.0.0.1:%d' % get_next_port() for _ in range(proc_count)]
        keys = []

        def _mock_get_scheduler(key):
            return endpoints[keys.index(key[1]) % len(endpoints)]

        ChunkMetaClient.get_scheduler.side_effect = _mock_get_scheduler

        session_id = str(uuid.uuid4())
        with create_actor_pool(n_process=1, backend='gevent', address=endpoints[0]) as pool1:
            cluster_info1 = pool1.create_actor(SchedulerClusterInfoActor, endpoints,
                                               uid=SchedulerClusterInfoActor.default_uid())
            pool1.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_uid())

            with create_actor_pool(n_process=1, backend='gevent', address=endpoints[1]) as pool2:
                cluster_info2 = pool2.create_actor(SchedulerClusterInfoActor, endpoints,
                                                   uid=SchedulerClusterInfoActor.default_uid())
                pool2.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_uid())

                actor_client = new_client()
                client1 = ChunkMetaClient(actor_client, actor_client.actor_ref(cluster_info1))
                client2 = ChunkMetaClient(actor_client, actor_client.actor_ref(cluster_info2))
                local_ref1 = actor_client.actor_ref(ChunkMetaActor.default_uid(), address=endpoints[0])
                local_ref2 = actor_client.actor_ref(ChunkMetaActor.default_uid(), address=endpoints[1])

                key1 = str(uuid.uuid4())
                key2 = str(uuid.uuid4())
                key3 = str(uuid.uuid4())
                keys = [key1, key2, key3]

                client1.set_chunk_broadcasts(session_id, key1, [endpoints[1]])
                client1.set_chunk_size(session_id, key1, 512)
                client1.set_chunk_shape(session_id, key1, (10,) * 2)
                client1.add_worker(session_id, key1, 'abc')
                client2.set_chunk_broadcasts(session_id, key2, [endpoints[0]])
                client2.set_chunk_size(session_id, key2, 512)
                client1.set_chunk_shape(session_id, key2, (10,) * 2)
                client2.add_worker(session_id, key2, 'def')
                pool2.sleep(0.1)

                self.assertEqual(local_ref1.get_chunk_meta(session_id, key1).chunk_size, 512)
                self.assertEqual(local_ref1.get_chunk_meta(session_id, key1).chunk_shape, (10,) * 2)
                self.assertEqual(local_ref1.get_chunk_broadcasts(session_id, key1), [endpoints[1]])
                self.assertEqual(local_ref2.get_chunk_meta(session_id, key1).chunk_size, 512)
                self.assertEqual(local_ref2.get_chunk_meta(session_id, key1).chunk_shape, (10,) * 2)
                self.assertEqual(local_ref2.get_chunk_broadcasts(session_id, key2), [endpoints[0]])

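                # broadcast targets registered in batch: metadata set on one
                # scheduler is replicated to the configured endpoints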
                client1.batch_set_chunk_broadcasts(session_id, [key3], [[endpoints[1]]])
                meta3 = WorkerMeta(chunk_size=512, chunk_shape=(10,) * 2, workers=(endpoints[0],))
                local_ref1.batch_set_chunk_meta(session_id, [key3], [meta3])
                self.assertEqual(local_ref2.get_chunk_meta(session_id, key3).chunk_size, 512)
                self.assertEqual(local_ref2.get_chunk_meta(session_id, key3).chunk_shape, (10,) * 2)

                client1.delete_meta(session_id, key1)
                pool2.sleep(0.1)

                self.assertIsNone(local_ref1.get_chunk_meta(session_id, key1))
                self.assertIsNone(local_ref2.get_chunk_meta(session_id, key1))
                self.assertIsNone(local_ref1.get_chunk_broadcasts(session_id, key1))

                local_ref1.remove_workers_in_session(session_id, ['def'])
                local_ref2.remove_workers_in_session(session_id, ['def'])
                pool2.sleep(0.1)

                self.assertIsNone(local_ref1.get_chunk_meta(session_id, key2))
                self.assertIsNone(local_ref2.get_chunk_meta(session_id, key2))
                self.assertIsNone(local_ref2.get_chunk_broadcasts(session_id, key2))
Example #11
    def setUp(self):
        scheduler_port = str(get_next_port())
        proc_worker = subprocess.Popen([
            sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1', '--level',
            'debug', '--cpu-procs', '2', '--cache-mem', '10m', '--schedulers',
            '127.0.0.1:' + scheduler_port, '--ignore-avail-mem'
        ])
        proc_scheduler = subprocess.Popen([
            sys.executable, '-m', 'mars.scheduler', '--nproc', '1', '--level',
            'debug', '-H', '127.0.0.1', '-p', scheduler_port, '--format',
            '%(asctime)-15s %(message)s'
        ])

        self.scheduler_port = scheduler_port
        self.proc_worker = proc_worker
        self.proc_scheduler = proc_scheduler

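        # wait for the scheduler's KVStoreActor to come up, retrying for 10 seconds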
        actor_client = new_client()
        time.sleep(2)
        check_time = time.time()
        while True:
            try:
                kv_ref = actor_client.actor_ref(
                    KVStoreActor.default_name(),
                    address='127.0.0.1:' + scheduler_port)
                if actor_client.has_actor(kv_ref):
                    break
                else:
                    raise SystemError('Check meta_timestamp timeout')
            except:  # noqa: E722
                if time.time() - check_time > 10:
                    raise
                time.sleep(1)

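        # the worker writes /workers/meta_timestamp when it registers; poll
        # the KV store until that key appears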
        check_time = time.time()
        while True:
            content = kv_ref.read('/workers/meta_timestamp', silent=True)
            if self.proc_scheduler.poll() is not None:
                raise SystemError('Scheduler not started. exit code %s' %
                                  self.proc_scheduler.poll())
            if self.proc_worker.poll() is not None:
                raise SystemError('Worker not started. exit code %s' %
                                  self.proc_worker.poll())
            if time.time() - check_time > 20:
                raise SystemError('Check meta_timestamp timeout')

            if not content:
                time.sleep(0.5)
            else:
                break

        web_port = str(get_next_port())
        self.web_port = web_port
        proc_web = subprocess.Popen([
            sys.executable, '-m', 'mars.web', '-H', '127.0.0.1', '--level',
            'debug', '--ui-port', web_port, '-s',
            '127.0.0.1:' + self.scheduler_port
        ])
        self.proc_web = proc_web

        service_ep = 'http://127.0.0.1:' + self.web_port
        check_time = time.time()
        while True:
            if time.time() - check_time > 30:
                raise SystemError('Wait for service start timeout')
            try:
                resp = requests.get(service_ep + '/api', timeout=1)
            except (requests.ConnectionError, requests.Timeout):
                time.sleep(1)
                continue
            if resp.status_code >= 400:
                time.sleep(1)
                continue
            break

        self.exceptions = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )
Example #12
    def testChunkMetaActors(self, *_):
        proc_count = 2
        endpoints = ['127.0.0.1:%d' % get_next_port() for _ in range(proc_count)]
        keys = []

        def _mock_get_scheduler(key):
            return endpoints[keys.index(key[1]) % len(endpoints)]

        ChunkMetaActor.get_scheduler.side_effect = _mock_get_scheduler

        session1 = str(uuid.uuid4())
        session2 = str(uuid.uuid4())
        with create_actor_pool(n_process=1, backend='gevent', address=endpoints[0]) as pool1:
            pool1.create_actor(ClusterInfoActor, endpoints, uid=ClusterInfoActor.default_name())
            pool1.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

            with create_actor_pool(n_process=1, backend='gevent', address=endpoints[1]) as pool2:
                pool2.create_actor(ClusterInfoActor, endpoints, uid=ClusterInfoActor.default_name())
                pool2.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

                client = new_client()
                ref1 = client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[0])
                ref2 = client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[1])

                loc_ref1 = client.actor_ref(LocalChunkMetaActor.default_name(), address=endpoints[0])
                loc_ref2 = client.actor_ref(LocalChunkMetaActor.default_name(), address=endpoints[1])

                key1 = str(uuid.uuid4())
                key2 = str(uuid.uuid4())
                key3 = str(uuid.uuid4())
                keys = [key1, key2, key3]
                ref1.set_chunk_size(session1, key1, 512)
                ref2.set_chunk_size(session1, key2, 1024)
                ref2.set_chunk_size(session2, key3, 1024)

                self.assertEqual(ref1.get_chunk_size(session1, key1), 512)
                self.assertEqual(ref2.get_chunk_size(session1, key2), 1024)
                self.assertEqual(ref1.get_chunk_size(session1, key2), 1024)
                self.assertEqual(ref2.get_chunk_size(session1, key1), 512)

                self.assertListEqual(ref1.batch_get_chunk_size(session1, [key1, key2]), [512, 1024])
                self.assertListEqual(ref2.batch_get_chunk_size(session1, [key1, key2]), [512, 1024])

                ref1.add_worker(session1, key1, 'abc')
                ref1.add_worker(session1, key1, 'def')
                ref2.add_worker(session1, key2, 'ghi')

                ref1.add_worker(session2, key3, 'ghi')

                self.assertEqual(sorted(ref1.get_workers(session1, key1)), sorted(('abc', 'def')))
                self.assertEqual(sorted(ref2.get_workers(session1, key2)), sorted(('ghi',)))

                batch_result = ref1.batch_get_workers(session1, [key1, key2])
                self.assertEqual(sorted(batch_result[0]), sorted(('abc', 'def')))
                self.assertEqual(sorted(batch_result[1]), sorted(('ghi',)))

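                # removing a worker from one session must not affect chunks the
                # same worker holds under a different session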
                affected = []
                for loc_ref in (loc_ref1, loc_ref2):
                    affected.extend(loc_ref.remove_workers_in_session(session2, ['ghi']))
                self.assertEqual(affected, [key3])
                self.assertEqual(sorted(ref1.get_workers(session1, key2)), sorted(('ghi',)))
                self.assertIsNone(ref1.get_workers(session2, key3))

                ref1.delete_meta(session1, key1)
                self.assertIsNone(ref1.get_workers(session1, key1))
                self.assertIsNone(ref1.batch_get_chunk_size(session1, [key1, key2])[0])
                self.assertIsNone(ref1.batch_get_workers(session1, [key1, key2])[0])

                ref2.batch_delete_meta(session1, [key1, key2])
                self.assertIsNone(ref1.get_workers(session1, key2))
                self.assertIsNone(ref1.batch_get_chunk_size(session1, [key1, key2])[1])
                self.assertIsNone(ref1.batch_get_workers(session1, [key1, key2])[1])