Example #1
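Starts a LocalDistributedCluster, verifies that the core scheduler actors (cluster info, session manager and dispatch) are registered in the actor pool, runs a small tensor computation through a new session, and checks that the session is removed after it closes.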
    def testLocalCluster(self, *_):
        endpoint = gen_endpoint('0.0.0.0')
        with LocalDistributedCluster(endpoint,
                                     scheduler_n_process=2,
                                     worker_n_process=3,
                                     shared_memory='20M') as cluster:
            pool = cluster.pool

            self.assertTrue(
                pool.has_actor(
                    pool.actor_ref(SchedulerClusterInfoActor.default_uid())))
            self.assertTrue(
                pool.has_actor(
                    pool.actor_ref(SessionManagerActor.default_uid())))
            self.assertTrue(
                pool.has_actor(pool.actor_ref(DispatchActor.default_uid())))

            with new_session(endpoint) as session:
                api = session._api

                t = mt.ones((3, 3), chunk_size=2)
                result = session.run(t, timeout=_exec_timeout)

                np.testing.assert_array_equal(result, np.ones((3, 3)))

            self.assertNotIn(session._session_id,
                             api.session_manager.get_sessions())
Example #2
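Simulates a worker failure: chunk metadata is registered for a mock worker, its heartbeat is expired so that ResourceActor.detect_dead_workers removes it, the graph actor reports the removed worker and lost chunks, and the worker is only re-admitted once the blacklist period has elapsed.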
    def testFailoverMessage(self):
        mock_session_id = str(uuid.uuid4())
        mock_graph_key = str(uuid.uuid4())
        mock_chunk_key = str(uuid.uuid4())
        addr = '127.0.0.1:%d' % get_next_port()
        mock_worker_addr = '127.0.0.1:54132'

        # use a short worker blacklist time so re-admission can be tested quickly
        options.scheduler.worker_blacklist_time = 0.5

        with create_actor_pool(n_process=1, backend='gevent',
                               address=addr) as pool:
            cluster_info_ref = pool.create_actor(
                SchedulerClusterInfoActor, [pool.cluster_info.address],
                uid=SchedulerClusterInfoActor.default_uid())
            session_manager_ref = pool.create_actor(
                SessionManagerActor, uid=SessionManagerActor.default_uid())
            resource_ref = pool.create_actor(ResourceActor,
                                             uid=ResourceActor.default_uid())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_uid())

            session_ref = pool.actor_ref(
                session_manager_ref.create_session(mock_session_id))
            chunk_meta_client = ChunkMetaClient(pool, cluster_info_ref)
            chunk_meta_client.set_chunk_meta(mock_session_id,
                                             mock_chunk_key,
                                             size=80,
                                             shape=(10, ),
                                             workers=(mock_worker_addr, ))

            with mock.patch(GraphActor.__module__ + '.' + GraphActor.__name__,
                            new=MockGraphActor):
                session_ref.submit_tileable_graph(None, mock_graph_key)
                graph_ref = pool.actor_ref(
                    GraphActor.gen_uid(mock_session_id, mock_graph_key))

                expire_time = time.time() - options.scheduler.status_timeout - 1
                resource_ref.set_worker_meta(mock_worker_addr,
                                             dict(update_time=expire_time))

                resource_ref.detect_dead_workers(_tell=True)
                pool.sleep(0.2)

                _, removes, lost_chunks = graph_ref.get_worker_change_args()
                self.assertListEqual(removes, [mock_worker_addr])
                self.assertListEqual(lost_chunks, [mock_chunk_key])

                self.assertNotIn(mock_worker_addr,
                                 resource_ref.get_workers_meta())
                resource_ref.set_worker_meta(mock_worker_addr,
                                             dict(update_time=time.time()))
                self.assertNotIn(mock_worker_addr,
                                 resource_ref.get_workers_meta())

                pool.sleep(0.4)
                resource_ref.set_worker_meta(mock_worker_addr,
                                             dict(update_time=time.time()))
                self.assertIn(mock_worker_addr,
                              resource_ref.get_workers_meta())
Example #3
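Test fixture that creates a single-process gevent actor pool, registers the cluster info, session manager and resource actors on it, and builds a MarsAPI client against the same endpoint.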
    def setUp(self):
        endpoint = '127.0.0.1:%d' % get_next_port()
        self.endpoint = endpoint
        self.pool = create_actor_pool(n_process=1, backend='gevent', address=endpoint)
        self.pool.create_actor(SchedulerClusterInfoActor, [endpoint],
                               uid=SchedulerClusterInfoActor.default_uid())
        self.pool.create_actor(SessionManagerActor, uid=SessionManagerActor.default_uid())
        self.pool.create_actor(ResourceActor, uid=ResourceActor.default_uid())

        self.api = MarsAPI(endpoint)
Example #4
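Starts a standalone worker subprocess against a mock scheduler pool, waits for it to register, yields the pool and worker endpoint to the caller, and terminates the worker and removes the plasma socket on exit.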
    def _start_worker_process(self,
                              cuda=False,
                              cuda_device=None,
                              extra_env=None,
                              modules=None,
                              check_timeout=None):
        mock_scheduler_addr = f'127.0.0.1:{get_next_port()}'
        proc = None
        try:
            with create_actor_pool(n_process=1,
                                   backend='gevent',
                                   address=mock_scheduler_addr) as pool:
                pool.create_actor(SchedulerClusterInfoActor,
                                  [mock_scheduler_addr],
                                  uid=SchedulerClusterInfoActor.default_uid())
                pool.create_actor(SessionManagerActor,
                                  uid=SessionManagerActor.default_uid())

                pool.create_actor(ChunkMetaActor,
                                  uid=ChunkMetaActor.default_uid())
                resource_ref = pool.create_actor(
                    ResourceActor, uid=ResourceActor.default_uid())

                args = [
                    sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                    '--schedulers', mock_scheduler_addr, '--cpu-procs', '1',
                    '--cache-mem', '10m', '--spill-dir', self._spill_dir,
                    '--log-level', 'debug', '--log-format',
                    '%(asctime)-15s %(message)s', '--ignore-avail-mem'
                ]
                if modules:
                    args.extend(['--load-modules', ','.join(modules)])
                env = os.environ.copy()
                env.update(extra_env or dict())
                if cuda:
                    env['CUDA_VISIBLE_DEVICES'] = cuda_device
                proc = subprocess.Popen(args, env=env)
                worker_endpoint = self._wait_worker_ready(
                    proc, resource_ref, timeout=check_timeout)

                yield pool, worker_endpoint
        finally:
            if proc is not None and proc.poll() is None:
                proc.send_signal(signal.SIGINT)
                check_time = time.time()
                while True:
                    time.sleep(0.1)
                    if proc.poll() is not None or time.time() - check_time >= 5:
                        break
                if proc.poll() is None:
                    proc.kill()
            if os.path.exists(options.worker.plasma_socket):
                os.unlink(options.worker.plasma_socket)
Example #5
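Launches scheduler and worker subprocesses (optionally backed by etcd), then polls the cluster info actor until the expected numbers of schedulers and workers have registered, giving up after 20 seconds.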
    def start_processes(self,
                        n_schedulers=2,
                        n_workers=2,
                        etcd=False,
                        cuda=False,
                        modules=None,
                        log_scheduler=True,
                        log_worker=True,
                        env=None):
        # temporarily stop the gevent hub from reporting exceptions as errors while subprocesses start
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception, )

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        append_args_scheduler = []
        append_args_worker = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(
                ['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args_scheduler += ['-Dscheduler.dump_graph_data=true']
        if not cuda:
            append_args_worker += ['--no-cuda']

        proc_env = os.environ.copy()
        if env:
            proc_env.update(env)

        self.proc_schedulers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.scheduler', '-H', '127.0.0.1',
                '-p', p, '--log-level',
                'debug' if log_scheduler else 'warning', '--log-format',
                'SCH%d %%(asctime)-15s %%(message)s' % idx,
                '-Dscheduler.retry_delay=5', '-Dscheduler.default_cpu_usage=0',
                '-Dscheduler.status_timeout=10'
            ] + append_args + append_args_scheduler,
                             env=proc_env)
            for idx, p in enumerate(scheduler_ports)
        ]
        cuda_count = resource.cuda_count()
        self.proc_workers = [
            subprocess.Popen([
                sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                '--cpu-procs', '1', '--log-level',
                'debug' if log_worker else 'warning', '--log-format',
                'WOR%d %%(asctime)-15s %%(message)s' % idx, '--cache-mem',
                '16m', '--ignore-avail-mem', '--cuda-device',
                str(idx % cuda_count) if cuda_count else '0',
                '-Dworker.prepare_data_timeout=30'
            ] + append_args + append_args_worker,
                             env=proc_env) for idx in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            SchedulerClusterInfoActor.default_uid(),
            address=self.scheduler_endpoints[0])

        # poll until all schedulers and workers have registered, giving up after 20 seconds
        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise ProcessRequirementUnmetError(
                        'Schedulers do not meet requirement: %d < %d.' %
                        (len(started_schedulers), n_schedulers))
                actor_address = self.cluster_info.get_scheduler(
                    SessionManagerActor.default_uid())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_uid(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(
                    ResourceActor.default_uid())
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_uid(), address=actor_address)

                if resource_ref.get_worker_count() < n_workers:
                    raise ProcessRequirementUnmetError(
                        'Workers do not meet requirement: %d < %d.' %
                        (resource_ref.get_worker_count(), n_workers))
                break
            except:
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors
Example #6
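A variant of the previous helper that accepts extra scheduler and worker arguments and a configurable per-worker CPU count, using the same registration polling loop.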
    def _start_processes(self, n_schedulers=2, n_workers=2, etcd=False, cuda=False, modules=None,
                         log_scheduler=True, log_worker=True, env=None, scheduler_args=None,
                         worker_args=None, worker_cpu=1):
        old_not_errors = gevent.hub.Hub.NOT_ERROR
        gevent.hub.Hub.NOT_ERROR = (Exception,)

        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        append_args = []
        append_args_scheduler = scheduler_args or []
        append_args_worker = worker_args or []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])

        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = f'etcd://127.0.0.1:{etcd_port}'
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])

        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args_scheduler += ['-Dscheduler.dump_graph_data=true']

        proc_env = os.environ.copy()
        if env:
            proc_env.update(env)

        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '-p', p,
                              '--log-level', 'debug' if log_scheduler else 'warning',
                              '--log-format', f'SCH{idx} %(asctime)-15s %(message)s',
                              '-Dscheduler.retry_delay=5',
                              '-Dscheduler.default_cpu_usage=0',
                              '-Dscheduler.status_timeout=10']
                             + append_args + append_args_scheduler, env=proc_env)
            for idx, p in enumerate(scheduler_ports)]
        cuda_count = resource.cuda_count()
        cuda_devices = [int(d) for d in os.environ['CUDA_VISIBLE_DEVICES'].split(',')] \
            if os.environ.get('CUDA_VISIBLE_DEVICES') else list(range(cuda_count))
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', str(worker_cpu),
                              '--log-level', 'debug' if log_worker else 'warning',
                              '--log-format', f'WOR{idx} %(asctime)-15s %(message)s',
                              '--cache-mem', '16m',
                              '--ignore-avail-mem',
                              '--cuda-device', str(cuda_devices[idx % cuda_count]) if cuda_count else '',
                              '-Dworker.prepare_data_timeout=30']
                             + append_args + append_args_worker, env=proc_env)
            for idx in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            SchedulerClusterInfoActor.default_uid(), address=self.scheduler_endpoints[0])

        check_time = time.time()
        while True:
            try:
                try:
                    started_schedulers = self.cluster_info.get_schedulers()
                except Exception as e:
                    raise ProcessRequirementUnmetError(f'Failed to get scheduler count: {e}')
                if len(started_schedulers) < n_schedulers:
                    raise ProcessRequirementUnmetError(
                        f'Schedulers do not meet requirement: {len(started_schedulers)} < {n_schedulers}.')
                actor_address = self.cluster_info.get_scheduler(SessionManagerActor.default_uid())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_uid(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(ResourceActor.default_uid())
                resource_ref = actor_client.actor_ref(ResourceActor.default_uid(), address=actor_address)

                if not actor_client.has_actor(self.session_manager_ref) \
                        or resource_ref.get_worker_count() < n_workers:
                    raise ProcessRequirementUnmetError(
                        f'Workers do not meet requirement: {resource_ref.get_worker_count()} < {n_workers}')
                break
            except:  # noqa: E722
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)

        gevent.hub.Hub.NOT_ERROR = old_not_errors