Beispiel #1
0
    def testDaemon(self):
        """Check that the worker daemon can kill an actor's process and recover.

        A sleeper actor is created through the daemon, its process is killed
        while a sleep call is in flight, the process is restarted, and the
        pending call is expected to surface ``WorkerProcessStopped``. A second
        sleep call is then expected to complete without error.
        """
        pool_address = '127.0.0.1:%d' % get_next_port()
        pool_options = dict(
            n_process=2,
            backend='gevent',
            distributor=MarsDistributor(2, 'w:0:'),
            address=pool_address,
        )
        with create_actor_pool(**pool_options) as pool:
            # Actors created directly on the pool.
            daemon = pool.create_actor(
                WorkerDaemonActor, uid=WorkerDaemonActor.default_name())
            pool.create_actor(DispatchActor, uid=DispatchActor.default_name())

            # Actors created through the daemon, both with 'w:1:' uids.
            sleeper = daemon.create_actor(
                DaemonSleeperActor, uid='w:1:DaemonSleeperActor')
            daemon.create_actor(ProcessHelperActor, uid='w:1:ProcHelper')

            tester = pool.create_actor(DaemonTestActor)
            daemon.register_callback(tester, 'handle_process_down')

            # Kick off a 10-second sleep and confirm the process is reported alive.
            tester.run_test_sleep(sleeper, 10, _tell=True)
            self.assertTrue(daemon.is_actor_process_alive(sleeper))

            pool.sleep(0.5)

            # Kill twice: the second kill must be a harmless no-op.
            for _ in range(2):
                daemon.kill_actor_process(sleeper)
            self.assertFalse(daemon.is_actor_process_alive(sleeper))

            # Bring process 1 back and notify the daemon that it went down.
            pool.restart_process(1)
            daemon.handle_process_down([1])
            pool.sleep(1)
            self.assertTrue(pool.has_actor(sleeper))
            with self.assertRaises(WorkerProcessStopped):
                tester.get_result()

            # A fresh, short sleep is expected to finish normally.
            tester.run_test_sleep(sleeper, 1)
            pool.sleep(1.5)
            tester.get_result()
Beispiel #2
0
    def testWorkerProcessRestart(self):
        """Check that a killed calc process of a real worker comes back alive.

        Starts a local actor pool with the scheduler-side actors the worker
        needs, launches ``mars.worker`` as a subprocess, kills the process
        hosting the first CPU slot through the worker's daemon actor, and
        polls until the daemon reports that process alive again.

        Raises:
            TimeoutError: if the process is not reported alive within
                10 seconds after being killed.
        """
        mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
        # Bind ``proc`` before the ``try`` so the ``finally`` clause can tell
        # "worker never started" apart from "worker still running". Without
        # this, a failure before ``Popen`` raised UnboundLocalError in the
        # cleanup code and masked the real error.
        proc = None
        try:
            with create_actor_pool(n_process=1,
                                   backend='gevent',
                                   address=mock_scheduler_addr) as pool:
                pool.create_actor(SchedulerClusterInfoActor,
                                  schedulers=[mock_scheduler_addr],
                                  uid=SchedulerClusterInfoActor.default_name())

                pool.create_actor(ChunkMetaActor,
                                  uid=ChunkMetaActor.default_name())
                resource_ref = pool.create_actor(
                    ResourceActor, uid=ResourceActor.default_name())

                proc = subprocess.Popen([
                    sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                    '--schedulers', mock_scheduler_addr, '--cpu-procs', '1',
                    '--cache-mem', '10m', '--spill-dir', self._spill_dir,
                    '--ignore-avail-mem'
                ])
                worker_endpoint = self._wait_worker_ready(proc, resource_ref)

                daemon_ref = pool.actor_ref(WorkerDaemonActor.default_name(),
                                            address=worker_endpoint)
                dispatch_ref = pool.actor_ref(DispatchActor.default_name(),
                                              address=worker_endpoint)
                cpu_slots = dispatch_ref.get_slots('cpu')
                calc_ref = pool.actor_ref(cpu_slots[0],
                                          address=worker_endpoint)
                daemon_ref.kill_actor_process(calc_ref)

                # Poll until the daemon reports the killed process alive again.
                check_start = time.time()
                while not daemon_ref.is_actor_process_alive(calc_ref):
                    gevent.sleep(0.1)
                    if time.time() - check_start > 10:
                        raise TimeoutError('Check process restart timeout')
        finally:
            if proc is not None and proc.poll() is None:
                # Ask the worker to stop, wait up to 5 seconds for it to
                # exit, then kill it if it is still running.
                proc.send_signal(signal.SIGINT)
                check_time = time.time()
                while True:
                    time.sleep(0.1)
                    if proc.poll() is not None \
                            or time.time() - check_time >= 5:
                        break
                if proc.poll() is None:
                    proc.kill()
            # Remove the plasma socket file even when the worker never started.
            if os.path.exists(options.worker.plasma_socket):
                os.unlink(options.worker.plasma_socket)
Beispiel #3
0
 def post_create(self):
     """Register this actor's process with the worker daemon.

     Obtains a promise ref to the actor named by
     ``WorkerDaemonActor.default_name()`` and sends it this actor's ref
     together with the current process id.
     """
     # Chain to the parent's post_create hook. The previous code called
     # ``__init__()`` here, which re-ran construction on an already-built
     # actor and skipped the parent's post-create step.
     super(DaemonSleeperActor, self).post_create()
     self._daemon_ref = self.promise_ref(WorkerDaemonActor.default_name())
     self._daemon_ref.register_process(self.ref(), os.getpid(), _tell=True)