Beispiel #1
0
    def testOperandPrepush(self):
        """Start both input operands and wait for the middle operand to finish.

        The raw execution ref lookup and the worker assignment lookup are
        patched with stubs for the duration of the check.  The middle operand
        is polled every 0.1s and the test raises ``TimeoutError`` if it has
        not reached ``OperandState.FINISHED`` after 30 seconds.
        """
        session_id = str(uuid.uuid4())
        graph_key = str(uuid.uuid4())
        mock_workers = ['localhost:12345']

        with self._prepare_test_graph(
                session_id, graph_key, mock_workers) as (pool, graph_ref):
            op_keys = self._filter_graph_level_op_keys(graph_ref)
            input_op_keys, mid_op_key, _output_op_keys = op_keys
            fake_exec_ref = pool.create_actor(FakeExecutionActor, 0.5)

            input_refs = []
            for op_key in input_op_keys:
                input_uid = OperandActor.gen_uid(session_id, op_key)
                input_refs.append(pool.actor_ref(input_uid))
            mid_uid = OperandActor.gen_uid(session_id, mid_op_key)
            mid_ref = pool.actor_ref(mid_uid)

            def _stub_raw_execution_ref(*_, **__):
                # every operand talks to the single fake execution actor
                return fake_exec_ref

            def _stub_worker_assignments(*_):
                # always assign to the mocked worker list
                return mock_workers

            with patch_method(OperandActor._get_raw_execution_ref,
                              new=_stub_raw_execution_ref):
                with patch_method(AssignerActor.get_worker_assignments,
                                  new=_stub_worker_assignments):
                    input_refs[0].start_operand(OperandState.READY)
                    input_refs[1].start_operand(OperandState.READY)

                    poll_started = time.time()
                    # submission without pre-push will fail
                    while mid_ref.get_state() != OperandState.FINISHED:
                        pool.sleep(0.1)
                        waited = time.time() - poll_started
                        if waited > 30:
                            raise TimeoutError(
                                'Check middle chunk state timed out.')
Beispiel #2
0
    def testExecuteWorker(self):
        """Run the worker test actor against a started worker process.

        The test actor is asked (fire-and-forget) to run against the worker
        endpoint; its reply is polled every 0.1s and ``TimeoutError`` is
        raised if no reply shows up within 20 seconds.
        """
        with self._start_worker_process() as (pool, worker_endpoint):
            tester = pool.create_actor(WorkerProcessTestActor)
            tester.run_test(worker_endpoint, _tell=True)

            wait_started = time.time()
            while True:
                if tester.get_reply():
                    break
                gevent.sleep(0.1)
                elapsed = time.time() - wait_started
                if elapsed > 20:
                    raise TimeoutError('Check reply timeout')
Beispiel #3
0
    def testExecuteCudaWorker(self):
        """Run the worker test actor on a CUDA-enabled worker process.

        The device id is the first entry of ``CUDA_VISIBLE_DEVICES``
        (defaulting to ``'0'``).  The reply of the test actor is polled every
        0.1s and ``TimeoutError`` is raised after 2000 seconds without one.
        """
        visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', '0')
        # only the first listed device is used
        dev_id = visible_devices.partition(',')[0]

        worker_ctx = self._start_worker_process(
            no_cuda=False, cuda_device=dev_id)
        with worker_ctx as (pool, worker_endpoint):
            tester = pool.create_actor(WorkerProcessTestActor)
            tester.run_test(worker_endpoint, calc_device='cuda', _tell=True)

            wait_started = time.time()
            while True:
                if tester.get_reply():
                    break
                gevent.sleep(0.1)
                elapsed = time.time() - wait_started
                if elapsed > 2000:
                    raise TimeoutError('Check reply timeout')
Beispiel #4
0
    def testWorkerProcessRestart(self):
        """Kill the process of the first CPU slot and wait until it is alive again.

        The daemon actor is asked to kill the process hosting the first
        ``'cpu'`` slot reported by the dispatch actor; liveness is then polled
        every 0.1s and ``TimeoutError`` is raised after 10 seconds.
        """
        with self._start_worker_process() as (pool, worker_endpoint):
            daemon_uid = WorkerDaemonActor.default_uid()
            daemon = pool.actor_ref(daemon_uid, address=worker_endpoint)
            dispatch_uid = DispatchActor.default_uid()
            dispatcher = pool.actor_ref(dispatch_uid, address=worker_endpoint)

            first_cpu_slot = dispatcher.get_slots('cpu')[0]
            victim = pool.actor_ref(first_cpu_slot, address=worker_endpoint)
            daemon.kill_actor_process(victim)

            kill_time = time.time()
            while True:
                if daemon.is_actor_process_alive(victim):
                    break
                gevent.sleep(0.1)
                elapsed = time.time() - kill_time
                if elapsed > 10:
                    raise TimeoutError('Check process restart timeout')
Beispiel #5
0
    def testWorkerProcessRestart(self):
        """Kill the process of the first CPU slot of a real worker and wait for it to come back.

        A mock scheduler pool is created locally and a ``mars.worker``
        subprocess is launched against it.  Once the worker is ready, the
        daemon actor is asked to kill the process hosting the first ``'cpu'``
        slot, and liveness is polled every 0.1s.

        Raises:
            TimeoutError: if the killed process is not reported alive again
                within 10 seconds.
        """
        mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
        # Bind ``proc`` before entering ``try``: if pool or actor creation
        # fails before the worker is spawned, the ``finally`` clause must not
        # raise UnboundLocalError and hide the original exception.
        proc = None
        try:
            with create_actor_pool(n_process=1,
                                   backend='gevent',
                                   address=mock_scheduler_addr) as pool:
                pool.create_actor(SchedulerClusterInfoActor,
                                  schedulers=[mock_scheduler_addr],
                                  uid=SchedulerClusterInfoActor.default_name())

                pool.create_actor(ChunkMetaActor,
                                  uid=ChunkMetaActor.default_name())
                resource_ref = pool.create_actor(
                    ResourceActor, uid=ResourceActor.default_name())

                proc = subprocess.Popen([
                    sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                    '--schedulers', mock_scheduler_addr, '--cpu-procs', '1',
                    '--cache-mem', '10m', '--spill-dir', self._spill_dir,
                    '--ignore-avail-mem'
                ])
                worker_endpoint = self._wait_worker_ready(proc, resource_ref)

                daemon_ref = pool.actor_ref(WorkerDaemonActor.default_name(),
                                            address=worker_endpoint)
                dispatch_ref = pool.actor_ref(DispatchActor.default_name(),
                                              address=worker_endpoint)
                cpu_slots = dispatch_ref.get_slots('cpu')
                calc_ref = pool.actor_ref(cpu_slots[0],
                                          address=worker_endpoint)
                daemon_ref.kill_actor_process(calc_ref)

                check_start = time.time()
                while not daemon_ref.is_actor_process_alive(calc_ref):
                    gevent.sleep(0.1)
                    if time.time() - check_start > 10:
                        raise TimeoutError('Check process restart timeout')
        finally:
            # Only try to stop the worker when it was actually started.
            if proc is not None and proc.poll() is None:
                # Ask politely first, then force-kill after a 5 second grace period.
                proc.send_signal(signal.SIGINT)
                check_time = time.time()
                while True:
                    time.sleep(0.1)
                    if proc.poll(
                    ) is not None or time.time() - check_time >= 5:
                        break
                if proc.poll() is None:
                    proc.kill()
            # Remove the plasma socket file if one is left behind.
            if os.path.exists(options.worker.plasma_socket):
                os.unlink(options.worker.plasma_socket)
Beispiel #6
0
 def waiter():
     """Block until the resource actor reports worker metadata, then record the workers.

     Polls ``resource_ref.get_workers_meta()`` every 0.1s.  Raises
     ``SystemError`` if the worker process has exited in the meantime and
     ``TimeoutError`` after 20 seconds without metadata.  On success the keys
     of the metadata mapping are appended to ``worker_ips``.
     """
     wait_started = time.time()
     while not resource_ref.get_workers_meta():
         gevent.sleep(0.1)
         # a finished process has a non-None return code
         if proc.poll() is not None:
             raise SystemError('Worker dead. exit code %s' % proc.poll())
         elapsed = time.time() - wait_started
         if elapsed > 20:
             raise TimeoutError('Check meta_timestamp timeout')
     workers_meta = resource_ref.get_workers_meta()
     worker_ips.extend(workers_meta.keys())
Beispiel #7
0
    def testExecuteWorker(self):
        """Run the worker test actor against a real ``mars.worker`` subprocess.

        A mock scheduler pool is created locally and a ``mars.worker``
        subprocess is launched against it.  Once the worker is ready, the
        test actor is asked (fire-and-forget) to run against the worker
        endpoint and its reply is polled every 0.1s.

        Raises:
            TimeoutError: if the test actor has not replied within 20 seconds.
        """
        mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
        # Bind ``proc`` before entering ``try``: if pool or actor creation
        # fails before the worker is spawned, the ``finally`` clause must not
        # raise UnboundLocalError and hide the original exception.
        proc = None
        try:
            with create_actor_pool(n_process=1,
                                   backend='gevent',
                                   address=mock_scheduler_addr) as pool:
                pool.create_actor(SchedulerClusterInfoActor,
                                  schedulers=[mock_scheduler_addr],
                                  uid=SchedulerClusterInfoActor.default_name())

                pool.create_actor(ChunkMetaActor,
                                  uid=ChunkMetaActor.default_name())
                resource_ref = pool.create_actor(
                    ResourceActor, uid=ResourceActor.default_name())

                proc = subprocess.Popen([
                    sys.executable, '-m', 'mars.worker', '-a', '127.0.0.1',
                    '--schedulers', mock_scheduler_addr, '--cpu-procs', '1',
                    '--cache-mem', '10m', '--spill-dir', self._spill_dir,
                    '--ignore-avail-mem'
                ])
                worker_endpoint = self._wait_worker_ready(proc, resource_ref)

                test_ref = pool.create_actor(WorkerProcessTestActor)
                test_ref.run_test(worker_endpoint, _tell=True)

                check_time = time.time()
                while not test_ref.get_reply():
                    gevent.sleep(0.1)
                    if time.time() - check_time > 20:
                        raise TimeoutError('Check reply timeout')
        finally:
            # Only try to stop the worker when it was actually started.
            if proc is not None and proc.poll() is None:
                # Ask politely first, then force-kill after a 5 second grace period.
                proc.send_signal(signal.SIGINT)
                check_time = time.time()
                while True:
                    time.sleep(0.1)
                    if proc.poll(
                    ) is not None or time.time() - check_time >= 5:
                        break
                if proc.poll() is None:
                    proc.kill()
            # Remove the plasma socket file if one is left behind.
            if os.path.exists(options.worker.plasma_socket):
                os.unlink(options.worker.plasma_socket)