Esempio n. 1
0
 def handle_register_function_result(self,
                                     request: network_pb2.FunctionResult):
     """Record one worker's result for the function currently being collected.

     The deserialized payload is stored in ``self.func_result.results`` under
     ``request.world_rank``; the ``remaining`` counter is decremented and the
     ``done`` flag is set once it reaches zero.

     Args:
         request: message providing ``func_id``, ``world_rank`` and the
             serialized ``result`` payload.

     Returns:
         A ``network_pb2.Empty`` reply.

     Raises:
         AssertionError: if ``request.func_id`` does not match the id of the
             function whose results are being collected.
     """
     # Take one snapshot so the lock we hold and the state we mutate always
     # belong to the same object.
     pending = self.func_result
     with pending.lock:
         # Explicit raise instead of ``assert``: asserts are removed under
         # ``python -O``, which would let a mismatched result be recorded
         # silently. AssertionError is kept so callers see the same type.
         if pending.function_id != request.func_id:
             raise AssertionError(
                 "function id mismatch: expected {}, got {}".format(
                     pending.function_id, request.func_id))
         # NOTE(security): cloudpickle.loads can execute arbitrary code while
         # unpickling; this is only safe when the sender is trusted.
         pending.results[request.world_rank] = cloudpickle.loads(
             request.result)
         pending.remaining -= 1
         if pending.remaining == 0:
             pending.done.set()
     return network_pb2.Empty()
Esempio n. 2
0
    def _reset(self):
        """Tear down what this object started and clear the related attributes.

        In order: send a best-effort Stop to every worker that has a stub,
        stop the mpirun forwarding thread, wait for (and if needed terminate)
        mpirun through the check thread, stop the server, reset the function
        bookkeeping, and remove the placement group.
        """
        # Best-effort Stop request to each worker that exposes a stub.
        if self.workers:
            stop_request = network_pb2.Empty()
            for worker in self.workers:
                worker_stub = getattr(worker, "stub", None)
                if worker_stub is None:
                    # Covers both a missing ``stub`` attribute and stub=None.
                    continue
                try:
                    worker_stub.Stop(stop_request)
                except Exception:
                    # Deliberately ignored so one failing worker does not
                    # prevent the rest of the teardown.
                    pass

        # Stop the mpirun-related threads and the mpirun process.
        forward_thread = self.mpirun_forward_thread
        if forward_thread is not None:
            forward_thread.stop()
            self.mpirun_forward_thread = None

        check_thread = self.mpirun_check_thread
        if check_thread is not None:
            # A thread cannot join itself, so skip this part when _reset is
            # running on the check thread.
            if threading.current_thread().ident != check_thread.ident:
                check_thread.join(1)
                if check_thread.is_alive():
                    try:
                        # Send SIGTERM to the process group of mpirun.
                        mpirun_pgid = os.getpgid(self.mpirun_proc.pid)
                        os.killpg(mpirun_pgid, signal.SIGTERM)
                    except ProcessLookupError:
                        # The process has already exited.
                        pass
                    check_thread.stop()
                    self.mpirun_check_thread = None
        self.mpirun_proc = None

        # One empty slot per rank.
        self.workers = [None] * self.world_size

        if self.server:
            self.server.stop(None)
            self.server.wait_for_termination(self.timeout)
            del self.server
            self.server = None

        self.func_id = 0
        with self.lock:
            pending = self.func_result
            if pending:
                # Set the done flag before dropping the pending result.
                pending.done.set()
                self.func_result = None
        self.started = False

        if self.peers:
            self.peers = None

        placement_group = self.pg
        if placement_group:
            ray.util.remove_placement_group(placement_group)
            self.pg = None
Esempio n. 3
0
 def handle_stop(self, request: network_pb2.Empty):
     """Set the ``should_stop`` flag and reply with an empty message.

     Args:
         request: incoming message; its content is not read.

     Returns:
         A ``network_pb2.Empty`` reply.
     """
     stop_flag = self.should_stop
     stop_flag.set()
     reply = network_pb2.Empty()
     return reply
Esempio n. 4
0
 def handle_run_command(self, func: network_pb2.Function):
     """Queue a function for execution, tagged with the next expected id.

     Args:
         func: the function message to enqueue.

     Returns:
         A ``network_pb2.Empty`` reply.
     """
     # Pair the function with the current id, then advance the counter.
     task = (self.expected_func_id, func)
     self.task_queue.put(task)
     self.expected_func_id += 1
     reply = network_pb2.Empty()
     return reply