def test_inference3d_in_proc(tiny_model_3d, log_queue):
    config = tiny_model_3d["config"]
    in_channels = config["input_channels"]
    model = TinyConvNet3d(in_channels=in_channels)
    handler_conn, inference_conn = mp.Pipe()
    p = mp.Process(
        target=run,
        kwargs={"conn": inference_conn, "model": model, "config": config, "log_queue": log_queue},
    )
    p.start()
    client = create_client(IInference, handler_conn)
    try:
        client.set_devices([torch.device("cpu")])
        f = []
        n = 10
        for i in range(n):
            data = TikTensor(torch.rand(in_channels, 15, 15, 15))
            f.append(client.forward(data))
        for i in range(n):
            f[i].result(timeout=10)
            print("received", i)
    finally:
        client.shutdown()
def test_training_in_proc(tiny_model_2d, log_queue):
    config = tiny_model_2d["config"]
    config["num_iterations_per_update"] = 10
    in_channels = config["input_channels"]
    model = TinyConvNet2d(in_channels=in_channels)
    handler_conn, training_conn = mp.Pipe()
    p = mp.Process(
        target=run,
        kwargs={"conn": training_conn, "model": model, "config": config, "log_queue": log_queue},
    )
    p.start()
    client = create_client(ITraining, handler_conn)
    try:
        client.set_devices([torch.device("cpu")])
        data = TikTensorBatch(
            [
                TikTensor(torch.zeros(in_channels, 15, 15), ((1,), (1,))),
                TikTensor(torch.ones(in_channels, 9, 9), ((2,), (2,))),
            ]
        )
        labels = TikTensorBatch(
            [
                TikTensor(torch.ones(in_channels, 15, 15, dtype=torch.uint8), ((1,), (1,))),
                TikTensor(torch.full((in_channels, 9, 9), 2, dtype=torch.uint8), ((2,), (2,))),
            ]
        )
        client.update_dataset("training", data, labels)
        client.resume_training()
    finally:
        client.shutdown()
def load_model(self, model: Model, state: ModelState, devices: list) -> RPCFuture[SetDeviceReturnType]:
    log_dir = model.config.get(LOGGING, {}).get(DIRECTORY, "")
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
        self.logger.info("log dir: %s", os.path.abspath(log_dir))

    self._start_logging_handler()

    incomplete_msg = get_error_msg_for_incomplete_config(model.config)
    if incomplete_msg:
        raise ValueError(incomplete_msg)

    # todo: move test_transforms elsewhere
    self.test_transforms = model.config.get(TESTING, {}).get(TRANSFORMS, {"Normalize": {}})

    if not devices:
        devices = ["cpu"]
    cuda_visible_devices, handler_devices = self.get_cuda_and_handler_device_names(devices)
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_visible_devices)
    self.logger.info("Set CUDA_VISIBLE_DEVICES to '%s'", os.environ["CUDA_VISIBLE_DEVICES"])

    server_conn, handler_conn = mp.Pipe()
    p = mp.Process(
        target=run_handler,
        name="Handler",
        kwargs={
            "conn": handler_conn,
            "config": model.config,
            "model_file": model.code,
            "model_state": state.model_state,
            "optimizer_state": state.optimizer_state,
            "log_queue": self.log_queue,
        },
    )
    try:
        p.start()
    except Exception as e:
        self.logger.error(e)
        err_fut = RPCFuture()
        err_fut.set_exception(e)
        return err_fut
    else:
        self.handler = create_client(IHandler, server_conn)
        try:
            tik_fut = self.handler.set_devices(handler_devices)
        except Exception as e:
            self.logger.exception("set_devices failed")
            err_fut = RPCFuture()
            err_fut.set_exception(e)
            return err_fut
        else:
            self.logger.info("got tik_fut")
            fut = tik_fut.map(convert_to_SetDeviceReturnType)
            self.logger.info("converted tik_fut")
            return fut
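# A hedged usage sketch (not from the source) of consuming the RPCFuture that
# load_model returns. `server`, `my_model`, and `my_state` are hypothetical
# stand-ins for the object exposing load_model and its Model/ModelState inputs.
fut = server.load_model(my_model, my_state, devices=["cpu"])
ret = fut.result(timeout=60)  # blocks until device setup resolves to a SetDeviceReturnType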
def _spawn(iface_cls, srv_cls):
    child, parent = mp.Pipe()
    p = mp.Process(target=_run_srv, args=(srv_cls, parent, log_queue))
    p.start()
    data["client"] = client = create_client(iface_cls, child)
    data["process"] = p
    return client
def start_model_session_process(
    model_zip: bytes, devices: List[str], log_queue: Optional[_mp.Queue] = None
) -> Tuple[_mp.Process, IRPCModelSession]:
    client_conn, server_conn = _mp.Pipe()
    proc = _mp.Process(
        target=_run_model_session_process,
        name="ModelSessionProcess",
        kwargs={"conn": server_conn, "devices": devices, "log_queue": log_queue, "model_zip": model_zip},
    )
    proc.start()
    return proc, _mp_rpc.create_client(IRPCModelSession, client_conn)
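# A hedged usage sketch (not from the source): start a session, use the client,
# then tear the child process down. `model_zip_bytes` is a hypothetical stand-in
# for the raw bytes of a zipped model; the real code may well expose a graceful
# shutdown RPC instead of terminate().
proc, client = start_model_session_process(model_zip=model_zip_bytes, devices=["cpu"])
try:
    ...  # issue IRPCModelSession calls through `client`
finally:
    proc.terminate()  # hard stop of the child process
    proc.join()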
def client(log_queue):
    child, parent = mp.Pipe()
    p = mp.Process(target=_srv, args=(parent, log_queue))
    p.start()
    client = create_client(ITestApi, child, timeout=10)
    yield client
    client.shutdown()
    p.join()
def client3d(tiny_model_3d, log_queue):
    client_conn, handler_conn = mp.Pipe()
    p = mp.Process(
        target=run_handler,
        name="Handler",
        kwargs={"conn": handler_conn, **tiny_model_3d, "log_queue": log_queue},
    )
    p.start()
    cl = create_client(IHandler, client_conn)
    cl.set_devices(["cpu"])
    yield cl
    cl.shutdown.async_().result(timeout=30)
    p.join(timeout=20)
def test_inference2d_in_proc(tiny_model_2d, log_queue):
    config = tiny_model_2d["config"]
    in_channels = config["input_channels"]
    model = TinyConvNet2d(in_channels=in_channels)
    handler_conn, inference_conn = mp.Pipe()
    p = mp.Process(
        target=run,
        kwargs={"conn": inference_conn, "model": model, "config": config, "log_queue": log_queue},
    )
    p.start()
    client = create_client(IInference, handler_conn)
    try:
        client.set_devices([torch.device("cpu")])
        data = TikTensor(torch.zeros(in_channels, 15, 15), (0,))
        f = client.forward(data)
        f.result(timeout=10)
    finally:
        client.shutdown()
def test_future_timeout(client: ITestApi, log_queue):
    child, parent = mp.Pipe()
    p = mp.Process(target=_srv, args=(parent, log_queue))
    p.start()
    client = create_client(ITestApi, child, timeout=0.001)

    # every call style respects the client's 0.001s default timeout
    with pytest.raises(TimeoutError):
        client.compute(1, 2)  # synchronous call

    with pytest.raises(TimeoutError):
        client.compute.async_(1, 2).result()  # explicit future

    with pytest.raises(TimeoutError):
        client.compute_fut(1, 2).result()  # method declared to return a future

    # a per-call timeout overrides the client default
    client.compute.async_(1, 2).result(timeout=3)
    client.shutdown()
    p.join()
def test_race_condition(log_queue):
    # Wrap the connection so that send() lingers after transmitting: the server's
    # reply can then arrive before the request call returns, exercising the
    # client's handling of that ordering.
    class SlowConn:
        def __init__(self, conn):
            self._conn = conn

        def send(self, *args):
            self._conn.send(*args)
            # Block so the future is resolved before send() returns
            time.sleep(0.5)

        def __getattr__(self, name):
            return getattr(self._conn, name)

    child, parent = mp.Pipe()
    p = mp.Process(target=_srv, args=(parent, log_queue))
    p.start()
    client = create_client(ITestApi, SlowConn(child))
    client.fast_compute(2, 2)
    client.shutdown().result()
def __init__(
    self,
    config: dict,
    model_file: bytes,
    model_state: bytes,
    optimizer_state: bytes,
    log_queue: Optional[mp.Queue] = None,
) -> None:
    """
    :param config: configuration dict
    :param model_file: bytes of file describing the neural network model
    :param model_state: binarized model state dict
    :param optimizer_state: binarized optimizer state dict
    """
    assert model_file
    for required in [MODEL_CLASS_NAME]:
        if required not in config:
            raise ValueError(f"{required} missing in config")

    self.config = config

    self.shutdown_event = threading.Event()

    self.logger = logging.getLogger(__name__)
    self.logger.info("started")
    self.valid_shapes: Optional[List[Point]] = None
    self.shrinkage: Optional[Point] = None
    self.idle_devices: List[torch.device] = []
    self.training_devices: List[torch.device] = []
    self.inference_devices: List[torch.device] = []

    self.tempdir = tempfile.mkdtemp()
    user_module_name = "usermodel"
    with open(os.path.join(self.tempdir, user_module_name + ".py"), "wb") as f:
        f.write(model_file)

    sys.path.insert(0, self.tempdir)
    user_module = importlib.import_module(user_module_name)

    self.model: torch.nn.Module = getattr(user_module, self.config[MODEL_CLASS_NAME])(
        **self.config.get(MODEL_INIT_KWARGS, {})
    )
    self.logger.debug("created user model")

    if model_state:
        self.logger.debug("load model state")
        try:
            self.model.load_state_dict(torch.load(io.BytesIO(model_state), map_location="cpu"))
        except Exception as e:
            self.logger.exception(e)
        else:
            self.logger.info("restored model state")

    try:
        self.logger.debug("start dryrun process")
        handler2dryrun_conn, dryrun2handler_conn = mp.Pipe()
        self._dry_run_proc = mp.Process(
            name="DryRun",
            target=run_dryrun,
            kwargs={"conn": dryrun2handler_conn, "config": config, "model": self.model, "log_queue": log_queue},
        )
        self._dry_run_proc.start()
        self._dry_run: IDryRun = create_client(IDryRun, handler2dryrun_conn)

        self.logger.debug("start training process")
        handler2training_conn, training2handler_conn = mp.Pipe()
        self._training_proc = mp.Process(
            target=run_training,
            name="Training",
            kwargs={
                "conn": training2handler_conn,
                "config": config,
                "model": self.model,
                "optimizer_state": optimizer_state,
                "log_queue": log_queue,
            },
        )
        self._training_proc.start()
        self._training: ITraining = create_client(ITraining, handler2training_conn)

        self.logger.debug("start inference process")
        handler2inference_conn, inference2handler_conn = mp.Pipe()
        self._inference_proc = mp.Process(
            target=run_inference,
            name="Inference",
            kwargs={"conn": inference2handler_conn, "config": config, "model": self.model, "log_queue": log_queue},
        )
        self._inference_proc.start()
        self._inference: IInference = create_client(IInference, handler2inference_conn)

        # start device setter thread that will wait for dry run processes to finish
        self.new_device_names: queue.Queue = queue.Queue()
        self.device_setter_thread = threading.Thread(
            target=add_logger(self.logger)(self._device_setter_worker), name="DeviceSetter"
        )
        self.device_setter_thread.start()
    except Exception as e:
        self.logger.exception(e)
        self.shutdown()
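# The pattern every snippet in this section repeats, distilled into a sketch.
# `IFace` and `run_server` are hypothetical stand-ins for the concrete interface
# classes (IInference, ITraining, IHandler, ...) and their run_* entry points;
# create_client and log_queue are as above.
import multiprocessing as mp

parent_conn, child_conn = mp.Pipe()  # duplex channel between parent and child
proc = mp.Process(target=run_server, kwargs={"conn": child_conn, "log_queue": log_queue})
proc.start()
client = create_client(IFace, parent_conn)  # typed proxy: method calls become pipe messages
try:
    ...  # methods declared on IFace return plain values or RPCFutures
finally:
    client.shutdown()  # ask the server loop to exit
    proc.join()  # reap the child process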