def __init__(self, args):
    logger.debug('Create backend {}'.format(args['name']))
    self.name = args['name']
    self.input_tensor_queue = queue.Queue(maxsize=MAX_SESSION_SLOT_NUM)
    self.batched_tensor_queue = queue.Queue(maxsize=MAX_BATCHED_TENSOR_NUM)
    self.output_tensor_queue = queue.Queue(maxsize=MAX_SESSION_SLOT_NUM)
    self.dynamic_batch = args.get('dynamic_batch')
    self.duplicate_num = args.get('duplicate_num')
    self.model_path = args['path']
    self.model_type = args.get('model_type')
    self.max_batch_size = args.get('max_batch_size')
    self.timeout = args.get('timeout')
    self.use_mps = args.get('use_mps')
    self.metric_q = args.get('metric_queue')
    self.tags = {'model': self.name}
    self.threads = {}  # all threads
    self.io_queues = []  # io queues of the request handlers
    self.io_queue_lock = threading.Lock()  # lock for creating request handlers

    # fill in defaults before the shared memory pool is sized; otherwise a
    # missing 'duplicate_num' would break `3 * self.duplicate_num` below
    self.use_mps = False if self.use_mps is None else self.use_mps
    self.dynamic_batch = True if self.dynamic_batch is None else self.dynamic_batch
    self.max_batch_size = 32 if self.max_batch_size is None else self.max_batch_size
    self.duplicate_num = 1 if self.duplicate_num is None else self.duplicate_num
    self.adapt = False
    if self.timeout is None:
        self.timeout = 0.01
        self.adapt = True

    # input shared memory
    self.input_shm_name_set = []  # shared memory names for concat and inference
    self.input_shm_set = []       # shared memory for concat and inference
    self.input_shm_queue = mp.Queue(maxsize=3 * self.duplicate_num)
    self.input_info = args.get('input_info')

    # create a set of input shared memory segments
    for idx in range(3 * self.duplicate_num):
        input_shm_name = []
        input_shm = []
        for info in self.input_info:
            shm_name = gen_name(info['name'], suffix=idx)
            sh = ShmHandler(shm_name, info['max_shape'], info['dtype'])
            sh.create_shm()
            input_shm_name.append(shm_name)
            input_shm.append(sh)
        self.input_shm_name_set.append(input_shm_name)
        self.input_shm_set.append(input_shm)
        self.input_shm_queue.put(idx)

    # output shared memory info
    self.output_info = args.get('output_info')
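# The ShmHandler used throughout this section is not shown here. The sketch
# below is a minimal, hypothetical stand-in built on multiprocessing.shared_memory,
# assuming create_shm()/load_shm() allocate vs. attach to a named segment,
# ndarray(shape) views the buffer as a numpy array, and close() detaches. It only
# illustrates the interface the backend relies on; the real class may differ.
import numpy as np
from multiprocessing import shared_memory


class ShmHandlerSketch:
    def __init__(self, name, max_shape, dtype):
        self.name = name
        self.dtype = np.dtype(dtype)
        self.size = int(np.prod(max_shape)) * self.dtype.itemsize
        self.shm = None

    def create_shm(self):
        # owner side: allocate a named segment large enough for max_shape
        self.shm = shared_memory.SharedMemory(name=self.name, create=True,
                                              size=self.size)

    def load_shm(self):
        # consumer side: attach to an existing segment by name
        self.shm = shared_memory.SharedMemory(name=self.name)

    def ndarray(self, shape):
        # zero-copy numpy view of the actual (possibly smaller) shape
        return np.ndarray(shape, dtype=self.dtype, buffer=self.shm.buf)

    def close(self):
        self.shm.close()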
def get_tensor_info_from_session(self, conn, create):
    # receive tensor descriptions from the session, then either create the
    # matching shared memory segments or attach to ones the session created
    tensor_info = conn.recv()
    shm_list = []
    for info in tensor_info:
        sh = ShmHandler(info['shm_name'], info['max_shape'], info['dtype'])
        if create:
            sh.create_shm()
        else:
            sh.load_shm()
        shm_list.append(sh)
    conn.send(True)
    return shm_list
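# Hypothetical sketch of how the handshake above is expected to pair with the
# session side shown below: the session creates the per-session input segments,
# so the backend attaches to them, and the backend owns the output segments the
# session later attaches to. The function name and call pattern here are
# illustrative assumptions, not taken verbatim from this section.
def session_handshake_sketch(backend, conn):
    # session sent input_info first; those segments already exist, so attach
    input_shm = backend.get_tensor_info_from_session(conn, create=False)
    # session sends output_info next; the backend allocates those segments
    output_shm = backend.get_tensor_info_from_session(conn, create=True)
    return input_shm, output_shm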
def __init__(
    self,
    name: str,
    path: str,
    input_info: list,
    output_info: list,
    dynamic_batch: bool = True,
    duplicate_num: int = 1,
    model_type: str = 'torch',
    max_batch_size: int = 32,
    metric: bool = False,
    timeout=None,
):
    global inference_context_queue
    assert inference_context_queue
    logger.debug('Session started.')

    # 1. backend params
    self.name = name
    backend_args = {
        'name': name,
        'path': path,
        'dynamic_batch': dynamic_batch,
        'duplicate_num': duplicate_num,
        'model_type': model_type,
        'max_batch_size': max_batch_size,
        'timeout': timeout,
        'input_info': input_info,
        'output_info': output_info,
    }

    # 2. build connection with backend
    self.conn_s, self.conn_c = mp.Pipe()
    inference_context_queue.put((self.conn_c, backend_args))
    stat = self.conn_s.recv()
    assert stat is True

    # 3. share memory with backend
    self.input_shm = []
    self.output_shm = []

    def _shm_info(tensor_infos):
        for info in tensor_infos:
            assert info.get('name')
            assert info.get('max_shape')
            assert info.get('dtype')
            info['shm_name'] = gen_name(info['name'])
            info['shm_size'] = \
                functools.reduce(operator.mul, info.get('max_shape')) * \
                np.dtype(info.get('dtype')).itemsize

    # create input shm (owned by the session) and send its info to the backend
    _shm_info(input_info)
    for info in input_info:
        sh = ShmHandler(info['shm_name'], info['max_shape'], info['dtype'])
        sh.create_shm()
        self.input_shm.append(sh)
    self.conn_s.send(input_info)
    assert self.conn_s.recv()

    # load output shm (owned by the backend)
    _shm_info(output_info)
    self.conn_s.send(output_info)
    assert self.conn_s.recv()
    for info in output_info:
        sh = ShmHandler(info['shm_name'], info['max_shape'], info['dtype'])
        sh.load_shm()
        self.output_shm.append(sh)
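# A minimal usage sketch, assuming the constructor above belongs to a Session
# class and that input_info/output_info entries carry 'name', 'max_shape' and
# 'dtype' as asserted in _shm_info. The model path, tensor names and shapes
# below are made up for illustration.
input_info = [
    {'name': 'input_ids', 'max_shape': (32, 128), 'dtype': 'int64'},
]
output_info = [
    {'name': 'logits', 'max_shape': (32, 2), 'dtype': 'float32'},
]
session = Session(
    name='text_classifier',
    path='/path/to/model.pt',  # hypothetical path
    input_info=input_info,
    output_info=output_info,
    model_type='torch',
    max_batch_size=32,
)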
def mps_model_handler(self, idx):
    try:
        # 1. create backend model process
        conn_backend, conn_model = mp.Pipe()
        proc = mp.Process(target=model_process,
                          args=(self.name, self.model_type, self.model_path,
                                self.input_shm_queue, conn_model,
                                self.input_info, self.output_info, idx,
                                self.metric_q))
        proc.start()

        # 2. create shared memory
        conn_backend.send(self.input_shm_name_set)
        output_shm_name = conn_backend.recv()
        output_shm = []
        for shm_name, info in zip(output_shm_name, self.output_info):
            sh = ShmHandler(shm_name, info['max_shape'], info['dtype'])
            sh.load_shm()
            output_shm.append(sh)
    except Exception:
        logger.error('mps_model_handler initialize error')
        logger.error(traceback.format_exc())
        return

    def health_check():
        while True:
            sleep(5)
            tag = {'model_handler_name': '{}_{}'.format(self.name, idx)}
            if proc.is_alive():
                self.emit_metric({'model_handler_health_value': 1}, tag=tag)
            else:
                self.emit_metric({'model_handler_health_value': 0}, tag=tag)

    health_thread = threading.Thread(target=health_check, daemon=True)
    health_thread.start()

    # 3. inference
    while self.alive:
        start_ts = time()
        try:
            shm_idx, shapes, batch_index, batch_q_ts = \
                self.batched_tensor_queue.get(timeout=1)
        except queue.Empty:
            continue
        except Exception:
            logger.error('mps_model_handler error')
            logger.error(traceback.format_exc())
            continue  # nothing was dequeued, so there is nothing to forward

        batch_output = []
        try:
            model_start_ts = time()
            conn_backend.send((shm_idx, shapes))
            shapes = conn_backend.recv()
            self.emit_metric(
                {'backend_forward_model_cost': time() - model_start_ts})
            for shape, sh in zip(shapes, output_shm):
                shm_arr = sh.ndarray(shape)
                output_arr = np.empty(shape, shm_arr.dtype)
                output_arr[:] = shm_arr[:]
                batch_output.append(output_arr)

            fwd_cost = time() - start_ts
            self.emit_metric({'backend_forward_cost': fwd_cost})
            if self.adapt:
                # pull the batching timeout toward the measured forward cost
                delta = fwd_cost / (0.5 + self.duplicate_num) - self.timeout
                if abs(delta) / self.timeout > 0.2:
                    self.io_queue_lock.acquire()
                    self.timeout = self.timeout * 0.8 + (self.timeout + delta) * 0.2
                    self.io_queue_lock.release()
        except Exception:
            logger.error('mps_model_handler error')
            logger.error(traceback.format_exc())
            self.emit_metric({'mps_model_handler_error_counter': 1})
        finally:
            self.output_tensor_queue.put((batch_output, batch_index))

    # 4. clean
    conn_backend.send(EXIT_SIG)
    stat = conn_backend.recv()
    for sh in output_shm:
        sh.close()
    conn_backend.send(True)
    proc.join()
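# The adaptive timeout above is an exponential moving average pulled toward the
# per-replica forward cost. A standalone restatement of that update with
# hypothetical names, only to make the formula easier to follow:
def adapt_timeout(timeout, fwd_cost, duplicate_num):
    # target batching window per replica; the 0.5 biases the window to be
    # somewhat shorter than one full forward per duplicated worker
    delta = fwd_cost / (0.5 + duplicate_num) - timeout
    if abs(delta) / timeout > 0.2:  # only react to deviations above 20%
        timeout = timeout * 0.8 + (timeout + delta) * 0.2
    return timeout

# e.g. with timeout=0.01 s, a 60 ms forward at duplicate_num=1 widens the
# batching window: adapt_timeout(0.01, 0.06, 1) -> 0.016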
def model_process(model_name, model_type, model_path, shm_queue, conn,
                  input_info, output_info, pid, metric_q):
    try:
        # 1. init model
        if model_type == 'mock':
            from SimpleDBI.mock_model import MockModel
            model = MockModel(model_name, model_path)
        elif model_type == 'torch':
            from SimpleDBI.torch_model import TorchModel
            model = TorchModel(model_name, model_path)
        elif model_type == 'tf':
            from SimpleDBI.tf_model import TFModel
            model = TFModel(model_name, model_path, input_info, output_info)
        elif model_type == 'tensorrt':
            from SimpleDBI.tensorrt_model import TensorRTModel
            model = TensorRTModel(model_name, model_path)
        elif model_type == 'onnx2trt':
            from SimpleDBI.onnx2trt_model import TensorRTModel
            model = TensorRTModel(model_name, model_path)
        else:
            logger.error('ERROR MODEL TYPE : {}'.format(model_type))
            raise RuntimeError('ERROR MODEL TYPE : {}'.format(model_type))

        # 2. create shared memory
        # 2.1 create output shared memory
        output_shm_name = []
        output_shm = []
        for info in output_info:
            shm_name = gen_name(info['name'])
            sh = ShmHandler(shm_name, info['max_shape'], info['dtype'])
            sh.create_shm()
            output_shm_name.append(shm_name)
            output_shm.append(sh)

        # 2.2 load input shared memory
        input_shm_name_list = conn.recv()
        input_shm_list = []
        for input_shm_name in input_shm_name_list:
            input_shm = []
            for shm_name, info in zip(input_shm_name, input_info):
                sh = ShmHandler(shm_name, info['max_shape'], info['dtype'])
                sh.load_shm()
                input_shm.append(sh)
            input_shm_list.append(input_shm)
        conn.send(output_shm_name)
    except Exception:
        logger.error('model_process initialize error')
        logger.error(traceback.format_exc())
        return

    logger.info('model_process <{}> initialize done'.format(model_name))
    tags = {'model': '{}_{}'.format(model_name, pid)}

    # 3. inference
    while True:
        value = conn.recv()
        if value == EXIT_SIG:
            break
        shm_idx, input_shapes = value
        inputs = []
        output_shapes = []
        try:
            ts = time()
            # 3.1 load input
            input_shm = input_shm_list[shm_idx]
            for shape, sh in zip(input_shapes, input_shm):
                shm_arr = sh.ndarray(shape)
                inputs.append(shm_arr)

            # 3.2 forward
            outputs = model.forward(*inputs)

            # 3.3 write output
            for output, sh in zip(outputs, output_shm):
                shape = output.shape
                shm_arr = sh.ndarray(shape)
                shm_arr[:] = output[:]
                output_shapes.append(shape)

            if metric_q is not None:
                metric_q.put({
                    "tags": tags,
                    "fields": {
                        'model_proc_cost': time() - ts
                    },
                })
        except Exception:
            logger.error('model_process runtime error')
            logger.error(traceback.format_exc())
        finally:
            conn.send(output_shapes)
            shm_queue.put(shm_idx)  # return the shared memory slot to the available queue

    # 4. clean
    try:
        for input_shm in input_shm_list:
            for sh in input_shm:
                sh.close()
        conn.send(True)
        stat = conn.recv()
        assert stat
        for sh in output_shm:
            sh.close()
        conn.close()
    except Exception:
        logger.error('model_process destructor error')
        logger.error(traceback.format_exc())
    logger.error('Model process exit.')
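# model_process only assumes the model object exposes forward(*inputs) and
# returns a sequence of numpy arrays, one per entry in output_info. A minimal,
# hypothetical mock with that contract (not the real SimpleDBI.mock_model),
# shown only to document the expected interface:
import numpy as np


class EchoMockModel:
    def __init__(self, name, path):
        self.name = name
        self.path = path  # unused by the mock

    def forward(self, *inputs):
        # pretend-inference: one float32 array per input, same shape
        return [np.asarray(x, dtype=np.float32) for x in inputs]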