def _send_bls_sequence_requests(self, correlation_id):
    """Drive one complete BLS sequence against 'onnx_nobatch_sequence_int32'.

    Sends a request flagged SEQUENCE_START, ten unflagged requests and a
    final request flagged SEQUENCE_END, all carrying ``correlation_id``.
    After every step OUTPUT must equal the previous OUTPUT plus the current
    INPUT (for the start request: the INPUT itself).

    Any exception raised along the way, including a failed assertion, is
    handed to ``self.add_deferred_exception`` instead of propagating.

    Parameters
    ----------
    correlation_id
        Passed unchanged as ``correlation_id`` of every request.
    """
    model_name = 'onnx_nobatch_sequence_int32'
    try:
        # Start request
        input_tensor = pb_utils.Tensor('INPUT',
                                       np.array([1000], dtype=np.int32))
        infer_request = pb_utils.InferenceRequest(
            model_name=model_name,
            inputs=[input_tensor],
            requested_output_names=['OUTPUT'],
            flags=pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START,
            correlation_id=correlation_id)
        # assertEqual, not assertTrue: assertTrue would take the expected
        # flag as the failure *message* and pass for any non-zero flags.
        self.assertEqual(
            infer_request.flags(),
            pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START)
        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())
        output = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT')
        self.assertEqual(output.as_numpy()[0], input_tensor.as_numpy()[0])

        for i in range(10):
            input_tensor = pb_utils.Tensor('INPUT',
                                           np.array([i], dtype=np.int32))
            infer_request = pb_utils.InferenceRequest(
                model_name=model_name,
                inputs=[input_tensor],
                requested_output_names=['OUTPUT'],
                correlation_id=correlation_id)
            infer_response = infer_request.exec()
            self.assertFalse(infer_response.has_error())

            # The new output is the previous output + the current input
            expected_output = output.as_numpy()[0] + i
            output = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT')
            self.assertEqual(output.as_numpy()[0], expected_output)

        # Final request
        input_tensor = pb_utils.Tensor('INPUT',
                                       np.array([2000], dtype=np.int32))
        infer_request = pb_utils.InferenceRequest(
            model_name=model_name,
            inputs=[input_tensor],
            requested_output_names=['OUTPUT'],
            correlation_id=correlation_id)
        infer_request.set_flags(
            pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END)
        self.assertEqual(infer_request.flags(),
                         pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END)

        infer_response = infer_request.exec()
        self.assertFalse(infer_response.has_error())
        expected_output = output.as_numpy()[0] + input_tensor.as_numpy()[0]
        output = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT')
        self.assertEqual(output.as_numpy()[0], expected_output)
    except Exception as e:
        # Deliberately broad: failures are recorded for later reporting
        # rather than raised here.
        self.add_deferred_exception(e)
def test_bls_incorrect_args(self):
    """InferenceRequest must raise TypeError when an argument is left out."""
    # Each entry omits exactly one of model_name / inputs /
    # requested_output_names.
    incomplete_kwargs = (
        {
            'inputs': [],
            'requested_output_names': ['OUTPUT0', 'OUTPUT1'],
        },
        {
            'model_name': 'add_sub',
            'requested_output_names': ['OUTPUT0', 'OUTPUT1'],
        },
        {
            'model_name': 'add_sub',
            'inputs': [],
        },
    )
    for kwargs in incomplete_kwargs:
        with self.assertRaises(TypeError):
            pb_utils.InferenceRequest(**kwargs)
def bls_add_sub(_=None):
    """Run one BLS request against 'add_sub' and check both outputs.

    Two random float32 vectors of 16 elements are sent as INPUT0/INPUT1.
    The single parameter is ignored.

    Returns
    -------
    bool
        True when the response has no error, both outputs are present,
        OUTPUT0 equals INPUT0 + INPUT1 and OUTPUT1 equals INPUT0 - INPUT1
        element for element; False otherwise.
    """
    lhs_np = np.random.randn(16).astype(np.float32)
    rhs_np = np.random.randn(16).astype(np.float32)
    lhs = pb_utils.Tensor('INPUT0', lhs_np)
    rhs = pb_utils.Tensor('INPUT1', rhs_np)

    request = pb_utils.InferenceRequest(
        model_name='add_sub',
        inputs=[lhs, rhs],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    response = request.exec()
    if response.has_error():
        return False

    sum_out = pb_utils.get_output_tensor_by_name(response, 'OUTPUT0')
    diff_out = pb_utils.get_output_tensor_by_name(response, 'OUTPUT1')
    if output_missing := (sum_out is None or diff_out is None):
        return not output_missing

    expected_sum = lhs.as_numpy() + rhs.as_numpy()
    expected_diff = lhs.as_numpy() - rhs.as_numpy()

    if not np.all(expected_sum == sum_out.as_numpy()):
        return False
    return bool(np.all(expected_diff == diff_out.as_numpy()))
def test_bls_execute_error(self):
    """A BLS request to the 'execute_error' model must report an error."""
    # No inputs and no requested outputs are supplied on purpose.
    failing_request = pb_utils.InferenceRequest(
        requested_output_names=[],
        inputs=[],
        model_name='execute_error',
    )
    failing_response = failing_request.exec()
    self.assertTrue(failing_response.has_error())
def _get_gpu_bls_outputs(self, input0_pb, input1_pb):
    """
    This function is created to test that the DLPack container works
    properly when the inference response and outputs go out of scope.

    Sends ``input0_pb`` and ``input1_pb`` to the 'dlpack_add_sub' model and
    verifies output placement and DLPack reference counting.

    Returns a tuple ``(output0.to_dlpack(), output1.to_dlpack())`` when every
    check passes, and ``False`` as soon as any check fails (note: success is
    NOT signalled with ``True``).
    """
    infer_request = pb_utils.InferenceRequest(
        model_name='dlpack_add_sub',
        inputs=[input0_pb, input1_pb],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    infer_response = infer_request.exec()
    if infer_response.has_error():
        return False

    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
    if output0 is None or output1 is None:
        return False

    # When one of the inputs is in GPU the output returned by the model must
    # be in GPU, otherwise the outputs will be in CPU.
    if not input0_pb.is_cpu() or not input1_pb.is_cpu():
        if output0.is_cpu() or output1.is_cpu():
            return False
    else:
        if (not output0.is_cpu()) or (not output1.is_cpu()):
            return False

    # Make sure that the reference count is increased by one when DLPack
    # representation is created.
    # NOTE: the statement order below matters; each getrefcount() snapshot
    # is compared against the one taken just before/after the DLPack
    # objects are created or dropped.
    rc_before_dlpack_output0 = sys.getrefcount(output0)
    rc_before_dlpack_output1 = sys.getrefcount(output1)

    output0_dlpack = output0.to_dlpack()
    output1_dlpack = output1.to_dlpack()

    rc_after_dlpack_output0 = sys.getrefcount(output0)
    rc_after_dlpack_output1 = sys.getrefcount(output1)

    if rc_after_dlpack_output0 - rc_before_dlpack_output0 != 1:
        return False

    if rc_after_dlpack_output1 - rc_before_dlpack_output1 != 1:
        return False

    # Make sure that reference count decreases after destroying the DLPack
    # (rebinding the names to None drops the only reference to each capsule).
    output0_dlpack = None
    output1_dlpack = None
    rc_after_del_dlpack_output0 = sys.getrefcount(output0)
    rc_after_del_dlpack_output1 = sys.getrefcount(output1)
    if rc_after_del_dlpack_output0 - rc_after_dlpack_output0 != -1:
        return False

    if rc_after_del_dlpack_output1 - rc_after_dlpack_output1 != -1:
        return False

    # Fresh DLPack objects are handed out; output0/output1 and the response
    # themselves go out of scope when this function returns.
    return output0.to_dlpack(), output1.to_dlpack()
def test_bls_wrong_inputs(self):
    """Sending only INPUT0 to 'add_sub' must yield an error response."""
    lone_input = pb_utils.Tensor('INPUT0', np.random.randn(1, 16))
    request = pb_utils.InferenceRequest(
        requested_output_names=['OUTPUT0', 'OUTPUT1'],
        inputs=[lone_input],
        model_name='add_sub',
    )
    response = request.exec()
    self.assertTrue(response.has_error())
def _send_identity_tensor(self, size):
    """Send a random ``[1, size]`` tensor to the 'identity_fp32' model.

    Returns
    -------
    tuple
        ``(array, response)`` where ``array`` is the NumPy array as drawn
        from ``np.random.randn`` (i.e. before the float32 cast applied to
        the tensor that is actually sent) and ``response`` is the result
        of the synchronous BLS call.
    """
    random_np = np.random.randn(1, size)
    request = pb_utils.InferenceRequest(
        model_name='identity_fp32',
        inputs=[pb_utils.Tensor('INPUT0', random_np.astype(np.float32))],
        requested_output_names=['OUTPUT0'])
    response = request.exec()
    return random_np, response
def test_bls_execute_error(self):
    """A BLS request to 'execute_error' must fail with the expected message.

    Checks that the response carries an error, that the error message is
    exactly the one asserted below, and that no output tensors are
    attached to the failed response.
    """
    # Test BLS with a model that has an error during execution.
    infer_request = pb_utils.InferenceRequest(model_name='execute_error',
                                              inputs=[],
                                              requested_output_names=[])
    infer_response = infer_request.exec()
    self.assertTrue(infer_response.has_error())
    self.assertEqual(
        infer_response.error().message(),
        "expected 1 inputs but got 0 inputs for model 'execute_error'")
    # assertEqual reports the actual tensor count on failure, which
    # assertTrue(len(...) == 0) would hide.
    self.assertEqual(len(infer_response.output_tensors()), 0)
def test_bls_wrong_inputs(self):
    """Sending one input to 'add_sub' must fail with the expected message.

    Checks that the response carries an error, that the error message is
    exactly the one asserted below, and that no output tensors are
    attached to the failed response.
    """
    input0 = pb_utils.Tensor('INPUT0', np.random.randn(*[1, 16]))

    infer_request = pb_utils.InferenceRequest(
        model_name='add_sub',
        inputs=[input0],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    infer_response = infer_request.exec()
    self.assertTrue(infer_response.has_error())
    self.assertEqual(
        infer_response.error().message(),
        "expected 2 inputs but got 1 inputs for model 'add_sub'")
    # assertEqual reports the actual tensor count on failure, which
    # assertTrue(len(...) == 0) would hide.
    self.assertEqual(len(infer_response.output_tensors()), 0)
def test_zero_length_io(self):
    """A zero-element tensor must round-trip through 'identity_fp32'.

    Sends a float32 tensor of shape ``[1, 0]`` and checks that the BLS
    response has no error and that OUTPUT0 matches the input.
    """
    model_name = 'identity_fp32'
    input0 = np.zeros([1, 0], dtype=np.float32)
    input0_pb = pb_utils.Tensor('INPUT0', input0)
    infer_request = pb_utils.InferenceRequest(
        model_name=model_name,
        inputs=[input0_pb],
        requested_output_names=['OUTPUT0'])
    infer_response = infer_request.exec()
    self.assertFalse(infer_response.has_error())

    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    self.assertIsNotNone(output0)
    # Compare the NumPy contents of the output, not the pb_utils.Tensor
    # wrapper, and use array_equal so the shape is verified as well:
    # np.all() over an elementwise comparison of zero elements is True no
    # matter what was returned.
    self.assertTrue(np.array_equal(output0.as_numpy(), input0))
def test_bls_sync(self):
    """A failed BLS call must not prevent later BLS calls from succeeding."""
    # Target a model name that is not served: the response has to carry an
    # error.
    missing_model_response = pb_utils.InferenceRequest(
        requested_output_names=[],
        inputs=[],
        model_name='non_existent_model',
    ).exec()
    self.assertTrue(missing_model_response.has_error())

    # A regular add_sub round trip issued after the failure must still work.
    add_sub_ok = bls_add_sub()
    self.assertTrue(add_sub_ok)
def _get_gpu_bls_outputs(self, input0_pb, input1_pb):
    """
    This function is created to test that the DLPack container works
    properly when the inference response and outputs go out of scope.

    Sends ``input0_pb`` and ``input1_pb`` to the 'dlpack_add_sub' model,
    asserts on output placement and DLPack reference counting, and returns
    the tuple ``(output0.to_dlpack(), output1.to_dlpack())``. Failed checks
    are reported through the ``self.assert*`` methods.
    """
    infer_request = pb_utils.InferenceRequest(
        model_name='dlpack_add_sub',
        inputs=[input0_pb, input1_pb],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    infer_response = infer_request.exec()
    self.assertFalse(infer_response.has_error())

    output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0')
    output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1')
    self.assertIsNotNone(output0)
    self.assertIsNotNone(output1)

    # When one of the inputs is in GPU the output returned by the model must
    # be in GPU, otherwise the outputs will be in CPU.
    if not input0_pb.is_cpu() or not input1_pb.is_cpu():
        self.assertTrue((not output0.is_cpu()) and (not output1.is_cpu()))
    else:
        self.assertTrue((output0.is_cpu()) and (output1.is_cpu()))

    # Make sure that the reference count is increased by one when DLPack
    # representation is created.
    # NOTE: the statement order below matters; each getrefcount() snapshot
    # is compared against the one taken just before/after the DLPack
    # objects are created or dropped.
    rc_before_dlpack_output0 = sys.getrefcount(output0)
    rc_before_dlpack_output1 = sys.getrefcount(output1)

    output0_dlpack = output0.to_dlpack()
    output1_dlpack = output1.to_dlpack()

    rc_after_dlpack_output0 = sys.getrefcount(output0)
    rc_after_dlpack_output1 = sys.getrefcount(output1)

    self.assertEqual(rc_after_dlpack_output0 - rc_before_dlpack_output0, 1)
    self.assertEqual(rc_after_dlpack_output1 - rc_before_dlpack_output1, 1)

    # Make sure that reference count decreases after destroying the DLPack
    # (rebinding the names to None drops the only reference to each capsule).
    output0_dlpack = None
    output1_dlpack = None
    rc_after_del_dlpack_output0 = sys.getrefcount(output0)
    rc_after_del_dlpack_output1 = sys.getrefcount(output1)
    self.assertEqual(rc_after_del_dlpack_output0 - rc_after_dlpack_output0,
                     -1)
    self.assertEqual(rc_after_del_dlpack_output1 - rc_after_dlpack_output1,
                     -1)

    # Fresh DLPack objects are handed out; output0/output1 and the response
    # themselves go out of scope when this function returns.
    return output0.to_dlpack(), output1.to_dlpack()
def response_thread(self, response_sender, in_input):
    """Produce the response for one request from a background thread.

    After a five second delay, runs ``self.execute_gpu_bls()``. If that
    reports failure, an error response is sent. Otherwise ``in_input`` is
    sent through the 'identity_fp32' model via BLS and the outcome is
    answered with either an error response (BLS error or input/output
    mismatch) or a response holding ``in_input`` as tensor 'OUT'.

    Every path sends exactly one response, flagged
    ``TRITONSERVER_RESPONSE_COMPLETE_FINAL``, and
    ``self.inflight_thread_count`` is decremented under its lock even if an
    exception escapes.

    Parameters
    ----------
    response_sender
        The response sender associated with the corresponding request.
    in_input
        NumPy array used as BLS input and echoed back as 'OUT'.
    """
    final_flag = pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
    try:
        # Sleep 5 seconds to make sure the main thread has exited.
        time.sleep(5)

        if not self.execute_gpu_bls():
            # The final flag is required here as well; without it the
            # request is never marked as complete.
            response_sender.send(
                pb_utils.InferenceResponse(error="GPU BLS test failed."),
                flags=final_flag)
            return

        infer_request = pb_utils.InferenceRequest(
            model_name='identity_fp32',
            requested_output_names=["OUTPUT0"],
            inputs=[pb_utils.Tensor('INPUT0', in_input)])
        infer_response = infer_request.exec()

        if infer_response.has_error():
            response = pb_utils.InferenceResponse(
                error=infer_response.error().message())
        else:
            # Only look up the output once the response is known to be good.
            output0 = pb_utils.get_output_tensor_by_name(
                infer_response, "OUTPUT0")
            if np.any(in_input != output0.as_numpy()):
                error_message = (
                    "BLS Request input and BLS response output do not match."
                    f" {in_input} != {output0.as_numpy()}")
                response = pb_utils.InferenceResponse(error=error_message)
            else:
                response = pb_utils.InferenceResponse(
                    output_tensors=[pb_utils.Tensor('OUT', in_input)])

        response_sender.send(response, flags=final_flag)
    finally:
        # Always release the in-flight slot, even when something above
        # raised, so the counter cannot stay stuck above zero.
        with self.inflight_thread_count_lck:
            self.inflight_thread_count -= 1
def execute(self, requests):
    """Forward INPUT0 of every request to the 'identity' model via BLS.

    Returns one ``pb_utils.InferenceResponse`` per request, each holding the
    OUTPUT0 tensor of the corresponding BLS response, in request order.

    Raises ``pb_utils.TritonModelException`` with the BLS error message as
    soon as one BLS response reports an error.
    """

    def _forward(request):
        # One synchronous BLS round trip for a single request.
        bls_request = pb_utils.InferenceRequest(
            model_name='identity',
            requested_output_names=["OUTPUT0"],
            inputs=[pb_utils.get_input_tensor_by_name(request, 'INPUT0')])
        bls_response = bls_request.exec()
        if bls_response.has_error():
            raise pb_utils.TritonModelException(
                bls_response.error().message())
        echoed = pb_utils.get_output_tensor_by_name(bls_response, 'OUTPUT0')
        return pb_utils.InferenceResponse(output_tensors=[echoed])

    return [_forward(request) for request in requests]
def create_addsub_inference_request(gpu=False):
    """Build a 'dlpack_add_sub' request with two random 16-element inputs.

    Parameters
    ----------
    gpu : bool
        When true the inputs are torch tensors moved to 'cuda' and wrapped
        through DLPack; otherwise they are float32 NumPy arrays.

    Returns
    -------
    tuple
        ``(input0, input1, infer_request)``; the request asks for OUTPUT0
        and OUTPUT1 and has not been executed yet.
    """
    if gpu:
        cuda0 = torch.rand(16).to('cuda')
        cuda1 = torch.rand(16).to('cuda')
        input0 = pb_utils.Tensor.from_dlpack('INPUT0', to_dlpack(cuda0))
        input1 = pb_utils.Tensor.from_dlpack('INPUT1', to_dlpack(cuda1))
    else:
        # Draw both arrays before casting, one after the other.
        host0 = np.random.randn(16)
        host1 = np.random.randn(16)
        input0 = pb_utils.Tensor('INPUT0', host0.astype(np.float32))
        input1 = pb_utils.Tensor('INPUT1', host1.astype(np.float32))

    request = pb_utils.InferenceRequest(
        model_name='dlpack_add_sub',
        inputs=[input0, input1],
        requested_output_names=['OUTPUT0', 'OUTPUT1'])
    return input0, input1, request
def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu):
    """Check 'dlpack_add_sub' through BLS for one CPU/GPU input placement.

    Parameters
    ----------
    is_input0_gpu, is_input1_gpu : bool
        Whether the respective random input is moved to 'cuda' before it
        is wrapped as a ``pb_utils.Tensor`` via DLPack.

    The response must have no error, both outputs must be present, and
    OUTPUT0/OUTPUT1 must equal the elementwise sum/difference of the inputs
    when everything is compared on the CPU.
    """

    def as_cpu(pb_tensor):
        # Round-trip through DLPack into a torch tensor located on the CPU.
        return from_dlpack(pb_tensor.to_dlpack()).to('cpu')

    lhs = torch.rand(16)
    rhs = torch.rand(16)
    if is_input0_gpu:
        lhs = lhs.to('cuda')
    if is_input1_gpu:
        rhs = rhs.to('cuda')

    input0_pb = pb_utils.Tensor.from_dlpack('INPUT0', to_dlpack(lhs))
    input1_pb = pb_utils.Tensor.from_dlpack('INPUT1', to_dlpack(rhs))

    response = pb_utils.InferenceRequest(
        model_name='dlpack_add_sub',
        inputs=[input0_pb, input1_pb],
        requested_output_names=['OUTPUT0', 'OUTPUT1']).exec()
    self.assertFalse(response.has_error())

    sum_out = pb_utils.get_output_tensor_by_name(response, 'OUTPUT0')
    diff_out = pb_utils.get_output_tensor_by_name(response, 'OUTPUT1')
    self.assertIsNotNone(sum_out)
    self.assertIsNotNone(diff_out)

    expected_sum = as_cpu(input0_pb) + as_cpu(input1_pb)
    expected_diff = as_cpu(input0_pb) - as_cpu(input1_pb)

    self.assertTrue(torch.all(expected_sum == as_cpu(sum_out)))
    self.assertTrue(torch.all(expected_diff == as_cpu(diff_out)))
def execute(self, requests):
    """Validate each request with a sync BLS call, then answer it in a thread.

    For every request the 'IN' tensor is first echoed through the
    'identity_fp32' model. A BLS error or an echo that differs from the
    input raises ``pb_utils.TritonModelException``. Otherwise a daemon
    thread running ``self.response_thread`` is started with the request's
    response sender and the 'IN' data, and ``self.inflight_thread_count``
    is incremented under its lock.

    Always returns ``None``; responses are delivered by the threads.
    """
    for request in requests:
        in_tensor = pb_utils.get_input_tensor_by_name(request, 'IN')

        # Sync BLS request
        bls_request = pb_utils.InferenceRequest(
            model_name='identity_fp32',
            requested_output_names=["OUTPUT0"],
            inputs=[pb_utils.Tensor('INPUT0', in_tensor.as_numpy())])
        bls_response = bls_request.exec()
        if bls_response.has_error():
            raise pb_utils.TritonModelException(
                f"BLS Response has an error: {bls_response.error().message()}"
            )

        echoed = pb_utils.get_output_tensor_by_name(bls_response, "OUTPUT0")
        if np.any(echoed.as_numpy() != in_tensor.as_numpy()):
            raise pb_utils.TritonModelException(
                f"BLS Request input and BLS response output do not match. {in_tensor.as_numpy()} != {echoed.as_numpy()}"
            )

        sender = request.get_response_sender()
        thread_input = pb_utils.get_input_tensor_by_name(request,
                                                         'IN').as_numpy()
        worker = threading.Thread(target=self.response_thread,
                                  args=(sender, thread_input),
                                  daemon=True)

        # Count the thread as in flight before it is started.
        with self.inflight_thread_count_lck:
            self.inflight_thread_count += 1

        worker.start()

    return None
def batch_rescoring(self, score_hyps, hist_enc, hist_mask_len, max_len):
    """
    Rescore beam-search candidates with the 'decoder' model through BLS.

    score_hyps: [((ctc_score, (id1, id2, id3, ....)), (), ...), ....]
        NOTE: an entry holding fewer than ``self.beam_size`` candidates is
        replaced in place by a list padded with ``(-10000, ())``.
    hist_enc: [len1xF, len2xF, .....]
    hist_mask_len: per-utterance value written into 'encoder_out_lens'
    max_len: size of the time axis of the zero-padded 'encoder_out' tensor

    return: ``best_index.numpy()[:, 0]``, i.e. the first column of the
        decoder's 'best_index' output

    raises pb_utils.TritonModelException: if the decoder response reports
        an error
    """
    bz = len(hist_enc)
    f = hist_enc[0].shape[-1]
    beam_size = self.beam_size
    encoder_lens = np.zeros((bz, 1), dtype=np.int32)
    encoder_out = torch.zeros((bz, max_len, f), dtype=self.dtype)
    hyps = []
    ctc_score = torch.zeros((bz, beam_size), dtype=self.dtype)
    max_seq_len = 0
    for i in range(bz):
        cur_len = hist_enc[i].shape[0]
        encoder_out[i, 0:cur_len] = hist_enc[i]
        encoder_lens[i, 0] = hist_mask_len[i]

        # process candidate: pad up to beam_size entries
        if len(score_hyps[i]) < beam_size:
            to_append = (beam_size - len(score_hyps[i])) * [(-10000, ())]
            score_hyps[i] = list(score_hyps[i]) + to_append
        for idx, (score, idlist) in enumerate(score_hyps[i]):
            # Scores are clamped from below at -10000.
            ctc_score[i][idx] = max(score, -10000)
            hyps.append(list(idlist))
            max_seq_len = max(max_seq_len, len(hyps[-1]))

    # Room for the leading sos and trailing eos token.
    max_seq_len += 2
    pad_shape = (bz, beam_size, max_seq_len)
    hyps_pad_sos_eos = np.full(pad_shape, self.eos, dtype=np.int64)  # fill eos
    if self.bidecoder:
        r_hyps_pad_sos_eos = np.full(pad_shape, self.eos, dtype=np.int64)

    hyps_lens_sos = np.ones((bz, beam_size), dtype=np.int32)
    for idx, cand in enumerate(hyps):
        # hyps is flat: beam_size consecutive candidates per batch entry.
        bz_id, bz_offset = divmod(idx, beam_size)
        length = len(cand) + 2
        pad_cand = [self.sos] + cand + [self.eos]
        hyps_pad_sos_eos[bz_id][bz_offset][0:length] = pad_cand
        if self.bidecoder:
            r_pad_cand = [self.sos] + cand[::-1] + [self.eos]
            r_hyps_pad_sos_eos[bz_id][bz_offset][0:length] = r_pad_cand
        hyps_lens_sos[bz_id][bz_offset] = len(cand) + 1

    in0 = pb_utils.Tensor.from_dlpack("encoder_out", to_dlpack(encoder_out))
    in1 = pb_utils.Tensor("encoder_out_lens", encoder_lens)
    in2 = pb_utils.Tensor("hyps_pad_sos_eos", hyps_pad_sos_eos)
    in3 = pb_utils.Tensor("hyps_lens_sos", hyps_lens_sos)
    input_tensors = [in0, in1, in2, in3]
    if self.bidecoder:
        in4 = pb_utils.Tensor("r_hyps_pad_sos_eos", r_hyps_pad_sos_eos)
        input_tensors.append(in4)
    in5 = pb_utils.Tensor.from_dlpack("ctc_score", to_dlpack(ctc_score))
    input_tensors.append(in5)

    request = pb_utils.InferenceRequest(
        model_name='decoder',
        requested_output_names=['best_index'],
        inputs=input_tensors)
    response = request.exec()
    # Surface decoder failures with their real message instead of failing
    # later while reading an output that is not there.
    if response.has_error():
        raise pb_utils.TritonModelException(response.error().message())

    best_index = pb_utils.get_output_tensor_by_name(response, 'best_index')
    # clone() so the result does not alias memory owned by the response.
    best_index = from_dlpack(best_index.to_dlpack()).clone()
    best_index = best_index.numpy()[:, 0]
    return best_index
def execute(self, requests):
    """`execute` must be implemented in every Python model. `execute`
    function receives a list of pb_utils.InferenceRequest as the only
    argument. This function is called when an inference is requested
    for this model.

    All requests are merged into one batch: CTC prefix beam search is run
    on 'batch_log_probs' / 'batch_log_probs_idx', the candidates are
    rescored by the 'decoder' model through BLS, and the best candidate of
    every utterance is mapped with ``map_batch`` and returned as 'OUTPUT0'.

    Parameters
    ----------
    requests : list
      A list of pb_utils.InferenceRequest

    Returns
    -------
    list
      A list of pb_utils.InferenceResponse. The length of this list must
      be the same as `requests`

    Raises
    ------
    pb_utils.TritonModelException
      If the 'decoder' BLS response reports an error.
    """
    responses = []

    # Every Python backend must iterate through list of requests and create
    # an instance of pb_utils.InferenceResponse class for each of them. You
    # should avoid storing any of the input Tensors in the class attributes
    # as they will be overridden in subsequent inference requests. You can
    # make a copy of the underlying NumPy array and store it if it is
    # required.

    batch_encoder_out, batch_encoder_lens = [], []
    batch_log_probs, batch_log_probs_idx = [], []
    batch_count = []
    batch_root = TrieVector()
    batch_start = []
    # NOTE(review): root_dict presumably only keeps the PathTrie objects
    # referenced while batch_root is in use -- confirm.
    root_dict = {}

    encoder_max_len = 0
    total = 0

    for request in requests:
        in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out")
        in_1 = pb_utils.get_input_tensor_by_name(request, "encoder_out_lens")
        in_2 = pb_utils.get_input_tensor_by_name(request, "batch_log_probs")
        in_3 = pb_utils.get_input_tensor_by_name(request,
                                                 "batch_log_probs_idx")

        batch_encoder_out.append(in_0.as_numpy())
        encoder_max_len = max(encoder_max_len,
                              batch_encoder_out[-1].shape[1])

        cur_b_lens = in_1.as_numpy()
        batch_encoder_lens.append(cur_b_lens)
        cur_batch = cur_b_lens.shape[0]
        batch_count.append(cur_batch)

        cur_b_log_probs = in_2.as_numpy()
        cur_b_log_probs_idx = in_3.as_numpy()
        for i in range(cur_batch):
            # Keep only the valid frames of each utterance.
            cur_len = cur_b_lens[i]
            cur_probs = cur_b_log_probs[i][0:cur_len, :].tolist()  # T X Beam
            cur_idx = cur_b_log_probs_idx[i][0:cur_len, :].tolist()  # T x Beam
            batch_log_probs.append(cur_probs)
            batch_log_probs_idx.append(cur_idx)
            root_dict[total] = PathTrie()
            batch_root.append(root_dict[total])
            batch_start.append(True)
            total += 1

    score_hyps = ctc_beam_search_decoder_batch(
        batch_log_probs,
        batch_log_probs_idx,
        batch_root,
        batch_start,
        self.beam_size,
        min(total, self.num_processes),
        blank_id=self.blank_id,
        space_id=-2,
        cutoff_prob=self.cutoff_prob,
        ext_scorer=self.lm)

    # Flatten the candidates: beam_size entries per utterance, in order.
    all_hyps = []
    all_ctc_score = []
    max_seq_len = 0
    for seq_cand in score_hyps:
        # if candidates less than beam size
        if len(seq_cand) != self.beam_size:
            seq_cand = list(seq_cand)
            seq_cand += (self.beam_size - len(seq_cand)) * [(-float("INF"),
                                                             (0, ))]

        for score, hyps in seq_cand:
            all_hyps.append(list(hyps))
            all_ctc_score.append(score)
            max_seq_len = max(len(hyps), max_seq_len)

    beam_size = self.beam_size
    feature_size = self.feature_size
    # Room for the leading sos and trailing eos token.
    hyps_max_len = max_seq_len + 2
    in_ctc_score = np.zeros((total, beam_size), dtype=self.data_type)
    in_hyps_pad_sos_eos = np.ones(
        (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos
    if self.bidecoder:
        in_r_hyps_pad_sos_eos = np.ones(
            (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos

    in_hyps_lens_sos = np.ones((total, beam_size), dtype=np.int32)

    in_encoder_out = np.zeros((total, encoder_max_len, feature_size),
                              dtype=self.data_type)
    in_encoder_out_lens = np.zeros(total, dtype=np.int32)

    # Walk the per-request data in lock-step instead of popping from the
    # front of the lists, which costs O(n) per pop.
    st = 0
    cand_pos = 0  # cursor into the flat all_hyps / all_ctc_score lists
    for b, t, lens in zip(batch_count, batch_encoder_out,
                          batch_encoder_lens):
        in_encoder_out[st:st + b, 0:t.shape[1]] = t
        in_encoder_out_lens[st:st + b] = lens
        for i in range(b):
            for j in range(beam_size):
                cur_hyp = all_hyps[cand_pos]
                cur_len = len(cur_hyp) + 2
                in_hyp = [self.sos] + cur_hyp + [self.eos]
                in_hyps_pad_sos_eos[st + i][j][0:cur_len] = in_hyp
                in_hyps_lens_sos[st + i][j] = cur_len - 1
                if self.bidecoder:
                    r_in_hyp = [self.sos] + cur_hyp[::-1] + [self.eos]
                    in_r_hyps_pad_sos_eos[st + i][j][0:cur_len] = r_in_hyp
                in_ctc_score[st + i][j] = all_ctc_score[cand_pos]
                cand_pos += 1
        st += b
    in_encoder_out_lens = np.expand_dims(in_encoder_out_lens, axis=1)

    in_tensor_0 = pb_utils.Tensor("encoder_out", in_encoder_out)
    in_tensor_1 = pb_utils.Tensor("encoder_out_lens", in_encoder_out_lens)
    in_tensor_2 = pb_utils.Tensor("hyps_pad_sos_eos", in_hyps_pad_sos_eos)
    in_tensor_3 = pb_utils.Tensor("hyps_lens_sos", in_hyps_lens_sos)
    input_tensors = [in_tensor_0, in_tensor_1, in_tensor_2, in_tensor_3]
    if self.bidecoder:
        in_tensor_4 = pb_utils.Tensor("r_hyps_pad_sos_eos",
                                      in_r_hyps_pad_sos_eos)
        input_tensors.append(in_tensor_4)
    in_tensor_5 = pb_utils.Tensor("ctc_score", in_ctc_score)
    input_tensors.append(in_tensor_5)

    inference_request = pb_utils.InferenceRequest(
        model_name='decoder',
        requested_output_names=['best_index'],
        inputs=input_tensors)

    inference_response = inference_request.exec()
    if inference_response.has_error():
        raise pb_utils.TritonModelException(
            inference_response.error().message())

    # Extract the output tensors from the inference response.
    best_index = pb_utils.get_output_tensor_by_name(inference_response,
                                                    'best_index')
    best_index = best_index.as_numpy()

    hyps = []
    for idx, (cands, cand_lens) in enumerate(
            zip(in_hyps_pad_sos_eos, in_hyps_lens_sos)):
        best_idx = best_index[idx][0]
        best_cand_len = cand_lens[best_idx] - 1  # remove sos
        best_cand = cands[best_idx][1:1 + best_cand_len].tolist()
        hyps.append(best_cand)

    hyps = map_batch(
        hyps, self.vocabulary,
        min(multiprocessing.cpu_count(), len(in_ctc_score)))

    # Split the flat result list back into one response per request.
    st = 0
    for b in batch_count:
        sents = np.array(hyps[st:st + b])
        out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype))
        inference_response = pb_utils.InferenceResponse(
            output_tensors=[out0])
        responses.append(inference_response)
        st += b
    return responses
def Execute(self, request, context):
    """Execute is called on TRITONBACKEND_ModelInstanceExecute. Inference
    happens in this function. This function mainly converts gRPC
    protobufs to the triton_python_backend_utils.InferenceRequest and
    triton_python_backend_utils.InferenceResponse.

    Parameters
    ----------
    request : python_host_pb2.ExecuteRequest
        Contains a `requests` attribute which is a list of
        python_host_pb2.InferenceRequest
    context
        gRPC context; its status code and details are set to INTERNAL when
        the backend has no `execute` method or returns a number of
        responses different from the number of requests.

    Returns
    -------
    ExecuteResponse
        One InferenceResponse per backend response, or an empty
        ExecuteResponse when an INTERNAL error was set on `context`.
    """
    inference_requests = []
    # The loop variable deliberately does not reuse the name `request`, so
    # the RPC argument is not shadowed.
    for request_pb in request.requests:
        # This object contains a list of tpb_utils.Tensor
        input_tensors = []
        for x in request_pb.inputs:
            numpy_type = tpb_utils.triton_to_numpy_type(x.dtype)

            # We need to deserialize TYPE_STRING. `np.object` no longer
            # exists in current NumPy; accept both the builtin `object`
            # (what that alias used to be) and `np.object_`.
            if numpy_type in (object, np.object_, np.bytes_):
                numpy_data = deserialize_bytes_tensor(x.raw_data)
                tensor = tpb_utils.Tensor(x.name,
                                          numpy_data.reshape(x.dims))
            else:
                tensor = tpb_utils.Tensor(
                    x.name,
                    np.frombuffer(x.raw_data,
                                  dtype=numpy_type).reshape(x.dims))
            input_tensors.append(tensor)

        inference_request = tpb_utils.InferenceRequest(
            input_tensors, request_pb.id, request_pb.correlation_id,
            request_pb.requested_output_names)
        inference_requests.append(inference_request)

    # Execute inference on the Python backend responses contains a list of
    # triton_python_backend_utils.InferenceResponse. Each backend must
    # implement an execute method
    if not hasattr(self.backend, 'execute'):
        context.set_code(grpc.StatusCode.INTERNAL)
        context.set_details('Backend does not implement `execute` method')
        return ExecuteResponse()

    responses = self.backend.execute(inference_requests)

    # Make sure that number of InferenceResponse and InferenceRequest
    # objects match
    if len(inference_requests) != len(responses):
        context.set_code(grpc.StatusCode.INTERNAL)
        # The counts are ints: format them instead of concatenating them
        # to a str, which raises TypeError.
        context.set_details(
            "Number of inference responses and requests don't match "
            f"( requests={len(inference_requests)}"
            f" != responses={len(responses)})")
        return ExecuteResponse()

    exec_responses = []
    for response in responses:
        # If there is an error do not look into output_tensors
        if response.has_error():
            error = Error(message=response.error().message())
            inference_response = InferenceResponse(outputs=[],
                                                   error=error,
                                                   failed=True)
            exec_responses.append(inference_response)
            continue

        response_tensors = []
        for output_tensor in response.output_tensors():
            output_np_array = output_tensor.as_numpy()
            # Remember the shape before the array is possibly replaced by
            # its serialized form.
            output_shape = output_np_array.shape

            # We need to serialize TYPE_STRING. The dtype.type of an
            # object array is `np.object_`, never the builtin `object`.
            if output_np_array.dtype.type in (np.object_, np.bytes_):
                output_np_array = serialize_byte_tensor(output_np_array)

            tensor = Tensor(name=output_tensor.name(),
                            dtype=tpb_utils.numpy_to_triton_type(
                                output_np_array.dtype.type),
                            dims=output_shape,
                            raw_data=output_np_array.tobytes())
            response_tensors.append(tensor)
        exec_responses.append(InferenceResponse(outputs=response_tensors))

    return ExecuteResponse(responses=exec_responses)
def test_infer_request_args(self):
    """Argument validation of ``pb_utils.InferenceRequest``.

    Invalid values for ``inputs``, ``model_name``,
    ``requested_output_names``, ``correlation_id``, ``request_id``,
    ``model_version`` and ``flags`` must raise the exception type asserted
    below, while empty ``inputs`` / ``requested_output_names`` lists must
    be accepted.
    """
    # Dummy arguments used in the tests.
    inputs = [
        pb_utils.Tensor('INPUT0', np.asarray([1, 2], dtype=np.int32))
    ]
    model_name = 'my_model'
    requested_output_names = ['my_output']

    #
    # inputs field validation
    #

    # Test list of None as inputs
    with self.assertRaises(pb_utils.TritonModelException):
        pb_utils.InferenceRequest(
            inputs=[None],
            model_name=model_name,
            requested_output_names=requested_output_names)

    # Test None object as list of inputs
    with self.assertRaises(TypeError):
        pb_utils.InferenceRequest(
            inputs=None,
            model_name=model_name,
            requested_output_names=requested_output_names)

    # model_name validation
    with self.assertRaises(TypeError):
        pb_utils.InferenceRequest(
            model_name=None,
            inputs=inputs,
            requested_output_names=requested_output_names)

    #
    # Requested output name validations
    #

    # Test list of None objects as requested_output_names
    with self.assertRaises(TypeError):
        pb_utils.InferenceRequest(requested_output_names=[None],
                                  inputs=inputs,
                                  model_name=model_name)

    with self.assertRaises(TypeError):
        pb_utils.InferenceRequest(requested_output_names=None,
                                  inputs=inputs,
                                  model_name=model_name)

    # Other arguments validation

    # correlation_id set to None. The keyword must be spelled correctly:
    # a misspelled keyword raises TypeError for being unknown, so the test
    # would pass without ever validating the None value.
    with self.assertRaises(TypeError):
        pb_utils.InferenceRequest(
            requested_output_names=requested_output_names,
            inputs=inputs,
            model_name=model_name,
            correlation_id=None)

    # request_id set to None
    with self.assertRaises(TypeError):
        pb_utils.InferenceRequest(
            requested_output_names=requested_output_names,
            inputs=inputs,
            model_name=model_name,
            request_id=None)

    # model_version set to None
    with self.assertRaises(TypeError):
        pb_utils.InferenceRequest(
            requested_output_names=requested_output_names,
            inputs=inputs,
            model_name=model_name,
            model_version=None)

    # flags set to None
    with self.assertRaises(TypeError):
        pb_utils.InferenceRequest(
            requested_output_names=requested_output_names,
            inputs=inputs,
            model_name=model_name,
            flags=None)

    # Empty lists should not raise an exception
    pb_utils.InferenceRequest(requested_output_names=[],
                              inputs=[],
                              model_name=model_name)
def Execute(self, request, context):
    """Execute is called on TRITONBACKEND_ModelInstanceExecute. Inference
    happens in this function. This function mainly converts gRPC
    protobufs to the triton_python_backend_utils.InferenceRequest and
    triton_python_backend_utils.InferenceResponse.

    Parameters
    ----------
    request : python_host_pb2.ExecuteRequest
        Contains a `requests` attribute which is a list of
        python_host_pb2.InferenceRequest
    context
        gRPC context; its status code and details are set to INTERNAL when
        the backend returns a number of responses different from the
        number of requests.

    Returns
    -------
    ExecuteResponse
        One InferenceResponse per backend response, or an empty
        ExecuteResponse when an INTERNAL error was set on `context`.
    """
    inference_requests = []
    # The loop variable deliberately does not reuse the name `request`, so
    # the RPC argument is not shadowed.
    for request_pb in request.requests:
        # This object contains a list of tpb_utils.Tensor
        input_tensors = []
        for x in request_pb.inputs:
            tensor = tpb_utils.Tensor(
                x.name,
                np.frombuffer(x.raw_data,
                              dtype=protobuf_to_numpy_type(
                                  x.dtype)).reshape(x.dims))
            input_tensors.append(tensor)

        inference_request = tpb_utils.InferenceRequest(
            input_tensors, request_pb.id, request_pb.correlation_id,
            request_pb.requested_output_names)
        inference_requests.append(inference_request)

    # Execute inference on the Python backend responses contains a list of
    # triton_python_backend_utils.InferenceResponse
    responses = self.backend(inference_requests)

    # Make sure that number of InferenceResponse and InferenceRequest
    # objects match
    if len(inference_requests) != len(responses):
        context.set_code(grpc.StatusCode.INTERNAL)
        # The counts are ints: format them instead of concatenating them
        # to a str, which raises TypeError.
        context.set_details(
            "Number of inference responses and requests don't match "
            f"( requests={len(inference_requests)}"
            f" != responses={len(responses)})")
        return ExecuteResponse()

    exec_responses = []
    for response in responses:
        response_tensors = []
        for output_tensor in response.output_tensors():
            output_np_array = output_tensor.numpy_array()
            tensor = Tensor(name=output_tensor.name(),
                            dtype=numpy_to_protobuf_type(
                                output_np_array.dtype.type),
                            dims=output_np_array.shape,
                            raw_data=output_np_array.tobytes())
            response_tensors.append(tensor)
        exec_responses.append(InferenceResponse(outputs=response_tensors))

    return ExecuteResponse(responses=exec_responses)
def execute(self, requests):
    """Forward each request to the model named by its MODEL_NAME input.

    `execute` must be implemented in every Python model. It receives a list
    of pb_utils.InferenceRequest as its only argument and must produce one
    pb_utils.InferenceResponse per request. Depending on the batching
    configuration (e.g. Dynamic Batching), `requests` may contain multiple
    requests.

    For every request, INPUT0 and INPUT1 are sent to the model whose name
    is the first element of the MODEL_NAME input, and that model's output
    tensors are returned unchanged.

    Parameters
    ----------
    requests : list
      A list of pb_utils.InferenceRequest

    Returns
    -------
    list
      A list of pb_utils.InferenceResponse. The length of this list is the
      same as `requests`

    Raises
    ------
    pb_utils.TritonModelException
      If the nested inference request reports an error.
    """

    def forward(request):
        # Gather the two data tensors and the tensor naming the target model.
        first_input = pb_utils.get_input_tensor_by_name(request, "INPUT0")
        second_input = pb_utils.get_input_tensor_by_name(request, "INPUT1")
        name_tensor = pb_utils.get_input_tensor_by_name(
            request, "MODEL_NAME")
        target_model = name_tensor.as_numpy()[0]

        # Synchronous, blocking inference request against the target model.
        bls_response = pb_utils.InferenceRequest(
            model_name=target_model,
            inputs=[first_input, second_input],
            requested_output_names=["OUTPUT0", "OUTPUT1"],
        ).exec()

        # This model cannot proceed without the nested result, so a failed
        # nested request is surfaced as an exception.
        if bls_response.has_error():
            raise pb_utils.TritonModelException(
                bls_response.error().message())

        # The nested response already carries the final outputs under the
        # correct output names, so they are passed through as-is. An error
        # could be attached to the InferenceResponse here instead if this
        # request had to be reported as failed.
        return pb_utils.InferenceResponse(
            output_tensors=bls_response.output_tensors())

    # One response per request, in request order.
    return [forward(request) for request in requests]
def execute(self, requests):
    """Return INPUT0 as OUTPUT0 on the device selected by GPU_OUTPUT.

    For each request the first element of GPU_OUTPUT decides whether
    OUTPUT0 is produced on the GPU (truthy) or on the CPU (falsy); the
    remaining elements are returned as NEXT_GPU_OUTPUT. Unless this model
    is 'dlpack_io_identity_1', the same inputs are also sent to
    'dlpack_io_identity_1' through BLS and its OUTPUT0 is checked against
    INPUT0.

    Parameters
    ----------
    requests : list
        A list of pb_utils.InferenceRequest

    Returns
    -------
    list
        One pb_utils.InferenceResponse per request, holding OUTPUT0 and
        NEXT_GPU_OUTPUT.

    Raises
    ------
    pb_utils.TritonModelException
        If the BLS request fails or its output differs from INPUT0.
    """
    responses = []
    for request in requests:
        input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
        gpu_output = pb_utils.get_input_tensor_by_name(
            request, "GPU_OUTPUT").as_numpy()

        input_on_gpu = not input0.is_cpu()
        wants_gpu = bool(gpu_output[0])

        if input_on_gpu == wants_gpu:
            # Already on the requested device: hand the tensor through.
            output0 = pb_utils.Tensor.from_dlpack("OUTPUT0",
                                                  input0.to_dlpack())
        elif wants_gpu:
            # CPU input, GPU output requested.
            relocated = from_dlpack(input0.to_dlpack()).cuda()
            output0 = pb_utils.Tensor.from_dlpack("OUTPUT0",
                                                  to_dlpack(relocated))
        else:
            # GPU input, CPU output requested.
            relocated = from_dlpack(input0.to_dlpack()).cpu()
            output0 = pb_utils.Tensor.from_dlpack("OUTPUT0",
                                                  to_dlpack(relocated))

        next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT", gpu_output[1:])

        # Do not perform BLS inference if it is the first
        # model in the pipeline.
        if self._model_name != 'dlpack_io_identity_1':
            bls_inputs = [
                input0,
                pb_utils.get_input_tensor_by_name(request, "GPU_OUTPUT"),
            ]
            infer_response = pb_utils.InferenceRequest(
                model_name='dlpack_io_identity_1',
                inputs=bls_inputs,
                requested_output_names=['OUTPUT0']).exec()

            if infer_response.has_error():
                raise pb_utils.TritonModelException(
                    infer_response.error().message())

            bls_output0 = pb_utils.get_output_tensor_by_name(
                infer_response, 'OUTPUT0')

            # Bring both tensors to host numpy arrays before comparing.
            # The placement of this model's own OUTPUT0 decides how the BLS
            # output is converted.
            if output0.is_cpu():
                bls_values = bls_output0.as_numpy()
            else:
                bls_values = from_dlpack(
                    bls_output0.to_dlpack()).detach().cpu().numpy()

            if input0.is_cpu():
                expected_values = input0.as_numpy()
            else:
                expected_values = from_dlpack(
                    input0.to_dlpack()).detach().cpu().numpy()

            if not np.allclose(bls_values, expected_values):
                raise pb_utils.TritonModelException(
                    'BLS input and output tensors are not equal')

        responses.append(
            pb_utils.InferenceResponse([output0, next_gpu_output]))

    return responses
async def execute(self, requests):
    """Run each request through 'pytorch' and 'add_sub' concurrently.

    `execute` must be implemented in every Python model. It receives a list
    of pb_utils.InferenceRequest as its only argument and must produce one
    pb_utils.InferenceResponse per request. Depending on the batching
    configuration (e.g. Dynamic Batching), `requests` may contain multiple
    requests.

    For every request, INPUT0 and INPUT1 are sent to both models with
    `async_exec`; the response combines OUTPUT0 of 'pytorch' with OUTPUT1
    of 'add_sub'.

    Parameters
    ----------
    requests : list
      A list of pb_utils.InferenceRequest

    Returns
    -------
    list
      A list of pb_utils.InferenceResponse. The length of this list is the
      same as `requests`

    Raises
    ------
    pb_utils.TritonModelException
      If either nested inference request reports an error.
    """
    responses = []

    for request in requests:
        first_input = pb_utils.get_input_tensor_by_name(request, "INPUT0")
        second_input = pb_utils.get_input_tensor_by_name(request, "INPUT1")

        # Start both nested requests without awaiting them individually, so
        # that they are in flight at the same time.
        pending = [
            pb_utils.InferenceRequest(
                model_name=target_model,
                requested_output_names=["OUTPUT0", "OUTPUT1"],
                inputs=[first_input, second_input]).async_exec()
            for target_model in ('pytorch', 'add_sub')
        ]

        # Suspend here until every nested request has finished. `gather`
        # returns the results in the order the awaitables were passed in.
        pytorch_response, addsub_response = await asyncio.gather(*pending)

        # This model cannot proceed without both results, so a failed
        # nested request is surfaced as an exception.
        for bls_response in (pytorch_response, addsub_response):
            if bls_response.has_error():
                raise pb_utils.TritonModelException(
                    bls_response.error().message())

        # OUTPUT0 comes from the "pytorch" model, OUTPUT1 from "add_sub".
        pytorch_output0_tensor = pb_utils.get_output_tensor_by_name(
            pytorch_response, "OUTPUT0")
        addsub_output1_tensor = pb_utils.get_output_tensor_by_name(
            addsub_response, "OUTPUT1")

        # The nested outputs already have the correct output names, so they
        # are passed straight to the InferenceResponse. An error could be
        # attached here instead if this request had to be reported as
        # failed.
        responses.append(
            pb_utils.InferenceResponse(output_tensors=[
                pytorch_output0_tensor, addsub_output1_tensor
            ]))

    # The length of this list must match the length of `requests`.
    return responses
def test_bls_tensor_lifecycle(self):
    """Check that BLS input/output tensors are released between requests.

    The 'dlpack_identity' model is called 50 times with a large CPU tensor
    and then 50 times with a large GPU tensor. Each output must equal its
    input, and from the third GPU iteration on, the free GPU memory
    measured at the start of the iteration must equal the value recorded
    in the second iteration.
    """
    target_model = 'dlpack_identity'

    # 10 Mi float32 elements per tensor, i.e. roughly 40 MiB of data.
    element_count = 10 * 1024 * 1024
    iterations = 50

    # CPU tensors: send the tensor repeatedly to test whether the
    # deallocation is happening correctly. Per the original note, a leak is
    # expected to show up as an out of shared memory error.
    for _ in range(iterations):
        input0 = np.ones([1, element_count], dtype=np.float32)
        input_tensor = pb_utils.Tensor('INPUT0', input0)
        bls_request = pb_utils.InferenceRequest(
            model_name=target_model,
            inputs=[input_tensor],
            requested_output_names=['OUTPUT0'])
        bls_response = bls_request.exec()
        self.assertFalse(bls_response.has_error())

        output_tensor = pb_utils.get_output_tensor_by_name(
            bls_response, 'OUTPUT0')
        np.testing.assert_equal(output_tensor.as_numpy(), input0,
                                "BLS CPU memory lifecycle failed.")

    # GPU tensors: same check, tracking the free device memory.
    for iteration in range(iterations):
        # Drop the references left over from the previous iteration before
        # measuring, in the same order on every pass.
        input0 = bls_request = input_tensor = None
        torch.cuda.empty_cache()
        free_bytes, _ = torch.cuda.mem_get_info()

        # The second iteration provides the baseline; every later one has
        # to match it exactly.
        if iteration == 1:
            baseline_free_bytes = free_bytes
        elif iteration > 1:
            self.assertEqual(free_bytes, baseline_free_bytes,
                             "GPU memory lifecycle test failed.")

        input0 = torch.ones([1, element_count],
                            dtype=torch.float32).to('cuda')
        input_tensor = pb_utils.Tensor.from_dlpack('INPUT0',
                                                   to_dlpack(input0))
        bls_request = pb_utils.InferenceRequest(
            model_name=target_model,
            inputs=[input_tensor],
            requested_output_names=['OUTPUT0'])
        bls_response = bls_request.exec()
        self.assertFalse(bls_response.has_error())

        output_tensor = pb_utils.get_output_tensor_by_name(
            bls_response, 'OUTPUT0')
        output0_pytorch = from_dlpack(output_tensor.to_dlpack())

        # Release the output tensor and the response before comparing, to
        # make sure that the DLPack-backed tensor is still valid without
        # them.
        output_tensor = None
        bls_response = None
        self.assertTrue(
            torch.all(output0_pytorch == input0),
            f"input ({input0}) and output ({output0_pytorch}) didn't match for identity model."
        )