def _basic_inference(self,
                     shm_ip0_handle,
                     shm_ip1_handle,
                     shm_op0_handle,
                     shm_op1_handle,
                     error_msg,
                     big_shm_name="",
                     big_shm_size=64):
    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
    input1_data = np.ones(shape=16, dtype=np.int32)
    inputs = []
    outputs = []
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
        inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
        inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

    inputs[0].set_shared_memory("input0_data", 64)

    # INPUT1 may be passed as a raw numpy array instead of a shared-memory
    # handle; in that case send the tensor contents directly.
    if isinstance(shm_ip1_handle, np.ndarray):
        if _protocol == "http":
            inputs[1].set_data_from_numpy(input1_data, binary_data=True)
        else:
            inputs[1].set_data_from_numpy(input1_data)
    elif big_shm_name != "":
        inputs[1].set_shared_memory(big_shm_name, big_shm_size)
    else:
        inputs[1].set_shared_memory("input1_data", 64)

    outputs[0].set_shared_memory("output0_data", 64)
    outputs[1].set_shared_memory("output1_data", 64)

    try:
        results = triton_client.infer("simple",
                                      inputs,
                                      model_version="",
                                      outputs=outputs)
        output = results.get_output('OUTPUT0')
        if _protocol == "http":
            output_datatype = output['datatype']
            output_shape = output['shape']
        else:
            output_datatype = output.datatype
            output_shape = output.shape
        output_dtype = utils.triton_to_np_dtype(output_datatype)
        output_data = shm.get_contents_as_numpy(shm_op0_handle, output_dtype,
                                                output_shape)
        self.assertTrue(
            (output_data[0] == (input0_data + input1_data)).all(),
            "Model output does not match expected output")
    except Exception as ex:
        error_msg.append(str(ex))
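
# A minimal sketch (not part of the test above) of how the fixed-name
# regions that _basic_inference references could be created and registered,
# assuming a Triton server on localhost:8000 serving the "simple" example
# model. Region names, key paths, and the 64-byte sizes match what the test
# expects; everything else is illustrative.
import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.shared_memory as shm

triton_client = httpclient.InferenceServerClient("localhost:8000")
triton_client.unregister_system_shared_memory()

input0_data = np.arange(start=0, stop=16, dtype=np.int32)  # 16 * 4 = 64 bytes

# Create the system shared-memory region and copy the tensor into it.
shm_ip0_handle = shm.create_shared_memory_region("input0_data",
                                                 "/input0_data", 64)
shm.set_shared_memory_region(shm_ip0_handle, [input0_data])

# Register the region with the server so requests can reference it by name.
triton_client.register_system_shared_memory("input0_data", "/input0_data", 64)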

def predict(self, input_images):
    # Put input data values into shared memory
    shm.set_shared_memory_region(self.input_images_handle, [input_images])

    results = self.triton_client.infer(model_name=self.model_name,
                                       inputs=self.inputs,
                                       outputs=self.outputs)

    # Read results from the shared memory.
    output = results.get_output("output")
    output_data = shm.get_contents_as_numpy(
        self.output_handle, utils.triton_to_np_dtype(output.datatype),
        output.shape)
    return output_data
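
# Hypothetical usage of the predict wrapper above, assuming its constructor
# (not shown) created self.inputs/self.outputs, registered
# self.input_images_handle and self.output_handle with the server, and set
# self.model_name. The class name and tensor shape are illustrative:
#
#   client = ShmImageClient(...)  # name is illustrative, not from the source
#   batch = np.zeros((1, 3, 224, 224), dtype=np.float32)
#   scores = client.predict(batch)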

outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
outputs[-1].set_shared_memory("output0_data", output0_byte_size)

outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))
outputs[-1].set_shared_memory("output1_data", output1_byte_size)

results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs)

# Read results from the shared memory.
output0 = results.get_output("OUTPUT0")
if output0 is not None:
    print(utils.triton_to_np_dtype(output0.datatype))
    output0_data = shm.get_contents_as_numpy(
        shm_op0_handle, utils.triton_to_np_dtype(output0.datatype),
        output0.shape)
else:
    print("OUTPUT0 is missing in the response.")
    sys.exit(1)

output1 = results.get_output("OUTPUT1")
if output1 is not None:
    output1_data = shm.get_contents_as_numpy(
        shm_op1_handle, utils.triton_to_np_dtype(output1.datatype),
        output1.shape)
else:
    print("OUTPUT1 is missing in the response.")
    sys.exit(1)

for i in range(16):

def infer_exact(tester,
                pf,
                tensor_shape,
                batch_size,
                input_dtype,
                output0_dtype,
                output1_dtype,
                output0_raw=True,
                output1_raw=True,
                model_version=None,
                swap=False,
                outputs=("OUTPUT0", "OUTPUT1"),
                use_http=True,
                use_grpc=True,
                use_http_json_tensors=True,
                skip_request_id_check=False,
                use_streaming=True,
                correlation_id=0,
                shm_region_names=None,
                precreated_shm_regions=None,
                use_system_shared_memory=False,
                use_cuda_shared_memory=False,
                priority=0,
                timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)

    # configs [ url, protocol, async stream, binary data ]
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if output0_raw == output1_raw:
            # Float16 not supported for Input and Output via JSON
            if use_http_json_tensors and (input_dtype != np.float16) and \
                    (output0_dtype != np.float16) and \
                    (output1_dtype != np.float16):
                configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))

    # Outputs are sum and difference of inputs, so set max input values so
    # that they will not overflow the output. This allows us to do an exact
    # match. For float types use the int8, int16, int32 range for fp16,
    # fp32, fp64 respectively. When getting class outputs the result
    # value/probability is returned as a float so must use fp32 range in
    # that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(np.iinfo(rinput_dtype).min,
                  np.iinfo(routput0_dtype).min,
                  np.iinfo(routput1_dtype).min) / 2
    val_max = min(np.iinfo(rinput_dtype).max,
                  np.iinfo(routput0_dtype).max,
                  np.iinfo(routput1_dtype).max) / 2

    num_classes = 3

    input0_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    input1_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    if input_dtype != np.object_:
        input0_array = input0_array.astype(input_dtype)
        input1_array = input1_array.astype(input_dtype)

    if not swap:
        output0_array = input0_array + input1_array
        output1_array = input0_array - input1_array
    else:
        output0_array = input0_array - input1_array
        output1_array = input0_array + input1_array

    if output0_dtype == np.object_:
        output0_array = np.array(
            [str(x) for x in output0_array.flatten()],
            dtype=object).reshape(output0_array.shape)
    else:
        output0_array = output0_array.astype(output0_dtype)
    if output1_dtype == np.object_:
        output1_array = np.array(
            [str(x) for x in output1_array.flatten()],
            dtype=object).reshape(output1_array.shape)
    else:
        output1_array = output1_array.astype(output1_dtype)

    if input_dtype == np.object_:
        in0n = np.array(
            [str(x) for x in input0_array.reshape(input0_array.size)],
            dtype=object)
        input0_array = in0n.reshape(input0_array.shape)
        in1n = np.array(
            [str(x) for x in input1_array.reshape(input1_array.size)],
            dtype=object)
        input1_array = in1n.reshape(input1_array.shape)

    # prepend size of string to output string data
    if output0_dtype == np.object_:
        if batch_size == 1:
            output0_array_tmp = serialize_byte_tensor_list([output0_array])
        else:
            output0_array_tmp = serialize_byte_tensor_list(output0_array)
    else:
        output0_array_tmp = output0_array

    if output1_dtype == np.object_:
        if batch_size == 1:
            output1_array_tmp = serialize_byte_tensor_list([output1_array])
        else:
            output1_array_tmp = serialize_byte_tensor_list(output1_array)
    else:
        output1_array_tmp = output1_array

    # Get model platform
    model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                   output1_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    if platform == "pytorch_libtorch":
        OUTPUT0 = "OUTPUT__0"
        OUTPUT1 = "OUTPUT__1"
        INPUT0 = "INPUT__0"
        INPUT1 = "INPUT__1"
    else:
        OUTPUT0 = "OUTPUT0"
        OUTPUT1 = "OUTPUT1"
        INPUT0 = "INPUT0"
        INPUT1 = "INPUT1"

    output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp])
    output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp])

    if batch_size == 1:
        input0_list = [input0_array]
        input1_list = [input1_array]
    else:
        input0_list = [x for x in input0_array]
        input1_list = [x for x in input1_array]

    # Serialization of string tensors in the case of shared memory must be
    # done manually
    if input_dtype == np.object_:
        input0_list_tmp = serialize_byte_tensor_list(input0_list)
        input1_list_tmp = serialize_byte_tensor_list(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    # Create system/cuda shared memory regions if needed
    shm_regions, shm_handles = su.create_set_shm_regions(
        input0_list_tmp, input1_list_tmp, output0_byte_size,
        output1_byte_size, outputs, shm_region_names, precreated_shm_regions,
        use_system_shared_memory, use_cuda_shared_memory)

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        if config[1] == "http":
            inputs.append(
                httpclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                httpclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
        else:
            inputs.append(
                grpcclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                grpcclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))

        if not (use_cuda_shared_memory or use_system_shared_memory):
            if config[1] == "http":
                inputs[0].set_data_from_numpy(input0_array,
                                              binary_data=config[3])
                inputs[1].set_data_from_numpy(input1_array,
                                              binary_data=config[3])
            else:
                inputs[0].set_data_from_numpy(input0_array)
                inputs[1].set_data_from_numpy(input1_array)
        else:
            # Register necessary shared memory regions/handles
            su.register_add_shm_regions(inputs, outputs, shm_regions,
                                        precreated_shm_regions, shm_handles,
                                        input0_byte_size, input1_byte_size,
                                        output0_byte_size, output1_byte_size,
                                        use_system_shared_memory,
                                        use_cuda_shared_memory, triton_client)

        if batch_size == 1:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape((1,) + tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape((1,) + tensor_shape)
            ]
        else:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape(tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape(tensor_shape)
            ]

        # Force binary_data = False for shared memory and class
        output_req = []
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(
                            OUTPUT0, binary_data=config[3]))
                else:
                    output_req.append(
                        grpcclient.InferRequestedOutput(OUTPUT0))
                output_req[-1].set_shared_memory(shm_regions[2] + '_data',
                                                 output0_byte_size)
            else:
                if output0_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT0))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT0, class_count=num_classes))
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(
                            OUTPUT1, binary_data=config[3]))
                else:
                    output_req.append(
                        grpcclient.InferRequestedOutput(OUTPUT1))
                output_req[-1].set_shared_memory(shm_regions[2 + i] + '_data',
                                                 output1_byte_size)
            else:
                if output1_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT1))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT1, class_count=num_classes))

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback,
                                               user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()))
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(
                model_name,
                inputs,
                model_version=model_version,
                outputs=output_req,
                request_id=str(_unique_request_id()))

        last_response = results.get_response()

        if not skip_request_id_check:
            global _seen_request_ids
            if config[1] == "http":
                request_id = int(last_response["id"])
            else:
                request_id = int(last_response.id)
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)
        if model_version != "":
            tester.assertEqual(str(response_model_version), model_version)
        tester.assertEqual(len(response_outputs), len(outputs))

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            if ((result_name == OUTPUT0 and output0_raw) or
                    (result_name == OUTPUT1 and output1_raw)):
                if use_system_shared_memory or use_cuda_shared_memory:
                    if result_name == OUTPUT0:
                        shm_handle = shm_handles[2]
                    else:
                        shm_handle = shm_handles[3]

                    output = results.get_output(result_name)
                    if config[1] == "http":
                        output_datatype = output['datatype']
                        output_shape = output['shape']
                    else:
                        output_datatype = output.datatype
                        output_shape = output.shape
                    output_dtype = triton_to_np_dtype(output_datatype)

                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = results.as_numpy(result_name)

                if (output_data.dtype == np.object_) and (not config[3]):
                    output_data = output_data.astype(np.bytes_)

                if result_name == OUTPUT0:
                    tester.assertTrue(
                        np.array_equal(output_data, output0_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT0, output0_array, output_data))
                elif result_name == OUTPUT1:
                    tester.assertTrue(
                        np.array_equal(output_data, output1_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT1, output1_array, output_data))
                else:
                    tester.assertTrue(
                        False, "unexpected raw result {}".format(result_name))
            else:
                for b in range(batch_size):
                    # num_classes values must be returned and must
                    # match expected top values
                    if "nobatch" in pf:
                        class_list = results.as_numpy(result_name)
                    else:
                        class_list = results.as_numpy(result_name)[b]

                    tester.assertEqual(len(class_list), num_classes)
                    if batch_size == 1:
                        expected0_flatten = output0_array.flatten()
                        expected1_flatten = output1_array.flatten()
                    else:
                        expected0_flatten = output0_array[b].flatten()
                        expected1_flatten = output1_array[b].flatten()

                    for idx, class_label in enumerate(class_list):
                        # can't compare indices since could have different
                        # indices with the same value/prob, so check that
                        # the value of each index equals the expected value.
                        # Only compare labels when the indices are equal.
                        if isinstance(class_label, str):
                            ctuple = class_label.split(':')
                        else:
                            ctuple = "".join(
                                chr(x) for x in class_label).split(':')
                        cval = float(ctuple[0])
                        cidx = int(ctuple[1])
                        if result_name == OUTPUT0:
                            tester.assertEqual(cval, expected0_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if cidx == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2], 'label{}'.format(
                                        expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(cval, expected1_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(
                                    result_name))

    # Unregister system/cuda shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                      precreated_shm_regions, outputs,
                                      use_system_shared_memory,
                                      use_cuda_shared_memory)

    return results
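
# A sketch of how infer_exact is typically driven from a test case. The
# model family ("graphdef"), shape, and dtypes are illustrative; `self` is
# assumed to be a unittest.TestCase and a Triton server is assumed to be
# serving the matching add/sub qa model on localhost:
#
#   infer_exact(self, "graphdef", (1, 16), 1,
#               np.int32, np.int32, np.int32,
#               use_system_shared_memory=True)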

def infer_zero(tester,
               pf,
               batch_size,
               tensor_dtype,
               input_shapes,
               output_shapes,
               model_version=None,
               use_http=True,
               use_grpc=True,
               use_http_json_tensors=True,
               use_streaming=True,
               shm_region_name_prefix=None,
               use_system_shared_memory=False,
               use_cuda_shared_memory=False,
               priority=0,
               timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if use_http_json_tensors and (tensor_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

    # Get model platform
    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    for io_num in range(io_cnt):
        if platform == "pytorch_libtorch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if rtensor_dtype != np.bool_:
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape,
                                            dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != np.object_:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            expected_array = np.array(
                [str(x) for x in input_array.flatten()], dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # Serialization of string tensors in the case of shared memory must
        # be done manually
        if tensor_dtype == np.object_:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region(
            [
                shm_region_name_prefix[0] + str(io_num),
                shm_region_name_prefix[1] + str(io_num)
            ], input_list_tmp, input_byte_size, output_byte_size,
            use_system_shared_memory, use_cuda_shared_memory)

        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])
        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(
                zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(
                    httpclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    httpclient.InferRequestedOutput(output_name,
                                                    binary_data=config[3]))
            else:
                inputs.append(
                    grpcclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data,
                                                   binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(
                    inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num,
                    input_byte_size, output_byte_size,
                    use_system_shared_memory, use_cuda_shared_memory,
                    triton_client)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback,
                                               user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()),
                    priority=priority,
                    timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(
                model_name,
                inputs,
                model_version=model_version,
                outputs=output_req,
                request_id=str(_unique_request_id()),
                priority=priority,
                timeout=timeout_us)

        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)
        if model_version != "":
            tester.assertEqual(response_model_version, model_version)
        tester.assertEqual(len(response_outputs), io_cnt)

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                if platform == "pytorch_libtorch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])
                shm_handle = shm_op_handles[io_num]

                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)

            if use_system_shared_memory:
                output_data = shm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            elif use_cuda_shared_memory:
                output_data = cudashm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            if (output_data.dtype == np.object_) and (not config[3]):
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(
                np.array_equal(output_data, expected),
                "{}, {}, expected: {}, got {}".format(
                    model_name, result_name, expected, output_data))

    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            # Unregister the input region, then the output region, then
            # destroy both handles.
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
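
# A sketch of a typical infer_zero call: one identity IO pair with matching
# input/output shapes. The "graphdef" family and shapes are illustrative;
# `self` is assumed to be a unittest.TestCase with a server running locally:
#
#   infer_zero(self, "graphdef", 1, np.float32,
#              input_shapes=[(1, 8)], output_shapes=[(1, 8)],
#              use_system_shared_memory=True)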

def infer_shape_tensor(tester,
                       pf,
                       tensor_dtype,
                       input_shape_values,
                       dummy_input_shapes,
                       use_http=True,
                       use_grpc=True,
                       use_streaming=True,
                       shm_suffix="",
                       use_system_shared_memory=False,
                       priority=0,
                       timeout_us=0,
                       batch_size=1):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    tester.assertTrue(pf == "plan" or pf == "plan_nobatch")
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))

    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True))

    io_cnt = len(input_shape_values)

    # FIXME wrap up shm handle cleanup
    # item is (handle, byte_size)
    input_shm_handle_list = []
    output_shm_handle_list = []
    dummy_input_list = []
    input_list = []
    expected_dict = dict()

    # Prepare IO in advance
    for io_num in range(io_cnt):
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        input_name = "INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)

        # Prepare the dummy tensor
        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if rtensor_dtype != np.bool_:
            dummy_in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                          high=np.iinfo(rtensor_dtype).max,
                                          size=dummy_input_shapes[io_num],
                                          dtype=rtensor_dtype)
        else:
            dummy_in0 = np.random.choice(a=[False, True],
                                         size=dummy_input_shapes[io_num])
        if tensor_dtype != np.object_:
            dummy_in0 = dummy_in0.astype(tensor_dtype)
        else:
            dummy_in0 = np.array([str(x) for x in dummy_in0.flatten()],
                                 dtype=object).reshape(dummy_in0.shape)
        dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected value for the output. Skip dummy output as we
        # only care about its shape (== value of OUTPUT*)
        expected_dict[output_name] = np.ndarray.copy(in0)

        # Only need to create region once
        input_byte_size = in0.size * np.dtype(np.int32).itemsize
        output_byte_size = input_byte_size * batch_size
        if use_system_shared_memory:
            input_shm_handle_list.append(
                (shm.create_shared_memory_region(
                    input_name + shm_suffix, '/' + input_name + shm_suffix,
                    input_byte_size), input_byte_size))
            output_shm_handle_list.append(
                (shm.create_shared_memory_region(
                    output_name + shm_suffix, '/' + output_name + shm_suffix,
                    output_byte_size), output_byte_size))
            shm.set_shared_memory_region(input_shm_handle_list[-1][0], [
                in0,
            ])

    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    # Run inference and check results for each config
    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient
        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)

        inputs = []
        outputs = []

        # Set IOs
        for io_num in range(io_cnt):
            dummy_input_name = "DUMMY_INPUT{}".format(io_num)
            input_name = "INPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

            inputs.append(
                client_utils.InferInput(dummy_input_name,
                                        dummy_input_shapes[io_num],
                                        np_to_triton_dtype(tensor_dtype)))
            inputs.append(
                client_utils.InferInput(input_name, input_list[io_num].shape,
                                        "INT32"))
            outputs.append(
                client_utils.InferRequestedOutput(dummy_output_name))
            outputs.append(client_utils.InferRequestedOutput(output_name))

            # -2: dummy; -1: input
            inputs[-2].set_data_from_numpy(dummy_input_list[io_num])
            if not use_system_shared_memory:
                inputs[-1].set_data_from_numpy(input_list[io_num])
            else:
                input_byte_size = input_shm_handle_list[io_num][1]
                output_byte_size = output_shm_handle_list[io_num][1]
                triton_client.register_system_shared_memory(
                    input_name + shm_suffix, "/" + input_name + shm_suffix,
                    input_byte_size)
                triton_client.register_system_shared_memory(
                    output_name + shm_suffix, "/" + output_name + shm_suffix,
                    output_byte_size)

                inputs[-1].set_shared_memory(input_name + shm_suffix,
                                             input_byte_size)
                outputs[-1].set_shared_memory(output_name + shm_suffix,
                                              output_byte_size)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback,
                                               user_data))
            try:
                results = triton_client.async_stream_infer(model_name,
                                                           inputs,
                                                           outputs=outputs,
                                                           priority=priority,
                                                           timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          outputs=outputs,
                                          priority=priority,
                                          timeout=timeout_us)

        for io_num in range(io_cnt):
            output_name = "OUTPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            expected = expected_dict[output_name]

            # get outputs as numpy array
            dummy_out = results.as_numpy(dummy_output_name)
            if not use_system_shared_memory:
                out = results.as_numpy(output_name)
            else:
                output = results.get_output(output_name)
                if config[1] == "grpc":
                    output_shape = output.shape
                else:
                    output_shape = output["shape"]
                out = shm.get_contents_as_numpy(
                    output_shm_handle_list[io_num][0], np.int32, output_shape)

            # if out shape is 2D, it is batched
            if len(out.shape) == 2:
                # The shape of the dummy output should be equal to the shape
                # values specified in the shape tensor
                tester.assertTrue(
                    np.array_equal(dummy_out.shape[1:], out[0]),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out[0],
                        dummy_out.shape[1:]))
                for b in range(1, out.shape[0]):
                    tester.assertTrue(
                        np.array_equal(out[b - 1], out[b]),
                        "expect shape tensor has consistent value, "
                        "expected: {}, got {}".format(out[b - 1], out[b]))
                out = out[0]
            else:
                tester.assertTrue(
                    np.array_equal(dummy_out.shape, out),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out, dummy_out.shape))
            tester.assertTrue(
                np.array_equal(out, expected),
                "{}, {}, expected: {}, got {}".format(model_name, output_name,
                                                      expected, out))

        # unregister shared memory region for next config
        if use_system_shared_memory:
            triton_client.unregister_system_shared_memory(input_name +
                                                          shm_suffix)
            triton_client.unregister_system_shared_memory(output_name +
                                                          shm_suffix)

    for handle in input_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
    for handle in output_shm_handle_list:
        shm.destroy_shared_memory_region(handle[0])
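
# A sketch of a typical infer_shape_tensor call: one IO pair where
# input_shape_values carries the shape-tensor values and dummy_input_shapes
# carries the shape of the dummy data tensor (all values illustrative):
#
#   infer_shape_tensor(self, "plan", np.float32,
#                      input_shape_values=[[4, 4]],
#                      dummy_input_shapes=[(1, 4, 4)],
#                      use_system_shared_memory=True)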

def check_sequence_shape_tensor_io(self,
                                   model_name,
                                   input_dtype,
                                   correlation_id,
                                   sequence_thresholds,
                                   values,
                                   expected_result,
                                   shm_region_handles,
                                   using_dynamic_batcher=False,
                                   sequence_name="<unknown>"):
    """Perform sequence of inferences using async run. The 'values' holds a
    list of tuples, one for each inference with format:

    (flag_str, shape_value, value, pre_delay_ms)

    """
    tensor_shape = (1, 1)
    # shape tensor is 1-D tensor that doesn't contain batch size as first
    # value
    shape_tensor_shape = (1,)
    self.assertFalse(_test_cuda_shared_memory,
                     "Shape tensors do not support CUDA shared memory")

    client_utils = grpcclient
    triton_client = client_utils.InferenceServerClient(
        f"{_tritonserver_ipaddr}:8001", verbose=True)
    user_data = UserData()
    triton_client.start_stream(partial(completion_callback, user_data))
    # Execute the sequence of inference...
    try:
        seq_start_ms = int(round(time.time() * 1000))

        sent_count = 0
        shape_values = list()
        for flag_str, shape_value, value, pre_delay_ms in values:
            seq_start = False
            seq_end = False
            if flag_str is not None:
                seq_start = ("start" in flag_str)
                seq_end = ("end" in flag_str)

            # Construct request IOs
            inputs = []
            outputs = []
            # input order: input, shape(, dummy)
            inputs.append(
                client_utils.InferInput(
                    "INPUT", tensor_shape,
                    np_to_triton_dtype(
                        np.int32 if using_dynamic_batcher else input_dtype)))
            inputs.append(
                client_utils.InferInput("SHAPE_INPUT", shape_tensor_shape,
                                        np_to_triton_dtype(np.int32)))
            if using_dynamic_batcher:
                inputs.append(
                    client_utils.InferInput("DUMMY_INPUT", tensor_shape,
                                            np_to_triton_dtype(input_dtype)))
            # output order: shape, output, resized
            outputs.append(client_utils.InferRequestedOutput("SHAPE_OUTPUT"))
            outputs.append(client_utils.InferRequestedOutput("OUTPUT"))
            outputs.append(
                client_utils.InferRequestedOutput("RESIZED_OUTPUT"))

            # Set IO values
            shape_values.append(
                np.full(shape_tensor_shape, shape_value, dtype=np.int32))
            if not _test_system_shared_memory:
                if using_dynamic_batcher:
                    if input_dtype == np.object_:
                        dummy_in0 = np.full(tensor_shape, value,
                                            dtype=np.int32)
                        dummy_in0n = np.array(
                            [
                                str(x)
                                for x in dummy_in0.reshape(dummy_in0.size)
                            ],
                            dtype=object)
                        dummy_in0 = dummy_in0n.reshape(tensor_shape)
                    else:
                        dummy_in0 = np.full(tensor_shape, value,
                                            dtype=input_dtype)
                    in0 = np.full(tensor_shape, value, dtype=np.int32)
                else:
                    if input_dtype == np.object_:
                        in0 = np.full(tensor_shape, value, dtype=np.int32)
                        in0n = np.array(
                            [str(x) for x in in0.reshape(in0.size)],
                            dtype=object)
                        in0 = in0n.reshape(tensor_shape)
                    else:
                        in0 = np.full(tensor_shape, value, dtype=input_dtype)

                inputs[0].set_data_from_numpy(in0)
                inputs[1].set_data_from_numpy(shape_values[-1])
                if using_dynamic_batcher:
                    inputs[2].set_data_from_numpy(dummy_in0)
            else:
                if using_dynamic_batcher:
                    input_offset = 6 * sent_count
                    output_offset = 6 * sent_count + 3
                else:
                    input_offset = 5 * sent_count
                    output_offset = 5 * sent_count + 2
                for i in range(len(inputs)):
                    inputs[i].set_shared_memory(
                        shm_region_handles[input_offset + i][0],
                        shm_region_handles[input_offset + i][1])
                for i in range(len(outputs)):
                    outputs[i].set_shared_memory(
                        shm_region_handles[output_offset + i][0],
                        shm_region_handles[output_offset + i][1])

            if pre_delay_ms is not None:
                time.sleep(pre_delay_ms / 1000.0)

            triton_client.async_stream_infer(model_name,
                                             inputs,
                                             outputs=outputs,
                                             sequence_id=correlation_id,
                                             sequence_start=seq_start,
                                             sequence_end=seq_end)
            sent_count += 1

        # Wait for the results in the order sent
        result = None
        processed_count = 0
        while processed_count < sent_count:
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error

            # Get value of "OUTPUT", for shared memory, need to get it via
            # shared memory utils
            if not _test_system_shared_memory:
                out = results.as_numpy("OUTPUT")
            else:
                output = results.get_output("OUTPUT")
                output_offset = 6 * processed_count + 4 \
                    if using_dynamic_batcher else 5 * processed_count + 3
                output_shape = output.shape
                output_type = np.int32 if using_dynamic_batcher else np.float32
                out = shm.get_contents_as_numpy(
                    shm_region_handles[output_offset][2], output_type,
                    output_shape)
            result = out[0][0]

            # Validate the (debatched) shape of the resized output matches
            # with the shape input values
            resized_shape = results.get_output("RESIZED_OUTPUT").shape[1:]
            self.assertTrue(
                np.array_equal(resized_shape, shape_values[processed_count]),
                "{}, {}, slot {}, expected: {}, got {}".format(
                    model_name, "RESIZED_OUTPUT", processed_count,
                    shape_values[processed_count], resized_shape))
            print("{}: {}".format(sequence_name, result))
            processed_count += 1

        seq_end_ms = int(round(time.time() * 1000))

        if input_dtype == np.object_:
            self.assertEqual(int(result), expected_result)
        else:
            self.assertEqual(result, expected_result)

        if sequence_thresholds is not None:
            lt_ms = sequence_thresholds[0]
            gt_ms = sequence_thresholds[1]
            if lt_ms is not None:
                if _test_jetson:
                    lt_ms *= _jetson_slowdown_factor
                self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                "sequence expected less than " + str(lt_ms) +
                                "ms response time, got " +
                                str(seq_end_ms - seq_start_ms) + " ms")
            if gt_ms is not None:
                self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                "sequence expected greater than " +
                                str(gt_ms) + "ms response time, got " +
                                str(seq_end_ms - seq_start_ms) + " ms")
    except Exception as ex:
        self.add_deferred_exception(ex)
    triton_client.stop_stream()
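
# Illustrative 'values' argument for check_sequence_shape_tensor_io,
# following the (flag_str, shape_value, value, pre_delay_ms) tuple format
# described in the docstring (the values themselves are made up):
#
#   values = [("start", 1, 1, None), (None, 1, 2, None), ("end", 1, 3, None)]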

outputs.append(
    httpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
outputs[-1].set_shared_memory("output_data",
                              output_byte_size,
                              offset=output_byte_size)

results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs)

# Read results from the shared memory.
output0 = results.get_output("OUTPUT0")
if output0 is not None:
    output0_data = shm.get_contents_as_numpy(
        shm_op_handle, utils.triton_to_np_dtype(output0['datatype']),
        output0['shape'])
else:
    print("OUTPUT0 is missing in the response.")
    sys.exit(1)

output1 = results.get_output("OUTPUT1")
if output1 is not None:
    output1_data = shm.get_contents_as_numpy(
        shm_op_handle,
        utils.triton_to_np_dtype(output1['datatype']),
        output1['shape'],
        offset=output_byte_size)
else:
    print("OUTPUT1 is missing in the response.")
    sys.exit(1)

def check_sequence_async(self,
                         trial,
                         model_name,
                         input_dtype,
                         correlation_id,
                         sequence_thresholds,
                         values,
                         expected_result,
                         shm_region_handles,
                         batch_size=1,
                         sequence_name="<unknown>",
                         tensor_shape=(1,)):
    """Perform sequence of inferences using stream async run. The 'values'
    holds a list of tuples, one for each inference with format:

    (flag_str, value, pre_delay_ms)

    """
    if (("savedmodel" not in trial) and ("graphdef" not in trial) and
            ("custom" not in trial) and ("onnx" not in trial) and
            ("libtorch" not in trial) and ("plan" not in trial)):
        self.assertFalse(True, "unknown trial type: " + trial)

    self.assertFalse(
        _test_system_shared_memory and _test_cuda_shared_memory,
        "Cannot set both System and CUDA shared memory flags to 1")

    full_shape = tensor_shape if "nobatch" in trial else (
        batch_size,) + tensor_shape

    client_utils = grpcclient
    triton_client = client_utils.InferenceServerClient(
        f"{_tritonserver_ipaddr}:8001", verbose=True)
    user_data = UserData()
    triton_client.start_stream(partial(completion_callback, user_data))
    # Execute the sequence of inference...
    try:
        seq_start_ms = int(round(time.time() * 1000))

        INPUT = "INPUT__0" if trial.startswith("libtorch") else "INPUT"
        OUTPUT = "OUTPUT__0" if trial.startswith("libtorch") else "OUTPUT"
        sent_count = 0
        for flag_str, value, pre_delay_ms in values:
            seq_start = False
            seq_end = False
            if flag_str is not None:
                seq_start = ("start" in flag_str)
                seq_end = ("end" in flag_str)

            # Construct request IOs
            inputs = []
            outputs = []
            inputs.append(
                client_utils.InferInput(INPUT, full_shape,
                                        np_to_triton_dtype(input_dtype)))
            outputs.append(client_utils.InferRequestedOutput(OUTPUT))
            if not (_test_system_shared_memory or _test_cuda_shared_memory):
                if input_dtype == np.object_:
                    in0 = np.full(full_shape, value, dtype=np.int32)
                    in0n = np.array(
                        [str(x) for x in in0.reshape(in0.size)],
                        dtype=object)
                    in0 = in0n.reshape(full_shape)
                else:
                    in0 = np.full(full_shape, value, dtype=input_dtype)
                inputs[0].set_data_from_numpy(in0)
            else:
                offset = 2 * sent_count
                inputs[0].set_shared_memory(shm_region_handles[offset][0],
                                            shm_region_handles[offset][1])
                outputs[0].set_shared_memory(
                    shm_region_handles[offset + 1][0],
                    shm_region_handles[offset + 1][1])

            if pre_delay_ms is not None:
                time.sleep(pre_delay_ms / 1000.0)

            triton_client.async_stream_infer(model_name,
                                             inputs,
                                             outputs=outputs,
                                             sequence_id=correlation_id,
                                             sequence_start=seq_start,
                                             sequence_end=seq_end)
            sent_count += 1

        # Wait for the results in the order sent
        result = None
        processed_count = 0
        while processed_count < sent_count:
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error

            # Get value of "OUTPUT", for shared memory, need to get it via
            # shared memory utils
            if (not _test_system_shared_memory) and (
                    not _test_cuda_shared_memory):
                out = results.as_numpy(OUTPUT)
            else:
                output = results.get_output(OUTPUT)
                offset = 2 * processed_count + 1
                output_shape = output.shape
                output_type = input_dtype
                if _test_system_shared_memory:
                    out = shm.get_contents_as_numpy(
                        shm_region_handles[offset][2], output_type,
                        output_shape)
                else:
                    out = cudashm.get_contents_as_numpy(
                        shm_region_handles[offset][2], output_type,
                        output_shape)
            result = out[0] if "nobatch" in trial else out[0][0]
            print("{}: {}".format(sequence_name, result))
            processed_count += 1

        seq_end_ms = int(round(time.time() * 1000))

        if input_dtype == np.object_:
            self.assertEqual(int(result), expected_result)
        else:
            self.assertEqual(result, expected_result)

        if sequence_thresholds is not None:
            lt_ms = sequence_thresholds[0]
            gt_ms = sequence_thresholds[1]
            if lt_ms is not None:
                if _test_jetson:
                    lt_ms *= _jetson_slowdown_factor
                self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                "sequence expected less than " + str(lt_ms) +
                                "ms response time, got " +
                                str(seq_end_ms - seq_start_ms) + " ms")
            if gt_ms is not None:
                self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                "sequence expected greater than " +
                                str(gt_ms) + "ms response time, got " +
                                str(seq_end_ms - seq_start_ms) + " ms")
    except Exception as ex:
        self.add_deferred_exception(ex)
    triton_client.stop_stream()
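
# Illustrative 'values' argument for check_sequence_async, following the
# (flag_str, value, pre_delay_ms) tuple format from the docstring; the
# middle request here waits 50 ms before being sent (made-up numbers):
#
#   values = [("start", 1, None), (None, 2, 50), ("end", 3, None)]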

def check_sequence(self,
                   trial,
                   model_name,
                   input_dtype,
                   correlation_id,
                   sequence_thresholds,
                   values,
                   expected_result,
                   protocol,
                   batch_size=1,
                   sequence_name="<unknown>",
                   tensor_shape=(1,)):
    """Perform sequence of inferences. The 'values' holds a list of tuples,
    one for each inference with format:

    (flag_str, value, (lt_ms, gt_ms), (pre_delay_ms, post_delay_ms))

    """
    if (("savedmodel" not in trial) and ("graphdef" not in trial) and
            ("custom" not in trial) and ("onnx" not in trial) and
            ("libtorch" not in trial) and ("plan" not in trial)):
        self.assertFalse(True, "unknown trial type: " + trial)

    # Can only send the request exactly once since it is a
    # sequence model with state, so can have only a single config.
    configs = []
    if protocol == "http":
        configs.append((f"{_tritonserver_ipaddr}:8000", "http", False))
    if protocol == "grpc":
        configs.append((f"{_tritonserver_ipaddr}:8001", "grpc", False))
    if protocol == "streaming":
        configs.append((f"{_tritonserver_ipaddr}:8001", "grpc", True))

    self.assertFalse(
        _test_system_shared_memory and _test_cuda_shared_memory,
        "Cannot set both System and CUDA shared memory flags to 1")
    self.assertEqual(len(configs), 1)

    full_shape = tensor_shape if "nobatch" in trial else (
        batch_size,) + tensor_shape

    # create and register shared memory output region in advance,
    # knowing that this function will not be called concurrently.
    if _test_system_shared_memory or _test_cuda_shared_memory:
        self.triton_client_.unregister_system_shared_memory()
        self.triton_client_.unregister_cuda_shared_memory()
        output_byte_size = 512
        if _test_system_shared_memory:
            shm_op_handle = shm.create_shared_memory_region(
                "output_data", "/output", output_byte_size)
            self.triton_client_.register_system_shared_memory(
                "output_data", "/output", output_byte_size)
        elif _test_cuda_shared_memory:
            shm_op_handle = cudashm.create_shared_memory_region(
                "output_data", output_byte_size, 0)
            self.triton_client_.register_cuda_shared_memory(
                "output_data", cudashm.get_raw_handle(shm_op_handle), 0,
                output_byte_size)

    shm_ip_handles = []

    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient

        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)
        if config[2]:
            user_data = UserData()
            triton_client.start_stream(
                partial(completion_callback, user_data))
        # Execute the sequence of inference...
        try:
            seq_start_ms = int(round(time.time() * 1000))

            INPUT = "INPUT__0" if trial.startswith("libtorch") else "INPUT"
            OUTPUT = "OUTPUT__0" if trial.startswith(
                "libtorch") else "OUTPUT"
            for flag_str, value, thresholds, delay_ms in values:
                if _test_valgrind or _test_jetson:
                    if delay_ms is not None:
                        delay_ms[0] = max(_valgrind_delay_ms, delay_ms[0])
                        delay_ms[1] = max(_valgrind_delay_ms, delay_ms[1])
                    else:
                        delay_ms = (_valgrind_delay_ms, _valgrind_delay_ms)

                if delay_ms is not None:
                    time.sleep(delay_ms[0] / 1000.0)

                seq_start = False
                seq_end = False
                if flag_str is not None:
                    seq_start = ("start" in flag_str)
                    seq_end = ("end" in flag_str)

                # Construct request IOs
                inputs = []
                outputs = []
                inputs.append(
                    client_utils.InferInput(INPUT, full_shape,
                                            np_to_triton_dtype(input_dtype)))
                outputs.append(client_utils.InferRequestedOutput(OUTPUT))

                if input_dtype == np.object_:
                    in0 = np.full(full_shape, value, dtype=np.int32)
                    in0n = np.array(
                        [str(x) for x in in0.reshape(in0.size)],
                        dtype=object)
                    in0 = in0n.reshape(full_shape)
                else:
                    in0 = np.full(full_shape, value, dtype=input_dtype)

                # create input shared memory and copy input data values
                # into it
                if _test_system_shared_memory or _test_cuda_shared_memory:
                    if input_dtype == np.object_:
                        input_list_tmp = iu.serialize_byte_tensor_list([in0])
                        input_byte_size = sum([
                            serialized_byte_size(i0)
                            for i0 in input_list_tmp
                        ])
                    else:
                        input_list_tmp = [in0]
                        input_byte_size = sum(
                            [i0.nbytes for i0 in input_list_tmp])
                    ip_name = "ip{}".format(len(shm_ip_handles))
                    if _test_system_shared_memory:
                        shm_ip_handles.append(
                            shm.create_shared_memory_region(
                                ip_name, "/" + ip_name, input_byte_size))
                        shm.set_shared_memory_region(shm_ip_handles[-1],
                                                     input_list_tmp)
                        triton_client.register_system_shared_memory(
                            ip_name, "/" + ip_name, input_byte_size)
                    elif _test_cuda_shared_memory:
                        shm_ip_handles.append(
                            cudashm.create_shared_memory_region(
                                ip_name, input_byte_size, 0))
                        cudashm.set_shared_memory_region(
                            shm_ip_handles[-1], input_list_tmp)
                        triton_client.register_cuda_shared_memory(
                            ip_name,
                            cudashm.get_raw_handle(shm_ip_handles[-1]), 0,
                            input_byte_size)

                    inputs[0].set_shared_memory(ip_name, input_byte_size)
                    outputs[0].set_shared_memory("output_data",
                                                 output_byte_size)
                else:
                    inputs[0].set_data_from_numpy(in0)

                start_ms = int(round(time.time() * 1000))

                if config[2]:
                    triton_client.async_stream_infer(
                        model_name,
                        inputs,
                        outputs=outputs,
                        sequence_id=correlation_id,
                        sequence_start=seq_start,
                        sequence_end=seq_end)

                    (results, error) = user_data._completed_requests.get()
                    if error is not None:
                        raise error
                else:
                    results = triton_client.infer(
                        model_name,
                        inputs,
                        outputs=outputs,
                        sequence_id=correlation_id,
                        sequence_start=seq_start,
                        sequence_end=seq_end)

                end_ms = int(round(time.time() * 1000))

                # Get value of "OUTPUT", for shared memory, need to get it
                # via shared memory utils
                if (not _test_system_shared_memory) and (
                        not _test_cuda_shared_memory):
                    out = results.as_numpy(OUTPUT)
                else:
                    output = results.get_output(OUTPUT)
                    if config[1] == "http":
                        output_shape = output["shape"]
                    else:
                        output_shape = output.shape
                    output_type = input_dtype
                    if _test_system_shared_memory:
                        out = shm.get_contents_as_numpy(
                            shm_op_handle, output_type, output_shape)
                    else:
                        out = cudashm.get_contents_as_numpy(
                            shm_op_handle, output_type, output_shape)
                result = out[0] if "nobatch" in trial else out[0][0]
                print("{}: {}".format(sequence_name, result))

                if thresholds is not None:
                    lt_ms = thresholds[0]
                    gt_ms = thresholds[1]
                    if lt_ms is not None:
                        self.assertTrue((end_ms - start_ms) < lt_ms,
                                        "expected less than " + str(lt_ms) +
                                        "ms response time, got " +
                                        str(end_ms - start_ms) + " ms")
                    if gt_ms is not None:
                        self.assertTrue((end_ms - start_ms) > gt_ms,
                                        "expected greater than " +
                                        str(gt_ms) +
                                        "ms response time, got " +
                                        str(end_ms - start_ms) + " ms")
                if delay_ms is not None:
                    time.sleep(delay_ms[1] / 1000.0)

            seq_end_ms = int(round(time.time() * 1000))

            if input_dtype == np.object_:
                self.assertEqual(int(result), expected_result)
            else:
                self.assertEqual(result, expected_result)

            if sequence_thresholds is not None:
                lt_ms = sequence_thresholds[0]
                gt_ms = sequence_thresholds[1]
                if lt_ms is not None:
                    if _test_jetson:
                        lt_ms *= _jetson_slowdown_factor
                    self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                    "sequence expected less than " +
                                    str(lt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
                if gt_ms is not None:
                    self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                    "sequence expected greater than " +
                                    str(gt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
        except Exception as ex:
            self.add_deferred_exception(ex)

        if config[2]:
            triton_client.stop_stream()

    if _test_system_shared_memory or _test_cuda_shared_memory:
        self.triton_client_.unregister_system_shared_memory()
        self.triton_client_.unregister_cuda_shared_memory()
        destroy_func = shm.destroy_shared_memory_region \
            if _test_system_shared_memory \
            else cudashm.destroy_shared_memory_region
        destroy_func(shm_op_handle)
        for shm_ip_handle in shm_ip_handles:
            destroy_func(shm_ip_handle)
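
# Illustrative 'values' argument for check_sequence, following the
# (flag_str, value, (lt_ms, gt_ms), (pre_delay_ms, post_delay_ms)) tuple
# format from the docstring. Thresholds and delays are made up; None means
# no per-request threshold or delay:
#
#   values = [("start", 1, None, None),
#             (None, 2, (2000, None), None),
#             ("end", 3, None, [50, 50])]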

outputs.append(
    httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
outputs[-1].set_shared_memory("output0_data", output0_byte_size)

outputs.append(
    httpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
outputs[-1].set_shared_memory("output1_data", output1_byte_size)

results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs)

# Read results from the shared memory.
output0 = results.get_output("OUTPUT0")
if output0 is not None:
    print(utils.triton_to_np_dtype(output0['datatype']))
    output0_data = shm.get_contents_as_numpy(
        shm_op0_handle, utils.triton_to_np_dtype(output0['datatype']),
        output0['shape'])
else:
    print("OUTPUT0 is missing in the response.")
    sys.exit(1)

output1 = results.get_output("OUTPUT1")
if output1 is not None:
    output1_data = shm.get_contents_as_numpy(
        shm_op1_handle, utils.triton_to_np_dtype(output1['datatype']),
        output1['shape'])
else:
    print("OUTPUT1 is missing in the response.")
    sys.exit(1)

for i in range(16):

def infer_and_validate(use_shared_memory, orig_input0_data, orig_input1_data):
    if use_shared_memory:
        input0_data = orig_input0_data
        input1_data = orig_input1_data
        byte_size = input0_data.size * input0_data.itemsize
        inputs[0].set_shared_memory("input0_data", byte_size)
        inputs[1].set_shared_memory("input1_data", byte_size)
        outputs[0].set_shared_memory("output0_data", byte_size)
        outputs[1].set_shared_memory("output1_data", byte_size)
    else:
        input0_data = orig_input0_data
        input1_data = orig_input1_data * 2
        inputs[0].set_data_from_numpy(np.expand_dims(input0_data, axis=0))
        inputs[1].set_data_from_numpy(np.expand_dims(input1_data, axis=0))
        outputs[0].unset_shared_memory()
        outputs[1].unset_shared_memory()

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        if use_shared_memory:
            if protocol == "grpc":
                output0_data = shm.get_contents_as_numpy(
                    shm_op0_handle,
                    utils.triton_to_np_dtype(output0.datatype),
                    output0.shape)
            else:
                output0_data = shm.get_contents_as_numpy(
                    shm_op0_handle,
                    utils.triton_to_np_dtype(output0['datatype']),
                    output0['shape'])
        else:
            output0_data = results.as_numpy('OUTPUT0')
    else:
        print("OUTPUT0 is missing in the response.")
        sys.exit(1)

    output1 = results.get_output("OUTPUT1")
    if output1 is not None:
        if use_shared_memory:
            if protocol == "grpc":
                output1_data = shm.get_contents_as_numpy(
                    shm_op1_handle,
                    utils.triton_to_np_dtype(output1.datatype),
                    output1.shape)
            else:
                output1_data = shm.get_contents_as_numpy(
                    shm_op1_handle,
                    utils.triton_to_np_dtype(output1['datatype']),
                    output1['shape'])
        else:
            output1_data = results.as_numpy('OUTPUT1')
    else:
        print("OUTPUT1 is missing in the response.")
        sys.exit(1)

    if use_shared_memory:
        print("\n\n======== SHARED_MEMORY ========\n")
    else:
        print("\n\n======== NO_SHARED_MEMORY ========\n")
    for i in range(16):
        print(
            str(input0_data[i]) + " + " + str(input1_data[i]) + " = " +
            str(output0_data[0][i]))
        print(
            str(input0_data[i]) + " - " + str(input1_data[i]) + " = " +
            str(output1_data[0][i]))
        if (input0_data[i] + input1_data[i]) != output0_data[0][i]:
            print("shm infer error: incorrect sum")
            sys.exit(1)
        if (input0_data[i] - input1_data[i]) != output1_data[0][i]:
            print("shm infer error: incorrect difference")
            sys.exit(1)
    print("\n======== END ========\n\n")
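
# The helper above is typically invoked twice against the same pre-created
# inputs/outputs so the shared-memory and raw-tensor paths can be compared
# (a sketch; the data values mirror those used elsewhere in these examples):
#
#   input0_data = np.arange(start=0, stop=16, dtype=np.int32)
#   input1_data = np.ones(shape=16, dtype=np.int32)
#   infer_and_validate(True, input0_data, input1_data)   # shared memory
#   infer_and_validate(False, input0_data, input1_data)  # raw tensor contents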

outputs[-1].set_shared_memory("output_data", output_byte_size)

outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))
outputs[-1].set_shared_memory("output_data",
                              output_byte_size,
                              offset=output_byte_size)

results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs)

# Read results from the shared memory.
output0 = results.get_output("OUTPUT0")
if output0 is not None:
    output0_data = shm.get_contents_as_numpy(
        shm_op_handle, utils.triton_to_np_dtype(output0.datatype),
        output0.shape)
else:
    print("OUTPUT0 is missing in the response.")
    sys.exit(1)

output1 = results.get_output("OUTPUT1")
if output1 is not None:
    output1_data = shm.get_contents_as_numpy(
        shm_op_handle,
        utils.triton_to_np_dtype(output1.datatype),
        output1.shape,
        offset=output_byte_size)
else:
    print("OUTPUT1 is missing in the response.")
    sys.exit(1)