def setUp(self):
    global _deferred_exceptions
    _deferred_exceptions = []

    # The helper client for setup will be GRPC for simplicity.
    self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001")
    self.model_name_ = 'identity_2_float32'
    # This will not be changed even when ensemble is under test,
    # as the dynamic batching is performed within the composing model
    self.check_status_model = 'identity_2_float32'
    self.tensor_shape_ = (1, 1)
    self.inputs_ = {
        "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"),
        "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32")
    }
    self.input_data_ = {
        "INPUT0": np.ones(shape=(1, 1), dtype=np.float32),
        "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32)
    }
    self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"])
    self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"])
    self.outputs_ = {
        "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'),
        "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1')
    }
def simple_string_inference(triton_client):
    model_name = 'simple_string'

    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('INPUT0', [1, 16], "BYTES"))
    inputs.append(grpcclient.InferInput('INPUT1', [1, 16], "BYTES"))

    # Create the data for the two input tensors. Initialize the first
    # to unique integers and the second to all ones.
    in0 = np.arange(start=0, stop=16, dtype=np.int32)
    in0 = np.expand_dims(in0, axis=0)
    in1 = np.ones(shape=(1, 16), dtype=np.int32)
    expected_sum = np.add(in0, in1)
    expected_diff = np.subtract(in0, in1)

    # The 'simple_string' model expects 2 BYTES tensors where each
    # element in those tensors is the utf-8 string representation of
    # an integer. The BYTES tensors must be represented by a numpy
    # array with dtype=np.object_.
    in0n = np.array(
        [str(x).encode('utf-8') for x in in0.reshape(in0.size)],
        dtype=np.object_)
    input0_data = in0n.reshape(in0.shape)
    in1n = np.array(
        [str(x).encode('utf-8') for x in in1.reshape(in1.size)],
        dtype=np.object_)
    input1_data = in1n.reshape(in1.shape)

    # Initialize the data
    inputs[0].set_data_from_numpy(input0_data)
    inputs[1].set_data_from_numpy(input1_data)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
    outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Get the output arrays from the results
    output0_data = results.as_numpy('OUTPUT0')
    output1_data = results.as_numpy('OUTPUT1')

    for i in range(16):
        print(
            str(input0_data[0][i]) + " + " + str(input1_data[0][i]) + " = " +
            str(output0_data[0][i]))
        print(
            str(input0_data[0][i]) + " - " + str(input1_data[0][i]) + " = " +
            str(output1_data[0][i]))

        # Convert result from string to int to check result
        r0 = int(output0_data[0][i])
        r1 = int(output1_data[0][i])
        if expected_sum[0][i] != r0:
            print("error: incorrect sum")
            sys.exit(1)
        if expected_diff[0][i] != r1:
            print("error: incorrect difference")
            sys.exit(1)
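# --- Usage sketch (not part of the original snippet): one way the helper
# above might be driven. Assumes a local Triton server with the
# 'simple_string' model loaded; the URL is an assumption.
import tritonclient.grpc as grpcclient

if __name__ == '__main__':
    triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
    simple_string_inference(triton_client)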
def _basic_inference(self,
                     shm_ip0_handle,
                     shm_ip1_handle,
                     shm_op0_handle,
                     shm_op1_handle,
                     error_msg,
                     big_shm_name="",
                     big_shm_size=64):
    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
    input1_data = np.ones(shape=16, dtype=np.int32)
    inputs = []
    outputs = []
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
        inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
        inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

    inputs[0].set_shared_memory("input0_data", 64)

    # If a numpy array (rather than a shared-memory handle) is passed for
    # the second input, send its data inline instead of via shared memory.
    if isinstance(shm_ip1_handle, np.ndarray):
        inputs[1].set_data_from_numpy(input0_data, binary_data=True)
    elif big_shm_name != "":
        inputs[1].set_shared_memory(big_shm_name, big_shm_size)
    else:
        inputs[1].set_shared_memory("input1_data", 64)

    outputs[0].set_shared_memory("output0_data", 64)
    outputs[1].set_shared_memory("output1_data", 64)

    try:
        results = triton_client.infer("simple",
                                      inputs,
                                      model_version="",
                                      outputs=outputs)
        output = results.get_output('OUTPUT0')
        if _protocol == "http":
            output_datatype = output['datatype']
            output_shape = output['shape']
        else:
            output_datatype = output.datatype
            output_shape = output.shape
        output_dtype = utils.triton_to_np_dtype(output_datatype)
        output_data = shm.get_contents_as_numpy(shm_op0_handle, output_dtype,
                                                output_shape)
        self.assertTrue(
            (output_data[0] == (input0_data + input1_data)).all(),
            "Model output does not match expected output")
    except Exception as ex:
        error_msg.append(str(ex))
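# --- Setup sketch (illustrative, not from the original test): one way the
# "input0_data"/"input1_data"/"output0_data"/"output1_data" regions referenced
# above could be created and registered before calling _basic_inference().
# System shared memory, the shm keys, and the 64-byte size (16 x INT32) are
# assumptions that mirror the helper.
import numpy as np
import tritonclient.grpc as grpcclient
import tritonclient.utils.shared_memory as shm

client = grpcclient.InferenceServerClient("localhost:8001")
input0_data = np.arange(start=0, stop=16, dtype=np.int32)
input1_data = np.ones(shape=16, dtype=np.int32)

shm_ip0_handle = shm.create_shared_memory_region("input0_data",
                                                 "/input0_data", 64)
shm_ip1_handle = shm.create_shared_memory_region("input1_data",
                                                 "/input1_data", 64)
shm_op0_handle = shm.create_shared_memory_region("output0_data",
                                                 "/output0_data", 64)
shm_op1_handle = shm.create_shared_memory_region("output1_data",
                                                 "/output1_data", 64)

# Copy the input tensors into their regions.
shm.set_shared_memory_region(shm_ip0_handle, [input0_data])
shm.set_shared_memory_region(shm_ip1_handle, [input1_data])

# Make the regions visible to the server under the names the helper uses.
client.register_system_shared_memory("input0_data", "/input0_data", 64)
client.register_system_shared_memory("input1_data", "/input1_data", 64)
client.register_system_shared_memory("output0_data", "/output0_data", 64)
client.register_system_shared_memory("output1_data", "/output1_data", 64)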
def sync_send(triton_client, result_list, values, batch_size, sequence_id,
              model_name, model_version):

    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(
            grpcclient.InferInput('INPUT', value_data.shape, "INT32"))
        # Initialize the data
        inputs[0].set_data_from_numpy(value_data)
        outputs = []
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT'))
        # Issue the synchronous sequence inference.
        result = triton_client.infer(model_name=model_name,
                                     inputs=inputs,
                                     outputs=outputs,
                                     sequence_id=sequence_id,
                                     sequence_start=(count == 1),
                                     sequence_end=(count == len(values)))
        result_list.append(result.as_numpy('OUTPUT'))
        count = count + 1
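# --- Usage sketch (illustrative): driving sync_send() for one sequence. The
# URL, model name, sequence id, and values are placeholders; the server-side
# sequence batcher ties the requests together via sequence_id.
import tritonclient.grpc as grpcclient

triton_client = grpcclient.InferenceServerClient("localhost:8001")
result_list = []
sync_send(triton_client,
          result_list,
          values=[11, 7, 5, 3, 2, 0, 1],
          batch_size=1,
          sequence_id=1000,
          model_name="simple_sequence",
          model_version="")
print([r.flatten()[0] for r in result_list])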
def get_result(url, model_name, x):
    try:
        triton_client = grpcclient.InferenceServerClient(url=url,
                                                         verbose=False,
                                                         ssl=False)
        print("Channel creation success")
    except Exception as e:
        print("channel creation failed: " + str(e))

    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('input0', x.shape, "FP32"))
    input0_data = x
    print("X Shape : ", x.shape)
    inputs[0].set_data_from_numpy(input0_data)
    outputs.append(grpcclient.InferRequestedOutput('output0'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    output0_data = results.as_numpy('output0')
    output0_data = sigmoid(output0_data.squeeze())
    print(output0_data)
    return output0_data
def requestGenerator(batched_image_data, input_name, output_name, dtype,
                     FLAGS):
    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            grpcclient.InferInput(input_name, batched_image_data.shape,
                                  dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            httpclient.InferInput(input_name, batched_image_data.shape,
                                  dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            grpcclient.InferRequestedOutput(output_name,
                                            class_count=FLAGS.classes))
    else:
        outputs.append(
            httpclient.InferRequestedOutput(output_name,
                                            binary_data=True,
                                            class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
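# --- Usage sketch (illustrative): how a caller might consume the generator
# above. The FLAGS namespace, URL, model name, tensor names, and the dummy
# batch below are assumptions standing in for the argument parsing and
# preprocessing the original script does elsewhere.
from types import SimpleNamespace
import numpy as np
import tritonclient.grpc as grpcclient

FLAGS = SimpleNamespace(protocol="grpc", classes=1,
                        model_name="densenet_onnx", model_version="")
triton_client = grpcclient.InferenceServerClient("localhost:8001")
batched_image_data = np.zeros((1, 3, 224, 224), dtype=np.float32)

for inputs, outputs, model_name, model_version in requestGenerator(
        batched_image_data, "data_0", "fc6_1", "FP32", FLAGS):
    response = triton_client.infer(model_name,
                                   inputs,
                                   model_version=model_version,
                                   outputs=outputs)
    print(response.as_numpy("fc6_1"))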
def identity_inference(triton_client, np_array):
    model_name = "simple_identity"
    inputs = []
    outputs = []

    inputs.append(grpcclient.InferInput('INPUT0', np_array.shape, "BYTES"))
    inputs[0].set_data_from_numpy(np_array)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    if (np_array.dtype == np.object_):
        print(results.as_numpy('OUTPUT0'))
        if not np.array_equal(np_array, results.as_numpy('OUTPUT0')):
            print(results.as_numpy('OUTPUT0'))
            print("error: incorrect output")
            sys.exit(1)
    else:
        encoded_results = np.char.encode(
            results.as_numpy('OUTPUT0').astype(str))
        if not np.array_equal(np_array, encoded_results):
            print(encoded_results)
            print("error: incorrect output")
            sys.exit(1)
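# --- Usage sketch (illustrative): exercises the helper above with both a
# dtype=np.object_ array and a fixed-width np.bytes_ array, which is what the
# two branches distinguish. The URL, shapes, and values are placeholders; the
# 'simple_identity' model config ultimately decides what shapes it accepts.
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:8001")
identity_inference(client, np.array([[b'hello', b'triton']],
                                    dtype=np.object_))
identity_inference(client, np.array([[b'hello', b'triton']],
                                    dtype=np.bytes_))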
def _run_query(
    client,
    n_rows,
    model_name,
    workflow_path,
    data_path,
    actual_output_filename,
    output_name,
    input_cols_name=None,
    backend="tensorflow",
):
    workflow = nvt.Workflow.load(workflow_path)

    if input_cols_name is None:
        batch = cudf.read_csv(
            data_path, nrows=n_rows)[workflow.output_node.input_columns.names]
    else:
        batch = cudf.read_csv(data_path, nrows=n_rows)[input_cols_name]

    input_dtypes = workflow.input_dtypes
    columns = [(col, batch[col]) for col in batch.columns]

    inputs = []
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(input_dtypes[name])
        d = d.reshape(len(d), 1)
        inputs.append(
            grpcclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(input_dtypes[name])))
        inputs[i].set_data_from_numpy(d)

    outputs = [grpcclient.InferRequestedOutput(output_name)]

    time_start = dt.datetime.now()
    response = client.infer(model_name,
                            inputs,
                            request_id="1",
                            outputs=outputs)
    run_time = dt.datetime.now() - time_start

    output_key = "output" if backend == "hugectr" else "0"

    output_actual = cudf.read_csv(os.path.expanduser(actual_output_filename),
                                  nrows=n_rows)
    output_actual = cp.asnumpy(output_actual[output_key].values)
    output_predict = response.as_numpy(output_name)

    if backend == "tensorflow":
        output_predict = output_predict[:, 0]

    diff = abs(output_actual - output_predict)
    return diff, run_time
def setUp(self):
    global _deferred_exceptions
    _deferred_exceptions = []

    # The helper client for setup will be GRPC for simplicity.
    self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001")
    self.model_name_ = 'identity_2_float32'
    self.tensor_shape_ = (1, 1)
    self.inputs_ = {
        "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"),
        "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32")
    }
    self.input_data_ = {
        "INPUT0": np.ones(shape=(1, 1), dtype=np.float32),
        "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32)
    }
    self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"])
    self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"])
    self.outputs_ = {
        "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'),
        "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1')
    }
def test_grpc_out_of_shared_memory(self):
    triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
    inputs = []
    inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8"))
    inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

    # Set up too small CUDA shared memory for outputs, expect query
    # returns default value
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()
    shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 1, 0)
    shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 1, 0)
    triton_client.register_cuda_shared_memory(
        "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1)
    triton_client.register_cuda_shared_memory(
        "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1)

    outputs = []
    outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
    outputs[-1].set_shared_memory("output0_data", 1)
    outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
    outputs[-1].set_shared_memory("output1_data", 1)

    try:
        triton_client.infer(model_name="query",
                            inputs=inputs,
                            outputs=outputs)
        self.assertTrue(False, "expect error with query information")
    except InferenceServerException as ex:
        self.assertTrue("OUTPUT0 CPU 0" in ex.message())
        self.assertTrue("OUTPUT1 CPU 0" in ex.message())

    cudashm.destroy_shared_memory_region(shm_op0_handle)
    cudashm.destroy_shared_memory_region(shm_op1_handle)
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()
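# --- Illustrative counterpart (not in the original test): registering a CUDA
# shared-memory output region that is actually large enough for the tensor it
# will receive. The URL, region name, tensor shape, and output name are
# assumptions.
import numpy as np
import tritonclient.grpc as tritongrpcclient
import tritonclient.utils.cuda_shared_memory as cudashm

client = tritongrpcclient.InferenceServerClient("localhost:8001")

# Size the region from the expected output tensor instead of hard-coding 1.
output_byte_size = np.zeros((1, 16), dtype=np.float32).nbytes  # 64 bytes
shm_handle = cudashm.create_shared_memory_region("output0_data",
                                                 output_byte_size, 0)
client.register_cuda_shared_memory("output0_data",
                                   cudashm.get_raw_handle(shm_handle), 0,
                                   output_byte_size)

output = tritongrpcclient.InferRequestedOutput('OUTPUT0')
output.set_shared_memory("output0_data", output_byte_size)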
def _prepare_request(self, protocol):
    if (protocol == "grpc"):
        self.inputs_ = []
        self.inputs_.append(grpcclient.InferInput('INPUT0', [1, 1], "INT32"))
        self.outputs_ = []
        self.outputs_.append(grpcclient.InferRequestedOutput('OUTPUT0'))
    else:
        self.inputs_ = []
        self.inputs_.append(httpclient.InferInput('INPUT0', [1, 1], "INT32"))
        self.outputs_ = []
        self.outputs_.append(httpclient.InferRequestedOutput('OUTPUT0'))

    self.inputs_[0].set_data_from_numpy(self.input0_data_)
def run(self, client_metadata):
    triton_client = client_metadata[0]

    inputs = [
        grpcclient.InferInput("input", self.image_data_.shape, "FP32")
    ]
    inputs[0].set_data_from_numpy(self.image_data_)

    outputs = [
        grpcclient.InferRequestedOutput("resnet_v1_50/predictions/Softmax",
                                        class_count=1)
    ]
    res = triton_client.infer(self.model_name_, inputs, outputs=outputs)
    self.postprocess(res)
    return self.batch_size_
def requestGenerator(batched_image_data,
                     input_name,
                     output_name,
                     dtype,
                     model_name,
                     model_version,
                     classes=1):
    # Set the input data
    inputs = []
    inputs.append(
        grpcclient.InferInput(input_name, batched_image_data.shape, dtype))
    inputs[0].set_data_from_numpy(batched_image_data)

    outputs = []
    outputs.append(
        grpcclient.InferRequestedOutput(output_name, class_count=classes))

    yield inputs, outputs, model_name, model_version
def test_infer(self, data, it):
    assert (len(data) == len(self.input_names))
    if (len(data) > 1):
        for b in data:
            assert b.shape[0] == data[0].shape[0]

    inputs = [
        self._get_input(batch, name)
        for batch, name in zip(data, self.input_names)
    ]
    outputs = [
        t_client.InferRequestedOutput(name) for name in self.output_names
    ]

    res = self.client.infer(model_name=self.model_name,
                            inputs=inputs,
                            outputs=outputs)
    res_data = [res.as_numpy(name) for name in self.output_names]
    return it, data, res_data
def get_embedding(self, face_img):
    if not isinstance(face_img, list):
        face_img = [face_img]
    face_img = np.stack(face_img)

    input_size = tuple(face_img[0].shape[0:2][::-1])
    blob = cv2.dnn.blobFromImages(
        face_img,
        1.0 / self.input_std,
        input_size,
        (self.input_mean, self.input_mean, self.input_mean),
        swapRB=True)
    blob = blob.astype(triton_to_np_dtype(self.dtype))

    inputs = []
    inputs.append(
        grpcclient.InferInput(self.input_name,
                              [blob.shape[0], self.c, self.h, self.w],
                              "FP32"))
    # inputs[0].set_data_from_numpy(face_img)
    cudashm.set_shared_memory_region(self.in_handle, [blob])
    input_bytesize = 12 * blob.shape[0] * self.w * self.h
    inputs[-1].set_shared_memory(self.in_handle_name, input_bytesize)

    outputs = []
    out_bytesize = 12 * 512 * self.max_batch_size
    outputs.append(grpcclient.InferRequestedOutput(self.output_name[0]))
    outputs[-1].set_shared_memory(self.out_handle_name, out_bytesize)

    out = self.triton_client.infer(self.model_name,
                                   inputs,
                                   model_version=self.model_version,
                                   outputs=outputs)
    out = [
        cudashm.get_contents_as_numpy(self.out_handle,
                                      triton_to_np_dtype(self.dtype),
                                      [blob.shape[0], 512])
    ]
    # out = [out.as_numpy(e) for e in self.output_name]
    return out[0]
def run(self, input):
    inputs = []
    outputs = [
        grpcclient.InferRequestedOutput(e) for e in self.output_order
    ]

    inputs.append(
        grpcclient.InferInput(self.input_name, [1, self.c, self.h, self.w],
                              self.dtype))
    # inputs[0].set_data_from_numpy(input)
    cudashm.set_shared_memory_region(self.in_handle, [input])
    inputs[-1].set_shared_memory(self.in_handle_name, self.input_bytesize)

    out = self.triton_client.infer(self.model_name,
                                   inputs,
                                   model_version=self.model_version,
                                   outputs=outputs)
    out = [out.as_numpy(e) for e in self.output_order]
    return out
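# --- Setup sketch (illustrative, hypothetical helper): one way the
# shared-memory members used by run() above could be prepared. The URL,
# region name, byte size, and the assumption of an FP32 input are all
# placeholders for whatever this class really does in its constructor.
import tritonclient.grpc as grpcclient
import tritonclient.utils.cuda_shared_memory as cudashm

def _setup_input_shared_memory(self):
    self.triton_client = grpcclient.InferenceServerClient("localhost:8001")
    self.in_handle_name = "input_data"
    # 4 bytes per FP32 element for a [1, c, h, w] tensor (assumed dtype).
    self.input_bytesize = 4 * self.c * self.h * self.w
    self.in_handle = cudashm.create_shared_memory_region(
        self.in_handle_name, self.input_bytesize, 0)
    self.triton_client.register_cuda_shared_memory(
        self.in_handle_name, cudashm.get_raw_handle(self.in_handle), 0,
        self.input_bytesize)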
def requestGenerator(input_name, input_data, output_name, dtype, protocol):
    # Set the input data
    inputs = []
    if protocol.lower() == "grpc":
        inputs.append(
            grpcclient.InferInput(input_name, input_data.shape, dtype))
        inputs[0].set_data_from_numpy(input_data)
    else:
        inputs.append(
            httpclient.InferInput(input_name, input_data.shape, dtype))
        inputs[0].set_data_from_numpy(input_data, binary_data=True)

    outputs = []
    if protocol.lower() == "grpc":
        outputs.append(grpcclient.InferRequestedOutput(output_name))
    else:
        outputs.append(
            httpclient.InferRequestedOutput(output_name, binary_data=True))

    return inputs, outputs
def test_nvt_hugectr_inference(n_rows, err_tol):
    warnings.simplefilter("ignore")

    model_name = "test_model_ens"
    col_names = ["userId", "movieId", "new_cat1"]
    # read in a batch of data to get transforms for
    batch = cudf.read_csv(DATA_DIR + "test/data.csv", nrows=n_rows)[col_names]

    # convert the batch to Triton inputs
    columns = [(col, batch[col]) for col in col_names]
    inputs = []

    col_dtypes = [np.int64, np.int64, np.int64]
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(col_dtypes[i])
        d = d.reshape(len(d), 1)
        inputs.append(
            httpclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(col_dtypes[i])))
        inputs[i].set_data_from_numpy(d)

    # placeholder variables for the output
    outputs = []
    outputs.append(httpclient.InferRequestedOutput("OUTPUT0"))

    # make the request
    # NOTE: 8001 is Triton's default gRPC port; the HTTP client normally
    # targets the HTTP endpoint on 8000.
    with httpclient.InferenceServerClient("localhost:8001") as client:
        response = client.infer(model_name,
                                inputs,
                                request_id=str(1),
                                outputs=outputs)

    output_actual = cudf.read_csv(DATA_DIR + "test/output.csv", nrows=n_rows)
    output_actual = cp.asnumpy(output_actual["output"].values)
    output_predict = response.as_numpy("OUTPUT0")

    diff = abs(output_actual - output_predict)
    assert (diff < err_tol).all()
def inputs_outputs_generator(self, raw_inputs):
    """
    Generate input and output blobs for triton client inference

    :param raw_inputs: list of raw numpy inputs
    :return: inputs, outputs data
    """
    inputs = []
    for input_specs, raw_input in zip(self.inputs_specs, raw_inputs):
        # parse data type
        raw_input = raw_input.astype(
            triton_to_np_dtype(input_specs.datatype))
        infer_input = grpcclient.InferInput(input_specs.name, raw_input.shape,
                                            input_specs.datatype)
        infer_input.set_data_from_numpy(raw_input)
        inputs.append(infer_input)

    outputs = []
    for output_specs in self.outputs_specs:
        outputs.append(
            grpcclient.InferRequestedOutput(output_specs.name, class_count=0))

    return inputs, outputs
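# --- Usage sketch (illustrative): feeding the generated inputs/outputs to a
# client. The wrapper object `runner`, its `client` and `model_name`
# attributes, and the input shape are assumptions; only `inputs_specs` /
# `outputs_specs` appear in the original method.
import numpy as np

raw_inputs = [np.random.rand(1, 3, 224, 224).astype(np.float32)]
inputs, outputs = runner.inputs_outputs_generator(raw_inputs)
response = runner.client.infer(model_name=runner.model_name,
                               inputs=inputs,
                               outputs=outputs)
for spec in runner.outputs_specs:
    print(spec.name, response.as_numpy(spec.name).shape)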
def infer(self, input_img, obj, confidence):
    out = "crop/" + input_img.split("/")[1]

    # IMAGE MODE
    # print("Running in 'image' mode")
    if not input_img:
        print("FAILED: no input image")
        sys.exit(1)

    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('data', [1, 3, 128, 64], "FP32"))
    outputs.append(grpcclient.InferRequestedOutput('prob'))

    # print("Creating buffer from image file...")
    input_image = cv2.imread(input_img)
    if input_image is None:
        print(f"FAILED: could not load input image {str(input_img)}")
        sys.exit(1)

    h = int(input_image.shape[0])
    w = int(input_image.shape[1])
    input_image = input_image[int(h * obj[3]):int(h * obj[4]),
                              int(w * obj[1]):int(w * obj[2])]
    print(obj)
    cv2.imwrite(out, input_image)

    input_image_buffer = preprocess_suit_img(input_image, 128, 64)
    input_image_buffer = np.expand_dims(input_image_buffer, axis=0)
    inputs[0].set_data_from_numpy(input_image_buffer)

    # print("Invoking inference...")
    results = self.triton_client.infer(model_name=self.model,
                                       inputs=inputs,
                                       outputs=outputs,
                                       client_timeout=self.client_timeout)
    result = results.as_numpy('prob')
    # print(result[0])
    return np.argmax(result[0])
async def request(self, model_name, clientName, x):
    result = list()
    print(x.shape)

    inputs = [grpcclient.InferInput('input0', x.shape, "FP32")]
    outputs = [grpcclient.InferRequestedOutput('output0')]
    inputs[0].set_data_from_numpy(x)

    if self.DEBUG:
        stime = time.time()

    results = await self.loop.run_in_executor(
        None,
        lambda: self.tritonclient[clientName].infer(model_name=model_name,
                                                    inputs=inputs,
                                                    outputs=outputs))

    if self.DEBUG:
        distime = time.time() - stime
        self.Elapsed[model_name] = distime
        print("[Request] Elapsed (After Request) {} {:.3f} second".format(
            model_name, distime))

    output0_data = results.as_numpy('output0')
    output0_data = sigmoid(output0_data.squeeze())
    return output0_data
def __init__(self, url="localhost:8001", model_name="yolov5", model_version="", verbose=False) -> None: self.triton_client = grpcclient.InferenceServerClient(url=url, verbose=verbose) # To make sure no shared memory regions are registered with the server. self.triton_client.unregister_system_shared_memory() self.triton_client.unregister_cuda_shared_memory() # Yolo model takes 1 input tensors dims [1, 3, 640, 640] # each and returns 4 output tensors # dims: [1,25200,6] # dims: [1,3,80,80,6] # dims: [1,3,40,40,6] # dims: [1,3,20,20,6] self.model_name = model_name self.model_version = model_version # Create the data for the input and 4 outputs tensors input_images = np.zeros((1, 3, 640, 640), dtype=np.float32) output = np.zeros((1, 25200, 6), dtype=np.float32) output_397 = np.zeros((1, 3, 80, 80, 6), dtype=np.float32) output_458 = np.zeros((1, 3, 40, 40, 6), dtype=np.float32) output_519 = np.zeros((1, 3, 20, 20, 6), dtype=np.float32) # Calc input/output tensors sizes input_images_byte_size = input_images.size * input_images.itemsize output_byte_size = output.size * output.itemsize output_397_byte_size = output_397.size * output_397.itemsize output_458_byte_size = output_458.size * output_458.itemsize output_519_byte_size = output_519.size * output_519.itemsize # Create outputs in Shared Memory and store shared memory handles self.output_handle = shm.create_shared_memory_region( "output", "/output", output_byte_size) self.output_397_handle = shm.create_shared_memory_region( "output_397", "/output_397", output_397_byte_size) self.output_458_handle = shm.create_shared_memory_region( "output_458", "/output_458", output_458_byte_size) self.output_519_handle = shm.create_shared_memory_region( "output_519", "/output_519", output_519_byte_size) # Register outputs shared memory with Triton Server self.triton_client.register_system_shared_memory( "output", "/output", output_byte_size) self.triton_client.register_system_shared_memory( "output_397", "/output_397", output_397_byte_size) self.triton_client.register_system_shared_memory( "output_458", "/output_458", output_458_byte_size) self.triton_client.register_system_shared_memory( "output_519", "/output_519", output_519_byte_size) # Create inputs in Shared Memory and store shared memory handles self.input_images_handle = shm.create_shared_memory_region( "images", "/images", input_images_byte_size) # Register inputs shared memory with Triton Server self.triton_client.register_system_shared_memory( "images", "/images", input_images_byte_size) # Set the parameters to use data from shared memory self.inputs = [] self.inputs.append( grpcclient.InferInput('images', [1, 3, 640, 640], "FP32")) self.inputs[-1].set_shared_memory("images", input_images_byte_size) self.outputs = [] self.outputs.append(grpcclient.InferRequestedOutput('output')) self.outputs[-1].set_shared_memory("output", output_byte_size) self.predict(input_images)
"/input0_simple", input0_byte_size) triton_client.register_system_shared_memory("input1_data", "/input1_simple", input1_byte_size) # Set the parameters to use data from shared memory inputs = [] inputs.append(grpcclient.InferInput('INPUT0', [1, 16], "BYTES")) inputs[-1].set_shared_memory("input0_data", input0_byte_size) inputs.append(grpcclient.InferInput('INPUT1', [1, 16], "BYTES")) inputs[-1].set_shared_memory("input1_data", input1_byte_size) outputs = [] outputs.append(grpcclient.InferRequestedOutput('OUTPUT0')) outputs[-1].set_shared_memory("output0_data", output0_byte_size) outputs.append(grpcclient.InferRequestedOutput('OUTPUT1')) outputs[-1].set_shared_memory("output1_data", output1_byte_size) results = triton_client.infer(model_name=model_name, inputs=inputs, outputs=outputs) # Read results from the shared memory. output0 = results.get_output("OUTPUT0") print(utils.triton_to_np_dtype(output0.datatype)) if output0 is not None: output0_data = shm.get_contents_as_numpy( shm_op0_handle, utils.triton_to_np_dtype(output0.datatype),
def infer_exact(tester,
                pf,
                tensor_shape,
                batch_size,
                input_dtype,
                output0_dtype,
                output1_dtype,
                output0_raw=True,
                output1_raw=True,
                model_version=None,
                swap=False,
                outputs=("OUTPUT0", "OUTPUT1"),
                use_http=True,
                use_grpc=True,
                use_http_json_tensors=True,
                skip_request_id_check=False,
                use_streaming=True,
                correlation_id=0,
                shm_region_names=None,
                precreated_shm_regions=None,
                use_system_shared_memory=False,
                use_cuda_shared_memory=False,
                priority=0,
                timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)

    # configs [ url, protocol, async stream, binary data ]
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if output0_raw == output1_raw:
            # Float16 not supported for Input and Output via JSON
            if use_http_json_tensors and (input_dtype != np.float16) and \
                    (output0_dtype != np.float16) and (output1_dtype != np.float16):
                configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))

    # outputs are sum and difference of inputs so set max input
    # values so that they will not overflow the output. This
    # allows us to do an exact match. For float types use 8, 16,
    # 32 int range for fp 16, 32, 64 respectively. When getting
    # class outputs the result value/probability is returned as a
    # float so must use fp32 range in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(
        np.iinfo(rinput_dtype).min,
        np.iinfo(routput0_dtype).min,
        np.iinfo(routput1_dtype).min) / 2
    val_max = min(
        np.iinfo(rinput_dtype).max,
        np.iinfo(routput0_dtype).max,
        np.iinfo(routput1_dtype).max) / 2

    num_classes = 3

    input0_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    input1_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    if input_dtype != np.object:
        input0_array = input0_array.astype(input_dtype)
        input1_array = input1_array.astype(input_dtype)

    if not swap:
        output0_array = input0_array + input1_array
        output1_array = input0_array - input1_array
    else:
        output0_array = input0_array - input1_array
        output1_array = input0_array + input1_array

    if output0_dtype == np.object:
        output0_array = np.array([
            unicode(str(x), encoding='utf-8')
            for x in (output0_array.flatten())
        ], dtype=object).reshape(output0_array.shape)
    else:
        output0_array = output0_array.astype(output0_dtype)
    if output1_dtype == np.object:
        output1_array = np.array([
            unicode(str(x), encoding='utf-8')
            for x in (output1_array.flatten())
        ], dtype=object).reshape(output1_array.shape)
    else:
        output1_array = output1_array.astype(output1_dtype)

    if input_dtype == np.object:
        in0n = np.array(
            [str(x) for x in input0_array.reshape(input0_array.size)],
            dtype=object)
        input0_array = in0n.reshape(input0_array.shape)
        in1n = np.array(
            [str(x) for x in input1_array.reshape(input1_array.size)],
            dtype=object)
        input1_array = in1n.reshape(input1_array.shape)

    # prepend size of string to output string data
    if output0_dtype == np.object:
        if batch_size == 1:
            output0_array_tmp = serialize_byte_tensor_list([output0_array])
        else:
            output0_array_tmp = serialize_byte_tensor_list(output0_array)
    else:
        output0_array_tmp = output0_array

    if output1_dtype == np.object:
        if batch_size == 1:
            output1_array_tmp = serialize_byte_tensor_list([output1_array])
        else:
            output1_array_tmp = serialize_byte_tensor_list(output1_array)
    else:
        output1_array_tmp = output1_array

    # Get model platform
    model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                   output1_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    if platform == "pytorch_libtorch":
        OUTPUT0 = "OUTPUT__0"
        OUTPUT1 = "OUTPUT__1"
        INPUT0 = "INPUT__0"
        INPUT1 = "INPUT__1"
    else:
        OUTPUT0 = "OUTPUT0"
        OUTPUT1 = "OUTPUT1"
        INPUT0 = "INPUT0"
        INPUT1 = "INPUT1"

    output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp])
    output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp])

    if batch_size == 1:
        input0_list = [input0_array]
        input1_list = [input1_array]
    else:
        input0_list = [x for x in input0_array]
        input1_list = [x for x in input1_array]

    # Serialization of string tensors in the case of shared memory must be done manually
    if input_dtype == np.object:
        input0_list_tmp = serialize_byte_tensor_list(input0_list)
        input1_list_tmp = serialize_byte_tensor_list(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    # Create system/cuda shared memory regions if needed
    shm_regions, shm_handles = su.create_set_shm_regions(
        input0_list_tmp, input1_list_tmp, output0_byte_size,
        output1_byte_size, outputs, shm_region_names, precreated_shm_regions,
        use_system_shared_memory, use_cuda_shared_memory)

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        if config[1] == "http":
            inputs.append(
                httpclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                httpclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
        else:
            inputs.append(
                grpcclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                grpcclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))

        if not (use_cuda_shared_memory or use_system_shared_memory):
            if config[1] == "http":
                inputs[0].set_data_from_numpy(input0_array,
                                              binary_data=config[3])
                inputs[1].set_data_from_numpy(input1_array,
                                              binary_data=config[3])
            else:
                inputs[0].set_data_from_numpy(input0_array)
                inputs[1].set_data_from_numpy(input1_array)
        else:
            # Register necessary shared memory regions/handles
            su.register_add_shm_regions(inputs, outputs, shm_regions,
                                        precreated_shm_regions, shm_handles,
                                        input0_byte_size, input1_byte_size,
                                        output0_byte_size, output1_byte_size,
                                        use_system_shared_memory,
                                        use_cuda_shared_memory, triton_client)

        if batch_size == 1:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape((1,) + tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape((1,) + tensor_shape)
            ]
        else:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape(tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape(tensor_shape)
            ]

        # Force binary_data = False for shared memory and class
        output_req = []
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(
                            OUTPUT0, binary_data=config[3]))
                else:
                    output_req.append(
                        grpcclient.InferRequestedOutput(OUTPUT0))
                output_req[-1].set_shared_memory(shm_regions[2] + '_data',
                                                 output0_byte_size)
            else:
                if output0_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT0))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT0, class_count=num_classes))
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(
                            OUTPUT1, binary_data=config[3]))
                else:
                    output_req.append(
                        grpcclient.InferRequestedOutput(OUTPUT1))
                output_req[-1].set_shared_memory(shm_regions[2 + i] + '_data',
                                                 output1_byte_size)
            else:
                if output1_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT1))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT1, class_count=num_classes))

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback,
                                               user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()))
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(
                model_name,
                inputs,
                model_version=model_version,
                outputs=output_req,
                request_id=str(_unique_request_id()))

        last_response = results.get_response()

        if not skip_request_id_check:
            global _seen_request_ids
            if config[1] == "http":
                request_id = int(last_response["id"])
            else:
                request_id = int(last_response.id)
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(str(response_model_version), model_version)

        tester.assertEqual(len(response_outputs), len(outputs))

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            if ((result_name == OUTPUT0 and output0_raw) or
                    (result_name == OUTPUT1 and output1_raw)):
                if use_system_shared_memory or use_cuda_shared_memory:
                    if result_name == OUTPUT0:
                        shm_handle = shm_handles[2]
                    else:
                        shm_handle = shm_handles[3]

                    output = results.get_output(result_name)
                    if config[1] == "http":
                        output_datatype = output['datatype']
                        output_shape = output['shape']
                    else:
                        output_datatype = output.datatype
                        output_shape = output.shape
                    output_dtype = triton_to_np_dtype(output_datatype)
                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = results.as_numpy(result_name)

                if (output_data.dtype == np.object) and (config[3] == False):
                    output_data = output_data.astype(np.bytes_)

                if result_name == OUTPUT0:
                    tester.assertTrue(
                        np.array_equal(output_data, output0_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT0, output0_array, output_data))
                elif result_name == OUTPUT1:
                    tester.assertTrue(
                        np.array_equal(output_data, output1_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT1, output1_array, output_data))
                else:
                    tester.assertTrue(
                        False, "unexpected raw result {}".format(result_name))
            else:
                for b in range(batch_size):
                    # num_classes values must be returned and must
                    # match expected top values
                    if "nobatch" in pf:
                        class_list = results.as_numpy(result_name)
                    else:
                        class_list = results.as_numpy(result_name)[b]

                    tester.assertEqual(len(class_list), num_classes)
                    if batch_size == 1:
                        expected0_flatten = output0_array.flatten()
                        expected1_flatten = output1_array.flatten()
                    else:
                        expected0_flatten = output0_array[b].flatten()
                        expected1_flatten = output1_array[b].flatten()

                    for idx, class_label in enumerate(class_list):
                        # can't compare indices since could have different
                        # indices with the same value/prob, so check that
                        # the value of each index equals the expected value.
                        # Only compare labels when the indices are equal.
                        if type(class_label) == str:
                            ctuple = class_label.split(':')
                        else:
                            ctuple = "".join(
                                chr(x) for x in class_label).split(':')
                        cval = float(ctuple[0])
                        cidx = int(ctuple[1])
                        if result_name == OUTPUT0:
                            tester.assertEqual(cval, expected0_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if cidx == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2], 'label{}'.format(
                                        expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(cval, expected1_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(
                                    result_name))

    # Unregister system/cuda shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                      precreated_shm_regions, outputs,
                                      use_system_shared_memory,
                                      use_cuda_shared_memory)

    return results
def infer_zero(tester,
               pf,
               batch_size,
               tensor_dtype,
               input_shapes,
               output_shapes,
               model_version=None,
               use_http=True,
               use_grpc=True,
               use_http_json_tensors=True,
               use_streaming=True,
               shm_region_name_prefix=None,
               use_system_shared_memory=False,
               use_cuda_shared_memory=False,
               priority=0,
               timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if use_http_json_tensors and (tensor_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

    # Get model platform
    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    for io_num in range(io_cnt):
        if platform == "pytorch_libtorch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool):
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape,
                                            dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != np.object:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            expected_array = np.array([
                unicode(str(x), encoding='utf-8')
                for x in input_array.flatten()
            ], dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # Serialization of string tensors in the case of shared memory must be done manually
        if tensor_dtype == np.object:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region(
            [
                shm_region_name_prefix[0] + str(io_num),
                shm_region_name_prefix[1] + str(io_num)
            ], input_list_tmp, input_byte_size, output_byte_size,
            use_system_shared_memory, use_cuda_shared_memory)

        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])
        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(
                zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(
                    httpclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    httpclient.InferRequestedOutput(output_name,
                                                    binary_data=config[3]))
            else:
                inputs.append(
                    grpcclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data,
                                                   binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(
                    inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num, input_byte_size,
                    output_byte_size, use_system_shared_memory,
                    use_cuda_shared_memory, triton_client)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback,
                                               user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()),
                    priority=priority,
                    timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(
                model_name,
                inputs,
                model_version=model_version,
                outputs=output_req,
                request_id=str(_unique_request_id()),
                priority=priority,
                timeout=timeout_us)

        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(response_model_version, model_version)

        tester.assertEqual(len(response_outputs), io_cnt)

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                if platform == "pytorch_libtorch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])

                shm_handle = shm_op_handles[io_num]
                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)
            if use_system_shared_memory:
                output_data = shm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            elif use_cuda_shared_memory:
                output_data = cudashm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            if (output_data.dtype == np.object) and (config[3] == False):
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(
                np.array_equal(output_data, expected),
                "{}, {}, expected: {}, got {}".format(model_name, result_name,
                                                      expected, output_data))

    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
    print(config)
except InferenceServerException as ex:
    print("FAILED : get_model_config")
    print("Got: {}".format(ex.message()))
    sys.exit(1)

# DUMMY MODE
if FLAGS.mode == 'dummy':
    print("Running in 'dummy' mode")
    print("Creating empty buffer filled with ones...")
    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('data', [1, 3, 608, 608], "FP32"))
    inputs[0].set_data_from_numpy(
        np.ones(shape=(1, 3, 608, 608), dtype=np.float32))
    outputs.append(grpcclient.InferRequestedOutput('prob'))

    print("Invoking inference...")
    results = triton_client.infer(model_name=FLAGS.model,
                                  inputs=inputs,
                                  outputs=outputs,
                                  client_timeout=FLAGS.client_timeout)
    if FLAGS.model_info:
        statistics = triton_client.get_inference_statistics(
            model_name=FLAGS.model)
        if len(statistics.model_stats) != 1:
            print("FAILED: get_inference_statistics")
            sys.exit(1)
        print(statistics)

    print("Done")
# pre-processing
img = Image.open(DATA).convert('L')
img = img.resize(INPUT_SHAPE)
imgArr = np.asarray(img) / 255
imgArr = np.expand_dims(imgArr[:, :, np.newaxis], 0)
imgArr = imgArr.astype(triton_to_np_dtype('FP32'))

# Client-Server GRPC
print("Using GRPC ... ")
triton_client = grpcclient.InferenceServerClient(url=URL_GRPC, verbose=0)
inputs = []
inputs.append(grpcclient.InferInput('flatten_1_input', imgArr.shape, 'FP32'))
inputs[0].set_data_from_numpy(imgArr)
outputs = []
outputs.append(grpcclient.InferRequestedOutput('dense_3', class_count=0))
responses = []
responses.append(
    triton_client.infer(MODEL,
                        inputs,
                        request_id=str(1),
                        model_version=MODEL_VER,
                        outputs=outputs))

# post-processing
print(np.argmax(responses[0].as_numpy('dense_3')[0]))
# TODO: Add in return of human-readable label

# Client-Server HTTP
print("Using HTTP ... ")
triton_client = httpclient.InferenceServerClient(url=URL_HTTP, verbose=0)
inputs = []
inputs.append(httpclient.InferInput('flatten_1_input', imgArr.shape, 'FP32'))
    end_idx = (batch + 1) * (args.batch_size)

    # Convert the batch to Triton inputs
    current_batch = all_batches[start_idx:end_idx]
    columns = [(col, current_batch[col]) for col in col_names]
    inputs = []

    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(col_dtypes[i])
        d = d.reshape(len(d), 1)
        inputs.append(
            grpcclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(col_dtypes[i])))
        inputs[i].set_data_from_numpy(d)

    outputs = []
    outputs.append(grpcclient.InferRequestedOutput("OUTPUT0"))

    response = client.infer(args.model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    results.extend(response.as_numpy("OUTPUT0"))

    publish_batch(args.project_id, args.topic_id, current_batch,
                  response.as_numpy("OUTPUT0"))

logging.info(
    f"ROC AUC Score: {metrics.roc_auc_score(all_batches[LABEL_COLUMNS].values.tolist(), results)}"
)
def infer(self, input_img, triton_client, confidence=0.5):
    out = "output/" + input_img.split("/")[1]

    # IMAGE MODE
    # print("Running in 'image' mode")
    if not input_img:
        print("FAILED: no input image")
        sys.exit(1)

    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('data', [1, 3, 640, 640], "FP32"))
    outputs.append(grpcclient.InferRequestedOutput('prob'))

    # print("Creating buffer from image file...")
    input_image = cv2.imread(input_img)
    if input_image is None:
        print(f"FAILED: could not load input image {str(input_img)}")
        sys.exit(1)

    input_image_buffer, dw, dh, padding_w, padding_h = preprocess(input_image)
    input_image_buffer = np.expand_dims(input_image_buffer, axis=0)
    inputs[0].set_data_from_numpy(input_image_buffer)

    # print("Invoking inference...")
    results = self.triton_client.infer(model_name=self.model,
                                       inputs=inputs,
                                       outputs=outputs,
                                       client_timeout=self.client_timeout)
    if self.model_info:
        statistics = self.triton_client.get_inference_statistics(
            model_name=self.model)
        if len(statistics.model_stats) != 1:
            print("FAILED: get_inference_statistics")
            sys.exit(1)
        # print(statistics)
    # print("load model done")

    result = results.as_numpy('prob')
    # print(f"Received result buffer of size {result.shape}")
    # print(f"Naive buffer sum: {np.sum(result)}")

    detected_objects = postprocess(result, input_image.shape[1],
                                   input_image.shape[0], dw, dh, padding_w,
                                   padding_h, confidence, self.nms)
    print(f"Raw boxes: {int(result[0, 0, 0, 0])}")
    print(f"Detected objects: {len(detected_objects)}")

    return_info = []
    for box in detected_objects:
        return_info.append([box.classID, box.u1, box.u2, box.v1, box.v2])
        print(f"{COCOLabels(box.classID).name}: {box.confidence}")
        input_image = render_box(
            input_image,
            box.box(),
            color=tuple(RAND_COLORS[box.classID % 64].tolist()))
        size = get_text_size(
            input_image,
            f"{COCOLabels(box.classID).name}: {box.confidence:.2f}",
            normalised_scaling=0.6)
        input_image = render_filled_box(
            input_image,
            (box.x1 - 3, box.y1 - 3, box.x1 + size[0], box.y1 + size[1]),
            color=(220, 220, 220))
        input_image = render_text(
            input_image,
            f"{COCOLabels(box.classID).name}: {box.confidence:.2f}",
            (box.x1, box.y1),
            color=(30, 30, 30),
            normalised_scaling=0.5)

    if out:
        cv2.imwrite(out, input_image)
        print(f"Saved result to {out}")
    else:
        cv2.imshow('image', input_image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return return_info
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            httpclient.InferInput(input_name, batched_image_data.shape,
                                  "BYTES"))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    output_names = [
        output.name if FLAGS.protocol.lower() == "grpc" else output['name']
        for output in output_metadata
    ]

    outputs = []
    for output_name in output_names:
        if FLAGS.protocol.lower() == "grpc":
            outputs.append(
                grpcclient.InferRequestedOutput(output_name,
                                                class_count=FLAGS.classes))
        else:
            outputs.append(
                httpclient.InferRequestedOutput(output_name,
                                                binary_data=True,
                                                class_count=FLAGS.classes))

    # Send request
    result = triton_client.infer(model_name, inputs, outputs=outputs)

    postprocess(result, output_names, input_filenames, batch_size)

print("PASS")