def test_batch_request_for_batching_model(self):
    input_size = 16

    # graphdef_int32_int8_int8 has a batching version with a max batch
    # size of 8. Requests that include the batch dimension in the shape
    # should therefore be accepted.
    tensor_shape = (1, input_size)
    for protocol in ["http", "grpc"]:
        model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
        in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
        in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)

        inputs = []
        outputs = []
        if protocol == "http":
            triton_client = tritonhttpclient.InferenceServerClient(
                url='localhost:8000', verbose=True)
            inputs.append(
                tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(
                tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
        else:
            triton_client = tritongrpcclient.InferenceServerClient(
                url='localhost:8001', verbose=True)
            inputs.append(
                tritongrpcclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(
                tritongrpcclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))

        # Initialize the data
        inputs[0].set_data_from_numpy(in0)
        inputs[1].set_data_from_numpy(in1)

        results = triton_client.infer(model_name, inputs, outputs=outputs)
def requestGenerator(batched_image_data, input_name, output_name, dtype, FLAGS):
    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            tritongrpcclient.InferInput(input_name, batched_image_data.shape,
                                        dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            tritonhttpclient.InferInput(input_name, batched_image_data.shape,
                                        dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            tritongrpcclient.InferRequestedOutput(output_name,
                                                  class_count=FLAGS.classes))
    else:
        outputs.append(
            tritonhttpclient.InferRequestedOutput(output_name,
                                                  binary_data=True,
                                                  class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
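# Hedged usage sketch (not part of the original client): drive the
# requestGenerator above once over HTTP and issue the request. The FLAGS
# namespace, dummy image batch, model/tensor names, and server URL are all
# hypothetical; the surrounding module is assumed to import numpy as np and
# tritonclient.http as tritonhttpclient, as the generator itself does.
def example_request_generator_usage():
    from argparse import Namespace

    FLAGS = Namespace(protocol="http", model_name="my_classifier",
                      model_version="", classes=3, batch_size=1)
    batched_image_data = np.zeros((FLAGS.batch_size, 3, 224, 224), np.float32)
    triton_client = tritonhttpclient.InferenceServerClient(url="localhost:8000")
    for inputs, outputs, model_name, model_version in requestGenerator(
            batched_image_data, "INPUT", "OUTPUT", "FP32", FLAGS):
        result = triton_client.infer(model_name, inputs, outputs=outputs,
                                     model_version=model_version)
    return result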
def test_ragged_input(self):
    model_name = "ragged_acc_shape"
    output_name = 'RAGGED_OUTPUT'
    outputs = [tritonhttpclient.InferRequestedOutput(output_name)]
    async_requests = []
    try:
        for inputs in self.inputs:
            # Asynchronous inference call.
            async_requests.append(
                self.client.async_infer(model_name=model_name,
                                        inputs=inputs,
                                        outputs=outputs))

        value_lists = [[v] * v for v in [2, 4, 1, 3]]
        expected_value = []
        for value_list in value_lists:
            expected_value += value_list
        expected_value = np.asarray([expected_value], dtype=np.float32)

        for idx in range(len(async_requests)):
            # Get the result from the initiated asynchronous inference request.
            # Note the call will block till the server responds.
            result = async_requests[idx].get_result()

            # Validate the results by comparing with precomputed values.
            output_data = result.as_numpy(output_name)
            self.assertTrue(
                np.array_equal(output_data, expected_value),
                "Expect response {} to have value {}, got {}".format(
                    idx, expected_value, output_data))
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
def test_tf_unicode_bytes(self):
    # We use a simple model that takes an input tensor of 8 byte strings
    # and returns an output tensor of 8 strings. The output tensor is the
    # same as the input tensor.
    model_name = "graphdef_nobatch_zero_1_object"
    model_version = ""

    # Create the inference server client for the model.
    triton_client = tritonhttpclient.InferenceServerClient("localhost:8000",
                                                           verbose=True)

    # Create the data for the input tensor. Initialize the tensor to 8
    # byte strings (dtype of np.bytes_).
    # Sample strings that should no longer cause failure.
    in0 = np.array([
        [b'\nF\n\'\n\x01a\x12"\x1a \n\x1e\xfa\x03\x94\x01\x0f\xd7\x02\xf1\x05\xdf\x01\x82\x03\xb5\x05\xc1\x07\xba\x06\xff\x06\xc7\x07L\xf5\x03\xe2\x07\xa9\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xdf\\\xcb\xbf'],
        [b'\n:\n\x1a\n\x01a\x12\x15\x1a\x13\n\x11*\xe3\x05\xc5\x06\xda\x07\xcb\x06~\xb1\x05\xb3\x01\xa9\x02\x15\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbb[\n\xbf'],
        [b'\nL\n-\n\x01a\x12(\x1a&\n$\x87\x07\xce\x01\xe7\x06\xee\x04\xe1\x03\xf1\x03\xd7\x07\xbe\x02\xb8\x05\xe0\x05\xe4\x01\x88\x06\xb6\x03\xb9\x05\x83\x06\xf8\x04\xe2\x04\xf4\x06\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbc\x99+@'],
        [b'\n2\n\x12\n\x01a\x12\r\x1a\x0b\n\t\x99\x02\xde\x04\x9f\x04\xc5\x053\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x12\x07\x83\xbe'],
        [b'\nJ\n\r\n\x01b\x12\x08\x1a\x06\n\x04\x9b\x94\xad\x04\n\r\n\x01c\x12\x08\x12\x06\n\x04\xc3\x8a\x08\xbf\n*\n\x01a\x12%\x1a#\n!\x9c\x02\xb2\x02\xcd\x02\x9d\x07\x8d\x01\xb6\x05a\xf1\x01\xf0\x05\xdb\x02\xac\x04\xbd\x05\xe0\x04\xd2\x06\xaf\x02\xa8\x01\x8b\x04'],
        [b'\n3\n\x13\n\x01a\x12\x0e\x1a\x0c\n\n<\xe2\x05\x8a\x01\xb3\x07?\xfd\x01\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x1b\x931\xbf'],
        [b'\n&\n\x07\n\x01a\x12\x02\x1a\x00\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04{\xbc\x0e>'],
        [b'\nF\n\'\n\x01a\x12"\x1a \n\x1e\x97\x01\x93\x02\x9e\x01\xac\x06\xff\x01\xd8\x05\xe1\x07\xd8\x04g]\x9a\x05\xff\x06\xde\x07\x8f\x04\x97\x04\xda\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x9a\xb7I\n\r\n\x01c\x12\x08\x12\x06\n\x04\xfb\x87\x83\xbf']
    ], dtype='|S78').flatten()

    # Send inference request to the inference server. Get results for
    # both output tensors.
    inputs = []
    outputs = []
    inputs.append(tritonhttpclient.InferInput('INPUT0', in0.shape, "BYTES"))
    inputs[0].set_data_from_numpy(in0)
    outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs,
                                  model_version=model_version)

    # We expect there to be 1 result (with batch-size 1). Verify
    # that all 8 result elements are the same as the input.
    self.assertTrue(np.array_equal(in0, results.as_numpy('OUTPUT0')))
def test_max_element_count_as_shape(self):
    model_name = "ragged_acc_shape"
    output_name = 'BATCH_OUTPUT'
    outputs = [tritonhttpclient.InferRequestedOutput(output_name)]
    async_requests = []
    try:
        for inputs in self.inputs:
            # Asynchronous inference call.
            async_requests.append(
                self.client.async_infer(model_name=model_name,
                                        inputs=inputs,
                                        outputs=outputs))

        for idx in range(len(async_requests)):
            # Get the result from the initiated asynchronous inference request.
            # Note the call will block till the server responds.
            result = async_requests[idx].get_result()

            # Validate the results by comparing with precomputed values.
            output_data = result.as_numpy(output_name)
            self.assertEqual(
                output_data.shape, (1, 4),
                "Expect response {} to have shape representing the max element count {} among the batch, got {}"
                .format(idx, 4, output_data.shape))
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
def test_accumulated_element_count_with_zero(self):
    model_name = "ragged_element_count_acc_zero"
    output_name = 'BATCH_OUTPUT'
    outputs = [tritonhttpclient.InferRequestedOutput(output_name)]
    async_requests = []
    try:
        for inputs in self.inputs:
            # Asynchronous inference call.
            async_requests.append(
                self.client.async_infer(model_name=model_name,
                                        inputs=inputs,
                                        outputs=outputs))

        expected_value = np.asarray([[0, 2, 6, 7, 10]], np.float32)
        for idx in range(len(async_requests)):
            # Get the result from the initiated asynchronous inference request.
            # Note the call will block till the server responds.
            result = async_requests[idx].get_result()

            # Validate the results by comparing with precomputed values.
            output_data = result.as_numpy(output_name)
            self.assertTrue(
                np.array_equal(output_data, expected_value),
                "Expect response {} to have value {}, got {}".format(
                    idx, expected_value, output_data))
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
def sync_send(triton_client, result_list, values, batch_size, sequence_id,
              model_name, model_version):
    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(
            tritonhttpclient.InferInput('INPUT', value_data.shape, "INT32"))
        # Initialize the data
        inputs[0].set_data_from_numpy(value_data)
        outputs = []
        outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT'))
        # Issue the synchronous sequence inference.
        result = triton_client.infer(model_name=model_name,
                                     inputs=inputs,
                                     outputs=outputs,
                                     sequence_id=sequence_id,
                                     sequence_start=(count == 1),
                                     sequence_end=(count == len(values)))
        result_list.append(result.as_numpy('OUTPUT'))
        count = count + 1
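# Hedged usage sketch for sync_send above (not from the original test):
# stream one sequence of values through a sequence-batching model. The
# model name, correlation id, and server URL are assumptions for
# illustration; the module is assumed to import tritonclient.http as
# tritonhttpclient, as the helper itself does.
def example_sync_sequence():
    triton_client = tritonhttpclient.InferenceServerClient(url="localhost:8000")
    result_list = []
    # Send the values 0..7 as a single sequence with correlation id 1000.
    sync_send(triton_client, result_list, list(range(8)), batch_size=1,
              sequence_id=1000, model_name="simple_sequence", model_version="")
    # result_list now holds one OUTPUT array per request, in order.
    return result_list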
def _full_exact(self, model_name, plugin_name, shape):
    triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                     verbose=True)

    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', list(shape), "FP32"))

    input0_data = np.ones(shape=shape).astype(np.float32)
    inputs[0].set_data_from_numpy(input0_data, binary_data=True)

    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))

    results = triton_client.infer(model_name + '_' + plugin_name,
                                  inputs,
                                  outputs=outputs)

    output0_data = results.as_numpy('OUTPUT0')

    # Verify values of Normalize and GELU
    if plugin_name == 'CustomGeluPluginDynamic':
        # Add bias
        input0_data += 1
        # Calculate GELU activation
        test_output = (input0_data * 0.5) * (
            1 + np.tanh((0.797885 * input0_data) +
                        (0.035677 * (input0_data**3))))
        self.assertTrue(np.isclose(output0_data, test_output).all())
    else:
        # L2 norm is sqrt(sum([1]*16))
        test_output = input0_data / np.sqrt(sum([1] * 16))
        self.assertTrue(np.isclose(output0_data, test_output).all())
def _full_exact(self, batch_size, model_name, plugin_name):
    triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                     verbose=True)

    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', [batch_size, 16], "FP32"))

    input0_data = np.random.randn(batch_size, 16).astype(np.float32)
    inputs[0].set_data_from_numpy(input0_data, binary_data=True)

    outputs.append(
        httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))

    results = triton_client.infer(model_name + '_' + plugin_name,
                                  inputs,
                                  outputs=outputs)

    output0_data = results.as_numpy('OUTPUT0')

    # Verify values of Leaky ReLU (it uses 0.1 instead of the default 0.01)
    # and for CustomClipPlugin min_clip = 0.1, max_clip = 0.5
    for b in range(batch_size):
        if plugin_name == 'LReLU_TRT':
            test_input = np.where(input0_data > 0, input0_data,
                                  input0_data * 0.1)
            self.assertTrue(np.isclose(output0_data, test_input).all())
        else:
            # [TODO] Add test for CustomClip output
            test_input = np.clip(input0_data, 0.1, 0.5)
def run_infer(model_name, model_version, numerical_features,
              categorical_features, headers=None):
    inputs = []
    outputs = []
    num_type = "FP16" if numerical_features.dtype == np.float16 else "FP32"
    inputs.append(
        tritonhttpclient.InferInput('input__0', numerical_features.shape,
                                    num_type))
    inputs.append(
        tritonhttpclient.InferInput('input__1', categorical_features.shape,
                                    "INT64"))

    # Initialize the data
    inputs[0].set_data_from_numpy(numerical_features, binary_data=True)
    inputs[1].set_data_from_numpy(categorical_features, binary_data=False)

    outputs.append(
        tritonhttpclient.InferRequestedOutput('output__0', binary_data=True))

    results = triton_client.infer(
        model_name,
        inputs,
        model_version=str(model_version) if model_version != -1 else '',
        outputs=outputs,
        headers=headers)

    return results
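# Hedged usage sketch for run_infer above: build one batch of random dense
# and categorical features and read back the prediction. The model name,
# feature widths (13 dense, 26 categorical), and the module-level
# triton_client are assumptions for illustration only.
def example_run_infer(batch_size=4):
    numerical_features = np.random.rand(batch_size, 13).astype(np.float32)
    categorical_features = np.random.randint(0, 100, size=(batch_size, 26),
                                             dtype=np.int64)
    # model_version=-1 asks run_infer to use the latest version.
    results = run_infer("recommender_model", -1, numerical_features,
                        categorical_features)
    # 'output__0' holds one score per sample in the batch.
    return results.as_numpy('output__0')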
def sync_send(triton_client, result_list, values, batch_size, sequence_id,
              model_name, model_version):
    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(
            tritonhttpclient.InferInput('INPUT', value_data.shape, "INT32"))
        # Initialize the data
        # FIXME: a negative value in binary form can't be handled properly,
        # which causes the library to raise a decode exception.
        inputs[0].set_data_from_numpy(value_data, binary_data=False)
        outputs = []
        outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT'))
        # Issue the synchronous sequence inference.
        result = triton_client.infer(model_name=model_name,
                                     inputs=inputs,
                                     outputs=outputs,
                                     sequence_id=sequence_id,
                                     sequence_start=(count == 1),
                                     sequence_end=(count == len(values)))
        result_list.append(result.as_numpy('OUTPUT'))
        count = count + 1
def TestIdentityInference(np_array, binary_data):
    model_name = "savedmodel_zero_1_object"
    inputs = []
    outputs = []

    inputs.append(
        tritonhttpclient.InferInput('INPUT0', np_array.shape, "BYTES"))
    inputs[0].set_data_from_numpy(np_array, binary_data=binary_data)
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT0',
                                              binary_data=binary_data))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    if (np_array.dtype == np.object):
        if binary_data:
            if not np.array_equal(np_array,
                                  np.char.decode(results.as_numpy('OUTPUT0'))):
                print(results.as_numpy('OUTPUT0'))
                sys.exit(1)
        else:
            if not np.array_equal(np_array, results.as_numpy('OUTPUT0')):
                print(results.as_numpy('OUTPUT0'))
                sys.exit(1)
    else:
        encoded_results = np.char.encode(
            results.as_numpy('OUTPUT0').astype(str))
        if not np.array_equal(np_array, encoded_results):
            print(encoded_results)
            sys.exit(1)
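# Hedged usage sketch for TestIdentityInference above: exercise both the
# object (unicode string) and bytes paths, with and without binary
# encoding. The sample strings are arbitrary; the module is assumed to
# define triton_client and import numpy as np and sys, as the helper does.
def example_identity_inference():
    str_array = np.array(['hello', 'triton'], dtype=np.object)
    bytes_array = np.array([b'hello', b'triton'], dtype=np.bytes_)
    for binary_data in (True, False):
        TestIdentityInference(str_array, binary_data)
        TestIdentityInference(bytes_array, binary_data)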
def requestGenerator(input_name, output_name, c, h, w, format, dtype, FLAGS):
    # Preprocess image into input data according to model requirements
    image_data = None
    with Image.open(FLAGS.image_filename) as img:
        image_data = preprocess(img, format, dtype, c, h, w, FLAGS.scaling)

    repeated_image_data = [image_data for _ in range(FLAGS.batch_size)]
    batched_image_data = np.stack(repeated_image_data, axis=0)

    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            tritongrpcclient.InferInput(input_name, batched_image_data.shape,
                                        dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            tritonhttpclient.InferInput(input_name, batched_image_data.shape,
                                        dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=False)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            tritongrpcclient.InferRequestedOutput(output_name,
                                                  class_count=FLAGS.classes))
    else:
        outputs.append(
            tritonhttpclient.InferRequestedOutput(output_name,
                                                  binary_data=False,
                                                  class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
def triton_infer(input_mapping,
                 model_name,
                 binary_data=False,
                 binary_output=False,
                 class_count=0):
    """Helper function for setting Triton inputs and executing a request

    Arguments
    ----------
    input_mapping : dict
        A dictionary mapping strings to numpy arrays. The keys should be
        the names of the model inputs, and the values should be the inputs
        themselves.
    model_name : str
        The name of the model on which you are running inference.
    binary_data : bool
        Whether you are sending binary input data. Defaults to False.
    binary_output : bool
        Whether you are expecting binary output data. Defaults to False.
    class_count : int
        If the model is a classification model, the number of output
        classes. Defaults to 0, indicating this is not a classification
        model.

    Returns
    ----------
    res : InferResult
        Triton inference result containing output from running prediction
    """
    input_meta, _, output_meta, _ = parse_model_http(model_name)

    inputs = []
    outputs = []

    # Populate the inputs array
    for in_meta in input_meta:
        input_name = in_meta["name"]
        data = input_mapping[input_name]

        input = tritonhttpclient.InferInput(input_name, data.shape,
                                            in_meta["datatype"])
        input.set_data_from_numpy(data, binary_data=binary_data)
        inputs.append(input)

    # Populate the outputs array
    for out_meta in output_meta:
        output_name = out_meta["name"]
        output = tritonhttpclient.InferRequestedOutput(
            output_name, binary_data=binary_output, class_count=class_count)
        outputs.append(output)

    # Run inference
    res = triton_client.infer(model_name,
                              inputs,
                              request_id="0",
                              outputs=outputs)

    return res
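# Hedged usage sketch for triton_infer above: the input names and shapes
# are discovered from the model metadata by parse_model_http, so the caller
# only supplies a name-to-array mapping. The model name and tensor names
# below are hypothetical; the module is assumed to define triton_client and
# parse_model_http, as the helper itself does.
def example_triton_infer():
    input_mapping = {
        "INPUT0": np.arange(16, dtype=np.float32).reshape(1, 16),
        "INPUT1": np.ones((1, 16), dtype=np.float32),
    }
    res = triton_infer(input_mapping, "example_addsub_model",
                       binary_data=True, binary_output=True)
    return res.as_numpy("OUTPUT0")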
def test_chw32_input(self):
    model_name = "plan_CHW32_LINEAR_float32_float32_float32"
    for bs in [1, 8]:
        input_np = np.arange(26 * bs, dtype=np.float32).reshape((bs, 13, 2, 1))
        expected_output0_np = input_np + input_np
        expected_output1_np = input_np - input_np
        reformatted_input_np = reformat("CHW32", input_np)

        # Use shared memory to bypass the shape check in the client library,
        # because for a non-linear format tensor the data buffer is padded
        # and thus the data byte size may not match what is calculated from
        # the tensor shape.
        inputs = []
        inputs.append(
            tritonhttpclient.InferInput('INPUT0', [bs, 13, 2, 1], "FP32"))
        self.add_reformat_free_data_as_shared_memory("input0" + str(bs),
                                                     inputs[-1],
                                                     reformatted_input_np)
        inputs.append(
            tritonhttpclient.InferInput('INPUT1', [bs, 13, 2, 1], "FP32"))
        self.add_reformat_free_data_as_shared_memory("input1" + str(bs),
                                                     inputs[-1],
                                                     reformatted_input_np)

        outputs = []
        outputs.append(
            tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs.append(
            tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))

        results = self.triton_client.infer(model_name=model_name,
                                           inputs=inputs,
                                           outputs=outputs)

        # Validate the results by comparing with precomputed values.
        output0_np = results.as_numpy('OUTPUT0')
        output1_np = results.as_numpy('OUTPUT1')
        self.assertTrue(
            np.array_equal(output0_np, expected_output0_np),
            "OUTPUT0 expected: {}, got {}".format(expected_output0_np,
                                                  output0_np))
        self.assertTrue(
            np.array_equal(output1_np, expected_output1_np),
            "OUTPUT1 expected: {}, got {}".format(expected_output1_np,
                                                  output1_np))
def _no_streaming_helper(self, protocol):
    data_offset = 100
    repeat_count = 1
    delay_time = 1000
    wait_time = 2000

    input_data = np.arange(start=data_offset,
                           stop=data_offset + repeat_count,
                           dtype=np.int32)
    input_data = np.expand_dims(input_data, axis=0)
    delay_data = (np.ones([1, repeat_count], dtype=np.uint32)) * delay_time
    wait_data = np.array([[wait_time]], dtype=np.uint32)

    if protocol == "grpc":
        # Use the inputs and outputs from the setUp
        this_inputs = self.inputs_
        this_outputs = self.outputs_
    else:
        this_inputs = []
        this_inputs.append(
            httpclient.InferInput('IN', [1, repeat_count], "INT32"))
        this_inputs.append(httpclient.InferInput('DELAY', [1, 1], "UINT32"))
        this_inputs.append(httpclient.InferInput('WAIT', [1, 1], "UINT32"))
        this_outputs = []
        this_outputs.append(httpclient.InferRequestedOutput('OUT'))

    # Initialize data for IN
    this_inputs[0].set_shape([1, repeat_count])
    this_inputs[0].set_data_from_numpy(input_data)

    # Initialize data for DELAY
    this_inputs[1].set_shape([1, repeat_count])
    this_inputs[1].set_data_from_numpy(delay_data)

    # Initialize data for WAIT
    this_inputs[2].set_data_from_numpy(wait_data)

    if protocol == "grpc":
        triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
                                                         verbose=True)
    else:
        triton_client = httpclient.InferenceServerClient(url="localhost:8000",
                                                         verbose=True)

    try:
        triton_client.infer(model_name=self.model_name_,
                            inputs=this_inputs,
                            outputs=this_outputs)
        self.assertTrue(False, "expected to fail for decoupled models")
    except InferenceServerException as ex:
        self.assertTrue(
            "doesn't support models with decoupled transaction policy" in
            ex.message())
def test_infer(model_name, input0_data, input1_data):
    inputs = []
    outputs = []
    inputs.append(tritonhttpclient.InferInput('INPUT0', [1, 16], "INT32"))
    inputs.append(tritonhttpclient.InferInput('INPUT1', [1, 16], "INT32"))

    # Initialize the data
    inputs[0].set_data_from_numpy(input0_data, binary_data=False)
    inputs[1].set_data_from_numpy(input1_data, binary_data=True)

    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=False))

    query_params = {'test_1': 1, 'test_2': 2}
    results = triton_client.infer(model_name,
                                  inputs,
                                  outputs=outputs,
                                  query_params=query_params)

    return results
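# Hedged usage sketch for test_infer above: an add/sub style model such as
# the "simple" example model takes two [1, 16] INT32 tensors. The model
# name is an assumption for illustration; the module is assumed to define
# triton_client, as the helper itself does.
def example_test_infer():
    input0_data = np.arange(16, dtype=np.int32).reshape(1, 16)
    input1_data = np.ones((1, 16), dtype=np.int32)
    results = test_infer("simple", input0_data, input1_data)
    # For the add/sub model, OUTPUT0 is the element-wise sum and OUTPUT1
    # the element-wise difference of the two inputs.
    return results.as_numpy('OUTPUT0'), results.as_numpy('OUTPUT1')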
def test_nobatch_request_for_batching_model(self):
    input_size = 16

    # graphdef_int32_int8_int8 has a batching version with max batch size
    # of 8. The server should return an error if the batch size is not
    # included in the input shapes.
    tensor_shape = (input_size,)
    for protocol in ["http", "grpc"]:
        model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
        in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
        in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)

        inputs = []
        outputs = []
        if protocol == "http":
            triton_client = tritonhttpclient.InferenceServerClient(
                url='localhost:8000', verbose=True)
            inputs.append(
                tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(
                tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
        else:
            triton_client = tritongrpcclient.InferenceServerClient(
                url='localhost:8001', verbose=True)
            inputs.append(
                tritongrpcclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(
                tritongrpcclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))

        # Initialize the data
        inputs[0].set_data_from_numpy(in0)
        inputs[1].set_data_from_numpy(in1)

        try:
            results = triton_client.infer(model_name, inputs, outputs=outputs)
            self.assertTrue(
                False,
                "expected failure with no batch request for batching model")
        except InferenceServerException as ex:
            pass
def _erroneous_infer(self, tensor_shape, batch_size):
    import tritonhttpclient

    item_size = batch_size
    for dim in tensor_shape:
        item_size *= dim
    full_shape = (batch_size,) + tensor_shape
    input_np = np.arange(item_size, dtype=self.dtype_).reshape(full_shape)
    expected_output0_np = input_np + input_np
    expected_output1_np = input_np - input_np

    inputs = []
    inputs.append(
        tritonhttpclient.InferInput('INPUT0', full_shape, self.dtype_str_))
    inputs[-1].set_data_from_numpy(input_np)
    inputs.append(
        tritonhttpclient.InferInput('INPUT1', full_shape, self.dtype_str_))
    inputs[-1].set_data_from_numpy(input_np)

    outputs = []
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))

    model_name = tu.get_model_name(self.model_name_, self.dtype_, self.dtype_,
                                   self.dtype_)
    results = tritonhttpclient.InferenceServerClient(
        "localhost:8000", verbose=True).infer(model_name=model_name,
                                              inputs=inputs,
                                              outputs=outputs)

    # Validate the results by comparing with precomputed values.
    output0_np = results.as_numpy('OUTPUT0')
    output1_np = results.as_numpy('OUTPUT1')
    self.assertFalse(np.array_equal(output0_np, expected_output0_np),
                     "expects OUTPUT0 is not correct")
    self.assertFalse(np.array_equal(output1_np, expected_output1_np),
                     "expects OUTPUT1 is not correct")
def _prepare_request(self, protocol):
    if (protocol == "grpc"):
        self.inputs_ = []
        self.inputs_.append(grpcclient.InferInput('INPUT0', [1, 1], "INT32"))
        self.outputs_ = []
        self.outputs_.append(grpcclient.InferRequestedOutput('OUTPUT0'))
    else:
        self.inputs_ = []
        self.inputs_.append(httpclient.InferInput('INPUT0', [1, 1], "INT32"))
        self.outputs_ = []
        self.outputs_.append(httpclient.InferRequestedOutput('OUTPUT0'))

    self.inputs_[0].set_data_from_numpy(self.input0_data_)
def _addsub_infer(self, model_name):
    triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                     verbose=True)

    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32"))
    inputs.append(httpclient.InferInput('INPUT1', [1, 16], "FP32"))

    # Initialize the data
    inputs[0].set_data_from_numpy(self.input0_, binary_data=True)
    inputs[1].set_data_from_numpy(self.input1_, binary_data=False)

    outputs.append(httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs.append(httpclient.InferRequestedOutput('OUTPUT1', binary_data=True))

    results = triton_client.infer(model_name, inputs, outputs=outputs)

    output0_data = results.as_numpy('OUTPUT0')
    output1_data = results.as_numpy('OUTPUT1')

    self.assertTrue(np.array_equal(self.expected_output0_, output0_data),
                    "incorrect sum")
    self.assertTrue(np.array_equal(self.expected_output1_, output1_data),
                    "incorrect difference")
def test_batch_item_shape(self):
    # Use 3 sets of inputs with shape [2, 1, 2], [1, 1, 2], [1, 2, 2].
    # Note that the test only checks the formation of "BATCH_INPUT"; the
    # value of "RAGGED_INPUT" is irrelevant, only the shape matters.
    inputs = []
    for value in [[2, 1, 2], [1, 1, 2], [1, 2, 2]]:
        inputs.append(
            [tritonhttpclient.InferInput('RAGGED_INPUT', value, "FP32")])
        inputs[-1][0].set_data_from_numpy(np.full(value, value[0], np.float32))
    client = tritonhttpclient.InferenceServerClient(url="localhost:8000",
                                                    concurrency=len(inputs))

    expected_outputs = [
        np.array([[1.0, 2.0], [1.0, 2.0]]),
        np.array([[1.0, 2.0]]),
        np.array([[2.0, 2.0]]),
    ]

    model_name = "batch_item"
    output_name = 'BATCH_OUTPUT'
    outputs = [tritonhttpclient.InferRequestedOutput(output_name)]
    async_requests = []
    try:
        for request_inputs in inputs:
            # Asynchronous inference call.
            async_requests.append(
                client.async_infer(model_name=model_name,
                                   inputs=request_inputs,
                                   outputs=outputs))

        for idx in range(len(async_requests)):
            # Get the result from the initiated asynchronous inference request.
            # Note the call will block till the server responds.
            result = async_requests[idx].get_result()

            # Validate the results by comparing with precomputed values.
            output_data = result.as_numpy(output_name)
            self.assertTrue(
                np.allclose(output_data, expected_outputs[idx]),
                "Expect response to have value:\n{}, got:\n{}\nEqual matrix:\n{}"
                .format(expected_outputs[idx], output_data,
                        np.isclose(expected_outputs[idx], output_data)))
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
def _test_helper(self, modelVersion, tag, sig_def):
    shape = [self.dims]
    model_name = self.base_model_name + str(modelVersion)
    # The multiplier is defined during model creation. See
    # server/qa/common/gen_tag_sigdef.py for details.
    multiplier = modelVersion + 1
    output_name = "OUTPUT"
    triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                     verbose=True)

    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT', shape, "FP32"))

    input_data = np.ones(shape=shape).astype(np.float32)
    inputs[0].set_data_from_numpy(input_data, binary_data=True)

    outputs.append(
        httpclient.InferRequestedOutput(output_name, binary_data=True))

    results = triton_client.infer(model_name, inputs, outputs=outputs)
    output_data = results.as_numpy(output_name)
    test_output = input_data * multiplier
    self.assertTrue(np.isclose(output_data, test_output).all())
def test_ragged_output(self):
    model_name = "ragged_io"
    # The model is an identity model
    self.inputs = []
    for value in [2, 4, 1, 3]:
        self.inputs.append(
            [tritonhttpclient.InferInput('INPUT0', [1, value], "FP32")])
        self.inputs[-1][0].set_data_from_numpy(
            np.full([1, value], value, np.float32))

    output_name = 'OUTPUT0'
    outputs = [tritonhttpclient.InferRequestedOutput(output_name)]
    async_requests = []
    try:
        for inputs in self.inputs:
            # Asynchronous inference call.
            async_requests.append(
                self.client.async_infer(model_name=model_name,
                                        inputs=inputs,
                                        outputs=outputs))

        expected_value_list = [[v] * v for v in [2, 4, 1, 3]]
        expected_value_list = [
            np.asarray([expected_value], dtype=np.float32)
            for expected_value in expected_value_list
        ]
        for idx in range(len(async_requests)):
            # Get the result from the initiated asynchronous inference request.
            # Note the call will block till the server responds.
            result = async_requests[idx].get_result()

            # Validate the results by comparing with precomputed values.
            output_data = result.as_numpy(output_name)
            self.assertTrue(
                np.array_equal(output_data, expected_value_list[idx]),
                "Expect response {} to have value {}, got {}".format(
                    idx, expected_value_list[idx], output_data))
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
# Register Input0 and Input1 shared memory with Triton Server
triton_client.register_system_shared_memory("input0_data", "/input0_simple",
                                            input0_byte_size)
triton_client.register_system_shared_memory("input1_data", "/input1_simple",
                                            input1_byte_size)

# Set the parameters to use data from shared memory
inputs = []
inputs.append(httpclient.InferInput('INPUT0', [1, 16], "BYTES"))
inputs[-1].set_shared_memory("input0_data", input0_byte_size)

inputs.append(httpclient.InferInput('INPUT1', [1, 16], "BYTES"))
inputs[-1].set_shared_memory("input1_data", input1_byte_size)

outputs = []
outputs.append(httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
outputs[-1].set_shared_memory("output0_data", output0_byte_size)

outputs.append(httpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
outputs[-1].set_shared_memory("output1_data", output1_byte_size)

results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs)

# Read results from the shared memory.
output0 = results.get_output("OUTPUT0")
print(utils.triton_to_np_dtype(output0['datatype']))
if output0 is not None:
    output0_data = shm.get_contents_as_numpy(
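# Hedged sketch (not part of the original example): one way the output
# regions referenced above, such as "output0_data", might be created and
# registered before they are used. For simplicity this allocates fixed-size
# regions rather than sizing them from serialized BYTES tensors; the names,
# keys, and sizes are assumptions for illustration.
def example_create_output_shm_regions(triton_client, output_byte_size=64):
    import tritonclient.utils.shared_memory as shm

    # Create the system shared memory regions for the two outputs.
    shm_op0_handle = shm.create_shared_memory_region("output0_data",
                                                     "/output0_simple",
                                                     output_byte_size)
    shm_op1_handle = shm.create_shared_memory_region("output1_data",
                                                     "/output1_simple",
                                                     output_byte_size)

    # Make the regions known to the server under the same names that are
    # later passed to set_shared_memory on the requested outputs.
    triton_client.register_system_shared_memory("output0_data",
                                                "/output0_simple",
                                                output_byte_size)
    triton_client.register_system_shared_memory("output1_data",
                                                "/output1_simple",
                                                output_byte_size)
    return shm_op0_handle, shm_op1_handle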
]], dtype='uint32')
input2_data = np.array([[
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
    20, 21, 22, 23, 24, 25, 26
]], dtype='int32')

inputs = [
    httpclient.InferInput("DES", input0_data.shape,
                          np_to_triton_dtype(input0_data.dtype)),
    httpclient.InferInput("CATCOLUMN", input1_data.shape,
                          np_to_triton_dtype(input1_data.dtype)),
    httpclient.InferInput("ROWINDEX", input2_data.shape,
                          np_to_triton_dtype(input2_data.dtype)),
]

inputs[0].set_data_from_numpy(input0_data)
inputs[1].set_data_from_numpy(input1_data)
inputs[2].set_data_from_numpy(input2_data)

outputs = [httpclient.InferRequestedOutput("OUTPUT0")]

response = client.infer(model_name,
                        inputs,
                        request_id=str(1),
                        outputs=outputs)

result = response.get_response()
print(result)
print(response.as_numpy("OUTPUT0"))
                                           input_byte_size)
triton_client.register_cuda_shared_memory(
    "input1_data", cudashm.get_raw_handle(shm_ip1_handle), 0, input_byte_size)

# Set the parameters to use data from shared memory
inputs = []
inputs.append(tritonhttpclient.InferInput('INPUT0', [1, 16], "INT32"))
inputs[-1].set_shared_memory("input0_data", input_byte_size)

inputs.append(tritonhttpclient.InferInput('INPUT1', [1, 16], "INT32"))
inputs[-1].set_shared_memory("input1_data", input_byte_size)

outputs = []
outputs.append(
    tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
outputs[-1].set_shared_memory("output0_data", output_byte_size)

outputs.append(
    tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
outputs[-1].set_shared_memory("output1_data", output_byte_size)

results = triton_client.infer(model_name=model_name,
                              inputs=inputs,
                              outputs=outputs)

# Read results from the shared memory.
output0 = results.get_output("OUTPUT0")
if output0 is not None:
    output0_data = cudashm.get_contents_as_numpy(
        shm_op0_handle, utils.triton_to_np_dtype(output0['datatype']),
# Set the input data
inputs = []
if FLAGS.protocol.lower() == "grpc":
    inputs.append(
        tritongrpcclient.InferInput(input_name, batched_image_data.shape,
                                    "BYTES"))
    inputs[0].set_data_from_numpy(batched_image_data)
else:
    inputs.append(
        tritonhttpclient.InferInput(input_name, batched_image_data.shape,
                                    "BYTES"))
    inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

outputs = []
if FLAGS.protocol.lower() == "grpc":
    outputs.append(
        tritongrpcclient.InferRequestedOutput(output_name,
                                              class_count=FLAGS.classes))
else:
    outputs.append(
        tritonhttpclient.InferRequestedOutput(output_name,
                                              binary_data=True,
                                              class_count=FLAGS.classes))

# Send request
result = triton_client.infer(model_name, inputs, outputs=outputs)

postprocess(result, output_name, input_filenames, batch_size)

print("PASS")
def infer_exact(tester, pf, tensor_shape, batch_size,
                input_dtype, output0_dtype, output1_dtype,
                output0_raw=True, output1_raw=True,
                model_version=None, swap=False,
                outputs=("OUTPUT0", "OUTPUT1"), use_http=True, use_grpc=True,
                use_http_json_tensors=True, skip_request_id_check=False,
                use_streaming=True, correlation_id=0, shm_region_names=None,
                precreated_shm_regions=None, use_system_shared_memory=False,
                use_cuda_shared_memory=False, priority=0, timeout_us=0):
    tester.assertTrue(
        use_http or use_http_json_tensors or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
    if output0_raw == output1_raw:
        # Float16 not supported for Input and Output via JSON
        if use_http_json_tensors and (input_dtype != np.float16) and \
                (output0_dtype != np.float16) and (output1_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))

    # outputs are sum and difference of inputs so set max input
    # values so that they will not overflow the output. This
    # allows us to do an exact match. For float types use 8, 16,
    # 32 int range for fp 16, 32, 64 respectively. When getting
    # class outputs the result value/probability is returned as a
    # float so must use fp32 range in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(np.iinfo(rinput_dtype).min,
                  np.iinfo(routput0_dtype).min,
                  np.iinfo(routput1_dtype).min) / 2
    val_max = min(np.iinfo(rinput_dtype).max,
                  np.iinfo(routput0_dtype).max,
                  np.iinfo(routput1_dtype).max) / 2

    num_classes = 3

    input0_array = np.random.randint(low=val_min, high=val_max,
                                     size=tensor_shape, dtype=rinput_dtype)
    input1_array = np.random.randint(low=val_min, high=val_max,
                                     size=tensor_shape, dtype=rinput_dtype)
    if input_dtype != np.object:
        input0_array = input0_array.astype(input_dtype)
        input1_array = input1_array.astype(input_dtype)

    if not swap:
        output0_array = input0_array + input1_array
        output1_array = input0_array - input1_array
    else:
        output0_array = input0_array - input1_array
        output1_array = input0_array + input1_array

    if output0_dtype == np.object:
        output0_array = np.array(
            [unicode(str(x), encoding='utf-8')
             for x in (output0_array.flatten())],
            dtype=object).reshape(output0_array.shape)
    else:
        output0_array = output0_array.astype(output0_dtype)
    if output1_dtype == np.object:
        output1_array = np.array(
            [unicode(str(x), encoding='utf-8')
             for x in (output1_array.flatten())],
            dtype=object).reshape(output1_array.shape)
    else:
        output1_array = output1_array.astype(output1_dtype)

    if input_dtype == np.object:
        in0n = np.array(
            [str(x) for x in input0_array.reshape(input0_array.size)],
            dtype=object)
        input0_array = in0n.reshape(input0_array.shape)
        in1n = np.array(
            [str(x) for x in input1_array.reshape(input1_array.size)],
            dtype=object)
        input1_array = in1n.reshape(input1_array.shape)

    # prepend size of string to output string data
    if output0_dtype == np.object:
        if batch_size == 1:
            output0_array_tmp = serialize_byte_tensor_list([output0_array])
        else:
            output0_array_tmp = serialize_byte_tensor_list(output0_array)
    else:
        output0_array_tmp = output0_array

    if output1_dtype == np.object:
        if batch_size == 1:
            output1_array_tmp = serialize_byte_tensor_list([output1_array])
        else:
            output1_array_tmp = serialize_byte_tensor_list(output1_array)
    else:
        output1_array_tmp = output1_array

    OUTPUT0 = "OUTPUT0"
    OUTPUT1 = "OUTPUT1"
    INPUT0 = "INPUT0"
    INPUT1 = "INPUT1"
    if pf == "libtorch" or pf == "libtorch_nobatch":
        OUTPUT0 = "OUTPUT__0"
        OUTPUT1 = "OUTPUT__1"
        INPUT0 = "INPUT__0"
        INPUT1 = "INPUT__1"

    output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp])
    output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp])

    if batch_size == 1:
        input0_list = [input0_array]
        input1_list = [input1_array]
    else:
        input0_list = [x for x in input0_array]
        input1_list = [x for x in input1_array]

    # Serialization of string tensors in the case of shared memory must be
    # done manually
    if input_dtype == np.object:
        input0_list_tmp = serialize_byte_tensor_list(input0_list)
        input1_list_tmp = serialize_byte_tensor_list(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    # Create system/cuda shared memory regions if needed
    shm_regions, shm_handles = su.create_set_shm_regions(
        input0_list_tmp, input1_list_tmp, output0_byte_size,
        output1_byte_size, outputs, shm_region_names, precreated_shm_regions,
        use_system_shared_memory, use_cuda_shared_memory)

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        if config[1] == "http":
            inputs.append(
                httpclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                httpclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
        else:
            inputs.append(
                grpcclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                grpcclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))

        if not (use_cuda_shared_memory or use_system_shared_memory):
            if config[1] == "http":
                inputs[0].set_data_from_numpy(input0_array,
                                              binary_data=config[3])
                inputs[1].set_data_from_numpy(input1_array,
                                              binary_data=config[3])
            else:
                inputs[0].set_data_from_numpy(input0_array)
                inputs[1].set_data_from_numpy(input1_array)
        else:
            # Register necessary shared memory regions/handles
            su.register_add_shm_regions(
                inputs, outputs, shm_regions, precreated_shm_regions,
                shm_handles, input0_byte_size, input1_byte_size,
                output0_byte_size, output1_byte_size,
                use_system_shared_memory, use_cuda_shared_memory,
                triton_client)

        if batch_size == 1:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape((1,) + tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape((1,) + tensor_shape)
            ]
        else:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape(tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape(tensor_shape)
            ]

        # Force binary_data = False for shared memory and class
        output_req = []
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(
                            OUTPUT0, binary_data=config[3]))
                else:
                    output_req.append(
                        grpcclient.InferRequestedOutput(OUTPUT0))
                output_req[-1].set_shared_memory(shm_regions[2] + '_data',
                                                 output0_byte_size)
            else:
                if output0_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT0))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT0, class_count=num_classes))
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(
                            OUTPUT1, binary_data=config[3]))
                else:
                    output_req.append(
                        grpcclient.InferRequestedOutput(OUTPUT1))
                output_req[-1].set_shared_memory(shm_regions[2 + i] + '_data',
                                                 output1_byte_size)
            else:
                if output1_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT1))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT1, class_count=num_classes))

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()))
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(
                model_name,
                inputs,
                model_version=model_version,
                outputs=output_req,
                request_id=str(_unique_request_id()))

        last_response = results.get_response()

        if not skip_request_id_check:
            global _seen_request_ids
            if config[1] == "http":
                request_id = int(last_response["id"])
            else:
                request_id = int(last_response.id)
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(str(response_model_version), model_version)

        tester.assertEqual(len(response_outputs), len(outputs))

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            if ((result_name == OUTPUT0 and output0_raw) or
                    (result_name == OUTPUT1 and output1_raw)):
                if use_system_shared_memory or use_cuda_shared_memory:
                    if result_name == OUTPUT0:
                        shm_handle = shm_handles[2]
                    else:
                        shm_handle = shm_handles[3]

                    output = results.get_output(result_name)
                    if config[1] == "http":
                        output_datatype = output['datatype']
                        output_shape = output['shape']
                    else:
                        output_datatype = output.datatype
                        output_shape = output.shape
                    output_dtype = triton_to_np_dtype(output_datatype)

                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = results.as_numpy(result_name)

                if (output_data.dtype == np.object) and (config[3] == False):
                    output_data = output_data.astype(np.bytes_)

                if result_name == OUTPUT0:
                    tester.assertTrue(
                        np.array_equal(output_data, output0_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT0, output0_array, output_data))
                elif result_name == OUTPUT1:
                    tester.assertTrue(
                        np.array_equal(output_data, output1_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT1, output1_array, output_data))
                else:
                    tester.assertTrue(
                        False, "unexpected raw result {}".format(result_name))
            else:
                for b in range(batch_size):
                    # num_classes values must be returned and must
                    # match expected top values
                    if "nobatch" in pf:
                        class_list = results.as_numpy(result_name)
                    else:
                        class_list = results.as_numpy(result_name)[b]

                    tester.assertEqual(len(class_list), num_classes)
                    if batch_size == 1:
                        expected0_flatten = output0_array.flatten()
                        expected1_flatten = output1_array.flatten()
                    else:
                        expected0_flatten = output0_array[b].flatten()
                        expected1_flatten = output1_array[b].flatten()

                    for idx, class_label in enumerate(class_list):
                        # can't compare indices since could have different
                        # indices with the same value/prob, so check that
                        # the value of each index equals the expected value.
                        # Only compare labels when the indices are equal.
                        if type(class_label) == str:
                            ctuple = class_label.split(':')
                        else:
                            ctuple = "".join(
                                chr(x) for x in class_label).split(':')
                        cval = float(ctuple[0])
                        cidx = int(ctuple[1])
                        if result_name == OUTPUT0:
                            tester.assertEqual(cval, expected0_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if cidx == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2],
                                    'label{}'.format(expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(cval, expected1_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False,
                                "unexpected class result {}".format(result_name))

    # Unregister system/cuda shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                      precreated_shm_regions, outputs,
                                      use_system_shared_memory,
                                      use_cuda_shared_memory)

    return results
def infer_zero(tester, pf, batch_size, tensor_dtype, input_shapes,
               output_shapes, model_version=None, use_http=True,
               use_grpc=True, use_http_json_tensors=True, use_streaming=True,
               shm_region_name_prefix=None, use_system_shared_memory=False,
               use_cuda_shared_memory=False, priority=0, timeout_us=0):
    tester.assertTrue(
        use_http or use_grpc or use_http_json_tensors or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
    if use_http_json_tensors and (tensor_dtype != np.float16):
        configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

    for io_num in range(io_cnt):
        if pf == "libtorch" or pf == "libtorch_nobatch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool):
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape,
                                            dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != np.object:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            expected_array = np.array(
                [unicode(str(x), encoding='utf-8')
                 for x in input_array.flatten()],
                dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # Serialization of string tensors in the case of shared memory must
        # be done manually
        if tensor_dtype == np.object:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region(
            [shm_region_name_prefix[0] + str(io_num),
             shm_region_name_prefix[1] + str(io_num)],
            input_list_tmp, input_byte_size, output_byte_size,
            use_system_shared_memory, use_cuda_shared_memory)

        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])

        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(
                zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(
                    httpclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    httpclient.InferRequestedOutput(output_name,
                                                    binary_data=config[3]))
            else:
                inputs.append(
                    grpcclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data,
                                                   binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(
                    inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num,
                    input_byte_size, output_byte_size,
                    use_system_shared_memory, use_cuda_shared_memory,
                    triton_client)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()),
                    priority=priority,
                    timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(
                model_name,
                inputs,
                model_version=model_version,
                outputs=output_req,
                request_id=str(_unique_request_id()),
                priority=priority,
                timeout=timeout_us)

        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(response_model_version, model_version)

        tester.assertEqual(len(response_outputs), io_cnt)

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                if pf == "libtorch" or pf == "libtorch_nobatch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])
                shm_handle = shm_op_handles[io_num]

                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)

            if use_system_shared_memory:
                output_data = shm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            elif use_cuda_shared_memory:
                output_data = cudashm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            if (output_data.dtype == np.object) and (config[3] == False):
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(
                np.array_equal(output_data, expected),
                "{}, {}, expected: {}, got {}".format(model_name, result_name,
                                                      expected, output_data))

    # Unregister and destroy both the input and output region of each I/O.
    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results