def test_register_after_inference(self):
    # Register after inference
    error_msg = []
    shm_handles = self._configure_sever()
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2],
                          shm_handles[3], error_msg)
    if len(error_msg) > 0:
        raise Exception(str(error_msg))
    shm_ip2_handle = shm.create_shared_memory_region("input2_data",
                                                     "/input2_data", 64)
    triton_client.register_system_shared_memory("input2_data", "/input2_data",
                                                64)
    shm_status = triton_client.get_system_shared_memory_status()
    if _protocol == "http":
        self.assertEqual(len(shm_status), 5)
    else:
        self.assertEqual(len(shm_status.regions), 5)
    shm_handles.append(shm_ip2_handle)
    self._cleanup_server(shm_handles)
def _configure_sever(self):
    shm_ip0_handle = cshm.create_shared_memory_region("input0_data", 64, 0)
    shm_ip1_handle = cshm.create_shared_memory_region("input1_data", 64, 0)
    shm_op0_handle = cshm.create_shared_memory_region("output0_data", 64, 0)
    shm_op1_handle = cshm.create_shared_memory_region("output1_data", 64, 0)
    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
    input1_data = np.ones(shape=16, dtype=np.int32)
    cshm.set_shared_memory_region(shm_ip0_handle, [input0_data])
    cshm.set_shared_memory_region(shm_ip1_handle, [input1_data])
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    triton_client.register_cuda_shared_memory(
        "input0_data", cshm.get_raw_handle(shm_ip0_handle), 0, 64)
    triton_client.register_cuda_shared_memory(
        "input1_data", cshm.get_raw_handle(shm_ip1_handle), 0, 64)
    triton_client.register_cuda_shared_memory(
        "output0_data", cshm.get_raw_handle(shm_op0_handle), 0, 64)
    triton_client.register_cuda_shared_memory(
        "output1_data", cshm.get_raw_handle(shm_op1_handle), 0, 64)
    return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle]
def _basic_inference(self,
                     shm_ip0_handle,
                     shm_ip1_handle,
                     shm_op0_handle,
                     shm_op1_handle,
                     error_msg,
                     big_shm_name="",
                     big_shm_size=64):
    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
    input1_data = np.ones(shape=16, dtype=np.int32)
    inputs = []
    outputs = []
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
        inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT0', binary_data=False))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
        inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

    inputs[0].set_shared_memory("input0_data", 64)

    if isinstance(shm_ip1_handle, np.ndarray):
        # A raw numpy array was passed instead of a shared memory handle,
        # so send the second input inline rather than through shared memory.
        inputs[1].set_data_from_numpy(input0_data, binary_data=False)
    elif big_shm_name != "":
        inputs[1].set_shared_memory(big_shm_name, big_shm_size)
    else:
        inputs[1].set_shared_memory("input1_data", 64)

    outputs[0].set_shared_memory("output0_data", 64)
    outputs[1].set_shared_memory("output1_data", 64)

    try:
        results = triton_client.infer("simple",
                                      inputs,
                                      model_version="",
                                      outputs=outputs)
        output = results.get_output('OUTPUT0')
        if _protocol == "http":
            output_datatype = output['datatype']
            output_shape = output['shape']
        else:
            output_datatype = output.datatype
            output_shape = output.shape
        output_dtype = triton_to_np_dtype(output_datatype)
        output_data = shm.get_contents_as_numpy(shm_op0_handle, output_dtype,
                                                output_shape)
        self.assertTrue(
            (output_data[0] == (input0_data + input1_data)).all())
    except Exception as ex:
        error_msg.append(str(ex))
def test_reregister_after_register(self):
    # Create a valid system shared memory region and re-register after register
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    shm_op0_handle = shm.create_shared_memory_region("dummy_data",
                                                     "/dummy_data", 8)
    triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8)
    try:
        triton_client.register_system_shared_memory("dummy_data",
                                                    "/dummy_data", 8)
    except Exception as ex:
        self.assertIn("shared memory region 'dummy_data' already in manager",
                      str(ex))
    shm_status = triton_client.get_system_shared_memory_status()
    if _protocol == "http":
        self.assertEqual(len(shm_status), 1)
    else:
        self.assertEqual(len(shm_status.regions), 1)
    shm.destroy_shared_memory_region(shm_op0_handle)
def _full_exact(self, model_name, request_concurrency, shape):
    # Run async requests to make sure backend handles concurrent requests
    # correctly.
    client = httpclient.InferenceServerClient("localhost:8000",
                                              concurrency=request_concurrency)
    input_datas = []
    requests = []
    for i in range(request_concurrency):
        input_data = (16384 * np.random.randn(*shape)).astype(np.float32)
        input_datas.append(input_data)
        inputs = [
            httpclient.InferInput("INPUT__0", input_data.shape, "FP32")
        ]
        inputs[0].set_data_from_numpy(input_data)
        requests.append(client.async_infer(model_name, inputs))

    for i in range(request_concurrency):
        # Get the result from the initiated asynchronous inference request.
        # Note the call will block until the server responds.
        results = requests[i].get_result()

        output_data = results.as_numpy("OUTPUT__0")
        self.assertIsNotNone(output_data,
                             "error: expected 'OUTPUT__0' to be found")
        np.testing.assert_allclose(output_data, input_datas[i])
def test_batch_request_for_batching_model(self):
    input_size = 16

    # graphdef_int32_int8_int8 is the batching version of the model.
    # The request should succeed when the batch size dimension is
    # included in the shape.
    tensor_shape = (1, input_size)
    for protocol in ["http", "grpc"]:
        model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
        in0 = np.random.randint(low=0,
                                high=100,
                                size=tensor_shape,
                                dtype=np.int32)
        in1 = np.random.randint(low=0,
                                high=100,
                                size=tensor_shape,
                                dtype=np.int32)

        inputs = []
        outputs = []
        if protocol == "http":
            triton_client = tritonhttpclient.InferenceServerClient(
                url='localhost:8000', verbose=True)
            inputs.append(
                tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(
                tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
        else:
            triton_client = tritongrpcclient.InferenceServerClient(
                url='localhost:8001', verbose=True)
            inputs.append(
                tritongrpcclient.InferInput('INPUT0', tensor_shape, "INT32"))
            inputs.append(
                tritongrpcclient.InferInput('INPUT1', tensor_shape, "INT32"))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))

        # Initialize the data
        inputs[0].set_data_from_numpy(in0)
        inputs[1].set_data_from_numpy(in1)

        results = triton_client.infer(model_name, inputs, outputs=outputs)
def test_unknown_model(self):
    try:
        for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]:
            model_name = "foo"
            if pair[1] == "http":
                triton_client = httpclient.InferenceServerClient(url=pair[0],
                                                                 verbose=True)
            else:
                triton_client = grpcclient.InferenceServerClient(url=pair[0],
                                                                 verbose=True)

            self.assertTrue(triton_client.is_server_live())
            self.assertTrue(triton_client.is_server_ready())

            server_metadata = triton_client.get_server_metadata()
            if pair[1] == "http":
                self.assertEqual(os.environ["TRITON_SERVER_VERSION"],
                                 server_metadata['version'])
                self.assertEqual("triton", server_metadata['name'])
            else:
                self.assertEqual(os.environ["TRITON_SERVER_VERSION"],
                                 server_metadata.version)
                self.assertEqual("triton", server_metadata.name)

            model_metadata = triton_client.get_model_metadata(model_name)
            self.assertTrue(False, "expected unknown model failure")
    except InferenceServerException as ex:
        self.assertTrue(ex.message().startswith(
            "Request for unknown model: 'foo' is not found"))
def _addsub_infer(self, model_name):
    triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                     verbose=True)

    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32"))
    inputs.append(httpclient.InferInput('INPUT1', [1, 16], "FP32"))

    # Initialize the data
    inputs[0].set_data_from_numpy(self.input0_, binary_data=False)
    inputs[1].set_data_from_numpy(self.input1_, binary_data=True)

    outputs.append(httpclient.InferRequestedOutput('OUTPUT0',
                                                   binary_data=True))
    outputs.append(httpclient.InferRequestedOutput('OUTPUT1',
                                                   binary_data=False))

    results = triton_client.infer(model_name, inputs, outputs=outputs)

    output0_data = results.as_numpy('OUTPUT0')
    output1_data = results.as_numpy('OUTPUT1')

    self.assertTrue(np.array_equal(self.expected_output0_, output0_data),
                    "incorrect sum")
    self.assertTrue(np.array_equal(self.expected_output1_, output1_data),
                    "incorrect difference")
def setUp(self):
    self._data_type = np.float32

    # Very large tensors will always fail for gRPC because Protobuf has a
    # hard 2 GB limit on the size of input tensors. All backends except the
    # Python and plan backends should be able to handle payloads larger
    # than 2 GB over HTTP.
    very_large_tensor_shape = (math.trunc(
        3 * (1024 * 1024 * 1024) / np.dtype(self._data_type).itemsize), )
    self._very_large_in0 = np.random.random(very_large_tensor_shape).astype(
        self._data_type)

    # 1.9 GB allows us to test gRPC with moderate sizes too.
    large_tensor_shape = (math.trunc(
        1.9 * (1024 * 1024 * 1024) // np.dtype(self._data_type).itemsize), )
    self._large_in0 = np.random.random(large_tensor_shape).astype(
        self._data_type)

    small_tensor_shape = (1, )
    self._small_in0 = np.random.random(small_tensor_shape).astype(
        self._data_type)

    self._clients = ((httpclient,
                      httpclient.InferenceServerClient('localhost:8000')),
                     (grpcclient,
                      grpcclient.InferenceServerClient('localhost:8001')))
def _full_exact(self, batch_size, model_name, plugin_name):
    triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                     verbose=True)

    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', [batch_size, 16], "FP32"))

    input0_data = np.random.randn(batch_size, 16).astype(np.float32)
    inputs[0].set_data_from_numpy(input0_data, binary_data=False)

    outputs.append(httpclient.InferRequestedOutput('OUTPUT0',
                                                   binary_data=True))

    results = triton_client.infer(model_name + '_' + plugin_name,
                                  inputs,
                                  outputs=outputs)

    output0_data = results.as_numpy('OUTPUT0')

    # Verify values of Leaky RELU (it uses 0.1 instead of the default 0.01)
    # and for CustomClipPlugin min_clip = 0.1, max_clip = 0.5
    for b in range(batch_size):
        if plugin_name == 'LReLU_TRT':
            test_input = np.where(input0_data > 0, input0_data,
                                  input0_data * 0.1)
            self.assertTrue(np.isclose(output0_data, test_input).all())
        else:
            # [TODO] Add test for CustomClip output
            test_input = np.clip(input0_data, 0.1, 0.5)
def _full_exact(self, model_name, plugin_name, shape):
    triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                     verbose=True)

    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT0', list(shape), "FP32"))

    input0_data = np.ones(shape=shape).astype(np.float32)
    inputs[0].set_data_from_numpy(input0_data, binary_data=True)

    outputs.append(httpclient.InferRequestedOutput('OUTPUT0',
                                                   binary_data=True))

    results = triton_client.infer(model_name + '_' + plugin_name,
                                  inputs,
                                  outputs=outputs)

    output0_data = results.as_numpy('OUTPUT0')

    # Verify values of Normalize and GELU
    if plugin_name == 'CustomGeluPluginDynamic':
        # Add bias
        input0_data += 1
        # Calculate GELU activation
        test_output = (input0_data * 0.5) * (
            1 + np.tanh((0.797885 * input0_data) +
                        (0.035677 * (input0_data**3))))
        self.assertTrue(np.isclose(output0_data, test_output).all())
    else:
        # L2 norm is sqrt(sum([1] * 16))
        test_output = input0_data / np.sqrt(sum([1] * 16))
        self.assertTrue(np.isclose(output0_data, test_output).all())
def test_infer_stats_no_model(self):
    # Test get_inference_statistics when no model/model_version is passed.
    try:
        for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]:
            if pair[1] == "http":
                triton_client = httpclient.InferenceServerClient(url=pair[0],
                                                                 verbose=True)
            else:
                triton_client = grpcclient.InferenceServerClient(url=pair[0],
                                                                 verbose=True)

            self.assertTrue(triton_client.is_server_live())
            self.assertTrue(triton_client.is_server_ready())

            # Returns infer stats for ALL models + ready versions
            infer_stats = triton_client.get_inference_statistics()
            if pair[1] == "http":
                stats = infer_stats['model_stats']
            else:
                stats = infer_stats.model_stats
            self.assertEqual(
                len(stats), 207,
                "expected 207 infer stats for all ready versions of all models")
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
def infer_unknown(self, model_name, tensor_shape):
    print("About to run the test")
    input_data = np.random.random_sample(tensor_shape).astype(np.float32)
    client = tritonhttpclient.InferenceServerClient('localhost:8000')
    inputs = [
        tritonhttpclient.InferInput("INPUT", input_data.shape,
                                    np_to_triton_dtype(input_data.dtype))
    ]
    inputs[0].set_data_from_numpy(input_data)
    results = client.infer(model_name, inputs)
    self.assertTrue(np.array_equal(results.as_numpy('OUTPUT'), input_data))
def _no_streaming_helper(self, protocol):
    data_offset = 100
    repeat_count = 1
    delay_time = 1000
    wait_time = 2000

    input_data = np.arange(start=data_offset,
                           stop=data_offset + repeat_count,
                           dtype=np.int32)
    input_data = np.expand_dims(input_data, axis=0)
    delay_data = (np.ones([1, repeat_count], dtype=np.uint32)) * delay_time
    wait_data = np.array([[wait_time]], dtype=np.uint32)

    if protocol == "grpc":
        # Use the inputs and outputs from the setUp
        this_inputs = self.inputs_
        this_outputs = self.outputs_
    else:
        this_inputs = []
        this_inputs.append(
            httpclient.InferInput('IN', [1, repeat_count], "INT32"))
        this_inputs.append(httpclient.InferInput('DELAY', [1, 1], "UINT32"))
        this_inputs.append(httpclient.InferInput('WAIT', [1, 1], "UINT32"))
        this_outputs = []
        this_outputs.append(httpclient.InferRequestedOutput('OUT'))

    # Initialize data for IN
    this_inputs[0].set_shape([1, repeat_count])
    this_inputs[0].set_data_from_numpy(input_data)

    # Initialize data for DELAY
    this_inputs[1].set_shape([1, repeat_count])
    this_inputs[1].set_data_from_numpy(delay_data)

    # Initialize data for WAIT
    this_inputs[2].set_data_from_numpy(wait_data)

    if protocol == "grpc":
        triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
                                                         verbose=True)
    else:
        triton_client = httpclient.InferenceServerClient(url="localhost:8000",
                                                         verbose=True)

    try:
        triton_client.infer(model_name=self.model_name_,
                            inputs=this_inputs,
                            outputs=this_outputs)
        self.assertTrue(False, "expected to fail for decoupled models")
    except InferenceServerException as ex:
        self.assertIn(
            "doesn't support models with decoupled transaction policy",
            ex.message())
def triton_init(url="localhost:8000"):
    """Initializes the triton client to point at the specified URL

    Parameters
    ----------
    url : str
        The URL on which to address the Triton server, defaults to
        localhost:8000
    """
    global triton_client
    triton_client = tritonhttpclient.InferenceServerClient(url)
    return triton_client
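# A minimal usage sketch, not taken from the original code: "example_model" and
# its single [1, 16] FP32 input "INPUT0" / output "OUTPUT0" are hypothetical
# placeholders. It only shows how the module-level `triton_client` set by
# triton_init() might be used for a request.
def _example_triton_init_usage():
    import numpy as np

    triton_init("localhost:8000")
    inputs = [tritonhttpclient.InferInput("INPUT0", [1, 16], "FP32")]
    inputs[0].set_data_from_numpy(np.ones((1, 16), dtype=np.float32))
    # The global client created by triton_init() issues the inference request.
    result = triton_client.infer("example_model", inputs)
    return result.as_numpy("OUTPUT0")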
def setUp(self):
    self.dtype_ = np.float32
    self.inputs = []
    # 4 sets of inputs with shape [2], [4], [1], [3]
    for value in [2, 4, 1, 3]:
        self.inputs.append([
            tritonhttpclient.InferInput('RAGGED_INPUT', [1, value], "FP32")
        ])
        self.inputs[-1][0].set_data_from_numpy(
            np.full([1, value], value, np.float32))
    self.client = tritonhttpclient.InferenceServerClient(
        url="localhost:8000", concurrency=len(self.inputs))
def test_unregister_before_register(self):
    # Create a valid cuda shared memory region and unregister before register
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
    triton_client.unregister_cuda_shared_memory("dummy_data")
    shm_status = triton_client.get_cuda_shared_memory_status()
    if _protocol == "http":
        self.assertEqual(len(shm_status), 0)
    else:
        self.assertEqual(len(shm_status.regions), 0)
    cshm.destroy_shared_memory_region(shm_op0_handle)
def setUp(self):
    self.data_type_ = np.float32
    # 6 GB divided by the element size gives the tensor shape
    tensor_shape = (math.trunc(
        6 * (1024 * 1024 * 1024) / np.dtype(self.data_type_).itemsize), )
    self.in0_ = np.random.random(tensor_shape).astype(self.data_type_)

    small_tensor_shape = (1, )
    self.sin0_ = np.random.random(small_tensor_shape).astype(self.data_type_)

    self.clients_ = ((httpclient,
                      httpclient.InferenceServerClient('localhost:8000')),
                     (grpcclient,
                      grpcclient.InferenceServerClient('localhost:8001')))
def test_http_infer(self):
    self._prepare_request("http")

    # The model is configured to take three seconds to send the
    # response. Expect an exception for small timeout values.
    with self.assertRaises(socket.timeout) as cm:
        triton_client = httpclient.InferenceServerClient(
            url="localhost:8000", verbose=True, network_timeout=2.0)
        result = triton_client.infer(model_name=self.model_name_,
                                     inputs=self.inputs_,
                                     outputs=self.outputs_)
    self.assertIn("timed out", str(cm.exception))

    # Expect to successfully pass with a sufficiently large timeout
    triton_client = httpclient.InferenceServerClient(
        url="localhost:8000", verbose=True, connection_timeout=10.0)
    result = triton_client.infer(model_name=self.model_name_,
                                 inputs=self.inputs_,
                                 outputs=self.outputs_)

    output0_data = result.as_numpy('OUTPUT0')
    self.assertTrue(np.array_equal(self.input0_data_, output0_data))
def _get_infer_count_per_version(self, model_name):
    triton_client = tritonhttpclient.InferenceServerClient("localhost:8000",
                                                           verbose=True)
    stats = triton_client.get_inference_statistics(model_name)
    self.assertEqual(len(stats["model_stats"]), 2)

    infer_count = [0, 0]
    for model_stat in stats["model_stats"]:
        self.assertEqual(model_stat["name"], model_name,
                         "expected stats for model " + model_name)
        model_version = model_stat['version']
        if model_version == "1":
            infer_count[0] = model_stat["inference_stats"]["success"]["count"]
        elif model_version == "2":
            infer_count[1] = model_stat["inference_stats"]["success"]["count"]
        else:
            self.assertTrue(
                False, "unexpected version {} for model {}".format(
                    model_version, model_name))
    return infer_count
def test_batch_item_shape(self):
    # Use 3 sets of inputs with shape [2, 1, 2], [1, 1, 2], [1, 2, 2].
    # Note that the test only checks the formation of "BATCH_INPUT" where
    # the value of "RAGGED_INPUT" is irrelevant, only the shape matters
    inputs = []
    for value in [[2, 1, 2], [1, 1, 2], [1, 2, 2]]:
        inputs.append(
            [tritonhttpclient.InferInput('RAGGED_INPUT', value, "FP32")])
        inputs[-1][0].set_data_from_numpy(np.full(value, value[0], np.float32))
    client = tritonhttpclient.InferenceServerClient(url="localhost:8000",
                                                    concurrency=len(inputs))

    expected_outputs = [
        np.array([[1.0, 2.0], [1.0, 2.0]]),
        np.array([[1.0, 2.0]]),
        np.array([[2.0, 2.0]]),
    ]

    model_name = "batch_item"
    output_name = 'BATCH_OUTPUT'
    outputs = [tritonhttpclient.InferRequestedOutput(output_name)]

    async_requests = []
    try:
        for request_inputs in inputs:
            # Asynchronous inference call.
            async_requests.append(
                client.async_infer(model_name=model_name,
                                   inputs=request_inputs,
                                   outputs=outputs))

        for idx in range(len(async_requests)):
            # Get the result from the initiated asynchronous inference request.
            # Note the call will block till the server responds.
            result = async_requests[idx].get_result()

            # Validate the results by comparing with precomputed values.
            output_data = result.as_numpy(output_name)
            self.assertTrue(
                np.allclose(output_data, expected_outputs[idx]),
                "Expect response to have value:\n{}, got:\n{}\nEqual matrix:\n{}"
                .format(expected_outputs[idx], output_data,
                        np.isclose(expected_outputs[idx], output_data)))
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
def test_valid_create_set_register(self):
    # Create a valid cuda shared memory region, fill data in it and register
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
    cshm.set_shared_memory_region(shm_op0_handle,
                                  [np.array([1, 2], dtype=np.float32)])
    triton_client.register_cuda_shared_memory(
        "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
    shm_status = triton_client.get_cuda_shared_memory_status()
    if _protocol == "http":
        self.assertEqual(len(shm_status), 1)
    else:
        self.assertEqual(len(shm_status.regions), 1)
    cshm.destroy_shared_memory_region(shm_op0_handle)
def test_unregister_after_register(self):
    # Create a valid system shared memory region and unregister after register
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    shm_op0_handle = shm.create_shared_memory_region("dummy_data",
                                                     "/dummy_data", 8)
    triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8)
    triton_client.unregister_system_shared_memory("dummy_data")
    shm_status = triton_client.get_system_shared_memory_status()
    if _protocol == "http":
        self.assertEqual(len(shm_status), 0)
    else:
        self.assertEqual(len(shm_status.regions), 0)
    shm.destroy_shared_memory_region(shm_op0_handle)
def test_unregisterall(self):
    # Unregister all shared memory blocks
    shm_handles = self._configure_sever()
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    status_before = triton_client.get_cuda_shared_memory_status()
    if _protocol == "http":
        self.assertEqual(len(status_before), 4)
    else:
        self.assertEqual(len(status_before.regions), 4)
    triton_client.unregister_cuda_shared_memory()
    status_after = triton_client.get_cuda_shared_memory_status()
    if _protocol == "http":
        self.assertEqual(len(status_after), 0)
    else:
        self.assertEqual(len(status_after.regions), 0)
    self._cleanup_server(shm_handles)
def test_too_big_shm(self):
    # Shared memory input region larger than needed - throws error
    error_msg = []
    shm_handles = self._configure_sever()
    shm_ip2_handle = cshm.create_shared_memory_region("input2_data", 128, 0)
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    triton_client.register_cuda_shared_memory(
        "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 128)
    self._basic_inference(shm_handles[0], shm_ip2_handle, shm_handles[2],
                          shm_handles[3], error_msg, "input2_data", 128)
    if len(error_msg) > 0:
        self.assertIn(
            "unexpected total byte size 128 for input 'INPUT1', expecting 64",
            error_msg[-1])
    shm_handles.append(shm_ip2_handle)
    self._cleanup_server(shm_handles)
def test_unregister_after_inference(self):
    # Unregister after inference
    error_msg = []
    shm_handles = self._configure_sever()
    self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2],
                          shm_handles[3], error_msg)
    if len(error_msg) > 0:
        raise Exception(str(error_msg))
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    triton_client.unregister_cuda_shared_memory("output0_data")
    shm_status = triton_client.get_cuda_shared_memory_status()
    if _protocol == "http":
        self.assertEqual(len(shm_status), 3)
    else:
        self.assertEqual(len(shm_status.regions), 3)
    self._cleanup_server(shm_handles)
def unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                   precreated_shm_regions, outputs,
                                   use_system_shared_memory,
                                   use_cuda_shared_memory):
    # Unregister and destroy the two input regions, then any output regions
    # that were not pre-created.
    if not (use_system_shared_memory or use_cuda_shared_memory):
        return None

    triton_client = httpclient.InferenceServerClient("localhost:8000")

    if use_cuda_shared_memory:
        triton_client.unregister_cuda_shared_memory(shm_regions[0] + '_data')
        triton_client.unregister_cuda_shared_memory(shm_regions[1] + '_data')
        cudashm.destroy_shared_memory_region(shm_handles[0])
        cudashm.destroy_shared_memory_region(shm_handles[1])
    else:
        triton_client.unregister_system_shared_memory(shm_regions[0] + '_data')
        triton_client.unregister_system_shared_memory(shm_regions[1] + '_data')
        shm.destroy_shared_memory_region(shm_handles[0])
        shm.destroy_shared_memory_region(shm_handles[1])

    if precreated_shm_regions is None:
        i = 0
        if "OUTPUT0" in outputs:
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(shm_regions[2] +
                                                            '_data')
                cudashm.destroy_shared_memory_region(shm_handles[2])
            else:
                triton_client.unregister_system_shared_memory(shm_regions[2] +
                                                              '_data')
                shm.destroy_shared_memory_region(shm_handles[2])
            i += 1
        if "OUTPUT1" in outputs:
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(
                    shm_regions[2 + i] + '_data')
                cudashm.destroy_shared_memory_region(shm_handles[3])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_regions[2 + i] + '_data')
                shm.destroy_shared_memory_region(shm_handles[3])
def test_basic(self):
    try:
        for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]:
            model_name = "graphdef_int32_int8_int8"
            extensions = [
                'classification', 'sequence', 'model_repository',
                'schedule_policy', 'model_configuration',
                'system_shared_memory', 'cuda_shared_memory',
                'binary_tensor_data', 'statistics'
            ]
            if pair[1] == "http":
                triton_client = httpclient.InferenceServerClient(url=pair[0],
                                                                 verbose=True)
            else:
                triton_client = grpcclient.InferenceServerClient(url=pair[0],
                                                                 verbose=True)

            self.assertTrue(triton_client.is_server_live())
            self.assertTrue(triton_client.is_server_ready())

            server_metadata = triton_client.get_server_metadata()
            model_metadata = triton_client.get_model_metadata(model_name)
            if pair[1] == "http":
                self.assertEqual(os.environ["TRITON_SERVER_VERSION"],
                                 server_metadata['version'])
                self.assertEqual("triton", server_metadata['name'])
                for ext in extensions:
                    self.assertIn(ext, server_metadata['extensions'])
                self.assertEqual(model_name, model_metadata['name'])
            else:
                self.assertEqual(os.environ["TRITON_SERVER_VERSION"],
                                 server_metadata.version)
                self.assertEqual("triton", server_metadata.name)
                for ext in extensions:
                    self.assertIn(ext, server_metadata.extensions)
                self.assertEqual(model_name, model_metadata.name)
    except InferenceServerException as ex:
        self.assertTrue(False, "unexpected error {}".format(ex))
def _test_helper(self, modelVersion, tag, sig_def):
    shape = [self.dims]
    model_name = self.base_model_name + str(modelVersion)
    # The multiplier is defined during model creation. See
    # server/qa/common/gen_tag_sigdef.py for details
    multiplier = modelVersion + 1
    output_name = "OUTPUT"
    triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                     verbose=True)

    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput('INPUT', shape, "FP32"))

    input_data = np.ones(shape=shape).astype(np.float32)
    inputs[0].set_data_from_numpy(input_data, binary_data=True)

    outputs.append(
        httpclient.InferRequestedOutput(output_name, binary_data=True))

    results = triton_client.infer(model_name, inputs, outputs=outputs)
    output_data = results.as_numpy(output_name)
    test_output = input_data * multiplier
    self.assertTrue(np.isclose(output_data, test_output).all())
def test_reregister_after_register(self):
    # Create a valid cuda shared memory region and re-register after register
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
    triton_client.register_cuda_shared_memory(
        "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
    try:
        triton_client.register_cuda_shared_memory(
            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
    except Exception as ex:
        self.assertIn("shared memory region 'dummy_data' already in manager",
                      str(ex))
    shm_status = triton_client.get_cuda_shared_memory_status()
    if _protocol == "http":
        self.assertEqual(len(shm_status), 1)
    else:
        self.assertEqual(len(shm_status.regions), 1)
    cshm.destroy_shared_memory_region(shm_op0_handle)