def __init__(self, server_url, ssl_options={}):
    """
    Parameters
    ----------
    server_url : str
        The url for Triton server's GRPC endpoint
    ssl_options : dict
        Dictionary of SSL options for gRPC python client
    """
    ssl = False
    root_certificates = None
    private_key = None
    certificate_chain = None
    if 'ssl-grpc-use-ssl' in ssl_options:
        ssl = ssl_options['ssl-grpc-use-ssl'].lower() == 'true'
    if 'ssl-grpc-root-certifications-file' in ssl_options:
        root_certificates = ssl_options['ssl-grpc-root-certifications-file']
    if 'ssl-grpc-private-key-file' in ssl_options:
        private_key = ssl_options['ssl-grpc-private-key-file']
    if 'ssl-grpc-certificate-chain-file' in ssl_options:
        certificate_chain = ssl_options['ssl-grpc-certificate-chain-file']
    self._client = grpcclient.InferenceServerClient(
        url=server_url,
        ssl=ssl,
        root_certificates=root_certificates,
        private_key=private_key,
        certificate_chain=certificate_chain)
def __init__(
    self,
    _algorithm_name,
    _server_url,
    _server_port,
    _model_name,
    _model_version,
):
    self.name = _algorithm_name
    self.type_name = 'triton'
    self.target_url = '%s:%s' % (_server_url, _server_port)
    self.model_name = _model_name
    self.model_version = str(_model_version)
    try:
        # Newer versions of the Triton client support send/receive message
        # lengths above 1GB, which is more than enough for our use case.
        # self.triton_client = CustomInferenceServerClient(url=self.target_url)
        self.triton_client = grpcclient.InferenceServerClient(
            url=self.target_url)
    except Exception as e:
        raise TritonServerCannotConnectException(
            f'triton server {self.target_url} connect fail')
    if not self.triton_client.is_server_ready():
        raise TritonServerNotReadyException(
            f'triton server {self.target_url} not ready')
def test_grpc_get_settings(self):
    # Model trace settings will be the same as global trace settings since
    # no update has been made.
    initial_settings = grpcclient.service_pb2.TraceSettingResponse()
    json_format.Parse(
        json.dumps({
            "settings": {
                "trace_file": {
                    "value": ["global_unittest.log"]
                },
                "trace_level": {
                    "value": ["TIMESTAMPS"]
                },
                "trace_rate": {
                    "value": ["1"]
                },
                "trace_count": {
                    "value": ["-1"]
                },
                "log_frequency": {
                    "value": ["0"]
                },
            }
        }), initial_settings)

    triton_client = grpcclient.InferenceServerClient("localhost:8001")
    self.assertEqual(initial_settings,
                     triton_client.get_trace_settings(model_name="simple"),
                     "Unexpected initial model trace settings")
    self.assertEqual(initial_settings, triton_client.get_trace_settings(),
                     "Unexpected initial global settings")
def setUp(self):
    global _deferred_exceptions
    _deferred_exceptions = []

    # The helper client for setup will be GRPC for simplicity.
    self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001")

    self.model_name_ = 'identity_2_float32'
    # This will not be changed even when ensemble is under test,
    # as the dynamic batching is performed within the composing model
    self.check_status_model = 'identity_2_float32'
    self.tensor_shape_ = (1, 1)
    self.inputs_ = {
        "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"),
        "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32")
    }
    self.input_data_ = {
        "INPUT0": np.ones(shape=(1, 1), dtype=np.float32),
        "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32)
    }
    self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"])
    self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"])
    self.outputs_ = {
        "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'),
        "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1')
    }
def get_result(url, model_name, x):
    try:
        triton_client = grpcclient.InferenceServerClient(url=url,
                                                         verbose=False,
                                                         ssl=False)
        print("Channel creation success")
    except Exception as e:
        # Without a client there is nothing left to do.
        print("channel creation failed: " + str(e))
        return None

    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('input0', x.shape, "FP32"))
    input0_data = x
    print("X Shape : ", x.shape)
    inputs[0].set_data_from_numpy(input0_data)
    outputs.append(grpcclient.InferRequestedOutput('output0'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    output0_data = results.as_numpy('output0')
    output0_data = sigmoid(output0_data.squeeze())
    print(output0_data)
    return output0_data
def _configure_sever(self):
    shm_ip0_handle = shm.create_shared_memory_region("input0_data",
                                                     "/input0_data", 64)
    shm_ip1_handle = shm.create_shared_memory_region("input1_data",
                                                     "/input1_data", 64)
    shm_op0_handle = shm.create_shared_memory_region("output0_data",
                                                     "/output0_data", 64)
    shm_op1_handle = shm.create_shared_memory_region("output1_data",
                                                     "/output1_data", 64)
    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
    input1_data = np.ones(shape=16, dtype=np.int32)
    shm.set_shared_memory_region(shm_ip0_handle, [input0_data])
    shm.set_shared_memory_region(shm_ip1_handle, [input1_data])
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    triton_client.register_system_shared_memory("input0_data", "/input0_data",
                                                64)
    triton_client.register_system_shared_memory("input1_data", "/input1_data",
                                                64)
    triton_client.register_system_shared_memory("output0_data",
                                                "/output0_data", 64)
    triton_client.register_system_shared_memory("output1_data",
                                                "/output1_data", 64)
    return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle]
def connect_to_server(server_host, server_port, model_name, model_version):
    url = f"{server_host}:{server_port}"
    verbose = False
    # connection = {"text": f"Connected to server {server_host}:{server_port}, model: {model_name}, "
    #               f"version: {model_version}", "success": True}
    connection = {"text": "", "success": False}
    try:
        triton_client = grpcclient.InferenceServerClient(url=url,
                                                         verbose=verbose)
        connection = {
            "text": f"Connected to server {server_host}:{server_port}, "
                    f"model: {model_name}, version: {model_version}",
            "success": True
        }
    except Exception as e:
        connection = {
            "text": f"Client creation failed {server_host}:{server_port}",
            "success": False
        }
        return connection

    try:
        model_metadata = triton_client.get_model_metadata(
            model_name=model_name, model_version=model_version)
    except InferenceServerException as e:
        connection = {
            "text": f"Failed to retrieve the metadata: "
                    f"{server_host}:{server_port}, model: {model_name}, "
                    f"version: {model_version}",
            "success": False
        }
        return connection

    try:
        model_config = triton_client.get_model_config(
            model_name=model_name, model_version=model_version)
    except InferenceServerException as e:
        connection = {
            "text": f"Failed to retrieve the config: "
                    f"{server_host}:{server_port}, model: {model_name}, "
                    f"version: {model_version}",
            "success": False
        }
        return connection

    return connection
def test_decoupled_bls(self):
    # Test combinations of BLS and decoupled API in Python backend.
    model_name = "decoupled_bls"
    shape = [1, 2]
    user_data = UserData()
    with grpcclient.InferenceServerClient("localhost:8001") as triton_client:
        triton_client.start_stream(callback=partial(callback, user_data))
        input_datas = []
        input_data = np.random.randn(*shape).astype(np.float32)
        input_datas.append(input_data)
        inputs = [
            grpcclient.InferInput("IN", input_data.shape,
                                  np_to_triton_dtype(input_data.dtype))
        ]
        inputs[0].set_data_from_numpy(input_data)
        triton_client.async_stream_infer(model_name=model_name, inputs=inputs)

        # Check the results of the decoupled model using BLS
        def check_result(result):
            # Make sure the result is not an exception
            self.assertIsNot(type(result), InferenceServerException)
            output_data = result.as_numpy("OUT")
            self.assertIsNotNone(output_data, "error: expected 'OUT'")
            self.assertTrue(
                np.array_equal(output_data, input_data),
                "error: expected output {} to match input {}".format(
                    output_data, input_data))

        result = user_data._completed_requests.get()
        check_result(result)
def _basic_inference(self,
                     shm_ip0_handle,
                     shm_ip1_handle,
                     shm_op0_handle,
                     shm_op1_handle,
                     error_msg,
                     big_shm_name="",
                     big_shm_size=64):
    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
    input1_data = np.ones(shape=16, dtype=np.int32)
    inputs = []
    outputs = []
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
        inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
        inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
        inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

    inputs[0].set_shared_memory("input0_data", 64)

    if type(shm_ip1_handle) == np.array:
        inputs[1].set_data_from_numpy(input0_data, binary_data=True)
    elif big_shm_name != "":
        inputs[1].set_shared_memory(big_shm_name, big_shm_size)
    else:
        inputs[1].set_shared_memory("input1_data", 64)

    outputs[0].set_shared_memory("output0_data", 64)
    outputs[1].set_shared_memory("output1_data", 64)

    try:
        results = triton_client.infer("simple",
                                      inputs,
                                      model_version="",
                                      outputs=outputs)
        output = results.get_output('OUTPUT0')
        if _protocol == "http":
            output_datatype = output['datatype']
            output_shape = output['shape']
        else:
            output_datatype = output.datatype
            output_shape = output.shape
        output_dtype = utils.triton_to_np_dtype(output_datatype)
        output_data = shm.get_contents_as_numpy(shm_op0_handle, output_dtype,
                                                output_shape)
        self.assertTrue((output_data[0] == (input0_data + input1_data)).all(),
                        "Model output does not match expected output")
    except Exception as ex:
        error_msg.append(str(ex))
def __init__(self, server_url):
    """
    Parameters
    ----------
    server_url : str
        The url for Triton server's GRPC endpoint
    """
    self._client = grpcclient.InferenceServerClient(url=server_url)
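A wrapper like the one above only constructs the client; a minimal sketch of how its underlying client could be driven for a single inference is shown below. The model name "my_model" and the tensor names "INPUT0"/"OUTPUT0" are illustrative assumptions, not taken from the original.

import numpy as np
import tritonclient.grpc as grpcclient

# Hypothetical usage of the wrapper's underlying gRPC client; model and
# tensor names are assumptions for illustration only.
client = grpcclient.InferenceServerClient(url="localhost:8001")

input_data = np.ones((1, 16), dtype=np.float32)
inputs = [grpcclient.InferInput("INPUT0", input_data.shape, "FP32")]
inputs[0].set_data_from_numpy(input_data)
outputs = [grpcclient.InferRequestedOutput("OUTPUT0")]

result = client.infer(model_name="my_model", inputs=inputs, outputs=outputs)
print(result.as_numpy("OUTPUT0"))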
def main(args):
    client = t_client.InferenceServerClient(url=args.url)
    if args.action in ['reload', 'unload']:
        client.unload_model(args.model)
        print('Successfully unloaded model', args.model)
    if args.action in ['reload', 'load']:
        client.load_model(args.model)
        print('Successfully loaded model', args.model)
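A short sketch of the argument parsing that could feed main() above; the flag names and defaults are assumptions and not taken from the original script.

# Hypothetical CLI entry point for main(); flag names and defaults are
# assumptions for illustration only.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Load/unload a Triton model')
    parser.add_argument('--url', default='localhost:8001',
                        help='Triton gRPC endpoint')
    parser.add_argument('--model', required=True, help='Model name')
    parser.add_argument('--action', choices=['load', 'unload', 'reload'],
                        default='reload')
    main(parser.parse_args())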
def _run_unittest(self, model_name):
    with grpcclient.InferenceServerClient("localhost:8001") as client:
        # No input is required
        result = client.infer(model_name, [], client_timeout=120)
        output0 = result.as_numpy('OUTPUT0')

        # The model returns 1 if the tests were successfully passed.
        # Otherwise, it will return 0.
        self.assertEqual(output0, [1])
def _test_no_outputs_helper(self,
                            use_grpc=True,
                            use_http=True,
                            use_streaming=True):
    if use_grpc:
        triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
                                                         verbose=True)
        self._prepare_request("grpc")
        result = triton_client.infer(model_name=self.model_name_,
                                     inputs=self.inputs_,
                                     outputs=self.outputs_,
                                     client_timeout=1)
        # The response should not contain any outputs
        self.assertEqual(result.as_numpy('OUTPUT0'), None)

    if use_http:
        triton_client = httpclient.InferenceServerClient(url="localhost:8000",
                                                         verbose=True,
                                                         network_timeout=2.0)
        self._prepare_request("http")
        result = triton_client.infer(model_name=self.model_name_,
                                     inputs=self.inputs_,
                                     outputs=self.outputs_)
        # The response should not contain any outputs
        self.assertEqual(result.as_numpy('OUTPUT0'), None)

    if use_streaming:
        triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
                                                         verbose=True)
        self._prepare_request("grpc")
        user_data = UserData()
        triton_client.stop_stream()
        triton_client.start_stream(callback=partial(callback, user_data),
                                   stream_timeout=1)
        triton_client.async_stream_infer(model_name=self.model_name_,
                                         inputs=self.inputs_,
                                         outputs=self.outputs_)
        result = user_data._completed_requests.get()
        if type(result) == InferenceServerException:
            raise result
        # The response should not contain any outputs
        self.assertEqual(result.as_numpy('OUTPUT0'), None)
def test_python_unittest(self):
    model_name = os.environ['MODEL_NAME']
    with grpcclient.InferenceServerClient("localhost:8001") as client:
        # No input is required
        result = client.infer(model_name, [], client_timeout=120)
        output0 = result.as_numpy('OUTPUT0')

        # The model returns 1 if the tests were successfully passed.
        # Otherwise, it will return 0.
        self.assertTrue(output0 == [1])
def __init__(self, url, model_name):
    self.triton_client = grpcclient.InferenceServerClient(url)
    self.model_name = model_name

    assert self.triton_client.is_server_ready()
    assert self.triton_client.is_model_ready(model_name)

    self.model_metadata = self.triton_client.get_model_metadata(model_name)
    self.model_config = self.triton_client.get_model_config(model_name)
    self.inputs_specs, self.outputs_specs, self.max_batch_size = \
        self.parse_model_grpc()
def connectServer(url):
    try:
        triton_client = grpcclient.InferenceServerClient(url=url,
                                                         verbose=False,
                                                         ssl=False)
        print("Channel creation success")
    except Exception as e:
        triton_client = None
        print("channel creation failed: " + str(e))
    return triton_client
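A hedged example of how connectServer() might be used, checking server readiness before issuing any requests; the endpoint is an assumption.

# Hypothetical caller for connectServer(); the endpoint is an assumption.
client = connectServer("localhost:8001")
if client is not None and client.is_server_ready():
    # List the models known to the server as a quick smoke test.
    print(client.get_model_repository_index())
else:
    print("Triton server is not reachable or not ready")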
def run_triton_server(modelpath,
                      model_name,
                      triton_server_path,
                      device_id="0",
                      backend="tensorflow",
                      ps_path=None):
    import tritonclient
    import tritonclient.grpc as grpcclient

    if backend == "tensorflow":
        backend_config = "tensorflow,version=2"
    elif backend == "hugectr":
        backend_config = "hugectr,ps=" + ps_path
    else:
        raise ValueError("unknown backend:" + backend)

    cmdline = [
        triton_server_path,
        "--model-repository",
        modelpath,
        "--backend-config",
        backend_config,
        "--model-control-mode=explicit",
        "--load-model",
        model_name,
    ]
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = device_id

    with subprocess.Popen(cmdline, env=env) as process:
        try:
            with grpcclient.InferenceServerClient("localhost:8001") as client:
                # wait until server is ready
                for _ in range(60):
                    if process.poll() is not None:
                        retcode = process.returncode
                        raise RuntimeError(
                            f"Tritonserver failed to start (ret={retcode})")

                    try:
                        ready = client.is_server_ready()
                    except tritonclient.utils.InferenceServerException:
                        ready = False

                    if ready:
                        yield client
                        return

                    time.sleep(1)

                raise RuntimeError(
                    "Timed out waiting for tritonserver to become ready")
        finally:
            # signal triton to shutdown
            process.send_signal(signal.SIGINT)
def check_ready(self):
    try:
        # Newer versions of the Triton client support send/receive message
        # lengths above 1GB, which is more than enough for our use case.
        self.triton_client = grpcclient.InferenceServerClient(
            url=self.target_url)
    except Exception as e:
        raise TritonServerCannotConnectException(
            f'triton server {self.target_url} connect fail')
    if not self.triton_client.is_server_ready():
        raise TritonServerNotReadyException(
            f'triton server {self.target_url} not ready')
def test_grpc(self):
    triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
    inputs = []
    inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8"))
    inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

    try:
        triton_client.infer(model_name="query", inputs=inputs)
        self.assertTrue(False, "expect error with query information")
    except InferenceServerException as ex:
        self.assertTrue("OUTPUT0 CPU 0" in ex.message())
        self.assertTrue("OUTPUT1 CPU 0" in ex.message())
def __init__(self):
    # Create server context
    try:
        triton_client = grpcclient.InferenceServerClient(
            url=self.url,
            verbose=self.verbose,
            ssl=self.ssl,
            root_certificates=self.root_certificates,
            private_key=self.private_key,
            certificate_chain=self.certificate_chain)
    except Exception as e:
        print("context creation failed: " + str(e))
        sys.exit()

    # Health check
    if not triton_client.is_server_live():
        print("FAILED : is_server_live")
        sys.exit(1)

    if not triton_client.is_server_ready():
        print("FAILED : is_server_ready")
        sys.exit(1)

    if not triton_client.is_model_ready(self.model):
        print("FAILED : is_model_ready", self.model)
        sys.exit(1)

    try:
        metadata = triton_client.get_model_metadata(self.model)
        print(metadata)
    except InferenceServerException as ex:
        if "Request for unknown model" not in ex.message():
            print("FAILED : get_model_metadata")
            print("Got: {}".format(ex.message()))
            sys.exit(1)
        else:
            print("FAILED : get_model_metadata")
            sys.exit(1)

    # Model configuration
    try:
        config = triton_client.get_model_config(self.model)
        if not (config.config.name == self.model):
            print("FAILED: get_model_config")
            sys.exit(1)
        # print(config)
    except InferenceServerException as ex:
        print("FAILED : get_model_config")
        print("Got: {}".format(ex.message()))
        sys.exit(1)

    self.triton_client = triton_client
def __init__(self,
             model_name: str,
             input_names: Sequence[str],
             output_names: Sequence[str],
             url,
             concurrency=1,
             verbose=False):
    self.client = t_client.InferenceServerClient(url=url, verbose=verbose)
    self.input_names = input_names
    self.output_names = output_names
    self.concurrency = concurrency
    self.model_name = model_name
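The constructor above only stores the tensor names; a sketch of how they could be turned into a request is given below. The method name predict, the assumption that inputs arrive as numpy arrays in the same order as input_names, and the dict-of-outputs return value are all illustrative, not part of the original class.

# Hypothetical method for the wrapper above; name, argument shape, and
# return format are assumptions for illustration only.
import tritonclient.grpc as t_client
from tritonclient.utils import np_to_triton_dtype

def predict(self, arrays):
    """Run one inference; `arrays` pairs up with self.input_names."""
    inputs = []
    for name, array in zip(self.input_names, arrays):
        infer_input = t_client.InferInput(name, array.shape,
                                          np_to_triton_dtype(array.dtype))
        infer_input.set_data_from_numpy(array)
        inputs.append(infer_input)
    outputs = [t_client.InferRequestedOutput(n) for n in self.output_names]
    result = self.client.infer(model_name=self.model_name,
                               inputs=inputs,
                               outputs=outputs)
    return {n: result.as_numpy(n) for n in self.output_names}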
def single_job(client_files):
    with grpcclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose) as triton_client:
        protocol_client = grpcclient
        speech_client = SpeechClient(triton_client, FLAGS.model_name,
                                     protocol_client)
        idx, audio_files = client_files
        predictions = []
        for li in audio_files:
            result = speech_client.recognize(li, idx)
            print("Recognized {}:{}".format(li, result[0]))
            predictions += result
    return predictions
def set_client(self):
    try:
        url = self.host + ':' + str(self.port)
        client = grpcclient.InferenceServerClient(url=url,
                                                  verbose=False,
                                                  ssl=False,
                                                  root_certificates=None,
                                                  private_key=None,
                                                  certificate_chain=None)
    except Exception as e:
        print("channel create failed: " + str(e))
        sys.exit()
    return client
def __init__(self, config):
    """
    Parameters
    ----------
    config : TritonClientConfig
        A config with the relevant client options
    """
    self._client_config = config
    assert self._client_config['url'], \
        "Must specify url in client config."
    self._client = grpcclient.InferenceServerClient(
        url=self._client_config['url'])
def __init__(self,
             model='retinaface_r50_v1',
             output_order=None,
             triton_uri='localhost:8001',
             model_version='1',
             **kwargs):
    self.model_name = model
    self.model_version = model_version
    self.url = triton_uri
    self.input_shape = (1, 3, 640, 640)
    self.input_dtype = np.float32
    self.output_order = output_order
    self.triton_client = grpcclient.InferenceServerClient(url=triton_uri)
def test_unregister_before_register(self):
    # Create a valid system shared memory region and unregister before register
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    shm_op0_handle = shm.create_shared_memory_region("dummy_data",
                                                     "/dummy_data", 8)
    triton_client.unregister_system_shared_memory("dummy_data")
    shm_status = triton_client.get_system_shared_memory_status()
    if _protocol == "http":
        self.assertTrue(len(shm_status) == 0)
    else:
        self.assertTrue(len(shm_status.regions) == 0)
    shm.destroy_shared_memory_region(shm_op0_handle)
def __init__(self,
             rec_name='arcface_r100_v1',
             triton_uri='localhost:8001',
             model_version='1',
             input_mean: float = 0.,
             input_std: float = 1.,
             **kwargs):
    self.model_name = rec_name
    self.model_version = model_version
    self.url = triton_uri
    self.input_shape = None
    self.max_batch_size = 1
    self.input_mean = input_mean
    self.input_std = input_std
    self.triton_client = grpcclient.InferenceServerClient(url=triton_uri)
def test_valid_create_set_register(self):
    # Create a valid system shared memory region, fill data in it and register
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    shm_op0_handle = shm.create_shared_memory_region("dummy_data",
                                                     "/dummy_data", 8)
    shm.set_shared_memory_region(shm_op0_handle,
                                 [np.array([1, 2], dtype=np.float32)])
    triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8)
    shm_status = triton_client.get_system_shared_memory_status()
    if _protocol == "http":
        self.assertTrue(len(shm_status) == 1)
    else:
        self.assertTrue(len(shm_status.regions) == 1)
    shm.destroy_shared_memory_region(shm_op0_handle)
def __init__(self, triton_host, fpsec=None, batch_size=16, resize=(1280, 800)):
    """Constructor for DetectionPipeline class.

    Keyword Arguments:
        triton_host {str} -- URL of the Triton server's gRPC endpoint.
        fpsec {int} -- Frames per second to sample from the video. These will
            be evenly spaced throughout the video. If not specified (i.e.,
            None), all frames will be loaded. (default: {None})
        batch_size {int} -- Batch size to use with detector. (default: {16})
        resize {tuple} -- Dimensions to resize frames to before inference.
            (default: {(1280, 800)})
    """
    self.triton_client = triton.InferenceServerClient(url=triton_host)
    self.fpsec = fpsec
    self.batch_size = batch_size
    self.resize = resize
def test_unregisterall(self):
    # Unregister all shared memory blocks
    shm_handles = self._configure_sever()
    if _protocol == "http":
        triton_client = httpclient.InferenceServerClient(_url, verbose=True)
    else:
        triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
    status_before = triton_client.get_system_shared_memory_status()
    if _protocol == "http":
        self.assertTrue(len(status_before) == 4)
    else:
        self.assertTrue(len(status_before.regions) == 4)
    triton_client.unregister_system_shared_memory()
    status_after = triton_client.get_system_shared_memory_status()
    if _protocol == "http":
        self.assertTrue(len(status_after) == 0)
    else:
        self.assertTrue(len(status_after.regions) == 0)
    self._cleanup_server(shm_handles)