def __init__(self, server_url, ssl_options=None):
        """
        Parameters
        ----------
        server_url : str
            The url for Triton server's GRPC endpoint
        ssl_options : dict, optional
            Dictionary of SSL options for the gRPC Python client
        """

        if ssl_options is None:
            ssl_options = {}

        ssl = False
        root_certificates = None
        private_key = None
        certificate_chain = None

        if 'ssl-grpc-use-ssl' in ssl_options:
            ssl = ssl_options['ssl-grpc-use-ssl'].lower() == 'true'
        if 'ssl-grpc-root-certifications-file' in ssl_options:
            root_certificates = ssl_options[
                'ssl-grpc-root-certifications-file']
        if 'ssl-grpc-private-key-file' in ssl_options:
            private_key = ssl_options['ssl-grpc-private-key-file']
        if 'ssl-grpc-certificate-chain-file' in ssl_options:
            certificate_chain = ssl_options['ssl-grpc-certificate-chain-file']

        self._client = grpcclient.InferenceServerClient(
            url=server_url,
            ssl=ssl,
            root_certificates=root_certificates,
            private_key=private_key,
            certificate_chain=certificate_chain)
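
For reference, a minimal usage sketch of the constructor above. The wrapper class name GrpcClientWrapper and the certificate paths are placeholders; the option keys are the ones handled in the code.

# Hypothetical usage; "GrpcClientWrapper" stands in for whatever class owns
# the __init__ above, and the certificate paths are placeholders.
ssl_options = {
    'ssl-grpc-use-ssl': 'true',
    'ssl-grpc-root-certifications-file': '/path/to/ca.pem',
    'ssl-grpc-private-key-file': '/path/to/client.key',
    'ssl-grpc-certificate-chain-file': '/path/to/client.pem',
}
client_wrapper = GrpcClientWrapper('localhost:8001', ssl_options=ssl_options)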
Example #2
    def __init__(
        self,
        _algorithm_name,
        _server_url,
        _server_port,
        _model_name,
        _model_version,
    ):
        self.name = _algorithm_name
        self.type_name = 'triton'
        self.target_url = f'{_server_url}:{_server_port}'
        self.model_name = _model_name
        self.model_version = str(_model_version)

        try:
            # Newer versions of the Triton client support send/receive message
            # lengths above 1 GB, which is more than enough here.
            # self.triton_client = CustomInferenceServerClient(url=self.target_url)
            self.triton_client = grpcclient.InferenceServerClient(
                url=self.target_url)
        except Exception as e:
            raise TritonServerCannotConnectException(
                f'triton server {self.target_url} connect fail') from e
        if not self.triton_client.is_server_ready():
            raise TritonServerNotReadyException(
                f'triton server {self.target_url} not ready')
    def test_grpc_get_settings(self):
        # Model trace settings will be the same as global trace settings since
        # no update has been made.
        initial_settings = grpcclient.service_pb2.TraceSettingResponse()
        json_format.Parse(
            json.dumps({
                "settings": {
                    "trace_file": {
                        "value": ["global_unittest.log"]
                    },
                    "trace_level": {
                        "value": ["TIMESTAMPS"]
                    },
                    "trace_rate": {
                        "value": ["1"]
                    },
                    "trace_count": {
                        "value": ["-1"]
                    },
                    "log_frequency": {
                        "value": ["0"]
                    },
                }
            }), initial_settings)

        triton_client = grpcclient.InferenceServerClient("localhost:8001")
        self.assertEqual(initial_settings,
                         triton_client.get_trace_settings(model_name="simple"),
                         "Unexpected initial model trace settings")
        self.assertEqual(initial_settings, triton_client.get_trace_settings(),
                         "Unexpected initial global settings")
Example #4
    def setUp(self):
        global _deferred_exceptions
        _deferred_exceptions = []

        # The helper client for setup will be GRPC for simplicity.
        self.triton_client_ = grpcclient.InferenceServerClient(
            "localhost:8001")
        self.model_name_ = 'identity_2_float32'
        # This will not be changed even when ensemble is under test,
        # as the dynamic batching is performed within the composing model
        self.check_status_model = 'identity_2_float32'
        self.tensor_shape_ = (1, 1)
        self.inputs_ = {
            "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"),
            "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32")
        }
        self.input_data_ = {
            "INPUT0": np.ones(shape=(1, 1), dtype=np.float32),
            "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32)
        }
        self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"])
        self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"])
        self.outputs_ = {
            "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'),
            "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1')
        }
def get_result(url, model_name, x):
    try:
        triton_client = grpcclient.InferenceServerClient(url=url,
                                                         verbose=False,
                                                         ssl=False)
        print("Channel creation success")
    except Exception as e:
        print("channel creation failed: " + str(e))
        return None

    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('input0', x.shape, "FP32"))
    input0_data = x
    print("X Shape : ", x.shape)
    inputs[0].set_data_from_numpy(input0_data)
    outputs.append(grpcclient.InferRequestedOutput('output0'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)
    output0_data = results.as_numpy('output0')
    output0_data = sigmoid(output0_data.squeeze())
    print(output0_data)
    return output0_data
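
A hedged usage sketch for get_result above; the URL, model name, and input shape are placeholders and must match whatever model is actually deployed.

# Hypothetical call to get_result; url, model name and input shape are
# placeholders for the real deployment.
x = np.random.rand(1, 3, 224, 224).astype(np.float32)
scores = get_result("localhost:8001", "my_model", x)
print(scores)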
 def _configure_server(self):
     shm_ip0_handle = shm.create_shared_memory_region(
         "input0_data", "/input0_data", 64)
     shm_ip1_handle = shm.create_shared_memory_region(
         "input1_data", "/input1_data", 64)
     shm_op0_handle = shm.create_shared_memory_region(
         "output0_data", "/output0_data", 64)
     shm_op1_handle = shm.create_shared_memory_region(
         "output1_data", "/output1_data", 64)
     input0_data = np.arange(start=0, stop=16, dtype=np.int32)
     input1_data = np.ones(shape=16, dtype=np.int32)
     shm.set_shared_memory_region(shm_ip0_handle, [input0_data])
     shm.set_shared_memory_region(shm_ip1_handle, [input1_data])
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     triton_client.register_system_shared_memory("input0_data",
                                                 "/input0_data", 64)
     triton_client.register_system_shared_memory("input1_data",
                                                 "/input1_data", 64)
     triton_client.register_system_shared_memory("output0_data",
                                                 "/output0_data", 64)
     triton_client.register_system_shared_memory("output1_data",
                                                 "/output1_data", 64)
     return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle]
def connect_to_server(server_host, server_port, model_name, model_version):
    url = f"{server_host}:{server_port}"
    verbose = False
    # connection = {"text": f"Connected to server {server_host}:{server_port}, model: {model_name}, "
    #                       f"version: {model_version}", "success": True}
    connection = {"text": "", "success": False}
    try:
        triton_client = grpcclient.InferenceServerClient(
            url=url, verbose=verbose)
        connection = {"text": f"Connected to server {server_host}:{server_port}, model: {model_name}, "
                              f"version: {model_version}", "success": True}
    except Exception as e:
        connection = {"text": f"Client creation failed {server_host}:{server_port}", "success": False}
        return connection
    try:
        model_metadata = triton_client.get_model_metadata(
            model_name=model_name, model_version=model_version)
    except InferenceServerException as e:
        connection = {"text": f"Failed to retrieve the metadata: {server_host}:{server_port}, model: {model_name}, "
                              f"version: {model_version}", "success": False}
        return connection

    try:
        model_config = triton_client.get_model_config(
            model_name=model_name, model_version=model_version)
    except InferenceServerException as e:
        connection = {"text": f"Failed to retrieve the config: {server_host}:{server_port}, model: {model_name}, "
                             f"version: {model_version}", "success": False}
        return connection

    return connection
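
A short sketch of how the status dictionary returned by connect_to_server might be consumed; host, port, model name, and version are placeholders.

# Hypothetical caller of connect_to_server; all argument values are placeholders.
connection = connect_to_server("localhost", 8001, "my_model", "1")
if connection["success"]:
    print(connection["text"])
else:
    raise RuntimeError(connection["text"])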
    def test_decoupled_bls(self):
        # Test combinations of BLS and decoupled API in Python backend.
        model_name = "decoupled_bls"
        shape = [1, 2]
        user_data = UserData()
        with grpcclient.InferenceServerClient(
                "localhost:8001") as triton_client:
            triton_client.start_stream(callback=partial(callback, user_data))

            input_datas = []
            input_data = np.random.randn(*shape).astype(np.float32)
            input_datas.append(input_data)
            inputs = [
                grpcclient.InferInput("IN", input_data.shape,
                                      np_to_triton_dtype(input_data.dtype))
            ]
            inputs[0].set_data_from_numpy(input_data)
            triton_client.async_stream_infer(model_name=model_name,
                                             inputs=inputs)

            # Check the results of the decoupled model using BLS
            def check_result(result):
                # Make sure the result is not an exception
                self.assertIsNot(type(result), InferenceServerException)

                output_data = result.as_numpy("OUT")
                self.assertIsNotNone(output_data, "error: expected 'OUT'")
                self.assertTrue(
                    np.array_equal(output_data, input_data),
                    "error: expected output {} to match input {}".format(
                        output_data, input_data))

            result = user_data._completed_requests.get()
            check_result(result)
    def _basic_inference(self,
                         shm_ip0_handle,
                         shm_ip1_handle,
                         shm_op0_handle,
                         shm_op1_handle,
                         error_msg,
                         big_shm_name="",
                         big_shm_size=64):
        input0_data = np.arange(start=0, stop=16, dtype=np.int32)
        input1_data = np.ones(shape=16, dtype=np.int32)
        inputs = []
        outputs = []
        if _protocol == "http":
            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
            inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
        else:
            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
            inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

        inputs[0].set_shared_memory("input0_data", 64)

        if isinstance(shm_ip1_handle, np.ndarray):
            inputs[1].set_data_from_numpy(input0_data, binary_data=True)
        elif big_shm_name != "":
            inputs[1].set_shared_memory(big_shm_name, big_shm_size)
        else:
            inputs[1].set_shared_memory("input1_data", 64)

        outputs[0].set_shared_memory("output0_data", 64)
        outputs[1].set_shared_memory("output1_data", 64)

        try:
            results = triton_client.infer("simple",
                                          inputs,
                                          model_version="",
                                          outputs=outputs)
            output = results.get_output('OUTPUT0')
            if _protocol == "http":
                output_datatype = output['datatype']
                output_shape = output['shape']
            else:
                output_datatype = output.datatype
                output_shape = output.shape
            output_dtype = utils.triton_to_np_dtype(output_datatype)
            output_data = shm.get_contents_as_numpy(shm_op0_handle,
                                                    output_dtype, output_shape)
            self.assertTrue(
                (output_data[0] == (input0_data + input1_data)).all(),
                "Model output does not match expected output")
        except Exception as ex:
            error_msg.append(str(ex))
Example #10
    def __init__(self, server_url):
        """
        Parameters
        ----------
        server_url : str
            The url for Triton server's GRPC endpoint
        """

        self._client = grpcclient.InferenceServerClient(url=server_url)
def main(args):
  client = t_client.InferenceServerClient(url=args.url)
  if args.action in ['reload', 'unload']:
    client.unload_model(args.model)
    print('Successfully unloaded model', args.model)

  if args.action in ['reload', 'load']:
    client.load_model(args.model)
    print('Successfully loaded model', args.model)
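
main(args) above only needs an object exposing url, action, and model; the original argument parser is not shown, so the following argparse wiring is an assumption with illustrative flag names.

# Assumed CLI wiring for main(args); flag names are illustrative, not the
# original script's.
import argparse

if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument('--url', default='localhost:8001')
  parser.add_argument('--action', choices=['load', 'unload', 'reload'], required=True)
  parser.add_argument('--model', required=True)
  main(parser.parse_args())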
Example #12
    def _run_unittest(self, model_name):
        with grpcclient.InferenceServerClient("localhost:8001") as client:
            # No input is required
            result = client.infer(model_name, [], client_timeout=120)
            output0 = result.as_numpy('OUTPUT0')

            # The model returns 1 if the tests were successfully passed.
            # Otherwise, it will return 0.
            self.assertEqual(output0, [1])
    def _test_no_outputs_helper(self,
                                use_grpc=True,
                                use_http=True,
                                use_streaming=True):

        if use_grpc:
            triton_client = grpcclient.InferenceServerClient(
                url="localhost:8001", verbose=True)
            self._prepare_request("grpc")
            result = triton_client.infer(model_name=self.model_name_,
                                         inputs=self.inputs_,
                                         outputs=self.outputs_,
                                         client_timeout=1)
            # The response should not contain any outputs
            self.assertEqual(result.as_numpy('OUTPUT0'), None)

        if use_http:
            triton_client = httpclient.InferenceServerClient(
                url="localhost:8000", verbose=True, network_timeout=2.0)
            self._prepare_request("http")
            result = triton_client.infer(model_name=self.model_name_,
                                         inputs=self.inputs_,
                                         outputs=self.outputs_)
            # The response should not contain any outputs
            self.assertEqual(result.as_numpy('OUTPUT0'), None)

        if use_streaming:
            triton_client = grpcclient.InferenceServerClient(
                url="localhost:8001", verbose=True)
            self._prepare_request("grpc")
            user_data = UserData()

            triton_client.stop_stream()
            triton_client.start_stream(callback=partial(callback, user_data),
                                       stream_timeout=1)
            triton_client.async_stream_infer(model_name=self.model_name_,
                                             inputs=self.inputs_,
                                             outputs=self.outputs_)
            result = user_data._completed_requests.get()
            if type(result) == InferenceServerException:
                raise result

            # The response should not contain any outputs
            self.assertEqual(result.as_numpy('OUTPUT0'), None)
Example #14
    def test_python_unittest(self):
        model_name = os.environ['MODEL_NAME']
        with grpcclient.InferenceServerClient("localhost:8001") as client:
            # No input is required
            result = client.infer(model_name, [], client_timeout=120)
            output0 = result.as_numpy('OUTPUT0')

            # The model returns 1 if the tests were successfully passed.
            # Otherwise, it will return 0.
            self.assertTrue(output0 == [1])
Example #15
    def __init__(self, url, model_name):

        self.triton_client = grpcclient.InferenceServerClient(url)
        self.model_name = model_name
        assert self.triton_client.is_server_ready()
        assert self.triton_client.is_model_ready(model_name)
        self.model_metadata = self.triton_client.get_model_metadata(model_name)
        self.model_config = self.triton_client.get_model_config(model_name)
        self.inputs_specs, self.outputs_specs, self.max_batch_size = self.parse_model_grpc(
        )
def connectServer(url):
    try:
        triton_client = grpcclient.InferenceServerClient(url=url,
                                                         verbose=False,
                                                         ssl=False)
        print("Channel creation success")
    except Exception as e:
        triton_client = None
        print("channel creation failed: " + str(e))

    return triton_client
Example #17
def run_triton_server(modelpath,
                      model_name,
                      triton_server_path,
                      device_id="0",
                      backend="tensorflow",
                      ps_path=None):
    import tritonclient
    import tritonclient.grpc as grpcclient

    if backend == "tensorflow":
        backend_config = "tensorflow,version=2"
    elif backend == "hugectr":
        backend_config = "hugectr,ps=" + ps_path
    else:
        raise ValueError("unknown backend:" + backend)

    cmdline = [
        triton_server_path,
        "--model-repository",
        modelpath,
        "--backend-config",
        backend_config,
        "--model-control-mode=explicit",
        "--load-model",
        model_name,
    ]
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = device_id
    with subprocess.Popen(cmdline, env=env) as process:
        try:
            with grpcclient.InferenceServerClient("localhost:8001") as client:
                # wait until server is ready
                for _ in range(60):
                    if process.poll() is not None:
                        retcode = process.returncode
                        raise RuntimeError(
                            f"Tritonserver failed to start (ret={retcode})")

                    try:
                        ready = client.is_server_ready()
                    except tritonclient.utils.InferenceServerException:
                        ready = False

                    if ready:
                        yield client
                        return

                    time.sleep(1)

                raise RuntimeError(
                    "Timed out waiting for tritonserver to become ready")
        finally:
            # signal triton to shutdown
            process.send_signal(signal.SIGINT)
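
run_triton_server above is a generator that yields the ready client, so it is presumably meant to be used as a context manager; a usage sketch under that assumption (the decorator is not visible in the snippet, so it is applied explicitly here, and the paths and model name are placeholders):

# Sketch only: wrap the generator explicitly since the decorator is not shown.
import contextlib

run_server = contextlib.contextmanager(run_triton_server)
with run_server("/path/to/model_repository", "my_model",
                "/opt/tritonserver/bin/tritonserver") as client:
    print("server ready:", client.is_server_ready())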
Example #18
 def check_ready(self):
     try:
         # Newer versions of the Triton client support send/receive message
         # lengths above 1 GB, which is more than enough here.
         self.triton_client = grpcclient.InferenceServerClient(
             url=self.target_url)
     except Exception as e:
         raise TritonServerCannotConnectException(
             f'triton server {self.target_url} connect fail') from e
     if not self.triton_client.is_server_ready():
         raise TritonServerNotReadyException(
             f'triton server {self.target_url} not ready')
    def test_grpc(self):
        triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
        inputs = []
        inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8"))
        inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

        try:
            triton_client.infer(model_name="query", inputs=inputs)
            self.assertTrue(False, "expect error with query information")
        except InferenceServerException as ex:
            self.assertTrue("OUTPUT0 CPU 0" in ex.message())
            self.assertTrue("OUTPUT1 CPU 0" in ex.message())
Example #20
    def __init__(self):
        # Create server context
        try:
            triton_client = grpcclient.InferenceServerClient(
                url=self.url,
                verbose=self.verbose,
                ssl=self.ssl,
                root_certificates=self.root_certificates,
                private_key=self.private_key,
                certificate_chain=self.certificate_chain)
        except Exception as e:
            print("context creation failed: " + str(e))
            sys.exit()

        # Health check
        if not triton_client.is_server_live():
            print("FAILED : is_server_live")
            sys.exit(1)

        if not triton_client.is_server_ready():
            print("FAILED : is_server_ready")
            sys.exit(1)

        if not triton_client.is_model_ready(self.model):
            print("FAILED : is_model_ready", self.model)
            sys.exit(1)

        try:
            metadata = triton_client.get_model_metadata(self.model)
            print(metadata)
        except InferenceServerException as ex:
            if "Request for unknown model" not in ex.message():
                print("FAILED : get_model_metadata")
                print("Got: {}".format(ex.message()))
                sys.exit(1)
            else:
                print("FAILED : get_model_metadata")
                sys.exit(1)

        # Model configuration
        try:
            config = triton_client.get_model_config(self.model)
            if config.config.name != self.model:
                print("FAILED: get_model_config")
                sys.exit(1)
            # print(config)
        except InferenceServerException as ex:
            print("FAILED : get_model_config")
            print("Got: {}".format(ex.message()))
            sys.exit(1)

        self.triton_client = triton_client
 def __init__(self,
              model_name: str,
              input_names: Sequence[str],
              output_names: Sequence[str],
              url,
              concurrency=1,
              verbose=False):
     self.client = t_client.InferenceServerClient(url=url, verbose=verbose)
     self.input_names = input_names
     self.output_names = output_names
     self.concurrency = concurrency
     self.model_name = model_name
Example #22
 def single_job(client_files):
     with grpcclient.InferenceServerClient(
             url=FLAGS.url, verbose=FLAGS.verbose) as triton_client:
         protocol_client = grpcclient
         speech_client = SpeechClient(triton_client, FLAGS.model_name,
                                      protocol_client)
         idx, audio_files = client_files
         predictions = []
         for li in audio_files:
             result = speech_client.recognize(li, idx)
             print("Recognized {}:{}".format(li, result[0]))
             predictions += result
     return predictions
Example #23
 def set_client(self):
     try:
         url = self.host + ':' + str(self.port)
         client = grpcclient.InferenceServerClient(url=url,
                                                   verbose=False,
                                                   ssl=False,
                                                   root_certificates=None,
                                                   private_key=None,
                                                   certificate_chain=None)
     except Exception as e:
         print("channel create failed: " + str(e))
         sys.exit()
     return client
Example #24
    def __init__(self, config):
        """
        Parameters
        ----------
        config : TritonClientConfig
            A config with the relevant client options
        """

        self._client_config = config
        assert self._client_config['url'], \
            "Must specify url in client config."
        self._client = grpcclient.InferenceServerClient(
            url=self._client_config['url'])
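
A minimal construction sketch for the config-driven client above, assuming only that the config object supports item access with a 'url' key; a plain dict stands in for TritonClientConfig, and the class name is a placeholder.

# Hypothetical usage; a dict stands in for TritonClientConfig and
# "ConfiguredGrpcClient" is a placeholder for the owning class.
config = {'url': 'localhost:8001'}
client_wrapper = ConfiguredGrpcClient(config)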
Example #25
 def __init__(self,
              model='retinaface_r50_v1',
              output_order=None,
              triton_uri='localhost:8001',
              model_version='1',
              **kwargs):
     self.model_name = model
     self.model_version = model_version
     self.url = triton_uri
     self.input_shape = (1, 3, 640, 640)
     self.input_dtype = np.float32
     self.output_order = output_order
     self.triton_client = grpcclient.InferenceServerClient(url=triton_uri)
 def test_unregister_before_register(self):
     # Create a valid system shared memory region and unregister before register
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     shm_op0_handle = shm.create_shared_memory_region(
         "dummy_data", "/dummy_data", 8)
     triton_client.unregister_system_shared_memory("dummy_data")
     shm_status = triton_client.get_system_shared_memory_status()
     if _protocol == "http":
         self.assertTrue(len(shm_status) == 0)
     else:
         self.assertTrue(len(shm_status.regions) == 0)
     shm.destroy_shared_memory_region(shm_op0_handle)
Example #27
 def __init__(self,
              rec_name='arcface_r100_v1',
              triton_uri='localhost:8001',
              model_version='1',
              input_mean: float = 0.,
              input_std: float = 1.,
              **kwargs):
     self.model_name = rec_name
     self.model_version = model_version
     self.url = triton_uri
     self.input_shape = None
     self.max_batch_size = 1
     self.input_mean = input_mean
     self.input_std = input_std
     self.triton_client = grpcclient.InferenceServerClient(url=triton_uri)
 def test_valid_create_set_register(self):
     # Create a valid system shared memory region, fill data in it and register
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     shm_op0_handle = shm.create_shared_memory_region(
         "dummy_data", "/dummy_data", 8)
     shm.set_shared_memory_region(shm_op0_handle,
                                  [np.array([1, 2], dtype=np.float32)])
     triton_client.register_system_shared_memory("dummy_data", "/dummy_data",
                                                 8)
     shm_status = triton_client.get_system_shared_memory_status()
     if _protocol == "http":
         self.assertTrue(len(shm_status) == 1)
     else:
         self.assertTrue(len(shm_status.regions) == 1)
     shm.destroy_shared_memory_region(shm_op0_handle)
 def __init__(self,
              triton_host,
              fpsec=None,
              batch_size=16,
              resize=(1280, 800)):
     """Constructor for DetectionPipeline class.
     
     Keyword Arguments:
         fpsec {int} -- Frame per second for sampling the video. These will be evenly spaced
             throughout the video. If not specified (i.e., None), all frames will be loaded.
             (default: {None})
         batch_size {int} -- Batch size to use with detector. (default: {32})
         resize {tuple} -- Dimensions to resize frames to before inference. (default: {(1280, 800)})
     """
     self.triton_client = triton.InferenceServerClient(url=triton_host)
     self.fpsec = fpsec
     self.batch_size = batch_size
     self.resize = resize
 def test_unregisterall(self):
     # Unregister all shared memory blocks
     shm_handles = self._configure_server()
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
     status_before = triton_client.get_system_shared_memory_status()
     if _protocol == "http":
         self.assertTrue(len(status_before) == 4)
     else:
         self.assertTrue(len(status_before.regions) == 4)
     triton_client.unregister_system_shared_memory()
     status_after = triton_client.get_system_shared_memory_status()
     if _protocol == "http":
         self.assertTrue(len(status_after) == 0)
     else:
         self.assertTrue(len(status_after.regions) == 0)
     self._cleanup_server(shm_handles)
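
test_unregisterall above calls self._cleanup_server(shm_handles), which is not included in these snippets; a minimal sketch of such a helper, assuming it only needs to destroy the locally created handles:

 def _cleanup_server(self, shm_handles):
     # Hypothetical helper (not shown above): destroy the local shared-memory
     # handles returned by _configure_server.
     for shm_handle in shm_handles:
         shm.destroy_shared_memory_region(shm_handle)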