Example #1
    def setUp(self):
        global _deferred_exceptions
        _deferred_exceptions = []

        # The helper client for setup will be GRPC for simplicity.
        self.triton_client_ = grpcclient.InferenceServerClient(
            "localhost:8001")
        self.model_name_ = 'identity_2_float32'
        # This is not changed even when the ensemble is under test,
        # because dynamic batching is performed within the composing model
        self.check_status_model = 'identity_2_float32'
        self.tensor_shape_ = (1, 1)
        self.inputs_ = {
            "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"),
            "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32")
        }
        self.input_data_ = {
            "INPUT0": np.ones(shape=(1, 1), dtype=np.float32),
            "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32)
        }
        self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"])
        self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"])
        self.outputs_ = {
            "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'),
            "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1')
        }
    def _basic_inference(self,
                         shm_ip0_handle,
                         shm_ip1_handle,
                         shm_op0_handle,
                         shm_op1_handle,
                         error_msg,
                         big_shm_name="",
                         big_shm_size=64):
        input0_data = np.arange(start=0, stop=16, dtype=np.int32)
        input1_data = np.ones(shape=16, dtype=np.int32)
        inputs = []
        outputs = []
        if _protocol == "http":
            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
            inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
        else:
            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
            inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

        inputs[0].set_shared_memory("input0_data", 64)

        if isinstance(shm_ip1_handle, np.ndarray):
            inputs[1].set_data_from_numpy(input1_data)
        elif big_shm_name != "":
            inputs[1].set_shared_memory(big_shm_name, big_shm_size)
        else:
            inputs[1].set_shared_memory("input1_data", 64)

        outputs[0].set_shared_memory("output0_data", 64)
        outputs[1].set_shared_memory("output1_data", 64)

        try:
            results = triton_client.infer("simple",
                                          inputs,
                                          model_version="",
                                          outputs=outputs)
            output = results.get_output('OUTPUT0')
            if _protocol == "http":
                output_datatype = output['datatype']
                output_shape = output['shape']
            else:
                output_datatype = output.datatype
                output_shape = output.shape
            output_dtype = utils.triton_to_np_dtype(output_datatype)
            output_data = shm.get_contents_as_numpy(shm_op0_handle,
                                                    output_dtype, output_shape)
            self.assertTrue(
                (output_data[0] == (input0_data + input1_data)).all(),
                "Model output does not match expected output")
        except Exception as ex:
            error_msg.append(str(ex))
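
_basic_inference assumes that system shared-memory regions named input0_data, input1_data, output0_data and output1_data (64 bytes each, i.e. 16 INT32 values) have already been created, filled and registered with the server. A minimal sketch of that setup, assuming tritonclient.utils.shared_memory and an HTTP client on the default port (the "/input0_data"-style keys are made up for the sketch), could be:

import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.shared_memory as shm

triton_client = httpclient.InferenceServerClient("localhost:8000")
triton_client.unregister_system_shared_memory()

input0_data = np.arange(start=0, stop=16, dtype=np.int32)
input1_data = np.ones(shape=16, dtype=np.int32)

# 16 INT32 elements = 64 bytes per region.
shm_ip0_handle = shm.create_shared_memory_region("input0_data",
                                                 "/input0_data", 64)
shm_ip1_handle = shm.create_shared_memory_region("input1_data",
                                                 "/input1_data", 64)
shm_op0_handle = shm.create_shared_memory_region("output0_data",
                                                 "/output0_data", 64)
shm_op1_handle = shm.create_shared_memory_region("output1_data",
                                                 "/output1_data", 64)

# Copy the input tensors into their regions.
shm.set_shared_memory_region(shm_ip0_handle, [input0_data])
shm.set_shared_memory_region(shm_ip1_handle, [input1_data])

# Register the regions with the server under the names the helper uses.
triton_client.register_system_shared_memory("input0_data", "/input0_data", 64)
triton_client.register_system_shared_memory("input1_data", "/input1_data", 64)
triton_client.register_system_shared_memory("output0_data", "/output0_data", 64)
triton_client.register_system_shared_memory("output1_data", "/output1_data", 64)

# The handles are then passed to _basic_inference() together with a list
# that collects any error messages.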
def simple_string_inference(triton_client):
    model_name = 'simple_string'

    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('INPUT0', [1, 16], "BYTES"))
    inputs.append(grpcclient.InferInput('INPUT1', [1, 16], "BYTES"))

    # Create the data for the two input tensors. Initialize the first
    # to unique integers and the second to all ones.
    in0 = np.arange(start=0, stop=16, dtype=np.int32)
    in0 = np.expand_dims(in0, axis=0)
    in1 = np.ones(shape=(1, 16), dtype=np.int32)
    expected_sum = np.add(in0, in1)
    expected_diff = np.subtract(in0, in1)

    # The 'simple_string' model expects 2 BYTES tensors where each
    # element in those tensors is the utf-8 string representation of
    # an integer. The BYTES tensors must be represented by a numpy
    # array with dtype=np.object_.
    in0n = np.array([str(x).encode('utf-8') for x in in0.reshape(in0.size)],
                    dtype=np.object_)
    input0_data = in0n.reshape(in0.shape)
    in1n = np.array([str(x).encode('utf-8') for x in in1.reshape(in1.size)],
                    dtype=np.object_)
    input1_data = in1n.reshape(in1.shape)

    # Initialize the data
    inputs[0].set_data_from_numpy(input0_data)
    inputs[1].set_data_from_numpy(input1_data)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
    outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Get the output arrays from the results
    output0_data = results.as_numpy('OUTPUT0')
    output1_data = results.as_numpy('OUTPUT1')

    for i in range(16):
        print(
            str(input0_data[0][i]) + " + " + str(input1_data[0][i]) + " = " +
            str(output0_data[0][i]))
        print(
            str(input0_data[0][i]) + " - " + str(input1_data[0][i]) + " = " +
            str(output1_data[0][i]))

        # Convert result from string to int to check result
        r0 = int(output0_data[0][i])
        r1 = int(output1_data[0][i])
        if expected_sum[0][i] != r0:
            print("error: incorrect sum")
            sys.exit(1)
        if expected_diff[0][i] != r1:
            print("error: incorrect difference")
            sys.exit(1)
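
A minimal driver for simple_string_inference, assuming a Triton server with its gRPC endpoint on the default localhost:8001, might be:

import tritonclient.grpc as grpcclient

if __name__ == '__main__':
    # Hypothetical driver; the URL is an assumption about the deployment.
    triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
    simple_string_inference(triton_client)
    print("PASS: string inference")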
def get_result(url, model_name, x):
    try:
        triton_client = grpcclient.InferenceServerClient(url=url,
                                                         verbose=False,
                                                         ssl=False)
        print("Channel creation success")
    except Exception as e:
        print("channel creation failed: " + str(e))
        raise

    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('input0', x.shape, "FP32"))
    input0_data = x
    print("X Shape : ", x.shape)
    inputs[0].set_data_from_numpy(input0_data)
    outputs.append(grpcclient.InferRequestedOutput('output0'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)
    output0_data = results.as_numpy('output0')
    output0_data = sigmoid(output0_data.squeeze())
    print(output0_data)
    return output0_data
Example #5
    def run(self, client_metadata):
        trial = self.get_trial()
        model_name = tu.get_zero_model_name(trial, 1, self.input_dtype_)
        triton_client = client_metadata[0]
        input_name = self.input_name_
        if "librotch" in trial:
            input_name = "INPUT__0"

        tensor_shape = (math.trunc(1 * (1024 * 1024 * 1024) //
                                   np.dtype(self.input_dtype_).itemsize), )
        in0 = np.random.random(tensor_shape).astype(self.input_dtype_)
        inputs = [
            grpcclient.InferInput(input_name, tensor_shape,
                                  np_to_triton_dtype(self.input_dtype_)),
        ]
        inputs[0].set_data_from_numpy(in0)

        # Expect an exception for small timeout values.
        try:
            triton_client.infer(model_name, inputs, client_timeout=0.1)
            assert False, "expected inference failure from deadline exceeded"
        except Exception as ex:
            if "Deadline Exceeded" not in ex.message():
                assert False, "timeout_client failed {}".format(self.name_)
            # Expect timeout error as success case
            return 1
Example #6
 def infer(self, _need_tensor_check=False, **_input_tensor):
     self.check_ready()
     inputs = []
     assert set(_input_tensor.keys()) == set(self.all_inputs.keys()), \
         f'{self.model_name}: input tensors do not match the model inputs'
     for m_name, m_tensor_info in self.all_inputs.items():
         m_tensor = _input_tensor[m_name]
         if not (isinstance(m_tensor, np.ndarray)
                 and m_tensor.dtype.name in self.numpy_data_type_mapper):
             raise InferenceTensorCheckFailException(
                 f'tensor {m_name} is not a valid numpy array')
         if _need_tensor_check:
             check_status, check_result = m_tensor_info.tensor_check(
                 m_tensor)
             if not check_status:
                 raise InferenceTensorCheckFailException(check_result)
         m_normalized_tensor = m_tensor_info.normalize(
             m_tensor, _tensor_format='chw').astype(m_tensor.dtype)
         m_infer_input = grpcclient.InferInput(
             m_name, m_normalized_tensor.shape,
             self.numpy_data_type_mapper[m_normalized_tensor.dtype.name])
         m_infer_input.set_data_from_numpy(m_normalized_tensor)
         inputs.append(m_infer_input)
     results = self.triton_client.infer(model_name=self.model_name,
                                        model_version=self.model_version,
                                        inputs=inputs)
     to_return_result = dict()
     for m_result_name in self.all_outputs.keys():
         to_return_result[m_result_name] = results.as_numpy(m_result_name)
     return to_return_result
    def test_decoupled_bls(self):
        # Test combinations of BLS and decoupled API in Python backend.
        model_name = "decoupled_bls"
        shape = [1, 2]
        user_data = UserData()
        with grpcclient.InferenceServerClient(
                "localhost:8001") as triton_client:
            triton_client.start_stream(callback=partial(callback, user_data))

            input_datas = []
            input_data = np.random.randn(*shape).astype(np.float32)
            input_datas.append(input_data)
            inputs = [
                grpcclient.InferInput("IN", input_data.shape,
                                      np_to_triton_dtype(input_data.dtype))
            ]
            inputs[0].set_data_from_numpy(input_data)
            triton_client.async_stream_infer(model_name=model_name,
                                             inputs=inputs)

            # Check the results of the decoupled model using BLS
            def check_result(result):
                # Make sure the result is not an exception
                self.assertIsNot(type(result), InferenceServerException)

                output_data = result.as_numpy("OUT")
                self.assertIsNotNone(output_data, "error: expected 'OUT'")
                self.assertTrue(
                    np.array_equal(output_data, input_data),
                    "error: expected output {} to match input {}".format(
                        output_data, input_data))

            result = user_data._completed_requests.get()
            check_result(result)
Example #8
def requestGenerator(batched_image_data, input_name, output_name, dtype, FLAGS):

    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            grpcclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            httpclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            grpcclient.InferRequestedOutput(output_name,
                                            class_count=FLAGS.classes))
    else:
        outputs.append(
            httpclient.InferRequestedOutput(output_name,
                                            binary_data=True,
                                            class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
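
requestGenerator only yields the request pieces; a caller still has to create the client and issue the inference. A sketch of such a caller, assuming the FLAGS fields used above plus a FLAGS.url field (an assumption), might be:

import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient


def run_inference(batched_image_data, input_name, output_name, dtype, FLAGS):
    # Hypothetical driver around requestGenerator(); FLAGS.url is an
    # assumption in addition to the FLAGS fields used above.
    if FLAGS.protocol.lower() == "grpc":
        client = grpcclient.InferenceServerClient(FLAGS.url)
    else:
        client = httpclient.InferenceServerClient(FLAGS.url)

    responses = []
    for inputs, outputs, model_name, model_version in requestGenerator(
            batched_image_data, input_name, output_name, dtype, FLAGS):
        responses.append(
            client.infer(model_name,
                         inputs,
                         model_version=model_version,
                         outputs=outputs))
    return responses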
def identity_inference(triton_client, np_array):
    model_name = "simple_identity"
    inputs = []
    outputs = []

    inputs.append(grpcclient.InferInput('INPUT0', np_array.shape, "BYTES"))
    inputs[0].set_data_from_numpy(np_array)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)
    if (np_array.dtype == np.object_):
        print(results.as_numpy('OUTPUT0'))
        if not np.array_equal(np_array, results.as_numpy('OUTPUT0')):
            print(results.as_numpy('OUTPUT0'))
            print("error: incorrect output")
            sys.exit(1)
    else:
        encoded_results = np.char.encode(
            results.as_numpy('OUTPUT0').astype(str))
        if not np.array_equal(np_array, encoded_results):
            print(encoded_results)
            print("error: incorrect output")
            sys.exit(1)
def sync_send(triton_client, result_list, values, batch_size, sequence_id,
              model_name, model_version):

    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(grpcclient.InferInput('INPUT', value_data.shape,
                                            "INT32"))
        # Initialize the data
        inputs[0].set_data_from_numpy(value_data)
        outputs = []
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT'))
        # Issue the synchronous sequence inference.
        result = triton_client.infer(model_name=model_name,
                                     inputs=inputs,
                                     outputs=outputs,
                                     sequence_id=sequence_id,
                                     sequence_start=(count == 1),
                                     sequence_end=(count == len(values)))
        result_list.append(result.as_numpy('OUTPUT'))
        count = count + 1
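
sync_send issues an entire sequence against a stateful model, so a caller only has to pick a sequence id and collect the per-step outputs. A usage sketch, assuming a sequence model named "simple_sequence" on the default gRPC endpoint (both names are assumptions), could be:

import tritonclient.grpc as grpcclient

# Hypothetical usage of sync_send(); model name and URL are assumptions.
triton_client = grpcclient.InferenceServerClient("localhost:8001")
result_list = []
sync_send(triton_client,
          result_list,
          values=[11, 7, 5, 3, 2, 0, 1],
          batch_size=1,
          sequence_id=1000,
          model_name="simple_sequence",
          model_version="")
for step, output in enumerate(result_list):
    print("step {}: {}".format(step, output))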
    def detector(self, frames):
        infer_inputs = [
            triton.InferInput('input_1', (len(frames), 3, *self.resize[::-1]),
                              "FP32")
        ]
        frames = np.array(frames, dtype=np.float32)
        frames = np.transpose(frames, (0, 3, 1, 2))
        infer_inputs[0].set_data_from_numpy(frames)
        result = self.triton_client.infer('retinanet', infer_inputs)
        scores = result.as_numpy('scores').reshape((-1, 100))
        boxes = result.as_numpy('boxes').reshape((-1, 100, 4))
        classes = result.as_numpy('classes').reshape((-1, 100))

        # Calculate embeddings for all the detected subjects
        embs = []
        scores_filtered = []
        boxes_filters = []
        for i in range(len(frames)):
            mask = (scores[i] > 0.4) & (
                classes[i] == 0)  # only care about 'person' with score > 0.4
            scores_i = scores[i, mask]
            boxes_i = boxes[i, mask]

            scores_i, boxes_i = self.bbox_filter(scores_i, boxes_i)

            img = frames[i].astype(np.uint8)  # (3, 800, 1280)
            embs_i = []
            boxes_i = boxes_i.astype(int)
            for j in range(len(boxes_i)):
                imp = img[:, boxes_i[j, 1]:boxes_i[j, 3],
                          boxes_i[j, 0]:boxes_i[j, 2]]
                imp = np.transpose(imp, (1, 2, 0))
                imp = Image.fromarray(imp)
                data = [
                    np.asarray(transforms.Resize(size=(256, 128))(imp)).astype(
                        np.float32)
                ]

                inputs = []
                inputs.append(
                    tritongrpcclient.InferInput('image',
                                                [len(data), 256, 128, 3],
                                                "FP32"))
                # Initialize the data
                inputs[0].set_data_from_numpy(np.asarray(data))
                outputs = []
                outputs.append(
                    tritongrpcclient.InferRequestedOutput('features'))
                results = self.triton_client.infer('osnet_ensemble',
                                                   inputs,
                                                   outputs=outputs)
                emb = np.squeeze(results.as_numpy('features'))
                embs_i.append(emb / np.linalg.norm(emb))
            embs.append(embs_i)
            scores_filtered.append(scores_i)
            boxes_filters.append(boxes_i)

        return np.asarray(scores_filtered), np.asarray(
            boxes_filters), np.asarray(embs)
    def test_grpc(self):
        triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
        inputs = []
        inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8"))
        inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

        try:
            triton_client.infer(model_name="query", inputs=inputs)
            self.assertTrue(False, "expect error with query information")
        except InferenceServerException as ex:
            self.assertTrue("OUTPUT0 CPU 0" in ex.message())
            self.assertTrue("OUTPUT1 CPU 0" in ex.message())
Example #13
def _run_query(
    client,
    n_rows,
    model_name,
    workflow_path,
    data_path,
    actual_output_filename,
    output_name,
    input_cols_name=None,
    backend="tensorflow",
):

    workflow = nvt.Workflow.load(workflow_path)

    if input_cols_name is None:
        batch = cudf.read_csv(
            data_path, nrows=n_rows)[workflow.output_node.input_columns.names]
    else:
        batch = cudf.read_csv(data_path, nrows=n_rows)[input_cols_name]

    input_dtypes = workflow.input_dtypes
    columns = [(col, batch[col]) for col in batch.columns]

    inputs = []
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(input_dtypes[name])
        d = d.reshape(len(d), 1)
        inputs.append(
            grpcclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(input_dtypes[name])))
        inputs[i].set_data_from_numpy(d)

    outputs = [grpcclient.InferRequestedOutput(output_name)]
    time_start = dt.datetime.now()
    response = client.infer(model_name,
                            inputs,
                            request_id="1",
                            outputs=outputs)
    run_time = dt.datetime.now() - time_start

    output_key = "output" if backend == "hugectr" else "0"

    output_actual = cudf.read_csv(os.path.expanduser(actual_output_filename),
                                  nrows=n_rows)
    output_actual = cp.asnumpy(output_actual[output_key].values)
    output_predict = response.as_numpy(output_name)

    if backend == "tensorflow":
        output_predict = output_predict[:, 0]

    diff = abs(output_actual - output_predict)
    return diff, run_time
Example #14
    def setUp(self):
        global _deferred_exceptions
        _deferred_exceptions = []

        # The helper client for setup will be GRPC for simplicity.
        self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001")
        self.model_name_ = 'identity_2_float32'
        self.tensor_shape_ = (1, 1)
        self.inputs_ = {
            "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"),
            "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32")
        }
        self.input_data_ = {
            "INPUT0": np.ones(shape=(1, 1), dtype=np.float32),
            "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32)
        }
        self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"])
        self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"])
        self.outputs_ = {
            "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'),
            "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1')
        }
Example #15
 def set_inputs(self, inputs_node: list):
     """
     Args:
         inputs_node: a list of NodeInfo
     Returns:
         a list of input tensors
     """
     inputs_tensor = []
     for node in inputs_node:
         input_tensor = grpcclient.InferInput(node.node_name,
                                              node.node_data.shape,
                                              node.node_type)
         input_tensor.set_data_from_numpy(node.node_data)
         inputs_tensor.append(input_tensor)
     return inputs_tensor
    def test_decoupled_send_after_close_error(self):
        model_name = "decoupled_send_after_close_error"
        shape = [16]
        user_data = UserData()
        with grpcclient.InferenceServerClient("localhost:8001") as client:
            client.start_stream(callback=partial(callback, user_data))
            input_data_0 = np.random.random(shape).astype(np.float32)
            input_data_1 = np.random.random(shape).astype(np.float32)
            inputs = [
                grpcclient.InferInput("INPUT0", input_data_0.shape,
                                      np_to_triton_dtype(input_data_0.dtype)),
                grpcclient.InferInput("INPUT1", input_data_1.shape,
                                      np_to_triton_dtype(input_data_1.dtype))
            ]
            inputs[0].set_data_from_numpy(input_data_0)
            inputs[1].set_data_from_numpy(input_data_1)
            client.async_stream_infer(model_name=model_name, inputs=inputs)

            # Because the model has closed the response sender there is no
            # way to deliver the error message to the client. The error
            # will be logged on the server side.
            time.sleep(4)
            self.assertEqual(user_data._completed_requests.qsize(), 0,
                             "The completed request size must be zero.")
 def test_decoupled_return_response_error(self):
     model_name = "decoupled_return_response_error"
     shape = [16]
     user_data = UserData()
     with grpcclient.InferenceServerClient("localhost:8001") as client:
         client.start_stream(callback=partial(callback, user_data))
         input_data_0 = np.random.random(shape).astype(np.float32)
         input_data_1 = np.random.random(shape).astype(np.float32)
         inputs = [
             grpcclient.InferInput("INPUT0", input_data_0.shape,
                                   np_to_triton_dtype(input_data_0.dtype)),
             grpcclient.InferInput("INPUT1", input_data_1.shape,
                                   np_to_triton_dtype(input_data_1.dtype))
         ]
         inputs[0].set_data_from_numpy(input_data_0)
         inputs[1].set_data_from_numpy(input_data_1)
         client.async_stream_infer(model_name=model_name, inputs=inputs)
         data_item = user_data._completed_requests.get()
         if type(data_item) == InferenceServerException:
             self.assertEqual(
                 data_item.message(),
                 "Python model 'decoupled_return_response_error_0' is using "
                 "the decoupled mode and the execute function must return "
                 "None.", "Exception message didn't match.")
Example #18
    def run(self, client_metadata):
        triton_client = client_metadata[0]

        inputs = [
            grpcclient.InferInput("input", self.image_data_.shape, "FP32")
        ]
        inputs[0].set_data_from_numpy(self.image_data_)

        outputs = [
            grpcclient.InferRequestedOutput("resnet_v1_50/predictions/Softmax",
                                            class_count=1)
        ]
        res = triton_client.infer(self.model_name_, inputs, outputs=outputs)
        self.postprocess(res)
        return self.batch_size_
Example #19
 def _run_test(self):
     model_name = "ensemble_io"
     user_data = UserData()
     with grpcclient.InferenceServerClient("localhost:8001") as client:
         input0 = np.random.random([1000]).astype(np.float32)
         client.start_stream(callback=partial(callback, user_data))
         for model_1_in_gpu in [True, False]:
             for model_2_in_gpu in [True, False]:
                 for model_3_in_gpu in [True, False]:
                     gpu_output = np.asarray(
                         [model_1_in_gpu, model_2_in_gpu, model_3_in_gpu],
                         dtype=bool)
                     inputs = [
                         grpcclient.InferInput(
                             "INPUT0", input0.shape,
                             np_to_triton_dtype(input0.dtype)),
                         grpcclient.InferInput(
                             "GPU_OUTPUT", gpu_output.shape,
                             np_to_triton_dtype(gpu_output.dtype))
                     ]
                     inputs[0].set_data_from_numpy(input0)
                     inputs[1].set_data_from_numpy(gpu_output)
                     client.async_stream_infer(model_name=model_name,
                                               inputs=inputs)
                     if TRIAL == 'default':
                         result = user_data._completed_requests.get()
                         output0 = result.as_numpy('OUTPUT0')
                         self.assertIsNotNone(output0)
                         self.assertTrue(np.all(output0 == input0))
                     else:
                         response_repeat = 2
                         for _ in range(response_repeat):
                             result = user_data._completed_requests.get()
                             output0 = result.as_numpy('OUTPUT0')
                             self.assertIsNotNone(output0)
                             self.assertTrue(np.all(output0 == input0))
    def _prepare_request(self, protocol):
        if (protocol == "grpc"):
            self.inputs_ = []
            self.inputs_.append(
                grpcclient.InferInput('INPUT0', [1, 1], "INT32"))
            self.outputs_ = []
            self.outputs_.append(grpcclient.InferRequestedOutput('OUTPUT0'))
        else:
            self.inputs_ = []
            self.inputs_.append(
                httpclient.InferInput('INPUT0', [1, 1], "INT32"))
            self.outputs_ = []
            self.outputs_.append(httpclient.InferRequestedOutput('OUTPUT0'))

        self.inputs_[0].set_data_from_numpy(self.input0_data_)
def requestGenerator(batched_image_data, input_name, output_name, dtype, model_name, model_version, classes=1):

    # Set the input data
    inputs = []

    inputs.append(
        grpcclient.InferInput(input_name, batched_image_data.shape, dtype))
    inputs[0].set_data_from_numpy(batched_image_data)

    outputs = []

    outputs.append(
        grpcclient.InferRequestedOutput(output_name,
                                        class_count=classes))

    yield inputs, outputs, model_name, model_version
Example #22
    def get_embedding(self, face_img):
        if not isinstance(face_img, list):
            face_img = [face_img]

        face_img = np.stack(face_img)

        input_size = tuple(face_img[0].shape[0:2][::-1])
        blob = cv2.dnn.blobFromImages(
            face_img,
            1.0 / self.input_std,
            input_size, (self.input_mean, self.input_mean, self.input_mean),
            swapRB=True)

        blob = blob.astype(triton_to_np_dtype(self.dtype))

        inputs = []
        inputs.append(
            grpcclient.InferInput(self.input_name,
                                  [blob.shape[0], self.c, self.h, self.w],
                                  "FP32"))
        # inputs[0].set_data_from_numpy(face_img)

        cudashm.set_shared_memory_region(self.in_handle, [blob])
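        # 12 bytes per pixel = 3 channels x 4 bytes (FP32), so this is
        # presumably the full NCHW blob size in bytes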
        input_bytesize = 12 * blob.shape[0] * self.w * self.h
        inputs[-1].set_shared_memory(self.in_handle_name, input_bytesize)

        outputs = []
        out_bytesize = 12 * 512 * self.max_batch_size
        outputs.append(grpcclient.InferRequestedOutput(self.output_name[0]))
        outputs[-1].set_shared_memory(self.out_handle_name, out_bytesize)

        out = self.triton_client.infer(self.model_name,
                                       inputs,
                                       model_version=self.model_version,
                                       outputs=outputs)

        out = [
            cudashm.get_contents_as_numpy(self.out_handle,
                                          triton_to_np_dtype(self.dtype),
                                          [blob.shape[0], 512])
        ]
        # out = [out.as_numpy(e) for e in self.output_name]

        return out[0]
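
get_embedding assumes that self.in_handle and self.out_handle are CUDA shared-memory regions that were created and registered with the server beforehand. A sketch of that one-time setup, mirroring the cudashm calls used elsewhere on this page and treating the region names, sizes and model dimensions as assumptions, might be:

import tritonclient.grpc as grpcclient
import tritonclient.utils.cuda_shared_memory as cudashm

# Hypothetical one-time setup for the CUDA regions used by get_embedding().
max_batch_size, c, h, w = 8, 3, 112, 112  # assumed model dimensions
in_handle_name, out_handle_name = "input_data", "output_data"
in_byte_size = 12 * max_batch_size * w * h   # 3 channels * 4 bytes (FP32)
out_byte_size = 12 * 512 * max_batch_size

triton_client = grpcclient.InferenceServerClient("localhost:8001")
triton_client.unregister_cuda_shared_memory()

in_handle = cudashm.create_shared_memory_region(in_handle_name,
                                                in_byte_size, 0)
out_handle = cudashm.create_shared_memory_region(out_handle_name,
                                                 out_byte_size, 0)

triton_client.register_cuda_shared_memory(
    in_handle_name, cudashm.get_raw_handle(in_handle), 0, in_byte_size)
triton_client.register_cuda_shared_memory(
    out_handle_name, cudashm.get_raw_handle(out_handle), 0, out_byte_size)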
Example #23
    def run(self, input):
        inputs = []
        outputs = [
            grpcclient.InferRequestedOutput(e) for e in self.output_order
        ]
        inputs.append(
            grpcclient.InferInput(self.input_name, [1, self.c, self.h, self.w],
                                  self.dtype))
        # inputs[0].set_data_from_numpy(input)
        cudashm.set_shared_memory_region(self.in_handle, [input])
        inputs[-1].set_shared_memory(self.in_handle_name, self.input_bytesize)

        out = self.triton_client.infer(self.model_name,
                                       inputs,
                                       model_version=self.model_version,
                                       outputs=outputs)

        out = [out.as_numpy(e) for e in self.output_order]

        return out
Example #24
    def _to_trt(self, inputs, model_name, model_version='1'):
        tt_inputs = []
        if self.protocol == 'http':
            input_metadata = self.metadata[model_name][model_version]['inputs']
            for input, metadata in zip([inputs], input_metadata):
                tt_input = httpclient.InferInput(metadata['name'],
                                                 list(input.shape),
                                                 metadata['datatype'])
                tt_input.set_data_from_numpy(input)
                tt_inputs.append(tt_input)

        elif self.protocol == 'grpc':
            input_metadata = self.metadata[model_name][model_version].inputs
            for input, metadata in zip([inputs], input_metadata):
                tt_input = grpcclient.InferInput(metadata.name,
                                                 list(input.shape),
                                                 metadata.datatype)
                tt_input.set_data_from_numpy(input)
                tt_inputs.append(tt_input)

        return tt_inputs
def requestGenerator(input_name, input_data, output_name, dtype, protocol):

    # Set the input data
    inputs = []
    if protocol.lower() == "grpc":
        inputs.append(grpcclient.InferInput(input_name, input_data.shape,
                                            dtype))
        inputs[0].set_data_from_numpy(input_data)
    else:
        inputs.append(httpclient.InferInput(input_name, input_data.shape,
                                            dtype))
        inputs[0].set_data_from_numpy(input_data, binary_data=True)

    outputs = []
    if protocol.lower() == "grpc":
        outputs.append(grpcclient.InferRequestedOutput(output_name))
    else:
        outputs.append(
            httpclient.InferRequestedOutput(output_name, binary_data=True))

    return inputs, outputs
    def test_decoupled_execute_error(self):
        # The decoupled_execute_error model returns an error for the first
        # request and successfully processes the second request. This makes
        # sure that an error in a single request does not fail the whole
        # batch.

        model_name = "decoupled_execute_error"
        shape = [2, 2]
        number_of_requests = 2
        user_data = UserData()
        with grpcclient.InferenceServerClient(
                "localhost:8001") as triton_client:
            triton_client.start_stream(callback=partial(callback, user_data))

            input_datas = []
            for i in range(number_of_requests):
                input_data = np.random.randn(*shape).astype(np.float32)
                input_datas.append(input_data)
                inputs = [
                    grpcclient.InferInput("IN", input_data.shape,
                                          np_to_triton_dtype(input_data.dtype))
                ]
                inputs[0].set_data_from_numpy(input_data)
                triton_client.async_stream_infer(model_name=model_name,
                                                 inputs=inputs)

            for i in range(number_of_requests):
                result = user_data._completed_requests.get()
                if i == 0:
                    self.assertIs(type(result), InferenceServerException)
                    continue

                print(result)
                output_data = result.as_numpy("OUT")
                self.assertIsNotNone(output_data, "error: expected 'OUT'")
                self.assertTrue(
                    np.array_equal(output_data, input_datas[i]),
                    "error: expected output {} to match input {}".format(
                        output_data, input_datas[i]))
    def test_grpc_out_of_shared_memory(self):
        triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
        inputs = []
        inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8"))
        inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

        # Set up CUDA shared memory regions that are too small for the
        # outputs; expect the query to return the default value
        triton_client.unregister_system_shared_memory()
        triton_client.unregister_cuda_shared_memory()
        shm_op0_handle = cudashm.create_shared_memory_region(
            "output0_data", 1, 0)
        shm_op1_handle = cudashm.create_shared_memory_region(
            "output1_data", 1, 0)
        triton_client.register_cuda_shared_memory(
            "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1)
        triton_client.register_cuda_shared_memory(
            "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1)
        outputs = []
        outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
        outputs[-1].set_shared_memory("output0_data", 1)

        outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
        outputs[-1].set_shared_memory("output1_data", 1)

        try:
            triton_client.infer(model_name="query",
                                inputs=inputs,
                                outputs=outputs)
            self.assertTrue(False, "expect error with query information")
        except InferenceServerException as ex:
            self.assertTrue("OUTPUT0 CPU 0" in ex.message())
            self.assertTrue("OUTPUT1 CPU 0" in ex.message())

        cudashm.destroy_shared_memory_region(shm_op0_handle)
        cudashm.destroy_shared_memory_region(shm_op1_handle)
        triton_client.unregister_system_shared_memory()
        triton_client.unregister_cuda_shared_memory()
Example #28
def test_nvt_hugectr_inference(n_rows, err_tol):
    warnings.simplefilter("ignore")

    model_name = "test_model_ens"
    col_names = ["userId", "movieId", "new_cat1"]
    # read in a batch of data to get transforms for
    batch = cudf.read_csv(DATA_DIR + "test/data.csv", nrows=n_rows)[col_names]

    # convert the batch to triton inputs
    columns = [(col, batch[col]) for col in col_names]
    inputs = []

    col_dtypes = [np.int64, np.int64, np.int64]
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(col_dtypes[i])
        d = d.reshape(len(d), 1)
        inputs.append(
            httpclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(col_dtypes[i])))
        inputs[i].set_data_from_numpy(d)

    # placeholder variables for the output
    outputs = []
    outputs.append(httpclient.InferRequestedOutput("OUTPUT0"))
    # make the request
    with httpclient.InferenceServerClient("localhost:8000") as client:
        response = client.infer(model_name,
                                inputs,
                                request_id=str(1),
                                outputs=outputs)

    output_actual = cudf.read_csv(DATA_DIR + "test/output.csv", nrows=n_rows)
    output_actual = cp.asnumpy(output_actual["output"].values)
    output_predict = response.as_numpy("OUTPUT0")

    diff = abs(output_actual - output_predict)

    assert (diff < err_tol).all()
Example #29
    def inputs_outputs_generator(self, raw_inputs):
        """
        Generate input and output blobs for Triton client inference
        :param raw_inputs: list of raw numpy inputs
        :return: inputs and outputs for the request
        """
        inputs = []
        for input_specs, raw_input in zip(self.inputs_specs, raw_inputs):
            # parse data type
            raw_input = raw_input.astype(
                triton_to_np_dtype(input_specs.datatype))
            infer_input = grpcclient.InferInput(input_specs.name,
                                                raw_input.shape,
                                                input_specs.datatype)
            infer_input.set_data_from_numpy(raw_input)
            inputs.append(infer_input)

        outputs = []
        for output_specs in self.outputs_specs:
            outputs.append(
                grpcclient.InferRequestedOutput(output_specs.name,
                                                class_count=0))
        return inputs, outputs
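
The generated lists can then be passed straight to infer. A minimal companion method, assuming the object also holds a triton_client and model_name (both attribute names are assumptions), might look like:

    def infer(self, raw_inputs):
        # Hypothetical companion to inputs_outputs_generator(); the
        # triton_client and model_name attributes are assumptions.
        inputs, outputs = self.inputs_outputs_generator(raw_inputs)
        results = self.triton_client.infer(model_name=self.model_name,
                                           inputs=inputs,
                                           outputs=outputs)
        return {spec.name: results.as_numpy(spec.name)
                for spec in self.outputs_specs}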
Example #30
def crashing_client(model_name,
                    dtype,
                    tensor_shape,
                    shm_name,
                    triton_client,
                    input_name="INPUT0"):
    in0 = np.random.random(tensor_shape).astype(dtype)
    if "libtorch" in model_name:
        input_name = "INPUT__0"
    inputs = [
        grpcclient.InferInput(input_name, tensor_shape,
                              np_to_triton_dtype(dtype)),
    ]
    inputs[0].set_data_from_numpy(in0)

    # Run in a loop so that it is guaranteed that the inference
    # will not have completed when the client process is terminated.
    while True:
        existing_shm = shared_memory.SharedMemory(shm_name)
        count = np.ndarray((1, ), dtype=np.int32, buffer=existing_shm.buf)
        count[0] += 1
        existing_shm.close()
        results = triton_client.infer(model_name, inputs)
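
crashing_client loops forever on purpose: it is meant to run in a child process that a test kills while an inference is still in flight, using the shared counter to know the loop has started. A rough driver sketch, assuming a single-input placeholder model name and the default gRPC endpoint (both are assumptions, not part of the example above), could be:

import time
from multiprocessing import Process, shared_memory

import numpy as np
import tritonclient.grpc as grpcclient


def _child(shm_name):
    # The gRPC client is created inside the child process because channel
    # objects do not survive pickling across processes.
    client = grpcclient.InferenceServerClient("localhost:8001")
    # "custom_zero_1_float32" is a placeholder single-input model name.
    crashing_client("custom_zero_1_float32", np.float32, (16,), shm_name,
                    client)


if __name__ == '__main__':
    shm_block = shared_memory.SharedMemory(create=True, size=8)
    count = np.ndarray((1,), dtype=np.int32, buffer=shm_block.buf)
    count[0] = 0

    p = Process(target=_child, args=(shm_block.name,))
    p.start()

    # Wait until at least one inference has been issued, then kill the
    # client mid-flight and check that the server stays responsive.
    while count[0] == 0:
        time.sleep(0.1)
    p.terminate()
    p.join()

    monitor_client = grpcclient.InferenceServerClient("localhost:8001")
    assert monitor_client.is_server_live()

    shm_block.close()
    shm_block.unlink()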