Example #1
    def test_batch_request_for_batching_model(self):
        input_size = 16

        # graphdef_int32_int8_int8 has a batching version with max batch
        # size 8. The batch-size dimension is included in the shape, so the
        # request should succeed.
        tensor_shape = (1, input_size)
        for protocol in ["http", "grpc"]:
            model_name = tu.get_model_name("graphdef", np.int32, np.int8,
                                           np.int8)
            in0 = np.random.randint(low=0,
                                    high=100,
                                    size=tensor_shape,
                                    dtype=np.int32)
            in1 = np.random.randint(low=0,
                                    high=100,
                                    size=tensor_shape,
                                    dtype=np.int32)

            inputs = []
            outputs = []
            if protocol == "http":
                triton_client = tritonhttpclient.InferenceServerClient(
                    url='localhost:8000', verbose=True)
                inputs.append(
                    tritonhttpclient.InferInput('INPUT0', tensor_shape,
                                                "INT32"))
                inputs.append(
                    tritonhttpclient.InferInput('INPUT1', tensor_shape,
                                                "INT32"))
                outputs.append(
                    tritonhttpclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(
                    tritonhttpclient.InferRequestedOutput('OUTPUT1'))
            else:
                triton_client = tritongrpcclient.InferenceServerClient(
                    url='localhost:8001', verbose=True)
                inputs.append(
                    tritongrpcclient.InferInput('INPUT0', tensor_shape,
                                                "INT32"))
                inputs.append(
                    tritongrpcclient.InferInput('INPUT1', tensor_shape,
                                                "INT32"))
                outputs.append(
                    tritongrpcclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(
                    tritongrpcclient.InferRequestedOutput('OUTPUT1'))

            # Initialize the data
            inputs[0].set_data_from_numpy(in0)
            inputs[1].set_data_from_numpy(in1)

            results = triton_client.infer(model_name, inputs, outputs=outputs)
def requestGenerator(batched_image_data, input_name, output_name, dtype, FLAGS):

    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            tritongrpcclient.InferInput(input_name, batched_image_data.shape,
                                        dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            tritonhttpclient.InferInput(input_name, batched_image_data.shape,
                                        dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            tritongrpcclient.InferRequestedOutput(output_name,
                                                  class_count=FLAGS.classes))
    else:
        outputs.append(
            tritonhttpclient.InferRequestedOutput(output_name,
                                                  binary_data=True,
                                                  class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
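A minimal driver sketch for the generator above; FLAGS is assumed to be an argparse namespace with url, protocol, model_name, model_version and classes attributes, and batched_image_data is assumed to come from the caller's preprocessing:

if FLAGS.protocol.lower() == "grpc":
    triton_client = tritongrpcclient.InferenceServerClient(url=FLAGS.url)
else:
    triton_client = tritonhttpclient.InferenceServerClient(url=FLAGS.url)

for inputs, outputs, model_name, model_version in requestGenerator(
        batched_image_data, input_name, output_name, dtype, FLAGS):
    # One synchronous request per yielded input/output set.
    response = triton_client.infer(model_name,
                                   inputs,
                                   model_version=model_version,
                                   outputs=outputs)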
Example #3
    def test_ragged_input(self):
        model_name = "ragged_acc_shape"

        output_name = 'RAGGED_OUTPUT'
        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]

        async_requests = []
        try:
            for inputs in self.inputs:
                # Asynchronous inference call.
                async_requests.append(
                    self.client.async_infer(model_name=model_name,
                                            inputs=inputs,
                                            outputs=outputs))

            value_lists = [[v] * v for v in [2, 4, 1, 3]]
            expected_value = []
            for value_list in value_lists:
                expected_value += value_list
            expected_value = np.asarray([expected_value], dtype=np.float32)
            for idx in range(len(async_requests)):
                # Get the result from the initiated asynchronous inference request.
                # Note the call will block till the server responds.
                result = async_requests[idx].get_result()

                # Validate the results by comparing with precomputed values.
                output_data = result.as_numpy(output_name)
                self.assertTrue(
                    np.array_equal(output_data, expected_value),
                    "Expect response {} to have value {}, got {}".format(
                        idx, expected_value, output_data))
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
    def test_tf_unicode_bytes(self):
        # We use a simple identity model that takes an input tensor of 8
        # byte strings and returns an output tensor containing the same 8
        # strings.
        model_name = "graphdef_nobatch_zero_1_object"
        model_version = ""

        # Create the inference server client for the model.
        triton_client = tritonhttpclient.InferenceServerClient(
            "localhost:8000", verbose=True)

        # Create the data for the input tensor. Initialize the tensor to 8
        # byte strings. (dtype of np.bytes_)
        # Sample string that should no longer cause failure
        in0 = np.array([
            [
                b'\nF\n\'\n\x01a\x12"\x1a \n\x1e\xfa\x03\x94\x01\x0f\xd7\x02\xf1\x05\xdf\x01\x82\x03\xb5\x05\xc1\x07\xba\x06\xff\x06\xc7\x07L\xf5\x03\xe2\x07\xa9\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xdf\\\xcb\xbf'
            ],
            [
                b'\n:\n\x1a\n\x01a\x12\x15\x1a\x13\n\x11*\xe3\x05\xc5\x06\xda\x07\xcb\x06~\xb1\x05\xb3\x01\xa9\x02\x15\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbb[\n\xbf'
            ],
            [
                b'\nL\n-\n\x01a\x12(\x1a&\n$\x87\x07\xce\x01\xe7\x06\xee\x04\xe1\x03\xf1\x03\xd7\x07\xbe\x02\xb8\x05\xe0\x05\xe4\x01\x88\x06\xb6\x03\xb9\x05\x83\x06\xf8\x04\xe2\x04\xf4\x06\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbc\x99+@'
            ],
            [
                b'\n2\n\x12\n\x01a\x12\r\x1a\x0b\n\t\x99\x02\xde\x04\x9f\x04\xc5\x053\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x12\x07\x83\xbe'
            ],
            [
                b'\nJ\n\r\n\x01b\x12\x08\x1a\x06\n\x04\x9b\x94\xad\x04\n\r\n\x01c\x12\x08\x12\x06\n\x04\xc3\x8a\x08\xbf\n*\n\x01a\x12%\x1a#\n!\x9c\x02\xb2\x02\xcd\x02\x9d\x07\x8d\x01\xb6\x05a\xf1\x01\xf0\x05\xdb\x02\xac\x04\xbd\x05\xe0\x04\xd2\x06\xaf\x02\xa8\x01\x8b\x04'
            ],
            [
                b'\n3\n\x13\n\x01a\x12\x0e\x1a\x0c\n\n<\xe2\x05\x8a\x01\xb3\x07?\xfd\x01\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x1b\x931\xbf'
            ],
            [
                b'\n&\n\x07\n\x01a\x12\x02\x1a\x00\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04{\xbc\x0e>'
            ],
            [
                b'\nF\n\'\n\x01a\x12"\x1a \n\x1e\x97\x01\x93\x02\x9e\x01\xac\x06\xff\x01\xd8\x05\xe1\x07\xd8\x04g]\x9a\x05\xff\x06\xde\x07\x8f\x04\x97\x04\xda\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x9a\xb7I\n\r\n\x01c\x12\x08\x12\x06\n\x04\xfb\x87\x83\xbf'
            ]
        ],
                       dtype='|S78').flatten()

        # Send inference request to the inference server. Get results for
        # both output tensors.
        inputs = []
        outputs = []
        inputs.append(tritonhttpclient.InferInput('INPUT0', in0.shape,
                                                  "BYTES"))
        inputs[0].set_data_from_numpy(in0)

        outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))

        results = triton_client.infer(model_name=model_name,
                                      inputs=inputs,
                                      outputs=outputs,
                                      model_version=model_version)

        # We expect 1 result (with batch-size 1). Verify that all 8 result
        # elements are the same as the input.
        self.assertTrue(np.array_equal(in0, results.as_numpy('OUTPUT0')))
Example #5
    def test_max_element_count_as_shape(self):
        model_name = "ragged_acc_shape"

        output_name = 'BATCH_OUTPUT'
        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]

        async_requests = []
        try:
            for inputs in self.inputs:
                # Asynchronous inference call.
                async_requests.append(
                    self.client.async_infer(model_name=model_name,
                                            inputs=inputs,
                                            outputs=outputs))

            for idx in range(len(async_requests)):
                # Get the result from the initiated asynchronous inference request.
                # Note the call will block till the server responds.
                result = async_requests[idx].get_result()

                # Validate the results by comparing with precomputed values.
                output_data = result.as_numpy(output_name)
                self.assertEqual(
                    output_data.shape, (1, 4),
                    "Expect response {} to have shape representing the max element count {} among the batch, got {}"
                    .format(idx, 4, output_data.shape))
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
Example #6
    def test_accumulated_element_count_with_zero(self):
        model_name = "ragged_element_count_acc_zero"

        output_name = 'BATCH_OUTPUT'
        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]

        async_requests = []
        try:
            for inputs in self.inputs:
                # Asynchronous inference call.
                async_requests.append(
                    self.client.async_infer(model_name=model_name,
                                            inputs=inputs,
                                            outputs=outputs))

            expected_value = np.asarray([[0, 2, 6, 7, 10]], np.float32)
            for idx in range(len(async_requests)):
                # Get the result from the initiated asynchronous inference request.
                # Note the call will block till the server responds.
                result = async_requests[idx].get_result()

                # Validate the results by comparing with precomputed values.
                output_data = result.as_numpy(output_name)
                self.assertTrue(
                    np.array_equal(output_data, expected_value),
                    "Expect response {} to have value {}, got {}".format(
                        idx, expected_value, output_data))
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
Example #7
def sync_send(triton_client, result_list, values, batch_size, sequence_id,
              model_name, model_version):

    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(
            tritonhttpclient.InferInput('INPUT', value_data.shape, "INT32"))
        # Initialize the data
        inputs[0].set_data_from_numpy(value_data)
        outputs = []
        outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT'))
        # Issue the synchronous sequence inference.
        result = triton_client.infer(model_name=model_name,
                                     inputs=inputs,
                                     outputs=outputs,
                                     sequence_id=sequence_id,
                                     sequence_start=(count == 1),
                                     sequence_end=(count == len(values)))
        result_list.append(result.as_numpy('OUTPUT'))
        count = count + 1
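A hypothetical way to drive the sequence helper above; the client URL, sequence id, and model name are placeholders, and the HTTP client's infer() accepts the sequence_id/sequence_start/sequence_end arguments used inside sync_send:

triton_client = tritonhttpclient.InferenceServerClient(url="localhost:8000",
                                                       verbose=True)
result_list = []
sync_send(triton_client, result_list, values=[0, 1, 2, 3], batch_size=1,
          sequence_id=1001, model_name="simple_sequence", model_version="")
# Each entry in result_list is a [batch_size, 1] numpy array.
print([r.flatten()[0] for r in result_list])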
    def _full_exact(self, model_name, plugin_name, shape):
        triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                         verbose=True)

        inputs = []
        outputs = []
        inputs.append(httpclient.InferInput('INPUT0', list(shape), "FP32"))

        input0_data = np.ones(shape=shape).astype(np.float32)
        inputs[0].set_data_from_numpy(input0_data, binary_data=True)

        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))

        results = triton_client.infer(model_name + '_' + plugin_name,
                                      inputs,
                                      outputs=outputs)

        output0_data = results.as_numpy('OUTPUT0')

        # Verify values of Normalize and GELU
        if plugin_name == 'CustomGeluPluginDynamic':
            # Add bias
            input0_data += 1
            # Calculate Gelu activation
            test_output = (input0_data *
                           0.5) * (1 + np.tanh((0.797885 * input0_data) +
                                               (0.035677 * (input0_data**3))))
            self.assertTrue(np.isclose(output0_data, test_output).all())
        else:
            # L2 norm is sqrt(sum([1] * 16))
            test_output = input0_data / np.sqrt(sum([1] * 16))
            self.assertTrue(np.isclose(output0_data, test_output).all())
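As a standalone sanity check (not part of the test above), the tanh-based GELU approximation used here can be compared against the exact erf formulation; scipy is assumed to be available:

import numpy as np
from scipy.special import erf

x = np.linspace(-3.0, 3.0, 61).astype(np.float32)
gelu_exact = 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))
gelu_tanh = (x * 0.5) * (1.0 + np.tanh((0.797885 * x) + (0.035677 * (x**3))))
# The two curves agree closely; the maximum difference is well under 1e-2.
print(np.max(np.abs(gelu_exact - gelu_tanh)))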
    def _full_exact(self, batch_size, model_name, plugin_name):
        triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                         verbose=True)

        inputs = []
        outputs = []
        inputs.append(httpclient.InferInput('INPUT0', [batch_size, 16],
                                            "FP32"))

        input0_data = np.random.randn(batch_size, 16).astype(np.float32)
        inputs[0].set_data_from_numpy(input0_data, binary_data=True)

        outputs.append(
            httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))

        results = triton_client.infer(model_name + '_' + plugin_name,
                                      inputs,
                                      outputs=outputs)

        output0_data = results.as_numpy('OUTPUT0')

        # Verify values of Leaky RELU (it uses 0.1 instead of the default 0.01)
        # and for CustomClipPlugin min_clip = 0.1, max_clip = 0.5
        for b in range(batch_size):
            if plugin_name == 'LReLU_TRT':
                test_input = np.where(input0_data > 0, input0_data,
                                      input0_data * 0.1)
                self.assertTrue(np.isclose(output0_data, test_input).all())
            else:
                # [TODO] Add test for CustomClip output
                test_input = np.clip(input0_data, 0.1, 0.5)
Example #10
def run_infer(model_name,
              model_version,
              numerical_features,
              categorical_features,
              headers=None):
    inputs = []
    outputs = []
    num_type = "FP16" if numerical_features.dtype == np.float16 else "FP32"
    inputs.append(
        tritonhttpclient.InferInput('input__0', numerical_features.shape,
                                    num_type))
    inputs.append(
        tritonhttpclient.InferInput('input__1', categorical_features.shape,
                                    "INT64"))

    # Initialize the data
    inputs[0].set_data_from_numpy(numerical_features, binary_data=True)
    inputs[1].set_data_from_numpy(categorical_features, binary_data=False)

    outputs.append(
        tritonhttpclient.InferRequestedOutput('output__0', binary_data=True))
    results = triton_client.infer(
        model_name,
        inputs,
        model_version=str(model_version) if model_version != -1 else '',
        outputs=outputs,
        headers=headers)
    return results
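A hypothetical call to run_infer; it relies on a module-level triton_client, and the model name and feature shapes below are placeholders (the input names input__0/input__1 and output name output__0 match the function above):

triton_client = tritonhttpclient.InferenceServerClient(url="localhost:8000")
numerical_features = np.random.rand(64, 13).astype(np.float32)
categorical_features = np.random.randint(0, 100, size=(64, 26), dtype=np.int64)
results = run_infer("dlrm_model", -1, numerical_features, categorical_features)
print(results.as_numpy("output__0").shape)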
Example #11
def sync_send(triton_client, result_list, values, batch_size, sequence_id,
              model_name, model_version):

    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(tritonhttpclient.InferInput('INPUT', value_data.shape, "INT32"))
        # Initialize the data
        # FIXME, negative value in binary form can't be handled properly,
        # which causes the library to raise decode exception.
        inputs[0].set_data_from_numpy(value_data, binary_data=False)
        outputs = []
        outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT'))
        # Issue the synchronous sequence inference.
        result = triton_client.infer(model_name=model_name,
                                     inputs=inputs,
                                     outputs=outputs,
                                     sequence_id=sequence_id,
                                     sequence_start=(count == 1),
                                     sequence_end=(count == len(values)))
        result_list.append(result.as_numpy('OUTPUT'))
        count = count + 1
def TestIdentityInference(np_array, binary_data):
    model_name = "savedmodel_zero_1_object"
    inputs = []
    outputs = []

    inputs.append(
        tritonhttpclient.InferInput('INPUT0', np_array.shape, "BYTES"))
    inputs[0].set_data_from_numpy(np_array, binary_data=binary_data)

    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT0',
                                              binary_data=binary_data))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)
    if (np_array.dtype == np.object):
        if binary_data:
            if not np.array_equal(np_array,
                                  np.char.decode(results.as_numpy('OUTPUT0'))):
                print(results.as_numpy('OUTPUT0'))
                sys.exit(1)
        else:
            if not np.array_equal(np_array, results.as_numpy('OUTPUT0')):
                print(results.as_numpy('OUTPUT0'))
                sys.exit(1)
    else:
        encoded_results = np.char.encode(
            results.as_numpy('OUTPUT0').astype(str))
        if not np.array_equal(np_array, encoded_results):
            print(encoded_results)
            sys.exit(1)
Example #13
def requestGenerator(input_name, output_name, c, h, w, format, dtype, FLAGS):
    # Preprocess image into input data according to model requirements
    image_data = None
    with Image.open(FLAGS.image_filename) as img:
        image_data = preprocess(img, format, dtype, c, h, w, FLAGS.scaling)

    repeated_image_data = [image_data for _ in range(FLAGS.batch_size)]
    batched_image_data = np.stack(repeated_image_data, axis=0)

    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            tritongrpcclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            tritonhttpclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=False)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            tritongrpcclient.InferRequestedOutput(output_name,
                                                  class_count=FLAGS.classes))
    else:
        outputs.append(
            tritonhttpclient.InferRequestedOutput(output_name,
                                                  binary_data=False,
                                                  class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
Example #14
def triton_infer(input_mapping,
                 model_name,
                 binary_data=False,
                 binary_output=False,
                 class_count=0):
    """Helper function for setting Triton inputs and executing a request

    Arguments
    ----------
    input_mapping : dict
        A dictionary mapping strings to numpy arrays. The keys should
        be the names of the model inputs, and the values should be the
        inputs themselves.

    model_name : str
        The name of the model on which you are running inference.

    binary_data : bool
        Whether you are expecting binary input and output. Defaults to False

    class_count : int
        If the model is a classification model, the number of output classes.
        Defaults to 0, indicating this is not a classification model.

    Returns
    ----------
    res : InferResult
        Triton inference result containing output from running prediction
    """
    input_meta, _, output_meta, _ = parse_model_http(model_name)

    inputs = []
    outputs = []

    # Populate the inputs array
    for in_meta in input_meta:
        input_name = in_meta["name"]
        data = input_mapping[input_name]

        input = tritonhttpclient.InferInput(input_name, data.shape,
                                            in_meta["datatype"])

        input.set_data_from_numpy(data, binary_data=binary_data)
        inputs.append(input)

    # Populate the outputs array
    for out_meta in output_meta:
        output_name = out_meta["name"]
        output = tritonhttpclient.InferRequestedOutput(
            output_name, binary_data=binary_output, class_count=class_count)
        outputs.append(output)

    # Run inference
    res = triton_client.infer(model_name,
                              inputs,
                              request_id="0",
                              outputs=outputs)

    return res
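A hypothetical call, assuming the surrounding script defines the module-level triton_client and parse_model_http used above, and that the target model has a single FP32 input named INPUT0 (model name and shape are placeholders):

input_mapping = {"INPUT0": np.random.rand(1, 16).astype(np.float32)}
res = triton_infer(input_mapping, "my_model", binary_data=True)
# Output names come from the model metadata gathered by parse_model_http.
for out in res.get_response()["outputs"]:
    print(out["name"], res.as_numpy(out["name"]).shape)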
Example #15
    def test_chw32_input(self):
        model_name = "plan_CHW32_LINEAR_float32_float32_float32"
        for bs in [1, 8]:
            input_np = np.arange(26 * bs, dtype=np.float32).reshape(
                (bs, 13, 2, 1))
            expected_output0_np = input_np + input_np
            expected_output1_np = input_np - input_np
            reformatted_input_np = reformat("CHW32", input_np)

            # Use shared memory to bypass the shape check in client library,
            # because for non-linear format tensor, the data buffer is padded
            # and thus the data byte size may not match what is calculated from
            # tensor shape
            inputs = []
            inputs.append(
                tritonhttpclient.InferInput('INPUT0', [bs, 13, 2, 1], "FP32"))
            self.add_reformat_free_data_as_shared_memory(
                "input0" + str(bs), inputs[-1], reformatted_input_np)
            inputs.append(
                tritonhttpclient.InferInput('INPUT1', [bs, 13, 2, 1], "FP32"))
            self.add_reformat_free_data_as_shared_memory(
                "input1" + str(bs), inputs[-1], reformatted_input_np)

            outputs = []
            outputs.append(
                tritonhttpclient.InferRequestedOutput('OUTPUT0',
                                                      binary_data=True))
            outputs.append(
                tritonhttpclient.InferRequestedOutput('OUTPUT1',
                                                      binary_data=True))

            results = self.triton_client.infer(model_name=model_name,
                                               inputs=inputs,
                                               outputs=outputs)
            # Validate the results by comparing with precomputed values.
            output0_np = results.as_numpy('OUTPUT0')
            output1_np = results.as_numpy('OUTPUT1')
            self.assertTrue(
                np.array_equal(output0_np, expected_output0_np),
                "OUTPUT0 expected: {}, got {}".format(expected_output0_np,
                                                      output0_np))
            self.assertTrue(
                np.array_equal(output1_np, expected_output1_np),
                "OUTPUT1 expected: {}, got {}".format(expected_output1_np,
                                                      output1_np))
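The test above relies on a helper that places the padded, reformat-free buffer in system shared memory so the client-side shape/byte-size check is bypassed. A possible sketch of such a helper (not the class's actual implementation), using tritonclient.utils.shared_memory:

import tritonclient.utils.shared_memory as shm

def add_reformat_free_data_as_shared_memory_sketch(client, name, triton_input,
                                                   data):
    byte_size = data.nbytes
    # Create and fill a system shared memory region holding the padded buffer.
    shm_handle = shm.create_shared_memory_region(name, "/" + name, byte_size)
    shm.set_shared_memory_region(shm_handle, [data])
    # Register the region with the server and attach it to the input.
    client.register_system_shared_memory(name, "/" + name, byte_size)
    triton_input.set_shared_memory(name, byte_size)
    return shm_handle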
    def _no_streaming_helper(self, protocol):
        data_offset = 100
        repeat_count = 1
        delay_time = 1000
        wait_time = 2000

        input_data = np.arange(start=data_offset,
                               stop=data_offset + repeat_count,
                               dtype=np.int32)
        input_data = np.expand_dims(input_data, axis=0)
        delay_data = (np.ones([1, repeat_count], dtype=np.uint32)) * delay_time
        wait_data = np.array([[wait_time]], dtype=np.uint32)

        if protocol == "grpc":
            # Use the inputs and outputs from the setUp
            this_inputs = self.inputs_
            this_outputs = self.outputs_
        else:
            this_inputs = []
            this_inputs.append(
                httpclient.InferInput('IN', [1, repeat_count], "INT32"))
            this_inputs.append(httpclient.InferInput('DELAY', [1, 1],
                                                     "UINT32"))
            this_inputs.append(httpclient.InferInput('WAIT', [1, 1], "UINT32"))
            this_outputs = []
            this_outputs.append(httpclient.InferRequestedOutput('OUT'))

        # Initialize data for IN
        this_inputs[0].set_shape([1, repeat_count])
        this_inputs[0].set_data_from_numpy(input_data)

        # Initialize data for DELAY
        this_inputs[1].set_shape([1, repeat_count])
        this_inputs[1].set_data_from_numpy(delay_data)

        # Initialize data for WAIT
        this_inputs[2].set_data_from_numpy(wait_data)

        if protocol == "grpc":
            triton_client = grpcclient.InferenceServerClient(
                url="localhost:8001", verbose=True)
        else:
            triton_client = httpclient.InferenceServerClient(
                url="localhost:8000", verbose=True)
        try:
            triton_client.infer(model_name=self.model_name_,
                                inputs=this_inputs,
                                outputs=this_outputs)
            self.assertTrue(False, "expected to fail for decoupled models")
        except InferenceServerException as ex:
            self.assertTrue(
                "doesn't support models with decoupled transaction policy" in
                ex.message())
Example #17
def test_infer(model_name, input0_data, input1_data):
    inputs = []
    outputs = []
    inputs.append(tritonhttpclient.InferInput('INPUT0', [1, 16], "INT32"))
    inputs.append(tritonhttpclient.InferInput('INPUT1', [1, 16], "INT32"))

    # Initialize the data
    inputs[0].set_data_from_numpy(input0_data, binary_data=False)
    inputs[1].set_data_from_numpy(input1_data, binary_data=True)

    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
    query_params = {'test_1': 1, 'test_2': 2}
    results = triton_client.infer(model_name,
                                  inputs,
                                  outputs=outputs,
                                  query_params=query_params)

    return results
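A hypothetical invocation, assuming a module-level triton_client and a model (such as the stock "simple" example model) with two [1, 16] INT32 inputs:

triton_client = tritonhttpclient.InferenceServerClient(url="localhost:8000",
                                                       verbose=True)
input0_data = np.arange(16, dtype=np.int32).reshape(1, 16)
input1_data = np.ones((1, 16), dtype=np.int32)
results = test_infer("simple", input0_data, input1_data)
print(results.as_numpy("OUTPUT0"))
print(results.as_numpy("OUTPUT1"))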
Example #18
    def test_nobatch_request_for_batching_model(self):
        input_size = 16

        # graphdef_int32_int8_int8 has a batching version with max batch size of 8.
        # The server should return an error if the batch size is not included in the
        # input shapes.
        tensor_shape = (input_size,)
        for protocol in ["http", "grpc"]:
            model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)

            inputs = []
            outputs = []
            if protocol == "http":
                triton_client = tritonhttpclient.InferenceServerClient(url='localhost:8000', verbose=True)
                inputs.append(tritonhttpclient.InferInput('INPUT0', tensor_shape, "INT32"))
                inputs.append(tritonhttpclient.InferInput('INPUT1', tensor_shape, "INT32"))
                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
            else:
                triton_client = tritongrpcclient.InferenceServerClient(url='localhost:8001', verbose=True)
                inputs.append(tritongrpcclient.InferInput('INPUT0', tensor_shape, "INT32"))
                inputs.append(tritongrpcclient.InferInput('INPUT1', tensor_shape, "INT32"))
                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))

            # Initialize the data
            inputs[0].set_data_from_numpy(in0)
            inputs[1].set_data_from_numpy(in1)

            try:
                results = triton_client.infer(model_name,
                                              inputs,
                                              outputs=outputs)
                self.assertTrue(False, "expected failure with no batch request for batching model")
            except InferenceServerException as ex:
                pass
Example #19
    def _erroneous_infer(self, tensor_shape, batch_size):
        import tritonhttpclient
        item_size = batch_size
        for dim in tensor_shape:
            item_size *= dim
        full_shape = (batch_size, ) + tensor_shape
        input_np = np.arange(item_size, dtype=self.dtype_).reshape(full_shape)
        expected_output0_np = input_np + input_np
        expected_output1_np = input_np - input_np

        inputs = []
        inputs.append(
            tritonhttpclient.InferInput('INPUT0', full_shape, self.dtype_str_))
        inputs[-1].set_data_from_numpy(input_np)
        inputs.append(
            tritonhttpclient.InferInput('INPUT1', full_shape, self.dtype_str_))
        inputs[-1].set_data_from_numpy(input_np)
        outputs = []
        outputs.append(
            tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs.append(
            tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))

        model_name = tu.get_model_name(self.model_name_, self.dtype_,
                                       self.dtype_, self.dtype_)
        results = tritonhttpclient.InferenceServerClient(
            "localhost:8000", verbose=True).infer(model_name=model_name,
                                                  inputs=inputs,
                                                  outputs=outputs)
        # Validate the results by comparing with precomputed values.
        output0_np = results.as_numpy('OUTPUT0')
        output1_np = results.as_numpy('OUTPUT1')
        self.assertFalse(np.array_equal(output0_np, expected_output0_np),
                         "expects OUTPUT0 is not correct")
        self.assertFalse(np.array_equal(output1_np, expected_output1_np),
                         "expects OUTPUT1 is not correct")
    def _prepare_request(self, protocol):
        if (protocol == "grpc"):
            self.inputs_ = []
            self.inputs_.append(grpcclient.InferInput('INPUT0', [1, 1],
                                                      "INT32"))
            self.outputs_ = []
            self.outputs_.append(grpcclient.InferRequestedOutput('OUTPUT0'))
        else:
            self.inputs_ = []
            self.inputs_.append(httpclient.InferInput('INPUT0', [1, 1],
                                                      "INT32"))
            self.outputs_ = []
            self.outputs_.append(httpclient.InferRequestedOutput('OUTPUT0'))

        self.inputs_[0].set_data_from_numpy(self.input0_data_)
Example #21
    def _addsub_infer(self, model_name):
        triton_client = httpclient.InferenceServerClient("localhost:8000", verbose=True)

        inputs = []
        outputs = []
        inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32"))
        inputs.append(httpclient.InferInput('INPUT1', [1, 16], "FP32"))

        # Initialize the data
        inputs[0].set_data_from_numpy(self.input0_, binary_data=True)
        inputs[1].set_data_from_numpy(self.input1_, binary_data=False)

        outputs.append(httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
        outputs.append(httpclient.InferRequestedOutput('OUTPUT1', binary_data=True))

        results = triton_client.infer(model_name,
                                      inputs,
                                      outputs=outputs)

        output0_data = results.as_numpy('OUTPUT0')
        output1_data = results.as_numpy('OUTPUT1')

        self.assertTrue(np.array_equal(self.expected_output0_, output0_data), "incorrect sum")
        self.assertTrue(np.array_equal(self.expected_output1_, output1_data), "incorrect difference")
Example #22
    def test_batch_item_shape(self):
        # Use 3 set of inputs with shape [2, 1, 2], [1, 1, 2], [1, 2, 2]
        # Note that the test only checks the formation of "BATCH_INPUT" where
        # the value of "RAGGED_INPUT" is irrelevant, only the shape matters
        inputs = []
        for value in [[2, 1, 2], [1, 1, 2], [1, 2, 2]]:
            inputs.append(
                [tritonhttpclient.InferInput('RAGGED_INPUT', value, "FP32")])
            inputs[-1][0].set_data_from_numpy(
                np.full(value, value[0], np.float32))
        client = tritonhttpclient.InferenceServerClient(
            url="localhost:8000", concurrency=len(inputs))

        expected_outputs = [
            np.array([[1.0, 2.0], [1.0, 2.0]]),
            np.array([[1.0, 2.0]]),
            np.array([[2.0, 2.0]]),
        ]

        model_name = "batch_item"

        output_name = 'BATCH_OUTPUT'
        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]

        async_requests = []
        try:
            for request_inputs in inputs:
                # Asynchronous inference call.
                async_requests.append(
                    client.async_infer(model_name=model_name,
                                       inputs=request_inputs,
                                       outputs=outputs))

            for idx in range(len(async_requests)):
                # Get the result from the initiated asynchronous inference request.
                # Note the call will block till the server responds.
                result = async_requests[idx].get_result()

                # Validate the results by comparing with precomputed values.
                output_data = result.as_numpy(output_name)
                self.assertTrue(
                    np.allclose(output_data, expected_outputs[idx]),
                    "Expect response to have value:\n{}, got:\n{}\nEqual matrix:\n{}"
                    .format(expected_outputs[idx], output_data,
                            np.isclose(expected_outputs[idx], output_data)))
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
    def _test_helper(self, modelVersion, tag, sig_def):
        shape = [self.dims]
        model_name = self.base_model_name + str(modelVersion)
        # The multiplier is defined during model creation. See server/qa/common/gen_tag_sigdef.py
        # for details
        multiplier = modelVersion + 1
        output_name = "OUTPUT"
        triton_client = httpclient.InferenceServerClient("localhost:8000",
                                                         verbose=True)
        inputs = []
        outputs = []
        inputs.append(httpclient.InferInput('INPUT', shape, "FP32"))
        input_data = np.ones(shape=shape).astype(np.float32)
        inputs[0].set_data_from_numpy(input_data, binary_data=True)

        outputs.append(
            httpclient.InferRequestedOutput(output_name, binary_data=True))
        results = triton_client.infer(model_name, inputs, outputs=outputs)
        output_data = results.as_numpy(output_name)
        test_output = input_data * multiplier
        self.assertTrue(np.isclose(output_data, test_output).all())
Example #24
    def test_ragged_output(self):
        model_name = "ragged_io"

        # The model is identity model
        self.inputs = []
        for value in [2, 4, 1, 3]:
            self.inputs.append(
                [tritonhttpclient.InferInput('INPUT0', [1, value], "FP32")])
            self.inputs[-1][0].set_data_from_numpy(
                np.full([1, value], value, np.float32))
        output_name = 'OUTPUT0'
        outputs = [tritonhttpclient.InferRequestedOutput(output_name)]

        async_requests = []
        try:
            for inputs in self.inputs:
                # Asynchronous inference call.
                async_requests.append(
                    self.client.async_infer(model_name=model_name,
                                            inputs=inputs,
                                            outputs=outputs))

            expected_value_list = [[v] * v for v in [2, 4, 1, 3]]
            expected_value_list = [
                np.asarray([expected_value], dtype=np.float32)
                for expected_value in expected_value_list
            ]
            for idx in range(len(async_requests)):
                # Get the result from the initiated asynchronous inference request.
                # Note the call will block till the server responds.
                result = async_requests[idx].get_result()

                # Validate the results by comparing with precomputed values.
                output_data = result.as_numpy(output_name)
                self.assertTrue(
                    np.array_equal(output_data, expected_value_list[idx]),
                    "Expect response {} to have value {}, got {}".format(
                        idx, expected_value_list[idx], output_data))
        except InferenceServerException as ex:
            self.assertTrue(False, "unexpected error {}".format(ex))
    # Register Input0 and Input1 shared memory with Triton Server
    triton_client.register_system_shared_memory("input0_data", "/input0_simple",
                                                input0_byte_size)
    triton_client.register_system_shared_memory("input1_data", "/input1_simple",
                                                input1_byte_size)

    # Set the parameters to use data from shared memory
    inputs = []
    inputs.append(httpclient.InferInput('INPUT0', [1, 16], "BYTES"))
    inputs[-1].set_shared_memory("input0_data", input0_byte_size)

    inputs.append(httpclient.InferInput('INPUT1', [1, 16], "BYTES"))
    inputs[-1].set_shared_memory("input1_data", input1_byte_size)

    outputs = []
    outputs.append(httpclient.InferRequestedOutput('OUTPUT0',
                                                   binary_data=True))
    outputs[-1].set_shared_memory("output0_data", output0_byte_size)

    outputs.append(httpclient.InferRequestedOutput('OUTPUT1',
                                                   binary_data=True))
    outputs[-1].set_shared_memory("output1_data", output1_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        print(utils.triton_to_np_dtype(output0['datatype']))
        output0_data = shm.get_contents_as_numpy(
Example #26
    ]],
                           dtype='uint32')
    input2_data = np.array([[
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26
    ]],
                           dtype='int32')
    inputs = [
        httpclient.InferInput("DES", input0_data.shape,
                              np_to_triton_dtype(input0_data.dtype)),
        httpclient.InferInput("CATCOLUMN", input1_data.shape,
                              np_to_triton_dtype(input1_data.dtype)),
        httpclient.InferInput("ROWINDEX", input2_data.shape,
                              np_to_triton_dtype(input2_data.dtype)),
    ]

    inputs[0].set_data_from_numpy(input0_data)
    inputs[1].set_data_from_numpy(input1_data)
    inputs[2].set_data_from_numpy(input2_data)
    outputs = [httpclient.InferRequestedOutput("OUTPUT0")]

    response = client.infer(model_name,
                            inputs,
                            request_id=str(1),
                            outputs=outputs)

    result = response.get_response()

    print(result)
    print(response.as_numpy("OUTPUT0"))
Example #27
        input_byte_size)
    triton_client.register_cuda_shared_memory(
        "input1_data", cudashm.get_raw_handle(shm_ip1_handle), 0,
        input_byte_size)

    # Set the parameters to use data from shared memory
    inputs = []
    inputs.append(tritonhttpclient.InferInput('INPUT0', [1, 16], "INT32"))
    inputs[-1].set_shared_memory("input0_data", input_byte_size)

    inputs.append(tritonhttpclient.InferInput('INPUT1', [1, 16], "INT32"))
    inputs[-1].set_shared_memory("input1_data", input_byte_size)

    outputs = []
    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
    outputs[-1].set_shared_memory("output0_data", output_byte_size)

    outputs.append(
        tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
    outputs[-1].set_shared_memory("output1_data", output_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        output0_data = cudashm.get_contents_as_numpy(
            shm_op0_handle, utils.triton_to_np_dtype(output0['datatype']),
    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            tritongrpcclient.InferInput(input_name, batched_image_data.shape,
                                        "BYTES"))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            tritonhttpclient.InferInput(input_name, batched_image_data.shape,
                                        "BYTES"))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            tritongrpcclient.InferRequestedOutput(output_name,
                                                  class_count=FLAGS.classes))
    else:
        outputs.append(
            tritonhttpclient.InferRequestedOutput(output_name,
                                                  binary_data=True,
                                                  class_count=FLAGS.classes))

    # Send request
    result = triton_client.infer(model_name, inputs, outputs=outputs)

    postprocess(result, output_name, input_filenames, batch_size)

    print("PASS")
def infer_exact(tester, pf, tensor_shape, batch_size,
                input_dtype, output0_dtype, output1_dtype,
                output0_raw=True, output1_raw=True,
                model_version=None, swap=False,
                outputs=("OUTPUT0", "OUTPUT1"), use_http=True, use_grpc=True,
                use_http_json_tensors=True, skip_request_id_check=False, use_streaming=True,
                correlation_id=0, shm_region_names=None, precreated_shm_regions=None,
                use_system_shared_memory=False, use_cuda_shared_memory=False,
                priority=0, timeout_us=0):
    tester.assertTrue(
        use_http or use_http_json_tensors or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
    if output0_raw == output1_raw:
        # Float16 not supported for Input and Output via JSON
        if use_http_json_tensors and (input_dtype != np.float16) and \
            (output0_dtype != np.float16) and (output1_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))

    # outputs are sum and difference of inputs so set max input
    # values so that they will not overflow the output. This
    # allows us to do an exact match. For float types use 8, 16,
    # 32 int range for fp 16, 32, 64 respectively. When getting
    # class outputs the result value/probability is returned as a
    # float so must use fp32 range in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(np.iinfo(rinput_dtype).min,
                  np.iinfo(routput0_dtype).min,
                  np.iinfo(routput1_dtype).min) / 2
    val_max = min(np.iinfo(rinput_dtype).max,
                  np.iinfo(routput0_dtype).max,
                  np.iinfo(routput1_dtype).max) / 2
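    # For example, with int8 inputs and outputs np.iinfo reports [-128, 127],
    # so val_min/val_max are roughly -64 and 63; the sum or difference of any
    # two values drawn from that halved range stays inside [-128, 127] and
    # cannot overflow the int8 outputs.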

    num_classes = 3

    input0_array = np.random.randint(low=val_min, high=val_max,
                                     size=tensor_shape, dtype=rinput_dtype)
    input1_array = np.random.randint(low=val_min, high=val_max,
                                     size=tensor_shape, dtype=rinput_dtype)
    if input_dtype != np.object:
        input0_array = input0_array.astype(input_dtype)
        input1_array = input1_array.astype(input_dtype)

    if not swap:
        output0_array = input0_array + input1_array
        output1_array = input0_array - input1_array
    else:
        output0_array = input0_array - input1_array
        output1_array = input0_array + input1_array

    if output0_dtype == np.object:
        output0_array = np.array([unicode(str(x), encoding='utf-8')
                                  for x in (output0_array.flatten())], dtype=object).reshape(output0_array.shape)
    else:
        output0_array = output0_array.astype(output0_dtype)
    if output1_dtype == np.object:
        output1_array = np.array([unicode(str(x), encoding='utf-8')
                                  for x in (output1_array.flatten())], dtype=object).reshape(output1_array.shape)
    else:
        output1_array = output1_array.astype(output1_dtype)

    if input_dtype == np.object:
        in0n = np.array([str(x)
                         for x in input0_array.reshape(input0_array.size)], dtype=object)
        input0_array = in0n.reshape(input0_array.shape)
        in1n = np.array([str(x)
                         for x in input1_array.reshape(input1_array.size)], dtype=object)
        input1_array = in1n.reshape(input1_array.shape)

    # prepend size of string to output string data
    if output0_dtype == np.object:
        if batch_size == 1:
            output0_array_tmp = serialize_byte_tensor_list([output0_array])
        else:
            output0_array_tmp = serialize_byte_tensor_list(output0_array)
    else:
        output0_array_tmp = output0_array

    if output1_dtype == np.object:
        if batch_size == 1:
            output1_array_tmp = serialize_byte_tensor_list([output1_array])
        else:
            output1_array_tmp = serialize_byte_tensor_list(output1_array)
    else:
        output1_array_tmp = output1_array

    OUTPUT0 = "OUTPUT0"
    OUTPUT1 = "OUTPUT1"
    INPUT0 = "INPUT0"
    INPUT1 = "INPUT1"
    if pf == "libtorch" or pf == "libtorch_nobatch":
        OUTPUT0 = "OUTPUT__0"
        OUTPUT1 = "OUTPUT__1"
        INPUT0 = "INPUT__0"
        INPUT1 = "INPUT__1"

    output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp])
    output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp])

    if batch_size == 1:
        input0_list = [input0_array]
        input1_list = [input1_array]
    else:
        input0_list = [x for x in input0_array]
        input1_list = [x for x in input1_array]

    # Serialization of string tensors in the case of shared memory must be done manually
    if input_dtype == np.object:
        input0_list_tmp = serialize_byte_tensor_list(input0_list)
        input1_list_tmp = serialize_byte_tensor_list(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    # Create system/cuda shared memory regions if needed
    shm_regions, shm_handles = su.create_set_shm_regions(input0_list_tmp, input1_list_tmp, output0_byte_size,
                                                        output1_byte_size, outputs, shm_region_names, precreated_shm_regions,
                                                        use_system_shared_memory, use_cuda_shared_memory)

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(
            pf, input_dtype, output0_dtype, output1_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(
                config[0], verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(
                config[0], verbose=True)

        inputs = []
        if config[1] == "http":
            inputs.append(httpclient.InferInput(
                INPUT0, tensor_shape, np_to_triton_dtype(input_dtype)))
            inputs.append(httpclient.InferInput(
                INPUT1, tensor_shape, np_to_triton_dtype(input_dtype)))
        else:
            inputs.append(grpcclient.InferInput(
                INPUT0, tensor_shape, np_to_triton_dtype(input_dtype)))
            inputs.append(grpcclient.InferInput(
                INPUT1, tensor_shape, np_to_triton_dtype(input_dtype)))

        if not (use_cuda_shared_memory or use_system_shared_memory):
            if config[1] == "http":
                inputs[0].set_data_from_numpy(
                    input0_array, binary_data=config[3])
                inputs[1].set_data_from_numpy(
                    input1_array, binary_data=config[3])
            else:
                inputs[0].set_data_from_numpy(input0_array)
                inputs[1].set_data_from_numpy(input1_array)
        else:
            # Register necessary shared memory regions/handles
            su.register_add_shm_regions(inputs, outputs, shm_regions, precreated_shm_regions, shm_handles,
                                input0_byte_size, input1_byte_size, output0_byte_size, output1_byte_size,
                                use_system_shared_memory, use_cuda_shared_memory, triton_client)

        if batch_size == 1:
            expected0_sort_idx = [np.flip(np.argsort(x.flatten()), 0)
                                  for x in output0_array.reshape((1,) + tensor_shape)]
            expected1_sort_idx = [np.flip(np.argsort(x.flatten()), 0)
                                  for x in output1_array.reshape((1,) + tensor_shape)]
        else:
            expected0_sort_idx = [np.flip(np.argsort(x.flatten()), 0)
                                  for x in output0_array.reshape(tensor_shape)]
            expected1_sort_idx = [np.flip(np.argsort(x.flatten()), 0)
                                  for x in output1_array.reshape(tensor_shape)]

        # Force binary_data = False for shared memory and class
        output_req = []
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(httpclient.InferRequestedOutput(
                        OUTPUT0, binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT0))

                output_req[-1].set_shared_memory(
                    shm_regions[2]+'_data', output0_byte_size)
            else:
                if output0_raw:
                    if config[1] == "http":
                        output_req.append(httpclient.InferRequestedOutput(
                            OUTPUT0, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT0))
                else:
                    if config[1] == "http":
                        output_req.append(httpclient.InferRequestedOutput(
                            OUTPUT0, binary_data=config[3], class_count=num_classes))
                    else:
                        output_req.append(grpcclient.InferRequestedOutput(
                            OUTPUT0, class_count=num_classes))
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(httpclient.InferRequestedOutput(
                        OUTPUT1, binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT1))

                output_req[-1].set_shared_memory(
                    shm_regions[2+i]+'_data', output1_byte_size)
            else:
                if output1_raw:
                    if config[1] == "http":
                        output_req.append(httpclient.InferRequestedOutput(
                            OUTPUT1, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT1))
                else:
                    if config[1] == "http":
                        output_req.append(httpclient.InferRequestedOutput(
                            OUTPUT1, binary_data=config[3], class_count=num_classes))
                    else:
                        output_req.append(grpcclient.InferRequestedOutput(
                            OUTPUT1, class_count=num_classes))

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()))
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()))

        last_response = results.get_response()

        if not skip_request_id_check:
            global _seen_request_ids
            if config[1] == "http":
                request_id = int(last_response["id"])
            else:
                request_id = int(last_response.id)
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(str(response_model_version), model_version)

        tester.assertEqual(len(response_outputs), len(outputs))

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            if ((result_name == OUTPUT0 and output0_raw) or
                    (result_name == OUTPUT1 and output1_raw)):
                if use_system_shared_memory or use_cuda_shared_memory:
                    if result_name == OUTPUT0:
                        shm_handle = shm_handles[2]
                    else:
                        shm_handle = shm_handles[3]

                    output = results.get_output(result_name)
                    if config[1] == "http":
                        output_datatype = output['datatype']
                        output_shape = output['shape']
                    else:
                        output_datatype = output.datatype
                        output_shape = output.shape
                    output_dtype = triton_to_np_dtype(output_datatype)
                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = results.as_numpy(result_name)

                if (output_data.dtype == np.object) and (not config[3]):
                    output_data = output_data.astype(np.bytes_)

                if result_name == OUTPUT0:
                    tester.assertTrue(np.array_equal(output_data, output0_array),
                                      "{}, {} expected: {}, got {}".format(
                        model_name, OUTPUT0, output0_array, output_data))
                elif result_name == OUTPUT1:
                    tester.assertTrue(np.array_equal(output_data, output1_array),
                                      "{}, {} expected: {}, got {}".format(
                        model_name, OUTPUT1, output1_array, output_data))
                else:
                    tester.assertTrue(
                        False, "unexpected raw result {}".format(result_name))
            else:
                for b in range(batch_size):
                    # num_classes values must be returned and must
                    # match expected top values
                    if "nobatch" in pf:
                      class_list = results.as_numpy(result_name)
                    else:
                      class_list = results.as_numpy(result_name)[b]

                    tester.assertEqual(len(class_list), num_classes)
                    if batch_size == 1:
                        expected0_flatten = output0_array.flatten()
                        expected1_flatten = output1_array.flatten()
                    else:
                        expected0_flatten = output0_array[b].flatten()
                        expected1_flatten = output1_array[b].flatten()

                    for idx, class_label in enumerate(class_list):
                        # can't compare indices since could have different
                        # indices with the same value/prob, so check that
                        # the value of each index equals the expected value.
                        # Only compare labels when the indices are equal.
                        if isinstance(class_label, str):
                            ctuple = class_label.split(':')
                        else:
                            ctuple = "".join(chr(x)
                                         for x in class_label).split(':')
                        cval = float(ctuple[0])
                        cidx = int(ctuple[1])
                        if result_name == OUTPUT0:
                            tester.assertEqual(cval, expected0_flatten[cidx])
                            tester.assertEqual(
                                cval, expected0_flatten[expected0_sort_idx[b][idx]])
                            if cidx == expected0_sort_idx[b][idx]:
                                tester.assertEqual(ctuple[2], 'label{}'.format(
                                    expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(cval, expected1_flatten[cidx])
                            tester.assertEqual(
                                cval, expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(result_name))

    # Unregister system/cuda shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_regions, shm_handles, precreated_shm_regions, outputs,
                                      use_system_shared_memory, use_cuda_shared_memory)

    return results
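
The classification branch above depends on Triton's class-result string format, where each returned entry has the form "<value>:<index>:<label>" (over gRPC the entry may arrive as raw bytes). A minimal standalone sketch of that parsing step, using a hypothetical entry and the same 'labelN' naming convention these test models use:

def parse_class_entry(entry):
    # gRPC may hand back bytes; rebuild the "<value>:<index>:<label>" string first.
    if not isinstance(entry, str):
        entry = "".join(chr(x) for x in entry)
    value, index, label = entry.split(':')
    return float(value), int(index), label

# Hypothetical entry: value 42.0 at class index 3, labelled "label3".
cval, cidx, clabel = parse_class_entry("42.0:3:label3")
assert (cval, cidx, clabel) == (42.0, 3, 'label3')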


def infer_zero(tester, pf, batch_size, tensor_dtype, input_shapes, output_shapes,
               model_version=None, use_http=True, use_grpc=True,
               use_http_json_tensors=True, use_streaming=True, shm_region_name_prefix=None,
               use_system_shared_memory=False, use_cuda_shared_memory=False,
               priority=0, timeout_us=0):
    tester.assertTrue(
        use_http or use_grpc or use_http_json_tensors or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
    if use_http_json_tensors and (tensor_dtype != np.float16):
        configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

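    # Build one input tensor (and its expected output) per I/O pair. The "zero"
    # models simply echo their inputs, so the expected output is a copy of the
    # input reshaped to the declared output shape.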
    for io_num in range(io_cnt):
        if pf == "libtorch" or pf == "libtorch_nobatch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool):
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape, dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != np.object:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            expected_array = np.array([unicode(str(x), encoding='utf-8')
                                       for x in input_array.flatten()], dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # Serialization of string tensors in the case of shared memory must be done manually
        if tensor_dtype == np.object:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region([shm_region_name_prefix[0]+str(io_num),
                                                        shm_region_name_prefix[1]+str(io_num)],
                                                        input_list_tmp, input_byte_size, output_byte_size,
                                                        use_system_shared_memory, use_cuda_shared_memory)

        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])
        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(
                config[0], verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(
                config[0], verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(httpclient.InferInput(
                    input_name, input_data.shape, np_to_triton_dtype(tensor_dtype)))
                output_req.append(httpclient.InferRequestedOutput(
                    output_name, binary_data=config[3]))
            else:
                inputs.append(grpcclient.InferInput(
                    input_name, input_data.shape, np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data, binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num, input_byte_size, output_byte_size,
                    use_system_shared_memory, use_cuda_shared_memory, triton_client)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()),
                                          priority=priority, timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()),
                                          priority=priority, timeout=timeout_us)

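        # Validate the response metadata (model name, version and output count)
        # before comparing the returned tensors against the expected arrays.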
        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(response_model_version, model_version)

        tester.assertEqual(len(response_outputs), io_cnt)

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                if pf == "libtorch" or pf == "libtorch_nobatch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])
                shm_handle = shm_op_handles[io_num]

                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)
            if use_system_shared_memory:
                output_data = shm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            elif use_cuda_shared_memory:
                output_data = cudashm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            if (output_data.dtype == np.object) and (not config[3]):
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(np.array_equal(output_data, expected),
                                "{}, {}, expected: {}, got {}".format(
                                    model_name, result_name, expected, output_data))

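    # Clean up: unregister the input and output shared memory regions for each
    # I/O pair and release the local handles.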
    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0]+str(io_num)+'_data')
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[1]+str(io_num)+'_data')
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[0]+str(io_num)+'_data')
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1]+str(io_num)+'_data')
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
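
For reference, a minimal sketch of how a test case might drive infer_zero; the platform name, shapes, and dtype are illustrative and assume the matching two-I/O int32 "zero" (identity) model is loaded in the server under test:

import unittest
import numpy as np

class ZeroInferTest(unittest.TestCase):
    def test_graphdef_zero_two_io(self):
        # Two int32 I/O pairs with full shape (8, 16); by default this
        # exercises HTTP, HTTP with JSON tensors, gRPC and gRPC streaming.
        infer_zero(self, 'graphdef', 8, np.int32,
                   input_shapes=[(8, 16), (8, 16)],
                   output_shapes=[(8, 16), (8, 16)],
                   use_system_shared_memory=False,
                   use_cuda_shared_memory=False)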