def _test_helper(self,
                     client,
                     model_name,
                     input_name='INPUT0',
                     output_name='OUTPUT0'):
        try:
            inputs = [
                client[0].InferInput(input_name, self.in0_.shape,
                                     np_to_triton_dtype(self.data_type_))
            ]
            inputs[0].set_data_from_numpy(self.in0_)
            results = client[1].infer(model_name, inputs)
            # If the inference completed, examine the results to ensure that
            # the framework and protocol support the large payload.
            self.assertTrue(
                np.array_equal(self.in0_, results.as_numpy(output_name)),
                "output is different from input")

        except InferenceServerException as ex:
            # If the inference failed, the inference server should return the
            # error gracefully. In addition, send a small payload to verify
            # that the server is still functional.
            inputs = [
                client[0].InferInput(input_name, self.sin0_.shape,
                                     np_to_triton_dtype(self.data_type_))
            ]
            inputs[0].set_data_from_numpy(self.sin0_)
            results = client[1].infer(model_name, inputs)
            self.assertTrue(
                np.array_equal(self.sin0_, results.as_numpy(output_name)),
                "output is different from input")
Example #2
    def _test_helper(self,
                     client,
                     model_name,
                     input_name='INPUT0',
                     output_name='OUTPUT0'):
        # plan does not support large batch sizes.
        if not model_name.startswith('plan'):
            inputs = [
                client[0].InferInput(input_name, self._large_in0.shape,
                                     np_to_triton_dtype(self._data_type))
            ]
            inputs[0].set_data_from_numpy(self._large_in0)
            results = client[1].infer(model_name, inputs)

            # If the inference completed, examine the results to ensure that
            # the framework and protocol support the large payload.
            self.assertTrue(
                np.array_equal(self._large_in0, results.as_numpy(output_name)),
                "output is different from input")

        if client[0] == httpclient:
            # FIXME HTTPServer cannot support large payloads. See DLIS-1776.
            inputs = [
                client[0].InferInput(input_name, self._very_large_in0.shape,
                                     np_to_triton_dtype(self._data_type))
            ]
            inputs[0].set_data_from_numpy(self._very_large_in0)
            with self.assertRaises(InferenceServerException):
                results = client[1].infer(model_name, inputs)

        # FIXME The test is terminated by a libprotobuf FATAL error when GRPC
        # sends a second request with input tensors larger than 1.3GB. GRPC is
        # therefore currently exempted from the very large tensor (3GB) case;
        # the block below should be uncommented once the GRPC issue is
        # resolved. See DLIS-2474.
        # if client[0] == grpcclient:
        #     inputs = [
        #         client[0].InferInput(input_name, self._very_large_in0.shape,
        #                              np_to_triton_dtype(self._data_type))
        #     ]
        #     inputs[0].set_data_from_numpy(self._very_large_in0)
        #     # GRPC must fail for large payloads because of a 2GB protobuf limit
        #     with self.assertRaises(InferenceServerException):
        #         results = client[1].infer(model_name, inputs)

        # Send a small payload to verify if the server is still functional
        inputs = [
            client[0].InferInput(input_name, self._small_in0.shape,
                                 np_to_triton_dtype(self._data_type))
        ]
        inputs[0].set_data_from_numpy(self._small_in0)
        results = client[1].infer(model_name, inputs)
        self.assertTrue(
            np.array_equal(self._small_in0, results.as_numpy(output_name)),
            "output is different from input")
Example #3
def crashing_client(model_name,
                    dtype,
                    tensor_shape,
                    shm_name,
                    triton_client,
                    input_name="INPUT0"):
    in0 = np.random.random(tensor_shape).astype(dtype)
    if "libtorch" in model_name:
        input_name = "INPUT__0"
    inputs = [
        grpcclient.InferInput(input_name, tensor_shape,
                              np_to_triton_dtype(dtype)),
    ]
    inputs[0].set_data_from_numpy(in0)

    # Run in a loop so that an inference is guaranteed to still be in flight
    # when the client process is terminated.
    while True:
        existing_shm = shared_memory.SharedMemory(shm_name)
        count = np.ndarray((1,), dtype=np.int32, buffer=existing_shm.buf)
        count[0] += 1
        existing_shm.close()
        results = triton_client.infer(model_name, inputs)
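# A minimal sketch (model name, tensor shape and URL are assumptions, and the
# default 'fork' start method is assumed so the client object can be handed to
# the child) of how crashing_client above might be exercised: run it in a
# child process, wait until the shared counter shows a request went out, then
# terminate the process so an inference is interrupted mid-flight.
import time
from multiprocessing import Process, shared_memory
import numpy as np
import tritonclient.grpc as grpcclient

shm = shared_memory.SharedMemory(create=True, size=4)
count = np.ndarray((1,), dtype=np.int32, buffer=shm.buf)
count[0] = 0

triton_client = grpcclient.InferenceServerClient("localhost:8001")
p = Process(target=crashing_client,
            args=("identity_fp32", np.float32, (1, 16), shm.name,
                  triton_client))
p.start()
while count[0] < 1:
    time.sleep(0.1)
p.terminate()  # stop the client while an inference may still be in flight
p.join()

shm.close()
shm.unlink()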
    # We use an identity string model that takes 1 input tensor holding a
    # single string and returns 1 output tensor holding a single string. The
    # output tensor is the same as the input tensor.
    batch_size = 1

    # Create the data for the input tensor. It contains a null character in
    # the middle of the string.
    tmp_str = "abc\0def"
    input0_data = np.array([tmp_str], dtype=object)

    # Send inference request to the inference server. Get results for
    # output tensor.
    inputs = [
        client_util.InferInput("INPUT0", input0_data.shape,
                               np_to_triton_dtype(np.object_))
    ]
    inputs[0].set_data_from_numpy(input0_data)

    results = client.infer(FLAGS.model_name, inputs)

    # We expect there to be 1 result (with batch-size 1). Compare the input
    # and output tensor calculated by the model. They must be the same.
    output0_data = results.as_numpy('OUTPUT0')
    # The element type returned differs between the HTTP and GRPC clients:
    # the former returns str and the latter returns bytes.
    output0_data2 = np.array([
        output0_data[0]
        if type(output0_data[0]) == str else output0_data[0].decode('utf8')
    ],
                             dtype=object)
    # Create the inference context for the model.
    client = client_util.InferenceServerClient(FLAGS.url,
                                               verbose=FLAGS.verbose)

    # Input tensor will be raw content from image file
    image_path = FLAGS.image_filename
    with open(image_path, "rb") as fd:
        input_data = np.array([[fd.read()]], dtype=bytes)

    expected_res_path = FLAGS.preprocessed_filename
    with open(expected_res_path, "r") as fd:
        expected_data = np.fromfile(fd, np.float32)

    inputs = [
        client_util.InferInput("INPUT", input_data.shape,
                               np_to_triton_dtype(input_data.dtype))
    ]
    inputs[0].set_data_from_numpy(input_data)

    results = client.infer(model_name, inputs)

    output = results.as_numpy("OUTPUT")
    if output is None:
        print("error: expected 'OUTPUT'")
        sys.exit(1)

    if output.shape[0] != 1:
        print("error: expected 1 output result, got {}".format(
            len(result["OUTPUT"])))
        sys.exit(1)
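    # A hedged continuation of the check above: compare the preprocessed
    # output to the reference tensor read from FLAGS.preprocessed_filename.
    # The tolerance value is an assumption.
    if not np.allclose(output.reshape(-1), expected_data, atol=1e-5):
        print("error: preprocessed result does not match {}".format(
            expected_res_path))
        sys.exit(1)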
    def _test_helper(self,
                     client,
                     model_name,
                     input_name='INPUT0',
                     output_name='OUTPUT0'):

        # FIXME libtorch seems to have an issue with handling large batch sizes; see DLIS-1770.
        if model_name.startswith('libtorch'):
            try:
                inputs = [
                    client[0].InferInput(input_name, self._large_in0.shape,
                                         np_to_triton_dtype(self._data_type))
                ]
                inputs[0].set_data_from_numpy(self._large_in0)
                results = client[1].infer(model_name, inputs)

                # If the inference completed, examine the results to ensure that
                # the framework and protocol support the large payload.
                self.assertTrue(
                    np.array_equal(self._large_in0,
                                   results.as_numpy(output_name)),
                    "output is different from input")
            except InferenceServerException as ex:
                self.assertEqual(
                    ex.message(),
                    "OUTPUT__0: failed to perform CUDA copy: invalid argument")

        # plan does not support large batch sizes.
        elif not model_name.startswith('plan'):

            inputs = [
                client[0].InferInput(input_name, self._large_in0.shape,
                                     np_to_triton_dtype(self._data_type))
            ]
            inputs[0].set_data_from_numpy(self._large_in0)
            results = client[1].infer(model_name, inputs)

            # If the inference completed, examine the results to ensure that
            # the framework and protocol support the large payload.
            self.assertTrue(
                np.array_equal(self._large_in0, results.as_numpy(output_name)),
                "output is different from input")

        if client[0] == httpclient:
            # FIXME HTTPServer cannot support large payloads. See DLIS-1776.
            inputs = [
                client[0].InferInput(input_name, self._very_large_in0.shape,
                                     np_to_triton_dtype(self._data_type))
            ]
            inputs[0].set_data_from_numpy(self._very_large_in0)
            with self.assertRaises(InferenceServerException):
                results = client[1].infer(model_name, inputs)

        if client[0] == grpcclient:
            inputs = [
                client[0].InferInput(input_name, self._very_large_in0.shape,
                                     np_to_triton_dtype(self._data_type))
            ]
            inputs[0].set_data_from_numpy(self._very_large_in0)
            # GRPC must fail for large payloads because of a 2GB protobuf limit
            with self.assertRaises(InferenceServerException):
                results = client[1].infer(model_name, inputs)

        # Send a small payload to verify if the server is still functional
        inputs = [
            client[0].InferInput(input_name, self._small_in0.shape,
                                 np_to_triton_dtype(self._data_type))
        ]
        inputs[0].set_data_from_numpy(self._small_in0)
        results = client[1].infer(model_name, inputs)
        self.assertTrue(
            np.array_equal(self._small_in0, results.as_numpy(output_name)),
            "output is different from input")
    model_name = FLAGS.model
    shape = (3, 5)
    dtype = np.float32

    # Create the inference context for the model.
    client = client_util.InferenceServerClient(FLAGS.url,
                                               verbose=FLAGS.verbose)

    # Create the data for the two input tensors.
    input_data = []
    input_data.append(np.ones((3, 5), dtype=np.float32))
    input_data.append(np.ones((3, 5), dtype=np.float32))

    inputs = []
    for i in range(len(input_data)):
        inputs.append(
            client_util.InferInput("input_{}".format(i + 1), shape,
                                   np_to_triton_dtype(dtype)))
        inputs[i].set_data_from_numpy(input_data[i])

    results = client.infer(model_name, inputs)

    # We expect one output of shape (3, 5) where each element is the sum of
    # the corresponding input elements.
    output_data = results.as_numpy('output')
    if output_data is None:
        print("error: expected 'output'")
        sys.exit(1)

    for i in range(3):
        for j in range(5):
            print(
                str(input_data[0][i][j]) + " + " + str(input_data[1][i][j]) +
                " = " + str(output_data[i][j]))
Example #8
    # Run the custom_modulo model, which depends on a custom mod operation
    model_name = FLAGS.model
    elements = 10

    # Create the inference context for the model.
    client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose)

    # Create the data for the two input tensors.
    input_data = []
    input_data.append(np.arange(start=1, stop=1+elements, dtype=np.float32))
    input_data.append(np.array([2] * elements, dtype=np.float32))

    inputs = []
    for i in range(len(input_data)):
        inputs.append(
            client_util.InferInput("INPUT__{}".format(i), input_data[0].shape,
                                   np_to_triton_dtype(input_data[0].dtype)))
        inputs[i].set_data_from_numpy(input_data[i])

    results = client.infer(model_name, inputs)

    # We expect 1 result of size 10 with alternating 1 and 0.
    output_data = results.as_numpy('OUTPUT__0')
    if output_data is None:
        print("error: expected 'OUTPUT__0'")
        sys.exit(1)

    for i in range(elements):
        print(str(i) + ": " + str(input_data[0][i]) + " % " +  str(input_data[1][i]) + " = " + str(output_data[i]))
        if ((input_data[0][i] % input_data[1][i]) != output_data[i]):
            print("error: incorrect value")
            sys.exit(1)
def check_sequence_async(client_metadata,
                         trial,
                         model_name,
                         input_dtype,
                         steps,
                         timeout_ms=DEFAULT_TIMEOUT_MS,
                         sequence_name="<unknown>"):
    """Perform sequence of inferences using async run. The 'steps' holds
    a list of tuples, one for each inference with format:

    (flag_str, value, expected_result, delay_ms)

    """
    if (("savedmodel" in trial) or ("graphdef" in trial) or
        ("custom" in trial) or ("plan" in trial)):
        tensor_shape = (
            1,
            1,
        )
    else:
        assert False, "unknown trial type: " + trial

    triton_client = client_metadata[0]
    sequence_id = client_metadata[1]

    # Execute the sequence of inferences...
    seq_start_ms = int(round(time.time() * 1000))
    user_data = UserData()
    # Ensure there is no running stream
    triton_client.stop_stream()
    triton_client.start_stream(partial(completion_callback, user_data))

    sent_count = 0
    for flag_str, value, expected_result, delay_ms in steps:
        seq_start = False
        seq_end = False
        if flag_str is not None:
            seq_start = ("start" in flag_str)
            seq_end = ("end" in flag_str)

        if input_dtype == np.object_:
            in0 = np.full(tensor_shape, value, dtype=np.int32)
            in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                            dtype=object)
            in0 = in0n.reshape(tensor_shape)
        else:
            in0 = np.full(tensor_shape, value, dtype=input_dtype)
        inputs = [
            grpcclient.InferInput("INPUT", tensor_shape,
                                  np_to_triton_dtype(input_dtype)),
        ]
        inputs[0].set_data_from_numpy(in0)

        triton_client.async_stream_infer(model_name,
                                         inputs,
                                         sequence_id=sequence_id,
                                         sequence_start=seq_start,
                                         sequence_end=seq_end)
        sent_count += 1

        if delay_ms is not None:
            time.sleep(delay_ms / 1000.0)

    # Process the results in the order that they were sent
    result = None
    processed_count = 0
    while processed_count < sent_count:
        (results, error) = user_data._completed_requests.get()
        if error is not None:
            raise error

        (_, value, expected, _) = steps[processed_count]
        processed_count += 1
        if timeout_ms is not None:
            now_ms = int(round(time.time() * 1000))
            if (now_ms - seq_start_ms) > timeout_ms:
                raise TimeoutException(
                    "Timeout expired for {}".format(sequence_name))

        result = results.as_numpy("OUTPUT")[0][0]
        if FLAGS.verbose:
            print("{} {}: + {} = {}".format(sequence_name, sequence_id, value,
                                            result))

        if expected is not None:
            if input_dtype == np.object_:
                assert int(
                    result
                ) == expected, "{}: expected result {}, got {}".format(
                    sequence_name, expected, int(result))
            else:
                assert result == expected, "{}: expected result {}, got {}".format(
                    sequence_name, expected, result)
    triton_client.stop_stream()
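# A minimal sketch of how check_sequence_async might be invoked. The model
# name, values and expected results are assumptions, and the helper relies on
# UserData, completion_callback, DEFAULT_TIMEOUT_MS and FLAGS being defined
# elsewhere in the module. client_metadata is a (client, sequence_id) pair and
# each step follows the (flag_str, value, expected_result, delay_ms) format
# described in the docstring above.
import numpy as np
import tritonclient.grpc as grpcclient

triton_client = grpcclient.InferenceServerClient("localhost:8001")
client_metadata = (triton_client, 1001)  # (client, sequence_id)

steps = [
    ("start", 1, 1, None),  # first value of the sequence
    (None, 2, 3, None),     # expected results assume an accumulating model
    ("end", 3, 6, None),
]
check_sequence_async(client_metadata,
                     "custom",                 # trial type selects tensor shape
                     "custom_sequence_int32",  # hypothetical model name
                     np.int32,
                     steps,
                     sequence_name="example_seq")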
Example #10
    # Create the inference context for the model.
    client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose)

    # We use an identity string model that takes 1 input tensor holding a
    # single string and returns 1 output tensor holding a single string. The
    # output tensor is the same as the input tensor.
    batch_size = 1

    # Create the data for the input tensor. It contains a null character in
    # the middle of the string.
    tmp_str = "abc\0def"
    input0_data = np.array([tmp_str], dtype=object)

    # Send inference request to the inference server. Get results for
    # output tensor.
    inputs = [client_util.InferInput(
                "INPUT0", input0_data.shape, np_to_triton_dtype(np.object))]
    inputs[0].set_data_from_numpy(input0_data)

    results = client.infer(FLAGS.model_name, inputs)

    # We expect there to be 1 result (with batch-size 1). Compare the input
    # and output tensor calculated by the model. They must be the same.
    output0_data = results.as_numpy('OUTPUT0')
    # The element type returned differs between the HTTP and GRPC clients:
    # the former returns str and the latter returns bytes.
    output0_data2 = np.array([
        output0_data[0]
        if type(output0_data[0]) == str else output0_data[0].decode('utf8')
    ],
                             dtype=object)

    print(input0_data, "?=?", output0_data2)
    assert np.equal(input0_data, output0_data2).all()
Example #11
    model_name = FLAGS.model
    elements = 10

    # Create the inference context for the model.
    client = client_util.InferenceServerClient(FLAGS.url, FLAGS.verbose)

    # Create the data for the two input tensors.
    input_data = []
    input_data.append(np.arange(start=1, stop=1 + elements, dtype=np.float32))
    input_data.append(np.array([2] * elements, dtype=np.float32))

    inputs = []
    for i in range(len(input_data)):
        inputs.append(
            client_util.InferInput("INPUT__{}".format(i), input_data[0].shape,
                                   np_to_triton_dtype(input_data[0].dtype)))
        inputs[i].set_data_from_numpy(input_data[i])

    results = client.infer(model_name, inputs)

    # We expect 1 result of size 10 with alternating 1 and 0.
    output_data = results.as_numpy('OUTPUT__0')
    if output_data is None:
        print("error: expected 'OUTPUT__0'")
        sys.exit(1)

    for i in range(elements):
        print(
            str(i) + ": " + str(input_data[0][i]) + " % " +
            str(input_data[1][i]) + " = " + str(output_data[i]))
        if ((input_data[0][i] % input_data[1][i]) != output_data[i]):
            print("error: incorrect value")
            sys.exit(1)
Example #12
   if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
      print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(FLAGS.protocol))
      exit(1)

   client_util = httpclient if FLAGS.protocol == "http" else grpcclient

   model_name = "param"

   # Create the inference context for the model.
   client = client_util.InferenceServerClient(FLAGS.url, FLAGS.verbose)

   # Input tensor can be any size int32 vector...
   input_data = np.zeros(shape=1, dtype=np.int32)

   inputs = [client_util.InferInput(
                  "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype))]
   inputs[0].set_data_from_numpy(input_data)

   results = client.infer(model_name, inputs)

   print(results)

   params = results.as_numpy("OUTPUT")
   if params is None:
      print("error: expected 'OUTPUT'")
      sys.exit(1)

   if params.size != 5:
      print("error: expected 5 output strings, got {}".format(params.size))
      sys.exit(1)
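    # A hedged continuation: 'params' above is assumed to hold five serialized
    # strings (HTTP returns str, GRPC returns bytes, as noted elsewhere in
    # these examples), so decode where needed before printing.
    for p in params.reshape(-1):
        print(p.decode("utf-8") if isinstance(p, bytes) else p)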
    model_name = FLAGS.model
    shape = (3, 5)
    dtype = np.float32

    # Create the inference context for the model.
    client = client_util.InferenceServerClient(FLAGS.url, FLAGS.verbose)

    # Create the data for the two input tensors.
    input_data = []
    input_data.append(np.ones((3, 5), dtype=np.float32))
    input_data.append(np.ones((3, 5), dtype=np.float32))

    inputs = []
    for i in range(len(input_data)):
        inputs.append(client_util.InferInput(
            "input_{}".format(i+1), shape, np_to_triton_dtype(dtype)))
        inputs[i].set_data_from_numpy(input_data[i])

    results = client.infer(model_name, inputs)

    # We expect one output of shape (3, 5) where each element is the sum of
    # the corresponding input elements.
    output_data = results.as_numpy('output')
    if output_data is None:
        print("error: expected 'output'")
        sys.exit(1)

    for i in range(3):
        for j in range(5):
            print(str(input_data[0][i][j]) + " + " +
                str(input_data[1][i][j]) + " = " + str(output_data[i][j]))
            if ((input_data[0][i][j] + input_data[1][i][j]) != output_data[i][j]):