Code example #1
    def setUp(self):
        global _deferred_exceptions
        _deferred_exceptions = []

        # The helper client for setup will be GRPC for simplicity.
        self.triton_client_ = grpcclient.InferenceServerClient(
            "localhost:8001")
        self.model_name_ = 'identity_2_float32'
        # This is not changed even when an ensemble is under test, since
        # dynamic batching is performed within the composing model.
        self.check_status_model = 'identity_2_float32'
        self.tensor_shape_ = (1, 1)
        self.inputs_ = {
            "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"),
            "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32")
        }
        self.input_data_ = {
            "INPUT0": np.ones(shape=(1, 1), dtype=np.float32),
            "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32)
        }
        self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"])
        self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"])
        self.outputs_ = {
            "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'),
            "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1')
        }
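As a hedged illustration (not part of the original test), the fields prepared in setUp could be exercised like this, assuming the identity model simply echoes INPUT0/INPUT1 back as OUTPUT0/OUTPUT1:

    def _infer_once(self):
        # Hypothetical helper: run one inference with the prepared tensors
        # and check the identity model's pass-through behaviour.
        results = self.triton_client_.infer(
            model_name=self.model_name_,
            inputs=list(self.inputs_.values()),
            outputs=list(self.outputs_.values()))
        np.testing.assert_allclose(results.as_numpy('OUTPUT0'),
                                   self.input_data_["INPUT0"])
        np.testing.assert_allclose(results.as_numpy('OUTPUT1'),
                                   self.input_data_["INPUT1"])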
Code example #2
def simple_string_inference(triton_client):
    model_name = 'simple_string'

    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('INPUT0', [1, 16], "BYTES"))
    inputs.append(grpcclient.InferInput('INPUT1', [1, 16], "BYTES"))

    # Create the data for the two input tensors. Initialize the first
    # to unique integers and the second to all ones.
    in0 = np.arange(start=0, stop=16, dtype=np.int32)
    in0 = np.expand_dims(in0, axis=0)
    in1 = np.ones(shape=(1, 16), dtype=np.int32)
    expected_sum = np.add(in0, in1)
    expected_diff = np.subtract(in0, in1)

    # The 'simple_string' model expects 2 BYTES tensors where each
    # element in those tensors is the utf-8 string representation of
    # an integer. The BYTES tensors must be represented by a numpy
    # array with dtype=np.object_.
    in0n = np.array([str(x).encode('utf-8') for x in in0.reshape(in0.size)],
                    dtype=np.object_)
    input0_data = in0n.reshape(in0.shape)
    in1n = np.array([str(x).encode('utf-8') for x in in1.reshape(in1.size)],
                    dtype=np.object_)
    input1_data = in1n.reshape(in1.shape)

    # Initialize the data
    inputs[0].set_data_from_numpy(input0_data)
    inputs[1].set_data_from_numpy(input1_data)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
    outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Get the output arrays from the results
    output0_data = results.as_numpy('OUTPUT0')
    output1_data = results.as_numpy('OUTPUT1')

    for i in range(16):
        print(
            str(input0_data[0][i]) + " + " + str(input1_data[0][i]) + " = " +
            str(output0_data[0][i]))
        print(
            str(input0_data[0][i]) + " - " + str(input1_data[0][i]) + " = " +
            str(output1_data[0][i]))

        # Convert result from string to int to check result
        r0 = int(output0_data[0][i])
        r1 = int(output1_data[0][i])
        if expected_sum[0][i] != r0:
            print("error: incorrect sum")
            sys.exit(1)
        if expected_diff[0][i] != r1:
            print("error: incorrect difference")
            sys.exit(1)
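A possible driver for this function (hypothetical; the original snippet does not show how it is invoked):

import sys
import numpy as np
import tritonclient.grpc as grpcclient

if __name__ == '__main__':
    # Assumes Triton is serving the 'simple_string' model on the default gRPC port.
    client = grpcclient.InferenceServerClient("localhost:8001")
    simple_string_inference(client)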
Code example #3
    def _basic_inference(self,
                         shm_ip0_handle,
                         shm_ip1_handle,
                         shm_op0_handle,
                         shm_op1_handle,
                         error_msg,
                         big_shm_name="",
                         big_shm_size=64):
        input0_data = np.arange(start=0, stop=16, dtype=np.int32)
        input1_data = np.ones(shape=16, dtype=np.int32)
        inputs = []
        outputs = []
        if _protocol == "http":
            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
            inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
            outputs.append(
                httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
        else:
            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
            inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
            inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
            outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))

        inputs[0].set_shared_memory("input0_data", 64)

        if isinstance(shm_ip1_handle, np.ndarray):
            # INPUT1 was passed as a raw numpy array instead of a shm handle.
            inputs[1].set_data_from_numpy(input1_data, binary_data=True)
        elif big_shm_name != "":
            inputs[1].set_shared_memory(big_shm_name, big_shm_size)
        else:
            inputs[1].set_shared_memory("input1_data", 64)

        outputs[0].set_shared_memory("output0_data", 64)
        outputs[1].set_shared_memory("output1_data", 64)

        try:
            results = triton_client.infer("simple",
                                          inputs,
                                          model_version="",
                                          outputs=outputs)
            output = results.get_output('OUTPUT0')
            if _protocol == "http":
                output_datatype = output['datatype']
                output_shape = output['shape']
            else:
                output_datatype = output.datatype
                output_shape = output.shape
            output_dtype = utils.triton_to_np_dtype(output_datatype)
            output_data = shm.get_contents_as_numpy(shm_op0_handle,
                                                    output_dtype, output_shape)
            self.assertTrue(
                (output_data[0] == (input0_data + input1_data)).all(),
                "Model output does not match expected output")
        except Exception as ex:
            error_msg.append(str(ex))
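_basic_inference assumes that system shared-memory regions named "input0_data", "input1_data", "output0_data" and "output1_data" were already created, filled and registered elsewhere. A minimal sketch of that setup, assuming tritonclient.utils.shared_memory and the same 64-byte sizes (the "/..." keys and helper name are illustrative, not from the original):

import numpy as np
import tritonclient.utils.shared_memory as shm

def _configure_shm(triton_client):
    # Four regions of 16 INT32 values = 64 bytes each.
    shm_ip0_handle = shm.create_shared_memory_region("input0_data", "/input0_data", 64)
    shm_ip1_handle = shm.create_shared_memory_region("input1_data", "/input1_data", 64)
    shm_op0_handle = shm.create_shared_memory_region("output0_data", "/output0_data", 64)
    shm_op1_handle = shm.create_shared_memory_region("output1_data", "/output1_data", 64)

    # Copy the input tensors into the input regions.
    shm.set_shared_memory_region(shm_ip0_handle, [np.arange(16, dtype=np.int32)])
    shm.set_shared_memory_region(shm_ip1_handle, [np.ones(16, dtype=np.int32)])

    # Make every region known to the server.
    for name, key in (("input0_data", "/input0_data"),
                      ("input1_data", "/input1_data"),
                      ("output0_data", "/output0_data"),
                      ("output1_data", "/output1_data")):
        triton_client.register_system_shared_memory(name, key, 64)

    return shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle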
Code example #4
def sync_send(triton_client, result_list, values, batch_size, sequence_id,
              model_name, model_version):

    count = 1
    for value in values:
        # Create the tensor for INPUT
        value_data = np.full(shape=[batch_size, 1],
                             fill_value=value,
                             dtype=np.int32)
        inputs = []
        inputs.append(grpcclient.InferInput('INPUT', value_data.shape,
                                            "INT32"))
        # Initialize the data
        inputs[0].set_data_from_numpy(value_data)
        outputs = []
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT'))
        # Issue the synchronous sequence inference.
        result = triton_client.infer(model_name=model_name,
                                     inputs=inputs,
                                     outputs=outputs,
                                     sequence_id=sequence_id,
                                     sequence_start=(count == 1),
                                     sequence_end=(count == len(values)))
        result_list.append(result.as_numpy('OUTPUT'))
        count = count + 1
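A possible caller for sync_send (hypothetical; the URL, sequence id, model name and values are illustrative only):

import tritonclient.grpc as grpcclient

if __name__ == '__main__':
    triton_client = grpcclient.InferenceServerClient("localhost:8001")
    result_list = []
    sync_send(triton_client, result_list, values=[1, 2, 3, 4], batch_size=1,
              sequence_id=1000, model_name="simple_sequence", model_version="")
    print(result_list)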
Code example #5
def get_result(url, model_name, x):
    try:
        triton_client = grpcclient.InferenceServerClient(url=url,
                                                         verbose=False,
                                                         ssl=False)
        print("Channel creation success")
    except Exception as e:
        print("Channel creation failed: " + str(e))
        sys.exit(1)

    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('input0', x.shape, "FP32"))
    input0_data = x
    print("X Shape : ", x.shape)
    inputs[0].set_data_from_numpy(input0_data)
    outputs.append(grpcclient.InferRequestedOutput('output0'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)
    output0_data = results.as_numpy('output0')
    output0_data = sigmoid(output0_data.squeeze())
    print(output0_data)
    return output0_data
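get_result relies on a sigmoid helper that the snippet does not define; a plain NumPy version would be:

def sigmoid(x):
    # Standard logistic function applied elementwise to the model's output.
    return 1.0 / (1.0 + np.exp(-x))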
Code example #6
def requestGenerator(batched_image_data, input_name, output_name, dtype, FLAGS):

    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(
            grpcclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            httpclient.InferInput(input_name, batched_image_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    outputs = []
    if FLAGS.protocol.lower() == "grpc":
        outputs.append(
            grpcclient.InferRequestedOutput(output_name,
                                            class_count=FLAGS.classes))
    else:
        outputs.append(
            httpclient.InferRequestedOutput(output_name,
                                            binary_data=True,
                                            class_count=FLAGS.classes))

    yield inputs, outputs, FLAGS.model_name, FLAGS.model_version
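A sketch of how this generator might be consumed, assuming FLAGS comes from argparse and a matching triton_client was created beforehand:

# Hypothetical driver for requestGenerator.
responses = []
for inputs, outputs, model_name, model_version in requestGenerator(
        batched_image_data, input_name, output_name, dtype, FLAGS):
    responses.append(
        triton_client.infer(model_name,
                            inputs,
                            model_version=model_version,
                            outputs=outputs))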
Code example #7
def identity_inference(triton_client, np_array):
    model_name = "simple_identity"
    inputs = []
    outputs = []

    inputs.append(grpcclient.InferInput('INPUT0', np_array.shape, "BYTES"))
    inputs[0].set_data_from_numpy(np_array)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)
    if (np_array.dtype == np.object_):
        print(results.as_numpy('OUTPUT0'))
        if not np.array_equal(np_array, results.as_numpy('OUTPUT0')):
            print(results.as_numpy('OUTPUT0'))
            print("error: incorrect output")
            sys.exit(1)
    else:
        encoded_results = np.char.encode(
            results.as_numpy('OUTPUT0').astype(str))
        if not np.array_equal(np_array, encoded_results):
            print(encoded_results)
            print("error: incorrect output")
            sys.exit(1)
Code example #8
File: utils.py  Project: thibaultcharrin/NVTabular
def _run_query(
    client,
    n_rows,
    model_name,
    workflow_path,
    data_path,
    actual_output_filename,
    output_name,
    input_cols_name=None,
    backend="tensorflow",
):

    workflow = nvt.Workflow.load(workflow_path)

    if input_cols_name is None:
        batch = cudf.read_csv(
            data_path, nrows=n_rows)[workflow.output_node.input_columns.names]
    else:
        batch = cudf.read_csv(data_path, nrows=n_rows)[input_cols_name]

    input_dtypes = workflow.input_dtypes
    columns = [(col, batch[col]) for col in batch.columns]

    inputs = []
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(input_dtypes[name])
        d = d.reshape(len(d), 1)
        inputs.append(
            grpcclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(input_dtypes[name])))
        inputs[i].set_data_from_numpy(d)

    outputs = [grpcclient.InferRequestedOutput(output_name)]
    time_start = dt.datetime.now()
    response = client.infer(model_name,
                            inputs,
                            request_id="1",
                            outputs=outputs)
    run_time = dt.datetime.now() - time_start

    output_key = "output" if backend == "hugectr" else "0"

    output_actual = cudf.read_csv(os.path.expanduser(actual_output_filename),
                                  nrows=n_rows)
    output_actual = cp.asnumpy(output_actual[output_key].values)
    output_predict = response.as_numpy(output_name)

    if backend == "tensorflow":
        output_predict = output_predict[:, 0]

    diff = abs(output_actual - output_predict)
    return diff, run_time
Code example #9
    def setUp(self):
        global _deferred_exceptions
        _deferred_exceptions = []

        # The helper client for setup will be GRPC for simplicity.
        self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001")
        self.model_name_ = 'identity_2_float32'
        self.tensor_shape_ = (1, 1)
        self.inputs_ = {
            "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"),
            "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32")
        }
        self.input_data_ = {
            "INPUT0": np.ones(shape=(1, 1), dtype=np.float32),
            "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32)
        }
        self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"])
        self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"])
        self.outputs_ = {
            "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'),
            "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1')
        }
Code example #10
    def test_grpc_out_of_shared_memory(self):
        triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
        inputs = []
        inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8"))
        inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8))

        # Set up CUDA shared memory regions for the outputs that are too
        # small, and expect the query to report the default (CPU) location.
        triton_client.unregister_system_shared_memory()
        triton_client.unregister_cuda_shared_memory()
        shm_op0_handle = cudashm.create_shared_memory_region(
            "output0_data", 1, 0)
        shm_op1_handle = cudashm.create_shared_memory_region(
            "output1_data", 1, 0)
        triton_client.register_cuda_shared_memory(
            "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1)
        triton_client.register_cuda_shared_memory(
            "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1)
        outputs = []
        outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
        outputs[-1].set_shared_memory("output0_data", 1)

        outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
        outputs[-1].set_shared_memory("output1_data", 1)

        try:
            triton_client.infer(model_name="query",
                                inputs=inputs,
                                outputs=outputs)
            self.assertTrue(False, "expect error with query information")
        except InferenceServerException as ex:
            self.assertTrue("OUTPUT0 CPU 0" in ex.message())
            self.assertTrue("OUTPUT1 CPU 0" in ex.message())

        cudashm.destroy_shared_memory_region(shm_op0_handle)
        cudashm.destroy_shared_memory_region(shm_op1_handle)
        triton_client.unregister_system_shared_memory()
        triton_client.unregister_cuda_shared_memory()
Code example #11
    def _prepare_request(self, protocol):
        if (protocol == "grpc"):
            self.inputs_ = []
            self.inputs_.append(
                grpcclient.InferInput('INPUT0', [1, 1], "INT32"))
            self.outputs_ = []
            self.outputs_.append(grpcclient.InferRequestedOutput('OUTPUT0'))
        else:
            self.inputs_ = []
            self.inputs_.append(
                httpclient.InferInput('INPUT0', [1, 1], "INT32"))
            self.outputs_ = []
            self.outputs_.append(httpclient.InferRequestedOutput('OUTPUT0'))

        self.inputs_[0].set_data_from_numpy(self.input0_data_)
Code example #12
    def run(self, client_metadata):
        triton_client = client_metadata[0]

        inputs = [
            grpcclient.InferInput("input", self.image_data_.shape, "FP32")
        ]
        inputs[0].set_data_from_numpy(self.image_data_)

        outputs = [
            grpcclient.InferRequestedOutput("resnet_v1_50/predictions/Softmax",
                                            class_count=1)
        ]
        res = triton_client.infer(self.model_name_, inputs, outputs=outputs)
        self.postprocess(res)
        return self.batch_size_
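The postprocess method is not part of this snippet. Because the output is requested with class_count=1, each returned element is a "value:index:label" string (the same format parsed in code example #24), so a hedged sketch could be:

    def postprocess(self, results):
        # Hypothetical top-1 decoder for a classification output requested
        # with class_count=1.
        class_list = results.as_numpy("resnet_v1_50/predictions/Softmax")
        for entry in class_list.flatten():
            text = entry.decode('utf-8') if isinstance(entry, bytes) else str(entry)
            value, index = text.split(':')[:2]
            print("top-1 class {} (score {})".format(index, value))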
Code example #13
def requestGenerator(batched_image_data, input_name, output_name, dtype, model_name, model_version, classes=1):

    # Set the input data
    inputs = []

    inputs.append(
        grpcclient.InferInput(input_name, batched_image_data.shape, dtype))
    inputs[0].set_data_from_numpy(batched_image_data)

    outputs = []

    outputs.append(
        grpcclient.InferRequestedOutput(output_name,
                                        class_count=classes))

    yield inputs, outputs, model_name, model_version
Code example #14
    def test_infer(self, data, it):
        assert (len(data) == len(self.input_names))
        if (len(data) > 1):
            for b in data:
                assert b.shape[0] == data[0].shape[0]
        inputs = [
            self._get_input(batch, name)
            for batch, name in zip(data, self.input_names)
        ]
        outputs = [
            t_client.InferRequestedOutput(name) for name in self.output_names
        ]
        res = self.client.infer(model_name=self.model_name,
                                inputs=inputs,
                                outputs=outputs)
        res_data = [res.as_numpy(name) for name in self.output_names]
        return it, data, res_data
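The _get_input helper referenced above is not included in the snippet. Assuming t_client is one of the tritonclient modules and np_to_triton_dtype comes from tritonclient.utils, it might be as simple as:

    def _get_input(self, batch, name):
        # Wrap a numpy batch in an InferInput with the matching Triton dtype.
        infer_input = t_client.InferInput(name, batch.shape,
                                          np_to_triton_dtype(batch.dtype))
        infer_input.set_data_from_numpy(batch)
        return infer_input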
Code example #15
    def get_embedding(self, face_img):
        if not isinstance(face_img, list):
            face_img = [face_img]

        face_img = np.stack(face_img)

        input_size = tuple(face_img[0].shape[0:2][::-1])
        blob = cv2.dnn.blobFromImages(
            face_img,
            1.0 / self.input_std,
            input_size, (self.input_mean, self.input_mean, self.input_mean),
            swapRB=True)

        blob = blob.astype(triton_to_np_dtype(self.dtype))

        inputs = []
        inputs.append(
            grpcclient.InferInput(self.input_name,
                                  [blob.shape[0], self.c, self.h, self.w],
                                  "FP32"))
        # inputs[0].set_data_from_numpy(face_img)

        cudashm.set_shared_memory_region(self.in_handle, [blob])
        input_bytesize = 12 * blob.shape[0] * self.w * self.h
        inputs[-1].set_shared_memory(self.in_handle_name, input_bytesize)

        outputs = []
        out_bytesize = 12 * 512 * self.max_batch_size
        outputs.append(grpcclient.InferRequestedOutput(self.output_name[0]))
        outputs[-1].set_shared_memory(self.out_handle_name, out_bytesize)

        out = self.triton_client.infer(self.model_name,
                                       inputs,
                                       model_version=self.model_version,
                                       outputs=outputs)

        out = [
            cudashm.get_contents_as_numpy(self.out_handle,
                                          triton_to_np_dtype(self.dtype),
                                          [blob.shape[0], 512])
        ]
        # out = [out.as_numpy(e) for e in self.output_name]

        return out[0]
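get_embedding assumes the CUDA shared-memory regions behind in_handle and out_handle were created and registered elsewhere (cudashm being tritonclient.utils.cuda_shared_memory). A rough sketch of that setup, reusing the byte sizes from the snippet; the method name and structure are assumptions:

    def _setup_cuda_shm(self):
        # 3 FP32 channels -> 12 bytes per pixel, mirroring the sizes above.
        in_bytesize = 12 * self.max_batch_size * self.w * self.h
        out_bytesize = 12 * 512 * self.max_batch_size

        self.in_handle = cudashm.create_shared_memory_region(
            self.in_handle_name, in_bytesize, 0)
        self.out_handle = cudashm.create_shared_memory_region(
            self.out_handle_name, out_bytesize, 0)

        self.triton_client.register_cuda_shared_memory(
            self.in_handle_name, cudashm.get_raw_handle(self.in_handle), 0,
            in_bytesize)
        self.triton_client.register_cuda_shared_memory(
            self.out_handle_name, cudashm.get_raw_handle(self.out_handle), 0,
            out_bytesize)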
Code example #16
    def run(self, input):
        inputs = []
        outputs = [
            grpcclient.InferRequestedOutput(e) for e in self.output_order
        ]
        inputs.append(
            grpcclient.InferInput(self.input_name, [1, self.c, self.h, self.w],
                                  self.dtype))
        # inputs[0].set_data_from_numpy(input)
        cudashm.set_shared_memory_region(self.in_handle, [input])
        inputs[-1].set_shared_memory(self.in_handle_name, self.input_bytesize)

        out = self.triton_client.infer(self.model_name,
                                       inputs,
                                       model_version=self.model_version,
                                       outputs=outputs)

        out = [out.as_numpy(e) for e in self.output_order]

        return out
Code example #17
def requestGenerator(input_name, input_data, output_name, dtype, protocol):

    # Set the input data
    inputs = []
    if protocol.lower() == "grpc":
        inputs.append(grpcclient.InferInput(input_name, input_data.shape,
                                            dtype))
        inputs[0].set_data_from_numpy(input_data)
    else:
        inputs.append(httpclient.InferInput(input_name, input_data.shape,
                                            dtype))
        inputs[0].set_data_from_numpy(input_data, binary_data=True)

    outputs = []
    if protocol.lower() == "grpc":
        outputs.append(grpcclient.InferRequestedOutput(output_name))
    else:
        outputs.append(
            httpclient.InferRequestedOutput(output_name, binary_data=True))

    return inputs, outputs
Code example #18
def test_nvt_hugectr_inference(n_rows, err_tol):
    warnings.simplefilter("ignore")

    model_name = "test_model_ens"
    col_names = ["userId", "movieId", "new_cat1"]
    # read in a batch of data to get transforms for
    batch = cudf.read_csv(DATA_DIR + "test/data.csv", nrows=n_rows)[col_names]

    # convert the batch to Triton inputs
    columns = [(col, batch[col]) for col in col_names]
    inputs = []

    col_dtypes = [np.int64, np.int64, np.int64]
    for i, (name, col) in enumerate(columns):
        d = col.values_host.astype(col_dtypes[i])
        d = d.reshape(len(d), 1)
        inputs.append(
            httpclient.InferInput(name, d.shape,
                                  np_to_triton_dtype(col_dtypes[i])))
        inputs[i].set_data_from_numpy(d)

    # placeholder variables for the output
    outputs = []
    outputs.append(httpclient.InferRequestedOutput("OUTPUT0"))
    # make the request
    with httpclient.InferenceServerClient("localhost:8000") as client:
        response = client.infer(model_name,
                                inputs,
                                request_id=str(1),
                                outputs=outputs)

    output_actual = cudf.read_csv(DATA_DIR + "test/output.csv", nrows=n_rows)
    output_actual = cp.asnumpy(output_actual["output"].values)
    output_predict = response.as_numpy("OUTPUT0")

    diff = abs(output_actual - output_predict)

    assert (diff < err_tol).all()
Code example #19
    def inputs_outputs_generator(self, raw_inputs):
        """
        Generate inputs and outptus blob for triton client inference
        :param raw_inputs: list of raw numpy inputs
        :return: inputs outputs data
        """
        inputs = []
        for input_specs, raw_input in zip(self.inputs_specs, raw_inputs):
            # parse data type
            raw_input = raw_input.astype(
                triton_to_np_dtype(input_specs.datatype))
            infer_input = grpcclient.InferInput(input_specs.name,
                                                raw_input.shape,
                                                input_specs.datatype)
            infer_input.set_data_from_numpy(raw_input)
            inputs.append(infer_input)

        outputs = []
        for output_specs in self.outputs_specs:
            outputs.append(
                grpcclient.InferRequestedOutput(output_specs.name,
                                                class_count=0))
        return inputs, outputs
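A brief hedged usage of this generator (the triton_client attribute and model_name are assumptions, not shown in the snippet):

    def predict(self, raw_inputs):
        # Hypothetical wrapper: build the blobs and run a single inference.
        inputs, outputs = self.inputs_outputs_generator(raw_inputs)
        response = self.triton_client.infer(self.model_name,
                                            inputs,
                                            outputs=outputs)
        return [response.as_numpy(spec.name) for spec in self.outputs_specs]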
Code example #20
    def infer(self, input_img, obj, confidence):
        out = "crop/" + input_img.split("/")[1]
        # IMAGE MODE
        # print("Running in 'image' mode")
        if not input_img:
            print("FAILED: no input image")
            sys.exit(1)

        inputs = []
        outputs = []
        inputs.append(grpcclient.InferInput('data', [1, 3, 128, 64], "FP32"))
        outputs.append(grpcclient.InferRequestedOutput('prob'))

        # print("Creating buffer from image file...")
        input_image = cv2.imread(input_img)
        h = int(input_image.shape[0])
        w = int(input_image.shape[1])
        input_image = input_image[int(h * obj[3]):int(h * obj[4]),
                                  int(w * obj[1]):int(w * obj[2])]
        print(obj)
        cv2.imwrite(out, input_image)
        if input_image is None:
            print(f"FAILED: could not load input image {str(input_img)}")
            sys.exit(1)
        input_image_buffer = preprocess_suit_img(input_image, 128, 64)
        input_image_buffer = np.expand_dims(input_image_buffer, axis=0)
        inputs[0].set_data_from_numpy(input_image_buffer)

        # print("Invoking inference...")
        results = self.triton_client.infer(model_name=self.model,
                                           inputs=inputs,
                                           outputs=outputs,
                                           client_timeout=self.client_timeout)
        result = results.as_numpy('prob')
        # print (result[0])
        return np.argmax(result[0])
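preprocess_suit_img is not shown. Given the declared input of shape [1, 3, 128, 64] (FP32, NCHW), it presumably resizes the crop to 64x128 and converts HWC to CHW float; the normalization below is an assumption:

def preprocess_suit_img(image, target_h=128, target_w=64):
    # Resize to the model's expected spatial size, scale to [0, 1] and
    # move channels first.
    resized = cv2.resize(image, (target_w, target_h))
    return resized.transpose(2, 0, 1).astype(np.float32) / 255.0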
Code example #21
    async def request(self, model_name, clientName, x):
        
        print(x.shape)
        inputs  = [ grpcclient.InferInput('input0', x.shape, "FP32") ]
        outputs = [ grpcclient.InferRequestedOutput('output0') ]

        inputs[0].set_data_from_numpy(x)
        
        if self.DEBUG:
            stime = time.time()
        results = await self.loop.run_in_executor(None, lambda: self.tritonclient[clientName].infer(model_name=model_name, \
                                                                                                    inputs=inputs, \
                                                                                                    outputs=outputs))
        
        if self.DEBUG:
            distime = time.time() - stime
            self.Elapsed[model_name] = distime
            print("[Request] Elapsed (After Request) {} {:.3f} second".format(model_name, distime))

        output0_data = results.as_numpy('output0')
        output0_data = sigmoid(output0_data.squeeze())

        return output0_data
Code example #22
    def __init__(self,
                 url="localhost:8001",
                 model_name="yolov5",
                 model_version="",
                 verbose=False) -> None:
        self.triton_client = grpcclient.InferenceServerClient(url=url,
                                                              verbose=verbose)

        # To make sure no shared memory regions are registered with the server.
        self.triton_client.unregister_system_shared_memory()
        self.triton_client.unregister_cuda_shared_memory()

        # The YOLO model takes one input tensor with dims [1, 3, 640, 640]
        # and returns 4 output tensors:
        # dims: [1,25200,6]
        # dims: [1,3,80,80,6]
        # dims: [1,3,40,40,6]
        # dims: [1,3,20,20,6]
        self.model_name = model_name
        self.model_version = model_version

        # Create the data for the input and 4 outputs tensors
        input_images = np.zeros((1, 3, 640, 640), dtype=np.float32)
        output = np.zeros((1, 25200, 6), dtype=np.float32)
        output_397 = np.zeros((1, 3, 80, 80, 6), dtype=np.float32)
        output_458 = np.zeros((1, 3, 40, 40, 6), dtype=np.float32)
        output_519 = np.zeros((1, 3, 20, 20, 6), dtype=np.float32)

        # Calc input/output tensors sizes
        input_images_byte_size = input_images.size * input_images.itemsize
        output_byte_size = output.size * output.itemsize
        output_397_byte_size = output_397.size * output_397.itemsize
        output_458_byte_size = output_458.size * output_458.itemsize
        output_519_byte_size = output_519.size * output_519.itemsize

        # Create outputs in Shared Memory and store shared memory handles
        self.output_handle = shm.create_shared_memory_region(
            "output", "/output", output_byte_size)
        self.output_397_handle = shm.create_shared_memory_region(
            "output_397", "/output_397", output_397_byte_size)
        self.output_458_handle = shm.create_shared_memory_region(
            "output_458", "/output_458", output_458_byte_size)
        self.output_519_handle = shm.create_shared_memory_region(
            "output_519", "/output_519", output_519_byte_size)

        # Register outputs shared memory with Triton Server
        self.triton_client.register_system_shared_memory(
            "output", "/output", output_byte_size)
        self.triton_client.register_system_shared_memory(
            "output_397", "/output_397", output_397_byte_size)
        self.triton_client.register_system_shared_memory(
            "output_458", "/output_458", output_458_byte_size)
        self.triton_client.register_system_shared_memory(
            "output_519", "/output_519", output_519_byte_size)

        # Create inputs in Shared Memory and store shared memory handles
        self.input_images_handle = shm.create_shared_memory_region(
            "images", "/images", input_images_byte_size)
        # Register inputs shared memory with Triton Server
        self.triton_client.register_system_shared_memory(
            "images", "/images", input_images_byte_size)

        # Set the parameters to use data from shared memory
        self.inputs = []
        self.inputs.append(
            grpcclient.InferInput('images', [1, 3, 640, 640], "FP32"))
        self.inputs[-1].set_shared_memory("images", input_images_byte_size)

        self.outputs = []
        self.outputs.append(grpcclient.InferRequestedOutput('output'))
        self.outputs[-1].set_shared_memory("output", output_byte_size)

        self.predict(input_images)
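The predict method called at the end is not part of this snippet. Given the regions registered above, a plausible sketch copies the batch into the "images" region, runs the inference, and reads the "output" region back; it is an assumption, not the original implementation:

    def predict(self, input_images):
        # Copy the input batch into the registered system shared-memory region.
        shm.set_shared_memory_region(self.input_images_handle, [input_images])

        results = self.triton_client.infer(model_name=self.model_name,
                                           inputs=self.inputs,
                                           outputs=self.outputs,
                                           model_version=self.model_version)

        # OUTPUT lives in shared memory, so read it from there.
        return shm.get_contents_as_numpy(self.output_handle, np.float32,
                                         [1, 25200, 6])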
Code example #23
                                                "/input0_simple",
                                                input0_byte_size)
    triton_client.register_system_shared_memory("input1_data",
                                                "/input1_simple",
                                                input1_byte_size)

    # Set the parameters to use data from shared memory
    inputs = []
    inputs.append(grpcclient.InferInput('INPUT0', [1, 16], "BYTES"))
    inputs[-1].set_shared_memory("input0_data", input0_byte_size)

    inputs.append(grpcclient.InferInput('INPUT1', [1, 16], "BYTES"))
    inputs[-1].set_shared_memory("input1_data", input1_byte_size)

    outputs = []
    outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
    outputs[-1].set_shared_memory("output0_data", output0_byte_size)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))
    outputs[-1].set_shared_memory("output1_data", output1_byte_size)

    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    # Read results from the shared memory.
    output0 = results.get_output("OUTPUT0")
    if output0 is not None:
        print(utils.triton_to_np_dtype(output0.datatype))
        output0_data = shm.get_contents_as_numpy(
            shm_op0_handle, utils.triton_to_np_dtype(output0.datatype),
Code example #24
def infer_exact(tester,
                pf,
                tensor_shape,
                batch_size,
                input_dtype,
                output0_dtype,
                output1_dtype,
                output0_raw=True,
                output1_raw=True,
                model_version=None,
                swap=False,
                outputs=("OUTPUT0", "OUTPUT1"),
                use_http=True,
                use_grpc=True,
                use_http_json_tensors=True,
                skip_request_id_check=False,
                use_streaming=True,
                correlation_id=0,
                shm_region_names=None,
                precreated_shm_regions=None,
                use_system_shared_memory=False,
                use_cuda_shared_memory=False,
                priority=0,
                timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    # configs [ url, protocol, async stream, binary data ]
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if output0_raw == output1_raw:
            # Float16 not supported for Input and Output via JSON
            if use_http_json_tensors and (input_dtype != np.float16) and \
               (output0_dtype != np.float16) and (output1_dtype != np.float16):
                configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))

    # outputs are sum and difference of inputs so set max input
    # values so that they will not overflow the output. This
    # allows us to do an exact match. For float types use 8, 16,
    # 32 int range for fp 16, 32, 64 respectively. When getting
    # class outputs the result value/probability is returned as a
    # float so must use fp32 range in that case.
    rinput_dtype = _range_repr_dtype(input_dtype)
    routput0_dtype = _range_repr_dtype(
        output0_dtype if output0_raw else np.float32)
    routput1_dtype = _range_repr_dtype(
        output1_dtype if output1_raw else np.float32)
    val_min = max(
        np.iinfo(rinput_dtype).min,
        np.iinfo(routput0_dtype).min,
        np.iinfo(routput1_dtype).min) / 2
    val_max = min(
        np.iinfo(rinput_dtype).max,
        np.iinfo(routput0_dtype).max,
        np.iinfo(routput1_dtype).max) / 2

    num_classes = 3

    input0_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    input1_array = np.random.randint(low=val_min,
                                     high=val_max,
                                     size=tensor_shape,
                                     dtype=rinput_dtype)
    if input_dtype != np.object:
        input0_array = input0_array.astype(input_dtype)
        input1_array = input1_array.astype(input_dtype)

    if not swap:
        output0_array = input0_array + input1_array
        output1_array = input0_array - input1_array
    else:
        output0_array = input0_array - input1_array
        output1_array = input0_array + input1_array

    if output0_dtype == np.object:
        output0_array = np.array(
            [str(x) for x in output0_array.flatten()],
            dtype=object).reshape(output0_array.shape)
    else:
        output0_array = output0_array.astype(output0_dtype)
    if output1_dtype == np.object:
        output1_array = np.array(
            [str(x) for x in output1_array.flatten()],
            dtype=object).reshape(output1_array.shape)
    else:
        output1_array = output1_array.astype(output1_dtype)

    if input_dtype == np.object:
        in0n = np.array(
            [str(x) for x in input0_array.reshape(input0_array.size)],
            dtype=object)
        input0_array = in0n.reshape(input0_array.shape)
        in1n = np.array(
            [str(x) for x in input1_array.reshape(input1_array.size)],
            dtype=object)
        input1_array = in1n.reshape(input1_array.shape)

    # prepend size of string to output string data
    if output0_dtype == np.object:
        if batch_size == 1:
            output0_array_tmp = serialize_byte_tensor_list([output0_array])
        else:
            output0_array_tmp = serialize_byte_tensor_list(output0_array)
    else:
        output0_array_tmp = output0_array

    if output1_dtype == np.object:
        if batch_size == 1:
            output1_array_tmp = serialize_byte_tensor_list([output1_array])
        else:
            output1_array_tmp = serialize_byte_tensor_list(output1_array)
    else:
        output1_array_tmp = output1_array

    # Get model platform
    model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                   output1_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    if platform == "pytorch_libtorch":
        OUTPUT0 = "OUTPUT__0"
        OUTPUT1 = "OUTPUT__1"
        INPUT0 = "INPUT__0"
        INPUT1 = "INPUT__1"
    else:
        OUTPUT0 = "OUTPUT0"
        OUTPUT1 = "OUTPUT1"
        INPUT0 = "INPUT0"
        INPUT1 = "INPUT1"

    output0_byte_size = sum([o0.nbytes for o0 in output0_array_tmp])
    output1_byte_size = sum([o1.nbytes for o1 in output1_array_tmp])

    if batch_size == 1:
        input0_list = [input0_array]
        input1_list = [input1_array]
    else:
        input0_list = [x for x in input0_array]
        input1_list = [x for x in input1_array]

    # Serialization of string tensors in the case of shared memory must be done manually
    if input_dtype == np.object:
        input0_list_tmp = serialize_byte_tensor_list(input0_list)
        input1_list_tmp = serialize_byte_tensor_list(input1_list)
    else:
        input0_list_tmp = input0_list
        input1_list_tmp = input1_list

    input0_byte_size = sum([i0.nbytes for i0 in input0_list_tmp])
    input1_byte_size = sum([i1.nbytes for i1 in input1_list_tmp])

    # Create system/cuda shared memory regions if needed
    shm_regions, shm_handles = su.create_set_shm_regions(
        input0_list_tmp, input1_list_tmp, output0_byte_size, output1_byte_size,
        outputs, shm_region_names, precreated_shm_regions,
        use_system_shared_memory, use_cuda_shared_memory)

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_model_name(pf, input_dtype, output0_dtype,
                                       output1_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        if config[1] == "http":
            inputs.append(
                httpclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                httpclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
        else:
            inputs.append(
                grpcclient.InferInput(INPUT0, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))
            inputs.append(
                grpcclient.InferInput(INPUT1, tensor_shape,
                                      np_to_triton_dtype(input_dtype)))

        if not (use_cuda_shared_memory or use_system_shared_memory):
            if config[1] == "http":
                inputs[0].set_data_from_numpy(input0_array,
                                              binary_data=config[3])
                inputs[1].set_data_from_numpy(input1_array,
                                              binary_data=config[3])
            else:
                inputs[0].set_data_from_numpy(input0_array)
                inputs[1].set_data_from_numpy(input1_array)
        else:
            # Register necessary shared memory regions/handles
            su.register_add_shm_regions(inputs, outputs, shm_regions,
                                        precreated_shm_regions, shm_handles,
                                        input0_byte_size, input1_byte_size,
                                        output0_byte_size, output1_byte_size,
                                        use_system_shared_memory,
                                        use_cuda_shared_memory, triton_client)

        if batch_size == 1:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape((1, ) + tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape((1, ) + tensor_shape)
            ]
        else:
            expected0_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output0_array.reshape(tensor_shape)
            ]
            expected1_sort_idx = [
                np.flip(np.argsort(x.flatten()), 0)
                for x in output1_array.reshape(tensor_shape)
            ]

        # Force binary_data = False for shared memory and class
        output_req = []
        i = 0
        if "OUTPUT0" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(OUTPUT0,
                                                        binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT0))

                output_req[-1].set_shared_memory(shm_regions[2] + '_data',
                                                 output0_byte_size)
            else:
                if output0_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT0))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT0,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT0, class_count=num_classes))
            i += 1
        if "OUTPUT1" in outputs:
            if len(shm_regions) != 0:
                if config[1] == "http":
                    output_req.append(
                        httpclient.InferRequestedOutput(OUTPUT1,
                                                        binary_data=config[3]))
                else:
                    output_req.append(grpcclient.InferRequestedOutput(OUTPUT1))

                output_req[-1].set_shared_memory(shm_regions[2 + i] + '_data',
                                                 output1_byte_size)
            else:
                if output1_raw:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1, binary_data=config[3]))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(OUTPUT1))
                else:
                    if config[1] == "http":
                        output_req.append(
                            httpclient.InferRequestedOutput(
                                OUTPUT1,
                                binary_data=config[3],
                                class_count=num_classes))
                    else:
                        output_req.append(
                            grpcclient.InferRequestedOutput(
                                OUTPUT1, class_count=num_classes))

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()))
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()))

        last_response = results.get_response()

        if not skip_request_id_check:
            global _seen_request_ids
            if config[1] == "http":
                request_id = int(last_response["id"])
            else:
                request_id = int(last_response.id)
            tester.assertFalse(request_id in _seen_request_ids,
                               "request_id: {}".format(request_id))
            _seen_request_ids.add(request_id)

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(str(response_model_version), model_version)

        tester.assertEqual(len(response_outputs), len(outputs))

        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            if ((result_name == OUTPUT0 and output0_raw)
                    or (result_name == OUTPUT1 and output1_raw)):
                if use_system_shared_memory or use_cuda_shared_memory:
                    if result_name == OUTPUT0:
                        shm_handle = shm_handles[2]
                    else:
                        shm_handle = shm_handles[3]

                    output = results.get_output(result_name)
                    if config[1] == "http":
                        output_datatype = output['datatype']
                        output_shape = output['shape']
                    else:
                        output_datatype = output.datatype
                        output_shape = output.shape
                    output_dtype = triton_to_np_dtype(output_datatype)
                if use_system_shared_memory:
                    output_data = shm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                elif use_cuda_shared_memory:
                    output_data = cudashm.get_contents_as_numpy(
                        shm_handle, output_dtype, output_shape)
                else:
                    output_data = results.as_numpy(result_name)

                if (output_data.dtype == np.object) and (config[3] == False):
                    output_data = output_data.astype(np.bytes_)

                if result_name == OUTPUT0:
                    tester.assertTrue(
                        np.array_equal(output_data, output0_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT0, output0_array, output_data))
                elif result_name == OUTPUT1:
                    tester.assertTrue(
                        np.array_equal(output_data, output1_array),
                        "{}, {} expected: {}, got {}".format(
                            model_name, OUTPUT1, output1_array, output_data))
                else:
                    tester.assertTrue(
                        False, "unexpected raw result {}".format(result_name))
            else:
                for b in range(batch_size):
                    # num_classes values must be returned and must
                    # match expected top values
                    if "nobatch" in pf:
                        class_list = results.as_numpy(result_name)
                    else:
                        class_list = results.as_numpy(result_name)[b]

                    tester.assertEqual(len(class_list), num_classes)
                    if batch_size == 1:
                        expected0_flatten = output0_array.flatten()
                        expected1_flatten = output1_array.flatten()
                    else:
                        expected0_flatten = output0_array[b].flatten()
                        expected1_flatten = output1_array[b].flatten()

                    for idx, class_label in enumerate(class_list):
                        # can't compare indices since could have different
                        # indices with the same value/prob, so check that
                        # the value of each index equals the expected value.
                        # Only compare labels when the indices are equal.
                        if type(class_label) == str:
                            ctuple = class_label.split(':')
                        else:
                            ctuple = "".join(chr(x)
                                             for x in class_label).split(':')
                        cval = float(ctuple[0])
                        cidx = int(ctuple[1])
                        if result_name == OUTPUT0:
                            tester.assertEqual(cval, expected0_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected0_flatten[expected0_sort_idx[b][idx]])
                            if cidx == expected0_sort_idx[b][idx]:
                                tester.assertEqual(
                                    ctuple[2], 'label{}'.format(
                                        expected0_sort_idx[b][idx]))
                        elif result_name == OUTPUT1:
                            tester.assertEqual(cval, expected1_flatten[cidx])
                            tester.assertEqual(
                                cval,
                                expected1_flatten[expected1_sort_idx[b][idx]])
                        else:
                            tester.assertTrue(
                                False, "unexpected class result {}".format(
                                    result_name))

    # Unregister system/cuda shared memory regions if they exist
    su.unregister_cleanup_shm_regions(shm_regions, shm_handles,
                                      precreated_shm_regions, outputs,
                                      use_system_shared_memory,
                                      use_cuda_shared_memory)

    return results
Code example #25
def infer_zero(tester,
               pf,
               batch_size,
               tensor_dtype,
               input_shapes,
               output_shapes,
               model_version=None,
               use_http=True,
               use_grpc=True,
               use_http_json_tensors=True,
               use_streaming=True,
               shm_region_name_prefix=None,
               use_system_shared_memory=False,
               use_cuda_shared_memory=False,
               priority=0,
               timeout_us=0):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False, True))
        if use_http_json_tensors and (tensor_dtype != np.float16):
            configs.append(("localhost:8000", "http", False, False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False, False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True, False))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if shm_region_name_prefix is None:
        shm_region_name_prefix = ["input", "output"]

    input_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()

    # Get model platform
    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    if configs[0][1] == "http":
        metadata_client = httpclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata["platform"]
    else:
        metadata_client = grpcclient.InferenceServerClient(configs[0][0],
                                                           verbose=True)
        metadata = metadata_client.get_model_metadata(model_name)
        platform = metadata.platform

    for io_num in range(io_cnt):
        if platform == "pytorch_libtorch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_shape = input_shapes[io_num]
        output_shape = output_shapes[io_num]

        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if (rtensor_dtype != np.bool):
            input_array = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                            high=np.iinfo(rtensor_dtype).max,
                                            size=input_shape,
                                            dtype=rtensor_dtype)
        else:
            input_array = np.random.choice(a=[False, True], size=input_shape)
        if tensor_dtype != np.object:
            input_array = input_array.astype(tensor_dtype)
            expected_array = np.ndarray.copy(input_array)
        else:
            expected_array = np.array(
                [str(x) for x in input_array.flatten()], dtype=object)
            input_array = np.array([str(x) for x in input_array.flatten()],
                                   dtype=object).reshape(input_array.shape)

        expected_array = expected_array.reshape(output_shape)
        expected_dict[output_name] = expected_array

        output_byte_size = expected_array.nbytes

        if batch_size == 1:
            input_list = [input_array]
        else:
            input_list = [x for x in input_array]

        # Serialization of string tensors in the case of shared memory must be done manually
        if tensor_dtype == np.object:
            input_list_tmp = serialize_byte_tensor_list(input_list)
        else:
            input_list_tmp = input_list

        input_byte_size = sum([ip.nbytes for ip in input_list_tmp])

        # create and register shared memory region for inputs and outputs
        shm_io_handles = su.create_set_either_shm_region(
            [
                shm_region_name_prefix[0] + str(io_num),
                shm_region_name_prefix[1] + str(io_num)
            ], input_list_tmp, input_byte_size, output_byte_size,
            use_system_shared_memory, use_cuda_shared_memory)

        if len(shm_io_handles) != 0:
            shm_ip_handles.append(shm_io_handles[0])
            shm_op_handles.append(shm_io_handles[1])
        input_dict[input_name] = input_array

    if model_version is not None:
        model_version = str(model_version)
    else:
        model_version = ""

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        if config[1] == "http":
            triton_client = httpclient.InferenceServerClient(config[0],
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(config[0],
                                                             verbose=True)

        inputs = []
        output_req = []
        for io_num, (input_name, output_name) in enumerate(
                zip(input_dict.keys(), expected_dict.keys())):
            input_data = input_dict[input_name]
            input_byte_size = input_data.nbytes
            output_byte_size = expected_dict[output_name].nbytes
            if config[1] == "http":
                inputs.append(
                    httpclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(
                    httpclient.InferRequestedOutput(output_name,
                                                    binary_data=config[3]))
            else:
                inputs.append(
                    grpcclient.InferInput(input_name, input_data.shape,
                                          np_to_triton_dtype(tensor_dtype)))
                output_req.append(grpcclient.InferRequestedOutput(output_name))

            if not (use_cuda_shared_memory or use_system_shared_memory):
                if config[1] == "http":
                    inputs[-1].set_data_from_numpy(input_data,
                                                   binary_data=config[3])
                else:
                    inputs[-1].set_data_from_numpy(input_data)
            else:
                # Register necessary shared memory regions/handles
                su.register_add_either_shm_regions(
                    inputs, output_req, shm_region_name_prefix,
                    (shm_ip_handles, shm_op_handles), io_num, input_byte_size,
                    output_byte_size, use_system_shared_memory,
                    use_cuda_shared_memory, triton_client)

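        # config[2] selects streaming inference over a gRPC stream; otherwise
        # a blocking infer() call is issued.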
        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(
                    model_name,
                    inputs,
                    model_version=model_version,
                    outputs=output_req,
                    request_id=str(_unique_request_id()),
                    priority=priority,
                    timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          model_version=model_version,
                                          outputs=output_req,
                                          request_id=str(_unique_request_id()),
                                          priority=priority,
                                          timeout=timeout_us)

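        # Validate the response metadata; HTTP responses are JSON dicts while
        # gRPC responses expose protobuf attributes.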
        last_response = results.get_response()

        if config[1] == "http":
            response_model_name = last_response["model_name"]
            if model_version != "":
                response_model_version = last_response["model_version"]
            response_outputs = last_response["outputs"]
        else:
            response_model_name = last_response.model_name
            if model_version != "":
                response_model_version = last_response.model_version
            response_outputs = last_response.outputs

        tester.assertEqual(response_model_name, model_name)

        if model_version != "":
            tester.assertEqual(response_model_version, model_version)

        tester.assertEqual(len(response_outputs), io_cnt)

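        # Compare each returned output against the expected array, reading the
        # data either from the registered shared memory region or from the
        # response body.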
        for result in response_outputs:
            if config[1] == "http":
                result_name = result["name"]
            else:
                result_name = result.name

            tester.assertTrue(result_name in expected_dict)
            if use_system_shared_memory or use_cuda_shared_memory:
                if platform == "pytorch_libtorch":
                    io_num = int(result_name.split("OUTPUT__")[1])
                else:
                    io_num = int(result_name.split("OUTPUT")[1])
                shm_handle = shm_op_handles[io_num]

                output = results.get_output(result_name)
                if config[1] == "http":
                    output_datatype = output['datatype']
                    output_shape = output['shape']
                else:
                    output_datatype = output.datatype
                    output_shape = output.shape
                output_dtype = triton_to_np_dtype(output_datatype)
            if use_system_shared_memory:
                output_data = shm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            elif use_cuda_shared_memory:
                output_data = cudashm.get_contents_as_numpy(
                    shm_handle, output_dtype, output_shape)
            else:
                output_data = results.as_numpy(result_name)

            if output_data.dtype == object and not config[3]:
                output_data = output_data.astype(np.bytes_)

            expected = expected_dict[result_name]
            tester.assertEqual(output_data.shape, expected.shape)
            tester.assertTrue(
                np.array_equal(output_data, expected),
                "{}, {}, expected: {}, got {}".format(model_name, result_name,
                                                      expected, output_data))

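    # Clean up: unregister and destroy any shared memory regions created above.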
    if len(shm_ip_handles) != 0:
        for io_num in range(io_cnt):
            if use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_cuda_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[0] + str(io_num) + '_data')
                triton_client.unregister_system_shared_memory(
                    shm_region_name_prefix[1] + str(io_num) + '_data')
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
Code example #26
0
            print(config)
        except InferenceServerException as ex:
            print("FAILED : get_model_config")
            print("Got: {}".format(ex.message()))
            sys.exit(1)

    # DUMMY MODE
    if FLAGS.mode == 'dummy':
        print("Running in 'dummy' mode")
        print("Creating emtpy buffer filled with ones...")
        inputs = []
        outputs = []
        inputs.append(grpcclient.InferInput('data', [1, 3, 608, 608], "FP32"))
        inputs[0].set_data_from_numpy(
            np.ones(shape=(1, 3, 608, 608), dtype=np.float32))
        outputs.append(grpcclient.InferRequestedOutput('prob'))

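        # Run a single synchronous inference request with the dummy tensor.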
        print("Invoking inference...")
        results = triton_client.infer(model_name=FLAGS.model,
                                      inputs=inputs,
                                      outputs=outputs,
                                      client_timeout=FLAGS.client_timeout)
        if FLAGS.model_info:
            statistics = triton_client.get_inference_statistics(
                model_name=FLAGS.model)
            if len(statistics.model_stats) != 1:
                print("FAILED: get_inference_statistics")
                sys.exit(1)
            print(statistics)
        print("Done")
Code example #27
0
# pre-processing
img = Image.open(DATA).convert('L')
img = img.resize(INPUT_SHAPE)
imgArr = np.asarray(img) / 255
imgArr = np.expand_dims(imgArr[:, :, np.newaxis], 0)
imgArr = imgArr.astype(triton_to_np_dtype('FP32'))

# Client-Server GRPC
print("Using GRPC ... ")
triton_client = grpcclient.InferenceServerClient(url=URL_GRPC, verbose=0)
inputs = []
inputs.append(grpcclient.InferInput('flatten_1_input', imgArr.shape, 'FP32'))
inputs[0].set_data_from_numpy(imgArr)
outputs = []
outputs.append(grpcclient.InferRequestedOutput('dense_3', class_count=0))
responses = []
responses.append(triton_client.infer(MODEL, inputs,
                    request_id=str(1),
                    model_version=MODEL_VER,
                    outputs=outputs))

# post-processing
print(np.argmax(responses[0].as_numpy('dense_3')[0]))
# TODO: Add in return of human-readable label

# Client-Server HTTP
print("Using HTTP ... ")
triton_client = httpclient.InferenceServerClient(url=URL_HTTP, verbose=0)
inputs = []
inputs.append(httpclient.InferInput('flatten_1_input', imgArr.shape, 'FP32'))
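# The listing truncates here; below is a minimal sketch of how the HTTP path
# would likely continue, mirroring the gRPC calls above. The output name
# 'dense_3', MODEL and MODEL_VER are reused from the snippet as assumptions,
# not taken from the original source.
inputs[0].set_data_from_numpy(imgArr, binary_data=True)
outputs = []
outputs.append(httpclient.InferRequestedOutput('dense_3', binary_data=True))
responses.append(triton_client.infer(MODEL, inputs,
                    request_id=str(2),
                    model_version=MODEL_VER,
                    outputs=outputs))
print(np.argmax(responses[1].as_numpy('dense_3')[0]))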
Code example #28
0
            end_idx = (batch + 1) * (args.batch_size)

            # Convert the batch columns to Triton inputs
            current_batch = all_batches[start_idx:end_idx]
            columns = [(col, current_batch[col]) for col in col_names]
            inputs = []

            for i, (name, col) in enumerate(columns):
                d = col.values_host.astype(col_dtypes[i])
                d = d.reshape(len(d), 1)
                inputs.append(
                    grpcclient.InferInput(name, d.shape,
                                          np_to_triton_dtype(col_dtypes[i])))
                inputs[i].set_data_from_numpy(d)

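            # Request only the model's OUTPUT0 tensor for this batch.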
            outputs = []
            outputs.append(grpcclient.InferRequestedOutput("OUTPUT0"))

            response = client.infer(args.model_name,
                                    inputs,
                                    request_id=str(1),
                                    outputs=outputs)

            results.extend(response.as_numpy("OUTPUT0"))

            publish_batch(args.project_id, args.topic_id, current_batch,
                          response.as_numpy("OUTPUT0"))

    logging.info(
        f"ROC AUC Score: {metrics.roc_auc_score(all_batches[LABEL_COLUMNS].values.tolist(), results)}"
    )
Code example #29
0
    def infer(self, input_img, triton_client, confidence=0.5):
        out = "output/" + input_img.split("/")[1]
        # IMAGE MODE
        # print("Running in 'image' mode")
        if not input_img:
            print("FAILED: no input image")
            sys.exit(1)
        
        inputs = []
        outputs = []
        inputs.append(grpcclient.InferInput('data', [1, 3, 640, 640], "FP32"))
        outputs.append(grpcclient.InferRequestedOutput('prob'))

        # print("Creating buffer from image file...")
        input_image = cv2.imread(input_img)
        if input_image is None:
            print(f"FAILED: could not load input image {str(input_img)}")
            sys.exit(1)
        input_image_buffer, dw, dh, padding_w, padding_h = preprocess(input_image)
        input_image_buffer = np.expand_dims(input_image_buffer, axis=0)
        inputs[0].set_data_from_numpy(input_image_buffer)

        # print("Invoking inference...")
        results = self.triton_client.infer(model_name=self.model,
                                           inputs=inputs,
                                           outputs=outputs,
                                           client_timeout=self.client_timeout)
        if self.model_info:
            statistics = self.triton_client.get_inference_statistics(model_name=self.model)
            if len(statistics.model_stats) != 1:
                print("FAILED: get_inference_statistics")
                sys.exit(1)
            # print(statistics)
        # print("load model done")

        result = results.as_numpy('prob')
        # print(f"Received result buffer of size {result.shape}")
        # print(f"Naive buffer sum: {np.sum(result)}")

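        # Decode the raw network output into boxes, filtering by confidence
        # and applying NMS.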
        detected_objects = postprocess(result, input_image.shape[1],
                                       input_image.shape[0], dw, dh, padding_w,
                                       padding_h, confidence, self.nms)
        print(f"Raw boxes: {int(result[0, 0, 0, 0])}")
        print(f"Detected objects: {len(detected_objects)}")

        return_info = []

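        # Draw each detection on the image and collect
        # [classID, u1, u2, v1, v2] for the caller.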
        for box in detected_objects:
            return_info.append([box.classID, box.u1, box.u2, box.v1, box.v2])
            print(f"{COCOLabels(box.classID).name}: {box.confidence}")
            input_image = render_box(input_image, box.box(), color=tuple(RAND_COLORS[box.classID % 64].tolist()))
            size = get_text_size(input_image, f"{COCOLabels(box.classID).name}: {box.confidence:.2f}", normalised_scaling=0.6)
            input_image = render_filled_box(input_image, (box.x1 - 3, box.y1 - 3, box.x1 + size[0], box.y1 + size[1]), color=(220, 220, 220))
            input_image = render_text(input_image, f"{COCOLabels(box.classID).name}: {box.confidence:.2f}", (box.x1, box.y1), color=(30, 30, 30), normalised_scaling=0.5)

        if out:
            cv2.imwrite(out, input_image)
            print(f"Saved result to {out}")
        else:
            cv2.imshow('image', input_image)
            cv2.waitKey(0)
            cv2.destroyAllWindows()
        return return_info
Code example #30
0
        inputs[0].set_data_from_numpy(batched_image_data)
    else:
        inputs.append(
            httpclient.InferInput(input_name, batched_image_data.shape,
                                  "BYTES"))
        inputs[0].set_data_from_numpy(batched_image_data, binary_data=True)

    output_names = [
        output.name if FLAGS.protocol.lower() == "grpc" else output['name']
        for output in output_metadata
    ]

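    # Request classification results (top FLAGS.classes) for every output
    # reported by the model metadata.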
    outputs = []
    for output_name in output_names:
        if FLAGS.protocol.lower() == "grpc":
            outputs.append(
                grpcclient.InferRequestedOutput(output_name,
                                                class_count=FLAGS.classes))
        else:
            outputs.append(
                httpclient.InferRequestedOutput(output_name,
                                                binary_data=True,
                                                class_count=FLAGS.classes))

    # Send request
    result = triton_client.infer(model_name, inputs, outputs=outputs)

    postprocess(result, output_names, input_filenames, batch_size)

    print("PASS")