def _configure_server(self):
        shm_ip0_handle = cshm.create_shared_memory_region("input0_data", 64, 0)
        shm_ip1_handle = cshm.create_shared_memory_region("input1_data", 64, 0)
        shm_op0_handle = cshm.create_shared_memory_region(
            "output0_data", 64, 0)
        shm_op1_handle = cshm.create_shared_memory_region(
            "output1_data", 64, 0)

        input0_data = np.arange(start=0, stop=16, dtype=np.int32)
        input1_data = np.ones(shape=16, dtype=np.int32)
        cshm.set_shared_memory_region(shm_ip0_handle, [input0_data])
        cshm.set_shared_memory_region(shm_ip1_handle, [input1_data])
        if _protocol == "http":
            triton_client = httpclient.InferenceServerClient(_url,
                                                             verbose=True)
        else:
            triton_client = grpcclient.InferenceServerClient(_url,
                                                             verbose=True)
        triton_client.register_cuda_shared_memory(
            "input0_data", cshm.get_raw_handle(shm_ip0_handle), 0, 64)
        triton_client.register_cuda_shared_memory(
            "input1_data", cshm.get_raw_handle(shm_ip1_handle), 0, 64)
        triton_client.register_cuda_shared_memory(
            "output0_data", cshm.get_raw_handle(shm_op0_handle), 0, 64)
        triton_client.register_cuda_shared_memory(
            "output1_data", cshm.get_raw_handle(shm_op1_handle), 0, 64)
        return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle]
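The snippet above relies on module-level imports and connection settings that are not shown. A minimal sketch of that setup, assuming the standard Triton Python client packages and a locally running server (the `_protocol` and `_url` values are illustrative):

# Assumed module-level setup for these snippets (illustrative sketch, not the
# original file header). Later snippets alias the same CUDA module as
# `cudashm` and the system shared-memory utilities as `shm`.
import numpy as np
import tritonclient.http as httpclient
import tritonclient.grpc as grpcclient
import tritonclient.utils.cuda_shared_memory as cshm

_protocol = "http"        # switch to "grpc" to exercise the gRPC client
_url = "localhost:8000"   # Triton's default HTTP port; gRPC uses 8001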
Example 2
def register_add_either_shm_regions(inputs, outputs, shm_region_prefix,
                                    shm_handles, io_num, input_byte_size,
                                    output_byte_size, use_system_shared_memory,
                                    use_cuda_shared_memory, triton_client):
    if use_system_shared_memory or use_cuda_shared_memory:
        # Unregister then register required shared memory regions
        input_shm_name = shm_region_prefix[0] + str(io_num)
        output_shm_name = shm_region_prefix[1] + str(io_num)
        if use_system_shared_memory:
            triton_client.unregister_system_shared_memory(input_shm_name +
                                                          '_data')
            triton_client.unregister_system_shared_memory(output_shm_name +
                                                          '_data')
            triton_client.register_system_shared_memory(
                input_shm_name + '_data', '/' + input_shm_name,
                input_byte_size)
            triton_client.register_system_shared_memory(
                output_shm_name + '_data', '/' + output_shm_name,
                output_byte_size)

        if use_cuda_shared_memory:
            triton_client.unregister_cuda_shared_memory(input_shm_name +
                                                        '_data')
            triton_client.unregister_cuda_shared_memory(output_shm_name +
                                                        '_data')
            triton_client.register_cuda_shared_memory(
                input_shm_name + '_data',
                cudashm.get_raw_handle(shm_handles[0][io_num]), 0,
                input_byte_size)
            triton_client.register_cuda_shared_memory(
                output_shm_name + '_data',
                cudashm.get_raw_handle(shm_handles[1][io_num]), 0,
                output_byte_size)

        # Attach the shared memory regions to the input and output
        inputs[io_num].set_shared_memory(input_shm_name + '_data',
                                         input_byte_size)
        outputs[io_num].set_shared_memory(output_shm_name + '_data',
                                          output_byte_size)
 def test_reregister_after_register(self):
     # Create a valid CUDA shared memory region and register it twice; the
     # second registration should fail with "already in manager"
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(
             _url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(
             _url, verbose=True)
     shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
     triton_client.register_cuda_shared_memory(
         "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
     try:
         triton_client.register_cuda_shared_memory(
             "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
     except Exception as ex:
         self.assertIn(
             "shared memory region 'dummy_data' already in manager", str(ex))
     shm_status = triton_client.get_cuda_shared_memory_status()
     if _protocol == "http":
         self.assertTrue(len(shm_status) == 1)
     else:
         self.assertTrue(len(shm_status.regions) == 1)
     cshm.destroy_shared_memory_region(shm_op0_handle)
 def test_valid_create_set_register(self):
     # Create a valid CUDA shared memory region, fill it with data, and
     # register it with the server
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url,
                                                          verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url,
                                                          verbose=True)
     shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
     cshm.set_shared_memory_region(shm_op0_handle,
                                   [np.array([1, 2], dtype=np.float32)])
     triton_client.register_cuda_shared_memory(
         "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
     shm_status = triton_client.get_cuda_shared_memory_status()
     if _protocol == "http":
         self.assertTrue(len(shm_status) == 1)
     else:
         self.assertTrue(len(shm_status.regions) == 1)
     cshm.destroy_shared_memory_region(shm_op0_handle)
 def test_too_big_shm(self):
     # Shared memory input region larger than needed - Throws error
     error_msg = []
     shm_handles = self._configure_server()
     shm_ip2_handle = cshm.create_shared_memory_region("input2_data", 128, 0)
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(
             _url, verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(
             _url, verbose=True)
     triton_client.register_cuda_shared_memory(
         "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 128)
     self._basic_inference(
         shm_handles[0], shm_ip2_handle, shm_handles[2], shm_handles[3],
         error_msg, "input2_data", 128)
     if len(error_msg) > 0:
         self.assertIn(
             "unexpected size 128 for inference input 'INPUT1', expecting 64",
             error_msg[-1])
     shm_handles.append(shm_ip2_handle)
     self._cleanup_server(shm_handles)
 def test_register_after_inference(self):
     # Register after inference
     error_msg = []
     shm_handles = self._configure_server()
     if _protocol == "http":
         triton_client = httpclient.InferenceServerClient(_url,
                                                          verbose=True)
     else:
         triton_client = grpcclient.InferenceServerClient(_url,
                                                          verbose=True)
     self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2],
                           shm_handles[3], error_msg)
     if len(error_msg) > 0:
         raise Exception(str(error_msg))
     shm_ip2_handle = cshm.create_shared_memory_region("input2_data", 64, 0)
     triton_client.register_cuda_shared_memory(
         "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 64)
     shm_status = triton_client.get_cuda_shared_memory_status()
     if _protocol == "http":
         self.assertTrue(len(shm_status) == 5)
     else:
         self.assertTrue(len(shm_status.regions) == 5)
     shm_handles.append(shm_ip2_handle)
     self._cleanup_server(shm_handles)
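
The tests above also call `self._cleanup_server(shm_handles)`, which is not part of this excerpt. Based on the create/destroy pattern used throughout these snippets, a plausible sketch is:

 def _cleanup_server(self, shm_handles):
     # Plausible counterpart to _configure_server (not part of the excerpt):
     # release every CUDA shared-memory region created for a test.
     for shm_handle in shm_handles:
         cshm.destroy_shared_memory_region(shm_handle)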
Example 7
def infer_shape_tensor(tester,
                       pf,
                       tensor_dtype,
                       input_shape_values,
                       dummy_input_shapes,
                       use_http=True,
                       use_grpc=True,
                       use_streaming=True,
                       shm_suffix="",
                       use_system_shared_memory=False,
                       use_cuda_shared_memory=False,
                       priority=0,
                       timeout_us=0,
                       batch_size=1):
    tester.assertTrue(use_http or use_grpc or use_streaming)
    tester.assertTrue(pf == "plan" or pf == "plan_nobatch")
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))
    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot enable both system and CUDA shared memory")

    configs = []
    if use_http:
        configs.append(("localhost:8000", "http", False))
    if use_grpc:
        configs.append(("localhost:8001", "grpc", False))
    if use_streaming:
        configs.append(("localhost:8001", "grpc", True))

    io_cnt = len(input_shape_values)

    # FIXME wrap up shm handle cleanup
    # For (CUDA) shared memory, it is only used for the shape tensors here,
    # for simplicity; regular tensors with (CUDA) shared memory are covered
    # by other tests.
    # item is (handle, byte_size, is_cuda)
    input_shm_handle_list = []
    output_shm_handle_list = []
    dummy_input_list = []
    input_list = []
    expected_dict = dict()
    # Prepare IO in advance
    for io_num in range(io_cnt):
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        input_name = "INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)

        # Prepare the dummy tensor
        rtensor_dtype = _range_repr_dtype(tensor_dtype)
        if rtensor_dtype != np.bool_:
            dummy_in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                          high=np.iinfo(rtensor_dtype).max,
                                          size=dummy_input_shapes[io_num],
                                          dtype=rtensor_dtype)
        else:
            dummy_in0 = np.random.choice(a=[False, True],
                                         size=dummy_input_shapes[io_num])
        if tensor_dtype != np.object_:
            dummy_in0 = dummy_in0.astype(tensor_dtype)
        else:
            dummy_in0 = np.array([str(x) for x in dummy_in0.flatten()],
                                 dtype=object).reshape(dummy_in0.shape)
        dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected value for the output. Skip dummy output as we
        # only care about its shape (== value of OUTPUT*)
        expected_dict[output_name] = np.ndarray.copy(in0)

        # Only need to create region once
        input_byte_size = in0.size * np.dtype(np.int32).itemsize
        output_byte_size = input_byte_size * batch_size
        if use_system_shared_memory:
            input_shm_handle_list.append(
                (shm.create_shared_memory_region(input_name + shm_suffix,
                                                 '/' + input_name + shm_suffix,
                                                 input_byte_size),
                 input_byte_size, False))
            output_shm_handle_list.append((shm.create_shared_memory_region(
                output_name + shm_suffix, '/' + output_name + shm_suffix,
                output_byte_size), output_byte_size, False))
            shm.set_shared_memory_region(input_shm_handle_list[-1][0], [
                in0,
            ])
        elif use_cuda_shared_memory:
            input_shm_handle_list.append(
                (cudashm.create_shared_memory_region(input_name + shm_suffix,
                                                     input_byte_size, 0),
                 input_byte_size, True))
            output_shm_handle_list.append(
                (cudashm.create_shared_memory_region(output_name + shm_suffix,
                                                     output_byte_size, 0),
                 output_byte_size, True))
            cudashm.set_shared_memory_region(input_shm_handle_list[-1][0], [
                in0,
            ])

    model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)
    # Run inference and check results for each config
    for config in configs:
        client_utils = grpcclient if config[1] == "grpc" else httpclient
        triton_client = client_utils.InferenceServerClient(config[0],
                                                           verbose=True)

        inputs = []
        outputs = []

        # Set IOs
        for io_num in range(io_cnt):
            dummy_input_name = "DUMMY_INPUT{}".format(io_num)
            input_name = "INPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

            inputs.append(
                client_utils.InferInput(dummy_input_name,
                                        dummy_input_shapes[io_num],
                                        np_to_triton_dtype(tensor_dtype)))
            inputs.append(
                client_utils.InferInput(input_name, input_list[io_num].shape,
                                        "INT32"))
            outputs.append(
                client_utils.InferRequestedOutput(dummy_output_name))
            outputs.append(client_utils.InferRequestedOutput(output_name))

            # inputs[-2] is the dummy tensor, inputs[-1] is the shape tensor
            inputs[-2].set_data_from_numpy(dummy_input_list[io_num])
            if (not use_system_shared_memory) and (not use_cuda_shared_memory):
                inputs[-1].set_data_from_numpy(input_list[io_num])
            else:
                input_byte_size = input_shm_handle_list[io_num][1]
                output_byte_size = output_shm_handle_list[io_num][1]
                if use_system_shared_memory:
                    triton_client.register_system_shared_memory(
                        input_name + shm_suffix, "/" + input_name + shm_suffix,
                        input_byte_size)
                    triton_client.register_system_shared_memory(
                        output_name + shm_suffix,
                        "/" + output_name + shm_suffix, output_byte_size)
                else:
                    triton_client.register_cuda_shared_memory(
                        input_name + shm_suffix,
                        cudashm.get_raw_handle(
                            input_shm_handle_list[io_num][0]), 0,
                        input_byte_size)
                    triton_client.register_cuda_shared_memory(
                        output_name + shm_suffix,
                        cudashm.get_raw_handle(
                            output_shm_handle_list[io_num][0]), 0,
                        output_byte_size)
                inputs[-1].set_shared_memory(input_name + shm_suffix,
                                             input_byte_size)
                outputs[-1].set_shared_memory(output_name + shm_suffix,
                                              output_byte_size)

        if config[2]:
            user_data = UserData()
            triton_client.start_stream(partial(completion_callback, user_data))
            try:
                results = triton_client.async_stream_infer(model_name,
                                                           inputs,
                                                           outputs=outputs,
                                                           priority=priority,
                                                           timeout=timeout_us)
            except Exception as e:
                triton_client.stop_stream()
                raise e
            triton_client.stop_stream()
            (results, error) = user_data._completed_requests.get()
            if error is not None:
                raise error
        else:
            results = triton_client.infer(model_name,
                                          inputs,
                                          outputs=outputs,
                                          priority=priority,
                                          timeout=timeout_us)

        for io_num in range(io_cnt):
            output_name = "OUTPUT{}".format(io_num)
            dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)
            expected = expected_dict[output_name]

            # get outputs as numpy array
            dummy_out = results.as_numpy(dummy_output_name)
            if (not use_system_shared_memory) and (not use_cuda_shared_memory):
                out = results.as_numpy(output_name)
            else:
                output = results.get_output(output_name)
                if config[1] == "grpc":
                    output_shape = output.shape
                else:
                    output_shape = output["shape"]
                if use_system_shared_memory:
                    out = shm.get_contents_as_numpy(
                        output_shm_handle_list[io_num][0], np.int32,
                        output_shape)
                else:
                    out = cudashm.get_contents_as_numpy(
                        output_shm_handle_list[io_num][0], np.int32,
                        output_shape)

            # if out shape is 2D, it is batched
            if (len(out.shape) == 2):
                # The shape of the dummy output should be equal to the shape values
                # specified in the shape tensor
                tester.assertTrue(
                    np.array_equal(dummy_out.shape[1:], out[0]),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out[0],
                        dummy_out.shape[1:]))
                for b in range(1, out.shape[0]):
                    tester.assertTrue(
                        np.array_equal(out[b - 1], out[b]),
                        "expect shape tensor has consistent value, "
                        "expected: {}, got {}".format(out[b - 1], out[b]))
                out = out[0]
            else:
                tester.assertTrue(
                    np.array_equal(dummy_out.shape, out),
                    "{}, {} shape, expected: {}, got {}".format(
                        model_name, dummy_output_name, out, dummy_out.shape))
            tester.assertTrue(
                np.array_equal(out, expected),
                "{}, {}, expected: {}, got {}".format(model_name, output_name,
                                                      expected, out))

            # unregister shared memory region for next config
            if use_system_shared_memory:
                triton_client.unregister_system_shared_memory(input_name +
                                                              shm_suffix)
                triton_client.unregister_system_shared_memory(output_name +
                                                              shm_suffix)
            elif use_cuda_shared_memory:
                triton_client.unregister_cuda_shared_memory(input_name +
                                                            shm_suffix)
                triton_client.unregister_cuda_shared_memory(output_name +
                                                            shm_suffix)

    for handle in input_shm_handle_list:
        if (handle[2]):
            cudashm.destroy_shared_memory_region(handle[0])
        else:
            shm.destroy_shared_memory_region(handle[0])
    for handle in output_shm_handle_list:
        if (handle[2]):
            cudashm.destroy_shared_memory_region(handle[0])
        else:
            shm.destroy_shared_memory_region(handle[0])
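
A hypothetical invocation of `infer_shape_tensor`, e.g. from inside a test method, exercising a single shape-tensor IO over HTTP with CUDA shared memory. The model type, dtype, suffix, and shapes are illustrative; the dummy input's trailing dimensions are chosen to match the values sent in the shape tensor so the output checks line up:

# Hypothetical call (parameters are illustrative): one IO pair, HTTP only,
# shape tensor carried through CUDA shared memory. The dummy input's shape
# is [batch] + the shape-tensor values.
infer_shape_tensor(self, "plan", np.int32,
                   input_shape_values=[[4, 4]],
                   dummy_input_shapes=[[1, 4, 4]],
                   use_http=True, use_grpc=False, use_streaming=False,
                   shm_suffix="_shape", use_cuda_shared_memory=True)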
Example 8
def register_add_shm_regions(inputs, outputs, shm_region_names,
                             precreated_shm_regions, shm_handles,
                             input0_byte_size, input1_byte_size,
                             output0_byte_size, output1_byte_size,
                             use_system_shared_memory, use_cuda_shared_memory,
                             triton_client):
    if use_system_shared_memory or use_cuda_shared_memory:
        # Unregister then register required shared memory regions
        if use_system_shared_memory:
            triton_client.unregister_system_shared_memory(shm_region_names[0] +
                                                          '_data')
            triton_client.unregister_system_shared_memory(shm_region_names[1] +
                                                          '_data')
            triton_client.register_system_shared_memory(
                shm_region_names[0] + '_data', '/' + shm_region_names[0],
                input0_byte_size)
            triton_client.register_system_shared_memory(
                shm_region_names[1] + '_data', '/' + shm_region_names[1],
                input1_byte_size)
            i = 0
            if "OUTPUT0" in outputs:
                if precreated_shm_regions is None:
                    triton_client.unregister_system_shared_memory(
                        shm_region_names[2] + '_data')
                    triton_client.register_system_shared_memory(
                        shm_region_names[2] + '_data',
                        '/' + shm_region_names[2], output0_byte_size)
                i += 1
            if "OUTPUT1" in outputs:
                if precreated_shm_regions is None:
                    triton_client.unregister_system_shared_memory(
                        shm_region_names[2 + i] + '_data')
                    triton_client.register_system_shared_memory(
                        shm_region_names[2 + i] + '_data',
                        '/' + shm_region_names[2 + i], output1_byte_size)

        if use_cuda_shared_memory:
            triton_client.unregister_cuda_shared_memory(shm_region_names[0] +
                                                        '_data')
            triton_client.unregister_cuda_shared_memory(shm_region_names[1] +
                                                        '_data')
            triton_client.register_cuda_shared_memory(
                shm_region_names[0] + '_data',
                cudashm.get_raw_handle(shm_handles[0]), 0, input0_byte_size)
            triton_client.register_cuda_shared_memory(
                shm_region_names[1] + '_data',
                cudashm.get_raw_handle(shm_handles[1]), 0, input1_byte_size)
            i = 0
            if "OUTPUT0" in outputs:
                if precreated_shm_regions is None:
                    triton_client.unregister_cuda_shared_memory(
                        shm_region_names[2] + '_data')
                    triton_client.register_cuda_shared_memory(
                        shm_region_names[2] + '_data',
                        cudashm.get_raw_handle(shm_handles[2]), 0,
                        output0_byte_size)
                i += 1
            if "OUTPUT1" in outputs:
                if precreated_shm_regions is None:
                    triton_client.unregister_cuda_shared_memory(
                        shm_region_names[2 + i] + '_data')
                    triton_client.register_cuda_shared_memory(
                        shm_region_names[2 + i] + '_data',
                        cudashm.get_raw_handle(shm_handles[3]), 0,
                        output1_byte_size)

        # Add shared memory regions to inputs
        inputs[0].set_shared_memory(shm_region_names[0] + '_data',
                                    input0_byte_size)
        inputs[1].set_shared_memory(shm_region_names[1] + '_data',
                                    input1_byte_size)
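
A hypothetical call of `register_add_shm_regions`, assuming two `InferInput` objects, a plain list of requested output names, four region names, existing shared-memory handles, and precomputed byte sizes (all names and values here are illustrative):

# Hypothetical usage (region names, handles, and sizes are illustrative);
# passing None for precreated_shm_regions lets the helper (re)register the
# output regions itself.
register_add_shm_regions(inputs, ["OUTPUT0", "OUTPUT1"],
                         ["ip0", "ip1", "op0", "op1"],
                         None, shm_handles,
                         input0_byte_size, input1_byte_size,
                         output0_byte_size, output1_byte_size,
                         use_system_shared_memory, use_cuda_shared_memory,
                         triton_client)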
Example 9
    # Create the data for the two input tensors: initialize the first
    # to unique integers and the second to all ones.
    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
    input1_data = np.ones(shape=16, dtype=np.int32)

    input_byte_size = input0_data.size * input0_data.itemsize
    output_byte_size = input_byte_size

    # Create Output0 and Output1 in Shared Memory and store shared memory handles
    shm_op0_handle = cudashm.create_shared_memory_region(
        "output0_data", output_byte_size, 0)
    shm_op1_handle = cudashm.create_shared_memory_region(
        "output1_data", output_byte_size, 0)

    # Register Output0 and Output1 shared memory with Triton Server
    triton_client.register_cuda_shared_memory(
        "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0,
        output_byte_size)
    triton_client.register_cuda_shared_memory(
        "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0,
        output_byte_size)

    # Create Input0 and Input1 in Shared Memory and store shared memory handles
    shm_ip0_handle = cudashm.create_shared_memory_region(
        "input0_data", input_byte_size, 0)
    shm_ip1_handle = cudashm.create_shared_memory_region(
        "input1_data", input_byte_size, 0)

    # Put input data values into shared memory
    cudashm.set_shared_memory_region(shm_ip0_handle, [input0_data])
    cudashm.set_shared_memory_region(shm_ip1_handle, [input1_data])
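
The excerpt ends before the input regions are registered. A plausible continuation, mirroring the output registration above:

    # Plausible next step (not part of the excerpt): register Input0 and
    # Input1 shared memory with the Triton server, mirroring the outputs.
    triton_client.register_cuda_shared_memory(
        "input0_data", cudashm.get_raw_handle(shm_ip0_handle), 0,
        input_byte_size)
    triton_client.register_cuda_shared_memory(
        "input1_data", cudashm.get_raw_handle(shm_ip1_handle), 0,
        input_byte_size)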