Example #1
    def precreate_register_regions(self,
                                   value_list,
                                   dtype,
                                   i,
                                   batch_size=1,
                                   tensor_shape=(1,)):
        if _test_system_shared_memory or _test_cuda_shared_memory:
            shm_region_handles = []
            for j, value in enumerate(value_list):
                # For string we can't know the size of the output
                # so we conservatively assume 64 bytes for each
                # element of the output
                if dtype == np.object_:
                    output_byte_size = 4  # 4-byte length header of an empty serialized string
                else:
                    output_byte_size = 0

                # create data
                input_list = list()
                for b in range(batch_size):
                    if dtype == np.object_:
                        in0 = np.full(tensor_shape, value, dtype=np.int32)
                        in0n = np.array([
                            str(x).encode('utf-8')
                            for x in in0.reshape(in0.size)
                        ],
                                        dtype=object)
                        in0 = in0n.reshape(tensor_shape)
                        output_byte_size += 64 * in0.size
                    else:
                        in0 = np.full(tensor_shape, value, dtype=dtype)
                        output_byte_size += np.dtype(dtype).itemsize * in0.size
                    input_list.append(in0)

                if dtype == np.object_:
                    input_list_tmp = iu.serialize_byte_tensor_list(input_list)
                    input_byte_size = sum(
                        [serialized_byte_size(i0) for i0 in input_list_tmp])
                else:
                    input_list_tmp = input_list
                    input_byte_size = sum([i0.nbytes for i0 in input_list_tmp])

                # create shared memory regions and copy data for input values
                ip_name = 'ip{}{}'.format(i, j)
                op_name = 'op{}{}_data'.format(i, j)
                if _test_system_shared_memory:
                    shm_ip_handle = shm.create_shared_memory_region(
                        ip_name, '/' + ip_name, input_byte_size)
                    shm_op_handle = shm.create_shared_memory_region(
                        op_name, '/' + op_name, output_byte_size)
                    shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
                    self.triton_client_.register_system_shared_memory(
                        ip_name, '/' + ip_name, input_byte_size)
                    self.triton_client_.register_system_shared_memory(
                        op_name, '/' + op_name, output_byte_size)
                elif _test_cuda_shared_memory:
                    shm_ip_handle = cudashm.create_shared_memory_region(
                        ip_name, input_byte_size, 0)
                    shm_op_handle = cudashm.create_shared_memory_region(
                        op_name, output_byte_size, 0)
                    cudashm.set_shared_memory_region(shm_ip_handle,
                                                     input_list_tmp)
                    self.triton_client_.register_cuda_shared_memory(
                        ip_name, cudashm.get_raw_handle(shm_ip_handle), 0,
                        input_byte_size)
                    self.triton_client_.register_cuda_shared_memory(
                        op_name, cudashm.get_raw_handle(shm_op_handle), 0,
                        output_byte_size)
                shm_region_handles.append(
                    (ip_name, input_byte_size, shm_ip_handle))
                shm_region_handles.append(
                    (op_name, output_byte_size, shm_op_handle))
            return shm_region_handles
        else:
            return []
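
    # A minimal, hypothetical consumer sketch for the triples returned above
    # (assumes one of the shared-memory test flags is set). Regions alternate
    # input/output per value, so the entries for value j sit at indices 2*j
    # and 2*j+1. The InferInput/InferRequestedOutput names and shape are
    # illustrative assumptions, not values from this test suite.
    def example_region_usage(self):
        handles = self.precreate_register_regions([10, 20], np.int32, i=0)
        ip_name, ip_size, _ = handles[0]  # 'ip00'
        op_name, op_size, _ = handles[1]  # 'op00_data'
        inp = grpcclient.InferInput("INPUT", [1, 1], "INT32")
        out = grpcclient.InferRequestedOutput("OUTPUT")
        inp.set_shared_memory(ip_name, ip_size)
        out.set_shared_memory(op_name, op_size)
        return [inp], [out]
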
    def precreate_register_dynaseq_shape_tensor_regions(self,
                                                        value_list,
                                                        dtype,
                                                        i,
                                                        batch_size=1,
                                                        tensor_shape=(1,)):
        if _test_system_shared_memory or _test_cuda_shared_memory:
            shm_region_handles = []
            for j, (shape_value, value) in enumerate(value_list):
                input_list = list()
                shape_input_list = list()
                dummy_input_list = list()

                for b in range(batch_size):
                    if dtype == np.object_:
                        dummy_in0 = np.full(tensor_shape, value, dtype=np.int32)
                        dummy_in0n = np.array(
                            [str(x) for x in dummy_in0.reshape(dummy_in0.size)],
                            dtype=object)
                        dummy_in0 = dummy_in0n.reshape(tensor_shape)
                    else:
                        dummy_in0 = np.full(tensor_shape, value, dtype=dtype)
                    dummy_input_list.append(dummy_in0)
                    in0 = np.full(tensor_shape, value, dtype=np.int32)
                    input_list.append(in0)

                # Only one shape tensor input per batch
                shape_input_list.append(np.full(tensor_shape, shape_value, dtype=np.int32))

                input_list_tmp = iu.serialize_byte_tensor_list(
                    input_list) if (dtype == np.object_) else input_list
                input_byte_size = sum([i0.nbytes for i0 in input_list_tmp])
                shape_input_byte_size = sum(
                    [i0.nbytes for i0 in shape_input_list])
                dummy_input_byte_size = sum(
                    [i0.nbytes for i0 in dummy_input_list])
                shape_output_byte_size = shape_input_byte_size
                output_byte_size = np.dtype(np.int32).itemsize + 2
                resized_output_byte_size = 32 * shape_value

                # create shared memory regions and copy data for input values
                ip_name = 'ip{}{}'.format(i, j)
                shape_ip_name = 'shape_ip{}{}'.format(i, j)
                dummy_ip_name = 'dummy_ip{}{}'.format(i, j)
                shape_op_name = 'shape_op{}{}'.format(i, j)
                op_name = 'op{}{}'.format(i, j)
                resized_op_name = 'resized_op{}{}'.format(i, j)
                if _test_system_shared_memory:
                    shm_ip_handle = shm.create_shared_memory_region(
                        ip_name, '/' + ip_name, input_byte_size)
                    shm_shape_ip_handle = shm.create_shared_memory_region(
                        shape_ip_name, '/' + shape_ip_name,
                        shape_input_byte_size)
                    shm_dummy_ip_handle = shm.create_shared_memory_region(
                        dummy_ip_name, '/' + dummy_ip_name,
                        dummy_input_byte_size)
                    shm_shape_op_handle = shm.create_shared_memory_region(
                        shape_op_name, '/' + shape_op_name,
                        shape_output_byte_size)
                    shm_op_handle = shm.create_shared_memory_region(
                        op_name, '/' + op_name, output_byte_size)
                    shm_resized_op_handle = shm.create_shared_memory_region(
                        resized_op_name, '/' + resized_op_name,
                        resized_output_byte_size)
                    shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
                    shm.set_shared_memory_region(shm_shape_ip_handle,
                                                 shape_input_list)
                    shm.set_shared_memory_region(shm_dummy_ip_handle,
                                                 dummy_input_list)
                    self.triton_client_.register_system_shared_memory(
                        ip_name, '/' + ip_name, input_byte_size)
                    self.triton_client_.register_system_shared_memory(
                        shape_ip_name, '/' + shape_ip_name,
                        shape_input_byte_size)
                    self.triton_client_.register_system_shared_memory(
                        dummy_ip_name, '/' + dummy_ip_name,
                        dummy_input_byte_size)
                    self.triton_client_.register_system_shared_memory(
                        shape_op_name, '/' + shape_op_name,
                        shape_output_byte_size)
                    self.triton_client_.register_system_shared_memory(
                        op_name, '/' + op_name, output_byte_size)
                    self.triton_client_.register_system_shared_memory(
                        resized_op_name, '/' + resized_op_name,
                        resized_output_byte_size)
                elif _test_cuda_shared_memory:
                    shm_ip_handle = cudashm.create_shared_memory_region(
                        ip_name, input_byte_size, 0)
                    shm_shape_ip_handle = cudashm.create_shared_memory_region(
                        shape_ip_name, shape_input_byte_size, 0)
                    shm_dummy_ip_handle = cudashm.create_shared_memory_region(
                        dummy_ip_name, dummy_input_byte_size, 0)
                    shm_shape_op_handle = cudashm.create_shared_memory_region(
                        shape_op_name, shape_output_byte_size, 0)
                    shm_op_handle = cudashm.create_shared_memory_region(
                        op_name, output_byte_size, 0)
                    shm_resized_op_handle = cudashm.create_shared_memory_region(
                        resized_op_name, resized_output_byte_size, 0)
                    cudashm.set_shared_memory_region(shm_ip_handle,
                                                     input_list_tmp)
                    cudashm.set_shared_memory_region(shm_shape_ip_handle,
                                                     shape_input_list)
                    cudashm.set_shared_memory_region(shm_dummy_ip_handle,
                                                     dummy_input_list)
                    self.triton_client_.register_cuda_shared_memory(
                        ip_name, cudashm.get_raw_handle(shm_ip_handle), 0,
                        input_byte_size)
                    self.triton_client_.register_cuda_shared_memory(
                        shape_ip_name,
                        cudashm.get_raw_handle(shm_shape_ip_handle), 0,
                        shape_input_byte_size)
                    self.triton_client_.register_cuda_shared_memory(
                        dummy_ip_name,
                        cudashm.get_raw_handle(shm_dummy_ip_handle), 0,
                        dummy_input_byte_size)
                    self.triton_client_.register_cuda_shared_memory(
                        shape_op_name,
                        cudashm.get_raw_handle(shm_shape_op_handle), 0,
                        shape_output_byte_size)
                    self.triton_client_.register_cuda_shared_memory(
                        op_name, cudashm.get_raw_handle(shm_op_handle), 0,
                        output_byte_size)
                    self.triton_client_.register_cuda_shared_memory(
                        resized_op_name,
                        cudashm.get_raw_handle(shm_resized_op_handle), 0,
                        resized_output_byte_size)
                shm_region_handles.append(
                    (ip_name, input_byte_size, shm_ip_handle))
                shm_region_handles.append(
                    (shape_ip_name, shape_input_byte_size,
                     shm_shape_ip_handle))
                shm_region_handles.append(
                    (dummy_ip_name, dummy_input_byte_size,
                     shm_dummy_ip_handle))
                shm_region_handles.append(
                    (shape_op_name, shape_output_byte_size,
                     shm_shape_op_handle))
                shm_region_handles.append(
                    (op_name, output_byte_size, shm_op_handle))
                shm_region_handles.append(
                    (resized_op_name, resized_output_byte_size,
                     shm_resized_op_handle))
            return shm_region_handles
        else:
            return []
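
    # A minimal teardown sketch (an assumption, not part of this suite:
    # 'handles' is the list returned by either precreate_* helper above).
    # Unregister each region by name on the server, then destroy the local
    # handle so the shared-memory segment is released.
    def cleanup_shm_regions(self, handles):
        for name, _, handle in handles:
            if _test_system_shared_memory:
                self.triton_client_.unregister_system_shared_memory(name)
                shm.destroy_shared_memory_region(handle)
            elif _test_cuda_shared_memory:
                self.triton_client_.unregister_cuda_shared_memory(name)
                cudashm.destroy_shared_memory_region(handle)
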
    def check_sequence(self, trial, model_name, input_dtype, correlation_id,
                       sequence_thresholds, values, expected_result,
                       protocol, batch_size=1, sequence_name="<unknown>", tensor_shape=(1,)):
        """Perform sequence of inferences. The 'values' holds a list of
        tuples, one for each inference with format:

        (flag_str, value, (lt_ms, gt_ms), (pre_delay_ms, post_delay_ms))

        """
        if (("savedmodel" not in trial) and ("graphdef" not in trial) and
            ("netdef" not in trial) and ("custom" not in trial) and
            ("onnx" not in trial) and ("libtorch" not in trial) and
	    ("plan" not in trial)):
            self.assertFalse(True, "unknown trial type: " + trial)

        # Can only send the request exactly once since it is a
        # sequence model with state, so can have only a single config.
        configs = []
        if protocol == "http":
            configs.append(("localhost:8000", "http", False))
        if protocol == "grpc":
            configs.append(("localhost:8001", "grpc", False))
        if protocol == "streaming":
            configs.append(("localhost:8001", "grpc", True))

        self.assertFalse(
            _test_system_shared_memory and _test_cuda_shared_memory,
            "Cannot set both System and CUDA shared memory flags to 1")

        self.assertEqual(len(configs), 1)

        full_shape = tensor_shape if "nobatch" in trial else (batch_size,) + tensor_shape

        # create and register shared memory output region in advance,
        # knowing that this function will not be called concurrently.
        if _test_system_shared_memory or _test_cuda_shared_memory:
            self.triton_client_.unregister_system_shared_memory()
            self.triton_client_.unregister_cuda_shared_memory()
            output_byte_size = 512
            if _test_system_shared_memory:
                shm_op_handle = shm.create_shared_memory_region("output_data", "/output", output_byte_size)
                self.triton_client_.register_system_shared_memory("output_data", "/output", output_byte_size)
            elif _test_cuda_shared_memory:
                shm_op_handle = cudashm.create_shared_memory_region("output_data", output_byte_size, 0)
                self.triton_client_.register_cuda_shared_memory("output_data", cudashm.get_raw_handle(shm_op_handle), 0, output_byte_size)
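            # Input regions are created lazily, one per request, inside the
            # loop below; collect the handles so they can be unregistered and
            # destroyed once the sequence completes.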
            shm_ip_handles = []

        for config in configs:
            client_utils = grpcclient if config[1] == "grpc" else httpclient

            triton_client = client_utils.InferenceServerClient(config[0], verbose=True)
            if config[2]:
                user_data = UserData()
                triton_client.start_stream(partial(completion_callback, user_data))
            # Execute the sequence of inference...
            try:
                seq_start_ms = int(round(time.time() * 1000))

                INPUT = "INPUT__0" if trial.startswith("libtorch") else "INPUT"
                OUTPUT = "OUTPUT__0" if trial.startswith("libtorch") else "OUTPUT"
                for flag_str, value, thresholds, delay_ms in values:
                    if delay_ms is not None:
                        time.sleep(delay_ms[0] / 1000.0)

                    seq_start = False
                    seq_end = False
                    if flag_str is not None:
                        seq_start = ("start" in flag_str)
                        seq_end = ("end" in flag_str)

                    # Construct request IOs
                    inputs = []
                    outputs = []
                    inputs.append(client_utils.InferInput(INPUT, full_shape,
                            np_to_triton_dtype(input_dtype)))
                    outputs.append(client_utils.InferRequestedOutput(OUTPUT))
                    if input_dtype == np.object_:
                        in0 = np.full(full_shape, value, dtype=np.int32)
                        in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object)
                        in0 = in0n.reshape(full_shape)
                    else:
                        in0 = np.full(full_shape, value, dtype=input_dtype)

                    # create input shared memory and copy input data values into it
                    if _test_system_shared_memory or _test_cuda_shared_memory:
                        input_list_tmp = iu.serialize_byte_tensor_list(
                            [in0]) if (input_dtype == np.object_) else [in0]
                        input_byte_size = sum([i0.nbytes for i0 in input_list_tmp])
                        ip_name = "ip{}".format(len(shm_ip_handles))
                        if _test_system_shared_memory:
                            shm_ip_handles.append(shm.create_shared_memory_region(ip_name, "/"+ip_name, input_byte_size))
                            shm.set_shared_memory_region(shm_ip_handles[-1], input_list_tmp)
                            triton_client.register_system_shared_memory(ip_name, "/"+ip_name, input_byte_size)
                        elif _test_cuda_shared_memory:
                            shm_ip_handles.append(cudashm.create_shared_memory_region(ip_name, input_byte_size, 0))
                            cudashm.set_shared_memory_region(shm_ip_handles[-1], input_list_tmp)
                            triton_client.register_cuda_shared_memory(ip_name, cudashm.get_raw_handle(shm_ip_handles[-1]), 0, input_byte_size)

                        inputs[0].set_shared_memory(ip_name, input_byte_size)
                        outputs[0].set_shared_memory("output_data", output_byte_size)
                    else:
                        inputs[0].set_data_from_numpy(in0)

                    start_ms = int(round(time.time() * 1000))

                    if config[2]:
                        triton_client.async_stream_infer(model_name, inputs,
                            outputs=outputs, sequence_id=correlation_id,
                            sequence_start=seq_start, sequence_end=seq_end)
                        (results, error) = user_data._completed_requests.get()
                        if error is not None:
                            raise error
                    else:
                        results = triton_client.infer(model_name, inputs,
                            outputs=outputs, sequence_id=correlation_id,
                            sequence_start=seq_start, sequence_end=seq_end)

                    end_ms = int(round(time.time() * 1000))

                    # Get value of "OUTPUT", for shared memory, need to get it via
                    # shared memory utils
                    if (not _test_system_shared_memory) and (not _test_cuda_shared_memory):
                        out = results.as_numpy(OUTPUT)
                    else:
                        output = results.get_output(OUTPUT)
                        if config[1] == "http":
                            output_shape = output["shape"]
                        else:
                            output_shape = output.shape
                        output_type = input_dtype
                        if _test_system_shared_memory:
                            out = shm.get_contents_as_numpy(shm_op_handle, output_type, output_shape)
                        else:
                            out = cudashm.get_contents_as_numpy(shm_op_handle, output_type, output_shape)
                    result = out[0] if "nobatch" in trial else out[0][0]
                    print("{}: {}".format(sequence_name, result))

                    if thresholds is not None:
                        lt_ms = thresholds[0]
                        gt_ms = thresholds[1]
                        if lt_ms is not None:
                            self.assertTrue((end_ms - start_ms) < lt_ms,
                                            "expected less than " + str(lt_ms) +
                                            "ms response time, got " + str(end_ms - start_ms) + " ms")
                        if gt_ms is not None:
                            self.assertTrue((end_ms - start_ms) > gt_ms,
                                            "expected greater than " + str(gt_ms) +
                                            "ms response time, got " + str(end_ms - start_ms) + " ms")
                    if delay_ms is not None:
                        time.sleep(delay_ms[1] / 1000.0)

                seq_end_ms = int(round(time.time() * 1000))

                if input_dtype == np.object_:
                    self.assertEqual(int(result), expected_result)
                else:
                    self.assertEqual(result, expected_result)

                if sequence_thresholds is not None:
                    lt_ms = sequence_thresholds[0]
                    gt_ms = sequence_thresholds[1]
                    if lt_ms is not None:
                        self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                        "sequence expected less than " + str(lt_ms) +
                                        "ms response time, got " + str(seq_end_ms - seq_start_ms) + " ms")
                    if gt_ms is not None:
                        self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                        "sequence expected greater than " + str(gt_ms) +
                                        "ms response time, got " + str(seq_end_ms - seq_start_ms) + " ms")
            except Exception as ex:
                self.add_deferred_exception(ex)
            if config[2]:
                triton_client.stop_stream()

        if _test_system_shared_memory or _test_cuda_shared_memory:
            self.triton_client_.unregister_system_shared_memory()
            self.triton_client_.unregister_cuda_shared_memory()
            destroy_func = shm.destroy_shared_memory_region if _test_system_shared_memory else cudashm.destroy_shared_memory_region
            destroy_func(shm_op_handle)
            for shm_ip_handle in shm_ip_handles:
                destroy_func(shm_ip_handle)
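
    # A hedged invocation sketch for check_sequence. The tuple layout follows
    # the docstring: (flag_str, value, (lt_ms, gt_ms), (pre_delay_ms,
    # post_delay_ms)). The model name, thresholds, and expected_result below
    # are illustrative assumptions, not values from this test suite.
    def example_check_sequence_usage(self):
        self.check_sequence(
            "graphdef",                    # trial type, must match the model
            "graphdef_sequence_int32",     # hypothetical model name
            np.int32,
            correlation_id=1001,
            sequence_thresholds=(8000, None),  # whole sequence < 8000 ms
            values=[("start", 1, None, None),
                    (None, 2, None, (500, 0)),  # sleep 500 ms before request
                    ("end", 3, None, None)],
            expected_result=6,             # assumes an accumulating model
            protocol="grpc",
            sequence_name="seq-1001")
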
Example #4
    def precreate_register_shape_tensor_regions(self,
                                                value_list,
                                                dtype,
                                                i,
                                                batch_size=1,
                                                tensor_shape=(1, )):
        self.assertFalse(_test_cuda_shared_memory,
                         "Shape tensors do not support CUDA shared memory")
        if _test_system_shared_memory:
            shm_region_handles = []
            for j, (shape_value, value) in enumerate(value_list):
                input_list = list()
                shape_input_list = list()

                for b in range(batch_size):
                    if dtype == np.object_:
                        in0 = np.full(tensor_shape, value, dtype=np.int32)
                        in0n = np.array(
                            [str(x) for x in in0.reshape(in0.size)],
                            dtype=object)
                        in0 = in0n.reshape(tensor_shape)
                    else:
                        in0 = np.full(tensor_shape, value, dtype=dtype)
                    input_list.append(in0)

                # Only one shape tensor input per batch
                shape_input_list.append(
                    np.full(tensor_shape, shape_value, dtype=np.int32))

                if dtype == np.object_:
                    input_list_tmp = iu.serialize_byte_tensor_list(input_list)
                    input_byte_size = sum(
                        [serialized_byte_size(i0) for i0 in input_list_tmp])
                else:
                    input_list_tmp = input_list
                    input_byte_size = sum([i0.nbytes for i0 in input_list_tmp])

                shape_input_byte_size = sum(
                    [i0.nbytes for i0 in shape_input_list])
                shape_output_byte_size = shape_input_byte_size
                output_byte_size = np.dtype(dtype).itemsize + 2
                resized_output_byte_size = 32 * shape_value

                # create shared memory regions and copy data for input values
                ip_name = 'ip{}{}'.format(i, j)
                shape_ip_name = 'shape_ip{}{}'.format(i, j)
                shape_op_name = 'shape_op{}{}'.format(i, j)
                op_name = 'op{}{}'.format(i, j)
                resized_op_name = 'resized_op{}{}'.format(i, j)

                shm_ip_handle = shm.create_shared_memory_region(
                    ip_name, '/' + ip_name, input_byte_size)
                shm_shape_ip_handle = shm.create_shared_memory_region(
                    shape_ip_name, '/' + shape_ip_name, shape_input_byte_size)
                shm_shape_op_handle = shm.create_shared_memory_region(
                    shape_op_name, '/' + shape_op_name, shape_output_byte_size)
                shm_op_handle = shm.create_shared_memory_region(
                    op_name, '/' + op_name, output_byte_size)
                shm_resized_op_handle = shm.create_shared_memory_region(
                    resized_op_name, '/' + resized_op_name,
                    resized_output_byte_size)
                shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
                shm.set_shared_memory_region(shm_shape_ip_handle,
                                             shape_input_list)
                self.triton_client_.register_system_shared_memory(
                    ip_name, '/' + ip_name, input_byte_size)
                self.triton_client_.register_system_shared_memory(
                    shape_ip_name, '/' + shape_ip_name, shape_input_byte_size)
                self.triton_client_.register_system_shared_memory(
                    shape_op_name, '/' + shape_op_name, shape_output_byte_size)
                self.triton_client_.register_system_shared_memory(
                    op_name, '/' + op_name, output_byte_size)
                self.triton_client_.register_system_shared_memory(
                    resized_op_name, '/' + resized_op_name,
                    resized_output_byte_size)

                shm_region_handles.append(
                    (ip_name, input_byte_size, shm_ip_handle))
                shm_region_handles.append(
                    (shape_ip_name, shape_input_byte_size,
                     shm_shape_ip_handle))
                shm_region_handles.append(
                    (shape_op_name, shape_output_byte_size,
                     shm_shape_op_handle))
                shm_region_handles.append(
                    (op_name, output_byte_size, shm_op_handle))
                shm_region_handles.append(
                    (resized_op_name, resized_output_byte_size,
                     shm_resized_op_handle))
            return shm_region_handles
        else:
            return []
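
    # Hypothetical sketch: each entry in value_list yields five regions in a
    # fixed order (data input, shape input, shape output, data output,
    # resized output). The 'inputs'/'outputs' request IOs and their ordering
    # are assumptions for illustration only.
    def example_shape_tensor_usage(self, inputs, outputs):
        handles = self.precreate_register_shape_tensor_regions(
            [(4, 7)], np.int32, i=0)
        ip, shape_ip, shape_op, op, resized_op = handles[0:5]
        inputs[0].set_shared_memory(ip[0], ip[1])
        inputs[1].set_shared_memory(shape_ip[0], shape_ip[1])
        outputs[0].set_shared_memory(shape_op[0], shape_op[1])
        outputs[1].set_shared_memory(op[0], op[1])
        outputs[2].set_shared_memory(resized_op[0], resized_op[1])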