def test_invalid_create_shm(self):
    # Raises an error since we tried to create an invalid cuda shared memory region;
    # only destroy the handle if creation actually succeeded.
    try:
        shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0)
        cshm.destroy_shared_memory_region(shm_op0_handle)
    except Exception as ex:
        self.assertEqual(str(ex), "unable to create cuda shared memory handle")
def test_unregister_before_register(self):
    # Create a valid cuda shared memory region and unregister it before registering
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000", ProtocolType.HTTP, verbose=False)
    shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
    shared_memory_ctx.unregister(shm_op0_handle)
    shm_status = shared_memory_ctx.get_shared_memory_status()
    self.assertTrue(len(shm_status.shared_memory_region) == 0)
    cshm.destroy_shared_memory_region(shm_op0_handle)
def cleanup_shm_regions(self, shm_handles):
    # Unregister every handle from the server, then destroy it with the helper
    # that matches the shared memory type selected for this test run.
    if len(shm_handles) != 0:
        shared_memory_ctx = SharedMemoryControlContext("localhost:8000", ProtocolType.HTTP, verbose=True)
        for shm_tmp_handle in shm_handles:
            shared_memory_ctx.unregister(shm_tmp_handle)
            if _test_system_shared_memory:
                shm.destroy_shared_memory_region(shm_tmp_handle)
            elif _test_cuda_shared_memory:
                cudashm.destroy_shared_memory_region(shm_tmp_handle)
def test_valid_create_set_register(self):
    # Create a valid cuda shared memory region, fill data in it and register
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000", ProtocolType.HTTP, verbose=False)
    shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
    cshm.set_shared_memory_region(shm_op0_handle, [np.array([1, 2], dtype=np.float32)])
    shared_memory_ctx.cuda_register(shm_op0_handle)
    shm_status = shared_memory_ctx.get_shared_memory_status()
    self.assertTrue(len(shm_status.shared_memory_region) == 1)
    cshm.destroy_shared_memory_region(shm_op0_handle)
def test_unregister_before_register(self):
    # Create a valid cuda shared memory region and unregister before register
    shared_memory_ctx = SharedMemoryControlContext(_url,
                                                   _protocol,
                                                   verbose=True)
    shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
    shared_memory_ctx.unregister(shm_op0_handle)
    shm_status = shared_memory_ctx.get_shared_memory_status()
    self.assertTrue(len(shm_status.shared_memory_region) == 0)
    cshm.destroy_shared_memory_region(shm_op0_handle)
def test_reregister_after_register(self):
    # Create a valid cuda shared memory region and try to register it a second time
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000", ProtocolType.HTTP, verbose=False)
    shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
    shared_memory_ctx.cuda_register(shm_op0_handle)
    try:
        shared_memory_ctx.cuda_register(shm_op0_handle)
    except Exception as ex:
        self.assertTrue("shared memory block 'dummy_data' already in manager" in str(ex))
    shm_status = shared_memory_ctx.get_shared_memory_status()
    self.assertTrue(len(shm_status.shared_memory_region) == 1)
    cshm.destroy_shared_memory_region(shm_op0_handle)
def unregister_cleanup_shm_regions(shm_handles, precreated_shm_regions,
                                   outputs):
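    # Unregister and destroy the first two regions unconditionally; the remaining
    # output regions are torn down only when they were created by the test itself
    # (precreated_shm_regions is None), one per requested output.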
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=False)
    shared_memory_ctx.unregister(shm_handles[0])
    shared_memory_ctx.unregister(shm_handles[1])
    if TEST_CUDA_SHARED_MEMORY:
        cudashm.destroy_shared_memory_region(shm_handles[0])
        cudashm.destroy_shared_memory_region(shm_handles[1])
    else:
        shm.destroy_shared_memory_region(shm_handles[0])
        shm.destroy_shared_memory_region(shm_handles[1])

    if precreated_shm_regions is None:
        i = 0
        if "OUTPUT0" in outputs:
            shared_memory_ctx.unregister(shm_handles[2])
            if TEST_CUDA_SHARED_MEMORY:
                cudashm.destroy_shared_memory_region(shm_handles[2])
            else:
                shm.destroy_shared_memory_region(shm_handles[2])
            i += 1
        if "OUTPUT1" in outputs:
            shared_memory_ctx.unregister(shm_handles[2 + i])
            if TEST_CUDA_SHARED_MEMORY:
                cudashm.destroy_shared_memory_region(shm_handles[2 + i])
            else:
                shm.destroy_shared_memory_region(shm_handles[2 + i])
def unregister_cleanup_shm_regions(shm_handles, precreated_shm_regions, outputs, use_system_shared_memory, use_cuda_shared_memory):
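    # Same cleanup as above, but the caller selects system or CUDA shared memory
    # explicitly; return immediately if neither kind is in use.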
    if not (use_system_shared_memory or use_cuda_shared_memory):
        return None

    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",  ProtocolType.HTTP, verbose=False)
    shared_memory_ctx.unregister(shm_handles[0])
    shared_memory_ctx.unregister(shm_handles[1])
    if use_cuda_shared_memory:
        cudashm.destroy_shared_memory_region(shm_handles[0])
        cudashm.destroy_shared_memory_region(shm_handles[1])
    else:
        shm.destroy_shared_memory_region(shm_handles[0])
        shm.destroy_shared_memory_region(shm_handles[1])

    if precreated_shm_regions is None:
        i = 0
        if "OUTPUT0" in outputs:
            shared_memory_ctx.unregister(shm_handles[2])
            if use_cuda_shared_memory:
                cudashm.destroy_shared_memory_region(shm_handles[2])
            else:
                shm.destroy_shared_memory_region(shm_handles[2])
            i += 1
        if "OUTPUT1" in outputs:
            shared_memory_ctx.unregister(shm_handles[2 + i])
            if use_cuda_shared_memory:
                cudashm.destroy_shared_memory_region(shm_handles[2 + i])
            else:
                shm.destroy_shared_memory_region(shm_handles[2 + i])
    # We expect there to be 2 results (each with batch-size 1). Walk
    # over all 16 result elements and print the sum and difference
    # calculated by the model.
    output0_data = results['OUTPUT0'][0]
    output1_data = results['OUTPUT1'][0]

    for i in range(16):
        print(
            str(input0_data[i]) + " + " + str(input1_data[i]) + " = " +
            output0_data[i].decode("utf-8"))
        print(
            str(input0_data[i]) + " - " + str(input1_data[i]) + " = " +
            output1_data[i].decode("utf-8"))

        # Convert result from string to int to check result
        r0 = int(output0_data[i])
        r1 = int(output1_data[i])
        if expected_sum[i] != r0:
            print("error: incorrect sum")
            sys.exit(1)
        if expected_diff[i] != r1:
            print("error: incorrect difference")
            sys.exit(1)

    print(shared_memory_ctx.get_shared_memory_status())
    shared_memory_ctx.unregister_all()
    cudashm.destroy_shared_memory_region(shm_ip0_handle)
    cudashm.destroy_shared_memory_region(shm_ip1_handle)
    cudashm.destroy_shared_memory_region(shm_op0_handle)
    cudashm.destroy_shared_memory_region(shm_op1_handle)
def infer_shape_tensor(tester,
                       pf,
                       batch_size,
                       tensor_dtype,
                       input_shape_values,
                       dummy_input_shapes,
                       model_version=None,
                       use_http=True,
                       use_grpc=True,
                       use_streaming=True,
                       shm_suffix="",
                       use_system_shared_memory=False,
                       use_cuda_shared_memory=False,
                       priority=0,
                       timeout_us=0):
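    # Run a shape-tensor ("plan"/"plan_nobatch") model once per requested protocol,
    # staging the shape inputs, dummy inputs and outputs either inline or in
    # system/CUDA shared memory, then verify that each OUTPUT echoes the shape
    # values and that each DUMMY_OUTPUT has the shape those values describe.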
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))
    io_cnt = len(input_shape_values)

    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    input_dict = {}
    output_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=False)

    for io_num in range(io_cnt):
        tester.assertTrue(pf == "plan" or pf == "plan_nobatch")

        input_name = "INPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)

        input_list = list()
        dummy_input_list = list()
        expected_list = list()
        for b in range(batch_size):
            # Prepare the dummy tensor
            rtensor_dtype = _range_repr_dtype(tensor_dtype)
            if (rtensor_dtype != np.bool):
                dummy_in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                              high=np.iinfo(rtensor_dtype).max,
                                              size=dummy_input_shapes[io_num],
                                              dtype=rtensor_dtype)
            else:
                dummy_in0 = np.random.choice(a=[False, True],
                                             size=dummy_input_shapes[io_num])
            if tensor_dtype != np.object:
                dummy_in0 = dummy_in0.astype(tensor_dtype)
            else:
                dummy_in0 = np.array([str(x) for x in dummy_in0.flatten()],
                                     dtype=object).reshape(dummy_in0.shape)

            dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor. Only one tensor per batch
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected list for the output
        expected0 = np.ndarray.copy(in0)
        expected_list.append(expected0)

        expected_dict[output_name] = expected_list

        input_byte_size = len(in0) * np.dtype(tensor_dtype).itemsize
        output_byte_size = input_byte_size * batch_size
        dummy_input_byte_size = tu.shape_element_count(dummy_input_shapes[io_num]) *\
                            np.dtype(tensor_dtype).itemsize * batch_size
        # The dimension of this tensor will be the value of the shape tensor
        dummy_output_byte_size = tu.shape_element_count(in0) *\
                            np.dtype(tensor_dtype).itemsize * batch_size

        # create and register shared memory region for inputs and outputs
        if use_cuda_shared_memory:
            shm_ip_handles.append(
                cudashm.create_shared_memory_region(
                    "input" + str(io_num) + "_data" + shm_suffix,
                    input_byte_size, 0))
            shm_ip_handles.append(
                cudashm.create_shared_memory_region(
                    "dummy_input" + str(io_num) + "_data" + shm_suffix,
                    dummy_input_byte_size, 0))
            shm_op_handles.append(
                cudashm.create_shared_memory_region(
                    "output" + str(io_num) + "_data" + shm_suffix,
                    output_byte_size, 0))
            shm_op_handles.append(
                cudashm.create_shared_memory_region(
                    "dummy_output" + str(io_num) + "_data" + shm_suffix,
                    dummy_output_byte_size, 0))

            shared_memory_ctx.cuda_register(shm_ip_handles[2 * io_num])
            shared_memory_ctx.cuda_register(shm_ip_handles[2 * io_num + 1])
            shared_memory_ctx.cuda_register(shm_op_handles[2 * io_num])
            shared_memory_ctx.cuda_register(shm_op_handles[2 * io_num + 1])

            # copy data into shared memory region for input values
            cudashm.set_shared_memory_region(shm_ip_handles[2 * io_num],
                                             input_list)
            cudashm.set_shared_memory_region(shm_ip_handles[2 * io_num + 1],
                                             dummy_input_list)
        elif use_system_shared_memory:
            shm_ip_handles.append(shm.create_shared_memory_region("input"+str(io_num)+"_data"+shm_suffix,\
                                        "/input"+str(io_num)+shm_suffix, input_byte_size))
            shm_ip_handles.append(shm.create_shared_memory_region("dumy_input"+str(io_num)+"_data"+shm_suffix,\
                                        "/dummy_input"+str(io_num)+shm_suffix, dummy_input_byte_size))
            shm_op_handles.append(shm.create_shared_memory_region("output"+str(io_num)+"_data"+shm_suffix,\
                                        "/output"+str(io_num)+shm_suffix, output_byte_size))
            shm_op_handles.append(shm.create_shared_memory_region("dummy_output"+str(io_num)+"_data"+shm_suffix,\
                                        "/dummy_output"+str(io_num)+shm_suffix, dummy_output_byte_size))
            shared_memory_ctx.register(shm_ip_handles[2 * io_num])
            shared_memory_ctx.register(shm_ip_handles[2 * io_num + 1])
            shared_memory_ctx.register(shm_op_handles[2 * io_num])
            shared_memory_ctx.register(shm_op_handles[2 * io_num + 1])
            # copy data into shared memory region for input values
            shm.set_shared_memory_region(shm_ip_handles[2 * io_num],
                                         input_list)
            shm.set_shared_memory_region(shm_ip_handles[2 * io_num + 1],
                                         dummy_input_list)
        if use_system_shared_memory or use_cuda_shared_memory:
            input_dict[input_name] = (shm_ip_handles[2 * io_num],
                                      [len(input_shape_values[0])])
            input_dict[dummy_input_name] = (shm_ip_handles[2 * io_num + 1],
                                            dummy_input_shapes[io_num])
            output_dict[output_name] = (InferContext.ResultFormat.RAW,
                                        shm_op_handles[2 * io_num])
            output_dict[dummy_output_name] = (InferContext.ResultFormat.RAW,
                                              shm_op_handles[2 * io_num + 1])
        else:
            input_dict[input_name] = input_list
            input_dict[dummy_input_name] = dummy_input_list
            output_dict[output_name] = InferContext.ResultFormat.RAW
            output_dict[dummy_output_name] = InferContext.ResultFormat.RAW

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        ctx = InferContext(config[0],
                           config[1],
                           model_name,
                           model_version,
                           correlation_id=0,
                           streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict,
                          output_dict,
                          batch_size,
                          priority=priority,
                          timeout_us=timeout_us)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), 2 * io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            expected = expected_dict[output_name][0]
            for b in range(batch_size):
                if result_name == output_name:
                    tester.assertEqual(result_val[b].shape, expected.shape)
                    tester.assertTrue(
                        np.array_equal(result_val[b], expected),
                        "{}, {}, slot {}, expected: {}, got {}".format(
                            model_name, result_name, b, expected,
                            result_val[b]))
                elif result_name == dummy_output_name:
                    # The shape of the dummy output should be equal to the shape values
                    # specified in the shape tensor
                    tester.assertTrue(
                        np.array_equal(result_val[b].shape, expected),
                        "{}, {}, slot {}, expected: {}, got {}".format(
                            model_name, result_name, b, expected,
                            result_val[b]))

    if use_cuda_shared_memory or use_system_shared_memory:
        for io_num in range(2 * io_cnt):
            shared_memory_ctx.unregister(shm_ip_handles[io_num])
            shared_memory_ctx.unregister(shm_op_handles[io_num])
            if use_cuda_shared_memory:
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
    def check_sequence(self,
                       trial,
                       model_name,
                       input_dtype,
                       correlation_id,
                       sequence_thresholds,
                       values,
                       expected_result,
                       protocol,
                       batch_size=1,
                       sequence_name="<unknown>",
                       tensor_shape=(1, )):
        """Perform sequence of inferences. The 'values' holds a list of
        tuples, one for each inference with format:

        (flag_str, value, (lt_ms, gt_ms), (pre_delay_ms, post_delay_ms))

        """
        if (("savedmodel" not in trial) and ("graphdef" not in trial)
                and ("netdef" not in trial) and ("custom" not in trial)
                and ("onnx" not in trial) and ("libtorch" not in trial)
                and ("plan" not in trial)):
            self.assertFalse(True, "unknown trial type: " + trial)

        # Can only send the request exactly once since it is a
        # sequence model with state, so can have only a single config.
        configs = []
        if protocol == "http":
            configs.append(("localhost:8000", ProtocolType.HTTP, False))
        if protocol == "grpc":
            configs.append(("localhost:8001", ProtocolType.GRPC, False))
        if protocol == "streaming":
            configs.append(("localhost:8001", ProtocolType.GRPC, True))

        self.assertFalse(
            _test_system_shared_memory and _test_cuda_shared_memory,
            "Cannot set both System and CUDA shared memory flags to 1")

        self.assertEqual(len(configs), 1)

        # create and register shared memory output region in advance
        if _test_system_shared_memory or _test_cuda_shared_memory:
            shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                           ProtocolType.HTTP,
                                                           verbose=True)
            output_byte_size = 512
            if _test_system_shared_memory:
                shm_op_handle = shm.create_shared_memory_region(
                    "output_data", "/output", output_byte_size)
                shared_memory_ctx.unregister(shm_op_handle)
                shared_memory_ctx.register(shm_op_handle)
            elif _test_cuda_shared_memory:
                shm_op_handle = cudashm.create_shared_memory_region(
                    "output_data", output_byte_size, 0)
                shared_memory_ctx.unregister(shm_op_handle)
                shared_memory_ctx.cuda_register(shm_op_handle)

        for config in configs:
            ctx = InferContext(config[0],
                               config[1],
                               model_name,
                               correlation_id=correlation_id,
                               streaming=config[2],
                               verbose=True)
            # Execute the sequence of inference...
            try:
                seq_start_ms = int(round(time.time() * 1000))

                for flag_str, value, thresholds, delay_ms in values:
                    if delay_ms is not None:
                        time.sleep(delay_ms[0] / 1000.0)

                    flags = InferRequestHeader.FLAG_NONE
                    if flag_str is not None:
                        if "start" in flag_str:
                            flags = flags | InferRequestHeader.FLAG_SEQUENCE_START
                        if "end" in flag_str:
                            flags = flags | InferRequestHeader.FLAG_SEQUENCE_END

                    input_list = list()
                    for b in range(batch_size):
                        if input_dtype == np.object:
                            in0 = np.full(tensor_shape, value, dtype=np.int32)
                            in0n = np.array(
                                [str(x) for x in in0.reshape(in0.size)],
                                dtype=object)
                            in0 = in0n.reshape(tensor_shape)
                        else:
                            in0 = np.full(tensor_shape,
                                          value,
                                          dtype=input_dtype)
                        input_list.append(in0)

                    # create input shared memory and copy input data values into it
                    if _test_system_shared_memory or _test_cuda_shared_memory:
                        input_list_tmp = iu._prepend_string_size(
                            input_list) if (input_dtype
                                            == np.object) else input_list
                        input_byte_size = sum(
                            [i0.nbytes for i0 in input_list_tmp])
                        if _test_system_shared_memory:
                            shm_ip_handle = shm.create_shared_memory_region(
                                "input_data", "/input", input_byte_size)
                            shm.set_shared_memory_region(
                                shm_ip_handle, input_list_tmp)
                            shared_memory_ctx.unregister(shm_ip_handle)
                            shared_memory_ctx.register(shm_ip_handle)
                        elif _test_cuda_shared_memory:
                            shm_ip_handle = cudashm.create_shared_memory_region(
                                "input_data", input_byte_size, 0)
                            cudashm.set_shared_memory_region(
                                shm_ip_handle, input_list_tmp)
                            shared_memory_ctx.unregister(shm_ip_handle)
                            shared_memory_ctx.cuda_register(shm_ip_handle)

                        input_info = (shm_ip_handle, tensor_shape)
                        output_info = (InferContext.ResultFormat.RAW,
                                       shm_op_handle)
                    else:
                        input_info = input_list
                        output_info = InferContext.ResultFormat.RAW

                    start_ms = int(round(time.time() * 1000))
                    INPUT = "INPUT__0" if trial.startswith(
                        "libtorch") else "INPUT"
                    OUTPUT = "OUTPUT__0" if trial.startswith(
                        "libtorch") else "OUTPUT"

                    results = ctx.run({INPUT: input_info},
                                      {OUTPUT: output_info},
                                      batch_size=batch_size,
                                      flags=flags)

                    end_ms = int(round(time.time() * 1000))

                    self.assertEqual(len(results), 1)
                    self.assertTrue(OUTPUT in results)
                    result = results[OUTPUT][0][0]
                    print("{}: {}".format(sequence_name, result))

                    if thresholds is not None:
                        lt_ms = thresholds[0]
                        gt_ms = thresholds[1]
                        if lt_ms is not None:
                            self.assertTrue(
                                (end_ms - start_ms) < lt_ms,
                                "expected less than " + str(lt_ms) +
                                "ms response time, got " +
                                str(end_ms - start_ms) + " ms")
                        if gt_ms is not None:
                            self.assertTrue(
                                (end_ms - start_ms) > gt_ms,
                                "expected greater than " + str(gt_ms) +
                                "ms response time, got " +
                                str(end_ms - start_ms) + " ms")
                    if delay_ms is not None:
                        time.sleep(delay_ms[1] / 1000.0)

                seq_end_ms = int(round(time.time() * 1000))

                if input_dtype == np.object:
                    self.assertEqual(int(result), expected_result)
                else:
                    self.assertEqual(result, expected_result)

                if sequence_thresholds is not None:
                    lt_ms = sequence_thresholds[0]
                    gt_ms = sequence_thresholds[1]
                    if lt_ms is not None:
                        self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                        "sequence expected less than " +
                                        str(lt_ms) + "ms response time, got " +
                                        str(seq_end_ms - seq_start_ms) + " ms")
                    if gt_ms is not None:
                        self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                        "sequence expected greater than " +
                                        str(gt_ms) + "ms response time, got " +
                                        str(seq_end_ms - seq_start_ms) + " ms")
            except Exception as ex:
                self.add_deferred_exception(ex)

        if _test_system_shared_memory or _test_cuda_shared_memory:
            shared_memory_ctx.unregister(shm_op_handle)
            if _test_system_shared_memory:
                shm.destroy_shared_memory_region(shm_op_handle)
            elif _test_cuda_shared_memory:
                cudashm.destroy_shared_memory_region(shm_op_handle)
def infer_zero(tester, pf, batch_size, tensor_dtype, input_shapes, output_shapes,
               model_version=None, use_http=True, use_grpc=True,
               use_streaming=True):
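    # Run an identity ("zero") model once per requested protocol, optionally
    # staging inputs and outputs in system or CUDA shared memory, and check that
    # every output matches the randomly generated input it should echo back.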
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if TEST_CUDA_SHARED_MEMORY and TEST_SYSTEM_SHARED_MEMORY:
        raise ValueError("Cannot set both System and CUDA shared memory flags to 1")

    input_dict = {}
    output_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",  ProtocolType.HTTP, verbose=False)

    for io_num in range(io_cnt):
        if pf == "libtorch" or pf == "libtorch_nobatch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_list = list()
        expected_list = list()
        for b in range(batch_size):
            rtensor_dtype = _range_repr_dtype(tensor_dtype)
            in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                    high=np.iinfo(rtensor_dtype).max,
                                    size=input_shapes[io_num], dtype=rtensor_dtype)
            if tensor_dtype != np.object:
                in0 = in0.astype(tensor_dtype)
                expected0 = np.ndarray.copy(in0)
            else:
                expected0 = np.array([unicode(str(x), encoding='utf-8')
                                for x in in0.flatten()], dtype=object)
                in0 = np.array([str(x) for x in in0.flatten()],
                                dtype=object).reshape(in0.shape)

            expected0 = expected0.reshape(output_shapes[io_num])

            input_list.append(in0)
            expected_list.append(expected0)

        expected_dict[output_name] = expected_list

        input_byte_size = tu.shape_element_count(input_shapes[io_num]) *\
                            np.dtype(tensor_dtype).itemsize * batch_size
        output_byte_size = tu.shape_element_count(output_shapes[io_num]) *\
                            np.dtype(tensor_dtype).itemsize * batch_size
        # create and register shared memory region for inputs and outputs
        if TEST_CUDA_SHARED_MEMORY:
            shm_ip_handles.append(cudashm.create_shared_memory_region("input"+str(io_num)+"_data",
                                                                input_byte_size, 0))
            shm_op_handles.append(cudashm.create_shared_memory_region("output"+str(io_num)+"_data",
                                                                output_byte_size, 0))
            shared_memory_ctx.cuda_register(shm_ip_handles[io_num])
            shared_memory_ctx.cuda_register(shm_op_handles[io_num])
            # copy data into shared memory region for input values
            cudashm.set_shared_memory_region(shm_ip_handles[io_num], input_list)
        elif TEST_SYSTEM_SHARED_MEMORY:
            shm_ip_handles.append(shm.create_shared_memory_region("input"+str(io_num)+"_data",\
                                        "/input"+str(io_num), input_byte_size))
            shm_op_handles.append(shm.create_shared_memory_region("output"+str(io_num)+"_data",\
                                        "/output"+str(io_num), output_byte_size))
            shared_memory_ctx.register(shm_ip_handles[io_num])
            shared_memory_ctx.register(shm_op_handles[io_num])
            # copy data into shared memory region for input values
            shm.set_shared_memory_region(shm_ip_handles[io_num], input_list)

        if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY:
            input_dict[input_name] = shm_ip_handles[io_num]
            output_dict[output_name] = (InferContext.ResultFormat.RAW, shm_op_handles[io_num])
        else:
            input_dict[input_name] = input_list
            output_dict[output_name] = InferContext.ResultFormat.RAW

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        ctx = InferContext(config[0], config[1], model_name, model_version,
                           correlation_id=0, streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict, output_dict, batch_size)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(), model_version)

        tester.assertEqual(len(results), io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            tester.assertTrue(result_name in expected_dict)
            for b in range(batch_size):
                expected = expected_dict[result_name][b]
                tester.assertEqual(result_val[b].shape, expected.shape)
                tester.assertTrue(np.array_equal(result_val[b], expected),
                                  "{}, {}, slot {}, expected: {}, got {}".format(
                                      model_name, result_name, b, expected, result_val[b]))

    if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY:
        for io_num in range(io_cnt):
            shared_memory_ctx.unregister(shm_ip_handles[io_num])
            shared_memory_ctx.unregister(shm_op_handles[io_num])
            if TEST_CUDA_SHARED_MEMORY:
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
def _cleanup_server(self, shm_handles):
    for shm_handle in shm_handles:
        cshm.destroy_shared_memory_region(shm_handle)
def destroy_either_shm_region(shm_handle, use_system_shared_memory, use_cuda_shared_memory):
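    # Pick the destroy helper that matches how the region was created:
    # cudashm for CUDA shared memory, shm for system shared memory.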
    if use_cuda_shared_memory:
        cudashm.destroy_shared_memory_region(shm_handle)
    else:
        shm.destroy_shared_memory_region(shm_handle)
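
# A minimal end-to-end sketch tying the helpers above together: create one CUDA
# shared memory region for an input and one for an output, register both with
# the server, run a single inference, then unregister and destroy the regions.
# Illustrative only: the import paths assume the legacy tensorrtserver Python
# client used throughout these examples, and the model name "simple", its
# INPUT0/OUTPUT0 tensor names and the 16-float sizes are hypothetical.
import numpy as np
import tensorrtserver.cuda_shared_memory as cudashm
from tensorrtserver.api import InferContext, ProtocolType, SharedMemoryControlContext

def cuda_shm_roundtrip_example():
    byte_size = 16 * np.dtype(np.float32).itemsize
    input_data = np.arange(16, dtype=np.float32)

    # Create the regions on GPU 0 and copy the input values into the input region.
    shm_ip_handle = cudashm.create_shared_memory_region("input_data", byte_size, 0)
    shm_op_handle = cudashm.create_shared_memory_region("output_data", byte_size, 0)
    cudashm.set_shared_memory_region(shm_ip_handle, [input_data])

    # Register both regions with the server before they are used in a request.
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=False)
    shared_memory_ctx.cuda_register(shm_ip_handle)
    shared_memory_ctx.cuda_register(shm_op_handle)

    # Reference the regions in the request: inputs as (handle, shape) tuples,
    # outputs as (ResultFormat.RAW, handle), exactly as in the helpers above.
    ctx = InferContext("localhost:8000", ProtocolType.HTTP, "simple", None,
                       correlation_id=0, streaming=False, verbose=False)
    results = ctx.run({"INPUT0": (shm_ip_handle, [16])},
                      {"OUTPUT0": (InferContext.ResultFormat.RAW, shm_op_handle)},
                      batch_size=1)

    # Unregister before destroying so the server drops its mapping first.
    shared_memory_ctx.unregister(shm_ip_handle)
    shared_memory_ctx.unregister(shm_op_handle)
    cudashm.destroy_shared_memory_region(shm_ip_handle)
    cudashm.destroy_shared_memory_region(shm_op_handle)
    return results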