def create_register_set_either_shm_region(shm_region_names, input_list,
                                          input_byte_size, output_byte_size,
                                          shared_memory_ctx,
                                          use_system_shared_memory,
                                          use_cuda_shared_memory):
    """Create, register and fill one input and one output shared memory region.

    Args:
        shm_region_names: Sequence whose first two entries are the base names
            of the input and output regions ("_data" is appended to each).
        input_list: Tensors copied into the input region.
        input_byte_size: Size in bytes of the input region.
        output_byte_size: Size in bytes of the output region.
        shared_memory_ctx: Control context used to register the regions.
        use_system_shared_memory: Use the system shared memory API (`shm`).
        use_cuda_shared_memory: Use the CUDA shared memory API (`cudashm`).

    Returns:
        `[input_handle, output_handle]`, or `[]` when neither flag is set.

    Raises:
        ValueError: If both flags are set.
    """
    if use_cuda_shared_memory and use_system_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    if not (use_system_shared_memory or use_cuda_shared_memory):
        return []

    input_name = shm_region_names[0] + "_data"
    output_name = shm_region_names[1] + "_data"

    # Each flag must drive its own API: previously the two branches were
    # swapped, so the system flag allocated CUDA regions and vice versa.
    if use_cuda_shared_memory:
        shm_ip_handle = cudashm.create_shared_memory_region(
            input_name, input_byte_size, 0)
        shm_op_handle = cudashm.create_shared_memory_region(
            output_name, output_byte_size, 0)
        shared_memory_ctx.cuda_register(shm_ip_handle)
        shared_memory_ctx.cuda_register(shm_op_handle)
        # copy data into shared memory region for input values
        cudashm.set_shared_memory_region(shm_ip_handle, input_list)
    else:
        shm_ip_handle = shm.create_shared_memory_region(
            input_name, "/" + shm_region_names[0], input_byte_size)
        shm_op_handle = shm.create_shared_memory_region(
            output_name, "/" + shm_region_names[1], output_byte_size)
        shared_memory_ctx.register(shm_ip_handle)
        shared_memory_ctx.register(shm_op_handle)
        # copy data into shared memory region for input values
        shm.set_shared_memory_region(shm_ip_handle, input_list)

    return [shm_ip_handle, shm_op_handle]
def _configure_sever():
    """Create four 64-byte CUDA regions, fill the inputs and register them all.

    Returns:
        The handles ordered as [input0, input1, output0, output1].
    """
    # Regions are created outputs-first, then inputs.
    out0, out1, in0, in1 = [
        cshm.create_shared_memory_region(region_name, 64, 0)
        for region_name in ("output0_data", "output1_data",
                            "input0_data", "input1_data")
    ]

    # 16 int32 values per input.
    first_input = np.arange(start=0, stop=16, dtype=np.int32)
    second_input = np.ones(shape=16, dtype=np.int32)
    cshm.set_shared_memory_region(in0, [first_input])
    cshm.set_shared_memory_region(in1, [second_input])

    ctx = SharedMemoryControlContext("localhost:8000",
                                     ProtocolType.HTTP,
                                     verbose=False)
    # Registration happens inputs-first, then outputs.
    for handle in (in0, in1, out0, out1):
        ctx.cuda_register(handle)

    return [in0, in1, out0, out1]
def test_invalid_create_shm():
    """Check the error raised when creating a CUDA region with an invalid size.

    NOTE(review): `self` is referenced but is not a parameter of this
    function; presumably this was lifted from a TestCase method - confirm.
    """
    # Raises error since tried to create invalid cuda shared memory region
    try:
        shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0)
    except Exception as ex:
        self.assertTrue(
            str(ex) == "unable to create cuda shared memory handle")
    else:
        # Only destroy the region if it was actually created; when creation
        # raises, the handle name is unbound and destroying it would fail
        # with UnboundLocalError.
        cshm.destroy_shared_memory_region(shm_op0_handle)
def test_unregister_before_register():
    """Unregister a never-registered CUDA region and expect no regions listed.

    NOTE(review): `self` is referenced but is not a parameter of this
    function; presumably this was lifted from a TestCase method - confirm.
    """
    ctx = SharedMemoryControlContext("localhost:8000",
                                     ProtocolType.HTTP,
                                     verbose=False)
    handle = cshm.create_shared_memory_region("dummy_data", 8, 0)

    # Unregistering first must leave the server's region list empty.
    ctx.unregister(handle)
    status = ctx.get_shared_memory_status()
    region_count = len(status.shared_memory_region)
    self.assertTrue(region_count == 0)

    cshm.destroy_shared_memory_region(handle)
def test_valid_create_set_register():
    """Create an 8-byte CUDA region, fill it, register it and expect one region.

    NOTE(review): `self` is referenced but is not a parameter of this
    function; presumably this was lifted from a TestCase method - confirm.
    """
    ctx = SharedMemoryControlContext("localhost:8000",
                                     ProtocolType.HTTP,
                                     verbose=False)
    handle = cshm.create_shared_memory_region("dummy_data", 8, 0)

    # Two float32 values exactly fill the 8-byte region.
    payload = np.array([1, 2], dtype=np.float32)
    cshm.set_shared_memory_region(handle, [payload])
    ctx.cuda_register(handle)

    status = ctx.get_shared_memory_status()
    region_count = len(status.shared_memory_region)
    self.assertTrue(region_count == 1)

    cshm.destroy_shared_memory_region(handle)
def test_unregister_before_register(self):
    """Unregister a never-registered CUDA region and expect no regions listed."""
    ctx = SharedMemoryControlContext(_url, _protocol, verbose=True)
    handle = cshm.create_shared_memory_region("dummy_data", 8, 0)

    # Unregistering first must leave the server's region list empty.
    ctx.unregister(handle)
    status = ctx.get_shared_memory_status()
    region_count = len(status.shared_memory_region)
    self.assertTrue(region_count == 0)

    cshm.destroy_shared_memory_region(handle)
def _configure_sever(self):
    """Create four 64-byte CUDA regions, fill the inputs and register them all.

    Returns:
        The handles ordered as [input0, input1, output0, output1].
    """
    # Regions are created outputs-first, then inputs.
    out0, out1, in0, in1 = [
        cshm.create_shared_memory_region(region_name, 64, 0)
        for region_name in ("output0_data", "output1_data",
                            "input0_data", "input1_data")
    ]

    # 16 int32 values per input.
    first_input = np.arange(start=0, stop=16, dtype=np.int32)
    second_input = np.ones(shape=16, dtype=np.int32)
    cshm.set_shared_memory_region(in0, [first_input])
    cshm.set_shared_memory_region(in1, [second_input])

    ctx = SharedMemoryControlContext(_url, _protocol, verbose=True)
    # Registration happens inputs-first, then outputs.
    for handle in (in0, in1, out0, out1):
        ctx.cuda_register(handle)

    return [in0, in1, out0, out1]
def precreate_register_regions(self, value_list, dtype, i, batch_size=1):
    """Pre-create and register one input and one output region per value.

    Args:
        value_list: Values; each one fills the input tensors of one region.
        dtype: Tensor dtype; `object` selects the string-tensor path.
        i: Index embedded in the region names (`ip{i}{j}` / `op{i}{j}`).
        batch_size: Number of input tensors written to each input region.

    Returns:
        A flat list `[ip0, op0, ip1, op1, ...]` of region handles, or `[]`
        when neither shared-memory flag is enabled.
    """
    if not (_test_system_shared_memory or _test_cuda_shared_memory):
        return []

    # `np.object` was only an alias of the builtin `object` and has been
    # removed from NumPy; comparing against `object` is equivalent.
    is_string = (dtype == object)

    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=True)
    shm_region_handles = []
    for j, value in enumerate(value_list):
        # create data
        input_list = list()
        for b in range(batch_size):
            if is_string:
                in0 = np.full((1,), value, dtype=np.int32)
                in0n = np.array([str(x) for x in in0.reshape(in0.size)],
                                dtype=object)
                in0 = in0n.reshape((1,))
            else:
                in0 = np.full((1,), value, dtype=dtype)
            input_list.append(in0)

        input_list_tmp = (iu._prepend_string_size(input_list)
                          if is_string else input_list)
        input_byte_size = sum(i0.nbytes for i0 in input_list_tmp)
        output_byte_size = np.dtype(dtype).itemsize + 2

        # create shared memory regions and copy data for input values
        if _test_system_shared_memory:
            shm_ip_handle = shm.create_shared_memory_region(
                'ip{}{}_data'.format(i, j), '/ip{}{}'.format(i, j),
                input_byte_size)
            shm_op_handle = shm.create_shared_memory_region(
                'op{}{}_data'.format(i, j), '/op{}{}'.format(i, j),
                output_byte_size)
            shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
            shared_memory_ctx.register(shm_ip_handle)
            shared_memory_ctx.register(shm_op_handle)
        elif _test_cuda_shared_memory:
            shm_ip_handle = cudashm.create_shared_memory_region(
                'ip{}{}_data'.format(i, j), input_byte_size, 0)
            shm_op_handle = cudashm.create_shared_memory_region(
                'op{}{}_data'.format(i, j), output_byte_size, 0)
            cudashm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
            shared_memory_ctx.cuda_register(shm_ip_handle)
            shared_memory_ctx.cuda_register(shm_op_handle)
        shm_region_handles.append(shm_ip_handle)
        shm_region_handles.append(shm_op_handle)
    return shm_region_handles
def create_cuda_shm(data, name, url, protocol, is_input=True, device_id=3):
    """Create a CUDA shared memory region sized for `data` and register it.

    Args:
        data: Array-like exposing `size` and `itemsize`; its contents are
            copied into the region when `is_input` is true.
        name: Name of the shared memory region.
        url: Server URL passed to `SharedMemoryControlContext`.
        protocol: Protocol passed to `SharedMemoryControlContext`.
        is_input: When true, copy `data` into the region before registering.
        device_id: GPU device on which the region is allocated. Defaults to
            3, the value that was previously hard-coded.

    Returns:
        The handle of the created region, so the caller can later unregister
        and destroy it (previously the handle was dropped).
    """
    shared_memory_ctx = SharedMemoryControlContext(url, protocol)
    byte_size = data.size * data.itemsize
    shm_handle = cudashm.create_shared_memory_region(name, byte_size,
                                                     device_id)
    if is_input:
        cudashm.set_shared_memory_region(shm_handle, [data])
    # Registration is the same for input and output regions.
    shared_memory_ctx.cuda_register(shm_handle)
    return shm_handle
def test_reregister_after_register():
    """Register the same CUDA region twice and expect a single listed region.

    NOTE(review): `self` is referenced but is not a parameter of this
    function; presumably this was lifted from a TestCase method - confirm.
    """
    ctx = SharedMemoryControlContext("localhost:8000",
                                     ProtocolType.HTTP,
                                     verbose=False)
    handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
    ctx.cuda_register(handle)

    # A second registration of the same region may be rejected; if it is,
    # the error text must mention the duplicate block.
    try:
        ctx.cuda_register(handle)
    except Exception as ex:
        message = str(ex)
        self.assertTrue(
            "shared memory block 'dummy_data' already in manager" in message)

    status = ctx.get_shared_memory_status()
    region_count = len(status.shared_memory_region)
    self.assertTrue(region_count == 1)

    cshm.destroy_shared_memory_region(handle)
def test_too_big_shm():
    """Run inference with an input region larger than the model expects.

    A 128-byte region is registered and used as the second input in place of
    the 64-byte region, so the server is expected to report a size mismatch.

    NOTE(review): `self` is referenced but is not a parameter of this
    function; presumably this was lifted from a TestCase method - confirm.
    """
    # Shared memory input region larger than needed - Throws error
    error_msg = []
    shm_handles = self._configure_sever()
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=False)
    shm_ip2_handle = cshm.create_shared_memory_region("input2_data", 128, 0)
    shared_memory_ctx.cuda_register(shm_ip2_handle)

    # Pass the oversized region as the second input; previously it was
    # registered but never used, so the size check below could not trigger.
    self._basic_inference(shm_handles[0], shm_ip2_handle, shm_handles[2],
                          shm_handles[3], error_msg)
    if len(error_msg) > 0:
        self.assertTrue(
            error_msg[-1] ==
            "The input 'INPUT1' has shared memory of size 128 bytes"
            " while the expected size is 1 * 64 = 64 bytes")

    shm_handles.append(shm_ip2_handle)
    self._cleanup_server(shm_handles)
def test_register_after_inference(self):
    """Register an extra CUDA region after an inference and expect 5 regions."""
    errors = []
    handles = self._configure_sever()
    ctx = SharedMemoryControlContext(_url, _protocol, verbose=True)

    self._basic_inference(handles[0], handles[1], handles[2], handles[3],
                          errors)
    if errors:
        raise Exception(str(errors))

    # Four regions come from the server configuration; this is the fifth.
    extra_handle = cshm.create_shared_memory_region("input2_data", 64, 0)
    ctx.cuda_register(extra_handle)

    status = ctx.get_shared_memory_status()
    region_count = len(status.shared_memory_region)
    self.assertTrue(region_count == 5)

    handles.append(extra_handle)
    self._cleanup_server(handles)
def test_register_during_inference():
    """Register an extra CUDA region concurrently with an inference.

    NOTE(review): `self` is referenced but is not a parameter of this
    function; presumably this was lifted from a TestCase method - confirm.
    """
    errors = []
    handles = self._configure_sever()
    ctx = SharedMemoryControlContext("localhost:8000",
                                     ProtocolType.HTTP,
                                     verbose=False)
    extra_handle = cshm.create_shared_memory_region("input2_data", 64, 0)

    # One thread runs the inference while the other registers the new region.
    infer_thread = threading.Thread(
        target=self._basic_inference,
        args=(handles[0], handles[1], handles[2], handles[3], errors))
    register_thread = threading.Thread(target=ctx.cuda_register,
                                       args=(extra_handle,))
    infer_thread.start()
    register_thread.start()
    infer_thread.join()
    register_thread.join()

    if errors:
        raise Exception(str(errors))

    status = ctx.get_shared_memory_status()
    region_count = len(status.shared_memory_region)
    self.assertTrue(region_count == 5)

    handles.append(extra_handle)
    self._cleanup_server(handles)
input0_data = in0n.reshape(in0.shape)
in1n = np.array([str(x) for x in in1.reshape(in1.size)], dtype=object)
input1_data = in1n.reshape(in1.shape)

# serialize the string tensors
input0_data_serialized = cudashm.serialize_string_tensor(input0_data)
input1_data_serialized = cudashm.serialize_string_tensor(input1_data)

# Use the size of the serialized tensors to create the shared memory regions
input0_byte_size = input0_data_serialized.size * input0_data_serialized.itemsize
input1_byte_size = input1_data_serialized.size * input1_data_serialized.itemsize
# Both outputs share one size: the larger serialized input plus one byte.
# (This assignment used to be duplicated on two consecutive statements.)
output_byte_size = max(input0_byte_size, input1_byte_size) + 1

# Create Output0 and Output1 in Shared Memory and store shared memory handles
shm_op0_handle = cudashm.create_shared_memory_region(
    "output0_data", output_byte_size, 0)
shm_op1_handle = cudashm.create_shared_memory_region(
    "output1_data", output_byte_size, 0)

# Register Output0 and Output1 shared memory with TRTIS
shared_memory_ctx.cuda_register(shm_op0_handle)
shared_memory_ctx.cuda_register(shm_op1_handle)

# Create Input0 and Input1 in Shared Memory and store shared memory handles
shm_ip0_handle = cudashm.create_shared_memory_region(
    "input0_data", input0_byte_size, 0)
shm_ip1_handle = cudashm.create_shared_memory_region(
    "input1_data", input1_byte_size, 0)

# Put input data values into shared memory
cudashm.set_shared_memory_region(shm_ip0_handle, [input0_data_serialized])
def infer_shape_tensor(tester,
                       pf,
                       batch_size,
                       tensor_dtype,
                       input_shape_values,
                       dummy_input_shapes,
                       model_version=None,
                       use_http=True,
                       use_grpc=True,
                       use_streaming=True,
                       shm_suffix="",
                       use_system_shared_memory=False,
                       use_cuda_shared_memory=False,
                       priority=0,
                       timeout_us=0):
    """Run a shape-tensor model over each enabled protocol and check results.

    For every entry of `input_shape_values` the request carries a shape
    tensor input (`INPUT{n}`) and a random dummy input (`DUMMY_INPUT{n}`).
    `OUTPUT{n}` is expected to equal the shape tensor, and the shape of
    `DUMMY_OUTPUT{n}` is expected to equal the shape tensor's values.

    Args:
        tester: unittest-style object providing the assert* methods.
        pf: Platform string; must be "plan" or "plan_nobatch".
        batch_size: Batch size of the request.
        tensor_dtype: Dtype of the dummy tensors.
        input_shape_values: One list of shape values per input.
        dummy_input_shapes: One shape per dummy input.
        model_version: Optional model version to request and verify.
        use_http, use_grpc, use_streaming: Protocols to exercise; at least
            one must be true.
        shm_suffix: Suffix appended to shared memory region names and keys.
        use_system_shared_memory: Pass tensors through system shared memory.
        use_cuda_shared_memory: Pass tensors through CUDA shared memory.
        priority: Request priority forwarded to `ctx.run`.
        timeout_us: Request timeout forwarded to `ctx.run`.

    Returns:
        The results of the last inference that was run.

    Raises:
        ValueError: If both shared memory flags are set.
    """
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))
    tester.assertEqual(len(input_shape_values), len(dummy_input_shapes))
    io_cnt = len(input_shape_values)

    if use_system_shared_memory and use_cuda_shared_memory:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    input_dict = {}
    output_dict = {}
    expected_dict = {}
    # Maps each dummy output name to the shape values its shape must match.
    dummy_expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=False)

    for io_num in range(io_cnt):
        tester.assertTrue(pf == "plan" or pf == "plan_nobatch")

        input_name = "INPUT{}".format(io_num)
        output_name = "OUTPUT{}".format(io_num)
        dummy_input_name = "DUMMY_INPUT{}".format(io_num)
        dummy_output_name = "DUMMY_OUTPUT{}".format(io_num)

        input_list = list()
        dummy_input_list = list()
        expected_list = list()
        for b in range(batch_size):
            # Prepare the dummy tensor
            rtensor_dtype = _range_repr_dtype(tensor_dtype)
            # The removed `np.bool` alias meant the builtin `bool`; accept
            # NumPy's own boolean scalar type as well.
            if rtensor_dtype not in (bool, np.bool_):
                dummy_in0 = np.random.randint(
                    low=np.iinfo(rtensor_dtype).min,
                    high=np.iinfo(rtensor_dtype).max,
                    size=dummy_input_shapes[io_num],
                    dtype=rtensor_dtype)
            else:
                dummy_in0 = np.random.choice(a=[False, True],
                                             size=dummy_input_shapes[io_num])
            # `np.object` was only an alias of the builtin `object`.
            if tensor_dtype != object:
                dummy_in0 = dummy_in0.astype(tensor_dtype)
            else:
                # Stringify the dummy tensor itself; this previously read
                # `in0`, which is not assigned until after this loop.
                dummy_in0 = np.array(
                    [str(x) for x in dummy_in0.flatten()],
                    dtype=object).reshape(dummy_in0.shape)
            dummy_input_list.append(dummy_in0)

        # Prepare shape input tensor. Only one tensor per batch
        in0 = np.asarray(input_shape_values[io_num], dtype=np.int32)
        input_list.append(in0)

        # Prepare the expected list for the output
        expected0 = np.ndarray.copy(in0)
        expected_list.append(expected0)

        expected_dict[output_name] = expected_list
        dummy_expected_dict[dummy_output_name] = expected0

        input_byte_size = len(in0) * np.dtype(tensor_dtype).itemsize
        output_byte_size = input_byte_size * batch_size
        dummy_input_byte_size = tu.shape_element_count(dummy_input_shapes[io_num]) *\
                            np.dtype(tensor_dtype).itemsize * batch_size
        # The dimension of this tensor will be the value of the shape tensor
        dummy_output_byte_size = tu.shape_element_count(in0) *\
                            np.dtype(tensor_dtype).itemsize * batch_size

        # create and register shared memory region for inputs and outputs
        if use_cuda_shared_memory:
            shm_ip_handles.append(
                cudashm.create_shared_memory_region(
                    "input" + str(io_num) + "_data" + shm_suffix,
                    input_byte_size, 0))
            shm_ip_handles.append(
                cudashm.create_shared_memory_region(
                    "dummy_input" + str(io_num) + "_data" + shm_suffix,
                    dummy_input_byte_size, 0))
            shm_op_handles.append(
                cudashm.create_shared_memory_region(
                    "output" + str(io_num) + "_data" + shm_suffix,
                    output_byte_size, 0))
            shm_op_handles.append(
                cudashm.create_shared_memory_region(
                    "dummy_output" + str(io_num) + "_data" + shm_suffix,
                    dummy_output_byte_size, 0))

            shared_memory_ctx.cuda_register(shm_ip_handles[2 * io_num])
            shared_memory_ctx.cuda_register(shm_ip_handles[2 * io_num + 1])
            shared_memory_ctx.cuda_register(shm_op_handles[2 * io_num])
            shared_memory_ctx.cuda_register(shm_op_handles[2 * io_num + 1])

            # copy data into shared memory region for input values
            cudashm.set_shared_memory_region(shm_ip_handles[2 * io_num],
                                             input_list)
            cudashm.set_shared_memory_region(shm_ip_handles[2 * io_num + 1],
                                             dummy_input_list)
        elif use_system_shared_memory:
            shm_ip_handles.append(
                shm.create_shared_memory_region(
                    "input" + str(io_num) + "_data" + shm_suffix,
                    "/input" + str(io_num) + shm_suffix, input_byte_size))
            # Region name fixed from "dumy_input" so it matches its key and
            # the name used by the CUDA branch.
            shm_ip_handles.append(
                shm.create_shared_memory_region(
                    "dummy_input" + str(io_num) + "_data" + shm_suffix,
                    "/dummy_input" + str(io_num) + shm_suffix,
                    dummy_input_byte_size))
            shm_op_handles.append(
                shm.create_shared_memory_region(
                    "output" + str(io_num) + "_data" + shm_suffix,
                    "/output" + str(io_num) + shm_suffix, output_byte_size))
            shm_op_handles.append(
                shm.create_shared_memory_region(
                    "dummy_output" + str(io_num) + "_data" + shm_suffix,
                    "/dummy_output" + str(io_num) + shm_suffix,
                    dummy_output_byte_size))

            shared_memory_ctx.register(shm_ip_handles[2 * io_num])
            shared_memory_ctx.register(shm_ip_handles[2 * io_num + 1])
            shared_memory_ctx.register(shm_op_handles[2 * io_num])
            shared_memory_ctx.register(shm_op_handles[2 * io_num + 1])

            # copy data into shared memory region for input values
            shm.set_shared_memory_region(shm_ip_handles[2 * io_num],
                                         input_list)
            shm.set_shared_memory_region(shm_ip_handles[2 * io_num + 1],
                                         dummy_input_list)

        if use_system_shared_memory or use_cuda_shared_memory:
            # The shape of INPUT{n} is the length of *its own* shape values
            # (previously always taken from the first input).
            input_dict[input_name] = (shm_ip_handles[2 * io_num],
                                      [len(input_shape_values[io_num])])
            input_dict[dummy_input_name] = (shm_ip_handles[2 * io_num + 1],
                                            dummy_input_shapes[io_num])
            output_dict[output_name] = (InferContext.ResultFormat.RAW,
                                        shm_op_handles[2 * io_num])
            output_dict[dummy_output_name] = (InferContext.ResultFormat.RAW,
                                              shm_op_handles[2 * io_num + 1])
        else:
            input_dict[input_name] = input_list
            input_dict[dummy_input_name] = dummy_input_list
            output_dict[output_name] = InferContext.ResultFormat.RAW
            output_dict[dummy_output_name] = InferContext.ResultFormat.RAW

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        ctx = InferContext(config[0],
                           config[1],
                           model_name,
                           model_version,
                           correlation_id=0,
                           streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict,
                          output_dict,
                          batch_size,
                          priority=priority,
                          timeout_us=timeout_us)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), 2 * io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            # Look the expectation up by result name so every input/output
            # pair is verified, not only the last one built above.
            for b in range(batch_size):
                if result_name in expected_dict:
                    expected = expected_dict[result_name][0]
                    tester.assertEqual(result_val[b].shape, expected.shape)
                    tester.assertTrue(
                        np.array_equal(result_val[b], expected),
                        "{}, {}, slot {}, expected: {}, got {}".format(
                            model_name, result_name, b, expected,
                            result_val[b]))
                elif result_name in dummy_expected_dict:
                    expected = dummy_expected_dict[result_name]
                    # The shape of the dummy output should be equal to the
                    # shape values specified in the shape tensor
                    tester.assertTrue(
                        np.array_equal(result_val[b].shape, expected),
                        "{}, {}, slot {}, expected: {}, got {}".format(
                            model_name, result_name, b, expected,
                            result_val[b]))

    if use_cuda_shared_memory or use_system_shared_memory:
        for io_num in range(2 * io_cnt):
            shared_memory_ctx.unregister(shm_ip_handles[io_num])
            shared_memory_ctx.unregister(shm_op_handles[io_num])
            if use_cuda_shared_memory:
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
def check_sequence(self,
                   trial,
                   model_name,
                   input_dtype,
                   correlation_id,
                   sequence_thresholds,
                   values,
                   expected_result,
                   protocol,
                   batch_size=1,
                   sequence_name="<unknown>",
                   tensor_shape=(1, )):
    """Perform sequence of inferences. The 'values' holds a list of
    tuples, one for each inference with format:

    (flag_str, value, (lt_ms, gt_ms), (pre_delay_ms, post_delay_ms)

    Each request's response time is checked against its (lt_ms, gt_ms)
    thresholds, the whole sequence against `sequence_thresholds`, and the
    last result against `expected_result`. Any exception raised while
    running the sequence is recorded through `add_deferred_exception`
    instead of propagating.
    """
    if (("savedmodel" not in trial) and ("graphdef" not in trial) and
        ("netdef" not in trial) and ("custom" not in trial) and
        ("onnx" not in trial) and ("libtorch" not in trial) and
        ("plan" not in trial)):
        self.assertFalse(True, "unknown trial type: " + trial)

    # Can only send the request exactly once since it is a
    # sequence model with state, so can have only a single config.
    configs = []
    if protocol == "http":
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if protocol == "grpc":
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if protocol == "streaming":
        configs.append(("localhost:8001", ProtocolType.GRPC, True))

    self.assertFalse(
        _test_system_shared_memory and _test_cuda_shared_memory,
        "Cannot set both System and CUDA shared memory flags to 1")

    self.assertEqual(len(configs), 1)

    # `np.object` was only an alias of the builtin `object` and has been
    # removed from NumPy; comparing against `object` is equivalent.
    is_string = (input_dtype == object)

    # create and register shared memory output region in advance
    if _test_system_shared_memory or _test_cuda_shared_memory:
        shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                       ProtocolType.HTTP,
                                                       verbose=True)
        output_byte_size = 512
        if _test_system_shared_memory:
            shm_op_handle = shm.create_shared_memory_region(
                "output_data", "/output", output_byte_size)
            shared_memory_ctx.unregister(shm_op_handle)
            shared_memory_ctx.register(shm_op_handle)
        elif _test_cuda_shared_memory:
            shm_op_handle = cudashm.create_shared_memory_region(
                "output_data", output_byte_size, 0)
            shared_memory_ctx.unregister(shm_op_handle)
            shared_memory_ctx.cuda_register(shm_op_handle)

    for config in configs:
        ctx = InferContext(config[0],
                           config[1],
                           model_name,
                           correlation_id=correlation_id,
                           streaming=config[2],
                           verbose=True)
        # Execute the sequence of inference...
        try:
            seq_start_ms = int(round(time.time() * 1000))

            for flag_str, value, thresholds, delay_ms in values:
                if delay_ms is not None:
                    time.sleep(delay_ms[0] / 1000.0)

                flags = InferRequestHeader.FLAG_NONE
                if flag_str is not None:
                    if "start" in flag_str:
                        flags = flags | InferRequestHeader.FLAG_SEQUENCE_START
                    if "end" in flag_str:
                        flags = flags | InferRequestHeader.FLAG_SEQUENCE_END

                input_list = list()
                for b in range(batch_size):
                    if is_string:
                        in0 = np.full(tensor_shape, value, dtype=np.int32)
                        in0n = np.array(
                            [str(x) for x in in0.reshape(in0.size)],
                            dtype=object)
                        in0 = in0n.reshape(tensor_shape)
                    else:
                        in0 = np.full(tensor_shape, value, dtype=input_dtype)
                    input_list.append(in0)

                # create input shared memory and copy input data values into it
                # NOTE(review): the per-request input region is never
                # unregistered or destroyed in this function - confirm
                # whether cleanup happens elsewhere.
                if _test_system_shared_memory or _test_cuda_shared_memory:
                    input_list_tmp = iu._prepend_string_size(
                        input_list) if is_string else input_list
                    input_byte_size = sum(
                        i0.nbytes for i0 in input_list_tmp)
                    if _test_system_shared_memory:
                        shm_ip_handle = shm.create_shared_memory_region(
                            "input_data", "/input", input_byte_size)
                        shm.set_shared_memory_region(
                            shm_ip_handle, input_list_tmp)
                        shared_memory_ctx.unregister(shm_ip_handle)
                        shared_memory_ctx.register(shm_ip_handle)
                    elif _test_cuda_shared_memory:
                        shm_ip_handle = cudashm.create_shared_memory_region(
                            "input_data", input_byte_size, 0)
                        cudashm.set_shared_memory_region(
                            shm_ip_handle, input_list_tmp)
                        shared_memory_ctx.unregister(shm_ip_handle)
                        shared_memory_ctx.cuda_register(shm_ip_handle)

                    input_info = (shm_ip_handle, tensor_shape)
                    output_info = (InferContext.ResultFormat.RAW,
                                   shm_op_handle)
                else:
                    input_info = input_list
                    output_info = InferContext.ResultFormat.RAW

                start_ms = int(round(time.time() * 1000))
                INPUT = "INPUT__0" if trial.startswith(
                    "libtorch") else "INPUT"
                OUTPUT = "OUTPUT__0" if trial.startswith(
                    "libtorch") else "OUTPUT"

                results = ctx.run({INPUT: input_info}, {OUTPUT: output_info},
                                  batch_size=batch_size,
                                  flags=flags)

                end_ms = int(round(time.time() * 1000))

                self.assertEqual(len(results), 1)
                self.assertTrue(OUTPUT in results)
                result = results[OUTPUT][0][0]
                print("{}: {}".format(sequence_name, result))

                if thresholds is not None:
                    lt_ms = thresholds[0]
                    gt_ms = thresholds[1]
                    if lt_ms is not None:
                        self.assertTrue(
                            (end_ms - start_ms) < lt_ms,
                            "expected less than " + str(lt_ms) +
                            "ms response time, got " +
                            str(end_ms - start_ms) + " ms")
                    if gt_ms is not None:
                        self.assertTrue(
                            (end_ms - start_ms) > gt_ms,
                            "expected greater than " + str(gt_ms) +
                            "ms response time, got " +
                            str(end_ms - start_ms) + " ms")
                if delay_ms is not None:
                    time.sleep(delay_ms[1] / 1000.0)

            seq_end_ms = int(round(time.time() * 1000))

            # Only the result of the final request is compared.
            if is_string:
                self.assertEqual(int(result), expected_result)
            else:
                self.assertEqual(result, expected_result)

            if sequence_thresholds is not None:
                lt_ms = sequence_thresholds[0]
                gt_ms = sequence_thresholds[1]
                if lt_ms is not None:
                    self.assertTrue((seq_end_ms - seq_start_ms) < lt_ms,
                                    "sequence expected less than " +
                                    str(lt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
                if gt_ms is not None:
                    self.assertTrue((seq_end_ms - seq_start_ms) > gt_ms,
                                    "sequence expected greater than " +
                                    str(gt_ms) + "ms response time, got " +
                                    str(seq_end_ms - seq_start_ms) + " ms")
        except Exception as ex:
            self.add_deferred_exception(ex)

    if _test_system_shared_memory or _test_cuda_shared_memory:
        shared_memory_ctx.unregister(shm_op_handle)
        if _test_system_shared_memory:
            shm.destroy_shared_memory_region(shm_op_handle)
        elif _test_cuda_shared_memory:
            cudashm.destroy_shared_memory_region(shm_op_handle)
def precreate_register_dynaseq_shape_tensor_regions(
        self, value_list, dtype, i, batch_size=1, tensor_shape=(1, )):
    """Pre-create and register the six regions used by each sequence step.

    Args:
        value_list: Iterable of `(shape_value, value)` pairs, one per step.
        dtype: Dtype of the dummy input; `object` selects the string path.
        i: Index embedded in the region names (for example `ip{i}{j}`).
        batch_size: Number of input and dummy tensors written per region.
        tensor_shape: Shape of every generated tensor.

    Returns:
        A flat list with, for each step, the handles of the input, shape
        input, dummy input, shape output, output and resized output regions
        (in that order), or `[]` when neither shared-memory flag is enabled.
    """
    if not (_test_system_shared_memory or _test_cuda_shared_memory):
        return []

    # `np.object` was only an alias of the builtin `object` and has been
    # removed from NumPy; comparing against `object` is equivalent.
    is_string = (dtype == object)

    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=True)
    shm_region_handles = []
    for j, (shape_value, value) in enumerate(value_list):
        input_list = list()
        shape_input_list = list()
        dummy_input_list = list()

        for b in range(batch_size):
            if is_string:
                dummy_in0 = np.full(tensor_shape, value, dtype=np.int32)
                # Flatten using the dummy tensor's own size; this used to
                # read `in0.size` before `in0` was assigned.
                dummy_in0n = np.array(
                    [str(x) for x in dummy_in0.reshape(dummy_in0.size)],
                    dtype=object)
                dummy_in0 = dummy_in0n.reshape(tensor_shape)
            else:
                dummy_in0 = np.full(tensor_shape, value, dtype=dtype)
            dummy_input_list.append(dummy_in0)
            in0 = np.full(tensor_shape, value, dtype=np.int32)
            input_list.append(in0)

        # Only one shape tensor input per batch
        shape_input_list.append(
            np.full(tensor_shape, shape_value, dtype=np.int32))

        input_list_tmp = (iu._prepend_string_size(input_list)
                          if is_string else input_list)
        input_byte_size = sum(i0.nbytes for i0 in input_list_tmp)
        shape_input_byte_size = sum(i0.nbytes for i0 in shape_input_list)
        dummy_input_byte_size = sum(i0.nbytes for i0 in dummy_input_list)
        shape_output_byte_size = shape_input_byte_size
        output_byte_size = np.dtype(np.int32).itemsize + 2
        resized_output_byte_size = 32 * shape_value

        # create shared memory regions and copy data for input values
        if _test_system_shared_memory:
            shm_ip_handle = shm.create_shared_memory_region(
                'ip{}{}_data'.format(i, j), '/ip{}{}'.format(i, j),
                input_byte_size)
            shm_shape_ip_handle = shm.create_shared_memory_region(
                'shape_ip{}{}_data'.format(i, j),
                '/shape_ip{}{}'.format(i, j), shape_input_byte_size)
            shm_dummy_ip_handle = shm.create_shared_memory_region(
                'dummy_ip{}{}_data'.format(i, j),
                '/dummy_ip{}{}'.format(i, j), dummy_input_byte_size)
            shm_shape_op_handle = shm.create_shared_memory_region(
                'shape_op{}{}_data'.format(i, j),
                '/shape_op{}{}'.format(i, j), shape_output_byte_size)
            shm_op_handle = shm.create_shared_memory_region(
                'op{}{}_data'.format(i, j), '/op{}{}'.format(i, j),
                output_byte_size)
            shm_resized_op_handle = shm.create_shared_memory_region(
                'resized_op{}{}_data'.format(i, j),
                '/resized_op{}{}'.format(i, j), resized_output_byte_size)
            shm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
            shm.set_shared_memory_region(shm_shape_ip_handle,
                                         shape_input_list)
            shm.set_shared_memory_region(shm_dummy_ip_handle,
                                         dummy_input_list)
            shared_memory_ctx.register(shm_ip_handle)
            shared_memory_ctx.register(shm_shape_ip_handle)
            shared_memory_ctx.register(shm_dummy_ip_handle)
            shared_memory_ctx.register(shm_shape_op_handle)
            shared_memory_ctx.register(shm_op_handle)
            shared_memory_ctx.register(shm_resized_op_handle)
        elif _test_cuda_shared_memory:
            shm_ip_handle = cudashm.create_shared_memory_region(
                'ip{}{}_data'.format(i, j), input_byte_size, 0)
            shm_shape_ip_handle = cudashm.create_shared_memory_region(
                'shape_ip{}{}_data'.format(i, j), shape_input_byte_size, 0)
            shm_dummy_ip_handle = cudashm.create_shared_memory_region(
                'dummy_ip{}{}_data'.format(i, j), dummy_input_byte_size, 0)
            shm_shape_op_handle = cudashm.create_shared_memory_region(
                'shape_op{}{}_data'.format(i, j), shape_output_byte_size, 0)
            shm_op_handle = cudashm.create_shared_memory_region(
                'op{}{}_data'.format(i, j), output_byte_size, 0)
            shm_resized_op_handle = cudashm.create_shared_memory_region(
                'resized_op{}{}_data'.format(i, j),
                resized_output_byte_size, 0)
            cudashm.set_shared_memory_region(shm_ip_handle, input_list_tmp)
            cudashm.set_shared_memory_region(shm_shape_ip_handle,
                                             shape_input_list)
            cudashm.set_shared_memory_region(shm_dummy_ip_handle,
                                             dummy_input_list)
            shared_memory_ctx.cuda_register(shm_ip_handle)
            shared_memory_ctx.cuda_register(shm_shape_ip_handle)
            shared_memory_ctx.cuda_register(shm_dummy_ip_handle)
            shared_memory_ctx.cuda_register(shm_shape_op_handle)
            shared_memory_ctx.cuda_register(shm_op_handle)
            shared_memory_ctx.cuda_register(shm_resized_op_handle)
        shm_region_handles.append(shm_ip_handle)
        shm_region_handles.append(shm_shape_ip_handle)
        shm_region_handles.append(shm_dummy_ip_handle)
        shm_region_handles.append(shm_shape_op_handle)
        shm_region_handles.append(shm_op_handle)
        shm_region_handles.append(shm_resized_op_handle)
    return shm_region_handles
def infer_zero(tester,
               pf,
               batch_size,
               tensor_dtype,
               input_shapes,
               output_shapes,
               model_version=None,
               use_http=True,
               use_grpc=True,
               use_streaming=True):
    """Run a "zero" model over each enabled protocol and check the outputs.

    Random input tensors are generated for every input, and each output is
    expected to equal its input reshaped to the matching entry of
    `output_shapes`.

    Args:
        tester: unittest-style object providing the assert* methods.
        pf: Platform string; "libtorch" variants use `INPUT__n`/`OUTPUT__n`.
        batch_size: Batch size of the request.
        tensor_dtype: Tensor dtype; `object` selects the string path.
        input_shapes: One shape per input.
        output_shapes: One shape per output (same length as `input_shapes`).
        model_version: Optional model version to request and verify.
        use_http, use_grpc, use_streaming: Protocols to exercise; at least
            one must be true.

    Returns:
        The results of the last inference that was run.

    Raises:
        ValueError: If both shared memory flags are set.
    """
    tester.assertTrue(use_http or use_grpc or use_streaming)
    configs = []
    if use_http:
        configs.append(("localhost:8000", ProtocolType.HTTP, False))
    if use_grpc:
        configs.append(("localhost:8001", ProtocolType.GRPC, False))
    if use_streaming:
        configs.append(("localhost:8001", ProtocolType.GRPC, True))
    tester.assertEqual(len(input_shapes), len(output_shapes))
    io_cnt = len(input_shapes)

    if TEST_CUDA_SHARED_MEMORY and TEST_SYSTEM_SHARED_MEMORY:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    input_dict = {}
    output_dict = {}
    expected_dict = {}
    shm_ip_handles = list()
    shm_op_handles = list()
    shared_memory_ctx = SharedMemoryControlContext("localhost:8000",
                                                   ProtocolType.HTTP,
                                                   verbose=False)

    for io_num in range(io_cnt):
        if pf == "libtorch" or pf == "libtorch_nobatch":
            input_name = "INPUT__{}".format(io_num)
            output_name = "OUTPUT__{}".format(io_num)
        else:
            input_name = "INPUT{}".format(io_num)
            output_name = "OUTPUT{}".format(io_num)

        input_list = list()
        expected_list = list()
        for b in range(batch_size):
            rtensor_dtype = _range_repr_dtype(tensor_dtype)
            in0 = np.random.randint(low=np.iinfo(rtensor_dtype).min,
                                    high=np.iinfo(rtensor_dtype).max,
                                    size=input_shapes[io_num],
                                    dtype=rtensor_dtype)
            # `np.object` was only an alias of the builtin `object` and has
            # been removed from NumPy; comparing against `object` is
            # equivalent.
            if tensor_dtype != object:
                in0 = in0.astype(tensor_dtype)
                expected0 = np.ndarray.copy(in0)
            else:
                expected0 = np.array([
                    unicode(str(x), encoding='utf-8') for x in in0.flatten()
                ], dtype=object)
                in0 = np.array([str(x) for x in in0.flatten()],
                               dtype=object).reshape(in0.shape)

            expected0 = expected0.reshape(output_shapes[io_num])

            input_list.append(in0)
            expected_list.append(expected0)

        expected_dict[output_name] = expected_list

        input_byte_size = tu.shape_element_count(input_shapes[io_num]) *\
                            np.dtype(tensor_dtype).itemsize * batch_size
        output_byte_size = tu.shape_element_count(output_shapes[io_num]) *\
                            np.dtype(tensor_dtype).itemsize * batch_size

        # create and register shared memory region for inputs and outputs
        # Each flag must drive its own API. The two branches used to be
        # swapped, which also disagreed with the cleanup code below.
        if TEST_CUDA_SHARED_MEMORY:
            shm_ip_handles.append(
                cudashm.create_shared_memory_region(
                    "input" + str(io_num) + "_data", input_byte_size, 0))
            shm_op_handles.append(
                cudashm.create_shared_memory_region(
                    "output" + str(io_num) + "_data", output_byte_size, 0))

            shared_memory_ctx.cuda_register(shm_ip_handles[io_num])
            shared_memory_ctx.cuda_register(shm_op_handles[io_num])

            # copy data into shared memory region for input values
            cudashm.set_shared_memory_region(shm_ip_handles[io_num],
                                             input_list)
        elif TEST_SYSTEM_SHARED_MEMORY:
            shm_ip_handles.append(
                shm.create_shared_memory_region(
                    "input" + str(io_num) + "_data",
                    "/input" + str(io_num), input_byte_size))
            shm_op_handles.append(
                shm.create_shared_memory_region(
                    "output" + str(io_num) + "_data",
                    "/output" + str(io_num), output_byte_size))

            shared_memory_ctx.register(shm_ip_handles[io_num])
            shared_memory_ctx.register(shm_op_handles[io_num])

            # copy data into shared memory region for input values
            shm.set_shared_memory_region(shm_ip_handles[io_num], input_list)

        if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY:
            input_dict[input_name] = shm_ip_handles[io_num]
            output_dict[output_name] = (InferContext.ResultFormat.RAW,
                                        shm_op_handles[io_num])
        else:
            input_dict[input_name] = input_list
            output_dict[output_name] = InferContext.ResultFormat.RAW

    # Run inference and check results for each config
    for config in configs:
        model_name = tu.get_zero_model_name(pf, io_cnt, tensor_dtype)

        ctx = InferContext(config[0],
                           config[1],
                           model_name,
                           model_version,
                           correlation_id=0,
                           streaming=config[2],
                           verbose=True)
        results = ctx.run(input_dict, output_dict, batch_size)

        tester.assertEqual(ctx.get_last_request_model_name(), model_name)
        if model_version is not None:
            tester.assertEqual(ctx.get_last_request_model_version(),
                               model_version)

        tester.assertEqual(len(results), io_cnt)
        for (result_name, result_val) in iteritems(results):
            tester.assertTrue(result_name in output_dict)
            tester.assertTrue(result_name in expected_dict)
            for b in range(batch_size):
                expected = expected_dict[result_name][b]
                tester.assertEqual(result_val[b].shape, expected.shape)
                tester.assertTrue(
                    np.array_equal(result_val[b], expected),
                    "{}, {}, slot {}, expected: {}, got {}".format(
                        model_name, result_name, b, expected, result_val[b]))

    if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY:
        for io_num in range(io_cnt):
            shared_memory_ctx.unregister(shm_ip_handles[io_num])
            shared_memory_ctx.unregister(shm_op_handles[io_num])
            if TEST_CUDA_SHARED_MEMORY:
                cudashm.destroy_shared_memory_region(shm_ip_handles[io_num])
                cudashm.destroy_shared_memory_region(shm_op_handles[io_num])
            else:
                shm.destroy_shared_memory_region(shm_ip_handles[io_num])
                shm.destroy_shared_memory_region(shm_op_handles[io_num])

    return results
def create_register_set_shm_regions(input0_list, input1_list, expected0_list,
                                    expected1_list, outputs, shm_region_names,
                                    precreated_shm_regions):
    """Create, fill and register shared memory regions for two inputs/outputs.

    Which backend is used depends on the module-level flags
    TEST_SYSTEM_SHARED_MEMORY (``shm``) and TEST_CUDA_SHARED_MEMORY
    (``cudashm``). When neither flag is set nothing is created and an empty
    list is returned.

    Args:
        input0_list, input1_list: Items exposing ``.nbytes`` (presumably numpy
            arrays -- confirm against callers). Their summed size determines
            the size of each input region and their contents are copied in.
        expected0_list, expected1_list: Items exposing ``.nbytes``; used only
            to size the OUTPUT0 / OUTPUT1 regions.
        outputs: Container checked with ``in`` for "OUTPUT0" and "OUTPUT1";
            a region is produced only for the names it contains.
        shm_region_names: Base names for the regions, or None to use
            ['input0', 'input1', 'output0', 'output1']. Indices 0 and 1 name
            the inputs; output names are consumed from index 2 onward, one per
            requested output (so if only OUTPUT1 is requested it uses index 2).
        precreated_shm_regions: None to create and register output regions
            here; otherwise a sequence of existing output handles that is
            indexed in the same packed order and used as-is (they are not
            registered by this function).

    Returns:
        A list ``[input0_handle, input1_handle, <output handles...>]`` with
        one output handle per requested output, or ``[]`` when shared memory
        is disabled.

    Raises:
        ValueError: If both shared memory flags are set.
    """
    if TEST_CUDA_SHARED_MEMORY and TEST_SYSTEM_SHARED_MEMORY:
        raise ValueError(
            "Cannot set both System and CUDA shared memory flags to 1")

    # Shared memory disabled: return before building a control context or
    # touching the inputs, since none of that work would be used.
    if not (TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY):
        return []

    shared_memory_ctx = SharedMemoryControlContext(
        "localhost:8000", ProtocolType.HTTP, verbose=False)

    input0_byte_size = sum(i0.nbytes for i0 in input0_list)
    input1_byte_size = sum(i1.nbytes for i1 in input1_list)
    output0_byte_size = sum(e0.nbytes for e0 in expected0_list)
    output1_byte_size = sum(e1.nbytes for e1 in expected1_list)

    if shm_region_names is None:
        shm_region_names = ['input0', 'input1', 'output0', 'output1']

    # Select the backend-specific operations once so the region handling
    # below is written a single time for both system and CUDA shared memory.
    if TEST_SYSTEM_SHARED_MEMORY:

        def create_region(base_name, byte_size):
            return shm.create_shared_memory_region(
                base_name + '_data', '/' + base_name, byte_size)

        fill_region = shm.set_shared_memory_region
        register_region = shared_memory_ctx.register
    else:

        def create_region(base_name, byte_size):
            # The trailing 0 is passed through unchanged from the original
            # call; presumably the CUDA device id -- confirm against cudashm.
            return cudashm.create_shared_memory_region(
                base_name + '_data', byte_size, 0)

        fill_region = cudashm.set_shared_memory_region
        register_region = shared_memory_ctx.cuda_register

    def reregister(handle):
        # Unregister first, then register; presumably this clears a stale
        # registration left under the same name -- confirm.
        shared_memory_ctx.unregister(handle)
        register_region(handle)

    # Inputs: create both, copy the data in, then (re)register both.
    shm_ip0_handle = create_region(shm_region_names[0], input0_byte_size)
    shm_ip1_handle = create_region(shm_region_names[1], input1_byte_size)
    fill_region(shm_ip0_handle, input0_list)
    fill_region(shm_ip1_handle, input1_list)
    reregister(shm_ip0_handle)
    reregister(shm_ip1_handle)
    shm_io_handles = [shm_ip0_handle, shm_ip1_handle]

    # Outputs: `slot` counts the outputs handled so far, so names and
    # precreated handles are packed densely in request order.
    slot = 0
    output_specs = (("OUTPUT0", output0_byte_size),
                    ("OUTPUT1", output1_byte_size))
    for tensor_name, byte_size in output_specs:
        if tensor_name not in outputs:
            continue
        if precreated_shm_regions is None:
            op_handle = create_region(shm_region_names[2 + slot], byte_size)
            reregister(op_handle)
        else:
            op_handle = precreated_shm_regions[slot]
        shm_io_handles.append(op_handle)
        slot += 1

    return shm_io_handles