def __init__(self, engine_file_path):
    # Create a CUDA context on this device
    self.cfx = cuda.Device(0).make_context()
    stream = cuda.Stream()
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    runtime = trt.Runtime(TRT_LOGGER)

    # Deserialize the engine from file
    with open(engine_file_path, "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    host_inputs = []
    cuda_inputs = []
    host_outputs = []
    cuda_outputs = []
    bindings = []

    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(cuda_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)
        else:
            host_outputs.append(host_mem)
            cuda_outputs.append(cuda_mem)

    # Store
    self.stream = stream
    self.context = context
    self.engine = engine
    self.host_inputs = host_inputs
    self.cuda_inputs = cuda_inputs
    self.host_outputs = host_outputs
    self.cuda_outputs = cuda_outputs
    self.bindings = bindings
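# A minimal companion inference sketch for the class above (a hypothetical
# method, not in the original source). The push()/pop() calls are the usual
# reason for creating self.cfx: they make the CUDA context current when
# inference is driven from a different thread. Assumes a single input and a
# single output binding and a preprocessed float array.
def infer(self, input_image):
    self.cfx.push()
    np.copyto(self.host_inputs[0], input_image.ravel())
    cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
    self.context.execute_async(bindings=self.bindings, stream_handle=self.stream.handle)
    cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
    self.stream.synchronize()
    self.cfx.pop()
    return self.host_outputs[0]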
def __init__(self, model):
    print('setting up Yolov5s-simple.trt processor')
    # load tensorrt engine
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    TRTbin = '{0}/models/{1}'.format(os.path.dirname(__file__), model)
    with open(TRTbin, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    self.context = engine.create_execution_context()

    # allocate memory
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append({'host': host_mem, 'device': device_mem})
        else:
            outputs.append({'host': host_mem, 'device': device_mem})

    # save to class
    self.inputs = inputs
    self.outputs = outputs
    self.bindings = bindings
    self.stream = stream

    # post-processing config
    filters = (80 + 5) * 3
    self.output_shapes = [
        (1, 3, 80, 80, 85),
        (1, 3, 40, 40, 85),
        (1, 3, 20, 20, 85),
    ]
    self.strides = np.array([8., 16., 32.])
    anchors = np.array([
        [[10, 13], [16, 30], [33, 23]],
        [[30, 61], [62, 45], [59, 119]],
        [[116, 90], [156, 198], [373, 326]],
    ])
    self.nl = len(anchors)
    self.nc = 80  # classes
    self.no = self.nc + 5  # outputs per anchor
    self.na = len(anchors[0])
    a = anchors.copy().astype(np.float32)
    a = a.reshape(self.nl, -1, 2)
    self.anchors = a.copy()
    self.anchor_grid = a.copy().reshape(self.nl, 1, -1, 1, 1, 2)
def allocate_buffers(engine, profile_id):
    """Allocate device memory for I/O bindings of engine and return them."""
    d_inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()

    if engine.has_implicit_batch_dimension:
        max_batch_size = engine.max_batch_size
    else:
        shape = engine.get_binding_shape(0)
        if -1 in list(shape):
            batch_dim = list(shape).index(-1)
            max_batch_size = engine.get_profile_shape(0, 0)[2][batch_dim]
        else:
            max_batch_size = shape[0]

    nb_bindings_per_profile = engine.num_bindings // engine.num_optimization_profiles
    bindings = [0 for i in range(engine.num_bindings)]
    for binding in range(profile_id * nb_bindings_per_profile,
                         (profile_id + 1) * nb_bindings_per_profile):
        logging.info("Binding {:}".format(binding))
        dtype = engine.get_binding_dtype(binding)
        format = engine.get_binding_format(binding)
        shape = engine.get_binding_shape(binding)
        if format == trt.TensorFormat.CHW4:
            shape[-3] = ((shape[-3] - 1) // 4 + 1) * 4
        elif format == trt.TensorFormat.DHWC8:
            shape[-4] = ((shape[-4] - 1) // 8 + 1) * 8
        if not engine.has_implicit_batch_dimension:
            if -1 in list(shape):
                batch_dim = list(shape).index(-1)
                shape[batch_dim] = max_batch_size
            size = trt.volume(shape)
        else:
            size = trt.volume(shape) * max_batch_size
        # Allocate device buffers
        device_mem = cuda.mem_alloc(size * dtype.itemsize)
        # Append device buffer to device bindings.
        bindings[binding] = int(device_mem)
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            d_inputs.append(device_mem)
        else:
            host_mem = cuda.pagelocked_empty(size, trt.nptype(dtype))
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return d_inputs, outputs, bindings, stream
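# A hedged usage sketch for the profile-based allocator above (names here are
# illustrative assumptions, not from the original source). Inputs come back as
# bare device buffers, so host data is copied in directly from numpy arrays.
def run_profile(engine, context, profile_id, input_arrays):
    d_inputs, outputs, bindings, stream = allocate_buffers(engine, profile_id)
    for d_input, array in zip(d_inputs, input_arrays):
        cuda.memcpy_htod_async(d_input, np.ascontiguousarray(array), stream)
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    return [out.host for out in outputs]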
def __init__(self, model, anchor_nums, nc, anchors, output_shapes, img_size):
    # load tensorrt engine
    self.cfx = cuda.Device(0).make_context()
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    TRTbin = model
    # print('trtbin', TRTbin)
    runtime = trt.Runtime(TRT_LOGGER)
    with open(TRTbin, 'rb') as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    self.context = engine.create_execution_context()

    # allocate memory
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append({'host': host_mem, 'device': device_mem})
        else:
            outputs.append({'host': host_mem, 'device': device_mem})

    # save to class
    self.inputs = inputs
    self.outputs = outputs
    self.bindings = bindings
    self.stream = stream
    self.anchor_nums = anchor_nums
    self.nc = nc  # classes
    self.no = self.nc + 5  # outputs per anchor

    # post-processing config
    self.output_shapes = output_shapes
    self.strides = np.array([8., 16., 32.])
    self.na = len(anchors[0])
    self.nl = len(anchors)
    self.img_size = img_size
    a = anchors.copy().astype(np.float32)
    a = a.reshape(self.nl, -1, 2)
    self.anchors = a.copy()
    self.anchor_grid = a.copy().reshape(self.nl, 1, -1, 1, 1, 2)
def allocate_buffers(cls, engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) \
            * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_memory = cuda.pagelocked_empty(size, dtype)
        device_memory = cuda.mem_alloc(host_memory.nbytes)
        bindings.append(int(device_memory))
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMemory(host_memory, device_memory))
        else:
            outputs.append(HostDeviceMemory(host_memory, device_memory))
    return inputs, outputs, bindings, stream
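# A minimal sketch of the HostDeviceMemory container assumed by the method
# above, mirroring the inline HostDeviceMem definitions that appear later in
# this collection: a plain pairing of a pinned host buffer with its device
# allocation.
class HostDeviceMemory(object):
    def __init__(self, host_memory, device_memory):
        self.host = host_memory
        self.device = device_memory

    def __repr__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)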
def allocate_buffers(engine):
    """Allocates host and device buffers for TRT engine inference.

    This function is similar to the one in ../../common.py, but converts
    network outputs (which are np.float32) appropriately before writing them
    to the Python buffer. This is needed since UFF plugins don't support
    output type description, and in our particular case we use the NMS
    plugin as the network output.

    Args:
        engine (trt.ICudaEngine): uff engine

    Returns:
        inputs [HostDeviceMem]: engine input memory
        outputs [HostDeviceMem]: engine output memory
        bindings [int]: buffer to device bindings
        stream (cuda.Stream): cuda stream for engine inference synchronization
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    # The current NMS implementation in TRT only supports DataType.FLOAT, but
    # that may change in the future, which could break this sample when using
    # lower precision (e.g. the NMS output would no longer be np.float32,
    # even though binding_to_type assumes it is).
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
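# A sketch of the companion do_inference helper from the samples' common.py
# referenced in the docstring above (implicit-batch execution path, with
# HostDeviceMem as used throughout this collection).
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU and wait for completion.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]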
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def build_engine():
    """Takes an ONNX file and creates a TensorRT engine to run inference with."""
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(common.EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    runtime = trt.Runtime(TRT_LOGGER)

    # Parse model file
    print('Loading ONNX file from path {}...'.format(onnx_file_path))
    with open(onnx_file_path, 'rb') as model:
        print('Beginning ONNX file parsing')
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
    print('Completed parsing of ONNX file')

    # Print input info
    print('Network inputs:')
    for i in range(network.num_inputs):
        tensor = network.get_input(i)
        print(tensor.name, trt.nptype(tensor.dtype), tensor.shape)

    network.get_input(0).shape = [10, 1]
    network.get_input(1).shape = [10, 1, 1, 16]
    network.get_input(2).shape = [6, 1]
    network.get_input(3).shape = [6, 1, 1, 16]

    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.REFIT)
    config.max_workspace_size = 1 << 28  # 256 MiB

    print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
    plan = builder.build_serialized_network(network, config)
    engine = runtime.deserialize_cuda_engine(plan)
    print("Completed creating Engine")
    with open(engine_file_path, "wb") as f:
        f.write(plan)
    return engine
def allocate_buffers(self):
    self.inputs = []
    self.outputs = []
    self.bindings = []
    self.stream = cuda.Stream()
    for binding in self.engine:
        size = trt.volume(self.engine.get_binding_shape(binding)) * self.max_batch_size
        dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        self.bindings.append(int(device_mem))
        # Append to the appropriate list.
        if self.engine.binding_is_input(binding):
            self.inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            self.outputs.append(HostDeviceMem(host_mem, device_mem))
def get_layer_weights(self, layer_name):
    """Get the weights of a constant layer as a numpy array.

    :param layer_name: string
    :return: numpy array of weights, or the layer's output tensor for
        non-constant layers
    """
    layer = self._ops[layer_name]
    if isinstance(layer, trt.IConstantLayer):
        return np.reshape(self._ops[layer_name].weights,
                          self._ops_output[layer_name].shape)
    elif isinstance(layer, trt.IIdentityLayer):
        # Identity layers act as casts: fetch the underlying constant's
        # weights and cast them to the identity layer's precision.
        cast_to_dtype = trt.nptype(layer.precision)
        const_layer_name = layer.get_input(0).name
        weights = self.get_layer_weights(const_layer_name)
        return weights.astype(cast_to_dtype)
    else:
        return layer.get_output(0)
def __init__(self, trt_path):
    # get model name
    self._model_name = os.path.basename(trt_path)
    self._model_name = self._model_name[:self._model_name.rfind(".")]

    # create engine
    self.trt_path = trt_path
    self.logger = trt.Logger()
    self.runtime = trt.Runtime(self.logger)
    with open(trt_path, "rb") as f:
        self.engine = self.runtime.deserialize_cuda_engine(f.read())

    # create context and buffers
    self.context = self.engine.create_execution_context()
    self.stream = cuda.Stream()
    bindings = []
    host_input = device_input = host_output = device_output = None
    for binding in self.engine:
        binding_idx = self.engine.get_binding_index(binding)
        print(f"binding name {binding}, idx {binding_idx}")
        # trt.volume returns the element count of the binding's shape.
        size = trt.volume(self.context.get_binding_shape(binding_idx))
        dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        if self.engine.binding_is_input(binding):
            print(size)
            host_input = np.empty(size, dtype=np.float32)
            device_input = cuda.mem_alloc(host_input.nbytes)
            bindings.append(int(device_input))
        else:
            host_output = cuda.pagelocked_empty(size, dtype)
            device_output = cuda.mem_alloc(host_output.nbytes)
            bindings.append(int(device_output))
    assert device_input is not None
    assert device_output is not None
    assert len(bindings) == 2

    self.bindings = bindings
    self.device_input = device_input
    self.host_input = host_input
    self.device_output = device_output
    self.host_output = host_output
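# A hypothetical companion inference method for the single-input wrapper
# above (not in the original source); preprocessing into a flat float32
# array is assumed done by the caller.
def infer(self, flat_input):
    np.copyto(self.host_input, flat_input)
    cuda.memcpy_htod_async(self.device_input, self.host_input, self.stream)
    self.context.execute_async_v2(bindings=self.bindings,
                                  stream_handle=self.stream.handle)
    cuda.memcpy_dtoh_async(self.host_output, self.device_output, self.stream)
    self.stream.synchronize()
    return self.host_output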
def build_engine(onnx_file_path, engine_file_path, calib=None):
    EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        with open(onnx_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        print('Completed parsing of ONNX file')

        print('Network inputs:')
        for i in range(network.num_inputs):
            tensor = network.get_input(i)
            print(tensor.name, trt.nptype(tensor.dtype), tensor.shape)

        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 20
        if calib:
            config.set_flag(trt.BuilderFlag.INT8)
            config.int8_calibrator = calib
        else:
            # Use the config flag rather than the deprecated builder.fp16_mode
            # attribute, which is ignored when building with a builder config.
            config.set_flag(trt.BuilderFlag.FP16)

        print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
        engine = builder.build_engine(network, config)
        print("Completed creating Engine. Writing file to: {}".format(engine_file_path))
        with open(engine_file_path, "wb") as f:
            f.write(engine.serialize())
        return engine
def run(batchSize, nRow, nCol):
    print("test", batchSize, nRow, nCol)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)

    engine = buildEngine(logger, nRow, nCol)
    if engine is None:
        print("Failed building engine!")
        return None
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    stream = cuda.Stream()

    condition = np.array(np.random.randint(0, 2, [batchSize, nRow, nCol]), dtype=np.int32)
    inputX = np.full([batchSize, nRow, nCol], 1, dtype=np.float32)
    inputY = np.full([batchSize, nRow, nCol], -1, dtype=np.float32)
    inputH0 = np.ascontiguousarray(condition.reshape(-1))
    inputH1 = np.ascontiguousarray(inputX.reshape(-1))
    inputH2 = np.ascontiguousarray(inputY.reshape(-1))
    inputD0 = cuda.mem_alloc(inputH0.nbytes)
    inputD1 = cuda.mem_alloc(inputH1.nbytes)
    inputD2 = cuda.mem_alloc(inputH2.nbytes)
    outputH0 = np.empty((batchSize, ) + tuple(engine.get_binding_shape(3)),
                        dtype=trt.nptype(engine.get_binding_dtype(3)))
    outputD0 = cuda.mem_alloc(outputH0.nbytes)

    cuda.memcpy_htod_async(inputD0, inputH0, stream)
    cuda.memcpy_htod_async(inputD1, inputH1, stream)
    cuda.memcpy_htod_async(inputD2, inputH2, stream)
    context.execute_async(batchSize,
                          [int(inputD0), int(inputD1), int(inputD2), int(outputD0)],
                          stream.handle)
    cuda.memcpy_dtoh_async(outputH0, outputD0, stream)
    stream.synchronize()

    outputH0CPU = whereCPU(condition, inputX, inputY)
    print("Check result:", "True" if np.all(outputH0 == outputH0CPU) else "False")
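# A plausible reference implementation of the whereCPU helper used above
# (an assumption; the original is defined elsewhere): element-wise select
# matching the Where plugin's semantics.
def whereCPU(condition, x, y):
    return np.where(condition, x, y)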
def _allocate_buffers(self):
    """NVIDIA sample function to allocate host and device memory for the engine."""
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in self.engine:
        size = (
            trt.volume(self.engine.get_binding_shape(binding))
            * self.engine.max_batch_size
        )
        dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if self.engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def allocate_buffers(self):
    d_inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    # FIXME: Clarify this with Po-han. What if we want a batch size smaller
    # than max_batch_size from the builder?
    # max_batch_size = self.engine.max_batch_size if self.engine.has_implicit_batch_dimension \
    #     else self.engine.get_profile_shape(0, 0)[2][0]
    if self.engine.has_implicit_batch_dimension:
        max_batch_size = min(self.engine.max_batch_size, self.args.batch_size)
    else:
        max_batch_size = self.args.batch_size

    for binding in self.engine:
        logging.info("Binding {:}".format(binding))
        desc = self.engine.get_binding_format_desc(
            self.engine.get_binding_index(binding))
        logging.info(" Binding info {:} with shape {:}".format(
            desc,
            self.engine.get_binding_shape(self.engine.get_binding_index(binding))))
        dtype = self.engine.get_binding_dtype(binding)
        format = self.engine.get_binding_format(
            self.engine.get_binding_index(binding))
        shape = self.engine.get_binding_shape(binding)
        if format == trt.TensorFormat.CHW4:
            shape[-3] = ((shape[-3] - 1) // 4 + 1) * 4
        if not self.engine.has_implicit_batch_dimension:
            shape[0] = max_batch_size
            size = trt.volume(shape)
        else:
            size = trt.volume(shape) * max_batch_size
        # Allocate host and device buffers
        device_mem = cuda.mem_alloc(size * dtype.itemsize)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if self.engine.binding_is_input(binding):
            d_inputs.append(device_mem)
        else:
            host_mem = cuda.pagelocked_empty(size, trt.nptype(dtype))
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return d_inputs, outputs, bindings, stream
def allocate_buffers(engine):
    print(engine.get_binding_shape(0))
    print(engine.get_binding_shape(1))
    print(engine.get_binding_shape(2))
    # print(engine.get_binding_shape(3))
    # Determine dimensions and create page-locked memory buffers
    # (i.e. won't be swapped to disk) to hold host inputs/outputs.
    h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)),
                                    dtype=trt.nptype(ModelData.DTYPE))
    h_output_1 = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)),
                                       dtype=trt.nptype(ModelData.DTYPE))
    h_output_2 = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(2)),
                                       dtype=trt.nptype(ModelData.DTYPE))
    # h_output_3 = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(3)),
    #                                    dtype=trt.nptype(ModelData.DTYPE))
    # Allocate device memory for inputs and outputs.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output_1 = cuda.mem_alloc(h_output_1.nbytes)
    d_output_2 = cuda.mem_alloc(h_output_2.nbytes)
    # d_output_3 = cuda.mem_alloc(h_output_3.nbytes)
    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()
    return h_input, d_input, [h_output_1, h_output_2], [d_output_1, d_output_2], stream
def _init_buffers(self):
    self._stream = cuda.Stream()
    self._input_buffers = []
    self._output_buffers = []
    self._bindings = []
    for binding in self._engine:
        dtype = np.dtype(trt.nptype(self._engine.get_binding_dtype(binding)))
        shape = self._engine.get_binding_shape(binding)
        size = trt.volume(shape) * self._engine.max_batch_size
        device = cuda.mem_alloc(size * dtype.itemsize)
        self._bindings.append(int(device))
        if self._engine.binding_is_input(binding):
            host = None  # will be added during inference
            buffer = MemoryBuffer(dtype, shape, host, device)
            self._input_buffers.append(buffer)
        else:
            host = cuda.pagelocked_empty(size, dtype)
            buffer = MemoryBuffer(dtype, shape, host, device)
            self._output_buffers.append(buffer)
def allocate_buffers(engine, batch_size=None):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()

    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

        def __repr__(self):
            return self.__str__()

    for binding in engine:
        dims = engine.get_binding_shape(binding)
        # Resolve a dynamic batch dimension to the requested batch size.
        if dims[0] == -1:
            assert (batch_size is not None)
            dims[0] = batch_size
        # engine.max_batch_size is the maximum batch size usable for inference.
        size = trt.volume(dims) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Determine whether the binding is an input and sort accordingly.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
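# A hedged usage sketch for the dynamic-batch allocator above (illustrative
# assumptions: binding 0 is the sole input and the data is float32). With
# explicit-batch engines, the context must be told the actual input shape
# before execution.
def infer_dynamic(engine, context, input_array):
    batch = input_array.shape[0]
    inputs, outputs, bindings, stream = allocate_buffers(engine, batch_size=batch)
    context.set_binding_shape(0, input_array.shape)
    np.copyto(inputs[0].host[:input_array.size], input_array.ravel())
    cuda.memcpy_htod_async(inputs[0].device, inputs[0].host, stream)
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    return [out.host for out in outputs]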
def allocate_buffers(engine):
    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            """
            host_mem: cpu memory
            device_mem: gpu memory
            """
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

        def __repr__(self):
            return self.__str__()

    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        # get_binding_shape returns the binding's shape; trt.volume computes
        # the number of elements in that shape.
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # size = trt.volume(engine.get_binding_shape(binding))  # use this line for a fixed-batch-size ONNX model
        # get_binding_dtype returns the binding's data type; trt.nptype maps
        # it to the equivalent numpy dtype.
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)  # page-locked host memory
        device_mem = cuda.mem_alloc(host_mem.nbytes)  # device allocation
        # int(device_mem) is the buffer's device address used in the bindings.
        bindings.append(int(device_mem))
        # append to the appropriate list
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def __init__(self, model_path):
    # load model
    with open(model_path, 'rb') as f:
        with trt.Runtime(trt.Logger(trt.Logger.ERROR)) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())

    # allocate buffers
    self.inputs = list()
    self.outputs = list()
    self.bindings = list()
    self.stream = cuda.Stream()
    for binding in self.engine:
        shape = self.engine.get_binding_shape(binding)
        size = trt.volume(shape) * self.engine.max_batch_size
        dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        self.bindings.append(int(device_mem))
        if self.engine.binding_is_input(binding):
            self.inputs.append([host_mem, device_mem])
        else:
            self.outputs.append([host_mem, device_mem])

    self.context = self.engine.create_execution_context()
def main():
    model_path = "model/trt/conv_3d_2.trt"
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    runtime = trt.Runtime(TRT_LOGGER)
    with open(model_path, "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Report sizes for inputs and outputs.
        if engine.binding_is_input(binding):
            print("input_size: ", size, "dtype: ", dtype)
        else:
            print("output_size: ", size, "dtype: ", dtype)

    inputs, outputs, bindings, stream = common.allocate_buffers(engine)

    length = input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]
    data = np.zeros(length, dtype=np.float32)
    data[:] = 1.0
    inputs[0].host = data.reshape(input_shape)
    print(inputs[0].host[0][0][0][:10])
    outputs[0].host = np.zeros(output_shape, dtype=np.float32)

    trt_outputs = common.do_inference(context, bindings=bindings, inputs=inputs,
                                      outputs=outputs, stream=stream)
    print(trt_outputs[0][0][0][0][:10])

    print("starting...")
    starttime = time.time()
    for i in range(1000):
        trt_outputs = common.do_inference(context, bindings=bindings, inputs=inputs,
                                          outputs=outputs, stream=stream)
    endtime = time.time()
    print(endtime - starttime)
    print(trt_outputs[0][0][0][0][:10])
def allocate_buffer(self):
    bindings = []
    inputs = []
    outputs = []
    for binding in self.engine:
        size = trt.volume(self.engine.get_binding_shape(binding)) * self.engine.max_batch_size
        dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        if self.engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
def get_input_metadata(self):
    inputs = OrderedDict()
    active_profile = self.context.active_optimization_profile
    bindings_per_profile = len(self.engine) // self.engine.num_optimization_profiles
    logging.debug(
        "Total # of Profiles: {:}, Bindings Per Profile: {:}, Active Profile: {:}"
        .format(self.engine.num_optimization_profiles, bindings_per_profile,
                active_profile))

    start_binding = bindings_per_profile * active_profile
    end_binding = start_binding + bindings_per_profile
    logging.info("Start Binding: {:}, End Binding: {:}".format(
        start_binding, end_binding))

    for binding in range(start_binding, end_binding):
        if self.engine.binding_is_input(binding):
            inputs[self.engine[binding]] = (
                trt.nptype(self.engine.get_binding_dtype(binding)),
                list(self.engine.get_binding_shape(binding)),
            )
    return inputs
def allocate_buffers(engine):
    """Allocates all buffers required for an engine, i.e. host/device inputs/outputs.

    Parameters
    ----------
    engine : tensorrt.ICudaEngine
        An ICudaEngine for executing inference on a built network

    Returns
    -------
    list
        All input HostDeviceMem of an engine
    list
        All output HostDeviceMem of an engine
    list
        Device bindings (int addresses of the GPU buffers)
    cuda.Stream
        A stream is a sequence of commands (possibly issued by different
        host threads) that execute in order
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
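# A hedged sketch of the explicit-batch counterpart that typically pairs with
# the allocator above (modeled on the do_inference_v2 helper from the
# TensorRT samples' common.py; assumes the batch dimension is baked into the
# binding shapes).
def do_inference_v2(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference over the explicit-batch execution path.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back and synchronize before reading on the host.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    return [out.host for out in outputs]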
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # Each binding is a string name, e.g. 'data' and 'prob'; the engine is
        # indexable, so engine[0] == 'data' and engine[1] == 'prob'.
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)  # e.g. (784,), the flattened image, used as the host buffer
        device_mem = cuda.mem_alloc(host_mem.nbytes)  # device allocation object; int(device_mem) gives its address
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list (two bindings here: data and prob).
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    # input_output_path = "/home/nsathish/Efficient_object_detection/mmdetection/buffer/ssd_pickle.dat"
    # buffers = [inputs, outputs, bindings]
    # with open(input_output_path, "wb") as f:
    #     pickle.dump(buffers, f, pickle.HIGHEST_PROTOCOL)
    return inputs, outputs, bindings, stream
def allocate_buffers(engine):
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        # get_binding_shape returns the binding's shape; trt.volume computes
        # the number of elements in that shape.
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # size = trt.volume(engine.get_binding_shape(binding))  # use this line for a fixed-batch-size ONNX model
        # get_binding_dtype returns the binding's data type; trt.nptype maps
        # it to the equivalent numpy dtype.
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)  # page-locked host memory
        device_mem = cuda.mem_alloc(host_mem.nbytes)  # device allocation
        # int(device_mem) is the buffer's device address used in the bindings.
        bindings.append(int(device_mem))
        # append to the appropriate list
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def __init__(self, model):
    # load tensorrt engine
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    TRTbin = model
    print('trtbin', TRTbin)
    with open(TRTbin, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    self.context = engine.create_execution_context()

    # allocate memory
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append({'host': host_mem, 'device': device_mem})
        else:
            outputs.append({'host': host_mem, 'device': device_mem})

    # save to class
    self.inputs = inputs
    self.outputs = outputs
    self.bindings = bindings
    self.stream = stream

    # post-processing config
    self.no = 12
    self.output_shapes = [
        (1, 3, 56, 56, self.no),
        (1, 3, 28, 28, self.no),
        (1, 3, 14, 14, self.no),
    ]
    self.names = [
        'angular_leafspot', 'anthracnose_fruit_rot', 'blossom_blight',
        'gray_mold', 'leaf_spot', 'powdery_mildew_fruit', 'powdery_mildew_leaf'
    ]
    self.img_size = 448
def build(self):
    assert os.path.exists(self.engine_file), "Engine file doesn't exist"
    runtime = trt.Runtime(TrtModel.TRT_LOGGER)
    with open(self.engine_file, 'rb') as engine_file:
        self.engine = runtime.deserialize_cuda_engine(engine_file.read())
    if self.engine is None:
        raise RuntimeError('Unable to load the engine file')
    self.context = self.engine.create_execution_context()
    self.stream = cp.cuda.Stream(non_blocking=True)

    # Maximum batch size from the optimization profile's max shape.
    self.max_batch_size = self.engine.get_profile_shape(0, 0)[2][0]
    for binding in self.engine:
        shape = self.engine.get_binding_shape(binding)
        if shape[0] == -1:
            shape = (self.max_batch_size, ) + shape[1:]
        size = trt.volume(shape)
        dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        buffer = HostDeviceMem(size, dtype)
        self.bindings.append(buffer.devptr)
        if self.engine.binding_is_input(binding):
            self.input = buffer
            self.input_shapes.append(self.engine.get_binding_shape(binding))
        else:
            self.outputs.append(buffer)
            self.out_shapes.append(self.engine.get_binding_shape(binding))
            self.out_names.append(binding)
    assert self.input is not None
    self.start = cp.cuda.Event()
    self.end = cp.cuda.Event()
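# A minimal sketch of the (size, dtype)-style HostDeviceMem assumed by the
# build() method above. This is an assumption inferred from how .devptr and
# the CuPy stream are used; the actual class in the source repo may differ.
class HostDeviceMem:
    def __init__(self, size, dtype):
        self.size = size
        self.dtype = dtype
        nbytes = size * np.dtype(dtype).itemsize
        # Pinned host buffer, viewed as a numpy array.
        self._pinned = cp.cuda.alloc_pinned_memory(nbytes)
        self.host = np.frombuffer(self._pinned, dtype, size)
        # Raw device allocation; devptr is what goes into the bindings list.
        self.device = cp.cuda.alloc(nbytes)
        self.devptr = self.device.ptr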
def load_input(img_path, host_buffer):
    print('load input')
    c, h, w = INPUT_SHAPE
    img = cv2.imread(img_path)
    mean = [123.829891747, 127.351147446, 110.256170154]
    # Note: these are reciprocal scales (1/std), applied by multiplication below.
    stdv = [0.016895854, 0.017222115, 0.014714524]
    img = cv2.resize(img, (w, h))  # cv2.resize expects (width, height)
    # HWC -> CHW
    img = np.swapaxes(img, 0, 2)
    img = np.swapaxes(img, 1, 2)
    img = np.array(img, dtype=float)
    mean = np.array(mean, dtype=float)
    stdv = np.array(stdv, dtype=float)
    img[0, :, :] -= mean[0]
    img[1, :, :] -= mean[1]
    img[2, :, :] -= mean[2]
    img[0, :, :] *= stdv[0]
    img[1, :, :] *= stdv[1]
    img[2, :, :] *= stdv[2]
    dtype = trt.nptype(DTYPE)
    img_array = np.asarray(img).astype(dtype).ravel()
    np.copyto(host_buffer, img_array)