def infer(context, input_img, output_size, batch_size):
    # Load engine
    engine = context.get_engine()
    assert engine.get_nb_bindings() == 2

    # Convert input data to float32
    input_img = input_img.astype(np.float32)

    # Create output array to receive data
    # output = np.empty(output_size, dtype=np.float32)
    # Allocate page-locked host memory instead
    output = cuda.pagelocked_empty(output_size, dtype=np.float32)

    # Allocate device memory (itemsize is in bytes)
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()

    # Transfer input data from host (CPU) to device (GPU)
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute the model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back to the host
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream
    stream.synchronize()

    # The engine could be serialized to a file here for later reuse, e.g.
    # trt.utils.write_engine_to_file("/root/tensorrt/tiny-yolo.engine", engine.serialize())
    return output
def detect(engine: trt.ICudaEngine, img: np.ndarray) -> "tuple[np.ndarray, np.ndarray, np.ndarray]":
    """Run network execution on the given image.

    Preprocessing and postprocessing of the image are also done here.

    :param engine: TensorRT engine created from the network weights
    :param img: image to perform detection on
    :return: predictions (bounding boxes, confidences, class ids) in original image coordinates
    """
    with engine.create_execution_context() as context:
        h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
        h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)

        # Preprocess
        preprocess_start = time.time()
        preprocessed_img, resize_ratio, padding_size = preprocess_img(img)
        preprocess_stop = time.time()

        # Copy the input image into the host buffer
        np.copyto(h_input, preprocessed_img.flatten())

        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)

        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()

        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        # Run inference.
        context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream
        stream.synchronize()

        postprocess_start = time.time()
        predictions = postprocess(h_output)
        predictions_in_original_coords = transform_detected_coords_to_original(predictions, resize_ratio, padding_size)
        postprocess_stop = time.time()

        print(f"Preprocessing time: {(preprocess_stop - preprocess_start) * 1000:.4f} ms")
        print(f"Postprocessing time: {(postprocess_stop - postprocess_start) * 1000:.4f} ms")
        print(f"Complete detection time: {(postprocess_stop - preprocess_start) * 1000:.4f} ms")

        return predictions_in_original_coords
def allocate_buffers(engine, melgan_time_step):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        if binding == 'input':
            size = 1 * n_mel_channels * melgan_time_step
        if binding == 'output':
            size = hop_length * melgan_time_step
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
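# Several allocate_buffers variants in this section return HostDeviceMem objects
# without defining the class. A minimal sketch of that helper, modelled on the
# pairing of a page-locked host array with its device allocation used in the
# NVIDIA TensorRT Python samples (the exact class in each project may differ):
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem      # page-locked numpy array on the CPU
        self.device = device_mem  # pycuda.driver.DeviceAllocation on the GPU

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()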
def create_graph(self):
    uff_model = uff.from_tensorflow_frozen_model(
        self.model_file, ['InceptionResnetV2/Logits/Predictions'])
    G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    parser = uffparser.create_uff_parser()
    parser.register_input('input_image', (3, 512, 512), 0)
    parser.register_output('InceptionResnetV2/Logits/Predictions')
    engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser, 1, 1 << 32)
    parser.destroy()
    runtime = trt.infer.create_infer_runtime(G_LOGGER)
    self.context = engine.create_execution_context()
    self.output = np.empty(len(self.id2name), dtype=np.float32)
    # Device memory: batch * H * W * C * sizeof(float32)
    self.d_input = cuda.mem_alloc(1 * 512 * 512 * 3 * 4)
    self.d_output = cuda.mem_alloc(1 * len(self.id2name) * 4)
    self.bindings = [int(self.d_input), int(self.d_output)]
    self.stream = cuda.Stream()
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    binding_to_type = {"data": np.float32, "im_info": np.float32, "bbox_pred": np.float32,
                       "cls_prob": np.float32, "rois": np.float32}
    for binding in engine:
        print("binding:", binding)
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # dtype = trt.nptype(engine.get_binding_dtype(binding))
        dtype = binding_to_type[str(binding)]
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def __init__(self, model_path=None, cuda_ctx=None):
    self._model_path = model_path
    if self._model_path is None:
        print("please set trt model path!")
        exit()
    self.cuda_ctx = cuda_ctx
    if self.cuda_ctx is None:
        self.cuda_ctx = cuda.Device(0).make_context()
    if self.cuda_ctx:
        self.cuda_ctx.push()
    self.trt_logger = trt.Logger(trt.Logger.INFO)
    self._load_plugins()
    self.engine = self._load_engine()
    try:
        self.context = self.engine.create_execution_context()
        self.stream = cuda.Stream()
        (self.host_inputs, self.host_outputs,
         self.cuda_inputs, self.cuda_outputs,
         self.bindings) = self._allocate_buffers()
    except Exception as e:
        raise RuntimeError('failed to allocate CUDA resources') from e
    finally:
        if self.cuda_ctx:
            self.cuda_ctx.pop()
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # Each binding is just a string name such as 'data' or 'prob'; the engine
        # can be indexed, e.g. engine[0] == 'data', engine[1] == 'prob'.
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)  # e.g. shape (784,), the flattened image, used as the host buffer
        device_mem = cuda.mem_alloc(host_mem.nbytes)   # device allocation; int(obj) yields the device address used in bindings
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))  # two entries in total: 'data' and 'prob'
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        print(binding)
        print(size)
        print(dtype)
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def _allocate_buffers(self):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in self.engine:
        size = trt.volume(self.engine.get_binding_shape(
            binding)) * self.engine.max_batch_size
        dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if self.engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def allocate_buffers(engine):
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        # print(binding)                            # the bound input/output name
        # print(engine.get_binding_shape(binding))  # get_binding_shape gives the binding's shape
        # trt.volume computes the number of elements of an iterable shape
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # size = trt.volume(engine.get_binding_shape(binding))  # use this line instead for an ONNX model with a fixed batch size
        # get_binding_dtype returns the binding's data type; trt.nptype maps it to the corresponding numpy dtype
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)  # allocate page-locked host memory
        device_mem = cuda.mem_alloc(host_mem.nbytes)   # allocate device memory
        # print(int(device_mem))  # device address of the binding's buffer
        bindings.append(int(device_mem))
        # Append to the appropriate list
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
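# The buffers returned by these allocate_buffers variants are normally consumed
# by a small driver that copies inputs to the device, launches the engine, and
# copies outputs back. A minimal sketch of such a helper, assuming the
# HostDeviceMem pairs above and an execution context created from the same
# engine; it mirrors the pattern in the TensorRT Python samples, and the name
# do_inference is illustrative rather than taken from any snippet here:
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data from page-locked host memory to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference asynchronously on the same stream.
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU to page-locked host memory.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Wait for all queued work on the stream to finish.
    stream.synchronize()
    # Return only the host arrays; callers reshape them as needed.
    return [out.host for out in outputs]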
def Plan(*args, **kwds):
    mempool = kwds.pop('mempool', None)
    context_obj = kwds.pop('context', None)
    stream_obj = kwds.pop('stream', None)

    if stream_obj is not None:
        device = cuda.Context.get_device()
        wait_for_finish = False
    elif context_obj is not None:
        device = context_obj.get_device()
        wait_for_finish = True
        stream_obj = None
    else:
        device = cuda.Context.get_device()
        stream_obj = cuda.Stream()
        wait_for_finish = True

    if 'wait_for_finish' not in kwds or kwds['wait_for_finish'] is None:
        kwds['wait_for_finish'] = wait_for_finish

    context = Context(device, stream_obj, mempool)
    return FFTPlan(context, *args, **kwds)
def setUp(self):
    # Set which CUDA device to use, and disable kernel caching
    self.gpu_ctx = Common.CUDAContext()

    # Make some host data which we can play with
    self.nx = 3
    self.ny = 5
    self.nx_halo = 1
    self.ny_halo = 2
    self.dataShape = (self.ny + 2 * self.ny_halo, self.nx + 2 * self.nx_halo)

    self.buf1 = np.zeros(self.dataShape, dtype=np.float32, order='C')
    self.dbuf1 = np.zeros(self.dataShape)
    self.buf3 = np.zeros(self.dataShape, dtype=np.float32, order='C')
    self.dbuf3 = np.zeros(self.dataShape)
    for j in range(self.dataShape[0]):
        for i in range(self.dataShape[1]):
            self.buf1[j, i] = i * 100 + j
            self.dbuf1[j, i] = self.buf1[j, i]
            self.buf3[j, i] = j * 1000 - i
            self.dbuf3[j, i] = self.buf3[j, i]

    self.explicit_free = False
    self.device_name = self.gpu_ctx.cuda_device.name()
    self.gpu_stream = cuda.Stream()
    self.tests_failed = True

    self.cudaarray = Common.CUDAArray2D(self.gpu_stream,
                                        self.nx, self.ny,
                                        self.nx_halo, self.ny_halo,
                                        self.buf1)
    self.double_cudaarray = None
def __init__(self, model):
    # Load the TensorRT engine
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    TRTbin = model
    print('trtbin', TRTbin)
    with open(TRTbin, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    self.context = engine.create_execution_context()

    # Allocate memory
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append({'host': host_mem, 'device': device_mem})
        else:
            outputs.append({'host': host_mem, 'device': device_mem})

    # Save to the instance
    self.inputs = inputs
    self.outputs = outputs
    self.bindings = bindings
    self.stream = stream

    self.no = 12
    self.output_shapes = [
        (1, 3, 56, 56, self.no),
        (1, 3, 28, 28, self.no),
        (1, 3, 14, 14, self.no),
    ]
    self.names = [
        'angular_leafspot', 'anthracnose_fruit_rot', 'blossom_blight',
        'gray_mold', 'leaf_spot', 'powdery_mildew_fruit', 'powdery_mildew_leaf'
    ]
    self.img_size = 448
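# The constructor above only sets up buffers; the detection call itself is not
# shown. A possible infer method under the same assumptions (dict-based
# host/device buffers, an explicit-batch engine driven with execute_async_v2,
# and the output_shapes defined in __init__). The method name and the expected
# preprocessed input layout are assumptions, not taken from the snippet:
def infer(self, img):
    # img is expected to be preprocessed to (1, 3, self.img_size, self.img_size), float32.
    np.copyto(self.inputs[0]['host'], img.ravel())
    # Transfer inputs to the device.
    for inp in self.inputs:
        cuda.memcpy_htod_async(inp['device'], inp['host'], self.stream)
    # Run inference on the stream.
    self.context.execute_async_v2(bindings=self.bindings,
                                  stream_handle=self.stream.handle)
    # Copy outputs back to the host and wait for the stream to finish.
    for out in self.outputs:
        cuda.memcpy_dtoh_async(out['host'], out['device'], self.stream)
    self.stream.synchronize()
    # Reshape the flat output buffers into the detection head shapes.
    return [out['host'].reshape(shape)
            for out, shape in zip(self.outputs, self.output_shapes)]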
def _init_thread_memory(self, dev_id: int, ctx: cuda.Context, alloc_size: int) -> None:
    '''Single thread that initializes the memory of all the streams for a single GPU.'''
    ctx.push()
    size_per_batch = np.int32(np.ceil(alloc_size / self.num_stream))

    # Initialize streams
    for i in range(self.num_stream):
        self.streams[dev_id].append(cuda.Stream())

    for i in range(0, self.num_stream, 1):
        # Allocate memory on the device
        self.moments_device[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 10)))
        self.w_device[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))
        self.x_device[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))
        self.y_device[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))
        # Set host memory for returned output
        self.c_moments[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 7)))
        self.mu[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
        self.yf[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))

        self.m1[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 5)))
        self.float_value_set[dev_id](self.m1[dev_id][i], np.float32(0), size_per_batch, size_per_batch,
                                     block=self.block_size, grid=self.grid_size,
                                     stream=self.streams[dev_id][i])
        self.float_value_set[dev_id](self.m1[dev_id][i], np.float32(1), size_per_batch, np.int32(0),
                                     block=self.block_size, grid=self.grid_size,
                                     stream=self.streams[dev_id][i])

        self.x1[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
        self.w1[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
        self.x2[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))
        self.w2[dev_id].append(cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))

    ctx.synchronize()
    ctx.pop()
def do_inference_overlap(context, inp):
    '''Do inference using a TRT engine and time it

    Execution and device-to-host copy are completed asynchronously
    '''
    # Typical Python-TRT code used in samples would copy input data from host to device.
    # Because the PyTorch Tensor is already on the device, such a copy is unneeded.
    t0 = time.perf_counter()

    # Create output buffers and stream
    stream = cuda.Stream()
    outputs, bindings, out_shape = trtutils.allocate_buffers_with_existing_inputs(context, inp)
    t01 = time.perf_counter()

    t1 = time.perf_counter()
    # Run inference and transfer outputs to host asynchronously
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    t2 = time.perf_counter()

    copyto = t1 - t0
    inference = t2 - t1

    outputs[0].device.free()
    out = perfutils.torchify_trt_out(outputs[0].host, out_shape)
    return out, t2 - t1
def infer(context, input_img, output_size, batch_size):
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)

    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Create host buffer to receive data
    output = np.empty(output_size, dtype=np.float32)

    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()

    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize threads
    stream.synchronize()

    # Return predictions
    return output
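# A rough call site for this infer() helper. The names load_normalized_image
# and OUTPUT_SIZE are placeholders for the caller's own preprocessing and the
# size of the network's output binding; they are not part of the snippet above.
img = load_normalized_image("test.ppm")           # CHW float array matching the input binding
prediction = infer(context, img, OUTPUT_SIZE, 1)  # one image per call
print("Predicted class:", np.argmax(prediction))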
def main():
    # Initialize TensorRT engine and parse ONNX model
    engine, context = build_engine(ONNX_FILE_PATH)

    # Get sizes of input and output and allocate memory required for input data and for output data
    for binding in engine:
        if engine.binding_is_input(binding):  # we expect only one input
            input_shape = engine.get_binding_shape(binding)
            input_size = (
                trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize
            )  # in bytes
            device_input = cuda.mem_alloc(input_size)
        else:  # and one output
            output_shape = engine.get_binding_shape(binding)
            # Create page-locked memory buffers (i.e. won't be swapped to disk)
            host_output = cuda.pagelocked_empty(
                trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32
            )
            device_output = cuda.mem_alloc(host_output.nbytes)

    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()

    # Preprocess input data
    host_input = np.array(
        preprocess_image("turkish_coffee.jpg").numpy(), dtype=np.float32, order="C"
    )
    cuda.memcpy_htod_async(device_input, host_input, stream)

    # Run inference and copy predictions back to the host
    context.execute_async(bindings=[int(device_input), int(device_output)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_output, device_output, stream)
    stream.synchronize()

    # Postprocess results
    output_data = torch.Tensor(host_output).reshape(
        engine.max_batch_size, output_shape[0]
    )
    postprocess(output_data)
def __init__(self, context: trt.IExecutionContext, stream=None, device=None,
             cuda_device=None, cuda_context=None):
    self.engine = context.engine
    if device is None:
        self.torch_device = torch.device("cuda:0")
    else:
        self.torch_device = device
    inputs, outputs, bindings = allocate_buffers_torch(
        self.engine, self.torch_device)
    self.context = context
    self.inputs = inputs
    self.outputs = outputs
    self.bindings = bindings
    self.input_dict = {mem.name: mem for mem in inputs}
    self.output_dict = {mem.name: mem for mem in outputs}
    if stream is None:
        self.stream = cuda.Stream()
    else:
        # Use the caller-supplied stream instead of discarding it
        self.stream = stream
    self._batch_size = None
    self.cuda_device = cuda_device
    self.cuda_context = cuda_context
def main(args):
    model = torch.load(args.model_path)

    # Prepare input
    image = np.zeros((3, 64, 128), dtype=np.float32)
    flat_image = image.ravel()
    tensorrt_out = np.empty(2, dtype=np.float32)

    # Prepare torch version
    model.eval()
    image_shape = tuple([1] + list(image.shape))
    torch_image = np.reshape(image, image_shape)
    batch = Variable(torch.from_numpy(torch_image).cuda() / 255.0)
    torch_out = model.forward(batch, None).cpu().data.numpy()[0]
    print("Torch Predictions:", torch_out)

    weights = model.state_dict()
    engine = build_engine(weights)
    context = engine.create_execution_context()

    d_input = cuda.mem_alloc(1 * flat_image.size * flat_image.dtype.itemsize)
    d_output = cuda.mem_alloc(1 * tensorrt_out.size * tensorrt_out.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()

    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, flat_image, stream)
    # Execute model
    context.enqueue(1, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(tensorrt_out, d_output, stream)
    # Synchronize threads
    stream.synchronize()

    print("TensorRT Predictions: ", tensorrt_out)
def load_engine(self, model):
    # Load the TensorRT engine
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    with open(model, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    self.context = engine.create_execution_context()

    # Allocate memory
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append({'host': host_mem, 'device': device_mem})
        else:
            outputs.append({'host': host_mem, 'device': device_mem})

    # Save to the instance
    self.inputs = inputs
    self.outputs = outputs
    self.bindings = bindings
    self.stream = stream
def __init__(self, cuda_engine, device_id=0):
    """Create an ``Engine``.

    Parameters
    ----------
    cuda_engine : tensorrt.ICudaEngine
        The built cuda engine.
    device_id : int, optional, default=0
        The index of executing device.

    """
    # Create executing resources.
    self._cuda_engine = cuda_engine
    self._device_id = device_id
    self._context = cuda_engine.create_execution_context()
    self._stream = driver.Stream(0)

    # Create bindings.
    num_binding = self._cuda_engine.num_bindings
    self._bindings = [Binding(cuda_engine, self._context, i, device_id)
                      for i in range(num_binding)]
    self._inputs = [b for b in self._bindings if b.is_input]
    self._outputs = [b for b in self._bindings if not b.is_input]

    # Report the engine info.
    logging.info('TensorRT engine built.')
    binding_info = 'InputInfo: {\n'
    for b in self._inputs:
        binding_info += ' * Binding("{}", shape={}, dtype={})\n' \
            .format(b.name, b.shape, b.dtype)
    logging.info(binding_info + '}')
    binding_info = 'OutputInfo: {\n'
    for b in self._outputs:
        binding_info += ' * Binding("{}", shape={}, dtype={})\n' \
            .format(b.name, b.shape, b.dtype)
    logging.info(binding_info + '}')
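# Sketch of the Binding wrapper assumed by the Engine constructor above. Its
# attribute set (name, shape, dtype, is_input, a device buffer) is inferred
# only from how the constructor uses it; the real class in that project may
# differ. Imports (trt, np, driver) follow the surrounding snippet.
class Binding(object):
    def __init__(self, cuda_engine, context, idx, device_id=0):
        self.index = idx
        self.name = cuda_engine.get_binding_name(idx)
        self.is_input = cuda_engine.binding_is_input(idx)
        self.dtype = trt.nptype(cuda_engine.get_binding_dtype(idx))
        self.shape = tuple(context.get_binding_shape(idx))
        self.device_id = device_id
        # One device buffer per binding, sized from shape and dtype.
        nbytes = int(np.prod(self.shape)) * np.dtype(self.dtype).itemsize
        self.device_buffer = driver.mem_alloc(nbytes)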
def image_cuda(grids):
    """ Run 2d FFT to image each plane of grid array """
    from pyfft.cuda import Plan
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as cuda

    nints, npixx, npixy = grids.shape

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((npixx, npixy), stream=stream)
    grid_gpu = gpuarray.to_gpu(grids)
    for i in range(0, nints):
        plan.execute(grid_gpu[i], inverse=True)
    grids = grid_gpu.get()

    context.pop()

    return recenter(grids.real, (npixx//2, npixy//2))
def do_inference(engine, input):
    with engine.create_execution_context() as context:
        # h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=trt.nptype(data_type))
        # h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=trt.nptype(data_type))
        h_input = cuda.pagelocked_empty(trt.volume(
            context.get_binding_shape(0)), dtype=np.float32)
        h_output = cuda.pagelocked_empty(trt.volume(
            context.get_binding_shape(1)), dtype=np.float32)
        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()

        input = np.array(input, order='C').ravel()
        np.copyto(h_input, input)

        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        # Show time spent in each layer
        # context.profiler = trt.Profiler()
        # Run inference.
        context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream
        stream.synchronize()
        # Return the host output.
        output = h_output.reshape((1, 3, 512, 512))
        return torch.tensor(output)
def do_inference(inf_context, inf_host_in, inf_host_out):
    """
    Perform inference using the CUDA context
    :param inf_context: context created by engine
    :param inf_host_in: input from the host
    :param inf_host_out: output to save on host
    :return:
    """
    inference_engine = inf_context.engine
    # Input and output bindings are required for inference
    assert inference_engine.num_bindings == 2

    # Allocate memory in GPU using CUDA bindings
    device_in = cuda.mem_alloc(inf_host_in.nbytes)
    device_out = cuda.mem_alloc(inf_host_out.nbytes)

    # Create bindings for input and output
    bindings = [int(device_in), int(device_out)]

    # Create CUDA stream for simultaneous CUDA operations
    stream = cuda.Stream()

    # Copy input from host (CPU) to device (GPU) in stream
    cuda.memcpy_htod_async(device_in, inf_host_in, stream)

    # Execute inference using context provided by engine
    inf_context.execute_async(bindings=bindings, stream_handle=stream.handle)

    # Copy output back from device (GPU) to host (CPU)
    cuda.memcpy_dtoh_async(inf_host_out, device_out, stream)

    # Synchronize the stream to prevent issues
    # (block CUDA and wait for CUDA operations to be completed)
    stream.synchronize()
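# A sketch of how this do_inference could be driven, assuming page-locked host
# buffers sized from the engine bindings. The engine variable and the
# preprocess helper are placeholders supplied by the caller, not part of the
# function above.
with engine.create_execution_context() as context:
    host_in = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)),
                                    dtype=np.float32)
    host_out = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)),
                                     dtype=np.float32)
    np.copyto(host_in, preprocess("sample.jpg").ravel())
    do_inference(context, host_in, host_out)
    print("Raw network output:", host_out)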
def __init__(self, engine_path):
    """TensorRT engine model dynamic inference

    Args:
        engine_path (trt.tensorrt.ICudaEngine)
    """
    super(TRTModel, self).__init__()
    # cfx: restriction required when the engine is used from multiple threads
    self.cfx = pycuda.autoinit.context
    self.engine_path = engine_path
    self.logger = trt.Logger(getattr(trt.Logger, 'ERROR'))
    # Load the engine from engine_path
    self.engine = self.load_engine()
    self.stream = cuda.Stream()
    # Default profile index is 0
    self.profile_index = 0
    # Create a context for the cuda engine
    self.context = self.engine.create_execution_context()
    self.batch_size_ranges = []
    # Get the binding indices of the input/output device buffers
    self.input_binding_idxs, self.output_binding_idxs = self._get_binding_idxs()
    # Get network input/output names
    self.input_names, self.output_names = self.get_input_output_name()
img, label = MNIST_DATASETS.test.next_batch(1)
img = img[0]
# Convert input data to float32
img = img.astype(np.float32)
label = label[0]

# Runtime context
runtime = trt.infer.create_infer_runtime(G_LOGGER)
context = engine.create_execution_context()

output = np.empty(10, dtype=np.float32)

# Allocate device memory
d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize)
d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize)

bindings = [int(d_input), int(d_output)]
stream = cuda.Stream()

# Transfer input data to device
cuda.memcpy_htod_async(d_input, img, stream)
# Execute model
context.enqueue(1, bindings, stream.handle, None)
# Transfer predictions back
cuda.memcpy_dtoh_async(output, d_output, stream)
# Synchronize threads
stream.synchronize()

print("Test Case: " + str(label))
print("Prediction: " + str(np.argmax(output)))

# The engine can be serialized and reloaded later, e.g.:
# trt.utils.write_engine_to_file("./tf_mnist.engine", engine.serialize())
# new_engine = trt.utils.load_engine(G_LOGGER, "./tf_mnist.engine")
def main(): parser = argparse.ArgumentParser(description='BERT Inference Benchmark') parser.add_argument("-e", "--engine", help='Path to BERT TensorRT engine') parser.add_argument( '-b', '--batch-size', default=[], action="append", help= 'Batch size(s) to benchmark. Can be specified multiple times for more than one batch size. This script assumes that the engine has been built with one optimization profile for each batch size, and that these profiles are in order of increasing batch size.', type=int) parser.add_argument('-s', '--sequence-length', default=128, help='Sequence length of the BERT model', type=int) parser.add_argument( '-i', '--iterations', default=200, help='Number of iterations to run when benchmarking each batch size.', type=int) parser.add_argument( '-w', '--warm-up-runs', default=10, help='Number of iterations to run prior to benchmarking.', type=int) parser.add_argument('-r', '--random-seed', required=False, default=12345, help='Random seed.', type=int) args, _ = parser.parse_known_args() args.batch_size = args.batch_size or [1] # Import necessary plugins for BERT TensorRT ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL) with open(args.engine, 'rb') as f, trt.Runtime( TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine(f.read( )) as engine, engine.create_execution_context() as context: # Allocate buffers large enough to store the largest batch size max_input_shape = (max(args.batch_size), args.sequence_length) max_output_shape = (max(args.batch_size), args.sequence_length, 2, 1, 1) buffers = [ DeviceBuffer(max_input_shape), DeviceBuffer(max_input_shape), DeviceBuffer(max_input_shape), DeviceBuffer(max_output_shape) ] # Prepare random input pseudo_vocab_size = 30522 pseudo_type_vocab_size = 2 np.random.seed(args.random_seed) test_word_ids = np.random.randint( 0, pseudo_vocab_size, (max(args.batch_size), args.sequence_length), dtype=np.int32) test_segment_ids = np.random.randint( 0, pseudo_type_vocab_size, (max(args.batch_size), args.sequence_length), dtype=np.int32) test_input_mask = np.ones((max(args.batch_size), args.sequence_length), dtype=np.int32) # Copy input h2d cuda.memcpy_htod(buffers[0].buf, test_word_ids.ravel()) cuda.memcpy_htod(buffers[1].buf, test_segment_ids.ravel()) cuda.memcpy_htod(buffers[2].buf, test_input_mask.ravel()) num_binding_per_profile = engine.num_bindings // engine.num_optimization_profiles bench_times = {} for idx, batch_size in enumerate(sorted(args.batch_size)): context.active_optimization_profile = idx # Each profile has unique bindings binding_idx_offset = idx * num_binding_per_profile bindings = [0] * binding_idx_offset + [ buf.binding() for buf in buffers ] shapes = { "input_ids": (batch_size, args.sequence_length), "segment_ids": (batch_size, args.sequence_length), "input_mask": (batch_size, args.sequence_length), } for binding, shape in shapes.items(): context.set_binding_shape(engine[binding] + binding_idx_offset, shape) assert context.all_binding_shapes_specified # Inference total_time = 0 start = cuda.Event() end = cuda.Event() stream = cuda.Stream() # Warmup for _ in range(args.warm_up_runs): context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) stream.synchronize() # Timing loop times = [] for _ in range(args.iterations): start.record(stream) context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) end.record(stream) stream.synchronize() times.append(end.time_since(start)) # Compute average time, 95th percentile time and 99th percentile time. 
bench_times[batch_size] = times [b.free() for b in buffers] for batch_size, times in bench_times.items(): total_time = sum(times) avg_time = total_time / float(len(times)) times.sort() percentile95 = times[int(len(times) * 0.95)] percentile99 = times[int(len(times) * 0.99)] print( "Running {:} iterations with Batch Size: {:}\n\tTotal Time: {:} ms \tAverage Time: {:} ms\t95th Percentile Time: {:} ms\t99th Percentile Time: {:}" .format(args.iterations, batch_size, total_time, avg_time, percentile95, percentile99))
def _thread(pid, tid, cuda_context, cuda_kernel, dispatcher, temp_storage, total_edge_count, log_lock, merge_lock, exit_signal, exit_state): try: with log_lock: logging.debug('Clustering subprocess {} thread {} started.'.format( pid, tid)) cuda_context.push() ref_block_height, ref_block_width = block_dimensions edg_path = Path(temp_storage, 'edg') dps_path = Path(temp_storage, 'dps') ranked_spectra = session.ranked_spectra cuda_stream = drv.Stream() allocation_size_divisor = allocation_size_initial_divisor allocation_size = int(ref_block_height * ref_block_width / allocation_size_divisor) reallocated = False with log_lock: logging.debug( 'Clustering subprocess {} thread {}: Allocating host and device memory.' .format(pid, tid)) # allocate host pagelocked memory # input plm_precursor_mass = drv.pagelocked_empty( ref_block_height + ref_block_width, dtype=CG_PRECURSOR_MASS_DATA_TYPE) plm_mz = drv.pagelocked_empty( (ref_block_height + ref_block_width, num_of_peaks), dtype=CG_MZ_DATA_TYPE) plm_intensity = drv.pagelocked_empty( (ref_block_height + ref_block_width, num_of_peaks), dtype=CG_INTENSITY_DATA_TYPE) plm_block_dimensions = drv.pagelocked_empty( 2, dtype=CG_BLOCK_DIMENSIONS_DATA_TYPE) plm_offset = drv.pagelocked_empty(2, dtype=CG_OFFSET_DATA_TYPE) plm_allocation_size = drv.pagelocked_empty( 1, dtype=CG_ALLOCATION_SIZE_DATA_TYPE) # output plm_counter = drv.pagelocked_empty(1, dtype=CG_COUNTER_DATA_TYPE) plm_edge = drv.pagelocked_empty((allocation_size, 2), dtype=CG_EDGE_DATA_TYPE) plm_dot_product = drv.pagelocked_empty(allocation_size, dtype=CG_DOT_PRODUCT_DATA_TYPE) plm_overflowed = drv.pagelocked_empty(1, dtype=CG_OVERFLOWED_DATA_TYPE) # allocate device memory # input dvp_precursor_mass = drv.mem_alloc_like(plm_precursor_mass) dvp_mz = drv.mem_alloc_like(plm_mz) dvp_intensity = drv.mem_alloc_like(plm_intensity) dvp_block_dimensions = drv.mem_alloc_like(plm_block_dimensions) dvp_offset = drv.mem_alloc_like(plm_offset) dvp_allocation_size = drv.mem_alloc_like(plm_allocation_size) # output dvp_counter = drv.mem_alloc_like(plm_counter) dvp_edge = drv.mem_alloc_like(plm_edge) dvp_dot_product = drv.mem_alloc_like(plm_dot_product) dvp_overflowed = drv.mem_alloc_like(plm_overflowed) with log_lock: logging.debug( 'Clustering subprocess {} thread {}: Start iterating dispatcher.' .format(pid, tid)) previous_row_id = -1 dispatcher.connect(pid, tid) # iterate dispatcher to get blocks for row_id, column_id, block in dispatcher.iterate(pid, tid): if exit_signal.value: with log_lock: logging.debug( 'Subprocess {} thread {}: Received exit signal, exits now.' 
.format(pid, tid)) break try: y_range, x_range = block block_height = y_range[1] - y_range[0] block_width = x_range[1] - x_range[0] if row_id != previous_row_id: with log_lock: logging.debug( '\033[92mSubprocess {} thread {}: Processing row {} (y:{}->{}).\033[0m' .format(pid, tid, row_id, *y_range)) previous_row_id = row_id # get necessary data plm_precursor_mass[: block_height] = ranked_spectra.precursor_mass[ y_range[0]:y_range[1]] plm_precursor_mass[ block_height:block_height + block_width] = ranked_spectra.precursor_mass[ x_range[0]:x_range[1]] plm_mz[:block_height] = ranked_spectra.mz[ y_range[0]:y_range[1]] plm_mz[block_height:block_height + block_width] = ranked_spectra.mz[x_range[0]:x_range[1]] plm_intensity[:block_height] = ranked_spectra.intensity[ y_range[0]:y_range[1]] plm_intensity[block_height:block_height + block_width] = ranked_spectra.intensity[ x_range[0]:x_range[1]] plm_block_dimensions[:] = (block_height, block_width) plm_offset[:] = (y_range[0], x_range[0]) # upload data drv.memcpy_htod_async(dvp_precursor_mass, plm_precursor_mass, cuda_stream) drv.memcpy_htod_async(dvp_mz, plm_mz, cuda_stream) drv.memcpy_htod_async(dvp_intensity, plm_intensity, cuda_stream) drv.memcpy_htod_async(dvp_block_dimensions, plm_block_dimensions, cuda_stream) drv.memcpy_htod_async(dvp_offset, plm_offset, cuda_stream) if reallocated: allocation_size_divisor = allocation_size_initial_divisor allocation_size = int(ref_block_height * ref_block_width / allocation_size_divisor) # reallocate host pagelocked memory del plm_edge del plm_dot_product plm_edge = drv.pagelocked_empty((allocation_size, 2), dtype=CG_EDGE_DATA_TYPE) plm_dot_product = drv.pagelocked_empty( allocation_size, dtype=CG_DOT_PRODUCT_DATA_TYPE) # reallocate device memory del dvp_edge del dvp_dot_product dvp_edge = drv.mem_alloc_like(plm_edge) dvp_dot_product = drv.mem_alloc_like(plm_dot_product) with log_lock: logging.debug( '\033[92mSubprocess {} thread {}: Reset memory allocation size divisor to {}.\033[0m' .format(pid, tid, allocation_size_divisor)) reallocated = False cublockdim = (cuda_block_dimensions[1], cuda_block_dimensions[0], 1) cugriddim = (math.ceil(block_width / cuda_block_dimensions[1]), math.ceil(block_height / cuda_block_dimensions[0])) while True: plm_allocation_size[0] = allocation_size plm_counter[0] = 0 plm_overflowed[0] = False drv.memcpy_htod_async(dvp_allocation_size, plm_allocation_size, cuda_stream) drv.memcpy_htod_async(dvp_counter, plm_counter, cuda_stream) drv.memcpy_htod_async(dvp_overflowed, plm_overflowed, cuda_stream) cuda_kernel.prepared_async_call( cugriddim, cublockdim, cuda_stream, dvp_precursor_mass, dvp_mz, dvp_intensity, dvp_block_dimensions, dvp_offset, dvp_allocation_size, dvp_counter, dvp_edge, dvp_dot_product, dvp_overflowed) # transfer computation result from device to host drv.memcpy_dtoh_async(plm_edge, dvp_edge, cuda_stream) drv.memcpy_dtoh_async(plm_counter, dvp_counter, cuda_stream) drv.memcpy_dtoh_async(plm_overflowed, dvp_overflowed, cuda_stream) drv.memcpy_dtoh_async(plm_dot_product, dvp_dot_product, cuda_stream) cuda_stream.synchronize() if plm_overflowed[0]: allocation_size_divisor = int(allocation_size_divisor / 2) if allocation_size_divisor < 1: err_msg = ( '\nSubprocess {} thread {}: Allocation size divisor reached to the impossible value of {}.' 
.format(pid, tid, allocation_size_divisor)) with log_lock: logging.error(err_msg) raise Exception(err_msg) with log_lock: logging.debug( '\033[92mSubprocess {} thread {}: Edge list overflowed, ' 'decreases allocation size divisor to {}.\033[0m' .format(pid, tid, allocation_size_divisor)) allocation_size = int(block_width * block_height / allocation_size_divisor) # reallocate host pagelocked memory del plm_edge del plm_dot_product plm_edge = drv.pagelocked_empty( (allocation_size, 2), dtype=CG_EDGE_DATA_TYPE) plm_dot_product = drv.pagelocked_empty( allocation_size, dtype=CG_DOT_PRODUCT_DATA_TYPE) # reallocate device memory del dvp_edge del dvp_dot_product dvp_edge = drv.mem_alloc_like(plm_edge) dvp_dot_product = drv.mem_alloc_like(plm_dot_product) reallocated = True continue else: break if abs(plm_precursor_mass[block_height - 1] - plm_precursor_mass[block_height + block_width - 1]) > precursor_tolerance: dispatcher.next_row(pid, tid) with merge_lock: edge_list_size = int(plm_counter[0]) if edge_list_size != 0: total_edge_count.value += edge_list_size edg = np.memmap(str(edg_path), dtype=CG_EDGE_DATA_TYPE, mode='r+', shape=(total_edge_count.value, 2)) dps = np.memmap(str(dps_path), dtype=CG_DOT_PRODUCT_DATA_TYPE, mode='r+', shape=total_edge_count.value) edg[-edge_list_size:] = plm_edge[:edge_list_size] dps[-edge_list_size:] = plm_dot_product[: edge_list_size] except Exception: err_msg = '\nSubprocess {} thread {}: Failed to clustering block (y:{}->{}, x:{}->{}).' \ .format(pid, tid, y_range[0], y_range[1], x_range[0], x_range[1]) with log_lock: logging.error(err_msg) raise with log_lock: if not exit_signal.value: logging.debug( 'Subprocess {} thread {}: Reached the end of iteration, work done.' .format(pid, tid)) cuda_context.pop() except (Exception, KeyboardInterrupt) as e: if type(e) is KeyboardInterrupt: with log_lock: logging.debug( 'Subprocess {} thread {}: Received KeyboardInterrupt, exits now.' .format(pid, tid)) else: with log_lock: logging.exception( '\nSubprocess {} thread {}: Ended unexpectedly. Logging traceback:\n' '==========TRACEBACK==========\n'.format(pid, tid)) exit_signal.value = True exit_state.value = 1 cuda_context.pop() return
def pycuda_multi_kernel(img, k_harris, thresh, executions): """ Finds and returns list of corners :param img: grayscale image :param k: Harris corner constant. Usually 0.04 - 0.06 :param thresh: The threshold above which a corner is counted :param executions: Number of times to be executed :return: corner_list: List with corners :return: average_execution_time: Average execution time in seconds """ # only for 256 by 512 images assert img.shape[0] == 256 # height assert img.shape[1] == 512 # width height = img.shape[0] width = img.shape[1] vector_size = img.shape[0] * img.shape[1] corner_list = [] offset = 2 # to fit still in a 32-bit integer thresh = int(thresh / 10) # function template func_mod_template = Template(""" #include<stdio.h> #define INDEX(a, b) a*${HEIGHT}+b __global__ void corners( float *dest, float *ixx, float *ixy, float *iyy, int offset, float k, int threshold) { unsigned int idx = threadIdx.x + threadIdx.y*blockDim.y + (blockIdx.x*(blockDim.x*blockDim.y)); unsigned int a = idx/${HEIGHT}; unsigned int b = idx%${HEIGHT}; float sxx = 0; float sxy = 0; float syy = 0; float det = 0; float trace = 0; float r = 0; if ((a >= offset) & (a <= (${WIDTH}-offset - 1)) & (b >= offset) & (b <= (${HEIGHT}-offset - 1))) { for (int bi = b - offset; bi < b + offset + 1; ++bi) { for (int ai = a - offset; ai < a + offset + 1; ++ai) { sxx = sxx + ixx[INDEX(ai, bi)]; sxy = sxy + ixy[INDEX(ai, bi)]; syy = syy + iyy[INDEX(ai, bi)]; } } det = sxx*syy - sxy*sxy; trace = sxx + syy; r = det - k*(trace*trace); if ((r/10) > threshold) dest[INDEX(a, b)] = r; } } """) # Find x and y derivatives dy, dx = np.gradient(img) Ixx = dx**2 Ixy = dy * dx Iyy = dy**2 ixx = Ixx.reshape(vector_size, order='F') ixy = Ixy.reshape(vector_size, order='F') iyy = Iyy.reshape(vector_size, order='F') dest_r = np.zeros_like(ixx) # the image is divided in four parts and processed in 4 diff kernels n = 4 # Number of slices (and concurrent operations) used. k_height = height k_width = int(width / n) func_mod = SourceModule( func_mod_template.substitute(HEIGHT=k_height, WIDTH=k_width)) pycuda_corners = func_mod.get_function("corners") ###### Start concurrency configuration ####### # Allocate memory on the host. d_ixx, d_ixy, d_iyy, d_dest_r = [], [], [], [] slice_size = int(vector_size / n) for k in range(n): # Allocate memory on device. d_ixx.append(drv.mem_alloc(ixx[0:slice_size].nbytes)) d_ixy.append(drv.mem_alloc(ixy[0:slice_size].nbytes)) d_iyy.append(drv.mem_alloc(iyy[0:slice_size].nbytes)) d_dest_r.append(drv.mem_alloc(dest_r[0:slice_size].nbytes)) # Create the streams and events needed. stream = [] event = [] event_dtoh = [] marker_names = ['kernel_begin', 'kernel_end'] for k in range(n): stream.append(drv.Stream()) event.append(dict([(marker_names[l], drv.Event()) \ for l in range(len(marker_names))])) event_dtoh.append(drv.Event()) # Use this event as a reference point. ref = drv.Event() finish = drv.Event() ref.record() #### Important ###### # The size of the slices must be larger (+ offset) to calculate # r at the limits of each section of the image. # This version does not calculate an r values for the limits of the # different image sections. #### Important ###### for _ in range(executions): # Transfer to device. 
for k in range(n): drv.memcpy_htod_async(d_ixx[k], ixx[slice_size * k:slice_size * (k + 1)], stream=stream[k]) drv.memcpy_htod_async(d_ixy[k], ixy[slice_size * k:slice_size * (k + 1)], stream=stream[k]) drv.memcpy_htod_async(d_iyy[k], iyy[slice_size * k:slice_size * (k + 1)], stream=stream[k]) drv.memcpy_htod_async(d_dest_r[k], dest_r[slice_size * k:slice_size * (k + 1)], stream=stream[k]) # Run kernels for k in range(n): event[k]['kernel_begin'].record(stream[k]) pycuda_corners( d_dest_r[k], d_ixx[k], d_ixy[k], d_iyy[k], np.uint32(offset), np.float32(k_harris), np.uint32(thresh), # max 1024 threds, 32x32 is a regular choice block=(32, 32, 1), grid=(int(128 / n), 1, 1), stream=stream[k]) for k in range(n): event[k]['kernel_end'].record(stream[k]) # Transfer data back to host. for k in range(n): drv.memcpy_dtoh_async(dest_r[slice_size * k:slice_size * (k + 1)], d_dest_r[k], stream=stream[k]) # event that it completed the transfer event_dtoh[k].record(stream[k]) stream[k].synchronize() # finish finish.record() finish.synchronize() ###### Output results ##### print('Timing info of stream launches in seconds') for k in range(n): print('Stream', k) for l in range(len(marker_names)): print(marker_names[l], ':', ref.time_till(event[k][marker_names[l]]) * 1e-3) # extract the corners r = np.reshape(dest_r, (256, 512), order='F') corners = np.where(r > 0) for i, j in zip(corners[0], corners[1]): corner_list.append([j, i, r[i, j]]) average_execution_time = (ref.time_till(finish) * 1e-3) / executions # for profiling # pycuda.autoinit.context.detach() return corner_list, average_execution_time
def run_simulation(self): # setup data#{{{ data = { 'weights': self.weights, 'lengths': self.lengths, 'params': self.params.T } base_shape = self.n_work_items, for name, shape in dict( tavg0=(self.exposures, self.args.n_regions,), tavg1=(self.exposures, self.args.n_regions,), state=(self.buf_len, self.states * self.args.n_regions), ).items(): # memory error exception for compute device try: data[name] = np.zeros(shape + base_shape, 'f') except MemoryError as e: self.logger.error('%s.\n\t Please check the parameter dimensions %d x %d, they are to large ' 'for this compute device', e, self.args.n_sweep_arg0, self.args.n_sweep_arg1) exit(1) gpu_data = self.make_gpu_data(data)#{{{ # setup CUDA stuff#{{{ step_fn = self.make_kernel( source_file=self.args.filename, warp_size=32, # block_dim_x=self.args.n_sweep_arg0, # ext_options=preproccesor_defines, # caching=args.caching, args=self.args, lineinfo=self.args.lineinfo, nh=self.buf_len, )#}}} # setup simulation#{{{ tic = time.time() n_streams = 32 streams = [drv.Stream() for i in range(n_streams)] events = [drv.Event() for i in range(n_streams)] tavg_unpinned = [] try: tavg = drv.pagelocked_zeros((n_streams,) + data['tavg0'].shape, dtype=np.float32) except drv.MemoryError as e: self.logger.error( '%s.\n\t Please check the parameter dimensions, %d parameters are too large for this GPU', e, self.params.size) exit(1) # determine optimal grid recursively def dog(fgd): maxgd, mingd = max(fgd), min(fgd) maxpos = fgd.index(max(fgd)) if (maxgd - 1) * mingd * bx * by >= nwi: fgd[maxpos] = fgd[maxpos] - 1 dog(fgd) else: return fgd # n_sweep_arg0 scales griddim.x, n_sweep_arg1 scales griddim.y # form an optimal grid recursively bx, by = self.args.blockszx, self.args.blockszy nwi = self.n_work_items rootnwi = int(np.ceil(np.sqrt(nwi))) gridx = int(np.ceil(rootnwi / bx)) gridy = int(np.ceil(rootnwi / by)) final_block_dim = bx, by, 1 fgd = [gridx, gridy] dog(fgd) final_grid_dim = fgd[0], fgd[1] assert gridx * gridy * bx * by >= nwi self.logger.info('history shape %r', gpu_data['state'].shape) self.logger.info('gpu_data %s', gpu_data['tavg0'].shape) self.logger.info('on device mem: %.3f MiB' % (self.nbytes(data) / 1024 / 1024, )) self.logger.info('final block dim %r', final_block_dim) self.logger.info('final grid dim %r', final_grid_dim) # run simulation#{{{ nstep = self.args.n_time self.gpu_mem_info() if self.args.verbose else None try: for i in tqdm.trange(nstep, file=sys.stdout): try: event = events[i % n_streams] stream = streams[i % n_streams] if i > 0: stream.wait_for_event(events[(i - 1) % n_streams]) step_fn(np.uintc(i * self.n_inner_steps), np.uintc(self.args.n_regions), np.uintc(self.buf_len), np.uintc(self.n_inner_steps), np.uintc(self.n_work_items), np.float32(self.dt), gpu_data['weights'], gpu_data['lengths'], gpu_data['params'], gpu_data['state'], gpu_data['tavg%d' % (i%2,)], block=final_block_dim, grid=final_grid_dim) event.record(streams[i % n_streams]) except drv.LaunchError as e: self.logger.error('%s', e) exit(1) tavgk = 'tavg%d' % ((i + 1) % 2,) # async wrt. other streams & host, but not this stream. 
if i >= n_streams: stream.synchronize() tavg_unpinned.append(tavg[i % n_streams].copy()) drv.memcpy_dtoh_async(tavg[i % n_streams], gpu_data[tavgk].ptr, stream=stream) # recover uncopied data from pinned buffer if nstep > n_streams: for i in range(nstep % n_streams, n_streams): stream.synchronize() tavg_unpinned.append(tavg[i].copy()) for i in range(nstep % n_streams): stream.synchronize() tavg_unpinned.append(tavg[i].copy()) except drv.LogicError as e: self.logger.error('%s. Check the number of states of the model or ' 'GPU block shape settings blockdim.x/y %r, griddim %r.', e, final_block_dim, final_grid_dim) exit(1) except drv.RuntimeError as e: self.logger.error('%s', e) exit(1) # self.logger.info('kernel finish..') # release pinned memory tavg = np.array(tavg_unpinned) # also release gpu_data self.release_gpumem(gpu_data) self.logger.info('kernel finished') return tavg