def data_finder(u, ss, sp, gpu_direct=True): data_package = data_list[u][ss][sp] dp = data_package.copy() memory_type = dp.memory_type if memory_type == 'devptr': if gpu_direct: devptr = data_list[u][ss][sp].devptr return devptr, dp else: devptr = data_list[u][ss][sp].devptr shape = dp.data_memory_shape bcmd = dp.data_contents_memory_dtype if log_type in ['time','all']: st = time.time() buf = numpy.empty((shape), dtype=bcmd) cuda.memcpy_dtoh_async(buf, devptr, stream=stream[1]) # buf = cuda.from_device(devptr, shape, bcmd) if log_type in ['time','all']: u = dp.unique_id bytes = dp.data_bytes t = MPI.Wtime()-st ms = 1000*t bw = bytes/GIGA/t log("rank%d, \"%s\", u=%d, GPU%d data transfer from GPU memory to CPU memory, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, device_number, bytes/MEGA, ms, bw),'time', log_type) dp.memory_type = 'memory' dp.data_dtype = type(buf) return buf, dp else: data = data_list[u][ss][sp].data return data, dp return None, None
def synchronize_start(self): """ Start the synchronization process. """ # Use shorter, easier names for class variables. bufs = self._sync_buffers ptrs = self._sync_ptrs streams = self._sync_streams adj = self._sync_adj # Start the transfer operations needed. self._sync_tags = [mpi_tag() for k in range(2)] # Mpi message tags. # Forward send. drv.memcpy_dtoh_async(bufs[0], ptrs['forw_src'], stream=streams[0]) # Backward send. drv.memcpy_dtoh_async(bufs[1], ptrs['back_src'], stream=streams[1]) # Forward receive. self._sync_req_forw = comm.Irecv(bufs[2], source=adj['back'], \ tag=self._sync_tags[0]) # Backward receive. self._sync_req_back = comm.Irecv(bufs[3], source=adj['forw'], \ tag=self._sync_tags[1]) # Signalling variables needed to complete transfers. self._sync_part2_start = [False, False, False, False]
def send(data, data_package, dest=None, gpu_direct=True): global s_requests tag = 52 dp = data_package # send data_package send_data_package(dp, dest=dest, tag=tag) bytes = dp.data_bytes memory_type = dp.memory_type if log_type in ['time','all']: st = time.time() flag = False request = None if memory_type == 'devptr': # data in the GPU if gpu_direct: # want to use GPU direct devptr = data buf = MPI.make_buffer(devptr.__int__(), bytes) ctx.synchronize() request = comm.Isend([buf, MPI.BYTE], dest=dest, tag=57) if VIVALDI_BLOCKING: MPI.Request.Wait(request) s_requests.append((request, buf, devptr)) flag = True else:# not want to use GPU direct # copy to CPU shape = dp.data_memory_shape dtype = dp.data_contents_memory_dtype buf = numpy.empty(shape, dtype=dtype) cuda.memcpy_dtoh_async(buf, data, stream=stream_list[1]) request = comm.Isend(buf, dest=dest, tag=57) if VIVALDI_BLOCKING: MPI.Request.Wait(request) s_requests.append((request, buf, None)) else: # data in the CPU # want to use GPU direct, not exist case # not want to use GPU direct if dp.data_dtype == numpy.ndarray: request = comm.Isend(data, dest=dest, tag=57) if VIVALDI_BLOCKING: MPI.Request.Wait(request) s_requests.append((request, data, None)) if log_type in ['time','all']: u = dp.unique_id bytes = dp.data_bytes t = MPI.Wtime()-st ms = 1000*t bw = bytes/GIGA/t if flag: log("rank%d, \"%s\", u=%d, from rank%d to rank%d GPU direct send, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, rank, dest, bytes/MEGA, ms, bw),'time', log_type) else: log("rank%d, \"%s\", u=%d, from rank%d to rank%d MPI data transfer, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"%(rank, name, u, rank, dest, bytes/MEGA, ms, bw),'time', log_type) return request
def copy(self, fb, dim, pool, stream=None): fmt = 'u1' if self.pix_fmt in ('yuv444p10', 'yuv420p10', 'yuv444p12'): fmt = 'u2' dims = (3, dim.h, dim.w) if self.pix_fmt == 'yuv420p10': dims = (dim.h * dim.w * 6 / 4,) h_out = pool.allocate(dims, fmt) cuda.memcpy_dtoh_async(h_out, fb.d_back, stream) return h_out
def run(self, scomp, scopy): # Pack kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata, v.mapping, v.cstrides or 0, v.rstrides or 0, m) # Copy the packed buffer to the host event.record(scomp) scopy.wait_for_event(event) cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
def get(self, stream=None): """ copy device array to host. Returns: the host numpy array """ assert self.is_contiguous, "Array in get() must be contiguous" ary = np.empty(self.shape, self.dtype) drv.memcpy_dtoh_async(ary, self.gpudata, stream) return ary
def get_async(self, stream=None, ary=None): if ary is None: ary = drv.pagelocked_empty(self.shape, self.dtype) else: assert ary.size == self.size assert ary.dtype == self.dtype if self.size: drv.memcpy_dtoh_async(ary, self.gpudata, stream) return ary
def cpy_back(a, a_gpu, auto_init_context=True): """Data transfer from device to host. Asynchronous will be enabled when auto_init_context is True, otherwise use normal transfer. """ import pycuda.driver as drv if auto_init_context: strm = drv.Stream() drv.memcpy_dtoh_async(a, a_gpu, strm) return strm else: drv.memcpy_dtoh(a, a_gpu)
def get_async(self, stream=None, ary=None): if ary is None: ary = drv.pagelocked_empty(self.shape, self.dtype) ary = _as_strided(ary, strides=self.strides) else: assert ary.size == self.size assert ary.dtype == self.dtype assert ary.flags.forc assert self.flags.forc, "Array in get() must be contiguous" if self.size: drv.memcpy_dtoh_async(ary, self.gpudata, stream) return ary
def get_host_result(self): if not self.gpu_finished: if self.gpu_finished_evt.query(): self.gpu_finished = True self.copy_stream = get_stream() self.host_dest = self.pagelocked_allocator( self.gpu_result.shape, self.gpu_result.dtype, self.copy_stream) drv.memcpy_dtoh_async(self.host_dest, self.gpu_result.gpudata, self.copy_stream) self.copy_finished_evt = drv.Event() self.copy_finished_evt.record() else: if self.copy_finished_evt.query(): STREAM_POOL.append(self.copy_stream) return self.host_dest
def get_async(self, stream = None, ary = None): if ary is None: ary = cuda.pagelocked_empty(self.shape, self.dtype) else: assert ary.size == self.size assert ary.dtype == ary.dtype if ary.base.__class__ != cuda.HostAllocation: raise TypeError("asynchronous memory trasfer requires pagelocked numpy array") if self.size: if self.M == 1: cuda.memcpy_dtoh_async(ary, self.gpudata, stream) else: PitchTrans(self.shape, ary, _pd(self.shape), self.gpudata, self.ld, self.dtype, async = True, stream = stream) return ary
def run(self, scomp, scopy): # If we are unpacking then copy the host buffer to the GPU if op == 'unpack': cuda.memcpy_htod_async(m.data, m.hdata, scopy) event.record(scopy) scomp.wait_for_event(event) # Call the CUDA kernel (pack or unpack) fn.prepared_async_call(grid, block, scomp, v.nrow, v.ncol, v.mapping, v.strides, m, v.mapping.leaddim, v.strides.leaddim, m.leaddim) # If we have been packing then copy the GPU buffer to the host if op == 'pack': event.record(scomp) scopy.wait_for_event(event) cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
def execute(): # Allocate device memory for inputs and outputs. d_input = cuda.mem_alloc(h_input.nbytes) d_output = cuda.mem_alloc(h_output.nbytes) # Create a stream in which to copy inputs/outputs and run inference. stream = cuda.Stream() with engine.create_execution_context() as context: # Transfer input data to the GPU. cuda.memcpy_htod_async(d_input, h_input, stream) # Run inference. context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(h_output, d_output, stream) # Synchronize the stream stream.synchronize() # Return the host output. return h_output
def _compute(self, data: np.ndarray, context: trt.tensorrt.IExecutionContext) -> np.ndarray: assert (data.dtype == self.array_in_dtype) assert (data.shape == self.array_in_shape) self.h_input = data # Transfer input data to the GPU. cuda.memcpy_htod_async(self.d_input, self.h_input, self.stream) # Run inference. context.execute_async(bindings=[int(self.d_input), int(self.d_output)], stream_handle=self.stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(self.h_output, self.d_output, self.stream) # Synchronize the stream self.stream.synchronize() # Return the host output. return self.h_output.reshape(self.array_out_shape)
def inference(engine_file_path): engine, context = load(engine_file_path) # get sizes of input and output and allocate memory required for input data and for output data device_input = None device_output, host_output = None, None for binding in engine: if engine.binding_is_input(binding): # we expect only one input input_shape = engine.get_binding_shape(binding) input_size = trt.volume( input_shape) * engine.max_batch_size * np.dtype( np.float32).itemsize # in bytes device_input = cuda.mem_alloc(input_size) else: # and one output output_shape = engine.get_binding_shape(binding) # create page-locked memory buffers (i.e. won't be swapped to disk) host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32) device_output = cuda.mem_alloc(host_output.nbytes) assert device_input assert device_output # Create a stream in which to copy inputs/outputs and run inference. stream = cuda.Stream() # preprocess input data host_input = np.array(np.random.rand(1, 3, 50, 200), dtype=np.float32, order='C') cuda.memcpy_htod_async(device_input, host_input, stream) # run inference context.execute_async(bindings=[int(device_input), int(device_output)], stream_handle=stream.handle) cuda.memcpy_dtoh_async(host_output, device_output, stream) stream.synchronize() # postprocess results output_data = torch.Tensor(host_output).reshape(engine.max_batch_size, output_shape[0]) print(output_data)
def predict(): time1 = time.time() cuda.init() device = cuda.Device(0) ctx = device.make_context() time2 = time.time() print("time to get context : ", time2 - time1) with builder.build_cuda_engine(network) as engine: output = np.empty(10 * BATCH_SIZE, dtype=np.float32) d_input = cuda.mem_alloc(1 * img.nbytes) d_output = cuda.mem_alloc(1 * output.nbytes) bindings = [int(d_input), int(d_output)] stream = cuda.Stream() with engine.create_execution_context() as context: cuda.memcpy_htod_async(d_input, img, stream) context.execute_async(bindings=bindings, stream_handle=stream.handle, batch_size=BATCH_SIZE) cuda.memcpy_dtoh_async(output, d_output, stream) stream.synchronize() # print("true label : ", label) result = [] accuracy = np.zeros((1, BATCH_SIZE), np.uint8) for ii in range(BATCH_SIZE): result.append(np.argmax(output[ii * 10:(ii + 1) * 10])) if result[ii] == label[ii]: accuracy[0, ii] = 1 # print(output[ii*10:(ii+1)*10]) # print(result) print("accuracy : ", np.sum(accuracy) / BATCH_SIZE) ctx.pop() return "Done\n" #str(output)
def inference(self, img): # copy img to input memory # self.inputs[0]['host'] = np.ascontiguousarray(img) self.inputs[0]['host'] = np.ravel(img) # transfer data to the gpu for inp in self.inputs: cuda.memcpy_htod_async(inp['device'], inp['host'], self.stream) # run inference start = time.time() self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle) end = time.time() print('execution time:', end - start) # fetch outputs from gpu for out in self.outputs: cuda.memcpy_dtoh_async(out['host'], out['device'], self.stream) # synchronize stream self.stream.synchronize() return [out['host'] for out in self.outputs]
def inference(self, img): self.watch.start() ih, iw = img.shape[:-1] if (iw, ih) != self.input_shape: img = cv2.resize(img, self.input_shape) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) img = img.transpose((2, 0, 1)).astype(np.float32) img *= (2.0 / 255.0) img -= 1.0 self.watch.stop(Stopwatch.MODE_PREPROCESS) self.watch.start() np.copyto(self.host_inputs[0], img.ravel()) cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream) self.context.execute_async(batch_size=1, bindings=self.bindings, stream_handle=self.stream.handle) cuda.memcpy_dtoh_async(self.host_outputs[1], self.cuda_outputs[1], self.stream) cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream) self.stream.synchronize() self.watch.stop(Stopwatch.MODE_INFER) self.watch.start() output = self.host_outputs[0] results = [] for prefix in range(0, len(output), 7): conf = float(output[prefix + 2]) if conf < 0.5: continue x1 = output[prefix + 3] * iw y1 = output[prefix + 4] * ih x2 = (output[prefix + 5] - output[prefix + 3]) * iw y2 = (output[prefix + 6] - output[prefix + 4]) * ih cls = int(output[prefix + 1]) results.append(((x1, y1, x2, y2), cls, conf)) self.watch.stop(Stopwatch.MODE_POSTPROCESS) return results
def inference(self, inputs): np.copyto(self.host_inputs[0], inputs[0].ravel()) np.copyto(self.host_inputs[1], inputs[1].ravel()) if self.cuda_ctx: self.cuda_ctx.push() cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream) cuda.memcpy_htod_async(self.cuda_inputs[1], self.host_inputs[1], self.stream) self.context.execute_async(batch_size=1, bindings=self.bindings, stream_handle=self.stream.handle) cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream) self.stream.synchronize() if self.cuda_ctx: self.cuda_ctx.pop() output = self.host_outputs[0] return output
def may_download(self): ctx = self.cuda_context if self.pixels is not None or not ctx or self.freed: return assert self.cuda_device_buffer, "bug: no device buffer" start = monotonic_time() ctx.push() host_buffer = driver.pagelocked_empty(self.buffer_size, dtype=numpy.byte) driver.memcpy_dtoh_async(host_buffer, self.cuda_device_buffer, self.stream) self.wait_for_stream() self.pixels = host_buffer.tobytes() elapsed = monotonic_time() - start log("may_download() from %#x to %s, size=%s, elapsed=%ims - %iMB/s", int(self.cuda_device_buffer), host_buffer, self.buffer_size, int(1000 * elapsed), self.buffer_size / elapsed / 1024 / 1024) self.free_cuda() ctx.pop()
def infer(self, input_img, output_size, num_binding): #self.runtime=self.create_runtime() #self.context=self.create_context() assert (self.__engine.get_nb_bindings() == num_binding) output = np.empty(output_size, dtype=np.float32) d_input = cuda.mem_alloc(self.batchsize * input_img.size * input_img.dtype.itemsize) d_output = cuda.mem_alloc(self.batchsize * output.size * output.dtype.itemsize) # pointers to gpu memory bindings = [int(d_input), int(d_output)] stream = cuda.Stream() #transfer input data to device cuda.memcpy_htod_async(d_input, input_img, stream) #execute model self.context.enqueue(self.batchsize, bindings, stream.handle, None) #transfer predictions back cuda.memcpy_dtoh_async(output, d_output, stream) #syncronize threads stream.synchronize() print 'all of activities in stream is done: {}'.format( stream.is_done()) #destroy cuda context d_input.free() d_output.free() print 1999 - cuda.mem_get_info()[0] / 1048576, cuda.mem_get_info( )[1] / 1048576 #self.context.destroy #self.runtime.destroy() return output
def detect(self, img, conf_th=0.3, conf_class=[]): """Detect objects in the input image.""" img_resized = _preprocess_trt(img, self.input_shape) np.copyto(self.host_inputs[0], img_resized.ravel()) cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream) self.context.execute_async(batch_size=1, bindings=self.bindings, stream_handle=self.stream.handle) cuda.memcpy_dtoh_async(self.host_outputs[1], self.cuda_outputs[1], self.stream) cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream) self.stream.synchronize() output = self.host_outputs[0] return _postprocess_trt(img, output, conf_th, self.output_layout, conf_class)
def test_streamed_kernel(self): # this differs from the "simple_kernel" case in that *all* computation # and data copying is asynchronous. Observe how this necessitates the # use of page-locked memory. mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x*blockDim.y + threadIdx.y; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") import numpy shape = (32, 8) a = drv.pagelocked_zeros(shape, dtype=numpy.float32) b = drv.pagelocked_zeros(shape, dtype=numpy.float32) a[:] = numpy.random.randn(*shape) b[:] = numpy.random.randn(*shape) a_gpu = drv.mem_alloc(a.nbytes) b_gpu = drv.mem_alloc(b.nbytes) strm = drv.Stream() drv.memcpy_htod_async(a_gpu, a, strm) drv.memcpy_htod_async(b_gpu, b, strm) strm.synchronize() dest = drv.pagelocked_empty_like(a) multiply_them(drv.Out(dest), a_gpu, b_gpu, block=shape + (1, ), stream=strm) strm.synchronize() drv.memcpy_dtoh_async(a, a_gpu, strm) drv.memcpy_dtoh_async(b, b_gpu, strm) strm.synchronize() assert la.norm(dest - a * b) == 0
def do_inference(engine, tensor, h_input_1, d_input_1, h_output, d_output, stream, batch_size1): """ This is the function to run the inference Args: engine : Path to the TensorRT engine. tensor : Input VTK file to the model. h_input_1: Input in the host. d_input_1: Input in the device. h_output_1: Output in the host. d_output_1: Output in the device. stream: CUDA stream. batch_size : Batch size for execution time. height: Height of the output image. width: Width of the output image. Output: The list of output images. context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) """ print("[INFO] load file to buffer...") load_file_to_buffer(tensor, h_input_1) with engine.create_execution_context() as context: # Transfer input data to the GPU. start = time.time() cuda.memcpy_htod_async(d_input_1, h_input_1, stream) # Run inference. #context.profiler = trt.Profiler() context.execute(batch_size=1, bindings=[int(d_input_1), int(d_output)]) #context.execute_async(batch_size=1, bindings=[int(d_input_1), int(d_output)], stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(h_output, d_output, stream) # Synchronize the stream. stream.synchronize() # Return the host output. end = time.time() out = h_output.reshape((batch_size1, 18, 18, 8, 3)) print("inference time by TensorRT for one batch with size of {}: {}". format(batch_size1, end - start)) return out
def _test_model(self, model_name, input_shape=(3, 224, 224), normalization_hint=0): model = getattr(models, model_name)(pretrained=True) shape = (1, ) + input_shape dummy_input = (torch.randn(shape), ) onnx_name = model_name + ".onnx" torch.onnx.export(model, dummy_input, onnx_name, input_names=[], output_names=[], verbose=False, export_params=True, opset_version=9) with self.build_engine_onnx(onnx_name) as engine: h_input, d_input, h_output, d_output, stream = allocate_buffers( engine) with engine.create_execution_context() as context: err_count = 0 for index, f in enumerate(self.image_files): test_case = load_normalized_test_case(input_shape, f,\ h_input, normalization_hint) cuda.memcpy_htod_async(d_input, h_input, stream) context.execute_async_v2(bindings=[d_input, d_output], stream_handle=stream.handle) cuda.memcpy_dtoh_async(h_output, d_output, stream) stream.synchronize() amax = np.argmax(h_output) pred = self.labels[amax] if "_".join(pred.split()) not in\ os.path.splitext(os.path.basename(test_case))[0]: err_count = err_count + 1 self.assertLessEqual(err_count, 1, "Too many recognition errors")
def inference(self, img): """ Detect objects in the input image. Args: img: uint8 numpy array with shape (img_height, img_width, channels) Returns: result: a dictionary contains of [{"id": 0, "bbox": [x1, y1, x2, y2], "score": s% }, {...}, {...}, ...] """ img_resized = self._preprocess_trt(img) # transfer the data to the GPU, run inference and the copy the results back np.copyto(self.host_inputs[0], img_resized.ravel()) # Start inference time t_begin = time.perf_counter() cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream) self.context.execute_async(batch_size=1, bindings=self.bindings, stream_handle=self.stream.handle) cuda.memcpy_dtoh_async(self.host_outputs[1], self.cuda_outputs[1], self.stream) cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream) self.stream.synchronize() inference_time = time.perf_counter() - t_begin # Seconds # Calculate Frames rate (fps) self.fps = convert_infr_time_to_fps(inference_time) output = self.host_outputs[0] boxes, scores, classes = self._postprocess_trt(img, output) result = [] for i in range(len(boxes)): # number of boxes if classes[i] == self.class_id + 1: result.append({ "id": str(classes[i] - 1) + '-' + str(i), "bbox": boxes[i], "score": scores[i] }) return result
def run(batchSize, nRow, nCol): print("test", batchSize, nRow, nCol) logger = trt.Logger(trt.Logger.ERROR) trt.init_libnvinfer_plugins(logger, '') ctypes.cdll.LoadLibrary(soFilePath) engine = buildEngine(logger, nRow, nCol) if engine == None: print("Failed building engine!") return None print("Succeeded building engine!") context = engine.create_execution_context() stream = cuda.Stream() condition = np.array(np.random.randint(0, 2, [batchSize, nRow, nCol]), dtype=np.int32) inputX = np.full([batchSize, nRow, nCol], 1, dtype=np.float32) inputY = np.full([batchSize, nRow, nCol], -1, dtype=np.float32) inputH0 = np.ascontiguousarray(condition.reshape(-1)) inputH1 = np.ascontiguousarray(inputX.reshape(-1)) inputH2 = np.ascontiguousarray(inputY.reshape(-1)) inputD0 = cuda.mem_alloc(inputH0.nbytes) inputD1 = cuda.mem_alloc(inputH1.nbytes) inputD2 = cuda.mem_alloc(inputH2.nbytes) outputH0 = np.empty((batchSize, ) + tuple(engine.get_binding_shape(3)), dtype=trt.nptype(engine.get_binding_dtype(3))) outputD0 = cuda.mem_alloc(outputH0.nbytes) cuda.memcpy_htod_async(inputD0, inputH0, stream) cuda.memcpy_htod_async(inputD1, inputH1, stream) cuda.memcpy_htod_async(inputD2, inputH2, stream) context.execute_async( batchSize, [int(inputD0), int(inputD1), int(inputD2), int(outputD0)], stream.handle) cuda.memcpy_dtoh_async(outputH0, outputD0, stream) stream.synchronize() outputH0CPU = whereCPU(condition, inputX, inputY) print("Check result:", ["True" if np.all(outputH0 == outputH0CPU) else "False"][0])
def detect(self, img, conf_thr=0.4, nms_thr=0.4): # Restore stream = self.stream context = self.context engine = self.engine host_inputs = self.host_inputs cuda_inputs = self.cuda_inputs host_outputs = self.host_outputs cuda_outputs = self.cuda_outputs bindings = self.bindings # Do image preprocess input_image = _preprocess_yolo(img, self.input_shape) # Copy input image to host buffer np.copyto(host_inputs[0], input_image.ravel()) # Transfer input data to the GPU. cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) # Run inference. context.execute_async(bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) # Synchronize the stream stream.synchronize() # Here we use the first row of output in that batch_size = 1 output = host_outputs[0] raw_h, raw_w = img.shape[:2] print("raw shape:", img.shape) w_scale = raw_w / self.input_shape[1] h_scale = raw_h / self.input_shape[0] boxes, scores, classes = _postprocess_yolo(output, w_scale, h_scale, conf_thr, nms_thr) boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, img.shape[1] - 1) boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, img.shape[0] - 1) for i in range(len(boxes)): box = boxes[i] plot_one_box(box, img, label="{}:{:.2f}".format(classes[i], scores[i])) cv2.imwrite('test.jpg', img) return boxes, scores, classes
def predict_image_dn_trt(noised, clean, out_shape, context): x = torch.from_numpy(np.float32(noised / 255)).permute(2, 0, 1).unsqueeze(0) image = np.ascontiguousarray(x) d_input = cuda.mem_alloc(1 * image.size * image.dtype.itemsize) y = np.empty(out_shape, dtype=np.float32) d_output = cuda.mem_alloc(1 * y.size * y.dtype.itemsize) bindings = [int(d_input), int(d_output)] stream = cuda.Stream() cuda.memcpy_htod_async(d_input, image, stream) context.execute_async(int(1), bindings, stream.handle, None) cuda.memcpy_dtoh_async(y, d_output, stream) stream.synchronize() y = torch.from_numpy(y) if clean is not None: xx = torch.from_numpy(np.float32(clean / 255)).permute(2, 0, 1).unsqueeze(0) psnr_ = np.round(batch_PSNR(y, xx, data_range=1.0), 3) else: psnr_ = 0 img = np.array(y.cpu().squeeze(0).permute(1, 2, 0) * 255).astype('uint8') return psnr_, img
def infer(context, h_input, data_type=np.float32): # dynamic shape, batch size ## context.active_optimization_profile = 0 ## context.set_binding_shape(0, (h_input.shape)) # print(context.get_binding_shape(0), context.get_binding_shape(1)) size = trt.volume(h_input.shape) * np.dtype(data_type).itemsize d_input = cuda.mem_alloc(size) h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=data_type) d_output = cuda.mem_alloc(h_output.nbytes) stream = cuda.Stream() cuda.memcpy_htod_async(d_input, h_input, stream) context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle) cuda.memcpy_dtoh_async(h_outpdataut, d_output, stream) stream.synchronize() return h_output
def _do_inference(self): # Transfer input data to the GPU.(optionally serialized via stream) [cuda.memcpy_htod_async(inp.device, inp.host, self.stream) for inp in self.inputs] # Run inference. self.context.execute_async(bindings=self.bindings, stream_handle=self.stream.handle) # Transfer predictions back from the GPU.(optionally serialized via stream) [cuda.memcpy_dtoh_async(out.host, out.device, self.stream) for out in self.outputs] # Synchronize the stream self.stream.synchronize() # Return only the host outputs. return [out.host for out in self.outputs]
def infer(cls, context, bindings, inputs, outputs, stream, batch_size=1): # Transfer input data to the GPU. [cuda.memcpy_htod_async(inp.device_memory, inp.host_memory, stream) for inp in inputs] # Run inference. context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. [cuda.memcpy_dtoh_async(out.host_memory, out.device_memory, stream) for out in outputs] # Synchronize the stream stream.synchronize() # Return only the host outputs. return [out.host_memory for out in outputs]
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): # Transfer input data to the GPU. [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] # Run inference. success_flag = context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) # Bug [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] # Synchronize the stream return [out.host for out in outputs]
def trt_inference(stream, trt_ctx, d_input, d_output, input_signal, input_signal_length): print("infer with shape: {}".format(input_signal.shape)) trt_ctx.set_binding_shape(0, input_signal.shape) assert trt_ctx.all_binding_shapes_specified h_output = cuda.pagelocked_empty(tuple(trt_ctx.get_binding_shape(1)), dtype=np.float32) h_input_signal = cuda.register_host_memory( np.ascontiguousarray(input_signal.cpu().numpy().ravel())) cuda.memcpy_htod_async(d_input, h_input_signal, stream) trt_ctx.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle) cuda.memcpy_dtoh_async(h_output, d_output, stream) stream.synchronize() greedy_predictions = torch.tensor(h_output).argmax(dim=-1, keepdim=False) return greedy_predictions
def do_inference(context, bindings, inputs, outputs, stream): # Transfer input data to the GPU. [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] # Run inference. context.execute_async(bindings=bindings, stream_handle=stream.handle) # Transfer predictions back from the GPU. [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] # Synchronize the stream stream.synchronize() # Return only the host outputs. return [out.host for out in outputs]
def predict(context, batch, d_input, stream, bindings, p_output, d_output): # result gets copied into output # transfer input data to device cuda.memcpy_htod_async(d_input, batch, stream) # execute model context.execute_async_v2(bindings, stream.handle, None) # transfer predictions back cuda.memcpy_dtoh_async(p_output, d_output, stream) # syncronize threads stream.synchronize() # ============================================================================= # print(f"lr_shape: {batch.shape}" # f"lr input : {batch}") # # print(f"hr_shape: {p_output.shape}" # f"hr output: {p_output}") # ============================================================================= return p_output
def test_streamed_kernel(self): # this differs from the "simple_kernel" case in that *all* computation # and data copying is asynchronous. Observe how this necessitates the # use of page-locked memory. mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x*blockDim.y + threadIdx.y; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") import numpy shape = (32,8) a = drv.pagelocked_zeros(shape, dtype=numpy.float32) b = drv.pagelocked_zeros(shape, dtype=numpy.float32) a[:] = numpy.random.randn(*shape) b[:] = numpy.random.randn(*shape) a_gpu = drv.mem_alloc(a.nbytes) b_gpu = drv.mem_alloc(b.nbytes) strm = drv.Stream() drv.memcpy_htod_async(a_gpu, a, strm) drv.memcpy_htod_async(b_gpu, b, strm) strm.synchronize() dest = drv.pagelocked_empty_like(a) multiply_them( drv.Out(dest), a_gpu, b_gpu, block=shape+(1,), stream=strm) strm.synchronize() drv.memcpy_dtoh_async(a, a_gpu, strm) drv.memcpy_dtoh_async(b, b_gpu, strm) strm.synchronize() assert la.norm(dest-a*b) == 0
def main(): # initialize TensorRT engine and parse ONNX model engine, context = build_engine(ONNX_FILE_PATH) # get sizes of input and output and allocate memory required for input data and for output data for binding in engine: if engine.binding_is_input(binding): # we expect only one input input_shape = engine.get_binding_shape(binding) input_size = trt.volume( input_shape) * engine.max_batch_size * np.dtype( np.float32).itemsize # in bytes device_input = cuda.mem_alloc(input_size) else: # and one output output_shape = engine.get_binding_shape(binding) # create page-locked memory buffers (i.e. won't be swapped to disk) host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32) device_output = cuda.mem_alloc(host_output.nbytes) # Create a stream in which to copy inputs/outputs and run inference. stream = cuda.Stream() # preprocess input data host_input = np.array(preprocess_image("input.jpeg").numpy(), dtype=np.float32, order='C') cuda.memcpy_htod_async(device_input, host_input, stream) # run inference context.execute_async(bindings=[int(device_input), int(device_output)], stream_handle=stream.handle) cuda.memcpy_dtoh_async(host_output, device_output, stream) stream.synchronize() # postprocess results output_data = torch.Tensor(host_output).reshape(engine.max_batch_size, output_shape[1]) postprocess(output_data)
def loadONNX2TensorRT(self, filepath): ''' 通过onnx文件,构建TensorRT运行引擎 :param filepath: onnx文件路径 ''' engine = self.ONNX_build_engine(filepath) # 读取测试集 datas = DataLoaders() test_loader = datas.testDataLoader() img, target = next(iter(test_loader)) img = img.numpy() target = target.numpy() img = img.ravel() context = engine.create_execution_context() output = np.empty((100, 10), dtype=np.float32) # 分配内存 d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize) d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize) bindings = [int(d_input), int(d_output)] # pycuda操作缓冲区 stream = cuda.Stream() # 将输入数据放入device cuda.memcpy_htod_async(d_input, img, stream) # 执行模型 context.execute_async(100, bindings, stream.handle, None) # 将预测结果从从缓冲区取出 cuda.memcpy_dtoh_async(output, d_output, stream) # 线程同步 stream.synchronize() print("Test Case: " + str(target)) print("Prediction 100: " + str(np.argmax(output, axis=1))) del context del engine
def do_inference(engine, pics_1, h_input_1, d_input_1, h_output, d_output, stream, batch_size, height, width): """ This is the function to run the inference Args: engine : Path to the TensorRT engine pics_1 : Input images to the model. h_input_1: Input in the host d_input_1: Input in the device h_output_1: Output in the host d_output_1: Output in the device stream: CUDA stream batch_size : Batch size for execution time height: Height of the output image width: Width of the output image Output: The list of output images """ print('load images to buffer') load_images_to_buffer(pics_1, h_input_1) with engine.create_execution_context() as context: context.debug_sync = False # Transfer input data to the GPU. cuda.memcpy_htod_async(d_input_1, h_input_1, stream) # Run inference. print('load profiler') context.profiler = trt.Profiler() print('execute') context.execute(batch_size=1, bindings=[int(d_input_1), int(d_output)]) print('Transfer predictions back from the GPU.') # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(h_output, d_output, stream) # Synchronize the stream stream.synchronize() # Return the host output. print(h_output.shape) out = h_output.reshape((1,-1)) return out
def run_inference(self): """ This function is generalized for multiple inputs/outputs. inputs and outputs are expected to be lists of HostDeviceMem objects. """ # Transfer input data to the GPU. for inp in self.inputs: cuda.memcpy_htod_async(inp.device, inp.host, self.stream) # Run inference. self.context.execute_async( batch_size=self.batch_size, bindings=self.bindings, stream_handle=self.stream.handle, ) # Transfer predictions back from the GPU. for out in self.outputs: cuda.memcpy_dtoh_async(out.host, out.device, self.stream) # Synchronize the stream self.stream.synchronize() # Return only the host outputs. return [out.host for out in self.outputs]
def test_kernel(n, a, x_gpu, y_gpu): code = """ #include <stdio.h> __global__ void saxpy(int n, float a, float *x, float *y) { int index = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; for (int i = index; i < n; i += stride) { y[i] = a*x[i] + y[i]; } } """ mod = SourceModule(code) saxpy = mod.get_function("saxpy") saxpy(n, a, x_gpu, y_gpu, block=(1024, 1, 1), grid=(1, 1)) out = cuda.register_host_memory(np.empty(n, dtype=np.float32)) cuda.memcpy_dtoh_async(out, y_gpu) return out
def run(nRow, nCol): print("test: nRow=%d,nCol=%d" % (nRow, nCol)) logger = trt.Logger(trt.Logger.ERROR) trt.init_libnvinfer_plugins(logger, '') ctypes.cdll.LoadLibrary(soFilePath) engine = buildEngine(logger, nRow, nCol) if engine == None: print("Failed building engine!") return None print("Succeeded building engine!") context = engine.create_execution_context() stream = cuda.Stream() data = np.full((nRow, nCol), 1, dtype=np.float32) # uniform distribution #data = np.tile(np.arange(0,nCol,1,dtype=np.float32),[nRow,1]) # non-uniform distribution inputH0 = np.ascontiguousarray(data.reshape(-1)) inputD0 = cuda.mem_alloc(inputH0.nbytes) outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1))) outputH1 = np.empty(context.get_binding_shape(2), dtype=trt.nptype(engine.get_binding_dtype(2))) outputD0 = cuda.mem_alloc(outputH0.nbytes) outputD1 = cuda.mem_alloc(outputH1.nbytes) cuda.memcpy_htod_async(inputD0, inputH0, stream) context.execute_async( 1, [int(inputD0), int(outputD0), int(outputD1)], stream.handle) cuda.memcpy_dtoh_async(outputH0, outputD0, stream) cuda.memcpy_dtoh_async(outputH1, outputD1, stream) stream.synchronize() print("outputH0") print( np.shape(outputH0), "mean=%.2f,var=%.2f,max=%d,min=%d" % (np.mean(outputH0), np.var(outputH0), np.max(outputH0), np.min(outputH0))) print("outputH1") print(np.shape(outputH1), "mean=%.2f" % (np.mean(outputH1)))
def threshold_integrated(series, value): global _dn, _n, _bn, _loc_tmp, _loc_out, _val_out, _loc, _val t = numpy.float32(value**2) nb = int(numpy.ceil(float(len(series))/nt/gs)) if _bn is None or len(_bn) < nb: _bn = gpuarray.zeros(nb, dtype=numpy.uint32) if _n is None: _n = driver.pagelocked_empty((1), numpy.uint32, mem_flags=drv.host_alloc_flags.DEVICEMAP) ptr = numpy.intp(_n.base.get_device_pointer()) class T(): pass _dn = T() _dn.gpudata = ptr _dn.flags = _n.flags if _loc_tmp is None or len(series) > len(_loc_tmp): _loc_tmp = gpuarray.zeros(len(series), dtype=numpy.uint32) _loc_out = gpuarray.zeros(len(series), dtype=numpy.uint32) _val_out = gpuarray.zeros(len(series), dtype=series.dtype) _val = driver.pagelocked_empty((4096*256), numpy.complex64) _loc = driver.pagelocked_empty((4096*256), numpy.uint32) #Do the thresholding by block stuff(series.data, _loc_tmp, _bn, t, numpy.uint32(len(series)), block=(nt, 1, 1), grid=(nb, 1)) # Recombine the blocks into a final output stuff2(series.data, _loc_tmp, _loc_out, _val_out, _bn, _dn, block=(nb, 1, 1), grid=(nb, 1)) # We need to get the data back now pycbc.scheme.mgr.state.context.synchronize() if _n != 0: driver.memcpy_dtoh_async(_val[0:_n], _val_out.gpudata) driver.memcpy_dtoh_async(_loc[0:_n], _loc_out.gpudata) pycbc.scheme.mgr.state.context.synchronize() return _loc[0:_n], _val[0:_n]
def run(self, scomp, scopy): cuda.memcpy_dtoh_async(mpimat.hdata, mpimat.data, scomp)
flop = 3*(nx*ny*nz*30)*tgap flops = np.zeros(tmax/tgap+1) start, stop = cuda.Event(), cuda.Event() start.record() # main loop ey_tmp = cuda.pagelocked_zeros((ny,nz),'f') ez_tmp = cuda.pagelocked_zeros_like(ey_tmp) hy_tmp = cuda.pagelocked_zeros_like(ey_tmp) hz_tmp = cuda.pagelocked_zeros_like(ey_tmp) stream1 = cuda.Stream() for tn in xrange(1, tmax+1): update_h.prepared_async_call(bpg0, stream1, np.int32(By), *eh_args) for i, bpg in enumerate(bpg_list): update_h.prepared_call(bpg, np.int32(i*MBy), *eh_args) if rank == 0: cuda.memcpy_dtoh_async(hy_tmp, int(hy_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1) cuda.memcpy_dtoh_async(hz_tmp, int(hz_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1) stream1.synchronize() comm.Send(hy_tmp, 1, 20) comm.Send(hz_tmp, 1, 21) elif rank == 1: comm.Recv(hy_tmp, 0, 20) comm.Recv(hz_tmp, 0, 21) cuda.memcpy_htod_async(int(hy_gpu), hy_tmp, stream1) cuda.memcpy_htod_async(int(hz_gpu), hz_tmp, stream1) cuda.memcpy_dtoh_async(hy_tmp, int(hy_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1) cuda.memcpy_dtoh_async(hz_tmp, int(hz_gpu)+(nx-1)*ny*nz*np.nbytes['float32'], stream1) stream1.synchronize() comm.Send(hy_tmp, 2, 20) comm.Send(hz_tmp, 2, 21) elif rank == 2:
if len(shape) <= 1: if isinstance(src, GPUArray): if isinstance(dst, GPUArray): if async: drv.memcpy_dtod_async(dst.gpudata, src.gpudata, src.nbytes, stream=stream) else: drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes) else: # The arrays might be contiguous in the sense of # having no gaps, but the axes could be transposed # so that the order is neither Fortran or C. # So, we attempt to get a contiguous view of dst. dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,)) if async: drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream) else: drv.memcpy_dtoh(dst, src.gpudata) else: src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,)) if async: drv.memcpy_htod_async(dst.gpudata, src, stream=stream) else: drv.memcpy_htod(dst.gpudata, src) return if len(shape) == 2: copy = drv.Memcpy2D() elif len(shape) == 3: copy = drv.Memcpy3D() else:
def from_buf_async(self, cl_buf, stream=None): cuda.memcpy_dtoh_async(self.buffers[cl_buf], cl_buf, stream)
def convert_image_rgb(self, image): global program start = time.time() iplanes = image.get_planes() w = image.get_width() h = image.get_height() stride = image.get_rowstride() pixels = image.get_pixels() debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels)) assert iplanes==ImageWrapper.PACKED, "must use packed format as input" assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format) divs = get_subsampling_divs(self.dst_format) #copy packed rgb pixels to GPU: upload_start = time.time() stream = driver.Stream() mem = numpy.frombuffer(pixels, dtype=numpy.byte) in_buf = driver.mem_alloc(len(pixels)) hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP) pycuda.driver.memcpy_htod_async(in_buf, mem, stream) out_bufs = [] out_strides = [] out_sizes = [] for i in range(3): x_div, y_div = divs[i] out_stride = roundup(self.dst_width/x_div, 4) out_height = roundup(self.dst_height/y_div, 2) out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4) out_bufs.append(out_buf) out_strides.append(out_stride) out_sizes.append((out_stride, out_height)) #ensure uploading has finished: stream.synchronize() #we can now unpin the host memory: hmem.base.unregister() debug("allocation and upload took %.1fms", 1000.0*(time.time() - upload_start)) kstart = time.time() kargs = [in_buf, numpy.int32(stride)] for i in range(3): kargs.append(out_bufs[i]) kargs.append(numpy.int32(out_strides[i])) blockw, blockh = 16, 16 #figure out how many pixels we process at a time in each dimension: xdiv = max([x[0] for x in divs]) ydiv = max([x[1] for x in divs]) gridw = max(1, w/blockw/xdiv) if gridw*2*blockw<w: gridw += 1 gridh = max(1, h/blockh/ydiv) if gridh*2*blockh<h: gridh += 1 debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name, tuple(kargs), (gridw, gridh), (blockw, blockh, 1)) self.kernel_function(*kargs, block=(blockw,blockh,1), grid=(gridw, gridh)) #we can now free the GPU source buffer: in_buf.free() kend = time.time() debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0) self.frames += 1 #copy output YUV channel data to host memory: read_start = time.time() pixels = [] strides = [] for i in range(3): x_div, y_div = divs[i] out_size = out_sizes[i] #direct full plane async copy keeping current GPU padding: plane = driver.aligned_empty(out_size, dtype=numpy.byte) driver.memcpy_dtoh_async(plane, out_bufs[i], stream) pixels.append(plane.data) stride = out_strides[min(len(out_strides)-1, i)] strides.append(stride) stream.synchronize() #the copying has finished, we can now free the YUV GPU memory: #(the host memory will be freed by GC when 'pixels' goes out of scope) for out_buf in out_bufs: out_buf.free() self.cuda_context.synchronize() read_end = time.time() debug("strides=%s", strides) debug("read back took %.1fms, total time: %.1f", (read_end-read_start)*1000.0, 1000.0*(time.time()-start)) return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes=ImageWrapper._3_PLANES)
def copy(self, fb, dim, pool, stream=None): h_out = pool.allocate((dim.h, dim.w, 4), "u1") cuda.memcpy_dtoh_async(h_out, fb.d_back, stream) return h_out
Stream2 = drv.Stream() MT_state_buf = drv.mem_alloc(SIZE * MT_N * 4) MT_state_res_buf = drv.mem_alloc(MT_state_result.nbytes) prg = SourceModule( transform_to_cuda( gen_kernel(MT_N, STATE_SIZE, M, SIZE, SIGNIFICANT_LENGTH) ) ) prog = prg.get_function('mt_brute') zzz = time.time() ev = prog(np.uint32(0), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE/STATE_SIZE, 1), stream=Stream) drv.memcpy_dtoh_async(MT_state_result, MT_state_res_buf, stream=Stream2) for i in xrange(TEST_ITERATIONS): prog(np.uint32(i*SIZE), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE/STATE_SIZE, 1), stream=Stream) drv.memcpy_dtoh(MT_state_result, MT_state_res_buf)#, stream=Stream2) zzz = time.time() - zzz print '>>>', zzz for row in MT_state_result[0]: print row MT_state_buf.free() MT_state_res_buf.free()