def run():
    """Build (first call) or refit (subsequent calls) a TensorRT engine from ONNX, then run one inference.

    First run: no model.plan on disk -> parse model0.onnx, build the engine and
    serialize it to trtFile. Later runs: model.plan exists -> parse model1.onnx
    and transplant its weights into the deserialized engine via trt.Refitter.
    Relies on module-level globals: trtFile, onnxFile0, onnxFile1, inputImage.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine == None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
        onnxFile = onnxFile1  # model.plan already exists: read model1.onnx to refit
    else:
        onnxFile = onnxFile0  # no model.plan yet: build it from model0.onnx

    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()
    config.flags = 1 << int(trt.BuilderFlag.REFIT)  # engine must be built refittable
    config.max_workspace_size = 3 << 30
    parser = trt.OnnxParser(network, logger)
    if not os.path.exists(onnxFile):
        print("Failed finding .onnx file!")
        exit()
    print("Succeeded finding .onnx file!")
    with open(onnxFile, 'rb') as model:
        if not parser.parse(model.read()):
            print("Failed parsing .onnx file!")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            exit()
        print("Succeeded parsing .onnx file!")

    if os.path.isfile(trtFile):  # refit path: copy weights from the freshly-parsed network
        refitter = trt.Refitter(engine, logger)
        layerNameList, weightRoleList = refitter.get_all()
        for name, role in zip(layerNameList, weightRoleList):
            print("LayerName:%s,WeightRole:%s" % (name, role))
        for i in range(network.num_layers):
            layer = network.get_layer(i)
            if layer.name in layerNameList:  # depending on the network, more layer types may need handling here
                if layer.type == trt.LayerType.CONVOLUTION:
                    layer.__class__ = trt.IConvolutionLayer  # downcast to access kernel/bias
                    refitter.set_weights(layer.name, trt.WeightsRole.KERNEL, layer.kernel)
                    refitter.set_weights(layer.name, trt.WeightsRole.BIAS, layer.bias)
                if layer.type == trt.LayerType.FULLY_CONNECTED:
                    layer.__class__ = trt.IFullyConnectedLayer
                    # NOTE(review): only KERNEL is refitted here; if the FC layer has a
                    # bias it will be reported by get_missing() below — confirm intent.
                    refitter.set_weights(layer.name, trt.WeightsRole.KERNEL, layer.kernel)
                if layer.type == trt.LayerType.CONSTANT:
                    layer.__class__ = trt.IConstantLayer
                    refitter.set_weights(layer.name, trt.WeightsRole.CONSTANT, layer.weights)
        if refitter.refit_cuda_engine() == False:
            print("Failed refitting engine, missing weight:")
            [missingLayer, weightRole] = refitter.get_missing()
            for layer, role in zip(missingLayer, weightRole):
                # BUG FIX: previously printed `name` (stale variable from the loop
                # above) instead of the missing layer being iterated here.
                print("\tLayerName:%s,WeightRole:%s" % (layer, role))
            return
        print("Succeeded refitting engine!")
    else:  # build path
        inputTensor = network.get_input(0)
        inputTensor.shape = [1, 1, 28, 28]
        '''
        # print network information layer by layer
        for i in range(network.num_layers):
            layer = network.get_layer(i)
            print(i, "%s,in=%d,out=%d,%s" % (str(layer.type)[10:], layer.num_inputs, layer.num_outputs, layer.name))
            for j in range(layer.num_inputs):
                tensor = layer.get_input(j)
                if tensor == None:
                    print("\tInput  %2d:" % j, "None")
                else:
                    print("\tInput  %2d:%s,%s,%s" % (j, tensor.shape, str(tensor.dtype)[9:], tensor.name))
            for j in range(layer.num_outputs):
                tensor = layer.get_output(j)
                if tensor == None:
                    print("\tOutput %2d:" % j, "None")
                else:
                    print("\tOutput %2d:%s,%s,%s" % (j, tensor.shape, str(tensor.dtype)[9:], tensor.name))
        '''
        engineString = builder.build_serialized_network(network, config)
        if engineString == None:
            print("Failed building engine!")
            exit()
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    # single inference on the grayscale test image
    context = engine.create_execution_context()
    context.set_binding_shape(0, [1, 1, 28, 28])
    _, stream = cudart.cudaStreamCreate()
    print("Binding0->", engine.get_binding_shape(0), context.get_binding_shape(0), engine.get_binding_dtype(0))
    print("Binding1->", engine.get_binding_shape(1), context.get_binding_shape(1), engine.get_binding_dtype(1))

    data = cv2.imread(inputImage, cv2.IMREAD_GRAYSCALE).astype(np.float32)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    print("inputH0 :", data.shape)
    #print(data)
    print("outputH0:", outputH0.shape)
    print(outputH0)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
    print("Succeeded running model in TensorRT!")
# NOTE(review): fragment — the enclosing function's header and the opening of the
# first print(...) call are outside this chunk; code kept verbatim. It allocates
# host/device buffers per binding, copies inputs H2D, runs execute_v2, copies
# outputs D2H, re-runs execute_v2 ten times (presumably for timing — confirm),
# then frees the device buffers.
engine.get_binding_name(i)) bufferH = [] bufferH.append(np.ascontiguousarray(data.reshape(-1))) for i in range(nInput, nInput + nOutput): bufferH.append( np.empty(context.get_binding_shape(i), dtype=trt.nptype(engine.get_binding_dtype(i)))) bufferD = [] for i in range(nInput + nOutput): bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1]) for i in range(nInput): cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) context.execute_v2(bufferD) for i in range(nInput, nInput + nOutput): cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) for i in range(10): context.execute_v2(bufferD) for i in range(nInput + nOutput): print(engine.get_binding_name(i)) for b in bufferD: cudart.cudaFree(b)
def run(useTimeCache):
    """Build an MNIST-style CNN engine, optionally reusing a serialized timing cache.

    Times builder.build_serialized_network with/without the cache, saves the cache
    on its first use, then runs one inference. Relies on module-level globals:
    timeCacheFile, data, and the imported trt/cudart/np/os/time names.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    timeCache = b""
    if useTimeCache and os.path.isfile(timeCacheFile):
        with open(timeCacheFile, 'rb') as f:
            timeCache = f.read()
        if timeCache == None:
            print("Failed getting serialized timing cache!")
            return
        print("Succeeded getting serialized timing cache!")

    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    config.max_workspace_size = 6 << 30
    if useTimeCache:
        # hand the deserialized cache to the builder (ignore_mismatch=False)
        cache = config.create_timing_cache(timeCache)
        config.set_timing_cache(cache, False)

    inputTensor = network.add_input('inputT0', trt.float32, [-1, 1, 28, 28])
    profile.set_shape(inputTensor.name, (1, 1, 28, 28), (4, 1, 28, 28), (8, 1, 28, 28))
    config.add_optimization_profile(profile)

    np.random.seed(97)  # keep the random weights identical across runs
    w = np.random.rand(32, 1, 5, 5).astype(np.float32).reshape(-1)
    b = np.random.rand(32).astype(np.float32).reshape(-1)
    _0 = network.add_convolution_nd(inputTensor, 32, [5, 5], w, b)
    _0.padding_nd = [2, 2]
    _1 = network.add_activation(_0.get_output(0), trt.ActivationType.RELU)
    _2 = network.add_pooling_nd(_1.get_output(0), trt.PoolingType.MAX, [2, 2])
    _2.stride_nd = [2, 2]
    w = np.random.rand(64, 32, 5, 5).astype(np.float32).reshape(-1)
    b = np.random.rand(64).astype(np.float32).reshape(-1)
    _3 = network.add_convolution_nd(_2.get_output(0), 64, [5, 5], w, b)
    _3.padding_nd = [2, 2]
    _4 = network.add_activation(_3.get_output(0), trt.ActivationType.RELU)
    _5 = network.add_pooling_nd(_4.get_output(0), trt.PoolingType.MAX, [2, 2])
    _5.stride_nd = [2, 2]
    _6 = network.add_shuffle(_5.get_output(0))  # NCHW -> flattened vector for the FC layers
    _6.first_transpose = (0, 2, 3, 1)
    _6.reshape_dims = (-1, 64 * 7 * 7, 1, 1)
    w = np.random.rand(1024, 64 * 7 * 7).astype(np.float32).reshape(-1)
    b = np.random.rand(1024).astype(np.float32).reshape(-1)
    _7 = network.add_fully_connected(_6.get_output(0), 1024, w, b)
    _8 = network.add_activation(_7.get_output(0), trt.ActivationType.RELU)
    w = np.random.rand(10, 1024).astype(np.float32).reshape(-1)
    b = np.random.rand(10).astype(np.float32).reshape(-1)
    _9 = network.add_fully_connected(_8.get_output(0), 10, w, b)
    _10 = network.add_activation(_9.get_output(0), trt.ActivationType.RELU)
    _11 = network.add_shuffle(_10.get_output(0))
    _11.reshape_dims = [-1, 10]
    _12 = network.add_softmax(_11.get_output(0))
    _12.axes = 1 << 1
    _13 = network.add_topk(_12.get_output(0), trt.TopKOperation.MAX, 1, 1 << 1)
    network.mark_output(_13.get_output(1))  # output index 1 of TopK = argmax indices

    # time the build — the point of the timing-cache demo
    t0 = time()
    engineString = builder.build_serialized_network(network, config)
    t1 = time()
    print("%s timing cache, %f ms" % ("With" if useTimeCache else "Without", (t1 - t0) * 1000))

    if useTimeCache and not os.path.isfile(timeCacheFile):
        # first cached run: persist the timing cache for later builds
        timeCache = config.get_timing_cache()
        timeCacheString = timeCache.serialize()
        with open(timeCacheFile, 'wb') as f:
            f.write(timeCacheString)
        print("Succeeded saving .cache file!")

    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [1, 1, 28, 28])
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput

    bufferH = []
    # NOTE(review): `data` is not defined in this function — presumably a
    # module-level input array; confirm against the surrounding file.
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(np.empty(context.get_binding_shape(i), dtype=trt.nptype(engine.get_binding_dtype(i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)
    for i in range(nInput, nInput + nOutput):
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    #for i in range(nInput + nOutput):
    #    print(engine.get_binding_name(i))
    #    print(bufferH[i].reshape(context.get_binding_shape(i)))
    for b in bufferD:
        cudart.cudaFree(b)
def run():
    """Benchmark a chain of nGEMM matrix multiplications with and without CUDA Graph.

    Builds (or loads) an engine whose network is inputList[0] @ inputList[1] @ ...,
    runs nInference iterations with plain stream launches, then captures the same
    H2D-copy / inference / D2H-copy sequence into a CUDA graph and replays it.
    Relies on module-level globals: trtFile, nGEMM, sizeGEMM, nInference.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine == None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30
        inputList = []
        for i in range(nGEMM + 1):
            inputT = network.add_input('inputT' + str(i), trt.float32, [-1, 4, sizeGEMM, sizeGEMM])
            profile.set_shape(inputT.name, (1, 4, sizeGEMM, sizeGEMM), (4, 4, sizeGEMM, sizeGEMM), (sizeGEMM, 4, sizeGEMM, sizeGEMM))
            inputList.append(inputT)
        config.add_optimization_profile(profile)
        # chain the nGEMM matrix multiplications
        tempTensor = inputList[0]
        for i in range(1, nGEMM + 1):
            tempLayer = network.add_matrix_multiply(tempTensor, trt.MatrixOperation.NONE, inputList[i], trt.MatrixOperation.NONE)
            tempTensor = tempLayer.get_output(0)
        network.mark_output(tempLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString == None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    for i in range(nGEMM + 1):
        context.set_binding_shape(i, [4, 4, sizeGEMM, sizeGEMM])
    stream = cudart.cudaStreamCreate()[1]
    bufferSize = [
        trt.volume(context.get_binding_shape(i)) * np.array([0], dtype=trt.nptype(engine.get_binding_dtype(i))).nbytes
        for i in range(engine.num_bindings)
    ]
    bufferH = []
    bufferD = []
    # pinned (write-combined) host memory so the async copies can be graph-captured
    for i in range(nGEMM + 2):
        bufferH.append(cudart.cudaHostAlloc(bufferSize[i], cudart.cudaHostAllocWriteCombined)[1])
        bufferD.append(cudart.cudaMallocAsync(bufferSize[i], stream)[1])

    # run WITHOUT CUDA Graph
    for i in range(nGEMM + 1):
        cudart.cudaMemcpyAsync(bufferD[i], bufferH[i], bufferSize[i], cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2(bufferD, stream)
    cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1], cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)
    for n in range(nInference):
        for i in range(nGEMM + 1):
            cudart.cudaMemcpyAsync(bufferD[i], bufferH[i], bufferSize[i], cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
        context.execute_async_v2(bufferD, stream)
        cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1], cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    # capture the CUDA Graph and run it
    cudart.cudaStreamBeginCapture(stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    for i in range(nGEMM + 1):
        cudart.cudaMemcpyAsync(bufferD[i], bufferH[i], bufferSize[i], cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2(bufferD, stream)
    cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1], cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    #cudart.cudaStreamSynchronize(stream)  # no synchronization inside the captured graph
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)
    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)
    for n in range(nInference):
        cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)

    for i in range(nGEMM + 2):
        cudart.cudaFree(bufferD[i])
    cudart.cudaStreamDestroy(stream)
def run():
    """Demonstrate CUDA Graph capture/replay with a dynamic-shape identity network.

    Runs one warm-up inference, captures the copy/infer/copy sequence into a CUDA
    graph and launches it; then changes the input binding shape, which requires
    one fresh inference and a re-capture before launching again.
    Relies on the module-level global trtFile.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine == None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30
        inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1])
        profile.set_shape(inputTensor.name, (1, 1, 1), (3, 4, 5), (6, 8, 10))
        config.add_optimization_profile(profile)
        identityLayer = network.add_identity(inputTensor)
        network.mark_output(identityLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString == None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, [3, 4, 5])
    _, stream = cudart.cudaStreamCreate()
    data = np.arange(3 * 4 * 5, dtype=np.float32).reshape(3, 4, 5)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    # one inference must run BEFORE capturing the CUDA graph
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    # capture the CUDA graph and run it
    cudart.cudaStreamBeginCapture(stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    #cudart.cudaStreamSynchronize(stream)  # no synchronization inside the captured graph
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)
    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)
    print("outputH0Big:", outputH0.shape)
    print(outputH0)

    # after the input shape changes: run one inference, re-capture the graph, then launch
    context.set_binding_shape(0, [2, 3, 4])
    # NOTE(review): data[:2 * 3 * 4] slices axis 0 of a (3, 4, 5) array, so this
    # yields all 60 values, not 24 — confirm whether data.reshape(-1)[:24] was meant.
    inputH0 = np.ascontiguousarray(-data[:2 * 3 * 4].reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)
    cudart.cudaStreamBeginCapture(stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)
    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)
    print("outputH0Small:", outputH0.shape)
    print(outputH0)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
def run(nRunTime):
    """Build a refittable single-convolution engine and optionally refit it by weight name.

    nRunTime == 0: run with the zero-filled placeholder weights baked at build
    time; otherwise refit "conv-w"/"conv-b" with the module-level `weight`/`bias`
    arrays before inference. Relies on module-level globals: trtFile, nIn, cIn,
    hIn, wIn, cOut, hW, wW, weight, bias, data.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine == None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        config = builder.create_builder_config()
        config.flags = 1 << int(trt.BuilderFlag.REFIT)  # required for later refitting
        inputT0 = network.add_input('inputT0', trt.float32, (nIn, cIn, hIn, wIn))
        # placeholder weights; real values are supplied later through the Refitter
        # NOTE(review): fakeWeight uses wW for both spatial dims while the layer
        # kernel is (hW, wW) — confirm hW == wW or whether this is a latent bug.
        fakeWeight = np.zeros([cOut, cIn, wW, wW], dtype=np.float32)
        fakeBias = np.zeros([cOut], dtype=np.float32)
        convolutionLayer = network.add_convolution_nd(inputT0, cOut, (hW, wW), fakeWeight, fakeBias)
        #convolutionLayer.name = 'conv'
        # name the weights so the Refitter can address them without knowing layer names
        network.set_weights_name(convolutionLayer.kernel, "conv-w")
        network.set_weights_name(convolutionLayer.bias, "conv-b")
        network.mark_output(convolutionLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString == None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    if nRunTime == 0:
        print("Do not refit!")
    else:
        print("Refit!")
        refitter = trt.Refitter(engine, logger)
        refitter.set_named_weights("conv-w", weight)
        refitter.set_named_weights("conv-b", bias)
        [missingLayer, weightRole] = refitter.get_missing()
        for layer, role in zip(missingLayer, weightRole):
            print("[", layer, "-", role, "]")
        if refitter.refit_cuda_engine() == False:
            print("Failed Refitting engine!")
            return

    context = engine.create_execution_context()
    _, stream = cudart.cudaStreamCreate()
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    print("data:", data.shape)
    print(data)
    print("outputH0:", outputH0.shape)
    print(outputH0)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
def run1(engine):
    """Time H2D copy, inference, D2H copy, and the full pipeline on a single stream.

    Each phase is warmed up with 10 iterations, then averaged over 30 timed
    iterations (synchronizing once at the end of the timed batch).
    Relies on module-level globals: nIn, cIn, hIn, wIn, time.
    """
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, cIn, hIn, wIn])
    _, stream = cudart.cudaStreamCreate()
    data = np.random.rand(nIn * cIn * hIn * wIn).astype(np.float32).reshape(nIn, cIn, hIn, wIn)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    # one complete inference pass (warm-up)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    # time the host-to-device copy
    for i in range(10):
        cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    trtTimeStart = time()
    for i in range(30):
        cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, DataCopyHtoD" % ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    # time the inference
    for i in range(10):
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    trtTimeStart = time()
    for i in range(30):
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, Inference" % ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    # time the device-to-host copy
    for i in range(10):
        cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    trtTimeStart = time()
    for i in range(30):
        cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, DataCopyDtoH" % ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    # time the whole pipeline (copy in + inference + copy out)
    for i in range(10):
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    trtTimeStart = time()
    for i in range(30):
        cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)
        cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, DataCopy + Inference" % ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
def run(shape):
    """Test the LayerNorm plugin at a given input shape against a CPU reference.

    Builds (or loads) a single-plugin engine whose last dimension is fixed by
    shape[2], runs one inference, and compares the result with layerNormCPU.
    Relies on module-level globals: soFile, epsilon, getLayerNormPlugin,
    layerNormCPU, check.
    """
    testCase = "<shape=%s>" % (shape)
    trtFile = "./model-%d.plan" % (shape[2])
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)  # load the plugin .so so the engine can find it
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engineStr = f.read()
            engine = trt.Runtime(logger).deserialize_cuda_engine(engineStr)
        if engine == None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        inputT0 = network.add_input('inputT0', trt.float32, [-1 for i in shape])
        profile.set_shape(inputT0.name, [1, 1, shape[2]], shape, shape)
        config.add_optimization_profile(profile)
        pluginLayer = network.add_plugin_v2([inputT0], getLayerNormPlugin(epsilon))
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString == None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, shape)
    #print("Binding all? %s"%(["No","Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    #for i in range(engine.num_bindings):
    #    print("Bind[%2d]:i[%d]->"%(i,i) if engine.binding_is_input(i) else "Bind[%2d]:o[%d]->"%(i,i-nInput),
    #          engine.get_binding_dtype(i),engine.get_binding_shape(i),context.get_binding_shape(i),engine.get_binding_name(i))

    bufferH = []
    bufferH.append(np.arange(np.prod(shape), dtype=np.float32).reshape(shape))
    for i in range(nOutput):
        bufferH.append(np.empty(context.get_binding_shape(nInput + i), dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)
    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i], bufferH[nInput + i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    # compare GPU plugin output against the CPU reference implementation
    outputCPU = layerNormCPU(bufferH[:nInput], epsilon)
    '''
    for i in range(nInput):
        printArrayInfo(bufferH[i])
    for i in range(nOutput):
        printArrayInfo(bufferH[nInput+i])
    for i in range(nOutput):
        printArrayInfo(outputCPU[i])
    '''
    check(bufferH[nInput:][0], outputCPU[0], True)

    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
# NOTE(review): fragment — cut at both ends. The head belongs to an encoder
# benchmark loop (scores a batch, writes a CSV-ish line, frees device buffers);
# the tail opens the decoder test (loads decoderPlanFile into an engine) and is
# truncated at the trailing `else:`. Kept verbatim; variables f, ioData,
# indexEncoderOutLens, batchSize, sequenceLength, timePerInference, check0,
# bufferH/bufferD, nInput/nOutput come from the missing context.
check1 = check( bufferH[indexEncoderOutLens], np.sum(ioData['encoder_out_lens'].astype(np.int32), axis=2)[:, 0], True) string = "%4d,%4d,%8.3f,%9.3e,%9.3e,%9.3e,%9.3e,%9.3e" % ( batchSize, sequenceLength, timePerInference, batchSize * sequenceLength / timePerInference * 1000, check0[1], check0[2], check1[1], check1[2]) print(string + ", %s" % ("Good" if check0[1] < 3.5e-2 and check0[2] < 2e-3 and check1[2] < 1e-1 else "Bad")) f.write(string + "\n") for i in range(nInput + nOutput): cudart.cudaFree(bufferD[i]) #------------------------------------------------------------------------------- print("Test Decoder Part!") with open(decoderScoreFile, 'w') as f: if os.path.isfile(decoderPlanFile): with open(decoderPlanFile, 'rb') as decoderF: engine = trt.Runtime(logger).deserialize_cuda_engine( decoderF.read()) if engine is None: print("Failed loading %s" % decoderPlanFile) exit() print("Succeeded loading %s" % decoderPlanFile) else:
def __del__(self):
    """Free the device buffer held in self.dIn when this object is collected.

    NOTE(review): relies on cudart still being importable/alive at interpreter
    shutdown — acceptable for an example script, fragile in a library.
    """
    cudart.cudaFree(self.dIn)
def test_tf_nn_conv2d():
    """Compare tf.nn.conv2d against an equivalent TensorRT convolution.

    Runs the TF op on inputData (NHWC), saves its weights to an .npz file, then
    rebuilds the same convolution in TensorRT (transposing to NCHW and back) and
    checks the two outputs agree. Relies on module-level globals: nIn, cIn, hIn,
    wIn, cOut, hW, wW, inputData, printArray, check.
    """
    print("\ntf.nn.conv2d ------------------------------------------------------")
    # TensorFlow part ----------------------------------------------------------
    x = tf.compat.v1.placeholder(tf.float32, [None, hIn, wIn, cIn], name='x')
    weight = tf.compat.v1.get_variable('w1', shape=[hW, wW, cIn, cOut], initializer=tf.truncated_normal_initializer(mean=0, stddev=0.1))
    y = tf.nn.conv2d( \
        x,
        filter=weight,
        strides=None,
        padding='SAME',
        use_cudnn_on_gpu=True,
        data_format='NHWC',
        dilations=[1, 1, 1, 1],
        name='y',
        filters=None
    )
    tfConfig = tf.compat.v1.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.compat.v1.Session(config=tfConfig)
    sess.run(tf.compat.v1.global_variables_initializer())
    outputTF = sess.run(y, feed_dict={x: inputData})

    tfPara = {}  # save the TF weights for the TensorRT build below
    print("Weight:")
    for i in tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES):
        name, value = i.name, sess.run(i)
        print(name, value.shape)
        tfPara[name] = value
    np.savez("para_tf_nn_conv2d.npz", **tfPara)
    sess.close()

    # TensorRT part ------------------------------------------------------------
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    inputT0 = network.add_input('inputT0', trt.float32, (-1, -1, -1, cIn))
    # the profile range only needs to cover the shapes used later
    profile.set_shape(inputT0.name, (1, 1, 1, cIn), (nIn, hIn, wIn, cIn), (nIn * 2, hIn * 2, wIn * 2, cIn))
    config.add_optimization_profile(profile)

    _h1 = network.add_shuffle(inputT0)  # NHWC to NCHW
    _h1.first_transpose = (0, 3, 1, 2)
    # load the saved TF weights, reordered HWIO -> OIHW for TensorRT
    weight = np.load('./para_tf_nn_conv2d.npz')['w1:0'].transpose(3, 2, 0, 1).reshape(-1)
    _h2 = network.add_convolution_nd(_h1.get_output(0), cOut, [hW, wW], weight, None)
    _h2.padding_nd = (2, 2)
    _h3 = network.add_shuffle(_h2.get_output(0))  # NCHW to NHWC, matching the TF model
    _h3.first_transpose = (0, 2, 3, 1)
    network.mark_output(_h3.get_output(0))
    engineString = builder.build_serialized_network(network, config)
    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, hIn, wIn, cIn])
    _, stream = cudart.cudaStreamCreate()

    inputH0 = np.ascontiguousarray(inputData.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    printArray(inputData, "input")
    #print(inputData)
    printArray(outputTF, "TF output")
    #print(outputTF)
    printArray(outputH0, "TRT output")
    #print(outputH0)
    check(outputTF, outputH0, True)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
def run(shape, scalar):
    """Test the AddScalar plugin in implicit-batch mode against a CPU reference.

    Builds (or loads) a one-plugin engine using the legacy implicit-batch API
    (create_network() with no flags, max_batch_size, context.execute), runs one
    inference with shape[0] as the batch size, and compares with addScalarCPU.
    Relies on module-level globals: soFile, getAddScalarPlugin, addScalarCPU, check.
    """
    testCase = "<shape=%s,scalar=%f>" % (shape, scalar)
    trtFile = "./model-Shape[%s].plan" % ("".join([str(i) + "-" for i in shape[:-1]]) + str(shape[-1]))
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)  # load the plugin .so so the engine can find it
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine == None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        builder.max_batch_size = 32  # implicit-batch mode: batch bound set on the builder
        network = builder.create_network()  # no EXPLICIT_BATCH flag -> implicit batch
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        inputT0 = network.add_input('inputT0', trt.float32, shape[1:])  # per-sample shape, batch dim excluded
        pluginLayer = network.add_plugin_v2([inputT0], getAddScalarPlugin(scalar))
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString == None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    #print("Binding all? %s"%(["No","Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    #for i in range(engine.num_bindings):
    #    print("Bind[%2d]:i[%d]->"%(i,i) if engine.binding_is_input(i) else "Bind[%2d]:o[%d]->"%(i,i-nInput),
    #          engine.get_binding_dtype(i),engine.get_binding_shape(i),context.get_binding_shape(i),engine.get_binding_name(i))

    bufferH = []
    bufferH.append(np.arange(np.prod(shape), dtype=np.float32).reshape(shape))
    for i in range(nOutput):
        # implicit batch: binding shapes exclude the batch dim, so prepend shape[0]
        bufferH.append(np.empty((shape[0], ) + tuple(context.get_binding_shape(nInput + i)), dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute(shape[0], bufferD)  # legacy implicit-batch execution with explicit batch size
    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i], bufferH[nInput + i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    # compare GPU plugin output against the CPU reference implementation
    outputCPU = addScalarCPU(bufferH[:nInput], scalar)
    '''
    for i in range(nInput):
        printArrayInfo(bufferH[i])
    for i in range(nOutput):
        printArrayInfo(bufferH[nInput+i])
    for i in range(nOutput):
        printArrayInfo(outputCPU[i])
    '''
    check(bufferH[nInput:][0], outputCPU[0], True)

    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
def run(shape0, shape1, scalar):
    """Exercise the AddScalar plugin through two optimization profiles.

    With two profiles the engine exposes four bindings laid out per profile:
    profile 0 owns bindings (0: input, 1: output), profile 1 owns (2: input,
    3: output); the inactive profile's slots are passed as null pointers.

    Args:
        shape0: input shape run through profile 0.
        shape1: input shape run through profile 1 (same rank as shape0).
        scalar: scalar value handed to the plugin.
    """
    testCase = "<shape0:%s,shape1:%s,scalar=%f>" % (shape0, shape1, scalar)
    trtFile = "./model-Dims" + str(len(shape0)) + ".plan"
    print("\nTest", testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:  # was `== None`
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile0 = builder.create_optimization_profile()
        profile1 = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        config.flags = 1 << int(trt.BuilderFlag.FP16)  # comment this line out and the plugin runs in FP32 only
        inputT0 = network.add_input('inputT0', trt.float32, [-1 for i in shape0])
        profile0.set_shape(inputT0.name, [1 for i in shape0], [8 for i in shape0], [32 for i in shape0])
        config.add_optimization_profile(profile0)
        profile1.set_shape(inputT0.name, [1 for i in shape1], [8 for i in shape1], [32 for i in shape1])
        config.add_optimization_profile(profile1)
        pluginLayer = network.add_plugin_v2([inputT0], getAddScalarPlugin(scalar))
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:  # was `== None`
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    stream = 0  # default CUDA stream
    cudart.cudaStreamSynchronize(stream)

    # ---- profile 0: bindings 0 (input) / 1 (output) ----
    print("Use Profile 0")
    context.set_optimization_profile_async(0, stream)
    cudart.cudaStreamSynchronize(stream)
    #context.active_optimization_profile = 0  # equivalent (deprecated) way to select the profile without a stream
    context.set_binding_shape(0, shape0)
    print("Context binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
    for i in range(engine.num_bindings):
        print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    data = np.random.rand(np.prod(shape0)).reshape(shape0).astype(np.float32) * 2 - 1
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMalloc(inputH0.nbytes)
    _, outputD0 = cudart.cudaMalloc(outputH0.nbytes)
    cudart.cudaMemcpy(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    print("before inference")
    context.execute_v2([int(inputD0), int(outputD0), int(0), int(0)])  # nullptr for the inactive profile's bindings
    print("after inference")
    cudart.cudaMemcpy(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    # ---- profile 1: bindings 2 (input) / 3 (output) ----
    print("Use Profile 1")
    context.set_optimization_profile_async(1, stream)
    cudart.cudaStreamSynchronize(stream)
    #context.active_optimization_profile = 1  # equivalent (deprecated) way to select the profile without a stream
    context.set_binding_shape(2, shape1)
    print("Context binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
    for i in range(engine.num_bindings):
        print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    data = np.random.rand(np.prod(shape1)).reshape(shape1).astype(np.float32) * 2 - 1
    inputH1 = np.ascontiguousarray(data.reshape(-1))
    # BUGFIX: size the host output from binding 3 (profile 1's OUTPUT); the
    # original used binding 2 (the input), which only worked because this
    # plugin happens to preserve shape and dtype.
    outputH1 = np.empty(context.get_binding_shape(3), dtype=trt.nptype(engine.get_binding_dtype(3)))
    _, inputD1 = cudart.cudaMalloc(inputH1.nbytes)
    _, outputD1 = cudart.cudaMalloc(outputH1.nbytes)
    cudart.cudaMemcpy(inputD1, inputH1.ctypes.data, inputH1.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    print("before inference")
    context.execute_v2([int(0), int(0), int(inputD1), int(outputD1)])
    print("after inference")
    cudart.cudaMemcpy(outputH1.ctypes.data, outputD1, outputH1.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    cudart.cudaFree(inputD0)
    cudart.cudaFree(inputD1)
    cudart.cudaFree(outputD0)
    cudart.cudaFree(outputD1)
def run(shape, scalar):
    """Build (or reload) an INT8 engine around the AddScalar plugin using a
    calibrator, run it on random data in [-100, 100), and compare against the
    CPU reference.

    Args:
        shape: full input shape (also used as the calibration shape).
        scalar: scalar value handed to the plugin.
    """
    testCase = "<shape=%s,scalar=%f>" % (shape, scalar)
    trtFile = "./model-Dim%s.plan" % str(len(shape))
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:  # was `== None`
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        config.flags = 1 << int(trt.BuilderFlag.INT8)
        config.int8_calibrator = MyCalibrator(1, shape, cacheFile)
        inputT0 = network.add_input('inputT0', trt.float32, [-1 for i in shape])
        profile.set_shape(inputT0.name, [1 for i in shape], [8 for i in shape], [32 for i in shape])
        config.add_optimization_profile(profile)
        #inputT0.dynamic_range = [-100,100]  # set dynamic range manually when no calibrator is used
        pluginLayer = network.add_plugin_v2([inputT0], getAddScalarPlugin(scalar))
        # force the plugin itself to run in INT8
        pluginLayer.precision = trt.int8
        pluginLayer.set_output_type(0, trt.int8)
        pluginLayer.get_output(0).dtype = trt.int8
        #pluginLayer.get_output(0).dynamic_range = [-120,120]
        # cast back to float32 so the caller need not handle INT8 output
        identityLayer = network.add_identity(pluginLayer.get_output(0))
        identityLayer.get_output(0).dtype = trt.float32
        network.mark_output(identityLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:  # was `== None`
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, shape)
    #print("Binding all? %s"%(["No","Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    #for i in range(engine.num_bindings):
    #    print("Bind[%2d]:i[%d]->"%(i,i) if engine.binding_is_input(i) else "Bind[%2d]:o[%d]->"%(i,i-nInput),
    #          engine.get_binding_dtype(i),engine.get_binding_shape(i),context.get_binding_shape(i),engine.get_binding_name(i))
    bufferH = []
    bufferH.append(np.random.rand(np.prod(shape)).astype(np.float32).reshape(shape) * 200 - 100)
    for i in range(nOutput):
        bufferH.append(np.empty(context.get_binding_shape(nInput + i), dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)
    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i], bufferH[nInput + i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    outputCPU = addScalarCPU(bufferH[:nInput], scalar)
    '''
    for i in range(nInput):
        printArrayInfo(bufferH[i])
    for i in range(nOutput):
        printArrayInfo(bufferH[nInput+i])
    for i in range(nOutput):
        printArrayInfo(outputCPU[i])
    '''
    check(bufferH[nInput:][0], outputCPU[0], True)
    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
def test(engine, context, nBatchSize):
    """Benchmark execute_v2 for one batch size.

    When the engine has two optimization profiles, profile 0 serves batches
    <= 4 (bindings 0/1) and profile 1 serves larger batches (bindings 2/3);
    the inactive profile's binding slots are filled with null pointers.

    Args:
        engine: deserialized ICudaEngine.
        context: execution context created from `engine`.
        nBatchSize: batch size to time (uses module-level nWarm / nTest).
    """
    nProfile = engine.num_optimization_profiles
    if nProfile == 1:
        bindingBias = 0
    else:
        if nBatchSize <= 4:
            bindingBias = 0
            context.set_optimization_profile_async(0, 0)
            cudart.cudaStreamSynchronize(0)
        else:
            bindingBias = 2  # profile 1 owns bindings 2 (input) and 3 (output)
            context.set_optimization_profile_async(1, 0)
            cudart.cudaStreamSynchronize(0)
    context.set_binding_shape(bindingBias, [nBatchSize, 1])
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    for i in range(nInput):
        print("Bind[%2d]:i[%2d]->" % (i, i), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    for i in range(nInput, nInput + nOutput):
        print("Bind[%2d]:o[%2d]->" % (i, i - nInput), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    nInput = nInput // nProfile  # from here on the counts are per-profile
    nOutput = nOutput // nProfile
    data = np.random.rand(nBatchSize).reshape(nBatchSize, 1).astype(np.float32)
    bufferH = []
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(np.empty(context.get_binding_shape(bindingBias + i), dtype=trt.nptype(engine.get_binding_dtype(bindingBias + i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    # pad bufferD with null pointers for the inactive profile's bindings
    if nProfile == 1 or nBatchSize <= 4:
        bufferD = bufferD + [int(0), int(0)]
    else:
        bufferD = [int(0), int(0)] + bufferD
    for i in range(nInput):
        # BUGFIX: index bufferD by bindingBias + i. The original used
        # bufferD[i], which is a null placeholder when profile 1 is active
        # (bindingBias == 2), so the input never reached the device.
        cudart.cudaMemcpy(bufferD[bindingBias + i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)
    for i in range(nInput, nInput + nOutput):
        # BUGFIX: same bindingBias offset for the device-to-host copy
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[bindingBias + i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    for i in range(nWarm):  # warm-up runs, excluded from timing
        context.execute_v2(bufferD)
    t0 = time_ns()
    for i in range(nTest):
        context.execute_v2(bufferD)
    t1 = time_ns()
    print("+---- BatchSize=%2d: %.4fms\n" % (nBatchSize, (t1 - t0) / 1e6 / nTest))
    # strip the null padding so only real allocations are freed
    if nProfile == 1 or nBatchSize <= 4:
        bufferD = bufferD[:2]
    else:
        bufferD = bufferD[-2:]
    for b in bufferD:
        cudart.cudaFree(b)
def run():
    """Build (or reload) an engine around the LayerNorm plugin for a
    (nBS, nSL, nEmbedding) input in FP16 or FP32 (per module-level
    npDataType), run it, and compare against layerNormCPU."""
    testCase = "%d-%d-%d-fp%s" % (nBS, nSL, nEmbedding, '16' if npDataType == np.float16 else '32')
    print("Test <%s>" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    trtFile = "./model-" + testCase + ".plan"
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engineStr = f.read()
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineStr)
        if engine is None:  # was `== None`
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << 0)  # EXPLICIT_BATCH
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        config.flags = 1 << int(trt.BuilderFlag.FP16) if npDataType == np.float16 else 0
        inputTensorList = []
        trtDataType = trt.float16 if npDataType == np.float16 else trt.float32
        inputTensorList.append(network.add_input('inputT', trtDataType, [-1, -1, -1]))
        profile = builder.create_optimization_profile()
        profile.set_shape('inputT', [1, 1, nEmbedding], [nBS, nSL, nEmbedding], [nBS * 2, nSL * 2, nEmbedding])
        config.add_optimization_profile(profile)
        pluginLayer = network.add_plugin_v2(inputTensorList, getLayerNormPlugin())
        pluginLayer.get_output(0).dtype = trtDataType
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:  # added: fail fast instead of crashing in deserialize (consistent with siblings)
            print("Failed building engine!")
            exit()
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nBS, nSL, nEmbedding])
    print("Binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    for i in range(engine.num_bindings):
        print("input ->" if engine.binding_is_input(i) else "output->", engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i))
    bufferH = []
    bufferH.append(np.random.rand(nBS, nSL, nEmbedding).astype(np.float32).reshape(nBS, nSL, nEmbedding) * 2 - 1)
    bufferH.append(np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)
    for i in range(nInput, nInput + nOutput):
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    resCPU = layerNormCPU(bufferH, epsilon)[-1]
    #printArrayInfo(resCPU)
    #printArrayInfo(bufferH[-1])
    check(bufferH[-1], resCPU, True)
    for b in bufferD:
        cudart.cudaFree(b)
    print("Test <%s> finish!" % testCase)
def run():
    """Minimal TensorRT dynamic-shape example: build (or load) an identity
    network, run it on a [3, 4, 5] input, and print every binding's data."""
    logger = trt.Logger(trt.Logger.ERROR)  # available levels: VERBOSE, INFO, WARNING, ERROR, INTERNAL_ERROR
    if os.path.isfile(trtFile):  # a .plan file already exists: read it directly
        with open(trtFile, 'rb') as f:
            engineString = f.read()
        if engineString is None:  # was `== None`
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
    else:  # no .plan file: build from scratch
        builder = trt.Builder(logger)
        # network metadata: Builder / Network / BuilderConfig / Profile
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30
        inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1])  # declare the input tensor
        profile.set_shape(inputTensor.name, [1, 1, 1], [3, 4, 5], [6, 8, 10])  # dynamic-shape range: min / opt / max
        config.add_optimization_profile(profile)
        identityLayer = network.add_identity(inputTensor)  # identity transform
        network.mark_output(identityLayer.get_output(0))  # mark the output tensor
        engineString = builder.build_serialized_network(network, config)  # build the serialized network
        if engineString is None:  # was `== None`
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
        with open(trtFile, 'wb') as f:  # save the serialized network as a .plan file
            f.write(engineString)
        print("Succeeded saving .plan file!")
    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)  # create the engine with a Runtime
    if engine is None:  # was `== None`
        print("Failed building engine!")
        return
    print("Succeeded building engine!")
    context = engine.create_execution_context()  # execution context (analogous to a process on the GPU)
    context.set_binding_shape(0, [3, 4, 5])  # dynamic-shape mode requires binding a concrete input shape
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])  # inspect engine bindings
    nOutput = engine.num_bindings - nInput
    for i in range(nInput):
        print("Bind[%2d]:i[%2d]->" % (i, i), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    for i in range(nInput, nInput + nOutput):
        print("Bind[%2d]:o[%2d]->" % (i, i - nInput), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    data = np.arange(3 * 4 * 5, dtype=np.float32).reshape(3, 4, 5)  # prepare data plus host/device buffers
    bufferH = []
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(np.empty(context.get_binding_shape(i), dtype=trt.nptype(engine.get_binding_dtype(i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):  # copy host data to the device first
        cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)  # run inference
    for i in range(nInput, nInput + nOutput):  # copy results from device back to host
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    for i in range(nInput + nOutput):
        print(engine.get_binding_name(i))
        print(bufferH[i].reshape(context.get_binding_shape(i)))
    for b in bufferD:  # free device memory
        cudart.cudaFree(b)
# Fragment: tail of a quantize/dequantize (QDQ) sample; `network`, `convQLayer`,
# `qTensor`, `builder`, `config`, `logger` and `data` are defined earlier in the
# file (not visible here).
convQDQLayer = network.add_dequantize(convQLayer.get_output(0), qTensor)  # dequantize the conv output back to float using scale tensor qTensor
convQDQLayer.axis = 0
network.mark_output(convQDQLayer.get_output(0))
engineString = builder.build_serialized_network(network, config)
engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
context = engine.create_execution_context()
_, stream = cudart.cudaStreamCreate()
# host buffers: flattened input, output sized from binding 1
inputH0 = np.ascontiguousarray(data.reshape(-1))
outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
# device buffers allocated asynchronously on the stream
_, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
_, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)
cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
context.execute_async_v2([int(inputD0), int(outputD0)], stream)
cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
cudart.cudaStreamSynchronize(stream)  # wait for the async copies and the inference to finish
print("inputH0 :", data.shape)
print(data)
print("outputH0:", outputH0.shape)
print(outputH0)
cudart.cudaStreamDestroy(stream)
cudart.cudaFree(inputD0)
cudart.cudaFree(outputD0)
def test_tf_nn_linalg_matmul():
    """Run tf.linalg.matmul in TensorFlow, rebuild the equivalent computation
    in TensorRT with a FullyConnected layer, and compare the two outputs."""
    print("\ntf.nn.linalg.matmul -----------------------------------------------")
    # TensorFlow part ----------------------------------------------------------
    x = tf.compat.v1.placeholder(tf.float32, [None, hIn, wIn, cIn], name='x')
    weight = tf.compat.v1.get_variable('w1', shape=[hIn * wIn * cIn, cOut], initializer=tf.truncated_normal_initializer(mean=0, stddev=0.1))
    _h1 = tf.reshape(x, [-1, hIn * wIn * cIn])
    y = tf.linalg.matmul( \
        _h1,
        weight,
        transpose_a=False,
        transpose_b=False,
        adjoint_a=False,
        adjoint_b=False,
        a_is_sparse=False,
        b_is_sparse=False,
        name='y'
    )
    tfConfig = tf.compat.v1.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.compat.v1.Session(config=tfConfig)
    sess.run(tf.compat.v1.global_variables_initializer())
    outputTF = sess.run(y, feed_dict={x: inputData})
    tfPara = {}  # save the weights so the TRT part can reuse them
    print("Weight:")
    for i in tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES):
        name, value = i.name, sess.run(i)
        print(name, value.shape)
        tfPara[name] = value
    np.savez("para_tf_nn_linalg_matmul.npz", **tfPara)
    sess.close()
    # TensorRT part ------------------------------------------------------------
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    inputT0 = network.add_input('inputT0', trt.float32, (-1, hIn, wIn, cIn))
    profile.set_shape(inputT0.name, (1, hIn, wIn, cIn), (nIn, hIn, wIn, cIn), (nIn * 2, hIn, wIn, cIn))  # the range only needs to cover the shapes used below
    config.add_optimization_profile(profile)
    weight = np.load('./para_tf_nn_linalg_matmul.npz')['w1:0'].transpose(1, 0).reshape(-1)  # load the saved weight (transposed for FullyConnected layout)
    _h1 = network.add_fully_connected(inputT0, cOut, weight, None)
    _h2 = network.add_shape(_h1.get_output(0))  # strip the trailing (1, 1) dims so the shape matches the TF output
    _h3 = network.add_slice(_h2.get_output(0), [0], [2], [1])
    _h4 = network.add_shuffle(_h1.get_output(0))
    _h4.set_input(1, _h3.get_output(0))  # reshape driven by the sliced shape tensor
    network.mark_output(_h4.get_output(0))
    engineString = builder.build_serialized_network(network, config)
    if engineString is None:  # added: fail fast with a clear message instead of crashing in deserialize
        print("Failed building engine!")
        return
    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, hIn, wIn, cIn])
    _, stream = cudart.cudaStreamCreate()
    inputH0 = np.ascontiguousarray(inputData.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)
    printArray(inputData, "input")
    #print(inputData)
    printArray(outputTF, "TF output")
    #print(outputTF)
    printArray(outputH0, "TRT output")
    #print(outputH0)
    check(outputTF, outputH0, True)
    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)