def __init__(self, calibrationCount, inputShape, cacheFile):
    """INT8 entropy calibrator state for a fixed input shape.

    Args:
        calibrationCount: number of calibration batches to serve (presumably
            consumed by get_batch() — defined elsewhere in the class).
        inputShape: fixed shape of one calibration batch.
        cacheFile: path of the calibration cache file to read/write.
    """
    trt.IInt8EntropyCalibrator2.__init__(self)
    self.calibrationCount = calibrationCount
    self.shape = inputShape
    # Bytes needed for one float32 batch of `inputShape`.
    # NOTE(review): "buffeSize" looks like a typo for "bufferSize", but other
    # methods of this class may read it under this name, so it is kept.
    self.buffeSize = trt.volume(inputShape) * trt.float32.itemsize
    self.cacheFile = cacheFile
    # Device buffer handed to TensorRT during calibration; cudaMalloc returns
    # (status, pointer), status is discarded here.
    _, self.dIn = cudart.cudaMalloc(self.buffeSize)
    # Number of batches served so far.
    self.count = 0
def __init__(self, calibrationDataPath, calibrationCount, inputShape, cacheFile):
    """INT8 entropy calibrator fed from .jpg images on disk.

    Args:
        calibrationDataPath: path prefix searched for "*.jpg"; at most the
            first 100 matches are kept.
        calibrationCount: number of calibration batches to serve.
        inputShape: shape of one batch, (N,C,H,W).
        cacheFile: path of the calibration cache file to read/write.
    """
    trt.IInt8EntropyCalibrator2.__init__(self)
    # Cap the calibration set at 100 images.
    self.imageList = glob(calibrationDataPath + "*.jpg")[:100]
    self.calibrationCount = calibrationCount
    self.shape = inputShape  # (N,C,H,W)
    # Bytes for one float32 batch ("buffeSize" spelling kept — sibling methods
    # may reference it under this name).
    self.buffeSize = trt.volume(inputShape) * trt.float32.itemsize
    self.cacheFile = cacheFile
    # Device buffer for calibration batches; cudaMalloc status is discarded.
    _, self.dIn = cudart.cudaMalloc(self.buffeSize)
    # Generator producing one batch at a time (batchGenerator defined on the class).
    self.oneBatch = self.batchGenerator()
    # Debug print of the device pointer value.
    print(int(self.dIn))
def run(shape):
    """Build (or load a cached) engine around the LayerNorm plugin, run it on
    a ramp input of `shape`, and compare the GPU output with the CPU reference.

    Args:
        shape: runtime input shape (rank 3 expected — the profile's min shape
            is [1, 1, shape[2]]); shape[2] also selects the plan-file name.

    Uses module globals: soFile, epsilon, getLayerNormPlugin, layerNormCPU, check.
    """
    testCase = "<shape=%s>" % (shape)
    trtFile = "./model-%d.plan" % (shape[2])
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    # Load the plugin .so so its creator registers with the plugin registry.
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        # Reuse a previously serialized engine.
        with open(trtFile, 'rb') as f:
            engineStr = f.read()
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineStr)
        # Fix: "is None" instead of "== None", and return instead of exit()
        # for consistency with the build branch and the sibling run() tests.
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        # Fully dynamic input of the same rank as `shape`.
        inputT0 = network.add_input('inputT0', trt.float32, [-1 for i in shape])
        # min fixes the leading dims at 1; opt == max == the tested shape.
        profile.set_shape(inputT0.name, [1, 1, shape[2]], shape, shape)
        config.add_optimization_profile(profile)
        pluginLayer = network.add_plugin_v2([inputT0], getLayerNormPlugin(epsilon))
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        # Persist the plan so the cache branch above is taken next time.
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, shape)
    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    # Host buffers: one ramp input, then one empty array per output binding.
    bufferH = []
    bufferH.append(np.arange(np.prod(shape), dtype=np.float32).reshape(shape))
    for i in range(nOutput):
        bufferH.append(
            np.empty(context.get_binding_shape(nInput + i),
                     dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    # Matching device buffers.
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):
        cudart.cudaMemcpy(
            bufferD[i],
            np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data,
            bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)
    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i],
                          bufferH[nInput + i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    # CPU reference for comparison.
    outputCPU = layerNormCPU(bufferH[:nInput], epsilon)
    check(bufferH[nInput:][0], outputCPU[0], True)
    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
def run():
    """Build (or load a cached) FP32/FP16 engine around the LayerNorm plugin,
    run it on random data [nBS, nSL, nEmbedding], and check against the CPU
    reference.

    Uses module globals: nBS, nSL, nEmbedding, npDataType, soFilePath,
    epsilon, getLayerNormPlugin, layerNormCPU, check.
    """
    testCase = "%d-%d-%d-fp%s" % (nBS, nSL, nEmbedding, '16' if int(npDataType == np.float16) else '32')
    print("Test <%s>" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)
    trtFile = "./model-" + testCase + ".plan"
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engineStr = f.read()
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineStr)
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        # 1 << 0 is the EXPLICIT_BATCH creation flag (spelled out explicitly
        # in the other run() functions of this file).
        network = builder.create_network(1 << 0)
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        config.flags = 1 << int(trt.BuilderFlag.FP16) if int(npDataType == np.float16) else 0
        inputTensorList = []
        trtDataType = trt.float16 if int(npDataType == np.float16) else trt.float32
        inputTensorList.append(network.add_input('inputT', trtDataType, [-1, -1, -1]))
        profile = builder.create_optimization_profile()
        profile.set_shape('inputT', [1, 1, nEmbedding], [nBS, nSL, nEmbedding], [nBS * 2, nSL * 2, nEmbedding])
        config.add_optimization_profile(profile)
        pluginLayer = network.add_plugin_v2(inputTensorList, getLayerNormPlugin())
        pluginLayer.get_output(0).dtype = trtDataType
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        # Fix: the original passed engineString straight to
        # deserialize_cuda_engine with no failure check, and never wrote the
        # plan file, so the cache branch above could never be taken.
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nBS, nSL, nEmbedding])
    print("Binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    for i in range(engine.num_bindings):
        print("input ->" if engine.binding_is_input(i) else "output->", engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i))
    # Host buffers: random input in [-1, 1) and an empty output array.
    bufferH = []
    bufferH.append(np.random.rand(nBS, nSL, nEmbedding).astype(np.float32).reshape(nBS, nSL, nEmbedding) * 2 - 1)
    bufferH.append(np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)
    for i in range(nInput, nInput + nOutput):
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    # CPU reference; layerNormCPU returns a sequence, last element is the output.
    resCPU = layerNormCPU(bufferH, epsilon)[-1]
    check(bufferH[-1], resCPU, True)
    for b in bufferD:
        cudart.cudaFree(b)
    print("Test <%s> finish!" % testCase)
context = engine.create_execution_context()
stream = 0  # use the default CUDA stream
# --- Run with Profile 0 ---
print("Use Profile 0")
context.set_optimization_profile_async(0, stream)
cudart.cudaStreamSynchronize(stream)
#context.active_optimization_profile = 0  # equivalent way to select the profile without a stream, but deprecated
# Bindings 0/1 belong to profile 0; bind the concrete input shape.
context.set_binding_shape(0, [nIn, cIn, hIn, wIn])
print("Context binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
for i in range(engine.num_bindings):
    print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
inputH0 = np.ascontiguousarray(data.reshape(-1))
outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
_, inputD0 = cudart.cudaMalloc(inputH0.nbytes)
_, outputD0 = cudart.cudaMalloc(outputH0.nbytes)
cudart.cudaMemcpy(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
# Profile 1's binding slots (2/3) are inactive here and passed as null pointers.
context.execute_v2([int(inputD0), int(outputD0), int(0), int(0)])
cudart.cudaMemcpy(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
# The network presumably negates its input (output compared against -input) — confirm.
print("check result:", np.all(outputH0 == -inputH0.reshape(nIn, cIn, hIn, wIn)))
# --- Run with Profile 1 ---
print("Use Profile 1")
context.set_optimization_profile_async(1, stream)
cudart.cudaStreamSynchronize(stream)
#context.active_optimization_profile = 1  # equivalent way to select the profile without a stream, but deprecated
# Bindings 2/3 belong to profile 1.
context.set_binding_shape(2, [nIn, cIn, hIn, wIn])
print("Context binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
# NOTE(review): the source is truncated here — the body of this loop is not
# visible in this chunk.
for i in range(engine.num_bindings):
def run(shape, scalar):
    """Build (or load a cached) INT8 engine around the AddScalar plugin,
    run it on random data in [-100, 100), and compare with the CPU reference.

    Args:
        shape: runtime input shape; its rank selects the plan-file name.
        scalar: the scalar added by the plugin.

    Uses module globals: soFile, cacheFile, MyCalibrator, getAddScalarPlugin,
    addScalarCPU, check.
    """
    testCase = "<shape=%s,scalar=%f>" % (shape, scalar)
    trtFile = "./model-Dim%s.plan" % str(len(shape))
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    # Load the plugin .so so its creator registers with the plugin registry.
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        # Reuse a previously serialized engine.
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine == None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        # Enable INT8 and attach the calibrator (1 calibration batch).
        config.flags = 1 << int(trt.BuilderFlag.INT8)
        config.int8_calibrator = MyCalibrator(1, shape, cacheFile)
        inputT0 = network.add_input('inputT0', trt.float32, [-1 for i in shape])
        profile.set_shape(inputT0.name, [1 for i in shape], [8 for i in shape],
                          [32 for i in shape])
        config.add_optimization_profile(profile)
        #inputT0.dynamic_range = [-100,100]  # when not using a calibrator, dynamic ranges must be set manually
        pluginLayer = network.add_plugin_v2([inputT0], getAddScalarPlugin(scalar))
        # Force the plugin to run in INT8 with an INT8 output.
        pluginLayer.precision = trt.int8
        pluginLayer.set_output_type(0, trt.int8)
        pluginLayer.get_output(0).dtype = trt.int8
        #pluginLayer.get_output(0).dynamic_range = [-120,120]
        identityLayer = network.add_identity(
            pluginLayer.get_output(0))  # convert back to float32 by hand; otherwise the caller must deal with the INT8 output itself
        identityLayer.get_output(0).dtype = trt.float32
        network.mark_output(identityLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString == None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, shape)
    #print("Binding all? %s"%(["No","Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    #for i in range(engine.num_bindings):
    #    print("Bind[%2d]:i[%d]->"%(i,i) if engine.binding_is_input(i) else "Bind[%2d]:o[%d]->"%(i,i-nInput),
    #          engine.get_binding_dtype(i),engine.get_binding_shape(i),context.get_binding_shape(i),engine.get_binding_name(i))
    # Host buffers: random input in [-100, 100), then one array per output.
    bufferH = []
    bufferH.append(
        np.random.rand(np.prod(shape)).astype(np.float32).reshape(shape) * 200 - 100)
    for i in range(nOutput):
        bufferH.append(
            np.empty(context.get_binding_shape(nInput + i),
                     dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):
        cudart.cudaMemcpy(
            bufferD[i],
            np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data,
            bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)
    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i],
                          bufferH[nInput + i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    # CPU reference for comparison.
    outputCPU = addScalarCPU(bufferH[:nInput], scalar)
    '''
    for i in range(nInput):
        printArrayInfo(bufferH[i])
    for i in range(nOutput):
        printArrayInfo(bufferH[nInput+i])
    for i in range(nOutput):
        printArrayInfo(outputCPU[i])
    '''
    check(bufferH[nInput:][0], outputCPU[0], True)
    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i)) for i in range(nInput, nInput + nOutput): print("Bind[%2d]:o[%2d]->" % (i, i - nInput), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i)) bufferH = [] bufferH.append(np.ascontiguousarray(data.reshape(-1))) for i in range(nInput, nInput + nOutput): bufferH.append( np.empty(context.get_binding_shape(i), dtype=trt.nptype(engine.get_binding_dtype(i)))) bufferD = [] for i in range(nInput + nOutput): bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1]) for i in range(nInput): cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) context.execute_v2(bufferD) for i in range(nInput, nInput + nOutput): cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) for i in range(10): context.execute_v2(bufferD) for i in range(nInput + nOutput):
def run(useTimeCache):
    """Build an MNIST-style CNN and measure build time with/without a
    serialized timing cache; then run one inference pass as a sanity check.

    Args:
        useTimeCache: when True, load an existing timing cache from
            timeCacheFile before building and save a fresh one afterwards
            (only if the file did not exist yet).

    Uses module globals: timeCacheFile, data, time.
    """
    logger = trt.Logger(trt.Logger.ERROR)
    timeCache = b""
    if useTimeCache and os.path.isfile(timeCacheFile):
        with open(timeCacheFile, 'rb') as f:
            timeCache = f.read()
        # NOTE(review): f.read() returns bytes, never None — this check looks
        # unreachable as written.
        if timeCache == None:
            print("Failed getting serialized timing cache!")
            return
        print("Succeeded getting serialized timing cache!")
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    config.max_workspace_size = 6 << 30
    if useTimeCache:
        # Attach the (possibly empty) timing cache to this build.
        cache = config.create_timing_cache(timeCache)
        config.set_timing_cache(cache, False)
    inputTensor = network.add_input('inputT0', trt.float32, [-1, 1, 28, 28])
    profile.set_shape(inputTensor.name, (1, 1, 28, 28), (4, 1, 28, 28),
                      (8, 1, 28, 28))
    config.add_optimization_profile(profile)
    np.random.seed(97)  # fixed seed so the random weights are identical on every run
    # conv(32,5x5) -> ReLU -> maxpool(2x2)
    w = np.random.rand(32, 1, 5, 5).astype(np.float32).reshape(-1)
    b = np.random.rand(32).astype(np.float32).reshape(-1)
    _0 = network.add_convolution_nd(inputTensor, 32, [5, 5], w, b)
    _0.padding_nd = [2, 2]
    _1 = network.add_activation(_0.get_output(0), trt.ActivationType.RELU)
    _2 = network.add_pooling_nd(_1.get_output(0), trt.PoolingType.MAX, [2, 2])
    _2.stride_nd = [2, 2]
    # conv(64,5x5) -> ReLU -> maxpool(2x2)
    w = np.random.rand(64, 32, 5, 5).astype(np.float32).reshape(-1)
    b = np.random.rand(64).astype(np.float32).reshape(-1)
    _3 = network.add_convolution_nd(_2.get_output(0), 64, [5, 5], w, b)
    _3.padding_nd = [2, 2]
    _4 = network.add_activation(_3.get_output(0), trt.ActivationType.RELU)
    _5 = network.add_pooling_nd(_4.get_output(0), trt.PoolingType.MAX, [2, 2])
    _5.stride_nd = [2, 2]
    # NHWC-style flatten for the fully-connected layers.
    _6 = network.add_shuffle(_5.get_output(0))
    _6.first_transpose = (0, 2, 3, 1)
    _6.reshape_dims = (-1, 64 * 7 * 7, 1, 1)
    # FC(1024) -> ReLU
    w = np.random.rand(1024, 64 * 7 * 7).astype(np.float32).reshape(-1)
    b = np.random.rand(1024).astype(np.float32).reshape(-1)
    _7 = network.add_fully_connected(_6.get_output(0), 1024, w, b)
    _8 = network.add_activation(_7.get_output(0), trt.ActivationType.RELU)
    # FC(10) -> ReLU -> softmax -> top-1 class index
    w = np.random.rand(10, 1024).astype(np.float32).reshape(-1)
    b = np.random.rand(10).astype(np.float32).reshape(-1)
    _9 = network.add_fully_connected(_8.get_output(0), 10, w, b)
    _10 = network.add_activation(_9.get_output(0), trt.ActivationType.RELU)
    _11 = network.add_shuffle(_10.get_output(0))
    _11.reshape_dims = [-1, 10]
    _12 = network.add_softmax(_11.get_output(0))
    _12.axes = 1 << 1
    _13 = network.add_topk(_12.get_output(0), trt.TopKOperation.MAX, 1, 1 << 1)
    network.mark_output(_13.get_output(1))  # output 1 of TopK = the index
    # Time the build itself — that is what the timing cache accelerates.
    t0 = time()
    engineString = builder.build_serialized_network(network, config)
    t1 = time()
    print("%s timing cache, %f ms" %
          ("With" if useTimeCache else "Without", (t1 - t0) * 1000))
    # Save the cache only when no cache file existed before this run.
    if useTimeCache and not os.path.isfile(timeCacheFile):
        timeCache = config.get_timing_cache()
        timeCacheString = timeCache.serialize()
        with open(timeCacheFile, 'wb') as f:
            f.write(timeCacheString)
        print("Succeeded saving .cache file!")
    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [1, 1, 28, 28])
    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    # One inference pass over the module-level `data` as a sanity check.
    bufferH = []
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(
            np.empty(context.get_binding_shape(i),
                     dtype=trt.nptype(engine.get_binding_dtype(i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data,
                          bufferH[i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)
    for i in range(nInput, nInput + nOutput):
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i],
                          bufferH[i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    #for i in range(nInput + nOutput):
    #    print(engine.get_binding_name(i))
    #    print(bufferH[i].reshape(context.get_binding_shape(i)))
    for b in bufferD:
        cudart.cudaFree(b)
def run():
    """Minimal dynamic-shape TensorRT example: build (or load) an identity
    network, run it on a 3x4x5 ramp input, and print all bindings and results.

    Uses the module global trtFile as the plan-file path.
    """
    logger = trt.Logger(trt.Logger.ERROR)  # logger; available levels: VERBOSE, INFO, WARNING, ERROR, INTERNAL_ERROR
    if os.path.isfile(trtFile):  # if a .plan file exists, read it directly
        with open(trtFile, 'rb') as f:
            engineString = f.read()
        if engineString == None:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
    else:  # no .plan file — build the network from scratch
        builder = trt.Builder(logger)  # network meta-information: Builder/Network/BuilderConfig/Profile
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30
        inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1])  # declare the input tensor
        profile.set_shape(inputTensor.name, [1, 1, 1], [3, 4, 5], [6, 8, 10])  # dynamic-shape range of the input (min, opt, max)
        config.add_optimization_profile(profile)
        identityLayer = network.add_identity(inputTensor)  # identity transform
        network.mark_output(identityLayer.get_output(0))  # mark the output tensor
        engineString = builder.build_serialized_network(network, config)  # build the serialized network
        if engineString == None:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
        with open(trtFile, 'wb') as f:  # save the serialized network as a .plan file
            f.write(engineString)
        print("Succeeded saving .plan file!")
    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)  # create the engine with a Runtime
    if engine == None:
        print("Failed building engine!")
        return
    print("Succeeded building engine!")
    context = engine.create_execution_context()  # create a context (roughly analogous to a process on the GPU)
    context.set_binding_shape(0, [3, 4, 5])  # dynamic-shape mode requires binding the actual data shape
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])  # query engine binding information
    nOutput = engine.num_bindings - nInput
    for i in range(nInput):
        print("Bind[%2d]:i[%2d]->" % (i, i), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    for i in range(nInput, nInput + nOutput):
        print("Bind[%2d]:o[%2d]->" % (i, i - nInput), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    data = np.arange(3 * 4 * 5, dtype=np.float32).reshape(3, 4, 5)  # prepare data and host/device buffers
    bufferH = []
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(np.empty(context.get_binding_shape(i), dtype=trt.nptype(engine.get_binding_dtype(i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):  # first copy host data to the device
        cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)  # run inference
    for i in range(nInput, nInput + nOutput):  # copy the results from device back to host
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    for i in range(nInput + nOutput):
        print(engine.get_binding_name(i))
        print(bufferH[i].reshape(context.get_binding_shape(i)))
    for b in bufferD:  # free device memory
        cudart.cudaFree(b)
def test(engine, context, nBatchSize):
    """Run one correctness pass and a timed loop for a given batch size,
    selecting the optimization profile by batch size, and print the average
    latency.

    With two profiles, batches <= 4 use profile 0 (bindings 0/1) and larger
    batches use profile 1 (bindings 2/3); the inactive profile's binding
    slots are filled with null pointers.

    Uses module globals: nWarm, nTest.
    """
    nProfile = engine.num_optimization_profiles
    if nProfile == 1:
        bindingBias = 0
    else:
        if nBatchSize <= 4:
            bindingBias = 0
            context.set_optimization_profile_async(0, 0)
            cudart.cudaStreamSynchronize(0)
        else:
            bindingBias = 2
            context.set_optimization_profile_async(1, 0)
            cudart.cudaStreamSynchronize(0)
    context.set_binding_shape(bindingBias, [nBatchSize, 1])
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    for i in range(nInput):
        print("Bind[%2d]:i[%2d]->" % (i, i), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    for i in range(nInput, nInput + nOutput):
        print("Bind[%2d]:o[%2d]->" % (i, i - nInput), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    # From here on, counts are per profile.
    nInput = nInput // nProfile
    nOutput = nOutput // nProfile
    data = np.random.rand(nBatchSize).reshape(nBatchSize, 1).astype(np.float32)
    bufferH = []
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(np.empty(context.get_binding_shape(bindingBias + i), dtype=trt.nptype(engine.get_binding_dtype(bindingBias + i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    # Pad the binding list with two null slots for the inactive profile.
    if nProfile == 1 or nBatchSize <= 4:
        bufferD = bufferD + [int(0), int(0)]
    else:
        bufferD = [int(0), int(0)] + bufferD
    # Fix: address the active profile's slots via bindingBias. The original
    # indexed bufferD[i] directly, which in the profile-1 branch (nulls
    # prepended) copied to/from null pointers — note that the bufferH
    # allocation above already applies bindingBias.
    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[bindingBias + i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)
    for i in range(nInput, nInput + nOutput):
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[bindingBias + i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    # Warm-up, then timed loop.
    for i in range(nWarm):
        context.execute_v2(bufferD)
    t0 = time_ns()
    for i in range(nTest):
        context.execute_v2(bufferD)
    t1 = time_ns()
    print("+---- BatchSize=%2d: %.4fms\n" % (nBatchSize, (t1 - t0) / 1e6 / nTest))
    # Keep only the real (non-null) device pointers and free them.
    if nProfile == 1 or nBatchSize <= 4:
        bufferD = bufferD[:2]
    else:
        bufferD = bufferD[-2:]
    for b in bufferD:
        cudart.cudaFree(b)
if engineString == None:
    print("Failed building engine!")
    exit()
print("Succeeded building engine!")
# Persist the plan for later runs.
with open(trtFile, 'wb') as f:
    f.write(engineString)
engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
context = engine.create_execution_context()
# No set_binding_shape call — presumably a static-shape engine, so the
# binding shapes are fully known already (confirm against the builder code).
print("Binding0->", engine.get_binding_shape(0), context.get_binding_shape(0), engine.get_binding_dtype(0))
print("Binding1->", engine.get_binding_shape(1), context.get_binding_shape(1), engine.get_binding_dtype(1))
# Load the input image as grayscale float32.
data = cv2.imread(inputImage, cv2.IMREAD_GRAYSCALE).astype(np.float32)
inputH0 = np.ascontiguousarray(data.reshape(-1))
outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
inputD0 = cudart.cudaMalloc(inputH0.nbytes)[1]
outputD0 = cudart.cudaMalloc(outputH0.nbytes)[1]
cudart.cudaMemcpy(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
context.execute_v2([int(inputD0), int(outputD0)])
cudart.cudaMemcpy(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
print("inputH0 :", data.shape)
#print(data)
print("outputH0:", outputH0.shape)
print(outputH0)
cudart.cudaFree(inputD0)
cudart.cudaFree(outputD0)
print("Succeeded running model in TensorRT!")
def run(shape, scalar):
    """Implicit-batch-mode test of the AddScalar plugin: build (or load a
    cached) engine, run it on a ramp input, and compare with the CPU
    reference.

    Note this uses the implicit-batch APIs (builder.max_batch_size,
    context.execute) — no EXPLICIT_BATCH flag is passed to create_network.

    Args:
        shape: full input shape including the batch dim; shape[1:] is what
            the network sees, shape[0] is the runtime batch size.
        scalar: the scalar added by the plugin.

    Uses module globals: soFile, getAddScalarPlugin, addScalarCPU, check.
    """
    testCase = "<shape=%s,scalar=%f>" % (shape, scalar)
    trtFile = "./model-Shape[%s].plan" % (
        "".join([str(i) + "-" for i in shape[:-1]]) + str(shape[-1]))
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    # Load the plugin .so so its creator registers with the plugin registry.
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine == None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        builder.max_batch_size = 32  # implicit-batch mode: max batch set on the builder
        network = builder.create_network()  # no EXPLICIT_BATCH flag
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        # The input shape excludes the batch dimension in implicit-batch mode.
        inputT0 = network.add_input('inputT0', trt.float32, shape[1:])
        pluginLayer = network.add_plugin_v2([inputT0],
                                            getAddScalarPlugin(scalar))
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString == None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    #print("Binding all? %s"%(["No","Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    #for i in range(engine.num_bindings):
    #    print("Bind[%2d]:i[%d]->"%(i,i) if engine.binding_is_input(i) else "Bind[%2d]:o[%d]->"%(i,i-nInput),
    #          engine.get_binding_dtype(i),engine.get_binding_shape(i),context.get_binding_shape(i),engine.get_binding_name(i))
    bufferH = []
    bufferH.append(np.arange(np.prod(shape), dtype=np.float32).reshape(shape))
    for i in range(nOutput):
        # Binding shapes exclude the batch dim in implicit-batch mode, so
        # prepend shape[0] to size the host buffer.
        bufferH.append(
            np.empty(
                (shape[0], ) + tuple(context.get_binding_shape(nInput + i)),
                dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])
    for i in range(nInput):
        cudart.cudaMemcpy(
            bufferD[i],
            np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data,
            bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    # Implicit-batch execute: the batch size is passed explicitly.
    context.execute(shape[0], bufferD)
    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i],
                          bufferH[nInput + i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    outputCPU = addScalarCPU(bufferH[:nInput], scalar)
    '''
    for i in range(nInput):
        printArrayInfo(bufferH[i])
    for i in range(nOutput):
        printArrayInfo(bufferH[nInput+i])
    for i in range(nOutput):
        printArrayInfo(outputCPU[i])
    '''
    check(bufferH[nInput:][0], outputCPU[0], True)
    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
def run(shape0, shape1, scalar):
    """Two-optimization-profile test of the AddScalar plugin: build (or load
    a cached) engine with two profiles for the same input, then run one
    inference under each profile.

    Args:
        shape0: runtime input shape used with profile 0 (bindings 0/1).
        shape1: runtime input shape used with profile 1 (bindings 2/3).
        scalar: the scalar added by the plugin.

    Uses module globals: soFile, getAddScalarPlugin.
    """
    testCase = "<shape0:%s,shape1:%s,scalar=%f>" % (shape0, shape1, scalar)
    trtFile = "./model-Dims" + str(len(shape0)) + ".plan"
    print("\nTest", testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    # Load the plugin .so so its creator registers with the plugin registry.
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine == None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile0 = builder.create_optimization_profile()
        profile1 = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        config.flags = 1 << int(trt.BuilderFlag.FP16)  # comment this line out and the plugin will only use FP32
        inputT0 = network.add_input('inputT0', trt.float32, [-1 for i in shape0])
        profile0.set_shape(inputT0.name, [1 for i in shape0], [8 for i in shape0], [32 for i in shape0])
        config.add_optimization_profile(profile0)
        profile1.set_shape(inputT0.name, [1 for i in shape1], [8 for i in shape1], [32 for i in shape1])
        config.add_optimization_profile(profile1)
        pluginLayer = network.add_plugin_v2([inputT0], getAddScalarPlugin(scalar))
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString == None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    stream = 0  # use the default CUDA stream
    cudart.cudaStreamSynchronize(stream)
    # --- Run with Profile 0 (bindings 0/1) ---
    print("Use Profile 0")
    context.set_optimization_profile_async(0, stream)
    cudart.cudaStreamSynchronize(stream)
    #context.active_optimization_profile = 0  # equivalent way to select the profile without a stream, but deprecated
    context.set_binding_shape(0, shape0)
    print("Context binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
    for i in range(engine.num_bindings):
        print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    data = np.random.rand(np.prod(shape0)).reshape(shape0).astype(np.float32) * 2 - 1
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMalloc(inputH0.nbytes)
    _, outputD0 = cudart.cudaMalloc(outputH0.nbytes)
    cudart.cudaMemcpy(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    print("before inference")
    # Profile 1's slots (2/3) are inactive and passed as null pointers.
    context.execute_v2([int(inputD0), int(outputD0), int(0), int(0)])
    print("after inference")
    cudart.cudaMemcpy(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    # --- Run with Profile 1 (bindings 2/3) ---
    print("Use Profile 1")
    context.set_optimization_profile_async(1, stream)
    cudart.cudaStreamSynchronize(stream)
    #context.active_optimization_profile = 1  # equivalent way to select the profile without a stream, but deprecated
    context.set_binding_shape(2, shape1)
    print("Context binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
    for i in range(engine.num_bindings):
        print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    data = np.random.rand(np.prod(shape1)).reshape(shape1).astype(np.float32) * 2 - 1
    inputH1 = np.ascontiguousarray(data.reshape(-1))
    # NOTE(review): this sizes the output from binding 2 (the profile-1
    # *input*); binding 3 is the output. Works here because the plugin's
    # output shape/dtype match its input — confirm before reusing.
    outputH1 = np.empty(context.get_binding_shape(2), dtype=trt.nptype(engine.get_binding_dtype(2)))
    _, inputD1 = cudart.cudaMalloc(inputH1.nbytes)
    _, outputD1 = cudart.cudaMalloc(outputH1.nbytes)
    cudart.cudaMemcpy(inputD1, inputH1.ctypes.data, inputH1.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    print("before inference")
    # Profile 0's slots (0/1) are inactive and passed as null pointers.
    context.execute_v2([int(0), int(0), int(inputD1), int(outputD1)])
    print("after inference")
    cudart.cudaMemcpy(outputH1.ctypes.data, outputD1, outputH1.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(inputD1)
    cudart.cudaFree(outputD0)
    cudart.cudaFree(outputD1)