convQDQLayer = network.add_dequantize(convQLayer.get_output(0), qTensor)
convQDQLayer.axis = 0

network.mark_output(convQDQLayer.get_output(0))
engineString = builder.build_serialized_network(network, config)
engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
context = engine.create_execution_context()
_, stream = cudart.cudaStreamCreate()

inputH0 = np.ascontiguousarray(data.reshape(-1))
outputH0 = np.empty(context.get_binding_shape(1),
                    dtype=trt.nptype(engine.get_binding_dtype(1)))
_, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
_, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
context.execute_async_v2([int(inputD0), int(outputD0)], stream)
cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
cudart.cudaStreamSynchronize(stream)

print("inputH0 :", data.shape)
print(data)
print("outputH0:", outputH0.shape)
print(outputH0)

cudart.cudaStreamDestroy(stream)
cudart.cudaFree(inputD0)
cudart.cudaFree(outputD0)
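The snippets on this page share a common preamble that the listing omits; a minimal sketch, assuming TensorRT 8.x with the cuda-python bindings (the file path is an assumption):

import os
import numpy as np
import tensorrt as trt
from cuda import cudart  # cuda-python runtime API

trtFile = "./model.plan"  # assumed path for the serialized engine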
Example #2
def run():
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
        onnxFile = onnxFile1  # model.plan already exists; parse model1.onnx for the refit
    else:
        onnxFile = onnxFile0  # no model.plan yet; build it from model0.onnx first

    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()
    config.flags = 1 << int(trt.BuilderFlag.REFIT)
    config.max_workspace_size = 3 << 30
    parser = trt.OnnxParser(network, logger)
    if not os.path.exists(onnxFile):
        print("Failed finding .onnx file!")
        exit()
    print("Succeeded finding .onnx file!")
    with open(onnxFile, 'rb') as model:
        if not parser.parse(model.read()):
            print("Failed parsing .onnx file!")
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            exit()
        print("Succeeded parsing .onnx file!")

    if os.path.isfile(trtFile):  # refit the existing engine
        refitter = trt.Refitter(engine, logger)
        layerNameList, weightRoleList = refitter.get_all()
        for name, role in zip(layerNameList, weightRoleList):
            print("LayerName:%s,WeightRole:%s" % (name, role))

        for i in range(network.num_layers):
            layer = network.get_layer(i)
            if layer.name in layerNameList:

                # depending on the actual network, more layer types may need to be handled here
                if layer.type == trt.LayerType.CONVOLUTION:
                    layer.__class__ = trt.IConvolutionLayer
                    refitter.set_weights(layer.name, trt.WeightsRole.KERNEL, layer.kernel)
                    refitter.set_weights(layer.name, trt.WeightsRole.BIAS, layer.bias)
                    layerNameList.remove(layer.name)  # mark this layer's weights as handled

                if layer.type == trt.LayerType.FULLY_CONNECTED:
                    layer.__class__ = trt.IFullyConnectedLayer
                    refitter.set_weights(layer.name, trt.WeightsRole.KERNEL, layer.kernel)

                if layer.type == trt.LayerType.CONSTANT:
                    layer.__class__ = trt.IConstantLayer
                    refitter.set_weights(layer.name, trt.WeightsRole.CONSTANT, layer.weights)

        if not refitter.refit_cuda_engine():
            print("Failed refitting engine, missing weights:")
            missingLayer, weightRole = refitter.get_missing()
            for layer, role in zip(missingLayer, weightRole):
                print("\tLayerName:%s,WeightRole:%s" % (layer, role))
            return
        print("Succeeded refitting engine!")

    else:  # build model.plan
        inputTensor = network.get_input(0)
        inputTensor.shape = [1, 1, 28, 28]
        '''  # print the network layer by layer
        for i in range(network.num_layers):
            layer = network.get_layer(i)
            print(i, "%s,in=%d,out=%d,%s" % (str(layer.type)[10:], layer.num_inputs, layer.num_outputs, layer.name))
            for j in range(layer.num_inputs):
                tensor = layer.get_input(j)
                if tensor is None:
                    print("\tInput  %2d:" % j, "None")
                else:
                    print("\tInput  %2d:%s,%s,%s" % (j, tensor.shape, str(tensor.dtype)[9:], tensor.name))
            for j in range(layer.num_outputs):
                tensor = layer.get_output(j)
                if tensor is None:
                    print("\tOutput %2d:" % j, "None")
                else:
                    print("\tOutput %2d:%s,%s,%s" % (j, tensor.shape, str(tensor.dtype)[9:], tensor.name))
        '''
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            exit()
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, [1, 1, 28, 28])
    _, stream = cudart.cudaStreamCreate()
    print("Binding0->", engine.get_binding_shape(0), context.get_binding_shape(0), engine.get_binding_dtype(0))
    print("Binding1->", engine.get_binding_shape(1), context.get_binding_shape(1), engine.get_binding_dtype(1))

    data = cv2.imread(inputImage, cv2.IMREAD_GRAYSCALE).astype(np.float32)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    print("inputH0 :", data.shape)
    #print(data)
    print("outputH0:", outputH0.shape)
    print(outputH0)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
    print("Succeeded running model in TensorRT!")
Example #3
def run():
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engineString = f.read()
        if engineString is None:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30

        inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1])
        profile.set_shape(inputTensor.name, [1, 1, 1], [cIn, hIn, wIn],
                          [cIn * 2, hIn * 2, wIn * 2])
        config.add_optimization_profile(profile)

        identityLayer = network.add_identity(inputTensor)
        network.mark_output(identityLayer.get_output(0))

        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
            print("Succeeded saving .plan file!")

    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    if engine is None:
        print("Failed building engine!")
        return
    print("Succeeded building engine!")

    context = engine.create_execution_context()
    context.set_binding_shape(0, [cIn, hIn, wIn])
    _, stream = cudart.cudaStreamCreate()
    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    for i in range(nInput):
        print("Bind[%2d]:i[%2d]->" % (i, i), engine.get_binding_dtype(i),
              engine.get_binding_shape(i), context.get_binding_shape(i),
              engine.get_binding_name(i))
    for i in range(nInput, nInput + nOutput):
        print("Bind[%2d]:o[%2d]->" % (i, i - nInput),
              engine.get_binding_dtype(i), engine.get_binding_shape(i),
              context.get_binding_shape(i), engine.get_binding_name(i))

    npData = []
    bufferSize = []
    bufferH = []
    bufferD = []

    for i in range(nInput):  # input numpy arrays (output arrays are prepared below)
        bufferSize.append(
            trt.volume(context.get_binding_shape(i)) *
            engine.get_binding_dtype(i).itemsize)
        npData.append(
            np.arange(cIn * hIn * wIn,
                      dtype=np.float32).reshape(cIn, hIn, wIn))
    for i in range(nInput, nInput + nOutput):
        bufferSize.append(
            trt.volume(context.get_binding_shape(i)) *
            engine.get_binding_dtype(i).itemsize)
        npData.append(
            np.empty(context.get_binding_shape(i),
                     dtype=trt.nptype(engine.get_binding_dtype(i))))

    for i in range(nInput + nOutput):  # allocate page-locked host memory and device memory
        bufferH.append(
            cudart.cudaHostAlloc(bufferSize[i],
                                 cudart.cudaHostAllocWriteCombined)[1])
        bufferD.append(cudart.cudaMallocAsync(bufferSize[i], stream)[1])

    for i in range(nInput):  # numpy array -> page-locked host memory
        cudart.cudaMemcpyAsync(bufferH[i], npData[i].ctypes.data,
                               bufferSize[i],
                               cudart.cudaMemcpyKind.cudaMemcpyHostToHost,
                               stream)

    context.execute_async_v2(bufferH, stream)  # use the page-locked host memory directly

    for i in range(nInput, nInput + nOutput):  # page-locked host memory -> output numpy array
        cudart.cudaMemcpyAsync(npData[i].ctypes.data, bufferH[i],
                               bufferSize[i],
                               cudart.cudaMemcpyKind.cudaMemcpyHostToHost,
                               stream)
    cudart.cudaStreamSynchronize(stream)

    for i in range(nInput + nOutput):
        print(engine.get_binding_name(i))
        print(npData[i].reshape(context.get_binding_shape(i)))

    for b in bufferH:
        cudart.cudaFreeHost(b)
    for b in bufferD:
        cudart.cudaFreeAsync(b, stream)
    cudart.cudaStreamDestroy(stream)
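bufferH above holds raw page-locked pointers, so data has to be staged through separate numpy copies; a sketch, assuming ctypes, of viewing such a buffer directly as a numpy array (pinnedArray is a hypothetical helper):

import ctypes

def pinnedArray(shape, dtype=np.float32):
    # allocate page-locked host memory and expose it as a numpy array;
    # keep ptr for cudart.cudaFreeHost(ptr) when done
    nbytes = int(np.prod(shape)) * np.dtype(dtype).itemsize
    _, ptr = cudart.cudaHostAlloc(nbytes, cudart.cudaHostAllocWriteCombined)
    buf = (ctypes.c_byte * nbytes).from_address(ptr)
    return np.frombuffer(buf, dtype=dtype).reshape(shape), ptr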
Example #4
def run(nRunTime):
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        config = builder.create_builder_config()
        config.flags = 1 << int(trt.BuilderFlag.REFIT)

        inputT0 = network.add_input('inputT0', trt.float32,
                                    (nIn, cIn, hIn, wIn))
        fakeWeight = np.zeros([cOut, cIn, hW, wW], dtype=np.float32)
        fakeBias = np.zeros([cOut], dtype=np.float32)
        convolutionLayer = network.add_convolution_nd(inputT0, cOut, (hW, wW),
                                                      fakeWeight, fakeBias)
        #convolutionLayer.name = 'conv'
        network.set_weights_name(convolutionLayer.kernel, "conv-w")
        network.set_weights_name(convolutionLayer.bias, "conv-b")

        network.mark_output(convolutionLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    if nRunTime == 0:
        print("Do not refit!")
    else:
        print("Refit!")
        refitter = trt.Refitter(engine, logger)
        refitter.set_named_weights("conv-w", weight)
        refitter.set_named_weights("conv-b", bias)

        missingLayer, weightRole = refitter.get_missing()
        for layer, role in zip(missingLayer, weightRole):
            print("[", layer, "-", role, "]")

        if not refitter.refit_cuda_engine():
            print("Failed Refitting engine!")
            return

    context = engine.create_execution_context()
    _, stream = cudart.cudaStreamCreate()
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    print("data:", data.shape)
    print(data)
    print("outputH0:", outputH0.shape)
    print(outputH0)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
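Hypothetical globals and driver for the named-weights refit above; the shapes and values are assumptions, not the original script's:

nIn, cIn, hIn, wIn = 1, 1, 6, 9
cOut, hW, wW = 1, 3, 3
data = np.arange(nIn * cIn * hIn * wIn, dtype=np.float32).reshape(nIn, cIn, hIn, wIn)
weight = np.ascontiguousarray(np.ones([cOut, cIn, hW, wW], dtype=np.float32))
bias = np.ascontiguousarray(np.zeros([cOut], dtype=np.float32))

if __name__ == "__main__":
    run(0)  # engine still carries the zero-filled build-time weights
    run(1)  # engine is refitted with weight / bias before inference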
Example #5
def run():
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30

        inputList = []
        for i in range(nGEMM + 1):
            inputT = network.add_input('inputT' + str(i), trt.float32,
                                       [-1, 4, sizeGEMM, sizeGEMM])
            profile.set_shape(inputT.name, (1, 4, sizeGEMM, sizeGEMM),
                              (4, 4, sizeGEMM, sizeGEMM),
                              (sizeGEMM, 4, sizeGEMM, sizeGEMM))
            inputList.append(inputT)
        config.add_optimization_profile(profile)

        tempTensor = inputList[0]
        for i in range(1, nGEMM + 1):
            tempLayer = network.add_matrix_multiply(tempTensor,
                                                    trt.MatrixOperation.NONE,
                                                    inputList[i],
                                                    trt.MatrixOperation.NONE)
            tempTensor = tempLayer.get_output(0)

        network.mark_output(tempLayer.get_output(0))

        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    for i in range(nGEMM + 1):
        context.set_binding_shape(i, [4, 4, sizeGEMM, sizeGEMM])
    _, stream = cudart.cudaStreamCreate()

    bufferSize = [
        trt.volume(context.get_binding_shape(i)) *
        np.array([0], dtype=trt.nptype(engine.get_binding_dtype(i))).nbytes
        for i in range(engine.num_bindings)
    ]

    bufferH = []
    bufferD = []
    for i in range(nGEMM + 2):
        bufferH.append(
            cudart.cudaHostAlloc(bufferSize[i],
                                 cudart.cudaHostAllocWriteCombined)[1])
        bufferD.append(cudart.cudaMallocAsync(bufferSize[i], stream)[1])

    # run without CUDA Graph
    for i in range(nGEMM + 1):
        cudart.cudaMemcpyAsync(bufferD[i], bufferH[i], bufferSize[i],
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
    context.execute_async_v2(bufferD, stream)
    cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1],
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    for n in range(nInference):
        for i in range(nGEMM + 1):
            cudart.cudaMemcpyAsync(
                bufferD[i], bufferH[i], bufferSize[i],
                cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
        context.execute_async_v2(bufferD, stream)
        cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1],
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)
    cudart.cudaStreamSynchronize(stream)

    # capture the CUDA Graph, then launch it
    cudart.cudaStreamBeginCapture(
        stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    for i in range(nGEMM + 1):
        cudart.cudaMemcpyAsync(bufferD[i], bufferH[i], bufferSize[i],
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
    context.execute_async_v2(bufferD, stream)
    cudart.cudaMemcpyAsync(bufferH[-1], bufferD[-1], bufferSize[-1],
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    #cudart.cudaStreamSynchronize(stream)  # no need to synchronize inside the capture
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)

    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)

    for n in range(nInference):
        cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)

    for i in range(nGEMM + 2):
        cudart.cudaFree(bufferD[i])
    cudart.cudaStreamDestroy(stream)
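The example launches both the plain-stream path and the captured graph but never reports timings; a sketch of timing either path with CUDA events (timeLaunch is a hypothetical helper):

def timeLaunch(stream, launch, nRepeat=10):
    # average the duration of `launch` over nRepeat calls using CUDA events
    _, start = cudart.cudaEventCreate()
    _, stop = cudart.cudaEventCreate()
    cudart.cudaEventRecord(start, stream)
    for _ in range(nRepeat):
        launch()
    cudart.cudaEventRecord(stop, stream)
    cudart.cudaEventSynchronize(stop)
    _, ms = cudart.cudaEventElapsedTime(start, stop)
    return ms / nRepeat

# e.g. timeLaunch(stream, lambda: cudart.cudaGraphLaunch(graphExe, stream))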
Example #6
def run1(engine):
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, cIn, hIn, wIn])
    _, stream = cudart.cudaStreamCreate()

    data = np.random.rand(nIn * cIn * hIn * wIn).astype(np.float32).reshape(
        nIn, cIn, hIn, wIn)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    # one complete inference pass (warm-up)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    # time the HtoD data copy
    for i in range(10):
        cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)

    trtTimeStart = time()
    for i in range(30):
        cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, DataCopyHtoD" %
          ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    # time the inference
    for i in range(10):
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)

    trtTimeStart = time()
    for i in range(30):
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, Inference" %
          ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    # time the DtoH data copy
    for i in range(10):
        cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)

    trtTimeStart = time()
    for i in range(30):
        cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, DataCopyDtoH" %
          ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    # time the whole pipeline
    for i in range(10):
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)

    trtTimeStart = time()
    for i in range(30):
        cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
        context.execute_async_v2([int(inputD0), int(outputD0)], stream)
        cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)
    cudart.cudaStreamSynchronize(stream)
    trtTimeEnd = time()
    print("%6.3fms - 1 stream, DataCopy + Inference" %
          ((trtTimeEnd - trtTimeStart) / 30 * 1000))

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
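Note that inputH0 and outputH0 in run1 are pageable numpy buffers, so the "async" copies still block the host; a sketch, assuming ctypes, of staging the input through pinned memory instead, which is what allows run2's two streams to overlap:

import ctypes

_, hPinned = cudart.cudaHostAlloc(inputH0.nbytes, cudart.cudaHostAllocDefault)
ctypes.memmove(hPinned, inputH0.ctypes.data, inputH0.nbytes)  # one-time staging copy
cudart.cudaMemcpyAsync(inputD0, hPinned, inputH0.nbytes,
                       cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
cudart.cudaStreamSynchronize(stream)
cudart.cudaFreeHost(hPinned)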
Example #7
def run2(engine):
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, cIn, hIn, wIn])
    _, stream0 = cudart.cudaStreamCreate()
    _, stream1 = cudart.cudaStreamCreate()
    _, event0 = cudart.cudaEventCreate()
    _, event1 = cudart.cudaEventCreate()

    data = np.random.rand(nIn * cIn * hIn * wIn).astype(np.float32).reshape(
        nIn, cIn, hIn, wIn)
    inputSize = trt.volume(context.get_binding_shape(0)) * np.array(
        [0], dtype=trt.nptype(engine.get_binding_dtype(0))).nbytes
    outputSize = trt.volume(context.get_binding_shape(1)) * np.array(
        [0], dtype=trt.nptype(engine.get_binding_dtype(1))).nbytes
    _, inputH0 = cudart.cudaHostAlloc(inputSize,
                                      cudart.cudaHostAllocWriteCombined)
    _, inputH1 = cudart.cudaHostAlloc(inputSize,
                                      cudart.cudaHostAllocWriteCombined)
    _, outputH0 = cudart.cudaHostAlloc(outputSize,
                                       cudart.cudaHostAllocWriteCombined)
    _, outputH1 = cudart.cudaHostAlloc(outputSize,
                                       cudart.cudaHostAllocWriteCombined)
    _, inputD0 = cudart.cudaMallocAsync(inputSize, stream0)
    _, inputD1 = cudart.cudaMallocAsync(inputSize, stream1)
    _, outputD0 = cudart.cudaMallocAsync(outputSize, stream0)
    _, outputD1 = cudart.cudaMallocAsync(outputSize, stream1)

    # time the whole pipeline
    for i in range(10):
        context.execute_async_v2([int(inputD0), int(outputD0)], stream0)

    trtTimeStart = time()
    cudart.cudaEventRecord(event1, stream1)

    for i in range(30):
        inputH, outputH = [inputH1, outputH1] if i & 1 else [inputH0, outputH0]
        inputD, outputD = [inputD1, outputD1] if i & 1 else [inputD0, outputD0]
        eventBefore, eventAfter = [event0, event1] if i & 1 else [event1, event0]
        stream = stream1 if i & 1 else stream0

        cudart.cudaMemcpyAsync(inputD, inputH, inputSize,
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                               stream)
        cudart.cudaStreamWaitEvent(stream, eventBefore,
                                   cudart.cudaEventWaitDefault)
        context.execute_async_v2([int(inputD), int(outputD)], stream)
        cudart.cudaEventRecord(eventAfter, stream)
        cudart.cudaMemcpyAsync(outputH, outputD, outputSize,
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                               stream)
    '''# the same loop with the even/odd iterations written out explicitly
    for i in range(30//2):
        cudart.cudaMemcpyAsync(inputD0, inputH0, inputSize, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream0)
        cudart.cudaStreamWaitEvent(stream0,event1,cudart.cudaEventWaitDefault)
        context.execute_async_v2([int(inputD0), int(outputD0)], stream0)
        cudart.cudaEventRecord(event0,stream0)
        cudart.cudaMemcpyAsync(outputH0, outputD0, outputSize, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream0)

        cudart.cudaMemcpyAsync(inputD1, inputH1, inputSize, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream1)
        cudart.cudaStreamWaitEvent(stream1,event0,cudart.cudaEventWaitDefault)
        context.execute_async_v2([int(inputD1), int(outputD1)], stream1)
        cudart.cudaEventRecord(event1,stream1)
        cudart.cudaMemcpyAsync(outputH1, outputD1, outputSize, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream1)
    '''
    cudart.cudaEventSynchronize(event1)
    trtTimeEnd = time()
    print("%6.3fms - 2 stream, DataCopy + Inference" %
          ((trtTimeEnd - trtTimeStart) / 30 * 1000))
Example #8
def test_tf_nn_conv2d():
    print("\ntf.nn.conv2d ------------------------------------------------------")
    # TensorFlow part ----------------------------------------------------------
    x = tf.compat.v1.placeholder(tf.float32, [None, hIn, wIn, cIn], name='x')
    weight = tf.compat.v1.get_variable(
        'w1',
        shape=[hW, wW, cIn, cOut],
        initializer=tf.compat.v1.truncated_normal_initializer(mean=0, stddev=0.1))
    y = tf.nn.conv2d(x,
                     filter=weight,
                     strides=None,
                     padding='SAME',
                     use_cudnn_on_gpu=True,
                     data_format='NHWC',
                     dilations=[1, 1, 1, 1],
                     name='y',
                     filters=None)

    tfConfig = tf.compat.v1.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.compat.v1.Session(config=tfConfig)
    sess.run(tf.compat.v1.global_variables_initializer())

    outputTF = sess.run(y, feed_dict={x: inputData})
    tfPara = {}  # collect the weights to save
    print("Weight:")
    for i in tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES):
        name, value = i.name, sess.run(i)
        print(name, value.shape)
        tfPara[name] = value
    np.savez("para_tf_nn_conv2d.npz", **tfPara)
    sess.close()

    # TensorRT part ------------------------------------------------------------
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    inputT0 = network.add_input('inputT0', trt.float32, (-1, -1, -1, cIn))
    profile.set_shape(inputT0.name, (1, 1, 1, cIn), (nIn, hIn, wIn, cIn),
                      (nIn * 2, hIn * 2, wIn * 2, cIn))  # the range only needs to cover the shapes used later
    config.add_optimization_profile(profile)

    _h1 = network.add_shuffle(inputT0)  # NHWC to NCHW
    _h1.first_transpose = (0, 3, 1, 2)
    weight = np.load('./para_tf_nn_conv2d.npz')['w1:0'].transpose(
        3, 2, 0, 1).reshape(-1)  # load the trained weights
    _h2 = network.add_convolution_nd(_h1.get_output(0), cOut, [hW, wW], weight,
                                     None)
    _h2.padding_nd = (2, 2)
    _h3 = network.add_shuffle(_h2.get_output(0))  # NCHW to NHWC, matching the TF model
    _h3.first_transpose = (0, 2, 3, 1)

    network.mark_output(_h3.get_output(0))
    engineString = builder.build_serialized_network(network, config)
    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, hIn, wIn, cIn])
    _, stream = cudart.cudaStreamCreate()

    inputH0 = np.ascontiguousarray(inputData.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1),
                        dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
                           stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes,
                           cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                           stream)
    cudart.cudaStreamSynchronize(stream)

    printArray(inputData, "input")
    #print(inputData)
    printArray(outputTF, "TF output")
    #print(outputTF)
    printArray(outputH0, "TRT output")
    #print(outputH0)
    check(outputTF, outputH0, True)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
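printArray and check are helpers defined elsewhere in the original script; hypothetical reconstructions consistent with how they are called:

def printArray(x, name):
    # summarize an array instead of dumping every element
    print("%s: shape=%s, sumAbs=%.5e, max=%.5f, min=%.5f" %
          (name, x.shape, np.sum(np.abs(x)), np.max(x), np.min(x)))

def check(a, b, weak=False, eps=1e-5):
    res = np.all(np.abs(a - b) < eps) if weak else np.all(a == b)
    print("check:", res, ", maxAbsDiff: %.5e" % np.max(np.abs(a - b)))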
Example #9
def run():
    logger = trt.Logger(trt.Logger.ERROR)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30

        inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1])
        profile.set_shape(inputTensor.name, (1, 1, 1), (3, 4, 5), (6, 8, 10))
        config.add_optimization_profile(profile)

        identityLayer = network.add_identity(inputTensor)
        network.mark_output(identityLayer.get_output(0))

        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, [3, 4, 5])
    _, stream = cudart.cudaStreamCreate()

    data = np.arange(3 * 4 * 5, dtype=np.float32).reshape(3, 4, 5)
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    # run inference once before capturing the CUDA Graph
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    # capture the CUDA Graph, then launch it
    cudart.cudaStreamBeginCapture(stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    #cudart.cudaStreamSynchronize(stream)  # no need to synchronize inside the capture
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)

    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)

    print("outputH0Big:", outputH0.shape)
    print(outputH0)

    # after the input shape changes, run inference once more, re-capture the CUDA Graph, then launch it
    context.set_binding_shape(0, [2, 3, 4])
    inputH0 = np.ascontiguousarray(-data[:2 * 3 * 4].reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))

    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    cudart.cudaStreamBeginCapture(stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    _, graph = cudart.cudaStreamEndCapture(stream)
    _, graphExe, _ = cudart.cudaGraphInstantiate(graph, b"", 0)

    cudart.cudaGraphLaunch(graphExe, stream)
    cudart.cudaStreamSynchronize(stream)

    print("outputH0Small:", outputH0.shape)
    print(outputH0)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
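The captured graphs above are never released, and the first graphExe is overwritten by the second capture; a cleanup sketch for the end of run():

cudart.cudaGraphExecDestroy(graphExe)  # release the instantiated graph
cudart.cudaGraphDestroy(graph)  # release the captured graph template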
Example #10
def test_tf_nn_linalg_matmul():
    print("\ntf.nn.linalg.matmul -----------------------------------------------")
    # TensorFlow part ----------------------------------------------------------
    x = tf.compat.v1.placeholder(tf.float32, [None, hIn, wIn, cIn], name='x')
    weight = tf.compat.v1.get_variable('w1', shape=[hIn * wIn * cIn, cOut], initializer=tf.compat.v1.truncated_normal_initializer(mean=0, stddev=0.1))
    _h1 = tf.reshape(x, [-1, hIn * wIn * cIn])
    y = tf.linalg.matmul(_h1,
                         weight,
                         transpose_a=False,
                         transpose_b=False,
                         adjoint_a=False,
                         adjoint_b=False,
                         a_is_sparse=False,
                         b_is_sparse=False,
                         name='y')

    tfConfig = tf.compat.v1.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.compat.v1.Session(config=tfConfig)
    sess.run(tf.compat.v1.global_variables_initializer())

    outputTF = sess.run(y, feed_dict={x: inputData})
    tfPara = {}  # collect the weights to save
    print("Weight:")
    for i in tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES):
        name, value = i.name, sess.run(i)
        print(name, value.shape)
        tfPara[name] = value
    np.savez("para_tf_nn_linalg_matmul.npz", **tfPara)
    sess.close()

    # TensorRT part ------------------------------------------------------------
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    inputT0 = network.add_input('inputT0', trt.float32, (-1, hIn, wIn, cIn))
    profile.set_shape(inputT0.name, (1, hIn, wIn, cIn), (nIn, hIn, wIn, cIn), (nIn * 2, hIn, wIn, cIn))  # the range only needs to cover the shapes used later
    config.add_optimization_profile(profile)

    weight = np.load('./para_tf_nn_linalg_matmul.npz')['w1:0'].transpose(1, 0).reshape(-1)  # load the trained weights
    _h1 = network.add_fully_connected(inputT0, cOut, weight, None)
    _h2 = network.add_shape(_h1.get_output(0))  # drop the trailing (1,1) dimensions to match the TF model
    _h3 = network.add_slice(_h2.get_output(0), [0], [2], [1])
    _h4 = network.add_shuffle(_h1.get_output(0))
    _h4.set_input(1, _h3.get_output(0))

    network.mark_output(_h4.get_output(0))
    engineString = builder.build_serialized_network(network, config)
    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
    context = engine.create_execution_context()
    context.set_binding_shape(0, [nIn, hIn, wIn, cIn])
    _, stream = cudart.cudaStreamCreate()

    inputH0 = np.ascontiguousarray(inputData.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMallocAsync(inputH0.nbytes, stream)
    _, outputD0 = cudart.cudaMallocAsync(outputH0.nbytes, stream)

    cudart.cudaMemcpyAsync(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
    context.execute_async_v2([int(inputD0), int(outputD0)], stream)
    cudart.cudaMemcpyAsync(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
    cudart.cudaStreamSynchronize(stream)

    printArray(inputData, "input")
    #print(inputData)
    printArray(outputTF, "TF output")
    #print(outputTF)
    printArray(outputH0, "TRT output")
    #print(outputH0)
    check(outputTF, outputH0, True)

    cudart.cudaStreamDestroy(stream)
    cudart.cudaFree(inputD0)
    cudart.cudaFree(outputD0)
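Both TensorFlow comparison tests assume shared globals and input data; a hypothetical set consistent with the calls above (the 5x5 kernel matches the padding_nd of (2, 2) in the conv2d test):

nIn, cIn, hIn, wIn = 2, 4, 28, 28  # assumed NHWC input dimensions
cOut, hW, wW = 8, 5, 5  # assumed kernel dimensions
inputData = np.random.rand(nIn, hIn, wIn, cIn).astype(np.float32)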