def get_batch(self, nameList=None, inputNodeName=None):  # do NOT change name
    try:
        data = next(self.oneBatch)
        cudart.cudaMemcpy(self.dIn, data.ctypes.data, self.buffeSize, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
        return [int(self.dIn)]
    except StopIteration:
        return None
Example No. 2
def get_batch(self,
              nameList=None,
              inputNodeName=None):  # do NOT change name
    if self.count < self.calibrationCount:
        self.count += 1
        data = np.ascontiguousarray(
            np.random.rand(np.prod(self.shape)).astype(np.float32).reshape(
                *self.shape) * 200 - 100)
        cudart.cudaMemcpy(self.dIn, data.ctypes.data, self.buffeSize,
                          cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
        return [int(self.dIn)]
    else:
        return None
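Both get_batch variants above are methods of an INT8 calibrator class that is not shown here. A minimal sketch of the surrounding class, assuming the cuda-python style used throughout these examples and keeping the attribute names (dIn, buffeSize, count, calibrationCount) exactly as they appear in the snippets:

import os
import numpy as np
import tensorrt as trt
from cuda import cudart

class MyCalibrator(trt.IInt8EntropyCalibrator2):  # sketch of the enclosing class; names are assumptions

    def __init__(self, calibrationCount, shape, cacheFile):
        super().__init__()
        self.shape = shape
        self.calibrationCount = calibrationCount
        self.count = 0
        self.cacheFile = cacheFile
        self.buffeSize = int(np.prod(shape)) * np.dtype(np.float32).itemsize
        _, self.dIn = cudart.cudaMalloc(self.buffeSize)  # device buffer returned by get_batch

    def get_batch_size(self):  # do NOT change name
        return self.shape[0]

    def get_batch(self, nameList=None, inputNodeName=None):  # do NOT change name
        ...  # one of the two variants shown above goes here

    def read_calibration_cache(self):  # do NOT change name
        if os.path.isfile(self.cacheFile):
            with open(self.cacheFile, "rb") as f:
                return f.read()
        return None

    def write_calibration_cache(self, cache):  # do NOT change name
        with open(self.cacheFile, "wb") as f:
            f.write(cache)

    def __del__(self):
        cudart.cudaFree(self.dIn)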
def run(shape):
    testCase = "<shape=%s>" % (shape)
    trtFile = "./model-%d.plan" % (shape[2])
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engineStr = f.read()
            engine = trt.Runtime(logger).deserialize_cuda_engine(engineStr)
        if engine is None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30

        inputT0 = network.add_input('inputT0', trt.float32,
                                    [-1 for i in shape])
        profile.set_shape(inputT0.name, [1, 1, shape[2]], shape, shape)
        config.add_optimization_profile(profile)

        pluginLayer = network.add_plugin_v2([inputT0],
                                            getLayerNormPlugin(epsilon))
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, shape)
    #print("Binding all? %s"%(["No","Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    #for i in range(engine.num_bindings):
    #    print("Bind[%2d]:i[%d]->"%(i,i) if engine.binding_is_input(i) else "Bind[%2d]:o[%d]->"%(i,i-nInput),
    #            engine.get_binding_dtype(i),engine.get_binding_shape(i),context.get_binding_shape(i),engine.get_binding_name(i))

    bufferH = []
    bufferH.append(np.arange(np.prod(shape), dtype=np.float32).reshape(shape))
    for i in range(nOutput):
        bufferH.append(
            np.empty(context.get_binding_shape(nInput + i),
                     dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    for i in range(nInput):
        cudart.cudaMemcpy(
            bufferD[i],
            np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data,
            bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)

    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i],
                          bufferH[nInput + i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    outputCPU = layerNormCPU(bufferH[:nInput], epsilon)
    '''
    for i in range(nInput):
        printArrayInfo(bufferH[i])
    for i in range(nOutput):
        printArrayInfo(bufferH[nInput+i])
    for i in range(nOutput):
        printArrayInfo(outputCPU[i])
    '''
    check(bufferH[nInput:][0], outputCPU[0], True)

    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
def run():
    testCase = "%d-%d-%d-fp%s" % (nBS, nSL, nEmbedding, '16' if int(npDataType == np.float16) else '32')
    print("Test <%s>" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFilePath)

    trtFile = "./model-" + testCase + ".plan"
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engineStr = f.read()
            engine = trt.Runtime(logger).deserialize_cuda_engine(engineStr)
        if engine is None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        config.flags = 1 << int(trt.BuilderFlag.FP16) if npDataType == np.float16 else 0

        inputTensorList = []
        trtDataType = trt.float16 if npDataType == np.float16 else trt.float32
        inputTensorList.append(network.add_input('inputT', trtDataType, [-1, -1, -1]))

        profile = builder.create_optimization_profile()
        profile.set_shape('inputT', [1, 1, nEmbedding], [nBS, nSL, nEmbedding], [nBS * 2, nSL * 2, nEmbedding])
        config.add_optimization_profile(profile)

        pluginLayer = network.add_plugin_v2(inputTensorList, getLayerNormPlugin())
        pluginLayer.get_output(0).dtype = trtDataType

        network.mark_output(pluginLayer.get_output(0))

        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, [nBS, nSL, nEmbedding])

    print("Binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))

    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    for i in range(engine.num_bindings):
        print("input ->" if engine.binding_is_input(i) else "output->", engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i))

    bufferH = []
    bufferH.append(np.random.rand(nBS, nSL, nEmbedding).astype(npDataType) * 2 - 1)
    bufferH.append(np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1))))

    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)

    for i in range(nInput, nInput + nOutput):
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    resCPU = layerNormCPU(bufferH, epsilon)[-1]
    #printArrayInfo(resCPU)
    #printArrayInfo(bufferH[-1])
    check(bufferH[-1], resCPU, True)

    for b in bufferD:
        cudart.cudaFree(b)

    print("Test <%s> finish!" % testCase)
Example No. 5
# Use Profile 0
print("Use Profile 0")
context.set_optimization_profile_async(0, stream)
cudart.cudaStreamSynchronize(stream)
#context.active_optimization_profile = 0  # equivalent way to select the profile without a stream, but deprecated
context.set_binding_shape(0, [nIn, cIn, hIn, wIn])
print("Context binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
for i in range(engine.num_bindings):
    print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))

inputH0 = np.ascontiguousarray(data.reshape(-1))
outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
_, inputD0 = cudart.cudaMalloc(inputH0.nbytes)
_, outputD0 = cudart.cudaMalloc(outputH0.nbytes)

cudart.cudaMemcpy(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
context.execute_v2([int(inputD0), int(outputD0), int(0), int(0)])
cudart.cudaMemcpy(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)
print("check result:", np.all(outputH0 == -inputH0.reshape(nIn, cIn, hIn, wIn)))

# Use Profile 1
print("Use Profile 1")
context.set_optimization_profile_async(1, stream)
cudart.cudaStreamSynchronize(stream)
#context.active_optimization_profile = 1  # equivalent way to select the profile without a stream, but deprecated
context.set_binding_shape(2, [nIn, cIn, hIn, wIn])
print("Context binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
for i in range(engine.num_bindings):
    print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))

inputH1 = np.ascontiguousarray(data.reshape(-1))
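The snippet above relies on the binding layout used by these samples, in which every optimization profile owns its own copy of the bindings. A small sketch of how the binding offset follows from that (a hypothetical helper, not part of the sample):

def bindingOffset(engine, profileIndex):
    # each profile owns num_bindings // num_optimization_profiles consecutive binding slots (sketch)
    nBindingPerProfile = engine.num_bindings // engine.num_optimization_profiles
    return profileIndex * nBindingPerProfile

# With 2 profiles and 1 input + 1 output, profile 0 uses bindings 0 and 1 and profile 1 uses
# bindings 2 and 3, which is why profile 0 passes [inputD0, outputD0, 0, 0] to execute_v2
# and profile 1 calls set_binding_shape(2, ...) above.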
def run(shape, scalar):
    testCase = "<shape=%s,scalar=%f>" % (shape, scalar)
    trtFile = "./model-Dim%s.plan" % str(len(shape))
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        config.flags = 1 << int(trt.BuilderFlag.INT8)
        config.int8_calibrator = MyCalibrator(1, shape, cacheFile)

        inputT0 = network.add_input('inputT0', trt.float32,
                                    [-1 for i in shape])
        profile.set_shape(inputT0.name, [1 for i in shape], [8 for i in shape],
                          [32 for i in shape])
        config.add_optimization_profile(profile)
        #inputT0.dynamic_range = [-100, 100]  # the dynamic range must be set manually when no calibrator is used

        pluginLayer = network.add_plugin_v2([inputT0],
                                            getAddScalarPlugin(scalar))
        pluginLayer.precision = trt.int8
        pluginLayer.set_output_type(0, trt.int8)
        pluginLayer.get_output(0).dtype = trt.int8
        #pluginLayer.get_output(0).dynamic_range = [-120,120]

        identityLayer = network.add_identity(
            pluginLayer.get_output(0))  # cast back to float32 manually; otherwise the caller must handle the int8 output itself
        identityLayer.get_output(0).dtype = trt.float32

        network.mark_output(identityLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, shape)
    #print("Binding all? %s"%(["No","Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    #for i in range(engine.num_bindings):
    #    print("Bind[%2d]:i[%d]->"%(i,i) if engine.binding_is_input(i) else "Bind[%2d]:o[%d]->"%(i,i-nInput),
    #            engine.get_binding_dtype(i),engine.get_binding_shape(i),context.get_binding_shape(i),engine.get_binding_name(i))

    bufferH = []
    bufferH.append(
        np.random.rand(np.prod(shape)).astype(np.float32).reshape(shape) *
        200 - 100)
    for i in range(nOutput):
        bufferH.append(
            np.empty(context.get_binding_shape(nInput + i),
                     dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    for i in range(nInput):
        cudart.cudaMemcpy(
            bufferD[i],
            np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data,
            bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)

    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i],
                          bufferH[nInput + i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    outputCPU = addScalarCPU(bufferH[:nInput], scalar)
    '''
    for i in range(nInput):
        printArrayInfo(bufferH[i])
    for i in range(nOutput):
        printArrayInfo(bufferH[nInput+i])
    for i in range(nOutput):
        printArrayInfo(outputCPU[i])
    '''
    check(bufferH[nInput:][0], outputCPU[0], True)

    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
Example No. 7
    print("Bind[%2d]:o[%2d]->" % (i, i - nInput), engine.get_binding_dtype(i),
          engine.get_binding_shape(i), context.get_binding_shape(i),
          engine.get_binding_name(i))

bufferH = []
bufferH.append(np.ascontiguousarray(data.reshape(-1)))
for i in range(nInput, nInput + nOutput):
    bufferH.append(
        np.empty(context.get_binding_shape(i),
                 dtype=trt.nptype(engine.get_binding_dtype(i))))
bufferD = []
for i in range(nInput + nOutput):
    bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

for i in range(nInput):
    cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes,
                      cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

context.execute_v2(bufferD)

for i in range(nInput, nInput + nOutput):
    cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes,
                      cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

for i in range(10):
    context.execute_v2(bufferD)

for i in range(nInput + nOutput):
    print(engine.get_binding_name(i))

for b in bufferD:
    cudart.cudaFree(b)
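The ten extra execute_v2 calls above look like a warm-up/benchmark loop with the measurement stripped out. A sketch of how such a loop is usually timed (reusing context and bufferD from the fragment; iteration counts are assumptions):

from time import time_ns

nWarm, nTest = 10, 30  # assumed iteration counts
for _ in range(nWarm):  # warm-up
    context.execute_v2(bufferD)
t0 = time_ns()
for _ in range(nTest):  # execute_v2 is synchronous, so wall-clock timing is valid
    context.execute_v2(bufferD)
t1 = time_ns()
print("average latency: %.3f ms" % ((t1 - t0) / 1e6 / nTest))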
Example No. 8
def run(useTimeCache):
    logger = trt.Logger(trt.Logger.ERROR)
    timeCache = b""
    if useTimeCache and os.path.isfile(timeCacheFile):
        with open(timeCacheFile, 'rb') as f:
            timeCache = f.read()
        if timeCache is None:
            print("Failed getting serialized timing cache!")
            return
        print("Succeeded getting serialized timing cache!")

    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    profile = builder.create_optimization_profile()
    config = builder.create_builder_config()
    config.max_workspace_size = 6 << 30
    if useTimeCache:
        cache = config.create_timing_cache(timeCache)
        config.set_timing_cache(cache, False)

    inputTensor = network.add_input('inputT0', trt.float32, [-1, 1, 28, 28])
    profile.set_shape(inputTensor.name, (1, 1, 28, 28), (4, 1, 28, 28),
                      (8, 1, 28, 28))
    config.add_optimization_profile(profile)

    np.random.seed(97)  # keep the weights identical across runs
    w = np.random.rand(32, 1, 5, 5).astype(np.float32).reshape(-1)
    b = np.random.rand(32).astype(np.float32).reshape(-1)
    _0 = network.add_convolution_nd(inputTensor, 32, [5, 5], w, b)
    _0.padding_nd = [2, 2]
    _1 = network.add_activation(_0.get_output(0), trt.ActivationType.RELU)
    _2 = network.add_pooling_nd(_1.get_output(0), trt.PoolingType.MAX, [2, 2])
    _2.stride_nd = [2, 2]

    w = np.random.rand(64, 32, 5, 5).astype(np.float32).reshape(-1)
    b = np.random.rand(64).astype(np.float32).reshape(-1)
    _3 = network.add_convolution_nd(_2.get_output(0), 64, [5, 5], w, b)
    _3.padding_nd = [2, 2]
    _4 = network.add_activation(_3.get_output(0), trt.ActivationType.RELU)
    _5 = network.add_pooling_nd(_4.get_output(0), trt.PoolingType.MAX, [2, 2])
    _5.stride_nd = [2, 2]

    _6 = network.add_shuffle(_5.get_output(0))
    _6.first_transpose = (0, 2, 3, 1)
    _6.reshape_dims = (-1, 64 * 7 * 7, 1, 1)

    w = np.random.rand(1024, 64 * 7 * 7).astype(np.float32).reshape(-1)
    b = np.random.rand(1024).astype(np.float32).reshape(-1)
    _7 = network.add_fully_connected(_6.get_output(0), 1024, w, b)
    _8 = network.add_activation(_7.get_output(0), trt.ActivationType.RELU)

    w = np.random.rand(10, 1024).astype(np.float32).reshape(-1)
    b = np.random.rand(10).astype(np.float32).reshape(-1)
    _9 = network.add_fully_connected(_8.get_output(0), 10, w, b)
    _10 = network.add_activation(_9.get_output(0), trt.ActivationType.RELU)

    _11 = network.add_shuffle(_10.get_output(0))
    _11.reshape_dims = [-1, 10]

    _12 = network.add_softmax(_11.get_output(0))
    _12.axes = 1 << 1

    _13 = network.add_topk(_12.get_output(0), trt.TopKOperation.MAX, 1, 1 << 1)

    network.mark_output(_13.get_output(1))

    t0 = time()
    engineString = builder.build_serialized_network(network, config)
    t1 = time()
    print("%s timing cache, %f ms" % ("With" if useTimeCache else "Without",
                                      (t1 - t0) * 1000))

    if useTimeCache and not os.path.isfile(timeCacheFile):
        timeCache = config.get_timing_cache()
        timeCacheString = timeCache.serialize()
        with open(timeCacheFile, 'wb') as f:
            f.write(timeCacheString)
            print("Succeeded saving .cache file!")

    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, [1, 1, 28, 28])
    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput

    bufferH = []
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(
            np.empty(context.get_binding_shape(i),
                     dtype=trt.nptype(engine.get_binding_dtype(i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data,
                          bufferH[i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)

    for i in range(nInput, nInput + nOutput):
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i],
                          bufferH[i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    #for i in range(nInput + nOutput):
    #    print(engine.get_binding_name(i))
    #    print(bufferH[i].reshape(context.get_binding_shape(i)))

    for b in bufferD:
        cudart.cudaFree(b)
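A typical driver for this example builds once without the timing cache and twice with it, so the second cached build can reuse the recorded tactic timings. A sketch (the timeCacheFile path is an assumption):

import os

if __name__ == "__main__":
    timeCacheFile = "./model.cache"  # assumed path
    if os.path.isfile(timeCacheFile):
        os.remove(timeCacheFile)     # start from a clean state
    run(False)  # baseline build, no timing cache
    run(True)   # builds and saves the cache
    run(True)   # reuses the cache; this build should be noticeably faster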
def run():
    logger = trt.Logger(trt.Logger.ERROR)                                       # create the Logger; available levels: VERBOSE, INFO, WARNING, ERROR, INTERNAL_ERROR
    if os.path.isfile(trtFile):                                                 # read the .plan file directly if it exists
        with open(trtFile, 'rb') as f:
            engineString = f.read()
        if engineString is None:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
    else:                                                                       # no .plan file, so build the network from scratch
        builder = trt.Builder(logger)                                           # network meta information: Builder / Network / BuilderConfig / Profile
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30

        inputTensor = network.add_input('inputT0', trt.float32, [-1, -1, -1])   # define the input tensor
        profile.set_shape(inputTensor.name, [1, 1, 1], [3, 4, 5], [6, 8, 10])   # set the dynamic-shape range of the input tensor
        config.add_optimization_profile(profile)

        identityLayer = network.add_identity(inputTensor)                       # identity layer (pass-through)
        network.mark_output(identityLayer.get_output(0))                        # mark the output tensor

        engineString = builder.build_serialized_network(network, config)        # build the serialized network
        if engineString is None:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
        with open(trtFile, 'wb') as f:                                          # save the serialized network as a .plan file
            f.write(engineString)
            print("Succeeded saving .plan file!")

    engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)          # create the engine with a Runtime
    if engine is None:
        print("Failed building engine!")
        return
    print("Succeeded building engine!")

    context = engine.create_execution_context()                                 # create the execution context (similar to a process on the GPU)
    context.set_binding_shape(0, [3, 4, 5])                                     # dynamic-shape mode requires binding the actual input shape
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])  # collect the engine binding information
    nOutput = engine.num_bindings - nInput
    for i in range(nInput):
        print("Bind[%2d]:i[%2d]->" % (i, i), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    for i in range(nInput, nInput + nOutput):
        print("Bind[%2d]:o[%2d]->" % (i, i - nInput), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))

    data = np.arange(3 * 4 * 5, dtype=np.float32).reshape(3, 4, 5)              # prepare data and host/device memory
    bufferH = []
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(np.empty(context.get_binding_shape(i), dtype=trt.nptype(engine.get_binding_dtype(i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    for i in range(nInput):                                                     # copy input data from host to device first
        cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)                                                 # run inference

    for i in range(nInput, nInput + nOutput):                                   # copy the results from device back to host
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    for i in range(nInput + nOutput):
        print(engine.get_binding_name(i))
        print(bufferH[i].reshape(context.get_binding_shape(i)))

    for b in bufferD:                                                           # free the device memory
        cudart.cudaFree(b)
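None of the cudart return codes in these examples are checked. Every cuda-python cudart call returns a tuple whose first element is a cudaError_t, so a small helper like the following sketch can be wrapped around the calls during development:

from cuda import cudart

def checkCuda(ret):
    # cuda-python cudart calls return (cudaError_t, value, ...) tuples (sketch helper)
    err = ret[0] if isinstance(ret, tuple) else ret
    if err != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError("CUDA runtime error: %s" % str(err))
    return ret[1] if isinstance(ret, tuple) and len(ret) == 2 else None

# usage: dIn = checkCuda(cudart.cudaMalloc(bufferH[0].nbytes))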
Example No. 10
def test(engine, context, nBatchSize):
    nProfile = engine.num_optimization_profiles
    if nProfile == 1:
        bindingBias = 0
    else:
        if nBatchSize <= 4:
            bindingBias = 0
            context.set_optimization_profile_async(0, 0)
            cudart.cudaStreamSynchronize(0)
        else:
            bindingBias = 2
            context.set_optimization_profile_async(1, 0)
            cudart.cudaStreamSynchronize(0)

    context.set_binding_shape(bindingBias, [nBatchSize, 1])
    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    for i in range(nInput):
        print("Bind[%2d]:i[%2d]->" % (i, i), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))
    for i in range(nInput, nInput + nOutput):
        print("Bind[%2d]:o[%2d]->" % (i, i - nInput), engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))

    nInput = nInput // nProfile
    nOutput = nOutput // nProfile

    data = np.random.rand(nBatchSize).reshape(nBatchSize, 1).astype(np.float32)
    bufferH = []
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(np.empty(context.get_binding_shape(bindingBias + i), dtype=trt.nptype(engine.get_binding_dtype(bindingBias + i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    if nProfile == 1 or nBatchSize <= 4:
        bufferD = bufferD + [int(0), int(0)]
    else:
        bufferD = [int(0), int(0)] + bufferD

    for i in range(nInput):
        cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    context.execute_v2(bufferD)
    for i in range(nInput, nInput + nOutput):
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    for i in range(nWarm):
        context.execute_v2(bufferD)

    t0 = time_ns()
    for i in range(nTest):
        context.execute_v2(bufferD)
    t1 = time_ns()
    print("+---- BatchSize=%2d: %.4fms\n" % (nBatchSize, (t1 - t0) / 1e6 / nTest))

    if nProfile == 1 or nBatchSize <= 4:
        bufferD = bufferD[:2]
    else:
        bufferD = bufferD[-2:]

    for b in bufferD:
        cudart.cudaFree(b)
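test is meant to be driven across batch sizes that span both optimization profiles; a minimal usage sketch, assuming the engine was built with profile 0 covering small batches (<= 4) and profile 1 covering larger ones, as the bindingBias logic implies:

for nBatchSize in [1, 2, 4, 8, 16, 32]:
    test(engine, context, nBatchSize)  # profile 0 for batch <= 4, profile 1 otherwise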
def run(shape, scalar):
    testCase = "<shape=%s,scalar=%f>" % (shape, scalar)
    trtFile = "./model-Shape[%s].plan" % (
        "".join([str(i) + "-" for i in shape[:-1]]) + str(shape[-1]))
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        builder.max_batch_size = 32
        network = builder.create_network()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30

        inputT0 = network.add_input('inputT0', trt.float32, shape[1:])
        pluginLayer = network.add_plugin_v2([inputT0],
                                            getAddScalarPlugin(scalar))
        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    #print("Binding all? %s"%(["No","Yes"][int(context.all_binding_shapes_specified)]))
    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    #for i in range(engine.num_bindings):
    #    print("Bind[%2d]:i[%d]->"%(i,i) if engine.binding_is_input(i) else "Bind[%2d]:o[%d]->"%(i,i-nInput),
    #            engine.get_binding_dtype(i),engine.get_binding_shape(i),context.get_binding_shape(i),engine.get_binding_name(i))

    bufferH = []
    bufferH.append(np.arange(np.prod(shape), dtype=np.float32).reshape(shape))
    for i in range(nOutput):
        bufferH.append(
            np.empty(
                (shape[0], ) + tuple(context.get_binding_shape(nInput + i)),
                dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    for i in range(nInput):
        cudart.cudaMemcpy(
            bufferD[i],
            np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data,
            bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute(shape[0], bufferD)

    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i],
                          bufferH[nInput + i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    outputCPU = addScalarCPU(bufferH[:nInput], scalar)
    '''
    for i in range(nInput):
        printArrayInfo(bufferH[i])
    for i in range(nOutput):
        printArrayInfo(bufferH[nInput+i])
    for i in range(nOutput):
        printArrayInfo(outputCPU[i])
    '''
    check(bufferH[nInput:][0], outputCPU[0], True)

    for buffer in bufferD:
        cudart.cudaFree(buffer)
    print("Test %s finish!\n" % testCase)
def run(shape0, shape1, scalar):
    testCase = "<shape0:%s,shape1:%s,scalar=%f>" % (shape0, shape1, scalar)
    trtFile = "./model-Dims" + str(len(shape0)) + ".plan"
    print("\nTest", testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile0 = builder.create_optimization_profile()
        profile1 = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30
        config.flags = 1 << int(trt.BuilderFlag.FP16)  # comment out this line and the plugin runs in FP32 only

        inputT0 = network.add_input('inputT0', trt.float32, [-1 for i in shape0])
        profile0.set_shape(inputT0.name, [1 for i in shape0], [8 for i in shape0], [32 for i in shape0])
        config.add_optimization_profile(profile0)
        profile1.set_shape(inputT0.name, [1 for i in shape1], [8 for i in shape1], [32 for i in shape1])
        config.add_optimization_profile(profile1)

        pluginLayer = network.add_plugin_v2([inputT0], getAddScalarPlugin(scalar))

        network.mark_output(pluginLayer.get_output(0))
        engineString = builder.build_serialized_network(network, config)
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    stream = 0  # use the default CUDA stream
    cudart.cudaStreamSynchronize(stream)

    # Use Profile 0
    print("Use Profile 0")
    context.set_optimization_profile_async(0, stream)
    cudart.cudaStreamSynchronize(stream)
    #context.active_optimization_profile = 0  # equivalent way to select the profile without a stream, but deprecated
    context.set_binding_shape(0, shape0)
    print("Context binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
    for i in range(engine.num_bindings):
        print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))

    data = np.random.rand(np.prod(shape0)).reshape(shape0).astype(np.float32) * 2 - 1
    inputH0 = np.ascontiguousarray(data.reshape(-1))
    outputH0 = np.empty(context.get_binding_shape(1), dtype=trt.nptype(engine.get_binding_dtype(1)))
    _, inputD0 = cudart.cudaMalloc(inputH0.nbytes)
    _, outputD0 = cudart.cudaMalloc(outputH0.nbytes)

    cudart.cudaMemcpy(inputD0, inputH0.ctypes.data, inputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    print("before inference")
    context.execute_v2([int(inputD0), int(outputD0), int(0), int(0)])
    print("after inference")
    cudart.cudaMemcpy(outputH0.ctypes.data, outputD0, outputH0.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    # Use Profile 1
    print("Use Profile 1")
    context.set_optimization_profile_async(1, stream)
    cudart.cudaStreamSynchronize(stream)
    #context.active_optimization_profile = 1  # equivalent way to select the profile without a stream, but deprecated
    context.set_binding_shape(2, shape1)
    print("Context binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))
    for i in range(engine.num_bindings):
        print(i, "Input " if engine.binding_is_input(i) else "Output", engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))

    data = np.random.rand(np.prod(shape1)).reshape(shape1).astype(np.float32) * 2 - 1
    inputH1 = np.ascontiguousarray(data.reshape(-1))
    outputH1 = np.empty(context.get_binding_shape(2), dtype=trt.nptype(engine.get_binding_dtype(2)))
    _, inputD1 = cudart.cudaMalloc(inputH1.nbytes)
    _, outputD1 = cudart.cudaMalloc(outputH1.nbytes)

    cudart.cudaMemcpy(inputD1, inputH1.ctypes.data, inputH1.nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
    print("before inference")
    context.execute_v2([int(0), int(0), int(inputD1), int(outputD1)])
    print("after inference")
    cudart.cudaMemcpy(outputH1.ctypes.data, outputD1, outputH1.nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    cudart.cudaFree(inputD0)
    cudart.cudaFree(inputD1)
    cudart.cudaFree(outputD0)
    cudart.cudaFree(outputD1)
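Unlike the earlier examples, this one never verifies its results. Since the host buffers survive the cudaFree calls, a short check against the CPU reference (reusing addScalarCPU and check from the other samples) could be appended at the end of run():

    # optional verification, mirroring the earlier examples (sketch)
    check(outputH0, addScalarCPU([inputH0.reshape(shape0)], scalar)[0], True)
    check(outputH1, addScalarCPU([inputH1.reshape(shape1)], scalar)[0], True)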