def allocate_buffers(engine):
    """Create host/device buffers and a CUDA stream for bindings 0 and 1.

    Host buffers are page-locked arrays sized to the volume of each binding
    shape and typed from ``ModelData.DTYPE``; each device buffer has the same
    byte size as its host counterpart.

    Returns:
        Tuple ``(h_input, d_input, h_output, d_output, stream)``.
    """
    np_dtype = trt.nptype(ModelData.DTYPE)

    # Page-locked host memory for the input (binding 0) and output (binding 1).
    input_elems = trt.volume(engine.get_binding_shape(0))
    h_input = cuda.pagelocked_empty(input_elems, dtype=np_dtype)
    output_elems = trt.volume(engine.get_binding_shape(1))
    h_output = cuda.pagelocked_empty(output_elems, dtype=np_dtype)

    # Device memory mirrors the host buffers byte for byte.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    # Stream used by the caller for the copies and the inference itself.
    stream = cuda.Stream()
    return h_input, d_input, h_output, d_output, stream
Example #2
0
    def inference(self, resized_rgb_image) -> tuple:
        """
        Run the TensorRT engine on every image and return class ids with their scores.

        The engine stored at ``self.model_path`` is deserialized, and an execution
        context, a CUDA stream and host/device buffers for bindings 0 and 1 are
        created on each call. Images are then pushed through the engine one at a
        time. ``self.fps`` is updated after every image from the measured
        copy + execute + copy time.

        NOTE(review): the engine and buffers are rebuilt on every call; consider
        moving that setup to the constructor if the class allows it.

        Args:
            resized_rgb_image: Array of images with shape (no_images, img_height, img_width, channels)
        Returns:
            result: List of class id for each input image. ex: [0, 0, 1, 1, 0]
            scores: The engine output value at the predicted class id for each
                input image. ex: [.99, .75, .80, 1.0]
        """
        self.INPUT_DATA_TYPE = np.float32
        self.trt_logger = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(self.trt_logger)
        with open(self.model_path, "rb") as f:
            self.engine = runtime.deserialize_cuda_engine(f.read())

        self.context = self.engine.create_execution_context()

        self.stream = cuda.Stream()

        self.host_in = cuda.pagelocked_empty(trt.volume(self.engine.get_binding_shape(0)), dtype=self.INPUT_DATA_TYPE)
        self.host_out = cuda.pagelocked_empty(trt.volume(self.engine.get_binding_shape(1)), dtype=self.INPUT_DATA_TYPE)
        self.devide_in = cuda.mem_alloc(self.host_in.nbytes)
        self.devide_out = cuda.mem_alloc(self.host_out.nbytes)

        if np.shape(resized_rgb_image)[0] == 0:
            return [], []

        # The device pointers do not change between images, so build the list once.
        bindings = [int(self.devide_in), int(self.devide_out)]
        result = []
        scores = []
        for img in resized_rgb_image:
            img = np.expand_dims(img, axis=0)
            np.copyto(self.host_in, img.ravel())
            t_begin = time.perf_counter()
            cuda.memcpy_htod_async(self.devide_in, self.host_in, self.stream)
            self.context.execute_async(bindings=bindings, stream_handle=self.stream.handle)
            cuda.memcpy_dtoh_async(self.host_out, self.devide_out, self.stream)
            self.stream.synchronize()
            inference_time = time.perf_counter() - t_begin  # Seconds
            self.fps = convert_infr_time_to_fps(inference_time)

            pred = np.argmax(self.host_out)
            result.append(pred)
            # self.host_out is reused for every image, so the score must be read
            # now; keeping a reference to the buffer would make every earlier
            # image report the values produced by the last one.
            scores.append(self.host_out[pred])

        return result, scores
Example #3
0
def evaluate(asr_model, asr_onnx, labels_map, wer, qat):
    """Score a TensorRT build of ``asr_model`` on its test dataloader.

    An engine is built with ``build_trt_engine`` and deserialized, and device
    buffers are sized from the maximum shape of optimization profile 0. Every
    test batch is then preprocessed, run through ``trt_inference`` and decoded,
    while the reference transcripts are rebuilt from the label ids.

    Returns:
        The value returned by ``word_error_rate`` for all collected
        hypotheses and references.
    """
    predicted_texts = []
    target_texts = []
    stream = cuda.Stream()
    # One more class than the label map holds.
    vocabulary_size = len(labels_map) + 1
    engine_file_path = build_trt_engine(asr_model, asr_onnx, qat)
    with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        trt_engine = runtime.deserialize_cuda_engine(f.read())
        trt_ctx = trt_engine.create_execution_context()

        # (min, opt, max) shapes of the first binding in profile 0.
        shapes = trt_engine.get_profile_shape(profile_index=0, binding=0)
        print("profile shape min:{}, opt:{}, max:{}".format(
            shapes[0], shapes[1], shapes[2]))

        # Size both device buffers for the largest shape the profile allows.
        largest_in = shapes[2]
        d_input = cuda.mem_alloc(
            trt.volume(largest_in) * trt.float32.itemsize)
        largest_out = [
            largest_in[0], vocabulary_size, (largest_in[-1] + 1) // 2
        ]
        d_output = cuda.mem_alloc(
            trt.volume(largest_out) * trt.float32.itemsize)

        for test_batch in asr_model.test_dataloader():
            if can_gpu:
                test_batch = [x.cuda() for x in test_batch]
            features, feature_lengths = asr_model.preprocessor(
                input_signal=test_batch[0], length=test_batch[1])

            greedy_predictions = trt_inference(
                stream,
                trt_ctx,
                d_input,
                d_output,
                input_signal=features,
                input_signal_length=feature_lengths,
            )
            predicted_texts += wer.ctc_decoder_predictions_tensor(
                greedy_predictions)

            # Rebuild each reference string from its (length-trimmed) label ids.
            for batch_ind in range(greedy_predictions.shape[0]):
                seq_len = test_batch[3][batch_ind].cpu().detach().numpy()
                seq_ids = test_batch[2][batch_ind].cpu().detach().numpy()
                target_texts.append(
                    ''.join(labels_map[c] for c in seq_ids[0:seq_len]))
            del test_batch

        wer_value = word_error_rate(hypotheses=predicted_texts,
                                    references=target_texts,
                                    use_cer=wer.use_cer)

    return wer_value
Example #4
0
def alloc_buf(engine):
    """Allocate page-locked host buffers, device buffers and a stream.

    Sizes and dtypes are read from bindings 0 (input) and 1 (output) of
    ``engine``.

    Returns:
        Tuple ``(in_cpu, out_cpu, in_gpu, out_gpu, stream)``.
    """
    # Query element counts and numpy dtypes before allocating anything.
    elem_counts = [trt.volume(engine.get_binding_shape(i)) for i in (0, 1)]
    np_dtypes = [trt.nptype(engine.get_binding_dtype(i)) for i in (0, 1)]

    # Host (CPU) side: page-locked arrays.
    in_cpu = cuda.pagelocked_empty(elem_counts[0], np_dtypes[0])
    out_cpu = cuda.pagelocked_empty(elem_counts[1], np_dtypes[1])

    # Device (GPU) side: same number of bytes as the host arrays.
    in_gpu = cuda.mem_alloc(in_cpu.nbytes)
    out_gpu = cuda.mem_alloc(out_cpu.nbytes)

    return in_cpu, out_cpu, in_gpu, out_gpu, cuda.Stream()
def allocate_buffers(engine):
    """Allocate host/device buffers for bindings 0 and 1 plus a CUDA stream.

    Returns:
        Tuple ``(host_input, device_input, host_output, device_output, stream)``.
    """

    def _pinned_for(index):
        # Page-locked host array matching the binding's volume and dtype.
        n_elements = trt.volume(engine.get_binding_shape(index))
        np_dtype = trt.nptype(engine.get_binding_dtype(index))
        return cuda.pagelocked_empty(n_elements, np_dtype)

    host_input = _pinned_for(0)
    host_output = _pinned_for(1)

    device_input = cuda.mem_alloc(host_input.nbytes)
    device_output = cuda.mem_alloc(host_output.nbytes)

    return host_input, device_input, host_output, device_output, cuda.Stream()
Example #6
0
 def initialize_engine(self):
     """Allocate float32 I/O buffers, a stream and an execution context.

     Reads ``self.engine`` and stores the results on ``self`` as ``h_input``,
     ``h_output``, ``d_input``, ``d_output``, ``stream`` and
     ``execution_context``.
     """
     print("initializing engine")
     engine = self.engine

     # Page-locked host arrays sized from bindings 0 and 1.
     input_elems = trt.volume(engine.get_binding_shape(0))
     self.h_input = cuda.pagelocked_empty(input_elems, dtype=np.float32)
     output_elems = trt.volume(engine.get_binding_shape(1))
     self.h_output = cuda.pagelocked_empty(output_elems, dtype=np.float32)

     # Matching device allocations.
     self.d_input = cuda.mem_alloc(self.h_input.nbytes)
     self.d_output = cuda.mem_alloc(self.h_output.nbytes)

     self.stream = cuda.Stream()
     self.execution_context = engine.create_execution_context()
     print("engine initialized")
Example #7
0
 def _context_init(self):
     """Allocate I/O buffers for ``self.trt_engine`` and create its context.

     Buffers are sized as binding volume times ``max_batch_size``. The
     execution context is created with optimization profile 0 active, and a
     CUDA stream is stored on ``self.stream``.
     """
     engine = self.trt_engine

     # Host input buffer (binding 0).
     input_elems = trt.volume(engine.get_binding_shape(0)) * engine.max_batch_size
     self.input_dtype = trt.nptype(engine.get_binding_dtype(0))
     self.host_input = cuda.pagelocked_empty(input_elems,
                                             dtype=self.input_dtype)

     # Host output buffer (binding 1).
     output_elems = trt.volume(engine.get_binding_shape(1)) * engine.max_batch_size
     output_dtype = trt.nptype(engine.get_binding_dtype(1))
     self.host_output = cuda.pagelocked_empty(output_elems,
                                              dtype=output_dtype)

     # Device memory with the same byte sizes as the host buffers.
     self.cuda_input = cuda.mem_alloc(self.host_input.nbytes)
     self.cuda_output = cuda.mem_alloc(self.host_output.nbytes)

     self.context = engine.create_execution_context()
     self.context.active_optimization_profile = 0
     self.stream = cuda.Stream()
    def allocate_buffers(self, engine):
        """Allocate host/device buffers for bindings 0 and 1 of ``engine``.

        Returns:
            Tuple ``(stream, h_input, d_input, h_output, d_output)``.
        """
        print('allocate buffers')

        # Page-locked host arrays using each binding's own volume and dtype.
        input_elems = trt.volume(engine.get_binding_shape(0))
        input_dtype = trt.nptype(engine.get_binding_dtype(0))
        h_input = cuda.pagelocked_empty(input_elems, input_dtype)

        output_elems = trt.volume(engine.get_binding_shape(1))
        output_dtype = trt.nptype(engine.get_binding_dtype(1))
        h_output = cuda.pagelocked_empty(output_elems, output_dtype)

        # Device buffers of identical byte size.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)

        return cuda.Stream(), h_input, d_input, h_output, d_output
Example #9
0
def allocate_buffers(engine, batch_size, data_type):
    """
    Allocate input and output buffers on the host and the device.

    Each buffer holds ``batch_size`` times the volume of its binding shape
    (binding 0 for input, binding 1 for output), with elements of the numpy
    type corresponding to ``data_type``.

    Returns:
        Tuple ``(h_input, d_input, h_output, d_output, stream)``.
    """
    input_elems = batch_size * trt.volume(engine.get_binding_shape(0))
    h_input = cuda.pagelocked_empty(input_elems, dtype=trt.nptype(data_type))

    output_elems = batch_size * trt.volume(engine.get_binding_shape(1))
    h_output = cuda.pagelocked_empty(output_elems, dtype=trt.nptype(data_type))

    # Device allocations mirror the host buffers' byte sizes.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    return h_input, d_input, h_output, d_output, cuda.Stream()
Example #10
0
def allocate_buffers(engine):
    """Allocates all host/device in/out buffers required for an engine.

    The engine must expose 3 or 4 bindings, and every output buffer's element
    count must be a multiple of 7; both conditions are enforced with asserts.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` hold ``HostDeviceMem`` pairs and ``bindings`` holds the
        device addresses as ints, in engine iteration order.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    # expect 1 input, plus 2 or 3 outputs
    assert 3 <= len(engine) <= 4
    for binding in engine:
        n_elements = (trt.volume(engine.get_binding_shape(binding))
                      * engine.max_batch_size)
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Page-locked host array plus a device buffer of the same byte size.
        host_mem = cuda.pagelocked_empty(n_elements, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))

        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
            continue

        # each grid has 3 anchors, each anchor generates a detection
        # output of 7 float32 values
        assert n_elements % 7 == 0
        outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
Example #11
0
    def __init__(self, model):
        """Load a serialized TensorRT engine and prepare its float32 buffers.

        Args:
            model: Path of the serialized engine file to read.
        """
        # Initialize TRT environment
        self.input_shape = (300, 300)
        trt_logger = trt.Logger(trt.Logger.INFO)
        trt.init_libnvinfer_plugins(trt_logger, '')
        with open(model, 'rb') as engine_file, trt.Runtime(trt_logger) as runtime:
            engine = runtime.deserialize_cuda_engine(engine_file.read())

        self.host_inputs = []
        self.cuda_inputs = []
        self.host_outputs = []
        self.cuda_outputs = []
        self.bindings = []
        self.stream = cuda.Stream()

        for binding in engine:
            n_elements = (trt.volume(engine.get_binding_shape(binding))
                          * engine.max_batch_size)
            host_buf = cuda.pagelocked_empty(n_elements, np.float32)
            device_buf = cuda.mem_alloc(host_buf.nbytes)
            self.bindings.append(int(device_buf))

            # Pick the pair of lists matching the binding direction.
            if engine.binding_is_input(binding):
                host_list, device_list = self.host_inputs, self.cuda_inputs
            else:
                host_list, device_list = self.host_outputs, self.cuda_outputs
            host_list.append(host_buf)
            device_list.append(device_buf)

        self.context = engine.create_execution_context()

        self.watch = Stopwatch()
 def _allocate_buffers(self, context):
     """
     Allocate host and device buffers for every binding of ``self._engine``.

     :param context: not used by this method.
     :return: tuple ``(inputs, outputs, bindings, stream)``; ``inputs`` and
         ``outputs`` hold ``HostDeviceMem`` pairs, ``bindings`` holds the
         device addresses as ints.
     """
     engine = self._engine
     inputs, outputs, bindings = [], [], []
     stream = cuda.Stream()
     for binding in engine:
         n_elements = (trt.volume(engine.get_binding_shape(binding))
                       * engine.max_batch_size)
         np_dtype = trt.nptype(engine.get_binding_dtype(binding))

         # Page-locked host array and a device buffer of the same byte size.
         host_mem = cuda.pagelocked_empty(n_elements, np_dtype)
         device_mem = cuda.mem_alloc(host_mem.nbytes)
         bindings.append(int(device_mem))

         # File the pair under inputs or outputs.
         target = inputs if engine.binding_is_input(binding) else outputs
         target.append(HostDeviceMem(host_mem, device_mem))
     return inputs, outputs, bindings, stream
Example #13
0
    def get_batch(self, names):
        """Load the next calibration batch onto the device.

        Reads ``self.batch_size`` images starting at ``self.counter`` from
        ``self.calib_imgs``, resizes them to the model's spatial size, converts
        BGR to RGB and HWC to CHW, scales pixels to [-1.0, 1.0] and copies the
        flattened float32 batch into ``self.device_input``.

        Args:
            names: Not used by this method.

        Returns:
            A one-element list with the device input address as an int, or
            ``None`` once fewer than ``batch_size`` images remain.
        """
        # if there are not enough calibration images to form a batch,
        # we have reached the end of our data set. Comparing with `==` only
        # stopped when the image count was an exact multiple of the batch
        # size; otherwise the final partial batch indexed past the list end.
        if self.counter + self.batch_size > self.num_calib_imgs:
            return None

        # Build the batch directly as float32 so no extra converted copy is
        # needed before the host-to-device transfer.
        batch_imgs = np.zeros((self.batch_size, trt.volume(self.model_shape)),
                              dtype=np.float32)
        for i in range(self.batch_size):
            img = cv2.imread(self.calib_imgs[self.counter + i])
            img = cv2.resize(img, (self.model_shape[2], self.model_shape[1]))
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            # HWC -> CHW
            img = img.transpose((2, 0, 1))
            # Normalize to [-1.0, 1.0] interval (expected by model)
            img = (2.0 / 255.0) * img - 1.0
            # add this image to the batch array
            batch_imgs[i, :] = img.ravel()

        # increase the counter for this batch
        self.counter += self.batch_size

        # Copy to device, then return a list containing pointers to input device buffers.
        cuda.memcpy_htod(self.device_input, batch_imgs)
        return [int(self.device_input)]
Example #14
0
    def _allocate_buffers(self, engine):
        """Allocate host/device buffers for every binding and record metadata.

        Buffer sizes use the absolute value of the binding volume (a dimension
        of -1 marks a dynamic shape) multiplied by ``engine.max_batch_size``.

        Returns:
            Tuple ``(bindings, stream, max_batch_size, inputs, input_shapes,
            input_names, outputs, out_shapes, out_names)``.
        """
        stream = cuda.Stream()
        bindings = []
        inputs, input_shapes, input_names = [], [], []
        outputs, out_shapes, out_names = [], [], []

        max_batch_size = engine.max_batch_size
        for binding in engine:
            # value == -1 in the shape means dynamic shape
            binding_shape = engine.get_binding_shape(binding)
            n_elements = abs(trt.volume(binding_shape)) * max_batch_size
            np_dtype = trt.nptype(engine.get_binding_dtype(binding))

            # Page-locked host array and a device buffer of the same byte size.
            host_mem = cuda.pagelocked_empty(n_elements, np_dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))

            # Route buffer, shape and name to the input or output collections.
            if engine.binding_is_input(binding):
                mems, shapes, names = inputs, input_shapes, input_names
            else:
                mems, shapes, names = outputs, out_shapes, out_names
            mems.append(HostDeviceMem(host_mem, device_mem))
            shapes.append(binding_shape)
            names.append(binding)
        return bindings, stream, max_batch_size, inputs, input_shapes, input_names, outputs, out_shapes, out_names
Example #15
0
    def __init__(self,
                 images,
                 width=256,
                 height=256,
                 channel=3,
                 batch_size=1,
                 cache_file='./{}.cache'.format('int8')):
        """
        Set up the calibrator state and its device input buffer.

        :param images: type: list, e.g: [img1.jpg, img2.jpg, ...]; must be a
            non-empty list
        :param width: width used when sizing the device buffer
        :param height: height used when sizing the device buffer
        :param channel: channel count used when sizing the device buffer
        :param batch_size: number of images per calibration batch
        :param cache_file: stored on ``self.cache_file``
        """
        super(ImageCalibrator, self).__init__()
        self.cache_file = cache_file
        self.batch_size = batch_size
        self.channel = channel
        self.height = height
        self.width = width

        assert isinstance(images, list) and len(images) > 0
        self.imgs = images
        self.batch_idx = 0
        # Only whole batches are counted.
        self.max_batch_idx = len(self.imgs) // self.batch_size

        # Bytes needed for one float32 batch laid out as (N, C, H, W).
        batch_dims = [self.batch_size, self.channel, self.height, self.width]
        self.data_size = trt.volume(batch_dims) * trt.float32.itemsize
        self.device_input = cuda.mem_alloc(self.data_size)
        self.one_batch = self.batch_generator()
Example #16
0
    def _setup_bindings(self, engine):
        """
        Allocate buffers for every binding and store them on the instance.

        Inputs go into the dict ``self._inputs`` keyed by binding name, outputs
        are appended to the list ``self._outputs``, and a CUDA stream is stored
        on ``self._stream``.

        :param engine: engine whose bindings are iterated
        """
        self._inputs = {}
        self._outputs = []
        self._stream = cuda.Stream()
        for binding in engine:
            # The first dimension is overwritten with the engine's max batch size.
            dims = engine.get_binding_shape(binding)
            dims[0] = engine.max_batch_size
            np_dtype = trt.nptype(engine.get_binding_dtype(binding))
            n_elements = trt.volume(dims)

            # Allocate host and device buffers
            # https://documen.tician.de/pycuda/util.html
            host_buf = cuda.pagelocked_empty(n_elements, np_dtype)
            device_buf = cuda.mem_alloc(host_buf.nbytes)

            if not engine.binding_is_input(binding):
                self._outputs.append(
                    MemoryBinding(binding, dims, np_dtype, host_buf,
                                  device_buf, False))
                continue
            self._inputs[binding] = MemoryBinding(binding, dims, np_dtype,
                                                  host_buf, device_buf, True)
    def __init__(self, loader, cache_file, c, h, w):
        """Prepare calibration data and the device buffer for one sample.

        Args:
            loader: Iterable of batches; each batch must provide ``.numpy()``.
            cache_file: Stored on ``self.cache_file``.
            c: Channel count of one calibration sample.
            h: Height of one calibration sample.
            w: Width of one calibration sample.
        """
        # Whenever you specify a custom constructor for a TensorRT class,
        # you MUST call the constructor of the parent explicitly.
        trt.IInt8EntropyCalibrator2.__init__(self)

        self.cache_file = cache_file

        # The sample shape depends only on the constructor arguments, so the
        # device buffer is allocated once here instead of once per loader
        # batch as before. Each element of the calibration data is a float32.
        self.shape = [1, c, h, w]
        self.device_input = cuda.mem_alloc(
            trt.volume(self.shape) * trt.float32.itemsize)

        # NOTE(review): every iteration overwrites self.all_files, so only the
        # last batch yielded by the loader is kept — confirm the loader is
        # expected to yield a single batch.
        for cal_data in loader:
            self.all_files = cal_data.numpy()

        # Create a generator that will give us batches. We can use next() to iterate over the result.
        def load_batches():
            for idx in range(len(self.all_files)):
                cal_data = self.read_batch_file(idx)
                yield cal_data

        self.batches = load_batches()
    def __init__(self, batch_data_dir, cache_file):
        """Index the batch files and allocate a device buffer for one batch.

        Args:
            batch_data_dir: Directory whose entries are used as batch files.
            cache_file: Stored on ``self.cache_file``.
        """
        # Whenever you specify a custom constructor for a TensorRT class,
        # you MUST call the constructor of the parent explicitly.
        trt.IInt8EntropyCalibrator2.__init__(self)

        self.cache_file = cache_file
        # Full path of every entry in the batch folder.
        self.batch_files = [
            os.path.join(batch_data_dir, entry)
            for entry in os.listdir(batch_data_dir)
        ]

        # Read the first batch to learn its shape, then size the device buffer.
        self.batch_size = 1
        self.batch_round = 100
        first_batch = self.read_batch_file(self.batch_files[0:self.batch_size])
        self.shape = first_batch.shape
        print(self.shape)
        # Each element of the calibration data is a float32.
        buffer_bytes = trt.volume(self.shape) * trt.float32.itemsize
        self.device_input = cuda.mem_alloc(buffer_bytes)

        # Generator yielding one batch per round; consume it with next().
        def load_batches():
            offset = 0
            for round_idx in range(self.batch_round):
                print("Start Calibration using batch {:d}".format(round_idx))
                window = self.batch_files[offset:offset + self.batch_size]
                yield self.read_batch_file(window)
                offset = offset + self.batch_size

        self.batches = load_batches()
Example #19
0
def alloc_buf(engine):
    """Allocate host/device buffers for bindings 0 and 1 and a CUDA stream.

    Both host buffers use the numpy type corresponding to the module-level
    ``DTYPE``.

    Returns:
        Tuple ``(h_input, h_output, d_input, d_output, stream)``.
    """
    np_dtype = trt.nptype(DTYPE)

    # Page-locked host arrays sized from the binding volumes.
    input_elems = trt.volume(engine.get_binding_shape(0))
    h_input = cuda.pagelocked_empty(input_elems, dtype=np_dtype)
    output_elems = trt.volume(engine.get_binding_shape(1))
    h_output = cuda.pagelocked_empty(output_elems, dtype=np_dtype)

    # Device memory for inputs and outputs, same byte sizes as the host side.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    return h_input, h_output, d_input, d_output, cuda.Stream()
Example #20
0
    def init_model(self, trt_path, ctx_id):
        """Create a CUDA context, load the engine and allocate its buffers.

        Args:
            trt_path: Path of the serialized engine file.
            ctx_id: Index of the CUDA device on which the context is created.
        """
        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        cuda.init()
        self.ctx = cuda.Device(ctx_id).make_context()
        with open(trt_path, "rb") as engine_file, trt.Runtime(TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(engine_file.read())

        self.input_buffs = {}
        self.output_buffs = {}
        self.bindings = []
        self.stream = cuda.Stream()
        for name in engine:
            shape = engine.get_binding_shape(name)
            n_elements = trt.volume(shape) * engine.max_batch_size
            np_dtype = trt.nptype(engine.get_binding_dtype(name))

            # Page-locked host array and a device buffer of the same byte size.
            host_mem = cuda.pagelocked_empty(n_elements, np_dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))

            # Buffers are stored by binding name, split by direction.
            if engine.binding_is_input(name):
                registry = self.input_buffs
            else:
                registry = self.output_buffs
            registry[name] = HostDeviceMem(host_mem, device_mem, shape)

        self.model = engine.create_execution_context()
        self.logger.info("Warmup up...")
        self.inference_loops(10)
    def __init__(self):
        """Load the bundled engine file and allocate buffers for each binding.

        Stores ``context``, ``inputs``, ``outputs``, ``bindings`` and ``stream``
        on the instance. Buffers are sized from the binding volume alone (no
        batch-size multiplier).
        """
        logger = trt.Logger(trt.Logger.INFO)
        model = 'models/yolov5s-simple-2.trt'

        with open(model, 'rb') as engine_file, trt.Runtime(logger) as runtime:
            engine = runtime.deserialize_cuda_engine(engine_file.read())

        self.context = engine.create_execution_context()

        # allocate memory
        host_device_inputs = []
        host_device_outputs = []
        device_addresses = []
        stream = cuda.Stream()
        for binding in engine:
            n_elements = trt.volume(engine.get_binding_shape(binding))
            np_dtype = trt.nptype(engine.get_binding_dtype(binding))
            host_mem = cuda.pagelocked_empty(n_elements, np_dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            device_addresses.append(int(device_mem))

            pair = {'host': host_mem, 'device': device_mem}
            if engine.binding_is_input(binding):
                host_device_inputs.append(pair)
            else:
                host_device_outputs.append(pair)

        # save to class
        self.inputs = host_device_inputs
        self.outputs = host_device_outputs
        self.bindings = device_addresses
        self.stream = stream
Example #22
0
    def allocate_buffers(self):
        """Allocates GPU memory for future use and creates an asynchronous stream

        Host buffers are page-locked arrays sized from bindings 0 and 1 of
        ``self.engine`` and typed from ``self.CONSTANTS["dtype"]``. Results are
        stored as ``h_input``, ``h_output``, ``d_input``, ``d_output`` and
        ``stream``.
        """
        engine = self.engine

        # page-locked host buffers (i.e. won't be swapped to disk) for host i/o
        input_elems = trt.volume(engine.get_binding_shape(0))
        self.h_input = cuda.pagelocked_empty(
            input_elems, dtype=trt.nptype(self.CONSTANTS["dtype"]))
        output_elems = trt.volume(engine.get_binding_shape(1))
        self.h_output = cuda.pagelocked_empty(
            output_elems, dtype=trt.nptype(self.CONSTANTS["dtype"]))

        # device memory matching the host buffers' byte sizes
        self.d_input = cuda.mem_alloc(self.h_input.nbytes)
        self.d_output = cuda.mem_alloc(self.h_output.nbytes)

        self.stream = cuda.Stream()
Example #23
0
def allocate_buffers(engine: trt.ICudaEngine, batch_size: int):
    """Allocate host/device buffers for every binding of ``engine``.

    Each buffer holds ``batch_size`` times the absolute value of the binding
    volume.

    Returns:
        Tuple ``(inputs, outputs, dbindings, stream)``; ``inputs`` and
        ``outputs`` hold ``HostDeviceMem`` pairs and ``dbindings`` the device
        addresses as ints.
    """
    print('Allocating buffers ...')

    inputs, outputs, dbindings = [], [], []
    stream = cuda.Stream()

    for binding in engine:
        n_elements = batch_size * abs(trt.volume(engine.get_binding_shape(binding)))
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Page-locked host array and a device buffer of the same byte size.
        host_mem = cuda.pagelocked_empty(n_elements, np_dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        dbindings.append(int(device_mem))

        # File the pair under inputs or outputs.
        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, dbindings, stream
Example #24
0
def __allocate_buffers(engine):
    """Allocates all buffers required for the specified engine.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``; ``inputs`` and
        ``outputs`` hold ``__HostDeviceTuple`` pairs and ``bindings`` the
        device addresses as ints.
    """
    inputs, outputs, bindings = [], [], []

    for binding in engine:
        # Element count of the binding, scaled by the engine's max batch size.
        n_elements = (trt.volume(engine.get_binding_shape(binding))
                      * engine.max_batch_size)
        # numpy equivalent of the binding's data type.
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Page-locked (pinned) host memory.
        host_mem = cuda.pagelocked_empty(n_elements, np_dtype)
        # Linear device memory of the same byte size.
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        bindings.append(int(device_mem))

        pair = __HostDeviceTuple(host_mem, device_mem)
        if engine.binding_is_input(binding):
            inputs.append(pair)
        else:
            outputs.append(pair)

    return inputs, outputs, bindings, cuda.Stream()
Example #25
0
    def _allocate_buffers(self):
        """Allocate host/device buffers for each binding of the engine.

        Populates ``self.inputs``, ``self.outputs`` (``HostDeviceMem`` pairs),
        ``self.bindings`` (device addresses as ints) and ``self.stream``.
        Element types come from a fixed name-to-dtype table, so a binding name
        outside that table raises ``KeyError``.
        """
        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.stream = cuda.Stream()

        # NMS implementation in TRT 6 only supports DataType.FLOAT
        binding_to_type = {
            "Input": np.float32,
            "NMS": np.float32,
            "NMS_1": np.int32
        }
        for binding in self.__trt_engine:
            dims = self.__trt_engine.get_binding_shape(binding)
            n_elements = trt.volume(dims) * self.__trt_engine.max_batch_size
            np_dtype = binding_to_type[str(binding)]

            # Page-locked host array and a device buffer of the same byte size.
            host_mem = cuda.pagelocked_empty(n_elements, np_dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            self.bindings.append(int(device_mem))

            # File the pair under inputs or outputs.
            if self.__trt_engine.binding_is_input(binding):
                destination = self.inputs
            else:
                destination = self.outputs
            destination.append(HostDeviceMem(host_mem, device_mem))
Example #26
0
def allocate_buffers(engine):
    """Allocate host/device buffers for every binding of ``engine``.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``; ``inputs`` and
        ``outputs`` hold ``HostDeviceMem`` pairs (see common.py for its
        implementation) and ``bindings`` the device addresses as ints.
    """
    inputs, outputs, bindings = [], [], []
    # Create a CUDA stream.
    stream = cuda.Stream()
    for binding in engine:
        # trt.volume computes the volume of an iterable and get_binding_shape
        # returns the dimensions of the binding; scaled by max_batch_size this
        # gives the largest element count the binding needs.
        n_elements = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # get_binding_dtype returns the data type of the binding.
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Page-locked memory on the host.
        host_mem = cuda.pagelocked_empty(n_elements, np_dtype)
        # Memory on the device, same byte size as the host buffer.
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Record the device buffer among the device bindings.
        bindings.append(int(device_mem))
        # Add the host/device pair to the list matching the binding direction.
        pair = HostDeviceMem(host_mem, device_mem)
        if engine.binding_is_input(binding):
            inputs.append(pair)
        else:
            outputs.append(pair)
    return inputs, outputs, bindings, stream
Example #27
0
def allocate_buffers(engine):
    """
    Allocates all buffers required for the specified engine

    Returns a tuple ``(inputs, outputs, bindings, stream)``; ``inputs`` and
    ``outputs`` hold ``HostDeviceMem`` pairs and ``bindings`` the device
    addresses as ints.
    """
    inputs, outputs, bindings = [], [], []
    # Iterate over binding names in engine
    for binding in engine:
        # Element count of the binding, scaled by the engine's max batch size
        n_elements = (trt.volume(engine.get_binding_shape(binding))
                      * engine.max_batch_size)
        # numpy equivalent of the binding's data type
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Page-locked (pinned) host memory
        host_mem = cuda.pagelocked_empty(n_elements, np_dtype)
        # Linear device memory of the same byte size
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        # File the pair under inputs or outputs
        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_mem, device_mem))
    # Stream the caller can use to copy inputs/outputs and run inference
    return inputs, outputs, bindings, cuda.Stream()
Example #28
0
def allocate_buffers(engine, is_explicit_batch=False, dynamic_shapes=None):
    """Allocate host/device buffers for every binding of ``engine``.

    Args:
        engine: Engine whose bindings are iterated.
        is_explicit_batch: Not used by this function.
        dynamic_shapes: Sequence whose first element replaces a leading
            dimension of -1 in a binding shape. ``None`` (the default) is
            treated as empty.

    Returns:
        Tuple ``(inputs, outputs, bindings)``; ``inputs`` and ``outputs`` hold
        ``HostDeviceMem`` pairs and ``bindings`` the device addresses as ints.

    Raises:
        AssertionError: If a binding has a leading dimension of -1 and
            ``dynamic_shapes`` is empty.
    """
    # A None sentinel replaces the former shared mutable default list.
    if dynamic_shapes is None:
        dynamic_shapes = ()

    inputs = []
    outputs = []
    bindings = []

    class HostDeviceMem(object):
        """Pair of a host buffer and its device counterpart."""

        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

        def __repr__(self):
            return self.__str__()

    for binding in engine:
        dims = engine.get_binding_shape(binding)
        # A leading -1 is substituted with the caller-provided size.
        if dims[0] == -1:
            assert len(dynamic_shapes) > 0
            dims[0] = dynamic_shapes[0]
        size = trt.volume(dims) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
Example #29
0
def allocate_buffers(engine, batch_size):
    """Allocate host/device buffers for every binding of ``engine``.

    Each buffer holds ``batch_size`` times the absolute value of the binding
    volume, so shapes containing dynamic (-1) dimensions still produce a
    non-negative element count.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``; ``inputs`` and
        ``outputs`` hold ``HostDeviceMem`` pairs and ``bindings`` the device
        addresses as ints.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        dims = engine.get_binding_shape(binding)

        # Dynamic dimensions are reported as -1, which makes the volume
        # negative when an odd number of them is present. Flipping the sign
        # only when dims[0] < 0 gave a negative size if the dynamic dim was
        # not the first one, or if two dims were dynamic; abs() covers every
        # combination.
        size = abs(trt.volume(dims)) * batch_size

        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
Example #30
0
def main_tensorrt():
    """Executes TensorRT test board predictions.

    Deserializes the engine at ``MODEL_PATH_TRT``, allocates its buffers and
    passes a prediction callback to ``test_predict_board``.

    Raises:
        ImportError: If ``cuda`` or ``trt`` is ``None``.
    """
    print("TensorRT predictions")
    if cuda is None or trt is None:
        raise ImportError("Unable to import pycuda or tensorrt")

    logger = trt.Logger(trt.Logger.VERBOSE)
    # Read and deserialize the serialized ICudaEngine
    with open(MODEL_PATH_TRT, 'rb') as engine_file, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(engine_file.read())

    inputs, outputs, bindings, stream = __allocate_buffers(engine)

    # One flattened image per row, one row per slot of the engine's max batch.
    pixels_per_image = trt.volume((IMG_SIZE_TRT, IMG_SIZE_TRT, 3))
    img_array = np.zeros((engine.max_batch_size, pixels_per_image))

    # Create an IExecutionContext (context for executing inference)
    with engine.create_execution_context() as context:

        def obtain_pieces_probs(pieces):
            # Assuming batch size == 64
            for row, piece in enumerate(pieces):
                loaded = load_image(piece, IMG_SIZE_TRT, PRE_INPUT_TRT)
                img_array[row] = loaded.ravel()
            np.copyto(inputs[0].host, img_array.ravel())
            all_outputs = __infer(context, bindings, inputs, outputs, stream)
            trt_outputs = all_outputs[-1]

            # Split the flat output into 64 consecutive chunks of 13 values.
            return [trt_outputs[13 * k:13 * k + 13] for k in range(64)]

        test_predict_board(obtain_pieces_probs)