def infer(context, input_img, output_size, batch_size):
    # load engine
    engine = context.get_engine()
    assert engine.get_nb_bindings() == 2
    # convert input data to float32
    input_img = input_img.astype(np.float32)

    # allocate page-locked host memory to receive the output
    output = cuda.pagelocked_empty(output_size, dtype=np.float32)

    # allocate device memory (itemsize is in bytes)
    d_input = cuda.mem_alloc(batch_size * input_img.size *
                             input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data from host (CPU) to device (GPU)
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # synchronize the stream
    stream.synchronize()

    # the engine could also be serialized for later reuse, e.g.
    # trt.utils.write_engine_to_file("/root/tensorrt/tiny-yolo.engine", engine.serialize())
    return output
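A minimal usage sketch for this legacy-API helper (hypothetical; it assumes the deprecated TensorRT 3.x Python bindings used elsewhere on this page and an engine that has already been serialized to disk):

G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
engine = trt.utils.load_engine(G_LOGGER, "./tf_mnist.engine")  # illustrative path
context = engine.create_execution_context()
prediction = infer(context, input_img, output_size=10, batch_size=1)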
Example #2
def detect(engine: trt.ICudaEngine, img: np.ndarray) -> "tuple[np.ndarray, np.ndarray, np.ndarray]":
    """Run network execution on the given image, including pre- and postprocessing.

    :param engine: TensorRT engine created from the network weights
    :param img: image to perform detection on
    :return: predictions (bounding boxes, confidences, class_ids) in original image coordinates
    """

    with engine.create_execution_context() as context:
        h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
        h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)
        #preprocess
        preprocess_start = time.time()
        preprocessed_img, resize_ratio, padding_size = preprocess_img(img)
        preprocess_stop = time.time()
        #copy our input image to buffer
        np.copyto(h_input, preprocessed_img.flatten())
        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        # Run inference.
        context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream
        stream.synchronize()
        postprocess_start = time.time()
        predictions = postprocess(h_output)
        predictions_in_original_coords = transform_detected_coords_to_original(predictions, resize_ratio, padding_size)
        postprocess_stop = time.time()
        print(f"Preprocessing time: {(preprocess_stop - preprocess_start) * 1000:.4f} ms")
        print(f"Postprocessing time: {(postprocess_stop - postprocess_start) * 1000:.4f} ms")
        print(f"Complete detection time: {(postprocess_stop - preprocess_start) * 1000:.4f} ms")
    return predictions_in_original_coords
def allocate_buffers(engine, melgan_time_step):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        if binding == 'input':
            size = 1 * n_mel_channels * melgan_time_step
        elif binding == 'output':
            size = hop_length * melgan_time_step
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
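These allocate_buffers helpers rely on a HostDeviceMem container that pairs each page-locked host buffer with its device allocation but is not shown in these snippets; a minimal sketch following the usual TensorRT sample pattern could be:

class HostDeviceMem(object):
    """Pairs a page-locked host buffer with its matching device allocation."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem      # numpy array backed by page-locked memory
        self.device = device_mem  # pycuda DeviceAllocation

    def __repr__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)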
Example #4
    def create_graph(self):
        """"""
        uff_model = uff.from_tensorflow_frozen_model(
            self.model_file, ['InceptionResnetV2/Logits/Predictions'])
        G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
        parser = uffparser.create_uff_parser()
        parser.register_input('input_image', (3, 512, 512), 0)
        parser.register_output('InceptionResnetV2/Logits/Predictions')

        engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser, 1,
                                             1 << 32)

        parser.destroy()

        runtime = trt.infer.create_infer_runtime(G_LOGGER)
        self.context = engine.create_execution_context()

        self.output = np.empty(len(self.id2name), dtype=np.float32)
        self.d_input = cuda.mem_alloc(1 * 512 * 512 * 3 * 4)
        self.d_output = cuda.mem_alloc(1 * len(self.id2name) * 4)

        self.bindings = [int(self.d_input), int(self.d_output)]
        self.stream = cuda.Stream()
Example #5
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    binding_to_type = {"data": np.float32, "im_info": np.float32, "bbox_pred": np.float32, "cls_prob": np.float32, "rois": np.float32}
    for binding in engine:
        print("binding:",binding)
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
       # dtype = trt.nptype(engine.get_binding_dtype(binding))
        dtype = binding_to_type[str(binding)]
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream
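The buffers returned above are usually consumed by a companion do_inference helper; a sketch along the lines of the standard TensorRT samples (not part of the original snippet, implicit-batch API assumed) might look like:

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # copy input data from the page-locked host buffers to the device
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # run inference asynchronously on the stream
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    # copy the predictions back from the device to the host
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # wait for all queued work on the stream to finish
    stream.synchronize()
    return [out.host for out in outputs]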
 def __init__(self, model_path=None, cuda_ctx=None):
     self._model_path = model_path
     if self._model_path is None:
         print("please set trt model path!")
         exit()
     self.cuda_ctx = cuda_ctx
     if self.cuda_ctx is None:
         self.cuda_ctx = cuda.Device(0).make_context()
     if self.cuda_ctx:
         self.cuda_ctx.push()
     self.trt_logger = trt.Logger(trt.Logger.INFO)
     self._load_plugins()
     self.engine = self._load_engine()
     try:
         self.context = self.engine.create_execution_context()
         self.stream = cuda.Stream()
         self.host_inputs, self.host_outputs, self.cuda_inputs, self.cuda_outputs, self.bindings = self._allocate_buffers(
         )
     except Exception as e:
         raise RuntimeError('failed to allocate CUDA resources') from e
     finally:
         if self.cuda_ctx:
             self.cuda_ctx.pop()
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:  # each binding is a tensor name, e.g. 'data' or 'prob'; the engine is indexable: engine[0]='data', engine[1]='prob'
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size,
                                         dtype)  # e.g. (784,): the flattened image, used as the host buffer
        device_mem = cuda.mem_alloc(
            host_mem.nbytes)  # device allocation; int(obj) gives its address, sized host_mem.nbytes on the GPU
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem,
                                        device_mem))  # the two entries end up being 'data' and 'prob'
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
Example #8
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        print(binding)
        print(size)
        print(dtype)
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
Example #9
    def _allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()

        for binding in self.engine:

            size = trt.volume(self.engine.get_binding_shape(
                binding)) * self.engine.max_batch_size
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))

            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            bindings.append(int(device_mem))

            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))

        return inputs, outputs, bindings, stream
def allocate_buffers(engine):
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        # print(binding)  # the bound input/output name
        # print(engine.get_binding_shape(binding))  # get_binding_shape returns the binding's shape
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # trt.volume computes the total number of elements in the shape
        # size = trt.volume(engine.get_binding_shape(binding))  # use this line instead for an ONNX model with a fixed batch size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # get_binding_dtype returns the binding's data type;
        # trt.nptype maps it to the equivalent numpy dtype
        # allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)  # page-locked host memory
        device_mem = cuda.mem_alloc(host_mem.nbytes)   # device memory allocation
        # print(int(device_mem))  # device buffer address for this binding
        bindings.append(int(device_mem))
        #append to the appropriate list
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
Example #11
def Plan(*args, **kwds):
    mempool = kwds.pop('mempool', None)
    context_obj = kwds.pop('context', None)
    stream_obj = kwds.pop('stream', None)

    if stream_obj is not None:
        device = cuda.Context.get_device()
        wait_for_finish = False
    elif context_obj is not None:
        device = context_obj.get_device()
        wait_for_finish = True
        stream_obj = None
    else:
        device = cuda.Context.get_device()
        stream_obj = cuda.Stream()
        wait_for_finish = True

    if 'wait_for_finish' not in kwds or kwds['wait_for_finish'] is None:
        kwds['wait_for_finish'] = wait_for_finish

    context = Context(device, stream_obj, mempool)

    return FFTPlan(context, *args, **kwds)
Example #12
    def setUp(self):

        # Set which CUDA device to use, and disable kernel caching
        self.gpu_ctx = Common.CUDAContext()

        # Make some host data which we can play with
        self.nx = 3
        self.ny = 5
        self.nx_halo = 1
        self.ny_halo = 2
        self.dataShape = (self.ny + 2 * self.ny_halo,
                          self.nx + 2 * self.nx_halo)

        self.buf1 = np.zeros(self.dataShape, dtype=np.float32, order='C')
        self.dbuf1 = np.zeros(self.dataShape)
        self.buf3 = np.zeros(self.dataShape, dtype=np.float32, order='C')
        self.dbuf3 = np.zeros(self.dataShape)
        for j in range(self.dataShape[0]):
            for i in range(self.dataShape[1]):
                self.buf1[j, i] = i * 100 + j
                self.dbuf1[j, i] = self.buf1[j, i]
                self.buf3[j, i] = j * 1000 - i
                self.dbuf3[j, i] = self.buf3[j, i]

        self.explicit_free = False

        self.device_name = self.gpu_ctx.cuda_device.name()
        self.gpu_stream = cuda.Stream()

        self.tests_failed = True

        self.cudaarray = Common.CUDAArray2D(self.gpu_stream, \
                                            self.nx, self.ny, \
                                            self.nx_halo, self.ny_halo, \
                                            self.buf1)

        self.double_cudaarray = None
    def __init__(self, model):
        # load tensorrt engine
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        TRTbin = model
        print('trtbin', TRTbin)
        with open(TRTbin, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
        self.context = engine.create_execution_context()
        # allocate memory
        inputs, outputs, bindings = [], [], []
        stream = cuda.Stream()
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding))
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if engine.binding_is_input(binding):
                inputs.append({'host': host_mem, 'device': device_mem})
            else:
                outputs.append({'host': host_mem, 'device': device_mem})
        # save to class
        self.inputs = inputs
        self.outputs = outputs
        self.bindings = bindings
        self.stream = stream

        self.no = 12
        self.output_shapes = [(1, 3, 56, 56, self.no), (1, 3, 28, 28, self.no),
                              (1, 3, 14, 14, self.no)]
        self.names = [
            'angular_leafspot', 'anthracnose_fruit_rot', 'blossom_blight',
            'gray_mold', 'leaf_spot', 'powdery_mildew_fruit',
            'powdery_mildew_leaf'
        ]

        self.img_size = 448
Example #14
    def _init_thread_memory(self, dev_id:int, ctx:cuda.Context, alloc_size: int) -> None:
        '''
        Single thread that initializes the memory for all the streams of a
        single GPU.
        '''
        
        ctx.push()
        size_per_batch = np.int32(np.ceil(alloc_size / self.num_stream))
        # Initialize streams
        for i in range(self.num_stream):
            self.streams[dev_id].append(cuda.Stream())

        for i in range(0, self.num_stream, 1):
            # allocate memory on device
            self.moments_device[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 10))))
            self.w_device[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9))))
            self.x_device[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9))))
            self.y_device[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9))))

            # set host memory for returned output
            self.c_moments[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 7))))
            self.mu[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.yf[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))

            self.m1[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 5))))
            self.float_value_set[dev_id](self.m1[dev_id][i], np.float32(0), size_per_batch, size_per_batch,
                    block=self.block_size, grid=self.grid_size, stream=self.streams[dev_id][i])
            self.float_value_set[dev_id](self.m1[dev_id][i], np.float32(1), size_per_batch, np.int32(0),
                    block=self.block_size, grid=self.grid_size, stream=self.streams[dev_id][i])

            self.x1[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.w1[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.x2[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.w2[dev_id].append((cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))

        ctx.synchronize()
        ctx.pop()
Example #15
def do_inference_overlap(context, inp):
    '''Do inference using a TRT engine and time it
    Execution and device-to-host copy are completed asynchronously
    '''
    # Typical Python-TRT used in samples would copy input data from host to device.
    # Because the PyTorch Tensor is already on the device, such a copy is unneeded.

    t0 = time.perf_counter()
    # Create output buffers and stream
    stream = cuda.Stream()
    outputs, bindings, out_shape = trtutils.allocate_buffers_with_existing_inputs(
        context, inp)
    t01 = time.perf_counter()
    t1 = time.perf_counter()
    # Run inference and transfer outputs to host asynchronously
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    t2 = time.perf_counter()
    copyto = t1 - t0
    inference = t2 - t1
    outputs[0].device.free()
    out = perfutils.torchify_trt_out(outputs[0].host, out_shape)
    return out, t2 - t1
Example #16
def infer(context, input_img, output_size, batch_size):
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Create host buffer to receive data
    output = np.empty(output_size, dtype = np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize threads
    stream.synchronize()
    # Return predictions
    return output
def main():
    # initialize TensorRT engine and parse ONNX model
    engine, context = build_engine(ONNX_FILE_PATH)

    # get sizes of input and output and allocate memory required for input data and for output data
    for binding in engine:
        if engine.binding_is_input(binding):  # we expect only one input
            input_shape = engine.get_binding_shape(binding)
            input_size = (
                trt.volume(input_shape)
                * engine.max_batch_size
                * np.dtype(np.float32).itemsize
            )  # in bytes
            device_input = cuda.mem_alloc(input_size)
        else:  # and one output
            output_shape = engine.get_binding_shape(binding)
            # create page-locked memory buffers (i.e. won't be swapped to disk)
            host_output = cuda.pagelocked_empty(
                trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32
            )
            device_output = cuda.mem_alloc(host_output.nbytes)

    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()

    # preprocess input data
    host_input = np.array(
        preprocess_image("turkish_coffee.jpg").numpy(), dtype=np.float32, order="C"
    )
    cuda.memcpy_htod_async(device_input, host_input, stream)

    # run inference and copy the predictions back to the host
    context.execute_async(
        bindings=[int(device_input), int(device_output)], stream_handle=stream.handle
    )
    cuda.memcpy_dtoh_async(host_output, device_output, stream)
    stream.synchronize()

    # postprocess results
    output_data = torch.Tensor(host_output).reshape(
        engine.max_batch_size, output_shape[0]
    )
    postprocess(output_data)
Example #18
 def __init__(self,
              context: trt.IExecutionContext,
              stream=None,
              device=None,
              cuda_device=None,
              cuda_context=None):
     self.engine = context.engine
     if device is None:
         self.torch_device = torch.device("cuda:0")
     else:
         self.torch_device = device
     inputs, outputs, bindings = allocate_buffers_torch(
         self.engine, self.torch_device)
     self.context = context
     self.inputs = inputs
     self.outputs = outputs
     self.bindings = bindings
     self.input_dict = {mem.name: mem for mem in inputs}
     self.output_dict = {mem.name: mem for mem in outputs}
     if stream is None:
         stream = cuda.Stream()
     self.stream = stream
     self._batch_size = None
     self.cuda_device = cuda_device
     self.cuda_context = cuda_context
Example #19
def main(args):

    model = torch.load(args.model_path)

    # Prepare input
    image = np.zeros((3, 64, 128), dtype=np.float32)
    flat_image = image.ravel()
    tensorrt_out = np.empty(2, dtype=np.float32)

    # Prepare torch version
    model.eval()
    image_shape = tuple([1] + list(image.shape))
    torch_image = np.reshape(image, image_shape)
    batch = Variable(torch.from_numpy(torch_image).cuda() / 255.0)
    torch_out = model.forward(batch, None).cpu().data.numpy()[0]
    print("Torch Predictions:", torch_out)
    weights = model.state_dict()
    engine = build_engine(weights)
    context = engine.create_execution_context()
    d_input = cuda.mem_alloc(1 * flat_image.size * flat_image.dtype.itemsize)
    d_output = cuda.mem_alloc(1 * tensorrt_out.size *
                              tensorrt_out.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()

    #transfer input data to device
    cuda.memcpy_htod_async(d_input, flat_image, stream)

    #execute model
    context.enqueue(1, bindings, stream.handle, None)
    #transfer predictions back
    cuda.memcpy_dtoh_async(tensorrt_out, d_output, stream)
    # synchronize threads
    stream.synchronize()

    print("TensorRT Predictions: ", tensorrt_out)
Example #20
 def load_engine(self, model):
     # load tensorrt engine
     TRT_LOGGER = trt.Logger(trt.Logger.INFO)
     with open(model, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
         engine = runtime.deserialize_cuda_engine(f.read())
     self.context = engine.create_execution_context()
     # allocate memory
     inputs, outputs, bindings = [], [], []
     stream = cuda.Stream()
     for binding in engine:
         size = trt.volume(engine.get_binding_shape(binding))
         dtype = trt.nptype(engine.get_binding_dtype(binding))
         host_mem = cuda.pagelocked_empty(size, dtype)
         device_mem = cuda.mem_alloc(host_mem.nbytes)
         bindings.append(int(device_mem))
         if engine.binding_is_input(binding):
             inputs.append({'host': host_mem, 'device': device_mem})
         else:
             outputs.append({'host': host_mem, 'device': device_mem})
     # save to class
     self.inputs = inputs
     self.outputs = outputs
     self.bindings = bindings
     self.stream = stream
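For reference, the buffers saved here are typically consumed by an inference method along these lines (a sketch, not part of the original class; it assumes the input image has already been preprocessed and flattened, and that numpy is imported as np):

 def infer(self, img):
     # sketch: copy the preprocessed, flattened image into the host input buffer
     self.inputs[0]['host'] = np.ravel(img).astype(np.float32)
     for inp in self.inputs:
         cuda.memcpy_htod_async(inp['device'], inp['host'], self.stream)
     # run inference asynchronously, then fetch the outputs
     self.context.execute_async_v2(bindings=self.bindings,
                                   stream_handle=self.stream.handle)
     for out in self.outputs:
         cuda.memcpy_dtoh_async(out['host'], out['device'], self.stream)
     self.stream.synchronize()
     return [out['host'] for out in self.outputs]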
Example #21
    def __init__(self, cuda_engine, device_id=0):
        """Create an ``Engine``.

        Parameters
        ----------
        cuda_engine : tensorrt.ICudaEngine
            The built cuda engine.
        device_id : int, optional, default=0
            The index of the executing device.

        """
        # Create executing resources.
        self._cuda_engine = cuda_engine
        self._device_id = device_id
        self._context = cuda_engine.create_execution_context()
        self._stream = driver.Stream(0)

        # Create bindings.
        num_binding = self._cuda_engine.num_bindings
        self._bindings = [Binding(cuda_engine, self._context, i, device_id)
                          for i in range(num_binding)]
        self._inputs = [b for b in self._bindings if b.is_input]
        self._outputs = [b for b in self._bindings if not b.is_input]

        # Report the engine info.
        logging.info('TensorRT engine built.')
        binding_info = 'InputInfo: {\n'
        for b in self._inputs:
            binding_info += '  * Binding("{}", shape={}, dtype={})\n' \
                            .format(b.name, b.shape, b.dtype)
        logging.info(binding_info + '}')
        binding_info = 'OutputInfo: {\n'
        for b in self._outputs:
            binding_info += '  * Binding("{}", shape={}, dtype={})\n' \
                            .format(b.name, b.shape, b.dtype)
        logging.info(binding_info + '}')
Example #22
def image_cuda(grids):
    """ Run 2d FFT to image each plane of grid array
    """

    from pyfft.cuda import Plan
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as cuda

    nints, npixx, npixy = grids.shape

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((npixx, npixy), stream=stream)

    grid_gpu = gpuarray.to_gpu(grids)
    for i in range(0, nints):
        plan.execute(grid_gpu[i], inverse=True)
    grids = grid_gpu.get()

    context.pop()
    return recenter(grids.real, (npixx//2, npixy//2))
def do_inference(engine, input):
    with engine.create_execution_context() as context:
        # h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=trt.nptype(data_type))
        # h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=trt.nptype(data_type))
        h_input = cuda.pagelocked_empty(trt.volume(
            context.get_binding_shape(0)),
                                        dtype=np.float32)
        h_output = cuda.pagelocked_empty(trt.volume(
            context.get_binding_shape(1)),
                                         dtype=np.float32)
        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()

        input = np.array(input, order='C').ravel()
        np.copyto(h_input, input)

        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input, h_input, stream)

        # Show time spent in each layer
        # context.profiler = trt.Profiler()

        # Run inference.
        context.execute_async_v2(bindings=[int(d_input),
                                           int(d_output)],
                                 stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream
        stream.synchronize()
        # Return the host output.
        output = h_output.reshape((1, 3, 512, 512))
        return torch.tensor(output)
Example #24
    def do_inference(inf_context, inf_host_in, inf_host_out):
        """
        Perform inference using the CUDA context
        :param inf_context: context created by engine
        :param inf_host_in: input from the host
        :param inf_host_out: output to save on host
        :return:
        """

        inference_engine = inf_context.engine
        # Input and output bindings are required for inference
        assert inference_engine.num_bindings == 2

        # allocate memory in GPU using CUDA bindings
        device_in = cuda.mem_alloc(inf_host_in.nbytes)
        device_out = cuda.mem_alloc(inf_host_out.nbytes)

        # create bindings for input and output
        bindings = [int(device_in), int(device_out)]

        # create CUDA stream for simultaneous CUDA operations
        stream = cuda.Stream()

        # copy input from host (CPU) to device (GPU)  in stream
        cuda.memcpy_htod_async(device_in, inf_host_in, stream)

        # execute inference using context provided by engine
        inf_context.execute_async(bindings=bindings,
                                  stream_handle=stream.handle)

        # copy output back from device (GPU) to host (CPU)
        cuda.memcpy_dtoh_async(inf_host_out, device_out, stream)

        # synchronize the stream to prevent issues
        #       (block CUDA and wait for CUDA operations to be completed)
        stream.synchronize()
Example #25
    def __init__(self, engine_path):
        """Tensorrt engine model dynamic inference

        Args:
            engine_path (trt.tensorrt.ICudaEngine)
        """
        super(TRTModel, self).__init__()
        # cfx多线程需要加的限制
        self.cfx = pycuda.autoinit.context
        self.engine_path = engine_path
        self.logger = trt.Logger(getattr(trt.Logger, 'ERROR'))
        ## load engine for engine_path
        self.engine = self.load_engine()
        self.stream = cuda.Stream()
        # default profile index is 0
        self.profile_index = 0
        ## create context for cuda engine
        self.context = self.engine.create_execution_context()
        self.batch_size_ranges = []
        ## get input/output CUDA binding indices
        self.input_binding_idxs, self.output_binding_idxs = self._get_binding_idxs(
        )
        ## get network input/output name
        self.input_names, self.output_names = self.get_input_output_name()
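The _get_binding_idxs helper referenced above is not shown; a minimal sketch of what it typically does for a dynamic-shape engine (assuming bindings are grouped contiguously per optimization profile) could be:

    def _get_binding_idxs(self):
        # sketch only: one contiguous group of bindings per optimization profile
        num_bindings_per_profile = self.engine.num_bindings // self.engine.num_optimization_profiles
        start = self.profile_index * num_bindings_per_profile
        input_binding_idxs, output_binding_idxs = [], []
        for binding_idx in range(start, start + num_bindings_per_profile):
            if self.engine.binding_is_input(binding_idx):
                input_binding_idxs.append(binding_idx)
            else:
                output_binding_idxs.append(binding_idx)
        return input_binding_idxs, output_binding_idxs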
img, label = MNIST_DATASETS.test.next_batch(1)
img = img[0]
# convert input data to float32
img = img.astype(np.float32)
label = label[0]
# runtime context
runtime = trt.infer.create_infer_runtime(G_LOGGER)
context = engine.create_execution_context()
#
output = np.empty(10, dtype=np.float32)
# allocate device memory
d_input = cuda.mem_alloc(1 * img.size * img.dtype.itemsize)
d_output = cuda.mem_alloc(1 * output.size * output.dtype.itemsize)
#
bindings = [int(d_input), int(d_output)]
stream = cuda.Stream()
#transfer input data to device
cuda.memcpy_htod_async(d_input, img, stream)
#execute model
context.enqueue(1, bindings, stream.handle, None)
#transfer predictions back
cuda.memcpy_dtoh_async(output, d_output, stream)
# synchronize threads
stream.synchronize()
#
print("Test Case: " + str(label))
print("Prediction: " + str(np.argmax(output)))
#
trt.utils.write_engine_to_file("./tf_mnist.engine", engine.serialize())
#
new_engine = trt.utils.load_engine(G_LOGGER, "./tf_mnist.engine")
Example #27
def main():
    parser = argparse.ArgumentParser(description='BERT Inference Benchmark')
    parser.add_argument("-e", "--engine", help='Path to BERT TensorRT engine')
    parser.add_argument(
        '-b',
        '--batch-size',
        default=[],
        action="append",
        help=
        'Batch size(s) to benchmark. Can be specified multiple times for more than one batch size. This script assumes that the engine has been built with one optimization profile for each batch size, and that these profiles are in order of increasing batch size.',
        type=int)
    parser.add_argument('-s',
                        '--sequence-length',
                        default=128,
                        help='Sequence length of the BERT model',
                        type=int)
    parser.add_argument(
        '-i',
        '--iterations',
        default=200,
        help='Number of iterations to run when benchmarking each batch size.',
        type=int)
    parser.add_argument(
        '-w',
        '--warm-up-runs',
        default=10,
        help='Number of iterations to run prior to benchmarking.',
        type=int)
    parser.add_argument('-r',
                        '--random-seed',
                        required=False,
                        default=12345,
                        help='Random seed.',
                        type=int)
    args, _ = parser.parse_known_args()
    args.batch_size = args.batch_size or [1]

    # Import necessary plugins for BERT TensorRT
    ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)

    with open(args.engine, 'rb') as f, trt.Runtime(
            TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine(f.read(
            )) as engine, engine.create_execution_context() as context:
        # Allocate buffers large enough to store the largest batch size
        max_input_shape = (max(args.batch_size), args.sequence_length)
        max_output_shape = (max(args.batch_size), args.sequence_length, 2, 1,
                            1)
        buffers = [
            DeviceBuffer(max_input_shape),
            DeviceBuffer(max_input_shape),
            DeviceBuffer(max_input_shape),
            DeviceBuffer(max_output_shape)
        ]

        # Prepare random input
        pseudo_vocab_size = 30522
        pseudo_type_vocab_size = 2
        np.random.seed(args.random_seed)
        test_word_ids = np.random.randint(
            0,
            pseudo_vocab_size, (max(args.batch_size), args.sequence_length),
            dtype=np.int32)
        test_segment_ids = np.random.randint(
            0,
            pseudo_type_vocab_size,
            (max(args.batch_size), args.sequence_length),
            dtype=np.int32)
        test_input_mask = np.ones((max(args.batch_size), args.sequence_length),
                                  dtype=np.int32)

        # Copy input h2d
        cuda.memcpy_htod(buffers[0].buf, test_word_ids.ravel())
        cuda.memcpy_htod(buffers[1].buf, test_segment_ids.ravel())
        cuda.memcpy_htod(buffers[2].buf, test_input_mask.ravel())

        num_binding_per_profile = engine.num_bindings // engine.num_optimization_profiles

        bench_times = {}

        for idx, batch_size in enumerate(sorted(args.batch_size)):
            context.active_optimization_profile = idx

            # Each profile has unique bindings
            binding_idx_offset = idx * num_binding_per_profile
            bindings = [0] * binding_idx_offset + [
                buf.binding() for buf in buffers
            ]

            shapes = {
                "input_ids": (batch_size, args.sequence_length),
                "segment_ids": (batch_size, args.sequence_length),
                "input_mask": (batch_size, args.sequence_length),
            }

            for binding, shape in shapes.items():
                context.set_binding_shape(engine[binding] + binding_idx_offset,
                                          shape)
            assert context.all_binding_shapes_specified

            # Inference
            total_time = 0
            start = cuda.Event()
            end = cuda.Event()
            stream = cuda.Stream()

            # Warmup
            for _ in range(args.warm_up_runs):
                context.execute_async_v2(bindings=bindings,
                                         stream_handle=stream.handle)
                stream.synchronize()

            # Timing loop
            times = []
            for _ in range(args.iterations):
                start.record(stream)
                context.execute_async_v2(bindings=bindings,
                                         stream_handle=stream.handle)
                end.record(stream)
                stream.synchronize()
                times.append(end.time_since(start))

            # Store the raw times; average, 95th and 99th percentile times are computed below.
            bench_times[batch_size] = times

        [b.free() for b in buffers]

        for batch_size, times in bench_times.items():
            total_time = sum(times)
            avg_time = total_time / float(len(times))
            times.sort()
            percentile95 = times[int(len(times) * 0.95)]
            percentile99 = times[int(len(times) * 0.99)]
            print(
                "Running {:} iterations with Batch Size: {:}\n\tTotal Time: {:} ms \tAverage Time: {:} ms\t95th Percentile Time: {:} ms\t99th Percentile Time: {:} ms"
                .format(args.iterations, batch_size, total_time, avg_time,
                        percentile95, percentile99))
Example #28
def _thread(pid, tid, cuda_context, cuda_kernel, dispatcher, temp_storage,
            total_edge_count, log_lock, merge_lock, exit_signal, exit_state):
    try:
        with log_lock:
            logging.debug('Clustering subprocess {} thread {} started.'.format(
                pid, tid))

        cuda_context.push()

        ref_block_height, ref_block_width = block_dimensions
        edg_path = Path(temp_storage, 'edg')
        dps_path = Path(temp_storage, 'dps')
        ranked_spectra = session.ranked_spectra
        cuda_stream = drv.Stream()

        allocation_size_divisor = allocation_size_initial_divisor
        allocation_size = int(ref_block_height * ref_block_width /
                              allocation_size_divisor)
        reallocated = False

        with log_lock:
            logging.debug(
                'Clustering subprocess {} thread {}: Allocating host and device memory.'
                .format(pid, tid))
        # allocate host pagelocked memory
        # input
        plm_precursor_mass = drv.pagelocked_empty(
            ref_block_height + ref_block_width,
            dtype=CG_PRECURSOR_MASS_DATA_TYPE)
        plm_mz = drv.pagelocked_empty(
            (ref_block_height + ref_block_width, num_of_peaks),
            dtype=CG_MZ_DATA_TYPE)
        plm_intensity = drv.pagelocked_empty(
            (ref_block_height + ref_block_width, num_of_peaks),
            dtype=CG_INTENSITY_DATA_TYPE)
        plm_block_dimensions = drv.pagelocked_empty(
            2, dtype=CG_BLOCK_DIMENSIONS_DATA_TYPE)
        plm_offset = drv.pagelocked_empty(2, dtype=CG_OFFSET_DATA_TYPE)
        plm_allocation_size = drv.pagelocked_empty(
            1, dtype=CG_ALLOCATION_SIZE_DATA_TYPE)
        # output
        plm_counter = drv.pagelocked_empty(1, dtype=CG_COUNTER_DATA_TYPE)
        plm_edge = drv.pagelocked_empty((allocation_size, 2),
                                        dtype=CG_EDGE_DATA_TYPE)
        plm_dot_product = drv.pagelocked_empty(allocation_size,
                                               dtype=CG_DOT_PRODUCT_DATA_TYPE)
        plm_overflowed = drv.pagelocked_empty(1, dtype=CG_OVERFLOWED_DATA_TYPE)

        # allocate device memory
        # input
        dvp_precursor_mass = drv.mem_alloc_like(plm_precursor_mass)
        dvp_mz = drv.mem_alloc_like(plm_mz)
        dvp_intensity = drv.mem_alloc_like(plm_intensity)
        dvp_block_dimensions = drv.mem_alloc_like(plm_block_dimensions)
        dvp_offset = drv.mem_alloc_like(plm_offset)
        dvp_allocation_size = drv.mem_alloc_like(plm_allocation_size)
        # output
        dvp_counter = drv.mem_alloc_like(plm_counter)
        dvp_edge = drv.mem_alloc_like(plm_edge)
        dvp_dot_product = drv.mem_alloc_like(plm_dot_product)
        dvp_overflowed = drv.mem_alloc_like(plm_overflowed)

        with log_lock:
            logging.debug(
                'Clustering subprocess {} thread {}: Start iterating dispatcher.'
                .format(pid, tid))
        previous_row_id = -1
        dispatcher.connect(pid, tid)

        # iterate dispatcher to get blocks
        for row_id, column_id, block in dispatcher.iterate(pid, tid):
            if exit_signal.value:
                with log_lock:
                    logging.debug(
                        'Subprocess {} thread {}: Received exit signal, exits now.'
                        .format(pid, tid))
                break

            try:
                y_range, x_range = block
                block_height = y_range[1] - y_range[0]
                block_width = x_range[1] - x_range[0]
                if row_id != previous_row_id:
                    with log_lock:
                        logging.debug(
                            '\033[92mSubprocess {} thread {}: Processing row {} (y:{}->{}).\033[0m'
                            .format(pid, tid, row_id, *y_range))
                    previous_row_id = row_id

                # get necessary data
                plm_precursor_mass[:
                                   block_height] = ranked_spectra.precursor_mass[
                                       y_range[0]:y_range[1]]
                plm_precursor_mass[
                    block_height:block_height +
                    block_width] = ranked_spectra.precursor_mass[
                        x_range[0]:x_range[1]]
                plm_mz[:block_height] = ranked_spectra.mz[
                    y_range[0]:y_range[1]]
                plm_mz[block_height:block_height +
                       block_width] = ranked_spectra.mz[x_range[0]:x_range[1]]
                plm_intensity[:block_height] = ranked_spectra.intensity[
                    y_range[0]:y_range[1]]
                plm_intensity[block_height:block_height +
                              block_width] = ranked_spectra.intensity[
                                  x_range[0]:x_range[1]]
                plm_block_dimensions[:] = (block_height, block_width)
                plm_offset[:] = (y_range[0], x_range[0])
                # upload data
                drv.memcpy_htod_async(dvp_precursor_mass, plm_precursor_mass,
                                      cuda_stream)
                drv.memcpy_htod_async(dvp_mz, plm_mz, cuda_stream)
                drv.memcpy_htod_async(dvp_intensity, plm_intensity,
                                      cuda_stream)
                drv.memcpy_htod_async(dvp_block_dimensions,
                                      plm_block_dimensions, cuda_stream)
                drv.memcpy_htod_async(dvp_offset, plm_offset, cuda_stream)

                if reallocated:
                    allocation_size_divisor = allocation_size_initial_divisor
                    allocation_size = int(ref_block_height * ref_block_width /
                                          allocation_size_divisor)
                    # reallocate host pagelocked memory
                    del plm_edge
                    del plm_dot_product
                    plm_edge = drv.pagelocked_empty((allocation_size, 2),
                                                    dtype=CG_EDGE_DATA_TYPE)
                    plm_dot_product = drv.pagelocked_empty(
                        allocation_size, dtype=CG_DOT_PRODUCT_DATA_TYPE)
                    # reallocate device memory
                    del dvp_edge
                    del dvp_dot_product
                    dvp_edge = drv.mem_alloc_like(plm_edge)
                    dvp_dot_product = drv.mem_alloc_like(plm_dot_product)
                    with log_lock:
                        logging.debug(
                            '\033[92mSubprocess {} thread {}: Reset memory allocation size divisor to {}.\033[0m'
                            .format(pid, tid, allocation_size_divisor))
                    reallocated = False

                cublockdim = (cuda_block_dimensions[1],
                              cuda_block_dimensions[0], 1)
                cugriddim = (math.ceil(block_width / cuda_block_dimensions[1]),
                             math.ceil(block_height /
                                       cuda_block_dimensions[0]))
                while True:
                    plm_allocation_size[0] = allocation_size
                    plm_counter[0] = 0
                    plm_overflowed[0] = False
                    drv.memcpy_htod_async(dvp_allocation_size,
                                          plm_allocation_size, cuda_stream)
                    drv.memcpy_htod_async(dvp_counter, plm_counter,
                                          cuda_stream)
                    drv.memcpy_htod_async(dvp_overflowed, plm_overflowed,
                                          cuda_stream)

                    cuda_kernel.prepared_async_call(
                        cugriddim, cublockdim, cuda_stream, dvp_precursor_mass,
                        dvp_mz, dvp_intensity, dvp_block_dimensions,
                        dvp_offset, dvp_allocation_size, dvp_counter, dvp_edge,
                        dvp_dot_product, dvp_overflowed)

                    # transfer computation result from device to host
                    drv.memcpy_dtoh_async(plm_edge, dvp_edge, cuda_stream)
                    drv.memcpy_dtoh_async(plm_counter, dvp_counter,
                                          cuda_stream)
                    drv.memcpy_dtoh_async(plm_overflowed, dvp_overflowed,
                                          cuda_stream)
                    drv.memcpy_dtoh_async(plm_dot_product, dvp_dot_product,
                                          cuda_stream)
                    cuda_stream.synchronize()

                    if plm_overflowed[0]:
                        allocation_size_divisor = int(allocation_size_divisor /
                                                      2)
                        if allocation_size_divisor < 1:
                            err_msg = (
                                '\nSubprocess {} thread {}: Allocation size divisor reached the impossible value of {}.'
                                .format(pid, tid, allocation_size_divisor))
                            with log_lock:
                                logging.error(err_msg)
                            raise Exception(err_msg)
                        with log_lock:
                            logging.debug(
                                '\033[92mSubprocess {} thread {}: Edge list overflowed, '
                                'decreasing allocation size divisor to {}.\033[0m'
                                .format(pid, tid, allocation_size_divisor))
                        allocation_size = int(block_width * block_height /
                                              allocation_size_divisor)
                        # reallocate host pagelocked memory
                        del plm_edge
                        del plm_dot_product
                        plm_edge = drv.pagelocked_empty(
                            (allocation_size, 2), dtype=CG_EDGE_DATA_TYPE)
                        plm_dot_product = drv.pagelocked_empty(
                            allocation_size, dtype=CG_DOT_PRODUCT_DATA_TYPE)
                        # reallocate device memory
                        del dvp_edge
                        del dvp_dot_product
                        dvp_edge = drv.mem_alloc_like(plm_edge)
                        dvp_dot_product = drv.mem_alloc_like(plm_dot_product)
                        reallocated = True
                        continue
                    else:
                        break

                if abs(plm_precursor_mass[block_height - 1] -
                       plm_precursor_mass[block_height + block_width -
                                          1]) > precursor_tolerance:
                    dispatcher.next_row(pid, tid)
                with merge_lock:
                    edge_list_size = int(plm_counter[0])
                    if edge_list_size != 0:
                        total_edge_count.value += edge_list_size
                        edg = np.memmap(str(edg_path),
                                        dtype=CG_EDGE_DATA_TYPE,
                                        mode='r+',
                                        shape=(total_edge_count.value, 2))
                        dps = np.memmap(str(dps_path),
                                        dtype=CG_DOT_PRODUCT_DATA_TYPE,
                                        mode='r+',
                                        shape=total_edge_count.value)
                        edg[-edge_list_size:] = plm_edge[:edge_list_size]
                        dps[-edge_list_size:] = plm_dot_product[:
                                                                edge_list_size]

            except Exception:
                err_msg = '\nSubprocess {} thread {}: Failed to cluster block (y:{}->{}, x:{}->{}).' \
                    .format(pid, tid, y_range[0], y_range[1], x_range[0], x_range[1])
                with log_lock:
                    logging.error(err_msg)
                raise

        with log_lock:
            if not exit_signal.value:
                logging.debug(
                    'Subprocess {} thread {}: Reached the end of iteration, work done.'
                    .format(pid, tid))
        cuda_context.pop()

    except (Exception, KeyboardInterrupt) as e:
        if type(e) is KeyboardInterrupt:
            with log_lock:
                logging.debug(
                    'Subprocess {} thread {}: Received KeyboardInterrupt, exits now.'
                    .format(pid, tid))
        else:
            with log_lock:
                logging.exception(
                    '\nSubprocess {} thread {}: Ended unexpectedly. Logging traceback:\n'
                    '==========TRACEBACK==========\n'.format(pid, tid))
            exit_signal.value = True
            exit_state.value = 1
        cuda_context.pop()
    return
def pycuda_multi_kernel(img, k_harris, thresh, executions):
    """
    Finds and returns list of corners
        :param img: grayscale image
        :param k: Harris corner constant. Usually 0.04 - 0.06
        :param thresh: The threshold above which a corner is counted
        :param executions: Number of times to be executed
        :return: corner_list: List with corners
        :return: average_execution_time: Average execution time in seconds
    """
    # only for 256 by 512 images
    assert img.shape[0] == 256  # height
    assert img.shape[1] == 512  # width
    height = img.shape[0]
    width = img.shape[1]

    vector_size = img.shape[0] * img.shape[1]
    corner_list = []
    offset = 2
    # to fit still in a 32-bit integer
    thresh = int(thresh / 10)

    # function template
    func_mod_template = Template("""
    #include<stdio.h>
    #define INDEX(a, b) a*${HEIGHT}+b

    __global__ void corners(
        float *dest,
        float *ixx,
        float *ixy,
        float *iyy,
        int offset,
        float k,
        int threshold) {

        unsigned int idx = threadIdx.x + threadIdx.y*blockDim.y +
                            (blockIdx.x*(blockDim.x*blockDim.y));

        unsigned int a = idx/${HEIGHT};
        unsigned int b = idx%${HEIGHT};

        float sxx = 0;
        float sxy = 0;
        float syy = 0;
        float det = 0;
        float trace = 0;
        float r = 0;

        if ((a >= offset) & (a <= (${WIDTH}-offset - 1)) &
            (b >= offset) & (b <= (${HEIGHT}-offset - 1))) {
            for (int bi = b - offset; bi < b + offset + 1; ++bi) {
                for (int ai = a - offset; ai < a + offset + 1; ++ai) {
                    sxx = sxx + ixx[INDEX(ai, bi)];
                    sxy = sxy + ixy[INDEX(ai, bi)];
                    syy = syy + iyy[INDEX(ai, bi)];
                }
            }
            det = sxx*syy - sxy*sxy;
            trace = sxx + syy;
            r = det - k*(trace*trace);
            if ((r/10) > threshold)
                dest[INDEX(a, b)] = r;
        }
    }
    """)

    # Find x and y derivatives
    dy, dx = np.gradient(img)
    Ixx = dx**2
    Ixy = dy * dx
    Iyy = dy**2

    ixx = Ixx.reshape(vector_size, order='F')
    ixy = Ixy.reshape(vector_size, order='F')
    iyy = Iyy.reshape(vector_size, order='F')
    dest_r = np.zeros_like(ixx)

    # the image is divided into four parts and processed by 4 different kernel launches
    n = 4  # Number of slices (and concurrent operations) used.

    k_height = height
    k_width = int(width / n)

    func_mod = SourceModule(
        func_mod_template.substitute(HEIGHT=k_height, WIDTH=k_width))
    pycuda_corners = func_mod.get_function("corners")

    ###### Start concurrency configuration #######

    # Allocate device memory for each image slice.
    d_ixx, d_ixy, d_iyy, d_dest_r = [], [], [], []
    slice_size = int(vector_size / n)
    for k in range(n):
        # Allocate memory on device.
        d_ixx.append(drv.mem_alloc(ixx[0:slice_size].nbytes))
        d_ixy.append(drv.mem_alloc(ixy[0:slice_size].nbytes))
        d_iyy.append(drv.mem_alloc(iyy[0:slice_size].nbytes))
        d_dest_r.append(drv.mem_alloc(dest_r[0:slice_size].nbytes))

    # Create the streams and events needed.
    stream = []
    event = []
    event_dtoh = []
    marker_names = ['kernel_begin', 'kernel_end']
    for k in range(n):
        stream.append(drv.Stream())
        event.append(dict([(marker_names[l], drv.Event()) \
                    for l in range(len(marker_names))]))
        event_dtoh.append(drv.Event())

    # Use this event as a reference point.
    ref = drv.Event()
    finish = drv.Event()
    ref.record()

    #### Important ######
    # The size of the slices must be larger (+ offset) to calculate
    # r at the limits of each section of the image.
    # This version does not calculate an r values for the limits of the
    # different image sections.
    #### Important ######

    for _ in range(executions):
        # Transfer to device.
        for k in range(n):
            drv.memcpy_htod_async(d_ixx[k],
                                  ixx[slice_size * k:slice_size * (k + 1)],
                                  stream=stream[k])
            drv.memcpy_htod_async(d_ixy[k],
                                  ixy[slice_size * k:slice_size * (k + 1)],
                                  stream=stream[k])
            drv.memcpy_htod_async(d_iyy[k],
                                  iyy[slice_size * k:slice_size * (k + 1)],
                                  stream=stream[k])
            drv.memcpy_htod_async(d_dest_r[k],
                                  dest_r[slice_size * k:slice_size * (k + 1)],
                                  stream=stream[k])

        # Run kernels
        for k in range(n):
            event[k]['kernel_begin'].record(stream[k])
            pycuda_corners(
                d_dest_r[k],
                d_ixx[k],
                d_ixy[k],
                d_iyy[k],
                np.uint32(offset),
                np.float32(k_harris),
                np.uint32(thresh),
                # max 1024 threads; 32x32 is a common choice
                block=(32, 32, 1),
                grid=(int(128 / n), 1, 1),
                stream=stream[k])
        for k in range(n):
            event[k]['kernel_end'].record(stream[k])

        # Transfer data back to host.
        for k in range(n):
            drv.memcpy_dtoh_async(dest_r[slice_size * k:slice_size * (k + 1)],
                                  d_dest_r[k],
                                  stream=stream[k])
            # event that it completed the transfer
            event_dtoh[k].record(stream[k])
            stream[k].synchronize()

    # finish
    finish.record()
    finish.synchronize()

    ###### Output results #####

    print('Timing info of stream launches in seconds')
    for k in range(n):
        print('Stream', k)
        for l in range(len(marker_names)):
            print(marker_names[l], ':',
                  ref.time_till(event[k][marker_names[l]]) * 1e-3)

    # extract the corners
    r = np.reshape(dest_r, (256, 512), order='F')
    corners = np.where(r > 0)

    for i, j in zip(corners[0], corners[1]):
        corner_list.append([j, i, r[i, j]])

    average_execution_time = (ref.time_till(finish) * 1e-3) / executions

    # for profiling
    # pycuda.autoinit.context.detach()

    return corner_list, average_execution_time
Example #30
	def run_simulation(self):

		# setup data#{{{
		data = { 'weights': self.weights, 'lengths': self.lengths, 'params': self.params.T }
		base_shape = self.n_work_items,
		for name, shape in dict(
			tavg0=(self.exposures, self.args.n_regions,),
			tavg1=(self.exposures, self.args.n_regions,),
			state=(self.buf_len, self.states * self.args.n_regions),
			).items():
			# memory error exception for compute device
			try:
				data[name] = np.zeros(shape + base_shape, 'f')
			except MemoryError as e:
				self.logger.error('%s.\n\t Please check the parameter dimensions %d x %d, they are too large '
							 'for this compute device',
							 e, self.args.n_sweep_arg0, self.args.n_sweep_arg1)
				exit(1)

		gpu_data = self.make_gpu_data(data)#{{{

		# setup CUDA stuff#{{{
		step_fn = self.make_kernel(
			source_file=self.args.filename,
			warp_size=32,
			# block_dim_x=self.args.n_sweep_arg0,
			# ext_options=preproccesor_defines,
			# caching=args.caching,
			args=self.args,
			lineinfo=self.args.lineinfo,
			nh=self.buf_len,
			)#}}}

		# setup simulation#{{{
		tic = time.time()

		n_streams = 32
		streams = [drv.Stream() for i in range(n_streams)]
		events = [drv.Event() for i in range(n_streams)]
		tavg_unpinned = []

		try:
			tavg = drv.pagelocked_zeros((n_streams,) + data['tavg0'].shape, dtype=np.float32)
		except drv.MemoryError as e:
			self.logger.error(
				'%s.\n\t Please check the parameter dimensions, %d parameters are too large for this GPU',
				e, self.params.size)
			exit(1)

		# determine optimal grid recursively
		def dog(fgd):
			maxgd, mingd = max(fgd), min(fgd)
			maxpos = fgd.index(max(fgd))
			if (maxgd - 1) * mingd * bx * by >= nwi:
				fgd[maxpos] = fgd[maxpos] - 1
				dog(fgd)
			else:
				return fgd

		# n_sweep_arg0 scales griddim.x, n_sweep_arg1 scales griddim.y
		# form an optimal grid recursively
		bx, by = self.args.blockszx, self.args.blockszy
		nwi = self.n_work_items
		rootnwi = int(np.ceil(np.sqrt(nwi)))
		gridx = int(np.ceil(rootnwi / bx))
		gridy = int(np.ceil(rootnwi / by))

		final_block_dim = bx, by, 1

		fgd = [gridx, gridy]
		dog(fgd)
		final_grid_dim = fgd[0], fgd[1]

		assert gridx * gridy * bx * by >= nwi

		self.logger.info('history shape %r', gpu_data['state'].shape)
		self.logger.info('gpu_data %s', gpu_data['tavg0'].shape)
		self.logger.info('on device mem: %.3f MiB' % (self.nbytes(data) / 1024 / 1024, ))
		self.logger.info('final block dim %r', final_block_dim)
		self.logger.info('final grid dim %r', final_grid_dim)

		# run simulation#{{{
		nstep = self.args.n_time

		self.gpu_mem_info() if self.args.verbose else None

		try:
			for i in tqdm.trange(nstep, file=sys.stdout):

				try:
					event = events[i % n_streams]
					stream = streams[i % n_streams]

					if i > 0:
						stream.wait_for_event(events[(i - 1) % n_streams])

					step_fn(np.uintc(i * self.n_inner_steps), np.uintc(self.args.n_regions), np.uintc(self.buf_len),
							np.uintc(self.n_inner_steps), np.uintc(self.n_work_items), np.float32(self.dt),
							gpu_data['weights'], gpu_data['lengths'], gpu_data['params'], gpu_data['state'],
							gpu_data['tavg%d' % (i%2,)],
							block=final_block_dim, grid=final_grid_dim)

					event.record(streams[i % n_streams])
				except drv.LaunchError as e:
					self.logger.error('%s', e)
					exit(1)

				tavgk = 'tavg%d' % ((i + 1) % 2,)

				# async wrt. other streams & host, but not this stream.
				if i >= n_streams:
					stream.synchronize()
					tavg_unpinned.append(tavg[i % n_streams].copy())

				drv.memcpy_dtoh_async(tavg[i % n_streams], gpu_data[tavgk].ptr, stream=stream)

			# recover uncopied data from pinned buffer
			if nstep > n_streams:
				for i in range(nstep % n_streams, n_streams):
					stream.synchronize()
					tavg_unpinned.append(tavg[i].copy())

			for i in range(nstep % n_streams):
				stream.synchronize()
				tavg_unpinned.append(tavg[i].copy())

		except drv.LogicError as e:
			self.logger.error('%s. Check the number of states of the model or '
						 'GPU block shape settings blockdim.x/y %r, griddim %r.',
						 e, final_block_dim, final_grid_dim)
			exit(1)
		except drv.RuntimeError as e:
			self.logger.error('%s', e)
			exit(1)


		# self.logger.info('kernel finish..')
		# release pinned memory
		tavg = np.array(tavg_unpinned)

		# also release gpu_data
		self.release_gpumem(gpu_data)

		self.logger.info('kernel finished')
		return tavg