def __init__(self,**params):

        # Hack-ish way to avoid initialisation until the weights are transferred:
        should_apply = self.apply_output_fns_init
        params['apply_output_fns_init'] = False

        super(GPUSparseCFProjection,self).__init__(**params)
        # Transferring the weights:
        self.pycuda_stream = cuda.Stream()
        self.weights_gpu = cusparse.CSR.to_CSR(self.weights.toSparseArray().transpose())
        # Getting the row and column indices for the *transposed* matrix; used for Hebbian learning and normalisation:
        nzcols, nzrows = self.weights.nonzero()
        tups = sorted(zip(nzrows, nzcols))
        nzrows = [x[0] for x in tups]
        nzcols = [x[1] for x in tups]

        # Allocating a page-locked piece of memory for the activity so that the GPU can
        # transfer data to main memory without involving the CPU:
        self.activity = cuda.pagelocked_empty(self.activity.shape, np.float32)
        self.activity_gpu_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)

        self.input_buffer_pagelocked = cuda.pagelocked_empty(shape=(self.weights_gpu.shape[1],), dtype=np.float32, mem_flags=cuda.host_alloc_flags.WRITECOMBINED)
        self.input_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[1], ), dtype=np.float32)

        self.norm_total_gpu = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)

        # Getting them on the GPU:
        self.nzcount = self.weights.getnnz()
        self.nzrows_gpu = gpuarray.to_gpu(np.array(nzrows, np.int32))
        self.nzcols_gpu = gpuarray.to_gpu(np.array(nzcols, np.int32))
        # Helper array for normalization:
        self.norm_ones_gpu = gpuarray.to_gpu(np.array([1.0] * self.weights_gpu.shape[1], np.float32))
        # Kernel that applies the normalisation:
        self.normalize_kernel = ElementwiseKernel(
                        "int *nzrows, float *norm_total, float *weights",
                        "weights[i] *= norm_total[nzrows[i]]",
                        "divisive_normalize")
        # Kernel that calculates the learning:
        self.hebbian_kernel = ElementwiseKernel(
                        "float single_conn_lr, int *row, int *col, float *src_activity, float *dest_activity, float *result",
                        "result[i] += single_conn_lr * src_activity[col[i]] * dest_activity[row[i]]",
                        "hebbian_learning")

        params['apply_output_fns_init'] = should_apply
        self.apply_output_fns_init = should_apply
        if self.apply_output_fns_init:
            self.apply_learn_output_fns()
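
A minimal, self-contained sketch (not part of the project above) of how an ElementwiseKernel such as the divisive_normalize kernel defined above is launched on gpuarrays; the array contents below are invented purely for illustration.

import numpy as np
import pycuda.autoinit  # creates a CUDA context
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

# Same kernel body as above: scale each non-zero weight by its row's factor.
normalize_kernel = ElementwiseKernel(
    "int *nzrows, float *norm_total, float *weights",
    "weights[i] *= norm_total[nzrows[i]]",
    "divisive_normalize")

weights_gpu = gpuarray.to_gpu(np.array([0.2, 0.8, 0.5, 0.5], np.float32))
nzrows_gpu = gpuarray.to_gpu(np.array([0, 0, 1, 1], np.int32))      # row of each weight
norm_total_gpu = gpuarray.to_gpu(np.array([1.0, 2.0], np.float32))  # per-row factors

# The kernel loops over the flat index i of its gpuarray arguments.
normalize_kernel(nzrows_gpu, norm_total_gpu, weights_gpu)
print(weights_gpu.get())  # -> [0.2 0.8 1.  1. ]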
Example #2
    def _gpuAlloc(self):
        #Get GPU information
        self.freeMem = cuda.mem_get_info()[0] * .5 * .8 # use at most 80% of half the free GPU memory
        self.maxPossRows = int(np.floor(self.freeMem / (4 * self.totalCols)))    # 4 bytes per float32 element
        # cap the row count at the matrix size so no more memory than needed is used
        if self.totalRows < self.maxPossRows:
            print("reducing max rows to reduce memory use on GPU")
            self.maxPossRows = self.totalRows

        # create pagelocked buffers and GPU arrays
        self.to_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
        self.from_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
        self.data_gpu = cuda.mem_alloc(self.to_gpu_buffer.nbytes)
        self.result_gpu = cuda.mem_alloc(self.from_gpu_buffer.nbytes)
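
A hedged sketch of how the buffers allocated above would typically be used: a chunk of rows is staged through the pinned to_gpu_buffer, processed on the device, and copied back through from_gpu_buffer. The method name and the omitted kernel launch are assumptions, not part of the original class.

import numpy as np
import pycuda.driver as cuda

def _process_chunk(self, chunk):
    # chunk: a (rows, totalCols) float32 array with rows <= self.maxPossRows
    rows = chunk.shape[0]
    self.to_gpu_buffer[:rows] = chunk                       # stage in pinned host memory
    cuda.memcpy_htod(self.data_gpu, self.to_gpu_buffer)     # pinned host -> device
    # ... launch a kernel that writes its output into self.result_gpu ...
    cuda.memcpy_dtoh(self.from_gpu_buffer, self.result_gpu) # device -> pinned host
    return np.array(self.from_gpu_buffer[:rows])            # copy out of the pinned buffer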
Example #3
    def __init__(self,**params):
        #Hack-ish way to avoid initialisation until the weights are transferred:
        should_apply = self.apply_output_fns_init
        params['apply_output_fns_init'] = False
        super(GPUSparseCFProjection,self).__init__(**params)
        # The sparse matrix is stored in COO format, used for Hebbian learning and normalisation:
        nzcols, nzrows, values = self.weights.getTriplets()
        tups = sorted(zip(nzrows, nzcols, values))
        nzrows = np.array([x[0] for x in tups], np.int32)
        nzcols = np.array([x[1] for x in tups], np.int32)
        values = np.array([x[2] for x in tups], np.float32)
        # Getting them on the GPU:
        self.nzcount = self.weights.getnnz()
        self.nzrows_gpu = gpuarray.to_gpu(nzrows)
        self.nzcols_gpu = gpuarray.to_gpu(nzcols)
        # Setting the projection weights in CSR format for dot product calculation:
        rowPtr = cusparse.coo2csr(self.nzrows_gpu, self.weights.shape[1])
        descrA = cusparse.cusparseCreateMatDescr()
        cusparse.cusparseSetMatType(descrA, cusparse.CUSPARSE_MATRIX_TYPE_GENERAL)
        cusparse.cusparseSetMatIndexBase(descrA, cusparse.CUSPARSE_INDEX_BASE_ZERO)

        self.weights_gpu = cusparse.CSR(descrA, values, rowPtr, self.nzcols_gpu, (self.weights.shape[1], self.weights.shape[0]))
        # Allocating a page-locked piece of memory for the activity so that the GPU can
        # transfer data to main memory without involving the CPU:
        self.activity = cuda.pagelocked_empty(self.activity.shape, np.float32)
        self.activity_gpu_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)

        self.input_buffer_pagelocked = cuda.pagelocked_empty(shape=(self.weights_gpu.shape[1],), dtype=np.float32, mem_flags=cuda.host_alloc_flags.WRITECOMBINED)
        self.input_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[1], ), dtype=np.float32)

        self.norm_total_gpu = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)
        # Helper array for normalization:
        self.norm_ones_gpu = gpuarray.to_gpu(np.array([1.0] * self.weights_gpu.shape[1], np.float32))
        # Kernel that applies the normalisation:
        self.normalize_kernel = ElementwiseKernel(
                        "int *nzrows, float *norm_total, float *weights",
                        "weights[i] *= norm_total[nzrows[i]]",
                        "divisive_normalize")
        # Kernel that calculates the learning:
        self.hebbian_kernel = ElementwiseKernel(
                        "float single_conn_lr, int *row, int *col, float *src_activity, float *dest_activity, float *result",
                        "result[i] += single_conn_lr * src_activity[col[i]] * dest_activity[row[i]]",
                        "hebbian_learning")
        self.pycuda_stream = cuda.Stream()
        # Finishing the initialisation that might have been delayed:
        params['apply_output_fns_init'] = should_apply
        self.apply_output_fns_init = should_apply
        if self.apply_output_fns_init:
            self.apply_learn_output_fns()
Example #4
  def get_next_minibatch(self, i, train=TRAIN):
    if train == TRAIN:
      data = self.train_data
    else:
      data = self.test_data

    batch_data = data.data
    batch_label = data.labels
    batch_size = self.batch_size

    mini_data = batch_data[:, i * batch_size: (i + 1) * batch_size]
    locked_data = driver.pagelocked_empty(mini_data.shape, mini_data.dtype, order='C',
                                          mem_flags=driver.host_alloc_flags.PORTABLE)
    locked_data[:] = mini_data

    if self.input is not None and locked_data.shape == self.input.shape:
      self.input.set(locked_data)
    else:
      self.input = gpuarray.to_gpu(locked_data)
    
    label = batch_label[i * batch_size : (i + 1) * batch_size]
    #label = gpuarray.to_gpu(label)

    #label = gpuarray.to_gpu(np.require(batch_label[i * batch_size : (i + 1) * batch_size],  dtype =
    #  np.float, requirements = 'C'))

    return self.input, label
Example #5
    def to_cpu(self):
        if self.flags.forc:
            return self.get(pagelocked=True)

        result = cuda.pagelocked_empty(self.shape, self.dtype)
        copy_non_contiguous(result, self)
        return result
Example #7
def allocate_buffers(engine: trt.ICudaEngine, batch_size: int):
    print('Allocating buffers ...')

    inputs = []
    outputs = []
    dbindings = []

    stream = cuda.Stream()

    for binding in engine:
        size = batch_size * abs(trt.volume(engine.get_binding_shape(binding)))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        dbindings.append(int(device_mem))

        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, dbindings, stream
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []

    # Simple helper data class that's a little nicer to use than a 2-tuple.
    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(
                self.device)

        def __repr__(self):
            return self.__str__()

    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
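
To show what the returned tuples are for, here is a minimal inference helper in the style of the TensorRT Python samples (not part of either function above); HostDeviceMem is assumed to expose .host and .device attributes as in those samples.

import pycuda.driver as cuda

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Copy input data from the pinned host buffers to the device.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference asynchronously on the same stream (implicit-batch API).
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    # Copy the predictions back into the pinned host buffers.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait for the transfers and the kernel to complete.
    stream.synchronize()
    return [out.host for out in outputs]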
Example #9
def allocate_buffers_with_existing_inputs(engine, inp, batch_size=1):
    '''
    allocate_buffers() (see the TRT python samples) but uses existing inputs already on the device

    inp:  List of pointers to device memory. Pointers are in the same order as
          would be produced by allocate_buffers(). That is, inputs are in the
          order defined by iterating through `engine`
    '''

    # Add input to bindings
    bindings = []
    outputs = []
    stream = cuda.Stream()
    inp_idx = 0

    for binding in engine:
        if engine.binding_is_input(binding):
            bindings.append(inp[inp_idx])
            inp_idx += 1
        else:
            # Unchanged from do_inference()
            size = trt.volume(engine.get_binding_shape(binding)) * batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes * 2)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            outputs.append(HostDeviceMem(host_mem, device_mem))

    return outputs, bindings, stream
Example #10
    def __init__(self, engine_file_path):
        # Create a Context on this device,
        self.cfx = cuda.Device(0).make_context()
        self.stream = cuda.Stream()
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()

        self.host_inputs = []
        self.cuda_inputs = []
        self.host_outputs = []
        self.cuda_outputs = []
        self.bindings = []

        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(
                binding)) * self.engine.max_batch_size
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            self.bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if self.engine.binding_is_input(binding):
                self.host_inputs.append(host_mem)
                self.cuda_inputs.append(cuda_mem)
            else:
                self.host_outputs.append(host_mem)
                self.cuda_outputs.append(cuda_mem)
Example #11
def __allocate_buffers(engine):
    """Allocates all buffers required for the specified engine."""
    inputs = []
    outputs = []
    bindings = []

    for binding in engine:
        # Get binding (tensor/buffer) size
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        # Get binding (tensor/buffer) data type (numpy-equivalent)
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate page-locked memory (i.e., pinned memory) buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        # Allocate linear piece of device memory
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        bindings.append(int(device_mem))

        if engine.binding_is_input(binding):
            inputs.append(__HostDeviceTuple(host_mem, device_mem))
        else:
            outputs.append(__HostDeviceTuple(host_mem, device_mem))

    stream = cuda.Stream()
    return inputs, outputs, bindings, stream
Example #12
def alloc_buf(engine):
    # h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
    # h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)

    dtype = trt.nptype(DTYPE)
    h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=dtype)
    h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=dtype)

    # Allocate device memory for inputs and outputs.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    stream = cuda.Stream()

    # np.copyto(h_input, (np.random.random((1, 3, input_size, input_size)).astype(np.float32)).reshape(-1))

    return h_input, h_output, d_input, d_output, stream
Example #13
    def _allocate_buffers(self):
        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.stream = cuda.Stream()

        # NMS implementation in TRT 6 only supports DataType.FLOAT
        binding_to_type = {
            "Input": np.float32,
            "NMS": np.float32,
            "NMS_1": np.int32
        }
        for binding in self.__trt_engine:
            shape = self.__trt_engine.get_binding_shape(binding)
            size = trt.volume(shape) * self.__trt_engine.max_batch_size
            dtype = binding_to_type[str(binding)]

            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            # Append the device buffer to device bindings.
            self.bindings.append(int(device_mem))

            # Append to the appropriate list.
            if self.__trt_engine.binding_is_input(binding):
                self.inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                self.outputs.append(HostDeviceMem(host_mem, device_mem))
Example #14
def allocate_buffers2(engine, h_, w_):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    tmp = [1, 32, 16, 8]
    print('engine.get_binding_format_desc', engine.get_binding_format_desc(0))
    for count, binding in enumerate(engine):
        print('binding:', binding)
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size * (int)(
                h_ / tmp[count]) * (int)(w_ / tmp[count])
        #size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        #dtype=np.float16
        print('dtype:', dtype)
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))

        print('size:', size)
        print('input:', inputs)
        print('output:', outputs)
        print('------------------')
    return inputs, outputs, bindings, stream
Example #15
 def _allocate_buffers(self):
     for binding in self.engine:
         # print("binding  = {}".format(binding))  # input and output
         dims = self.engine.get_binding_shape(binding)
         # print("dims = {}".format(dims)) -> (1, 3, 256, 192); -> (1, 17, 64, 48)
         # print("before dims = {}".format(dims))
         # Use the context to get the largest binding shape and allocate device memory for it
         if dims[0] < 0:
             # Set the maximum shape so the largest possible device memory is allocated once up front; when real data arrives, the context's input shape is changed accordingly
             if binding == 'input':
                 self.context.set_binding_shape(binding=0, shape=(self.maxBs, 3, dims[2], dims[3]))
             size = trt.volume(self.context.get_binding_shape(0 if binding == 'input' else 1))
         else:
             # Either of the two approaches below works
             # size = trt.volume(self.engine.get_binding_shape(binding)) * self.batch_size
             size = trt.volume(self.context.get_binding_shape(0 if binding == 'input' else 1)) * self.batch_size
         # print("after dims = {}".format(self.context.get_binding_shape(0 if binding == 'input' else 1)))
         dtype = trt.nptype(self.engine.get_binding_dtype(binding))
         # Allocate host and device buffers
         host_mem = cuda.pagelocked_empty(size, dtype)
         device_mem = cuda.mem_alloc(host_mem.nbytes)
         # Append the device buffer to device bindings.
         self.bindings.append(int(device_mem))
         # Append to the appropriate list.
         if self.engine.binding_is_input(binding):
             self.inputs.append(HostDeviceMem(host_mem, device_mem))
         else:
             self.outputs.append(HostDeviceMem(host_mem, device_mem))
Example #16
def infer(engine, x, batch_size):
    n = engine.get_nb_bindings()
    print('%d bindings' % n)

    mems = []  # CPU mem
    d_mems = []  # CUDA mem
    shapes = []
    for i in range(n):
        dims = engine.get_binding_dimensions(i)
        shape = dims.shape()
        print('bind %d :: %s' % (i, shape))
        cnt = volume(shape) * batch_size
        mem = cuda.pagelocked_empty(cnt, dtype=np.float32)
        d_mem = cuda.mem_alloc(cnt * mem.dtype.itemsize)
        shapes.append(shape)
        mems.append(mem)
        d_mems.append(d_mem)

    np.copyto(mems[0], x.flatten())

    stream = cuda.Stream()

    ids = list(range(n))
    inputs_ids = ids[:1]
    outputs_ids = ids[1:]

    for i in inputs_ids:
        cuda.memcpy_htod_async(d_mems[i], mems[i], stream)
    context = engine.create_execution_context()
    context.enqueue(batch_size, [int(p) for p in d_mems], stream.handle, None)
    context.destroy()
    for i in outputs_ids:
        cuda.memcpy_dtoh_async(mems[i], d_mems[i], stream)
    stream.synchronize()
    return [mems[i].reshape(shapes[i]) for i in outputs_ids]
Example #17
def allocate_buffers(engine, is_explicit_batch=False, dynamic_shapes=[]):
    inputs = []
    outputs = []
    bindings = []

    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

        def __repr__(self):
            return self.__str__()

    for binding in engine:
        dims = engine.get_binding_shape(binding)
        if dims[0] == -1:
            assert(len(dynamic_shapes) > 0)
            dims[0] = dynamic_shapes[0]
        size = trt.volume(dims) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
Example #18
def allocate_buffers(engine, is_explicit_batch=False, input_shape=None):
    inputs = []
    outputs = []
    bindings = []

    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(
                self.device)

        def __repr__(self):
            return self.__str__()

    for binding in engine:

        dims = engine.get_binding_shape(binding)
        print("*******" + str(dims) + " dims[-1] " + str(dims[-1]))

        if dims[-1] == -1:
            assert (input_shape is not None)
            dims[-2], dims[-1] = input_shape
        size = trt.volume(dims) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
 def initialize_bindings(self):
     self.input_bindings = []
     self.output_bindings = []
     for idx in range(self.engine.num_bindings):
         print(
             self.engine.get_binding_name(idx),
             self.engine.get_binding_dtype(idx),
             self.engine.get_binding_shape(idx))
         if self.engine.binding_is_input(idx):  # we expect only one input
             input_shape = self.engine.get_binding_shape(idx)
             input_size = trt.volume(input_shape) * self.engine.max_batch_size * np.dtype(np.float32).itemsize  # in bytes
             self.input_bindings.append({
                 "input_shape": input_shape,
                 "input_size": input_size,
                 "device_input": cuda.mem_alloc(input_size),
             })
         else:  # and one output
             output_shape = self.engine.get_binding_shape(idx)
             host_output = cuda.pagelocked_empty(trt.volume(output_shape) * self.engine.max_batch_size, dtype=np.float32)
             device_output = cuda.mem_alloc(host_output.nbytes)
             self.output_bindings.append({
                 "output_shape": output_shape,
                 "host_output": host_output,
                 "device_output": device_output,
                 "name": self.engine.get_binding_name(idx)
             })
Example #20
    def load_model(self, engine_path):
        with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()

        self.device_input1, self.device_input2 = [None] * 2
        for binding in self.engine:
            if self.engine.binding_is_input(
                    binding):  # we expect only one input
                input_shape = self.engine.get_binding_shape(binding)
                input_size = trt.volume(
                    input_shape) * self.engine.max_batch_size * np.dtype(
                        np.float32).itemsize  # in bytes
                if self.device_input1 is None:
                    self.device_input1 = cuda.mem_alloc(input_size)
                elif self.device_input2 is None:
                    self.device_input2 = cuda.mem_alloc(input_size)
                else:
                    raise Exception("Network expects more than 2 inputs.")
            else:  # and one output
                self.output_shape = self.engine.get_binding_shape(binding)
                # create page-locked memory buffers (i.e. won't be swapped to disk)
                self.host_output = cuda.pagelocked_empty(
                    trt.volume(self.output_shape) * self.engine.max_batch_size,
                    dtype=np.float32)
                self.device_output = cuda.mem_alloc(self.host_output.nbytes)

        # Create a stream in which to copy inputs/outputs and run inference.
        self.stream = cuda.Stream()
Example #21
    def __init__(self, backend, ioshape, initval, extent, aliases, tags):
        # Call the standard matrix constructor
        super().__init__(backend, ioshape, initval, extent, aliases, tags)

        # Allocate a page-locked buffer on the host for MPI to send/recv from
        self.hdata = cuda.pagelocked_empty((self.nrow, self.ncol),
                                           self.dtype, 'C')
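
A hedged sketch of why the page-locked hdata buffer is useful here: device data is staged into pinned host memory and then handed to MPI. The method name, the self.data device allocation, and the use of mpi4py's COMM_WORLD are illustrative assumptions, not PyFR's actual API.

import pycuda.driver as cuda
from mpi4py import MPI

def send_to_neighbour(self, dest, tag=0):
    # Device -> pinned host; pinned memory keeps this copy fast and DMA-friendly.
    cuda.memcpy_dtoh(self.hdata, self.data)
    # MPI sends directly from the pinned host buffer.
    MPI.COMM_WORLD.Send(self.hdata, dest=dest, tag=tag)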
Example #22
def allocate_buffersV2(engine, context):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    down_stride = 1
    print('engine.get_binding_format_desc', engine.get_binding_format_desc(0))
    for count, binding in enumerate(engine):
        # print('binding:', binding)
        size = trt.volume(
            context.get_binding_shape(count)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # print('dtype:', dtype)
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
        print('count: ', count)
        print('binding: ', binding)
        print('binding shape: ', engine.get_binding_shape(binding))
        print('binding shape: ', context.get_binding_shape(count))
        print('max_batch_size:', engine.max_batch_size)
        print('size:', size)
        print('input:', inputs)
        print('output:', outputs)
        print('------------------')
    return inputs, outputs, bindings, stream
Example #23
def allocate_buffers(engine, context):
    """Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
    Args:
        engine: trt.ICudaEngine.
        context: TensorRT execution context.
    Returns:
        inputs: input buffers.
        outputs: outputs buffers.
        bindings: memory bindings.
        stream: TensorRT CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(context.get_binding_shape(
            engine[binding]))  # * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
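
Since this variant sizes its buffers from the execution context (explicit-batch engines), the matching inference call would use execute_async_v2. A hedged sketch, again assuming HostDeviceMem exposes .host and .device as in the samples:

import pycuda.driver as cuda

def do_inference_v2(context, bindings, inputs, outputs, stream):
    # Host -> device for every input binding.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Explicit-batch execution: the batch size is part of the binding shapes.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Device -> host for every output binding.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    return [out.host for out in outputs]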
Example #24
 def _padded_array(self,ar): #{{{
     nrows_pad = ar.shape[0]
     ncols_pad = 16*((ar.shape[1]+15)//16)
     #arpad = numpy.empty((nrows_pad,ncols_pad),dtype=ar.dtype)
     arpad = cuda.pagelocked_empty((nrows_pad,ncols_pad),dtype=ar.dtype)
     arpad[0:ar.shape[0],0:ar.shape[1]] = ar
     return arpad
def allocate_buffers(engine):
    """Allocates all host/device in/out buffers required for an engine."""
    inputs = []
    outputs = []
    bindings = []
    output_idx = 0
    stream = cuda.Stream()
    assert 3 <= len(engine) <= 4  # expect 1 input, plus 2 or 3 outputs
    for binding in engine:
        binding_dims = engine.get_binding_shape(binding)
        if len(binding_dims) == 4:
            # explicit batch case (TensorRT 7+)
            size = trt.volume(binding_dims)
        elif len(binding_dims) == 3:
            # implicit batch case (TensorRT 6 or older)
            size = trt.volume(binding_dims) * engine.max_batch_size
        else:
            raise ValueError('bad dims of binding %s: %s' %
                             (binding, str(binding_dims)))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            # each grid has 3 anchors, each anchor generates a detection
            # output of 7 float32 values
            assert size % 7 == 0
            outputs.append(HostDeviceMem(host_mem, device_mem))
            output_idx += 1
    return inputs, outputs, bindings, stream
Example #26
    def init_model(self, trt_path, ctx_id):
        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        cuda.init()
        device = cuda.Device(ctx_id)
        self.ctx = device.make_context()
        with open(trt_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())

        self.input_buffs = {}
        self.output_buffs = {}
        self.bindings = []
        self.stream = cuda.Stream()
        for name in engine:
            shape = engine.get_binding_shape(name)
            size = trt.volume(shape) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(name))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            self.bindings.append(int(device_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(name):
                self.input_buffs[name] = HostDeviceMem(host_mem, device_mem, shape)
            else:
                self.output_buffs[name] = HostDeviceMem(host_mem, device_mem, shape)

        self.model = engine.create_execution_context()
        self.logger.info("Warmup up...")
        self.inference_loops(10)
Example #27
def infer(engine, input_img, batch_size):
    #load engine
    context = engine.create_execution_context()
    assert (engine.get_nb_bindings() == 2)

    #create output array to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size

    #Allocate pagelocked memory
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)

    #allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size *
                             input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    #transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    #execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    #transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    #return predictions
    return output
Example #28
def allocate_buffers(engine):
    """Allocates all host/device in/out buffers required for an engine."""
    inputs = []
    outputs = []
    bindings = []
    output_idx = 0
    stream = cuda.Stream()
    # try:
    #     stream = cuda.Stream()
    # except Exception as e:
    #         raise RuntimeError('fail to use cuda stream') from e

    print("Stream done")
    assert 3 <= len(engine) <= 4  # expect 1 input, plus 2 or 3 outputs
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * \
               engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            # each grid has 3 anchors, each anchor generates a detection
            # output of 7 float32 values
            assert size % 7 == 0
            outputs.append(HostDeviceMem(host_mem, device_mem))
            output_idx += 1
    return inputs, outputs, bindings, stream
Example #29
    def _allocate_buffers(self, engine):
        # Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        out_shapes = []
        input_shapes = []
        out_names = []
        input_names = []

        max_batch_size = engine.max_batch_size
        for binding in engine:
            # get binding_shape (value == -1 means dynamic shape)
            binding_shape = engine.get_binding_shape(binding)
            # compute max_size and dtype
            size = abs(trt.volume(binding_shape)) * max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # collect info to appropriate list
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
                input_shapes.append(binding_shape)
                input_names.append(binding)
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
                out_shapes.append(binding_shape)
                out_names.append(binding)
        return bindings, stream, max_batch_size, inputs, input_shapes, input_names, outputs, out_shapes, out_names
Example #30
    def allocate_buffers(self, engine):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()

        for binding in engine:
            bindig_shape = tuple(engine.get_binding_shape(binding))
            # size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size  # engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            host_mem = cuda.pagelocked_empty(bindig_shape, dtype)
            # print('\tAllocate host buffer: host_mem -> {}, {}'.format(host_mem, host_mem.nbytes))  # host mem

            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # print('\tAllocate device buffer: device_mem -> {}, {}'.format(device_mem, int(device_mem))) # device mem

            # print('\t# Append the device buffer to device bindings.......')
            bindings.append(int(device_mem))
            # print('\tbindings: ', bindings)

            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # print("this is the input!")
                # print('____HostDeviceMem(host_mem, device_mem)): {}, {}'.format(HostDeviceMem(host_mem, device_mem),type(HostDeviceMem(host_mem, device_mem))))
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                # print("This is the output!")
                outputs.append(HostDeviceMem(host_mem, device_mem))
            # print("----------------------end allocating one binding in the onnx model-------------------------")

        return inputs, outputs, bindings, stream
Example #31
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    # Create a CUDA stream
    stream = cuda.Stream()
    for binding in engine:
        # trt.volume computes the number of elements implied by a shape
        # get_binding_shape returns the shape of the given binding
        # size is the maximum number of elements this binding needs in the engine
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # get_binding_dtype returns the data type of the binding
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        # cuda.pagelocked_empty allocates page-locked (pinned) host memory
        host_mem = cuda.pagelocked_empty(size, dtype)
        # mem_alloc allocates device memory
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        # Check whether this binding is an input
        if engine.binding_is_input(binding):
            # If it is, append the host/device pair to the inputs list
            # (see common.py for the HostDeviceMem implementation)
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            # Otherwise append it to the outputs list
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
 def _allocate_buffers(self, context):
     """
     Allocate device memory space for data.
     :param context:
     :return:
     """
     inputs = []
     outputs = []
     bindings = []
     stream = cuda.Stream()
     for binding in self._engine:
         size = trt.volume(self._engine.get_binding_shape(
             binding)) * self._engine.max_batch_size
         dtype = trt.nptype(self._engine.get_binding_dtype(binding))
         # Allocate host and device buffers
         host_mem = cuda.pagelocked_empty(size, dtype)
         device_mem = cuda.mem_alloc(host_mem.nbytes)
         # Append the device buffer to device bindings.
         bindings.append(int(device_mem))
         # Append to the appropriate list.
         if self._engine.binding_is_input(binding):
             inputs.append(HostDeviceMem(host_mem, device_mem))
         else:
             outputs.append(HostDeviceMem(host_mem, device_mem))
     return inputs, outputs, bindings, stream
Example #33
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    out_shapes = []
    input_shapes = []
    out_names = []
    max_batch_size = engine.get_profile_shape(0, 0)[2][0]
    for binding in engine:
        binding_shape = engine.get_binding_shape(binding)
        #Fix -1 dimension for proper memory allocation for batch_size > 1
        if binding_shape[0] == -1:
            binding_shape = (1, ) + binding_shape[1:]
        size = trt.volume(binding_shape) * max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
            input_shapes.append(engine.get_binding_shape(binding))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
            #Collect original output shapes and names from engine
            out_shapes.append(engine.get_binding_shape(binding))
            out_names.append(binding)
    return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size
Example #34
def allocate_buffers(engine, batch_size):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:

        size = trt.volume(engine.get_binding_shape(binding)) * batch_size
        dims = engine.get_binding_shape(binding)

        # in case batch dimension is -1 (dynamic)
        if dims[0] < 0:
            size *= -1

        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def allocate_buffers(engine, grid_sizes):
    """Allocates all host/device in/out buffers required for an engine."""
    inputs = []
    outputs = []
    bindings = []
    output_idx = 0
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * \
               engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            # each grid has 3 anchors, each anchor generates a detection
            # output of 7 float32 values
            assert size == grid_sizes[
                output_idx] * 3 * 7 * engine.max_batch_size
            outputs.append(HostDeviceMem(host_mem, device_mem))
            output_idx += 1
    return inputs, outputs, bindings, stream
Example #36
    def allocate_buffers(self):
        """Allocates GPU memory for future use and creates an asynchronous stream"""

        # determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host i/o
        self.h_input = cuda.pagelocked_empty(
            trt.volume(self.engine.get_binding_shape(0)),
            dtype=trt.nptype(self.CONSTANTS["dtype"]))
        self.h_output = cuda.pagelocked_empty(
            trt.volume(self.engine.get_binding_shape(1)),
            dtype=trt.nptype(self.CONSTANTS["dtype"]))

        # allocate device memory for inputs and outputs
        self.d_input = cuda.mem_alloc(self.h_input.nbytes)
        self.d_output = cuda.mem_alloc(self.h_output.nbytes)

        self.stream = cuda.Stream()
Example #37
    def allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()

        engine = self.engine
        for binding in engine:
            shape = engine.get_binding_shape(binding)
            size = trt.volume(shape) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem, dtype,
                                            shape))
            else:
                outputs.append(
                    HostDeviceMem(host_mem, device_mem, dtype, shape))

        return inputs, outputs, bindings, stream
Example #38
    def get_async(self, stream=None, ary=None):
        if ary is None:
            ary = drv.pagelocked_empty(self.shape, self.dtype)
        else:
            assert ary.size == self.size
            assert ary.dtype == self.dtype

        if self.size:
            drv.memcpy_dtoh_async(ary, self.gpudata, stream)
        return ary
Example #39
 def get(self, ary=None, pagelocked=False):
     if ary is None:
         if pagelocked:
             ary = drv.pagelocked_empty(self.shape, self.dtype)
         else:
             ary = numpy.empty(self.shape, self.dtype)
     else:
         assert ary.size == self.size
         assert ary.dtype == self.dtype
     if self.size:
         drv.memcpy_dtoh(ary, self.gpudata)
     return ary
Example #40
  def __gpu_decorate_nodes(self, samples, labels):
    si_0 = driver.pagelocked_empty(self.n_samples, dtype = self.dtype_indices)
    si_1 = driver.pagelocked_empty(self.n_samples, dtype = self.dtype_indices)
    self.values_array = np.empty(self.n_nodes, dtype = self.dtype_labels)
    cuda.memcpy_dtoh(si_0, self.sorted_indices_gpu.ptr)
    cuda.memcpy_dtoh(si_1, self.sorted_indices_gpu_.ptr)
    
    decorate(self.target, 
              si_0, 
              si_1, 
              self.values_idx_array, 
              self.values_si_idx_array, 
              self.values_array, 
              self.n_nodes)

    self.values_idx_array = None
    self.values_si_idx_array = None
    self.left_children.resize(self.n_nodes, refcheck = False)
    self.right_children.resize(self.n_nodes, refcheck = False) 
    self.feature_threshold_array.resize(self.n_nodes, refcheck = False) 
    self.feature_idx_array.resize(self.n_nodes, refcheck = False)
Example #41
    def __init__(self, backend, ioshape, initval, extent, aliases, tags):
        # Call the standard matrix constructor
        super().__init__(backend, ioshape, initval, extent, aliases, tags)

        # If MPI is CUDA-aware then construct a buffer out of our CUDA
        # device allocation and pass this directly to MPI
        if backend.mpitype == 'cuda-aware':
            self.hdata = _make_pybuf(self.data, self.nbytes, 0x200)
        # Otherwise, allocate a buffer on the host for MPI to send/recv from
        else:
            self.hdata = cuda.pagelocked_empty((self.nrow, self.ncol),
                                               self.dtype, 'C')
Example #42
def threshold_integrated(series, value):
    global _dn, _n, _bn, _loc_tmp, _loc_out, _val_out, _loc, _val
        
    t = numpy.float32(value**2)
    nb = int(numpy.ceil(float(len(series))/nt/gs))
    
    if _bn is None or len(_bn) < nb:
        _bn = gpuarray.zeros(nb, dtype=numpy.uint32)
        
    if _n is None:
        _n = driver.pagelocked_empty((1), numpy.uint32, mem_flags=driver.host_alloc_flags.DEVICEMAP)
        ptr = numpy.intp(_n.base.get_device_pointer())
        class T():
            pass
        _dn = T()
        _dn.gpudata = ptr
        _dn.flags = _n.flags
        
    if _loc_tmp is None or len(series) > len(_loc_tmp):
        _loc_tmp = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _loc_out = gpuarray.zeros(len(series), dtype=numpy.uint32)
        _val_out = gpuarray.zeros(len(series), dtype=series.dtype)
        _val = driver.pagelocked_empty((4096*256), numpy.complex64)
        _loc = driver.pagelocked_empty((4096*256), numpy.uint32)
    
    #Do the thresholding by block
    stuff(series.data, _loc_tmp, _bn, t, numpy.uint32(len(series)), block=(nt, 1, 1), grid=(nb, 1))
    
    # Recombine the blocks into a final output
    stuff2(series.data, _loc_tmp, _loc_out, _val_out, _bn, _dn, block=(nb, 1, 1), grid=(nb, 1))
    
    # We need to get the data back now
    pycbc.scheme.mgr.state.context.synchronize()
    if _n != 0: 
        driver.memcpy_dtoh_async(_val[0:_n], _val_out.gpudata)
        driver.memcpy_dtoh_async(_loc[0:_n], _loc_out.gpudata)
        pycbc.scheme.mgr.state.context.synchronize()
    return _loc[0:_n], _val[0:_n]
Example #43
    def get_async(self, stream=None, ary=None):
        if ary is None:
            ary = drv.pagelocked_empty(self.shape, self.dtype)

            ary = _as_strided(ary, strides=self.strides)
        else:
            assert ary.size == self.size
            assert ary.dtype == self.dtype
            assert ary.flags.forc

        assert self.flags.forc, "Array in get() must be contiguous"

        if self.size:
            drv.memcpy_dtoh_async(ary, self.gpudata, stream)
        return ary
Example #44
 def get_async(self, stream = None, ary = None):
     if ary is None:
         ary = cuda.pagelocked_empty(self.shape, self.dtype)
         
     else:
         assert ary.size == self.size
         assert ary.dtype == self.dtype
         if ary.base.__class__ != cuda.HostAllocation:
             raise TypeError("asynchronous memory transfer requires a pagelocked numpy array")
     
     if self.size:
         if self.M == 1:
             cuda.memcpy_dtoh_async(ary, self.gpudata, stream)
         else:
             PitchTrans(self.shape, ary, _pd(self.shape), self.gpudata, self.ld, self.dtype, async = True, stream = stream)
             
     return ary
    def get(self, ary=None, astype=None, pagelocked=False):
        if ary is None:
            if pagelocked:
                ary = drv.pagelocked_empty(self.shape, self.dtype)
            else:
                ary = np.empty(self.shape, self.dtype)

            ary = _as_strided(ary, strides=self.strides)
        else:
            assert ary.size == self.size
            assert ary.dtype == self.dtype
            assert ary.flags.forc

        assert self.flags.forc, "Array in get() must be contiguous"

        if self.size:
            drv.memcpy_dtoh(ary, self.gpudata)

        if astype is not None:
            ary = ary.astype(astype) * 2 ** (self.iwl - 15)

        return ary
Example #46
    def get(self, ary = None, pagelocked = False):
        """
        get the contents of the PitchArray as an ndarray
        if ary is specified, device memory is transferred into ary's memory
        pagelocked specifies whether a newly allocated ary should use page-locked memory
        """
        if ary is None:
            if pagelocked:
                ary = cuda.pagelocked_empty(self.shape, self.dtype)
            else:
                ary = np.empty(self.shape, self.dtype)
        else:
            assert ary.size == self.size
            assert ary.dtype == self.dtype

        
        if self.size:
            if self.M == 1:
                cuda.memcpy_dtoh(ary, self.gpudata)
            else:
                PitchTrans(self.shape, ary, _pd(self.shape), self.gpudata, self.ld, self.dtype)
                
        return ary
Example #47
 def alloc_host_mem(self, shape, dtype):
     return drv.pagelocked_empty(shape, dtype, order="C", mem_flags=0)
Example #48
"""

threshold_kernel = ElementwiseKernel(
            " %(tp_in)s *in, %(tp_out1)s *outv, %(tp_out2)s *outl, %(tp_th)s threshold, %(tp_n)s *bn" % {
                "tp_in": dtype_to_ctype(numpy.complex64),
                "tp_out1": dtype_to_ctype(numpy.complex64),
                "tp_out2": dtype_to_ctype(numpy.uint32),
                "tp_th": dtype_to_ctype(numpy.float32),
                "tp_n": dtype_to_ctype(numpy.uint32),
                },
            threshold_op,
            "getstuff")

import pycuda.driver as drv
n = drv.pagelocked_empty((1), numpy.uint32, mem_flags=drv.host_alloc_flags.DEVICEMAP)
nptr = numpy.intp(n.base.get_device_pointer())

val = drv.pagelocked_empty((4096*256), numpy.complex64, mem_flags=drv.host_alloc_flags.DEVICEMAP)
vptr = numpy.intp(val.base.get_device_pointer())

loc = drv.pagelocked_empty((4096*256), numpy.int32, mem_flags=drv.host_alloc_flags.DEVICEMAP)
lptr = numpy.intp(loc.base.get_device_pointer())

class T():
    pass

tn = T()
tv = T()
tl = T()
tn.gpudata = nptr
Example #49
                    stacklevel=2)
            ary = ary.reshape(self.shape)

        if ary.dtype != self.dtype:
            raise ValueError("ary and self must have the same dtype")

        if self.size:
            _memcpy_discontig(self, ary, async=async, stream=stream)

    def set_async(self, ary, stream=None):
        return self.set(ary, async=True, stream=stream)

    def get(self, ary=None, pagelocked=False, async=False, stream=None):
        if ary is None:
            if pagelocked:
                ary = drv.pagelocked_empty(self.shape, self.dtype)
            else:
                ary = np.empty(self.shape, self.dtype)

            strides = _compact_strides(self)
            ary = _as_strided(ary, strides=strides)
        else:
            if self.size != ary.size:
                raise ValueError("self and ary must be the same size")
            if self.shape != ary.shape:
                from warnings import warn
                warn("get() between arrays of different shape is deprecated "
                        "and will be removed in PyCUDA 2017.x",
                        DeprecationWarning, stacklevel=2)
                ary = ary.reshape(self.shape)
Example #50
        self.count = 0
        self.display = display

    def displayResults(self, res, cm=pylab.cm.gray, title='Specify a title'):
        if self.display:
            self.count = self.count + 1
            pylab.figure(self.count)
            pylab.imshow(res, cm, interpolation='nearest')
            pylab.colorbar()
            pylab.title(title)
        
fx = 0
fy = 0
nx = 2**10
ny = 2**10

g1 = cuda.pagelocked_empty((nx,ny),'int16')
g2 = cuda.pagelocked_empty((nx,ny),'int16')

imname1 = "fr1.bin"
imname2 = "fr2.bin"
outname = "frame.vtk"

pivim.load_bin_image(imname1,g1)
pivim.load_bin_image(imname2,g2)

grid = g1.astype('complex64').reshape(nx,ny)
#grid2 = g2.astype('complex64').reshape(nx,ny)

displayResults = mydisplay(display=True).displayResults

displayResults(g1,title="Initial Grid")
Example #51
def cu_template_render_image_single(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128):
    """
    CPU part of the SPH render code that executes the rendering on the GPU
    
    does some basic particle set pruning and sets up the image
    tiles. It launches cuda kernels for rendering the individual sections of the image
    """
    import pycuda.driver as drv
    import pycuda.tools
    import pycuda.autoinit
    from pycuda.compiler import SourceModule
    from radix_sort import radix_sort

    global_start = time.clock()

    start = time.clock()
    # construct an array of particles
    Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')]
    ps = drv.pagelocked_empty(len(s),dtype=Partstruct)
    
    with s.immediate_mode : 
        ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']]

    if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start)

    ymin,ymax = xmin,xmax

    # ----------------------
    # setup the global image
    # ----------------------
    image = np.zeros((nx,ny),dtype=np.float32)
    
    dx = float32((xmax-xmin)/nx)
    dy = float32((ymax-ymin)/ny)
    
    x_start = xmin+dx/2
    y_start = ymin+dy/2

    zplane = 0.0

    start = time.clock()

    # ------------------
    # set up the kernels
    # ------------------
    code = file('/home/itp/roskar/homegrown/template_kernel.cu').read()
    mod = SourceModule(code)
    tile_histogram = mod.get_function("tile_histogram")
    distribute_particles = mod.get_function("distribute_particles")
    tile_render_kernel = mod.get_function("tile_render_kernel")
    calculate_keys = mod.get_function("calculate_keys")

    # allocate histogram array
    hist = np.zeros(Ntiles,dtype=np.int32)
    
    # transfer histogram array and particle data to GPU
    hist_gpu = drv.mem_alloc(hist.nbytes)
    drv.memcpy_htod(hist_gpu,hist)
    
    start_g = drv.Event()
    end_g = drv.Event()

    start_g.record()
    ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes)
    drv.memcpy_htod(ps_on_gpu,ps_gpu)
    end_g.record()
    end_g.synchronize()

    if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g))

    # make everything the right size
    xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax])
    nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles])

    start_g.record()
    tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles,
                   block=(nthreads,1,1),grid=(32,1,1))

    drv.Context.synchronize()
    drv.memcpy_dtoh(hist,hist_gpu)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g))
    print "<<< Total particle array = %d"%(hist.sum())

    # ---------------------------------------------------------------------------------
    # figured out the numbers of particles per tile -- set up the tile particle buffers
    # ---------------------------------------------------------------------------------
    ps_tiles = np.empty(hist.sum(),dtype=Partstruct)
    ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes)

    tile_offsets = np.array([0],dtype=np.int32)
    tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32))
    tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes)
    drv.memcpy_htod(tile_offsets_gpu,tile_offsets)

    start_g.record()
    distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)), 
                         xmin, xmax, ymin, ymax, nx, ny, Ntiles, 
                         block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g))
    drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu)

    
    # -------------------------
    # start going through tiles
    # -------------------------
   
    # initialize the image on the device
    im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes)
    drv.memcpy_htod(im_gpu,image.astype(np.float32))
   

    # allocate key arrays -- these will be keys to sort particles into softening bins
    start_g.record()
    keys_gpu = drv.mem_alloc(int(4*hist.sum()))
    calculate_keys(ps_tiles_gpu, keys_gpu, np.int32(hist.sum()), np.float32(dx), 
                   block=(nthreads,1,1),grid=(32,1,1))
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g))

    keys = np.empty(hist.sum(), dtype=np.int32)


    # ----------------------------------------
    # sort particles by their softening length
    # ----------------------------------------
    for i in xrange(Ntiles) : 
        n_per_tile = tile_offsets[i+1] - tile_offsets[i]
        if n_per_tile > 0 : 
            radix_sort(int(keys_gpu), int(ps_tiles_gpu), tile_offsets[i], n_per_tile)

    drv.memcpy_dtoh(keys,keys_gpu)
    drv.memcpy_dtoh(ps_tiles,ps_tiles_gpu)
#    return keys,ps_tiles,tile_offsets,dx
        
    drv.Context.synchronize()

    tile_start = time.clock()
    for i in xrange(Ntiles) :
        n_per_tile = tile_offsets[i+1] - tile_offsets[i]
        if n_per_tile > 0 : 
            my_stream = streams[i%16]
            
            xmin_p, xmax_p, ymin_p, ymax_p  = tiles_physical[i]
            xmin_t, xmax_t, ymin_t, ymax_t  = tiles_pix[i]
            
            nx_tile = xmax_t-xmin_t+1
            ny_tile = ymax_t-ymin_t+1
                    
                
            # make everything the right size
            xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t])
            xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p])
            
            tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i),
                               xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t,
                               im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]),
                               block=(nthreads,1,1),stream=my_stream)

    if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start)
    
    # ----------------------------------------------------------------------------------
    # process the particles with large smoothing lengths concurrently with GPU execution
    # ----------------------------------------------------------------------------------
    #if ind[1] != len(xs) : 
    #    start = time.clock()
    #    image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:],
    #                                  nx,ny,xmin,xmax,ymin,ymax)).T
    #    if timing: print '<<< Processing %d particles with large smoothing lengths took %e s'%(len(xs)-ind[1],
    #                                                                                           time.clock()-start)
    drv.Context.synchronize()
    if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start)

    drv.memcpy_dtoh(image,im_gpu)
    drv.stop_profiler()
    
    if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start)

    del(start_g)
    del(end_g)
    
    return image
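
The histogram/offset bookkeeping in the function above amounts to a counting sort of particles into tiles. A small CPU-only sketch with toy data (my illustration) of how the per-tile histogram and its cumulative sum define each tile's slice of the packed particle buffer:

import numpy as np

Ntiles = 4
tile_id = np.array([2, 0, 2, 1, 0, 2])                      # tile index of each particle
hist = np.bincount(tile_id, minlength=Ntiles)                # particles per tile
tile_offsets = np.append(np.int32(0), hist.cumsum().astype(np.int32))
order = np.argsort(tile_id, kind='mergesort')                # pack particles tile by tile
for i in xrange(Ntiles):
    print "tile %d holds particles %s" % (i, order[tile_offsets[i]:tile_offsets[i+1]])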
Example #52
def cu_template_render_image(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128, tile_size=100):
    """
    CPU part of the SPH render code that executes the rendering on the GPU
    
    does some basic particle set prunning and sets up the image
    tiles. It launches cuda kernels for rendering the individual sections of the image
    """
    import pycuda.driver as drv
    import pycuda.tools
    import pycuda.autoinit
    from pycuda.compiler import SourceModule
    from radix_sort import radix_sort

    global_start = time.clock()

    

    start = time.clock()
    # construct an array of particles
    Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')]
    ps = drv.pagelocked_empty(len(s),dtype=Partstruct)
    
    with s.immediate_mode : 
        ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']]

    if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start)

    ymin,ymax = xmin,xmax

    # ----------------------
    # setup the global image
    # ----------------------
    image = np.zeros((nx,ny),dtype=np.float32)
    
    dx = np.float32((xmax-xmin)/nx)
    dy = np.float32((ymax-ymin)/ny)
    
    x_start = xmin+dx/2
    y_start = ymin+dy/2

    zplane = 0.0

    # ------------------------------------------------------------------------------------------------
    # trim particles based on smoothing length -- the GPU will only render those that need < 32 pixels
    # ------------------------------------------------------------------------------------------------

    start = time.clock()
  #  gpu_bool = 2*ps['h'] < 15.*dx
    
    ps_gpu = ps#[gpu_bool]
   # ps_cpu = ps[~gpu_bool]
    #del(ps)
    if timing: print '<<< Setting up gpu/cpu particle struct arrays took %f s'%(time.clock()-start)

    # -----------------------------------------------------------------
    # set up the image slices -- max. size is 100x100 pixels 
    # in this step only process particles that need kernels < 40 pixels
    # tiles are 100x100 = 1e4 pixels x 4 bytes = 40k
    # kernels are 31x31 pixels max = 3844 bytes
    # max shared memory size is 48k
    # -----------------------------------------------------------------
    
    start = time.clock()
    tiles_pix, tiles_physical = make_tiles(nx,ny,xmin,xmax,ymin,ymax,tile_size)
    if timing: print '<<< Tiles made in %f s'%(time.clock()-start)

    Ntiles = tiles_pix.shape[0]

     
    
    # ------------------
    # set up the kernels
    # ------------------
    code = file(os.path.join(os.path.dirname(__file__),'template_kernel.cu')).read()
    mod = SourceModule(code,options=["--ptxas-options=-v"])
    tile_histogram = mod.get_function("tile_histogram")
    distribute_particles = mod.get_function("distribute_particles")
    tile_render_kernel = mod.get_function("tile_render_kernel")
    calculate_keys = mod.get_function("calculate_keys")


    # -------------------------------------------------------------
    # set up streams and figure out particle distributions per tile 
    # -------------------------------------------------------------
   

    # allocate histogram array
    hist = np.zeros(Ntiles,dtype=np.int32)
    
    # transfer histogram array and particle data to GPU
    hist_gpu = drv.mem_alloc(hist.nbytes)
    drv.memcpy_htod(hist_gpu,hist)
    
    start_g = drv.Event()
    end_g = drv.Event()

    start_g.record()
    ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes)
    drv.memcpy_htod(ps_on_gpu,ps_gpu)
    end_g.record()
    end_g.synchronize()

    if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g))

    # make everything the right size
    xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax])
    nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles])

    # -----------------------------
    # calculate pixels per particle
    # -----------------------------

    # allocate key arrays -- these will be keys to sort particles into softening bins
    start_g.record()
    keys_gpu = drv.mem_alloc(int(4*len(s)))
    calculate_keys(ps_on_gpu, keys_gpu, np.int32(len(s)), np.float32(dx), 
                   block=(nthreads,1,1),grid=(1024,1,1))
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g))

    # ----------------------------------------
    # sort particles by their softening length
    # ----------------------------------------
    start_g.record()
    radix_sort(int(keys_gpu), int(ps_on_gpu), np.int32(0), np.int32(len(s)))
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Radix sorting all tiles took %f ms'%(start_g.time_till(end_g))

    start_g.record()
    tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles,
                   block=(nthreads,1,1),grid=(1024,1,1))

    drv.Context.synchronize()
    drv.memcpy_dtoh(hist,hist_gpu)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g))
    print "<<< Total particle array = %d"%(hist.sum())

    # ---------------------------------------------------------------------------------
    # figured out the numbers of particles per tile -- set up the tile particle buffers
    # ---------------------------------------------------------------------------------
    ps_tiles = np.empty(hist.sum(),dtype=Partstruct)
    ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes)

    tile_offsets = np.array([0],dtype=np.int32)
    tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32))
    tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes)
    drv.memcpy_htod(tile_offsets_gpu,tile_offsets)

    start_g.record()
    distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)), 
                         xmin, xmax, ymin, ymax, nx, ny, Ntiles, 
                         block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g))
    drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu)

    
    # -------------------------
    # start going through tiles
    # -------------------------
   
    # initialize the image on the device
    im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes)
    drv.memcpy_htod(im_gpu,image.astype(np.float32))
   

    

  
    tile_start = time.clock()
    
    streams = [drv.Stream() for i in range(16)]    
    
    for i in xrange(Ntiles) :
        n_per_tile = tile_offsets[i+1] - tile_offsets[i]
        if n_per_tile > 0 : 
            my_stream = streams[i%(16)]
            
            xmin_p, xmax_p, ymin_p, ymax_p  = tiles_physical[i]
            xmin_t, xmax_t, ymin_t, ymax_t  = tiles_pix[i]
            
            nx_tile = xmax_t-xmin_t+1
            ny_tile = ymax_t-ymin_t+1
                    
                
            # make everything the right size
            xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t])
            xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p])
            
            if n_per_tile > nthreads*256: ngrid=128
            else : ngrid = 64
            
            tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i),
                               xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t,
                               im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]),
                               block=(nthreads,1,1),grid=(ngrid,1,1),stream=my_stream)

    if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start)
    
    # ----------------------------------------------------------------------------------
    # process the particles with large smoothing lengths concurrently with GPU execution
    # ----------------------------------------------------------------------------------
    #if ind[1] != len(xs) : 
    #    start = time.clock()
    #    image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:],
    #                                  nx,ny,xmin,xmax,ymin,ymax)).T
    #    if timing: print '<<< Processing %d particles with large smoothing lengths took %e s'%(len(xs)-ind[1],
    #                                                                                           time.clock()-start)
    drv.Context.synchronize()
    if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start)

    drv.memcpy_dtoh(image,im_gpu)
    drv.stop_profiler()
    
    if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start)

    del(start_g)
    del(end_g)
    
    return image
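
The start_g/end_g pattern above is the standard PyCUDA way to time asynchronous GPU work. A minimal, stand-alone sketch of the same pattern, assuming a pycuda.autoinit context (not tied to the render code):

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

start_g, end_g = drv.Event(), drv.Event()
src = drv.pagelocked_empty((1024, 1024), np.float32)
dst = drv.mem_alloc(src.nbytes)

start_g.record()                      # enqueue a start marker
drv.memcpy_htod(dst, src)
end_g.record()                        # enqueue an end marker
end_g.synchronize()                   # wait until the end marker has executed
print '<<< copy took %f ms' % start_g.time_till(end_g)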
Example #53
import numpy as np
import pylab
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from time import time
import pivim

class mydisplay(object):
    def __init__(self, display=False):
        self.count = 0
        self.display = display

    def displayResults(self, res, cm=pylab.cm.gray, title='Specify a title'):
        if self.display:
            self.count = self.count + 1
            pylab.figure(self.count)
            pylab.imshow(res, cm, interpolation='nearest')
            pylab.colorbar()
            pylab.title(title)

fx = 0
fy = 0
nx = 2**10
ny = 2**10

g1 = cuda.pagelocked_empty(1024*1024,'int16')
g2 = cuda.pagelocked_empty(1024*1024,'int16')

cuda.load_bin_image("synthetic4_B.bin",g1)
cuda.load_bin_image("synthetic4_A.bin",g2)

grid = g1.astype('complex64').reshape(1024,1024)
grid2 = g2.astype('complex64').reshape(1024,1024)

#grid = FGrid(fx,fy,nx,ny).grid
#grid2 = FGrid(fx,fy,nx,ny).grid

#grid[0:4,0:4] = 5+0j;

#grid[7:11,0:4] = 5+0j;
#grid2[1:5,1:5] = 5+0j;
increment_mod = SourceModule("""
__global__ void increment(double *a, int N)
{
    int idx = threadIdx.x;
    if (idx < N)
        a[idx] = a[idx]+1;
}
""")
increment = increment_mod.get_function("increment")

N = 20
M = 3

# Time use of pinned host memory (DEVICEMAP maps the host buffer into the device address space):
x = cuda.pagelocked_empty((N, N), np.float64, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
x_gpu_ptr = np.intp(x.base.get_device_pointer())

times = np.empty(M)
for i in xrange(M):
    x[:, :] = np.random.rand(N, N)
    x_orig = x.copy()
    start = time()
    increment(x_gpu_ptr, np.uint32(x.size), block=(512, 1, 1))
    times[i] = time()-start
    np.allclose(x_orig + 1, x)

print "Average kernel execution time with pinned memory:   %3.7f" % np.mean(times)

# Time use of pageable host memory:
x = np.empty((N, N), np.float64)
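# The pageable half of the comparison is cut off here; a hedged completion (my sketch,
# mirroring the pinned loop above) times explicit copies around the same kernel:
x_gpu = cuda.mem_alloc(x.nbytes)
times_pageable = np.empty(M)
for i in xrange(M):
    x[:, :] = np.random.rand(N, N)
    start = time()
    cuda.memcpy_htod(x_gpu, x)                                   # explicit copy in
    increment(x_gpu, np.uint32(x.size), block=(512, 1, 1))
    cuda.memcpy_dtoh(x, x_gpu)                                   # explicit copy out
    times_pageable[i] = time()-start

print "Average kernel execution time with pageable memory: %3.7f" % np.mean(times_pageable)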
Example #55
    def __init__(self, array_or_dtype, x_overlap=0):
        """ Create a spatial grid on the GPU(s).

        Input variables
        array_or_dtype -- can either be a numpy array of the same shape as
            the global space, or a numpy dtype. If a valid array is passed, 
            it will be loaded on to the GPU. If a dtype is passed, then
            an array of zeros, of that dtype will be loaded onto the GPU.

        Optional variables
        x_overlap -- the number of adjacent cells in either the negative or
            positive x-direction that need to simultaneously be accessed along
            with the current cell. Must be a non-negative integer. Default
            value is 0.

        """

        shape = get_space_info()['shape'] # Get the shape of the space.
        xr = get_space_info()['x_range'] # Get the local x_range.
        all_x_ranges = get_space_info()['all_x_ranges'] # Get the x_ranges of all MPI nodes.
        local_shape = (xr[1]-xr[0], shape[1], shape[2])

        self._set_gce_type('grid') # Set the gce type to grid.

        # Make sure overlap option is valid.
        if type(x_overlap) is not int:
            raise TypeError('x_overlap must be an integer.')
        elif x_overlap < 0:
            raise TypeError('x_overlap must be a non-negative integer.')

        if comm.rank == 0:
            # Process the array_or_dtype input variable.
            if type(array_or_dtype) is np.ndarray: # Input is an array.
                array = array_or_dtype

                # Make sure the array is of the correct shape.
                if array.shape != shape:
                    raise TypeError('Shape of array does not match shape of space.')

                # Make sure the array is of a valid datatype.
                self._get_dtype(array.dtype.type)


            elif type(array_or_dtype) is type: # Input is a datatype.
                self._get_dtype(array_or_dtype) # Validate the dtype.
                array = np.zeros(shape, dtype=self.dtype) # Make a zeros array.

            else: # Invalid input.
                raise TypeError('Input variable must be a numpy array or dtype')

            # Prepare array to be scattered.
            array = [array[r[0]:r[1],:,:] for r in all_x_ranges]

        else:
            array = None

        array = comm.scatter(array)
        self._get_dtype(array.dtype.type)

#         # Narrow down the array to local x_range.
#         array = array[xr[0]:xr[1],:,:]

        # Add padding to array, if needed.
        self._xlap = x_overlap
        if self._xlap != 0:
            padding = np.empty((self._xlap,) + shape[1:3], dtype=array.dtype)
            array = np.concatenate((padding, array, padding), axis=0)

        self.to_gpu(array) # Load onto device.

        # Determine information needed for synchronization.
        if self._xlap != 0:
            # Calculates the pointer to the x offset in a grid.
            ptr_dx = lambda x_pos: self.data.ptr + self.data.dtype.itemsize * \
                                        x_pos * shape[1] * shape[2]
            
            # Pointers to different sections of the grid that are relevant
            # for synchronization.
            self._sync_ptrs = { 'forw_src': ptr_dx(xr[1]-xr[0]), \
                                'back_dest': ptr_dx(0), \
                                'back_src': ptr_dx(self._xlap), \
                                'forw_dest': ptr_dx(xr[1]-xr[0] + self._xlap)}

            # Buffers used during synchronization.
            self._sync_buffers = [drv.pagelocked_empty( \
                                    (self._xlap, shape[1], shape[2]), \
                                    self.dtype) for k in range(4)]

            # Streams used during synchronization.
            self._sync_streams = [drv.Stream() for k in range(4)]

            # Used to identify neighboring MPI nodes with whom to synchronize.
            self._sync_adj = get_space_info()['mpi_adj']

            # Offset in bytes to the true start of the grid.
            # This is used to "hide" overlap areas from the kernel.
            self._xlap_offset = self.data.dtype.itemsize * \
                                self._xlap * shape[1] * shape[2]

            self.synchronize() # Synchronize the grid.
            comm.Barrier() # Wait for all grids to synchronize before proceeding.
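
A small numpy-only illustration (my sketch) of the padding and byte-offset arithmetic used above: ghost slabs of width x_overlap are added on both x faces, and the kernel is pointed past the leading slab.

import numpy as np

x_overlap = 2
local = np.zeros((8, 4, 4), dtype=np.float32)                # local slab of the global space
padding = np.empty((x_overlap,) + local.shape[1:3], dtype=local.dtype)
padded = np.concatenate((padding, local, padding), axis=0)   # ghost cells on both x faces

# byte offset that "hides" the leading overlap region from the kernel
xlap_offset = padded.dtype.itemsize * x_overlap * padded.shape[1] * padded.shape[2]
print "padded shape:", padded.shape, "offset to true start:", xlap_offset, "bytes"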
    Pdb(color_scheme="LightBG").set_trace()
    _magma_spotrf_gpu(
        char_pointer(uplo),
        int_pointer(n),
        # ====================================================================
        # = I think this argument is responsible for the segmentation fault. =
        # ====================================================================
        ctypes.POINTER(ctypes.c_float).from_address(get_gpu_pointer(A)),
        int_pointer(lda),
        work.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        int_pointer(info),
    )
    return info


if __name__ == "__main__":
    n = 10

    # Create matrix to be factored
    A_orig = (np.eye(n) + np.ones((n, n)) * 0.3).astype("float32")
    A_gpu = cuda.to_device(A_orig)

    # Allocate pagelocked work array
    nwork = magma_get_spotrf_nb(int_pointer(n))
    print nwork
    work_gpu = cuda.pagelocked_empty((nwork, nwork), dtype="float32")

    # # Do Cholesky factorization
    info = magma_spotrf_gpu("L", n, A_gpu, n, work_gpu)
DMP = DeviceMemoryPool()
"""
cells = nps.cells

max_cell_pop = 0
for cellkey in cells.keys():
        #print cellkey, cells[cellkey].nparticles[0]
        #print cells[cellkey].lindices[0].get_npy_array()
        if cells[cellkey].nparticles[0] > max_cell_pop:
            max_cell_pop = cells[cellkey].nparticles[0]
"""

max_cell_pop_gpu, nc, num_particles = nps.get_max_cell_pop(0, DMP)
max_cell_pop = max_cell_pop_gpu.get()

# get_device_pointer() below requires the pinned buffers to be allocated with the DEVICEMAP flag
nbrs_gpu = cuda.pagelocked_empty(shape=(pa.num_real_particles, 27*max_cell_pop), dtype=np.int32,
                                 mem_flags=cuda.host_alloc_flags.DEVICEMAP)
nbrs_gpu_ptr = np.intp(nbrs_gpu.base.get_device_pointer())

nnbrs_gpu = cuda.pagelocked_empty(shape=(pa.num_real_particles,), dtype=np.int32,
                                  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
nnbrs_gpu_ptr = np.intp(nnbrs_gpu.base.get_device_pointer())


cells_gpu = gpuarray.zeros((nc[0]*nc[1]*nc[2], int(max_cell_pop)), dtype=np.int32)-1
#cellpop_gpu = gpuarray.zeros((nc[0], nc[1], nc[2]), dtype=np.int32)

indices = arange_uint(num_particles)

iters = 100
print "NumPa:", pa.num_real_particles
bint = 0.
npst = 0.
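
A brief sketch (my assumption about typical usage, not taken from the snippet) of the DeviceMemoryPool created as DMP above: gpuarray allocations can draw from the pool instead of hitting cudaMalloc on every step, which helps in per-iteration loops like the one being timed here.

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.tools import DeviceMemoryPool

pool = DeviceMemoryPool()
# same initialisation pattern as cells_gpu above, but served from the pool
cells = gpuarray.zeros((1024, 32), dtype=np.int32, allocator=pool.allocate) - 1
pool.free_held()   # release blocks the pool is holding for reuse back to the driver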
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from magma_wrapper import magma_spotrf_gpu_wrap, magma_get_spotrf_nb_wrap
    
n = 1000

# Create matrix to be factored
A = (np.eye(n) + np.ones((n,n))*.1).astype('float32')  # keep float32 so from_device below reads it back correctly
A_gpu = cuda.to_device(A)

# Allocate pagelocked work array
nwork = magma_get_spotrf_nb_wrap(n)
work_gpu = cuda.pagelocked_empty((nwork, nwork), dtype='float32')

# Do Cholesky factorization
info = magma_spotrf_gpu_wrap('U', n, A_gpu, n, work_gpu)

# Copy back the Cholesy factor and check for correctness
L = cuda.from_device(A_gpu, (n,n), 'float32')
print np.abs(np.dot(L,L.T)-A).max()
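
As a hedged cross-check (my addition, not in the original snippet), the same reconstruction test can be run against numpy's CPU Cholesky factor:

L_cpu = np.linalg.cholesky(A)                      # lower-triangular factor, A = L * L^T
print np.abs(np.dot(L_cpu, L_cpu.T) - A).max()     # should be close to zero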