def __init__(self,**params):
    '''
    Hack-ish way to avoid initialisation until the weights are transferred:
    '''
    should_apply = self.apply_output_fns_init
    params['apply_output_fns_init'] = False
    super(GPUSparseCFProjection,self).__init__(**params)
    # Transferring the weights:
    self.pycuda_stream = cuda.Stream()
    self.weights_gpu = cusparse.CSR.to_CSR(self.weights.toSparseArray().transpose())
    # Getting the row and column indices for the *transposed* matrix. Used for Hebbian learning and normalisation:
    nzcols, nzrows = self.weights.nonzero()
    tups = sorted(zip(nzrows, nzcols))
    nzrows = [x[0] for x in tups]
    nzcols = [x[1] for x in tups]
    '''
    Allocating a page-locked piece of memory for the activity so that the GPU can transfer data to the
    main memory without the involvement of the CPU:
    '''
    self.activity = cuda.pagelocked_empty(self.activity.shape, np.float32)
    self.activity_gpu_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)
    self.input_buffer_pagelocked = cuda.pagelocked_empty(shape=(self.weights_gpu.shape[1],), dtype=np.float32, mem_flags=cuda.host_alloc_flags.WRITECOMBINED)
    self.input_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[1], ), dtype=np.float32)
    self.norm_total_gpu = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)
    # Getting them on the GPU:
    self.nzcount = self.weights.getnnz()
    self.nzrows_gpu = gpuarray.to_gpu(np.array(nzrows, np.int32))
    self.nzcols_gpu = gpuarray.to_gpu(np.array(nzcols, np.int32))
    # Helper array for normalization:
    self.norm_ones_gpu = gpuarray.to_gpu(np.array([1.0] * self.weights_gpu.shape[1], np.float32))
    # Kernel that applies the normalisation:
    self.normalize_kernel = ElementwiseKernel(
                        "int *nzrows, float *norm_total, float *weights",
                        "weights[i] *= norm_total[nzrows[i]]",
                        "divisive_normalize")
    # Kernel that calculates the learning:
    self.hebbian_kernel = ElementwiseKernel(
                        "float single_conn_lr, int *row, int *col, float *src_activity, float *dest_activity, float *result",
                        "result[i] += single_conn_lr * src_activity[col[i]] * dest_activity[row[i]]",
                        "hebbian_learning")
    params['apply_output_fns_init'] = should_apply
    self.apply_output_fns_init = should_apply
    if self.apply_output_fns_init:
        self.apply_learn_output_fns()
def _gpuAlloc(self):
    # Get GPU information
    self.freeMem = cuda.mem_get_info()[0] * .5 * .8  # use at most 80% of half the free memory (40% overall)
    self.maxPossRows = np.int(np.floor(self.freeMem / (4 * self.totalCols)))  # 4 bytes per float32 per column
    # cap the row count at what is actually needed to save GPU memory
    if self.totalRows < self.maxPossRows:
        print "reducing max rows to reduce memory use on GPU"
        self.maxPossRows = self.totalRows
    # create pagelocked buffers and GPU arrays
    self.to_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
    self.from_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows , self.totalCols), np.float32)
    self.data_gpu = cuda.mem_alloc(self.to_gpu_buffer.nbytes)
    self.result_gpu = cuda.mem_alloc(self.from_gpu_buffer.nbytes)
def __init__(self,**params):
    # Hack-ish way to avoid initialisation until the weights are transferred:
    should_apply = self.apply_output_fns_init
    params['apply_output_fns_init'] = False
    super(GPUSparseCFProjection,self).__init__(**params)
    # The sparse matrix is stored in COO format, used for Hebbian learning and normalisation:
    nzcols, nzrows, values = self.weights.getTriplets()
    tups = sorted(zip(nzrows, nzcols, values))
    nzrows = np.array([x[0] for x in tups], np.int32)
    nzcols = np.array([x[1] for x in tups], np.int32)
    values = np.array([x[2] for x in tups], np.float32)
    # Getting them on the GPU:
    self.nzcount = self.weights.getnnz()
    self.nzrows_gpu = gpuarray.to_gpu(nzrows)
    self.nzcols_gpu = gpuarray.to_gpu(nzcols)
    # Setting the projection weights in CSR format for dot product calculation:
    rowPtr = cusparse.coo2csr(self.nzrows_gpu, self.weights.shape[1])
    descrA = cusparse.cusparseCreateMatDescr()
    cusparse.cusparseSetMatType(descrA, cusparse.CUSPARSE_MATRIX_TYPE_GENERAL)
    cusparse.cusparseSetMatIndexBase(descrA, cusparse.CUSPARSE_INDEX_BASE_ZERO)
    self.weights_gpu = cusparse.CSR(descrA, values, rowPtr, self.nzcols_gpu, (self.weights.shape[1], self.weights.shape[0]))
    # Allocating a page-locked piece of memory for the activity so that the GPU can transfer data to the
    # main memory without the involvement of the CPU:
    self.activity = cuda.pagelocked_empty(self.activity.shape, np.float32)
    self.activity_gpu_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)
    self.input_buffer_pagelocked = cuda.pagelocked_empty(shape=(self.weights_gpu.shape[1],), dtype=np.float32, mem_flags=cuda.host_alloc_flags.WRITECOMBINED)
    self.input_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[1], ), dtype=np.float32)
    self.norm_total_gpu = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)
    # Helper array for normalization:
    self.norm_ones_gpu = gpuarray.to_gpu(np.array([1.0] * self.weights_gpu.shape[1], np.float32))
    # Kernel that applies the normalisation:
    self.normalize_kernel = ElementwiseKernel(
                        "int *nzrows, float *norm_total, float *weights",
                        "weights[i] *= norm_total[nzrows[i]]",
                        "divisive_normalize")
    # Kernel that calculates the learning:
    self.hebbian_kernel = ElementwiseKernel(
                        "float single_conn_lr, int *row, int *col, float *src_activity, float *dest_activity, float *result",
                        "result[i] += single_conn_lr * src_activity[col[i]] * dest_activity[row[i]]",
                        "hebbian_learning")
    self.pycuda_stream = cuda.Stream()
    # Finishing the initialisation that might have been delayed:
    params['apply_output_fns_init'] = should_apply
    self.apply_output_fns_init = should_apply
    if self.apply_output_fns_init:
        self.apply_learn_output_fns()
def get_next_minibatch(self, i, train=TRAIN): if train == TRAIN: data = self.train_data else: data = self.test_data batch_data = data.data batch_label = data.labels batch_size = self.batch_size mini_data = batch_data[:, i * batch_size: (i + 1) * batch_size] locked_data = driver.pagelocked_empty(mini_data.shape, mini_data.dtype, order='C', mem_flags=driver.host_alloc_flags.PORTABLE) locked_data[:] = mini_data if self.input is not None and locked_data.shape == self.input.shape: self.input.set(locked_data) else: self.input = gpuarray.to_gpu(locked_data) label = batch_label[i * batch_size : (i + 1) * batch_size] #label = gpuarray.to_gpu(label) #label = gpuarray.to_gpu(np.require(batch_label[i * batch_size : (i + 1) * batch_size], dtype = # np.float, requirements = 'C')) return self.input, label
def to_cpu(self): if self.flags.forc: return self.get(pagelocked=True) result = cuda.pagelocked_empty(self.shape, self.dtype) copy_non_contiguous(result, self) return result
def allocate_buffers(engine: trt.ICudaEngine, batch_size: int): print('Allocating buffers ...') inputs = [] outputs = [] dbindings = [] stream = cuda.Stream() for binding in engine: size = batch_size * abs(trt.volume(engine.get_binding_shape(binding))) dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. dbindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) return inputs, outputs, dbindings, stream
def allocate_buffers(engine): inputs = [] outputs = [] bindings = [] # Simple helper data class that's a little nicer to use than a 2-tuple. class HostDeviceMem(object): def __init__(self, host_mem, device_mem): self.host = host_mem self.device = device_mem def __str__(self): return "Host:\n" + str(self.host) + "\nDevice:\n" + str( self.device) def __repr__(self): return self.__str__() stream = cuda.Stream() for binding in engine: size = trt.volume( engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) return inputs, outputs, bindings, stream
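The buffers returned by an allocate_buffers() helper like the one above are normally consumed by a small inference routine in the style of the TensorRT Python samples. The sketch below is illustrative only and not part of the original snippets; the name do_inference_sketch is an assumption, and it assumes `import pycuda.driver as cuda` plus an implicit-batch execution context created from the same engine.

# Illustrative sketch, not taken from the source snippets.
def do_inference_sketch(context, bindings, inputs, outputs, stream, batch_size=1):
    # Copy input data from the pinned host buffers to the device.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Enqueue the network on the same stream.
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    # Copy predictions back into the pinned host buffers.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait for transfers and execution to finish before reading results.
    stream.synchronize()
    return [out.host for out in outputs]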
def allocate_buffers_with_existing_inputs(engine, inp, batch_size=1):
    '''
    allocate_buffers() (see TRT python samples), but reuses existing input
    buffers that are already on the device.

    inp: List of pointers to device memory. Pointers are in the same order as
    would be produced by allocate_buffers(). That is, inputs are in the order
    defined by iterating through `engine`.
    '''
    # Add input to bindings
    bindings = []
    outputs = []
    stream = cuda.Stream()
    inp_idx = 0
    for binding in engine:
        if engine.binding_is_input(binding):
            bindings.append(inp[inp_idx])
            inp_idx += 1
        else:
            # Unchanged from do_inference()
            size = trt.volume(engine.get_binding_shape(binding)) * batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes * 2)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return outputs, bindings, stream
def __init__(self, engine_file_path): # Create a Context on this device, self.cfx = cuda.Device(0).make_context() self.stream = cuda.Stream() runtime = trt.Runtime(TRT_LOGGER) # Deserialize the engine from file with open(engine_file_path, "rb") as f: self.engine = runtime.deserialize_cuda_engine(f.read()) self.context = self.engine.create_execution_context() self.host_inputs = [] self.cuda_inputs = [] self.host_outputs = [] self.cuda_outputs = [] self.bindings = [] for binding in self.engine: size = trt.volume(self.engine.get_binding_shape( binding)) * self.engine.max_batch_size dtype = trt.nptype(self.engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) cuda_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. self.bindings.append(int(cuda_mem)) # Append to the appropriate list. if self.engine.binding_is_input(binding): self.host_inputs.append(host_mem) self.cuda_inputs.append(cuda_mem) else: self.host_outputs.append(host_mem) self.cuda_outputs.append(cuda_mem)
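The wrapper above only deserializes the engine and allocates its buffers. A companion inference method usually pairs with it; the sketch below is a guess at that method, not the original code. The method name, the single-input/single-output assumption, and the flattened-input convention are all assumptions, and it relies on `import pycuda.driver as cuda` and `import numpy as np`.

# Hypothetical companion method for the wrapper above (assumed, not original).
def infer(self, flat_input):
    self.cfx.push()                                   # make this instance's CUDA context current
    np.copyto(self.host_inputs[0], flat_input)        # stage preprocessed data in pinned memory
    cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
    self.context.execute_async(bindings=self.bindings,
                               stream_handle=self.stream.handle)
    cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
    self.stream.synchronize()
    self.cfx.pop()                                    # release the context for other threads
    return self.host_outputs[0]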
def __allocate_buffers(engine): """Allocates all buffers required for the specified engine.""" inputs = [] outputs = [] bindings = [] for binding in engine: # Get binding (tensor/buffer) size size = trt.volume( engine.get_binding_shape(binding)) * engine.max_batch_size # Get binding (tensor/buffer) data type (numpy-equivalent) dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate page-locked memory (i.e., pinned memory) buffers host_mem = cuda.pagelocked_empty(size, dtype) # Allocate linear piece of device memory device_mem = cuda.mem_alloc(host_mem.nbytes) bindings.append(int(device_mem)) if engine.binding_is_input(binding): inputs.append(__HostDeviceTuple(host_mem, device_mem)) else: outputs.append(__HostDeviceTuple(host_mem, device_mem)) stream = cuda.Stream() return inputs, outputs, bindings, stream
def alloc_buf(engine): # h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32) # h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32) dtype = trt.nptype(DTYPE) h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=dtype) h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=dtype) # Allocate device memory for inputs and outputs. d_input = cuda.mem_alloc(h_input.nbytes) d_output = cuda.mem_alloc(h_output.nbytes) stream = cuda.Stream() # np.copyto(h_input, (np.random.random((1, 3, input_size, input_size)).astype(np.float32)).reshape(-1)) return h_input, h_output, d_input, d_output, stream
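alloc_buf() hands back raw host/device buffers plus a stream rather than HostDeviceMem pairs, so the caller wires the transfers up itself. A minimal usage sketch under that assumption (the function name is made up, and `import pycuda.driver as cuda` plus an implicit-batch TensorRT context are assumed):

# Illustrative only: one inference pass with the buffers from alloc_buf().
def run_inference_sketch(context, h_input, h_output, d_input, d_output, stream):
    cuda.memcpy_htod_async(d_input, h_input, stream)      # host -> device
    context.execute_async(bindings=[int(d_input), int(d_output)],
                          stream_handle=stream.handle)    # enqueue the network
    cuda.memcpy_dtoh_async(h_output, d_output, stream)    # device -> host
    stream.synchronize()                                  # wait for completion
    return h_output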
def _allocate_buffers(self): self.inputs = [] self.outputs = [] self.bindings = [] self.stream = cuda.Stream() # NMS implementation in TRT 6 only supports DataType.FLOAT binding_to_type = { "Input": np.float32, "NMS": np.float32, "NMS_1": np.int32 } for binding in self.__trt_engine: shape = self.__trt_engine.get_binding_shape(binding) size = trt.volume(shape) * self.__trt_engine.max_batch_size dtype = binding_to_type[str(binding)] # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. self.bindings.append(int(device_mem)) # Append to the appropriate list. if self.__trt_engine.binding_is_input(binding): self.inputs.append(HostDeviceMem(host_mem, device_mem)) else: self.outputs.append(HostDeviceMem(host_mem, device_mem))
def allocate_buffers2(engine, h_, w_): inputs = [] outputs = [] bindings = [] stream = cuda.Stream() tmp = [1, 32, 16, 8] print('engine.get_binding_format_desc', engine.get_binding_format_desc(0)) for count, binding in enumerate(engine): print('binding:', binding) size = trt.volume( engine.get_binding_shape(binding)) * engine.max_batch_size * (int)( h_ / tmp[count]) * (int)(w_ / tmp[count]) #size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) #dtype=np.float16 print('dtype:', dtype) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) print('size:', size) print('input:', inputs) print('output:', outputs) print('------------------') return inputs, outputs, bindings, stream
def _allocate_buffers(self):
    for binding in self.engine:
        # print("binding = {}".format(binding))  # input and output
        dims = self.engine.get_binding_shape(binding)
        # print("dims = {}".format(dims)) -> (1, 3, 256, 192); -> (1, 17, 64, 48)
        # print("before dims = {}".format(dims))
        # Use the execution context to size the largest buffer that will be needed and allocate it up front.
        if dims[0] < 0:
            # Set the maximum shape so the context reserves the maximum GPU memory once;
            # when real data arrives later, the context's input shape is changed again.
            if binding == 'input':
                self.context.set_binding_shape(binding=0, shape=(self.maxBs, 3, dims[2], dims[3]))
            size = trt.volume(self.context.get_binding_shape(0 if binding == 'input' else 1))
        else:
            # Either of the two lines below works:
            # size = trt.volume(self.engine.get_binding_shape(binding)) * self.batch_size
            size = trt.volume(self.context.get_binding_shape(0 if binding == 'input' else 1)) * self.batch_size
        # print("after dims = {}".format(self.context.get_binding_shape(0 if binding == 'input' else 1)))
        dtype = trt.nptype(self.engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        self.bindings.append(int(device_mem))
        # Append to the appropriate list.
        if self.engine.binding_is_input(binding):
            self.inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            self.outputs.append(HostDeviceMem(host_mem, device_mem))
def infer(engine, x, batch_size): n = engine.get_nb_bindings() print('%d bindings' % n) mems = [] # CPU mem d_mems = [] # CUDA mem shapes = [] for i in range(n): dims = engine.get_binding_dimensions(i) shape = dims.shape() print('bind %d :: %s' % (i, shape)) cnt = volume(shape) * batch_size mem = cuda.pagelocked_empty(cnt, dtype=np.float32) d_mem = cuda.mem_alloc(cnt * mem.dtype.itemsize) shapes.append(shape) mems.append(mem) d_mems.append(d_mem) np.copyto(mems[0], x.flatten()) stream = cuda.Stream() ids = list(range(n)) inputs_ids = ids[:1] outputs_ids = ids[1:] for i in inputs_ids: cuda.memcpy_htod_async(d_mems[i], mems[i], stream) context = engine.create_execution_context() context.enqueue(batch_size, [int(p) for p in d_mems], stream.handle, None) context.destroy() for i in outputs_ids: cuda.memcpy_dtoh_async(mems[i], d_mems[i], stream) stream.synchronize() return [mems[i].reshape(shapes[i]) for i in outputs_ids]
def allocate_buffers(engine, is_explicit_batch=False, dynamic_shapes=[]): inputs = [] outputs = [] bindings = [] class HostDeviceMem(object): def __init__(self, host_mem, device_mem): self.host = host_mem self.device = device_mem def __str__(self): return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) def __repr__(self): return self.__str__() for binding in engine: dims = engine.get_binding_shape(binding) if dims[0] == -1: assert(len(dynamic_shapes) > 0) dims[0] = dynamic_shapes[0] size = trt.volume(dims) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) return inputs, outputs, bindings
def allocate_buffers(engine, is_explicit_batch=False, input_shape=None):
    inputs = []
    outputs = []
    bindings = []

    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(
                self.device)

        def __repr__(self):
            return self.__str__()

    for binding in engine:
        dims = engine.get_binding_shape(binding)
        print("binding dims: " + str(dims) + ", dims[-1]: " + str(dims[-1]))
        if dims[-1] == -1:
            assert (input_shape is not None)
            dims[-2], dims[-1] = input_shape
        size = trt.volume(dims) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
def initialize_bindings(self): self.input_bindings = [] self.output_bindings = [] for idx in range(self.engine.num_bindings): print( self.engine.get_binding_name(idx), self.engine.get_binding_dtype(idx), self.engine.get_binding_shape(idx)) if self.engine.binding_is_input(idx): # we expect only one input input_shape = self.engine.get_binding_shape(idx) input_size = trt.volume(input_shape) * self.engine.max_batch_size * np.dtype(np.float32).itemsize # in bytes self.input_bindings.append({ "input_shape": input_shape, "input_size": input_size, "device_input": cuda.mem_alloc(input_size), }) else: # and one output output_shape = self.engine.get_binding_shape(idx) host_output = cuda.pagelocked_empty(trt.volume(output_shape) * self.engine.max_batch_size, dtype=np.float32) device_output = cuda.mem_alloc(host_output.nbytes) self.output_bindings.append({ "output_shape": output_shape, "host_output": host_output, "device_output": device_output, "name": self.engine.get_binding_name(idx) })
def load_model(self, engine_path): with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: self.engine = runtime.deserialize_cuda_engine(f.read()) self.context = self.engine.create_execution_context() self.device_input1, self.device_input2 = [None] * 2 for binding in self.engine: if self.engine.binding_is_input( binding): # we expect only one input input_shape = self.engine.get_binding_shape(binding) input_size = trt.volume( input_shape) * self.engine.max_batch_size * np.dtype( np.float32).itemsize # in bytes if self.device_input1 is None: self.device_input1 = cuda.mem_alloc(input_size) elif self.device_input2 is None: self.device_input2 = cuda.mem_alloc(input_size) else: raise Exception("Network expects more than 2 inputs.") else: # and one output self.output_shape = self.engine.get_binding_shape(binding) # create page-locked memory buffers (i.e. won't be swapped to disk) self.host_output = cuda.pagelocked_empty( trt.volume(self.output_shape) * self.engine.max_batch_size, dtype=np.float32) self.device_output = cuda.mem_alloc(self.host_output.nbytes) # Create a stream in which to copy inputs/outputs and run inference. self.stream = cuda.Stream()
def __init__(self, backend, ioshape, initval, extent, aliases, tags): # Call the standard matrix constructor super().__init__(backend, ioshape, initval, extent, aliases, tags) # Allocate a page-locked buffer on the host for MPI to send/recv from self.hdata = cuda.pagelocked_empty((self.nrow, self.ncol), self.dtype, 'C')
def allocate_buffersV2(engine, context): inputs = [] outputs = [] bindings = [] stream = cuda.Stream() down_stride = 1 print('engine.get_binding_format_desc', engine.get_binding_format_desc(0)) for count, binding in enumerate(engine): # print('binding:', binding) size = trt.volume( context.get_binding_shape(count)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # print('dtype:', dtype) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) print('count: ', count) print('binding: ', binding) print('binding shape: ', engine.get_binding_shape(binding)) print('binding shape: ', context.get_binding_shape(count)) print('max_batch_size:', engine.max_batch_size) print('size:', size) print('input:', inputs) print('output:', outputs) print('------------------') return inputs, outputs, bindings, stream
def allocate_buffers(engine, context): """Allocates all buffers required for an engine, i.e. host/device inputs/outputs. Args: engine: trt.ICudaEngine. context: TensorRT execution context. Returns: inputs: input buffers. outputs: outputs buffers. bindings: memory bindings. stream: TensorRT CUDA stream. """ inputs = [] outputs = [] bindings = [] stream = cuda.Stream() for binding in engine: size = trt.volume(context.get_binding_shape( engine[binding])) # * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) return inputs, outputs, bindings, stream
def _padded_array(self, ar):
    # Pad the number of columns up to the next multiple of 16 and copy the
    # input into a page-locked buffer of that shape.
    nrows_pad = ar.shape[0]
    ncols_pad = 16 * ((ar.shape[1] + 15) // 16)
    #arpad = numpy.empty((nrows_pad,ncols_pad),dtype=ar.dtype)
    arpad = cuda.pagelocked_empty((nrows_pad, ncols_pad), dtype=ar.dtype)
    arpad[0:ar.shape[0], 0:ar.shape[1]] = ar
    return arpad
def allocate_buffers(engine): """Allocates all host/device in/out buffers required for an engine.""" inputs = [] outputs = [] bindings = [] output_idx = 0 stream = cuda.Stream() assert 3 <= len(engine) <= 4 # expect 1 input, plus 2 or 3 outpus for binding in engine: binding_dims = engine.get_binding_shape(binding) if len(binding_dims) == 4: # explicit batch case (TensorRT 7+) size = trt.volume(binding_dims) elif len(binding_dims) == 3: # implicit batch case (TensorRT 6 or older) size = trt.volume(binding_dims) * engine.max_batch_size else: raise ValueError('bad dims of binding %s: %s' % (binding, str(binding_dims))) dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: # each grid has 3 anchors, each anchor generates a detection # output of 7 float32 values assert size % 7 == 0 outputs.append(HostDeviceMem(host_mem, device_mem)) output_idx += 1 return inputs, outputs, bindings, stream
def init_model(self, trt_path, ctx_id): TRT_LOGGER = trt.Logger(trt.Logger.WARNING) cuda.init() device = cuda.Device(ctx_id) self.ctx = device.make_context() with open(trt_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: engine = runtime.deserialize_cuda_engine(f.read()) self.input_buffs = {} self.output_buffs = {} self.bindings = [] self.stream = cuda.Stream() for name in engine: shape = engine.get_binding_shape(name) size = trt.volume(shape) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(name)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. self.bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(name): self.input_buffs[name] = HostDeviceMem(host_mem, device_mem, shape) else: self.output_buffs[name] = HostDeviceMem(host_mem, device_mem, shape) self.model = engine.create_execution_context() self.logger.info("Warmup up...") self.inference_loops(10)
def infer(engine, input_img, batch_size):
    # create an execution context for the engine
    context = engine.create_execution_context()
    assert (engine.get_nb_bindings() == 2)
    # create output array to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size
    # allocate pagelocked memory
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)
    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # wait for the asynchronous copies to finish before returning predictions
    stream.synchronize()
    return output
def allocate_buffers(engine): """Allocates all host/device in/out buffers required for an engine.""" inputs = [] outputs = [] bindings = [] output_idx = 0 stream = cuda.Stream() # try: # stream = cuda.Stream() # except Exception as e: # raise RuntimeError('fail to use cuda stream') from e print("Stream done") assert 3 <= len(engine) <= 4 # expect 1 input, plus 2 or 3 outpus for binding in engine: size = trt.volume(engine.get_binding_shape(binding)) * \ engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: # each grid has 3 anchors, each anchor generates a detection # output of 7 float32 values assert size % 7 == 0 outputs.append(HostDeviceMem(host_mem, device_mem)) output_idx += 1 return inputs, outputs, bindings, stream
def _allocate_buffers(self, engine): # Allocates all buffers required for an engine, i.e. host/device inputs/outputs. inputs = [] outputs = [] bindings = [] stream = cuda.Stream() out_shapes = [] input_shapes = [] out_names = [] input_names = [] max_batch_size = engine.max_batch_size for binding in engine: # get binding_shape (value == -1 means dynamic shape) binding_shape = engine.get_binding_shape(binding) # compute max_size and dtype size = abs(trt.volume(binding_shape)) * max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # collect info to appropriate list if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) input_shapes.append(binding_shape) input_names.append(binding) else: outputs.append(HostDeviceMem(host_mem, device_mem)) out_shapes.append(binding_shape) out_names.append(binding) return bindings, stream, max_batch_size, inputs, input_shapes, input_names, outputs, out_shapes, out_names
def allocate_buffers(self, engine): inputs = [] outputs = [] bindings = [] stream = cuda.Stream() for binding in engine: bindig_shape = tuple(engine.get_binding_shape(binding)) # size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size # engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) host_mem = cuda.pagelocked_empty(bindig_shape, dtype) # print('\tAllocate host buffer: host_mem -> {}, {}'.format(host_mem, host_mem.nbytes)) # host mem device_mem = cuda.mem_alloc(host_mem.nbytes) # print('\tAllocate device buffer: device_mem -> {}, {}'.format(device_mem, int(device_mem))) # device mem # print('\t# Append the device buffer to device bindings.......') bindings.append(int(device_mem)) # print('\tbindings: ', bindings) # Append to the appropriate list. if engine.binding_is_input(binding): # print("this is the input!") # print('____HostDeviceMem(host_mem, device_mem)): {}, {}'.format(HostDeviceMem(host_mem, device_mem),type(HostDeviceMem(host_mem, device_mem)))) inputs.append(HostDeviceMem(host_mem, device_mem)) else: # print("This is the output!") outputs.append(HostDeviceMem(host_mem, device_mem)) # print("----------------------end allocating one binding in the onnx model-------------------------") return inputs, outputs, bindings, stream
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    # Create a CUDA stream
    stream = cuda.Stream()
    for binding in engine:
        # trt.volume computes the element count of a shape, and
        # get_binding_shape returns the dimensions of this binding,
        # so size is the largest element count the binding may need in this engine.
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # get_binding_dtype returns the data type of this binding
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers:
        # cuda.pagelocked_empty allocates page-locked (pinned) memory on the host
        host_mem = cuda.pagelocked_empty(size, dtype)
        # cuda.mem_alloc allocates memory on the device
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list, depending on whether this is an input binding.
        if engine.binding_is_input(binding):
            # It is an input; see common.py for the HostDeviceMem implementation.
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            # Otherwise it is an output.
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def _allocate_buffers(self, context): """ Allocate device memory space for data. :param context: :return: """ inputs = [] outputs = [] bindings = [] stream = cuda.Stream() for binding in self._engine: size = trt.volume(self._engine.get_binding_shape( binding)) * self._engine.max_batch_size dtype = trt.nptype(self._engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if self._engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) return inputs, outputs, bindings, stream
def allocate_buffers(engine): inputs = [] outputs = [] bindings = [] stream = cuda.Stream() out_shapes = [] input_shapes = [] out_names = [] max_batch_size = engine.get_profile_shape(0, 0)[2][0] for binding in engine: binding_shape = engine.get_binding_shape(binding) #Fix -1 dimension for proper memory allocation for batch_size > 1 if binding_shape[0] == -1: binding_shape = (1, ) + binding_shape[1:] size = trt.volume(binding_shape) * max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) input_shapes.append(engine.get_binding_shape(binding)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) #Collect original output shapes and names from engine out_shapes.append(engine.get_binding_shape(binding)) out_names.append(binding) return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size
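Because this allocator sizes every buffer for the optimization profile's maximum batch, the runtime shape still has to be set on the execution context before each inference. The sketch below illustrates that step under assumed variable names (batch size, preprocessing, and the single-input layout are not from the source); it presumes an explicit-batch engine and `import pycuda.driver as cuda`.

# Illustrative sketch for dynamic-shape engines; not taken from the source.
context = engine.create_execution_context()
actual_batch = 4                                              # example batch size (assumed)
context.set_binding_shape(0, (actual_batch,) + tuple(input_shapes[0][1:]))
# ... fill inputs[0].host with the preprocessed batch here ...
cuda.memcpy_htod_async(inputs[0].device, inputs[0].host, stream)
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
for out in outputs:
    cuda.memcpy_dtoh_async(out.host, out.device, stream)
stream.synchronize()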
def allocate_buffers(engine, batch_size): inputs = [] outputs = [] bindings = [] stream = cuda.Stream() for binding in engine: size = trt.volume(engine.get_binding_shape(binding)) * batch_size dims = engine.get_binding_shape(binding) # in case batch dimension is -1 (dynamic) if dims[0] < 0: size *= -1 dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: outputs.append(HostDeviceMem(host_mem, device_mem)) return inputs, outputs, bindings, stream
def allocate_buffers(engine, grid_sizes): """Allocates all host/device in/out buffers required for an engine.""" inputs = [] outputs = [] bindings = [] output_idx = 0 stream = cuda.Stream() for binding in engine: size = trt.volume(engine.get_binding_shape(binding)) * \ engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem)) else: # each grid has 3 anchors, each anchor generates a detection # output of 7 float32 values assert size == grid_sizes[ output_idx] * 3 * 7 * engine.max_batch_size outputs.append(HostDeviceMem(host_mem, device_mem)) output_idx += 1 return inputs, outputs, bindings, stream
def allocate_buffers(self): """Allocates GPU memory for future use and creates an asynchronous stream""" # determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host i/o self.h_input = cuda.pagelocked_empty( trt.volume(self.engine.get_binding_shape(0)), dtype=trt.nptype(self.CONSTANTS["dtype"])) self.h_output = cuda.pagelocked_empty( trt.volume(self.engine.get_binding_shape(1)), dtype=trt.nptype(self.CONSTANTS["dtype"])) # allocate device memory for inputs and outputs self.d_input = cuda.mem_alloc(self.h_input.nbytes) self.d_output = cuda.mem_alloc(self.h_output.nbytes) self.stream = cuda.Stream()
def allocate_buffers(self): inputs = [] outputs = [] bindings = [] stream = cuda.Stream() engine = self.engine for binding in engine: shape = engine.get_binding_shape(binding) size = trt.volume(shape) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem)) # Append to the appropriate list. if engine.binding_is_input(binding): inputs.append(HostDeviceMem(host_mem, device_mem, dtype, shape)) else: outputs.append( HostDeviceMem(host_mem, device_mem, dtype, shape)) return inputs, outputs, bindings, stream
def get_async(self, stream=None, ary=None): if ary is None: ary = drv.pagelocked_empty(self.shape, self.dtype) else: assert ary.size == self.size assert ary.dtype == self.dtype if self.size: drv.memcpy_dtoh_async(ary, self.gpudata, stream) return ary
def get(self, ary=None, pagelocked=False): if ary is None: if pagelocked: ary = drv.pagelocked_empty(self.shape, self.dtype) else: ary = numpy.empty(self.shape, self.dtype) else: assert ary.size == self.size assert ary.dtype == self.dtype if self.size: drv.memcpy_dtoh(ary, self.gpudata) return ary
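A small usage note, assumed rather than taken from the source: get(pagelocked=True) can be used once to obtain a pinned host buffer that later get_async() calls reuse, avoiding repeated page-locked allocations. The object name d_arr below is a placeholder for any array exposing these two methods.

# Hypothetical usage combining get() and get_async() (assumed names).
stream = drv.Stream()
host_buf = d_arr.get(pagelocked=True)              # first download allocates a pinned buffer
for _ in range(10):
    # ... launch kernels that update d_arr on `stream` ...
    d_arr.get_async(stream=stream, ary=host_buf)   # asynchronous device-to-host copy
stream.synchronize()                               # host_buf is valid only after this point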
def __gpu_decorate_nodes(self, samples, labels): si_0 = driver.pagelocked_empty(self.n_samples, dtype = self.dtype_indices) si_1 = driver.pagelocked_empty(self.n_samples, dtype = self.dtype_indices) self.values_array = np.empty(self.n_nodes, dtype = self.dtype_labels) cuda.memcpy_dtoh(si_0, self.sorted_indices_gpu.ptr) cuda.memcpy_dtoh(si_1, self.sorted_indices_gpu_.ptr) decorate(self.target, si_0, si_1, self.values_idx_array, self.values_si_idx_array, self.values_array, self.n_nodes) self.values_idx_array = None self.values_si_idx_array = None self.left_children.resize(self.n_nodes, refcheck = False) self.right_children.resize(self.n_nodes, refcheck = False) self.feature_threshold_array.resize(self.n_nodes, refcheck = False) self.feature_idx_array.resize(self.n_nodes, refcheck = False)
def __init__(self, backend, ioshape, initval, extent, aliases, tags): # Call the standard matrix constructor super().__init__(backend, ioshape, initval, extent, aliases, tags) # If MPI is CUDA-aware then construct a buffer out of our CUDA # device allocation and pass this directly to MPI if backend.mpitype == 'cuda-aware': self.hdata = _make_pybuf(self.data, self.nbytes, 0x200) # Otherwise, allocate a buffer on the host for MPI to send/recv from else: self.hdata = cuda.pagelocked_empty((self.nrow, self.ncol), self.dtype, 'C')
def threshold_integrated(series, value): global _dn, _n, _bn, _loc_tmp, _loc_out, _val_out, _loc, _val t = numpy.float32(value**2) nb = int(numpy.ceil(float(len(series))/nt/gs)) if _bn is None or len(_bn) < nb: _bn = gpuarray.zeros(nb, dtype=numpy.uint32) if _n is None: _n = driver.pagelocked_empty((1), numpy.uint32, mem_flags=drv.host_alloc_flags.DEVICEMAP) ptr = numpy.intp(_n.base.get_device_pointer()) class T(): pass _dn = T() _dn.gpudata = ptr _dn.flags = _n.flags if _loc_tmp is None or len(series) > len(_loc_tmp): _loc_tmp = gpuarray.zeros(len(series), dtype=numpy.uint32) _loc_out = gpuarray.zeros(len(series), dtype=numpy.uint32) _val_out = gpuarray.zeros(len(series), dtype=series.dtype) _val = driver.pagelocked_empty((4096*256), numpy.complex64) _loc = driver.pagelocked_empty((4096*256), numpy.uint32) #Do the thresholding by block stuff(series.data, _loc_tmp, _bn, t, numpy.uint32(len(series)), block=(nt, 1, 1), grid=(nb, 1)) # Recombine the blocks into a final output stuff2(series.data, _loc_tmp, _loc_out, _val_out, _bn, _dn, block=(nb, 1, 1), grid=(nb, 1)) # We need to get the data back now pycbc.scheme.mgr.state.context.synchronize() if _n != 0: driver.memcpy_dtoh_async(_val[0:_n], _val_out.gpudata) driver.memcpy_dtoh_async(_loc[0:_n], _loc_out.gpudata) pycbc.scheme.mgr.state.context.synchronize() return _loc[0:_n], _val[0:_n]
def get_async(self, stream=None, ary=None): if ary is None: ary = drv.pagelocked_empty(self.shape, self.dtype) ary = _as_strided(ary, strides=self.strides) else: assert ary.size == self.size assert ary.dtype == self.dtype assert ary.flags.forc assert self.flags.forc, "Array in get() must be contiguous" if self.size: drv.memcpy_dtoh_async(ary, self.gpudata, stream) return ary
def get_async(self, stream = None, ary = None):
    if ary is None:
        ary = cuda.pagelocked_empty(self.shape, self.dtype)
    else:
        assert ary.size == self.size
        assert ary.dtype == self.dtype
        if ary.base.__class__ != cuda.HostAllocation:
            raise TypeError("asynchronous memory transfer requires pagelocked numpy array")
    if self.size:
        if self.M == 1:
            cuda.memcpy_dtoh_async(ary, self.gpudata, stream)
        else:
            PitchTrans(self.shape, ary, _pd(self.shape), self.gpudata, self.ld, self.dtype, async = True, stream = stream)
    return ary
def get(self, ary=None, astype=None, pagelocked=False): if ary is None: if pagelocked: ary = drv.pagelocked_empty(self.shape, self.dtype) else: ary = np.empty(self.shape, self.dtype) ary = _as_strided(ary, strides=self.strides) else: assert ary.size == self.size assert ary.dtype == self.dtype assert ary.flags.forc assert self.flags.forc, "Array in get() must be contiguous" if self.size: drv.memcpy_dtoh(ary, self.gpudata) if astype is not None: ary = ary.astype(astype) * 2 ** (self.iwl - 15) return ary
def get(self, ary = None, pagelocked = False):
    """
    Copy the PitchArray into an ndarray.

    If ary is specified, the device memory is transferred into ary's memory.
    If pagelocked is True, the newly allocated ndarray is backed by
    page-locked (pinned) host memory.
    """
    if ary is None:
        if pagelocked:
            ary = cuda.pagelocked_empty(self.shape, self.dtype)
        else:
            ary = np.empty(self.shape, self.dtype)
    else:
        assert ary.size == self.size
        assert ary.dtype == self.dtype
    if self.size:
        if self.M == 1:
            cuda.memcpy_dtoh(ary, self.gpudata)
        else:
            PitchTrans(self.shape, ary, _pd(self.shape), self.gpudata, self.ld, self.dtype)
    return ary
def alloc_host_mem(self, shape, dtype): return drv.pagelocked_empty(shape, dtype, order="C", mem_flags=0)
""" threshold_kernel = ElementwiseKernel( " %(tp_in)s *in, %(tp_out1)s *outv, %(tp_out2)s *outl, %(tp_th)s threshold, %(tp_n)s *bn" % { "tp_in": dtype_to_ctype(numpy.complex64), "tp_out1": dtype_to_ctype(numpy.complex64), "tp_out2": dtype_to_ctype(numpy.uint32), "tp_th": dtype_to_ctype(numpy.float32), "tp_n": dtype_to_ctype(numpy.uint32), }, threshold_op, "getstuff") import pycuda.driver as drv n = drv.pagelocked_empty((1), numpy.uint32, mem_flags=drv.host_alloc_flags.DEVICEMAP) nptr = numpy.intp(n.base.get_device_pointer()) val = drv.pagelocked_empty((4096*256), numpy.complex64, mem_flags=drv.host_alloc_flags.DEVICEMAP) vptr = numpy.intp(val.base.get_device_pointer()) loc = drv.pagelocked_empty((4096*256), numpy.int32, mem_flags=drv.host_alloc_flags.DEVICEMAP) lptr = numpy.intp(loc.base.get_device_pointer()) class T(): pass tn = T() tv = T() tl = T() tn.gpudata = nptr
stacklevel=2) ary = ary.reshape(self.shape) if ary.dtype != self.dtype: raise ValueError("ary and self must have the same dtype") if self.size: _memcpy_discontig(self, ary, async=async, stream=stream) def set_async(self, ary, stream=None): return self.set(ary, async=True, stream=stream) def get(self, ary=None, pagelocked=False, async=False, stream=None): if ary is None: if pagelocked: ary = drv.pagelocked_empty(self.shape, self.dtype) else: ary = np.empty(self.shape, self.dtype) strides = _compact_strides(self) ary = _as_strided(ary, strides=strides) else: if self.size != ary.size: raise ValueError("self and ary must be the same size") if self.shape != ary.shape: from warnings import warn warn("get() between arrays of different shape is deprecated " "and will be removed in PyCUDA 2017.x", DeprecationWarning, stacklevel=2) ary = ary.reshape(self.shape)
self.count=0 self.display=display def displayResults(self,res, cm=pylab.cm.gray, title='Specify a title'): if self.display: self.count=self.count+1 pylab.figure(self.count) pylab.imshow(res, cm, interpolation='nearest') pylab.colorbar() pylab.title(title) fx = 0 fy = 0 nx = 2**10 ny = 2**10 g1 = cuda.pagelocked_empty((nx,ny),'int16') g2 = cuda.pagelocked_empty((nx,ny),'int16') imname1 = "fr1.bin" imname2 = "fr2.bin" outname = "frame.vtk" pivim.load_bin_image(imname1,g1) pivim.load_bin_image(imname2,g2) grid = g1.astype('complex64').reshape(nx,ny) #grid2 = g2.astype('complex64').reshape(nx,ny) displayResults = mydisplay(display=True).displayResults displayResults(g1,title="Initial Grid")
def cu_template_render_image_single(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128): """ CPU part of the SPH render code that executes the rendering on the GPU does some basic particle set prunning and sets up the image tiles. It launches cuda kernels for rendering the individual sections of the image """ import pycuda.driver as drv import pycuda.tools import pycuda.autoinit from pycuda.compiler import SourceModule from radix_sort import radix_sort global_start = time.clock() start = time.clock() # construct an array of particles Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')] ps = drv.pagelocked_empty(len(s),dtype=Partstruct) with s.immediate_mode : ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']] if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start) ymin,ymax = xmin,xmax # ---------------------- # setup the global image # ---------------------- image = np.zeros((nx,ny),dtype=np.float32) dx = float32((xmax-xmin)/nx) dy = float32((ymax-ymin)/ny) x_start = xmin+dx/2 y_start = ymin+dy/2 zplane = 0.0 start = time.clock() # ------------------ # set up the kernels # ------------------ code = file('/home/itp/roskar/homegrown/template_kernel.cu').read() mod = SourceModule(code) tile_histogram = mod.get_function("tile_histogram") distribute_particles = mod.get_function("distribute_particles") tile_render_kernel = mod.get_function("tile_render_kernel") calculate_keys = mod.get_function("calculate_keys") # allocate histogram array hist = np.zeros(Ntiles,dtype=np.int32) # transfer histogram array and particle data to GPU hist_gpu = drv.mem_alloc(hist.nbytes) drv.memcpy_htod(hist_gpu,hist) start_g = drv.Event() end_g = drv.Event() start_g.record() ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes) drv.memcpy_htod(ps_on_gpu,ps_gpu) end_g.record() end_g.synchronize() if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g)) # make everything the right size xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax]) nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles]) start_g.record() tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles, block=(nthreads,1,1),grid=(32,1,1)) drv.Context.synchronize() drv.memcpy_dtoh(hist,hist_gpu) end_g.record() end_g.synchronize() if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g)) print "<<< Total particle array = %d"%(hist.sum()) # --------------------------------------------------------------------------------- # figured out the numbers of particles per tile -- set up the tile particle buffers # --------------------------------------------------------------------------------- ps_tiles = np.empty(hist.sum(),dtype=Partstruct) ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes) tile_offsets = np.array([0],dtype=np.int32) tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32)) tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes) drv.memcpy_htod(tile_offsets_gpu,tile_offsets) start_g.record() distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)), xmin, xmax, ymin, ymax, nx, ny, Ntiles, block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4) end_g.record() end_g.synchronize() if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g)) drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu) # ------------------------- # start going through tiles # ------------------------- # initialize the image on the device im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes) 
drv.memcpy_htod(im_gpu,image.astype(np.float32)) # allocate key arrays -- these will be keys to sort particles into softening bins start_g.record() keys_gpu = drv.mem_alloc(int(4*hist.sum())) calculate_keys(ps_tiles_gpu, keys_gpu, np.int32(hist.sum()), np.float32(dx), block=(nthreads,1,1),grid=(32,1,1)) end_g.record() end_g.synchronize() if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g)) keys = np.empty(hist.sum(), dtype=np.int32) # ---------------------------------------- # sort particles by their softening length # ---------------------------------------- for i in xrange(Ntiles) : n_per_tile = tile_offsets[i+1] - tile_offsets[i] if n_per_tile > 0 : radix_sort(int(keys_gpu), int(ps_tiles_gpu), tile_offsets[i], n_per_tile) drv.memcpy_dtoh(keys,keys_gpu) drv.memcpy_dtoh(ps_tiles,ps_tiles_gpu) # return keys,ps_tiles,tile_offsets,dx drv.Context.synchronize() tile_start = time.clock() for i in xrange(Ntiles) : n_per_tile = tile_offsets[i+1] - tile_offsets[i] if n_per_tile > 0 : my_stream = streams[i%16] xmin_p, xmax_p, ymin_p, ymax_p = tiles_physical[i] xmin_t, xmax_t, ymin_t, ymax_t = tiles_pix[i] nx_tile = xmax_t-xmin_t+1 ny_tile = ymax_t-ymin_t+1 # make everything the right size xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t]) xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p]) tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i), xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t, im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]), block=(nthreads,1,1),stream=my_stream) if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start) # ---------------------------------------------------------------------------------- # process the particles with large smoothing lengths concurrently with GPU execution # ---------------------------------------------------------------------------------- #if ind[1] != len(xs) : # start = time.clock() # image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:], # nx,ny,xmin,xmax,ymin,ymax)).T # if timing: print '<<< Processing %d particles with large smoothing lengths took %e s'%(len(xs)-ind[1], # time.clock()-start) drv.Context.synchronize() if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start) drv.memcpy_dtoh(image,im_gpu) drv.stop_profiler() if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start) del(start_g) del(end_g) return image
def cu_template_render_image(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128, tile_size=100): """ CPU part of the SPH render code that executes the rendering on the GPU does some basic particle set prunning and sets up the image tiles. It launches cuda kernels for rendering the individual sections of the image """ import pycuda.driver as drv import pycuda.tools import pycuda.autoinit from pycuda.compiler import SourceModule from radix_sort import radix_sort global_start = time.clock() start = time.clock() # construct an array of particles Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')] ps = drv.pagelocked_empty(len(s),dtype=Partstruct) with s.immediate_mode : ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']] if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start) ymin,ymax = xmin,xmax # ---------------------- # setup the global image # ---------------------- image = np.zeros((nx,ny),dtype=np.float32) dx = float32((xmax-xmin)/nx) dy = float32((ymax-ymin)/ny) x_start = xmin+dx/2 y_start = ymin+dy/2 zplane = 0.0 # ------------------------------------------------------------------------------------------------ # trim particles based on smoothing length -- the GPU will only render those that need < 32 pixels # ------------------------------------------------------------------------------------------------ start = time.clock() # gpu_bool = 2*ps['h'] < 15.*dx ps_gpu = ps#[gpu_bool] # ps_cpu = ps[~gpu_bool] #del(ps) if timing: '<<< Setting up gpu/cpu particle struct arrays took %f s'%(time.clock()-start) # ----------------------------------------------------------------- # set up the image slices -- max. size is 100x100 pixels # in this step only process particles that need kernels < 40 pixels # tiles are 100x100 = 1e4 pixels x 4 bytes = 40k # kernels are 31x31 pixels max = 3844 bytes # max shared memory size is 48k # ----------------------------------------------------------------- start = time.clock() tiles_pix, tiles_physical = make_tiles(nx,ny,xmin,xmax,ymin,ymax,tile_size) if timing: print '<<< Tiles made in %f s'%(time.clock()-start) Ntiles = tiles_pix.shape[0] # ------------------ # set up the kernels # ------------------ code = file(os.path.join(os.path.dirname(__file__),'template_kernel.cu')).read() mod = SourceModule(code,options=["--ptxas-options=-v"]) tile_histogram = mod.get_function("tile_histogram") distribute_particles = mod.get_function("distribute_particles") tile_render_kernel = mod.get_function("tile_render_kernel") calculate_keys = mod.get_function("calculate_keys") # ------------------------------------------------------------- # set up streams and figure out particle distributions per tile # ------------------------------------------------------------- # allocate histogram array hist = np.zeros(Ntiles,dtype=np.int32) # transfer histogram array and particle data to GPU hist_gpu = drv.mem_alloc(hist.nbytes) drv.memcpy_htod(hist_gpu,hist) start_g = drv.Event() end_g = drv.Event() start_g.record() ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes) drv.memcpy_htod(ps_on_gpu,ps_gpu) end_g.record() end_g.synchronize() if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g)) # make everything the right size xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax]) nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles]) # ----------------------------- # calculate pixels per particle # ----------------------------- # allocate key arrays -- these will be keys to sort particles into softening bins 
start_g.record() keys_gpu = drv.mem_alloc(int(4*len(s))) calculate_keys(ps_on_gpu, keys_gpu, np.int32(len(s)), np.float32(dx), block=(nthreads,1,1),grid=(1024,1,1)) end_g.record() end_g.synchronize() if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g)) # ---------------------------------------- # sort particles by their softening length # ---------------------------------------- start_g.record() radix_sort(int(keys_gpu), int(ps_on_gpu), np.int32(0), np.int32(len(s))) end_g.record() end_g.synchronize() if timing: print '<<< Radix sorting all tiles took %f ms'%(start_g.time_till(end_g)) start_g.record() tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles, block=(nthreads,1,1),grid=(1024,1,1)) drv.Context.synchronize() drv.memcpy_dtoh(hist,hist_gpu) end_g.record() end_g.synchronize() if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g)) print "<<< Total particle array = %d"%(hist.sum()) # --------------------------------------------------------------------------------- # figured out the numbers of particles per tile -- set up the tile particle buffers # --------------------------------------------------------------------------------- ps_tiles = np.empty(hist.sum(),dtype=Partstruct) ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes) tile_offsets = np.array([0],dtype=np.int32) tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32)) tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes) drv.memcpy_htod(tile_offsets_gpu,tile_offsets) start_g.record() distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)), xmin, xmax, ymin, ymax, nx, ny, Ntiles, block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4) end_g.record() end_g.synchronize() if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g)) drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu) # ------------------------- # start going through tiles # ------------------------- # initialize the image on the device im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes) drv.memcpy_htod(im_gpu,image.astype(np.float32)) tile_start = time.clock() streams = [drv.Stream() for i in range(16)] for i in xrange(Ntiles) : n_per_tile = tile_offsets[i+1] - tile_offsets[i] if n_per_tile > 0 : my_stream = streams[i%(16)] xmin_p, xmax_p, ymin_p, ymax_p = tiles_physical[i] xmin_t, xmax_t, ymin_t, ymax_t = tiles_pix[i] nx_tile = xmax_t-xmin_t+1 ny_tile = ymax_t-ymin_t+1 # make everything the right size xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t]) xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p]) if n_per_tile > nthreads*256: ngrid=128 else : ngrid = 64 tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i), xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t, im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]), block=(nthreads,1,1),grid=(ngrid,1,1),stream=my_stream) if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start) # ---------------------------------------------------------------------------------- # process the particles with large smoothing lengths concurrently with GPU execution # ---------------------------------------------------------------------------------- #if ind[1] != len(xs) : # start = time.clock() # image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:], # nx,ny,xmin,xmax,ymin,ymax)).T # if timing: print '<<< Processing %d particles with large smoothing lengths took %e 
s'%(len(xs)-ind[1], # time.clock()-start) drv.Context.synchronize() if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start) drv.memcpy_dtoh(image,im_gpu) drv.stop_profiler() if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start) del(start_g) del(end_g) return image
        self.count = 0
        self.display = display

    def displayResults(self, res, cm=pylab.cm.gray, title='Specify a title'):
        if self.display:
            self.count = self.count + 1
            pylab.figure(self.count)
            pylab.imshow(res, cm, interpolation='nearest')
            pylab.colorbar()
            pylab.title(title)

fx = 0
fy = 0
nx = 2**10
ny = 2**10

g1 = cuda.pagelocked_empty(1024*1024, 'int16')
g2 = cuda.pagelocked_empty(1024*1024, 'int16')
cuda.load_bin_image("synthetic4_B.bin", g1)
cuda.load_bin_image("synthetic4_A.bin", g2)

grid = g1.astype('complex64').reshape(1024, 1024)
grid2 = g2.astype('complex64').reshape(1024, 1024)

#grid = FGrid(fx,fy,nx,ny).grid
#grid2 = FGrid(fx,fy,nx,ny).grid
#grid[0:4,0:4] = 5+0j;
#grid[7:11,0:4] = 5+0j;
#grid2[1:5,1:5] = 5+0j;
increment_mod = SourceModule("""
__global__ void increment(double *a, int N)
{
    int idx = threadIdx.x;
    if (idx < N)
        a[idx] = a[idx] + 1;
}
""")

increment = increment_mod.get_function("increment")

N = 20
M = 3

# Time use of pinned host memory:
x = drv.pagelocked_empty((N, N), np.float64, mem_flags=drv.host_alloc_flags.DEVICEMAP)
x_gpu_ptr = np.intp(x.base.get_device_pointer())

times = np.empty(M)
for i in xrange(M):
    x[:, :] = np.random.rand(N, N)
    x_orig = x.copy()
    start = time()
    increment(x_gpu_ptr, np.uint32(x.size), block=(512, 1, 1))
    times[i] = time() - start
    np.allclose(x_orig + 1, x)

print "Average kernel execution time with pinned memory: %3.7f" % np.mean(times)

# Time use of pageable host memory:
x = np.empty((N, N), np.float64)
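The snippet is cut off right after the pageable array is allocated. A possible continuation, offered only as an assumption about how the comparison was meant to finish: with pageable memory there is no device mapping, so the array is copied explicitly around the same increment kernel (the x_gpu buffer below is introduced here). The blocking memcpy_dtoh also acts as the synchronization point that the mapped-memory loop above leaves implicit, since kernel launches return asynchronously.

x_gpu = drv.mem_alloc(x.nbytes)
times = np.empty(M)
for i in xrange(M):
    x[:, :] = np.random.rand(N, N)
    x_orig = x.copy()
    start = time()
    drv.memcpy_htod(x_gpu, x)                               # explicit copy in
    increment(x_gpu, np.uint32(x.size), block=(512, 1, 1))  # same kernel as above
    drv.memcpy_dtoh(x, x_gpu)                               # blocking copy back
    times[i] = time() - start
    assert np.allclose(x_orig + 1, x)

print "Average kernel execution time with pageable memory: %3.7f" % np.mean(times)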
def __init__(self, array_or_dtype, x_overlap=0):
    """ Create a spatial grid on the GPU(s).

    Input variables
    array_or_dtype -- can either be a numpy array of the same shape as the
        global space, or a numpy dtype. If a valid array is passed, it will
        be loaded on to the GPU. If a dtype is passed, then an array of
        zeros, of that dtype, will be loaded onto the GPU.

    Optional variables
    x_overlap -- the number of adjacent cells in either the negative or
        positive x-direction that need to simultaneously be accessed along
        with the current cell. Must be a non-negative integer. Default
        value is 0.

    """

    shape = get_space_info()['shape'] # Get the shape of the space.
    xr = get_space_info()['x_range'] # Get the local x_range.
    all_x_ranges = get_space_info()['all_x_ranges'] # Get the x_ranges of all nodes.
    local_shape = (xr[1]-xr[0], shape[1], shape[2])

    self._set_gce_type('grid') # Set the gce type to grid.

    # Make sure the overlap option is valid.
    if type(x_overlap) is not int:
        raise TypeError('x_overlap must be an integer.')
    elif x_overlap < 0:
        raise TypeError('x_overlap must be a non-negative integer.')

    if comm.rank == 0:
        # Process the array_or_dtype input variable.
        if type(array_or_dtype) is np.ndarray: # Input is an array.
            array = array_or_dtype

            # Make sure the array is of the correct shape.
            if array.shape != shape:
                raise TypeError('Shape of array does not match shape of space.')

            # Make sure the array is of a valid datatype.
            self._get_dtype(array.dtype.type)

        elif type(array_or_dtype) is type: # Input is a datatype.
            self._get_dtype(array_or_dtype) # Validate the dtype.
            array = np.zeros(shape, dtype=self.dtype) # Make a zeros array.

        else: # Invalid input.
            raise TypeError('Input variable must be a numpy array or dtype')

        # Prepare the array to be scattered.
        array = [array[r[0]:r[1],:,:] for r in all_x_ranges]
    else:
        array = None

    array = comm.scatter(array)
    self._get_dtype(array.dtype.type)

#     # Narrow down the array to the local x_range.
#     array = array[xr[0]:xr[1],:,:]

    # Add padding to the array, if needed.
    self._xlap = x_overlap
    if self._xlap != 0:
        padding = np.empty((self._xlap,) + shape[1:3], dtype=array.dtype)
        array = np.concatenate((padding, array, padding), axis=0)

    self.to_gpu(array) # Load onto the device.

    # Determine the information needed for synchronization.
    if self._xlap != 0:
        # Calculates the pointer to the x offset in a grid.
        ptr_dx = lambda x_pos: self.data.ptr + self.data.dtype.itemsize * \
                                x_pos * shape[1] * shape[2]

        # Pointers to different sections of the grid that are relevant
        # for synchronization.
        self._sync_ptrs = {'forw_src': ptr_dx(xr[1]-xr[0]), \
                           'back_dest': ptr_dx(0), \
                           'back_src': ptr_dx(self._xlap), \
                           'forw_dest': ptr_dx(xr[1]-xr[0] + self._xlap)}

        # Buffers used during synchronization.
        self._sync_buffers = [drv.pagelocked_empty( \
                                (self._xlap, shape[1], shape[2]), \
                                self.dtype) for k in range(4)]

        # Streams used during synchronization.
        self._sync_streams = [drv.Stream() for k in range(4)]

        # Used to identify the neighboring MPI nodes with whom to synchronize.
        self._sync_adj = get_space_info()['mpi_adj']

        # Offset in bytes to the true start of the grid.
        # This is used to "hide" overlap areas from the kernel.
        self._xlap_offset = self.data.dtype.itemsize * \
                            self._xlap * shape[1] * shape[2]

        self.synchronize() # Synchronize the grid.

    comm.Barrier() # Wait for all grids to synchronize before proceeding.
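The _sync_buffers above are allocated with pagelocked_empty because only page-locked host memory can serve as the host side of the asynchronous copies used during synchronization. A reduced sketch of that transfer pattern, with made-up shapes and names (xlap, grid_gpu, buf) standing in for the real grid:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

# Hypothetical overlap depth and transverse dimensions, for illustration only.
xlap, ny, nz = 2, 64, 64

grid_gpu = gpuarray.to_gpu(np.random.rand(32, ny, nz).astype(np.float32))

# Page-locked host buffer and a dedicated stream, mirroring _sync_buffers
# and _sync_streams above.
buf = drv.pagelocked_empty((xlap, ny, nz), np.float32)
stream = drv.Stream()

# Asynchronously copy the first `xlap` x-slabs into the host buffer; the
# host side of an *_async copy must be page-locked.
drv.memcpy_dtoh_async(buf, grid_gpu.ptr, stream)
stream.synchronize()   # wait for this stream only, not the whole context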
    Pdb(color_scheme="LightBG").set_trace()
    _magma_spotrf_gpu(
        char_pointer(uplo),
        int_pointer(n),
        # ====================================================================
        # = I think this argument is responsible for the segmentation fault. =
        # ====================================================================
        ctypes.POINTER(ctypes.c_float).from_address(get_gpu_pointer(A)),
        int_pointer(lda),
        work.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        int_pointer(info),
    )
    return info

if __name__ == "__main__":
    n = 10

    # Create the matrix to be factored
    A_orig = (np.eye(n) + np.ones((n, n)) * 0.3).astype("float32")
    A_gpu = cuda.to_device(A_orig)

    # Allocate a pagelocked work array
    nwork = magma_get_spotrf_nb(int_pointer(n))
    print nwork
    work_gpu = cuda.pagelocked_empty((nwork, nwork), dtype="float32")

    # Do the Cholesky factorization
    info = magma_spotrf_gpu("L", n, A_gpu, n, work_gpu)
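The comment in the snippet flags the matrix argument, and one plausible explanation is the use of from_address: POINTER(c_float).from_address(addr) tells ctypes to read the pointer value out of host memory at addr, i.e. it dereferences the device address on the host. A sketch of the more usual way to hand a raw device address to a ctypes-declared float* parameter (A_orig and A_gpu are re-created here purely for illustration):

import ctypes
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

A_orig = (np.eye(10) + np.ones((10, 10)) * 0.3).astype("float32")
A_gpu = cuda.to_device(A_orig)      # DeviceAllocation; int(A_gpu) is the raw address

# Wrap the address *value* itself, rather than treating it as a host
# location to read a pointer from.
A_arg = ctypes.cast(ctypes.c_void_p(int(A_gpu)), ctypes.POINTER(ctypes.c_float))

# A_arg can now be passed to a ctypes-declared function expecting float*,
# such as the _magma_spotrf_gpu wrapper above.
assert ctypes.addressof(A_arg.contents) == int(A_gpu)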
DMP = DeviceMemoryPool()

"""
cells = nps.cells
max_cell_pop = 0
for cellkey in cells.keys():
    #print cellkey, cells[cellkey].nparticles[0]
    #print cells[cellkey].lindices[0].get_npy_array()
    if cells[cellkey].nparticles[0] > max_cell_pop:
        max_cell_pop = cells[cellkey].nparticles[0]
"""

max_cell_pop_gpu, nc, num_particles = nps.get_max_cell_pop(0, DMP)
max_cell_pop = max_cell_pop_gpu.get()

# Mapped (DEVICEMAP) so that the buffer is visible in the device address
# space and get_device_pointer() below is valid.
nbrs_gpu = cuda.pagelocked_empty(shape=(pa.num_real_particles, 27*max_cell_pop),
                                 dtype=np.int32,
                                 mem_flags=cuda.host_alloc_flags.DEVICEMAP)
nbrs_gpu_ptr = np.intp(nbrs_gpu.base.get_device_pointer())

nnbrs_gpu = cuda.pagelocked_empty(shape=(pa.num_real_particles,), dtype=np.int32,
                                  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
nnbrs_gpu_ptr = np.intp(nnbrs_gpu.base.get_device_pointer())

cells_gpu = gpuarray.zeros((nc[0]*nc[1]*nc[2], int(max_cell_pop)), dtype=np.int32) - 1
#cellpop_gpu = gpuarray.zeros((nc[0], nc[1], nc[2]), dtype=np.int32)

indices = arange_uint(num_particles)

iters = 100
print "NumPa:", pa.num_real_particles
bint = 0.
npst = 0.
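The neighbour buffers above rely on mapped (zero-copy) page-locked memory: a kernel writes through the device pointer returned by get_device_pointer(), and the host then reads the same allocation without an explicit copy back. A stripped-down sketch of that round trip, using a hypothetical fill_index kernel in place of the real neighbour search:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

# Illustrative kernel: writes its flat thread index into a mapped buffer.
mod = SourceModule("""
__global__ void fill_index(int *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = i;
}
""")
fill_index = mod.get_function("fill_index")

n = 1024
out = cuda.pagelocked_empty((n,), np.int32,
                            mem_flags=cuda.host_alloc_flags.DEVICEMAP)
out_ptr = np.intp(out.base.get_device_pointer())

fill_index(out_ptr, np.int32(n), block=(256, 1, 1), grid=(n // 256, 1, 1))
cuda.Context.synchronize()   # make sure the kernel has finished writing

print(out[:8])   # the host sees the results without an explicit memcpy_dtoh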
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np

from magma_wrapper import magma_spotrf_gpu_wrap, magma_get_spotrf_nb_wrap

n = 1000

# Create the matrix to be factored
A = np.eye(n, dtype='float32') + np.ones((n, n))*.1
A_gpu = cuda.to_device(A)

# Allocate a pagelocked work array
nwork = magma_get_spotrf_nb_wrap(n)
work_gpu = cuda.pagelocked_empty((nwork, nwork), dtype='float32')

# Do the Cholesky factorization
info = magma_spotrf_gpu_wrap('U', n, A_gpu, n, work_gpu)

# Copy back the Cholesky factor and check for correctness
L = cuda.from_device(A_gpu, (n, n), 'float32')
print np.abs(np.dot(L, L.T) - A).max()
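One caveat when checking the factor copied back from the GPU: potrf-style routines overwrite only the requested triangle of the input, so the untouched triangle should be discarded before reconstructing A. A small CPU-only sketch of such a check, in which numpy's own Cholesky stands in for the MAGMA result so the example runs without the wrapper:

import numpy as np

n = 4
A = np.eye(n, dtype='float32') + np.ones((n, n), dtype='float32') * .1

# Stand-in for the buffer copied back from the GPU.
L = np.linalg.cholesky(A)

# Keep only the lower triangle before reconstructing A; with a real potrf
# output the other triangle still holds the untouched input entries.
L = np.tril(L)

print(np.abs(np.dot(L, L.T) - A).max())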