def _univariate_pdf_call(cu_func, data, packed_params, get):
    """
    Invoke a univariate pdf CUDA kernel over `data` for every row of
    `packed_params`.

    Parameters
    ----------
    cu_func : pycuda kernel function
    data : ndarray or GPUArray
    packed_params : ndarray, one parameter set per row
    get : bool
        If True copy the result back to the host, otherwise return the
        device array.

    Returns
    -------
    (ndata, nparams) densities as ndarray, or the GPUArray destination
    when get=False.
    """
    n_data = len(data)
    n_params = len(packed_params)
    packed_params = util.prep_ndarray(packed_params)

    # Launch geometry is tuned from the kernel's register usage.
    regs = cu_func.num_regs
    data_per, params_per = util.tune_blocksize(data, packed_params, regs)
    shared_mem = util.compute_shmem(data, packed_params,
                                    data_per, params_per)
    block_design = (data_per * params_per, 1, 1)
    grid_design = (util.get_boxes(n_data, data_per),
                   util.get_boxes(n_params, params_per))

    # Device buffers; kernel signature lives in cufiles/univcaller.cu
    gpu_dest = gpu_empty((n_data, n_params), dtype=np.float32)
    if isinstance(data, GPUArray):
        gpu_data = data
    else:
        gpu_data = to_gpu(data)
    gpu_packed_params = to_gpu(packed_params)

    # block design, data length, then the params array's shape
    design = np.array((data_per, params_per, len(data)) +
                      packed_params.shape,
                      dtype=np.int32)

    args = (gpu_dest, gpu_data, gpu_packed_params) + tuple(design)
    cu_func(*args, block=block_design, grid=grid_design, shared=shared_mem)

    if not get:
        return gpu_dest
    result = gpu_dest.get()
    if n_params > 1:
        result = result.reshape((n_params, n_data), order='C').T
    return result
def sample_discrete(in_densities, logged=False, pad=False,
                    return_gpuarray=False):
    """
    Takes a categorical sample from the unnormalized univariate
    densities defined in the rows of 'densities'

    Parameters
    ---------
    in_densities : ndarray or gpuarray (n, k)
    logged : boolean indicating whether densities is on the log scale
    pad : boolean, pad the column count to a multiple of 16 first
    return_gpuarray : boolean, skip the copy back to the host

    Returns
    -------
    indices : ndarray or gpuarray (if return_gpuarray=True)
    of length n and dtype = int32
    """
    if pad:
        # identity element for the relevant scale: log(1)=0 would be fill=0,
        # but the original code pads logged data with 1 — kept as-is
        densities = util.pad_data_mult16(in_densities, fill=1 if logged else 0)
    else:
        densities = in_densities

    n, k = densities.shape

    kernel_name = 'sample_discrete_logged' if logged else 'sample_discrete'
    cu_func = cu_module.get_function(kernel_name)

    # The kernel expects row-major (C-order) densities on the device.
    if isinstance(densities, GPUArray):
        if densities.flags.f_contiguous:
            # BUG FIX: the old `densities.reshape(k, n, 'C')` statement
            # discarded its result (GPUArray.reshape is not in-place);
            # util.transpose alone performs the layout conversion.
            gpu_densities = util.transpose(densities)
        else:
            gpu_densities = densities
    else:
        gpu_densities = to_gpu(util.prep_ndarray(densities))

    # one uniform variate per row drives each categorical draw
    gpu_random = to_gpu(np.asarray(np.random.rand(n), dtype=np.float32))
    gpu_dest = gpu_empty(n, dtype=np.float32)

    # force an odd stride — presumably to avoid shared-memory bank
    # conflicts; TODO confirm against the kernel source
    stride = gpu_densities.shape[1]
    if stride % 2 == 0:
        stride += 1
    dims = np.array([n, k, gpu_densities.shape[1], stride], dtype=np.int32)

    # optimize design ...
    grid_design, block_design = _tune_sfm(n, stride, cu_func.num_regs)
    # stride floats per row plus one float per thread, 4 bytes each
    shared_mem = 4 * (block_design[0] * stride + block_design[0])

    cu_func(gpu_densities, gpu_random, gpu_dest,
            dims[0], dims[1], dims[2], dims[3],
            block=block_design, grid=grid_design, shared=shared_mem)
    gpu_random.gpudata.free()

    if return_gpuarray:
        return gpu_dest
    res = gpu_dest.get()
    gpu_dest.gpudata.free()
    return res
def sample_discrete(densities, logged=False, return_gpuarray=False):
    """
    Takes a categorical sample from the unnormalized univariate
    densities defined in the rows of 'densities'

    Parameters
    ---------
    densities : ndarray or gpuarray (n, k)
    logged: boolean indicating whether densities is on the log scale ...

    Returns
    -------
    indices : ndarray or gpuarray (if return_gpuarray=True)
    of length n and dtype = int32
    """
    from gpustats.util import info

    n, k = densities.shape

    # move densities to the device, converting to C order if needed
    if isinstance(densities, GPUArray):
        gpu_densities = (util.transpose(densities)
                         if densities.flags.f_contiguous else densities)
    else:
        gpu_densities = to_gpu(util.prep_ndarray(densities))

    cu_func = cu_module.get_function('sample_discrete')

    # one uniform draw per row; destination holds the sampled indices
    gpu_random = to_gpu(np.asarray(np.random.rand(n), dtype=np.float32))
    gpu_dest = gpu_empty(n, dtype=np.int32)
    dims = np.array([n, k, logged], dtype=np.int32)

    # block width depends on the device's thread-per-block limit
    x_block_dim = 16 if info.max_block_threads < 1024 else 32
    y_block_dim = 16
    block_design = (x_block_dim, y_block_dim, 1)
    grid_design = (int(n / y_block_dim) + 1, 1)
    shared_mem = 4 * ((x_block_dim + 1) * y_block_dim + 2 * y_block_dim)

    cu_func(gpu_densities, gpu_random, gpu_dest,
            dims[0], dims[1], dims[2],
            block=block_design, grid=grid_design, shared=shared_mem)
    gpu_random.gpudata.free()

    if return_gpuarray:
        return gpu_dest
    host_result = gpu_dest.get()
    gpu_dest.gpudata.free()
    return host_result
def sample_discrete(densities, logged=False, return_gpuarray=False):
    """
    Takes a categorical sample from the unnormalized univariate
    densities defined in the rows of 'densities'

    Parameters
    ---------
    densities : ndarray or gpuarray (n, k)
    logged: boolean indicating whether densities is on the log scale ...

    Returns
    -------
    indices : ndarray or gpuarray (if return_gpuarray=True)
    of length n and dtype = int32
    """
    from gpustats.util import info

    n, k = densities.shape

    # stage the densities on the device in C order
    if isinstance(densities, GPUArray):
        if densities.flags.f_contiguous:
            dev_densities = util.transpose(densities)
        else:
            dev_densities = densities
    else:
        prepped = util.prep_ndarray(densities)
        dev_densities = to_gpu(prepped)

    cu_func = cu_module.get_function("sample_discrete")

    # per-row uniforms plus the int32 output buffer
    dev_uniforms = to_gpu(np.asarray(np.random.rand(n), dtype=np.float32))
    dev_out = gpu_empty(n, dtype=np.int32)
    dims = np.array([n, k, logged], dtype=np.int32)

    # narrower blocks on devices capped below 1024 threads/block
    if info.max_block_threads < 1024:
        block_x = 16
    else:
        block_x = 32
    block_y = 16

    block_design = (block_x, block_y, 1)
    grid_design = (int(n / block_y) + 1, 1)
    shared_mem = 4 * ((block_x + 1) * block_y + 2 * block_y)

    cu_func(
        dev_densities,
        dev_uniforms,
        dev_out,
        dims[0],
        dims[1],
        dims[2],
        block=block_design,
        grid=grid_design,
        shared=shared_mem,
    )
    dev_uniforms.gpudata.free()

    if return_gpuarray:
        return dev_out

    out = dev_out.get()
    dev_out.gpudata.free()
    return out
def _multivariate_pdf_call(cu_func, data, packed_params, get, order,
                           datadim=None):
    """
    Invoke a multivariate pdf CUDA kernel over `data` for every row of
    `packed_params`.

    Parameters
    ----------
    cu_func : pycuda kernel function
    data : ndarray or GPUArray (ndata, dim)
        Host data is padded via util.pad_data; device data is assumed
        already padded.
    packed_params : ndarray, one parameter set per row
    get : bool
        Copy results back to the host when True.
    order : {'F', 'C'}
        Memory layout of the returned array.
    datadim : int, optional
        Non-padded column count when `data` is already a (padded)
        GPUArray; defaults to data.shape[1].

    Returns
    -------
    (ndata, nparams) densities as ndarray (get=True) or GPUArray.
    """
    packed_params = util.prep_ndarray(packed_params)
    func_regs = cu_func.num_regs

    # Prep the data. Skip if already on the GPU.
    if isinstance(data, GPUArray):
        padded_data = data
        # BUG FIX: was `datadim == None`; identity test is correct for None
        if datadim is None:
            ndata, dim = data.shape
        else:
            ndata, dim = data.shape[0], datadim
    else:
        ndata, dim = data.shape
        padded_data = util.pad_data(data)

    nparams = len(packed_params)
    data_per, params_per = util.tune_blocksize(padded_data,
                                               packed_params,
                                               func_regs)
    shared_mem = util.compute_shmem(padded_data, packed_params,
                                    data_per, params_per)
    block_design = (data_per * params_per, 1, 1)
    grid_design = (util.get_boxes(ndata, data_per),
                   util.get_boxes(nparams, params_per))

    # see cufiles/mvcaller.cu
    design = np.array(((data_per, params_per) +  # block design
                       padded_data.shape +       # data spec
                       (dim,) +                  # non-padded data columns
                       packed_params.shape),     # params spec
                      dtype=np.int32)

    if nparams == 1:
        gpu_dest = gpu_empty(ndata, dtype=np.float32)
    else:
        gpu_dest = gpu_empty((ndata, nparams), dtype=np.float32, order='F')

    # Upload data if not already uploaded
    if not isinstance(padded_data, GPUArray):
        gpu_padded_data = to_gpu(padded_data)
    else:
        gpu_padded_data = padded_data
    gpu_packed_params = to_gpu(packed_params)

    params = (gpu_dest, gpu_padded_data, gpu_packed_params) + tuple(design)
    kwds = dict(block=block_design, grid=grid_design, shared=shared_mem)
    cu_func(*params, **kwds)
    gpu_packed_params.gpudata.free()

    if get:
        if order == 'F':
            return gpu_dest.get()
        else:
            return np.asarray(gpu_dest.get(), dtype=np.float32, order='C')
    else:
        if order == 'F' or nparams == 1:
            return gpu_dest
        else:
            res = gpu_transpose(util.GPUarray_reshape(gpu_dest,
                                                      (nparams, ndata), "C"))
            gpu_dest.gpudata.free()
            return res
def _multivariate_pdf_call(cu_func, data, packed_params, get, order,
                           datadim=None):
    """
    Run a multivariate pdf kernel against every row of `packed_params`.

    `data` may be a host ndarray (padded here) or an already-padded
    GPUArray; `datadim` overrides the non-padded column count for the
    latter.  Results come back as an (n_data, n_params) array — on the
    host when `get` is true, otherwise as a GPUArray in the requested
    `order`.
    """
    packed_params = util.prep_ndarray(packed_params)

    # Prep the data. Skip if gpu data...
    if isinstance(data, GPUArray):
        padded = data
        n_data = data.shape[0]
        dim = data.shape[1] if datadim is None else datadim
    else:
        n_data, dim = data.shape
        padded = util.pad_data(data)

    n_params = len(packed_params)

    # launch geometry from register pressure
    data_per, params_per = util.tune_blocksize(
        padded, packed_params, cu_func.num_regs
    )
    shared_mem = util.compute_shared_mem(
        padded, packed_params, data_per, params_per
    )
    block_design = (data_per * params_per, 1, 1)
    grid_design = (util.get_boxes(n_data, data_per),
                   util.get_boxes(n_params, params_per))

    # see cufiles/mvcaller.cu: block design, data spec, non-padded
    # column count, then the params array's shape
    design = np.array(
        (data_per, params_per)
        + padded.shape
        + (dim,)
        + packed_params.shape,
        dtype=np.int32
    )

    if n_params == 1:
        dest = gpu_empty(n_data, dtype=np.float32)
    else:
        dest = gpu_empty((n_data, n_params), dtype=np.float32, order='F')

    # upload host data; device data passes straight through
    padded_on_gpu = padded if isinstance(padded, GPUArray) else to_gpu(padded)
    params_on_gpu = to_gpu(packed_params)

    cu_func(dest, padded_on_gpu, params_on_gpu, *tuple(design),
            block=block_design, grid=grid_design, shared=shared_mem)
    params_on_gpu.gpudata.free()

    if get:
        host = dest.get()
        if order == 'F':
            return host
        return np.asarray(host, dtype=np.float32, order='C')

    if order == 'F' or n_params == 1:
        return dest
    transposed = gpu_transpose(
        util.gpu_array_reshape(dest, (n_params, n_data), "C")
    )
    dest.gpudata.free()
    return transposed