def __init__(self, seed=None):
    """Set up the handler's dtype, cuBLAS context and GPU random generator.

    seed: value used to seed the GPU random number generator. When None,
          a seed is requested from ``global_rnd.generate_seed()``.
    """
    super(PyCudaHandler, self).__init__()
    self.dtype = np.float32
    self.context = cumisc._global_cublas_handle
    # Zero-dimensional GPU array of the handler's dtype.
    self.EMPTY = gpuarray.zeros((), dtype=self.dtype)

    rng_seed = global_rnd.generate_seed() if seed is None else seed

    def _constant_seeds(count):
        # Every one of the `count` requested seeds is the same value.
        host_seeds = np.ones(count, np.int32) * rng_seed
        return gpuarray.to_gpu(host_seeds)

    self.rnd = XORWOWRandomNumberGenerator(seed_getter=_constant_seeds)
def _init(self, ref_image):
    """Initialise GPU state from the first reference image.

    ref_image: object exposing a ``size`` attribute, which is stored as
               the number of pixels.

    If ``self.max_ref_images`` is None it is set to the integer part of
    the square root of the pixel count.
    """
    skcuda.linalg.init()

    pixel_count = ref_image.size
    self.n_pixels = pixel_count
    if self.max_ref_images is None:
        self.max_ref_images = int(np.sqrt(pixel_count))

    # GPU array holding the reference images as rows (hence equal to B.T).
    # It starts out filled with normally distributed random data.
    bt_shape = (self.max_ref_images, pixel_count)
    normal_rng = XORWOWRandomNumberGenerator()
    self.BT_gpu = normal_rng.gen_normal(bt_shape, float)

    # Bookkeeping: no reference images have been stored yet.
    self.next_ref_image_index = 0
    self.initialised = True
    self.ref_image_hashes = []
    self.n_ref_images = 0
def generar_numeros_normal(size, desv):
    """Return a host array of GPU-generated normal random numbers.

    size: pair ``(rows, cols)`` giving the shape of the returned array.
    desv: accepted for interface compatibility; it is not read anywhere
          in this function.
          NOTE(review): the name suggests a standard deviation, but the
          samples are returned exactly as ``gen_normal`` produces them --
          confirm whether callers expect scaling.

    Returns a float32 array of shape ``(rows, cols)`` copied back from
    the GPU.
    """
    rows, cols = size
    rng = XORWOWRandomNumberGenerator()
    # Generate as a flat vector on the GPU, then view it as 2D.
    flat_gpu = rng.gen_normal(shape=rows * cols, dtype=np.float32)
    return flat_gpu.reshape((rows, cols)).get()
def numGen(size, desv):
    """Generate normal random numbers on the GPU and copy them to the host.

    size: two-element sequence ``(n_rows, n_cols)``.
    desv: not used by this function.
          NOTE(review): looks like it was meant to be a standard
          deviation -- verify against callers before relying on it.

    Returns a float32 array of shape ``(n_rows, n_cols)``.
    """
    n_rows, n_cols = size
    total = n_rows * n_cols
    samples_gpu = XORWOWRandomNumberGenerator().gen_normal(
        shape=total, dtype=np.float32)
    samples_2d = samples_gpu.reshape((n_rows, n_cols))
    # .get() transfers the GPU array into host memory.
    return samples_2d.get()
def __init__(self, gpu_ctx, gpu_stream,
             nx, ny, dx, dy,
             boundaryConditions, staggered,
             soar_q0=None, soar_L=None,
             interpolation_factor=1,
             use_lcg=False,
             angle=np.array([[0]], dtype=np.float32),
             coriolis_f=np.array([[0]], dtype=np.float32),
             block_width=16, block_height=16):
    """
    Initiates a class that generates small scale geostrophically balanced perturbations of the ocean state.
    gpu_ctx: object whose get_kernel() is used to load "ocean_noise.cu" and "reductions.cu"
    gpu_stream: stream handed to every Common.CUDAArray2D allocated here
    (nx, ny): number of internal grid cells in the domain
    (dx, dy): size of each grid cell
    boundaryConditions: object providing isPeriodicNorthSouth() and isPeriodicEastWest()
    staggered: truthy/falsy flag, stored as the integer 1 or 0
    soar_q0: amplitude parameter for the perturbation, default: dx*1e-5
    soar_L: length scale of the perturbation covariance, default: 0.75*dx*interpolation_factor
    interpolation_factor: indicates that the perturbation of eta should be generated on a coarse mesh,
        and then interpolated down to the computational mesh. The coarse mesh will then have
        (nx/interpolation_factor, ny/interpolation_factor) grid cells.
        Must be a positive odd integer that divides both nx and ny.
    use_lcg: LCG is a linear algorithm for generating a series of pseudo-random numbers.
        When False, a XORWOWRandomNumberGenerator is created instead of a seed buffer.
    angle: Angle of rotation from North to y-axis as a texture (cuda.Array) or numpy array
    coriolis_f: Coriolis field as a texture (cuda.Array) or numpy array
    (block_width, block_height): The size of each GPU block

    Raises AssertionError if interpolation_factor is not a positive odd
    integer, or if nx or ny is not divisible by it.
    """

    self.use_lcg = use_lcg

    # Set numpy random state
    self.random_state = np.random.RandomState()

    # Make sure that all variables initialized within ifs are defined
    self.random_numbers = None
    self.rng = None
    self.seed = None
    self.host_seed = None

    self.gpu_ctx = gpu_ctx
    self.gpu_stream = gpu_stream

    self.nx = np.int32(nx)
    self.ny = np.int32(ny)
    self.dx = np.float32(dx)
    self.dy = np.float32(dy)
    # Plain Python int: the np.int alias (which was just the builtin int)
    # has been removed from NumPy and raises AttributeError there.
    self.staggered = 1 if staggered else 0

    # The cutoff parameter is hard-coded.
    # The size of the cutoff determines the computational radius in the
    # SOAR function. Hence, the size of the local memory in the GPU
    # kernels has to be hard-coded.
    self.cutoff = np.int32(config.soar_cutoff)

    # Check that the interpolation factor plays well with the grid size:
    assert (interpolation_factor > 0 and interpolation_factor % 2 == 1), 'interpolation_factor must be a positive odd integer'
    assert (nx % interpolation_factor == 0), 'nx must be divisible by the interpolation factor'
    assert (ny % interpolation_factor == 0), 'ny must be divisible by the interpolation factor'
    self.interpolation_factor = np.int32(interpolation_factor)

    # The size of the coarse grid
    self.coarse_nx = np.int32(nx / self.interpolation_factor)
    self.coarse_ny = np.int32(ny / self.interpolation_factor)
    self.coarse_dx = np.float32(dx * self.interpolation_factor)
    self.coarse_dy = np.float32(dy * self.interpolation_factor)

    self.periodicNorthSouth = np.int32(boundaryConditions.isPeriodicNorthSouth())
    self.periodicEastWest = np.int32(boundaryConditions.isPeriodicEastWest())

    # Size of random field and seed
    # The SOAR function is a stencil which requires cutoff number of grid cells,
    # and the interpolation operator requires further 2 ghost cell values in each direction.
    # The random field must therefore be created with 2 + cutoff number of ghost cells.
    self.rand_ghost_cells_x = np.int32(2 + self.cutoff)
    self.rand_ghost_cells_y = np.int32(2 + self.cutoff)
    if self.periodicEastWest:
        self.rand_ghost_cells_x = np.int32(0)
    if self.periodicNorthSouth:
        self.rand_ghost_cells_y = np.int32(0)
    self.rand_nx = np.int32(self.coarse_nx + 2 * self.rand_ghost_cells_x)
    self.rand_ny = np.int32(self.coarse_ny + 2 * self.rand_ghost_cells_y)

    # Since normal distributed numbers are generated in pairs, we need to store
    # half the number of seed values compared to the number of random numbers.
    self.seed_ny = np.int32(self.rand_ny)
    self.seed_nx = np.int32(np.ceil(self.rand_nx / 2))

    # Generate seed:
    self.floatMax = 2147483648.0
    if self.use_lcg:
        # One host-side seed per GPU thread, uploaded as a 64-bit integer buffer.
        self.host_seed = self.random_state.rand(self.seed_ny, self.seed_nx) * self.floatMax
        self.host_seed = self.host_seed.astype(np.uint64, order='C')
        self.seed = Common.CUDAArray2D(gpu_stream, self.seed_nx, self.seed_ny, 0, 0,
                                       self.host_seed, double_precision=True, integers=True)
    else:
        self.rng = XORWOWRandomNumberGenerator()

    # Constants for the SOAR function:
    self.soar_q0 = np.float32(self.dx / 100000)
    if soar_q0 is not None:
        self.soar_q0 = np.float32(soar_q0)

    self.soar_L = np.float32(0.75 * self.coarse_dx)
    if soar_L is not None:
        self.soar_L = np.float32(soar_L)

    # Allocate memory for random numbers (xi)
    self.random_numbers_host = np.zeros((self.rand_ny, self.rand_nx), dtype=np.float32, order='C')
    self.random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0,
                                             self.random_numbers_host)

    # Allocate a second buffer for random numbers (nu).
    # It is initialised from its own host buffer rather than from the xi host buffer.
    self.perpendicular_random_numbers_host = np.zeros((self.rand_ny, self.rand_nx), dtype=np.float32, order='C')
    self.perpendicular_random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0,
                                                           self.perpendicular_random_numbers_host)

    # Allocate memory for coarse buffer if needed
    # Two ghost cells in each direction needed for bicubic interpolation
    self.coarse_buffer_host = np.zeros((self.coarse_ny + 4, self.coarse_nx + 4), dtype=np.float32, order='C')
    self.coarse_buffer = Common.CUDAArray2D(self.gpu_stream, self.coarse_nx, self.coarse_ny, 2, 2,
                                            self.coarse_buffer_host)

    # Allocate extra memory needed for reduction kernels.
    # Currently: A single GPU buffer with 3x1 elements: [xi^T * xi, nu^T * nu, xi^T * nu]
    # The attribute is defined as None first so that it exists even if the allocation fails.
    self.reduction_buffer = None
    reduction_buffer_host = np.zeros((1, 3), dtype=np.float32)
    self.reduction_buffer = Common.CUDAArray2D(self.gpu_stream, 3, 1, 0, 0, reduction_buffer_host)

    # Generate kernels
    self.kernels = gpu_ctx.get_kernel("ocean_noise.cu",
                                      defines={'block_width': block_width, 'block_height': block_height},
                                      compile_args={
                                          'options': ["--use_fast_math",
                                                      "--maxrregcount=32"]
                                      })

    self.reduction_kernels = self.gpu_ctx.get_kernel("reductions.cu",
                                                     defines={})

    # Get CUDA functions and define data types for prepared_{async_}call()
    self.squareSumKernel = self.reduction_kernels.get_function("squareSum")
    self.squareSumKernel.prepare("iiPP")

    self.squareSumDoubleKernel = self.reduction_kernels.get_function("squareSumDouble")
    self.squareSumDoubleKernel.prepare("iiPPP")

    self.makePerpendicularKernel = self.kernels.get_function("makePerpendicular")
    self.makePerpendicularKernel.prepare("iiPiPiP")

    self.uniformDistributionKernel = self.kernels.get_function("uniformDistribution")
    self.uniformDistributionKernel.prepare("iiiPiPi")

    # The LCG-based normal distribution kernel is only fetched when LCG is in use.
    self.normalDistributionKernel = None
    if self.use_lcg:
        self.normalDistributionKernel = self.kernels.get_function("normalDistribution")
        self.normalDistributionKernel.prepare("iiiPiPi")

    self.soarKernel = self.kernels.get_function("SOAR")
    self.soarKernel.prepare("iifffffiiPiPii")

    self.geostrophicBalanceKernel = self.kernels.get_function("geostrophicBalance")
    self.geostrophicBalanceKernel.prepare("iiffiiffffPiPiPiPiPif")

    self.bicubicInterpolationKernel = self.kernels.get_function("bicubicInterpolation")
    self.bicubicInterpolationKernel.prepare("iiiiffiiiiffiiffffPiPiPiPiPif")

    # Compute kernel launch parameters
    self.local_size = (block_width, block_height, 1)

    self.local_size_reductions = (128, 1, 1)
    self.global_size_reductions = (1, 1)

    # Launch one thread for each seed, which in turn generates two iid N(0,1)
    self.global_size_random_numbers = (
        int(np.ceil(self.seed_nx / float(self.local_size[0]))),
        int(np.ceil(self.seed_ny / float(self.local_size[1])))
    )

    # Launch one thread for each random number (in order to create perpendicular random numbers)
    self.global_size_perpendicular = (
        int(np.ceil(self.rand_nx / float(self.local_size[0]))),
        int(np.ceil(self.rand_ny / float(self.local_size[1])))
    )

    # Launch one thread per SOAR-correlated result - need to write to two ghost
    # cells in order to do bicubic interpolation based on the result
    self.global_size_SOAR = (
        int(np.ceil((self.coarse_nx + 4) / float(self.local_size[0]))),
        int(np.ceil((self.coarse_ny + 4) / float(self.local_size[1])))
    )

    # One thread per resulting perturbed grid cell
    self.global_size_geo_balance = (
        int(np.ceil((self.nx) / float(self.local_size[0]))),
        int(np.ceil((self.ny) / float(self.local_size[1])))
    )

    # Texture for coriolis field
    self.coriolis_texref = self.kernels.get_texref("coriolis_f_tex")
    if isinstance(coriolis_f, cuda.Array):
        # coriolis_f is already a texture, so we just set the reference
        self.coriolis_texref.set_array(coriolis_f)
    else:
        # Upload data to GPU and bind to texture reference
        self.coriolis_texref.set_array(
            cuda.np_to_array(np.ascontiguousarray(coriolis_f, dtype=np.float32), order="C"))

    # Set texture parameters
    self.coriolis_texref.set_filter_mode(cuda.filter_mode.LINEAR)  # bilinear interpolation
    self.coriolis_texref.set_address_mode(0, cuda.address_mode.CLAMP)  # no indexing outside domain
    self.coriolis_texref.set_address_mode(1, cuda.address_mode.CLAMP)
    self.coriolis_texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)  # Use [0, 1] indexing
    # FIXME! Allow different versions of coriolis, similar to CDKLM

    # Texture for angle towards north
    self.angle_texref = self.kernels.get_texref("angle_tex")
    if isinstance(angle, cuda.Array):
        # angle is already a texture, so we just set the reference
        self.angle_texref.set_array(angle)
    else:
        # Upload data to GPU and bind to texture reference
        self.angle_texref.set_array(
            cuda.np_to_array(np.ascontiguousarray(angle, dtype=np.float32), order="C"))

    # Set texture parameters
    self.angle_texref.set_filter_mode(cuda.filter_mode.LINEAR)  # bilinear interpolation
    self.angle_texref.set_address_mode(0, cuda.address_mode.CLAMP)  # no indexing outside domain
    self.angle_texref.set_address_mode(1, cuda.address_mode.CLAMP)
    self.angle_texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)  # Use [0, 1] indexing
# Author: Chaojie Wang <*****@*****.**>; Jiawen Wu # License: BSD-3-Clause import pycuda.curandom as curandom import pycuda.driver as drv import pycuda.tools import pycuda.autoinit from pycuda import gpuarray from pycuda.compiler import SourceModule from pycuda.curandom import XORWOWRandomNumberGenerator import numpy as np realmin = 2.2e-10 cuda_generator = XORWOWRandomNumberGenerator() mod = SourceModule(""" #include <stdio.h> __device__ int cudarand(long long seed) { if (seed == 0) { seed = 1; } long long temp=(48271 * seed + 0) % 2147483647; return temp; }