Ejemplo n.º 1
0
    def __init__(self, seed=None):
        """Set up the handler's dtype, cuBLAS context and random generator.

        Args:
            seed: Value used to seed the XORWOW generator. When ``None``,
                a seed is drawn from ``global_rnd.generate_seed()``.
        """
        super(PyCudaHandler, self).__init__()
        self.dtype = np.float32
        self.context = cumisc._global_cublas_handle
        # Zero-dimensional GPU array kept as an "empty" placeholder.
        self.EMPTY = gpuarray.zeros((), dtype=self.dtype)

        rng_seed = global_rnd.generate_seed() if seed is None else seed

        def constant_seeds(count):
            # Every generator state receives the same seed value.
            return gpuarray.to_gpu(np.ones(count, np.int32) * rng_seed)

        self.rnd = XORWOWRandomNumberGenerator(seed_getter=constant_seeds)
 def _init(self, ref_image):
     """Initialise GPU-side state sized from ``ref_image``.

     Initialises ``skcuda.linalg``, records the pixel count of
     ``ref_image``, picks a default ``max_ref_images`` (the integer part
     of sqrt(n_pixels)) when none was configured, allocates the
     reference-image buffer on the GPU and resets the bookkeeping
     attributes.
     """
     skcuda.linalg.init()

     self.n_pixels = ref_image.size
     if self.max_ref_images is None:
         self.max_ref_images = int(np.sqrt(self.n_pixels))

     # GPU array holding the reference images as rows (hence equal to B.T).
     # It starts out filled with normally distributed random data.
     generator = XORWOWRandomNumberGenerator()
     buffer_shape = (self.max_ref_images, self.n_pixels)
     self.BT_gpu = generator.gen_normal(buffer_shape, float)

     # Bookkeeping: no reference images have been stored yet.
     self.ref_image_hashes = []
     self.n_ref_images = 0
     self.next_ref_image_index = 0
     self.initialised = True
Ejemplo n.º 3
0
def generar_numeros_normal(size, desv):
    """Return a 2-D host array of normal samples generated on the GPU.

    Args:
        size: Pair ``(n, m)`` giving the shape of the returned array.
        desv: Not used by this function.
            NOTE(review): the name suggests a standard deviation, but the
            samples are returned exactly as ``gen_normal`` produces them --
            confirm whether callers expect scaling.

    Returns:
        The ``(n, m)`` float32 result of ``.get()`` on the reshaped GPU array.
    """
    rows, cols = size
    flat_samples = XORWOWRandomNumberGenerator().gen_normal(
        shape=rows * cols, dtype=np.float32)
    return flat_samples.reshape((rows, cols)).get()
Ejemplo n.º 4
0
def numGen(size, desv):
    """Generate normal samples on the GPU and return them as a host array.

    Args:
        size: Two-element sequence ``(i, j)`` with the output shape.
        desv: Accepted but not used.
            NOTE(review): no scaling by ``desv`` is applied here -- confirm
            against callers whether that is intended.

    Returns:
        The float32 samples reshaped to ``(i, j)`` and fetched with ``.get()``.
    """
    n_rows, n_cols = size
    rng = XORWOWRandomNumberGenerator()
    total = n_rows * n_cols
    device_samples = rng.gen_normal(shape=total, dtype=np.float32)
    host_samples = device_samples.reshape((n_rows, n_cols)).get()
    return host_samples
Ejemplo n.º 5
0
    def __init__(self,
                 gpu_ctx,
                 gpu_stream,
                 nx,
                 ny,
                 dx,
                 dy,
                 boundaryConditions,
                 staggered,
                 soar_q0=None,
                 soar_L=None,
                 interpolation_factor=1,
                 use_lcg=False,
                 angle=np.array([[0]], dtype=np.float32),
                 coriolis_f=np.array([[0]], dtype=np.float32),
                 block_width=16,
                 block_height=16):
        """
        Initiates a class that generates small scale geostrophically balanced perturbations of
        the ocean state.
        gpu_ctx: context object used to obtain the compiled kernels (via get_kernel)
        gpu_stream: stream handed to every Common.CUDAArray2D allocated here
        (nx, ny): number of internal grid cells in the domain
        (dx, dy): size of each grid cell
        boundaryConditions: object providing isPeriodicNorthSouth() and isPeriodicEastWest()
        staggered: truthy if the grid is staggered; stored as the integer flag 0/1
        soar_q0: amplitude parameter for the perturbation, default: dx*1e-5
        soar_L: length scale of the perturbation covariance, default: 0.75*dx*interpolation_factor
        interpolation_factor: indicates that the perturbation of eta should be generated on a coarse mesh,
            and then interpolated down to the computational mesh. The coarse mesh will then have
            (nx/interpolation_factor, ny/interpolation_factor) grid cells.
            Must be a positive odd integer that divides both nx and ny.
        use_lcg: LCG is a linear algorithm for generating a series of pseudo-random numbers.
            If False, a XORWOWRandomNumberGenerator is created instead.
        angle: Angle of rotation from North to y-axis as a texture (cuda.Array) or numpy array
        coriolis_f: Coriolis field as a texture (cuda.Array) or numpy array
        (block_width, block_height): The size of each GPU block

        Raises AssertionError if interpolation_factor is not a positive odd
        integer, or if nx or ny is not divisible by it.
        """

        self.use_lcg = use_lcg

        # Set numpy random state
        self.random_state = np.random.RandomState()

        # Make sure that all variables initialized within ifs are defined
        self.random_numbers = None
        self.rng = None
        self.seed = None
        self.host_seed = None

        self.gpu_ctx = gpu_ctx
        self.gpu_stream = gpu_stream

        self.nx = np.int32(nx)
        self.ny = np.int32(ny)
        self.dx = np.float32(dx)
        self.dy = np.float32(dy)
        # Plain Python int flag. The former `np.int` alias was deprecated in
        # NumPy 1.20 and removed in 1.24; it was identical to the builtin int.
        self.staggered = 1 if staggered else 0

        # The cutoff parameter is hard-coded.
        # The size of the cutoff determines the computational radius in the
        # SOAR function. Hence, the size of the local memory in the GPU
        # kernels has to be hard-coded.
        self.cutoff = np.int32(config.soar_cutoff)

        # Check that the interpolation factor plays well with the grid size:
        assert (interpolation_factor > 0 and interpolation_factor % 2
                == 1), 'interpolation_factor must be a positive odd integer'

        assert (nx % interpolation_factor == 0
                ), 'nx must be divisible by the interpolation factor'
        assert (ny % interpolation_factor == 0
                ), 'ny must be divisible by the interpolation factor'
        self.interpolation_factor = np.int32(interpolation_factor)

        # The size of the coarse grid
        self.coarse_nx = np.int32(nx / self.interpolation_factor)
        self.coarse_ny = np.int32(ny / self.interpolation_factor)
        self.coarse_dx = np.float32(dx * self.interpolation_factor)
        self.coarse_dy = np.float32(dy * self.interpolation_factor)

        self.periodicNorthSouth = np.int32(
            boundaryConditions.isPeriodicNorthSouth())
        self.periodicEastWest = np.int32(
            boundaryConditions.isPeriodicEastWest())

        # Size of random field and seed
        # The SOAR function is a stencil which requires cutoff number of grid cells,
        # and the interpolation operator requires further 2 ghost cell values in each direction.
        # The random field must therefore be created with 2 + cutoff number of ghost cells.
        # No ghost cells are used along a periodic direction.
        self.rand_ghost_cells_x = np.int32(2 + self.cutoff)
        self.rand_ghost_cells_y = np.int32(2 + self.cutoff)
        if self.periodicEastWest:
            self.rand_ghost_cells_x = np.int32(0)
        if self.periodicNorthSouth:
            self.rand_ghost_cells_y = np.int32(0)
        self.rand_nx = np.int32(self.coarse_nx + 2 * self.rand_ghost_cells_x)
        self.rand_ny = np.int32(self.coarse_ny + 2 * self.rand_ghost_cells_y)

        # Since normal distributed numbers are generated in pairs, we need to store half the number
        # of seed values compared to the number of random numbers.
        self.seed_ny = np.int32(self.rand_ny)
        self.seed_nx = np.int32(np.ceil(self.rand_nx / 2))

        # Generate seed (LCG) or create the XORWOW generator:
        self.floatMax = 2147483648.0
        if self.use_lcg:
            self.host_seed = self.random_state.rand(
                self.seed_ny, self.seed_nx) * self.floatMax
            self.host_seed = self.host_seed.astype(np.uint64, order='C')
            self.seed = Common.CUDAArray2D(gpu_stream,
                                           self.seed_nx,
                                           self.seed_ny,
                                           0,
                                           0,
                                           self.host_seed,
                                           double_precision=True,
                                           integers=True)
        else:
            self.rng = XORWOWRandomNumberGenerator()

        # Constants for the SOAR function:
        self.soar_q0 = np.float32(self.dx / 100000)
        if soar_q0 is not None:
            self.soar_q0 = np.float32(soar_q0)

        self.soar_L = np.float32(0.75 * self.coarse_dx)
        if soar_L is not None:
            self.soar_L = np.float32(soar_L)

        # Allocate memory for random numbers (xi)
        self.random_numbers_host = np.zeros((self.rand_ny, self.rand_nx),
                                            dtype=np.float32,
                                            order='C')
        self.random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx,
                                                 self.rand_ny, 0, 0,
                                                 self.random_numbers_host)

        # Allocate a second buffer for random numbers (nu).
        # It is initialised from its own host buffer; both host buffers are
        # zero-filled with identical shape and dtype.
        self.perpendicular_random_numbers_host = np.zeros(
            (self.rand_ny, self.rand_nx), dtype=np.float32, order='C')
        self.perpendicular_random_numbers = Common.CUDAArray2D(
            self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0,
            self.perpendicular_random_numbers_host)

        # Allocate memory for coarse buffer if needed
        # Two ghost cells in each direction needed for bicubic interpolation
        self.coarse_buffer_host = np.zeros(
            (self.coarse_ny + 4, self.coarse_nx + 4),
            dtype=np.float32,
            order='C')
        self.coarse_buffer = Common.CUDAArray2D(self.gpu_stream,
                                                self.coarse_nx, self.coarse_ny,
                                                2, 2, self.coarse_buffer_host)

        # Allocate extra memory needed for reduction kernels.
        # Currently: A single GPU buffer with 3x1 elements: [xi^T * xi, nu^T * nu, xi^T * nu]
        self.reduction_buffer = None
        reduction_buffer_host = np.zeros((1, 3), dtype=np.float32)
        self.reduction_buffer = Common.CUDAArray2D(self.gpu_stream, 3, 1, 0, 0,
                                                   reduction_buffer_host)

        # Generate kernels
        self.kernels = gpu_ctx.get_kernel(
            "ocean_noise.cu",
            defines={'block_width': block_width, 'block_height': block_height},
            compile_args={
                'options': ["--use_fast_math",
                            "--maxrregcount=32"]
            })

        self.reduction_kernels = self.gpu_ctx.get_kernel("reductions.cu",
                                                         defines={})

        # Get CUDA functions and define data types for prepared_{async_}call()
        self.squareSumKernel = self.reduction_kernels.get_function("squareSum")
        self.squareSumKernel.prepare("iiPP")

        self.squareSumDoubleKernel = self.reduction_kernels.get_function(
            "squareSumDouble")
        self.squareSumDoubleKernel.prepare("iiPPP")

        self.makePerpendicularKernel = self.kernels.get_function(
            "makePerpendicular")
        self.makePerpendicularKernel.prepare("iiPiPiP")

        self.uniformDistributionKernel = self.kernels.get_function(
            "uniformDistribution")
        self.uniformDistributionKernel.prepare("iiiPiPi")

        # The normal-distribution kernel is only fetched for the LCG path.
        self.normalDistributionKernel = None
        if self.use_lcg:
            self.normalDistributionKernel = self.kernels.get_function(
                "normalDistribution")
            self.normalDistributionKernel.prepare("iiiPiPi")

        self.soarKernel = self.kernels.get_function("SOAR")
        self.soarKernel.prepare("iifffffiiPiPii")

        self.geostrophicBalanceKernel = self.kernels.get_function(
            "geostrophicBalance")
        self.geostrophicBalanceKernel.prepare("iiffiiffffPiPiPiPiPif")

        self.bicubicInterpolationKernel = self.kernels.get_function(
            "bicubicInterpolation")
        self.bicubicInterpolationKernel.prepare(
            "iiiiffiiiiffiiffffPiPiPiPiPif")

        # Compute kernel launch parameters
        self.local_size = (block_width, block_height, 1)

        self.local_size_reductions = (128, 1, 1)
        self.global_size_reductions = (1, 1)

        # Launch one thread for each seed, which in turns generates two iid N(0,1)
        self.global_size_random_numbers = (
            int(np.ceil(self.seed_nx / float(self.local_size[0]))),
            int(np.ceil(self.seed_ny / float(self.local_size[1]))),
        )

        # Launch one thread for each random number (in order to create perpendicular random numbers)
        self.global_size_perpendicular = (
            int(np.ceil(self.rand_nx / float(self.local_size[0]))),
            int(np.ceil(self.rand_ny / float(self.local_size[1]))),
        )

        # Launch one thread per SOAR-correlated result - need to write to two ghost
        # cells in order to do bicubic interpolation based on the result
        self.global_size_SOAR = (
            int(np.ceil((self.coarse_nx + 4) / float(self.local_size[0]))),
            int(np.ceil((self.coarse_ny + 4) / float(self.local_size[1]))),
        )

        # One thread per resulting perturbed grid cell
        self.global_size_geo_balance = (
            int(np.ceil((self.nx) / float(self.local_size[0]))),
            int(np.ceil((self.ny) / float(self.local_size[1]))),
        )

        # Texture for coriolis field
        self.coriolis_texref = self.kernels.get_texref("coriolis_f_tex")
        if isinstance(coriolis_f, cuda.Array):
            # coriolis_f is already a texture, so we just set the reference
            self.coriolis_texref.set_array(coriolis_f)
        else:
            # Upload data to GPU and bind to texture reference
            self.coriolis_texref.set_array(
                cuda.np_to_array(np.ascontiguousarray(coriolis_f,
                                                      dtype=np.float32),
                                 order="C"))

        # Set texture parameters
        self.coriolis_texref.set_filter_mode(
            cuda.filter_mode.LINEAR)  # bilinear interpolation
        self.coriolis_texref.set_address_mode(
            0, cuda.address_mode.CLAMP)  # no indexing outside domain
        self.coriolis_texref.set_address_mode(1, cuda.address_mode.CLAMP)
        self.coriolis_texref.set_flags(
            cuda.TRSF_NORMALIZED_COORDINATES)  # Use [0, 1] indexing
        # FIXME! Allow different versions of coriolis, similar to CDKLM

        # Texture for angle towards north
        self.angle_texref = self.kernels.get_texref("angle_tex")
        if isinstance(angle, cuda.Array):
            # angle is already a texture, so we just set the reference
            self.angle_texref.set_array(angle)
        else:
            # Upload data to GPU and bind to texture reference
            self.angle_texref.set_array(
                cuda.np_to_array(np.ascontiguousarray(angle, dtype=np.float32),
                                 order="C"))

        # Set texture parameters
        self.angle_texref.set_filter_mode(
            cuda.filter_mode.LINEAR)  # bilinear interpolation
        self.angle_texref.set_address_mode(
            0, cuda.address_mode.CLAMP)  # no indexing outside domain
        self.angle_texref.set_address_mode(1, cuda.address_mode.CLAMP)
        self.angle_texref.set_flags(
            cuda.TRSF_NORMALIZED_COORDINATES)  # Use [0, 1] indexing
Ejemplo n.º 6
0
# Author: Chaojie Wang <*****@*****.**>; Jiawen Wu
# License: BSD-3-Clause

import pycuda.curandom as curandom
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.compiler import SourceModule
from pycuda.curandom import XORWOWRandomNumberGenerator

import numpy as np

# Small positive constant; presumably a numerical floor used to avoid
# exact zeros elsewhere in the module -- TODO confirm against its call sites.
realmin = 2.2e-10

# Module-level XORWOW random number generator, created once at import time.
cuda_generator = XORWOWRandomNumberGenerator()

mod = SourceModule("""

#include <stdio.h>

__device__ int cudarand(long long seed)
{
    if (seed == 0)
    {
        seed = 1;
    }
    long long temp=(48271 * seed + 0) % 2147483647;
    return temp;
}