Exemple #1
0
    def __init__(self, seed=None):
        super(PyCudaHandler, self).__init__()
        self.dtype = np.float32
        self.context = cumisc._global_cublas_handle
        self.EMPTY = gpuarray.zeros((), dtype=self.dtype)
        if seed is None:
            seed = global_rnd.generate_seed()

        def get_seeds(n):
            return gpuarray.to_gpu(np.ones(n, np.int32) * seed)
        self.rnd = XORWOWRandomNumberGenerator(seed_getter=get_seeds)
Exemple #2
0
class NewCUDARandom:

	def __init__(self, env, double):
		self._env = env

		p = double_precision if double else single_precision
		self._scalar_dtype = p.scalar.dtype
		self._complex_dtype = p.complex.dtype

		self._scalar_cast = p.scalar.cast
		self._complex_cast = p.complex.cast

		from pycuda.curandom import XORWOWRandomNumberGenerator as RNG
		self._rng = RNG()

		kernel_template = """
		<%!
			from math import pi
		%>

		EXPORTED_FUNC void scaleRandoms(int gsize,
			GLOBAL_MEM COMPLEX* data,
			COMPLEX loc, SCALAR scale)
		{
			int id = GLOBAL_ID_FLAT;
			if(id >= gsize)
				return;

			COMPLEX r = data[id];
			r.x += loc.x;
			r.y += loc.y;
			r.x *= scale;
			r.y *= scale;
			data[id] = r;
		}
		"""

		self._program = self._env.compile(kernel_template, double=double)
		self._scaleRandoms = self._program.scaleRandoms

	def random_normal(self, result, scale=1.0, loc=0.0):
		self._rng.fill_normal(result, stream=self._env.stream)
		self._scaleRandoms(result.size, result,
			self._complex_cast(loc), self._scalar_cast(scale / numpy.sqrt(2.0)))
Exemple #3
0
    def __init__(self, seed=None):
        super(PyCudaHandler, self).__init__()
        self.dtype = np.float32
        self.context = cumisc._global_cublas_handle
        self.EMPTY = gpuarray.zeros((), dtype=self.dtype)
        if seed is None:
            seed = global_rnd.generate_seed()

        def get_seeds(n):
            return gpuarray.to_gpu(np.ones(n, np.int32) * seed)
        self.rnd = XORWOWRandomNumberGenerator(seed_getter=get_seeds)
 def _init(self, ref_image):
     skcuda.linalg.init()
     self.n_pixels = ref_image.size
     if self.max_ref_images is None:
         self.max_ref_images = int(np.sqrt(self.n_pixels))
     # GPU array of reference images as rows (hence equal to B.T).
     # It's initially full of random data.
     self.BT_gpu = XORWOWRandomNumberGenerator().gen_normal(
         (self.max_ref_images, self.n_pixels), float)
     self.next_ref_image_index = 0
     self.initialised = True
     self.ref_image_hashes = []
     self.n_ref_images = 0
Exemple #5
0
	def __init__(self, env, double):
		self._env = env

		p = double_precision if double else single_precision
		self._scalar_dtype = p.scalar.dtype
		self._complex_dtype = p.complex.dtype

		self._scalar_cast = p.scalar.cast
		self._complex_cast = p.complex.cast

		from pycuda.curandom import XORWOWRandomNumberGenerator as RNG
		self._rng = RNG()

		kernel_template = """
		<%!
			from math import pi
		%>

		EXPORTED_FUNC void scaleRandoms(int gsize,
			GLOBAL_MEM COMPLEX* data,
			COMPLEX loc, SCALAR scale)
		{
			int id = GLOBAL_ID_FLAT;
			if(id >= gsize)
				return;

			COMPLEX r = data[id];
			r.x += loc.x;
			r.y += loc.y;
			r.x *= scale;
			r.y *= scale;
			data[id] = r;
		}
		"""

		self._program = self._env.compile(kernel_template, double=double)
		self._scaleRandoms = self._program.scaleRandoms
Exemple #6
0
def generar_numeros_normal(size, desv):
    n, m = size
    generador = XORWOWRandomNumberGenerator()
    array = generador.gen_normal(shape=n*m, dtype=np.float32) 
    array = array.reshape((n, m)).get()  
    return array
Exemple #7
0
def numGen(size, desv):
    i, j = size
    generator = XORWOWRandomNumberGenerator()
    array = generator.gen_normal(shape=i * j, dtype=np.float32)
    array = array.reshape((i, j)).get()
    return array
Exemple #8
0
    def __init__(self,
                 gpu_ctx,
                 gpu_stream,
                 nx,
                 ny,
                 dx,
                 dy,
                 boundaryConditions,
                 staggered,
                 soar_q0=None,
                 soar_L=None,
                 interpolation_factor=1,
                 use_lcg=False,
                 angle=np.array([[0]], dtype=np.float32),
                 coriolis_f=np.array([[0]], dtype=np.float32),
                 block_width=16,
                 block_height=16):
        """
        Initiates a class that generates small scale geostrophically balanced perturbations of
        the ocean state.
        (nx, ny): number of internal grid cells in the domain
        (dx, dy): size of each grid cell
        soar_q0: amplitude parameter for the perturbation, default: dx*1e-5
        soar_L: length scale of the perturbation covariance, default: 0.74*dx*interpolation_factor
        interpolation_factor: indicates that the perturbation of eta should be generated on a coarse mesh, 
            and then interpolated down to the computational mesh. The coarse mesh will then have
            (nx/interpolation_factor, ny/interpolation_factor) grid cells.
        use_lcg: LCG is a linear algorithm for generating a serie of pseudo-random numbers
        angle: Angle of rotation from North to y-axis as a texture (cuda.Array) or numpy array
        (block_width, block_height): The size of each GPU block
        """

        self.use_lcg = use_lcg

        # Set numpy random state
        self.random_state = np.random.RandomState()

        # Make sure that all variables initialized within ifs are defined
        self.random_numbers = None
        self.rng = None
        self.seed = None
        self.host_seed = None

        self.gpu_ctx = gpu_ctx
        self.gpu_stream = gpu_stream

        self.nx = np.int32(nx)
        self.ny = np.int32(ny)
        self.dx = np.float32(dx)
        self.dy = np.float32(dy)
        self.staggered = np.int(0)
        if staggered:
            self.staggered = np.int(1)

        # The cutoff parameter is hard-coded.
        # The size of the cutoff determines the computational radius in the
        # SOAR function. Hence, the size of the local memory in the OpenCL
        # kernels has to be hard-coded.
        self.cutoff = np.int32(config.soar_cutoff)

        # Check that the interpolation factor plays well with the grid size:
        assert (interpolation_factor > 0 and interpolation_factor % 2
                == 1), 'interpolation_factor must be a positive odd integer'

        assert (nx % interpolation_factor == 0
                ), 'nx must be divisible by the interpolation factor'
        assert (ny % interpolation_factor == 0
                ), 'ny must be divisible by the interpolation factor'
        self.interpolation_factor = np.int32(interpolation_factor)

        # The size of the coarse grid
        self.coarse_nx = np.int32(nx / self.interpolation_factor)
        self.coarse_ny = np.int32(ny / self.interpolation_factor)
        self.coarse_dx = np.float32(dx * self.interpolation_factor)
        self.coarse_dy = np.float32(dy * self.interpolation_factor)

        self.periodicNorthSouth = np.int32(
            boundaryConditions.isPeriodicNorthSouth())
        self.periodicEastWest = np.int32(
            boundaryConditions.isPeriodicEastWest())

        # Size of random field and seed
        # The SOAR function is a stencil which requires cutoff number of grid cells,
        # and the interpolation operator requires further 2 ghost cell values in each direction.
        # The random field must therefore be created with 2 + cutoff number of ghost cells.
        self.rand_ghost_cells_x = np.int32(2 + self.cutoff)
        self.rand_ghost_cells_y = np.int32(2 + self.cutoff)
        if self.periodicEastWest:
            self.rand_ghost_cells_x = np.int32(0)
        if self.periodicNorthSouth:
            self.rand_ghost_cells_y = np.int32(0)
        self.rand_nx = np.int32(self.coarse_nx + 2 * self.rand_ghost_cells_x)
        self.rand_ny = np.int32(self.coarse_ny + 2 * self.rand_ghost_cells_y)

        # Since normal distributed numbers are generated in pairs, we need to store half the number of
        # of seed values compared to the number of random numbers.
        self.seed_ny = np.int32(self.rand_ny)
        self.seed_nx = np.int32(np.ceil(self.rand_nx / 2))

        # Generate seed:
        self.floatMax = 2147483648.0
        if self.use_lcg:
            self.host_seed = self.random_state.rand(
                self.seed_ny, self.seed_nx) * self.floatMax
            self.host_seed = self.host_seed.astype(np.uint64, order='C')

        if not self.use_lcg:
            self.rng = XORWOWRandomNumberGenerator()
        else:
            self.seed = Common.CUDAArray2D(gpu_stream,
                                           self.seed_nx,
                                           self.seed_ny,
                                           0,
                                           0,
                                           self.host_seed,
                                           double_precision=True,
                                           integers=True)

        # Constants for the SOAR function:
        self.soar_q0 = np.float32(self.dx / 100000)
        if soar_q0 is not None:
            self.soar_q0 = np.float32(soar_q0)

        self.soar_L = np.float32(0.75 * self.coarse_dx)
        if soar_L is not None:
            self.soar_L = np.float32(soar_L)

        # Allocate memory for random numbers (xi)
        self.random_numbers_host = np.zeros((self.rand_ny, self.rand_nx),
                                            dtype=np.float32,
                                            order='C')
        self.random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx,
                                                 self.rand_ny, 0, 0,
                                                 self.random_numbers_host)

        # Allocate a second buffer for random numbers (nu)
        self.perpendicular_random_numbers_host = np.zeros(
            (self.rand_ny, self.rand_nx), dtype=np.float32, order='C')
        self.perpendicular_random_numbers = Common.CUDAArray2D(
            self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0,
            self.random_numbers_host)

        # Allocate memory for coarse buffer if needed
        # Two ghost cells in each direction needed for bicubic interpolation
        self.coarse_buffer_host = np.zeros(
            (self.coarse_ny + 4, self.coarse_nx + 4),
            dtype=np.float32,
            order='C')
        self.coarse_buffer = Common.CUDAArray2D(self.gpu_stream,
                                                self.coarse_nx, self.coarse_ny,
                                                2, 2, self.coarse_buffer_host)

        # Allocate extra memory needed for reduction kernels.
        # Currently: A single GPU buffer with 3x1 elements: [xi^T * xi, nu^T * nu, xi^T * nu]
        self.reduction_buffer = None
        reduction_buffer_host = np.zeros((1, 3), dtype=np.float32)
        self.reduction_buffer = Common.CUDAArray2D(self.gpu_stream, 3, 1, 0, 0,
                                                   reduction_buffer_host)

        # Generate kernels
        self.kernels = gpu_ctx.get_kernel("ocean_noise.cu", \
                                          defines={'block_width': block_width, 'block_height': block_height},
                                          compile_args={
                                              'options': ["--use_fast_math",
                                                          "--maxrregcount=32"]
                                          })

        self.reduction_kernels = self.gpu_ctx.get_kernel("reductions.cu", \
                                                         defines={})

        # Get CUDA functions and define data types for prepared_{async_}call()
        # Generate kernels
        self.squareSumKernel = self.reduction_kernels.get_function("squareSum")
        self.squareSumKernel.prepare("iiPP")

        self.squareSumDoubleKernel = self.reduction_kernels.get_function(
            "squareSumDouble")
        self.squareSumDoubleKernel.prepare("iiPPP")

        self.makePerpendicularKernel = self.kernels.get_function(
            "makePerpendicular")
        self.makePerpendicularKernel.prepare("iiPiPiP")

        self.uniformDistributionKernel = self.kernels.get_function(
            "uniformDistribution")
        self.uniformDistributionKernel.prepare("iiiPiPi")

        self.normalDistributionKernel = None
        if self.use_lcg:
            self.normalDistributionKernel = self.kernels.get_function(
                "normalDistribution")
            self.normalDistributionKernel.prepare("iiiPiPi")

        self.soarKernel = self.kernels.get_function("SOAR")
        self.soarKernel.prepare("iifffffiiPiPii")

        self.geostrophicBalanceKernel = self.kernels.get_function(
            "geostrophicBalance")
        self.geostrophicBalanceKernel.prepare("iiffiiffffPiPiPiPiPif")

        self.bicubicInterpolationKernel = self.kernels.get_function(
            "bicubicInterpolation")
        self.bicubicInterpolationKernel.prepare(
            "iiiiffiiiiffiiffffPiPiPiPiPif")

        #Compute kernel launch parameters
        self.local_size = (block_width, block_height, 1)

        self.local_size_reductions = (128, 1, 1)
        self.global_size_reductions = (1, 1)

        # Launch one thread for each seed, which in turns generates two iid N(0,1)
        self.global_size_random_numbers = ( \
                       int(np.ceil(self.seed_nx / float(self.local_size[0]))), \
                       int(np.ceil(self.seed_ny / float(self.local_size[1]))) \
                     )

        # Launch on thread for each random number (in order to create perpendicular random numbers)
        self.global_size_perpendicular = ( \
                      int(np.ceil(self.rand_nx / float(self.local_size[0]))), \
                      int(np.ceil(self.rand_ny / float(self.local_size[1]))) \
                     )

        # Launch one thread per SOAR-correlated result - need to write to two ghost
        # cells in order to do bicubic interpolation based on the result
        self.global_size_SOAR = ( \
                     int(np.ceil( (self.coarse_nx+4)/float(self.local_size[0]))), \
                     int(np.ceil( (self.coarse_ny+4)/float(self.local_size[1]))) \
                    )

        # One thread per resulting perturbed grid cell
        self.global_size_geo_balance = ( \
                    int(np.ceil( (self.nx)/float(self.local_size[0]))), \
                    int(np.ceil( (self.ny)/float(self.local_size[1]))) \
                   )

        # Texture for coriolis field
        self.coriolis_texref = self.kernels.get_texref("coriolis_f_tex")
        if isinstance(coriolis_f, cuda.Array):
            # coriolis_f is already a texture, so we just set the reference
            self.coriolis_texref.set_array(coriolis_f)
        else:
            #Upload data to GPU and bind to texture reference
            self.coriolis_texref.set_array(
                cuda.np_to_array(np.ascontiguousarray(coriolis_f,
                                                      dtype=np.float32),
                                 order="C"))

        # Set texture parameters
        self.coriolis_texref.set_filter_mode(
            cuda.filter_mode.LINEAR)  #bilinear interpolation
        self.coriolis_texref.set_address_mode(
            0, cuda.address_mode.CLAMP)  #no indexing outside domain
        self.coriolis_texref.set_address_mode(1, cuda.address_mode.CLAMP)
        self.coriolis_texref.set_flags(
            cuda.TRSF_NORMALIZED_COORDINATES)  #Use [0, 1] indexing
        # FIXME! Allow different versions of coriolis, similar to CDKLM

        # Texture for angle towards north
        self.angle_texref = self.kernels.get_texref("angle_tex")
        if isinstance(angle, cuda.Array):
            # angle is already a texture, so we just set the reference
            self.angle_texref.set_array(angle)
        else:
            #Upload data to GPU and bind to texture reference
            self.angle_texref.set_array(
                cuda.np_to_array(np.ascontiguousarray(angle, dtype=np.float32),
                                 order="C"))

        # Set texture parameters
        self.angle_texref.set_filter_mode(
            cuda.filter_mode.LINEAR)  #bilinear interpolation
        self.angle_texref.set_address_mode(
            0, cuda.address_mode.CLAMP)  #no indexing outside domain
        self.angle_texref.set_address_mode(1, cuda.address_mode.CLAMP)
        self.angle_texref.set_flags(
            cuda.TRSF_NORMALIZED_COORDINATES)  #Use [0, 1] indexing
Exemple #9
0
class OceanStateNoise(object):
    """
    Generating random perturbations for a ocean state.
   
    Perturbation for the surface field, dEta, is produced with a covariance structure according to a SOAR function,
    while dHu and dHv are found by the geostrophic balance to avoid shock solutions.
    """
    def __init__(self,
                 gpu_ctx,
                 gpu_stream,
                 nx,
                 ny,
                 dx,
                 dy,
                 boundaryConditions,
                 staggered,
                 soar_q0=None,
                 soar_L=None,
                 interpolation_factor=1,
                 use_lcg=False,
                 angle=np.array([[0]], dtype=np.float32),
                 coriolis_f=np.array([[0]], dtype=np.float32),
                 block_width=16,
                 block_height=16):
        """
        Initiates a class that generates small scale geostrophically balanced perturbations of
        the ocean state.
        (nx, ny): number of internal grid cells in the domain
        (dx, dy): size of each grid cell
        soar_q0: amplitude parameter for the perturbation, default: dx*1e-5
        soar_L: length scale of the perturbation covariance, default: 0.74*dx*interpolation_factor
        interpolation_factor: indicates that the perturbation of eta should be generated on a coarse mesh, 
            and then interpolated down to the computational mesh. The coarse mesh will then have
            (nx/interpolation_factor, ny/interpolation_factor) grid cells.
        use_lcg: LCG is a linear algorithm for generating a serie of pseudo-random numbers
        angle: Angle of rotation from North to y-axis as a texture (cuda.Array) or numpy array
        (block_width, block_height): The size of each GPU block
        """

        self.use_lcg = use_lcg

        # Set numpy random state
        self.random_state = np.random.RandomState()

        # Make sure that all variables initialized within ifs are defined
        self.random_numbers = None
        self.rng = None
        self.seed = None
        self.host_seed = None

        self.gpu_ctx = gpu_ctx
        self.gpu_stream = gpu_stream

        self.nx = np.int32(nx)
        self.ny = np.int32(ny)
        self.dx = np.float32(dx)
        self.dy = np.float32(dy)
        self.staggered = np.int(0)
        if staggered:
            self.staggered = np.int(1)

        # The cutoff parameter is hard-coded.
        # The size of the cutoff determines the computational radius in the
        # SOAR function. Hence, the size of the local memory in the OpenCL
        # kernels has to be hard-coded.
        self.cutoff = np.int32(config.soar_cutoff)

        # Check that the interpolation factor plays well with the grid size:
        assert (interpolation_factor > 0 and interpolation_factor % 2
                == 1), 'interpolation_factor must be a positive odd integer'

        assert (nx % interpolation_factor == 0
                ), 'nx must be divisible by the interpolation factor'
        assert (ny % interpolation_factor == 0
                ), 'ny must be divisible by the interpolation factor'
        self.interpolation_factor = np.int32(interpolation_factor)

        # The size of the coarse grid
        self.coarse_nx = np.int32(nx / self.interpolation_factor)
        self.coarse_ny = np.int32(ny / self.interpolation_factor)
        self.coarse_dx = np.float32(dx * self.interpolation_factor)
        self.coarse_dy = np.float32(dy * self.interpolation_factor)

        self.periodicNorthSouth = np.int32(
            boundaryConditions.isPeriodicNorthSouth())
        self.periodicEastWest = np.int32(
            boundaryConditions.isPeriodicEastWest())

        # Size of random field and seed
        # The SOAR function is a stencil which requires cutoff number of grid cells,
        # and the interpolation operator requires further 2 ghost cell values in each direction.
        # The random field must therefore be created with 2 + cutoff number of ghost cells.
        self.rand_ghost_cells_x = np.int32(2 + self.cutoff)
        self.rand_ghost_cells_y = np.int32(2 + self.cutoff)
        if self.periodicEastWest:
            self.rand_ghost_cells_x = np.int32(0)
        if self.periodicNorthSouth:
            self.rand_ghost_cells_y = np.int32(0)
        self.rand_nx = np.int32(self.coarse_nx + 2 * self.rand_ghost_cells_x)
        self.rand_ny = np.int32(self.coarse_ny + 2 * self.rand_ghost_cells_y)

        # Since normal distributed numbers are generated in pairs, we need to store half the number of
        # of seed values compared to the number of random numbers.
        self.seed_ny = np.int32(self.rand_ny)
        self.seed_nx = np.int32(np.ceil(self.rand_nx / 2))

        # Generate seed:
        self.floatMax = 2147483648.0
        if self.use_lcg:
            self.host_seed = self.random_state.rand(
                self.seed_ny, self.seed_nx) * self.floatMax
            self.host_seed = self.host_seed.astype(np.uint64, order='C')

        if not self.use_lcg:
            self.rng = XORWOWRandomNumberGenerator()
        else:
            self.seed = Common.CUDAArray2D(gpu_stream,
                                           self.seed_nx,
                                           self.seed_ny,
                                           0,
                                           0,
                                           self.host_seed,
                                           double_precision=True,
                                           integers=True)

        # Constants for the SOAR function:
        self.soar_q0 = np.float32(self.dx / 100000)
        if soar_q0 is not None:
            self.soar_q0 = np.float32(soar_q0)

        self.soar_L = np.float32(0.75 * self.coarse_dx)
        if soar_L is not None:
            self.soar_L = np.float32(soar_L)

        # Allocate memory for random numbers (xi)
        self.random_numbers_host = np.zeros((self.rand_ny, self.rand_nx),
                                            dtype=np.float32,
                                            order='C')
        self.random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx,
                                                 self.rand_ny, 0, 0,
                                                 self.random_numbers_host)

        # Allocate a second buffer for random numbers (nu)
        self.perpendicular_random_numbers_host = np.zeros(
            (self.rand_ny, self.rand_nx), dtype=np.float32, order='C')
        self.perpendicular_random_numbers = Common.CUDAArray2D(
            self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0,
            self.random_numbers_host)

        # Allocate memory for coarse buffer if needed
        # Two ghost cells in each direction needed for bicubic interpolation
        self.coarse_buffer_host = np.zeros(
            (self.coarse_ny + 4, self.coarse_nx + 4),
            dtype=np.float32,
            order='C')
        self.coarse_buffer = Common.CUDAArray2D(self.gpu_stream,
                                                self.coarse_nx, self.coarse_ny,
                                                2, 2, self.coarse_buffer_host)

        # Allocate extra memory needed for reduction kernels.
        # Currently: A single GPU buffer with 3x1 elements: [xi^T * xi, nu^T * nu, xi^T * nu]
        self.reduction_buffer = None
        reduction_buffer_host = np.zeros((1, 3), dtype=np.float32)
        self.reduction_buffer = Common.CUDAArray2D(self.gpu_stream, 3, 1, 0, 0,
                                                   reduction_buffer_host)

        # Generate kernels
        self.kernels = gpu_ctx.get_kernel("ocean_noise.cu", \
                                          defines={'block_width': block_width, 'block_height': block_height},
                                          compile_args={
                                              'options': ["--use_fast_math",
                                                          "--maxrregcount=32"]
                                          })

        self.reduction_kernels = self.gpu_ctx.get_kernel("reductions.cu", \
                                                         defines={})

        # Get CUDA functions and define data types for prepared_{async_}call()
        # Generate kernels
        self.squareSumKernel = self.reduction_kernels.get_function("squareSum")
        self.squareSumKernel.prepare("iiPP")

        self.squareSumDoubleKernel = self.reduction_kernels.get_function(
            "squareSumDouble")
        self.squareSumDoubleKernel.prepare("iiPPP")

        self.makePerpendicularKernel = self.kernels.get_function(
            "makePerpendicular")
        self.makePerpendicularKernel.prepare("iiPiPiP")

        self.uniformDistributionKernel = self.kernels.get_function(
            "uniformDistribution")
        self.uniformDistributionKernel.prepare("iiiPiPi")

        self.normalDistributionKernel = None
        if self.use_lcg:
            self.normalDistributionKernel = self.kernels.get_function(
                "normalDistribution")
            self.normalDistributionKernel.prepare("iiiPiPi")

        self.soarKernel = self.kernels.get_function("SOAR")
        self.soarKernel.prepare("iifffffiiPiPii")

        self.geostrophicBalanceKernel = self.kernels.get_function(
            "geostrophicBalance")
        self.geostrophicBalanceKernel.prepare("iiffiiffffPiPiPiPiPif")

        self.bicubicInterpolationKernel = self.kernels.get_function(
            "bicubicInterpolation")
        self.bicubicInterpolationKernel.prepare(
            "iiiiffiiiiffiiffffPiPiPiPiPif")

        #Compute kernel launch parameters
        self.local_size = (block_width, block_height, 1)

        self.local_size_reductions = (128, 1, 1)
        self.global_size_reductions = (1, 1)

        # Launch one thread for each seed, which in turns generates two iid N(0,1)
        self.global_size_random_numbers = ( \
                       int(np.ceil(self.seed_nx / float(self.local_size[0]))), \
                       int(np.ceil(self.seed_ny / float(self.local_size[1]))) \
                     )

        # Launch on thread for each random number (in order to create perpendicular random numbers)
        self.global_size_perpendicular = ( \
                      int(np.ceil(self.rand_nx / float(self.local_size[0]))), \
                      int(np.ceil(self.rand_ny / float(self.local_size[1]))) \
                     )

        # Launch one thread per SOAR-correlated result - need to write to two ghost
        # cells in order to do bicubic interpolation based on the result
        self.global_size_SOAR = ( \
                     int(np.ceil( (self.coarse_nx+4)/float(self.local_size[0]))), \
                     int(np.ceil( (self.coarse_ny+4)/float(self.local_size[1]))) \
                    )

        # One thread per resulting perturbed grid cell
        self.global_size_geo_balance = ( \
                    int(np.ceil( (self.nx)/float(self.local_size[0]))), \
                    int(np.ceil( (self.ny)/float(self.local_size[1]))) \
                   )

        # Texture for coriolis field
        self.coriolis_texref = self.kernels.get_texref("coriolis_f_tex")
        if isinstance(coriolis_f, cuda.Array):
            # coriolis_f is already a texture, so we just set the reference
            self.coriolis_texref.set_array(coriolis_f)
        else:
            #Upload data to GPU and bind to texture reference
            self.coriolis_texref.set_array(
                cuda.np_to_array(np.ascontiguousarray(coriolis_f,
                                                      dtype=np.float32),
                                 order="C"))

        # Set texture parameters
        self.coriolis_texref.set_filter_mode(
            cuda.filter_mode.LINEAR)  #bilinear interpolation
        self.coriolis_texref.set_address_mode(
            0, cuda.address_mode.CLAMP)  #no indexing outside domain
        self.coriolis_texref.set_address_mode(1, cuda.address_mode.CLAMP)
        self.coriolis_texref.set_flags(
            cuda.TRSF_NORMALIZED_COORDINATES)  #Use [0, 1] indexing
        # FIXME! Allow different versions of coriolis, similar to CDKLM

        # Texture for angle towards north
        self.angle_texref = self.kernels.get_texref("angle_tex")
        if isinstance(angle, cuda.Array):
            # angle is already a texture, so we just set the reference
            self.angle_texref.set_array(angle)
        else:
            #Upload data to GPU and bind to texture reference
            self.angle_texref.set_array(
                cuda.np_to_array(np.ascontiguousarray(angle, dtype=np.float32),
                                 order="C"))

        # Set texture parameters
        self.angle_texref.set_filter_mode(
            cuda.filter_mode.LINEAR)  #bilinear interpolation
        self.angle_texref.set_address_mode(
            0, cuda.address_mode.CLAMP)  #no indexing outside domain
        self.angle_texref.set_address_mode(1, cuda.address_mode.CLAMP)
        self.angle_texref.set_flags(
            cuda.TRSF_NORMALIZED_COORDINATES)  #Use [0, 1] indexing

    def __del__(self):
        self.cleanUp()

    def cleanUp(self):
        if self.rng is not None:
            self.rng = None
        if self.seed is not None:
            self.seed.release()
        if self.random_numbers is not None:
            self.random_numbers.release()
        if self.perpendicular_random_numbers is not None:
            self.perpendicular_random_numbers.release()
        if self.reduction_buffer is not None:
            self.reduction_buffer.release()
        self.gpu_ctx = None
        gc.collect()

    @classmethod
    def fromsim(cls,
                sim,
                soar_q0=None,
                soar_L=None,
                interpolation_factor=1,
                use_lcg=False,
                block_width=16,
                block_height=16):
        staggered = False
        if isinstance(sim, FBL.FBL) or isinstance(sim, CTCS.CTCS):
            staggered = True
        return cls(sim.gpu_ctx,
                   sim.gpu_stream,
                   sim.nx,
                   sim.ny,
                   sim.dx,
                   sim.dy,
                   sim.boundary_conditions,
                   staggered,
                   soar_q0=soar_q0,
                   soar_L=soar_L,
                   interpolation_factor=interpolation_factor,
                   angle=sim.angle_texref.get_array(),
                   coriolis_f=sim.coriolis_texref.get_array(),
                   use_lcg=use_lcg,
                   block_width=block_width,
                   block_height=block_height)

    def getSeed(self):
        assert (
            self.use_lcg
        ), "getSeed is only valid if LCG is used as pseudo-random generator."

        return self.seed.download(self.gpu_stream)

    def resetSeed(self):
        assert (
            self.use_lcg
        ), "resetSeed is only valid if LCG is used as pseudo-random generator."

        # Generate seed:
        self.floatMax = 2147483648.0
        self.host_seed = self.random_state.rand(self.seed_ny,
                                                self.seed_nx) * self.floatMax
        self.host_seed = self.host_seed.astype(np.uint64, order='C')
        self.seed.upload(self.gpu_stream, self.host_seed)

    def getRandomNumbers(self):
        return self.random_numbers.download(self.gpu_stream)

    def getPerpendicularRandomNumbers(self):
        return self.perpendicular_random_numbers.download(self.gpu_stream)

    def getCoarseBuffer(self):
        return self.coarse_buffer.download(self.gpu_stream)

    def getReductionBuffer(self):
        return self.reduction_buffer.download(self.gpu_stream)

    def generateNormalDistribution(self):
        if not self.use_lcg:
            self.rng.fill_normal(self.random_numbers.data,
                                 stream=self.gpu_stream)
        else:
            self.normalDistributionKernel.prepared_async_call(
                self.global_size_random_numbers, self.local_size,
                self.gpu_stream, self.seed_nx, self.seed_ny, self.rand_nx,
                self.seed.data.gpudata, self.seed.pitch,
                self.random_numbers.data.gpudata, self.random_numbers.pitch)

    def generateNormalDistributionPerpendicular(self):
        if not self.use_lcg:
            self.rng.fill_normal(self.perpendicular_random_numbers.data,
                                 stream=self.gpu_stream)
        else:
            self.normalDistributionKernel.prepared_async_call(
                self.global_size_random_numbers, self.local_size,
                self.gpu_stream, self.seed_nx, self.seed_ny, self.rand_nx,
                self.seed.data.gpudata, self.seed.pitch,
                self.perpendicular_random_numbers.data.gpudata,
                self.perpendicular_random_numbers.pitch)

    def generateUniformDistribution(self):
        # Call kernel -> new random numbers
        if not self.use_lcg:
            self.rng.fill_uniform(self.random_numbers.data,
                                  stream=self.gpu_stream)
        else:
            self.uniformDistributionKernel.prepared_async_call(
                self.global_size_random_numbers, self.local_size,
                self.gpu_stream, self.seed_nx, self.seed_ny, self.rand_nx,
                self.seed.data.gpudata, self.seed.pitch,
                self.random_numbers.data.gpudata, self.random_numbers.pitch)

    def perturbSim(self,
                   sim,
                   q0_scale=1.0,
                   update_random_field=True,
                   perturbation_scale=1.0,
                   perpendicular_scale=0.0,
                   align_with_cell_i=None,
                   align_with_cell_j=None,
                   stream=None):
        """
        Generating a perturbed ocean state and adding it to sim's ocean state 
        """
        self.perturbOceanState(sim.gpu_data.h0,
                               sim.gpu_data.hu0,
                               sim.gpu_data.hv0,
                               sim.bathymetry.Bi,
                               sim.f,
                               beta=sim.coriolis_beta,
                               g=sim.g,
                               y0_reference_cell=sim.y_zero_reference_cell,
                               ghost_cells_x=sim.ghost_cells_x,
                               ghost_cells_y=sim.ghost_cells_y,
                               q0_scale=q0_scale,
                               update_random_field=update_random_field,
                               perturbation_scale=perturbation_scale,
                               perpendicular_scale=perpendicular_scale,
                               align_with_cell_i=align_with_cell_i,
                               align_with_cell_j=align_with_cell_j,
                               land_mask_value=sim.bathymetry.mask_value,
                               stream=stream)

    def perturbOceanState(self,
                          eta,
                          hu,
                          hv,
                          H,
                          f,
                          beta=0.0,
                          g=9.81,
                          y0_reference_cell=0,
                          ghost_cells_x=0,
                          ghost_cells_y=0,
                          q0_scale=1.0,
                          update_random_field=True,
                          perturbation_scale=1.0,
                          perpendicular_scale=0.0,
                          align_with_cell_i=None,
                          align_with_cell_j=None,
                          land_mask_value=np.float32(1.0e20),
                          stream=None):
        """
        Apply the SOAR Q covariance matrix on the random ocean field which is
        added to the provided buffers eta, hu and hv.
        eta: surface deviation - CUDAArray2D object.
        hu: volume transport in x-direction - CUDAArray2D object.
        hv: volume transport in y-dirextion - CUDAArray2D object.
        
        Optional parameters not used else_where:
        q0_scale=1: scale factor to the SOAR amplitude parameter q0
        update_random_field=True: whether to generate new random numbers or use those already 
            present in the random numbers buffer
        perturbation_scale=1.0: scale factor to the perturbation of the eta field
        perpendicular_scale=0.0: scale factor for additional perturbation from the perpendicular random field
        align_with_cell_i=None, align_with_cell_j=None: Index to a cell for which to align the coarse grid.
            The default value align_with_cell=None corresponds to zero offset between the coarse and fine grid.
        """

        if stream is None:
            stream = self.gpu_stream

        if update_random_field:
            # Need to update the random field, requiering a global sync
            self.generateNormalDistribution()

        soar_q0 = np.float32(self.soar_q0 * q0_scale)

        offset_i, offset_j = self._obtain_coarse_grid_offset(
            align_with_cell_i, align_with_cell_j)

        # Generate the SOAR field on the coarse grid

        self.soarKernel.prepared_async_call(
            self.global_size_SOAR, self.local_size, stream, self.coarse_nx,
            self.coarse_ny,
            self.coarse_dx, self.coarse_dy, soar_q0, self.soar_L,
            np.float32(perturbation_scale), self.periodicNorthSouth,
            self.periodicEastWest, self.random_numbers.data.gpudata,
            self.random_numbers.pitch, self.coarse_buffer.data.gpudata,
            self.coarse_buffer.pitch, np.int32(0))
        if perpendicular_scale > 0:
            self.soarKernel.prepared_async_call(
                self.global_size_SOAR, self.local_size, stream, self.coarse_nx,
                self.coarse_ny, self.coarse_dx,
                self.coarse_dy, soar_q0, self.soar_L,
                np.float32(perpendicular_scale), self.periodicNorthSouth,
                self.periodicEastWest,
                self.perpendicular_random_numbers.data.gpudata,
                self.perpendicular_random_numbers.pitch,
                self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch,
                np.int32(1))

        if self.interpolation_factor > 1:
            self.bicubicInterpolationKernel.prepared_async_call(
                self.global_size_geo_balance, self.local_size, stream,
                self.nx, self.ny, np.int32(ghost_cells_x),
                np.int32(ghost_cells_y), self.dx, self.dy,
                self.coarse_nx, self.coarse_ny, np.int32(ghost_cells_x),
                np.int32(ghost_cells_y), self.coarse_dx, self.coarse_dy,
                np.int32(offset_i), np.int32(offset_j), np.float32(g),
                np.float32(f), np.float32(beta), np.float32(y0_reference_cell),
                self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch,
                eta.data.gpudata, eta.pitch, hu.data.gpudata, hu.pitch,
                hv.data.gpudata, hv.pitch, H.data.gpudata, H.pitch,
                land_mask_value)

        else:
            self.geostrophicBalanceKernel.prepared_async_call(
                self.global_size_geo_balance, self.local_size, stream, self.nx,
                self.ny, self.dx, self.dy, np.int32(ghost_cells_x),
                np.int32(ghost_cells_y), np.float32(g), np.float32(f),
                np.float32(beta), np.float32(y0_reference_cell),
                self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch,
                eta.data.gpudata, eta.pitch, hu.data.gpudata, hu.pitch,
                hv.data.gpudata, hv.pitch, H.data.gpudata, H.pitch,
                land_mask_value)

    def _obtain_coarse_grid_offset(self, fine_index_i, fine_index_j):

        default_offset = self.interpolation_factor // 2

        offset_i, offset_j = 0, 0

        if fine_index_i is not None:
            coarse_i = fine_index_i // self.interpolation_factor
            raw_offset_i = fine_index_i % self.interpolation_factor
            offset_i = -int(raw_offset_i - default_offset)
        if fine_index_j is not None:
            coarse_j = fine_index_j // self.interpolation_factor
            raw_offset_j = fine_index_j % self.interpolation_factor
            offset_j = -int(raw_offset_j - default_offset)
        return offset_i, offset_j

    def getRandomNorm(self):
        """
        Calculates sum(xi^2), where xi \sim N(0,I)
        Calling a kernel that sums the square of all elements in the random buffer
        """
        self.squareSumKernel.prepared_async_call(
            self.global_size_reductions, self.local_size_reductions,
            self.gpu_stream, self.rand_nx, self.rand_ny,
            self.random_numbers.data.gpudata,
            self.reduction_buffer.data.gpudata)
        return self.getReductionBuffer()[0, 0]

    def findDoubleNormAndDot(self):
        """
        Calculates sum(xi^2), sum(nu^2), sum(xi*nu)
        and stores these values in the reduction buffer
        """
        self.squareSumDoubleKernel.prepared_async_call(
            self.global_size_reductions, self.local_size_reductions,
            self.gpu_stream, self.rand_nx, self.rand_ny,
            self.random_numbers.data.gpudata,
            self.perpendicular_random_numbers.data.gpudata,
            self.reduction_buffer.data.gpudata)

    def _makePerpendicular(self):
        """
        Calls the kernel that transform nu (perpendicular_random_numbers buffer) to be 
        perpendicular to xi (random_numbers buffer).
        Both nu and xi should be independent samples from N(0,I) prior to calling this function.
        After this function, they are still both samples from N(0,I), but are no longer independent
        (but lineary independent).
        """
        self.makePerpendicularKernel.prepared_async_call(
            self.global_size_perpendicular, self.local_size, self.gpu_stream,
            self.rand_nx, self.rand_ny, self.random_numbers.data.gpudata,
            self.random_numbers.pitch,
            self.perpendicular_random_numbers.data.gpudata,
            self.perpendicular_random_numbers.pitch,
            self.reduction_buffer.data.gpudata)

    def generatePerpendicularNormalDistributions(self):
        """
        Generates xi, nu \sim N(0,I) such that xi and nu are perpendicular.
        In the process, it calculates sum(xi^2), sum(nu^2), which are written to the first two 
        elements in the reduction buffer.
        The third reduction buffer will contain the original, now outdated, dot(xi, nu), which 
        was used to construct a random nu that is perpendicular to xi in the first place.
        """
        self.generateNormalDistribution()
        self.generateNormalDistributionPerpendicular()
        self.findDoubleNormAndDot()
        self._makePerpendicular()

    ##### CPU versions of the above functions ####

    def getSeedCPU(self):
        assert (
            self.use_lcg
        ), "getSeedCPU is only valid if LCG is used as pseudo-random generator."
        return self.host_seed

    def generateNormalDistributionCPU(self):
        self._CPUUpdateRandom(True)

    def generateUniformDistributionCPU(self):
        self._CPUUpdateRandom(False)

    def getRandomNumbersCPU(self):
        return self.random_numbers_host

    def perturbEtaCPU(self,
                      eta,
                      use_existing_GPU_random_numbers=False,
                      ghost_cells_x=0,
                      ghost_cells_y=0):
        """
        Apply the SOAR Q covariance matrix on the random field to add
        a perturbation to the incomming eta buffer.
        eta: numpy array
        """
        # Call CPU utility function
        if use_existing_GPU_random_numbers:
            self.random_numbers_host = self.getRandomNumbers()
        else:
            self.generateNormalDistributionCPU()
        d_eta = self._applyQ_CPU()

        if self.interpolation_factor > 1:
            d_eta = self._interpolate_CPU(d_eta, geostrophic_balance=False)

        interior = [
            -ghost_cells_y, -ghost_cells_x, ghost_cells_y, ghost_cells_x
        ]
        for i in range(4):
            if interior[i] == 0:
                interior[i] = None

        eta[interior[2]:interior[0], interior[3]:interior[1]] = d_eta[2:-2,
                                                                      2:-2]

    def perturbOceanStateCPU(self,
                             eta,
                             hu,
                             hv,
                             H,
                             f,
                             beta=0.0,
                             g=9.81,
                             ghost_cells_x=0,
                             ghost_cells_y=0,
                             use_existing_GPU_random_numbers=False,
                             use_existing_CPU_random_numbers=False):
        """
        Apply the SOAR Q covariance matrix on the random field to add
        a perturbation to the incomming eta buffer.
        Generate geostrophically balanced hu and hv which is added to the incomming hu and hv buffers.
        eta: numpy array
        """
        # Call CPU utility function
        if use_existing_GPU_random_numbers:
            self.random_numbers_host = self.getRandomNumbers()
        elif not use_existing_CPU_random_numbers:
            self.generateNormalDistributionCPU()

        # generates perturbation (d_eta[ny+4, nx+4], d_hu[ny, nx] and d_hv[ny, nx])
        d_eta, d_hu, d_hv = self._obtainOceanPerturbations_CPU(H, f, beta, g)

        interior = [
            -ghost_cells_y, -ghost_cells_x, ghost_cells_y, ghost_cells_x
        ]
        for i in range(4):
            if interior[i] == 0:
                interior[i] = None

        eta[interior[2]:interior[0], interior[3]:interior[1]] += d_eta[2:-2,
                                                                       2:-2]
        hu[interior[2]:interior[0], interior[3]:interior[1]] += d_hu
        hv[interior[2]:interior[0], interior[3]:interior[1]] += d_hv

    # ------------------------------
    # CPU utility functions:
    # ------------------------------

    def _lcg(self, seed):
        modulo = np.uint64(2147483647)
        seed = np.uint64(((seed * 1103515245) + 12345) % modulo)  #0x7fffffff
        return seed / 2147483648.0, seed

    def _boxMuller(self, seed_in):
        seed = np.uint64(seed_in)
        u1, seed = self._lcg(seed)
        u2, seed = self._lcg(seed)
        r = np.sqrt(-2.0 * np.log(u1))
        theta = 2 * np.pi * u2
        n1 = r * np.cos(theta)
        n2 = r * np.sin(theta)
        return n1, n2, seed

    def _CPUUpdateRandom(self, normalDist):
        """
        Updating the random number buffer at the CPU.
        normalDist: Boolean parameter. 
            If True, the random numbers are from N(0,1)
            If False, the random numbers are from U[0,1]
        """
        if not self.use_lcg:
            if normalDist:
                self.generateNormalDistribution()
            else:
                self.generateUniformDistribution()
            self.random_numbers_host = self.getRandomNumbers()
            return

        #(ny, nx) = seed.shape
        #(domain_ny, domain_nx) = random.shape
        b_dim_x = self.local_size[0]
        b_dim_y = self.local_size[1]
        blocks_x = self.global_size_random_numbers[0]
        blocks_y = self.global_size_random_numbers[1]
        for by in range(blocks_y):
            for bx in range(blocks_x):
                for j in range(b_dim_y):
                    for i in range(b_dim_x):

                        ## Content of kernel:
                        y = b_dim_y * by + j  # thread_id
                        x = b_dim_x * bx + i  # thread_id
                        if (x < self.seed_nx and y < self.seed_ny):
                            n1, n2 = 0.0, 0.0
                            if normalDist:
                                n1, n2, self.host_seed[y, x] = self._boxMuller(
                                    self.host_seed[y, x])
                            else:
                                n1, self.host_seed[y, x] = self._lcg(
                                    self.host_seed[y, x])
                                n2, self.host_seed[y, x] = self._lcg(
                                    self.host_seed[y, x])

                            if x * 2 + 1 < self.rand_nx:
                                self.random_numbers_host[y, x * 2] = n1
                                self.random_numbers_host[y, x * 2 + 1] = n2
                            elif x * 2 == self.rand_nx:
                                self.random_numbers_host[y, x * 2] = n1

    def _SOAR_Q_CPU(self, a_x, a_y, b_x, b_y):
        """
        CPU implementation of a SOAR covariance function between grid points
        (a_x, a_y) and (b_x, b_y)
        """
        dist = np.sqrt(self.coarse_dx * self.coarse_dx * (a_x - b_x)**2 +
                       self.coarse_dy * self.coarse_dy * (a_y - b_y)**2)
        return self.soar_q0 * (1.0 + dist / self.soar_L) * np.exp(
            -dist / self.soar_L)

    def _applyQ_CPU(self, perturbation_scale=1):
        #xi, dx=1, dy=1, q0=0.1, L=1, cutoff=5):
        """
        Create the perturbation field for eta based on the SOAR covariance 
        structure.
        
        The resulting size is (coarse_nx+4, coarse_ny+4), as two ghost cells are required to 
        do bicubic interpolation of the result.
        """

        # Assume in a GPU setting - we read xi into shared memory with ghostcells
        # Additional cutoff number of ghost cells required to calculate SOAR contribution
        ny_halo = int(self.coarse_ny + (2 + self.cutoff) * 2)
        nx_halo = int(self.coarse_nx + (2 + self.cutoff) * 2)
        local_xi = np.zeros((ny_halo, nx_halo))
        for j in range(ny_halo):
            global_j = j
            if self.periodicNorthSouth:
                global_j = (j - self.cutoff - 2) % self.rand_ny
            for i in range(nx_halo):
                global_i = i
                if self.periodicEastWest:
                    global_i = (i - self.cutoff - 2) % self.rand_nx
                local_xi[j, i] = self.random_numbers_host[global_j, global_i]

        # Sync threads

        # Allocate output buffer
        Qxi = np.zeros((self.coarse_ny + 4, self.coarse_nx + 4))
        for a_y in range(self.coarse_ny + 4):
            for a_x in range(self.coarse_nx + 4):
                # This is a OpenCL thread (a_x, a_y)
                local_a_x = a_x + self.cutoff
                local_a_y = a_y + self.cutoff

                #############
                #Qxi[a_y, a_x] = local_xi[local_a_y, local_a_x]
                #continue
                #############

                start_b_y = local_a_y - self.cutoff
                end_b_y = local_a_y + self.cutoff + 1
                start_b_x = local_a_x - self.cutoff
                end_b_x = local_a_x + self.cutoff + 1

                Qx = 0.0
                for b_y in range(start_b_y, end_b_y):
                    for b_x in range(start_b_x, end_b_x):
                        Q = self._SOAR_Q_CPU(local_a_x, local_a_y, b_x, b_y)
                        Qx += Q * local_xi[b_y, b_x]
                Qxi[a_y, a_x] = perturbation_scale * Qx

        return Qxi

    def _obtainOceanPerturbations_CPU(self,
                                      H,
                                      f,
                                      beta,
                                      g,
                                      perturbation_scale=1):
        # Obtain perturbed eta - size (coarse_ny+4, coarse_nx+4)
        d_eta = self._applyQ_CPU(perturbation_scale)

        # Interpolate if the coarse grid is not the same as the computational grid
        # d_eta then becomes (ny+4, nx+4)
        if self.interpolation_factor > 1:
            d_eta = self._interpolate_CPU(d_eta)

        ####
        # Global sync (currently)
        #     Can be made into a local sync, as long as d_eta is given
        #     periodic overlap (1 more global computated ghost cell)
        ####

        d_hu = np.zeros((self.ny, self.nx))
        d_hv = np.zeros((self.ny, self.nx))

        ### Find H_mid:
        # Read global H (def on intersections) to local, find H_mid
        # The local memory can then be reused to something else (perhaps use local_d_eta before computing local_d_eta?)
        H_mid = np.zeros((self.ny, self.nx))
        for j in range(self.ny):
            for i in range(self.nx):
                H_mid[j, i] = 0.25 * (H[j, i] + H[j + 1, i] + H[j, i + 1] +
                                      H[j + 1, i + 1])

        ####
        # Local sync
        ####

        # Compute geostrophically balanced (hu, hv) for each cell within the domain
        for j in range(0, self.ny):
            local_j = j + 2  # index in d_eta buffer
            coriolis = f + beta * local_j * self.dy
            for i in range(0, self.nx):
                local_i = i + 2  # index in d_eta buffer
                h_mid = d_eta[local_j, local_i] + H_mid[j, i]

                ##############
                #h_mid = H_mid[j, i]
                ##############

                eta_diff_y = (d_eta[local_j + 1, local_i] -
                              d_eta[local_j - 1, local_i]) / (2.0 * self.dy)
                d_hu[j, i] = -(g / coriolis) * h_mid * eta_diff_y

                eta_diff_x = (d_eta[local_j, local_i + 1] -
                              d_eta[local_j, local_i - 1]) / (2.0 * self.dx)
                d_hv[j, i] = (g / coriolis) * h_mid * eta_diff_x

        return d_eta, d_hu, d_hv

    def _interpolate_CPU(self, coarse_eta, interpolation_order=3):
        """
        Interpolates values coarse_eta defined on the coarse grid onto the computational grid.
        Input coarse_eta is of size [coarse_ny+4, coarse_nx+4], and output will be given as
        eta [ny+4, nx+4].
        """

        # Create buffers for eta, hu and hv:
        d_eta = np.zeros((self.ny + 4, self.nx + 4))

        min_rel_x = 10
        max_rel_x = -10
        min_rel_y = 10
        max_rel_y = -10

        # Loop over internal cells and first ghost cell layer.
        for loc_j in range(self.ny + 2):
            for loc_i in range(self.nx + 2):

                # index in resulting d_eta buffer
                i = loc_i + 1
                j = loc_j + 1

                # Position of cell center in fine grid:
                x = (i - 2 + 0.5) * self.dx
                y = (j - 2 + 0.5) * self.dy

                # Location in coarse grid (defined in course grid's cell centers)
                # (coarse_i, coarse_j) is the first coarse grid point towards lower left.
                coarse_i = int(np.floor(x / self.coarse_dx + 2 - 0.5))
                coarse_j = int(np.floor(y / self.coarse_dy + 2 - 0.5))

                # Position of the coarse grid point
                coarse_x = (coarse_i - 2 + 0.5) * self.coarse_dx
                coarse_y = (coarse_j - 2 + 0.5) * self.coarse_dy

                assert coarse_x <= x
                assert coarse_x + self.coarse_dx >= x

                rel_x = (x - coarse_x) / self.coarse_dx
                rel_y = (y - coarse_y) / self.coarse_dy

                if rel_x < min_rel_x:
                    min_rel_x = rel_x
                if rel_x > max_rel_x:
                    max_rel_x = rel_x
                if rel_y < min_rel_y:
                    min_rel_y = rel_y
                if rel_y > max_rel_y:
                    max_rel_y = rel_y

                assert rel_x >= 0 and rel_x < 1
                assert rel_y >= 0 and rel_y < 1

                d_eta[j, i] = self._bicubic_interpolation_inner(
                    coarse_eta, coarse_i, coarse_j, rel_x, rel_y,
                    interpolation_order)

        return d_eta

    def _bicubic_interpolation_inner(self,
                                     coarse_eta,
                                     coarse_i,
                                     coarse_j,
                                     rel_x,
                                     rel_y,
                                     interpolation_order=3):
        # Matrix needed to find the interpolation coefficients
        bicubic_matrix = np.matrix([[1, 0, 0, 0], [0, 0, 1, 0],
                                    [-3, 3, -2, -1], [2, -2, 1, 1]])

        f00 = coarse_eta[coarse_j, coarse_i]
        f01 = coarse_eta[coarse_j + 1, coarse_i]
        f10 = coarse_eta[coarse_j, coarse_i + 1]
        f11 = coarse_eta[coarse_j + 1, coarse_i + 1]

        fx00 = (coarse_eta[coarse_j, coarse_i + 1] -
                coarse_eta[coarse_j, coarse_i - 1]) / 2
        fx01 = (coarse_eta[coarse_j + 1, coarse_i + 1] -
                coarse_eta[coarse_j + 1, coarse_i - 1]) / 2
        fx10 = (coarse_eta[coarse_j, coarse_i + 2] -
                coarse_eta[coarse_j, coarse_i]) / 2
        fx11 = (coarse_eta[coarse_j + 1, coarse_i + 2] -
                coarse_eta[coarse_j + 1, coarse_i]) / 2

        fy00 = (coarse_eta[coarse_j + 1, coarse_i] -
                coarse_eta[coarse_j - 1, coarse_i]) / 2
        fy01 = (coarse_eta[coarse_j + 2, coarse_i] -
                coarse_eta[coarse_j, coarse_i]) / 2
        fy10 = (coarse_eta[coarse_j + 1, coarse_i + 1] -
                coarse_eta[coarse_j - 1, coarse_i + 1]) / 2
        fy11 = (coarse_eta[coarse_j + 2, coarse_i + 1] -
                coarse_eta[coarse_j, coarse_i + 1]) / 2

        fy_10 = (coarse_eta[coarse_j + 1, coarse_i - 1] -
                 coarse_eta[coarse_j - 1, coarse_i - 1]) / 2
        fy_11 = (coarse_eta[coarse_j + 2, coarse_i - 1] -
                 coarse_eta[coarse_j, coarse_i - 1]) / 2
        fy20 = (coarse_eta[coarse_j + 1, coarse_i + 2] -
                coarse_eta[coarse_j - 1, coarse_i + 2]) / 2
        fy21 = (coarse_eta[coarse_j + 2, coarse_i + 2] -
                coarse_eta[coarse_j, coarse_i + 2]) / 2

        fxy00 = (fy10 - fy_10) / 2
        fxy01 = (fy11 - fy_11) / 2
        fxy10 = (fy20 - fy00) / 2
        fxy11 = (fy21 - fy01) / 2

        f_matrix = np.matrix([[f00, f01, fy00, fy01], [f10, f11, fy10, fy11],
                              [fx00, fx01, fxy00, fxy01],
                              [fx10, fx11, fxy10, fxy11]])

        a_matrix = np.dot(bicubic_matrix,
                          np.dot(f_matrix, bicubic_matrix.transpose()))

        x_vec = np.matrix([1.0, rel_x, rel_x * rel_x, rel_x * rel_x * rel_x])
        y_vec = np.matrix([1.0, rel_y, rel_y * rel_y,
                           rel_y * rel_y * rel_y]).transpose()

        if interpolation_order == 0:
            # Flat average:
            return 0.25 * (f00 + f01 + f10 + f11)

        elif interpolation_order == 1:
            # Linear interpolation:
            return f00 * (1 - rel_x) * (1 - rel_y) + f10 * rel_x * (
                1 - rel_y) + f01 * (1 - rel_x) * rel_y + f11 * rel_x * rel_y

        elif interpolation_order == 3:
            # Bicubic interpolation (make sure that we return a float)
            return np.dot(x_vec, np.dot(a_matrix, y_vec))[0, 0]
Exemple #10
0
# Author: Chaojie Wang <*****@*****.**>; Jiawen Wu
# License: BSD-3-Clause

import pycuda.curandom as curandom
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.compiler import SourceModule
from pycuda.curandom import XORWOWRandomNumberGenerator

import numpy as np

realmin = 2.2e-10

cuda_generator = XORWOWRandomNumberGenerator()

mod = SourceModule("""

#include <stdio.h>

__device__ int cudarand(long long seed)
{
    if (seed == 0)
    {
        seed = 1;
    }
    long long temp=(48271 * seed + 0) % 2147483647;
    return temp;
}
Exemple #11
0
class PyCudaHandler(Handler):
    __undescribed__ = {'context', 'dtype', 'EMPTY', 'rnd'}

    def __init__(self, seed=None):
        super(PyCudaHandler, self).__init__()
        self.dtype = np.float32
        self.context = cumisc._global_cublas_handle
        self.EMPTY = gpuarray.zeros((), dtype=self.dtype)
        if seed is None:
            seed = global_rnd.generate_seed()

        def get_seeds(n):
            return gpuarray.to_gpu(np.ones(n, np.int32) * seed)
        self.rnd = XORWOWRandomNumberGenerator(seed_getter=get_seeds)

    array_type = pycuda.gpuarray.GPUArray

    def __init_from_description__(self, description):
        self.__init__()

    def _get_gridsize(self, n):
        min_threads = 32
        max_threads = 256
        max_blocks = 384

        if n < min_threads:
            block_count = 1
            threads_per_block = min_threads
        elif n < (max_blocks * min_threads):
            block_count = (n + min_threads - 1) // min_threads
            threads_per_block = min_threads
        elif n < (max_blocks * max_threads):
            block_count = max_blocks
            grp = (n + min_threads - 1) // min_threads
            threads_per_block = ((grp + max_blocks - 1) // max_blocks) * min_threads
        else:
            block_count = max_blocks
            threads_per_block = max_threads

        return (block_count, 1), (threads_per_block, 1, 1)

    # ------------------------- Allocate new memory ------------------------- #

    def allocate(self, size):
        return gpuarray.zeros(size, dtype=self.dtype)

    def ones(self, shape):
        a = self.zeros(shape)
        self.fill(a, 1.0)
        return a

    def zeros(self, shape):
        return gpuarray.zeros(shape=shape, dtype=self.dtype)

    # ---------------------------- Copy and Fill ---------------------------- #

    def copy_to(self, src, dest):
        # Copy data from src to dest (both must be GPUArrays)
        pycuda.driver.memcpy_dtod(dest.gpudata, src.gpudata, dest.nbytes)

    def copy_to_if(self, src, dest, cond):
        copy_to_if_kernel(src, dest, cond)

    def create_from_numpy(self, arr):
        return gpuarray.to_gpu(arr.astype(self.dtype))

    def fill(self, mem, val):
        mem.fill(val)

    def fill_if(self, mem, val, cond):
        fill_if_kernel(mem, val, cond)

    def get_numpy_copy(self, mem):
        assert type(mem) == self.array_type
        return mem.get()

    def set_from_numpy(self, mem, arr):
        assert mem.shape == arr.shape, "Shape of destination ({}) != Shape " \
                                       "of source ({})".format(mem.shape,
                                                               arr.shape)
        mem.set(arr.astype(self.dtype))

    # ---------------------------- Debug helpers ---------------------------- #

    def is_fully_finite(self, a):
        temp = gpuarray.zeros_like(a)
        check_inf_or_nan_kernel(a, temp)
        return not np.any(temp.get())

    # ----------------------- Mathematical operations ----------------------- #

    def abs_t(self, a, out):
        cumath.fabs(a, out=out)

    def add_into_if(self, a, out, cond):
        add_into_if_kernel(a, out, cond)

    def add_mv(self, m, v, out):
        cumisc.add_matvec(m, v, out=out)

    def add_st(self, s, t, out):
        add_st_kernel(s, t, out)

    def add_tt(self, a, b, out):
        add_mm_kernel(a, b, out)

    def avgpool2d_backward_batch(self, inputs, window, outputs, padding,
                                 stride, in_deltas, out_deltas):
        n, h, w, c = inputs.shape
        o_h, o_w = outputs.shape[1], outputs.shape[2]
        _avepool_bwd_fp32_impl(np.int32(inputs.size), out_deltas,
                               np.int32(n), np.int32(h),
                               np.int32(w), np.int32(c),
                               np.int32(o_h), np.int32(o_w),
                               np.int32(window[0]), np.int32(window[1]),
                               np.int32(stride[0]), np.int32(stride[1]),
                               np.int32(padding), np.int32(padding),
                               in_deltas,
                               block=(NUM_CUDA_THREADS, 1, 1),
                               grid=(get_blocks(inputs.size), 1))

    def avgpool2d_forward_batch(self, inputs, window, outputs, padding,
                                stride):
        n, h, w, c = inputs.shape
        o_h, o_w = outputs.shape[1], outputs.shape[2]
        _avepool_fwd_fp32_impl(np.int32(outputs.size), inputs,
                               np.int32(n), np.int32(h),
                               np.int32(w), np.int32(c),
                               np.int32(o_h), np.int32(o_w),
                               np.int32(window[0]), np.int32(window[1]),
                               np.int32(stride[0]), np.int32(stride[1]),
                               np.int32(padding), np.int32(padding),
                               outputs,
                               block=(NUM_CUDA_THREADS, 1, 1),
                               grid=(get_blocks(outputs.size), 1))

    def binarize_v(self, v, out):
        binarize_v_kernel(out, v, out.shape[0], out.shape[1])

    def broadcast_t(self, a, axis, out):
        broadcast_dim = int(out.shape[axis])
        stride = int(np.prod(out.shape[axis+1:]))
        broadcast_t_kernel(out, a, broadcast_dim, stride)

    def clip_t(self, a, a_min, a_max, out):
        clip_kernel(a, out, a_min, a_max)

    def conv2d_backward_batch(self, inputs, params, padding, stride,
                              in_deltas, out_deltas, dparams, dbias):
        num_filters = params.shape[0]
        num_images, input_rows, input_cols, num_input_maps = inputs.shape
        kernel_shape = params.shape[1:]
        num_output_pixels = out_deltas.shape[1] * out_deltas.shape[2]
        num_kernel_params = np.prod(kernel_shape)

        dparams.fill(0.0)
        dbias.fill(0.0)
        tmp = self.zeros(dbias.shape)
        col = self.zeros((num_output_pixels, num_kernel_params))

        for i in range(num_images):
            num_cuda_kernels = num_output_pixels * num_input_maps

            _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i],
                              np.int32(input_rows), np.int32(input_cols),
                              np.int32(kernel_shape[0]),
                              np.int32(kernel_shape[1]),
                              np.int32(padding), np.int32(padding),
                              np.int32(stride[0]), np.int32(stride[1]),
                              np.int32(out_deltas.shape[2]),
                              np.int32(num_input_maps),
                              col.gpudata,
                              block=(NUM_CUDA_THREADS, 1, 1),
                              grid=(get_blocks(num_cuda_kernels), 1))

            # Compute gradients
            reshaped_dparams = dparams.reshape(num_filters, num_kernel_params)
            reshaped_out_deltas = out_deltas[i].reshape((num_output_pixels,
                                                         num_filters))
            self.dot_add_mm(reshaped_out_deltas, col, out=reshaped_dparams,
                            transa=True)

            self.sum_t(reshaped_out_deltas, axis=0, out=tmp)
            self.add_tt(tmp, dbias, out=dbias)

            # Compute in_deltas
            reshaped_params = params.reshape((num_filters, num_kernel_params))
            self.dot_mm(reshaped_out_deltas, reshaped_params, out=col)
            num_cuda_kernels = input_rows * input_cols * num_input_maps
            _col2im_fp32_impl(np.int32(num_cuda_kernels), col.gpudata,
                              np.int32(input_cols), np.int32(num_input_maps),
                              np.int32(kernel_shape[0]),
                              np.int32(kernel_shape[1]),
                              np.int32(padding), np.int32(padding),
                              np.int32(stride[0]), np.int32(stride[1]),
                              np.int32(out_deltas.shape[1]),
                              np.int32(out_deltas.shape[2]),
                              in_deltas[i],
                              block=(NUM_CUDA_THREADS, 1, 1),
                              grid=(get_blocks(num_cuda_kernels), 1))

    def conv2d_forward_batch(self, inputs, params, bias, outputs,
                             padding, stride):
        num_filters = params.shape[0]
        num_images, input_rows, input_cols, num_input_maps = inputs.shape
        kernel_shape = params.shape[1:]
        num_output_pixels = outputs.shape[1] * outputs.shape[2]
        num_kernel_params = np.prod(kernel_shape)
        out_shape = (num_output_pixels, num_filters)
        num_cuda_kernels = num_output_pixels * num_input_maps

        for i in range(num_images):
            col = self.zeros((num_output_pixels, num_kernel_params))
            _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i],
                              np.int32(input_rows), np.int32(input_cols),
                              np.int32(kernel_shape[0]),
                              np.int32(kernel_shape[1]),
                              np.int32(padding), np.int32(padding),
                              np.int32(stride[0]), np.int32(stride[1]),
                              np.int32(outputs.shape[2]),
                              np.int32(num_input_maps),
                              col.gpudata,
                              block=(NUM_CUDA_THREADS, 1, 1),
                              grid=(get_blocks(num_cuda_kernels), 1))

            reshaped_params = params.reshape(num_filters, num_kernel_params)
            culinalg.dot(col, reshaped_params, transb='T',
                         out=outputs[i].reshape(out_shape))

        flat_outputs = flatten_all_but_last(outputs)
        self.add_mv(flat_outputs, bias, flat_outputs)

    def dot_add_mm(self, a, b, out, transa=False, transb=False):
        transa = 'T' if transa else 'N'
        transb = 'T' if transb else 'N'
        culinalg.add_dot(a, b, out, transa, transb)

    def dot_mm(self, a, b, out, transa=False, transb=False):
        transa = 'T' if transa else 'N'
        transb = 'T' if transb else 'N'
        culinalg.dot(a, b, transa=transa, transb=transb, out=out)

    def divide_mv(self, m, v, out):
        cumisc.div_matvec(m, v, out=out)

    def divide_tt(self, a, b, out):
        div_kernel(a, b, out)

    def fill_gaussian(self, mean, std, out):
        self.rnd.fill_normal(out)
        self.mult_st(std, out, out=out)
        self.add_st(mean, out, out=out)

    def generate_probability_mask(self, mask, probability):
        self.rnd.fill_uniform(mask)
        create_probabilistic_mask_kernel(mask, probability, mask)

    def index_m_by_v(self, m, v, out):
        index_m_by_v_kernel(out, v, m, m.shape[0], m.shape[1])

    def log_t(self, a, out):
        cumath.log(a, out=out)

    def maxpool2d_backward_batch(self, inputs, window, outputs, padding,
                                 stride, argmax, in_deltas, out_deltas):
        in_image_size = inputs.size // inputs.shape[0]
        out_image_size = outputs.size // outputs.shape[0]
        _maxpool_bwd_fp32_impl(np.int32(outputs.size), out_deltas,
                               argmax,
                               np.int32(out_image_size),
                               np.int32(in_image_size),
                               in_deltas,
                               block=(NUM_CUDA_THREADS, 1, 1),
                               grid=(get_blocks(outputs.size), 1))

    def maxpool2d_forward_batch(self, inputs, window, outputs, padding,
                                stride, argmax):
        n, h, w, c = inputs.shape
        o_h, o_w = outputs.shape[1], outputs.shape[2]
        _maxpool_fwd_fp32_impl(np.int32(outputs.size), inputs,
                               np.int32(h), np.int32(w), np.int32(c),
                               np.int32(o_h), np.int32(o_w),
                               np.int32(window[0]), np.int32(window[1]),
                               np.int32(stride[0]), np.int32(stride[1]),
                               np.int32(padding), np.int32(padding),
                               outputs,
                               argmax,
                               block=(NUM_CUDA_THREADS, 1, 1),
                               grid=(get_blocks(outputs.size), 1))

    def merge_tt(self, a, b, out):
        assert(a.shape[-1] + b.shape[-1] == out.shape[-1])
        n = int(np.prod(out.shape[:-1]))
        grid, block = self._get_gridsize(n)
        _merge_impl(a.gpudata, b.gpudata, out.gpudata,
                    np.int32(n), np.int32(a.shape[-1]), np.int32(b.shape[-1]),
                    block=block, grid=grid)

    def modulo_tt(self, a, b, out):
        modulo_tt_kernel(a, b, out)

    def mult_add_st(self, s, t, out):
        mult_add_st_kernel(s, t, out)

    def mult_add_tt(self, a, b, out):
        mult_add_kernel(a, b, out)

    def mult_mv(self, m, v, out):
        if m.shape == v.shape:
            self.mult_tt(m, v, out=out)
        else:
            cumisc.mult_matvec(m, v, out=out)

    def mult_add_mv(self, m, v, out):
        if m.shape == v.shape:
            self.mult_add_tt(m, v, out=out)
        else:
            tmp = self.allocate(out.shape)
            cumisc.mult_matvec(m, v, out=tmp)
            self.add_tt(tmp, out, out=out)

    def mult_st(self, s, t, out):
        mult_st_kernel(s, t, out)

    def mult_tt(self, a, b, out):
        mult_tt_kernel(a, b, out)

    def sign_t(self, a, out):
        sign_kernel(a, out)

    def split_add_tt(self, x, out_a, out_b):
        assert(out_a.shape[-1] + out_b.shape[-1] == x.shape[-1])
        n = int(np.prod(x.shape[:-1]))
        grid, block = self._get_gridsize(n)
        _split_add_impl(x.gpudata, out_a.gpudata, out_b.gpudata,
                        np.int32(n), np.int32(out_a.shape[-1]),
                        np.int32(out_b.shape[-1]),
                        block=block, grid=grid)

    def sqrt_t(self, a, out):
        cumath.sqrt(a, out=out)

    def subtract_mv(self, m, v, out):
        cumisc.binaryop_matvec('-', m, v, None, out, None)

    def subtract_tt(self, a, b, out):
        subtract_mm_kernel(a, b, out)

    def sum_t(self, a, axis, out):
        if len(a.shape) < 3 and (axis == 0 or axis == 1):
            cumisc.sum(a, axis=axis, out=out)
        elif axis is None:
            cumisc.sum(a.reshape((a.size, 1)), axis=0, out=out)
        else:
            raise NotImplementedError

    # ------------------------ Activation functions ------------------------- #

    def rel(self, x, y):
        rel_kernel(x, y)

    def rel_deriv(self, x, y, dy, dx):
        rel_deriv_kernel(x, y, dy, dx)

    def sigmoid(self, x, y):
        sigmoid_kernel(x, y)

    def sigmoid_deriv(self, x, y, dy, dx):
        sigmoid_deriv_kernel(x, y, dy, dx)

    def softmax_m(self, m, out):
        n, k = m.shape
        tmp = gpuarray.empty((1, n), dtype=m.dtype)
        _softmax_impl(m, tmp.gpudata, out, np.int32(n),
                      np.int32(k), block=(32, 1, 1), grid=(n, 1, 1))
        return out

    def tanh(self, x, y):
        tanh_kernel(x, y)

    def tanh_deriv(self, x, y, dy, dx):
        tanh_deriv_kernel(x, y, dy, dx)
Exemple #12
0
    def __init__(self, gpu_ctx, gpu_stream,
                 nx, ny, dx, dy,
                 boundaryConditions, staggered,
                 soar_q0=None, soar_L=None,
                 interpolation_factor = 1,
                 use_lcg=False,
                 block_width=16, block_height=16):
        """
        Initiates a class that generates small scale geostrophically balanced perturbations of
        the ocean state.
        (nx, ny): number of internal grid cells in the domain
        (dx, dy): size of each grid cell
        soar_q0: amplitude parameter for the perturbation, default: dx*1e-5
        soar_L: length scale of the perturbation covariance, default: 0.74*dx*interpolation_factor
        interpolation_factor: indicates that the perturbation of eta should be generated on a coarse mesh, 
            and then interpolated down to the computational mesh. The coarse mesh will then have
            (nx/interpolation_factor, ny/interpolation_factor) grid cells.
        use_lcg: LCG is a linear algorithm for generating a serie of pseudo-random numbers
        (block_width, block_height): The size of each GPU block
        """

        self.use_lcg = use_lcg

        # Set numpy random state
        self.random_state = np.random.RandomState()
        
        # Make sure that all variables initialized within ifs are defined
        self.random_numbers = None
        self.rng = None
        self.seed = None
        self.host_seed = None
        
        self.gpu_ctx = gpu_ctx
        self.gpu_stream = gpu_stream
        
        self.nx = np.int32(nx)
        self.ny = np.int32(ny)
        self.dx = np.float32(dx)
        self.dy = np.float32(dy)
        self.staggered = np.int(0)
        if staggered:
            self.staggered = np.int(1)
            
        # The cutoff parameter is hard-coded.
        # The size of the cutoff determines the computational radius in the
        # SOAR function. Hence, the size of the local memory in the OpenCL 
        # kernels has to be hard-coded.
        self.cutoff = np.int32(config.soar_cutoff) 
        
        # Check that the interpolation factor plays well with the grid size:
        assert ( interpolation_factor > 0 and interpolation_factor % 2 == 1), 'interpolation_factor must be a positive odd integer'
        
        assert (nx % interpolation_factor == 0), 'nx must be divisible by the interpolation factor'
        assert (ny % interpolation_factor == 0), 'ny must be divisible by the interpolation factor'
        self.interpolation_factor = np.int32(interpolation_factor)
        
        # The size of the coarse grid 
        self.coarse_nx = np.int32(nx/self.interpolation_factor)
        self.coarse_ny = np.int32(ny/self.interpolation_factor)
        self.coarse_dx = np.float32(dx*self.interpolation_factor)
        self.coarse_dy = np.float32(dy*self.interpolation_factor)
        
        self.periodicNorthSouth = np.int32(boundaryConditions.isPeriodicNorthSouth())
        self.periodicEastWest = np.int32(boundaryConditions.isPeriodicEastWest())
        
        # Size of random field and seed
        # The SOAR function is a stencil which requires cutoff number of grid cells,
        # and the interpolation operator requires further 2 ghost cell values in each direction.
        # The random field must therefore be created with 2 + cutoff number of ghost cells.
        self.rand_ghost_cells_x = np.int32(2+self.cutoff)
        self.rand_ghost_cells_y = np.int32(2+self.cutoff)
        if self.periodicEastWest:
            self.rand_ghost_cells_x = np.int32(0)
        if self.periodicNorthSouth:
            self.rand_ghost_cells_y = np.int32(0)
        self.rand_nx = np.int32(self.coarse_nx + 2*self.rand_ghost_cells_x)
        self.rand_ny = np.int32(self.coarse_ny + 2*self.rand_ghost_cells_y)

        # Since normal distributed numbers are generated in pairs, we need to store half the number of
        # of seed values compared to the number of random numbers.
        self.seed_ny = np.int32(self.rand_ny)
        self.seed_nx = np.int32(np.ceil(self.rand_nx/2))

        # Generate seed:
        self.floatMax = 2147483648.0
        if self.use_lcg:
            self.host_seed = self.random_state.rand(self.seed_ny, self.seed_nx)*self.floatMax
            self.host_seed = self.host_seed.astype(np.uint64, order='C')
        
        if not self.use_lcg:
            self.rng = XORWOWRandomNumberGenerator()
        else:
            self.seed = Common.CUDAArray2D(gpu_stream, self.seed_nx, self.seed_ny, 0, 0, self.host_seed, double_precision=True, integers=True)
        
        # Constants for the SOAR function:
        self.soar_q0 = np.float32(self.dx/100000)
        if soar_q0 is not None:
            self.soar_q0 = np.float32(soar_q0)
            
        self.soar_L = np.float32(0.75*self.coarse_dx)
        if soar_L is not None:
            self.soar_L = np.float32(soar_L)
        
        # Allocate memory for random numbers (xi)
        self.random_numbers_host = np.zeros((self.rand_ny, self.rand_nx), dtype=np.float32, order='C')
        self.random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0, self.random_numbers_host)
        
        # Allocate a second buffer for random numbers (nu)
        self.perpendicular_random_numbers_host = np.zeros((self.rand_ny, self.rand_nx), dtype=np.float32, order='C')
        self.perpendicular_random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0, self.random_numbers_host)
        
        
        # Allocate memory for coarse buffer if needed
        # Two ghost cells in each direction needed for bicubic interpolation 
        self.coarse_buffer_host = np.zeros((self.coarse_ny+4, self.coarse_nx+4), dtype=np.float32, order='C')
        self.coarse_buffer = Common.CUDAArray2D(self.gpu_stream, self.coarse_nx, self.coarse_ny, 2, 2, self.coarse_buffer_host)

        # Allocate extra memory needed for reduction kernels.
        # Currently: A single GPU buffer with 3x1 elements: [xi^T * xi, nu^T * nu, xi^T * nu]
        self.reduction_buffer = None
        reduction_buffer_host = np.zeros((1,3), dtype=np.float32)
        self.reduction_buffer = Common.CUDAArray2D(self.gpu_stream, 3, 1, 0, 0, reduction_buffer_host)
       
        # Generate kernels
        self.kernels = gpu_ctx.get_kernel("ocean_noise.cu", \
                                          defines={'block_width': block_width, 'block_height': block_height})
        
        self.reduction_kernels = self.gpu_ctx.get_kernel("reductions.cu", \
                                                         defines={})
        
        # Get CUDA functions and define data types for prepared_{async_}call()
        # Generate kernels
        self.squareSumKernel = self.reduction_kernels.get_function("squareSum")
        self.squareSumKernel.prepare("iiPP")
                
        self.squareSumDoubleKernel = self.reduction_kernels.get_function("squareSumDouble")
        self.squareSumDoubleKernel.prepare("iiPPP")
        
        self.makePerpendicularKernel = self.kernels.get_function("makePerpendicular")
        self.makePerpendicularKernel.prepare("iiPiPiP")
        
        self.uniformDistributionKernel = self.kernels.get_function("uniformDistribution")
        self.uniformDistributionKernel.prepare("iiiPiPi")
        
        self.normalDistributionKernel = None
        if self.use_lcg:
            self.normalDistributionKernel = self.kernels.get_function("normalDistribution")
            self.normalDistributionKernel.prepare("iiiPiPi")
        
        self.soarKernel = self.kernels.get_function("SOAR")
        self.soarKernel.prepare("iifffffiiPiPii")
        
        self.geostrophicBalanceKernel = self.kernels.get_function("geostrophicBalance")
        self.geostrophicBalanceKernel.prepare("iiffiiffffPiPiPiPiPi")
        
        self.bicubicInterpolationKernel = self.kernels.get_function("bicubicInterpolation")
        self.bicubicInterpolationKernel.prepare("iiiiffiiiiffiiffffPiPiPiPiPi")
        
        #Compute kernel launch parameters
        self.local_size = (block_width, block_height, 1)
        
        self.local_size_reductions  = (128, 1, 1)
        self.global_size_reductions = (1,   1)
        
        # Launch one thread for each seed, which in turns generates two iid N(0,1)
        self.global_size_random_numbers = ( \
                       int(np.ceil(self.seed_nx / float(self.local_size[0]))), \
                       int(np.ceil(self.seed_ny / float(self.local_size[1]))) \
                     ) 
        
        # Launch on thread for each random number (in order to create perpendicular random numbers)
        self.global_size_perpendicular = ( \
                      int(np.ceil(self.rand_nx / float(self.local_size[0]))), \
                      int(np.ceil(self.rand_ny / float(self.local_size[1]))) \
                     )
        
        
        # Launch one thread per SOAR-correlated result - need to write to two ghost 
        # cells in order to do bicubic interpolation based on the result
        self.global_size_SOAR = ( \
                     int(np.ceil( (self.coarse_nx+4)/float(self.local_size[0]))), \
                     int(np.ceil( (self.coarse_ny+4)/float(self.local_size[1]))) \
                    )
        
        # One thread per resulting perturbed grid cell
        self.global_size_geo_balance = ( \
                    int(np.ceil( (self.nx)/float(self.local_size[0]))), \
                    int(np.ceil( (self.ny)/float(self.local_size[1]))) \
                   )
Exemple #13
0
class OceanStateNoise(object):
    """
    Generating random perturbations for a ocean state.
   
    Perturbation for the surface field, dEta, is produced with a covariance structure according to a SOAR function,
    while dHu and dHv are found by the geostrophic balance to avoid shock solutions.
    """
    
    def __init__(self, gpu_ctx, gpu_stream,
                 nx, ny, dx, dy,
                 boundaryConditions, staggered,
                 soar_q0=None, soar_L=None,
                 interpolation_factor = 1,
                 use_lcg=False,
                 block_width=16, block_height=16):
        """
        Initiates a class that generates small scale geostrophically balanced perturbations of
        the ocean state.
        (nx, ny): number of internal grid cells in the domain
        (dx, dy): size of each grid cell
        soar_q0: amplitude parameter for the perturbation, default: dx*1e-5
        soar_L: length scale of the perturbation covariance, default: 0.74*dx*interpolation_factor
        interpolation_factor: indicates that the perturbation of eta should be generated on a coarse mesh, 
            and then interpolated down to the computational mesh. The coarse mesh will then have
            (nx/interpolation_factor, ny/interpolation_factor) grid cells.
        use_lcg: LCG is a linear algorithm for generating a serie of pseudo-random numbers
        (block_width, block_height): The size of each GPU block
        """

        self.use_lcg = use_lcg

        # Set numpy random state
        self.random_state = np.random.RandomState()
        
        # Make sure that all variables initialized within ifs are defined
        self.random_numbers = None
        self.rng = None
        self.seed = None
        self.host_seed = None
        
        self.gpu_ctx = gpu_ctx
        self.gpu_stream = gpu_stream
        
        self.nx = np.int32(nx)
        self.ny = np.int32(ny)
        self.dx = np.float32(dx)
        self.dy = np.float32(dy)
        self.staggered = np.int(0)
        if staggered:
            self.staggered = np.int(1)
            
        # The cutoff parameter is hard-coded.
        # The size of the cutoff determines the computational radius in the
        # SOAR function. Hence, the size of the local memory in the OpenCL 
        # kernels has to be hard-coded.
        self.cutoff = np.int32(config.soar_cutoff) 
        
        # Check that the interpolation factor plays well with the grid size:
        assert ( interpolation_factor > 0 and interpolation_factor % 2 == 1), 'interpolation_factor must be a positive odd integer'
        
        assert (nx % interpolation_factor == 0), 'nx must be divisible by the interpolation factor'
        assert (ny % interpolation_factor == 0), 'ny must be divisible by the interpolation factor'
        self.interpolation_factor = np.int32(interpolation_factor)
        
        # The size of the coarse grid 
        self.coarse_nx = np.int32(nx/self.interpolation_factor)
        self.coarse_ny = np.int32(ny/self.interpolation_factor)
        self.coarse_dx = np.float32(dx*self.interpolation_factor)
        self.coarse_dy = np.float32(dy*self.interpolation_factor)
        
        self.periodicNorthSouth = np.int32(boundaryConditions.isPeriodicNorthSouth())
        self.periodicEastWest = np.int32(boundaryConditions.isPeriodicEastWest())
        
        # Size of random field and seed
        # The SOAR function is a stencil which requires cutoff number of grid cells,
        # and the interpolation operator requires further 2 ghost cell values in each direction.
        # The random field must therefore be created with 2 + cutoff number of ghost cells.
        self.rand_ghost_cells_x = np.int32(2+self.cutoff)
        self.rand_ghost_cells_y = np.int32(2+self.cutoff)
        if self.periodicEastWest:
            self.rand_ghost_cells_x = np.int32(0)
        if self.periodicNorthSouth:
            self.rand_ghost_cells_y = np.int32(0)
        self.rand_nx = np.int32(self.coarse_nx + 2*self.rand_ghost_cells_x)
        self.rand_ny = np.int32(self.coarse_ny + 2*self.rand_ghost_cells_y)

        # Since normal distributed numbers are generated in pairs, we need to store half the number of
        # of seed values compared to the number of random numbers.
        self.seed_ny = np.int32(self.rand_ny)
        self.seed_nx = np.int32(np.ceil(self.rand_nx/2))

        # Generate seed:
        self.floatMax = 2147483648.0
        if self.use_lcg:
            self.host_seed = self.random_state.rand(self.seed_ny, self.seed_nx)*self.floatMax
            self.host_seed = self.host_seed.astype(np.uint64, order='C')
        
        if not self.use_lcg:
            self.rng = XORWOWRandomNumberGenerator()
        else:
            self.seed = Common.CUDAArray2D(gpu_stream, self.seed_nx, self.seed_ny, 0, 0, self.host_seed, double_precision=True, integers=True)
        
        # Constants for the SOAR function:
        self.soar_q0 = np.float32(self.dx/100000)
        if soar_q0 is not None:
            self.soar_q0 = np.float32(soar_q0)
            
        self.soar_L = np.float32(0.75*self.coarse_dx)
        if soar_L is not None:
            self.soar_L = np.float32(soar_L)
        
        # Allocate memory for random numbers (xi)
        self.random_numbers_host = np.zeros((self.rand_ny, self.rand_nx), dtype=np.float32, order='C')
        self.random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0, self.random_numbers_host)
        
        # Allocate a second buffer for random numbers (nu)
        self.perpendicular_random_numbers_host = np.zeros((self.rand_ny, self.rand_nx), dtype=np.float32, order='C')
        self.perpendicular_random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0, self.random_numbers_host)
        
        
        # Allocate memory for coarse buffer if needed
        # Two ghost cells in each direction needed for bicubic interpolation 
        self.coarse_buffer_host = np.zeros((self.coarse_ny+4, self.coarse_nx+4), dtype=np.float32, order='C')
        self.coarse_buffer = Common.CUDAArray2D(self.gpu_stream, self.coarse_nx, self.coarse_ny, 2, 2, self.coarse_buffer_host)

        # Allocate extra memory needed for reduction kernels.
        # Currently: A single GPU buffer with 3x1 elements: [xi^T * xi, nu^T * nu, xi^T * nu]
        self.reduction_buffer = None
        reduction_buffer_host = np.zeros((1,3), dtype=np.float32)
        self.reduction_buffer = Common.CUDAArray2D(self.gpu_stream, 3, 1, 0, 0, reduction_buffer_host)
       
        # Generate kernels
        self.kernels = gpu_ctx.get_kernel("ocean_noise.cu", \
                                          defines={'block_width': block_width, 'block_height': block_height})
        
        self.reduction_kernels = self.gpu_ctx.get_kernel("reductions.cu", \
                                                         defines={})
        
        # Get CUDA functions and define data types for prepared_{async_}call()
        # Generate kernels
        self.squareSumKernel = self.reduction_kernels.get_function("squareSum")
        self.squareSumKernel.prepare("iiPP")
                
        self.squareSumDoubleKernel = self.reduction_kernels.get_function("squareSumDouble")
        self.squareSumDoubleKernel.prepare("iiPPP")
        
        self.makePerpendicularKernel = self.kernels.get_function("makePerpendicular")
        self.makePerpendicularKernel.prepare("iiPiPiP")
        
        self.uniformDistributionKernel = self.kernels.get_function("uniformDistribution")
        self.uniformDistributionKernel.prepare("iiiPiPi")
        
        self.normalDistributionKernel = None
        if self.use_lcg:
            self.normalDistributionKernel = self.kernels.get_function("normalDistribution")
            self.normalDistributionKernel.prepare("iiiPiPi")
        
        self.soarKernel = self.kernels.get_function("SOAR")
        self.soarKernel.prepare("iifffffiiPiPii")
        
        self.geostrophicBalanceKernel = self.kernels.get_function("geostrophicBalance")
        self.geostrophicBalanceKernel.prepare("iiffiiffffPiPiPiPiPi")
        
        self.bicubicInterpolationKernel = self.kernels.get_function("bicubicInterpolation")
        self.bicubicInterpolationKernel.prepare("iiiiffiiiiffiiffffPiPiPiPiPi")
        
        #Compute kernel launch parameters
        self.local_size = (block_width, block_height, 1)
        
        self.local_size_reductions  = (128, 1, 1)
        self.global_size_reductions = (1,   1)
        
        # Launch one thread for each seed, which in turns generates two iid N(0,1)
        self.global_size_random_numbers = ( \
                       int(np.ceil(self.seed_nx / float(self.local_size[0]))), \
                       int(np.ceil(self.seed_ny / float(self.local_size[1]))) \
                     ) 
        
        # Launch on thread for each random number (in order to create perpendicular random numbers)
        self.global_size_perpendicular = ( \
                      int(np.ceil(self.rand_nx / float(self.local_size[0]))), \
                      int(np.ceil(self.rand_ny / float(self.local_size[1]))) \
                     )
        
        
        # Launch one thread per SOAR-correlated result - need to write to two ghost 
        # cells in order to do bicubic interpolation based on the result
        self.global_size_SOAR = ( \
                     int(np.ceil( (self.coarse_nx+4)/float(self.local_size[0]))), \
                     int(np.ceil( (self.coarse_ny+4)/float(self.local_size[1]))) \
                    )
        
        # One thread per resulting perturbed grid cell
        self.global_size_geo_balance = ( \
                    int(np.ceil( (self.nx)/float(self.local_size[0]))), \
                    int(np.ceil( (self.ny)/float(self.local_size[1]))) \
                   )
        
        
        
    def __del__(self):
        self.cleanUp()
     
    def cleanUp(self):
        if self.rng is not None:
            self.rng = None
        if self.seed is not None:
            self.seed.release()
        if self.random_numbers is not None:
            self.random_numbers.release()
        if self.perpendicular_random_numbers is not None:
            self.perpendicular_random_numbers.release()
        if self.reduction_buffer is not None:
            self.reduction_buffer.release()
        self.gpu_ctx = None
        gc.collect()
        
    @classmethod
    def fromsim(cls, sim, soar_q0=None, soar_L=None, interpolation_factor=1,  
                block_width=16, block_height=16):
        staggered = False
        if isinstance(sim, FBL.FBL) or isinstance(sim, CTCS.CTCS):
            staggered = True
        return cls(sim.gpu_ctx, sim.gpu_stream,
                   sim.nx, sim.ny, sim.dx, sim.dy,
                   sim.boundary_conditions, staggered,
                   soar_q0=soar_q0, soar_L=soar_L,
                   interpolation_factor=interpolation_factor,
                   block_width=block_width, block_height=block_height)

    def getSeed(self):
        assert(self.use_lcg), "getSeed is only valid if LCG is used as pseudo-random generator."
        
        return self.seed.download(self.gpu_stream)
    
    def resetSeed(self):
        assert(self.use_lcg), "resetSeed is only valid if LCG is used as pseudo-random generator."

        # Generate seed:
        self.floatMax = 2147483648.0
        self.host_seed = self.random_state.rand(self.seed_ny, self.seed_nx)*self.floatMax
        self.host_seed = self.host_seed.astype(np.uint64, order='C')
        self.seed.upload(self.gpu_stream, self.host_seed)

    def getRandomNumbers(self):
        return self.random_numbers.download(self.gpu_stream)
    
    def getPerpendicularRandomNumbers(self):
        return self.perpendicular_random_numbers.download(self.gpu_stream)
    
    def getCoarseBuffer(self):
        return self.coarse_buffer.download(self.gpu_stream)
    
    def getReductionBuffer(self):
        return self.reduction_buffer.download(self.gpu_stream)
    
    def generateNormalDistribution(self):
        if not self.use_lcg:
            self.rng.fill_normal(self.random_numbers.data, stream=self.gpu_stream)
        else:
            self.normalDistributionKernel.prepared_async_call(self.global_size_random_numbers, self.local_size, self.gpu_stream,
                                                              self.seed_nx, self.seed_ny,
                                                              self.rand_nx,
                                                              self.seed.data.gpudata, self.seed.pitch,
                                                              self.random_numbers.data.gpudata, self.random_numbers.pitch)
    
    def generateNormalDistributionPerpendicular(self):
        if not self.use_lcg:
            self.rng.fill_normal(self.perpendicular_random_numbers.data, stream=self.gpu_stream)
        else:
            self.normalDistributionKernel.prepared_async_call(self.global_size_random_numbers, self.local_size, self.gpu_stream,
                                                              self.seed_nx, self.seed_ny,
                                                              self.rand_nx,
                                                              self.seed.data.gpudata, self.seed.pitch,
                                                              self.perpendicular_random_numbers.data.gpudata, self.perpendicular_random_numbers.pitch)

    def generateUniformDistribution(self):
        # Call kernel -> new random numbers
        if not self.use_lcg:
            self.rng.fill_uniform(self.random_numbers.data, stream=self.gpu_stream)
        else:
            self.uniformDistributionKernel.prepared_async_call(self.global_size_random_numbers, self.local_size, self.gpu_stream,
                                                               self.seed_nx, self.seed_ny,
                                                               self.rand_nx,
                                                               self.seed.data.gpudata, self.seed.pitch,
                                                               self.random_numbers.data.gpudata, self.random_numbers.pitch)

    def perturbSim(self, sim, q0_scale=1.0, update_random_field=True, 
                   perturbation_scale=1.0, perpendicular_scale=0.0,
                   align_with_cell_i=None, align_with_cell_j=None):
        """
        Generating a perturbed ocean state and adding it to sim's ocean state 
        """
        
        self.perturbOceanState(sim.gpu_data.h0, sim.gpu_data.hu0, sim.gpu_data.hv0,
                               sim.bathymetry.Bi,
                               sim.f, beta=sim.coriolis_beta, g=sim.g, 
                               y0_reference_cell=sim.y_zero_reference_cell,
                               ghost_cells_x=sim.ghost_cells_x,
                               ghost_cells_y=sim.ghost_cells_y,
                               q0_scale=q0_scale,
                               update_random_field=update_random_field,
                               perturbation_scale=perturbation_scale,
                               perpendicular_scale=perpendicular_scale,
                               align_with_cell_i=align_with_cell_i,
                               align_with_cell_j=align_with_cell_j)
                               
    
    def perturbOceanState(self, eta, hu, hv, H, f, beta=0.0, g=9.81, 
                          y0_reference_cell=0, ghost_cells_x=0, ghost_cells_y=0,
                          q0_scale=1.0, update_random_field=True, 
                          perturbation_scale=1.0, perpendicular_scale=0.0,
                          align_with_cell_i=None, align_with_cell_j=None):
        """
        Apply the SOAR Q covariance matrix on the random ocean field which is
        added to the provided buffers eta, hu and hv.
        eta: surface deviation - CUDAArray2D object.
        hu: volume transport in x-direction - CUDAArray2D object.
        hv: volume transport in y-dirextion - CUDAArray2D object.
        
        Optional parameters not used else_where:
        q0_scale=1: scale factor to the SOAR amplitude parameter q0
        update_random_field=True: whether to generate new random numbers or use those already 
            present in the random numbers buffer
        perturbation_scale=1.0: scale factor to the perturbation of the eta field
        perpendicular_scale=0.0: scale factor for additional perturbation from the perpendicular random field
        align_with_cell_i=None, align_with_cell_j=None: Index to a cell for which to align the coarse grid.
            The default value align_with_cell=None corresponds to zero offset between the coarse and fine grid.
        """
        if update_random_field:
            # Need to update the random field, requiering a global sync
            self.generateNormalDistribution()
        
        soar_q0 = np.float32(self.soar_q0 * q0_scale)
        
        offset_i, offset_j = self._obtain_coarse_grid_offset(align_with_cell_i, align_with_cell_j)
        
        # Generate the SOAR field on the coarse grid
        
        
        self.soarKernel.prepared_async_call(self.global_size_SOAR, self.local_size, self.gpu_stream,
                                            self.coarse_nx, self.coarse_ny,
                                            self.coarse_dx, self.coarse_dy,

                                            soar_q0, self.soar_L,
                                            np.float32(perturbation_scale),
                                            
                                            self.periodicNorthSouth, self.periodicEastWest,
                                            self.random_numbers.data.gpudata, self.random_numbers.pitch,
                                            self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch,
                                            np.int32(0))
        if perpendicular_scale > 0:
            self.soarKernel.prepared_async_call(self.global_size_SOAR, self.local_size, self.gpu_stream,
                                                self.coarse_nx, self.coarse_ny,
                                                self.coarse_dx, self.coarse_dy,

                                                soar_q0, self.soar_L,
                                                np.float32(perpendicular_scale),

                                                self.periodicNorthSouth, self.periodicEastWest,
                                                self.perpendicular_random_numbers.data.gpudata, self.perpendicular_random_numbers.pitch,
                                                self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch,
                                                np.int32(1))
        
        if self.interpolation_factor > 1:
            self.bicubicInterpolationKernel.prepared_async_call(self.global_size_geo_balance, self.local_size, self.gpu_stream,
                                                                self.nx, self.ny, 
                                                                np.int32(ghost_cells_x), np.int32(ghost_cells_y),
                                                                self.dx, self.dy,
                                                                
                                                                self.coarse_nx, self.coarse_ny,
                                                                np.int32(ghost_cells_x), np.int32(ghost_cells_y),
                                                                self.coarse_dx, self.coarse_dy,
                                                                np.int32(offset_i), np.int32(offset_j),
                                                                
                                                                np.float32(g), np.float32(f),
                                                                np.float32(beta), np.float32(y0_reference_cell),
                                                                
                                                                self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch,
                                                                eta.data.gpudata, eta.pitch,
                                                                hu.data.gpudata, hu.pitch,
                                                                hv.data.gpudata, hv.pitch,
                                                                H.data.gpudata, H.pitch)

        else:
            self.geostrophicBalanceKernel.prepared_async_call(self.global_size_geo_balance, self.local_size, self.gpu_stream,
                                                              self.nx, self.ny,
                                                              self.dx, self.dy,
                                                              np.int32(ghost_cells_x), np.int32(ghost_cells_y),

                                                              np.float32(g), np.float32(f),
                                                              np.float32(beta), np.float32(y0_reference_cell),

                                                              self.coarse_buffer.data.gpudata, self.coarse_buffer.pitch,
                                                              eta.data.gpudata, eta.pitch,
                                                              hu.data.gpudata, hu.pitch,
                                                              hv.data.gpudata, hv.pitch,
                                                              H.data.gpudata, H.pitch)
    
    def _obtain_coarse_grid_offset(self, fine_index_i, fine_index_j):
        
        default_offset = self.interpolation_factor//2

        offset_i, offset_j = 0, 0
        
        if fine_index_i is not None:
            coarse_i = fine_index_i//self.interpolation_factor
            raw_offset_i = fine_index_i % self.interpolation_factor
            offset_i = -int(raw_offset_i - default_offset)
        if fine_index_j is not None:        
            coarse_j = fine_index_j//self.interpolation_factor
            raw_offset_j = fine_index_j % self.interpolation_factor
            offset_j = -int(raw_offset_j - default_offset)
        return offset_i, offset_j
    

    def getRandomNorm(self):
        """
        Calculates sum(xi^2), where xi \sim N(0,I)
        Calling a kernel that sums the square of all elements in the random buffer
        """
        self.squareSumKernel.prepared_async_call(self.global_size_reductions,
                                                 self.local_size_reductions, 
                                                 self.gpu_stream,
                                                 self.rand_nx, self.rand_ny,
                                                 self.random_numbers.data.gpudata,
                                                 self.reduction_buffer.data.gpudata)
        return self.getReductionBuffer()[0,0]
    
   
    def findDoubleNormAndDot(self):
        """
        Calculates sum(xi^2), sum(nu^2), sum(xi*nu)
        and stores these values in the reduction buffer
        """
        self.squareSumDoubleKernel.prepared_async_call(self.global_size_reductions,
                                                       self.local_size_reductions, 
                                                       self.gpu_stream,
                                                       self.rand_nx, self.rand_ny,
                                                       self.random_numbers.data.gpudata,
                                                       self.perpendicular_random_numbers.data.gpudata,
                                                       self.reduction_buffer.data.gpudata)
        
    def _makePerpendicular(self):
        """
        Calls the kernel that transform nu (perpendicular_random_numbers buffer) to be 
        perpendicular to xi (random_numbers buffer).
        Both nu and xi should be independent samples from N(0,I) prior to calling this function.
        After this function, they are still both samples from N(0,I), but are no longer independent
        (but lineary independent).
        """
        self.makePerpendicularKernel.prepared_async_call(self.global_size_perpendicular, self.local_size, self.gpu_stream,
                                                         self.rand_nx, self.rand_ny,
                                                         self.random_numbers.data.gpudata, self.random_numbers.pitch,
                                                         self.perpendicular_random_numbers.data.gpudata, self.perpendicular_random_numbers.pitch,
                                                         self.reduction_buffer.data.gpudata)
    
    def generatePerpendicularNormalDistributions(self):
        """
        Generates xi, nu \sim N(0,I) such that xi and nu are perpendicular.
        In the process, it calculates sum(xi^2), sum(nu^2), which are written to the first two 
        elements in the reduction buffer.
        The third reduction buffer will contain the original, now outdated, dot(xi, nu), which 
        was used to construct a random nu that is perpendicular to xi in the first place.
        """
        self.generateNormalDistribution()
        self.generateNormalDistributionPerpendicular()
        self.findDoubleNormAndDot()
        self._makePerpendicular()
    
    
    ##### CPU versions of the above functions ####
    
    def getSeedCPU(self):
        assert(self.use_lcg), "getSeedCPU is only valid if LCG is used as pseudo-random generator."
        return self.host_seed
    
    def generateNormalDistributionCPU(self):
        self._CPUUpdateRandom(True)
    
    def generateUniformDistributionCPU(self):
        self._CPUUpdateRandom(False)
    
    def getRandomNumbersCPU(self):
        return self.random_numbers_host
    
    def perturbEtaCPU(self, eta, use_existing_GPU_random_numbers=False,
                      ghost_cells_x=0, ghost_cells_y=0):
        """
        Apply the SOAR Q covariance matrix on the random field to add
        a perturbation to the incomming eta buffer.
        eta: numpy array
        """
        # Call CPU utility function
        if use_existing_GPU_random_numbers:
            self.random_numbers_host = self.getRandomNumbers()
        else:
            self.generateNormalDistributionCPU()
        d_eta = self._applyQ_CPU()
        
        if self.interpolation_factor > 1:
            d_eta = self._interpolate_CPU(d_eta, geostrophic_balance=False)
        
        interior = [-ghost_cells_y, -ghost_cells_x, ghost_cells_y, ghost_cells_x]
        for i in range(4):
            if interior[i] == 0:
                interior[i] = None
        
        eta[interior[2]:interior[0], interior[3]:interior[1]] = d_eta[2:-2, 2:-2]
    
    def perturbOceanStateCPU(self, eta, hu, hv, H, f,  beta=0.0, g=9.81,
                             ghost_cells_x=0, ghost_cells_y=0,
                             use_existing_GPU_random_numbers=False,
                             use_existing_CPU_random_numbers=False):
        """
        Apply the SOAR Q covariance matrix on the random field to add
        a perturbation to the incomming eta buffer.
        Generate geostrophically balanced hu and hv which is added to the incomming hu and hv buffers.
        eta: numpy array
        """
        # Call CPU utility function
        if use_existing_GPU_random_numbers:
            self.random_numbers_host = self.getRandomNumbers()
        elif not use_existing_CPU_random_numbers:
            self.generateNormalDistributionCPU()
        
        # generates perturbation (d_eta[ny+4, nx+4], d_hu[ny, nx] and d_hv[ny, nx])
        d_eta, d_hu, d_hv = self._obtainOceanPerturbations_CPU(H, f, beta, g)
        
        interior = [-ghost_cells_y, -ghost_cells_x, ghost_cells_y, ghost_cells_x]
        for i in range(4):
            if interior[i] == 0:
                interior[i] = None
        
        eta[interior[2]:interior[0], interior[3]:interior[1]] += d_eta[2:-2, 2:-2]
        hu[interior[2]:interior[0], interior[3]:interior[1]] += d_hu
        hv[interior[2]:interior[0], interior[3]:interior[1]] += d_hv
    
    
     
    
    # ------------------------------
    # CPU utility functions:
    # ------------------------------
    
    def _lcg(self, seed):
        modulo = np.uint64(2147483647)
        seed = np.uint64(((seed*1103515245) + 12345) % modulo) #0x7fffffff
        return seed / 2147483648.0, seed
    
    def _boxMuller(self, seed_in):
        seed = np.uint64(seed_in)
        u1, seed = self._lcg(seed)
        u2, seed = self._lcg(seed)
        r = np.sqrt(-2.0*np.log(u1))
        theta = 2*np.pi*u2
        n1 = r*np.cos(theta)
        n2 = r*np.sin(theta)
        return n1, n2, seed
    
    def _CPUUpdateRandom(self, normalDist):
        """
        Updating the random number buffer at the CPU.
        normalDist: Boolean parameter. 
            If True, the random numbers are from N(0,1)
            If False, the random numbers are from U[0,1]
        """
        if not self.use_lcg:
            if normalDist:
                self.generateNormalDistribution()
            else:
                self.generateUniformDistribution()
            self.random_numbers_host = self.getRandomNumbers()
            return
        
        #(ny, nx) = seed.shape
        #(domain_ny, domain_nx) = random.shape
        b_dim_x = self.local_size[0]
        b_dim_y = self.local_size[1]
        blocks_x = self.global_size_random_numbers[0]
        blocks_y = self.global_size_random_numbers[1]
        for by in range(blocks_y):
            for bx in range(blocks_x):
                for j in range(b_dim_y):
                    for i in range(b_dim_x):

                        ## Content of kernel:
                        y = b_dim_y*by + j # thread_id
                        x = b_dim_x*bx + i # thread_id
                        if (x < self.seed_nx and y < self.seed_ny):
                            n1, n2 = 0.0, 0.0
                            if normalDist:
                                n1, n2, self.host_seed[y,x]   = self._boxMuller(self.host_seed[y,x])
                            else:
                                n1, self.host_seed[y,x] = self._lcg(self.host_seed[y,x])
                                n2, self.host_seed[y,x] = self._lcg(self.host_seed[y,x])
                                
                            if x*2 + 1 < self.rand_nx:
                                self.random_numbers_host[y, x*2  ] = n1
                                self.random_numbers_host[y, x*2+1] = n2
                            elif x*2 == self.rand_nx:
                                self.random_numbers_host[y, x*2] = n1
    
    def _SOAR_Q_CPU(self, a_x, a_y, b_x, b_y):
        """
        CPU implementation of a SOAR covariance function between grid points
        (a_x, a_y) and (b_x, b_y)
        """
        dist = np.sqrt(  self.coarse_dx*self.coarse_dx*(a_x - b_x)**2  
                       + self.coarse_dy*self.coarse_dy*(a_y - b_y)**2 )
        return self.soar_q0*(1.0 + dist/self.soar_L)*np.exp(-dist/self.soar_L)
    
    def _applyQ_CPU(self, perturbation_scale=1):
        #xi, dx=1, dy=1, q0=0.1, L=1, cutoff=5):
        """
        Create the perturbation field for eta based on the SOAR covariance 
        structure.
        
        The resulting size is (coarse_nx+4, coarse_ny+4), as two ghost cells are required to 
        do bicubic interpolation of the result.
        """
                        
        # Assume in a GPU setting - we read xi into shared memory with ghostcells
        # Additional cutoff number of ghost cells required to calculate SOAR contribution
        ny_halo = int(self.coarse_ny + (2 + self.cutoff)*2)
        nx_halo = int(self.coarse_nx + (2 + self.cutoff)*2)
        local_xi = np.zeros((ny_halo, nx_halo))
        for j in range(ny_halo):
            global_j = j
            if self.periodicNorthSouth:
                global_j = (j - self.cutoff - 2) % self.rand_ny
            for i in range(nx_halo):
                global_i = i
                if self.periodicEastWest:
                    global_i = (i - self.cutoff - 2) % self.rand_nx
                local_xi[j,i] = self.random_numbers_host[global_j, global_i]
                
        # Sync threads
        
        # Allocate output buffer
        Qxi = np.zeros((self.coarse_ny+4, self.coarse_nx+4))
        for a_y in range(self.coarse_ny+4):
            for a_x in range(self.coarse_nx+4):
                # This is a OpenCL thread (a_x, a_y)
                local_a_x = a_x + self.cutoff
                local_a_y = a_y + self.cutoff
                
                #############
                #Qxi[a_y, a_x] = local_xi[local_a_y, local_a_x]
                #continue
                #############
                
                
                start_b_y = local_a_y - self.cutoff
                end_b_y =  local_a_y + self.cutoff+1
                start_b_x = local_a_x - self.cutoff
                end_b_x =  local_a_x + self.cutoff+1

                Qx = 0.0
                for b_y in range(start_b_y, end_b_y):
                    for b_x in range(start_b_x, end_b_x):
                        Q = self._SOAR_Q_CPU(local_a_x, local_a_y, b_x, b_y)
                        Qx += Q*local_xi[b_y, b_x]
                Qxi[a_y, a_x] = perturbation_scale*Qx
        
        return Qxi
    
    
    def _obtainOceanPerturbations_CPU(self, H, f, beta, g, perturbation_scale=1):
        # Obtain perturbed eta - size (coarse_ny+4, coarse_nx+4)
        d_eta = self._applyQ_CPU(perturbation_scale)

        # Interpolate if the coarse grid is not the same as the computational grid
        # d_eta then becomes (ny+4, nx+4)
        if self.interpolation_factor > 1:
            d_eta = self._interpolate_CPU(d_eta)
        
        ####
        # Global sync (currently)
        #     Can be made into a local sync, as long as d_eta is given 
        #     periodic overlap (1 more global computated ghost cell)
        ####

        d_hu = np.zeros((self.ny, self.nx))
        d_hv = np.zeros((self.ny, self.nx))

        ### Find H_mid:
        # Read global H (def on intersections) to local, find H_mid
        # The local memory can then be reused to something else (perhaps use local_d_eta before computing local_d_eta?)
        H_mid = np.zeros((self.ny, self.nx))
        for j in range(self.ny):
            for i in range(self.nx):
                H_mid[j,i] = 0.25* (H[j,i] + H[j+1, i] + H[j, i+1] + H[j+1, i+1])
        
        ####
        # Local sync
        ####

        # Compute geostrophically balanced (hu, hv) for each cell within the domain
        for j in range(0, self.ny):
            local_j = j + 2     # index in d_eta buffer
            coriolis = f + beta*local_j*self.dy
            for i in range(0, self.nx):
                local_i = i + 2    # index in d_eta buffer
                h_mid = d_eta[local_j,local_i] + H_mid[j, i]
                
                ##############
                #h_mid = H_mid[j, i]
                ##############
                
                
                eta_diff_y = (d_eta[local_j+1, local_i] - d_eta[local_j-1, local_i])/(2.0*self.dy)
                d_hu[j,i] = -(g/coriolis)*h_mid*eta_diff_y

                eta_diff_x = (d_eta[local_j, local_i+1] - d_eta[local_j, local_i-1])/(2.0*self.dx)
                d_hv[j,i] = (g/coriolis)*h_mid*eta_diff_x   
    
        return d_eta, d_hu, d_hv
    
    
    
    def _interpolate_CPU(self, coarse_eta, interpolation_order=3):
        """
        Interpolates values coarse_eta defined on the coarse grid onto the computational grid.
        Input coarse_eta is of size [coarse_ny+4, coarse_nx+4], and output will be given as
        eta [ny+4, nx+4].
        """

        
        # Create buffers for eta, hu and hv:
        d_eta = np.zeros((self.ny+4, self.nx+4))
      
        
        
        
        min_rel_x = 10
        max_rel_x = -10
        min_rel_y = 10
        max_rel_y = -10
        
        # Loop over internal cells and first ghost cell layer.
        for loc_j in range(self.ny+2):
            for loc_i in range(self.nx+2):
                
                # index in resulting d_eta buffer
                i = loc_i + 1
                j = loc_j + 1

                # Position of cell center in fine grid:
                x = (i - 2 + 0.5)*self.dx
                y = (j - 2 + 0.5)*self.dy

                # Location in coarse grid (defined in course grid's cell centers)
                # (coarse_i, coarse_j) is the first coarse grid point towards lower left.
                coarse_i = int(np.floor(x/self.coarse_dx + 2 - 0.5))
                coarse_j = int(np.floor(y/self.coarse_dy + 2 - 0.5))
                
                # Position of the coarse grid point
                coarse_x = (coarse_i - 2 + 0.5)*self.coarse_dx
                coarse_y = (coarse_j - 2 + 0.5)*self.coarse_dy
                
                assert coarse_x <= x
                assert coarse_x + self.coarse_dx >= x

                rel_x = (x - coarse_x)/self.coarse_dx
                rel_y = (y - coarse_y)/self.coarse_dy
                
                if rel_x < min_rel_x:
                    min_rel_x = rel_x
                if rel_x > max_rel_x:
                    max_rel_x = rel_x
                if rel_y < min_rel_y:
                    min_rel_y = rel_y
                if rel_y > max_rel_y:
                    max_rel_y = rel_y

                assert rel_x >= 0 and rel_x < 1
                assert rel_y >= 0 and rel_y < 1
                    
                d_eta[j,i] = self._bicubic_interpolation_inner(coarse_eta, coarse_i, coarse_j, rel_x, rel_y, interpolation_order)

        return d_eta
        
        
    def _bicubic_interpolation_inner(self, coarse_eta, coarse_i, coarse_j, rel_x, rel_y, interpolation_order=3):
         # Matrix needed to find the interpolation coefficients
        bicubic_matrix = np.matrix([[ 1,  0,  0,  0], 
                                    [ 0,  0,  1,  0], 
                                    [-3,  3, -2, -1],
                                    [ 2, -2,  1,  1]])
        
        f00   =  coarse_eta[coarse_j  , coarse_i  ]
        f01   =  coarse_eta[coarse_j+1, coarse_i  ]
        f10   =  coarse_eta[coarse_j  , coarse_i+1]
        f11   =  coarse_eta[coarse_j+1, coarse_i+1]

        fx00  = (coarse_eta[coarse_j  , coarse_i+1] - coarse_eta[coarse_j  , coarse_i-1])/2
        fx01  = (coarse_eta[coarse_j+1, coarse_i+1] - coarse_eta[coarse_j+1, coarse_i-1])/2       
        fx10  = (coarse_eta[coarse_j  , coarse_i+2] - coarse_eta[coarse_j  , coarse_i  ])/2    
        fx11  = (coarse_eta[coarse_j+1, coarse_i+2] - coarse_eta[coarse_j+1, coarse_i  ])/2      

        fy00  = (coarse_eta[coarse_j+1, coarse_i  ] - coarse_eta[coarse_j-1, coarse_i  ])/2
        fy01  = (coarse_eta[coarse_j+2, coarse_i  ] - coarse_eta[coarse_j  , coarse_i  ])/2       
        fy10  = (coarse_eta[coarse_j+1, coarse_i+1] - coarse_eta[coarse_j-1, coarse_i+1])/2       
        fy11  = (coarse_eta[coarse_j+2, coarse_i+1] - coarse_eta[coarse_j  , coarse_i+1])/2       

        fy_10 = (coarse_eta[coarse_j+1, coarse_i-1] - coarse_eta[coarse_j-1, coarse_i-1])/2
        fy_11 = (coarse_eta[coarse_j+2, coarse_i-1] - coarse_eta[coarse_j  , coarse_i-1])/2
        fy20  = (coarse_eta[coarse_j+1, coarse_i+2] - coarse_eta[coarse_j-1, coarse_i+2])/2
        fy21  = (coarse_eta[coarse_j+2, coarse_i+2] - coarse_eta[coarse_j  , coarse_i+2])/2

        fxy00 = (fy10 - fy_10)/2
        fxy01 = (fy11 - fy_11)/2
        fxy10 = (fy20 -  fy00)/2
        fxy11 = (fy21 -  fy01)/2


        f_matrix = np.matrix([[ f00,  f01,  fy00,  fy01],
                              [ f10,  f11,  fy10,  fy11],
                              [fx00, fx01, fxy00, fxy01],
                              [fx10, fx11, fxy10, fxy11] ])

        a_matrix = np.dot(bicubic_matrix, np.dot(f_matrix, bicubic_matrix.transpose()))
        
        x_vec = np.matrix([1.0, rel_x, rel_x*rel_x, rel_x*rel_x*rel_x])
        y_vec = np.matrix([1.0, rel_y, rel_y*rel_y, rel_y*rel_y*rel_y]).transpose()

        if interpolation_order == 0:
            # Flat average:
            return 0.25*(f00 + f01 + f10 + f11)

        elif interpolation_order == 1:
            # Linear interpolation:
            return f00*(1-rel_x)*(1-rel_y) + f10*rel_x*(1-rel_y) + f01*(1-rel_x)*rel_y + f11*rel_x*rel_y

        elif interpolation_order == 3:
            # Bicubic interpolation (make sure that we return a float)
            return np.dot(x_vec, np.dot(a_matrix, y_vec))[0, 0]
Exemple #14
0
class PyCudaHandler(Handler):
    __undescribed__ = {'context', 'dtype', 'EMPTY', 'rnd'}

    def __init__(self, seed=None):
        super(PyCudaHandler, self).__init__()
        self.dtype = np.float32
        self.context = cumisc._global_cublas_handle
        self.EMPTY = gpuarray.zeros((), dtype=self.dtype)
        if seed is None:
            seed = global_rnd.generate_seed()

        def get_seeds(n):
            return gpuarray.to_gpu(np.ones(n, np.int32) * seed)
        self.rnd = XORWOWRandomNumberGenerator(seed_getter=get_seeds)

    array_type = pycuda.gpuarray.GPUArray

    def __init_from_description__(self, description):
        self.__init__()

    def _get_gridsize(self, n):
        min_threads = 32
        max_threads = 256
        max_blocks = 384

        if n < min_threads:
            block_count = 1
            threads_per_block = min_threads
        elif n < (max_blocks * min_threads):
            block_count = (n + min_threads - 1) // min_threads
            threads_per_block = min_threads
        elif n < (max_blocks * max_threads):
            block_count = max_blocks
            grp = (n + min_threads - 1) // min_threads
            threads_per_block = ((grp + max_blocks - 1) // max_blocks) * min_threads
        else:
            block_count = max_blocks
            threads_per_block = max_threads

        return (block_count, 1), (threads_per_block, 1, 1)

    # ------------------------- Allocate new memory ------------------------- #

    def allocate(self, size):
        return gpuarray.zeros(size, dtype=self.dtype)

    def ones(self, shape):
        a = self.zeros(shape)
        self.fill(a, 1.0)
        return a

    def zeros(self, shape):
        return gpuarray.zeros(shape=shape, dtype=self.dtype)

    # ---------------------------- Copy and Fill ---------------------------- #

    def copy_to(self, src, dest):
        # Copy data from src to dest (both must be GPUArrays)
        pycuda.driver.memcpy_dtod(dest.gpudata, src.gpudata, dest.nbytes)

    def copy_to_if(self, src, dest, cond):
        copy_to_if_kernel(src, dest, cond)

    def create_from_numpy(self, arr):
        return gpuarray.to_gpu(arr.astype(self.dtype))

    def fill(self, mem, val):
        mem.fill(val)

    def fill_if(self, mem, val, cond):
        fill_if_kernel(mem, val, cond)

    def get_numpy_copy(self, mem):
        assert type(mem) == self.array_type
        return mem.get()

    def set_from_numpy(self, mem, arr):
        assert mem.shape == arr.shape, "Shape of destination ({}) != Shape " \
                                       "of source ({})".format(mem.shape,
                                                               arr.shape)
        mem.set(arr.astype(self.dtype))

    # ---------------------------- Debug helpers ---------------------------- #

    def is_fully_finite(self, a):
        temp = gpuarray.zeros_like(a)
        check_inf_or_nan_kernel(a, temp)
        return not np.any(temp.get())

    # ----------------------- Mathematical operations ----------------------- #

    def abs_t(self, a, out):
        cumath.fabs(a, out=out)

    def add_into_if(self, a, out, cond):
        add_into_if_kernel(a, out, cond)

    def add_mv(self, m, v, out):
        cumisc.add_matvec(m, v, out=out)

    def add_st(self, s, t, out):
        add_st_kernel(s, t, out)

    def add_tt(self, a, b, out):
        add_mm_kernel(a, b, out)

    def avgpool2d_backward_batch(self, inputs, window, outputs, padding,
                                 stride, in_deltas, out_deltas):
        n, h, w, c = inputs.shape
        o_h, o_w = outputs.shape[1], outputs.shape[2]
        _avepool_bwd_fp32_impl(np.int32(inputs.size), out_deltas,
                               np.int32(n), np.int32(h),
                               np.int32(w), np.int32(c),
                               np.int32(o_h), np.int32(o_w),
                               np.int32(window[0]), np.int32(window[1]),
                               np.int32(stride[0]), np.int32(stride[1]),
                               np.int32(padding), np.int32(padding),
                               in_deltas,
                               block=(get_blocks(inputs.size), 1, 1),
                               grid=(NUM_CUDA_THREADS, 1, 1))

    def avgpool2d_forward_batch(self, inputs, window, outputs, padding,
                                stride):
        n, h, w, c = inputs.shape
        o_h, o_w = outputs.shape[1], outputs.shape[2]
        _avepool_fwd_fp32_impl(np.int32(outputs.size), inputs,
                               np.int32(n), np.int32(h),
                               np.int32(w), np.int32(c),
                               np.int32(o_h), np.int32(o_w),
                               np.int32(window[0]), np.int32(window[1]),
                               np.int32(stride[0]), np.int32(stride[1]),
                               np.int32(padding), np.int32(padding),
                               outputs,
                               block=(get_blocks(outputs.size), 1, 1),
                               grid=(NUM_CUDA_THREADS, 1, 1))

    def binarize_v(self, v, out):
        binarize_v_kernel(out, v, out.shape[0], out.shape[1])

    def broadcast_t(self, a, axis, out):
        broadcast_dim = int(out.shape[axis])
        stride = int(np.prod(out.shape[axis+1:]))
        broadcast_t_kernel(out, a, broadcast_dim, stride)

    def clip_t(self, a, a_min, a_max, out):
        clip_kernel(a, out, a_min, a_max)

    def conv2d_backward_batch(self, inputs, params, padding, stride,
                              in_deltas, out_deltas, dparams, dbias):
        num_filters = params.shape[0]
        num_images, input_rows, input_cols, num_input_maps = inputs.shape
        kernel_shape = params.shape[1:]
        num_output_pixels = out_deltas.shape[1] * out_deltas.shape[2]
        num_kernel_params = np.prod(kernel_shape)

        dparams.fill(0.0)
        dbias.fill(0.0)
        tmp = self.zeros(dbias.shape)
        col = self.zeros((num_output_pixels, num_kernel_params))

        for i in range(num_images):
            num_cuda_kernels = num_output_pixels * num_input_maps

            _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i],
                              np.int32(input_rows), np.int32(input_cols),
                              np.int32(kernel_shape[0]),
                              np.int32(kernel_shape[1]),
                              np.int32(padding), np.int32(padding),
                              np.int32(stride[0]), np.int32(stride[1]),
                              np.int32(out_deltas.shape[2]),
                              np.int32(num_input_maps),
                              col.gpudata,
                              block=(get_blocks(num_cuda_kernels), 1, 1),
                              grid=(NUM_CUDA_THREADS, 1, 1))

            # Compute gradients
            reshaped_dparams = dparams.reshape(num_filters, num_kernel_params)
            reshaped_out_deltas = out_deltas[i].reshape((num_output_pixels,
                                                         num_filters))
            self.dot_add_mm(reshaped_out_deltas, col, out=reshaped_dparams,
                            transa=True)

            self.sum_t(reshaped_out_deltas, axis=0, out=tmp)
            self.add_tt(tmp, dbias, out=dbias)

            # Compute in_deltas
            reshaped_params = params.reshape((num_filters, num_kernel_params))
            self.dot_mm(reshaped_out_deltas, reshaped_params, out=col)
            num_cuda_kernels = input_rows * input_cols * num_input_maps
            _col2im_fp32_impl(np.int32(num_cuda_kernels), col.gpudata,
                              np.int32(input_cols), np.int32(num_input_maps),
                              np.int32(kernel_shape[0]),
                              np.int32(kernel_shape[1]),
                              np.int32(padding), np.int32(padding),
                              np.int32(stride[0]), np.int32(stride[1]),
                              np.int32(out_deltas.shape[1]),
                              np.int32(out_deltas.shape[2]),
                              in_deltas[i],
                              block=(get_blocks(num_cuda_kernels), 1, 1),
                              grid=(NUM_CUDA_THREADS, 1, 1))

    def conv2d_forward_batch(self, inputs, params, bias, outputs,
                             padding, stride):
        num_filters = params.shape[0]
        num_images, input_rows, input_cols, num_input_maps = inputs.shape
        kernel_shape = params.shape[1:]
        num_output_pixels = outputs.shape[1] * outputs.shape[2]
        num_kernel_params = np.prod(kernel_shape)
        out_shape = (num_output_pixels, num_filters)
        num_cuda_kernels = num_output_pixels * num_input_maps

        for i in range(num_images):
            col = self.zeros((num_output_pixels, num_kernel_params))
            _im2col_fp32_impl(np.int32(num_cuda_kernels), inputs[i],
                              np.int32(input_rows), np.int32(input_cols),
                              np.int32(kernel_shape[0]),
                              np.int32(kernel_shape[1]),
                              np.int32(padding), np.int32(padding),
                              np.int32(stride[0]), np.int32(stride[1]),
                              np.int32(outputs.shape[2]),
                              np.int32(num_input_maps),
                              col.gpudata,
                              block=(get_blocks(num_cuda_kernels), 1, 1),
                              grid=(NUM_CUDA_THREADS, 1, 1))

            reshaped_params = params.reshape(num_filters, num_kernel_params)
            culinalg.dot(col, reshaped_params, transb='T',
                         out=outputs[i].reshape(out_shape))

        flat_outputs = flatten_all_but_last(outputs)
        self.add_mv(flat_outputs, bias, flat_outputs)

    def dot_add_mm(self, a, b, out, transa=False, transb=False):
        transa = 'T' if transa else 'N'
        transb = 'T' if transb else 'N'
        culinalg.add_dot(a, b, out, transa, transb)

    def dot_mm(self, a, b, out, transa=False, transb=False):
        transa = 'T' if transa else 'N'
        transb = 'T' if transb else 'N'
        culinalg.dot(a, b, transa=transa, transb=transb, out=out)

    def divide_mv(self, m, v, out):
        cumisc.div_matvec(m, v, out=out)

    def divide_tt(self, a, b, out):
        div_kernel(a, b, out)

    def fill_gaussian(self, mean, std, out):
        self.rnd.fill_normal(out)
        self.mult_st(std, out, out=out)
        self.add_st(mean, out, out=out)

    def generate_probability_mask(self, mask, probability):
        self.rnd.fill_uniform(mask)
        create_probabilistic_mask_kernel(mask, probability, mask)

    def index_m_by_v(self, m, v, out):
        index_m_by_v_kernel(out, v, m, m.shape[0], m.shape[1])

    def log_t(self, a, out):
        cumath.log(a, out=out)

    def maxpool2d_backward_batch(self, inputs, window, outputs, padding,
                                 stride, argmax, in_deltas, out_deltas):
        in_image_size = inputs.size // inputs.shape[0]
        out_image_size = outputs.size // outputs.shape[0]
        _maxpool_bwd_fp32_impl(np.int32(outputs.size), out_deltas,
                               argmax,
                               np.int32(out_image_size),
                               np.int32(in_image_size),
                               in_deltas,
                               block=(get_blocks(outputs.size), 1, 1),
                               grid=(NUM_CUDA_THREADS, 1, 1))

    def maxpool2d_forward_batch(self, inputs, window, outputs, padding,
                                stride, argmax):
        n, h, w, c = inputs.shape
        o_h, o_w = outputs.shape[1], outputs.shape[2]
        _maxpool_fwd_fp32_impl(np.int32(outputs.size), inputs,
                               np.int32(h), np.int32(w), np.int32(c),
                               np.int32(o_h), np.int32(o_w),
                               np.int32(window[0]), np.int32(window[1]),
                               np.int32(stride[0]), np.int32(stride[1]),
                               np.int32(padding), np.int32(padding),
                               outputs,
                               argmax,
                               block=(get_blocks(outputs.size), 1, 1),
                               grid=(NUM_CUDA_THREADS, 1, 1))

    def merge_tt(self, a, b, out):
        assert(a.shape[-1] + b.shape[-1] == out.shape[-1])
        n = int(np.prod(out.shape[:-1]))
        grid, block = self._get_gridsize(n)
        _merge_impl(a.gpudata, b.gpudata, out.gpudata,
                    np.int32(n), np.int32(a.shape[-1]), np.int32(b.shape[-1]),
                    block=block, grid=grid)

    def modulo_tt(self, a, b, out):
        modulo_tt_kernel(a, b, out)

    def mult_add_st(self, s, t, out):
        mult_add_st_kernel(s, t, out)

    def mult_add_tt(self, a, b, out):
        mult_add_kernel(a, b, out)

    def mult_mv(self, m, v, out):
        if m.shape == v.shape:
            self.mult_tt(m, v, out=out)
        else:
            cumisc.mult_matvec(m, v, out=out)

    def mult_add_mv(self, m, v, out):
        if m.shape == v.shape:
            self.mult_add_tt(m, v, out=out)
        else:
            tmp = self.allocate(out.shape)
            cumisc.mult_matvec(m, v, out=tmp)
            self.add_tt(tmp, out, out=out)

    def mult_st(self, s, t, out):
        mult_st_kernel(s, t, out)

    def mult_tt(self, a, b, out):
        mult_tt_kernel(a, b, out)

    def sign_t(self, a, out):
        sign_kernel(a, out)

    def split_add_tt(self, x, out_a, out_b):
        assert(out_a.shape[-1] + out_b.shape[-1] == x.shape[-1])
        n = int(np.prod(x.shape[:-1]))
        grid, block = self._get_gridsize(n)
        _split_add_impl(x.gpudata, out_a.gpudata, out_b.gpudata,
                        np.int32(n), np.int32(out_a.shape[-1]),
                        np.int32(out_b.shape[-1]),
                        block=block, grid=grid)

    def sqrt_t(self, a, out):
        cumath.sqrt(a, out)

    def subtract_mv(self, m, v, out):
        cumisc.binaryop_matvec('-', m, v, None, out, None)

    def subtract_tt(self, a, b, out):
        subtract_mm_kernel(a, b, out)

    def sum_t(self, a, axis, out):
        if len(a.shape) < 3 and (axis == 0 or axis == 1):
            cumisc.sum(a, axis, out)
        elif axis is None:
            self.copy_to(cumisc.sum(a), out)
        else:
            raise NotImplementedError

    # ------------------------ Activation functions ------------------------- #

    def rel(self, x, y):
        rel_kernel(x, y)

    def rel_deriv(self, x, y, dy, dx):
        rel_deriv_kernel(x, y, dy, dx)

    def sigmoid(self, x, y):
        sigmoid_kernel(x, y)

    def sigmoid_deriv(self, x, y, dy, dx):
        sigmoid_deriv_kernel(x, y, dy, dx)

    def softmax_m(self, m, out):
        n, k = m.shape
        tmp = gpuarray.empty((1, n), dtype=m.dtype)
        _softmax_impl(m, tmp.gpudata, out, np.int32(n),
                      np.int32(k), block=(32, 1, 1), grid=(n, 1, 1))
        return out

    def tanh(self, x, y):
        tanh_kernel(x, y)

    def tanh_deriv(self, x, y, dy, dx):
        tanh_deriv_kernel(x, y, dy, dx)