Example #1
def calc_x_G(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si, Vsh, handle=None):
    D = A[0].shape[1]
    Dm1 = A[0].shape[0]
    q = len(A)
    
    x = garr.zeros((Dm1, q * D - Dm1), dtype=A[0].dtype)
    x_part = garr.empty_like(x)
    x_subpart = garr.empty_like(A[0])
    
    if not (C is None and Kp1 is None):
        assert (C is not None) and (Kp1 is not None)
        x_part.fill(0)
        for s in range(q):
            x_subpart = eps_r(rp1, C[s], Ap1, x_subpart, handle) #~1st line
            
            x_subpart += cla.dot(A[s], Kp1, handle=handle) #~3rd line
    
            x_part += cla.dot(cla.dot(x_subpart, r_si, handle=handle), Vsh[s], handle=handle)

        x += cla.dot(lm1_s, x_part, handle=handle)

    if lm2 is not None:
        x_part.fill(0)
        for s in range(q):     #~2nd line
            x_subpart = eps_l(lm2, Am1, Cm1[s], x_subpart, handle)
            x_part += cla.dot(x_subpart, cla.dot(r_s, Vsh[s], handle=handle), handle=handle)
        x += cla.dot(lm1_si, x_part, handle=handle)
        
    return x
Example #2
def gpubarlinedata(xdata, ydata, bins, minval=None, maxval=None):
    if maxval is None:
        maxval = gpumax(xdata)
    if minval is None:
        minval = gpumin(xdata)
    binsize = (maxval - minval) / float(bins)
    inbin = gpuarray.empty_like(xdata)
    select = gpuarray.empty_like(xdata)
    xmeans = []
    ymeans = []
    errors = []
    for i in xrange(bins):
        lo = minval + binsize * i
        hi = minval + binsize * (i + 1)
        gpubarlinekerna(xdata, lo, hi, inbin)
        N = gpusum(inbin)
        if N > 1:
            gpubarlinekernb(inbin, ydata, select)
            my = gpusum(select) / float(N)
            gpubarlinekernb(inbin, xdata, select)
            mx = gpusum(select) / float(N)
            gpubarlinekernc(inbin, ydata, my, select)
            s = sqrt(gpusum(select) / (N * (N - 1)))
            xmeans.append(mx)
            ymeans.append(my)
            errors.append(s)
    return (xmeans, ymeans, errors)
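A minimal usage sketch for gpubarlinedata, assuming xdata and ydata are float32 GPUArrays and that the gpu* helpers it calls (gpumax, gpumin, gpusum, gpubarlinekerna/b/c) are defined elsewhere in the same module:

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

x = gpuarray.to_gpu(np.random.rand(10000).astype(np.float32))
y = gpuarray.to_gpu(np.random.rand(10000).astype(np.float32))

# Bin x into 20 equal-width bins; get per-bin means of x and y and the standard error of y
xmeans, ymeans, errors = gpubarlinedata(x, y, bins=20)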
Example #3
def integrate(stepsize=0.01, stores=5, steps=10000, number_of_particles=2 ** 10):
    gpu_r, gpu_v, gpu_mass = create_particles(number_of_particles)
    number_of_particles = np.int32(number_of_particles)
    gpu_rs, gpu_vs = [gpu_r], [gpu_v]

    # Ring buffer of position/velocity stores on the GPU
    for i in xrange(stores - 1):
        gpu_rs.append(gpuarray.empty_like(gpu_r))
        gpu_vs.append(gpuarray.empty_like(gpu_v))

    advance = SourceModule(advance_kernel).get_function("advance")
    advance.prepare([np.intp, np.intp, np.intp, np.intp, np.intp, np.int32])

    block_size = (32, 1, 1)
    grid_size = (int(number_of_particles) // 32, 1)

    # prepared_call takes the grid first, then the block; pass device pointers
    advance.prepared_call(grid_size, block_size,
                          gpu_rs[0].gpudata, gpu_vs[0].gpudata, gpu_mass.gpudata,
                          gpu_rs[1].gpudata, gpu_vs[1].gpudata, number_of_particles)

    old, new = 1, 2
    for i in xrange(steps):
        # Download the most recently completed step while the next one is computed
        r = gpu_rs[old].get_async()
        v = gpu_vs[old].get_async()
        advance.prepared_async_call(grid_size, block_size, None,
                                    gpu_rs[old].gpudata, gpu_vs[old].gpudata, gpu_mass.gpudata,
                                    gpu_rs[new].gpudata, gpu_vs[new].gpudata, number_of_particles)

        # np.write does not exist; write raw binary instead (file format is an assumption)
        r.tofile("step{:08.3f}_r.dat".format(i * stepsize))
        v.tofile("step{:08.3f}_v.dat".format(i * stepsize))

        old, new = new, (new + 1) % stores
Example #4
def integrate(stepsize=.01, stores=5, steps=10000, number_of_particles=2**10):
    gpu_r, gpu_v, gpu_mass = create_particles(number_of_particles)
    number_of_particles = np.int32(number_of_particles)
    gpu_rs, gpu_vs = [gpu_r], [gpu_v]

    for i in xrange(stores-1):
        gpu_rs.append(gpuarray.empty_like(gpu_r))
        gpu_vs.append(gpuarray.empty_like(gpu_v))

    advance = SourceModule(advance_kernel).get_function("advance")
    advance.prepare([np.intp, np.intp, np.intp, np.intp, np.intp, np.int32])

    block_size = (32, 1, 1)
    grid_size = (int(number_of_particles)//32, 1)

    # Grid comes before block in prepared_call; pass device pointers, not GPUArrays
    advance.prepared_call(grid_size, block_size, gpu_rs[0].gpudata, gpu_vs[0].gpudata, gpu_mass.gpudata, gpu_rs[1].gpudata, gpu_vs[1].gpudata, number_of_particles)

    old, new = 1, 2
    for i in xrange(steps):
        r = gpu_rs[old].get_async()
        v = gpu_vs[old].get_async()
        advance.prepared_async_call(grid_size, block_size, None, gpu_rs[old].gpudata, gpu_vs[old].gpudata, gpu_mass.gpudata, gpu_rs[new].gpudata, gpu_vs[new].gpudata, number_of_particles)

        # np.write does not exist; write raw binary instead (file format is an assumption)
        r.tofile("step{:08.3f}_r.dat".format(i*stepsize))
        v.tofile("step{:08.3f}_v.dat".format(i*stepsize))

        old, new = new, (new+1)%stores
Example #5
def cufft_conv(x, y):

    x = x.astype(np.complex64)
    y = y.astype(np.complex64)

    if (x.shape != y.shape):
        return -1

    plan = fft.Plan(x.shape, np.complex64, np.complex64)
    inverse_plan = fft.Plan(x.shape, np.complex64, np.complex64)

    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)

    x_fft = gpuarray.empty_like(x_gpu, dtype=np.complex64)
    y_fft = gpuarray.empty_like(y_gpu, dtype=np.complex64)
    out_gpu = gpuarray.empty_like(x_gpu, dtype=np.complex64)

    fft.fft(x_gpu, x_fft, plan)
    fft.fft(y_gpu, y_fft, plan)

    linalg.multiply(x_fft, y_fft, overwrite=True)
    fft.ifft(y_fft, out_gpu, inverse_plan, scale=True)
    conv_out = out_gpu.get()

    x_gpu.gpudata.free()
    y_gpu.gpudata.free()
    x_fft.gpudata.free()
    y_fft.gpudata.free()
    out_gpu.gpudata.free()

    return conv_out
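A hedged usage sketch for cufft_conv, assuming its module-level dependencies are in place (numpy as np, pycuda.autoinit, pycuda.gpuarray as gpuarray, skcuda.fft as fft, skcuda.linalg as linalg with linalg.init() called). The result should match a circular convolution computed via NumPy's FFT, up to single precision:

import numpy as np

x = np.random.rand(64, 64)
y = np.random.rand(64, 64)

out = cufft_conv(x, y)                               # GPU: ifft(fft(x) * fft(y))
ref = np.fft.ifft2(np.fft.fft2(x) * np.fft.fft2(y))  # CPU reference (circular convolution)
print(np.allclose(out, ref, atol=1e-3))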
Example #6
def sici(x_gpu):
    """
    Sine/Cosine integral.

    Computes the sine and cosine integral of every element in the
    input matrix.

    Parameters
    ----------
    x_gpu : GPUArray
        Input matrix of shape `(m, n)`.
        
    Returns
    -------
    (si_gpu, ci_gpu) : tuple of GPUArrays
        Tuple of GPUarrays containing the sine integrals and cosine
        integrals of the entries of `x_gpu`.
        
    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> x = np.array([[1, 2], [3, 4]], np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> (si_gpu, ci_gpu) = sici(x_gpu)
    >>> (si, ci) = scipy.special.sici(x)
    >>> np.allclose(si, si_gpu.get())
    True
    >>> np.allclose(ci, ci_gpu.get())
    True
    """

    if x_gpu.dtype == np.float32:
        args = 'float *x, float *si, float *ci'
        op = 'sicif(x[i], &si[i], &ci[i])'
    elif x_gpu.dtype == np.float64:
        args = 'double *x, double *si, double *ci'
        op = 'sici(x[i], &si[i], &ci[i])'
    else:
        raise ValueError('unsupported type')

    try:
        func = sici.cache[x_gpu.dtype]
    except KeyError:
        func = elementwise.ElementwiseKernel(
            args,
            op,
            options=["-I", install_headers],
            preamble='#include "cuSpecialFuncs.h"')
        sici.cache[x_gpu.dtype] = func

    si_gpu = gpuarray.empty_like(x_gpu)
    ci_gpu = gpuarray.empty_like(x_gpu)
    func(x_gpu, si_gpu, ci_gpu)

    return (si_gpu, ci_gpu)
Example #7
def sici(x_gpu):
    """
    Sine/Cosine integral.

    Computes the sine and cosine integral of every element in the
    input matrix.

    Parameters
    ----------
    x_gpu : GPUArray
        Input matrix of shape `(m, n)`.
        
    Returns
    -------
    (si_gpu, ci_gpu) : tuple of GPUArrays
        Tuple of GPUarrays containing the sine integrals and cosine
        integrals of the entries of `x_gpu`.
        
    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> x = np.array([[1, 2], [3, 4]], np.float32)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> (si_gpu, ci_gpu) = sici(x_gpu)
    >>> (si, ci) = scipy.special.sici(x)
    >>> np.allclose(si, si_gpu.get())
    True
    >>> np.allclose(ci, ci_gpu.get())
    True
    """

    if x_gpu.dtype == np.float32:
        args = 'float *x, float *si, float *ci'
        op = 'sicif(x[i], &si[i], &ci[i])'
    elif x_gpu.dtype == np.float64:
        args = 'double *x, double *si, double *ci'
        op = 'sici(x[i], &si[i], &ci[i])'
    else:
        raise ValueError('unsupported type')
    
    try:
        func = sici.cache[x_gpu.dtype]
    except KeyError:
        func = elementwise.ElementwiseKernel(args, op,
                                 options=["-I", install_headers],
                                 preamble='#include "cuSpecialFuncs.h"')
        sici.cache[x_gpu.dtype] = func

    si_gpu = gpuarray.empty_like(x_gpu)
    ci_gpu = gpuarray.empty_like(x_gpu)
    func(x_gpu, si_gpu, ci_gpu)
        
    return (si_gpu, ci_gpu)
Example #8
    def __init__(self,
                 n_units,
                 n_incoming,
                 N,
                 init_sd=1.0,
                 precision=np.float32,
                 magic_numbers=False):

        self.n_units = n_units
        self.n_incoming = n_incoming
        self.N = N
        w = np.random.normal(0, init_sd, (self.n_incoming, self.n_units))
        b = np.random.normal(0, init_sd, (1, n_units))

        self.weights = gpuarray.to_gpu(w.copy().astype(precision))
        self.gW = gpuarray.empty_like(self.weights)

        # Prior and ID must be set after creation
        self.prior = -1
        self.ID = -1

        self.biases = gpuarray.to_gpu(b.copy().astype(precision))
        self.gB = gpuarray.empty_like(self.biases)

        #Set up momentum variables for HMC sampler
        self.pW = gpuarray.to_gpu(np.random.normal(0, 1, self.gW.shape))
        self.pB = gpuarray.to_gpu(np.random.normal(0, 1, self.gB.shape))

        self.epsW = gpuarray.zeros(self.weights.shape, precision) + 1.0
        self.epsB = gpuarray.zeros(self.biases.shape, precision) + 1.0

        self.precision = precision
        self.outputs = gpuarray.zeros((self.N, self.n_units), precision)

        self.magic_numbers = magic_numbers
        #Define tan_h function on GPU
        if magic_numbers:
            self.tanh = ElementwiseKernel("float *x",
                                          "x[i] = 1.7159 * tanh(2/3*x[i]);",
                                          "tan_h",
                                          preamble="#include <math.h>")
        else:
            self.tanh = ElementwiseKernel(
                "float *x",
                "x[i] = tanh(min(max(-10.0,x[i]),10.0));",
                "tan_h",
                preamble="#include <math.h>")
        #Compile kernels
        kernels = SourceModule(open(path + '/kernels.cu', "r").read())
        self.add_bias_kernel = kernels.get_function("add_bias")

        self.rng = curandom.XORWOWRandomNumberGenerator()

        ##Initialize posterior weights
        self.posterior_weights = list()
        self.posterior_biases = list()
Example #9
def make_GPU_gradient(mesh, context):
    '''Prepare to compute gradient on the GPU w.r.t. the given mesh.
    Return gradient function.
    '''
    mx = int(getattr(mesh, 'nx', 1))
    my = int(getattr(mesh, 'ny', 1))
    mz = int(getattr(mesh, 'nz', 1))

    dxInv = np.array(1./getattr(mesh, 'dx', 1), dtype=np.float64)
    dyInv = np.array(1./getattr(mesh, 'dy', 1), dtype=np.float64)
    dzInv = np.array(1./getattr(mesh, 'dz', 1), dtype=np.float64)

    sizeof_double = 8
    with open(where + 'gradient2.cu') as fdlib:
        source = fdlib.read()
    module = SourceModule(source)

    mx_ptr = module.get_global("mx")[0]
    my_ptr = module.get_global("my")[0]
    mz_ptr = module.get_global("mz")[0]
    cuda.memcpy_htod(mx_ptr, np.array(mx, dtype=np.int32))
    cuda.memcpy_htod(my_ptr, np.array(my, dtype=np.int32))
    cuda.memcpy_htod(mz_ptr, np.array(mz, dtype=np.int32))

    dxInv_ptr = module.get_global("dxInv")[0]
    dyInv_ptr = module.get_global("dyInv")[0]
    dzInv_ptr = module.get_global("dzInv")[0]
    cuda.memcpy_htod(dxInv_ptr, dxInv)
    cuda.memcpy_htod(dyInv_ptr, dyInv)
    cuda.memcpy_htod(dzInv_ptr, dzInv)

    deriv_x = module.get_function("gradient_x")
    deriv_y = module.get_function("gradient_y")
    deriv_z = module.get_function("gradient_z")

    block, grid = mesh.get_domain_decomposition(DeviceData().max_threads)

    d_deriv_x = gpuarray.empty(shape=(1, mesh.n_nodes), dtype=np.float64)
    d_deriv_y = gpuarray.empty_like(d_deriv_x)
    d_deriv_z = gpuarray.empty_like(d_deriv_x)

    def _gradient(scalar_values):
        '''Calculate three-dimensional gradient for GPUArray
        scalar_values.
        '''
        deriv_x(scalar_values, d_deriv_x, block=block, grid=grid)
        deriv_y(scalar_values, d_deriv_y, block=block, grid=grid)
        deriv_z(scalar_values, d_deriv_z, block=block, grid=grid)
        context.synchronize()

        return (d_deriv_x, d_deriv_y, d_deriv_z)[:mesh.dimension]
    return _gradient
Example #10
    def __init__(self,
                 n_classes,
                 n_incoming,
                 N,
                 init_sd=0.1,
                 precision=np.float32):
        self.type = 'Softmax'
        self.n_incoming = n_incoming
        self.N = N
        w = np.random.normal(0, init_sd, (self.n_incoming, n_classes))
        b = np.random.normal(0, init_sd, (1, n_classes))
        self.weights = gpuarray.to_gpu(w.copy().astype(precision))
        self.gW = gpuarray.empty_like(self.weights)
        #print self.weights
        #       print init_sd
        self.biases = gpuarray.to_gpu(b.copy().astype(precision))
        self.gB = gpuarray.empty_like(self.biases)

        # Prior and ID are set later
        self.prior = -1
        self.ID = -1

        #Set up momentum variables for HMC sampler
        self.pW = gpuarray.to_gpu(np.random.normal(0, 1, self.gW.shape))
        self.pB = gpuarray.to_gpu(np.random.normal(0, 1, self.gB.shape))

        #Store stepsizes for each parameter
        self.epsW = gpuarray.zeros(self.weights.shape, precision) + 1.0
        self.epsB = gpuarray.zeros(self.biases.shape, precision) + 1.0

        self.n_classes = n_classes
        self.n_incoming = n_incoming

        self.N = N
        self.outputs = gpuarray.zeros((self.N, self.n_classes), precision)

        self.precision = precision

        kernels = SourceModule(open(path + '/kernels.cu', "r").read())
        self.softmax_kernel = kernels.get_function("softmax")
        self.add_bias_kernel = kernels.get_function("add_bias")

        self.rng = curandom.XORWOWRandomNumberGenerator()

        ##Initialize posterior weights
        self.posterior_weights = list()
        self.posterior_biases = list()

        self.eps_tol = 1e-10
Example #11
 def __init__(self,n_units,n_incoming,N,init_sd=1.0,precision=np.float32,magic_numbers=False):
     
     self.n_units = n_units
     self.n_incoming = n_incoming
     self.N = N
     w = np.random.normal(0,init_sd,(self.n_incoming,self.n_units))
     b = np.random.normal(0,init_sd,(1,n_units))
     
     self.weights = gpuarray.to_gpu(w.copy().astype(precision))
     self.gW = gpuarray.empty_like(self.weights)
     
     # Prior and ID must be set after creation
     self.prior = -1
     self.ID = -1
             
     self.biases = gpuarray.to_gpu(b.copy().astype(precision))
     self.gB = gpuarray.empty_like(self.biases)
         
     #Set up momentum variables for HMC sampler
     self.pW = gpuarray.to_gpu(np.random.normal(0,1,self.gW.shape))
     self.pB = gpuarray.to_gpu(np.random.normal(0,1,self.gB.shape))
     
     self.epsW = gpuarray.zeros(self.weights.shape,precision) + 1.0
     self.epsB = gpuarray.zeros(self.biases.shape,precision) + 1.0        
     
     self.precision = precision
     self.outputs = gpuarray.zeros((self.N,self.n_units),precision)   
     
     self.magic_numbers = magic_numbers
     #Define tan_h function on GPU   
     if magic_numbers:
         self.tanh = ElementwiseKernel(
             "float *x",
             "x[i] = 1.7159 * tanh(2/3*x[i]);",
             "tan_h",preamble="#include <math.h>")
     else:
         self.tanh = ElementwiseKernel(
         "float *x",
         "x[i] = tanh(min(max(-10.0,x[i]),10.0));",
         "tan_h",preamble="#include <math.h>")
     #Compile kernels 
     kernels = SourceModule(open(path+'/kernels.cu', "r").read())        
     self.add_bias_kernel = kernels.get_function("add_bias")
     
     self.rng = curandom.XORWOWRandomNumberGenerator()
     
     ##Initialize posterior weights
     self.posterior_weights = list()
     self.posterior_biases = list()
Example #12
    def feed_forward(self, input_data, prediction=False):
        """Propagate forward through the layer

        **Parameters:**

        input_data : ``GPUArray``
            Input data to perform dropout on.

        prediction : bool, optional
            Whether to use prediction mode. If true, the data is scaled by
            ``1 - dropout_probability`` instead of having dropout applied.

        **Returns:**
        
        dropout_data : ``GPUArray``
            The data after performing dropout.
        """

        if input_data.shape[1] != self.n_in:
            raise ValueError(
                'Number of outputs from previous layer (%d) '
                'does not match number of inputs to this layer (%d)' %
                (input_data.shape[1], self.n_in))

        if not prediction:
            dropout_input = gpuarray.empty_like(input_data)
            dropout_mask = sample_dropout_mask(input_data,
                                               self.dropout_probability,
                                               target=dropout_input)
            return dropout_input, dropout_mask
        else:
            return (input_data * (1 - self.dropout_probability), )
Example #13
def test_cublas_bug():
    '''
    The SGEMM call would cause all calls after it to fail for some unknown
    reason. This is likely caused by swaprows corrupting memory.

    NOTE: this was confirmed by nvidia to be a bug within CUDA, and should be
          fixed in CUDA 6.5
    '''
    from pycuda.driver import Stream
    from skcuda.cublas import cublasSgemm
    from skcuda.misc import _global_cublas_handle as handle

    n = 131

    s = slice(128, n)
    X = gpuarray.to_gpu(np.random.randn(n, 2483).astype(np.float32))
    a = gpuarray.empty((X.shape[1], 3), dtype=np.float32)
    c = gpuarray.empty((a.shape[0], X.shape[1]), dtype=np.float32)
    b = gpuarray.empty_like(X)

    m, n = a.shape[0], b[s].shape[1]
    k = a.shape[1]
    lda = m
    ldb = k
    ldc = m
    #cublasSgemm(handle, 0, 0, m, n, k, 0.0, b.gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
    cublasSgemm(handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata,
                ldb, 0.0, c.gpudata, ldc)
    #print handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc

    #gpuarray.dot(d, Xoutd[s])
    #op.sgemm(a, b[s], c)

    stream = Stream()
    stream.synchronize()
Example #14
def exp1(z_gpu):
    """
    Exponential integral with `n = 1` of complex arguments.

    Parameters
    ----------
    z_gpu : GPUArray
        Input matrix of shape `(m, n)`.

    Returns
    -------
    e_gpu : GPUArray
        GPUarrays containing the exponential integrals of
        the entries of `z_gpu`.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64)
    >>> z_gpu = gpuarray.to_gpu(z)
    >>> e_gpu = exp1(z_gpu)
    >>> e_sp = scipy.special.exp1(z)
    >>> np.allclose(e_sp, e_gpu.get())
    True
    """

    e_gpu = gpuarray.empty_like(z_gpu)
    func = _get_exp1_kernel(z_gpu.dtype)
    func(z_gpu, e_gpu)

    return e_gpu
Example #15
 def computeIrDensity(self, dS_gpu):
     """
     Compute the impulse response density at the time intervals in dS_gpu 
     """
     K = self.modelParams["proc_id_model","K"]
     N = self.base.data.N
     gS_gpu = gpuarray.empty_like(dS_gpu)
     
     # Update GS using the impulse response parameters
     grid_w = int(np.ceil(N/1024.0))
     self.gpuKernels["computeLogisticNormalGSIndiv"](np.int32(K),
                                                     np.int32(self.base.data.N),
                                                     self.gpuPtrs["proc_id_model","C"].gpudata,
                                                     self.base.dSS["rowIndices"].gpudata,
                                                     self.base.dSS["colPtrs"].gpudata,
                                                     self.gpuPtrs["impulse_model","g_mu"].gpudata,
                                                     self.gpuPtrs["impulse_model","g_tau"].gpudata,
                                                     np.float32(self.params["dt_max"]),
                                                     dS_gpu.gpudata,
                                                     gS_gpu.gpudata,
                                                     block=(1024, 1, 1), 
                                                     grid=(grid_w,1)
                                                     )
     
     return gS_gpu
Example #16
    def feed_forward(self, input_data, prediction=False):
        """Propagate forward through the layer

        **Parameters:**

        input_data : ``GPUArray``
            Input data to perform dropout on.

        prediction : bool, optional
            Whether to use prediction mode. If true, the data is scaled by
            ``1 - dropout_probability`` instead of having dropout applied.

        **Returns:**
        
        dropout_data : ``GPUArray``
            The data after performing dropout.
        """

        if input_data.shape[1] != self.n_in:
            raise ValueError('Number of outputs from previous layer (%d) '
                             'does not match number of inputs to this layer (%d)' %
                             (input_data.shape[1], self.n_in))

        if not prediction:
            dropout_input = gpuarray.empty_like(input_data)
            dropout_mask = sample_dropout_mask(input_data,
                                               self.dropout_probability, target=dropout_input
                                           )
            return dropout_input, dropout_mask
        else:
            return (input_data * (1 - self.dropout_probability),)
Example #17
    def _FarnebackUpdateMatrices_gpu(self, R0_gpu, R1_gpu, flow_gpu, M_gpu):

        R1_warped_gpu = gpuarray.empty_like(R1_gpu)

        block = (32, 32, 1)
        grid = (int(divup(flow_gpu.shape[3],
                          block[0])), int(divup(flow_gpu.shape[2],
                                                block[1])), 1)

        for i in range(_NUM_POLY_COEFFICIENTS - 1):
            farneback3d._utils.ndarray_to_float_tex(self._r1_texture,
                                                    R1_gpu[i])
            self._warp_kernel(flow_gpu,
                              R1_warped_gpu[i],
                              np.int32(flow_gpu.shape[3]),
                              np.int32(flow_gpu.shape[2]),
                              np.int32(flow_gpu.shape[1]),
                              np.float32(1),
                              np.float32(1),
                              np.float32(1),
                              block=block,
                              grid=grid)

        self._update_matrices_kernel(R0_gpu,
                                     R1_warped_gpu,
                                     flow_gpu,
                                     M_gpu,
                                     np.int32(flow_gpu.shape[3]),
                                     np.int32(flow_gpu.shape[2]),
                                     np.int32(flow_gpu.shape[1]),
                                     block=block,
                                     grid=grid)
Example #18
def e1z(z_gpu, dev):
    """
    Exponential integral with `n = 1` of complex arguments.

    Parameters
    ----------
    z_gpu : GPUArray
        Input matrix of shape `(m, n)`.
    dev : pycuda.driver.Device
        Device object to be used.
        
    Returns
    -------
    e_gpu : GPUArray
        GPUarrays containing the exponential integrals of
        the entries of `z_gpu`.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64)
    >>> z_gpu = gpuarray.to_gpu(z)
    >>> e_gpu = e1z(z_gpu, pycuda.autoinit.device)
    >>> e_sp = scipy.special.exp1(z)
    >>> np.allclose(e_sp, e_gpu.get())
    True

    """

    if z_gpu.dtype == np.complex64:
        use_double = 0
    elif z_gpu.dtype == np.complex128:
        use_double = 1
    else:
        raise ValueError("unsupported type")

    # Get block/grid sizes:
    max_threads_per_block, max_block_dim, max_grid_dim = get_dev_attrs(dev)
    block_dim, grid_dim = select_block_grid_sizes(dev, z_gpu.shape)
    max_blocks_per_grid = max(max_grid_dim)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir = None
    e1z_mod = SourceModule(
        e1z_mod_template.substitute(
            use_double=use_double, max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid
        ),
        cache_dir=cache_dir,
        options=["-I", install_headers],
    )
    e1z_func = e1z_mod.get_function("e1z")

    e_gpu = gpuarray.empty_like(z_gpu)
    e1z_func(z_gpu.gpudata, e_gpu.gpudata, np.uint32(z_gpu.size), block=block_dim, grid=grid_dim)
    return e_gpu
Example #19
def worker():
    comm = MPI.Comm.Get_parent()
    size = comm.Get_size()
    rank = comm.Get_rank()
    name = MPI.Get_processor_name()

    import pycuda.driver as drv
    drv.init()

    # Find maximum number of available GPUs:
    max_gpus = drv.Device.count()

    # Use modular arithmetic to avoid assigning a nonexistent GPU:
    n = rank % max_gpus
    dev = drv.Device(n)
    ctx = dev.make_context()
    atexit.register(ctx.pop)

    # Execute a kernel:
    import pycuda.gpuarray as gpuarray
    from pycuda.elementwise import ElementwiseKernel
    
    kernel = ElementwiseKernel('double *y, double *x, double a',
                               'y[i] = a*x[i]')
    x_gpu = gpuarray.to_gpu(np.random.rand(2))
    y_gpu = gpuarray.empty_like(x_gpu)
    kernel(y_gpu, x_gpu, np.double(2.0))

    print 'I am process %d of %d on CPU %s using GPU %s of %s [x_gpu=%s, y_gpu=%s]' % \
        (rank, size, name, n, max_gpus, str(x_gpu.get()), str(y_gpu.get()))
    comm.Disconnect()
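For context, a sketch of the parent side that would spawn this worker via mpi4py; the script name and process count are illustrative assumptions:

import sys
from mpi4py import MPI

# Spawn four worker processes; each one picks a GPU by rank modulo the device count.
comm = MPI.COMM_SELF.Spawn(sys.executable, args=['worker.py'], maxprocs=4)
comm.Disconnect()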
Example #20
 def buffer_apply(self, input):
     # TODO: buffer apply to a large input may cause a launch timeout, need to buffer in
     # smaller chunks if this is the case
     b = self.filt_b_gpu
     a = self.filt_a_gpu
     zi = self.filt_state
     if not hasattr(self,
                    'filt_x_gpu') or input.size != self.filt_x_gpu.size:
         self._desiredshape = input.shape
         self._has_run_once = False
         self.filt_x_gpu = gpuarray.to_gpu(input.flatten())
         self.filt_y_gpu = gpuarray.empty_like(self.filt_x_gpu)
     else:
         self.filt_x_gpu.set(input.flatten())
     filt_x_gpu = self.filt_x_gpu
     filt_y_gpu = self.filt_y_gpu
     if self._has_run_once:
         self.gpu_filt_func.launch_grid(*self.grid)
     else:
         self.gpu_filt_func.prepared_call(self.grid, intp(b.gpudata),
                                          intp(a.gpudata),
                                          intp(filt_x_gpu.gpudata),
                                          intp(zi.gpudata),
                                          intp(filt_y_gpu.gpudata),
                                          int32(input.shape[0]))
         self._has_run_once = True
     return reshape(filt_y_gpu.get(pagelocked=self.pagelocked_mem),
                    self._desiredshape)
Example #21
    def feed_forward(self, input_data, prediction=False):
        """Propagate forward through the layer

        **Parameters:**

        input_data : ``GPUArray``
            Input data to perform dropout on.

        prediction : bool, optional
            Whether to use prediction mode. If true, the data is scaled by
            ``1 - dropout_probability`` instead of having dropout applied.

        **Returns:**
        
        dropout_data : ``GPUArray``
            The data after performing dropout.
        """

        assert input_data.shape[1] == self.n_in

        if not prediction:
            dropout_input = gpuarray.empty_like(input_data)
            dropout_mask = sample_dropout_mask(input_data,
                                               self.dropout_probability, target=dropout_input
                                           )
            return dropout_input, dropout_mask
        else:
            return (input_data * (1 - self.dropout_probability),)
Example #22
def nan_to_zeros(x, target=None):
    assert x.flags.c_contiguous
    if target is None:
        target = gpuarray.empty_like(x)
    assert target.flags.c_contiguous
    all_kernels['nan_to_zeros'](x, target)
    return target
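A small usage sketch, assuming pycuda.autoinit is active and the module's kernel registry provides 'nan_to_zeros':

import numpy as np
import pycuda.gpuarray as gpuarray

x = gpuarray.to_gpu(np.array([[1.0, np.nan], [np.nan, 2.0]], dtype=np.float32))

y = nan_to_zeros(x)        # new array with NaNs replaced by zeros
nan_to_zeros(x, target=x)  # or overwrite x in place
print(y.get())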
Example #23
 def initializeGpuMemory(self):
     K = self.modelParams["proc_id_model","K"]
     
     # Sufficient statistics for the parameters of G kernels
     self.gpuPtrs["impulse_model","nnz_Z"] = gpuarray.empty((K,K), dtype=np.int32)
     self.gpuPtrs["impulse_model","g_suff_stats"] = gpuarray.empty((K,K), dtype=np.float32) 
     self.gpuPtrs["impulse_model","GS"] = gpuarray.empty_like(self.base.dSS["dS"])
Example #24
def test_cublas_bug():
    '''
    The SGEMM call would cause all calls after it to fail for some unknown
    reason. This is likely caused by swaprows corrupting memory.

    NOTE: this was confirmed by nvidia to be a bug within CUDA, and should be
          fixed in CUDA 6.5
    '''
    from pycuda.driver import Stream
    from skcuda.cublas import cublasSgemm
    from skcuda.misc import _global_cublas_handle as handle

    n = 131

    s = slice(128, n)
    X = gpuarray.to_gpu(np.random.randn(n, 2483).astype(np.float32))
    a = gpuarray.empty((X.shape[1], 3), dtype=np.float32)
    c = gpuarray.empty((a.shape[0], X.shape[1]), dtype=np.float32)
    b = gpuarray.empty_like(X)

    m, n = a.shape[0], b[s].shape[1]
    k = a.shape[1]
    lda = m
    ldb = k
    ldc = m
    #cublasSgemm(handle, 0, 0, m, n, k, 0.0, b.gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
    cublasSgemm(handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
    #print handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc

    #gpuarray.dot(d, Xoutd[s])
    #op.sgemm(a, b[s], c)

    stream = Stream()
    stream.synchronize()
Example #25
    def feed_forward(self, input_data, prediction=False):
        """Propagate forward through the layer

        **Parameters:**

        input_data : ``GPUArray``
            Input data to perform dropout on.

        prediction : bool, optional
            Whether to use prediction mode. If true, the data is scaled by
            ``1 - dropout_probability`` instead of having dropout applied.

        **Returns:**
        
        dropout_data : ``GPUArray``
            The data after performing dropout.
        """

        assert input_data.shape[1] == self.n_in

        if not prediction:
            dropout_input = gpuarray.empty_like(input_data)
            dropout_mask = sample_dropout_mask(input_data,
                                               self.dropout_probability,
                                               target=dropout_input)
            return dropout_input, dropout_mask
        else:
            return (input_data * (1 - self.dropout_probability), )
Example #26
def gaussian_fourierkernel_elemwise(uu, vv, ww, sigma):
    """
    Create Gaussian Fourier filter kernel
    Element wise cuda implementation

    """
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as cuda
    import pycuda.autoinit

    u_gpu = gpuarray.to_gpu(uu)
    v_gpu = gpuarray.to_gpu(vv)
    w_gpu = gpuarray.to_gpu(ww)
    from pycuda.elementwise import ElementwiseKernel
    # '^' is XOR in CUDA C, so the squares are written out explicitly
    norm_comb = ElementwiseKernel(
        "float s, float pi, float *u, float *v, float *w, float *z",
        "z[i] = expf(-2 * pi * pi * (u[i] * u[i] + v[i] * v[i] + w[i] * w[i]) * s * s)",
        "normal_combination")

    z_gpu = gpuarray.empty_like(u_gpu)
    norm_comb(np.float32(sigma), np.float32(np.pi), u_gpu, v_gpu, w_gpu, z_gpu)

    # The kernel already evaluates the full exponent, so no extra scaling is applied here
    gfilter = z_gpu.get()
    return gfilter
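A hedged usage sketch, assuming float32 frequency grids (the ElementwiseKernel above declares float pointers):

import numpy as np

f = np.fft.fftfreq(32).astype(np.float32)
uu, vv, ww = np.meshgrid(f, f, f, indexing='ij')

g = gaussian_fourierkernel_elemwise(uu, vv, ww, sigma=2.0)
print(g.shape, g.dtype)  # (32, 32, 32) float32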
Example #27
def main_no_tex(dtype):
    lc_kernel = get_lin_comb_kernel_no_tex((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    for size_exp in range(10,26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            lc_kernel.prepared_call(x._grid, x._block,
                a.gpudata, x.gpudata,
                b.gpudata, y.gpudata,
                z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print size, size_exp, stop.time_since(start)
Example #28
def nan_to_zeros(x, target=None):
    assert x.flags.c_contiguous
    if target is None:
        target = gpuarray.empty_like(x)
    assert target.flags.c_contiguous
    all_kernels['nan_to_zeros'](x, target)
    return target
Example #29
 def buffer_apply(self, input):
     # TODO: buffer apply to a large input may cause a launch timeout, need to buffer in
     # smaller chunks if this is the case
     b = self.filt_b_gpu
     a = self.filt_a_gpu
     zi = self.filt_state
     if not hasattr(self, "filt_x_gpu") or input.size != self.filt_x_gpu.size:
         self._desiredshape = input.shape
         self._has_run_once = False
         self.filt_x_gpu = gpuarray.to_gpu(input.flatten())
         self.filt_y_gpu = gpuarray.empty_like(self.filt_x_gpu)
     else:
         self.filt_x_gpu.set(input.flatten())
     filt_x_gpu = self.filt_x_gpu
     filt_y_gpu = self.filt_y_gpu
     if self._has_run_once:
         self.gpu_filt_func.launch_grid(*self.grid)
     else:
         self.gpu_filt_func.prepared_call(
             self.grid,
             intp(b.gpudata),
             intp(a.gpudata),
             intp(filt_x_gpu.gpudata),
             intp(zi.gpudata),
             intp(filt_y_gpu.gpudata),
             int32(input.shape[0]),
         )
         self._has_run_once = True
     return reshape(filt_y_gpu.get(pagelocked=self.pagelocked_mem), self._desiredshape)
Example #30
    def test():
        gpu_func = getattr(cumath, name)
        cpu_func = getattr(np, numpy_func_names.get(name, name))
        if complex:
            _dtypes = complex_dtypes
        else:
            _dtypes = dtypes

        for s in sizes:
            for dtype in _dtypes:
                np.random.seed(1)
                A = (np.random.random(s)*(b-a) + a).astype(dtype)
                if complex:
                    A += (np.random.random(s)*(b-a) + a)*1j

                args = gpuarray.to_gpu(A)
                gpu_results = gpu_func(args).get()
                cpu_results = cpu_func(A)

                max_err = np.max(np.abs(cpu_results - gpu_results))
                assert (max_err <= threshold).all(), \
                        (max_err, name, dtype)

                gpu_results2 = gpuarray.empty_like(args)
                gr2 = gpu_func(args, out=gpu_results2)
                assert gpu_results2 is gr2
                gr2 = gr2.get()
                max_err = np.max(np.abs(cpu_results - gr2))
                assert (max_err <= threshold).all(), \
                        (max_err, name, dtype)
Example #31
def e1z(z_gpu):
    """
    Exponential integral with `n = 1` of complex arguments.

    Parameters
    ----------
    z_gpu : GPUArray
        Input matrix of shape `(m, n)`.
        
    Returns
    -------
    e_gpu : GPUArray
        GPUarrays containing the exponential integrals of
        the entries of `z_gpu`.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import scipy.special
    >>> import special
    >>> z = np.asarray(np.random.rand(4, 4)+1j*np.random.rand(4, 4), np.complex64)
    >>> z_gpu = gpuarray.to_gpu(z)
    >>> e_gpu = e1z(z_gpu)
    >>> e_sp = scipy.special.exp1(z)
    >>> np.allclose(e_sp, e_gpu.get())
    True

    """

    if z_gpu.dtype == np.complex64:
        use_double = 0
    elif z_gpu.dtype == np.complex128:
        use_double = 1
    else:
        raise ValueError('unsupported type')

    
    # Get block/grid sizes; the number of threads per block is limited
    # to 256 because the e1z kernel defined above uses too many
    # registers to be invoked with more threads per block:
    dev = get_current_device()
    max_threads_per_block = 256
    block_dim, grid_dim = select_block_grid_sizes(dev, z_gpu.shape, max_threads_per_block)

    # Set this to False when debugging to make sure the compiled kernel is
    # not cached:
    cache_dir=None
    e1z_mod = \
             SourceModule(e1z_mod_template.substitute(use_double=use_double),
                          cache_dir=cache_dir)
    e1z_func = e1z_mod.get_function("e1z")

    e_gpu = gpuarray.empty_like(z_gpu)
    e1z_func(z_gpu, e_gpu,
              np.uint32(z_gpu.size),
              block=block_dim,
              grid=grid_dim)
    return e_gpu
Example #32
    def test():
        gpu_func = getattr(cumath, name)
        cpu_func = getattr(np, numpy_func_names.get(name, name))
        if complex:
            _dtypes = complex_dtypes
        else:
            _dtypes = dtypes

        for s in sizes:
            for dtype in _dtypes:
                np.random.seed(1)
                A = (np.random.random(s) * (b - a) + a).astype(dtype)
                if complex:
                    A += (np.random.random(s) * (b - a) + a) * 1j

                args = gpuarray.to_gpu(A)
                gpu_results = gpu_func(args).get()
                cpu_results = cpu_func(A)

                max_err = np.max(np.abs(cpu_results - gpu_results))
                assert (max_err <= threshold).all(), \
                        (max_err, name, dtype)

                gpu_results2 = gpuarray.empty_like(args)
                gr2 = gpu_func(args, out=gpu_results2)
                assert gpu_results2 is gr2
                gr2 = gr2.get()
                max_err = np.max(np.abs(cpu_results - gr2))
                assert (max_err <= threshold).all(), \
                        (max_err, name, dtype)
Example #33
def substract_matrix(a, b, target=None):
    assert a.shape == b.shape
    if target is None:
        target = gpuarray.empty_like(a)

    all_kernels['substract_matrix'](a, b, target)
    return target
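A minimal usage sketch, assuming pycuda.autoinit is active and the module's kernel registry provides 'substract_matrix':

import numpy as np
import pycuda.gpuarray as gpuarray

a = gpuarray.to_gpu(np.arange(6, dtype=np.float32).reshape(2, 3))
b = gpuarray.to_gpu(np.ones((2, 3), dtype=np.float32))

print(substract_matrix(a, b).get())  # element-wise a - b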
Example #34
def main(dtype):
    from pycuda.elementwise import get_linear_combination_kernel
    lc_kernel, lc_texrefs = get_linear_combination_kernel(
        ((True, dtype, dtype), (True, dtype, dtype)), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(x._grid, x._block, x.gpudata, y.gpudata,
                                    z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
Example #35
    def bind_buffers(self):
        """
        Gets allocated tensors for input and output feature maps.
        Allocates a scratch tensor for argmax indices if the op is max pooling
        since this is required for bprop. Builds a final list of parameters to
        pass to the kernel.
        """
        I_data = self.I.value.tensor
        O_data = self.O.value.tensor

        # Allocate argmax tensor
        if self.op == "max":
            if self.index not in self.transformer.argmax_tensors:
                argmax = empty_like(self.O.value.tensor)
                self.transformer.argmax_tensors[self.index] = argmax
            else:
                argmax = self.transformer.argmax_tensors[self.index]
            A_data = argmax.gpudata
        else:
            A_data = 0

        kernel_args = self.fprop_kernel
        self.params = [
            kernel_args[1], kernel_args[2], None, I_data.gpudata,
            O_data.gpudata, A_data, 1.0, 0.0, 0
        ]
        self.params.extend(kernel_args[3])
        super(PoolFpropKernel, self).bind_buffers()
Example #36
def substract_matrix(a, b, target=None):
    assert a.shape == b.shape
    if target is None:
        target = gpuarray.empty_like(a)

    all_kernels['substract_matrix'](a, b, target)
    return target
Example #37
def main_no_tex(dtype):
    lc_kernel = get_lin_comb_kernel_no_tex(
        ((True, dtype, dtype), (True, dtype, dtype)), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            lc_kernel.prepared_call(x._grid, x._block, a.gpudata, x.gpudata,
                                    b.gpudata, y.gpudata, z.gpudata,
                                    x.mem_size)

        stop.record()
        stop.synchronize()

        print(size, size_exp, stop.time_since(start))
Example #38
def main(dtype):
    from pycuda.elementwise import get_linear_combination_kernel
    lc_kernel, lc_texrefs = get_linear_combination_kernel((
        (True, dtype, dtype),
        (True, dtype, dtype)
        ), dtype)

    for size_exp in range(10, 26):
        size = 1 << size_exp

        from pycuda.curandom import rand
        a = gpuarray.to_gpu(numpy.array(5, dtype=dtype))
        x = rand(size, dtype=dtype)
        b = gpuarray.to_gpu(numpy.array(7, dtype=dtype))
        y = rand(size, dtype=dtype)

        z = gpuarray.empty_like(x)

        start = drv.Event()
        stop = drv.Event()
        start.record()

        for i in range(20):
            a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True)
            b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True)
            lc_kernel.prepared_call(x._grid, x._block,
                x.gpudata, y.gpudata, z.gpudata, x.mem_size)

        stop.record()
        stop.synchronize()

        print size, size_exp, stop.time_since(start)
Example #39
def main():
    width = 65
    height = 65
    depth = 260

    shift_x = 20
    shift_y = 17

    from pycuda.curandom import rand as curand

    a_gpu = curand((depth, height, width)).astype('complex64')
    a = a_gpu.get()

    b = np.zeros_like(a)
    b_gpu = gpuarray.to_gpu(b)

    circ_shift(a_gpu, b_gpu, shift_x, shift_y)
    b = b_gpu.get()
    # print(a)
    # print(b)
    assert np.all(b == np.roll(np.roll(a, shift_x, axis=2), shift_y, axis=1))

    t = time.time()
    b_gpu = gpuarray.empty_like(a_gpu)
    for i in np.arange(100):
        circ_shift(a_gpu, b_gpu, shift_x, shift_y)
    print('GPU took %.4f secs' % (time.time() - t))

    t = time.time()
    for i in np.arange(100):
        np.roll(np.roll(a, shift_x, axis=2), shift_y, axis=1)
    print('CPU took %.4f secs' % (time.time() - t))
Example #40
def mult_matrix(a, b, target=None):
    assert a.shape == b.shape
    if target is None:
        target = gpuarray.empty_like(a)

    all_kernels["mult_matrix"](a, b, target)
    return target
Example #41
def run_function(X, Y_expected, func, rtol=1e-6, with_inplace_test=True, **kwargs):
    # CPU, with target argument
    Y = np.empty_like(Y_expected)
    Yhr = func(X, out=Y, **kwargs)
    assert_allclose(Y_expected, Yhr, err_msg="CPU with target", rtol=rtol)
    assert Yhr is Y

    # CPU, no target argument
    Yhr = func(X, **kwargs)
    assert_allclose(Y_expected, Yhr, err_msg="CPU, no target", rtol=rtol)

    if with_inplace_test:
        X2 = X.copy()
        Yhr = func(X2, out=X2, **kwargs)
        assert_allclose(Y_expected, Yhr, err_msg="CPU, inplace target", rtol=rtol)
        assert Yhr is X2

    kwargs = op.to_gpu(kwargs)

    # GPU, with target
    Xd = op.to_gpu(X)
    Yd = gpuarray.empty_like(op.to_gpu(Y_expected))
    Ydr = func(Xd, out=Yd, **kwargs)
    assert_allclose(Y_expected, op.to_cpu(Ydr), err_msg="GPU with target", rtol=rtol)
    assert Ydr is Yd

    # GPU, no target
    Ydr = func(Xd, **kwargs)
    assert_allclose(Y_expected, op.to_cpu(Ydr), err_msg="GPU, no target", rtol=rtol)

    if with_inplace_test:
        Ydr = func(Xd, out=Xd, **kwargs)
        assert_allclose(Y_expected, op.to_cpu(Ydr), err_msg="GPU, inplace target", rtol=rtol)
        assert Ydr is Xd
Example #42
 def product(self, x: gpuarray.GPUArray) -> gpuarray.GPUArray:
     """Multiply sparse matrix by dense vector."""
     y = gpuarray.empty_like(x)
     op = cs.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE
     cs.cusparseDcsrmv(self.handle, op, self.m, self.n, self.nnz, 1.0,
                       self.descr, self.csrValA, self.csrRowPtrA,
                       self.csrColIndA, x, 0.0, y)
     return y
Example #43
def calc_x_G(Kp1,
             C,
             Cm1,
             rp1,
             lm2,
             Am1,
             A,
             Ap1,
             lm1_s,
             lm1_si,
             r_s,
             r_si,
             Vsh,
             handle=None):
    D = A[0].shape[1]
    Dm1 = A[0].shape[0]
    q = len(A)

    x = garr.zeros((Dm1, q * D - Dm1), dtype=A[0].dtype)
    x_part = garr.empty_like(x)
    x_subpart = garr.empty_like(A[0])

    if not (C is None and Kp1 is None):
        assert (C is not None) and (Kp1 is not None)
        x_part.fill(0)
        for s in range(q):
            x_subpart = eps_r(rp1, C[s], Ap1, x_subpart, handle)  #~1st line

            x_subpart += cla.dot(A[s], Kp1, handle=handle)  #~3rd line

            x_part += cla.dot(cla.dot(x_subpart, r_si, handle=handle),
                              Vsh[s],
                              handle=handle)

        x += cla.dot(lm1_s, x_part, handle=handle)

    if lm2 is not None:
        x_part.fill(0)
        for s in range(q):  #~2nd line
            x_subpart = eps_l(lm2, Am1, Cm1[s], x_subpart, handle)
            x_part += cla.dot(x_subpart,
                              cla.dot(r_s, Vsh[s], handle=handle),
                              handle=handle)
        x += cla.dot(lm1_si, x_part, handle=handle)

    return x
Example #44
 def computeIrDensity(self, dS_gpu):
     """
     Compute the impulse response density at the time intervals in dS_gpu 
     """
     gS_gpu = gpuarray.empty_like(dS_gpu)
     gS_gpu.fill(self.params["density"])
     
     return gS_gpu
Example #45
    def __call__(self, input_ary, output_ary=None, allocator=None,
            stream=None):
        allocator = allocator or input_ary.allocator

        if output_ary is None:
            output_ary = input_ary

        if isinstance(output_ary, (str, six.text_type)) and output_ary == "new":
            output_ary = gpuarray.empty_like(input_ary, allocator=allocator)

        if input_ary.shape != output_ary.shape:
            raise ValueError("input and output must have the same shape")

        if not input_ary.flags.forc:
            raise RuntimeError("ScanKernel cannot "
                    "deal with non-contiguous arrays")

        n, = input_ary.shape

        if not n:
            return output_ary

        unit_size  = self.scan_wg_size * self.scan_wg_seq_batches
        dev = driver.Context.get_device()
        max_groups = 3*dev.get_attribute(
                driver.device_attribute.MULTIPROCESSOR_COUNT)

        from pytools import uniform_interval_splitting
        interval_size, num_groups = uniform_interval_splitting(
                n, unit_size, max_groups);

        block_results = allocator(self.dtype.itemsize*num_groups)
        dummy_results = allocator(self.dtype.itemsize)

        # first level scan of interval (one interval per block)
        self.scan_intervals_knl.prepared_async_call(
                (num_groups, 1), (self.scan_wg_size, 1, 1), stream,
                input_ary.gpudata,
                n, interval_size,
                output_ary.gpudata,
                block_results)

        # second level inclusive scan of per-block results
        self.scan_intervals_knl.prepared_async_call(
                (1,1), (self.scan_wg_size, 1, 1), stream,
                block_results,
                num_groups, interval_size,
                block_results,
                dummy_results)

        # update intervals with result of second level scan
        self.final_update_knl.prepared_async_call(
                (num_groups, 1,), (self.update_wg_size, 1, 1), stream,
                output_ary.gpudata,
                n, interval_size,
                block_results)

        return output_ary
Example #46
    def __call__(self, input_ary, output_ary=None, allocator=None,
            stream=None):
        allocator = allocator or input_ary.allocator

        if output_ary is None:
            output_ary = input_ary

        if isinstance(output_ary, (str, six.text_type)) and output_ary == "new":
            output_ary = gpuarray.empty_like(input_ary, allocator=allocator)

        if input_ary.shape != output_ary.shape:
            raise ValueError("input and output must have the same shape")

        if not input_ary.flags.forc:
            raise RuntimeError("ScanKernel cannot "
                    "deal with non-contiguous arrays")

        n, = input_ary.shape

        if not n:
            return output_ary

        unit_size  = self.scan_wg_size * self.scan_wg_seq_batches
        dev = driver.Context.get_device()
        max_groups = 3*dev.get_attribute(
                driver.device_attribute.MULTIPROCESSOR_COUNT)

        from pytools import uniform_interval_splitting
        interval_size, num_groups = uniform_interval_splitting(
                n, unit_size, max_groups);

        block_results = allocator(self.dtype.itemsize*num_groups)
        dummy_results = allocator(self.dtype.itemsize)

        # first level scan of interval (one interval per block)
        self.scan_intervals_knl.prepared_async_call(
                (num_groups, 1), (self.scan_wg_size, 1, 1), stream,
                input_ary.gpudata,
                n, interval_size,
                output_ary.gpudata,
                block_results)

        # second level inclusive scan of per-block results
        self.scan_intervals_knl.prepared_async_call(
                (1,1), (self.scan_wg_size, 1, 1), stream,
                block_results,
                num_groups, interval_size,
                block_results,
                dummy_results)

        # update intervals with result of second level scan
        self.final_update_knl.prepared_async_call(
                (num_groups, 1,), (self.update_wg_size, 1, 1), stream,
                output_ary.gpudata,
                n, interval_size,
                block_results)

        return output_ary
Example #47
    def compute_bandwidth(self,
                          event_hit,
                          event_time,
                          event_charge,
                          scale_factor=1.0):
        """Use the MC information accumulated by accumulate_moments() to
        estimate the best bandwidth to use when kernel estimating."""

        rho = 1.0

        hitcount = self.hitcount_gpu.get()
        mom0 = np.maximum(hitcount, 1)
        tmom1 = self.tmom1_gpu.get()
        tmom2 = self.tmom2_gpu.get()

        tmean = tmom1 / mom0
        tvar = np.maximum(tmom2 / mom0 - tmean**2, 0.0)  # roundoff can go neg
        trms = tvar**0.5

        if self.time_only:
            d = 1
        else:
            d = 2
        dimensionality_factor = ((4.0 / (d + 2)) /
                                 (mom0 / scale_factor))**(-1.0 / (d + 4))
        gaussian_density = np.minimum(
            1.0 / trms, (1.0 / np.sqrt(2.0 * np.pi)) *
            np.exp(-0.5 * ((event_time - tmean) / trms)) / trms)
        time_bandwidths = dimensionality_factor / gaussian_density * rho
        inv_time_bandwidths = np.zeros_like(time_bandwidths)
        inv_time_bandwidths[time_bandwidths > 0] = time_bandwidths[
            time_bandwidths > 0]**-1

        # precompute inverse to speed up GPU evaluation
        self.inv_time_bandwidths_gpu = ga.to_gpu(
            inv_time_bandwidths.astype(np.float32))

        # Compute charge bandwidths if needed
        if self.time_only:
            self.inv_charge_bandwidths_gpu = ga.empty_like(
                self.inv_time_bandwidths_gpu)
            self.inv_charge_bandwidths_gpu.fill(0.0)
        else:
            qmom1 = self.qmom1_gpu.get()
            qmom2 = self.qmom2_gpu.get()

            qmean = qmom1 / mom0
            qrms = (qmom2 / mom0 - qmean**2)**0.5

            gaussian_density = np.minimum(
                1.0 / qrms, (1.0 / np.sqrt(2.0 * np.pi)) *
                np.exp(-0.5 * ((event_charge - qmean) / qrms)) / qrms)

            charge_bandwidths = dimensionality_factor / gaussian_density * rho

            # precompute inverse to speed up GPU evaluation
            self.inv_charge_bandwidths_gpu = ga.to_gpu(
                (charge_bandwidths**-1).astype(np.float32))
Example #48
    def __init__(self, bases, pv=None, *, force=False):
        """Create a new density matrix for several qudits.

        Parameters
        ----------
        bases : list of quantumsim.bases.PauliBasis
            Dimensions of qubits in the system.

        pv : array or None.
            Must be of size (2**no_qubits, 2**no_qubits). Only upper triangle
            is relevant.  If data is `None`, create a new density matrix with
            all qubits in ground state.
        """
        super().__init__(bases, pv, force=force)
        if pv is not None:
            if self.dim_pauli != pv.shape:
                raise ValueError(
                    '`bases` Pauli dimensionality should be the same as the '
                    'shape of `data` array.\n'
                    ' - bases shapes: {}\n - data shape: {}'
                    .format(self.dim_pauli, pv.shape))
        else:
            pv = np.zeros(self.dim_pauli, np.float64)
            ground_state_index = [pb.computational_basis_indices[0]
                                  for pb in self.bases]
            pv[tuple(ground_state_index)] = 1

        if isinstance(pv, np.ndarray):
            if pv.dtype not in (np.float16, np.float32, np.float64):
                raise ValueError(
                    '`pv` must have float64 data type, got {}'
                    .format(pv.dtype)
                )

            # Looks like there are some issues with ordering, so the line
            # below per se does not work.
            # self._data = ga.to_gpu(pv.astype(np.float64))

            self._work_data = ga.to_gpu(
                pv.reshape(pv.size, order='C').astype(np.float64))
            self._data = ga.empty(pv.shape, dtype=np.float64, order='C')
            self._data.set(self._work_data.reshape(pv.shape))
            self._work_data.gpudata.free()
        elif isinstance(pv, ga.GPUArray):
            if pv.dtype != np.float64:
                raise ValueError(
                    '`pv` must have float64 data type, got {}'
                    .format(pv.dtype)
                )
            self._data = pv
        else:
            raise ValueError(
                "`pv` must be Numpy array, PyCUDA GPU array or "
                "None, got type `{}`".format(type(pv)))

        self._data.gpudata.size = self._data.nbytes
        self._work_data = ga.empty_like(self._data)
        self._work_data.gpudata.size = self._work_data.nbytes
Example #49
def adam_var(var: gpuarray.GPUArray,
             grad: gpuarray.GPUArray,
             d2,
             out: gpuarray.GPUArray = None):
    adam_var_func = adam_var_float_ker if var.dtype == np.float32 else adam_var_double_ker
    if out is None:
        out = gpuarray.empty_like(var)
    adam_var_func(out, var, grad, var.dtype.type(d2))
    return out
Example #50
def adam_mean(mean: gpuarray.GPUArray,
              grad: gpuarray.GPUArray,
              d1,
              out: gpuarray.GPUArray = None):
    adam_mean_func = adam_mean_float_ker if mean.dtype == np.float32 else adam_mean_double_ker
    if out is None:
        out = gpuarray.empty_like(mean)
    adam_mean_func(out, mean, grad, mean.dtype.type(d1))
    return out
Example #51
def softmax(mat, tmp=None):
    if tmp is None:
        tmp = gpuarray.empty_like(mat)
    L = logsumexp(mat, tmp)
    add_vec_to_mat(mat, L, target=tmp, substract=True)
    exp_func.prepared_async_call(tmp._grid, tmp._block, None,
                                 tmp.gpudata, tmp.gpudata,
                                 tmp.mem_size)
    return tmp
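A hedged numerical check, assuming pycuda.autoinit is active and that logsumexp reduces over each row, so the result is a row-wise softmax:

import numpy as np
import pycuda.gpuarray as gpuarray

A = np.random.rand(8, 5).astype(np.float32)
S = softmax(gpuarray.to_gpu(A)).get()

ref = np.exp(A - A.max(axis=1, keepdims=True))
ref /= ref.sum(axis=1, keepdims=True)
print(np.allclose(S, ref, atol=1e-5))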
Example #52
def test_reorderrows():
    n = 1270
    X = 5*np.random.randn(n, 1000).astype(np.float32)
    idx = list(range(X.shape[0]))
    np.random.shuffle(idx)
    Xd = op.to_gpu(X)
    Xoutd = gpuarray.empty_like(Xd)
    op.reorder_rows(Xd, idx, Xoutd)
    assert_allclose(X[idx], Xoutd.get())
    assert_allclose(X[idx], op.reorder_rows(X, idx))
Example #53
def df_relu(x):
    assert x.flags.c_contiguous
    df = gpuarray.empty_like(x)
    if x.dtype == np.dtype(np.float32):
        df_relu_kernel_float(x, df)
    elif x.dtype == np.dtype(np.float64):
        df_relu_kernel_double(x, df)
    else:
        raise ValueError("Incompatible dtype")
    return df
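A quick check, assuming the cached kernels compute the ReLU derivative (1 where x > 0, else 0):

import numpy as np
import pycuda.gpuarray as gpuarray

x = gpuarray.to_gpu(np.array([-2.0, 0.5, 3.0], dtype=np.float32))
print(df_relu(x).get())  # expected: [0. 1. 1.]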
Example #54
 def __init__(self,n_classes,n_incoming,N,init_sd=0.1,precision=np.float32):
     self.type = 'Softmax'
     self.n_incoming = n_incoming
     self.N = N
     w = np.random.normal(0,init_sd,(self.n_incoming,n_classes))
     b = np.random.normal(0,init_sd,(1,n_classes))
     self.weights = gpuarray.to_gpu(w.copy().astype(precision))
     self.gW = gpuarray.empty_like(self.weights)
                     
     self.biases = gpuarray.to_gpu(b.copy().astype(precision))
     self.gB = gpuarray.empty_like(self.biases)
     
     # Prior and ID are set later        
     self.prior = -1
     self.ID = -1
     
     #Set up momentum variables for HMC sampler
     self.pW = gpuarray.to_gpu(np.random.normal(0,1,self.gW.shape))
     self.pB = gpuarray.to_gpu(np.random.normal(0,1,self.gB.shape))    
     
     #Store stepsizes for each parameter
     self.epsW = gpuarray.zeros(self.weights.shape,precision) + 1.0
     self.epsB = gpuarray.zeros(self.biases.shape,precision) + 1.0
     
     self.n_classes = n_classes
     self.n_incoming = n_incoming
     
     self.N = N
     self.outputs = gpuarray.zeros((self.N,self.n_classes),precision)        
     
     self.precision = precision
             
     kernels = SourceModule(open(path+'/kernels.cu', "r").read())
     self.softmax_kernel = kernels.get_function("softmax")
     self.add_bias_kernel = kernels.get_function("add_bias")
     
     self.rng = curandom.XORWOWRandomNumberGenerator()
     
     ##Initialize posterior weights
     self.posterior_weights = list()
     self.posterior_biases = list()
     
     self.eps_tol = 1e-10
Example #55
def conj(x_gpu, overwrite=True):
    """
    Complex conjugate.

    Compute the complex conjugate of the array in device memory.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array of shape `(m, n)`.
    overwrite : bool
        If true (default), save the result in the specified array.
        If false, return the result in a newly allocated array.

    Returns
    -------
    xc_gpu : pycuda.gpuarray.GPUArray
        Conjugate of the input array. If `overwrite` is true, the
        returned matrix is the same as the input array.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> x = np.array([[1+1j, 2-2j, 3+3j, 4-4j], [5+5j, 6-6j, 7+7j, 8-8j]], np.complex64)
    >>> x_gpu = gpuarray.to_gpu(x)
    >>> y_gpu = linalg.conj(x_gpu)
    >>> np.all(x == np.conj(y_gpu.get()))
    True

    """

    # Don't attempt to process non-complex matrix types:
    if x_gpu.dtype in [np.float32, np.float64]:
        return x_gpu

    try:
        func = conj.cache[x_gpu.dtype]
    except KeyError:
        ctype = tools.dtype_to_ctype(x_gpu.dtype)
        func = el.ElementwiseKernel(
                "{ctype} *x, {ctype} *y".format(ctype=ctype),
                "y[i] = conj(x[i])")
        conj.cache[x_gpu.dtype] = func
    if overwrite:
        func(x_gpu, x_gpu)
        return x_gpu
    else:
        y_gpu = gpuarray.empty_like(x_gpu)
        func(x_gpu, y_gpu)
        return y_gpu