Example #1
                                arguments="double *x, int step")

    # np.trace(np.dot(A, B)), also equivalent to (A*B.T).sum(); A is a1 x a2, B is a2 x a1
    traceDot = ReductionKernel(
        np.float64,
        neutral="0",
        reduce_expr="a+b",
        map_expr="A[i]*B[(i%a1)*a2+i/a1]",
        arguments="double *A, double *B, int a1, int a2")

    #=======================================================================================
    # Element-wise functions
    #=======================================================================================

    # log(X)
    log = ElementwiseKernel("double *in, double *out", "out[i] = log(in[i])",
                            "log_element")

    # log(1.0-X)
    logOne = ElementwiseKernel("double *in, double *out",
                               "out[i] = log(1.-in[i])", "logOne_element")

    # multiplication with broadcast along the first axis (out = shorter[None,:]*longer)
    mul_bcast = ElementwiseKernel(
        "double *out, double *shorter, double *longer, int shorter_size",
        "out[i] = longer[i]*shorter[i%shorter_size]", "mul_bcast")

    # multiplication with broadcast along the last axis (out = shorter[:,None]*longer;
    # here first_dim is the length of longer's trailing axis)
    mul_bcast_first = ElementwiseKernel(
        "double *out, double *shorter, double *longer, int first_dim",
        "out[i] = longer[i]*shorter[i/first_dim]", "mul_bcast_first")
Example #2
from cpab.gpu.Calcs import Calcs as GpuCalcs
if GpuCalcs is None:
    raise ValueError("This option is no longer supported!")


import numpy as np
from pycuda import gpuarray
from pycuda.elementwise import ElementwiseKernel

from decide_sharedmemory import decide_sharedmemory

threshold_krnl = ElementwiseKernel(
        "double * a, double min_val,double max_val",
        """
        if (a[i] < min_val)
            a[i]=min_val;
        else if (a[i]>max_val)
            a[i]=max_val;
        """        
        )
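A usage sketch for threshold_krnl (test values are assumptions); it clamps the array in place:

import numpy as np
import pycuda.autoinit

a = gpuarray.to_gpu(np.random.randn(10))              # float64 matches "double *a"
threshold_krnl(a, np.float64(-1.0), np.float64(1.0))  # clamp to [-1, 1]
assert (np.abs(a.get()) <= 1.0).all()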
 



my_dtype = [np.float32,np.float64][1]


class CpaSpace(object):
    """
    An abstract class.
    This class should never be invoked directly.
Example #3
start.record() # start timing
gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter), grid=(blocks,1), block=(block_size,1,1) )
end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print("SourceModule time and first three results:")
print("%fs, %s" % (secs, str(dest[:3])))


#####################
# Elementwise SECTION
# use an ElementwiseKernel with sin in a for loop all in C call from Python
kernel = ElementwiseKernel(
   "float *a, int n_iter",
   "for(int n = 0; n < n_iter; n++) { a[i] = sin(a[i]);}",
   "gpusin")

a = numpy.ones(nbr_values).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)
start.record() # start timing
kernel(a_gpu, numpy.int32(n_iter))
end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print("Elementwise time and first three results:")
print("%fs, %s" % (secs, str(a_gpu.get()[:3])))


####################################
Example #4
@author: Feng-cong Li
"""

from wavesynlib.mathtools import Algorithm, Expression, Parameter
from wavesynlib.interfaces.gpu.factories import FFTFactory, MatrixMulFactory, EntrywiseNormFactory

from math import sqrt
import numpy as np
from scipy import linalg

from pycuda.elementwise import ElementwiseKernel
from pycuda import gpuarray

unimodularize = ElementwiseKernel(
    'pycuda::complex<double> * output, const pycuda::complex<double> * input, const int N',
    '''
using namespace pycuda;
output[i] = i>=N ? complex<double>(0.0) : polar(1.0, arg(input[i]));    
    ''')
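A usage sketch for unimodularize (shapes and values are assumptions): entries below index N are replaced by unit-modulus numbers carrying the input's phase; entries at or beyond N are zeroed:

import pycuda.autoinit  # assumed context initialization

x = gpuarray.to_gpu(np.array([1 + 1j, -2j, 5.0, 7.0], dtype=np.complex128))
out = gpuarray.empty_like(x)
unimodularize(out, x, np.int32(3))
assert np.allclose(np.abs(out.get()[:3]), 1.0) and out.get()[3] == 0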


class WeCAN(Algorithm):
    __name__ = 'WeCAN (CUDA Impl)'
    __CUDA__ = True

    def __init__(self):
        super().__init__()

    def __call__(self, N: Parameter(int, 'Sequence Length N.'),
                 gamma: Parameter(
                     Expression,
                     'N-by-1, corresponding to weights w_k = gamma_k^2'),
Example #5
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.curandom import rand as curand

a = (numpy.random.randn(400) + 1j * numpy.random.randn(400)).astype(
    numpy.complex64)
b = (numpy.random.randn(400) + 1j * numpy.random.randn(400)).astype(
    numpy.complex64)

a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)

from pycuda.elementwise import ElementwiseKernel
complex_mul = ElementwiseKernel(
    "pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z",
    "z[i] = x[i] * y[i]",
    "complex_mul",
    preamble="#include <pycuda-complex.hpp>",
)

c_gpu = gpuarray.empty_like(a_gpu)
complex_mul(a_gpu, b_gpu, c_gpu)

import numpy.linalg as la
error = la.norm(c_gpu.get() - (a * b))
print(error)
assert error < 1e-5
Example #6
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.elementwise import ElementwiseKernel
# (the @timeit decorator used below comes from elsewhere in the original module)

min_x = -2
max_x = 2
min_y = -2
max_y = 2
samples = 1000
max_iter = 10
threshold = 10

mandel_ker = ElementwiseKernel(
    "pycuda::complex<float> *input, float *output, int max_iters, float threshold",
    """
    output[i] = 1;
    pycuda::complex<float> c = input[i];
    pycuda::complex<float> z(0,0);
    for(int j=0; j<max_iters; j++){
        z = z*z+c;
        if(abs(z)>threshold){
            output[i] = 0;
            break;
        }
    }
    """, "mandel_ker")


@timeit
def mandelbrot_gpu():
    x = np.linspace(min_x, max_x, samples, dtype=np.complex64).reshape(-1, 1)
    y = np.linspace(min_y, max_y, samples, dtype=np.complex64).reshape(1,
                                                                       -1) * 1j

    input = x + y
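    # The excerpt ends here. A plausible completion (an assumption, not the
    # original code): move the lattice to the GPU, run mandel_ker, fetch result.
    lattice_gpu = gpuarray.to_gpu(input.astype(np.complex64))
    output_gpu = gpuarray.empty(lattice_gpu.shape, dtype=np.float32)
    mandel_ker(lattice_gpu, output_gpu, np.int32(max_iter), np.float32(threshold))
    return output_gpu.get()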
Example #7
        state = super(DropinLayer, self).__getstate__()
        state.append(self.dropin_value)
        return state

    def __setstate__(self, state):
        self.dropin_value = state.pop()
        super(DropinLayer, self).__setstate__(state)


try:
    from pycuda.elementwise import ElementwiseKernel

    # the mask says whether we change the element; d says whether we set it to salt_value or to 0
    __cuda_inplace_saltpepper = ElementwiseKernel(
        'float* x, float* m, float* d, float overall_rate, float salt_rate, float salt_value',
        """
        m[i] = m[i] > overall_rate;
        x[i] = m[i] ? x[i] : (d[i] < salt_rate ? salt_value : 0.0f);
        """, 'eltw_saltpepper_inplace')
except ImportError:
    pass


def inplace_saltpepper(X, overall_rate, salt_rate, salt_value, stream=None):
    M = op.rand_uniform_like(X, stream=op.streams[0])
    D = op.rand_uniform_like(X, stream=op.streams[1])
    if isinstance(X, op.gpuarray.GPUArray):
        __cuda_inplace_saltpepper(X,
                                  M,
                                  D,
                                  overall_rate,
                                  salt_rate,
Example #8
def init_cuda():
    """Initialize CUDA functionality

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    importing mne. If this variable is not set, this function can
    be manually executed.
    """
    global cuda_capable
    global cuda_multiply_inplace_complex128
    global cuda_halve_value_complex128
    global cuda_real_value_complex128
    global requires_cuda
    if cuda_capable is True:
        logger.info('CUDA previously enabled, currently %s available memory' %
                    sizeof_fmt(mem_get_info()[0]))
        return
    # Triage possible errors for informative messaging
    cuda_capable = False
    try:
        import pycuda.gpuarray
        import pycuda.driver
    except ImportError:
        logger.warn('module pycuda not found, CUDA not enabled')
    else:
        try:
            # Initialize CUDA; happens with importing autoinit
            import pycuda.autoinit
        except ImportError:
            logger.warn('pycuda.autoinit could not be imported, likely '
                        'a hardware error, CUDA not enabled')
        else:
            # Make our multiply inplace kernel
            try:
                from pycuda.elementwise import ElementwiseKernel
                # let's construct our own CUDA multiply in-place function
                dtype = 'pycuda::complex<double>'
                cuda_multiply_inplace_complex128 = \
                    ElementwiseKernel(dtype + ' *a, ' + dtype + ' *b',
                                      'b[i] *= a[i]', 'multiply_inplace')
                cuda_halve_value_complex128 = \
                    ElementwiseKernel(dtype + ' *a', 'a[i] /= 2.0',
                                      'halve_value')
                cuda_real_value_complex128 = \
                    ElementwiseKernel(dtype + ' *a', 'a[i] = real(a[i])',
                                      'real_value')
            except:
                # This should never happen
                raise RuntimeError('pycuda ElementwiseKernel could not be '
                                   'constructed, please report this issue '
                                   'to mne-python developers with your '
                                   'system information and pycuda version')
            else:
                # Make sure scikits.cuda is installed
                try:
                    from scikits.cuda import fft as cudafft
                except ImportError:
                    logger.warn('module scikits.cuda not found, CUDA not '
                                'enabled')
                else:
                    # Make sure we can use 64-bit FFTs
                    try:
                        fft_plan = cudafft.Plan(16, np.float64, np.complex128)
                        del fft_plan
                    except:
                        logger.warn('Device does not support 64-bit FFTs, '
                                    'CUDA not enabled')
                    else:
                        cuda_capable = True
                        # Figure out limit for CUDA FFT calculations
                        logger.info('Enabling CUDA with %s available memory' %
                                    sizeof_fmt(mem_get_info()[0]))
    requires_cuda = np.testing.dec.skipif(not cuda_capable,
                                          'CUDA not initialized')
Example #9
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel
from pycuda.curandom import rand as curand

n = 500
m = 400

sumaMat = ElementwiseKernel("float *a, float *b, float *c",
                            "c[i] = a[i] + b[i]", "add")

a_gpu = curand((n, m))
b_gpu = curand((n, m))

c_gpu = gpuarray.empty_like(a_gpu)
sumaMat(a_gpu, b_gpu, c_gpu)

print(a_gpu)
print(b_gpu)
print(c_gpu)
Example #10
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray
from pycuda.elementwise import ElementwiseKernel

def sigmoid_gpu(X):
    Y = pycuda.gpuarray.empty(X.shape, dtype=X.dtype)
    sigmoid = ElementwiseKernel("double *Y, double *X",
                                "Y[i] = 1.0 / (1.0 + exp (-X[i]) )", "sigmoid")
    sigmoid(Y, X)
    return Y
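A quick usage sketch (test values are assumptions, not from the original):

X = pycuda.gpuarray.to_gpu(np.linspace(-5.0, 5.0, 1000))   # float64 matches "double"
Y = sigmoid_gpu(X)
assert np.allclose(Y.get(), 1.0 / (1.0 + np.exp(-X.get())))

Note that constructing the ElementwiseKernel inside the function rebuilds the Python wrapper on every call; hoisting it to module level, as sigmoid_double_ker in Example #13 does, avoids that overhead.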
Example #11
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.curandom import rand as curand

a_gpu = curand((50,))
b_gpu = curand((50,))

from pycuda.elementwise import ElementwiseKernel
lin_comb = ElementwiseKernel(
        "float a, float *x, float b, float *y, float *z",
        "z[i] = a*x[i] + b*y[i]",
        "linear_combination")

c_gpu = gpuarray.empty_like(a_gpu)
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

import numpy.linalg as la
assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5

Example #12
from reikna.linalg import MatrixMul
import reikna.cluda as cluda
import pycuda.cumath
from pycuda.elementwise import ElementwiseKernel
from pycuda.reduction import ReductionKernel
import pycuda.gpuarray
import numpy as np

increment = ElementwiseKernel("float *X, float *Y", "Y[i] = 1 + X[i]",
                              "increment")


def modified_gemm_gpu(A, B, C):
    shape = (A.shape[0], B.shape[1])
    api = cluda.cuda_api()
    thr = api.Thread.create()
    res_arr = thr.array((shape[0], shape[1]), dtype=A.dtype)

    mul = MatrixMul(A, B, out_arr=res_arr)
    mulc = mul.compile(thr)
    mulc(res_arr, A, B)

    return res_arr + C


def tanh_gpu(X):
    return pycuda.cumath.tanh(X)


def sigmoid_gpu(X):
    Y = pycuda.gpuarray.empty(X.shape, dtype=X.dtype)
Example #13
import pycuda.autoinit
from pycuda import gpuarray
import pycuda.driver as drv
from skcuda import cublas
import numbers
import numpy as np
from pycuda.reduction import ReductionKernel
from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule
from skcuda import misc, linalg

_global_cublas_allocator = drv.mem_alloc
_global_cublas_handle = cublas.cublasCreate()

sigmoid_float_ker = ElementwiseKernel(f"float *Y, float *x",
                                      "Y[i] = 1.0 / (1.0 + exp (-x[i]) )",
                                      "sigmoid_float")

sigmoid_double_ker = ElementwiseKernel(f"double *Y, double *x",
                                       "Y[i] = 1.0 / (1.0 + exp (-x[i]) )",
                                       "sigmoid_double")

tanh_float_ker = ElementwiseKernel(f"float *Y, float *x", """
    Y[i] = tanh(x[i])
    """, "tanh_float")

tanh_double_ker = ElementwiseKernel(
    f"double *Y, double *x", """
    double pos_exp = exp (x[i]);
    double neg_exp = exp (-x[i]);
    Y[i] = (pos_exp - neg_exp) / (pos_exp + neg_exp)
Example #14
    /* ************** htilde **************** */
    htilde[i]._M_re = amplitude * cos( phasing );
    htilde[i]._M_im = -1.0 * amplitude * sin( phasing );

"""

phenomC_kernel = ElementwiseKernel("""pycuda::complex<double> *htilde, int kmin, double delta_f, 
                                       double eta, double Xi, double distance,
                                       double m_sec, double piM, double Mfrd,
                                       double pfaN, double pfa2, double pfa3, double pfa4, 
                                       double pfa5, double pfa6, double pfa6log, double pfa7,
                                       double a1, double a2, double a3, double a4,
                                       double a5, double a6, double b1, double b2, 
                                       double Mf1, double Mf2, double Mf0, 
                                       double d1, double d2, double d0, 
                                       double xdota2, double xdota3, double xdota4, 
                                       double xdota5, double xdota6, double xdota6log, 
                                       double xdota7, double xdotaN, double AN,
                                       double A2, double A3, double A4, double A5,
                                       double A5imag, double A6, double A6log, double A6imag,
                                       double g1, double del1, double del2, double Q""",
                    phenomC_text, "phenomC_kernel",
                    preamble=preamble, options=pkg_config_header_strings(['lal']))


def FinalSpin( Xi, eta ):
    """Computes the spin of the final BH that gets formed after merger. This is done usingn Eq 5-6 of arXiv:0710.3345"""
    s4 = -0.129
    s5 = -0.384
    t0 = -2.686
Example #15
    def __init__(self, *args, **kwargs):
        super(LongWrapperGPU, self).__init__(*args, **kwargs)
        self._wrap = ElementwiseKernel(
            'double *z',
            'z[i] -= floor((z[i] - {z_min:}) / {circ}) * {circ}'.format(
                circ=self.circumference, z_min=self.z_min), 'wrap_z')
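The in-place kernel folds each particle's longitudinal position back into one machine circumference, i.e. into [z_min, z_min + circ). A NumPy equivalent, for illustration only:

import numpy as np

def wrap_z_numpy(z, z_min, circ):
    # fold z into the half-open interval [z_min, z_min + circ)
    return z - np.floor((z - z_min) / circ) * circ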
Example #16
    int sizePadding = (sPad-sVol)/2;
    
    int pidx = sizePadding + (threadIdx.x+sizePadding)*sPad + (threadIdx.y+sizePadding)*sPad*sPad;
    if (idx < sVol*sVol*sVol){
        for (int i=0; i < sVol; i++) {
            if (volume[idx+i){
            padded_volume[pidx+i] = volume[idx+i];
        }
    }
    __syncthreads();
}

"""

linearAdd = ElementwiseKernel("float *d_t, float *d_m, float a, float b",
                              "d_t[i] = ((d_t[i] - a) * b) * d_m[i]",
                              "linear_combination")

update_scores_angles = cc_mod.get_function('update_scores_angles')
paste_in_center_gpu = cc_mod.get_function("pasteCenter")


class TemplateMatchingPlan():
    def __init__(self, volume, template, mask, wedge, stdV, gpu=True):
        self.volume = gu.to_gpu(volume)

        self.template = Volume(template)
        self.templatePadded = gu.zeros_like(self.volume, dtype=np.float32)

        self.mask = Volume(mask)
        self.maskPadded = gu.zeros_like(self.volume, dtype=np.float32)
Example #17
# --- Create random vectors on the CPU
h_a = np.random.randn(1, N)
h_b = np.random.randn(1, N)

# --- Set CPU arrays as single precision
h_a = h_a.astype(np.float32)
h_b = h_b.astype(np.float32)

d_a = gpuarray.to_gpu(h_a)
d_b = gpuarray.to_gpu(h_b)
d_c = gpuarray.empty_like(d_a)

from pycuda.elementwise import ElementwiseKernel
lin_comb = ElementwiseKernel(
    "float *d_c, float *d_a, float *d_b, float a, float b",
    "d_c[i] = a * d_a[i] + b * d_b[i]", "linear_combination")

start.record()
lin_comb(d_c, d_a, d_b, 2, 3)
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3
print("Processing time = %fs" % (secs))

# --- Copy results from device to host
h_c = d_c.get()

if np.array_equal(h_c, 2 * h_a + 3 * h_b):
    print("Test passed!")
else:
Example #18
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath
from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule
import pycuda as cuda
import pycuda.autoinit
import numpy as np
from time import time

is_equal = ElementwiseKernel("unsigned int *x, unsigned int *y, bool *z",
                             "z[i] = x[i] == y[i]", "is_equal")
modulo = ElementwiseKernel("unsigned int *x, unsigned int *y, unsigned int *z",
                           "z[i] = x[i] % y[i]", "modulo")
is_div = ElementwiseKernel("unsigned int x, unsigned int *y, bool *z",
                           "z[i] = (x % y[i]) == 0", "is_div")
'''
count_facs = SourceModule("""
    __global__ void count_facs(int x, unsigned int *y, int *z) {
        if (x % y[threadIdx.x]) {
            z[0] += 1;
        }
    }
""").get_function("count_facs")
'''

limit = 5000  # limit = int(input('Limit: '))
start_time = time()
with open('primes1.txt') as f:
    primes = np.fromiter(map(int,
                             f.read().strip().split(',')),
                         dtype=np.uint32)
Example #19
from write_and_read_results import (
    ARRAY_SIZES,
    write_results_to_file,
    ADD_ARRAYS,
    BACKGROUND_CORRECTION,
)

mode = "elementwise kernel"

BackgroundCorrectionKernel = ElementwiseKernel(
    arguments=
    "{0} * data, {0} * flat, const {0} * dark, const {0} MINIMUM_PIXEL_VALUE, const {0} MAXIMUM_PIXEL_VALUE"
    .format(C_DTYPE),
    operation="flat[i] -= dark[i];"
    "if (flat[i] == 0) flat[i] = MINIMUM_PIXEL_VALUE;"
    "data[i] -= dark[i];"
    "data[i] /= flat[i];"
    "if (data[i] > MAXIMUM_PIXEL_VALUE) data[i] = MAXIMUM_PIXEL_VALUE;"
    "if (data[i] < MINIMUM_PIXEL_VALUE) data[i] = MINIMUM_PIXEL_VALUE;",
    name="BackgroundCorrectionKernel",
)

elementwise_background_correction = lambda data, flat, dark: BackgroundCorrectionKernel(
    data, flat, dark, MINIMUM_PIXEL_VALUE, MAXIMUM_PIXEL_VALUE)

# Create an element-wise Add Array Function
AddArraysKernel = ElementwiseKernel(
    arguments="{0} * arr1, {0} * arr2".format(C_DTYPE),
    operation="arr1[i] += arr2[i]",
    name="AddArraysKernel",
)
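A usage sketch for AddArraysKernel (array contents are assumptions, and C_DTYPE is assumed to be "float" here):

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

arr1 = gpuarray.to_gpu(np.ones(10, dtype=np.float32))
arr2 = gpuarray.to_gpu(np.full(10, 2.0, dtype=np.float32))
AddArraysKernel(arr1, arr2)   # in place: arr1[i] += arr2[i]
assert (arr1.get() == 3.0).all()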
Example #20
      int bidx = (idy-midy)+H/2;
      int bid = GRID(bidx,bidy,W);

      // Stay within the bounds
      if (idx>-1 && idx<W && idy>-1 && idy<H && bidx>-1 && bidx<W && bidy>-1 && bidy<H) {
        // Subtract dirty beam from dirty map
        dimg[id]=dimg[id]-dpsf[bid]*scaler;
        // Add clean beam to clean map
        cimg[id]=cimg[id]+cpsf[bid]*scaler;
      };
    }
    """
sub_beam_kernel = cuda_compile(sub_beam_kernel_source, "sub_beam_kernel")

add_noise_kernel = ElementwiseKernel(
    "float *a, float* b, int N",
    "b[i] = a[i]+b[i]",
    "gpunoise")


######################
# Gridding functions
######################

def spheroid(eta, m, alpha):
    """
    Calculates spheroidal wave functions. See Schwab 1984 for details.
    This implementation follows MIRIAD's grid.for subroutine.
    """

    twoalp = 2 * alpha
    if np.abs(eta) > 1:
from __future__ import absolute_import
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.curandom import rand as curand

a_gpu = curand((50,))
b_gpu = curand((50,))

from pycuda.elementwise import ElementwiseKernel
# arguments, operation, name="kernel", keep=False, options=[], preamble=""
lin_comb = ElementwiseKernel(
    "float a, float *x, float b, float *y, float *z",
    "z[i] = my_f(a*x[i], b*y[i])",
    "linear_combination",
    preamble="""
        __device__ float my_f(float x, float y)
        { 
          return sin(x*y);
        }
        """)

c_gpu = gpuarray.empty_like(a_gpu)
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

import numpy.linalg as la
assert la.norm(c_gpu.get() - numpy.sin((5*a_gpu*6*b_gpu).get())) < 1e-5
Example #22
"""
Created on Thu Nov 17 15:35:42 2016

@author: shiwu_001
"""

from config import DTYPE
from numpy import int32
from pycuda.elementwise import ElementwiseKernel
from pycuda.reduction import ReductionKernel
from pycuda.tools import context_dependent_memoize
"""
    Elementwise Kernel
"""
# r = a
seta = ElementwiseKernel("float a, float *r", "r[i] = a", "kernel_seta")

# r = x
setx = ElementwiseKernel("float *x, float *r", "r[i] = x[i]", "kernel_setx")

# r = ax + b
axpb = ElementwiseKernel("float a, float *x, float b, float *r",
                         "r[i] = a*x[i] + b", "kernel_axpb")

# r = ax + by
axpby = ElementwiseKernel("float a, float *x, float b, float *y, float *r",
                          "r[i] = a*x[i] + b*y[i]", "kernel_axpby")

# r = ax + by + cz
axpbypcz = ElementwiseKernel(
    "float a, float *x, float b, float *y, float c, float *z, float *r",
Example #23
from pycuda import gpuarray as ga
from pycuda import driver
from pycuda.elementwise import ElementwiseKernel
from pycuda.reduction import ReductionKernel
import numpy as np

_axby = ElementwiseKernel(
            """ pycuda::complex<double> a, pycuda::complex<double> *x,
                pycuda::complex<double> b, pycuda::complex<double> *y""",
            'x[i] = a * x[i] + b * y[i]')

_norm = ReductionKernel(np.complex128,
                        neutral="0",
                        reduce_expr="a+b",
                        map_expr="pow(abs(x[i]), 2)",
                        arguments="pycuda::complex<double> *x")


class Grid:
    def __init__(self, array):
        # Get the array.
        if type(array) is np.ndarray:
            self.g = ga.to_gpu(array)  # Copy data to the GPU.
        elif type(array) is ga.GPUArray:
            self.g = array  # GPUArray already initialized.
        else:
            print('Invalid type')  # Raise proper exception here.

#         # Create the aby function.
#         if self.g.dtype is np.dtype('complex128'):
#             cuda_type = 'pycuda::complex<double>'
Example #24
# WS 12/13/20
# simple_element_kernel_example0.py
# from 'hands-on gpu programming with python and cuda book'

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.elementwise import ElementwiseKernel
from time import time

host_data = np.float32(np.random.random(50000000))

gpu_2x_ker = ElementwiseKernel("float *in, float *out", "out[i] = 2*in[i]",
                               "gpu_2x_ker")


def speedcomparison():

    t1 = time()
    host_data_x2 = host_data * np.float32(2)
    t2 = time()

    print('total CPU time: {} sec'.format(t2 - t1))

    device_data = gpuarray.to_gpu(host_data)

    device_data_2x = gpuarray.empty_like(device_data)

    t1 = time()
    gpu_2x_ker(device_data, device_data_2x)
    t2 = time()
Example #25
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 13 15:13:27 2018

@author: bhaumik
"""

import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
from pycuda.elementwise import ElementwiseKernel
import pycuda.autoinit
from pycuda.curandom import rand as curand

# Kernel function
add = ElementwiseKernel("float *d_a, float *d_b, float *d_c",
                        "d_c[i] = d_a[i] + d_b[i]", "add")

# create a couple of random matrices with a given shape
shape = 1000000
d_a = curand(shape)
d_b = curand(shape)
d_c = gpuarray.empty_like(d_a)
start = drv.Event()
end = drv.Event()
start.record()
# Calling kernel
add(d_a, d_b, d_c)
end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3
print("Addition of %d element of GPU" % shape)
Example #26
                      np.int32(k),
                      block=(32, 1, 1),
                      grid=(n, 1, 1))
        return out

    def tanh(self, x, y):
        tanh_kernel(x, y)

    def tanh_deriv(self, x, y, dy, dx):
        tanh_deriv_kernel(x, y, dy, dx)


# --------------------------- Kernel Definitions ---------------------------- #

add_into_if_kernel = ElementwiseKernel("float* a, float* out, float* cond",
                                       "if (cond[i] != 0) out[i] += a[i]",
                                       "add_into_if_kernel")
add_mm_kernel = ElementwiseKernel("float* x, float* y, float *out",
                                  "out[i] = x[i] + y[i]", "add_mm_kernel")

add_st_kernel = ElementwiseKernel("float x, float* y, float *out",
                                  "out[i] = x + y[i]", "add_st_kernel")

binarize_v_kernel = ElementwiseKernel(
    "float* out, float* v, int nrows, int ncols",
    "out[i] = v[i / ncols] == (i % ncols) ? 1.0f : 0.0f", "binarize_v_kernel")

broadcast_t_kernel = ElementwiseKernel(
    "float* out, float* a, unsigned int broadcast_dim, unsigned int stride",
    "out[i] = a[i % stride + (i / (broadcast_dim * stride)) * stride]",
    "broadcast_t_kernel")
Example #27
        bn[0] = 0;

    pycuda::complex<float> val = in[i];
    if ( abs(val) > threshold){
        int n_w = atomicAdd(bn, 1);
        outv[n_w] = val;
        outl[n_w] = i;
    }

"""

threshold_kernel = ElementwiseKernel(
    " %(tp_in)s *in, %(tp_out1)s *outv, %(tp_out2)s *outl, %(tp_th)s threshold, %(tp_n)s *bn"
    % {
        "tp_in": dtype_to_ctype(numpy.complex64),
        "tp_out1": dtype_to_ctype(numpy.complex64),
        "tp_out2": dtype_to_ctype(numpy.uint32),
        "tp_th": dtype_to_ctype(numpy.float32),
        "tp_n": dtype_to_ctype(numpy.uint32),
    }, threshold_op, "getstuff")

import pycuda.driver as drv
n = drv.pagelocked_empty((1),
                         numpy.uint32,
                         mem_flags=drv.host_alloc_flags.DEVICEMAP)
nptr = numpy.intp(n.base.get_device_pointer())

val = drv.pagelocked_empty((4096 * 256),
                           numpy.complex64,
                           mem_flags=drv.host_alloc_flags.DEVICEMAP)
vptr = numpy.intp(val.base.get_device_pointer())
Example #28
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.elementwise import ElementwiseKernel

mandel_ker = ElementwiseKernel(
    "pycuda::complex<float> *lattice, float *mandelbrot_graph, int max_iters, float upper_bound",
    """
mandelbrot_graph[i] = 1;

pycuda::complex<float> c = lattice[i]; 
pycuda::complex<float> z(0,0);

for (int j = 0; j < max_iters; j++)
    {
    
     z = z*z + c;
     
     if(abs(z) > upper_bound)
         {
          mandelbrot_graph[i] = 0;
          break;
         }

    }
         
""", "mandel_ker")


def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high,
                   max_iters, upper_bound):
Example #29
#!/usr/bin/env python
"""
Created on Wed Oct 15 13:12:50 2014

Author: Oren Freifeld
Email: [email protected]
"""

import numpy as np
from pycuda.elementwise import ElementwiseKernel
from pycuda.reduction import ReductionKernel

calc_err_per_sample = ElementwiseKernel("double *x, double *y, double *err",
                                        "err[i] = x[i] - y[i]",
                                        "calc_err_per_sample")

# Note this likelihood assumes isotropic covariance
calc_ll_per_sample = ElementwiseKernel(
    "double *err, double sigma_lm, double *ll ",
    "ll[i] = -0.5 * err[i]*err[i] / (sigma_lm*sigma_lm)", "calc_ll_per_sample")

calc_negative_ll_per_sample = ElementwiseKernel(
    "double *err, double sigma_lm, double *negative_ll ",
    "negative_ll[i] =  0.5 * err[i]*err[i] / (sigma_lm*sigma_lm)",
    "calc_negative_ll_per_sample")

calc_err_by_der_per_sample = ElementwiseKernel(
    "double *x, double *y, double *err_by_der, double dt",
    """err_by_der[i] = (x[i+1]-x[i])/dt - (y[i+1]-y[i])/dt""",
    "calc_err_by_der_per_sample")
   L. Dixon and D. G. Grier,
   "Flow visualization and flow cytometry with holographic video
   microscopy," Opt. Express 17, 13071-13079 (2009).

HISTORY
This code was adapted from the IDL implementation of
generalizedlorenzmie__define.pro
which was written by David G. Grier.
This version is

Copyright (c) 2018 David G. Grier
'''

safe_division = ElementwiseKernel(
    "float *x, float *y, float a, float *z",
    "if (abs(y[i]) > 1e-6) { z[i] = x[i]/y[i]; } else {z[i] = a;};",
    "safe_division",
)
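A usage sketch for safe_division (values are assumptions): it divides elementwise, substituting the fallback a wherever |y[i]| <= 1e-6:

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

x = gpuarray.to_gpu(np.array([1.0, 2.0, 3.0], dtype=np.float32))
y = gpuarray.to_gpu(np.array([2.0, 0.0, 4.0], dtype=np.float32))
z = gpuarray.empty_like(x)
safe_division(x, y, np.float32(0.0), z)
print(z.get())   # -> [0.5, 0.0, 0.75]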


class CudaGeneralizedLorenzMie(GeneralizedLorenzMie):
    '''
    A class that computes scattered light fields with CUDA acceleration

    ...

    Attributes
    ----------
    particle : Particle
        Object representing the particle scattering light
    instrument : Instrument