arguments="double *x, int step") # np.trace(np.dot(A,B)) (also equivalent to (A*B.T).sum() ) A - a1 x a2, B - a2 x a1 traceDot = ReductionKernel( np.float64, neutral="0", reduce_expr="a+b", map_expr="A[i]*B[(i%a1)*a2+i/a1]", arguments="double *A, double *B, int a1, int a2") #======================================================================================= # Element-wise functions #======================================================================================= # log(X) log = ElementwiseKernel("double *in, double *out", "out[i] = log(in[i])", "log_element") # log(1.0-X) logOne = ElementwiseKernel("double *in, double *out", "out[i] = log(1.-in[i])", "logOne_element") # multiplication with broadcast on the last dimension (out = shorter[:,None]*longer) mul_bcast = ElementwiseKernel( "double *out, double *shorter, double *longer, int shorter_size", "out[i] = longer[i]*shorter[i%shorter_size]", "mul_bcast") # multiplication with broadcast on the first dimension (out = shorter[None,:]*longer) mul_bcast_first = ElementwiseKernel( "double *out, double *shorter, double *longer, int first_dim", "out[i] = longer[i]*shorter[i/first_dim]", "mul_bcast")
from cpab.gpu.Calcs import Calcs as GpuCalcs if GpuCalcs is None: raise ValueError("This option is no longer supported!") from pycuda import gpuarray from pycuda.elementwise import ElementwiseKernel from decide_sharedmemory import decide_sharedmemory threshold_krnl = ElementwiseKernel( "double * a, double min_val,double max_val", """ if (a[i] < min_val) a[i]=min_val; else if (a[i]>max_val) a[i]=max_val; """ ) my_dtype = [np.float32,np.float64][1] class CpaSpace(object): """ An abstract class. The this class should never be invoked directly.
start.record()  # start timing
gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter),
       grid=(blocks, 1), block=(block_size, 1, 1))
end.record()  # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print("SourceModule time and first three results:")
print("%fs, %s" % (secs, str(dest[:3])))

#####################
# Elementwise SECTION
# use an ElementwiseKernel with sin in a for loop all in C call from Python
kernel = ElementwiseKernel(
    "float *a, int n_iter",
    "for(int n = 0; n < n_iter; n++) { a[i] = sin(a[i]);}",
    "gpusin")

a = numpy.ones(nbr_values).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)
start.record()  # start timing
# The kernel declares a C `int n_iter`, so pass an explicit 32-bit integer,
# exactly as the SourceModule call above does.  The former `numpy.int`
# alias was removed from NumPy (1.24) and raises AttributeError there.
kernel(a_gpu, numpy.int32(n_iter))
end.record()  # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print("Elementwise time and first three results:")
print("%fs, %s" % (secs, str(a_gpu.get()[:3])))

####################################
@author: Feng-cong Li """ from wavesynlib.mathtools import Algorithm, Expression, Parameter from wavesynlib.interfaces.gpu.factories import FFTFactory, MatrixMulFactory, EntrywiseNormFactory from math import sqrt import numpy as np from scipy import linalg from pycuda.elementwise import ElementwiseKernel from pycuda import gpuarray unimodularize = ElementwiseKernel( 'pycuda::complex<double> * output, const pycuda::complex<double> * input, const int N', ''' using namespace pycuda; output[i] = i>=N ? complex<double>(0.0) : polar(1.0, arg(input[i])); ''') class WeCAN(Algorithm): __name__ = 'WeCAN (CUDA Impl)' __CUDA__ = True def __init__(self): super().__init__() def __call__(self, N: Parameter(int, 'Sequence Length N.'), gamma: Parameter( Expression, 'N-by-1, corresponding to weights w_k = gamma_k^2'),
#!python
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
import numpy.linalg as la
from pycuda.curandom import rand as curand
from pycuda.elementwise import ElementwiseKernel


def _random_complex64(size):
    """Return `size` complex64 samples with standard-normal real/imag parts."""
    real_part = numpy.random.randn(size)
    imag_part = numpy.random.randn(size)
    return (real_part + 1j * imag_part).astype(numpy.complex64)


# Host-side operands and their device copies.
a = _random_complex64(400)
b = _random_complex64(400)
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)

# Element-wise complex product z = x * y; pycuda-complex.hpp supplies the
# pycuda::complex<float> type used in the signature.
complex_mul = ElementwiseKernel(
    arguments="pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z",
    operation="z[i] = x[i] * y[i]",
    name="complex_mul",
    preamble="#include <pycuda-complex.hpp>",
)

c_gpu = gpuarray.empty_like(a_gpu)
complex_mul(a_gpu, b_gpu, c_gpu)

# Compare the device result against the same product computed on the host.
error = la.norm(c_gpu.get() - (a * b))
print(error)
assert error < 1e-5
min_x = -2 max_x = 2 min_y = -2 max_y = 2 samples = 1000 max_iter = 10 threshold = 10 mandel_ker = ElementwiseKernel( "pycuda::complex<float> *input, float *output, int max_iters, float threshold", """ output[i] = 1; pycuda::complex<float> c = input[i]; pycuda::complex<float> z(0,0); for(int j=0; j<max_iters; j++){ z = z*z+c; if(abs(z)>threshold){ output[i] = 0; break; } } """, "mandel_ker") @timeit def mandelbrot_gpu(): x = np.linspace(min_x, max_x, samples, dtype=np.complex64).reshape(-1, 1) y = np.linspace(min_y, max_y, samples, dtype=np.complex64).reshape(1, -1) * 1j input = x + y
state = super(DropinLayer, self).__getstate__() state.append(self.dropin_value) return state def __setstate__(self, state): self.dropin_value = state.pop() super(DropinLayer, self).__setstate__(state) try: from pycuda.elementwise import ElementwiseKernel # the mask says wether we change the element, d says if we set to 0 or to 1 __cuda_inplace_saltpepper = ElementwiseKernel( 'float* x, float* m, float* d, float overall_rate, float salt_rate, float salt_value', """ m[i] = m[i] > overall_rate; x[i] = m[i] ? x[i] : (d[i] < salt_rate ? salt_value : 0.0f); """, 'eltw_saltpepper_inplace') except ImportError: pass def inplace_saltpepper(X, overall_rate, salt_rate, salt_value, stream=None): M = op.rand_uniform_like(X, stream=op.streams[0]) D = op.rand_uniform_like(X, stream=op.streams[1]) if isinstance(X, op.gpuarray.GPUArray): __cuda_inplace_saltpepper(X, M, D, overall_rate, salt_rate,
def init_cuda():
    """Initialize CUDA functionality

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    importing mne. If this variable is not set, this function can
    be manually executed.

    The function works through a chain of checks (pycuda importable,
    pycuda.autoinit importable, element-wise kernels buildable,
    scikits.cuda importable, 64-bit FFT plan creatable).  The first failed
    check logs a warning and leaves ``cuda_capable`` False; only when every
    check passes is ``cuda_capable`` set to True.

    Module-level names written: ``cuda_capable``, the three
    ``cuda_*_complex128`` kernels and ``requires_cuda`` (a skip decorator
    built from ``not cuda_capable``).  When CUDA was already enabled the
    function only logs the available memory and returns early.

    Raises
    ------
    RuntimeError
        If pycuda imports correctly but the element-wise kernels cannot be
        constructed.
    """
    global cuda_capable
    global cuda_multiply_inplace_complex128
    global cuda_halve_value_complex128
    global cuda_real_value_complex128
    global requires_cuda
    if cuda_capable is True:
        logger.info('CUDA previously enabled, currently %s available memory'
                    % sizeof_fmt(mem_get_info()[0]))
        return
    # Triage possible errors for informative messaging
    cuda_capable = False
    try:
        import pycuda.gpuarray
        import pycuda.driver
    except ImportError:
        logger.warning('module pycuda not found, CUDA not enabled')
    else:
        try:
            # Initialize CUDA; happens with importing autoinit
            import pycuda.autoinit
        except ImportError:
            logger.warning('pycuda.autoinit could not be imported, likely '
                           'a hardware error, CUDA not enabled')
        else:
            # Make our multiply inplace kernel
            try:
                from pycuda.elementwise import ElementwiseKernel
                # let's construct our own CUDA multiply in-place function
                dtype = 'pycuda::complex<double>'
                cuda_multiply_inplace_complex128 = \
                    ElementwiseKernel(dtype + ' *a, ' + dtype + ' *b',
                                      'b[i] *= a[i]', 'multiply_inplace')
                cuda_halve_value_complex128 = \
                    ElementwiseKernel(dtype + ' *a', 'a[i] /= 2.0',
                                      'halve_value')
                cuda_real_value_complex128 = \
                    ElementwiseKernel(dtype + ' *a', 'a[i] = real(a[i])',
                                      'real_value')
            except Exception:
                # This should never happen.  Catch Exception rather than
                # everything so Ctrl-C / SystemExit are not converted into
                # a misleading RuntimeError.
                raise RuntimeError('pycuda ElementwiseKernel could not be '
                                   'constructed, please report this issue '
                                   'to mne-python developers with your '
                                   'system information and pycuda version')
            else:
                # Make sure scikits.cuda is installed
                try:
                    from scikits.cuda import fft as cudafft
                except ImportError:
                    logger.warning('modudle scikits.cuda not found, CUDA not '
                                   'enabled')
                else:
                    # Make sure we can use 64-bit FFTs
                    try:
                        fft_plan = cudafft.Plan(16, np.float64, np.complex128)
                        del fft_plan
                    except Exception:
                        # Any library error here means no usable 64-bit
                        # FFT; interpreter-exit signals still propagate.
                        logger.warning('Device does not support 64-bit FFTs, '
                                       'CUDA not enabled')
                    else:
                        cuda_capable = True
                        # Figure out limit for CUDA FFT calculations
                        logger.info('Enabling CUDA with %s available memory'
                                    % sizeof_fmt(mem_get_info()[0]))
    requires_cuda = np.testing.dec.skipif(not cuda_capable,
                                          'CUDA not initialized')
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel
from pycuda.curandom import rand as curand

# Matrix dimensions (rows, columns).
n = 500
m = 400

# Element-wise sum of two float matrices: c = a + b.
sumaMat = ElementwiseKernel(
    arguments="float *a, float *b, float *c",
    operation="c[i] = a[i] + b[i]",
    name="add",
)

# Two random operands generated on the device, plus an output buffer
# with the same shape and dtype as the first operand.
a_gpu = curand((n, m))
b_gpu = curand((n, m))
c_gpu = gpuarray.empty_like(a_gpu)

sumaMat(a_gpu, b_gpu, c_gpu)

# Show both operands followed by the result.
print(a_gpu)
print(b_gpu)
print(c_gpu)
def sigmoid_gpu(X):
    """Return the element-wise logistic sigmoid of a GPU array.

    Computes ``1 / (1 + exp(-X[i]))`` for every element and stores the
    result in a newly allocated array with the same shape and dtype as
    ``X``.

    The kernel's pointer type is chosen from ``X.dtype``.  The output
    buffer is allocated with ``X.dtype``, so a kernel hard-wired to
    ``double *`` would misinterpret float32 buffers on the device.

    Parameters
    ----------
    X : GPU array exposing ``shape`` and ``dtype`` (float32 or float64).

    Returns
    -------
    A new GPU array holding the sigmoid of ``X``.

    Raises
    ------
    TypeError
        If ``X.dtype`` is neither float32 nor float64.
    """
    c_types = {"float32": "float", "float64": "double"}
    dtype_name = X.dtype.name
    if dtype_name not in c_types:
        # Fail loudly before touching the device instead of reading the
        # buffer with the wrong element size.
        raise TypeError(
            "sigmoid_gpu supports float32 and float64 arrays, got %s"
            % dtype_name)
    c_type = c_types[dtype_name]

    Y = pycuda.gpuarray.empty(X.shape, dtype=X.dtype)
    sigmoid = ElementwiseKernel(
        "%s *Y, %s *X" % (c_type, c_type),
        "Y[i] = 1.0 / (1.0 + exp (-X[i]) )",
        "sigmoid")
    sigmoid(Y, X)
    return Y
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
import numpy.linalg as la
from pycuda.curandom import rand as curand
from pycuda.elementwise import ElementwiseKernel

# Two random single-precision vectors generated on the device.
a_gpu = curand((50,))
b_gpu = curand((50,))

# z = a*x + b*y with scalar weights a, b and vector operands x, y.
lin_comb = ElementwiseKernel(
    arguments="float a, float *x, float b, float *y, float *z",
    operation="z[i] = a*x[i] + b*y[i]",
    name="linear_combination",
)

c_gpu = gpuarray.empty_like(a_gpu)
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

# Check the kernel against the same expression evaluated with GPUArray
# arithmetic.
expected_gpu = 5*a_gpu + 6*b_gpu
residual = (c_gpu - expected_gpu).get()
assert la.norm(residual) < 1e-5
from reikna.linalg import MatrixMul import reikna.cluda as cluda import pycuda.cumath from pycuda.elementwise import ElementwiseKernel from pycuda.reduction import ReductionKernel import pycuda.gpuarray import numpy as np increment = ElementwiseKernel("float *X, float *Y", "Y[i] = 1 + X[i]", "increment") def modified_gemm_gpu(A, B, C): shape = (A.shape[0], B.shape[1]) api = cluda.cuda_api() thr = api.Thread.create() res_arr = thr.array((shape[0], shape[1]), dtype=A.dtype) mul = MatrixMul(A, B, out_arr=res_arr) mulc = mul.compile(thr) mulc(res_arr, A, B) return res_arr + C def tanh_gpu(X): return pycuda.cumath.tanh(X) def sigmoid_gpu(X): Y = pycuda.gpuarray.empty(X.shape, dtype=X.dtype)
import pycuda.autoinit from pycuda import gpuarray import pycuda.driver as drv from skcuda import cublas import numbers import numpy as np from pycuda.reduction import ReductionKernel from pycuda.elementwise import ElementwiseKernel from pycuda.compiler import SourceModule from skcuda import misc, linalg _global_cublas_allocator = drv.mem_alloc _global_cublas_handle = cublas.cublasCreate() sigmoid_float_ker = ElementwiseKernel(f"float *Y, float *x", "Y[i] = 1.0 / (1.0 + exp (-x[i]) )", "sigmoid_float") sigmoid_double_ker = ElementwiseKernel(f"double *Y, double *x", "Y[i] = 1.0 / (1.0 + exp (-x[i]) )", "sigmoid_double") tanh_float_ker = ElementwiseKernel(f"float *Y, float *x", """ Y[i] = tanh(x[i]) """, "tanh_float") tanh_double_ker = ElementwiseKernel( f"double *Y, double *x", """ double pos_exp = exp (x[i]); double neg_exp = exp (-x[i]); Y[i] = (pos_exp - neg_exp) / (pos_exp + neg_exp)
/* ************** htilde **************** */ htilde[i]._M_re = amplitude * cos( phasing ); htilde[i]._M_im = -1.0 * amplitude * sin( phasing ); """ phenomC_kernel = ElementwiseKernel("""pycuda::complex<double> *htilde, int kmin, double delta_f, double eta, double Xi, double distance, double m_sec, double piM, double Mfrd, double pfaN, double pfa2, double pfa3, double pfa4, double pfa5, double pfa6, double pfa6log, double pfa7, double a1, double a2, double a3, double a4, double a5, double a6, double b1, double b2, double Mf1, double Mf2, double Mf0, double d1, double d2, double d0, double xdota2, double xdota3, double xdota4, double xdota5, double xdota6, double xdota6log, double xdota7, double xdotaN, double AN, double A2, double A3, double A4, double A5, double A5imag, double A6, double A6log, double A6imag, double g1, double del1, double del2, double Q""", phenomC_text, "phenomC_kernel", preamble=preamble, options=pkg_config_header_strings(['lal'])) def FinalSpin( Xi, eta ): """Computes the spin of the final BH that gets formed after merger. This is done usingn Eq 5-6 of arXiv:0710.3345""" s4 = -0.129 s5 = -0.384 t0 = -2.686
def __init__(self, *args, **kwargs):
    """Initialise the parent class, then build the GPU wrapping kernel.

    All positional and keyword arguments are forwarded unchanged to the
    parent constructor.  The kernel stored in ``self._wrap`` subtracts
    ``floor((z - z_min) / circumference) * circumference`` from every
    element of ``z`` in place.  ``self.circumference`` and ``self.z_min``
    are formatted into the kernel source here, so their values at
    construction time are the ones the kernel uses.
    """
    super(LongWrapperGPU, self).__init__(*args, **kwargs)
    wrap_operation = (
        'z[i] -= floor((z[i] - {z_min:}) / {circ}) * {circ}'.format(
            circ=self.circumference, z_min=self.z_min))
    self._wrap = ElementwiseKernel('double *z', wrap_operation, 'wrap_z')
int sizePadding = (sPad-sVol)/2; int pidx = sizePadding + (threadIdx.x+sizePadding)*sPad + (threadIdx.y+sizePadding)*sPad*sPad; if (idx < sVol*sVol*sVol){ for (int i=0; i < sVol; i++) { if (volume[idx+i){ padded_volume[pidx+i] = volume[idx+i]; } } __syncthreads(); } """ linearAdd = ElementwiseKernel("float *d_t, float *d_m, float a, float b", "d_t[i] = ((d_t[i] - a) * b) * d_m[i]", "linear_combination") update_scores_angles = cc_mod.get_function('update_scores_angles') paste_in_center_gpu = cc_mod.get_function("pasteCenter") class TemplateMatchingPlan(): def __init__(self, volume, template, mask, wedge, stdV, gpu=True): self.volume = gu.to_gpu(volume) self.template = Volume(template) self.templatePadded = gu.zeros_like(self.volume, dtype=np.float32) self.mask = Volume(mask) self.maskPadded = gu.zeros_like(self.volume, dtype=np.float32)
# --- Create random vectorson the CPU h_a = np.random.randn(1, N) h_b = np.random.randn(1, N) # --- Set CPU arrays as single precision h_a = h_a.astype(np.float32) h_b = h_b.astype(np.float32) d_a = gpuarray.to_gpu(h_a) d_b = gpuarray.to_gpu(h_b) d_c = gpuarray.empty_like(d_a) from pycuda.elementwise import ElementwiseKernel lin_comb = ElementwiseKernel( "float *d_c, float *d_a, float *d_b, float a, float b", "d_c[i] = a * d_a[i] + b * d_b[i]", "linear_combination") start.record() lin_comb(d_c, d_a, d_b, 2, 3) end.record() end.synchronize() secs = start.time_till(end) * 1e-3 print("Processing time = %fs" % (secs)) # --- Copy results from device to host h_c = d_c.get() if np.array_equal(h_c, 2 * h_a + 3 * h_b): print("Test passed!") else:
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath
from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule
import pycuda as cuda
import pycuda.autoinit
import numpy as np
from time import time

# z[i] is true where the two unsigned vectors hold equal values.
is_equal = ElementwiseKernel(
    arguments="unsigned int *x, unsigned int *y, bool *z",
    operation="z[i] = x[i] == y[i]",
    name="is_equal",
)

# Element-wise remainder z = x mod y.
modulo = ElementwiseKernel(
    arguments="unsigned int *x, unsigned int *y, unsigned int *z",
    operation="z[i] = x[i] % y[i]",
    name="modulo",
)

# z[i] is true where y[i] divides the scalar x without remainder.
is_div = ElementwiseKernel(
    arguments="unsigned int x, unsigned int *y, bool *z",
    operation="z[i] = (x % y[i]) == 0",
    name="is_div",
)

# Disabled SourceModule experiment, kept as an inert string expression.
''' count_facs = SourceModule(""" __global__ void count_facs(int x, unsigned int *y, int *z) { if (x % y[threadIdx.x]) { z[0] += 1; } } """).get_function("count_facs") '''

limit = 5000
# limit = int(input('Limit: '))

start_time = time()

# The file is read as one comma-separated list of integers.
with open('primes1.txt') as f:
    primes = np.fromiter(
        (int(token) for token in f.read().strip().split(',')),
        dtype=np.uint32)
from write_and_read_results import (
    ARRAY_SIZES,
    write_results_to_file,
    ADD_ARRAYS,
    BACKGROUND_CORRECTION,
)

mode = "elementwise kernel"

# Background correction performed entirely in place:
#   flat <- flat - dark   (zeros replaced by MINIMUM_PIXEL_VALUE so the
#                          division below never divides by zero)
#   data <- (data - dark) / flat, clipped to
#           [MINIMUM_PIXEL_VALUE, MAXIMUM_PIXEL_VALUE]
# Note that `flat` is modified as well as `data`.
BackgroundCorrectionKernel = ElementwiseKernel(
    arguments=(
        "{0} * data, {0} * flat, const {0} * dark, "
        "const {0} MINIMUM_PIXEL_VALUE, const {0} MAXIMUM_PIXEL_VALUE"
    ).format(C_DTYPE),
    operation="flat[i] -= dark[i];"
    "if (flat[i] == 0) flat[i] = MINIMUM_PIXEL_VALUE;"
    "data[i] -= dark[i];"
    "data[i] /= flat[i];"
    "if (data[i] > MAXIMUM_PIXEL_VALUE) data[i] = MAXIMUM_PIXEL_VALUE;"
    "if (data[i] < MINIMUM_PIXEL_VALUE) data[i] = MINIMUM_PIXEL_VALUE;",
    name="BackgroundCorrectionKernel",
)


def elementwise_background_correction(data, flat, dark):
    """Run the background-correction kernel with the module's pixel limits.

    ``data`` and ``flat`` are both overwritten in place by the kernel;
    ``dark`` is only read.  Returns whatever the kernel call returns.
    """
    return BackgroundCorrectionKernel(
        data, flat, dark, MINIMUM_PIXEL_VALUE, MAXIMUM_PIXEL_VALUE)


# Create an element-wise Add Array Function (arr1 += arr2, in place)
AddArraysKernel = ElementwiseKernel(
    arguments="{0} * arr1, {0} * arr2".format(C_DTYPE),
    operation="arr1[i] += arr2[i]",
    name="AddArraysKernel",
)
int bidx = (idy-midy)+H/2; int bid = GRID(bidx,bidy,W); // Stay within the bounds if (idx>-1 && idx<W && idy>-1 && idy<H && bidx>-1 && bidx<W && bidy>-1 && bidy<H) { // Subtract dirty beam from dirty map dimg[id]=dimg[id]-dpsf[bid]*scaler; // Add clean beam to clean map cimg[id]=cimg[id]+cpsf[bid]*scaler; }; } """ sub_beam_kernel = cuda_compile(sub_beam_kernel_source, "sub_beam_kernel") add_noise_kernel = ElementwiseKernel( "float *a, float* b, int N", "b[i] = a[i]+b[i]", "gpunoise") ###################### # Gridding functions ###################### def spheroid(eta, m, alpha): """ Calculates spheriodal wave functions. See Schwab 1984 for details. This implementation follows MIRIAD's grid.for subroutine. """ twoalp = 2 * alpha if np.abs(eta) > 1:
from __future__ import absolute_import import pycuda.gpuarray as gpuarray import pycuda.autoinit import numpy from pycuda.curandom import rand as curand a_gpu = curand((50,)) b_gpu = curand((50,)) from pycuda.elementwise import ElementwiseKernel # arguments, operation, name="kernel", keep=False, options=[], preamble="" lin_comb = ElementwiseKernel( "float a, float *x, float b, float *y, float *z", "z[i] = my_f(a*x[i], b*y[i])", "linear_combination", preamble=""" __device__ float my_f(float x, float y) { return sin(x*y); } """) c_gpu = gpuarray.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) import numpy.linalg as la assert la.norm(c_gpu.get() - numpy.sin((5*a_gpu*6*b_gpu).get())) < 1e-5
""" Created on Thu Nov 17 15:35:42 2016 @author: shiwu_001 """ from config import DTYPE from numpy import int32 from pycuda.elementwise import ElementwiseKernel from pycuda.reduction import ReductionKernel from pycuda.tools import context_dependent_memoize """ Elementwise Kernel """ # r = a seta = ElementwiseKernel("float a, float *r", "r[i] = a", "kernel_seta") # r = x setx = ElementwiseKernel("float *x, float *r", "r[i] = x[i]", "kernel_setx") # r = ax + b axpb = ElementwiseKernel("float a, float *x, float b, float *r", "r[i] = a*x[i] + b", "kernel_axpb") # r = ax + by axpby = ElementwiseKernel("float a, float *x, float b, float *y, float *r", "r[i] = a*x[i] + b*y[i]", "kernel_axpby") # r = ax + by + cz axpbypcz = ElementwiseKernel( "float a, float *x, float b, float *y, float c, float *z, float *r",
from pycuda import gpuarray as ga from pycuda import driver from pycuda.elementwise import ElementwiseKernel from pycuda.reduction import ReductionKernel import numpy as np _axby = ElementwiseKernel( """ pycuda::complex<double> a, pycuda::complex<double> *x, pycuda::complex<double> b, pycuda::complex<double> *y""", \ ' x[i] = a * x[i] + b * y[i]') _norm = ReductionKernel(np.complex128, neutral="0", reduce_expr="a+b", map_expr="pow(abs(x[i]), 2)", arguments="pycuda::complex<double> *x") class Grid: def __init__(self, array): # Get the array. if type(array) is np.ndarray: self.g = ga.to_gpu(array) # Copy data to the GPU. elif type(array) is ga.GPUArray: self.g = array # GPUArray already initialized. else: print 'Invalid type' # Raise proper exception here. # # Create the aby function. # if self.g.dtype is np.dtype('complex128'): # cuda_type = 'pycuda::complex<double>'
# WS 12/13/20 # simple_element_kernel_example0.py # from 'hands-on gpu programming with python and cuda book' import numpy as np import pycuda.autoinit from pycuda import gpuarray from pycuda.elementwise import ElementwiseKernel from time import time host_data = np.float32(np.random.random(50000000)) gpu_2x_ker = ElementwiseKernel("float *in, float *out", "out[i] = 2*in[i]", "gpu_2x_ker") def speedcomparison(): t1 = time() host_data_x2 = host_data * np.float32(2) t2 = time() print('total CPU time: {} sec'.format(t2 - t1)) device_data = gpuarray.to_gpu(host_data) device_data_2x = gpuarray.empty_like(device_data) t1 = time() gpu_2x_ker(device_data, device_data_2x) t2 = time()
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 13 15:13:27 2018

@author: bhaumik
"""
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
from pycuda.elementwise import ElementwiseKernel
import pycuda.autoinit
from pycuda.curandom import rand as curand

# Kernel function: element-wise d_c = d_a + d_b
add = ElementwiseKernel(
    arguments="float *d_a, float *d_b, float *d_c",
    operation="d_c[i] = d_a[i] + d_b[i]",
    name="add",
)

# Two random device vectors of the given length and an output buffer
shape = 1000000
d_a = curand(shape)
d_b = curand(shape)
d_c = gpuarray.empty_like(d_a)

# CUDA events bracketing the kernel launch
start = drv.Event()
end = drv.Event()

start.record()
# Calling kernel
add(d_a, d_b, d_c)
end.record()
end.synchronize()

# time_till is scaled by 1e-3 to obtain seconds
secs = start.time_till(end) * 1e-3
print("Addition of %d element of GPU" % shape)
np.int32(k), block=(32, 1, 1), grid=(n, 1, 1)) return out def tanh(self, x, y): tanh_kernel(x, y) def tanh_deriv(self, x, y, dy, dx): tanh_deriv_kernel(x, y, dy, dx) # --------------------------- Kernel Definitions ---------------------------- # add_into_if_kernel = ElementwiseKernel("float* a, float* out, float* cond", "if (cond[i] != 0) out[i] += a[i]", "add_into_if_kernel") add_mm_kernel = ElementwiseKernel("float* x, float* y, float *out", "out[i] = x[i] + y[i]", "add_mm_kernel") add_st_kernel = ElementwiseKernel("float x, float* y, float *out", "out[i] = x + y[i]", "add_st_kernel") binarize_v_kernel = ElementwiseKernel( "float* out, float* v, int nrows, int ncols", "out[i] = v[i / ncols] == (i % ncols) ? 1.0f : 0.0f", "binarize_v_kernel") broadcast_t_kernel = ElementwiseKernel( "float* out, float* a, unsigned int broadcast_dim, unsigned int stride", "out[i] = a[i % stride + (i / (broadcast_dim * stride)) * stride]", "broadcast_t_kernel")
bn[0] = 0; pycuda::complex<float> val = in[i]; if ( abs(val) > threshold){ int n_w = atomicAdd(bn, 1); outv[n_w] = val; outl[n_w] = i; } """ threshold_kernel = ElementwiseKernel( " %(tp_in)s *in, %(tp_out1)s *outv, %(tp_out2)s *outl, %(tp_th)s threshold, %(tp_n)s *bn" % { "tp_in": dtype_to_ctype(numpy.complex64), "tp_out1": dtype_to_ctype(numpy.complex64), "tp_out2": dtype_to_ctype(numpy.uint32), "tp_th": dtype_to_ctype(numpy.float32), "tp_n": dtype_to_ctype(numpy.uint32), }, threshold_op, "getstuff") import pycuda.driver as drv n = drv.pagelocked_empty((1), numpy.uint32, mem_flags=drv.host_alloc_flags.DEVICEMAP) nptr = numpy.intp(n.base.get_device_pointer()) val = drv.pagelocked_empty((4096 * 256), numpy.complex64, mem_flags=drv.host_alloc_flags.DEVICEMAP) vptr = numpy.intp(val.base.get_device_pointer())
import numpy as np import pycuda.autoinit from pycuda import gpuarray from pycuda.elementwise import ElementwiseKernel mandel_ker = ElementwiseKernel( "pycuda::complex<float> *lattice, float *mandelbrot_graph, int max_iters, float upper_bound", """ mandelbrot_graph[i] = 1; pycuda::complex<float> c = lattice[i]; pycuda::complex<float> z(0,0); for (int j = 0; j < max_iters; j++) { z = z*z + c; if(abs(z) > upper_bound) { mandelbrot_graph[i] = 0; break; } } """, "mandel_ker") def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):
#!/usr/bin/env python
"""
Created on Wed Oct 15 13:12:50 2014

Author: Oren Freifeld
Email: [email protected]
"""
import numpy as np
from pycuda.elementwise import ElementwiseKernel
from pycuda.reduction import ReductionKernel

# err = x - y, element by element.
calc_err_per_sample = ElementwiseKernel(
    arguments="double *x, double *y, double *err",
    operation="err[i] = x[i] - y[i]",
    name="calc_err_per_sample",
)

# Note this likelihood assumes isotropic covariance
# ll = -0.5 * err^2 / sigma_lm^2
calc_ll_per_sample = ElementwiseKernel(
    arguments="double *err, double sigma_lm, double *ll ",
    operation="ll[i] = -0.5 * err[i]*err[i] / (sigma_lm*sigma_lm)",
    name="calc_ll_per_sample",
)

# Same quantity with the sign flipped: 0.5 * err^2 / sigma_lm^2
calc_negative_ll_per_sample = ElementwiseKernel(
    arguments="double *err, double sigma_lm, double *negative_ll ",
    operation="negative_ll[i] = 0.5 * err[i]*err[i] / (sigma_lm*sigma_lm)",
    name="calc_negative_ll_per_sample",
)

# Difference of the forward finite differences of x and y with step dt.
# NOTE(review): the operation reads x[i+1] and y[i+1]; presumably callers
# restrict the launch so the last index is never processed - confirm.
calc_err_by_der_per_sample = ElementwiseKernel(
    arguments="double *x, double *y, double *err_by_der, double dt",
    operation="err_by_der[i] = (x[i+1]-x[i])/dt - (y[i+1]-y[i])/dt",
    name="calc_err_by_der_per_sample",
)
L. Dixon and D. G. Grier, "Flow visualization and flow cytometry with holographic video microscopy," Opt. Express 17, 13071-13079 (2009). HISTORY This code was adapted from the IDL implementation of generalizedlorenzmie__define.pro which was written by David G. Grier. This version is Copyright (c) 2018 David G. Grier ''' safe_division = ElementwiseKernel( "float *x, float *y, float a, float *z", "if (abs(y[i]) > 1e-6) { z[i] = x[i]/y[i]; } else {z[i] = a;};", "safe_division", ) class CudaGeneralizedLorenzMie(GeneralizedLorenzMie): ''' A class that computes scattered light fields with CUDA acceleration ... Attributes ---------- particle : Particle Object representing the particle scattering light instrument : Instrument