def gpuFunc(iterator):
    """Count space-to-word transitions in one partition of text on the GPU.

    The records of ``iterator`` are joined with single spaces, converted to
    one byte code per character and uploaded to CUDA device 0.  A reduction
    kernel then sums, over every pair of neighbouring characters, the
    positions where a space (code 32) is immediately followed by a non-space.

    NOTE(review): a leading word that is not preceded by a space is not
    counted by this scheme -- confirm that callers expect this.

    :param iterator: iterable of strings (for example the records of one
        partition).
    :return: generator yielding exactly one value, the count copied back to
        the host by ``.get()``.
    """
    # 1. Data preparation
    cpu_dataset = " ".join(iterator)
    ascii_data = np.asarray([ord(ch) for ch in cpu_dataset], dtype=np.uint8)

    # 2. Driver initialization and data transfer
    cuda.init()
    contx = cuda.Device(0).make_context()
    gpu_dataset = None
    try:
        gpu_dataset = gpuarray.to_gpu(ascii_data)

        # 3. GPU kernel.
        # ``a`` is the text without its last character and ``b`` the text
        # without its first one, so (a[i], b[i]) is a pair of neighbours.
        # np.int64 replaces the Python-2-only builtin ``long`` as output type.
        countkrnl = reduction.ReductionKernel(
            np.int64,
            neutral="0",
            map_expr="(a[i] == 32)*(b[i] != 32)",
            reduce_expr="a + b",
            arguments="char *a, char *b")
        results = countkrnl(gpu_dataset[:-1], gpu_dataset[1:]).get()
        yield results
    finally:
        # Release GPU context resources even when an exception was raised or
        # the consumer abandoned the generator before exhausting it.
        contx.pop()
        del gpu_dataset
        del contx
        gc.collect()
def gpuPi(iterator):
    """Run one Monte-Carlo sampling pass for a Pi estimate on the GPU.

    One random point ``(a, b)`` in the unit square is drawn per record of
    ``iterator``; the points are uploaded to CUDA device 0 and reduced to a
    single count.

    NOTE(review): despite the name ``pointInsideCircle`` the map expression
    counts points with ``a*a + b*b >= 1.0`` (on or outside the unit circle).
    Left unchanged because callers may already compensate -- please confirm.

    :param iterator: iterable whose length determines the number of samples;
        the records themselves are not inspected.
    :return: generator yielding exactly one value, the count copied back to
        the host by ``.get()``.
    """
    # Only the number of records matters, so count without building a list.
    length = sum(1 for _ in iterator)
    a = np.random.random_sample(length)
    b = np.random.random_sample(length)

    cuda.init()
    contx = cuda.Device(0).make_context()
    gpu_a = gpu_b = None
    try:
        gpu_a = gpuarray.to_gpu(a)
        gpu_b = gpuarray.to_gpu(b)
        # random_sample() returns float64 data, so the kernel arguments must
        # be ``double``; declaring them as ``float`` made the device read
        # reinterpreted bytes instead of the sampled coordinates.
        countkrnl = reduction.ReductionKernel(
            np.float64,
            neutral="0",
            map_expr="(a[i]*a[i] + b[i]*b[i] >= 1.0) ? 1.0 : 0.0",
            reduce_expr="a+b",
            arguments="double *a, double *b")
        pointInsideCircle = countkrnl(gpu_a, gpu_b).get()
        yield pointInsideCircle
    finally:
        # Release GPU context resources even when an exception was raised or
        # the consumer abandoned the generator before exhausting it.
        contx.pop()
        del gpu_a
        del gpu_b
        del contx
        gc.collect()
def get_reduction_kernel(self, reduce_expr, map_expr, neutral, *args):
    """Build a reduction kernel and return a callable that runs it.

    See the PyCUDA documentation of pycuda.reduction.ReductionKernel for
    the meaning of the expressions.

    :param reduce_expr: expression that folds two values into one; it must
        name its operands a and b, e.g. 'a+b'
    :param map_expr: expression applied to the input arrays before reducing;
        the arrays are named x0, x1, ... in the order of ``args``,
        e.g. 'x0[i]*x1[i]'
    :param neutral: neutral element of reduce_expr, e.g. '0'
    :param args: keys into ``self.arrays`` selecting the buffers to reduce
        (presumably device arrays -- verify against the backend); at least
        one is required because the first buffer's dtype is the output dtype
    :return: zero-argument callable that runs the kernel on the selected
        buffers and returns the result of ``.get()``
    """
    buffers = [self.arrays[key] for key in args]
    # One ``const <ctype> *x<n>`` kernel parameter per selected buffer.
    signature = ', '.join(
        'const {0} *x{1}'.format(
            pycuda.tools.dtype_to_ctype(buf.dtype), position)
        for position, buf in enumerate(buffers))
    kernel = reduction.ReductionKernel(
        buffers[0].dtype,
        neutral=neutral,
        reduce_expr=reduce_expr,
        map_expr=map_expr,
        arguments=signature)

    def run():
        return kernel(*buffers).get()

    return run
def createCudaWordCountKernel():
    """Return a PyCUDA reduction kernel that counts space-to-word transitions.

    The kernel takes two character buffers ``a`` and ``b`` and sums, over all
    indices, the positions where ``a[i]`` is a space (ASCII code 32) and
    ``b[i]`` is not.

    NOTE(review): the accumulator type is float32, which represents integers
    exactly only up to 2**24 -- confirm this is sufficient for the inputs.
    """
    return reduction.ReductionKernel(
        numpy.float32,
        arguments="char* a, char* b",
        map_expr="(a[i] == 32)*(b[i] != 32)",  # 32 is the ASCII code of ' '
        reduce_expr="a+b",
        neutral="0")
def maxabs(x_gpu):
    """
    Get maximum absolute value.

    Reduce the specified array to the largest absolute value it contains.
    The reduction kernel is built once per input dtype and kept in
    `maxabs.cache` for later calls.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.

    Returns
    -------
    m_gpu : pycuda.gpuarray.GPUArray
        Array containing maximum absolute value in `x_gpu`.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import misc
    >>> x_gpu = gpuarray.to_gpu(np.array([-1, 2, -3], np.float32))
    >>> m_gpu = misc.maxabs(x_gpu)
    >>> np.allclose(m_gpu.get(), 3.0)
    True

    """
    dtype = x_gpu.dtype
    if dtype not in maxabs.cache:
        # Double-precision inputs (real or complex) reduce to float64,
        # everything else to float32.
        if dtype in (np.float64, np.complex128):
            ret_type = np.float64
        else:
            ret_type = np.float32
        maxabs.cache[dtype] = reduction.ReductionKernel(
            ret_type,
            neutral="0",
            reduce_expr="max(a,b)",
            map_expr="abs(x[i])",
            arguments=tools.dtype_to_ctype(dtype) + " *x")
    return maxabs.cache[dtype](x_gpu)
def gpuFunc(iterator):
    """Multiply the damped matrix rows of one partition with PVector on the GPU.

    Records starting with ``#`` are read as ``# <nodeID> <p0> <p1> ...``
    (single-space separated).  Each probability is damped to
    ``p*0.85 + 0.15/len(row)``, the row is uploaded to CUDA device 0 and
    its dot product with the module-level ``PVector`` is stored at index
    ``nodeID`` of the result list.  All other records are skipped.

    :param iterator: iterable of text records.
    :return: generator yielding exactly one list; it has one slot per column
        of the first processed row and stays empty when no row was processed.
    """
    cpu_data = list(iterator)

    # Single-argument print call: identical output under Python 2 and 3.
    print("PVector %s" % (PVector,))

    dpFactor = 0.85
    pList = []

    cuda.init()
    contx = cuda.Device(0).make_context()
    gpu_matVect = gpu_pVect = None
    try:
        # The kernel does not depend on the row, so build it once.
        countkrnl = reduction.ReductionKernel(
            np.float32,
            neutral="0",
            map_expr="a[i]*b[i]",
            reduce_expr="a+b",
            arguments="float *a, float *b")

        for record in cpu_data:
            # startswith() also copes with empty records, which used to
            # raise IndexError on ``record[0]``.
            if not record.startswith('#'):
                continue

            firstSpaceIndex = record.find(' ')
            secondSpaceIndex = record.find(' ', firstSpaceIndex + 1)
            nodeID = int(record[firstSpaceIndex + 1:secondSpaceIndex])
            probabilities = [
                float(token)
                for token in record[secondSpaceIndex + 1:].split(' ')]
            matSize = len(probabilities)
            teleport = (1 - dpFactor) / matSize
            npProbListFlt = np.asarray(
                [p * dpFactor + teleport for p in probabilities],
                dtype=np.float32)

            if gpu_pVect is None:
                # PVector is the same for every row: upload it only once.
                gpu_pVect = gpuarray.to_gpu(np.asarray(PVector, np.float32))
            gpu_matVect = gpuarray.to_gpu(npProbListFlt)
            result = countkrnl(gpu_matVect, gpu_pVect).get()

            if not pList:
                # Size the output on the first processed row; sizing it only
                # when the very first record was a row left the list empty
                # (and the assignment below failing) otherwise.
                pList = [0.0] * matSize
            pList[nodeID] = result

        yield pList
    finally:
        # Release GPU context resources even when an exception was raised,
        # no row was processed, or the generator was abandoned early.
        contx.pop()
        del gpu_matVect
        del gpu_pVect
        del contx
        gc.collect()
import numpy as np
import pycuda.autoinit  # imported for its side effect: sets up a CUDA context
from pycuda import gpuarray, reduction

# Sum of squares of 0..1000, computed on the GPU.
# The integers are uploaded as uint32; each element is converted to float
# before squaring and the partial sums are accumulated in float32.
x = np.arange(1001, dtype=np.uint32)
x_gpu = gpuarray.to_gpu(x)

kernel = reduction.ReductionKernel(
    np.float32,
    neutral="0.0",
    reduce_expr="a + b",
    map_expr="(float)x[i] * x[i]",
    arguments="unsigned int* x",
)

result = kernel(x_gpu).get()
print(result)
import pycuda.reduction as rd
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit  # imported for its side effect: sets up a CUDA context
import numpy as np

# Dot product of two identical ramps 0..399, computed with a GPU reduction.
a = gpuarray.arange(400, dtype=np.float32)
b = gpuarray.arange(400, dtype=np.float32)

krnl = rd.ReductionKernel(
    np.float32,
    arguments='float *x, float *y',
    map_expr='x[i]*y[i]',
    reduce_expr='a+b',
    neutral='0')

my_dot_prod = krnl(a, b).get()

# Single-argument print calls give the same output under Python 2 and 3.
print(my_dot_prod)
# Host-side reference value for comparison with the GPU result.
print(np.sum(np.arange(400)**2))
def __init__(self):
    """Set up grids, CUDA constants and GPU kernels for the 2D propagator.

    Reads from the instance (these must be set before this runs):
    ``gridDIM_x``, ``gridDIM_y``, ``gridDIM_p_x``, ``gridDIM_p_y``, the
    ``x``/``y``/``p_x``/``p_y``/``theta_x``/``theta_y``/``lambda_x``/
    ``lambda_y`` ``*_amplitude`` values, ``dt``, ``mass``, ``kineticString``
    and ``potentialString`` (C expressions in ``p_x, p_y`` and ``x, y``).
    ``D_lambda_x``, ``D_lambda_y``, ``D_theta_x`` and ``D_theta_y`` are
    optional; they are exported to CUDA only when present.

    Creates the step sizes, coordinate ranges and broadcastable grids, the
    ``CUDA_constants`` preamble, the ``W_init_gpu`` buffer, the index
    unpacking snippets, the element-wise kernels and the reduction kernels
    for the phase-space averages and the energy.
    """
    import re

    def device_double(name, value, end=' '):
        # repr(float(...)) keeps every significant digit.  The former '%f'
        # formatting truncated to six decimals, so for instance a time step
        # of 1e-7 reached the device as 0.000000.
        return '__device__ double %s = %s;%s' % (name, repr(float(value)), end)

    def as_macro_calls(expression, names):
        # Rewrite the stand-alone identifiers in ``names`` as ``name(i)`` so
        # they resolve to the index macros defined further down.  Matching
        # whole identifiers only keeps e.g. ``exp`` from being turned into
        # ``ex(i)p`` as the former plain str.replace did.
        for name in names:
            expression = re.sub(r'\b%s\b' % name, name + '(i)', expression)
        return expression

    def gibibytes(byte_count):
        return byte_count / float(2**30)

    self.FAFT_axes0 = 0
    self.FAFT_axes1 = 1
    self.FAFT_axes2 = 2
    self.FAFT_axes3 = 3

    self.FAFT_segment_axes0 = 0
    self.FAFT_segment_axes1 = 0
    self.FAFT_segment_axes2 = 0
    self.FAFT_segment_axes3 = 0

    self.NF = 1

    # Phase space step size
    self.dp_y = 2 * self.p_y_amplitude / float(self.gridDIM_p_y)  # axis 0
    self.dy = 2 * self.y_amplitude / float(self.gridDIM_y)  # axis 1
    self.dp_x = 2 * self.p_x_amplitude / float(self.gridDIM_p_x)  # axis 2
    self.dx = 2 * self.x_amplitude / float(self.gridDIM_x)  # axis 3

    # Ambiguity space step size
    self.dtheta_y = 2 * self.theta_y_amplitude / float(self.gridDIM_y)  # axis 0
    self.dlambda_y = 2 * self.lambda_y_amplitude / float(self.gridDIM_p_y)  # axis 1
    self.dtheta_x = 2 * self.theta_x_amplitude / float(self.gridDIM_x)  # axis 2
    self.dlambda_x = 2 * self.lambda_x_amplitude / float(self.gridDIM_p_x)  # axis 3

    # delta parameters
    self.delta_p_y = self.dp_y * self.dtheta_y / (2 * np.pi)  # axis 0
    self.delta_y = self.dy * self.dlambda_y / (2 * np.pi)  # axis 1
    self.delta_p_x = self.dp_x * self.dtheta_x / (2 * np.pi)  # axis 2
    self.delta_x = self.dx * self.dlambda_x / (2 * np.pi)  # axis 3

    # Phase space ranges: the upper end point is left out (amplitude - step)
    self.p_y_range = np.linspace(
        -self.p_y_amplitude, self.p_y_amplitude - self.dp_y,
        self.gridDIM_p_y)  # axis 0
    self.y_range = np.linspace(
        -self.y_amplitude, self.y_amplitude - self.dy,
        self.gridDIM_y)  # axis 1
    self.p_x_range = np.linspace(
        -self.p_x_amplitude, self.p_x_amplitude - self.dp_x,
        self.gridDIM_p_x)  # axis 2
    self.x_range = np.linspace(
        -self.x_amplitude, self.x_amplitude - self.dx,
        self.gridDIM_x)  # axis 3

    # Ambiguity space ranges
    self.theta_y_range = np.linspace(
        -self.theta_y_amplitude, self.theta_y_amplitude - self.dtheta_y,
        self.gridDIM_y)  # axis 0
    self.lambda_y_range = np.linspace(
        -self.lambda_y_amplitude, self.lambda_y_amplitude - self.dlambda_y,
        self.gridDIM_p_y)  # axis 1
    self.theta_x_range = np.linspace(
        -self.theta_x_amplitude, self.theta_x_amplitude - self.dtheta_x,
        self.gridDIM_x)  # axis 2
    self.lambda_x_range = np.linspace(
        -self.lambda_x_amplitude, self.lambda_x_amplitude - self.dlambda_x,
        self.gridDIM_p_x)  # axis 3

    # Grid: each coordinate varies along its own axis of a 4D array
    self.y = self.y_range[np.newaxis, :, np.newaxis, np.newaxis]  # axis 1
    self.p_y = self.p_y_range[:, np.newaxis, np.newaxis, np.newaxis]  # axis 0
    self.p_x = self.p_x_range[np.newaxis, np.newaxis, :, np.newaxis]  # axis 2
    self.x = self.x_range[np.newaxis, np.newaxis, np.newaxis, :]  # axis 3

    # Constants shared by every kernel through its preamble.
    self.CUDA_constants = '\n'
    self.CUDA_constants += device_double('dt', self.dt)
    self.CUDA_constants += device_double('mass', self.mass, ' \n')

    self.CUDA_constants += device_double('dp_y', self.dp_y)
    self.CUDA_constants += device_double('dy', self.dy)
    self.CUDA_constants += device_double('dp_x', self.dp_x)
    self.CUDA_constants += device_double('dx', self.dx, ' \n')

    self.CUDA_constants += device_double('dtheta_y', self.dtheta_y)
    self.CUDA_constants += device_double('dlambda_y', self.dlambda_y)
    self.CUDA_constants += device_double('dtheta_x', self.dtheta_x)
    self.CUDA_constants += device_double('dlambda_x', self.dlambda_x, ' \n')

    self.CUDA_constants += '__device__ int gridDIM_x = %d; ' % self.gridDIM_x
    self.CUDA_constants += '__device__ int gridDIM_y = %d; ' % self.gridDIM_y

    # The diffusion coefficients are optional: export those that exist and
    # stop at the first one that is missing.
    try:
        self.CUDA_constants += device_double('D_lambda_x', self.D_lambda_x)
        self.CUDA_constants += device_double('D_lambda_y', self.D_lambda_y)
        self.CUDA_constants += device_double('D_theta_x', self.D_theta_x)
        self.CUDA_constants += device_double('D_theta_y', self.D_theta_y, ' \n')
    except AttributeError:
        pass

    self.CUDA_constants += '\n'

    # Single-argument print calls give the same output under Python 2 and 3.
    print(self.CUDA_constants)

    # ........................................................................

    free_bytes, total_bytes = pycuda.driver.mem_get_info()
    print(' GPU memory Total  %s GB' % (gibibytes(total_bytes),))
    print(' GPU memory Free (Before)  %s GB' % (gibibytes(free_bytes),))

    self.W_init_gpu = gpuarray.zeros(
        (self.gridDIM_p_y, self.gridDIM_y, self.gridDIM_p_x, self.gridDIM_x),
        dtype=np.complex128)

    print(' GPU memory Free (After)  %s GB'
          % (gibibytes(pycuda.driver.mem_get_info()[0]),))

    # ........................................................................
    # Snippets that turn the flat kernel index ``i`` into four grid indices
    # and the matching coordinates.
    # NOTE(review): the momentum-like indices are unpacked with gridDIM_x and
    # gridDIM_y, which looks like it assumes gridDIM_p_x == gridDIM_x and
    # gridDIM_p_y == gridDIM_y -- confirm.

    self.indexUnpack_x_p_string = """
        int i_x   = i%gridDIM_x;
        int i_p_x = (i/gridDIM_x) % gridDIM_x;
        int i_y   = (i/(gridDIM_x*gridDIM_x)) % gridDIM_y;
        int i_p_y = i/(gridDIM_x*gridDIM_x*gridDIM_y);

        double x   = dx  *( i_x   - gridDIM_x/2 );
        double p_x = dp_x*( i_p_x - gridDIM_x/2 );
        double y   = dy  *( i_y   - gridDIM_y/2 );
        double p_y = dp_y*( i_p_y - gridDIM_y/2 );
        """

    self.indexUnpack_lambda_theta_string = """
        int i_x   = i%gridDIM_x;
        int i_p_x = (i/gridDIM_x) % gridDIM_x;
        int i_y   = (i/(gridDIM_x*gridDIM_x)) % gridDIM_y;
        int i_p_y = i/(gridDIM_x*gridDIM_x*gridDIM_y);

        double lambda_x = dlambda_x * ( i_x   - gridDIM_x/2 );
        double theta_x  = dtheta_x  * ( i_p_x - gridDIM_x/2 );
        double lambda_y = dlambda_y * ( i_y   - gridDIM_y/2 );
        double theta_y  = dtheta_y  * ( i_p_y - gridDIM_y/2 );
        """

    self.indexUnpack_lambda_p_string = """
        int i_x   = i%gridDIM_x;
        int i_p_x = (i/gridDIM_x) % gridDIM_x;
        int i_y   = (i/(gridDIM_x*gridDIM_x)) % gridDIM_y;
        int i_p_y = i/(gridDIM_x*gridDIM_x*gridDIM_y);

        double lambda_x = dlambda_x*( i_x   - gridDIM_x/2 );
        double p_x      = dp_x     *( i_p_x - gridDIM_x/2 );
        double lambda_y = dlambda_y*( i_y   - gridDIM_y/2 );
        double p_y      = dp_y     *( i_p_y - gridDIM_y/2 );
        """

    self.indexUnpack_x_theta_string = """
        int i_x   = i%gridDIM_x;
        int i_p_x = (i/gridDIM_x) % gridDIM_x;
        int i_y   = (i/(gridDIM_x*gridDIM_x)) % gridDIM_y;
        int i_p_y = i/(gridDIM_x*gridDIM_x*gridDIM_y);

        double x       = dx      *( i_x   - gridDIM_x/2 );
        double theta_x = dtheta_x*( i_p_x - gridDIM_x/2 );
        double y       = dy      *( i_y   - gridDIM_y/2 );
        double theta_y = dtheta_y*( i_p_y - gridDIM_y/2 );
        """

    # ........................................................................
    # Initial states

    self.Gaussian_GPU = ElementwiseKernel(
        """pycuda::complex<double> *W,
           double mu_p_y, double mu_y, double mu_p_x, double mu_x,
           double sigma_p_y, double sigma_y, double sigma_p_x, double sigma_x """,
        self.indexUnpack_x_p_string + """
        double temp = exp(-0.5*( x - mu_x )*( x - mu_x )/( sigma_x * sigma_x ) );
        temp *= exp(-0.5*( y - mu_y )*( y - mu_y )/( sigma_y * sigma_y ) );
        temp *= exp(-0.5*( p_x - mu_p_x )*( p_x - mu_p_x )/( sigma_p_x * sigma_p_x ) );
        temp *= exp(-0.5*( p_y - mu_p_y )*( p_y - mu_p_y )/( sigma_p_y * sigma_p_y ) );
        W[i] = pycuda::complex<double>( temp , 0. );
        """,
        "Gaussian",
        preamble="#define _USE_MATH_DEFINES" + self.CUDA_constants)

    # The kernel name "Gaussian" is shared with the kernel above; it is kept
    # as it was.
    self.HOscillatorGound_GPU = ElementwiseKernel(
        """pycuda::complex<double> *W,
           double x_mu, double y_mu, double p_x_mu, double p_y_mu,
           double omega_x, double omega_y, double mass""",
        self.indexUnpack_x_p_string + """
        double temp = (mass*pow( omega_x*(x-x_mu) ,2) + pow(p_x-p_x_mu,2)/mass)/omega_x;
        temp += (mass*pow( omega_y*(y-y_mu) ,2) + pow(p_y-p_y_mu,2)/mass)/omega_y;
        W[i] = pycuda::complex<double>( exp(-temp) , 0. );
        """,
        "Gaussian",
        preamble="#define _USE_MATH_DEFINES" + self.CUDA_constants)

    # Kinetic propagator .....................................................

    kineticStringC = ('__device__ double K(double p_x, double p_y){ \n return '
                      + self.kineticString + ';\n}')

    # NOTE(review): the lambda_x damping term uses D_lambda_y although
    # D_lambda_x is exported as well -- possible copy-paste slip, left
    # unchanged pending confirmation.
    self.exp_p_lambda_GPU = ElementwiseKernel(
        """ pycuda::complex<double> *B """,
        self.indexUnpack_lambda_p_string + """
        double r = exp( - dt*D_lambda_y * lambda_x*lambda_x );
        r *= exp( - dt*D_lambda_y * lambda_y*lambda_y );

        double phase = dt*K(p_x + 0.5*lambda_x, p_y + 0.5*lambda_y) - dt*K(p_x - 0.5*lambda_x, p_y - 0.5*lambda_y);

        B[i] *= pycuda::complex<double>( r*cos(phase), -r*sin(phase) );
        """,
        "exp_p_lambda_GPU",
        preamble=("#define _USE_MATH_DEFINES" + self.CUDA_constants
                  + kineticStringC))

    # Potential propagator ...................................................

    potentialStringC = ('__device__ double V(double x, double y){ \n return '
                        + self.potentialString + ';\n}')

    # NOTE(review): the theta_x damping term uses D_theta_y although
    # D_theta_x is exported as well -- possible copy-paste slip, left
    # unchanged pending confirmation.
    self.exp_x_theta_GPU = ElementwiseKernel(
        """ pycuda::complex<double> *B """,
        self.indexUnpack_x_theta_string + """
        double phase = dt*V(x-0.5*theta_x , y-0.5*theta_y) - dt*V( x+0.5*theta_x , y+0.5*theta_y );

        double r = exp( - dt*D_theta_y * theta_x*theta_x - dt*D_theta_y * theta_y*theta_y );

        B[i] *= pycuda::complex<double>( r*cos(phase), -r*sin(phase) );
        """,
        "exp_x_theta_GPU",
        preamble=("#define _USE_MATH_DEFINES" + self.CUDA_constants
                  + potentialStringC))

    # Ehrenfest theorems .....................................................
    # Macros mapping the flat index ``i`` to a coordinate; they use the same
    # index layout as the unpacking snippets above.

    x_Define = "\n#define x(i) dx*( (i%gridDIM_x) - 0.5*gridDIM_x )\n"
    p_x_Define = "\n#define p_x(i) dp_x*( ((i/gridDIM_x) % gridDIM_x)-0.5*gridDIM_x)\n"
    y_Define = "\n#define y(i) dy *( (i/(gridDIM_x*gridDIM_x)) % gridDIM_y - 0.5*gridDIM_y)\n"
    p_y_Define = "\n#define p_y(i) dp_y*( i/(gridDIM_x*gridDIM_x*gridDIM_y) - 0.5*gridDIM_y )\n"

    phaseSpaceDefine = p_x_Define + p_y_Define + x_Define + y_Define

    def phase_space_sum(map_expr, defines):
        # Reduction that adds up the real part of ``map_expr`` over W.
        return reduction.ReductionKernel(
            np.float64,
            neutral="0",
            reduce_expr="a+b",
            map_expr=map_expr,
            arguments="pycuda::complex<double> *W",
            preamble="#define _USE_MATH_DEFINES" + defines + self.CUDA_constants)

    self.Average_x_GPU = phase_space_sum(
        "pycuda::real<double>( x(i)*dx*dy*dp_x*dp_y*W[i] )", x_Define)
    self.Average_x_square_GPU = phase_space_sum(
        "pycuda::real<double>( x(i)*x(i)*dx*dy*dp_x*dp_y*W[i] )", x_Define)

    self.Average_p_x_GPU = phase_space_sum(
        "pycuda::real<double>( p_x(i)*dx*dy*dp_x*dp_y*W[i] )", p_x_Define)
    self.Average_p_x_square_GPU = phase_space_sum(
        "pycuda::real<double>( p_x(i)*p_x(i)*dx*dy*dp_x*dp_y*W[i] )",
        p_x_Define)

    self.Average_y_GPU = phase_space_sum(
        "pycuda::real<double>( y(i)*dx*dy*dp_x*dp_y*W[i] )", y_Define)
    self.Average_p_y_GPU = phase_space_sum(
        "pycuda::real<double>( p_y(i)*dx*dy*dp_x*dp_y*W[i] )", p_y_Define)

    self.Average_y_square_GPU = phase_space_sum(
        "pycuda::real<double>( y(i)*y(i)*dx*dy*dp_x*dp_y*W[i] )", y_Define)
    self.Average_p_y_square_GPU = phase_space_sum(
        "pycuda::real<double>( p_y(i)*p_y(i)*dx*dy*dp_x*dp_y*W[i] )",
        p_y_Define)

    # Energy: kinetic plus potential expression, with the coordinates
    # rewritten as calls of the index macros.
    kineticString = as_macro_calls(self.kineticString, ('p_x', 'p_y'))
    potentialString = as_macro_calls(self.potentialString, ('x', 'y'))

    energyString = kineticString + "+" + potentialString

    print("\n")
    print(energyString)

    self.Energy_GPU = phase_space_sum(
        "pycuda::real<double>((" + energyString + ")*dx*dy*dp_x*dp_y*W[i])",
        phaseSpaceDefine)