def test_struct_reduce(ctx_factory):
    """Reduce random int32 data into a min/max struct and compare with NumPy.

    Skipped when ``mako`` is unavailable, and on one specific Apple/NVIDIA
    driver build where the kernel compile hangs.
    """
    pytest.importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # The test expects a single-device context; unpacking fails otherwise.
    (dev,) = context.devices
    known_compiler_hang = (
        dev.vendor == "NVIDIA"
        and dev.platform.vendor == "Apple"
        and dev.driver_version == "8.12.47 310.40.00.05f01"
    )
    if known_compiler_hang:
        pytest.skip("causes a compiler hang on Apple/Nv GPU")

    mmc_dtype, mmc_c_decl = make_mmc_dtype(context.devices[0])

    # Struct declaration followed by the three helpers the reduction calls.
    preamble = mmc_c_decl + r"""//CL//

    minmax_collector mmc_neutral()
    {
        // FIXME: needs infinity literal in real use, ok here
        minmax_collector result;
        result.cur_min = 1<<30;
        result.cur_max = -(1<<30);
        return result;
    }

    minmax_collector mmc_from_scalar(float x)
    {
        minmax_collector result;
        result.cur_min = x;
        result.cur_max = x;
        return result;
    }

    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
    {
        minmax_collector result = a;
        if (b.cur_min < result.cur_min)
            result.cur_min = b.cur_min;
        if (b.cur_max > result.cur_max)
            result.cur_max = b.cur_max;
        return result;
    }

    """

    from pyopencl.clrandom import rand as clrand
    from pyopencl.reduction import ReductionKernel

    a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
    a = a_gpu.get()

    red = ReductionKernel(
        context,
        mmc_dtype,
        neutral="mmc_neutral()",
        reduce_expr="agg_mmc(a, b)",
        map_expr="mmc_from_scalar(x[i])",
        arguments="__global int *x",
        preamble=preamble,
    )
    minmax = red(a_gpu).get()

    expected_min = np.min(a)
    expected_max = np.max(a)
    assert abs(minmax["cur_min"] - expected_min) < 1e-5
    assert abs(minmax["cur_max"] - expected_max) < 1e-5
def __init__(self, context, queue):
    """
    Store the OpenCL handles, load ``lsqr.cl`` and build the helper kernels.

    @param context OpenCL context the program and kernels are built for.
    @param queue OpenCL command queue.
    """
    self.context = context
    self.queue = queue

    program_path = clUtils.path() + "/lsqr.cl"
    self.program = clUtils.loadProgram(context, program_path)

    # Device-side objects start out empty; they are generated at the
    # first iteration.
    self.A = self.B = self.X0 = self.X = self.R = None

    # Dot product: sum over i of x[i]*y[i].
    self.dot = ReductionKernel(
        context,
        np.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="x[i]*y[i]",
        arguments="__global float *x, __global float *y",
    )

    # In-place scaling of a vector by a scalar.
    self.dot_c_vec = ElementwiseKernel(
        context, "float c, float *v", "v[i] *= c")

    # Element-by-element copy.
    self.copy_vec = ElementwiseKernel(
        context, "float* out, float *in", "out[i] = in[i]")

    # z = a*x + b*y
    self.linear_comb = ElementwiseKernel(
        context,
        "float* z,"
        "float a, float *x, "
        "float b, float *y",
        "z[i] = a*x[i] + b*y[i]",
    )

    # Element-wise product z = x*y.
    self.prod = ElementwiseKernel(
        context,
        "float* z,"
        "float *x, float *y",
        "z[i] = x[i]*y[i]",
    )
def qubit_probability(self, target):
    """Get the probability of a single qubit being measured as '0'.

    Sums ``|amps[i]|**2`` over every index ``i`` whose bit ``target`` is
    clear, using a pyopencl ``ReductionKernel`` over ``self.buffer``.

    :param target: bit position tested with ``i & (1 << target)``.
    :returns: the host-side result of the reduction (``.get()``).
    """
    preamble = """
    #include <pyopencl-complex.h>

    float probability(int target, int i, cfloat_t amp) {
        if ((i & (1 << target )) != 0) {
            return 0;
        }
        // return 6.0;
        float abs = cfloat_abs(amp);
        return abs * abs;
    }
    """

    # NOTE(review): `context` is not defined inside this method; presumably
    # a module-level OpenCL context -- confirm against the rest of the file.
    kernel = ReductionKernel(
        context,
        # `np.float` was only an alias of the builtin `float` (float64) and
        # was removed in NumPy 1.24; spell the same dtype explicitly.
        np.float64,
        neutral="0",
        reduce_expr="a + b",
        map_expr="probability(target, i, amps[i])",
        # `target` is passed by value, so it carries no address-space
        # qualifier.
        arguments="__global cfloat_t *amps, int target",
        preamble=preamble,
    )

    return kernel(self.buffer, target).get()
def errest(self, x, y, z):
    """Return a kernel object that reduces a scaled squared error.

    The reduction sums ``pow(x[i]/(atol + rtol*max(|y[i]|, |z[i]|)), 2)``
    over ``x.leaddim * x.nrow`` elements.

    :param x, y, z: matrices exposing ``traits``, ``data`` and (for ``x``)
        ``leaddim``, ``nrow`` and ``dtype``.
    :returns: an ``ErrestKernel`` whose ``run(queue, atol, rtol)`` launches
        the reduction and whose ``retval`` fetches the result.
    :raises ValueError: if the three matrices do not share the same traits.
    """
    # All three operands must agree.  A chained `a != b != c` means
    # `a != b and b != c`, which misses e.g. (A, B, B) and (A, A, B).
    if not (x.traits == y.traits == z.traits):
        raise ValueError('Incompatible matrix types')

    cnt = x.leaddim * x.nrow
    dtype = x.dtype

    # Build the reduction kernel
    rkern = ReductionKernel(
        self.backend.ctx, dtype, neutral='0', reduce_expr='a + b',
        map_expr='pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2)',
        arguments='__global {0}* x, __global {0}* y, __global {0}* z, '
                  '{0} atol, {0} rtol'.format(npdtype_to_ctype(dtype)))

    class ErrestKernel(ComputeKernel):
        @property
        def retval(self):
            # Copies the reduction result back to the host.
            return self._retarr.get()

        def run(self, queue, atol, rtol):
            qcomp = queue.cl_queue_comp

            # Wrap the existing buffers as flat arrays of `cnt` elements.
            xarr = Array(qcomp, cnt, dtype, data=x.data)
            yarr = Array(qcomp, cnt, dtype, data=y.data)
            zarr = Array(qcomp, cnt, dtype, data=z.data)

            self._retarr = rkern(xarr, yarr, zarr, atol, rtol,
                                 queue=qcomp)

    return ErrestKernel()
def __init__(self, ctx, queue, data, symmetry_modes):
    """Store the inputs, build ``sandpile.cl`` and create the diff kernel.

    The program is built with options derived from ``data`` and
    ``symmetry_modes``; ``self._diff_krnl`` counts the positions where two
    grids differ.
    """
    self._ctx = ctx
    self._queue = queue
    self.symmetry_modes = symmetry_modes
    self.data = data

    ctype = dtype_to_ctype(data.dtype)

    with open('sandpile.cl') as source_file:
        program = cl.Program(self._ctx, source_file.read())

    # Preprocessor macros are turned into compiler options for the build.
    build_options = _macros_to_options(_gen_macros(data, symmetry_modes))
    self._program = program.build(options=build_options)

    from pyopencl.reduction import ReductionKernel

    grid_args = 'const __global %s *grid, const __global %s *new_grid' % (
        ctype, ctype)
    self._diff_krnl = ReductionKernel(
        self._ctx,
        numpy.uint32,
        neutral='0',
        reduce_expr='a+b',
        map_expr='grid[i]!=new_grid[i]',
        arguments=grid_args,
    )
def argmin_kernal(context):
    """Build a ReductionKernel that tracks the minimum value and its index.

    The kernel maps every ``x[i]`` to an ``argmin_collector`` struct
    (``cur_min``, ``cur_index``) and keeps the struct with the smaller
    ``cur_min`` when combining two of them.

    :param context: OpenCL context; its first device is used to match the
        struct layout.
    :returns: the constructed ``ReductionKernel``.
    """
    import numpy as np
    from pyopencl.reduction import ReductionKernel
    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct

    # The struct layout has to be matched against a concrete device.  The
    # original referenced an undefined name `device`; use the context's
    # first device.
    device = context.devices[0]

    mmc_dtype = np.dtype([
        ("cur_min", np.float32),
        ("cur_index", np.int32),
        ("pad", np.int32),
    ])

    name = "argmin_collector"
    mmc_dtype, mmc_c_decl = match_dtype_to_c_struct(device, name, mmc_dtype)
    mmc_dtype = get_or_register_dtype(name, mmc_dtype)

    preamble = mmc_c_decl + r"""//CL//

    argmin_collector mmc_neutral()
    {
        // FIXME: needs infinity literal in real use, ok here
        argmin_collector result;
        result.cur_min = INFINITY;
        result.cur_index = -1;
        return result;
    }

    argmin_collector mmc_from_scalar(float x,int index)
    {
        argmin_collector result;
        result.cur_min = x;
        result.cur_index = index;
        return result;
    }

    argmin_collector agg_mmc(argmin_collector a, argmin_collector b)
    {
        argmin_collector result = a;
        if (b.cur_min < result.cur_min)
        {
            result.cur_min = b.cur_min;
            result.cur_index = b.cur_index;
        }
        return result;
    }

    """

    red = ReductionKernel(context, mmc_dtype,
                          neutral="mmc_neutral()",
                          reduce_expr="agg_mmc(a, b)",
                          map_expr="mmc_from_scalar(x[i],i)",
                          arguments="__global int *x",
                          preamble=preamble)
    return red
def make_reduction_krnl(self):
    """Create ``self.krnl``: the sum over i of ``(y_pred[i] - y_true[i])**2``."""
    kernel_arguments = (
        "__global const float* y_true, __global const float* y_pred"
    )
    squared_difference = "pow(y_pred[i] - y_true[i], 2)"

    self.krnl = ReductionKernel(
        self.ctx,
        cltypes.float,
        neutral="0",
        reduce_expr="a+b",
        map_expr=squared_difference,
        arguments=kernel_arguments,
        name="mse_reduction_kernel",
    )
def make_reduction_krnl(self):
    """Create ``self.krnl``: the sum over i of ``y_true[i] * -log(y_pred[i])``."""
    kernel_arguments = (
        "__global const float* y_true, __global const float* y_pred"
    )
    # y_true plays the role of the true distribution (p), y_pred the
    # predicted one (q).
    per_element_term = "y_true[i] * (-log(y_pred[i]))"

    self.krnl = ReductionKernel(
        self.ctx,
        cltypes.float,
        neutral="0",
        reduce_expr="a+b",
        map_expr=per_element_term,
        arguments=kernel_arguments,
        name="categorical_crossentropy_reduction_kernel",
    )
def test_struct_reduce(ctx_factory):
    """Check a struct-valued min/max reduction against ``np.min``/``np.max``."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    mmc_dtype, mmc_c_decl = make_mmc_dtype(context.devices[0])

    # Helper functions used as neutral element, map step and reduce step.
    helper_source = r"""//CL//

    minmax_collector mmc_neutral()
    {
        // FIXME: needs infinity literal in real use, ok here
        minmax_collector result;
        result.cur_min = 1<<30;
        result.cur_max = -(1<<30);
        return result;
    }

    minmax_collector mmc_from_scalar(float x)
    {
        minmax_collector result;
        result.cur_min = x;
        result.cur_max = x;
        return result;
    }

    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
    {
        minmax_collector result = a;
        if (b.cur_min < result.cur_min)
            result.cur_min = b.cur_min;
        if (b.cur_max > result.cur_max)
            result.cur_max = b.cur_max;
        return result;
    }

    """
    preamble = mmc_c_decl + helper_source

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
    a = a_gpu.get()

    from pyopencl.reduction import ReductionKernel
    kernel_spec = dict(
        neutral="mmc_neutral()",
        reduce_expr="agg_mmc(a, b)",
        map_expr="mmc_from_scalar(x[i])",
        arguments="__global int *x",
        preamble=preamble,
    )
    red = ReductionKernel(context, mmc_dtype, **kernel_spec)

    minmax = red(a_gpu).get()

    assert abs(minmax["cur_min"] - np.min(a)) < 1e-5
    assert abs(minmax["cur_max"] - np.max(a)) < 1e-5
def compile_kernels(self):
    """Compile the base kernels and build the three statistics reductions.

    ``reduction_double`` uses the double-precision reduce function only when
    the device advertises ``cl_khr_fp64``; otherwise it aliases
    ``reduction_comp``.
    """
    OpenclProcessing.compile_kernels(
        self, self.kernel_files, "-D NIMAGE=%i" % self.size)
    compiler_options = self.get_compiler_options(x87_volatile=True)
    src = concatenate_cl_kernel(("doubleword.cl", "statistics.cl"))

    def build_reduction(reduce_expr):
        # Every reduction shares the map step, arguments, preamble and
        # options; only the reduce expression differs.
        return ReductionKernel(
            self.ctx,
            dtype_out=float8,
            neutral=zero8,
            map_expr="map_statistics(data, i)",
            reduce_expr=reduce_expr,
            arguments="__global float *data",
            preamble=src,
            options=compiler_options,
        )

    self.reduction_comp = build_reduction("reduce_statistics(a,b)")
    self.reduction_simple = build_reduction("reduce_statistics_simple(a,b)")

    if "cl_khr_fp64" not in self.device.extensions:
        logger.info("Device %s does not support double-precision arithmetics, fall-back on compensated one", self.device)
        self.reduction_double = self.reduction_comp
    else:
        self.reduction_double = build_reduction(
            "reduce_statistics_double(a,b)")
def compile_kernels(self):
    """Compile the base kernels and build the two statistics reductions."""
    OpenclProcessing.compile_kernels(
        self, self.kernel_files, "-D NIMAGE=%i" % self.size)
    compiler_options = self.get_compiler_options(x87_volatile=True)
    src = concatenate_cl_kernel(("kahan.cl", "statistics.cl"))

    # Settings common to both reductions; only `reduce_expr` differs.
    shared = dict(
        dtype_out=float8,
        neutral=zero8,
        map_expr="map_statistics(data, i)",
        arguments="__global float *data",
        preamble=src,
        options=compiler_options,
    )

    self.reduction_comp = ReductionKernel(
        self.ctx, reduce_expr="reduce_statistics(a,b)", **shared)
    self.reduction_simple = ReductionKernel(
        self.ctx, reduce_expr="reduce_statistics_simple(a,b)", **shared)
def init_kernels(self):
    """Build the OpenCL program and the element-wise / reduction kernels."""
    from pkg_resources import resource_string

    ctx = self.context
    kernel_src = resource_string(__name__, 'CLBacterium.cl')
    self.program = cl.Program(ctx, kernel_src).build(cache_dir=False)

    float8_binary = "float8 *res, const float8 *in1, const float8 *in2"
    float8_scaled = (
        "float8 *res, const float k, const float8 *in1, const float8 *in2"
    )
    cell_geometry = "float* res, float* r, float* l"

    # (attribute, argument list, operation, kernel name), built in order.
    elementwise_specs = (
        # Basic vector helpers that seem like they should be part of
        # pyopencl itself.
        ("vclearf", "float8 *v", "v[i]=0.0", "vecclearf"),
        ("vcleari", "int *v", "v[i]=0", "veccleari"),
        ("vadd", float8_binary, "res[i] = in1[i] + in2[i]", "vecadd"),
        ("vsub", float8_binary, "res[i] = in1[i] - in2[i]", "vecsub"),
        ("vaddkx", float8_scaled, "res[i] = in1[i] + k*in2[i]", "vecaddkx"),
        ("vsubkx", float8_scaled, "res[i] = in1[i] - k*in2[i]", "vecsubkx"),
        # Cell geometry kernels.
        ("calc_cell_area", cell_geometry,
         "res[i] = 2.f*3.1415927f*r[i]*(2.f*r[i]+l[i])", "cell_area_kern"),
        ("calc_cell_vol", cell_geometry,
         "res[i] = 3.1415927f*r[i]*r[i]*(2.f*r[i]+l[i])", "cell_vol_kern"),
    )
    for attribute, arguments, operation, kernel_name in elementwise_specs:
        setattr(self, attribute,
                ElementwiseKernel(ctx, arguments, operation, kernel_name))

    # Dot product of float8 vectors, computed as the sum of two float4 dot
    # products per element (as if the float8s were flattened into one long
    # float vector).  Split because some OpenCL implementations seem not to
    # provide dot(float8, float8).
    self.vdot = ReductionKernel(
        ctx,
        numpy.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="dot(x[i].s0123,y[i].s0123)+dot(x[i].s4567,y[i].s4567)",
        arguments="__global float8 *x, __global float8 *y",
    )
def test_sum_without_data(ctx_factory):
    """Reduce over the bare index ``i`` (no array arguments) for ``range(n)``."""
    from pytest import importorskip
    importorskip("mako")

    from pyopencl.reduction import ReductionKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 2000

    # No kernel arguments: the map expression is the work index itself.
    index_sum = ReductionKernel(
        context,
        np.int32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="i",
        arguments=[],
    )

    result_dev = index_sum(range=slice(n), queue=queue).get()

    # Closed form of 0 + 1 + ... + (n-1).
    result_ref = n*(n-1)//2

    assert result_dev == result_ref
def _get_minmax_kernel(self, ctx, dtype, mmc_dtype, prop_names, only_min, only_max, name, mmc_c_decl):
    """Build a struct-valued min/max ``ReductionKernel`` over several arrays.

    One kernel argument is generated per entry of ``prop_names``; the
    preamble is ``mmc_c_decl`` plus ``minmax_tpl`` rendered with mako.
    """
    # Scalar parameter list, e.g. "<dtype> a, <dtype> b", for the template.
    tpl_args = ", ".join(
        "%s %s" % (dtype, prop) for prop in prop_names
    )

    template = mkt.Template(text=mmc_c_decl + minmax_tpl)
    preamble = template.render(
        args=tpl_args,
        prop_names=prop_names,
        dtype=name,
        only_min=only_min,
        only_max=only_max,
    )

    # One global pointer per property for the kernel signature ...
    knl_args = ", ".join(
        "__global %s *%s" % (dtype, prop) for prop in prop_names
    )
    # ... and the matching "prop[i]" accesses fed to the map function.
    map_args = ", ".join("%s[i]" % prop for prop in prop_names)

    from pyopencl.reduction import ReductionKernel

    return ReductionKernel(
        ctx,
        mmc_dtype,
        neutral="mmc_neutral()",
        reduce_expr="agg_mmc(a, b)",
        map_expr="mmc_from_scalar(%s)" % map_args,
        arguments=knl_args,
        preamble=preamble,
    )
def test_reduction_not_first_argument(ctx_factory):
    """A reduction whose first kernel argument is a scalar must still work."""
    # https://github.com/inducer/pyopencl/issues/535
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 400
    x_dev = cl_array.arange(queue, n, dtype=np.float32)
    y_dev = cl_array.arange(queue, n, dtype=np.float32)

    from pyopencl.reduction import ReductionKernel

    # Scaled dot product with the scalar `z` ahead of the two vectors.
    scaled_dot = ReductionKernel(
        context,
        np.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="z*x[i]*y[i]",
        arguments="float z, __global float *x, __global float *y",
    )

    my_dot_prod = scaled_dot(0.1, x_dev, y_dev).get()
    reference = 0.1*np.sum(np.arange(n)**2)

    assert abs(my_dot_prod - reference) < 1e-4
def __init__(self, context, queue):
    """
    Store the OpenCL handles, load ``jacobi.cl`` and build the dot kernel.

    @param context OpenCL context the program and kernel are built for.
    @param queue OpenCL command queue.
    """
    self.context = context
    self.queue = queue

    program_path = clUtils.path() + "/jacobi.cl"
    self.program = clUtils.loadProgram(context, program_path)

    # Device-side objects start out empty; they are generated at the
    # first iteration.
    self.A = self.B = self.X0 = self.X = self.x = None

    # Dot product: sum over i of x[i]*y[i].
    dot_spec = dict(
        neutral="0",
        reduce_expr="a+b",
        map_expr="x[i]*y[i]",
        arguments="__global float *x, __global float *y",
    )
    self.dot = ReductionKernel(context, np.float32, **dot_spec)
import pyopencl  # used below as `pyopencl.create_some_context()`
from pyopencl.array import arange, Array
from pyopencl.reduction import ReductionKernel
import numpy

ctx = pyopencl.create_some_context()
queue = pyopencl.CommandQueue(ctx)

# Host data: a 100x1 int32 column with a 1 in every fifth row (0, 5, 10, ...).
acpu = numpy.zeros((100, 1), dtype=numpy.int32)
acpu[::5] = 1

# Upload to the device and wait for the transfer to complete.
a = Array(queue, (100, 1), numpy.int32)
a.set(acpu)
queue.finish()

# Plain sum of all elements of x.
krnl = ReductionKernel(
    ctx,
    numpy.int32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="x[i]",
    arguments="__global int *x",
)

my_sum = krnl(a).get()
queue.finish()

# print() with a single argument behaves the same on Python 2 and 3.
print(my_sum)
def get_minmax_kernel(ctx, dtype, inf, mmc_dtype, prop_names, only_min, only_max, name, mmc_c_decl, backend):
    """Build a struct-valued min/max reduction kernel for OpenCL or CUDA.

    One kernel argument is generated per entry of ``prop_names``.  The
    preamble is the CLUDA preamble followed by ``mmc_c_decl`` plus
    ``minmax_tpl`` rendered with mako; for CUDA the rendered
    ``minmax_operator_tpl`` is spliced into the struct declaration first.

    :param backend: ``'opencl'`` or ``'cuda'``.
    :returns: a ``pyopencl`` or ``pycuda`` ``ReductionKernel``.
    :raises ValueError: if ``backend`` is neither ``'opencl'`` nor
        ``'cuda'`` (previously this surfaced as an UnboundLocalError at
        the final ``return``).
    """
    if backend not in ('opencl', 'cuda'):
        raise ValueError(
            "Unsupported backend %r; expected 'opencl' or 'cuda'" % (backend,)
        )

    tpl_args = ", ".join(
        "%(dtype)s %(prop)s" % {'dtype': dtype, 'prop': prop}
        for prop in prop_names
    )

    if backend == 'cuda':
        # overload assignment operator in struct
        mmc_overload = mkt.Template(text=minmax_operator_tpl).render(
            prop_names=prop_names, dtype=name,
            only_min=only_min, only_max=only_max)
        # Insert the overload ahead of the last two lines of the struct
        # declaration.
        mmc_c_decl_lines = mmc_c_decl.splitlines()
        mmc_c_decl_lines = mmc_c_decl_lines[:-2] + \
            mmc_overload.splitlines() + mmc_c_decl_lines[-2:]
        mmc_c_decl = '\n'.join(mmc_c_decl_lines)

    mmc_preamble = mmc_c_decl + minmax_tpl
    preamble = mkt.Template(text=mmc_preamble).render(
        args=tpl_args, prop_names=prop_names, dtype=name,
        only_min=only_min, only_max=only_max, inf=inf)

    # "prop[i]" accesses passed to the map function.
    map_args = ", ".join("%(prop)s[i]" % {'prop': prop}
                         for prop in prop_names)

    if backend == 'opencl':
        knl_args = ", ".join(
            "__global %(dtype)s* %(prop)s" % {'dtype': dtype, 'prop': prop}
            for prop in prop_names
        )

        from pyopencl._cluda import CLUDA_PREAMBLE
        from pyopencl.reduction import ReductionKernel

        cluda_preamble = mkt.Template(text=CLUDA_PREAMBLE).render(
            double_support=True)
        knl = ReductionKernel(ctx, mmc_dtype,
                              neutral="mmc_neutral()",
                              reduce_expr="agg_mmc(a, b)",
                              map_expr="mmc_from_scalar(%s)" % map_args,
                              arguments=knl_args,
                              preamble='\n'.join([cluda_preamble, preamble]))
    else:  # backend == 'cuda'
        knl_args = ", ".join(
            "%(dtype)s* %(prop)s" % {'dtype': dtype, 'prop': prop}
            for prop in prop_names
        )

        from pycuda._cluda import CLUDA_PREAMBLE
        from pycuda.reduction import ReductionKernel

        cluda_preamble = mkt.Template(text=CLUDA_PREAMBLE).render(
            double_support=True)
        # The pycuda kernel takes no context argument.
        knl = ReductionKernel(mmc_dtype,
                              neutral="mmc_neutral()",
                              reduce_expr="agg_mmc(a, b)",
                              map_expr="mmc_from_scalar(%s)" % map_args,
                              arguments=knl_args,
                              preamble='\n'.join([cluda_preamble, preamble]))

    return knl
def valueMonteCarloGPU(ctx, queue, S_init, nPaths, Exp_Time, dtMonte, Strike, Int_Rate, Vol, PTYPE, nMonteLoops=1):
    """Estimate a discounted ``max(S - Strike, 0)`` payoff by Monte Carlo.

    Every path starts at ``S_init`` and is multiplied, once per time step,
    by ``exp((Int_Rate - 0.5*Vol**2)*dt + Vol*sqrt(dt)*ran)`` with ``ran``
    drawn by ``gen.fill_normal``.  After the last step each path is replaced
    by ``exp(-Int_Rate*Exp_Time) * max(S - Strike, 0)`` and the paths are
    summed on the device.

    Paths are processed in batches of at most 512 work items; when
    ``nPaths`` exceeds that, it is rounded up to a whole number of batches.

    :param ctx: OpenCL context the kernels are built for.
    :param queue: OpenCL command queue.
    :param PTYPE: accepted but not read by this implementation.
    :param nMonteLoops: number of independent repetitions.
    :returns: ``(mean of the per-loop averages, dtMonte, standard deviation
        of the per-loop averages)``.
    """
    nextStepPathKernel = ElementwiseKernel(
        ctx,
        "float *latestStep, float *ran, float Strike, float Int_Rate, float Exp_Time, float dt, float Vol",
        "float rval = exp((Int_Rate - 0.5f * Vol*Vol)*dt + Vol * sqrt(dt) * ran[i]); latestStep[i] *= rval;",
        "nextStepPathKernel")
    excersisePriceKernel = ElementwiseKernel(
        ctx,
        "float *latestStep, float Strike, float Int_Rate, float Exp_Time",
        "float rval = (latestStep[i]-Strike); latestStep[i] = exp(-Int_Rate*Exp_Time) * max(rval,0.0f);",
        "excersisePriceKernel")
    sumKernel = ReductionKernel(ctx, numpy.float32, neutral="0",
                                reduce_expr="a+b", map_expr="x[i]",
                                arguments="__global float *x")

    # Batch size: at most 512 paths are simulated at a time.
    maxWorkItems = 1 * 2**9
    multiplier = 1
    if nPaths > maxWorkItems:
        multiplier = math.ceil(nPaths / maxWorkItems)
        nPaths = multiplier * maxWorkItems
    else:
        maxWorkItems = nPaths

    nTimeStepsMonte = math.ceil(Exp_Time / dtMonte)

    # set up random number generator
    gen = RanluxGenerator(queue, maxWorkItems, luxury=4, seed=time.time())

    # the arrays
    ran = cl.array.zeros(queue, maxWorkItems, numpy.float32)
    latestStep = cl.array.zeros_like(ran)
    means = numpy.zeros(nMonteLoops)

    # the loop
    for loop in range(nMonteLoops):
        theSum = 0
        for _ in range(multiplier):
            latestStep.fill(S_init)
            for _step in range(nTimeStepsMonte):
                gen.fill_normal(ran)
                gen.synchronize(queue)
                nextStepPathKernel(latestStep, ran, Strike, Int_Rate,
                                   Exp_Time, dtMonte, Vol)
            excersisePriceKernel(latestStep, Strike, Int_Rate, Exp_Time)
            # The kernel declares a single argument (x); the queue is
            # passed by keyword rather than as a second positional value.
            theSum += sumKernel(latestStep, queue=queue).get()
        means[loop] = theSum / nPaths

    monteAverage = numpy.mean(means)
    monteStdDeviation = numpy.std(means)
    return monteAverage, dtMonte, monteStdDeviation
def _generate(self):
    """Generate and compile the reduction for ``self.backend``.

    ``'cython'``: renders ``reduction_cy_template``, adds it to ``self.tp``,
    compiles, and stores ``py_<name>`` from the compiled module in
    ``self.c_func``.

    ``'opencl'``: builds a pyopencl ``ReductionKernel``, stores it in
    ``self.c_func`` and the queue in ``self.queue``.

    Any other backend value leaves the instance untouched.
    """
    if self.backend == 'cython':
        if self.func is None:
            # No map function: reduce the input array directly.
            py_data = (
                ['int i', '{type}[:] inp'.format(type=self.type)],
                ['i', '&inp[0]'],
            )
            c_data = (
                ['int i', '{type}* inp'.format(type=self.type)],
                ['i', 'inp'],
            )
            map_expr = 'inp[i]'
        else:
            self.tp.add(self.func)
            py_data, c_data = self.cython_gen.get_func_signature(self.func)
            self._correct_return_type(c_data)
            map_expr = '{name}({cargs})'.format(
                name=self.func.__name__, cargs=', '.join(c_data[1])
            )

        # The first entry of each signature is swapped for SIZE.
        py_defn = ['long SIZE'] + py_data[0][1:]
        c_defn = ['long SIZE'] + c_data[0][1:]
        py_args = ['SIZE'] + py_data[1][1:]

        template = Template(text=reduction_cy_template)
        src = template.render(
            name=self.name,
            type=self.type,
            map_expr=map_expr,
            reduce_expr=self.reduce_expr,
            neutral=self.neutral,
            c_arg_sig=', '.join(c_defn),
            py_arg_sig=', '.join(py_defn),
            py_args=', '.join(py_args),
            openmp=self._config.use_openmp,
            get_parallel_range=get_parallel_range,
        )
        self.tp.add_code(src)
        self.tp.compile()
        self.c_func = getattr(self.tp.mod, 'py_' + self.name)

    elif self.backend == 'opencl':
        if self.func is None:
            arguments = '{type} *in'.format(type=self.type)
            expr = None
            preamble = ''
        else:
            self.tp.add(self.func)
            py_data, c_data = self.cython_gen.get_func_signature(self.func)
            self._correct_opencl_address_space(c_data)
            call_args = ', '.join(c_data[1])
            expr = '{func}({args})'.format(
                func=self.func.__name__, args=call_args
            )
            # Kernel arguments skip the first entry of the signature.
            arguments = convert_to_float_if_needed(', '.join(c_data[0][1:]))
            preamble = convert_to_float_if_needed(self.tp.get_code())

        from .opencl import get_context, get_queue
        from pyopencl.reduction import ReductionKernel

        ctx = get_context()
        self.queue = get_queue()
        self.c_func = ReductionKernel(
            ctx,
            dtype_out=self.dtype_out,
            neutral=self.neutral,
            reduce_expr=self.reduce_expr,
            map_expr=expr,
            arguments=arguments,
            preamble=preamble,
        )
def cl_reduct_krnl_build(cl_ctx, *args, **kwargs):
    """Construct a ``ReductionKernel`` for ``cl_ctx``.

    All remaining positional and keyword arguments are forwarded unchanged.
    """
    kernel = ReductionKernel(cl_ctx, *args, **kwargs)
    return kernel
operation="res_g[i] = dot(a_g[i],b_g[i])", name="elem_wise_krnl" ) elem_wise_event = elem_wise_krnl(a_g, b_g, res_g) elem_wise_time = time.time() # np.set_printoptions(precision=2) # print(res_g.get()) # print(res_g.get().shape) # print(res_g.get()[:10]) reduction_krnl = ReductionKernel(ctx, dtype_out=np.float32, neutral="0", reduce_expr="a+b", map_expr="x[i]", arguments="__global float *x", name="reduction_krnl", ) res_reduction = reduction_krnl(res_g, queue=queue, wait_for=[elem_wise_event]) reduction_time = time.time() print("elem_wise_time: {}".format(elem_wise_time-start)) print("reduction_time: {}".format(reduction_time-elem_wise_time)) print("Total: {}".format(reduction_time-start)) #print(res_reduction) print(res_reduction.get())
import pyopencl as cl
import pyopencl.clrandom as clrand
from pyopencl.reduction import ReductionKernel
import numpy as np

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# 10 million random float32 samples generated on the device.
n = 10**7
x = clrand.rand(queue, n, np.float32)

# Sum of squares.  The pointer type must match the element type of `x`
# (float32); declaring it as `double *x` would read the buffer with the
# wrong element size.
rknl = ReductionKernel(
    ctx,
    np.float32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="x[i]*x[i]",
    arguments="float *x",
)

result = rknl(x)
result_np = result.get()
def _generate(self, declarations=None):
    """Generate the reduction for ``self.backend`` and return the callable.

    ``'cython'``: renders ``reduction_cy_template``, compiles it through
    ``self.tp`` and returns ``py_<name>`` from the compiled module.

    ``'opencl'`` / ``'cuda'``: returns a pyopencl / pycuda
    ``ReductionKernel`` whose preamble is the CLUDA preamble followed by the
    transpiled user code.

    ``self.source`` and ``self.all_source`` are set on every supported
    backend; for any other backend value nothing is set and ``None`` is
    returned.

    ``declarations`` is forwarded to ``self.tp.add`` when a map function is
    present.
    """

    def device_inputs():
        # Pieces shared by the OpenCL and CUDA branches:
        # (map expression, kernel argument string, user preamble).
        if self.func is None:
            return None, '{type} *in'.format(type=self.type), ''
        self.tp.add(self.func, declarations=declarations)
        py_data, c_data = self.cython_gen.get_func_signature(self.func)
        self._correct_opencl_address_space(c_data)
        call = '{func}({args})'.format(func=self.func.__name__,
                                       args=', '.join(c_data[1]))
        # Kernel arguments skip the first entry of the signature.
        arg_str = convert_to_float_if_needed(', '.join(c_data[0][1:]))
        user_code = convert_to_float_if_needed(self.tp.get_code())
        return call, arg_str, user_code

    if self.backend == 'cython':
        if self.func is None:
            # No map function: reduce the input array directly.
            py_data = (
                ['int i', '{type}[:] inp'.format(type=self.type)],
                ['i', '&inp[0]'],
            )
            c_data = (
                ['int i', '{type}* inp'.format(type=self.type)],
                ['i', 'inp'],
            )
            map_expr = 'inp[i]'
        else:
            self.tp.add(self.func, declarations=declarations)
            py_data, c_data = self.cython_gen.get_func_signature(self.func)
            self._correct_return_type(c_data)
            map_expr = '{name}({cargs})'.format(
                name=self.func.__name__, cargs=', '.join(c_data[1])
            )

        # The first entry of each signature is swapped for SIZE.
        py_defn = ['long SIZE'] + py_data[0][1:]
        c_defn = ['long SIZE'] + c_data[0][1:]
        py_args = ['SIZE'] + py_data[1][1:]

        template = Template(text=reduction_cy_template)
        src = template.render(
            name=self.name,
            type=self.type,
            map_expr=map_expr,
            reduce_expr=self.reduce_expr,
            neutral=self.neutral,
            c_arg_sig=', '.join(c_defn),
            py_arg_sig=', '.join(py_defn),
            py_args=', '.join(py_args),
            openmp=self._config.use_openmp,
            get_parallel_range=get_parallel_range,
        )
        # Captured before the generated code is added, so this holds the
        # user code only.
        self.source = self.tp.get_code()
        self.tp.add_code(src)
        self.tp.compile()
        self.all_source = self.tp.source
        return getattr(self.tp.mod, 'py_' + self.name)

    elif self.backend == 'opencl':
        expr, arguments, preamble = device_inputs()

        from .opencl import get_context, get_queue
        from pyopencl.reduction import ReductionKernel
        from pyopencl._cluda import CLUDA_PREAMBLE

        cluda_preamble = Template(text=CLUDA_PREAMBLE).render(
            double_support=True)

        ctx = get_context()
        self.queue = get_queue()
        knl = ReductionKernel(
            ctx,
            dtype_out=self.dtype_out,
            neutral=self.neutral,
            reduce_expr=self.reduce_expr,
            map_expr=expr,
            arguments=arguments,
            preamble="\n".join([cluda_preamble, preamble]),
        )

        # Only the code generated here is kept in `source`.
        self.source = "\n".join([cluda_preamble, preamble])
        if not knl.stage_1_inf.source:
            self.all_source = self.source
        else:
            self.all_source = "\n".join([
                "// ------ stage 1 -----",
                knl.stage_1_inf.source,
                "// ------ stage 2 -----",
                knl.stage_2_inf.source,
            ])
        return knl

    elif self.backend == 'cuda':
        expr, arguments, preamble = device_inputs()

        from .cuda import set_context
        set_context()
        from pycuda.reduction import ReductionKernel
        from pycuda._cluda import CLUDA_PREAMBLE

        cluda_preamble = Template(text=CLUDA_PREAMBLE).render(
            double_support=True)

        # The pycuda kernel takes no context argument.
        knl = ReductionKernel(
            dtype_out=self.dtype_out,
            neutral=self.neutral,
            reduce_expr=self.reduce_expr,
            map_expr=expr,
            arguments=arguments,
            preamble="\n".join([cluda_preamble, preamble]),
        )

        # Only the code generated here is kept in `source`.
        self.source = cluda_preamble + preamble
        # FIXME: it is difficult to get the sources from pycuda.
        self.all_source = self.source
        return knl
minmax_collector agg_mmc(minmax_collector a, minmax_collector b) { minmax_collector result = a; if (b.cur_min < result.cur_min) result.cur_min = b.cur_min; if (b.cur_max > result.cur_max) result.cur_max = b.cur_max; return result; } """ from pyopencl.clrandom import rand as clrand a_gpu = clrand(queue, (20000, ), dtype=np.int32, a=0, b=10**6) a = a_gpu.get() from pyopencl.reduction import ReductionKernel red = ReductionKernel(ctx, mmc_dtype, neutral="mmc_neutral()", reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])", arguments="__global int *x", preamble=preamble) minmax = red(a_gpu).get() assert abs(minmax["cur_min"] - np.min(a)) < 1e-5 assert abs(minmax["cur_max"] - np.max(a)) < 1e-5