def time_simple(N, nargs, niter=100):
    """Time ``nargs`` independent sum reductions, one kernel per expression.

    One ``OCLReductionKernel`` is built for each map expression
    ``"<i>*x[i]"`` with ``i = 0 .. nargs-1``. Every kernel gets its own
    length-``N`` float32 input filled with ones and its own one-element
    output buffer. All kernels are launched ``niter`` times, the device
    queue is finished once, and the mean wall-clock time per repetition is
    printed and returned.

    :param N: number of elements in each input array
    :param nargs: number of reduction kernels (and input/output buffers)
    :param niter: number of timed repetitions; must be non-zero because the
        elapsed time is divided by it
    :return: mean time per repetition in seconds
    """
    from time import time

    from gputools import OCLReductionKernel

    # range/print() work on both Python 2 and 3 (xrange and the print
    # statement do not exist on Python 3).
    map_exprs = ["%s*x[i]" % i for i in range(nargs)]

    ks = [
        OCLReductionKernel(np.float32,
                           neutral="0",
                           reduce_expr="a+b",
                           map_expr=map_expr,
                           arguments="__global float *x")
        for map_expr in map_exprs
    ]
    ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in map_exprs]
    outs = [OCLArray.empty(1, np.float32) for _ in map_exprs]

    t = time()
    for _ in range(niter):
        for k, inn, out in zip(ks, ins, outs):
            k(inn, out=out)
    # finish the queue before reading the clock so queued work is included
    get_device().queue.finish()
    t = (time() - t) / niter

    results = [float(out.get()) for out in outs]
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("simple reduction: result = %s" % (results,))
    print("simple reduction:\t\t%.2f ms" % (1000 * t))
    return t
def setup(self, size, units, lam=.5, n0=1., use_fresnel_approx=False):
    """
    sets up the internal variables e.g. propagators etc...
    and the device-side buffers and kernels derived from them

    :param size: the size of the geometry in pixels (Nx,Ny,Nz)
    :param units: the physical units of each voxel in microns (dx,dy,dz)
    :param lam: the wavelength of light in microns
    :param n0: the refractive index of the surrounding media
    :param use_fresnel_approx: if True, uses fresnel approximation for propagator
    """
    # The base class prepares the host-side state read below
    # (self.size, self._H, self.scatter_weights, self.gfactor_weights).
    Bpm3d_Base.setup(self, size, units,
                     lam=lam, n0=n0,
                     use_fresnel_approx=use_fresnel_approx)

    # setting up the gpu buffers and kernels
    self.program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    # Nz is needed for the per-slice buffers below; previously only Nx and
    # Ny were unpacked, so the Nz lookups raised NameError.
    Nx, Ny, Nz = self.size[:3]

    # Previously the plan was built for an empty shape and thrown away.
    # NOTE(review): the (Ny, Nx) axis order and the `_plan` attribute name
    # are assumptions - confirm against the code that runs the FFT.
    self._plan = fft_plan((Ny, Nx))

    # device copies of the propagator and the weight arrays
    self._H_g = OCLArray.from_array(self._H.astype(np.complex64))
    self.scatter_weights_g = OCLArray.from_array(
        self.scatter_weights.astype(np.float32))
    self.gfactor_weights_g = OCLArray.from_array(
        self.gfactor_weights.astype(np.float32))

    # one float32 entry per z index, zero-initialised
    self.scatter_cross_sec_g = OCLArray.zeros(Nz, "float32")
    self.gfactor_g = OCLArray.zeros(Nz, "float32")

    # sum over i of weights[i] * |field[i] - plain*(i==0)|^2
    # (`plain` is subtracted from element 0 only)
    self.reduce_kernel = OCLReductionKernel(
        np.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
        arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain")
from time import time t = time() for _ in range(niter): for k,inn,out in zip(ks,ins,outs): k(inn, out = out) get_device().queue.finish() t = (time()-t)/niter print("simple reduction: result =", [float(out.get()) for out in outs]) print("simple reduction:\t\t%.2f ms"%(1000*t)) return t from gputools import OCLReductionKernel k1 = OCLReductionKernel(np.float32, neutral="0", reduce_expr="a+b", map_expr="x[i]", arguments="__global float *x") k2 = OCLMultiReductionKernel(np.float32, neutral="0", reduce_expr="a+b", map_exprs=["x[i]"], arguments="__global float *x") N = 32 a = OCLArray.from_array(np.ones((N,N),np.float32)) # print k1(a[0], out = out[0]) print(k2(a))