def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
             ctx=None, devicetype="all", platformid=None, deviceid=None,
             profile=False):
    """Initialise the base reconstruction state, then build kernels and buffers.

    Every argument is forwarded unchanged to
    ``ReconstructionAlgorithm.__init__``.
    """
    ReconstructionAlgorithm.__init__(
        self,
        sino_shape,
        slice_shape=slice_shape,
        axis_position=axis_position,
        angles=angles,
        ctx=ctx,
        devicetype=devicetype,
        platformid=platformid,
        deviceid=deviceid,
        profile=profile,
    )
    self.compute_preconditioners()

    # Linear-algebra helper built for the slice shape of the backprojector.
    linalg = LinAlg(self.backprojector.slice_shape, ctx=self.ctx)
    self.linalg = linalg

    # Positivity constraint: negative entries are replaced by zero.
    clamp_body = "a[i] = max(a[i], 0.0f);"
    self.elwise_clamp = ElementwiseKernel(self.ctx, "float *a", clamp_body)

    # Projection onto the L-infinity ball of radius Lambda, per component.
    proj_body = "a[i].x = copysign(min(fabs(a[i].x), Lambda), a[i].x); a[i].y = copysign(min(fabs(a[i].y), Lambda), a[i].y);"
    self.elwise_proj_linf = ElementwiseKernel(
        self.ctx, "float2* a, float Lambda", proj_body, "elwise_proj_linf"
    )

    # Additional arrays. The gradient is evaluated before its buffer is used
    # as a template (presumably so cl_mem["d_gradient"] exists -- confirm).
    linalg.gradient(self.d_x)
    self.d_p = parray.zeros_like(linalg.cl_mem["d_gradient"])
    self.d_q = parray.zeros_like(self.d_data)
    self.d_g = linalg.d_image  # same object as the helper's image, not a copy
    self.d_tmp = parray.zeros_like(self.d_x)
    extra_buffers = {
        "d_p": self.d_p,
        "d_q": self.d_q,
        "d_tmp": self.d_tmp,
    }
    self.add_to_cl_mem(extra_buffers)

    self.theta = 1.0
def test_adj_inplace(self):
    """Compare the inner products built from ``symgrad.fwd`` and ``symgrad.adj``.

    The forward output is paired with ``symdivin`` (components 3:6 weighted
    twice); the negated first three adjoint components are paired with
    ``symgradin``. Both sums must agree within ``RTOL``/``ATOL``.
    """
    queue = self.queue
    dev_grad_in = clarray.to_device(queue, self.symgradin)
    dev_div_in = clarray.to_device(queue, self.symdivin)
    dev_fwd_out = clarray.zeros_like(dev_div_in)
    dev_adj_out = clarray.zeros_like(dev_grad_in)

    # Attach the returned events so that .get() can see pending work.
    dev_fwd_out.add_event(self.symgrad.fwd(dev_fwd_out, dev_grad_in))
    dev_adj_out.add_event(self.symgrad.adj(dev_adj_out, dev_div_in))

    fwd_host = dev_fwd_out.get()
    adj_host = dev_adj_out.get()

    n_elem = self.symgradin.size
    first_three = np.vdot(
        fwd_host[..., :3].flatten(), self.symdivin[..., :3].flatten())
    next_three = np.vdot(
        fwd_host[..., 3:6].flatten(), self.symdivin[..., 3:6].flatten())
    a = first_three / n_elem * 4 + 2 * next_three / n_elem * 4

    b = np.vdot(
        self.symgradin[..., :3].flatten(),
        -adj_host[..., :3].flatten()) / n_elem * 4

    diff = a - b
    print("Adjointness: %.2e +1j %.2e" % (diff.real, diff.imag))
    np.testing.assert_allclose(a, b, rtol=RTOL, atol=ATOL)
def _setupVariables(self, x, data):
    """Upload ``x`` and ``data`` and allocate the per-step device buffers.

    Returns:
        tuple: ``(step_out, tmp_results, step_in, data)`` where the first
        three are dicts of device arrays and ``data`` is the uploaded copy
        of the input cast to ``self._DTYPE``.
    """
    queue = self._queue[0]
    data = clarray.to_device(queue, data.astype(self._DTYPE))

    x_dev = clarray.to_device(queue, x)
    step_in = {
        "x": x_dev,
        "xold": clarray.to_device(queue, x),
        "xk": x_dev.copy(),
    }
    step_out = {"x": clarray.zeros_like(x_dev)}

    x_shape = x_dev.shape
    tmp_results = {
        "gradFx": x_dev.copy(),
        "DADA": clarray.zeros_like(x_dev),
        "DAd": clarray.zeros_like(x_dev),
        "d": data.copy(),
        "Ax": clarray.zeros_like(data),
        "temp_reg": clarray.zeros_like(x_dev),
        # Same shape as x plus a trailing axis of length 4.
        "gradx": clarray.zeros(queue, x_shape + (4,), dtype=self._DTYPE),
        # Real-valued buffers: one with a trailing axis of 2, one shaped like x.
        "reg_norm": clarray.zeros(
            queue, x_shape + (2,), dtype=self._DTYPE_real),
        "reg": clarray.zeros(queue, x_shape, dtype=self._DTYPE_real),
    }
    return (step_out, tmp_results, step_in, data)
def update(self, xd, yd, zd, vxd, vyd, vzd, qd, md, forces, t, dt, num_steps):
    """Advance positions and velocities in place and return the final time.

    Each of the ``num_steps`` iterations moves the positions by half a time
    step, applies the accelerations accumulated from every entry of
    ``forces`` to the velocities for a full step, then moves the positions
    by the remaining half step.
    """
    half_dt = 0.5 * dt

    # Acceleration accumulators, re-zeroed at the start of every step.
    axd = cl_array.zeros_like(xd)
    ayd = cl_array.zeros_like(xd)
    azd = cl_array.zeros_like(xd)

    for _ in range(num_steps):
        # First half of position advance
        xd += half_dt * vxd
        yd += half_dt * vyd
        zd += half_dt * vzd
        t += half_dt

        # Velocity update from the summed accelerations.
        axd.fill(0.0, self.queue)
        ayd.fill(0.0, self.queue)
        azd.fill(0.0, self.queue)
        for force in forces:
            force.computeAcc(xd, yd, zd, vxd, vyd, vzd, qd, md,
                             axd, ayd, azd, t)
        vxd += dt * axd
        vyd += dt * ayd
        vzd += dt * azd

        # Second half of position advance
        xd += half_dt * vxd
        yd += half_dt * vyd
        zd += half_dt * vzd
        t += half_dt

    return t
def test_adj_inplace(self):
    """Compare the inner products built from ``symgrad.fwd`` and ``symgrad.adj``.

    The forward output is paired with ``symdivin`` (components 3:6 weighted
    twice); the negated adjoint output is paired with ``symgradin``. Both
    sums must agree to 12 decimal places.
    """
    queue = self.queue
    dev_grad_in = clarray.to_device(queue, self.symgradin)
    dev_div_in = clarray.to_device(queue, self.symdivin)
    dev_fwd_out = clarray.zeros_like(dev_div_in)
    dev_adj_out = clarray.zeros_like(dev_grad_in)

    self.symgrad.fwd(dev_fwd_out, dev_grad_in)
    self.symgrad.adj(dev_adj_out, dev_div_in)

    fwd_host = dev_fwd_out.get()
    adj_host = dev_adj_out.get()

    n_elem = self.symgradin.size
    first_three = np.vdot(
        fwd_host[..., :3].flatten(), self.symdivin[..., :3].flatten())
    next_three = np.vdot(
        fwd_host[..., 3:6].flatten(), self.symdivin[..., 3:6].flatten())
    a = first_three / n_elem * 4 + 2 * next_three / n_elem * 4

    b = np.vdot(self.symgradin.flatten(), -adj_host.flatten()) / n_elem * 4

    diff = a - b
    print("Adjointness: %.2e +1j %.2e" % (diff.real, diff.imag))
    self.assertAlmostEqual(a, b, places=12)
def _gpu_init(self):
    """Create ``self.gpu_data`` and fill it with everything the GPU search uses.

    Host arrays from ``self.data`` are uploaded as float32, work buffers are
    allocated, the FFT kernels are created and the transforms of ``rcore``
    and ``rsurf`` are computed once.
    """
    gpu = {}
    self.gpu_data = gpu
    host = self.data
    queue = self.queue
    ctx = queue.context

    # Upload the receptor grids, converted to float32 first.
    gpu['rcore'] = cl_array.to_device(queue, float32array(host['rcore'].array))
    gpu['rsurf'] = cl_array.to_device(queue, float32array(host['rsurf'].array))

    # The scanning chain is stored as an Image (original note: faster to
    # rotate), together with the sampler used to read it.
    gpu['im_lsurf'] = cl.image_from_array(ctx, float32array(host['lsurf'].array))
    gpu['sampler'] = cl.Sampler(ctx, False, cl.addressing_mode.CLAMP,
                                cl.filter_mode.LINEAR)

    if self.distance_restraints:
        gpu['restraints'] = cl_array.to_device(
            queue, float32array(host['restraints']))

    # Zero-initialised work arrays shaped like rcore / interspace.
    for key in ('lsurf', 'clashvol', 'intervol'):
        gpu[key] = cl_array.zeros_like(gpu['rcore'])
    gpu['interspace'] = cl_array.zeros(queue, host['shape'], dtype=np.int32)
    for key in ('restspace', 'access_interspace', 'best_access_interspace'):
        gpu[key] = cl_array.zeros_like(gpu['interspace'])

    # Counting arrays: one partial histogram per work-group of this size.
    workgroup_size = 32
    nsubhists = int(np.ceil(gpu['rcore'].size/workgroup_size))
    nrestraints = host['nrestraints']
    gpu['subhists'] = cl_array.zeros(
        queue, (nsubhists, nrestraints + 1), dtype=np.float32)
    gpu['viol_counter'] = cl_array.zeros(
        queue, (nsubhists, nrestraints, nrestraints), dtype=np.float32)

    # Complex arrays: the first axis is reduced to shape[0]//2 + 1.
    ft_shape = list(host['shape'])
    ft_shape[0] = host['shape'][0]//2 + 1
    gpu['ft_shape'] = ft_shape
    gpu['ft_rcore'] = cl_array.zeros(queue, ft_shape, dtype=np.complex64)
    for key in ('ft_rsurf', 'ft_lsurf', 'ft_clashvol', 'ft_intervol'):
        gpu[key] = cl_array.zeros_like(gpu['ft_rcore'])

    # Scalar settings copied over as-is.
    for key in ('nrot', 'max_clash', 'min_interaction'):
        gpu[key] = host[key]

    # Kernels, with forward and inverse real FFTs attached.
    kernels = Kernels(ctx)
    gpu['k'] = kernels
    kernels.rfftn = pyclfft.RFFTn(ctx, host['shape'])
    kernels.irfftn = pyclfft.iRFFTn(ctx, host['shape'])

    # Initial calculations: transforms of the two receptor grids.
    kernels.rfftn(queue, gpu['rcore'], gpu['ft_rcore'])
    kernels.rfftn(queue, gpu['rsurf'], gpu['ft_rsurf'])
def setup_device(self, imshape):
    """Allocate device buffers and build the kernels for images of ``imshape``.

    The shape is remembered in ``self.cached_shape``.
    """
    print('Setting up with imshape = %s' % (str(imshape)))
    self.cached_shape = imshape

    # Every float32 work buffer is shaped like clIm.
    template = cla.Array(self.q, imshape, np.float32)
    self.clIm = template
    for attr in ('clm', 'clx', 'cly'):
        setattr(self, attr, cla.empty_like(template))
    for attr in ('clO', 'clM'):
        setattr(self, attr, cla.zeros_like(template))
    for attr in ('clF', 'clS', 'clThisS', 'clScratch'):
        setattr(self, attr, cla.empty_like(template))

    ctx = self.ctx
    queue = self.q
    self.radial_prg = pyopencl.Program(ctx, RADIAL_PROGRAM).build()
    self.sobel = Sobel(ctx, queue)
    # NaiveSeparableCorrelation is the alternative implementation.
    self.sepcorr2d = LocalMemorySeparableCorrelation(ctx, queue)

    # Small elementwise helpers.
    self.accum = ElementwiseKernel(ctx, 'float *a, float *b', 'a[i] += b[i]')
    self.norm_s = ElementwiseKernel(
        ctx, 'float *s, const float nRadii', 's[i] = -1 * s[i] / nRadii',
        'norm_s')
    self.accum_s = ElementwiseKernel(
        ctx, 'float *a, float *b, const float nr', 'a[i] -= b[i] / nr')

    self.gaussians = {}
    self.gaussian_prgs = {}
    self.minmax = MinMaxKernel(ctx, queue)

    # starburst storage: read-only single-channel float image
    image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT)
    self.clIm2D = cl.Image(ctx, mf.READ_ONLY, image_format, imshape)

    # Sampler for the image object (non-normalized coordinates).
    self.imSampler = cl.Sampler(
        ctx, False, cl.addressing_mode.CLAMP_TO_EDGE, cl.filter_mode.LINEAR)

    self.cl_find_ray_boundaries = FindRayBoundaries(ctx, queue)
def zeros_like(array, backend=None):
    """Return a wrapped zero-filled array modelled on ``array.dev``.

    ``backend`` defaults to ``array.backend``. ``'opencl'`` and ``'cuda'``
    allocate through the matching GPU array module; any other value uses
    NumPy.
    """
    target = array.backend if backend is None else backend

    if target == 'opencl':
        import pyopencl.array as cl_arrays
        return wrap_array(cl_arrays.zeros_like(array.dev), target)

    if target == 'cuda':
        import pycuda.gpuarray as cu_arrays
        return wrap_array(cu_arrays.zeros_like(array.dev), target)

    return wrap_array(np.zeros_like(array.dev), target)
def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
             ctx=None, devicetype="all", platformid=None, deviceid=None,
             profile=False):
    """Set up the OpenCL context, the projector pair and the base buffers.

    The device arguments go to ``OpenclProcessing.__init__``; the geometry
    arguments go to the ``Backprojection`` instance, from which the
    ``Projection`` instance takes its slice shape, angles and bin count.
    """
    OpenclProcessing.__init__(
        self, ctx=ctx, devicetype=devicetype, platformid=platformid,
        deviceid=deviceid, profile=profile)

    # Create a backprojector
    backprojector = Backprojection(
        sino_shape,
        slice_shape=slice_shape,
        axis_position=axis_position,
        angles=angles,
        ctx=self.ctx,
        profile=profile,
    )
    self.backprojector = backprojector

    # Create a projector sharing the backprojector's geometry
    self.projector = Projection(
        backprojector.slice_shape,
        backprojector.angles,
        axis_position=axis_position,
        detector_width=backprojector.num_bins,
        normalize=False,
        ctx=self.ctx,
        profile=profile,
    )
    self.sino_shape = sino_shape
    self.is_cpu = backprojector.is_cpu

    # Arrays: sinogram-shaped and slice-shaped float32 buffers.
    self.d_data = parray.zeros(self.queue, sino_shape, dtype=np.float32)
    self.d_sino = parray.zeros_like(self.d_data)
    self.d_x = parray.zeros(
        self.queue, backprojector.slice_shape, dtype=np.float32)
    self.d_x_old = parray.zeros_like(self.d_x)

    base_buffers = {
        "d_data": self.d_data,
        "d_sino": self.d_sino,
        "d_x": self.d_x,
        "d_x_old": self.d_x_old,
    }
    self.add_to_cl_mem(base_buffers)
def test_2d_out_of_place(self, ctx):
    """Check an out-of-place 2D FFT over the last two axes of a 3D array.

    The device result is compared with ``np.fft.fft2`` on the same axes,
    using ``rtol=atol=1e-3``.
    """
    queue = cl.CommandQueue(ctx)

    L = 4
    M = 64
    N = 32
    axes = (-1, -2)

    nd_data = np.arange(L * M * N, dtype=np.complex64)
    nd_data.shape = (L, M, N)
    cl_data = cla.to_device(queue, nd_data)
    cl_data_transformed = cla.zeros_like(cl_data)

    transform = FFT(ctx, queue, cl_data, cl_data_transformed, axes=axes)
    transform.enqueue()

    # Fetch the result and the reference once; the original printed the bound
    # method ``.get`` (no call) and a reference computed without ``axes``.
    result = cl_data_transformed.get()
    expected = np.fft.fft2(nd_data, axes=axes)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
def computeEnergy(self, x, y, z, q):
    """Return the sum of the per-element energies computed on the device.

    The kernel variant is chosen from ``x.dtype`` (float32 or float64). For
    any other dtype a message is printed, no kernel runs, and the sum of the
    zero-initialised buffer is returned.
    """
    xd, yd, zd, qd = (
        cl_array.to_device(self.queue, host) for host in (x, y, z, q))
    coulombEnergy = cl_array.zeros_like(xd)

    prec = x.dtype
    if prec == numpy.float32:
        program, real = self.compEnergyF, numpy.float32
    elif prec == numpy.float64:
        program, real = self.compEnergyD, numpy.float64
    else:
        print("Unknown float type.")
        return numpy.sum(coulombEnergy.get(self.queue))

    program.calc_potential_energy(
        self.queue, (x.size, ), None,
        xd.data, yd.data, zd.data, qd.data,
        coulombEnergy.data,
        numpy.int32(len(x)),
        real(self.k), real(self.impactFact),
        g_times_l=False)

    return numpy.sum(coulombEnergy.get(self.queue))
def test_2d_in_4d_out_of_place(self, ctx):
    """Check an out-of-place 2D FFT over the last two axes of a 4D array.

    The device result is compared with ``np.fft.fft2`` on the same axes,
    using ``rtol=atol=1e-3``.
    """
    queue = cl.CommandQueue(ctx)

    L1 = 4
    L2 = 5
    M = 64
    N = 32
    # Author's notes: (-1, -2) and (0, 1) are ok; (0, 2) cannot be collapsed.
    axes = (-1, -2)

    nd_data = np.arange(L1 * L2 * M * N, dtype=np.complex64)
    nd_data.shape = (L1, L2, M, N)
    cl_data = cla.to_device(queue, nd_data)
    cl_data_transformed = cla.zeros_like(cl_data)

    transform = FFT(ctx, queue, cl_data, cl_data_transformed, axes=axes)
    transform.enqueue()

    # Fetch the result and the reference once; the original printed the bound
    # method ``.get`` (no call) and a reference computed without ``axes``.
    result = cl_data_transformed.get()
    expected = np.fft.fft2(nd_data, axes=axes)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
def _test_desparsification(self, input_on_device, output_on_device):
    """Densify the reference CSR triplet and compare with ``self.array``.

    Args:
        input_on_device: upload data/indices/indptr to the device first.
        output_on_device: pass a pre-allocated device array as ``output``
            and fetch the result with ``get()``.
    """
    current_config = "input on device: %s, output on device: %s" % (
        str(input_on_device), str(output_on_device)
    )

    # De-sparsify on device
    csr = CSR(self.array.shape, max_nnz=self.ref_nnz)

    data, indices, indptr = self.ref_data, self.ref_indices, self.ref_indptr
    if input_on_device:
        data, indices, indptr = (
            parray.to_device(csr.queue, host)
            for host in (data, indices, indptr)
        )

    output = parray.zeros_like(csr.array) if output_on_device else None

    arr = csr.densify(data, indices, indptr, output=output)
    if output_on_device:
        arr = arr.get()

    # Compare
    matches = np.allclose(arr.reshape(self.array.shape), self.array)
    self.assertTrue(
        matches,
        "something wrong with densified data (%s)" % current_config
    )
def test_2d_in_4d_out_of_place(self, ctx):
    """Check an out-of-place 2D FFT over the last two axes of a 4D array.

    The device result is compared with ``np.fft.fft2`` on the same axes,
    using ``rtol=atol=1e-3``.
    """
    queue = cl.CommandQueue(ctx)

    L1 = 4
    L2 = 5
    M = 64
    N = 32
    # Author's notes: (-1, -2) and (0, 1) are ok; (0, 2) cannot be collapsed.
    axes = (-1, -2)

    nd_data = np.arange(L1 * L2 * M * N, dtype=np.complex64)
    nd_data.shape = (L1, L2, M, N)
    cl_data = cla.to_device(queue, nd_data)
    cl_data_transformed = cla.zeros_like(cl_data)

    transform = FFT(
        ctx,
        queue,
        cl_data,
        cl_data_transformed,
        axes=axes,
    )
    transform.enqueue()

    # Fetch the result and the reference once; the original printed the bound
    # method ``.get`` (no call) and a reference computed without ``axes``.
    result = cl_data_transformed.get()
    expected = np.fft.fft2(nd_data, axes=axes)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
def test_2d_out_of_place(self, ctx):
    """Check an out-of-place 2D FFT over the last two axes of a 3D array.

    The device result is compared with ``np.fft.fft2`` on the same axes,
    using ``rtol=atol=1e-3``.
    """
    queue = cl.CommandQueue(ctx)

    L = 4
    M = 64
    N = 32
    axes = (-1, -2)

    nd_data = np.arange(L * M * N, dtype=np.complex64)
    nd_data.shape = (L, M, N)
    cl_data = cla.to_device(queue, nd_data)
    cl_data_transformed = cla.zeros_like(cl_data)

    transform = FFT(
        ctx,
        queue,
        cl_data,
        cl_data_transformed,
        axes=axes,
    )
    transform.enqueue()

    # Fetch the result and the reference once; the original printed the bound
    # method ``.get`` (no call) and a reference computed without ``axes``.
    result = cl_data_transformed.get()
    expected = np.fft.fft2(nd_data, axes=axes)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
             ctx=None, devicetype="all", platformid=None, deviceid=None,
             profile=False):
    """Build on ``ReconstructionAlgorithm``: add kernels and extra buffers.

    All arguments are handed to ``ReconstructionAlgorithm.__init__`` as-is.
    """
    parent_options = dict(
        slice_shape=slice_shape,
        axis_position=axis_position,
        angles=angles,
        ctx=ctx,
        devicetype=devicetype,
        platformid=platformid,
        deviceid=deviceid,
        profile=profile,
    )
    ReconstructionAlgorithm.__init__(self, sino_shape, **parent_options)
    self.compute_preconditioners()

    # Create a LinAlg instance for the slice shape
    self.linalg = LinAlg(self.backprojector.slice_shape, ctx=self.ctx)

    # Positivity constraint
    self.elwise_clamp = ElementwiseKernel(
        self.ctx, "float *a", "a[i] = max(a[i], 0.0f);")

    # Projection onto the L-infinity ball of radius Lambda
    linf_arguments = "float2* a, float Lambda"
    linf_operation = "a[i].x = copysign(min(fabs(a[i].x), Lambda), a[i].x); a[i].y = copysign(min(fabs(a[i].y), Lambda), a[i].y);"
    self.elwise_proj_linf = ElementwiseKernel(
        self.ctx, linf_arguments, linf_operation, "elwise_proj_linf")

    # Additional arrays; the gradient call comes first because the next
    # line reads cl_mem["d_gradient"] (presumably filled by it -- confirm).
    self.linalg.gradient(self.d_x)
    gradient_buffer = self.linalg.cl_mem["d_gradient"]
    self.d_p = parray.zeros_like(gradient_buffer)
    self.d_q = parray.zeros_like(self.d_data)
    self.d_g = self.linalg.d_image
    self.d_tmp = parray.zeros_like(self.d_x)
    self.add_to_cl_mem({
        "d_p": self.d_p,
        "d_q": self.d_q,
        "d_tmp": self.d_tmp,
    })

    self.theta = 1.0
def zeros_like(t: Tensor, gpu=False) -> Tensor:
    """Return a zero-filled tensor modelled on ``t``.

    With ``gpu`` set, the zeros come from ``clarray.zeros_like(t._data)`` and
    the tensor is created with ``gpu=True``. Otherwise the zeros are a NumPy
    array shaped like ``t._data`` with dtype float32.
    """
    if not gpu:
        host_zeros = np.zeros_like(t._data, dtype=np.float32)
        return Tensor(host_zeros)

    device_zeros = clarray.zeros_like(t._data)
    return Tensor(device_zeros, gpu=True)
def __init__(self, sino_shape, slice_shape=None, axis_position=None, angles=None,
             ctx=None, devicetype="all", platformid=None, deviceid=None,
             profile=False):
    """Initialise OpenCL processing, then the projectors and device arrays.

    ``Backprojection`` is built from the geometry arguments; ``Projection``
    then reuses its ``slice_shape``, ``angles`` and ``num_bins``.
    """
    device_options = dict(
        ctx=ctx,
        devicetype=devicetype,
        platformid=platformid,
        deviceid=deviceid,
        profile=profile,
    )
    OpenclProcessing.__init__(self, **device_options)

    # Create a backprojector
    self.backprojector = Backprojection(
        sino_shape,
        slice_shape=slice_shape,
        axis_position=axis_position,
        angles=angles,
        ctx=self.ctx,
        profile=profile
    )
    bp = self.backprojector

    # Create a projector
    self.projector = Projection(
        bp.slice_shape,
        bp.angles,
        axis_position=axis_position,
        detector_width=bp.num_bins,
        normalize=False,
        ctx=self.ctx,
        profile=profile
    )
    self.sino_shape = sino_shape
    self.is_cpu = bp.is_cpu

    # Arrays
    float32 = np.float32
    self.d_data = parray.zeros(self.queue, sino_shape, dtype=float32)
    self.d_sino = parray.zeros_like(self.d_data)
    self.d_x = parray.zeros(self.queue, bp.slice_shape, dtype=float32)
    self.d_x_old = parray.zeros_like(self.d_x)
    self.add_to_cl_mem({
        "d_data": self.d_data,
        "d_sino": self.d_sino,
        "d_x": self.d_x,
        "d_x_old": self.d_x_old,
    })
def test_1d_out_of_place(self, ctx):
    """Compare an out-of-place FFT of 32 complex64 samples with ``np.fft.fft``."""
    queue = cl.CommandQueue(ctx)

    host_input = np.arange(32, dtype=np.complex64)
    device_input = cla.to_device(queue, host_input)
    device_output = cla.zeros_like(device_input)

    # The plan object is kept in a local so it stays alive during the test.
    plan = FFT(ctx, queue, device_input, device_output)
    plan.enqueue()

    result = device_output.get()
    reference = np.fft.fft(host_input)
    assert np.allclose(result, reference)
def test_adj_inplace(self):
    """Check that ``<fwd(x), y>`` equals ``<x, adj(y)>`` for ``self.op``.

    Both inner products are divided by ``opinadj.size`` and compared within
    ``RTOL``/``ATOL``.
    """
    queue = self.queue
    dev_fwd_in = clarray.to_device(queue, self.opinfwd)
    dev_adj_in = clarray.to_device(queue, self.opinadj)
    dev_fwd_out = clarray.zeros_like(dev_adj_in)
    dev_adj_out = clarray.zeros_like(dev_fwd_in)

    # The operator takes [input, <empty list>, gradient buffer].
    self.op.fwd(dev_fwd_out, [dev_fwd_in, [], self.grad_buf])
    self.op.adj(dev_adj_out, [dev_adj_in, [], self.grad_buf])

    fwd_host = dev_fwd_out.get()
    adj_host = dev_adj_out.get()

    n_elem = self.opinadj.size
    a = np.vdot(fwd_host.flatten(), self.opinadj.flatten())/n_elem
    b = np.vdot(self.opinfwd.flatten(), adj_host.flatten())/n_elem

    diff = a - b
    print("Adjointness: %.2e +1j %.2e" % (diff.real, diff.imag))
    np.testing.assert_allclose(a, b, rtol=RTOL, atol=ATOL)
def ones_like(array, backend='cython'):
    """Return an ``Array`` of ones modelled on ``array``.

    For ``'opencl'`` the ones are built as ``1 + zeros_like(array)``; for
    ``'cuda'`` ``pycuda.gpuarray.ones_like`` is used. Any other backend wraps
    ``np.ones_like(array)`` directly.
    """
    if backend not in ('opencl', 'cuda'):
        return Array(np.ones_like(array))

    if backend == 'opencl':
        import pyopencl.array as gpuarray
        device_ones = 1 + gpuarray.zeros_like(array)
    else:
        import pycuda.gpuarray as gpuarray
        device_ones = gpuarray.ones_like(array)

    # GPU results are attached to an initially empty wrapper.
    result = Array()
    result.set_dev_array(device_ones)
    return result
def init_indices_buffers(self, image_width, image_height, kernels):
    """Create the host and device index buffers used for sorting.

    Args:
        image_width: unused by this method; kept for interface compatibility.
        image_height: unused by this method; kept for interface compatibility.
        kernels: two-level mapping ``cell -> centre -> ...``. Only the keys
            are used, to lay out one host back-buffer per (cell, centre).

    Sets ``indices_host_buffer``, ``indices_gpu_buffer``,
    ``sorted_indices_gpu_buffer`` and ``indices_host_back_buffers``.
    """
    # Identity permutation 0..array_size-1, on the host and on the device.
    self.indices_host_buffer = numpy.arange(self.array_size, dtype=numpy.int32)
    self.indices_gpu_buffer = cl_array.arange(
        self.queue, 0, self.array_size, dtype=numpy.int32)
    self.sorted_indices_gpu_buffer = cl_array.zeros_like(self.indices_gpu_buffer)

    # One zeroed int32 buffer, shaped like the source buffer, per pair.
    self.indices_host_back_buffers = {
        cell: {
            centre: numpy.zeros_like(self.source_host_buffer, dtype=numpy.int32)
            for centre in centres
        }
        for cell, centres in kernels.items()
    }
def test_adj_inplace(self):
    """Check that ``<fwd(x), y>`` equals ``<x, adj(y)>`` for ``self.op``.

    Both inner products are divided by ``opinadj.size`` and must agree to 12
    decimal places.
    """
    queue = self.queue
    dev_fwd_in = clarray.to_device(queue, self.opinfwd)
    dev_adj_in = clarray.to_device(queue, self.opinadj)
    dev_fwd_out = clarray.zeros_like(dev_adj_in)
    dev_adj_out = clarray.zeros_like(dev_fwd_in)

    # The operator takes [input, <empty list>, gradient buffer].
    self.op.fwd(dev_fwd_out, [dev_fwd_in, [], self.grad_buf])
    self.op.adj(dev_adj_out, [dev_adj_in, [], self.grad_buf])

    fwd_host = dev_fwd_out.get()
    adj_host = dev_adj_out.get()

    n_elem = self.opinadj.size
    a = np.vdot(fwd_host.flatten(), self.opinadj.flatten()) / n_elem
    b = np.vdot(self.opinfwd.flatten(), adj_host.flatten()) / n_elem

    diff = a - b
    print("Adjointness: %.2e +1j %.2e" % (diff.real, diff.imag))
    self.assertAlmostEqual(a, b, places=12)
def _test_sparsification(self, input_on_device, output_on_device):
    """Sparsify ``self.array`` and compare with the reference CSR triplet.

    Args:
        input_on_device: upload the flattened array to the device first.
        output_on_device: pass pre-allocated device arrays as ``output`` and
            fetch the three results with ``get()``.
    """
    current_config = "input on device: %s, output on device: %s" % (
        str(input_on_device), str(output_on_device)
    )

    # Sparsify on device
    csr = CSR(self.array.shape)

    # A device-side input has to be flattened.
    if input_on_device:
        arr = parray.to_device(csr.queue, self.array.ravel())
    else:
        arr = self.array

    output = None
    if output_on_device:
        output = (
            parray.zeros_like(csr.data),
            parray.zeros_like(csr.indices),
            parray.zeros_like(csr.indptr),
        )

    data, indices, indptr = csr.sparsify(arr, output=output)
    if output_on_device:
        data, indices, indptr = data.get(), indices.get(), indptr.get()

    # Compare; only the first nnz entries of data/indices are meaningful.
    nnz = self.ref_nnz
    self.assertTrue(
        np.allclose(data[:nnz], self.ref_data),
        "something wrong with sparsified data (%s)" % current_config
    )
    self.assertTrue(
        np.allclose(indices[:nnz], self.ref_indices),
        "something wrong with sparsified indices (%s)" % current_config
    )
    self.assertTrue(
        np.allclose(indptr, self.ref_indptr),
        "something wrong with sparsified indices pointers (indptr) (%s)"
        % current_config
    )
def test_rotate_grid3d_linear(self):
    """Test rotate_grid3d kernel using linear interpolation.

    The kernel is run with its last argument set to ``np.int32(False)`` for
    an identity rotation, a 90 degree rotation around the z-axis and a
    non-integer rotation.
    """
    k = self.k._program.rotate_grid3d
    # Identity rotation
    # The 9 matrix entries are padded with 7 zeros to 16 float32 values.
    rotmat = np.asarray([1, 0, 0, 0, 1, 0, 0, 0, 1] + [0] * 7, dtype=np.float32)
    grid = np.zeros((4, 5, 6), dtype=np.float32)
    grid[0, 0, 0] = 1
    grid[0, 0, 1] = 1
    grid[0, 1, 1] = 1
    grid[0, 0, 2] = 1
    grid[0, 0, -1] = 1
    grid[-1, 0, 0] = 1
    self.cl_grid = cl_array.to_device(self.queue, grid)
    self.cl_out = cl_array.zeros_like(self.cl_grid)
    args = (self.cl_grid.data, rotmat, self.cl_out.data, np.int32(False))
    # Global work size: a cube of side 2 * llength + 1.
    gws = tuple([2 * self.values['llength'] + 1] * 3)
    k(self.queue, gws, None, *args)
    self.assertTrue(np.allclose(self.cl_grid.get(), self.cl_out.get()))

    # 90' rotation around z-axis
    self.cl_out.fill(0)
    rotmat = np.asarray([0, -1, 0, 1, 0, 0, 0, 0, 1] + [0] * 7, dtype=np.float32)
    args = (self.cl_grid.data, rotmat, self.cl_out.data, np.int32(False))
    gws = tuple([2 * self.values['llength'] + 1] * 3)
    k(self.queue, gws, None, *args)
    # NOTE(review): the expected array uses self.shape, the input grid a
    # literal (4, 5, 6) -- presumably the same; confirm against setUp.
    answer = np.zeros(self.shape, dtype=np.float32)
    answer[0, 0, 0] = 1
    answer[0, 1, 0] = 1
    answer[0, 1, -1] = 1
    answer[0, 2, 0] = 1
    answer[0, -1, 0] = 1
    answer[-1, 0, 0] = 1
    self.assertTrue(np.allclose(answer, self.cl_out.get()))

    # Non-integer rotation
    rotmat = np.asarray(
        [[0.30901699, -0.5, 0.80901699],
         [-0.80901699, 0.30901699, 0.5],
         [-0.5, -0.80901699, -0.30901699]],
        dtype=np.float64)
    cl_rotmat = np.asarray(rotmat.ravel().tolist() + [0] * 7, dtype=np.float32)
    args = (self.cl_grid.data, cl_rotmat, self.cl_out.data, np.int32(False))
    k(self.queue, gws, None, *args)
    # Reference result from the host implementation, written into self.out.
    rotate_grid3d(self.grid, rotmat, 2, self.out, False)
    # NOTE(review): this comparison is computed but never asserted, and
    # cl_out is not cleared before this case -- confirm whether intended.
    test = np.allclose(self.cl_out.get(), self.out)
def Wp_func(params, G, P, loc, eflag, out=None):
    """Compute ``Wp`` on the grid from the primitives ``P`` and return it.

    Args:
        params: mapping read for ``'queue'``, ``'n_prim'``, ``'gam'`` and
            ``'gamma_max'``.
        G: grid object providing ``slices``, ``shapes``, ``lower_grid``,
            ``dot`` and ``NG``.
        P: primitives array, indexed by ``s.U3VEC`` here and by ``RHO``/``UU``
            inside the kernel.
        loc: location passed through to ``G.lower_grid``.
        eflag: int32 flag array; the kernel writes 2 where ``cond2`` holds.
        out: optional output array; allocated with ``zeros_like(utsq)`` when
            omitted.

    Returns:
        The array holding ``Wp`` (``out``).
    """
    s = G.slices
    sh = G.shapes

    # Again, vectors are done full-grid
    # Component 0 is zero; the remaining components are copied from P.
    utcon = cl_array.empty(params['queue'], sh.grid_vector, dtype=np.float64)
    utcon[0] = 0
    utcon[1:] = P[s.U3VEC]
    utcov = G.lower_grid(utcon, loc)
    utsq = G.dot(utcon, utcov)

    # The kernel is built on first use and cached in a module-level global.
    global knl_Wp_func
    if knl_Wp_func is None:
        code = add_ghosts(
            replace_prim_names("""
        cond1 := ((utsq_in[i,j,k] < 0.) * (abs(utsq_in[i,j,k]) < 1.e-13))
        utsq1 := if(cond1, fabs(utsq_in[i,j,k]), utsq_in[i,j,k])
        # Catch utsq < 0 and record it
        cond2 := ((utsq1 < 0) + (utsq1 > 1.e3 * gamma_max ** 2))
        utsq := if(cond2, (P[RHO,i,j,k] + P[UU,i,j,k]), utsq1)
        eflag[i,j,k] = if(cond2, 2, eflag[i,j,k])
        gamma := sqrt(1. + fabs(utsq))
        Wp[i,j,k] = (P[RHO,i,j,k] + P[UU,i,j,k] + (gam - 1.) * P[UU,i,j,k]) * gamma ** 2 - P[RHO,i,j,k] * gamma
        """))
        knl_Wp_func = lp.make_kernel(
            sh.isl_grid_scalar, code,
            [
                *primsArrayArgs("P"),
                *scalarArrayArgs("utsq_in", "Wp"),
                *scalarArrayArgs("eflag", dtype=np.int32),
                ...
            ],
            assumptions=sh.assume_grid, default_offset=lp.auto)
        # Bake the scalar parameters into the kernel as constants.
        knl_Wp_func = lp.fix_parameters(knl_Wp_func, nprim=params['n_prim'],
                                        gam=params['gam'],
                                        gamma_max=params['gamma_max'])
        knl_Wp_func = tune_grid_kernel(knl_Wp_func, sh.bulk_scalar, ng=G.NG)
        print("Compiled Wp_func")

    if out is None:
        out = cl_array.zeros_like(utsq)

    # The returned event is not waited on here.
    evt, _ = knl_Wp_func(params['queue'], P=P, utsq_in=utsq, Wp=out, eflag=eflag)
    return out
def test_adj_inplace(self):
    """Check that ``<fwd(x), y>`` equals ``<x, adj(y)>`` for ``self.op``.

    The operator gets the coil and gradient buffers. Results are mapped to
    the host after waiting on the attached events, then compared within
    ``RTOL``/``ATOL``.
    """
    queue = self.queue
    dev_fwd_in = clarray.to_device(queue, self.opinfwd)
    dev_adj_in = clarray.to_device(queue, self.opinadj)
    dev_fwd_out = clarray.zeros_like(dev_adj_in)
    dev_adj_out = clarray.zeros_like(dev_fwd_in)

    fwd_event = self.op.fwd(
        dev_fwd_out, [dev_fwd_in, self.coil_buf, self.grad_buf])
    dev_fwd_out.add_event(fwd_event)
    adj_event = self.op.adj(
        dev_adj_out, [dev_adj_in, self.coil_buf, self.grad_buf])
    dev_adj_out.add_event(adj_event)

    fwd_host = dev_fwd_out.map_to_host(wait_for=dev_fwd_out.events)
    adj_host = dev_adj_out.map_to_host(wait_for=dev_adj_out.events)

    n_elem = self.opinadj.size
    a = np.vdot(fwd_host.flatten(), self.opinadj.flatten()) / n_elem
    b = np.vdot(self.opinfwd.flatten(), adj_host.flatten()) / n_elem

    diff = a - b
    print("Adjointness: %.2e +1j %.2e" % (diff.real, diff.imag))
    np.testing.assert_allclose(a, b, rtol=RTOL, atol=ATOL)
def setup_device(self, imshape):
    """Allocate device buffers and build the kernels for images of ``imshape``.

    The shape is remembered in ``self.imshape``.
    """
    print('Setting up with imshape = %s' % (str(imshape)))
    self.imshape = imshape

    # Image buffer plus same-shaped scratch space; only clO and clM are
    # zero-initialised, the others are left uninitialised.
    image = cla.Array(self.q, imshape, numpy.float32)
    self.clIm = image
    self.clm, self.clx, self.cly = (cla.empty_like(image) for _ in range(3))
    self.clO, self.clM = (cla.zeros_like(image) for _ in range(2))
    self.clF, self.clS, self.clThisS, self.clScratch = (
        cla.empty_like(image) for _ in range(4))

    self.radial_prg = pyopencl.Program(self.ctx, PROGRAM).build()

    self.sobel = Sobel(self.ctx, self.q)
    # NaiveSeparableCorrelation is the alternative implementation.
    self.sepcorr2d = LocalMemorySeparableCorrelation(self.ctx, self.q)

    # Small elementwise helpers.
    self.accum = ElementwiseKernel(
        self.ctx, 'float *a, float *b', 'a[i] += b[i]')
    self.norm_s = ElementwiseKernel(
        self.ctx,
        'float *s, const float nRadii',
        's[i] = -1 * s[i] / nRadii',
        'norm_s')
    self.accum_s = ElementwiseKernel(
        self.ctx,
        'float *a, float *b, const float nr',
        'a[i] -= b[i] / nr')

    self.gaussians = {}
    self.gaussian_prgs = {}

    self.minmax = MinMaxKernel(self.ctx, self.q)
def _init_cl_arrays(self):
    """Upload ``G`` and its conjugate and allocate the work and field arrays.

    All arrays use ``self.complexdtype``. The two work arrays are zeroed;
    the two field arrays are left uninitialised.
    """
    queue = self.queue
    ctype = self.complexdtype

    g_complex = self.G.astype(ctype)
    self.cl_G = cla.to_device(queue, g_complex)
    self.cl_G_conj = cla.to_device(queue, g_complex.conj())

    # Work buffers on the padded shape.
    self.cl_work = cla.zeros(queue, tuple(self.N12_pad), ctype)
    self.cl_workF = cla.zeros_like(self.cl_work)

    # Field buffers on the two unpadded shapes.
    self.cl_field1 = cla.empty(queue, tuple(self.N1), ctype)
    self.cl_field2 = cla.empty(queue, tuple(self.N2), ctype)
def test_1d_out_of_place(self, ctx):
    """Run a 1D FFT into a separate output array and check it against NumPy."""
    queue = cl.CommandQueue(ctx)

    samples = np.arange(32, dtype=np.complex64)
    source = cla.to_device(queue, samples)
    destination = cla.zeros_like(source)

    # Keep a reference to the transform for the duration of the test.
    fft = FFT(ctx, queue, source, destination)
    fft.enqueue()

    computed = destination.get()
    assert np.allclose(computed, np.fft.fft(samples))
def raise_grid(self, vcov, loc=Loci.CENT, ocl=True, out=None):
    """Raise a grid of covariant rank-1 tensors to contravariant ones.

    Uses the ``dot2geom`` kernel when both ``self.use_ocl`` and ``ocl`` are
    true, writing into ``out`` (allocated to match ``vcov`` when omitted).
    Otherwise the contraction is done on the host with ``np.einsum``.
    """
    if not (self.use_ocl and ocl):
        metric = self.gcon[loc.value, :, :, :, :, None]
        return np.einsum("ij...,j...->i...", metric, vcov)

    if out is None:
        # Match the container type of the input: host or device array.
        if isinstance(vcov, np.ndarray):
            make_zeros = np.zeros_like
        else:
            make_zeros = cl_array.zeros_like
        out = make_zeros(vcov)

    _event, _ = self.dot2geom(
        self.queue, g=self.gcon_d[loc.value], v=vcov, out=out)
    return out
def gamma_func(params, G, Bsq, D, QdB, Qtsq, Wp, eflag, out=None):
    """Compute ``gamma = sqrt(1 + |utsq|)`` on the grid and return it.

    Args:
        params: mapping read for ``'queue'`` and ``'gamma_max'``.
        G: grid object providing ``shapes`` and ``NG``.
        Bsq, D, QdB, Qtsq, Wp: scalar grid arrays read by the kernel.
        eflag: int32 flag array; the kernel writes 2 where ``utsq`` is
            negative or above ``1e3 * gamma_max**2``.
        out: optional output array; allocated with ``zeros_like(Bsq)`` when
            omitted.

    Returns:
        The array holding ``gamma`` (``out``).
    """
    sh = G.shapes

    # The kernel is built on first use and cached in a module-level global.
    global knl_gamma_func
    if knl_gamma_func is None:
        code = add_ghosts("""
        W := D[i,j,k] + Wp[i,j,k]
        WB := W + Bsq[i,j,k]
        # This is basically inversion of eq. A7 of MM
        <> utsq = -((W + WB) * QdB[i,j,k]**2 + W**2 * Qtsq[i,j,k]) / \
            (QdB[i,j,k]**2 * (W + WB) + W**2 * (Qtsq[i,j,k] - WB**2))
        # Catch utsq < 0 and record it
        cond := ((utsq < 0) + (utsq > 1.e3 * gamma_max ** 2))
        eflag[i,j,k] = if(cond, 2, eflag[i,j,k])
        gamma[i,j,k] = sqrt(1. + fabs(utsq))
        """)
        knl_gamma_func = lp.make_kernel(
            sh.isl_grid_scalar, code,
            [
                *scalarArrayArgs("Bsq", "D", "QdB", "Qtsq", "Wp", "gamma"),
                *scalarArrayArgs("eflag", dtype=np.int32),
                ...
            ],
            assumptions=sh.assume_grid, default_offset=lp.auto)
        # Bake gamma_max into the kernel as a constant.
        knl_gamma_func = lp.fix_parameters(knl_gamma_func,
                                           gamma_max=params['gamma_max'])
        knl_gamma_func = tune_grid_kernel(knl_gamma_func, sh.bulk_scalar, ng=G.NG)
        print("Compiled gamma_func")

    if out is None:
        out = cl_array.zeros_like(Bsq)

    # The returned event is not waited on here.
    evt, _ = knl_gamma_func(params['queue'], Bsq=Bsq, D=D, QdB=QdB, Qtsq=Qtsq,
                            Wp=Wp, gamma=out, eflag=eflag)
    return out
def test_create_plan(self):
    """Create a plan for a 2x4 complex64 array, print its layout, then free it."""
    G = gpyfftlib.GpyFFT()
    ctx = get_contexts()[0]
    queue = cl.CommandQueue(ctx)

    host_values = [[1, 2, 3, 4], [5, 6, 7, 8]]
    nd_data = np.array(host_values, dtype=np.complex64)
    cl_data = cla.to_device(queue, nd_data)
    # Allocated as in an out-of-place transform, although it is not used.
    cl_data_transformed = cla.zeros_like(cl_data)

    plan = G.create_plan(ctx, cl_data.shape)

    for attribute in ('strides_in', 'strides_out', 'distances', 'batch_size'):
        print('plan.' + attribute, getattr(plan, attribute))

    # Release the plan before the library handle.
    del plan
    del G
def err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag, out=None):
    """Evaluate the error expression on the grid and return it.

    ``gamma`` is first obtained from ``gamma_func`` with the same inputs
    (which may also update ``eflag``), then the kernel writes ``err``.

    Args:
        params: mapping read for ``'queue'``, ``'n_prim'``, ``'gam'`` and
            ``'gamma_max'``.
        G: grid object providing ``shapes`` and ``NG``.
        Bsq, D, Ep, QdB, Qtsq, Wp: scalar grid arrays read by the kernel.
        eflag: int32 flag array passed to both kernels.
        out: optional output array; allocated with ``zeros_like(Bsq)`` when
            omitted.

    Returns:
        The array holding ``err`` (``out``).
    """
    sh = G.shapes

    gamma = gamma_func(params, G, Bsq, D, QdB, Qtsq, Wp, eflag)

    # The kernel is built on first use and cached in a module-level global.
    global knl_err_eqn
    if knl_err_eqn is None:
        code = add_ghosts("""
        W := Wp[i,j,k] + D[i,j,k]
        w := W / (gamma[i,j,k]**2)
        rho0 := D[i,j,k] / gamma[i,j,k]
        pres := (w - rho0) * (gam - 1.) / gam
        err[i,j,k] = -Ep[i,j,k] + Wp[i,j,k] - pres + 0.5*Bsq[i,j,k] + \
            0.5*(Bsq[i,j,k] * Qtsq[i,j,k] - QdB[i,j,k]**2)/((Bsq[i,j,k] + W)**2)
        """)
        knl_err_eqn = lp.make_kernel(
            sh.isl_grid_scalar, code,
            [
                *scalarArrayArgs("gamma", "Bsq", "D", "Ep", "QdB", "Qtsq", "Wp", "err"),
                *scalarArrayArgs("eflag", dtype=np.int32),
                ...
            ],
            assumptions=sh.assume_grid, default_offset=lp.auto)
        # NOTE(review): nprim and gamma_max are fixed here although the kernel
        # text above only references gam -- confirm this is intended.
        knl_err_eqn = lp.fix_parameters(knl_err_eqn, nprim=params['n_prim'],
                                        gam=params['gam'],
                                        gamma_max=params['gamma_max'])
        knl_err_eqn = tune_grid_kernel(knl_err_eqn, sh.bulk_scalar, ng=G.NG)

    if out is None:
        out = cl_array.zeros_like(Bsq)

    # The returned event is not waited on here.
    evt, _ = knl_err_eqn(params['queue'], Bsq=Bsq, D=D, Ep=Ep, QdB=QdB,
                         Qtsq=Qtsq, Wp=Wp, gamma=gamma, err=out, eflag=eflag)
    return out
def test_create_plan(self):
    """Create a plan for a 2x4 complex64 array and print its layout."""
    G = gpyfftlib.GpyFFT()
    ctx = get_contexts()[0]
    queue = cl.CommandQueue(ctx)

    nd_data = np.array(
        [[1, 2, 3, 4],
         [5, 6, 7, 8]],
        dtype=np.complex64)
    cl_data = cla.to_device(queue, nd_data)
    # Allocated as in an out-of-place transform, although it is not used.
    cl_data_transformed = cla.zeros_like(cl_data)

    plan = G.create_plan(ctx, cl_data.shape)

    layout = (
        ('plan.strides_in', plan.strides_in),
        ('plan.strides_out', plan.strides_out),
        ('plan.distances', plan.distances),
        ('plan.batch_size', plan.batch_size),
    )
    for label, value in layout:
        print(label, value)
def computeAcc(self, xd, yd, zd, vxd, vyd, vzd, qd, md, axd, ayd, azd, t, dt):
    """Add photon-recoil kicks to the acceleration arrays.

    The method runs in three stages:

    1. A kernel fills ``nbars`` with the mean number of scattered photons per
       particle. The homogeneous-beam kernel is used when ``self.sigma`` is
       ``None``, the Gaussian-beam kernel otherwise.
    2. ``self.findSample`` turns those means into actual counts, capped at
       ``nMax``, and normal-distributed values are generated for the recoil
       directions.
    3. The ``computeKicks`` kernel receives ``axd``, ``ayd`` and ``azd`` to
       apply the kicks.

    ``qd`` and ``t`` are accepted but not used by this method.
    """
    f32 = numpy.float32
    n_particles = numpy.int32(xd.size)

    # Compute average numbers of scattered photons
    nbars = cl_array.zeros_like(xd)
    phase_space = (xd.data, yd.data, zd.data, vxd.data, vyd.data, vzd.data)
    wave_vector = (f32(self.k0[0]), f32(self.k0[1]), f32(self.k0[2]))
    transition = (f32(self.gamma), f32(self.delta0), f32(self.S), f32(dt),
                  n_particles, nbars.data)

    # Identity test: ``== None`` is unreliable for array-like sigma values.
    if self.sigma is None:
        kernel_args = phase_space + wave_vector + transition
        self.program.compute_mean_scattered_photons_homogeneous_beam(
            self.queue, (xd.size, ), None, *kernel_args)
    else:
        beam_profile = (f32(self.x0[0]), f32(self.x0[1]), f32(self.x0[2]),
                        f32(self.sigma))
        kernel_args = phase_space + wave_vector + beam_profile + transition
        self.program.compute_mean_scattered_photons_gaussian_beam(
            self.queue, (xd.size, ), None, *kernel_args)

    # Compute scattered photons and associated recoil kicks
    nMax = int(
        math.ceil(10.0 * self.S * (self.gamma / 2.0 / numpy.pi) * dt))
    actualNs = self.findSample(nbars, nMax)

    # One 3-vector slot per particle and per possible photon.
    recoilDirectionsD = cl_array.Array(self.queue, [nbars.size, nMax, 3],
                                       dtype=numpy.float32)
    self.generator.fill_normal(recoilDirectionsD)

    # apply recoil kicks to particles
    recoilMomentum = numpy.linalg.norm(
        self.k0) * self._PlanckConstantReduced
    self.program.computeKicks(
        self.queue, (xd.size, ), None,
        md.data,
        actualNs.data,
        numpy.int32(nMax),
        recoilDirectionsD.data,
        wave_vector[0], wave_vector[1], wave_vector[2],
        f32(recoilMomentum),
        f32(dt),
        axd.data, ayd.data, azd.data,
        numpy.int32(xd.shape[0]))
def template_test(self, test_name):
    """Run one convolution test case and compare it with its reference.

    ``self.param`` decides whether the input is uploaded first and whether a
    pre-allocated device array is used for the output.
    """
    data, kernel = self.get_data_and_kernel(test_name)
    conv = self.instantiate_convol(data.shape, kernel)

    use_device_input = self.param["input_on_device"]
    use_device_output = self.param["output_on_device"]

    data_ref = parray.to_device(conv.queue, data) if use_device_input else data
    res = parray.zeros_like(conv.data_out) if use_device_output else None

    res = conv(data_ref, output=res)
    if use_device_output:
        res = res.get()

    # The reference is always computed from the host copy of the data.
    ref_func = self.get_reference_function(test_name)
    ref = ref_func(data, kernel)
    metric = self.compare(res, ref)
    logger.info("%s: max error = %.2e" % (test_name, metric))

    # The tolerance depends on the kernel dimensionality ("1D", "2D", ...).
    tol_key = str("%dD" % kernel.ndim)
    tol = self.tol[tol_key]
    self.assertLess(metric, tol, self.print_err(conv))
def test_rotate_grid3d_nearest(self):
    """Test rotate_grid3d kernel using nearest interpolation.

    The kernel is run with its last argument set to ``np.int32(True)`` for an
    identity rotation and a 90 degree rotation around the z-axis.
    """
    k = self.k._program.rotate_grid3d
    padding = [0] * 7  # pads the 9 matrix entries to 16 float32 values

    # Identity rotation
    rotmat = np.asarray([1, 0, 0, 0, 1, 0, 0, 0, 1] + padding, dtype=np.float32)
    grid = np.zeros((4, 5, 6), dtype=np.float32)
    occupied = (
        (0, 0, 0), (0, 0, 1), (0, 1, 1), (0, 0, 2), (0, 0, -1), (-1, 0, 0),
    )
    for voxel in occupied:
        grid[voxel] = 1

    self.cl_grid = cl_array.to_device(self.queue, grid)
    self.cl_out = cl_array.zeros_like(self.cl_grid)
    args = (self.cl_grid.data, rotmat, self.cl_out.data, np.int32(True))
    # Global work size: a cube of side 2 * llength + 1.
    gws = tuple([2 * self.values['llength'] + 1] * 3)
    k(self.queue, gws, None, *args)
    self.assertTrue(np.allclose(self.cl_grid.get(), self.cl_out.get()))

    # 90' rotation around z-axis
    self.cl_out.fill(0)
    rotmat = np.asarray([0, -1, 0, 1, 0, 0, 0, 0, 1] + padding, dtype=np.float32)
    args = (self.cl_grid.data, rotmat, self.cl_out.data, np.int32(True))
    gws = tuple([2 * self.values['llength'] + 1] * 3)
    k(self.queue, gws, None, *args)

    answer = np.zeros(self.shape, dtype=np.float32)
    expected_voxels = (
        (0, 0, 0), (0, 1, 0), (0, 1, -1), (0, 2, 0), (0, -1, 0), (-1, 0, 0),
    )
    for voxel in expected_voxels:
        answer[voxel] = 1
    self.assertTrue(np.allclose(answer, self.cl_out.get()))
def computeEnergy(self, x, y, z, q):
    """Return the summed output of the ``calc_potential_energy`` kernel.

    The four host arrays are uploaded to the device, a zeroed device array
    shaped like ``x`` receives the kernel output, and its sum is returned.
    The kernel and scalar type are chosen from ``x.dtype``: float32 uses
    ``self.compEnergyF``, float64 uses ``self.compEnergyD``. For any other
    dtype a message is printed, no kernel is launched, and the sum of the
    still-zero array is returned.
    """
    xd, yd, zd, qd = (
        cl_array.to_device(self.queue, host) for host in (x, y, z, q)
    )
    coulombEnergy = cl_array.zeros_like(xd)

    prec = x.dtype
    if prec == numpy.float32:
        real, program = numpy.float32, self.compEnergyF
    elif prec == numpy.float64:
        real, program = numpy.float64, self.compEnergyD
    else:
        real, program = None, None

    if real is None:
        print("Unknown float type.")
    else:
        program.calc_potential_energy(
            self.queue, (x.size, ), None,
            xd.data, yd.data, zd.data, qd.data,
            coulombEnergy.data,
            numpy.int32(len(x)),
            real(self.k),
            real(self.impactFact),
            g_times_l=False)

    return numpy.sum(coulombEnergy.get(self.queue))
def _rev_grad(self, valuation, adjoint, gradient, cache):
    """Propagate the adjoint of this op back to its operands.

    ``self.ops`` holds the input, the weights and the bias (the bias may
    be ``None`` in ``cache``). Their cached values are read from ``cache``
    keyed by ``id(op)``, and ``adjoint`` is the gradient with respect to
    this op's output.

    * Weight gradient: for every sample, ``gy_mats[i] . col_mats[i]^T`` is
      accumulated into a zero-initialised array shaped like the weights.
    * Input gradient: for every sample, ``W_mat^T . gy_mats[i]`` is stored
      into a column buffer, which ``conv.col2im`` folds back to the input
      shape.
    * Bias gradient: ``conv.bgrads_sum`` over the adjoint, only when a
      bias value is cached.

    Each result is forwarded to the matching operand's ``_rev_grad``.
    ``self.col`` is presumably the column matrix saved by the forward
    pass -- TODO confirm.
    """
    q = pl.qs[0]
    X = cache[id(self.ops[0])]
    W = cache[id(self.ops[1])]
    b = cache[id(self.ops[2])]
    gy = adjoint
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = X.shape
    kh, kw = W.shape[2:]

    # gW_mat is a reshaped handle on gW; the in-place += below is what
    # fills gW.
    gW = clarray.zeros_like(W)
    gW_mat = gW.reshape(out_c, c * kh * kw)
    col_mats = self.col.reshape(n, c * kh * kw, out_h * out_w)
    gy_mats = gy.reshape(n, out_c, out_h * out_w)

    # ``range`` instead of ``xrange``: same iteration, and it also exists
    # on Python 3 where ``xrange`` is a NameError.
    for i in range(n):
        gwmat = linalg.dot(q, gy_mats[i], col_mats[i], transB=True)
        gW_mat += gwmat

    W_mat = W.reshape(out_c, -1)
    gcol = clarray.empty_like(self.col)
    gcol_mats = gcol.reshape(n, c * kh * kw, out_h * out_w)

    for i in range(n):
        gcol_mats[i] = linalg.dot(q, W_mat, gy_mats[i], transA=True)

    gx, ev = conv.col2im(q, gcol, self.sy, self.sx, self.ph, self.pw, h, w)
    ev.wait()

    gb = None
    if b is not None:
        gb, ev = conv.bgrads_sum(q, gy)
        ev.wait()

    # TODO bias... sum along multiple axes of gy?
    # TODO set gW, gx and gb in gradient dict
    self.ops[0]._rev_grad(valuation, gx, gradient, cache)
    self.ops[1]._rev_grad(valuation, gW, gradient, cache)
    if gb is not None:
        self.ops[2]._rev_grad(valuation, gb, gradient, cache)
def _gpu_init(self):
    """Populate ``self.gpu_data`` with device buffers, kernels and settings.

    Host data is read from ``self.data`` and uploaded through
    ``self.queue``. Besides the uploaded arrays, zero-initialised work
    buffers are allocated, the FFT helpers are attached to the kernel
    object, and the transforms of ``rcore`` and ``rsurf`` are computed
    immediately. Note that the host ``lsurf`` array is uploaded as an
    image (``im_lsurf``), while the ``lsurf`` entry is a zeroed work
    buffer.
    """
    self.gpu_data = {}
    dev = self.gpu_data
    host = self.data
    queue = self.queue
    ctx = queue.context

    def upload_as(name, dtype):
        # Cast the host entry to ``dtype`` and copy it to the device.
        return cl_array.to_device(queue, host[name].astype(dtype))

    def padded_coordinates(xyz):
        # Copy the first three columns into a zeroed (N, 4) float32 array.
        out = np.zeros((xyz.shape[0], 4), dtype=np.float32)
        out[:, :3] = xyz[:]
        return out

    for name in ('rcore', 'rsurf'):
        dev[name] = cl_array.to_device(queue, float32array(host[name].array))
    dev['im_lsurf'] = cl.image_from_array(
        ctx, float32array(host['lsurf'].array))
    dev['sampler'] = cl.Sampler(
        ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.LINEAR)

    for name in ('lsurf', 'clashvol', 'intervol'):
        dev[name] = cl_array.zeros_like(dev['rcore'])
    dev['interspace'] = cl_array.zeros(queue, host['shape'], dtype=np.int32)

    # complex arrays: same shape as the real grid except that the first
    # axis has shape[0] // 2 + 1 entries
    dev['ft_shape'] = list(host['shape'])
    dev['ft_shape'][0] = host['shape'][0]//2 + 1
    dev['ft_rcore'] = cl_array.zeros(
        queue, dev['ft_shape'], dtype=np.complex64)
    for name in ('ft_rsurf', 'ft_lsurf', 'ft_clashvol', 'ft_intervol'):
        dev[name] = cl_array.zeros_like(dev['ft_rcore'])

    # allocate SAXS arrays
    for name in ('q', 'targetIq', 'sq', 'base_Iq', 'fifj'):
        dev[name] = cl_array.to_device(queue, float32array(host[name]))
    for name in ('rind', 'lind'):
        dev[name] = upload_as(name, np.int32)

    receptor_xyz = padded_coordinates(host['rxyz'])
    ligand_xyz = padded_coordinates(host['lxyz'])
    dev['rxyz'] = cl_array.to_device(queue, receptor_xyz)
    dev['lxyz'] = cl_array.to_device(queue, ligand_xyz)
    dev['rot_lxyz'] = cl_array.zeros_like(dev['lxyz'])

    for name in ('chi2', 'best_chi2'):
        dev[name] = upload_as(name, np.float32)
    dev['rot_ind'] = cl_array.zeros(queue, host['shape'], dtype=np.int32)

    # The origin stays on the host as a 4-element float32 vector.
    dev['origin'] = np.zeros(4, dtype=np.float32)
    dev['origin'][:3] = host['origin'].astype(np.float32)
    dev['voxelspacing'] = np.float32(self.voxelspacing)

    # kernels
    dev['k'] = Kernels(ctx)
    dev['saxs_k'] = saxs_Kernels(ctx)
    kernels = dev['k']
    kernels.rfftn = pyclfft.RFFTn(ctx, host['shape'])
    kernels.irfftn = pyclfft.iRFFTn(ctx, host['shape'])

    # Transform the two uploaded arrays right away.
    kernels.rfftn(queue, dev['rcore'], dev['ft_rcore'])
    kernels.rfftn(queue, dev['rsurf'], dev['ft_rsurf'])

    for name in ('nrot', 'max_clash', 'min_interaction'):
        dev[name] = host[name]
def zeros_like(cls, arr):
    """Return ``cl_array.zeros_like(queue, arr)``.

    ``cls`` is not used in the body.

    NOTE(review): ``queue`` is neither a parameter nor a local, so it has
    to resolve from an enclosing or module scope -- confirm it is defined
    there. Also confirm that ``cl_array.zeros_like`` accepts a leading
    queue argument; if ``cl_array`` is ``pyopencl.array``, its
    ``zeros_like`` takes only the array -- TODO verify.
    """
    return cl_array.zeros_like(queue, arr)
def zeros_like(a, dtype=None, order='K', subok=True):
    """Return a zero-filled ``myclArray`` built from ``clarray.zeros_like(a)``.

    The array returned by ``clarray.zeros_like`` is re-typed in place to
    ``myclArray`` and its ``reinit()`` hook is called before it is
    returned. ``dtype``, ``order`` and ``subok`` are accepted but not used
    by the body.
    """
    zeroed = clarray.zeros_like(a)
    # Swap the class on the existing object instead of copying it.
    zeroed.__class__ = myclArray
    zeroed.reinit()
    return zeroed
def computeAcc(self, xd, yd, zd, vxd, vyd, vzd, qd, md, axd, ayd, azd, t, dt):
    """Compute photon-scattering recoil kicks for one time step.

    The work is done by OpenCL kernels from ``self.program``:

    1. A per-particle mean number of scattered photons is written to a
       freshly zeroed device array. The homogeneous-beam kernel is used
       when ``self.sigma`` is ``None``; otherwise the gaussian-beam kernel
       is used and additionally receives ``self.x0`` and ``self.sigma``.
    2. ``self.findSample`` turns those means into per-particle counts,
       bounded by ``nMax``, and a ``(n_particles, nMax, 3)`` float32 device
       array is filled with normal random numbers.
    3. ``computeKicks`` receives the masses, the counts, the random
       directions and the acceleration arrays.

    Args:
        xd, yd, zd: Device arrays holding particle positions.
        vxd, vyd, vzd: Device arrays holding particle velocities.
        qd: Not used by this method.
        md: Device array passed to ``computeKicks`` (presumably masses --
            TODO confirm against the kernel source).
        axd, ayd, azd: Device arrays passed to ``computeKicks``; presumably
            updated in place with the recoil contribution -- TODO confirm.
        t: Not used by this method.
        dt: Time step, forwarded to the kernels as float32.

    Returns:
        None.
    """
    # Compute average numbers of scattered photons
    nbars = cl_array.zeros_like(xd)
    global_size = (xd.size, )
    phase_space = (xd.data, yd.data, zd.data, vxd.data, vyd.data, vzd.data)
    wave_vector = tuple(numpy.float32(self.k0[i]) for i in range(3))
    # Trailing arguments shared by both beam-profile kernels.
    beam_tail = (
        numpy.float32(self.gamma),
        numpy.float32(self.delta0),
        numpy.float32(self.S),
        numpy.float32(dt),
        numpy.int32(xd.size),
        nbars.data,
    )

    # Identity test: ``== None`` is unidiomatic and misbehaves for objects
    # that overload ``==`` (e.g. numpy arrays).
    if self.sigma is None:
        kernel_args = phase_space + wave_vector + beam_tail
        self.program.compute_mean_scattered_photons_homogeneous_beam(
            self.queue, global_size, None, *kernel_args)
    else:
        beam_center = tuple(numpy.float32(self.x0[i]) for i in range(3))
        kernel_args = (phase_space + wave_vector + beam_center
                       + (numpy.float32(self.sigma), ) + beam_tail)
        self.program.compute_mean_scattered_photons_gaussian_beam(
            self.queue, global_size, None, *kernel_args)

    # Compute scattered photons and associated recoil kicks.
    # nMax sizes the per-particle slot count of the random-direction array.
    nMax = int(math.ceil(10.0 * self.S * (self.gamma / 2.0 / numpy.pi) * dt))
    actualNs = self.findSample(nbars, nMax)

    recoilDirectionsD = cl_array.Array(self.queue, [nbars.size, nMax, 3],
                                       dtype=numpy.float32)
    self.generator.fill_normal(recoilDirectionsD)

    # apply recoil kicks to particles
    recoilMomentum = numpy.linalg.norm(self.k0) * self._PlanckConstantReduced
    self.program.computeKicks(
        self.queue, global_size, None,
        md.data,
        actualNs.data,
        numpy.int32(nMax),
        recoilDirectionsD.data,
        wave_vector[0], wave_vector[1], wave_vector[2],
        numpy.float32(recoilMomentum),
        numpy.float32(dt),
        axd.data, ayd.data, azd.data,
        numpy.int32(xd.shape[0]))
h, w = datal.shape[:2] #datal += np.random.rand(datal.size).reshape(datal.shape)*(1.0-datal)*0.5 #idxdark = datal<0.5 #idxlight = np.min(datal, axis=2)>0.5 #rnd = np.random.rand(datal[idxlight].size//3)*0.25 #datal[idxlight] *= 0.75 #datal[idxlight] += np.array(3*[rnd]).T#.reshape(-1, datal.shape[-1]) #datal[idxdark] += np.random.rand(datal[idxdark].size)*0.25 datalcl = arr_from_np(queue, datal.astype(np.float32)) res = clarray.zeros_like(datalcl) gminiscl = clarray.zeros(dtype=np.uint32, shape=datalcl.shape[:2], queue=queue) ksource = tpl.render(rads=rads, w=w, allc=allc, allct=allct, n=nn, dtype='float', crds=allcircle, numc=3) print(ksource) #exit() program = cl.Program(ctx, ksource).build() program.filter(queue, (h-2*rr, w-2*rr,), None, datalcl.ravel().data, res.data, gminiscl.data) resint = np.round(res.get()*255).astype(np.uint8) import tkinter as tk from PIL import ImageDraw, Image, ImageTk import sys
def zeros_like(self, a):
    """Return ``cl_array.zeros_like(a)`` and record it on this object.

    The new array is appended to ``self._cl_arrays`` before it is
    returned.
    """
    zeroed = cl_array.zeros_like(a)
    self._cl_arrays.append(zeroed)
    return zeroed