def dot(a, b, backend=None):
    """Return the dot product of two wrapped arrays as a host-side value.

    Parameters
    ----------
    a, b
        Array wrappers exposing the underlying array as ``.dev``.  ``a``
        must also expose ``.backend`` when ``backend`` is not given.
    backend : str, optional
        One of ``'cython'``, ``'opencl'`` or ``'cuda'``.  Defaults to
        ``a.backend``.

    Returns
    -------
    The result of ``np.dot`` for the ``'cython'`` backend; otherwise the
    result of the GPU library's ``dot`` fetched with ``.get()``.

    Raises
    ------
    ValueError
        If ``backend`` is not one of the supported names.  Previously an
        unknown backend fell through and silently returned ``None``.
    """
    if backend is None:
        backend = a.backend

    if backend == 'cython':
        return np.dot(a.dev, b.dev)

    # The GPU modules are imported lazily so that each stack is only
    # required when its backend is actually selected.
    if backend == 'opencl':
        import pyopencl.array as gpuarray
    elif backend == 'cuda':
        import pycuda.gpuarray as gpuarray
    else:
        raise ValueError(
            "unsupported backend %r; expected 'cython', 'opencl' or 'cuda'"
            % (backend,)
        )
    return gpuarray.dot(a.dev, b.dev).get()
def main(dtype, s):
    """Time one device-side ``array.dot`` on two random ``s`` x ``s`` arrays.

    Two host arrays of shape ``(s, s)`` are filled with random values, cast
    to ``dtype`` and uploaded with ``array.to_device``; only the dot product
    itself is timed.

    NOTE(review): ``array`` and ``queue`` come from the enclosing module;
    this assumes they are ``pyopencl.array`` and a pyopencl ``CommandQueue``
    -- confirm.

    Parameters
    ----------
    dtype
        Element type the random inputs are cast to.
    s : int
        Edge length of the square input arrays.

    Returns
    -------
    datetime.timedelta
        Wall-clock time between enqueuing the dot product and the queue
        having finished executing it.
    """
    shape = (s, s)
    arr_np_a = np.random.random(shape).astype(dtype=dtype)
    arr_np_b = np.random.random(shape).astype(dtype=dtype)
    arr_g_a = array.to_device(queue, arr_np_a)
    arr_g_b = array.to_device(queue, arr_np_b)

    start = datetime.now()
    array.dot(arr_g_a, arr_g_b)
    # The reduction is only *enqueued* by the call above.  Without draining
    # the queue the interval would cover the launch overhead, not the
    # computation being benchmarked.
    queue.finish()
    return datetime.now() - start
def test_dot(ctx_factory):
    """Check ``cl_array.dot`` and ``cl_array.vdot`` against NumPy.

    Every ordered pair of supported dtypes is exercised on 200000-element
    random vectors; each device result must agree with the host result to
    a relative error below 1e-4.  The test is skipped without ``mako``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = [np.float32, np.complex64]
    if has_double_support(context.devices[0]):
        dtypes.extend([np.float64, np.complex128])

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)

            a_gpu = general_clrand(queue, (200000, ), a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, (200000, ), b_dtype)
            b = b_gpu.get()

            # Plain product first, then the conjugating one; both modules
            # expose the operation under the same name.
            for op_name in ("dot", "vdot"):
                expected = getattr(np, op_name)(a, b)
                actual = getattr(cl_array, op_name)(a_gpu, b_gpu).get()
                assert abs(actual - expected) / abs(expected) < 1e-4
def test_dot(ctx_factory):
    """Validate device ``dot``/``vdot`` against NumPy for all dtype pairs.

    Requires ``mako``.  Double-precision dtypes are only included when the
    first device of the context reports double support.
    """
    from pytest import importorskip
    importorskip("mako")

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    candidate_dtypes = [np.float32, np.complex64]
    if has_double_support(ctx.devices[0]):
        candidate_dtypes.extend([np.float64, np.complex128])

    def relative_error(measured, reference):
        # Deviation of the device value from the host value, scaled by the
        # magnitude of the host value.
        return abs(measured - reference) / abs(reference)

    vector_shape = (200000,)
    dtype_pairs = [
        (lhs, rhs) for lhs in candidate_dtypes for rhs in candidate_dtypes
    ]

    for lhs_dtype, rhs_dtype in dtype_pairs:
        print(lhs_dtype, rhs_dtype)

        lhs_dev = general_clrand(queue, vector_shape, lhs_dtype)
        lhs_host = lhs_dev.get()
        rhs_dev = general_clrand(queue, vector_shape, rhs_dtype)
        rhs_host = rhs_dev.get()

        host_dot = np.dot(lhs_host, rhs_host)
        device_dot = cl_array.dot(lhs_dev, rhs_dev).get()
        assert relative_error(device_dot, host_dot) < 1e-4

        host_vdot = np.vdot(lhs_host, rhs_host)
        device_vdot = cl_array.vdot(lhs_dev, rhs_dev).get()
        assert relative_error(device_vdot, host_vdot) < 1e-4
def vdot(m1: Tensor, m2: Tensor) -> Tensor:
    """Return the conjugating dot product (``vdot``) of two tensors.

    If either tensor is flagged as living on the GPU the reduction is done
    with ``clarray.vdot`` and the result is wrapped as a GPU tensor;
    otherwise ``np.vdot`` is used on the host data.

    The GPU branch previously called ``clarray.dot``, which does not
    conjugate its first argument and therefore disagreed with the
    ``np.vdot`` host branch for complex data.  For real data both calls
    give the same value.

    NOTE(review): when only one operand has ``gpu`` set, both ``.data``
    attributes are still handed to the OpenCL reduction -- confirm that
    callers never mix host and device tensors.
    """
    if m1.gpu or m2.gpu:
        # Conjugating reduction, so both branches compute the same quantity.
        return Tensor(clarray.vdot(m1.data, m2.data), gpu=True)
    return Tensor(np.vdot(m1.data, m2.data))
def error_value(self, predicted, expected):
    """Return half the squared difference between prediction and target.

    Both arguments are passed through ``self.convert_to_arrays``; with
    ``d = predicted - expected`` the value computed is ``dot(d, d) / 2``,
    which is then fetched with ``.get()`` and reduced with ``.max()``.
    """
    predicted_arr = self.convert_to_arrays(predicted)
    expected_arr = self.convert_to_arrays(expected)

    residual = predicted_arr - expected_arr
    half_squared = pycl_array.dot(residual, residual) / 2

    host_value = half_squared.get()
    return host_value.max()
def test_dot(ctx_getter):
    """Compare ``cl_array.dot`` with ``numpy.dot`` on random vectors.

    Two 200000-element random device vectors are generated and copied to
    the host; the device result must match the host result to a relative
    error below 1e-4.
    """
    # NOTE(review): ``context`` and ``queue`` are not created in this
    # function and ``ctx_getter`` is unused -- presumably they are
    # module-level names; confirm.
    from pyopencl.clrandom import rand as clrand

    vector_shape = (200000,)

    a_gpu = clrand(context, queue, vector_shape)
    a_host = a_gpu.get()
    b_gpu = clrand(context, queue, vector_shape)
    b_host = b_gpu.get()

    expected = numpy.dot(a_host, b_host)
    actual = cl_array.dot(a_gpu, b_gpu).get()

    relative_error = abs(actual - expected) / abs(expected)
    assert relative_error < 1e-4
def test_mem_pool_with_arrays(ctx_factory):
    """Arrays built with a memory pool, and results derived from them,
    must report that same pool as their allocator."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    pool = cl_tools.MemoryPool(cl_tools.CLAllocator(context))

    a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=pool)

    host_values = np.arange(2000)
    # Adding a scalar produces a new array; it is that derived array whose
    # allocator is checked below.
    b_dev = cl_array.to_device(queue, host_values, allocator=pool) + 4000

    result = cl_array.dot(a_dev, b_dev)

    for device_array in (a_dev, b_dev, result):
        assert device_array.allocator is pool
def test_dot(ctx_getter):
    """Device dot product of two random vectors must match ``numpy.dot``
    to a relative error below 1e-4."""
    # NOTE(review): ``ctx_getter`` is never called and ``context`` /
    # ``queue`` are not defined locally -- presumably module globals;
    # verify against the rest of the file.
    from pyopencl.clrandom import rand as clrand

    length = 200000

    a_gpu = clrand(context, queue, (length, ))
    a = a_gpu.get()
    b_gpu = clrand(context, queue, (length, ))
    b = b_gpu.get()

    host_result = numpy.dot(a, b)
    device_result = cl_array.dot(a_gpu, b_gpu).get()

    deviation = abs(device_result - host_result)
    assert deviation / abs(host_result) < 1e-4
def test_outoforderqueue_reductions(ctx_factory):
    """Run sum, dot and all() reductions on an out-of-order queue.

    The test is skipped when such a queue cannot be created.  Device
    results are compared for exact equality with their NumPy counterparts.
    """
    context = ctx_factory()
    try:
        out_of_order = (
            cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
        queue = cl.CommandQueue(context, properties=out_of_order)
    except Exception:
        pytest.skip("out-of-order queue not available")

    # Only 0/1 entries, so the sums are exact and no rounding error builds up.
    host = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32'))
    # A single large entry far into the array: "all < 5" holds for every
    # element before it.
    host[800000] = 10

    dev = cl_array.to_device(queue, host)

    total = cl_array.sum(dev).get()
    weighted = cl_array.dot(dev, 3 - dev).get()
    all_below_five = (dev < 5).all().get()

    assert (total == host.sum()
            and weighted == host.dot(3 - host)
            and all_below_five == 0)
def rnorm2(self, X):
    """ Compute the squared norm of the residuals.

    Runs the "r" kernel, which fills self.R from self.A, self.B and X,
    and returns the dot product of self.R with itself.
    @param X Result of the last iteration (pyopencl.array object).
    @return Squared norm of the residuals.
    """
    size = np.uint32(self.n)
    global_size = (clUtils.globalSize(size),)

    # Refresh the residual buffer for the candidate solution X.
    residual_args = (self.A, self.B.data, X.data, self.R.data, size)
    self.program.r(self.queue, global_size, None, *residual_args)

    squared_norm = cl_array.dot(self.R, self.R)
    return squared_norm.get()
def test_dot(ctx_getter):
    """Single-precision ``cl_array.dot`` must match ``np.dot`` to a
    relative error below 1e-4 on 200000-element random vectors."""
    from pyopencl.clrandom import rand as clrand

    context = ctx_getter()
    queue = cl.CommandQueue(context)

    vector_shape = (200000,)
    element_type = np.float32

    a_gpu = clrand(context, queue, vector_shape, element_type)
    a_host = a_gpu.get()
    b_gpu = clrand(context, queue, vector_shape, element_type)
    b_host = b_gpu.get()

    reference = np.dot(a_host, b_host)
    measured = cl_array.dot(a_gpu, b_gpu).get()

    relative_error = abs(measured - reference) / abs(reference)
    assert relative_error < 1e-4
def test_dot(ctx_factory):
    """Check ``cl_array.dot`` and ``cl_array.vdot`` against NumPy.

    All ordered pairs of the supported dtypes are tested on 200000-element
    random vectors with a relative tolerance of 1e-4.  Where ``np.vdot``
    raises ``NotImplementedError`` under PyPy, the vdot half of that pair
    is skipped.  Requires ``mako``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)
    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        # complex128 is left out on devices reporting the "apple"
        # struct-argument-count bug.
        double_dtypes = [np.float64]
        if has_struct_arg_count_bug(dev) != "apple":
            double_dtypes.append(np.complex128)
        dtypes.extend(double_dtypes)

    def relative_error(measured, reference):
        return abs(measured - reference) / abs(reference)

    dtype_pairs = [(first, second) for first in dtypes for second in dtypes]
    for a_dtype, b_dtype in dtype_pairs:
        print(a_dtype, b_dtype)

        a_gpu = general_clrand(queue, (200000,), a_dtype)
        a = a_gpu.get()
        b_gpu = general_clrand(queue, (200000,), b_dtype)
        b = b_gpu.get()

        dot_ab = np.dot(a, b)
        dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()
        assert relative_error(dot_ab_gpu, dot_ab) < 1e-4

        try:
            vdot_ab = np.vdot(a, b)
        except NotImplementedError:
            import sys
            if "__pypy__" not in sys.builtin_module_names:
                raise
            print("PYPY: VDOT UNIMPLEMENTED")
            continue

        vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get()
        rel_err = relative_error(vdot_ab_gpu, vdot_ab)
        assert rel_err < 1e-4, rel_err
def test_dot(ctx_factory):
    """Validate device ``dot`` and ``vdot`` for every dtype combination.

    Skipped without ``mako``.  Double precision is only tested when the
    device supports it, and complex128 additionally only when the device
    does not report the "apple" struct-argument-count bug.  A missing
    ``np.vdot`` is tolerated under PyPy only.
    """
    from pytest import importorskip
    importorskip("mako")

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    device = ctx.devices[0]

    tested_dtypes = [np.float32, np.complex64]
    if has_double_support(device):
        apple_bug = has_struct_arg_count_bug(device) == "apple"
        tested_dtypes.extend(
            [np.float64] if apple_bug else [np.float64, np.complex128])

    tolerance = 1e-4
    n_elements = 200000

    for a_dtype in tested_dtypes:
        for b_dtype in tested_dtypes:
            print(a_dtype, b_dtype)

            a_dev = general_clrand(queue, (n_elements,), a_dtype)
            a_host = a_dev.get()
            b_dev = general_clrand(queue, (n_elements,), b_dtype)
            b_host = b_dev.get()

            reference = np.dot(a_host, b_host)
            measured = cl_array.dot(a_dev, b_dev).get()
            assert abs(measured - reference) / abs(reference) < tolerance

            try:
                reference = np.vdot(a_host, b_host)
            except NotImplementedError:
                import sys
                running_on_pypy = '__pypy__' in sys.builtin_module_names
                if not running_on_pypy:
                    raise
                print("PYPY: VDOT UNIMPLEMENTED")
                continue

            measured = cl_array.vdot(a_dev, b_dev).get()
            rel_err = abs(measured - reference) / abs(reference)
            assert rel_err < tolerance, rel_err
def solve(self, A, b, x0=None, tol=10e-5, iters=300):
    r""" Solve a linear system of equations iteratively.

    The scalar recurrences used below (alpha, beta, rhobar, phibar and the
    rotation returned by symOrtho) follow the LSQR scheme.
    NOTE(review): this docstring previously named the Jacobi method and an
    infinity norm, neither of which matches the code -- confirm the
    intended description.
    @param A Linear system matrix.
    @param b Linear system independent term.
    @param x0 Initial approximation of the solution.
    @param tol Relative error tolerance, compared against
    sqrt(r . r) / sqrt(b . b), where r is the vector written by the "r"
    kernel (presumably the residual b - A x).
    @param iters Maximum number of iterations (at least 1).
    @return Tuple (x, relative residual, number of iterations), x being a
    float32 host array of length len(b).
    @throws ValueError if iters is less than 1.
    """
    # With no iteration at all, neither the residual nor the iteration
    # counter returned below would ever be assigned.
    if iters < 1:
        raise ValueError("iters must be at least 1")
    # Create/set OpenCL buffers
    self.setBuffers(A, b, x0)
    # Get dimensions for OpenCL execution
    n = np.uint32(len(b))
    gSize = (clUtils.globalSize(n), )
    # Norm of the independent term, used to make the residual relative
    bnorm = np.sqrt(cl_array.dot(self.b, self.b).get())
    # Initialize the problem: scale u by 1/beta, compute v with the
    # dot_matT_vec kernel and scale it by 1/alpha
    beta = bnorm
    self.dot_c_vec(1.0 / beta, self.u)
    kernelargs = (self.A, self.u.data, self.v.data, n)
    self.program.dot_matT_vec(self.queue, gSize, None, *(kernelargs))
    alpha = np.sqrt(cl_array.dot(self.v, self.v).get())
    self.dot_c_vec(1.0 / alpha, self.v)
    self.copy_vec(self.w, self.v)
    phibar = beta
    rhobar = alpha
    # Iterate until the result converges or the maximum number
    # of iterations is reached.
    for i in range(0, iters):
        # Compute residues
        kernelargs = (self.A, self.b.data, self.x.data, self.r.data, n)
        self.program.r(self.queue, gSize, None, *(kernelargs))
        rnorm = np.sqrt(cl_array.dot(self.r, self.r).get())
        # Test if the final result has been reached
        if rnorm / bnorm <= tol:
            break
        # Compute next alpha, beta, u, v
        kernelargs = (self.A, self.u.data, self.v.data, self.u.data,
                      alpha, n)
        self.program.u(self.queue, gSize, None, *(kernelargs))
        beta = np.sqrt(cl_array.dot(self.u, self.u).get())
        if not beta:
            # A zero norm cannot be normalised; stop here.
            break
        self.dot_c_vec(1.0 / beta, self.u)
        kernelargs = (self.A, self.u.data, self.v.data, self.v.data,
                      beta, n)
        self.program.v(self.queue, gSize, None, *(kernelargs))
        alpha = np.sqrt(cl_array.dot(self.v, self.v).get())
        if not alpha:
            break
        self.dot_c_vec(1.0 / alpha, self.v)
        # Apply the orthogonal transformation
        c, s, rho = self.symOrtho(rhobar, beta)
        theta = s * alpha
        rhobar = -c * alpha
        phi = c * phibar
        phibar = s * phibar
        # Update x and w
        self.linear_comb(self.x, 1.0, self.x, phi / rho, self.w)
        self.linear_comb(self.w, 1.0, self.v, -theta / rho, self.w)
    # Return result computed.  enqueue_copy supersedes the deprecated
    # enqueue_read_buffer; note the (destination, source) argument order.
    x = np.zeros((n), dtype=np.float32)
    cl.enqueue_copy(self.queue, x, self.x.data).wait()
    return (x, rnorm / bnorm, i + 1)
def solve(self, A, B, x0=None, tol=10e-6, iters=300, w=1.0):
    r""" Solve linear system of equations by a Jacobi iterative method.

    Each loop pass applies the "jacobi" kernel from X0 into X, checks
    whether the residual grew, and if not applies the kernel again from X
    back into X0.  Progress is reported on the FreeCAD console.
    @param A Linear system matrix.
    @param B Linear system independent term.
    @param x0 Initial approximation of the solution.
    @param tol Relative error tolerance, compared against
    sqrt(rnorm2 / bnorm2), where rnorm2 comes from self.rnorm2 and bnorm2
    is the dot product of B with itself.
    @param iters Maximum number of iterations (at least 1).
    @param w Relaxation factor (reduced automatically when the residual
    grows after a step)
    @return Tuple (copy of the host solution, relative residual, index of
    the last loop pass).  The residual is the one measured at the start of
    the last pass.
    @throws ValueError if iters is less than 1.
    """
    # Without a single pass the loop index returned below is never bound.
    if iters < 1:
        raise ValueError("iters must be at least 1")
    # Create/set OpenCL buffers
    w = np.float32(w)
    self.setBuffers(A, B, x0)
    # Get dimensions for OpenCL execution
    n = np.uint32(len(B))
    gSize = (clUtils.globalSize(n),)
    # Squared norm of the independent term, used to make the residual
    # relative
    bnorm2 = cl_array.dot(self.B, self.B).get()
    FreeCAD.Console.PrintMessage(bnorm2)
    FreeCAD.Console.PrintMessage("\n")
    rnorm2 = 0.
    # Iterate until the result converges or the maximum number
    # of iterations is reached.
    for i in range(0, iters):
        rnorm2 = self.rnorm2(self.X0)
        FreeCAD.Console.PrintMessage("\t")
        FreeCAD.Console.PrintMessage(rnorm2)
        FreeCAD.Console.PrintMessage(" -> ")
        FreeCAD.Console.PrintMessage(rnorm2 / bnorm2)
        FreeCAD.Console.PrintMessage("\n")
        if np.sqrt(rnorm2 / bnorm2) <= tol:
            break
        # Iterate: X0 -> X
        kernelargs = (self.A, self.B.data, self.X0.data, self.X.data, w, n)
        self.program.jacobi(self.queue, gSize, None, *(kernelargs))
        # Test if the result is diverging
        temp_rnorm2 = self.rnorm2(self.X)
        if temp_rnorm2 > rnorm2:
            FreeCAD.Console.PrintMessage("\t\tDivergence found...\n\t\tw = ")
            # Shrink the relaxation factor by the ratio of the residuals
            w = w * rnorm2 / temp_rnorm2
            FreeCAD.Console.PrintMessage(w)
            FreeCAD.Console.PrintMessage("\n")
            # Discard the result: X0 still holds the last accepted iterate
            continue
        # Second step: X -> X0, so the accepted iterate ends up in X0
        kernelargs = (self.A, self.B.data, self.X.data, self.X0.data, w, n)
        self.program.jacobi(self.queue, gSize, None, *(kernelargs))
    # Return result computed.  enqueue_copy supersedes the deprecated
    # enqueue_read_buffer; note the (destination, source) argument order.
    cl.enqueue_copy(self.queue, self.x, self.X0.data).wait()
    return (np.copy(self.x), np.sqrt(rnorm2 / bnorm2), i)
def calc_gradient_gpu(self,
                      subfields_combined,
                      cl_I_cam_measured=None,
                      forward_only=False,
                      individual_farfields=False):
    """
    Calculate the objective (forward model) and, unless one of the
    forward-only modes is requested, its gradient (gradient
    backpropagation) with respect to subfields_combined.

    Parameters
    ----------
    subfields_combined
        Field data written into the combined propagator's ``cl_field1``
        before propagating to the far field.
    cl_I_cam_measured
        Measured camera intensity that is subtracted from the modelled
        intensity to form the residuum.
        NOTE(review): the default ``None`` is passed straight into that
        subtraction, so a value appears to be required whenever
        ``individual_farfields`` is false -- confirm.
    forward_only
        If true, return after the objective has been computed, without
        backpropagating.
    individual_farfields
        If true, do not add the reference field and return right after
        the detector model has been applied.

    Returns
    -------
    (far_field, I_cam * mask)
        if individual_farfields is true; both are fetched with ``.get()``
        and no transmission has been applied.
    (objective, cl_I_cam, cl_I_cam_no_transmission)
        if forward_only is true; ``cl_E.get()`` is additionally stored in
        ``self.far_field.field``.
    (objective, subfields_combined_gradient)
        otherwise.
    """
    # propagate to far-field
    self.propagator_object_to_farfield.propagator_combined.cl_field1.set(
        subfields_combined, self.cl_queue)
    self.propagator_object_to_farfield.propagator_combined.propagate_gpudata(
    )
    cl_far_field = self.propagator_object_to_farfield.propagator_combined.cl_field2
    # add reference field in new array
    if individual_farfields:
        cl_E = cl_far_field
    else:
        cl_E = cl_far_field + self.cl_far_field_ref
    # intensity |E|^2: take the magnitude, then square it in place
    cl_I = abs(cl_E)
    cl_I *= cl_I
    # apply detector model
    cl_I_cam = self.convolver_detector.convolve_gpu(cl_I)
    if individual_farfields:
        # returns intensity without transmission applied
        return cl_far_field.get(), (cl_I_cam * self.cl_mask_bfp).get()
    # apply condenser transmission (modifies cl_I_cam in place, judging by
    # the method name and the discarded return value)
    self.transmission_detection.apply_inline_gpu(cl_I_cam)
    cl_residuum = cl_I_cam - cl_I_cam_measured
    # objective: residuum dotted with itself, fetched to the host
    objective = cla.dot(cl_residuum, cl_residuum, queue=self.cl_queue).get()
    if forward_only:
        # return objective, I_cam with and without transmission applied
        self.far_field.field[:] = cl_E.get()  # store far_field
        cl_I_cam_no_transmission = self.convolver_detector.convolve_gpu(
            abs(cl_E)**2) * self.cl_mask_bfp
        return objective, cl_I_cam, cl_I_cam_no_transmission
    # backward gradient propagation: the steps below revisit the forward
    # stages in reverse order (transmission, detector, propagator)
    self.transmission_detection.apply_inline_gpu(cl_residuum)
    cl_I_cam_bar = self.convolver_detector.convolve_bar_gpu(cl_residuum)
    cl_Ebar = 2 * cl_E * cl_I_cam_bar
    self.propagator_object_to_farfield.propagator_combined.gradient_backprop_gpudata(
        cl_Ebar)
    # the backpropagated result is read back from the propagator's cl_field1
    subfields_combined_gradient = self.propagator_object_to_farfield.propagator_combined.cl_field1.get(
    )
    subfields_combined_gradient *= self.object_multiareafield.mask_subfields_combined
    return objective, subfields_combined_gradient
def dot(self, x, y):
    """Return the dot product of ``x`` and ``y`` as a host-side value.

    The reduction is enqueued on ``self.queue`` via ``pyopencl.array.dot``
    and its result is fetched with ``.get()``.
    """
    import pyopencl.array as device_arrays

    device_result = device_arrays.dot(x, y, queue=self.queue)
    return device_result.get()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Benchmark: dot product of two 100000000-element float32 vectors, once on
# an OpenCL device through pyopencl.array and once on the host with NumPy.
# Prints the device time, the host time and the difference of the results.
import time

import numpy as np
import pyopencl as cl
import pyopencl.array as cla

a_np = np.random.rand(100000000).astype(np.float32)
b_np = np.random.rand(100000000).astype(np.float32)

ctx = cl.create_some_context()
q = cl.CommandQueue(ctx)

# Upload both operands before the clock starts, so only the reduction and
# the read-back of its result are timed.
a_g = cla.to_device(q, a_np)
b_g = cla.to_device(q, b_np)

# perf_counter is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
r_g = cla.dot(a_g, b_g)
# Fetching the result blocks until the device has finished, so the
# interval covers the whole computation.
r_get = r_g.get()
print(time.perf_counter() - start_time)

start_time = time.perf_counter()
r_np = np.dot(a_np, b_np)
print(time.perf_counter() - start_time)

# Reuse the value fetched above rather than transferring it a second time.
print(r_get - r_np)
def solve(self, A, b, x0=None, tol=10e-5, iters=300):
    r""" Solve a linear system of equations iteratively.

    The scalar recurrences used below (alpha, beta, rhobar, phibar and the
    rotation returned by symOrtho) follow the LSQR scheme.
    NOTE(review): this docstring previously named the Jacobi method and an
    infinity norm, neither of which matches the code -- confirm the
    intended description.
    @param A Linear system matrix.
    @param b Linear system independent term.
    @param x0 Initial approximation of the solution.
    @param tol Relative error tolerance, compared against
    sqrt(r . r) / sqrt(b . b), where r is the vector written by the "r"
    kernel (presumably the residual b - A x).
    @param iters Maximum number of iterations (at least 1).
    @return Tuple (x, relative residual, number of iterations), x being a
    float32 host array of length len(b).
    @throws ValueError if iters is less than 1.
    """
    # The values returned at the end are only assigned inside the loop, so
    # at least one pass is required.
    if iters < 1:
        raise ValueError("iters must be at least 1")
    # Create/set OpenCL buffers
    self.setBuffers(A, b, x0)
    # Get dimensions for OpenCL execution
    n = np.uint32(len(b))
    gSize = (clUtils.globalSize(n),)
    # Norm of the independent term, used to make the residual relative
    bnorm = np.sqrt(cl_array.dot(self.b, self.b).get())
    # Initialize the problem: scale u by 1/beta, compute v with the
    # dot_matT_vec kernel and scale it by 1/alpha
    beta = bnorm
    self.dot_c_vec(1.0 / beta, self.u)
    kernelargs = (self.A, self.u.data, self.v.data, n)
    self.program.dot_matT_vec(self.queue, gSize, None, *(kernelargs))
    alpha = np.sqrt(cl_array.dot(self.v, self.v).get())
    self.dot_c_vec(1.0 / alpha, self.v)
    self.copy_vec(self.w, self.v)
    phibar = beta
    rhobar = alpha
    # Iterate until the result converges or the maximum number
    # of iterations is reached.
    for i in range(0, iters):
        # Compute residues
        kernelargs = (self.A, self.b.data, self.x.data, self.r.data, n)
        self.program.r(self.queue, gSize, None, *(kernelargs))
        rnorm = np.sqrt(cl_array.dot(self.r, self.r).get())
        # Test if the final result has been reached
        if rnorm / bnorm <= tol:
            break
        # Compute next alpha, beta, u, v
        kernelargs = (self.A, self.u.data, self.v.data, self.u.data,
                      alpha, n)
        self.program.u(self.queue, gSize, None, *(kernelargs))
        beta = np.sqrt(cl_array.dot(self.u, self.u).get())
        if not beta:
            # A zero norm cannot be normalised; stop here.
            break
        self.dot_c_vec(1.0 / beta, self.u)
        kernelargs = (self.A, self.u.data, self.v.data, self.v.data,
                      beta, n)
        self.program.v(self.queue, gSize, None, *(kernelargs))
        alpha = np.sqrt(cl_array.dot(self.v, self.v).get())
        if not alpha:
            break
        self.dot_c_vec(1.0 / alpha, self.v)
        # Apply the orthogonal transformation
        c, s, rho = self.symOrtho(rhobar, beta)
        theta = s * alpha
        rhobar = -c * alpha
        phi = c * phibar
        phibar = s * phibar
        # Update x and w
        self.linear_comb(self.x, 1.0, self.x, phi / rho, self.w)
        self.linear_comb(self.w, 1.0, self.v, -theta / rho, self.w)
    # Return result computed.  enqueue_copy supersedes the deprecated
    # enqueue_read_buffer; note the (destination, source) argument order.
    x = np.zeros((n), dtype=np.float32)
    cl.enqueue_copy(self.queue, x, self.x.data).wait()
    return (x, rnorm / bnorm, i + 1)
def norm(self):
    """Return the L2 norm of the flattened vector.

    Computed as the square root of the dot product of ``self.array`` with
    itself, after fetching that product with ``.get()``.
    """
    squared_norm = array.dot(self.array, self.array).get()
    return np.sqrt(squared_norm)