Example #1
def dot(a, b, backend=None):
    if backend is None:
        backend = a.backend
    if backend == 'cython':
        return np.dot(a.dev, b.dev)
    if backend == 'opencl':
        import pyopencl.array as gpuarray
        return gpuarray.dot(a.dev, b.dev).get()
    if backend == 'cuda':
        import pycuda.gpuarray as gpuarray
        return gpuarray.dot(a.dev, b.dev).get()
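A minimal usage sketch for the dispatcher above, assuming a hypothetical HostArray wrapper that carries the .backend and .dev attributes the function expects (only the NumPy-backed 'cython' path is exercised):

import numpy as np

class HostArray:
    # hypothetical wrapper: .dev holds the underlying buffer,
    # .backend names the execution backend
    def __init__(self, data, backend='cython'):
        self.dev = data
        self.backend = backend

a = HostArray(np.arange(3, dtype=np.float32))
b = HostArray(np.ones(3, dtype=np.float32))
print(dot(a, b))  # 3.0, computed via np.dot on the 'cython' path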
Example #2
def main(dtype, s):
    shape = (s, s)
    arr_np_a = np.random.random(shape).astype(dtype=dtype)
    arr_np_b = np.random.random(shape).astype(dtype=dtype)
    arr_g_a = array.to_device(queue, arr_np_a)
    arr_g_b = array.to_device(queue, arr_np_b)
    start = datetime.now()
    # .get() blocks until the enqueued reduction has finished, so the
    # measurement covers the computation rather than just the enqueue
    array.dot(arr_g_a, arr_g_b).get()
    diff = datetime.now() - start

    return diff
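The first call to array.dot also pays a one-time kernel build for the given dtype, so a fairer benchmark (a sketch, reusing the names from main above) would warm up before timing:

    array.dot(arr_g_a, arr_g_b).get()  # warm-up: builds and caches the reduction kernel
    start = datetime.now()
    array.dot(arr_g_a, arr_g_b).get()
    diff = datetime.now() - start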
Example #3
def test_dot(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = [np.float32, np.complex64]
    if has_double_support(context.devices[0]):
        dtypes.extend([np.float64, np.complex128])

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)
            a_gpu = general_clrand(queue, (200000, ), a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, (200000, ), b_dtype)
            b = b_gpu.get()

            dot_ab = np.dot(a, b)
            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()

            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4

            vdot_ab = np.vdot(a, b)
            vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get()

            assert abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab) < 1e-4
Example #4
    def vdot(m1: Tensor, m2: Tensor) -> Tensor:
        """Returns the vector dot product of two tensors, conjugating
        the first argument (np.vdot semantics)."""

        if m1.gpu or m2.gpu:
            # vdot, not dot: for complex data the first factor must be
            # conjugated to match the np.vdot behaviour of the CPU path
            return Tensor(clarray.vdot(m1.data, m2.data), gpu=True)

        return Tensor(np.vdot(m1.data, m2.data))
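The dot/vdot distinction only matters for complex dtypes: np.vdot conjugates its first argument while np.dot does not, as this small NumPy check shows:

import numpy as np

a = np.array([1 + 2j])
b = np.array([3 + 4j])
print(np.dot(a, b))   # (-5+10j): plain product, no conjugation
print(np.vdot(a, b))  # (11-2j): first factor conjugated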
Example #5
    def error_value(self, predicted, expected):
        """Returns the value of 1/2 * || expected - predicted ||^2."""
        predicted = self.convert_to_arrays(predicted)
        expected = self.convert_to_arrays(expected)

        out = predicted - expected
        out = pycl_array.dot(out, out) / 2  # dot(r, r) is the squared L2 norm
        return out.get().max()
Example #6
def test_dot(ctx_getter):
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(context, queue, (200000,))
    a = a_gpu.get()
    b_gpu = clrand(context, queue, (200000,))
    b = b_gpu.get()

    dot_ab = numpy.dot(a, b)

    dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()

    assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
Example #7
def test_mem_pool_with_arrays(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    # DeferredAllocator supersedes the long-deprecated CLAllocator
    mem_pool = cl_tools.MemoryPool(cl_tools.DeferredAllocator(context))

    a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool)
    b_dev = cl_array.to_device(queue, np.arange(2000), allocator=mem_pool) + 4000

    result = cl_array.dot(a_dev, b_dev)
    assert a_dev.allocator is mem_pool
    assert b_dev.allocator is mem_pool
    assert result.allocator is mem_pool
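The point of the pool is that released buffers are held for reuse rather than returned to the driver; a short continuation of the test above that inspects this, using MemoryPool's documented held_blocks/active_blocks counters and free_held():

    result.get()      # ensure the work is finished before releasing buffers
    del a_dev, b_dev  # freed buffers go back into the pool, not to OpenCL
    print(mem_pool.held_blocks, mem_pool.active_blocks)
    mem_pool.free_held()  # hand held blocks back to the underlying allocator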
Example #8
def test_outoforderqueue_reductions(ctx_factory):
    context = ctx_factory()
    try:
        queue = cl.CommandQueue(context,
               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    except Exception:
        pytest.skip("out-of-order queue not available")
    # 0/1 values to avoid accumulated rounding error
    a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32'))
    a[800000] = 10  # one large entry near the end: (a < 5).all() only turns false late
    a_gpu = cl_array.to_device(queue, a)
    b1 = cl_array.sum(a_gpu).get()
    b2 = cl_array.dot(a_gpu, 3 - a_gpu).get()
    b3 = (a_gpu < 5).all().get()
    assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0
Example #9
	def rnorm2(self, X):
		""" Compute the squared norm of the residual.
		@param X Result of the last iteration (pyopencl.array object).
		@return Squared norm of the residual.
		"""
		n = np.uint32(self.n)
		gSize = (clUtils.globalSize(n),)
		kernelargs = (self.A,
		              self.B.data,
		              X.data,
		              self.R.data,
		              n)
		# Compute the residual vector R = B - A*X on the device
		self.program.r(self.queue, gSize, None, *kernelargs)
		return cl_array.dot(self.R, self.R).get()
Example #10
def test_dot(ctx_getter):
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(context, queue, (200000,), np.float32)
    a = a_gpu.get()
    b_gpu = clrand(context, queue, (200000,), np.float32)
    b = b_gpu.get()

    dot_ab = np.dot(a, b)

    dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()

    assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
Example #11
def test_dot(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        if has_struct_arg_count_bug(dev) == "apple":
            dtypes.extend([np.float64])
        else:
            dtypes.extend([np.float64, np.complex128])

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)
            a_gpu = general_clrand(queue, (200000,), a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, (200000,), b_dtype)
            b = b_gpu.get()

            dot_ab = np.dot(a, b)
            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()

            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4

            try:
                vdot_ab = np.vdot(a, b)
            except NotImplementedError:
                import sys
                is_pypy = "__pypy__" in sys.builtin_module_names
                if is_pypy:
                    print("PYPY: VDOT UNIMPLEMENTED")
                    continue
                else:
                    raise

            vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get()

            rel_err = abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab)
            assert rel_err < 1e-4, rel_err
Example #12
    def solve(self, A, b, x0=None, tol=10e-5, iters=300):
        r""" Solve a linear system of equations by the LSQR
        iterative method (Paige & Saunders bidiagonalization).
        @param A Linear system matrix.
        @param b Linear system independent term.
        @param x0 Initial approximation of the solution.
        @param tol Relative error tolerance: \n
        \f$ \Vert b - A \, x \Vert_2 / \Vert b \Vert_2 \f$
        @param iters Maximum number of iterations.
        """
        # Create/set OpenCL buffers
        self.setBuffers(A, b, x0)
        # Get dimensions for OpenCL execution
        n = np.uint32(len(b))
        gSize = (clUtils.globalSize(n), )
        # Norm of b, used later in the relative-error stopping test
        bnorm = np.sqrt(cl_array.dot(self.b, self.b).get())
        # Initialize the problem
        beta = bnorm
        self.dot_c_vec(1.0 / beta, self.u)
        kernelargs = (self.A, self.u.data, self.v.data, n)
        self.program.dot_matT_vec(self.queue, gSize, None, *(kernelargs))
        alpha = np.sqrt(cl_array.dot(self.v, self.v).get())
        self.dot_c_vec(1.0 / alpha, self.v)
        self.copy_vec(self.w, self.v)
        phibar = beta
        rhobar = alpha
        # Iterate while the result converges or maximum number
        # of iterations is reached.
        for i in range(0, iters):
            # Compute residues
            kernelargs = (self.A, self.b.data, self.x.data, self.r.data, n)
            self.program.r(self.queue, gSize, None, *(kernelargs))
            rnorm = np.sqrt(cl_array.dot(self.r, self.r).get())
            # Test if the final result has been reached
            if rnorm / bnorm <= tol:
                break
            # Compute next alpha, beta, u, v
            kernelargs = (self.A, self.u.data, self.v.data, self.u.data, alpha,
                          n)
            self.program.u(self.queue, gSize, None, *(kernelargs))
            beta = np.sqrt(cl_array.dot(self.u, self.u).get())
            if not beta:
                break
            self.dot_c_vec(1.0 / beta, self.u)
            kernelargs = (self.A, self.u.data, self.v.data, self.v.data, beta,
                          n)
            self.program.v(self.queue, gSize, None, *(kernelargs))
            alpha = np.sqrt(cl_array.dot(self.v, self.v).get())
            if not alpha:
                break
            self.dot_c_vec(1.0 / alpha, self.v)
            # Apply the orthogonal transformation
            c, s, rho = self.symOrtho(rhobar, beta)
            theta = s * alpha
            rhobar = -c * alpha
            phi = c * phibar
            phibar = s * phibar
            # Update x and w
            self.linear_comb(self.x, 1.0, self.x, phi / rho, self.w)
            self.linear_comb(self.w, 1.0, self.v, -theta / rho, self.w)
        # Return result computed
        x = np.zeros(n, dtype=np.float32)
        # enqueue_copy supersedes the deprecated enqueue_read_buffer
        cl.enqueue_copy(self.queue, x, self.x.data).wait()
        return (x, rnorm / bnorm, i + 1)
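The loop above leans on a symOrtho helper that is not shown; a standard, numerically stable Givens-rotation version (following Paige & Saunders, essentially what SciPy's lsqr uses) would look like this sketch:

    def symOrtho(self, a, b):
        # stable Givens rotation: returns (c, s, r) with c*a + s*b = r
        if b == 0.0:
            return np.sign(a), 0.0, abs(a)
        if a == 0.0:
            return 0.0, np.sign(b), abs(b)
        if abs(b) > abs(a):
            tau = a / b
            s = np.sign(b) / np.sqrt(1.0 + tau * tau)
            return s * tau, s, b / s
        tau = b / a
        c = np.sign(a) / np.sqrt(1.0 + tau * tau)
        return c, c * tau, a / c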
Example #13
	def solve(self, A, B, x0=None, tol=10e-6, iters=300, w=1.0):
		r""" Solve a linear system of equations by a Jacobi
		iterative method.
		@param A Linear system matrix.
		@param B Linear system independent term.
		@param x0 Initial approximation of the solution.
		@param tol Relative error tolerance: \n
		\f$ \Vert B - A \, x \Vert_2 / \Vert B \Vert_2 \f$
		@param iters Maximum number of iterations.
		@param w Relaxation factor (auto-updated if the
		method diverges).
		"""
		# Create/set OpenCL buffers
		w = np.float32(w)
		self.setBuffers(A,B,x0)
		# Get dimensions for OpenCL execution
		n      = np.uint32(len(B))
		gSize  = (clUtils.globalSize(n),)
		# Squared norm of B, used later in the relative-error stopping test
		bnorm2 = cl_array.dot(self.B, self.B).get()
		FreeCAD.Console.PrintMessage(bnorm2)
		FreeCAD.Console.PrintMessage("\n")
		rnorm2 = 0.
		# Iterate while the result converges or maximum number
		# of iterations is reached.
		for i in range(0,iters):
			rnorm2 = self.rnorm2(self.X0)
			FreeCAD.Console.PrintMessage("\t")
			FreeCAD.Console.PrintMessage(rnorm2)
			FreeCAD.Console.PrintMessage(" -> ")
			FreeCAD.Console.PrintMessage(rnorm2 / bnorm2)
			FreeCAD.Console.PrintMessage("\n")
			if np.sqrt(rnorm2 / bnorm2) <= tol:
				break
			# Iterate
			kernelargs = (self.A,
			              self.B.data,
			              self.X0.data,
			              self.X.data,
			              w,
			              n)
			self.program.jacobi(self.queue, gSize, None, *(kernelargs))
			# Test if the result is diverging
			temp_rnorm2 = self.rnorm2(self.X)
			if temp_rnorm2 > rnorm2:
				FreeCAD.Console.PrintMessage("\t\tDivergence found...\n\t\tw = ")
				w = w * rnorm2 / temp_rnorm2
				FreeCAD.Console.PrintMessage(w)
				FreeCAD.Console.PrintMessage("\n")
				# Discard the result
				continue
			kernelargs = (self.A,
			              self.B.data,
			              self.X.data,
			              self.X0.data,
			              w,
			              n)
			self.program.jacobi(self.queue, gSize, None, *(kernelargs))
		# Return result computed
		# enqueue_copy supersedes the deprecated enqueue_read_buffer
		cl.enqueue_copy(self.queue, self.x, self.X0.data).wait()
		return (np.copy(self.x), np.sqrt(rnorm2 / bnorm2), i)
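For reference, the update the jacobi kernel presumably implements is the relaxed Jacobi step x <- x + w * D^-1 * (B - A*x); a dense NumPy sketch of the same iteration (names hypothetical):

import numpy as np

def jacobi_reference(A, B, x0, w=1.0, tol=10e-6, iters=300):
    # relaxed Jacobi on a dense matrix; D is the diagonal of A
    D = np.diag(A)
    x = x0.copy()
    bnorm = np.linalg.norm(B)
    for i in range(iters):
        r = B - A @ x
        if np.linalg.norm(r) / bnorm <= tol:
            break
        x = x + w * r / D
    return x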
Example #14
    def calc_gradient_gpu(self,
                          subfields_combined,
                          cl_I_cam_measured=None,
                          forward_only=False,
                          individual_farfields=False):
        """
        calculate objective (forward model) and gradient (gradient backpropagation) with respect to subfields_combined


        Parameters
        ----------
        individual_farfields
        subfields_combined
        cl_I_cam_measured
        forward_only

        Returns
        -------
        (objective, gradient : np.array)
        (objective, I_cam : pyopencl.array.Array) if forward_only=True, and saves far_field
        (E, I) if individual_farfields==True

        """

        # propagate to far-field
        self.propagator_object_to_farfield.propagator_combined.cl_field1.set(
            subfields_combined, self.cl_queue)
        self.propagator_object_to_farfield.propagator_combined.propagate_gpudata(
        )

        cl_far_field = self.propagator_object_to_farfield.propagator_combined.cl_field2

        # add reference field in new array
        if individual_farfields:
            cl_E = cl_far_field

        else:
            cl_E = cl_far_field + self.cl_far_field_ref

        cl_I = abs(cl_E)
        cl_I *= cl_I

        # apply detector model
        cl_I_cam = self.convolver_detector.convolve_gpu(cl_I)

        if individual_farfields:  # returns intensity without transmission applied
            return cl_far_field.get(), (cl_I_cam * self.cl_mask_bfp).get()

        # apply condenser transmission
        self.transmission_detection.apply_inline_gpu(cl_I_cam)

        cl_residuum = cl_I_cam - cl_I_cam_measured
        objective = cla.dot(cl_residuum, cl_residuum,
                            queue=self.cl_queue).get()

        if forward_only:  # return objective, I_cam with and without transmission applied
            self.far_field.field[:] = cl_E.get()  # store far_field
            cl_I_cam_no_transmission = self.convolver_detector.convolve_gpu(
                abs(cl_E)**2) * self.cl_mask_bfp
            return objective, cl_I_cam, cl_I_cam_no_transmission

        # backward gradient propagation
        self.transmission_detection.apply_inline_gpu(cl_residuum)

        cl_I_cam_bar = self.convolver_detector.convolve_bar_gpu(cl_residuum)

        cl_Ebar = 2 * cl_E * cl_I_cam_bar
        self.propagator_object_to_farfield.propagator_combined.gradient_backprop_gpudata(
            cl_Ebar)

        subfields_combined_gradient = self.propagator_object_to_farfield.propagator_combined.cl_field1.get(
        )

        subfields_combined_gradient *= self.object_multiareafield.mask_subfields_combined

        return objective, subfields_combined_gradient
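The factor of two in cl_Ebar = 2 * cl_E * cl_I_cam_bar comes from differentiating the intensity I = |E|^2 with respect to the real and imaginary parts of E; a quick NumPy finite-difference check of that convention:

import numpy as np

E = np.array([1.0 + 2.0j])
f = lambda E: np.abs(E) ** 2  # intensity
eps = 1e-6
d_re = (f(E + eps) - f(E - eps)) / (2 * eps)
d_im = (f(E + 1j * eps) - f(E - 1j * eps)) / (2 * eps)
print(d_re + 1j * d_im)  # ~ (2+4j)
print(2 * E)             # (2+4j): matches the 2*E backprop factor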
Example #15
    def dot(self, x, y):
        # host-side scalar result of a device dot product
        from pyopencl.array import dot
        return dot(x, y, queue=self.queue).get()
Example #16
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
import pyopencl as cl
import pyopencl.array as cla
import time

a_np = np.random.rand(100000000).astype(np.float32)
b_np = np.random.rand(100000000).astype(np.float32)

ctx = cl.create_some_context()
q = cl.CommandQueue(ctx)

a_g = cla.to_device(q, a_np)
b_g = cla.to_device(q, b_np)

start_time = time.time()

r_g = cla.dot(a_g, b_g)
r_get = r_g.get()
print(time.time() - start_time)
start_time = time.time()
r_np = np.dot(a_np, b_np)
print(time.time() - start_time)

print(r_get - r_np)  # reuse the already-fetched result instead of a second .get()
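The device reduction accumulates in a different order than NumPy's pairwise summation, so the raw difference printed above is best read as a relative error; a hedged check:

rel_err = abs(r_get - r_np) / abs(r_np)
print(rel_err)  # float32 accumulation over 1e8 terms: expect roughly 1e-7 .. 1e-4
assert rel_err < 1e-3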
Example #17
    def norm(self):
        """The L2-norm on the flattened vector."""
        return np.sqrt(array.dot(self.array, self.array).get())
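This is the same quantity numpy.linalg.norm yields on a host copy of the flattened vector; a small self-contained check of the identity (assuming an OpenCL context can be created):

import numpy as np
import pyopencl as cl
import pyopencl.array as array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
v = array.to_device(queue, np.random.rand(1000).astype(np.float32))
# dot(v, v) is the squared L2 norm of the flattened vector
assert np.isclose(np.sqrt(array.dot(v, v).get()), np.linalg.norm(v.get()), rtol=1e-4)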