Example No. 1
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        lda = max(1, n)

        # cusolver operates on F ordered matrices, but A is expected
        # to be symmetric so it does not matter.
        # We copy A if needed
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it probably
        # is, since that is the default in Theano), we just switch the
        # fill-mode parameter of cusolver.
        l_parameter = 0 if self.lower else 1
        if L.flags['C_CONTIGUOUS']:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        with context:
            workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype='float32',
                                    context=context)

            dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            cusolver.cusolverDnSpotrf(context.cusolver_handle, l_parameter, n,
                                      L_ptr, lda, workspace_ptr,
                                      workspace_size, dev_info_ptr)

            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

        # cusolver leaves the elements in the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L
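
For reference, a minimal CPU sketch of the result this perform method is expected to produce (pure NumPy, not part of the op; the helper name is illustrative):

import numpy as np

def cholesky_reference(A, lower=True):
    # NumPy returns the lower-triangular factor with zeros above the
    # diagonal, which matches the GPU result after the tril/triu step.
    L = np.linalg.cholesky(np.asarray(A, dtype=np.float32))
    return L if lower else L.T
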
Example No. 2
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        lda = max(1, n)

        # cusolver operates on F ordered matrices, but A is expected
        # to be symmetric so it does not matter.
        # We copy A if needed
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it probably
        # is, since that is the default in Theano), we just switch the
        # fill-mode parameter of cusolver.
        l_parameter = 0 if self.lower else 1
        if L.flags['C_CONTIGUOUS']:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        with context:
            workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype='float32',
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            cusolver.cusolverDnSpotrf(
                context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr,
                workspace_size, dev_info_ptr)

            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

        # cusolver leaves the elements in the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L
Example No. 3
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        lda = max(1, n)

        # cusolver operates on F ordered matrices
        if not self.inplace:
            LU = pygpu.array(A, copy=True, order='F')
        else:
            LU = A.T if A.flags['C_CONTIGUOUS'] else A

        LU_ptr = LU.gpudata

        with context:
            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                context.cusolver_handle, n, n, LU_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype='float32',
                                    context=context)

            pivots = pygpu.zeros(n, dtype='int32', context=context)

            dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            cusolver.cusolverDnSgetrf(context.cusolver_handle, n, n, LU_ptr,
                                      lda, workspace_ptr, pivots_ptr,
                                      dev_info_ptr)

            if self.check_output:
                val_dev_info = np.asarray(dev_info)[0]
                if val_dev_info > 0:
                    raise LinAlgError('LU decomposition failed')

            outputs[1][0] = pivots

        outputs[0][0] = LU
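
For comparison, a rough CPU analogue of the getrf call above (SciPy, not part of the op; note that SciPy's pivot indices are 0-based while cuSOLVER's are 1-based):

import numpy as np
from scipy.linalg import lu_factor

def lu_reference(A):
    # Combined L and U factors stored in a single matrix, plus the pivot
    # index array, analogous to the LU and pivots outputs of the GPU op.
    lu, piv = lu_factor(np.asarray(A, dtype=np.float32))
    return lu, piv
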
Example No. 4
        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]

            # Since padding is not supported, assert s matches input shape.
            # assert (input_shape[1:-1] == s).all()
            assert (input_shape[-3:-1] == s).all()
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
                                   dtype='float32')

            input_pycuda = inputs[0][0]
            output_pycuda = z[0]

            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(s, np.complex64, np.complex64,
                                       batch=np.prod(input_shape[:-3]))

                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.fft(input_pycuda, output_pycuda, plan[0])

                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()
Example No. 5
        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]

            # Since padding is not supported, assert s matches input shape.
            # assert (input_shape[1:-1] == s).all()
            assert (input_shape[1:-1] == s[:-1]).all()

            # # construct output shape
            # output_shape = [input_shape[0]] + list(s)
            # # DFT of real input is symmetric, no need to store
            # # redundant coefficients
            # output_shape[-1] = output_shape[-1] // 2 + 1
            # # extra dimension with length 2 for real/imag
            # output_shape += [2]
            # output_shape = tuple(output_shape)

            # Output is the same shape as the input (m, ..., n, 2)
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(output_shape,
                                   context=inputs[0][0].context,
                                   dtype='float32')

            input_pycuda = inputs[0][0]
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out skcuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = z[0]

            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(s,
                                       np.complex64,
                                       np.complex64,
                                       batch=input_shape[0])

                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.fft(input_pycuda, output_pycuda, plan[0])

                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()
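
As the comments above note, complex values are laid out as float32 with a trailing length-2 axis holding the real and imaginary parts; a small NumPy sketch of that layout (illustrative only):

import numpy as np

x = np.zeros((4, 8, 2), dtype=np.float32)   # trailing axis = (real, imag)
x_complex = x.view(np.complex64)[..., 0]    # same memory viewed as complex64, shape (4, 8)
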
Example No. 6
def print_times_axpy():
    print('')
    print('AXPY')
    print('====')
    print('')
    for shape in shapes:
        print('shape = {}'.format(shape))
        x_gpu = pygpu.zeros(shape, dtype=dtype)
        y_gpu = pygpu.zeros(shape, dtype=dtype)
        # Run once so kernel is compiled
        odl.space.gpuary_tensors.axpy(a, x_gpu, y_gpu)
        tstart = time()
        for _ in range(n_runs):
            odl.space.gpuary_tensors.axpy(a, x_gpu, y_gpu)
        tstop = time()
        print('GPU time:            {:.5}'.format((tstop - tstart) / n_runs))

        x_cpu = np.zeros(shape, dtype=dtype)
        y_cpu = np.zeros_like(x_cpu)
        tstart = time()
        for _ in range(n_runs):
            y_cpu += a * x_cpu
        tstop = time()
        print('CPU time, no copy:   {:.5}'.format((tstop - tstart) / n_runs))

        tstart = time()
        for _ in range(n_runs):
            axpy(x_cpu, y_cpu, a=a)
        tstop = time()
        print('BLAS time:           {:.5}'.format((tstop - tstart) / n_runs))

        tstart = time()
        for _ in range(n_runs):
            x_gpu_to_cpu = a * np.asarray(x_gpu)
            y_gpu_to_cpu = np.asarray(y_gpu)
            y_gpu_to_cpu += x_gpu_to_cpu
            y_gpu[:] = y_gpu_to_cpu
        tstop = time()
        print('CPU time, with copy: {:.5}'.format((tstop - tstart) / n_runs))

        print('')
Example No. 7
        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]

            # Since padding is not supported, assert s matches input shape.
            # assert (input_shape[1:-1] == s).all()
            assert (input_shape[1:-1] == s[:-1]).all()

            # # construct output shape
            # output_shape = [input_shape[0]] + list(s)
            # # DFT of real input is symmetric, no need to store
            # # redundant coefficients
            # output_shape[-1] = output_shape[-1] // 2 + 1
            # # extra dimension with length 2 for real/imag
            # output_shape += [2]
            # output_shape = tuple(output_shape)

            # Output is the same shape as the input (m, ..., n, 2)
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
                                   dtype='float32')

            input_pycuda = inputs[0][0]
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out skcuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = z[0]

            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(s, np.complex64, np.complex64,
                                       batch=input_shape[0])

                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.fft(input_pycuda, output_pycuda, plan[0])

                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()
Example No. 8
def print_times_lico():
    print('')
    print('LICO')
    print('====')
    print('')
    for shape in shapes:
        print('shape = {}'.format(shape))
        x_gpu = pygpu.zeros(shape, dtype=dtype)
        y_gpu = pygpu.zeros(shape, dtype=dtype)
        out_gpu = x_gpu._empty_like_me()
        # Run once so kernel is compiled
        odl.space.gpuary_tensors.lico(a, x_gpu, b, y_gpu, out_gpu)
        tstart = time()
        for _ in range(n_runs):
            odl.space.gpuary_tensors.lico(a, x_gpu, b, y_gpu, out_gpu)
        tstop = time()
        print('GPU time:            {:.5}'.format((tstop - tstart) / n_runs))

        x_cpu = np.zeros(shape, dtype=dtype)
        y_cpu = np.zeros_like(x_cpu)
        out_cpu = np.empty_like(x_cpu)
        tstart = time()
        for _ in range(n_runs):
            np.multiply(a, x_cpu, out=out_cpu)
            out_cpu += b * y_cpu
        tstop = time()
        print('CPU time, no copy:   {:.5}'.format((tstop - tstart) / n_runs))

        out_gpu = x_gpu._empty_like_me()
        tstart = time()
        for _ in range(n_runs):
            x_gpu_to_cpu = np.asarray(x_gpu)
            out_cpu = b * np.asarray(y_gpu)
            out_cpu += a * x_gpu_to_cpu
            out_gpu[:] = out_cpu
        tstop = time()
        print('CPU time, with copy: {:.5}'.format((tstop - tstart) / n_runs))

        print('')
Example No. 9
        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]

            # Since padding is not supported, assert that last dimension corresponds to
            # input forward transform size.
            # assert (input_shape[1:-2] == s[:-1]).all()
            # assert ((input_shape[-2] - 1) * 2 + s[-1] % 2 == s[-1]).all()

            # construct output shape
            # chop off the extra length-2 dimension for real/imag
            # output_shape = [input_shape[0]] + list(s)
            # output_shape = tuple(output_shape)
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(output_shape,
                                   context=inputs[0][0].context,
                                   dtype='float32')

            input_pycuda = inputs[0][0]
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by skcuda as a complex64
            # array instead.
            output_pycuda = z[0]

            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(s,
                                       np.complex64,
                                       np.complex64,
                                       batch=output_shape[0])

                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.ifft(input_pycuda, output_pycuda, plan[0])
                # strangely enough, enabling rescaling here makes it run
                # very, very slowly, so do this rescaling manually
                # afterwards!

                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()
Example No. 10
        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]

            # Since padding is not supported, assert that last dimension corresponds to
            # input forward transform size.
            # assert (input_shape[1:-2] == s[:-1]).all()
            # assert ((input_shape[-2] - 1) * 2 + s[-1] % 2 == s[-1]).all()

            # construct output shape
            # chop off the extra length-2 dimension for real/imag
            # output_shape = [input_shape[0]] + list(s)
            # output_shape = tuple(output_shape)
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
                                   dtype='float32')

            input_pycuda = inputs[0][0]
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by skcuda as a complex64
            # array instead.
            output_pycuda = z[0]

            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(s, np.complex64, np.complex64,
                                       batch=output_shape[0])

                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.ifft(input_pycuda, output_pycuda, plan[0])
                # strangely enough, enabling rescaling here makes it run
                # very, very slowly, so do this rescaling manually
                # afterwards!

                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()
Example No. 11
 def test_GpuArray(self):
     with self.assertRaises(RuntimeError):
         pickle.dumps(pygpu.zeros((32, ), context=ctx))
     with self.assertRaises(RuntimeError):
         pickle.dumps(pygpu.zeros((32, ), context=ctx), protocol=0)
     with self.assertRaises(RuntimeError):
         pickle.dumps(pygpu.zeros((32, ), context=ctx), protocol=1)
     with self.assertRaises(RuntimeError):
         pickle.dumps(pygpu.zeros((32, ), context=ctx), protocol=2)
     if PY3:
         with self.assertRaises(RuntimeError):
             pickle.dumps(pygpu.zeros((32, ), context=ctx), protocol=3)
     with self.assertRaises(RuntimeError):
         pickle.dumps(pygpu.zeros((32, ), context=ctx), protocol=-1)
Example No. 12
 def test_GpuArray(self):
     with self.assertRaises(RuntimeError):
         pickle.dumps(pygpu.zeros((32,), context=ctx))
     with self.assertRaises(RuntimeError):
         pickle.dumps(pygpu.zeros((32,), context=ctx), protocol=0)
     with self.assertRaises(RuntimeError):
         pickle.dumps(pygpu.zeros((32,), context=ctx), protocol=1)
     with self.assertRaises(RuntimeError):
         pickle.dumps(pygpu.zeros((32,), context=ctx), protocol=2)
     if PY3:
         with self.assertRaises(RuntimeError):
             pickle.dumps(pygpu.zeros((32,), context=ctx), protocol=3)
     with self.assertRaises(RuntimeError):
         pickle.dumps(pygpu.zeros((32,), context=ctx), protocol=-1)
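
These tests only check that a GpuArray refuses to be pickled under every protocol; a minimal sketch of the usual workaround, copying to the host first (the device name is an assumption):

import pickle
import numpy as np
import pygpu

ctx = pygpu.init('cuda0')            # assumed device name
x = pygpu.zeros((32,), context=ctx)
host_copy = np.asarray(x)            # explicit device-to-host transfer
payload = pickle.dumps(host_copy)    # plain ndarrays pickle without issue
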
Example No. 13
def print_times_scal():
    print('')
    print('SCAL')
    print('====')
    print('')
    for shape in shapes:
        print('shape = {}'.format(shape))
        x_gpu = pygpu.zeros(shape, dtype=dtype)
        out_gpu = x_gpu._empty_like_me()
        # Run once so kernel is compiled
        odl.space.gpuary_tensors.scal(a, x_gpu, out_gpu)
        tstart = time()
        for _ in range(n_runs):
            odl.space.gpuary_tensors.scal(a, x_gpu, out_gpu)
        tstop = time()
        print('GPU time:            {:.5}'
              ''.format((tstop - tstart) / n_runs * 1e3))

        x_cpu = np.zeros(shape, dtype=dtype)
        tstart = time()
        for _ in range(n_runs):
            np.multiply(a, x_cpu, out=x_cpu)
        tstop = time()
        print('CPU time, no copy:   {:.5}'
              ''.format((tstop - tstart) / n_runs * 1e3))

        tstart = time()
        for _ in range(n_runs):
            scal(a, x_cpu)
        tstop = time()
        print('BLAS time:           {:.5}'
              ''.format((tstop - tstart) / n_runs * 1e3))

        tstart = time()
        for _ in range(n_runs):
            x_gpu_to_cpu = np.asarray(x_gpu)
            np.multiply(a, x_gpu_to_cpu, out=x_gpu_to_cpu)
            x_gpu[:] = x_gpu_to_cpu
        tstop = time()
        print('CPU time, with copy: {:.5}'
              ''.format((tstop - tstart) / n_runs * 1e3))

        print('')
Example No. 14
        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(output_shape,
                                   context=inputs[0][0].context,
                                   dtype='float32')

            input_pycuda = inputs[0][0]
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by skcuda as a complex64
            # array instead.
            output_pycuda = z[0]

            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(s,
                                       np.complex64,
                                       np.complex64,
                                       batch=np.prod(input_shape[:-3]))

                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.ifft(input_pycuda, output_pycuda, plan[0])
                # strangely enough, enabling rescaling here makes it run
                # very, very slowly, so do this rescaling manually
                # afterwards!

                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()
Example No. 15
        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]

            # Since padding is not supported, assert s matches input shape.
            # assert (input_shape[1:-1] == s).all()
            assert (input_shape[-3:-1] == s).all()
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(output_shape,
                                   context=inputs[0][0].context,
                                   dtype='float32')

            input_pycuda = inputs[0][0]
            output_pycuda = z[0]

            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(s,
                                       np.complex64,
                                       np.complex64,
                                       batch=np.prod(input_shape[:-3]))

                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.fft(input_pycuda, output_pycuda, plan[0])

                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()
Example No. 16
        def thunk():
            input_shape = inputs[0][0].shape
            s = inputs[1][0]
            output_shape = input_shape

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
                                   dtype='float32')

            input_pycuda = inputs[0][0]
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by skcuda as a complex64
            # array instead.
            output_pycuda = z[0]

            with input_pycuda.context:
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
                    plan[0] = fft.Plan(s, np.complex64, np.complex64,
                                       batch=np.prod(input_shape[:-3]))

                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.ifft(input_pycuda, output_pycuda, plan[0])
                # strangely enough, enabling rescaling here makes it run
                # very, very slowly, so do this rescaling manually
                # afterwards!

                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()
Example No. 17
def test_zero_noparam():
    try:
        pygpu.zeros()
        assert False
    except TypeError:
        pass
Example No. 18
def test_zeros_no_dtype():
    # no dtype and order param
    x = pygpu.zeros((), context=ctx)
    y = numpy.zeros(())
    check_meta(x, y)
Example No. 19
def zeros(shp, order, dtype):
    x = pygpu.zeros(shp, dtype, order, context=ctx)
    y = numpy.zeros(shp, dtype, order)
    check_all(x, y)
Example No. 20
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Size of the matrices to invert.
        z = outputs[0]

        # Matrix.
        A = inputs[0]

        # Solution vectors.
        b = inputs[1]

        assert len(A.shape) == 2
        assert len(b.shape) == 2

        if self.trans in ["T", "C"]:
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == "N":
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError("Invalid value for trans")
        if l != n:
            raise ValueError("A must be a square matrix")
        if n != k:
            raise ValueError("A and b must be aligned.")

        lda = max(1, n)
        ldb = max(1, k)

        # We copy A and b as cusolver operates inplace
        b = pygpu.array(b, copy=True, order="F")
        if not self.inplace:
            A = pygpu.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects a F ordered matrix, but A is not explicitly
        # converted between C and F order, instead we switch the
        # "transpose" flag.
        if A.flags["C_CONTIGUOUS"]:
            trans = 1 - trans

        if A.dtype == "float32":
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
            potrs = cusolverDnSpotrs
            getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
            getrf = cusolver.cusolverDnSgetrf
            getrs = cusolver.cusolverDnSgetrs
        elif A.dtype == "float64":
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
            potrs = cusolverDnDpotrs
            getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
            getrf = cusolver.cusolverDnDgetrf
            getrs = cusolver.cusolverDnDgetrs
        else:
            raise ValueError("Unsupported dtype")

        if self.A_structure == "symmetric":
            with context:
                workspace_size = potrf_bufferSize(context.cusolver_handle, 0,
                                                  n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype=A.dtype,
                                    context=context)

            dev_info = pygpu.zeros((1, ), dtype="int32", context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                potrf(
                    context.cusolver_handle,
                    0,
                    n,
                    A_ptr,
                    lda,
                    workspace_ptr,
                    workspace_size,
                    dev_info_ptr,
                )
                self.check_dev_info(dev_info)

                potrs(
                    context.cusolver_handle,
                    0,
                    n,
                    m,
                    A_ptr,
                    lda,
                    b_ptr,
                    ldb,
                    dev_info_ptr,
                )

        else:
            # general case for A
            with context:
                workspace_size = getrf_bufferSize(context.cusolver_handle, n,
                                                  n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype=A.dtype,
                                    context=context)

            pivots = pygpu.zeros(n, dtype="int32", context=context)

            dev_info = pygpu.zeros((1, ), dtype="int32", context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                getrf(
                    context.cusolver_handle,
                    n,
                    n,
                    A_ptr,
                    lda,
                    workspace_ptr,
                    pivots_ptr,
                    dev_info_ptr,
                )
                self.check_dev_info(dev_info)

                getrs(
                    context.cusolver_handle,
                    trans,
                    n,
                    m,
                    A_ptr,
                    lda,
                    pivots_ptr,
                    b_ptr,
                    ldb,
                    dev_info_ptr,
                )

        z[0] = b
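
A CPU reference for what this op computes in the general (non-symmetric) branch; the function name and signature here are illustrative, not part of the op:

import numpy as np

def solve_reference(A, b, trans='N'):
    # Solve A x = b, or A^T x = b when trans is 'T'/'C' (real dtypes only).
    A = np.asarray(A)
    return np.linalg.solve(A.T if trans in ('T', 'C') else A, np.asarray(b))
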
Example No. 21
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Size of the matrices to invert.
        z = outputs[0]

        # Matrix.
        A = inputs[0]

        # Solution vectors.
        b = inputs[1]

        assert len(A.shape) == 2
        assert len(b.shape) == 2

        if self.trans in ['T', 'C']:
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == 'N':
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError('Invalid value for trans')
        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # We copy A and b as cusolver operates inplace
        b = pygpu.array(b, copy=True, order='F')
        if not self.inplace:
            A = pygpu.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects a F ordered matrix, but A is not explicitly
        # converted between C and F order, instead we switch the
        # "transpose" flag.
        if A.flags['C_CONTIGUOUS']:
            trans = 1 - trans

        if self.A_structure == 'symmetric':
            with context:
                workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
                    context.cusolver_handle, 0, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype='float32',
                                    context=context)

            dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                cusolver.cusolverDnSpotrf(context.cusolver_handle, 0, n, A_ptr,
                                          lda, workspace_ptr, workspace_size,
                                          dev_info_ptr)
                self.check_dev_info(dev_info)

                cusolverDnSpotrs(context.cusolver_handle, 0, n, m, A_ptr, lda,
                                 b_ptr, ldb, dev_info_ptr)

        else:
            # general case for A
            with context:
                workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                    context.cusolver_handle, n, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype='float32',
                                    context=context)

            pivots = pygpu.zeros(n, dtype='int32', context=context)

            dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                cusolver.cusolverDnSgetrf(context.cusolver_handle, n, n, A_ptr,
                                          lda, workspace_ptr, pivots_ptr,
                                          dev_info_ptr)
                self.check_dev_info(dev_info)

                cusolver.cusolverDnSgetrs(context.cusolver_handle, trans, n, m,
                                          A_ptr, lda, pivots_ptr, b_ptr, ldb,
                                          dev_info_ptr)

        z[0] = b
Example No. 22
def test_zero_noparam():
    try:
        pygpu.zeros()
        assert False
    except TypeError:
        pass
Example No. 23
def test_zeros_no_dtype():
    # no dtype and order param
    x = pygpu.zeros((), context=ctx)
    y = numpy.zeros(())
    check_meta(x, y)
Example No. 24
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Size of the matrices to invert.
        z = outputs[0]

        # Matrix.
        A = inputs[0]

        # Solution vectors.
        b = inputs[1]

        assert len(A.shape) == 2
        assert len(b.shape) == 2

        if self.trans in ['T', 'C']:
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == 'N':
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError('Invalid value for trans')
        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # We copy A and b as cusolver operates inplace
        b = pygpu.array(b, copy=True, order='F')
        if not self.inplace:
            A = pygpu.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects a F ordered matrix, but A is not explicitly
        # converted between C and F order, instead we switch the
        # "transpose" flag.
        if A.flags['C_CONTIGUOUS']:
            trans = 1 - trans

        if A.dtype == 'float32':
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
            potrs = cusolverDnSpotrs
            getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
            getrf = cusolver.cusolverDnSgetrf
            getrs = cusolver.cusolverDnSgetrs
        elif A.dtype == 'float64':
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
            potrs = cusolverDnDpotrs
            getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
            getrf = cusolver.cusolverDnDgetrf
            getrs = cusolver.cusolverDnDgetrs
        else:
            raise ValueError("Unsupported dtype")

        if self.A_structure == 'symmetric':
            with context:
                workspace_size = potrf_bufferSize(
                    context.cusolver_handle, 0, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                potrf(
                    context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
                    workspace_size, dev_info_ptr)
                self.check_dev_info(dev_info)

                potrs(
                    context.cusolver_handle, 0, n, m, A_ptr, lda,
                    b_ptr, ldb, dev_info_ptr)

        else:
            # general case for A
            with context:
                workspace_size = getrf_bufferSize(
                    context.cusolver_handle, n, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            pivots = pygpu.zeros(n, dtype='int32', context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                getrf(
                    context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                    pivots_ptr, dev_info_ptr)
                self.check_dev_info(dev_info)

                getrs(
                    context.cusolver_handle, trans, n, m, A_ptr, lda,
                    pivots_ptr, b_ptr, ldb, dev_info_ptr)

        z[0] = b
Example No. 25
def zeros(shp, order, dtype):
    x = pygpu.zeros(shp, dtype, order, context=ctx)
    y = numpy.zeros(shp, dtype, order)
    check_all(x, y)
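
Finally, a minimal standalone sketch of the pygpu.zeros pattern shared by all of these examples (assumes libgpuarray/pygpu is installed and a device is available; the device name is an assumption):

import numpy as np
import pygpu

ctx = pygpu.init('cuda0')                               # or e.g. 'opencl0:0'
x = pygpu.zeros((4, 4), dtype='float32', context=ctx)   # allocated and zeroed on the device
y = np.zeros((4, 4), dtype='float32')

assert x.shape == y.shape
print(np.asarray(x))                                    # copies back to the host; all zeros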