Example #1
    def __call__(self, persid):
        from theano.gpuarray.type import get_context
        from theano.gpuarray import pygpu

        array_type, name = persid.split(".")

        if name in self.cache:
            return self.cache[name]
        ret = None
        if array_type == "gpuarray":
            with self.zip_file.open(name) as f:
                ctx_name = pickle.load(f)
                array = np.lib.format.read_array(f)
            if config.experimental.unpickle_gpu_on_cpu:
                # directly return numpy array
                warnings.warn(
                    "config.experimental.unpickle_gpu_on_cpu is set "
                    "to True. Unpickling GpuArray as numpy.ndarray"
                )
                ret = array
            elif pygpu:
                ret = pygpu.array(array, context=get_context(ctx_name))
            else:
                raise ImportError("pygpu not found. Cannot unpickle GpuArray")
        else:
            with self.zip_file.open(name) as f:
                ret = np.lib.format.read_array(f)
        self.cache[name] = ret
        return ret
Example #2
    def __call__(self, persid):
        from theano.gpuarray.type import get_context
        from theano.gpuarray import pygpu
        array_type, name = persid.split('.')

        if name in self.cache:
            return self.cache[name]
        ret = None
        if array_type == 'gpuarray':
            with self.zip_file.open(name) as f:
                ctx_name = pickle.load(f)
                array = np.lib.format.read_array(f)
            if config.experimental.unpickle_gpu_on_cpu:
                # directly return numpy array
                warnings.warn("config.experimental.unpickle_gpu_on_cpu is set "
                              "to True. Unpickling GpuArray as numpy.ndarray")
                ret = array
            elif pygpu:
                ret = pygpu.array(array, context=get_context(ctx_name))
            else:
                raise ImportError("pygpu not found. Cannot unpickle GpuArray")
        else:
            with self.zip_file.open(name) as f:
                ret = np.lib.format.read_array(f)
        self.cache[name] = ret
        return ret
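A note on how a loader like this is used: pickle calls the object assigned to Unpickler.persistent_load once for every persistent id it meets, which is what the __call__ above implements. A minimal sketch of the wiring, where the zip member name "pkl" and the loader_cls(zip_file) constructor are assumptions inferred from the snippet rather than the library's exact code:

import pickle
import zipfile

def load_with_persistent_arrays(path, loader_cls):
    # Hedged sketch: hook a persistent-load callable (like __call__ above)
    # into a standard Unpickler.
    with zipfile.ZipFile(path) as zip_file:
        with zip_file.open("pkl") as f:          # assumed member name
            unpickler = pickle.Unpickler(f)
            # pickle invokes this with each persistent id, e.g. "gpuarray.<name>"
            unpickler.persistent_load = loader_cls(zip_file)
            return unpickler.load()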
Example #3
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        lda = max(1, n)

        # cusolver operates on F ordered matrices, but A is expected
        # to be symmetric so it does not matter.
        # We copy A if needed
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it
        # probably is as it is the default in Theano) we just switch
        # the fill mode parameter of cusolver
        l_parameter = 0 if self.lower else 1
        if L.flags['C_CONTIGUOUS']:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        with context:
            workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype='float32',
                                    context=context)

            dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            cusolver.cusolverDnSpotrf(context.cusolver_handle, l_parameter, n,
                                      L_ptr, lda, workspace_ptr,
                                      workspace_size, dev_info_ptr)

            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

        # cusolver leaves the elements in the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L
Example #4
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        lda = max(1, n)

        # cusolver operates on F ordered matrices, but A is expected
        # to be symmetric so it does not matter.
        # We copy A if needed
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it
        # probably is as it is the default in Theano) we just switch
        # the fill mode parameter of cusolver
        l_parameter = 0 if self.lower else 1
        if L.flags['C_CONTIGUOUS']:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        with context:
            workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype='float32',
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            cusolver.cusolverDnSpotrf(
                context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr,
                workspace_size, dev_info_ptr)

            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

        # cusolver leaves the elements in the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L
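The fill-mode flip above (l_parameter = 1 - l_parameter when L is C-contiguous) works because cuSOLVER reads the buffer in column-major order, so a C-ordered matrix is seen as its transpose, and the lower triangle of the transpose is the upper triangle of the original. A small numpy-only sketch of that reinterpretation (illustration only, no GPU required):

import numpy as np

# The same C-ordered buffer, read column-major the way cuSOLVER reads it,
# is the transpose of the original matrix, hence the lower/upper flip
# instead of copying A into Fortran order.
A = np.arange(9, dtype="float32").reshape(3, 3)                   # C-ordered
as_seen_by_cusolver = A.ravel(order="C").reshape(3, 3, order="F")
assert np.allclose(as_seen_by_cusolver, A.T)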
Example #5
def transfer_not_contiguous(shp, dtype):
    a = numpy.random.rand(*shp) * 10
    a = a[::-1]
    b = pygpu.array(a, context=ctx)
    c = numpy.asarray(b)

    assert numpy.allclose(c, a)
    assert a.shape == b.shape == c.shape
    # the result array (c) is C contiguous
    assert a.strides == b.strides == (-c.strides[0], ) + c.strides[1:]
    assert a.dtype == b.dtype == c.dtype
    assert c.flags.c_contiguous
Example #6
def transfer_not_contiguous(shp, dtype):
    a = numpy.random.rand(*shp) * 10
    a = a[::-1]
    b = pygpu.array(a, context=ctx)
    c = numpy.asarray(b)

    assert numpy.allclose(c, a)
    assert a.shape == b.shape == c.shape
    # the result array (c) is C contiguous
    assert a.strides == b.strides == (-c.strides[0],) + c.strides[1:]
    assert a.dtype == b.dtype == c.dtype
    assert c.flags.c_contiguous
Example #7
def transfer_fortran(shp, dtype):
    a = numpy.random.rand(*shp) * 10
    a_ = numpy.asfortranarray(a)
    if len(shp) > 1:
        assert a_.strides != a.strides
    a = a_
    b = pygpu.array(a, context=ctx)
    c = numpy.asarray(b)

    assert a.shape == b.shape == c.shape
    assert a.dtype == b.dtype == c.dtype
    assert a.flags.f_contiguous
    assert c.flags.f_contiguous
    assert a.strides == b.strides == c.strides
    assert numpy.allclose(c, a)
Example #8
def transfer_fortran(shp, dtype):
    a = numpy.random.rand(*shp) * 10
    a_ = numpy.asfortranarray(a)
    if len(shp) > 1:
        assert a_.strides != a.strides
    a = a_
    b = pygpu.array(a, context=ctx)
    c = numpy.asarray(b)

    assert a.shape == b.shape == c.shape
    assert a.dtype == b.dtype == c.dtype
    assert a.flags.f_contiguous
    assert c.flags.f_contiguous
    assert a.strides == b.strides == c.strides
    assert numpy.allclose(c, a)
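The transfer tests above reduce to one round trip: copy a numpy array to the device with pygpu.array(..., context=ctx) and bring it back with numpy.asarray. A self-contained sketch, assuming libgpuarray is installed and that a device string such as 'cuda0' is valid on the local machine:

import numpy
import pygpu

# Hedged sketch of the host/device round trip exercised by the tests above;
# the device name passed to pygpu.init is machine-specific.
ctx = pygpu.init('cuda0')
a = numpy.random.rand(3, 4).astype('float32')
b = pygpu.array(a, context=ctx)      # host -> device copy
c = numpy.asarray(b)                 # device -> host copy
assert numpy.allclose(a, c)
assert c.flags.c_contiguous          # a contiguous input comes back C-contiguous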
Example #9
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        lda = max(1, n)

        # cusolver operates on F ordered matrices
        if not self.inplace:
            LU = pygpu.array(A, copy=True, order='F')
        else:
            LU = A.T if A.flags['C_CONTIGUOUS'] else A

        LU_ptr = LU.gpudata

        with context:
            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                context.cusolver_handle, n, n, LU_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype='float32',
                                    context=context)

            pivots = pygpu.zeros(n, dtype='int32', context=context)

            dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            cusolver.cusolverDnSgetrf(context.cusolver_handle, n, n, LU_ptr,
                                      lda, workspace_ptr, pivots_ptr,
                                      dev_info_ptr)

            if self.check_output:
                val_dev_info = np.asarray(dev_info)[0]
                if val_dev_info > 0:
                    raise LinAlgError('LU decomposition failed')

            outputs[1][0] = pivots

        outputs[0][0] = LU
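For reference, cusolverDnSgetrf follows the LAPACK getrf convention: L (with an implicit unit diagonal) and U are packed into a single matrix and the pivots are returned as row-swap indices. A host-side sketch of that convention using SciPy's CPU wrapper (an extra dependency used only for this illustration; note SciPy reports 0-based pivot indices while cuSOLVER uses 1-based):

import numpy as np
from scipy.linalg import lu_factor

A = np.random.rand(4, 4)
LU, piv = lu_factor(A)               # packed L and U, plus row-swap indices
L = np.tril(LU, k=-1) + np.eye(4)    # unit diagonal is implicit in the packed form
U = np.triu(LU)
PA = A.copy()
for i, p in enumerate(piv):          # replay the recorded row swaps on A
    PA[[i, p]] = PA[[p, i]]
assert np.allclose(L @ U, PA)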
Example #10
    def perform(self, node, inputs, outputs):
        ctx = node.inputs[0].type.context

        # Solution set
        x = outputs[0]

        # Matrix.
        A = inputs[0]

        # right hand side
        b = inputs[1]

        assert(len(A.shape) == 2)
        assert(len(b.shape) in [1, 2])

        # implicitly deal with the difference between C order
        # and fortran order by flipping the trans and lower flags
        lower = not self.lower
        trans = self.trans
        if trans in ['T', 'C']:
            trans = 'N'
            l, n = A.shape
        elif trans == 'N':
            trans = 'T'
            n, l = A.shape
        else:
            raise ValueError('Invalid value for trans')

        if b.ndim == 2:
            k, m = b.shape
        else:
            k, = b.shape
            m = 1

        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # solution overwrites right hand side on exit
        b = pygpu.array(b, copy=True, order='F')

        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # unit scalar used for multiplication
        alpha = 1.0
        # indicates matrix A is on left of B
        side = 'l'
        # set whether upper or lower part of matrix A stored
        uplo = 'l' if lower else 'u'
        # indicates elements on diagonal of matrix A may not be unity
        diag = 'n'

        if A.dtype == 'float32':
            trsv = cublas.cublasStrsv
            trsm = cublas.cublasStrsm
        elif A.dtype == 'float64':
            trsv = cublas.cublasDtrsv
            trsm = cublas.cublasDtrsm
        else:
            raise ValueError("Unsupported dtype")

        with ctx:
            if b.ndim == 1:
                # matrix vector solve
                trsv(ctx.cublas_handle, uplo, trans, diag, n,
                     A_ptr, lda, b_ptr, 1)
            else:
                trsm(ctx.cublas_handle, side, uplo, trans, diag,
                     n, m, alpha, A_ptr, lda, b_ptr, ldb)

        x[0] = b
Example #11
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
                    print_=None, id=None, rtol=1e-5, atol=1e-8,
                    nb_iter=0, ones=False, compile_kshp=None):
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this directly.
    #
    # :param kshape: (4d) The shape of the kernel at run time.
    # :param compile_kshp: (2d) Hardcode the shape of the kernel in
    #                      the generated code. This is supposed to be
    #                      faster, but we need to check that we raise
    #                      an error if the input has the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
                numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
                    numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
    img = pygpu.array(npy_img)
    kern = pygpu.array(npy_kern)

    # we take the stride after the transfer, as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    t2 = None
    rval = True
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = gftensor4()
        k = gftensor4()
        op = GpuConv(border_mode=mode,
                     subsample=subsample,
                     version=version,
                     verbose=verbose,
                     kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=mode_with_gpu)
        gpuval = f(img, kern)
        t2 = time.time()
        for i in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            print >> sys.stdout, "ERROR: shape mismatch",
            print >> sys.stdout, gpuval.shape, cpuval.shape
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError as e:
        print >> sys.stdout, '_params_allgood Failed allclose', e
        rval = False

    if (t2 is not None):
        if mode == 'valid':
            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
        else:
            approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] *
                         kshape[3] * ishape[2] * ishape[3] * 2)
        approx_fp /= 1e6
        cpu_mflops = approx_fp / (t1 - t0)
        gpu_mflops = approx_fp / (t2 - t1)
        if verbose > 0:
            print >> sys.stdout, '%15s' % str(ishape), '%15s' % str(kshape),
            print >> sys.stdout, '%12.5f  %7.2f %7.2f %7.1f' % (approx_fp,
                    cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1))
    if not rval:
        print >> sys.stdout, ('test_' + mode + ' id=' + str(id) +
                              ' FAILED for ishape, kshape, mode, subsample,' +
                              ' img_stride, kern_stride, version', ishape,
                              kshape, mode, subsample, img_stride, kern_stride,
                              version)
        diff = cpuval - gpuval
        diffabs = numpy.absolute(diff)
        pr_diff = diffabs / numpy.absolute(cpuval)
        nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
        print "max absolute diff:", (diffabs.max(), "avg abs diff:",
                                     numpy.average(diffabs))
        print "median abs diff:", (numpy.median(diffabs), "nb close:",
                                   nb_close, "/", diff.size)
        print "max relatif diff:", (pr_diff.max(), "avg rel diff:",
                                    numpy.average(pr_diff))
    if not rval and print_ != False:
        if npy_img.shape[0] > 5:
            print "img", npy_img[0]
            print "kern", npy_kern[0]
            print "gpu", gpuval[0][0]
            print "cpu", cpuval[0][0]
            print "diff", diff[0][0]
        else:
            print "img", npy_img
            print "kern", npy_kern
            print "gpu", gpuval
            print "cpu", cpuval
            print "diff", diff

    return rval
Example #12
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
                    print_=None, id=None, rtol=1e-5, atol=1e-8,
                    nb_iter=0, ones=False, compile_kshp=None):
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this directly.
    #
    # :param kshape: (4d) The shape of the kernel at run time.
    # :param compile_kshp: (2d) Hardcode the shape of the kernel in
    #                      the generated code. This is supposed to be
    #                      faster, but we need to check that we raise
    #                      an error if the input has the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
                numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
                    numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
    img = pygpu.array(npy_img)
    kern = pygpu.array(npy_kern)

    # we take the stride after the transfer, as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    t2 = None
    rval = True
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = gftensor4()
        k = gftensor4()
        op = GpuConv(border_mode=mode,
                     subsample=subsample,
                     version=version,
                     verbose=verbose,
                     kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=mode_with_gpu)
        gpuval = f(img, kern)
        t2 = time.time()
        for i in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            print >> sys.stdout, "ERROR: shape mismatch",
            print >> sys.stdout, gpuval.shape, cpuval.shape
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError, e:
        print >> sys.stdout, '_params_allgood Failed allclose', e
        rval = False
Example #13
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Size of the matrices to invert.
        z = outputs[0]

        # Matrix.
        A = inputs[0]

        # Solution vectors.
        b = inputs[1]

        assert(len(A.shape) == 2)
        assert(len(b.shape) == 2)

        if self.trans in ['T', 'C']:
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == 'N':
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError('Invalid value for trans')
        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # We copy A and b as cusolver operates inplace
        b = pygpu.array(b, copy=True, order='F')
        if not self.inplace:
            A = pygpu.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects a F ordered matrix, but A is not explicitly
        # converted between C and F order, instead we switch the
        # "transpose" flag.
        if A.flags['C_CONTIGUOUS']:
            trans = 1 - trans

        if A.dtype == 'float32':
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
            potrs = cusolverDnSpotrs
            getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
            getrf = cusolver.cusolverDnSgetrf
            getrs = cusolver.cusolverDnSgetrs
        elif A.dtype == 'float64':
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
            potrs = cusolverDnDpotrs
            getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
            getrf = cusolver.cusolverDnDgetrf
            getrs = cusolver.cusolverDnDgetrs
        else:
            raise ValueError("Unsupported dtype")

        if self.A_structure == 'symmetric':
            with context:
                workspace_size = potrf_bufferSize(
                    context.cusolver_handle, 0, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                potrf(
                    context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
                    workspace_size, dev_info_ptr)
                self.check_dev_info(dev_info)

                potrs(
                    context.cusolver_handle, 0, n, m, A_ptr, lda,
                    b_ptr, ldb, dev_info_ptr)

        else:
            # general case for A
            with context:
                workspace_size = getrf_bufferSize(
                    context.cusolver_handle, n, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                    context=context)

            pivots = pygpu.zeros(n, dtype='int32', context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                getrf(
                    context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                    pivots_ptr, dev_info_ptr)
                self.check_dev_info(dev_info)

                getrs(
                    context.cusolver_handle, trans, n, m, A_ptr, lda,
                    pivots_ptr, b_ptr, ldb, dev_info_ptr)

        z[0] = b
Example #14
    def run_noncontiguous_triu(self):
        a = numpy.random.rand(5, 5)
        b = pygpu.array(a, context=context)
        b = b[::-1]
        assert b.flags.c_contiguous is b.flags.f_contiguous is False
        triu(b)
Example #15
    def filter_inplace(self, data, old_data, strict=False,
                       allow_downcast=None):
        if (isinstance(data, gpuarray.GpuArray) and
                data.typecode == self.typecode):
            # This is just to make this condition not enter the
            # following branches
            pass
        elif strict:
            if not isinstance(data, gpuarray.GpuArray):
                raise TypeError("%s expected a GpuArray object." % self,
                                data, type(data))
            if self.typecode != data.typecode:
                raise TypeError("%s expected typecode %d (dtype %s), "
                                "got %d (dtype %s)." %
                                (self, self.typecode, self.dtype,
                                 data.typecode, str(data.dtype)))
            if self.context != data.context:
                raise TypeError("data context does not match type context")
            # fallthrough to ndim check
        elif (allow_downcast or
              (allow_downcast is None and
               type(data) == float and
               self.dtype == config.floatX)):
            if not isinstance(data, gpuarray.GpuArray):
                data = np.array(data, dtype=self.dtype, copy=False,
                                ndmin=len(self.broadcastable))
            else:
                data = gpuarray.array(data, dtype=self.typecode, copy=False,
                                      ndmin=len(self.broadcastable),
                                      context=self.context)
        else:
            if not hasattr(data, 'dtype'):
                converted_data = theano._asarray(data, self.dtype)
                # We use the `values_eq` static function from TensorType
                # to handle NaN values.
                if TensorType.values_eq(np.asarray(data),
                                        converted_data,
                                        force_same_dtype=False):
                    data = converted_data

            up_dtype = scalar.upcast(self.dtype, data.dtype)
            if up_dtype == self.dtype:
                if not isinstance(data, gpuarray.GpuArray):
                    data = np.array(data, dtype=self.dtype, copy=False)
                else:
                    data = gpuarray.array(data, dtype=self.dtype, copy=False)
            else:
                raise TypeError("%s cannot store a value of dtype %s "
                                "without risking loss of precision." %
                                (self, data.dtype))

        if self.ndim != data.ndim:
            raise TypeError("Wrong number of dimensions: expected %s, "
                            "got %s with shape %s." % (self.ndim, data.ndim,
                                                       data.shape), data)
        shp = data.shape
        for i, b in enumerate(self.broadcastable):
            if b and shp[i] != 1:
                raise TypeError("Non-unit value on shape on a broadcastable"
                                " dimension.", shp, self.broadcastable)
        if not isinstance(data, gpuarray.GpuArray):
            if old_data is not None and old_data.shape == data.shape and (
                # write() only works if the destination is contiguous.
                    old_data.flags['C_CONTIGUOUS'] or
                    old_data.flags['F_CONTIGUOUS']):
                old_data.write(data)
                data = old_data
            else:
                data = pygpu.array(data, context=self.context)
        return data
Example #16
def _params_allgood(ishape,
                    kshape,
                    mode,
                    subsample=(1, 1),
                    img_stride=(1, 1),
                    kern_stride=(1, 1),
                    version=-1,
                    verbose=0,
                    random=True,
                    print_=None,
                    id=None,
                    rtol=1e-5,
                    atol=1e-8,
                    nb_iter=0,
                    ones=False,
                    compile_kshp=None):
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this directly.
    #
    # :param kshape: (4d) The shape of the kernel at run time.
    # :param compile_kshp: (2d) Hardcode the shape of the kernel in
    #                      the generated code. This is supposed to be
    #                      faster, but we need to check that we raise
    #                      an error if the input has the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape),
                                  dtype='float32') + 1
        npy_kern = -(
            theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape),
                            dtype='float32') + 1)
    img = pygpu.array(npy_img)
    kern = pygpu.array(npy_kern)

    # we take the stride after the transfer, as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    t2 = None
    rval = True
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = gftensor4()
        k = gftensor4()
        op = GpuConv(border_mode=mode,
                     subsample=subsample,
                     version=version,
                     verbose=verbose,
                     kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=mode_with_gpu)
        gpuval = f(img, kern)
        t2 = time.time()
        for i in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            print("ERROR: shape mismatch", end=' ', file=sys.stdout)
            print(gpuval.shape, cpuval.shape, file=sys.stdout)
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError as e:
        print('_params_allgood Failed allclose', e, file=sys.stdout)
        rval = False

    if (t2 is not None):
        if mode == 'valid':
            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
        else:
            approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] *
                         kshape[3] * ishape[2] * ishape[3] * 2)
        approx_fp /= 1e6
        cpu_mflops = approx_fp / (t1 - t0)
        gpu_mflops = approx_fp / (t2 - t1)
        if verbose > 0:
            print('%15s' % str(ishape),
                  '%15s' % str(kshape),
                  end=' ',
                  file=sys.stdout)
            print('%12.5f  %7.2f %7.2f %7.1f' %
                  (approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)),
                  file=sys.stdout)
    if not rval:
        print(('test_' + mode + ' id=' + str(id) +
               ' FAILED for ishape, kshape, mode, subsample,' +
               ' img_stride, kern_stride, version', ishape, kshape, mode,
               subsample, img_stride, kern_stride, version),
              file=sys.stdout)
        diff = cpuval - gpuval
        diffabs = numpy.absolute(diff)
        pr_diff = diffabs / numpy.absolute(cpuval)
        nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
        print("max absolute diff:",
              (diffabs.max(), "avg abs diff:", numpy.average(diffabs)))
        print("median abs diff:",
              (numpy.median(diffabs), "nb close:", nb_close, "/", diff.size))
        print("max relatif diff:",
              (pr_diff.max(), "avg rel diff:", numpy.average(pr_diff)))
    if not rval and print_ != False:
        if npy_img.shape[0] > 5:
            print("img", npy_img[0])
            print("kern", npy_kern[0])
            print("gpu", gpuval[0][0])
            print("cpu", cpuval[0][0])
            print("diff", diff[0][0])
        else:
            print("img", npy_img)
            print("kern", npy_kern)
            print("gpu", gpuval)
            print("cpu", cpuval)
            print("diff", diff)

    return rval
Example #17
def random_array(dtype):
    dtype = np.dtype(dtype)
    if dtype == bool:
        return np.random.randint(low=0, high=2, size=10).astype(bool)
    elif np.issubsctype(dtype, np.unsignedinteger):
        return np.random.randint(low=0, high=10, size=10).astype(dtype)
    elif np.issubsctype(dtype, np.signedinteger):
        return np.random.randint(low=0, high=10, size=10).astype(dtype)
    elif np.issubsctype(dtype, np.floating):
        return np.random.uniform(low=-4, high=4, size=10).astype(dtype)
    else:
        raise ValueError('unable to handle dtype {}'.format(dtype))


x_npy = random_array(dtype)
# alternative input with special values (floating dtypes only):
# x_npy = np.array([0, 1, -1, np.inf, -np.inf, np.nan], dtype=dtype)
x_pygpu = pygpu.array(x_npy, dtype=dtype)
# x_pygpu = pygpu.array([0, 1, -1, np.inf, -np.inf, np.nan], dtype=dtype)
ufunc_npy = getattr(np, ufunc)
ufunc_pygpu = getattr(pygpu.ufuncs, ufunc)
res_npy = ufunc_npy(x_npy)
res_pygpu = ufunc_pygpu(x_pygpu)

print('=== testing ufunc {} for dtype {} ==='.format(ufunc, dtype))
print('x =', x_npy)
print('npy:   {}(x) ='.format(ufunc))
print(res_npy)
print('pygpu: {}(x) ='.format(ufunc))
print(res_pygpu)
Example #18

def random_array(dtype):
    dtype = np.dtype(dtype)
    if dtype == bool:
        return np.random.randint(low=0, high=2, size=10).astype(bool)
    elif np.issubsctype(dtype, np.unsignedinteger):
        return np.random.randint(low=0, high=10, size=10).astype(dtype)
    elif np.issubsctype(dtype, np.signedinteger):
        return np.random.randint(low=0, high=10, size=10).astype(dtype)
    elif np.issubsctype(dtype, np.floating):
        return np.random.uniform(low=-4, high=4, size=10).astype(dtype)
    else:
        raise ValueError('unable to handle dtype {}'.format(dtype))


x_npy = random_array(dtype)
x_pygpu = pygpu.array(x_npy, dtype=dtype)
ufunc_npy = getattr(np, ufunc)
ufunc_pygpu = getattr(pygpu.ufuncs, ufunc)
res_npy = ufunc_npy.reduce(x_npy, axis=axis, keepdims=keepdims)
res_pygpu = ufunc_pygpu.reduce(x_pygpu, axis=axis, keepdims=keepdims)

print('=== testing reduce of ufunc {} for dtype {} ==='
      ''.format(ufunc, dtype))
print('x =', x_npy)
print('npy:   {}.reduce(x) ='.format(ufunc))
print(res_npy)
print('pygpu: {}.reduce(x) ='.format(ufunc))
print(res_pygpu)
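The two scripts above expect ufunc, dtype, axis and keepdims to be injected by a test driver. A self-contained sketch of the same CPU/GPU comparison for one concrete case (the device string and the presence of pygpu.ufuncs.sin on a given install are assumptions):

import numpy as np
import pygpu

ctx = pygpu.init('cuda0')                      # machine-specific device string
x_npy = np.linspace(-2, 2, 10, dtype='float32')
x_pygpu = pygpu.array(x_npy, context=ctx)
res_npy = np.sin(x_npy)                        # CPU reference
res_pygpu = pygpu.ufuncs.sin(x_pygpu)          # GPU ufunc
assert np.allclose(res_npy, np.asarray(res_pygpu), atol=1e-6)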
Example #19
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
                    print_=None, id=None, rtol=1e-5, atol=1e-8,
                    nb_iter=0, ones=False, compile_kshp=None):
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this directly.
    #
    # :param kshape: (4d) The shape of the kernel at run time.
    # :param compile_kshp: (2d) Hardcode the shape of the kernel in
    #                      the generated code. This is supposed to be
    #                      faster, but we need to check that we raise
    #                      an error if the input has the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
                numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
                    numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
    img = pygpu.array(npy_img)
    kern = pygpu.array(npy_kern)

    # we take the stride after the transfer, as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    t2 = None
    rval = True
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = gftensor4()
        k = gftensor4()
        op = GpuConv(border_mode=mode,
                     subsample=subsample,
                     version=version,
                     verbose=verbose,
                     kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=mode_with_gpu)
        gpuval = f(img, kern)
        t2 = time.time()
        for i in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            print >> sys.stdout, "ERROR: shape mismatch",
            print >> sys.stdout, gpuval.shape, cpuval.shape
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError, e:
        print >> sys.stdout, '_params_allgood Failed allclose', e
        rval = False
Example #20
    def filter_inplace(self,
                       data,
                       old_data,
                       strict=False,
                       allow_downcast=None):
        if (isinstance(data, gpuarray.GpuArray)
                and data.typecode == self.typecode):
            # This is just to make this condition not enter the
            # following branches
            pass
        elif strict:
            if not isinstance(data, gpuarray.GpuArray):
                raise TypeError("%s expected a GpuArray object." % self, data,
                                type(data))
            if self.typecode != data.typecode:
                raise TypeError("%s expected typecode %d (dtype %s), "
                                "got %d (dtype %s)." %
                                (self, self.typecode, self.dtype,
                                 data.typecode, str(data.dtype)))
            if self.context != data.context:
                raise TypeError("data context does not match type context")
            # fallthrough to ndim check
        elif (allow_downcast or (allow_downcast is None and type(data) == float
                                 and self.dtype == config.floatX)):
            if not isinstance(data, gpuarray.GpuArray):
                data = np.array(data,
                                dtype=self.dtype,
                                copy=False,
                                ndmin=len(self.broadcastable))
            else:
                data = gpuarray.array(data,
                                      dtype=self.typecode,
                                      copy=False,
                                      ndmin=len(self.broadcastable),
                                      context=self.context)
        else:
            if not hasattr(data, 'dtype'):
                converted_data = theano._asarray(data, self.dtype)
                # We use the `values_eq` static function from TensorType
                # to handle NaN values.
                if TensorType.values_eq(np.asarray(data),
                                        converted_data,
                                        force_same_dtype=False):
                    data = converted_data

            up_dtype = scalar.upcast(self.dtype, data.dtype)
            if up_dtype == self.dtype:
                if not isinstance(data, gpuarray.GpuArray):
                    data = np.array(data, dtype=self.dtype, copy=False)
                else:
                    data = gpuarray.array(data, dtype=self.dtype, copy=False)
            else:
                raise TypeError("%s cannot store a value of dtype %s "
                                "without risking loss of precision." %
                                (self, data.dtype))

        if self.ndim != data.ndim:
            raise TypeError(
                "Wrong number of dimensions: expected %s, "
                "got %s with shape %s." % (self.ndim, data.ndim, data.shape),
                data)
        shp = data.shape
        for i, b in enumerate(self.broadcastable):
            if b and shp[i] != 1:
                raise TypeError(
                    "Non-unit value on shape on a broadcastable"
                    " dimension.", shp, self.broadcastable)
        if not isinstance(data, gpuarray.GpuArray):
            if old_data is not None and old_data.shape == data.shape:
                old_data.write(data)
                data = old_data
            else:
                data = pygpu.array(data, context=self.context)
        return data
Example #21
    def perform(self, node, inputs, outputs):
        ctx = node.inputs[0].type.context

        # Solution set
        x = outputs[0]

        # Matrix.
        A = inputs[0]

        # right hand side
        b = inputs[1]

        assert len(A.shape) == 2
        assert len(b.shape) in [1, 2]

        # implicitly deal with the difference between C order
        # and fortran order by flipping the trans and lower flags
        lower = not self.lower
        trans = self.trans
        if trans in ["T", "C"]:
            trans = "N"
            l, n = A.shape
        elif trans == "N":
            trans = "T"
            n, l = A.shape
        else:
            raise ValueError("Invalid value for trans")

        if b.ndim == 2:
            k, m = b.shape
        else:
            (k, ) = b.shape
            m = 1

        if l != n:
            raise ValueError("A must be a square matrix")
        if n != k:
            raise ValueError("A and b must be aligned.")

        lda = max(1, n)
        ldb = max(1, k)

        # solution overwrites right hand side on exit
        b = pygpu.array(b, copy=True, order="F")

        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # unit scalar used for multiplication
        alpha = 1.0
        # indicates matrix A is on left of B
        side = "l"
        # set whether upper or lower part of matrix A stored
        uplo = "l" if lower else "u"
        # indicates elements on diagonal of matrix A may not be unity
        diag = "n"

        if A.dtype == "float32":
            trsv = cublas.cublasStrsv
            trsm = cublas.cublasStrsm
        elif A.dtype == "float64":
            trsv = cublas.cublasDtrsv
            trsm = cublas.cublasDtrsm
        else:
            raise ValueError("Unsupported dtype")

        with ctx:
            if b.ndim == 1:
                # matrix vector solve
                trsv(ctx.cublas_handle, uplo, trans, diag, n, A_ptr, lda,
                     b_ptr, 1)
            else:
                trsm(
                    ctx.cublas_handle,
                    side,
                    uplo,
                    trans,
                    diag,
                    n,
                    m,
                    alpha,
                    A_ptr,
                    lda,
                    b_ptr,
                    ldb,
                )

        x[0] = b
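A CPU reference for what the cuBLAS trsv/trsm calls above compute: solving A x = b where only one triangle of A is referenced. A hedged sketch with SciPy, used here purely to illustrate the expected result:

import numpy as np
from scipy.linalg import solve_triangular

A = np.tril(np.random.rand(4, 4)) + 4 * np.eye(4)   # well-conditioned lower-triangular matrix
b = np.random.rand(4)
x = solve_triangular(A, b, lower=True)              # analogue of the trsv path above
assert np.allclose(A @ x, b)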
Example #22
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Size of the matrices to invert.
        z = outputs[0]

        # Matrix.
        A = inputs[0]

        # Solution vectors.
        b = inputs[1]

        assert len(A.shape) == 2
        assert len(b.shape) == 2

        if self.trans in ["T", "C"]:
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == "N":
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError("Invalid value for trans")
        if l != n:
            raise ValueError("A must be a square matrix")
        if n != k:
            raise ValueError("A and b must be aligned.")

        lda = max(1, n)
        ldb = max(1, k)

        # We copy A and b as cusolver operates inplace
        b = pygpu.array(b, copy=True, order="F")
        if not self.inplace:
            A = pygpu.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects a F ordered matrix, but A is not explicitly
        # converted between C and F order, instead we switch the
        # "transpose" flag.
        if A.flags["C_CONTIGUOUS"]:
            trans = 1 - trans

        if A.dtype == "float32":
            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
            potrf = cusolver.cusolverDnSpotrf
            potrs = cusolverDnSpotrs
            getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
            getrf = cusolver.cusolverDnSgetrf
            getrs = cusolver.cusolverDnSgetrs
        elif A.dtype == "float64":
            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
            potrf = cusolver.cusolverDnDpotrf
            potrs = cusolverDnDpotrs
            getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
            getrf = cusolver.cusolverDnDgetrf
            getrs = cusolver.cusolverDnDgetrs
        else:
            raise ValueError("Unsupported dtype")

        if self.A_structure == "symmetric":
            with context:
                workspace_size = potrf_bufferSize(context.cusolver_handle, 0,
                                                  n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype=A.dtype,
                                    context=context)

            dev_info = pygpu.zeros((1, ), dtype="int32", context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                potrf(
                    context.cusolver_handle,
                    0,
                    n,
                    A_ptr,
                    lda,
                    workspace_ptr,
                    workspace_size,
                    dev_info_ptr,
                )
                self.check_dev_info(dev_info)

                potrs(
                    context.cusolver_handle,
                    0,
                    n,
                    m,
                    A_ptr,
                    lda,
                    b_ptr,
                    ldb,
                    dev_info_ptr,
                )

        else:
            # general case for A
            with context:
                workspace_size = getrf_bufferSize(context.cusolver_handle, n,
                                                  n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype=A.dtype,
                                    context=context)

            pivots = pygpu.zeros(n, dtype="int32", context=context)

            dev_info = pygpu.zeros((1, ), dtype="int32", context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                getrf(
                    context.cusolver_handle,
                    n,
                    n,
                    A_ptr,
                    lda,
                    workspace_ptr,
                    pivots_ptr,
                    dev_info_ptr,
                )
                self.check_dev_info(dev_info)

                getrs(
                    context.cusolver_handle,
                    trans,
                    n,
                    m,
                    A_ptr,
                    lda,
                    pivots_ptr,
                    b_ptr,
                    ldb,
                    dev_info_ptr,
                )

        z[0] = b
Example #23
    def perform(self, node, inputs, outputs):
        ctx = node.inputs[0].type.context

        # Solution set
        x = outputs[0]

        # Matrix.
        A = inputs[0]

        # right hand side
        b = inputs[1]

        assert (len(A.shape) == 2)
        assert (len(b.shape) in [1, 2])

        # implicitly deal with the difference between C order
        # and fortran order by flipping the trans and lower flags
        lower = not self.lower
        trans = self.trans
        if trans in ['T', 'C']:
            trans = 'N'
            l, n = A.shape
        elif trans == 'N':
            trans = 'T'
            n, l = A.shape
        else:
            raise ValueError('Invalid value for trans')

        if b.ndim == 2:
            k, m = b.shape
        else:
            k, = b.shape
            m = 1

        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # solution overwrites right hand side on exit
        b = pygpu.array(b, copy=True, order='F')

        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # unit scalar used for multiplication
        alpha = 1.0
        # indicates matrix A is on left of B
        side = 'l'
        # set whether upper or lower part of matrix A stored
        uplo = 'l' if lower else 'u'
        # indicates elements on diagonal of matrix A may not be unity
        diag = 'n'

        with ctx:
            if b.ndim == 1:
                # matrix vector solve
                cublas.cublasStrsv(ctx.cublas_handle, uplo, trans, diag, n,
                                   A_ptr, lda, b_ptr, 1)
            else:
                cublas.cublasStrsm(ctx.cublas_handle, side, uplo, trans, diag,
                                   n, m, alpha, A_ptr, lda, b_ptr, ldb)

        x[0] = b
Example #24
    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Size of the matrices to invert.
        z = outputs[0]

        # Matrix.
        A = inputs[0]

        # Solution vectors.
        b = inputs[1]

        assert (len(A.shape) == 2)
        assert (len(b.shape) == 2)

        if self.trans in ['T', 'C']:
            trans = 1
            l, n = A.shape
            k, m = b.shape
        elif self.trans == 'N':
            trans = 0
            n, l = A.shape
            k, m = b.shape
        else:
            raise ValueError('Invalid value for trans')
        if l != n:
            raise ValueError('A must be a square matrix')
        if n != k:
            raise ValueError('A and b must be aligned.')

        lda = max(1, n)
        ldb = max(1, k)

        # We copy A and b as cusolver operates inplace
        b = pygpu.array(b, copy=True, order='F')
        if not self.inplace:
            A = pygpu.array(A, copy=True)
        A_ptr = A.gpudata
        b_ptr = b.gpudata

        # cusolver expects a F ordered matrix, but A is not explicitly
        # converted between C and F order, instead we switch the
        # "transpose" flag.
        if A.flags['C_CONTIGUOUS']:
            trans = 1 - trans

        if self.A_structure == 'symmetric':
            with context:
                workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
                    context.cusolver_handle, 0, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype='float32',
                                    context=context)

            dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                cusolver.cusolverDnSpotrf(context.cusolver_handle, 0, n, A_ptr,
                                          lda, workspace_ptr, workspace_size,
                                          dev_info_ptr)
                self.check_dev_info(dev_info)

                cusolverDnSpotrs(context.cusolver_handle, 0, n, m, A_ptr, lda,
                                 b_ptr, ldb, dev_info_ptr)

        else:
            # general case for A
            with context:
                workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                    context.cusolver_handle, n, n, A_ptr, lda)

            workspace = pygpu.zeros(workspace_size,
                                    dtype='float32',
                                    context=context)

            pivots = pygpu.zeros(n, dtype='int32', context=context)

            dev_info = pygpu.zeros((1, ), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            pivots_ptr = pivots.gpudata
            dev_info_ptr = dev_info.gpudata

            with context:
                cusolver.cusolverDnSgetrf(context.cusolver_handle, n, n, A_ptr,
                                          lda, workspace_ptr, pivots_ptr,
                                          dev_info_ptr)
                self.check_dev_info(dev_info)

                cusolver.cusolverDnSgetrs(context.cusolver_handle, trans, n, m,
                                          A_ptr, lda, pivots_ptr, b_ptr, ldb,
                                          dev_info_ptr)

        z[0] = b
Example #25
    def filter_inplace(self,
                       data,
                       old_data,
                       strict=False,
                       allow_downcast=None):
        if isinstance(data,
                      gpuarray.GpuArray) and data.typecode == self.typecode:
            # This is just to make this condition not enter the
            # following branches
            pass
        elif strict:
            if not isinstance(data, gpuarray.GpuArray):
                raise TypeError(f"{self} expected a GpuArray object.", data,
                                type(data))
            if self.typecode != data.typecode:
                raise TypeError(
                    f"{self} expected typecode {int(self.typecode)} (dtype {self.dtype}), "
                    f"got {int(data.typecode)} (dtype {data.dtype}).")
            if self.context != data.context:
                raise TypeError("data context does not match type context")
            # fallthrough to ndim check
        elif allow_downcast or (allow_downcast is None and type(data) == float
                                and self.dtype == config.floatX):
            if not isinstance(data, gpuarray.GpuArray):
                data = np.array(data,
                                dtype=self.dtype,
                                copy=False,
                                ndmin=len(self.broadcastable))
            else:
                data = gpuarray.array(
                    data,
                    dtype=self.typecode,
                    copy=False,
                    ndmin=len(self.broadcastable),
                    context=self.context,
                )
        else:
            if not hasattr(data, "dtype"):
                converted_data = _asarray(data, self.dtype)
                # We use the `values_eq` static function from TensorType
                # to handle NaN values.
                if TensorType.values_eq(np.asarray(data),
                                        converted_data,
                                        force_same_dtype=False):
                    data = converted_data

            up_dtype = scalar.upcast(self.dtype, data.dtype)
            if up_dtype == self.dtype:
                if not isinstance(data, gpuarray.GpuArray):
                    data = np.array(data, dtype=self.dtype, copy=False)
                else:
                    data = gpuarray.array(data, dtype=self.dtype, copy=False)
            else:
                raise TypeError(
                    f"{self} cannot store a value of dtype {data.dtype} "
                    "without risking loss of precision.")

        if self.ndim != data.ndim:
            raise TypeError(
                f"Wrong number of dimensions: expected {self.ndim}, "
                f"got {data.ndim} with shape {data.shape}.",
                data,
            )
        shp = data.shape
        for i, b in enumerate(self.broadcastable):
            if b and shp[i] != 1:
                raise TypeError(
                    "Non-unit value on shape on a broadcastable"
                    " dimension.",
                    shp,
                    self.broadcastable,
                )
        if not isinstance(data, gpuarray.GpuArray):
            if (old_data is not None and old_data.shape == data.shape and (
                    # write() only works if the destination is contiguous.
                    old_data.flags["C_CONTIGUOUS"]
                    or old_data.flags["F_CONTIGUOUS"])):
                old_data.write(data)
                data = old_data
            else:
                data = pygpu.array(data, context=self.context)
        return data
Example #26
    def run_noncontiguous_triu(self):
        a = numpy.random.rand(5, 5)
        a = a[::-1]
        b = pygpu.array(a, context=context)
        assert b.flags.c_contiguous is b.flags.f_contiguous is False
        triu(b)
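For comparison, the contiguity situation these two tests set up can be reproduced with plain numpy: reversing the leading axis yields a view that is neither C- nor F-contiguous, which is exactly the case the GPU triu/tril helpers must handle (the Cholesky example above relies on them zeroing the unwanted triangle in place, whereas numpy.triu returns a new array):

import numpy

a = numpy.random.rand(5, 5)
b = a[::-1]                          # negative stride on axis 0
assert not b.flags.c_contiguous
assert not b.flags.f_contiguous
numpy.triu(b)                        # numpy copies; the GPU helpers work in place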