Esempio n. 1
0
def pculaConfigInit(config):
    """
    Initialize pCULA configuration structure to sensible defaults.

    Thin ctypes wrapper: passes ``config`` by reference to
    ``_libpcula.pculaConfigInit`` and raises through ``culaCheckStatus``
    if the library reports a non-success status.

    Parameters
    ----------
    config : ctypes.Structure
        pCULA configuration structure; filled in place by the library.
    """

    status = _libpcula.pculaConfigInit(ctypes.byref(config))
    culaCheckStatus(status)
Esempio n. 2
0
def pculaConfigInit(config):
    """
    Initialize pCULA configuration structure to sensible defaults.

    Parameters
    ----------
    config : ctypes.Structure
        pCULA configuration structure, passed by reference and filled
        in place by ``_libpcula.pculaConfigInit``.

    Raises
    ------
    Via ``culaCheckStatus`` when the library returns an error status.
    """

    status = _libpcula.pculaConfigInit(ctypes.byref(config))
    culaCheckStatus(status)
Esempio n. 3
0
def pculaZpotrf(config, uplo, n, a, lda):
    """
    Cholesky decomposition.

    Wraps ``_libpcula.pculaZpotrf``; a non-success status is raised via
    ``culaCheckStatus``.

    Parameters
    ----------
    config : ctypes.Structure
        pCULA configuration, passed by reference.
    uplo
        Which triangle of the matrix to use (presumably 'U'/'L' as in
        LAPACK potrf -- TODO confirm against the pCULA headers).
    n
        Order of the matrix.
    a
        Matrix address, coerced with ``int()`` (presumably a device
        pointer / gpudata value -- verify against callers).
    lda
        Leading dimension of ``a``.
    """

    status = _libpcula.pculaZpotrf(ctypes.byref(config), uplo, n, int(a), lda)
    culaCheckStatus(status)
Esempio n. 4
0
def pculaZpotrf(config, uplo, n, a, lda):
    """
    Cholesky decomposition.

    ctypes wrapper around ``_libpcula.pculaZpotrf``: ``config`` is
    passed by reference, ``a`` is coerced to an integer address
    (presumably a device pointer -- TODO confirm), and the returned
    status code is checked with ``culaCheckStatus``.
    """

    status = _libpcula.pculaZpotrf(ctypes.byref(config), uplo, n, int(a), lda)
    culaCheckStatus(status)
Esempio n. 5
0
def pculaZpotrs(config, uplo, n, nrhs, a, lda, b, ldb):
    """
    Cholesky solve.

    Wraps ``_libpcula.pculaZpotrs``; errors surface through
    ``culaCheckStatus``.

    Parameters
    ----------
    config : ctypes.Structure
        pCULA configuration, passed by reference.
    uplo
        Triangle selector (presumably 'U'/'L' -- TODO confirm).
    n, nrhs
        Matrix order and number of right-hand sides.
    a, b
        Factorized matrix and right-hand-side addresses; both coerced
        with ``int()`` (presumably device pointers).
    lda, ldb
        Leading dimensions of ``a`` and ``b``.
    """

    status = _libpcula.pculaZpotrs(ctypes.byref(config), uplo, n, nrhs, int(a),
                                   lda, int(b), ldb)
    culaCheckStatus(status)
Esempio n. 6
0
def pculaZgetrf(config, m, n, a, lda, ipiv):
    """
    LU decomposition.

    Wraps ``_libpcula.pculaZgetrf``; errors surface through
    ``culaCheckStatus``.

    Parameters
    ----------
    config : ctypes.Structure
        pCULA configuration, passed by reference.
    m, n
        Matrix dimensions.
    a
        Matrix address, coerced with ``int()`` (presumably a device
        pointer).
    lda
        Leading dimension of ``a``.
    ipiv
        Pivot-index buffer address, coerced with ``int()``.
    """

    status = _libpcula.pculaZgetrf(ctypes.byref(config), m, n, int(a), lda,
                                  int(ipiv))
    culaCheckStatus(status)
Esempio n. 7
0
def pculaZgesv(config, n, nrhs, a, lda, ipiv, b, ldb):
    """
    General system solve using LU decomposition.

    Wraps ``_libpcula.pculaZgesv``; errors surface through
    ``culaCheckStatus``.

    Parameters
    ----------
    config : ctypes.Structure
        pCULA configuration, passed by reference.
    n, nrhs
        Matrix order and number of right-hand sides.
    a, ipiv, b
        Matrix, pivot buffer, and right-hand-side addresses; all
        coerced with ``int()`` (presumably device pointers).
    lda, ldb
        Leading dimensions of ``a`` and ``b``.
    """

    status = _libpcula.pculaZgesv(ctypes.byref(config), n, nrhs, int(a), lda,
                                  int(ipiv), int(b), ldb)
    culaCheckStatus(status)
Esempio n. 8
0
def pculaZtrsm(config, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb):
    """
    Triangular system solve.

    Forwards all arguments to ``_libpcula.pculaZtrsm`` (``config`` by
    reference, matrix arguments as integer addresses) and checks the
    returned status code with ``culaCheckStatus``.
    """

    call_args = (ctypes.byref(config), side, uplo, transa, diag,
                 m, n, alpha, int(a), lda, int(b), ldb)
    culaCheckStatus(_libpcula.pculaZtrsm(*call_args))
Esempio n. 9
0
def pculaZposv(config, uplo, n, nrhs, a, lda, b, ldb):
    """
    Positive-definite system solve.

    NOTE(review): the previous docstring said "QR factorization", which
    does not match the ``pculaZposv`` routine invoked below (a
    posv-style call taking uplo/nrhs/b arguments); corrected here.

    Parameters
    ----------
    config : ctypes.Structure
        pCULA configuration, passed by reference.
    uplo
        Triangle selector (presumably 'U'/'L' -- TODO confirm).
    n, nrhs
        Matrix order and number of right-hand sides.
    a, b
        Matrix and right-hand-side addresses; coerced with ``int()``
        (presumably device pointers).
    lda, ldb
        Leading dimensions of ``a`` and ``b``.
    """

    status = _libpcula.pculaZposv(ctypes.byref(config), uplo, n, nrhs, int(a), lda,
                                   int(b), ldb)
    culaCheckStatus(status)
Esempio n. 10
0
def pculaZgetrs(config, trans, n, nrhs, a, lda, ipiv, b, ldb):
    """
    LU solve.

    Wraps ``_libpcula.pculaZgetrs``; errors surface through
    ``culaCheckStatus``.

    Parameters
    ----------
    config : ctypes.Structure
        pCULA configuration, passed by reference.
    trans
        Transpose flag (presumably 'N'/'T'/'C' as in LAPACK getrs --
        TODO confirm).
    n, nrhs
        Matrix order and number of right-hand sides.
    a, ipiv, b
        Factorized matrix, pivot buffer, and right-hand-side addresses;
        coerced with ``int()`` (presumably device pointers).
    lda, ldb
        Leading dimensions of ``a`` and ``b``.
    """

    status = _libpcula.pculaZgetrs(ctypes.byref(config), trans, n, nrhs, int(a), lda,
                                  int(ipiv), int(b), ldb)
    culaCheckStatus(status)
Esempio n. 11
0
def pculaZpotrs(config, uplo, n, nrhs, a, lda, b, ldb):
    """
    Cholesky solve.

    ctypes wrapper around ``_libpcula.pculaZpotrs``: ``config`` is
    passed by reference, ``a`` and ``b`` are coerced to integer
    addresses (presumably device pointers -- TODO confirm), and the
    status code is checked with ``culaCheckStatus``.
    """

    status = _libpcula.pculaZpotrs(ctypes.byref(config), uplo, n, nrhs, int(a),
                                   lda, int(b), ldb)
    culaCheckStatus(status)
Esempio n. 12
0
def pculaZposv(config, uplo, n, nrhs, a, lda, b, ldb):
    """
    Positive-definite system solve.

    NOTE(review): the previous docstring said "QR factorization", which
    does not match the ``pculaZposv`` routine invoked below (a
    posv-style call taking uplo/nrhs/b arguments); corrected here.

    ctypes wrapper: ``config`` is passed by reference, ``a``/``b`` are
    coerced to integer addresses (presumably device pointers), and the
    status is checked with ``culaCheckStatus``.
    """

    status = _libpcula.pculaZposv(ctypes.byref(config), uplo, n, nrhs, int(a),
                                  lda, int(b), ldb)
    culaCheckStatus(status)
Esempio n. 13
0
def pculaZgetrs(config, trans, n, nrhs, a, lda, ipiv, b, ldb):
    """
    LU solve.

    ctypes wrapper around ``_libpcula.pculaZgetrs``: ``config`` is
    passed by reference; ``a``, ``ipiv`` and ``b`` are coerced to
    integer addresses (presumably device pointers -- TODO confirm);
    errors surface through ``culaCheckStatus``.
    """

    status = _libpcula.pculaZgetrs(ctypes.byref(config), trans, n, nrhs,
                                   int(a), lda, int(ipiv), int(b), ldb)
    culaCheckStatus(status)
Esempio n. 14
0
def pculaZtrsm(config, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb):
    """
    Triangular system solve.

    ctypes wrapper around ``_libpcula.pculaZtrsm``: ``config`` is
    passed by reference, ``a`` and ``b`` are coerced to integer
    addresses (presumably device pointers -- TODO confirm), and the
    status code is checked with ``culaCheckStatus``.  The remaining
    arguments follow the trsm-style parameter list (side, uplo,
    transa, diag, dimensions, scalar alpha, leading dimensions).
    """

    status = _libpcula.pculaZtrsm(ctypes.byref(config), side, uplo, transa,
                                  diag, m, n, alpha, int(a), lda, int(b), ldb)
    culaCheckStatus(status)
Esempio n. 15
0
def pculaZgetrf(config, m, n, a, lda, ipiv):
    """
    LU decomposition.

    ctypes wrapper around ``_libpcula.pculaZgetrf``: ``config`` is
    passed by reference; ``a`` and ``ipiv`` are coerced to integer
    addresses (presumably device pointers -- TODO confirm); the status
    code is checked with ``culaCheckStatus``.
    """

    status = _libpcula.pculaZgetrf(ctypes.byref(config), m, n, int(a), lda,
                                   int(ipiv))
    culaCheckStatus(status)
Esempio n. 16
0
def pculaZgesv(config, n, nrhs, a, lda, ipiv, b, ldb):
    """
    General system solve using LU decomposition.

    ctypes wrapper around ``_libpcula.pculaZgesv``: ``config`` is
    passed by reference; ``a``, ``ipiv`` and ``b`` are coerced to
    integer addresses (presumably device pointers -- TODO confirm);
    errors surface through ``culaCheckStatus``.
    """

    status = _libpcula.pculaZgesv(ctypes.byref(config), n, nrhs, int(a), lda,
                                  int(ipiv), int(b), ldb)
    culaCheckStatus(status)
Esempio n. 17
0
def pculaDgemm(config, transa, transb, m, n, k, alpha, A, lda, B, ldb,
               beta, C, ldc):
    """
    Matrix-matrix product for general matrix.

    Forwards a gemm-style argument list to ``_libpcula.pculaDgemm``
    (``config`` by reference, matrix arguments as integer addresses)
    and checks the returned status code with ``culaCheckStatus``.
    """

    call_args = (ctypes.byref(config), transa, transb, m, n, k, alpha,
                 int(A), lda, int(B), ldb, beta, int(C), ldc)
    culaCheckStatus(_libpcula.pculaDgemm(*call_args))
Esempio n. 18
0
def pculaDgemm(config, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C,
               ldc):
    """
    Matrix-matrix product for general matrix.

    ctypes wrapper around ``_libpcula.pculaDgemm``: ``config`` is
    passed by reference; ``A``, ``B`` and ``C`` are coerced to integer
    addresses (presumably device pointers -- TODO confirm).  The
    remaining arguments follow the gemm-style parameter list
    (transpose flags, dimensions, scalars alpha/beta, leading
    dimensions).  Errors surface through ``culaCheckStatus``.
    """

    status = _libpcula.pculaDgemm(ctypes.byref(config), transa, transb, m, n,
                                  k, alpha, int(A), lda, int(B), ldb, beta,
                                  int(C), ldc)
    culaCheckStatus(status)
Esempio n. 19
0
def svd(a_gpu, jobu='A', jobvt='A'):
    """
    Singular Value Decomposition.

    Factors the matrix `a` into two unitary matrices, `u` and `vh`,
    and a 1-dimensional array of real, non-negative singular values,
    `s`, such that `a == dot(u.T, dot(diag(s), vh.T))`.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)` to decompose.
    jobu : {'A', 'S', 'O', 'N'}
        If 'A', return the full `u` matrix with shape `(m, m)`.
        If 'S', return the `u` matrix with shape `(m, k)`.
        If 'O', return the `u` matrix with shape `(m, k) without
        allocating a new matrix.
        If 'N', don't return `u`.
    jobvt : {'A', 'S', 'O', 'N'}
        If 'A', return the full `vh` matrix with shape `(n, n)`.
        If 'S', return the `vh` matrix with shape `(k, n)`.
        If 'O', return the `vh` matrix with shape `(k, n) without
        allocating a new matrix.
        If 'N', don't return `vh`.

    Returns
    -------
    u : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(m, m)` or `(m, k)` depending on
        value of `jobu`.
    s : pycuda.gpuarray.GPUArray
        Array containing the singular values, sorted such that `s[i] >= s[i+1]`.
        `s` is of length `min(m, n)`.
    vh : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(n, n)` or `(k, n)`, depending
        on `jobvt`.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix regardless
    of the values of `jobu` and `jobvt`.

    Only one of `jobu` or `jobvt` may be set to `O`, and then only for
    a square matrix.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.random.randn(9, 6) + 1j*np.random.randn(9, 6)
    >>> a = np.asarray(a, np.complex64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 'S', 'S')
    >>> np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), 1e-4)
    True

    """

    if not _has_cula:
        # Fixed: was `NotImplementError`, an undefined name that would
        # have raised NameError instead of the intended exception.
        raise NotImplementedError('CULA not installed')

    # The free version of CULA only supports single precision floating
    # point numbers:
    data_type = a_gpu.dtype.type
    real_type = np.float32
    if data_type == np.complex64:
        cula_func = cula._libcula.culaDeviceCgesvd
    elif data_type == np.float32:
        cula_func = cula._libcula.culaDeviceSgesvd
    else:
        if cula._libcula_toolkit == 'standard':
            if data_type == np.complex128:
                cula_func = cula._libcula.culaDeviceZgesvd
            elif data_type == np.float64:
                cula_func = cula._libcula.culaDeviceDgesvd
            else:
                raise ValueError('unsupported type')
            real_type = np.float64
        else:
            raise ValueError('double precision not supported')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed:
    n, m = a_gpu.shape
    square = (n == m)

    # Since the input matrix is transposed, jobu and jobvt must also
    # be switched because the computed matrices will be returned in
    # reversed order:
    jobvt, jobu = jobu, jobvt

    # Set the leading dimension of the input matrix:
    lda = max(1, m)

    # Allocate the array of singular values:
    s_gpu = gpuarray.empty(min(m, n), real_type)

    # Set the leading dimension and allocate u:
    jobu = upper(jobu)
    jobvt = upper(jobvt)
    ldu = m
    if jobu == 'A':
        u_gpu = gpuarray.empty((ldu, m), data_type)
    elif jobu == 'S':
        u_gpu = gpuarray.empty((min(m, n), ldu), data_type)
    elif jobu == 'O':
        if not square:
            raise ValueError('in-place computation of singular vectors ' +
                             'of non-square matrix not allowed')
        ldu = 1
        u_gpu = a_gpu
    else:
        ldu = 1
        u_gpu = gpuarray.empty((), data_type)

    # Set the leading dimension and allocate vh:
    if jobvt == 'A':
        ldvt = n
        vh_gpu = gpuarray.empty((n, n), data_type)
    elif jobvt == 'S':
        ldvt = min(m, n)
        vh_gpu = gpuarray.empty((n, ldvt), data_type)
    elif jobvt == 'O':
        if jobu == 'O':
            raise ValueError('jobu and jobvt cannot both be O')
        if not square:
            raise ValueError('in-place computation of singular vectors ' +
                             'of non-square matrix not allowed')
        ldvt = 1
        vh_gpu = a_gpu
    else:
        ldvt = 1
        vh_gpu = gpuarray.empty((), data_type)

    # Compute SVD and check error status:

    status = cula_func(jobu, jobvt, m, n, int(a_gpu.gpudata), lda,
                       int(s_gpu.gpudata), int(u_gpu.gpudata), ldu,
                       int(vh_gpu.gpudata), ldvt)

    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    # Since the input is assumed to be transposed, it is necessary to
    # return the computed matrices in reverse order:
    if jobu in ['A', 'S', 'O'] and jobvt in ['A', 'S', 'O']:
        return vh_gpu, s_gpu, u_gpu
    elif jobu == 'N' and jobvt != 'N':
        return vh_gpu, s_gpu
    elif jobu != 'N' and jobvt == 'N':
        return s_gpu, u_gpu
    else:
        return s_gpu
Esempio n. 20
0
def cho_factor(a_gpu, uplo='L'):
    """
    Cholesky factorisation

    Performs an in-place cholesky factorisation on the matrix 'a'
    such that a = x*x.T or x.T*x, if the lower='L' or upper='U'
    triangle of 'a' is used, respectively.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, m)` to decompose (modified in place).
    uplo: use the upper='U' or lower='L' (default) triangle of 'a'

    Returns
    -------
    a: Cholesky factorised matrix

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[3.0,0.0],[0.0,7.0]])
    >>> a = np.asarray(a, np.float64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> cho_factor(a_gpu)

    """

    if not _has_cula:
        # Fixed: was `NotImplementError`, an undefined name that would
        # have raised NameError instead of the intended exception.
        raise NotImplementedError('CULA not installed')

    if cula._libcula_toolkit != 'standard':
        raise ValueError('Cholesky factorisation not included in CULA Dense Free version')

    # Select the potrf routine matching the array dtype.
    # Fixed: the original used `if` instead of `elif` on the complex128
    # test, so float32/complex64 inputs fell through to the final
    # `else` and incorrectly raised 'unsupported type'.
    data_type = a_gpu.dtype.type
    if data_type == np.complex64:
        cula_func = cula._libcula.culaDeviceCpotrf
    elif data_type == np.float32:
        cula_func = cula._libcula.culaDeviceSpotrf
    elif data_type == np.complex128:
        cula_func = cula._libcula.culaDeviceZpotrf
    elif data_type == np.float64:
        cula_func = cula._libcula.culaDeviceDpotrf
    else:
        raise ValueError('unsupported type')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed; either
    # way, only square matrices are accepted:
    n, m = a_gpu.shape
    if n != m:
        raise ValueError('Matrix must be symmetric positive-definite')

    # Set the leading dimension of the input matrix:
    lda = max(1, m)

    status = cula_func(uplo, n, int(a_gpu.gpudata), lda)
    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()
Esempio n. 21
0
def cho_solve(a_gpu, b_gpu, uplo='L'):
    """
    Cholesky solver

    Solve a system of equations via cholesky factorisation,
    i.e. a*x = b.
    Overwrites 'b' to give 'inv(a)*b', and overwrites the chosen triangle
    of 'a' with factorised triangle

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, m)` to decompose.
    b : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, 1)` to decompose.
    uplo: use the upper='U' or lower='L' (default) triangle of 'a'

    Returns
    -------
    a: Cholesky factorised matrix

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.array([[3.0,0.0],[0.0,7.0]])
    >>> a = np.asarray(a, np.float64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> b = np.array([11.,19.])
    >>> b = np.asarray(b, np.float64)
    >>> b_gpu  = gpuarray.to_gpu(b)
    >>> cho_solve(a_gpu,b_gpu)

    """

    if not _has_cula:
        # Fixed: was `NotImplementError`, an undefined name that would
        # have raised NameError instead of the intended exception.
        raise NotImplementedError('CULA not installed')

    if cula._libcula_toolkit != 'standard':
        raise ValueError('Cholesky factorisation not included in CULA Dense Free version')

    # Select the posv (factor-and-solve) routine matching the dtype.
    # Fixed two bugs here: (1) the original selected culaDevice*potrf,
    # a 4-argument factorization-only routine, yet invoked it with the
    # 7-argument posv signature (uplo, n, nrhs, a, lda, b, ldb) used
    # below; (2) the complex128 test used `if` instead of `elif`, so
    # float32/complex64 inputs fell through to 'unsupported type'.
    data_type = a_gpu.dtype.type
    if data_type == np.complex64:
        cula_func = cula._libcula.culaDeviceCposv
    elif data_type == np.float32:
        cula_func = cula._libcula.culaDeviceSposv
    elif data_type == np.complex128:
        cula_func = cula._libcula.culaDeviceZposv
    elif data_type == np.float64:
        cula_func = cula._libcula.culaDeviceDposv
    else:
        raise ValueError('unsupported type')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed; either
    # way, only square matrices are accepted:
    na, ma = a_gpu.shape
    if na != ma:
        raise ValueError('Matrix must be symmetric positive-definite')

    # Set the leading dimensions of the input matrices:
    lda = max(1, ma)
    ldb = lda

    # Assuming we are only solving for a vector. Hence, nrhs = 1
    status = cula_func(uplo, na, 1, int(a_gpu.gpudata), lda, int(b_gpu.gpudata), ldb)
    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()
Esempio n. 22
0
def svd(a_gpu, jobu='A', jobvt='A'):
    """
    Singular Value Decomposition.

    Factors the matrix `a` into two unitary matrices, `u` and `vh`,
    and a 1-dimensional array of real, non-negative singular values,
    `s`, such that `a == dot(u.T, dot(diag(s), vh.T))`.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)` to decompose.
    jobu : {'A', 'S', 'O', 'N'}
        If 'A', return the full `u` matrix with shape `(m, m)`.
        If 'S', return the `u` matrix with shape `(m, k)`.
        If 'O', return the `u` matrix with shape `(m, k) without
        allocating a new matrix.
        If 'N', don't return `u`.
    jobvt : {'A', 'S', 'O', 'N'}
        If 'A', return the full `vh` matrix with shape `(n, n)`.
        If 'S', return the `vh` matrix with shape `(k, n)`.
        If 'O', return the `vh` matrix with shape `(k, n) without
        allocating a new matrix.
        If 'N', don't return `vh`.

    Returns
    -------
    u : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(m, m)` or `(m, k)` depending on
        value of `jobu`.
    s : pycuda.gpuarray.GPUArray
        Array containing the singular values, sorted such that `s[i] >= s[i+1]`.
        `s` is of length `min(m, n)`.
    vh : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(n, n)` or `(k, n)`, depending
        on `jobvt`.

    Notes
    -----
    Double precision is only supported if the standard version of the
    CULA Dense toolkit is installed.

    This function destroys the contents of the input matrix regardless
    of the values of `jobu` and `jobvt`.

    Only one of `jobu` or `jobvt` may be set to `O`, and then only for
    a square matrix.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> a = np.random.randn(9, 6) + 1j*np.random.randn(9, 6)
    >>> a = np.asarray(a, np.complex64)
    >>> a_gpu = gpuarray.to_gpu(a)
    >>> u_gpu, s_gpu, vh_gpu = linalg.svd(a_gpu, 'S', 'S')
    >>> np.allclose(a, np.dot(u_gpu.get(), np.dot(np.diag(s_gpu.get()), vh_gpu.get())), 1e-4)
    True

    """

    if not _has_cula:
        # Fixed: was `NotImplementError`, an undefined name that would
        # have raised NameError instead of the intended exception.
        raise NotImplementedError('CULA not installed')

    # The free version of CULA only supports single precision floating
    # point numbers:
    data_type = a_gpu.dtype.type
    real_type = np.float32
    if data_type == np.complex64:
        cula_func = cula._libcula.culaDeviceCgesvd
    elif data_type == np.float32:
        cula_func = cula._libcula.culaDeviceSgesvd
    else:
        if cula._libcula_toolkit == 'standard':
            if data_type == np.complex128:
                cula_func = cula._libcula.culaDeviceZgesvd
            elif data_type == np.float64:
                cula_func = cula._libcula.culaDeviceDgesvd
            else:
                raise ValueError('unsupported type')
            real_type = np.float64
        else:
            raise ValueError('double precision not supported')

    # Since CUDA assumes that arrays are stored in column-major
    # format, the input matrix is assumed to be transposed:
    n, m = a_gpu.shape
    square = (n == m)

    # Since the input matrix is transposed, jobu and jobvt must also
    # be switched because the computed matrices will be returned in
    # reversed order:
    jobvt, jobu = jobu, jobvt

    # Set the leading dimension of the input matrix:
    lda = max(1, m)

    # Allocate the array of singular values:
    s_gpu = gpuarray.empty(min(m, n), real_type)

    # Set the leading dimension and allocate u:
    jobu = upper(jobu)
    jobvt = upper(jobvt)
    ldu = m
    if jobu == 'A':
        u_gpu = gpuarray.empty((ldu, m), data_type)
    elif jobu == 'S':
        u_gpu = gpuarray.empty((min(m, n), ldu), data_type)
    elif jobu == 'O':
        if not square:
            raise ValueError('in-place computation of singular vectors '+
                             'of non-square matrix not allowed')
        ldu = 1
        u_gpu = a_gpu
    else:
        ldu = 1
        u_gpu = gpuarray.empty((), data_type)

    # Set the leading dimension and allocate vh:
    if jobvt == 'A':
        ldvt = n
        vh_gpu = gpuarray.empty((n, n), data_type)
    elif jobvt == 'S':
        ldvt = min(m, n)
        vh_gpu = gpuarray.empty((n, ldvt), data_type)
    elif jobvt == 'O':
        if jobu == 'O':
            raise ValueError('jobu and jobvt cannot both be O')
        if not square:
            raise ValueError('in-place computation of singular vectors '+
                             'of non-square matrix not allowed')
        ldvt = 1
        vh_gpu = a_gpu
    else:
        ldvt = 1
        vh_gpu = gpuarray.empty((), data_type)

    # Compute SVD and check error status:

    status = cula_func(jobu, jobvt, m, n, int(a_gpu.gpudata),
                       lda, int(s_gpu.gpudata), int(u_gpu.gpudata),
                       ldu, int(vh_gpu.gpudata), ldvt)

    cula.culaCheckStatus(status)

    # Free internal CULA memory:
    cula.culaFreeBuffers()

    # Since the input is assumed to be transposed, it is necessary to
    # return the computed matrices in reverse order:
    if jobu in ['A', 'S', 'O'] and jobvt in ['A', 'S', 'O']:
        return vh_gpu, s_gpu, u_gpu
    elif jobu == 'N' and jobvt != 'N':
        return vh_gpu, s_gpu
    elif jobu != 'N' and jobvt == 'N':
        return s_gpu, u_gpu
    else:
        return s_gpu
Esempio n. 23
0
def svd(a_gpu, full_matrices=1, compute_uv=1):
    """
    Singular Value Decomposition.

    Factors the matrix `a` into two unitary matrices, `u` and `vh`,
    and a 1-dimensional array of real, non-negative singular values,
    `s`, such that `a == dot(u.T, dot(diag(s), vh.T))`.

    Parameters
    ----------
    a : pycuda.gpuarray.GPUArray
        Input matrix of shape `(m, n)` to decompose.
    full_matrices : bool, optional
        If True (default), `u` and `vh` have shapes `(m, m)` and
        `(n, n)`; otherwise `(m, k)` and `(k, n)` with `k = min(m, n)`.
    compute_uv : bool, optional
        If True (default), compute `u` and `vh` in addition to `s`.

    Returns
    -------
    u : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(m, m)` or `(m, k)` (only returned
        when `compute_uv` is set).
    s : pycuda.gpuarray.GPUArray
        Singular values of length `min(m, n)`, sorted so that
        `s[i] >= s[i+1]`.
    vh : pycuda.gpuarray.GPUArray
        Unitary matrix of shape `(n, n)` or `(k, n)` (only returned
        when `compute_uv` is set).

    Notes
    -----
    Only single precision (float32/complex64) is handled here; the
    contents of the input matrix are destroyed.

    """

    # Pick the CULA gesvd routine for the array's dtype; only the
    # single-precision routines are wired up here.
    if a_gpu.dtype == np.complex64:
        gesvd = cula._libcula.culaDeviceCgesvd
    elif a_gpu.dtype == np.float32:
        gesvd = cula._libcula.culaDeviceSgesvd
    else:
        raise ValueError('unsupported type')

    # CUDA stores arrays column-major, so the device matrix is the
    # transpose of the numpy shape:
    n, m = a_gpu.shape

    # Leading dimension of the (transposed) input:
    lda = max(1, m)

    # Singular values are always real:
    sigma_gpu = gpuarray.empty(min(m, n), np.dtype(np.float32))

    # Translate the boolean flags into LAPACK-style job codes:
    if not compute_uv:
        jobu = jobvt = 'N'
    elif full_matrices:
        jobu = jobvt = 'A'
    else:
        jobu = jobvt = 'S'

    # Allocate the transposed U and fix its leading dimension:
    if jobu == 'A':
        ldu = m
        u_gpu = gpuarray.empty((m, m), a_gpu.dtype)
    elif jobu == 'S':
        ldu = m
        u_gpu = gpuarray.empty((min(m, n), m), a_gpu.dtype)
    else:
        ldu = 1
        u_gpu = gpuarray.empty((1, 1), a_gpu.dtype)

    # Allocate the transposed VT and fix its leading dimension:
    if jobvt == 'A':
        ldvt = n
        vt_gpu = gpuarray.empty((n, n), a_gpu.dtype)
    elif jobvt == 'S':
        ldvt = min(m, n)
        vt_gpu = gpuarray.empty((n, ldvt), a_gpu.dtype)
    else:
        ldvt = 1
        vt_gpu = gpuarray.empty((1, 1), a_gpu.dtype)

    # Run the decomposition and check the status code:
    status = gesvd(jobu, jobvt, m, n, int(a_gpu.gpudata),
                   lda, int(sigma_gpu.gpudata), int(u_gpu.gpudata),
                   ldu, int(vt_gpu.gpudata), ldvt)
    cula.culaCheckStatus(status)

    # Because the device matrix is transposed, the roles of U and VT
    # are swapped in the returned tuple:
    if compute_uv:
        return vt_gpu, sigma_gpu, u_gpu
    return sigma_gpu