Example #1
    def setUp(self):
        self.dev = cupy.cuda.runtime.getDevice()
        assert self.dev != 1

        global _test_cache_dir
        _test_cache_dir = tempfile.mkdtemp()
        os.environ['CUPY_CACHE_DIR'] = _test_cache_dir

        self.kern = cupy.RawKernel(_test_source1,
                                   'test_sum',
                                   backend=self.backend)
        self.mod2 = cupy.RawModule(code=_test_source2, backend=self.backend)
        self.mod3 = cupy.RawModule(code=_test_source3,
                                   options=('-DPRECISION=2', ),
                                   backend=self.backend)
Example #2
def _lombscargle(x, y, freqs, pgram, y_dot):

    if pgram.dtype == 'float32':
        c_type = "float"
    elif pgram.dtype == 'float64':
        c_type = "double"
    else:
        raise ValueError("pgram.dtype must be float32 or float64, "
                         "got {}".format(pgram.dtype))

    device_id = cp.cuda.Device()
    numSM = device_id.attributes["MultiProcessorCount"]
    threadsperblock = (128, )
    blockspergrid = (numSM * 20, )

    src = _cupy_lombscargle_src.substitute(datatype=c_type)
    module = cp.RawModule(code=src, options=("-std=c++11", ))
    kernel = module.get_function("_cupy_lombscargle")
    # print("Registers", kernel.num_regs)

    kernel_args = (
        x.shape[0],
        freqs.shape[0],
        x,
        y,
        freqs,
        pgram,
        y_dot,
    )

    kernel(blockspergrid, threadsperblock, kernel_args)

    cp.cuda.runtime.deviceSynchronize()
Example #3
def GetModule(source, cuoptions):
    """Return a CuPy raw module compiled from ``source`` with the given options."""
    if _cupy_has_RawModule():
        return cp.RawModule(code=source, options=cuoptions)
    else:
        return cp.core.core.compile_with_cache(source,
                                               options=cuoptions,
                                               prepend_cupy_headers=False)
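The `_cupy_has_RawModule` helper used above is not shown. A minimal sketch of what such a check could look like; the name and logic here are assumptions, based only on the fact that `cupy.RawModule` first appeared in CuPy 7.0 and that the `cp.core.core.compile_with_cache` branch targets older versions:

import cupy as cp

def _cupy_has_RawModule():
    # Hypothetical implementation: treat any CuPy that exposes RawModule
    # (i.e. CuPy >= 7.0) as new enough for the RawModule code path.
    return hasattr(cp, "RawModule")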
Example #4
    def compile(self, kernel_instance):
        """call the CUDA compiler to compile the kernel, return the device function

        :param kernel_name: The name of the kernel to be compiled, used to lookup the
            function after compilation.
        :type kernel_name: string

        :param kernel_string: The CUDA kernel code that contains the function `kernel_name`
        :type kernel_string: string

        :returns: An CUDA kernel that can be called directly.
        :rtype: cupy.RawKernel
        """
        kernel_string = kernel_instance.kernel_string
        kernel_name = kernel_instance.name

        compiler_options = self.compiler_options
        if not any(['--std=' in opt for opt in self.compiler_options]):
            compiler_options = ['--std=c++11'] + self.compiler_options

        options = tuple(compiler_options)

        self.current_module = cp.RawModule(code=kernel_string,
                                           options=options,
                                           name_expressions=[kernel_name])

        self.func = self.current_module.get_function(kernel_name)
        return self.func
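A self-contained sketch of the same pattern outside the class, using a hypothetical `vector_add` kernel and a stand-in object for the kernel instance (none of the names or source below are taken from the code above):

import cupy as cp
from types import SimpleNamespace

# Stand-in for kernel_instance: only .name and .kernel_string are used here.
instance = SimpleNamespace(name='vector_add', kernel_string=r'''
__global__ void vector_add(const float* a, const float* b, float* c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}
''')

module = cp.RawModule(code=instance.kernel_string,
                      options=('--std=c++11',),
                      name_expressions=[instance.name])
func = module.get_function(instance.name)  # resolves the C++-mangled name

n = 1024
a = cp.random.rand(n, dtype=cp.float32)
b = cp.random.rand(n, dtype=cp.float32)
c = cp.empty_like(a)
func(((n + 127) // 128,), (128,), (a, b, c, cp.int32(n)))
assert cp.allclose(c, a + b)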
Example #5
def _populate_kernel_cache():
    for numba_type, c_type in _SUPPORTED_TYPES.items():
        # JIT compile the numba kernels, flip = 0/1 (correlate/convolve)
        sig = _numba_convolve_2d_signature(numba_type)
        _numba_kernel_cache[(0, str(numba_type))] = cuda.jit(
            sig, fastmath=True
        )(_numba_correlate_2d)
        _numba_kernel_cache[(1, str(numba_type))] = cuda.jit(
            sig, fastmath=True
        )(_numba_convolve_2d)

        # Instantiate the cupy kernel for this type and compile
        if isinstance(numba_type, Complex):
            header = "#include <cupy/complex.cuh>"
        else:
            header = ""
        src = loaded_from_source.substitute(datatype=c_type, header=header)
        module2 = cp.RawModule(
            code=src, options=("-std=c++11", "-use_fast_math")
        )
        _cupy_kernel_cache[(0, str(numba_type))] = module2.get_function(
            "_cupy_correlate_2d"
        )
        _cupy_kernel_cache[(1, str(numba_type))] = module2.get_function(
            "_cupy_convolve_2d"
        )
Example #6
    def test_invalid_compiler_flag(self):
        with pytest.raises(cupy.cuda.compiler.CompileException) as ex:
            mod = cupy.RawModule(code=_test_source3,
                                 options=('-DPRECISION=3', ),
                                 backend=self.backend)
            mod.get_function('test_sum')  # enforce compilation
        assert 'precision not supported' in str(ex.value)
Example #7
def main():
    N = 8
    module = cupy.RawModule(code=code,
                            options=('-std=c++11', ),
                            name_expressions=('kernel<float>',
                                              'kernel<double>'))

    # The kernel computes out = A*B+C where A, B and C are 4x4 matrices.
    # A and B are arrays of N such matrices; C is passed to the kernel by value
    # as a single matrix, built below from a NumPy structured dtype.

    for (ctype, dtype) in zip(('float', 'double'),
                              (numpy.float32, numpy.float64)):

        A = cupy.random.rand(16 * N, dtype=dtype).reshape(N, 4, 4)
        B = cupy.random.rand(16 * N, dtype=dtype).reshape(N, 4, 4)
        C = numpy.random.rand(16).astype(dtype).reshape(4, 4)
        out = cupy.empty_like(A)

        Matrix = numpy.dtype({
            'names': ['value'],
            'formats': [(dtype, (4, 4))]
        })

        kernel = module.get_function('kernel<{}>'.format(ctype))
        args = (A, B, C.ravel().view(Matrix), out)
        kernel((1, ), (N, ), args)

        expected = cupy.matmul(A, B) + cupy.asarray(C[None, :, :])

        cupy.testing.assert_array_almost_equal(expected, out)
        print("Kernel output matches expected value for "
              "type '{}'.".format(ctype))
Example #8
    def test_context_switch_RawModule6(self):
        # run test_template_specialization() on another device
        # in this test, re-compiling happens at kernel launch
        if self.backend == 'nvcc':
            self.skipTest('nvcc does not support template specialization')

        # compile code
        name_expressions = ['my_sqrt<unsigned int>']
        name = name_expressions[0]
        with cupy.cuda.Device(0):
            mod = cupy.RawModule(code=test_cxx_template,
                                 options=('--std=c++11', ),
                                 name_expressions=name_expressions)

            # get specialized kernels
            ker = mod.get_function(name)

        # switch device
        with cupy.cuda.Device(1):
            # prepare inputs & expected outputs
            in_arr = cupy.testing.shaped_random((10, ), dtype=cupy.uint32)
            out_arr = in_arr**2

            # run
            ker((1, ), (10, ), (in_arr, 10))

            # check results
            assert cupy.allclose(in_arr, out_arr)
Example #9
def _lombscargle(x, y, freqs, pgram, y_dot):

    if str(pgram.dtype) in _kernel_cache:
        kernel = _kernel_cache[str(pgram.dtype)]
    else:
        module = cp.RawModule(code=_cupy_lombscargle_src,
                              options=("-std=c++11", ))
        kernel = _kernel_cache[str(pgram.dtype)] = module.get_function(
            "_cupy_lombscargle_" + str(pgram.dtype))
        print("Registers", kernel.num_regs)

    device_id = cp.cuda.Device()
    numSM = device_id.attributes["MultiProcessorCount"]
    threadsperblock = (128, )
    blockspergrid = (numSM * 20, )

    kernel_args = (
        x.shape[0],
        freqs.shape[0],
        x,
        y,
        freqs,
        pgram,
        y_dot,
    )

    kernel(blockspergrid, threadsperblock, kernel_args)

    cp.cuda.runtime.deviceSynchronize()
Example #10
def _populate_kernel_cache(np_type, blocks, dim_x, dim_z, dim_u, max_tpb):

    # Check if np_type is a supported option
    if np_type not in _SUPPORTED_TYPES:
        raise ValueError(
            "Datatype {} not found for Kalman Filter".format(np_type))

    if np_type == "float32":
        c_type = "float"
    else:
        c_type = "double"

    # Check CuPy version
    # Update to only check for v8.X in cuSignal 0.16
    # Instantiate the cupy kernel for this type and compile
    specializations = (
        "_cupy_predict<{}, {}, {}, {}, {}>".format(c_type, blocks, dim_x,
                                                   dim_u, max_tpb),
        "_cupy_update<{}, {}, {}, {}, {}>".format(c_type, blocks, dim_x, dim_z,
                                                  max_tpb),
    )
    module = cp.RawModule(
        code=cuda_code_kalman,
        options=(
            "-std=c++11",
            "-fmad=true",
        ),
        name_expressions=specializations,
    )

    _cupy_kernel_cache[(str(np_type),
                        "predict")] = module.get_function(specializations[0])

    _cupy_kernel_cache[(str(np_type),
                        "update")] = module.get_function(specializations[1])
Example #11
    def test_template_specialization(self):
        if self.backend == 'nvcc':
            self.skipTest('nvcc does not support template specialization')

        # compile code
        name_expressions = [
            'my_sqrt<int>', 'my_sqrt<float>', 'my_sqrt<complex<double>>',
            'my_func'
        ]
        mod = cupy.RawModule(code=test_cxx_template,
                             options=('--std=c++11', ),
                             name_expressions=name_expressions)

        dtypes = (cupy.int32, cupy.float32, cupy.complex128, cupy.float64)
        for ker_T, dtype in zip(name_expressions, dtypes):
            # get specialized kernels
            ker = mod.get_function(ker_T)

            # prepare inputs & expected outputs
            in_arr = cupy.testing.shaped_random((10, ), dtype=dtype)
            out_arr = in_arr**2

            # run
            ker((1, ), (10, ), (in_arr, 10))

            # check results
            assert cupy.allclose(in_arr, out_arr)
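The `test_cxx_template` source is not shown here. The sketch below is inferred from the assertions (each kernel squares its input in place, `my_func` covers the non-templated float64 case, and the complex specialization needs CuPy's complex header); the actual test source may differ:

test_cxx_template = r'''
// inferred sketch, not the original source
#include <cupy/complex.cuh>

template<typename T>
__global__ void my_sqrt(T* x, int n) {
    // despite the name, the expected result is x[i] * x[i] (see out_arr above)
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= x[i];
    }
}

__global__ void my_func(double* x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= x[i];
    }
}
'''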
Example #12
    def test_cuFloatComplex(self):
        N = 100
        block = 32
        grid = (N + block - 1) // block
        dtype = cupy.complex64

        mod = cupy.RawModule(code=_test_cuComplex, translate_cucomplex=True)
        a = cupy.random.random((N, )) + 1j * cupy.random.random((N, ))
        a = a.astype(dtype)
        b = cupy.random.random((N, )) + 1j * cupy.random.random((N, ))
        b = b.astype(dtype)
        c = cupy.random.random((N, )) + 1j * cupy.random.random((N, ))
        c = c.astype(dtype)
        out = cupy.zeros((N, ), dtype=dtype)
        out_float = cupy.zeros((N, ), dtype=cupy.float32)
        out_up = cupy.zeros((N, ), dtype=cupy.complex128)

        ker = mod.get_function('test_addf')
        ker((grid, ), (block, ), (a, b, out))
        assert (out == a + b).all()

        ker = mod.get_function('test_subf')
        ker((grid, ), (block, ), (a, b, out))
        assert (out == a - b).all()

        ker = mod.get_function('test_mulf')
        ker((grid, ), (block, ), (a, b, out))
        assert cupy.allclose(out, a * b)

        ker = mod.get_function('test_divf')
        ker((grid, ), (block, ), (a, b, out))
        assert (out == a / b).all()

        ker = mod.get_function('test_conjf')
        ker((grid, ), (block, ), (a, out))
        assert (out == cupy.conj(a)).all()

        ker = mod.get_function('test_absf')
        ker((grid, ), (block, ), (a, out_float))
        assert (out_float == cupy.abs(a)).all()

        ker = mod.get_function('test_fmaf')
        ker((grid, ), (block, ), (a, b, c, out))
        assert cupy.allclose(out, a * b + c)

        ker = mod.get_function('test_makef')
        ker((grid, ), (block, ), (out, ))
        # because of precision issues, an exact (A == B).all() comparison could fail
        assert cupy.allclose(out, 1.8 - 1j * 8.7)

        ker = mod.get_function('test_upcast')
        ker((grid, ), (block, ), (a, out_up))
        assert (out_up == a.astype(cupy.complex128)).all()

        # NumPy scalars.
        b = cupy.complex64(2 + 3j)
        ker = mod.get_function('test_addf_scalar')
        ker((grid, ), (block, ), (a, b, out))
        assert (out == a + b).all()
Example #13
def _slic_cupy(
    image,
    n_features,
    n_centers,
    max_iter,
    template,
    center_block,
    center_grid,
    image_block,
    image_grid,
):

    import cupy as cp

    template = 'extern "C" { ' + template + " }"
    module = cp.RawModule(code=template, options=("-std=c++11", ))
    gpu_slic_init = module.get_function("init_clusters")
    gpu_slic_expectation = module.get_function("expectation")
    gpu_slic_maximization = module.get_function("maximization")

    data_gpu = cp.asarray(image)
    centers_gpu = cp.zeros((n_centers, n_features + 3), dtype=cp.float32)
    labels_gpu = cp.zeros(image.shape[:3], dtype=cp.uint32)

    gpu_slic_init(
        center_grid,
        center_block,
        (
            data_gpu,
            centers_gpu,
        ),
    )
    cp.cuda.runtime.deviceSynchronize()

    for _ in range(max_iter):
        gpu_slic_expectation(
            image_grid,
            image_block,
            (
                data_gpu,
                centers_gpu,
                labels_gpu,
            ),
        )
        cp.cuda.runtime.deviceSynchronize()

        gpu_slic_maximization(
            center_grid,
            center_block,
            (
                data_gpu,
                labels_gpu,
                centers_gpu,
            ),
        )
        cp.cuda.runtime.deviceSynchronize()

    labels = np.asarray(labels_gpu.get(), dtype=np.intp)
    return labels
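A note on the extern "C" wrapping above: nvrtc compiles the source as C++, so without C linkage (or name_expressions) the kernel names would be mangled and get_function("init_clusters") could not find them. A tiny self-contained illustration with a hypothetical `scale` kernel:

import cupy as cp

src = r'''
extern "C" __global__ void scale(float* x, int n, float s) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= s;
}
'''
mod = cp.RawModule(code=src, options=("-std=c++11", ))
scale = mod.get_function("scale")  # plain (unmangled) name lookup

x = cp.arange(8, dtype=cp.float32)
scale((1, ), (8, ), (x, cp.int32(8), cp.float32(2.0)))
assert cp.allclose(x, 2 * cp.arange(8))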
Example #14
def _get_function(fatbin, func):

    dir = os.path.dirname(Path(__file__).parent)

    module = cp.RawModule(
        path=dir + fatbin,
    )
    return module.get_function(func)
Example #15
def load_cupy_module(fname, **kwargs):
    try:
        fname = str((Path(__file__).parent / fname).resolve())
    except Exception:
        pass
    with open(fname) as f:
        code = f.read()
    macros = ['#define {!s} {!s}'.format(k, v) for k,v in kwargs.items()]
    code = '\n'.join(macros + [code])
    return cp.RawModule(code=code)
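Hypothetical usage of the helper above ('kernels.cu', the macro names, and 'my_kernel' are stand-ins, not part of the original code): the keyword arguments become #define lines prepended to the file contents before compilation.

# Prepends:
#   #define BLOCK_SIZE 128
#   #define USE_DOUBLE 1
# to the contents of kernels.cu, then compiles the result.
mod = load_cupy_module('kernels.cu', BLOCK_SIZE=128, USE_DOUBLE=1)
ker = mod.get_function('my_kernel')  # assumes an extern "C" kernel of this name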
Example #16
    def test_module_load_failure(self):
        # in principle this test is better done in test_driver.py, but
        # this error is more likely to appear when using RawModule, so
        # let us do it here
        with pytest.raises(cupy.cuda.driver.CUDADriverError) as ex:
            cupy.RawModule(os.path.expanduser('~/this_does_not_exist.cubin'),
                           backend=self.backend)
        assert 'CUDA_ERROR_FILE_NOT_FOUND' in str(ex.value)
Example #17
def SART2D(p, sp, order, x0):
    # x0 = xinit.copy()
    block1D = (8, 1)
    grid1D = ((sp['nBins'] + block1D[0] - 1) // block1D[0], 1)
    block2D = (8, 8)
    grid2D = ((sp['nWidth'] + block2D[0] - 1) // block2D[0],
              (sp['nHeight'] + block2D[1] - 1) // block2D[1])
    mod = cupy.RawModule(code=source_texref)
    fGetResiduals = mod.get_function('fGetResiduals')
    AssignResidualError = mod.get_function('AssignResidualError_kernel')

    channelDescImg = ChannelFormatDescriptor(
        32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
    cuArrayImg = CUDAarray(channelDescImg, sp['nWidth'], sp['nHeight'])
    resourceDescImg = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                         cuArrayImg)
    address_modeImg = (runtime.cudaAddressModeClamp,
                       runtime.cudaAddressModeClamp)
    texDescImg = TextureDescriptor(address_modeImg,
                                   runtime.cudaFilterModePoint,
                                   runtime.cudaReadModeElementType)

    # 1D texture
    channelDesc1D = ChannelFormatDescriptor(32, 0, 0, 0,
                                            runtime.cudaChannelFormatKindFloat)
    cuArray1D = CUDAarray(channelDesc1D, sp['nBins'])
    resourceDesc1D = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                        cuArray1D)
    address_mode1D = (runtime.cudaAddressModeClamp,
                      runtime.cudaAddressModeClamp)
    texDesc1D = TextureDescriptor(address_mode1D, runtime.cudaFilterModePoint,
                                  runtime.cudaReadModeElementType)
    d_fResidualsData = cupy.zeros(sp['nBins'], cupy.float32)

    for v in range(sp['nViews']):
        # print('{}\n'.format(v))
        nView = order[v]
        fLambda = sp['fRotateDir'] * 2.0 * np.pi / float(
            sp['nNumAngle']) * float(nView + sp['nStartAngle'])
        fCosLambda = np.cos(fLambda)
        fSinLambda = np.sin(fLambda)
        cuArrayImg.copy_from(x0)
        TextureReference(mod.get_texref('texImage'), resourceDescImg,
                         texDescImg)
        getErrArgs = (d_fResidualsData, p, sp['nBins'], sp['fSod'], sp['fOdd'],
                      sp['fCellSize'], sp['fPixelSize'], sp['fFovRadius'],
                      fCosLambda, fSinLambda, nView, sp['fOffSet'],
                      sp['fAngleOfSlope'])
        fGetResiduals(grid1D, block1D, getErrArgs)
        cuArray1D.copy_from(d_fResidualsData)
        TextureReference(mod.get_texref('texFP'), resourceDesc1D, texDesc1D)
        AssignResidualErrorArgs = (x0, sp['nWidth'], sp['nHeight'],
                                   sp['nBins'], sp['fSod'], sp['fOdd'],
                                   sp['fCellSize'], sp['fPixelSize'],
                                   fCosLambda, fSinLambda, sp['fOffSet'],
                                   sp['fAngleOfSlope'], sp['relax_factor'])
        AssignResidualError(grid2D, block2D, AssignResidualErrorArgs)
    return x0
Example #18
def SART2DBackWard(grad, order, sp):
    grad_ = grad.copy()
    block1D = (8, 1)
    grid1D = ((sp['nBins'] + block1D[0] - 1) // block1D[0], 1)
    block2D = (8, 8)
    grid2D = ((sp['nWidth'] + block2D[0] - 1) // block2D[0],
              (sp['nHeight'] + block2D[1] - 1) // block2D[1])
    mod = cupy.RawModule(code=source_texref)
    AssignResidualError = mod.get_function('AssignResidualError_kernel')
    FpKernel = mod.get_function('fGetFp_kernel')
    # 2D texture
    channelDescImg = ChannelFormatDescriptor(
        32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
    cuArrayImg = CUDAarray(channelDescImg, sp['nWidth'], sp['nHeight'])
    resourceDescImg = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                         cuArrayImg)
    address_modeImg = (runtime.cudaAddressModeClamp,
                       runtime.cudaAddressModeClamp)
    texDescImg = TextureDescriptor(address_modeImg,
                                   runtime.cudaFilterModePoint,
                                   runtime.cudaReadModeElementType)
    # 1D texture
    channelDesc1D = ChannelFormatDescriptor(32, 0, 0, 0,
                                            runtime.cudaChannelFormatKindFloat)
    cuArray1D = CUDAarray(channelDesc1D, sp['nBins'])
    resourceDesc1D = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                        cuArray1D)
    address_mode1D = (runtime.cudaAddressModeClamp,
                      runtime.cudaAddressModeClamp)
    texDesc1D = TextureDescriptor(address_mode1D, runtime.cudaFilterModePoint,
                                  runtime.cudaReadModeElementType)
    d_fOneProj = cupy.zeros(sp['nBins'], cupy.float32)
    for v in range(sp['nViews']):
        nView = order[sp['nViews'] - 1 - v]
        fLambda = sp['fRotateDir'] * 2.0 * np.pi / float(
            sp['nNumAngle']) * float(nView + sp['nStartAngle'])
        fCosLambda = np.cos(fLambda)
        fSinLambda = np.sin(fLambda)
        # A*x
        cuArrayImg.copy_from(grad)
        TextureReference(mod.get_texref('texImage'), resourceDescImg,
                         texDescImg)
        args = (d_fOneProj, sp['nBins'], sp['fSod'], sp['fOdd'],
                sp['fCellSize'], sp['fPixelSize'], sp['fFovRadius'],
                fCosLambda, fSinLambda, nView, sp['fOffSet'],
                sp['fAngleOfSlope'])
        FpKernel(grid1D, block1D, args)
        # AT*A*x
        cuArray1D.copy_from(d_fOneProj)
        TextureReference(mod.get_texref('texFP'), resourceDesc1D, texDesc1D)
        AssignResidualErrorArgs = (grad, sp['nWidth'], sp['nHeight'],
                                   sp['nBins'], sp['fSod'], sp['fOdd'],
                                   sp['fCellSize'], sp['fPixelSize'],
                                   fCosLambda, fSinLambda, sp['fOffSet'],
                                   sp['fAngleOfSlope'], sp['relax_factor'])
        AssignResidualError(grid2D, block2D, AssignResidualErrorArgs)
        grad = grad_ - sp['relax_factor'] * grad
    return grad
Example #19
    def test_load_ptx(self):
        # generate ptx in the temp dir
        file_path = self._generate_file('ptx')

        # load ptx and test the kernel
        mod = cupy.RawModule(path=file_path, backend=self.backend)
        ker = mod.get_function('test_div')
        x1, x2, y = self._helper(ker, cupy.float32)
        assert cupy.allclose(y, x1 / (x2 + 1.0))
Example #20
    def __init__(self, lower, upper, radius):
        self.lower = lower
        self.upper = upper
        self.cuda_kernel = self.compile_kernel(lower, upper)
        self.cuda_module = cp.RawModule(
            code=open("./opsi/util/cv/cuda/boxblur.cu").read())
        self.apply_filter = self.cuda_module.get_function("applyFilter")

        self.update_radius(radius)
Example #21
    def __init__(self, dtype):

        # Load the CUDA kernel through cupy
        loaded_from_source = _load_cuda_kernel(dtype)
        median_filter_module = cp.RawModule(code=loaded_from_source)
        self.single_image_median_filter = median_filter_module.get_function(
            "two_dimensional_median_filter")

        # Warm up the CUDA functions
        self._warm_up(dtype)
Example #22
    def setUp(self):
        if hasattr(self, 'clean_up'):
            util.clear_memo()
        self.dev = cupy.cuda.runtime.getDevice()
        assert self.dev != 1

        self.temporary_cache_dir_context = use_temporary_cache_dir()
        self.in_memory_context = compile_in_memory(self.in_memory)
        self.cache_dir = self.temporary_cache_dir_context.__enter__()
        self.in_memory_context.__enter__()

        self.kern = cupy.RawKernel(_test_source1,
                                   'test_sum',
                                   backend=self.backend)
        self.mod2 = cupy.RawModule(code=_test_source2, backend=self.backend)
        self.mod3 = cupy.RawModule(code=_test_source3,
                                   options=('-DPRECISION=2', ),
                                   backend=self.backend)
Example #23
    def test_invalid_compiler_flag(self):
        if cupy.cuda.runtime.is_hip and self.backend == 'nvrtc':
            self.skipTest('hiprtc does not handle #error macro properly')

        with pytest.raises(cupy.cuda.compiler.CompileException) as ex:
            mod = cupy.RawModule(code=_test_source3,
                                 options=('-DPRECISION=3',),
                                 backend=self.backend)
            mod.get_function('test_sum')  # enforce compilation
        assert 'precision not supported' in str(ex.value)
Example #24
    def test_const_memory(self):
        mod = cupy.RawModule(code=test_const_mem, backend=self.backend)
        ker = mod.get_function('multiply_by_const')
        mem_ptr = mod.get_global('some_array')
        const_arr = cupy.ndarray((100, ), cupy.float32, mem_ptr)
        data = cupy.arange(100, dtype=cupy.float32)
        const_arr[...] = data
        output_arr = cupy.ones(100, dtype=cupy.float32)
        ker((1, ), (100, ), (output_arr, cupy.int32(100)))
        assert (data == output_arr).all()
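The `test_const_mem` source is not shown. Below is a sketch inferred from the test body (a __constant__ array that is filled through get_global() and read by the kernel); the actual source may differ:

test_const_mem = r'''
// inferred sketch, not the original source
__constant__ float some_array[100];

extern "C" __global__ void multiply_by_const(float* x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= some_array[i];
    }
}
'''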
Example #25
    def test_module_load_failure(self):
        # in principle this test is better done in test_driver.py, but
        # this error is more likely to appear when using RawModule, so
        # let us do it here
        with pytest.raises(cupy.cuda.driver.CUDADriverError) as ex:
            mod = cupy.RawModule(
                path=os.path.expanduser('~/this_does_not_exist.cubin'),
                backend=self.backend)
            mod.get_function('nonexisting_kernel')  # enforce loading
        assert ('CUDA_ERROR_FILE_NOT_FOUND' in str(ex.value)  # CUDA
                or 'hipErrorFileNotFound' in str(ex.value))  # HIP
Example #26
    def test_cuDoubleComplex(self):
        N = 100
        block = 32
        grid = (N + block - 1) // block
        dtype = cupy.complex128

        mod = cupy.RawModule(
            code=_test_cuComplex,
            translate_cucomplex=True)
        a = cupy.random.random((N,)) + 1j*cupy.random.random((N,))
        a = a.astype(dtype)
        b = cupy.random.random((N,)) + 1j*cupy.random.random((N,))
        b = b.astype(dtype)
        c = cupy.random.random((N,)) + 1j*cupy.random.random((N,))
        c = c.astype(dtype)
        out = cupy.zeros((N,), dtype=dtype)
        out_float = cupy.zeros((N,), dtype=cupy.float64)
        out_down = cupy.zeros((N,), dtype=cupy.complex64)

        ker = mod.get_function('test_add')
        ker((grid,), (block,), (a, b, out))
        assert (out == a + b).all()

        ker = mod.get_function('test_sub')
        ker((grid,), (block,), (a, b, out))
        assert (out == a - b).all()

        ker = mod.get_function('test_mul')
        ker((grid,), (block,), (a, b, out))
        assert (out == a * b).all()

        ker = mod.get_function('test_div')
        ker((grid,), (block,), (a, b, out))
        assert (out == a / b).all()

        ker = mod.get_function('test_conj')
        ker((grid,), (block,), (a, out))
        assert (out == cupy.conj(a)).all()

        ker = mod.get_function('test_abs')
        ker((grid,), (block,), (a, out_float))
        assert (out_float == cupy.abs(a)).all()

        ker = mod.get_function('test_fma')
        ker((grid,), (block,), (a, b, c, out))
        assert (out == a * b + c).all()

        ker = mod.get_function('test_make')
        ker((grid,), (block,), (out,))
        assert (out == 1.8 - 1j * 8.7).all()

        ker = mod.get_function('test_downcast')
        ker((grid,), (block,), (a, out_down))
        assert (out_down == a.astype(cupy.complex64)).all()
Example #27
    def test_context_switch_RawModule4(self):
        # run test_load_cubin() on another device
        # generate cubin in the temp dir and load it on device 0
        file_path = self._generate_file('cubin')
        mod = cupy.RawModule(path=file_path, backend=self.backend)
        ker = mod.get_function('test_div')
        # in this test, reloading happens at kernel launch
        cupy.cuda.runtime.setDevice(1)

        x1, x2, y = self._helper(ker, cupy.float32)
        assert cupy.allclose(y, x1 / (x2 + 1.0))
Example #28
    def setUp(self):
        global _test_cache_dir
        _test_cache_dir = tempfile.mkdtemp()
        os.environ['CUPY_CACHE_DIR'] = _test_cache_dir

        self.kern_grid_sync = cupy.RawKernel(
            _test_grid_sync, 'test_grid_sync', backend='nvcc',
            enable_cooperative_groups=True)
        self.mod_grid_sync = cupy.RawModule(
            code=_test_grid_sync, backend='nvcc',
            enable_cooperative_groups=True)
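The `_test_grid_sync` source itself is not shown. Below is a hedged sketch (the kernel name, signature, and launch are illustrative, not the original test) of a kernel that genuinely needs enable_cooperative_groups=True: the second phase reads values written by other blocks, which is only safe after a grid-wide barrier.

import cupy

_grid_sync_sketch = r'''
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

extern "C" __global__
void reverse_add(const float* x, float* tmp, float* y, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) tmp[i] = x[i];
    cg::this_grid().sync();                  // wait for every block to finish phase 1
    if (i < n) y[i] = tmp[i] + tmp[n - 1 - i];
}
'''

ker = cupy.RawKernel(_grid_sync_sketch, 'reverse_add',
                     backend='nvcc', enable_cooperative_groups=True)
x = cupy.arange(64, dtype=cupy.float32)
tmp = cupy.empty_like(x)
y = cupy.empty_like(x)
ker((2, ), (32, ), (x, tmp, y, cupy.int32(64)))  # launched as a cooperative kernel
assert cupy.allclose(y, 63.0)                    # x[i] + x[63 - i] == 63 for all i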
Example #29
    def test_compile_module(self):
        module = cupy.RawModule(code=_test_compile_src,
                                backend=self.backend,
                                options=('-DOP=+', ))
        log = io.StringIO()
        with use_temporary_cache_dir():
            module.compile(log_stream=log)
        assert 'warning' in log.getvalue()
        kern = module.get_function('test_op')
        x1, x2, y = self._helper(kern, cupy.float32)
        assert cupy.allclose(y, x1 + x2)
Example #30
    def _helper2(self, type_str):
        mod2 = cupy.RawModule(code=std_code,
                              jitify=self.jitify,
                              name_expressions=['shift<%s>' % type_str, ],
                              options=('--std=c++11',))
        ker = mod2.get_function('shift<%s>' % type_str)
        N = 256
        a = cupy.random.random_integers(0, 7, N).astype(cupy.int32)
        b = a.copy()
        ker((1,), (N,), (a, N))
        assert cupy.allclose(a, b + 100)