Python jit Examples, numbapro.cuda.jit Python Examples

Example #1

0

Show file

File: sli_test_multigpu_mt.py Project: lundybernard/cuda_practice

def device_controller(cid):
    """Run the jitted kernel on CUDA device ``cid`` and verify its output.

    The calling thread is bound to the device, the kernel is compiled while
    holding ``compiler_lock``, a test array is processed in place on the
    device, and the result is checked on the host.

    Args:
        cid: Index of the CUDA device to select; also used to scale the
            test data so each device works on different values.

    Raises:
        ValueError: If any element of the result differs from its original
            value plus one.
    """
    cuda.select_device(cid)  # bind device to thread
    device = cuda.get_current_device()  # get current device

    # print some information about the CUDA card
    prefix = '[%s]' % device
    print(prefix, 'device_controller', cid, '| CC', device.COMPUTE_CAPABILITY)

    max_thread = device.MAX_THREADS_PER_BLOCK

    with compiler_lock:  # lock the compiler
        # prepare function for this thread
        # the jitted CUDA kernel is loaded into the current context
        cuda_kernel = cuda.jit(signature)(kernel)

    # prepare data
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    orig = data.copy()

    # determine number of threads and blocks
    if N >= max_thread:
        # integer ceil-division: enough blocks to cover all N elements
        # without a float round-trip
        ngrid = (N + max_thread - 1) // max_thread
        nthread = max_thread
    else:
        ngrid = 1
        nthread = N

    print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread))

    # real CUDA work
    d_data = cuda.to_device(data)  # transfer to device
    cuda_kernel[ngrid, nthread](d_data, d_data)  # compute inplace
    d_data.copy_to_host(data)  # transfer to host

    # check result: every element must equal its original value plus one
    mismatches = int(np.count_nonzero(data != orig + 1))
    if mismatches:
        raise ValueError(
            'device %s: %d of %d elements differ from the expected orig + 1'
            % (cid, mismatches, N))

Example #2

0

Show file

File: sli_test_multigpu_mt.py Project: lundybernard/cuda_practice

def device_controller(cid):
    """Drive one CUDA device: compile the kernel, run it, check the output.

    Selects device ``cid`` for the calling thread, compiles ``kernel`` under
    ``compiler_lock``, runs it in place over a 12345-element int32 array and
    raises ``ValueError`` unless every element came back as original + 1.
    """
    # bind this thread to the requested device, then look it up
    cuda.select_device(cid)
    device = cuda.get_current_device()

    # report which card we are talking to
    tag = '[%s]' % device
    print(tag, 'device_controller', cid, '| CC', device.COMPUTE_CAPABILITY)

    block_limit = device.MAX_THREADS_PER_BLOCK

    # compilation is serialised through the shared lock; the jitted kernel
    # is loaded into the current context
    with compiler_lock:
        cuda_kernel = cuda.jit(signature)(kernel)

    # host-side input and the values we expect to read back
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    expected = data.copy() + 1

    # launch configuration: one block if everything fits, otherwise full
    # blocks rounded up to cover all N elements
    nthread = min(N, block_limit)
    ngrid = int(ceil(float(N) / block_limit)) if N >= block_limit else 1

    print(tag, 'grid x thread = %d x %d' % (ngrid, nthread))

    # host -> device, compute in place, device -> host
    d_data = cuda.to_device(data)
    cuda_kernel[ngrid, nthread](d_data, d_data)
    d_data.copy_to_host(data)

    # any element that is not original + 1 is a failure
    if np.any(data != expected):
        raise ValueError

Example #3

0

Show file

File: precomputed_representation.py Project: RelentlessResults/decompose001

def get_value(factors, row_index, column_index, longest_wavelet, num_wavelengths, offsets_per_wavelength):
    """Return the contribution of one wavelet row at one column.

    The column is shifted into the row's window, scaled by half of the row's
    wavelength (``int32(row_index / num_wavelengths) + 2``), and the result is
    ``sin(pi * position) * factors[row_index]``. Positions at or beyond
    +/-1.0 contribute 0.0.
    """
    # convert to full window index
    shift = (row_index % offsets_per_wavelength) + (longest_wavelet / 2)
    offset = (column_index + longest_wavelet) - shift

    # rows map onto wavelengths starting at 2
    wavelength = np.int32(row_index / num_wavelengths) + 2
    half_wavelength = np.float32(wavelength * 0.5)

    target_x = np.float32(offset) / half_wavelength

    # outside the open interval (-1, 1) the wavelet is zero
    if target_x <= -1.0 or target_x >= 1.0:
        return 0.0
    return math.sin(target_x * math.pi) * factors[row_index]


# Wrap get_value with cuda.jit as a device function (device=True) declared as
# (f4[:], i4, i4, i4, i4, i4) -> f4; compute_sample_kernel calls it under the
# name get_value_gpu.
get_value_gpu = cuda.jit(restype=f4, argtypes=[f4[:], i4, i4, i4, i4, i4], device=True)(get_value)


@cuda.jit(argtypes=[f4[:], i4, i4, f4[:], i4])
def compute_sample_kernel(factors, longest_wavelet, offsets_per_wavelength, output, num_rows):
    """Compute one output sample per thread by summing all wavelet rows.

    Each thread owns the element of ``output`` at its global 1-D index, sums
    ``get_value_gpu`` over ``num_rows`` rows for that column, and finally adds
    the last element of ``factors``.

    Args:
        factors: 1-D f4 array; element ``row_index`` scales row ``row_index``
            and the last element is added to every sample.
        longest_wavelet: Forwarded to ``get_value_gpu``; also fixes
            ``num_wavelengths = longest_wavelet - 2``.
        offsets_per_wavelength: Forwarded to ``get_value_gpu``.
        output: 1-D f4 array that receives the samples.
        num_rows: Number of rows to accumulate.
    """
    # This thread's own global index. cuda.gridDim.x is the number of blocks
    # and is identical for every thread, so it cannot be used to address a
    # per-thread output slot or column.
    column_index = cuda.grid(1)

    # Threads past the end of the output (grid larger than the array) must
    # not read or write anything.
    if column_index >= output.shape[0]:
        return

    num_wavelengths = longest_wavelet - 2

    # Zero the same slot that is accumulated into below.
    output[column_index] = 0.0
    for row_index in range(num_rows):
        output[column_index] += get_value_gpu(factors, row_index, column_index, longest_wavelet,
                                              num_wavelengths, offsets_per_wavelength)
    output[column_index] += factors[-1]


def evaluation_function(factors, opts):

    start = timer()

Example #4

0

Show file

File: test_mandelbrot_cuda.py Project: lundybernard/cuda_practice

    """
    height = image.shape[0]
    width = image.shape[1]

    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height

    for x in range(width):
        real = min_x + x * pixel_size_x
        for y in range(height):
            imag = min_y + y * pixel_size_y
            color = mandel(real, imag, iters)
            image[y, x] = color


# Wrap mandel with cuda.jit as a device function (device=True) declared as
# (f8, f8, uint32) -> uint32 and bind it to the name mandel_gpu.
mandel_gpu = cuda.jit(restype=uint32, argtypes=[f8, f8, uint32], device=True)(mandel)

#@cuda.jit(argtypes=[f8, f8, f8, f8, uint8[:,:], uint32], target='parallel')
@vectorize([f8, f8, f8, f8, uint8[:,:], uint32], target='parallel')
def mandel_kernel (min_x, max_x, min_y, max_y, image, iters):
    height = image.shape[0]
    width = image.shape[1]

    pixel_size_x = (max_x - min_x) /width
    pixel_size_y = (max_y - min_y) / height

    startX, startY = cuda.grid(2)
    gridX = cuda.gridDim.x * cuda.blockDim.x
    gridY = cuda.gridDim.y * cuda.blockDim.y

    for x in range(startX, width, gridX):

Example #5

0

Show file

File: ga_representation.py Project: RelentlessResults/decompose001

from numbapro import cuda
from numba import *
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.animation as animation


def get_sample(wavelength, offset, amplitude, sample_position):
    """Evaluate one sine lobe at ``sample_position``.

    The position is measured from ``offset`` in units of ``wavelength``.
    At or beyond +/-1.0 the result is 0.0; otherwise it is
    ``amplitude * sin(pi * position)``.
    """
    phase = (sample_position - offset) / wavelength

    # the oscillator is silent outside the open interval (-1, 1)
    if phase <= -1.0 or phase >= 1.0:
        return 0.0
    return amplitude * math.sin(phase * math.pi)

# Wrap get_sample with cuda.jit as a device function (device=True) declared as
# (f8, f8, f8, f8) -> f8; compute_sample_gpu calls it as get_sample_gpu.
get_sample_gpu = cuda.jit(restype=f8, argtypes=[f8, f8, f8, f8], device=True)(get_sample)


@cuda.jit(restype=void, argtypes=[f8[:], f8[:, :], f8])
def compute_sample_gpu(factors, result, width):
    """Fill ``result[x, y]`` with the sample of oscillator ``y`` at column ``x``.

    Args:
        factors: 1-D f8 array read as consecutive
            (wavelength, offset, amplitude) triples, one triple per ``y``.
        result: 2-D f8 array written at ``[x, y]``.
        width: Divisor that turns the integer ``x`` into the sample position
            passed to ``get_sample_gpu``.
    """
    # global 2-D thread coordinates
    x = (cuda.blockIdx.x * cuda.blockDim.x) + cuda.threadIdx.x
    y = (cuda.blockIdx.y * cuda.blockDim.y) + cuda.threadIdx.y

    # The launch grid may be larger than the array; surplus threads must not
    # index factors or result.
    if x >= result.shape[0] or y >= result.shape[1]:
        return

    # NOTE(review): assumes factors holds at least 3 * result.shape[1]
    # elements -- confirm against the caller.
    base = y * 3
    wavelength = factors[base]
    offset = factors[base + 1]
    amplitude = factors[base + 2]

    sample_position = np.float64(x) / width
    result[x, y] = get_sample_gpu(wavelength, offset, amplitude, sample_position)


class Segment:
    def __init__(self, values, amplitude, num_oscillators=12):

Example #6

0

Show file

    """
    height = image.shape[0]
    width = image.shape[1]

    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height

    for x in range(width):
        real = min_x + x * pixel_size_x
        for y in range(height):
            imag = min_y + y * pixel_size_y
            color = mandel(real, imag, iters)
            image[y, x] = color


# Wrap mandel with cuda.jit as a device function (device=True) declared as
# (f8, f8, uint32) -> uint32 and bind it to the name mandel_gpu.
mandel_gpu = cuda.jit(restype=uint32, argtypes=[f8, f8, uint32],
                      device=True)(mandel)


@cuda.jit(argtypes=[f8, f8, f8, f8, uint8[:, :], uint32])
def mandel_kernel(min_x, max_x, min_y, max_y, image, iters):
    height = image.shape[0]
    width = image.shape[1]

    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height

    startX, startY = cuda.grid(2)
    gridX = cuda.gridDim.x * cuda.blockDim.x
    gridY = cuda.gridDim.y * cuda.blockDim.y

    for x in range(startX, width, gridX):

Example #7

0

Show file

File: block_increment.py Project: Jingoo88/Projet-3A-2015

def likelihood(x, y, z):
    """Choose between candidate ``x`` and current state ``y``.

    Computes the ratio of ``(1 + t*t) * exp(-0.5 * t*t)`` evaluated at ``x``
    and at ``y``; returns ``x`` when that ratio is strictly greater than the
    threshold ``z``, otherwise returns ``y``.
    """
    weight_new = (1 + x * x) * math.exp(-0.5 * x * x)
    weight_cur = (1 + y * y) * math.exp(-0.5 * y * y)
    ratio = weight_new / weight_cur

    # accept the candidate only when the ratio beats the threshold
    if ratio > z:
        return x
    return y


# Device-side versions of the host functions: each one is wrapped with
# cuda.jit(..., device=True) under an explicit restype/argtypes signature.
# NOTE(review): `index` is not defined in this excerpt -- confirm it takes two
# uint16 values and returns uint16 as declared here.
index_gpu = cuda.jit(restype=uint16, argtypes=[uint16, uint16],
                     device=True)(index)
likelihood_gpu = cuda.jit(restype=f4, argtypes=[f4, f4, f4],
                          device=True)(likelihood)


@cuda.jit(argtypes=[f4, uint32, f4[:], f4[:], f4[:, :]])
def block_kernel(start, p, a_dev, b_dev, c_dev):
    """
    kernel will perform incrementation of the p chains
    at the same time
    """

    x = cuda.grid(1)  # equals to threadIdx.x + blockIdx.x * blockDim.x

    if x <= p:
        for i in range(p):