def device_controller(cid):
    """Drive CUDA device ``cid`` through one compile/launch/verify cycle.

    The device is selected for the calling thread, the module-level
    ``kernel`` is jitted with ``signature`` while ``compiler_lock`` is held,
    and the kernel is launched in place over an int32 test array.

    Parameters
    ----------
    cid : int
        Device id handed to ``cuda.select_device``; ``cid + 1`` is also
        used as the multiplier for the test data.

    Raises
    ------
    ValueError
        If the array copied back from the device is not element-wise equal
        to the original data plus one.
    """
    # Bind the requested device to this thread, then fetch its handle.
    cuda.select_device(cid)
    gpu = cuda.get_current_device()

    # Report which card this controller is driving.
    prefix = '[%s]' % gpu
    print(prefix, 'device_controller', cid, '| CC', gpu.COMPUTE_CAPABILITY)

    block_limit = gpu.MAX_THREADS_PER_BLOCK

    # Compilation happens under the shared lock; the jitted kernel is
    # loaded into the context that is current for this thread.
    with compiler_lock:
        cuda_kernel = cuda.jit(signature)(kernel)

    # Test input: 0..N-1 scaled by (cid + 1), plus an untouched reference.
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    reference = data.copy()

    # Launch configuration: fill blocks up to the device limit, and use a
    # single block sized to the data when everything fits in one.
    nthread = min(N, block_limit)
    ngrid = int(ceil(float(N) / nthread))

    print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread))

    # Host -> device, in-place kernel (same buffer as both arguments),
    # device -> host.
    d_data = cuda.to_device(data)
    cuda_kernel[ngrid, nthread](d_data, d_data)
    d_data.copy_to_host(data)

    # Every element must match the reference plus one.
    if np.any(data != reference + 1):
        raise ValueError
def device_controller(cid):
    """Drive CUDA device ``cid`` through one compile/launch/verify cycle.

    The device is selected for the calling thread, the module-level
    ``kernel`` is jitted with ``signature`` while ``compiler_lock`` is held,
    and the kernel is launched in place over an int32 test array.

    Parameters
    ----------
    cid : int
        Device id handed to ``cuda.select_device``; ``cid + 1`` is also
        used as the multiplier for the test data.

    Raises
    ------
    ValueError
        If the array copied back from the device is not element-wise equal
        to the original data plus one.
    """
    # Bind the requested device to this thread, then fetch its handle.
    cuda.select_device(cid)
    gpu = cuda.get_current_device()

    # Report which card this controller is driving.
    prefix = '[%s]' % gpu
    print(prefix, 'device_controller', cid, '| CC', gpu.COMPUTE_CAPABILITY)

    block_limit = gpu.MAX_THREADS_PER_BLOCK

    # Compilation happens under the shared lock; the jitted kernel is
    # loaded into the context that is current for this thread.
    with compiler_lock:
        cuda_kernel = cuda.jit(signature)(kernel)

    # Test input: 0..N-1 scaled by (cid + 1), plus an untouched reference.
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    reference = data.copy()

    # Launch configuration: fill blocks up to the device limit, and use a
    # single block sized to the data when everything fits in one.
    nthread = min(N, block_limit)
    ngrid = int(ceil(float(N) / nthread))

    print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread))

    # Host -> device, in-place kernel (same buffer as both arguments),
    # device -> host.
    d_data = cuda.to_device(data)
    cuda_kernel[ngrid, nthread](d_data, d_data)
    d_data.copy_to_host(data)

    # Every element must match the reference plus one.
    if np.any(data != reference + 1):
        raise ValueError
def get_value(factors, row_index, column_index, longest_wavelet,
              num_wavelengths, offsets_per_wavelength):
    """Return the contribution of row ``row_index`` at ``column_index``.

    The column is shifted by ``longest_wavelet``, then reduced by
    ``row_index % offsets_per_wavelength`` and by ``longest_wavelet / 2``.
    The wavelength is ``int32(row_index / num_wavelengths) + 2`` and the
    shifted offset is divided by half of that wavelength.

    Returns ``0.0`` when the scaled position is at or outside ``[-1, 1]``;
    otherwise ``sin(pi * position) * factors[row_index]``.
    """
    # convert to full window index
    offset = column_index + longest_wavelet
    offset -= (row_index % offsets_per_wavelength) + (longest_wavelet / 2)

    wavelength_index = np.int32(row_index / num_wavelengths)
    wavelength = wavelength_index + 2

    target_x = np.float32(offset) / np.float32(wavelength * 0.5)
    if target_x >= 1.0 or target_x <= -1.0:
        return 0.0
    return math.sin(target_x * math.pi) * factors[row_index]


# Device-side build of get_value, callable from CUDA kernels.
get_value_gpu = cuda.jit(restype=f4, argtypes=[f4[:], i4, i4, i4, i4, i4],
                         device=True)(get_value)


@cuda.jit(argtypes=[f4[:], i4, i4, f4[:], i4])
def compute_sample_kernel(factors, longest_wavelet, offsets_per_wavelength,
                          output, num_rows):
    """Sum ``get_value_gpu`` over all rows for this thread's output column.

    Each thread owns the output element at its flattened 1-D grid index.
    The element is zeroed, every row's contribution is added, and finally
    ``factors[-1]`` is added once.

    The thread's own index (``cuda.grid(1)``) is used both for the output
    slot and as the column passed to ``get_value_gpu``. ``cuda.gridDim.x``
    is the number of blocks in the launch and is identical for every
    thread, so it cannot serve as a per-thread column index.
    """
    column_index = cuda.grid(1)

    # Threads past the end of ``output`` (grid larger than the data) must
    # not touch memory.
    if column_index < output.shape[0]:
        num_wavelengths = longest_wavelet - 2

        # Zero the same slot that is accumulated into below.
        output[column_index] = 0.0
        for row_index in range(num_rows):
            output[column_index] += get_value_gpu(
                factors, row_index, column_index, longest_wavelet,
                num_wavelengths, offsets_per_wavelength)

        # Add the trailing entry of ``factors`` once, after the row sum.
        output[column_index] += factors[-1]


def evaluation_function(factors, opts):
    start = timer()
""" height = image.shape[0] width = image.shape[1] pixel_size_x = (max_x - min_x) / width pixel_size_y = (max_y - min_y) / height for x in range(width): real = min_x + x * pixel_size_x for y in range(height): imag = min_y + y * pixel_size_y color = mandel(real, imag, iters) image[y, x] = color mandel_gpu = cuda.jit(restype=uint32, argtypes=[f8, f8, uint32], device=True)(mandel) #@cuda.jit(argtypes=[f8, f8, f8, f8, uint8[:,:], uint32], target='parallel') @vectorize([f8, f8, f8, f8, uint8[:,:], uint32], target='parallel') def mandel_kernel (min_x, max_x, min_y, max_y, image, iters): height = image.shape[0] width = image.shape[1] pixel_size_x = (max_x - min_x) /width pixel_size_y = (max_y - min_y) / height startX, startY = cuda.grid(2) gridX = cuda.gridDim.x * cuda.blockDim.x gridY = cuda.gridDim.y * cuda.blockDim.y for x in range(startX, width, gridX):
from numbapro import cuda from numba import * from PIL import Image import matplotlib.pyplot as plt import matplotlib.animation as animation def get_sample(wavelength, offset, amplitude, sample_position): x_position = (sample_position - offset) / wavelength if x_position >= 1.0 or x_position <= -1.0: return 0.0 else: return amplitude * math.sin(x_position * math.pi) get_sample_gpu = cuda.jit(restype=f8, argtypes=[f8, f8, f8, f8], device=True)(get_sample) @cuda.jit(restype=void, argtypes=[f8[:], f8[:, :], f8]) def compute_sample_gpu(factors, result, width): x = (cuda.blockIdx.x * cuda.blockDim.x) + cuda.threadIdx.x y = (cuda.blockIdx.y * cuda.blockDim.y) + cuda.threadIdx.y wavelength = factors[y * 3] offset = factors[y * 3 + 1] amplitude = factors[y * 3 + 2] sample_position = np.float64(x) / width result[x][y] = get_sample_gpu(wavelength, offset, amplitude, sample_position) class Segment: def __init__(self, values, amplitude, num_oscillators=12):
""" height = image.shape[0] width = image.shape[1] pixel_size_x = (max_x - min_x) / width pixel_size_y = (max_y - min_y) / height for x in range(width): real = min_x + x * pixel_size_x for y in range(height): imag = min_y + y * pixel_size_y color = mandel(real, imag, iters) image[y, x] = color mandel_gpu = cuda.jit(restype=uint32, argtypes=[f8, f8, uint32], device=True)(mandel) @cuda.jit(argtypes=[f8, f8, f8, f8, uint8[:, :], uint32]) def mandel_kernel(min_x, max_x, min_y, max_y, image, iters): height = image.shape[0] width = image.shape[1] pixel_size_x = (max_x - min_x) / width pixel_size_y = (max_y - min_y) / height startX, startY = cuda.grid(2) gridX = cuda.gridDim.x * cuda.blockDim.x gridY = cuda.gridDim.y * cuda.blockDim.y for x in range(startX, width, gridX):
def likelihood(x, y, z): """ will compare the likelihood ratio of the new point compared to the actual one and choose based on this whether to change or keep the current state""" p = ((1 + x * x) * math.exp(-0.5 * x * x)) / ( (1 + y * y) * math.exp(-0.5 * y * y)) if p > z: return x else: return y #We define our functions to be used on the device index_gpu = cuda.jit(restype=uint16, argtypes=[uint16, uint16], device=True)(index) likelihood_gpu = cuda.jit(restype=f4, argtypes=[f4, f4, f4], device=True)(likelihood) @cuda.jit(argtypes=[f4, uint32, f4[:], f4[:], f4[:, :]]) def block_kernel(start, p, a_dev, b_dev, c_dev): """ kernel will perform incrementation of the p chains at the same time """ x = cuda.grid(1) # equals to threadIdx.x + blockIdx.x * blockDim.x if x <= p: for i in range(p):