def __init__(self, gpuID=None, stream=None):
    """Bind to a GPU, set up the CUDA stream and create the cuBLAS handle.

    Args:
        gpuID: Index of the CUDA device to select, or None to leave the
            device selection untouched.
        stream: Existing CUDA stream to use, or None to create a new one.

    Raises:
        ValueError: If ``gpuID`` is not a valid index into
            ``cuda.list_devices()``.
        TypeError: If ``stream`` is given but is not a numba CUDA ``Stream``.
    """
    if gpuID is not None:
        if not 0 <= gpuID < len(cuda.list_devices()):
            raise ValueError('GPU ID not found')
        # Close whatever is currently open before binding to the new device.
        cuda.close()
        cuda.select_device(gpuID)

    if stream is None:
        stream = cuda.stream()
    elif not isinstance(stream, numba.cuda.cudadrv.driver.Stream):
        # Raised explicitly rather than asserted: assert statements are
        # stripped when Python runs with -O, which would let a bad stream
        # through silently.
        raise TypeError('stream must be a numba CUDA Stream')
    self.stream = stream

    # The cuBLAS handle issues its work on the same stream.
    self.blas = numbapro.cudalib.cublas.Blas(stream=self.stream)

    # NOTE(review): presumably the default thread-block sizes for 1-D and
    # 2-D kernel launches -- confirm against the methods that use them.
    self.blockdim = 32
    self.blockdim2 = (32, 32)
def block_increment(start, n):
    """Run ``cuda_div`` followed by ``block_kernel`` on CUDA device 0.

    Two length-``n`` float32 normal(0, 1) vectors are generated on the device
    and handed to ``cuda_div`` together with an ``n``-element float32 device
    buffer. That buffer, ``n * n`` device-generated uniform samples and an
    ``n`` x ``n`` device buffer are then handed to ``block_kernel``; the
    ``n`` x ``n`` buffer is copied back to the host and returned.

    Args:
        start: Forwarded unchanged as the first argument of ``block_kernel``.
        n: Length of the random vectors and side of the square result.

    Returns:
        The ``(n, n)`` float32 host array filled from the device buffer.
    """
    cuda.select_device(0)
    stream = cuda.stream()

    threads_per_block = 256
    blocks = n // threads_per_block + 1

    result_host = np.zeros((n, n), dtype=np.float32)

    normal_a = curand.normal(0, 1, n, dtype=np.float32, device=True)
    normal_b = curand.normal(0, 1, n, dtype=np.float32, device=True)
    work_dev = cuda.device_array_like(np.zeros(n, dtype=np.float32))

    cuda_div[blocks, threads_per_block, stream](normal_a, normal_b, work_dev, n)
    # work_dev stays on the device for the next kernel; the host never reads
    # it back here.
    # NOTE(review): it is unclear when normal_a / normal_b are released --
    # presumably the device memory is freed once Python garbage-collects the
    # handles; confirm.

    uniform_dev = curand.uniform((n * n), dtype=np.float32, device=True)
    result_dev = cuda.device_array_like(result_host, stream)
    block_kernel[blocks, threads_per_block, stream](
        start, n, work_dev, uniform_dev, result_dev)

    # The copy is queued on the stream, so wait for it before returning.
    result_dev.copy_to_host(result_host, stream)
    stream.synchronize()

    return result_host
def device_controller(cid):
    """Compile and run ``kernel`` on CUDA device ``cid`` and check its output.

    The device is bound to the calling thread, the kernel is compiled while
    ``compiler_lock`` is held, and a 12345-element int32 array is processed
    in place on the device.

    Args:
        cid: Index of the CUDA device to use; also scales the test data.

    Raises:
        ValueError: If any element of the result is not the input plus one.
    """
    cuda.select_device(cid)            # bind device to thread
    gpu = cuda.get_current_device()    # handle of the device just selected

    # Report which card this controller is driving.
    tag = '[%s]' % gpu
    print(tag, 'device_controller', cid, '| CC', gpu.COMPUTE_CAPABILITY)

    threads_cap = gpu.MAX_THREADS_PER_BLOCK

    # Compilation is serialised across threads; the jitted CUDA kernel is
    # loaded into the current context.
    with compiler_lock:
        cuda_kernel = cuda.jit(signature)(kernel)

    # Input data differs per device so results cannot be confused.
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    expected = data.copy() + 1

    # Launch configuration: a single block if everything fits, otherwise
    # full-size blocks rounded up to cover N.
    if N < threads_cap:
        ngrid, nthread = 1, N
    else:
        ngrid = int(ceil(float(N) / threads_cap))
        nthread = threads_cap

    print(tag, 'grid x thread = %d x %d' % (ngrid, nthread))

    # Real CUDA work: upload, compute in place, download.
    d_data = cuda.to_device(data)
    cuda_kernel[ngrid, nthread](d_data, d_data)
    d_data.copy_to_host(data)

    # Every element must have been incremented by exactly one.
    if np.any(data != expected):
        raise ValueError
def device_controller(cid):
    """Drive one CUDA device: compile ``kernel``, run it, verify the result.

    Args:
        cid: Index of the CUDA device to bind to the current thread. The
            test data is ``arange(12345) * (cid + 1)`` as int32.

    Raises:
        ValueError: If the array copied back from the device is not equal to
            the original data plus one.
    """
    # Bind the device to this thread and fetch its handle.
    cuda.select_device(cid)
    device = cuda.get_current_device()

    # Print some information about the CUDA card.
    prefix = '[%s]' % device
    print(prefix, 'device_controller', cid, '| CC',
          device.COMPUTE_CAPABILITY)

    max_thread = device.MAX_THREADS_PER_BLOCK

    with compiler_lock:
        # Only one thread compiles at a time; the jitted CUDA kernel is
        # loaded into the current context.
        cuda_kernel = cuda.jit(signature)(kernel)

    # Prepare data and keep an untouched copy for the final comparison.
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    orig = data.copy()

    # Number of blocks and threads per block: full blocks rounded up when N
    # reaches the per-block limit, otherwise one block of N threads.
    ngrid, nthread = (
        (int(ceil(float(N) / max_thread)), max_thread)
        if N >= max_thread
        else (1, N)
    )

    print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread))

    # Real CUDA work.
    d_data = cuda.to_device(data)                  # host -> device
    cuda_kernel[ngrid, nthread](d_data, d_data)    # same buffer in and out
    d_data.copy_to_host(data)                      # device -> host

    # Check result.
    result_ok = np.all(data == orig + 1)
    if not result_ok:
        raise ValueError
__version__ = '0.1'
__maintainer__ = ['gilles.drigout', 'thomas.clavier']
__status__ = 'Development'

# Uses device-generated normal random draws to build a Cauchy simulation.
# The methods may work better if the normal draws are reused -- to be checked.

from numbapro import cuda
from numbapro import vectorize
from numbapro.cudalib import curand
import numpy as np
import matplotlib.pyplot as plt

# Bind CUDA device 0 at import time.
cuda.select_device(0)


class Cauchy:
    """Holder for a buffer of device-generated random draws (in development)."""

    def __init__(self, size):
        """Allocate an uninitialised float64 buffer of the given size."""
        self.container = np.empty(size, np.float64)

    def __get_cuda_randoms(self):
        """Fill ``self.container`` with normal draws from a cuRAND XORWOW generator."""
        prng = curand.PRNG(rndtype=curand.PRNG.XORWOW)
        # NOTE(review): the arguments 0 and 1 are presumably mean and sigma --
        # confirm against the cuRAND binding.
        prng.normal(self.container,0,1)
        # TODO: to be completed -- reshape the draws, e.g.
        # self.container = rand.reshape((x, y))
# Compute volatility: for each index i, take the window of the last ki + 1
# opening prices (ki is capped at i so the window never starts before index 0)
# and append the summed absolute deviation from the window mean, scaled by
# ki * mean.
for i in range(1,sp500_open.size):
    ki = k if k < i else i
    price_avg = np.mean(sp500_open[i-ki:i+1])
    value = np.sum(np.absolute(sp500_open[i-ki:i+1] - price_avg))/(1.0*ki*price_avg)
    sp500_volatility = np.append(sp500_volatility, value)

sp500_price_clustering = np.array([])
sp500_volatility_clustering = np.array([])
# Arrays of correlation at lags 1..499: each entry is the sum of products of a
# series with itself shifted by `lag`.
for lag in range(1,500):
    sp500_price_clustering = np.append(sp500_price_clustering, np.sum(np.multiply(sp500_price_change[lag:],sp500_price_change[:-lag])))
    sp500_volatility_clustering = np.append(sp500_volatility_clustering, np.sum(np.multiply(sp500_volatility[lag:],sp500_volatility[:-lag])))
# Normalize to the first entry, which is the lag-1 value because the loop
# starts at lag 1.
sp500_price_clustering = sp500_price_clustering/sp500_price_clustering[0]
sp500_volatility_clustering = sp500_volatility_clustering/sp500_volatility_clustering[0]

cuda.select_device(0) # select the video card

# Grid dimensions and the probability that a cell starts with value 1.
w = 120
h = 30
initProb = 0.05

# Generate random traders: an h x w int32 grid where each cell is 1 with
# probability initProb and 0 otherwise. B is an uninitialised array of the
# same shape and dtype.
A = np.array(np.random.choice([0, 1], p=[1-initProb, initProb], size=w*h, replace=True).reshape(h,w), dtype=np.int32)
B = np.empty_like(A)

def calcCluster(grid):
    grid_abs = np.absolute(grid) # reduce field to active/inactive traders
    grid_abs = grid_abs == 1 # get field of True/False values
    # lw: matrix with cluster numbers, num: total number of clusters, area: matrix of cluster size
from timeit import default_timer as timer
import math
import numpy as np
import pylab
from numbapro import cuda, cudadrv

# For machines with multiple devices: use device 0.
cuda.select_device(0)

@cuda.jit('float32(float32, float32)', device=True)
def core(a, b):
    """Device function: return the sum of two float32 values."""
    return a + b

@cuda.jit('void(float32[:], float32[:], float32[:])')
def vec_add(a, b, c):
    """Kernel: each thread writes ``core(a[i], b[i])`` into ``c[i]``."""
    # i is this thread's position in the 1-D grid.
    # NOTE(review): there is no bounds check, so the launch must not create
    # more threads than the arrays have elements.
    i = cuda.grid(1)
    c[i] = core(a[i], b[i])

@cuda.jit('void(float32[:], float32[:], float32[:])')
def vec_add_ilp_x2(a, b, c):
    # read the first pair of operands at this thread's grid position
    i = cuda.grid(1)
    ai = a[i]
    bi = b[i]

    # stride = blocks in the grid * threads per block
    bw = cuda.blockDim.x
    gw = cuda.gridDim.x
    stride = gw * bw

    # read the second pair of operands, one stride further along
    j = i + stride
    aj = a[j]
    bj = b[j]
from __future__ import print_function, division
import sys
import os
import numpy as np
import timeit
import itertools
import math
from numbapro import cuda, int32, float32, float64, void
from timeit import default_timer as timer
from numbapro.cudalib import curand
# from numbapro.cudalib.sorting.radixlib import RadixSort
from numbapro.cudalib.sorting.segsort import segmented_sort

# The CUDA device index comes from the CUDA_DEVICE environment variable
# (default 0).
cuda.select_device(int(os.environ.get("CUDA_DEVICE", 0)))

# NN is read from the environment (default 1000); FILE defaults to a name
# derived from NN.
NN = int(os.environ.get("NN", 1000))
FILE = os.environ.get("FILE", "input{}.npy".format(NN))

# Python 2/3 compatibility: where xrange exists, also make zip the lazy
# itertools.izip; where it does not, alias xrange to range.
try:
    xrange
    zip = itertools.izip
except NameError:
    xrange = range

cached_input_file = FILE  # e.g. "input.npy"

# Element type used in kernel signatures and the matching NumPy dtype.
float_type = float32
float_dtype = np.float32

def generate_input():