def zoom_numbaThread(data, chunkIndices, zoomArray):
    """
    Bilinear 2-D zoom of ``data`` into a band of rows of ``zoomArray``.

    Written as plain Python loops so that it is fast once compiled with
    numba. The zoom level is implied by the ratio of the two array shapes.
    Only the output rows ``chunkIndices[0] <= i < chunkIndices[1]`` are
    written; all columns of those rows are filled.

    Parameters:
        data (ndarray): The 2-D array to zoom
        chunkIndices (sequence): ``(first, last)`` output row range, ``last`` exclusive
        zoomArray (ndarray): The array to place the calculation

    Returns:
        interpArray (ndarray): A pointer to the calculated ``zoomArray''
    """
    firstRow = chunkIndices[0]
    lastRow = chunkIndices[1]

    for i in range(firstRow, lastRow):
        # The denominator is a hair larger than (rows - 1) so that the source
        # coordinate of the final output row stays just below the last source
        # index and ``x1 + 1`` remains a valid row.
        x = i*numba.float32(data.shape[0]-1)/(zoomArray.shape[0]-0.99999999)
        x1 = numba.int32(x)
        xFrac = x - x1

        for j in range(zoomArray.shape[1]):
            y = j*numba.float32(data.shape[1]-1)/(zoomArray.shape[1]-0.99999999)
            y1 = numba.int32(y)

            # Interpolate along x on the two neighbouring columns ...
            lowCol = data[x1, y1] + (data[x1+1, y1] - data[x1, y1])*xFrac
            highCol = data[x1, y1+1] + (data[x1+1, y1+1] - data[x1, y1+1])*xFrac

            # ... then along y between those two intermediate values.
            zoomArray[i,j] = lowCol + (highCol - lowCol)*(y-y1)

    return zoomArray
def preCalc(y, yA, yB, numDataPoints):
    """Fill three arrays with per-thread values derived from the thread index.

    Each thread takes its flat 1-D grid index ``i`` and writes
    ``float32(1.001 * i)`` into ``y[i]``, that value times 1.0 into ``yA[i]``
    and that value divided by 1.0 into ``yB[i]``.

    ``numDataPoints`` is kept for interface compatibility; it is no longer
    read (the original only used it for a modulo whose result was discarded).

    NOTE(review): there is no bounds guard, so the launch configuration must
    not create more threads than the arrays have elements - confirm at the
    call site.
    """
    i = cuda.grid(1)
    ans = float32(1.001 * float32(i))
    y[i] = ans
    yA[i] = ans * 1.0
    yB[i] = ans / 1.0
def cu_square_matrix_mul(A, B, C):
    """Tiled matrix product ``C = A @ B`` using two shared-memory tiles.

    Relies on the module-level names ``SM_SIZE``, ``tpb``, ``bpg`` and ``n``
    (not visible here). Each thread owns output element ``C[y, x]`` and
    accumulates its dot product one tile at a time.

    NOTE(review): ``sA`` is shaped by ``SM_SIZE`` while ``sB`` uses
    ``(tpb, tpb)``; presumably they are the same shape - confirm.
    """
    sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
    sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

    # Position of this thread inside its block and of the block in the grid.
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    bw = cuda.blockDim.x
    bh = cuda.blockDim.y

    # Global column / row handled by this thread.
    x = tx + bx * bw
    y = ty + by * bh

    acc = float32(0)  # forces all the math to be f32
    for i in range(bpg):
        # Stage the i-th tile of A's row band and B's column band.
        if x < n and y < n:
            sA[ty, tx] = A[y, tx + i * tpb]
            sB[ty, tx] = B[ty + i * tpb, x]

        # Barriers stay outside the ``if`` so every thread of the block
        # reaches them, including threads that own no output element.
        cuda.syncthreads()

        if x < n and y < n:
            for j in range(tpb):
                acc += sA[ty, j] * sB[j, tx]

        # Do not overwrite the tiles until everyone has finished reading.
        cuda.syncthreads()

    if x < n and y < n:
        C[y, x] = acc
def _test_broadcasting(self, cls, a, b, c, d):
    """Build a four-argument float32 ufunc with ``cls`` and compare it to ``a + b + c + d``."""
    builder = cls(add_multiple_args)
    builder.add(float32(float32, float32, float32, float32))
    built = builder.build_ufunc()

    # Vectorizer class and input shape identify the failing case.
    context = (cls, a.shape)
    actual = built(a, b, c, d)
    expected = a + b + c + d
    self.assertPreciseEqual(actual, expected, msg=context)
def _test_broadcasting(self, cls, a, b, c, d):
    """Build a four-argument float32 ufunc with ``cls`` and require element-wise equality with ``a + b + c + d``."""
    builder = cls(add_multiple_args)
    builder.add(float32(float32, float32, float32, float32))
    built = builder.build_ufunc()

    # Vectorizer class and input shape identify the failing case.
    context = (cls, a.shape)
    actual = built(a, b, c, d)
    expected = a + b + c + d
    self.assertTrue(np.all(actual == expected), context)
def test_implicit_broadcasting(self):
    """Every vectorizer must broadcast a reshaped ``b`` against ``a`` exactly as NumPy does."""
    for make_vectorizer in vectorizers:
        builder = make_vectorizer(add)
        builder.add(float32(float32, float32))
        built = builder.build_ufunc()

        # Insert unit axes so that the second operand has to be broadcast.
        broadcasting_b = b[np.newaxis, :, np.newaxis, np.newaxis, :]
        actual = built(a, broadcasting_b)
        expected = a + broadcasting_b
        self.assertTrue(np.all(actual == expected))
def _test_ufunc_attributes(self, cls, a, b, *args):
    """Check call, reduce, accumulate and outer of a built ``add`` ufunc against NumPy."""
    builder = cls(add, *args)
    builder.add(float32(float32, float32))
    built = builder.build_ufunc()

    # Vectorizer class and input rank identify the failing case.
    context = (cls, a.ndim)

    # Plain element-wise call.
    called = built(a, b)
    self.assertPreciseEqual(called, a + b, msg=context)

    # Reduction.
    reduced = ufunc_reduce(built, a)
    self.assertPreciseEqual(reduced, np.sum(a), msg=context)

    # Running accumulation.
    accumulated = built.accumulate(a)
    self.assertPreciseEqual(accumulated, np.add.accumulate(a), msg=context)

    # Outer product form.
    outer = built.outer(a, b)
    self.assertPreciseEqual(outer, np.add.outer(a, b), msg=context)
def _test_ufunc_attributes(self, cls, a, b, *args):
    """Require exact equality with NumPy for call, reduce, accumulate and outer of a built ``add`` ufunc."""
    builder = cls(add, *args)
    builder.add(float32(float32, float32))
    built = builder.build_ufunc()

    # Vectorizer class and input rank identify the failing case.
    context = (cls, a.ndim)

    # Plain element-wise call.
    called = built(a, b)
    self.assertTrue(np.all(called == a + b), context)

    # Reduction (scalar comparison, as in the other checks' source of truth).
    reduced = ufunc_reduce(built, a)
    self.assertTrue(reduced == np.sum(a), context)

    # Running accumulation.
    accumulated = built.accumulate(a)
    self.assertTrue(np.all(accumulated == np.add.accumulate(a)), context)

    # Outer product form.
    outer = built.outer(a, b)
    self.assertTrue(np.all(outer == np.add.outer(a, b)), context)
def raycast(sx, sy, camera, world):
    """Step a ray along the x axis of ``world`` until it leaves the grid.

    ``sx`` / ``sy`` are turned into coordinates in [-1, 1] (``v * 2 - 1``) and
    combined with the camera plane vectors to form the ray direction. The
    loop currently only advances the x cell index; the y quantities are
    computed but not consumed, and the function always returns ``(0, 0, 0)``.
    """
    fx = nb.float32(sx * 2 - 1)
    fy = nb.float32(sy * 2 - 1)

    # Ray direction on the camera plane.
    dx = nb.float32(camera.plane_offset.x
                    + camera.plane_x_size.x * fx
                    + camera.plane_y_size.x * fy)
    dy = nb.float32(camera.plane_offset.y
                    + camera.plane_x_size.y * fx
                    + camera.plane_y_size.y * fy)

    # Ray length needed to cross one cell on each axis (inf for a parallel ray).
    ddx = nb.float32(abs(1 / dx) if dx != 0 else np.inf)
    ddy = nb.float32(abs(1 / dy) if dy != 0 else np.inf)

    # Integer cell and fractional offset of the camera position.
    tx = int(camera.pos.x // 1)
    ty = int(camera.pos.y // 1)
    ox = camera.pos.x % 1
    oy = camera.pos.y % 1

    # Step direction and distance to the first cell boundary, per axis.
    sx = nb.cuda.selp(dx < 0, -1, 1)
    ox = nb.cuda.selp(dx < 0, ox, (1 - ox)) * ddx
    sy = nb.cuda.selp(dy < 0, -1, 1)
    oy = nb.cuda.selp(dy < 0, oy, (1 - oy)) * ddy

    while True:
        ox += ddx
        tx += sx
        if tx < 0 or tx >= world.shape[0]:
            break

    return 0, 0, 0
def as_soft_penalty(self):
    """Build numba-compiled soft-penalty functions for this constraint.

    The constraint's indices, coefficients and scale are copied into local
    variables so the compiled closures capture plain values rather than
    ``self``.

    Returns:
        tuple: ``(penalty, dpenalty, dpenalty_money)`` where

        * ``penalty(x, intensity, sharpness)`` returns a scalar,
        * ``dpenalty(x, intensity, sharpness)`` returns an array shaped like
          ``x`` that is non-zero only at ``i_num`` and ``i_den``,
        * ``dpenalty_money(x, intensity)`` returns an array shaped like ``x``
          that is non-zero only when one of the two bound expressions is
          within ``1e-5`` of zero.
    """
    from numba import njit, float32, float64
    from ..numba.model import softplus, d_softplus

    i_num = self.i_num
    i_den = self.i_den
    cmin_num = self.cmin_num
    cmin_den = self.cmin_den
    cmax_num = self.cmax_num
    cmax_den = self.cmax_den
    scale = self.scale

    @njit([
        float32(float32[:], float32, float32),
        float64(float64[:], float64, float64),
    ])
    def penalty(x, intensity, sharpness=1.0):
        # Two linear expressions in x[i_num] and x[i_den]; the smaller one
        # drives the penalty.
        _min = x[i_num] * cmin_num + x[i_den] * cmin_den
        _max = x[i_num] * cmax_num + x[i_den] * cmax_den
        return -softplus(-np.minimum(_min, _max) * scale * intensity, sharpness)

    @njit([
        float32[:](float32[:], float32, float32),
        float64[:](float64[:], float64, float64),
    ])
    def dpenalty(x, intensity, sharpness=1.0):
        j = np.zeros_like(x)
        _min = x[i_num] * cmin_num + x[i_den] * cmin_den
        _max = x[i_num] * cmax_num + x[i_den] * cmax_den
        # NOTE(review): here scale*intensity multiplies the second argument of
        # d_softplus, whereas penalty() folds it into the first argument of
        # softplus - confirm against the definition of d_softplus.
        partial = d_softplus(-np.minimum(_min, _max), sharpness * scale * intensity) * scale * intensity
        # Chain rule through whichever expression is the active minimum.
        if _min < _max:
            j[i_num] = cmin_num * partial
            j[i_den] = cmin_den * partial
        else:
            j[i_num] = cmax_num * partial
            j[i_den] = cmax_den * partial
        return j

    @njit([
        float32[:](float32[:], float32),
        float64[:](float64[:], float64),
    ])
    def dpenalty_money(x, intensity):
        j = np.zeros_like(x)
        _min = x[i_num] * cmin_num + x[i_den] * cmin_den
        _max = x[i_num] * cmax_num + x[i_den] * cmax_den
        # Only contributes when an expression is (numerically) at zero; the
        # _min expression takes precedence if both are.
        if np.absolute(_min) < 1e-5:
            partial = 0.5 * scale * intensity
            j[i_num] = cmin_num * partial
            j[i_den] = cmin_den * partial
        elif np.absolute(_max) < 1e-5:
            partial = 0.5 * scale * intensity
            j[i_num] = cmax_num * partial
            j[i_den] = cmax_den * partial
        return j

    return penalty, dpenalty, dpenalty_money
def execute(self, input):
    """Apply automatic gain control to ``input`` and return the result.

    A running average of the sample amplitude is kept in ``self.pa`` across
    calls. Each sample is scaled towards a fixed target amplitude and then
    clipped so that its power does not exceed the clip threshold.
    """
    attack = nb.float32(0.002)
    release = nb.float32(0.0002)
    clipthreshold = nb.float32(0.9)
    amplitude = nb.float32(0.25)

    pa = self.pa
    output = np.zeros_like(input)

    for i, sample in enumerate(input):
        # Track amplitude rather than power (amplitude^2) so that short,
        # high amplitude peaks do not disturb the average as much.
        level = np.abs(sample)
        delta = level - pa

        # The average rises with the attack rate and falls with the release rate.
        if delta >= 0:
            pa += delta * attack
        else:
            pa += delta * release

        # Normalise against the running average.
        if pa > 0:
            sample *= amplitude / pa
        else:
            # this shouldn't happen often
            sample = 0

        # Some samples may still be too large, so clip them by power.
        power = sample.real ** 2 + sample.imag ** 2
        if power > clipthreshold:
            sample *= np.sqrt(clipthreshold / power)

        output[i] = sample

    self.pa = pa
    return output
def cu_sigm_cfe_post(cmin, cmax, midpoint, a, sigma):
    "Construct CUDA device function for Sigmoidal coupling function."
    # Cast every parameter to float32 once, outside the device function.
    cmin = float32(cmin)
    cmax = float32(cmax)
    midpoint = float32(midpoint)
    a = float32(a)
    sigma = float32(sigma)

    from math import exp

    @cuda.jit(device=True)
    def cfe(gx):
        # Sigmoid running from cmin to cmax, centred on midpoint.
        exponent = -a * ((gx - midpoint) / sigma)
        return cmin + ((cmax - cmin) / (1.0 + exp(exponent)))

    return cfe
def test_cuda_vectorize_device_call(self):
    """A CUDA-vectorized function may call a CUDA device function."""
    signature = float32(float32, float32, float32)

    @cuda.jit(signature, device=True)
    def cu_device_fn(x, y, z):
        return x**y / z

    def cu_ufunc(x, y, z):
        return cu_device_fn(x, y, z)

    ufunc = vectorize([float32(float32, float32, float32)], target='cuda')(cu_ufunc)

    N = 100
    X = np.array(np.random.sample(N), dtype=np.float32)
    Y = np.array(np.random.sample(N), dtype=np.float32)
    # Offset keeps the divisor away from zero.
    Z = np.array(np.random.sample(N), dtype=np.float32) + 0.1

    result = ufunc(X, Y, Z)
    reference = (X**Y) / Z

    self.assertTrue(np.allclose(result, reference))
def cu_simple_cfun(offset, cvar):
    "Construct CUDA device function for simple summation coupling."
    offset = float32(offset)

    @cuda.jit(device=True)
    def cfun(weights, state, i_post, i_thread):
        # Weighted sum over the first axis of ``state`` (2*n reads).
        total = float32(0.0)
        n_pre = state.shape[0]
        for i_pre in range(n_pre):
            total += weights[i_post, i_pre] * (state[i_pre, cvar, i_thread] + offset)
        return total

    return cfun
def _savi_gpu(nir_data, red_data, soil_factor, out):
    """Per-pixel kernel: ``(nir - red) / ((nir + red + L) * (1 + L))`` with ``L = soil_factor[0]``.

    Pixels whose denominator is exactly zero are written as NaN.
    """
    y, x = cuda.grid(2)

    # Threads outside the output raster have nothing to do.
    if y >= out.shape[0] or x >= out.shape[1]:
        return

    nir = nir_data[y, x]
    red = red_data[y, x]

    numerator = nir - red
    soma = nir + red + soil_factor[0]
    denominator = soma * (nb.float32(1.0) + soil_factor[0])

    if denominator == 0.0:
        out[y, x] = np.nan
    else:
        out[y, x] = numerator / denominator
def conv2d(arr, il, ir, ao, fin, fl, fr, dlin, dli):
    """Accumulate a dilated 3x3 stencil over several input images into one output image.

    ``arr[il:ir]`` are the input images, ``arr[ao]`` is the output image,
    ``fin[fl:fr]`` holds nine weights per input image and ``dlin[dli]`` holds
    one dilation per input image. The result is added to the output in place.
    """
    inp = arr[il:ir]
    out = arr[ao]
    f = fin[fl:fr]
    dl = dlin[dli]
    # shape=0 requests dynamically sized shared memory; the byte count is
    # supplied by the kernel launch.
    fshared = cuda.shared.array(shape=0, dtype=float32)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bdx = cuda.blockDim.x
    bdy = cuda.blockDim.y
    # Flat thread id within the block and number of threads per block.
    tid = ty*bdx+tx
    nth = bdx*bdy
    # The block's threads cooperatively copy the weights into shared memory.
    for i in range(tid,f.size,nth):
        fshared[i] = f[i]
    cuda.syncthreads()
    # Last dilation seen; -1 forces the neighbour indices to be computed on
    # the first iteration.
    do=-1
    xc,yc = cuda.grid(2)
    if xc<out.shape[0] and yc<out.shape[1]:
        tmp = float32(0)
        idx = int32(0)
        for j in range(inp.shape[0]):
            # Neighbour indices are only recomputed when the dilation changes.
            if do!=dl[j]:
                do=dl[j]
                d=dl[j]
                # Neighbours that would fall outside the image are reflected
                # back across the border.
                if xc>=d:
                    xl = xc-d
                else:
                    xl = d-xc
                if xc<out.shape[0]-d:
                    xr = xc+d
                else:
                    xr = 2*out.shape[0] - (xc+d + 2)
                if yc>=d:
                    yl = yc-d
                else:
                    yl = d-yc
                if yc<out.shape[1]-d:
                    yr = yc+d
                else:
                    yr = 2*out.shape[1] - (yc+d + 2)
            # Nine taps, row-major over (xl, xc, xr) x (yl, yc, yr).
            tmp = cuda.fma(inp[j,xl,yl],fshared[idx], tmp)
            tmp = cuda.fma(inp[j,xl,yc],fshared[idx+1], tmp)
            tmp = cuda.fma(inp[j,xl,yr],fshared[idx+2], tmp)
            tmp = cuda.fma(inp[j,xc,yl],fshared[idx+3], tmp)
            tmp = cuda.fma(inp[j,xc,yc],fshared[idx+4], tmp)
            tmp = cuda.fma(inp[j,xc,yr],fshared[idx+5], tmp)
            tmp = cuda.fma(inp[j,xr,yl],fshared[idx+6], tmp)
            tmp = cuda.fma(inp[j,xr,yc],fshared[idx+7], tmp)
            tmp = cuda.fma(inp[j,xr,yr],fshared[idx+8], tmp)
            idx+=9
        out[xc,yc] += tmp
class Smart2FluxDelimiter(FluxDelimiter):
    """Piecewise flux delimiter evaluated element-wise through a numba ufunc."""

    @staticmethod
    @vectorize([
        float32(float32, float32, float32),
        float64(float64, float64, float64),
    ])
    def __call__(phi_p, tetha_f, tetha_p):
        # First segment: up to tetha_p / 3.
        if phi_p<=tetha_p/3:
            return tetha_f/tetha_p*phi_p*(1-3*tetha_p+2*tetha_f)/(1-tetha_p)

        # Second segment: linear in phi_p.
        if phi_p<=tetha_p/tetha_f*(1+tetha_f-tetha_p):
            return (tetha_f/tetha_p)*((1-tetha_f)/(1-tetha_p))*phi_p + (tetha_f/(1-tetha_p))*(tetha_f-tetha_p)

        # Plateau up to phi_p == 1.
        if phi_p<=1:
            return 1.

        # Beyond 1 the input is passed through unchanged.
        return phi_p
class MinMod2FluxDelimiter(FluxDelimiter):
    """Two-segment flux delimiter evaluated element-wise through a numba ufunc."""

    @staticmethod
    @vectorize([
        float32(float32, float32, float32),
        float64(float64, float64, float64),
    ])
    def __call__(phi_p, tetha_f, tetha_p):
        # First segment: straight line through the origin.
        if phi_p <= tetha_p:
            return tetha_f / tetha_p * phi_p

        # Second segment: straight line up to phi_p == 1.
        if phi_p <= 1:
            return ((1 - tetha_f) * phi_p + (tetha_f - tetha_p)) / (1 - tetha_p)

        # Beyond 1 the input is passed through unchanged.
        return phi_p
def test_wrapper_address_protocol_libm(self):
    """Call cos and sinf from standard math library.

    A small ``WrapperAddressProtocol`` implementation exposes the address
    and numba signature of a C math function so that jit-compiled code can
    call it as a first-class function.
    """
    import os
    import ctypes.util

    class LibM(types.WrapperAddressProtocol):
        """Address/signature wrapper for one function of the C math library."""

        def __init__(self, fname):
            # msvcrt carries the math functions on Windows; elsewhere libm.
            if os.name == 'nt':
                lib = ctypes.cdll.msvcrt
            else:
                libpath = ctypes.util.find_library('m')
                lib = ctypes.cdll.LoadLibrary(libpath)
            self.lib = lib
            self._name = fname
            if fname == 'cos':
                addr = ctypes.cast(self.lib.cos, ctypes.c_voidp).value
                signature = float64(float64)
            elif fname == 'sinf':
                addr = ctypes.cast(self.lib.sinf, ctypes.c_voidp).value
                signature = float32(float32)
            else:
                # No signature is known for other names, so the message must
                # not try to format one (that raised UnboundLocalError and
                # hid the intended NotImplementedError).
                raise NotImplementedError(
                    f'wrapper address of `{fname}` is not implemented')
            self._signature = signature
            self._address = addr

        def __repr__(self):
            return f'{type(self).__name__}({self._name!r})'

        def __wrapper_address__(self):
            return self._address

        def signature(self):
            return self._signature

    mycos = LibM('cos')
    mysin = LibM('sinf')

    def myeval(f, x):
        return f(x)

    # Not testing forceobj=True as it requires implementing
    # LibM.__call__ using ctypes which would be out-of-scope here.
    for jit_opts in [dict(nopython=True)]:
        jit_ = jit(**jit_opts)
        with self.subTest(jit=jit_opts):
            self.assertEqual(jit_(myeval)(mycos, 0.0), 1.0)
            self.assertEqual(jit_(myeval)(mysin, float32(0.0)), 0.0)
def rk4_rV(it, nrV, rti, Vti, o_tau, pi, tau, Delta, eta, J, I, cr, rc, cv, Vc, r_sigma, V_sigma, z0, z1):
    """Advance ``(r, V)`` by one fourth-order Runge-Kutta step plus noise.

    Derivatives come from ``dr_`` and ``dV_``; the step size ``dt`` and the
    constants ``o_6`` and ``sqrt_dt`` are module-level names. The new values
    are written to ``nrV[0, it]`` (r, clamped to be non-negative) and
    ``nrV[1, it]`` (V).
    """
    half = nb.float32(0.5)
    whole = nb.float32(1.0)

    # Stage 0: slope at the current state.
    dr_0 = dr_(rti, Vti, o_tau, pi, tau, Delta)
    dV_0 = dV_(rti, Vti, o_tau, pi, tau, eta, J, I, cr, rc, cv, Vc)

    # Stage 1: slope at the midpoint reached with the stage-0 slope.
    r_a = rti + dt * half * dr_0
    V_a = Vti + dt * half * dV_0
    dr_1 = dr_(r_a, V_a, o_tau, pi, tau, Delta)
    dV_1 = dV_(r_a, V_a, o_tau, pi, tau, eta, J, I, cr, rc, cv, Vc)

    # Stage 2: slope at the midpoint reached with the stage-1 slope.
    r_b = rti + dt * half * dr_1
    V_b = Vti + dt * half * dV_1
    dr_2 = dr_(r_b, V_b, o_tau, pi, tau, Delta)
    dV_2 = dV_(r_b, V_b, o_tau, pi, tau, eta, J, I, cr, rc, cv, Vc)

    # Stage 3: slope at the end point reached with the stage-2 slope.
    r_c = rti + dt * whole * dr_2
    V_c = Vti + dt * whole * dV_2
    dr_3 = dr_(r_c, V_c, o_tau, pi, tau, Delta)
    dV_3 = dV_(r_c, V_c, o_tau, pi, tau, eta, J, I, cr, rc, cv, Vc)

    # Weighted combination of the four slopes plus the stochastic term.
    nrV[0, it] = rti + o_6 * dt * (dr_0 + 2 * (dr_1 + dr_2) + dr_3) + sqrt_dt * r_sigma * z0
    # Multiplying by the comparison result zeroes any negative r.
    nrV[0, it] *= nrV[0, it] > 0
    nrV[1, it] = Vti + o_6 * dt * (dV_0 + 2 * (dV_1 + dV_2) + dV_3) + sqrt_dt * V_sigma * z1
def ax2hoL(axIn,p=P):
    """Convert axis-angle rows to three-component vectors.

    The input is normalised by ``prepIn`` and ``axnormL``. For every row the
    axis (columns 0-2) is scaled by ``(0.75 * (w - sin(w))) ** (1/3)`` where
    ``w`` is the angle in column 3.

    Parameters:
        axIn: axis-angle input accepted by ``prepIn``.
        p: kept for backward compatibility; it does not influence the result
           (the sign factor previously derived from it was never used).

    Returns:
        ndarray of shape ``(n, 3)`` with the dtype reported by ``prepIn``.
    """
    ax,m,n,intype = prepIn(axIn)
    ho = np.zeros((n,3),dtype=intype)
    axn = axnormL(ax)
    for i in numba.prange(n):
        f = 0.75 * (axn[i,3] - np.sin(axn[i,3]))
        f = f ** (1.0 / 3.0)
        for j in range(3):
            ho[i,j] = f * axn[i,j]
    return ho
def test_cuda_vectorize_device_call(self):
    """A CUDA-vectorized function may call a CUDA device function."""
    signature = float32(float32, float32, float32)

    @cuda.jit(signature, device=True)
    def cu_device_fn(x, y, z):
        return x ** y / z

    def cu_ufunc(x, y, z):
        return cu_device_fn(x, y, z)

    make_ufunc = vectorize([float32(float32, float32, float32)], target='cuda')
    ufunc = make_ufunc(cu_ufunc)

    N = 100
    X = np.array(np.random.sample(N), dtype=np.float32)
    Y = np.array(np.random.sample(N), dtype=np.float32)
    # Offset keeps the divisor away from zero.
    Z = np.array(np.random.sample(N), dtype=np.float32) + 0.1

    result = ufunc(X, Y, Z)
    reference = (X ** Y) / Z

    self.assertTrue(np.allclose(result, reference))
def cu_sums4(nme, member, vel, virial_potential, coll, nblocks):
    """Per-block sums of six per-particle quantities.

    For each of the first ``nme`` entries of ``member`` the kernel gathers
    ``m * |v|^2``, the two components of ``virial_potential`` and the three
    momentum components ``m * v``. Each block reduces them in shared memory
    and writes its six partial sums to ``coll`` at ``blockIdx.x + k * nblocks``
    for ``k = 0..5``.

    NOTE(review): the shared buffer and segment offsets are hard-coded for
    256 threads while the reduction starts from ``blockDim.x >> 1``;
    presumably the kernel must be launched with 256 threads per block
    (a power of two) - confirm at the call site.
    """
    # Offsets of the six segments inside the shared buffer.
    block_size = 256
    block_size2 = block_size * 2
    block_size3 = block_size * 3
    block_size4 = block_size * 4
    block_size5 = block_size * 5
    sm = cuda.shared.array(256 * 6, nb.float32)
    i = cuda.grid(1)
    tx = cuda.threadIdx.x
    # Threads beyond ``nme`` contribute zeros to every sum.
    temp = nb.float32(0.0)
    virial = nb.float32(0.0)
    potential = nb.float32(0.0)
    mx = nb.float32(0.0)
    my = nb.float32(0.0)
    mz = nb.float32(0.0)
    if i < nme:
        idx = member[i]
        vi = vel[idx]
        # Component 3 of the velocity row is used as the mass.
        mi = vi[3]
        vp = virial_potential[idx]
        temp = mi * (vi[0] * vi[0] + vi[1] * vi[1] + vi[2] * vi[2])
        virial = vp[0]
        potential = vp[1]
        mx = vi[0] * mi
        my = vi[1] * mi
        mz = vi[2] * mi
    sm[tx] = temp
    sm[tx + block_size] = virial
    sm[tx + block_size2] = potential
    sm[tx + block_size3] = mx
    sm[tx + block_size4] = my
    sm[tx + block_size5] = mz
    cuda.syncthreads()
    # Pairwise reduction: the active range halves on every pass.
    offs = cuda.blockDim.x >> 1
    while offs > 0:
        if tx < offs:
            sm[tx] += sm[tx + offs]
            sm[tx + block_size] += sm[tx + block_size + offs]
            sm[tx + block_size2] += sm[tx + block_size2 + offs]
            sm[tx + block_size3] += sm[tx + block_size3 + offs]
            sm[tx + block_size4] += sm[tx + block_size4 + offs]
            sm[tx + block_size5] += sm[tx + block_size5 + offs]
        offs >>= 1
        cuda.syncthreads()
    # Thread 0 publishes the block's six totals.
    if tx == 0:
        coll[cuda.blockIdx.x] = sm[0]
        coll[cuda.blockIdx.x + nblocks] = sm[block_size]
        coll[cuda.blockIdx.x + nblocks * 2] = sm[block_size2]
        coll[cuda.blockIdx.x + nblocks * 3] = sm[block_size3]
        coll[cuda.blockIdx.x + nblocks * 4] = sm[block_size4]
        coll[cuda.blockIdx.x + nblocks * 5] = sm[block_size5]
def test_cuda_vectorize_device_call(self):
    """Vectorize the module-level ``cu_ufunc`` for CUDA and compare with NumPy."""
    make_ufunc = vectorize([float32(float32, float32, float32)], target='cuda')
    ufunc = make_ufunc(cu_ufunc)

    N = 100
    X = np.array(np.random.sample(N), dtype=np.float32)
    Y = np.array(np.random.sample(N), dtype=np.float32)
    # Offset keeps the divisor away from zero.
    Z = np.array(np.random.sample(N), dtype=np.float32) + 0.1

    result = ufunc(X, Y, Z)
    reference = (X ** Y) / Z

    self.assertTrue(np.allclose(result, reference))
def make_euler(dt, f, n_svar, n_step):
    """Construct CUDA device function for Euler scheme.

    Parameters:
        dt: integration step size; cast to float32 once, outside the kernel.
        f: device function ``f(dX, X, I)`` that writes the derivative of the
           state ``X`` into ``dX``.
        n_svar: number of state variables (length of ``X`` and ``dX``).
        n_step: number of Euler steps performed per call of the scheme.

    Returns:
        A device function ``scheme(X, I)`` that updates ``X`` in place.
    """
    n_step = int32(n_step)
    dt = float32(dt)

    @cuda.jit(device=True)
    def scheme(X, I):
        dX = cuda.local.array((n_svar, ), float32)
        for i in range(n_step):
            f(dX, X, I)
            for j in range(n_svar):
                # Forward Euler: x <- x + dt * dx/dt. The step size was
                # previously cast but never applied.
                X[j] += dt * dX[j]

    return scheme
def prepare_legendre(order, numba=True):
    """Return a callable evaluating the Legendre polynomial of ``order``.

    Parameters:
        order: polynomial order; only 1 and 2 are implemented.
        numba: when true the polynomial is wrapped in a numba ``vectorize``
            ufunc for float32/float64; otherwise the plain Python function
            is returned.

    Raises:
        NotImplementedError: for any other ``order``, including values that
            are not integers.
    """
    if order == 1:
        def P(x):
            return x
    elif order == 2:
        def P(x):
            return 0.5 * (3.0*x**2 - 1.0)
    else:
        # "{}" rather than "{:d}": the latter raises ValueError/TypeError for
        # a non-integer order and would hide this error.
        raise NotImplementedError("Order {} of Legendre polynomial has not been implemented".format(order))

    if numba:
        vectorizing_factory = vectorize([float32(float32), float64(float64)], nopython=True)
        return vectorizing_factory(P)
    return P
def calculate_forces(positions, weights, accelerations):
    """
    Calculate accelerations produced on all bodies by mutual
    gravitational forces.

    Bodies are processed in tiles of ``tile_size``: each pass stages one
    tile of positions and weights in shared memory and lets
    ``tile_calculation`` add its contribution to this thread's body.
    """
    sh_positions = cuda.shared.array((tile_size, 2), float32)
    sh_weights = cuda.shared.array(tile_size, float32)

    i = cuda.grid(1)
    # Slot of this thread inside the shared tile.
    slot = cuda.threadIdx.x

    xi = positions[i,0]
    yi = positions[i,1]
    axi = float32(0.0)
    ayi = float32(0.0)

    for j in range(0, len(weights), tile_size):
        # Body this thread loads for the current tile.
        source = (j // tile_size) * cuda.blockDim.x + cuda.threadIdx.x

        sh_positions[slot,0] = positions[source,0]
        sh_positions[slot,1] = positions[source,1]
        sh_weights[slot] = weights[source]

        # The whole tile must be loaded before anyone reads it ...
        cuda.syncthreads()
        axi, ayi = tile_calculation(xi, yi, axi, ayi, sh_positions, sh_weights)
        # ... and fully consumed before it is overwritten.
        cuda.syncthreads()

    accelerations[i,0] = axi
    accelerations[i,1] = ayi
def _sinewave(num, den):
    """Generate a complex sine wave of frequency sample_rate*num/den.

    The length is chosen such that repeating the returned signal gives a
    continuous sine wave."""
    # The general path below fails for num=0, so that case is special.
    if num == 0:
        return nb.complex64([1.0])

    # "% den" is not strictly necessary, but wrapping the phase with
    # integers may avoid loss of floating point precision.
    wrapped = np.arange(0, num*den, num, dtype = np.int64) % den
    radians_per_step = nb.float32(2.0 * np.pi / den)
    phase = wrapped.astype(np.float32) * radians_per_step

    real_part = np.cos(phase)
    imag_part = np.sin(phase) * nb.complex64(1j)
    return real_part + imag_part
def test_4(self):
    """Run ``func3`` with two identical operands for each of four dtypes."""
    sig = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    func = self.funcs['func3']

    A = np.arange(100, dtype=np.float64)
    # Each array is converted from the previous one, in this order.
    for dtype in (np.float64, np.float32, np.int32, np.uint32):
        A = A.astype(dtype)
        self._run_and_compare(func, sig, A, A)
def fit(self, X, y, train_indices, valid_indices, sample_weights):
    """Grow a regression tree on ``X`` / ``y`` and store it on the estimator.

    Returns ``self``; the tree, its context and the train/validation indices
    are kept as private attributes.
    """
    max_bins = self.n_bins - 1
    random_state = self.random_state

    # TODO: this information will be obtained from the binner held by the forest
    n_samples, n_features = X.shape
    n_bins_per_feature = (max_bins * np.ones(n_features)).astype(np.intp)

    # The tree object is mostly a data container for the nodes.
    tree = _TreeRegressor(n_features, random_state)

    # The tree context holds global information about the data, in
    # particular how samples are organised into contiguous node indexes
    # for both training and validation samples.
    tree_context = TreeRegressorContext(
        X,
        y,
        sample_weights,
        train_indices,
        valid_indices,
        max_bins,
        n_bins_per_feature,
        uintp(self.max_features),
        self.aggregation,
        float32(self.step),
    )
    node_context = NodeRegressorContext(tree_context)

    # Scratch split records reused while searching for the best split.
    best_split = SplitRegressor()
    candidate_split = SplitRegressor()

    grow(
        tree,
        tree_context,
        node_context,
        compute_node_regressor_context,
        find_best_split_regressor_along_feature,
        copy_split_regressor,
        best_split,
        candidate_split,
    )

    self._train_indices = train_indices
    self._valid_indices = valid_indices
    self._tree = tree
    self._tree_context = tree_context
    return self
def __init__(self, X_binned, max_bins, n_bins_per_feature, gradients, hessians, l2_regularization, min_hessian_to_split=1e-3, min_samples_leaf=20, min_gain_to_split=0.):
    """Store the binned data, gradients/hessians and splitting parameters.

    Also precomputes the root-node sums and allocates the sample partition
    together with the two scratch buffers used when splitting.
    """
    n_samples = X_binned.shape[0]

    self.X_binned = X_binned
    self.n_features = X_binned.shape[1]
    # Note: all histograms will have <max_bins> bins, but some of the
    # last bins may be unused if n_bins_per_feature[f] < max_bins
    self.max_bins = max_bins
    self.n_bins_per_feature = n_bins_per_feature

    self.gradients = gradients
    self.hessians = hessians
    # for root node, gradients and hessians are already ordered
    self.ordered_gradients = gradients.copy()
    self.ordered_hessians = hessians.copy()
    self.sum_gradients = self.gradients.sum()
    self.sum_hessians = self.hessians.sum()

    # A single-element hessian array means the hessian is constant.
    self.constant_hessian = hessians.shape[0] == 1

    # Regularisation and stopping parameters.
    self.l2_regularization = l2_regularization
    self.min_hessian_to_split = min_hessian_to_split
    self.min_samples_leaf = min_samples_leaf
    self.min_gain_to_split = min_gain_to_split

    if self.constant_hessian:
        self.constant_hessian_value = self.hessians[0]  # 1 scalar
    else:
        self.constant_hessian_value = float32(1.)  # won't be used anyway

    # The partition array maps each sample index into the leaves of the
    # tree (a leaf in this context is a node that isn't split yet, not
    # necessarily a 'finalized' leaf). Initially, the root contains all
    # the indices, e.g.:
    # partition = [abcdefghijkl]
    # After a call to split_indices, it may look e.g. like this:
    # partition = [cef|abdghijkl]
    # we have 2 leaves, the left one is at position 0 and the second one at
    # position 3. The order of the samples is irrelevant.
    self.partition = np.arange(0, n_samples, 1, np.uint32)

    # buffers used in split_indices to support parallel splitting.
    self.left_indices_buffer = np.empty_like(self.partition)
    self.right_indices_buffer = np.empty_like(self.partition)
def raycast(sx, sy, camera, world, texture_map, textures): fx = nb.float32(sx * 2 - 1) fy = nb.float32(sy * 2 - 1) dx = nb.float32(camera.plane_offset.x + camera.plane_x_size.x * fx + camera.plane_y_size.x * fy) dy = nb.float32(camera.plane_offset.y + camera.plane_x_size.y * fx + camera.plane_y_size.y * fy) dz = nb.float32(camera.plane_offset.z + camera.plane_x_size.z * fx + camera.plane_y_size.z * fy) ddx = nb.float32(abs(1 / dx)) ddy = nb.float32(abs(1 / dy)) ddz = nb.float32(abs(1 / dz)) tx = int(camera.pos.x // 1) ty = int(camera.pos.y // 1) tz = int(camera.pos.z // 1) ox = camera.pos.x % 1 oy = camera.pos.y % 1 oz = camera.pos.z % 1 sx = nb.cuda.selp(dx < 0, -1, 1) ox = nb.cuda.selp(dx < 0, ox, (1 - ox)) * ddx sy = nb.cuda.selp(dy < 0, -1, 1) oy = nb.cuda.selp(dy < 0, oy, (1 - oy)) * ddy sz = nb.cuda.selp(dz < 0, -1, 1) oz = nb.cuda.selp(dz < 0, oz, (1 - oz)) * ddz finished = False while not finished: if oz > ox < oy: ox += ddx tx += sx side = int(0 + (sx + 1) // 2) elif oz > oy < ox: oy += ddy ty += sy side = int(2 + (sy + 1) // 2) else: oz += ddz tz += sz side = int(4 + (sz + 1) // 2) if not ((fx := (0 <= tx < world.shape[0])) and (fy := (0 <= ty < world.shape[1])) and (fz := (0 <= tz < world.shape[2]))):
def find_node_split_subtraction(context, sample_indices, parent_histograms, sibling_histograms):
    """For each feature, find the best bin to split by histogram subtraction.

    This in turn calls _find_histogram_split_subtraction, which does not
    need to scan the samples of this node and can therefore be
    significantly faster than computing the histograms from data.

    Returns the best SplitInfo among all features, along with all the
    feature histograms, which can later be used to compute the sibling or
    children histograms by subtraction.
    """
    n_samples = sample_indices.shape[0]
    parent_first = parent_histograms[0]
    sibling_first = sibling_histograms[0]

    # Any feature (here the first) can be used to compute the gradient sum:
    # it must be the same across all features, and tests ensure this. An
    # average might be more robust but is probably not worth it.
    context.sum_gradients = (parent_first['sum_gradients'].sum() -
                             sibling_first['sum_gradients'].sum())

    if context.constant_hessian:
        context.sum_hessians = \
            context.constant_hessian_value * float32(n_samples)
    else:
        context.sum_hessians = (parent_first['sum_hessians'].sum() -
                                sibling_first['sum_hessians'].sum())

    # Pre-allocate the result containers so that prange can fill them.
    split_infos = [SplitInfo(-1., 0, 0, 0., 0., 0., 0., 0, 0)
                   for i in range(context.n_features)]
    histograms = np.empty(
        shape=(np.int64(context.n_features), np.int64(context.n_bins)),
        dtype=HISTOGRAM_DTYPE
    )

    for feature_idx in prange(context.n_features):
        split_info, histogram = _find_histogram_split_subtraction(
            context, feature_idx, parent_histograms,
            sibling_histograms, n_samples)
        split_infos[feature_idx] = split_info
        histograms[feature_idx, :] = histogram

    split_info = _find_best_feature_to_split_helper(split_infos)
    return split_info, histograms
def cu_cell_build(npa, pos, dim, box_low_boundary, inv_width, cell_size, cell_list, situation):
    """Assign each of the first ``npa`` positions to a cell of a 3-D grid.

    One thread handles one particle. The particle's coordinates and index are
    appended to ``cell_list[cell_id]`` and ``cell_size[cell_id]`` is
    incremented atomically. Problems are reported through ``situation``:

    * ``situation[0]``: ``i + 1`` of a particle with a NaN coordinate,
    * ``situation[1]``: ``i + 1`` of a particle outside the box or mapping to
      a cell id beyond ``cell_list``,
    * ``situation[2]``: largest required slot count (``size + 1``) seen for a
      cell that overflowed ``cell_list.shape[1]``.
    """
    i = cuda.grid(1)
    if i < npa:
        # pi = pos[i]
        pix = pos[i][0]
        piy = pos[i][1]
        piz = pos[i][2]
        if math.isnan(pix) or math.isnan(piy) or math.isnan(piz):
            situation[0] = i + nb.int32(1)
            return
        # The upper bound is taken as the negated lower bound, i.e. the box
        # is treated as symmetric about the origin.
        if pix < box_low_boundary[0] or pix >= -box_low_boundary[
                0] or piy < box_low_boundary[1] or piy >= -box_low_boundary[
                1] or piz < box_low_boundary[
                2] or piz >= -box_low_boundary[2]:
            situation[1] = i + nb.int32(1)
            return
        # Offset from the low corner, then scale to integer cell coordinates.
        dpix = pix - box_low_boundary[0]
        dpiy = piy - box_low_boundary[1]
        dpiz = piz - box_low_boundary[2]
        ix = nb.int32(dpix * inv_width[0])
        iy = nb.int32(dpiy * inv_width[1])
        iz = nb.int32(dpiz * inv_width[2])
        # An index equal to the grid dimension is wrapped back to cell 0.
        if ix == dim[0]:
            ix = nb.int32(0)
        if iy == dim[1]:
            iy = nb.int32(0)
        if iz == dim[2]:
            iz = nb.int32(0)
        # Flatten (ix, iy, iz) with iz varying fastest.
        cell_id = iz + dim[2] * (iy + ix * dim[1])
        if cell_id >= cell_list.shape[0]:
            situation[1] = i + nb.int32(1)
            return
        # atomic.add returns the previous count, which is this particle's slot.
        size = cuda.atomic.add(cell_size, cell_id, nb.int32(1))
        if size < cell_list.shape[1]:
            cell_list[cell_id][size][0] = pix
            cell_list[cell_id][size][1] = piy
            cell_list[cell_id][size][2] = piz
            # The particle index is stored as a float next to its position.
            cell_list[cell_id][size][3] = nb.float32(i)
        else:
            cuda.atomic.max(situation, nb.int32(2), size + nb.int32(1))
def om2axL(om, p=P):  # deprecated: now use qu2ax(om2qu()) -- kept for historical reasons.
    """Convert 3x3 matrices to axis-angle rows.

    ``om`` is indexed as ``om[i, row, col]``; the result has shape ``(n, 4)``
    with the axis in columns 0-2 and the angle in column 3. The sign factor
    ``pf`` is +1 when ``p > 0`` and -1 otherwise. Depends on the module-level
    names ``eps`` and ``axnormL``.
    """
    pf = numba.float32(p > 0) * 2.0 - 1.0
    intype = om.dtype
    n = np.int64(om.size / 9)
    ax = np.zeros((n, 4), dtype=intype)
    # help for translating out of C version
    # [0,0], [0,1], [0, 2], [1,0], [1,1], [1,2], [2,0], [2,1], [2,2]
    #   0      1      2       3      4      5      6      7      8
    for i in numba.prange(n):
        # Angle from the trace, clamped into the domain of arccos.
        tr = om[i,0,0] + om[i,1,1] + om[i,2,2]
        t = 0.50 * (tr - 1.0)
        t = 1.0 if (t > 1.0) else t
        t = -1.0 if (t < -1.0) else t
        ax[i,3] = np.arccos(t)
        #if ((1.0 - np.abs(t)) > eps):
        # Axis from the antisymmetric part of the matrix.
        mag = numba.float64(0.0)
        ax[i,2] = pf * (om[i,1,0] - om[i,0,1])
        mag += ax[i,2]*ax[i,2]
        ax[i,1] = pf * (om[i,0,2] - om[i,2,0])
        mag += ax[i,1] * ax[i,1]
        ax[i,0] = pf * (om[i,2,1] - om[i,1,2])
        mag += ax[i,0] * ax[i,0]
        mag = np.sqrt(mag)
        if mag > eps:
            for j in range(3):
                ax[i,j] *= 1.0/mag
        else:
            # Antisymmetric part vanishes: use a fixed axis for t > 0,
            # otherwise rebuild the axis from the diagonal.
            if t > 0.0:
                ax[i, 0] = 0.0
                ax[i, 1] = 0.0
                ax[i, 2] = -1.0*pf
            else:
                d = np.zeros(3,dtype=intype)
                for j in range(3):
                    d[j] = np.sqrt(0.5*(om[i,j,j]+1.0))
                # The largest component (dargsrt[2]) is kept; the other two
                # are recomputed from symmetric off-diagonal sums divided by it.
                dargsrt = np.argsort(d)
                d[dargsrt[1]] = (om[i, dargsrt[2], dargsrt[1]] + om[i, dargsrt[1], dargsrt[2] ]) / (4.0 * d[dargsrt[2]])
                d[dargsrt[0]] = (om[i, dargsrt[2], dargsrt[0]] + om[i, dargsrt[0], dargsrt[2]]) / (4.0 * d[dargsrt[2]])
                for j in range(3):
                    ax[i,j] = pf*d[j]
    ax = axnormL(ax)
    return ax
def _test_template_4(self, target):
    """A four-signature ``vector_add`` ufunc on ``target`` must agree with ``np.add``."""
    sig = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    basic_ufunc = vectorize(sig, target=target)(vector_add)
    np_ufunc = np.add

    def test(ty):
        data = np.linspace(0., 100., 500).astype(ty)
        result = basic_ufunc(data, data)
        gold = np_ufunc(data, data)
        np.testing.assert_allclose(gold, result)

    for ty in (np.double, np.float32, np.int32, np.uint32):
        test(ty)
def _test_template_4(self, target):
    """A four-signature ``vector_add`` ufunc on ``target`` must be close to ``np.add``."""
    sig = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    basic_ufunc = vectorize(sig, target=target)(vector_add)
    np_ufunc = np.add

    def test(ty):
        data = np.linspace(0., 100., 500).astype(ty)
        result = basic_ufunc(data, data)
        gold = np_ufunc(data, data)
        self.assertTrue(np.allclose(gold, result))

    for ty in (np.double, np.float32, np.int32, np.uint32):
        test(ty)
def ho2axL(hoIn,p=P):
    """Convert three-component vectors to axis-angle rows.

    The input is normalised by ``prepIn``; the result has shape ``(n, 4)``
    with a unit axis in columns 0-2 and the angle in column 3. The angle is
    obtained from a 16-term polynomial in the squared vector length.
    Depends on the module-level names ``eps`` and ``PI``.

    NOTE(review): presumably the input is a homochoric vector, judging by the
    function name - confirm.
    """
    # +1 when p > 0, otherwise -1; only used for the zero-length case.
    pf = numba.float32(p > 0) * 2.0 - 1.0
    ho,m,n,intype = prepIn(hoIn)
    ax = np.zeros((n,4),dtype=intype)
    # Polynomial coefficients, lowest order first.
    tfit = np.array([1.0000000000018852,-0.5000000002194847,-0.024999992127593126,
                     -0.003928701544781374,-0.0008152701535450438,-0.0002009500426119712,
                     -0.00002397986776071756,-0.00008202868926605841,0.00012448715042090092,
                     -0.0001749114214822577,0.0001703481934140054,-0.00012062065004116828,
                     0.000059719705868660826,-0.00001980756723965647,0.000003953714684212874,
                     -0.00000036555001439719544],dtype=np.float64)
    for i in numba.prange(n):
        # Squared length of the input vector.
        hmag = np.float64(0.0)
        for j in range(3):
            hmag += ho[i,j] * ho[i,j]
        if hmag < eps:
            # Zero-length input: fixed axis and zero angle.
            ax[i,0] = 0.0
            ax[i,1] = 0.0
            ax[i,2] = -1.0 * pf
            ax[i,3] = 0.0
        else:
            hm = hmag
            sqrthm = np.sqrt(hm)
            hn = np.zeros(3,dtype=intype)
            for j in range(3):
                hn[j] = ho[i,j] / sqrthm
            # hn = ho[i,:]/sqrthm
            # Evaluate the polynomial; hm holds successive powers of hmag.
            s = tfit[0] + tfit[1] * hmag
            for j in range(2,16):
                hm *= hmag
                s += tfit[j] * hm
            # Clamp into the domain of arccos.
            s = 1.0 if (s > 1.0) else s
            s = -1.0 if (s < -1.0) else s
            s = 2.0 * np.arccos(s)
            for j in range(3):
                ax[i,j] = hn[j]
            # Snap angles within eps of PI to exactly PI.
            if np.abs(s - PI) < eps:
                ax[i,3] = PI
            else:
                ax[i,3] = s
    return ax
def __init__(self, fname):
    """Load the C math library and record the address and signature of ``fname``.

    Only ``'cos'`` (``float64(float64)``) and ``'sinf'``
    (``float32(float32)``) are supported.

    Raises:
        NotImplementedError: for any other function name.
    """
    # msvcrt carries the math functions on Windows; elsewhere libm.
    if os.name == 'nt':
        lib = ctypes.cdll.msvcrt
    else:
        libpath = ctypes.util.find_library('m')
        lib = ctypes.cdll.LoadLibrary(libpath)
    self.lib = lib
    self._name = fname
    if fname == 'cos':
        addr = ctypes.cast(self.lib.cos, ctypes.c_voidp).value
        signature = float64(float64)
    elif fname == 'sinf':
        addr = ctypes.cast(self.lib.sinf, ctypes.c_voidp).value
        signature = float32(float32)
    else:
        # No signature is known for other names, so the message must not try
        # to format one (that raised UnboundLocalError and hid the intended
        # NotImplementedError).
        raise NotImplementedError(
            f'wrapper address of `{fname}` is not implemented')
    self._signature = signature
    self._address = addr
def _compile(cls, formula):
    """Return a cached evaluator for ``formula``, building it on first use.

    Formulas containing the character ``x`` become a numba-vectorized
    function of ``x``; any other formula is evaluated once to a constant and
    wrapped in a function that writes that constant into ``out[where]``.
    """
    with BTagScaleFactor._formulaLock:
        cache = BTagScaleFactor._formulaCache
        if formula in cache:
            return cache[formula]

        # NOTE(review): eval() runs the formula text as Python code; this is
        # only safe if formulas come from a trusted source - confirm.
        namespace = {'log': numpy.log, 'sqrt': numpy.sqrt}
        if 'x' in formula:
            feval = eval('lambda x: ' + formula, namespace)
            signatures = [
                numba.float32(numba.float32),
                numba.float64(numba.float64),
            ]
            out = numba.vectorize(signatures)(feval)
        else:
            val = eval(formula, namespace)

            def duck(_, out, where):
                out[where] = val

            out = duck

        cache[formula] = out
        return out
def xoroshiro128p_normal_float32(states, index):
    '''Return a normally distributed float32 and advance ``states[index]``.

    The value is drawn from a Gaussian with mean=0 and sigma=1 via the
    Box-Muller transform, which consumes two uniform samples and therefore
    advances the RNG sequence by two steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float32
    '''
    index = int64(index)

    u1 = xoroshiro128p_uniform_float32(states, index)
    u2 = xoroshiro128p_uniform_float32(states, index)

    radius = math.sqrt(-float32(2.0) * math.log(u1))
    angle = TWO_PI_FLOAT32 * u2

    # Box-Muller also yields radius * sin(angle); that second normal value
    # is discarded.
    return radius * math.cos(angle)
def template_vectorize(self, target):
    """A four-signature ``vector_add`` ufunc on ``target`` must be close to ``np.add``."""
    # Native-code ufunc under test.
    sig = [
        int32(int32, int32),
        uint32(uint32, uint32),
        float32(float32, float32),
        float64(float64, float64),
    ]
    basic_ufunc = vectorize(sig, target=target)(vector_add)

    # NumPy reference ufunc.
    np_ufunc = np.add

    def test(ty):
        data = np.linspace(0., 100., 500).astype(ty)
        result = basic_ufunc(data, data)
        gold = np_ufunc(data, data)
        self.assertTrue(np.allclose(gold, result))

    for ty in (np.double, np.float32, np.int32, np.uint32):
        test(ty)
def uint64_to_unit_float32(x):
    '''Convert uint64 to a float32 value, nominally in the range [0.0, 1.0).

    The value is produced by ``uint64_to_unit_float64`` and then narrowed to
    float32.

    NOTE(review): narrowing can round a float64 just below 1.0 up to exactly
    1.0 - confirm whether callers rely on the upper bound being exclusive.
    '''
    x = uint64(x)
    return float32(uint64_to_unit_float64(x))
if exposure_indices is None: exposure_indices = np.empty( shape=(0, 2) , dtype=np.int32) for i in range(x.shape[0]): if np.sqrt((x[i]-x0)**2+(y[i]-y0)**2) < r: exposure_indices = np.vstack((exposure_indices,np.array([x[i],y[i]],dtype=np.int32))) return exposure_indices outfilename = 'test.txt' @jit(float32(float32,float32,float32,float32),nopython=True) def dist(x0,y0,x,y): return math.sqrt( (x0-x)*(x0-x)+(y0-y)*(y0-y) ) @jit(void(float32[:,:],int32[:,:],float32[:]),nopython=True,parallel= True) def set_doses_field(field, exposure_indices, doses): for i in prange(doses.shape[0]): field[exposure_indices[i,0],exposure_indices[i,1]] = doses[i] @jit(void(float32[:,:],int32[:,:],float32),nopython=True) def set_target(target, exposure_indices, dose): for i in range(exposure_indices.shape[0]): target[exposure_indices[i,0],exposure_indices[i,1]] = dose @njit(void(float32[:,:],float32[:,:],float32[:],float32[:]),parallel=True) def convolve_with_vector(field,exposure,v,h):
def test_2(self):
    """Run ``func1`` on a float64 range with float64 and float32 signatures registered."""
    signatures = [float64(float64), float32(float32)]
    func = self.funcs['func1']
    values = np.arange(100, dtype=np.float64)
    self._run_and_compare(func, signatures, values)
i = cuda.grid(1) # Map i to array elements if i >= out.size: # Out of range? return # Do actual work out[i] = a * x[i] + y[i] """ Vectorize turns a scalar function into a elementwise operation over the input arrays. """ @vectorize([float32(float32, float32, float32)], target="cuda") def vec_saxpy(a, x, y): ### Task 1 ### # Complete the vectorize version # Hint: this is a scalar function of # float32(float32 a, float32 x, float32 y) return a * x + y # CPU code # --------- NUM_BLOCKS = 1 NUM_THREADS = 32 NELEM = NUM_BLOCKS * NUM_THREADS
for j in xrange(des_ngb): q = neighbor_dists[i, j]/h if q <= 0.5: n_ngb += (1 - 6*q**2 + 6*q**3) elif q <= 1.0: n_ngb += 2*(1-q)**3 n_ngb *= norm if n_ngb > des_ngb: upper = h else: lower = h error = np.fabs(n_ngb-des_ngb) hsml[i] = h return hsml @vectorize([float32(float32), float64(float64)]) def Kernel(q): if q <= 0.5: return 1 - 6*q**2 + 6*q**3 elif q <= 1.0: return 2 * (1-q)**3 else: return 0.0 @jit def DF(f, ngb): df = np.empty(ngb.shape) for i in xrange(ngb.shape[0]): for j in xrange(ngb.shape[1]): df[i,j] = f[ngb[i,j]] - f[i] return df
def cu_template_render_image(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128, tile_size=100): """ CPU part of the SPH render code that executes the rendering on the GPU does some basic particle set prunning and sets up the image tiles. It launches cuda kernels for rendering the individual sections of the image """ import pycuda.driver as drv import pycuda.tools import pycuda.autoinit from pycuda.compiler import SourceModule from radix_sort import radix_sort global_start = time.clock() start = time.clock() # construct an array of particles Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')] ps = drv.pagelocked_empty(len(s),dtype=Partstruct) with s.immediate_mode : ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']] if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start) ymin,ymax = xmin,xmax # ---------------------- # setup the global image # ---------------------- image = np.zeros((nx,ny),dtype=np.float32) dx = float32((xmax-xmin)/nx) dy = float32((ymax-ymin)/ny) x_start = xmin+dx/2 y_start = ymin+dy/2 zplane = 0.0 # ------------------------------------------------------------------------------------------------ # trim particles based on smoothing length -- the GPU will only render those that need < 32 pixels # ------------------------------------------------------------------------------------------------ start = time.clock() # gpu_bool = 2*ps['h'] < 15.*dx ps_gpu = ps#[gpu_bool] # ps_cpu = ps[~gpu_bool] #del(ps) if timing: '<<< Setting up gpu/cpu particle struct arrays took %f s'%(time.clock()-start) # ----------------------------------------------------------------- # set up the image slices -- max. 
size is 100x100 pixels # in this step only process particles that need kernels < 40 pixels # tiles are 100x100 = 1e4 pixels x 4 bytes = 40k # kernels are 31x31 pixels max = 3844 bytes # max shared memory size is 48k # ----------------------------------------------------------------- start = time.clock() tiles_pix, tiles_physical = make_tiles(nx,ny,xmin,xmax,ymin,ymax,tile_size) if timing: print '<<< Tiles made in %f s'%(time.clock()-start) Ntiles = tiles_pix.shape[0] # ------------------ # set up the kernels # ------------------ code = file(os.path.join(os.path.dirname(__file__),'template_kernel.cu')).read() mod = SourceModule(code,options=["--ptxas-options=-v"]) tile_histogram = mod.get_function("tile_histogram") distribute_particles = mod.get_function("distribute_particles") tile_render_kernel = mod.get_function("tile_render_kernel") calculate_keys = mod.get_function("calculate_keys") # ------------------------------------------------------------- # set up streams and figure out particle distributions per tile # ------------------------------------------------------------- # allocate histogram array hist = np.zeros(Ntiles,dtype=np.int32) # transfer histogram array and particle data to GPU hist_gpu = drv.mem_alloc(hist.nbytes) drv.memcpy_htod(hist_gpu,hist) start_g = drv.Event() end_g = drv.Event() start_g.record() ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes) drv.memcpy_htod(ps_on_gpu,ps_gpu) end_g.record() end_g.synchronize() if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g)) # make everything the right size xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax]) nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles]) # ----------------------------- # calculate pixels per particle # ----------------------------- # allocate key arrays -- these will be keys to sort particles into softening bins start_g.record() keys_gpu = drv.mem_alloc(int(4*len(s))) calculate_keys(ps_on_gpu, keys_gpu, np.int32(len(s)), np.float32(dx), 
block=(nthreads,1,1),grid=(1024,1,1)) end_g.record() end_g.synchronize() if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g)) # ---------------------------------------- # sort particles by their softening length # ---------------------------------------- start_g.record() radix_sort(int(keys_gpu), int(ps_on_gpu), np.int32(0), np.int32(len(s))) end_g.record() end_g.synchronize() if timing: print '<<< Radix sorting all tiles took %f ms'%(start_g.time_till(end_g)) start_g.record() tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles, block=(nthreads,1,1),grid=(1024,1,1)) drv.Context.synchronize() drv.memcpy_dtoh(hist,hist_gpu) end_g.record() end_g.synchronize() if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g)) print "<<< Total particle array = %d"%(hist.sum()) # --------------------------------------------------------------------------------- # figured out the numbers of particles per tile -- set up the tile particle buffers # --------------------------------------------------------------------------------- ps_tiles = np.empty(hist.sum(),dtype=Partstruct) ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes) tile_offsets = np.array([0],dtype=np.int32) tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32)) tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes) drv.memcpy_htod(tile_offsets_gpu,tile_offsets) start_g.record() distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)), xmin, xmax, ymin, ymax, nx, ny, Ntiles, block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4) end_g.record() end_g.synchronize() if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g)) drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu) # ------------------------- # start going through tiles # ------------------------- # initialize the image on the device im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes) 
drv.memcpy_htod(im_gpu,image.astype(np.float32)) tile_start = time.clock() streams = [drv.Stream() for i in range(16)] for i in xrange(Ntiles) : n_per_tile = tile_offsets[i+1] - tile_offsets[i] if n_per_tile > 0 : my_stream = streams[i%(16)] xmin_p, xmax_p, ymin_p, ymax_p = tiles_physical[i] xmin_t, xmax_t, ymin_t, ymax_t = tiles_pix[i] nx_tile = xmax_t-xmin_t+1 ny_tile = ymax_t-ymin_t+1 # make everything the right size xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t]) xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p]) if n_per_tile > nthreads*256: ngrid=128 else : ngrid = 64 tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i), xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t, im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]), block=(nthreads,1,1),grid=(ngrid,1,1),stream=my_stream) if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start) # ---------------------------------------------------------------------------------- # process the particles with large smoothing lengths concurrently with GPU execution # ---------------------------------------------------------------------------------- #if ind[1] != len(xs) : # start = time.clock() # image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:], # nx,ny,xmin,xmax,ymin,ymax)).T # if timing: print '<<< Processing %d particles with large smoothing lengths took %e s'%(len(xs)-ind[1], # time.clock()-start) drv.Context.synchronize() if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start) drv.memcpy_dtoh(image,im_gpu) drv.stop_profiler() if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start) del(start_g) del(end_g) return image
from __future__ import print_function, absolute_import import numpy as np from numba import vectorize from numba import cuda, int32, float32, float64 from numba import unittest_support as unittest from numba.cuda.testing import skip_on_cudasim from numba.cuda.testing import CUDATestCase from numba import config sig = [int32(int32, int32), float32(float32, float32), float64(float64, float64)] target='cuda' if config.ENABLE_CUDASIM: target='cpu' test_dtypes = np.float32, np.int32 @skip_on_cudasim('ufunc API unsupported in the simulator') class TestCUDAVectorize(CUDATestCase): N = 1000001 def test_scalar(self): @vectorize(sig, target=target)
def foo(arr, val):
    """Write ``float32(idx) / val`` into ``arr[idx]`` for each in-range index.

    ``idx`` is the value returned by ``cuda.grid(1)``; indices at or beyond
    ``arr.size`` leave the array untouched.
    """
    idx = cuda.grid(1)
    if idx >= arr.size:
        # Index falls outside the array: nothing to write.
        return
    arr[idx] = float32(idx) / val
def _test_template_2(self, target):
    """Compare ``sinc`` built with numba ``vectorize`` for *target* against
    the same function wrapped by ``np.vectorize``."""
    signatures = [float64(float64), float32(float32)]
    build_ufunc = vectorize(signatures, target=target)
    compiled = build_ufunc(sinc)
    reference = np.vectorize(sinc)
    self._run_and_compare(compiled, reference)
"""
Demonstrate broadcasting when a scalar is provided as an argument to a
vectorize function.

Please read NumPy Broadcasting documentation for details about broadcasting:

http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html
"""
from __future__ import print_function
import numpy as np
from numba import vectorize, float32


@vectorize([float32(float32, float32, float32)], target="parallel")
def truncate(x, xmin, xmax):
    """
    Truncate x[:] to [xmin, xmax] interval

    A value below ``xmin`` is replaced by ``xmin``, a value above ``xmax``
    by ``xmax``; any other value is returned unchanged.
    """
    if x < xmin:
        x = xmin
    elif x > xmax:
        x = xmax
    return x


def main():
    # Input data: the values 0..99 as float32.
    x = np.arange(100, dtype=np.float32)
    print("x = %s" % x)

    xmin = np.float32(20)  # as float32 type scalar
    xmax = np.float32(70)  # as float32 type scalar

    # The scalar arguments are broadcasted into an array.
    # This process creates arrays of zero strides.
from __future__ import print_function, absolute_import
import numpy as np
from numba import vectorize
from numba import cuda, int32, float32, float64
from timeit import default_timer as time
from numba import unittest_support as unittest
from numba.cuda.testing import skip_on_cudasim
from numba.cuda.testing import CUDATestCase
from numba import config

# Type signatures passed to @vectorize below: one per supported dtype.
sig = [int32(int32, int32), float32(float32, float32), float64(float64, float64)]

# Fall back to the CPU target when the CUDA simulator is enabled.
target = "cuda"
if config.ENABLE_CUDASIM:
    target = "cpu"

test_dtypes = np.float32, np.int32


@skip_on_cudasim("ufunc API unsupported in the simulator")
class TestCUDAVectorize(CUDATestCase):
    def test_scalar(self):
        """Call a vectorized add with two plain Python float scalars."""

        @vectorize(sig, target=target)
        def vector_add(a, b):
            return a + b

        # Scalars rather than arrays are passed on purpose.
        a = 1.2
        b = 2.3
        c = vector_add(a, b)
xy + \\text{trunc}\\left(\\frac{\\left(\\left|x - y\\right| - 1\\right)^{2}}{4}\\right) Args: x (array): First value array y (array): Second value array Returns: p (array): Pairing function result Note: This function has a vectorized version that is imported as :func:`~exa.algorithms.indexing.unordered_pairing`; use that function when working with array data. .. _pairing function: http://www.mattdipasquale.com/blog/2014/03/09/unique-unordered-pairing-function/ ''' return np.int64(x * y + np.trunc((np.abs(x - y) - 1)**2 / 4)) if global_config['pkg_numba']: from numba import jit, vectorize, int32, int64, float32, float64 arange1 = jit(nopython=True, cache=True)(arange1) arange2 = jit(nopython=True, cache=True)(arange2) indexes_sc1 = jit(nopython=True, cache=True)(indexes_sc1) indexes_sc2 = jit(nopython=True, cache=True)(indexes_sc2) unordered_pairing = vectorize([int32(int32, int32), int64(int64, int64), float32(float32, float32), float64(float64, float64)], nopython=True)(unordered_pairing)
def vector_cross(u, v):
    """
    Return vector cross product of two 3d vectors as numpy array.

    :param u: First 3d vector
    :param v: Second 3d vector
    :return: Cross product of two vectors as numpy.array
    """
    # The result takes its shape and dtype from ``u``.
    res = np.empty_like(u)
    res[0] = u[1] * v[2] - u[2] * v[1]
    res[1] = u[2] * v[0] - u[0] * v[2]
    res[2] = u[0] * v[1] - u[1] * v[0]
    return res


# numba spells array types with slices: ``float32[:]`` is a 1-d float32
# array. An integer index such as ``float32[3]`` does not encode a length
# and is rejected by current numba releases.
@numba.jit(numba.float32(numba.float32[:], numba.float32[:]))
def vector_dot(u, v):
    """
    Return vector dot product of two 3d vectors.

    :param u: First 3d vector
    :param v: Second 3d vector
    :return: Dot product of two vectors
    """
    return u[0]*v[0] + u[1]*v[1] + u[2]*v[2]


@numba.jit(numba.float32(numba.float32[:]))
def vector_len(v):
    """
    Return the Euclidean length of a 3d vector.

    :param v: 3d vector
    :return: Length of the vector
    """
    return math.sqrt(v[0]*v[0] + v[1]*v[1] + v[2]*v[2])
def cu_template_render_image_single(s,nx,ny,xmin,xmax, qty='rho',timing = False, nthreads=128):
    """
    CPU part of the SPH render code that executes the rendering on the GPU

    does some basic particle set pruning and sets up the image
    tiles. It launches cuda kernels for rendering the individual
    sections of the image

    Parameters:
      s: particle container; ``len()``, item access for 'x', 'y', 'mass'
         and 'smooth', and an ``immediate_mode`` context manager are used
      nx, ny: image size in pixels
      xmin, xmax: extent of the image; the same limits are reused for y
      qty: not read by this function -- the 'mass' array is always used
      timing: if true, print how long each stage took
      nthreads: threads per block used for every kernel launch

    Returns:
      image: float32 array of shape (nx, ny) copied back from the GPU

    NOTE(review): as written this function cannot run to completion --
    ``Ntiles`` is read before it is assigned, and ``ps_gpu``, ``streams``,
    ``tiles_physical`` and ``tiles_pix`` are never assigned here. See the
    notes at the individual lines.
    """

    import pycuda.driver as drv
    import pycuda.tools
    import pycuda.autoinit
    from pycuda.compiler import SourceModule
    from radix_sort import radix_sort

    global_start = time.clock()

    start = time.clock()
    # construct an array of particles
    Partstruct = [('x','f4'),('y','f4'),('qt','f4'),('h','f4')]
    ps = drv.pagelocked_empty(len(s),dtype=Partstruct)

    with s.immediate_mode :
        ps['x'],ps['y'],ps['qt'],ps['h'] = [s[arr] for arr in ['x','y','mass','smooth']]

    if timing: print '<<< Forming particle struct took %f s'%(time.clock()-start)

    # y uses the same limits as x
    ymin,ymax = xmin,xmax

    # ----------------------
    # setup the global image
    # ----------------------
    image = np.zeros((nx,ny),dtype=np.float32)

    dx = float32((xmax-xmin)/nx)
    # NOTE(review): dy, x_start, y_start and zplane are not read again below
    dy = float32((ymax-ymin)/ny)

    x_start = xmin+dx/2
    y_start = ymin+dy/2

    zplane = 0.0

    start = time.clock()

    # ------------------
    # set up the kernels
    # ------------------
    # NOTE(review): absolute path into one user's home directory, and the
    # file object is never closed
    code = file('/home/itp/roskar/homegrown/template_kernel.cu').read()
    mod = SourceModule(code)
    tile_histogram = mod.get_function("tile_histogram")
    distribute_particles = mod.get_function("distribute_particles")
    tile_render_kernel = mod.get_function("tile_render_kernel")
    calculate_keys = mod.get_function("calculate_keys")

    # allocate histogram array
    # NOTE(review): Ntiles is assigned further down in this function, which
    # makes it a local name; reading it here happens before that assignment
    hist = np.zeros(Ntiles,dtype=np.int32)

    # transfer histogram array and particle data to GPU
    hist_gpu = drv.mem_alloc(hist.nbytes)
    drv.memcpy_htod(hist_gpu,hist)

    start_g = drv.Event()
    end_g = drv.Event()

    start_g.record()
    # NOTE(review): ps_gpu is never assigned in this function (only ``ps``
    # is); presumably ``ps`` was meant -- confirm
    ps_on_gpu = drv.mem_alloc(ps_gpu.nbytes)
    drv.memcpy_htod(ps_on_gpu,ps_gpu)
    end_g.record()
    end_g.synchronize()

    if timing: print '<<< Particle copy onto GPU took %f ms'%(start_g.time_till(end_g))

    # make everything the right size
    xmin,xmax,ymin,ymax = map(np.float32, [xmin,xmax,ymin,ymax])
    nx,ny,Ntiles = map(np.int32, [nx,ny,Ntiles])

    start_g.record()
    tile_histogram(ps_on_gpu,hist_gpu,np.int32(len(ps_gpu)),xmin,xmax,ymin,ymax,nx,ny,Ntiles,
                   block=(nthreads,1,1),grid=(32,1,1))
    drv.Context.synchronize()
    drv.memcpy_dtoh(hist,hist_gpu)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Tile histogram took %f ms'%(start_g.time_till(end_g))
    # printed regardless of ``timing``
    print "<<< Total particle array = %d"%(hist.sum())

    # ---------------------------------------------------------------------------------
    # figured out the numbers of particles per tile -- set up the tile particle buffers
    # ---------------------------------------------------------------------------------
    ps_tiles = np.empty(hist.sum(),dtype=Partstruct)
    ps_tiles_gpu = drv.mem_alloc(ps_tiles.nbytes)

    # tile i owns the particles in [tile_offsets[i], tile_offsets[i+1])
    tile_offsets = np.array([0],dtype=np.int32)
    tile_offsets = np.append(tile_offsets, hist.cumsum().astype(np.int32))
    tile_offsets_gpu = drv.mem_alloc(tile_offsets.nbytes)
    drv.memcpy_htod(tile_offsets_gpu,tile_offsets)

    start_g.record()
    distribute_particles(ps_on_gpu, ps_tiles_gpu, tile_offsets_gpu, np.int32(len(ps_gpu)),
                         xmin, xmax, ymin, ymax, nx, ny, Ntiles,
                         block=(nthreads,1,1), grid=(np.int(Ntiles),1,1), shared=(nthreads*2+1)*4)
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Particle reshuffling took %f ms'%(start_g.time_till(end_g))
    drv.memcpy_dtoh(ps_tiles, ps_tiles_gpu)

    # -------------------------
    # start going through tiles
    # -------------------------

    # initialize the image on the device
    im_gpu = drv.mem_alloc(image.astype(np.float32).nbytes)
    drv.memcpy_htod(im_gpu,image.astype(np.float32))

    # allocate key arrays -- these will be keys to sort particles into softening bins
    start_g.record()
    keys_gpu = drv.mem_alloc(int(4*hist.sum()))
    calculate_keys(ps_tiles_gpu, keys_gpu, np.int32(hist.sum()), np.float32(dx),
                   block=(nthreads,1,1),grid=(32,1,1))
    end_g.record()
    end_g.synchronize()
    if timing: print '<<< Key generation took %f ms'%(start_g.time_till(end_g))

    keys = np.empty(hist.sum(), dtype=np.int32)

    # ----------------------------------------
    # sort particles by their softening length
    # ----------------------------------------
    # each non-empty tile is sorted separately, addressed by its offset
    # and particle count
    for i in xrange(Ntiles) :
        n_per_tile = tile_offsets[i+1] - tile_offsets[i]
        if n_per_tile > 0 :
            radix_sort(int(keys_gpu), int(ps_tiles_gpu), tile_offsets[i], n_per_tile)

    # NOTE(review): assumed to run once after the sort loop rather than per
    # tile -- confirm against the intended indentation
    drv.memcpy_dtoh(keys,keys_gpu)
    drv.memcpy_dtoh(ps_tiles,ps_tiles_gpu)

    # return keys,ps_tiles,tile_offsets,dx

    drv.Context.synchronize()

    tile_start = time.clock()
    for i in xrange(Ntiles) :
        n_per_tile = tile_offsets[i+1] - tile_offsets[i]
        if n_per_tile > 0 :
            # NOTE(review): streams, tiles_physical and tiles_pix are not
            # assigned anywhere in this function
            my_stream = streams[i%16]

            xmin_p, xmax_p, ymin_p, ymax_p = tiles_physical[i]
            xmin_t, xmax_t, ymin_t, ymax_t = tiles_pix[i]

            # NOTE(review): nx_tile and ny_tile are not read again below
            nx_tile = xmax_t-xmin_t+1
            ny_tile = ymax_t-ymin_t+1

            # make everything the right size
            xmin_t,xmax_t,ymin_t,ymax_t = map(np.int32,[xmin_t,xmax_t,ymin_t,ymax_t])
            xmin_p,xmax_p,ymin_p,ymax_p = map(np.float32, [xmin_p,xmax_p,ymin_p,ymax_p])

            # NOTE(review): unlike the other launches in this function, no
            # ``grid=`` argument is passed here -- confirm this is intended
            tile_render_kernel(ps_tiles_gpu,tile_offsets_gpu,np.int32(i),
                               xmin_p,xmax_p,ymin_p,ymax_p,xmin_t,xmax_t,ymin_t,ymax_t,
                               im_gpu,np.int32(image.shape[0]),np.int32(image.shape[1]),
                               block=(nthreads,1,1),stream=my_stream)

    if timing: print '<<< %d kernels launched in %f s'%(Ntiles,time.clock()-tile_start)

    # ----------------------------------------------------------------------------------
    # process the particles with large smoothing lengths concurrently with GPU execution
    # ----------------------------------------------------------------------------------
    #if ind[1] != len(xs) :
    #    start = time.clock()
    #    image2 = (template_kernel_cpu(xs[ind[1]:],ys[ind[1]:],qts[ind[1]:],hs[ind[1]:],
    #                                  nx,ny,xmin,xmax,ymin,ymax)).T
    #    if timing: print '<<< Processing %d particles with large smoothing lengths took %e s'%(len(xs)-ind[1],
    #                                                                                           time.clock()-start)

    # wait for outstanding GPU work before copying the image back
    drv.Context.synchronize()
    if timing: print '<<< %d tiles rendered in %f s'%(Ntiles,time.clock()-tile_start)

    drv.memcpy_dtoh(image,im_gpu)
    drv.stop_profiler()

    if timing: print '<<< Total render done in %f s\n'%(time.clock()-global_start)

    del(start_g)
    del(end_g)

    return image
'''
Demonstrate the significant performance difference between transferring
regular host memory and pinned (pagelocked) host memory.
'''
from __future__ import print_function
from timeit import default_timer as timer
import numpy as np
from numba import vectorize, float32, cuda

# Source data (10**7 float32 values) and a destination buffer of the same
# shape and dtype.
src = np.arange(10 ** 7, dtype=np.float32)
dst = np.empty_like(src)


@vectorize([float32(float32)], target='cuda')
def copy_kernel(src):
    """Return the input element unchanged, so the ufunc acts as a copy."""
    return src


# Regular memory transfer
# The timed region covers the host-to-device copy, the kernel call and the
# copy back into ``dst``.
ts = timer()
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)
copy_kernel(d_src, out=d_dst)
d_dst.copy_to_host(dst)
te = timer()
print('regular', te - ts)

# Drop the references to the device arrays.
del d_src, d_dst
from __future__ import absolute_import, print_function, division
from numba import vectorize
from numba import cuda, float32
import numpy as np
from numba import unittest_support as unittest
from numba.cuda.testing import skip_on_cudasim


@cuda.jit(float32(float32, float32, float32), device=True)
def cu_device_fn(x, y, z):
    """Device function: return ``x ** y / z``."""
    return x ** y / z


def cu_ufunc(x, y, z):
    """Scalar wrapper that forwards to ``cu_device_fn``; it is the function
    handed to ``vectorize`` in the test below."""
    return cu_device_fn(x, y, z)


@skip_on_cudasim('ufunc API unsupported in the simulator')
class TestCudaVectorizeDeviceCall(unittest.TestCase):
    def test_cuda_vectorize_device_call(self):
        """Build a CUDA ufunc from ``cu_ufunc`` and call it on random data."""
        ufunc = vectorize([float32(float32, float32, float32)], target='cuda')(
            cu_ufunc)

        N = 100

        X = np.array(np.random.sample(N), dtype=np.float32)
        Y = np.array(np.random.sample(N), dtype=np.float32)
        # The 0.1 offset keeps the divisor away from zero.
        Z = np.array(np.random.sample(N), dtype=np.float32) + 0.1

        # NOTE(review): no check of ``out`` is visible in this part of the file.
        out = ufunc(X, Y, Z)