def fill_layers(self):
    blocks = 1

    # Layer 2: one RNG state per thread, reseeded from the clock.
    threads_per_block = 19
    dt = datetime.now()
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks,
                                             seed=dt.microsecond)
    for row in self.weights_layer_2:
        fill_random[1, threads_per_block](row, 19, rng_states)

    # Layer 3.
    threads_per_block = 17
    dt = datetime.now()
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks,
                                             seed=dt.microsecond)
    for row in self.weights_layer_3:
        fill_random[1, threads_per_block](row, 17, rng_states)

    # Layer 4.
    threads_per_block = 17
    dt = datetime.now()
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks,
                                             seed=dt.microsecond)
    for row in self.weights_layer_4:
        fill_random[1, threads_per_block](row, 17, rng_states)
def crossover(self, father, mother):
    nn = copy.deepcopy(father)

    # For each layer, update the biases.
    for u in range(nn.num_layers - 1):
        # Number of threads per block, per dimension.
        TPB = 4
        x = nn.biases[u].shape[0]
        y = nn.biases[u].shape[1]
        threadsperblock = (min(x, TPB), min(y, TPB))
        # Number of blocks needed to cover the input.
        blockspergrid_x = int(math.ceil(x / threadsperblock[0]))
        blockspergrid_y = int(math.ceil(y / threadsperblock[1]))
        blockspergrid = (blockspergrid_x, blockspergrid_y)
        # Random states used inside the kernel.
        rng_states = create_xoroshiro128p_states(x * y, seed=random.randint(0, 10))
        # Parallel crossover on this layer, driven by the random numbers.
        biase_crossover[blockspergrid, threadsperblock](rng_states, nn.biases[u],
                                                        self.crossover_rate,
                                                        mother.biases[u])

    # For each layer, update the weights.
    for u in range(nn.num_layers - 1):
        TPB = 4
        x = nn.weights[u].shape[0]
        y = nn.weights[u].shape[1]
        threadsperblock = (min(x, TPB), min(y, TPB))
        blockspergrid_x = int(math.ceil(x / threadsperblock[0]))
        blockspergrid_y = int(math.ceil(y / threadsperblock[1]))
        blockspergrid = (blockspergrid_x, blockspergrid_y)
        # Random states used inside the kernel.
        rng_states = create_xoroshiro128p_states(x * y, seed=random.randint(0, 10))
        # Parallel crossover on this layer, driven by the random numbers.
        weight_crossover[blockspergrid, threadsperblock](rng_states, nn.weights[u],
                                                         self.crossover_rate,
                                                         mother.weights[u])
    return nn
def maybe_create_rng_states(n, seed=0, rng_states=None):
    """Create or extend random states for a CUDA kernel."""
    if rng_states is None:
        return create_xoroshiro128p_states(n, seed=seed)
    elif n > len(rng_states):
        # Grow the state array, keeping the existing states and appending
        # freshly seeded ones for the extra threads.
        new_states = device_array(n, dtype=rng_states.dtype)
        new_states[:len(rng_states)] = rng_states
        new_states[len(rng_states):] = create_xoroshiro128p_states(
            n - len(rng_states), seed=seed)
        return new_states
    return rng_states
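
# A hedged usage sketch for maybe_create_rng_states (not from the original
# code): the kernel and launch shapes below are made-up illustrations of how
# the helper lets the RNG state array grow between launches.
import numpy as np
from numba import cuda
from numba.cuda.random import xoroshiro128p_uniform_float32

@cuda.jit
def _draw_uniform(out, rng_states):
    tid = cuda.grid(1)
    if tid < out.shape[0]:
        out[tid] = xoroshiro128p_uniform_float32(rng_states, tid)

rng_states = None
for n in (1024, 4096):  # the second launch needs more states than the first
    rng_states = maybe_create_rng_states(n, seed=123, rng_states=rng_states)
    out = cuda.device_array(n, dtype=np.float32)
    _draw_uniform[(n + 255) // 256, 256](out, rng_states)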
def _calcularColsReparar(ponderaciones):
    resultado = np.zeros((ponderaciones.shape[0], ponderaciones.shape[1]),
                         dtype=np.uint8)
    colsCandidatasGlobal = np.ones((ponderaciones.shape[0], 10),
                                   dtype=np.int32) * -1
    rng_states = create_xoroshiro128p_states(COL, seed=1)
    ponderacionMaxima = np.array([np.max(ponderaciones)])

    # Configure the kernel launch.
    threadsperblock = (NSOL, COL)
    blockspergrid_x = int(math.ceil(ponderaciones.shape[0] / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(ponderaciones.shape[1] / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    ponderaciones_global_mem = cuda.to_device(ponderaciones)
    resultado_global_mem = cuda.to_device(resultado)
    colsCandidatasGlobal_mem = cuda.to_device(colsCandidatasGlobal)
    ponderacionMaxima_mem = cuda.to_device(ponderacionMaxima)
    rng_states_mem = cuda.to_device(rng_states)

    # Launch the kernel. The device buffer must be passed here; passing the
    # host array would leave colsCandidatasGlobal_mem untouched and the
    # function would return only -1 entries.
    kernelColsCandidatasGPU[blockspergrid, threadsperblock](
        ponderaciones_global_mem, ponderacionMaxima_mem, colsCandidatasGlobal_mem,
        rng_states_mem, resultado_global_mem)
    return colsCandidatasGlobal_mem.copy_to_host()
def fill_uniformly(self, n_of_spins, seed):
    """Calculate positions for spins evenly distributed in the substrate.

    Parameters
    ----------
    n_of_spins : int
        Number of spins.
    seed : int
        Seed for random number generation.

    Returns
    -------
    positions : numpy array
        Calculated positions for the spins.
    """
    triangles = self.triangles
    block_size = 256
    grid_size = int(math.ceil(float(n_of_spins) / block_size))
    stream = cuda.stream()
    rng_states = create_xoroshiro128p_states(grid_size * block_size,
                                             seed=seed, stream=stream)
    positions = np.zeros([3, n_of_spins])
    d_positions = cuda.to_device(positions, stream=stream)
    d_triangles = cuda.to_device(triangles.ravel(), stream=stream)
    d_max = cuda.to_device(np.max(np.max(triangles, 0), 0), stream=stream)
    fill_uniformly_cuda[grid_size, block_size, stream](d_positions, d_triangles,
                                                       d_max, rng_states)
    positions = d_positions.copy_to_host(stream=stream)
    # Synchronize after the asynchronous copy so the host buffer is complete.
    stream.synchronize()
    return positions
def detect_and_swap_gpu(matrix, indices, seed, threads_per_block=128, blocks=128):
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks, seed=seed)
    vec = np.zeros([threads_per_block * blocks, 3]).astype(int)
    d_matrix = cuda.to_device(matrix)
    d_vec = cuda.to_device(vec)
    _detect_gpu[blocks, threads_per_block](d_matrix, d_vec, rng_states)
    vec = d_vec.copy_to_host()
    vec = vec[~np.all(vec == 0, axis=1)]    # keep non-zero rows
    vec = vec[np.argsort(vec[:, 2])[::-1]]  # best candidates first. TODO: greedy?
    vec_detected = vec.shape[0]

    # Drop conflicting rows: each index may take part in at most one swap.
    visited = {}
    selected = []
    for i in range(vec.shape[0]):
        if vec[i, 0] not in visited and vec[i, 1] not in visited:
            selected.append(i)
            visited[vec[i, 0]] = 1
            visited[vec[i, 1]] = 1
    vec = vec[selected, :]

    for i in range(vec.shape[0]):
        swap_inplace(matrix, indices, vec[i, 0], vec[i, 1])
    vec_swapped = vec.shape[0]
    return matrix, indices, vec_detected, vec_swapped
def simulate(doms, rs, x_start, y_start, blocks, threads_per_block,
             seed=None, verbose=True):
    N = blocks * threads_per_block
    t1 = time.time()

    # Pick a random seed unless one was provided.
    ran_seed = np.random.randint(1, 123456) if seed is None else seed

    x_start, y_start = np.float32(x_start), np.float32(y_start)

    # Initialize the random states for the kernel.
    rng_states = create_xoroshiro128p_states(N, seed=ran_seed)

    # Output arrays for the (x, y) values.
    out_x = np.zeros(N, dtype=np.float32)
    out_y = np.zeros(N, dtype=np.float32)

    # Output arrays for DOM hits and hit times.
    domhits = np.zeros((N, len(rs)), dtype=np.int32)
    domhit_times = np.zeros((N, len(rs)), dtype=np.int32)

    # Compute x, y and the DOM hits.
    move[blocks, threads_per_block](rng_states, x_start, y_start, out_x, out_y,
                                    doms, rs, domhits, domhit_times)

    # Aggregate the hit information over all threads.
    domhits = np.sum(domhits, axis=0)

    t2 = time.time()
    if verbose:
        print(t2 - t1)
    return domhits, domhit_times, t2 - t1
def test__cuda_random_step():
    @cuda.jit()
    def test_kernel(steps, rng_states):
        thread_id = cuda.grid(1)
        if thread_id >= steps.shape[0]:
            return
        simulations._cuda_random_step(steps[thread_id, :], rng_states, thread_id)
        return

    N = int(1e5)
    seeds = [1, 1, 12]
    steps = np.zeros((len(seeds), N, 3))
    block_size = 128
    grid_size = int(math.ceil(N / block_size))
    for i, seed in enumerate(seeds):
        stream = cuda.stream()
        rng_states = create_xoroshiro128p_states(grid_size * block_size,
                                                 seed=seed, stream=stream)
        test_kernel[grid_size, block_size, stream](steps[i, :, :], rng_states)
        stream.synchronize()

    # Equal seeds give identical steps; a different seed gives different steps.
    npt.assert_equal(steps[0], steps[1])
    npt.assert_equal(np.all(steps[0] != steps[2]), True)
    # Steps average to zero, are not normally distributed, and have unit norm.
    npt.assert_almost_equal(np.mean(np.sum(steps[1::], axis=1) / N), 0, 3)
    _, p = normaltest(steps[1::].ravel())
    npt.assert_almost_equal(p, 0)
    npt.assert_almost_equal(np.linalg.norm(steps, axis=2),
                            np.ones((len(seeds), N)))
    return
def driver(pricer, do_plot=False):
    paths = np.zeros((numPath, NumStep + 1), order='F')
    paths[:, 0] = StockPrice
    DT = Maturity / NumStep

    ts = timer()
    threads_per_block = 64
    blocks = 24
    rng_states = random.create_xoroshiro128p_states(threads_per_block * blocks,
                                                    seed=1)
    pricer(rng_states, paths, DT, InterestRate, Volatility)
    te = timer()
    elapsed = te - ts

    ST = paths[:, -1]
    PaidOff = np.maximum(paths[:, -1] - StrikePrice, 0)
    print("Result")
    print(f"Stock price: {np.mean(ST)}")
    print(f"Standard error: {np.std(ST) / np.sqrt(numPath)}")
    print(f"Paid off: {np.mean(PaidOff)}")
    optionPrice = np.mean(PaidOff) * np.exp(-InterestRate * Maturity)
    print(f"Option price: {optionPrice}")

    print("Performance")
    NumCompute = numPath * NumStep
    print(f"Mstep/second = {NumCompute / elapsed / 1e6:.2f}")
    print(f"Time elapsed = {elapsed:.3f}")

    if do_plot:
        pathct = min(numPath, MAX_PATH_IN_PLOT)
        for i in range(pathct):
            pyplot.plot(paths[i])
        print(f"Plotting {pathct}/{numPath} paths")
        pyplot.show()
def detect_and_swap_gpu(matrix, seed):
    threads_per_block = 128
    blocks = 128
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks, seed=seed)
    vec = np.zeros([threads_per_block * blocks, 2]).astype(int)
    d_matrix = cuda.to_device(matrix)
    d_vec = cuda.to_device(vec)
    _detect_gpu[blocks, threads_per_block](d_matrix, d_vec, rng_states)
    vec = d_vec.copy_to_host()
    vec = vec[~np.all(vec == 0, axis=1)]  # keep non-zero rows
    print(vec.shape)

    # Drop conflicting rows: each index may take part in at most one swap.
    visited = {}
    selected = []
    for i in range(vec.shape[0]):
        if vec[i, 0] not in visited and vec[i, 1] not in visited:
            selected.append(i)
            visited[vec[i, 0]] = 1
            visited[vec[i, 1]] = 1
    vec = vec[selected, :]
    print(vec.shape)

    if vec.shape[0] > 0:
        blocks = (vec.shape[0] + threads_per_block - 1) // threads_per_block
        d_vec = cuda.to_device(vec)
        _swap_gpu[blocks, threads_per_block](d_matrix, d_vec)
        matrix = d_matrix.copy_to_host()
    return matrix
def GPUWrapper(data_out, device_id, photons_req_per_device, max_photons_per_device,
               muA, muS, g, source_type, source_param1, source_param2,
               detector_params, max_N, max_distance_from_det, target_type,
               target_mask, target_gridsize, z_target, z_bounded, z_range,
               ret_cols, absorb_threshold, absorb_chance):
    # TODO: These numbers can be optimized based on the device / architecture /
    # number of photons.
    threads_per_block = 256
    blocks = 64
    photons_per_thread = int(
        np.ceil(float(photons_req_per_device) / (threads_per_block * blocks)))
    max_photons_per_thread = int(
        np.ceil(float(max_photons_per_device) / (threads_per_block * blocks)))

    cuda.select_device(device_id)
    stream = cuda.stream()  # use a stream to trigger async memory transfers

    # Keeping this piece of code here for now - potentially we need it in the future.
    # with compiler_lock:  # lock the compiler
    #     # Prepare the function for this thread: the jitted CUDA kernel is
    #     # loaded into the current context.
    #     # TODO: ideally we should call cuda.jit(signature)(propPhotonGPU), where
    #     # signature is the call to the function. So far I couldn't figure out the
    #     # signature of rng_states; the closest I got was:
    #     # array(Record([('s0', '<u8'), ('s1', '<u8')]), 1d, A)
    #     # But I couldn't get it to work yet.
    #     MC_cuda_kernel = cuda.jit(propPhotonGPU)

    data = np.ndarray(shape=(threads_per_block * blocks, photons_per_thread, 12),
                      dtype=np.float32)
    photon_counters = np.ndarray(shape=(threads_per_block * blocks, 5),
                                 dtype=np.int64)
    data_out_device = cuda.device_array_like(data, stream=stream)
    photon_counters_device = cuda.device_array_like(photon_counters, stream=stream)

    # Initialize the per-thread random states.
    rng_states = create_xoroshiro128p_states(
        threads_per_block * blocks,
        seed=(np.random.randint(sys.maxsize) - 128) + device_id,
        stream=stream)

    # Actual kernel call.
    propPhotonGPU[blocks, threads_per_block](
        rng_states, data_out_device, photon_counters_device, photons_per_thread,
        max_photons_per_thread, muA, muS, g, source_type, source_param1,
        source_param2, detector_params, max_N, max_distance_from_det, target_type,
        target_mask, target_gridsize, z_target, z_bounded, z_range,
        absorb_threshold, absorb_chance)

    # Copy data back.
    data_out_device.copy_to_host(data, stream=stream)
    photon_counters_device.copy_to_host(photon_counters, stream=stream)
    stream.synchronize()

    data = data.reshape(data.shape[0] * data.shape[1], data.shape[2])
    data = data[:, ret_cols]
    data_out[device_id][0] = data

    photon_counters_aggr = np.squeeze(np.sum(photon_counters, axis=0))
    data_out[device_id][1] = photon_counters_aggr
def test_mutation():
    a = np.array([[0., 0., 0., 0., 0.],
                  [0., 0., 0., 0., 0.]])
    A_global_mem = cuda.to_device(a)
    dt = datetime.now()
    rng_states = create_xoroshiro128p_states(10, seed=dt.microsecond)
    mutation[2, 5](A_global_mem, 2, 3, rng_states, 0.3)
    A = A_global_mem.copy_to_host()
    print(A)
def _detect_possible_swaps(parent, gpu_random_seed, threads_per_block=128,
                           blocks=128):
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks,
                                             seed=gpu_random_seed)
    # detected_pairs stores the detection results.
    detected_pairs = np.zeros([threads_per_block * blocks, 3]).astype(int)
    d_parent = cuda.to_device(parent)
    d_detected_pairs = cuda.to_device(detected_pairs)
    _detect_gpu[blocks, threads_per_block](d_parent, d_detected_pairs, rng_states)
    detected_pairs = d_detected_pairs.copy_to_host()
    # Keep non-zero rows only.
    detected_pairs = detected_pairs[~np.all(detected_pairs == 0, axis=1)]
    return detected_pairs
def simulate(Nexp, N, initial_position, doms, rs, threads_per_block,
             pa=0.01, ps=0.02, seed=None, verbose=True):
    if verbose:
        print(Nexp)
    import time

    blocks = N // threads_per_block + 1
    x_start, y_start = np.array(initial_position, dtype=np.float32)

    domhits_all = []
    domhitstimes_all = []
    x_y = []
    t1 = time.time()
    for i in range(Nexp):
        # Use the given seed, or pick a random one per experiment.
        RanSeed = np.random.randint(1, 123456) if seed is None else seed

        # Initialize the random states for the kernel.
        rng_states = create_xoroshiro128p_states(N, seed=RanSeed)

        # Output arrays for the (x, y) values.
        out_x = np.zeros(N, dtype=np.float32)
        out_y = np.zeros(N, dtype=np.float32)

        # Output arrays for DOM hits and hit times.
        domhits = np.zeros((N, len(rs)), dtype=np.int32)
        domhitstimes = np.zeros((N, len(rs)), dtype=np.int32)

        # Compute x, y and the DOM hits.
        move[blocks, threads_per_block](rng_states, x_start, y_start, out_x, out_y,
                                        doms, rs, domhits, domhitstimes, pa, ps, N)

        # Aggregate and save the hit information.
        domhits = np.sum(domhits, axis=0)
        domhits_all.append(domhits)
        domhitstimes_all.append(domhitstimes)
        x_y.append([x_start, y_start])
    t2 = time.time()
    if verbose:
        print(t2 - t1)
    x_y = np.array(x_y)
    domhits_all = np.array(domhits_all)
    return domhits_all, domhitstimes_all
def test_recombination():
    dt = datetime.now()
    tot_ia = 5
    rng_states = create_xoroshiro128p_states(1 * tot_ia, seed=dt.microsecond)
    inp_weights_local = np.array([[1., 2., 3., 4., 5., 6., 7., 8., 9.],
                                  [9., 8., 7., 6., 5., 4., 3., 2., 1.]])
    inp_weights = cuda.to_device(inp_weights_local)
    out_weights = cuda.device_array((tot_ia, 9))
    recombination[1, tot_ia](inp_weights, out_weights, 2, tot_ia, 9, rng_states)
    C = out_weights.copy_to_host()
    print(C)
def cudaTask():
    T = 1000   # threads per block
    B = 10     # blocks per grid
    AS = 1000  # loop iterations per thread
    # Allocate the result array, initialized to zeros (see the NumPy manual).
    res = np.zeros(T * B * AS, dtype='float64')
    # See http://numba.pydata.org/numba-doc/0.35.0/cuda/random.html
    rng_states = create_xoroshiro128p_states(T * B, seed=1)
    cudaNormalVariateKernel[B, T](rng_states, res)
    return res
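
# The kernel called above is not shown in this snippet. Below is a minimal
# sketch of what a cudaNormalVariateKernel could look like; the grid-strided
# indexing is an assumption, not the original implementation.
from numba import cuda
from numba.cuda.random import xoroshiro128p_normal_float64

@cuda.jit
def cudaNormalVariateKernel(rng_states, res):
    tid = cuda.grid(1)             # one RNG state per thread
    n_threads = cuda.gridsize(1)
    # Each thread fills a strided slice of res with N(0, 1) samples, so the
    # T * B states cover all T * B * AS entries.
    for i in range(tid, res.shape[0], n_threads):
        res[i] = xoroshiro128p_normal_float64(rng_states, tid)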
def detect_and_swap_gpu(matrix, indices, seed, threads_per_block=128, blocks=128,
                        mode='random'):
    if mode == 'random':
        rng_states = create_xoroshiro128p_states(threads_per_block * blocks,
                                                 seed=seed)
        vec = np.zeros([threads_per_block * blocks, 3]).astype(int)
    elif mode == 'all':
        vec = np.zeros([4096, 3]).astype(int)
        index = np.zeros([1]).astype(int)
        d_index = cuda.to_device(index)
        threadsperblock = (32, 32)
        blockspergrid_x = (matrix.shape[0] + threadsperblock[0] - 1) // threadsperblock[0]
        blockspergrid_y = (matrix.shape[1] + threadsperblock[1] - 1) // threadsperblock[1]
        blockspergrid = (blockspergrid_x, blockspergrid_y)

    d_matrix = cuda.to_device(matrix)
    d_vec = cuda.to_device(vec)
    if mode == 'random':
        _detect_gpu[blocks, threads_per_block](d_matrix, d_vec, rng_states)
    elif mode == 'all':
        _detect_all_gpu[blockspergrid, threadsperblock](d_matrix, d_index, d_vec)
    vec = d_vec.copy_to_host()
    vec = vec[~np.all(vec == 0, axis=1)]    # keep non-zero rows
    vec = vec[np.argsort(vec[:, 2])[::-1]]  # best candidates first. TODO: greedy?
    vec_detected = vec.shape[0]

    # Drop conflicting rows: each index may take part in at most one swap.
    visited = {}
    selected = []
    for i in range(vec.shape[0]):
        if vec[i, 0] not in visited and vec[i, 1] not in visited:
            selected.append(i)
            visited[vec[i, 0]] = 1
            visited[vec[i, 1]] = 1
    vec = vec[selected, :]

    for i in range(vec.shape[0]):
        swap_inplace(matrix, indices, vec[i, 0], vec[i, 1])
    vec_swapped = vec.shape[0]
    return matrix, indices, vec_detected, vec_swapped
def fill_masks(masks, aligned):
    S = masks.shape[0]
    nframes, nimages, ncolor, H, W = aligned.shape
    threads_per_block = (32, 32)
    blocks_H = H // threads_per_block[0] + (H % threads_per_block[0] != 0)
    blocks_W = W // threads_per_block[1] + (W % threads_per_block[1] != 0)
    blocks = (blocks_H, blocks_W)
    nthreads = int(np.prod([blocks[i] * threads_per_block[i] for i in range(2)]))
    seed = int(torch.rand(1) * 100)
    rng_states = create_xoroshiro128p_states(nthreads, seed=seed)
    fill_masks_numba[blocks, threads_per_block](rng_states, nimages, ncolor,
                                                H, W, S, nframes, aligned, masks)
def fill_weights_foo(weights, counts, nsubsets, nframes, gpuid):
    assert nframes <= 51, "Number of frames is maxed at 51."
    numba.cuda.select_device(gpuid)
    device = weights.device
    weights = numba.cuda.as_cuda_array(weights)
    counts = numba.cuda.as_cuda_array(counts)
    threads_per_block = 1024
    blocks = nsubsets // threads_per_block + (nsubsets % threads_per_block != 0)
    seed = int(torch.rand(1) * 100)
    rng_states = create_xoroshiro128p_states(blocks * threads_per_block, seed=seed)
    create_weights_cuda[blocks, threads_per_block](rng_states, nsubsets, nframes,
                                                   weights, counts)
    return weights
def call_process_no_buffer(self, process_kernel_no_buffer, N,
                           ntotthreads=int(1e6), threads_per_block=512):
    ntotthreads = min(N, int(ntotthreads))
    nblocks = math.ceil(ntotthreads / threads_per_block)
    actual_nthreads = threads_per_block * nblocks
    n_neutrons_per_thread = math.ceil(N / actual_nthreads)
    print("%s blocks, %s threads, %s neutrons per thread" % (
        nblocks, threads_per_block, n_neutrons_per_thread))
    rng_states = create_xoroshiro128p_states(actual_nthreads, seed=rng_seed)
    process_kernel_no_buffer[nblocks, threads_per_block](
        rng_states, N, n_neutrons_per_thread, self.propagate_params)
    cuda.synchronize()
    return
def call_process_no_buffer(N, src, guide, mon, ntotthreads=int(1e5)):
    ntotthreads = min(N, ntotthreads)
    threads_per_block = 512
    nblocks = math.ceil(ntotthreads / threads_per_block)
    actual_nthreads = threads_per_block * nblocks
    n_neutrons_per_thread = math.ceil(N / actual_nthreads)
    print("{} blocks, {} threads, {} neutrons per thread".format(
        nblocks, threads_per_block, n_neutrons_per_thread))
    rng_states = create_xoroshiro128p_states(actual_nthreads, seed=1)
    counter = np.zeros(1, dtype=int)
    process_kernel_no_buffer[nblocks, threads_per_block](
        counter, N, n_neutrons_per_thread, src, guide, mon, rng_states)
    cuda.synchronize()
    print(f"processed {counter.sum():g} neutrons")
def index_bursts_by_frames(sims, aligned):
    S = sims.shape[0]
    nframes, nimages, ncolor, H, W = aligned.shape
    threads_per_block = (32, 32)
    blocks_H = H // threads_per_block[0] + (H % threads_per_block[0] != 0)
    blocks_W = W // threads_per_block[1] + (W % threads_per_block[1] != 0)
    blocks = (blocks_H, blocks_W)
    nthreads = int(np.prod([blocks[i] * threads_per_block[i] for i in range(2)]))
    seed = int(torch.rand(1) * 100)
    rng_states = create_xoroshiro128p_states(nthreads, seed=seed)
    uniform_pix_sample_by_frames_numba[blocks, threads_per_block](
        rng_states, nimages, ncolor, H, W, S, nframes, aligned, sims)
def test_ex_3d_grid(self):
    # magictoken.ex_3d_grid.begin
    from numba import cuda
    from numba.cuda.random import (create_xoroshiro128p_states,
                                   xoroshiro128p_uniform_float32)
    import numpy as np

    @cuda.jit
    def random_3d(arr, rng_states):
        # Per-dimension thread indices and strides
        startx, starty, startz = cuda.grid(3)
        stridex, stridey, stridez = cuda.gridsize(3)

        # Linearized thread index
        tid = (startz * stridey * stridex) + (starty * stridex) + startx

        # Use strided loops over the array to assign a random value to each entry
        for i in range(startz, arr.shape[0], stridez):
            for j in range(starty, arr.shape[1], stridey):
                for k in range(startx, arr.shape[2], stridex):
                    arr[i, j, k] = xoroshiro128p_uniform_float32(rng_states, tid)

    # Array dimensions
    X, Y, Z = 701, 900, 719

    # Block and grid dimensions
    bx, by, bz = 8, 8, 8
    gx, gy, gz = 16, 16, 16

    # Total number of threads
    nthreads = bx * by * bz * gx * gy * gz

    # Initialize a state for each thread
    rng_states = create_xoroshiro128p_states(nthreads, seed=1)

    # Generate random numbers
    arr = cuda.device_array((X, Y, Z), dtype=np.float32)
    random_3d[(gx, gy, gz), (bx, by, bz)](arr, rng_states)
    # magictoken.ex_3d_grid.end

    # Some basic tests of the randomly-generated numbers
    host_arr = arr.copy_to_host()
    self.assertGreater(np.mean(host_arr), 0.49)
    self.assertLess(np.mean(host_arr), 0.51)
    self.assertTrue(np.all(host_arr <= 1.0))
    self.assertTrue(np.all(host_arr >= 0.0))
def pso(nodes_h, edges_h, nrParticles, nrIterations, threads_per_block,
        blocks_per_grid):
    nodes = cuda.to_device(nodes_h)
    edges = cuda.to_device(edges_h)
    cuda.synchronize()

    currentPaths = cuda.device_array(shape=(nrParticles, len(nodes)), dtype=int)
    neighbourNodes = cuda.device_array(shape=(nrParticles, len(nodes)), dtype=int)
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks_per_grid,
                                             seed=1)
    cuda.synchronize()

    getRandomPaths[blocks_per_grid, threads_per_block](edges, nodes, nrParticles,
                                                       currentPaths,
                                                       neighbourNodes, rng_states)
    cuda.synchronize()

    bestPaths = cuda.device_array(shape=currentPaths.shape, dtype=int)
    bestPaths[:] = currentPaths  # copy on the GPU
    cuda.synchronize()

    particles = cuda.device_array(shape=(nrParticles, 2), dtype=float)
    cuda.synchronize()
    initParticles[blocks_per_grid, threads_per_block](bestPaths, particles, nodes)
    cuda.synchronize()

    globalBestPath = cuda.device_array(shape=(bestPaths.shape[1]), dtype=int)
    shortestGlobalIdx = getGlobalBestPath(particles, particles[0, BEST])
    if shortestGlobalIdx != -1:
        globalBestPath[:] = bestPaths[shortestGlobalIdx, :]
        globalBestCost = particles[shortestGlobalIdx, BEST]

    for i in range(nrIterations):
        getNewPaths[blocks_per_grid, threads_per_block](currentPaths, bestPaths,
                                                        globalBestPath,
                                                        neighbourNodes, nodes)
        cuda.synchronize()
        updateParticles[blocks_per_grid, threads_per_block](currentPaths, bestPaths,
                                                            particles, nodes)
        cuda.synchronize()
        shortestGlobalIdx = getGlobalBestPath(particles, globalBestCost)
        if shortestGlobalIdx != -1:
            globalBestPath[:] = bestPaths[shortestGlobalIdx, :]
            globalBestCost = particles[shortestGlobalIdx, BEST]
        cuda.synchronize()
    return globalBestCost
def get_rrs_gpu(graph, p, mc):
    sources = np.random.choice(graph.shape[0], size=mc)
    probs = (p ** graph).astype(np.float32)
    success = np.full((mc, graph.shape[0]), False, dtype=bool)
    new_nodes = np.full((mc, graph.shape[0]), False, dtype=bool)
    rrs = np.full((mc, graph.shape[0]), False, dtype=bool)
    threads_per_block = 128
    blocks = math.ceil(mc / threads_per_block)
    rng_states = create_xoroshiro128p_states(mc, seed=mc)
    get_node_flow_gpu[blocks, threads_per_block](sources, probs, success,
                                                 new_nodes, rrs, mc, rng_states)
    return rrs
def mc_integrate(lower_lim, upper_lim, nsamps):
    """Approximate the definite integral of `func` from `lower_lim` to
    `upper_lim` by Monte Carlo sampling."""
    out = cuda.to_device(np.zeros(nsamps, dtype="float32"))
    rng_states = create_xoroshiro128p_states(nsamps, seed=42)

    # Evaluate the integrand at random points in [lower_lim, upper_lim].
    mc_integrator_kernel.forall(nsamps)(out, rng_states, lower_lim, upper_lim)

    # Normalization factor to convert the sum to the average: (b - a)/(N - 1).
    factor = (upper_lim - lower_lim) / (nsamps - 1)

    return sum_reduce(out) * factor
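
# mc_integrator_kernel and sum_reduce are not shown in this snippet. Below is
# a minimal sketch of what they could look like; the integrand exp(-x^2) is a
# placeholder assumption standing in for the original `func`.
import math
import numpy as np
from numba import cuda
from numba.cuda.random import xoroshiro128p_uniform_float32

@cuda.jit
def mc_integrator_kernel(out, rng_states, lower_lim, upper_lim):
    tid = cuda.grid(1)
    if tid < out.shape[0]:
        # Sample x uniformly in [lower_lim, upper_lim] and evaluate the integrand.
        x = lower_lim + (upper_lim - lower_lim) * \
            xoroshiro128p_uniform_float32(rng_states, tid)
        out[tid] = math.exp(-x * x)

# cuda.reduce builds a GPU reduction from a binary operation.
@cuda.reduce
def sum_reduce(a, b):
    return a + b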
def spark_gpu_batch_som(rdd_data, d, max_iters, rows, cols, smooth_iters=None,
                        sigma_0=10, sigma_f=0.1, tau=400, seed=None, tpb=1024):
    # 1. Initialize random weights.
    d_weights = cuda.device_array((rows, cols, d), np.float32)
    rng_states = create_xoroshiro128p_states(rows * cols * d, seed=seed)
    rand_weights[d_weights.size // tpb + 1, tpb](rng_states, d_weights)
    weights = d_weights.copy_to_host()

    # 2. Main loop of the algorithm.
    for t in range(max_iters):
        # 2.a Update the control parameters: decay sigma until the smoothing
        # iterations begin, then hold it at sigma_f.
        if smooth_iters is None or t < smooth_iters:
            sigma = sigma_0 * math.exp(-t / tau)
        else:
            sigma = sigma_f
        sigma_squared = sigma * sigma

        # 2.b Each Spark cluster node works on a subset of the RDD samples,
        # finding the BMU and computing the partial sums of the weight-update
        # equation.
        out = rdd_data.mapPartitions(gpu_work_iter(weights, sigma_squared))

        # 2.c On a single node, use the GPU to combine all the partial sums
        # and perform the division.
        out = out.collect()
        numParts = len(out) // 2
        partials = np.concatenate(out)
        finish_update[rows * cols // tpb + 1, tpb](weights, partials, numParts)
    return weights
def __init__(self, geometry, freqs, monitors=None, name=None):
    dt = profile.get_dt() / 1000.

    # firing rate
    if isinstance(freqs, np.ndarray):
        freqs = freqs.flatten()
    if not np.all(freqs <= 1000. / profile.get_dt()):
        print(f'WARNING: The maximum supported frequency at dt={profile.get_dt()} ms '
              f'is {1000. / profile.get_dt()} Hz, but the given "freqs" setting '
              f'exceeds that.')

    # neuron model on CPU
    # -------------------
    if profile.run_on_cpu():
        def update(ST):
            ST['spike'] = np.random.random(ST['spike'].shape) < freqs * dt

        model = NeuType(name='poisson_input', ST=NeuState('spike'),
                        steps=update, mode='vector')

    # neuron model on GPU
    # -------------------
    else:
        def update(ST, rng_states, _obj_i):
            ST['spike'] = random.xoroshiro128p_uniform_float64(rng_states, _obj_i) < freqs * dt

        model = NeuType(name='poisson_input', ST=NeuState('spike'),
                        steps=update, mode='scalar')

    # initialize the neuron group
    # ---------------------------
    super(PoissonInput, self).__init__(model=model, geometry=geometry,
                                       monitors=monitors, name=name)

    # will automatically handle the heterogeneous problem
    # ---------------------------------------------------
    self.pars['freqs'] = freqs

    # rng states
    # ----------
    if profile.run_on_gpu():
        num_block, num_thread = tools.get_cuda_size(self.num)
        self.rng_states = random.create_xoroshiro128p_states(
            num_block * num_thread, seed=np.random.randint(100000))
def E_Q_scattering_kernel_call(Qmin, Qmax, neutron_velocity, neutron_probability,
                               E_Q, S_Q, scattering_coefficient,
                               absorption_cross_section, threadsperblock=64):
    neutron_velocity_gpu = cuda.to_device(neutron_velocity)
    neutron_probability_gpu = cuda.to_device(neutron_probability)
    scattered_neutron_velocity = cuda.device_array((len(neutron_velocity), 3))
    scattered_neutron_probability = cuda.device_array(len(neutron_probability))

    write_user_input(E_Q, S_Q, scattering_coefficient, absorption_cross_section)

    rng_states = create_xoroshiro128p_states(15000, seed=1)
    blockspergrid = (neutron_velocity_gpu.size + (threadsperblock - 1)) // threadsperblock
    E_Q_kernel[blockspergrid, threadsperblock](
        rng_states, Qmin, Qmax, neutron_velocity_gpu, neutron_probability_gpu,
        scattered_neutron_probability, scattered_neutron_velocity)

    scattered_neutron_velocity_cpu = scattered_neutron_velocity.copy_to_host()
    scattered_neutron_probability_cpu = scattered_neutron_probability.copy_to_host()
    return scattered_neutron_probability_cpu, scattered_neutron_velocity_cpu
def get_rrs_gpu(graph, node_num, p, mc):  # Do we need node_num?
    # Randomly choose mc nodes with replacement from the network.
    sources = np.random.choice(node_num, size=mc)
    # Initialize an mc x n matrix to represent R.
    rrs = np.full((mc, node_num), False, dtype=bool)
    # Create mc random number generator states.
    rng_states = create_xoroshiro128p_states(mc, seed=np.random.randint(mc))
    # Number of blocks (each with 128 threads) needed to perform mc operations.
    threads_per_block = 128
    blocks = math.ceil(mc / threads_per_block)
    # Update the mc rows of the rrs array on the GPU.
    get_node_flow_gpu[blocks, threads_per_block](graph, sources, p, rrs, mc,
                                                 rng_states)
    return rrs
""" from __future__ import print_function, absolute_import from numba import cuda from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32 import numpy as np @cuda.jit def compute_pi(rng_states, iterations, out): """Find the maximum value in values and store in result[0]""" thread_id = cuda.grid(1) # Compute pi by drawing random (x, y) points and finding what # fraction lie inside a unit circle inside = 0 for i in range(iterations): x = xoroshiro128p_uniform_float32(rng_states, thread_id) y = xoroshiro128p_uniform_float32(rng_states, thread_id) if x**2 + y**2 <= 1.0: inside += 1 out[thread_id] = 4.0 * inside / iterations threads_per_block = 64 blocks = 24 rng_states = create_xoroshiro128p_states(threads_per_block * blocks, seed=1) out = np.zeros(threads_per_block * blocks, dtype=np.float32) compute_pi[blocks, threads_per_block](rng_states, 10000, out) print('pi:', out.mean())