def find_pi_cuda(n):
    '''Takes in integer n (n points used for integration)'''
    # here's Monte Carlo for finding pi
    rg = rand.XORWOWRandomNumberGenerator(rand.seed_getter_unique)
    x = rg.gen_uniform(n, np.float32)
    y = rg.gen_uniform(n, np.float32)
    # need arrays the size of the guesses to use with the mask
    ones = gpuarray.ones(int(n), np.float32)
    zeros = gpuarray.zeros(int(n), np.float32)
    # squared distance from the origin (no sqrt needed: compare to 1)
    distance = x ** 2 + y ** 2
    # keep only the points that landed inside the unit circle
    inside = gpuarray.if_positive(distance <= 1.0, ones, zeros)
    pi = 4 * gpuarray.sum(inside).get() / n
    error = np.abs(np.pi - pi)
    return pi, error
def initializeRandomness(self):
    """Initialize the random number generator used on the GPU."""
    # Define a random seed_getter for curandom: given N, return a
    # gpuarray of N integer seeds.
    seed_getter = lambda N: gpuarray.to_gpu(
        np.random.randint(-2**30, 2**30, size=(N,)).astype(np.int32))
    self.rand_gpu = curandom.XORWOWRandomNumberGenerator(seed_getter)
def __getattribute__(self, name):
    # Create the GPU sampler lazily on first attribute access, then
    # forward every lookup to it.
    sampler = object.__getattribute__(self, '_sampler')
    if sampler is None:
        from pycuda import curandom
        sampler = curandom.XORWOWRandomNumberGenerator(
            curandom.seed_getter_uniform)
        self._sampler = sampler
    return sampler.__getattribute__(name)
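# A minimal sketch of the lazy-proxy pattern used by the method above; the
# class name LazySampler and the _sampler slot initialization are assumptions
# for illustration, not the original surrounding class:
import numpy as np
import pycuda.autoinit  # noqa: F401 (assumes a single-GPU setup)
from pycuda import curandom

class LazySampler(object):
    """Builds the curand generator on first use, then forwards to it."""
    def __init__(self):
        object.__setattr__(self, '_sampler', None)

    def __getattribute__(self, name):
        sampler = object.__getattribute__(self, '_sampler')
        if sampler is None:
            sampler = curandom.XORWOWRandomNumberGenerator(
                curandom.seed_getter_uniform)
            object.__setattr__(self, '_sampler', sampler)
        return sampler.__getattribute__(name)

draws = LazySampler().gen_uniform(16, np.float32)  # generator created here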
def __init__(self, n_units, n_incoming, N, init_sd=1.0,
             precision=np.float32, magic_numbers=False):
    self.n_units = n_units
    self.n_incoming = n_incoming
    self.N = N
    w = np.random.normal(0, init_sd, (self.n_incoming, self.n_units))
    b = np.random.normal(0, init_sd, (1, n_units))
    self.weights = gpuarray.to_gpu(w.copy().astype(precision))
    self.gW = gpuarray.empty_like(self.weights)
    # Prior and ID must be set after creation
    self.prior = -1
    self.ID = -1
    self.biases = gpuarray.to_gpu(b.copy().astype(precision))
    self.gB = gpuarray.empty_like(self.biases)
    # Set up momentum variables for HMC sampler
    self.pW = gpuarray.to_gpu(np.random.normal(0, 1, self.gW.shape))
    self.pB = gpuarray.to_gpu(np.random.normal(0, 1, self.gB.shape))
    self.epsW = gpuarray.zeros(self.weights.shape, precision) + 1.0
    self.epsB = gpuarray.zeros(self.biases.shape, precision) + 1.0
    self.precision = precision
    self.outputs = gpuarray.zeros((self.N, self.n_units), precision)
    self.magic_numbers = magic_numbers
    # Define tanh function on GPU. Note 2.0 / 3.0 rather than 2/3,
    # which is integer division (always zero) in CUDA C.
    if magic_numbers:
        self.tanh = ElementwiseKernel(
            "float *x",
            "x[i] = 1.7159 * tanh(2.0 / 3.0 * x[i]);",
            "tan_h", preamble="#include <math.h>")
    else:
        self.tanh = ElementwiseKernel(
            "float *x",
            "x[i] = tanh(min(max(-10.0,x[i]),10.0));",
            "tan_h", preamble="#include <math.h>")
    # Compile kernels
    kernels = SourceModule(open(path + '/kernels.cu', "r").read())
    self.add_bias_kernel = kernels.get_function("add_bias")
    self.rng = curandom.XORWOWRandomNumberGenerator()
    # Initialize posterior weights
    self.posterior_weights = list()
    self.posterior_biases = list()
def __init__(self, number_of_clauses, T, s, clause_drop_p=0.0,
             feature_drop_p=0.0, number_of_gpus=1, q=1.0,
             boost_true_positive_feedback=1, weighted_clauses=0,
             number_of_state_bits=8, append_negated=True,
             grid=(16 * 13, 1, 1), block=(128, 1, 1)):
    self.number_of_gpus = np.minimum(cuda.Device.count(), number_of_gpus)
    self.number_of_clauses = number_of_clauses
    self.number_of_state_bits = number_of_state_bits
    self.T = int(T)
    self.s = s
    self.q = q
    self.boost_true_positive_feedback = boost_true_positive_feedback
    self.weighted_clauses = weighted_clauses
    self.append_negated = append_negated
    self.grid = grid
    self.block = block
    self.clause_drop_p = clause_drop_p
    self.feature_drop_p = feature_drop_p
    self.X_train = np.array([])
    self.Y_train = np.array([])
    self.X_test = np.array([])
    self.ta_state = np.array([])
    self.clause_weights = np.array([])
    self.initialized = False
    self.gpus = []
    for c in range(self.number_of_gpus):
        print("Preparing GPU #%d" % c)
        gpu = GPU()
        gpu.device_id = c
        gpu.device = cuda.Device(c)
        gpu.context = gpu.device.make_context()
        # Each device gets its own generator, created under its own context
        gpu.g = curandom.XORWOWRandomNumberGenerator()
        gpu.mod_encode = SourceModule(kernels.code_encode, no_extern_c=True)
        gpu.prepare_encode = gpu.mod_encode.get_function("prepare_encode")
        gpu.encode = gpu.mod_encode.get_function("encode")
        self.gpus.append(gpu)
        gpu.context.pop()
    print()
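# A hedged usage note for the multi-GPU constructor above: each generator was
# created under its own context, so it must also be used under that context.
# Sketch, assuming `tm` is an instance of the class above:
import numpy as np

gpu = tm.gpus[0]
gpu.context.push()                           # activate device 0's context
noise = gpu.g.gen_uniform(1024, np.float32)  # draws live on device 0
gpu.context.pop()                            # release the context again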
def __getattribute__(self, name):
    if name in ('seed', 'set_seed'):
        return object.__getattribute__(self, name)
    sampler = object.__getattribute__(self, '_sampler')
    if sampler is None:
        from pycuda import curandom, gpuarray
        # With a fixed seed, every generator state gets the same seed value;
        # otherwise fall back to curandom's uniform seed getter.
        seed_func = curandom.seed_getter_uniform if self.seed is None \
            else lambda N: gpuarray.to_gpu(
                np.array(N * [self.seed], dtype=np.int32))
        sampler = curandom.XORWOWRandomNumberGenerator(seed_func)
        self._sampler = sampler
    return sampler.__getattribute__(name)
def test_adjoint(self, iters=5):
    """Test the adjoint operator.

    Args:
        iters (int): number of iterations
    """
    src_shape = (self.data.nX1, self.data.nX2, 1)
    dest_shape = (self.data.nT, self.data.nC)
    u = gpuarray.zeros(src_shape, self.precision_complex, order='F')
    ut = gpuarray.zeros(src_shape, self.precision_real, order='F')
    Ku = gpuarray.zeros(dest_shape, self.precision_complex, order='F')
    v = gpuarray.zeros(dest_shape, self.precision_complex, order='F')
    vt = gpuarray.zeros(dest_shape, self.precision_real, order='F')
    Kadv = gpuarray.zeros(src_shape, self.precision_complex, order='F')
    generator = curandom.XORWOWRandomNumberGenerator()
    errors = []
    try:
        i = 0
        for i in range(iters):
            # randomness
            generator.fill_uniform(ut)
            generator.fill_uniform(vt)
            v = gpuarray_copy(vt.astype(self.precision_complex))
            u = gpuarray_copy(ut.astype(self.precision_complex))
            # apply operators
            self.apply(u, Ku)
            self.adjoint(v, Kadv)
            # the adjoint test: <Ku, v> should equal <u, K*v>
            scp1 = dotc_gpu(Ku, v)
            scp2 = dotc_gpu(u, Kadv)
            n_Ku = dotc_gpu(Ku)
            n_Kadv = dotc_gpu(Kadv)
            n_u = dotc_gpu(u)
            n_v = dotc_gpu(v)
            errors.append(np.abs(scp1 - scp2))
            print("Test " + str(i) + ": <Ku,v>=" + str(scp1) +
                  ", <u,Kadv>=" + str(scp2) +
                  ", Error=" + str(np.abs(scp1 - scp2)) +
                  ", Relative Error=" +
                  str((scp1 - scp2) / (n_Ku * n_v + n_Kadv * n_u)))
    except KeyboardInterrupt:
        if len(errors) == 0:
            errors = -1
    finally:
        print("Mean Error: " + repr(np.mean(errors)))
        print("Standard deviation: " + repr(np.std(errors)))
    return i
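# The helper dotc_gpu is assumed here to be a conjugated inner product that
# returns <a, a> when called with a single argument; its real definition
# lives elsewhere in this codebase. A plausible sketch:
import pycuda.gpuarray as gpuarray

def dotc_gpu(x, y=None):
    # <x, y> with the first argument conjugated; <x, x> if y is omitted
    if y is None:
        y = x
    return gpuarray.dot(x.conj(), y).get()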
def __init__(self, n_classes, n_incoming, N, init_sd=0.1,
             precision=np.float32):
    self.type = 'Softmax'
    self.n_classes = n_classes
    self.n_incoming = n_incoming
    self.N = N
    w = np.random.normal(0, init_sd, (self.n_incoming, n_classes))
    b = np.random.normal(0, init_sd, (1, n_classes))
    self.weights = gpuarray.to_gpu(w.copy().astype(precision))
    self.gW = gpuarray.empty_like(self.weights)
    self.biases = gpuarray.to_gpu(b.copy().astype(precision))
    self.gB = gpuarray.empty_like(self.biases)
    # Prior and ID are set later
    self.prior = -1
    self.ID = -1
    # Set up momentum variables for HMC sampler
    self.pW = gpuarray.to_gpu(np.random.normal(0, 1, self.gW.shape))
    self.pB = gpuarray.to_gpu(np.random.normal(0, 1, self.gB.shape))
    # Store stepsizes for each parameter
    self.epsW = gpuarray.zeros(self.weights.shape, precision) + 1.0
    self.epsB = gpuarray.zeros(self.biases.shape, precision) + 1.0
    self.outputs = gpuarray.zeros((self.N, self.n_classes), precision)
    self.precision = precision
    kernels = SourceModule(open(path + '/kernels.cu', "r").read())
    self.softmax_kernel = kernels.get_function("softmax")
    self.add_bias_kernel = kernels.get_function("add_bias")
    self.rng = curandom.XORWOWRandomNumberGenerator()
    # Initialize posterior weights
    self.posterior_weights = list()
    self.posterior_biases = list()
    self.eps_tol = 1e-10
def seed(s=None, device=None):
    """Resets the random number generator of the specified device.

    Args:
        s (int or None): Seed value. If it is ``None``, it initializes
            the generator without a fixed seed.
        device: Device specifier (i.e. argument of :func:`get_device`).
    """
    global _generators
    with DeviceUser(device) as user:
        seed_getter = _get_seed_getter(s)
        gen = curandom.XORWOWRandomNumberGenerator(seed_getter=seed_getter)
        _generators[user.device] = gen
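# _get_seed_getter is defined elsewhere in this codebase; a plausible sketch,
# mirroring the fixed-seed pattern used in the snippets above (an assumption,
# not the verified original):
import numpy as np
from pycuda import curandom, gpuarray

def _get_seed_getter(s=None):
    if s is None:
        return curandom.seed_getter_uniform
    # Every generator state receives the same fixed seed value
    return lambda N: gpuarray.to_gpu(np.full((N,), int(s), dtype=np.int32))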
def test_get_random_angle_in_radians(self):
    generator = curandom.XORWOWRandomNumberGenerator()
    grid = np.zeros((matrix_size, matrix_size)).astype(np.float32)
    grid = gpuarray.to_gpu(grid)
    for i in range(10):
        get_random_angle(generator.state, grid, np.int32(matrix_size),
                         grid=(grid_dims, grid_dims),
                         block=(block_dims, block_dims, 1))
        grid_cpu = grid.get()
        for i in range(matrix_size):
            for j in range(matrix_size):
                self.assertGreater(grid_cpu[i][j], 0)
                self.assertLessEqual(grid_cpu[i][j], 2 * np.pi)
def test_survival_kernel_none_survive(self):
    # make sure all cells die
    initial_population = np.ones(
        (matrix_size, matrix_size)).astype(np.float32)
    survival_probabilities = np.ones(
        (matrix_size, matrix_size)).astype(np.float32)
    generator = curandom.XORWOWRandomNumberGenerator()
    run_primitive(Empty_grid().vars(matrix_size) == Initialize_grid().vars(
        matrix_size, initial_population, survival_probabilities, generator))
    Config.engine.split()
    run_primitive(Survival_of_the_fittest().vars(survival_none, matrix_size,
                                                 grid_dims, block_dims))
    grid_a = Config.engine.stack.pop().get()
    grid_b = np.zeros((matrix_size, matrix_size)).astype(np.float32)
    self.assertTrue((grid_a == grid_b).all())
def test_cycle_termination(self):
    initial_population = np.zeros(
        (matrix_size, matrix_size)).astype(np.float32)
    initial_population[matrix_size // 2][matrix_size // 2] = 1
    survival_probabilities = np.zeros(
        (matrix_size, matrix_size)).astype(np.float32)
    generator = curandom.XORWOWRandomNumberGenerator()
    run_primitive(Empty_grid().vars(matrix_size) == Initialize_grid().vars(
        matrix_size, initial_population, survival_probabilities, generator))
    Config.engine.n_iters = n_iters
    Config.engine.cycle_start()
    run_primitive(
        Local_diffusion().vars(local_always, matrix_size, p_local_always,
                               grid_dims, block_dims) == Bmsb_stop())
    Config.engine.cycle_termination()
    self.assertEqual(Config.engine.iters, n_iters)
    self.assertFalse(Config.engine.is_split)
    self.assertFalse(Config.engine.continue_cycle)
def find_pi_cuda(n):
    '''Takes in integer n (n points used for integration), does Monte
    Carlo to find pi using GPU methods. Returns the estimate of pi.'''
    # x and y coordinates generated
    rg = rand.XORWOWRandomNumberGenerator()
    x = rg.gen_uniform(n, np.float32)
    y = rg.gen_uniform(n, np.float32)
    # squared distance from the origin
    am_circle = x**2 + y**2
    outside = pycuda.gpuarray.zeros_like(x, np.float32)
    inside = pycuda.gpuarray.ones_like(x, np.float32)
    # here's Monte Carlo for finding pi
    H = pycuda.gpuarray.if_positive(am_circle <= 1, inside, outside)
    pi = 4 * gp.sum(H) / n
    return pi
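# A hedged driver for the snippet above. Assumes pycuda.autoinit for context
# creation and the aliases the function relies on: `rand` for pycuda.curandom
# and `gp` for pycuda.gpuarray.
import pycuda.autoinit  # noqa: F401 (establishes the CUDA context)
import pycuda
import pycuda.curandom as rand
import pycuda.gpuarray as gp
import numpy as np

pi_est = find_pi_cuda(10**7)
print(pi_est)  # GPUArray scalar; printing copies it back to the host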
def _test_population_growth_(self):
    # (leading underscore keeps unittest from collecting this test)
    initial_population = np.zeros(
        (matrix_size, matrix_size)).astype(np.float32)
    initial_population[matrix_size // 2][matrix_size // 2] = 1
    initial_population[0][0] = 1
    survival_probabilities = np.zeros(
        (matrix_size, matrix_size)).astype(np.float32)
    generator = curandom.XORWOWRandomNumberGenerator()
    run_primitive(Empty_grid().vars(matrix_size) == Initialize_grid().vars(
        matrix_size, initial_population, survival_probabilities, generator))
    Config.engine.split()
    run_primitive(Population_growth().vars(population_growth, matrix_size,
                                           growth_rate, grid_dims,
                                           block_dims))
    grid_a = Config.engine.stack.pop().get()
    grid_b = np.zeros((matrix_size, matrix_size)).astype(np.float32)
    grid_b[matrix_size // 2][matrix_size // 2] = 227
    grid_b[0][0] = 227
    self.assertTrue((grid_a == grid_b).all())
def get_generator(device=None):
    """Gets the random number generator for the given device.

    Args:
        device: Device specifier (an argument of :func:`get_device`).

    Returns:
        pycuda.curandom.XORWOWRandomNumberGenerator: Random number generator.
    """
    global _generators
    device = get_device(device)
    gen = _generators.get(device)
    if gen is not None:
        return gen
    with using_device(device):
        s = os.environ.get('CHAINER_SEED')
        seed_getter = _get_seed_getter(s)
        gen = curandom.XORWOWRandomNumberGenerator(seed_getter=seed_getter)
        _generators[device] = gen
        return gen
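# A hedged usage sketch of the cached-generator API above (buffer shape and
# sample counts are illustrative only):
import numpy as np
from pycuda import gpuarray

gen = get_generator()                     # cached per device after first call
noise = gpuarray.empty((64, 64), np.float32)
gen.fill_uniform(noise)                   # in-place U(0,1) fill on the device
gauss = gen.gen_normal(4096, np.float32)  # fresh N(0,1) gpuarray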
def test_non_local_diffusion_never(self):
    initial_population = np.zeros(
        (matrix_size, matrix_size)).astype(np.float32)
    initial_population[matrix_size // 2][matrix_size // 2] = 1
    survival_probabilities = np.zeros(
        (matrix_size, matrix_size)).astype(np.float32)
    generator = curandom.XORWOWRandomNumberGenerator()
    run_primitive(Empty_grid().vars(matrix_size) == Initialize_grid().vars(
        matrix_size, initial_population, survival_probabilities, generator))
    Config.engine.split()
    run_primitive(Non_local_diffusion().vars(non_local_never, matrix_size,
                                             p_non_local_never, mu, gamma,
                                             grid_dims, block_dims))
    grid_a = Config.engine.stack.pop().get()
    grid_b = np.zeros((matrix_size, matrix_size)).astype(np.float32)
    grid_b[matrix_size // 2][matrix_size // 2] = 1
    print('Non_local_diffusion_never\nGrid_a = {}\nGrid_b = {}'.format(
        grid_a, grid_b))
    self.assertTrue((grid_a == grid_b).all())
def initialize_randoms(self):
    self.generator = curandom.XORWOWRandomNumberGenerator()
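# For reference, a minimal self-contained use of the generator created above
# (assumes a single-GPU setup with pycuda.autoinit providing the context):
import numpy as np
import pycuda.autoinit  # noqa: F401 (establishes the CUDA context)
from pycuda import gpuarray, curandom

gen = curandom.XORWOWRandomNumberGenerator()
u = gen.gen_uniform(1000, np.float32)  # new gpuarray of U(0,1) samples
buf = gpuarray.empty(1000, np.float32)
gen.fill_normal(buf)                   # fill an existing buffer with N(0,1)
print(u.get()[:5], buf.get()[:5])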
    }
}  // end extern "C"
"""

mod = SourceModule(kernel_code, no_extern_c=True)

# Get kernel functions
local = mod.get_function('local_diffuse')
non_local = mod.get_function('non_local_diffuse')
survival_layer = mod.get_function('survival_of_the_fittest')
population_layer = mod.get_function('population_growth')
init_generators = mod.get_function('init_generators')

# Initialize random number generator, then swap in one curand state per grid
# cell and seed them with the custom init kernel
generator = curandom.XORWOWRandomNumberGenerator()
data_type_size = sizeof(generator.state_type, "#include <curand_kernel.h>")
generator._state = drv.mem_alloc((matrix_size * matrix_size) * data_type_size)
seed = 123456789
init_generators(generator.state, np.int32(seed), np.int32(matrix_size),
                grid=(grid_dims, grid_dims), block=(block_dims, block_dims, 1))

# Run n_iters of the Brown Marmorated Stink Bug (BMSB) Diffusion Simulation
run_primitive(
    empty_grid.vars(matrix_size) ==
    initialize_grid.vars(matrix_size, initial_population,
                         survival_probabilities, generator) ==
    bmsb_stop_condition.vars(n_iters) <=
    local_diffusion.vars(local, matrix_size, p_local,
                         grid_dims, block_dims) ==
    non_local_diffusion.vars(non_local, matrix_size, p_non_local, mu, gamma,
                             grid_dims, block_dims) ==
    survival_function.vars(survival_layer, matrix_size,
                           grid_dims, block_dims) ==
    population_growth.vars(population_layer, matrix_size, growth_rate,
                           grid_dims, block_dims) ==
# Create two timers:
start = drv.Event()
end = drv.Event()

# Launch the kernel:
start.record()
rnorm(drv.Out(dest), mu, sigma, n, block=(tpb, 1, 1), grid=(nb, 1))
end.record()  # end timing
# calculate the run length
end.synchronize()
gpu_secs = start.time_till(end) * 1e-3
print("SourceModule time: %f" % gpu_secs)

rng = curandom.XORWOWRandomNumberGenerator()
# be kind and exclude initialization
start.record()
gpu_res = rng.gen_normal(n, dtype=np.float32)  # lives on the device
dest2 = np.add(np.multiply(gpu_res.get(), sigma), mu)  # copy and scale
end.record()  # end timing
# calculate the run length
end.synchronize()
gpu2_secs = start.time_till(end) * 1e-3
print(" GPUArray time: %f" % gpu2_secs)

# Numpy version:
start.record()
host = np.random.normal(size=n, loc=mu, scale=sigma)
end.record()  # end timing
# calculate the run length
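# One possible refinement (an assumption, not part of the original benchmark):
# scale on the device before copying, so only the finished array crosses the
# PCIe bus once.
dest2 = (gpu_res * sigma + mu).get()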