def make_kernel(delays, n_thread_per_block, n_inner):
    """Build a CUDA kernel for a delayed, sin-coupled phase network with an additive Z term.

    The coupling is assembled from two ``cu_expr`` pieces, ``sin(xj - xi)``
    and ``rcp_n * gx`` with ``rcp_n = 1 / delays.shape[0]``, which are
    handed to ``cu_delay_cfun``.

    Parameters
    ----------
    delays
        Array of delays; ``delays.max() + 1`` is passed through
        ``next_pow_of_2`` to obtain the horizon, and ``delays.shape[0]``
        sets the ``rcp_n`` constant.
    n_thread_per_block
        Cast to ``int32``; given to ``cu_delay_cfun`` and used as the last
        dimension of the per-block shared ``aff`` array.
    n_inner
        Cast to ``int32``; number of iterations of the stepping loop
        executed by one kernel launch.

    Returns
    -------
    tuple
        ``(horizon, kernel)`` where ``kernel`` is the ``cuda.jit`` compiled
        function.
    """
    horizon = next_pow_of_2(delays.max() + 1)
    cfpre = cu_expr('sin(xj - xi)', ('xi', 'xj'), {})
    cfpost = cu_expr('rcp_n * gx', ('gx', ), {'rcp_n': 1.0 / delays.shape[0]})
    n_thread_per_block = int32(n_thread_per_block)
    n_inner = int32(n_inner)
    dcf = cu_delay_cfun(horizon, cfpre, cfpost, 1, n_thread_per_block)

    @cuda.jit
    def kernel(step, state, update, buf, dt, omega, cvars,
               weights, delays, a_values, s_values, Z):
        i_t = cuda.threadIdx.x
        i_thread = cuda.blockIdx.x * cuda.blockDim.x + i_t
        # Shared scratch array; this thread reads its slot via the
        # block-local index i_t.
        aff = cuda.shared.array((1, 1, 1, n_thread_per_block), float32)
        # Each thread uses its own coupling scale `a` and Z-term scale `s`.
        a = a_values[i_thread]
        s = math.sqrt(dt) * math.sqrt(2.0 * s_values[i_thread])
        for i_step in range(n_inner):
            # First pass: compute every node's increment into `update`
            # without touching `state`.
            for i_post in range(weights.shape[0]):
                dcf(aff, delays, weights, state, i_post, i_thread, step[0], cvars, buf)
                update[i_post, i_thread] = (
                    dt * (omega + a * aff[0, 0, 0, i_t])
                    + s * Z[i_step, i_post, i_thread]
                )
            # Second pass: apply the increments.
            for i_post in range(weights.shape[0]):
                state[0, i_post, 0, i_thread] += update[i_post, i_thread]
            # NOTE(review): only the thread with global index 0 advances the
            # shared step counter, and cuda.syncthreads() is a block-level
            # barrier, so threads in other blocks are not ordered against
            # this write — confirm this is acceptable for multi-block launches.
            if i_thread == 0:
                step[0] += 1
            cuda.syncthreads()

    return horizon, kernel
def test_math_functions(self):
    """Check the callable built by cu_expr for 'exp(x) + sin(y)' against NumPy.

    Two random float32 vectors of length ``self.n_thread`` are fed through
    a kernel that evaluates the expression once per thread, and the result
    is compared to ``numpy.exp(x) + numpy.sin(y)``.
    """
    device_fn = cu_expr('exp(x) + sin(y)', ['x', 'y'], {})
    samples = numpy.random.randn(2, self.n_thread).astype(numpy.float32)
    x, y = samples
    result = numpy.zeros((self.n_thread,), numpy.float32)

    @self.jit_and_run(result, x, y)
    def kernel(out, x, y):
        idx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
        out[idx] = device_fn(x[idx], y[idx])

    expected = numpy.exp(x) + numpy.sin(y)
    numpy.testing.assert_allclose(result, expected, rtol=1e-5, atol=1e-6)
def test_linear_constant_slopes(self):
    """Check a linear expression with constant slopes against its reference function.

    ``cu_expr`` is asked (``return_fn=True``) for two callables for
    'ai * xi + aj * xj + offset' with ``ai`` and ``aj`` fixed as constants.
    The first is evaluated inside a kernel over a (10, n_thread) grid of
    random float32 inputs; the second is called on the same arrays and its
    output is the expected value.
    """
    arg_names = ['xi', 'xj', 'offset']
    slopes = {'ai': 0.3, 'aj': -0.84}
    cu_fn, ref_fn = cu_expr(
        'ai * xi + aj * xj + offset', arg_names, slopes, return_fn=True)

    inputs = numpy.random.randn(3, 10, self.n_thread).astype(numpy.float32)
    result = numpy.zeros((10, self.n_thread), numpy.float32)

    @self.jit_and_run(result, *inputs)
    def kernel(out, xi, xj, offset):
        col = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
        for row in range(out.shape[0]):
            out[row, col] = cu_fn(xi[row, col], xj[row, col], offset[row, col])

    expected = ref_fn(*inputs)
    numpy.testing.assert_allclose(result, expected, rtol=1e-5, atol=1e-6)
def test_kuramoto(self):
    """Compare a CUDA delayed-coupling Kuramoto kernel with Simulator output.

    One ``simulator.Simulator`` is built per thread, each with a different
    coupling strength ``a`` taken from ``0 .. self.n_thread - 1``. Their
    trajectories are the reference. A hand-written CUDA kernel is then
    stepped once per recorded time point, with each thread using its own
    ``a``, and the state histories are compared with ``rtol = atol = 1e-2``.
    """
    # build & run Python simulations
    numpy.random.seed(42)
    n = 5
    weights = numpy.zeros((n, n), numpy.float32)
    idelays = numpy.zeros((n, n), numpy.int32)
    # Only the first super-diagonal is non-zero: node i -> i + 1 gets
    # weight and delay i + 1.
    for i in range(n - 1):
        idelays[i, i + 1] = i + 1
        weights[i, i + 1] = i + 1

    def gen_sim(a):
        dt = 0.1
        conn = connectivity.Connectivity()
        conn.weights = weights
        # With speed 1.0, tract lengths of idelays * dt are intended to
        # yield the integer delays above.
        conn.tract_lengths = idelays * dt
        conn.speed = 1.0
        sim = simulator.Simulator(
            coupling=py_coupling.Kuramoto(a=a),
            connectivity=conn,
            model=models.Kuramoto(omega=100 * 2 * numpy.pi / 1e3),
            monitors=monitors.Raw(),
            integrator=integrators.EulerDeterministic(dt=dt))
        sim.configure()
        # Same constant 0.1 used below for the CUDA state and buffer.
        sim.history[:] = 0.1
        return sim

    a_values = numpy.r_[:self.n_thread].astype(numpy.float32)
    sims = [gen_sim(a) for a in a_values]
    py_data = []
    for sim in sims:
        ys = []
        for (_t, y), in sim(simulation_length=10.0):
            ys.append(y[0, :, 0])
        py_data.append(numpy.array(ys))
    py_data = numpy.array(py_data)

    # build CUDA kernels
    cfpre = cu_expr('sin(xj - xi)', ('xi', 'xj'), {})
    cfpost = cu_expr('rcp_n * gx', ('gx', ), {'rcp_n': 1.0 / n})
    horiz2 = next_pow_of_2(sims[0].horizon)
    dcf = cu_delay_cfun(horiz2, cfpre, cfpost, 1, self.block_dim[0],
                        aff_node_stride=1)

    # build kernel
    dt = numba.float32(sims[0].integrator.dt)
    omega = numba.float32(sims[0].model.omega[0])
    cvars = numpy.array([0], numpy.int32)
    # Use the configured simulator's weights and integer delays so both
    # implementations see the same connectivity.
    weights = sims[0].connectivity.weights.astype(numpy.float32)
    delays = sims[0].connectivity.idelays.astype(numpy.int32)

    @cuda.jit
    def kernel(step, state, coupling, aff, buf, dt, omega, cvars,
               weights, delays, a_values):
        i_thread = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        a = a_values[i_thread]
        for i_post in range(weights.shape[0]):
            dcf(aff, delays, weights, state, i_post, i_thread, step[0], cvars, buf)
            coupling[i_post, i_thread] = a * aff[0, i_post, 0, i_thread]
            state[0, i_post, 0, i_thread] += dt * (
                omega + a * aff[0, i_post, 0, i_thread])

    step = numpy.array([0], numpy.int32)
    state = (numpy.zeros((1, n, 1, self.n_thread)) + 0.1).astype(numpy.float32)
    coupling0 = numpy.zeros((n, self.n_thread), numpy.float32)
    aff = numpy.zeros((1, n, 1, self.n_thread), numpy.float32)
    buf = numpy.zeros((n, horiz2, 1, self.n_thread), numpy.float32)
    buf += 0.1
    cu_data = numpy.zeros(py_data.shape, numpy.float32)
    # Per-step coupling is recorded for inspection only; no reference
    # coupling is collected from the Python simulators, so it is not asserted.
    cu_coupling0 = numpy.zeros((cu_data.shape[1], ) + coupling0.shape)
    for step_ in range(cu_data.shape[1]):
        step[0] = step_
        # NOTE(review): Numba's launch syntax is kernel[griddim, blockdim];
        # the attribute names here suggest the opposite order — confirm
        # against how block_dim / grid_dim are defined on the test class.
        kernel[self.block_dim, self.grid_dim](
            step, state, coupling0, aff, buf, dt, omega, cvars,
            weights, delays, a_values)
        cu_data[:, step_] = state[0, :, 0].T
        cu_coupling0[step_] = coupling0

    # accept higher error because it accumulates over time
    # TODO test error proportional to time
    numpy.testing.assert_allclose(cu_data, py_data, 1e-2, 1e-2)