def cuda_step_free(positions, g_x, g_y, g_z, phases, rng_states, time_point,
                   n_of_spins, gamma, step_length, dt):
    """Kernel function for free diffusion.

    One thread advances one spin: it draws a random direction, moves the
    spin by ``step_length`` along it and accumulates the phase increment of
    every measurement for the given time point.

    Parameters
    ----------
    positions : array indexed as ``[axis, spin]``, updated in place.
    g_x, g_y, g_z : arrays indexed as ``[time_point, measurement]``.
    phases : array indexed as ``[measurement, spin]``, accumulated in place.
    rng_states : xoroshiro128p generator states, indexed by thread.
    time_point : row of the gradient arrays used for this step.
    n_of_spins : threads whose index is at or beyond this value do nothing.
    gamma, dt : scalar factors multiplying the phase increment.
    step_length : length of the displacement applied to the spin.
    """
    # Global thread index on a 1D grid; surplus threads exit immediately
    spin = cuda.grid(1)
    if spin >= n_of_spins:
        return

    # Three normal draws give the direction of the step
    displacement = cuda.local.array(3, numba.double)
    for axis in range(3):
        displacement[axis] = xoroshiro128p_normal_float64(rng_states, spin)
    norm = math.sqrt(displacement[0]**2 + displacement[1]**2 + displacement[2]**2)

    # Rescale to step_length and move the spin
    for axis in range(3):
        displacement[axis] = step_length * displacement[axis] / norm
        positions[axis, spin] = positions[axis, spin] + displacement[axis]

    # Phase shift of every measurement at the updated position
    for measurement in range(g_x.shape[1]):
        gradient_dot_position = (
            g_x[time_point, measurement] * positions[0, spin]
            + g_y[time_point, measurement] * positions[1, spin]
            + g_z[time_point, measurement] * positions[2, spin])
        phases[measurement, spin] += gamma * dt * gradient_dot_position
def cuda_step_cylinder(positions, g_x, g_y, g_z, phases, rng_states, time_point,
                       n_of_spins, gamma, step_length, dt, radius, orientation):
    """Kernel function for diffusion inside a cylinder.

    One thread advances one spin: it draws a random unit direction, reflects
    the step off the cylinder surface for as long as
    ``cylinder_intersection_check`` reports a hit within ``step_length``,
    moves the spin, and accumulates the phase increment of every measurement.

    Parameters
    ----------
    positions : array indexed as ``[axis, spin]``, updated in place.
    g_x, g_y, g_z : arrays indexed as ``[time_point, measurement]``.
    phases : array indexed as ``[measurement, spin]``, accumulated in place.
    rng_states : xoroshiro128p generator states, indexed by thread.
    time_point : row of the gradient arrays used for this step.
    n_of_spins : threads whose index is at or beyond this value do nothing.
    gamma, dt : scalar factors multiplying the phase increment.
    step_length : length of the step taken by the spin.
    radius : passed to ``cylinder_intersection_check``.
    orientation : 3-vector giving the cylinder axis direction.
        NOTE(review): the normal computation assumes this is a unit vector
        and that the axis passes through the origin -- confirm with callers.
    """
    # Global thread index on a 1D grid
    thread_id = cuda.grid(1)
    if thread_id >= n_of_spins:
        return

    # Generate random unit step
    step = cuda.local.array(3, numba.double)
    step[0] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[1] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[2] = xoroshiro128p_normal_float64(rng_states, thread_id)
    normalizing_factor = math.sqrt(step[0]**2 + step[1]**2 + step[2]**2)
    step[0] = step[0] / normalizing_factor
    step[1] = step[1] / normalizing_factor
    step[2] = step[2] / normalizing_factor

    # Check for intersection and reflect the step off the surface.
    # max_iter bounds the number of reflections handled for one step.
    i = 0
    max_iter = 1e4
    check_intersection = True
    intersection = cuda.local.array(3, numba.double)
    normal_vector = cuda.local.array(3, numba.double)
    while check_intersection and i < max_iter:
        i += 1
        t = cylinder_intersection_check(positions[:, thread_id], step,
                                        orientation, radius)
        if t <= step_length:
            intersection[0] = positions[0, thread_id] + t*step[0]
            intersection[1] = positions[1, thread_id] + t*step[1]
            intersection[2] = positions[2, thread_id] + t*step[2]

            # Normal = (projection of the hit point onto the axis) - hit point
            axial = (intersection[0]*orientation[0]
                     + intersection[1]*orientation[1]
                     + intersection[2]*orientation[2])
            normal_vector[0] = axial*orientation[0] - intersection[0]
            normal_vector[1] = axial*orientation[1] - intersection[1]
            normal_vector[2] = axial*orientation[2] - intersection[2]

            # Normalise with all three components (the previous code summed
            # the square of component 0 three times).
            normalizing_factor = math.sqrt(normal_vector[0]**2
                                           + normal_vector[1]**2
                                           + normal_vector[2]**2)
            normal_vector[0] /= normalizing_factor
            normal_vector[1] /= normalizing_factor
            normal_vector[2] /= normalizing_factor

            reflect_step(positions[:, thread_id], step, intersection,
                         normal_vector, step_length)
        else:
            check_intersection = False

    # Move the spin along the (possibly reflected) step
    positions[0, thread_id] = positions[0, thread_id] + step_length*step[0]
    positions[1, thread_id] = positions[1, thread_id] + step_length*step[1]
    positions[2, thread_id] = positions[2, thread_id] + step_length*step[2]

    # Calculate phase shift
    for measurement in range(g_x.shape[1]):
        phases[measurement, thread_id] += gamma * dt * \
            (g_x[time_point, measurement] * positions[0, thread_id] + \
             g_y[time_point, measurement] * positions[1, thread_id] + \
             g_z[time_point, measurement] * positions[2, thread_id])
def symplectic_map_personal(x, px, step_values, n_iterations, epsilon, alpha,
                            beta, x_star, delta, omega_0, omega_1, omega_2,
                            action_radius, rng_states, gamma):
    """Iterate a noisy rotation map for one particle per thread.

    Each in-range thread loads its ``(x, px)`` pair, applies up to
    ``n_iterations`` kick-and-rotate steps, and writes the result back to
    ``x``, ``px`` and ``step_values``. A particle that is exactly at the
    origin, or whose action ``(x**2 + px**2) / 2`` reaches ``action_radius``,
    is set to ``(0, 0)`` and stops iterating.

    Parameters
    ----------
    x, px : 1D arrays of coordinates and momenta, updated in place.
    step_values : 1D array of step counters, incremented once per
        completed iteration.
    n_iterations : maximum number of iterations for this launch.
    epsilon : scale of the noise kick added to ``px``.
    alpha, beta, x_star, delta : shape parameters of the kick
        ``x**beta * exp(-(x_star / (delta + |x|))**alpha)``.
    omega_0, omega_1, omega_2 : coefficients of the rotation angle
        ``omega_0 + omega_1 * J + 0.5 * omega_2 * J**2`` with ``J`` the action.
    action_radius : action at or above which the particle is dropped.
    rng_states : xoroshiro128p generator states, indexed by global thread.
    gamma : weight of the previous noise value in the next one
        (``noise = new_draw + gamma * noise``).

    Notes
    -----
    The scratch arrays are sized 512 and indexed by ``threadIdx.x``, so the
    block size must not exceed 512.
    """
    i = cuda.threadIdx.x
    j = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x

    # Per-block scratch space; each thread only touches its own slot i
    action = cuda.shared.array(shape=(512), dtype=float64)
    rot_angle = cuda.shared.array(shape=(512), dtype=float64)
    temp1 = cuda.shared.array(shape=(512), dtype=float64)
    temp2 = cuda.shared.array(shape=(512), dtype=float64)
    noise = cuda.shared.array(shape=(512), dtype=float64)
    l_x = cuda.shared.array(shape=(512), dtype=float64)
    l_px = cuda.shared.array(shape=(512), dtype=float64)
    l_step = cuda.shared.array(shape=(512), dtype=int32)

    if j < x.shape[0]:
        # Draw inside the guard so surplus threads never index rng_states
        noise[i] = random.xoroshiro128p_normal_float64(rng_states, j)
        l_x[i] = x[j]
        l_px[i] = px[j]
        l_step[i] = step_values[j]

        for k in range(n_iterations):
            action[i] = (l_x[i] * l_x[i] + l_px[i] * l_px[i]) * 0.5
            # omega_1 multiplies the action; the previous code added it,
            # which made omega_1 indistinguishable from omega_0.
            rot_angle[i] = omega_0 + (omega_1 * action[i]) + \
                (0.5 * omega_2 * action[i] * action[i])

            if (l_x[i] == 0.0 and l_px[i] == 0.0) or action[i] >= action_radius:
                # Particle is lost: park it at the origin and stop
                l_x[i] = 0.0
                l_px[i] = 0.0
                break

            # Kick the momentum, then rotate (x, px) by rot_angle
            temp1[i] = l_x[i]
            temp2[i] = (l_px[i] + epsilon * noise[i] * (l_x[i]**beta)
                        * math.exp(-((x_star / (delta + abs(l_x[i])))**alpha)))
            l_x[i] = math.cos(rot_angle[i]) * temp1[i] + \
                math.sin(rot_angle[i]) * temp2[i]
            l_px[i] = -math.sin(rot_angle[i]) * temp1[i] + \
                math.cos(rot_angle[i]) * temp2[i]

            l_step[i] += 1
            noise[i] = random.xoroshiro128p_normal_float64(
                rng_states, j) + gamma * noise[i]

        x[j] = l_x[i]
        px[j] = l_px[i]
        step_values[j] = l_step[i]
def cudaNormalVariateKernel(rng_states, an_array):
    """Fill a 1000-element slice of ``an_array`` with a running sum of normal draws.

    Thread ``t`` owns indices ``t * 1000`` to ``t * 1000 + 999``. The first
    element of the slice is left as it is; every later element becomes its
    predecessor plus a fresh normal variate from the thread's generator.
    There is no bounds check, so the array must hold 1000 elements for every
    launched thread.
    """
    threadId = cuda.grid(1)
    first = threadId * 1000
    for offset in range(1, 1000):
        slot = first + offset
        increment = xoroshiro128p_normal_float64(rng_states, threadId)
        an_array[slot] = an_array[slot - 1] + increment
def rng_kernel_float64(states, out, count, distribution):
    """Write ``count`` float64 random draws per thread into ``out``.

    Thread ``t`` fills ``out[t * count]`` to ``out[t * count + count - 1]``
    with uniform draws when ``distribution == UNIFORM`` and with normal draws
    when ``distribution == NORMAL``. Any other value leaves ``out`` untouched.
    """
    thread_id = cuda.grid(1)
    first = thread_id * count

    # distribution does not change during the kernel, so branch once
    if distribution == UNIFORM:
        for i in range(count):
            out[first + i] = xoroshiro128p_uniform_float64(states, thread_id)
    elif distribution == NORMAL:
        for i in range(count):
            out[first + i] = xoroshiro128p_normal_float64(states, thread_id)
def _cuda_random_step(step, rng_states, thread_id):
    """Generate a random step from a uniform distribution over a sphere.

    The three components of ``step`` are overwritten with independent normal
    draws from the generator of ``thread_id`` and the vector is then handed
    to ``_cuda_normalize_vector`` (presumably rescaling it to unit length in
    place -- verify against that helper).

    Parameters
    ----------
    step : numba.cuda.cudadrv.devicearray.DeviceNDArray
        Array of length 3, overwritten in place.
    rng_states : numba.cuda.cudadrv.devicearray.DeviceNDArray
        xoroshiro128p generator states.
    thread_id : int
        Index of the generator state to draw from.

    Returns
    -------
    None
    """
    step[0] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[1] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[2] = xoroshiro128p_normal_float64(rng_states, thread_id)
    _cuda_normalize_vector(step)
def sample(q_array, input, sigma, rng):
    '''
    Sample or state transition function in bayes filter theory.
    Here is a single example without any change in system state.

    Each in-range thread perturbs the three values of ``input`` with
    normally distributed noise scaled by ``sigma`` and passes the result,
    together with its own column of ``q_array``, to ``quaternion_add_euler``.

    :param q_array: 2D array; column ``pos`` is handed to
        ``quaternion_add_euler`` (presumably updated in place -- verify
        against that helper)
    :param input: sequence of three values the noise is added to
    :param sigma: scale factor applied to every normal draw
    :param rng: xoroshiro128p generator states, indexed by thread
    :return: None
    '''
    pos = cuda.grid(1)
    if pos < q_array.shape[1]:
        euler = cuda.local.array(shape=(3), dtype=float64)
        for i in range(euler.shape[0]):
            euler[i] = input[i] + (xoroshiro128p_normal_float64(rng, pos) * sigma)
        quaternion_add_euler(q_array[:, pos], euler)
    # The barrier sits outside the bounds check so that every thread of the
    # block reaches it, including those that skipped the work above.
    cuda.syncthreads()
def cuda_step_mesh(positions, g_x, g_y, g_z, phases, rng_states, time_point,
                   n_of_spins, gamma, step_length, dt, triangles):
    """Kernel function for mesh simulations.

    One thread advances one spin: it draws a random unit direction, reflects
    the step off mesh triangles for as long as one is hit within
    ``step_length``, moves the spin, and accumulates the phase increment of
    every measurement.

    Parameters
    ----------
    positions : array indexed as ``[axis, spin]``, updated in place.
    g_x, g_y, g_z : arrays indexed as ``[time_point, measurement]``.
    phases : array indexed as ``[measurement, spin]``, accumulated in place.
    rng_states : xoroshiro128p generator states, indexed by thread.
    time_point : row of the gradient arrays used for this step.
    n_of_spins : threads whose index is at or beyond this value do nothing.
    gamma, dt : scalar factors multiplying the phase increment.
    step_length : length of the step taken by the spin.
    triangles : flat 1D array holding 9 values per triangle, read as the
        three corner points A, B, C in that order.
    """
    # Global thread index on a 1D grid
    thread_id = cuda.grid(1)
    if thread_id >= n_of_spins:
        return

    # Generate random unit step
    step = cuda.local.array(3, numba.double)
    step[0] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[1] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[2] = xoroshiro128p_normal_float64(rng_states, thread_id)
    normalizing_factor = math.sqrt(step[0]**2 + step[1]**2 + step[2]**2)
    step[0] = step[0] / normalizing_factor
    step[1] = step[1] / normalizing_factor
    step[2] = step[2] / normalizing_factor

    # Intersection check with reflection. Each pass scans the triangles and
    # reflects off the first one hit; the loop ends after a pass without any
    # hit, or after max_iter passes.
    # NOTE(review): the first triangle hit in array order is used, not
    # necessarily the nearest one along the step -- confirm this is intended.
    i = 0
    max_iter = 1e6
    check_intersection = True
    intersection = cuda.local.array(3, numba.double)
    normal_vector = cuda.local.array(3, numba.double)
    while check_intersection and i < max_iter:
        i += 1
        # Assume no hit; also terminates at once for an empty triangle array
        check_intersection = False
        for triangle_idx in range(0, len(triangles), 9):
            A = triangles[triangle_idx:triangle_idx+3]
            B = triangles[triangle_idx+3:triangle_idx+6]
            C = triangles[triangle_idx+6:triangle_idx+9]
            t = triangle_intersection_check(A, B, C, positions[:, thread_id], step)
            if t > 0 and t < step_length:
                intersection[0] = positions[0, thread_id] + t*step[0]
                intersection[1] = positions[1, thread_id] + t*step[1]
                intersection[2] = positions[2, thread_id] + t*step[2]

                # Triangle normal = (B - A) x (C - A), scaled to unit length
                normal_vector[0] = (B[1]-A[1])*(C[2]-A[2]) - (B[2]-A[2])*(C[1]-A[1])
                normal_vector[1] = (B[2]-A[2])*(C[0]-A[0]) - (B[0]-A[0])*(C[2]-A[2])
                normal_vector[2] = (B[0]-A[0])*(C[1]-A[1]) - (B[1]-A[1])*(C[0]-A[0])
                normalizing_factor = math.sqrt(normal_vector[0]**2
                                               + normal_vector[1]**2
                                               + normal_vector[2]**2)
                normal_vector[0] = normal_vector[0] / normalizing_factor
                normal_vector[1] = normal_vector[1] / normalizing_factor
                normal_vector[2] = normal_vector[2] / normalizing_factor

                reflect_step(positions[:, thread_id], step, intersection,
                             normal_vector, step_length)
                # The reflected step has to be checked against the mesh again
                check_intersection = True
                break

    # Move the spin along the (possibly reflected) step
    positions[0, thread_id] = positions[0, thread_id] + step[0]*step_length
    positions[1, thread_id] = positions[1, thread_id] + step[1]*step_length
    positions[2, thread_id] = positions[2, thread_id] + step[2]*step_length

    # Calculate phase shift
    for measurement in range(g_x.shape[1]):
        phases[measurement, thread_id] += gamma * dt * \
            (g_x[time_point, measurement] * positions[0, thread_id] + \
             g_y[time_point, measurement] * positions[1, thread_id] + \
             g_z[time_point, measurement] * positions[2, thread_id])
def mcmc(data, output, rng_states, n_iter):
    """Device code of our parallel MCMC implementation.

    Each thread owns one data point; each block sums the per-point
    log-likelihoods with a shared-memory tree reduction and runs ``n_iter``
    Metropolis steps on a 2-component ``theta``. Thread 0 of every block
    writes the current ``theta`` to ``output[i, block]`` after each step.

    Parameters
    ----------
    data : array indexed by global thread id; each entry supplies ``x[0]``
        and ``x[1]``. There is no bounds check, so it must hold one entry
        per launched thread.
    output : array receiving one sample per iteration and block.
    rng_states : xoroshiro128p generator states, indexed by global thread id.
    n_iter : number of Metropolis iterations.

    Notes
    -----
    The reduction halves the stride starting from ``blockDim.x // 2``, so a
    block size that is not a power of two drops some elements, and the
    shared buffer limits the block size to 512.

    NOTE(review): every thread draws its own proposal and acceptance
    variate from ``rng_states[idx]``, so threads of one block can evaluate
    different ``theta_`` values while contributing to the same reduced sum.
    Confirm whether the proposal should be drawn once per block instead.
    """
    shared = cuda.shared.array(shape=(2**9,), dtype=float64)  # Shared Memory
    tx = cuda.threadIdx.x  # Thread ID
    ty = cuda.blockIdx.x   # Block ID
    bw = cuda.blockDim.x   # Block Size
    idx = bw*ty+tx         # Global ID

    theta = (0.,0.)  # Initialize theta
    x = data[idx]    # Fetch the data point

    # Log-likelihood of the data point
    logp_x = -(((theta[0]-x[0])**2)/(2*0.1) + ((theta[1]-x[1])**2)/(2*0.1))
    shared[tx] = logp_x  # Put the log-likelihood to the shared memory
    cuda.syncthreads()

    # Reduction using sequential addressing.
    # NOTE: Increasing the data points per thread might increase the performance
    s = bw//2
    while s>0:
        if tx < s:
            shared[tx] += shared[tx+s]
        cuda.syncthreads()
        s>>=1

    # Get the log-likelihood of the sub-dataset from the first position
    logp = shared[0]  # NOTE: Might cause some performance issues
    # All threads must have read shared[0] before any thread overwrites the
    # buffer in the first loop iteration.
    cuda.syncthreads()

    # Add the log-prior
    log_prior = -(((theta[0]-1)**2)/2 + ((theta[1]-1)**2)/2)
    logp += log_prior/2

    # Main MCMC Loop
    for i in range(n_iter):
        # Propose a new theta
        theta_ = (theta[0] + 0.1*xoroshiro128p_normal_float64(rng_states, idx),
                  theta[1] + 0.1*xoroshiro128p_normal_float64(rng_states, idx))

        # Log-likelihood of the data point
        logp_x = -(((theta_[0]-x[0])**2)/(2*0.1) + ((theta_[1]-x[1])**2)/(2*0.1))
        shared[tx] = logp_x  # Put the log-likelihood to the shared memory
        cuda.syncthreads()

        # Reduction using sequential addressing
        s = bw//2
        while s>0:
            if tx < s:
                shared[tx] += shared[tx+s]
            cuda.syncthreads()
            s>>=1

        # Get the log-likelihood;
        # this will trigger a "broadcast", see https://devblogs.nvidia.com/using-shared-memory-cuda-cc/
        logp_ = shared[0]
        # Keep the next iteration's write to shared[0] from racing with
        # slower threads that are still reading it.
        cuda.syncthreads()

        # Add the log-prior
        log_prior = -(((theta_[0]-1)**2)/2 + ((theta_[1]-1)**2)/2)
        logp_ += log_prior/2

        # Acceptance ratio
        alpha = math.exp(min(0,logp_-logp))

        # Draw a uniform random number
        u = xoroshiro128p_uniform_float64(rng_states, idx)

        # Accept/Reject?
        if u < alpha:
            theta = theta_
            logp = logp_

        # Write the sample to the memory
        if tx == 0:
            output[i,ty] = theta
def mcmc_bench(X, Y, output, rng_states, n_iter):
    """Device code of our parallel MCMC implementation.

    Linear-regression variant: each thread owns one ``(x, y)`` observation
    with mean ``alpha + beta0*x[0] + beta1*x[1]`` and scale ``sigma``. Each
    block sums the per-point log-likelihoods with a shared-memory tree
    reduction and runs ``n_iter`` Metropolis steps. Thread 0 of every block
    writes ``alpha, beta0, beta1, sigma`` to ``output[i, block, 0:4]`` after
    each step.

    Parameters
    ----------
    X : array indexed by global thread id; each entry supplies ``x[0]``
        and ``x[1]``.
    Y : array indexed by global thread id holding the responses.
    output : array indexed as ``[iteration, block, parameter]``.
    rng_states : xoroshiro128p generator states, indexed by global thread id.
    n_iter : number of Metropolis iterations.

    Notes
    -----
    There is no bounds check on ``X`` and ``Y``. The reduction halves the
    stride starting from ``blockDim.x // 2``, so a block size that is not a
    power of two drops some elements, and the shared buffer limits the
    block size to 512.

    NOTE(review): every thread draws its own proposal and acceptance
    variate from ``rng_states[idx]``, so threads of one block can evaluate
    different proposals while contributing to the same reduced sum; and a
    proposed ``sigma_`` can be non-positive, for which ``math.log`` is
    undefined. Confirm both against the intended algorithm.
    """
    shared = cuda.shared.array(shape=(2**9,), dtype=float64)  # Shared Memory
    tx = cuda.threadIdx.x  # Thread ID
    ty = cuda.blockIdx.x   # Block ID
    bw = cuda.blockDim.x   # Block Size
    idx = bw*ty+tx         # Global ID

    # Start values as floats so the variables keep one type once proposals
    # are accepted
    alpha, beta0, beta1, sigma = 0.0, 0.0, 0.0, 1.0
    x = X[idx]  # Fetch the data point
    y = Y[idx]

    mu = alpha + beta0*x[0] + beta1*x[1]
    # Log-likelihood of the data point
    logp_xy = -((y-mu)**2)/(2*(sigma**2)) - math.log(sigma)
    shared[tx] = logp_xy  # Put the log-likelihood to the shared memory
    cuda.syncthreads()

    # Reduction using sequential addressing.
    # NOTE: Increasing the data points per thread might increase the performance
    s = bw//2
    while s>0:
        if tx < s:
            shared[tx] += shared[tx+s]
        cuda.syncthreads()
        s>>=1

    # Get the log-likelihood of the sub-dataset from the first position
    logp = shared[0]  # NOTE: Might cause some performance issues
    # All threads must have read shared[0] before any thread overwrites the
    # buffer in the first loop iteration.
    cuda.syncthreads()

    # Add the log-prior
    log_prior = - ((alpha**2)/(2*(10**2)) + (beta0**2)/(2*(10**2))
                   + (beta1**2)/(2*(10**2)) + (sigma**2)/2)
    logp += log_prior

    # Main MCMC Loop
    for i in range(n_iter):
        # Propose new parameters
        alpha_ = alpha + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        beta0_ = beta0 + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        beta1_ = beta1 + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        sigma_ = sigma + 0.1*xoroshiro128p_normal_float64(rng_states, idx)

        mu = alpha_ + beta0_*x[0] + beta1_*x[1]
        logp_xy = -((y-mu)**2)/(2*(sigma_**2)) - math.log(sigma_)
        shared[tx] = logp_xy  # Put the log-likelihood to the shared memory
        cuda.syncthreads()

        # Reduction using sequential addressing
        s = bw//2
        while s>0:
            if tx < s:
                shared[tx] += shared[tx+s]
            cuda.syncthreads()
            s>>=1

        # Get the log-likelihood;
        # this will trigger a "broadcast", see https://devblogs.nvidia.com/using-shared-memory-cuda-cc/
        logp_ = shared[0]
        # Keep the next iteration's write to shared[0] from racing with
        # slower threads that are still reading it.
        cuda.syncthreads()

        # Add the log-prior
        log_prior = - ((alpha_**2)/(2*(10**2)) + (beta0_**2)/(2*(10**2))
                       + (beta1_**2)/(2*(10**2)) + (sigma_**2)/2)
        logp_ += log_prior

        # Acceptance ratio
        gamma = math.exp(min(0,logp_-logp))

        # Draw a uniform random number
        u = xoroshiro128p_uniform_float64(rng_states, idx)

        # Accept/Reject?
        if u < gamma:
            alpha = alpha_
            beta0 = beta0_
            beta1 = beta1_
            sigma = sigma_
            logp = logp_

        # Write the sample to the memory
        if tx == 0:
            output[i,ty,0] = alpha
            output[i,ty,1] = beta0
            output[i,ty,2] = beta1
            output[i,ty,3] = sigma