Esempio n. 1
0
def cuda_step_free(positions, g_x, g_y, g_z, phases, rng_states, time_point, n_of_spins, gamma, step_length, dt):
    """Kernel function for free diffusion.

    Each thread handles one spin: it draws a random direction, moves the spin
    by ``step_length`` along it and adds the resulting phase increment for
    every measurement.

    Parameters
    ----------
    positions : array indexed as ``[axis, spin]``; updated in place.
    g_x, g_y, g_z : arrays indexed as ``[time_point, measurement]``.
    phases : array indexed as ``[measurement, spin]``; incremented in place.
    rng_states : xoroshiro128p generator states, indexed by spin.
    time_point : row of the gradient arrays used for this step.
    n_of_spins : threads whose index is >= this value do nothing.
    gamma, dt : scalars multiplying the gradient-position dot product.
    step_length : length of the displacement applied to every spin.
    """
    # One thread per spin on a 1D grid; surplus threads exit immediately.
    spin = cuda.grid(1)
    if spin >= n_of_spins:
        return

    # Three normal deviates give an isotropic direction once normalized.
    displacement = cuda.local.array(3, numba.double)
    for axis in range(3):
        displacement[axis] = xoroshiro128p_normal_float64(rng_states, spin)
    norm = math.sqrt(displacement[0]**2 + displacement[1]**2 + displacement[2]**2)

    # Rescale to the requested length and move the spin.
    for axis in range(3):
        displacement[axis] = step_length * displacement[axis] / norm
        positions[axis, spin] = positions[axis, spin] + displacement[axis]

    # Read the updated coordinates back once; they are constant in the loop.
    x = positions[0, spin]
    y = positions[1, spin]
    z = positions[2, spin]
    weight = gamma * dt
    for m in range(g_x.shape[1]):
        phases[m, spin] += weight * \
                           (g_x[time_point, m] * x + \
                            g_y[time_point, m] * y + \
                            g_z[time_point, m] * z)
Esempio n. 2
0
def cuda_step_cylinder(positions, g_x, g_y, g_z, phases, rng_states, time_point, n_of_spins, gamma, step_length, dt, radius, orientation):
    """Kernel function for diffusion inside a cylinder.

    Each thread handles one spin: it draws a random unit direction, reflects
    the step off the cylinder surface for as long as an intersection lies
    within ``step_length``, moves the spin and adds the resulting phase
    increment for every measurement.

    Parameters
    ----------
    positions : array indexed as ``[axis, spin]``; updated in place.
    g_x, g_y, g_z : arrays indexed as ``[time_point, measurement]``.
    phases : array indexed as ``[measurement, spin]``; incremented in place.
    rng_states : xoroshiro128p generator states, indexed by spin.
    time_point : row of the gradient arrays used for this step.
    n_of_spins : threads whose index is >= this value do nothing.
    gamma, dt : scalars multiplying the gradient-position dot product.
    step_length : length of the displacement applied to every spin.
    radius : cylinder radius, forwarded to ``cylinder_intersection_check``.
    orientation : 3-vector giving the cylinder axis direction.
        NOTE(review): the normal computation below assumes this is a unit
        vector and that the axis passes through the origin - confirm.
    """

    # Global thread index on a 1D grid
    thread_id = cuda.grid(1)
    if thread_id >= n_of_spins:
        return

    # Generate random unit step
    step = cuda.local.array(3, numba.double)
    step[0] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[1] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[2] = xoroshiro128p_normal_float64(rng_states, thread_id)
    normalizing_factor = math.sqrt(step[0]**2 + step[1]**2 + step[2]**2)
    step[0] = step[0] / normalizing_factor
    step[1] = step[1] / normalizing_factor
    step[2] = step[2] / normalizing_factor

    # Check for intersection and reflect the step off the surface.
    # max_iter bounds the number of reflections handled for a single step.
    i = 0
    max_iter = 1e4
    check_intersection = True
    intersection = cuda.local.array(3, numba.double)
    normal_vector = cuda.local.array(3, numba.double)
    while check_intersection and i < max_iter:
        i += 1
        t = cylinder_intersection_check(positions[:, thread_id], step, orientation, radius)
        if t <= step_length:
            intersection[0] = positions[0, thread_id] + t*step[0]
            intersection[1] = positions[1, thread_id] + t*step[1]
            intersection[2] = positions[2, thread_id] + t*step[2]
            # Surface normal: axial projection of the hit point minus the
            # hit point itself, i.e. the negated radial component.
            axial = (intersection[0]*orientation[0]
                     + intersection[1]*orientation[1]
                     + intersection[2]*orientation[2])
            normal_vector[0] = axial*orientation[0] - intersection[0]
            normal_vector[1] = axial*orientation[1] - intersection[1]
            normal_vector[2] = axial*orientation[2] - intersection[2]
            # Normalize with all three components (the x component used to
            # be squared three times, yielding a non-unit normal).
            normalizing_factor = math.sqrt(normal_vector[0]**2 + normal_vector[1]**2 + normal_vector[2]**2)
            normal_vector[0] /= normalizing_factor
            normal_vector[1] /= normalizing_factor
            normal_vector[2] /= normalizing_factor
            reflect_step(positions[:, thread_id], step, intersection, normal_vector, step_length)
        else:
            # No wall within reach: take the step and stop iterating.
            check_intersection = False
            positions[0, thread_id] = positions[0, thread_id] + step_length*step[0]
            positions[1, thread_id] = positions[1, thread_id] + step_length*step[1]
            positions[2, thread_id] = positions[2, thread_id] + step_length*step[2]

    # Calculate phase shift
    for measurement in range(g_x.shape[1]):
        phases[measurement, thread_id] += gamma * dt * \
                                          (g_x[time_point, measurement] * positions[0, thread_id] + \
                                           g_y[time_point, measurement] * positions[1, thread_id] + \
                                           g_z[time_point, measurement] * positions[2, thread_id])
Esempio n. 3
0
def symplectic_map_personal(x, px, step_values, n_iterations, epsilon, alpha,
                            beta, x_star, delta, omega_0, omega_1, omega_2,
                            action_radius, rng_states, gamma):
    """Iterate a noisy rotation map on every (x, px) pair, one per thread.

    For up to ``n_iterations`` steps each thread kicks ``px`` with a
    noise-modulated term and then rotates ``(x, px)`` by an angle that depends
    on the current action ``(x**2 + px**2) / 2``.  A pair that sits exactly at
    the origin, or whose action reaches ``action_radius``, is set to (0, 0)
    and stops iterating.  ``step_values[j]`` is incremented once per completed
    iteration.  ``x``, ``px`` and ``step_values`` are updated in place.

    The noise is updated each iteration as ``new_normal + gamma * noise``.

    The per-thread scratch arrays hold 512 entries and are indexed by
    ``threadIdx.x``, so the kernel must be launched with at most 512 threads
    per block.
    """
    i = cuda.threadIdx.x
    j = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x

    action = cuda.shared.array(shape=(512), dtype=float64)
    rot_angle = cuda.shared.array(shape=(512), dtype=float64)
    temp1 = cuda.shared.array(shape=(512), dtype=float64)
    temp2 = cuda.shared.array(shape=(512), dtype=float64)
    noise = cuda.shared.array(shape=(512), dtype=float64)
    l_x = cuda.shared.array(shape=(512), dtype=float64)
    l_px = cuda.shared.array(shape=(512), dtype=float64)
    l_step = cuda.shared.array(shape=(512), dtype=int32)

    if j < x.shape[0]:
        # Draw the initial noise only for threads that own a particle, so
        # padding threads never index rng_states past the particle count.
        noise[i] = random.xoroshiro128p_normal_float64(rng_states, j)

        l_x[i] = x[j]
        l_px[i] = px[j]
        l_step[i] = step_values[j]
        for k in range(n_iterations):
            action[i] = (l_x[i] * l_x[i] + l_px[i] * l_px[i]) * 0.5
            # NOTE(review): `omega_1 + action` looks like it may have been
            # meant as `omega_1 * action` (linear term of an expansion in
            # the action) - confirm before changing; kept as written.
            rot_angle[i] = omega_0 + (omega_1 + action[i]) + \
                (0.5 * omega_2 * action[i] * action[i])

            # Particles at the origin or beyond the action limit are
            # zeroed and stop iterating.
            if (l_x[i] == 0.0
                    and l_px[i] == 0.0) or action[i] >= action_radius:
                l_x[i] = 0.0
                l_px[i] = 0.0
                break

            # Kick px, then rotate (x, px) by rot_angle.
            temp1[i] = l_x[i]
            temp2[i] = (l_px[i] + epsilon * noise[i] *
                        (l_x[i]**beta) * math.exp(-(
                            (x_star / (delta + abs(l_x[i])))**alpha)))
            l_x[i] = math.cos(rot_angle[i]) * temp1[i] + \
                math.sin(rot_angle[i]) * temp2[i]
            l_px[i] = -math.sin(rot_angle[i]) * temp1[i] + \
                math.cos(rot_angle[i]) * temp2[i]

            l_step[i] += 1

            # Correlated noise: fresh normal deviate plus gamma times the
            # previous noise value.
            noise[i] = random.xoroshiro128p_normal_float64(
                rng_states, j) + gamma * noise[i]
        x[j] = l_x[i]
        px[j] = l_px[i]
        step_values[j] = l_step[i]
Esempio n. 4
0
def cudaNormalVariateKernel(rng_states, an_array):
    """Fill each thread's 1000-element slice with a cumulative normal walk.

    Thread ``t`` owns ``an_array[t * 1000 : (t + 1) * 1000]``.  The first
    element of the slice is left as supplied by the caller; each following
    element is the previous one plus a normal deviate drawn from the
    thread's own generator state.  ``an_array`` is modified in place.

    The slice is clamped to the length of ``an_array`` so that surplus
    threads, or a final partial slice, never write past the end.
    """
    threadId = cuda.grid(
        1)  #cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    base = threadId * 1000
    # Clamp the upper bound; the range is empty for out-of-range threads.
    stop = min(base + 1000, an_array.shape[0])
    for i in range(base + 1, stop):
        an_array[i] = an_array[i - 1] + xoroshiro128p_normal_float64(
            rng_states, threadId)
Esempio n. 5
0
def rng_kernel_float64(states, out, count, distribution):
    """Write ``count`` float64 random draws per thread into ``out``.

    Thread ``t`` fills ``out[t * count : (t + 1) * count]`` from its own
    generator state.  ``distribution`` selects uniform (``UNIFORM``) or
    normal (``NORMAL``) deviates; any other value writes nothing.
    """
    tid = cuda.grid(1)
    offset = tid * count

    # The distribution is the same for every draw, so branch once.
    if distribution == UNIFORM:
        for k in range(count):
            out[offset + k] = xoroshiro128p_uniform_float64(states, tid)
    elif distribution == NORMAL:
        for k in range(count):
            out[offset + k] = xoroshiro128p_normal_float64(states, tid)
Esempio n. 6
0
def _cuda_random_step(step, rng_states, thread_id):
    """Fill ``step`` with a random direction drawn uniformly over a sphere.

    Three independent normal deviates are written into ``step`` and the
    vector is then passed to ``_cuda_normalize_vector``.

    Parameters
    ----------
    step : numba.cuda.cudadrv.devicearray.DeviceNDArray
        Output vector with three components; overwritten in place.
    rng_states : numba.cuda.cudadrv.devicearray.DeviceNDArray
        Generator states; the entry at ``thread_id`` is advanced.
    thread_id : int
        Index of the generator state to draw from.

    Returns
    -------
    None
    """
    step[0] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[1] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[2] = xoroshiro128p_normal_float64(rng_states, thread_id)
    _cuda_normalize_vector(step)
Esempio n. 7
0
def sample(q_array, input, sigma, rng):
    '''
    Sampling (state transition) step of a Bayes filter.

    Each thread perturbs the three values of ``input`` with Gaussian noise
    scaled by ``sigma`` and applies the result to its own column of
    ``q_array`` through ``quaternion_add_euler``.

    :param q_array: array whose columns are updated in place, one per thread
    :param input: three-component increment shared by all threads
    :param sigma: scale factor applied to each normal deviate
    :param rng: xoroshiro128p generator states, indexed by thread
    :return: None
    '''
    column = cuda.grid(1)
    n_columns = q_array.shape[1]

    if column < n_columns:
        noisy_euler = cuda.local.array(shape=(3), dtype=float64)
        for axis in range(noisy_euler.shape[0]):
            deviate = xoroshiro128p_normal_float64(rng, column)
            noisy_euler[axis] = input[axis] + (deviate * sigma)
        quaternion_add_euler(q_array[:, column], noisy_euler)

    # Executed by every thread, including those outside the guard above.
    cuda.syncthreads()
Esempio n. 8
0
def cuda_step_mesh(positions, g_x, g_y, g_z, phases, rng_states, time_point, n_of_spins, gamma, step_length, dt, triangles):
    """Kernel function for mesh simulations.

    Each thread handles one spin: it draws a random unit direction, reflects
    the step off the nearest triangle hit within ``step_length`` (repeating
    until the path is clear), moves the spin and adds the resulting phase
    increment for every measurement.

    Parameters
    ----------
    positions : array indexed as ``[axis, spin]``; updated in place.
    g_x, g_y, g_z : arrays indexed as ``[time_point, measurement]``.
    phases : array indexed as ``[measurement, spin]``; incremented in place.
    rng_states : xoroshiro128p generator states, indexed by spin.
    time_point : row of the gradient arrays used for this step.
    n_of_spins : threads whose index is >= this value do nothing.
    gamma, dt : scalars multiplying the gradient-position dot product.
    step_length : length of the displacement applied to every spin.
    triangles : flat 1D array with nine values per triangle, read as the
        three vertices A, B, C with three coordinates each.  An empty array
        is treated as "no barriers".
    """

    # Global thread index on a 1D grid
    thread_id = cuda.grid(1)
    if thread_id >= n_of_spins:
        return

    # Generate random unit step
    step = cuda.local.array(3, numba.double)
    step[0] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[1] = xoroshiro128p_normal_float64(rng_states, thread_id)
    step[2] = xoroshiro128p_normal_float64(rng_states, thread_id)
    normalizing_factor = math.sqrt(step[0]**2 + step[1]**2 + step[2]**2)
    step[0] = step[0] / normalizing_factor
    step[1] = step[1] / normalizing_factor
    step[2] = step[2] / normalizing_factor

    # Intersection check with reflection.
    # max_iter bounds the number of reflections handled for a single step.
    i = 0
    max_iter = 1e6
    check_intersection = True
    intersection = cuda.local.array(3, numba.double)
    normal_vector = cuda.local.array(3, numba.double)
    n_values = len(triangles)
    while check_intersection and i < max_iter:
        i += 1

        # Find the closest triangle hit strictly within the step length.
        # Reflecting off the first hit in storage order could skip a
        # nearer barrier and let the spin pass through it.
        nearest_t = step_length
        nearest_idx = -1
        for triangle_idx in range(0, n_values, 9):
            A = triangles[triangle_idx:triangle_idx+3]
            B = triangles[triangle_idx+3:triangle_idx+6]
            C = triangles[triangle_idx+6:triangle_idx+9]
            t = triangle_intersection_check(A, B, C, positions[:, thread_id], step)
            if t > 0 and t < nearest_t:
                nearest_t = t
                nearest_idx = triangle_idx

        if nearest_idx >= 0:
            A = triangles[nearest_idx:nearest_idx+3]
            B = triangles[nearest_idx+3:nearest_idx+6]
            C = triangles[nearest_idx+6:nearest_idx+9]
            intersection[0] = positions[0, thread_id] + nearest_t*step[0]
            intersection[1] = positions[1, thread_id] + nearest_t*step[1]
            intersection[2] = positions[2, thread_id] + nearest_t*step[2]
            # Triangle normal: (B - A) x (C - A), normalized.
            normal_vector[0] = (B[1]-A[1])*(C[2]-A[2]) - (B[2]-A[2])*(C[1]-A[1])
            normal_vector[1] = (B[2]-A[2])*(C[0]-A[0]) - (B[0]-A[0])*(C[2]-A[2])
            normal_vector[2] = (B[0]-A[0])*(C[1]-A[1]) - (B[1]-A[1])*(C[0]-A[0])
            normalizing_factor = math.sqrt(normal_vector[0]**2 + normal_vector[1]**2 + normal_vector[2]**2)
            normal_vector[0] = normal_vector[0] / normalizing_factor
            normal_vector[1] = normal_vector[1] / normalizing_factor
            normal_vector[2] = normal_vector[2] / normalizing_factor
            reflect_step(positions[:, thread_id], step, intersection, normal_vector, step_length)
        else:
            # Nothing in the way (also the case for an empty mesh): take
            # the step and stop iterating.
            check_intersection = False
            positions[0, thread_id] = positions[0, thread_id] + step[0]*step_length
            positions[1, thread_id] = positions[1, thread_id] + step[1]*step_length
            positions[2, thread_id] = positions[2, thread_id] + step[2]*step_length

    # Calculate phase shift
    for measurement in range(g_x.shape[1]):
        phases[measurement, thread_id] += gamma * dt * \
                                          (g_x[time_point, measurement] * positions[0, thread_id] + \
                                           g_y[time_point, measurement] * positions[1, thread_id] + \
                                           g_z[time_point, measurement] * positions[2, thread_id])
Esempio n. 9
0
def mcmc(data, output, rng_states, n_iter):
    """Device code of our parallel MCMC implementation.

    Each block processes one sub-dataset: thread ``idx`` reads
    ``data[idx]``, the per-thread log-likelihoods are summed with a
    shared-memory tree reduction, and a Metropolis accept/reject step is
    run ``n_iter`` times.  Thread 0 of every block writes its current
    ``theta`` to ``output[i, block]`` after each iteration.

    Launch requirements visible from the code:
    * ``blockDim.x`` must be a power of two no larger than 512 (size of
      the shared buffer; the halving reduction drops elements otherwise).
    * There is no bounds check on ``data[idx]``, and none can be added as
      an early return because every thread must reach every barrier; the
      grid must therefore cover ``data`` exactly.
    """
    shared = cuda.shared.array(shape=(2**9,), dtype=float64)  # Shared Memory
    tx = cuda.threadIdx.x  # Thread ID
    ty = cuda.blockIdx.x  # Block ID
    bw = cuda.blockDim.x  # Block Size
    idx = bw*ty+tx  # Global ID

    theta = (0.,0.)  # Initialize theta
    x = data[idx]  # Fetch the data point
    logp_x = -(((theta[0]-x[0])**2)/(2*0.1) + ((theta[1]-x[1])**2)/(2*0.1))  # Log-likelihood of the data point
    shared[tx] = logp_x  # Put the log-likelihood to the shared memory
    cuda.syncthreads()

    # Reduction using sequential addressing. NOTE: Increasing the data points per thread might increase the performance
    s = bw//2
    while s>0:
        if tx < s:
            shared[tx] += shared[tx+s]
        cuda.syncthreads()
        s>>=1
    # Get the log-likelihood of the sub-dataset from the first position
    logp = shared[0]  #  NOTE: Might cause some performance issues
    # Every thread must finish reading shared[0] before any thread
    # overwrites the buffer in the first loop iteration below.
    cuda.syncthreads()

    # Add the log-prior
    # NOTE(review): the division by 2 presumably splits the prior across
    # two sub-datasets (blocks) - confirm against the launch configuration.
    log_prior = -(((theta[0]-1)**2)/2 + ((theta[1]-1)**2)/2)
    logp += log_prior/2

    # Main MCMC Loop
    for i in range(n_iter):
        # Propose a new theta
        # NOTE(review): each thread draws its own proposal from
        # rng_states[idx], so the terms summed by the reduction are
        # evaluated at different theta_ values - confirm this is intended.
        theta_ = (theta[0] + 0.1*xoroshiro128p_normal_float64(rng_states, idx), theta[1] + 0.1*xoroshiro128p_normal_float64(rng_states, idx))
        logp_x = -(((theta_[0]-x[0])**2)/(2*0.1) + ((theta_[1]-x[1])**2)/(2*0.1))  # Log-likelihood of the data point
        shared[tx] = logp_x  # Put the log-likelihood to the shared memory
        cuda.syncthreads()

        # Reduction using sequential addressing
        s = bw//2
        while s>0:
            if tx < s:
                shared[tx] += shared[tx+s]
            cuda.syncthreads()
            s>>=1
        # Get the log-likelihood;
        # this will trigger a "broadcast", see https://devblogs.nvidia.com/using-shared-memory-cuda-cc/
        logp_ = shared[0]
        # Barrier so that no thread starts the next iteration and
        # overwrites shared[0] while others are still reading it.
        cuda.syncthreads()

        # Add the log-prior
        log_prior = -(((theta_[0]-1)**2)/2 + ((theta_[1]-1)**2)/2)
        logp_ += log_prior/2

        # Acceptance ratio
        alpha = math.exp(min(0,logp_-logp))
        # Draw a uniform random number
        u = xoroshiro128p_uniform_float64(rng_states, idx)
        # Accept/Reject?
        if u < alpha:
            theta = theta_
            logp = logp_

        # Write the sample to the memory
        if tx == 0:
            output[i,ty] = theta
Esempio n. 10
0
def mcmc_bench(X, Y, output, rng_states, n_iter):
    """Device code of our parallel MCMC implementation (regression model).

    Each block processes one sub-dataset: thread ``idx`` reads ``X[idx]``
    (two features) and ``Y[idx]``, evaluates the log-likelihood of
    ``y`` under mean ``alpha + beta0*x[0] + beta1*x[1]`` and scale
    ``sigma``, and the per-thread values are summed with a shared-memory
    tree reduction.  A Metropolis accept/reject step is run ``n_iter``
    times; thread 0 of every block writes ``(alpha, beta0, beta1, sigma)``
    to ``output[i, block, 0:4]`` after each iteration.

    Launch requirements visible from the code:
    * ``blockDim.x`` must be a power of two no larger than 512 (size of
      the shared buffer; the halving reduction drops elements otherwise).
    * There is no bounds check on ``X[idx]`` / ``Y[idx]``, and none can be
      added as an early return because every thread must reach every
      barrier; the grid must therefore cover the data exactly.
    """
    shared = cuda.shared.array(shape=(2**9,), dtype=float64)  # Shared Memory
    tx = cuda.threadIdx.x  # Thread ID
    ty = cuda.blockIdx.x  # Block ID
    bw = cuda.blockDim.x  # Block Size
    idx = bw*ty+tx  # Global ID

    alpha, beta0, beta1, sigma = 0, 0, 0, 1
    x = X[idx]  # Fetch the data point
    y = Y[idx]
    mu = alpha + beta0*x[0] + beta1*x[1]
    logp_xy = -((y-mu)**2)/(2*(sigma**2)) - math.log(sigma)  # Log-likelihood of the data point
    shared[tx] = logp_xy  # Put the log-likelihood to the shared memory
    cuda.syncthreads()

    # Reduction using sequential addressing. NOTE: Increasing the data points per thread might increase the performance
    s = bw//2
    while s>0:
        if tx < s:
            shared[tx] += shared[tx+s]
        cuda.syncthreads()
        s>>=1
    # Get the log-likelihood of the sub-dataset from the first position
    logp = shared[0]  #  NOTE: Might cause some performance issues
    # Every thread must finish reading shared[0] before any thread
    # overwrites the buffer in the first loop iteration below.
    cuda.syncthreads()

    # Add the log-prior
    log_prior = - ((alpha**2)/(2*(10**2)) + (beta0**2)/(2*(10**2)) + (beta1**2)/(2*(10**2)) + (sigma**2)/2)
    logp += log_prior

    # Main MCMC Loop
    for i in range(n_iter):
        # Propose a new theta
        # NOTE(review): each thread draws its own proposal from
        # rng_states[idx], so the terms summed by the reduction are
        # evaluated at different parameter values - confirm this is intended.
        alpha_ = alpha + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        beta0_ = beta0 + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        beta1_ = beta1 + 0.1*xoroshiro128p_normal_float64(rng_states, idx)
        sigma_ = sigma + 0.1*xoroshiro128p_normal_float64(rng_states, idx)

        mu = alpha_ + beta0_*x[0] + beta1_*x[1]
        # NOTE(review): sigma_ can become non-positive, which makes
        # math.log(sigma_) invalid; such proposals are not rejected
        # explicitly here - confirm how they should be handled.
        logp_xy = -((y-mu)**2)/(2*(sigma_**2)) - math.log(sigma_)
        shared[tx] = logp_xy  # Put the log-likelihood to the shared memory
        cuda.syncthreads()

        # Reduction using sequential addressing
        s = bw//2
        while s>0:
            if tx < s:
                shared[tx] += shared[tx+s]
            cuda.syncthreads()
            s>>=1
        # Get the log-likelihood;
        # this will trigger a "broadcast", see https://devblogs.nvidia.com/using-shared-memory-cuda-cc/
        logp_ = shared[0]
        # Barrier so that no thread starts the next iteration and
        # overwrites shared[0] while others are still reading it.
        cuda.syncthreads()

        # Add the log-prior
        log_prior = - ((alpha_**2)/(2*(10**2)) + (beta0_**2)/(2*(10**2)) + (beta1_**2)/(2*(10**2)) + (sigma_**2)/2)
        logp_ += log_prior

        # Acceptance ratio
        gamma = math.exp(min(0,logp_-logp))
        # Draw a uniform random number
        u = xoroshiro128p_uniform_float64(rng_states, idx)
        # Accept/Reject?
        if u < gamma:
            alpha = alpha_
            beta0 = beta0_
            beta1 = beta1_
            sigma = sigma_
            logp = logp_

        # Write the sample to the memory
        if tx == 0:
            output[i,ty,0] = alpha
            output[i,ty,1] = beta0
            output[i,ty,2] = beta1
            output[i,ty,3] = sigma