Example 1
    def fill_layers(self):
        blocks = 1

        threads_per_block = 19
        dt = datetime.now()
        rng_states = create_xoroshiro128p_states(threads_per_block * blocks,
                                                 seed=dt.microsecond)
        for i, x in enumerate(self.weights_layer_2):
            fill_random[1, threads_per_block](self.weights_layer_2[i], 19,
                                              rng_states)

        threads_per_block = 17
        dt = datetime.now()
        rng_states = create_xoroshiro128p_states(threads_per_block * blocks,
                                                 seed=dt.microsecond)
        for i, x in enumerate(self.weights_layer_3):
            fill_random[1, threads_per_block](self.weights_layer_3[i], 17,
                                              rng_states)

        threads_per_block = 17
        dt = datetime.now()
        rng_states = create_xoroshiro128p_states(threads_per_block * blocks,
                                                 seed=dt.microsecond)
        for i, x in enumerate(self.weights_layer_4):
            fill_random[1, threads_per_block](self.weights_layer_4[i], 17,
                                              rng_states)
Example 2
def crossover(self, father, mother):
    nn = copy.deepcopy(father)

    # update the biases of each layer
    for u in range(nn.num_layers - 1):
        # max number of threads per block dimension
        TPB = 4
        x = nn.biases[u].shape[0]
        y = nn.biases[u].shape[1]
        threadsperblock = [x, y]

        if (x > TPB):
            threadsperblock[0] = TPB

        if (y > TPB):
            threadsperblock[1] = TPB
        threadsperblock = (threadsperblock[0], threadsperblock[1])
        # number of blocks for the given input
        blockspergrid_x = int(math.ceil(x / threadsperblock[0]))
        blockspergrid_y = int(math.ceil(y / threadsperblock[1]))
        blockspergrid = (blockspergrid_x, blockspergrid_y)
        # generate RNG states to use in the kernel
        rng_states = create_xoroshiro128p_states(x * y,
                                                 seed=random.randint(0, 10))
        # call the parallel crossover kernel for this layer's biases, driven by the random numbers
        biase_crossover[blockspergrid,
                        threadsperblock](rng_states, nn.biases[u],
                                         self.crossover_rate, mother.biases[u])
    # update the weights of each layer
    for u in range(nn.num_layers - 1):
        # max number of threads per block dimension
        TPB = 4
        x = nn.weights[u].shape[0]
        y = nn.weights[u].shape[1]
        threadsperblock = [x, y]
        #blockspergrid_x=1
        #blockspergrid_y=1
        if (x > TPB):
            threadsperblock[0] = TPB

        if (y > TPB):
            threadsperblock[1] = TPB
        threadsperblock = (threadsperblock[0], threadsperblock[1])

        blockspergrid_x = int(math.ceil(x / threadsperblock[0]))
        blockspergrid_y = int(math.ceil(y / threadsperblock[1]))
        blockspergrid = (blockspergrid_x, blockspergrid_y)
        #C_global_mem = cuda.device_array((w.shape[0], a.shape[1]))
        # generate RNG states to use in the kernel
        rng_states = create_xoroshiro128p_states(x * y,
                                                 seed=random.randint(0, 10))
        # call the parallel crossover kernel for this layer's weights, driven by the random numbers
        weight_crossover[blockspergrid,
                         threadsperblock](rng_states, nn.weights[u],
                                          self.crossover_rate,
                                          mother.weights[u])
        #print(nn.weights[u],"final")
    return nn
Example 3
def maybe_create_rng_states(n, seed=0, rng_states=None):
    """Create or extend random states for CUDA kernel"""
    if rng_states is None:
        return create_xoroshiro128p_states(n, seed=seed)
    elif n > len(rng_states):
        new_states = device_array(n, dtype=rng_states.dtype)
        new_states[:len(rng_states)] = rng_states
        new_states[len(rng_states):] = create_xoroshiro128p_states(
            n - len(rng_states), seed=seed)
        return new_states
    return rng_states
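A minimal usage sketch for the helper above (the launch sizes and seed are made up for illustration; it assumes a CUDA-capable device and the same imports as the snippet). The first call creates the states; a later call with a larger count keeps the existing states and only creates the missing tail.

threads_per_block = 128
blocks = 4
rng_states = maybe_create_rng_states(threads_per_block * blocks, seed=7)

# If the launch later grows, the same helper extends the state array instead
# of recreating it from scratch (hypothetical sizes).
blocks = 8
rng_states = maybe_create_rng_states(threads_per_block * blocks, seed=7,
                                     rng_states=rng_states)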
Example 4
def _calcularColsReparar(ponderaciones):
    resultado = np.zeros((ponderaciones.shape[0], ponderaciones.shape[1]),
                         dtype=np.uint8)
    #colsCandidatas = np.zeros((ponderaciones.shape[0], ponderaciones.shape[1]), dtype=np.uint8)
    colsCandidatasGlobal = np.ones(
        (ponderaciones.shape[0], 10), dtype=np.int32) * -1
    rng_states = create_xoroshiro128p_states(COL, seed=1)
    ponderacionMaxima = np.array([np.max(ponderaciones)])
    #print(f"ponderacion maxima {ponderacionMaxima}")
    # set up the kernel launch configuration
    threadsperblock = (NSOL, COL)
    blockspergrid_x = int(
        math.ceil(ponderaciones.shape[0] / threadsperblock[0]))
    blockspergrid_y = int(
        math.ceil(ponderaciones.shape[1] / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    ponderaciones_global_mem = cuda.to_device(ponderaciones)
    resultado_global_mem = cuda.to_device(resultado)
    colsCandidatasGlobal_mem = cuda.to_device(colsCandidatasGlobal)
    poderacionMaxima_mem = cuda.to_device(ponderacionMaxima)
    rng_states_mem = cuda.to_device(rng_states)

    # call the kernel; pass the device array so the results copied back below
    # come from the kernel's output
    kernelColsCandidatasGPU[blockspergrid,
                            threadsperblock](ponderaciones_global_mem,
                                             poderacionMaxima_mem,
                                             colsCandidatasGlobal_mem,
                                             rng_states_mem,
                                             resultado_global_mem)

    return colsCandidatasGlobal_mem.copy_to_host()
Example 5
 def fill_uniformly(self, n_of_spins, seed):
     """
     Calculate positions for spins evenly distributed in the substrate.
     
     Parameters
     ----------
         n_of_spins : int
             Number of spins.
         seed : int
             Seed for random number generation.
     
     Returns
     -------
         positions : numpy array
             Calculated positions for spins.
     """
     triangles = self.triangles
     block_size = 256
     grid_size = int(math.ceil(float(n_of_spins) / block_size))
     stream = cuda.stream()
     rng_states = create_xoroshiro128p_states(grid_size * block_size,
                                              seed=seed,
                                              stream=stream)
     positions = np.zeros([3, n_of_spins])
     d_positions = cuda.to_device(positions, stream=stream)
     d_triangles = cuda.to_device(triangles.ravel(), stream=stream)
     d_max = cuda.to_device(np.max(np.max(triangles, 0), 0), stream=stream)
     fill_uniformly_cuda[grid_size, block_size,
                         stream](d_positions, d_triangles, d_max,
                                 rng_states)
     positions = d_positions.copy_to_host(stream=stream)
     stream.synchronize()
     return positions
Example 6
def detect_and_swap_gpu(matrix,
                        indices,
                        seed,
                        threads_per_block=128,
                        blocks=128):
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks,
                                             seed=seed)
    vec = np.zeros([threads_per_block * blocks, 3]).astype(int)
    d_matrix = cuda.to_device(matrix)
    d_vec = cuda.to_device(vec)
    _detect_gpu[blocks, threads_per_block](d_matrix, d_vec, rng_states)

    vec = d_vec.copy_to_host()
    vec = vec[~np.all(vec == 0, axis=1)]  # select non-zero rows
    vec = vec[np.argsort(vec[:, 2])[::-1]]  # TODO: greedy?
    vec_detected = vec.shape[0]
    # remove conflicted rows
    visited = {}
    selected = []
    for i in range(vec.shape[0]):
        if vec[i, 0] not in visited and vec[i, 1] not in visited:
            selected.append(i)
            visited[vec[i, 0]] = 1
            visited[vec[i, 1]] = 1
    vec = vec[selected, :]
    for i in range(vec.shape[0]):
        swap_inplace(matrix, indices, vec[i, 0], vec[i, 1])
    vec_swapped = vec.shape[0]
    return matrix, indices, vec_detected, vec_swapped
Example 7
def simulate(doms, rs, x_start, y_start, blocks, threads_per_block, seed=None, verbose=True):

    N = blocks * threads_per_block


    x_y = []
    t1 = time.time()

    # Set a random seed
    if seed is None:
        ran_seed = np.random.randint(1, 123456)
    else:
        ran_seed = seed

    x_start, y_start = np.float32(x_start), np.float32(y_start)

    # Initialize the random states for the kernel
    rng_states = create_xoroshiro128p_states(N, seed=ran_seed)
    # Create empty arrays for the (x, y) values
    out_x, out_y = np.zeros(N, dtype=np.float32), np.zeros(N, dtype=np.float32)
    # Create empty array for domhits
    domhits = np.zeros((N, len(rs)), dtype=np.int32)
    domhit_times = np.zeros((N, len(rs)), dtype=np.int32)
    # Calculate x, y and domhits
    move[blocks, threads_per_block](rng_states, x_start, y_start, out_x, out_y, doms, rs, domhits, domhit_times)
    # Save the hit information
    domhits = np.sum(domhits, axis=0)
    
    t2 = time.time()
    if verbose:
        print(t2 - t1)
    x_y = np.array(x_y)
    domhits = np.array(domhits)

    return domhits, domhit_times, t2-t1
Example 8
def test__cuda_random_step():
    @cuda.jit()
    def test_kernel(steps, rng_states):
        thread_id = cuda.grid(1)
        if thread_id >= steps.shape[0]:
            return
        simulations._cuda_random_step(steps[thread_id, :], rng_states,
                                      thread_id)
        return

    N = int(1e5)
    seeds = [1, 1, 12]
    steps = np.zeros((len(seeds), N, 3))
    block_size = 128
    grid_size = int(math.ceil(N / block_size))
    for i, seed in enumerate(seeds):
        stream = cuda.stream()
        rng_states = create_xoroshiro128p_states(grid_size * block_size,
                                                 seed=seed,
                                                 stream=stream)
        test_kernel[grid_size, block_size, stream](steps[i, :, :], rng_states)
        stream.synchronize()
    npt.assert_equal(steps[0], steps[1])
    npt.assert_equal(np.all(steps[0] != steps[2]), True)
    npt.assert_almost_equal(np.mean(np.sum(steps[1::], axis=1) / N), 0, 3)
    _, p = normaltest(steps[1::].ravel())
    npt.assert_almost_equal(p, 0)
    npt.assert_almost_equal(np.linalg.norm(steps, axis=2),
                            np.ones((len(seeds), N)))
    return
Example 9
def driver(pricer, do_plot=False):
    paths = np.zeros((numPath, NumStep + 1), order='F')
    paths[:, 0] = StockPrice
    DT = Maturity / NumStep

    ts = timer()
    threads_per_block = 64
    blocks = 24
    rng_states = random.create_xoroshiro128p_states(threads_per_block * blocks,
                                                    seed=1)
    pricer(rng_states, paths, DT, InterestRate, Volatility)
    te = timer()
    elapsed = te - ts

    ST = paths[:, -1]
    PaidOff = np.maximum(paths[:, -1] - StrikePrice, 0)
    print("Result")
    print(f"Stock price: {np.mean(ST)}")
    print(f"Standard error: {np.std(ST)/np.sqrt(numPath)}")
    print(f"Paid off: {np.mean(PaidOff)}")
    optionPrice = np.mean(PaidOff) * np.exp(-InterestRate * Maturity)
    print(f"Option price: {optionPrice}")

    print("Performance")
    NumCompute = numPath * NumStep
    print(f"Mstep/second = {NumCompute/elapsed/1e6:.2f}")
    print(f"Time elapsed = {elapsed:.3f}")

    if do_plot:
        pathct = min(numPath, MAX_PATH_IN_PLOT)
        for i in range(pathct):
            pyplot.plot(paths[i])
        print(f"Plotting {pathct}/{numPath} paths")
        pyplot.show()
Example 10
def detect_and_swap_gpu(matrix, seed):
    threads_per_block = 128
    blocks = 128
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks,
                                             seed=seed)

    vec = np.zeros([threads_per_block * blocks, 2]).astype(int)
    d_matrix = cuda.to_device(matrix)
    d_vec = cuda.to_device(vec)

    _detect_gpu[blocks, threads_per_block](d_matrix, d_vec, rng_states)
    vec = d_vec.copy_to_host()
    vec = vec[~np.all(vec == 0, axis=1)]  # select non-zero rows
    print(vec.shape)
    # remove conflicted rows
    visited = {}
    selected = []
    for i in range(vec.shape[0]):
        if vec[i, 0] not in visited and vec[i, 1] not in visited:
            selected.append(i)
            visited[vec[i, 0]] = 1
            visited[vec[i, 1]] = 1
    vec = vec[selected, :]
    print(vec.shape)
    if vec.shape[0] > 0:
        blocks = (vec.shape[0] + threads_per_block - 1) // threads_per_block
        d_vec = cuda.to_device(vec)
        _swap_gpu[blocks, threads_per_block](d_matrix, d_vec)
        matrix = d_matrix.copy_to_host()
    return matrix
Example 11
def GPUWrapper(data_out, device_id, photons_req_per_device,
               max_photons_per_device, muA, muS, g, source_type, source_param1,
               source_param2, detector_params, max_N, max_distance_from_det,
               target_type, target_mask, target_gridsize, z_target, z_bounded,
               z_range, ret_cols, absorb_threshold, absorb_chance):

    # TODO: These numbers can be optimized based on the device / architecture / number of photons
    threads_per_block = 256
    blocks = 64
    photons_per_thread = int(
        np.ceil(float(photons_req_per_device) / (threads_per_block * blocks)))
    max_photons_per_thread = int(
        np.ceil(float(max_photons_per_device) / (threads_per_block * blocks)))

    cuda.select_device(device_id)
    stream = cuda.stream()  # use stream to trigger async memory transfer

    # Keeping this piece of code here for now - potentially we need it in the future
    #  with compiler_lock:                        # lock the compiler
    # prepare function for this thread
    # the jitted CUDA kernel is loaded into the current context
    # TODO: ideally we should call cuda.jit(signature)(propPhotonGPU), where
    # signature is the call signature of the function. So far I couldn't figure out the
    # signature of rng_states; the closest I got was: array(Record([('s0', '<u8'), ('s1', '<u8')]), 1d, A)
    # But I couldn't get it to work yet.
    #     MC_cuda_kernel = cuda.jit(propPhotonGPU)

    data = np.ndarray(shape=(threads_per_block * blocks, photons_per_thread,
                             12),
                      dtype=np.float32)
    photon_counters = np.ndarray(shape=(threads_per_block * blocks, 5),
                                 dtype=np.int64)
    data_out_device = cuda.device_array_like(data, stream=stream)
    photon_counters_device = cuda.device_array_like(photon_counters,
                                                    stream=stream)

    # Used to initialize the threads random states.
    rng_states = create_xoroshiro128p_states(
        threads_per_block * blocks,
        seed=(np.random.randint(sys.maxsize) - 128) + device_id,
        stream=stream)

    # Actual kernel call
    propPhotonGPU[blocks, threads_per_block](
        rng_states, data_out_device, photon_counters_device,
        photons_per_thread, max_photons_per_thread, muA, muS, g, source_type,
        source_param1, source_param2, detector_params, max_N,
        max_distance_from_det, target_type, target_mask, target_gridsize,
        z_target, z_bounded, z_range, absorb_threshold, absorb_chance)
    # Copy data back
    data_out_device.copy_to_host(data, stream=stream)
    photon_counters_device.copy_to_host(photon_counters, stream=stream)
    stream.synchronize()

    data = data.reshape(data.shape[0] * data.shape[1], data.shape[2])
    data = data[:, ret_cols]
    data_out[device_id][0] = data

    photon_counters_aggr = np.squeeze(np.sum(photon_counters, axis=0))
    data_out[device_id][1] = photon_counters_aggr
Example 12
def test_mutation():
    a = np.array([[0., 0., 0., 0., 0.], [0., 0., 0., 0., 0.]])
    A_global_mem = cuda.to_device(a)
    dt = datetime.now()
    rng_states = create_xoroshiro128p_states(10, seed=dt.microsecond)
    mutation[2, 5](A_global_mem, 2, 3, rng_states, 0.3)
    A = A_global_mem.copy_to_host()
    print(A)
Example 13
def _detect_possible_swaps(parent, gpu_random_seed, threads_per_block=128, blocks=128):
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks, seed=gpu_random_seed)
    # detected_pairs stores the detection results
    detected_pairs = np.zeros([threads_per_block * blocks, 3]).astype(int)
    d_parent = cuda.to_device(parent)
    d_detected_pairs = cuda.to_device(detected_pairs)
    _detect_gpu[blocks, threads_per_block](d_parent, d_detected_pairs, rng_states)
    detected_pairs = d_detected_pairs.copy_to_host()
    detected_pairs = detected_pairs[~np.all(detected_pairs == 0, axis=1)]  # select non-zero rows
    return detected_pairs
Example 14
def simulate(Nexp,
             N,
             initial_position,
             doms,
             rs,
             threads_per_block,
             pa=0.01,
             ps=0.02,
             seed=None,
             verbose=True):
    if verbose:
        print(Nexp)
    import time
    blocks = N // threads_per_block + 1
    # N = threads_per_block * blocks
    # print initial_position
    x_start, y_start = np.array(initial_position, dtype=np.float32)

    domhits_all = []
    domhitstimes_all = []
    x_y = []
    t1 = time.time()
    for i in range(Nexp):
        # for i in tqdm(range(Nexp)):
        if seed is None:
            # Set a random seed
            RanSeed = np.random.randint(1, 123456)
        else:
            RanSeed = seed

        # Initialize the random states for the kernel
        rng_states = create_xoroshiro128p_states(N, seed=RanSeed)
        # Create empty arrays for the (x, y) values
        out_x, out_y = np.zeros(N,
                                dtype=np.float32), np.zeros(N,
                                                            dtype=np.float32)
        # Create empty array for domhits
        domhits = np.zeros((N, len(rs)), dtype=np.int32)
        domhitstimes = np.zeros((N, len(rs)), dtype=np.int32)

        # Calculate x, y and domhits
        move[blocks,
             threads_per_block](rng_states, x_start, y_start, out_x, out_y,
                                doms, rs, domhits, domhitstimes, pa, ps, N)
        # Save the hit information
        domhits = np.sum(domhits, axis=0)
        domhits_all.append(domhits)
        domhitstimes_all.append(domhitstimes)
        x_y.append([x_start, y_start])

    t2 = time.time()
    if verbose:
        print(t2 - t1)
    x_y = np.array(x_y)
    domhits_all = np.array(domhits_all)
    return domhits_all, domhitstimes_all
Example 15
def test_recombination():
    dt = datetime.now()
    tot_ia = 5
    rng_states = create_xoroshiro128p_states(1 * tot_ia, seed=dt.microsecond)
    inp_weights_local = np.array([[1., 2., 3., 4., 5., 6., 7., 8., 9.],
                                  [9., 8., 7., 6., 5., 4., 3., 2., 1.]])
    inp_weights = cuda.to_device(inp_weights_local)
    out_weights = cuda.device_array((tot_ia, 9))
    recombination[1, tot_ia](inp_weights, out_weights, 2, tot_ia, 9,
                             rng_states)
    C = out_weights.copy_to_host()
    print(C)
Example 16
def cudaTask():
    T = 1000  # threads per block
    B = 10  # blocks per grid
    AS = 1000  # loop iterations in one thread
    res = np.zeros(T * B * AS, dtype='float64')  # create the output array initialized to 0 (see the NumPy manual)

    rng_states = create_xoroshiro128p_states(
        T * B,
        seed=1)  # see http://numba.pydata.org/numba-doc/0.35.0/cuda/random.html
    cudaNormalVariateKernel[B, T](rng_states, res)

    return res
Example 17
def detect_and_swap_gpu(matrix,
                        indices,
                        seed,
                        threads_per_block=128,
                        blocks=128,
                        mode='random'):
    # threads_per_block, blocks = 128, 128
    # wandb.log({"threads_per_block": threads_per_block, "blocks": blocks})
    if mode == 'random':
        rng_states = create_xoroshiro128p_states(threads_per_block * blocks,
                                                 seed=seed)
        vec = np.zeros([threads_per_block * blocks, 3]).astype(int)
    elif mode == 'all':
        vec = np.zeros([4096, 3]).astype(int)
        index = np.zeros([1]).astype(int)
        d_index = cuda.to_device(index)
        threadsperblock = (32, 32)
        blockspergrid_x = (matrix.shape[0] + threadsperblock[0] -
                           1) // threadsperblock[0]
        blockspergrid_y = (matrix.shape[1] + threadsperblock[1] -
                           1) // threadsperblock[1]
        blockspergrid = (blockspergrid_x, blockspergrid_y)

    d_matrix = cuda.to_device(matrix)
    d_vec = cuda.to_device(vec)

    if mode == 'random':
        _detect_gpu[blocks, threads_per_block](d_matrix, d_vec, rng_states)
    elif mode == 'all':
        _detect_all_gpu[blockspergrid, threadsperblock](d_matrix, d_index,
                                                        d_vec)

    vec = d_vec.copy_to_host()
    vec = vec[~np.all(vec == 0, axis=1)]  # select non-zero rows
    vec = vec[np.argsort(vec[:, 2])[::-1]]  # TODO: greedy?
    # print(vec[:5])
    vec_detected = vec.shape[0]
    # print(vec.shape)
    # remove conflicted rows
    visited = {}
    selected = []
    for i in range(vec.shape[0]):
        if vec[i, 0] not in visited and vec[i, 1] not in visited:
            selected.append(i)
            visited[vec[i, 0]] = 1
            visited[vec[i, 1]] = 1
    vec = vec[selected, :]
    # print(vec.shape)
    for i in range(vec.shape[0]):
        swap_inplace(matrix, indices, vec[i, 0], vec[i, 1])
    vec_swapped = vec.shape[0]
    # wandb.log({"detected": vec_detected, "swapped": vec_swapped})
    return matrix, indices, vec_detected, vec_swapped
Example 18
def fill_masks(masks, aligned):
    S = masks.shape[0]
    nframes, nimages, ncolor, H, W = aligned.shape
    threads_per_block = (32, 32)
    blocks_H = H // threads_per_block[0] + (H % threads_per_block[0] != 0)
    blocks_W = W // threads_per_block[1] + (W % threads_per_block[1] != 0)
    blocks = (blocks_H, blocks_W)
    nthreads = int(
        np.prod([blocks[i] * threads_per_block[i] for i in range(2)]))
    seed = int(torch.rand(1) * 100)
    rng_states = create_xoroshiro128p_states(nthreads, seed=seed)
    fill_masks_numba[blocks, threads_per_block](rng_states, nimages, ncolor, H,
                                                W, S, nframes, aligned, masks)
Example 19
def fill_weights_foo(weights, counts, nsubsets, nframes, gpuid):
    assert nframes <= 51, "Number of frames is maxed at 51."
    numba.cuda.select_device(gpuid)
    device = weights.device
    weights = numba.cuda.as_cuda_array(weights)
    counts = numba.cuda.as_cuda_array(counts)
    threads_per_block = 1024
    blocks = nsubsets // threads_per_block + (nsubsets % threads_per_block !=
                                              0)
    seed = int(torch.rand(1) * 100)
    rng_states = create_xoroshiro128p_states(blocks * threads_per_block,
                                             seed=seed)
    create_weights_cuda[blocks, threads_per_block](rng_states, nsubsets,
                                                   nframes, weights, counts)
    return weights
Example 20
 def call_process_no_buffer(
         self, process_kernel_no_buffer, N,
         ntotthreads=int(1e6), threads_per_block=512,
 ):
     ntotthreads = min(N, int(ntotthreads))
     nblocks = math.ceil(ntotthreads / threads_per_block)
     actual_nthreads = threads_per_block * nblocks
     n_neutrons_per_thread = math.ceil(N / actual_nthreads)
     print("%s blocks, %s threads, %s neutrons per thread" % (
         nblocks, threads_per_block, n_neutrons_per_thread))
     rng_states = create_xoroshiro128p_states(actual_nthreads, seed=rng_seed)
     process_kernel_no_buffer[nblocks, threads_per_block](
         rng_states, N, n_neutrons_per_thread, self.propagate_params)
     cuda.synchronize()
     return
Example 21
def call_process_no_buffer(N, src, guide, mon, ntotthreads=int(1e5)):
    ntotthreads = min(N, ntotthreads)
    threads_per_block = 512
    nblocks = math.ceil(ntotthreads / threads_per_block)
    actual_nthreads = threads_per_block * nblocks
    n_neutrons_per_thread = math.ceil(N / actual_nthreads)
    print("{} blocks, {} threads, {} neutrons per thread".format(
        nblocks, threads_per_block, n_neutrons_per_thread))
    rng_states = create_xoroshiro128p_states(actual_nthreads, seed=1)
    counter = np.zeros(1, dtype=int)
    process_kernel_no_buffer[nblocks,
                             threads_per_block](counter, N,
                                                n_neutrons_per_thread, src,
                                                guide, mon, rng_states)
    cuda.synchronize()
    print(f"processed {counter.sum():g} neutrons")
Example 22
def index_bursts_by_frames(sims, aligned):
    S = sims.shape[0]
    nframes, nimages, ncolor, H, W = aligned.shape
    threads_per_block = (32, 32)
    blocks_H = H // threads_per_block[0] + (H % threads_per_block[0] != 0)
    blocks_W = W // threads_per_block[1] + (W % threads_per_block[1] != 0)
    blocks = (blocks_H, blocks_W)
    nthreads = int(
        np.prod([blocks[i] * threads_per_block[i] for i in range(2)]))
    seed = int(torch.rand(1) * 100)
    rng_states = create_xoroshiro128p_states(nthreads, seed=seed)
    uniform_pix_sample_by_frames_numba[blocks,
                                       threads_per_block](rng_states, nimages,
                                                          ncolor, H, W, S,
                                                          nframes, aligned,
                                                          sims)
Example 23
    def test_ex_3d_grid(self):
        # magictoken.ex_3d_grid.begin
        from numba import cuda
        from numba.cuda.random import (create_xoroshiro128p_states,
                                       xoroshiro128p_uniform_float32)
        import numpy as np

        @cuda.jit
        def random_3d(arr, rng_states):
            # Per-dimension thread indices and strides
            startx, starty, startz = cuda.grid(3)
            stridex, stridey, stridez = cuda.gridsize(3)

            # Linearized thread index
            tid = (startz * stridey * stridex) + (starty * stridex) + startx

            # Use strided loops over the array to assign a random value to each entry
            for i in range(startz, arr.shape[0], stridez):
                for j in range(starty, arr.shape[1], stridey):
                    for k in range(startx, arr.shape[2], stridex):
                        arr[i, j, k] = xoroshiro128p_uniform_float32(
                            rng_states, tid)

        # Array dimensions
        X, Y, Z = 701, 900, 719

        # Block and grid dimensions
        bx, by, bz = 8, 8, 8
        gx, gy, gz = 16, 16, 16

        # Total number of threads
        nthreads = bx * by * bz * gx * gy * gz

        # Initialize a state for each thread
        rng_states = create_xoroshiro128p_states(nthreads, seed=1)

        # Generate random numbers
        arr = cuda.device_array((X, Y, Z), dtype=np.float32)
        random_3d[(gx, gy, gz), (bx, by, bz)](arr, rng_states)
        # magictoken.ex_3d_grid.end

        # Some basic tests of the randomly-generated numbers
        host_arr = arr.copy_to_host()
        self.assertGreater(np.mean(host_arr), 0.49)
        self.assertLess(np.mean(host_arr), 0.51)
        self.assertTrue(np.all(host_arr <= 1.0))
        self.assertTrue(np.all(host_arr >= 0.0))
Example 24
def pso(nodes_h, edges_h, nrParticles, nrIterations, threads_per_block, blocks_per_grid): 
    nodes = cuda.to_device(nodes_h)
    edges = cuda.to_device(edges_h)
    cuda.synchronize()
    # print(threads_per_block)
    # print(blocks_per_grid)
    # threads_per_block = 128
    # blocks_per_grid = 30
  

    currentPaths = cuda.device_array(shape=(nrParticles, len(nodes)), dtype=int)
    neighbourNodes = cuda.device_array(shape=(nrParticles, len(nodes)), dtype=int)
    rng_states = create_xoroshiro128p_states(threads_per_block * blocks_per_grid, seed=1)
    cuda.synchronize()

    getRandomPaths[blocks_per_grid, threads_per_block](edges, nodes, nrParticles,currentPaths,neighbourNodes,rng_states)
    cuda.synchronize()
    
    bestPaths = cuda.device_array(shape=currentPaths.shape, dtype=int)
    bestPaths[:] = currentPaths # copy on gpu
    cuda.synchronize()

    particles = cuda.device_array(shape=(nrParticles,2), dtype=float)
    cuda.synchronize()

    initParticles[blocks_per_grid, threads_per_block](bestPaths,particles, nodes)
    cuda.synchronize()

    globalBestPath = cuda.device_array(shape=(bestPaths.shape[1]), dtype=int)
    globalBestCost = particles[0, BEST]
    shortestGlobalIdx = getGlobalBestPath(particles, globalBestCost)
    if shortestGlobalIdx != -1:
      globalBestPath[:] = bestPaths[shortestGlobalIdx, :]
      globalBestCost = particles[shortestGlobalIdx, BEST]

    for i in range(nrIterations):
      getNewPaths[blocks_per_grid, threads_per_block](currentPaths, bestPaths, globalBestPath, neighbourNodes, nodes )
      cuda.synchronize()

      updateParticles[blocks_per_grid, threads_per_block](currentPaths,bestPaths,particles, nodes)
      cuda.synchronize()
      
      shortestGlobalIdx = getGlobalBestPath(particles, globalBestCost)
      if shortestGlobalIdx != -1:
        globalBestPath[:] = bestPaths[shortestGlobalIdx, :]
        globalBestCost = particles[shortestGlobalIdx, BEST]
      cuda.synchronize()
    return globalBestCost
Example 25
def get_rrs_gpu(graph, p, mc):

    sources = np.random.choice(graph.shape[0], size=mc)
    probs = (p**graph).astype(float32)
    success = np.full((mc, graph.shape[0]), False, dtype=bool)
    new_nodes = np.full((mc, graph.shape[0]), False, dtype=bool)
    rrs = np.full((mc, graph.shape[0]), False, dtype=bool)

    threads_per_block = 128
    blocks = math.ceil(mc / threads_per_block)
    rng_states = create_xoroshiro128p_states(mc, seed=mc)

    get_node_flow_gpu[blocks,
                      threads_per_block](sources, probs, success, new_nodes,
                                         rrs, mc, rng_states)

    return rrs
Example 26
        def mc_integrate(lower_lim, upper_lim, nsamps):
            """
            approximate the definite integral of `func` from
            `lower_lim` to `upper_lim`
            """
            out = cuda.to_device(np.zeros(nsamps, dtype="float32"))
            rng_states = create_xoroshiro128p_states(nsamps, seed=42)

            # jit the function for use in CUDA kernels

            mc_integrator_kernel.forall(nsamps)(out, rng_states, lower_lim,
                                                upper_lim)
            # normalization factor to convert
            # to the average: (b - a)/(N - 1)
            factor = (upper_lim - lower_lim) / (nsamps - 1)

            return sum_reduce(out) * factor
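A quick sanity check of the integrator above might look like the sketch below; the integrand is hypothetical and assumed to have been compiled into mc_integrator_kernel by the enclosing code, while mc_integrate and its arguments come from the snippet itself.

# Hypothetical check: if the compiled integrand were f(x) = x**2, the estimate
# over [0, 1] should approach the exact value 1/3.
approx = mc_integrate(0.0, 1.0, 1_000_000)
print(approx)  # expected to be roughly 0.333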
Example 27
File: som.py Project: xKuZz/tfg
def spark_gpu_batch_som(rdd_data,
                        d,
                        max_iters,
                        rows,
                        cols,
                        smooth_iters=None,
                        sigma_0=10,
                        sigma_f=0.1,
                        tau=400,
                        seed=None,
                        tpb=1024):

    # 1. Initialize random weights
    d_weights = cuda.device_array((rows, cols, d), np.float32)
    rng_states = create_xoroshiro128p_states(rows * cols * d, seed=seed)
    rand_weights[(d_weights.size) // tpb + 1, tpb](rng_states, d_weights)

    weights = d_weights.copy_to_host()
    # 2. Main algorithm loop
    for t in range(max_iters):
        # 2.a Update the control parameters if applicable
        if smooth_iters is None or t < max_iters:
            sigma = sigma_0 * math.exp((-t / tau))
        else:
            sigma = sigma_f

        sigma_squared = sigma * sigma

        # 2.b Each node of the Spark cluster works on a subset of the RDD
        # samples to find the BMU and compute the partial sum of its
        # weight-update equation
        out = rdd_data.mapPartitions(gpu_work_iter(weights, sigma_squared))

        # 2.c On a single node, use the GPU to combine all the partial sums
        #   and perform the division
        out = out.collect()
        numParts = len(out) // 2

        partials = np.concatenate(out)
        finish_update[rows * cols // tpb + 1, tpb](weights, partials, numParts)

    return weights
Example 28
    def __init__(self, geometry, freqs, monitors=None, name=None):
        dt = profile.get_dt() / 1000.

        # firing rate
        if isinstance(freqs, np.ndarray):
            freqs = freqs.flatten()
        if not np.all(freqs <= 1000. / profile.get_dt()):
            print(f'WARNING: The maximum supported frequency at dt={profile.get_dt()} ms '
                  f'is {1000. / profile.get_dt()} Hz, but the given "freqs" setting '
                  f'exceeds that.')

        # neuron model on CPU
        # -------------------
        if profile.run_on_cpu():
            def update(ST):
                ST['spike'] = np.random.random(ST['spike'].shape) < freqs * dt

            model = NeuType(name='poisson_input', ST=NeuState('spike'), steps=update, mode='vector')

        # neuron model on GPU
        # -------------------
        else:
            def update(ST, rng_states, _obj_i):
                ST['spike'] = random.xoroshiro128p_uniform_float64(rng_states, _obj_i) < freqs * dt

            model = NeuType(name='poisson_input', ST=NeuState('spike'), steps=update, mode='scalar')

        # initialize neuron group
        # -----------------------
        super(PoissonInput, self).__init__(model=model, geometry=geometry, monitors=monitors, name=name)

        # will automatically handle
        # the heterogeneous problem
        # -------------------------
        self.pars['freqs'] = freqs

        # rng states
        # ----------
        if profile.run_on_gpu():
            num_block, num_thread = tools.get_cuda_size(self.num)
            self.rng_states = random.create_xoroshiro128p_states(
                num_block * num_thread, seed=np.random.randint(100000))
Example 29
def E_Q_scattering_kernel_call(Qmin, Qmax, neutron_velocity, neutron_probability, E_Q, S_Q, scattering_coefficient,
                               absorption_cross_section, threadsperblock=64):
    neutron_velocity_gpu = cuda.to_device(neutron_velocity)
    neutron_probability_gpu = cuda.to_device(neutron_probability)
    scattered_neutron_velocity = cuda.device_array((len(neutron_velocity), 3))  # cuda.device_array_like(d_a)
    scattered_neutron_probability = cuda.device_array(len(neutron_probability))

    write_user_input(E_Q, S_Q, scattering_coefficient, absorption_cross_section)
    rng_states = create_xoroshiro128p_states(15000, seed=1)

    blockspergrid = (neutron_velocity_gpu.size + (threadsperblock - 1)) // threadsperblock
    E_Q_kernel[blockspergrid, threadsperblock](rng_states, Qmin, Qmax, neutron_velocity_gpu,
                                               neutron_probability_gpu,
                                               scattered_neutron_probability,
                                               scattered_neutron_velocity)


    scattered_neutron_velocity_cpu = scattered_neutron_velocity.copy_to_host()
    scattered_neutron_probability_cpu = scattered_neutron_probability.copy_to_host()

    return scattered_neutron_probability_cpu, scattered_neutron_velocity_cpu
Example 30
def get_rrs_gpu(graph, node_num, p, mc):  # Do we need node_num?

    # Randomly choose mc number of nodes with replacement from network
    sources = np.random.choice(node_num, size=mc)

    # Initiate a mc x n matrix to represent R
    rrs = np.full((mc, node_num), False, dtype=bool)

    # Creates mc random number generator states
    rng_states = create_xoroshiro128p_states(mc, seed=np.random.randint(mc))

    # Number of blocks (each with 128 threads) needed to perform mc operations
    threads_per_block = 128
    blocks = math.ceil(mc / threads_per_block)

    # Update the mc rows of the rrs array using GPU
    get_node_flow_gpu[blocks, threads_per_block](graph, sources, p, rrs, mc,
                                                 rng_states)

    return rrs
Example 31
from __future__ import print_function, absolute_import

from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
import numpy as np

@cuda.jit
def compute_pi(rng_states, iterations, out):
    """Estimate pi on each thread by sampling random points and store the result in out[thread_id]."""
    thread_id = cuda.grid(1)

    # Compute pi by drawing random (x, y) points and finding what
    # fraction lie inside a unit circle
    inside = 0
    for i in range(iterations):
        x = xoroshiro128p_uniform_float32(rng_states, thread_id)
        y = xoroshiro128p_uniform_float32(rng_states, thread_id)
        if x**2 + y**2 <= 1.0:
            inside += 1

    out[thread_id] = 4.0 * inside / iterations

threads_per_block = 64
blocks = 24
rng_states = create_xoroshiro128p_states(threads_per_block * blocks, seed=1)
out = np.zeros(threads_per_block * blocks, dtype=np.float32)

compute_pi[blocks, threads_per_block](rng_states, 10000, out)
print('pi:', out.mean())