Example 1
 def __init__(self, r_cut, r_buff=0.5, cell_guess=50, n_guess=150):
     system = Ctx.get_active()
     if system is None:
         raise ValueError("No active system!")
     self.system = system
     self.cell_guess = cell_guess
     self.n_guess = n_guess
     self.r_cut2 = r_cut ** 2
     self.r_buff2 = (r_buff / 2) ** 2
     self.gpu = system.gpu
     self.tpb = 64
     self.bpg = int(self.system.N // self.tpb + 1)
     # self.situ_zero = np.zeros(1, dtype=np.int32)
     self.update_counts = 0
     self.dist_funcs = {}
     self.cu_nlist, self.cu_check_build = self._gen_func()
     with cuda.gpus[self.gpu]:
         self.p_n_max = cuda.pinned_array((1,), dtype=np.int32)
         self.p_situation = cuda.pinned_array((1,), dtype=np.int32)
         self.d_last_x = cuda.device_array_like(self.system.d_x)
         self.d_n_max = cuda.device_array(1, dtype=np.int32)
         self.d_nl = cuda.device_array((self.system.N, self.n_guess), dtype=np.int32)
         self.d_nc = cuda.device_array((self.system.N,), dtype=np.int32)
         self.d_situation = cuda.device_array(1, dtype=np.int32)
     self.clist = clist(r_cut, r_buff, cell_guess=self.cell_guess)
     self.neighbour_list()
     self.system.nlist = self  # register to system
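The one-element pinned arrays p_n_max and p_situation above are cheap read-back targets for the matching device scalars. A minimal sketch of the resize check such a neighbour list typically performs (not taken from the original file; the 1.5x growth factor is an assumption):

    self.d_n_max.copy_to_host(self.p_n_max)   # fast D2H copy thanks to pinned memory
    if self.p_n_max[0] > self.n_guess:
        # the guessed neighbour-list width was too small; grow it and reallocate
        self.n_guess = int(self.p_n_max[0] * 1.5)
        self.d_nl = cuda.device_array((self.system.N, self.n_guess), dtype=np.int32)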
Example 2
 def __init__(self, r_cut, r_buff=0.5, cell_guess=50):
     system = Ctx.get_active()
     if system is None:
         raise ValueError("Error, Initialize system first!")
     self.system = system
     self.ibox = np.asarray(np.floor(system.box / (r_cut + r_buff)),
                            dtype=np.int32)
     self.n_cell = int(np.multiply.reduce(self.ibox))
     self.cell_adj = np.ones(self.system.n_dim, dtype=np.int32) * 3
     self.gpu = system.gpu
     self.tpb = 64
     self.bpg = int(self.system.N // self.tpb + 1)
     self.bpg_cell = int(self.n_cell // self.tpb + 1)
     self.cell_guess = cell_guess
     # self.situ_zero = np.zeros(1, dtype=np.int32)
     self.cu_cell_map, self.cu_cell_list = self._gen_func()
     self.p_cell_max = cuda.pinned_array((1, ), dtype=np.int32)
     self.p_out_of_box = cuda.pinned_array((1, ), dtype=self.system.dtype)
     with cuda.gpus[self.gpu]:
         self.d_cells = cuda.device_array(self.system.d_x.shape[0],
                                          dtype=np.int32)
         self.d_cell_map = cuda.device_array((self.n_cell, 3**system.n_dim),
                                             dtype=np.int32)
         self.d_ibox = cuda.to_device(self.ibox)
         self.d_cell_adj = cuda.to_device(self.cell_adj)
         self.cu_cell_map[self.bpg_cell,
                          self.tpb](self.d_ibox, self.d_cell_adj,
                                    self.d_cell_map)
         self.d_cell_list = cuda.device_array(
             (self.n_cell, self.cell_guess), dtype=np.int32)
         self.d_cell_counts = cuda.device_array(self.n_cell, dtype=np.int32)
         self.d_cell_max = cuda.device_array(1, dtype=np.int32)
         self.d_out_of_box = cuda.device_array(1, dtype=self.system.dtype)
     self.update()
Example 3
def test_gpu_instrumentation():
    sdfg: dace.SDFG = axpy2GPU.to_sdfg(strict=True)
    sdfg.name = 'gpu_instrumentation'
    map1 = find_map_by_param(sdfg, 'i')
    map2 = find_map_by_param(sdfg, 'j')
    GPUTransformMap.apply_to(sdfg, _map_entry=map1, options={'gpu_id': 0})
    GPUTransformMap.apply_to(sdfg, _map_entry=map2, options={'gpu_id': 1})

    # Set instrumentation both on the state and the map
    sdfg.start_state.instrument = dace.InstrumentationType.GPU_Events
    map1.instrument = dace.InstrumentationType.GPU_Events
    map2.instrument = dace.InstrumentationType.GPU_Events

    size = 256
    np.random.seed(0)
    A = cuda.pinned_array(shape=1, dtype=np_dtype)
    X = cuda.pinned_array(shape=size, dtype=np_dtype)
    Y = cuda.pinned_array(shape=size, dtype=np_dtype)
    A.fill(np.random.rand())
    X[:] = np.random.rand(size)[:]
    Y[:] = np.random.rand(size)[:]
    Z = np.copy(Y)

    sdfg(A=A[0], X=X, Y=Y, N=size)
    assert np.allclose(Y, A * X + Z)

    # Print instrumentation report
    if sdfg.is_instrumented():
        report = sdfg.get_latest_report()
        print(report)
Example 4
 async def async_cuda_fn(value_in: float) -> float:
     stream = cuda.stream()
     h_src, h_dst = cuda.pinned_array(8), cuda.pinned_array(8)
     h_src[:] = value_in
     d_ary = cuda.to_device(h_src, stream=stream)
     d_ary.copy_to_host(h_dst, stream=stream)
     await stream.async_done()
     return h_dst.mean()
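A short usage sketch for the coroutine above (not part of the original snippet): because each call creates its own stream and stream.async_done() returns an awaitable, several transfers can be driven concurrently from one event loop.

import asyncio

async def main():
    # launch four independent round-trips; their copies may overlap on separate streams
    results = await asyncio.gather(*(async_cuda_fn(float(v)) for v in range(4)))
    print(results)

# asyncio.run(main())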
Example 5
    def __init__(self, info, group):
        self.info = info
        self.group = group
        # self.ps = particle_set(info, group)

        self.ps = info.find_particle_set(group)
        if self.ps is None:
            self.ps = particle_set(info, group)
            info.particle_set.append(self.ps)

        self.last_ts = 0xffffffff
        self.temp = 0.0
        self.pressure = 0.0
        self.potential = 0.0
        self.momentum = 0.0
        self.block_size = 256
        self.nblocks = math.ceil(self.ps.nme / self.block_size)

        self.coll = np.zeros(self.nblocks * 6, dtype=np.float32)
        self.d_coll = cuda.to_device(self.coll)
        # self.result = np.zeros(3, dtype = nb.float32)
        self.result = cuda.pinned_array(16, dtype=np.float32)
        self.result[0] = 0.0
        self.result[1] = 0.0
        self.result[2] = 0.0
        self.result[3] = 0.0
        self.result[4] = 0.0
        self.result[5] = 0.0
        self.d_result = cuda.to_device(self.result)
Example 6
    def __init__(self, x_shape, n_samples, logger: logging.Logger = None):
        if logger is None:
            logger = logging.Logger('AutoMemoryBuilder')
            logger.addHandler(logging.NullHandler())

        self._logger = logger.getChild('AutoMemoryBuilder')
        self._logger.debug('Constructor')

        if len(x_shape) == 1:
            self._x_size = x_shape[0]
        else:
            assert len(x_shape) == 2
            self._x_size = x_shape[0] * x_shape[1]

        assert self._x_size >= n_samples

        self._n_samples = n_samples
        self._xmem_shape = (n_samples, self._x_size)

        self._i = 0

        self._logger.info('Allocating pinned array: {0} elements'.format(
            n_samples * self._x_size))

        self._pinned_mem = cuda.pinned_array(self._xmem_shape)
        self._stream = cuda.stream()
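The constructor above only sets up the pinned staging buffer and a stream; a minimal sketch of how the class might use them (the method names below are hypothetical, not from the original code):

    def append(self, x):
        # stage one flattened sample into the pre-allocated pinned buffer
        self._pinned_mem[self._i, :] = np.asarray(x).ravel()
        self._i += 1

    def to_device(self):
        # host-to-device copies from pinned memory can be issued asynchronously
        d_mem = cuda.to_device(self._pinned_mem[:self._i], stream=self._stream)
        self._stream.synchronize()
        return d_mem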
Example 7
 def predict(self, features_gen=None, as_cuda_array=False, flatten=True):
     if features_gen is None:
         features_gen = self._build_features()
     predicted_cva = torch.empty((self.diffusion_engine.num_defs_per_path *
                                  self.diffusion_engine.num_paths, 1),
                                 dtype=torch.float32,
                                 device=self.device)
     with cuda.devices.gpus[self.device.index]:
         d_predicted_cva = cuda.as_cuda_array(
             predicted_cva.view(self.diffusion_engine.num_defs_per_path,
                                self.diffusion_engine.num_paths))
     if as_cuda_array:
         out = d_predicted_cva
     else:
         out = cuda.pinned_array((self.diffusion_engine.num_defs_per_path,
                                  self.diffusion_engine.num_paths),
                                 dtype=np.float32)
     if flatten:
         out = out.reshape(-1)
     while True:
         t = yield
         self._predict(t, features_gen, predicted_cva)
         if not as_cuda_array:
             d_predicted_cva.copy_to_host(out)
         yield out
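Note on the generator protocol used above: there are two yield points per step, so a caller primes the generator once and then alternates send/next, roughly (a sketch, not from the original code; 'model' stands in for the object owning predict):

gen = model.predict()
next(gen)              # run to the first 't = yield'
cva_t = gen.send(t)    # supply the time step, receive the predictions
next(gen)              # advance back to 't = yield' before the next step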
Example 8
 def test_host_alloc_pinned(self):
     ary = cuda.pinned_array(10, dtype=np.uint32)
     ary.fill(123)
     self.assertTrue(all(ary == 123))
     devary = cuda.to_device(ary)
     driver.device_memset(devary, 0, driver.device_memory_size(devary))
     self.assertTrue(all(ary == 123))
     devary.copy_to_host(ary)
     self.assertTrue(all(ary == 0))
Example 9
 def test_host_alloc_pinned(self):
     ary = cuda.pinned_array(10, dtype=np.uint32)
     ary.fill(123)
     self.assertTrue(all(ary == 123))
     devary = cuda.to_device(ary)
     driver.device_memset(devary, 0, driver.device_memory_size(devary))
     self.assertTrue(all(ary == 123))
     devary.copy_to_host(ary)
     self.assertTrue(all(ary == 0))
Example 10
 def _build_labels_backward(self, as_cuda_tensor):
     d_spread_integral_now = self.diffusion_engine.d_spread_integrals[0, 1:]
     d_spread_integral_next = self.diffusion_engine.d_spread_integrals[1,
                                                                       1:]
     d_mtm_next = self.diffusion_engine.d_mtm_by_cpty[0]
     d_rate_integral_now = self.diffusion_engine.d_dom_rate_integral[0]
     d_rate_integral_next = self.diffusion_engine.d_dom_rate_integral[1]
     d_def = self.diffusion_engine.d_def_indicators[0]
     d_labels_by_cpty = self.diffusion_engine.d_mtm_by_cpty[1]
     t_out = torch.empty((self.diffusion_engine.num_defs_per_path,
                          self.diffusion_engine.num_paths),
                         dtype=torch.float32,
                         device=self.device)
     with cuda.devices.gpus[self.device.index]:
         d_out = cuda.as_cuda_array(t_out)
     if as_cuda_tensor:
         out = t_out
     else:
         out = cuda.pinned_array((self.diffusion_engine.num_defs_per_path,
                                  self.diffusion_engine.num_paths),
                                 dtype=np.float32)
     out[:] = 0
     if as_cuda_tensor:
         yield out.view(-1, 1)
     else:
         yield out.reshape(-1, 1)
     d_spread_integral_next.copy_to_device(
         self.diffusion_engine.spread_integrals[
             self.diffusion_engine.num_coarse_steps, 1:])
     d_rate_integral_next.copy_to_device(
         self.diffusion_engine.dom_rate_integral[
             self.diffusion_engine.num_coarse_steps])
     accumulate = False
     for t in range(self.diffusion_engine.num_coarse_steps - 1, -1, -1):
         d_spread_integral_now.copy_to_device(
             self.diffusion_engine.spread_integrals[t, 1:])
         d_rate_integral_now.copy_to_device(
             self.diffusion_engine.dom_rate_integral[t])
         d_mtm_next.copy_to_device(self.diffusion_engine.mtm_by_cpty[t + 1])
         d_def.copy_to_device(self.diffusion_engine.def_indicators[t])
         self.__cuda_build_labels_backward(d_spread_integral_now,
                                           d_spread_integral_next,
                                           d_rate_integral_now,
                                           d_rate_integral_next, d_mtm_next,
                                           d_labels_by_cpty, t > 0,
                                           accumulate)
         self.__cuda_aggregate_survival(d_labels_by_cpty, d_def, d_out)
         if as_cuda_tensor:
             yield out.view(-1, 1)
         else:
             d_out.copy_to_host(out)
             yield out.reshape(-1, 1)
         if not accumulate:
             accumulate = True
Example 11
 def test_host_operators(self):
     for ary in [
             cuda.mapped_array(10, dtype=np.uint32),
             cuda.pinned_array(10, dtype=np.uint32)
     ]:
         ary[:] = range(10)
         self.assertTrue(sum(ary + 1) == 55)
         self.assertTrue(sum((ary + 1) * 2 - 1) == 100)
         self.assertTrue(sum(ary < 5) == 5)
         self.assertTrue(sum(ary <= 5) == 6)
         self.assertTrue(sum(ary > 6) == 3)
         self.assertTrue(sum(ary >= 6) == 4)
         self.assertTrue(sum(ary**2) == 285)
         self.assertTrue(sum(ary // 2) == 20)
         self.assertTrue(sum(ary / 2.0) == 22.5)
         self.assertTrue(sum(ary % 2) == 5)
Example 12
    def __init__(self):
        self.device_array = cuda.device_array((1024, 2048), np.float32)
        self.host_array = cuda.pinned_array((1024, 2048), np.float32)
        self.stream = cuda.stream()
        self.zoom = 0.3
        self.pos = np.array([-1, 0], np.float32)

        dpi = mpl.rcParams['figure.dpi']
        figsize = 2048 / float(dpi), 1024 / float(dpi)

        self.fig = plt.figure(figsize=figsize)
        self.fig.canvas.mpl_connect('scroll_event', self.zoom_cb)
        self.ax = self.fig.add_axes([0, 0, 1, 1])
        self.ax.axis('off')
        self.calculate()
        self.img = self.ax.imshow(self.host_array)
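calculate() itself is not shown in this snippet; a plausible sketch of what it does, which is where the pinned host_array and the private stream pay off (the fractal kernel compute_image is hypothetical, not from the original code):

    def calculate(self):
        # fill the device buffer with a (hypothetical) @cuda.jit kernel ...
        compute_image[(64, 64), (32, 16), self.stream](
            self.device_array, self.pos, self.zoom)
        # ... then copy it asynchronously into pinned host memory and wait
        self.device_array.copy_to_host(self.host_array, stream=self.stream)
        self.stream.synchronize()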
Example 13
    def test_pinned_warn_on_host_array(self):
        @cuda.jit
        def foo(r, x):
            r[0] = x + 1

        N = 10
        ary = cuda.pinned_array(N, dtype=np.float32)

        with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1):
            with warnings.catch_warnings(record=True) as w:
                foo[1, N](ary, N)

        self.assertEqual(w[0].category, NumbaPerformanceWarning)
        self.assertIn('Host array used in CUDA kernel will incur',
                      str(w[0].message))
        self.assertIn('copy overhead', str(w[0].message))
Example 14
 def _build_features(self):
     features = cuda.pinned_array(
         (self.diffusion_engine.num_defs_per_path,
          self.diffusion_engine.num_paths, self.num_features),
         dtype=np.float32)
     while True:
         t = yield
         t_prev_reset = self.prev_reset_arr[t]
         features[:, :, :2 * self.diffusion_engine.num_rates -
                  1] = self.diffusion_engine.X[
                      t, :2 * self.diffusion_engine.num_rates - 1].T[None]
         np.maximum(
             self.diffusion_engine.X[t,
                                     2 * self.diffusion_engine.num_rates:2 *
                                     self.diffusion_engine.num_rates +
                                     self.diffusion_engine.num_spreads -
                                     1].T[None],
             0.,
             out=features[:, :, 2 * self.diffusion_engine.num_rates -
                          1:2 * self.diffusion_engine.num_rates +
                          self.diffusion_engine.num_spreads - 2])
         if t_prev_reset > 0:
             features[:, :, 2 * self.diffusion_engine.num_rates +
                      self.diffusion_engine.num_spreads -
                      2:3 * self.diffusion_engine.num_rates +
                      self.diffusion_engine.num_spreads -
                      2] = self.diffusion_engine.X[
                          t_prev_reset, :self.diffusion_engine.
                          num_rates].T[None]
         else:
             features[:, :, 2 * self.diffusion_engine.num_rates +
                      self.diffusion_engine.num_spreads -
                      2:3 * self.diffusion_engine.num_rates +
                      self.diffusion_engine.num_spreads - 2] = 0
         self.__unpack(
             self.diffusion_engine.def_indicators[t],
             features[:, :, 3 * self.diffusion_engine.num_rates +
                      self.diffusion_engine.num_spreads - 2:])
         yield features.reshape(
             self.diffusion_engine.num_defs_per_path *
             self.diffusion_engine.num_paths, -1)
Example 15
def vecadd(x, y):
    with time_region_cuda() as t_xfer:
        d_x = cuda.to_device(x)
        d_y = cuda.to_device(y)
        d_z = cuda.device_array_like(x)

    block_size = 128
    num_blocks = N // block_size
    if N % block_size:
        num_blocks += 1

    with time_region_cuda() as t_kernel:
        _vecadd_cuda[num_blocks, block_size](d_z, d_x, d_y)

    with time_region_cuda(t_xfer.elapsed_time()) as t_xfer:
        ret = cuda.pinned_array(N)
        d_z.copy_to_host(ret)

    print(f'  CUDA xfer overheads: {t_xfer.elapsed_time()} s')
    print(f'  CUDA kernel time:    {t_kernel.elapsed_time()} s')
    return ret
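time_region_cuda is not defined in this snippet; a minimal sketch of a compatible context manager built on CUDA events (an assumption about its implementation, consistent with how it is called above):

class time_region_cuda:
    def __init__(self, time_offset=0.0):
        self._offset = time_offset
        self._evt_start = cuda.event(timing=True)
        self._evt_end = cuda.event(timing=True)

    def __enter__(self):
        self._evt_start.record()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._evt_end.record()
        self._evt_end.synchronize()

    def elapsed_time(self):
        # event_elapsed_time reports milliseconds; the callers above print seconds
        return self._offset + cuda.event_elapsed_time(self._evt_start, self._evt_end) * 1e-3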
Example 16
def similarity_gpu(
    emx,
    clusmethod,
    corrmethod,
    preout,
    postout,
    minexpr,
    maxexpr,
    minsamp,
    minclus,
    maxclus,
    criterion,
    mincorr,
    maxcorr,
    gsize,
    lsize,
    outfile):

    # allocate device buffers
    W = gsize
    N = emx.shape[1]
    N_pow2 = next_power_2(N)
    K = maxclus

    in_emx               = cuda.to_device(emx)
    in_index_cpu         = cuda.pinned_array((W, 2), dtype=np.int32)
    in_index_gpu         = cuda.device_array_like(in_index_cpu)
    work_x               = cuda.device_array((W, N_pow2), dtype=np.float32)
    work_y               = cuda.device_array((W, N_pow2), dtype=np.float32)
    work_gmm_data        = cuda.device_array((W, N, 2), dtype=np.float32)
    work_gmm_labels      = cuda.device_array((W, N), dtype=np.int8)
    work_gmm_pi          = cuda.device_array((W, K), dtype=np.float32)
    work_gmm_mu          = cuda.device_array((W, K, 2), dtype=np.float32)
    work_gmm_sigma       = cuda.device_array((W, K, 2, 2), dtype=np.float32)
    work_gmm_sigmaInv    = cuda.device_array((W, K, 2, 2), dtype=np.float32)
    work_gmm_normalizer  = cuda.device_array((W, K), dtype=np.float32)
    work_gmm_MP          = cuda.device_array((W, K, 2), dtype=np.float32)
    work_gmm_counts      = cuda.device_array((W, K), dtype=np.int32)
    work_gmm_logpi       = cuda.device_array((W, K), dtype=np.float32)
    work_gmm_xm          = cuda.device_array((W, 2), dtype=np.float32)
    work_gmm_Sxm         = cuda.device_array((W, 2), dtype=np.float32)
    work_gmm_gamma       = cuda.device_array((W, N, K), dtype=np.float32)
    work_gmm_logL        = cuda.device_array((W, 1), dtype=np.float32)
    work_gmm_entropy     = cuda.device_array((W, 1), dtype=np.float32)
    out_K_cpu            = cuda.pinned_array((W,), dtype=np.int8)
    out_K_gpu            = cuda.device_array_like(out_K_cpu)
    out_labels_cpu       = cuda.pinned_array((W, N), dtype=np.int8)
    out_labels_gpu       = cuda.device_array_like(out_labels_cpu)
    out_correlations_cpu = cuda.pinned_array((W, K), dtype=np.float32)
    out_correlations_gpu = cuda.device_array_like(out_correlations_cpu)

    # iterate through global work blocks
    n_genes = emx.shape[0]
    n_total_pairs = n_genes * (n_genes - 1) // 2

    index_x = 1
    index_y = 0

    for i in range(0, n_total_pairs, gsize):
        # print("%8d %8d" % (i, n_total_pairs))

        # determine number of pairs
        n_pairs = min(gsize, n_total_pairs - i)

        # initialize index array
        index_x_ = index_x
        index_y_ = index_y

        for j in range(n_pairs):
            in_index_cpu[j] = index_x_, index_y_
            index_x_, index_y_ = pairwise_increment(index_x_, index_y_)

        # copy index array to device
        in_index_gpu.copy_to_device(in_index_cpu)

        # execute similarity kernel
        similarity_gpu_helper[gsize // lsize, lsize](
            n_pairs,
            in_emx,
            in_index_gpu,
            clusmethod,
            corrmethod,
            preout,
            postout,
            minexpr,
            maxexpr,
            minsamp,
            minclus,
            maxclus,
            criterion,
            work_x,
            work_y,
            work_gmm_data,
            work_gmm_labels,
            work_gmm_pi,
            work_gmm_mu,
            work_gmm_sigma,
            work_gmm_sigmaInv,
            work_gmm_normalizer,
            work_gmm_MP,
            work_gmm_counts,
            work_gmm_logpi,
            work_gmm_xm,
            work_gmm_Sxm,
            work_gmm_gamma,
            work_gmm_logL,
            work_gmm_entropy,
            out_K_gpu,
            out_labels_gpu,
            out_correlations_gpu
        )
        cuda.synchronize()

        # copy results from device
        out_K_gpu.copy_to_host(out_K_cpu)
        out_labels_gpu.copy_to_host(out_labels_cpu)
        out_correlations_gpu.copy_to_host(out_correlations_cpu)

        # save correlation matrix to output file
        index_x_ = index_x
        index_y_ = index_y

        for j in range(n_pairs):
            # extract pairwise results
            K = out_K_cpu[j]
            labels = out_labels_cpu[j]
            correlations = out_correlations_cpu[j, 0:K]

            # save pairwise results
            write_pair(
                index_x_, index_y_,
                K,
                labels,
                correlations,
                mincorr,
                maxcorr,
                outfile)

            # increment pairwise index
            index_x_, index_y_ = pairwise_increment(index_x_, index_y_)

        # update local pairwise index
        index_x = index_x_
        index_y = index_y_
Example 17
def run_gpu_loop(
        weights,
        delays,
        total_time=60e3,
        bold_tr=1800,
        coupling_scaling=0.01,
        r_sigma=1e-3,
        V_sigma=1e-3,
        I=1.0,
        Delta=1.0,
        eta=-5.0,
        tau=100.0,
        J=15.0,
        cr=0.01,
        cv=0.0,
        dt=1.0,
        nh=256,  # history buf len, must be power of 2 & greater than delays.max()/dt
        nto=16,  # num parts of nh for tavg, e.g. nh=256, nto=4: tavg over 64 steps
        progress=False,
        icfun=default_icfun,
        rng_seed=42):
    assert weights.shape == delays.shape and weights.shape[0] == weights.shape[
        1]
    nn = weights.shape[0]
    w = weights.astype(np.float32)
    d = (delays / dt).astype(np.int32)
    assert d.max() < nh
    assert nto <= nh, 'oversampling <= buffer size'
    make_loop = make_gpu_loop
    # TODO
    block_dim_x = 96  # nodes
    grid_dim_x = 64, 16, 16  # subjects, noise, coupling
    nt = np.prod(grid_dim_x)
    # allocate workspace stuff
    print('allocating memory..')
    if True:  # TODO no dedent in lab editor..
        r, V = rV = np.zeros((2, nh, nn, nt), 'f')
        nrV = np.zeros((2, nt), 'f')  # no stack arrays in Numba
        print('creating rngs..', end='')
        rngs = create_xoroshiro128p_states(int(nt * nn * 2), rng_seed)
        print('done')
        tavg = np.zeros((nto, 2, nn, nt), 'f')  # buffer for temporal average
        bold_state = np.zeros((nn, 4, nt), 'f')  # buffer for bold state
        bold_state[:, 1:] = 1.0
        bold_out = np.zeros((nn, nt), 'f')  # buffer for bold output
        icfun(-np.r_[:nh] * dt, rV)
    I, Delta, eta, tau, J, cr, cv, r_sigma, V_sigma = [
        nb.float32(_)
        for _ in (I, Delta, eta, tau, J, cr, cv, r_sigma, V_sigma)
    ]
    print('workspace allocations done')
    # first call to jit the function
    cfpre, cfpost = make_linear_cfun(coupling_scaling)
    loop = make_loop(nh, nto, nn, dt, cfpre, cfpost, block_dim_x)
    # outer loop setup
    win_len = nto * dt
    total_wins = int(total_time / win_len)
    bold_skip = int(bold_tr / win_len)
    # pinned memory for speeding up kernel invocations
    from numba.cuda import to_device, pinned_array
    g_nrV, g_r, g_V, g_rngs, g_w, g_d, g_tavg, g_bold_state, g_bold_out = [
        to_device(_)
        for _ in (nrV, r, V, rngs, w, d, tavg, bold_state, bold_out)
    ]
    p_tavg = pinned_array(tavg.shape, dtype=np.float32)
    p_bold_out = pinned_array(bold_out.shape, dtype=np.float32)
    # TODO mem map this, since it will get too big
    # tavg_trace = np.zeros((total_wins, ) + tavg.shape, 'f')
    bold_trace = np.zeros((total_wins // bold_skip + 1, ) + bold_out.shape,
                          'f')
    # start time stepping
    print('starting time stepping')
    for t in (tqdm.trange if progress else range)(total_wins):
        loop[grid_dim_x,
             block_dim_x](g_nrV, g_r, g_V, g_rngs, g_w, g_d, g_tavg,
                          g_bold_state, g_bold_out, I, Delta, eta, tau, J, cr,
                          cv, r_sigma, V_sigma)
        g_tavg.copy_to_host(p_tavg)
        # print(p_tavg)
        # tavg_trace[t] = p_tavg
        if t % bold_skip == 0:
            g_bold_out.copy_to_host(p_bold_out)
            bold_trace[t // bold_skip] = p_bold_out
Example 18
    def _allocate_host_arrays(self):
        self.X = cuda.pinned_array(
            (self.num_coarse_steps + 1, self.num_diffusions, self.num_paths),
            np.float32)
        self.mtm_by_cpty = cuda.pinned_array(
            (self.num_coarse_steps + 1, self.num_spreads - 1, self.num_paths),
            np.float32)
        self.cash_flows_by_cpty = cuda.pinned_array(
            (self.num_coarse_steps + 1, self.num_spreads - 1, self.num_paths),
            np.float32)
        self.spread_integrals = cuda.pinned_array(
            (self.num_coarse_steps + 1, self.num_spreads, self.num_paths),
            np.float32)
        self.dom_rate_integral = cuda.pinned_array(
            (self.num_coarse_steps + 1, self.num_paths), np.float32)
        self.def_indicators = cuda.pinned_array(
            (self.num_coarse_steps + 1, (self.num_spreads - 1 + 7) // 8,
             self.num_defs_per_path, self.num_paths), np.int8)
        if not self.no_nested_cva:
            try:
                self.nested_cva = cuda.pinned_array(
                    (self.num_coarse_steps + 1, self.num_defs_per_path,
                     self.num_paths), np.float32)
            except cuda.cudadrv.driver.CudaAPIError:
                print(
                    'couldn\'t allocate pinned array for nested_cva, using the numpy allocator instead (non-pinned array).'
                )
                self.nested_cva = np.empty(
                    (self.num_coarse_steps + 1, self.num_defs_per_path,
                     self.num_paths), np.float32)
            try:
                self.nested_cva_sq = cuda.pinned_array(
                    (self.num_coarse_steps + 1, self.num_defs_per_path,
                     self.num_paths), np.float32)
            except cuda.cudadrv.driver.CudaAPIError:
                print(
                    'couldn\'t allocate pinned array for nested_cva_sq, using the numpy allocator instead (non-pinned array).'
                )
                self.nested_cva_sq = np.empty(
                    (self.num_coarse_steps + 1, self.num_defs_per_path,
                     self.num_paths), np.float32)

        if not self.no_nested_im:
            try:
                self.nested_im_by_cpty = cuda.pinned_array(
                    (self.num_coarse_steps + 1, self.num_spreads - 1,
                     self.num_paths), np.float32)
            except cuda.cudadrv.driver.CudaAPIError:
                print(
                    'couldn\'t allocate pinned array for nested_im_by_cpty, using the numpy allocator instead (non-pinned array).'
                )
                self.nested_im_by_cpty = np.empty(
                    (self.num_coarse_steps + 1, self.num_spreads - 1,
                     self.num_paths), np.float32)

        self.R = np.empty((self.num_diffusions, self.num_diffusions),
                          dtype=np.float32)
        # following matrices are flattened (and only upper-triangular entries are kept)
        self.g_R = np.empty(
            self.num_diffusions * (self.num_diffusions + 1) // 2, np.float32)
        self.g_L_T = np.empty(
            self.num_diffusions * (self.num_diffusions + 1) // 2, np.float32)

        self.g_diff_params = np.empty(
            5 * self.num_rates - 2 + 3 * self.num_spreads, np.float32)

        # storing copies of product specs for each warp
        self.vanillas_on_fx_f32 = np.empty((self.vanilla_specs.size, 3),
                                           np.float32)  # mat, notional, stk
        self.vanillas_on_fx_i32 = np.empty((self.vanilla_specs.size, 2),
                                           np.int32)  # cpty, fgn_ccy
        self.vanillas_on_fx_b8 = np.empty((self.vanilla_specs.size, 1),
                                          np.bool8)  # call_put
        # first_reset, reset_freq, notional, swap_rate
        self.irs_f32 = np.empty((self.irs_specs.size, 4), np.float32)
        # num_resets, cpty, ccy
        self.irs_i32 = np.empty((self.irs_specs.size, 3), np.int32)
        self.zcs_f32 = np.empty((self.zcs_specs.size, 2),
                                np.float32)  # mat, notional
        self.zcs_i32 = np.empty((self.zcs_specs.size, 2),
                                np.int32)  # cpty, ccy
Example 19
rte_off = 0.01
hp = 0.3
btm = 1
# Below are shape_next matrices transferred between host and device once in each iteration
u_shape_next = np.zeros((user, topic),
                        dtype=np.float32)  #cuda.pinned_array((500000,500))
v_shape_next = np.zeros((author, topic),
                        dtype=np.float32)  #cuda.pinned_array((5000,500))
y_shape_next = np.zeros((user, topic),
                        dtype=np.float32)  #cuda.pinned_array((500000,500))
theta_shape_next = np.zeros((tweet, topic),
                            dtype=np.float32)  #cuda.pinned_array((500000,500))
beta_shape_next = np.zeros((word, topic),
                           dtype=np.float32)  #cuda.pinned_array((500000,500))

elogu = cuda.pinned_array((user, topic), dtype=np.float32)
elogv = cuda.pinned_array((author, topic), dtype=np.float32)
elogy = cuda.pinned_array((user, topic), dtype=np.float32)
elogtheta = cuda.pinned_array((tweet, topic), dtype=np.float32)
elogbeta = cuda.pinned_array((word, topic), dtype=np.float32)


def computlog(a, b):
    return np.float32(digamma(a)) - np.float32(np.log(b))
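# Note: computlog above is the usual variational expectation
# E[log x] = digamma(a) - log(b) for x ~ Gamma(shape=a, rate=b);
# digamma presumably comes from scipy.special (the import is not shown in this snippet).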


def random_offset(x, y, k):

    return np.random.rand(x, y) * k

Example 20
	# (truncated snippet) tail of the saruprng device function used by zero_pos
	# below; the final 32-bit state u32 is mapped onto a float in [low, high)
	TWO_N32 = 0.232830643653869628906250e-9
	return np.uint32(u32)*(TWO_N32*(high-low))+low
	

@cuda.jit("void(int32, int32, float32[:])")
def zero_pos(npa, seed, d_ran):
	i = cuda.grid(1)
	if i < npa:
		d_ran[i] = saruprng(np.uint32(i), np.uint32(npa-i), np.uint32(seed), 5, -1.0, 1.0)

		
		
if __name__ == "__main__":
	# Allocate a pinned host buffer and a matching device array.
	npa = 100
	ran = cuda.pinned_array(npa, dtype=np.float32)
	d_ran = cuda.to_device(ran)	
	# Record the starting time.
	start = time.time()

	block_size = 256
	nblocks = math.ceil(npa / block_size)
	seed = 12454
	zero_pos[nblocks, block_size](npa, seed, d_ran)
	cuda.synchronize()
	d_ran.copy_to_host(ran)  # copy back into the pinned buffer rather than allocating a new pageable array
	# Record the ending time.
	end = time.time()
	print(ran)
	
	print(end - start)
Example 21
def init_gpu(part, walls):
    global dim
    global pw_blk_rows, pw_blk_cols, pw_grd_rows, pw_grd_cols
    global pw_blk_shp, pw_grd_shp, pw_tot_shp
    global pp_blk_rows, pp_blk_cols, pp_grd_rows, pp_grd_cols
    global pp_blk_shp, pp_grd_shp, pp_tot_shp
    global get_pp_dt_gpu, get_pw_dt_gpu

    dim = part.dim

    bw = min(threads_per_block_max, part.num)
    pw_blk_rows = bw
    pw_blk_cols = 1

    gw = int(np.ceil(part.num / bw))
    pw_grd_rows = gw
    pw_grd_cols = len(walls)

    pw_blk_shp = (pw_blk_rows, pw_blk_cols)
    pw_grd_shp = (pw_grd_rows, pw_grd_cols)
    pw_tot_shp = (pw_grd_rows, pw_grd_cols, pw_blk_rows, pw_blk_cols)

    bp = min(optimal_part_num, part.num)
    pp_blk_rows = bp
    pp_blk_cols = bp

    gp = int(np.ceil(part.num / bp))
    pp_grd_rows = gp
    pp_grd_cols = gp

    pp_blk_shp = (pp_blk_rows, pp_blk_cols)
    pp_grd_shp = (pp_grd_rows, pp_grd_cols)
    pp_tot_shp = (pp_grd_rows, pp_grd_cols, pp_blk_rows, pp_blk_cols)

    part.walls_data = np.array([wall.data.copy() for wall in walls],
                               dtype=np_dtype)
    part.walls_data_dev = cuda.to_device(part.walls_data)

    part.pw_gap_min_dev = cuda.to_device(part.pw_gap_min)
    part.pp_gap_min_dev = cuda.to_device(part.pp_gap_min)

    part.pos_dev = cuda.to_device(part.pos)
    part.pos_loc_dev = cuda.to_device(part.pos_loc)
    part.vel_dev = cuda.to_device(part.vel)

    ## We wish to minimize data transfer between gpu and cpu for the sake of speed.
    ## We want to avoid passing all p x p times because these are floats.
    ## Instead, we will pass only the smallest time from each block via pp_dt_blk_gpu
    ## and a BOOLEAN array called pp_events
    ## An entry of pp_events is True only if the dt for that pair of particles
    ## is within thresh of the min_dt on its block.
    ## Now, the true GLOBAL min_dt may be from a different block
    ## So, once passed to the cpu, we find the global min_dt = min(pp_dt_blk)
    ## and set all entries of pp_events to False except those from
    ## blocks that achieve this global min_dt

    ## Repeat for particle-wall

    ## Finally, if the min_dt over p-p interactions is bigger than the min_dt over p-w,
    ## we set every entry of pp_events to False.  Conversely for pw_events.
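    ## For example: if pp_dt_blk = [[0.7, 0.3],
    ##                              [0.9, 0.3]]
    ## then the global min_dt is 0.3 and (with a small thresh)
    ## blk_idx = [[False, True], [False, True]], so every pp_events entry computed
    ## by the two left-hand blocks is forced to False on the host in get_pp_dt_gpu
    ## below, even if it won on its own block.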

    part.pw_events_dev = cuda.to_device(part.pw_events_old)
    part.pp_events_dev = cuda.to_device(part.pp_events_old)

    part.pw_dt_blk_gpu = cuda.pinned_array(pw_grd_shp, dtype=np_dtype)
    part.pw_dt_blk_gpu[:] = np.inf
    part.pw_dt_blk_dev = cuda.to_device(part.pw_dt_blk_gpu)

    part.pp_dt_blk_gpu = cuda.pinned_array(pp_grd_shp, dtype=np_dtype)
    part.pp_dt_blk_gpu[:] = np.inf
    part.pp_dt_blk_dev = cuda.to_device(part.pp_dt_blk_gpu)

    # Though inefficient, we may occasionally want to pass all pxp and pxw dt's
    # back to CPU for validation and debugging.  So, we create pp_dt_gpu and pw_dt_gpu
    # for this purpose.  In other cases, it is set as a 1x1 matrix and ignored
    # (other than as a placeholder in function signatures and calls).

    if check_gpu_cpu:
        pw_sh = [part.num, len(walls)]
        pp_sh = [part.num, part.num]
    else:
        pw_sh = [1, 1]
        pp_sh = [1, 1]

    part.pw_dt_gpu = cuda.pinned_array(pw_sh, dtype=np_dtype)
    part.pw_dt_gpu[:] = np.inf
    part.pw_dt_dev = cuda.to_device(part.pw_dt_gpu)

    part.pp_dt_gpu = cuda.pinned_array(pp_sh, dtype=np_dtype)
    part.pp_dt_gpu[:] = np.inf
    part.pp_dt_dev = cuda.to_device(part.pp_dt_gpu)

    update_gpu(part)

    @cuda.jit(device=False)
    def get_pp_dt_kernel(pos, vel, pp_gap_min, pp_events_dev, pp_dt_blk_dev,
                         pp_dt_dev):
        row_loc = cuda.threadIdx.x
        col_loc = cuda.threadIdx.y
        idx_loc = row_loc * cuda.blockDim.y + col_loc

        blk_row = cuda.blockIdx.x
        blk_col = cuda.blockIdx.y
        row_glob = blk_row * cuda.blockDim.x + row_loc
        col_glob = blk_col * cuda.blockDim.y + col_loc
        idx_glob = row_glob * cuda.blockDim.y * cuda.gridDim.y + col_glob

        p = row_glob
        q = col_glob
        N = pos.shape[0]

        pos1_shr = cuda.shared.array(shape=(pp_blk_rows, dim), dtype=nb_dtype)
        pos2_shr = cuda.shared.array(shape=(pp_blk_cols, dim), dtype=nb_dtype)
        vel1_shr = cuda.shared.array(shape=(pp_blk_rows, dim), dtype=nb_dtype)
        vel2_shr = cuda.shared.array(shape=(pp_blk_cols, dim), dtype=nb_dtype)
        pp_dt_shr = cuda.shared.array(shape=(pp_blk_rows, pp_blk_cols),
                                      dtype=nb_dtype)
        temp_shr = cuda.shared.array(shape=(pp_blk_rows, pp_blk_cols),
                                     dtype=nb_dtype)

        d = col_loc
        if d < dim:
            pos1_shr[row_loc, d] = pos[p, d]
            vel1_shr[row_loc, d] = vel[p, d]

        d = row_loc
        if d < dim:
            pos2_shr[col_loc, d] = pos[q, d]
            vel2_shr[col_loc, d] = vel[q, d]

        cuda.syncthreads()

        if (p < N) and (q < N):
            c0 = 0.0
            c1 = 0.0
            c2 = 0.0
            for d in range(dim):
                dx = pos1_shr[row_loc, d] - pos2_shr[col_loc, d]
                dv = vel1_shr[row_loc, d] - vel2_shr[col_loc, d]
                c0 += (dx**2)
                c1 += (dx * dv * 2)
                c2 += (dv**2)
            c0 -= (pp_gap_min[p, q]**2)
            dt = solver_gpu(c2, c1, c0, pp_events_dev[p, q])
        else:
            dt = np.inf

        pp_dt_shr[row_loc, col_loc] = dt
        temp_shr[row_loc, col_loc] = dt
        cuda.syncthreads()
        min_gpu(temp_shr)
        min_dt = temp_shr[0, 0]

        if (p < N) and (q < N):
            pp_events_dev[p, q] = (dt < min_dt + thresh)
            if idx_loc == 0:
                pp_dt_blk_dev[blk_row, blk_col] = min_dt
            if check_gpu_cpu:
                pp_dt_dev[p, q] = dt

    @cuda.jit(device=False)
    def get_pw_dt_kernel(pos_loc, vel, pw_gap_min, walls, pw_events_dev,
                         pw_dt_blk_dev, pw_dt_dev):
        row_loc = cuda.threadIdx.x
        col_loc = cuda.threadIdx.y
        idx_loc = row_loc * cuda.blockDim.y + col_loc

        blk_row = cuda.blockIdx.x
        blk_col = cuda.blockIdx.y
        row_glob = blk_row * cuda.blockDim.x + row_loc
        col_glob = blk_col * cuda.blockDim.y + col_loc
        idx_glob = row_glob * cuda.blockDim.y * cuda.gridDim.y + col_glob

        p = row_glob
        w = col_glob
        N = pos_loc.shape[0]
        W = walls.shape[0]
        shape = walls[w, 0, 0]

        pw_dt_shr = cuda.shared.array(shape=(pw_blk_rows, pw_blk_cols),
                                      dtype=nb_dtype)
        temp_shr = cuda.shared.array(shape=(pw_blk_rows, pw_blk_cols),
                                     dtype=nb_dtype)

        if (p < N) and (w < W):
            c0 = 0.0
            c1 = 0.0
            c2 = 0.0
            if shape <= -0.5:
                # I'm an ignore wall; record an infinite time so this pair never wins
                dt = np.inf
            else:
                point = cuda.shared.array(shape=dim, dtype=nb_dtype)
                normal = cuda.shared.array(shape=dim, dtype=nb_dtype)
                d = row_loc
                if d < dim:
                    point[d] = walls[w, 1, d]
                    normal[d] = walls[w, 2, d]
                cuda.syncthreads()
                if shape <= 0.5:
                    # I'm a flat wall
                    for d in range(dim):
                        dx = pos_loc[p, d] - point[d]
                        dv = vel[p, d]
                        c0 += dx * normal[d]
                        c1 += dv * normal[d]
                    c0 -= pw_gap_min[p, w]
                elif shape <= 1.5:
                    # I'm a sphere wall
                    for d in range(dim):
                        dx = pos_loc[p, d] - point[d]
                        dv = vel[p, d]
                        c0 += dx * dx
                        c1 += dx * dv * 2
                        c2 += dv * dv
                    c0 -= pw_gap_min[p, w]**2
                elif shape <= 2.5:
                    # I'm a cylinder wall
                    dx_ax_mag = 0.0
                    dv_ax_mag = 0.0
                    for d in range(dim):
                        dx = pos_loc[p, d] - point[d]
                        dv = vel[p, d]
                        dx_ax_mag += dx * normal[d]
                        dv_ax_mag += dv * normal[d]

                    for d in range(dim):
                        dx = pos_loc[p, d] - point[d]
                        dv = vel[p, d]
                        dx_ax = dx_ax_mag * normal[d]
                        dx_normal = dx - dx_ax
                        dv_ax = dv_ax_mag * normal[d]
                        dv_normal = dv - dv_ax
                        c0 += dx_normal * dx_normal
                        c1 += dx_normal * dv_normal * 2
                        c2 += dv_normal * dv_normal
                    c0 -= pw_gap_min[p, w]**2
                else:
                    raise Exception('Invalid wall type')
                dt = solver_gpu(c2, c1, c0, pw_events_dev[p, w])
        else:
            dt = np.inf

        pw_dt_shr[row_loc, col_loc] = dt
        temp_shr[row_loc, col_loc] = dt
        cuda.syncthreads()
        min_gpu(temp_shr)
        min_dt = temp_shr[0, 0]

        if (p < N) and (w < W):
            pw_events_dev[p, w] = (dt < min_dt + thresh)
            if idx_loc == 0:
                pw_dt_blk_dev[blk_row, blk_col] = min_dt
            if check_gpu_cpu:
                pw_dt_dev[p, w] = dt

    def get_pp_dt_gpu(part, walls):
        get_pp_dt_kernel[pp_grd_shp,
                         pp_blk_shp](part.pos_dev, part.vel_dev,
                                     part.pp_gap_min_dev, part.pp_events_dev,
                                     part.pp_dt_blk_dev, part.pp_dt_dev)

        if check_gpu_cpu:
            part.pp_dt_gpu = part.pp_dt_dev.copy_to_host()

        part.pp_events_new = part.pp_events_dev.copy_to_host()
        part.pp_dt_blk_gpu = part.pp_dt_blk_dev.copy_to_host()

        part.pp_dt = np.min(part.pp_dt_blk_gpu)  # min over blocks
        blk_idx = (part.pp_dt_blk_gpu < part.pp_dt + thresh
                   )  # which blocks obtained that min

        # Need to remove True for p-p dts that were smaller than all others on their block
        # but are larger than the global dt from other blocks.   The corresponding
        # entry of blk_idx is False and part.pp_events_new is True.
        # So, we reshape blk_idx and take the entry-wise AND.

        reps = np.prod(pp_blk_shp)  # number of threads on each block
        A = np.repeat(blk_idx.ravel(),
                      reps)  # repetition for each thread on the block
        new_sh = part.pp_events_new.shape
        blk_events = A[:np.prod(new_sh)].reshape(
            new_sh)  # drop extra entries and reshape
        part.pp_events_new &= blk_events

    def get_pw_dt_gpu(part, walls):
        ## See comments for get_pp_dt_gpu

        get_pw_dt_kernel[pw_grd_shp,
                         pw_blk_shp](part.pos_loc_dev, part.vel_dev,
                                     part.pw_gap_min_dev, part.walls_data_dev,
                                     part.pw_events_dev, part.pw_dt_blk_dev,
                                     part.pw_dt_dev)

        if check_gpu_cpu:
            part.pw_dt_gpu = part.pw_dt_dev.copy_to_host()

        part.pw_events_new = part.pw_events_dev.copy_to_host()
        part.pw_dt_blk_gpu = part.pw_dt_blk_dev.copy_to_host()
        part.pw_dt = np.min(part.pw_dt_blk_gpu)
        blk_idx = (part.pw_dt_blk_gpu < part.pw_dt + thresh)

        reps = np.prod(pw_blk_shp)
        A = np.repeat(blk_idx.ravel(), reps)
        new_sh = part.pw_events_new.shape
        blk_events = A[:np.prod(new_sh)].reshape(new_sh)
        part.pw_events_new &= blk_events