Example #1
0
def laplace_3d_cuda(d, n):
    L = d.shape[0]
    M = d.shape[1]
    N = d.shape[2]

    blockdim = (8, 8, 8)
    griddim = (L // blockdim[0], M // blockdim[1], N // blockdim[2])
    #print(griddim)

    stream = cuda.stream()
    dd = cuda.to_device(d, stream)
    dn = cuda.to_device(n, stream)

    #%timeit -n 32 -r 16 d0td1_cuda_kernel[griddim, blockdim](dd, dn)
    # warm-up launches so JIT compilation is not included in the timing
    for i in range(100):
        laplace_3d_cuda_opt_kernel[griddim, blockdim, stream](dd, dn, L, M, N)

    # time 100 launches with CUDA events and report the per-launch average
    evtstart = cuda.event(timing=True)
    evtend = cuda.event(timing=True)
    evtstart.record(stream)
    for i in range(100):
        laplace_3d_cuda_opt_kernel[griddim, blockdim, stream](dd, dn, L, M, N)
    evtend.record(stream)
    evtend.synchronize()
    print(cuda.event_elapsed_time(evtstart, evtend) / 100.)

    # copy the result back into the original host array
    dd.copy_to_host(d, stream=stream)
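The kernel being timed is not listed on this page; below is a minimal sketch of a 7-point Laplacian stencil matching the launch above (only the name and argument list come from the call, the body is an assumption):

from numba import cuda

@cuda.jit
def laplace_3d_cuda_opt_kernel(d, n, L, M, N):
    # one thread per grid point; interior points receive the 7-point Laplacian of d
    i, j, k = cuda.grid(3)
    if 0 < i < L - 1 and 0 < j < M - 1 and 0 < k < N - 1:
        n[i, j, k] = (d[i - 1, j, k] + d[i + 1, j, k] +
                      d[i, j - 1, k] + d[i, j + 1, k] +
                      d[i, j, k - 1] + d[i, j, k + 1] -
                      6.0 * d[i, j, k])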
Example #2
0
def main():
    inp = np.arange(1000000, dtype=np.int32)
    factor = 4
    start, end = cuda.event(True), cuda.event(True)

    reses = []
    for (name, f) in [
        ("not shared", mult_by_x_not_shared),
        ("shared", mult_by_x_shared),
        ("not shared", mult_by_x_not_shared),
    ]:
        times = []
        for i in range(100):

            d_out = cuda.device_array_like(inp)

            start.record()
            f[blocks, threadsPerBlock](cuda.to_device(inp), d_out,
                                       cuda.to_device(np.array([factor])))
            end.record()
            end.synchronize()

            out = d_out.copy_to_host()

            # skip the first iteration, which includes JIT compilation
            if i != 0:
                times.append(cuda.event_elapsed_time(start, end))
        print(
            f"{name}: {np.mean(times):.2f} +/- {np.std(times) / np.sqrt(len(times)):.3f} (max: {np.max(times):.2f})"
        )
        reses.append(out)
    assert np.all([reses[0] == reses_i for reses_i in reses])
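The two kernels being compared are not listed; here is a hedged sketch of what the "shared" and "not shared" variants could look like given the launch arguments above (only the names and argument order come from the call, the bodies are assumptions):

from numba import cuda, int32

@cuda.jit
def mult_by_x_not_shared(inp, out, factor):
    # every thread reads the scalar factor directly from global memory
    i = cuda.grid(1)
    if i < inp.size:
        out[i] = inp[i] * factor[0]

@cuda.jit
def mult_by_x_shared(inp, out, factor):
    # stage the scalar factor in shared memory once per block
    sf = cuda.shared.array(1, dtype=int32)
    if cuda.threadIdx.x == 0:
        sf[0] = factor[0]
    cuda.syncthreads()
    i = cuda.grid(1)
    if i < inp.size:
        out[i] = inp[i] * sf[0]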
Example #3
0
    def __exit__(self, *args):

        cuda.select_device(self.gpu)
        suffix = 'ms (' + self.label + ')' if self.label else 'ms'
        self.end.record()
        self.end.synchronize()
        time = cuda.event_elapsed_time(self.start, self.end)
        print('elapsed time:', int(time), suffix)
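Only the __exit__ method is shown; a minimal sketch of an enclosing timer context manager it could belong to (the class name and __enter__ body are assumptions, consistent with the attributes used above):

from numba import cuda

class GpuTimer:
    def __init__(self, gpu=0, label=''):
        self.gpu = gpu
        self.label = label

    def __enter__(self):
        cuda.select_device(self.gpu)
        self.start = cuda.event(timing=True)
        self.end = cuda.event(timing=True)
        self.start.record()
        return self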
Example #4
0
def main(image1, image2):
    streams = []
    start_events = []
    end_events = []
    data1_gpu = []
    data2_gpu = []
    gpu_out = []
    out = []

    data_image1 = np.array(image1)
    data_image2 = np.array(image2)
    print(data_image1.shape, data_image2.shape)

    shape_A = data_image1.shape
    # flatten the images to 1-D arrays before splitting them into chunks
    data_image1 = data_image1.ravel()
    data_image2 = data_image2.ravel()

    input1 = np.split(data_image1, X)
    input2 = np.split(data_image2, X)

    for _ in range(len(input1)):
        streams.append(cuda.stream())
        start_events.append(cuda.event())
        end_events.append(cuda.event())

    for i in range(len(input1)):
        data1_gpu.append(cuda.to_device(input1[i], stream=streams[i]))
        data2_gpu.append(cuda.to_device(input2[i], stream=streams[i]))

    t_start = perf_counter()
    for i in range(len(input1)):
        start_events[i].record(streams[i])
        sumImages[1, 32, streams[i]](data1_gpu[i], data2_gpu[i])
    t_end = perf_counter()

    for i in range(len(input1)):
        end_events[i].record(streams[i])
        gpu_out.append(data2_gpu[i].copy_to_host(stream=streams[i]))

    for i in range(len(gpu_out)):
        out = np.concatenate((out, gpu_out[i]))

    kernel_times = []

    for k in range(len(input1)):
        kernel_times.append(
            cuda.event_elapsed_time(start_events[k], end_events[k]))

    out = out.reshape(shape_A)
    out = out.astype('uint8')
    out = Image.fromarray(out)
    out.save("out_stream.png")
    print(f'Total time: {t_end - t_start}')
    print(f'Mean kernel duration (milliseconds): {np.mean(kernel_times)}')
    print(f'Kernel duration standard deviation '
          f'(milliseconds): {np.std(kernel_times)}')
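The sumImages kernel launched with one block of 32 threads per stream is not shown; a minimal grid-stride sketch that adds the first chunk into the second in place (the in-place convention is inferred from data2_gpu being copied back; the body is an assumption):

from numba import cuda

@cuda.jit
def sumImages(a, b):
    # grid-stride loop so a single 32-thread block still covers the whole chunk
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for i in range(start, b.size, stride):
        b[i] = a[i] + b[i]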
Example #5
0
def main():
    # Compile
    lots_of_copies()
    few_copies()

    # Now benchmark
    start, end = cuda.event(timing=True), cuda.event(timing=True)
    n = 200
    for f in [lots_of_copies, few_copies]:
        times = []
        for _ in range(n):
            start.record()
            f()
            end.record()
            end.synchronize()
            t = cuda.event_elapsed_time(start, end)
            times.append(t)
        print(f.__name__, np.mean(times), np.std(times) / np.sqrt(n))
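The lots_of_copies and few_copies functions under test are not included; a hedged sketch of the pattern the names suggest (many small host-to-device transfers versus one large one; chunk count and size are arbitrary assumptions):

import numpy as np
from numba import cuda

chunks = [np.random.rand(1000) for _ in range(100)]

def lots_of_copies():
    # one host-to-device transfer per small array
    return [cuda.to_device(c) for c in chunks]

def few_copies():
    # concatenate on the host first, then a single large transfer
    return cuda.to_device(np.concatenate(chunks))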
Example #6
0
def test_last_block():

    MAX_TPB = 512
    n = 1024

    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)

    start = timer()
    MyScan.exprefixsumNumba(a, reference, init=0)
    end = timer()

    auxidx = -1

    elb = a.size
    p2elb = int(np.ceil(np.log2(elb)))
    telb = 2 ** p2elb
    tlb = telb // 2
    startIdx = 0

    sm_size = telb * a.itemsize

    aux = np.empty(1, dtype=np.int8)

    trash = cuda.device_array(1)

    e1, e2 = cuda.event(), cuda.event()

    e1.record()
    MyScan.last_scan[1, tlb, 0, sm_size](a, aux, -1, elb, startIdx)
    e2.record()
    e2.synchronize()

    print("CPU took:    ", (end - start) * 1000, " ms")
    print("Kernel took: ", cuda.event_elapsed_time(e1, e2), " ms")

    print((a == reference).all())
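MyScan.exprefixsumNumba, the CPU reference used here, is not listed; a minimal exclusive prefix-sum with the same call signature (the in-place output convention is an assumption):

import numpy as np

def exprefixsumNumba(a, out, init=0):
    # exclusive scan: out[i] = init + a[0] + ... + a[i-1]
    total = init
    for i in range(a.size):
        out[i] = total
        total += a[i]
    return total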
Example #7
0
def last_block_test():

    MAX_TPB = 512
    n = 1024

    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)

    start = timer()
    scan.exprefixsumNumba(a, reference, init=0)
    end = timer()

    auxidx = -1

    elb = a.size
    p2elb = int(np.ceil(np.log2(elb)))
    telb = 2 ** p2elb
    tlb = telb // 2
    startIdx = 0

    sm_size = telb * a.itemsize

    aux = np.empty(1, dtype=np.int8)

    trash = cuda.device_array(1)

    e1, e2 = cuda.event(), cuda.event()

    e1.record()
    scan.last_scan[1, tlb, 0, sm_size](a, aux, -1, elb, startIdx)
    e2.record()
    e2.synchronize()

    print("CPU took:    ", (end - start) * 1000, " ms")
    print("Kernel took: ", cuda.event_elapsed_time(e1, e2), " ms")

    print((a == reference).all())
Example #8
0
    d_z2 = cuda.to_device(z2.astype(np.float32))
    d_w2 = cuda.to_device(w2.astype(np.float32))

    d_rbins_squared = cuda.to_device(DEFAULT_RBINS_SQUARED.astype(np.float32))
    d_result_nb = cuda.device_array_like(result.astype(np.float32))

    # running the Numba jit kernel
    for i in range(4):
        if i > 0:
            start.record()
            _s = time.time()
        count_weighted_pairs_3d_cuda[blocks,
                                     threads](d_x1, d_y1, d_z1, d_w1, d_x2,
                                              d_y2, d_z2, d_w2,
                                              d_rbins_squared, d_result_nb)
        if i > 0:
            end.record()
            end.synchronize()
            _e = time.time()
            timing_nb += cuda.event_elapsed_time(start, end)
            timing_nb_wall += (_e - _s)

    print('numba events:', timing_nb / 3, 'ms')
    print('numba wall  :', timing_nb_wall / 3 * 1000, 'ms')

    # print(count_weighted_pairs_3d_cuda.inspect_types())

if kind == 'both':
    # check that the CUDA kernel agrees with the Numba kernel
    assert cp.allclose(d_result_cp, d_result_nb, rtol=5E-4)
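This fragment starts mid-function; the events and accumulators it uses would have been created earlier. A possible setup, with every name taken from its later use (values are assumptions):

from numba import cuda

start, end = cuda.event(timing=True), cuda.event(timing=True)
timing_nb = 0.0       # accumulated CUDA-event time, in ms
timing_nb_wall = 0.0  # accumulated wall-clock time, in s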
Example #9
0
def mst_cluster_coassoc():
    t1, t2 = Timer(), Timer()

    #foldername = "/home/courses/aac2015/diogoaos/QCThesis/datasets/gaussmix1e4/"
    foldername = home + "QCThesis/datasets/gaussmix1e4/"

    print "Loading datasets"

    t1.tic()
    # dest = np.genfromtxt(foldername + "prot_dest.csr", dtype = np.int32, delimiter=",")
    # weight = np.genfromtxt(foldername + "prot_weight.csr", dtype = np.float32, delimiter=",")
    # fe = np.genfromtxt(foldername + "prot_fe.csr", dtype = np.int32, delimiter=",")

    dest = np.genfromtxt(foldername + "full_dest.csr", dtype = np.int32, delimiter=",")
    weight = np.genfromtxt(foldername + "full_weight.csr", dtype = np.float32, delimiter=",")
    fe = np.genfromtxt(foldername + "full_fe.csr", dtype = np.int32, delimiter=",")
    t1.tac()

    print "loading elapsed time : ", t1.elapsed

    fe = fe[:-1]
    od = np.empty_like(fe)
    outdegree_from_firstedge(fe, od, dest.size)

    # fix weights to dissimilarity
    weight = 100 - weight

    print "# edges : ", dest.size
    print "# vertices : ", fe.size
    print "edges/vertices ratio : ", dest.size * 1.0 / fe.size

    t1.tic()
    mst, n_edges = boruvka_minho_seq(dest, weight, fe, od)
    t1.tac()

    print "seq: time elapsed : ", t1.elapsed
    print "seq: mst size :", mst.size
    print "seq: n_edges : ", n_edges

    if n_edges < mst.size:
        mst = mst[:n_edges]
    mst.sort()

    ev1, ev2 = cuda.event(), cuda.event()

    ev1.record()
    d_dest = cuda.to_device(dest)
    d_weight = cuda.to_device(weight)
    d_fe = cuda.to_device(fe)
    d_od = cuda.to_device(od)
    ev2.record()
    ev2.synchronize()

    send_graph_time = cuda.event_elapsed_time(ev1, ev2)

    t2.tic()
    mst2, n_edges2 = boruvka_minho_gpu(d_dest, d_weight, d_fe, d_od, MAX_TPB=512, returnDevAry = True)
    t2.tac()

    ev1.record()
    mst2 = mst2.copy_to_host()
    n_edges2 = n_edges2.getitem(0)
    ev2.record()
    ev2.synchronize()

    recv_mst_time = cuda.event_elapsed_time(ev1, ev2)
    print("gpu: send graph time : ", send_graph_time)
    print("gpu: time elapsed : ", t2.elapsed)
    print("gpu: rcv mst time : ", recv_mst_time)
    print("gpu: mst size :", mst2.size)
    print("gpu: n_edges : ", n_edges2)

    if n_edges2 < mst2.size:
        mst2 = mst2[:n_edges2]
    mst2.sort()

    if n_edges == n_edges2:
        mst_is_equal = (mst == mst2).all()
    else:
        mst_is_equal = False
    print "mst gpu == seq : ", mst_is_equal
Example #10
0
for i in range(num_arrays):
    # Configure the blocks
    threadsperblock = (16, 16)
    blockspergrid_x = int(math.ceil(A.shape[0] / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(B.shape[1] / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    # Start the kernel
    start_events[i].record(streams[i])
    matmul[blockspergrid, threadsperblock, streams[i]](A_gpu[i], B_gpu[i],
                                                       C_gpu[i])

for i in range(num_arrays):
    end_events[i].record(streams[i])

for i in range(num_arrays):
    # Copy the result back to the host
    C_out.append(C_gpu[i].copy_to_host(stream=streams[i]))

t_end = perf_counter()
kernel_times = []

for i in range(num_arrays):
    kernel_times.append(cuda.event_elapsed_time(start_events[i],
                                                end_events[i]))

print('Total time: %f' % (t_end - t_start))
print('Mean kernel duration (milliseconds): %f' % numpy.mean(kernel_times))
print('Mean kernel standard deviation (milliseconds): %f' %
      numpy.std(kernel_times))
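The matmul kernel launched above is not reproduced on this page; the standard naive Numba CUDA matrix multiply has the matching signature (whether this example used exactly this body is an assumption):

from numba import cuda

@cuda.jit
def matmul(A, B, C):
    # one thread per output element: C[i, j] = sum_k A[i, k] * B[k, j]
    i, j = cuda.grid(2)
    if i < C.shape[0] and j < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[i, k] * B[k, j]
        C[i, j] = tmp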
Example #11
0
    def _cu_label(self, data, centroids):
        # WARNING: data is transposed when sent to the GPU

        data_ev1, data_ev2 = cuda.event(), cuda.event()
        labels_ev1, labels_ev2 = cuda.event(), cuda.event()
        dists_ev1, dists_ev2 = cuda.event(), cuda.event()

        N, D = data.shape
        K, cD = centroids.shape

        if self._cuda_mem not in ('manual', 'auto'):
            raise ValueError("cuda_mem must be 'manual' or 'auto'")

        if self._gridDim is None or self._blockDim is None:
            self._compute_cuda_dims(data)

        labels = np.empty(N, dtype=np.int32)

        if self._cuda_mem == 'manual':
            # copy dataset and centroids, allocate memory

            ## cuda persistent handles
            # avoids redundant data transfer
            # if dataset has not been sent to device, send it and save handle
            if self._cudaDataHandle is None:
                dataT = np.ascontiguousarray(data.T)

                self.man_prof['data_ev1'].record()
                dData = cuda.to_device(dataT)
                self.man_prof['data_ev2'].record()
                self.man_prof['data_ev2'].synchronize()
                time_ms = cuda.event_elapsed_time(self.man_prof['data_ev1'],
                                                  self.man_prof['data_ev2'])
                self.man_prof['data_timings'].append(time_ms)
                self._cudaDataHandle = dData
            # otherwise just use handle
            else:
                dData = self._cudaDataHandle

            # avoids creating labels array in device more than once
            if self._cuda_labels_handle is None:
                dLabels = cuda.device_array_like(labels)
                self._cuda_labels_handle = dLabels
            else:
                dLabels = self._cuda_labels_handle

            # avoids creating dists array in device more than once
            if self._cuda_dists_handle is None:
                dDists = cuda.device_array_like(self._dists)
                self._cuda_dists_handle = dDists
            else:
                dDists = self._cuda_dists_handle

            # copy centroids to device
            self.man_prof['centroids_ev1'].record()
            dCentroids = cuda.to_device(centroids)
            self.man_prof['centroids_ev2'].record()

            # launch kernel
            self.man_prof['kernel_ev1'].record()
            _cu_label_kernel_dists[self._gridDim, self._blockDim](dData,
                                                                 dCentroids,
                                                                 dLabels,
                                                                 dDists)
            self.man_prof['kernel_ev2'].record()

            # cuda.synchronize()

            # self.man_prof['kernel_ev2'].synchronize()

            # copy labels from device to host
            self.man_prof['labels_ev1'].record()
            dLabels.copy_to_host(ary=labels)
            self.man_prof['labels_ev2'].record()

            # copy distance to centroids from device to host
            self.man_prof['dists_ev1'].record()
            dists = dDists.copy_to_host()
            self.man_prof['dists_ev2'].record()
            self._dists = dists

            # synchronize host with gpu before computing times
            self.man_prof['dists_ev2'].synchronize()

            # store timings
            time_ms = cuda.event_elapsed_time(self.man_prof['centroids_ev1'],
                                              self.man_prof['centroids_ev2'])
            self.man_prof['centroids_timings'].append(time_ms)

            time_ms = cuda.event_elapsed_time(self.man_prof['kernel_ev1'],
                                              self.man_prof['kernel_ev2'])
            self.man_prof['kernel_timings'].append(time_ms)

            time_ms = cuda.event_elapsed_time(self.man_prof['labels_ev1'],
                                              self.man_prof['labels_ev2'])
            self.man_prof['labels_timings'].append(time_ms)

            time_ms = cuda.event_elapsed_time(self.man_prof['dists_ev1'],
                                              self.man_prof['dists_ev2'])
            self.man_prof['dists_timings'].append(time_ms)

        elif self._cuda_mem == 'auto':
            self.auto_prof['kernel_ev1'].record()
            _cu_label_kernel_dists[self._gridDim,self._blockDim](data,
                                                                centroids,
                                                                labels,
                                                                self._dists)
            self.auto_prof['kernel_ev2'].record()
            # synchronize so the event has completed before reading its timing
            self.auto_prof['kernel_ev2'].synchronize()
            time_ms = cuda.event_elapsed_time(self.auto_prof['kernel_ev1'],
                                              self.auto_prof['kernel_ev2'])
            self.auto_prof['kernel_timings'].append(time_ms)

        else:
            raise ValueError("CUDA memory management type must be "
                             "'manual' or 'auto'.")

        return labels
Example #12
0
    ## data column major
    # GPU data
    data_t = data.T
    dData = cuda.to_device(data_t)
    dCentroids = cuda.to_device(centroids)
    dLabels = cuda.device_array(n, dtype=np.int32)
    dDists = cuda.device_array(n, dtype=np.float32)

    # kernel
    kt_start.record()
    _cu_label_kernel_dists[bpg, tpb](dData, dCentroids, dLabels, dDists)
    kt_end.record()
    kt_end.synchronize()

    # time
    time_ms = cuda.event_elapsed_time(kt_start, kt_end)
    print('Kernel time (data column major): {} ms'.format(time_ms))

    ## data row major
    # GPU data
    dData = cuda.to_device(data)
    dCentroids = cuda.to_device(centroids)
    dLabels = cuda.device_array(n, dtype=np.int32)
    dDists = cuda.device_array(n, dtype=np.float32)

    # kernel
    kt_start.record()
    _cu_label_kernel_dists[bpg, tpb](dData, dCentroids, dLabels, dDists)
    kt_end.record()
    kt_end.synchronize()
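This fragment likewise begins mid-function; the timing events and launch configuration it uses would have been created earlier in the function, for example (tpb is an assumed value):

from numba import cuda

kt_start = cuda.event(timing=True)
kt_end = cuda.event(timing=True)
tpb = 256                      # threads per block (assumed)
bpg = (n + tpb - 1) // tpb     # blocks per grid, one thread per sample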
Example #13
0
    def elapsed_time(self):
        return self._time_off + 1.e-3 * cuda.event_elapsed_time(
            self._t_start, self._t_end)
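A possible surrounding class for this one-line method, assuming _t_start/_t_end are CUDA events recorded by hypothetical start()/stop() methods and _time_off is an accumulated offset in seconds:

from numba import cuda

class GpuStopwatch:
    # hypothetical host-side class around the elapsed_time method above
    def __init__(self):
        self._time_off = 0.0
        self._t_start = cuda.event(timing=True)
        self._t_end = cuda.event(timing=True)

    def start(self):
        self._t_start.record()

    def stop(self):
        self._t_end.record()
        self._t_end.synchronize()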
Example #14
0
    def generate_batch(self,
                       end=None,
                       verbose=False,
                       fused=False,
                       nested_cva_at=None,
                       nested_im_at=None,
                       indicator_in_cva=False,
                       alpha=None,
                       im_window=None):
        if end is None:
            end = self.num_coarse_steps
        t = 0.
        self._reset()
        self.cuda_generate_exp1(self.d_exp_1, self.d_rng_states)
        self.stream.synchronize()
        self.cuda_compute_mtm(
            0, t, self.d_X, self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
            self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
            self.d_vanillas_on_fx_b8, self.d_irs_f32, self.d_irs_i32,
            self.d_zcs_f32, self.d_zcs_i32, self.dt, self.max_coarse_per_reset,
            self.cDtoH_freq, True)
        self.stream.synchronize()
        self.d_mtm_by_cpty[0].copy_to_host(ary=self.mtm_by_cpty[0],
                                           stream=self.stream)
        self.d_cash_flows_by_cpty[0].copy_to_host(
            ary=self.cash_flows_by_cpty[0], stream=self.stream)

        _cuda_bulk_diffuse_event_begin = [cuda.event() for i in range(end)]
        _cuda_bulk_diffuse_event_end = [cuda.event() for i in range(end)]

        _cuda_compute_mtm_event_begin = [cuda.event() for i in range(end)]
        _cuda_compute_mtm_event_end = [cuda.event() for i in range(end)]

        _cuda_nested_cva_event_begin = [cuda.event() for i in range(end)]
        _cuda_nested_cva_event_end = [cuda.event() for i in range(end)]

        _cuda_nested_im_event_begin = [cuda.event() for i in range(end)]
        _cuda_nested_im_event_end = [cuda.event() for i in range(end)]

        for coarse_idx in range(1, end + 1):
            t += self.dT
            idx_in_dev_arr = (coarse_idx - 1) % self.cDtoH_freq + 1
            if not fused:
                _cuda_bulk_diffuse_event_begin[coarse_idx -
                                               1].record(stream=self.stream)
                self.cuda_bulk_diffuse(
                    idx_in_dev_arr, t, self.d_X, self.d_def_indicators,
                    self.d_dom_rate_integral, self.d_spread_integrals,
                    self.d_irs_f32, self.d_irs_i32, self.d_exp_1,
                    self.d_rng_states, self.dt, self.max_coarse_per_reset)
                _cuda_bulk_diffuse_event_end[coarse_idx -
                                             1].record(stream=self.stream)
                _cuda_compute_mtm_event_begin[coarse_idx -
                                              1].record(stream=self.stream)
                self.cuda_compute_mtm(
                    idx_in_dev_arr, t, self.d_X, self.d_mtm_by_cpty,
                    self.d_cash_flows_by_cpty, self.d_vanillas_on_fx_f32,
                    self.d_vanillas_on_fx_i32, self.d_vanillas_on_fx_b8,
                    self.d_irs_f32, self.d_irs_i32, self.d_zcs_f32,
                    self.d_zcs_i32, self.dt, self.max_coarse_per_reset,
                    self.cDtoH_freq, False)
                _cuda_compute_mtm_event_end[coarse_idx -
                                            1].record(stream=self.stream)
            else:
                _cuda_bulk_diffuse_event_begin[coarse_idx -
                                               1].record(stream=self.stream)
                if idx_in_dev_arr == 1:
                    self.cuda_diffuse_and_price(
                        1, self.cDtoH_freq, t, self.d_X,
                        self.d_dom_rate_integral, self.d_spread_integrals,
                        self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
                        self.d_irs_f32, self.d_irs_i32,
                        self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
                        self.d_vanillas_on_fx_b8, self.d_rng_states, self.dt,
                        self.max_coarse_per_reset, self.cDtoH_freq)
                    self.cuda_oversimulate_defs(1, self.cDtoH_freq,
                                                self.d_def_indicators,
                                                self.d_spread_integrals,
                                                self.d_exp_1)
                _cuda_bulk_diffuse_event_end[coarse_idx -
                                             1].record(stream=self.stream)
            if nested_cva_at is not None:
                _cuda_nested_cva_event_begin[coarse_idx -
                                             1].record(stream=self.stream)
                if coarse_idx in nested_cva_at:
                    self.cuda_nested_cva(
                        idx_in_dev_arr, self.num_coarse_steps - coarse_idx, t,
                        self.d_X, self.d_def_indicators,
                        self.d_dom_rate_integral, self.d_spread_integrals,
                        self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
                        self.d_irs_f32, self.d_irs_i32,
                        self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
                        self.d_vanillas_on_fx_b8, self.d_exp_1,
                        self.d_rng_states, self.dt, self.cDtoH_freq,
                        indicator_in_cva, self.d_nested_cva,
                        self.d_nested_cva_sq)
                    self.d_nested_cva.copy_to_host(
                        ary=self.nested_cva[coarse_idx], stream=self.stream)
                    self.d_nested_cva_sq.copy_to_host(
                        ary=self.nested_cva_sq[coarse_idx], stream=self.stream)
                _cuda_nested_cva_event_end[coarse_idx -
                                           1].record(stream=self.stream)

            if nested_im_at is not None:
                _cuda_nested_im_event_begin[coarse_idx -
                                            1].record(stream=self.stream)
                if coarse_idx in nested_im_at:
                    for adam_iter in range(self.num_adam_iters):
                        adam_init = adam_iter == 0
                        step_size = self.lam * (adam_iter + 1)**(-self.gamma)
                        self.cuda_nested_im(
                            alpha, adam_init, step_size, idx_in_dev_arr,
                            im_window, t, self.d_X,
                            self.d_mtm_by_cpty[idx_in_dev_arr], self.d_irs_f32,
                            self.d_irs_i32, self.d_vanillas_on_fx_f32,
                            self.d_vanillas_on_fx_i32,
                            self.d_vanillas_on_fx_b8, self.d_rng_states,
                            self.dt, self.d_nested_im_by_cpty,
                            self.d_nested_im_std_by_cpty, self.d_nested_im_m,
                            self.d_nested_im_v, self.adam_b1, self.adam_b2,
                            adam_iter)
                    self.d_nested_im_by_cpty.copy_to_host(
                        ary=self.nested_im_by_cpty[coarse_idx],
                        stream=self.stream)
                _cuda_nested_im_event_end[coarse_idx -
                                          1].record(stream=self.stream)

            if coarse_idx % self.cDtoH_freq == 0:
                self.d_X[self.max_coarse_per_reset:].copy_to_host(
                    ary=self.X[coarse_idx - self.cDtoH_freq + 1:coarse_idx +
                               1],
                    stream=self.stream)
                self.d_spread_integrals[1:].copy_to_host(
                    ary=self.spread_integrals[coarse_idx - self.cDtoH_freq +
                                              1:coarse_idx + 1],
                    stream=self.stream)
                self.d_dom_rate_integral[1:].copy_to_host(
                    ary=self.dom_rate_integral[coarse_idx - self.cDtoH_freq +
                                               1:coarse_idx + 1],
                    stream=self.stream)
                self.d_def_indicators[1:].copy_to_host(
                    ary=self.def_indicators[coarse_idx - self.cDtoH_freq +
                                            1:coarse_idx + 1],
                    stream=self.stream)
                self.d_mtm_by_cpty[1:].copy_to_host(
                    ary=self.mtm_by_cpty[coarse_idx - self.cDtoH_freq +
                                         1:coarse_idx + 1],
                    stream=self.stream)
                self.d_cash_flows_by_cpty[1:].copy_to_host(
                    ary=self.cash_flows_by_cpty[coarse_idx - self.cDtoH_freq +
                                                1:coarse_idx + 1],
                    stream=self.stream)
                self.d_X[:self.max_coarse_per_reset].copy_to_device(
                    self.d_X[-self.max_coarse_per_reset:], stream=self.stream)
                self.d_spread_integrals[0].copy_to_device(
                    self.d_spread_integrals[self.cDtoH_freq],
                    stream=self.stream)
                self.d_dom_rate_integral[0].copy_to_device(
                    self.d_dom_rate_integral[self.cDtoH_freq],
                    stream=self.stream)
                self.d_def_indicators[0].copy_to_device(
                    self.d_def_indicators[self.cDtoH_freq], stream=self.stream)

        if end % self.cDtoH_freq != 0:
            start_idx = (end // self.cDtoH_freq) * self.cDtoH_freq + 1
            length = end % self.cDtoH_freq
            self.d_X[self.max_coarse_per_reset:self.max_coarse_per_reset +
                     length].copy_to_host(ary=self.X[start_idx:start_idx +
                                                     length],
                                          stream=self.stream)
            self.d_spread_integrals[1:length + 1].copy_to_host(
                ary=self.spread_integrals[start_idx:start_idx + length],
                stream=self.stream)
            self.d_dom_rate_integral[1:length + 1].copy_to_host(
                ary=self.dom_rate_integral[start_idx:start_idx + length],
                stream=self.stream)
            self.d_def_indicators[1:length + 1].copy_to_host(
                ary=self.def_indicators[start_idx:start_idx + length],
                stream=self.stream)
            self.d_mtm_by_cpty[1:length + 1].copy_to_host(
                ary=self.mtm_by_cpty[start_idx:start_idx + length],
                stream=self.stream)
            self.d_cash_flows_by_cpty[1:length + 1].copy_to_host(
                ary=self.cash_flows_by_cpty[start_idx:start_idx + length],
                stream=self.stream)

        if verbose:
            print('Everything was successfully queued!')

        for evt_cuda_bulk_diffuse_event, evt_cuda_compute_mtm_event, evt_cuda_nested_cva_event, evt_cuda_nested_im_event in zip(
                _cuda_bulk_diffuse_event_end, _cuda_compute_mtm_event_end,
                _cuda_nested_cva_event_end, _cuda_nested_im_event_end):
            evt_cuda_bulk_diffuse_event.synchronize()
            evt_cuda_compute_mtm_event.synchronize()
            evt_cuda_nested_cva_event.synchronize()
            evt_cuda_nested_im_event.synchronize()

        self.stream.synchronize()

        if not fused:
            print('cuda_bulk_diffuse average elapsed time per launch: {0} ms'.
                  format(
                      round(
                          sum(
                              cuda.event_elapsed_time(evt_begin, evt_end)
                              for evt_begin, evt_end in zip(
                                  _cuda_bulk_diffuse_event_begin,
                                  _cuda_bulk_diffuse_event_end)) / end, 3)))
            print('compute_mtm average elapsed time per launch: {0} ms'.format(
                round(
                    sum(
                        cuda.event_elapsed_time(evt_begin, evt_end)
                        for evt_begin, evt_end in zip(
                            _cuda_compute_mtm_event_begin,
                            _cuda_compute_mtm_event_end)) / end, 3)))
        else:
            print('cuda_diffuse_and_price elapsed time: {0} ms'.format(
                round(
                    sum(
                        cuda.event_elapsed_time(evt_begin, evt_end)
                        for evt_begin, evt_end in zip(
                            _cuda_bulk_diffuse_event_begin,
                            _cuda_bulk_diffuse_event_end)), 3)))

        if nested_cva_at is not None:
            print('cuda_nested_cva average elapsed time per launch: {0} ms'.
                  format(
                      round(
                          sum(
                              cuda.event_elapsed_time(evt_begin, evt_end)
                              for evt_begin, evt_end in zip(
                                  _cuda_nested_cva_event_begin,
                                  _cuda_nested_cva_event_end)) /
                          len(nested_cva_at), 3)))

        if nested_im_at is not None:
            print('cuda_nested_im average elapsed time per launch: {0} ms'.
                  format(
                      round(
                          sum(
                              cuda.event_elapsed_time(evt_begin, evt_end)
                              for evt_begin, evt_end in zip(
                                  _cuda_nested_im_event_begin,
                                  _cuda_nested_im_event_end)) /
                          len(nested_im_at), 3)))

        # TODO: port this to CUDA
        self.cash_pos_by_cpty = ne.evaluate('c*exp(-r)',
                                            local_dict={
                                                'c':
                                                self.cash_flows_by_cpty,
                                                'r':
                                                self.dom_rate_integral[:,
                                                                       None, :]
                                            })
        np.cumsum(self.cash_pos_by_cpty, axis=0, out=self.cash_pos_by_cpty)
        self.cash_pos_by_cpty *= np.exp(self.dom_rate_integral[:, None, :])