def laplace_3d_cuda(d, n):
    """Benchmark the 3-D Laplace kernel on the GPU.

    Runs 100 warm-up launches (to absorb JIT compilation), then times
    100 more launches with CUDA events and prints the mean per-launch
    time in milliseconds.
    """
    depth = d.shape[0]
    rows = d.shape[1]
    cols = d.shape[2]
    block = (8, 8, 8)
    grid = (depth // block[0], rows // block[1], cols // block[2])
    stream = cuda.stream()
    dev_d = cuda.to_device(d, stream)
    dev_n = cuda.to_device(n, stream)
    # Warm-up: first launches include JIT compilation overhead.
    for _ in range(100):
        laplace_3d_cuda_opt_kernel[grid, block, stream](dev_d, dev_n,
                                                        depth, rows, cols)
    begin_evt = cuda.event(timing=True)
    finish_evt = cuda.event(timing=True)
    begin_evt.record(stream)
    for _ in range(100):
        laplace_3d_cuda_opt_kernel[grid, block, stream](dev_d, dev_n,
                                                        depth, rows, cols)
    finish_evt.record(stream)
    finish_evt.synchronize()
    print(cuda.event_elapsed_time(begin_evt, finish_evt) / 100.)
    dev_d.to_host()
def main():
    """Benchmark the shared vs. non-shared multiply kernels.

    Each variant is launched 100 times (trial 0 discarded as it includes
    JIT compilation); prints mean +/- standard error and max time, then
    asserts that all variants produced identical outputs.
    """
    host_in = np.arange(1000000, dtype=np.int32)
    factor = 4
    start, end = cuda.event(True), cuda.event(True)
    reses = []
    variants = [
        ("not shared", mult_by_x_not_shared),
        ("shared", mult_by_x_shared),
        ("not shared", mult_by_x_not_shared),
    ]
    for name, kernel in variants:
        times = []
        for trial in range(100):
            d_out = cuda.device_array_like(host_in)
            start.record()
            kernel[blocks, threadsPerBlock](cuda.to_device(host_in),
                                            d_out,
                                            cuda.to_device(np.array([factor])))
            end.record()
            end.synchronize()
            out = d_out.copy_to_host()
            # Trial 0 includes compilation time — skip it.
            if trial != 0:
                times.append(cuda.event_elapsed_time(start, end))
        print(
            f"{name}: {np.mean(times):.2f} +/- {np.std(times) / np.sqrt(len(times)):.3f} (max: {np.max(times):.2f})"
        )
        reses.append(out)
    assert np.all([reses[0] == reses_i for reses_i in reses])
def __exit__(self, *args):
    """Stop the timer on context exit and print the elapsed GPU time."""
    cuda.select_device(self.gpu)
    if self.label:
        suffix = 'ms (' + self.label + ')'
    else:
        suffix = 'ms'
    self.end.record()
    self.end.synchronize()
    elapsed_ms = cuda.event_elapsed_time(self.start, self.end)
    print('elapsed time:', int(elapsed_ms), suffix)
def main(image1, image2):
    """Sum two images on the GPU using multiple CUDA streams.

    Both images are flattened, split into X chunks, and each chunk pair
    is summed in place (into the second operand) by the ``sumImages``
    kernel on its own stream. Per-chunk kernel durations are measured
    with CUDA events; the result is saved as ``out_stream.png``.
    """
    streams = []
    start_events = []
    end_events = []
    data1_gpu = []
    data2_gpu = []
    gpu_out = []
    out = []
    data_image1 = np.array(image1)
    data_image2 = np.array(image2)
    print(data_image1.shape, data_image2.shape)
    shape_a = data_image1.shape
    # conversion to a 1-dimensional array
    data_image1 = data_image1.ravel()
    data_image2 = data_image2.ravel()
    input1 = np.split(data_image1, X)
    # BUG FIX: was np.split(data_image1, X), which summed image1 with
    # itself and ignored image2 entirely.
    input2 = np.split(data_image2, X)
    for _ in range(len(input1)):
        streams.append(cuda.stream())
        start_events.append(cuda.event())
        end_events.append(cuda.event())
    for i in range(len(input1)):
        data1_gpu.append(cuda.to_device(input1[i], stream=streams[i]))
        data2_gpu.append(cuda.to_device(input2[i], stream=streams[i]))
    t_start = perf_counter()
    for i in range(len(input1)):
        start_events[i].record(streams[i])
        sumImages[1, 32, streams[i]](data1_gpu[i], data2_gpu[i])
    t_end = perf_counter()
    for i in range(len(input1)):
        end_events[i].record(streams[i])
        gpu_out.append(data2_gpu[i].copy_to_host(stream=streams[i]))
    # BUG FIX: the copies above are asynchronous; wait for all streams
    # before reading the host buffers or the event timings.
    cuda.synchronize()
    for i in range(len(gpu_out)):
        out = np.concatenate((out, gpu_out[i]))
    kernel_times = []
    for k in range(len(input1)):
        kernel_times.append(
            cuda.event_elapsed_time(start_events[k], end_events[k]))
    out = out.reshape(shape_a)
    out = out.astype('uint8')
    out = Image.fromarray(out)
    out.save("out_stream.png")
    print(f'Total time: {t_end - t_start}')
    print(f'Mean kernel duration (milliseconds): {np.mean(kernel_times)}')
    print(f'Mean kernel standard deviation (milliseconds): {np.std(kernel_times)}')
def main():
    """Time lots_of_copies vs few_copies and print mean +/- std error (ms)."""
    # Run each function once first so JIT compilation is not timed.
    lots_of_copies()
    few_copies()
    start, end = cuda.event(timing=True), cuda.event(timing=True)
    n = 200
    for f in [lots_of_copies, few_copies]:
        samples = []
        for _ in range(n):
            start.record()
            f()
            end.record()
            end.synchronize()
            samples.append(cuda.event_elapsed_time(start, end))
        print(f.__name__, np.mean(samples), np.std(samples) / np.sqrt(n))
def test_last_block():
    # Exercise the single-block exclusive prefix-sum kernel on 1024 ints
    # and compare timing/results against the Numba CPU reference.
    # NOTE: Python 2 code (print statements).
    MAX_TPB = 512
    n = 1024
    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)
    start = timer()
    MyScan.exprefixsumNumba(a, reference, init=0)
    end = timer()
    auxidx = -1
    elb = a.size  # elements handled by the (last) block
    # NOTE(review): np.int is deprecated/removed in modern NumPy.
    p2elb = np.int(np.ceil(np.log2(elb)))
    telb = 2 ** p2elb  # element count padded up to a power of two
    tlb = telb / 2  # threads per block (Python 2 integer division)
    startIdx = 0
    sm_size = telb * a.itemsize  # dynamic shared memory size in bytes
    aux = np.empty(1, dtype=np.int8)
    trash = cuda.device_array(1)
    e1, e2 = cuda.event(), cuda.event()
    e1.record()
    # Launch: 1 block, tlb threads, default stream, sm_size shared bytes.
    MyScan.last_scan[1, tlb, 0, sm_size](a, aux, -1, elb, startIdx)
    e2.record()
    print "CPU took: ", (end - start) * 1000, " ms"
    print "Kernel took: ", cuda.event_elapsed_time(e1, e2), " ms"
    print (a == reference).all()
def last_block_test():
    # Same benchmark as test_last_block but driving the `scan` module.
    # NOTE: Python 2 code (print statements).
    MAX_TPB = 512
    n = 1024
    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)
    start = timer()
    scan.exprefixsumNumba(a, reference, init = 0)
    end = timer()
    auxidx = -1
    elb = a.size  # elements handled by the (last) block
    # NOTE(review): np.int is deprecated/removed in modern NumPy.
    p2elb = np.int(np.ceil(np.log2(elb)))
    telb = 2 ** p2elb  # element count padded up to a power of two
    tlb = telb / 2  # threads per block (Python 2 integer division)
    startIdx = 0
    sm_size = telb * a.itemsize  # dynamic shared memory size in bytes
    aux = np.empty(1,dtype=np.int8)
    trash = cuda.device_array(1)
    e1, e2 = cuda.event(), cuda.event()
    e1.record()
    # Launch: 1 block, tlb threads, default stream, sm_size shared bytes.
    scan.last_scan[1, tlb, 0, sm_size](a, aux, -1, elb, startIdx)
    e2.record()
    print "CPU took: ", (end - start) * 1000, " ms"
    print "Kernel took: ", cuda.event_elapsed_time(e1,e2), " ms"
    print (a == reference).all()
# --- Numba CUDA pair-counting benchmark (script fragment) ---
# NOTE(review): z2, w2, result, start, end, blocks, threads, timing_nb,
# timing_nb_wall, DEFAULT_RBINS_SQUARED, d_x1..d_w2, kind and
# d_result_cp are defined earlier in the script, outside this view.
d_z2 = cuda.to_device(z2.astype(np.float32))
d_w2 = cuda.to_device(w2.astype(np.float32))
d_rbins_squared = cuda.to_device(DEFAULT_RBINS_SQUARED.astype(np.float32))
d_result_nb = cuda.device_array_like(result.astype(np.float32))
# running the Numba jit kernel
# 4 iterations; iteration 0 is untimed warm-up (JIT), so means divide by 3.
for i in range(4):
    if i > 0:
        start.record()
    _s = time.time()
    count_weighted_pairs_3d_cuda[blocks, threads](
        d_x1, d_y1, d_z1, d_w1, d_x2, d_y2, d_z2, d_w2,
        d_rbins_squared, d_result_nb)
    if i > 0:
        end.record()
        end.synchronize()
        _e = time.time()
        timing_nb += cuda.event_elapsed_time(start, end)
        timing_nb_wall += (_e - _s)
print('numba events:', timing_nb / 3, 'ms')
print('numba wall :', timing_nb_wall / 3 * 1000, 'ms')
# print(count_weighted_pairs_3d_cuda.inspect_types())
if kind in ['both']:
    # check that the CUDA kernel agrees with the Numba kernel
    assert cp.allclose(d_result_cp, d_result_nb, rtol=5E-4)
def mst_cluster_coassoc():
    # Benchmark sequential vs. GPU Boruvka MST on a co-association graph
    # loaded from CSR files, and verify both produce the same MST.
    # NOTE: Python 2 code (print statements).
    t1, t2 = Timer(), Timer()
    foldername = home + "QCThesis/datasets/gaussmix1e4/"
    print "Loading datasets"
    t1.tic()
    # CSR graph: destination vertices, edge weights, first-edge index.
    dest = np.genfromtxt(foldername + "full_dest.csr", dtype = np.int32, delimiter=",")
    weight = np.genfromtxt(foldername + "full_weight.csr", dtype = np.float32, delimiter=",")
    fe = np.genfromtxt(foldername + "full_fe.csr", dtype = np.int32, delimiter=",")
    t1.tac()
    print "loading elapsed time : ", t1.elapsed
    fe = fe[:-1]  # drop the trailing sentinel entry
    od = np.empty_like(fe)
    outdegree_from_firstedge(fe, od, dest.size)
    # fix weights to dissimilarity
    weight = 100 - weight
    print "# edges : ", dest.size
    print "# vertices : ", fe.size
    print "edges/vertices ratio : ", dest.size * 1.0 / fe.size
    # --- sequential Boruvka ---
    t1.tic()
    mst, n_edges = boruvka_minho_seq(dest, weight, fe, od)
    t1.tac()
    print "seq: time elapsed : ", t1.elapsed
    print "seq: mst size :", mst.size
    print "seq: n_edges : ", n_edges
    if n_edges < mst.size:
        mst = mst[:n_edges]
    mst.sort()
    # --- GPU Boruvka: time the host-to-device graph transfer ---
    ev1, ev2 = cuda.event(), cuda.event()
    ev1.record()
    d_dest = cuda.to_device(dest)
    d_weight = cuda.to_device(weight)
    d_fe = cuda.to_device(fe)
    d_od = cuda.to_device(od)
    ev2.record()
    send_graph_time = cuda.event_elapsed_time(ev1, ev2)
    t2.tic()
    mst2, n_edges2 = boruvka_minho_gpu(d_dest, d_weight, d_fe, d_od,
                                       MAX_TPB=512, returnDevAry = True)
    t2.tac()
    # Time the device-to-host transfer of the result (events reused).
    ev1.record()
    mst2 = mst2.copy_to_host()
    n_edges2 = n_edges2.getitem(0)
    ev2.record()
    recv_mst_time = cuda.event_elapsed_time(ev1, ev2)
    print "gpu: send graph time : ", send_graph_time
    print "gpu: time elapsed : ", t2.elapsed
    print "gpu: rcv mst time : ", recv_mst_time
    print "gpu: mst size :", mst2.size
    print "seq: n_edges : ", n_edges2
    if n_edges2 < mst2.size:
        mst2 = mst2[:n_edges2]
    mst2.sort()
    # Compare the (sorted) edge lists of both MSTs.
    if n_edges == n_edges2:
        mst_is_equal = (mst == mst2).all()
    else:
        mst_is_equal = False
    print "mst gpu == seq : ", mst_is_equal
# --- Streamed matrix-multiply benchmark (script fragment) ---
# NOTE(review): num_arrays, streams, start_events, end_events, A, B,
# A_gpu, B_gpu, C_gpu, C_out, t_start and the `matmul` kernel are all
# defined earlier in the script, outside this view.
for i in range(num_arrays):
    # Configure the blocks
    threadsperblock = (16, 16)
    blockspergrid_x = int(math.ceil(A.shape[0] / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(B.shape[1] / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    # Start the kernel
    start_events[i].record(streams[i])
    matmul[blockspergrid, threadsperblock, streams[i]](A_gpu[i], B_gpu[i], C_gpu[i])
# Record end events only after all kernels are queued, one per stream.
for i in range(num_arrays):
    end_events[i].record(streams[i])
for i in range(num_arrays):
    # Copy the result back to the host
    C_out.append(C_gpu[i].copy_to_host(stream=streams[i]))
t_end = perf_counter()
# Per-stream kernel durations from the event pairs (milliseconds).
kernel_times = []
for i in range(num_arrays):
    kernel_times.append(cuda.event_elapsed_time(start_events[i], end_events[i]))
print('Total time: %f' % (t_end - t_start))
print('Mean kernel duration (milliseconds): %f' % numpy.mean(kernel_times))
print('Mean kernel standard deviation (milliseconds): %f' % numpy.std(kernel_times))
def _cu_label(self, data, centroids):
    """Assign each sample in ``data`` to its nearest centroid on the GPU.

    Parameters:
        data: (N, D) host array of samples.
        centroids: (K, D) host array of current centroids.

    Returns:
        labels: (N,) int32 array with the index of the closest centroid
        for each sample.

    Side effects: stores per-sample distances in ``self._dists`` and
    appends CUDA-event timings to ``self.man_prof`` ('manual' mode) or
    ``self.auto_prof`` ('auto' mode).

    WARNING: in 'manual' mode the dataset is transposed before being
    sent to the GPU.
    """
    # FIX: removed eight locally created cuda.event() pairs that were
    # never used (timing uses the events stored in self.man_prof).
    N = data.shape[0]

    if self._cuda_mem not in ('manual', 'auto'):
        raise Exception("cuda_mem = \'manual\' or \'auto\'")

    # Lazily derive the launch configuration from the dataset size.
    if self._gridDim is None or self._blockDim is None:
        self._compute_cuda_dims(data)

    labels = np.empty(N, dtype=np.int32)

    if self._cuda_mem == 'manual':
        # Persistent device handles avoid redundant host-to-device
        # transfers across iterations.
        if self._cudaDataHandle is None:
            # First call: send the (transposed) dataset and time the copy.
            dataT = np.ascontiguousarray(data.T)
            self.man_prof['data_ev1'].record()
            dData = cuda.to_device(dataT)
            self.man_prof['data_ev2'].record()
            self.man_prof['data_ev2'].synchronize()
            time_ms = cuda.event_elapsed_time(self.man_prof['data_ev1'],
                                              self.man_prof['data_ev2'])
            self.man_prof['data_timings'].append(time_ms)
            self._cudaDataHandle = dData
        else:
            dData = self._cudaDataHandle

        # Allocate the device labels array only once.
        if self._cuda_labels_handle is None:
            dLabels = cuda.device_array_like(labels)
            self._cuda_labels_handle = dLabels
        else:
            dLabels = self._cuda_labels_handle

        # Allocate the device distances array only once.
        if self._cuda_dists_handle is None:
            dDists = cuda.device_array_like(self._dists)
            self._cuda_dists_handle = dDists
        else:
            dDists = self._cuda_dists_handle

        # Centroids change every iteration, so they are always copied.
        self.man_prof['centroids_ev1'].record()
        dCentroids = cuda.to_device(centroids)
        self.man_prof['centroids_ev2'].record()

        # Launch the labeling kernel.
        self.man_prof['kernel_ev1'].record()
        _cu_label_kernel_dists[self._gridDim, self._blockDim](
            dData, dCentroids, dLabels, dDists)
        self.man_prof['kernel_ev2'].record()

        # Copy labels from device to host.
        self.man_prof['labels_ev1'].record()
        dLabels.copy_to_host(ary=labels)
        self.man_prof['labels_ev2'].record()

        # Copy distances to centroids from device to host.
        self.man_prof['dists_ev1'].record()
        dists = dDists.copy_to_host()
        self.man_prof['dists_ev2'].record()
        self._dists = dists

        # dists_ev2 is the last event queued; waiting on it guarantees
        # every earlier event has completed before timings are read.
        self.man_prof['dists_ev2'].synchronize()

        # Store timings.
        time_ms = cuda.event_elapsed_time(self.man_prof['centroids_ev1'],
                                          self.man_prof['centroids_ev2'])
        self.man_prof['centroids_timings'].append(time_ms)
        time_ms = cuda.event_elapsed_time(self.man_prof['kernel_ev1'],
                                          self.man_prof['kernel_ev2'])
        self.man_prof['kernel_timings'].append(time_ms)
        time_ms = cuda.event_elapsed_time(self.man_prof['labels_ev1'],
                                          self.man_prof['labels_ev2'])
        self.man_prof['labels_timings'].append(time_ms)
        time_ms = cuda.event_elapsed_time(self.man_prof['dists_ev1'],
                                          self.man_prof['dists_ev2'])
        self.man_prof['dists_timings'].append(time_ms)
    elif self._cuda_mem == 'auto':
        # Let Numba manage the transfers automatically.
        self.auto_prof['kernel_ev1'].record()
        _cu_label_kernel_dists[self._gridDim, self._blockDim](
            data, centroids, labels, self._dists)
        self.auto_prof['kernel_ev2'].record()
        # FIX: wait for the end event before querying elapsed time —
        # the manual branch synchronizes, this branch previously did not.
        self.auto_prof['kernel_ev2'].synchronize()
        time_ms = cuda.event_elapsed_time(self.auto_prof['kernel_ev1'],
                                          self.auto_prof['kernel_ev2'])
        self.auto_prof['kernel_timings'].append(time_ms)
    else:
        # Unreachable: the membership check above already raised.
        raise ValueError("CUDA memory management type may either \
be \'manual\' or \'auto\'.")

    return labels
## data column major # GPU data data_t = data.T dData = cuda.to_device(data_t) dCentroids = cuda.to_device(centroids) dLabels = cuda.device_array(n, dtype=np.int32) dDists = cuda.device_array(n, dtype=np.float32) # kernel kt_start.record() _cu_label_kernel_dists[bpg, tpb](dData, dCentroids, dLabels, dDists) kt_end.record() kt_end.synchronize() # time time_ms = cuda.event_elapsed_time(kt_start, kt_end) print 'Kernel time (data column major):{} ms'.format(time_ms) ## data row major # GPU data dData = cuda.to_device(data) dCentroids = cuda.to_device(centroids) dLabels = cuda.device_array(n, dtype=np.int32) dDists = cuda.device_array(n, dtype=np.float32) # kernel kt_start.record() _cu_label_kernel_dists[bpg, tpb](dData, dCentroids, dLabels, dDists) kt_end.record() kt_end.synchronize()
def elapsed_time(self):
    """Return the accumulated offset plus the GPU event span in seconds."""
    # CUDA events report milliseconds; convert before adding the offset.
    gpu_ms = cuda.event_elapsed_time(self._t_start, self._t_end)
    return self._time_off + 1.e-3 * gpu_ms
def generate_batch(self,
                   end=None,
                   verbose=False,
                   fused=False,
                   nested_cva_at=None,
                   nested_im_at=None,
                   indicator_in_cva=False,
                   alpha=None,
                   im_window=None):
    """Simulate one batch of coarse time steps on the GPU.

    Queues diffusion, pricing and (optionally) nested CVA / nested IM
    kernels on ``self.stream``, flushing device buffers back to the host
    arrays every ``self.cDtoH_freq`` coarse steps, then prints average
    per-launch timings measured with CUDA events.

    Parameters:
        end: number of coarse steps (defaults to ``self.num_coarse_steps``).
        verbose: print a message once all work has been queued.
        fused: use the fused diffuse-and-price kernel instead of separate
            diffuse / compute-mtm launches per step.
        nested_cva_at / nested_im_at: collections of coarse step indices
            at which to run the nested CVA / nested IM kernels.
        indicator_in_cva: forwarded to the nested CVA kernel.
        alpha, im_window: forwarded to the nested IM kernel.
    """
    if end is None:
        end = self.num_coarse_steps
    t = 0.
    self._reset()
    # Generate the exponential variates consumed by the default logic.
    self.cuda_generate_exp1(self.d_exp_1, self.d_rng_states)
    self.stream.synchronize()
    # Price everything at t = 0 (slot 0 of the rolling device buffers).
    self.cuda_compute_mtm(
        0, t, self.d_X, self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
        self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
        self.d_vanillas_on_fx_b8, self.d_irs_f32, self.d_irs_i32,
        self.d_zcs_f32, self.d_zcs_i32, self.dt,
        self.max_coarse_per_reset, self.cDtoH_freq, True)
    self.stream.synchronize()
    self.d_mtm_by_cpty[0].copy_to_host(ary=self.mtm_by_cpty[0],
                                       stream=self.stream)
    self.d_cash_flows_by_cpty[0].copy_to_host(
        ary=self.cash_flows_by_cpty[0], stream=self.stream)
    # One begin/end CUDA event pair per coarse step per kernel family.
    _cuda_bulk_diffuse_event_begin = [cuda.event() for i in range(end)]
    _cuda_bulk_diffuse_event_end = [cuda.event() for i in range(end)]
    _cuda_compute_mtm_event_begin = [cuda.event() for i in range(end)]
    _cuda_compute_mtm_event_end = [cuda.event() for i in range(end)]
    _cuda_nested_cva_event_begin = [cuda.event() for i in range(end)]
    _cuda_nested_cva_event_end = [cuda.event() for i in range(end)]
    _cuda_nested_im_event_begin = [cuda.event() for i in range(end)]
    _cuda_nested_im_event_end = [cuda.event() for i in range(end)]
    for coarse_idx in range(1, end + 1):
        t += self.dT
        # Slot of this coarse step inside the rolling device window
        # (slot 0 holds the previous window's final state).
        idx_in_dev_arr = (coarse_idx - 1) % self.cDtoH_freq + 1
        if not fused:
            # Separate diffuse and price launches, each timed by events.
            _cuda_bulk_diffuse_event_begin[coarse_idx - 1].record(stream=self.stream)
            self.cuda_bulk_diffuse(
                idx_in_dev_arr, t, self.d_X, self.d_def_indicators,
                self.d_dom_rate_integral, self.d_spread_integrals,
                self.d_irs_f32, self.d_irs_i32, self.d_exp_1,
                self.d_rng_states, self.dt, self.max_coarse_per_reset)
            _cuda_bulk_diffuse_event_end[coarse_idx - 1].record(stream=self.stream)
            _cuda_compute_mtm_event_begin[coarse_idx - 1].record(stream=self.stream)
            self.cuda_compute_mtm(
                idx_in_dev_arr, t, self.d_X, self.d_mtm_by_cpty,
                self.d_cash_flows_by_cpty, self.d_vanillas_on_fx_f32,
                self.d_vanillas_on_fx_i32, self.d_vanillas_on_fx_b8,
                self.d_irs_f32, self.d_irs_i32, self.d_zcs_f32,
                self.d_zcs_i32, self.dt, self.max_coarse_per_reset,
                self.cDtoH_freq, False)
            _cuda_compute_mtm_event_end[coarse_idx - 1].record(stream=self.stream)
        else:
            _cuda_bulk_diffuse_event_begin[coarse_idx - 1].record(stream=self.stream)
            # Fused kernel handles an entire device window at once, so it
            # only launches on the first step of each window.
            if idx_in_dev_arr == 1:
                self.cuda_diffuse_and_price(
                    1, self.cDtoH_freq, t, self.d_X,
                    self.d_dom_rate_integral, self.d_spread_integrals,
                    self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
                    self.d_irs_f32, self.d_irs_i32,
                    self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
                    self.d_vanillas_on_fx_b8, self.d_rng_states, self.dt,
                    self.max_coarse_per_reset, self.cDtoH_freq)
                self.cuda_oversimulate_defs(1, self.cDtoH_freq,
                                            self.d_def_indicators,
                                            self.d_spread_integrals,
                                            self.d_exp_1)
            _cuda_bulk_diffuse_event_end[coarse_idx - 1].record(stream=self.stream)
        if nested_cva_at is not None:
            _cuda_nested_cva_event_begin[coarse_idx - 1].record(stream=self.stream)
            if coarse_idx in nested_cva_at:
                self.cuda_nested_cva(
                    idx_in_dev_arr, self.num_coarse_steps - coarse_idx, t,
                    self.d_X, self.d_def_indicators,
                    self.d_dom_rate_integral, self.d_spread_integrals,
                    self.d_mtm_by_cpty, self.d_cash_flows_by_cpty,
                    self.d_irs_f32, self.d_irs_i32,
                    self.d_vanillas_on_fx_f32, self.d_vanillas_on_fx_i32,
                    self.d_vanillas_on_fx_b8, self.d_exp_1,
                    self.d_rng_states, self.dt, self.cDtoH_freq,
                    indicator_in_cva, self.d_nested_cva,
                    self.d_nested_cva_sq)
                self.d_nested_cva.copy_to_host(
                    ary=self.nested_cva[coarse_idx], stream=self.stream)
                self.d_nested_cva_sq.copy_to_host(
                    ary=self.nested_cva_sq[coarse_idx], stream=self.stream)
            _cuda_nested_cva_event_end[coarse_idx - 1].record(stream=self.stream)
        if nested_im_at is not None:
            _cuda_nested_im_event_begin[coarse_idx - 1].record(stream=self.stream)
            if coarse_idx in nested_im_at:
                # Adam-style stochastic-approximation iterations with a
                # decaying step size lam * k**(-gamma).
                for adam_iter in range(self.num_adam_iters):
                    adam_init = adam_iter == 0
                    step_size = self.lam * (adam_iter + 1)**(-self.gamma)
                    self.cuda_nested_im(
                        alpha, adam_init, step_size, idx_in_dev_arr,
                        im_window, t, self.d_X,
                        self.d_mtm_by_cpty[idx_in_dev_arr],
                        self.d_irs_f32, self.d_irs_i32,
                        self.d_vanillas_on_fx_f32,
                        self.d_vanillas_on_fx_i32,
                        self.d_vanillas_on_fx_b8, self.d_rng_states,
                        self.dt, self.d_nested_im_by_cpty,
                        self.d_nested_im_std_by_cpty, self.d_nested_im_m,
                        self.d_nested_im_v, self.adam_b1, self.adam_b2,
                        adam_iter)
                self.d_nested_im_by_cpty.copy_to_host(
                    ary=self.nested_im_by_cpty[coarse_idx],
                    stream=self.stream)
            _cuda_nested_im_event_end[coarse_idx - 1].record(stream=self.stream)
        if coarse_idx % self.cDtoH_freq == 0:
            # Device window is full: flush it to the host arrays...
            self.d_X[self.max_coarse_per_reset:].copy_to_host(
                ary=self.X[coarse_idx - self.cDtoH_freq + 1:coarse_idx + 1],
                stream=self.stream)
            self.d_spread_integrals[1:].copy_to_host(
                ary=self.spread_integrals[coarse_idx - self.cDtoH_freq + 1:coarse_idx + 1],
                stream=self.stream)
            self.d_dom_rate_integral[1:].copy_to_host(
                ary=self.dom_rate_integral[coarse_idx - self.cDtoH_freq + 1:coarse_idx + 1],
                stream=self.stream)
            self.d_def_indicators[1:].copy_to_host(
                ary=self.def_indicators[coarse_idx - self.cDtoH_freq + 1:coarse_idx + 1],
                stream=self.stream)
            self.d_mtm_by_cpty[1:].copy_to_host(
                ary=self.mtm_by_cpty[coarse_idx - self.cDtoH_freq + 1:coarse_idx + 1],
                stream=self.stream)
            self.d_cash_flows_by_cpty[1:].copy_to_host(
                ary=self.cash_flows_by_cpty[coarse_idx - self.cDtoH_freq + 1:coarse_idx + 1],
                stream=self.stream)
            # ...and seed slot 0 with the window's final state so the
            # next window continues from it.
            self.d_X[:self.max_coarse_per_reset].copy_to_device(
                self.d_X[-self.max_coarse_per_reset:], stream=self.stream)
            self.d_spread_integrals[0].copy_to_device(
                self.d_spread_integrals[self.cDtoH_freq], stream=self.stream)
            self.d_dom_rate_integral[0].copy_to_device(
                self.d_dom_rate_integral[self.cDtoH_freq], stream=self.stream)
            self.d_def_indicators[0].copy_to_device(
                self.d_def_indicators[self.cDtoH_freq], stream=self.stream)
    if end % self.cDtoH_freq != 0:
        # Flush the final, partially filled device window.
        start_idx = (end // self.cDtoH_freq) * self.cDtoH_freq + 1
        length = end % self.cDtoH_freq
        self.d_X[self.max_coarse_per_reset:self.max_coarse_per_reset + length].copy_to_host(
            ary=self.X[start_idx:start_idx + length], stream=self.stream)
        self.d_spread_integrals[1:length + 1].copy_to_host(
            ary=self.spread_integrals[start_idx:start_idx + length],
            stream=self.stream)
        self.d_dom_rate_integral[1:length + 1].copy_to_host(
            ary=self.dom_rate_integral[start_idx:start_idx + length],
            stream=self.stream)
        self.d_def_indicators[1:length + 1].copy_to_host(
            ary=self.def_indicators[start_idx:start_idx + length],
            stream=self.stream)
        self.d_mtm_by_cpty[1:length + 1].copy_to_host(
            ary=self.mtm_by_cpty[start_idx:start_idx + length],
            stream=self.stream)
        self.d_cash_flows_by_cpty[1:length + 1].copy_to_host(
            ary=self.cash_flows_by_cpty[start_idx:start_idx + length],
            stream=self.stream)
    if verbose:
        print('Everything was successfully queued!')
    # Wait for every per-step end event before reading timings
    # (events that were never recorded complete immediately).
    for evt_cuda_bulk_diffuse_event, evt_cuda_compute_mtm_event, evt_cuda_nested_cva_event, evt_cuda_nested_im_event in zip(
            _cuda_bulk_diffuse_event_end, _cuda_compute_mtm_event_end,
            _cuda_nested_cva_event_end, _cuda_nested_im_event_end):
        evt_cuda_bulk_diffuse_event.synchronize()
        evt_cuda_compute_mtm_event.synchronize()
        evt_cuda_nested_cva_event.synchronize()
        evt_cuda_nested_im_event.synchronize()
    self.stream.synchronize()
    # Report average timings accumulated from the event pairs.
    if not fused:
        print('cuda_bulk_diffuse average elapsed time per launch: {0} ms'.
              format(
                  round(
                      sum(
                          cuda.event_elapsed_time(evt_begin, evt_end)
                          for evt_begin, evt_end in zip(
                              _cuda_bulk_diffuse_event_begin,
                              _cuda_bulk_diffuse_event_end)) / end, 3)))
        print('compute_mtm average elapsed time per launch: {0} ms'.format(
            round(
                sum(
                    cuda.event_elapsed_time(evt_begin, evt_end)
                    for evt_begin, evt_end in zip(
                        _cuda_compute_mtm_event_begin,
                        _cuda_compute_mtm_event_end)) / end, 3)))
    else:
        print('cuda_diffuse_and_price elapsed time: {0} ms'.format(
            round(
                sum(
                    cuda.event_elapsed_time(evt_begin, evt_end)
                    for evt_begin, evt_end in zip(
                        _cuda_bulk_diffuse_event_begin,
                        _cuda_bulk_diffuse_event_end)), 3)))
    if nested_cva_at is not None:
        print('cuda_nested_cva average elapsed time per launch: {0} ms'.
              format(
                  round(
                      sum(
                          cuda.event_elapsed_time(evt_begin, evt_end)
                          for evt_begin, evt_end in zip(
                              _cuda_nested_cva_event_begin,
                              _cuda_nested_cva_event_end)) /
                      len(nested_cva_at), 3)))
    if nested_im_at is not None:
        print('cuda_nested_im average elapsed time per launch: {0} ms'.
              format(
                  round(
                      sum(
                          cuda.event_elapsed_time(evt_begin, evt_end)
                          for evt_begin, evt_end in zip(
                              _cuda_nested_im_event_begin,
                              _cuda_nested_im_event_end)) /
                      len(nested_im_at), 3)))
    # Discount flows to t=0, accumulate, then compound back to each t.
    # TODO: port this to CUDA
    self.cash_pos_by_cpty = ne.evaluate('c*exp(-r)',
                                        local_dict={
                                            'c': self.cash_flows_by_cpty,
                                            'r': self.dom_rate_integral[:, None, :]
                                        })
    np.cumsum(self.cash_pos_by_cpty, axis=0, out=self.cash_pos_by_cpty)
    self.cash_pos_by_cpty *= np.exp(self.dom_rate_integral[:, None, :])