def getGraphFromEdges_gpu(dest, weight, fe, od, edges, n_edges = None, MAX_TPB = 512, stream = None): """ All input (except MAX_TPB and stream) are device arrays. edges : array with the IDs of the edges that will be part of the new graph n_edges : array of 1 element with the number of valid edges in the edges array; if n_edges < size of edges, the last elements of the edges array are not considered """ # check if number of valid edges was received if n_edges is None: edges_size = edges.size n_edges = cuda.to_device(np.array([edges_size], dtype = np.int32)) else: edges_size = int(n_edges.getitem(0)) # check if a stream was received, if not create one if stream is None: myStream = cuda.stream() else: myStream = stream new_n_edges = edges_size * 2 # allocate memory for new graph ndest = cuda.device_array(new_n_edges, dtype = dest.dtype, stream = myStream) nweight = cuda.device_array(new_n_edges, dtype = weight.dtype, stream = myStream) nfe = cuda.device_array_like(fe, stream = myStream) nod = cuda.device_array_like(od, stream = myStream) # fill new outdegree with zeros vertexGrid = compute_cuda_grid_dim(nod.size, MAX_TPB) memSet[vertexGrid, MAX_TPB, myStream](nod, 0) # count all edges of new array and who they belong to edgeGrid = compute_cuda_grid_dim(edges_size, MAX_TPB) countEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, fe, od, nod) # get new first_edge array from new outdegree nfe.copy_to_device(nod, stream=myStream) ex_prefix_sum_gpu(nfe, MAX_TPB = MAX_TPB, stream = myStream) # copy new first_edge to top_edge to serve as pointer in adding edges top_edge = cuda.device_array_like(nfe, stream = myStream) top_edge.copy_to_device(nfe, stream = myStream) addEdges[edgeGrid, MAX_TPB, myStream](edges, n_edges, dest, weight, fe, od, top_edge, ndest, nweight) del top_edge #del dest, weight, fe, od return ndest, nweight, nfe, nod
def deserialize(cls, deserialize, header, frames): """Called when dask.distributed is performing a deserialization for data of this class. Do not use this directly. It is invoked by dask.distributed. Parameters ---------- deserialize : callable Used to deserialize data that needs further deserialization . header, frames : dict See custom serialization documentation in dask.distributed. Returns ------- obj : Buffer Returns an instance of Buffer. """ # Using IPC? if header['kind'] == 'ipc': ipch = deserialize(header['mem'], frames) # Open IPC handle with ipch as data: # Copy remote data over mem = cuda.device_array_like(data) mem.copy_to_device(data) # Not using IPC else: # Deserialize the numpy array mem = deserialize(header['mem'], frames) mem.flags['WRITEABLE'] = True # XXX: hack for numba to work return Buffer(mem)
def lombscargle( x, y, freqs, precenter=False, normalize=False, ): pgram = cuda.device_array_like(freqs) assert x.ndim == 1 assert y.ndim == 1 assert freqs.ndim == 1 # Check input sizes if x.shape[0] != y.shape[0]: raise ValueError("Input arrays do not have the same size.") y_dot = cuda.device_array(shape=(1, ), dtype=y.dtype) if normalize: cp.dot(y, y, out=y_dot) if precenter: y_in = y - y.mean() else: y_in = y _lombscargle(x, y_in, freqs, pgram, y_dot) return pgram
def few_copies(): inp = np.arange(10000) d_inp = cuda.to_device(inp) d_out = cuda.device_array_like(inp) double[blocks, threadsPerBlock](d_inp, d_out) out = d_out.copy_to_host()
def recall(self, x, how): if not isinstance(x, np.ndarray): x = np.array(x) assert np.size(x) == self._x_size orig_shape = x.shape x = x.reshape((self._x_size, 1)) out = np.empty_like(x) if how == 'm': mem = self._dev_mem_m op = mnn.mat_morph_mul_min_plus elif how == 'w': mem = self._dev_mem_w op = mnn.mat_morph_mul_max_plus else: raise ValueError('Expected how=\'m\' or how=\'w\'') stream = cuda.stream() dev_x = cuda.to_device(x, stream=stream) out_x = cuda.device_array_like(out, stream=stream) op(mem, x, out_x, stream=stream) out_x.copy_to_host(out, stream=stream) stream.synchronize() return out.T.reshape(orig_shape)
def _request_transfer(key, remoteinfo): logger.info("rebuild from: %s for %r", remoteinfo, key) context = zmq.Context() socket = context.socket(zmq.REQ) socket.connect("tcp://{0}:{1}".format(*remoteinfo)) myaddr = _global_addr[0] theiraddr = remoteinfo[0] if myaddr == theiraddr: # Same machine go by IPC logger.info("request by IPC") socket.send(pickle.dumps(('IPC', key))) rcv = socket.recv() ipch = pickle.loads(rcv) # Open IPC and copy to local context with ipch as data: copied = cuda.device_array_like(data) copied.copy_to_device(data) # Release _request_drop(socket, key) return copied else: # Different machine go by NET logger.info("request by NET: %s->%s", theiraddr, myaddr) socket.send(pickle.dumps(('NET', key))) rcv = socket.recv() output = cuda.to_device(pickle.loads(rcv)) # Release _request_drop(socket, key) return output
def _test_shared(self, arr): # Use a kernel that copies via shared memory to check loading and # storing different dtypes with shared memory. All threads in a block # collaborate to load in values, then the output values are written # only by the first thread in the block after synchronization. nelem = len(arr) nthreads = 16 nblocks = int(nelem / nthreads) dt = nps.from_dtype(arr.dtype) @cuda.jit def use_sm_chunk_copy(x, y): sm = cuda.shared.array(nthreads, dtype=dt) tx = cuda.threadIdx.x bx = cuda.blockIdx.x bd = cuda.blockDim.x # Load this block's chunk into shared i = bx * bd + tx if i < len(x): sm[tx] = x[i] cuda.syncthreads() # One thread per block writes this block's chunk if tx == 0: for j in range(nthreads): y[bd * bx + j] = sm[j] d_result = cuda.device_array_like(arr) use_sm_chunk_copy[nblocks, nthreads](arr, d_result) host_result = d_result.copy_to_host() np.testing.assert_array_equal(arr, host_result)
def __init__(self, r_cut, r_buff=0.5, cell_guess=50, n_guess=150): system = Ctx.get_active() if system is None: raise ValueError("No active system!") self.system = system self.cell_guess = cell_guess self.n_guess = n_guess self.r_cut2 = r_cut ** 2 self.r_buff2 = (r_buff / 2) ** 2 self.gpu = system.gpu self.tpb = 64 self.bpg = int(self.system.N // self.tpb + 1) # self.situ_zero = np.zeros(1, dtype=np.int32) self.update_counts = 0 self.dist_funcs = {} self.cu_nlist, self.cu_check_build = self._gen_func() with cuda.gpus[self.gpu]: self.p_n_max = cuda.pinned_array((1,), dtype=np.int32) self.p_situation = cuda.pinned_array((1,), dtype=np.int32) self.d_last_x = cuda.device_array_like(self.system.d_x) self.d_n_max = cuda.device_array(1, dtype=np.int32) self.d_nl = cuda.device_array((self.system.N, self.n_guess), dtype=np.int32) self.d_nc = cuda.device_array((self.system.N,), dtype=np.int32) self.d_situation = cuda.device_array(1, dtype=np.int32) self.clist = clist(r_cut, r_buff, cell_guess=self.cell_guess) self.neighbour_list() self.system.nlist = self # register to system
def find_last(arr, val, compare="eq"): """ Returns the index of the last occurrence of *val* in *arr*. Or the last occurrence of *arr* *compare* *val*, if *compare* is not eq Otherwise, returns -1. Parameters ---------- arr : device array val : scalar compare: str ('gt', 'lt', or 'eq' (default)) """ found = cuda.device_array_like(arr) if found.size > 0: if compare == "gt": gpu_mark_gt.forall(found.size)(arr, val, found, -1) elif compare == "lt": gpu_mark_lt.forall(found.size)(arr, val, found, -1) else: if arr.dtype in ("float32", "float64"): gpu_mark_found_float.forall(found.size)(arr, val, found, -1) else: gpu_mark_found_int.forall(found.size)(arr, val, found, -1) from cudf.core.column import as_column found_col = as_column(found) max_index = found_col.max() return max_index
def __init__(self, r_cut, r_buff=0.5, cell_guess=50): system = Ctx.get_active() if system is None: raise ValueError("Error, Initialize system first!") self.system = system self.ibox = np.asarray(np.floor(system.box / (r_cut + r_buff)), dtype=np.int32) self.n_cell = int(np.multiply.reduce(self.ibox)) self.cell_adj = np.ones(self.system.n_dim, dtype=np.int32) * 3 self.gpu = system.gpu self.tpb = 64 self.bpg = int(self.system.N // self.tpb + 1) self.bpg_cell = int(self.n_cell // self.tpb + 1) self.cell_guess = cell_guess # self.situ_zero = np.zeros(1, dtype=np.int32) global cu_cell_index, cu_cell_map, cu_cell_list cu_cell_index, cu_cell_map, cu_cell_list = _gen_func( system.dtype, system.n_dim) self.p_cell_max = cuda.pinned_array((1, ), dtype=np.int32) with cuda.gpus[self.gpu]: self.d_last_x = cuda.device_array_like(self.system.d_x) self.d_cells = cuda.device_array(self.system.d_x.shape[0], dtype=np.int32) self.d_cell_map = cuda.device_array((self.n_cell, 3**system.n_dim), dtype=np.int32) self.d_ibox = cuda.to_device(self.ibox) self.d_cell_adj = cuda.to_device(self.cell_adj) cu_cell_map[self.bpg_cell, self.tpb](self.d_ibox, self.d_cell_adj, self.d_cell_map) self.d_cell_list = cuda.device_array( (self.n_cell, self.cell_guess, self.system.n_dim + 1), dtype=self.system.dtype) self.d_cell_counts = cuda.device_array(self.n_cell, dtype=np.int32) self.d_cell_max = cuda.device_array(1, dtype=np.int32) self.update()
def test_1d_times(data_len, dtype=np.float32): num_times = 10 title = "Data length = {}, data type = {}".format(data_len, dtype) print(len(title) * "=") print(title) print(len(title) * "=") print("\ntesting 1d") t = time.time() arr1, arr2 = np.random.randn(2, data_len).astype(dtype) d_arr1 = cuda.to_device(arr1) d_arr2 = cuda.to_device(arr2) d_result = cuda.device_array_like(d_arr1) print("Data generated in {} seconds".format(time.time() - t)) print("\ntesting multiplication times") t = timeit.repeat((lambda: arr1 * arr2), number=num_times) print("cpu/numpy time = {:.3e}".format(max(t) / num_times)) t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times) print("cuda vectorize time = {:.3e}".format(max(t) / num_times)) t = timeit.repeat((lambda: mult_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times) print("cuda_mult_1d time = {:.3e}".format(max(t) / num_times)) # t = time.time() # mult_gpu_1d[blocksize, 32](d_arr1, d_arr2, d_result) # cuda.synchronize() # print("cuda_mult_1d time = {:.3e}".format(time.time() - t)) print("\ntesting sum times") t = timeit.repeat((lambda: arr1 + arr2), number=num_times) print("cpu/numpy time = {:.3e}".format(max(t) / num_times)) t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times) print("cuda vectorize time = {:.3e}".format(max(t) / num_times)) t = timeit.repeat((lambda: add_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times) print("cuda_add_1d time = {:.3e}".format(max(t) / num_times)) # t = time.time() # add_gpu_1d[blocksize, 32](d_arr1, d_arr2, d_result) # print("cuda_add_1d time = {:.3e}".format(time.time() - t)) print("\ntesting reduction times") t = timeit.repeat((lambda: arr1.sum()), number=num_times) print("cpu/numpy time = {:.3e}".format(max(t) / num_times)) t = timeit.repeat((lambda: add_gpu.reduce(d_arr1)), number=num_times) print("cuda vectorize time = {:.3e}".format(max(t) / num_times)) t = timeit.repeat((lambda: sum_gpu(d_arr1)), number=num_times) print("sum_gpu time = {:.3e}".format(max(t) / num_times))
def scale(in_arr1, scaler): out_arr = cuda.device_array_like(in_arr1) array_len = len(in_arr1) number_of_blocks = (array_len + (number_of_threads - 1)) // number_of_threads scale_kernel[(number_of_blocks, ), (number_of_threads, )](in_arr1, scaler, out_arr, array_len) return out_arr
def compute_stats(arr): """ Returns (mean, variance) """ mu = compute_mean(arr) tmp = cuda.device_array_like(arr) gpu_variance_step.forall(arr.size)(arr, mu, tmp) return mu, compute_mean(tmp)
def abs_arr(in_arr): out_arr = cuda.device_array_like(in_arr) array_len = len(in_arr) number_of_blocks = (array_len + (number_of_threads - 1)) // number_of_threads abs_kernel[(number_of_blocks, ), (number_of_threads, )](in_arr, out_arr, array_len) return out_arr
def division(in_arr1, in_arr2): out_arr = cuda.device_array_like(in_arr1) array_len = len(in_arr1) number_of_blocks = (array_len + (number_of_threads - 1)) // number_of_threads binary_div[(number_of_blocks, ), (number_of_threads, )](in_arr1, in_arr2, out_arr, array_len) return out_arr
def port_money_flow(asset_ind, pp_arr, volume_arr): out_arr = cuda.device_array_like(pp_arr) array_len = len(pp_arr) number_of_blocks = (array_len + (number_of_threads - 1)) // number_of_threads port_moneyflow_kernel[(number_of_blocks, ), (number_of_threads, )](asset_ind, pp_arr, volume_arr, out_arr, array_len) return out_arr
def onbalance_volume(close_arr, volume_arr): out_arr = cuda.device_array_like(close_arr) array_len = len(close_arr) number_of_blocks = (array_len + (number_of_threads - 1)) // number_of_threads onbalance_kernel[(number_of_blocks, ), (number_of_threads, )](close_arr, volume_arr, out_arr, array_len) return out_arr
def main(): inp = np.arange(10_000_000) d_out = cuda.device_array_like(inp) double[blocks, threadsPerBlock](cuda.to_device(inp), d_out) s = sum_reducer(d_out) exp = (len(inp) - 1) * len(inp) assert s == exp
def average_price(high_arr, low_arr, close_arr): out_arr = cuda.device_array_like(high_arr) array_len = len(high_arr) number_of_blocks = (array_len + (number_of_threads - 1)) // number_of_threads average_price_kernel[(number_of_blocks, ), (number_of_threads, )](high_arr, low_arr, close_arr, out_arr, array_len) return out_arr
def copy_array(arr, out=None): if out is None: out = cuda.device_array_like(arr) assert out.size == arr.size if arr.is_c_contiguous() and out.is_c_contiguous(): out.copy_to_device(arr) else: gpu_copy.forall(out.size)(arr, out) return out
def lowhigh_diff(high_arr, low_arr): out_arr = cuda.device_array_like(high_arr) array_len = len(high_arr) number_of_blocks = \ (array_len + (number_of_threads - 1)) // number_of_threads lowhigh_diff_kernel[(number_of_blocks, ), (number_of_threads, )](high_arr, low_arr, out_arr, array_len) return out_arr
def port_true_range(asset_indicator, high_arr, low_arr, close_arr): out_arr = cuda.device_array_like(high_arr) array_len = len(high_arr) number_of_blocks = (array_len + (number_of_threads - 1)) // number_of_threads port_true_range_kernel[(number_of_blocks, ), (number_of_threads, )](asset_indicator, high_arr, low_arr, close_arr, out_arr, array_len) return out_arr
def compute_signal(signal): signal_arr = signal.data.to_gpu_array() out_arr = cuda.device_array_like(signal_arr) number_of_threads = 256 array_len = len(signal) number_of_blocks = (array_len + (number_of_threads - 1)) // number_of_threads signal_kernel[(number_of_blocks, ), (number_of_threads, )](signal_arr, out_arr, array_len) return out_arr
def recode(data, recode_table, na_value): """Recode data with the given recode table. And setting out-of-range values to *na_value* """ newdata = cuda.device_array_like(data) recode_table = to_device(recode_table) blksz = 32 * 4 blkct = min(16, max(1, data.size // blksz)) gpu_recode[blkct, blksz](newdata, data, recode_table, na_value) return newdata
def test_get_ipc_handle(self): # We don't attempt to close the IPC handle in this test because Numba # will be expecting a real IpcHandle object to have been returned from # get_ipc_handle, and it would cause problems to do so. arr = np.arange(2) d_arr = cuda.device_array_like(arr) ipch = d_arr.get_ipc_handle() ctx = cuda.current_context() self.assertTrue(ctx.memory_manager.get_ipc_handle_called) self.assertIn("Dummy IPC handle for alloc 1", ipch._ipc_handle)
def testProfile(): #rvecs = np.ones((NV, N), dtype=np.float32) #n=0 #ss=[] finalarr=[] t_start = timeit.default_timer() for i in range(0,k_num): sums = numpy.zeros(len(arr[i].getMat()), dtype=numpy.float64) #sums1 = numpy.zeros(NV, dtype=numpy.float64) # sums2 = numpy.zeros(NV, dtype=numpy.float64) if(i==k_num-1): sums1 = numpy.zeros(len(arr[i].getMat()), dtype=numpy.float64) d_rvecs1 = cuda.to_device(arr[i].getMat()) d_sums1 = cuda.device_array_like(sums1) vec_sum_row[len(arr[i].getMat()), threadsperblock](d_rvecs1,d_sums1) ss=d_sums1.copy_to_host(sums1) for value in ss: finalarr.append(value) # print(matrix) else: d_rvecs = cuda.to_device(arr[i].getMat()) d_sums = cuda.device_array_like(sums) t_start = timeit.default_timer() vec_sum_row[len(arr[i].getMat()), threadsperblock](d_rvecs,d_sums) t_end = timeit.default_timer() vv=d_sums.copy_to_host(sums) for value in vv: finalarr.append(value) #print(vv) # f=ss+vv # t_end = timeit.default_timer() #print(finalarr) print('gpu took ' + str(t_end - t_start) + ' seconds') # df = pd.DataFrame(finalarr) # df.to_csv('terminator.csv', index=False) # # return finalarr
def _test_device_array_like_same(self, d_a): """ Tests of device_array_like where shape, strides, dtype, and flags should all be equal. """ d_a_like = cuda.device_array_like(d_a) self.assertEqual(d_a.shape, d_a_like.shape) self.assertEqual(d_a.strides, d_a_like.strides) self.assertEqual(d_a.dtype, d_a_like.dtype) self.assertEqual(d_a.flags['C_CONTIGUOUS'], d_a_like.flags['C_CONTIGUOUS']) self.assertEqual(d_a.flags['F_CONTIGUOUS'], d_a_like.flags['F_CONTIGUOUS'])
def add(a, b): a_device = cuda.to_device(a) b_device = cuda.to_device(b) gpu_result = cuda.device_array_like(a) threads_per_block = (16, 16) blocks_per_grid_x = int(math.ceil(a.shape[0] / threads_per_block[0])) blocks_per_grid_y = int(math.ceil(b.shape[1] / threads_per_block[1])) blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y) _add[blocks_per_grid, threads_per_block](a_device, b_device, gpu_result) result = gpu_result.copy_to_host() return result
def test_transpose_bool(self): for rows, cols in self.small_variants: with self.subTest(rows=rows, cols=cols): arr = np.random.randint(2, size=(rows, cols), dtype=np.bool_) transposed = arr.T d_arr = cuda.to_device(arr) d_transposed = cuda.device_array_like(transposed) transpose(d_arr, d_transposed) host_transposed = d_transposed.copy_to_host() np.testing.assert_array_equal(transposed, host_transposed)
def moving_average_signal(stock_df, n_fast, n_slow): ma_slow = ci.moving_average(stock_df['close'], n_slow).to_gpu_array() ma_fast = ci.moving_average(stock_df['close'], n_fast).to_gpu_array() out_arr = cuda.device_array_like(ma_fast) array_len = len(ma_slow) number_of_threads = 256 number_of_blocks = (array_len + (number_of_threads - 1)) // number_of_threads moving_average_signal_kernel[(number_of_blocks, ), (number_of_threads, )](ma_fast, ma_slow, out_arr, array_len) return out_arr, ma_slow, ma_fast
def Compute(self, ClassImage_GPU): self.ThreadBlockRelax() RelaxImage_GPU = cuda.device_array_like(ClassImage_GPU) CompatMatrix_GPU = cuda.to_device(self.CompatMatrix) class_dimGPU = cuda.to_device(np.array(classImg_dim, dtype=np.int16)) self.relaxLabel[self.BlocksPerGridRelax, self.ThreadsPerBlockRelax](RelaxImage_GPU, ClassImage_GPU, CompatMatrix_GPU, class_dimGPU) self.GPURelax = RelaxImage_GPU.copy_to_host() return self.GPURelax, RelaxImage_GPU
def run(self, arr, k): if k >= MAX_FAST_UNIQUE_K: raise NotImplementedError('k >= {}'.format(MAX_FAST_UNIQUE_K)) # setup mem outsz_ptr = cuda.device_array(shape=1, dtype=np.intp) out = cuda.device_array_like(arr) # kernel self._kernel[1, 64](arr, k, out, outsz_ptr) # copy to host unique_ct = outsz_ptr.copy_to_host()[0] if unique_ct < 0: raise ValueError('too many unique value (hint: increase k)') else: hout = out.copy_to_host() return hout[:unique_ct]
def task2(): a = numpy.float32(2.0) # Force value to be float32 x = numpy.arange(NELEM, dtype="float32") y = numpy.arange(NELEM, dtype="float32") ### Task2 ### # a) Complete the memory transfer for x -> dx, y -> dy # b) Allocate device memory for dout # c) Transfer for out <- dout dx = cuda.to_device(x) dy = cuda.to_device(y) dout = cuda.device_array_like(x) griddim = NUM_BLOCKS blockdim = NUM_THREADS saxpy[griddim, blockdim](a, dx, dy, dout) out = dout.copy_to_host() print("out =", out) if numpy.allclose(a * x + y, out): print("Correct result") else: print("Incorrect result")
aryA = np.arange(BLOCKSIZE * BLOCKCOUNT, dtype=np.float32) print('data size: %.1fMB' % (aryA.size * aryA.dtype.itemsize / (2**20))) evt_total_begin = cuda.event() evt_total_end = cuda.event() evt_kernel_begin = cuda.event() evt_kernel_end = cuda.event() t_total_begin = timer() evt_total_begin.record() # explicity tranfer memory d_aryA = cuda.to_device(aryA) d_aryB = cuda.device_array_like(aryA) evt_kernel_begin.record() t_kernel_begin = timer() cu_copy_array[BLOCKCOUNT, BLOCKSIZE](d_aryB, d_aryA) t_kernel_end = timer() evt_kernel_end.record() aryB = d_aryB.copy_to_host() evt_total_end.record() evt_total_end.synchronize() t_total_end = timer()
def fillna(data, mask, value): out = cuda.device_array_like(data) out.copy_to_device(data) configured = gpu_fill_masked.forall(data.size) configured(value, mask, out) return out
def connected_comps_gpu(dest_in, weight_in, firstEdge_in, outDegree_in, MAX_TPB = 512, stream = None): if stream is None: myStream = cuda.stream() else: myStream = stream dest = cuda.to_device(dest_in, stream = myStream) weight = cuda.to_device(weight_in, stream = myStream) firstEdge = cuda.to_device(firstEdge_in, stream = myStream) outDegree = cuda.to_device(outDegree_in, stream = myStream) n_vertices = firstEdge.size n_edges = dest.size n_components = n_vertices # still need edge_id for conflict resolution in find_minedge edge_id = cuda.to_device(np.arange(n_edges, dtype = dest.dtype), stream = myStream) #labels = np.empty(n_vertices, dtype = dest.dtype) first_iter = True # initialize with name top_edge so we can recycle an array between iterations top_edge = cuda.device_array(n_components, dtype = dest.dtype, stream = myStream) labels = cuda.device_array(n_components, dtype = dest.dtype, stream = myStream) converged = cuda.device_array(1, dtype = np.int8, stream = myStream) gridDimLabels = compute_cuda_grid_dim(n_components, MAX_TPB) gridDim = compute_cuda_grid_dim(n_components, MAX_TPB) final_converged = False while(not final_converged): vertex_minedge = top_edge findMinEdge_CUDA[gridDim, MAX_TPB, myStream](weight, firstEdge, outDegree, vertex_minedge, dest) removeMirroredEdges_CUDA[gridDim, MAX_TPB, myStream](dest, vertex_minedge) colors = cuda.device_array(shape = n_components, dtype = np.int32, stream = myStream) initializeColors_CUDA[gridDim, MAX_TPB, myStream](dest, vertex_minedge, colors) # propagate colors until convergence propagateConverged = False while(not propagateConverged): propagateColors_CUDA[gridDim, MAX_TPB, myStream](colors, converged) converged_num = converged.getitem(0, stream = myStream) propagateConverged = True if converged_num == 1 else False # first we build the flags in the new_vertex array new_vertex = vertex_minedge # reuse the vertex_minedge array as the new new_vertex buildFlag_CUDA[gridDim, MAX_TPB, myStream](colors, new_vertex) # new_n_vertices is the number of vertices of the new contracted graph new_n_vertices = ex_prefix_sum_gpu(new_vertex, MAX_TPB = MAX_TPB, stream = myStream).getitem(0, stream = myStream) new_n_vertices = int(new_n_vertices) if first_iter: # first iteration defines labels as the initial colors and updates labels.copy_to_device(colors, stream = myStream) first_iter = False # other iterations update the labels with the new colors update_labels_single_pass_cuda[gridDimLabels, MAX_TPB, myStream](labels, colors, new_vertex) if new_n_vertices == 1: final_converged = True del new_vertex break newGridDim = compute_cuda_grid_dim(n_components, MAX_TPB) # count number of edges for new supervertices and write in new outDegree newOutDegree = cuda.device_array(shape = new_n_vertices, dtype = np.int32, stream = myStream) memSet[newGridDim, MAX_TPB, myStream](newOutDegree, 0) # zero the newOutDegree array countNewEdges_CUDA[gridDim, MAX_TPB, myStream](colors, firstEdge, outDegree, dest, new_vertex, newOutDegree) # new first edge array for contracted graph newFirstEdge = cuda.device_array_like(newOutDegree, stream = myStream) # copy newOutDegree to newFirstEdge newFirstEdge.copy_to_device(newOutDegree, stream = myStream) new_n_edges = ex_prefix_sum_gpu(newFirstEdge, MAX_TPB = MAX_TPB, stream = myStream) new_n_edges = new_n_edges.getitem(0, stream = myStream) new_n_edges = int(new_n_edges) # if no edges remain, then MST has converged if new_n_edges == 0: final_converged = True del newOutDegree, newFirstEdge, new_vertex break # create arrays for new edges new_dest = cuda.device_array(new_n_edges, dtype = np.int32, stream = myStream) new_edge_id = cuda.device_array(new_n_edges, dtype = np.int32, stream = myStream) new_weight = cuda.device_array(new_n_edges, dtype = weight.dtype, stream = myStream) top_edge = cuda.device_array_like(newFirstEdge, stream = myStream) top_edge.copy_to_device(newFirstEdge, stream = myStream) # assign and insert new edges assignInsert_CUDA[gridDim, MAX_TPB, myStream](edge_id, dest, weight, firstEdge, outDegree, colors, new_vertex, new_dest, new_edge_id, new_weight, top_edge) # delete old graph del new_vertex, edge_id, dest, weight, firstEdge, outDegree, colors # write new graph n_components = newFirstEdge.size edge_id = new_edge_id dest = new_dest weight = new_weight firstEdge = newFirstEdge outDegree = newOutDegree gridDim = newGridDim returnLabels = labels.copy_to_host() del dest, weight, edge_id, firstEdge, outDegree, converged, labels return returnLabels
def main(): # device = cuda.get_current_device() # maxtpb = device.MAX_THREADS_PER_BLOCK # warpsize = device.WARP_SIZE maxtpb = 512 warpsize = 32 # benchmark loop vary_warpsize = [] baseline = [] ilpx2 = [] ilpx4 = [] ilpx8 = [] # For OSX 10.8 where the GPU is used for graphic as well, # increasing the following to 10 * 2 ** 20 seems to be necessary to # produce consistent result. approx_data_size = 1.5 * 2**20 for multiplier in range(1, maxtpb // warpsize + 1): blksz = warpsize * multiplier gridsz = ceil_to_nearest(float(approx_data_size) / blksz, 8) print('kernel config [%d, %d]' % (gridsz, blksz)) N = blksz * gridsz A = np.arange(N, dtype=np.float32) B = np.arange(N, dtype=np.float32) print('data size %dMB' % (N / 2.**20 * A.dtype.itemsize)) dA = cuda.to_device(A) dB = cuda.to_device(B) assert float(N) / blksz == gridsz, (float(N) / blksz, gridsz) vary_warpsize.append(blksz) dC = cuda.device_array_like(A) basetime = time_this(vec_add, gridsz, blksz, (dA, dB, dC)) expected_result = dC.copy_to_host() if basetime > 0: baseline.append(N / basetime) dC = cuda.device_array_like(A) x2time = time_this(vec_add_ilp_x2, gridsz//2, blksz, (dA, dB, dC)) np.testing.assert_allclose(expected_result, dC.copy_to_host()) if x2time > 0: ilpx2.append(N / x2time) dC = cuda.device_array_like(A) x4time = time_this(vec_add_ilp_x4, gridsz//4, blksz, (dA, dB, dC)) np.testing.assert_allclose(expected_result, dC.copy_to_host()) if x4time > 0: ilpx4.append(N / x4time) dC = cuda.device_array_like(A) x8time = time_this(vec_add_ilp_x8, gridsz//8, blksz, (dA, dB, dC)) np.testing.assert_allclose(expected_result, dC.copy_to_host()) if x8time > 0: ilpx8.append(N / x8time) pylab.plot(vary_warpsize[:len(baseline)], baseline, label='baseline') pylab.plot(vary_warpsize[:len(ilpx2)], ilpx2, label='ILP2') pylab.plot(vary_warpsize[:len(ilpx4)], ilpx4, label='ILP4') pylab.plot(vary_warpsize[:len(ilpx8)], ilpx8, label='ILP8') pylab.legend(loc=4) pylab.title(cuda.get_current_device().name) pylab.xlabel('block size') pylab.ylabel('float per second') pylab.show()
from timeit import default_timer as timer import numpy as np from numba import vectorize, float32, cuda src = np.arange(10 ** 7, dtype=np.float32) dst = np.empty_like(src) @vectorize([float32(float32)], target='cuda') def copy_kernel(src): return src # Regular memory transfer ts = timer() d_src = cuda.to_device(src) d_dst = cuda.device_array_like(dst) copy_kernel(d_src, out=d_dst) d_dst.copy_to_host(dst) te = timer() print('regular', te - ts) del d_src, d_dst assert np.allclose(dst, src) # Pinned (pagelocked) memory transfer with cuda.pinned(src, dst):