def start(self, A_matrix):
    """Decomposes A_matrix into two matrices L and U.

    @param A_matrix Coefficient matrix.
    @return float64[:,:], float64[:,:]
    """
    A = A_matrix.flatten()
    L = np.zeros_like(A)
    rows = len(A_matrix)
    columns = len(A_matrix)
    tpb = 32
    matrix_size = rows * columns
    with cuda.pinned(A, L):
        stream = cuda.stream()
        gpu_A = cuda.to_device(A, stream=stream)
        gpu_L = cuda.to_device(L, stream=stream)
        bpg = 1
        for i in range(0, rows):
            self.gaussian_lu_decomposition[(bpg, bpg), (tpb, tpb)](gpu_A, gpu_L,
                                                                   rows, i)
        gpu_A.copy_to_host(A, stream)
        gpu_L.copy_to_host(L, stream)
    U = A.reshape(rows, columns)
    L = L.reshape(rows, columns)
    del stream
    return L, U
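# A short driver sketch for the LU decomposition above. The owning class name
# (LUSolver here) is hypothetical; only the start() method is shown in the snippet.
import numpy as np

solver = LUSolver()
A = np.array([[4.0, 3.0],
              [6.0, 3.0]])
L, U = solver.start(A)
# L @ U should reproduce A up to floating-point error.
assert np.allclose(L @ U, A)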
def start(self, A_matrix, b_vector):
    """Launches parallel Gauss-Jordan elimination for a SLAE and returns its answer.

    @param A_matrix Coefficient matrix of a SLAE.
    @param b_vector Linearly independent vector of a SLAE.
    @return float64[:]
    """
    if 0 in A_matrix.diagonal():
        return None
    b = b_vector.reshape(len(b_vector), 1)
    A = np.hstack((A_matrix, b))
    A = A.flatten()
    n = len(b)
    tpb = 32  # threads per block; not defined in the original snippet, assumed to match the other solvers
    with cuda.pinned(A):
        stream = cuda.stream()
        gpu_A = cuda.to_device(A, stream=stream)
        bpg = 1
        for i in range(0, n):
            self.gauss_jordan[(bpg, bpg), (tpb, tpb)](gpu_A, n, i)
        self.normalize[(bpg, bpg), (tpb, tpb)](gpu_A, n)
        gpu_A.copy_to_host(A, stream)
    x = A.reshape(n, (n + 1))[:, n]
    if np.isnan(x).any() or np.isinf(x).any():
        return None
    else:
        return x
def start(A_matrix):
    A = A_matrix.flatten()
    L = np.zeros_like(A)
    U = np.zeros_like(A)
    rows = len(A_matrix)
    columns = len(A_matrix)
    tpb = 32
    n = rows * columns
    with cuda.pinned(A, L, U):
        stream = cuda.stream()
        gpu_A = cuda.to_device(A, stream=stream)
        gpu_L = cuda.to_device(L, stream=stream)
        gpu_U = cuda.to_device(U, stream=stream)
        bpg = (n + tpb - 1) // tpb  # ceiling division; the original 'n + (tpb - 1) // tpb' misplaced the parentheses
        crout[(bpg, bpg), (tpb, tpb)](gpu_A, gpu_L, gpu_U, rows)
        gpu_L.copy_to_host(L, stream)
        gpu_U.copy_to_host(U, stream)
    L = L.reshape(rows, columns)
    U = U.reshape(rows, columns)
    print(L)
    print(U)
    print(np.matmul(L, U))
def test_pinned(self):
    A = np.arange(2 * 1024 * 1024)  # 16 MB
    total = 0
    with cuda.pinned(A):
        for i in range(REPEAT):
            total += self._template('pinned', A)
    print('pinned', total / REPEAT)
def test_pinned(self):
    machine = platform.machine()
    if machine.startswith('arm') or machine.startswith('aarch64'):
        count = 262144   # 2 MB
    else:
        count = 2097152  # 16 MB
    A = np.arange(count)
    with cuda.pinned(A):
        self._run_copies(A)
def test_pinned_contextmanager(self):
    # Check that temporarily pinned memory is unregistered immediately,
    # such that it can be re-pinned at any time
    class PinnedException(Exception):
        pass

    arr = np.zeros(1)
    ctx = cuda.current_context()
    ctx.deallocations.clear()
    with self.check_ignored_exception(ctx):
        with cuda.pinned(arr):
            pass
        with cuda.pinned(arr):
            pass

        # Should also work inside a `defer_cleanup` block
        with cuda.defer_cleanup():
            with cuda.pinned(arr):
                pass
            with cuda.pinned(arr):
                pass

        # Should also work when breaking out of the block due to an exception
        try:
            with cuda.pinned(arr):
                raise PinnedException
        except PinnedException:
            with cuda.pinned(arr):
                pass
def _send_arrays_to_gpu(self, arrays_to_transfer, n_gpu_arrays_needed):
    gpu_arrays = []
    stream = cuda.stream()
    self.streams.append(stream)
    with cuda.pinned(*arrays_to_transfer):
        for arr in arrays_to_transfer:
            try:
                gpu_array = cuda.to_device(arr, stream)
            except cuda.cudadrv.driver.CudaAPIError:
                print_memory_info_after_transfer_failure(arr, n_gpu_arrays_needed)
                self.clear_cuda_memory(gpu_arrays)
                return []
            gpu_arrays.append(gpu_array)
    return gpu_arrays
def corr_FD(x1, x2):
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(x1.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Trigger initialization of the cuFFT system.
    # This takes significant time for a small dataset.
    # We should not be including the time wasted here.
    # ft.FFTPlan(shape=x1.shape, itype=np.float32, otype=np.complex64)

    X1 = x1.astype(np.float32)
    X2 = x2.astype(np.float32)

    stream1 = cuda.stream()
    stream2 = cuda.stream()

    fftplan1 = ft.FFTPlan(shape=x1.shape, itype=np.float32,
                          otype=np.complex64, stream=stream1)
    fftplan2 = ft.FFTPlan(shape=x2.shape, itype=np.float32,
                          otype=np.complex64, stream=stream2)

    # Page-lock memory
    with cuda.pinned(X1, X2):
        # We can overlap the transfer of X2 with the forward FFT on X1.
        d_X1 = cuda.to_device(X1, stream=stream1)
        d_X2 = cuda.to_device(X2, stream=stream2)
        fftplan1.forward(d_X1, out=d_X1)
        fftplan2.forward(d_X2, out=d_X2)
        print('d_X1 is ', np.shape(d_X1), type(d_X1), np.max(d_X1))
        print('d_X2 is ', np.shape(d_X2), type(d_X2), np.max(d_X2))
        stream2.synchronize()
        mult_inplace[blockpergrid, threadperblock, stream1](d_X1, d_X2)
        fftplan1.inverse(d_X1, out=d_X1)

    # copy_to_host() implicitly synchronizes the streams
    c = d_X1.copy_to_host().real / np.prod(x1.shape)
    return c
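# corr_FD above launches mult_inplace and calls best_grid_size, neither of which is
# part of the snippet. The definitions below are a sketch of compatible helpers,
# inferred from how they are launched (2D grid, elementwise in-place multiply);
# they are assumptions, not the original code.
import numpy as np
from numba import cuda

@cuda.jit
def mult_inplace(a, b):
    # Elementwise in-place multiply on a 2D grid: x spans columns, y spans rows.
    x, y = cuda.grid(2)
    if y < a.shape[0] and x < a.shape[1]:
        a[y, x] *= b[y, x]

def best_grid_size(size, tpb):
    # Ceil-divide each launch dimension (width, height) by the block shape.
    return tuple((s + t - 1) // t for s, t in zip(size, tpb))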
def driver(pricer, pinned=False):
    paths = np.zeros((NumPath, NumStep + 1), order='F')
    paths[:, 0] = StockPrice
    DT = Maturity / NumStep

    if pinned:
        from numba import cuda
        with cuda.pinned(paths):
            ts = timer()
            pricer(paths, DT, InterestRate, Volatility)
            te = timer()
    else:
        ts = timer()
        pricer(paths, DT, InterestRate, Volatility)
        te = timer()

    ST = paths[:, -1]
    PaidOff = np.maximum(paths[:, -1] - StrikePrice, 0)
    print('Result')
    fmt = '%20s: %s'
    print(fmt % ('stock price', np.mean(ST)))
    print(fmt % ('standard error', np.std(ST) / sqrt(NumPath)))
    print(fmt % ('paid off', np.mean(PaidOff)))
    optionprice = np.mean(PaidOff) * exp(-InterestRate * Maturity)
    print(fmt % ('option price', optionprice))

    print('Performance')
    NumCompute = NumPath * NumStep
    print(fmt % ('Mstep/second', '%.2f' % (NumCompute / (te - ts) / 1e6)))
    print(fmt % ('time elapsed', '%.3fs' % (te - ts)))

    if '--plot' in sys.argv:
        from matplotlib import pyplot
        pathct = min(NumPath, 100)
        for i in range(pathct):  # was xrange (Python 2)
            pyplot.plot(paths[i])
        print('Plotting %d/%d paths' % (pathct, NumPath))
        pyplot.show()
def driver(MCUDA, pinned=False):
    paths = np.zeros((Number_of_Paths, Number_of_Steps + 1), order='F')
    paths[:, 0] = SP
    Delta_T = Maturity / Number_of_Steps

    if pinned:
        with cuda.pinned(paths):
            time_s = timer()
            MCUDA(paths, Delta_T, IR, Beta)
            time_a = timer()
    else:
        time_s = timer()
        MCUDA(paths, Delta_T, IR, Beta)
        time_a = timer()

    stk = paths[:, -1]
    pOff = np.maximum(paths[:, -1] - K, 0)
    o_Price = np.mean(pOff) * exp(Maturity * -IR)
    print('error ', np.std(stk) / sqrt(Number_of_Paths))
    print('payoff ', np.mean(pOff))
    print('Option Price', o_Price)
    print('Run Time')
    print(time_a - time_s)
def start(self, A_matrix, b_matrix):
    """Launches parallel Gaussian elimination for a SLAE and returns its answer.

    @param A_matrix Coefficient matrix of a SLAE.
    @param b_matrix Linearly independent vector of a SLAE.
    @return float64[:] (or None if the system cannot be solved)
    """
    if 0 in A_matrix.diagonal():
        return None
    b = b_matrix.reshape(len(b_matrix), 1)
    A = np.hstack((A_matrix, b))
    A = A.flatten()
    n = len(b)
    tpb = 32  # threads per block; not defined in the original snippet, assumed to match the other solvers
    with cuda.pinned(A):
        stream = cuda.stream()
        gpu_A = cuda.to_device(A, stream=stream)
        bpg = 1
        for i in range(0, n):
            self.gaussian_elimination[(bpg, bpg), (tpb, tpb)](gpu_A, n, i)
        gpu_A.copy_to_host(A, stream)
    # Restore A and b from the augmented matrix Ab
    b = A.reshape(n, (n + 1))[:, n]
    A = A.reshape(n, (n + 1))[..., :-1]
    x = substitution.back_substitution(A, b)
    if np.isnan(x).any() or np.isinf(x).any():
        return None
    else:
        return x
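# The snippet above relies on substitution.back_substitution, which is not included
# here. A plain NumPy sketch of upper-triangular back substitution consistent with
# that call site (an assumption about its behaviour, not the original module):
import numpy as np

def back_substitution(U, y):
    # Solve U x = y for an upper-triangular matrix U, working from the last row up.
    n = len(y)
    x = np.zeros(n, dtype=np.float64)
    for i in range(n - 1, -1, -1):
        x[i] = (y[i] - U[i, i + 1:] @ x[i + 1:]) / U[i, i]
    return x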
d_src = cuda.to_device(src)
d_dst = cuda.device_array_like(dst)

copy_kernel(d_src, out=d_dst)

d_dst.copy_to_host(dst)
te = timer()
print('regular', te - ts)

del d_src, d_dst

assert np.allclose(dst, src)

# Pinned (page-locked) memory transfer
with cuda.pinned(src, dst):
    ts = timer()
    stream = cuda.stream()  # use a stream to trigger async memory transfer
    d_src = cuda.to_device(src, stream=stream)
    d_dst = cuda.device_array_like(dst, stream=stream)

    copy_kernel(d_src, out=d_dst, stream=stream)

    d_dst.copy_to_host(dst, stream=stream)
    stream.synchronize()
    te = timer()
    print('pinned', te - ts)

assert np.allclose(dst, src)
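# The timing snippet above uses copy_kernel, timer, src, and dst without defining
# them. A minimal stand-in that matches the call sites (a CUDA ufunc taking out= and
# stream= keyword arguments) could look like this; it is an assumption, not the
# original setup code.
import numpy as np
from timeit import default_timer as timer
from numba import vectorize

@vectorize(['float32(float32)'], target='cuda')
def copy_kernel(val):
    # Identity ufunc: copies the input array into 'out' on the device.
    return val

src = np.arange(2 * 1024 * 1024, dtype=np.float32)
dst = np.empty_like(src)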
def test_pinned(self):
    A = np.arange(2 * 1024 * 1024)  # 16 MB
    with cuda.pinned(A):
        self._run_copies(A)
def main():
    # Build Filter
    laplacian_pts = '''
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    '''.split()

    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)

    # Build Image
    try:
        filename = sys.argv[1]
        image = ndimage.imread(filename, flatten=True).astype(np.float32)
    except IndexError:
        image = misc.lena().astype(np.float32)

    print("Image size: %s" % (image.shape,))

    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU
    ts = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    te = timer()
    print('CPU: %.2fs' % (te - ts))

    # GPU
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Trigger initialization of the cuFFT system.
    # This takes significant time for a small dataset.
    # We should not be including the time wasted here.
    FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    # Start GPU timer
    ts = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    stream1 = cuda.stream()
    stream2 = cuda.stream()

    fftplan1 = FFTPlan(shape=image.shape, itype=np.complex64,
                       otype=np.complex64, stream=stream1)
    fftplan2 = FFTPlan(shape=image.shape, itype=np.complex64,
                       otype=np.complex64, stream=stream2)

    # Page-lock memory
    with cuda.pinned(image_complex, response_complex):
        # We can overlap the transfer of response_complex with the forward FFT
        # on image_complex.
        d_image_complex = cuda.to_device(image_complex, stream=stream1)
        d_response_complex = cuda.to_device(response_complex, stream=stream2)

        fftplan1.forward(d_image_complex, out=d_image_complex)
        fftplan2.forward(d_response_complex, out=d_response_complex)
        stream2.synchronize()

        mult_inplace[blockpergrid, threadperblock, stream1](d_image_complex,
                                                            d_response_complex)
        fftplan1.inverse(d_image_complex, out=d_image_complex)

        # copy_to_host() implicitly synchronizes the streams
        cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape)

    te = timer()
    print('GPU: %.2fs' % (te - ts))

    # Plot the results
    plt.subplot(1, 2, 1)
    plt.title('CPU')
    plt.imshow(cvimage_cpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.subplot(1, 2, 2)
    plt.title('GPU')
    plt.imshow(cvimage_gpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.show()
def main():
    # Build Filter
    laplacian_pts = '''
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    '''.split()

    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)

    # Build Image
    try:
        filename = sys.argv[1]
        image = ndimage.imread(filename, flatten=True).astype(np.float32)
    except IndexError:
        image = misc.face(gray=True).astype(np.float32)

    print("Image size: %s" % (image.shape,))

    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU
    ts = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    te = timer()
    print('CPU: %.2fs' % (te - ts))

    # GPU
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Trigger initialization of the cuFFT system.
    # This takes significant time for a small dataset.
    # We should not be including the time wasted here.
    FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    # Start GPU timer
    ts = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    stream1 = cuda.stream()
    stream2 = cuda.stream()

    fftplan1 = FFTPlan(shape=image.shape, itype=np.complex64,
                       otype=np.complex64, stream=stream1)
    fftplan2 = FFTPlan(shape=image.shape, itype=np.complex64,
                       otype=np.complex64, stream=stream2)

    # Page-lock memory
    with cuda.pinned(image_complex, response_complex):
        # We can overlap the transfer of response_complex with the forward FFT
        # on image_complex.
        d_image_complex = cuda.to_device(image_complex, stream=stream1)
        d_response_complex = cuda.to_device(response_complex, stream=stream2)

        fftplan1.forward(d_image_complex, out=d_image_complex)
        fftplan2.forward(d_response_complex, out=d_response_complex)
        stream2.synchronize()

        mult_inplace[blockpergrid, threadperblock, stream1](d_image_complex,
                                                            d_response_complex)
        fftplan1.inverse(d_image_complex, out=d_image_complex)

        # copy_to_host() implicitly synchronizes the streams
        cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape)

    te = timer()
    print('GPU: %.2fs' % (te - ts))

    # Plot the results
    plt.subplot(1, 2, 1)
    plt.title('CPU')
    plt.imshow(cvimage_cpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.subplot(1, 2, 2)
    plt.title('GPU')
    plt.imshow(cvimage_gpu, cmap=plt.cm.gray)
    plt.axis('off')

    plt.show()
def train_tree(dataset, max_depth=6, min_samples_per_node=1, tpb=1024, nstreams=3):
    """
    Trains a tree with our GPU algorithm based on CUDT.

    :param dataset Dataset used to train the tree.
    :param max_depth Maximum depth of the tree.
    :param min_samples_per_node Minimum number of elements a node must hold to keep
        being evaluated.
    :param tpb CUDA threads per block.
    :pre dataset must be a NumPy array whose columns are all np.float32, with the
        labels in the last column. Those labels must correspond to a binary
        classification [0, 1]. tpb must be a valid size for one-dimensional CUDA blocks.
    """
    N, d = dataset.shape
    d -= 1
    values = cuda.device_array((d, N), dtype=np.float32)
    labels = cuda.device_array((d, N), dtype=np.int32)

    # 1.1 Generate the attribute lists and sort them in ascending order of value
    blocks = N // tpb + 1
    dataset = np.ascontiguousarray(dataset.T)
    streams = [cuda.stream() for i in range(3)]
    d_scan = cuda.device_array((d, N), np.float32, stream=streams[2])
    best_flag = cuda.device_array(N, np.bool_, stream=streams[0])  # np.bool_ (np.bool was removed from recent NumPy)
    my_flag = cuda.device_array((d, N), np.bool_, stream=streams[1])
    buffer_int = cuda.device_array((d, N), np.int32, stream=streams[2])
    buffer_int2 = cuda.device_array((d, N), np.int32, stream=streams[0])
    locks = [cuda.device_array(1, np.int32, stream=stream) for stream in streams]
    locks[:][0] = 0
    my_mins = [cuda.device_array(1, np.float32, stream=stream) for stream in streams]
    my_min_idxs = [cuda.device_array(2, np.int32, stream=stream) for stream in streams]
    my_totals = [cuda.device_array(1, dtype=np.int32, stream=stream) for stream in streams]
    address = cuda.device_array((d, N), np.int32, stream=streams[2])
    set_range[(N // tpb + 1, d), tpb, streams[2]](address)

    with cuda.pinned(dataset):
        d_labels = cuda.to_device(dataset[-1], stream=streams[0])
        d_dataset = cuda.to_device(dataset[:-1], stream=streams[1])
        indexes = cp.argsort(cp.array(dataset[:-1]), axis=1)
        indexes = cuda.to_device(indexes, stream=0)
        fill_2d[d * N // tpb + 1, tpb, streams[0]](values, indexes, d_dataset)
        fill_2d_label[d * N // tpb + 1, tpb, streams[1]](labels, indexes, d_labels)
        cuda.synchronize()

    # 1.2 Generate the initial node
    outputs = []
    ActiveNode = collections.namedtuple('ActiveNode', 'idx start end')
    start_node = ActiveNode(idx=0, start=0, end=N)
    active_list = [start_node]

    # 2. Walk down the depth levels
    for current_depth in range(max_depth):
        best_flag[:] = True
        level = {}
        next_active_list = []

        # 2.1 Search for split points
        for i, node in enumerate(active_list):
            n = node.end - node.start
            s = node.start
            e = node.end
            node_tpb = min(max(32, 2 ** math.ceil(math.log2(n))), tpb)
            id_stream = i % 3
            my_stream = streams[id_stream]

            # Pruning criterion: minimum number of elements in a leaf, or last depth level.
            if n == 1:
                level[node.idx] = (False, values[0, node.start])
                continue
            elif n <= min_samples_per_node or current_depth == max_depth - 1:
                my_totals[id_stream][0] = 0
                utils.warp_based_reduce_sum[n // node_tpb + 1, node_tpb, my_stream](
                    labels[0, node.start:node.end], my_totals[id_stream])
                my_total = my_totals[id_stream].copy_to_host(stream=my_stream)[0]
                label = 0 if my_total / n <= 0.5 else 1
                level[node.idx] = (False, label)
                continue
            else:
                # Scan the labels
                aux = cuda.device_array((d, n // node_tpb + 1), dtype=np.float32,
                                        stream=my_stream)
                aux[:] = 0
                my_total = utils.multi_scan_with_gini(labels[:, s:e], d_scan[:, s:e],
                                                      values[:, s:e], my_mins[id_stream],
                                                      aux, node_tpb, my_stream)

                if my_total == 0 or my_total == n:
                    level[node.idx] = (False, my_total)
                    aux[:, :] = 0
                    continue

                blocks = (n // node_tpb + 1, d)
                utils.min_index_reduction[blocks, node_tpb, my_stream](
                    d_scan[:, s:e], my_mins[id_stream], my_min_idxs[id_stream],
                    aux, locks[id_stream])

                my_host_idx = my_min_idxs[id_stream].copy_to_host(stream=my_stream)
                my_attr_list = my_host_idx[0]
                my_index = my_host_idx[1]

                # Set Best Flag to False (0) for the attributes that end up
                # reorganized to the left.
                set_best_flag[n // node_tpb + 1, node_tpb, my_stream](
                    best_flag,
                    indexes[my_attr_list, node.start:node.start + my_index + 1])

                set_my_flag[(n // tpb + 1, d), tpb, my_stream](
                    my_flag[:, s:e], buffer_int[:, s:e], best_flag, indexes[:, s:e])

                utils.multi_scan_with_address(buffer_int[:, s:e], d_scan[:, s:e],
                                              address[:, s:e], my_flag[:, s:e],
                                              aux, node.start, node_tpb, my_stream)

                # Add the new node to the tree output.
                my_values = values[my_attr_list,
                                   node.start + my_index:node.start + my_index + 2
                                   ].copy_to_host(stream=my_stream)
                the_value = (my_values[0] + my_values[1]) / 2
                level[node.idx] = (True, my_attr_list, my_index, the_value)

                # Add the newly generated nodes to the pending list.
                left_node = ActiveNode(idx=2 * node.idx, start=node.start,
                                       end=node.start + my_index + 1)
                right_node = ActiveNode(idx=2 * node.idx + 1,
                                        start=node.start + my_index + 1,
                                        end=node.end)
                next_active_list.append(left_node)
                next_active_list.append(right_node)

        # Append this tree level to the output
        cuda.synchronize()
        outputs.append(level)
        if current_depth == max_depth - 1:
            return outputs

        # 2.2 Reorganize the attribute lists
        fill_buffer[(N // tpb + 1, d), tpb, streams[0]](d_scan, values)
        fill_buffer[(N // tpb + 1, d), tpb, streams[1]](buffer_int, indexes)
        fill_buffer[(N // tpb + 1, d), tpb, streams[2]](buffer_int2, labels)
        fill_2d_b[d * N // tpb + 1, tpb, streams[0]](values, address, d_scan)
        fill_2d_b[d * N // tpb + 1, tpb, streams[1]](indexes, address, buffer_int)
        fill_2d_b[d * N // tpb + 1, tpb, streams[2]](labels, address, buffer_int2)

        # 2.3 Switch the active node list to the next level's
        active_list = next_active_list
        cuda.synchronize()

    return outputs
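# A minimal invocation sketch for train_tree: a small random binary-classification
# dataset with float32 features and a 0/1 label in the last column. The shapes and
# parameter values below are illustrative assumptions, not part of the original code.
import numpy as np

rng = np.random.default_rng(0)
features = rng.random((10_000, 4), dtype=np.float32)
labels = (features[:, 0] > 0.5).astype(np.float32).reshape(-1, 1)
dataset = np.hstack((features, labels))  # shape (N, d + 1), all float32

tree = train_tree(dataset, max_depth=4, min_samples_per_node=8, tpb=1024)
# 'tree' is a list with one dict per depth level, mapping node index to either
#   (False, predicted_label)                        for a leaf, or
#   (True, attribute, split_index, split_value)     for an internal node.
print(len(tree), 'levels')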