def test_laplace_small(self): NN = 256 NM = 256 A = np.zeros((NN, NM), dtype=np.float64) Anew = np.zeros((NN, NM), dtype=np.float64) n = NN m = NM iter_max = 1000 tol = 1.0e-6 error = 1.0 for j in range(n): A[j, 0] = 1.0 Anew[j, 0] = 1.0 print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m)) timer = time.time() iter = 0 blockdim = (tpb, tpb) griddim = (NN // blockdim[0], NM // blockdim[1]) error_grid = np.zeros(griddim) stream = cuda.stream() dA = cuda.to_device(A, stream) # to device and don't come back dAnew = cuda.to_device(Anew, stream) # to device and don't come back derror_grid = cuda.to_device(error_grid, stream) while error > tol and iter < iter_max: self.assertTrue(error_grid.dtype == np.float64) jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid) derror_grid.copy_to_host(error_grid, stream=stream) # error_grid is available on host stream.synchronize() error = np.abs(error_grid).max() # swap dA and dAnew tmp = dA dA = dAnew dAnew = tmp if iter % 100 == 0: print("%5d, %0.6f (elapsed: %f s)" % (iter, error, time.time() - timer)) iter += 1 runtime = time.time() - timer print(" total: %f s" % runtime)
def test_func(self): np.random.seed(42) A = np.array(np.random.random((n, n)), dtype=np.float32) B = np.array(np.random.random((n, n)), dtype=np.float32) C = np.empty_like(A) s = time() stream = cuda.stream() with stream.auto_synchronize(): dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.to_device(C, stream) cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC) dC.copy_to_host(C, stream) e = time() tcuda = e - s # Host compute s = time() Cans = np.dot(A, B) e = time() tcpu = e - s # Check result np.testing.assert_allclose(C, Cans, rtol=1e-5)
def test_func(self): A = np.array(np.random.random((n, n)), dtype=np.float32) B = np.array(np.random.random((n, n)), dtype=np.float32) C = np.empty_like(A) print("N = %d x %d" % (n, n)) s = time() stream = cuda.stream() with stream.auto_synchronize(): dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.to_device(C, stream) cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC) dC.copy_to_host(C, stream) e = time() tcuda = e - s # Host compute Amat = np.matrix(A) Bmat = np.matrix(B) s = time() Cans = Amat * Bmat e = time() tcpu = e - s print('cpu: %f' % tcpu) print('cuda: %f' % tcuda) print('cuda speedup: %.2fx' % (tcpu / tcuda)) # Check result self.assertTrue(np.allclose(C, Cans))
def test_gufunc_stream(self): #cuda.driver.flush_pending_free() matrix_ct = 1001 # an odd number to test thread/block division in CUDA A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2, 4) B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4, 5) ts = time() stream = cuda.stream() dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream) dC = gufunc(dA, dB, out=dC, stream=stream) C = dC.copy_to_host(stream=stream) stream.synchronize() tcuda = time() - ts ts = time() Gold = ut.matrix_multiply(A, B) tcpu = time() - ts stream_speedups.append(tcpu / tcuda) self.assertTrue(np.allclose(C, Gold))
def stupidconv_gpu(img, filt, padval): """ does convolution without using FFT because FFT is pissing me off and giving me weird answers :param img: :param filt: :param padval: :return: """ cuda.close() cuda.select_device(1) # get the number of nonzero entries in the filter for later averaging of result filt_nnz = np.count_nonzero(filt) # pad the images s_filt = filt.shape s_img = img.shape # appropriate padding depends on context # pad with filt size all around img pad_img = np.ones((s_img[0] + (2 * s_filt[0]), s_img[1] + (2 * s_filt[1])), dtype=np.float32) * padval pad_img[s_filt[0]: s_img[0] + s_filt[0], s_filt[1]: s_img[1] + s_filt[1]] = img output = np.zeros(pad_img.shape, dtype=np.float32) d_pad_img = cuda.to_device(pad_img) d_filt = cuda.to_device(filt) d_output = cuda.to_device(output) stupidconv_gpu_helper(d_pad_img, d_filt, s_img[0], s_img[1], s_filt[0], s_filt[1], d_output) output = d_output.copy_to_host() output = output[s_filt[0]:s_filt[0] + s_img[0], s_filt[1]:s_filt[1] + s_img[1]] return output / filt_nnz
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] mm = MM(shape=n, dtype=np.double, prealloc=5) blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK gridsz = int(math.ceil(float(n) / blksz)) stream = cuda.stream() prng = PRNG(PRNG.MRG32K3A, stream=stream) # Allocate device side array d_normdist = cuda.device_array(n, dtype=np.double, stream=stream) c0 = interest - 0.5 * volatility ** 2 c1 = volatility * math.sqrt(dt) d_last = cuda.to_device(paths[:, 0], to=mm.get()) for j in range(1, paths.shape[1]): prng.normal(d_normdist, mean=0, sigma=1) d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get()) step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream) d_paths.copy_to_host(paths[:, j], stream=stream) mm.free(d_last) d_last = d_paths stream.synchronize()
def test_for_pre(self): """Test issue with loop not running due to bad sign-extension at the for loop precondition. """ @cuda.jit(argtypes=[float32[:, :], float32[:, :], float32[:]]) def diagproduct(c, a, b): startX, startY = cuda.grid(2) gridX = cuda.gridDim.x * cuda.blockDim.x gridY = cuda.gridDim.y * cuda.blockDim.y height = c.shape[0] width = c.shape[1] for x in range(startX, width, (gridX)): for y in range(startY, height, (gridY)): c[y, x] = a[y, x] * b[x] N = 8 A, B = generate_input(N) F = np.empty(A.shape, dtype=A.dtype) blockdim = (32, 8) griddim = (1, 1) dA = cuda.to_device(A) dB = cuda.to_device(B) dF = cuda.to_device(F, copy=False) diagproduct[griddim, blockdim](dF, dA, dB) E = np.dot(A, np.diag(B)) np.testing.assert_array_almost_equal(dF.copy_to_host(), E)
def setup(self): self.stream = cuda.stream() self.f32 = np.zeros(self.n, dtype=np.float32) self.d_f32 = cuda.to_device(self.f32, self.stream) self.f64 = np.zeros(self.n, dtype=np.float64) self.d_f64 = cuda.to_device(self.f64, self.stream) self.stream.synchronize()
def setup(self): self.stream = cuda.stream() self.small_data = np.zeros(512, dtype=np.float64) self.large_data = np.zeros(512 * 1024, dtype=np.float64) self.d_small_data = cuda.to_device(self.small_data, self.stream) self.d_large_data = cuda.to_device(self.large_data, self.stream) self.stream.synchronize()
def test_with_context(self): @cuda.jit def vector_add_scalar(arr, val): i = cuda.grid(1) if i < arr.size: arr[i] += val hostarr = np.arange(10, dtype=np.float32) with cuda.gpus[0]: arr1 = cuda.to_device(hostarr) with cuda.gpus[1]: arr2 = cuda.to_device(hostarr) with cuda.gpus[0]: vector_add_scalar[1, 10](arr1, 1) with cuda.gpus[1]: vector_add_scalar[1, 10](arr2, 2) with cuda.gpus[0]: np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1)) with cuda.gpus[1]: np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2)) with cuda.gpus[0]: # Transfer from GPU1 to GPU0 arr1.copy_to_device(arr2) np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 2))
def driver(niters, seed): curr = seed nxt = np.zeros(len(seed)) nxt[0] = seed[0] nxt[-1] = seed[-1] start_time = time.time() threads_per_block = 256 blocks_per_grid = int(math.ceil(float(len(curr) - 2) / threads_per_block)) d_nxt = cuda.to_device(nxt) d_curr = cuda.to_device(curr) for iter in range(niters): kernel[blocks_per_grid, threads_per_block](d_nxt, d_curr, len(curr) - 2) tmp = d_nxt d_nxt = d_curr d_curr = tmp d_curr.copy_to_host(curr) elapsed_time = time.time() - start_time print('Elapsed time for N=' + str(len(seed) - 2) + ', # iters=' + str(niters) + ' is ' + str(elapsed_time) + ' s') print(str(float(niters) / elapsed_time) + ' iters / s') return curr
def test_func(self): @cuda.jit(argtypes=[float32[:, ::1], float32[:, ::1], float32[:, ::1]]) def cu_square_matrix_mul(A, B, C): sA = cuda.shared.array(shape=SM_SIZE, dtype=float32) sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32) tx = cuda.threadIdx.x ty = cuda.threadIdx.y bx = cuda.blockIdx.x by = cuda.blockIdx.y bw = cuda.blockDim.x bh = cuda.blockDim.y x = tx + bx * bw y = ty + by * bh acc = float32(0) # forces all the math to be f32 for i in range(bpg): if x < n and y < n: sA[ty, tx] = A[y, tx + i * tpb] sB[ty, tx] = B[ty + i * tpb, x] cuda.syncthreads() if x < n and y < n: for j in range(tpb): acc += sA[ty, j] * sB[j, tx] cuda.syncthreads() if x < n and y < n: C[y, x] = acc np.random.seed(42) A = np.array(np.random.random((n, n)), dtype=np.float32) B = np.array(np.random.random((n, n)), dtype=np.float32) C = np.empty_like(A) s = time() stream = cuda.stream() with stream.auto_synchronize(): dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.to_device(C, stream) cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC) dC.copy_to_host(C, stream) e = time() tcuda = e - s # Host compute s = time() Cans = np.dot(A, B) e = time() tcpu = e - s # Check result np.testing.assert_allclose(C, Cans, rtol=1e-5)
def test_devicearray_replace(self): N = 100 array = np.arange(N, dtype=np.int32) original = array.copy() gpumem = cuda.to_device(array) cuda.to_device(array * 2, to=gpumem) gpumem.copy_to_host(array) np.testing.assert_array_equal(array, original * 2)
def fork_test(q): from numba.cuda.cudadrv.error import CudaDriverError try: cuda.to_device(np.arange(1)) except CudaDriverError as e: q.put(e) else: q.put(None)
def setup(self): self.stream = cuda.stream() self.d_callResult = cuda.to_device(callResultGold, self.stream) self.d_putResult = cuda.to_device(putResultGold, self.stream) self.d_stockPrice = cuda.to_device(stockPrice, self.stream) self.d_optionStrike = cuda.to_device(optionStrike, self.stream) self.d_optionYears = cuda.to_device(optionYears, self.stream) self.stream.synchronize()
def test_devicearray_replace(self): N = 100 array = np.arange(N, dtype=np.int32) original = array.copy() gpumem = cuda.to_device(array) cuda.to_device(array * 2, to=gpumem) gpumem.copy_to_host(array) self.assertTrue((array == original * 2).all())
def monte_carlo_pricer(paths, dt, interest, volatility): n = paths.shape[0] num_streams = 2 part_width = int(math.ceil(float(n) / num_streams)) partitions = [(0, part_width)] for i in range(1, num_streams): begin, end = partitions[i - 1] begin, end = end, min(end + (end - begin), n) partitions.append((begin, end)) partlens = [end - begin for begin, end in partitions] mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams) device = cuda.get_current_device() blksz = device.MAX_THREADS_PER_BLOCK gridszlist = [int(math.ceil(float(partlen) / blksz)) for partlen in partlens] strmlist = [cuda.stream() for _ in range(num_streams)] prnglist = [PRNG(PRNG.MRG32K3A, stream=strm) for strm in strmlist] # Allocate device side array d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm) for partlen, strm in zip(partlens, strmlist)] c0 = interest - 0.5 * volatility ** 2 c1 = volatility * math.sqrt(dt) # Configure the kernel # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>> steplist = [cu_step[gridsz, blksz, strm] for gridsz, strm in zip(gridszlist, strmlist)] d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm)) for (s, e), strm in zip(partitions, strmlist)] for j in range(1, paths.shape[1]): for prng, d_norm in zip(prnglist, d_normlist): prng.normal(d_norm, mean=0, sigma=1) d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm, to=mm.get(stream=strm)) for (s, e), strm in zip(partitions, strmlist)] for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)): d_last, d_paths, d_norm = args step(d_last, d_paths, dt, c0, c1, d_norm) for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions): d_paths.copy_to_host(paths[s:e, j], stream=strm) mm.free(d_last, stream=strm) d_lastlist = d_pathslist for strm in strmlist: strm.synchronize()
def test_devicearray_contiguous_device_strided(self): d = cuda.to_device(np.arange(20)) arr = np.arange(20) with self.assertRaises(ValueError) as e: d.copy_to_device(cuda.to_device(arr)[::2]) self.assertEqual( devicearray.errmsg_contiguous_buffer, str(e.exception))
def test_laplace_small(self): if config.ENABLE_CUDASIM: NN, NM = 4, 4 iter_max = 20 else: NN, NM = 256, 256 iter_max = 1000 A = np.zeros((NN, NM), dtype=np.float64) Anew = np.zeros((NN, NM), dtype=np.float64) n = NN m = NM tol = 1.0e-6 error = 1.0 for j in range(n): A[j, 0] = 1.0 Anew[j, 0] = 1.0 timer = time.time() iter = 0 blockdim = (tpb, tpb) griddim = (NN // blockdim[0], NM // blockdim[1]) error_grid = np.zeros(griddim) stream = cuda.stream() dA = cuda.to_device(A, stream) # to device and don't come back dAnew = cuda.to_device(Anew, stream) # to device and don't come back derror_grid = cuda.to_device(error_grid, stream) while error > tol and iter < iter_max: self.assertTrue(error_grid.dtype == np.float64) jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid) derror_grid.copy_to_host(error_grid, stream=stream) # error_grid is available on host stream.synchronize() error = np.abs(error_grid).max() # swap dA and dAnew tmp = dA dA = dAnew dAnew = tmp iter += 1 runtime = time.time() - timer
def test_contigous_2d(self): ary = np.arange(10) cary = ary.reshape(2, 5) fary = np.asfortranarray(cary) dcary = cuda.to_device(cary) dfary = cuda.to_device(fary) self.assertTrue(dcary.is_c_contigous()) self.assertTrue(not dfary.is_c_contigous()) self.assertTrue(not dcary.is_f_contigous()) self.assertTrue(dfary.is_f_contigous())
def test_invalid_context_error_with_d2d(self): def d2d(dst, src): dst.copy_to_device(src) arr = np.arange(100) common = cuda.to_device(arr) darr = cuda.to_device(np.zeros(common.shape, dtype=common.dtype)) th = threading.Thread(target=d2d, args=[darr, common]) th.start() th.join() np.testing.assert_equal(darr.copy_to_host(), arr)
def lapconv(img, filt, padval): """ Performs FFT-based normalization on filter and image, without normalization :param numpy.core.multiarray.ndarray img: stimulus image to be convolved :param numpy.core.multiarray.ndarray filt: filter to convolve with :param float padval: value with which to pad the img before convolution :return: result of convolution :rtype: numpy.core.multiarray.ndarray """ # get the number of nonzero entries in the filter for later dividing of the results filt_nnz = np.count_nonzero(filt) # pad the images s_filt = filt.shape s_img = img.shape # appropriate padding depends on context pad_img = np.ones((s_img[0] + s_filt[0], s_img[1] + s_filt[1])) * padval pad_img[0: s_img[0], 0: s_img[1]] = img pad_filt = np.zeros((s_img[0] + s_filt[0], s_img[1] + s_filt[1])) pad_filt[0: s_filt[0], 0: s_filt[1]] = filt # initialize the GPU FFTPlan(shape=pad_img.shape, itype=np.complex64, otype=np.complex64) # create temporary arrays for holding FFT values normtemp1 = np.zeros(pad_img.shape, dtype=np.complex64) normtemp2 = np.zeros(pad_img.shape, dtype=np.complex64) d_pad_filt = cuda.to_device(pad_filt.astype(np.complex64)) d_pad_img = cuda.to_device(pad_img.astype(np.complex64)) d_normtemp1 = cuda.to_device(normtemp1) d_normtemp2 = cuda.to_device(normtemp2) fft(d_pad_filt, d_normtemp1) fft(d_pad_img, d_normtemp2) vmult(d_normtemp1, d_normtemp2, out=d_normtemp1) ifft(d_normtemp1, d_normtemp2) # temp_out = (cuda.fft.ifft_inplace(cuda.fft.fft_inplace(pad_img)) * cuda.fft.fft_inplace(pad_filt)).real temp_out = d_normtemp2.copy_to_host().real # extract the appropriate portion of the filtered image filtered = temp_out[(s_filt[0] / 2): (s_filt[0] / 2) + s_img[0], (s_filt[1] / 2): (s_filt[1] / 2) + s_img[1]] # divide each value by the number of nonzero entries in the filter (and image?!?), so we get an average of all the # values filtered /= (filt_nnz * s_img[0] * s_img[1]) return filtered
def __init__(self, positions, weights): self.calculate_forces = cuda.jit( argtypes=(float32[:,:], float32[:], float32[:,:]) )(calculate_forces) self.accelerations = np.zeros_like(positions) self.n_bodies = len(weights) self.stream = cuda.stream() self.d_pos = cuda.to_device(positions, self.stream) self.d_wei = cuda.to_device(weights, self.stream) self.d_acc = cuda.to_device(self.accelerations, self.stream) self.stream.synchronize()
def test_device_array_interface(self): dary = cuda.device_array(shape=100) devicearray.verify_cuda_ndarray_interface(dary) ary = np.empty(100) dary = cuda.to_device(ary) devicearray.verify_cuda_ndarray_interface(dary) ary = np.asarray(1.234) dary = cuda.to_device(ary) self.assertEquals(dary.ndim, 1) devicearray.verify_cuda_ndarray_interface(dary)
def test_max_pending_count(self): # get deallocation manager and flush it deallocs = cuda.current_context().deallocations deallocs.clear() self.assertEqual(len(deallocs), 0) # deallocate to maximum count for i in range(config.CUDA_DEALLOCS_COUNT): cuda.to_device(np.arange(1)) self.assertEqual(len(deallocs), i + 1) # one more to trigger .clear() cuda.to_device(np.arange(1)) self.assertEqual(len(deallocs), 0)
def pooling(self, S, s, w, stride, th, blockdim, griddim): """ Cuda Pooling Kernel call Returns the updated spike times """ d_S = cuda.to_device(np.ascontiguousarray(S).astype(np.uint8)) d_s = cuda.to_device(np.ascontiguousarray(s).astype(np.uint8)) d_w = cuda.to_device(np.ascontiguousarray(w).astype(np.float32)) S_out = np.empty(d_S.shape, dtype=d_S.dtype) pool[griddim, blockdim](d_S, d_s, d_w, stride, th) d_S.copy_to_host(S_out) return S_out
def test_event_elapsed(self): N = 32 dary = cuda.device_array(N, dtype=np.double) evtstart = cuda.event() evtend = cuda.event() evtstart.record() cuda.to_device(np.arange(N), to=dary) evtend.record() evtend.wait() evtend.synchronize() print(evtstart.elapsed_time(evtend))
def is_points_inside_cuda(points, solution): threads_per_block = 128 blocks_per_grid_x = math.ceil(points.shape[0] / threads_per_block) blocks_per_grid_y = math.ceil(solution.shape[0]) blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y) result = np.empty((points.shape[0], solution.shape[0]), dtype=bool) p_points = cuda.to_device(points) p_edges = cuda.to_device(solution) ray_intersect_segment_cuda[blocks_per_grid, threads_per_block](p_points, p_edges, result) return odd(np.sum(result, axis=1))
def extract_1dlbp_gpu(input, neighborhood, d_powers): maxThread = 512 blockDim = maxThread d_input = cuda.to_device(input) hist = np.zeros(2 ** (2 * neighborhood), dtype='int32') gridDim = (len(input) - 2 * neighborhood + blockDim) / blockDim d_hist = cuda.to_device(hist) lbp_kernel[gridDim, blockDim](d_input, neighborhood, d_powers, d_hist) d_hist.to_host() return hist
def convolve(): # Build Filter laplacian_pts = ''' -4 -1 0 -1 -4 -1 2 3 2 -1 0 3 4 3 0 -1 2 3 2 -1 -4 -1 0 -1 -4 '''.split() laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5) image = get_image() print("Image size: %s" % (image.shape,)) response = np.zeros_like(image) response[:5, :5] = laplacian # CPU # Use SciPy to perform the FFT convolution ts = timer() cvimage_cpu = fftconvolve(image, laplacian, mode='same') te = timer() print('CPU: %.2fs' % (te - ts)) # GPU threadperblock = 32, 8 blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock) print('kernel config: %s x %s' % (blockpergrid, threadperblock)) # Initialize the cuFFT system. FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64) # Start GPU timer ts = timer() image_complex = image.astype(np.complex64) response_complex = response.astype(np.complex64) d_image_complex = cuda.to_device(image_complex) d_response_complex = cuda.to_device(response_complex) task1(d_image_complex, d_response_complex) cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape) te = timer() print('GPU: %.2fs' % (te - ts)) return cvimage_cpu, cvimage_gpu
def connected_comps_gpu(dest_in, weight_in, firstEdge_in, outDegree_in, MAX_TPB=512, stream=None): if stream is None: myStream = cuda.stream() else: myStream = stream dest = cuda.to_device(dest_in, stream=myStream) weight = cuda.to_device(weight_in, stream=myStream) firstEdge = cuda.to_device(firstEdge_in, stream=myStream) outDegree = cuda.to_device(outDegree_in, stream=myStream) n_vertices = firstEdge.size n_edges = dest.size n_components = n_vertices # still need edge_id for conflict resolution in find_minedge edge_id = cuda.to_device(np.arange(n_edges, dtype=dest.dtype), stream=myStream) #labels = np.empty(n_vertices, dtype = dest.dtype) first_iter = True # initialize with name top_edge so we can recycle an array between iterations top_edge = cuda.device_array(n_components, dtype=dest.dtype, stream=myStream) labels = cuda.device_array(n_components, dtype=dest.dtype, stream=myStream) converged = cuda.device_array(1, dtype=np.int8, stream=myStream) gridDimLabels = compute_cuda_grid_dim(n_components, MAX_TPB) gridDim = compute_cuda_grid_dim(n_components, MAX_TPB) final_converged = False while (not final_converged): vertex_minedge = top_edge findMinEdge_CUDA[gridDim, MAX_TPB, myStream](weight, firstEdge, outDegree, vertex_minedge, dest) removeMirroredEdges_CUDA[gridDim, MAX_TPB, myStream](dest, vertex_minedge) colors = cuda.device_array(shape=n_components, dtype=np.int32, stream=myStream) initializeColors_CUDA[gridDim, MAX_TPB, myStream](dest, vertex_minedge, colors) # propagate colors until convergence propagateConverged = False while (not propagateConverged): propagateColors_CUDA[gridDim, MAX_TPB, myStream](colors, converged) converged_num = converged.getitem(0, stream=myStream) propagateConverged = True if converged_num == 1 else False # first we build the flags in the new_vertex array new_vertex = vertex_minedge # reuse the vertex_minedge array as the new new_vertex buildFlag_CUDA[gridDim, MAX_TPB, myStream](colors, new_vertex) # new_n_vertices is the number of vertices of the new contracted graph new_n_vertices = ex_prefix_sum_gpu(new_vertex, MAX_TPB=MAX_TPB, stream=myStream).getitem( 0, stream=myStream) new_n_vertices = int(new_n_vertices) if first_iter: # first iteration defines labels as the initial colors and updates labels.copy_to_device(colors, stream=myStream) first_iter = False # other iterations update the labels with the new colors update_labels_single_pass_cuda[gridDimLabels, MAX_TPB, myStream](labels, colors, new_vertex) if new_n_vertices == 1: final_converged = True del new_vertex break newGridDim = compute_cuda_grid_dim(n_components, MAX_TPB) # count number of edges for new supervertices and write in new outDegree newOutDegree = cuda.device_array(shape=new_n_vertices, dtype=np.int32, stream=myStream) memSet[newGridDim, MAX_TPB, myStream](newOutDegree, 0) # zero the newOutDegree array countNewEdges_CUDA[gridDim, MAX_TPB, myStream](colors, firstEdge, outDegree, dest, new_vertex, newOutDegree) # new first edge array for contracted graph newFirstEdge = cuda.device_array_like(newOutDegree, stream=myStream) # copy newOutDegree to newFirstEdge newFirstEdge.copy_to_device(newOutDegree, stream=myStream) new_n_edges = ex_prefix_sum_gpu(newFirstEdge, MAX_TPB=MAX_TPB, stream=myStream) new_n_edges = new_n_edges.getitem(0, stream=myStream) new_n_edges = int(new_n_edges) # if no edges remain, then MST has converged if new_n_edges == 0: final_converged = True del newOutDegree, newFirstEdge, new_vertex break # create arrays for new edges new_dest = cuda.device_array(new_n_edges, dtype=np.int32, stream=myStream) new_edge_id = cuda.device_array(new_n_edges, dtype=np.int32, stream=myStream) new_weight = cuda.device_array(new_n_edges, dtype=weight.dtype, stream=myStream) top_edge = cuda.device_array_like(newFirstEdge, stream=myStream) top_edge.copy_to_device(newFirstEdge, stream=myStream) # assign and insert new edges assignInsert_CUDA[gridDim, MAX_TPB, myStream](edge_id, dest, weight, firstEdge, outDegree, colors, new_vertex, new_dest, new_edge_id, new_weight, top_edge) # delete old graph del new_vertex, edge_id, dest, weight, firstEdge, outDegree, colors # write new graph n_components = newFirstEdge.size edge_id = new_edge_id dest = new_dest weight = new_weight firstEdge = newFirstEdge outDegree = newOutDegree gridDim = newGridDim returnLabels = labels.copy_to_host() del dest, weight, edge_id, firstEdge, outDegree, converged, labels return returnLabels
VAR1 = np.full((nx + 2 * nb, ny + 2 * nb, nz), np.nan, dtype=wp) VAR2 = np.full((nx + 2 * nb, ny + 2 * nb, nz), np.nan, dtype=wp) VAR[ii, jj, :] = np.random.random((nx, ny, nz)) VAR1[ii, jj, :] = np.random.random((nx, ny, nz)) VAR2[ii, jj, :] = np.random.random((nx, ny, nz)) dVARdt[ii, jj, :] = 0. exchange_BC(VAR) exchange_BC(VAR1) exchange_BC(VAR2) stream = cuda.stream() VARd = cuda.to_device(VAR, stream) dVARdtd = cuda.to_device(dVARdt, stream) VAR1d = cuda.to_device(VAR1, stream) VAR2d = cuda.to_device(VAR2, stream) kernel_x = cuda.jit(cuda_kernel_decorator(kernel_x))(kernel_x) kernel_y = cuda.jit(cuda_kernel_decorator(kernel_y))(kernel_y) kernel_z = cuda.jit(cuda_kernel_decorator(kernel_z))(kernel_z) kernel_improve = cuda.jit(cuda_kernel_decorator(kernel_improve))\ (kernel_improve) kernel_shared = cuda.jit(cuda_kernel_decorator(kernel_shared))\ (kernel_shared) kernel_shared_z = cuda.jit(cuda_kernel_decorator(kernel_shared_z))\ (kernel_shared_z) kernel_shared_x = cuda.jit(cuda_kernel_decorator(kernel_shared_x))\ (kernel_shared_x)
def test_array_views(self): """Views created via array interface support: - Strided slices - Strided slices """ h_arr = np.random.random(10) c_arr = cuda.to_device(h_arr) arr = cuda.as_cuda_array(c_arr) # __getitem__ interface accesses expected data # Direct views np.testing.assert_array_equal(arr.copy_to_host(), h_arr) np.testing.assert_array_equal(arr[:].copy_to_host(), h_arr) # Slicing np.testing.assert_array_equal(arr[:5].copy_to_host(), h_arr[:5]) # Strided view np.testing.assert_array_equal(arr[::2].copy_to_host(), h_arr[::2]) # View of strided array arr_strided = cuda.as_cuda_array(c_arr[::2]) np.testing.assert_array_equal(arr_strided.copy_to_host(), h_arr[::2]) # A strided-view-of-array and view-of-strided-array have the same # shape, strides, itemsize, and alloc_size self.assertEqual(arr[::2].shape, arr_strided.shape) self.assertEqual(arr[::2].strides, arr_strided.strides) self.assertEqual(arr[::2].dtype.itemsize, arr_strided.dtype.itemsize) self.assertEqual(arr[::2].alloc_size, arr_strided.alloc_size) self.assertEqual(arr[::2].nbytes, arr_strided.size * arr_strided.dtype.itemsize) # __setitem__ interface propagates into external array # Writes to a slice arr[:5] = np.pi np.testing.assert_array_equal( c_arr.copy_to_host(), np.concatenate((np.full(5, np.pi), h_arr[5:])) ) # Writes to a slice from a view arr[:5] = arr[5:] np.testing.assert_array_equal( c_arr.copy_to_host(), np.concatenate((h_arr[5:], h_arr[5:])) ) # Writes through a view arr[:] = cuda.to_device(h_arr) np.testing.assert_array_equal(c_arr.copy_to_host(), h_arr) # Writes to a strided slice arr[::2] = np.pi np.testing.assert_array_equal( c_arr.copy_to_host()[::2], np.full(5, np.pi), ) np.testing.assert_array_equal( c_arr.copy_to_host()[1::2], h_arr[1::2] )
def test_negative_slicing_2d(self): arr = np.arange(12).reshape(3, 4) darr = cuda.to_device(arr) for x, y, w, s in product(range(-4, 4), repeat=4): np.testing.assert_array_equal(arr[x:y, w:s], darr[x:y, w:s].copy_to_host())
def apply(self, frame): d_frame = cuda.to_device(frame.flatten().astype(np.float32)) ColorGrade.k_tonemap[(self.blocksPGrid),(self.threadsPBlock)](d_frame, self.tonemap_coeffs, np.int32(self.framesize)) ColorGrade.k_colorcorrect[(self.blocksPGrid),(self.threadsPBlock)](d_frame, self.cc_res, self.cc_vals, np.int32(self.framesize)) return d_frame.copy_to_host().reshape(self.h, self.w, 3)
C[y, x] = 0 for i in range(n): C[y, x] += A[y, i] * B[i, x] A = np.array(np.random.random((n, n)), dtype=np.float32) B = np.array(np.random.random((n, n)), dtype=np.float32) C = np.empty_like(A) print("N = %d x %d" % (n, n)) s = time() stream = cuda.stream() with stream.auto_synchronize(): dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.to_device(C, stream) cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC) dC.to_host(stream) e = time() tcuda = e - s # Host compute Amat = np.matrix(A) Bmat = np.matrix(B) s = time() Cans = Amat * Bmat e = time()
def test_blackscholes(self): OPT_N = 400 iterations = 2 stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0) optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0) optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0) callResultNumpy = np.zeros(OPT_N) putResultNumpy = -np.ones(OPT_N) callResultNumba = np.zeros(OPT_N) putResultNumba = -np.ones(OPT_N) # numpy for i in range(iterations): black_scholes(callResultNumpy, putResultNumpy, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY) @cuda.jit(double(double), device=True, inline=True) def cnd_cuda(d): K = 1.0 / (1.0 + 0.2316419 * math.fabs(d)) ret_val = (RSQRT2PI * math.exp(-0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) if d > 0: ret_val = 1.0 - ret_val return ret_val @cuda.jit( void(double[:], double[:], double[:], double[:], double[:], double, double)) def black_scholes_cuda(callResult, putResult, S, X, T, R, V): i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x if i >= S.shape[0]: return sqrtT = math.sqrt(T[i]) d1 = ((math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / (V * sqrtT)) d2 = d1 - V * sqrtT cndd1 = cnd_cuda(d1) cndd2 = cnd_cuda(d2) expRT = math.exp((-1. * R) * T[i]) callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2) putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)) # numba blockdim = 512, 1 griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1 stream = cuda.stream() d_callResult = cuda.to_device(callResultNumba, stream) d_putResult = cuda.to_device(putResultNumba, stream) d_stockPrice = cuda.to_device(stockPrice, stream) d_optionStrike = cuda.to_device(optionStrike, stream) d_optionYears = cuda.to_device(optionYears, stream) for i in range(iterations): black_scholes_cuda[griddim, blockdim, stream](d_callResult, d_putResult, d_stockPrice, d_optionStrike, d_optionYears, RISKFREE, VOLATILITY) d_callResult.copy_to_host(callResultNumba, stream) d_putResult.copy_to_host(putResultNumba, stream) stream.synchronize() delta = np.abs(callResultNumpy - callResultNumba) L1norm = delta.sum() / np.abs(callResultNumpy).sum() max_abs_err = delta.max() self.assertTrue(L1norm < 1e-13) self.assertTrue(max_abs_err < 1e-13)
array[i] *= 2 array[i] /= 2 data = [] data_gpu = [] gpu_out = [] streams = [] for _ in range(NUM_ARRAYS): streams.append(cuda.stream()) data.append(np.random.randn(ARRAY_LEN).astype('float32')) t_start = perf_counter() for k in range(NUM_ARRAYS): data_gpu.append(cuda.to_device(data[k], stream=streams[k])) for k in range(NUM_ARRAYS): kernel[1, 64, streams[k]](data_gpu[k]) for k in range(NUM_ARRAYS): gpu_out.append(data_gpu[k].copy_to_host(stream=streams[k])) t_end = perf_counter() for k in range(NUM_ARRAYS): assert (np.allclose(gpu_out[k], data[k])) print(f'Total time: {t_end - t_start: .2f} s')
weight_ar[128] = 2.3 weight_ar[129] = 2.3 weight_ar[130] = 2.3 weight_ar[191] = 2.3 weight_ar[192] = 2.3 weight_ar[193] = 2.3 weight_ar[254] = 2.3 weight_ar[255] = 2.3 weight_ar[256] = 2.3 weight_ar[257] = 2.3 if rowlen > 149: weight_ar[22486] = 2.54 # Explicitly copy to device d_weight_ar = cuda.to_device(weight_ar) d_nonzero_ar = cuda.to_device(nonzero_ar) d_scan_ar = cuda.to_device(scan_ar) # scanf_ar is a data structure to hold the final, corrected scan scanf_ar = np.zeros((arrayszplus, ), dtype=np.uint32) d_scanf_ar = cuda.to_device(scanf_ar) # We now have the arrays in d_carrylist, which needs to be scanned, so that it # can be added to each block of d_scan_ar. If d_carry is of large # size, then we need to recursively scan until we do a single scan on # the multiprocessor # # Make up a list of carry vectors and allocate device memory #
def test_index_1d(self): arr = np.arange(10) darr = cuda.to_device(arr) for i in range(arr.size): self.assertEqual(arr[i], darr[i])
def to_device(self, hostary, stream): return cuda.to_device(hostary, stream=stream)
def test_strided_index_1d(self): arr = np.arange(10) darr = cuda.to_device(arr) for i in range(arr.size): np.testing.assert_equal(arr[i::2], darr[i::2].copy_to_host())
def test_strides(self): arr = np.ones(20) darr = cuda.to_device(arr) arr[::2] = 500 darr[::2] = 500 np.testing.assert_array_equal(darr.copy_to_host(), arr)
def test_broadcast(self): arr = np.arange(5 * 7).reshape(5, 7) darr = cuda.to_device(arr) arr[:, 2] = 500 darr[:, 2] = 500 np.testing.assert_array_equal(darr.copy_to_host(), arr)
import numpy as np from numba import cuda @cuda.jit def square_device(a, out): idx = cuda.grid(1) out[idx] = a[idx]**2 n = 4096 a = np.arange(n) d_a = cuda.to_device(a) d_out = cuda.device_array(shape=(n, ), dtype=np.float32) threads = 32 blocks = 128 square_device[blocks, threads](d_a, d_out)
domain = [0.0, 1.0, 0.0, 1.0] D = 1.0 # Diffusion coefficient x_spacing = 1.0 / float(c.shape[0]) y_spacing = 1.0 / float(c.shape[1]) # Store spacing as inverse square to avoid repeated division inv_xspsq = 1.0 / (x_spacing**2) inv_yspsq = 1.0 / (y_spacing**2) # Satisfy stability condition time_step = 0.25 * min(x_spacing, y_spacing)**2 # Copy this array to the device, and create a new device array to hold updated value d_c = cuda.to_device(c) d_c_new = cuda.device_array(c.shape, dtype=np.float) threads_per_block = (32, 32) blocks_per_grid = (dim // 30, dim // 30) t1 = timer() # Start timer # Evolve forward 2000 steps for step in range(2000): # Launch the kernel diffusion_kernel_shared[blocks_per_grid, threads_per_block](D, inv_xspsq, inv_yspsq, time_step, d_c, d_c_new)
""" row, col = cuda.grid(2) if row < C.shape[0] and col < C.shape[1]: tmp = 0. for k in range(A.shape[1]): tmp += A[row, k] * B[k, col] C[row, col] = tmp # Host code # Initialize the data arrays A = numpy.full((2400, 1200), 3, numpy.float) # matrix containing all 3's B = numpy.full((1200, 2200), 4, numpy.float) # matrix containing all 4's # Copy the arrays to the device A_global_mem = cuda.to_device(A) B_global_mem = cuda.to_device(B) # Allocate memory on the device for the result C_global_mem = cuda.device_array((2400, 2200)) # Configure the blocks threadsperblock = (30, 30) blockspergrid_x = int(math.ceil(A.shape[0] / threadsperblock[0])) blockspergrid_y = int(math.ceil(B.shape[1] / threadsperblock[1])) blockspergrid = (blockspergrid_x, blockspergrid_y) # Start the kernel matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem) # Copy the result back to the host
# Read the input image original = cv2.imread("data/original0.png") #original = cv2.resize(original, None, fx=0.25, fy=0.25) original = cv2.split(cv2.cvtColor(original, cv2.COLOR_BGR2YCR_CB))[0] original = original.astype(np.uint64) / 255 #original = np.full_like(original, 1) dim = np.array([original.shape[1], original.shape[0]]) print(original.shape) p = np.array([64]) # pixels to interpolate in row max_levels = np.array([int(math.log2(original.shape[1]))]) # summation levels in CUDA for logn performance template = np.zeros_like(original) # Create images in GPU d_original = cuda.to_device(np.ascontiguousarray(original), stream=stream) d_interpolated = cuda.to_device(np.ascontiguousarray(template), stream=stream) d_dim = cuda.to_device(np.ascontiguousarray(dim), stream=stream) d_p = cuda.to_device(np.ascontiguousarray(p), stream=stream) interpolate_image[512,512](d_original, d_p, d_dim, d_interpolated) """ FOR PSNR # Compute image squared error original-interpolated d_squared_error = cuda.to_device(np.ascontiguousarray(template), stream=stream) image_squared_error[512,512](d_original, d_interpolated, d_dim, d_squared_error) # Sum squared errors d_max_levels = cuda.to_device(np.ascontiguousarray(max_levels), stream=stream) row_sum[512,512](d_squared_error, d_dim, d_max_levels)
pad_size = int(kz_degree*(kz_window-1)/2) times_no_mem = np.zeros(300, dtype=np.float) times_mem = np.zeros(300, dtype=np.float) x = np.zeros(300, dtype=np.int32) for i in range(0, 300): dt = 0.1 timePoints = np.arange(0, i+dt, dt) signal_original = np.sin(2*np.pi*0.05*timePoints) result = np.zeros(signal_original.size) signal = np.append(np.zeros(pad_size),np.append(signal_original,np.zeros(pad_size))) # Padded left and right # time transfer t_transfer_signal = synchronous_kernel_timeit(lambda: cuda.to_device(signal), number=100) t_transfer_coeffs = synchronous_kernel_timeit(lambda: cuda.to_device(kz_coeffs), number=100) t_transfer_result = synchronous_kernel_timeit(lambda: cuda.device_array(result.size), number=100) t_transfer = t_transfer_signal + t_transfer_result + t_transfer_coeffs # time no mem t_no_mem = time_filter(1, 100, signal, kz_coeffs, result) # time mem signal_dev = cuda.to_device(signal) result_dev = cuda.device_array(result.size) coeffs_dev = cuda.to_device(kz_coeffs) t_mem = time_filter(1, 100, signal_dev, coeffs_dev, result_dev) times_mem[i] = t_transfer+t_mem times_no_mem[i] = t_no_mem
c1[0] = int64(temp1) c1[1] = int64(temp2) c1[2] = int64(temp3) c1[3] = int64(temp4) c1[4] = int64(temp5) c1[5] = int64(temp6) a = [temp1, temp2, temp3, temp4, temp5, temp6] for i in range(c1.shape[0]): # for j in range(0,5): c1[i] = a[i] return a # Copy the arrays to the device n1 = cuda.to_device(n1) dev_inp = cuda.to_device(one) # alloc and copy input data test(dev_inp, dev_inp) # invoke the gufunc dev_inp.copy_to_host(one) # Copy the result back to the host C = dev_inp.copy_to_host() #thread and parallel def test(x1, c1): i = 0 e1 = x1[i] e2 = x1[i + 1] x2 = n1[e1]
def cubeOverlap(gtNums, dtNums, gts, dts, hAxis, criterion, gpuId): ''' gtNums: array of #gt with shape (#frame, ), dtype uint8 dtNums: array of #dt with shape (#frame, ), dtype uint8 gts: array of gt with shape (#frame, maxGTnum, itemNum), dtype float32 dts: array of dt with shape (#frame, maxDTnum, itemNum), dtype float32 hAxis: height axis (uint8) criterion: overlap criterion (int8) gpuId: device id item info: x, y, z, dx, dy, dz, heading ''' assert gtNums.shape == dtNums.shape, 'gtNums {} frames while dtNums {} frames'.format( gtNums.shape, dtNums.shape) assert gtNums.dtype == np.uint8, 'gtNums: uint8 expected but {} given'.format( gtNums.dtype) assert dtNums.dtype == np.uint8, 'dtNums: uint8 expected but {} given'.format( dtNums.dtype) assert gts.shape[0] == dts.shape[ 0], 'gts {} frames while dts {} frames'.format(gts.shape[0], dts.shape[0]) assert gts.dtype == np.float32, 'gts: float32 expected but {} given'.format( gts.dtype) assert dts.dtype == np.float32, 'dts: float32 expected but {} given'.format( dts.dtype) assert gtNums.shape[0] == gts.shape[ 0], 'gtNums {} frames while gts {} frames'.format( gtNums.shape[0], gts.shape[0]) assert hAxis in set([0, 1, 2]), 'invalid hAxis {}'.format(hAxis) hAxis = np.uint8(hAxis) assert criterion in set( [-1, 0, 1]), 'invalid overlap criterion {}'.format(criterion) criterion = np.int8(criterion) overlaps = np.zeros((gts.shape[0], gts.shape[1], dts.shape[1]), np.float32) if not np.all(overlaps.shape): return overlaps assert gts.shape[-1] == 7, 'invalid gt shape {}'.format(gts.shape) assert dts.shape[-1] == 7, 'invalid dt shape {}'.format(dts.shape) device = cuda.select_device(gpuId) # if device.id != gpuId: # cuda.close() # device = cuda.select_device(gpuId) assert device is None or device.id == gpuId, 'wake up {}th gpu rather than requested {}th gpu'.format( device.id, gpuId) blocksPerGrid = gts.shape[0] threadsPerBlock = gts.shape[1] gtCubeNum = np.uint8(gts.shape[1]) dtCubeNum = np.uint8(dts.shape[1]) stream = cuda.stream() with stream.auto_synchronize(): gtNumsGPU = cuda.to_device(gtNums, stream) dtNumsGPU = cuda.to_device(dtNums, stream) gtsGPU = cuda.to_device(gts.reshape(-1), stream) dtsGPU = cuda.to_device(dts.reshape(-1), stream) overlapsGPU = cuda.to_device(overlaps.reshape(-1), stream) cubeOverlap_kernel[blocksPerGrid, threadsPerBlock, stream](gtNumsGPU, dtNumsGPU, gtCubeNum, dtCubeNum, gtsGPU, dtsGPU, overlapsGPU, hAxis, criterion) overlapsGPU.copy_to_host(overlaps.reshape(-1), stream) return overlaps
def test_prefix_select(self): arr = np.arange(5 * 7).reshape(5, 7, order='F') darr = cuda.to_device(arr) self.assertTrue(np.all(darr[:1, 1].copy_to_host() == arr[:1, 1]))
def _gpu_stump( T_A_fname, T_B_fname, m, range_stop, excl_zone, M_T_fname, Σ_T_fname, QT_fname, QT_first_fname, μ_Q_fname, σ_Q_fname, k, ignore_trivial=True, range_start=1, device_id=0, ): """ A Numba CUDA version of STOMP for parallel computation of the matrix profile, matrix profile indices, left matrix profile indices, and right matrix profile indices. Parameters ---------- T_A_fname : str The file name for the time series or sequence for which to compute the matrix profile T_B_fname : str The file name for the time series or sequence that will be used to annotate T_A. For every subsequence in T_A, its nearest neighbor in T_B will be recorded. m : int Window size range_stop : int The index value along T_B for which to stop the matrix profile calculation. This parameter is here for consistency with the distributed `stumped` algorithm. excl_zone : int The half width for the exclusion zone relative to the current sliding window M_T_fname : str The file name for the sliding mean of time series, `T` Σ_T_fname : str The file name for the sliding standard deviation of time series, `T` QT_fname : str The file name for the dot product between some query sequence,`Q`, and time series, `T` QT_first_fname : str The file name for the QT for the first window relative to the current sliding window μ_Q_fname : str The file name for the mean of the query sequence, `Q`, relative to the current sliding window σ_Q_fname : str The file name for the standard deviation of the query sequence, `Q`, relative to the current sliding window k : int The total number of sliding windows to iterate over ignore_trivial : bool Set to `True` if this is a self-join. Otherwise, for AB-join, set this to `False`. Default is `True`. range_start : int The starting index value along T_B for which to start the matrix profile calculation. Default is 1. device_id : int The (GPU) device number to use. The default value is `0`. Returns ------- profile_fname : str The file name for the matrix profile indices_fname : str The file name for the matrix profile indices. The first column of the array consists of the matrix profile indices, the second column consists of the left matrix profile indices, and the third column consists of the right matrix profile indices. Notes ----- `DOI: 10.1109/ICDM.2016.0085 \ <https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf>`__ See Table II, Figure 5, and Figure 6 Timeseries, T_A, will be annotated with the distance location (or index) of all its subsequences in another times series, T_B. Return: For every subsequence, Q, in T_A, you will get a distance and index for the closest subsequence in T_A. Thus, the array returned will have length T_A.shape[0]-m+1. Additionally, the left and right matrix profiles are also returned. Note: Unlike in the Table II where T_A.shape is expected to be equal to T_B.shape, this implementation is generalized so that the shapes of T_A and T_B can be different. In the case where T_A.shape == T_B.shape, then our algorithm reduces down to the same algorithm found in Table II. Additionally, unlike STAMP where the exclusion zone is m/2, the default exclusion zone for STOMP is m/4 (See Definition 3 and Figure 3). For self-joins, set `ignore_trivial = True` in order to avoid the trivial match. Note that left and right matrix profiles are only available for self-joins. """ threads_per_block = config.STUMPY_THREADS_PER_BLOCK blocks_per_grid = math.ceil(k / threads_per_block) T_A = np.load(T_A_fname, allow_pickle=False) T_B = np.load(T_B_fname, allow_pickle=False) QT = np.load(QT_fname, allow_pickle=False) QT_first = np.load(QT_first_fname, allow_pickle=False) M_T = np.load(M_T_fname, allow_pickle=False) Σ_T = np.load(Σ_T_fname, allow_pickle=False) μ_Q = np.load(μ_Q_fname, allow_pickle=False) σ_Q = np.load(σ_Q_fname, allow_pickle=False) with cuda.gpus[device_id]: device_T_A = cuda.to_device(T_A) device_QT_odd = cuda.to_device(QT) device_QT_even = cuda.to_device(QT) device_QT_first = cuda.to_device(QT_first) device_μ_Q = cuda.to_device(μ_Q) device_σ_Q = cuda.to_device(σ_Q) if ignore_trivial: device_T_B = device_T_A device_M_T = device_μ_Q device_Σ_T = device_σ_Q else: device_T_B = cuda.to_device(T_B) device_M_T = cuda.to_device(M_T) device_Σ_T = cuda.to_device(Σ_T) profile = np.full((k, 3), np.inf) # float64 indices = np.full((k, 3), -1, dtype=np.int64) # int64 device_profile = cuda.to_device(profile) device_indices = cuda.to_device(indices) _compute_and_update_PI_kernel[blocks_per_grid, threads_per_block]( range_start - 1, device_T_A, device_T_B, m, device_QT_even, device_QT_odd, device_QT_first, device_M_T, device_Σ_T, device_μ_Q, device_σ_Q, k, ignore_trivial, excl_zone, device_profile, device_indices, False, ) for i in range(range_start, range_stop): _compute_and_update_PI_kernel[blocks_per_grid, threads_per_block]( i, device_T_A, device_T_B, m, device_QT_even, device_QT_odd, device_QT_first, device_M_T, device_Σ_T, device_μ_Q, device_σ_Q, k, ignore_trivial, excl_zone, device_profile, device_indices, True, ) profile = device_profile.copy_to_host() indices = device_indices.copy_to_host() profile = np.sqrt(profile) profile_fname = core.array_to_temp_file(profile) indices_fname = core.array_to_temp_file(indices) return profile_fname, indices_fname
def test_ex_matmul(self): """Test of matrix multiplication on various cases.""" # magictoken.ex_import.begin from numba import cuda, float32 import numpy as np import math # magictoken.ex_import.end # magictoken.ex_matmul.begin @cuda.jit def matmul(A, B, C): """Perform square matrix multiplication of C = A * B.""" i, j = cuda.grid(2) if i < C.shape[0] and j < C.shape[1]: tmp = 0. for k in range(A.shape[1]): tmp += A[i, k] * B[k, j] C[i, j] = tmp # magictoken.ex_matmul.end # magictoken.ex_run_matmul.begin x_h = np.arange(16).reshape([4, 4]) y_h = np.ones([4, 4]) z_h = np.zeros([4, 4]) x_d = cuda.to_device(x_h) y_d = cuda.to_device(y_h) z_d = cuda.to_device(z_h) threadsperblock = (16, 16) blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0]) blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1]) blockspergrid = (blockspergrid_x, blockspergrid_y) matmul[blockspergrid, threadsperblock](x_d, y_d, z_d) z_h = z_d.copy_to_host() print(z_h) print(x_h @ y_h) # magictoken.ex_run_matmul.end # magictoken.ex_fast_matmul.begin # Controls threads per block and shared memory usage. # The computation will be done on blocks of TPBxTPB elements. # TPB should not be larger than 32 in this example TPB = 16 @cuda.jit def fast_matmul(A, B, C): """ Perform matrix multiplication of C = A * B using CUDA shared memory. Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella """ # Define an array in the shared memory # The size and type of the arrays must be known at compile time sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32) sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32) x, y = cuda.grid(2) tx = cuda.threadIdx.x ty = cuda.threadIdx.y bpg = cuda.gridDim.x # blocks per grid # Each thread computes one element in the result matrix. # The dot product is chunked into dot products of TPB-long vectors. tmp = float32(0.) for i in range(bpg): # Preload data into shared memory sA[ty, tx] = 0 sB[ty, tx] = 0 if y < A.shape[0] and (tx + i * TPB) < A.shape[1]: sA[ty, tx] = A[y, tx + i * TPB] if x < B.shape[1] and (ty + i * TPB) < B.shape[0]: sB[ty, tx] = B[ty + i * TPB, x] # Wait until all threads finish preloading cuda.syncthreads() # Computes partial product on the shared memory for j in range(TPB): tmp += sA[ty, j] * sB[j, tx] # Wait until all threads finish computing cuda.syncthreads() if y < C.shape[0] and x < C.shape[1]: C[y, x] = tmp # magictoken.ex_fast_matmul.end # magictoken.ex_run_fast_matmul.begin x_h = np.arange(16).reshape([4, 4]) y_h = np.ones([4, 4]) z_h = np.zeros([4, 4]) x_d = cuda.to_device(x_h) y_d = cuda.to_device(y_h) z_d = cuda.to_device(z_h) threadsperblock = (TPB, TPB) blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0]) blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1]) blockspergrid = (blockspergrid_x, blockspergrid_y) fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d) z_h = z_d.copy_to_host() print(z_h) print(x_h @ y_h) # magictoken.ex_run_fast_matmul.end # fast_matmul test(s) msg = "fast_matmul incorrect for shared memory, square case." self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg) # magictoken.ex_run_nonsquare.begin x_h = np.arange(115).reshape([5, 23]) y_h = np.ones([23, 7]) z_h = np.zeros([5, 7]) x_d = cuda.to_device(x_h) y_d = cuda.to_device(y_h) z_d = cuda.to_device(z_h) threadsperblock = (TPB, TPB) grid_y_max = max(x_h.shape[0], y_h.shape[0]) grid_x_max = max(x_h.shape[1], y_h.shape[1]) blockspergrid_x = math.ceil(grid_x_max / threadsperblock[0]) blockspergrid_y = math.ceil(grid_y_max / threadsperblock[1]) blockspergrid = (blockspergrid_x, blockspergrid_y) fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d) z_h = z_d.copy_to_host() print(z_h) print(x_h @ y_h) # magictoken.ex_run_nonsquare.end # nonsquare fast_matmul test(s) msg = "fast_matmul incorrect for shared memory, non-square case." self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)
both = np.r_[left, right] res[np.sum(p[both >= input[i]])] += 1 return res X = np.arange(3, 7) X = 10**X neighborhood = 4 cpu_times = np.zeros(X.shape[0]) cpu_times_simple = cpu_times.copy() cpu_times_jit = cpu_times.copy() gpu_times = np.zeros(X.shape[0]) p = 1 << np.array(range(0, 2 * neighborhood), dtype='int32') d_powers = cuda.to_device(p) for i, x in enumerate(X): input = np.random.randint(0, 256, size=x).astype(np.uint8) print "Length: {0}".format(x) print "--------------" start = timer() h_cpu = extract_1dlbp_cpu(input, neighborhood, p) cpu_times[i] = timer() - start print "Finished on CPU: time: {0:3.5f}s".format(cpu_times[i]) res = np.zeros(1 << (2 * neighborhood), dtype='int32') start = timer() h_cpu_simple = extract_1dlbp_gpu_debug(input, neighborhood, p, res)
def run_glm_ptr(nFolds, nAlphas, nLambdas, xtrain, ytrain, xtest, ytest, wtrain, write, display, nGPUs=1): """Runs ElasticNetH2O test""" use_gpu = nGPUs > 0 if use_gpu == 1: from numba import cuda #nFolds, nAlphas, nLambdas = arg train_data_mat = cuda.to_device(xtrain) train_result_mat = cuda.to_device(ytrain) test_data_mat = cuda.to_device(xtest) test_result_mat = cuda.to_device(ytest) train_w_mat = cuda.to_device(wtrain) train_data_mat_ptr = train_data_mat.device_ctypes_pointer train_result_mat_ptr = train_result_mat.device_ctypes_pointer test_data_mat_ptr = test_data_mat.device_ctypes_pointer test_result_mat_ptr = test_result_mat.device_ctypes_pointer train_w = train_w_mat.device_ctypes_pointer print(train_data_mat_ptr) print(train_result_mat_ptr) print(test_data_mat_ptr) print(test_result_mat_ptr) import subprocess maxNGPUS = int( subprocess.check_output("nvidia-smi -L | wc -l", shell=True)) print("Maximum Number of GPUS:", maxNGPUS) #nGPUs = maxNGPUS #choose all GPUs #nGPUs = 1 n = train_data_mat.shape[1] mTrain = train_data_mat.shape[0] mValid = test_data_mat.shape[0] else: #nGPUs = 0 n = xtrain.shape[1] mTrain = xtrain.shape[0] mValid = xtest.shape[0] print("No. of Features=%d mTrain=%d mValid=%d" % (n, mTrain, mValid)) #Order of data fortran = 1 print("fortran=%d" % (fortran)) sourceDev = 0 # should be passed from above if user set fit_intercept fit_intercept = True lambda_min_ratio = 1e-9 store_full_path = 1 double_precision = 0 #variables if use_gpu == 1: from ctypes import c_void_p a, b = c_void_p(train_data_mat_ptr.value), c_void_p( train_result_mat_ptr.value) c, d = c_void_p(test_data_mat_ptr.value), c_void_p( test_result_mat_ptr.value) e = c_void_p(train_w.value) print(a, b, c, d, e) else: a, b = xtrain, ytrain c, d = xtest, ytest e = wtrain print("Setting up Solver") sys.stdout.flush() Solver = h2o4gpu.ElasticNetH2O enet = Solver(n_gpus=nGPUs, order='c' if fortran else 'r', fit_intercept=fit_intercept, lambda_min_ratio=lambda_min_ratio, n_lambdas=nLambdas, n_folds=nFolds, n_alphas=nAlphas, verbose=5, store_full_path=store_full_path) print("Solving") sys.stdout.flush() if use_gpu == 1: enet.fit_ptr(mTrain, n, mValid, double_precision, None, a, b, c, d, e, source_dev=sourceDev) else: enet.fit(a, b, c, d, e) #t1 = time() print("Done Solving\n") sys.stdout.flush() error_train = printallerrors(display, enet, "Train", store_full_path) print('Predicting') sys.stdout.flush() if use_gpu == 1: pred_val = enet.predict_ptr(c, d) else: pred_val = enet.predict(c) print('Done Predicting') sys.stdout.flush() print('predicted values:\n', pred_val) error_test = printallerrors(display, enet, "Test", store_full_path) if write == 0: os.system('rm -f error.txt; ' 'rm -f pred*.txt; ' 'rm -f varimp.txt; ' 'rm -f me*.txt; ' 'rm -f stats.txt') from ..solvers.utils import finish finish(enet) return pred_val, error_train, error_test
def similarity_gpu( emx, clusmethod, corrmethod, preout, postout, minexpr, maxexpr, minsamp, minclus, maxclus, criterion, mincorr, maxcorr, gsize, lsize, outfile): # allocate device buffers W = gsize N = emx.shape[1] N_pow2 = next_power_2(N) K = maxclus in_emx = cuda.to_device(emx) in_index_cpu = cuda.pinned_array((W, 2), dtype=np.int32) in_index_gpu = cuda.device_array_like(in_index_cpu) work_x = cuda.device_array((W, N_pow2), dtype=np.float32) work_y = cuda.device_array((W, N_pow2), dtype=np.float32) work_gmm_data = cuda.device_array((W, N, 2), dtype=np.float32) work_gmm_labels = cuda.device_array((W, N), dtype=np.int8) work_gmm_pi = cuda.device_array((W, K), dtype=np.float32) work_gmm_mu = cuda.device_array((W, K, 2), dtype=np.float32) work_gmm_sigma = cuda.device_array((W, K, 2, 2), dtype=np.float32) work_gmm_sigmaInv = cuda.device_array((W, K, 2, 2), dtype=np.float32) work_gmm_normalizer = cuda.device_array((W, K), dtype=np.float32) work_gmm_MP = cuda.device_array((W, K, 2), dtype=np.float32) work_gmm_counts = cuda.device_array((W, K), dtype=np.int32) work_gmm_logpi = cuda.device_array((W, K), dtype=np.float32) work_gmm_xm = cuda.device_array((W, 2), dtype=np.float32) work_gmm_Sxm = cuda.device_array((W, 2), dtype=np.float32) work_gmm_gamma = cuda.device_array((W, N, K), dtype=np.float32) work_gmm_logL = cuda.device_array((W, 1), dtype=np.float32) work_gmm_entropy = cuda.device_array((W, 1), dtype=np.float32) out_K_cpu = cuda.pinned_array((W,), dtype=np.int8) out_K_gpu = cuda.device_array_like(out_K_cpu) out_labels_cpu = cuda.pinned_array((W, N), dtype=np.int8) out_labels_gpu = cuda.device_array_like(out_labels_cpu) out_correlations_cpu = cuda.pinned_array((W, K), dtype=np.float32) out_correlations_gpu = cuda.device_array_like(out_correlations_cpu) # iterate through global work blocks n_genes = emx.shape[0] n_total_pairs = n_genes * (n_genes - 1) // 2 index_x = 1 index_y = 0 for i in range(0, n_total_pairs, gsize): # print("%8d %8d" % (i, n_total_pairs)) # determine number of pairs n_pairs = min(gsize, n_total_pairs - i) # initialize index array index_x_ = index_x index_y_ = index_y for j in range(n_pairs): in_index_cpu[j] = index_x_, index_y_ index_x_, index_y_ = pairwise_increment(index_x_, index_y_) # copy index array to device in_index_gpu.copy_to_device(in_index_cpu) # execute similarity kernel similarity_gpu_helper[gsize // lsize, lsize]( n_pairs, in_emx, in_index_gpu, clusmethod, corrmethod, preout, postout, minexpr, maxexpr, minsamp, minclus, maxclus, criterion, work_x, work_y, work_gmm_data, work_gmm_labels, work_gmm_pi, work_gmm_mu, work_gmm_sigma, work_gmm_sigmaInv, work_gmm_normalizer, work_gmm_MP, work_gmm_counts, work_gmm_logpi, work_gmm_xm, work_gmm_Sxm, work_gmm_gamma, work_gmm_logL, work_gmm_entropy, out_K_gpu, out_labels_gpu, out_correlations_gpu ) cuda.synchronize() # copy results from device out_K_gpu.copy_to_host(out_K_cpu) out_labels_gpu.copy_to_host(out_labels_cpu) out_correlations_gpu.copy_to_host(out_correlations_cpu) # save correlation matrix to output file index_x_ = index_x index_y_ = index_y for j in range(n_pairs): # extract pairwise results K = out_K_cpu[j] labels = out_labels_cpu[j] correlations = out_correlations_cpu[j, 0:K] # save pairwise results write_pair( index_x_, index_y_, K, labels, correlations, mincorr, maxcorr, outfile) # increment pairwise index index_x_, index_y_ = pairwise_increment(index_x_, index_y_) # update local pairwise index index_x = index_x_ index_y = index_y_
Test Cuda sinusoidal_decompression kernel """ t0 = time.time() stream = cuda.stream() h, w, _ = pano.shape dim = np.array([w, h], dtype=np.uint32) img_r = np.zeros_like(pano) tmp = np.zeros_like(pano) out_v = np.zeros_like(pano) dec = np.zeros_like(pano) out = np.zeros((h, w, 4), dtype=np.int32) d_pano = cuda.to_device(np.ascontiguousarray(pano), stream=stream) d_img_r = cuda.to_device(np.ascontiguousarray(img_r), stream=stream) d_tmp = cuda.to_device(np.ascontiguousarray(tmp), stream=stream) d_dim = cuda.to_device(np.ascontiguousarray(dim), stream=stream) d_out_tmp = cuda.to_device(np.ascontiguousarray(out), stream=stream) sin_gpu.sinusoidal_compression_0[512, 512](d_pano, d_tmp, d_dim) sin_gpu.sinusoidal_compression_1[512, 512](d_tmp, d_img_r, d_dim) img_r = d_img_r.copy_to_host() pix_count = int(h * math.acos(0.5) / math.pi) img_r = img_r[:2 * pix_count + 2, :, :] d_img_r = cuda.to_device(np.ascontiguousarray(img_r), stream=stream) d_out_v = cuda.to_device(np.ascontiguousarray(out_v), stream=stream) d_dec = cuda.to_device(np.ascontiguousarray(dec), stream=stream)
def test_negative_slicing_1d(self): arr = np.arange(10) darr = cuda.to_device(arr) for i, j in product(range(-10, 10), repeat=2): np.testing.assert_array_equal(arr[i:j], darr[i:j].copy_to_host())
def main(): cu_discriminant = vectorize(['f4(f4, f4, f4)', 'f8(f8, f8, f8)'], target='cuda')(poly.discriminant) N = 1e+8 // 2 print('Data size', N) A, B, C = poly.generate_input(N, dtype=np.float32) D = np.empty(A.shape, dtype=A.dtype) stream = cuda.stream() print('== One') ts = time() with stream.auto_synchronize(): dA = cuda.to_device(A, stream) dB = cuda.to_device(B, stream) dC = cuda.to_device(C, stream) dD = cuda.to_device(D, stream, copy=False) cu_discriminant(dA, dB, dC, out=dD, stream=stream) dD.to_host(stream) te = time() total_time = (te - ts) print('Execution time %.4f' % total_time) print('Throughput %.2f' % (N / total_time)) print('== Chunked') chunksize = 1e+7 chunkcount = N // chunksize print('Chunk size', chunksize) sA = np.split(A, chunkcount) sB = np.split(B, chunkcount) sC = np.split(C, chunkcount) sD = np.split(D, chunkcount) device_ptrs = [] ts = time() with stream.auto_synchronize(): for a, b, c, d in zip(sA, sB, sC, sD): dA = cuda.to_device(a, stream) dB = cuda.to_device(b, stream) dC = cuda.to_device(c, stream) dD = cuda.to_device(d, stream, copy=False) cu_discriminant(dA, dB, dC, out=dD, stream=stream) dD.to_host(stream) device_ptrs.extend([dA, dB, dC, dD]) te = time() total_time = (te - ts) print('Execution time %.4f' % total_time) print('Throughput %.2f' % (N / total_time)) if '-verify' in sys.argv[1:]: poly.check_answer(D, A, B, C)