def test_gemv_dot_strides():
    # Reported in https://github.com/Theano/Theano/issues/6142
    xv = rand(5)
    yv = rand(5, 1)
    x = gpuarray_shared_constructor(xv)
    y = gpuarray_shared_constructor(yv, broadcastable=(False, True))
    f = aesara.function([], dot(x, y[::-1]), mode=mode_with_gpu)
    out = f()
    utt.assert_allclose(out, np.dot(xv, yv[::-1]))
def test_float16():
    # gemv (gemm called)
    float16_data = [
        rand(3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    float16_shared = [
        gpuarray_shared_constructor(val, target=test_ctx_name) for val in float16_data
    ]
    o = gemv(*float16_shared)
    f = aesara.function([], o, mode=mode_with_gpu)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)
    topo = f.maker.fgraph.toposort()
    assert any(isinstance(n.op, GpuGemm) for n in topo)

    # gemm
    float16_data = [
        rand(3, 3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3, 3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    float16_shared = [
        gpuarray_shared_constructor(val, target=test_ctx_name) for val in float16_data
    ]
    o = gpugemm_no_inplace(*float16_shared)
    f = aesara.function([], o)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)

    # dot22
    float16_data = [rand(3, 3).astype("float16"), rand(3, 3).astype("float16")]
    float16_shared = [gpuarray_shared_constructor(val) for val in float16_data]
    o = gpu_dot22(*float16_shared)
    f = aesara.function([], o)
    x, y = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), np.dot(x, y))
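# Plain-NumPy sketch (illustrative only, not called by the tests) of the
# BLAS-style contract that the float16 assertions above check:
# gemv(y, alpha, A, x, beta) computes alpha * A @ x + beta * y, and gemm
# follows the same contract with a matrix in place of the vector x. The
# helper name is ours, not part of the Aesara API.
def _gemv_gemm_reference(y, alpha, A, x, beta):
    # np.dot covers both the matrix-vector (gemv) and matrix-matrix
    # (gemm) cases.
    return alpha * np.dot(A, x) + beta * y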
def test_gpu_cholesky_inplace(self):
    A = self.rand_symmetric(1000)
    A_gpu = gpuarray_shared_constructor(A)
    A_copy = A_gpu.get_value()
    C = GpuMagmaCholesky()(A_gpu)
    fn = aesara.function([], C, mode=mode_with_gpu, updates=[(A_gpu, C)])
    assert any(
        node.op.inplace
        for node in fn.maker.fgraph.toposort()
        if isinstance(node.op, GpuMagmaCholesky)
    )
    fn()
    # After the update, A_gpu holds the Cholesky factor L, so L @ L.T
    # must reconstruct the original matrix.
    L = A_gpu.get_value()
    utt.assert_allclose(np.dot(L, L.T), A_copy, atol=1e-3)
def test_gpu_matrix_inverse_inplace(self):
    N = 1000
    test_rng = np.random.default_rng(seed=1)
    A_val_gpu = gpuarray_shared_constructor(
        test_rng.random((N, N)).astype("float32") * 2 - 1
    )
    A_val_copy = A_val_gpu.get_value()
    A_val_gpu_inv = GpuMagmaMatrixInverse()(A_val_gpu)
    fn = aesara.function(
        [], A_val_gpu_inv, mode=mode_with_gpu, updates=[(A_val_gpu, A_val_gpu_inv)]
    )
    assert any(
        node.op.inplace
        for node in fn.maker.fgraph.toposort()
        if isinstance(node.op, GpuMagmaMatrixInverse)
    )
    fn()
    # After the update, A_val_gpu holds inv(A), so inv(A) @ A must be
    # close to the identity.
    utt.assert_allclose(
        np.eye(N), np.dot(A_val_gpu.get_value(), A_val_copy), atol=5e-3
    )
def shared(val):
    # Prefer a GPU shared variable; fall back to a plain CPU shared
    # variable when the GPU backend rejects the value's type.
    try:
        return gpuarray_shared_constructor(val)
    except TypeError:
        return aesara.shared(val)
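# Minimal usage sketch of the `shared` helper above (illustrative only;
# the leading underscore keeps pytest from collecting it): whichever
# constructor succeeds, the stored value must round-trip unchanged.
def _shared_roundtrip_example():
    val = np.arange(6, dtype="float32").reshape(2, 3)
    s = shared(val)  # GPU shared variable if supported, CPU otherwise
    utt.assert_allclose(np.asarray(s.get_value()), val)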
def test_sync_update():
    # Test that sync_update works. This can only be tested when there
    # is a GPU. To check that we really sync, we compare a case where
    # GPU and CPU computation can run in parallel against one where we
    # sync to disable that parallelism, and assert that the synced
    # version takes longer.

    # This import needs to go first because it binds the local
    # 'aesara' variable; you get an UnboundLocalError otherwise.
    import tests.gpuarray.config

    sizes = [100, 500, 1000, 2000, 5000, 10000, 20000, 40000]
    size = sizes[0]
    w = gpuarray_shared_constructor(
        np.random.rand(size, size).astype("float32"),
        "w",
        target=tests.gpuarray.config.test_ctx_name,
    )
    x = gpuarray_shared_constructor(
        np.random.rand(size, size).astype("float32"),
        "x",
        target=tests.gpuarray.config.test_ctx_name,
    )
    updates = [(w, w + np.asarray(0.001, "float32") * dot(x, x))]
    f = function([], updates=updates, mode=tests.gpuarray.config.mode_with_gpu)
    assert len(f.maker.fgraph.apply_nodes) == 1
    assert any(isinstance(n.op, GpuGemm) for n in f.maker.fgraph.apply_nodes)

    # Make sure libgpuarray has compiled all kernels.
    f()
    f.sync_shared()

    # Find a size whose call takes about 0.5 s, to make the test more
    # stable across different GPUs.
    size = sizes[-1]
    for i in sizes:
        data = np.random.rand(i, i).astype("float32")
        w.set_value(data)
        x.set_value(data)
        t0 = time.time()
        f()
        f.sync_shared()
        t1 = time.time()
        if (t1 - t0) < 0.5:
            continue
        size = i
        break

    # Sync to make sure all computation is done.
    f.sync_shared()
    t_0 = time.time()
    for i in range(3):
        f()
        # Sync after each call to see the slowdown from syncing.
        f.sync_shared()
        time.sleep(0.5)
    t_1 = time.time()
    for i in range(3):
        f()
        time.sleep(0.5)
    # Sync to make sure all computation is finished.
    f.sync_shared()
    t_2 = time.time()
    d1 = t_1 - t_0
    d2 = t_2 - t_1
    assert d1 > d2, (d1, d2)