def test_gpuarray_shared_scalar():
    """Exercise ``gpuarray_shared_constructor`` with a 0-d float32 array."""

    def make_scalar():
        # A fresh 0-d array for every call, as the two checks are independent.
        return np.asarray(1, dtype="float32")

    # With no explicit target, a scalar is expected to be refused with a
    # TypeError instead of being placed on the GPU.
    with pytest.raises(TypeError):
        gpuarray_shared_constructor(make_scalar())

    # Naming the target explicitly forces the construction; it must not raise.
    gpuarray_shared_constructor(make_scalar(), target=test_ctx_name)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape, subsample=(1, 1, 1)):
    """Check ``GpuCorr3dMM_gradWeights`` against ``Corr3dMMGradWeights``.

    Every shape argument is first reordered with the index pattern
    ``(0, 4, 1, 2, 3)``.  Random inputs of the reordered shapes are placed in
    GPU shared variables, the reference graph is compiled with
    ``mode_without_gpu`` and the GPU graph with ``mode_with_gpu``, and the two
    results are compared with ``utt.assert_allclose``.

    Parameters
    ----------
    inputs_shape, filters_shape, dCdH_shape
        Indexable shapes with at least five entries each.
    subsample
        Subsampling handed to both ops.  When it is not ``(1, 1, 1)`` the
        trailing filter dimensions are also passed as the ``shape`` argument.
    """
    axis_order = (0, 4, 1, 2, 3)
    inputs_shape = [inputs_shape[axis] for axis in axis_order]
    filters_shape = [filters_shape[axis] for axis in axis_order]
    dCdH_shape = [dCdH_shape[axis] for axis in axis_order]

    # Draw the inputs before the output gradient so the global NumPy RNG is
    # consumed in a fixed order.
    inputs_val = np.random.random(inputs_shape).astype(config.floatX)
    dCdH_val = np.random.random(dCdH_shape).astype(config.floatX)

    inputs = gpuarray_shared_constructor(inputs_val)
    dCdH = gpuarray_shared_constructor(dCdH_val)
    # The shape variable is always created, even if it ends up unused below.
    shape = gpuarray_shared_constructor(np.array(filters_shape[2:]))

    # The explicit shape is only supplied when subsampling is not the default.
    shape_kwargs = {} if subsample == (1, 1, 1) else {"shape": shape}

    conv_ref = Corr3dMMGradWeights(subsample=subsample)(
        ref_cast(inputs), ref_cast(dCdH), **shape_kwargs
    )
    conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
        inputs, dCdH, **shape_kwargs
    )

    f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
    f = aesara.function([], conv_gemm, mode=mode_with_gpu)

    # Reference first, GPU second.
    utt.assert_allclose(f_ref(), f())
def run_gradinput(self, inputs_shape, filters_shape, subsample=(1, 1, 1)):
    """Check ``GpuCorr3dMM_gradInputs`` against ``Corr3dMMGradInputs``.

    Both shapes are reordered with the index pattern ``(0, 4, 1, 2, 3)``.
    Random data of those shapes is used as ``topgrad`` (from ``inputs_shape``)
    and ``kern`` (from ``filters_shape``).  The reference graph is compiled
    with ``mode_without_gpu``, the GPU graph with ``mode_with_gpu``, and the
    results are compared with ``utt.assert_allclose``.

    Parameters
    ----------
    inputs_shape, filters_shape
        Indexable shapes with at least five entries each.
    subsample
        Subsampling handed to both ops.  When it is not ``(1, 1, 1)`` the
        computed bottom shape is also passed as the ``shape`` argument.
    """
    axis_order = (0, 4, 1, 2, 3)
    inputs_shape = [inputs_shape[axis] for axis in axis_order]
    filters_shape = [filters_shape[axis] for axis in axis_order]

    # Draw the top gradient before the filters so the global NumPy RNG is
    # consumed in a fixed order.
    inputs_val = np.random.random(inputs_shape).astype(config.floatX)
    filters_val = np.random.random(filters_shape).astype(config.floatX)

    inputs = gpuarray_shared_constructor(inputs_val)
    filters = gpuarray_shared_constructor(filters_val)

    # One entry per trailing axis (2, 3, 4):
    #   (top_dim - 1) * stride + filter_dim
    bottom_dims = [
        (inputs_shape[axis] - 1) * subsample[axis - 2] + filters_shape[axis]
        for axis in (2, 3, 4)
    ]
    # The shape variable is always created, even if it ends up unused below.
    bottom_shape = gpuarray_shared_constructor(np.array(bottom_dims))

    # The explicit shape is only supplied when subsampling is not the default.
    shape_kwargs = {} if subsample == (1, 1, 1) else {"shape": bottom_shape}

    conv_ref = Corr3dMMGradInputs(subsample=subsample)(
        kern=ref_cast(filters), topgrad=ref_cast(inputs), **shape_kwargs
    )
    conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
        kern=filters, topgrad=inputs, **shape_kwargs
    )

    f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)
    f = aesara.function([], conv_gemm, mode=mode_with_gpu)

    # Reference first, GPU second.
    utt.assert_allclose(f_ref(), f())
def run_conv_valid(
    self,
    inputs_shape,
    filters_shape,
    border_mode="valid",
    filter_dilation=(1, 1),
    subsample=(1, 1),
    unshared=False,
    verify_grad=False,
):
    """Check ``GpuCorrMM`` against ``CorrMM`` on random data.

    ``inputs_shape`` is reordered with ``(0, 3, 1, 2)``.  ``filters_shape`` is
    reordered with ``(0, 1, 2, 5, 3, 4)`` when ``unshared`` is true and with
    ``(0, 3, 1, 2)`` otherwise.  The reference graph is compiled with
    ``mode_without_gpu``, the GPU graph with ``mode_with_gpu``, and both
    outputs are compared with ``utt.assert_allclose``.

    Parameters
    ----------
    inputs_shape, filters_shape
        Indexable shapes, reordered as described above.
    border_mode, filter_dilation, subsample, unshared
        Forwarded unchanged to both ``CorrMM`` and ``GpuCorrMM``.
    verify_grad
        When true, ``utt.verify_grad`` is additionally run on a fresh
        ``GpuCorrMM`` op with the same parameters and the raw NumPy values.
    """
    filters_order = (0, 1, 2, 5, 3, 4) if unshared else (0, 3, 1, 2)
    inputs_shape = [inputs_shape[axis] for axis in (0, 3, 1, 2)]
    filters_shape = [filters_shape[axis] for axis in filters_order]

    # Inputs are drawn before filters so the global NumPy RNG is consumed in
    # a fixed order.
    inputs_val = np.random.random(inputs_shape).astype(config.floatX)
    filters_val = np.random.random(filters_shape).astype(config.floatX)

    inputs = gpuarray_shared_constructor(inputs_val)
    filters = gpuarray_shared_constructor(filters_val)

    # Same construction parameters for the CPU op, the GPU op, and the op
    # used for gradient verification.
    op_params = dict(
        border_mode=border_mode,
        filter_dilation=filter_dilation,
        subsample=subsample,
        unshared=unshared,
    )

    conv_ref = CorrMM(**op_params)(ref_cast(inputs), ref_cast(filters))
    f_ref = aesara.function([], conv_ref, mode=mode_without_gpu)

    conv = GpuCorrMM(**op_params)(inputs, filters)
    f = aesara.function([], conv, mode=mode_with_gpu)

    # Reference first, GPU second.
    utt.assert_allclose(f_ref(), f())

    if not verify_grad:
        return

    utt.verify_grad(
        GpuCorrMM(**op_params),
        [inputs_val, filters_val],
        mode=mode_with_gpu,
    )
def test_overflow_gpu_new_backend():
    """Run ``rng_mrg_overflow`` on ``GPUA_mrg_uniform.new`` with several sizes."""
    seed = 12345
    n_substreams = 7

    # Build one state row per substream; each row is derived from the
    # previous one with ``rng_mrg.ff_2p72``.
    base_state = np.array([seed] * 6, dtype="int32")
    state_rows = [base_state.copy()]
    for _ in range(n_substreams - 1):
        state_rows.append(rng_mrg.ff_2p72(state_rows[-1]))
    rstate = gpuarray_shared_constructor(np.asarray(state_rows))

    fct = functools.partial(GPUA_mrg_uniform.new, rstate, ndim=None, dtype="float32")

    cases = (
        # should raise error as the size overflows
        (
            [
                (2**31,),
                (2**32,),
                (2**15, 2**16),
                (2, 2**15, 2**15),
            ],
            True,
        ),
        # should not raise error
        (
            [(2**5,), (2**5, 2**5), (2**5, 2**5, 2**5)],
            False,
        ),
        # should support int32 sizes
        (
            [
                (np.int32(2**10),),
                (np.int32(2), np.int32(2**10), np.int32(2**10)),
            ],
            False,
        ),
    )
    for sizes, expect_error in cases:
        rng_mrg_overflow(sizes, fct, mode, should_raise_error=expect_error)
def test_set_value_non_contiguous():
    """Assigning a new value over a non-contiguous internal buffer must work."""
    initial = np.asarray([[1.0, 2.0], [1.0, 2.0], [5, 6]])
    s = gpuarray_shared_constructor(initial)

    # Replace the stored value by a strided view (every other row) of itself.
    internal = s.get_value(borrow=True, return_internal_type=True)
    s.set_value(internal[::2], borrow=True)

    # The stored value must now be non C-contiguous for the test to be
    # meaningful.
    stored = s.get_value(borrow=True, return_internal_type=True)
    assert not stored.flags["C_CONTIGUOUS"]

    # In the past, this failed
    s.set_value([[0, 0], [1, 1]])
def test_incsub_offset():
    """Regression test for https://github.com/Theano/Theano/issues/5670."""
    floatX = aesara.config.floatX

    # Slicing the shared variable gives a GPU variable (x1) whose value has
    # an offset.
    x = gpuarray_shared_constructor(np.zeros(5, dtype=floatX))
    x1 = x[1:]

    # Apply inc_subtensor to a further slice of it.
    y = vector()
    z = inc_subtensor(x1[2:], y)

    # The update on x is there so that inc_subtensor can happen inplace.
    f = aesara.function([y], z, updates={x: z}, mode=mode_with_gpu)

    result = f([1, 2])
    expected = np.array([0, 0, 1, 2], dtype=floatX)
    utt.assert_allclose(result, expected)
def test_consistency_GPUA_parallel():
    """Compare parallel ``GPUA_mrg_uniform`` draws with ``java_samples``.

    The numbers generated in parallel must match the reference (Java)
    implementation by L'Ecuyer et al.
    """
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel

    samples = []
    curr_rstate = np.array([seed] * 6, dtype="int32")

    for _ in range(n_streams):
        # One state row per substream, each obtained from the previous row.
        state_rows = [curr_rstate.copy()]
        for _ in range(n_substreams - 1):
            state_rows.append(rng_mrg.ff_2p72(state_rows[-1]))
        rstate = gpuarray_shared_constructor(np.asarray(state_rows))

        new_rstate, sample = GPUA_mrg_uniform.new(
            rstate, ndim=None, dtype="float32", size=(n_substreams,)
        )
        rstate.default_update = new_rstate

        # Not really necessary, just mimicking
        # rng_mrg.MRG_RandomStream' behavior
        sample.rstate = rstate
        sample.update = (rstate, new_rstate)

        # We need the sample back in the main memory
        cpu_sample = at.as_tensor_variable(sample)
        f = aesara.function([], cpu_sample, mode=mode)

        # Successive calls advance the state through the default update.
        stream_samples = [f() for _ in range(n_samples)]
        samples.append(np.array(stream_samples).T.flatten())

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    flat_samples = np.array(samples).flatten()
    assert np.allclose(flat_samples, java_samples)
def test_consistency_GPUA_serial():
    """Compare serial ``GPUA_mrg_uniform`` draws with ``java_samples``.

    The numbers generated serially must match the reference (Java)
    implementation by L'Ecuyer et al.
    """
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    samples = []
    curr_rstate = np.array([seed] * 6, dtype="int32")

    for _ in range(n_streams):
        stream_rstate = curr_rstate.copy()

        for _ in range(n_substreams):
            # A single-row state for this substream.
            substream_rstate = np.array([stream_rstate.copy()], dtype="int32")

            # Transfer to device
            rstate = gpuarray_shared_constructor(substream_rstate)

            new_rstate, sample = GPUA_mrg_uniform.new(
                rstate, ndim=None, dtype="float32", size=(1,)
            )
            rstate.default_update = new_rstate

            # Not really necessary, just mimicking
            # rng_mrg.MRG_RandomStreams' behavior
            sample.rstate = rstate
            sample.update = (rstate, new_rstate)

            # We need the sample back in the main memory
            cpu_sample = tensor.as_tensor_variable(sample)
            f = aesara.function([], cpu_sample, mode=mode)

            # Successive calls advance the state through the default update.
            samples.extend(f() for _ in range(n_samples))

            # next substream
            stream_rstate = rng_mrg.ff_2p72(stream_rstate)

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    flat_samples = np.array(samples).flatten()
    assert np.allclose(flat_samples, java_samples)
def test_cpu_target_with_shared_variable():
    """``target="cpu"`` must keep ``GPUA_mrg_uniform`` out of the compiled graph."""
    srng = MRG_RandomStream()
    host_value = np.random.rand(2, 3).astype("float32")
    x = gpuarray_shared_constructor(host_value, name="x")

    try:
        # To have aesara.shared(x) try to move on the GPU
        aesara.compile.shared_constructor(gpuarray_shared_constructor)

        y = srng.uniform(x.shape, target="cpu")
        y.name = "y"

        z = (x * y).sum()
        z.name = "z"

        fz = aesara.function([], z, mode=mode)

        # No node of the optimized graph may use the GPU uniform op.
        for node in fz.maker.fgraph.toposort():
            assert not isinstance(node.op, GPUA_mrg_uniform)
    finally:
        # Always unregister the constructor again, even if the test failed.
        aesara.compile.shared_constructor(gpuarray_shared_constructor, remove=True)
def test_Gpujoin_inplace():
    """Test Gpujoin to work inplace.

    Several elements are passed to Gpujoin but all except one of them are
    empty.  In this case Gpujoin should work inplace and the output should be
    the view of the non-empty element.
    """
    s = tt.lscalar()
    data = np.array([3, 4, 5], dtype=aesara.config.floatX)
    x = gpuarray_shared_constructor(data, borrow=True)
    z = tt.zeros((s,))

    c = GpuJoin(view=0)(0, x, z)
    f = aesara.function([s], aesara.Out(c, borrow=True))

    # The identity check is skipped when the GPU mode is a DebugMode.
    running_debug_mode = isinstance(mode_with_gpu, aesara.compile.DebugMode)
    if not running_debug_mode:
        internal_value = x.get_value(borrow=True, return_internal_type=True)
        assert internal_value is f(0)

    assert np.allclose(f(0), [3, 4, 5])
def test_GPUA_full_fill():
    """Make sure the whole sample buffer is filled.

    Also make sure large samples are consistent with CPU results.
    """
    # This needs to be large to trigger the problem on GPU
    size = (10, 1000)

    cpu_stream = MRG_RandomStream(234)
    uni = cpu_stream.uniform(size, nstreams=60 * 256)
    f_cpu = aesara.function([], uni)

    # Start the GPU generator from the value of the state variable in the
    # stream's most recent state update.
    host_state = cpu_stream.state_updates[-1][0].get_value()
    rstate_gpu = gpuarray_shared_constructor(host_state)

    new_rstate, sample = GPUA_mrg_uniform.new(
        rstate_gpu, ndim=None, dtype="float32", size=size
    )
    rstate_gpu.default_update = new_rstate
    f_gpu = aesara.function([], sample, mode=mode)

    # CPU first, GPU second.
    utt.assert_allclose(f_cpu(), f_gpu())
def test_elemwise_pow():
    """Test that GpuElemwise(pow) compiles for every integer/float dtype pair."""
    unsigned_dtypes = ["uint8", "uint16", "uint32", "uint64"]
    signed_dtypes = ["int8", "int16", "int32", "int64"]
    floating_dtypes = ["float16", "float32", "float64"]
    dtypes = unsigned_dtypes + signed_dtypes + floating_dtypes

    for dtype_base in dtypes:
        for dtype_exp in dtypes:
            # Compile a gpu function with the specified dtypes
            base_val = np.random.randint(0, 5, size=10).astype(dtype_base)
            exp_val = np.random.randint(0, 3, size=10).astype(dtype_exp)

            base = vector(dtype=dtype_base)
            exp = gpuarray_shared_constructor(exp_val)
            assert exp.dtype == dtype_exp

            output = base**exp
            f = aesara.function([base], output, mode=mode_with_gpu)

            # We don't transfer to the GPU when the output dtype is int*
            gpu_elemwise_count = sum(
                1
                for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuElemwise)
            )
            assert gpu_elemwise_count == (output.dtype in float_dtypes)

            # Call the function to make sure the output is valid
            assert_allclose(f(base_val), base_val**exp_val)
def test_blocksparse_grad_merge(self):
    """Check that merging the learning-rate update does not change the result.

    The same update of ``W`` is compiled twice: once with the default GPU
    mode and once with the two ``local_merge_blocksparse_*`` optimizations
    excluded.  Both compiled functions must leave the same value in ``W``.
    """
    b = fmatrix()
    h = ftensor3()
    iIdx = lmatrix()
    oIdx = lmatrix()
    symbolic_inputs = [h, iIdx, b, oIdx]

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
    call_args = (h_val, iIdx_val, b_val, oIdx_val)

    # NOTE(review): other call sites in this file pass ``target=`` rather
    # than ``context=`` -- confirm this keyword is accepted.
    W = gpuarray_shared_constructor(W_val, context=test_ctx_name)

    o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
    gW = aesara.grad(o.sum(), W)

    lr = np.asarray(0.05, dtype="float32")
    upd = W - lr * gW

    f1 = aesara.function(symbolic_inputs, updates=[(W, upd)], mode=mode_with_gpu)

    # Make sure the lr update was merged.
    merged_op = f1.maker.fgraph.outputs[0].owner.op
    assert isinstance(merged_op, GpuSparseBlockOuter)

    # Exclude the merge optimizations.
    mode = mode_with_gpu.excluding("local_merge_blocksparse_alpha")
    mode = mode.excluding("local_merge_blocksparse_output")

    f2 = aesara.function(symbolic_inputs, updates=[(W, upd)], mode=mode)

    # Make sure the lr update is not merged.
    unmerged_op = f2.maker.fgraph.outputs[0].owner.op
    assert not isinstance(unmerged_op, GpuSparseBlockOuter)

    f2(*call_args)
    W_ref = W.get_value()

    # reset the var
    W.set_value(W_val)

    f1(*call_args)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)
def shared(x, **kwargs):
    """Build a GPU shared variable for ``x`` with ``target=test_ctx_name``.

    All keyword arguments are forwarded unchanged to
    ``gpuarray_shared_constructor``.
    """
    variable = gpuarray_shared_constructor(x, target=test_ctx_name, **kwargs)
    return variable
def test_validate_input_types_gpuarray_backend():
    """Build an ``mrg_uniform`` node on a GPU state with test values enabled.

    The test passes when no exception is raised while
    ``compute_test_value`` is set to ``"raise"``.
    """
    with config.change_flags(compute_test_value="raise"):
        host_state = np.zeros((7, 6), dtype="int32")
        rstate = gpuarray_shared_constructor(host_state)
        rng_mrg.mrg_uniform.new(rstate, ndim=None, dtype="float32", size=(3,))