def check_cholesky(self, N, lower=True, rtol=None, atol=None):
    A = self.rand_symmetric(N)
    L = self.run_gpu_cholesky(A, lower=lower)
    if not lower:
        L = L.T
    utt.assert_allclose(np.dot(L, L.T), A, rtol=rtol, atol=atol)
def test_conv3d(border_mode):
    if aesara.config.mode == "FAST_COMPILE":
        mode = aesara.compile.mode.get_mode("FAST_RUN")
    else:
        mode = aesara.compile.mode.get_default_mode()

    Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
    Nf, Tf, C, Hf, Wf = 32, 5, 3, 5, 5

    signals = (
        np.arange(Ns * Ts * C * Hs * Ws).reshape(Ns, Ts, C, Hs, Ws).astype("float32")
    )
    filters = (
        np.arange(Nf * Tf * C * Hf * Wf).reshape(Nf, Tf, C, Hf, Wf).astype("float32")
    )

    # t0 = time.time()
    pyres = pyconv3d(signals, filters, border_mode)
    # print(time.time() - t0)

    s_signals = shared(signals)
    s_filters = shared(filters)
    s_output = shared(signals * 0)

    out = conv3d(
        s_signals,
        s_filters,
        signals_shape=signals.shape,
        filters_shape=filters.shape,
        border_mode=border_mode,
    )

    newconv3d = aesara.function([], [], updates={s_output: out}, mode=mode)

    check_diagonal_subtensor_view_traces(newconv3d)
    # t0 = time.time()
    newconv3d()
    # print(time.time() - t0)
    utt.assert_allclose(pyres, s_output.get_value(borrow=True))
    gsignals, gfilters = aesara.grad(out.sum(), [s_signals, s_filters])
    gnewconv3d = aesara.function(
        [],
        [],
        updates=[(s_filters, gfilters), (s_signals, gsignals)],
        mode=mode,
        name="grad",
    )
    check_diagonal_subtensor_view_traces(gnewconv3d)

    # t0 = time.time()
    gnewconv3d()
    # print("grad", time.time() - t0)

    Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5
    Nf, Tf, C, Hf, Wf = 4, 2, 3, 2, 2

    rng = np.random.default_rng(280284)

    signals = rng.random((Ns, Ts, C, Hs, Ws)).astype("float32")
    filters = rng.random((Nf, Tf, C, Hf, Wf)).astype("float32")
    utt.verify_grad(
        lambda s, f: conv3d(s, f, border_mode=border_mode),
        [signals, filters],
        eps=1e-1,
        mode=mode,
    )

    # Additional Test that covers the case of patched implementation for filter with Tf=1
    Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
    Nf, Tf, C, Hf, Wf = 32, 1, 3, 5, 5

    signals = (
        np.arange(Ns * Ts * C * Hs * Ws).reshape(Ns, Ts, C, Hs, Ws).astype("float32")
    )
    filters = (
        np.arange(Nf * Tf * C * Hf * Wf).reshape(Nf, Tf, C, Hf, Wf).astype("float32")
    )

    # t0 = time.time()
    pyres = pyconv3d(signals, filters, border_mode)
    # print(time.time() - t0)

    s_signals = shared(signals)
    s_filters = shared(filters)
    s_output = shared(signals * 0)

    out = conv3d(
        s_signals,
        s_filters,
        signals_shape=signals.shape,
        filters_shape=filters.shape,
        border_mode=border_mode,
    )

    newconv3d = aesara.function([], [], updates={s_output: out}, mode=mode)

    # t0 = time.time()
    newconv3d()
    # print(time.time() - t0)
    utt.assert_allclose(pyres, s_output.get_value(borrow=True))
    gsignals, gfilters = aesara.grad(out.sum(), [s_signals, s_filters])
    gnewconv3d = aesara.function(
        [],
        [],
        updates=[(s_filters, gfilters), (s_signals, gsignals)],
        mode=mode,
        name="grad",
    )

    # t0 = time.time()
    gnewconv3d()
    # print("grad", time.time() - t0)

    Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5
    Nf, Tf, C, Hf, Wf = 4, 1, 3, 2, 2

    signals = rng.random((Ns, Ts, C, Hs, Ws)).astype("float32")
    filters = rng.random((Nf, Tf, C, Hf, Wf)).astype("float32")
    utt.verify_grad(
        lambda s, f: conv3d(s, f, border_mode=border_mode),
        [signals, filters],
        eps=1e-1,
        mode=mode,
    )
def test_one_sequence_one_output_weights_gpu2(self):
    def f_rnn(u_t, x_tm1, W_in, W):
        return u_t * W_in + x_tm1 * W

    u = fvector("u")
    x0 = fscalar("x0")
    W_in = fscalar("win")
    W = fscalar("w")
    output, updates = scan(
        f_rnn,
        u,
        x0,
        [W_in, W],
        n_steps=None,
        truncate_gradient=-1,
        go_backwards=False,
        mode=self.mode_with_gpu,
    )

    f2 = aesara.function(
        [u, x0, W_in, W],
        output,
        updates=updates,
        allow_input_downcast=True,
        mode=self.mode_with_gpu,
    )

    # get random initial values
    rng = np.random.default_rng(utt.fetch_seed())
    v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
    v_x0 = rng.uniform()
    W = rng.uniform()
    W_in = rng.uniform()

    # compute the output in numpy
    v_out = np.zeros((4,))
    v_out[0] = v_u[0] * W_in + v_x0 * W
    for step in range(1, 4):
        v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

    aesara_values = f2(v_u, v_x0, W_in, W)
    utt.assert_allclose(aesara_values, v_out)

    topo = f2.maker.fgraph.toposort()
    assert (
        sum([isinstance(node.op, self.gpu_backend.HostFromGpu) for node in topo]) == 1
    )
    assert (
        sum([isinstance(node.op, self.gpu_backend.GpuFromHost) for node in topo]) == 4
    )

    scan_node = [node for node in topo if isinstance(node.op, Scan)]
    assert len(scan_node) == 1
    scan_node = scan_node[0]
    scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

    # check that there is no gpu transfer in the inner loop.
    assert any(
        [isinstance(node.op, self.gpu_backend.GpuElemwise) for node in scan_node_topo]
    )
    assert not any(
        [isinstance(node.op, self.gpu_backend.HostFromGpu) for node in scan_node_topo]
    )
    assert not any(
        [isinstance(node.op, self.gpu_backend.GpuFromHost) for node in scan_node_topo]
    )
def check_svd(self, A, U, S, VT, rtol=None, atol=None):
    S_m = np.zeros_like(A)
    np.fill_diagonal(S_m, S)
    utt.assert_allclose(np.dot(np.dot(U, S_m), VT), A, rtol=rtol, atol=atol)
def test_one_sequence_one_output_weights_gpu1(self):
    def f_rnn(u_t, x_tm1, W_in, W):
        return u_t * W_in + x_tm1 * W

    u = theano.tensor.fvector("u")
    x0 = theano.tensor.fscalar("x0")
    W_in = theano.tensor.fscalar("win")
    W = theano.tensor.fscalar("w")

    # The following line is needed so that the first case is used.
    # Otherwise, it is the second case that is tested.
    mode = self.mode_with_gpu.excluding("InputToGpuOptimizer")
    output, updates = scan(
        f_rnn,
        u,
        x0,
        [W_in, W],
        n_steps=None,
        truncate_gradient=-1,
        go_backwards=False,
        mode=mode,
    )

    output = self.gpu_backend.gpu_from_host(output)
    f2 = theano.function(
        [u, x0, W_in, W],
        output,
        updates=updates,
        allow_input_downcast=True,
        mode=self.mode_with_gpu,
    )

    # get random initial values
    rng = np.random.RandomState(utt.fetch_seed())
    v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
    v_x0 = rng.uniform()
    W = rng.uniform()
    W_in = rng.uniform()

    v_u = np.asarray(v_u, dtype="float32")
    v_x0 = np.asarray(v_x0, dtype="float32")
    W = np.asarray(W, dtype="float32")
    W_in = np.asarray(W_in, dtype="float32")

    # compute the output in numpy
    v_out = np.zeros((4,))
    v_out[0] = v_u[0] * W_in + v_x0 * W
    for step in range(1, 4):
        v_out[step] = v_u[step] * W_in + v_out[step - 1] * W
    theano_values = f2(v_u, v_x0, W_in, W)
    utt.assert_allclose(theano_values, v_out)

    # TO DEL
    topo = f2.maker.fgraph.toposort()
    scan_node = [node for node in topo if isinstance(node.op, Scan)]
    assert len(scan_node) == 1
    scan_node = scan_node[0]

    topo = f2.maker.fgraph.toposort()
    assert (
        sum([isinstance(node.op, self.gpu_backend.HostFromGpu) for node in topo]) == 0
    )
    assert (
        sum([isinstance(node.op, self.gpu_backend.GpuFromHost) for node in topo]) == 4
    )

    scan_node = [node for node in topo if isinstance(node.op, Scan)]
    assert len(scan_node) == 1
    scan_node = scan_node[0]
    scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

    # check that there is no gpu transfer in the inner loop.
    assert any(
        [isinstance(node.op, self.gpu_backend.GpuElemwise) for node in scan_node_topo]
    )
    assert not any(
        [isinstance(node.op, self.gpu_backend.HostFromGpu) for node in scan_node_topo]
    )
    assert not any(
        [isinstance(node.op, self.gpu_backend.GpuFromHost) for node in scan_node_topo]
    )
def test_gpu3_mixture_dtype_outputs(self):
    def f_rnn(u_t, x_tm1, W_in, W):
        return (u_t * W_in + x_tm1 * W, at.cast(u_t + x_tm1, "int64"))

    u = fvector("u")
    x0 = fscalar("x0")
    W_in = fscalar("win")
    W = fscalar("w")
    output, updates = scan(
        f_rnn,
        u,
        [x0, None],
        [W_in, W],
        n_steps=None,
        truncate_gradient=-1,
        go_backwards=False,
        mode=mode_with_gpu,
    )

    f2 = aesara.function(
        [u, x0, W_in, W],
        output,
        updates=updates,
        allow_input_downcast=True,
        mode=mode_with_gpu,
    )

    # get random initial values
    rng = np.random.default_rng(utt.fetch_seed())
    v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
    v_x0 = rng.uniform()
    W = rng.uniform()
    W_in = rng.uniform()

    # compute the output in numpy
    v_out1 = np.zeros((4,))
    v_out2 = np.zeros((4,), dtype="int64")
    v_out1[0] = v_u[0] * W_in + v_x0 * W
    v_out2[0] = v_u[0] + v_x0
    for step in range(1, 4):
        v_out1[step] = v_u[step] * W_in + v_out1[step - 1] * W
        v_out2[step] = np.int64(v_u[step] + v_out1[step - 1])

    aesara_out1, aesara_out2 = f2(v_u, v_x0, W_in, W)
    utt.assert_allclose(aesara_out1, v_out1)
    utt.assert_allclose(aesara_out2, v_out2)

    topo = f2.maker.fgraph.toposort()
    scan_node = [node for node in topo if isinstance(node.op, scan.op.Scan)]
    assert len(scan_node) == 1
    scan_node = scan_node[0]
    assert scan_node.op.gpua

    scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

    # check that there is no gpu transfer in the inner loop.
    assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
    assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)
def _cmp(self, n, m, f, f_gpu):
    data = np.arange(n * m, dtype="float32").reshape(n, m)
    out = f(data)
    gout = f_gpu(data)
    utt.assert_allclose(out, gout)
def run_conv_fwd(self, algo, dtype, precision, parameters):
    (
        inputs_shape,
        filters_shape,
        subsample,
        dilation,
        border_mode,
        conv_mode,
        alpha,
        beta,
    ) = parameters

    inputs_val = np.random.random(inputs_shape).astype(dtype)
    filters_val = np.random.random(filters_shape).astype(dtype)

    # Scale down the input values to prevent very large absolute errors
    # due to float rounding
    inputs_val /= 10
    filters_val /= 10

    inputs = aesara.shared(inputs_val)
    filters = aesara.shared(filters_val)

    if beta == 0:
        out = None
    else:
        out = self.array_like_conv_output(
            inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
        )
        out /= 10

    # Compile an Aesara function for the cuDNN implementation
    conv = dnn_conv(
        img=inputs,
        kerns=filters,
        alpha=alpha,
        beta=beta,
        out=out,
        border_mode=border_mode,
        subsample=subsample,
        dilation=dilation,
        conv_mode=conv_mode,
        algo=algo,
        precision=precision,
    )
    f = aesara.function([], conv, mode=mode_with_gpu)

    # If conv_mode is 'conv' the reference implementation should use
    # filters flipped according to the width, height and time axis
    if conv_mode == "conv":
        if inputs.ndim == 5:
            flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
        else:
            flipped_filters = filters[:, :, ::-1, ::-1]
    else:
        flipped_filters = filters

    # Compile an Aesara function for the reference implementation
    conv_ref = self.cpu_conv_class(
        border_mode=border_mode, subsample=subsample, filter_dilation=dilation
    )(ref_cast(inputs), flipped_filters)
    f_ref = aesara.function([], conv_ref, mode="FAST_RUN")

    # Compare the results of the two implementations
    res_ref = f_ref()
    res = np.asarray(f())

    if algo in cudnn.deterministic_fwd_algorithms:
        utt.assert_allclose(res, np.asarray(f()))

    atol, rtol = self.get_atol_rtol(algo, dtype, precision)
    if beta == 0:
        cpu_res = alpha * res_ref
    else:
        cpu_res = alpha * res_ref + beta * out
    self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
    utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)
def test_numpy_method(fct, value):
    x = dscalar("x")
    y = fct(x)
    f = aesara.function([x], y)
    utt.assert_allclose(np.nan_to_num(f(value)), np.nan_to_num(fct(value)))
def cmp(n, m):
    data = np.random.uniform(0, 1, (n, m)).astype(dtype=dtypeInput)
    out = f(data)
    gout = f_gpu(data)
    utt.assert_allclose(out, gout)
def test_batch_normalization():
    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    np.random.seed(1234)
    X = 1 + np.random.random([10, 20]).astype("float32")
    B = 1 + np.random.random([20]).astype("float32")
    G = 1 + np.random.random([20]).astype("float32")
    M = 1 + np.random.random([20]).astype("float32")
    V = 1 + np.random.random([20]).astype("float32")

    x = matrix("x")
    b = vector("b")
    g = vector("g")
    m = vector("m")
    v = vector("v")

    bn_ref_op = bn_ref(x, g, b, m, v)
    f_ref = aesara.function([x, g, b, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ["low_mem", "high_mem"]:
        bn_op = batchnorm.batch_normalization(x, g, b, m, v, mode=mode)
        f = aesara.function([x, g, b, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return batchnorm.batch_normalization(
                inputs, gamma, beta, mean, std, mode=mode
            )

        utt.verify_grad(bn_f, [X, G, B, M, V])

    bn_ref_op = bn_ref(
        x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True)
    )
    f_ref = aesara.function([x, b, g], [bn_ref_op])
    res_ref = f_ref(X, G, B)
    for mode in ["low_mem", "high_mem"]:
        bn_op = batchnorm.batch_normalization(
            x,
            g,
            b,
            x.mean(axis=0, keepdims=True),
            x.std(axis=0, keepdims=True),
            mode=mode,
        )
        f = aesara.function([x, b, g], [bn_op])
        res = f(X, G, B)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return batchnorm.batch_normalization(
                inputs, gamma, beta, mean, std, mode=mode
            )

        utt.verify_grad(
            bn_f, [X, G, B, X.mean(axis=0)[np.newaxis], X.std(axis=0)[np.newaxis]]
        )
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    # This is a basic test for GpuCrossentropySoftmaxArgmax1HotWithBias.
    # We check that we loop when there are too many threads.

    n_in = 1000
    batch_size = 4097
    n_out = 1250

    if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
        n_in = 4098
        n_out = 4099

    y = lvector("y")

    b = fvector("b")

    # We precompute the dot product with a big shape beforehand so that the
    # test of GpuCrossentropySoftmax1HotWithBiasDx does not fail with the
    # error "the launch timed out and was terminated" on GPU cards that are
    # not powerful enough. We need the big shape to check for the corner
    # case.
    dot_result = fmatrix("dot_result")

    xx = np.asarray(np.random.rand(batch_size, n_in), dtype=np.float32)
    yy = np.ones((batch_size,), dtype="int32")
    b_values = np.zeros((n_out,), dtype="float32")
    W_values = np.asarray(np.random.rand(n_in, n_out), dtype="float32")

    dot_value = np.asarray(np.dot(xx, W_values), dtype="float32")
    del W_values
    p_y_given_x = aesara.tensor.nnet.softmax(dot_result + b)
    y_pred = argmax(p_y_given_x, axis=-1)
    loss = -mean(log(p_y_given_x)[at.arange(y.shape[0]), y])
    dW = grad(loss, dot_result)
    classify = aesara.function(
        inputs=[y, b, dot_result], outputs=[loss, y_pred, dW], mode=mode_without_gpu
    )
    classify_gpu = aesara.function(
        inputs=[y, b, dot_result], outputs=[loss, y_pred, dW], mode=mode_with_gpu
    )

    assert any(
        [
            isinstance(
                node.op, aesara.tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias
            )
            for node in classify.maker.fgraph.toposort()
        ]
    )
    assert any(
        [
            isinstance(node.op, GpuCrossentropySoftmaxArgmax1HotWithBias)
            for node in classify_gpu.maker.fgraph.toposort()
        ]
    )

    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)

    assert len(out) == len(gout) == 3
    utt.assert_allclose(out[0], gout[0])
    utt.assert_allclose(out[2], gout[2], atol=3e-6)
    utt.assert_allclose(out[1], gout[1])
def test_h_softmax():
    # Tests the output dimensions of the h_softmax when a target is provided
    # or not.

    input_size = 4
    batch_size = 2
    h_softmax_level1_size = 5
    h_softmax_level2_size = 3
    output_size = h_softmax_level1_size * h_softmax_level2_size

    # First level of h_softmax
    W1 = np.asarray(
        np.random.normal(size=(input_size, h_softmax_level1_size)), dtype=config.floatX
    )
    W1 = aesara.shared(W1)
    b1 = aesara.shared(
        np.asarray(np.zeros((h_softmax_level1_size,)), dtype=config.floatX)
    )

    # Second level of h_softmax
    W2 = np.asarray(
        np.random.normal(
            size=(h_softmax_level1_size, input_size, h_softmax_level2_size)
        ),
        dtype=config.floatX,
    )
    W2 = aesara.shared(W2)
    b2 = aesara.shared(
        np.asarray(
            np.zeros((h_softmax_level1_size, h_softmax_level2_size)),
            dtype=config.floatX,
        )
    )

    x = matrix("x")
    y = ivector("y")

    # This only computes the output corresponding to the target
    y_hat_tg = h_softmax(
        x,
        batch_size,
        output_size,
        h_softmax_level1_size,
        h_softmax_level2_size,
        W1,
        b1,
        W2,
        b2,
        y,
    )

    # This computes all the outputs
    y_hat_all = h_softmax(
        x,
        batch_size,
        output_size,
        h_softmax_level1_size,
        h_softmax_level2_size,
        W1,
        b1,
        W2,
        b2,
    )

    fun_output_tg = aesara.function([x, y], y_hat_tg)
    fun_output = aesara.function([x], y_hat_all)

    x_mat = np.random.normal(size=(batch_size, input_size)).astype(config.floatX)
    y_mat = np.random.default_rng().integers(0, output_size, batch_size).astype("int32")

    tg_output = fun_output_tg(x_mat, y_mat)
    all_outputs = fun_output(x_mat)

    assert tg_output.shape == (batch_size,)
    assert all_outputs.shape == (batch_size, output_size)

    # Verifies that the outputs computed by fun_output_tg are the same as those
    # computed by fun_output.
    utt.assert_allclose(all_outputs[np.arange(0, batch_size), y_mat], tg_output)
def test_DownsampleFactorMaxStride(self):
    rng = np.random.RandomState(utt.fetch_seed())
    # maxpool, stride, ignore_border, input, output sizes
    examples = (
        ((1, 1), (1, 1), True, (4, 10, 16, 16), (4, 10, 16, 16)),
        ((1, 1), (5, 7), True, (4, 10, 16, 16), (4, 10, 4, 3)),
        ((1, 1), (1, 1), False, (4, 10, 16, 16), (4, 10, 16, 16)),
        ((1, 1), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
        ((3, 3), (1, 1), True, (4, 10, 16, 16), (4, 10, 14, 14)),
        ((3, 3), (3, 3), True, (4, 10, 16, 16), (4, 10, 5, 5)),
        ((3, 3), (5, 7), True, (4, 10, 16, 16), (4, 10, 3, 2)),
        ((3, 3), (1, 1), False, (4, 10, 16, 16), (4, 10, 14, 14)),
        ((3, 3), (3, 3), False, (4, 10, 16, 16), (4, 10, 6, 6)),
        ((3, 3), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
        ((5, 3), (1, 1), True, (4, 10, 16, 16), (4, 10, 12, 14)),
        ((5, 3), (3, 3), True, (4, 10, 16, 16), (4, 10, 4, 5)),
        ((5, 3), (5, 7), True, (4, 10, 16, 16), (4, 10, 3, 2)),
        ((5, 3), (1, 1), False, (4, 10, 16, 16), (4, 10, 12, 14)),
        ((5, 3), (3, 3), False, (4, 10, 16, 16), (4, 10, 5, 6)),
        ((5, 3), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
        ((16, 16), (1, 1), True, (4, 10, 16, 16), (4, 10, 1, 1)),
        ((16, 16), (5, 7), True, (4, 10, 16, 16), (4, 10, 1, 1)),
        ((16, 16), (1, 1), False, (4, 10, 16, 16), (4, 10, 1, 1)),
        ((16, 16), (5, 7), False, (4, 10, 16, 16), (4, 10, 1, 1)),
        ((3,), (5,), True, (16,), (3,)),
        ((3,), (5,), True, (2, 16), (2, 3)),
        ((5,), (3,), True, (2, 3, 16), (2, 3, 4)),
        ((5, 1, 3), (3, 3, 3), True, (2, 16, 16, 16), (2, 4, 6, 5)),
        ((5, 1, 3), (3, 3, 3), True, (4, 2, 16, 16, 16), (4, 2, 4, 6, 5)),
    )

    for example, mode in product(
        examples, ["max", "sum", "average_inc_pad", "average_exc_pad"]
    ):
        (maxpoolshp, stride, ignore_border, inputshp, outputshp) = example
        # generate random images
        imval = rng.rand(*inputshp)
        images = theano.shared(imval)
        # Pool op
        numpy_output_val = self.numpy_max_pool_nd_stride(
            imval, maxpoolshp, ignore_border, stride, mode
        )
        assert (
            numpy_output_val.shape == outputshp
        ), "outshape is {}, calculated shape is {}".format(
            outputshp,
            numpy_output_val.shape,
        )
        maxpool_op = Pool(
            ndim=len(maxpoolshp), ignore_border=ignore_border, mode=mode
        )(images, maxpoolshp, stride)
        f = function([], maxpool_op)
        output_val = f()
        utt.assert_allclose(output_val, numpy_output_val)
def test_batch_normalization_train():
    utt.seed_rng()

    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (tensor5, tensor3, vector):
            x, scale, bias, running_mean, running_var = (
                vartype(n)
                for n in ("x", "scale", "bias", "running_mean", "running_var")
            )
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            (
                out,
                x_mean,
                x_invstd,
                out_running_mean,
                out_running_var,
            ) = batchnorm.batch_normalization_train(
                x,
                scale,
                bias,
                axes,
                eps,
                running_average_factor,
                running_mean,
                running_var,
            )
            # reference forward pass
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = aet.reciprocal(aet.sqrt(x_var2 + eps))
            scale2 = aet.addbroadcast(scale, *axes2)
            bias2 = aet.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            m = aet.cast(
                aet.prod(x.shape) / aet.prod(scale.shape), aesara.config.floatX
            )
            out_running_mean2 = (
                running_mean * (1 - running_average_factor)
                + x_mean2 * running_average_factor
            )
            out_running_var2 = (
                running_var * (1 - running_average_factor)
                + (m / (m - 1)) * x_var2 * running_average_factor
            )
            # backward pass
            dy = vartype("dy")
            grads = aet.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = aet.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # second-order backward pass
            dx = vartype("dinputs")
            dscale = vartype("dscale")
            dbias = vartype("dbias")
            grad_grads = aet.grad(
                None,
                wrt=[x, dy, scale],
                known_grads=OrderedDict(
                    {grads[0]: dx, grads[1]: dscale, grads[2]: dbias}
                ),
                consider_constant=[
                    x,
                    dy,
                    scale,
                    bias,
                    x_mean,
                    x_invstd,
                    running_mean,
                    running_var,
                ],
                return_disconnected="zero",
            )
            # reference second-order backward pass
            grad_grads2 = aet.grad(
                None,
                wrt=[x, dy, scale],
                known_grads=OrderedDict(
                    {grads2[0]: dx, grads2[1]: dscale, grads2[2]: dbias}
                ),
                consider_constant=[
                    x,
                    dy,
                    scale,
                    bias,
                    x_mean2,
                    x_var2,
                    running_mean,
                    running_var,
                ],
                return_disconnected="zero",
            )
            # compile
            f = aesara.function(
                [x, scale, bias, running_mean, running_var, dy, dx, dscale, dbias],
                [
                    out,
                    x_mean,
                    x_invstd,
                    out_running_mean,
                    out_running_var,
                    out2,
                    x_mean2,
                    x_invstd2,
                    out_running_mean2,
                    out_running_var2,
                ]
                + grads
                + grads2
                + grad_grads
                + grad_grads2,
            )
            # check if the abstract Ops have been replaced
            assert not any(
                [
                    isinstance(
                        n.op,
                        (
                            batchnorm.AbstractBatchNormTrain,
                            batchnorm.AbstractBatchNormInference,
                            batchnorm.AbstractBatchNormTrainGrad,
                        ),
                    )
                    for n in f.maker.fgraph.toposort()
                ]
            )
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (2, 3, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(
                    1 if d in axes2 else s for d, s in enumerate(data_shape)
                )
                X = 4 + 3 * np.random.randn(*data_shape).astype(aesara.config.floatX)
                Dy = -1 + 2 * np.random.randn(*data_shape).astype(aesara.config.floatX)
                Scale = np.random.randn(*param_shape).astype(aesara.config.floatX)
                Bias = np.random.randn(*param_shape).astype(aesara.config.floatX)
                Running_mean = np.random.randn(*param_shape).astype(
                    aesara.config.floatX
                )
                Running_var = np.random.randn(*param_shape).astype(aesara.config.floatX)
                Dx = 4 + 3 * np.random.randn(*data_shape).astype(aesara.config.floatX)
                Dscale = -1 + 2 * np.random.randn(*param_shape).astype(
                    aesara.config.floatX
                )
                Dbias = np.random.randn(*param_shape).astype(aesara.config.floatX)

                outputs = f(
                    X, Scale, Bias, Running_mean, Running_var, Dy, Dx, Dscale, Dbias
                )
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                utt.assert_allclose(
                    np.nan_to_num(outputs[4]), np.nan_to_num(outputs[4 + 5])
                )  # running_var
                # compare gradients
                utt.assert_allclose(outputs[10], outputs[10 + 3], atol=1e-4)  # dx
                utt.assert_allclose(
                    outputs[11], outputs[11 + 3], rtol=2e-4, atol=1e-4
                )  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
                # compare second-order gradients
                utt.assert_allclose(outputs[16], outputs[16 + 3], atol=1e-4)  # ddx
                utt.assert_allclose(outputs[17], outputs[17 + 3])  # ddy
                utt.assert_allclose(
                    outputs[18], outputs[18 + 3], rtol=3e-4, atol=1e-4
                )  # ddscale
def validate(
    self,
    image_shape,
    filter_shape,
    border_mode="valid",
    subsample=(1, 1),
    input=None,
    filters=None,
    verify_grad=True,
    non_contiguous=False,
    filter_dilation=(1, 1),
):
    """
    :param image_shape: The constant shape info passed to corrMM.
    :param filter_shape: The constant shape info passed to corrMM.
    """
    if not theano.config.cxx:
        pytest.skip("Need cxx to test conv2d")
    N_image_shape = [
        tt.get_scalar_constant_value(tt.as_tensor_variable(x)) for x in image_shape
    ]
    N_filter_shape = [
        tt.get_scalar_constant_value(tt.as_tensor_variable(x)) for x in filter_shape
    ]

    if input is None:
        input = self.input
    if filters is None:
        filters = self.filters

    # THEANO IMPLEMENTATION

    # we create a symbolic function so that verify_grad can work
    def sym_CorrMM(input, filters):
        # define theano graph and function
        input.name = "input"
        filters.name = "filters"
        rval = corr.CorrMM(border_mode, subsample, filter_dilation)(input, filters)
        rval.name = "corr_output"
        return rval

    output = sym_CorrMM(input, filters)
    output.name = f"CorrMM()({input.name},{filters.name})"
    theano_corr = theano.function([input, filters], output, mode=self.mode)

    # initialize input and compute result
    image_data = np.random.random(N_image_shape).astype(self.dtype)
    filter_data = np.random.random(N_filter_shape).astype(self.dtype)
    if non_contiguous:
        image_data = np.transpose(image_data, axes=(0, 1, 3, 2))
        image_data = image_data.copy()
        image_data = np.transpose(image_data, axes=(0, 1, 3, 2))
        filter_data = np.transpose(filter_data, axes=(0, 1, 3, 2))
        filter_data = filter_data.copy()
        filter_data = np.transpose(filter_data, axes=(0, 1, 3, 2))
        assert not image_data.flags["CONTIGUOUS"]
        assert not filter_data.flags["CONTIGUOUS"]

    theano_output = theano_corr(image_data, filter_data)

    # REFERENCE IMPLEMENTATION
    # Testing correlation, not convolution. Reverse filters.
    filter_data_corr = np.array(filter_data[:, :, ::-1, ::-1], copy=True, order="C")
    orig_image_data = image_data
    img_shape2d = np.array(N_image_shape[-2:])
    fil_shape2d = np.array(N_filter_shape[-2:])
    dil_shape2d = np.array(filter_dilation)
    dil_fil_shape2d = (fil_shape2d - 1) * dil_shape2d + 1
    subsample2d = np.array(subsample)
    if border_mode == "full":
        padHW = dil_fil_shape2d - 1
    elif border_mode == "valid":
        padHW = np.array([0, 0])
    elif border_mode == "half":
        padHW = np.floor(dil_fil_shape2d / 2).astype("int32")
    elif isinstance(border_mode, tuple):
        padHW = np.array(border_mode)
    elif isinstance(border_mode, int):
        padHW = np.array([border_mode, border_mode])
    else:
        raise NotImplementedError(f"Unsupported border_mode {border_mode}")
    out_shape2d = (
        np.floor((img_shape2d + 2 * (padHW) - dil_fil_shape2d) / subsample2d) + 1
    )
    # avoid numpy deprecation
    out_shape2d = out_shape2d.astype("int32")
    out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape2d)

    ref_output = np.zeros(out_shape)

    # loop over output feature maps
    ref_output.fill(0)
    image_data2 = np.zeros(
        (
            N_image_shape[0],
            N_image_shape[1],
            N_image_shape[2] + 2 * padHW[0],
            N_image_shape[3] + 2 * padHW[1],
        )
    )
    image_data2[
        :,
        :,
        padHW[0] : padHW[0] + N_image_shape[2],
        padHW[1] : padHW[1] + N_image_shape[3],
    ] = image_data
    image_data = image_data2
    N_image_shape = image_data.shape

    for bb in range(N_image_shape[0]):
        for nn in range(N_filter_shape[0]):
            for im0 in range(N_image_shape[1]):
                filter2d = filter_data_corr[nn, im0, :, :]
                image2d = image_data[bb, im0, :, :]
                for row in range(ref_output.shape[2]):
                    irow = row * subsample[0]  # image row
                    for col in range(ref_output.shape[3]):
                        icol = col * subsample[1]  # image col
                        ref_output[bb, nn, row, col] += (
                            image2d[
                                irow : irow + dil_fil_shape2d[0] : filter_dilation[0],
                                icol : icol + dil_fil_shape2d[1] : filter_dilation[1],
                            ]
                            * filter2d[::-1, ::-1]
                        ).sum()

    utt.assert_allclose(ref_output, theano_output)

    # TEST GRADIENT
    if verify_grad:
        utt.verify_grad(sym_CorrMM, [orig_image_data, filter_data], mode=self.mode)
def test_batch_normalization_test():
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (tensor5, tensor3, vector):
            x, scale, bias, mean, var = (
                vartype(n) for n in ("x", "scale", "bias", "mean", "var")
            )
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            out = batchnorm.batch_normalization_test(
                x, scale, bias, mean, var, axes, eps
            )
            # reference forward pass
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            scale2, bias2, mean2, var2 = (
                aet.addbroadcast(t, *axes2) for t in (scale, bias, mean, var)
            )
            out2 = (x - mean2) * (scale2 / aet.sqrt(var2 + eps)) + bias2
            # backward pass
            dy = vartype("dy")
            grads = aet.grad(
                None, wrt=[x, scale, bias, mean, var], known_grads={out: dy}
            )
            # reference backward pass
            grads2 = aet.grad(
                None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy}
            )
            # compile
            f = aesara.function(
                [x, scale, bias, mean, var, dy], [out, out2] + grads + grads2
            )
            # check if the abstract Ops have been replaced
            assert not any(
                [
                    isinstance(
                        n.op,
                        (
                            batchnorm.AbstractBatchNormTrain,
                            batchnorm.AbstractBatchNormInference,
                            batchnorm.AbstractBatchNormTrainGrad,
                        ),
                    )
                    for n in f.maker.fgraph.toposort()
                ]
            )
            # run
            for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(
                    1 if d in axes2 else s for d, s in enumerate(data_shape)
                )
                X = 4 + 3 * np.random.randn(*data_shape).astype(aesara.config.floatX)
                Dy = -1 + 2 * np.random.randn(*data_shape).astype(aesara.config.floatX)
                Scale = np.random.randn(*param_shape).astype(aesara.config.floatX)
                Bias = np.random.randn(*param_shape).astype(aesara.config.floatX)
                Mean = np.random.randn(*param_shape).astype(aesara.config.floatX)
                Var = np.random.rand(*param_shape).astype(aesara.config.floatX)
                outputs = f(X, Scale, Bias, Mean, Var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients
                utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5)  # dx
                utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5)  # dscale
                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
                utt.assert_allclose(
                    outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5
                )  # dvar
def run_conv_gradweight(self, algo, dtype, precision, parameters):
    (
        inputs_shape,
        filters_shape,
        subsample,
        dilation,
        border_mode,
        conv_mode,
        alpha,
        beta,
    ) = parameters

    inputs_val = np.random.random(inputs_shape).astype(dtype)
    if beta == 0:
        filters_val = None
    else:
        filters_val = np.random.random(filters_shape).astype(dtype)
        filters_val /= 10
    topgrad_val = self.array_like_conv_output(
        inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
    )

    # Scale down the input values to prevent absolute errors in
    # utt.assert_allclose.
    inputs_val /= 10
    topgrad_val /= 10

    inputs = aesara.shared(inputs_val)
    topgrad = aesara.shared(topgrad_val)

    # Compile an Aesara function for the cuDNN implementation
    grad_w = dnn_gradweight(
        inputs,
        topgrad,
        filters_shape,
        alpha=alpha,
        beta=beta,
        out=filters_val,
        border_mode=border_mode,
        subsample=subsample,
        dilation=dilation,
        conv_mode=conv_mode,
        algo=algo,
        precision=precision,
    )
    f = aesara.function([], grad_w, mode=mode_with_gpu)

    # Compile an Aesara function for the reference implementation
    grad_w_ref = self.cpu_gradweight_class(
        border_mode=border_mode, subsample=subsample, filter_dilation=dilation
    )(ref_cast(inputs), ref_cast(topgrad), filters_shape[2:])
    if conv_mode == "conv":
        if inputs.ndim == 5:
            grad_w_ref = grad_w_ref[:, :, ::-1, ::-1, ::-1]
        else:
            grad_w_ref = grad_w_ref[:, :, ::-1, ::-1]
    f_ref = aesara.function([], grad_w_ref, mode="FAST_RUN")

    # Compare the results of the two implementations
    res_ref = f_ref()
    res = np.asarray(f())

    if algo in cudnn.deterministic_bwd_filter_algorithms:
        utt.assert_allclose(res, np.asarray(f()))

    atol, rtol = self.get_atol_rtol(algo, dtype, precision)
    if beta == 0:
        cpu_res = alpha * res_ref
    else:
        cpu_res = alpha * res_ref + beta * filters_val
    self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
    utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)
def run_conv_gradinput(self, algo, dtype, precision, parameters):
    (
        inputs_shape,
        filters_shape,
        subsample,
        dilation,
        border_mode,
        conv_mode,
        alpha,
        beta,
    ) = parameters

    if beta == 0:
        inputs_val = None
    else:
        inputs_val = np.random.random(inputs_shape).astype(dtype)
        inputs_val /= 10
    filters_val = np.random.random(filters_shape).astype(dtype)
    topgrad_val = self.array_like_conv_output(
        inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
    )

    # Scale down the input values to prevent absolute errors in
    # utt.assert_allclose.
    filters_val /= 10
    topgrad_val /= 10

    filters = aesara.shared(filters_val)
    topgrad = aesara.shared(topgrad_val)

    # Compile an Aesara function for the cuDNN implementation
    grad_i = dnn_gradinput(
        filters,
        topgrad,
        inputs_shape,
        alpha=alpha,
        beta=beta,
        out=inputs_val,
        border_mode=border_mode,
        subsample=subsample,
        dilation=dilation,
        conv_mode=conv_mode,
        algo=algo,
        precision=precision,
    )
    f = aesara.function([], grad_i, mode=mode_with_gpu)

    # If conv_mode is 'conv' the reference implementation should use
    # filters flipped according to the width, height and time axis
    if conv_mode == "conv":
        if filters.ndim == 5:
            flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
        else:
            flipped_filters = filters[:, :, ::-1, ::-1]
    else:
        flipped_filters = filters

    # Compile an Aesara function for the reference implementation
    grad_i_ref = self.cpu_gradinput_class(
        border_mode=border_mode, subsample=subsample, filter_dilation=dilation
    )(ref_cast(flipped_filters), ref_cast(topgrad), inputs_shape[2:])
    f_ref = aesara.function([], grad_i_ref, mode="FAST_RUN")

    # Compare the results of the two implementations
    res_ref = f_ref()
    res = np.asarray(f())

    if algo in cudnn.deterministic_bwd_data_algorithms:
        utt.assert_allclose(res, np.asarray(f()))

    atol, rtol = self.get_atol_rtol(algo, dtype, precision)
    if beta == 0:
        cpu_res = alpha * res_ref
    else:
        cpu_res = alpha * res_ref + beta * inputs_val
    self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
    utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)
def test_machine_translation(self):
    # This test case comes from https://github.com/rizar/scan-grad-speed and
    # is an example of actual computation done with scan in the context of
    # machine translation
    #
    # 'dim' has been reduced from 1000 to 5 to make the test run faster

    # Parameters from an actual machine translation run
    batch_size = 80
    seq_len = 50
    dim = 5

    # Weight matrices
    U = aesara.shared(
        np.random.normal(size=(dim, dim), scale=0.0001).astype(config.floatX)
    )
    U.name = "U"
    V = aesara.shared(U.get_value())
    V.name = "V"
    W = aesara.shared(U.get_value())
    W.name = "W"

    # Variables and their values
    x = tensor3("x")
    x_value = np.random.normal(size=(seq_len, batch_size, dim), scale=0.0001).astype(
        config.floatX
    )

    ri = tensor3("ri")
    ri_value = x_value

    zi = tensor3("zi")
    zi_value = x_value

    init = aet.alloc(np.cast[config.floatX](0), batch_size, dim)

    def rnn_step1(
        # sequences
        x,
        ri,
        zi,
        # outputs_info
        h,
    ):
        pre_r = ri + h.dot(U)
        pre_z = zi + h.dot(V)
        r = sigmoid(pre_r)
        z = sigmoid(pre_z)

        after_r = r * h
        pre_h = x + after_r.dot(W)
        new_h = tanh(pre_h)

        res_h = z * new_h + (1 - z) * h
        return res_h

    # Compile the function twice, once with the optimization and once
    # without
    opt_mode = mode.including("scan")
    h, _ = aesara.scan(
        rnn_step1,
        sequences=[x, ri, zi],
        n_steps=seq_len,
        outputs_info=init,
        name="fpass1",
        mode=opt_mode,
    )
    cost = h[-1].sum()
    grad1 = grad(cost, [U, V, W])
    f_opt = aesara.function(inputs=[x, ri, zi], outputs=grad1, mode=opt_mode)

    no_opt_mode = mode.excluding("scanOp_pushout_output")
    h, _ = aesara.scan(
        rnn_step1,
        sequences=[x, ri, zi],
        n_steps=seq_len,
        outputs_info=init,
        name="fpass1",
        mode=no_opt_mode,
    )
    cost = h[-1].sum()
    grad1 = grad(cost, [U, V, W])
    f_no_opt = aesara.function(inputs=[x, ri, zi], outputs=grad1, mode=no_opt_mode)

    # Validate that the optimization has been applied
    scan_node_grad = [
        node for node in f_opt.maker.fgraph.toposort() if isinstance(node.op, Scan)
    ][1]

    for output in scan_node_grad.op.outputs:
        assert not (
            isinstance(output.owner.op, Elemwise)
            and any([isinstance(i, Dot) for i in output.owner.inputs])
        )

    # Compare the outputs of the two functions on the same input data.
    f_opt_output = f_opt(x_value, ri_value, zi_value)
    f_no_opt_output = f_no_opt(x_value, ri_value, zi_value)
    utt.assert_allclose(f_opt_output, f_no_opt_output)
def test_odd(self):
    M = N - 1

    inputs_val = np.random.random((1, M, M)).astype("float32")
    inputs = theano.shared(inputs_val)

    rfft = theano.gpuarray.fft.curfft(inputs)
    f_rfft = theano.function([], rfft, mode=mode_with_gpu)
    res_rfft = f_rfft()
    res_rfft_comp = np.asarray(res_rfft[:, :, :, 0]) + 1j * np.asarray(
        res_rfft[:, :, :, 1]
    )

    rfft_ref = np.fft.rfftn(inputs_val, s=(M, M), axes=(1, 2))

    utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)

    m = rfft.type()
    ifft = theano.gpuarray.fft.cuirfft(m, is_odd=True)
    f_ifft = theano.function([m], ifft, mode=mode_with_gpu)
    res_ifft = f_ifft(res_rfft)

    utt.assert_allclose(inputs_val, np.asarray(res_ifft))

    inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
    inputs = theano.shared(inputs_val)

    irfft = theano.gpuarray.fft.cuirfft(inputs, norm="ortho", is_odd=True)
    f_irfft = theano.function([], irfft, mode=mode_with_gpu)
    res_irfft = f_irfft()
    inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]

    irfft_ref = np.fft.irfftn(inputs_ref, s=(M, M), axes=(1, 2)) * M

    utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)

    # The numerical gradient of the FFT is sensitive, must set large
    # enough epsilon to get good accuracy.
    eps = 1e-1

    def f_rfft(inp):
        return theano.gpuarray.fft.curfft(inp)

    inputs_val = np.random.random((1, M, M)).astype("float32")
    utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)

    def f_irfft(inp):
        return theano.gpuarray.fft.cuirfft(inp, is_odd=True)

    inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
    utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)

    def f_rfft(inp):
        return theano.gpuarray.fft.curfft(inp, norm="ortho")

    inputs_val = np.random.random((1, M, M)).astype("float32")
    utt.verify_grad(f_rfft, [inputs_val], eps=eps, mode=mode_with_gpu)

    def f_irfft(inp):
        return theano.gpuarray.fft.cuirfft(inp, norm="no_norm", is_odd=True)

    inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype("float32")
    utt.verify_grad(f_irfft, [inputs_val], eps=eps, mode=mode_with_gpu)
def test_one_sequence_one_output_weights_gpu1(self):
    def f_rnn(u_t, x_tm1, W_in, W):
        return u_t * W_in + x_tm1 * W

    u = fvector("u")
    x0 = fscalar("x0")
    W_in = fscalar("win")
    W = fscalar("w")

    mode = mode_with_gpu.excluding("InputToGpuOptimizer")
    output, updates = scan(
        f_rnn,
        u,
        x0,
        [W_in, W],
        n_steps=None,
        truncate_gradient=-1,
        go_backwards=False,
        mode=mode,
    )

    output = GpuFromHost(test_ctx_name)(output)
    f2 = aesara.function(
        [u, x0, W_in, W],
        output,
        updates=updates,
        allow_input_downcast=True,
        mode=mode,
    )

    rng = np.random.default_rng(utt.fetch_seed())
    v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
    v_x0 = rng.uniform()
    W = rng.uniform()
    W_in = rng.uniform()

    v_u = np.asarray(v_u, dtype="float32")
    v_x0 = np.asarray(v_x0, dtype="float32")
    W = np.asarray(W, dtype="float32")
    W_in = np.asarray(W_in, dtype="float32")

    # compute the output in numpy
    v_out = np.zeros((4,))
    v_out[0] = v_u[0] * W_in + v_x0 * W
    for step in range(1, 4):
        v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

    aesara_values = f2(v_u, v_x0, W_in, W)
    utt.assert_allclose(aesara_values, v_out)

    # TO DEL
    topo = f2.maker.fgraph.toposort()
    scan_node = [node for node in topo if isinstance(node.op, scan.op.Scan)]
    assert len(scan_node) == 1
    scan_node = scan_node[0]

    topo = f2.maker.fgraph.toposort()
    assert sum([isinstance(node.op, HostFromGpu) for node in topo]) == 0
    assert sum([isinstance(node.op, GpuFromHost) for node in topo]) == 4

    scan_node = [node for node in topo if isinstance(node.op, scan.op.Scan)]
    assert len(scan_node) == 1
    scan_node = scan_node[0]
    scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

    # check that there is no gpu transfer in the inner loop.
    assert any(isinstance(node.op, GpuElemwise) for node in scan_node_topo)
    assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
    assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)
def assert_column_orthonormal(self, Ot):
    utt.assert_allclose(np.dot(Ot.T, Ot), np.eye(Ot.shape[1]))
def test_DownsampleFactorMax(self):
    rng = np.random.RandomState(utt.fetch_seed())
    # maxpool, input size
    examples = (
        ((2,), (16,)),
        ((2,), (4, 16)),
        ((2,), (4, 2, 16)),
        ((1, 1), (4, 2, 16, 16)),
        ((2, 2), (4, 2, 16, 16)),
        ((3, 3), (4, 2, 16, 16)),
        ((3, 2), (4, 2, 16, 16)),
        ((3, 2, 2), (3, 2, 16, 16, 16)),
        ((2, 2, 3, 2), (3, 2, 6, 6, 6, 5)),
    )

    for example, ignore_border, mode in product(
        examples,
        [True, False],
        ["max", "sum", "average_inc_pad", "average_exc_pad"],
    ):
        (maxpoolshp, inputsize) = example
        imval = rng.rand(*inputsize)
        images = aesara.shared(imval)

        # Pure Numpy computation
        numpy_output_val = self.numpy_max_pool_nd(
            imval, maxpoolshp, ignore_border, mode=mode
        )

        # The pool_2d or pool_3d helper methods
        if len(maxpoolshp) == 2:
            output = pool_2d(images, maxpoolshp, ignore_border, mode=mode)
            f = function([], [output])
            output_val = f()
            utt.assert_allclose(output_val, numpy_output_val)
        elif len(maxpoolshp) == 3:
            output = pool_3d(images, maxpoolshp, ignore_border, mode=mode)
            f = function([], [output])
            output_val = f()
            utt.assert_allclose(output_val, numpy_output_val)

        # Pool op
        maxpool_op = Pool(
            ndim=len(maxpoolshp), ignore_border=ignore_border, mode=mode
        )(images, maxpoolshp)

        output_shape = Pool.out_shape(
            imval.shape,
            maxpoolshp,
            ndim=len(maxpoolshp),
            ignore_border=ignore_border,
        )
        utt.assert_allclose(np.asarray(output_shape), numpy_output_val.shape)
        f = function([], maxpool_op)
        output_val = f()
        utt.assert_allclose(output_val, numpy_output_val)
def test1(self):
    a = tensor.dmatrix()
    w = sort(a)
    f = aesara.function([a], w)
    utt.assert_allclose(f(self.m_val), np.sort(self.m_val))