def test_grad(self):
    a = np.asarray(self.rng.standard_normal((5, 5)), dtype=config.floatX)

    x = matrix("x")

    expressions_gradients = [
        (x * zero_grad(x), x),
        (x * zero_grad(exp(x)), exp(x)),
        (zero_grad(x), at.constant(0.0)),
        (x**2 * zero_grad(x), 2 * x**2),
    ]

    for expr, expr_grad in expressions_gradients:
        g = grad(expr.sum(), x)
        # gradient according to aesara
        f = aesara.function([x], g, on_unused_input="ignore")
        # desired gradient
        f2 = aesara.function([x], expr_grad, on_unused_input="ignore")

        assert np.allclose(f(a), f2(a))
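# Illustration only (not from the original suite): `zero_grad(v)` forwards the
# value of `v` unchanged but contributes a zero gradient, which is why the test
# above expects d/dx[x * zero_grad(x)] to be x rather than 2*x. Assumes the same
# imports as the test (aesara, np, config, matrix, grad, zero_grad).
_v = matrix("v")
_forward = aesara.function([_v], _v * zero_grad(_v))
_backward = aesara.function([_v], grad((_v * zero_grad(_v)).sum(), _v))
_val = np.full((2, 2), 3.0, dtype=config.floatX)
assert np.allclose(_forward(_val), _val * _val)  # forward value is v * v
assert np.allclose(_backward(_val), _val)        # gradient is v, not 2 * v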
def test_local_sigm_times_exp(self):
    # Test the `local_sigm_times_exp` optimization.
    # exp(x) * sigm(-x) -> sigm(x)
    # exp(-x) * sigm(x) -> sigm(-x)

    def match(func, ops):
        # print [node.op.scalar_op for node in func.maker.fgraph.toposort()]
        assert [node.op for node in func.maker.fgraph.toposort()] == ops

    m = self.get_mode(excluding=["local_elemwise_fusion", "inplace"])
    x, y = vectors("x", "y")

    f = aesara.function([x], sigmoid(-x) * exp(x), mode=m)
    match(f, [sigmoid])
    assert check_stack_trace(f, ops_to_check=sigmoid)

    f = aesara.function([x], sigmoid(x) * exp(-x), mode=m)
    match(f, [neg, sigmoid])
    assert check_stack_trace(f, ops_to_check=sigmoid)

    f = aesara.function([x], -(-(-(sigmoid(x)))) * exp(-x), mode=m)
    match(f, [neg, sigmoid, neg])
    # assert check_stack_trace(f, ops_to_check=sigmoid)

    f = aesara.function(
        [x, y],
        (sigmoid(x) * sigmoid(-y) * -exp(-x) * exp(x * y) * exp(y)),
        mode=m,
    )
    topo = f.maker.fgraph.toposort()
    for op, nb in [(sigmoid, 2), (mul, 2), (neg, 1), (exp, 1)]:
        assert sum([n.op == op for n in topo]) == nb
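# Illustration only (NumPy, not part of the test suite): the identities that
# `local_sigm_times_exp` relies on, checked numerically:
#   exp(x) * sigmoid(-x) == sigmoid(x)   and   exp(-x) * sigmoid(x) == sigmoid(-x)
_z = np.linspace(-5.0, 5.0, 11)
_sigm = 1.0 / (1.0 + np.exp(-_z))      # sigmoid(z)
_sigm_neg = 1.0 / (1.0 + np.exp(_z))   # sigmoid(-z)
assert np.allclose(np.exp(_z) * _sigm_neg, _sigm)
assert np.allclose(np.exp(-_z) * _sigm, _sigm_neg)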
def test_1msigmoid(self):
    if not register_local_1msigmoid:
        return

    m = self.get_mode()
    x = fmatrix()

    # tests exp_over_1_plus_exp
    f = aesara.function([x], 1 - exp(x) / (1 + exp(x)), mode=m)
    assert check_stack_trace(f, ops_to_check=[neg, sigmoid_inplace])
    assert [node.op for node in f.maker.fgraph.toposort()] == [
        neg,
        sigmoid_inplace,
    ]

    # tests inv_1_plus_exp
    f = aesara.function([x], 1 - aet.fill(x, 1.0) / (1 + exp(-x)), mode=m)
    assert check_stack_trace(f, ops_to_check=[neg, sigmoid_inplace])
    assert [node.op for node in f.maker.fgraph.toposort()] == [
        neg,
        sigmoid_inplace,
    ]
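# Illustration only (NumPy): the identity behind the 1-sigmoid rewrite tested
# above, 1 - sigmoid(x) == sigmoid(-x).
_z = np.linspace(-5.0, 5.0, 11)
assert np.allclose(1.0 - 1.0 / (1.0 + np.exp(-_z)), 1.0 / (1.0 + np.exp(_z)))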
def local_exp_over_1_plus_exp(fgraph, node):
    """
    exp(x)/(1+exp(x)) -> sigm(x)
    c/(1+exp(x)) -> c*sigm(-x)
    """
    # this optimization should be done for numerical stability
    # so we don't care to check client counts
    if node.op == true_div:

        # find all the exp() terms in the numerator
        num, denom = node.inputs
        num_exp_x, num_rest, num_neg = partition_num_or_denom(num, is_exp)
        denom_1pexp, denom_rest, denom_neg = partition_num_or_denom(denom, is_1pexp)

        sigmoids = []
        for t in denom_1pexp:
            if t in num_exp_x:
                # case: exp(x) / (1 + exp(x))
                sigmoids.append(sigmoid(t))
                del num_exp_x[num_exp_x.index(t)]
            else:
                # case: 1 / (1 + exp(x))
                sigmoids.append(sigmoid(-t))
            copy_stack_trace(node.outputs[0], sigmoids[-1])

        if not sigmoids:  # we didn't find any. abort
            return

        # put the new numerator together
        new_num = sigmoids + [exp(t) for t in num_exp_x] + num_rest
        if len(new_num) == 1:
            new_num = new_num[0]
        else:
            new_num = mul(*new_num)

        if num_neg ^ denom_neg:
            new_num = -new_num

        copy_stack_trace(num, new_num)

        if len(denom_rest) == 0:
            return [new_num]
        elif len(denom_rest) == 1:
            out = new_num / denom_rest[0]
        else:
            out = new_num / mul(*denom_rest)

        copy_stack_trace(node.outputs[0], out)
        return [out]
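# Why this rewrite matters for numerical stability (NumPy illustration, not part
# of the rewrite itself): the literal graph exp(x) / (1 + exp(x)) overflows to
# nan for large x, while the equivalent sigmoid(x) = 1 / (1 + exp(-x)) stays
# finite.
with np.errstate(over="ignore", invalid="ignore"):
    _big = np.array([800.0])
    _naive = np.exp(_big) / (1.0 + np.exp(_big))  # inf / inf -> nan
    _stable = 1.0 / (1.0 + np.exp(-_big))         # -> 1.0
assert np.isnan(_naive).all()
assert np.allclose(_stable, 1.0)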
def grad(self, inputs, cost_grad):
    """
    In defining the gradient, the Finite Fourier Transform is viewed as a
    complex-differentiable function of a complex variable.
    """
    a = inputs[0]
    n = inputs[1]
    axis = inputs[2]
    grad = cost_grad[0]
    if not isinstance(axis, TensorConstant):
        raise NotImplementedError(
            f"{self.__class__.__name__}: gradient is currently implemented"
            " only for axis being an Aesara constant"
        )
    axis = int(axis.data)
    # notice that the number of actual elements in wrto is independent of
    # possible padding or truncation:
    elem = arange(0, shape(a)[axis], 1)
    # accounts for padding:
    freq = arange(0, n, 1)
    outer_res = outer(freq, elem)
    pow_outer = exp(((-2 * math.pi * 1j) * outer_res) / (1.0 * n))
    res = tensordot(grad, pow_outer, (axis, 0))

    # This would be simpler but not implemented by aesara:
    # res = switch(lt(n, shape(a)[axis]),
    #              set_subtensor(res[..., n::], 0, False, False), res)

    # Instead we resort to that to account for truncation:
    flip_shape = list(np.arange(0, a.ndim)[::-1])
    res = res.dimshuffle(flip_shape)
    res = switch(
        lt(n, shape(a)[axis]),
        set_subtensor(res[n::,], 0, False, False),
        res,
    )
    res = res.dimshuffle(flip_shape)

    # ensures that gradient shape conforms to input shape:
    out_shape = (
        list(np.arange(0, axis)) + [a.ndim - 1] + list(np.arange(axis, a.ndim - 1))
    )
    res = res.dimshuffle(*out_shape)
    return [res, None, None]
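# Cross-check (NumPy illustration, not part of the Op): the `pow_outer` matrix
# built above is the DFT matrix that np.fft.fft applies when n >= the input
# length along `axis`, i.e. F[k] = sum_j a[j] * exp(-2*pi*1j * j * k / n).
_a = np.array([0.5, -1.0, 2.0, 0.25, 3.0])
_n = 8  # padded transform length
_W = np.exp(-2j * np.pi * np.outer(np.arange(_n), np.arange(_a.size)) / _n)
assert np.allclose(_W @ _a, np.fft.fft(_a, n=_n))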
def test_log1pexp_to_softplus(self):
    m = aesara.config.mode
    if m == "FAST_COMPILE":
        m = "FAST_RUN"

    x = vector()

    out = log(1 + exp(x))
    f = aesara.function([x], out, mode=self.m)

    # Fix ticket #4581 first
    # assert check_stack_trace(f, ops_to_check='all')
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op.scalar_op, ScalarSoftplus)
    f(np.random.rand(54).astype(config.floatX))
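# Illustration only (NumPy): why log(1 + exp(x)) is rewritten to softplus. The
# naive form overflows for large x, while softplus, computed here via
# np.logaddexp(0, x), stays finite and accurate.
with np.errstate(over="ignore"):
    _big = np.array([1000.0])
    assert np.isinf(np.log1p(np.exp(_big))).all()  # naive graph overflows
assert np.allclose(np.logaddexp(0.0, _big), _big)  # softplus(1000) ~ 1000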
def test_broadcast_grad():
    x1 = tensor4("x")
    sigma = scalar("sigma")
    window_radius = 3

    filter_1d = at.arange(-window_radius, window_radius + 1)
    filter_1d = filter_1d.astype(aesara.config.floatX)
    filter_1d = exp(-0.5 * filter_1d**2 / sigma**2)
    filter_1d = filter_1d / filter_1d.sum()

    filter_W = filter_1d.dimshuffle(["x", "x", 0, "x"])

    y = conv2d(x1, filter_W, border_mode="full", filter_shape=[1, 1, None, None])
    # TODO FIXME: Make this a real test and `assert` something
    aesara.grad(y.sum(), sigma)
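# NumPy sketch (illustration only) of the normalized 1-D Gaussian window that
# the test builds symbolically above, for a concrete sigma; the taps sum to 1.
_sigma = 20.0
_radius = 3
_taps = np.arange(-_radius, _radius + 1, dtype=np.float64)
_gauss = np.exp(-0.5 * _taps**2 / _sigma**2)
_gauss /= _gauss.sum()
assert np.isclose(_gauss.sum(), 1.0)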
def test_matrix_perform_and_opt(self):
    m = config.mode
    m = aesara.compile.get_mode(m)
    m.check_isfinite = False
    x, y = matrices("xy")

    # regular softmax and crossentropy
    sm = softmax(x)
    cm = categorical_crossentropy(sm, y)

    # numerically stable log-softmax with crossentropy
    logsm = logsoftmax(x)
    sm2 = exp(logsm)  # just used to show equivalence with sm
    cm2 = -aet_sum(y * logsm, axis=1)
    grad_node = grad(cm2.mean(), x)

    # create some inputs into a softmax that are large and labels
    a = np.exp(10 * np.random.random((5, 10)).astype(config.floatX))
    # create some one-hot coded labels
    b = np.eye(5, 10).astype(config.floatX)

    # show equivalence of softmax and exponentiated numerically stable
    # log-softmax
    f1 = aesara.function([x], [sm, sm2])
    sm_, sm2_ = f1(a)
    utt.assert_allclose(sm_, sm2_)

    # now show that the two versions result in the same crossentropy cost
    # this indicates that the forward function does provide some numerical
    # stability
    f2 = aesara.function([x, y], [cm, cm2], mode=m)
    cm_, cm2_ = f2(a, b)
    utt.assert_allclose(cm_, cm2_)

    # now, show that in the standard softmax case the gradients blow up
    # while in the log-softmax case they don't
    f3 = aesara.function([x, y], [grad_node])
    grad_ = f3(a, b)
    assert not np.any(np.isnan(grad_))
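# Illustration only (NumPy): the stability property the test exercises. A naive
# softmax on large logits produces nan, while the max-shifted log-softmax stays
# finite.
with np.errstate(over="ignore", invalid="ignore"):
    _logits = np.array([0.0, 1000.0, 2000.0])
    _naive = np.exp(_logits) / np.exp(_logits).sum()  # inf / inf -> nan
    _shift = _logits - _logits.max()
    _logsm = _shift - np.log(np.exp(_shift).sum())    # stable log-softmax
assert np.isnan(_naive).any()
assert np.all(np.isfinite(_logsm))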
def softmax_graph(c):
    return exp(c) / exp(c).sum(axis=-1, keepdims=True)
class TestSoftmaxOpt:
    # Test that expressions of softmax in terms of exponentiated things
    # divided by row sums are replaced by softmax expressions.
    #
    # Softmax_grad isn't that interesting as an Op, but it has the signature
    # we look for when trying to insert CrossEntropySoftmax... grad. So, for
    # now, we add softmax_grad to graphs. In the future, we may modify the
    # CrossEntropySoftmax...grad to look for the more basic pattern.
    #

    def setup_method(self):
        self.rng = np.random.default_rng(utt.fetch_seed())
        self.mode = aesara.compile.mode.get_default_mode()
        self.mode = self.mode.including("canonicalize")

    @pytest.mark.parametrize("axis", [None, 0, 1, -1, (0, 1)])
    def test_basic(self, axis):
        c = matrix()
        if axis is None:
            p_y = exp(c) / exp(c).sum(axis=axis).dimshuffle("x", "x")
        elif axis == 0:
            p_y = exp(c) / exp(c).sum(axis=axis).dimshuffle("x", 0)
        elif axis == (0, 1):
            p_y = exp(c) / exp(c).sum(axis=axis).dimshuffle("x", "x")
        else:
            p_y = exp(c) / exp(c).sum(axis=axis).dimshuffle(0, "x")

        # test that function contains softmax and no div.
        f = aesara.function([c], p_y, mode=self.mode)

        assert check_stack_trace(f, ops_to_check=Softmax)

        f_ops = [n.op for n in f.maker.fgraph.toposort()]
        assert len(f_ops) == 1
        assert isinstance(f_ops[0], Softmax)

        c_val = self.rng.random((3, 4)).astype(config.floatX)
        assert np.allclose(f(c_val), sp.softmax(c_val, axis=axis))

    @pytest.mark.parametrize("axis", [None, 0, 1, 2, -1, -2, -3, (0, 1, 2)])
    def test_basic_keepdims(self, axis):
        c = tensor3()
        p_y = exp(c) / exp(c).sum(axis=axis, keepdims=True)

        # test that function contains softmax and no div.
        f = aesara.function([c], p_y, mode=self.mode)

        assert check_stack_trace(f, ops_to_check=Softmax)

        f_ops = [n.op for n in f.maker.fgraph.toposort()]
        assert len(f_ops) == 1
        assert isinstance(f_ops[0], Softmax)

        c_val = self.rng.random((3, 4, 5)).astype(config.floatX)
        assert np.allclose(f(c_val), sp.softmax(c_val, axis=axis))

    @pytest.mark.skip(reason="Optimization not enabled for the moment")
    def test_grad(self):
        c = matrix()
        p_y = exp(c) / exp(c).sum(axis=1).dimshuffle(0, "x")

        # test that function contains softmax and softmaxgrad
        w = matrix()

        g = aesara.function([c, w], grad((p_y * w).sum(), c), mode=self.mode)

        g_ops = [n.op for n in g.maker.fgraph.toposort()]
        assert len(g_ops) == 2, g_ops
        assert isinstance(g_ops[0], Softmax)
        assert isinstance(g_ops[1], SoftmaxGrad)

        g(self.rng.random((3, 4)), self.rng.uniform(0.5, 1, (3, 4)))

    def test_transpose_basic(self):
        # this should be a transposed softmax
        c = matrix()
        p_y = exp(c) / exp(c).sum(axis=0)

        # test that function contains softmax and no div.
        f = aesara.function([c], p_y, mode=self.mode)
        f_ops = [n.op for n in f.maker.fgraph.toposort()]
        assert len(f_ops) == 1
        assert isinstance(f_ops[0], Softmax)

    @pytest.mark.skip(reason="Optimization not enabled for the moment")
    def test_transpose_grad(self):
        # this should be a transposed softmax
        c = matrix()
        p_y = exp(c) / exp(c).sum(axis=0)

        # test that function contains softmax and no div.
        g = aesara.function([c], grad(p_y.sum(), c), mode=self.mode)
        g_ops = [n.op for n in g.maker.fgraph.toposort()]
        assert len(g_ops) == 2
        assert isinstance(g_ops[0], Softmax)
        assert isinstance(g_ops[1], SoftmaxGrad)

    def test_1d_basic(self):
        c = vector()
        p_y = exp(c) / exp(c).sum()

        # test that function contains softmax and no div.
        f = aesara.function([c], p_y, mode=self.mode)
        f_ops = [n.op for n in f.maker.fgraph.toposort()]
        assert len(f_ops) == 1
        assert isinstance(f_ops[0], Softmax)

    @pytest.mark.skip(reason="Optimization not enabled for the moment")
    def test_1D_grad(self):
        c = vector()
        p_y = exp(c) / exp(c).sum()

        # test that function contains softmax and no div.
        g = aesara.function([c], grad(p_y.sum(), c), mode=self.mode)
        g_ops = [n.op for n in g.maker.fgraph.toposort()]
        assert len(g_ops) == 2
        assert isinstance(g_ops[0], Softmax)
        assert isinstance(g_ops[1], SoftmaxGrad)

    @pytest.mark.parametrize(
        "f",
        [
            lambda c: exp(c) / exp(c).sum(axis=0).dimshuffle(0, 1, "x"),
            lambda c: exp(c) / exp(c).sum(axis=0).dimshuffle("x", 0, 1, "x"),
            lambda c: exp(c) / exp(c).sum(axis=0).dimshuffle("x", 1, 0),
            lambda c: exp(c) / exp(c).sum(axis=(0, 1), keepdims=True),
        ],
    )
    def test_invalid_softmax_expressions(self, f):
        # Test that graphs are not rewritten into a softmax when a dimshuffle
        # swaps or adds extra dimensions, or when more than one but not all
        # axes are summed over (which is not allowed by the Softmax Op but is
        # otherwise valid).
        c = tensor3("c")
        out = f(c)
        f = aesara.function([c], out, mode=self.mode)

        f_ops = [n.op for n in f.maker.fgraph.toposort()]
        assert len(f_ops) > 1
        assert not any(isinstance(op, Softmax) for op in f_ops)
def test_exp_over_1_plus_exp(self):
    m = self.get_mode(excluding=["local_elemwise_fusion"])

    x = vector()
    data = np.random.rand(54).astype(config.floatX)

    backup = config.warn__identify_1pexp_bug
    config.warn__identify_1pexp_bug = False
    try:
        # tests exp_over_1_plus_exp
        f = aesara.function([x], exp(x) / (1 + exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid]
        f(data)
        f = aesara.function([x], exp(x) / (2 + exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = aesara.function([x], exp(x) / (1 - exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = aesara.function([x], exp(x + 1) / (1 + exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)

        # tests inv_1_plus_exp
        f = aesara.function([x], aet.fill(x, 1.0) / (1 + exp(-x)), mode=m)
        # todo: solve issue #4589 first
        # assert check_stack_trace(f, ops_to_check=sigmoid)
        assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid]
        f(data)
        f = aesara.function([x], aet.fill(x, 1.0) / (2 + exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = aesara.function([x], aet.fill(x, 1.0) / (1 - exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = aesara.function([x], aet.fill(x, 1.1) / (1 + exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)

        # tests inv_1_plus_exp with neg
        f = aesara.function([x], aet.fill(x, -1.0) / (1 + exp(-x)), mode=m)
        # todo: solve issue #4589 first
        # assert check_stack_trace(
        #     f, ops_to_check=[sigmoid, neg_inplace])
        assert [node.op for node in f.maker.fgraph.toposort()] == [
            sigmoid,
            neg_inplace,
        ]
        f(data)
        f = aesara.function([x], aet.fill(x, -1.0) / (1 - exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            neg_inplace,
        ]
        f(data)
        f = aesara.function([x], aet.fill(x, -1.0) / (2 + exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            neg_inplace,
        ]
        f(data)
        f = aesara.function([x], aet.fill(x, -1.1) / (1 + exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            neg_inplace,
        ]
        f(data)

        # tests double inv_1_plus_exp with neg
        # (-1)(exp(x)) / (1+exp(x))(1+exp(-x))
        # = (-1)/(1+exp(-x)) * exp(x)/(1+exp(x))
        # = - (sigm(x) * sigm(x))
        f = aesara.function(
            [x],
            (aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (1 + exp(-x))),
            mode=m,
        )
        # todo: solve issue #4589 first
        # assert check_stack_trace(f, ops_to_check=[sigmoid, mul])
        assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid, mul]
        f(data)
        f = aesara.function(
            [x],
            (aet.fill(x, -1.1) * exp(x)) / ((1 + exp(x)) * (1 + exp(-x))),
            mode=m,
        )
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            mul,
            neg_inplace,
        ]
        f(data)
        f = aesara.function(
            [x],
            (aet.fill(x, -1.0) * exp(x)) / ((2 + exp(x)) * (1 + exp(-x))),
            mode=m,
        )
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            mul,
            neg_inplace,
        ]
        f(data)
        f = aesara.function(
            [x],
            (aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (2 + exp(-x))),
            mode=m,
        )
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            mul,
            neg_inplace,
        ]
        f(data)
        f = aesara.function(
            [x],
            (aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (1 + exp(x))),
            mode=m,
        )
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            mul,
            neg_inplace,
        ]
        f(data)
        f = aesara.function(
            [x],
            (aet.fill(x, -1.0) * exp(x)) / ((1 + exp(x)) * (2 + exp(-x))),
            mode=m,
        )
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            mul,
            neg_inplace,
        ]
        f(data)
    finally:
        # Restore config option.
        config.warn__identify_1pexp_bug = backup
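# Illustration only (NumPy): the algebra in the "double inv_1_plus_exp with neg"
# comment above, (-1) * exp(x) / ((1 + exp(x)) * (1 + exp(-x))) == -sigmoid(x)**2.
_z = np.linspace(-5.0, 5.0, 11)
_sigm = 1.0 / (1.0 + np.exp(-_z))
assert np.allclose(
    -np.exp(_z) / ((1.0 + np.exp(_z)) * (1.0 + np.exp(-_z))),
    -_sigm**2,
)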