Code Example #1
File: batchnorm.py  Project: Sayam753/Theano-PyMC
def local_abstract_batch_norm_train_grad(fgraph, node):
    if not isinstance(node.op, AbstractBatchNormTrainGrad):
        return None

    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if (not isinstance(x.type, TensorType)
            or not isinstance(dy.type, TensorType)
            or not isinstance(scale.type, TensorType)
            or not isinstance(x_mean.type, TensorType)
            or not isinstance(x_invstd.type, TensorType)
            or not isinstance(epsilon.type, TensorType)):
        return None

    x_diff = x - x_mean
    mean_dy_x_diff = mean(dy * x_diff, axis=axes, keepdims=True)
    c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd**3))

    g_wrt_inputs = scale * (c - mean(c, axis=axes, keepdims=True))
    g_wrt_scale = at_sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
    g_wrt_bias = at_sum(dy, axis=axes, keepdims=True)
    results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]

    results = [
        at.patternbroadcast(r, r_orig.broadcastable)
        for (r, r_orig) in zip(results, node.outputs)
    ]

    for var in aesara.graph.basic.vars_between(node.inputs, results):
        if var not in node.inputs:
            copy_stack_trace(node.outputs[0], var)
    return results
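
A quick NumPy sanity check of the scale and bias gradient formulas used above; this is my own sketch, assuming the usual batch-norm forward y = scale * (x - x_mean) * x_invstd + bias with axes=(0,), and it is not part of the rewrite itself:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(8, 3))
scale = rng.normal(size=3)
bias = rng.normal(size=3)
dy = rng.normal(size=x.shape)  # upstream gradient
eps = 1e-5

x_mean = x.mean(axis=0, keepdims=True)
x_invstd = 1.0 / np.sqrt(x.var(axis=0, keepdims=True) + eps)
x_hat = (x - x_mean) * x_invstd

# Closed forms matching g_wrt_scale and g_wrt_bias above (axes=(0,)):
g_scale = (dy * x_hat).sum(axis=0)
g_bias = dy.sum(axis=0)

# Finite-difference check against the scalar cost sum(dy * y); x_mean and
# x_invstd do not depend on scale or bias, so they are held fixed here.
def cost(s, b):
    return np.sum(dy * (s * x_hat + b))

h = 1e-6
num_g_scale = np.array(
    [(cost(scale + h * e, bias) - cost(scale - h * e, bias)) / (2 * h) for e in np.eye(3)]
)
num_g_bias = np.array(
    [(cost(scale, bias + h * e) - cost(scale, bias - h * e)) / (2 * h) for e in np.eye(3)]
)
assert np.allclose(num_g_scale, g_scale, atol=1e-4)
assert np.allclose(num_g_bias, g_bias, atol=1e-4)
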
Code Example #2
File: test_builders.py  Project: lucianopaz/aesara
    def test_lop_override(self, cls_ofg):
        x = vector()
        y = 1.0 / (1.0 + exp(-x))

        def lop_ov(inps, outs, grads):
            (y_, ) = outs
            (dedy_, ) = grads
            return [2.0 * y_ * (1.0 - y_) * dedy_]

        y_, dedy = vector(), vector()
        op_lop_ov = cls_ofg([x, y_, dedy], [2.0 * y_ * (1.0 - y_) * dedy])

        xx = vector()
        yy1 = at_sum(sigmoid(xx))
        gyy1 = 2.0 * grad(yy1, xx)

        for ov in [lop_ov, op_lop_ov]:
            op = cls_ofg([x], [y], lop_overrides=ov)
            yy2 = at_sum(op(xx))
            gyy2 = grad(yy2, xx)
            fn = function([xx], [gyy1, gyy2])

            xval = np.random.random((32, )).astype(config.floatX)
            y1val, y2val = fn(xval)
            np.testing.assert_array_almost_equal(y1val, y2val, 4)
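
The override above deliberately returns twice the true L-op of the logistic sigmoid, which is why the reference value is 2.0 * grad(yy1, xx). A NumPy-only check of the underlying identity sigmoid'(z) = y * (1 - y), as a sketch rather than part of the test:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

z = np.linspace(-3.0, 3.0, 7)
y = sigmoid(z)
h = 1e-6
numeric = (sigmoid(z + h) - sigmoid(z - h)) / (2 * h)  # central difference
assert np.allclose(numeric, y * (1.0 - y), atol=1e-6)
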
Code Example #3
def test_pickle_unpickle_without_reoptimization():
    mode = config.mode
    if mode in ["DEBUG_MODE", "DebugMode"]:
        mode = "FAST_RUN"
    x1 = fmatrix("x1")
    x2 = fmatrix("x2")
    x3 = shared(np.ones((10, 10), dtype=floatX))
    x4 = shared(np.ones((10, 10), dtype=floatX))
    y = at_sum(at_sum(at_sum(x1 ** 2 + x2) + x3) + x4)

    updates = OrderedDict()
    updates[x3] = x3 + 1
    updates[x4] = x4 + 1
    f = function([x1, x2], y, updates=updates, mode=mode)

    # now pickle the compiled aesara fn
    string_pkl = pickle.dumps(f, -1)

    # compute f value
    in1 = np.ones((10, 10), dtype=floatX)
    in2 = np.ones((10, 10), dtype=floatX)

    # test unpickle without optimization
    default = config.reoptimize_unpickled_function
    try:
        # the default is True
        config.reoptimize_unpickled_function = False
        f_ = pickle.loads(string_pkl)
        assert f(in1, in2) == f_(in1, in2)
    finally:
        config.reoptimize_unpickled_function = default
Code Example #4
    def test_xent_thing_int32(self):
        x = matrix("x")
        y = lvector("y")
        yi = at.cast(y, "int32")
        expressions = [
            at_sum(-log(softmax(x)[at.arange(yi.shape[0]), yi])),
            -at_sum(log(softmax(x)[at.arange(yi.shape[0]), yi])),
            -at_sum(log(softmax(x))[at.arange(yi.shape[0]), yi]),
            at_sum(-log(softmax(x))[at.arange(yi.shape[0]), yi]),
        ]

        for expr in expressions:
            fgraph = FunctionGraph([x, y], [expr])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 5
            assert crossentropy_softmax_argmax_1hot_with_bias in ops
            assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

            # Also verify the gradient wrt x
            fgraph = FunctionGraph([x, y], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 3
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_legacy in ops
            assert softmax_grad_legacy not in ops
Code Example #5
File: test_jax.py  Project: mgorny/aesara
def test_jax_CAReduce():
    a_at = vector("a")
    a_at.tag.test_value = np.r_[1, 2, 3].astype(config.floatX)

    x = at_sum(a_at, axis=None)
    x_fg = FunctionGraph([a_at], [x])

    compare_jax_and_py(x_fg, [np.r_[1, 2, 3].astype(config.floatX)])

    a_at = matrix("a")
    a_at.tag.test_value = np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)

    x = at_sum(a_at, axis=0)
    x_fg = FunctionGraph([a_at], [x])

    compare_jax_and_py(x_fg, [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])

    x = at_sum(a_at, axis=1)
    x_fg = FunctionGraph([a_at], [x])

    compare_jax_and_py(x_fg, [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])

    a_at = matrix("a")
    a_at.tag.test_value = np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)

    x = prod(a_at, axis=0)
    x_fg = FunctionGraph([a_at], [x])

    compare_jax_and_py(x_fg, [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])

    x = at_all(a_at)
    x_fg = FunctionGraph([a_at], [x])

    compare_jax_and_py(x_fg, [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])
Code Example #6
    def test_grad(self):
        x = vector("x")
        a = np.random.random(50).astype(config.floatX)

        aesara.function([x], grad(at_sum(diff(x)), x))
        utt.verify_grad(self.op, [a])

        for k in range(TestDiffOp.nb):
            aesara.function([x], grad(at_sum(diff(x, n=k)), x))
            utt.verify_grad(DiffOp(n=k), [a], eps=7e-3)
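
For n=1, sum(diff(x)) telescopes to x[-1] - x[0], so the expected gradient is [-1, 0, ..., 0, 1]. A small NumPy illustration of that fact (my own sketch, assuming DiffOp mirrors np.diff):

import numpy as np

def f(v):
    return np.sum(np.diff(v))  # telescopes to v[-1] - v[0]

v0 = np.random.default_rng(0).normal(size=7)
h = 1e-6
numeric = np.array([(f(v0 + h * e) - f(v0 - h * e)) / (2 * h) for e in np.eye(7)])
expected = np.zeros(7)
expected[0], expected[-1] = -1.0, 1.0
assert np.allclose(numeric, expected, atol=1e-6)
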
Code Example #7
File: test_builders.py  Project: lucianopaz/aesara
    def test_grad_grad(self, cls_ofg):
        x, y, z = matrices("xyz")
        e = x + y * z
        op = cls_ofg([x, y, z], [e])
        f = op(x, y, z)
        f = f - grad(at_sum(f), y)
        f = f - grad(at_sum(f), y)
        fn = function([x, y, z], f)
        xv = np.ones((2, 2), dtype=config.floatX)
        yv = np.ones((2, 2), dtype=config.floatX) * 3
        zv = np.ones((2, 2), dtype=config.floatX) * 5
        np.testing.assert_array_almost_equal(6.0, fn(xv, yv, zv), 4)
Code Example #8
File: test_opt.py  Project: Sayam753/Theano-PyMC
    def __call__(self, v, cost, parameters, damp):
        # compute Gauss-Newton Matrix right-multiplied by `v`
        Jv = Rop(self._s, parameters, v)
        HJv = grad(at_sum(grad(cost, self._s) * Jv),
                   self._s,
                   consider_constant=[Jv])
        JHJv = grad(at_sum(HJv * self._s),
                    parameters,
                    consider_constant=[HJv, Jv])

        # apply Tikhonov damping
        JHJv = [JHJvi + damp * vi for JHJvi, vi in zip(JHJv, v)]
        return JHJv
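
For reference, a dense NumPy sketch of the quantity this method computes: the Gauss-Newton matrix-vector product J^T H J v plus Tikhonov damping. The names below are hypothetical, and the real code uses R-ops and grads precisely to avoid materializing J or H:

import numpy as np

rng = np.random.default_rng(0)
J = rng.normal(size=(5, 3))  # Jacobian of the network output s wrt the parameters
H = np.eye(5)                # Hessian of the cost wrt s (identity for a squared-error cost)
v = rng.normal(size=3)
damp = 0.1

Jv = J @ v                   # dense analogue of Rop(self._s, parameters, v)
HJv = H @ Jv
JHJv = J.T @ HJv + damp * v  # damped Gauss-Newton matrix-vector product
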
Code Example #9
File: test_types.py  Project: mgorny/aesara
    def test_swap_SharedVariable_with_given(self):
        # A special test case for logistic_sgd.py in the Deep Learning Tutorial.
        # It asserts that SharedVariables used in different functions share the
        # same storage.

        train_x = shared(value=np.random.random((10,
                                                 10)).astype(config.floatX))
        test_x = shared(value=np.random.random((10, 10)).astype(config.floatX))

        train_y = shared(value=np.random.random((10, 1)).astype(config.floatX))
        test_y = shared(value=np.random.random((10, 1)).astype(config.floatX))

        i = iscalar("index")
        x = vector("x")
        y = vector("y")
        # this formula is meaningless; it only exists for the test
        out = (at_sum(x) - y)**2
        train = function(
            [i],
            out,
            givens={
                x: train_x[i],
                y: train_y[i]
            },
            updates={train_x: train_x + 0.1},
        )

        test_def = function([i], out, givens={x: test_x[i], y: test_y[i]})
        test_cpy = train.copy(swap={
            train_x: test_x,
            train_y: test_y
        },
                              delete_updates=True)

        for in1, in2 in zip(test_def.maker.inputs, test_cpy.maker.inputs):
            assert in1.value is in2.value
Code Example #10
File: test_gradient.py  Project: mgorny/aesara
def test_hessian():
    x = vector()
    y = at_sum(x**2)
    Hx = hessian(y, x)
    f = aesara.function([x], Hx)
    vx = np.arange(10).astype(aesara.config.floatX)
    assert np.allclose(f(vx), np.eye(10) * 2)
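
The expected value follows because the Hessian of sum(x**2) is 2*I. The same check done with NumPy finite differences (my own sketch, not part of the test):

import numpy as np

def f(v):
    return np.sum(v ** 2)

n = 5
v0 = np.arange(n, dtype=float)
h = 1e-4
H = np.empty((n, n))
for i in range(n):
    for j in range(n):
        ei, ej = h * np.eye(n)[i], h * np.eye(n)[j]
        # forward-difference approximation of d^2 f / dv_i dv_j
        H[i, j] = (f(v0 + ei + ej) - f(v0 + ei) - f(v0 + ej) + f(v0)) / h ** 2
assert np.allclose(H, 2 * np.eye(n), atol=1e-4)
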
Code Example #11
    def test_grad(self):
        x = vector()
        np_x = np.random.randn(7).astype(aesara.config.floatX)

        # offset = 0 case:
        mtx_x = GpuAllocDiag()(x)
        sum_mtx_x = at_sum(mtx_x)
        grad_x = aesara.grad(sum_mtx_x, x)
        grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)

        fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
        fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)

        computed_grad_x = fn_grad_x(np_x)
        computed_grad_mtx_x = fn_grad_mtx_x(np_x)
        true_grad_x = np.diagonal(computed_grad_mtx_x, 0)
        assert np.allclose(computed_grad_x, true_grad_x)

        # offset > 0 case:
        mtx_x = GpuAllocDiag(2)(x)
        sum_mtx_x = at_sum(mtx_x)
        grad_x = aesara.grad(sum_mtx_x, x)
        grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)

        fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
        fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)

        computed_grad_x = fn_grad_x(np_x)
        computed_grad_mtx_x = fn_grad_mtx_x(np_x)
        true_grad_x = np.diagonal(computed_grad_mtx_x, 2)
        assert np.allclose(computed_grad_x, true_grad_x)

        # offset < 0 case:
        mtx_x = GpuAllocDiag(-3)(x)
        sum_mtx_x = at_sum(mtx_x)
        grad_x = aesara.grad(sum_mtx_x, x)
        grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)

        fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
        fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)

        computed_grad_x = fn_grad_x(np_x)
        computed_grad_mtx_x = fn_grad_mtx_x(np_x)
        true_grad_x = np.diagonal(computed_grad_mtx_x, -3)
        assert np.allclose(computed_grad_x, true_grad_x)
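
The diagonal extraction in the assertions above is just the chain rule: AllocDiag places x on the k-th diagonal of a matrix, so the gradient wrt x picks that diagonal out of the gradient wrt the matrix. A NumPy-only illustration (my own sketch, with np.diag standing in for GpuAllocDiag):

import numpy as np

x = np.random.default_rng(0).normal(size=7)
k = 2                              # diagonal offset
mtx = np.diag(x, k=k)              # what AllocDiag(k) builds from x
grad_mtx = np.ones_like(mtx)       # d sum(mtx) / d mtx
grad_x = np.diagonal(grad_mtx, k)  # chain rule: keep only the entries filled by x
assert np.allclose(grad_x, np.ones_like(x))
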
Code Example #12
    def test_shared_grad(self, cls_ofg):
        x, y, z = matrices("xyz")
        s = shared(np.random.rand(2, 2).astype(config.floatX))
        e = x + y * z + s
        op = cls_ofg([x, y, z], [e])
        f = op(x, y, z)
        f = f - grad(at_sum(f), y)
        fn = function([x, y, z], f)
        xv = np.ones((2, 2), dtype=config.floatX)
        yv = np.ones((2, 2), dtype=config.floatX) * 3
        zv = np.ones((2, 2), dtype=config.floatX) * 5
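        # d(sum(f))/dy == z, so with x == 1, y == 3, z == 5 the result is
        # (x + y * z + s) - z == 11 + s elementwise.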
        np.testing.assert_array_almost_equal(11.0 + s.get_value(), fn(xv, yv, zv), 4)

        # take the gradient wrt the shared variable
        f = op(x, y, z)
        f = f - grad(at_sum(f), s)
        fn = function([x, y, z], f)
        np.testing.assert_array_almost_equal(15.0 + s.get_value(), fn(xv, yv, zv), 4)
Code Example #13
File: test_builders.py  Project: lucianopaz/aesara
    def test_grad(self, cls_ofg):
        x, y, z = matrices("xyz")
        e = x + y * z
        op = cls_ofg([x, y, z], [e])
        f = op(x, y, z)
        f = f - grad(at_sum(f), y)
        fn = function([x, y, z], f)
        xv = np.ones((2, 2), dtype=config.floatX)
        yv = np.ones((2, 2), dtype=config.floatX) * 3
        zv = np.ones((2, 2), dtype=config.floatX) * 5
        assert np.all(11.0 == fn(xv, yv, zv))
Code Example #14
File: test_opt.py  Project: mgorny/aesara
            def predict_mean_i(i, x_star, s_star, X, beta, h):
                n, D = shape(X)
                # rescale every dimension by the corresponding inverse lengthscale
                iL = at.diag(h[i, :D])
                inp = (X - x_star).dot(iL)

                # compute the mean
                B = iL.dot(s_star).dot(iL)
                t = inp.dot(B)

                lb = (inp * t).sum() + beta.sum()

                Mi = at_sum(lb) * h[i, D]
                return Mi
Code Example #15
File: test_rng_mrg.py  Project: Sayam753/Theano-PyMC
def test_gradient_scan():
    # Test for a crash when using MRG inside scan and taking the gradient
    # See https://groups.google.com/d/msg/theano-dev/UbcYyU5m-M8/UO9UgXqnQP0J
    aesara_rng = MRG_RandomStream(10)
    w = shared(np.ones(1, dtype="float32"))

    def one_step(x):
        return x + aesara_rng.uniform((1,), dtype="float32") * w

    x = vector(dtype="float32")
    values, updates = scan(one_step, outputs_info=x, n_steps=10)
    gw = grad(at_sum(values[-1]), w)
    f = function([x], gw)
    f(np.arange(1, dtype="float32"))
Code Example #16
def test_gradient_scan(mode):
    aesara_rng = MRG_RandomStream(10)
    w = shared(np.ones(1, dtype="float32"))

    def one_step(x):
        return x + aesara_rng.uniform((1, ), dtype="float32") * w

    x = vector(dtype="float32")
    values, updates = scan(one_step, outputs_info=x, n_steps=10)
    gw = grad(at_sum(values[-1]), w)
    f = function([x], gw, mode=mode)
    assert np.allclose(
        f(np.arange(1, dtype=np.float32)),
        np.array([0.13928187], dtype=np.float32),
        rtol=1e6,
    )
Code Example #17
def test_undefined_grad_opt():
    # Make sure that undefined grad get removed in optimized graph.
    random = MRG_RandomStream(
        np.random.default_rng(utt.fetch_seed()).integers(1, 2147462579))
    pvals = shared(np.random.random((10, 20)).astype(config.floatX))
    pvals = pvals / pvals.sum(axis=1)
    pvals = zero_grad(pvals)
    samples = random.multinomial(pvals=pvals, n=1)
    samples = cast(samples, pvals.dtype)
    samples = zero_grad(samples)
    cost = at_sum(samples + pvals)
    grad_out = grad(cost, samples)
    f = function([], grad_out)
    assert not any(
        isinstance(node.op, UndefinedGrad)
        for node in f.maker.fgraph.apply_nodes)
Code Example #18
    def __init__(
        self,
        input=None,
        target=None,
        n_input=1,
        n_hidden=1,
        n_output=1,
        lr=1e-3,
        **kw,
    ):
        super().__init__(**kw)

        if input is None:
            input = dvector("input")
        if target is None:
            target = dvector("target")

        self.input = input
        self.target = target
        self.lr = shared(lr, "learning_rate")
        self.w1 = shared(np.zeros((n_hidden, n_input)), "w1")
        self.w2 = shared(np.zeros((n_output, n_hidden)), "w2")
        # print self.lr.type

        self.hidden = sigmoid(dot(self.w1, self.input))
        self.output = dot(self.w2, self.hidden)
        self.cost = at_sum((self.output - self.target)**2)

        self.sgd_updates = {
            self.w1: self.w1 - self.lr * grad(self.cost, self.w1),
            self.w2: self.w2 - self.lr * grad(self.cost, self.w2),
        }

        self.sgd_step = pfunc(
            params=[self.input, self.target],
            outputs=[self.output, self.cost],
            updates=self.sgd_updates,
        )

        self.compute_output = pfunc([self.input], self.output)

        self.output_from_hidden = pfunc([self.hidden], self.output)
Code Example #19
    def test_softmax_grad_optimizations(self):
        x = matrix("x")
        one_of_n = lvector("one_of_n")
        op = crossentropy_categorical_1hot
        xe = op(softmax_legacy(x), one_of_n)
        sum_xe = at_sum(xe)
        g_x = grad(sum_xe, x)
        fgraph = FunctionGraph([x, one_of_n], [g_x])
        assert check_stack_trace(
            fgraph,
            ops_to_check=[
                crossentropy_softmax_1hot_with_bias_dx, softmax_legacy
            ],
        )

        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = {node.op for node in fgraph.toposort()}
        assert crossentropy_softmax_argmax_1hot_with_bias not in ops
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops
Code Example #20
    def test_matrix_perform_and_opt(self):
        m = config.mode
        m = aesara.compile.get_mode(m)
        m.check_isfinite = False
        x, y = matrices("xy")
        # regular softmax and crossentropy
        sm = softmax(x)
        cm = categorical_crossentropy(sm, y)

        # numerically stable log-softmax with crossentropy
        logsm = logsoftmax(x)
        sm2 = exp(logsm)  # just used to show equivalence with sm
        cm2 = -at_sum(y * logsm, axis=1)
        grad_node = grad(cm2.mean(), x)

        rng = np.random.default_rng(utt.fetch_seed())

        a = np.exp(10 * rng.random((5, 10)).astype(config.floatX))
        b = np.eye(5, 10).astype(config.floatX)

        # show equivalence of softmax and exponentiated numerically stable
        # log-softmax
        f1 = aesara.function([x], [sm, sm2])
        sm_, sm2_ = f1(a)
        utt.assert_allclose(sm_, sm2_)

        # now show that the two versions result in the same crossentropy cost
        # this indicates that the forward function does provide some numerical
        # stability
        f2 = aesara.function([x, y], [cm, cm2], mode=m)
        cm_, cm2_ = f2(a, b)
        utt.assert_allclose(cm_, cm2_)

        # now, show that in the standard softmax case the gradients blow up
        # while in the log-softmax case they don't
        f3 = aesara.function([x, y], [grad_node])
        grad_ = f3(a, b)
        assert not np.any(np.isnan(grad_))
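
A stripped-down NumPy illustration of why the log-softmax path is the numerically stable one (my own sketch, not part of the test):

import numpy as np

x = np.array([0.0, 1000.0])
with np.errstate(over="ignore", divide="ignore", invalid="ignore"):
    # naive log(softmax(x)): exp(1000) overflows, leaving -inf/nan entries
    naive = np.log(np.exp(x) / np.exp(x).sum())
# shifted log-softmax stays finite: approximately [-1000., 0.]
stable = x - x.max() - np.log(np.exp(x - x.max()).sum())
assert not np.all(np.isfinite(naive))
assert np.all(np.isfinite(stable))
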
Code Example #21
File: test_opt.py  Project: Sayam753/Theano-PyMC
def test_local_csm_grad_c():
    data = vector()
    indices, indptr, shape = (ivector(), ivector(), ivector())
    mode = get_default_mode()

    if aesara.config.mode == "FAST_COMPILE":
        mode = Mode(linker="c|py", optimizer="fast_compile")

    mode = mode.including("specialize", "local_csm_grad_c")
    for CS, cast in [
        (sparse.CSC, sp.sparse.csc_matrix),
        (sparse.CSR, sp.sparse.csr_matrix),
    ]:
        cost = at_sum(sparse.DenseFromSparse()(CS(data, indices, indptr,
                                                  shape)))
        f = aesara.function([data, indices, indptr, shape],
                            aesara.grad(cost, data),
                            mode=mode)
        assert not any(
            isinstance(node.op, sparse.CSMGrad)
            for node in f.maker.fgraph.toposort())
        v = cast(random_lil((10, 40), config.floatX, 3))
        f(v.data, v.indices, v.indptr, v.shape)
Code Example #22
def test_broadcastable():
    R = MRG_RandomStream(234)
    x = matrix()
    size1 = (10, 1)
    size2 = (x.shape[0], 1)
    pvals_1 = np.random.uniform(0, 1, size=size1)
    pvals_1 = pvals_1 / sum(pvals_1)
    pvals_2 = R.uniform(size=size2)
    pvals_2 = pvals_2 / at_sum(pvals_2)

    for distribution in [
            R.uniform,
            R.normal,
            R.truncated_normal,
            R.binomial,
            R.multinomial,
            R.multinomial_wo_replacement,
    ]:
        # multinomial and multinomial_wo_replacement do not support the "size"
        # argument; their sizes are implicitly defined by the "pvals" argument.
        if distribution in [R.multinomial, R.multinomial_wo_replacement]:
            # check when all dimensions are constant
            uu = distribution(pvals=pvals_1)
            assert uu.broadcastable == (False, True)

            # check when some dimensions are aesara variables
            uu = distribution(pvals=pvals_2)
            assert uu.broadcastable == (False, True)
        else:
            # check when all dimensions are constant
            uu = distribution(size=size1)
            assert uu.broadcastable == (False, True)

            # check when some dimensions are aesara variables
            uu = distribution(size=size2)
            assert uu.broadcastable == (False, True)
Code Example #23
File: test_builders.py  Project: lucianopaz/aesara
    def test_grad_override(self, cls_ofg):
        x, y = vectors("xy")

        def go(inps, gs):
            x, y = inps
            (g, ) = gs
            return [g * y * 2, g * x * 1.5]

        dedz = vector("dedz")
        op_mul_grad = cls_ofg([x, y, dedz], go([x, y], [dedz]))

        op_mul = cls_ofg([x, y], [x * y], grad_overrides=go)
        op_mul2 = cls_ofg([x, y], [x * y], grad_overrides=op_mul_grad)

        # single override case (function or OfG instance)
        xx, yy = vector("xx"), vector("yy")
        for op in [op_mul, op_mul2]:
            zz = at_sum(op(xx, yy))
            dx, dy = grad(zz, [xx, yy])
            fn = function([xx, yy], [dx, dy])
            xv = np.random.random((16, )).astype(config.floatX)
            yv = np.random.random((16, )).astype(config.floatX)
            dxv, dyv = fn(xv, yv)
            np.testing.assert_array_almost_equal(yv * 2, dxv, 4)
            np.testing.assert_array_almost_equal(xv * 1.5, dyv, 4)

        # list override case
        def go1(inps, gs):
            x, w, b = inps
            g = gs[0]
            return g * w * 2

        def go2(inps, gs):
            x, w, b = inps
            g = gs[0]
            return g * x * 1.5

        w, b = vectors("wb")
        # we make the 3rd gradient default (no override)
        op_linear = cls_ofg([x, w, b], [x * w + b],
                            grad_overrides=[go1, go2, "default"])
        xx, ww, bb = vector("xx"), vector("yy"), vector("bb")
        zz = at_sum(op_linear(xx, ww, bb))
        dx, dw, db = grad(zz, [xx, ww, bb])
        fn = function([xx, ww, bb], [dx, dw, db])
        xv = np.random.random((16, )).astype(config.floatX)
        wv = np.random.random((16, )).astype(config.floatX)
        bv = np.random.random((16, )).astype(config.floatX)
        dxv, dwv, dbv = fn(xv, wv, bv)
        np.testing.assert_array_almost_equal(wv * 2, dxv, 4)
        np.testing.assert_array_almost_equal(xv * 1.5, dwv, 4)
        np.testing.assert_array_almost_equal(np.ones(16, dtype=config.floatX),
                                             dbv, 4)

        # NullType and DisconnectedType
        op_linear2 = cls_ofg(
            [x, w, b],
            [x * w + b],
            grad_overrides=[go1, NullType()(),
                            DisconnectedType()()],
        )
        zz2 = at_sum(op_linear2(xx, ww, bb))
        dx2, dw2, db2 = grad(
            zz2,
            [xx, ww, bb],
            return_disconnected="Disconnected",
            disconnected_inputs="ignore",
            null_gradients="return",
        )
        assert isinstance(dx2.type, TensorType)
        assert dx2.ndim == 1
        assert isinstance(dw2.type, NullType)
        assert isinstance(db2.type, DisconnectedType)
Code Example #24
    def test_crossentropy_softmax_1hot_with_bias_dxcale_cost(self):
        x = matrix("x")
        y = lvector("y")
        a = scalar("a")

        def validate_grad_graph(func):
            # The graph of the gradient should not have softmaxgrad anymore
            has_cx1hotdx = False
            has_softmax = False
            has_softmaxdx = False
            for node in func.maker.fgraph.toposort():
                if node.op == crossentropy_softmax_1hot_with_bias_dx:
                    has_cx1hotdx = True
                if node.op == softmax_legacy:
                    has_softmax = True
                if node.op == softmax_grad_legacy:
                    has_softmaxdx = True

            assert has_cx1hotdx
            assert has_softmax
            assert not has_softmaxdx

        # Cases to test
        expressions = [
            a * at_sum(-log(softmax(x)[at.arange(y.shape[0]), y])),
            -a * at_sum(log(softmax(x)[at.arange(y.shape[0]), y])),
            a * (-at_sum(log(softmax(x)[at.arange(y.shape[0]), y]))),
            a * at_sum(log(softmax(x)[at.arange(y.shape[0]), y])),
            a * at_sum(-log(softmax(x))[at.arange(y.shape[0]), y]),
            -a * at_sum(log(softmax(x))[at.arange(y.shape[0]), y]),
            a * (-at_sum(log(softmax(x))[at.arange(y.shape[0]), y])),
            a * at_sum(log(softmax(x))[at.arange(y.shape[0]), y]),
            a * mean(-log(softmax(x)[at.arange(y.shape[0]), y])),
            -a * mean(log(softmax(x)[at.arange(y.shape[0]), y])),
            a * (-mean(log(softmax(x)[at.arange(y.shape[0]), y]))),
            a * mean(log(softmax(x)[at.arange(y.shape[0]), y])),
            a * mean(-log(softmax(x))[at.arange(y.shape[0]), y]),
            -a * mean(log(softmax(x))[at.arange(y.shape[0]), y]),
            a * (-mean(log(softmax(x))[at.arange(y.shape[0]), y])),
            a * mean(log(softmax(x))[at.arange(y.shape[0]), y]),
        ]

        for expr in expressions:
            fgraph = FunctionGraph([x, y, a], [expr])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            assert 5 <= len(fgraph.toposort()) <= 10

            ops = {node.op for node in fgraph.toposort()}
            assert crossentropy_softmax_argmax_1hot_with_bias in ops
            assert softmax_legacy not in ops

            # Verify the gradient wrt x
            fgraph = FunctionGraph([x, y, a], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            assert 3 <= len(fgraph.toposort()) <= 6

            ops = {node.op for node in fgraph.toposort()}
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_legacy in ops
            assert softmax_grad_legacy not in ops

            # Verify the gradient when providing output gradient
            fgraph = FunctionGraph(
                [x, y, a], [grad(expr, x, known_grads={expr: a * x.sum()})])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            assert 6 <= len(fgraph.toposort()) <= 8

            ops = {node.op for node in fgraph.toposort()}
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_legacy in ops
            assert softmax_grad_legacy not in ops
Code Example #25
def local_subtensor_rv_lift(fgraph, node):
    """Lift a ``*Subtensor`` through ``RandomVariable`` inputs.

    In a fashion similar to ``local_dimshuffle_rv_lift``, the indexed dimensions
    need to be separated into distinct replication-space and (independent)
    parameter-space ``*Subtensor``s.

    The replication-space ``*Subtensor`` can be used to determine a
    sub/super-set of the replication-space and, thus, a "smaller"/"larger"
    ``size`` tuple.  The parameter-space ``*Subtensor`` is simply lifted and
    applied to the distribution parameters.

    Consider the following example graph:
    ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]``.  The
    ``*Subtensor`` ``Op`` requests indices ``idx1``, ``idx2``, and ``idx3``,
    which correspond to all three ``size`` dimensions.  Now, depending on the
    broadcasted dimensions of ``mu`` and ``std``, this ``*Subtensor`` ``Op``
    could be reducing the ``size`` parameter and/or sub-setting the independent
    ``mu`` and ``std`` parameters.  Only once the dimensions are properly
    separated into the two replication/parameter subspaces can we determine how
    the ``*Subtensor`` indices are distributed.
    For instance, ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]``
    could become
    ``normal(mu[idx1], std[idx2], size=np.shape(idx1) + np.shape(idx2) + np.shape(idx3))``
    if ``mu.shape == std.shape == ()``.

    ``normal`` is a rather simple case, because it's univariate.  Multivariate
    cases require a mapping between the parameter space and the image of the
    random variable.  This may not always be possible, but for many common
    distributions it is.  For example, the dimensions of the multivariate
    normal's image can be mapped directly to each dimension of its parameters.
    We use these mappings to change a graph like ``multivariate_normal(mu, Sigma)[idx1]``
    into ``multivariate_normal(mu[idx1], Sigma[idx1, idx1])``.

    """

    st_op = node.op

    if not isinstance(st_op,
                      (AdvancedSubtensor, AdvancedSubtensor1, Subtensor)):
        return False

    base_rv = node.inputs[0]

    rv_node = base_rv.owner
    if not (rv_node and isinstance(rv_node.op, RandomVariable)):
        return False

    # If no one else is using the underlying `RandomVariable`, then we can
    # do this; otherwise, the graph would be internally inconsistent.
    if is_rv_used_in_graph(base_rv, node, fgraph):
        return False

    rv_op = rv_node.op
    rng, size, dtype, *dist_params = rv_node.inputs

    # TODO: Remove this once the multi-dimensional changes described below are
    # in place.
    if rv_op.ndim_supp > 0:
        return False

    rv_op = base_rv.owner.op
    rng, size, dtype, *dist_params = base_rv.owner.inputs

    idx_list = getattr(st_op, "idx_list", None)
    if idx_list:
        cdata = get_idx_list(node.inputs, idx_list)
    else:
        cdata = node.inputs[1:]

    st_indices, st_is_bool = zip(*tuple(
        (as_index_variable(i), getattr(i, "dtype", None) == "bool")
        for i in cdata))

    # We need to separate dimensions into replications and independents
    num_ind_dims = None
    if len(dist_params) == 1:
        num_ind_dims = dist_params[0].ndim
    else:
        # When there is more than one distribution parameter, assume that all
        # of them will broadcast to the maximum number of dimensions
        num_ind_dims = max(d.ndim for d in dist_params)

    reps_ind_split_idx = base_rv.ndim - (num_ind_dims + rv_op.ndim_supp)

    if len(st_indices) > reps_ind_split_idx:
        # These are the indices that need to be applied to the parameters
        ind_indices = tuple(st_indices[reps_ind_split_idx:])

        # We need to broadcast the parameters before applying the `*Subtensor*`
        # with these indices, because the indices could be referencing broadcast
        # dimensions that don't exist (yet)
        bcast_dist_params = broadcast_params(dist_params, rv_op.ndims_params)

        # TODO: For multidimensional distributions, we need a map that tells us
        # which dimensions of the parameters need to be indexed.
        #
        # For example, `multivariate_normal` would have the following:
        # `RandomVariable.param_to_image_dims = ((0,), (0, 1))`
        #
        # I.e. the first parameter's (i.e. the mean's) first dimension maps
        # directly to the dimension of the RV's image, and the second parameter's
        # (i.e. the covariance's) first and second dimensions both map to that
        # same dimension of the image.

        args_lifted = tuple(p[ind_indices] for p in bcast_dist_params)
    else:
        # In this case, no indexing is applied to the parameters; only the
        # `size` parameter is affected.
        args_lifted = dist_params

    # TODO: Could use `ShapeFeature` info.  We would need to be sure that
    # `node` isn't in the results, though.
    # if hasattr(fgraph, "shape_feature"):
    #     output_shape = fgraph.shape_feature.shape_of(node.outputs[0])
    # else:
    output_shape = indexed_result_shape(base_rv.shape, st_indices)

    size_lifted = (output_shape if rv_op.ndim_supp == 0 else
                   output_shape[:-rv_op.ndim_supp])

    # Boolean indices can actually change the `size` value (compared to just
    # *which* dimensions of `size` are used).
    if any(st_is_bool):
        size_lifted = tuple(
            at_sum(idx) if is_bool else s
            for s, is_bool, idx in zip(size_lifted, st_is_bool, st_indices[:(
                reps_ind_split_idx + 1)]))

    new_node = rv_op.make_node(rng, size_lifted, dtype, *args_lifted)
    _, new_rv = new_node.outputs

    # Calling `Op.make_node` directly circumvents test value computations, so
    # we need to compute the test values manually
    if config.compute_test_value != "off":
        compute_test_value(new_node)

    return [new_rv]
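
A NumPy-only sketch of the shape reasoning described in the docstring, for the simplest case where only the leading (replication) dimension is indexed; the names here are hypothetical, and the point is just that indexing a draw is shape-equivalent to drawing with a reduced size:

import numpy as np

rng = np.random.default_rng(0)
size = (4, 5, 6)
idx = np.array([0, 2, 3])

# Indexing a draw along the leading (replication) dimension ...
indexed_draw = rng.normal(size=size)[idx]
# ... has the same shape as drawing directly with a reduced `size`,
# which is the kind of graph this rewrite produces.
reduced_draw = rng.normal(size=(len(idx),) + size[1:])
assert indexed_draw.shape == reduced_draw.shape == (3, 5, 6)
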
Code Example #26
File: batchnorm.py  Project: Sayam753/Theano-PyMC
    def grad(self, inp, grads):
        x, dy, scale, x_mean, x_invstd, epsilon = inp
        ddinputs, ddscale, ddbias = grads

        x_diff = x - x_mean
        mean_dy_x_diff = mean(dy * x_diff, axis=self.axes, keepdims=True)

        # compute gradients given each of the output gradients
        g_wrt_x = 0
        g_wrt_dy = 0
        g_wrt_scale = 0
        g_wrt_x_mean = 0
        g_wrt_x_invstd = 0

        if not isinstance(ddinputs.type, aesara.gradient.DisconnectedType):
            ccc = scale * (ddinputs -
                           mean(ddinputs, axis=self.axes, keepdims=True))
            ddd = (x_invstd**3) * (
                ccc * mean(dy * x_diff, axis=self.axes, keepdims=True) +
                dy * mean(ccc * x_diff, axis=self.axes, keepdims=True))

            g_wrt_x = g_wrt_x - ddd
            g_wrt_dy = g_wrt_dy + ((ccc * x_invstd) - (
                (x_invstd**3) * x_diff *
                mean(ccc * x_diff, axis=self.axes, keepdims=True)))

            eee = (dy * x_invstd) - ((x_invstd**3) * x_diff * mean_dy_x_diff)
            g_wrt_scale = g_wrt_scale + at_sum(
                ddinputs * (eee - mean(eee, axis=self.axes, keepdims=True)),
                axis=self.axes,
                keepdims=True,
            )

            g_wrt_x_mean = g_wrt_x_mean + at_sum(
                ddd, axis=self.axes, keepdims=True)
            g_wrt_x_invstd = g_wrt_x_invstd + at_sum(
                ccc * (dy - 3 * (x_invstd**2) * x_diff * mean_dy_x_diff),
                axis=self.axes,
                keepdims=True,
            )

        if not isinstance(ddscale.type, aesara.gradient.DisconnectedType):
            g_wrt_x = g_wrt_x + (x_invstd * ddscale * dy)
            g_wrt_dy = g_wrt_dy + (x_invstd * ddscale * x_diff)
            g_wrt_x_mean = g_wrt_x_mean - (
                x_invstd * ddscale * at_sum(dy, axis=self.axes, keepdims=True))
            g_wrt_x_invstd = g_wrt_x_invstd + (
                ddscale * at_sum(dy * x_diff, axis=self.axes, keepdims=True))

        if not isinstance(ddbias.type, aesara.gradient.DisconnectedType):
            g_wrt_dy = g_wrt_dy + at.fill(dy, ddbias)

        # depending on which output gradients are given,
        # some inputs should be disconnected
        results = [
            g_wrt_x,
            g_wrt_dy,
            g_wrt_scale,
            g_wrt_x_mean,
            g_wrt_x_invstd,
            aesara.gradient.DisconnectedType()(),
        ]
        return [
            aesara.gradient.DisconnectedType()() if
            (isinstance(r, int) and r == 0) else r for r in results
        ]
Code Example #27
    def test_get_rid_of_advanced_indexing_version_of_xent(self):
        x = matrix("x")
        b = vector("b")
        y = lvector("y")

        # Basic case
        expressions = [
            at_sum(-log(softmax(x)[at.arange(y.shape[0]), y])),
            -at_sum(log(softmax(x)[at.arange(y.shape[0]), y])),
            -at_sum(log(softmax(x))[at.arange(y.shape[0]), y]),
            at_sum(-log(softmax(x))[at.arange(y.shape[0]), y]),
        ]
        for expr in expressions:

            fgraph = FunctionGraph([x, y], [expr])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 4
            assert crossentropy_softmax_argmax_1hot_with_bias in ops
            assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

            # Also verify the gradient wrt x
            fgraph = FunctionGraph([x, y], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 2
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_legacy in ops
            assert softmax_grad_legacy not in ops

        # Test that a biased softmax is optimized correctly
        bias_expressions = [
            at_sum(-log(softmax(x + b)[at.arange(y.shape[0]), y])),
            -at_sum(log(softmax(b + x)[at.arange(y.shape[0]), y])),
            -at_sum(log(softmax(x + b))[at.arange(y.shape[0]), y]),
            at_sum(-log(softmax(b + x))[at.arange(y.shape[0]), y]),
        ]

        for expr in bias_expressions:
            fgraph = FunctionGraph([x, b, y], [expr, x])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 2  # [big_op, sum]
            assert crossentropy_softmax_argmax_1hot_with_bias in ops

            fgraph = FunctionGraph([x, b, y], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 2
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_with_bias in ops
            assert softmax_grad_legacy not in ops

        # Test that using "mean" instead of sum works, too
        mean_expressions = [
            mean(-log(softmax(x)[at.arange(y.shape[0]), y])),
            -mean(log(softmax(x)[at.arange(y.shape[0]), y])),
            -mean(log(softmax(x))[at.arange(y.shape[0]), y]),
            mean(-log(softmax(x))[at.arange(y.shape[0]), y]),
        ]

        for expr in mean_expressions:

            fgraph = FunctionGraph([x, y], [expr])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 6
            assert crossentropy_softmax_argmax_1hot_with_bias in ops
            assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

            fgraph = FunctionGraph([x, y], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 5
            # there's an extra dimshuffle in there
            # but I can't think of a good rule to get rid of it
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_legacy in ops
            assert softmax_grad_legacy not in ops

        mean_bias_expressions = [
            mean(-log(softmax(x + b)[at.arange(y.shape[0]), y])),
            -mean(log(softmax(b + x)[at.arange(y.shape[0]), y])),
            -mean(log(softmax(x + b))[at.arange(y.shape[0]), y]),
            mean(-log(softmax(b + x))[at.arange(y.shape[0]), y]),
        ]

        for expr in mean_bias_expressions:

            fgraph = FunctionGraph([x, b, y], [expr])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 4
            assert crossentropy_softmax_argmax_1hot_with_bias in ops
            assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

            fgraph = FunctionGraph([x, b, y], [grad(expr, x)])
            optdb.query(OPT_FAST_RUN).optimize(fgraph)

            ops = [node.op for node in fgraph.toposort()]
            assert len(ops) == 5
            assert crossentropy_softmax_1hot_with_bias_dx in ops
            assert softmax_with_bias in ops
            assert softmax_grad_legacy not in ops
Code Example #28
def test_batch_normalization_train_broadcast():
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (tensor5, tensor4, tensor3, matrix, vector):
            x = vartype("x")
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # convert axes to explicit list
            if axes == "per-activation":
                axes2 = (0, )
            elif axes == "spatial":
                axes2 = (0, ) + tuple(range(2, ndim))
            else:
                axes2 = axes

            # compute axes for parameter tensors
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ["x"] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i

            # construct non-broadcasted parameter variables
            param_type = TensorType(x.dtype, (False, ) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (param_type(n)
                                                      for n in ("scale",
                                                                "bias",
                                                                "running_mean",
                                                                "running_var"))

            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)

            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = batchnorm.batch_normalization_train(
                x,
                scale,
                bias,
                axes,
                eps,
                running_average_factor,
                running_mean,
                running_var,
            )
            # batch_normalization_train with broadcasted variables
            train_bc = batchnorm.batch_normalization_train(
                x,
                scale_bc,
                bias_bc,
                axes,
                eps,
                running_average_factor,
                running_mean_bc,
                running_var_bc,
            )
            train_bc = tuple([train_bc[0]] +
                             [r.dimshuffle(non_bc_axes)
                              for r in train_bc[1:]]  # out
                             )

            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = batchnorm.batch_normalization_test(
                x, scale, bias, running_mean, running_var, axes, eps)
            # batch_normalization_test with broadcasted variables
            test_bc = batchnorm.batch_normalization_test(
                x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes,
                eps)

            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc, )
            results_bc = train_bc + (test_bc, )
            results = [
                abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)
            ]

            # compile to compute all differences
            f = aesara.function([x, scale, bias, running_mean, running_var],
                                at_sum(sum(results)))

            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if aesara.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, aesara.compile.DeepCopyOp)
            inputs = [
                np.asarray(np.random.random(((4, ) * n)), x.dtype) for n in [
                    x.ndim,
                    scale.ndim,
                    bias.ndim,
                    running_mean.ndim,
                    running_var.ndim,
                ]
            ]
            assert 0.0 == f(*inputs)
Code Example #29
def test_undefined_grad():
    srng = MRG_RandomStream(seed=1234)

    # checking uniform distribution
    low = scalar()
    out = srng.uniform((), low=low)
    with pytest.raises(NullTypeGradError):
        grad(out, low)

    high = scalar()
    out = srng.uniform((), low=0, high=high)
    with pytest.raises(NullTypeGradError):
        grad(out, high)

    out = srng.uniform((), low=low, high=high)
    with pytest.raises(NullTypeGradError):
        grad(out, (low, high))

    # checking binomial distribution
    prob = scalar()
    out = srng.binomial((), p=prob)
    with pytest.raises(NullTypeGradError):
        grad(out, prob)

    # checking multinomial distribution
    prob1 = scalar()
    prob2 = scalar()
    p = [as_tensor_variable([prob1, 0.5, 0.25])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    with pytest.raises(NullTypeGradError):
        grad(at_sum(out), prob1)

    p = [as_tensor_variable([prob1, prob2])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    with pytest.raises(NullTypeGradError):
        grad(at_sum(out), (prob1, prob2))

    # checking choice
    p = [as_tensor_variable([prob1, prob2, 0.1, 0.2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], (prob1, prob2))

    p = [as_tensor_variable([prob1, prob2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], (prob1, prob2))

    p = [as_tensor_variable([prob1, 0.2, 0.3])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], prob1)

    # checking normal distribution
    avg = scalar()
    out = srng.normal((), avg=avg)
    with pytest.raises(NullTypeGradError):
        grad(out, avg)

    std = scalar()
    out = srng.normal((), avg=0, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, std)

    out = srng.normal((), avg=avg, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, (avg, std))

    # checking truncated normal distribution
    avg = scalar()
    out = srng.truncated_normal((), avg=avg)
    with pytest.raises(NullTypeGradError):
        grad(out, avg)

    std = scalar()
    out = srng.truncated_normal((), avg=0, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, std)

    out = srng.truncated_normal((), avg=avg, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, (avg, std))
Code Example #30
File: test_scan.py  Project: Sayam753/Theano-PyMC
    def test_gpu_memory_usage(self):
        # This test validates that the memory usage of the compiled aesara
        # function is reasonable when executed on the GPU. It checks for a bug
        # in which one of Scan's optimizations was not applied, causing the
        # scan node to compute large, unnecessary outputs that drove GPU
        # memory usage up to ~12G.

        # Dimensionality of input and output data (not one-hot coded)
        n_in = 100
        n_out = 100
        # Number of neurons in hidden layer
        n_hid = 4000

        # Number of minibatches
        mb_size = 2
        # Time steps in minibatch
        mb_length = 200

        # Define input variables
        xin = ftensor3(name="xin")
        yout = ftensor3(name="yout")

        # Initialize the network parameters
        U = aesara.shared(np.zeros((n_in, n_hid), dtype="float32"),
                          name="W_xin_to_l1")
        V = aesara.shared(np.zeros((n_hid, n_hid), dtype="float32"),
                          name="W_l1_to_l1")
        W = aesara.shared(np.zeros((n_hid, n_out), dtype="float32"),
                          name="W_l1_to_l2")
        nparams = [U, V, W]

        # Build the forward pass
        l1_base = dot(xin, U)

        def scan_l(baseline, last_step):
            return baseline + dot(last_step, V)

        zero_output = at.alloc(np.asarray(0.0, dtype="float32"), mb_size,
                               n_hid)

        l1_out, _ = scan(
            scan_l,
            sequences=[l1_base],
            outputs_info=[zero_output],
            mode=self.mode_with_gpu_nodebug,
        )

        l2_out = dot(l1_out, W)

        # Compute the cost and take the gradient wrt params
        cost = at_sum((l2_out - yout)**2)
        grads = aesara.grad(cost, nparams)
        updates = list(zip(nparams, (n - g for n, g in zip(nparams, grads))))

        # Compile the aesara function
        feval_backprop = aesara.function([xin, yout],
                                         cost,
                                         updates=updates,
                                         mode=self.mode_with_gpu_nodebug)

        # Validate that the PushOutScanOutput optimization has been applied
        # by checking the number of outputs of the grad Scan node in the
        # compiled function.
        nodes = feval_backprop.maker.fgraph.toposort()
        scan_nodes = [n for n in nodes if isinstance(n.op, Scan)]

        # The grad scan is always the 2nd one according to toposort. If the
        # optimization has been applied, it has 2 outputs, otherwise 3.
        grad_scan_node = scan_nodes[1]
        assert len(grad_scan_node.outputs) == 2, len(grad_scan_node.outputs)

        # Call the aesara function to ensure the absence of a memory error
        feval_backprop(
            np.zeros((mb_length, mb_size, n_in), dtype="float32"),
            np.zeros((mb_length, mb_size, n_out), dtype="float32"),
        )