Example #1
    def test_lop_override(self, cls_ofg):
        x = vector()
        y = 1.0 / (1.0 + exp(-x))

        def lop_ov(inps, outs, grads):
            (y_, ) = outs
            (dedy_, ) = grads
            return [2.0 * y_ * (1.0 - y_) * dedy_]

        y_, dedy = vector(), vector()
        op_lop_ov = cls_ofg([x, y_, dedy], [2.0 * y_ * (1.0 - y_) * dedy])

        xx = vector()
        yy1 = tt_sum(sigmoid(xx))
        gyy1 = 2.0 * grad(yy1, xx)

        for ov in [lop_ov, op_lop_ov]:
            op = cls_ofg([x], [y], lop_overrides=ov)
            yy2 = tt_sum(op(xx))
            gyy2 = grad(yy2, xx)
            fn = function([xx], [gyy1, gyy2])

            xval = np.random.rand(32).astype(config.floatX)
            y1val, y2val = fn(xval)
            assert np.allclose(y1val, y2val)
Example #2
    def infer_shape(self, fgraph, node, ins_shapes):
        i0_shapes = ins_shapes[0]
        repeats = node.inputs[1]
        out_shape = list(i0_shapes)

        # uint64 shapes are not supported, so sum unsigned repeats as int64.
        dtype = None
        if repeats.dtype in ["uint8", "uint16", "uint32"]:
            dtype = "int64"
        if self.axis is None:
            if repeats.ndim == 0:
                if len(i0_shapes) == 0:
                    out_shape = [repeats]
                else:
                    res = 1
                    for d in i0_shapes:
                        res = res * d
                    out_shape = (res * repeats, )
            else:
                out_shape = [tt_sum(repeats, dtype=dtype)]
        else:
            if repeats.ndim == 0:
                out_shape[self.axis] = out_shape[self.axis] * repeats
            else:
                out_shape[self.axis] = tt_sum(repeats, dtype=dtype)
        return [out_shape]
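
For intuition, the shape rules above mirror `np.repeat`. A minimal NumPy-only sketch (toy array of my own choosing, not part of the original Op) checking the same cases:

import numpy as np

a = np.arange(6).reshape(2, 3)

# axis=None, scalar repeats: output length is prod(input shape) * repeats
assert np.repeat(a, 2).shape == (2 * 3 * 2,)

# axis=None, vector repeats: output length is sum(repeats)
assert np.repeat(a.ravel(), np.arange(6)).shape == (np.arange(6).sum(),)

# axis given, scalar repeats: that axis is multiplied by repeats
assert np.repeat(a, 2, axis=1).shape == (2, 3 * 2)

# axis given, vector repeats: that axis becomes sum(repeats)
assert np.repeat(a, [1, 2, 3], axis=1).shape == (2, 1 + 2 + 3)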
Example #3
def test_pickle_unpickle_without_reoptimization():
    mode = config.mode
    if mode in ["DEBUG_MODE", "DebugMode"]:
        mode = "FAST_RUN"
    x1 = fmatrix("x1")
    x2 = fmatrix("x2")
    x3 = aesara.shared(np.ones((10, 10), dtype=floatX))
    x4 = aesara.shared(np.ones((10, 10), dtype=floatX))
    y = tt_sum(tt_sum(tt_sum(x1**2 + x2) + x3) + x4)

    updates = OrderedDict()
    updates[x3] = x3 + 1
    updates[x4] = x4 + 1
    f = aesara.function([x1, x2], y, updates=updates, mode=mode)

    # now pickle the compiled aesara fn
    string_pkl = pickle.dumps(f, -1)

    # input values used to evaluate f and its unpickled copy
    in1 = np.ones((10, 10), dtype=floatX)
    in2 = np.ones((10, 10), dtype=floatX)

    # test unpickling without reoptimization
    default = config.reoptimize_unpickled_function
    try:
        # the default is True
        config.reoptimize_unpickled_function = False
        f_ = pickle.loads(string_pkl)
        assert f(in1, in2) == f_(in1, in2)
    finally:
        config.reoptimize_unpickled_function = default
Example #4
    def test_grad(self):
        x = vector("x")
        a = np.random.random(50).astype(config.floatX)

        aesara.function([x], grad(tt_sum(diff(x)), x))
        utt.verify_grad(self.op, [a])

        for k in range(TestDiffOp.nb):
            aesara.function([x], grad(tt_sum(diff(x, n=k)), x))
            utt.verify_grad(DiffOp(n=k), [a], eps=7e-3)
Example #5
    def test_grad_grad(self, cls_ofg):
        x, y, z = matrices("xyz")
        e = x + y * z
        op = cls_ofg([x, y, z], [e])
        f = op(x, y, z)
        f = f - grad(tt_sum(f), y)
        f = f - grad(tt_sum(f), y)
        fn = function([x, y, z], f)
        xv = np.ones((2, 2), dtype=config.floatX)
        yv = np.ones((2, 2), dtype=config.floatX) * 3
        zv = np.ones((2, 2), dtype=config.floatX) * 5
        # f is x + y*z - 2*z, so with x=1, y=3, z=5 every entry is 1 + 15 - 10 = 6
        assert np.allclose(6.0, fn(xv, yv, zv))
Example #6
    def __call__(self, v, cost, parameters, damp):
        # Compute the Gauss-Newton matrix right-multiplied by `v`:
        # G v = J^T H J v, via an R-operator (J v), a Hessian-vector product
        # of the cost wrt `self._s` (H J v), and an L-operator (J^T H J v).
        Jv = Rop(self._s, parameters, v)
        HJv = grad(tt_sum(grad(cost, self._s) * Jv),
                   self._s,
                   consider_constant=[Jv])
        JHJv = grad(tt_sum(HJv * self._s),
                    parameters,
                    consider_constant=[HJv, Jv])

        # apply Tikhonov damping
        JHJv = [JHJvi + damp * vi for JHJvi, vi in zip(JHJv, v)]
        return JHJv
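
As a sanity check on the algebra, here is a minimal NumPy sketch of the same product G v = J^T H J v with Tikhonov damping; the explicit toy matrices J and H and their sizes are my own assumptions (the code above never forms them):

import numpy as np

rng = np.random.default_rng(0)
n_params, n_out = 4, 3
J = rng.standard_normal((n_out, n_params))  # Jacobian of the network output wrt the parameters
H = np.diag(rng.random(n_out) + 1.0)        # Hessian of the cost wrt the network output
v = rng.standard_normal(n_params)
damp = 0.1

Jv = J @ v            # R-operator step: J v
HJv = H @ Jv          # Hessian-vector product: H (J v)
JHJv = J.T @ HJv      # L-operator step: J^T (H J v)
Gv = JHJv + damp * v  # Tikhonov damping

assert np.allclose(Gv, (J.T @ H @ J + damp * np.eye(n_params)) @ v)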
Example #7
    def test_swap_SharedVariable_with_given(self):
        # A special testcase for logistic_sgd.py in Deep Learning Tutorial
        # This test asserts that shared variables used in different functions share the same storage.

        train_x = aesara.shared(value=np.random.rand(10, 10).astype(config.floatX))
        test_x = aesara.shared(value=np.random.rand(10, 10).astype(config.floatX))

        train_y = aesara.shared(value=np.random.rand(10, 1).astype(config.floatX))
        test_y = aesara.shared(value=np.random.rand(10, 1).astype(config.floatX))

        i = iscalar("index")
        x = vector("x")
        y = vector("y")
        # this formula is meaningless; it exists only for the test
        out = (tt_sum(x) - y) ** 2
        train = aesara.function(
            [i],
            out,
            givens={x: train_x[i], y: train_y[i]},
            updates={train_x: train_x + 0.1},
        )

        test_def = aesara.function([i], out, givens={x: test_x[i], y: test_y[i]})
        test_cpy = train.copy(
            swap={train_x: test_x, train_y: test_y}, delete_updates=True
        )

        for in1, in2 in zip(test_def.maker.inputs, test_cpy.maker.inputs):
            assert in1.value is in2.value
Example #8
def test_hessian():
    x = vector()
    y = tt_sum(x ** 2)
    Hx = hessian(y, x)
    f = aesara.function([x], Hx)
    vx = np.arange(10).astype(aesara.config.floatX)
    assert np.allclose(f(vx), np.eye(10) * 2)
Example #9
    def test_grad(self):
        x = vector()
        np_x = np.random.randn(7).astype(aesara.config.floatX)

        # offset = 0 case:
        mtx_x = GpuAllocDiag()(x)
        sum_mtx_x = tt_sum(mtx_x)
        grad_x = aesara.grad(sum_mtx_x, x)
        grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)

        fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
        fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)

        computed_grad_x = fn_grad_x(np_x)
        computed_grad_mtx_x = fn_grad_mtx_x(np_x)
        true_grad_x = np.diagonal(computed_grad_mtx_x, 0)
        assert np.allclose(computed_grad_x, true_grad_x)

        # offset > 0 case:
        mtx_x = GpuAllocDiag(2)(x)
        sum_mtx_x = tt_sum(mtx_x)
        grad_x = aesara.grad(sum_mtx_x, x)
        grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)

        fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
        fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)

        computed_grad_x = fn_grad_x(np_x)
        computed_grad_mtx_x = fn_grad_mtx_x(np_x)
        true_grad_x = np.diagonal(computed_grad_mtx_x, 2)
        assert np.allclose(computed_grad_x, true_grad_x)

        # offset < 0 case:
        mtx_x = GpuAllocDiag(-3)(x)
        sum_mtx_x = tt_sum(mtx_x)
        grad_x = aesara.grad(sum_mtx_x, x)
        grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)

        fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
        fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)

        computed_grad_x = fn_grad_x(np_x)
        computed_grad_mtx_x = fn_grad_mtx_x(np_x)
        true_grad_x = np.diagonal(computed_grad_mtx_x, -3)
        assert np.allclose(computed_grad_x, true_grad_x)
Example #10
    def test_no_shared_as_input(self):
        # Test that shared variables cannot be used as function inputs.
        w_init = np.random.rand(2, 2)
        w = shared(w_init.copy(), "w")
        with pytest.raises(
            TypeError,
            match=r"^Cannot use a shared variable \(w\) as explicit input"
        ):
            pfunc([w], tt_sum(w * w))
Example #11
    def test_shared_grad(self, cls_ofg):
        x, y, z = matrices("xyz")
        s = shared(np.random.rand(2, 2).astype(config.floatX))
        e = x + y * z + s
        op = cls_ofg([x, y, z], [e])
        f = op(x, y, z)
        f = f - grad(tt_sum(f), y)
        fn = function([x, y, z], f)
        xv = np.ones((2, 2), dtype=config.floatX)
        yv = np.ones((2, 2), dtype=config.floatX) * 3
        zv = np.ones((2, 2), dtype=config.floatX) * 5
        assert np.allclose(11.0 + s.get_value(), fn(xv, yv, zv))

        # this time, take the gradient with respect to the shared variable
        f = op(x, y, z)
        f = f - grad(tt_sum(f), s)
        fn = function([x, y, z], f)
        assert np.allclose(15.0 + s.get_value(), fn(xv, yv, zv))
Example #12
def test_FunctionMaker_cache_optimizations():

    opt_db_file = os.path.join(config.compiledir, "optimized_graphs.pkl")
    if os.path.exists(opt_db_file):
        os.remove(opt_db_file)

    floatX = "float32"
    mode = config.mode
    if mode in ["DEBUG_MODE", "DebugMode"]:
        mode = "FAST_RUN"

    graph_db_file = os.path.join(config.compiledir, "optimized_graphs.pkl")
    assert not os.path.exists(graph_db_file)

    with config.change_flags(cache_optimizations=True):
        a = fmatrix("a")
        b = fmatrix("b")
        c = aesara.shared(np.ones((10, 10), dtype=floatX))
        d = aesara.shared(np.ones((10, 10), dtype=floatX))
        e = tt_sum(tt_sum(tt_sum(a ** 2 + b) + c) + d)
        f1 = aesara.function([a, b], e, mode=mode)

        # FIXME: We could test this much more thoroughly.
        assert os.path.exists(graph_db_file)

        m = fmatrix("x1")
        n = fmatrix("x2")
        p = aesara.shared(np.ones((10, 10), dtype=floatX))
        q = aesara.shared(np.ones((10, 10), dtype=floatX))
        j = tt_sum(tt_sum(tt_sum(m ** 2 + n) + p) + q)
        f2 = aesara.function([m, n], j, mode=mode)

        in1 = np.ones((10, 10), dtype=floatX)
        in2 = np.ones((10, 10), dtype=floatX)
        assert f1(in1, in2) == f2(in1, in2)
Example #13
def test_jax_CAReduce():
    a_tt = vector("a")
    a_tt.tag.test_value = np.r_[1, 2, 3].astype(config.floatX)

    x = tt_sum(a_tt, axis=None)
    x_fg = FunctionGraph([a_tt], [x])

    compare_jax_and_py(x_fg, [np.r_[1, 2, 3].astype(config.floatX)])

    a_tt = matrix("a")
    a_tt.tag.test_value = np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)

    x = tt_sum(a_tt, axis=0)
    x_fg = FunctionGraph([a_tt], [x])

    compare_jax_and_py(x_fg,
                       [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])

    x = tt_sum(a_tt, axis=1)
    x_fg = FunctionGraph([a_tt], [x])

    compare_jax_and_py(x_fg,
                       [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])

    a_tt = matrix("a")
    a_tt.tag.test_value = np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)

    x = prod(a_tt, axis=0)
    x_fg = FunctionGraph([a_tt], [x])

    compare_jax_and_py(x_fg,
                       [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])

    x = tt_all(a_tt)
    x_fg = FunctionGraph([a_tt], [x])

    compare_jax_and_py(x_fg,
                       [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])
Example #14
def test_gradient_scan():
    # Test for a crash when using MRG inside scan and taking the gradient
    # See https://groups.google.com/d/msg/theano-dev/UbcYyU5m-M8/UO9UgXqnQP0J
    aesara_rng = MRG_RandomStream(10)
    w = shared(np.ones(1, dtype="float32"))

    def one_step(x):
        return x + aesara_rng.uniform((1,), dtype="float32") * w

    x = vector(dtype="float32")
    values, updates = scan(one_step, outputs_info=x, n_steps=10)
    gw = grad(tt_sum(values[-1]), w)
    f = function([x], gw)
    f(np.arange(1, dtype="float32"))
Example #15
    def test_default_container(self):
        # Ensure it is possible to (implicitly) use a shared variable in a
        # function, as a 'state' that can be updated at will.

        rng = np.random.RandomState(1827)
        w_init = rng.rand(5)
        w = shared(w_init.copy(), "w")
        reg = tt_sum(w * w)
        f = pfunc([], reg)

        assert f() == np.sum(w_init * w_init)
        # Change the value of w and ensure the output changes accordingly.
        w.set_value(w.get_value(borrow=True) + 1.0, borrow=True)
        assert f() == np.sum((w_init + 1)**2)
Example #16
def local_abstract_batch_norm_train_grad(fgraph, node):
    if not isinstance(node.op, AbstractBatchNormTrainGrad):
        return None

    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if (
        not isinstance(x.type, TensorType)
        or not isinstance(dy.type, TensorType)
        or not isinstance(scale.type, TensorType)
        or not isinstance(x_mean.type, TensorType)
        or not isinstance(x_invstd.type, TensorType)
        or not isinstance(epsilon.type, TensorType)
    ):
        return None

    x_diff = x - x_mean
    mean_dy_x_diff = mean(dy * x_diff, axis=axes, keepdims=True)
    c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd ** 3))

    g_wrt_inputs = scale * (c - mean(c, axis=axes, keepdims=True))
    g_wrt_scale = tt_sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
    g_wrt_bias = tt_sum(dy, axis=axes, keepdims=True)
    results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]

    results = [
        aet.patternbroadcast(r, r_orig.broadcastable)
        for (r, r_orig) in zip(results, node.outputs)
    ]

    for var in aesara.graph.basic.vars_between(node.inputs, results):
        if var not in node.inputs:
            copy_stack_trace(node.outputs[0], var)
    return results
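
The closed-form ``g_wrt_scale`` and ``g_wrt_bias`` above can be spot-checked numerically. A minimal plain-NumPy sketch; the toy shapes, the ``bn`` helper, and the central-difference check are my own assumptions, not part of the rewrite:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((4, 3))
scale = rng.standard_normal(3)
bias = rng.standard_normal(3)
dy = rng.standard_normal((4, 3))  # upstream gradient
eps = 1e-5

def bn(x, scale, bias):
    mean = x.mean(axis=0, keepdims=True)
    invstd = 1.0 / np.sqrt(x.var(axis=0, keepdims=True) + eps)
    return scale * (x - mean) * invstd + bias

# closed-form gradients, matching g_wrt_scale / g_wrt_bias above
x_diff = x - x.mean(axis=0, keepdims=True)
x_invstd = 1.0 / np.sqrt(x.var(axis=0, keepdims=True) + eps)
g_scale = (dy * x_invstd * x_diff).sum(axis=0)
g_bias = dy.sum(axis=0)

# central-difference checks for the first entry of each gradient
h = 1e-6
e = np.zeros(3)
e[0] = h
num_scale = ((bn(x, scale + e, bias) - bn(x, scale - e, bias)) / (2 * h) * dy).sum()
num_bias = ((bn(x, scale, bias + e) - bn(x, scale, bias - e)) / (2 * h) * dy).sum()
assert np.isclose(num_scale, g_scale[0], rtol=1e-4, atol=1e-6)
assert np.isclose(num_bias, g_bias[0], rtol=1e-4, atol=1e-6)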
Example #17
    def test_DownsampleFactorMax_hessian(self):
        # Example provided by Frans Cronje, see
        # https://groups.google.com/d/msg/theano-users/qpqUy_3glhw/JMwIvlN5wX4J
        x_vec = vector("x")
        z = aet.dot(x_vec.dimshuffle(0, "x"), x_vec.dimshuffle("x", 0))
        y = pool_2d(input=z, ws=(2, 2), ignore_border=True)
        C = aet.exp(tt_sum(y))

        grad_hess = aesara.gradient.hessian(cost=C, wrt=x_vec)
        fn_hess = function(inputs=[x_vec], outputs=grad_hess)

        # The value has been manually computed from the theoretical gradient,
        # and confirmed by the implementation.

        assert np.allclose(fn_hess([1, 2]), [[0.0, 0.0], [0.0, 982.7667]])
Example #18
def test_undefined_grad_opt():
    # Make sure that undefined gradients are removed from the optimized graph.
    random = MRG_RandomStream(np.random.randint(1, 2147462579))
    pvals = shared(np.random.rand(10, 20).astype(config.floatX))
    pvals = pvals / pvals.sum(axis=1)
    pvals = zero_grad(pvals)
    samples = random.multinomial(pvals=pvals, n=1)
    samples = cast(samples, pvals.dtype)
    samples = zero_grad(samples)
    cost = tt_sum(samples + pvals)
    grad_out = grad(cost, samples)
    f = function([], grad_out)
    assert not any(
        isinstance(node.op, UndefinedGrad) for node in f.maker.fgraph.apply_nodes
    )
Example #19
    def __init__(
        self,
        input=None,
        target=None,
        n_input=1,
        n_hidden=1,
        n_output=1,
        lr=1e-3,
        **kw,
    ):
        super().__init__(**kw)

        if input is None:
            input = dvector("input")
        if target is None:
            target = dvector("target")

        self.input = input
        self.target = target
        self.lr = shared(lr, "learning_rate")
        self.w1 = shared(np.zeros((n_hidden, n_input)), "w1")
        self.w2 = shared(np.zeros((n_output, n_hidden)), "w2")
        # print self.lr.type

        self.hidden = sigmoid(dot(self.w1, self.input))
        self.output = dot(self.w2, self.hidden)
        self.cost = tt_sum((self.output - self.target) ** 2)

        self.sgd_updates = {
            self.w1: self.w1 - self.lr * grad(self.cost, self.w1),
            self.w2: self.w2 - self.lr * grad(self.cost, self.w2),
        }

        self.sgd_step = pfunc(
            params=[self.input, self.target],
            outputs=[self.output, self.cost],
            updates=self.sgd_updates,
        )

        self.compute_output = pfunc([self.input], self.output)

        self.output_from_hidden = pfunc([self.hidden], self.output)
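
For context, a minimal NumPy sketch of the same SGD update rule w := w - lr * dC/dw used in ``sgd_updates`` above, on a single linear layer with squared-error cost; the toy sizes, seed, and names are my own assumptions, not taken from the class:

import numpy as np

rng = np.random.default_rng(0)
w = rng.standard_normal((1, 3))  # weights, analogous to w2 above
x = rng.standard_normal(3)       # input vector
t = np.array([0.5])              # target
lr = 1e-2

def cost(w):
    return np.sum((w @ x - t) ** 2)

# gradient of the squared-error cost wrt w: 2 * (w @ x - t) outer x
grad_w = 2.0 * np.outer(w @ x - t, x)
w_new = w - lr * grad_w

# with a small learning rate, one step strictly reduces the cost
assert cost(w_new) < cost(w)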
Example #20
def test_broadcastable():
    R = MRG_RandomStream(234)
    x = matrix()
    size1 = (10, 1)
    size2 = (x.shape[0], 1)
    pvals_1 = np.random.uniform(0, 1, size=size1)
    pvals_1 = pvals_1 / sum(pvals_1)
    pvals_2 = R.uniform(size=size2)
    pvals_2 = pvals_2 / tt_sum(pvals_2)

    for distribution in [
        R.uniform,
        R.normal,
        R.truncated_normal,
        R.binomial,
        R.multinomial,
        R.multinomial_wo_replacement,
    ]:
        # multinomial and multinomial_wo_replacement do not support the "size"
        # argument; their sizes are implicitly defined by the "pvals" argument.
        if distribution in [R.multinomial, R.multinomial_wo_replacement]:
            # check when all dimensions are constant
            uu = distribution(pvals=pvals_1)
            assert uu.broadcastable == (False, True)

            # check when some dimensions are aesara variables
            uu = distribution(pvals=pvals_2)
            assert uu.broadcastable == (False, True)
        else:
            # check when all dimensions are constant
            uu = distribution(size=size1)
            assert uu.broadcastable == (False, True)

            # check when some dimensions are aesara variables
            uu = distribution(size=size2)
            assert uu.broadcastable == (False, True)
Example #21
    def grad(self, inp, grads):
        x, dy, scale, x_mean, x_invstd, epsilon = inp
        ddinputs, ddscale, ddbias = grads

        x_diff = x - x_mean
        mean_dy_x_diff = mean(dy * x_diff, axis=self.axes, keepdims=True)

        # compute gradients given each of the output gradients
        g_wrt_x = 0
        g_wrt_dy = 0
        g_wrt_scale = 0
        g_wrt_x_mean = 0
        g_wrt_x_invstd = 0

        if not isinstance(ddinputs.type, aesara.gradient.DisconnectedType):
            ccc = scale * (ddinputs - mean(ddinputs, axis=self.axes, keepdims=True))
            ddd = (x_invstd ** 3) * (
                ccc * mean(dy * x_diff, axis=self.axes, keepdims=True)
                + dy * mean(ccc * x_diff, axis=self.axes, keepdims=True)
            )

            g_wrt_x = g_wrt_x - ddd
            g_wrt_dy = g_wrt_dy + (
                (ccc * x_invstd)
                - (
                    (x_invstd ** 3)
                    * x_diff
                    * mean(ccc * x_diff, axis=self.axes, keepdims=True)
                )
            )

            eee = (dy * x_invstd) - ((x_invstd ** 3) * x_diff * mean_dy_x_diff)
            g_wrt_scale = g_wrt_scale + tt_sum(
                ddinputs * (eee - mean(eee, axis=self.axes, keepdims=True)),
                axis=self.axes,
                keepdims=True,
            )

            g_wrt_x_mean = g_wrt_x_mean + tt_sum(ddd, axis=self.axes, keepdims=True)
            g_wrt_x_invstd = g_wrt_x_invstd + tt_sum(
                ccc * (dy - 3 * (x_invstd ** 2) * x_diff * mean_dy_x_diff),
                axis=self.axes,
                keepdims=True,
            )

        if not isinstance(ddscale.type, aesara.gradient.DisconnectedType):
            g_wrt_x = g_wrt_x + (x_invstd * ddscale * dy)
            g_wrt_dy = g_wrt_dy + (x_invstd * ddscale * x_diff)
            g_wrt_x_mean = g_wrt_x_mean - (
                x_invstd * ddscale * tt_sum(dy, axis=self.axes, keepdims=True)
            )
            g_wrt_x_invstd = g_wrt_x_invstd + (
                ddscale * tt_sum(dy * x_diff, axis=self.axes, keepdims=True)
            )

        if not isinstance(ddbias.type, aesara.gradient.DisconnectedType):
            g_wrt_dy = g_wrt_dy + aet.fill(dy, ddbias)

        # depending on which output gradients are given,
        # some inputs should be disconnected
        results = [
            g_wrt_x,
            g_wrt_dy,
            g_wrt_scale,
            g_wrt_x_mean,
            g_wrt_x_invstd,
            aesara.gradient.DisconnectedType()(),
        ]
        return [
            aesara.gradient.DisconnectedType()() if (type(r) == int and r == 0) else r
            for r in results
        ]
Example #22
    def test_gpu_memory_usage(self):
        # This test validates that the memory usage of the defined aesara
        # function is reasonable when executed on the GPU. It checks for
        # a bug in which one of scan's optimizations was not applied,
        # making the scan node compute large, unnecessary outputs that
        # brought memory usage on the GPU to ~12G.

        # Dimensionality of input and output data (not one-hot coded)
        n_in = 100
        n_out = 100
        # Number of neurons in hidden layer
        n_hid = 4000

        # Number of minibatches
        mb_size = 2
        # Time steps in minibatch
        mb_length = 200

        # Define input variables
        xin = ftensor3(name="xin")
        yout = ftensor3(name="yout")

        # Initialize the network parameters
        U = aesara.shared(np.zeros((n_in, n_hid), dtype="float32"),
                          name="W_xin_to_l1")
        V = aesara.shared(np.zeros((n_hid, n_hid), dtype="float32"),
                          name="W_l1_to_l1")
        W = aesara.shared(np.zeros((n_hid, n_out), dtype="float32"),
                          name="W_l1_to_l2")
        nparams = [U, V, W]

        # Build the forward pass
        l1_base = dot(xin, U)

        def scan_l(baseline, last_step):
            return baseline + dot(last_step, V)

        zero_output = aet.alloc(np.asarray(0.0, dtype="float32"), mb_size,
                                n_hid)

        l1_out, _ = scan(
            scan_l,
            sequences=[l1_base],
            outputs_info=[zero_output],
            mode=self.mode_with_gpu_nodebug,
        )

        l2_out = dot(l1_out, W)

        # Compute the cost and take the gradient wrt params
        cost = tt_sum((l2_out - yout)**2)
        grads = aesara.grad(cost, nparams)
        updates = list(zip(nparams, (n - g for n, g in zip(nparams, grads))))

        # Compile the aesara function
        feval_backprop = aesara.function([xin, yout],
                                         cost,
                                         updates=updates,
                                         mode=self.mode_with_gpu_nodebug)

        # Validate that the PushOutScanOutput optimization has been applied
        # by checking the number of outputs of the grad Scan node in the
        # compiled function.
        nodes = feval_backprop.maker.fgraph.toposort()
        scan_nodes = [n for n in nodes if isinstance(n.op, Scan)]

        # The grad scan is always the 2nd one according to toposort. If the
        # optimization has been applied, it has 2 outputs, otherwise 3.
        grad_scan_node = scan_nodes[1]
        assert len(grad_scan_node.outputs) == 2, len(grad_scan_node.outputs)

        # Call the aesara function to ensure the absence of a memory error
        feval_backprop(
            np.zeros((mb_length, mb_size, n_in), dtype="float32"),
            np.zeros((mb_length, mb_size, n_out), dtype="float32"),
        )
Example #23
def test_batch_normalization_train_broadcast():
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (tensor5, tensor4, tensor3, matrix, vector):
            x = vartype("x")
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # convert axes to explicit list
            if axes == "per-activation":
                axes2 = (0, )
            elif axes == "spatial":
                axes2 = (0, ) + tuple(range(2, ndim))
            else:
                axes2 = axes

            # compute axes for parameter tensors
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ["x"] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i

            # construct non-broadcasted parameter variables
            param_type = TensorType(x.dtype, (False, ) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (param_type(n)
                                                      for n in ("scale",
                                                                "bias",
                                                                "running_mean",
                                                                "running_var"))

            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)

            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = batchnorm.batch_normalization_train(
                x,
                scale,
                bias,
                axes,
                eps,
                running_average_factor,
                running_mean,
                running_var,
            )
            # batch_normalization_train with broadcasted variables
            train_bc = batchnorm.batch_normalization_train(
                x,
                scale_bc,
                bias_bc,
                axes,
                eps,
                running_average_factor,
                running_mean_bc,
                running_var_bc,
            )
            train_bc = tuple(
                [train_bc[0]]  # out
                + [r.dimshuffle(non_bc_axes) for r in train_bc[1:]]
            )

            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = batchnorm.batch_normalization_test(
                x, scale, bias, running_mean, running_var, axes, eps)
            # batch_normalization_test with broadcasted variables
            test_bc = batchnorm.batch_normalization_test(
                x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes,
                eps)

            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc, )
            results_bc = train_bc + (test_bc, )
            results = [
                abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)
            ]

            # compile to compute all differences
            f = aesara.function([x, scale, bias, running_mean, running_var],
                                tt_sum(sum(results)))

            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if aesara.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, aesara.compile.DeepCopyOp)
            inputs = [
                np.asarray(np.random.rand(*((4, ) * n)), x.dtype) for n in [
                    x.ndim,
                    scale.ndim,
                    bias.ndim,
                    running_mean.ndim,
                    running_var.ndim,
                ]
            ]
            assert 0.0 == f(*inputs)
Example #24
    def test_grad_override(self, cls_ofg):
        x, y = vectors("xy")

        def go(inps, gs):
            x, y = inps
            (g, ) = gs
            return [g * y * 2, g * x * 1.5]

        dedz = vector("dedz")
        op_mul_grad = cls_ofg([x, y, dedz], go([x, y], [dedz]))

        op_mul = cls_ofg([x, y], [x * y], grad_overrides=go)
        op_mul2 = cls_ofg([x, y], [x * y], grad_overrides=op_mul_grad)

        # single override case (function or OfG instance)
        xx, yy = vector("xx"), vector("yy")
        for op in [op_mul, op_mul2]:
            zz = tt_sum(op(xx, yy))
            dx, dy = grad(zz, [xx, yy])
            fn = function([xx, yy], [dx, dy])
            xv = np.random.rand(16).astype(config.floatX)
            yv = np.random.rand(16).astype(config.floatX)
            dxv, dyv = fn(xv, yv)
            assert np.allclose(yv * 2, dxv)
            assert np.allclose(xv * 1.5, dyv)

        # list override case
        def go1(inps, gs):
            x, w, b = inps
            g = gs[0]
            return g * w * 2

        def go2(inps, gs):
            x, w, b = inps
            g = gs[0]
            return g * x * 1.5

        w, b = vectors("wb")
        # we make the 3rd gradient default (no override)
        op_linear = cls_ofg([x, w, b], [x * w + b],
                            grad_overrides=[go1, go2, "default"])
        xx, ww, bb = vector("xx"), vector("ww"), vector("bb")
        zz = tt_sum(op_linear(xx, ww, bb))
        dx, dw, db = grad(zz, [xx, ww, bb])
        fn = function([xx, ww, bb], [dx, dw, db])
        xv = np.random.rand(16).astype(config.floatX)
        wv = np.random.rand(16).astype(config.floatX)
        bv = np.random.rand(16).astype(config.floatX)
        dxv, dwv, dbv = fn(xv, wv, bv)
        assert np.allclose(wv * 2, dxv)
        assert np.allclose(xv * 1.5, dwv)
        assert np.allclose(np.ones(16, dtype=config.floatX), dbv)

        # NullType and DisconnectedType
        op_linear2 = cls_ofg(
            [x, w, b],
            [x * w + b],
            grad_overrides=[go1, NullType()(),
                            DisconnectedType()()],
        )
        zz2 = tt_sum(op_linear2(xx, ww, bb))
        dx2, dw2, db2 = grad(
            zz2,
            [xx, ww, bb],
            return_disconnected="Disconnected",
            disconnected_inputs="ignore",
            null_gradients="return",
        )
        assert isinstance(dx2.type, TensorType)
        assert dx2.ndim == 1
        assert isinstance(dw2.type, NullType)
        assert isinstance(db2.type, DisconnectedType)
Example #25
def local_subtensor_rv_lift(fgraph, node):
    """Lift ``*Subtensor`` `Op`s up to a `RandomVariable`'s parameters.

    In a fashion similar to `local_dimshuffle_rv_lift`, the indexed dimensions
    need to be separated into distinct replication-space and (independent)
    parameter-space ``*Subtensor``s.

    The replication-space ``*Subtensor`` can be used to determine a
    sub/super-set of the replication-space and, thus, a "smaller"/"larger"
    ``size`` tuple.  The parameter-space ``*Subtensor`` is simply lifted and
    applied to the `RandomVariable`'s distribution parameters.

    Consider the following example graph:
    ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]``.  The
    ``*Subtensor`` `Op` requests indices ``idx1``, ``idx2``, and ``idx3``,
    which correspond to all three ``size`` dimensions.  Now, depending on the
    broadcasted dimensions of ``mu`` and ``std``, this ``*Subtensor`` `Op`
    could be reducing the ``size`` parameter and/or subsetting the independent
    ``mu`` and ``std`` parameters.  Only once the dimensions are properly
    separated into the two replication/parameter subspaces can we determine how
    the ``*Subtensor`` indices are distributed.
    For instance, ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]``
    could become ``normal(mu[idx1], std[idx2], size=np.shape(idx1) + np.shape(idx2) + np.shape(idx3))``
    if ``mu.shape == std.shape == ()``.

    ``normal`` is a rather simple case, because it's univariate.  Multivariate
    cases require a mapping between the parameter space and the image of the
    random variable.  This may not always be possible, but for many common
    distributions it is.  For example, the dimensions of the multivariate
    normal's image can be mapped directly to each dimension of its parameters.
    We use these mappings to change a graph like ``multivariate_normal(mu, Sigma)[idx1]``
    into ``multivariate_normal(mu[idx1], Sigma[idx1, idx1])``.  Notice how the
    single index is applied to the first dimension of ``mu`` but to both
    dimensions of ``Sigma``.

    Also, there's the important matter of "advanced" indexing, which may not
    only subset an array, but also broadcast it to a larger size.

    """

    st_op = node.op

    if not isinstance(st_op, (AdvancedSubtensor, AdvancedSubtensor1, Subtensor)):
        return False

    base_rv = node.inputs[0]

    rv_node = base_rv.owner
    if not (rv_node and isinstance(rv_node.op, RandomVariable)):
        return False

    # If no one else is using the underlying `RandomVariable`, then we can
    # do this; otherwise, the graph would be internally inconsistent.
    if not all(
        (n == node or isinstance(n.op, Shape)) for n, i in fgraph.clients[base_rv]
    ):
        return False

    rv_op = rv_node.op
    rng, size, dtype, *dist_params = rv_node.inputs

    # TODO: Remove this once the multi-dimensional changes described below are
    # in place.
    if rv_op.ndim_supp > 0:
        return False

    rv_op = base_rv.owner.op
    rng, size, dtype, *dist_params = base_rv.owner.inputs

    idx_list = getattr(st_op, "idx_list", None)
    if idx_list:
        cdata = get_idx_list(node.inputs, idx_list)
    else:
        cdata = node.inputs[1:]

    st_indices, st_is_bool = zip(
        *tuple(
            (as_index_variable(i), getattr(i, "dtype", None) == "bool") for i in cdata
        )
    )

    # We need to separate dimensions into replications and independents
    num_ind_dims = None
    if len(dist_params) == 1:
        num_ind_dims = dist_params[0].ndim
    else:
        # When there is more than one distribution parameter, assume that all
        # of them will broadcast to the maximum number of dimensions
        num_ind_dims = max(d.ndim for d in dist_params)

    reps_ind_split_idx = base_rv.ndim - (num_ind_dims + rv_op.ndim_supp)

    if len(st_indices) > reps_ind_split_idx:
        # These are the indices that need to be applied to the parameters
        ind_indices = tuple(st_indices[reps_ind_split_idx:])

        # We need to broadcast the parameters before applying the `*Subtensor*`
        # with these indices, because the indices could be referencing broadcast
        # dimensions that don't exist (yet)
        bcast_dist_params = broadcast_params(dist_params, rv_op.ndims_params)

        # TODO: For multidimensional distributions, we need a map that tells us
        # which dimensions of the parameters need to be indexed.
        #
        # For example, `multivariate_normal` would have the following:
        # `RandomVariable.param_to_image_dims = ((0,), (0, 1))`
        #
        # I.e. the first parameter's (i.e. mean's) first dimension maps directly to
        # the dimension of the RV's image, and its second parameter's
        # (i.e. covariance's) first and second dimensions map directly to the
        # dimension of the RV's image.

        args_lifted = tuple(p[ind_indices] for p in bcast_dist_params)
    else:
        # In this case, no indexing is applied to the parameters; only the
        # `size` parameter is affected.
        args_lifted = dist_params

    # TODO: Could use `ShapeFeature` info.  We would need to be sure that
    # `node` isn't in the results, though.
    # if hasattr(fgraph, "shape_feature"):
    #     output_shape = fgraph.shape_feature.shape_of(node.outputs[0])
    # else:
    output_shape = indexed_result_shape(base_rv.shape, st_indices)

    size_lifted = (
        output_shape if rv_op.ndim_supp == 0 else output_shape[: -rv_op.ndim_supp]
    )

    # Boolean indices can actually change the `size` value (compared to just
    # *which* dimensions of `size` are used).
    if any(st_is_bool):
        size_lifted = tuple(
            tt_sum(idx) if is_bool else s
            for s, is_bool, idx in zip(
                size_lifted, st_is_bool, st_indices[: (reps_ind_split_idx + 1)]
            )
        )

    new_node = rv_op.make_node(rng, size_lifted, dtype, *args_lifted)
    _, new_rv = new_node.outputs

    # Calling `Op.make_node` directly circumvents test value computations, so
    # we need to compute the test values manually
    if config.compute_test_value != "off":
        compute_test_value(new_node)

    return [new_rv]
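
As a small illustration of the boolean-index handling above, here is a minimal sketch using plain NumPy arrays (my own toy shapes and mask) in place of the symbolic graph: a boolean index shrinks the corresponding ``size`` dimension to its number of ``True`` entries, which is why ``size_lifted`` uses ``tt_sum(idx)`` for boolean indices:

import numpy as np

rng = np.random.default_rng(42)
draws = rng.normal(0.0, 1.0, size=(5, 3))

# boolean index on the first (replication-space) dimension
mask = np.array([True, False, True, True, False])
indexed = draws[mask]

# the lifted ``size`` for that dimension is the sum of the boolean mask
assert indexed.shape == (int(mask.sum()), 3)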
Example #26
def test_undefined_grad():
    srng = MRG_RandomStream(seed=1234)

    # checking uniform distribution
    low = scalar()
    out = srng.uniform((), low=low)
    with pytest.raises(NullTypeGradError):
        grad(out, low)

    high = scalar()
    out = srng.uniform((), low=0, high=high)
    with pytest.raises(NullTypeGradError):
        grad(out, high)

    out = srng.uniform((), low=low, high=high)
    with pytest.raises(NullTypeGradError):
        grad(out, (low, high))

    # checking binomial distribution
    prob = scalar()
    out = srng.binomial((), p=prob)
    with pytest.raises(NullTypeGradError):
        grad(out, prob)

    # checking multinomial distribution
    prob1 = scalar()
    prob2 = scalar()
    p = [as_tensor_variable([prob1, 0.5, 0.25])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    with pytest.raises(NullTypeGradError):
        grad(tt_sum(out), prob1)

    p = [as_tensor_variable([prob1, prob2])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    with pytest.raises(NullTypeGradError):
        grad(tt_sum(out), (prob1, prob2))

    # checking choice
    p = [as_tensor_variable([prob1, prob2, 0.1, 0.2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], (prob1, prob2))

    p = [as_tensor_variable([prob1, prob2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], (prob1, prob2))

    p = [as_tensor_variable([prob1, 0.2, 0.3])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], prob1)

    # checking normal distribution
    avg = scalar()
    out = srng.normal((), avg=avg)
    with pytest.raises(NullTypeGradError):
        grad(out, avg)

    std = scalar()
    out = srng.normal((), avg=0, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, std)

    out = srng.normal((), avg=avg, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, (avg, std))

    # checking truncated normal distribution
    avg = scalar()
    out = srng.truncated_normal((), avg=avg)
    with pytest.raises(NullTypeGradError):
        grad(out, avg)

    std = scalar()
    out = srng.truncated_normal((), avg=0, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, std)

    out = srng.truncated_normal((), avg=avg, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, (avg, std))