def test_undefined_grad_opt():
    # Make sure that undefined gradients get removed from the optimized graph.
    random = MRG_RandomStream(np.random.randint(1, 2147462579))

    pvals = aesara.shared(np.random.rand(10, 20).astype(config.floatX))
    pvals = pvals / pvals.sum(axis=1)
    pvals = zero_grad(pvals)

    samples = random.multinomial(pvals=pvals, n=1)
    samples = aet.cast(samples, pvals.dtype)
    samples = zero_grad(samples)

    cost = aet_sum(samples + pvals)
    grad_res = grad(cost, samples)

    f = aesara.function([], grad_res)
    assert not any(
        isinstance(node.op, UndefinedGrad) for node in f.maker.fgraph.apply_nodes
    )

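# A brief note (illustrative, based on the documented `aesara.gradient`
# semantics): `zero_grad(x)` keeps the forward value of `x` but
# backpropagates a zero gradient through it; the assertion above then checks
# that any `UndefinedGrad` nodes introduced while building the gradient are
# stripped from the optimized graph.
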
def test_softmax_grad_optimizations(self):
    x = matrix("x")
    one_of_n = lvector("one_of_n")
    op = crossentropy_categorical_1hot
    xe = op(softmax_legacy(x), one_of_n)
    sum_xe = aet_sum(xe)
    g_x = grad(sum_xe, x)
    fgraph = FunctionGraph([x, one_of_n], [g_x])

    assert check_stack_trace(
        fgraph,
        ops_to_check=[crossentropy_softmax_1hot_with_bias_dx, softmax_legacy],
    )

    optdb.query(OPT_FAST_RUN).optimize(fgraph)

    ops = {node.op for node in fgraph.toposort()}
    assert crossentropy_softmax_argmax_1hot_with_bias not in ops
    assert crossentropy_softmax_1hot_with_bias_dx in ops
    assert softmax_legacy in ops
    assert softmax_grad_legacy not in ops

def test_local_csm_grad_c():
    data = vector()
    indices, indptr, shape = (ivector(), ivector(), ivector())
    mode = get_default_mode()

    if aesara.config.mode == "FAST_COMPILE":
        mode = Mode(linker="c|py", optimizer="fast_compile")

    mode = mode.including("specialize", "local_csm_grad_c")
    for CS, cast in [
        (sparse.CSC, sp.sparse.csc_matrix),
        (sparse.CSR, sp.sparse.csr_matrix),
    ]:
        cost = aet_sum(sparse.DenseFromSparse()(CS(data, indices, indptr, shape)))
        f = aesara.function(
            [data, indices, indptr, shape], aesara.grad(cost, data), mode=mode
        )
        assert not any(
            isinstance(node.op, sparse.CSMGrad) for node in f.maker.fgraph.toposort()
        )
        v = cast(random_lil((10, 40), config.floatX, 3))
        f(v.data, v.indices, v.indptr, v.shape)

def test_matrix_perform_and_opt(self):
    m = config.mode
    m = aesara.compile.get_mode(m)
    m.check_isfinite = False
    x, y = matrices("xy")
    # regular softmax and crossentropy
    sm = softmax(x)
    cm = categorical_crossentropy(sm, y)

    # numerically stable log-softmax with crossentropy
    logsm = logsoftmax(x)
    sm2 = exp(logsm)  # used only to show equivalence with sm
    cm2 = -aet_sum(y * logsm, axis=1)
    grad_node = grad(cm2.mean(), x)

    # create some large inputs into a softmax
    a = np.exp(10 * np.random.random((5, 10)).astype(config.floatX))
    # create some one-hot coded labels
    b = np.eye(5, 10).astype(config.floatX)

    # show equivalence of softmax and exponentiated numerically stable
    # log-softmax
    f1 = aesara.function([x], [sm, sm2])
    sm_, sm2_ = f1(a)
    utt.assert_allclose(sm_, sm2_)

    # now show that the two versions result in the same crossentropy cost;
    # this indicates that the forward function does provide some numerical
    # stability
    f2 = aesara.function([x, y], [cm, cm2], mode=m)
    cm_, cm2_ = f2(a, b)
    utt.assert_allclose(cm_, cm2_)

    # now, show that in the standard softmax case the gradients blow up
    # while in the log-softmax case they don't
    f3 = aesara.function([x, y], [grad_node])
    grad_ = f3(a, b)
    assert not np.any(np.isnan(grad_))

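# Why the log-softmax path above is stable, as a minimal sketch: log-softmax
# is typically computed with the shifted log-sum-exp identity
#
#     log(softmax(x))_i = (x_i - max(x)) - log(sum_j exp(x_j - max(x)))
#
# so no exp() of a large logit is ever materialized; the naive
# log(softmax(x)) instead underflows to log(0) = -inf for large logits,
# which is what makes the gradients of the unfused formulation blow up.
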
def test_broadcastable():
    R = MRG_RandomStream(234)
    x = matrix()
    size1 = (10, 1)
    size2 = (x.shape[0], 1)
    pvals_1 = np.random.uniform(0, 1, size=size1)
    pvals_1 = pvals_1 / sum(pvals_1)
    pvals_2 = R.uniform(size=size2)
    pvals_2 = pvals_2 / aet_sum(pvals_2)

    for distribution in [
        R.uniform,
        R.normal,
        R.truncated_normal,
        R.binomial,
        R.multinomial,
        R.multinomial_wo_replacement,
    ]:
        # multinomial and multinomial_wo_replacement do not support a "size"
        # argument; their sizes are defined implicitly by the "pvals" argument.
        if distribution in [R.multinomial, R.multinomial_wo_replacement]:
            # check when all dimensions are constant
            uu = distribution(pvals=pvals_1)
            assert uu.broadcastable == (False, True)

            # check when some dimensions are aesara variables
            uu = distribution(pvals=pvals_2)
            assert uu.broadcastable == (False, True)
        else:
            # check when all dimensions are constant
            uu = distribution(size=size1)
            assert uu.broadcastable == (False, True)

            # check when some dimensions are aesara variables
            uu = distribution(size=size2)
            assert uu.broadcastable == (False, True)

def test_crossentropy_softmax_1hot_with_bias_dxcale_cost(self):
    x = matrix("x")
    y = lvector("y")
    a = scalar("a")

    def validate_grad_graph(func):
        # The graph of the gradient should not have softmaxgrad anymore
        has_cx1hotdx = False
        has_softmax = False
        has_softmaxdx = False
        for node in func.maker.fgraph.toposort():
            if node.op == crossentropy_softmax_1hot_with_bias_dx:
                has_cx1hotdx = True
            if node.op == softmax_legacy:
                has_softmax = True
            if node.op == softmax_grad_legacy:
                has_softmaxdx = True

        assert has_cx1hotdx
        assert has_softmax
        assert not has_softmaxdx

    # Cases to test
    expressions = [
        a * aet_sum(-log(softmax(x)[aet.arange(y.shape[0]), y])),
        -a * aet_sum(log(softmax(x)[aet.arange(y.shape[0]), y])),
        a * (-aet_sum(log(softmax(x)[aet.arange(y.shape[0]), y]))),
        a * aet_sum(log(softmax(x)[aet.arange(y.shape[0]), y])),
        a * aet_sum(-log(softmax(x))[aet.arange(y.shape[0]), y]),
        -a * aet_sum(log(softmax(x))[aet.arange(y.shape[0]), y]),
        a * (-aet_sum(log(softmax(x))[aet.arange(y.shape[0]), y])),
        a * aet_sum(log(softmax(x))[aet.arange(y.shape[0]), y]),
        a * mean(-log(softmax(x)[aet.arange(y.shape[0]), y])),
        -a * mean(log(softmax(x)[aet.arange(y.shape[0]), y])),
        a * (-mean(log(softmax(x)[aet.arange(y.shape[0]), y]))),
        a * mean(log(softmax(x)[aet.arange(y.shape[0]), y])),
        a * mean(-log(softmax(x))[aet.arange(y.shape[0]), y]),
        -a * mean(log(softmax(x))[aet.arange(y.shape[0]), y]),
        a * (-mean(log(softmax(x))[aet.arange(y.shape[0]), y])),
        a * mean(log(softmax(x))[aet.arange(y.shape[0]), y]),
    ]

    for expr in expressions:
        fgraph = FunctionGraph([x, y, a], [expr])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        assert 5 <= len(fgraph.toposort()) <= 10

        ops = {node.op for node in fgraph.toposort()}
        assert crossentropy_softmax_argmax_1hot_with_bias in ops
        assert softmax_legacy not in ops

        # Verify the gradient wrt x
        fgraph = FunctionGraph([x, y, a], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        assert 3 <= len(fgraph.toposort()) <= 6

        ops = {node.op for node in fgraph.toposort()}
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops

        # Verify the gradient when providing an output gradient
        fgraph = FunctionGraph(
            [x, y, a], [grad(expr, x, known_grads={expr: a * x.sum()})]
        )
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        assert 6 <= len(fgraph.toposort()) <= 8

        ops = {node.op for node in fgraph.toposort()}
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops

def test_get_rid_of_advanced_indexing_version_of_xent(self):
    x = matrix("x")
    b = vector("b")
    y = lvector("y")

    # Basic case
    expressions = [
        aet_sum(-log(softmax(x)[aet.arange(y.shape[0]), y])),
        -aet_sum(log(softmax(x)[aet.arange(y.shape[0]), y])),
        -aet_sum(log(softmax(x))[aet.arange(y.shape[0]), y]),
        aet_sum(-log(softmax(x))[aet.arange(y.shape[0]), y]),
    ]
    for expr in expressions:
        fgraph = FunctionGraph([x, y], [expr])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 4
        assert crossentropy_softmax_argmax_1hot_with_bias in ops
        assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

        # Also verify the gradient wrt x
        fgraph = FunctionGraph([x, y], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 2
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops

    # Test that a biased softmax is optimized correctly
    bias_expressions = [
        aet_sum(-log(softmax(x + b)[aet.arange(y.shape[0]), y])),
        -aet_sum(log(softmax(b + x)[aet.arange(y.shape[0]), y])),
        -aet_sum(log(softmax(x + b))[aet.arange(y.shape[0]), y]),
        aet_sum(-log(softmax(b + x))[aet.arange(y.shape[0]), y]),
    ]

    for expr in bias_expressions:
        fgraph = FunctionGraph([x, b, y], [expr, x])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 2  # [big_op, sum]
        assert crossentropy_softmax_argmax_1hot_with_bias in ops

        fgraph = FunctionGraph([x, b, y], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 2
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_with_bias in ops
        assert softmax_grad_legacy not in ops

    # Test that using "mean" instead of "sum" works, too
    mean_expressions = [
        mean(-log(softmax(x)[aet.arange(y.shape[0]), y])),
        -mean(log(softmax(x)[aet.arange(y.shape[0]), y])),
        -mean(log(softmax(x))[aet.arange(y.shape[0]), y]),
        mean(-log(softmax(x))[aet.arange(y.shape[0]), y]),
    ]

    for expr in mean_expressions:
        fgraph = FunctionGraph([x, y], [expr])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 6
        assert crossentropy_softmax_argmax_1hot_with_bias in ops
        assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

        fgraph = FunctionGraph([x, y], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        # there's an extra dimshuffle in there,
        # but I can't think of a good rule to get rid of it
        assert len(ops) == 5
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops

    mean_bias_expressions = [
        mean(-log(softmax(x + b)[aet.arange(y.shape[0]), y])),
        -mean(log(softmax(b + x)[aet.arange(y.shape[0]), y])),
        -mean(log(softmax(x + b))[aet.arange(y.shape[0]), y]),
        mean(-log(softmax(b + x))[aet.arange(y.shape[0]), y]),
    ]

    for expr in mean_bias_expressions:
        fgraph = FunctionGraph([x, b, y], [expr])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 4
        assert crossentropy_softmax_argmax_1hot_with_bias in ops
        assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

        fgraph = FunctionGraph([x, b, y], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 5
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_with_bias in ops
        assert softmax_grad_legacy not in ops

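# A minimal end-to-end sketch (an illustrative addition, using the same
# helpers imported for the tests above): compiling the advanced-indexing NLL
# under the default FAST_RUN mode should leave a single fused crossentropy
# node in the graph.
#
#     x = matrix("x")
#     y = lvector("y")
#     nll = -aet_sum(log(softmax(x))[aet.arange(y.shape[0]), y])
#     f = aesara.function([x, y], nll)  # FAST_RUN by default
#     assert any(
#         node.op == crossentropy_softmax_argmax_1hot_with_bias
#         for node in f.maker.fgraph.toposort()
#     )
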
def test_batch_normalization_train_broadcast():
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (tensor5, tensor4, tensor3, matrix, vector):
            x = vartype("x")
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
                if len(axes) == 0:
                    continue

            # convert axes to an explicit list
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes

            # compute axes for parameter tensors
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ["x"] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i

            # construct non-broadcasted parameter variables
            param_type = TensorType(x.dtype, (False,) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (
                param_type(n) for n in ("scale", "bias", "running_mean", "running_var")
            )

            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)

            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = batchnorm.batch_normalization_train(
                x,
                scale,
                bias,
                axes,
                eps,
                running_average_factor,
                running_mean,
                running_var,
            )
            # batch_normalization_train with broadcasted variables
            train_bc = batchnorm.batch_normalization_train(
                x,
                scale_bc,
                bias_bc,
                axes,
                eps,
                running_average_factor,
                running_mean_bc,
                running_var_bc,
            )
            train_bc = tuple(
                [train_bc[0]]  # out
                + [r.dimshuffle(non_bc_axes) for r in train_bc[1:]]
            )

            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = batchnorm.batch_normalization_test(
                x, scale, bias, running_mean, running_var, axes, eps
            )
            # batch_normalization_test with broadcasted variables
            test_bc = batchnorm.batch_normalization_test(
                x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes, eps
            )

            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc,)
            results_bc = train_bc + (test_bc,)
            results = [abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)]

            # compile to compute all differences
            f = aesara.function(
                [x, scale, bias, running_mean, running_var], aet_sum(sum(results))
            )

            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if aesara.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, aesara.compile.DeepCopyOp)

            inputs = [
                np.asarray(np.random.random((4,) * n), x.dtype)
                for n in [
                    x.ndim,
                    scale.ndim,
                    bias.ndim,
                    running_mean.ndim,
                    running_var.ndim,
                ]
            ]
            assert 0.0 == f(*inputs)

def test_gpu_memory_usage(self):
    # This test validates that the memory usage of the defined aesara
    # function is reasonable when executed on the GPU. It checks for
    # a bug in which one of Scan's optimizations was not applied, which
    # made the scan node compute large and unnecessary outputs that
    # brought memory usage on the GPU to ~12G.

    # Dimensionality of input and output data (not one-hot coded)
    n_in = 100
    n_out = 100
    # Number of neurons in hidden layer
    n_hid = 4000
    # Number of minibatches
    mb_size = 2
    # Time steps in minibatch
    mb_length = 200

    # Define input variables
    xin = ftensor3(name="xin")
    yout = ftensor3(name="yout")

    # Initialize the network parameters
    U = aesara.shared(np.zeros((n_in, n_hid), dtype="float32"), name="W_xin_to_l1")
    V = aesara.shared(np.zeros((n_hid, n_hid), dtype="float32"), name="W_l1_to_l1")
    W = aesara.shared(np.zeros((n_hid, n_out), dtype="float32"), name="W_l1_to_l2")
    nparams = [U, V, W]

    # Build the forward pass
    l1_base = dot(xin, U)

    def scan_l(baseline, last_step):
        return baseline + dot(last_step, V)

    zero_output = aet.alloc(np.asarray(0.0, dtype="float32"), mb_size, n_hid)

    l1_out, _ = scan(
        scan_l,
        sequences=[l1_base],
        outputs_info=[zero_output],
        mode=self.mode_with_gpu_nodebug,
    )

    l2_out = dot(l1_out, W)

    # Compute the cost and take the gradient wrt the parameters
    cost = aet_sum((l2_out - yout) ** 2)
    grads = aesara.grad(cost, nparams)
    updates = list(zip(nparams, (n - g for n, g in zip(nparams, grads))))

    # Compile the aesara function
    feval_backprop = aesara.function(
        [xin, yout], cost, updates=updates, mode=self.mode_with_gpu_nodebug
    )

    # Validate that the PushOutScanOutput optimization has been applied
    # by checking the number of outputs of the grad Scan node in the
    # compiled function.
    nodes = feval_backprop.maker.fgraph.toposort()
    scan_nodes = [n for n in nodes if isinstance(n.op, Scan)]

    # The grad scan is always the 2nd one according to toposort. If the
    # optimization has been applied, it has 2 outputs, otherwise 3.
    grad_scan_node = scan_nodes[1]
    assert len(grad_scan_node.outputs) == 2, len(grad_scan_node.outputs)

    # Call the aesara function to ensure the absence of a memory error
    feval_backprop(
        np.zeros((mb_length, mb_size, n_in), dtype="float32"),
        np.zeros((mb_length, mb_size, n_out), dtype="float32"),
    )

def test_grad_override(self, cls_ofg):
    x, y = vectors("xy")

    def go(inps, gs):
        x, y = inps
        (g,) = gs
        return [g * y * 2, g * x * 1.5]

    dedz = vector("dedz")
    op_mul_grad = cls_ofg([x, y, dedz], go([x, y], [dedz]))

    op_mul = cls_ofg([x, y], [x * y], grad_overrides=go)
    op_mul2 = cls_ofg([x, y], [x * y], grad_overrides=op_mul_grad)

    # single override case (function or OfG instance)
    xx, yy = vector("xx"), vector("yy")
    for op in [op_mul, op_mul2]:
        zz = aet_sum(op(xx, yy))
        dx, dy = grad(zz, [xx, yy])
        fn = function([xx, yy], [dx, dy])
        xv = np.random.rand(16).astype(config.floatX)
        yv = np.random.rand(16).astype(config.floatX)
        dxv, dyv = fn(xv, yv)
        assert np.allclose(yv * 2, dxv)
        assert np.allclose(xv * 1.5, dyv)

    # list override case
    def go1(inps, gs):
        x, w, b = inps
        g = gs[0]
        return g * w * 2

    def go2(inps, gs):
        x, w, b = inps
        g = gs[0]
        return g * x * 1.5

    w, b = vectors("wb")
    # we leave the 3rd gradient at the default (no override)
    op_linear = cls_ofg([x, w, b], [x * w + b], grad_overrides=[go1, go2, "default"])
    xx, ww, bb = vector("xx"), vector("ww"), vector("bb")
    zz = aet_sum(op_linear(xx, ww, bb))
    dx, dw, db = grad(zz, [xx, ww, bb])
    fn = function([xx, ww, bb], [dx, dw, db])
    xv = np.random.rand(16).astype(config.floatX)
    wv = np.random.rand(16).astype(config.floatX)
    bv = np.random.rand(16).astype(config.floatX)
    dxv, dwv, dbv = fn(xv, wv, bv)
    assert np.allclose(wv * 2, dxv)
    assert np.allclose(xv * 1.5, dwv)
    assert np.allclose(np.ones(16, dtype=config.floatX), dbv)

    # NullType and DisconnectedType overrides
    op_linear2 = cls_ofg(
        [x, w, b],
        [x * w + b],
        grad_overrides=[go1, NullType()(), DisconnectedType()()],
    )
    zz2 = aet_sum(op_linear2(xx, ww, bb))
    dx2, dw2, db2 = grad(
        zz2,
        [xx, ww, bb],
        return_disconnected="Disconnected",
        disconnected_inputs="ignore",
        null_gradients="return",
    )
    assert isinstance(dx2.type, TensorType)
    assert dx2.ndim == 1
    assert isinstance(dw2.type, NullType)
    assert isinstance(db2.type, DisconnectedType)

def grad(self, inp, grads):
    x, dy, scale, x_mean, x_invstd, epsilon = inp
    ddinputs, ddscale, ddbias = grads

    x_diff = x - x_mean
    mean_dy_x_diff = mean(dy * x_diff, axis=self.axes, keepdims=True)

    # compute gradients given each of the output gradients
    g_wrt_x = 0
    g_wrt_dy = 0
    g_wrt_scale = 0
    g_wrt_x_mean = 0
    g_wrt_x_invstd = 0

    if not isinstance(ddinputs.type, aesara.gradient.DisconnectedType):
        ccc = scale * (ddinputs - mean(ddinputs, axis=self.axes, keepdims=True))
        ddd = (x_invstd ** 3) * (
            ccc * mean(dy * x_diff, axis=self.axes, keepdims=True)
            + dy * mean(ccc * x_diff, axis=self.axes, keepdims=True)
        )

        g_wrt_x = g_wrt_x - ddd
        g_wrt_dy = g_wrt_dy + (
            (ccc * x_invstd)
            - (
                (x_invstd ** 3)
                * x_diff
                * mean(ccc * x_diff, axis=self.axes, keepdims=True)
            )
        )

        eee = (dy * x_invstd) - ((x_invstd ** 3) * x_diff * mean_dy_x_diff)
        g_wrt_scale = g_wrt_scale + aet_sum(
            ddinputs * (eee - mean(eee, axis=self.axes, keepdims=True)),
            axis=self.axes,
            keepdims=True,
        )

        g_wrt_x_mean = g_wrt_x_mean + aet_sum(ddd, axis=self.axes, keepdims=True)
        g_wrt_x_invstd = g_wrt_x_invstd + aet_sum(
            ccc * (dy - 3 * (x_invstd ** 2) * x_diff * mean_dy_x_diff),
            axis=self.axes,
            keepdims=True,
        )

    if not isinstance(ddscale.type, aesara.gradient.DisconnectedType):
        g_wrt_x = g_wrt_x + (x_invstd * ddscale * dy)
        g_wrt_dy = g_wrt_dy + (x_invstd * ddscale * x_diff)
        g_wrt_x_mean = g_wrt_x_mean - (
            x_invstd * ddscale * aet_sum(dy, axis=self.axes, keepdims=True)
        )
        g_wrt_x_invstd = g_wrt_x_invstd + (
            ddscale * aet_sum(dy * x_diff, axis=self.axes, keepdims=True)
        )

    if not isinstance(ddbias.type, aesara.gradient.DisconnectedType):
        g_wrt_dy = g_wrt_dy + aet.fill(dy, ddbias)

    # depending on which output gradients are given,
    # some inputs should be disconnected
    results = [
        g_wrt_x,
        g_wrt_dy,
        g_wrt_scale,
        g_wrt_x_mean,
        g_wrt_x_invstd,
        aesara.gradient.DisconnectedType()(),
    ]
    return [
        aesara.gradient.DisconnectedType()() if (isinstance(r, int) and r == 0) else r
        for r in results
    ]

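# A minimal sketch (an illustrative addition, assuming the public
# `aesara.tensor.nnet.batchnorm` API) of how this second-order gradient is
# exercised: differentiate through the first-order batch-norm gradient.
#
#     import aesara
#     import aesara.tensor as aet
#     from aesara.tensor.nnet import batchnorm
#
#     x = aet.matrix("x")
#     scale = aet.vector("scale").dimshuffle("x", 0)
#     bias = aet.vector("bias").dimshuffle("x", 0)
#     out = batchnorm.batch_normalization_train(x, scale, bias, "per-activation")[0]
#     gx = aesara.grad(out.sum(), x)   # first-order grad: the Op defined here
#     ggx = aesara.grad(gx.sum(), x)   # second-order grad: calls this `grad` method
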
def test_undefined_grad():
    srng = MRG_RandomStream(seed=1234)

    # checking uniform distribution
    low = scalar()
    out = srng.uniform((), low=low)
    with pytest.raises(NullTypeGradError):
        grad(out, low)

    high = scalar()
    out = srng.uniform((), low=0, high=high)
    with pytest.raises(NullTypeGradError):
        grad(out, high)

    out = srng.uniform((), low=low, high=high)
    with pytest.raises(NullTypeGradError):
        grad(out, (low, high))

    # checking binomial distribution
    prob = scalar()
    out = srng.binomial((), p=prob)
    with pytest.raises(NullTypeGradError):
        grad(out, prob)

    # checking multinomial distribution
    prob1 = scalar()
    prob2 = scalar()
    p = [as_tensor_variable([prob1, 0.5, 0.25])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    with pytest.raises(NullTypeGradError):
        grad(aet_sum(out), prob1)

    p = [as_tensor_variable([prob1, prob2])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    with pytest.raises(NullTypeGradError):
        grad(aet_sum(out), (prob1, prob2))

    # checking choice
    p = [as_tensor_variable([prob1, prob2, 0.1, 0.2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], (prob1, prob2))

    p = [as_tensor_variable([prob1, prob2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], (prob1, prob2))

    p = [as_tensor_variable([prob1, 0.2, 0.3])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], prob1)

    # checking normal distribution
    avg = scalar()
    out = srng.normal((), avg=avg)
    with pytest.raises(NullTypeGradError):
        grad(out, avg)

    std = scalar()
    out = srng.normal((), avg=0, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, std)

    out = srng.normal((), avg=avg, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, (avg, std))

    # checking truncated normal distribution
    avg = scalar()
    out = srng.truncated_normal((), avg=avg)
    with pytest.raises(NullTypeGradError):
        grad(out, avg)

    std = scalar()
    out = srng.truncated_normal((), avg=0, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, std)

    out = srng.truncated_normal((), avg=avg, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, (avg, std))

def local_subtensor_rv_lift(fgraph, node):
    """Lift a ``*Subtensor`` through ``RandomVariable`` inputs.

    In a fashion similar to ``local_dimshuffle_rv_lift``, the indexed
    dimensions need to be separated into distinct replication-space and
    (independent) parameter-space ``*Subtensor``s.

    The replication-space ``*Subtensor`` can be used to determine a
    sub/super-set of the replication-space and, thus, a "smaller"/"larger"
    ``size`` tuple.  The parameter-space ``*Subtensor`` is simply lifted and
    applied to the distribution parameters.

    Consider the following example graph:
    ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]``.  The
    ``*Subtensor`` ``Op`` requests indices ``idx1``, ``idx2``, and ``idx3``,
    which correspond to all three ``size`` dimensions.  Now, depending on the
    broadcasted dimensions of ``mu`` and ``std``, this ``*Subtensor`` ``Op``
    could be reducing the ``size`` parameter and/or sub-setting the
    independent ``mu`` and ``std`` parameters.  Only once the dimensions are
    properly separated into the two replication/parameter subspaces can we
    determine how the ``*Subtensor`` indices are distributed.  For instance,
    ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]`` could become
    ``normal(mu[idx1], std[idx2], size=np.shape(idx1) + np.shape(idx2) +
    np.shape(idx3))`` if ``mu.shape == std.shape == ()``.

    ``normal`` is a rather simple case, because it's univariate.  Multivariate
    cases require a mapping between the parameter space and the image of the
    random variable.  This may not always be possible, but for many common
    distributions it is.  For example, the dimensions of the multivariate
    normal's image can be mapped directly to each dimension of its
    parameters.  We use these mappings to change a graph like
    ``multivariate_normal(mu, Sigma)[idx1]`` into
    ``multivariate_normal(mu[idx1], Sigma[idx1, idx1])``.

    """
    st_op = node.op

    if not isinstance(st_op, (AdvancedSubtensor, AdvancedSubtensor1, Subtensor)):
        return False

    base_rv = node.inputs[0]

    rv_node = base_rv.owner
    if not (rv_node and isinstance(rv_node.op, RandomVariable)):
        return False

    # If no one else is using the underlying `RandomVariable`, then we can
    # do this; otherwise, the graph would be internally inconsistent.
    if not all(
        (n == node or isinstance(n.op, Shape))
        for n, i in fgraph.clients.get(base_rv, ())
    ):
        return False

    rv_op = rv_node.op
    rng, size, dtype, *dist_params = rv_node.inputs

    # TODO: Remove this once the multi-dimensional changes described below
    # are in place.
    if rv_op.ndim_supp > 0:
        return False

    idx_list = getattr(st_op, "idx_list", None)
    if idx_list:
        cdata = get_idx_list(node.inputs, idx_list)
    else:
        cdata = node.inputs[1:]

    st_indices, st_is_bool = zip(
        *tuple(
            (as_index_variable(i), getattr(i, "dtype", None) == "bool")
            for i in cdata
        )
    )

    # We need to separate dimensions into replications and independents
    num_ind_dims = None
    if len(dist_params) == 1:
        num_ind_dims = dist_params[0].ndim
    else:
        # When there is more than one distribution parameter, assume that all
        # of them will broadcast to the maximum number of dimensions
        num_ind_dims = max(d.ndim for d in dist_params)

    reps_ind_split_idx = base_rv.ndim - (num_ind_dims + rv_op.ndim_supp)

    if len(st_indices) > reps_ind_split_idx:
        # These are the indices that need to be applied to the parameters
        ind_indices = tuple(st_indices[reps_ind_split_idx:])

        # We need to broadcast the parameters before applying the
        # `*Subtensor*` with these indices, because the indices could be
        # referencing broadcast dimensions that don't exist (yet)
        bcast_dist_params = broadcast_params(dist_params, rv_op.ndims_params)

        # TODO: For multidimensional distributions, we need a map that tells
        # us which dimensions of the parameters need to be indexed.
        #
        # For example, `multivariate_normal` would have the following:
        # `RandomVariable.param_to_image_dims = ((0,), (0, 1))`
        #
        # I.e. the first parameter's (i.e. the mean's) first dimension maps
        # directly to the dimension of the RV's image, and the second
        # parameter's (i.e. the covariance's) first and second dimensions map
        # directly to the dimension of the RV's image.

        args_lifted = tuple(p[ind_indices] for p in bcast_dist_params)
    else:
        # In this case, no indexing is applied to the parameters; only the
        # `size` parameter is affected.
        args_lifted = dist_params

    # TODO: Could use `ShapeFeature` info.  We would need to be sure that
    # `node` isn't in the results, though.
    # if hasattr(fgraph, "shape_feature"):
    #     output_shape = fgraph.shape_feature.shape_of(node.outputs[0])
    # else:
    output_shape = indexed_result_shape(base_rv.shape, st_indices)

    size_lifted = (
        output_shape if rv_op.ndim_supp == 0 else output_shape[: -rv_op.ndim_supp]
    )

    # Boolean indices can actually change the `size` value (compared to just
    # *which* dimensions of `size` are used).
    if any(st_is_bool):
        size_lifted = tuple(
            aet_sum(idx) if is_bool else s
            for s, is_bool, idx in zip(
                size_lifted, st_is_bool, st_indices[: (reps_ind_split_idx + 1)]
            )
        )

    new_node = rv_op.make_node(rng, size_lifted, dtype, *args_lifted)
    _, new_rv = new_node.outputs

    # Calling `Op.make_node` directly circumvents test value computations, so
    # we need to compute the test values manually
    if config.compute_test_value != "off":
        compute_test_value(new_node)

    return [new_rv]
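
# A minimal sketch (an illustrative addition, not from the original module)
# of the rewrite this function performs in the simple univariate case,
# assuming the standard `aesara.tensor.random` API:
#
#     import aesara.tensor as aet
#     from aesara.tensor.random.basic import normal
#
#     mu = aet.vector("mu")
#     rv = normal(mu, 1.0)  # one independent draw per entry of `mu`
#     expr = rv[1:3]        # a `Subtensor` applied to the draw
#
# After `local_subtensor_rv_lift` is applied, `expr` is replaced by a graph
# equivalent to `normal(mu[1:3], 1.0)`: the index moves from the random
# variable's output onto its parameters (and, when present, its `size`).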