def test_lop_override(self, cls_ofg):
    x = vector()
    y = 1.0 / (1.0 + exp(-x))

    def lop_ov(inps, outs, grads):
        (y_,) = outs
        (dedy_,) = grads
        return [2.0 * y_ * (1.0 - y_) * dedy_]

    y_, dedy = vector(), vector()
    op_lop_ov = cls_ofg([x, y_, dedy], [2.0 * y_ * (1.0 - y_) * dedy])

    xx = vector()
    yy1 = tt_sum(sigmoid(xx))
    gyy1 = 2.0 * grad(yy1, xx)

    for ov in [lop_ov, op_lop_ov]:
        op = cls_ofg([x], [y], lop_overrides=ov)
        yy2 = tt_sum(op(xx))
        gyy2 = grad(yy2, xx)
        fn = function([xx], [gyy1, gyy2])

        xval = np.random.rand(32).astype(config.floatX)
        y1val, y2val = fn(xval)
        assert np.allclose(y1val, y2val)

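# Quick NumPy sanity check (illustrative sketch only, not part of the test
# suite) of the identity the override above encodes:
# d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)); the extra factor 2.0 is the
# deliberate deviation the test detects.
import numpy as np

_x = np.linspace(-4.0, 4.0, 101)
_s = 1.0 / (1.0 + np.exp(-_x))
_fd = np.gradient(_s, _x)  # finite-difference derivative
assert np.allclose(_fd, _s * (1.0 - _s), atol=1e-2)
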
def infer_shape(self, fgraph, node, ins_shapes):
    i0_shapes = ins_shapes[0]
    repeats = node.inputs[1]
    out_shape = list(i0_shapes)

    # uint64 shapes are not supported.
    dtype = None
    if repeats.dtype in ["uint8", "uint16", "uint32"]:
        dtype = "int64"
    if self.axis is None:
        if repeats.ndim == 0:
            if len(i0_shapes) == 0:
                out_shape = [repeats]
            else:
                res = 1
                for d in i0_shapes:
                    res = res * d
                out_shape = (res * repeats,)
        else:
            out_shape = [tt_sum(repeats, dtype=dtype)]
    else:
        if repeats.ndim == 0:
            out_shape[self.axis] = out_shape[self.axis] * repeats
        else:
            out_shape[self.axis] = tt_sum(repeats, dtype=dtype)
    return [out_shape]

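# Illustrative NumPy check (not part of the Op) of the shape rules implemented
# above: a scalar `repeats` with `axis=None` scales the flattened size, while a
# vector `repeats` along an axis makes that axis the sum of the repeat counts.
import numpy as np

_a = np.arange(6).reshape(2, 3)
assert np.repeat(_a, 2).shape == (2 * 6,)  # axis=None, scalar repeats
assert np.repeat(_a, [1, 2, 3], axis=1).shape == (2, 1 + 2 + 3)  # vector repeats
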
def test_pickle_unpickle_without_reoptimization():
    mode = config.mode
    if mode in ["DEBUG_MODE", "DebugMode"]:
        mode = "FAST_RUN"
    x1 = fmatrix("x1")
    x2 = fmatrix("x2")
    x3 = aesara.shared(np.ones((10, 10), dtype=floatX))
    x4 = aesara.shared(np.ones((10, 10), dtype=floatX))
    y = tt_sum(tt_sum(tt_sum(x1**2 + x2) + x3) + x4)

    updates = OrderedDict()
    updates[x3] = x3 + 1
    updates[x4] = x4 + 1
    f = aesara.function([x1, x2], y, updates=updates, mode=mode)

    # now pickle the compiled aesara fn
    string_pkl = pickle.dumps(f, -1)

    # compute f value
    in1 = np.ones((10, 10), dtype=floatX)
    in2 = np.ones((10, 10), dtype=floatX)

    # test unpickle without optimization
    default = config.reoptimize_unpickled_function
    try:
        # the default is True
        config.reoptimize_unpickled_function = False
        f_ = pickle.loads(string_pkl)
        assert f(in1, in2) == f_(in1, in2)
    finally:
        config.reoptimize_unpickled_function = default

def test_grad(self): x = vector("x") a = np.random.random(50).astype(config.floatX) aesara.function([x], grad(tt_sum(diff(x)), x)) utt.verify_grad(self.op, [a]) for k in range(TestDiffOp.nb): aesara.function([x], grad(tt_sum(diff(x, n=k)), x)) utt.verify_grad(DiffOp(n=k), [a], eps=7e-3)
def test_grad_grad(self, cls_ofg):
    x, y, z = matrices("xyz")
    e = x + y * z
    op = cls_ofg([x, y, z], [e])
    f = op(x, y, z)
    f = f - grad(tt_sum(f), y)
    f = f - grad(tt_sum(f), y)
    fn = function([x, y, z], f)
    xv = np.ones((2, 2), dtype=config.floatX)
    yv = np.ones((2, 2), dtype=config.floatX) * 3
    zv = np.ones((2, 2), dtype=config.floatX) * 5
    assert np.allclose(6.0, fn(xv, yv, zv))

def __call__(self, v, cost, parameters, damp):
    # compute the Gauss-Newton matrix right-multiplied by `v`
    Jv = Rop(self._s, parameters, v)
    HJv = grad(
        tt_sum(grad(cost, self._s) * Jv), self._s, consider_constant=[Jv]
    )
    JHJv = grad(
        tt_sum(HJv * self._s), parameters, consider_constant=[HJv, Jv]
    )

    # apply Tikhonov damping
    JHJv = [JHJvi + damp * vi for JHJvi, vi in zip(JHJv, v)]
    return JHJv

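# A minimal NumPy sketch (illustrative only; all names below are local) of the
# identity used above: with J = ds/dparams and H = d^2 cost / ds^2, the
# Gauss-Newton product is G @ v = J.T @ (H @ (J @ v)), which the nested `grad`
# calls compute without ever forming J or H explicitly.
import numpy as np

_rng = np.random.default_rng(0)
_J = _rng.standard_normal((3, 4))
_H = np.diag(_rng.random(3) + 1.0)  # any symmetric H works
_v = _rng.standard_normal(4)

_Jv = _J @ _v
_HJv = _H @ _Jv
assert np.allclose(_J.T @ _HJv, (_J.T @ _H @ _J) @ _v)
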
def test_swap_SharedVariable_with_given(self):
    # A special test case for logistic_sgd.py in the Deep Learning Tutorial.
    # This test asserts that SharedVariables in different functions share the
    # same storage.
    train_x = aesara.shared(value=np.random.rand(10, 10).astype(config.floatX))
    test_x = aesara.shared(value=np.random.rand(10, 10).astype(config.floatX))

    train_y = aesara.shared(value=np.random.rand(10, 1).astype(config.floatX))
    test_y = aesara.shared(value=np.random.rand(10, 1).astype(config.floatX))

    i = iscalar("index")
    x = vector("x")
    y = vector("y")

    # this formula is meaningless; it only serves the test
    out = (tt_sum(x) - y) ** 2

    train = aesara.function(
        [i],
        out,
        givens={x: train_x[i], y: train_y[i]},
        updates={train_x: train_x + 0.1},
    )

    test_def = aesara.function([i], out, givens={x: test_x[i], y: test_y[i]})
    test_cpy = train.copy(
        swap={train_x: test_x, train_y: test_y}, delete_updates=True
    )

    for in1, in2 in zip(test_def.maker.inputs, test_cpy.maker.inputs):
        assert in1.value is in2.value

def test_hessian():
    x = vector()
    y = tt_sum(x ** 2)
    Hx = hessian(y, x)
    f = aesara.function([x], Hx)
    vx = np.arange(10).astype(aesara.config.floatX)
    assert np.allclose(f(vx), np.eye(10) * 2)

def test_grad(self):
    x = vector()
    np_x = np.random.randn(7).astype(aesara.config.floatX)

    # offset = 0 case:
    mtx_x = GpuAllocDiag()(x)
    sum_mtx_x = tt_sum(mtx_x)
    grad_x = aesara.grad(sum_mtx_x, x)
    grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)

    fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
    fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)
    computed_grad_x = fn_grad_x(np_x)
    computed_grad_mtx_x = fn_grad_mtx_x(np_x)
    true_grad_x = np.diagonal(computed_grad_mtx_x, 0)
    assert np.allclose(computed_grad_x, true_grad_x)

    # offset > 0 case:
    mtx_x = GpuAllocDiag(2)(x)
    sum_mtx_x = tt_sum(mtx_x)
    grad_x = aesara.grad(sum_mtx_x, x)
    grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)

    fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
    fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)
    computed_grad_x = fn_grad_x(np_x)
    computed_grad_mtx_x = fn_grad_mtx_x(np_x)
    true_grad_x = np.diagonal(computed_grad_mtx_x, 2)
    assert np.allclose(computed_grad_x, true_grad_x)

    # offset < 0 case:
    mtx_x = GpuAllocDiag(-3)(x)
    sum_mtx_x = tt_sum(mtx_x)
    grad_x = aesara.grad(sum_mtx_x, x)
    grad_mtx_x = aesara.grad(sum_mtx_x, mtx_x)

    fn_grad_x = aesara.function([x], grad_x, mode=mode_with_gpu)
    fn_grad_mtx_x = aesara.function([x], grad_mtx_x, mode=mode_with_gpu)
    computed_grad_x = fn_grad_x(np_x)
    computed_grad_mtx_x = fn_grad_mtx_x(np_x)
    true_grad_x = np.diagonal(computed_grad_mtx_x, -3)
    assert np.allclose(computed_grad_x, true_grad_x)

def test_no_shared_as_input(self):
    # Test that shared variables cannot be used as function inputs.
    w_init = np.random.rand(2, 2)
    w = shared(w_init.copy(), "w")
    with pytest.raises(
        TypeError, match=r"^Cannot use a shared variable \(w\) as explicit input"
    ):
        pfunc([w], tt_sum(w * w))

def test_shared_grad(self, cls_ofg):
    x, y, z = matrices("xyz")
    s = shared(np.random.rand(2, 2).astype(config.floatX))
    e = x + y * z + s
    op = cls_ofg([x, y, z], [e])
    f = op(x, y, z)
    f = f - grad(tt_sum(f), y)
    fn = function([x, y, z], f)
    xv = np.ones((2, 2), dtype=config.floatX)
    yv = np.ones((2, 2), dtype=config.floatX) * 3
    zv = np.ones((2, 2), dtype=config.floatX) * 5
    assert np.allclose(11.0 + s.get_value(), fn(xv, yv, zv))

    # take the gradient again, this time w.r.t. the shared variable
    f = op(x, y, z)
    f = f - grad(tt_sum(f), s)
    fn = function([x, y, z], f)
    assert np.allclose(15.0 + s.get_value(), fn(xv, yv, zv))

def test_FunctionMaker_cache_optimizations():
    opt_db_file = os.path.join(config.compiledir, "optimized_graphs.pkl")
    if os.path.exists(opt_db_file):
        os.remove(opt_db_file)

    floatX = "float32"
    mode = config.mode
    if mode in ["DEBUG_MODE", "DebugMode"]:
        mode = "FAST_RUN"

    graph_db_file = os.path.join(config.compiledir, "optimized_graphs.pkl")
    assert not os.path.exists(graph_db_file)

    with config.change_flags(cache_optimizations=True):
        a = fmatrix("a")
        b = fmatrix("b")
        c = aesara.shared(np.ones((10, 10), dtype=floatX))
        d = aesara.shared(np.ones((10, 10), dtype=floatX))
        e = tt_sum(tt_sum(tt_sum(a ** 2 + b) + c) + d)
        f1 = aesara.function([a, b], e, mode=mode)

        # FIXME: We can do much better about testing this.
        assert os.path.exists(graph_db_file)

        m = fmatrix("x1")
        n = fmatrix("x2")
        p = aesara.shared(np.ones((10, 10), dtype=floatX))
        q = aesara.shared(np.ones((10, 10), dtype=floatX))
        j = tt_sum(tt_sum(tt_sum(m ** 2 + n) + p) + q)
        f2 = aesara.function([m, n], j, mode=mode)

        in1 = np.ones((10, 10), dtype=floatX)
        in2 = np.ones((10, 10), dtype=floatX)

        assert f1(in1, in2) == f2(in1, in2)

def test_jax_CAReduce():
    a_tt = vector("a")
    a_tt.tag.test_value = np.r_[1, 2, 3].astype(config.floatX)

    x = tt_sum(a_tt, axis=None)
    x_fg = FunctionGraph([a_tt], [x])

    compare_jax_and_py(x_fg, [np.r_[1, 2, 3].astype(config.floatX)])

    a_tt = matrix("a")
    a_tt.tag.test_value = np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)

    x = tt_sum(a_tt, axis=0)
    x_fg = FunctionGraph([a_tt], [x])

    compare_jax_and_py(x_fg, [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])

    x = tt_sum(a_tt, axis=1)
    x_fg = FunctionGraph([a_tt], [x])

    compare_jax_and_py(x_fg, [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])

    a_tt = matrix("a")
    a_tt.tag.test_value = np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)

    x = prod(a_tt, axis=0)
    x_fg = FunctionGraph([a_tt], [x])

    compare_jax_and_py(x_fg, [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])

    x = tt_all(a_tt)
    x_fg = FunctionGraph([a_tt], [x])

    compare_jax_and_py(x_fg, [np.c_[[1, 2, 3], [1, 2, 3]].astype(config.floatX)])

def test_gradient_scan():
    # Test for a crash when using MRG inside scan and taking the gradient
    # See https://groups.google.com/d/msg/theano-dev/UbcYyU5m-M8/UO9UgXqnQP0J
    aesara_rng = MRG_RandomStream(10)
    w = shared(np.ones(1, dtype="float32"))

    def one_step(x):
        return x + aesara_rng.uniform((1,), dtype="float32") * w

    x = vector(dtype="float32")
    values, updates = scan(one_step, outputs_info=x, n_steps=10)
    gw = grad(tt_sum(values[-1]), w)
    f = function([x], gw)
    f(np.arange(1, dtype="float32"))

def test_default_container(self):
    # Ensure it is possible to (implicitly) use a shared variable in a
    # function, as a 'state' that can be updated at will.
    rng = np.random.RandomState(1827)
    w_init = rng.rand(5)
    w = shared(w_init.copy(), "w")
    reg = tt_sum(w * w)
    f = pfunc([], reg)

    assert f() == np.sum(w_init * w_init)

    # Change the value of w and ensure the output changes accordingly.
    w.set_value(w.get_value(borrow=True) + 1.0, borrow=True)
    assert f() == np.sum((w_init + 1) ** 2)

def local_abstract_batch_norm_train_grad(fgraph, node):
    if not isinstance(node.op, AbstractBatchNormTrainGrad):
        return None

    x, dy, scale, x_mean, x_invstd, epsilon = node.inputs
    axes = node.op.axes
    if min(axes) < 0 or max(axes) > x.ndim:
        return None
    if (
        not isinstance(x.type, TensorType)
        or not isinstance(dy.type, TensorType)
        or not isinstance(scale.type, TensorType)
        or not isinstance(x_mean.type, TensorType)
        or not isinstance(x_invstd.type, TensorType)
        or not isinstance(epsilon.type, TensorType)
    ):
        return None

    x_diff = x - x_mean
    mean_dy_x_diff = mean(dy * x_diff, axis=axes, keepdims=True)
    c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd ** 3))

    g_wrt_inputs = scale * (c - mean(c, axis=axes, keepdims=True))
    g_wrt_scale = tt_sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
    g_wrt_bias = tt_sum(dy, axis=axes, keepdims=True)
    results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]

    results = [
        aet.patternbroadcast(r, r_orig.broadcastable)
        for (r, r_orig) in zip(results, node.outputs)
    ]

    for var in aesara.graph.basic.vars_between(node.inputs, results):
        if var not in node.inputs:
            copy_stack_trace(node.outputs[0], var)
    return results

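# Hedged side sketch (illustrative, not part of the rewrite): with
# x_hat = (x - x_mean) * x_invstd, the expressions above reduce to the
# textbook batch-norm gradients d cost/d scale = sum(dy * x_hat) and
# d cost/d bias = sum(dy), restated here in NumPy for a concrete shape.
import numpy as np

_rng = np.random.default_rng(42)
_x = _rng.standard_normal((8, 3))
_dy = _rng.standard_normal((8, 3))
_x_hat = (_x - _x.mean(axis=0)) / _x.std(axis=0)
_g_scale = (_dy * _x_hat).sum(axis=0)  # matches g_wrt_scale with axes=(0,)
_g_bias = _dy.sum(axis=0)              # matches g_wrt_bias with axes=(0,)
assert _g_scale.shape == _g_bias.shape == (3,)
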
def test_DownsampleFactorMax_hessian(self):
    # Example provided by Frans Cronje, see
    # https://groups.google.com/d/msg/theano-users/qpqUy_3glhw/JMwIvlN5wX4J
    x_vec = vector("x")
    z = aet.dot(x_vec.dimshuffle(0, "x"), x_vec.dimshuffle("x", 0))
    y = pool_2d(input=z, ws=(2, 2), ignore_border=True)
    C = aet.exp(tt_sum(y))
    grad_hess = aesara.gradient.hessian(cost=C, wrt=x_vec)
    fn_hess = function(inputs=[x_vec], outputs=grad_hess)

    # The value has been manually computed from the theoretical gradient,
    # and confirmed by the implementation.
    assert np.allclose(fn_hess([1, 2]), [[0.0, 0.0], [0.0, 982.7667]])

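# Hedged derivation (illustrative, not from the original file): for input
# [1, 2], z = outer(x, x) has its (2, 2)-pool max at b**2 with b = 2, so
# C = exp(b**2) and d2C/db2 = exp(b**2) * (2 + 4 * b**2) = 18 * exp(4)
# ~= 982.7667, matching the asserted Hessian entry (the other entries are
# locally zero because the gradient flows only through the max).
import numpy as np

_b = 2.0
assert np.isclose(np.exp(_b ** 2) * (2 + 4 * _b ** 2), 982.7667, atol=1e-3)
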
def test_undefined_grad_opt():
    # Make sure that an undefined grad gets removed from the optimized graph.
    random = MRG_RandomStream(np.random.randint(1, 2147462579))

    pvals = shared(np.random.rand(10, 20).astype(config.floatX))
    pvals = pvals / pvals.sum(axis=1)
    pvals = zero_grad(pvals)

    samples = random.multinomial(pvals=pvals, n=1)
    samples = cast(samples, pvals.dtype)
    samples = zero_grad(samples)

    cost = tt_sum(samples + pvals)
    grad_out = grad(cost, samples)

    f = function([], grad_out)
    assert not any(
        isinstance(node.op, UndefinedGrad) for node in f.maker.fgraph.apply_nodes
    )

def __init__(
    self,
    input=None,
    target=None,
    n_input=1,
    n_hidden=1,
    n_output=1,
    lr=1e-3,
    **kw,
):
    super().__init__(**kw)

    if input is None:
        input = dvector("input")
    if target is None:
        target = dvector("target")

    self.input = input
    self.target = target
    self.lr = shared(lr, "learning_rate")
    self.w1 = shared(np.zeros((n_hidden, n_input)), "w1")
    self.w2 = shared(np.zeros((n_output, n_hidden)), "w2")
    self.hidden = sigmoid(dot(self.w1, self.input))
    self.output = dot(self.w2, self.hidden)
    self.cost = tt_sum((self.output - self.target) ** 2)
    self.sgd_updates = {
        self.w1: self.w1 - self.lr * grad(self.cost, self.w1),
        self.w2: self.w2 - self.lr * grad(self.cost, self.w2),
    }
    self.sgd_step = pfunc(
        params=[self.input, self.target],
        outputs=[self.output, self.cost],
        updates=self.sgd_updates,
    )
    self.compute_output = pfunc([self.input], self.output)
    self.output_from_hidden = pfunc([self.hidden], self.output)

def test_broadcastable():
    R = MRG_RandomStream(234)
    x = matrix()
    size1 = (10, 1)
    size2 = (x.shape[0], 1)

    pvals_1 = np.random.uniform(0, 1, size=size1)
    pvals_1 = pvals_1 / sum(pvals_1)

    pvals_2 = R.uniform(size=size2)
    pvals_2 = pvals_2 / tt_sum(pvals_2)

    for distribution in [
        R.uniform,
        R.normal,
        R.truncated_normal,
        R.binomial,
        R.multinomial,
        R.multinomial_wo_replacement,
    ]:
        # multinomial and multinomial_wo_replacement do not support the "size"
        # argument; their sizes are defined implicitly by the "pvals" argument.
        if distribution in [R.multinomial, R.multinomial_wo_replacement]:
            # check when all dimensions are constant
            uu = distribution(pvals=pvals_1)
            assert uu.broadcastable == (False, True)

            # check when some dimensions are aesara variables
            uu = distribution(pvals=pvals_2)
            assert uu.broadcastable == (False, True)
        else:
            # check when all dimensions are constant
            uu = distribution(size=size1)
            assert uu.broadcastable == (False, True)

            # check when some dimensions are aesara variables
            uu = distribution(size=size2)
            assert uu.broadcastable == (False, True)

def grad(self, inp, grads):
    x, dy, scale, x_mean, x_invstd, epsilon = inp
    ddinputs, ddscale, ddbias = grads

    x_diff = x - x_mean
    mean_dy_x_diff = mean(dy * x_diff, axis=self.axes, keepdims=True)

    # compute gradients given each of the output gradients
    g_wrt_x = 0
    g_wrt_dy = 0
    g_wrt_scale = 0
    g_wrt_x_mean = 0
    g_wrt_x_invstd = 0

    if not isinstance(ddinputs.type, aesara.gradient.DisconnectedType):
        ccc = scale * (ddinputs - mean(ddinputs, axis=self.axes, keepdims=True))
        ddd = (x_invstd ** 3) * (
            ccc * mean(dy * x_diff, axis=self.axes, keepdims=True)
            + dy * mean(ccc * x_diff, axis=self.axes, keepdims=True)
        )

        g_wrt_x = g_wrt_x - ddd
        g_wrt_dy = g_wrt_dy + (
            (ccc * x_invstd)
            - (
                (x_invstd ** 3)
                * x_diff
                * mean(ccc * x_diff, axis=self.axes, keepdims=True)
            )
        )

        eee = (dy * x_invstd) - ((x_invstd ** 3) * x_diff * mean_dy_x_diff)
        g_wrt_scale = g_wrt_scale + tt_sum(
            ddinputs * (eee - mean(eee, axis=self.axes, keepdims=True)),
            axis=self.axes,
            keepdims=True,
        )

        g_wrt_x_mean = g_wrt_x_mean + tt_sum(ddd, axis=self.axes, keepdims=True)
        g_wrt_x_invstd = g_wrt_x_invstd + tt_sum(
            ccc * (dy - 3 * (x_invstd ** 2) * x_diff * mean_dy_x_diff),
            axis=self.axes,
            keepdims=True,
        )

    if not isinstance(ddscale.type, aesara.gradient.DisconnectedType):
        g_wrt_x = g_wrt_x + (x_invstd * ddscale * dy)
        g_wrt_dy = g_wrt_dy + (x_invstd * ddscale * x_diff)
        g_wrt_x_mean = g_wrt_x_mean - (
            x_invstd * ddscale * tt_sum(dy, axis=self.axes, keepdims=True)
        )
        g_wrt_x_invstd = g_wrt_x_invstd + (
            ddscale * tt_sum(dy * x_diff, axis=self.axes, keepdims=True)
        )

    if not isinstance(ddbias.type, aesara.gradient.DisconnectedType):
        g_wrt_dy = g_wrt_dy + aet.fill(dy, ddbias)

    # depending on which output gradients are given,
    # some inputs should be disconnected
    results = [
        g_wrt_x,
        g_wrt_dy,
        g_wrt_scale,
        g_wrt_x_mean,
        g_wrt_x_invstd,
        aesara.gradient.DisconnectedType()(),
    ]
    return [
        aesara.gradient.DisconnectedType()() if (type(r) == int and r == 0) else r
        for r in results
    ]

def test_gpu_memory_usage(self):
    # This test validates that the memory usage of the defined aesara
    # function is reasonable when executed on the GPU. It checks for
    # a bug in which one of scan's optimizations was not applied, which
    # made the scan node compute large and unnecessary outputs and
    # brought memory usage on the GPU to ~12G.

    # Dimensionality of input and output data (not one-hot coded)
    n_in = 100
    n_out = 100
    # Number of neurons in hidden layer
    n_hid = 4000
    # Number of minibatches
    mb_size = 2
    # Time steps in minibatch
    mb_length = 200

    # Define input variables
    xin = ftensor3(name="xin")
    yout = ftensor3(name="yout")

    # Initialize the network parameters
    U = aesara.shared(np.zeros((n_in, n_hid), dtype="float32"), name="W_xin_to_l1")
    V = aesara.shared(np.zeros((n_hid, n_hid), dtype="float32"), name="W_l1_to_l1")
    W = aesara.shared(np.zeros((n_hid, n_out), dtype="float32"), name="W_l1_to_l2")
    nparams = [U, V, W]

    # Build the forward pass
    l1_base = dot(xin, U)

    def scan_l(baseline, last_step):
        return baseline + dot(last_step, V)

    zero_output = aet.alloc(np.asarray(0.0, dtype="float32"), mb_size, n_hid)

    l1_out, _ = scan(
        scan_l,
        sequences=[l1_base],
        outputs_info=[zero_output],
        mode=self.mode_with_gpu_nodebug,
    )

    l2_out = dot(l1_out, W)

    # Compute the cost and take the gradient wrt params
    cost = tt_sum((l2_out - yout) ** 2)
    grads = aesara.grad(cost, nparams)
    updates = list(zip(nparams, (n - g for n, g in zip(nparams, grads))))

    # Compile the aesara function
    feval_backprop = aesara.function(
        [xin, yout], cost, updates=updates, mode=self.mode_with_gpu_nodebug
    )

    # Validate that the PushOutScanOutput optimization has been applied
    # by checking the number of outputs of the grad Scan node in the
    # compiled function.
    nodes = feval_backprop.maker.fgraph.toposort()
    scan_nodes = [n for n in nodes if isinstance(n.op, Scan)]

    # The grad scan is always the 2nd one according to toposort. If the
    # optimization has been applied, it has 2 outputs, otherwise 3.
    grad_scan_node = scan_nodes[1]
    assert len(grad_scan_node.outputs) == 2, len(grad_scan_node.outputs)

    # Call the aesara function to ensure the absence of a memory error
    feval_backprop(
        np.zeros((mb_length, mb_size, n_in), dtype="float32"),
        np.zeros((mb_length, mb_size, n_out), dtype="float32"),
    )

def test_batch_normalization_train_broadcast():
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (tensor5, tensor4, tensor3, matrix, vector):
            x = vartype("x")
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # convert axes to explicit list
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes

            # compute axes for parameter tensors
            non_bc_axes = tuple(i for i in range(ndim) if i not in axes2)
            params_dimshuffle = ["x"] * ndim
            for i, axis in enumerate(non_bc_axes):
                params_dimshuffle[axis] = i

            # construct non-broadcasted parameter variables
            param_type = TensorType(x.dtype, (False,) * len(non_bc_axes))
            scale, bias, running_mean, running_var = (
                param_type(n)
                for n in ("scale", "bias", "running_mean", "running_var")
            )

            # broadcast parameter variables
            scale_bc = scale.dimshuffle(params_dimshuffle)
            bias_bc = bias.dimshuffle(params_dimshuffle)
            running_mean_bc = running_mean.dimshuffle(params_dimshuffle)
            running_var_bc = running_var.dimshuffle(params_dimshuffle)

            # batch_normalization_train with original, non-broadcasted variables
            train_non_bc = batchnorm.batch_normalization_train(
                x,
                scale,
                bias,
                axes,
                eps,
                running_average_factor,
                running_mean,
                running_var,
            )
            # batch_normalization_train with broadcasted variables
            train_bc = batchnorm.batch_normalization_train(
                x,
                scale_bc,
                bias_bc,
                axes,
                eps,
                running_average_factor,
                running_mean_bc,
                running_var_bc,
            )
            train_bc = tuple(
                [train_bc[0]] + [r.dimshuffle(non_bc_axes) for r in train_bc[1:]]
            )

            # batch_normalization_test with original, non-broadcasted variables
            test_non_bc = batchnorm.batch_normalization_test(
                x, scale, bias, running_mean, running_var, axes, eps
            )
            # batch_normalization_test with broadcasted variables
            test_bc = batchnorm.batch_normalization_test(
                x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes, eps
            )

            # subtract the results of the non-broadcasted and broadcasted calls
            results_non_bc = train_non_bc + (test_non_bc,)
            results_bc = train_bc + (test_bc,)
            results = [
                abs(r - r_bc) for (r, r_bc) in zip(results_non_bc, results_bc)
            ]

            # compile to compute all differences
            f = aesara.function(
                [x, scale, bias, running_mean, running_var], tt_sum(sum(results))
            )

            # the paired ops are exactly the same, so the optimizer should have
            # collapsed the sum of differences to a constant zero
            nodes = f.maker.fgraph.toposort()
            if aesara.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, aesara.compile.DeepCopyOp)

            inputs = [
                np.asarray(np.random.rand(*((4,) * n)), x.dtype)
                for n in [
                    x.ndim,
                    scale.ndim,
                    bias.ndim,
                    running_mean.ndim,
                    running_var.ndim,
                ]
            ]
            assert 0.0 == f(*inputs)

def test_grad_override(self, cls_ofg):
    x, y = vectors("xy")

    def go(inps, gs):
        x, y = inps
        (g,) = gs
        return [g * y * 2, g * x * 1.5]

    dedz = vector("dedz")
    op_mul_grad = cls_ofg([x, y, dedz], go([x, y], [dedz]))

    op_mul = cls_ofg([x, y], [x * y], grad_overrides=go)
    op_mul2 = cls_ofg([x, y], [x * y], grad_overrides=op_mul_grad)

    # single override case (function or OfG instance)
    xx, yy = vector("xx"), vector("yy")
    for op in [op_mul, op_mul2]:
        zz = tt_sum(op(xx, yy))
        dx, dy = grad(zz, [xx, yy])
        fn = function([xx, yy], [dx, dy])

        xv = np.random.rand(16).astype(config.floatX)
        yv = np.random.rand(16).astype(config.floatX)
        dxv, dyv = fn(xv, yv)
        assert np.allclose(yv * 2, dxv)
        assert np.allclose(xv * 1.5, dyv)

    # list override case
    def go1(inps, gs):
        x, w, b = inps
        g = gs[0]
        return g * w * 2

    def go2(inps, gs):
        x, w, b = inps
        g = gs[0]
        return g * x * 1.5

    w, b = vectors("wb")
    # we make the 3rd gradient default (no override)
    op_linear = cls_ofg(
        [x, w, b], [x * w + b], grad_overrides=[go1, go2, "default"]
    )
    xx, ww, bb = vector("xx"), vector("yy"), vector("bb")
    zz = tt_sum(op_linear(xx, ww, bb))
    dx, dw, db = grad(zz, [xx, ww, bb])
    fn = function([xx, ww, bb], [dx, dw, db])
    xv = np.random.rand(16).astype(config.floatX)
    wv = np.random.rand(16).astype(config.floatX)
    bv = np.random.rand(16).astype(config.floatX)
    dxv, dwv, dbv = fn(xv, wv, bv)
    assert np.allclose(wv * 2, dxv)
    assert np.allclose(xv * 1.5, dwv)
    assert np.allclose(np.ones(16, dtype=config.floatX), dbv)

    # NullType and DisconnectedType
    op_linear2 = cls_ofg(
        [x, w, b],
        [x * w + b],
        grad_overrides=[go1, NullType()(), DisconnectedType()()],
    )
    zz2 = tt_sum(op_linear2(xx, ww, bb))
    dx2, dw2, db2 = grad(
        zz2,
        [xx, ww, bb],
        return_disconnected="Disconnected",
        disconnected_inputs="ignore",
        null_gradients="return",
    )
    assert isinstance(dx2.type, TensorType)
    assert dx2.ndim == 1
    assert isinstance(dw2.type, NullType)
    assert isinstance(db2.type, DisconnectedType)

def local_subtensor_rv_lift(fgraph, node):
    """Lift ``*Subtensor`` `Op`s up to a `RandomVariable`'s parameters.

    In a fashion similar to `local_dimshuffle_rv_lift`, the indexed dimensions
    need to be separated into distinct replication-space and (independent)
    parameter-space ``*Subtensor``s.

    The replication-space ``*Subtensor`` can be used to determine a
    sub/super-set of the replication-space and, thus, a "smaller"/"larger"
    ``size`` tuple.  The parameter-space ``*Subtensor`` is simply lifted and
    applied to the `RandomVariable`'s distribution parameters.

    Consider the following example graph:
    ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]``.  The
    ``*Subtensor`` `Op` requests indices ``idx1``, ``idx2``, and ``idx3``,
    which correspond to all three ``size`` dimensions.  Now, depending on the
    broadcasted dimensions of ``mu`` and ``std``, this ``*Subtensor`` `Op`
    could be reducing the ``size`` parameter and/or subsetting the independent
    ``mu`` and ``std`` parameters.  Only once the dimensions are properly
    separated into the two replication/parameter subspaces can we determine
    how the ``*Subtensor`` indices are distributed.

    For instance, ``normal(mu, std, size=(d1, d2, d3))[idx1, idx2, idx3]``
    could become
    ``normal(mu[idx1], std[idx2], size=np.shape(idx1) + np.shape(idx2) + np.shape(idx3))``
    if ``mu.shape == std.shape == ()``.

    ``normal`` is a rather simple case, because it's univariate.  Multivariate
    cases require a mapping between the parameter space and the image of the
    random variable.  This may not always be possible, but for many common
    distributions it is.  For example, the dimensions of the multivariate
    normal's image can be mapped directly to each dimension of its parameters.
    We use these mappings to change a graph like
    ``multivariate_normal(mu, Sigma)[idx1]`` into
    ``multivariate_normal(mu[idx1], Sigma[idx1, idx1])``.

    Also, there's the important matter of "advanced" indexing, which may not
    only subset an array, but also broadcast it to a larger size.

    """
    st_op = node.op

    if not isinstance(st_op, (AdvancedSubtensor, AdvancedSubtensor1, Subtensor)):
        return False

    base_rv = node.inputs[0]
    rv_node = base_rv.owner

    if not (rv_node and isinstance(rv_node.op, RandomVariable)):
        return False

    # If no one else is using the underlying `RandomVariable`, then we can
    # do this; otherwise, the graph would be internally inconsistent.
    if not all(
        (n == node or isinstance(n.op, Shape)) for n, i in fgraph.clients[base_rv]
    ):
        return False

    rv_op = rv_node.op
    rng, size, dtype, *dist_params = rv_node.inputs

    # TODO: Remove this once the multi-dimensional changes described below are
    # in place.
    if rv_op.ndim_supp > 0:
        return False

    idx_list = getattr(st_op, "idx_list", None)
    if idx_list:
        cdata = get_idx_list(node.inputs, idx_list)
    else:
        cdata = node.inputs[1:]

    st_indices, st_is_bool = zip(
        *tuple(
            (as_index_variable(i), getattr(i, "dtype", None) == "bool")
            for i in cdata
        )
    )

    # We need to separate dimensions into replications and independents
    num_ind_dims = None
    if len(dist_params) == 1:
        num_ind_dims = dist_params[0].ndim
    else:
        # When there is more than one distribution parameter, assume that all
        # of them will broadcast to the maximum number of dimensions
        num_ind_dims = max(d.ndim for d in dist_params)

    reps_ind_split_idx = base_rv.ndim - (num_ind_dims + rv_op.ndim_supp)

    if len(st_indices) > reps_ind_split_idx:
        # These are the indices that need to be applied to the parameters
        ind_indices = tuple(st_indices[reps_ind_split_idx:])

        # We need to broadcast the parameters before applying the
        # `*Subtensor*` with these indices, because the indices could be
        # referencing broadcast dimensions that don't exist (yet)
        bcast_dist_params = broadcast_params(dist_params, rv_op.ndims_params)

        # TODO: For multidimensional distributions, we need a map that tells
        # us which dimensions of the parameters need to be indexed.
        #
        # For example, `multivariate_normal` would have the following:
        # `RandomVariable.param_to_image_dims = ((0,), (0, 1))`
        #
        # I.e. the first parameter's (i.e. mean's) first dimension maps
        # directly to the dimension of the RV's image, and the second
        # parameter's (i.e. covariance's) first and second dimensions map
        # directly to the dimension of the RV's image.

        args_lifted = tuple(p[ind_indices] for p in bcast_dist_params)
    else:
        # In this case, no indexing is applied to the parameters; only the
        # `size` parameter is affected.
        args_lifted = dist_params

    # TODO: Could use `ShapeFeature` info.  We would need to be sure that
    # `node` isn't in the results, though.
    # if hasattr(fgraph, "shape_feature"):
    #     output_shape = fgraph.shape_feature.shape_of(node.outputs[0])
    # else:
    output_shape = indexed_result_shape(base_rv.shape, st_indices)

    size_lifted = (
        output_shape if rv_op.ndim_supp == 0 else output_shape[: -rv_op.ndim_supp]
    )

    # Boolean indices can actually change the `size` value (compared to just
    # *which* dimensions of `size` are used).
    if any(st_is_bool):
        size_lifted = tuple(
            tt_sum(idx) if is_bool else s
            for s, is_bool, idx in zip(
                size_lifted, st_is_bool, st_indices[: (reps_ind_split_idx + 1)]
            )
        )

    new_node = rv_op.make_node(rng, size_lifted, dtype, *args_lifted)
    _, new_rv = new_node.outputs

    # Calling `Op.make_node` directly circumvents test value computations, so
    # we need to compute the test values manually
    if config.compute_test_value != "off":
        compute_test_value(new_node)

    return [new_rv]

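# Illustrative NumPy analogue (not part of the rewrite) of the boolean-index
# `size` adjustment above: a boolean index selects `idx.sum()` entries along
# its dimension, so the lifted `size` entry becomes the sum of the mask rather
# than the original length.
import numpy as np

_draws = np.random.default_rng(0).normal(size=(5,))
_mask = np.array([True, False, True, True, False])
assert _draws[_mask].shape == (int(_mask.sum()),)  # == (3,)
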
def test_undefined_grad():
    srng = MRG_RandomStream(seed=1234)

    # checking uniform distribution
    low = scalar()
    out = srng.uniform((), low=low)
    with pytest.raises(NullTypeGradError):
        grad(out, low)

    high = scalar()
    out = srng.uniform((), low=0, high=high)
    with pytest.raises(NullTypeGradError):
        grad(out, high)

    out = srng.uniform((), low=low, high=high)
    with pytest.raises(NullTypeGradError):
        grad(out, (low, high))

    # checking binomial distribution
    prob = scalar()
    out = srng.binomial((), p=prob)
    with pytest.raises(NullTypeGradError):
        grad(out, prob)

    # checking multinomial distribution
    prob1 = scalar()
    prob2 = scalar()
    p = [as_tensor_variable([prob1, 0.5, 0.25])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    with pytest.raises(NullTypeGradError):
        grad(tt_sum(out), prob1)

    p = [as_tensor_variable([prob1, prob2])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    with pytest.raises(NullTypeGradError):
        grad(tt_sum(out), (prob1, prob2))

    # checking choice
    p = [as_tensor_variable([prob1, prob2, 0.1, 0.2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], (prob1, prob2))

    p = [as_tensor_variable([prob1, prob2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], (prob1, prob2))

    p = [as_tensor_variable([prob1, 0.2, 0.3])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    with pytest.raises(NullTypeGradError):
        grad(out[0], prob1)

    # checking normal distribution
    avg = scalar()
    out = srng.normal((), avg=avg)
    with pytest.raises(NullTypeGradError):
        grad(out, avg)

    std = scalar()
    out = srng.normal((), avg=0, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, std)

    out = srng.normal((), avg=avg, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, (avg, std))

    # checking truncated normal distribution
    avg = scalar()
    out = srng.truncated_normal((), avg=avg)
    with pytest.raises(NullTypeGradError):
        grad(out, avg)

    std = scalar()
    out = srng.truncated_normal((), avg=0, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, std)

    out = srng.truncated_normal((), avg=avg, std=std)
    with pytest.raises(NullTypeGradError):
        grad(out, (avg, std))