def _build_marginal_likelihood_logp(self, y, X, Xu, sigma):
    sigma2 = at.square(sigma)
    Kuu = self.cov_func(Xu)
    Kuf = self.cov_func(Xu, X)
    Luu = cholesky(stabilize(Kuu))
    A = solve_lower(Luu, Kuf)
    Qffd = at.sum(A * A, 0)  # diagonal of Qff = Kfu Kuu^{-1} Kuf
    if self.approx == "FITC":
        # FITC keeps the exact diagonal of Kff in the noise term
        Kffd = self.cov_func(X, diag=True)
        Lamd = at.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
        trace = 0.0
    elif self.approx == "VFE":
        # VFE uses homoskedastic noise plus a trace correction term
        Lamd = at.ones_like(Qffd) * sigma2
        trace = (1.0 / (2.0 * sigma2)) * (
            at.sum(self.cov_func(X, diag=True)) - at.sum(at.sum(A * A, 0))
        )
    else:  # DTC
        Lamd = at.ones_like(Qffd) * sigma2
        trace = 0.0
    A_l = A / Lamd
    L_B = cholesky(at.eye(Xu.shape[0]) + at.dot(A_l, at.transpose(A)))
    r = y - self.mean_func(X)
    r_l = r / Lamd
    c = solve_lower(L_B, at.dot(A, r_l))
    constant = 0.5 * X.shape[0] * at.log(2.0 * np.pi)
    logdet = 0.5 * at.sum(at.log(Lamd)) + at.sum(at.log(at.diag(L_B)))
    quadratic = 0.5 * (at.dot(r, r_l) - at.dot(c, c))
    return -1.0 * (constant + logdet + quadratic + trace)
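
# Illustrative check (not part of the code above): `logdet` is 0.5 * log|Qff + diag(Lamd)|,
# computed via the matrix determinant lemma
#     log|A^T A + diag(Lamd)| = sum(log(Lamd)) + 2 * sum(log(diag(L_B))),
# where L_B = chol(I_m + A diag(1/Lamd) A^T) and A stands in for Luu^{-1} Kuf,
# so that A^T A is the Nystrom approximation Qff. A minimal NumPy sketch with
# hypothetical shapes m (inducing points) and n (data points):
import numpy as np

rng = np.random.default_rng(0)
m, n = 5, 50
A = rng.normal(size=(m, n))
Lamd = rng.uniform(0.5, 2.0, size=n)

L_B = np.linalg.cholesky(np.eye(m) + (A / Lamd) @ A.T)
logdet_lemma = np.sum(np.log(Lamd)) + 2 * np.sum(np.log(np.diag(L_B)))
logdet_direct = np.linalg.slogdet(A.T @ A + np.diag(Lamd))[1]
assert np.isclose(logdet_lemma, logdet_direct)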
def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total):
    sigma2 = at.square(sigma)
    Kuu = cov_total(Xu)
    Kuf = cov_total(Xu, X)
    Luu = cholesky(stabilize(Kuu))
    A = solve_lower(Luu, Kuf)
    Qffd = at.sum(A * A, 0)
    if self.approx == "FITC":
        Kffd = cov_total(X, diag=True)
        Lamd = at.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
    else:  # VFE or DTC
        Lamd = at.ones_like(Qffd) * sigma2
    A_l = A / Lamd
    L_B = cholesky(at.eye(Xu.shape[0]) + at.dot(A_l, at.transpose(A)))
    r = y - mean_total(X)
    r_l = r / Lamd
    c = solve_lower(L_B, at.dot(A, r_l))
    Kus = self.cov_func(Xu, Xnew)
    As = solve_lower(Luu, Kus)
    mu = self.mean_func(Xnew) + at.dot(at.transpose(As), solve_upper(at.transpose(L_B), c))
    C = solve_lower(L_B, As)
    if diag:
        Kss = self.cov_func(Xnew, diag=True)
        var = Kss - at.sum(at.square(As), 0) + at.sum(at.square(C), 0)
        if pred_noise:
            var += sigma2
        return mu, var
    else:
        cov = (
            self.cov_func(Xnew) - at.dot(at.transpose(As), As) + at.dot(at.transpose(C), C)
        )
        if pred_noise:
            cov += sigma2 * at.identity_like(cov)
        return mu, cov if pred_noise else stabilize(cov)
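
# Illustrative check (not part of the code above): the pair of triangular solves
# used for `mu` (solve_lower with L_B, then solve_upper with L_B^T) is a
# Cholesky-based solve against B = I_m + A diag(1/Lamd) A^T. A minimal NumPy
# sketch with hypothetical shapes:
import numpy as np

rng = np.random.default_rng(1)
m, n = 4, 20
A = rng.normal(size=(m, n))
Lamd = rng.uniform(0.5, 2.0, size=n)
r = rng.normal(size=n)

B = np.eye(m) + (A / Lamd) @ A.T
L_B = np.linalg.cholesky(B)
c = np.linalg.solve(L_B, A @ (r / Lamd))    # plays the role of solve_lower(L_B, dot(A, r_l))
via_cholesky = np.linalg.solve(L_B.T, c)    # plays the role of solve_upper(L_B.T, c)
assert np.allclose(via_cholesky, np.linalg.solve(B, A @ (r / Lamd)))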
def rv_op(cls, dist, lower=None, upper=None, size=None, rngs=None):
    lower = at.constant(-np.inf) if lower is None else at.as_tensor_variable(lower)
    upper = at.constant(np.inf) if upper is None else at.as_tensor_variable(upper)

    # When size is not specified, dist may have to be broadcasted according to lower/upper
    dist_shape = size if size is not None else at.broadcast_shape(dist, lower, upper)
    dist = change_rv_size(dist, dist_shape)

    # Censoring is achieved by clipping the base distribution between lower and upper
    rv_out = at.clip(dist, lower, upper)

    # Reference nodes to facilitate identification in other classmethods, without
    # worrying about possible dimshuffles
    rv_out.tag.dist = dist
    rv_out.tag.lower = lower
    rv_out.tag.upper = upper

    if rngs is not None:
        rv_out = cls._change_rngs(rv_out, rngs)

    return rv_out
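
# Illustrative sketch (plain NumPy, not the class API above): censoring by
# clipping means that base-distribution draws falling outside [lower, upper]
# are recorded at the bound itself, so probability mass piles up at the
# censoring points.
import numpy as np

rng = np.random.default_rng(0)
draws = rng.normal(loc=0.0, scale=1.0, size=100_000)
censored = np.clip(draws, -1.0, 1.0)

print((censored == -1.0).mean())   # approx. P(X <= -1) for a standard normal, ~0.159
print((censored == 1.0).mean())    # approx. P(X >= 1), ~0.159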
def square_dist(self, X, Xs=None):
    X2 = aet.sum(aet.square(X), 1)
    if Xs is None:
        sqd = -2.0 * aet.dot(X, aet.transpose(X)) + (
            aet.reshape(X2, (-1, 1)) + aet.reshape(X2, (1, -1))
        )
    else:
        Xs2 = aet.sum(aet.square(Xs), 1)
        sqd = -2.0 * aet.dot(X, aet.transpose(Xs)) + (
            aet.reshape(X2, (-1, 1)) + aet.reshape(Xs2, (1, -1))
        )
    return aet.clip(sqd, 0.0, np.inf)
def square_dist(self, X, Xs):
    X = at.mul(X, 1.0 / self.ls)
    X2 = at.sum(at.square(X), 1)
    if Xs is None:
        sqd = -2.0 * at.dot(X, at.transpose(X)) + (
            at.reshape(X2, (-1, 1)) + at.reshape(X2, (1, -1))
        )
    else:
        Xs = at.mul(Xs, 1.0 / self.ls)
        Xs2 = at.sum(at.square(Xs), 1)
        sqd = -2.0 * at.dot(X, at.transpose(Xs)) + (
            at.reshape(X2, (-1, 1)) + at.reshape(Xs2, (1, -1))
        )
    return at.clip(sqd, 0.0, np.inf)
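
# Illustrative check (plain NumPy, not library code) of the expansion both
# square_dist implementations rely on:
#     ||x - z||^2 = ||x||^2 + ||z||^2 - 2 * x.z
# The final clip guards against small negative entries caused by floating-point
# cancellation when x and z are nearly identical.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(4, 3))
Xs = rng.normal(size=(6, 3))

X2 = np.sum(X**2, axis=1)
Xs2 = np.sum(Xs**2, axis=1)
sqd = np.clip(-2.0 * X @ Xs.T + X2[:, None] + Xs2[None, :], 0.0, np.inf)

direct = np.sum((X[:, None, :] - Xs[None, :, :]) ** 2, axis=2)
assert np.allclose(sqd, direct)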
def rv_op(cls, dist, lower=None, upper=None, size=None, rngs=None):
    if lower is None:
        lower = at.constant(-np.inf)
    if upper is None:
        upper = at.constant(np.inf)

    # Censoring is achieved by clipping the base distribution between lower and upper
    rv_out = at.clip(dist, lower, upper)

    # Reference nodes to facilitate identification in other classmethods, without
    # worrying about possible dimshuffles
    rv_out.tag.dist = dist
    rv_out.tag.lower = lower
    rv_out.tag.upper = upper

    if size is not None:
        rv_out = cls.change_size(rv_out, size)

    if rngs is not None:
        rv_out = cls.change_rngs(rv_out, rngs)

    return rv_out
def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7):
    """Max weight norm constraints and gradient clipping

    This takes a TensorVariable and rescales it so that incoming weight
    norms are below a specified constraint value. Vectors violating the
    constraint are rescaled so that they are within the allowed range.

    Parameters
    ----------
    tensor_var: TensorVariable
        Aesara expression for update, gradient, or other quantity.
    max_norm: scalar
        This value sets the maximum allowed value of any norm in
        `tensor_var`.
    norm_axes: sequence (list or tuple), optional
        The axes over which to compute the norm. This overrides the
        default norm axes defined for the number of dimensions in
        `tensor_var`. When this is not specified and `tensor_var` is a
        matrix (2D), this is set to `(0,)`. If `tensor_var` is a 3D, 4D or
        5D tensor, it is set to a tuple listing all axes but axis 0. The
        former default is useful for working with dense layers, the latter
        is useful for 1D, 2D and 3D convolutional layers.
    epsilon: scalar, optional
        Value used to prevent numerical instability when dividing by
        very small or zero norms.

    Returns
    -------
    TensorVariable
        Input `tensor_var` with rescaling applied to weight vectors
        that violate the specified constraints.

    Examples
    --------
    >>> param = aesara.shared(
    ...     np.random.randn(100, 200).astype(aesara.config.floatX))
    >>> update = param + 100
    >>> update = norm_constraint(update, 10)
    >>> func = aesara.function([], [], updates=[(param, update)])
    >>> # Apply constrained update
    >>> _ = func()
    >>> from lasagne.utils import compute_norms
    >>> norms = compute_norms(param.get_value())
    >>> np.isclose(np.max(norms), 10)
    True

    Notes
    -----
    When `norm_axes` is not specified, the axes over which the norm is
    computed depend on the dimensionality of the input variable. If it is
    2D, it is assumed to come from a dense layer, and the norm is computed
    over axis 0. If it is 3D, 4D or 5D, it is assumed to come from a
    convolutional layer and the norm is computed over all trailing axes
    beyond axis 0. For other uses, you should explicitly specify the axes
    over which to compute the norm using `norm_axes`.
    """
    ndim = tensor_var.ndim

    if norm_axes is not None:
        sum_over = tuple(norm_axes)
    elif ndim == 2:  # DenseLayer
        sum_over = (0,)
    elif ndim in [3, 4, 5]:  # Conv{1,2,3}DLayer
        sum_over = tuple(range(1, ndim))
    else:
        raise ValueError(
            "Unsupported tensor dimensionality {}. Must specify `norm_axes`".format(ndim)
        )

    dtype = np.dtype(aesara.config.floatX).type
    norms = aet.sqrt(aet.sum(aet.sqr(tensor_var), axis=sum_over, keepdims=True))
    target_norms = aet.clip(norms, 0, dtype(max_norm))
    constrained_output = tensor_var * (target_norms / (dtype(epsilon) + norms))

    return constrained_output
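
# Illustrative check (plain NumPy, not the Aesara expression above) of the
# clip-based rescaling: target_norms = clip(norms, 0, max_norm) leaves columns
# whose norm is already below max_norm essentially unchanged and shrinks the
# rest onto the constraint boundary. Shapes and values here are hypothetical.
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(100, 200)) * 5.0     # stand-in for a dense layer's weights
max_norm, eps = 10.0, 1e-7

norms = np.sqrt(np.sum(W**2, axis=0, keepdims=True))
target = np.clip(norms, 0, max_norm)
W_constrained = W * (target / (eps + norms))

print(np.max(np.sqrt(np.sum(W_constrained**2, axis=0))) <= max_norm)   # True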
def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, return_norm=False):
    """Rescales a list of tensors based on their combined norm

    If the combined norm of the input tensors exceeds the threshold then
    all tensors are rescaled such that the combined norm is equal to the
    threshold. Scaling the norms of the gradients is often used when
    training recurrent neural networks [1]_.

    Parameters
    ----------
    tensor_vars: List of TensorVariables.
        Tensors to be rescaled.
    max_norm: float
        Threshold value for total norm.
    epsilon: scalar, optional
        Value used to prevent numerical instability when dividing by
        very small or zero norms.
    return_norm: bool
        If True, the total norm is also returned.

    Returns
    -------
    tensor_vars_scaled: list of TensorVariables
        The scaled tensor variables.
    norm: Aesara scalar
        The combined norms of the input variables prior to rescaling,
        only returned if ``return_norm=True``.

    Examples
    --------
    >>> from lasagne.layers import InputLayer, DenseLayer
    >>> import lasagne
    >>> from lasagne.updates import sgd, total_norm_constraint
    >>> x = aet.matrix()
    >>> y = aet.ivector()
    >>> l_in = InputLayer((5, 10))
    >>> l1 = DenseLayer(l_in, num_units=7, nonlinearity=aet.nnet.softmax)
    >>> output = lasagne.layers.get_output(l1, x)
    >>> cost = aet.mean(aet.nnet.categorical_crossentropy(output, y))
    >>> all_params = lasagne.layers.get_all_params(l1)
    >>> all_grads = aet.grad(cost, all_params)
    >>> scaled_grads = total_norm_constraint(all_grads, 5)
    >>> updates = sgd(scaled_grads, all_params, learning_rate=0.1)

    Notes
    -----
    The total norm can be used to monitor training.

    References
    ----------
    .. [1] Sutskever, I., Vinyals, O., & Le, Q. V. (2014): Sequence to
           sequence learning with neural networks. In Advances in Neural
           Information Processing Systems (pp. 3104-3112).
    """
    norm = aet.sqrt(sum(aet.sum(tensor**2) for tensor in tensor_vars))
    dtype = np.dtype(aesara.config.floatX).type
    target_norm = aet.clip(norm, 0, dtype(max_norm))
    multiplier = target_norm / (dtype(epsilon) + norm)
    tensor_vars_scaled = [step * multiplier for step in tensor_vars]

    if return_norm:
        return tensor_vars_scaled, norm
    else:
        return tensor_vars_scaled
def weinland(self, t):
    return (1 + self.tau * t / self.c) * at.clip(1 - t / self.c, 0, np.inf) ** self.tau
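
# Illustrative sketch (plain NumPy with hypothetical parameters c = 2.0 and
# tau = 4): the Weinland function above is a compactly supported taper that
# equals 1 at t = 0 and is identically 0 for t >= c.
import numpy as np

def weinland(t, c=2.0, tau=4):
    return (1 + tau * t / c) * np.clip(1 - t / c, 0, np.inf) ** tau

t = np.linspace(0.0, 3.0, 7)
print(weinland(t))   # starts at 1.0, decays, and is exactly 0.0 once t >= c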