Example 1
 def _build_marginal_likelihood_logp(self, y, X, Xu, sigma):
     sigma2 = at.square(sigma)
     Kuu = self.cov_func(Xu)
     Kuf = self.cov_func(Xu, X)
     Luu = cholesky(stabilize(Kuu))
     A = solve_lower(Luu, Kuf)
     Qffd = at.sum(A * A, 0)
     if self.approx == "FITC":
         Kffd = self.cov_func(X, diag=True)
         Lamd = at.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
         trace = 0.0
     elif self.approx == "VFE":
         Lamd = at.ones_like(Qffd) * sigma2
         trace = (1.0 / (2.0 * sigma2)) * (
             at.sum(self.cov_func(X, diag=True)) - at.sum(Qffd))
     else:  # DTC
         Lamd = at.ones_like(Qffd) * sigma2
         trace = 0.0
     A_l = A / Lamd
     L_B = cholesky(at.eye(Xu.shape[0]) + at.dot(A_l, at.transpose(A)))
     r = y - self.mean_func(X)
     r_l = r / Lamd
     c = solve_lower(L_B, at.dot(A, r_l))
     constant = 0.5 * X.shape[0] * at.log(2.0 * np.pi)
     logdet = 0.5 * at.sum(at.log(Lamd)) + at.sum(at.log(at.diag(L_B)))
     quadratic = 0.5 * (at.dot(r, r_l) - at.dot(c, c))
     return -1.0 * (constant + logdet + quadratic + trace)
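
A sketch of the algebra this method appears to implement, in the usual sparse-approximation notation (this reading is mine, not part of the original listing):

    Q_{ff} = K_{fu} K_{uu}^{-1} K_{uf} = A^\top A, \qquad A = L_{uu}^{-1} K_{uf}, \qquad K_{uu} = L_{uu} L_{uu}^\top
    \Lambda = \mathrm{diag}(K_{ff} - Q_{ff}) + \sigma^2 I \;\; \text{(FITC)}, \qquad \Lambda = \sigma^2 I \;\; \text{(VFE, DTC)}
    \log p(y) \approx -\tfrac{n}{2}\log 2\pi - \tfrac12 \log\lvert Q_{ff}+\Lambda\rvert - \tfrac12 r^\top (Q_{ff}+\Lambda)^{-1} r - T, \qquad r = y - m(X)

With B = I + A \Lambda^{-1} A^\top = L_B L_B^\top, the matrix determinant lemma and the Woodbury identity give the terms computed above:

    \log\lvert Q_{ff}+\Lambda\rvert = \log\lvert\Lambda\rvert + 2 \sum_i \log (L_B)_{ii}
    r^\top (Q_{ff}+\Lambda)^{-1} r = r^\top \Lambda^{-1} r - c^\top c, \qquad c = L_B^{-1} A \Lambda^{-1} r

and T = \mathrm{tr}(K_{ff} - Q_{ff}) / (2\sigma^2) for VFE, T = 0 otherwise, so only the small Cholesky factors L_{uu} and L_B over the inducing points are ever formed.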
Example 2
 def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma,
                        cov_total, mean_total):
     sigma2 = at.square(sigma)
     Kuu = cov_total(Xu)
     Kuf = cov_total(Xu, X)
     Luu = cholesky(stabilize(Kuu))
     A = solve_lower(Luu, Kuf)
     Qffd = at.sum(A * A, 0)
     if self.approx == "FITC":
         Kffd = cov_total(X, diag=True)
         Lamd = at.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
     else:  # VFE or DTC
         Lamd = at.ones_like(Qffd) * sigma2
     A_l = A / Lamd
     L_B = cholesky(at.eye(Xu.shape[0]) + at.dot(A_l, at.transpose(A)))
     r = y - mean_total(X)
     r_l = r / Lamd
     c = solve_lower(L_B, at.dot(A, r_l))
     Kus = self.cov_func(Xu, Xnew)
     As = solve_lower(Luu, Kus)
     mu = self.mean_func(Xnew) + at.dot(at.transpose(As),
                                        solve_upper(at.transpose(L_B), c))
     C = solve_lower(L_B, As)
     if diag:
         Kss = self.cov_func(Xnew, diag=True)
         var = Kss - at.sum(at.square(As), 0) + at.sum(at.square(C), 0)
         if pred_noise:
             var += sigma2
         return mu, var
     else:
         cov = self.cov_func(Xnew) - at.dot(at.transpose(As), As) + at.dot(
             at.transpose(C), C)
         if pred_noise:
             cov += sigma2 * at.identity_like(cov)
         return mu, cov if pred_noise else stabilize(cov)
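
The predictive equations this conditional corresponds to, in the same notation as above (again my reading of the code, not taken from the source):

    A_* = L_{uu}^{-1} K_{u*}, \qquad C = L_B^{-1} A_*
    \mu_* = m(X_*) + A_*^\top L_B^{-\top} c = m(X_*) + A_*^\top B^{-1} A \Lambda^{-1} r
    \Sigma_* = K_{**} - A_*^\top A_* + C^\top C = K_{**} - Q_{**} + A_*^\top B^{-1} A_*

with \sigma^2 added to the diagonal (or \sigma^2 I to the full covariance) when pred_noise is set.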
Example 3
    def rv_op(cls, dist, lower=None, upper=None, size=None, rngs=None):

        lower = (
            at.constant(-np.inf) if lower is None else at.as_tensor_variable(lower))
        upper = (
            at.constant(np.inf) if upper is None else at.as_tensor_variable(upper))

        # When size is not specified, dist may have to be broadcasted according to lower/upper
        dist_shape = size if size is not None else at.broadcast_shape(
            dist, lower, upper)
        dist = change_rv_size(dist, dist_shape)

        # Censoring is achieved by clipping the base distribution between lower and upper
        rv_out = at.clip(dist, lower, upper)

        # Reference nodes to facilitate identification in other classmethods, without
        # worrying about possible dimshuffles
        rv_out.tag.dist = dist
        rv_out.tag.lower = lower
        rv_out.tag.upper = upper

        if rngs is not None:
            rv_out = cls._change_rngs(rv_out, rngs)

        return rv_out
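
The comment above is the whole trick: a censored draw is just a clipped draw from the base distribution, so probability mass piles up at the bounds while the interior is unchanged. A minimal NumPy sketch of the same idea (illustrative only, with made-up bounds; it does not use the PyMC machinery above):

    import numpy as np

    rng = np.random.default_rng(0)
    draws = rng.normal(loc=0.0, scale=1.0, size=10_000)   # base distribution
    censored = np.clip(draws, -1.0, 1.0)                  # censor at lower=-1, upper=1

    # Point masses accumulate at the bounds; interior draws pass through untouched.
    print((censored == -1.0).mean(), (censored == 1.0).mean())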
Example 4
 def square_dist(self, X, Xs=None):
     X2 = aet.sum(aet.square(X), 1)
     if Xs is None:
         sqd = -2.0 * aet.dot(X, aet.transpose(X)) + (
             aet.reshape(X2, (-1, 1)) + aet.reshape(X2, (1, -1)))
     else:
         Xs2 = aet.sum(aet.square(Xs), 1)
         sqd = -2.0 * aet.dot(X, aet.transpose(Xs)) + (
             aet.reshape(X2, (-1, 1)) + aet.reshape(Xs2, (1, -1)))
     return aet.clip(sqd, 0.0, np.inf)
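
The expansion above is the usual identity for squared Euclidean distances,

    \lVert x_i - x_j \rVert^2 = \lVert x_i \rVert^2 + \lVert x_j \rVert^2 - 2\, x_i^\top x_j

evaluated for all pairs at once via a matrix product; the final clip to [0, inf) removes the small negative values that floating-point cancellation can produce when x_i and x_j are nearly identical.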
Example 5
 def square_dist(self, X, Xs=None):
     X = at.mul(X, 1.0 / self.ls)
     X2 = at.sum(at.square(X), 1)
     if Xs is None:
         sqd = -2.0 * at.dot(X, at.transpose(X)) + (
             at.reshape(X2, (-1, 1)) + at.reshape(X2, (1, -1)))
     else:
         Xs = at.mul(Xs, 1.0 / self.ls)
         Xs2 = at.sum(at.square(Xs), 1)
         sqd = -2.0 * at.dot(X, at.transpose(Xs)) + (
             at.reshape(X2, (-1, 1)) + at.reshape(Xs2, (1, -1)))
     return at.clip(sqd, 0.0, np.inf)
Example 6
    def rv_op(cls, dist, lower=None, upper=None, size=None, rngs=None):
        if lower is None:
            lower = at.constant(-np.inf)
        if upper is None:
            upper = at.constant(np.inf)

        # Censoring is achieved by clipping the base distribution between lower and upper
        rv_out = at.clip(dist, lower, upper)

        # Reference nodes to facilitate identification in other classmethods, without
        # worrying about possible dimshuffles
        rv_out.tag.dist = dist
        rv_out.tag.lower = lower
        rv_out.tag.upper = upper

        if size is not None:
            rv_out = cls.change_size(rv_out, size)
        if rngs is not None:
            rv_out = cls.change_rngs(rv_out, rngs)

        return rv_out
Example 7
def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7):
    """Max weight norm constraints and gradient clipping

    This takes a TensorVariable and rescales it so that incoming weight
    norms are below a specified constraint value. Vectors violating the
    constraint are rescaled so that they are within the allowed range.

    Parameters
    ----------
    tensor_var: TensorVariable
        Aesara expression for update, gradient, or other quantity.
    max_norm: scalar
        This value sets the maximum allowed value of any norm in
        `tensor_var`.
    norm_axes: sequence (list or tuple)
        The axes over which to compute the norm.  This overrides the
        default norm axes defined for the number of dimensions
        in `tensor_var`. When this is not specified and `tensor_var` is a
        matrix (2D), this is set to `(0,)`. If `tensor_var` is a 3D, 4D or
        5D tensor, it is set to a tuple listing all axes but axis 0. The
        former default is useful for working with dense layers, the latter
        is useful for 1D, 2D and 3D convolutional layers.
        (Optional)
    epsilon: scalar, optional
        Value used to prevent numerical instability when dividing by
        very small or zero norms.

    Returns
    -------
    TensorVariable
        Input `tensor_var` with rescaling applied to weight vectors
        that violate the specified constraints.

    Examples
    --------
    >>> param = aesara.shared(
    ...     np.random.randn(100, 200).astype(aesara.config.floatX))
    >>> update = param + 100
    >>> update = norm_constraint(update, 10)
    >>> func = aesara.function([], [], updates=[(param, update)])
    >>> # Apply constrained update
    >>> _ = func()
    >>> from lasagne.utils import compute_norms
    >>> norms = compute_norms(param.get_value())
    >>> np.isclose(np.max(norms), 10)
    True

    Notes
    -----
    When `norm_axes` is not specified, the axes over which the norm is
    computed depend on the dimensionality of the input variable. If it is
    2D, it is assumed to come from a dense layer, and the norm is computed
    over axis 0. If it is 3D, 4D or 5D, it is assumed to come from a
    convolutional layer and the norm is computed over all trailing axes
    beyond axis 0. For other uses, you should explicitly specify the axes
    over which to compute the norm using `norm_axes`.
    """
    ndim = tensor_var.ndim

    if norm_axes is not None:
        sum_over = tuple(norm_axes)
    elif ndim == 2:  # DenseLayer
        sum_over = (0, )
    elif ndim in [3, 4, 5]:  # Conv{1,2,3}DLayer
        sum_over = tuple(range(1, ndim))
    else:
        raise ValueError("Unsupported tensor dimensionality {}. "
                         "Must specify `norm_axes`".format(ndim))

    dtype = np.dtype(aesara.config.floatX).type
    norms = aet.sqrt(aet.sum(aet.sqr(tensor_var), axis=sum_over,
                             keepdims=True))
    target_norms = aet.clip(norms, 0, dtype(max_norm))
    constrained_output = tensor_var * (target_norms / (dtype(epsilon) + norms))

    return constrained_output
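
The clip-then-rescale pattern is equivalent to multiplying each weight vector by min(1, max_norm / ||w||), up to the epsilon guard. A small NumPy sketch of the same computation for the 2D case (illustrative values, independent of Aesara/Lasagne):

    import numpy as np

    W = 5.0 * np.random.randn(100, 200)           # columns are incoming weight vectors
    max_norm, eps = 10.0, 1e-7

    norms = np.sqrt(np.sum(W**2, axis=0, keepdims=True))
    target = np.clip(norms, 0, max_norm)          # norms above max_norm are capped
    W_constrained = W * (target / (eps + norms))  # same as W * min(1, max_norm / norm)

    print(np.max(np.sqrt(np.sum(W_constrained**2, axis=0))))  # <= max_norm (up to eps)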
Example 8
def total_norm_constraint(tensor_vars,
                          max_norm,
                          epsilon=1e-7,
                          return_norm=False):
    """Rescales a list of tensors based on their combined norm

    If the combined norm of the input tensors exceeds the threshold then all
    tensors are rescaled such that the combined norm is equal to the threshold.

    Scaling the norms of the gradients is often used when training recurrent
    neural networks [1]_.

    Parameters
    ----------
    tensor_vars: List of TensorVariables.
        Tensors to be rescaled.
    max_norm: float
        Threshold value for total norm.
    epsilon: scalar, optional
        Value used to prevent numerical instability when dividing by
        very small or zero norms.
    return_norm: bool
        If True, the total norm is also returned.

    Returns
    -------
    tensor_vars_scaled: list of TensorVariables
        The scaled tensor variables.
    norm: Aesara scalar
        The combined norm of the input variables prior to rescaling,
        only returned if ``return_norm=True``.

    Examples
    --------
    >>> from lasagne.layers import InputLayer, DenseLayer
    >>> import lasagne
    >>> from lasagne.updates import sgd, total_norm_constraint
    >>> x = aet.matrix()
    >>> y = aet.ivector()
    >>> l_in = InputLayer((5, 10))
    >>> l1 = DenseLayer(l_in, num_units=7, nonlinearity=aet.nnet.softmax)
    >>> output = lasagne.layers.get_output(l1, x)
    >>> cost = aet.mean(aet.nnet.categorical_crossentropy(output, y))
    >>> all_params = lasagne.layers.get_all_params(l1)
    >>> all_grads = aet.grad(cost, all_params)
    >>> scaled_grads = total_norm_constraint(all_grads, 5)
    >>> updates = sgd(scaled_grads, all_params, learning_rate=0.1)

    Notes
    -----
    The total norm can be used to monitor training.

    References
    ----------
    .. [1] Sutskever, I., Vinyals, O., & Le, Q. V. (2014): Sequence to sequence
       learning with neural networks. In Advances in Neural Information
       Processing Systems (pp. 3104-3112).
    """
    norm = aet.sqrt(sum(aet.sum(tensor**2) for tensor in tensor_vars))
    dtype = np.dtype(aesara.config.floatX).type
    target_norm = aet.clip(norm, 0, dtype(max_norm))
    multiplier = target_norm / (dtype(epsilon) + norm)
    tensor_vars_scaled = [step * multiplier for step in tensor_vars]

    if return_norm:
        return tensor_vars_scaled, norm
    else:
        return tensor_vars_scaled
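
This is gradient clipping by global norm: all tensors share a single scaling factor, so their relative directions are preserved. A NumPy sketch of the arithmetic (illustrative shapes, independent of Aesara):

    import numpy as np

    grads = [np.random.randn(3, 4), np.random.randn(5)]   # gradients of two parameters
    max_norm, eps = 5.0, 1e-7

    global_norm = np.sqrt(sum(np.sum(g**2) for g in grads))
    target = np.clip(global_norm, 0, max_norm)
    scaled = [g * (target / (eps + global_norm)) for g in grads]

    # The combined norm after scaling is min(global_norm, max_norm), up to eps.
    print(global_norm, np.sqrt(sum(np.sum(g**2) for g in scaled)))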
Example 9
 def weinland(self, t):
     return (1 + self.tau * t / self.c) * at.clip(1 - t / self.c, 0,
                                                  np.inf)**self.tau
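
Written out, this is (my transcription of the code above):

    W(t) = \left(1 + \tau \frac{t}{c}\right) \left(1 - \frac{t}{c}\right)_+^{\tau}

where (x)_+ = max(x, 0) is what the at.clip call implements, so W(t) vanishes for t >= c.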