Example #1
def bernoulli_nll(x, y):
    """Computes the negative log-likelihood of a Bernoulli distribution.

    This function calculates the negative log-likelihood of a Bernoulli
    distribution.

    .. math::

        -\\log B(x; p) = -\\sum_i \\{x_i \\log(p_i) + (1 - x_i)\\log(1 - p_i)\\},

    where :math:`p = \\sigma(y)`, :math:`\\sigma(\\cdot)` is a sigmoid
    function, and :math:`B(x; p)` is a Bernoulli distribution.

    .. note::

       As this function uses a sigmoid function, you can pass a result of
       fully-connected layer (that means :class:`Linear`) to this function
       directly.

    Args:
        x (~chainer.Variable): Input variable.
        y (~chainer.Variable): A variable representing the parameter of
            Bernoulli distribution.

    Returns:
        ~chainer.Variable: A variable representing negative log-likelihood.

    """
    assert isinstance(x, variable.Variable)
    assert isinstance(y, variable.Variable)

    return sum.sum(softplus.softplus(y)) - sum.sum(x * y)
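A minimal usage sketch (not part of the original listing), assuming the public chainer.functions.bernoulli_nll API and small dummy arrays; x holds binary targets and y holds raw pre-sigmoid scores:

import numpy as np
from chainer import Variable
import chainer.functions as F

# Binary targets and raw (pre-sigmoid) scores, wrapped as Variables.
x = Variable(np.array([[1, 0, 1], [0, 0, 1]], dtype=np.float32))
y = Variable(np.array([[2.0, -1.0, 0.5], [-0.3, 0.1, 1.2]], dtype=np.float32))

# Negative log-likelihood summed over all elements.
loss = F.bernoulli_nll(x, y)
print(loss.data)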
Example #2
def gaussian_kl_divergence(mean, ln_var):
    """Computes the KL-divergence of Gaussian variables from the standard one.

    Given two variables ``mean`` representing :math:`\\mu` and ``ln_var``
    representing :math:`\\log(\\sigma^2)`, this function returns a variable
    representing the KL-divergence between the given multi-dimensional Gaussian
    :math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)`

    .. math::

       D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)),

    where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2`
    and :math:`I` is an identity matrix.

    Args:
        mean (~chainer.Variable): A variable representing mean of given
            gaussian distribution, :math:`\\mu`.
        ln_var (~chainer.Variable): A variable representing logarithm of
            variance of given gaussian distribution, :math:`\\log(\\sigma^2)`.

    Returns:
        ~chainer.Variable: A variable representing KL-divergence between
            given gaussian distribution and the standard gaussian.

    """
    assert isinstance(mean, variable.Variable)
    assert isinstance(ln_var, variable.Variable)

    J = mean.size
    var = exponential.exp(ln_var)
    return (sum.sum(mean * mean) + sum.sum(var) - sum.sum(ln_var) - J) * 0.5
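A hedged usage sketch, assuming the public chainer.functions.gaussian_kl_divergence; with zero mean and zero log-variance the divergence from the standard Gaussian is exactly zero:

import numpy as np
from chainer import Variable
import chainer.functions as F

# Mean and log-variance of a diagonal Gaussian, e.g. a VAE encoder output.
mean = Variable(np.zeros((4, 8), dtype=np.float32))
ln_var = Variable(np.zeros((4, 8), dtype=np.float32))

# KL(N(mean, exp(ln_var)) || N(0, I)), summed over all elements.
kl = F.gaussian_kl_divergence(mean, ln_var)
print(kl.data)  # 0.0 for mean = 0, ln_var = 0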
Example #3
def gaussian_kl_divergence(mean, ln_var):
    """Computes the KL-divergence of Gaussian variables from the standard one.

    Given two variables ``mean`` representing :math:`\\mu` and ``ln_var``
    representing :math:`\\log(\\sigma^2)`, this function returns a variable
    representing the KL-divergence between the given multi-dimensional Gaussian
    :math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)`

    .. math::

       D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)),

    where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2`
    and :math:`I` is an identity matrix.

    Args:
        mean (~chainer.Variable): A variable representing mean of given
            gaussian distribution, :math:`\\mu`.
        ln_var (~chainer.Variable): A variable representing logarithm of
            variance of given gaussian distribution, :math:`\\log(\\sigma^2)`.

    Returns:
        ~chainer.Variable: A variable representing KL-divergence between
            given gaussian distribution and the standard gaussian.

    """
    assert isinstance(mean, variable.Variable)
    assert isinstance(ln_var, variable.Variable)

    J = mean.size
    var = exponential.exp(ln_var)
    return (sum.sum(mean * mean) + sum.sum(var) - sum.sum(ln_var) - J) * 0.5
Example #4
def average(x, axis=None, weights=None, keepdims=False):
    """Calculate weighted average of array elements over a given axis.

    Args:
        x (~chainer.Variable): Elements to average.
        axis (None or int or tuple of int): Axis along which the average is
            computed. With the default (``axis = None``) it computes the mean
            over all the dimensions of the input array.
        weights (None or chainer.Variable): An array holding weights to
            calculate the weighted average. If it is ``None``, all weights
            are assumed to be one.
            When ``axis`` is ``None``, ``weights`` must have the same shape
            as ``x``. When ``axis`` is ``int``, it must be a 1-D array
            satisfying ``weights.shape == (x.shape[axis],)``.
        keepdims (bool): If ``True``, the specified axes remain as axes
            of length one.

    Returns:
        ~chainer.Variable: Output variable.

    """
    if axis is None:
        pass
    elif isinstance(axis, tuple):
        axis = [a + x.ndim if a < 0 else a for a in axis]
        axis.sort()
        for a, b in six.moves.zip(axis, axis[1:]):
            if a == b:
                raise ValueError('duplicate value in \'axis\'')
        axis = tuple(axis)
    else:
        if axis < 0:
            axis += x.ndim
        axis = (axis, )

    if weights is not None:
        if axis is not None and len(axis) > 1:
            raise ValueError(
                'tuple axis is not supported when weights is given')
        divider = sum_mod.sum(weights)
        if axis is not None:
            w_shape = [d if i in axis else 1 for i, d in enumerate(x.shape)]
            weights = broadcast.broadcast_to(reshape.reshape(weights, w_shape),
                                             x.shape)

        x = x * weights
    else:
        if axis is None:
            divider = x.size
        else:
            divider = 1
            for a in axis:
                divider *= x.shape[a]

    x_sum = sum_mod.sum(x, axis, keepdims)
    if weights is not None:
        # We do not need to call broadcast when weights is None because
        # divider here is not a Variable but a scalar
        divider = broadcast.broadcast_to(divider, x_sum.shape)
    return x_sum / divider
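A short usage sketch, assuming the public chainer.functions.average and dummy data; the 1-D weights vector must match the length of the reduced axis:

import numpy as np
from chainer import Variable
import chainer.functions as F

x = Variable(np.arange(6, dtype=np.float32).reshape(2, 3))
w = Variable(np.array([1.0, 2.0, 3.0], dtype=np.float32))

# Weighted average along axis 1; weights.shape == (x.shape[1],).
y = F.average(x, axis=1, weights=w)
print(y.data)  # [(0*1 + 1*2 + 2*3) / 6, (3*1 + 4*2 + 5*3) / 6]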
Example #5
def bernoulli_nll(x, y):
    """Computes the negative log-likelihood of a Bernoulli distribution.

    This function calculates the negative log-likelihood of a Bernoulli
    distribution.

    .. math::

        -\\log B(x; p) = -\\sum_i \\{x_i \\log(p_i) + (1 - x_i)\\log(1 - p_i)\\},

    where :math:`p = \\sigma(y)`, :math:`\\sigma(\\cdot)` is a sigmoid
    function, and :math:`B(x; p)` is a Bernoulli distribution.

    .. note::

       As this function uses a sigmoid function, you can pass a result of
       fully-connected layer (that means :class:`Linear`) to this function
       directly.

    Args:
        x (~chainer.Variable): Input variable.
        y (~chainer.Variable): A variable representing the parameter of
            Bernoulli distribution.

    Returns:
        ~chainer.Variable: A variable representing negative log-likelihood.

    """
    assert isinstance(x, variable.Variable)
    assert isinstance(y, variable.Variable)

    return sum.sum(softplus.softplus(y)) - sum.sum(x * y)
Example #6
def average(x, axis=None, weights=None, keepdims=False):
    """Calculate weighted average of array elements over a given axis.

    Args:
        x (~chainer.Variable): Elements to average.
        axis (None or int or tuple of int): Axis along which the average is
            computed. With the default (``axis = None``) it computes the mean
            over all the dimensions of the input array.
        weights (None or chainer.Variable): An array holding weights to
            calculate the weighted average. If it is ``None``, all weights
            are assumed to be one.
            When ``axis`` is ``None``, ``weights`` must have the same shape
            as ``x``. When ``axis`` is ``int``, it must be a 1-D array
            satisfying ``weights.shape == (x.shape[axis],)``.
        keepdims (bool): If ``True``, the specified axes remain as axes
            of length one.

    Returns:
        ~chainer.Variable: Output variable.

    """
    if axis is None:
        pass
    elif isinstance(axis, tuple):
        axis = [a + x.ndim if a < 0 else a for a in axis]
        axis.sort()
        for a, b in six.moves.zip(axis, axis[1:]):
            if a == b:
                raise ValueError('duplicate value in \'axis\'')
        axis = tuple(axis)
    else:
        if axis < 0:
            axis += x.ndim
        axis = (axis,)

    if weights is not None:
        if axis is not None and len(axis) > 1:
            raise ValueError(
                'tuple axis is not supported when weights is given')
        divider = sum_mod.sum(weights)
        if axis is not None:
            w_shape = [d if i in axis else 1 for i, d in enumerate(x.shape)]
            weights = broadcast.broadcast_to(
                reshape.reshape(weights, w_shape), x.shape)

        x = x * weights
    else:
        if axis is None:
            divider = x.size
        else:
            divider = 1
            for a in axis:
                divider *= x.shape[a]

    x_sum = sum_mod.sum(x, axis, keepdims)
    if weights is not None:
        # We do not need to call broadcast when weights is None because
        # divider here is not a Variable but a scalar
        divider = broadcast.broadcast_to(divider, x_sum.shape)
    return x_sum / divider
Example #7
 def _normalize(self, x):
     size = x.shape[1]
     mean = broadcast.broadcast_to((sum.sum(x, axis=1) / size)[:, None],
                                   x.shape)
     std = broadcast.broadcast_to(
         sqrt.sqrt(sum.sum(square.square(x - mean), axis=1) /
                   size)[:, None], x.shape) + self.eps
     return (x - mean) / std
Example #8
 def _normalize(self, x):
     size = x.shape[1]
     mean = broadcast.broadcast_to(
         (sum.sum(x, axis=1) / size)[:, None],
         x.shape)
     std = broadcast.broadcast_to(sqrt.sqrt(
         sum.sum(square.square(x - mean), axis=1) / size)[:, None],
         x.shape) + self.eps
     return (x - mean) / std
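For reference, a plain NumPy sketch of the per-row standardization these _normalize methods compute; normalize_rows and the eps value are assumptions used only for illustration (the original reads eps from self.eps):

import numpy as np

def normalize_rows(x, eps=1e-6):
    # Standardize each row: zero mean, unit (population) std, with eps added
    # to the denominator for numerical stability.
    mean = x.mean(axis=1, keepdims=True)
    std = np.sqrt(((x - mean) ** 2).mean(axis=1, keepdims=True)) + eps
    return (x - mean) / std

x = np.random.randn(2, 5).astype(np.float32)
y = normalize_rows(x)
print(y.mean(axis=1))  # close to 0 per row
print(y.std(axis=1))   # close to 1 per row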
Example #9
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    scale_tril_inv2 = _batch_triangular_inv(
        dist2.scale_tril.reshape(-1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d))**2,
                        axis=(-1, -2)).reshape(dist1.batch_shape)

    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah**2, axis=-2).reshape(dist1.batch_shape)
    return dist2._logdet_scale - dist1._logdet_scale \
        + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
Example #10
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape(
        -1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)

    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)
    return dist2._logdet_scale - dist1._logdet_scale \
        + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
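To make the log-determinant, trace, and Mahalanobis terms above concrete, here is a single-pair NumPy sketch of the same closed-form KL; kl_mvn is a hypothetical helper, not part of Chainer:

import numpy as np

def kl_mvn(mu1, L1, mu2, L2):
    # KL(N(mu1, L1 L1^T) || N(mu2, L2 L2^T)) for one pair of Gaussians,
    # mirroring the batched terms in the code above.
    d = mu1.shape[0]
    L2_inv = np.linalg.inv(L2)
    logdet1 = np.log(np.abs(np.diag(L1))).sum()
    logdet2 = np.log(np.abs(np.diag(L2))).sum()
    trace = ((L2_inv @ L1) ** 2).sum()          # tr(S2^{-1} S1)
    mah = ((L2_inv @ (mu1 - mu2)) ** 2).sum()   # Mahalanobis distance term
    return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * d

d = 3
A, B = np.random.randn(d, d), np.random.randn(d, d)
L1 = np.linalg.cholesky(A @ A.T + d * np.eye(d))
L2 = np.linalg.cholesky(B @ B.T + d * np.eye(d))
print(kl_mvn(np.zeros(d), L1, np.zeros(d), L2))  # non-negative
print(kl_mvn(np.zeros(d), L1, np.zeros(d), L1))  # ~0 for identical inputs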
Example #11
def black_out(x, t, W, samples):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is the likelihood of a given label.
    And, :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    Args:
        x (~chainer.Variable): Batch of input vectors.
        t (~chainer.Variable): Vector of ground truth labels.
        W (~chainer.Variable): Weight matrix.
        samples (~chainer.Variable): Negative samples.

    Returns:
        ~chainer.Variable: Loss value.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \
         Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """

    batch_size = x.shape[0]

    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.batch_matmul(neg_emb, x)
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])

    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.batch_matmul(pos_emb, x)
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(reshape.reshape(logz, (batch_size, 1)),
                                        neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size, ))
    loss = py - logz + _sum.sum(ny, axis=1)
    return -_sum.sum(loss) / batch_size
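A hedged usage sketch with random dummy data, assuming the public chainer.functions.black_out; in practice the negative samples come from a sampler over the vocabulary:

import numpy as np
from chainer import Variable
import chainer.functions as F

N, D, V, S = 4, 8, 20, 5   # batch size, feature dim, vocab size, negatives
x = Variable(np.random.randn(N, D).astype(np.float32))
W = Variable(np.random.randn(V, D).astype(np.float32))
t = Variable(np.random.randint(0, V, size=N).astype(np.int32))
samples = Variable(np.random.randint(0, V, size=(N, S)).astype(np.int32))

loss = F.black_out(x, t, W, samples)  # scalar loss averaged over the batch
print(loss.data)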
Example #12
def black_out(x, t, W, samples):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is the likelihood of a given label.
    And, :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    Args:
        x (~chainer.Variable): Batch of input vectors.
        t (~chainer.Variable): Vector of ground truth labels.
        W (~chainer.Variable): Weight matrix.
        samples (~chainer.Variable): Negative samples.

    Returns:
        ~chainer.Variable: Loss value.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \
         Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """

    batch_size = x.shape[0]

    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.batch_matmul(neg_emb, x)
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])

    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.batch_matmul(pos_emb, x)
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(
        reshape.reshape(logz, (batch_size, 1)), neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size,))
    loss = py - logz + _sum.sum(ny, axis=1)
    return -_sum.sum(loss) / batch_size
Example #13
    def predict(self, images, oversample=True):
        """Computes all the probabilities of given images.

        Args:
            images (iterable of PIL.Image or numpy.ndarray): Input images.
            oversample (bool): If ``True``, it averages results across
                center, corners, and mirrors. Otherwise, it uses only the
                center.

        Returns:
            ~chainer.Variable: Output that contains the class probabilities
            of given images.

        """

        x = concat_examples([prepare(img, size=(256, 256)) for img in images])
        if oversample:
            x = imgproc.oversample(x, crop_dims=(224, 224))
        else:
            x = x[:, :, 16:240, 16:240]
        # Set volatile option to ON to reduce memory consumption
        x = Variable(self.xp.asarray(x), volatile=flag.ON)
        y = self(x, layers=['prob'])['prob']
        if oversample:
            n = y.data.shape[0] // 10
            y_shape = y.data.shape[1:]
            y = reshape(y, (n, 10) + y_shape)
            y = sum(y, axis=1) / 10
        return y
Example #14
    def predict(self, images, oversample=True):
        """Computes all the probabilities of given images.

        Args:
            images (iterable of PIL.Image or numpy.ndarray): Input images.
            oversample (bool): If ``True``, it averages results across
                center, corners, and mirrors. Otherwise, it uses only the
                center.

        Returns:
            ~chainer.Variable: Output that contains the class probabilities
            of given images.

        """

        x = concat_examples([prepare(img, size=(256, 256)) for img in images])
        if oversample:
            x = imgproc.oversample(x, crop_dims=(224, 224))
        else:
            x = x[:, :, 16:240, 16:240]
        # Use no_backprop_mode to reduce memory consumption
        with function.no_backprop_mode():
            x = Variable(self.xp.asarray(x))
            y = self(x, layers=['prob'])['prob']
            if oversample:
                n = y.data.shape[0] // 10
                y_shape = y.data.shape[1:]
                y = reshape(y, (n, 10) + y_shape)
                y = sum(y, axis=1) / 10
        return y
Example #15
File: vgg.py Project: km-t/dcpython
    def predict(self, images, oversample=True):
        """Computes all the probabilities of given images.

        Args:
            images (iterable of PIL.Image or numpy.ndarray): Input images.
                When you specify a color image as a :class:`numpy.ndarray`,
                make sure that color order is RGB.
            oversample (bool): If ``True``, it averages results across
                center, corners, and mirrors. Otherwise, it uses only the
                center.

        Returns:
            ~chainer.Variable: Output that contains the class probabilities
            of given images.

        """

        x = concat_examples([prepare(img, size=(256, 256)) for img in images])
        if oversample:
            x = imgproc.oversample(x, crop_dims=(224, 224))
        else:
            x = x[:, :, 16:240, 16:240]
        # Use no_backprop_mode to reduce memory consumption
        with function.no_backprop_mode(), chainer.using_config('train', False):
            x = Variable(self.xp.asarray(x))
            y = self(x, layers=['prob'])['prob']
            if oversample:
                n = len(y) // 10
                y_shape = y.shape[1:]
                y = reshape(y, (n, 10) + y_shape)
                y = sum(y, axis=1) / 10
        return y
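The oversampling average in isolation, as a NumPy sketch: the 10 crops of each image are stacked along the batch axis, so the predictions are regrouped as (n, 10, ...) and averaged over the crop axis (shapes below are dummy values):

import numpy as np

n, n_class = 3, 5
probs_per_crop = np.random.rand(n * 10, n_class).astype(np.float32)
probs = probs_per_crop.reshape(n, 10, n_class).mean(axis=1)
print(probs.shape)  # (3, 5): one averaged prediction per original image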
Example #16
def gaussian_nll(x, mean, ln_var, reduce='sum'):
    """Computes the negative log-likelihood of a Gaussian distribution.

    Given two variables ``mean`` representing :math:`\\mu` and ``ln_var``
    representing :math:`\\log(\\sigma^2)`, this function computes, in an
    elementwise manner, the negative log-likelihood of :math:`x` on a
    Gaussian distribution :math:`N(\\mu, S)`,

    .. math::

        -\\log N(x; \\mu, \\sigma^2) =
        \\log\\left(\\sqrt{(2\\pi)^D |S|}\\right) +
        \\frac{1}{2}(x - \\mu)^\\top S^{-1}(x - \\mu),

    where :math:`D` is a dimension of :math:`x` and :math:`S` is a diagonal
    matrix where :math:`S_{ii} = \\sigma_i^2`.

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the elementwise
    loss values. If it is ``'sum'`` or ``'mean'``, loss values are summed up
    or averaged respectively.

    Args:
        x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`): Input variable.
        mean (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`): A variable representing mean of a Gaussian
            distribution, :math:`\\mu`.
        ln_var (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`): A variable representing logarithm of
            variance of a Gaussian distribution, :math:`\\log(\\sigma^2)`.
        reduce (str): Reduction option. Its value must be either
            ``'sum'``, ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError`
            is raised.

    Returns:
        ~chainer.Variable:
            A variable representing the negative log-likelihood.
            If ``reduce`` is ``'no'``, the output variable holds an array
            whose shape is the same as that of the input variables.
            If it is ``'sum'`` or ``'mean'``, the output variable holds a
            scalar value.

    """
    if reduce not in ('sum', 'mean', 'no'):
        raise ValueError(
            "only 'sum', 'mean' and 'no' are valid for 'reduce', but '%s'"
            ' is given' % reduce)

    x_prec = exponential.exp(-ln_var)
    x_diff = x - mean
    x_power = (x_diff * x_diff) * x_prec * -0.5
    loss = (ln_var + math.log(2 * math.pi)) / 2 - x_power
    if reduce == 'sum':
        return sum.sum(loss)
    elif reduce == 'mean':
        return average.average(loss)
    else:
        return loss
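A short usage sketch, assuming the public chainer.functions.gaussian_nll; the reduce option switches between a summed scalar and the elementwise losses:

import numpy as np
from chainer import Variable
import chainer.functions as F

x = Variable(np.random.randn(3, 4).astype(np.float32))
mean = Variable(np.zeros((3, 4), dtype=np.float32))
ln_var = Variable(np.zeros((3, 4), dtype=np.float32))

print(F.gaussian_nll(x, mean, ln_var).data)                # summed scalar
print(F.gaussian_nll(x, mean, ln_var, reduce='no').shape)  # (3, 4)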
Example #17
def crf1d(cost, xs, ys):

    """Calculates negative log-likelihood of linear-chain CRF.

    It takes a transition cost matrix, a sequence of costs, and a sequence of
    labels. Let :math:`c_{st}` be a transition cost from a label :math:`s` to
    a label :math:`t`, :math:`x_{it}` be a cost of a label :math:`t` at
    position :math:`i`, and :math:`y_i` be an expected label at position
    :math:`i`. The negative log-likelihood of linear-chain CRF is defined as

    .. math::
        L = -\\left( \\sum_{i=1}^l x_{iy_i} + \\
             \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - {\\log(Z)} \\right) ,

    where :math:`l` is the length of the input sequence and :math:`Z` is the
    normalizing constant called partition function.

    Args:
        cost (Variable): A :math:`K \\times K` matrix which holds transition
            cost between two labels, where :math:`K` is the number of labels.
        xs (list of Variable): Input feature vector for each label. Each
            :class:`~chainer.Variable` holds a :math:`B \\times K`
            matrix, where :math:`B` is mini-batch size, :math:`K` is the number
            of labels.
        ys (list of Variable): Expected output labels. Each
            :class:`~chainer.Variable` holds a :math:`B` integer vector.

    Returns:
        ~chainer.Variable: A variable holding the average negative
            log-likelihood of the input sequences.

    .. note::

        See detail in the original paper: `Conditional Random Fields:
        Probabilistic Models for Segmenting and Labeling Sequence Data
        <http://repository.upenn.edu/cis_papers/159/>`_.

    """
    assert xs[0].data.shape[1] == cost.data.shape[0]

    n_label = cost.data.shape[0]
    n_batch = xs[0].data.shape[0]

    alpha = xs[0]
    for x in xs[1:]:
        b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost)
        alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x

    logz = logsumexp.logsumexp(alpha, axis=1)

    score = 0
    cost = reshape.reshape(cost, (cost.data.size, 1))
    for y1, y2 in zip(ys[:-1], ys[1:]):
        score += reshape.reshape(
            embed_id.embed_id(y1 * n_label + y2, cost), (n_batch,))
    for x, y in zip(xs, ys):
        score += select_item.select_item(x, y)

    return _sum.sum(logz - score) / n_batch
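A minimal usage sketch with random data, assuming the public chainer.functions.crf1d; cost is a K x K transition matrix, xs holds per-position unaries of shape (B, K), and ys holds per-position integer labels of shape (B,):

import numpy as np
from chainer import Variable
import chainer.functions as F

B, K, T = 2, 3, 4   # batch size, number of labels, sequence length
cost = Variable(np.random.randn(K, K).astype(np.float32))
xs = [Variable(np.random.randn(B, K).astype(np.float32)) for _ in range(T)]
ys = [Variable(np.random.randint(0, K, size=B).astype(np.int32))
      for _ in range(T)]

loss = F.crf1d(cost, xs, ys)  # averaged negative log-likelihood
print(loss.data)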
Example #18
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    diag = diagonal.diagonal(dist1.scale_tril, axis1=-2, axis2=-1)
    logdet1 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)

    diag = diagonal.diagonal(dist2.scale_tril, axis1=-2, axis2=-1)
    logdet2 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)

    scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape(
        -1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)

    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)
    return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
Example #19
 def backward(self, indexes, grad_outputs):
     x, gy0 = self.get_retained_inputs()
     gy0 = gy0.reshape(-1, *((1, ) * (x.ndim - 1)))
     gy0 = chainer.functions.broadcast_to(gy0, x.shape)
     ggx2 = 2 * grad_outputs[0]
     gx = ggx2 * gy0
     ggy0 = ggx2 * x
     return gx, _sum.sum(ggy0, axis=tuple(six.moves.range(1, ggy0.ndim)))
Example #20
 def backward(self, indexes, grad_outputs):
     x, gy0 = self.get_retained_inputs()
     gy0 = gy0.reshape(-1, *((1,) * (x.ndim - 1)))
     gy0 = chainer.functions.broadcast_to(gy0, x.shape)
     ggx2 = 2 * grad_outputs[0]
     gx = ggx2 * gy0
     ggy0 = ggx2 * x
     return gx, _sum.sum(ggy0, axis=tuple(six.moves.range(1, ggy0.ndim)))
Example #21
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    diag = diagonal.diagonal(dist1.scale_tril, axis1=-2, axis2=-1)
    logdet1 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)

    diag = diagonal.diagonal(dist2.scale_tril, axis1=-2, axis2=-1)
    logdet2 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)

    scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape(
        -1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)

    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)
    return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
Example #22
 def gaussian_kl_divergence(self, mu1, ln_var1, mu2, ln_var2):
     # D_KL [ N(z ; mu1, var1) || N(z; mu2, var2) ]
     var1 = exponential.exp(ln_var1)
     inv_var2 = exponential.exp(-ln_var2)
     mu_diff = mu2 - mu1
     term1 = (var1 + mu_diff * mu_diff) * inv_var2
     loss = (term1 - ln_var1 + ln_var2 - 1.) * 0.5
     return sum.sum(loss)
Example #23
 def entropy(self):
     return (
         _lbeta(self.alpha)
         + ((self.alpha0 - self.event_shape[0])
            * digamma.digamma(self.alpha0))
         - sum_mod.sum(
             (self.alpha - 1) * digamma.digamma(self.alpha),
             axis=-1))
Example #24
    def encode_decode_train(self,
                            in_word_list,
                            out_word_list,
                            train=True,
                            sample=False):
        xp = cuda.cupy if self.gpuid >= 0 else np
        self.reset_state()
        # Add GO_ID, EOS_ID to decoder input
        decoder_word_list = [GO_ID] + out_word_list + [EOS_ID]
        # encode list of words/tokens
        enc_states = self.encode_list(in_word_list, train=train)
        # initialize decoder LSTM to final encoder state
        self.set_decoder_state()
        # decode and compute loss
        # convert list of tokens into chainer variable list
        var_dec = (Variable(xp.asarray(decoder_word_list,
                                       dtype=np.int32).reshape((-1, 1)),
                            volatile=not train))
        # Initialise first decoded word to GOID
        pred_word = Variable(xp.asarray([GO_ID], dtype=np.int32),
                             volatile=not train)

        # compute loss
        self.loss = 0
        # decode tokens
        for next_word_var in var_dec[1:]:
            self.decode(pred_word, train=train)
            if self.attn == NO_ATTN:
                predicted_out = self.out(self[self.lstm_dec[-1]].h)
            else:
                ''' __QUESTION Add attention '''
                prevh = self[self.lstm_dec[-1]].h
                alpha = F.softmax(matmul(prevh, enc_states, transb=True))
                ctxt = F.reshape(
                    M.sum(F.scale(enc_states, F.transpose(alpha), axis=0),
                          axis=0), (1, 200))
                predicted_out = self.out(self.attn_out(F.concat(
                    (ctxt, prevh))))

            # compute loss
            prob = F.softmax(predicted_out)

            pred_word = self.select_word(prob, train=train, sample=False)
            # pred_word = Variable(xp.asarray([pred_word.data], dtype=np.int32), volatile=not train)
            '''
            ___QUESTION-1-DESCRIBE-E-START___
            Explain what loss is computed with an example. What does this value mean?

            The cross-entropy is a soft measure of how close the network got to the
            correct answer. Here it is used to find how close the predicted word
            (predicted_out) was to the expected word (next_word_var).
            '''
            self.loss += F.softmax_cross_entropy(predicted_out, next_word_var)
            '''___QUESTION-1-DESCRIBE-E-END___'''

        report({"loss": self.loss}, self)

        return self.loss
Example #25
def gaussian_nll(x, mean, ln_var, reduce='sum'):
    """Computes the negative log-likelihood of a Gaussian distribution.

    Given two variables ``mean`` representing :math:`\\mu` and ``ln_var``
    representing :math:`\\log(\\sigma^2)`, this function computes, in an
    elementwise manner, the negative log-likelihood of :math:`x` on a
    Gaussian distribution :math:`N(\\mu, S)`,

    .. math::

        -\\log N(x; \\mu, \\sigma^2) =
        \\log\\left(\\sqrt{(2\\pi)^D |S|}\\right) +
        \\frac{1}{2}(x - \\mu)^\\top S^{-1}(x - \\mu),

    where :math:`D` is a dimension of :math:`x` and :math:`S` is a diagonal
    matrix where :math:`S_{ii} = \\sigma_i^2`.

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the elementwise
    loss values. If it is ``'sum'`` or ``'mean'``, loss values are summed up
    or averaged respectively.

    Args:
        x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable.
        mean (:class:`~chainer.Variable` or :ref:`ndarray`): A variable
            representing mean of a Gaussian distribution, :math:`\\mu`.
        ln_var (:class:`~chainer.Variable` or :ref:`ndarray`): A variable
            representing logarithm of variance of a Gaussian distribution,
            :math:`\\log(\\sigma^2)`.
        reduce (str): Reduction option. Its value must be either
            ``'sum'``, ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError`
            is raised.

    Returns:
        ~chainer.Variable:
            A variable representing the negative log-likelihood.
            If ``reduce`` is ``'no'``, the output variable holds an array
            whose shape is the same as that of the input variables.
            If it is ``'sum'`` or ``'mean'``, the output variable holds a
            scalar value.

    """
    if reduce not in ('sum', 'mean', 'no'):
        raise ValueError(
            'only \'sum\', \'mean\' and \'no\' are valid for \'reduce\', but '
            '\'%s\' is given' % reduce)

    x_prec = exponential.exp(-ln_var)
    x_diff = x - mean
    x_power = (x_diff * x_diff) * x_prec * -0.5
    loss = (ln_var + math.log(2 * math.pi)) / 2 - x_power
    if reduce == 'sum':
        return sum.sum(loss)
    elif reduce == 'mean':
        return average.average(loss)
    else:
        return loss
Example #26
def bernoulli_nll(x, y, reduce='sum'):
    """Computes the negative log-likelihood of a Bernoulli distribution.

    This function calculates the negative log-likelihood of a Bernoulli
    distribution.

    .. math::

        -\\log B(x; p) = -\\sum_i \\{x_i \\log(p_i) + \
        (1 - x_i)\\log(1 - p_i)\\},

    where :math:`p = \\sigma(y)`, :math:`\\sigma(\\cdot)` is a sigmoid
    function, and :math:`B(x; p)` is a Bernoulli distribution.


    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the elementwise
    loss values. If it is ``'sum'`` or ``'mean'``, loss values are summed up
    or averaged respectively.

    .. note::

       As this function uses a sigmoid function, you can pass a result of
       fully-connected layer (that means :class:`Linear`) to this function
       directly.

    Args:
        x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`): Input variable.
        y (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \
        :class:`cupy.ndarray`): A variable representing the parameter of
            Bernoulli distribution.
        reduce (str): Reduction option. Its value must be either
            ``'sum'``, ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError`
            is raised.

    Returns:
        ~chainer.Variable:
            A variable representing the negative log-likelihood.
            If ``reduce`` is ``'no'``, the output variable holds an array
            whose shape is the same as that of the input variables.
            If it is ``'sum'`` or ``'mean'``, the output variable holds a
            scalar value.

    """
    if reduce not in ('sum', 'mean', 'no'):
        raise ValueError(
            "only 'sum', 'mean' and 'no' are valid for 'reduce', but '%s'"
            ' is given' % reduce)

    loss = softplus.softplus(y) - x * y
    if reduce == 'sum':
        return sum.sum(loss)
    elif reduce == 'mean':
        return average.average(loss)
    else:
        return loss
Example #27
def soft_dtw_grad(D_bar, G, verbose=1):
    xp = cuda.get_array_module(G)
    if verbose > 0:
        print('Computing final gradient')
    d1, d2, d3, m, n = G.shape
    assert D_bar.shape == (m, n)
    final_G = Variable(xp.zeros((d1, d2, d3, m), dtype=np.float64))
    for i in range(m):
        final_G.data[:, :, :, i] = sum(D_bar[i] * G[:, :, :, i, :], axis=-1)
    return final_G
Example #28
def average(x, axis=None, weights=None, keepdims=False):
    """Calculate weighted average of array elements over a given axis.

    Args:
        x (~chainer.Variable): Elements to average.
        axis (None or int): Axis along which the average is computed.
            With the default (``axis = None``) it computes the mean over all
            the dimensions of the input array.
        weights (None or chainer.Variable): An array holding weights to
            calculate the weighted average. If it is ``None``, all weights
            are assumed to be one.
            When ``axis`` is ``None``, ``weights`` must have the same shape
            as ``x``. When ``axis`` is ``int``, it must be a 1-D array
            satisfying ``weights.shape == (x.shape[axis],)``.
        keepdims (bool): If ``True``, the specified axes remain as axes
            of length one.

    Returns:
        ~chainer.Variable: Output variable.

    """
    if weights is not None:
        divider = sum_mod.sum(weights)
        if axis is not None:
            if axis < 0:
                axis += x.ndim
            w_shape = [d if i == axis else 1 for i, d in enumerate(x.shape)]
            weights = broadcast.broadcast_to(
                reshape.reshape(weights, w_shape), x.shape)

        x = x * weights
    else:
        if axis is None:
            divider = x.size
        else:
            divider = x.shape[axis]

    x_sum = sum_mod.sum(x, axis, keepdims)
    if weights is not None:
        # We do not need to call broadcast when weights is None because
        # divider here is not a Variable but a scalar
        divider = broadcast.broadcast_to(divider, x_sum.shape)
    return x_sum / divider
Example #29
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    st = moveaxis.moveaxis(dist1.scale_tril, (-2, -1), (0, 1))
    diag = st[list(range(dist1.d)), list(range(dist1.d))]
    logdet1 = sum_mod.sum(exponential.log(basic_math.absolute(diag)), axis=0)

    st = moveaxis.moveaxis(dist2.scale_tril, (-2, -1), (0, 1))
    diag = st[list(range(dist2.d)), list(range(dist2.d))]
    logdet2 = sum_mod.sum(exponential.log(basic_math.absolute(diag)), axis=0)

    scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape(
        -1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)

    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)
    return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
Example #30
def average(x, axis=None, weights=None, keepdims=False):
    """Calculate weighted average of array elements over a given axis.

    Args:
        x (~chainer.Variable): Elements to average.
        axis (None or int): Axis along which the average is computed.
            With the default (``axis = None``) it computes the mean over all
            the dimensions of the input array.
        weights (None or chainer.Variable): An array holding weights to
            calculate the weighted average. If it is ``None``, all weights
            are assumed to be one.
            When ``axis`` is ``None``, ``weights`` must have the same shape
            as ``x``. When ``axis`` is ``int``, it must be a 1-D array
            satisfying ``weights.shape == (x.shape[axis],)``.
        keepdims (bool): If ``True``, the specified axes remain as axes
            of length one.

    Returns:
        ~chainer.Variable: Output variable.

    """
    if weights is not None:
        divider = sum_mod.sum(weights)
        if axis is not None:
            if axis < 0:
                axis += x.ndim
            w_shape = [d if i == axis else 1 for i, d in enumerate(x.shape)]
            weights = broadcast.broadcast_to(reshape.reshape(weights, w_shape),
                                             x.shape)

        x = x * weights
    else:
        if axis is None:
            divider = x.size
        else:
            divider = x.shape[axis]

    x_sum = sum_mod.sum(x, axis, keepdims)
    if weights is not None:
        # We do not need to call broadcast when weights is None because
        # divider here is not a Variable but a scalar
        divider = broadcast.broadcast_to(divider, x_sum.shape)
    return x_sum / divider
Example #31
def gaussian_kl_divergence(mean, ln_var, reduce='sum'):
    """Computes the KL-divergence of Gaussian variables from the standard one.

    Given two variables ``mean`` representing :math:`\\mu` and ``ln_var``
    representing :math:`\\log(\\sigma^2)`, this function calculates the
    KL-divergence in an elementwise manner between the given multi-dimensional
    Gaussian :math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)`

    .. math::

       D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)),

    where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2`
    and :math:`I` is an identity matrix.

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the elementwise
    loss values. If it is ``'sum'`` or ``'mean'``, loss values are summed up
    or averaged respectively.

    Args:
        mean (:class:`~chainer.Variable` or :ref:`ndarray`):
            A variable representing mean of given
            gaussian distribution, :math:`\\mu`.
        ln_var (:class:`~chainer.Variable` or :ref:`ndarray`):
            A variable representing logarithm of
            variance of given gaussian distribution, :math:`\\log(\\sigma^2)`.
        reduce (str): Reduction option. Its value must be either
            ``'sum'``, ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError`
            is raised.

    Returns:
        ~chainer.Variable:
            A variable representing KL-divergence between
            given gaussian distribution and the standard gaussian.
            If ``reduce`` is ``'no'``, the output variable holds an array
            whose shape is the same as that of the input variables.
            If it is ``'sum'`` or ``'mean'``, the output variable holds a
            scalar value.

    """
    if reduce not in ('sum', 'mean', 'no'):
        raise ValueError(
            "only 'sum', 'mean' and 'no' are valid for 'reduce', but '%s'"
            ' is given' % reduce)

    var = exponential.exp(ln_var)
    mean_square = mean * mean
    loss = (mean_square + var - ln_var - 1) * 0.5
    if reduce == 'sum':
        return sum.sum(loss)
    elif reduce == 'mean':
        return average.average(loss)
    else:
        return loss
Example #32
def gaussian_kl_divergence(mean, ln_var, reduce='sum'):
    """Computes the KL-divergence of Gaussian variables from the standard one.

    Given two variables ``mean`` representing :math:`\\mu` and ``ln_var``
    representing :math:`\\log(\\sigma^2)`, this function calculates the
    KL-divergence in an elementwise manner between the given multi-dimensional
    Gaussian :math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)`

    .. math::

       D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)),

    where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2`
    and :math:`I` is an identity matrix.

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the elementwise
    loss values. If it is ``'sum'`` or ``'mean'``, loss values are summed up
    or averaged respectively.

    Args:
        mean (:class:`~chainer.Variable` or :ref:`ndarray`):
            A variable representing mean of given
            gaussian distribution, :math:`\\mu`.
        ln_var (:class:`~chainer.Variable` or :ref:`ndarray`):
            A variable representing logarithm of
            variance of given gaussian distribution, :math:`\\log(\\sigma^2)`.
        reduce (str): Reduction option. Its value must be either
            ``'sum'``, ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError`
            is raised.

    Returns:
        ~chainer.Variable:
            A variable representing KL-divergence between
            given gaussian distribution and the standard gaussian.
            If ``reduce`` is ``'no'``, the output variable holds an array
            whose shape is the same as that of the input variables.
            If it is ``'sum'`` or ``'mean'``, the output variable holds a
            scalar value.

    """
    if reduce not in ('sum', 'mean', 'no'):
        raise ValueError(
            'only \'sum\', \'mean\' and \'no\' are valid for \'reduce\', but '
            '\'%s\' is given' % reduce)

    var = exponential.exp(ln_var)
    mean_square = mean * mean
    loss = (mean_square + var - ln_var - 1) * 0.5
    if reduce == 'sum':
        return sum.sum(loss)
    elif reduce == 'mean':
        return average.average(loss)
    else:
        return loss
Example #33
def bernoulli_nll(x, y, reduce='sum'):
    """Computes the negative log-likelihood of a Bernoulli distribution.

    This function calculates the negative log-likelihood of a Bernoulli
    distribution.

    .. math::

        -\\log B(x; p) = -\\sum_i \\{x_i \\log(p_i) + \
        (1 - x_i)\\log(1 - p_i)\\},

    where :math:`p = \\sigma(y)`, :math:`\\sigma(\\cdot)` is a sigmoid
    function, and :math:`B(x; p)` is a Bernoulli distribution.


    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the elementwise
    loss values. If it is ``'sum'`` or ``'mean'``, loss values are summed up
    or averaged respectively.

    .. note::

       As this function uses a sigmoid function, you can pass a result of
       fully-connected layer (that means :class:`Linear`) to this function
       directly.

    Args:
        x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable.
        y (:class:`~chainer.Variable` or :ref:`ndarray`): A variable
            representing the parameter of Bernoulli distribution.
        reduce (str): Reduction option. Its value must be either
            ``'sum'``, ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError`
            is raised.

    Returns:
        ~chainer.Variable:
            A variable representing the negative log-likelihood.
            If ``reduce`` is ``'no'``, the output variable holds an array
            whose shape is the same as that of the input variables.
            If it is ``'sum'`` or ``'mean'``, the output variable holds a
            scalar value.

    """
    if reduce not in ('sum', 'mean', 'no'):
        raise ValueError(
            'only \'sum\', \'mean\' and \'no\' are valid for \'reduce\', but '
            '\'%s\' is given' % reduce)

    loss = softplus.softplus(y) - x * y
    if reduce == 'sum':
        return sum.sum(loss)
    elif reduce == 'mean':
        return average.average(loss)
    else:
        return loss
Example #34
def _kl_dirichlet_dirichlet(dist1, dist2):
    return (
        - _lbeta(dist1.alpha)
        + _lbeta(dist2.alpha)
        + sum_mod.sum(
            (dist1.alpha - dist2.alpha)
            * (digamma.digamma(dist1.alpha)
               - expand_dims.expand_dims(
                   digamma.digamma(dist1.alpha0),
                   axis=-1)),
            axis=-1))
Example #35
def _sum_rightmost(value, dim):
    """Sum out `dim` many rightmost dimensions of a given tensor.

    Args:
        value (Tensor): A tensor with at least ``dim`` dimensions.
        dim (int): The number of rightmost dims to sum out.
    """
    if dim == 0:
        return value
    required_shape = value.shape[:-dim] + (-1, )
    return sum_mod.sum(reshape.reshape(value, required_shape), axis=-1)
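To illustrate the reshape-then-sum trick, a NumPy analogue; sum_rightmost_np is a hypothetical helper used only for this illustration:

import numpy as np

def sum_rightmost_np(value, dim):
    # Sum out the `dim` rightmost dimensions, as in the Chainer code above.
    if dim == 0:
        return value
    return value.reshape(value.shape[:-dim] + (-1,)).sum(axis=-1)

v = np.arange(24.0).reshape(2, 3, 4)
print(sum_rightmost_np(v, 1).shape)  # (2, 3): last axis summed out
print(sum_rightmost_np(v, 2).shape)  # (2,): last two axes summed out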
Example #36
def max_singular_value(W, u=None, Ip=1):
    """
    Apply power iteration for the weight parameter
    """
    xp = cuda.get_array_module(W.data)
    if u is None:
        u = xp.random.normal(size=(1, W.shape[0])).astype(xp.float32)
    _u = u
    for _ in range(Ip):
        _v = _l2normalize(xp.dot(_u, W.data), eps=1e-12)
        _u = _l2normalize(xp.dot(_v, W.data.transpose()), eps=1e-12)
    sigma = sum.sum(linear.linear(_u, transpose.transpose(W)) * _v)
    return sigma, _u, _v
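A standalone NumPy check of the same power iteration (an assumed sketch, not part of the project code): after enough iterations sigma should approach the largest singular value of W:

import numpy as np

def l2normalize(v, eps=1e-12):
    return v / (np.linalg.norm(v) + eps)

rng = np.random.RandomState(0)
W = rng.randn(6, 4).astype(np.float32)
u = l2normalize(rng.randn(1, W.shape[0]).astype(np.float32))
for _ in range(50):
    v = l2normalize(u.dot(W))      # right singular vector estimate
    u = l2normalize(v.dot(W.T))    # left singular vector estimate
sigma = u.dot(W).dot(v.T).item()
print(sigma, np.linalg.svd(W, compute_uv=False)[0])  # should be close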
Example #37
 def _log_det_jacobian(self, x, y):
     shape = x.shape
     scale = self.scale
     if isinstance(scale, numbers.Number):
         xp = cuda.get_array_module(x, y)
         result = exponential.log(basic_math.absolute(scale)) \
             * xp.ones(shape, dtype=x.dtype)
     else:
         result = exponential.log(basic_math.absolute(scale))
     if self.event_dim:
         result_size = result.shape[:-self.event_dim] + (-1, )
         result = sum_mod.sum(result.reshape(result_size), axis=-1)
         shape = shape[:-self.event_dim]
     return broadcast.broadcast_to(result, shape)
Example #38
def gaussian_nll(x, mean, ln_var):
    """Computes the negative log-likelihood of a Gaussian distribution.

    Given two variables ``mean`` representing :math:`\\mu` and ``ln_var``
    representing :math:`\\log(\\sigma^2)`, this function returns the negative
    log-likelihood of :math:`x` on a Gaussian distribution :math:`N(\\mu, S)`,

    .. math::

        -\\log N(x; \\mu, \\sigma^2) =
        \\log\\left(\\sqrt{(2\\pi)^D |S|}\\right) +
        \\frac{1}{2}(x - \\mu)^\\top S^{-1}(x - \\mu),

    where :math:`D` is a dimension of :math:`x` and :math:`S` is a diagonal
    matrix where :math:`S_{ii} = \\sigma_i^2`.

    Args:
        x (~chainer.Variable): Input variable.
        mean (~chainer.Variable): A variable representing mean of a Gaussian
            distribution, :math:`\\mu`.
        ln_var (~chainer.Variable): A variable representing logarithm of
            variance of a Gaussian distribution, :math:`\\log(\\sigma^2)`.

    Returns:
        ~chainer.Variable: A variable representing the negative log-likelihood.

    """
    assert isinstance(x, variable.Variable)
    assert isinstance(mean, variable.Variable)
    assert isinstance(ln_var, variable.Variable)

    D = x.size
    x_prec = exponential.exp(-ln_var)
    x_diff = x - mean
    x_power = (x_diff * x_diff) * x_prec * -0.5
    return (sum.sum(ln_var) + D * math.log(2 * math.pi)) / 2 - sum.sum(x_power)
Example #39
def gaussian_nll(x, mean, ln_var):
    """Computes the negative log-likelihood of a Gaussian distribution.

    Given two variables ``mean`` representing :math:`\\mu` and ``ln_var``
    representing :math:`\\log(\\sigma^2)`, this function returns the negative
    log-likelihood of :math:`x` on a Gaussian distribution :math:`N(\\mu, S)`,

    .. math::

        -\\log N(x; \\mu, \\sigma^2) =
        \\log\\left(\\sqrt{(2\\pi)^D |S|}\\right) +
        \\frac{1}{2}(x - \\mu)^\\top S^{-1}(x - \\mu),

    where :math:`D` is a dimension of :math:`x` and :math:`S` is a diagonal
    matrix where :math:`S_{ii} = \\sigma_i^2`.

    Args:
        x (~chainer.Variable): Input variable.
        mean (~chainer.Variable): A variable representing mean of a Gaussian
            distribution, :math:`\\mu`.
        ln_var (~chainer.Variable): A variable representing logarithm of
            variance of a Gaussian distribution, :math:`\\log(\\sigma^2)`.

    Returns:
        ~chainer.Variable: A variable representing the negative log-likelihood.

    """
    assert isinstance(x, variable.Variable)
    assert isinstance(mean, variable.Variable)
    assert isinstance(ln_var, variable.Variable)

    D = x.size
    x_prec = exponential.exp(-ln_var)
    x_diff = x - mean
    x_power = (x_diff * x_diff) * x_prec * -0.5
    return (sum.sum(ln_var) + D * math.log(2 * math.pi)) / 2 - sum.sum(x_power)
Example #40
    def decoder_predict(self,
                        start_word,
                        enc_states,
                        max_predict_len=MAX_PREDICT_LEN,
                        sample=False):
        xp = cuda.cupy if self.gpuid >= 0 else np

        # __QUESTION -- Following code is to assist with ATTENTION
        # alpha_arr should store the alphas for every predicted word
        alpha_arr = xp.empty((0, enc_states.shape[0]), dtype=xp.float32)

        # return list of predicted words
        predicted_sent = []
        # load start symbol
        pred_word = Variable(xp.asarray([start_word], dtype=np.int32),
                             volatile=True)
        pred_count = 0

        # start prediction loop
        while pred_count < max_predict_len and (int(pred_word.data) !=
                                                (EOS_ID)):
            self.decode(pred_word, train=False)

            if self.attn == NO_ATTN:
                predicted_out = self.out(self[self.lstm_dec[-1]].h)
            else:
                ''' __QUESTION Add attention '''
                prevh = self[self.lstm_dec[-1]].h
                alpha = F.softmax(matmul(prevh, enc_states, transb=True))
                ctxt = F.reshape(
                    M.sum(F.scale(enc_states, F.transpose(alpha), axis=0),
                          axis=0), (1, 200))
                alpha_arr = xp.concatenate((alpha_arr, alpha.data))
                predicted_out = self.out(self.attn_out(F.concat(
                    (ctxt, prevh))))

            prob = F.softmax(predicted_out)

            pred_word = self.select_word(prob, train=False, sample=sample)
            # add integer id of predicted word to output list
            predicted_sent.append(int(pred_word.data))
            pred_count += 1
        # __QUESTION Add attention
        # When implementing attention, make sure to use alpha_arr to store
        # your attention vectors.
        # The visualisation function in nmt_translate.py assumes such an array as input.
        return predicted_sent, alpha_arr
Example #41
def _kl_independent_independent(dist1, dist2):
    '''Batched KL divergence :math:`\\mathrm{KL}(\\mathrm{dist1} ||
    \\mathrm{dist2})` for Independent distributions.

    We can leverage the fact that
    .. math::
        \\mathrm{KL}(
                \\mathrm{Independent}(\\mathrm{dist1}) ||
                \\mathrm{Independent}(\\mathrm{dist2}))
        = \\mathrm{sum}(\\mathrm{KL}(\\mathrm{dist1} || \\mathrm{dist2}))
    where the sum is over the ``reinterpreted_batch_ndims``.

    Args:
        dist1 (:class:`~chainer.distribution.Independent`): Instance of
            `Independent`.
        dist2 (:class:`~chainer.distribution.Independent`): Instance of
            `Independent`.

    Returns:
        Batchwise ``KL(dist1 || dist2)``.

    Raises:
        :class:`ValueError`: If the event spaces of ``dist1`` and ``dist2``,
            or of their underlying distributions, do not match.
    '''

    p = dist1.distribution
    q = dist2.distribution

    # The KL between any two (non)-batched distributions is a scalar.
    # Given that the KL between two factored distributions is the sum, i.e.
    # KL(p1(x)p2(y) || q1(x)q2(y)) = KL(p1 || q1) + KL(p2 || q2), we compute
    # KL(p || q) and do a `reduce_sum` on the reinterpreted batch dimensions.
    if dist1.event_shape == dist2.event_shape:
        if p.event_shape == q.event_shape:
            num_reduce_dims = len(dist1.event_shape) - len(p.event_shape)
            reduce_dims = tuple([-i - 1 for i in range(0, num_reduce_dims)])

            return sum_mod.sum(
                distribution.kl_divergence(p, q), axis=reduce_dims)
        else:
            raise NotImplementedError(
                'KL between Independents with different '
                'event shapes not supported.')
    else:
        raise ValueError('Event shapes do not match.')
Example #42
def _kl_independent_independent(dist1, dist2):
    '''Batched KL divergence :math:`\\mathrm{KL}(\\mathrm{dist1} ||
    \\mathrm{dist2})` for Independent distributions.

    We can leverage the fact that
    .. math::
        \\mathrm{KL}(
                \\mathrm{Independent}(\\mathrm{dist1}) ||
                \\mathrm{Independent}(\\mathrm{dist2}))
        = \\mathrm{sum}(\\mathrm{KL}(\\mathrm{dist1} || \\mathrm{dist2}))
    where the sum is over the ``reinterpreted_batch_ndims``.

    Args:
        dist1 (:class:`~chainer.distribution.Independent`): Instance of
            `Independent`.
        dist2 (:class:`~chainer.distribution.Independent`): Instance of
            `Independent`.

    Returns:
        Batchwise ``KL(dist1 || dist2)``.

    Raises:
        :class:`ValueError`: If the event spaces of ``dist1`` and ``dist2``,
            or of their underlying distributions, do not match.
    '''

    p = dist1.distribution
    q = dist2.distribution

    # The KL between any two (non)-batched distributions is a scalar.
    # Given that the KL between two factored distributions is the sum, i.e.
    # KL(p1(x)p2(y) || q1(x)q2(y)) = KL(p1 || q1) + KL(p2 || q2), we compute
    # KL(p || q) and do a `reduce_sum` on the reinterpreted batch dimensions.
    if dist1.event_shape == dist2.event_shape:
        if p.event_shape == q.event_shape:
            num_reduce_dims = len(dist1.event_shape) - len(p.event_shape)
            reduce_dims = tuple([-i - 1 for i in range(0, num_reduce_dims)])

            return sum_mod.sum(distribution.kl_divergence(p, q),
                               axis=reduce_dims)
        else:
            raise NotImplementedError('KL between Independents with different '
                                      'event shapes not supported.')
    else:
        raise ValueError('Event shapes do not match.')
Example #43
 def entropy(self):
     return -sum_mod.sum(
         chainer.distributions.utils._modified_xlogx(self.p), axis=-1)
Example #44
 def _logdet(self, x):
     diag = diagonal.diagonal(x, axis1=-2, axis2=-1)
     logdet = sum_mod.sum(
         exponential.log(abs(diag)), axis=-1)
     return logdet
Example #45
 def entropy(self):
     return - sum_mod.sum(
         chainer.distributions.utils._modified_xlogx(self.p), axis=-1)
Example #46
def _kl_dirichlet_dirichlet(dist1, dist2):
    return - _lbeta(dist1.alpha) + _lbeta(dist2.alpha) \
        + sum_mod.sum((dist1.alpha - dist2.alpha) * (
            digamma.digamma(dist1.alpha)
            - expand_dims.expand_dims(digamma.digamma(
                dist1.alpha0), axis=-1)), axis=-1)
Example #47
 def _logdet(self, x):
     diag = diagonal.diagonal(x, axis1=-2, axis2=-1)
     logdet = sum_mod.sum(
         exponential.log(abs(diag)), axis=-1)
     return logdet
Example #48
 def entropy(self):
     return _lbeta(self.alpha) \
         + (self.alpha0 - self.event_shape[0]) \
         * digamma.digamma(self.alpha0) \
         - sum_mod.sum((self.alpha - 1)
                       * digamma.digamma(self.alpha), axis=-1)
Example #49
 def log_prob(self, x):
     return - _lbeta(self.alpha) \
         + sum_mod.sum((self.alpha - 1) * exponential.log(x), axis=-1)
Example #50
def _lbeta(x):
    return sum_mod.sum(lgamma.lgamma(x), axis=-1) \
        - lgamma.lgamma(sum_mod.sum(x, axis=-1))
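A quick SciPy cross-check of this log multivariate Beta function; for two components it reduces to the ordinary Beta function (lbeta_np is an assumed helper name used only here):

import numpy as np
from scipy.special import betaln, gammaln

def lbeta_np(alpha):
    # log of the multivariate Beta function along the last axis,
    # mirroring the sum-of-lgamma expression above.
    return gammaln(alpha).sum(axis=-1) - gammaln(alpha.sum(axis=-1))

print(lbeta_np(np.array([0.7, 1.3])), betaln(0.7, 1.3))  # should agree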
Example #51
 def alpha0(self):
     return sum_mod.sum(self.alpha, axis=-1)
Example #52
def black_out(x, t, W, samples, reduce='mean'):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is the likelihood of a given label.
    And, :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the
    no loss values. If it is ``'mean'``, this function takes
    a mean of loss values.

    Args:
        x (~chainer.Variable): Batch of input vectors.
            Its shape should be :math:`(N, D)`.
        t (~chainer.Variable): Vector of ground truth labels.
            Its shape should be :math:`(N,)`. Each element :math:`v`
            should satisfy :math:`0 \\leq v < V` or be :math:`-1`,
            where :math:`V` is the number of label types.
        W (~chainer.Variable): Weight matrix.
            Its shape should be :math:`(V, D)`.
        samples (~chainer.Variable): Negative samples.
            Its shape should be :math:`(N, S)` where :math:`S` is
            the number of negative samples.
        reduce (str): Reduction option. Its value must be either
            ``'no'`` or ``'mean'``. Otherwise,
            :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable:
            A variable object holding loss value(s).
            If ``reduce`` is ``'no'``, the output variable holds an
            array whose shape is :math:`(N,)` .
            If it is ``'mean'``, it holds a scalar.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \
         Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """

    batch_size = x.shape[0]

    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.matmul(neg_emb, x[:, :, None])
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])

    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.matmul(pos_emb, x[:, :, None])
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(
        reshape.reshape(logz, (batch_size, 1)), neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size,))
    loss = -(py - logz + _sum.sum(ny, axis=1))
    if reduce == 'mean':
        loss = average.average(loss)
    return loss
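
A minimal usage sketch (the shapes and the uniform negative sampling are my
illustrative assumptions, not from the source); in practice negatives are
usually drawn from a unigram distribution, as in
:class:`~chainer.links.BlackOut`:

import numpy as np
import chainer
import chainer.functions as F

N, D, V, S = 4, 5, 10, 3  # batch size, feature size, vocabulary, negatives
x = chainer.Variable(np.random.uniform(-1, 1, (N, D)).astype(np.float32))
W = chainer.Variable(np.random.uniform(-1, 1, (V, D)).astype(np.float32))
t = np.random.randint(0, V, size=(N,)).astype(np.int32)
samples = np.random.randint(0, V, size=(N, S)).astype(np.int32)

loss = F.black_out(x, t, W, samples, reduce='mean')  # scalar Variable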
Example #53
0
def _kl_categorical_categorical(dist1, dist2):
    return sum_mod.sum(dist1.p * (dist1.log_p - dist2.log_p), axis=-1)
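
In formula form (my annotation, not from the source), this is the discrete KL
divergence summed over the last (category) axis:

.. math::

   D_{\mathbf{KL}}(p \| q) = \sum_i p_i (\log p_i - \log q_i).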
Example #54
0
def _kl_categorical_categorical(dist1, dist2):
    return sum_mod.sum(dist1.p * (dist1.log_p - dist2.log_p), axis=-1)
Example #55
0
def crf1d(cost, xs, ys, reduce='mean'):
    """Calculates negative log-likelihood of linear-chain CRF.

    It takes a transition cost matrix, a sequence of costs, and a sequence of
    labels. Let :math:`c_{st}` be a transition cost from a label :math:`s` to
    a label :math:`t`, :math:`x_{it}` be a cost of a label :math:`t` at
    position :math:`i`, and :math:`y_i` be an expected label at position
    :math:`i`. The negative log-likelihood of linear-chain CRF is defined as

    .. math::
        L = -\\left( \\sum_{i=1}^l x_{iy_i}
            + \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - \\log(Z) \\right),

    where :math:`l` is the length of the input sequence and :math:`Z` is the
    normalizing constant, called the partition function.

    .. note::

       When you want to calculate the negative log-likelihood of sequences
       which have different lengths, sort the sequences in descending order of
       lengths and transpose the sequences.
       For example, you have three input sequences:

       >>> a1 = a2 = a3 = a4 = np.random.uniform(-1, 1, 3).astype(np.float32)
       >>> b1 = b2 = b3 = np.random.uniform(-1, 1, 3).astype(np.float32)
       >>> c1 = c2 = np.random.uniform(-1, 1, 3).astype(np.float32)

       >>> a = [a1, a2, a3, a4]
       >>> b = [b1, b2, b3]
       >>> c = [c1, c2]

       where ``a1`` and all other variables are arrays with ``(K,)`` shape.
       Make a transpose of the sequences:

       >>> x1 = np.stack([a1, b1, c1])
       >>> x2 = np.stack([a2, b2, c2])
       >>> x3 = np.stack([a3, b3])
       >>> x4 = np.stack([a4])

       and make a list of the arrays:

       >>> xs = [x1, x2, x3, x4]

       You need to make label sequences in the same fashion.
       And then, call the function:

       >>> cost = chainer.Variable(
       ...     np.random.uniform(-1, 1, (3, 3)).astype(np.float32))
       >>> ys = [np.zeros(x.shape[0:1], dtype=np.int32) for x in xs]
       >>> loss = F.crf1d(cost, xs, ys)

       It calculates the mean of the negative log-likelihood of the three
       sequences.

       The output is a variable whose value depends on the value of
       the option ``reduce``. If it is ``'no'``, it holds the elementwise
       loss values. If it is ``'mean'``, it holds mean of the loss values.


    Args:
        cost (Variable): A :math:`K \\times K` matrix which holds transition
            cost between two labels, where :math:`K` is the number of labels.
        xs (list of Variable): Input vector for each label.
            ``len(xs)`` denotes the length of the sequence,
            and each :class:`~chainer.Variable` holds a :math:`B \\times K`
            matrix, where :math:`B` is mini-batch size, :math:`K` is the number
            of labels.
            Note that the :math:`B`\\ s of the variables need not all be
            the same, i.e., the function accepts input sequences with
            different lengths.
        ys (list of Variable): Expected output labels. It needs to have the
            same length as ``xs``. Each :class:`~chainer.Variable` holds a
            :math:`B` integer vector.
            When an ``x`` in ``xs`` has a different :math:`B`, the
            corresponding ``y`` must have the same :math:`B`. In other words,
            ``ys`` must satisfy ``ys[i].shape == xs[i].shape[0:1]`` for all
            ``i``.
        reduce (str): Reduction option. Its value must be either
            ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable: A variable holding the negative log-likelihood.
        If ``reduce`` is ``'mean'``, it is a scalar holding the mean over
        the mini-batch; if it is ``'no'``, it holds the per-sequence values
        with shape :math:`(B,)`.

    .. note::

        For details, see the original paper: `Conditional Random Fields:
        Probabilistic Models for Segmenting and Labeling Sequence Data
        <https://repository.upenn.edu/cis_papers/159/>`_.

    """
    if reduce not in ('mean', 'no'):
        raise ValueError(
            "only 'mean' and 'no' are valid for 'reduce', but '%s' is "
            'given' % reduce)

    assert xs[0].shape[1] == cost.shape[0]

    n_label = cost.shape[0]
    n_batch = xs[0].shape[0]

    alpha = xs[0]
    alphas = []
    for x in xs[1:]:
        batch = x.shape[0]
        if alpha.shape[0] > batch:
            alpha, alpha_rest = split_axis.split_axis(alpha, [batch], axis=0)
            alphas.append(alpha_rest)
        b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost)
        alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x

    if len(alphas) > 0:
        alphas.append(alpha)
        alpha = concat.concat(alphas[::-1], axis=0)

    logz = logsumexp.logsumexp(alpha, axis=1)

    cost = reshape.reshape(cost, (cost.size, 1))
    score = select_item.select_item(xs[0], ys[0])
    scores = []
    for x, y, y_prev in zip(xs[1:], ys[1:], ys[:-1]):
        batch = x.shape[0]
        if score.shape[0] > batch:
            y_prev, _ = split_axis.split_axis(y_prev, [batch], axis=0)
            score, score_rest = split_axis.split_axis(score, [batch], axis=0)
            scores.append(score_rest)
        score += (select_item.select_item(x, y) + reshape.reshape(
            embed_id.embed_id(y_prev * n_label + y, cost), (batch,)))

    if len(scores) > 0:
        scores.append(score)
        score = concat.concat(scores[::-1], axis=0)

    loss = logz - score
    if reduce == 'mean':
        return _sum.sum(loss) / n_batch
    else:
        return loss
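
For the simpler equal-length case (a sketch under my own shape assumptions,
not from the source), the transpose step described in the note is unnecessary
and the call reduces to:

import numpy as np
import chainer
import chainer.functions as F

B, l, K = 2, 3, 4  # mini-batch size, sequence length, number of labels
cost = chainer.Variable(np.random.uniform(-1, 1, (K, K)).astype(np.float32))
xs = [chainer.Variable(np.random.uniform(-1, 1, (B, K)).astype(np.float32))
      for _ in range(l)]
ys = [np.random.randint(0, K, size=(B,)).astype(np.int32) for _ in range(l)]

loss = F.crf1d(cost, xs, ys, reduce='mean')  # scalar: mean NLL over the batch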
Example #56
0
 def log_prob(self, x):
     return sum_mod.sum(exponential.log(self.p) * x, axis=-1)
Example #57
0
 def log_prob(self, x):
     return sum_mod.sum(self.log_p * x, axis=-1)
Example #58
0
def _triangular_logdet(x):
    diag = diagonal.diagonal(x, axis1=-2, axis2=-1)
    return sum_mod.sum(exponential.log(abs(diag)), axis=-1)
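
As a sanity check (mine, not from the source): the determinant of a
triangular matrix is the product of its diagonal entries, so summing the
log-absolute diagonal as above reproduces the log-absolute determinant.

import numpy as np

L = np.tril(np.random.uniform(0.5, 2.0, (3, 3)))  # lower-triangular matrix
logdet_ref = np.log(abs(np.linalg.det(L)))
logdet_sum = np.log(abs(np.diagonal(L, axis1=-2, axis2=-1))).sum(axis=-1)
assert np.allclose(logdet_ref, logdet_sum)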