Example #1
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape(
        -1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)

    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)
    return dist2._logdet_scale - dist1._logdet_scale \
        + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
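For reference, the closed form that this snippet (and the later KL examples) implements, written in terms of the Cholesky factors :math:`L_1` and :math:`L_2` of the two covariances (a standard identity stated here for orientation, not quoted from the source), is

.. math::

   D_{KL}\bigl(\mathcal{N}(\mu_1, L_1 L_1^\top) \,\|\, \mathcal{N}(\mu_2, L_2 L_2^\top)\bigr)
       = \log\lvert\det L_2\rvert - \log\lvert\det L_1\rvert
         + \tfrac{1}{2}\lVert L_2^{-1} L_1\rVert_F^2
         + \tfrac{1}{2}\lVert L_2^{-1}(\mu_1 - \mu_2)\rVert^2
         - \tfrac{d}{2}.

The ``trace`` term is the squared Frobenius norm (the sum over axes ``(-1, -2)``) and ``mah`` is the squared Mahalanobis distance.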
Example #2
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    scale_tril_inv2 = _batch_triangular_inv(
        dist2.scale_tril.reshape(-1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d))**2,
                        axis=(-1, -2)).reshape(dist1.batch_shape)

    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah**2, axis=-2).reshape(dist1.batch_shape)
    return dist2._logdet_scale - dist1._logdet_scale \
        + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
Example #3
    def log_prob(self, x):
        scale_tril_inv = \
            _batch_triangular_inv(self.scale_tril.reshape(-1, self.d, self.d))
        scale_tril_inv = scale_tril_inv.reshape(self.batch_shape +
                                                (self.d, self.d))

        bsti = broadcast.broadcast_to(scale_tril_inv, x.shape + (self.d, ))
        bl = broadcast.broadcast_to(self.loc, x.shape)
        m = matmul.matmul(bsti, expand_dims.expand_dims(x - bl, axis=-1))
        m = matmul.matmul(swapaxes.swapaxes(m, -1, -2), m)
        m = squeeze.squeeze(m, axis=-1)
        m = squeeze.squeeze(m, axis=-1)
        logz = LOGPROBC * self.d - self._logdet(self.scale_tril)
        return broadcast.broadcast_to(logz, m.shape) - 0.5 * m
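In the same notation, this ``log_prob`` snippet (and the identical one that follows) evaluates the density through the Cholesky factor ``L = scale_tril``: ``m`` is the squared Mahalanobis distance computed from ``L^{-1}(x - loc)``, and ``LOGPROBC`` is presumably the constant ``-0.5 * log(2 * pi)`` defined elsewhere in the module (an assumption; the constant is not shown in the excerpt), so the return value is

.. math::

   \log p(x) = -\tfrac{d}{2}\log(2\pi) - \log\lvert\det L\rvert
               - \tfrac{1}{2}\lVert L^{-1}(x - \mu)\rVert^2.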
Example #4
    def log_prob(self, x):
        scale_tril_inv = \
            _batch_triangular_inv(self.scale_tril.reshape(-1, self.d, self.d))
        scale_tril_inv = scale_tril_inv.reshape(
            self.batch_shape+(self.d, self.d))

        bsti = broadcast.broadcast_to(scale_tril_inv, x.shape + (self.d,))
        bl = broadcast.broadcast_to(self.loc, x.shape)
        m = matmul.matmul(
            bsti,
            expand_dims.expand_dims(x - bl, axis=-1))
        m = matmul.matmul(swapaxes.swapaxes(m, -1, -2), m)
        m = squeeze.squeeze(m, axis=-1)
        m = squeeze.squeeze(m, axis=-1)
        logz = LOGPROBC * self.d - self._logdet(self.scale_tril)
        return broadcast.broadcast_to(logz, m.shape) - 0.5 * m
Example #5
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    diag = diagonal.diagonal(dist1.scale_tril, axis1=-2, axis2=-1)
    logdet1 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)

    diag = diagonal.diagonal(dist2.scale_tril, axis1=-2, axis2=-1)
    logdet2 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)

    scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape(
        -1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)

    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)
    return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
Example #6
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    diag = diagonal.diagonal(dist1.scale_tril, axis1=-2, axis2=-1)
    logdet1 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)

    diag = diagonal.diagonal(dist2.scale_tril, axis1=-2, axis2=-1)
    logdet2 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)

    scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape(
        -1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)

    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)
    return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
Example #7
    def encode_decode_train(self,
                            in_word_list,
                            out_word_list,
                            train=True,
                            sample=False):
        xp = cuda.cupy if self.gpuid >= 0 else np
        self.reset_state()
        # Add GO_ID, EOS_ID to decoder input
        decoder_word_list = [GO_ID] + out_word_list + [EOS_ID]
        # encode list of words/tokens
        enc_states = self.encode_list(in_word_list, train=train)
        # initialize decoder LSTM to final encoder state
        self.set_decoder_state()
        # decode and compute loss
        # convert list of tokens into chainer variable list
        var_dec = (Variable(xp.asarray(decoder_word_list,
                                       dtype=np.int32).reshape((-1, 1)),
                            volatile=not train))
        # Initialise first decoded word to GOID
        pred_word = Variable(xp.asarray([GO_ID], dtype=np.int32),
                             volatile=not train)

        # compute loss
        self.loss = 0
        # decode tokens
        for next_word_var in var_dec[1:]:
            self.decode(pred_word, train=train)
            if self.attn == NO_ATTN:
                predicted_out = self.out(self[self.lstm_dec[-1]].h)
            else:
                ''' __QUESTION Add attention '''
                prevh = self[self.lstm_dec[-1]].h
                alpha = F.softmax(matmul(prevh, enc_states, transb=True))
                ctxt = F.reshape(
                    M.sum(F.scale(enc_states, F.transpose(alpha), axis=0),
                          axis=0), (1, 200))
                predicted_out = self.out(self.attn_out(F.concat(
                    (ctxt, prevh))))

            # compute loss
            prob = F.softmax(predicted_out)

            pred_word = self.select_word(prob, train=train, sample=False)
            # pred_word = Variable(xp.asarray([pred_word.data], dtype=np.int32), volatile=not train)
            '''
            ___QUESTION-1-DESCRIBE-E-START___
            Explain what loss is computed with an example. What does this value mean?

            The cross-entropy is a soft measure of how close the network got to the
            correct answer. Here it is used to find how close the predicted word
            (predicted_out) was to the expected word (next_word_var).
            '''
            self.loss += F.softmax_cross_entropy(predicted_out, next_word_var)
            '''___QUESTION-1-DESCRIBE-E-END___'''

        report({"loss": self.loss}, self)

        return self.loss
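The attention branch above is plain dot-product attention (sometimes called Luong-style global attention) over the encoder states: for the current decoder state h_t and encoder states h_i it computes, per decoded token,

.. math::

   \alpha_i = \mathrm{softmax}_i\bigl(h_t^\top h_i^{\mathrm{enc}}\bigr), \qquad
   c_t = \sum_i \alpha_i\, h_i^{\mathrm{enc}},

and then feeds the concatenation of c_t and h_t through ``attn_out`` before the output projection. The hard-coded ``(1, 200)`` reshape presumably matches the encoder hidden size used in this exercise (an assumption; the size is not defined in the excerpt).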
Example #8
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    st = moveaxis.moveaxis(dist1.scale_tril, (-2, -1), (0, 1))
    diag = st[list(range(dist1.d)), list(range(dist1.d))]
    logdet1 = sum_mod.sum(exponential.log(basic_math.absolute(diag)), axis=0)

    st = moveaxis.moveaxis(dist2.scale_tril, (-2, -1), (0, 1))
    diag = st[list(range(dist2.d)), list(range(dist2.d))]
    logdet2 = sum_mod.sum(exponential.log(basic_math.absolute(diag)), axis=0)

    scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape(
        -1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)

    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)
    return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
Example #9
    def sample_n(self, n):
        if self._is_gpu:
            eps = cuda.cupy.random.standard_normal(
                (n,)+self.loc.shape+(1,), dtype=self.loc.dtype)
        else:
            eps = numpy.random.standard_normal(
                (n,)+self.loc.shape+(1,)).astype(numpy.float32)

        return self.loc + squeeze.squeeze(
            matmul.matmul(self.scale_tril, eps), axis=-1)
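This ``sample_n`` (and the variants that follow) draws samples by the usual reparameterisation of a multivariate normal: ``eps`` is standard normal noise of shape ``(n,) + loc.shape + (1,)`` and the sample is ``loc + scale_tril @ eps`` with the trailing singleton axis squeezed away; the later variants merely broadcast ``scale_tril`` and ``loc`` explicitly with ``repeat`` instead of relying on implicit broadcasting. A minimal NumPy sketch of the same idea, with hypothetical shapes:

import numpy as np

d = 3
loc = np.zeros(d, dtype=np.float32)
scale_tril = np.tril(np.random.uniform(0.1, 1.0, (d, d))).astype(np.float32)
eps = np.random.standard_normal((5, d, 1)).astype(np.float32)  # n = 5 draws
samples = loc + np.squeeze(scale_tril @ eps, axis=-1)          # shape (5, 3)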
Example #10
    def sample_n(self, n):
        if self._is_gpu:
            eps = cuda.cupy.random.standard_normal(
                (n,)+self.loc.shape+(1,), dtype=self.loc.dtype)
        else:
            eps = numpy.random.standard_normal(
                (n,)+self.loc.shape+(1,)).astype(numpy.float32)

        return self.loc + squeeze.squeeze(
            matmul.matmul(self.scale_tril, eps), axis=-1)
Example #11
    def sample_n(self, n):
        if self._is_gpu:
            eps = cuda.cupy.random.standard_normal(
                (n,)+self.loc.shape+(1,), dtype=self.loc.dtype)
        else:
            eps = numpy.random.standard_normal(
                (n,)+self.loc.shape+(1,)).astype(numpy.float32)

        noise = matmul.matmul(repeat.repeat(
            expand_dims.expand_dims(self.scale_tril, axis=0), n, axis=0), eps)
        noise = squeeze.squeeze(noise, axis=-1)
        noise += repeat.repeat(expand_dims.expand_dims(
            self.loc, axis=0), n, axis=0)

        return noise
Example #12
    def sample_n(self, n):
        if self._is_gpu:
            eps = cuda.cupy.random.standard_normal(
                (n,)+self.loc.shape+(1,), dtype=self.loc.dtype)
        else:
            eps = numpy.random.standard_normal(
                (n,)+self.loc.shape+(1,)).astype(numpy.float32)

        noise = matmul.matmul(repeat.repeat(
            expand_dims.expand_dims(self.scale_tril, axis=0), n, axis=0), eps)
        noise = squeeze.squeeze(noise, axis=-1)
        noise += repeat.repeat(expand_dims.expand_dims(
            self.loc, axis=0), n, axis=0)

        return noise
Example #13
    def decoder_predict(self,
                        start_word,
                        enc_states,
                        max_predict_len=MAX_PREDICT_LEN,
                        sample=False):
        xp = cuda.cupy if self.gpuid >= 0 else np

        # __QUESTION -- Following code is to assist with ATTENTION
        # alpha_arr should store the alphas for every predicted word
        alpha_arr = xp.empty((0, enc_states.shape[0]), dtype=xp.float32)

        # return list of predicted words
        predicted_sent = []
        # load start symbol
        pred_word = Variable(xp.asarray([start_word], dtype=np.int32),
                             volatile=True)
        pred_count = 0

        # start prediction loop
        while pred_count < max_predict_len and (int(pred_word.data) !=
                                                (EOS_ID)):
            self.decode(pred_word, train=False)

            if self.attn == NO_ATTN:
                predicted_out = self.out(self[self.lstm_dec[-1]].h)
            else:
                ''' __QUESTION Add attention '''
                prevh = self[self.lstm_dec[-1]].h
                alpha = F.softmax(matmul(prevh, enc_states, transb=True))
                ctxt = F.reshape(
                    M.sum(F.scale(enc_states, F.transpose(alpha), axis=0),
                          axis=0), (1, 200))
                alpha_arr = xp.concatenate((alpha_arr, alpha.data))
                predicted_out = self.out(self.attn_out(F.concat(
                    (ctxt, prevh))))

            prob = F.softmax(predicted_out)

            pred_word = self.select_word(prob, train=False, sample=sample)
            # add integer id of predicted word to output list
            predicted_sent.append(int(pred_word.data))
            pred_count += 1
        # __QUESTION Add attention
        # When implementing attention, make sure to use alpha_arr to store
        # your attention vectors.
        # The visualisation function in nmt_translate.py assumes such an array as input.
        return predicted_sent, alpha_arr
Example #14
def deformable_convolution_2d_sampler(x, offset, W, b=None, stride=1, pad=0):
    """Two-dimensional deformable convolution function using computed offset.

    This is an implementation of two-dimensional deformable convolution from
    `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_.

    It takes four variables: the input image ``x``, the offset image
    ``offset``, the filter weight ``W``, and the bias vector ``b``.

    Notation: here is the notation for the dimensionalities.

    - :math:`n` is the batch size.
    - :math:`c_I` and :math:`c_O` are the number of input and output
      channels, respectively.
    - :math:`h` and :math:`w` are the height and width of the input image,
      respectively.
    - :math:`k_H` and :math:`k_W` are the height and width of the filters,
      respectively.
    - :math:`s_Y` and :math:`s_X` are the strides of the filter.
    - :math:`p_H` and :math:`p_W` are the spatial padding sizes.

    The output size :math:`(h_O, w_O)` is determined by the following
    equations:

    .. math::

       h_O &= (h + 2p_H - k_H) / s_Y + 1,\\\\
       w_O &= (w + 2p_W - k_W) / s_X + 1.

    Args:
        x (~chainer.Variable): Input variable of shape :math:`(n, c_I, h, w)`.
        offset (~chainer.Variable): Offset variable of shape
            :math:`(n, 2 \\cdot k_H \\cdot k_W, h_O, w_O)`. The first
            :math:`k_H \\cdot k_W` index of the second axis corresponds to
            the offsets in the horizontal direction. The last
            :math:`k_H \\cdot k_W` index of the second axis corresponds to
            the offsets in the vertical direction.
        W (~chainer.Variable): Weight variable of shape
            :math:`(c_O, c_I, k_H, k_W)`.
        b (~chainer.Variable): Bias variable of length :math:`c_O` (optional).
        stride (int or pair of ints): Stride of filter applications.
            ``stride=s`` and ``stride=(s, s)`` are equivalent.
        pad (int or pair of ints): Spatial padding width for input arrays.
            ``pad=p`` and ``pad=(p, p)`` are equivalent.

    Returns:
        ~chainer.Variable: Output variable.

    Deformable convolution adds 2D offsets to the regular grid sampling
    locations in the standard convolution. It enables free form deformation of
    the sampling grid.

    See `Jifeng Dai, Haozhi Qi, Yuwen Xiong, Yi Li, Guodong Zhang, Han Hu, \
        Yichen Wei. Deformable Convolutional Networks\
        <https://arxiv.org/abs/1703.06211>`_

    If the bias vector is given, then it is added to all spatial locations of
    the output of convolution.

    .. seealso:: :class:`~chainer.links.DeformableConvolution2D`

    .. admonition:: Example

        >>> x = np.random.uniform(0, 1, (2, 3, 4, 7)).astype(np.float32)
        >>> offset = np.random.uniform(
        ...     0, 1, (2, 2 * 3 * 3, 2, 5)).astype(np.float32)
        >>> W = np.random.uniform(0, 1, (4, 3, 3, 3)).astype(np.float32)
        >>> b = np.random.uniform(0, 1, (4,)).astype(np.float32)
        >>> y = F.deformable_convolution_2d_sampler(x, offset, W, b)
        >>> y.shape
        (2, 4, 2, 5)

    """
    sy, sx = _pair(stride)
    ph, pw = _pair(pad)
    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape
    _, khkw2, out_h, out_w = offset.shape

    if khkw2 != 2 * kh * kw:
        raise ValueError(
            'The shape of the offset does not match the kernel size')

    grid = _offset2grid(offset, kh, kw, sy, sx, ph, pw, h, w)
    grid = grid.reshape(n, 2, kh * kw, out_h * out_w)
    x_pad = pad_module.pad(x, ((0, 0), (0, 0), (ph, ph), (pw, pw)), 'constant')
    x_st = spatial_transformer_sampler.spatial_transformer_sampler(
        x_pad, grid)

    x_st = x_st.transpose(0, 3, 1, 2).reshape(n * out_h * out_w, c * kh * kw)
    W = W.transpose(1, 2, 3, 0).reshape(c * kh * kw, out_c)
    y = matmul.matmul(x_st, W)
    y = y.reshape(n, out_h, out_w, out_c).transpose(0, 3, 1, 2)

    if b is not None:
        b = broadcast.broadcast_to(b[None, :, None, None], y.shape)
        y += b
    return y
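The body works by converting the offsets into an absolute sampling grid, gathering the k_H x k_W deformed samples for every output location with ``spatial_transformer_sampler``, and collapsing the result into a single ``(n * h_O * w_O, c_I * k_H * k_W)`` by ``(c_I * k_H * k_W, c_O)`` matmul, i.e. an im2col-style convolution over the deformed samples. For the shapes in the docstring example: h = 4, w = 7, k_H = k_W = 3, stride 1 and no padding give h_O = (4 + 0 - 3) / 1 + 1 = 2 and w_O = (7 + 0 - 3) / 1 + 1 = 5, which is why the output shape is (2, 4, 2, 5).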
Example #15
def deformable_convolution_2d_sampler(x, offset, W, b=None, stride=1, pad=0):
    """Two-dimensional deformable convolution function using computed offset.

    This is an implementation of two-dimensional deformable convolution from
    `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_.

    It takes four variables: the input image ``x``, the offset image
    ``offset``, the filter weight ``W``, and the bias vector ``b``.

    Notation: here is the notation for the dimensionalities.

    - :math:`n` is the batch size.
    - :math:`c_I` and :math:`c_O` are the number of input and output
      channels, respectively.
    - :math:`h` and :math:`w` are the height and width of the input image,
      respectively.
    - :math:`k_H` and :math:`k_W` are the height and width of the filters,
      respectively.
    - :math:`s_Y` and :math:`s_X` are the strides of the filter.
    - :math:`p_H` and :math:`p_W` are the spatial padding sizes.

    The output size :math:`(h_O, w_O)` is determined by the following
    equations:

    .. math::

       h_O &= (h + 2p_H - k_H) / s_Y + 1,\\\\
       w_O &= (w + 2p_W - k_W) / s_X + 1.

    Args:
        x (~chainer.Variable): Input variable of shape :math:`(n, c_I, h, w)`.
        offset (~chainer.Variable): Offset variable of shape
            :math:`(n, 2 \\cdot k_H \\cdot k_W, h_O, w_O)`. The first
            :math:`k_H \\cdot k_W` index of the second axis corresponds to
            the offsets in the horizontal direction. The last
            :math:`k_H \\cdot k_W` index of the second axis corresponds to
            the offsets in the vertical direction.
        W (~chainer.Variable): Weight variable of shape
            :math:`(c_O, c_I, k_H, k_W)`.
        b (~chainer.Variable): Bias variable of length :math:`c_O` (optional).
        stride (int or pair of ints): Stride of filter applications.
            ``stride=s`` and ``stride=(s, s)`` are equivalent.
        pad (int or pair of ints): Spatial padding width for input arrays.
            ``pad=p`` and ``pad=(p, p)`` are equivalent.

    Returns:
        ~chainer.Variable: Output variable.

    Deformable convolution adds 2D offsets to the regular grid sampling
    locations in the standard convolution. It enables free form deformation of
    the sampling grid.

    See `Jifeng Dai, Haozhi Qi, Yuwen Xiong, Yi Li, Guodong Zhang, Han Hu, \
        Yichen Wei. Deformable Convolutional Networks\
        <https://arxiv.org/abs/1703.06211>`_

    If the bias vector is given, then it is added to all spatial locations of
    the output of convolution.

    .. seealso:: :class:`~chainer.links.DeformableConvolution2D`

    .. admonition:: Example

        >>> x = np.random.uniform(0, 1, (2, 3, 4, 7)).astype(np.float32)
        >>> offset = np.random.uniform(
        ...     0, 1, (2, 2 * 3 * 3, 2, 5)).astype(np.float32)
        >>> W = np.random.uniform(0, 1, (4, 3, 3, 3)).astype(np.float32)
        >>> b = np.random.uniform(0, 1, (4,)).astype(np.float32)
        >>> y = F.deformable_convolution_2d_sampler(x, offset, W, b)
        >>> y.shape
        (2, 4, 2, 5)

    """
    sy, sx = _pair(stride)
    ph, pw = _pair(pad)
    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape
    _, khkw2, out_h, out_w = offset.shape

    if khkw2 != 2 * kh * kw:
        raise ValueError(
            'The shape of the offset does not match the kernel size')

    grid = _offset2grid(offset, kh, kw, sy, sx, ph, pw, h, w)
    grid = grid.reshape(n, 2, kh * kw, out_h * out_w)
    x_pad = pad_module.pad(x, ((0, 0), (0, 0), (ph, ph), (pw, pw)), 'constant')
    x_st = spatial_transformer_sampler.spatial_transformer_sampler(x_pad, grid)

    x_st = x_st.transpose(0, 3, 1, 2).reshape(n * out_h * out_w, c * kh * kw)
    W = W.transpose(1, 2, 3, 0).reshape(c * kh * kw, out_c)
    y = matmul.matmul(x_st, W)
    y = y.reshape(n, out_h, out_w, out_c).transpose(0, 3, 1, 2)

    if b is not None:
        b = broadcast.broadcast_to(b[None, :, None, None], y.shape)
        y += b
    return y
Example #16
def f_chainer(x, h):
    return reshape(
        matmul(h, reshape(x, (h.shape[1], int(x.size / h.shape[1])))), x.shape)
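A quick shape sketch of ``f_chainer`` (hypothetical inputs, and assuming the snippet's ``matmul``/``reshape`` are ``chainer.functions.matmul``/``chainer.functions.reshape``): ``x`` is flattened to ``(h.shape[1], x.size // h.shape[1])``, multiplied on the left by ``h``, and reshaped back, so ``h`` acts as a linear map along the leading axis of ``x``.

import numpy as np
from chainer.functions import matmul, reshape  # the names f_chainer above refers to

h = np.eye(4, dtype=np.float32)                    # identity map, so the output equals x
x = np.arange(24, dtype=np.float32).reshape(4, 6)
y = f_chainer(x, h)                                # Variable with shape (4, 6)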
Example #17
def black_out(x, t, W, samples, reduce='mean'):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is the likelihood of a given label.
    :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the
    elementwise loss values. If it is ``'mean'``, this function takes
    the mean of the loss values.

    Args:
        x (~chainer.Variable): Batch of input vectors.
            Its shape should be :math:`(N, D)`.
        t (~chainer.Variable): Vector of ground truth labels.
            Its shape should be :math:`(N,)`. Each element :math:`v`
            should satisfy :math:`0 \\leq v < V` or be :math:`-1`,
            where :math:`V` is the number of label types.
        W (~chainer.Variable): Weight matrix.
            Its shape should be :math:`(V, D)`
        samples (~chainer.Variable): Negative samples.
            Its shape should be :math:`(N, S)` where :math:`S` is
            the number of negative samples.
        reduce (str): Reduction option. Its value must be either
            ``'no'`` or ``'mean'``. Otherwise,
            :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable:
            A variable object holding loss value(s).
            If ``reduce`` is ``'no'``, the output variable holds an
            array whose shape is :math:`(N,)` .
            If it is ``'mean'``, it holds a scalar.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \
         Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """

    batch_size = x.shape[0]

    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.matmul(neg_emb, x[:, :, None])
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])

    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.matmul(pos_emb, x[:, :, None])
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(
        reshape.reshape(logz, (batch_size, 1)), neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size,))
    loss = -(py - logz + _sum.sum(ny, axis=1))
    if reduce == 'mean':
        loss = average.average(loss)
    return loss
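A hedged usage sketch of the public API this snippet implements (``chainer.functions.black_out``), with small hypothetical shapes that follow the docstring:

import numpy as np
import chainer.functions as F

N, D, V, S = 4, 5, 10, 3                                  # batch, dim, vocab, negatives
x = np.random.uniform(-1, 1, (N, D)).astype(np.float32)
t = np.random.randint(0, V, (N,)).astype(np.int32)        # ground-truth labels in [0, V)
W = np.random.uniform(-1, 1, (V, D)).astype(np.float32)
samples = np.random.randint(0, V, (N, S)).astype(np.int32)

loss = F.black_out(x, t, W, samples)                           # scalar, reduce='mean'
loss_per_example = F.black_out(x, t, W, samples, reduce='no')  # shape (N,)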
Example #18
    def covariance(self):
        return matmul.matmul(
            self.scale_tril,
            transpose.transpose(self.scale_tril,
                                tuple(range(len(self.batch_shape))) +
                                (-1, -2)))
Example #19
    def covariance(self):
        return matmul.matmul(
            self.scale_tril, transpose.transpose(
                self.scale_tril,
                tuple(range(len(self.batch_shape))) + (-1, -2)))
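These last two ``covariance`` snippets recover the covariance from its Cholesky factor, i.e. Sigma = L L^T, where the transpose swaps only the last two (matrix) axes and leaves any batch axes in place. The same identity in plain NumPy, with a hypothetical batch shape:

import numpy as np

L = np.tril(np.random.uniform(0.1, 1.0, (2, 3, 3)))  # batch of lower-triangular factors
cov = L @ np.swapaxes(L, -1, -2)                     # Sigma = L L^T for each batch element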