def bernoulli_nll(x, y): """Computes the negative log-likelihood of a Bernoulli distribution. This function calculates the negative log-likelihood of a Bernoulli distribution. .. math:: -\\log B(x; p) = -\\sum_i \\{x_i \\log(p_i) + (1 - x_i)\\log(1 - p_i)\\}, where :math:`p = \\sigma(y)`, and :math:`\\sigma(\\cdot)` is a sigmoid function. .. note:: As this function uses a sigmoid function, you can pass a result of fully-connected layer (that means :class:`Linear`) to this function directly. Args: x (~chainer.Variable): Input variable. y (~chainer.Variable): A variable representing the parameter of Bernoulli distribution. Returns: ~chainer.Variable: A variable representing negative log-likelihood. """ assert isinstance(x, variable.Variable) assert isinstance(y, variable.Variable) return sum.sum(softplus.softplus(y)) - sum.sum(x * y)
def gaussian_kl_divergence(mean, ln_var): """Computes the KL-divergence of Gaussian variables from the standard one. Given two variable ``mean`` representing :math:`\\mu` and ``ln_var`` representing :math:`\\log(\\sigma^2)`, this function returns a variable representing the KL-divergence between the given multi-dimensional Gaussian :math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)` .. math:: D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)), where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2` and :math:`I` is an identity matrix. Args: mean (~chainer.Variable): A variable representing mean of given gaussian distribution, :math:`\\mu`. ln_var (~chainer.Variable): A variable representing logarithm of variance of given gaussian distribution, :math:`\\log(\\sigma^2)`. Returns: ~chainer.Variable: A variable representing KL-divergence between given gaussian distribution and the standard gaussian. """ assert isinstance(mean, variable.Variable) assert isinstance(ln_var, variable.Variable) J = mean.size var = exponential.exp(ln_var) return (sum.sum(mean * mean) + sum.sum(var) - sum.sum(ln_var) - J) * 0.5
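# Added usage sketch (not part of the original source): this KL term is the one
# typically used as the prior loss of a variational autoencoder.  It assumes the
# public wrapper chainer.functions.gaussian_kl_divergence and checks it against
# the closed form 0.5 * sum(mu^2 + sigma^2 - log(sigma^2) - 1) in NumPy.
import numpy as np
import chainer
import chainer.functions as F

mu = chainer.Variable(np.random.randn(8, 4).astype(np.float32))
ln_var = chainer.Variable(np.random.randn(8, 4).astype(np.float32))

kl = F.gaussian_kl_divergence(mu, ln_var)  # summed over all elements
expected = 0.5 * np.sum(mu.data ** 2 + np.exp(ln_var.data) - ln_var.data - 1)
assert np.allclose(kl.data, expected, rtol=1e-4)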
def bernoulli_nll(x, y): """Computes the negative log-likelihood of a Bernoulli distribution. This function calculates the negative log-likelihood of a Bernoulli distribution. .. math:: -\\log B(x; p) = -\\sum_i \{x_i \\log(p_i) + (1 - x_i)\\log(1 - p_i)\}, where :math:`p = \\sigma(y)`, :math:`\\sigma(\\cdot)` is a sigmoid function, and :math:`B(x; p)` is a Bernoulli distribution. .. note:: As this function uses a sigmoid function, you can pass a result of fully-connected layer (that means :class:`Linear`) to this function directly. Args: x (~chainer.Variable): Input variable. y (~chainer.Variable): A variable representing the parameter of Bernoulli distribution. Returns: ~chainer.Variable: A variable representing negative log-likelihood. """ assert isinstance(x, variable.Variable) assert isinstance(y, variable.Variable) return sum.sum(softplus.softplus(y)) - sum.sum(x * y)
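# Added usage sketch (not original source): ``y`` is a pre-sigmoid logit, so the
# raw output of a Linear layer can be fed in directly.  Assumes the public
# wrapper chainer.functions.bernoulli_nll; checked against the explicit
# -sum(x * log(p) + (1 - x) * log(1 - p)) with p = sigmoid(y).
import numpy as np
import chainer
import chainer.functions as F

x = (np.random.rand(5, 3) > 0.5).astype(np.float32)  # binary targets
y = np.random.randn(5, 3).astype(np.float32)         # raw logits

nll = F.bernoulli_nll(chainer.Variable(x), chainer.Variable(y))
p = 1.0 / (1.0 + np.exp(-y))
expected = -np.sum(x * np.log(p) + (1 - x) * np.log(1 - p))
assert np.allclose(nll.data, expected, rtol=1e-4)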
def average(x, axis=None, weights=None, keepdims=False): """Calculate weighted average of array elements over a given axis. Args: x (~chainer.Variable): Elements to average. axis (None or int or tuple of int): Axis along which the average is computed. With the default (axis = None) it performs a mean over all the dimensions of the input array. weights (None or chainer.Variable): An array holding weights to calculate weighted average. If it is ``None``, all weights are assumed to be one. When ``axis`` is ``None``, ``weights`` must have the same shape as ``x``. And when ``axis`` is ``int``, it must be a 1-D array satisfying ``weights.shape == (x.shape[axis],)``. keepdims (bool): If ``True``, the specified axes are retained as axes of length one. Returns: ~chainer.Variable: Output variable. """ if axis is None: pass elif isinstance(axis, tuple): axis = [a + x.ndim if a < 0 else a for a in axis] axis.sort() for a, b in six.moves.zip(axis, axis[1:]): if a == b: raise ValueError('duplicate value in \'axis\'') axis = tuple(axis) else: if axis < 0: axis += x.ndim axis = (axis,) if weights is not None: if axis is not None and len(axis) > 1: raise ValueError( 'tuple axis is not supported when weights is given') divider = sum_mod.sum(weights) if axis is not None: w_shape = [d if i in axis else 1 for i, d in enumerate(x.shape)] weights = broadcast.broadcast_to( reshape.reshape(weights, w_shape), x.shape) x = x * weights else: if axis is None: divider = x.size else: divider = 1 for a in axis: divider *= x.shape[a] x_sum = sum_mod.sum(x, axis, keepdims) if weights is not None: # We do not need to call broadcast when weights is None because # divider here is not a Variable but a scalar divider = broadcast.broadcast_to(divider, x_sum.shape) return x_sum / divider
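# Added usage sketch (not original source), assuming the public wrapper
# chainer.functions.average: a weighted mean over axis 0, one weight per row,
# checked against numpy.average.
import numpy as np
import chainer.functions as F

x = np.arange(12, dtype=np.float32).reshape(4, 3)
w = np.array([1., 2., 3., 4.], dtype=np.float32)  # weights.shape == (x.shape[0],)

y = F.average(x, axis=0, weights=w)
assert np.allclose(y.data, np.average(x, axis=0, weights=w))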
def _normalize(self, x): size = x.shape[1] mean = broadcast.broadcast_to( (sum.sum(x, axis=1) / size)[:, None], x.shape) std = broadcast.broadcast_to(sqrt.sqrt( sum.sum(square.square(x - mean), axis=1) / size)[:, None], x.shape) + self.eps return (x - mean) / std
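# Rough NumPy sketch (added for illustration) of what _normalize computes for a
# (batch, units) input: each row is shifted to zero mean and scaled to roughly
# unit standard deviation, with `eps` added to the denominator for stability.
import numpy as np

def normalize_rows(x, eps=1e-6):
    mean = x.mean(axis=1, keepdims=True)
    std = np.sqrt(((x - mean) ** 2).mean(axis=1, keepdims=True)) + eps
    return (x - mean) / std

h = normalize_rows(np.random.randn(4, 10).astype(np.float32))
assert np.allclose(h.mean(axis=1), 0, atol=1e-5)
assert np.allclose(h.std(axis=1), 1, atol=1e-4)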
def _kl_multivariatenormal_multivariatenormal(dist1, dist2): scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape( -1, dist2.d, dist2.d)) trace = sum_mod.sum(matmul.matmul( scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2, axis=(-1, -2)).reshape(dist1.batch_shape) mu = dist1.loc - dist2.loc mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1)) mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape) return dist2._logdet_scale - dist1._logdet_scale \ + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
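# For reference (added note): with S_i = L_i L_i^T the covariances built from the
# Cholesky factors `scale_tril`, the code above evaluates the closed form
#   KL(N(mu1, S1) || N(mu2, S2))
#     = log|L2| - log|L1| + 0.5 * tr(S2^{-1} S1)
#       + 0.5 * (mu1 - mu2)^T S2^{-1} (mu1 - mu2) - 0.5 * d,
# where tr(S2^{-1} S1) = ||L2^{-1} L1||_F^2 and the Mahalanobis term is the
# squared norm of L2^{-1} (mu1 - mu2).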
def black_out(x, t, W, samples): """BlackOut loss function. BlackOut loss function is defined as .. math:: -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)), where :math:`t` is the correct label, :math:`S` is a set of negative examples and :math:`p(\\cdot)` is the likelihood of a given label. And, :math:`p` is defined as .. math:: p(y) = \\frac{\\exp(W_y^\\top x)}{ \\sum_{s \\in samples} \\exp(W_s^\\top x)}. Args: x (~chainer.Variable): Batch of input vectors. t (~chainer.Variable): Vector of ground truth labels. W (~chainer.Variable): Weight matrix. samples (~chainer.Variable): Negative samples. Returns: ~chainer.Variable: Loss value. See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \ Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_ .. seealso:: :class:`~chainer.links.BlackOut`. """ batch_size = x.shape[0] neg_emb = embed_id.embed_id(samples, W) neg_y = matmul.batch_matmul(neg_emb, x) neg_y = reshape.reshape(neg_y, neg_y.shape[:-1]) pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1) pos_y = matmul.batch_matmul(pos_emb, x) pos_y = reshape.reshape(pos_y, pos_y.shape[:-1]) logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1) blogz, bneg_y = broadcast.broadcast( reshape.reshape(logz, (batch_size, 1)), neg_y) ny = exponential.log(1 - exponential.exp(bneg_y - blogz)) py = reshape.reshape(pos_y, (batch_size,)) loss = py - logz + _sum.sum(ny, axis=1) return -_sum.sum(loss) / batch_size
def predict(self, images, oversample=True): """Computes all the probabilities of given images. Args: images (iterable of PIL.Image or numpy.ndarray): Input images. oversample (bool): If ``True``, it averages results across center, corners, and mirrors. Otherwise, it uses only the center. Returns: ~chainer.Variable: Output that contains the class probabilities of given images. """ x = concat_examples([prepare(img, size=(256, 256)) for img in images]) if oversample: x = imgproc.oversample(x, crop_dims=(224, 224)) else: x = x[:, :, 16:240, 16:240] # Set volatile option to ON to reduce memory consumption x = Variable(self.xp.asarray(x), volatile=flag.ON) y = self(x, layers=['prob'])['prob'] if oversample: n = y.data.shape[0] // 10 y_shape = y.data.shape[1:] y = reshape(y, (n, 10) + y_shape) y = sum(y, axis=1) / 10 return y
def predict(self, images, oversample=True): """Computes all the probabilities of given images. Args: images (iterable of PIL.Image or numpy.ndarray): Input images. oversample (bool): If ``True``, it averages results across center, corners, and mirrors. Otherwise, it uses only the center. Returns: ~chainer.Variable: Output that contains the class probabilities of given images. """ x = concat_examples([prepare(img, size=(256, 256)) for img in images]) if oversample: x = imgproc.oversample(x, crop_dims=(224, 224)) else: x = x[:, :, 16:240, 16:240] # Use no_backprop_mode to reduce memory consumption with function.no_backprop_mode(): x = Variable(self.xp.asarray(x)) y = self(x, layers=['prob'])['prob'] if oversample: n = y.data.shape[0] // 10 y_shape = y.data.shape[1:] y = reshape(y, (n, 10) + y_shape) y = sum(y, axis=1) / 10 return y
def predict(self, images, oversample=True): """Computes all the probabilities of given images. Args: images (iterable of PIL.Image or numpy.ndarray): Input images. When you specify a color image as a :class:`numpy.ndarray`, make sure that color order is RGB. oversample (bool): If ``True``, it averages results across center, corners, and mirrors. Otherwise, it uses only the center. Returns: ~chainer.Variable: Output that contains the class probabilities of given images. """ x = concat_examples([prepare(img, size=(256, 256)) for img in images]) if oversample: x = imgproc.oversample(x, crop_dims=(224, 224)) else: x = x[:, :, 16:240, 16:240] # Use no_backprop_mode to reduce memory consumption with function.no_backprop_mode(), chainer.using_config('train', False): x = Variable(self.xp.asarray(x)) y = self(x, layers=['prob'])['prob'] if oversample: n = len(y) // 10 y_shape = y.shape[1:] y = reshape(y, (n, 10) + y_shape) y = sum(y, axis=1) / 10 return y
def gaussian_nll(x, mean, ln_var, reduce='sum'): """Computes the negative log-likelihood of a Gaussian distribution. Given two variable ``mean`` representing :math:`\\mu` and ``ln_var`` representing :math:`\\log(\\sigma^2)`, this function computes in elementwise manner the negative log-likelihood of :math:`x` on a Gaussian distribution :math:`N(\\mu, S)`, .. math:: -\\log N(x; \\mu, \\sigma^2) = \\log\\left(\\sqrt{(2\\pi)^D |S|}\\right) + \\frac{1}{2}(x - \\mu)^\\top S^{-1}(x - \\mu), where :math:`D` is a dimension of :math:`x` and :math:`S` is a diagonal matrix where :math:`S_{ii} = \\sigma_i^2`. The output is a variable whose value depends on the value of the option ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If it is ``'sum'`` or ``'mean'``, loss values are summed up or averaged respectively. Args: x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): Input variable. mean (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): A variable representing mean of a Gaussian distribution, :math:`\\mu`. ln_var (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): A variable representing logarithm of variance of a Gaussian distribution, :math:`\\log(\\sigma^2)`. reduce (str): Reduction option. Its value must be either ``'sum'``, ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is raised. Returns: ~chainer.Variable: A variable representing the negative log-likelihood. If ``reduce`` is ``'no'``, the output variable holds array whose shape is same as one of (hence both of) input variables. If it is ``'sum'`` or ``'mean'``, the output variable holds a scalar value. """ if reduce not in ('sum', 'mean', 'no'): raise ValueError( "only 'sum', 'mean' and 'no' are valid for 'reduce', but '%s'" ' is given' % reduce) x_prec = exponential.exp(-ln_var) x_diff = x - mean x_power = (x_diff * x_diff) * x_prec * -0.5 loss = (ln_var + math.log(2 * math.pi)) / 2 - x_power if reduce == 'sum': return sum.sum(loss) elif reduce == 'mean': return average.average(loss) else: return loss
def crf1d(cost, xs, ys): """Calculates negative log-likelihood of linear-chain CRF. It takes a transition cost matrix, a sequence of costs, and a sequence of labels. Let :math:`c_{st}` be a transition cost from a label :math:`s` to a label :math:`t`, :math:`x_{it}` be a cost of a label :math:`t` at position :math:`i`, and :math:`y_i` be an expected label at position :math:`i`. The negative log-likelihood of linear-chain CRF is defined as .. math:: L = -\\left( \\sum_{i=1}^l x_{iy_i} + \\ \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - {\\log(Z)} \\right) , where :math:`l` is the length of the input sequence and :math:`Z` is the normalizing constant called partition function. Args: cost (Variable): A :math:`K \\times K` matrix which holds transition cost between two labels, where :math:`K` is the number of labels. xs (list of Variable): Input feature vector for each label. Each :class:`~chainer.Variable` holds a :math:`B \\times K` matrix, where :math:`B` is mini-batch size, :math:`K` is the number of labels. ys (list of Variable): Expected output labels. Each :class:`~chainer.Variable` holds a :math:`B` integer vector. Returns: ~chainer.Variable: A variable holding the average negative log-likelihood of the input sequences. .. note:: See detail in the original paper: `Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data <http://repository.upenn.edu/cis_papers/159/>`_. """ assert xs[0].data.shape[1] == cost.data.shape[0] n_label = cost.data.shape[0] n_batch = xs[0].data.shape[0] alpha = xs[0] for x in xs[1:]: b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost) alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x logz = logsumexp.logsumexp(alpha, axis=1) score = 0 cost = reshape.reshape(cost, (cost.data.size, 1)) for y1, y2 in zip(ys[:-1], ys[1:]): score += reshape.reshape( embed_id.embed_id(y1 * n_label + y2, cost), (n_batch,)) for x, y in zip(xs, ys): score += select_item.select_item(x, y) return _sum.sum(logz - score) / n_batch
def _kl_multivariatenormal_multivariatenormal(dist1, dist2): diag = diagonal.diagonal(dist1.scale_tril, axis1=-2, axis2=-1) logdet1 = sum_mod.sum(exponential.log(abs(diag)), axis=-1) diag = diagonal.diagonal(dist2.scale_tril, axis1=-2, axis2=-1) logdet2 = sum_mod.sum(exponential.log(abs(diag)), axis=-1) scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape( -1, dist2.d, dist2.d)) trace = sum_mod.sum(matmul.matmul( scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2, axis=(-1, -2)).reshape(dist1.batch_shape) mu = dist1.loc - dist2.loc mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1)) mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape) return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
def backward(self, indexes, grad_outputs): x, gy0 = self.get_retained_inputs() gy0 = gy0.reshape(-1, *((1,) * (x.ndim - 1))) gy0 = chainer.functions.broadcast_to(gy0, x.shape) ggx2 = 2 * grad_outputs[0] gx = ggx2 * gy0 ggy0 = ggx2 * x return gx, _sum.sum(ggy0, axis=tuple(six.moves.range(1, ggy0.ndim)))
def gaussian_kl_divergence(self, mu1, ln_var1, mu2, ln_var2): # D_KL [ N(z ; mu1, var1) || N(z; mu2, var2) ] var1 = exponential.exp(ln_var1) inv_var2 = exponential.exp(-ln_var2) mu_diff = mu2 - mu1 term1 = (var1 + mu_diff * mu_diff) * inv_var2 loss = (term1 - ln_var1 + ln_var2 - 1.) * 0.5 return sum.sum(loss)
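# Added sanity check (not original source): elementwise this is the textbook KL
# between two univariate Gaussians,
#   KL(N(mu1, s1^2) || N(mu2, s2^2))
#     = log(s2 / s1) + (s1^2 + (mu1 - mu2)^2) / (2 * s2^2) - 1/2,
# summed over all elements.
import numpy as np

mu1, mu2 = np.random.randn(6), np.random.randn(6)
ln_var1, ln_var2 = np.random.randn(6), np.random.randn(6)
var1, var2 = np.exp(ln_var1), np.exp(ln_var2)

kl_code = np.sum(((var1 + (mu2 - mu1) ** 2) / var2 - ln_var1 + ln_var2 - 1.) * 0.5)
kl_ref = np.sum(0.5 * (ln_var2 - ln_var1) + (var1 + (mu1 - mu2) ** 2) / (2 * var2) - 0.5)
assert np.allclose(kl_code, kl_ref)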
def entropy(self): return ( _lbeta(self.alpha) + ((self.alpha0 - self.event_shape[0]) * digamma.digamma(self.alpha0)) - sum_mod.sum( (self.alpha - 1) * digamma.digamma(self.alpha), axis=-1))
def encode_decode_train(self, in_word_list, out_word_list, train=True, sample=False): xp = cuda.cupy if self.gpuid >= 0 else np self.reset_state() # Add GO_ID, EOS_ID to decoder input decoder_word_list = [GO_ID] + out_word_list + [EOS_ID] # encode list of words/tokens enc_states = self.encode_list(in_word_list, train=train) # initialize decoder LSTM to final encoder state self.set_decoder_state() # decode and compute loss # convert list of tokens into chainer variable list var_dec = (Variable(xp.asarray(decoder_word_list, dtype=np.int32).reshape((-1, 1)), volatile=not train)) # Initialise first decoded word to GOID pred_word = Variable(xp.asarray([GO_ID], dtype=np.int32), volatile=not train) # compute loss self.loss = 0 # decode tokens for next_word_var in var_dec[1:]: self.decode(pred_word, train=train) if self.attn == NO_ATTN: predicted_out = self.out(self[self.lstm_dec[-1]].h) else: ''' __QUESTION Add attention ''' prevh = self[self.lstm_dec[-1]].h alpha = F.softmax(matmul(prevh, enc_states, transb=True)) ctxt = F.reshape( M.sum(F.scale(enc_states, F.transpose(alpha), axis=0), axis=0), (1, 200)) predicted_out = self.out(self.attn_out(F.concat( (ctxt, prevh)))) # compute loss prob = F.softmax(predicted_out) pred_word = self.select_word(prob, train=train, sample=False) # pred_word = Variable(xp.asarray([pred_word.data], dtype=np.int32), volatile=not train) ''' ___QUESTION-1-DESCRIBE-E-START___ Explain what loss is computed with an example. What does this value mean? The cross-entropy is a soft measure of how close the network got to the correct answer. Here it is used to find how close the predicted word (predicted_out) was to the expected word (next_word_var). ''' self.loss += F.softmax_cross_entropy(predicted_out, next_word_var) '''___QUESTION-1-DESCRIBE-E-END___''' report({"loss": self.loss}, self) return self.loss
def gaussian_nll(x, mean, ln_var, reduce='sum'): """Computes the negative log-likelihood of a Gaussian distribution. Given two variable ``mean`` representing :math:`\\mu` and ``ln_var`` representing :math:`\\log(\\sigma^2)`, this function computes in elementwise manner the negative log-likelihood of :math:`x` on a Gaussian distribution :math:`N(\\mu, S)`, .. math:: -\\log N(x; \\mu, \\sigma^2) = \\log\\left(\\sqrt{(2\\pi)^D |S|}\\right) + \\frac{1}{2}(x - \\mu)^\\top S^{-1}(x - \\mu), where :math:`D` is a dimension of :math:`x` and :math:`S` is a diagonal matrix where :math:`S_{ii} = \\sigma_i^2`. The output is a variable whose value depends on the value of the option ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If it is ``'sum'`` or ``'mean'``, loss values are summed up or averaged respectively. Args: x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. mean (:class:`~chainer.Variable` or :ref:`ndarray`): A variable representing mean of a Gaussian distribution, :math:`\\mu`. ln_var (:class:`~chainer.Variable` or :ref:`ndarray`): A variable representing logarithm of variance of a Gaussian distribution, :math:`\\log(\\sigma^2)`. reduce (str): Reduction option. Its value must be either ``'sum'``, ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is raised. Returns: ~chainer.Variable: A variable representing the negative log-likelihood. If ``reduce`` is ``'no'``, the output variable holds array whose shape is same as one of (hence both of) input variables. If it is ``'sum'`` or ``'mean'``, the output variable holds a scalar value. """ if reduce not in ('sum', 'mean', 'no'): raise ValueError( 'only \'sum\', \'mean\' and \'no\' are valid for \'reduce\', but ' '\'%s\' is given' % reduce) x_prec = exponential.exp(-ln_var) x_diff = x - mean x_power = (x_diff * x_diff) * x_prec * -0.5 loss = (ln_var + math.log(2 * math.pi)) / 2 - x_power if reduce == 'sum': return sum.sum(loss) elif reduce == 'mean': return average.average(loss) else: return loss
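# Added usage sketch, assuming the public wrapper chainer.functions.gaussian_nll:
# with reduce='no' each element carries
#   0.5 * log(2 * pi * sigma^2) + (x - mu)^2 / (2 * sigma^2).
import numpy as np
import chainer.functions as F

x = np.random.randn(4, 2).astype(np.float32)
mean = np.zeros((4, 2), dtype=np.float32)
ln_var = np.zeros((4, 2), dtype=np.float32)  # i.e. sigma^2 = 1

nll = F.gaussian_nll(x, mean, ln_var, reduce='no')
expected = 0.5 * np.log(2 * np.pi) + 0.5 * x ** 2
assert np.allclose(nll.data, expected, rtol=1e-4)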
def bernoulli_nll(x, y, reduce='sum'): """Computes the negative log-likelihood of a Bernoulli distribution. This function calculates the negative log-likelihood of a Bernoulli distribution. .. math:: -\\log B(x; p) = -\\sum_i \\{x_i \\log(p_i) + \ (1 - x_i)\\log(1 - p_i)\\}, where :math:`p = \\sigma(y)`, :math:`\\sigma(\\cdot)` is a sigmoid function, and :math:`B(x; p)` is a Bernoulli distribution. The output is a variable whose value depends on the value of the option ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If it is ``'sum'`` or ``'mean'``, loss values are summed up or averaged respectively. .. note:: As this function uses a sigmoid function, you can pass a result of fully-connected layer (that means :class:`Linear`) to this function directly. Args: x (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): Input variable. y (:class:`~chainer.Variable` or :class:`numpy.ndarray` or \ :class:`cupy.ndarray`): A variable representing the parameter of Bernoulli distribution. reduce (str): Reduction option. Its value must be either ``'sum'``, ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is raised. Returns: ~chainer.Variable: A variable representing the negative log-likelihood. If ``reduce`` is ``'no'``, the output variable holds array whose shape is same as one of (hence both of) input variables. If it is ``'sum'`` or ``'mean'``, the output variable holds a scalar value. """ if reduce not in ('sum', 'mean', 'no'): raise ValueError( "only 'sum', 'mean' and 'no' are valid for 'reduce', but '%s'" ' is given' % reduce) loss = softplus.softplus(y) - x * y if reduce == 'sum': return sum.sum(loss) elif reduce == 'mean': return average.average(loss) else: return loss
def soft_dtw_grad(D_bar, G, verbose=1): xp = cuda.get_array_module(G) if verbose > 0: print('Computing final gradient') d1, d2, d3, m, n = G.shape assert D_bar.shape == (m, n) final_G = Variable(xp.zeros((d1, d2, d3, m), dtype=np.float64)) for i in range(m): final_G.data[:, :, :, i] = sum(D_bar[i] * G[:, :, :, i, :], axis=-1) return final_G
def average(x, axis=None, weights=None, keepdims=False): """Calculate weighted average of array elements over a given axis. Args: x (~chainer.Variable): Elements to average. axis (None or int): Axis along which the average is computed. With the default (axis = None) it performs a mean over all the dimensions of the input array. weights (None or chainer.Variable): An array holding weights to calculate weighted average. If it is ``None``, all weights are assumed to be one. When ``axis`` is ``None``, ``weights`` must have the same shape as ``x``. And when ``axis`` is ``int``, it must be a 1-D array satisfying ``weights.shape == (x.shape[axis],)``. keepdims (bool): If ``True``, the specified axes are retained as axes of length one. Returns: ~chainer.Variable: Output variable. """ if weights is not None: divider = sum_mod.sum(weights) if axis is not None: if axis < 0: axis += x.ndim w_shape = [d if i == axis else 1 for i, d in enumerate(x.shape)] weights = broadcast.broadcast_to( reshape.reshape(weights, w_shape), x.shape) x = x * weights else: if axis is None: divider = x.size else: divider = x.shape[axis] x_sum = sum_mod.sum(x, axis, keepdims) if weights is not None: # We do not need to call broadcast when weights is None because # divider here is not a Variable but a scalar divider = broadcast.broadcast_to(divider, x_sum.shape) return x_sum / divider
def _kl_multivariatenormal_multivariatenormal(dist1, dist2): st = moveaxis.moveaxis(dist1.scale_tril, (-2, -1), (0, 1)) diag = st[list(range(dist1.d)), list(range(dist1.d))] logdet1 = sum_mod.sum(exponential.log(basic_math.absolute(diag)), axis=0) st = moveaxis.moveaxis(dist2.scale_tril, (-2, -1), (0, 1)) diag = st[list(range(dist2.d)), list(range(dist2.d))] logdet2 = sum_mod.sum(exponential.log(basic_math.absolute(diag)), axis=0) scale_tril_inv2 = _batch_triangular_inv(dist2.scale_tril.reshape( -1, dist2.d, dist2.d)) trace = sum_mod.sum(matmul.matmul( scale_tril_inv2, dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2, axis=(-1, -2)).reshape(dist1.batch_shape) mu = dist1.loc - dist2.loc mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1)) mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape) return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
def gaussian_kl_divergence(mean, ln_var, reduce='sum'): """Computes the KL-divergence of Gaussian variables from the standard one. Given two variable ``mean`` representing :math:`\\mu` and ``ln_var`` representing :math:`\\log(\\sigma^2)`, this function calculates the KL-divergence in elementwise manner between the given multi-dimensional Gaussian :math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)` .. math:: D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)), where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2` and :math:`I` is an identity matrix. The output is a variable whose value depends on the value of the option ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If it is ``'sum'`` or ``'mean'``, loss values are summed up or averaged respectively. Args: mean (:class:`~chainer.Variable` or :ref:`ndarray`): A variable representing mean of given gaussian distribution, :math:`\\mu`. ln_var (:class:`~chainer.Variable` or :ref:`ndarray`): A variable representing logarithm of variance of given gaussian distribution, :math:`\\log(\\sigma^2)`. reduce (str): Reduction option. Its value must be either ``'sum'``, ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is raised. Returns: ~chainer.Variable: A variable representing KL-divergence between given gaussian distribution and the standard gaussian. If ``reduce`` is ``'no'``, the output variable holds array whose shape is same as one of (hence both of) input variables. If it is ``'sum'`` or ``'mean'``, the output variable holds a scalar value. """ if reduce not in ('sum', 'mean', 'no'): raise ValueError( "only 'sum', 'mean' and 'no' are valid for 'reduce', but '%s'" ' is given' % reduce) var = exponential.exp(ln_var) mean_square = mean * mean loss = (mean_square + var - ln_var - 1) * 0.5 if reduce == 'sum': return sum.sum(loss) elif reduce == 'mean': return average.average(loss) else: return loss
def bernoulli_nll(x, y, reduce='sum'): """Computes the negative log-likelihood of a Bernoulli distribution. This function calculates the negative log-likelihood of a Bernoulli distribution. .. math:: -\\log B(x; p) = -\\sum_i \\{x_i \\log(p_i) + \ (1 - x_i)\\log(1 - p_i)\\}, where :math:`p = \\sigma(y)`, :math:`\\sigma(\\cdot)` is a sigmoid function, and :math:`B(x; p)` is a Bernoulli distribution. The output is a variable whose value depends on the value of the option ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If it is ``'sum'`` or ``'mean'``, loss values are summed up or averaged respectively. .. note:: As this function uses a sigmoid function, you can pass a result of fully-connected layer (that means :class:`Linear`) to this function directly. Args: x (:class:`~chainer.Variable` or :ref:`ndarray`): Input variable. y (:class:`~chainer.Variable` or :ref:`ndarray`): A variable representing the parameter of Bernoulli distribution. reduce (str): Reduction option. Its value must be either ``'sum'``, ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is raised. Returns: ~chainer.Variable: A variable representing the negative log-likelihood. If ``reduce`` is ``'no'``, the output variable holds array whose shape is same as one of (hence both of) input variables. If it is ``'sum'`` or ``'mean'``, the output variable holds a scalar value. """ if reduce not in ('sum', 'mean', 'no'): raise ValueError( 'only \'sum\', \'mean\' and \'no\' are valid for \'reduce\', but ' '\'%s\' is given' % reduce) loss = softplus.softplus(y) - x * y if reduce == 'sum': return sum.sum(loss) elif reduce == 'mean': return average.average(loss) else: return loss
def _kl_dirichlet_dirichlet(dist1, dist2): return ( - _lbeta(dist1.alpha) + _lbeta(dist2.alpha) + sum_mod.sum( (dist1.alpha - dist2.alpha) * (digamma.digamma(dist1.alpha) - expand_dims.expand_dims( digamma.digamma(dist1.alpha0), axis=-1)), axis=-1))
def _sum_rightmost(value, dim): """Sum out `dim` many rightmost dimensions of a given tensor. Args: value (Tensor): A tensor of ``.dim()`` at least ``dim``. dim (int): The number of rightmost dims to sum out. """ if dim == 0: return value required_shape = value.shape[:-dim] + (-1, ) return sum_mod.sum(reshape.reshape(value, required_shape), axis=-1)
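# Added NumPy sketch of the same idea: flatten the `dim` rightmost axes into one
# and sum it out.
import numpy as np

def sum_rightmost_np(value, dim):
    if dim == 0:
        return value
    return value.reshape(value.shape[:-dim] + (-1,)).sum(axis=-1)

v = np.arange(24.).reshape(2, 3, 4)
assert sum_rightmost_np(v, 2).shape == (2,)
assert np.allclose(sum_rightmost_np(v, 2), v.sum(axis=(-1, -2)))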
def max_singular_value(W, u=None, Ip=1): """ Apply power iteration for the weight parameter """ xp = cuda.get_array_module(W.data) if u is None: u = xp.random.normal(size=(1, W.shape[0])).astype(xp.float32) _u = u for _ in range(Ip): _v = _l2normalize(xp.dot(_u, W.data), eps=1e-12) _u = _l2normalize(xp.dot(_v, W.data.transpose()), eps=1e-12) sigma = sum.sum(linear.linear(_u, transpose.transpose(W)) * _v) return sigma, _u, _v
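# Added sanity check (NumPy only, an illustration rather than the original code):
# running the same power iteration for more steps should approach the largest
# singular value reported by numpy.linalg.svd.
import numpy as np

def _l2normalize_np(v, eps=1e-12):
    return v / (np.linalg.norm(v) + eps)

np.random.seed(0)
W = np.random.randn(16, 8).astype(np.float32)
u = np.random.normal(size=(1, W.shape[0])).astype(np.float32)
for _ in range(1000):  # far more than Ip=1, so the iteration converges
    v = _l2normalize_np(np.dot(u, W))
    u = _l2normalize_np(np.dot(v, W.T))
sigma = np.dot(np.dot(u, W), v.T).item()
assert np.isclose(sigma, np.linalg.svd(W, compute_uv=False)[0], rtol=1e-2)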
def _log_det_jacobian(self, x, y): shape = x.shape scale = self.scale if isinstance(scale, numbers.Number): xp = cuda.get_array_module(x, y) result = exponential.log(basic_math.absolute(scale)) \ * xp.ones(shape, dtype=x.dtype) else: result = exponential.log(basic_math.absolute(scale)) if self.event_dim: result_size = result.shape[:-self.event_dim] + (-1,) # use reshape (chainer.Variable has no view) to flatten the event dims result = sum_mod.sum(result.reshape(result_size), axis=-1) shape = shape[:-self.event_dim] return broadcast.broadcast_to(result, shape)
def gaussian_nll(x, mean, ln_var): """Computes the negative log-likelihood of a Gaussian distribution. Given two variable ``mean`` representing :math:`\\mu` and ``ln_var`` representing :math:`\\log(\\sigma^2)`, this function returns the negative log-likelihood of :math:`x` on a Gaussian distribution :math:`N(\\mu, S)`, .. math:: -\\log N(x; \\mu, \\sigma^2) = \\log\\left(\\sqrt{(2\\pi)^D |S|}\\right) + \\frac{1}{2}(x - \\mu)^\\top S^{-1}(x - \\mu), where :math:`D` is a dimension of :math:`x` and :math:`S` is a diagonal matrix where :math:`S_{ii} = \\sigma_i^2`. Args: x (~chainer.Variable): Input variable. mean (~chainer.Variable): A variable representing mean of a Gaussian distribution, :math:`\\mu`. ln_var (~chainer.Variable): A variable representing logarithm of variance of a Gaussian distribution, :math:`\\log(\\sigma^2)`. Returns: ~chainer.Variable: A variable representing the negative log-likelihood. """ assert isinstance(x, variable.Variable) assert isinstance(mean, variable.Variable) assert isinstance(ln_var, variable.Variable) D = x.size x_prec = exponential.exp(-ln_var) x_diff = x - mean x_power = (x_diff * x_diff) * x_prec * -0.5 return (sum.sum(ln_var) + D * math.log(2 * math.pi)) / 2 - sum.sum(x_power)
def decoder_predict(self, start_word, enc_states, max_predict_len=MAX_PREDICT_LEN, sample=False): xp = cuda.cupy if self.gpuid >= 0 else np # __QUESTION -- Following code is to assist with ATTENTION # alpha_arr should store the alphas for every predicted word alpha_arr = xp.empty((0, enc_states.shape[0]), dtype=xp.float32) # return list of predicted words predicted_sent = [] # load start symbol pred_word = Variable(xp.asarray([start_word], dtype=np.int32), volatile=True) pred_count = 0 # start prediction loop while pred_count < max_predict_len and (int(pred_word.data) != (EOS_ID)): self.decode(pred_word, train=False) if self.attn == NO_ATTN: predicted_out = self.out(self[self.lstm_dec[-1]].h) else: ''' __QUESTION Add attention ''' prevh = self[self.lstm_dec[-1]].h alpha = F.softmax(matmul(prevh, enc_states, transb=True)) ctxt = F.reshape( M.sum(F.scale(enc_states, F.transpose(alpha), axis=0), axis=0), (1, 200)) alpha_arr = xp.concatenate((alpha_arr, alpha.data)) predicted_out = self.out(self.attn_out(F.concat( (ctxt, prevh)))) prob = F.softmax(predicted_out) pred_word = self.select_word(prob, train=False, sample=sample) # add integer id of predicted word to output list predicted_sent.append(int(pred_word.data)) pred_count += 1 # __QUESTION Add attention # When implementing attention, make sure to use alpha_arr to store # your attention vectors. # The visualisation function in nmt_translate.py assumes such an array as input. return predicted_sent, alpha_arr
def _kl_independent_independent(dist1, dist2): '''Batched KL divergence :math:`\\mathrm{KL}(\\mathrm{dist1} || \\mathrm{dist2})` for Independent distributions. We can leverage the fact that .. math:: \\mathrm{KL}( \\mathrm{Independent}(\\mathrm{dist1}) || \\mathrm{Independent}(\\mathrm{dist2})) = \\mathrm{sum}(\\mathrm{KL}(\\mathrm{dist1} || \\mathrm{dist2})) where the sum is over the ``reinterpreted_batch_ndims``. Args: dist1 (:class:`~chainer.distribution.Independent`): Instance of `Independent`. dist2 (:class:`~chainer.distribution.Independent`): Instance of `Independent`. Returns: Batchwise ``KL(dist1 || dist2)``. Raises: :class:`ValueError`: If the event space for ``dist1`` and ``dist2``, or their underlying distributions don't match. ''' p = dist1.distribution q = dist2.distribution # The KL between any two (non)-batched distributions is a scalar. # Given that the KL between two factored distributions is the sum, i.e. # KL(p1(x)p2(y) || q1(x)q2(y)) = KL(p1 || q1) + KL(q1 || q2), we compute # KL(p || q) and do a `reduce_sum` on the reinterpreted batch dimensions. if dist1.event_shape == dist2.event_shape: if p.event_shape == q.event_shape: num_reduce_dims = len(dist1.event_shape) - len(p.event_shape) reduce_dims = tuple([-i - 1 for i in range(0, num_reduce_dims)]) return sum_mod.sum( distribution.kl_divergence(p, q), axis=reduce_dims) else: raise NotImplementedError( 'KL between Independents with different ' 'event shapes not supported.') else: raise ValueError('Event shapes do not match.')
def entropy(self): return -sum_mod.sum( chainer.distributions.utils._modified_xlogx(self.p), axis=-1)
def _logdet(self, x): diag = diagonal.diagonal(x, axis1=-2, axis2=-1) logdet = sum_mod.sum( exponential.log(abs(diag)), axis=-1) return logdet
def log_prob(self, x): return - _lbeta(self.alpha) \ + sum_mod.sum((self.alpha - 1) * exponential.log(x), axis=-1)
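# Added cross-check of the density formula implemented above,
#   log Dir(x; alpha) = -log B(alpha) + sum_i (alpha_i - 1) * log(x_i),
# against scipy.stats.dirichlet (assumes SciPy is available).
import numpy as np
from scipy.special import gammaln
from scipy.stats import dirichlet

alpha = np.array([1.5, 2.0, 3.0])
x = np.array([0.25, 0.25, 0.5])

lbeta = gammaln(alpha).sum() - gammaln(alpha.sum())
log_prob = -lbeta + np.sum((alpha - 1) * np.log(x))
assert np.allclose(log_prob, dirichlet.logpdf(x, alpha))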
def _lbeta(x): return sum_mod.sum(lgamma.lgamma(x), axis=-1) \ - lgamma.lgamma(sum_mod.sum(x, axis=-1))
def alpha0(self): return sum_mod.sum(self.alpha, axis=-1)
def black_out(x, t, W, samples, reduce='mean'): """BlackOut loss function. BlackOut loss function is defined as .. math:: -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)), where :math:`t` is the correct label, :math:`S` is a set of negative examples and :math:`p(\\cdot)` is the likelihood of a given label. And, :math:`p` is defined as .. math:: p(y) = \\frac{\\exp(W_y^\\top x)}{ \\sum_{s \\in samples} \\exp(W_s^\\top x)}. The output is a variable whose value depends on the value of the option ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If it is ``'mean'``, this function takes a mean of loss values. Args: x (~chainer.Variable): Batch of input vectors. Its shape should be :math:`(N, D)`. t (~chainer.Variable): Vector of ground truth labels. Its shape should be :math:`(N,)`. Each element :math:`v` should satisfy :math:`0 \\leq v < V` or :math:`-1` where :math:`V` is the number of label types. W (~chainer.Variable): Weight matrix. Its shape should be :math:`(V, D)` samples (~chainer.Variable): Negative samples. Its shape should be :math:`(N, S)` where :math:`S` is the number of negative samples. reduce (str): Reduction option. Its value must be either ``'no'`` or ``'mean'``. Otherwise, :class:`ValueError` is raised. Returns: ~chainer.Variable: A variable object holding loss value(s). If ``reduce`` is ``'no'``, the output variable holds an array whose shape is :math:`(N,)`. If it is ``'mean'``, it holds a scalar. See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \ Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_ .. seealso:: :class:`~chainer.links.BlackOut`. """ batch_size = x.shape[0] neg_emb = embed_id.embed_id(samples, W) neg_y = matmul.matmul(neg_emb, x[:, :, None]) neg_y = reshape.reshape(neg_y, neg_y.shape[:-1]) pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1) pos_y = matmul.matmul(pos_emb, x[:, :, None]) pos_y = reshape.reshape(pos_y, pos_y.shape[:-1]) logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1) blogz, bneg_y = broadcast.broadcast( reshape.reshape(logz, (batch_size, 1)), neg_y) ny = exponential.log(1 - exponential.exp(bneg_y - blogz)) py = reshape.reshape(pos_y, (batch_size,)) loss = -(py - logz + _sum.sum(ny, axis=1)) if reduce == 'mean': loss = average.average(loss) return loss
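# Added usage sketch (shapes only), assuming the public wrapper
# chainer.functions.black_out: N examples of dimension D, a vocabulary of V
# labels and S negative samples per example.
import numpy as np
import chainer.functions as F

N, D, V, S = 4, 5, 10, 3
x = np.random.randn(N, D).astype(np.float32)
t = np.random.randint(0, V, size=N).astype(np.int32)
W = np.random.randn(V, D).astype(np.float32)
samples = np.random.randint(0, V, size=(N, S)).astype(np.int32)

loss = F.black_out(x, t, W, samples)  # scalar, reduce='mean' by default
per_example = F.black_out(x, t, W, samples, reduce='no')
assert loss.shape == ()
assert per_example.shape == (N,)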
def _kl_categorical_categorical(dist1, dist2): return sum_mod.sum(dist1.p * (dist1.log_p - dist2.log_p), axis=-1)
def crf1d(cost, xs, ys, reduce='mean'): """Calculates negative log-likelihood of linear-chain CRF. It takes a transition cost matrix, a sequence of costs, and a sequence of labels. Let :math:`c_{st}` be a transition cost from a label :math:`s` to a label :math:`t`, :math:`x_{it}` be a cost of a label :math:`t` at position :math:`i`, and :math:`y_i` be an expected label at position :math:`i`. The negative log-likelihood of linear-chain CRF is defined as .. math:: L = -\\left( \\sum_{i=1}^l x_{iy_i} + \\ \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - {\\log(Z)} \\right) , where :math:`l` is the length of the input sequence and :math:`Z` is the normalizing constant called partition function. .. note:: When you want to calculate the negative log-likelihood of sequences which have different lengths, sort the sequences in descending order of lengths and transpose the sequences. For example, you have three input sequences: >>> a1 = a2 = a3 = a4 = np.random.uniform(-1, 1, 3).astype(np.float32) >>> b1 = b2 = b3 = np.random.uniform(-1, 1, 3).astype(np.float32) >>> c1 = c2 = np.random.uniform(-1, 1, 3).astype(np.float32) >>> a = [a1, a2, a3, a4] >>> b = [b1, b2, b3] >>> c = [c1, c2] where ``a1`` and all other variables are arrays with ``(K,)`` shape. Make a transpose of the sequences: >>> x1 = np.stack([a1, b1, c1]) >>> x2 = np.stack([a2, b2, c2]) >>> x3 = np.stack([a3, b3]) >>> x4 = np.stack([a4]) and make a list of the arrays: >>> xs = [x1, x2, x3, x4] You need to make label sequences in the same fashion. And then, call the function: >>> cost = chainer.Variable( ... np.random.uniform(-1, 1, (3, 3)).astype(np.float32)) >>> ys = [np.zeros(x.shape[0:1], dtype=np.int32) for x in xs] >>> loss = F.crf1d(cost, xs, ys) It calculates the mean of the negative log-likelihood of the three sequences. The output is a variable whose value depends on the value of the option ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If it is ``'mean'``, it holds the mean of the loss values. Args: cost (Variable): A :math:`K \\times K` matrix which holds transition cost between two labels, where :math:`K` is the number of labels. xs (list of Variable): Input vector for each label. ``len(xs)`` denotes the length of the sequence, and each :class:`~chainer.Variable` holds a :math:`B \\times K` matrix, where :math:`B` is mini-batch size, :math:`K` is the number of labels. Note that :math:`B`\\ s in all the variables are not necessarily the same, i.e., it accepts the input sequences with different lengths. ys (list of Variable): Expected output labels. It needs to have the same length as ``xs``. Each :class:`~chainer.Variable` holds a :math:`B` integer vector. When ``x`` in ``xs`` has a different :math:`B`, the corresponding ``y`` has the same :math:`B`. In other words, ``ys`` must satisfy ``ys[i].shape == xs[i].shape[0:1]`` for all ``i``. reduce (str): Reduction option. Its value must be either ``'mean'`` or ``'no'``. Otherwise, :class:`ValueError` is raised. Returns: ~chainer.Variable: A variable holding the average negative log-likelihood of the input sequences. .. note:: See detail in the original paper: `Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data <https://repository.upenn.edu/cis_papers/159/>`_.
""" if reduce not in ('mean', 'no'): raise ValueError( "only 'mean' and 'no' are valid for 'reduce', but '%s' is " 'given' % reduce) assert xs[0].shape[1] == cost.shape[0] n_label = cost.shape[0] n_batch = xs[0].shape[0] alpha = xs[0] alphas = [] for x in xs[1:]: batch = x.shape[0] if alpha.shape[0] > batch: alpha, alpha_rest = split_axis.split_axis(alpha, [batch], axis=0) alphas.append(alpha_rest) b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost) alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x if len(alphas) > 0: alphas.append(alpha) alpha = concat.concat(alphas[::-1], axis=0) logz = logsumexp.logsumexp(alpha, axis=1) cost = reshape.reshape(cost, (cost.size, 1)) score = select_item.select_item(xs[0], ys[0]) scores = [] for x, y, y_prev in zip(xs[1:], ys[1:], ys[:-1]): batch = x.shape[0] if score.shape[0] > batch: y_prev, _ = split_axis.split_axis(y_prev, [batch], axis=0) score, score_rest = split_axis.split_axis(score, [batch], axis=0) scores.append(score_rest) score += (select_item.select_item(x, y) + reshape.reshape( embed_id.embed_id(y_prev * n_label + y, cost), (batch,))) if len(scores) > 0: scores.append(score) score = concat.concat(scores[::-1], axis=0) loss = logz - score if reduce == 'mean': return _sum.sum(loss) / n_batch else: return loss
def log_prob(self, x): return sum_mod.sum(exponential.log(self.p) * x, axis=-1)
def log_prob(self, x): return sum_mod.sum(self.log_p * x, axis=-1)
def _triangular_logdet(x): diag = diagonal.diagonal(x, axis1=-2, axis2=-1) return sum_mod.sum(exponential.log(abs(diag)), axis=-1)
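# Added check: for a triangular matrix, log|det| is the sum of log|diagonal|,
# which matches numpy.linalg.slogdet.
import numpy as np

L = np.tril(np.random.randn(5, 5)) + 5 * np.eye(5)  # well-conditioned lower-triangular
logdet = np.sum(np.log(np.abs(np.diag(L))))
assert np.allclose(logdet, np.linalg.slogdet(L)[1])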