def crf1d(cost, xs, ys):
    """Calculates negative log-likelihood of linear-chain CRF.

    It takes a transition cost matrix, a sequence of costs, and a sequence of
    labels. Let :math:`c_{st}` be a transition cost from a label :math:`s` to
    a label :math:`t`, :math:`x_{it}` be a cost of a label :math:`t` at
    position :math:`i`, and :math:`y_i` be an expected label at position
    :math:`i`. The negative log-likelihood of linear-chain CRF is defined as

    .. math::
        L = -\\left( \\sum_{i=1}^l x_{iy_i} + \\
             \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - {\\log(Z)} \\right) ,

    where :math:`l` is the length of the input sequence and :math:`Z` is the
    normalizing constant called partition function.

    Args:
        cost (Variable): A :math:`K \\times K` matrix which holds transition
            cost between two labels, where :math:`K` is the number of labels.
        xs (list of Variable): Input feature vector for each label. Each
            :class:`~chainer.Variable` holds a :math:`B \\times K` matrix,
            where :math:`B` is mini-batch size, :math:`K` is the number of
            labels.
        ys (list of Variable): Expected output labels. Each
            :class:`~chainer.Variable` holds a :math:`B` integer vector.

    Returns:
        ~chainer.Variable: A variable holding the average negative
        log-likelihood of the input sequences.

    .. note::
        See detail in the original paper: `Conditional Random Fields:
        Probabilistic Models for Segmenting and Labeling Sequence Data
        <http://repository.upenn.edu/cis_papers/159/>`_.

    """
    assert xs[0].data.shape[1] == cost.data.shape[0]

    n_label = cost.data.shape[0]
    n_batch = xs[0].data.shape[0]

    alpha = xs[0]
    for x in xs[1:]:
        b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost)
        alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x

    logz = logsumexp.logsumexp(alpha, axis=1)

    score = 0
    cost = reshape.reshape(cost, (cost.data.size, 1))
    for y1, y2 in zip(ys[:-1], ys[1:]):
        score += reshape.reshape(
            embed_id.embed_id(y1 * n_label + y2, cost), (n_batch,))
    for x, y in zip(xs, ys):
        score += select_item.select_item(x, y)

    return _sum.sum(logz - score) / n_batch
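# A minimal NumPy sketch (not part of the library) that brute-forces the same
# linear-chain CRF negative log-likelihood for a single short sequence.  It is
# meant only to make the roles of the unary costs x_{it}, the transition costs
# c_{st}, and the partition function Z concrete; the helper name and shapes
# are illustrative assumptions, not an API of this module.
def _crf1d_nll_bruteforce(cost, xs, ys):
    import itertools

    import numpy as np

    # cost: (K, K) transition scores, xs: list of (K,) unary score arrays,
    # ys: list of int labels.  Returns -log p(ys | xs).
    n_label = cost.shape[0]

    def path_score(labels):
        s = sum(x[y] for x, y in zip(xs, labels))
        s += sum(cost[a, b] for a, b in zip(labels[:-1], labels[1:]))
        return s

    # Partition function Z: sum of exp(score) over all K**l label sequences.
    log_z = np.logaddexp.reduce([
        path_score(labels)
        for labels in itertools.product(range(n_label), repeat=len(xs))])
    return log_z - path_score(ys)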
def black_out(x, t, W, samples):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is likelihood of a given label. And,
    :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    Args:
        x (~chainer.Variable): Batch of input vectors.
        t (~chainer.Variable): Vector of ground truth labels.
        W (~chainer.Variable): Weight matrix.
        samples (~chainer.Variable): Negative samples.

    Returns:
        ~chainer.Variable: Loss value.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \
    Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """
    batch_size = x.shape[0]
    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.batch_matmul(neg_emb, x)
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])
    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.batch_matmul(pos_emb, x)
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(
        reshape.reshape(logz, (batch_size, 1)), neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size,))
    loss = py - logz + _sum.sum(ny, axis=1)
    return -_sum.sum(loss) / batch_size
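# A hedged NumPy sketch (illustrative only, not library code) of the BlackOut
# objective above for one example: p is a softmax restricted to the target and
# the sampled negatives, and the loss is -log p(t) - sum_s log(1 - p(s)).
# The helper name and shapes are assumptions made for this example.
def _blackout_loss_single(x, t, W, samples):
    import numpy as np

    # x: (D,) input, t: int target id, W: (V, D) weights,
    # samples: (S,) array of negative sample ids.
    logits = np.concatenate(([W[t] @ x], W[samples] @ x))
    log_z = np.logaddexp.reduce(logits)
    log_p = logits - log_z                 # log-softmax over {t} and samples
    pos = log_p[0]                         # log p(t)
    neg = np.log1p(-np.exp(log_p[1:]))     # log(1 - p(s)) for each negative
    return -(pos + neg.sum())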
def __init__(self, p=None, **kwargs):
    logit = None
    if kwargs:
        logit, = argument.parse_kwargs(
            kwargs, ('logit', logit))
    if not (p is None) ^ (logit is None):
        raise ValueError(
            "Either `p` or `logit` (not both) must have a value.")

    with chainer.using_config('enable_backprop', True):
        if p is None:
            logit = chainer.as_variable(logit)
            self.__log_p = logit - expand_dims.expand_dims(
                logsumexp.logsumexp(logit, axis=-1), axis=-1)
            self.__p = exponential.exp(self.__log_p)
        else:
            self.__p = chainer.as_variable(p)
            self.__log_p = exponential.log(self.__p)
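# A small NumPy sketch (plain arrays instead of chainer Variables; the values
# are made up) of the normalization the constructor applies when `logit` is
# given: log_p is the log-softmax of logit along the last axis, and
# p = exp(log_p), so each row of p sums to one.
def _normalize_logit_example():
    import numpy as np

    logit = np.array([[1.0, 2.0, 0.5]])
    log_p = logit - np.logaddexp.reduce(logit, axis=-1, keepdims=True)
    p = np.exp(log_p)           # rows of p sum to one
    return p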
def crf1d(cost, xs, ys, reduce='mean'):
    """Calculates negative log-likelihood of linear-chain CRF.

    It takes a transition cost matrix, a sequence of costs, and a sequence of
    labels. Let :math:`c_{st}` be a transition cost from a label :math:`s` to
    a label :math:`t`, :math:`x_{it}` be a cost of a label :math:`t` at
    position :math:`i`, and :math:`y_i` be an expected label at position
    :math:`i`. The negative log-likelihood of linear-chain CRF is defined as

    .. math::
        L = -\\left( \\sum_{i=1}^l x_{iy_i} + \\
             \\sum_{i=1}^{l-1} c_{y_i y_{i+1}} - {\\log(Z)} \\right) ,

    where :math:`l` is the length of the input sequence and :math:`Z` is the
    normalizing constant called partition function.

    .. note::

       When you want to calculate the negative log-likelihood of sequences
       which have different lengths, sort the sequences in descending order
       of lengths and transpose the sequences. For example, you have three
       input sequences:

       >>> a1 = a2 = a3 = a4 = np.random.uniform(-1, 1, 3).astype(np.float32)
       >>> b1 = b2 = b3 = np.random.uniform(-1, 1, 3).astype(np.float32)
       >>> c1 = c2 = np.random.uniform(-1, 1, 3).astype(np.float32)

       >>> a = [a1, a2, a3, a4]
       >>> b = [b1, b2, b3]
       >>> c = [c1, c2]

       where ``a1`` and all other variables are arrays with ``(K,)`` shape.
       Make a transpose of the sequences:

       >>> x1 = np.stack([a1, b1, c1])
       >>> x2 = np.stack([a2, b2, c2])
       >>> x3 = np.stack([a3, b3])
       >>> x4 = np.stack([a4])

       and make a list of the arrays:

       >>> xs = [x1, x2, x3, x4]

       You need to make label sequences in the same fashion. And then, call
       the function:

       >>> cost = chainer.Variable(
       ...     np.random.uniform(-1, 1, (3, 3)).astype(np.float32))
       >>> ys = [np.zeros(x.shape[0:1], dtype=np.int32) for x in xs]
       >>> loss = F.crf1d(cost, xs, ys)

       It calculates mean of the negative log-likelihood of the three
       sequences.

    The output is a variable whose value depends on the value of the option
    ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If
    it is ``'mean'``, it holds mean of the loss values.

    Args:
        cost (Variable): A :math:`K \\times K` matrix which holds transition
            cost between two labels, where :math:`K` is the number of labels.
        xs (list of Variable): Input vector for each label. ``len(xs)``
            denotes the length of the sequence, and each
            :class:`~chainer.Variable` holds a :math:`B \\times K` matrix,
            where :math:`B` is mini-batch size, :math:`K` is the number of
            labels. Note that :math:`B`\\ s in all the variables are not
            necessarily the same, i.e., it accepts the input sequences with
            different lengths.
        ys (list of Variable): Expected output labels. It needs to have the
            same length as ``xs``. Each :class:`~chainer.Variable` holds a
            :math:`B` integer vector. When ``x`` in ``xs`` has a different
            :math:`B`, the corresponding ``y`` has the same :math:`B`. In
            other words, ``ys`` must satisfy
            ``ys[i].shape == xs[i].shape[0:1]`` for all ``i``.
        reduce (str): Reduction option. Its value must be either ``'mean'``
            or ``'no'``. Otherwise, :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable: A variable holding the average negative
        log-likelihood of the input sequences.

    .. note::

        See detail in the original paper: `Conditional Random Fields:
        Probabilistic Models for Segmenting and Labeling Sequence Data
        <https://repository.upenn.edu/cis_papers/159/>`_.

    """
    if reduce not in ('mean', 'no'):
        raise ValueError(
            "only 'mean' and 'no' are valid for 'reduce', but '%s' is "
            'given' % reduce)

    assert xs[0].shape[1] == cost.shape[0]

    n_label = cost.shape[0]
    n_batch = xs[0].shape[0]

    alpha = xs[0]
    alphas = []
    for x in xs[1:]:
        batch = x.shape[0]
        if alpha.shape[0] > batch:
            alpha, alpha_rest = split_axis.split_axis(alpha, [batch], axis=0)
            alphas.append(alpha_rest)
        b_alpha, b_cost = broadcast.broadcast(alpha[..., None], cost)
        alpha = logsumexp.logsumexp(b_alpha + b_cost, axis=1) + x

    if len(alphas) > 0:
        alphas.append(alpha)
        alpha = concat.concat(alphas[::-1], axis=0)

    logz = logsumexp.logsumexp(alpha, axis=1)

    cost = reshape.reshape(cost, (cost.size, 1))
    score = select_item.select_item(xs[0], ys[0])
    scores = []
    for x, y, y_prev in zip(xs[1:], ys[1:], ys[:-1]):
        batch = x.shape[0]
        if score.shape[0] > batch:
            y_prev, _ = split_axis.split_axis(y_prev, [batch], axis=0)
            score, score_rest = split_axis.split_axis(score, [batch], axis=0)
            scores.append(score_rest)
        score += (select_item.select_item(x, y) +
                  reshape.reshape(
                      embed_id.embed_id(y_prev * n_label + y, cost),
                      (batch,)))

    if len(scores) > 0:
        scores.append(score)
        score = concat.concat(scores[::-1], axis=0)

    loss = logz - score
    if reduce == 'mean':
        return _sum.sum(loss) / n_batch
    else:
        return loss
def black_out(x, t, W, samples, reduce='mean'):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is likelihood of a given label. And,
    :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    The output is a variable whose value depends on the value of the option
    ``reduce``. If it is ``'no'``, it holds the elementwise loss values. If
    it is ``'mean'``, this function takes a mean of loss values.

    Args:
        x (~chainer.Variable): Batch of input vectors. Its shape should be
            :math:`(N, D)`.
        t (~chainer.Variable): Vector of ground truth labels. Its shape
            should be :math:`(N,)`. Each element :math:`v` should satisfy
            :math:`0 \\leq v < V` or be :math:`-1`, where :math:`V` is the
            number of label types.
        W (~chainer.Variable): Weight matrix. Its shape should be
            :math:`(V, D)`.
        samples (~chainer.Variable): Negative samples. Its shape should be
            :math:`(N, S)` where :math:`S` is the number of negative samples.
        reduce (str): Reduction option. Its value must be either ``'no'`` or
            ``'mean'``. Otherwise, :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable:
            A variable object holding loss value(s).
            If ``reduce`` is ``'no'``, the output variable holds an array
            whose shape is :math:`(N,)`.
            If it is ``'mean'``, it holds a scalar.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With \
    Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """
    batch_size = x.shape[0]
    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.matmul(neg_emb, x[:, :, None])
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])
    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.matmul(pos_emb, x[:, :, None])
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(
        reshape.reshape(logz, (batch_size, 1)), neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size,))
    loss = -(py - logz + _sum.sum(ny, axis=1))
    if reduce == 'mean':
        loss = average.average(loss)
    return loss
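# A hedged usage sketch of the function above (shapes assumed for
# illustration: N=2 examples, D=3 features, V=5 labels, S=2 negatives per
# example).  Negative samples are typically drawn from a proposal
# distribution such as a unigram model and should not contain the target
# label of the same row.
def _black_out_usage_example():
    import numpy as np

    import chainer.functions as F

    x = np.random.uniform(-1, 1, (2, 3)).astype(np.float32)
    t = np.array([0, 2], dtype=np.int32)
    W = np.random.uniform(-1, 1, (5, 3)).astype(np.float32)
    samples = np.array([[1, 3], [0, 4]], dtype=np.int32)
    loss = F.black_out(x, t, W, samples, reduce='no')   # shape (2,)
    return loss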