def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    # KL(N1 || N2) expressed via the Cholesky factors of the covariances.
    scale_tril_inv2 = _batch_triangular_inv(
        dist2.scale_tril.reshape(-1, dist2.d, dist2.d))
    # tr(S2^{-1} S1) computed as the squared Frobenius norm of L2^{-1} L1.
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2,
        dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)
    # Squared Mahalanobis distance of the mean difference under N2.
    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)
    return dist2._logdet_scale - dist1._logdet_scale \
        + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
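# A minimal NumPy/SciPy reference for the closed form implemented above
# (illustration only; `kl_mvn_reference` is not part of the library):
#   KL(N1 || N2) = (log|det L2| - log|det L1|) + 0.5 * ||L2^{-1} L1||_F^2
#                  + 0.5 * ||L2^{-1} (mu1 - mu2)||^2 - 0.5 * d
import numpy as np
from scipy.linalg import solve_triangular


def kl_mvn_reference(mu1, L1, mu2, L2):
    d = mu1.shape[0]
    A = solve_triangular(L2, L1, lower=True)         # L2^{-1} L1
    m = solve_triangular(L2, mu1 - mu2, lower=True)  # L2^{-1} (mu1 - mu2)
    logdet1 = np.sum(np.log(np.abs(np.diag(L1))))    # log|det L1|
    logdet2 = np.sum(np.log(np.abs(np.diag(L2))))    # log|det L2|
    return (logdet2 - logdet1 + 0.5 * np.sum(A ** 2)
            + 0.5 * np.sum(m ** 2) - 0.5 * d)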
def log_prob(self, x):
    scale_tril_inv = _batch_triangular_inv(
        self.scale_tril.reshape(-1, self.d, self.d))
    scale_tril_inv = scale_tril_inv.reshape(
        self.batch_shape + (self.d, self.d))
    bsti = broadcast.broadcast_to(scale_tril_inv, x.shape + (self.d,))
    bl = broadcast.broadcast_to(self.loc, x.shape)
    # Squared Mahalanobis distance (x - loc)^T S^{-1} (x - loc),
    # computed as ||L^{-1} (x - loc)||^2.
    m = matmul.matmul(bsti, expand_dims.expand_dims(x - bl, axis=-1))
    m = matmul.matmul(swapaxes.swapaxes(m, -1, -2), m)
    m = squeeze.squeeze(m, axis=-1)
    m = squeeze.squeeze(m, axis=-1)
    # Log normalizer: LOGPROBC is -0.5 * log(2 * pi).
    logz = LOGPROBC * self.d - self._logdet(self.scale_tril)
    return broadcast.broadcast_to(logz, m.shape) - 0.5 * m
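# Hedged sanity check for the density above (illustration only, assuming
# LOGPROBC == -0.5 * log(2 * pi)): the same log density via scipy.stats.
import numpy as np
from scipy.stats import multivariate_normal

d = 3
rng = np.random.default_rng(0)
L = np.tril(rng.random((d, d))) + np.eye(d)   # a valid scale_tril
loc = rng.random(d)
x = rng.random(d)
ref = multivariate_normal.logpdf(x, mean=loc, cov=L @ L.T)
# ref should match log_prob(x) for a MultivariateNormal(loc, scale_tril=L).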
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    diag = diagonal.diagonal(dist1.scale_tril, axis1=-2, axis2=-1)
    logdet1 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)
    diag = diagonal.diagonal(dist2.scale_tril, axis1=-2, axis2=-1)
    logdet2 = sum_mod.sum(exponential.log(abs(diag)), axis=-1)
    scale_tril_inv2 = _batch_triangular_inv(
        dist2.scale_tril.reshape(-1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2,
        dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)
    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)
    return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
def encode_decode_train(self, in_word_list, out_word_list,
                        train=True, sample=False):
    xp = cuda.cupy if self.gpuid >= 0 else np
    self.reset_state()
    # Add GO_ID, EOS_ID to decoder input
    decoder_word_list = [GO_ID] + out_word_list + [EOS_ID]
    # encode list of words/tokens
    enc_states = self.encode_list(in_word_list, train=train)
    # initialize decoder LSTM to final encoder state
    self.set_decoder_state()
    # decode and compute loss
    # convert list of tokens into chainer variable list
    var_dec = Variable(
        xp.asarray(decoder_word_list, dtype=np.int32).reshape((-1, 1)),
        volatile=not train)
    # Initialise first decoded word to GO_ID
    pred_word = Variable(xp.asarray([GO_ID], dtype=np.int32),
                         volatile=not train)
    # compute loss
    self.loss = 0
    # decode tokens
    for next_word_var in var_dec[1:]:
        self.decode(pred_word, train=train)
        if self.attn == NO_ATTN:
            predicted_out = self.out(self[self.lstm_dec[-1]].h)
        else:
            '''
            __QUESTION Add attention
            '''
            prevh = self[self.lstm_dec[-1]].h
            alpha = F.softmax(matmul(prevh, enc_states, transb=True))
            ctxt = F.reshape(
                M.sum(F.scale(enc_states, F.transpose(alpha), axis=0),
                      axis=0),
                (1, 200))
            predicted_out = self.out(self.attn_out(F.concat(
                (ctxt, prevh))))
        # compute loss
        prob = F.softmax(predicted_out)
        pred_word = self.select_word(prob, train=train, sample=False)
        # pred_word = Variable(xp.asarray([pred_word.data],
        #                                 dtype=np.int32),
        #                      volatile=not train)
        '''
        ___QUESTION-1-DESCRIBE-E-START___
        Explain what loss is computed with an example.
        What does this value mean?

        The cross-entropy is a soft measure of how close the network got
        to the correct answer. Here it is used to find how close the
        predicted word (predicted_out) was to the expected word
        (next_word_var).
        '''
        self.loss += F.softmax_cross_entropy(predicted_out, next_word_var)
        '''___QUESTION-1-DESCRIBE-E-END___'''
    report({"loss": self.loss}, self)
    return self.loss
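# Hedged NumPy sketch of the dot-product attention in the branch above
# (illustration only; 200 is the hidden size this model appears to assume):
# alpha_j = softmax_j(h . e_j), ctxt = sum_j alpha_j * e_j.
import numpy as np

hidden = 200
enc_states = np.random.rand(7, hidden)           # one row per source token
prevh = np.random.rand(1, hidden)                # current decoder state
scores = prevh @ enc_states.T                    # (1, 7) dot-product scores
alpha = np.exp(scores) / np.exp(scores).sum()    # softmax over source tokens
ctxt = (alpha.T * enc_states).sum(axis=0, keepdims=True)  # (1, hidden)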
def _kl_multivariatenormal_multivariatenormal(dist1, dist2):
    st = moveaxis.moveaxis(dist1.scale_tril, (-2, -1), (0, 1))
    diag = st[list(range(dist1.d)), list(range(dist1.d))]
    logdet1 = sum_mod.sum(exponential.log(basic_math.absolute(diag)),
                          axis=0)
    st = moveaxis.moveaxis(dist2.scale_tril, (-2, -1), (0, 1))
    diag = st[list(range(dist2.d)), list(range(dist2.d))]
    logdet2 = sum_mod.sum(exponential.log(basic_math.absolute(diag)),
                          axis=0)
    scale_tril_inv2 = _batch_triangular_inv(
        dist2.scale_tril.reshape(-1, dist2.d, dist2.d))
    trace = sum_mod.sum(matmul.matmul(
        scale_tril_inv2,
        dist1.scale_tril.reshape(-1, dist2.d, dist2.d)) ** 2,
        axis=(-1, -2)).reshape(dist1.batch_shape)
    mu = dist1.loc - dist2.loc
    mah = matmul.matmul(scale_tril_inv2, mu.reshape(-1, dist1.d, 1))
    mah = sum_mod.sum(mah ** 2, axis=-2).reshape(dist1.batch_shape)
    return logdet2 - logdet1 + 0.5 * trace + 0.5 * mah - 0.5 * dist1.d
def sample_n(self, n):
    if self._is_gpu:
        eps = cuda.cupy.random.standard_normal(
            (n,) + self.loc.shape + (1,), dtype=self.loc.dtype)
    else:
        eps = numpy.random.standard_normal(
            (n,) + self.loc.shape + (1,)).astype(numpy.float32)
    return self.loc + squeeze.squeeze(
        matmul.matmul(self.scale_tril, eps), axis=-1)
def sample_n(self, n):
    if self._is_gpu:
        eps = cuda.cupy.random.standard_normal(
            (n,) + self.loc.shape + (1,), dtype=self.loc.dtype)
    else:
        eps = numpy.random.standard_normal(
            (n,) + self.loc.shape + (1,)).astype(numpy.float32)
    noise = matmul.matmul(repeat.repeat(
        expand_dims.expand_dims(self.scale_tril, axis=0), n, axis=0), eps)
    noise = squeeze.squeeze(noise, axis=-1)
    noise += repeat.repeat(expand_dims.expand_dims(
        self.loc, axis=0), n, axis=0)
    return noise
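# Both sample_n variants above draw via the reparameterization
# x = loc + L @ eps with eps ~ N(0, I). A minimal NumPy sketch
# (illustration only):
import numpy as np

d, n = 3, 5
rng = np.random.default_rng(0)
L = np.tril(rng.random((d, d))) + np.eye(d)   # scale_tril
loc = rng.random(d)
eps = rng.standard_normal((n, d, 1))
samples = loc + (L @ eps)[..., 0]             # n draws from N(loc, L L^T)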
def decoder_predict(self, start_word, enc_states,
                    max_predict_len=MAX_PREDICT_LEN, sample=False):
    xp = cuda.cupy if self.gpuid >= 0 else np

    # __QUESTION -- Following code is to assist with ATTENTION
    # alpha_arr should store the alphas for every predicted word
    alpha_arr = xp.empty((0, enc_states.shape[0]), dtype=xp.float32)

    # return list of predicted words
    predicted_sent = []
    # load start symbol
    pred_word = Variable(xp.asarray([start_word], dtype=np.int32),
                         volatile=True)
    pred_count = 0

    # start prediction loop
    while (pred_count < max_predict_len
           and int(pred_word.data) != EOS_ID):
        self.decode(pred_word, train=False)
        if self.attn == NO_ATTN:
            predicted_out = self.out(self[self.lstm_dec[-1]].h)
        else:
            '''
            __QUESTION Add attention
            '''
            prevh = self[self.lstm_dec[-1]].h
            alpha = F.softmax(matmul(prevh, enc_states, transb=True))
            ctxt = F.reshape(
                M.sum(F.scale(enc_states, F.transpose(alpha), axis=0),
                      axis=0),
                (1, 200))
            alpha_arr = xp.concatenate((alpha_arr, alpha.data))
            predicted_out = self.out(self.attn_out(F.concat(
                (ctxt, prevh))))
        prob = F.softmax(predicted_out)
        pred_word = self.select_word(prob, train=False, sample=sample)
        # add integer id of predicted word to output list
        predicted_sent.append(int(pred_word.data))
        pred_count += 1
    # __QUESTION Add attention
    # When implementing attention, make sure to use alpha_arr to store
    # your attention vectors. The visualisation function in
    # nmt_translate.py assumes such an array as input.
    return predicted_sent, alpha_arr
def deformable_convolution_2d_sampler(x, offset, W, b=None, stride=1, pad=0):
    """Two-dimensional deformable convolution function using computed offset.

    This is an implementation of two-dimensional deformable convolution from
    `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_.

    It takes four variables: the input image ``x``, the offset image
    ``offset``, the filter weight ``W``, and the bias vector ``b``.

    Notation: here is the notation for the dimensionalities.

    - :math:`n` is the batch size.
    - :math:`c_I` and :math:`c_O` are the number of the input and output
      channels, respectively.
    - :math:`h` and :math:`w` are the height and width of the input image,
      respectively.
    - :math:`k_H` and :math:`k_W` are the height and width of the filters,
      respectively.
    - :math:`s_Y` and :math:`s_X` are the strides of the filter.
    - :math:`p_H` and :math:`p_W` are the spatial padding sizes.

    The output size :math:`(h_O, w_O)` is determined by the following
    equations:

    .. math::

       h_O &= (h + 2p_H - k_H) / s_Y + 1,\\\\
       w_O &= (w + 2p_W - k_W) / s_X + 1.

    Args:
        x (~chainer.Variable): Input variable of shape :math:`(n, c_I, h, w)`.
        offset (~chainer.Variable): Offset variable of shape
            :math:`(n, 2 \\cdot k_H \\cdot k_W, h_O, w_O)`. The first
            :math:`k_H \\cdot k_W` index of the second axis corresponds to
            the offsets in the horizontal direction. The last
            :math:`k_H \\cdot k_W` index of the second axis corresponds to
            the offsets in the vertical direction.
        W (~chainer.Variable): Weight variable of shape
            :math:`(c_O, c_I, k_H, k_W)`.
        b (~chainer.Variable): Bias variable of length :math:`c_O` (optional).
        stride (int or pair of ints): Stride of filter applications.
            ``stride=s`` and ``stride=(s, s)`` are equivalent.
        pad (int or pair of ints): Spatial padding width for input arrays.
            ``pad=p`` and ``pad=(p, p)`` are equivalent.

    Returns:
        ~chainer.Variable: Output variable.

    Deformable convolution adds 2D offsets to the regular grid sampling
    locations in the standard convolution. It enables free form deformation
    of the sampling grid.

    See `Jifeng Dai, Haozhi Qi, Yuwen Xiong, Yi Li, Guodong Zhang, Han Hu, \
    Yichen Wei. Deformable Convolutional Networks\
    <https://arxiv.org/abs/1703.06211>`_

    If the bias vector is given, then it is added to all spatial locations
    of the output of convolution.

    .. seealso:: :class:`~chainer.links.DeformableConvolution2D`

    .. admonition:: Example

        >>> x = np.random.uniform(0, 1, (2, 3, 4, 7)).astype(np.float32)
        >>> offset = np.random.uniform(
        ...     0, 1, (2, 2 * 3 * 3, 2, 5)).astype(np.float32)
        >>> W = np.random.uniform(0, 1, (4, 3, 3, 3)).astype(np.float32)
        >>> b = np.random.uniform(0, 1, (4,)).astype(np.float32)
        >>> y = F.deformable_convolution_2d_sampler(x, offset, W, b)
        >>> y.shape
        (2, 4, 2, 5)

    """
    sy, sx = _pair(stride)
    ph, pw = _pair(pad)
    out_c, _, kh, kw = W.shape
    n, c, h, w = x.shape
    _, khkw2, out_h, out_w = offset.shape

    if khkw2 != 2 * kh * kw:
        raise ValueError(
            'The shape of the offset does not match the kernel size')

    grid = _offset2grid(offset, kh, kw, sy, sx, ph, pw, h, w)
    grid = grid.reshape(n, 2, kh * kw, out_h * out_w)
    x_pad = pad_module.pad(x, ((0, 0), (0, 0), (ph, ph), (pw, pw)),
                           'constant')
    x_st = spatial_transformer_sampler.spatial_transformer_sampler(
        x_pad, grid)

    x_st = x_st.transpose(0, 3, 1, 2).reshape(n * out_h * out_w, c * kh * kw)
    W = W.transpose(1, 2, 3, 0).reshape(c * kh * kw, out_c)
    y = matmul.matmul(x_st, W)

    y = y.reshape(n, out_h, out_w, out_c).transpose(0, 3, 1, 2)
    if b is not None:
        b = broadcast.broadcast_to(b[None, :, None, None], y.shape)
        y += b
    return y
def f_chainer(x, h):
    return reshape(
        matmul(h, reshape(x, (h.shape[1], int(x.size / h.shape[1])))),
        x.shape)
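# Hedged usage sketch (illustration only, assuming `matmul` and `reshape`
# above are chainer.functions.matmul / chainer.functions.reshape):
# f_chainer views x as an (h.shape[1], -1) matrix, applies h, and restores
# x's shape, so h must be square (h.shape[0] == h.shape[1]).
import numpy as np
from chainer.functions import matmul, reshape

x = np.arange(24, dtype=np.float32).reshape(2, 3, 4)  # x.size = 24
h = np.eye(4, dtype=np.float32)                       # identity: y == x
y = f_chainer(x, h)                                   # shape (2, 3, 4)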
def black_out(x, t, W, samples, reduce='mean'):
    """BlackOut loss function.

    BlackOut loss function is defined as

    .. math::

      -\\log(p(t)) - \\sum_{s \\in S} \\log(1 - p(s)),

    where :math:`t` is the correct label, :math:`S` is a set of negative
    examples and :math:`p(\\cdot)` is likelihood of a given label. And,
    :math:`p` is defined as

    .. math::

       p(y) = \\frac{\\exp(W_y^\\top x)}{
       \\sum_{s \\in samples} \\exp(W_s^\\top x)}.

    The output is a variable whose value depends on the value of the option
    ``reduce``. If it is ``'no'``, it holds the elementwise loss values.
    If it is ``'mean'``, this function takes a mean of loss values.

    Args:
        x (~chainer.Variable): Batch of input vectors.
            Its shape should be :math:`(N, D)`.
        t (~chainer.Variable): Vector of ground truth labels.
            Its shape should be :math:`(N,)`. Each element :math:`v`
            should satisfy :math:`0 \\leq v < V` or be :math:`-1`,
            where :math:`V` is the number of label types.
        W (~chainer.Variable): Weight matrix.
            Its shape should be :math:`(V, D)`.
        samples (~chainer.Variable): Negative samples.
            Its shape should be :math:`(N, S)` where :math:`S` is
            the number of negative samples.
        reduce (str): Reduction option. Its value must be either
            ``'no'`` or ``'mean'``. Otherwise, :class:`ValueError` is
            raised.

    Returns:
        ~chainer.Variable:
            A variable object holding loss value(s).
            If ``reduce`` is ``'no'``, the output variable holds an
            array whose shape is :math:`(N,)` .
            If it is ``'mean'``, it holds a scalar.

    See: `BlackOut: Speeding up Recurrent Neural Network Language Models With
    Very Large Vocabularies <https://arxiv.org/abs/1511.06909>`_

    .. seealso:: :class:`~chainer.links.BlackOut`.

    """
    batch_size = x.shape[0]
    neg_emb = embed_id.embed_id(samples, W)
    neg_y = matmul.matmul(neg_emb, x[:, :, None])
    neg_y = reshape.reshape(neg_y, neg_y.shape[:-1])
    pos_emb = expand_dims.expand_dims(embed_id.embed_id(t, W), 1)
    pos_y = matmul.matmul(pos_emb, x[:, :, None])
    pos_y = reshape.reshape(pos_y, pos_y.shape[:-1])

    logz = logsumexp.logsumexp(concat.concat([pos_y, neg_y]), axis=1)
    blogz, bneg_y = broadcast.broadcast(
        reshape.reshape(logz, (batch_size, 1)), neg_y)
    ny = exponential.log(1 - exponential.exp(bneg_y - blogz))
    py = reshape.reshape(pos_y, (batch_size,))
    loss = -(py - logz + _sum.sum(ny, axis=1))
    if reduce == 'mean':
        loss = average.average(loss)
    return loss
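# Hedged usage sketch for the loss above (illustration only; in user code
# this is exposed as chainer.functions.black_out): N=2 examples, D=4
# features, V=6 label types, S=3 negative samples per example.
import numpy as np

x = np.random.rand(2, 4).astype(np.float32)                 # (N, D)
t = np.array([1, 3], dtype=np.int32)                        # (N,)
W = np.random.rand(6, 4).astype(np.float32)                 # (V, D)
samples = np.array([[0, 2, 5], [0, 1, 4]], dtype=np.int32)  # (N, S)
loss = black_out(x, t, W, samples)                          # scalar mean loss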
def covariance(self):
    # Covariance from the Cholesky factor: S = L L^T.
    return matmul.matmul(
        self.scale_tril,
        transpose.transpose(
            self.scale_tril,
            tuple(range(len(self.batch_shape))) + (-1, -2)))
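# The property above is just L L^T with the trailing two axes transposed
# per batch; a one-line NumPy check (illustration only):
import numpy as np

L = np.tril(np.random.rand(3, 3)) + np.eye(3)   # scale_tril
cov = L @ L.T                                   # matches covariance()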