def __call__(self, hs, ys):
    """Compute the CTC loss for a batch of encoder outputs.

    Args:
        hs (list of chainer.Variable | N-dimension array): Per-utterance
            input from the encoder; axis 0 of each element is used as its
            input length.
        ys (list of chainer.Variable | N-dimension array): Per-utterance
            label sequences; axis 0 of each element is used as its label
            length.

    Returns:
        chainer.Variable: A variable holding a scalar value of the CTC loss.
            The same object is also stored in ``self.loss``.

    """
    self.loss = None
    tag = self.__class__.__name__

    # Remember the unpadded lengths before the sequences are padded.
    ilens = [h.shape[0] for h in hs]
    olens = [y.shape[0] for y in ys]

    # Pad hs (default padding value), apply dropout and the output layer,
    # then split the result along axis 1 into a sequence of slices.
    padded_hs = F.pad_sequence(hs)
    projected = linear_tensor(
        self.ctc_lo, F.dropout(padded_hs, ratio=self.dropout_rate))
    y_hat = F.separate(projected, axis=1)

    # Labels are padded with -1 instead of the default value.
    y_true = F.pad_sequence(ys, padding=-1)

    # Length information as int32 variables on the current array module.
    input_length = chainer.Variable(self.xp.array(ilens, dtype=np.int32))
    label_length = chainer.Variable(self.xp.array(olens, dtype=np.int32))
    logging.info(tag + ' input lengths: ' + str(input_length.data))
    logging.info(tag + ' output lengths: ' + str(label_length.data))

    # The third positional argument (0) is the blank symbol index.
    loss = F.connectionist_temporal_classification(
        y_hat, y_true, 0, input_length, label_length)
    self.loss = loss
    logging.info('ctc loss:' + str(self.loss.data))

    return self.loss
def __call__(self, hs, ys):
    """Compute the CTC loss with the warp-ctc backend.

    Args:
        hs (iterable of chainer.Variable | N-dimension array): Per-utterance
            input from the encoder; axis 0 of each element is used as its
            input length.
        ys (iterable of chainer.Variable | N-dimension array): Per-utterance
            label sequences. Each element's ``.data`` is copied to the CPU
            before being handed to warp-ctc.

    Returns:
        chainer.Variable: A variable holding a scalar value of the CTC loss.
            The same object is also stored in ``self.loss``.

    """
    self.loss = None
    tag = self.__class__.__name__

    # Remember the unpadded lengths before the sequences are padded.
    ilens = [h.shape[0] for h in hs]
    olens = [y.shape[0] for y in ys]

    # Pad hs (default padding value), apply dropout and the output layer.
    padded_hs = F.pad_sequence(hs)
    projected = linear_tensor(
        self.ctc_lo, F.dropout(padded_hs, ratio=self.dropout_rate))
    # Swap the first two axes. NOTE(review): pad_sequence output is
    # batch-major, so this is presumably frames x batch x odim -- confirm.
    y_hat = F.transpose(projected, (1, 0, 2))

    logging.info(tag + ' input lengths: ' + str(ilens))
    logging.info(tag + ' output lengths: ' + str(olens))

    # Imported lazily so chainer_ctc is only required when this layer runs.
    from chainer_ctc.warpctc import ctc as warp_ctc
    cpu_labels = [cuda.to_cpu(y.data) for y in ys]
    # Only the first element of the warp-ctc result is kept as the loss.
    self.loss = warp_ctc(y_hat, ilens, cpu_labels)[0]
    logging.info('ctc loss:' + str(self.loss.data))

    return self.loss
def __call__(self, hs, ys):
    """Run the warp-ctc forward pass and return the CTC loss.

    Args:
        hs: Iterable of per-utterance encoder outputs; ``shape[0]`` of each
            element is taken as its input length.
        ys: Iterable of per-utterance label sequences; each element's
            ``.data`` is moved to the CPU before the loss is computed.

    Returns:
        The first element of the warp-ctc result, which is also stored in
        ``self.loss``.

    """
    self.loss = None
    layer_name = self.__class__.__name__

    # Unpadded sequence lengths, taken before any padding happens.
    ilens = [enc.shape[0] for enc in hs]
    olens = [lab.shape[0] for lab in ys]

    # Pad the encoder outputs, then apply dropout and the output layer.
    dropped = F.dropout(F.pad_sequence(hs), ratio=self.dropout_rate)
    logits = linear_tensor(self.ctc_lo, dropped)
    # Exchange axes 0 and 1. NOTE(review): presumably this yields
    # frames x batch x odim, since pad_sequence is batch-major -- confirm.
    y_hat = F.transpose(logits, (1, 0, 2))

    logging.info(layer_name + ' input lengths: ' + str(ilens))
    logging.info(layer_name + ' output lengths: ' + str(olens))

    # Function-scope import: chainer_ctc is needed only on this code path.
    from chainer_ctc.warpctc import ctc as warp_ctc
    targets = [cuda.to_cpu(lab.data) for lab in ys]
    result = warp_ctc(y_hat, ilens, targets)
    self.loss = result[0]
    logging.info('ctc loss:' + str(self.loss.data))

    return self.loss
def log_softmax(self, hs):
    """Return the log-softmax of the frame activations.

    Args:
        hs: Sequence of per-utterance encoder outputs; they are padded
            into a single batch before the output layer is applied.

    Returns:
        The log-softmax values, reshaped back to the shape of the
        output-layer activations.

    """
    activations = linear_tensor(self.ctc_lo, F.pad_sequence(hs))
    full_shape = activations.shape

    # Flatten every leading axis so log_softmax sees a 2-D array whose
    # second axis is the last axis of the activations.
    flat = activations.reshape(-1, full_shape[-1])
    return F.log_softmax(flat).reshape(full_shape)
def log_softmax(self, hs):
    """Compute the log-softmax of frame activations.

    Args:
        hs (list of chainer.Variable | N-dimension array): Input variable
            from encoder. The elements are padded into one batch before
            the output layer is applied.

    Returns:
        chainer.Variable: Log-softmax values with the same shape as the
            output-layer activations.

    """
    padded_hs = F.pad_sequence(hs)
    y_hat = linear_tensor(self.ctc_lo, padded_hs)

    # Collapse all leading axes, normalise over the last one, then restore
    # the original shape.
    last_dim = y_hat.shape[-1]
    log_probs = F.log_softmax(y_hat.reshape(-1, last_dim))
    return log_probs.reshape(y_hat.shape)
def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
    """Compute the dot-product attention (AttDot) forward pass.

    Args:
        enc_hs (chainer.Variable | N-dimensional array): Input variable
            from encoder. Only consulted for padding/projection while
            ``self.pre_compute_enc_h`` is None; its length gives the
            batch size.
        dec_z (chainer.Variable | N-dimensional array): Input variable of
            decoder. When None, a float32 zero state of shape
            (batch, dunits) is used instead.
        att_prev: Not used by this method.
        scaling (float): Scaling weight to make attention sharp.

    Returns:
        chainer.Variable: Weighted sum over frames.
        chainer.Variable: Attention weight.

    """
    batch = len(enc_hs)

    # The encoder-side terms are computed once and cached on the instance,
    # so repeated calls from the decoder loop reuse them.
    if self.pre_compute_enc_h is None:
        self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
        self.h_length = self.enc_h.shape[1]
        # utt x frame x att_dim
        self.pre_compute_enc_h = F.tanh(
            linear_tensor(self.mlp_enc, self.enc_h))

    if dec_z is not None:
        dec_z = F.reshape(dec_z, (batch, self.dunits))
    else:
        dec_z = chainer.Variable(
            self.xp.zeros((batch, self.dunits), dtype=np.float32))

    # <phi (h_t), psi (s)> for all t
    dec_proj = F.expand_dims(F.tanh(self.mlp_dec(dec_z)), 1)
    u = F.broadcast_to(dec_proj, self.pre_compute_enc_h.shape)
    e = F.sum(self.pre_compute_enc_h * u, axis=2)  # utt x frame

    # NOTE: padded frames are deliberately not masked before the softmax;
    # the original author reports that forcing their probability to zero
    # with a large negative value degraded performance and abandoned it.
    # The energies are scaled to sharpen the attention.
    w = F.softmax(scaling * e)

    # Weighted sum over frames: utt x hdim
    w_tiled = F.broadcast_to(F.expand_dims(w, 2), self.enc_h.shape)
    c = F.sum(self.enc_h * w_tiled, axis=1)

    return c, w
def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
    """Compute the location-aware attention (AttLoc) forward pass.

    Args:
        enc_hs (chainer.Variable | N-dimensional array): Input variable
            from encoders. Its length gives the batch size; its elements
            are padded and projected while ``self.pre_compute_enc_h`` is
            None.
        dec_z (chainer.Variable | N-dimensional array): Input variable of
            decoder. When None, a float32 zero state of shape
            (batch, dunits) is used instead.
        att_prev (chainer.Variable | None): Attention weight. When None,
            it is initialised to a uniform distribution over each
            utterance's own frames and padded.
        scaling (float): Scaling weight to make attention sharp.

    Returns:
        chainer.Variable: Weighted sum over frames.
        chainer.Variable: Attention weight.

    """
    batch = len(enc_hs)

    # The encoder-side terms are computed once and cached on the instance,
    # so repeated calls from the decoder loop reuse them.
    if self.pre_compute_enc_h is None:
        self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
        self.h_length = self.enc_h.shape[1]
        # utt x frame x att_dim
        self.pre_compute_enc_h = linear_tensor(self.mlp_enc, self.enc_h)

    if dec_z is not None:
        dec_z = F.reshape(dec_z, (batch, self.dunits))
    else:
        dec_z = chainer.Variable(
            self.xp.zeros((batch, self.dunits), dtype=np.float32))

    # Initialise the attention weight with a uniform distribution over
    # each utterance's frames, then pad to a common length.
    if att_prev is None:
        uniform = [
            chainer.Variable(
                self.xp.full(
                    hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32))
            for hh in enc_hs
        ]
        att_prev = F.pad_sequence(uniform)

    # TODO(watanabe) use <chainer variable>.reshape(), instead of F.reshape()
    # att_prev: utt x frame -> utt x 1 x 1 x frame
    #           -> utt x att_conv_chans x 1 x frame
    att_prev_4d = F.reshape(att_prev, (batch, 1, 1, self.h_length))
    att_conv = self.loc_conv(att_prev_4d)
    # att_conv: utt x att_conv_chans x 1 x frame
    #           -> utt x frame x att_conv_chans
    att_conv = F.swapaxes(F.squeeze(att_conv, axis=2), 1, 2)
    # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
    att_conv = linear_tensor(self.mlp_att, att_conv)

    # dec_z_tiled: utt x frame x att_dim
    dec_proj = F.expand_dims(self.mlp_dec(dec_z), 1)
    dec_z_tiled = F.broadcast_to(dec_proj, self.pre_compute_enc_h.shape)

    # dot with gvec
    # utt x frame x att_dim -> utt x frame
    # TODO(watanabe) use batch_matmul
    combined = F.tanh(att_conv + self.pre_compute_enc_h + dec_z_tiled)
    e = F.squeeze(linear_tensor(self.gvec, combined), axis=2)

    # NOTE: padded frames are deliberately not masked before the softmax;
    # the original author reports that forcing their probability to zero
    # with a large negative value degraded performance and abandoned it.
    # The energies are scaled to sharpen the attention.
    w = F.softmax(scaling * e)

    # Weighted sum over frames: utt x hdim
    w_tiled = F.broadcast_to(F.expand_dims(w, 2), self.enc_h.shape)
    c = F.sum(self.enc_h * w_tiled, axis=1)

    return c, w