def __call__(self, trainer):
    with chainer.no_backprop_mode():
        references = []
        hypotheses = []
        use_data = random.sample(self.test_data, self.batch)

        if self.device >= 0:
            sources = [cuda.cupy.asarray(x[1]) for x, _ in use_data]
            targets = [cuda.cupy.asarray(y[1]) for _, y in use_data]
            # sourcePersona = [cuda.cupy.asarray(x[0]) for x, _ in use_data]  # unused for now; may be needed later
            targetPersona = [cuda.cupy.asarray(y[0]) for _, y in use_data]
        else:
            sources = [x[1] for x, _ in use_data]
            targets = [y[1] for _, y in use_data]
            sourcePersona = [x[0] for x, _ in use_data]
            targetPersona = [y[0] for _, y in use_data]

        sources = F.pad_sequence(sources, loadData.LoadData.maxlen, -1)
        targets = F.pad_sequence(targets, loadData.LoadData.maxlen, -1)
        references.extend([[t.tolist()] for t in targets.data])

        ys = [y.tolist() for y in self.model.predict(sources.data, targetPersona)]  # batch_size
        hypotheses.extend(ys)

    bleu = bleu_score.corpus_bleu(
        references, hypotheses,
        smoothing_function=bleu_score.SmoothingFunction().method1)
    chainer.report({self.key: bleu})
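# The snippets on this page all revolve around chainer.functions.pad_sequence,
# which stacks variable-length arrays into a single (batch, max_len, ...) Variable.
# A minimal standalone sketch of that behaviour (the toy arrays below are
# illustrative and not part of the snippet above):
import numpy as np
import chainer.functions as F

xs = [np.array([1, 2, 3], dtype=np.int32),
      np.array([4, 5], dtype=np.int32)]
padded = F.pad_sequence(xs, padding=-1)  # Variable of shape (2, 3)
print(padded.array)
# [[ 1  2  3]
#  [ 4  5 -1]]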
def __call__(self, enc_hs, dec_z, att_prev):
    """Compute NoAtt forward layer.

    Args:
        enc_hs (chainer.Variable | N-dimensional array): Input variable from encoders.
        dec_z: Dummy.
        att_prev (chainer.Variable | None): Attention weight.

    Returns:
        chainer.Variable: Sum over frames.
        chainer.Variable: Attention weight.

    """
    # pre-compute all h outside the decoder loop
    if self.pre_compute_enc_h is None:
        self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
        self.h_length = self.enc_h.shape[1]

    # initialize attention weight with uniform dist.
    if att_prev is None:
        att_prev = [
            self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
            for hh in enc_hs
        ]
        att_prev = [chainer.Variable(att) for att in att_prev]
        att_prev = F.pad_sequence(att_prev)
        self.c = F.sum(
            self.enc_h
            * F.broadcast_to(F.expand_dims(att_prev, 2), self.enc_h.shape),
            axis=1,
        )

    return self.c, att_prev
def vectorise_stories(encoded_stories):
    """Given a list of encoded stories, vectorise them with padding."""
    # Vectorise stories
    vctx = np.zeros(
        (len(encoded_stories),
         max([len(s['context']) for s in encoded_stories]),
         max([len(rule) for s in encoded_stories for rule in s['context']]),
         max([len(pred) for s in encoded_stories
              for rule in s['context'] for pred in rule])),
        dtype=np.int32)  # (B, R, P, C)
    vq = F.pad_sequence([
        np.array(s['query'], dtype=np.int32) for s in encoded_stories
    ]).array  # (B, Q)
    vas = np.array([s['answers'] for s in encoded_stories], dtype=np.int32)  # (B,)
    supps = F.pad_sequence(
        [np.array(s['supps'], dtype=np.int32) for s in encoded_stories],
        padding=-1).array  # (B, I)
    for i, s in enumerate(encoded_stories):
        for j, rule in enumerate(s['context']):
            for k, pred in enumerate(rule):
                vctx[i, j, k, :len(pred)] = np.array(pred, dtype=np.int32)
        if DEEPLOGIC:
            perm = np.random.permutation(len(s['context']))
            vctx[i, :len(s['context'])] = vctx[i, perm]
            for j, supp in enumerate(supps[i]):
                if supp != -1:
                    supps[i, j] = np.argmax(perm == supp)
    return vctx, vq, vas, supps
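# A toy input for vectorise_stories above, to make the expected dictionary layout
# concrete (assumes the module-level DEEPLOGIC flag is False; the ids are arbitrary):
story = {'context': [[[1, 2], [3]], [[4, 5, 6]]],  # 2 rules with 2 and 1 predicates
         'query': [7, 8],
         'answers': 1,
         'supps': [0]}
vctx, vq, vas, supps = vectorise_stories([story])
# vctx.shape == (1, 2, 2, 3), vq.shape == (1, 2),
# vas.shape == (1,), supps.shape == (1, 1)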
def __call__(self, hs, ys):
    """CTC forward.

    Args:
        hs (list of chainer.Variable | N-dimension array): Input variable from encoder.
        ys (list of chainer.Variable | N-dimension array): Input variable of decoder.

    Returns:
        chainer.Variable: A variable holding a scalar value of the CTC loss.

    """
    self.loss = None
    ilens = [x.shape[0] for x in hs]
    olens = [x.shape[0] for x in ys]

    # zero padding for hs
    y_hat = linear_tensor(self.ctc_lo, F.dropout(
        F.pad_sequence(hs), ratio=self.dropout_rate))
    y_hat = F.separate(y_hat, axis=1)  # ilen list of batch x hdim

    # zero padding for ys
    y_true = F.pad_sequence(ys, padding=-1)  # batch x olen

    # get length info
    input_length = chainer.Variable(self.xp.array(ilens, dtype=np.int32))
    label_length = chainer.Variable(self.xp.array(olens, dtype=np.int32))
    logging.info(self.__class__.__name__ + ' input lengths: ' + str(input_length.data))
    logging.info(self.__class__.__name__ + ' output lengths: ' + str(label_length.data))

    # get ctc loss
    self.loss = F.connectionist_temporal_classification(
        y_hat, y_true, 0, input_length, label_length)
    logging.info('ctc loss:' + str(self.loss.data))

    return self.loss
def test_ctc_loss():
    pytest.importorskip("torch")
    pytest.importorskip("warpctc_pytorch")
    import torch
    import warpctc_pytorch

    from espnet.nets.e2e_asr_th import pad_list

    n_out = 7
    input_length = numpy.array([11, 17, 15], dtype=numpy.int32)
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    np_pred = [
        numpy.random.rand(il, n_out).astype(numpy.float32)
        for il in input_length
    ]
    np_target = [
        numpy.random.randint(0, n_out, size=ol, dtype=numpy.int32)
        for ol in label_length
    ]

    # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr.py
    ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2)
    ch_target = F.pad_sequence(np_target, padding=-1)
    ch_loss = F.connectionist_temporal_classification(
        ch_pred, ch_target, 0, input_length, label_length).data

    th_pred = pad_list([torch.from_numpy(x) for x in np_pred], 0.0).transpose(0, 1)
    th_target = torch.from_numpy(numpy.concatenate(np_target))
    th_ilen = torch.from_numpy(input_length)
    th_olen = torch.from_numpy(label_length)
    th_loss = warpctc_pytorch.CTCLoss(size_average=True)(
        th_pred, th_target, th_ilen, th_olen).data.numpy()[0]

    numpy.testing.assert_allclose(th_loss, ch_loss, 0.05)
def __call__(self, hx, cx, xs, enc_hs):
    xs_embed = [self.embed(x) for x in xs]
    hy, cy, ys = self.Nlstm(hx, cx, xs_embed)

    ys_pad = F.pad_sequence(ys, length=None, padding=0.0)
    enc_hs = F.pad_sequence(enc_hs, length=None, padding=0.0)

    mask = self.xp.all(enc_hs.data == 0, axis=2, keepdims=True)
    mask_num = self.xp.full(mask.shape, -1024.0, dtype=self.xp.float32)

    alignment = []
    decode = []

    ys_pad = F.transpose(ys_pad, (1, 0, 2))
    for y in ys_pad:
        y = F.reshape(y, (*y.shape, 1))
        score = F.matmul(enc_hs, y)
        score = F.where(mask, mask_num, score)
        align = F.softmax(score, axis=1)
        context_vector = F.matmul(enc_hs, align, True, False)
        t = self.W_c(
            F.dropout(F.concat((y, context_vector), axis=1), self.dropout))
        ys_proj = self.proj(F.dropout(t, self.dropout))
        alignment.append(F.reshape(align, (len(xs), -1)))
        decode.append(ys_proj)

    decode = F.stack(decode, axis=1)
    alignment = F.stack(alignment, axis=1)
    return hy, cy, decode, alignment.data
def test_ctc_loss():
    pytest.importorskip("torch")
    pytest.importorskip("warpctc_pytorch")
    import torch
    from warpctc_pytorch import CTCLoss

    from e2e_asr_attctc_th import pad_list

    n_out = 7
    n_batch = 3
    input_length = numpy.array([11, 17, 15], dtype=numpy.int32)
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    np_pred = [numpy.random.rand(il, n_out).astype(
        numpy.float32) for il in input_length]
    np_target = [numpy.random.randint(
        0, n_out, size=ol, dtype=numpy.int32) for ol in label_length]

    # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr_attctc.py
    ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2)
    ch_target = F.pad_sequence(np_target, padding=-1)
    ch_loss = F.connectionist_temporal_classification(
        ch_pred, ch_target, 0, input_length, label_length).data

    th_pred = pad_list([torch.autograd.Variable(torch.from_numpy(x))
                        for x in np_pred]).transpose(0, 1)
    th_target = torch.autograd.Variable(
        torch.from_numpy(numpy.concatenate(np_target)))
    th_ilen = torch.autograd.Variable(torch.from_numpy(input_length))
    th_olen = torch.autograd.Variable(torch.from_numpy(label_length))
    # NOTE: warpctc_pytorch.CTCLoss does not normalize itself by batch-size
    # while chainer's default setting does
    th_loss = (CTCLoss()(th_pred, th_target, th_ilen, th_olen)
               / n_batch).data.numpy()[0]

    numpy.testing.assert_allclose(th_loss, ch_loss, 0.05)
def forward(self, xs1, xs2):
    # padding inputs
    x1 = F.pad_sequence(xs1, padding=-1)
    x2 = F.pad_sequence(xs2, padding=-1)

    # word idx -> word vector
    ex1 = F.dropout(self.embed(x1), self.dropout)
    ex2 = F.dropout(self.embed(x2), self.dropout)

    # parameters for this mini-batch
    batch_size = len(xs1)
    row, column = ex1.shape[1], ex2.shape[1]
    utils = Utils(self.out_units, batch_size, column, row, self.xp,
                  self.minute_num)
    m1, m2 = utils.masking(ex1), utils.masking(ex2)
    mean = utils.kernel_shaping(self.means)
    variance = utils.kernel_shaping(self.variances)

    # cross match and kernel pooling
    h, mask = utils.cross_match(ex1, ex2, m1, m2)
    h = utils.kernel_pooling(h, mask, mean, variance)

    # calculate ranking score
    h = self.liner(h)
    h = F.leaky_relu(h)
    return h
def __call__(self, enc_hs, dec_z, att_prev):
    '''NoAtt forward

    :param enc_hs:
    :param dec_z: dummy
    :param att_prev:
    :return:
    '''
    # pre-compute all h outside the decoder loop
    if self.pre_compute_enc_h is None:
        self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
        self.h_length = self.enc_h.shape[1]

    # initialize attention weight with uniform dist.
    if att_prev is None:
        att_prev = [
            self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
            for hh in enc_hs
        ]
        att_prev = [chainer.Variable(att) for att in att_prev]
        att_prev = F.pad_sequence(att_prev)
        self.c = F.sum(
            self.enc_h
            * F.broadcast_to(F.expand_dims(att_prev, 2), self.enc_h.shape),
            axis=1)

    return self.c, att_prev
def __call__(self, hs, ys):
    '''CTC forward

    :param hs:
    :param ys:
    :return:
    '''
    self.loss = None
    ilens = [x.shape[0] for x in hs]
    olens = [x.shape[0] for x in ys]

    # zero padding for hs
    y_hat = linear_tensor(self.ctc_lo, F.dropout(
        F.pad_sequence(hs), ratio=self.dropout_rate))
    y_hat = F.separate(y_hat, axis=1)  # ilen list of batch x hdim

    # zero padding for ys
    y_true = F.pad_sequence(ys, padding=-1)  # batch x olen

    # get length info
    input_length = chainer.Variable(self.xp.array(ilens, dtype=np.int32))
    label_length = chainer.Variable(self.xp.array(olens, dtype=np.int32))
    logging.info(self.__class__.__name__ + ' input lengths: ' + str(input_length.data))
    logging.info(self.__class__.__name__ + ' output lengths: ' + str(label_length.data))

    # get ctc loss
    self.loss = F.connectionist_temporal_classification(
        y_hat, y_true, 0, input_length, label_length)
    logging.info('ctc loss:' + str(self.loss.data))

    return self.loss
def forward(self, enc_hs, dec_z, att_prev):
    '''AttLoc forward

    :param enc_hs:
    :param dec_z:
    :param att_prev:
    :param scaling:
    :return:
    '''
    # EDIT(hamaji): scaling is now a local variable.
    scaling = 2.0
    batch = len(enc_hs)

    # pre-compute all h outside the decoder loop
    if self.pre_compute_enc_h is None:
        self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
        self.h_length = self.enc_h.shape[1]
        # utt x frame x att_dim
        self.pre_compute_enc_h = linear_tensor_3d(self.mlp_enc, self.enc_h)

    if dec_z is None:
        dec_z = chainer.Variable(self.xp.zeros(
            (batch, self.dunits), dtype=np.float32))
    else:
        dec_z = F.reshape(dec_z, (batch, self.dunits))

    # initialize attention weight with uniform dist.
    if att_prev is None:
        att_prev = [self.xp.full(
            hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32) for hh in enc_hs]
        att_prev = [chainer.Variable(att) for att in att_prev]
        att_prev = F.pad_sequence(att_prev)

    # TODO(watanabe) use <chainer variable>.reshape(), instead of F.reshape()
    # att_prev: utt x frame -> utt x 1 x 1 x frame -> utt x att_conv_chans x 1 x frame
    att_conv = self.loc_conv(
        F.reshape(att_prev, (batch, 1, 1, self.h_length)))
    # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
    att_conv = F.swapaxes(F.squeeze(att_conv, axis=2), 1, 2)
    # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
    att_conv = linear_tensor_3d(self.mlp_att, att_conv)

    # dec_z_tiled: utt x frame x att_dim
    dec_z_tiled = F.broadcast_to(
        F.expand_dims(self.mlp_dec(dec_z), 1), self.pre_compute_enc_h.shape)

    # dot with gvec
    # utt x frame x att_dim -> utt x frame
    # TODO(watanabe) use batch_matmul
    e = F.squeeze(linear_tensor_3d(self.gvec, F.tanh(
        att_conv + self.pre_compute_enc_h + dec_z_tiled)), axis=2)

    # Applying a minus-large-number filter to make a probability value zero for a padded area
    # simply degrades the performance, so that implementation was abandoned.

    # Apply scaling to sharpen the attention
    w = F.softmax(scaling * e)

    # weighted sum over frames
    # utt x hdim
    c = F.sum(self.enc_h * F.broadcast_to(F.expand_dims(w, 2), self.enc_h.shape), axis=1)

    return c, w
def __call__(self, hs, ht, x_list):
    '''
    Compute the attention.
    :param hs : list of Encoder hidden vectors
    :param ht : list of Decoder hidden vectors
    :return : hidden vector ht
    '''
    batch_size = len(x_list)
    ht = F.reshape(ht, (batch_size, 1, self.hidden_size))
    h = []
    for i in range(batch_size):
        h.append(Variable((hs[i].data * ht[i].data)))
    concat_h = F.concat(h, axis=0)
    attn = self.eh(concat_h)
    sections = np.cumsum([len(x) for x in x_list])
    split_attention = F.split_axis(attn, sections[:-1], axis=0)
    split_attention_pad = F.pad_sequence(split_attention, padding=-1024.)
    attn_softmax = F.softmax(split_attention_pad, axis=1)
    hs_pad = F.pad_sequence(hs)
    hs_pad_reshape = F.reshape(hs_pad, (-1, hs_pad.shape[-1]))
    r = F.reshape(attn_softmax, (-1, attn_softmax.shape[-1]))
    attn_softmax_reshape = F.broadcast_to(
        F.reshape(attn_softmax, (-1, attn_softmax.shape[-1])),
        hs_pad_reshape.shape)
    attention_hidden = hs_pad_reshape * attn_softmax_reshape
    attention_hidden_reshape = F.reshape(
        attention_hidden, (batch_size, -1, attention_hidden.shape[-1]))
    result = F.sum(attention_hidden_reshape, axis=1)
    '''
    # drop the first dimension of h
    h = Variable(h.data[0])
    batch_size = h.data.shape[0]
    # list for recording the weights
    ws = []
    # initialize the sum of the weights
    sum_w = Variable(self.ARR.zeros((batch_size, 1), dtype='float32'))
    # compute the weights from the Encoder and Decoder hidden vectors
    for e in es:
        print(e.shape)
        print(h.shape)
        # compute the weight
        w = F.tanh(self.eh(e) + self.dh(h))
        # normalize
        w = F.exp(self.hw(w))
        # record it
        ws.append(w)
        sum_w += w
    # initialize the weighted-average vector
    att = Variable(self.ARR.zeros(
        (1, batch_size, self.hidden_size), dtype='float32'))
    for e, w in zip(es, ws):
        # normalize the weight
        w /= sum_w
        # add weight * Encoder hidden vector to the output vector
        att += F.reshape(F.batch_matmul(e, w),
                         (1, batch_size, self.hidden_size))
    '''
    return F.reshape(result, (1, result.shape[0], result.shape[1]))
def batch_pit_loss_faster(ys, ts, label_delay=0):
    """
    PIT loss over mini-batch.

    Args:
      ys: B-length list of predictions
      ts: B-length list of labels

    Returns:
      loss: (1,)-shape mean cross entropy over mini-batch
      labels: B-length list of permuted labels
    """
    n_speakers = ts[0].shape[1]
    xp = chainer.backend.get_array_module(ys[0])
    # (B, T, C)
    ys = F.pad_sequence(ys, padding=-1)

    losses = []
    for shift in range(n_speakers):
        # rolled along with speaker-axis
        ts_roll = [xp.roll(t, -shift, axis=1) for t in ts]
        ts_roll = F.pad_sequence(ts_roll, padding=-1)
        # loss: (B, T, C)
        loss = F.sigmoid_cross_entropy(ys, ts_roll, reduce='no')
        # sum over time: (B, C)
        loss = F.sum(loss, axis=1)
        losses.append(loss)

    # losses: (B, C, C)
    losses = F.stack(losses, axis=2)
    # losses[b, i, j] is a loss between
    # `i`-th speaker in y and `(i+j)%C`-th speaker in t

    perms = xp.array(
        list(permutations(range(n_speakers))),
        dtype='i',
    )
    # y_inds: [0,1,2,3]
    y_ind = xp.arange(n_speakers, dtype='i')
    # perms -> relation to t_inds -> t_inds
    # 0,1,2,3 -> 0+j=0,1+j=1,2+j=2,3+j=3 -> 0,0,0,0
    # 0,1,3,2 -> 0+j=0,1+j=1,2+j=3,3+j=2 -> 0,0,1,3
    t_inds = xp.mod(perms - y_ind, n_speakers)

    losses_perm = []
    for t_ind in t_inds:
        losses_perm.append(F.mean(losses[:, y_ind, t_ind], axis=1))
    # losses_perm: (B, Perm)
    losses_perm = F.stack(losses_perm, axis=1)

    min_loss = F.sum(F.min(losses_perm, axis=1))
    n_frames = np.sum([t.shape[0] for t in ts])
    min_loss = min_loss / n_frames

    min_indices = xp.argmin(losses_perm.array, axis=1)
    labels_perm = [t[:, perms[idx]] for t, idx in zip(ts, min_indices)]

    return min_loss, labels_perm
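# A usage sketch for batch_pit_loss_faster, assuming the surrounding module already
# imports numpy as np, chainer.functions as F and itertools.permutations, and runs a
# Chainer version whose Variable indexing supports the integer-array lookup used above.
# Toy batch of two utterances with two speakers:
ys_toy = [np.random.randn(10, 2).astype(np.float32),  # frame-wise logits, 10 frames
          np.random.randn(8, 2).astype(np.float32)]   # second utterance, 8 frames
ts_toy = [np.random.randint(0, 2, size=(10, 2)).astype(np.int32),
          np.random.randint(0, 2, size=(8, 2)).astype(np.int32)]
loss, labels = batch_pit_loss_faster(ys_toy, ts_toy)
# loss is a scalar Variable; labels[i] is ts_toy[i] with its speaker columns permuted
# into the ordering that minimises the cross entropy.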
def get_batch(self, batch_size, set_key, train, labels=False):
    xp = cuda.cupy if self.gpuid >= 0 else np
    batches = []
    num_b = self.buckets[set_key]["num_b"]
    width_b = self.buckets[set_key]["width_b"]
    max_sp = (num_b + 1) * width_b

    if labels:
        dec_key = self.data_cfg["dec_key"]
        max_pred = self.data_cfg["max_pred"]

    for b, bucket in enumerate(self.buckets[set_key]["buckets"]):
        # Shuffle utterances in a bucket
        random.shuffle(bucket)
        for i in range(0, len(bucket), batch_size):
            # append utterances, and the width of the current batch
            # width of 10, implies 10 speech frames = 10 * 10ms = 100ms
            batches.append((bucket[i:i + batch_size], (b + 1) * width_b))
        # end for
    # Shuffle all the batches
    random.shuffle(batches)

    # Generator for batches
    for (utts, b) in batches:
        batch_data = {"X": [], "utts": []}
        if labels:
            batch_data["y"] = []
        for u in utts:
            batch_data["X"].append(self._load_speech(u, set_key, max_sp))
            if labels:
                en_ids = [
                    self.vocab[dec_key]['w2i'].get(w, SYMBOLS.UNK_ID)
                    for w in self.map[set_key][u][dec_key]
                ]
                y_ids = [SYMBOLS.GO_ID] + en_ids[:max_pred - 2] + [SYMBOLS.EOS_ID]
                batch_data["y"].append(xp.asarray(y_ids, dtype=xp.int32))
        # end for utts
        # include the utt ids
        batch_data['utts'].extend(utts)
        batch_data['X'] = F.pad_sequence(batch_data['X'],
                                         padding=SYMBOLS.PAD_ID)
        batch_data['X'].to_gpu(self.gpuid)
        if labels:
            batch_data['y'] = F.pad_sequence(batch_data['y'],
                                             padding=SYMBOLS.PAD_ID)
            batch_data['y'].to_gpu(self.gpuid)
        yield batch_data
def _compute_metrics(parsed, gold_batch, lengths,
                     use_predicted_arcs_for_rels=True):
    logits_arc, logits_rel = parsed
    true_arcs, true_rels = zip(*gold_batch)

    # exclude attachment from the root
    logits_arc, logits_rel = logits_arc[:, 1:], logits_rel[:, 1:]
    true_arcs = F.pad_sequence(true_arcs, padding=-1)[:, 1:]
    true_rels = F.pad_sequence(true_rels, padding=-1)[:, 1:]
    lengths = np.array(lengths, dtype=np.int32) - 1

    xp = chainer.cuda.get_array_module(logits_arc)
    if xp is not np:
        true_arcs.to_gpu()
        true_rels.to_gpu()

    b, n_deps, n_heads = logits_arc.shape
    logits_arc_flatten = F.reshape(logits_arc, (b * n_deps, n_heads))
    true_arcs_flatten = F.reshape(true_arcs, (b * n_deps,))
    arc_loss = F.softmax_cross_entropy(
        logits_arc_flatten, true_arcs_flatten, ignore_label=-1)
    arc_accuracy = F.accuracy(
        logits_arc_flatten, true_arcs_flatten, ignore_label=-1)
    arc_accuracy.to_cpu()

    if use_predicted_arcs_for_rels:
        parsed_arcs = xp.argmax(logits_arc.data, axis=2)
    else:
        parsed_arcs = true_arcs.data

    logits_rel = [
        logits[np.arange(length), arcs[:length]]
        for logits, arcs, length in zip(logits_rel, parsed_arcs, lengths)
    ]
    logits_rel = F.pad_sequence(logits_rel)
    b, n_deps, n_rels = logits_rel.shape
    logits_rel_flatten = F.reshape(logits_rel, (b * n_deps, n_rels))
    true_rels_flatten = F.reshape(true_rels, (b * n_deps,))
    rel_loss = F.softmax_cross_entropy(
        logits_rel_flatten, true_rels_flatten, ignore_label=-1)
    rel_accuracy = F.accuracy(
        logits_rel_flatten, true_rels_flatten, ignore_label=-1)
    rel_accuracy.to_cpu()

    return {
        'arc_loss': arc_loss,
        'arc_accuracy': arc_accuracy,
        'rel_loss': rel_loss,
        'rel_accuracy': rel_accuracy
    }
def calculate_all_attentions(self, hs, ys):
    """Calculate all of attentions.

    Args:
        hs (list of chainer.Variable | N-dimensional array): Input variable from encoder.
        ys (list of chainer.Variable | N-dimensional array): Input variable of decoder.

    Returns:
        chainer.Variable: List of attention weights.

    """
    # prepare input and output word sequences with sos/eos IDs
    eos = self.xp.array([self.eos], "i")
    sos = self.xp.array([self.sos], "i")
    ys_in = [F.concat([sos, y], axis=0) for y in ys]
    ys_out = [F.concat([y, eos], axis=0) for y in ys]

    # padding for ys with -1
    # pys: utt x olen
    pad_ys_in = F.pad_sequence(ys_in, padding=self.eos)
    pad_ys_out = F.pad_sequence(ys_out, padding=-1)

    # get length info
    olength = pad_ys_out.shape[1]

    # initialization
    c_list = [None]  # list of cell state of each layer
    z_list = [None]  # list of hidden state of each layer
    for _ in six.moves.range(1, self.dlayers):
        c_list.append(None)
        z_list.append(None)
    att_w = None
    att_ws = []
    self.att.reset()  # reset pre-computation of h

    # pre-computation of embedding
    eys = self.embed(pad_ys_in)  # utt x olen x zdim
    eys = F.separate(eys, axis=1)

    # loop for an output sequence
    for i in six.moves.range(olength):
        att_c, att_w = self.att(hs, z_list[0], att_w)
        ey = F.hstack((eys[i], att_c))  # utt x (zdim + hdim)
        z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)
        att_ws.append(att_w)  # for debugging

    att_ws = F.stack(att_ws, axis=1)
    att_ws.to_cpu()
    return att_ws.data
def _compute_metrics(parsed, gold_batch, lengths,
                     use_predicted_arcs_for_rels=True):
    logits_arc, logits_rel, *_ = parsed
    true_arcs, true_rels, *_ = zip(*gold_batch)

    # exclude attachment from the root
    logits_arc, logits_rel = logits_arc[:, 1:], logits_rel[:, 1:]
    true_arcs = F.pad_sequence(true_arcs, padding=-1)[:, 1:]
    true_rels = F.pad_sequence(true_rels, padding=-1)[:, 1:]
    lengths = np.array(lengths, dtype=np.int32) - 1

    xp = chainer.cuda.get_array_module(logits_arc)
    if xp is not np:
        true_arcs.to_gpu()
        true_rels.to_gpu()

    b, n_deps, n_heads = logits_arc.shape
    logits_arc_flatten = F.reshape(logits_arc, (b * n_deps, n_heads))
    true_arcs_flatten = F.reshape(true_arcs, (b * n_deps,))
    arc_loss = F.softmax_cross_entropy(
        logits_arc_flatten, true_arcs_flatten, ignore_label=-1)
    arc_accuracy = _accuracy(
        logits_arc_flatten, true_arcs_flatten, ignore_label=-1)

    if use_predicted_arcs_for_rels:
        parsed_arcs = xp.argmax(logits_arc.data, axis=2)
    else:
        parsed_arcs = true_arcs.data
    parsed_arcs = chainer.cuda.to_cpu(parsed_arcs)

    b, n_deps, n_heads, n_rels = logits_rel.shape
    base1, base2 = n_deps * n_heads, np.arange(n_deps) * n_heads
    parsed_arcs_flatten = np.concatenate(
        [base1 * i + base2 + arcs for i, arcs in enumerate(parsed_arcs)])
    logits_rel_flatten = F.embed_id(xp.asarray(parsed_arcs_flatten),
                                    F.reshape(logits_rel, (b * base1, n_rels)))
    true_rels_flatten = F.reshape(true_rels, (b * n_deps,))
    rel_loss = F.softmax_cross_entropy(
        logits_rel_flatten, true_rels_flatten, ignore_label=-1)
    rel_accuracy = _accuracy(
        logits_rel_flatten, true_rels_flatten, ignore_label=-1)

    return {
        'arc_loss': arc_loss,
        'arc_accuracy': arc_accuracy,
        'rel_loss': rel_loss,
        'rel_accuracy': rel_accuracy
    }
def _encode(self, xs, x_lens, is_multi_task=False):
    """Encode acoustic features.

    Args:
        xs (list of chainer.Variable(float)): A list of tensors of size
            `[T_in, input_size]`
        x_lens (np.ndarray): A tensor of size `[B]`
        is_multi_task (bool, optional):
    Returns:
        logits (chainer.Variable, float): A tensor of size
            `[B, T, num_classes (including the blank class)]`
        x_lens (np.ndarray): A tensor of size `[B]`
        OPTION:
            logits_sub (chainer.Variable, float): A tensor of size
                `[B, T, num_classes_sub (including the blank class)]`
            x_lens_sub (np.ndarray): A tensor of size `[B]`
    """
    if is_multi_task:
        if self.encoder_type == 'cnn':
            xs, x_lens = self.encoder(xs, x_lens)
            xs_sub = xs
            x_lens_sub = x_lens
        else:
            xs, x_lens, xs_sub, x_lens_sub = self.encoder(xs, x_lens)
    else:
        xs, x_lens = self.encoder(xs, x_lens)

    # Concatenate
    xs = F.pad_sequence(xs, padding=0)

    # Pass through fully-connected layers
    if len(self.fc_list) > 0:
        for i in range(len(self.fc_list)):
            # if self.batch_norm:
            #     xs = self['bn_fc_' + str(i)](xs)
            xs = self['fc_' + str(i)](xs)
    logits = self.fc_out(xs)

    if is_multi_task:
        # Concatenate
        xs_sub = F.pad_sequence(xs_sub, padding=0)

        # Pass through fully-connected layers
        if len(self.fc_list_sub) > 0:
            for i in range(len(self.fc_list_sub)):
                # if self.batch_norm:
                #     xs_sub = self['bn_fc_sub_' + str(i)](xs_sub)
                xs_sub = self['fc_sub_' + str(i)](xs_sub)
        logits_sub = self.fc_out_sub(xs_sub)

        return logits, x_lens, logits_sub, x_lens_sub
    else:
        return logits, x_lens
def __call__(self, x):
    x_len = [len(d) for d in x]
    x_section = np.cumsum(x_len[:-1])
    ex = np.copy(self.embed[F.concat(x, axis=0).data])
    if self.gpu_flag >= 0:
        ex = cuda.to_gpu(ex)
    ex = F.dropout(ex, ratio=self.dr_input)
    exs = F.split_axis(ex, x_section, 0)

    _, y_seq_list = self.gru(None, exs)

    # ==================
    # Self-Attention
    # ==================
    # pad each step's output in the batch with zeros
    pd_y_seq = F.pad_sequence(y_seq_list, padding=0)  # [Batch, max_len, Unit]

    n_B, n_ML, n_U = pd_y_seq.shape
    n_AH = self.n_att_head

    pd_y_seq = F.reshape(pd_y_seq, (n_B, n_ML, n_U, 1))  # [Batch, max_len, Unit, 1]
    pd_y_seq = F.broadcast_to(
        pd_y_seq, (n_B, n_ML, n_U, n_AH))  # [Batch, max_len, Unit, Head]

    # concatenate
    att_in = F.concat(y_seq_list, axis=0)  # [All element, Unit]

    att_h1 = F.tanh(self.att_w1(att_in))  # [All element, Att unit]
    att_h2 = self.att_w2(att_h1)  # [All element, Head]

    att_h2_seq = F.split_axis(att_h2, x_section, 0, 0)
    att_h2_pad = F.pad_sequence(att_h2_seq,
                                padding=-1024.0)  # [Batch, max_len, Head]

    # Softmax
    weight = F.softmax(att_h2_pad, axis=1)  # [Batch, max_len, Head]
    self.tmp_weight = weight.data
    weight = F.reshape(weight, (n_B, n_ML, 1, n_AH))  # [Batch, max_len, 1, Head]
    weight = F.broadcast_to(
        weight, (n_B, n_ML, n_U, n_AH))  # [Batch, max_len, Unit, Head]

    # compute the weighted sum
    att_out = F.sum(pd_y_seq * weight, axis=1)  # [Batch, Unit, Head]

    out = F.tanh(self.decode(att_out))
    return out
def forward(self, xs, h, c, mask):
    batch_size = len(xs)
    lens = [x.shape[0] for x in xs]
    # max_len = max(lens)
    max_len = self.sequence_length
    # mask = (np.expand_dims(np.arange(max_len), 0) <
    #         np.expand_dims(lens, 1)).astype(np.float)
    # h = np.zeros((batch_size, self.num_hidden), dtype=np.float32)
    # c = np.zeros((batch_size, self.num_hidden), dtype=np.float32)
    # h = self.initial_h
    # c = self.initial_c
    inputs = F.pad_sequence(xs)
    for time in range(max_len):
        x = inputs[:, time]
        input = F.concat((x, h), axis=1)
        gate = self.l(input)
        i = gate[:, 0:self.num_hidden]
        o = gate[:, self.num_hidden:self.num_hidden * 2]
        f = gate[:, self.num_hidden * 2:self.num_hidden * 3]
        nc = gate[:, self.num_hidden * 3:self.num_hidden * 4]
        # i, o, f, nc = F.split_axis(gate, 4, axis=1)
        i = F.sigmoid(i)
        o = F.sigmoid(o)
        f = F.sigmoid(f)
        nc = F.tanh(nc)
        nc = f * c + i * nc
        nh = o * F.tanh(nc)
        m = mask[:, time]
        pmask = F.reshape(m, (self.batch_size,))
        pmask = F.broadcast_to(F.expand_dims(pmask, axis=1),
                               (self.batch_size, self.num_hidden))
        nmask = 1.0 - pmask
        h = nh * pmask + h * nmask
    return h
def __call__(self, xs):
    # forward calculation
    h1 = [F.relu(self.l1(x)) for x in xs]
    _, _, h1 = self.encoder1(None, None, h1)
    h1 = [F.max_pooling_2d(x[None][None], ksize=(2, 1))[0][0] for x in h1]
    _, _, h1 = self.encoder2(None, None, h1)
    h1 = [F.max_pooling_2d(x[None][None], ksize=(2, 1))[0][0] for x in h1]
    _, _, h1 = self.encoder3(None, None, h1)
    h1 = [F.max_pooling_2d(x[None][None], ksize=(2, 1))[0][0] for x in h1]
    _, _, ys = self.encoder4(None, None, h1)
    _, _, ys = self.encoder5(None, None, h1)
    input_length = [len(y) for y in ys]
    ys = [self.output(y) for y in ys]
    ys = F.pad_sequence(ys)
    result = list(F.stack(ys, axis=1))
    return result, input_length
def forward(self, equery, vmemory, ememory, mask, iteration=0):
    """Compute an attention over memory given the query."""
    # equery.shape == (..., E)
    # vmemory.shape == (..., Ms, M)
    # ememory.shape == (..., Ms, E)
    # mask.shape == (..., Ms)
    # Setup memory embedding
    eq = F.repeat(equery[..., None, :], vmemory.shape[-2], -2)  # (..., Ms, E)
    # Compute content based attention
    merged = F.concat(
        [eq, ememory, eq * ememory, F.squared_difference(eq, ememory)],
        -1)  # (..., Ms, 4*E)
    inter = self.att_linear(merged,
                            n_batch_axes=len(vmemory.shape) - 1)  # (..., Ms, E)
    inter = F.tanh(inter)  # (..., Ms, E)
    inter = F.dropout(inter, DROPOUT)  # (..., Ms, E)
    # Split into sentences
    lengths = np.sum(np.any((vmemory != 0), -1), -1)  # (...,)
    mems = [s[..., :l, :] for s, l in
            zip(F.separate(inter, 0), lengths)]  # B x [(M1, E), (M2, E), ...]
    _, bimems = self.att_birnn(None, mems)  # B x [(M1, 2*E), (M2, 2*E), ...]
    bimems = F.pad_sequence(bimems)  # (..., Ms, 2*E)
    att = self.att_score(bimems,
                         n_batch_axes=len(vmemory.shape) - 1)  # (..., Ms, 1)
    att = F.squeeze(att, -1)  # (..., Ms)
    if mask is not None:
        att += mask * MINUS_INF  # (..., Ms)
    return att
def forward(self, ys_pad, source, x_mask):
    """Forward decoder.

    :param ys_pad: list of input token id sequences, int32 (batch of variable-length arrays)
    :param xp.array source: encoded memory, float32 (batch, maxlen_in, feat)
    :param xp.array x_mask: encoded memory mask, uint8 (batch, maxlen_in)
    :return e: decoded token score before softmax (batch, maxlen_out, token)
    :rtype: chainer.Variable
    """
    xp = self.xp
    sos = np.array([self.sos], np.int32)
    ys = [np.concatenate([sos, y], axis=0) for y in ys_pad]
    e = F.pad_sequence(ys, padding=self.eos).data
    e = xp.array(e)
    # mask preparation
    xy_mask = self.make_attention_mask(e, xp.array(x_mask))
    yy_mask = self.make_attention_mask(e, e)
    yy_mask *= make_history_mask(xp, e)

    e = self.pe(self.embed(e))
    batch, length, dims = e.shape
    e = e.reshape(-1, dims)
    source = source.reshape(-1, dims)
    for i in range(self.n_layers):
        e = self["decoders." + str(i)](e, source, xy_mask, yy_mask, batch)
    return self.output_layer(self.output_norm(e)).reshape(batch, length, -1)
def probability_loss(P, n_speakers):
    """Get cross-entropy loss for the probabilities reported

    Args:
      P: (B, n_speakers + 1, 1)
      n_speakers: B-length list

    Returns:
      loss_a: (1, )-shape mean cross entropy loss over mini-batch
    """
    # l: (B, n_speakers + 1, 1)
    # loss = 0
    # for p in P:
    #     p = p.T
    #     l = np.ones_like(p).astype(np.int32)
    #     l[0, -1] = 0
    #     loss += F.sigmoid_cross_entropy(p, l)
    # return loss / len(P)

    # New Method
    P = F.swapaxes(F.pad_sequence(P, padding=1), 1, 2)
    L = np.ones_like(P).astype(np.int32)
    for i, n in enumerate(n_speakers):
        L[i, 0, n] = 0
    return F.sigmoid_cross_entropy(P, L)
def forward(self, xs, n_speakers, activation=None):
    ilens = [x.shape[0] for x in xs]
    # xs: (B, T, F)
    xs = F.pad_sequence(xs, padding=-1)
    pad_shape = xs.shape
    # emb: (B*T, E)
    emb = self.enc(xs)
    # emb: (B, T, F)
    emb = F.separate(emb.reshape(pad_shape[0], pad_shape[1], -1), axis=0)
    emb = [F.get_item(e, slice(0, ilen)) for e, ilen in zip(emb, ilens)]
    emb2 = [cp.random.permutation(e) for e in emb]

    # get name: main- num_speakers=n_speakers, to_train=1
    # validation/main- num_speakers=n_speaker, to_train=0
    # validation_1/main- num_speakers=None, to_train=0
    name = reporter.get_current_reporter()._observer_names[id(self)]
    num_speakers = None if name == "validation_1/main" else n_speakers
    to_train = 1 if name == 'main' else 0

    # h_0: (1, B, F)
    # c_0: (1, B, F)
    h_0, c_0 = self.encoder(emb2)
    # A: (B, n_spk, F)
    # P: (B, n_spk, 1)
    A, P = self.decoder(h_0, c_0, n_speakers=num_speakers, to_train=to_train)

    # yhat: (B, T, n_spk)
    ys = [F.matmul(e, a.T) for a, e in zip(A, emb)]

    return ys, P
def seq_rnn_embed(vxs, exs, birnn, return_seqs=False):
    """Embed given sequences using rnn."""
    # vxs.shape == (..., S)
    # exs.shape == (..., S, E)
    assert vxs.shape == exs.shape[:-1], "Sequence embedding dimensions do not match."
    lengths = np.sum(vxs != 0, -1).flatten()  # (X,)
    seqs = F.reshape(exs, (-1,) + exs.shape[-2:])  # (X, S, E)
    toembed = [s[..., :l, :] for s, l in zip(F.separate(seqs, 0), lengths)
               if l != 0]  # Y x [(S1, E), (S2, E), ...]
    hs, ys = birnn(None, toembed)  # (2, Y, E), Y x [(S1, 2*E), (S2, 2*E), ...]
    if return_seqs:
        ys = F.pad_sequence(ys)  # (Y, S, 2*E)
        ys = F.reshape(ys, ys.shape[:-1] + (2, EMBED))  # (Y, S, 2, E)
        ys = F.mean(ys, -2)  # (Y, S, E)
        if ys.shape[0] == lengths.size:
            ys = F.reshape(ys, exs.shape)  # (..., S, E)
            return ys
        embeds = np.zeros((lengths.size, vxs.shape[-1], EMBED),
                          dtype=np.float32)  # (X, S, E)
        idxs = np.nonzero(lengths)  # (Y,)
        embeds = F.scatter_add(embeds, idxs, ys)  # (X, S, E)
        embeds = F.reshape(embeds, exs.shape)  # (..., S, E)
        return embeds  # (..., S, E)
    hs = F.mean(hs, 0)  # (Y, E)
    if hs.shape[0] == lengths.size:
        hs = F.reshape(hs, vxs.shape[:-1] + (EMBED,))  # (..., E)
        return hs
    # Add zero values back to match original shape
    embeds = np.zeros((lengths.size, EMBED), dtype=np.float32)  # (X, E)
    idxs = np.nonzero(lengths)  # (Y,)
    embeds = F.scatter_add(embeds, idxs, hs)  # (X, E)
    embeds = F.reshape(embeds, vxs.shape[:-1] + (EMBED,))  # (..., E)
    return embeds  # (..., E)
def __call__(self, hs, ys):
    """Core function of the Warp-CTC layer.

    Args:
        hs (iterable of chainer.Variable | N-dimension array): Input variable from encoder.
        ys (iterable of chainer.Variable | N-dimension array): Input variable of decoder.

    Returns:
        chainer.Variable: A variable holding a scalar value of the CTC loss.

    """
    self.loss = None
    ilens = [x.shape[0] for x in hs]
    olens = [x.shape[0] for x in ys]

    # zero padding for hs
    y_hat = linear_tensor(self.ctc_lo, F.dropout(
        F.pad_sequence(hs), ratio=self.dropout_rate))
    y_hat = F.transpose(y_hat, (1, 0, 2))  # batch x frames x hdim

    # get length info
    logging.info(self.__class__.__name__ + ' input lengths: ' + str(ilens))
    logging.info(self.__class__.__name__ + ' output lengths: ' + str(olens))

    # get ctc loss
    from chainer_ctc.warpctc import ctc as warp_ctc
    self.loss = warp_ctc(y_hat, ilens, [cuda.to_cpu(l.data) for l in ys])[0]
    logging.info('ctc loss:' + str(self.loss.data))

    return self.loss
def calc_attention(self, xs, ys, genre_exs, gender_exs, attn_linear):
    concat_ys = F.concat(
        ys, axis=0)  # -> (total len of batched sentence, word embedding dim)
    attn_ys = attn_linear(F.tanh(concat_ys))
    cond_feature = self.proj_cond(F.concat(
        (genre_exs, gender_exs)))  # -> (batchsize, proj_cond dim)
    cumsum_ys = self.xp.cumsum(
        self.xp.array([len(x) for x in xs], dtype=self.xp.int32))
    split_attn_ys = F.split_axis(attn_ys, cumsum_ys[:-1].tolist(), axis=0)
    split_attn_ys_pad = F.pad_sequence(split_attn_ys, padding=-1024)
    bool_cond = split_attn_ys_pad.array == -1024
    split_attn_ys_pad = split_attn_ys_pad * F.expand_dims(F.broadcast_to(
        cond_feature, (split_attn_ys_pad.shape[:-1])), axis=-1)
    padding_array = self.xp.full(split_attn_ys_pad.shape, -1024,
                                 dtype=self.xp.float32)
    split_attn_ys_pad = F.where(bool_cond, padding_array, split_attn_ys_pad)
    attn_softmax = F.softmax(split_attn_ys_pad, axis=1)
    return attn_softmax
def test_train_acc():
    n_out = 7
    _eos = n_out - 1
    n_batch = 3
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    np_pred = numpy.random.rand(n_batch, max(label_length) + 1,
                                n_out).astype(numpy.float32)
    # NOTE: 0 is only used for CTC, never appeared in attn target
    np_target = [
        numpy.random.randint(1, n_out - 1, size=ol, dtype=numpy.int32)
        for ol in label_length
    ]
    eos = numpy.array([_eos], 'i')
    ys_out = [F.concat([y, eos], axis=0) for y in np_target]

    # padding for ys with -1
    # pys: utt x olen
    # NOTE: -1 is default ignore index for chainer
    pad_ys_out = F.pad_sequence(ys_out, padding=-1)
    y_all = F.reshape(np_pred, (n_batch * (max(label_length) + 1), n_out))
    ch_acc = F.accuracy(y_all, F.concat(pad_ys_out, axis=0), ignore_label=-1)

    # NOTE: this index 0 is only for CTC not attn. so it can be ignored
    # unfortunately, torch cross_entropy does not accept out-of-bound ids
    th_ignore = 0
    th_pred = torch.from_numpy(y_all.data)
    th_ys = [torch.from_numpy(numpy.append(t, eos)).long() for t in np_target]
    th_target = pad_list(th_ys, th_ignore)
    th_acc = th_accuracy(th_pred, th_target, th_ignore)

    numpy.testing.assert_allclose(ch_acc.data, th_acc)
def __call__(self, hs, ys):
    '''CTC forward

    :param hs:
    :param ys:
    :return:
    '''
    self.loss = None
    ilens = [x.shape[0] for x in hs]
    olens = [x.shape[0] for x in ys]

    # zero padding for hs
    y_hat = linear_tensor(self.ctc_lo, F.dropout(
        F.pad_sequence(hs), ratio=self.dropout_rate))
    y_hat = F.transpose(y_hat, (1, 0, 2))  # batch x frames x hdim

    # get length info
    logging.info(self.__class__.__name__ + ' input lengths: ' + str(ilens))
    logging.info(self.__class__.__name__ + ' output lengths: ' + str(olens))

    # get ctc loss
    self.loss = warp_ctc(y_hat, ilens, [cuda.to_cpu(l.data) for l in ys])[0]
    logging.info('ctc loss:' + str(self.loss.data))

    return self.loss
def norm_vec_sentence_level(d, nn_flag=False, include_norm_term=False):
    dim = d.shape[1]
    d_list = F.split_axis(d, np.cumsum(lengths)[:-1], axis=0)
    max_length = np.max(lengths)
    d_pad = F.pad_sequence(d_list, length=max_length, padding=0.0)
    d_flat = F.reshape(get_normalized_vector(d_pad, None), (-1, dim))
    split_size = np.cumsum(np.full(batchsize, max_length))[:-1]
    d_list = F.split_axis(d_flat, split_size, axis=0)
    d_list = [_d[:_length] for _d, _length in zip(d_list, lengths)]
    d = F.concat(d_list, axis=0)
    return d
def check_forward(self, xs):
    # Non-finite values do not work for integer values.
    if not numpy.isfinite(self.pad) and \
            numpy.dtype(self.dtype).kind != 'f':
        return

    with disable_debug_mode_if(self.can_include_nan):
        y = functions.pad_sequence(
            xs, length=self.length, padding=self.pad)

    self.assertEqual(y.shape, self.y_shape)
    for i, (length, x) in enumerate(six.moves.zip(self.lengths, self.xs)):
        testing.assert_allclose(y.data[i, 0:length], x)
        testing.assert_allclose(
            y.data[i, length:], self.dtype(self.pad))
def f(*xs):
    return functions.pad_sequence(
        xs, length=self.length, padding=self.pad)
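# The two test helpers above exercise the `length` and `padding` keyword arguments.
# A small illustration with made-up values (not taken from the test fixtures):
import numpy
from chainer import functions

xs = [numpy.arange(2, dtype=numpy.float32), numpy.arange(4, dtype=numpy.float32)]
y = functions.pad_sequence(xs, length=5, padding=-1.0)
# y.shape == (2, 5); each row is right-padded with -1.0 up to length 5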
def forward(self, inputs, batch_lengths, initial_state=None):
    """
    Parameters
    ----------
    inputs : ``torch.FloatTensor``, required.
        A tensor of shape (batch_size, num_timesteps, input_size)
        to apply the LSTM over.
    batch_lengths : ``List[int]``, required.
        A list of length batch_size containing the lengths of the sequences in batch.
    initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
        A tuple (state, memory) representing the initial hidden state and memory
        of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and
        the ``memory`` has shape (1, batch_size, cell_size).

    Returns
    -------
    output_accumulator : ``torch.FloatTensor``
        The outputs of the LSTM for each timestep. A tensor of shape
        (batch_size, max_timesteps, hidden_size) where for a given batch
        element, all outputs past the sequence length for that batch are
        zero tensors.
    final_state : ``Tuple[torch.FloatTensor, torch.FloatTensor]``
        A tuple (state, memory) representing the initial hidden state and memory
        of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and
        the ``memory`` has shape (1, batch_size, cell_size).
    """
    batch_size = inputs.shape[0]
    total_timesteps = inputs.shape[1]

    output_accumulator_list = []
    if initial_state is None:
        full_batch_previous_memory = chainer.Variable(
            self.xp.zeros((batch_size, self.cell_size), 'f'))
        full_batch_previous_state = chainer.Variable(
            self.xp.zeros((batch_size, self.hidden_size), 'f'))
    else:
        # first dimension is just (layer * (1 + is_bidirection)), i.e., 1.
        full_batch_previous_state = F.squeeze(initial_state[0], axis=0)
        full_batch_previous_memory = F.squeeze(initial_state[1], axis=0)

    current_length_index = batch_size - 1 if self.go_forward else 0
    if self.recurrent_dropout_probability > 0.0 and \
            (self.training or chainer.config.train):
        dropout_mask = get_dropout_mask(self.recurrent_dropout_probability,
                                        full_batch_previous_state)
    else:
        dropout_mask = None

    for timestep in range(total_timesteps):
        # The index depends on which end we start.
        index = timestep if self.go_forward else total_timesteps - timestep - 1

        # What we are doing here is finding the index into the batch dimension
        # which we need to use for this timestep, because the sequences have
        # variable length, so once the index is greater than the length of this
        # particular batch sequence, we no longer need to do the computation for
        # this sequence. The key thing to recognise here is that the batch inputs
        # must be _ordered_ by length from longest (first in batch) to shortest
        # (last) so initially, we are going forwards with every sequence and as we
        # pass the index at which the shortest elements of the batch finish,
        # we stop picking them up for the computation.
        if self.go_forward:
            while batch_lengths[current_length_index] <= index:
                current_length_index -= 1
        # If we're going backwards, we are _picking up_ more indices.
        else:
            # First conditional: Are we already at the maximum number of elements in the batch?
            # Second conditional: Does the next shortest sequence beyond the current batch
            # index require computation at this timestep?
            while current_length_index < (len(batch_lengths) - 1) and \
                    batch_lengths[current_length_index + 1] > index:
                current_length_index += 1

        # Actually get the slices of the batch which we
        # need for the computation at this timestep.
        # shape (batch_size, cell_size)
        previous_memory = full_batch_previous_memory[0: current_length_index + 1]
        # Shape (batch_size, hidden_size)
        previous_state = full_batch_previous_state[0: current_length_index + 1]
        # Shape (batch_size, input_size)
        timestep_input = inputs[0: current_length_index + 1, index]

        # Do the projections for all the gates all at once.
        # Both have shape (batch_size, 4 * cell_size)
        projected_input = self.input_linearity(timestep_input)
        projected_state = self.state_linearity(previous_state)

        # Main LSTM equations using relevant chunks of the big linear
        # projections of the hidden state and inputs.
        # TODO: split_axis
        # TODO: cuda kernel
        input_gate = F.sigmoid(projected_input[:, (0 * self.cell_size):(1 * self.cell_size)] +
                               projected_state[:, (0 * self.cell_size):(1 * self.cell_size)])
        forget_gate = F.sigmoid(projected_input[:, (1 * self.cell_size):(2 * self.cell_size)] +
                                projected_state[:, (1 * self.cell_size):(2 * self.cell_size)])
        memory_init = F.tanh(projected_input[:, (2 * self.cell_size):(3 * self.cell_size)] +
                             projected_state[:, (2 * self.cell_size):(3 * self.cell_size)])
        output_gate = F.sigmoid(projected_input[:, (3 * self.cell_size):(4 * self.cell_size)] +
                                projected_state[:, (3 * self.cell_size):(4 * self.cell_size)])
        memory = input_gate * memory_init + forget_gate * previous_memory

        # Here is the non-standard part of this LSTM cell; first, we clip the
        # memory cell, then we project the output of the timestep to a smaller size
        # and again clip it.
        if self.memory_cell_clip_value:
            memory = F.clip(memory,
                            -self.memory_cell_clip_value,
                            self.memory_cell_clip_value)

        # shape (current_length_index, cell_size)
        pre_projection_timestep_output = output_gate * F.tanh(memory)

        # shape (current_length_index, hidden_size)
        timestep_output = self.state_projection(pre_projection_timestep_output)
        if self.state_projection_clip_value:
            timestep_output = F.clip(timestep_output,
                                     -self.state_projection_clip_value,
                                     self.state_projection_clip_value)

        # Only do dropout if the dropout prob is > 0.0 and we are in training mode.
        if dropout_mask is not None:
            timestep_output = timestep_output * \
                dropout_mask[0: current_length_index + 1]

        # We've been doing computation with less than the full batch, so here we create a new
        # variable for the whole batch at this timestep and insert the result for the
        # relevant elements of the batch into it.
        full_batch_previous_memory = F.concat(
            [memory, full_batch_previous_memory[current_length_index + 1:]], axis=0)
        full_batch_previous_state = F.concat(
            [timestep_output, full_batch_previous_state[current_length_index + 1:]], axis=0)
        output_accumulator_list.append(timestep_output)

    # Mimic the pytorch API by returning state in the following shape:
    # (num_layers * num_directions, batch_size, ...). As this
    # LSTM cell cannot be stacked, the first dimension here is just 1.
    final_state = (F.expand_dims(full_batch_previous_state, 0),
                   F.expand_dims(full_batch_previous_memory, 0))
    if not self.go_forward:
        output_accumulator_list = output_accumulator_list[::-1]
    output_accumulator = F.pad_sequence(output_accumulator_list)
    output_accumulator = output_accumulator.transpose((1, 0, 2))
    # (batch_size, total_timesteps, self.hidden_size)
    return output_accumulator, final_state
def stack_and_to_gpu(data_list):
    sdata = F.pad_sequence(
        data_list, length=None, padding=0).array
    return chainer.dataset.to_device(gpu, sdata)
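# A usage sketch for stack_and_to_gpu, assuming the module-level `gpu` device id is
# defined (with gpu = -1 the padded batch simply stays on the CPU as a numpy array):
import numpy as np

gpu = -1  # hypothetical value for illustration
batch = stack_and_to_gpu([np.zeros((5, 80), np.float32),
                          np.zeros((3, 80), np.float32)])
print(batch.shape)  # (2, 5, 80)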
def _lstm_forward(self, inputs, batch_lengths, initial_state=None):
    """
    Parameters
    ----------
    inputs : ``PackedSequence``, required.
        A batch first ``PackedSequence`` to run the stacked LSTM over.
    initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
        A tuple (state, memory) representing the initial hidden state and memory
        of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and
        (num_layers, batch_size, 2 * cell_size) respectively.

    Returns
    -------
    output_sequence : ``torch.FloatTensor``
        The encoded sequence of shape (num_layers, batch_size, sequence_length, hidden_size)
    final_states: ``Tuple[torch.FloatTensor, torch.FloatTensor]``
        The per-layer final (state, memory) states of the LSTM, with shape
        (num_layers, batch_size, 2 * hidden_size) and
        (num_layers, batch_size, 2 * cell_size) respectively.
        The last dimension is duplicated because it contains the state/memory
        for both the forward and backward layers.
    """
    if initial_state is None:
        hidden_states = [None] * len(self.forward_layers)
    elif initial_state[0].shape[0] != len(self.forward_layers):
        raise ConfigurationError("Initial states were passed to forward() but the number of "
                                 "initial states does not match the number of layers.")
    else:
        hidden_states = list(zip(F.split_axis(initial_state[0], initial_state[0].shape[0], 0),
                                 F.split_axis(initial_state[1], initial_state[1].shape[0], 0)))

    inputs = F.pad_sequence(inputs)
    forward_output_sequence = inputs
    backward_output_sequence = inputs

    final_states = []
    sequence_outputs = []
    for layer_index, state in enumerate(hidden_states):
        forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index))
        backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index))

        forward_cache = forward_output_sequence
        backward_cache = backward_output_sequence

        if state is not None:
            forward_hidden_state, backward_hidden_state = F.split_axis(state[0], 2, axis=2)
            forward_memory_state, backward_memory_state = F.split_axis(state[1], 2, axis=2)
            forward_state = (forward_hidden_state, forward_memory_state)
            backward_state = (backward_hidden_state, backward_memory_state)
        else:
            forward_state = None
            backward_state = None

        forward_output_sequence, forward_state = forward_layer.forward(
            forward_output_sequence, batch_lengths, forward_state)
        backward_output_sequence, backward_state = backward_layer.forward(
            backward_output_sequence, batch_lengths, backward_state)

        # Skip connections, just adding the input to the output.
        if layer_index != 0:
            forward_output_sequence += forward_cache
            backward_output_sequence += backward_cache

        sequence_outputs.append(F.concat([forward_output_sequence,
                                          backward_output_sequence], -1))
        # Append the state tuples in a list, so that we can return
        # the final states for all the layers.
        final_states.append((F.concat([forward_state[0], backward_state[0]], -1),
                             F.concat([forward_state[1], backward_state[1]], -1)))

    stacked_sequence_outputs = F.stack(sequence_outputs, axis=0)
    # Stack the hidden state and memory for each layer into 2 tensors of shape
    # (num_layers, batch_size, hidden_size) and (num_layers, batch_size, cell_size)
    # respectively.
    final_hidden_states, final_memory_states = zip(*final_states)
    final_state_tuple = (F.concat(final_hidden_states, 0),
                         F.concat(final_memory_states, 0))
    return stacked_sequence_outputs, final_state_tuple