def batch_size(self):
  """Return the batch size.

  Returns:
    batch size of the sequence
  """
  if self.expr_list:
    return tt.batch_size(self.expr_list[0])
  elif self.expr_tensor is not None:
    return tt.batch_size(self.expr_tensor)
  else:
    return tt.batch_size(self.expr_transposed_tensor)
def add_input_to_prev(self, prev_state: UniLSTMState, x: Union[tt.Tensor, Sequence[tt.Tensor]]) \
    -> Tuple[Sequence[tt.Tensor], Sequence[tt.Tensor]]:
  if isinstance(x, dy.Expression):
    x = [x]
  elif type(x) != list:
    x = list(x)
  if self.dropout_rate > 0.0 and self.train and self.dropout_mask_x is None:
    self.set_dropout_masks(batch_size=tt.batch_size(x[0]))
  new_c, new_h = [], []
  for layer_i in range(self.num_layers):
    if self.dropout_rate > 0.0 and self.train:
      # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
      gates = dy.vanilla_lstm_gates_dropout_concat(
        x, prev_state._h[layer_i],
        self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i],
        self.dropout_mask_x[layer_i], self.dropout_mask_h[layer_i],
        self.weightnoise_std if self.train else 0.0)
    else:
      gates = dy.vanilla_lstm_gates_concat(
        x, prev_state._h[layer_i],
        self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i],
        self.weightnoise_std if self.train else 0.0)
    new_c.append(dy.vanilla_lstm_c(prev_state._c[layer_i], gates))
    new_h.append(dy.vanilla_lstm_h(new_c[-1], gates))
    x = [new_h[-1]]
  return new_c, new_h
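# The DyNet gate kernels above implement variational ("tied") dropout from
# https://arxiv.org/abs/1512.05287: one dropout mask is sampled per sequence
# and reused at every timestep, rather than resampled per step. A minimal
# illustrative sketch of the idea in plain PyTorch (`step_fn` and both helper
# names are hypothetical, not part of the xnmt API):
import torch

def variational_dropout_mask(batch_size, dim, p, device=None):
  """Sample one inverted-dropout mask, to be reused across all timesteps."""
  keep = 1.0 - p
  return torch.bernoulli(torch.full((batch_size, dim), keep, device=device)) / keep

def run_rnn_with_tied_dropout(step_fn, xs, h0, p=0.3):
  """Apply the same masks to the input and recurrent state at every step."""
  mask_x = variational_dropout_mask(xs[0].size(0), xs[0].size(1), p)
  mask_h = variational_dropout_mask(h0.size(0), h0.size(1), p)
  h = h0
  for x_t in xs:  # identical masks at each timestep
    h = step_fn(x_t * mask_x, h * mask_h)
  return h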
def calc_nll(self, src: Union[batchers.Batch, sent.Sentence], trg: Union[batchers.Batch, sent.Sentence]) \
    -> tt.Tensor:
  if not batchers.is_batched(src):
    src = batchers.ListBatch([src])

  src_inputs = batchers.ListBatch(
    [s[:-1] for s in src],
    mask=batchers.Mask(src.mask.np_arr[:, :-1]) if src.mask else None)
  src_targets = batchers.ListBatch(
    [s[1:] for s in src],
    mask=batchers.Mask(src.mask.np_arr[:, 1:]) if src.mask else None)

  event_trigger.start_sent(src)
  embeddings = self.src_embedder.embed_sent(src_inputs)
  encodings = self.rnn.transduce(embeddings)
  encodings_tensor = encodings.as_tensor()

  encoding_reshaped = tt.merge_time_batch_dims(encodings_tensor)
  seq_len = tt.sent_len(encodings_tensor)
  batch_size = tt.batch_size(encodings_tensor)

  outputs = self.transform.transform(encoding_reshaped)
  # use `s` as the loop variable to avoid shadowing the `sent` module
  ref_action = np.asarray([s.words for s in src_targets]).reshape((seq_len * batch_size,))
  loss_expr_perstep = self.scorer.calc_loss(outputs, batchers.mark_as_batch(ref_action))
  loss_expr_perstep = tt.unmerge_time_batch_dims(loss_expr_perstep, batch_size)
  loss = tt.aggregate_masked_loss(loss_expr_perstep, src_targets.mask)
  return loss
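# `merge_time_batch_dims` lets the transform and scorer run over all timesteps
# in a single matmul instead of a Python loop over positions. A sketch of the
# reshape round-trip in plain PyTorch, assuming a (batch, time, hidden) layout
# (an assumption for illustration, not necessarily xnmt's internal layout):
import torch

def score_all_steps(encodings, proj, vocab_proj):
  B, T, H = encodings.shape
  merged = encodings.reshape(B * T, H)   # fold time into the batch dim
  logits = vocab_proj(proj(merged))      # one projection covers every step
  return logits.reshape(B, T, -1)        # unfold back to (batch, time, vocab)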
def _combine_batches(self, batched_expr, comb_method: str = "sum"):
  if comb_method == "sum":
    return dy.sum_batches(batched_expr)
  elif comb_method == "avg":
    return dy.sum_batches(batched_expr) * (1.0 / tt.batch_size(batched_expr))
  else:
    raise ValueError(f"Unknown batch combination method '{comb_method}', expected 'sum' or 'avg'.")
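# Summing vs. averaging per-sentence losses changes the effective learning
# rate relative to batch size. The same choice expressed in plain PyTorch,
# as an illustrative sketch (function name is hypothetical):
import torch

def combine_batches(per_sent_loss: torch.Tensor, comb_method: str = "sum") -> torch.Tensor:
  if comb_method == "sum":
    return per_sent_loss.sum()    # gradient scale grows with batch size
  elif comb_method == "avg":
    return per_sent_loss.mean()   # gradient scale independent of batch size
  raise ValueError(f"Unknown batch combination method '{comb_method}', expected 'sum' or 'avg'.")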
def _encode_src(self, src: Union[sent.Sentence, batchers.Batch]) -> tuple:
  event_trigger.start_sent(src)
  embeddings = self.src_embedder.embed_sent(src)
  encodings = self.encoder.transduce(embeddings)
  encodings_tensor = encodings.as_tensor()
  encoding_reshaped = tt.merge_time_batch_dims(encodings_tensor)
  outputs = self.transform.transform(encoding_reshaped)
  return (tt.batch_size(encodings_tensor), encodings, outputs,
          tt.sent_len(encodings_tensor))
def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence') \
    -> 'expression_seqs.ExpressionSequence':
  """
  transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

  Args:
    expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
  Returns:
    expression sequence
  """
  if isinstance(expr_seq, expression_seqs.ExpressionSequence):
    expr_seq = [expr_seq]
  concat_inputs = len(expr_seq) >= 2
  batch_size = tt.batch_size(expr_seq[0][0])
  seq_len = expr_seq[0].sent_len()
  mask = expr_seq[0].mask
  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)
  cur_input = expr_seq
  self._final_states = []
  for layer_i in range(self.num_layers):
    h = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
    c = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
    for pos_i in range(seq_len):
      if concat_inputs and layer_i == 0:
        x_t = tt.concatenate([cur_input[i][pos_i] for i in range(len(cur_input))])
      else:
        x_t = cur_input[0][pos_i]
      h_tm1 = h[-1]
      if self.dropout_rate > 0.0 and self.train:
        # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
        x_t = torch.mul(x_t, self.dropout_mask_x[layer_i])
        h_tm1 = torch.mul(h_tm1, self.dropout_mask_h[layer_i])
      h_t, c_t = self.layers[layer_i](x_t, (h_tm1, c[-1]))
      if mask is None or np.isclose(np.sum(mask.np_arr[:, pos_i:pos_i + 1]), 0.0):
        c.append(c_t)
        h.append(h_t)
      else:
        # masked (padded) positions keep the previous state instead of the update
        c.append(mask.cmult_by_timestep_expr(c_t, pos_i, True)
                 + mask.cmult_by_timestep_expr(c[-1], pos_i, False))
        h.append(mask.cmult_by_timestep_expr(h_t, pos_i, True)
                 + mask.cmult_by_timestep_expr(h[-1], pos_i, False))
    self._final_states.append(transducers.FinalTransducerState(h[-1], c[-1]))
    cur_input = [h[1:]]
  return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=mask)
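# The per-timestep mask blend above has a simple closed form: update the state
# where the mask is on, copy the previous state where it is off. A sketch in
# plain PyTorch, assuming `m_t` is 1.0 for real tokens and 0.0 for padding
# (an assumption about mask polarity):
import torch

def masked_update(h_new: torch.Tensor, h_prev: torch.Tensor, m_t: torch.Tensor) -> torch.Tensor:
  """Blend states: take h_new where m_t == 1, keep h_prev where m_t == 0."""
  m_t = m_t.unsqueeze(-1)  # (batch,) -> (batch, 1) to broadcast over hidden dim
  return m_t * h_new + (1.0 - m_t) * h_prev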
def transduce(self, es: 'expression_seqs.ExpressionSequence') \
    -> 'expression_seqs.ExpressionSequence':
  batch_size = tt.batch_size(es.as_tensor())
  if es.mask:
    seq_lengths = es.mask.seq_lengths()
  else:
    seq_lengths = [es.sent_len()] * batch_size

  # sort the inputs by length in descending order, as required by pack_padded_sequence
  seq_lengths = torch.LongTensor(seq_lengths).to(xnmt.device)
  lengths, perm_index = seq_lengths.sort(0, descending=True)
  sorted_input = es.as_tensor()[perm_index]
  perm_index_rev = [-1] * len(lengths)
  for i in range(len(lengths)):
    perm_index_rev[perm_index[i]] = i
  perm_index_rev = torch.LongTensor(perm_index_rev).to(xnmt.device)

  packed_input = nn.utils.rnn.pack_padded_sequence(sorted_input, list(lengths.data), batch_first=True)
  state_size = (self.num_dir * self.num_layers, batch_size, self.hidden_dim // self.num_dir)
  h0 = sorted_input.new_zeros(*state_size)
  c0 = sorted_input.new_zeros(*state_size)
  output, (final_hiddens, final_cells) = self.lstm(packed_input, (h0, c0))
  output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True, total_length=es.sent_len())[0]

  # restore the original order
  decoded = output[perm_index_rev]
  self._final_states = []
  for layer_i in range(self.num_layers):
    final_hidden = final_hiddens.view(self.num_layers, self.num_dir, batch_size, -1)[layer_i] \
      .transpose(0, 1).contiguous().view(batch_size, -1)
    final_hidden = final_hidden[perm_index_rev]
    self._final_states.append(transducers.FinalTransducerState(final_hidden))
  ret = expression_seqs.ExpressionSequence(expr_tensor=decoded, mask=es.mask)
  return ret
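# pack_padded_sequence historically required inputs sorted by decreasing
# length, hence the permutation bookkeeping above. On PyTorch >= 1.1 the same
# round-trip can be written without manual sorting via enforce_sorted=False;
# a minimal sketch:
import torch
import torch.nn as nn

def run_lstm_padded(lstm: nn.LSTM, padded: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
  """padded: (batch, time, dim); lengths: (batch,) true lengths without padding."""
  packed = nn.utils.rnn.pack_padded_sequence(
    padded, lengths.cpu(), batch_first=True, enforce_sorted=False)
  output, _ = lstm(packed)
  output, _ = nn.utils.rnn.pad_packed_sequence(
    output, batch_first=True, total_length=padded.size(1))
  return output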
def transduce(self, seq: expression_seqs.ExpressionSequence) \
    -> expression_seqs.ExpressionSequence:
  if self.train and self.dropout > 0.0:
    seq_tensor = tt.dropout(self.child.transduce(seq).as_tensor(), self.dropout) + seq.as_tensor()
  else:
    seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
  if self.layer_norm:
    batch_size = tt.batch_size(seq_tensor)
    merged_seq_tensor = tt.merge_time_batch_dims(seq_tensor)
    transformed_seq_tensor = self.layer_norm_component.transform(merged_seq_tensor)
    seq_tensor = tt.unmerge_time_batch_dims(transformed_seq_tensor, batch_size)
  return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)
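# The transducer above is a residual wrapper: the child's output (optionally
# dropped out) is added to its input and then layer-normalized. The same
# post-norm residual block in plain PyTorch, as an illustrative sketch
# (assumes the child maps dim -> dim so the addition is well-formed):
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
  def __init__(self, child: nn.Module, dim: int, dropout: float = 0.1):
    super().__init__()
    self.child = child
    self.norm = nn.LayerNorm(dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    return self.norm(x + self.dropout(self.child(x)))  # post-norm residual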
def add_input_to_prev(self, prev_state: UniLSTMState, x: tt.Tensor) \
    -> Tuple[Sequence[tt.Tensor], Sequence[tt.Tensor]]:
  assert isinstance(x, tt.Tensor)
  if self.dropout_rate > 0.0 and self.train and self.dropout_mask_x is None:
    self.set_dropout_masks(batch_size=tt.batch_size(x))
  new_c, new_h = [], []
  for layer_i in range(self.num_layers):
    h_tm1 = prev_state._h[layer_i]
    if self.dropout_rate > 0.0 and self.train:
      # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
      x = torch.mul(x, self.dropout_mask_x[layer_i])
      h_tm1 = torch.mul(h_tm1, self.dropout_mask_h[layer_i])
    h_t, c_t = self.layers[layer_i](x, (h_tm1, prev_state._c[layer_i]))
    new_c.append(c_t)
    new_h.append(h_t)
    x = h_t
  return new_c, new_h
def initial_state(self, enc_final_states: Any, ss: Any) -> AutoRegressiveDecoderState:
  """Get the initial state of the decoder given the encoder final states.

  Args:
    enc_final_states: The encoder final states. Usually but not necessarily an
                      :class:`xnmt.expression_sequence.ExpressionSequence`
    ss: first input
  Returns:
    initial decoder state
  """
  rnn_state = self.rnn.initial_state()
  rnn_s = self.bridge.decoder_init(enc_final_states)
  rnn_state = rnn_state.set_s(rnn_s)
  ss_expr = self.embedder.embed(ss)
  zeros = tt.zeroes(hidden_dim=self.input_dim,
                    batch_size=tt.batch_size(ss_expr)) if self.input_feeding else None
  rnn_state = rnn_state.add_input(
    tt.concatenate([ss_expr, zeros]) if self.input_feeding else ss_expr)
  return AutoRegressiveDecoderState(rnn_state=rnn_state, context=zeros)
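# With input feeding (Luong et al., 2015), the previous attention context is
# concatenated to the current target embedding before each decoder step; at
# the first step there is no context yet, hence the zero vector above. A
# sketch of one step in plain PyTorch (hypothetical helper, not the xnmt API;
# the cell's input_size must equal emb_dim + ctx_dim):
import torch

def input_feeding_step(cell: torch.nn.LSTMCell, emb_t, context_prev, state):
  """emb_t: (batch, emb_dim); context_prev: (batch, ctx_dim), zeros at step 0."""
  rnn_input = torch.cat([emb_t, context_prev], dim=-1)
  return cell(rnn_input, state)  # returns the new (h, c) pair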
def transduce(self, src: expression_seqs.ExpressionSequence) \
    -> expression_seqs.ExpressionSequence:
  sent_len = src.sent_len()
  batch_size = tt.batch_size(src[0])
  embeddings = self.embeddings(
    torch.tensor([list(range(sent_len))] * batch_size).to(xnmt.device))
  if self.op == 'sum':
    output = embeddings + src.as_tensor()
  elif self.op == 'concat':
    output = tt.concatenate([embeddings, src.as_tensor()])
  else:
    raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')
  if self.train and self.dropout > 0.0:
    output = tt.dropout(output, self.dropout)
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=src.mask)
  self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
  return output_seq
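# Learned positional embeddings: one embedding per position index, broadcast
# over the batch and summed with (or concatenated to) the token
# representations. A self-contained sketch in plain PyTorch:
import torch
import torch.nn as nn

class LearnedPositionalEncoding(nn.Module):
  def __init__(self, max_len: int, dim: int):
    super().__init__()
    self.emb = nn.Embedding(max_len, dim)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """x: (batch, time, dim); adds the embedding of each position index."""
    positions = torch.arange(x.size(1), device=x.device)  # (time,)
    return x + self.emb(positions).unsqueeze(0)           # broadcast over batch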
def decoder_init(self, enc_final_states: Sequence[transducers.FinalTransducerState]) \
    -> List[tt.Tensor]:
  batch_size = tt.batch_size(enc_final_states[0].main_expr())
  z = tt.zeroes(hidden_dim=self.dec_dim, batch_size=batch_size)
  return [z] * (self.dec_layers * 2)
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) \
    -> expression_seqs.ExpressionSequence:
  """
  transduce the sequence

  Args:
    expr_seq: expression sequence
  Returns:
    expression sequence
  """
  Wq, Wk, Wv, Wo = [dy.parameter(x) for x in (self.pWq, self.pWk, self.pWv, self.pWo)]
  bq, bk, bv, bo = [dy.parameter(x) for x in (self.pbq, self.pbk, self.pbv, self.pbo)]

  # Start with a [(length, model_size) x batch] tensor
  x = expr_seq.as_transposed_tensor()
  x_len = tt.sent_len_transp(x)
  x_batch = tt.batch_size(x)

  # Get the query, key, and value vectors
  # TODO: do we need bias broadcasting in DyNet?
  # q = dy.affine_transform([bq, x, Wq])
  # k = dy.affine_transform([bk, x, Wk])
  # v = dy.affine_transform([bv, x, Wv])
  q = bq + x * Wq
  k = bk + x * Wk
  v = bv + x * Wv

  # Split into per-head batches: [(length, head_dim) x batch * num_heads]
  q, k, v = [dy.reshape(e, (x_len, self.head_dim), batch_size=x_batch * self.num_heads)
             for e in (q, k, v)]

  # Scaled dot product [(length, length) x batch * num_heads]; rows are queries, columns are keys
  attn_score = q * dy.transpose(k) / sqrt(self.head_dim)
  if expr_seq.mask is not None:
    mask = dy.inputTensor(
      np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0).transpose(),
      batched=True) * -1e10
    attn_score = attn_score + mask
  attn_prob = dy.softmax(attn_score, d=1)
  if self.train and self.dropout > 0.0:
    attn_prob = dy.dropout(attn_prob, self.dropout)

  # Reduce using attention and resize to match [(length, model_size) x batch]
  o = dy.reshape(attn_prob * v, (x_len, self.input_dim), batch_size=x_batch)

  # Final transformation
  # o = dy.affine_transform([bo, attn_prob * v, Wo])
  o = bo + o * Wo

  expr_seq = expression_seqs.ExpressionSequence(expr_transposed_tensor=o, mask=expr_seq.mask)
  self._final_states = [transducers.FinalTransducerState(expr_seq[-1], None)]
  return expr_seq
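# The same scaled dot-product attention core in plain PyTorch, as an
# illustrative sketch (learned projections and head splitting omitted;
# mask is assumed to be 1 at padded key positions, matching the -1e10
# additive masking above):
import math
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v, mask=None, dropout_p=0.0):
  """q, k, v: (batch*heads, time, head_dim); returns (batch*heads, time, head_dim)."""
  scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))  # (B*H, Tq, Tk)
  if mask is not None:
    scores = scores.masked_fill(mask.bool(), -1e10)         # hide padded keys
  probs = F.softmax(scores, dim=-1)
  if dropout_p > 0.0:
    probs = F.dropout(probs, p=dropout_p)
  return probs @ v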
def transduce(self, embed_sent: expression_seqs.ExpressionSequence) \
    -> expression_seqs.ExpressionSequence:
  src = embed_sent.as_tensor()

  sent_len = tt.sent_len(src)
  batch_size = tt.batch_size(src)
  # integer division so pad_size can be used as a dimension
  # TODO: adapt it also for even window sizes
  pad_size = (self.window_receptor - 1) // 2
  src = dy.concatenate([
    dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
    src,
    dy.zeroes((self.input_dim, pad_size), batch_size=batch_size)
  ], d=1)
  padded_sent_len = sent_len + 2 * pad_size

  conv1 = dy.parameter(self.pConv1)
  bias1 = dy.parameter(self.pBias1)
  src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1), batch_size=batch_size)
  cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])

  hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1), batch_size=batch_size)
  hidden_layer = self._apply_nonlinearity(hidden_layer)
  for conv_hid, bias_hid in self.builder_layers:
    hidden_layer = dy.conv2d_bias(hidden_layer, dy.parameter(conv_hid), dy.parameter(bias_hid),
                                  stride=[1, 1])
    hidden_layer = dy.reshape(hidden_layer, (self.internal_dim, sent_len, 1), batch_size=batch_size)
    hidden_layer = self._apply_nonlinearity(hidden_layer)

  last_conv = dy.parameter(self.last_conv)
  last_bias = dy.parameter(self.last_bias)
  output = dy.conv2d_bias(hidden_layer, last_conv, last_bias, stride=[1, 1])
  output = dy.reshape(output, (sent_len, self.output_dim), batch_size=batch_size)
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output)
  self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
  return output_seq

def _apply_nonlinearity(self, hidden_layer):
  # compare strings with '==', not 'is' (identity comparison is unreliable for strings)
  if self.non_linearity == 'linear':
    return hidden_layer
  elif self.non_linearity == 'tanh':
    return dy.tanh(hidden_layer)
  elif self.non_linearity == 'relu':
    return dy.rectify(hidden_layer)
  elif self.non_linearity == 'sigmoid':
    return dy.logistic(hidden_layer)
  return hidden_layer
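# The stack above is effectively a 1D convolution over the time axis with
# "same" padding so the output length matches the input. An equivalent sketch
# in plain PyTorch using Conv1d:
import torch
import torch.nn as nn

class SameLengthConv(nn.Module):
  def __init__(self, input_dim: int, hidden_dim: int, window: int = 3):
    super().__init__()
    assert window % 2 == 1, "odd window so (window - 1) // 2 padding preserves length"
    self.conv = nn.Conv1d(input_dim, hidden_dim, kernel_size=window,
                          padding=(window - 1) // 2)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """x: (batch, time, input_dim) -> (batch, time, hidden_dim)"""
    y = self.conv(x.transpose(1, 2))  # Conv1d expects (batch, channels, time)
    return torch.relu(y).transpose(1, 2)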