def _score(self, src_codes, tgt_nls):
    """Score (source code, target NL) example pairs; inputs are assumed
    to arrive sorted by code length."""
    args = self.args

    # Vectorize both sides; .t() puts batch on the leading dimension.
    if args.tie_embed:
        code_var = self.to_input_variable_with_unk_handling(
            src_codes, cuda=args.cuda).t()
        nl_var = self.to_input_variable_with_unk_handling(
            tgt_nls, cuda=args.cuda).t()
    else:
        code_var = nn_utils.to_input_variable(
            src_codes, self.vocab.code, cuda=args.cuda).t()
        nl_var = nn_utils.to_input_variable(
            tgt_nls, self.vocab.source, cuda=args.cuda).t()

    def _float_mask(seqs):
        # 1.0 at valid (non-padding) positions, 0.0 elsewhere.
        lengths = [len(seq) for seq in seqs]
        mask = nn_utils.length_array_to_mask_tensor(
            lengths, cuda=args.cuda, valid_entry_has_mask_one=True)
        return Variable(mask.float(), requires_grad=False)

    return self.pi_model(code_var, nl_var,
                         _float_mask(src_codes), _float_mask(tgt_nls))
def forward(self, src_sents_var, src_sents_len, tgt_sents_var, tgt_token_copy_pos, tgt_token_copy_mask, tgt_token_gen_mask):
    """Compute log p(y|x) for a batch with a generate/copy mixture model.

    :param src_sents_var: Variable(src_sent_len, batch_size), source word ids
    :param src_sents_len: list[int], source sentence lengths
    :param tgt_sents_var: Variable(tgt_sent_len, batch_size), target word ids
    :param tgt_token_copy_pos: Variable(tgt_action_len, batch_size), source
        positions to copy from at each target step
    :param tgt_token_copy_mask: Variable(tgt_action_len, batch_size), 1 where
        the gold token can be produced by copying
    :param tgt_token_gen_mask: Variable(tgt_action_len, batch_size), 1 where
        the gold token can be produced by generation
    :return: Variable(batch_size), per-example log-likelihood
    """
    src_encodings, (last_state, last_cell) = self.encode(src_sents_var, src_sents_len)
    dec_init_vec = self.init_decoder_state(last_state, last_cell)
    # (batch_size, src_sent_len)
    src_sent_masks = nn_utils.length_array_to_mask_tensor(src_sents_len, cuda=self.cuda)
    # NOTE(review): decode is called with the mask tensor as its second
    # argument; the LSTM-style decode elsewhere in this file takes a
    # length list there — presumably this class defines its own decode.
    # TODO confirm against the enclosing class.
    # (tgt_sent_len - 1, batch_size, hidden_size)
    att_vecs = self.decode(src_encodings, src_sent_masks, dec_init_vec, tgt_sents_var)
    # Probability of choosing generation vs. copy at each step:
    # (tgt_sent_len - 1, batch_size, 2)
    tgt_token_predictor = F.softmax(self.tgt_token_predictor(att_vecs), dim=-1)
    # Generation distribution over the target vocabulary:
    # (tgt_sent_len - 1, batch_size, tgt_vocab_size)
    token_gen_prob = F.softmax(self.readout(att_vecs), dim=-1)
    # Copy distribution over source positions:
    # (tgt_sent_len - 1, batch_size, src_sent_len)
    token_copy_prob = self.src_pointer_net(src_encodings, src_sent_masks, att_vecs)
    # Drop the first target token (presumably the <s> start marker, not
    # </s> as the original comment said — TODO confirm) so predictions at
    # step t are scored against the token at t+1.
    tgt_token_idx = tgt_sents_var[1:]
    tgt_token_copy_pos = tgt_token_copy_pos[1:]
    tgt_token_gen_mask = tgt_token_gen_mask[1:]
    tgt_token_copy_mask = tgt_token_copy_mask[1:]
    # Probability of generating the gold token, zeroed where generation
    # is not applicable: (tgt_sent_len - 1, batch_size)
    tgt_token_gen_prob = torch.gather(
        token_gen_prob, dim=2,
        index=tgt_token_idx.unsqueeze(2)).squeeze(2) * tgt_token_gen_mask
    # Probability of copying the gold token from its source position,
    # zeroed where copying is not applicable: (tgt_sent_len - 1, batch_size)
    tgt_token_copy_prob = torch.gather(
        token_copy_prob, dim=2,
        index=tgt_token_copy_pos.unsqueeze(2)).squeeze(2) * tgt_token_copy_mask
    # 1 at positions where the token is producible at all (by either route).
    tgt_token_mask = torch.gt(tgt_token_gen_mask + tgt_token_copy_mask, 0.).float()
    # Mixture of gen/copy probabilities; the 1e-7 term only kicks in at
    # masked positions so log() never sees an exact zero there.
    tgt_token_prob = torch.log(
        tgt_token_predictor[:, :, 0] * tgt_token_gen_prob +
        tgt_token_predictor[:, :, 1] * tgt_token_copy_prob +
        1.e-7 * (1. - tgt_token_mask))
    # Zero out log-probs at padded / unproducible positions.
    tgt_token_prob = tgt_token_prob * tgt_token_mask
    # Sum over time steps -> (batch_size)
    scores = tgt_token_prob.sum(dim=0)
    return scores
def sample_from_src_variable(self, src_sents_var, src_sents_len, sample_size):
    """Encode the source batch and draw `sample_size` samples per source
    sentence via sample_from_src_encoding."""
    # Repeat each source length `sample_size` times so the mask covers
    # the expanded (batch_size * sample_size) rows.
    expanded_lens = [length for length in src_sents_len
                     for _ in range(sample_size)]
    # (batch_size * sample_size, src_sent_len)
    src_sent_masks = nn_utils.length_array_to_mask_tensor(expanded_lens,
                                                          cuda=self.cuda)

    src_encodings, (last_state, last_cell) = self.encode(src_sents_var,
                                                         src_sents_len)
    dec_init_vec = self.init_decoder_state(last_state, last_cell)

    return self.sample_from_src_encoding(src_encodings, dec_init_vec,
                                         sample_size, src_sent_masks)
def sample_from_src_variable(self, src_sents_var, src_sents_len, sample_size):
    """Encode the source sentences, then sample `sample_size` hypotheses
    for each one from the resulting encodings."""
    # Each length appears `sample_size` times: the sampler works on an
    # expanded batch of size batch_size * sample_size.
    repeated_lens = chain.from_iterable(
        [length] * sample_size for length in src_sents_len)
    # (batch_size * sample_size, src_sent_len)
    masks = nn_utils.length_array_to_mask_tensor(list(repeated_lens),
                                                 cuda=self.cuda)

    encodings, (h_last, c_last) = self.encode(src_sents_var, src_sents_len)
    init_vec = self.init_decoder_state(h_last, c_last)

    return self.sample_from_src_encoding(encodings, init_vec, sample_size,
                                         masks)
def encode(self, src_sents_var, src_sents_len):
    """Encode the input natural language utterance with a transformer encoder.

    Args:
        src_sents_var: a variable of shape (src_sent_len, batch_size),
            representing word ids of the input
        src_sents_len: a list of lengths of input source sentences,
            sorted by descending order

    Returns:
        src_encodings: source encodings of shape
            (batch_size, src_sent_len, hidden_size)
        last_state, last_cell: the last hidden state and cell state of the
            encoder, of shape (batch_size, hidden_size); here both are the
            mean of the per-token encodings.
    """
    args = self.args
    # apply word dropout: each token is independently replaced by <unk>
    # with probability args.word_dropout (training only)
    if self.training and args.word_dropout:
        mask = Variable(
            self.new_tensor(src_sents_var.size()).fill_(
                1. - args.word_dropout).bernoulli().long())
        src_sents_var = src_sents_var * mask + (
            1 - mask) * self.vocab.source.unk_id
    # embed then project: (src_sent_len, batch_size, hidden_size)
    src_enc_vec = torch.tanh(
        self.src_enc_linear(self.src_embed(src_sents_var)))
    # add positional encodings; NOTE(review): the sqrt scaling uses
    # embed_size although src_enc_vec has hidden_size features — confirm
    # this is intentional. (src_sent_len, batch_size, hidden_size)
    src = self.src_pos_encoder(src_enc_vec * math.sqrt(args.embed_size))
    # NOTE(review): a square *subsequent* (causal) mask on an encoder
    # restricts each token to attending only to earlier positions, which
    # is unusual for an encoder — confirm this is intended.
    # (src_sent_len, src_sent_len)
    src_mask = generate_square_subsequent_mask(src.shape[0], args.cuda)
    # padding mask: (batch_size, src_sent_len)
    src_key_padding_mask = length_array_to_mask_tensor(
        src_sents_len, args.cuda)
    # (src_sent_len, batch_size, hidden_size)
    src_encodings = self.transformer_encoder(src, src_mask,
                                             src_key_padding_mask)
    # shape assertions (stripped under python -O)
    src_sent_len, batch_size = src_sents_var.shape
    assert (src_enc_vec.shape == (src_sent_len, batch_size,
                                  args.hidden_size))
    assert (src.shape == (src_sent_len, batch_size, args.hidden_size))
    assert (src_mask.shape == (src_sent_len, src_sent_len))
    assert (src_key_padding_mask.shape == (batch_size, src_sent_len))
    assert (src_encodings.shape == (src_sent_len, batch_size,
                                    args.hidden_size))
    # -> (batch_size, src_sent_len, hidden_size)
    src_encodings = src_encodings.permute(1, 0, 2)
    # last_state = src_encodings[:, 0, :]
    # mean-pool token encodings as the "last state"; the cell state is
    # aliased to it so downstream LSTM-style init code keeps working
    last_state = src_encodings.mean(1)
    last_cell = last_state
    return src_encodings, (last_state, last_cell)
def decode(self, src_encodings, src_sents_len, dec_init_vec, tgt_sents_var):
    """Run the attentional LSTM decoder with teacher forcing and return the
    per-step output scores.

    :param src_encodings: Variable(src_sent_len, batch_size, hidden_size * 2)
    :param src_sents_len: list[int]
    :param dec_init_vec: tuple((batch_size, hidden_size)) initial (h, c)
    :param tgt_sents_var: Variable(tgt_sent_len, batch_size)
    :return: scores: Variable(tgt_sent_len - 1, batch_size, vocab_size) —
        one score tensor per decoding step, as produced by self.step
    """
    new_tensor = src_encodings.data.new
    batch_size = src_encodings.size(1)

    h_tm1 = dec_init_vec
    # (batch_size, query_len, hidden_size * 2)
    src_encodings = src_encodings.permute(1, 0, 2)
    # pre-compute the attention projection once; it is reused every step
    # (batch_size, query_len, hidden_size)
    src_encodings_att_linear = self.att_src_linear(src_encodings)
    # initialize the attentional vector (input feeding)
    att_tm1 = Variable(new_tensor(batch_size, self.hidden_size).zero_(),
                       requires_grad=False)
    # (batch_size, src_sent_len)
    src_sent_masks = nn_utils.length_array_to_mask_tensor(src_sents_len,
                                                          cuda=self.cuda)
    # (tgt_sent_len, batch_size, embed_size)
    tgt_token_embed = self.tgt_embed(tgt_sents_var)
    scores = []
    # start from `<s>`, until y_{T-1}: [:-1] drops the last token, so the
    # loop runs tgt_sent_len - 1 times
    for t, y_tm1_embed in list(enumerate(tgt_token_embed.split(split_size=1)))[:-1]:
        # input feeding: concatenate y_tm1 and previous attentional vector
        # split() keeps the first dim, hence the squeeze
        y_tm1_embed = y_tm1_embed.squeeze(0)
        if t > 0 and self.decoder_word_dropout:
            # zero out whole token embeddings with prob decoder_word_dropout
            # (batch_size)
            y_tm1_mask = Variable(torch.bernoulli(new_tensor(batch_size).fill_(1 - self.decoder_word_dropout)))
            y_tm1_embed = y_tm1_embed * y_tm1_mask.unsqueeze(1)
        x = torch.cat([y_tm1_embed, att_tm1], 1)
        (h_t, cell_t), att_t, score_t = self.step(x, h_tm1, src_encodings,
                                                  src_encodings_att_linear,
                                                  src_sent_masks=src_sent_masks)
        scores.append(score_t)
        att_tm1 = att_t
        h_tm1 = (h_t, cell_t)
    # (tgt_sent_len - 1, batch_size, vocab_size) — the original comment
    # said src_sent_len, but the leading dim is the number of loop steps
    scores = torch.stack(scores)
    return scores
def decode(self, src_encodings, src_sents_len, dec_init_vec, tgt_sents_var):
    """Teacher-forced attentional LSTM decoding; returns the stacked
    per-step scores from self.step.

    :param src_encodings: Variable(src_sent_len, batch_size, hidden_size * 2)
    :param src_sents_len: list[int]
    :param dec_init_vec: tuple((batch_size, hidden_size)) initial (h, c)
    :param tgt_sents_var: Variable(tgt_sent_len, batch_size)
    :return: scores: Variable(tgt_sent_len - 1, batch_size, vocab_size) —
        one entry per decoding step
    """
    new_tensor = src_encodings.data.new
    batch_size = src_encodings.size(1)

    h_tm1 = dec_init_vec
    # (batch_size, query_len, hidden_size * 2)
    src_encodings = src_encodings.permute(1, 0, 2)
    # attention keys are projected once up front and reused each step
    # (batch_size, query_len, hidden_size)
    src_encodings_att_linear = self.att_src_linear(src_encodings)
    # initialize the attentional vector for input feeding
    att_tm1 = Variable(new_tensor(batch_size, self.hidden_size).zero_(),
                       requires_grad=False)
    # (batch_size, src_sent_len)
    src_sent_masks = nn_utils.length_array_to_mask_tensor(src_sents_len,
                                                          cuda=self.cuda)
    # (tgt_sent_len, batch_size, embed_size)
    tgt_token_embed = self.tgt_embed(tgt_sents_var)
    scores = []
    # start from `<s>`, until y_{T-1}: the trailing [:-1] means the loop
    # executes tgt_sent_len - 1 times
    for t, y_tm1_embed in list(enumerate(tgt_token_embed.split(split_size=1)))[:-1]:
        # input feeding: concatenate y_tm1 and previous attentional vector
        # split() keeps the first dim, so squeeze it away
        y_tm1_embed = y_tm1_embed.squeeze(0)
        if t > 0 and self.decoder_word_dropout:
            # drop entire previous-token embeddings at random
            # (batch_size)
            y_tm1_mask = Variable(torch.bernoulli(new_tensor(batch_size).fill_(1 - self.decoder_word_dropout)))
            y_tm1_embed = y_tm1_embed * y_tm1_mask.unsqueeze(1)
        x = torch.cat([y_tm1_embed, att_tm1], 1)
        (h_t, cell_t), att_t, score_t = self.step(x, h_tm1, src_encodings,
                                                  src_encodings_att_linear,
                                                  src_sent_masks=src_sent_masks)
        scores.append(score_t)
        att_tm1 = att_t
        h_tm1 = (h_t, cell_t)
    # (tgt_sent_len - 1, batch_size, vocab_size) — note: NOT src_sent_len
    # as the original comment claimed; the leading dim counts loop steps
    scores = torch.stack(scores)
    return scores
def step(self, x, src_encodings, src_key_padding_mask, hyp_len):
    """At each step during inference time, x contains embeddings of tentative
    hypothesis. We need to mask appropriately and pass the entire x into the
    transformer decoder to get the updated att_vec for each hypothesis.

    Args:
        x: tgt inputs of shape (t, hyp_num, input_dim), t is the max
            hypothesis length at step t during inference.
        src_encodings: variable of shape (src_sent_len, batch_size,
            hidden_size), encodings of source utterances.
        src_key_padding_mask: to be used as the memory_key_padding_mask for
            the attention decoder.
        hyp_len: in-progress hypothesis length np.array of shape (hyp_num,).
            All values = t.

    Returns:
        att_t: output of the transformer decoder for the t-th (last) step,
            of shape (hyp_num, hidden_size).
    """
    tgt_action_len = x.shape[0]
    batch_size = x.shape[1]
    args = self.args
    # Transformer decoder
    # project inputs into the decoder's hidden size
    # (tgt_action_len, batch_size, hidden_size)
    tgt_dec_vec = torch.tanh(self.tgt_dec_linear(x))
    # add positional encodings, scaled by sqrt(input_dim)
    # (tgt_action_len, batch_size, hidden_size)
    tgt = self.tgt_pos_encoder(tgt_dec_vec * math.sqrt(self.input_dim))
    # causal mask so position i attends only to positions <= i
    # NOTE(review): unlike encode(), no cuda flag is passed here — the
    # mask may end up on a different device than tgt; TODO confirm the
    # default in generate_square_subsequent_mask.
    # (tgt_action_len, tgt_action_len)
    tgt_mask = generate_square_subsequent_mask(tgt_action_len)
    memory_mask = None
    # (batch_size, tgt_action_len)
    tgt_key_padding_mask = length_array_to_mask_tensor(hyp_len, args.cuda)
    # NOTE(review): this clone is computed but never used — the decoder
    # call below passes memory_key_padding_mask=None (see the TODO about
    # mismatched batch sizes during inference).
    # (batch_size, src_sent_len)
    memory_key_padding_mask = src_key_padding_mask.clone()
    # shape assertions (stripped under python -O)
    assert (tgt_dec_vec.shape == (tgt_action_len, batch_size,
                                  args.hidden_size))
    assert (tgt.shape == (tgt_action_len, batch_size, args.hidden_size))
    assert (tgt_mask.shape == (tgt_action_len, tgt_action_len))
    assert (tgt_key_padding_mask.shape == (batch_size, tgt_action_len))
    # assert(memory_key_padding_mask.shape[0]==batch_size) TODO:incompatible batch_size during inference time
    att_vecs = self.transformer_decoder(tgt, src_encodings, tgt_mask,
                                        memory_mask, tgt_key_padding_mask,
                                        memory_key_padding_mask=None)
    assert (att_vecs.shape == (tgt_action_len, batch_size,
                               args.hidden_size))
    # only the representation of the newest (last) position is needed
    return att_vecs[-1, :, :]
def src_token_mask(self):
    """Build the padding mask for the source token batch from the stored
    per-sentence lengths (default mask polarity of
    length_array_to_mask_tensor)."""
    lengths = self.src_sents_len
    return nn_utils.length_array_to_mask_tensor(lengths, cuda=self.cuda)
def tgt_token_mask_usual(self):
    """Mask over target action positions built from self.tgt_actions_len;
    with valid_entry_has_mask_one=True, valid entries carry 1."""
    action_lengths = self.tgt_actions_len
    return nn_utils.length_array_to_mask_tensor(
        action_lengths,
        cuda=self.cuda,
        valid_entry_has_mask_one=True,
    )
def src_token_mask_usual(self):
    """Mask over source token positions built from self.src_sents_len;
    with valid_entry_has_mask_one=True, valid entries carry 1."""
    sent_lengths = self.src_sents_len
    return nn_utils.length_array_to_mask_tensor(
        sent_lengths,
        cuda=self.cuda,
        valid_entry_has_mask_one=True,
    )
def src_token_mask(self):
    """Padding mask for the source tokens, derived from the batch's
    per-sentence lengths (default polarity of
    length_array_to_mask_tensor)."""
    return nn_utils.length_array_to_mask_tensor(
        self.src_sents_len,
        cuda=self.cuda,
    )