def zero_state(self, inputs, encoder):
    batch_size = inputs.size(0)
    h0 = Variable(torch.zeros(encoder.num_layers * 2, batch_size, self.enc_hidden_dim),
                  requires_grad=False)
    c0 = Variable(torch.zeros(encoder.num_layers * 2, batch_size, self.enc_hidden_dim),
                  requires_grad=False)
    return set_cuda(h0, self.use_cuda), set_cuda(c0, self.use_cuda)
@classmethod
def unpack_batch(cls, batch, cuda):
    # unpack a data batch into padded input tensors and relation labels
    words = set_cuda(get_long_tensor(batch.word, batch.batch_size), cuda)
    masks = set_cuda(torch.eq(words, 0), cuda)
    pos = set_cuda(get_long_tensor(batch.pos, batch.batch_size), cuda)
    ner = set_cuda(get_long_tensor(batch.ner, batch.batch_size), cuda)
    coref = set_cuda(get_long_tensor(batch.coref, batch.batch_size), cuda)
    ucca_enc = set_cuda(get_long_tensor(batch.ucca_enc, batch.batch_size), cuda)
    rel = set_cuda(torch.LongTensor(batch.rel), cuda)
    input = Input(batch_size=batch.batch_size,
                  word=words,
                  mask=masks,
                  pos=pos,
                  ner=ner,
                  coref=coref,
                  ucca_enc=ucca_enc,
                  len=batch.len,
                  head=batch.head,
                  ucca_head=batch.ucca_head,
                  ucca_multi_head=batch.ucca_multi_head,
                  ucca_dist_from_mh_path=batch.ucca_dist_from_mh_path,
                  subj_p=batch.subj_p,
                  obj_p=batch.obj_p,
                  id=batch.id,
                  orig_idx=batch.orig_idx)
    return input, rel
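# Illustrative sketch (not part of the original module): how a training step
# might consume the output of unpack_batch. The names `model` and `criterion`
# and the forward call signature are assumptions made for this example only;
# `inputs` is the Input namedtuple of padded tensors and `rel` holds the gold
# relation labels.
def _example_train_step(model, batch, criterion, cuda=False):
    inputs, rel = model.unpack_batch(batch, cuda)
    logits = model(inputs)          # assumed forward pass over the unpacked inputs
    loss = criterion(logits, rel)   # e.g. cross-entropy against the relation labels
    loss.backward()
    return loss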
def dec_zero_state(self, batch_size):
    h0 = Variable(torch.zeros(batch_size, self.dec_hidden_dim), requires_grad=False)
    c0 = Variable(torch.zeros(batch_size, self.dec_hidden_dim), requires_grad=False)
    return set_cuda(h0, self.use_cuda), set_cuda(c0, self.use_cuda)
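# Note (illustrative, not part of the original code): zero_state builds the
# initial (h0, c0) for a bidirectional LSTM encoder, shaped
# [num_layers * 2, batch_size, enc_hidden_dim], while dec_zero_state builds a
# per-example decoder state of shape [batch_size, dec_hidden_dim]. A typical
# call site might look like the following; the attribute names `self.encoder`
# and `self.emb` are assumptions here.
#
#     h0, c0 = self.zero_state(inputs.word, self.encoder)
#     enc_out, _ = self.encoder(self.emb(inputs.word), (h0, c0))
#     dec_hidden = self.dec_zero_state(inputs.word.size(0))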
def decode(self, dec_inputs, dec_hidden, ctx, ctx_tokens, ctx_mask=None, indi_h=None, inference=False):
    batch_size = dec_inputs.size(0)
    seq_len = dec_inputs.size(1)
    ctx_len = ctx_tokens.size(1)
    copy_indices = set_cuda(torch.LongTensor(batch_size).zero_() + -1,
                            self.use_cuda)  # by default do not hard copy
    if self.use_indi:
        dec_inputs_with_indi = torch.cat([
            dec_inputs,
            indi_h.unsqueeze(1).expand(-1, dec_inputs.size(1), -1)
        ], dim=2)
    else:
        dec_inputs_with_indi = dec_inputs  # no indi
    h, h_tilde, h_c, attn, cov, dec_hidden = self.decoder(
        dec_inputs_with_indi, dec_hidden, ctx, ctx_mask)
    h_tilde_flat = h_tilde.contiguous().view(-1, h_tilde.size(2))  # B*L x dim
    decoder_logits = self.dec2vocab(h_tilde_flat).view(
        h_tilde.size(0), h_tilde.size(1), -1)
    decoder_logits[:, :, constant.PAD_ID] = -constant.INFINITY_NUMBER
    decoder_logits[:, :, constant.UNK_ID] = -constant.INFINITY_NUMBER
    decoder_probs = self.get_prob(decoder_logits)  # [B, QT, V]
    decoder_probs = torch.log(decoder_probs + 1e-12)
    copier_probs = attn
    copier_probs = torch.log(copier_probs + 1e-12)
    h_flat = h.contiguous().view(-1, h.size(2))
    h_c_flat = h_c.contiguous().view(-1, h_c.size(2))
    dec_inputs_flat = dec_inputs.contiguous().view(-1, dec_inputs.size(2))
    c = self.combiner(torch.cat([h_flat, h_c_flat, dec_inputs_flat],
                                dim=1)).view(batch_size, seq_len, -1)  # [B, QT, 2]
    cpy_prob = c[:, :, 0]
    dec_prob = c[:, :, 1]
    if inference:
        assert cpy_prob.size(1) == seq_len == 1, \
            "Inference mode has to decode one step at a time."
        use_cpy = (cpy_prob > dec_prob).float()
        cpy_prob = use_cpy
        dec_prob = 1.0 - use_cpy
        # find copy indices
        _, copy_indices = copier_probs.squeeze(1).max(dim=1)
        for i, c in enumerate(use_cpy.squeeze(1).data):
            if c == 0:
                copy_indices[i] = -1  # do not hard copy
        copy_indices = copy_indices.data
    expanded_cpy_prob = cpy_prob.unsqueeze(2).expand(batch_size, seq_len, ctx_len)
    expanded_dec_prob = dec_prob.unsqueeze(2).expand(batch_size, seq_len, self.vocab_size)
    full_copier_probs = set_cuda(
        Variable(torch.zeros(batch_size, seq_len, self.vocab_size) + -1e10),
        self.use_cuda)
    expanded_ctx_tokens = ctx_tokens.unsqueeze(1).expand_as(copier_probs)
    combined_copier_probs = copier_probs + expanded_cpy_prob  # combine before scatter
    full_copier_probs.scatter_(2, expanded_ctx_tokens, combined_copier_probs)  # scatter info back
    combined_probs = torch.exp(full_copier_probs) + torch.exp(
        expanded_dec_prob + decoder_probs)
    log_probs = torch.log(combined_probs)
    if inference:
        return log_probs, dec_hidden, copy_indices
    return log_probs, dec_hidden, attn, cov
def decode(self, dec_inputs, dec_hidden, ctx, ctx_tokens, ctx_mask=None, bg_h=None, inference=False):
    """
    Every input should be batch first. For inference mode, the output will be
    decided to be either copier prob or decoder prob.
    """
    batch_size = dec_inputs.size(0)
    seq_len = dec_inputs.size(1)
    ctx_len = ctx_tokens.size(1)
    copy_indices = set_cuda(torch.LongTensor(batch_size).zero_() + -1,
                            self.use_cuda)  # by default do not hard copy
    if self.use_bg:
        dec_inputs_with_bg = torch.cat([
            dec_inputs,
            bg_h.unsqueeze(1).expand(-1, dec_inputs.size(1), -1)
        ], dim=2)
    else:
        dec_inputs_with_bg = dec_inputs  # no bg
    # attentional decoder
    h, h_tilde, h_c, attn, cov, dec_hidden = self.decoder(
        dec_inputs_with_bg, dec_hidden, ctx, ctx_mask)
    # vocab prediction layer
    h_tilde_flat = h_tilde.contiguous().view(-1, h_tilde.size(2))  # B*L x dim
    decoder_logits = self.dec2vocab(h_tilde_flat).view(
        h_tilde.size(0), h_tilde.size(1), -1)
    # force PAD and UNK logits to -inf
    decoder_logits[:, :, constant.PAD_ID] = -constant.INFINITY_NUMBER
    decoder_logits[:, :, constant.UNK_ID] = -constant.INFINITY_NUMBER
    decoder_probs = self.get_prob(decoder_logits)  # [B, QT, V]
    decoder_probs = torch.log(decoder_probs + 1e-12)
    # copy network
    copier_probs = attn
    copier_probs = torch.log(copier_probs + 1e-12)  # [B, QT, CT]
    # combine
    h_flat = h.contiguous().view(-1, h.size(2))
    h_c_flat = h_c.contiguous().view(-1, h_c.size(2))
    dec_inputs_flat = dec_inputs.contiguous().view(-1, dec_inputs.size(2))
    c = self.combiner(torch.cat([h_flat, h_c_flat, dec_inputs_flat],
                                dim=1)).view(batch_size, seq_len, -1)  # [B, QT, 2]
    cpy_prob = c[:, :, 0]
    dec_prob = c[:, :, 1]
    # if inference=True, do hard selection
    if inference:
        assert cpy_prob.size(1) == seq_len == 1, \
            "Inference mode has to decode one step at a time."
        use_cpy = (cpy_prob > dec_prob).float()
        cpy_prob = use_cpy
        dec_prob = 1.0 - use_cpy
        # find copy indices
        _, copy_indices = copier_probs.squeeze(1).max(dim=1)
        for i, c in enumerate(use_cpy.squeeze(1).data):
            if c == 0:
                copy_indices[i] = -1  # do not hard copy
        copy_indices = copy_indices.data
    expanded_cpy_prob = cpy_prob.unsqueeze(2).expand(batch_size, seq_len, ctx_len)
    expanded_dec_prob = dec_prob.unsqueeze(2).expand(batch_size, seq_len, self.vocab_size)
    # scatter probs back
    full_copier_probs = set_cuda(
        Variable(torch.zeros(batch_size, seq_len, self.vocab_size) + -1e10),
        self.use_cuda)
    expanded_ctx_tokens = ctx_tokens.unsqueeze(1).expand_as(copier_probs)
    combined_copier_probs = copier_probs + expanded_cpy_prob  # combine before scatter
    full_copier_probs.scatter_(2, expanded_ctx_tokens, combined_copier_probs)  # scatter info back
    # combine in log space
    combined_probs = torch.exp(full_copier_probs) + torch.exp(
        expanded_dec_prob + decoder_probs)
    log_probs = torch.log(combined_probs)
    # output dec_hidden for future decoding
    if inference:
        return log_probs, dec_hidden, copy_indices
    return log_probs, dec_hidden, attn, cov
def decode(self, dec_inputs, dec_hidden, ctx, ctx_tokens, ctx_mask=None, inference=False):
    batch_size = dec_inputs.size(0)
    seq_len = dec_inputs.size(1)
    ctx_len = ctx_tokens.size(1)
    copy_indices = set_cuda(torch.LongTensor(batch_size).zero_() + -1,
                            self.use_cuda)  # by default do not hard copy
    # attentional decoder
    h, h_tilde, h_c, attn, cov, dec_hidden = self.decoder(
        dec_inputs, dec_hidden, ctx, ctx_mask)
    # vocab prediction layer
    h_tilde_flat = h_tilde.contiguous().view(-1, h_tilde.size(2))  # B*L x dim
    decoder_logits = self.to_vocab(h_tilde_flat).view(
        h_tilde.size(0), h_tilde.size(1), -1)
    # force PAD and UNK logits to -inf
    decoder_logits[:, :, constant.PAD_ID] = -constant.INFINITY_NUMBER
    decoder_logits[:, :, constant.UNK_ID] = -constant.INFINITY_NUMBER
    decoder_probs = self.get_prob(decoder_logits)  # [B, QT, V]
    decoder_probs = torch.log(decoder_probs + 1e-12)
    # copy network
    copier_probs = attn
    copier_probs = torch.log(copier_probs + 1e-12)  # [B, QT, CT]
    # combine all of the outputs
    h_flat = h.contiguous().view(-1, h.size(2))
    h_c_flat = h_c.contiguous().view(-1, h_c.size(2))
    dec_inputs_flat = dec_inputs.contiguous().view(-1, dec_inputs.size(2))
    c = self.sequential(
        torch.cat([h_flat, h_c_flat, dec_inputs_flat],
                  dim=1)).view(batch_size, seq_len, -1)  # [B, QT, 2]
    cpy_prob = c[:, :, 0]
    dec_prob = c[:, :, 1]
    if inference:
        use_cpy = (cpy_prob > dec_prob).float()
        cpy_prob = use_cpy
        dec_prob = 1.0 - use_cpy
        _, copy_indices = copier_probs.squeeze(1).max(dim=1)
        for i, c in enumerate(use_cpy.squeeze(1).data):
            if c == 0:
                copy_indices[i] = -1
        copy_indices = copy_indices.data
    expanded_cpy_prob = cpy_prob.unsqueeze(2).expand(batch_size, seq_len, ctx_len)
    expanded_dec_prob = dec_prob.unsqueeze(2).expand(batch_size, seq_len, self.vocab_size)
    full_copier_probs = set_cuda(
        Variable(torch.zeros(batch_size, seq_len, self.vocab_size) + -1e10),
        self.use_cuda)
    expanded_ctx_tokens = ctx_tokens.unsqueeze(1).expand_as(copier_probs)
    combined_copier_probs = copier_probs + expanded_cpy_prob  # combine before scatter
    full_copier_probs.scatter_(2, expanded_ctx_tokens, combined_copier_probs)  # scatter info back
    combined_probs = torch.exp(full_copier_probs) + torch.exp(
        expanded_dec_prob + decoder_probs)
    log_probs = torch.log(combined_probs)
    # output dec_hidden for next step(s) decoding
    if inference:
        return log_probs, dec_hidden, copy_indices
    return log_probs, dec_hidden, attn, cov
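# Illustrative sketch (not from the original code): a minimal greedy inference
# loop over decode(..., inference=True), decoding one token per step as the
# method requires. The embedding attribute `model.emb`, the `sos_id` argument,
# and the way copy indices are resolved back to source positions are
# assumptions made for this example only.
def _example_greedy_decode(model, ctx, ctx_tokens, sos_id, ctx_mask=None, max_steps=20):
    batch_size = ctx_tokens.size(0)
    dec_hidden = model.dec_zero_state(batch_size)
    tok = set_cuda(Variable(torch.LongTensor(batch_size).fill_(sos_id)), model.use_cuda)
    preds, copies = [], []
    for _ in range(max_steps):
        dec_inputs = model.emb(tok).unsqueeze(1)  # [B, 1, emb_dim]
        log_probs, dec_hidden, copy_indices = model.decode(
            dec_inputs, dec_hidden, ctx, ctx_tokens, ctx_mask, inference=True)
        _, tok = log_probs.squeeze(1).max(dim=1)  # greedy pick from the combined distribution
        preds.append(tok)            # vocab id chosen at this step
        copies.append(copy_indices)  # source position to hard-copy from, or -1
    return preds, copies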