def beamsearch(memory, model, device, beam_size=4, candidates=1,
               max_seq_length=128, bos_token=1, eos_token=2):
    # memory: Tx1xE
    model.eval()

    beam = Beam(beam_size=beam_size, min_length=0, n_top=candidates, ranker=None,
                start_token_id=bos_token, end_token_id=eos_token)

    with torch.no_grad():
        # memory = memory.repeat(1, beam_size, 1)  # TxNxE
        memory = model.SequenceModeling.expand_memory(memory, beam_size)

        for _ in range(max_seq_length):
            tgt_inp = beam.get_current_state().transpose(0, 1).to(device)  # TxN
            decoder_outputs, memory = model.SequenceModeling.forward_decoder(tgt_inp, memory)

            log_prob = log_softmax(decoder_outputs[:, -1, :].squeeze(0), dim=-1)
            beam.advance(log_prob.cpu())

            if beam.done():
                break

        scores, ks = beam.sort_finished(minimum=1)

        hypotheses = []
        for i, (times, k) in enumerate(ks[:candidates]):
            hypothesis = beam.get_hypothesis(times, k)
            hypotheses.append(hypothesis)

        # prepend the BOS token and drop the final (EOS) token of the best hypothesis
        return [bos_token] + [int(i) for i in hypotheses[0][:-1]]
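# The loop above feeds the current partial hypotheses back through the decoder and hands
# per-beam log-probabilities to Beam.advance. A minimal, self-contained sketch of what one
# such advance step typically computes (flatten beam x vocab scores, take the top beam_size,
# recover back-pointers); all sizes below are made up for illustration.
import torch

beam_size, vocab_size = 4, 10

# cumulative score of each hypothesis so far, and log-probs of the next token per hypothesis
scores = torch.zeros(beam_size)
log_prob = torch.log_softmax(torch.randn(beam_size, vocab_size), dim=-1)

# every (hypothesis, next-token) pair gets a candidate score
candidate = scores.unsqueeze(1) + log_prob               # [beam_size, vocab_size]

# keep the beam_size best candidates overall
best_scores, flat_idx = candidate.view(-1).topk(beam_size)
prev_beam = torch.div(flat_idx, vocab_size, rounding_mode='floor')  # which hypothesis each winner extends
next_token = flat_idx % vocab_size                                   # which token it appends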
def gensummary_gpt2(template_vec, ge, vocab, LMModel, word_list, subvocab, clustermask=None,
                    mono=True, renorm=True, temperature=1, bpe2word='last',
                    max_step=20, beam_width=10, beam_width_start=10,
                    alpha=0.1, alpha_start=0.1,
                    begineos=True, stopbyLMeos=False, devid=0, **kwargs):
    """
    Unsupervised sentence summary generation using beam search, by contextual matching and a
    summary style language model. The contextual matching here is on top of pretrained ELMo embeddings.

    Input:
        template_vec: forward only ELMo embeddings of the source sentence. 'torch.Tensor' of size (3, seq_len, 512).
        ge: 'gpt2_sequential_embedder.GPT2Embedder' object.
        vocab: 'torchtext.vocab.Vocab' object. Should be the same as is used for the pretrained language model.
        LMModel: a pretrained language model on the summary sentences.
        word_list: a list of words in the vocabulary to work with. 'List'.
        subvocab: 'torch.LongTensor' consisting of the indices of the words corresponding to 'word_list'.
        clustermask: a binary mask for each of the sub-vocabulary words. 'torch.ByteTensor' of size
            (len(sub-vocabulary), len(vocabulary)). Default: None.
        mono: whether to keep the monotonicity constraint. Default: True.
        renorm: whether to renormalize the probabilities over the sub-vocabulary. Default: True.
        temperature: temperature applied to the softmax in the language model. Default: 1.
        bpe2word: how to turn the BPE vectors into word vectors. Choose from ['last', 'avg']. Default: 'last'.
        max_step: maximum number of beam steps.
        beam_width: beam width.
        beam_width_start: beam width of the first step.
        alpha: the amount of language model part used for scoring.
            The score is: (1 - \alpha) * similarity_logscore + \alpha * LM_logscore.
        alpha_start: the amount of language model part used for scoring, only for the first step.
        begineos: whether to begin with the special '<eos>' token as is trained in the language model.
            Note that ELMo has its own special beginning token. Default: True.
        stopbyLMeos: whether to stop a sentence solely by the language model predicting '<eos>' as the
            top possibility. Default: False.
        devid: device id to run the algorithm and LSTM language models. 'int', default: 0. -1 for cpu.
        **kwargs: other arguments input to function <Beam.beamstep>. E.g.
            normalized: whether to normalize the dot product when calculating the similarity,
                which makes it cosine similarity. Default: True.
            ifadditive: whether to use an additive model on mixing the probability scores. Default: False.

    Output:
        beam: 'Beam' object, recording all the generated sequences.
    """
    device = 'cpu' if devid == -1 else f'cuda:{devid}'

    # Beam Search: initialization
    if begineos:
        beam = Beam(1, vocab, init_ids=[vocab.stoi['<eos>']], device=device,
                    sim_score=0, lm_score=0, lm_state=None, gpt2_state=None, align_loc=None)
    else:
        beam = Beam(1, vocab, init_ids=[None], device=device,
                    sim_score=0, lm_score=0, lm_state=None, gpt2_state=None, align_loc=None)

    # first step: start with 'beam_width_start' best matched words
    beam.beamstep(beam_width_start,
                  beam.combscoreK_GPT2,
                  template_vec=template_vec,
                  ge=ge,
                  LMModel=LMModel,
                  word_list=word_list,
                  subvocab=subvocab,
                  clustermask=clustermask,
                  alpha=alpha_start,
                  renorm=renorm,
                  temperature=temperature,
                  bpe2word=bpe2word,
                  normalized=True,
                  ifadditive=False,
                  **kwargs)

    # run beam search, until all sentences hit <EOS> or max_step is reached
    for s in range(max_step):
        print(f'beam step {s + 1} ' + '-' * 50 + '\n')
        beam.beamstep(beam_width,
                      beam.combscoreK_GPT2,
                      template_vec=template_vec,
                      ge=ge,
                      LMModel=LMModel,
                      word_list=word_list,
                      subvocab=subvocab,
                      clustermask=clustermask,
                      mono=mono,
                      alpha=alpha,
                      renorm=renorm,
                      temperature=temperature,
                      stopbyLMeos=stopbyLMeos,
                      bpe2word=bpe2word,
                      normalized=True,
                      ifadditive=False,
                      **kwargs)
        # all beams have reached termination
        if beam.endall:
            break

    return beam
def forward(self, decoder_input, embedded_inputs, hidden, context):
    """
    Args:
        decoder_input: The initial input to the decoder, of size
            [batch_size x embedding_dim]. Trainable parameter.
        embedded_inputs: [sourceL x batch_size x embedding_dim]
        hidden: the prev hidden state, of size [batch_size x hidden_dim].
            Initially this is set to (enc_h[-1], enc_c[-1])
        context: encoder outputs, [sourceL x batch_size x hidden_dim]
    """

    def recurrence(x, hidden, logit_mask, prev_idxs, step):
        hx, cx = hidden  # batch_size x hidden_dim

        gates = self.input_weights(x) + self.hidden_weights(hx)
        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

        ingate = F.sigmoid(ingate)
        forgetgate = F.sigmoid(forgetgate)
        cellgate = F.tanh(cellgate)
        outgate = F.sigmoid(outgate)

        cy = (forgetgate * cx) + (ingate * cellgate)
        hy = outgate * F.tanh(cy)  # batch_size x hidden_dim

        g_l = hy
        for i in range(self.n_glimpses):
            ref, logits = self.glimpse(g_l, context)
            logits, logit_mask = self.apply_mask_to_logits(step, logits, logit_mask, prev_idxs)
            # [batch_size x h_dim x sourceL] * [batch_size x sourceL x 1] =
            # [batch_size x h_dim x 1]
            g_l = torch.bmm(ref, self.sm(logits).unsqueeze(2)).squeeze(2)
        _, logits = self.pointer(g_l, context)
        logits, logit_mask = self.apply_mask_to_logits(step, logits, logit_mask, prev_idxs)
        probs = self.sm(logits)
        return hy, cy, probs, logit_mask

    batch_size = context.size(1)
    outputs = []
    selections = []
    steps = range(self.max_length)  # or until terminating symbol ?
    inps = []
    idxs = None
    mask = None

    if self.decode_type == "stochastic":
        for i in steps:
            hx, cx, probs, mask = recurrence(decoder_input, hidden, mask, idxs, i)
            hidden = (hx, cx)
            # select the next inputs for the decoder [batch_size x hidden_dim]
            decoder_input, idxs = self.decode_stochastic(probs, embedded_inputs, selections)
            inps.append(decoder_input)
            # use outs to point to next object
            outputs.append(probs)
            selections.append(idxs)
        return (outputs, selections), hidden

    elif self.decode_type == "beam_search":
        # Expand input tensors for beam search
        decoder_input = Variable(decoder_input.data.repeat(self.beam_size, 1))
        context = Variable(context.data.repeat(1, self.beam_size, 1))
        hidden = (Variable(hidden[0].data.repeat(self.beam_size, 1)),
                  Variable(hidden[1].data.repeat(self.beam_size, 1)))

        beam = [
            Beam(self.beam_size, self.max_length, cuda=self.use_cuda)
            for k in range(batch_size)
        ]

        for i in steps:
            hx, cx, probs, mask = recurrence(decoder_input, hidden, mask, idxs, i)
            hidden = (hx, cx)

            probs = probs.view(self.beam_size, batch_size, -1).transpose(0, 1).contiguous()

            n_best = 1
            # select the next inputs for the decoder [batch_size x hidden_dim]
            decoder_input, idxs, active = self.decode_beam(probs, embedded_inputs,
                                                           beam, batch_size, n_best, i)

            inps.append(decoder_input)
            # use probs to point to next object
            if self.beam_size > 1:
                outputs.append(probs[:, 0, :])
            else:
                outputs.append(probs.squeeze(0))  # Check for indexing

            selections.append(idxs)
            # Should be done decoding
            if len(active) == 0:
                break
            decoder_input = Variable(decoder_input.data.repeat(self.beam_size, 1))

        return (outputs, selections), hidden
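# The recurrence above implements a single LSTM cell by hand: one linear map of the input plus
# one of the hidden state, split into four gates. A standalone sketch of that gate arithmetic
# with hypothetical sizes (the real projections are self.input_weights / self.hidden_weights):
import torch

batch_size, embedding_dim, hidden_dim = 2, 8, 16

input_weights = torch.nn.Linear(embedding_dim, 4 * hidden_dim)
hidden_weights = torch.nn.Linear(hidden_dim, 4 * hidden_dim)

x = torch.randn(batch_size, embedding_dim)
hx = torch.randn(batch_size, hidden_dim)
cx = torch.randn(batch_size, hidden_dim)

gates = input_weights(x) + hidden_weights(hx)
ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

# standard LSTM cell update
cy = torch.sigmoid(forgetgate) * cx + torch.sigmoid(ingate) * torch.tanh(cellgate)
hy = torch.sigmoid(outgate) * torch.tanh(cy)  # next hidden state, batch_size x hidden_dim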
def translate(self, src, trg, beam_size, Lang2):
    '''
    beam search decoding.

    :param src: [src_max_len, batch]  ## batch = 1
    :param trg: [trg_max_len, batch]  ## batch = 1
    :param sentence: [sentence_len]
    :return: best translation candidate
    '''
    max_len = trg.size(0)
    encoder_output, hidden = self.encoder(src)
    '''
    ## src: [src_max_len, batch]
    ## encoder_output: [src_max_len, batch, hidden_size]
    ## hidden: (num_layers * num_directions, batch, hidden_size) -> [2, batch, hidden_size]
    '''
    hidden = hidden[:self.decoder.n_layers]  # [n_layers, batch, hidden_size]

    # trg: [trg_max_len, batch]
    output = Variable(trg.data[0, :])  # sos [batch]

    beam = Beam(beam_size, Lang2.vocab.stoi, True)
    input_feeding = None

    for t in range(1, max_len):
        # output: [batch] -> [batch, output_size]
        output, hidden, attn_weights = self.decoder(output, hidden, encoder_output, input_feeding)
        input_feeding = output

        output = self.decoder.out(output)
        output = F.log_softmax(output, dim=1)

        word_lk = output
        if output.size(0) == 1:
            output_prob = output.squeeze(0)  # [output_size]
            word_lk = output_prob.expand(beam_size, output_prob.size(0))  # [beam_size, output_size]

            # hidden: [n_layers, batch, hidden_size]
            hidden = hidden.squeeze(1)  # [n_layers, hidden_size]
            hidden = hidden.expand(beam_size, hidden.size(0), hidden.size(1))  # [beam_size, n_layers, hidden_size]
            hidden = hidden.transpose(0, 1)  # [n_layers, beam_size, hidden_size]

            # encoder_output: [src_max_len, batch, hidden_size]
            encoder_output = encoder_output.squeeze(1)  # [src_max_len, hidden_size]
            encoder_output = encoder_output.expand(
                beam_size, encoder_output.size(0), encoder_output.size(1))  # [beam_size, src_max_len, hidden_size]
            encoder_output = encoder_output.transpose(0, 1)  # [src_max_len, beam_size, hidden_size]

            input_feeding = input_feeding.squeeze(0)
            input_feeding = input_feeding.expand(beam_size, input_feeding.size(0))

        flag = beam.advance(word_lk)
        if flag:
            break

        nextInputs = beam.get_current_state()
        # print("[nextInputs]:", nextInputs)
        output = nextInputs
        # output = Variable(nextInputs).cuda()

        originState = beam.get_current_origin()
        # print("[origin_state]:", originState)
        hidden = hidden[:, originState]
        input_feeding = input_feeding[originState]

    xx, yy = beam.get_best()
    zz = beam.get_final()
    return xx, yy, zz
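# The block guarded by `if output.size(0) == 1` widens every tensor from batch size 1 to the beam
# width on the first step; later steps re-order the beam dimension by each hypothesis's origin.
# A self-contained sketch of that expand-then-reorder pattern with made-up sizes and a made-up
# back-pointer vector (not taken from any real Beam implementation):
import torch

beam_size, n_layers, hidden_size = 4, 2, 8

# hidden state for a single sentence: [n_layers, 1, hidden_size]
hidden = torch.randn(n_layers, 1, hidden_size)

# widen to the beam
hidden = hidden.squeeze(1)                                 # [n_layers, hidden_size]
hidden = hidden.expand(beam_size, n_layers, hidden_size)   # [beam_size, n_layers, hidden_size]
hidden = hidden.transpose(0, 1).contiguous()               # [n_layers, beam_size, hidden_size]

# after an advance step, each surviving hypothesis reports which beam it came from
origin = torch.tensor([2, 0, 0, 3])  # hypothetical back-pointers
hidden = hidden[:, origin]           # re-order the beam dimension accordingly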
def decode_batch(self, idx):
    """Decode a minibatch."""
    # Get source minibatch
    input_lines_src, output_lines_src, lens_src, mask_src = get_minibatch(
        self.src['data'], self.src_dict, idx,
        self.config['data']['batch_size'],
        self.config['data']['max_src_length'],
        add_start=True, add_end=True)

    beam_size = self.beam_size

    # (1) run the encoder on the src
    context_h, (context_h_t, context_c_t) = self.get_hidden_representation(input_lines_src)
    context_h = context_h.transpose(0, 1)  # Make things sequence first.

    # (3) run the decoder to generate sentences, using beam search
    batch_size = context_h.size(1)

    # Expand tensors for each beam.
    context = Variable(context_h.data.repeat(1, beam_size, 1))
    dec_states = [
        Variable(context_h_t.data.repeat(1, beam_size, 1)),
        Variable(context_c_t.data.repeat(1, beam_size, 1))
    ]

    beam = [
        Beam(beam_size, self.tgt_dict, cuda=True)
        for k in range(batch_size)
    ]

    dec_out = self.get_init_state_decoder(dec_states[0].squeeze(0))
    dec_states[0] = dec_out

    batch_idx = list(range(batch_size))
    remaining_sents = batch_size

    for i in range(self.config['data']['max_trg_length']):
        input = torch.stack([
            b.get_current_state() for b in beam if not b.done
        ]).t().contiguous().view(1, -1)

        trg_emb = self.model.trg_embedding(Variable(input).transpose(1, 0))
        trg_h, (trg_h_t, trg_c_t) = self.model.decoder(
            trg_emb,
            (dec_states[0].squeeze(0), dec_states[1].squeeze(0)),
            context)
        dec_states = (trg_h_t.unsqueeze(0), trg_c_t.unsqueeze(0))
        dec_out = trg_h_t.squeeze(1)
        out = F.softmax(self.model.decoder2vocab(dec_out)).unsqueeze(0)

        word_lk = out.view(beam_size, remaining_sents, -1).transpose(0, 1).contiguous()

        active = []
        for b in range(batch_size):
            if beam[b].done:
                continue

            idx = batch_idx[b]
            if not beam[b].advance(word_lk.data[idx]):
                active += [b]

            for dec_state in dec_states:  # iterate over h, c
                # layers x beam*sent x dim
                sent_states = dec_state.view(-1, beam_size, remaining_sents,
                                             dec_state.size(2))[:, :, idx]
                sent_states.data.copy_(
                    sent_states.data.index_select(1, beam[b].get_current_origin()))

        if not active:
            break

        # in this section, the sentences that are still active are
        # compacted so that the decoder is not run on completed sentences
        active_idx = torch.cuda.LongTensor([batch_idx[k] for k in active])
        batch_idx = {beam: idx for idx, beam in enumerate(active)}

        def update_active(t):
            # select only the remaining active sentences
            view = t.data.view(-1, remaining_sents, self.model.decoder.hidden_size)
            new_size = list(t.size())
            new_size[-2] = new_size[-2] * len(active_idx) // remaining_sents
            return Variable(view.index_select(1, active_idx).view(*new_size))

        dec_states = (update_active(dec_states[0]), update_active(dec_states[1]))
        dec_out = update_active(dec_out)
        context = update_active(context)

        remaining_sents = len(active)

    # (4) package everything up
    allHyp, allScores = [], []
    n_best = 1

    for b in range(batch_size):
        scores, ks = beam[b].sort_best()
        allScores += [scores[:n_best]]
        hyps = zip(*[beam[b].get_hyp(k) for k in ks[:n_best]])
        allHyp += [hyps]

    return allHyp, allScores
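# update_active above compacts the decoder state so finished sentences drop out of the batch:
# the [layers, beam*sent, dim] tensor is viewed with an explicit sentence dimension, the active
# sentences are selected, and the beam-major layout is restored. A self-contained sketch with
# made-up sizes (the real hidden size comes from self.model.decoder.hidden_size):
import torch

beam_size, n_sents, hidden_size = 3, 4, 5

# decoder state laid out as [layers, beam * sent, dim], beam-major along dim 1
dec_state = torch.randn(1, beam_size * n_sents, hidden_size)

# suppose only sentences 0 and 2 are still unfinished
active_idx = torch.tensor([0, 2])

view = dec_state.view(-1, n_sents, hidden_size)    # [layers * beam, sent, dim]
compacted = view.index_select(1, active_idx)       # [layers * beam, active, dim]
dec_state = compacted.view(1, beam_size * len(active_idx), hidden_size)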
def sample_beam(self, batch_loader, seq_len, seed, use_cuda, State, beam_size, n_best):
    # seed = Variable(t.from_numpy(seed).float())
    if use_cuda:
        seed = seed.cuda()

    decoder_word_input_np, decoder_character_input_np = batch_loader.go_input(1)
    decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long())
    decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long())

    if use_cuda:
        decoder_word_input, decoder_character_input = \
            decoder_word_input.cuda(), decoder_character_input.cuda()

    dec_states = State
    # dec_states = [
    #     Variable(dec_states[0].repeat(1, beam_size, 1)),
    #     Variable(dec_states[1].repeat(1, beam_size, 1))
    # ]
    dec_states = [
        dec_states[0].repeat(1, beam_size, 1),
        dec_states[1].repeat(1, beam_size, 1)
    ]

    drop_prob = 0.0
    beam_size = beam_size
    batch_size = 1

    beam = [
        Beam(beam_size, batch_loader, cuda=True)
        for k in range(batch_size)
    ]

    batch_idx = list(range(batch_size))
    remaining_sents = batch_size

    for i in range(seq_len):
        input = t.stack([
            b.get_current_state() for b in beam if not b.done
        ]).t().contiguous().view(1, -1)

        trg_emb = self.embedding_2.word_embed(Variable(input).transpose(1, 0))
        trg_h, dec_states = self.decoder.only_decoder_beam(trg_emb, seed, drop_prob, dec_states)
        # trg_h, (trg_h_t, trg_c_t) = self.model.decoder(
        #     trg_emb, (dec_states[0].squeeze(0), dec_states[1].squeeze(0)), context)
        # dec_states = (trg_h_t, trg_c_t)

        dec_out = trg_h.squeeze(1)
        out = F.softmax(self.decoder.fc(dec_out)).unsqueeze(0)

        word_lk = out.view(beam_size, remaining_sents, -1).transpose(0, 1).contiguous()

        active = []
        for b in range(batch_size):
            if beam[b].done:
                continue

            idx = batch_idx[b]
            if not beam[b].advance(word_lk.data[idx]):
                active += [b]

            for dec_state in dec_states:  # iterate over h, c
                # layers x beam*sent x dim
                sent_states = dec_state.view(-1, beam_size, remaining_sents,
                                             dec_state.size(2))[:, :, idx]
                sent_states.data.copy_(
                    sent_states.data.index_select(1, beam[b].get_current_origin()))

        if not active:
            break

        # in this section, the sentences that are still active are
        # compacted so that the decoder is not run on completed sentences
        active_idx = t.cuda.LongTensor([batch_idx[k] for k in active])
        batch_idx = {beam: idx for idx, beam in enumerate(active)}

        def update_active(t):
            # select only the remaining active sentences
            view = t.data.view(-1, remaining_sents, self.params.decoder_rnn_size)
            new_size = list(t.size())
            new_size[-2] = new_size[-2] * len(active_idx) // remaining_sents
            return Variable(view.index_select(1, active_idx).view(*new_size))

        dec_states = (update_active(dec_states[0]), update_active(dec_states[1]))
        dec_out = update_active(dec_out)
        # context = update_active(context)

        remaining_sents = len(active)

    # (4) package everything up
    allHyp, allScores = [], []
    for b in range(batch_size):
        scores, ks = beam[b].sort_best()
        allScores += [scores[:n_best]]
        hyps = zip(*[beam[b].get_hyp(k) for k in ks[:n_best]])
        allHyp += [hyps]

    return allHyp, allScores
def beam_sample(self, images, image_names, processor, max_seq_length, beam_size):
    """
    :param images:
    :param image_names:
    :param processor:
    :param max_seq_length:
    :param beam_size:
    :return:
    """
    predicted_sentences = dict()

    # get the batch size
    b_size = images.shape[0]

    # Encode
    img_emb = self.encoder(images)

    # compute sentence mixing coefficient
    pi0 = self.softmax_alpha_0(img_emb)

    # compute global topic embedding
    z0 = torch.matmul(pi0, self.desc_decoder.topic_embeddings)

    # prepare decoder initial hidden state
    if self.hinit_method == 'ZEROS':
        h0 = torch.zeros([self.lstm_layers, img_emb.shape[0] * beam_size, self.hidden_size],
                         device=self.device)
        c0 = torch.zeros([self.lstm_layers, img_emb.shape[0] * beam_size, self.hidden_size],
                         device=self.device)
    elif self.hinit_method == 'TOPICS':
        h_init = z0.unsqueeze(0)  # seq len, batch size, emb size
        h0 = self.h0_lin(h_init)
        h0 = h0.repeat(self.lstm_layers, beam_size, 1)
        c0 = self.c0_lin(h_init)
        c0 = c0.repeat(self.lstm_layers, beam_size, 1)
    else:
        h0, c0 = None, None
        exit('not a valid hinit_method. Use one of: \'ZEROS\', \'TOPICS\'.')

    hidden_state = (h0, c0)

    # create a variable for summing the past topic distributions
    pi_sum = torch.zeros_like(pi0, device=self.device)

    img_emb = img_emb.repeat(beam_size, 1)
    z0 = z0.repeat(beam_size, 1)

    # create the initial beam
    beam = [
        Beam(beam_size, processor, device=self.device)
        for _ in range(b_size)
    ]

    batch_idx = list(range(b_size))  # indicating index for every sample in the batch
    remaining_sents = b_size  # number of samples in batch

    # Decode
    pi_pasts = [torch.zeros_like(pi0, device=self.device)]
    count_pis = torch.zeros(img_emb.shape[0], device=self.device)

    for w_idx in range(max_seq_length):
        # compute z_past
        pi_pasts.append(pi_pasts[-1].clone())
        msk = count_pis != 0
        pi_pasts[-1][msk, :] = pi0[msk, :] / count_pis.unsqueeze(1)[msk, :] * pi_sum[msk, :]
        z_past = torch.matmul(pi_pasts[-1], self.desc_decoder.topic_embeddings)

        # concatenate the image, the topic embeddings and the last hidden state to get the feature vectors
        if self.switch_feature == 'IMAGE':
            switch_features = torch.cat([
                img_emb, z0,
                hidden_state[0].view(hidden_state[0].shape[1], -1)
            ], dim=-1)
        elif self.switch_feature == 'PAST_TOPICS':
            switch_features = torch.cat([
                z_past, z0,
                hidden_state[0].view(hidden_state[0].shape[1], -1)
            ], dim=-1)
        else:
            switch_features = None
            exit('not a valid switch_feature. Use one of: \'IMAGE\', \'PAST_TOPICS\'.')

        if self.desc_feature == 'BOTH':
            desc_features = torch.cat([
                img_emb, z0,
                hidden_state[0].view(hidden_state[0].shape[1], -1),
                z_past
            ], dim=-1)
        elif self.desc_feature == 'IMAGE_ONLY':
            desc_features = torch.cat([img_emb, z0, z_past], dim=-1)
        elif self.desc_feature == 'PAST_ONLY':
            desc_features = torch.cat([
                z0,
                hidden_state[0].view(hidden_state[0].shape[1], -1),
                z_past
            ], dim=-1)
        elif self.desc_feature == 'NEITHER':
            desc_features = torch.cat([z0, z_past], dim=-1)
        else:
            desc_features = None
            exit('not a valid desc_feature. Use one of: \'BOTH\', \'IMAGE_ONLY\', \'PAST_ONLY\', \'NEITHER\'.')

        input_ = torch.stack([
            b.get_current_state() for b in beam if not b.done
        ]).view(-1, 1)

        # the topic features are currently repeats of the batch stacked under each other; they
        # should be repeats of each sample for the remaining samples in the batch,
        # i.e. [[1],[2],[1],[2]] -> [[1],[1],[2],[2]]
        switch_features = torch.stack([
            switch_features[range(i, switch_features.shape[0], b_size)]
            for i in range(remaining_sents)
        ]).view(switch_features.shape[0], -1)
        desc_features = torch.stack([
            desc_features[range(i, desc_features.shape[0], b_size)]
            for i in range(remaining_sents)
        ]).view(desc_features.shape[0], -1)

        # compute the switch
        Bi = self.sigmoid_s(switch_features)

        # compute the next timesteps
        pred_lang_model, hidden_state = self.lang_decoder(input_, hidden_state)
        pred_lang_model = pred_lang_model.squeeze(1)
        (pred_desc_model, pii) = self.desc_decoder(desc_features, z0)

        # select which prediction to use based on the switch
        out = pred_desc_model
        mask = torch.round(Bi).type(torch.uint8).squeeze()
        out[mask, :] = pred_lang_model[mask, :]
        out = torch.softmax(out, dim=-1)

        # update pi pasts
        pi_sum[1 - mask, :] = pi_sum[1 - mask, :] + pii[1 - mask, :]
        count_pis[1 - mask] = count_pis[1 - mask] + 1

        # process lstm step in beam search
        word_lk = out.view(beam_size, remaining_sents, -1).transpose(0, 1).contiguous()

        active = []  # list of not finished samples
        for b in range(b_size):
            # if the current sample is done, skip it
            if beam[b].done:
                continue

            # get the original index of the sample
            idx = batch_idx[b]
            if not beam[b].advance(word_lk.data[idx]):  # returns true if complete
                active.append(b)

            for dec_state in hidden_state:  # iterate over h, c
                sent_states = dec_state.view(-1, beam_size, remaining_sents,
                                             dec_state.size(2))[:, :, idx]
                sent_states.data.copy_(
                    sent_states.data.index_select(1, beam[b].get_current_origin()))

        # test if the beam is finished
        if not active:
            break

        # in this section, the sentences that are still active are
        # compacted so that the decoder is not run on completed sentences
        active_idx = torch.LongTensor([batch_idx[k] for k in active]).to(self.device)
        batch_idx = {beam: idx for idx, beam in enumerate(active)}

        def update_active(t, hidden_size):
            # select only the remaining active sentences
            view = t.data.view(-1, remaining_sents, hidden_size)
            new_size = list(t.size())
            new_size[-2] = new_size[-2] * len(active_idx) // remaining_sents
            return Variable(view.index_select(1, active_idx).view(*new_size))

        hidden_state = (update_active(hidden_state[0], self.hidden_size),
                        update_active(hidden_state[1], self.hidden_size))
        img_emb = update_active(img_emb, self.embedding_size)
        z0 = update_active(z0, self.hidden_size)

        remaining_sents = len(active)

    # select the best hypothesis
    for b in range(b_size):
        score_, k = beam[b].get_best()
        hyp = beam[b].get_hyp(k)
        predicted_sentences[image_names[b]] = [
            processor.i2w[idx.item()] for idx in hyp
        ]

    return predicted_sentences
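# The switch Bi above is a sigmoid gate rounded to a hard 0/1 mask that chooses, per row, between
# the language-model prediction and the description (topic) prediction. A self-contained sketch of
# that selection with made-up shapes (this version uses a bool mask and ~mask in place of the
# uint8 arithmetic in the original):
import torch

batch, vocab = 4, 10

pred_lang_model = torch.randn(batch, vocab)  # hypothetical language-model scores
pred_desc_model = torch.randn(batch, vocab)  # hypothetical topic/description-model scores
Bi = torch.sigmoid(torch.randn(batch, 1))    # switch probability per row

# rows with Bi >= 0.5 take the language-model prediction, the rest keep the topic prediction
mask = torch.round(Bi).squeeze(1).bool()
out = pred_desc_model.clone()
out[mask, :] = pred_lang_model[mask, :]
out = torch.softmax(out, dim=-1)

# the complementary rows (~mask) are the ones that used the topic decoder this step
topic_rows = ~mask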
def beam_sample(self, images, image_names, processor, max_seq_length, beam_size):
    """
    :param images:
    :param image_names:
    :param processor:
    :param max_seq_length:
    :param beam_size:
    :return:
    """
    predicted_sentences = dict()

    # Encode
    img_emb = self.encoder(images)

    # prepare decoder initial hidden state
    img_emb = img_emb.unsqueeze(0)  # seq len, batch size, emb size
    h0 = self.h0_lin(img_emb)
    h0 = h0.repeat(self.lstm_layers, beam_size, 1)  # for each chain in the beam a copy of hidden
    c0 = self.c0_lin(img_emb)
    c0 = c0.repeat(self.lstm_layers, beam_size, 1)
    hidden_state = (h0, c0)

    b_size = images.shape[0]

    # create the initial beam
    beam = [
        Beam(beam_size, processor, device=self.device)
        for _ in range(b_size)
    ]

    batch_idx = list(range(b_size))  # indicating index for every sample in the batch
    remaining_sents = b_size  # number of samples in batch

    # Decode
    for w_idx in range(max_seq_length):
        input_ = torch.stack([
            b.get_current_state() for b in beam if not b.done
        ]).view(-1, 1)

        out, hidden_state = self.decoder(input_, hidden_state)
        out = torch.softmax(out, dim=2)

        # process lstm step in beam search
        word_lk = out.view(beam_size, remaining_sents, -1).transpose(0, 1).contiguous()

        active = []  # list of not finished samples
        for b in range(b_size):
            # if the current sample is done, skip it
            if beam[b].done:
                continue

            # get the original index of the sample
            idx = batch_idx[b]
            if not beam[b].advance(word_lk.data[idx]):  # returns true if complete
                active.append(b)

            for dec_state in hidden_state:  # iterate over h, c
                sent_states = dec_state.view(-1, beam_size, remaining_sents,
                                             dec_state.size(2))[:, :, idx]
                sent_states.data.copy_(
                    sent_states.data.index_select(1, beam[b].get_current_origin()))

        # test if the beam is finished
        if not active:
            break

        # in this section, the sentences that are still active are
        # compacted so that the decoder is not run on completed sentences
        active_idx = torch.LongTensor([batch_idx[k] for k in active]).to(self.device)
        batch_idx = {beam: idx for idx, beam in enumerate(active)}

        def update_active(t, hidden_size):
            # select only the remaining active sentences
            view = t.data.view(-1, remaining_sents, hidden_size)
            new_size = list(t.size())
            new_size[-2] = new_size[-2] * len(active_idx) // remaining_sents
            return Variable(view.index_select(1, active_idx).view(*new_size))

        hidden_state = (update_active(hidden_state[0], self.hidden_size),
                        update_active(hidden_state[1], self.hidden_size))

        remaining_sents = len(active)

    # select the best hypothesis
    for b in range(b_size):
        score_, k = beam[b].get_best()
        hyp = beam[b].get_hyp(k)
        predicted_sentences[image_names[b]] = [
            processor.i2w[idx.item()] for idx in hyp
        ]

    return predicted_sentences
def sample_beam(self, batch_loader, seq_len, seed, use_cuda, State, beam_size, n_best):
    # seed = Variable(t.from_numpy(seed).float())
    if use_cuda:
        seed = seed.cuda()

    decoder_word_input_np, decoder_character_input_np = batch_loader.go_input(1)
    decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long())
    decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long())

    if use_cuda:
        decoder_word_input, decoder_character_input = \
            decoder_word_input.cuda(), decoder_character_input.cuda()

    dec_states = State
    dec_states = [
        dec_states[0].repeat(1, beam_size, 1),
        dec_states[1].repeat(1, beam_size, 1)
    ]

    drop_prob = 0.0
    beam_size = beam_size
    batch_size = 1

    beam = [
        Beam(beam_size, batch_loader, cuda=True)
        for k in range(batch_size)
    ]

    batch_idx = list(range(batch_size))
    remaining_sents = batch_size

    for i in range(seq_len):
        input = t.stack([
            b.get_current_state() for b in beam if not b.done
        ]).t().contiguous().view(1, -1)

        trg_emb = self.embedding_2.word_embed(Variable(input).transpose(1, 0))
        # print trg_emb.size()
        # print seed.size()
        trg_h, dec_states = self.decoder.only_decoder_beam(trg_emb, seed, drop_prob, dec_states)

        dec_out = trg_h.squeeze(1)
        # print "dec_out:", dec_out.size()
        out = F.softmax(self.decoder.fc(dec_out)).unsqueeze(0)

        word_lk = out.view(beam_size, remaining_sents, -1).transpose(0, 1).contiguous()

        active = []
        for b in range(batch_size):
            if beam[b].done:
                continue

            idx = batch_idx[b]
            if not beam[b].advance(word_lk.data[idx]):
                active += [b]

            for dec_state in dec_states:  # iterate over h, c
                # layers x beam*sent x dim
                sent_states = dec_state.view(-1, beam_size, remaining_sents,
                                             dec_state.size(2))[:, :, idx]
                sent_states.data.copy_(
                    sent_states.data.index_select(1, beam[b].get_current_origin()))

        if not active:
            break

        active_idx = t.cuda.LongTensor([batch_idx[k] for k in active])
        batch_idx = {beam: idx for idx, beam in enumerate(active)}

        def update_active(t):
            view = t.data.view(-1, remaining_sents, self.params.decoder_rnn_size)
            new_size = list(t.size())
            new_size[-2] = new_size[-2] * len(active_idx) // remaining_sents
            return Variable(view.index_select(1, active_idx).view(*new_size))

        dec_states = (update_active(dec_states[0]), update_active(dec_states[1]))
        dec_out = update_active(dec_out)

        remaining_sents = len(active)

    allHyp, allScores = [], []
    for b in range(batch_size):
        scores, ks = beam[b].sort_best()
        allScores += [scores[:n_best]]
        hyps = zip(*[beam[b].get_hyp(k) for k in ks[:n_best]])
        allHyp += [hyps]

    return allHyp, allScores
def gensummary_elmo(template_vec, ee, vocab, LMModel, word_list, subvocab, clustermask=None,
                    mono=True, renorm=True, temperature=1, elmo_layer='avg',
                    max_step=20, beam_width=10, beam_width_start=10,
                    alpha=0.1, alpha_start=0.1,
                    begineos=True, stopbyLMeos=False, devid=0, **kwargs):
    """
    Unsupervised sentence summary generation using beam search, by contextual matching and a
    summary style language model. The contextual matching here is on top of pretrained ELMo embeddings.

    Input:
        - template_vec (torch.Tensor): forward only ELMo embeddings of the source sentence.
          'torch.Tensor' of size (3, seq_len, 512).
        - ee (elmo_sequential_embedder.ElmoEmbedderForward): 'elmo_sequential_embedder.ElmoEmbedderForward' object.
        - vocab (torchtext.vocab.Vocab): 'torchtext.vocab.Vocab' object. Should be the same as is used
          for the pretrained language model.
        - LMModel (user defined torch.nn.Module): a pretrained language model on the summary sentences.
        - word_list (list): a list of words in the vocabulary to work with. 'List'.
        - subvocab (torch.LongTensor): 'torch.LongTensor' consisting of the indices of the words
          corresponding to `word_list`.
        - clustermask (torch.ByteTensor): a binary mask for each of the sub-vocabulary words.
          'torch.ByteTensor' of size (len(sub-vocabulary), len(vocabulary)). Default: None.
        - mono (bool): whether to keep the monotonicity constraint. Default: True.
        - renorm (bool): whether to renormalize the probabilities over the sub-vocabulary. Default: True.
        - temperature (float): temperature applied to the softmax in the language model. Default: 1.
        - elmo_layer (str): which ELMo layer to use as the word type representation.
          Choose from ['avg', 'cat', 'bot', 'mid', 'top']. Default: 'avg'.
        - max_step (int): maximum number of beam steps.
        - beam_width (int): beam width.
        - beam_width_start (int): beam width of the first step.
        - alpha (float): the amount of language model part used for scoring.
          The score is: (1 - \alpha) * similarity_logscore + \alpha * LM_logscore.
        - alpha_start (float): the amount of language model part used for scoring, only for the first step.
        - begineos (bool): whether to begin with the special '<eos>' token as is trained in the language model.
          Note that ELMo has its own special beginning token. Default: True.
        - stopbyLMeos (bool): whether to stop a sentence solely by the language model predicting '<eos>'
          as the top possibility. Default: False.
        - devid (int): device id to run the algorithm and LSTM language models. 'int', default: 0. -1 for cpu.
        **kwargs: other arguments input to function <Beam.beamstep>. E.g.
            - normalized (bool): whether to normalize the dot product when calculating the similarity,
              which makes it cosine similarity. Default: True.
            - ifadditive (bool): whether to use an additive model on mixing the probability scores. Default: False.

    Output:
        - beam (beam_search.Beam): 'Beam' object, recording all the generated sequences.
    """
    device = 'cpu' if devid == -1 else f'cuda:{devid}'

    # Beam Search: initialization
    if begineos:
        beam = Beam(1, vocab, init_ids=[vocab.stoi['<eos>']], device=device,
                    sim_score=0, lm_score=0, lm_state=None, elmo_state=None, align_loc=None)
    else:
        beam = Beam(1, vocab, init_ids=[None], device=device,
                    sim_score=0, lm_score=0, lm_state=None, elmo_state=None, align_loc=None)

    # first step: start with 'beam_width_start' best matched words
    beam.beamstep(
        beam_width_start,
        beam.combscoreK,
        template_vec=template_vec,
        ee=ee,
        LMModel=LMModel,
        word_list=word_list,
        subvocab=subvocab,
        clustermask=clustermask,
        alpha=alpha_start,
        renorm=renorm,
        temperature=temperature,
        elmo_layer=elmo_layer,
        # normalized=True,
        # ifadditive=False,
        **kwargs)

    # run beam search, until all sentences hit <EOS> or max_step is reached
    for s in range(max_step):
        print(f'beam step {s + 1} ' + '-' * 50 + '\n')
        beam.beamstep(
            beam_width,
            beam.combscoreK,
            template_vec=template_vec,
            ee=ee,
            LMModel=LMModel,
            word_list=word_list,
            subvocab=subvocab,
            clustermask=clustermask,
            mono=mono,
            alpha=alpha,
            renorm=renorm,
            temperature=temperature,
            stopbyLMeos=stopbyLMeos,
            elmo_layer=elmo_layer,
            # normalized=True,
            # ifadditive=False,
            **kwargs)
        # all beams have reached termination
        if beam.endall:
            break

    return beam
def decode_batch(self, idx):
    """Decode a minibatch."""
    # Get source minibatch
    input_lines_src, output_lines_src, lens_src, mask_src = get_minibatch(
        self.src['data'], self.src_dict, idx,
        self.config['data']['batch_size'],
        self.config['data']['max_src_length'],
        add_start=True, add_end=True
    )
    '''
    lines = [
        ['<s>'] + line + ['</s>']
        for line in self.src['data'][idx:idx + self.config['data']['max_src_length']]
    ]
    lines = [line[:self.config['data']['max_src_length']] for line in lines]
    lens = [len(line) for line in lines]
    max_len = max(lens)
    word2ind = self.src_dict
    input_lines = [
        [word2ind[w] if w in word2ind else word2ind['<unk>'] for w in line[:-1]] +
        [word2ind['<pad>']] * (max_len - len(line))
        for line in lines
    ]
    '''
    # id2word_src = {v: k for k, v in self.src_dict.iteritems()}

    beam_size = self.beam_size

    # (1) run the encoder on the src
    context_h, (context_h_t, context_c_t) = self.get_hidden_representation(input_lines_src)
    context_h = context_h.transpose(0, 1)  # Make things sequence first.

    # (3) run the decoder to generate sentences, using beam search
    batch_size = context_h.size(1)

    # Expand tensors for each beam.
    context = Variable(context_h.data.repeat(1, beam_size, 1))
    dec_states = [
        Variable(context_h_t.data.repeat(1, beam_size, 1)),
        Variable(context_c_t.data.repeat(1, beam_size, 1))
    ]

    beam = [
        Beam(beam_size, self.tgt_dict, self.id2word_src, trg['id2word'], cuda=True)
        for k in range(batch_size)
    ]

    dec_out = self.get_init_state_decoder(dec_states[0].squeeze(0))
    dec_states[0] = dec_out

    batch_idx = list(range(batch_size))
    remaining_sents = batch_size

    for i in range(self.config['data']['max_trg_length']):
        input = torch.stack(
            [b.get_current_state() for b in beam if not b.done]
        ).t().contiguous().view(1, -1)

        trg_emb = self.model.trg_embedding(Variable(input).transpose(1, 0))
        trg_h, (trg_h_t, trg_c_t) = self.model.decoder(
            trg_emb,
            (dec_states[0].squeeze(0), dec_states[1].squeeze(0)),
            context
        )
        dec_states = (trg_h_t.unsqueeze(0), trg_c_t.unsqueeze(0))
        dec_out = trg_h_t.squeeze(1).view(-1, self.model.trg_hidden_dim)

        out = F.softmax(self.model.decoder2vocab(dec_out)).unsqueeze(0)

        word_lk = out.view(beam_size, remaining_sents, -1).transpose(0, 1).contiguous()

        active = []
        for b in range(batch_size):
            if beam[b].done:
                continue

            idx = batch_idx[b]
            if not beam[b].advance(word_lk.data[idx], input_lines_src[idx]):
                active += [b]

            for dec_state in dec_states:  # iterate over h, c
                # layers x beam*sent x dim
                state_size = dec_state.size(1) * dec_state.size(3) \
                    if self.model.nlayers_trg > 1 else dec_state.size(2)
                sent_states = dec_state.view(-1, beam_size, remaining_sents, state_size)[:, :, idx]
                sent_states.data.copy_(
                    sent_states.data.index_select(1, beam[b].get_current_origin())
                )

        if not active:
            break

        # in this section, the sentences that are still active are
        # compacted so that the decoder is not run on completed sentences
        active_idx = torch.cuda.LongTensor([batch_idx[k] for k in active])
        batch_idx = {beam: idx for idx, beam in enumerate(active)}

        def update_active(t):
            # select only the remaining active sentences
            view = t.data.view(-1, remaining_sents, self.model.decoder.hidden_size)
            new_size = list(t.size())
            new_size[-2] = new_size[-2] * len(active_idx) // remaining_sents
            return Variable(view.index_select(1, active_idx).view(*new_size))

        dec_states = (
            update_active(dec_states[0]),
            update_active(dec_states[1])
        )
        dec_out = update_active(dec_out)
        context = update_active(context)

        remaining_sents = len(active)

    # (4) package everything up
    allHyp, allScores = [], []
    n_best = 1

    for b in range(batch_size):
        scores, ks = beam[b].sort_best()
        allScores += [scores[:n_best]]
        hyps = zip(*[beam[b].get_hyp(k) for k in ks[:n_best]])
        allHyp += [hyps]

    return allHyp, allScores
def generate(self, init_hidden, encoder_outputs, max_gen_length, beam_size=1):
    # The hidden state in RNNs in PyTorch is always (seq_length, batch_size, emb_size),
    # even if you use batch_first. Note that during generation, the batch size should always be 1.
    if self.bidirectional_enc:
        self.hidden = Variable(torch.zeros(self.num_layers, 1, self.hidden_size))  # init to correct size
        if use_cuda:
            self.hidden = self.hidden.cuda()
        for x in range(self.num_layers):
            # concatenate the appropriate bidirectional hidden states
            self.hidden[x] = torch.cat((init_hidden[2 * x], init_hidden[1 + 2 * x]), 1)
    else:
        self.hidden = init_hidden

    # Setup inputs and contexts
    # decoder_input = Variable(torch.LongTensor(init_hidden.shape[1] * [[SOS]]))
    # decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    decoder_contexts = Variable(torch.zeros(1, 1, self.hidden_size))
    decoder_contexts = decoder_contexts.cuda() if use_cuda else decoder_contexts

    attn_scores = self.attn.calc_attn_scores(encoder_outputs)

    # Accumulate the output scores and words generated by the model
    source_len = encoder_outputs.shape[1]
    beam = Beam(beam_size, source_len)
    beam.add_initial_path(decoder_contexts, self.hidden)

    for i in range(max_gen_length):
        # Get paths up front since the dict changes size while iterating
        all_paths = [p for p in beam]
        for path in all_paths:
            decoder_input, decoder_contexts, hidden = beam.get_decoder_params(path)
            decoder_outputs, decoder_contexts, attn_weights, hidden = self.__forward_one_word(
                decoder_input, decoder_contexts, encoder_outputs, attn_scores, hidden)
            # Add the potential next steps for the current beam path
            beam.add_paths(path, decoder_outputs, decoder_contexts, hidden, attn=attn_weights)
        beam.prune()

        # Break if all beam paths have ended
        if beam.is_ended(i):
            break

    outputs, words, attn_weights_matrix = beam.get_best_path_results()
    return outputs, words, attn_weights_matrix
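# This variant keeps a collection of whole paths and prunes after expanding every path. A minimal,
# self-contained sketch of that expand-then-prune loop over (score, tokens) pairs; the fake
# decoder step, token ids, and beam size below are illustrative only (the real Beam also carries
# decoder contexts, hidden states, and attention weights).
import math

beam_size = 3
vocab = {0: "<eos>", 1: "a", 2: "b", 3: "c"}

# each path is (accumulated log-probability, list of token ids)
paths = [(0.0, [])]

def fake_step_logprobs(tokens):
    # stand-in for one decoder step; returns a log-probability per vocabulary entry
    return {i: math.log(1.0 / len(vocab)) for i in vocab}

for step in range(5):
    candidates = []
    for score, tokens in paths:
        if tokens and tokens[-1] == 0:  # path already ended at <eos>
            candidates.append((score, tokens))
            continue
        for tok, lp in fake_step_logprobs(tokens).items():
            candidates.append((score + lp, tokens + [tok]))
    # prune: keep only the beam_size highest-scoring paths
    paths = sorted(candidates, key=lambda p: p[0], reverse=True)[:beam_size]
    if all(t and t[-1] == 0 for _, t in paths):
        break

best_score, best_tokens = paths[0]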