def step(self, word_scores, length, beam, beam_scores, beam_lengths, extra_delimiter):
    assert len(word_scores.size()) == 3, word_scores.size()
    batch_size, beam_size, vocab_size = word_scores.size()
    assert beam_size == self.beam_size, word_scores.size()
    assert len(beam.size()) == 3, beam.size()
    assert beam.size()[:2] == (batch_size, beam_size), \
        '%s != (%s, %s, *)' % (beam.size(), batch_size, beam_size)
    assert beam_scores.size() == (batch_size, beam_size), \
        '%s != %s' % (beam_scores.size(), (batch_size, beam_size))
    assert beam_lengths.size() == (batch_size, beam_size), \
        '%s != %s' % (beam_lengths.size(), (batch_size, beam_size))

    # Compute updated scores
    done_mask = (beam_lengths == length - 1).type_as(word_scores)[:, :, None]
    new_scores = (word_scores * done_mask + beam_scores[:, :, np.newaxis]).view(
        batch_size, beam_size * vocab_size)
    # Get top k scores and their indices
    new_beam_scores, topk_indices = new_scores.topk(beam_size, dim=1)
    # Transform into previous beam indices and new token indices
    rows, new_indices = unravel_index(topk_indices, (beam_size, vocab_size))
    assert rows.size() == (batch_size, beam_size), \
        '%s != %s' % (rows.size(), (batch_size, beam_size))
    assert new_indices.size() == (batch_size, beam_size), \
        '%s != %s' % (new_indices.size(), (batch_size, beam_size))

    # Extract best pre-existing rows
    beam = beam[lrange(batch_size)[:, None], rows.data, :]
    assert beam.size()[:2] == (batch_size, beam_size), \
        (beam.size(), (batch_size, beam_size))
    # Get previous done status and update it with
    # which rows have newly reached </s>
    new_beam_lengths = beam_lengths[lrange(batch_size)[:, None], rows.data].clone()
    # Pad already-finished sequences with </s>
    pad_delimiter = extra_delimiter if extra_delimiter is not None else self.delimiters[1]
    new_indices[(new_beam_lengths != length - 1)] = pad_delimiter
    # Add one to the beam lengths that are not done
    continue_mask = (new_indices != self.delimiters[1]) * (new_beam_lengths == length - 1)
    if extra_delimiter is not None:
        continue_mask = continue_mask * (new_indices != extra_delimiter)
    new_beam_lengths += continue_mask.type_as(beam_lengths)
    # Append new token indices
    new_beam = th.cat([beam, new_indices[:, :, None]], dim=2)

    return new_beam, new_beam_lengths, new_beam_scores
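# `step` leans on two module-level helpers that are not shown in this
# listing. The sketches below are inferred from the call sites, not the
# actual definitions: `lrange` appears to build an index vector for
# advanced indexing, and `unravel_index` maps flat top-k indices over the
# (beam_size * vocab_size) grid back to (beam row, token id) pairs.
import torch as th

def lrange(n):
    # 0..n-1 as a LongTensor; the real module presumably wraps this in
    # cu(...) to move it onto the GPU.
    return th.arange(n, dtype=th.long)

def unravel_index(indices, shape):
    # Elementwise divmod: flat index -> (row, column) over a 2-D grid.
    rows, cols = shape
    return indices // cols, indices % cols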
def forward(self, outputs, src_lengths):
    a = self.activations

    assert outputs.dim() == 3, outputs.size()
    assert outputs.size()[2] == self.repr_size, (outputs.size(), self.repr_size)
    batch_size, max_len, repr_size = outputs.size()

    a.attn_h1 = th.nn.Tanh()(self.hidden1(outputs))
    a.attn_h2 = self.hidden2(outputs)
    assert a.attn_h2.size() == (batch_size, max_len, repr_size), \
        (a.attn_h2.size(), (batch_size, max_len, repr_size))

    init_var = th.autograd.Variable(cu(th.FloatTensor([1.0])))
    a.target = self.target(init_var)
    assert a.target.size() == (repr_size,), (a.target.size(), repr_size)

    a.attn_scores = th.matmul(a.attn_h2, a.target)
    assert a.attn_scores.size() == (batch_size, max_len), \
        (a.attn_scores.size(), (batch_size, max_len))

    # Mask positions past each source length: log(0) = -inf drives their
    # softmax weight to exactly zero
    attn_mask = th.autograd.Variable(cu(
        th.log((lrange(max_len)[None, :] < src_lengths.data[:, None]).float())))
    a.attn_weights = th.exp(th.nn.LogSoftmax(dim=1)(a.attn_scores + attn_mask))
    assert a.attn_weights.size() == (batch_size, max_len), \
        (a.attn_weights.size(), (batch_size, max_len))

    a.attn_out = th.matmul(a.attn_weights[:, None, :], outputs)[:, 0, :]
    assert a.attn_out.size() == (batch_size, repr_size), \
        (a.attn_out.size(), (batch_size, repr_size))

    self.dump_weights(a.attn_weights.data)

    result = a.attn_out, a.attn_weights

    if not self.monitor_activations:
        # Free up memory
        a.__dict__.clear()

    return result
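# The masking idiom above deserves a note: adding th.log(mask) to the
# scores sends padded positions to -inf, so the softmax assigns them
# exactly zero weight. A standalone illustration (modern PyTorch API
# rather than the Variable-era calls above):
import torch as th

scores = th.tensor([[1.0, 2.0, 3.0, 4.0]])
lengths = th.tensor([2])  # only the first two positions are real

mask = (th.arange(4)[None, :] < lengths[:, None]).float()
weights = th.softmax(scores + th.log(mask), dim=1)
# weights: tensor([[0.2689, 0.7311, 0.0000, 0.0000]])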
def forward(self, enc_state, extra_inputs=None, extra_delimiter=None):
    if not isinstance(enc_state, tuple):
        enc_state = (enc_state,)
    assert len(enc_state[0].size()) == 3, enc_state[0].size()
    num_layers, batch_size, h_size = enc_state[0].size()

    state_sizes = []
    state = []
    for enc_c in enc_state:
        assert len(enc_c.size()) == 3, enc_c.size()
        assert enc_c.size()[:2] == (num_layers, batch_size), enc_c.size()
        c_size = enc_c.size()[2]
        state_sizes.append(c_size)
        # Tile each encoder state component across the beam dimension
        state.append(enc_c[:, :, None, :].expand(num_layers, batch_size,
                                                 self.beam_size, c_size))

    if extra_inputs is None:
        extra_inputs = []
    else:
        extra_inputs = [
            inp[:, None, ...].expand(
                (inp.size()[0], self.beam_size) + tuple(inp.size()[1:])
            ).contiguous().view(
                (inp.size()[0] * self.beam_size, 1) + tuple(inp.size()[1:])
            )
            for inp in extra_inputs
        ]

    def ravel(x):
        return x.contiguous().view(*tuple(x.size()[:-2]) +
                                   (batch_size, self.beam_size, x.size()[-1]))

    def unravel(x):
        return x.contiguous().view(*tuple(x.size()[:-3]) +
                                   (batch_size * self.beam_size, x.size()[-1]))

    beam = th.autograd.Variable(cu(
        th.LongTensor(batch_size, self.beam_size, 1).fill_(self.delimiters[0])))
    beam_scores = th.autograd.Variable(cu(th.zeros(batch_size, self.beam_size)))
    beam_lengths = th.autograd.Variable(cu(
        th.LongTensor(batch_size, self.beam_size).zero_()))

    outputs = []
    states = []
    for length in itertools.count(1):
        last_tokens = beam[:, :, -1:]
        assert last_tokens.size() == (batch_size, self.beam_size, 1), \
            last_tokens.size()
        word_scores, (dec_out, state) = self.decode_fn(
            unravel(last_tokens), tuple(unravel(c) for c in state),
            extra_inputs=extra_inputs)
        word_scores = ravel(word_scores[:, 0, :])
        state = tuple(ravel(c) for c in state)
        states.append(state)
        outputs.append(dec_out)
        assert word_scores.size()[:2] == (batch_size, self.beam_size), \
            word_scores.size()
        beam, beam_lengths, beam_scores = self.step(
            word_scores, length, beam, beam_scores, beam_lengths,
            extra_delimiter=extra_delimiter)
        # Stop when every beam entry has finished or the length cap is hit
        if (beam_lengths.data != length).prod() or \
                (self.max_len is not None and length == self.max_len):
            break

    all_states_collated = [th.stack(s, dim=3) for s in zip(*states)]
    final_indices = th.clamp(beam_lengths.data, max=self.max_len - 1)
    final_states = [
        s[:, lrange(batch_size)[:, None], lrange(self.beam_size)[None, :],
          final_indices, :]
        for s in all_states_collated
    ]
    all_outputs = th.stack(outputs, dim=1)
    return (beam, th.clamp(beam_lengths, max=self.max_len), beam_scores,
            (all_outputs, final_states))
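# Note that ravel/unravel are named opposite to the numpy convention:
# `unravel` folds the beam dimension into the batch dimension so that
# decode_fn can treat each beam entry as an independent batch element,
# and `ravel` splits it back out. A toy round trip (hypothetical sizes):
import torch as th

batch_size, beam_size, hidden = 2, 3, 5
x = th.randn(batch_size, beam_size, hidden)

flat = x.contiguous().view(batch_size * beam_size, hidden)  # what decode_fn sees
back = flat.view(batch_size, beam_size, hidden)             # undo the fold
assert th.equal(x, back)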
def forward(self, enc_state, tgt_indices, tgt_lengths,
            extra_inputs=None, extra_delimiter=None,
            output_beam=None, output_sample=None):
    if output_beam is None:
        output_beam = not self.training
    if output_sample is None:
        output_sample = not self.training

    a = self.activations

    if output_beam:
        (beam, beam_lengths, beam_scores, beam_outputs) = self.beam_predictor(
            enc_state, extra_inputs=extra_inputs,
            extra_delimiter=extra_delimiter)
    if output_sample:
        (sample, sample_lengths, sample_scores, sample_outputs) = self.sampler(
            enc_state, extra_inputs=extra_inputs,
            extra_delimiter=extra_delimiter)

    if extra_inputs is None:
        extra_inputs = []
    else:
        # Tile the extra inputs across the target sequence length
        extra_inputs = [
            inp[:, None, ...].expand((inp.size()[0], tgt_indices.size()[1] - 1) +
                                     tuple(inp.size()[1:]))
            for inp in extra_inputs
        ]

    a.log_softmax, (dec_out, dec_state) = self.decode(
        tgt_indices[:, :-1], enc_state,
        extra_inputs=extra_inputs, monitor=True)

    # Per-token log-probability of each gold token, masked to the true
    # target lengths and summed into a sequence log-probability
    a.log_prob_token = index_sequence(a.log_softmax, tgt_indices.data[:, 1:])
    a.mask = (lrange(a.log_prob_token.size()[1])[None, :] <
              tgt_lengths.data[:, None]).float()
    a.log_prob_masked = a.log_prob_token * th.autograd.Variable(a.mask)
    a.log_prob_seq = a.log_prob_masked.sum(1)

    predict = {}
    score = {'target': a.log_prob_seq}
    output = {'target': (dec_out, dec_state)}
    if output_beam:
        predict['beam'] = (beam[:, 0, 1:], beam_lengths[:, 0])
        score['beam'] = beam_scores[:, 0]
        output['beam'] = beam_outputs
    if output_sample:
        predict['sample'] = (sample[:, 0, 1:], sample_lengths[:, 0])
        score['sample'] = sample_scores[:, 0]
        output['sample'] = sample_outputs

    if not self.monitor_activations:
        # Free up memory
        a.__dict__.clear()

    return predict, score, output
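# `index_sequence` is another helper defined outside this listing. From
# its call site it must gather the decoder's log-probability of each
# gold token; a plausible one-line implementation in modern PyTorch:
import torch as th

def index_sequence(scores, indices):
    # scores: (batch, seq, vocab); indices: (batch, seq)
    # result[b, t] == scores[b, t, indices[b, t]]
    return scores.gather(2, indices.unsqueeze(2)).squeeze(2)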
def sel_indices_to_selection(self, feasible_sels, sel_indices):
    return feasible_sels[lrange(feasible_sels.size()[0]), sel_indices.data, :]
def selection(self, sel_indices, feasible_sels, num_feasible_sels):
    # "GRU_o": encode dialogue for selection
    a = self.activations

    assert sel_indices.dim() == 1, sel_indices.size()
    batch_size = sel_indices.size()[0]

    a.combined_repr = self.combined_layer(
        th.cat([a.context_repr, a.dialogue_repr], dim=1))
    assert a.combined_repr.dim() == 2, a.combined_repr.size()
    assert a.combined_repr.size()[0] == batch_size, \
        (a.combined_repr.size(), batch_size)

    a.all_item_scores = log_softmax(self.selection_layer(a.combined_repr))
    assert a.all_item_scores.size() == (batch_size, self.selection_layer.out_features), \
        (a.all_item_scores.size(), (batch_size, self.selection_layer.out_features))

    a.feasible_item_scores = a.all_item_scores[
        lrange(a.all_item_scores.size()[0])[:, None, None],
        feasible_sels.data
    ]
    assert a.feasible_item_scores.size() == (batch_size, MAX_FEASIBLE + 3, NUM_ITEMS), \
        (a.feasible_item_scores.size(), batch_size)

    # Mask out infeasible selections: log(0) = -inf removes them from the softmax
    num_feasible_mask = th.autograd.Variable(cu(
        (lrange(a.feasible_item_scores.size()[1])[None, :, None] <=
         num_feasible_sels.data[:, None, None]).float()
    ))
    a.feasible_masked = a.feasible_item_scores + th.log(num_feasible_mask)
    a.full_selection_scores = log_softmax(a.feasible_masked.sum(dim=2), dim=1)
    assert a.full_selection_scores.size() == (batch_size, MAX_FEASIBLE + 3), \
        (a.full_selection_scores.size(), batch_size)

    a.selection_beam_score, selection_beam = a.full_selection_scores.max(dim=1)
    assert selection_beam.size() == (batch_size,), (selection_beam.size(), batch_size)
    selection_sample = th.multinomial(th.exp(a.full_selection_scores), 1,
                                      replacement=True)[:, 0]
    # Score the sampled selection in log space, consistent with the
    # 'target' and 'beam' scores below
    a.selection_sample_score = a.full_selection_scores[
        lrange(a.full_selection_scores.size()[0]),
        selection_sample.data
    ]
    assert selection_sample.size() == (batch_size,), \
        (selection_sample.size(), batch_size)

    selection_predict = {
        'beam': self.sel_indices_to_selection(feasible_sels, selection_beam),
        'sample': self.sel_indices_to_selection(feasible_sels, selection_sample),
    }
    assert selection_predict['beam'].size() == (batch_size, NUM_ITEMS), \
        (selection_predict['beam'].size(), batch_size)
    assert selection_predict['sample'].size() == (batch_size, NUM_ITEMS), \
        (selection_predict['sample'].size(), batch_size)

    a.selection_target_score = a.full_selection_scores[
        lrange(a.full_selection_scores.size()[0]),
        sel_indices.data
    ]
    assert a.selection_target_score.size() == (batch_size,), \
        (a.selection_target_score.size(), batch_size)

    selection_score = {
        'target': a.selection_target_score,
        'beam': a.selection_beam_score,
        'sample': a.selection_sample_score,
    }
    return selection_predict, selection_score
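# The gather that builds feasible_item_scores uses broadcast advanced
# indexing: a (batch, 1, 1) row index against the (batch, MAX_FEASIBLE + 3,
# NUM_ITEMS) tensor feasible_sels, so every batch row picks out scores for
# its own candidate selections. A shape-level sketch with made-up sizes:
import torch as th

batch, n_items, n_feasible, items_per_sel = 2, 10, 4, 3
all_item_scores = th.randn(batch, n_items)
feasible_sels = th.randint(0, n_items, (batch, n_feasible, items_per_sel))

rows = th.arange(batch)[:, None, None]  # (batch, 1, 1), broadcasts
feasible_item_scores = all_item_scores[rows, feasible_sels]
assert feasible_item_scores.shape == (batch, n_feasible, items_per_sel)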