Example #1
    def step(self, word_scores, length, beam, beam_scores, beam_lengths,
             extra_delimiter):
        assert len(word_scores.size()) == 3, word_scores.size()
        batch_size, beam_size, vocab_size = word_scores.size()
        assert beam_size == self.beam_size, word_scores.size()
        assert len(beam.size()) == 3, beam.size()
        assert beam.size()[:2] == (batch_size, beam_size), \
            '%s != (%s, %s, *)' % (beam.size(), batch_size, beam_size)
        assert beam_scores.size() == (batch_size, beam_size), \
            '%s != %s' % (beam_scores.size(), (batch_size, beam_size))
        assert beam_lengths.size() == (batch_size, beam_size), \
            '%s != %s' % (beam_lengths.size(), (batch_size, beam_size))

        # Compute updated scores: beams whose length keeps pace with the
        # current step are still active; only they receive new word scores
        active_mask = (beam_lengths == length - 1).type_as(word_scores)[:, :, None]
        new_scores = (word_scores * active_mask +
                      beam_scores[:, :, None]).view(batch_size,
                                                    beam_size * vocab_size)
        # Get top k scores and their indices
        new_beam_scores, topk_indices = new_scores.topk(beam_size, dim=1)
        # Transform into previous beam indices and new token indices
        rows, new_indices = unravel_index(topk_indices,
                                          (beam_size, vocab_size))
        assert rows.size() == (batch_size, beam_size), \
            '%s != %s' % (rows.size(), (batch_size, beam_size))
        assert new_indices.size() == (batch_size, beam_size), \
            '%s != %s' % (new_indices.size(), (batch_size, beam_size))

        # Extract best pre-existing rows
        beam = beam[lrange(batch_size)[:, None], rows.data, :]
        assert beam.size()[:2] == (batch_size, beam_size), \
            (beam.size(), (batch_size, beam_size))
        # Carry over the lengths of the selected rows
        new_beam_lengths = beam_lengths[lrange(batch_size)[:, None],
                                        rows.data].clone()
        # Pad already-finished sequences with </s>
        pad_delimiter = (extra_delimiter if extra_delimiter is not None
                         else self.delimiters[1])
        new_indices[(new_beam_lengths != length - 1)] = pad_delimiter
        # Increment the lengths of beams that are still going: active and
        # not emitting an end delimiter
        continue_mask = ((new_indices != self.delimiters[1]) *
                         (new_beam_lengths == length - 1))
        if extra_delimiter is not None:
            continue_mask = continue_mask * (new_indices != extra_delimiter)
        new_beam_lengths += continue_mask.type_as(beam_lengths)
        # Append new token indices
        new_beam = th.cat([beam, new_indices[:, :, None]], dim=2)

        return new_beam, new_beam_lengths, new_beam_scores
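
This `step` method leans on two helpers defined elsewhere in the module, `lrange` and `unravel_index`. Minimal sketches of plausible implementations, assuming `lrange` is simply a LongTensor version of `range` and `unravel_index` handles the 2-D case used here (the real helpers may differ):

import torch as th

def lrange(n):
    # LongTensor analogue of range(n), used for advanced indexing
    return th.arange(n, dtype=th.long)

def unravel_index(indices, shape):
    # Convert flat indices over a (rows, cols) grid into separate row and
    # column index tensors, like numpy.unravel_index for the 2-D case
    rows = indices // shape[1]
    cols = indices % shape[1]
    return rows, cols

With shape = (beam_size, vocab_size), each top-k index over the flattened scores splits into the beam row it came from and the token it selects.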
Example #2
    def forward(self, outputs, src_lengths):
        a = self.activations

        assert outputs.dim() == 3, outputs.size()
        assert outputs.size()[2] == self.repr_size, (outputs.size(),
                                                     self.repr_size)
        batch_size, max_len, repr_size = outputs.size()

        a.attn_h1 = th.nn.Tanh()(self.hidden1(outputs))
        a.attn_h2 = self.hidden2(outputs)
        assert a.attn_h2.size() == (batch_size, max_len, repr_size), \
            (a.attn_h2.size(), (batch_size, max_len, repr_size))
        init_var = th.autograd.Variable(cu(th.FloatTensor([1.0])))
        a.target = self.target(init_var)
        assert a.target.size() == (repr_size, ), (a.target.size(), repr_size)
        a.attn_scores = th.matmul(a.attn_h2, a.target)
        assert a.attn_scores.size() == (batch_size, max_len), \
            (a.attn_scores.size(), (batch_size, max_len))
        # log(1) = 0 keeps valid positions; log(0) = -inf removes padding
        # from the softmax
        attn_mask = th.autograd.Variable(cu(th.log(
            (lrange(max_len)[None, :] < src_lengths.data[:, None]).float())))
        a.attn_weights = th.exp(
            th.nn.LogSoftmax(dim=1)(a.attn_scores + attn_mask))
        assert a.attn_weights.size() == (batch_size, max_len), \
            (a.attn_weights.size(), (batch_size, max_len))
        # Attention-weighted sum of the encoder outputs
        a.attn_out = th.matmul(a.attn_weights[:, None, :], outputs)[:, 0, :]
        assert a.attn_out.size() == (batch_size, repr_size), \
            (a.attn_out.size(), (batch_size, repr_size))

        self.dump_weights(a.attn_weights.data)

        result = a.attn_out, a.attn_weights

        if not self.monitor_activations:
            # Free up memory
            a.__dict__.clear()

        return result
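
The attention mask above relies on a log-domain trick: log(1) = 0 leaves valid positions unchanged, while log(0) = -inf drives padded positions to zero weight after the softmax. A standalone illustration of the same pattern:

import torch as th

scores = th.tensor([[2.0, 1.0, 0.5]])  # one sequence, max_len = 3
lengths = th.tensor([2])               # only the first 2 positions are valid
mask = th.log((th.arange(3)[None, :] < lengths[:, None]).float())
weights = th.exp(th.nn.LogSoftmax(dim=1)(scores + mask))
# weights[0, 2] == 0; weights[0, :2] sum to 1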
Example #3
    def forward(self, enc_state, extra_inputs=None, extra_delimiter=None):
        if not isinstance(enc_state, tuple):
            enc_state = (enc_state, )
        assert len(enc_state[0].size()) == 3, enc_state[0].size()
        num_layers, batch_size, h_size = enc_state[0].size()
        state_sizes = []
        state = []
        for enc_c in enc_state:
            assert len(enc_c.size()) == 3, enc_c.size()
            assert enc_c.size()[:2] == (num_layers, batch_size), enc_c.size()
            c_size = enc_c.size()[2]
            state_sizes.append(c_size)
            state.append(enc_c[:, :, None, :].expand(num_layers, batch_size,
                                                     self.beam_size, c_size))
        if extra_inputs is None:
            extra_inputs = []
        else:
            # Duplicate each extra input beam_size times along a new beam
            # dimension, then fold that dimension into the batch
            extra_inputs = [
                inp[:, None, ...]
                .expand((inp.size()[0], self.beam_size) + tuple(inp.size()[1:]))
                .contiguous()
                .view((inp.size()[0] * self.beam_size, 1) + tuple(inp.size()[1:]))
                for inp in extra_inputs
            ]

        def ravel(x):
            # Split the flat batch*beam dimension back into separate
            # batch and beam dimensions
            return x.contiguous().view(
                *tuple(x.size()[:-2]) +
                (batch_size, self.beam_size, x.size()[-1]))

        def unravel(x):
            # Fold the batch and beam dimensions into a single
            # batch*beam dimension for the decoder
            return x.contiguous().view(
                *tuple(x.size()[:-3]) +
                (batch_size * self.beam_size, x.size()[-1]))

        beam = th.autograd.Variable(
            cu(
                th.LongTensor(batch_size, self.beam_size,
                              1).fill_(self.delimiters[0])))
        beam_scores = th.autograd.Variable(
            cu(th.zeros(batch_size, self.beam_size)))
        beam_lengths = th.autograd.Variable(
            cu(th.LongTensor(batch_size, self.beam_size).zero_()))
        outputs = []
        states = []

        for length in itertools.count(1):
            last_tokens = beam[:, :, -1:]
            assert last_tokens.size() == (batch_size, self.beam_size,
                                          1), last_tokens.size()
            word_scores, (dec_out, state) = self.decode_fn(
                unravel(last_tokens),
                tuple(unravel(c) for c in state),
                extra_inputs=extra_inputs)
            word_scores = ravel(word_scores[:, 0, :])
            state = tuple(ravel(c) for c in state)
            states.append(state)
            outputs.append(dec_out)
            assert word_scores.size()[:2] == (
                batch_size, self.beam_size), word_scores.size()
            beam, beam_lengths, beam_scores = self.step(
                word_scores,
                length,
                beam,
                beam_scores,
                beam_lengths,
                extra_delimiter=extra_delimiter)
            # Stop when every beam has terminated (no beam's length keeps
            # pace with the current step) or the length cap is reached
            if (beam_lengths.data != length).prod() or \
                    (self.max_len is not None and length == self.max_len):
                break

        # Gather the decoder state at each beam's final step (assumes
        # self.max_len is set by the time the loop exits)
        all_states_collated = [th.stack(s, dim=3) for s in zip(*states)]
        final_indices = th.clamp(beam_lengths.data, max=self.max_len - 1)
        final_states = [
            s[:,
              lrange(batch_size)[:, None],
              lrange(self.beam_size)[None, :], final_indices, :]
            for s in all_states_collated
        ]
        all_outputs = th.stack(outputs, dim=1)
        return (beam, th.clamp(beam_lengths, max=self.max_len), beam_scores,
                (all_outputs, final_states))
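
The `ravel`/`unravel` pair exists so the decoder, which expects a flat batch, can be reused unchanged during beam search: the beam dimension is folded into the batch before each decode call and split back out afterwards. A minimal standalone illustration of the round trip:

import torch as th

batch_size, beam_size, vocab_size = 2, 3, 5
nested = th.randn(batch_size, beam_size, vocab_size)    # beam logic's layout
flat = nested.view(batch_size * beam_size, vocab_size)  # decoder's layout
assert th.equal(flat.view(batch_size, beam_size, vocab_size), nested)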
Example #4
    def forward(self,
                enc_state,
                tgt_indices,
                tgt_lengths,
                extra_inputs=None,
                extra_delimiter=None,
                output_beam=None,
                output_sample=None):
        if output_beam is None:
            output_beam = not self.training
        if output_sample is None:
            output_sample = not self.training

        a = self.activations

        if output_beam:
            (beam, beam_lengths, beam_scores,
             beam_outputs) = self.beam_predictor(
                 enc_state,
                 extra_inputs=extra_inputs,
                 extra_delimiter=extra_delimiter)
        if output_sample:
            (sample, sample_lengths, sample_scores,
             sample_outputs) = self.sampler(enc_state,
                                            extra_inputs=extra_inputs,
                                            extra_delimiter=extra_delimiter)

        if extra_inputs is None:
            extra_inputs = []
        else:
            # Duplicate each extra input across the target time dimension
            extra_inputs = [
                inp[:, None, ...]
                .expand((inp.size()[0], tgt_indices.size()[1] - 1) +
                        tuple(inp.size()[1:]))
                for inp in extra_inputs
            ]
        a.log_softmax, (dec_out, dec_state) = self.decode(
            tgt_indices[:, :-1],
            enc_state,
            extra_inputs=extra_inputs,
            monitor=True)

        # Score of each gold token under the predicted distribution
        a.log_prob_token = index_sequence(a.log_softmax,
                                          tgt_indices.data[:, 1:])
        # Zero out positions past each sequence's length, then sum over time
        a.mask = (lrange(a.log_prob_token.size()[1])[None, :] <
                  tgt_lengths.data[:, None]).float()
        a.log_prob_masked = a.log_prob_token * th.autograd.Variable(a.mask)
        a.log_prob_seq = a.log_prob_masked.sum(1)

        predict = {}
        score = {'target': a.log_prob_seq}
        output = {'target': (dec_out, dec_state)}
        if output_beam:
            predict['beam'] = (beam[:, 0, 1:], beam_lengths[:, 0])
            score['beam'] = beam_scores[:, 0]
            output['beam'] = beam_outputs
        if output_sample:
            predict['sample'] = (sample[:, 0, 1:], sample_lengths[:, 0])
            score['sample'] = sample_scores[:, 0]
            output['sample'] = sample_outputs

        if not self.monitor_activations:
            # Free up memory
            a.__dict__.clear()

        return predict, score, output
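
`index_sequence` is another module-level helper; from its use here, it gathers the log probability assigned to each gold token from the (batch, time, vocab) log-softmax output. A plausible sketch (hypothetical; the real helper may be written differently):

import torch as th

def index_sequence(log_probs, indices):
    # Pick log_probs[b, t, indices[b, t]] for every batch element b and
    # time step t: the score assigned to each target token
    batch_size, seq_len, _ = log_probs.size()
    return log_probs[th.arange(batch_size)[:, None],
                     th.arange(seq_len)[None, :],
                     indices]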
Example #5
    def sel_indices_to_selection(self, feasible_sels, sel_indices):
        # Pick, for each batch element, the candidate selection row
        # indexed by sel_indices
        return feasible_sels[lrange(feasible_sels.size()[0]),
                             sel_indices.data, :]
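
The advanced indexing above picks one candidate selection row per batch element. A small standalone demonstration of the same pattern, with illustrative shapes:

import torch as th

feasible_sels = th.arange(30).view(2, 5, 3)  # (batch, candidates, NUM_ITEMS)
sel_indices = th.tensor([1, 4])              # chosen candidate per batch element
chosen = feasible_sels[th.arange(2), sel_indices, :]
print(chosen.size())  # torch.Size([2, 3])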
Example #6
    def selection(self, sel_indices, feasible_sels, num_feasible_sels):
        # "GRU_o": encode dialogue for selection
        a = self.activations

        assert sel_indices.dim() == 1, sel_indices.size()
        batch_size = sel_indices.size()[0]

        a.combined_repr = self.combined_layer(th.cat([a.context_repr, a.dialogue_repr],
                                                     dim=1))
        assert a.combined_repr.dim() == 2, a.combined_repr.size()
        assert a.combined_repr.size()[0] == batch_size, (a.combined_repr.size(), batch_size)

        a.all_item_scores = log_softmax(self.selection_layer(a.combined_repr))
        assert a.all_item_scores.size() == (batch_size, self.selection_layer.out_features), \
            (a.all_item_scores.size(), (batch_size, self.selection_layer.out_features))

        a.feasible_item_scores = a.all_item_scores[
            lrange(a.all_item_scores.size()[0])[:, None, None],
            feasible_sels.data
        ]
        assert a.feasible_item_scores.size() == (batch_size, MAX_FEASIBLE + 3, NUM_ITEMS), \
            (a.feasible_item_scores.size(), batch_size)

        # Positions beyond num_feasible_sels are invalid; log(0) = -inf
        # removes them from the softmax
        num_feasible_mask = th.autograd.Variable(cu(
            (lrange(a.feasible_item_scores.size()[1])[None, :, None] <=
             num_feasible_sels.data[:, None, None]).float()
        ))
        a.feasible_masked = a.feasible_item_scores + th.log(num_feasible_mask)
        a.full_selection_scores = log_softmax(a.feasible_masked.sum(dim=2), dim=1)
        assert a.full_selection_scores.size() == (batch_size, MAX_FEASIBLE + 3), \
            (a.full_selection_scores.size(), batch_size)

        a.selection_beam_score, selection_beam = a.full_selection_scores.max(dim=1)
        assert selection_beam.size() == (batch_size,), (selection_beam.size(), batch_size)
        selection_sample = th.multinomial(th.exp(a.full_selection_scores),
                                          1, replacement=True)[:, 0]
        # Log score of the sampled selection, consistent with the log-domain
        # beam and target scores below
        a.selection_sample_score = a.full_selection_scores[
            lrange(a.full_selection_scores.size()[0]),
            selection_sample.data
        ]
        assert selection_sample.size() == (batch_size,), (selection_sample.size(), batch_size)
        selection_predict = {
            'beam': self.sel_indices_to_selection(feasible_sels, selection_beam),
            'sample': self.sel_indices_to_selection(feasible_sels, selection_sample),
        }
        assert selection_predict['beam'].size() == (batch_size, NUM_ITEMS), \
            (selection_predict['beam'].size(), batch_size)
        assert selection_predict['sample'].size() == (batch_size, NUM_ITEMS), \
            (selection_predict['sample'].size(), batch_size)
        a.selection_target_score = a.full_selection_scores[
            lrange(a.full_selection_scores.size()[0]),
            sel_indices.data
        ]
        assert a.selection_target_score.size() == (batch_size,), \
            (a.selection_target_score.size(), batch_size)
        selection_score = {
            'target': a.selection_target_score,
            'beam': a.selection_beam_score,
            'sample': a.selection_sample_score,
        }

        return selection_predict, selection_score
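
The beam/sample split at the end mirrors the decoder examples above: the beam path takes the argmax of the log scores, while the sample path draws from the corresponding distribution with th.multinomial. A standalone illustration of the two paths on toy shapes:

import torch as th

log_scores = th.log_softmax(th.randn(2, 7), dim=1)  # (batch, candidates)
beam_score, beam = log_scores.max(dim=1)            # greedy choice per row
sample = th.multinomial(th.exp(log_scores), 1, replacement=True)[:, 0]
sample_score = log_scores[th.arange(2), sample]     # log score of the sample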