コード例 #1
0
ファイル: test_dict.py プロジェクト: simplecoka/cortx
 def _run_specialtok_test(self, **kwargs):
     """Check that registering a special token changes tokenization.

     For each sample special token, a ``DictionaryAgent`` is built from
     ``kwargs`` and the same sentence is tokenized before and after the
     token is registered with ``add_additional_special_tokens``.

     :param kwargs: options forwarded to ``ParlaiParser.parse_kwargs``. When
         ``dict_file`` is not supplied, a path inside this iteration's
         temporary directory is used.
     """
     for special_token in ['SPECIAL TOKENS', '[SPECIAL; TOKENS]']:
         with testing_utils.tempdir() as tmpdir:
             # Build the options on a copy: writing the default dict_file
             # into ``kwargs`` itself would make later iterations reuse a
             # path inside an earlier iteration's temporary directory.
             run_kwargs = dict(kwargs)
             run_kwargs.setdefault('dict_file', os.path.join(tmpdir, 'dict'))
             string = f"This is a test of {special_token}"
             parser = ParlaiParser(False, False)
             DictionaryAgent.add_cmdline_args(parser, partial_opt=None)
             opt = parser.parse_kwargs(**run_kwargs)
             da = DictionaryAgent(opt)
             before = da.tokenize(string)
             da.add_additional_special_tokens([special_token])
             after = da.tokenize(string)
             # The special token must now come out as one trailing token,
             # leaving the leading tokens of the sentence untouched.
             assert before != after
             assert len(before) > len(after)
             assert after[-1] == special_token
             assert before[:5] == after[:5]
             if opt['dict_tokenizer'] in (
                     'bytelevelbpe',
                     'gpt2',
                     'slow_bytelevel_bpe',
             ):
                 # we need to let the dictionary handle the tokenid mappings
                 assert da.vec2txt(da.txt2vec(string)) == string
コード例 #2
0
class PolyEncoderTokenizer(PreTrainedTokenizer):
    """``PreTrainedTokenizer`` facade over a ``DictionaryAgent``.

    The dictionary is built from ``load_poly_encoder_opt()`` and supplies
    the vocabulary, the tokenization and the special tokens.
    """

    def __init__(self, **kwargs):
        """Create the dictionary, then initialise the base tokenizer.

        :param kwargs: extra keyword arguments passed to the base class.
        """
        # The dictionary is created before the base initialiser runs.
        self.dict = DictionaryAgent(load_poly_encoder_opt())
        dictionary = self.dict
        super().__init__(
            unk_token=dictionary.unk_token,
            pad_token=dictionary.null_token,
            cls_token=dictionary.start_token,
            sep_token=dictionary.end_token,
            **kwargs,
        )

    def get_vocab(self):
        """Return the dictionary's token-to-index mapping."""
        vocab = self.dict.tok2ind
        return vocab

    def save_vocabulary(self, save_directory):
        """Do nothing; the vocabulary is not written out by this class."""
        return None

    @property
    def vocab_size(self):
        """Number of entries in the token-to-index mapping."""
        vocab = self.dict.tok2ind
        return len(vocab)

    def _tokenize(self, text, **kwargs):
        """Tokenize ``str(text)`` with the dictionary; ``kwargs`` are unused."""
        as_text = str(text)
        return self.dict.tokenize(as_text)

    def _convert_token_to_id(self, token):
        """Look up the index of ``token`` in the dictionary."""
        token_id = self.dict[token]
        return token_id

    def _convert_id_to_token(self, index):
        """Map ``index`` to its token, or the unknown token if absent."""
        id_to_token = self.dict.ind2tok
        return id_to_token.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Decode ``tokens`` to text via the dictionary's BPE helper."""
        return self.dict.bpe.decode(tokens, token_ids=[], delimiter=' ')

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap one or two id sequences with the CLS/SEP ids.

        :param token_ids_0: list of ids for the first sequence.
        :param token_ids_1: optional list of ids for the second sequence.
        :returns: ``[CLS] ids_0 [SEP]`` or ``[CLS] ids_0 [SEP] ids_1 [SEP]``.
        """
        wrapped_first = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        if token_ids_1 is None:
            return wrapped_first
        return wrapped_first + token_ids_1 + [self.sep_token_id]

    @classmethod
    def from_pretrained(cls, *inputs, **kwargs):
        """Return a fresh instance; all arguments are ignored."""
        instance = cls()
        return instance
コード例 #3
0
class IrBaselineAgent(Agent):
    """Information Retrieval baseline.

    Keeps the recent dialogue history, builds a weighted bag-of-words
    representation of it and replies with the best candidate according to
    ``rank_candidates``.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Add command line args specific to this agent."""
        parser = parser.add_argument_group('IrBaseline Arguments')
        parser.add_argument('-lp',
                            '--length_penalty',
                            type=float,
                            default=0.5,
                            help='length penalty for responses')
        parser.add_argument(
            '-hsz',
            '--history_size',
            type=int,
            default=1,
            help='number of utterances from the dialogue history to take use '
            'as the query')
        parser.add_argument('--label_candidates_file',
                            type=str,
                            default=None,
                            help='file of candidate responses to choose from')

    def __init__(self, opt, shared=None):
        """Initialize agent.

        :param opt: option dict; reads ``length_penalty`` and, optionally,
            ``label_candidates_file``.
        :param shared: accepted for interface compatibility; not used here.
        """
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt
        self.history = []
        self.episodeDone = True
        candidates_path = opt.get('label_candidates_file')
        if candidates_path:
            # Use a context manager so the file handle is always released.
            with open(candidates_path) as candidates_file:
                self.label_candidates = candidates_file.read().split('\n')

    def reset(self):
        """Reset agent properties."""
        self.observation = None
        self.history = []
        self.episodeDone = True

    def observe(self, obs):
        """Store and remember incoming observation message dict.

        The history is cleared whenever the previous observation ended an
        episode; the new text, if present, is appended to it.
        """
        self.observation = obs
        self.dictionary.observe(obs)
        if self.episodeDone:
            self.history = []
        if 'text' in obs:
            self.history.append(obs.get('text', ''))
        self.episodeDone = obs.get('episode_done', False)
        return obs

    def act(self):
        """Generate a response to the previously seen observation(s).

        :returns: reply dict with ``id`` and ``text``; ``text_candidates`` is
            included when candidates were available to rank.
        """
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # Rank candidates
        cands = None
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            cands = obs['label_candidates']
        if hasattr(self, 'label_candidates'):
            # override label candidates with candidate file if set
            cands = self.label_candidates
        if cands:
            # Query = the last `history_size` utterances joined by spaces.
            hist_sz = self.opt.get('history_size', 1)
            left_idx = max(0, len(self.history) - hist_sz)
            text = ' '.join(self.history[left_idx:len(self.history)])
            rep = self.build_query_representation(text)
            reply['text_candidates'] = (rank_candidates(
                rep, cands, self.length_penalty, self.dictionary))
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save dictionary tokenizer if available.

        :param fname: path prefix; falls back to ``opt['model_file']``.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load internal dictionary from ``fname + '.dict'``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build representation of query, e.g. words or n-grams.

        :param query: string to represent.

        :returns: dictionary containing 'words' dictionary (token => weight)
                  and 'norm' float (square root of the number of tokens)
        """
        words = list(self.dictionary.tokenize(query.lower()))
        # Fetch the frequency table once instead of once per token.
        freqs = self.dictionary.freqs()
        have_freqs = len(freqs) > 0
        weights = {}
        for w in words:
            if have_freqs:
                # Down-weight frequent tokens.
                weights[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
            elif w not in stopwords:
                # No frequency statistics yet: keep non-stopwords, weight 1.
                weights[w] = 1
        return {'words': weights, 'norm': math.sqrt(len(words))}
コード例 #4
0
ファイル: fairseq.py プロジェクト: urikz/ParlAI
class FairseqAgent(Agent):
    """Agent which takes an input sequence and produces an output sequence.

    For more information, see Convolutional Sequence to Sequence Learning
     `(Gehring et al. 2017) <https://arxiv.org/abs/1705.03122>`_.
    """

    @staticmethod
    def add_cmdline_args(argparser):
        """Register dictionary, agent and fairseq options on ``argparser``."""
        DictionaryAgent.add_cmdline_args(argparser)
        group = argparser.add_argument_group('Fairseq Arguments')

        truncate_help = (
            'truncate input & output lengths to speed up training (may '
            'reduce accuracy). This fixes all input and output to have a '
            'maximum length. This reduces the total amount of padding in '
            'the batches.'
        )
        group.add_argument('-tr', '--truncate', type=int, default=-1,
                           help=truncate_help)
        group.add_argument('--max-positions', default=1024, type=int,
                           metavar='N',
                           help='max number of tokens in the sequence')
        group.add_argument('--seed', default=1, type=int, metavar='N',
                           help='pseudo random number generator seed')

        # Option groups contributed by the `options` module.
        options.add_optimization_args(argparser)
        options.add_generation_args(argparser)
        options.add_model_args(argparser)

    def __init__(self, opt, shared=None):
        """Set up dictionaries, the FConv model, the criterion and the trainer.

        :param opt: option dict. If ``opt['model_file']`` names an existing
            file, its saved options and weights are loaded and the
            model-specific options override the ones given here.
        :param shared: when truthy, the full set-up is skipped and only
            ``reset()`` is called.
        """
        # initialize defaults first
        super().__init__(opt, shared)
        if not shared:
            # this is not a shared instance of this class, so do full
            # initialization. if shared is set, only set up shared members.
            saved_state = None
            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, saved_state = self.load(opt['model_file'])
                # override options with stored ones
                opt = self._override_opt(new_opt)

            self.args = OptWrapper(opt)
            self.parlai_dict = DictionaryAgent(opt)
            self.fairseq_dict = _make_fairseq_dict(self.parlai_dict)
            self.id = 'Fairseq'
            # a non-positive truncate option means "no truncation"
            self.truncate = opt['truncate'] if opt['truncate'] > 0 else None

            self.EOS = self.fairseq_dict[self.fairseq_dict.eos()]
            self.EOS_TENSOR = (torch.LongTensor(1, 1)
                               .fill_(self.fairseq_dict.eos()))
            self.NULL_IDX = self.fairseq_dict.pad()

            # NOTE(review): the layer/attention options below are run through
            # eval(), which executes arbitrary expressions. They come from the
            # command line or a loaded model file -- only use trusted input.
            encoder = fconv.FConvEncoder(
                self.fairseq_dict,
                embed_dim=self.args.encoder_embed_dim,
                convolutions=eval(self.args.encoder_layers),
                dropout=self.args.dropout,
                max_positions=self.args.max_positions)
            decoder = fconv.FConvDecoder(
                self.fairseq_dict,
                embed_dim=self.args.decoder_embed_dim,
                convolutions=eval(self.args.decoder_layers),
                out_embed_dim=self.args.decoder_out_embed_dim,
                attention=eval(self.args.decoder_attention),
                dropout=self.args.dropout,
                max_positions=self.args.max_positions)
            self.model = fconv.FConvModel(encoder, decoder)

            # from fairseq's build_criterion()
            if self.args.label_smoothing > 0:
                self.criterion = criterions.LabelSmoothedCrossEntropyCriterion(
                    self.args.label_smoothing, self.NULL_IDX)
            else:
                self.criterion = criterions.CrossEntropyCriterion(
                    self.args, self.fairseq_dict)

            self.trainer = MultiprocessingTrainer(self.args, self.model, self.criterion)
            # restore weights only after the trainer owns the model
            if saved_state is not None:
                self.set_states(saved_state)
        self.reset()

    def _override_opt(self, new_opt):
        """Copy model-specific options from ``new_opt`` into ``self.opt``.

        Every added or changed key is printed. Keys outside the model
        whitelist are ignored.

        :param new_opt: option mapping loaded from a saved model.
        :returns: ``self.opt`` after the update.
        """
        # NOTE(review): these keys are hyphenated; confirm the stored options
        # use the same spelling rather than underscores.
        overridable = {
            'arch',
            'encoder-embed-dim',
            'encoder-layers',
            'decoder-embed-dim',
            'decoder-layers',
            'decoder-out-embed-dim',
            'decoder-attention',
        }

        for key, value in new_opt.items():
            if key in overridable:
                if key not in self.opt:
                    print('Adding new option [ {k}: {v} ]'.format(
                        k=key, v=value))
                elif self.opt[key] != value:
                    print('Overriding option [ {k}: {old} => {v}]'.format(
                        k=key, old=self.opt[key], v=value))
                self.opt[key] = value
        return self.opt

    def reset(self):
        """Forget the last observation and mark the episode as finished."""
        self.episode_done = True
        self.observation = None

    def observe(self, observation):
        """Store a copy of ``observation``, prefixed with unfinished dialogue.

        If the previous observation did not end its episode and the new one
        is not marked ``preprocessed``, the previous text is prepended to the
        new text, separated by a newline.

        :returns: the stored (shallow-copied) observation.
        """
        # A shallow copy is enough; deep copies can be expensive.
        obs = observation.copy()
        mid_episode = not self.episode_done
        if mid_episode and not obs.get('preprocessed', False):
            obs['text'] = self.observation['text'] + '\n' + obs['text']
        self.observation = obs
        self.episode_done = obs['episode_done']
        return obs

    def act(self):
        """Reply to the stored observation by running a batch of one."""
        replies = self.batch_act([self.observation])
        return replies[0]

    def batch_act(self, observations):
        """Train on, or generate replies for, a batch of observations.

        The batch is split into ``self.trainer.num_replicas`` sub-batches.
        If no sub-batch carries targets, predictions are generated and
        written to the matching replies; otherwise a training step is run
        and its metrics (scaled by the batch size) go on the first reply.

        :param observations: list of observation dicts.
        :returns: list of reply dicts, one per observation, in input order.
        """
        bsz = len(observations)
        # initialize a table of replies with this agent's id
        batch_reply = [{'id': self.getID()} for _ in range(bsz)]

        # split observations into sub-batches based on number of replicas
        obs_split = np.array_split(observations, self.trainer.num_replicas)

        # Convert each sub-batch into tensors. valid_inds returned by
        # batchify are relative to the sub-batch, so remember where each
        # sub-batch starts in the full batch. The start has to advance by
        # the sub-batch size (not the number of valid examples), including
        # for sub-batches that are dropped for having no valid example;
        # otherwise predictions are written to the wrong replies.
        samples = []
        starts = []
        start = 0
        for obs in obs_split:
            sample = self.batchify(obs)
            if sample[0] is not None:
                samples.append(sample)
                starts.append(start)
            start += len(obs)
        any_valid = any(len(s[0]) > 0 for s in samples)

        if not any_valid:
            # no valid examples, just return the empty responses we set up
            return batch_reply

        # produce predictions if testing; otherwise, train
        has_targets = any(s[1] is not None for s in samples)
        if not has_targets:
            for start, s in zip(starts, samples):
                xs = s[0]
                valid_inds = s[2]

                predictions = self._generate(self.args, xs)
                for i, prediction in enumerate(predictions):
                    # map the predictions back to non-empty examples
                    batch_reply[valid_inds[i] + start]['text'] = prediction
                    if i == 0:
                        print('prediction:', prediction)
        else:
            loss = self._train(samples)

            batch_reply[0]['metrics'] = {}
            for k, v in loss.items():
                batch_reply[0]['metrics'][k] = v * bsz

        return batch_reply

    def parse(self, string):
        """Tokenize ``string`` and map each token to a fairseq index."""
        tokens = self.parlai_dict.tokenize(string)
        return [self.fairseq_dict.index(token) for token in tokens]

    def batchify(self, observations):
        """Convert a list of observations into input & target tensors.

        :param observations: observation dicts; those without a ``'text'``
            key are ignored.
        :returns: ``(xs, ys, valid_inds)``: the input tensor, the target
            tensor (``None`` when the first valid example has no
            ``'labels'``) and the positions of the valid examples within
            ``observations``. Returns ``(None, None, None)`` when there is
            no valid example.
        """
        # valid examples
        exs = [ex for ex in observations if 'text' in ex]
        # the indices of the valid (non-empty) tensors
        valid_inds = [i for i, ex in enumerate(observations) if 'text' in ex]

        # set up the input tensors
        batchsize = len(exs)
        if batchsize == 0:
            return None, None, None
        # tokenize the text; a bounded deque drops items from the left when
        # full, so only the last `truncate` input tokens are kept
        parsed_x = [deque(maxlen=self.truncate) for _ in exs]
        for dq, ex in zip(parsed_x, exs):
            dq += self.parse(ex['text'])
        max_x_len = max((len(x) for x in parsed_x))
        for x in parsed_x:
            # left pad with the dictionary's pad index
            x.extendleft([self.fairseq_dict.pad()] * (max_x_len - len(x)))
        xs = torch.LongTensor(parsed_x)

        # set up the target tensors
        ys = None
        if 'labels' in exs[0]:
            # randomly select one of the labels to update on, if multiple
            labels = [random.choice(ex.get('labels', [''])) for ex in exs]
            parsed_y = [deque(maxlen=self.truncate) for _ in labels]
            for dq, y in zip(parsed_y, labels):
                # pushing the reversed tokens on the left keeps their
                # original order and, when bounded, the first `truncate` ones
                dq.extendleft(reversed(self.parse(y)))
            # append EOS to each label
            # NOTE(review): on a full bounded deque this append drops the
            # first label token -- confirm that is intended.
            for y in parsed_y:
                y.append(self.fairseq_dict.eos())
            max_y_len = max(len(y) for y in parsed_y)
            for y in parsed_y:
                # right pad with the dictionary's pad index
                y += [self.fairseq_dict.pad()] * (max_y_len - len(y))
            ys = torch.LongTensor(parsed_y)
        return xs, ys, valid_inds

    def _positions_for_tokens(self, tokens):
        """Build a tensor of position indices with the shape of ``tokens``.

        Every entry starts at the pad index, non-pad entries get one more,
        and each column then accumulates the previous column minus one.
        NOTE(review): this appears to yield consecutive positions for
        left-padded rows when the pad index is 1 -- confirm.
        """
        pad_idx = self.fairseq_dict.pad()
        shape = tokens.size()
        is_token = tokens.ne(pad_idx).long()
        positions = tokens.new(shape).fill_(pad_idx)
        positions += is_token
        for col in range(1, shape[1]):
            positions[:, col] += positions[:, col - 1] - 1
        return positions

    def _right_shifted_ys(self, ys):
        """Return ``ys`` shifted one column right, with EOS in column 0."""
        shifted = torch.LongTensor(ys.size())
        # Drop the last column of ys and make room for EOS at the front.
        shifted[:, 1:] = ys[:, :-1]
        shifted[:, 0] = self.fairseq_dict.index(self.EOS)
        return shifted

    def _generate(self, opt, src_tokens):
        """Generate one output string per row of ``src_tokens``.

        The ``SequenceGenerator`` is created on first use and cached on the
        instance as ``self.translator``.

        :param opt: options object providing ``beam``, ``no_early_stop``,
            ``unnormalized`` and ``lenpen``.
        :param src_tokens: tensor of source token indices.
        :returns: list of space-joined token strings, taken from the first
            hypothesis of each translation with its last token removed.
        """
        if not hasattr(self, 'translator'):
            self.translator = SequenceGenerator(
                [self.trainer.get_model()],
                beam_size=opt.beam,
                stop_early=(not opt.no_early_stop),
                normalize_scores=(not opt.unnormalized),
                len_penalty=opt.lenpen)
            self.translator.cuda()
        # `async` is a reserved word since Python 3.7, so `cuda(async=True)`
        # no longer parses; PyTorch renamed the argument to `non_blocking`.
        tokens = src_tokens.cuda(non_blocking=True)
        translations = self.translator.generate(Variable(tokens))
        return [
            ' '.join(self.fairseq_dict[idx]
                     for idx in hypotheses[0]['tokens'][:-1])
            for hypotheses in translations
        ]

    def _train(self, samples):
        """Run one trainer step on ``samples``.

        Each entry of ``samples`` is replaced in place by the sample dict
        handed to the trainer.

        :param samples: list of sequences whose first two items are the
            source and target tensors.
        :returns: whatever ``self.trainer.train_step`` returns.
        """
        for idx, raw in enumerate(samples):
            src_tokens, target = raw[0], raw[1]
            input_tokens = self._right_shifted_ys(target)
            # add extra info to samples
            samples[idx] = {
                'src_tokens': src_tokens,
                'input_tokens': input_tokens,
                'target': target,
                'id': None,
                'ntokens': sum(len(row) for row in target),
                'src_positions': self._positions_for_tokens(src_tokens),
                'input_positions': self._positions_for_tokens(input_tokens),
            }
        return self.trainer.train_step(samples)

    def save(self, path=None):
        """Write the model weights and options to ``path``.

        :param path: destination file; falls back to ``opt['model_file']``.
            Nothing is written without a path or before a trainer exists.
        """
        if path is None:
            path = self.opt.get('model_file', None)
        if not path or not hasattr(self, 'trainer'):
            return
        checkpoint = {
            'state_dict': self.trainer.get_model().state_dict(),
            'opt': self.opt,
        }
        with open(path, 'wb') as handle:
            torch.save(checkpoint, handle)

    def shutdown(self):
        """Save to ``<model_file>.shutdown_state`` if set, then shut down."""
        model_file = self.opt.get('model_file', None)
        if model_file is not None:
            shutdown_path = model_file + '.shutdown_state'
            self.save(shutdown_path)
        super().shutdown()

    def load(self, path):
        """Read a checkpoint and return ``(opt, state_dict)`` from it.

        NOTE(review): ``torch.load`` unpickles the file; only load
        checkpoints from trusted sources.
        """
        # Keep every storage where it was deserialized.
        checkpoint = torch.load(
            path, map_location=lambda storage, _location: storage)
        return checkpoint['opt'], checkpoint['state_dict']

    def set_states(self, state_dict):
        """Load ``state_dict`` into the trainer's model."""
        model = self.trainer.get_model()
        model.load_state_dict(state_dict)
コード例 #5
0
class LanguageModelAgent(Agent):
    """ Agent which trains an RNN on a language modeling task.
    It is adapted from the language model featured in Pytorch's examples repo
    here: <https://github.com/pytorch/examples/tree/master/word_language_model>.
    """
    @staticmethod
    def dictionary_class():
        """Return the dictionary class used by this agent."""
        return DictionaryAgent

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent.

        Also sets the ``batch_sort`` default to ``False`` and registers the
        dictionary class's own arguments.

        :param argparser: parser to extend. The ``'bool'`` type string must
            be one the parser knows how to resolve.
        :returns: the created 'Language Model Arguments' group.
        """
        argparser.set_defaults(batch_sort=False)
        agent = argparser.add_argument_group('Language Model Arguments')
        agent.add_argument(
            '--init-model',
            type=str,
            default=None,
            help='load dict/features/weights/opts from this file')
        agent.add_argument('-hs',
                           '--hiddensize',
                           type=int,
                           default=200,
                           help='size of the hidden layers')
        agent.add_argument('-esz',
                           '--embeddingsize',
                           type=int,
                           default=200,
                           help='size of the token embeddings')
        agent.add_argument('-nl',
                           '--numlayers',
                           type=int,
                           default=2,
                           help='number of hidden layers')
        agent.add_argument('-dr',
                           '--dropout',
                           type=float,
                           default=0.2,
                           help='dropout rate')
        agent.add_argument('-clip',
                           '--gradient-clip',
                           type=float,
                           default=0.25,
                           help='gradient clipping')
        agent.add_argument('--no-cuda',
                           action='store_true',
                           default=False,
                           help='disable GPUs even if available')
        agent.add_argument(
            '-rnn',
            '--rnn-class',
            default='LSTM',
            help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
        agent.add_argument('-sl',
                           '--seq-len',
                           type=int,
                           default=35,
                           help='sequence length')
        agent.add_argument('-tied',
                           '--emb-tied',
                           action='store_true',
                           help='tie the word embedding and softmax weights')
        agent.add_argument('-seed',
                           '--random-seed',
                           type=int,
                           default=1111,
                           help='random seed')
        agent.add_argument('--gpu',
                           type=int,
                           default=-1,
                           help='which GPU device to use')
        agent.add_argument('-tr',
                           '--truncate-pred',
                           type=int,
                           default=50,
                           help='truncate predictions')
        agent.add_argument('-rf',
                           '--report-freq',
                           type=float,
                           default=0.1,
                           help='report frequency of prediction during eval')
        agent.add_argument('-pt',
                           '--person-tokens',
                           type='bool',
                           default=True,
                           help='append person1 and person2 tokens to text')
        # learning rate parameters
        agent.add_argument('-lr',
                           '--learningrate',
                           type=float,
                           default=20,
                           help='initial learning rate')
        # Help texts below use adjacent string literals: a backslash
        # continuation inside the literal would embed the source
        # indentation in the message.
        agent.add_argument(
            '-lrf',
            '--lr-factor',
            type=float,
            default=1.0,
            help='multiply learning rate by this factor when the '
                 'validation loss does not decrease')
        agent.add_argument('-lrp',
                           '--lr-patience',
                           type=int,
                           default=10,
                           help='wait before decreasing learning rate')
        agent.add_argument('-lrm',
                           '--lr-minimum',
                           type=float,
                           default=0.1,
                           help='minimum learning rate')
        agent.add_argument(
            '-sm',
            '--sampling-mode',
            type='bool',
            default=False,
            help='sample when generating tokens instead of taking '
                 'the max and do not produce UNK token (when bs=1)')
        LanguageModelAgent.dictionary_class().add_cmdline_args(argparser)
        return agent

    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do.

        :param opt: option dict. ``init_model`` / ``model_file`` select a
            saved model to load (``model_file`` wins when both exist).
        :param shared: dict produced by ``share()``; when given, the
            options, dictionary, model and metrics are taken from it
            instead of being built.
        """
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init
        self.metrics = {
            'loss': 0,
            'num_tokens': 0,
            'lmloss': 0,
            'lm_num_tokens': 0
        }
        self.states = {}
        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
        self.batchsize = opt.get('batchsize', 1)
        self.use_person_tokens = opt.get('person_tokens', True)
        self.sampling_mode = opt.get('sampling_mode', False)

        if shared:
            # set up shared properties
            self.opt = shared['opt']
            opt = self.opt
            self.dict = shared['dict']

            self.model = shared['model']
            self.metrics = shared['metrics']

            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]

            if 'states' in shared:
                self.states = shared['states']

            if self.use_person_tokens:
                # add person1 and person2 tokens
                self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
                self.dict.add_to_dict(self.dict.tokenize("PERSON2"))

        else:
            # this is not a shared instance of this class, so do full init
            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            init_model = None
            # check first for 'init_model' for loading model from file
            if opt.get('init_model') and os.path.isfile(opt['init_model']):
                init_model = opt['init_model']
            # next check for 'model_file', this would override init_model
            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                init_model = opt['model_file']

            # for backwards compatibility: will only be called for older models
            # for which .opt file does not exist
            if (init_model is not None
                    and not os.path.isfile(init_model + '.opt')):
                new_opt = self.load_opt(init_model)
                # load model parameters if available
                print('[ Setting opt from {} ]'.format(init_model))
                # since .opt file does not exist, save one for future use
                print("Saving opt file at:", init_model + ".opt")
                with open(init_model + ".opt", 'wb') as handle:
                    pickle.dump(new_opt,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                opt = self.override_opt(new_opt)

            # Point dict_file next to the loaded model when that file exists
            # or when no dict_file was given. Without a model to load there
            # is no path to derive, so leave dict_file alone (the previous
            # code evaluated `None + '.dict'` and raised TypeError).
            if init_model is not None and (
                    os.path.isfile(init_model + '.dict')
                    or opt.get('dict_file') is None):
                opt['dict_file'] = init_model + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'LanguageModel'

            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]

            if self.use_person_tokens:
                # add person1 and person2 tokens
                self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
                self.dict.add_to_dict(self.dict.tokenize("PERSON2"))

            # set model; sized after the person tokens were added
            self.model = RNNModel(opt, len(self.dict))

            if init_model is not None:
                self.load(init_model)

            if self.use_cuda:
                self.model.cuda()

        self.next_observe = []
        self.next_batch = []

        self.is_training = True

        self.clip = opt.get('gradient_clip', 0.25)
        # set up criteria: summed loss, ignoring NULL targets
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX,
                                             size_average=False)
        if self.use_cuda:
            # push to cuda
            self.criterion.cuda()
        # init hidden state
        self.hidden = self.model.init_hidden(self.batchsize)
        # init tensor of end tokens
        self.ends = torch.LongTensor(
            [self.END_IDX for _ in range(self.batchsize)])
        if self.use_cuda:
            self.ends = self.ends.cuda()
        # set up model and learning rate scheduler parameters
        self.lr = opt['learningrate']
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr)
        self.best_val_loss = self.states.get('best_val_loss', None)
        self.lr_factor = opt['lr_factor']
        # a factor of 1.0 or more disables learning-rate scheduling
        if self.lr_factor < 1.0:
            self.lr_patience = opt['lr_patience']
            self.lr_min = opt['lr_minimum']
            self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer,
                factor=self.lr_factor,
                verbose=True,
                patience=self.lr_patience,
                min_lr=self.lr_min)
            # initial step for scheduler if self.best_val_loss is initialized
            if self.best_val_loss is not None:
                self.scheduler.step(self.best_val_loss)
        else:
            self.scheduler = None

        self.reset()

    def override_opt(self, new_opt):
        """Copy model-specific options from ``new_opt`` into ``self.opt``.

        Every added or changed key is printed. Keys outside the model
        whitelist are ignored.

        :param new_opt: option mapping loaded from a saved model.
        :returns: ``self.opt`` after the update.
        """
        overridable = {
            'hiddensize', 'embeddingsize', 'numlayers', 'dropout', 'seq_len',
            'emb_tied', 'truncate_pred', 'report_freq', 'person_tokens',
            'learningrate'
        }
        for key, value in new_opt.items():
            if key in overridable:
                if key not in self.opt:
                    print('Adding new option [ {k}: {v} ]'.format(
                        k=key, v=value))
                elif self.opt[key] != value:
                    print('Overriding option [ {k}: {old} => {v}]'.format(
                        k=key, old=self.opt[key], v=value))
                self.opt[key] = value
        return self.opt

    def parse(self, text):
        """Turn ``text`` into token indices using the dictionary."""
        indices = self.dict.txt2vec(text)
        return indices

    def zero_grad(self):
        """Clear the gradients held by the optimizer."""
        optimizer = self.optimizer
        optimizer.zero_grad()

    def update_params(self):
        """Clip the gradient norm to ``self.clip`` and take one step."""
        parameters = self.model.parameters()
        torch.nn.utils.clip_grad_norm_(parameters, self.clip)
        self.optimizer.step()

    def reset(self):
        """Drop the stored observation and zero the metrics."""
        self.observation = None
        # Metrics are reset through the dedicated helper.
        self.reset_metrics()

    def reset_metrics(self):
        """Empty the metrics table and set every tracked counter to zero."""
        self.metrics.clear()
        for name in ('loss', 'lmloss', 'num_tokens', 'lm_num_tokens'):
            self.metrics[name] = 0

    def report(self):
        """Return per-token loss and perplexity for the tracked metrics.

        A loss/perplexity pair is reported only when its token count is
        positive. All values are passed through ``round_sigfigs(v, 4)``.
        """
        raw = {}
        tracked = (
            ('loss', 'num_tokens', 'ppl'),
            ('lmloss', 'lm_num_tokens', 'lmppl'),
        )
        for loss_key, count_key, ppl_key in tracked:
            if self.metrics[count_key] > 0:
                raw[loss_key] = (
                    self.metrics[loss_key] / self.metrics[count_key])
                raw[ppl_key] = math.exp(raw[loss_key])
        # clean up: rounds to sigfigs and converts tensors to floats
        return {key: round_sigfigs(value, 4) for key, value in raw.items()}

    def share(self):
        """Build the dict of state handed to child instances.

        With more than one thread, plain-dict metrics are first wrapped in
        a ``SharedTable`` and the model is moved to shared memory; the
        optimizer state is then included under ``'states'``.
        """
        shared = super().share()
        shared['opt'] = self.opt
        shared['dict'] = self.dict
        shared['NULL_IDX'] = self.NULL_IDX
        shared['END_IDX'] = self.END_IDX
        shared['model'] = self.model

        multithreaded = self.opt.get('numthreads', 1) > 1
        if multithreaded:
            if type(self.metrics) is dict:
                # move metrics and model to shared memory
                self.metrics = SharedTable(self.metrics)
                self.model.share_memory()
            # only need to pass optimizer states
            shared['states'] = {'optimizer': self.optimizer.state_dict()}

        shared['metrics'] = self.metrics
        return shared

    def observe(self, observation):
        """Save observation for act.
        If multiple observations are from the same episode, concatenate them.
        """
        # shallow copy observation (deep copy can be expensive)
        obs = observation.copy()
        seq_len = self.opt['seq_len']
        is_training = True
        if 'labels' not in obs:
            is_training = False

        if is_training:
            if 'text' in obs:
                if self.use_person_tokens:
                    obs['text'] = 'PERSON1 ' + obs['text']
                vec = self.parse(obs['text'])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if 'labels' in obs:
                if self.use_person_tokens:
                    labels = [
                        'PERSON2 ' + label for label in obs['labels']
                        if label != ''
                    ]
                    obs['labels'] = tuple(labels)
                vec = self.parse(obs['labels'][0])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if len(self.next_observe) < (seq_len + 1):
                # not enough to return to make a batch
                # we handle this case in vectorize
                # labels indicates that we are training
                self.observation = {'labels': ''}
                return self.observation
            else:
                vecs_to_return = []
                total = len(self.next_observe) // (seq_len + 1)
                for _ in range(total):
                    observe = self.next_observe[:(seq_len + 1)]
                    self.next_observe = self.next_observe[(seq_len + 1):]
                    vecs_to_return.append(observe)
                dict_to_return = {
                    'text': '',
                    'labels': '',
                    'text2vec': vecs_to_return
                }
                self.observation = dict_to_return
                return dict_to_return
        else:
            if 'text' in obs:
                if self.use_person_tokens:
                    obs['text'] = 'PERSON1 ' + obs['text']
            if 'eval_labels' in obs:
                if self.use_person_tokens:
                    eval_labels = [
                        'PERSON2 ' + label for label in obs['eval_labels']
                        if label != ''
                    ]
                    obs['eval_labels'] = tuple(eval_labels)
            self.observation = obs
            return obs

    def repackage_hidden(self, h):
        """Re-wrap hidden state(s) in fresh Variables, cutting their history.

        A single Variable is rebuilt from its ``.data``; anything else is
        treated as an iterable of hidden states and converted, recursively,
        into a tuple.
        """
        if not isinstance(h, Variable):
            # container of hidden states: repackage each element in turn
            return tuple(self.repackage_hidden(item) for item in h)
        return Variable(h.data)

    def get_target_loss(self, data, hidden, targets):
        """Calculate the loss with respect to the targets, token by token.

        Each output token is conditioned on either the input or the previous
        target token.

        :param data: input tensor; ``data.size(0)`` is used as the batch size
            and the tensor is transposed before being fed to the model.
        :param hidden: hidden state passed to the first model call.
        :param targets: target tensor indexed as ``targets.select(1, i)`` for
            step ``i``, or None when there are no targets.
        :returns: ``0.0`` when ``targets`` is None, otherwise the sum over
            all target steps of ``self.criterion(...).data``.

        Side effect: ``self.hidden`` is overwritten with the repackaged
        hidden state after every model call.
        """
        loss = 0.0
        bsz = data.size(0)

        # during interactive mode, when no targets exist, we return 0
        if targets is None:
            return loss

        # feed in inputs without end token
        output, hidden = self.model(data.transpose(0, 1), hidden)
        self.hidden = self.repackage_hidden(hidden)
        # feed in end tokens
        output, hidden = self.model(Variable(self.ends[:bsz].view(1, bsz)),
                                    self.hidden)
        self.hidden = self.repackage_hidden(hidden)
        # the output after the end tokens is scored against the first target
        output_flat = output.view(-1, len(self.dict))
        loss += self.criterion(output_flat, targets.select(1, 0).view(-1)).data

        # remaining steps: feed target i-1 and score the output against target i
        for i in range(1, targets.size(1)):
            output, hidden = self.model(targets.select(1, i - 1).view(1, bsz),
                                        self.hidden,
                                        no_pack=True)
            self.hidden = self.repackage_hidden(hidden)
            output_flat = output.view(-1, len(self.dict))
            loss += self.criterion(output_flat,
                                   targets.select(1, i).view(-1)).data

        return loss

    def get_predictions(self, data):
        """Generate predictions token by token.

        Decoding stops once every batch item has produced the end token or
        the step counter exceeds ``opt['truncate_pred']``.

        :param data: input tensor; ``data.size(0)`` is used as the batch size
            and the tensor is transposed before being fed to the model.
        :returns: the per-step token indices, each viewed as ``(bsz, 1)`` and
            concatenated along dimension 1.

        For batches larger than one the highest-weight token is taken at each
        step. For a single example the token is sampled when
        ``self.sampling_mode`` is set, and taken greedily otherwise.
        """
        token_list = []
        bsz = data.size(0)
        done = [False for _ in range(bsz)]
        total_done = 0
        hidden = self.model.init_hidden(bsz)

        i = 0
        word_idx = None
        while total_done < bsz and i <= self.opt['truncate_pred']:
            if i == 0:
                # feed in input without end tokens
                output, hidden = self.model(data.transpose(0, 1), hidden)
                hidden = self.repackage_hidden(hidden)
                # feed in end tokens
                output, hidden = self.model(
                    Variable(self.ends[:bsz].view(1, bsz)), hidden)
            else:
                # later steps: feed back the tokens chosen at the previous step
                output, hidden = self.model(Variable(word_idx.view(1, bsz)),
                                            hidden,
                                            no_pack=True)
            hidden = self.repackage_hidden(hidden)
            # exponentiate the model output to get the per-token weights
            # (presumably the model emits log-probabilities -- TODO confirm)
            word_weights = output.squeeze().data.exp()
            if bsz > 1:
                # batched decoding is always greedy
                _, word_idx = torch.max(word_weights, 1)
            else:
                if self.sampling_mode:
                    unk_idx = self.dict[self.dict.unk_token]
                    # make word_weights have smaller norm so that calculated
                    # norm does not blow up
                    word_weights = word_weights.div(1e10)
                    # make word_weights have L2 norm 1
                    ww_norm = torch.norm(word_weights, p=2)
                    word_weights = word_weights.div(ww_norm)
                    # square distribution
                    word_weights = torch.mul(word_weights, word_weights)
                    # sample distribution
                    word_idx = torch.multinomial(word_weights, 1)
                    # do not produce UNK token
                    while word_idx == unk_idx:
                        word_idx = torch.multinomial(word_weights, 1)
                else:
                    _, word_idx = torch.max(word_weights, 0)
            # mark end indices for items in batch
            word_idx = word_idx.view(-1)
            for k in range(word_idx.size(0)):
                if not done[k]:
                    if int(word_idx[k]) == self.END_IDX:
                        done[k] = True
                        total_done += 1
            # tokens are recorded for every item, including already-finished ones
            token_list.append(word_idx.view(bsz, 1))
            i += 1

        return torch.cat(token_list, 1)

    def predict(self, data, hidden, targets=None, is_training=True,
                y_lens=None):
        """Run one training step or one evaluation pass.

        In training mode the model is run on ``data``, the criterion loss
        against ``targets`` is added to the ``lmloss`` / ``lm_num_tokens``
        metrics, and the loss divided by the number of non-null target tokens
        is back-propagated before the parameters are updated.

        In evaluation mode predictions are generated for ``data``; when
        ``targets`` is given the target loss and ``sum(y_lens)`` are added to
        the ``loss`` / ``num_tokens`` metrics.

        :returns: ``(output, hidden, predictions)``. ``output`` is None in
            evaluation mode and ``predictions`` is None in training mode.
        """
        if not is_training:
            self.model.eval()
            predictions = self.get_predictions(data)
            bsz = data.size(0)
            if bsz != self.batchsize:
                # batch size changed, so the stored hidden state no longer fits
                self.hidden = self.model.init_hidden(bsz)
            if targets is not None:
                target_loss = self.get_target_loss(data, self.hidden, targets)
                self.metrics['loss'] += target_loss
                self.metrics['num_tokens'] += sum(y_lens)
            return None, hidden, predictions

        self.model.train()
        self.zero_grad()
        output, hidden = self.model(data, hidden)
        vocab_size = len(self.dict)
        loss = self.criterion(output.view(-1, vocab_size), targets.view(-1))
        # record the loss before it is averaged
        n_target_tokens = targets.ne(self.NULL_IDX).float().sum().item()
        self.metrics['lmloss'] += loss.double().item()
        self.metrics['lm_num_tokens'] += n_target_tokens
        # back-propagate the per-token average
        loss /= n_target_tokens
        loss.backward(retain_graph=True)
        self.update_params()
        return output, hidden, None

    def vectorize(self, observations, seq_len, is_training):
        """Turn a list of observations into lists of input and target tensors.

        Training: the ``'text2vec'`` chunks of every observation are queued in
        ``self.next_batch``. While the queue holds no more than ``batchsize``
        chunks, five Nones are returned. Otherwise as many full batches as
        possible are taken off the queue; each becomes a transposed
        LongTensor whose first ``seq_len`` rows are the input and whose rows
        from index 1 onward are the targets.

        Evaluation: the observations are padded with
        ``PaddingUtils.pad_text`` and wrapped as a single batch.

        :returns: ``(data_list, targets_list, labels, valid_inds, y_lens)``;
            the last three are None when training.
        """
        if not is_training:
            # pad_text selects the valid examples and pads them
            xs, ys, labels, valid_inds, _, y_lens = PaddingUtils.pad_text(
                observations, self.dict,
                end_idx=self.END_IDX, null_idx=self.NULL_IDX)
            on_gpu = self.use_cuda
            if xs is not None:
                xs = Variable(torch.LongTensor(xs))
                if on_gpu:
                    xs = xs.cuda()
            if ys is not None:
                ys = Variable(torch.LongTensor(ys))
                if on_gpu:
                    ys = ys.cuda()
            return [xs], [ys], labels, valid_inds, y_lens

        # queue every chunk the observations carry
        for obs in observations:
            if obs and 'text2vec' in obs:
                self.next_batch += obs['text2vec']

        if len(self.next_batch) <= self.batchsize:
            return None, None, None, None, None

        data_list, targets_list = [], []
        n_batches = len(self.next_batch) // self.batchsize
        for _ in range(n_batches):
            rows = self.next_batch[:self.batchsize]
            self.next_batch = self.next_batch[self.batchsize:]

            source = torch.LongTensor(rows).t().contiguous()
            data = Variable(source[:seq_len])
            targets = Variable(source[1:])
            if self.use_cuda:
                data, targets = data.cuda(), targets.cuda()

            data_list.append(data)
            targets_list.append(targets)

        return data_list, targets_list, None, None, None

    def batch_act(self, observations):
        """Process a batch of observations and return one reply list.

        The batch counts as training when any observation contains
        ``'labels'``. Whenever the mode flips between training and
        evaluation the hidden state is re-initialised. If ``vectorize``
        yields no data, one id-only reply per observation is returned.
        Otherwise, for every batch returned by ``vectorize`` a fresh set of
        ``len(observations)`` replies is appended to the result, filled in
        from the predictions when there are any.
        """
        n_obs = len(observations)
        batch_reply = [{'id': self.getID()} for _ in range(n_obs)]

        has_labels = ['labels' in obs for obs in observations]
        if any(has_labels):
            # entering training from evaluation: start from a clean hidden state
            if not self.is_training:
                self.hidden = self.model.init_hidden(self.batchsize)
            self.is_training = True
        else:
            # leaving training: likewise reset the hidden state
            if self.is_training:
                self.hidden = self.model.init_hidden(self.batchsize)
                self.is_training = False
        data_list, targets_list, labels, valid_inds, y_lens = self.vectorize(
            observations, self.opt['seq_len'], self.is_training)

        if data_list is None:
            # nothing to run yet; hand back the id-only replies
            return batch_reply

        # evaluation always yields exactly one batch; training yields however
        # many full batches were available when vectorize was called
        batch_reply = []
        for data, targets in zip(data_list, targets_list):
            replies = [{'id': self.getID()} for _ in range(n_obs)]
            # a None entry means there were no valid examples
            if data is not None:
                output, hidden, predictions = self.predict(
                    data, self.hidden, targets, self.is_training, y_lens)
                self.hidden = self.repackage_hidden(hidden)

                if predictions is not None:
                    # put the predictions back in the order of the observations
                    PaddingUtils.map_predictions(
                        predictions.cpu(), valid_inds, replies, observations,
                        self.dict, self.END_IDX,
                        report_freq=self.opt['report_freq'])
            batch_reply += replies

        # strip the speaker tokens so they do not affect prediction metrics
        if not self.is_training:
            for reply in batch_reply:
                if 'text' in reply:
                    without_p1 = reply['text'].replace('PERSON1 ', '')
                    reply['text'] = without_p1
                    reply['text'] = without_p1.replace('PERSON2 ', '')

        return batch_reply

    def act(self):
        """Act on the stored observation by treating it as a batch of one."""
        replies = self.batch_act([self.observation])
        return replies[0]

    def save(self, path=None):
        """Write the model checkpoint and a pickled copy of the options.

        ``path`` defaults to ``opt['model_file']``. Nothing is written when
        no path is available or the agent has no ``model`` attribute. The
        options are pickled separately to ``path + ".opt"``.
        """
        if path is None:
            path = self.opt.get('model_file', None)
        if not path or not hasattr(self, 'model'):
            return

        checkpoint = {
            'model': self.model.state_dict(),
            'opt': self.opt,
            'best_val_loss': self.best_val_loss,
        }
        with open(path, 'wb') as model_handle:
            torch.save(checkpoint, model_handle)
        # the options also go into a standalone file next to the model
        with open(path + ".opt", 'wb') as opt_handle:
            pickle.dump(self.opt, opt_handle, protocol=pickle.HIGHEST_PROTOCOL)

    def shutdown(self):
        """Save a ``.shutdown_state`` checkpoint, then shut down the parent.

        The checkpoint is only written when ``opt['model_file']`` is set.
        """
        model_file = self.opt.get('model_file', None)
        if model_file is not None:
            shutdown_path = model_file + '.shutdown_state'
            self.save(shutdown_path)
        super().shutdown()

    def receive_metrics(self, metrics_dict):
        """Step the scheduler with ``metrics_dict['loss']`` when both exist."""
        if 'loss' not in metrics_dict or self.scheduler is None:
            return
        self.scheduler.step(metrics_dict['loss'])

    def load_opt(self, path):
        """Read the checkpoint at ``path`` and return the ``'opt'`` entry."""
        checkpoint = torch.load(path, map_location=lambda cpu, _: cpu)
        return checkpoint['opt']

    def load(self, path):
        """Restore the model parameters from ``path`` if that file exists.

        The loaded checkpoint is kept on ``self.states``; a missing file is
        silently ignored.
        """
        if not os.path.isfile(path):
            return
        print('[ Loading existing model params from {} ]'.format(path))
        self.states = torch.load(path, map_location=lambda cpu, _: cpu)
        model_state = self.states['model']
        self.model.load_state_dict(model_state)
コード例 #6
0
class IrBaselineAgent(Agent):
    """Retrieval baseline that ranks label candidates against recent history.

    The query is built from the last ``history_size`` utterances of the
    current episode and scored against the candidates by
    ``rank_candidates``.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Register the dictionary arguments plus this agent's own options."""
        DictionaryAgent.add_cmdline_args(parser)
        parser.add_argument(
            '-lp', '--length_penalty',
            type=float,
            default=0.5,
            help='length penalty for responses')
        parser.add_argument(
            '-hsz', '--history_size',
            type=int,
            default=1,
            help='number of utterances from the dialogue history to take use as the query')

    def __init__(self, opt, shared=None):
        """Set up the agent's dictionary and an empty dialogue history."""
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt
        self.history = []
        self.episodeDone = True

    def reset(self):
        """Forget the current observation and the dialogue history."""
        self.observation = None
        self.history = []
        self.episodeDone = True

    def observe(self, obs):
        """Record ``obs``, pass it to the dictionary and extend the history.

        The history is cleared first when the previous observation ended an
        episode.
        """
        self.observation = obs
        self.dictionary.observe(obs)
        if self.episodeDone:
            # previous episode is over, so its utterances no longer apply
            self.history = []
        if 'text' in obs:
            self.history.append(obs.get('text', ''))
        self.episodeDone = obs.get('episode_done', False)
        return obs

    def act(self):
        """Reply with the best-ranked label candidate, if any were offered."""
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {'id': self.getID()}

        has_candidates = ('label_candidates' in obs
                          and len(obs['label_candidates']) > 0)
        if not has_candidates:
            reply['text'] = "I don't know."
            return reply

        # the query is the tail of the history, history_size utterances long
        n_turns = len(self.history)
        first = max(0, n_turns - self.opt.get('history_size', 1))
        query = ' '.join(self.history[first:n_turns])
        rep = self.build_query_representation(query)
        ranked = rank_candidates(rep, obs['label_candidates'],
                                 self.length_penalty, self.dictionary)
        reply['text_candidates'] = ranked
        reply['text'] = reply['text_candidates'][0]
        return reply

    def save(self, fname=None):
        """Save the dictionary next to ``fname`` (default: ``model_file``)."""
        if fname is None:
            fname = self.opt.get('model_file', None)
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary stored at ``fname + '.dict'``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build the weighted bag-of-words representation of ``query``.

        Returns a dict with ``'words'`` (token -> weight) and ``'norm'``
        (square root of the token count). When the dictionary has
        frequencies, a token's weight falls with the log of its frequency;
        otherwise every token that is not a stopword gets weight 1.
        """
        tokens = list(self.dictionary.tokenize(query.lower()))
        weights = {}
        for token in tokens:
            if len(self.dictionary.freqs()) > 0:
                frequency = self.dictionary.freqs()[token]
                weights[token] = 1.0 / (1.0 + math.log(1.0 + frequency))
            elif token not in stopwords:
                weights[token] = 1
        return {'words': weights, 'norm': math.sqrt(len(tokens))}
コード例 #7
0
ファイル: ir_baseline.py プロジェクト: ahiroto/ParlAI
class IrBaselineAgent(Agent):
    """Retrieval baseline that ranks label candidates against the query text.

    The text of the current observation is turned into a weighted
    bag-of-words and scored against the candidates by ``rank_candidates``.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Register the dictionary arguments plus the length penalty option."""
        DictionaryAgent.add_cmdline_args(parser)
        parser.add_argument(
            '-lp', '--length_penalty',
            default=0.5,
            help='length penalty for responses')

    def __init__(self, opt, shared=None):
        """Set up the agent's id, length penalty and dictionary."""
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt

    def observe(self, obs):
        """Record ``obs`` and pass it on to the dictionary."""
        self.observation = obs
        self.dictionary.observe(obs)
        return obs

    def act(self):
        """Reply with the best-ranked label candidate, if any were offered."""
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {'id': self.getID()}

        has_candidates = ('label_candidates' in obs
                          and len(obs['label_candidates']) > 0)
        if not has_candidates:
            reply['text'] = "I don't know."
            return reply

        rep = self.build_query_representation(obs['text'])
        ranked = rank_candidates(rep, obs['label_candidates'],
                                 self.length_penalty, self.dictionary)
        reply['text_candidates'] = ranked
        reply['text'] = reply['text_candidates'][0]
        return reply

    def save(self, fname=None):
        """Save the dictionary next to ``fname`` (default: ``model_file``)."""
        if fname is None:
            fname = self.opt.get('model_file', None)
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary stored at ``fname + '.dict'``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build the weighted bag-of-words representation of ``query``.

        Returns a dict with ``'words'`` (token -> weight) and ``'norm'``
        (square root of the token count). When the dictionary has
        frequencies, a token's weight falls with the log of its frequency;
        otherwise every token that is not a stopword gets weight 1.
        """
        tokens = list(self.dictionary.tokenize(query.lower()))
        weights = {}
        for token in tokens:
            if len(self.dictionary.freqs()) > 0:
                frequency = self.dictionary.freqs()[token]
                weights[token] = 1.0 / (1.0 + math.log(1.0 + frequency))
            elif token not in stopwords:
                weights[token] = 1
        return {'words': weights, 'norm': math.sqrt(len(tokens))}
コード例 #8
0
ファイル: fairseq.py プロジェクト: ahiroto/ParlAI
class FairseqAgent(Agent):
    """Agent which takes an input sequence and produces an output sequence.

    For more information, see Convolutional Sequence to Sequence Learning
     `(Gehring et al. 2017) <https://arxiv.org/abs/1705.03122>`_.
    """

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        DictionaryAgent.add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Fairseq Arguments')
        agent.add_argument(
            '-tr', '--truncate',
            type=int, default=-1,
            help='truncate input & output lengths to speed up training (may '
                 'reduce accuracy). This fixes all input and output to have a '
                 'maximum length. This reduces the total amount of padding in '
                 'the batches.')
        agent.add_argument(
            '--max-positions',
            default=1024,
            type=int,
            metavar='N',
            help='max number of tokens in the sequence')
        agent.add_argument(
            '--seed',
            default=1,
            type=int,
            metavar='N',
            help='pseudo random number generator seed')
        options.add_optimization_args(argparser)
        options.add_generation_args(argparser)
        options.add_model_args(argparser)

    def __init__(self, opt, shared=None):
        """Build the dictionaries, model, criterion and trainer.

        When ``shared`` is given only the parent initialisation and
        ``reset()`` run. Otherwise, if ``opt['model_file']`` names an
        existing file, its stored options and model state are loaded first.
        """
        # initialize defaults first
        super().__init__(opt, shared)
        if not shared:
            # this is not a shared instance of this class, so do full
            # initialization. if shared is set, only set up shared members.
            saved_state = None
            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, saved_state = self.load(opt['model_file'])
                # override options with stored ones
                opt = self._override_opt(new_opt)

            self.args = OptWrapper(opt)
            self.parlai_dict = DictionaryAgent(opt)
            self.fairseq_dict = _make_fairseq_dict(self.parlai_dict)
            self.id = 'Fairseq'
            self.truncate = opt['truncate'] if opt['truncate'] > 0 else None

            self.EOS = self.fairseq_dict[self.fairseq_dict.eos()]
            self.EOS_TENSOR = (torch.LongTensor(1, 1)
                               .fill_(self.fairseq_dict.eos()))
            self.NULL_IDX = self.fairseq_dict.pad()

            # NOTE(review): eval() runs these option strings as Python code.
            # That is only safe while the options (command line or a loaded
            # model file) come from a trusted source.
            encoder = fconv.FConvEncoder(
                self.fairseq_dict,
                embed_dim=self.args.encoder_embed_dim,
                convolutions=eval(self.args.encoder_layers),
                dropout=self.args.dropout,
                max_positions=self.args.max_positions)
            decoder = fconv.FConvDecoder(
                self.fairseq_dict,
                embed_dim=self.args.decoder_embed_dim,
                convolutions=eval(self.args.decoder_layers),
                out_embed_dim=self.args.decoder_out_embed_dim,
                attention=eval(self.args.decoder_attention),
                dropout=self.args.dropout,
                max_positions=self.args.max_positions)
            self.model = fconv.FConvModel(encoder, decoder)

            # from fairseq's build_criterion()
            if self.args.label_smoothing > 0:
                self.criterion = criterions.LabelSmoothedCrossEntropyCriterion(
                    self.args.label_smoothing, self.NULL_IDX)
            else:
                self.criterion = criterions.CrossEntropyCriterion(
                    self.args, self.fairseq_dict)

            self.trainer = MultiprocessingTrainer(self.args, self.model, self.criterion)
            if saved_state is not None:
                self.set_states(saved_state)
        self.reset()

    def _override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overriden key.
        Only override args specific to the model.
        """
        # NOTE(review): these names are hyphenated, while the options are read
        # elsewhere in this class with underscores (e.g. encoder_embed_dim) --
        # confirm which spelling the stored opt dict actually uses.
        model_args = {
            'arch',
            'encoder-embed-dim',
            'encoder-layers',
            'decoder-embed-dim',
            'decoder-layers',
            'decoder-out-embed-dim',
            'decoder-attention',
        }

        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v}]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def reset(self):
        """Reset observation and episode_done."""
        self.observation = None
        self.episode_done = True

    def observe(self, observation):
        """Store a copy of ``observation``, prefixed with the episode so far.

        While an episode is in progress (and the observation is not marked
        ``'preprocessed'``), the previous observation's text is prepended to
        the new text, separated by a newline.
        """
        # shallow copy observation (deep copy can be expensive)
        observation = observation.copy()
        if not self.episode_done and not observation.get('preprocessed', False):
            # if the last example wasn't the end of an episode, then we need to
            # recall what was said in that example
            prev_dialogue = self.observation['text']
            observation['text'] = prev_dialogue + '\n' + observation['text']
        self.observation = observation
        self.episode_done = observation['episode_done']
        return observation

    def act(self):
        """Act on the stored observation by treating it as a batch of one."""
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def batch_act(self, observations):
        """Train on, or generate replies for, a batch of observations.

        The observations are split into one sub-batch per trainer replica.
        If no sub-batch has targets, predictions are generated and written
        to the ``'text'`` field of the matching replies. Otherwise one
        training step is run and its metrics are attached to the first
        reply.

        :returns: one reply dict per observation, in the same order.
        """
        bsz = len(observations)
        # initialize a table of replies with this agent's id
        batch_reply = [{'id': self.getID()} for _ in range(bsz)]

        # convert the observations into batches of inputs and targets
        # valid_inds tells us the indices of all valid examples
        # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1]
        # since the other three elements had no 'text' field

        # also, split observations into sub-batches based on number of gpus
        obs_split = np.array_split(observations, self.trainer.num_replicas)

        # valid_inds are relative to their sub-batch, so remember where each
        # kept sub-batch starts in `observations`. The start has to advance by
        # the full sub-batch length (not by the number of valid examples, and
        # also for sub-batches that are dropped) or replies get misaligned
        # as soon as one observation has no 'text'.
        samples = []
        offsets = []
        start = 0
        for obs in obs_split:
            sample = self.batchify(obs)
            if sample[0] is not None:
                samples.append(sample)
                offsets.append(start)
            start += len(obs)
        any_valid = any(len(s[0]) > 0 for s in samples)

        if not any_valid:
            # no valid examples, just return the empty responses we set up
            return batch_reply

        # produce predictions if testing; otherwise, train
        has_targets = any(s[1] is not None for s in samples)
        if not has_targets:
            for s, offset in zip(samples, offsets):
                xs = s[0]
                valid_inds = s[2]

                predictions = self._generate(self.args, xs)
                for i, prediction in enumerate(predictions):
                    # map the predictions back to non-empty examples in the batch
                    batch_reply[valid_inds[i] + offset]['text'] = prediction
                    if i == 0:
                        print('prediction:', prediction)
        else:
            loss = self._train(samples)

            batch_reply[0]['metrics'] = {}
            for k, v in loss.items():
                batch_reply[0]['metrics'][k] = v * bsz
                if k == 'loss':
                    try:
                        perplexity = 2 ** v * bsz
                    except OverflowError:
                        perplexity = float('inf')
                    batch_reply[0]['metrics']['perplexity'] = perplexity

        return batch_reply

    def parse(self, string):
        """Tokenize ``string`` and map each token to its fairseq index."""
        return [self.fairseq_dict.index(word)
                for word in self.parlai_dict.tokenize(string)]

    def batchify(self, observations):
        """Convert a list of observations into input & target tensors.

        :returns: ``(xs, ys, valid_inds)``, or three Nones when no
            observation has a ``'text'`` field. ``ys`` is None when the
            first valid example has no ``'labels'``. ``valid_inds`` holds the
            positions, within ``observations``, of the examples used.
        """
        # valid examples
        exs = [ex for ex in observations if 'text' in ex]
        # the indices of the valid (non-empty) tensors
        valid_inds = [i for i, ex in enumerate(observations) if 'text' in ex]

        # set up the input tensors
        batchsize = len(exs)
        if batchsize == 0:
            return None, None, None
        # tokenize the text; the bounded deque keeps only the most recent
        # `truncate` tokens
        parsed_x = [deque(maxlen=self.truncate) for _ in exs]
        for dq, ex in zip(parsed_x, exs):
            dq += self.parse(ex['text'])
        max_x_len = max((len(x) for x in parsed_x))
        for x in parsed_x:
            # left pad with the pad index
            x.extendleft([self.fairseq_dict.pad()] * (max_x_len - len(x)))
        xs = torch.LongTensor(parsed_x)

        # set up the target tensors
        ys = None
        if 'labels' in exs[0]:
            # randomly select one of the labels to update on, if multiple
            labels = [random.choice(ex.get('labels', [''])) for ex in exs]
            parsed_y = [deque(maxlen=self.truncate) for _ in labels]
            for dq, y in zip(parsed_y, labels):
                dq.extendleft(reversed(self.parse(y)))
            # append EOS to each label
            for y in parsed_y:
                y.append(self.fairseq_dict.eos())
            # right pad the labels to a common length
            max_y_len = max(len(y) for y in parsed_y)
            for y in parsed_y:
                y += [self.fairseq_dict.pad()] * (max_y_len - len(y))
            ys = torch.LongTensor(parsed_y)
        return xs, ys, valid_inds

    def _positions_for_tokens(self, tokens):
        """Return a position tensor for ``tokens``.

        Every entry starts at the pad index, non-pad entries are incremented
        by one, and a running sum is then taken along dimension 1.
        """
        size = tokens.size()
        not_pad = tokens.ne(self.fairseq_dict.pad()).long()
        new_pos = tokens.new(size).fill_(self.fairseq_dict.pad())
        new_pos += not_pad
        for i in range(1, size[1]):
            new_pos[:, i] += new_pos[:, i-1] - 1
        return new_pos

    def _right_shifted_ys(self, ys):
        """Return ``ys`` shifted one step right, with EOS in column 0."""
        result = torch.LongTensor(ys.size())
        result[:, 0] = self.fairseq_dict.index(self.EOS)
        result[:, 1:] = ys[:, :-1]
        return result

    def _generate(self, opt, src_tokens):
        """Generate one output string per row of ``src_tokens``.

        The sequence generator is created on first use and cached on
        ``self.translator``. For each input the first returned hypothesis is
        used; its tokens, minus the last one, are joined with spaces.
        """
        if not hasattr(self, 'translator'):
            self.translator = SequenceGenerator(
                [self.trainer.get_model()],
                beam_size=opt.beam,
                stop_early=(not opt.no_early_stop),
                normalize_scores=(not opt.unnormalized),
                len_penalty=opt.lenpen)
            self.translator.cuda()
        # The original call passed `async=True`. `async` has been a reserved
        # keyword since Python 3.7, which made this whole file a SyntaxError.
        # A plain (blocking) copy is accepted by every torch version.
        tokens = src_tokens.cuda()
        translations = self.translator.generate(Variable(tokens))
        results = [t[0] for t in translations]
        output_lines = [[] for _ in range(len(results))]
        for i in range(len(results)):
            output_lines[i] = ' '.join(self.fairseq_dict[idx]
                                       for idx in results[i]['tokens'][:-1])
        return output_lines

    def _train(self, samples):
        """Update the model using the targets.

        Each ``(xs, ys, valid_inds)`` entry of ``samples`` is replaced, in
        place, by the sample dict the trainer consumes.
        """
        for i, sample in enumerate(samples):
            # add extra info to samples
            sample = {
                'src_tokens': sample[0],
                'input_tokens': self._right_shifted_ys(sample[1]),
                'target': sample[1],
                'id': None
            }
            sample['ntokens'] = sum(len(t) for t in sample['target'])
            sample['src_positions'] = self._positions_for_tokens(
                sample['src_tokens'])
            sample['input_positions'] = self._positions_for_tokens(
                sample['input_tokens'])
            samples[i] = sample
        return self.trainer.train_step(samples)

    def save(self, path=None):
        """Save the model state and options to ``path``.

        ``path`` defaults to ``opt['model_file']``; nothing is written when
        no path is available or the agent has no trainer.
        """
        path = self.opt.get('model_file', None) if path is None else path
        if path and hasattr(self, 'trainer'):
            model = {}
            model['state_dict'] = self.trainer.get_model().state_dict()
            model['opt'] = self.opt
            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            model = torch.load(read)
        return model['opt'], model['state_dict']

    def set_states(self, state_dict):
        """Set the state dict of the model from saved states."""
        self.trainer.get_model().load_state_dict(state_dict)
コード例 #9
0
ファイル: ir_baseline.py プロジェクト: rikima/ParlAI
class IrBaselineAgent(Agent):
    """Information-retrieval baseline agent.

    When an observation carries label candidates, they are ranked against a
    weighted bag-of-words representation of the observed text and the
    top-ranked candidate becomes the reply text.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Add the dictionary arguments and the length-penalty option."""
        DictionaryAgent.add_cmdline_args(parser)
        # type=float rejects non-numeric values at parse time instead of
        # failing later in __init__
        parser.add_argument('-lp',
                            '--length_penalty',
                            type=float,
                            default=0.5,
                            help='length penalty for responses')

    def __init__(self, opt, shared=None):
        """Create the agent and its dictionary from ``opt``."""
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt

    def observe(self, obs):
        """Store ``obs`` and forward it to the dictionary; return ``obs``."""
        self.observation = obs
        self.dictionary.observe(obs)
        return obs

    def act(self):
        """Build a reply to the last observation.

        Returns:
            A dict with 'id' and 'text'. When candidates were ranked it also
            holds 'text_candidates' (the ranked list); otherwise the text is
            the fallback "I don't know.".
        """
        # let the dictionary process the observation only while training
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {'id': self.getID()}

        # Rank candidates
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            rep = self.build_query_representation(obs['text'])
            reply['text_candidates'] = rank_candidates(
                rep, obs['label_candidates'], self.length_penalty,
                self.dictionary)
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save the dictionary to ``fname + '.dict'``.

        ``fname`` defaults to ``opt['model_file']``; nothing is written when
        neither is set.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary from ``fname + '.dict'``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build a weighted bag-of-words representation of ``query``.

        The query is lower-cased and tokenized by the dictionary. If the
        dictionary has frequency counts, each token is weighted by
        ``1 / (1 + log(1 + freq))``; otherwise every token that is not a
        stopword gets weight 1.

        Returns:
            A dict with 'words' (token -> weight) and 'norm' (square root of
            the number of tokens, duplicates included).
        """
        words = list(self.dictionary.tokenize(query.lower()))
        # fetch the frequency table once instead of twice per token
        freqs = self.dictionary.freqs()
        weights = {}
        if len(freqs) > 0:
            for w in words:
                weights[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
        else:
            for w in words:
                if w not in stopwords:
                    weights[w] = 1
        return {'words': weights, 'norm': math.sqrt(len(words))}
コード例 #10
0
class LanguageModelAgent(Agent):
    """ Agent which trains an RNN on a language modeling task.

    It is adapted from the language model featured in Pytorch's examples repo
    here: <https://github.com/pytorch/examples/tree/master/word_language_model>.
    """
    @staticmethod
    def dictionary_class():
        """Return the class used to build this agent's dictionary."""
        dict_cls = DictionaryAgent
        return dict_cls

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent.

        Sets ``batch_sort=False`` as a parser default, registers the
        dictionary class's own arguments, then adds a
        'Language Model Arguments' group to ``argparser``.
        """

        def str2bool(value):
            """Parse a textual boolean.

            ``type=bool`` would turn any non-empty string, including
            'False', into True. The empty string stays False, as it was
            under ``bool``.
            """
            text = str(value).strip().lower()
            if text in ('true', 't', 'yes', 'y', '1'):
                return True
            if text in ('false', 'f', 'no', 'n', '0', ''):
                return False
            # argparse turns ValueError into a regular usage error
            raise ValueError('not a boolean value: {!r}'.format(value))

        argparser.set_defaults(batch_sort=False)
        LanguageModelAgent.dictionary_class().add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Language Model Arguments')
        agent.add_argument('-hs',
                           '--hiddensize',
                           type=int,
                           default=200,
                           help='size of the hidden layers')
        agent.add_argument('-esz',
                           '--embeddingsize',
                           type=int,
                           default=200,
                           help='size of the token embeddings')
        agent.add_argument('-nl',
                           '--numlayers',
                           type=int,
                           default=2,
                           help='number of hidden layers')
        agent.add_argument('-lr',
                           '--learningrate',
                           type=float,
                           default=20,
                           help='initial learning rate')
        agent.add_argument('-dr',
                           '--dropout',
                           type=float,
                           default=0.2,
                           help='dropout rate')
        agent.add_argument('-clip',
                           '--gradient-clip',
                           type=float,
                           default=0.25,
                           help='gradient clipping')
        agent.add_argument('--no-cuda',
                           action='store_true',
                           default=False,
                           help='disable GPUs even if available')
        agent.add_argument(
            '-rnn',
            '--rnn-class',
            default='LSTM',
            help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
        agent.add_argument('-sl',
                           '--seq-len',
                           type=int,
                           default=35,
                           help='sequence length')
        agent.add_argument('-tied',
                           '--emb-tied',
                           action='store_true',
                           help='tie the word embedding and softmax weights')
        agent.add_argument('-seed',
                           '--random-seed',
                           type=int,
                           default=1111,
                           help='random seed')
        agent.add_argument('--gpu',
                           type=int,
                           default=-1,
                           help='which GPU device to use')
        agent.add_argument('-tr',
                           '--truncate-pred',
                           type=int,
                           default=50,
                           help='truncate predictions')
        agent.add_argument('-rf',
                           '--report-freq',
                           type=float,
                           default=0.1,
                           help='report frequency of prediction during eval')
        agent.add_argument('-pt',
                           '--person-tokens',
                           type=str2bool,
                           default=True,
                           help='append person1 and person2 tokens to text')
        # adjacent literals are joined without the stray run of spaces that a
        # backslash continuation inside the string would embed
        agent.add_argument(
            '-lrf',
            '--lr-factor',
            type=float,
            default=0.5,
            help='multiply learning rate by this factor when the '
            'validation loss does not decrease')

    def __init__(self, opt, shared=None):
        """Set up the dictionary, model and training state.

        With ``shared`` set, the dictionary (and, if present, the model and
        loaded states) are taken from it. Otherwise the dictionary and the
        RNN model are built here, restoring options and weights from
        ``opt['model_file']`` when that file exists.

        Args:
            opt: option dict; read via ``self.opt`` after the parent init.
            shared: optional dict produced by ``share()``.
        """
        super().__init__(opt, shared)
        # per the original note, the parent init deep-copies opt, so work on
        # the agent's own copy from here on
        opt = self.opt  # there is a deepcopy in the init
        self.states = {}
        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
        self.batchsize = opt.get('batchsize', 1)
        self.use_person_tokens = opt.get('person_tokens', True)

        if shared:
            # set up shared properties
            self.dict = shared['dict']

            if 'model' in shared:
                # model is shared during hogwild
                self.model = shared['model']
                self.states = shared['states']

            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]

            if self.use_person_tokens:
                # add person1 and person2 tokens
                self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
                self.dict.add_to_dict(self.dict.tokenize("PERSON2"))

        else:
            # this is not a shared instance of this class, so do full init
            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, self.states = self.load(opt['model_file'])
                # override model-specific options with stored ones
                opt = self.override_opt(new_opt)

            # NOTE(review): opt['dict_file'] is indexed directly, so the key
            # must already exist in opt (presumably added by the dictionary's
            # command-line arguments; confirm)
            if opt['dict_file'] is None and opt.get('model_file'):
                # set default dict-file if not set
                opt['dict_file'] = opt['model_file'] + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'LanguageModel'

            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]

            if self.use_person_tokens:
                # add person1 and person2 tokens
                self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
                self.dict.add_to_dict(self.dict.tokenize("PERSON2"))

            # set model; built after the person tokens were added so that
            # len(self.dict) already includes them
            self.model = RNNModel(opt, len(self.dict))

            if self.states:
                # set loaded states if applicable
                self.model.load_state_dict(self.states['model'])

            if self.use_cuda:
                self.model.cuda()

        # token indices buffered by observe() until a full sequence exists
        self.next_observe = []
        # sequences buffered by vectorize() until a full batch exists
        self.next_batch = []

        # tracks the current mode so batch_act can reset the hidden state
        # when switching between training and evaluation
        self.is_training = True

        if hasattr(self, 'model'):
            # if model was built, do more setup
            self.clip = opt.get('gradient_clip', 0.25)
            # set up criteria
            self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX)
            if self.use_cuda:
                # push to cuda
                self.criterion.cuda()
            # set up criterion for eval: we do not want to average over size
            self.eval_criterion = nn.CrossEntropyLoss(
                ignore_index=self.NULL_IDX, size_average=False)
            if self.use_cuda:
                # push to cuda
                self.eval_criterion.cuda()
            # init hidden state
            self.hidden = self.model.init_hidden(self.batchsize)
            # init tensor of end tokens
            self.ends = torch.LongTensor(
                [self.END_IDX for _ in range(self.batchsize)])
            if self.use_cuda:
                self.ends = self.ends.cuda()
            # set up optimizer: update_params applies plain SGD with self.lr
            self.lr = opt['learningrate']
            self.lr_factor = opt['lr_factor']
            self.best_val_loss = None

        self.reset()

    def override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overriden key.
        Only override args specific to the model.
        """
        model_args = {
            'hiddensize', 'embeddingsize', 'numlayers', 'dropout', 'seq_len',
            'emb_tied'
        }
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v}]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def parse(self, text):
        """Convert string to token indices."""
        return self.dict.txt2vec(text)

    def zero_grad(self):
        """Zero out optimizer."""
        self.model.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        torch.nn.utils.clip_grad_norm(self.model.parameters(), self.clip)
        for p in self.model.parameters():
            p.data.add_(-self.lr, p.grad.data)

    def reset(self):
        """Reset observation and episode_done."""
        self.observation = None

    def share(self):
        """Return the state that child instances are built from.

        Always contains the dictionary and the NULL/END indices. The model
        and the loaded states are added only when ``opt['numthreads']`` is
        greater than 1, after moving the model to shared memory.
        """
        shared = super().share()
        shared.update({
            'dict': self.dict,
            'NULL_IDX': self.NULL_IDX,
            'END_IDX': self.END_IDX,
        })
        multi_threaded = self.opt.get('numthreads', 1) > 1
        if multi_threaded:
            shared['model'] = self.model
            self.model.share_memory()
            shared['states'] = self.states
        return shared

    def observe(self, observation):
        """Save observation for act.
        If multiple observations are from the same episode, concatenate them.
        """
        #shallow copy observation (deep copy can be expensive)
        obs = observation.copy()
        seq_len = self.opt['seq_len']
        is_training = True
        if 'labels' not in obs:
            is_training = False

        if is_training:
            if 'text' in obs:
                if self.use_person_tokens:
                    obs['text'] = 'PERSON1 ' + obs['text']
                vec = self.parse(obs['text'])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if 'labels' in obs:
                if self.use_person_tokens:
                    labels = [
                        'PERSON2 ' + label for label in obs['labels']
                        if label != ''
                    ]
                    obs['labels'] = tuple(labels)
                vec = self.parse(obs['labels'][0])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if len(self.next_observe) < (seq_len + 1):
                # not enough to return to make a batch
                # we handle this case in vectorize
                # labels indicates that we are training
                self.observation = {'labels': ''}
                return self.observation
            else:
                vecs_to_return = []
                total = len(self.next_observe) // (seq_len + 1)
                for _ in range(total):
                    observe = self.next_observe[:(seq_len + 1)]
                    self.next_observe = self.next_observe[(seq_len + 1):]
                    vecs_to_return.append(observe)
                dict_to_return = {
                    'text': '',
                    'labels': '',
                    'text2vec': vecs_to_return
                }
                self.observation = dict_to_return
                return dict_to_return
        else:
            if 'text' in obs:
                if self.use_person_tokens:
                    obs['text'] = 'PERSON1 ' + obs['text']
            if 'eval_labels' in obs:
                if self.use_person_tokens:
                    eval_labels = [
                        'PERSON2 ' + label for label in obs['eval_labels']
                        if label != ''
                    ]
                    obs['eval_labels'] = tuple(eval_labels)
            self.observation = obs
            return obs

    def repackage_hidden(self, h):
        """Wraps hidden states in new Variables, to detach them from their history."""
        if type(h) == Variable:
            return Variable(h.data)
        else:
            return tuple(self.repackage_hidden(v) for v in h)

    def get_target_loss(self, data, hidden, targets, y_lens):
        """Compute the loss of the targets, token by token.

        The first target token is scored after feeding the input followed by
        the end tokens; each later target token is scored after feeding the
        previous target token. ``self.hidden`` is updated after every model
        call.

        Args:
            data: input tensor; ``data.size(0)`` is used as the batch size.
            hidden: hidden state for the first model call only; later calls
                use ``self.hidden``.
            targets: target tensor, or None when there are no targets.
            y_lens: target lengths; their sum normalizes the summed loss.

        Returns:
            0.0 when ``targets`` is None, otherwise the summed loss divided
            by ``sum(y_lens)``.
        """
        loss = 0.0
        bsz = data.size(0)

        # during interactive mode, when no targets exist, we return 0
        if targets is None:
            return loss

        # feed in inputs without end token
        output, hidden = self.model(data.transpose(0, 1), hidden)
        self.hidden = self.repackage_hidden(hidden)
        # feed in end tokens; the resulting output scores the first target
        output, hidden = self.model(Variable(self.ends[:bsz].view(1, bsz)),
                                    self.hidden)
        self.hidden = self.repackage_hidden(hidden)
        output_flat = output.view(-1, len(self.dict))
        loss += self.eval_criterion(output_flat,
                                    targets.select(1, 0).view(-1)).data

        # remaining targets: feed target i - 1, score target i
        for i in range(1, targets.size(1)):
            output, hidden = self.model(targets.select(1, i - 1).view(1, bsz),
                                        self.hidden,
                                        no_pack=True)
            self.hidden = self.repackage_hidden(hidden)
            output_flat = output.view(-1, len(self.dict))
            loss += self.eval_criterion(output_flat,
                                        targets.select(1, i).view(-1)).data

        # NOTE(review): assumes y_lens is a non-empty sequence with a
        # non-zero sum whenever targets is given; confirm against the caller
        return loss / float(sum(y_lens))

    def get_predictions(self, data):
        """Generate predictions token by token.

        Generation stops once every batch item has produced the end token,
        or after the step counter exceeds ``opt['truncate_pred']`` (steps
        0..truncate_pred inclusive are run).

        Args:
            data: input tensor; ``data.size(0)`` is used as the batch size.

        Returns:
            The per-step predicted indices concatenated along dimension 1,
            or None when no step was run.
        """
        token_list = []
        bsz = data.size(0)
        done = [False for _ in range(bsz)]
        total_done = 0
        # generation starts from a fresh hidden state, not self.hidden
        hidden = self.model.init_hidden(bsz)

        i = 0
        while total_done < bsz and i <= self.opt['truncate_pred']:
            if i == 0:
                # feed in input without end tokens
                output, hidden = self.model(data.transpose(0, 1), hidden)
                hidden = self.repackage_hidden(hidden)
                # feed in end tokens
                output, hidden = self.model(
                    Variable(self.ends[:bsz].view(1, bsz)), hidden)
            else:
                # feed back the tokens chosen in the previous step
                output, hidden = self.model(Variable(word_idx.view(1, bsz)),
                                            hidden,
                                            no_pack=True)
            hidden = self.repackage_hidden(hidden)
            # presumably the model outputs log-probabilities, hence exp();
            # TODO confirm against RNNModel
            word_weights = output.squeeze().data.exp()
            # squeeze() drops the batch dimension when bsz == 1, so the
            # argmax dimension differs between the two cases
            if bsz > 1:
                value, word_idx = torch.max(word_weights, 1)
            else:
                value, word_idx = torch.max(word_weights, 0)
            # mark end indices for items in batch
            for k in range(word_idx.size(0)):
                if not done[k]:
                    if int(word_idx[k]) == self.END_IDX:
                        done[k] = True
                        total_done += 1
            token_list.append(word_idx.view(bsz, 1))
            i += 1

        if token_list:
            return torch.cat(token_list, 1)
        else:
            return None

    def predict(self,
                data,
                hidden,
                targets=None,
                is_training=True,
                y_lens=None):
        """Produce a prediction from our model.
        """
        loss_dict = None
        output = None
        predictions = None
        if is_training:
            self.model.train()
            self.zero_grad()
            output, hidden = self.model(data, hidden)
            loss = self.criterion(output.view(-1, len(self.dict)),
                                  targets.view(-1))
            loss.backward(retain_graph=True)
            self.update_params()
            loss_dict = {'lmloss': loss.data}
            loss_dict['lmppl'] = math.exp(loss.data)
        else:
            self.model.eval()
            predictions = self.get_predictions(data)
            loss_dict = {}
            bsz = data.size(0)
            if bsz != self.batchsize:
                self.hidden = self.model.init_hidden(bsz)
            loss = self.get_target_loss(data, self.hidden, targets, y_lens)
            loss_dict['loss'] = loss
            loss_dict['ppl'] = math.exp(loss)

        return output, hidden, loss_dict, predictions

    def vectorize(self, observations, seq_len, is_training):
        """Convert a list of observations into input & target tensors."""
        labels = None
        valid_inds = None
        y_lens = None
        if is_training:
            for obs in observations:
                if obs:
                    if 'text2vec' in obs:
                        self.next_batch += obs['text2vec']
            if len(self.next_batch) <= self.batchsize:
                return None, None, None, None, None
            else:
                data_list = []
                targets_list = []
                # total is the number of batches
                total = len(self.next_batch) // self.batchsize
                for i in range(total):
                    batch = self.next_batch[:self.batchsize]
                    self.next_batch = self.next_batch[self.batchsize:]

                    source = torch.LongTensor(batch).t().contiguous()
                    data = Variable(source[:seq_len])
                    targets = Variable(source[1:])

                    if self.use_cuda:
                        data = data.cuda()
                        targets = targets.cuda()

                    data_list.append(data)
                    targets_list.append(targets)
        else:
            # here we get valid examples and pad them with zeros
            xs, ys, labels, valid_inds, _, y_lens = PaddingUtils.pad_text(
                observations, self.dict, self.END_IDX, self.NULL_IDX)
            if self.use_cuda:
                xs = Variable(xs).cuda()
                if ys is not None:
                    ys = Variable(ys).cuda()
            else:
                xs = Variable(xs)
                if ys is not None:
                    ys = Variable(ys)
            data_list = [xs]
            targets_list = [ys]

        return data_list, targets_list, labels, valid_inds, y_lens

    def batch_act(self, observations):
        """Process a batch of observations and return one reply per entry.

        The batch counts as training when any observation contains
        'labels'. Switching between training and evaluation re-initializes
        the hidden state. If ``vectorize`` has no batch ready yet, replies
        holding only the agent id are returned.
        """
        batch_reply = [{'id': self.getID()} for _ in range(len(observations))]
        has_labels = any(['labels' in obs for obs in observations])
        if has_labels:
            # if we are starting a new training epoch, reinitialize hidden
            if not self.is_training:
                self.hidden = self.model.init_hidden(self.batchsize)
            self.is_training = True
        elif self.is_training:
            # if we just finished training, reinitialize hidden
            self.hidden = self.model.init_hidden(self.batchsize)
            self.is_training = False

        data_list, targets_list, _, valid_inds, y_lens = self.vectorize(
            observations, self.opt['seq_len'], self.is_training)

        if data_list is None:
            # not enough data to batch act yet, return empty responses
            return batch_reply

        batch_reply = []
        # during evaluation there is exactly one batch; during training
        # vectorize returns every batch available at the time of the call
        for data, targets in zip(data_list, targets_list):
            temp_dicts = [{'id': self.getID()} for _ in observations]
            output, hidden, loss_dict, predictions = self.predict(
                data, self.hidden, targets, self.is_training, y_lens)
            self.hidden = self.repackage_hidden(hidden)

            if predictions is not None:
                # map predictions back to the right order
                PaddingUtils.map_predictions(
                    predictions,
                    valid_inds,
                    temp_dicts,
                    observations,
                    self.dict,
                    self.END_IDX,
                    report_freq=self.opt['report_freq'])

            if loss_dict is not None:
                # metrics are attached to the first reply of the batch
                first = temp_dicts[0]
                if 'metrics' in first:
                    first['metrics'].update(loss_dict)
                else:
                    first['metrics'] = loss_dict

            batch_reply.extend(temp_dicts)

        return batch_reply

    def act(self):
        """Act on the stored observation via ``batch_act`` with a batch of one."""
        replies = self.batch_act([self.observation])
        return replies[0]

    def save(self, path=None):
        """Save model parameters if model_file is set."""
        path = self.opt.get('model_file', None) if path is None else path

        if path and hasattr(self, 'model'):
            model = {}
            model['model'] = self.model.state_dict()
            model['opt'] = self.opt

            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save a ``.shutdown_state`` checkpoint, then shut down the parent."""
        model_file = self.opt.get('model_file', None)
        if model_file is not None:
            # written next to model_file, distinguished by its suffix
            self.save(model_file + '.shutdown_state')
        super().shutdown()

    def receive_metrics(self, metrics_dict):
        if 'loss' in metrics_dict:
            if self.best_val_loss is None:
                self.best_val_loss = metrics_dict['loss']
            else:
                if metrics_dict['loss'] > self.best_val_loss:
                    self.lr *= self.lr_factor
                    print("Updating learning rate: lr =", self.lr)

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            states = torch.load(read)

        return states['opt'], states