def _run_specialtok_test(self, **kwargs):
    """
    Check that adding a special token changes tokenization as expected.

    For each sample special token, a ``DictionaryAgent`` is built from
    ``kwargs``, a sentence ending in the special token is tokenized before
    and after registering the token, and the two tokenizations are compared.

    :param kwargs:
        options forwarded to ``ParlaiParser.parse_kwargs``. If ``dict_file``
        is absent, a path inside a fresh temporary directory is used for
        each special token.
    """
    for special_token in ['SPECIAL TOKENS', '[SPECIAL; TOKENS]']:
        with testing_utils.tempdir() as tmpdir:
            # Work on a per-iteration copy: writing the default dict_file
            # into the caller's kwargs would make the second iteration reuse
            # a path inside the first (already removed) temp directory.
            run_kwargs = dict(kwargs)
            if 'dict_file' not in run_kwargs:
                run_kwargs['dict_file'] = os.path.join(tmpdir, 'dict')
            string = f"This is a test of {special_token}"
            parser = ParlaiParser(False, False)
            DictionaryAgent.add_cmdline_args(parser, partial_opt=None)
            opt = parser.parse_kwargs(**run_kwargs)
            da = DictionaryAgent(opt)
            before = da.tokenize(string)
            da.add_additional_special_tokens([special_token])
            after = da.tokenize(string)
            # the special token must now be kept as one (final) token while
            # the leading five tokens stay untouched
            assert before != after
            assert len(before) > len(after)
            assert after[-1] == special_token
            assert before[:5] == after[:5]
            if opt['dict_tokenizer'] in (
                'bytelevelbpe',
                'gpt2',
                'slow_bytelevel_bpe',
            ):
                # we need to let the dictionary handle the tokenid mappings
                assert da.vec2txt(da.txt2vec(string)) == string
class PolyEncoderTokenizer(PreTrainedTokenizer):
    """
    ``PreTrainedTokenizer`` facade over a ParlAI ``DictionaryAgent``.

    Vocabulary lookups and tokenization are delegated to the dictionary
    built from ``load_poly_encoder_opt()``.
    """

    def __init__(self, **kwargs):
        """Build the backing dictionary and register its special tokens."""
        # The dictionary is created first because the special tokens handed
        # to the parent constructor are read from it.
        self.dict = DictionaryAgent(load_poly_encoder_opt())
        special_tokens = {
            'unk_token': self.dict.unk_token,
            'pad_token': self.dict.null_token,
            'cls_token': self.dict.start_token,
            'sep_token': self.dict.end_token,
        }
        super().__init__(**special_tokens, **kwargs)

    def get_vocab(self):
        """Return the dictionary's token-to-index mapping."""
        return self.dict.tok2ind

    def save_vocabulary(self, save_directory):
        """Do nothing; the vocabulary is not written out by this class."""
        return None

    @property
    def vocab_size(self):
        """Number of entries in the token-to-index mapping."""
        vocab = self.dict.tok2ind
        return len(vocab)

    def _tokenize(self, text, **kwargs):
        """Tokenize ``text`` (coerced to ``str``) with the dictionary."""
        as_string = str(text)
        return self.dict.tokenize(as_string)

    def _convert_token_to_id(self, token):
        """Look up the index of ``token`` in the dictionary."""
        return self.dict[token]

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token if absent."""
        fallback = self.unk_token
        return self.dict.ind2tok.get(index, fallback)

    def convert_tokens_to_string(self, tokens):
        """Join ``tokens`` back into text via the dictionary's BPE decoder."""
        return self.dict.bpe.decode(tokens, token_ids=[], delimiter=' ')

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Wrap one or two id sequences with CLS/SEP ids.

        Single sequence: ``CLS ids_0 SEP``; pair: ``CLS ids_0 SEP ids_1 SEP``.
        """
        start = [self.cls_token_id]
        end = [self.sep_token_id]
        wrapped = start + token_ids_0 + end
        if token_ids_1 is not None:
            wrapped = wrapped + token_ids_1 + end
        return wrapped

    @classmethod
    def from_pretrained(cls, *inputs, **kwargs):
        """Return a fresh instance; all arguments are ignored."""
        return cls()
class IrBaselineAgent(Agent):
    """Information Retrieval baseline."""

    @staticmethod
    def add_cmdline_args(parser):
        """Add command line args specific to this agent."""
        parser = parser.add_argument_group('IrBaseline Arguments')
        parser.add_argument(
            '-lp', '--length_penalty', type=float, default=0.5,
            help='length penalty for responses')
        parser.add_argument(
            '-hsz', '--history_size', type=int, default=1,
            help='number of utterances from the dialogue history to take use '
                 'as the query')
        parser.add_argument(
            '--label_candidates_file', type=str, default=None,
            help='file of candidate responses to choose from')

    def __init__(self, opt, shared=None):
        """
        Initialize agent.

        :param opt: options dict; reads ``length_penalty`` and, optionally,
            ``label_candidates_file``.
        :param shared: accepted for interface compatibility; not used here.
        """
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt
        self.history = []
        self.episodeDone = True
        candidates_path = opt.get('label_candidates_file')
        if candidates_path:
            # use a context manager so the file handle is always released
            with open(candidates_path) as f:
                self.label_candidates = f.read().split('\n')

    def reset(self):
        """Reset agent properties."""
        self.observation = None
        self.history = []
        self.episodeDone = True

    def observe(self, obs):
        """Store and remember incoming observation message dict."""
        self.observation = obs
        self.dictionary.observe(obs)
        if self.episodeDone:
            # previous episode finished: start a fresh history
            self.history = []
        if 'text' in obs:
            self.history.append(obs.get('text', ''))
        self.episodeDone = obs.get('episode_done', False)
        return obs

    def act(self):
        """
        Generate a response to the previously seen observation(s).

        :returns: reply dict with ``id`` and ``text``; when candidates are
            available it also holds the ranked ``text_candidates``.
        """
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # Rank candidates
        cands = None
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            cands = obs['label_candidates']
        if hasattr(self, 'label_candidates'):
            # override label candidates with candidate file if set
            cands = self.label_candidates
        if cands:
            # the query is the last `history_size` utterances of the episode
            hist_sz = self.opt.get('history_size', 1)
            left_idx = max(0, len(self.history) - hist_sz)
            text = ' '.join(self.history[left_idx:len(self.history)])
            rep = self.build_query_representation(text)
            reply['text_candidates'] = rank_candidates(
                rep, cands, self.length_penalty, self.dictionary)
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save dictionary tokenizer if available."""
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load internal dictionary."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """
        Build representation of query, e.g. words or n-grams.

        :param query: string to represent.

        :returns: dictionary containing 'words' dictionary (token => weight)
                  and 'norm' float (square root of the number of tokens)
        """
        words = list(self.dictionary.tokenize(query.lower()))
        weights = {}
        if words:
            # the frequency table does not change while weighting, so look
            # it up once instead of once (or twice) per token
            freqs = self.dictionary.freqs()
            has_freqs = len(freqs) > 0
            for w in words:
                if has_freqs:
                    # rarer tokens get a larger weight
                    weights[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
                elif w not in stopwords:
                    weights[w] = 1
        return {'words': weights, 'norm': math.sqrt(len(words))}
class FairseqAgent(Agent):
    """Agent which takes an input sequence and produces an output sequence.

    For more information, see Convolutional Sequence to Sequence Learning
    `(Gehring et al. 2017) <https://arxiv.org/abs/1705.03122>`_.
    """

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        DictionaryAgent.add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Fairseq Arguments')
        agent.add_argument(
            '-tr', '--truncate', type=int, default=-1,
            help='truncate input & output lengths to speed up training (may '
                 'reduce accuracy). This fixes all input and output to have a '
                 'maximum length. This reduces the total amount of padding in '
                 'the batches.')
        agent.add_argument(
            '--max-positions', default=1024, type=int, metavar='N',
            help='max number of tokens in the sequence')
        agent.add_argument(
            '--seed', default=1, type=int, metavar='N',
            help='pseudo random number generator seed')
        options.add_optimization_args(argparser)
        options.add_generation_args(argparser)
        options.add_model_args(argparser)

    def __init__(self, opt, shared=None):
        """Build dictionaries, model, criterion and trainer unless shared."""
        # initialize defaults first
        super().__init__(opt, shared)
        if not shared:
            # this is not a shared instance of this class, so do full
            # initialization. if shared is set, only set up shared members.
            saved_state = None
            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, saved_state = self.load(opt['model_file'])
                # override options with stored ones
                opt = self._override_opt(new_opt)

            self.args = OptWrapper(opt)
            self.parlai_dict = DictionaryAgent(opt)
            self.fairseq_dict = _make_fairseq_dict(self.parlai_dict)
            self.id = 'Fairseq'
            # a non-positive truncate value means "do not truncate"
            self.truncate = opt['truncate'] if opt['truncate'] > 0 else None

            self.EOS = self.fairseq_dict[self.fairseq_dict.eos()]
            self.EOS_TENSOR = (torch.LongTensor(1, 1)
                               .fill_(self.fairseq_dict.eos()))
            self.NULL_IDX = self.fairseq_dict.pad()

            # NOTE(review): the layer/attention specs below are evaluated
            # with eval(); they come from options or a loaded model file and
            # must not be taken from untrusted input.
            encoder = fconv.FConvEncoder(
                self.fairseq_dict,
                embed_dim=self.args.encoder_embed_dim,
                convolutions=eval(self.args.encoder_layers),
                dropout=self.args.dropout,
                max_positions=self.args.max_positions)
            decoder = fconv.FConvDecoder(
                self.fairseq_dict,
                embed_dim=self.args.decoder_embed_dim,
                convolutions=eval(self.args.decoder_layers),
                out_embed_dim=self.args.decoder_out_embed_dim,
                attention=eval(self.args.decoder_attention),
                dropout=self.args.dropout,
                max_positions=self.args.max_positions)
            self.model = fconv.FConvModel(encoder, decoder)

            # from fairseq's build_criterion()
            if self.args.label_smoothing > 0:
                self.criterion = criterions.LabelSmoothedCrossEntropyCriterion(
                    self.args.label_smoothing, self.NULL_IDX)
            else:
                self.criterion = criterions.CrossEntropyCriterion(
                    self.args, self.fairseq_dict)

            self.trainer = MultiprocessingTrainer(self.args, self.model,
                                                  self.criterion)
            if saved_state is not None:
                self.set_states(saved_state)
        self.reset()

    def _override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overridden key.
        Only override args specific to the model.

        :param new_opt: options dict read from a saved model.
        :returns: ``self.opt`` after the overrides were applied.
        """
        # NOTE(review): these keys are hyphenated while the rest of the class
        # reads underscore names (e.g. args.encoder_embed_dim); presumably
        # only 'arch' ever matches - confirm against the stored opt keys.
        model_args = {
            'arch',
            'encoder-embed-dim',
            'encoder-layers',
            'decoder-embed-dim',
            'decoder-layers',
            'decoder-out-embed-dim',
            'decoder-attention',
        }
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v}]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def reset(self):
        """Reset observation and episode_done."""
        self.observation = None
        self.episode_done = True

    def observe(self, observation):
        """Store the observation, prepending earlier text of the episode."""
        # shallow copy observation (deep copy can be expensive)
        observation = observation.copy()
        if (not self.episode_done and
                not observation.get('preprocessed', False)):
            # if the last example wasn't the end of an episode, then we need
            # to recall what was said in that example
            prev_dialogue = self.observation['text']
            observation['text'] = prev_dialogue + '\n' + observation['text']
        self.observation = observation
        self.episode_done = observation['episode_done']
        return observation

    def act(self):
        """Act on the stored observation as a batch of one."""
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def batch_act(self, observations):
        """
        Train on, or generate replies for, a batch of observations.

        :param observations: list of observation dicts.
        :returns: list of reply dicts, one per observation. When predicting,
            valid examples get a ``text`` entry; when training, the first
            reply carries a ``metrics`` dict.
        """
        bsz = len(observations)
        # initialize a table of replies with this agent's id
        batch_reply = [{'id': self.getID()} for _ in range(bsz)]

        # convert the observations into batches of inputs and targets
        # valid_inds tells us the indices of all valid examples
        # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1]
        # since the other three elements had no 'text' field
        # also, split observations into sub-batches based on number of gpus
        obs_split = np.array_split(observations, self.trainer.num_replicas)
        samples = []
        # position of each kept sub-batch's first observation in the full
        # batch; valid_inds are relative to the sub-batch, so this is what
        # maps a prediction back to the right reply
        offsets = []
        start = 0
        for obs in obs_split:
            sample = self.batchify(obs)
            if sample[0] is not None:
                samples.append(sample)
                offsets.append(start)
            # advance by the sub-batch size (not by the number of valid
            # examples) so skipped/empty observations are accounted for
            start += len(obs)

        any_valid = any(len(s[0]) > 0 for s in samples)
        if not any_valid:
            # no valid examples, just return the empty responses we set up
            return batch_reply

        # produce predictions if testing; otherwise, train
        has_targets = any(s[1] is not None for s in samples)
        if not has_targets:
            for s, offset in zip(samples, offsets):
                xs = s[0]
                valid_inds = s[2]
                predictions = self._generate(self.args, xs)
                for i, prediction in enumerate(predictions):
                    # map the predictions back to non-empty examples in the
                    # batch
                    batch_reply[valid_inds[i] + offset]['text'] = prediction
                    if i == 0:
                        print('prediction:', prediction)
        else:
            loss = self._train(samples)
            batch_reply[0]['metrics'] = {}
            for k, v in loss.items():
                batch_reply[0]['metrics'][k] = v * bsz
        return batch_reply

    def parse(self, string):
        """Tokenize ``string`` and map each token to a fairseq index."""
        return [self.fairseq_dict.index(word)
                for word in self.parlai_dict.tokenize(string)]

    def batchify(self, observations):
        """Convert a list of observations into input & target tensors.

        :returns: ``(xs, ys, valid_inds)``; all three are ``None`` when no
            observation has a 'text' field, and ``ys`` is ``None`` when the
            first valid example has no 'labels'.
        """
        # valid examples
        exs = [ex for ex in observations if 'text' in ex]
        # the indices of the valid (non-empty) tensors
        valid_inds = [i for i, ex in enumerate(observations) if 'text' in ex]

        # set up the input tensors
        batchsize = len(exs)
        if batchsize == 0:
            return None, None, None

        # tokenize the text; bounded deques keep only the last `truncate`
        # tokens of each input
        parsed_x = [deque(maxlen=self.truncate) for _ in exs]
        for dq, ex in zip(parsed_x, exs):
            dq += self.parse(ex['text'])
        max_x_len = max(len(x) for x in parsed_x)
        for x in parsed_x:
            # left pad with the pad index
            x.extendleft([self.fairseq_dict.pad()] * (max_x_len - len(x)))
        xs = torch.LongTensor(parsed_x)

        # set up the target tensors
        ys = None
        if 'labels' in exs[0]:
            # randomly select one of the labels to update on, if multiple
            labels = [random.choice(ex.get('labels', [''])) for ex in exs]
            parsed_y = [deque(maxlen=self.truncate) for _ in labels]
            for dq, y in zip(parsed_y, labels):
                # filling from the left keeps the first `truncate` tokens
                dq.extendleft(reversed(self.parse(y)))
            for y in parsed_y:
                # append EOS to each label
                y.append(self.fairseq_dict.eos())
            max_y_len = max(len(y) for y in parsed_y)
            for y in parsed_y:
                y += [self.fairseq_dict.pad()] * (max_y_len - len(y))
            ys = torch.LongTensor(parsed_y)
        return xs, ys, valid_inds

    def _positions_for_tokens(self, tokens):
        """Build a position tensor for ``tokens`` from its non-pad mask."""
        size = tokens.size()
        not_pad = tokens.ne(self.fairseq_dict.pad()).long()
        new_pos = tokens.new(size).fill_(self.fairseq_dict.pad())
        new_pos += not_pad
        # running sum along the sequence dimension
        for i in range(1, size[1]):
            new_pos[:, i] += new_pos[:, i - 1] - 1
        return new_pos

    def _right_shifted_ys(self, ys):
        """Return ``ys`` shifted right by one with EOS in the first column."""
        result = torch.LongTensor(ys.size())
        result[:, 0] = self.fairseq_dict.index(self.EOS)
        result[:, 1:] = ys[:, :-1]
        return result

    def _generate(self, opt, src_tokens):
        """Generate one output string per row of ``src_tokens``."""
        if not hasattr(self, 'translator'):
            # build the generator lazily on first use
            self.translator = SequenceGenerator(
                [self.trainer.get_model()],
                beam_size=opt.beam,
                stop_early=(not opt.no_early_stop),
                normalize_scores=(not opt.unnormalized),
                len_penalty=opt.lenpen)
            self.translator.cuda()
        # `async` is a reserved word since Python 3.7; torch renamed the
        # keyword argument to `non_blocking`
        tokens = src_tokens.cuda(non_blocking=True)
        translations = self.translator.generate(Variable(tokens))
        # keep the first hypothesis of each example
        results = [t[0] for t in translations]
        # drop the last token of each hypothesis before joining
        return [
            ' '.join(self.fairseq_dict[idx] for idx in result['tokens'][:-1])
            for result in results
        ]

    def _train(self, samples):
        """Update the model using the targets."""
        for i, sample in enumerate(samples):
            # add extra info to samples
            sample = {
                'src_tokens': sample[0],
                'input_tokens': self._right_shifted_ys(sample[1]),
                'target': sample[1],
                'id': None
            }
            sample['ntokens'] = sum(len(t) for t in sample['target'])
            sample['src_positions'] = self._positions_for_tokens(
                sample['src_tokens'])
            sample['input_positions'] = self._positions_for_tokens(
                sample['input_tokens'])
            samples[i] = sample
        return self.trainer.train_step(samples)

    def save(self, path=None):
        """Save the model state dict and opt to ``path`` (or model_file)."""
        path = self.opt.get('model_file', None) if path is None else path
        if path and hasattr(self, 'trainer'):
            model = {}
            model['state_dict'] = self.trainer.get_model().state_dict()
            model['opt'] = self.opt
            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def load(self, path):
        """Return opt and model states."""
        model = torch.load(path, map_location=lambda cpu, _: cpu)
        return model['opt'], model['state_dict']

    def set_states(self, state_dict):
        """Set the state dict of the model from saved states."""
        self.trainer.get_model().load_state_dict(state_dict)
class LanguageModelAgent(Agent):
    """
    Agent which trains an RNN on a language modeling task.

    It is adapted from the language model featured in Pytorch's examples repo
    here: <https://github.com/pytorch/examples/tree/master/word_language_model>.
    """

    @staticmethod
    def dictionary_class():
        """Return the dictionary class used by this agent."""
        return DictionaryAgent

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        argparser.set_defaults(batch_sort=False)
        agent = argparser.add_argument_group('Language Model Arguments')
        agent.add_argument(
            '--init-model', type=str, default=None,
            help='load dict/features/weights/opts from this file')
        agent.add_argument('-hs', '--hiddensize', type=int, default=200,
                           help='size of the hidden layers')
        agent.add_argument('-esz', '--embeddingsize', type=int, default=200,
                           help='size of the token embeddings')
        agent.add_argument('-nl', '--numlayers', type=int, default=2,
                           help='number of hidden layers')
        agent.add_argument('-dr', '--dropout', type=float, default=0.2,
                           help='dropout rate')
        agent.add_argument('-clip', '--gradient-clip', type=float,
                           default=0.25, help='gradient clipping')
        agent.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
        agent.add_argument(
            '-rnn', '--rnn-class', default='LSTM',
            help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
        agent.add_argument('-sl', '--seq-len', type=int, default=35,
                           help='sequence length')
        agent.add_argument('-tied', '--emb-tied', action='store_true',
                           help='tie the word embedding and softmax weights')
        agent.add_argument('-seed', '--random-seed', type=int, default=1111,
                           help='random seed')
        agent.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
        agent.add_argument('-tr', '--truncate-pred', type=int, default=50,
                           help='truncate predictions')
        agent.add_argument('-rf', '--report-freq', type=float, default=0.1,
                           help='report frequency of prediction during eval')
        agent.add_argument('-pt', '--person-tokens', type='bool',
                           default=True,
                           help='append person1 and person2 tokens to text')
        # learning rate parameters
        agent.add_argument('-lr', '--learningrate', type=float, default=20,
                           help='initial learning rate')
        # help texts use implicit string concatenation so that no stray
        # backslash/indentation ends up inside the message
        agent.add_argument(
            '-lrf', '--lr-factor', type=float, default=1.0,
            help='multiply learning rate by this factor when the '
                 'validation loss does not decrease')
        agent.add_argument('-lrp', '--lr-patience', type=int, default=10,
                           help='wait before decreasing learning rate')
        agent.add_argument('-lrm', '--lr-minimum', type=float, default=0.1,
                           help='minimum learning rate')
        agent.add_argument(
            '-sm', '--sampling-mode', type='bool', default=False,
            help='sample when generating tokens instead of taking '
                 'the max and do not produce UNK token (when bs=1)')
        LanguageModelAgent.dictionary_class().add_cmdline_args(argparser)
        return agent

    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init
        self.metrics = {
            'loss': 0,
            'num_tokens': 0,
            'lmloss': 0,
            'lm_num_tokens': 0
        }
        self.states = {}
        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
        self.batchsize = opt.get('batchsize', 1)
        self.use_person_tokens = opt.get('person_tokens', True)
        self.sampling_mode = opt.get('sampling_mode', False)

        if shared:
            # set up shared properties
            self.opt = shared['opt']
            opt = self.opt
            self.dict = shared['dict']
            self.model = shared['model']
            self.metrics = shared['metrics']

            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]

            if 'states' in shared:
                self.states = shared['states']

            if self.use_person_tokens:
                # add person1 and person2 tokens
                self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
                self.dict.add_to_dict(self.dict.tokenize("PERSON2"))
        else:
            # this is not a shared instance of this class, so do full init
            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            init_model = None
            # check first for 'init_model' for loading model from file
            if opt.get('init_model') and os.path.isfile(opt['init_model']):
                init_model = opt['init_model']
            # next check for 'model_file', this would override init_model
            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                init_model = opt['model_file']

            # for backwards compatibility: will only be called for older
            # models for which .opt file does not exist
            if (init_model is not None and
                    not os.path.isfile(init_model + '.opt')):
                new_opt = self.load_opt(init_model)
                # load model parameters if available
                print('[ Setting opt from {} ]'.format(init_model))
                # since .opt file does not exist, save one for future use
                print("Saving opt file at:", init_model + ".opt")
                with open(init_model + ".opt", 'wb') as handle:
                    pickle.dump(new_opt, handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                opt = self.override_opt(new_opt)

            # Point dict_file next to the init model when that dictionary
            # exists or no dictionary was given. Guarding on init_model first
            # avoids evaluating `None + '.dict'` when there is no init model.
            if init_model is not None and (
                    os.path.isfile(init_model + '.dict') or
                    opt['dict_file'] is None):
                opt['dict_file'] = init_model + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'LanguageModel'

            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]

            if self.use_person_tokens:
                # add person1 and person2 tokens
                self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
                self.dict.add_to_dict(self.dict.tokenize("PERSON2"))

            # set model
            self.model = RNNModel(opt, len(self.dict))

            if init_model is not None:
                self.load(init_model)

            if self.use_cuda:
                self.model.cuda()

        # token buffers filled by observe() / vectorize() during training
        self.next_observe = []
        self.next_batch = []

        self.is_training = True

        self.clip = opt.get('gradient_clip', 0.25)
        # set up criteria
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX,
                                             size_average=False)
        if self.use_cuda:
            # push to cuda
            self.criterion.cuda()
        # init hidden state
        self.hidden = self.model.init_hidden(self.batchsize)
        # init tensor of end tokens
        self.ends = torch.LongTensor(
            [self.END_IDX for _ in range(self.batchsize)])
        if self.use_cuda:
            self.ends = self.ends.cuda()
        # set up model and learning rate scheduler parameters
        self.lr = opt['learningrate']
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr)
        self.best_val_loss = self.states.get('best_val_loss', None)
        self.lr_factor = opt['lr_factor']
        if self.lr_factor < 1.0:
            self.lr_patience = opt['lr_patience']
            self.lr_min = opt['lr_minimum']
            self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer, factor=self.lr_factor, verbose=True,
                patience=self.lr_patience, min_lr=self.lr_min)
            # initial step for scheduler if self.best_val_loss is initialized
            if self.best_val_loss is not None:
                self.scheduler.step(self.best_val_loss)
        else:
            self.scheduler = None

        self.reset()

    def override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overridden key.
        Only override args specific to the model.

        :param new_opt: options dict read from a saved model.
        :returns: ``self.opt`` after the overrides were applied.
        """
        model_args = {
            'hiddensize', 'embeddingsize', 'numlayers', 'dropout', 'seq_len',
            'emb_tied', 'truncate_pred', 'report_freq', 'person_tokens',
            'learningrate'
        }
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v}]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def parse(self, text):
        """Convert string to token indices."""
        return self.dict.txt2vec(text)

    def zero_grad(self):
        """Zero out optimizer."""
        self.optimizer.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
        self.optimizer.step()

    def reset(self):
        """Reset observation and metrics."""
        self.observation = None
        self.reset_metrics()

    def reset_metrics(self):
        """Clear the metrics table and zero its four counters."""
        self.metrics.clear()
        self.metrics['loss'] = 0
        self.metrics['lmloss'] = 0
        self.metrics['num_tokens'] = 0
        self.metrics['lm_num_tokens'] = 0

    def report(self):
        """Return per-token loss and perplexity for the counters in use."""
        m = {}
        if self.metrics['num_tokens'] > 0:
            m['loss'] = self.metrics['loss'] / self.metrics['num_tokens']
            m['ppl'] = math.exp(m['loss'])
        if self.metrics['lm_num_tokens'] > 0:
            m['lmloss'] = (self.metrics['lmloss'] /
                           self.metrics['lm_num_tokens'])
            m['lmppl'] = math.exp(m['lmloss'])
        for k, v in m.items():
            # clean up: rounds to sigfigs and converts tensors to floats
            m[k] = round_sigfigs(v, 4)
        return m

    def share(self):
        """Share internal states between parent and child instances."""
        shared = super().share()
        shared['opt'] = self.opt
        shared['dict'] = self.dict
        shared['NULL_IDX'] = self.NULL_IDX
        shared['END_IDX'] = self.END_IDX
        shared['model'] = self.model
        if self.opt.get('numthreads', 1) > 1:
            # exact-type check on purpose: only a plain dict still needs to
            # be wrapped
            if type(self.metrics) is dict:
                # move metrics and model to shared memory
                self.metrics = SharedTable(self.metrics)
                self.model.share_memory()
            shared['states'] = {
                # only need to pass optimizer states
                'optimizer': self.optimizer.state_dict(),
            }
        shared['metrics'] = self.metrics
        return shared

    def observe(self, observation):
        """Save observation for act.

        If multiple observations are from the same episode, concatenate them.

        During training (``'labels'`` present) text and first label are
        appended to a running token buffer, and the buffer is cut into
        chunks of ``seq_len + 1`` tokens returned under ``'text2vec'``.
        Otherwise the (optionally person-tagged) observation is stored as is.
        """
        # shallow copy observation (deep copy can be expensive)
        obs = observation.copy()
        seq_len = self.opt['seq_len']
        is_training = 'labels' in obs

        if is_training:
            if 'text' in obs:
                if self.use_person_tokens:
                    obs['text'] = 'PERSON1 ' + obs['text']
                vec = self.parse(obs['text'])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if 'labels' in obs:
                if self.use_person_tokens:
                    labels = [
                        'PERSON2 ' + label
                        for label in obs['labels']
                        if label != ''
                    ]
                    obs['labels'] = tuple(labels)
                vec = self.parse(obs['labels'][0])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if len(self.next_observe) < (seq_len + 1):
                # not enough to return to make a batch
                # we handle this case in vectorize
                # labels indicates that we are training
                self.observation = {'labels': ''}
                return self.observation
            else:
                vecs_to_return = []
                total = len(self.next_observe) // (seq_len + 1)
                for _ in range(total):
                    observe = self.next_observe[:(seq_len + 1)]
                    self.next_observe = self.next_observe[(seq_len + 1):]
                    vecs_to_return.append(observe)
                dict_to_return = {
                    'text': '',
                    'labels': '',
                    'text2vec': vecs_to_return
                }
                self.observation = dict_to_return
                return dict_to_return
        else:
            if 'text' in obs:
                if self.use_person_tokens:
                    obs['text'] = 'PERSON1 ' + obs['text']
            if 'eval_labels' in obs:
                if self.use_person_tokens:
                    eval_labels = [
                        'PERSON2 ' + label
                        for label in obs['eval_labels']
                        if label != ''
                    ]
                    obs['eval_labels'] = tuple(eval_labels)
            self.observation = obs
            return obs

    def repackage_hidden(self, h):
        """Wrap hidden states in new Variables to detach them from history."""
        if isinstance(h, Variable):
            return Variable(h.data)
        else:
            return tuple(self.repackage_hidden(v) for v in h)

    def get_target_loss(self, data, hidden, targets):
        """Calculate the loss with respect to the targets, token by token.

        Each output token is conditioned on either the input or the previous
        target token.

        :returns: accumulated loss, or ``0.0`` when ``targets`` is ``None``.
        """
        loss = 0.0
        bsz = data.size(0)

        # during interactive mode, when no targets exist, we return 0
        if targets is None:
            return loss

        # feed in inputs without end token
        output, hidden = self.model(data.transpose(0, 1), hidden)
        self.hidden = self.repackage_hidden(hidden)
        # feed in end tokens
        output, hidden = self.model(Variable(self.ends[:bsz].view(1, bsz)),
                                    self.hidden)
        self.hidden = self.repackage_hidden(hidden)
        output_flat = output.view(-1, len(self.dict))
        loss += self.criterion(output_flat,
                               targets.select(1, 0).view(-1)).data

        for i in range(1, targets.size(1)):
            # condition on the previous target token
            output, hidden = self.model(
                targets.select(1, i - 1).view(1, bsz),
                self.hidden, no_pack=True)
            self.hidden = self.repackage_hidden(hidden)
            output_flat = output.view(-1, len(self.dict))
            loss += self.criterion(output_flat,
                                   targets.select(1, i).view(-1)).data

        return loss

    def get_predictions(self, data):
        """Generate predictions word by word.

        Stops when every item produced the end token or after
        ``opt['truncate_pred']`` steps.
        """
        token_list = []
        bsz = data.size(0)
        done = [False for _ in range(bsz)]
        total_done = 0
        hidden = self.model.init_hidden(bsz)

        i = 0
        word_idx = None
        while total_done < bsz and i <= self.opt['truncate_pred']:
            if i == 0:
                # feed in input without end tokens
                output, hidden = self.model(data.transpose(0, 1), hidden)
                hidden = self.repackage_hidden(hidden)
                # feed in end tokens
                output, hidden = self.model(
                    Variable(self.ends[:bsz].view(1, bsz)), hidden)
            else:
                output, hidden = self.model(Variable(word_idx.view(1, bsz)),
                                            hidden, no_pack=True)
            hidden = self.repackage_hidden(hidden)
            word_weights = output.squeeze().data.exp()
            if bsz > 1:
                _, word_idx = torch.max(word_weights, 1)
            else:
                if self.sampling_mode:
                    unk_idx = self.dict[self.dict.unk_token]
                    # make word_weights have smaller norm so that calculated
                    # norm does not blow up
                    word_weights = word_weights.div(1e10)
                    # make word_weights have L2 norm 1
                    ww_norm = torch.norm(word_weights, p=2)
                    word_weights = word_weights.div(ww_norm)
                    # square distribution
                    word_weights = torch.mul(word_weights, word_weights)
                    # sample distribution
                    word_idx = torch.multinomial(word_weights, 1)
                    # do not produce UNK token
                    while word_idx == unk_idx:
                        word_idx = torch.multinomial(word_weights, 1)
                else:
                    _, word_idx = torch.max(word_weights, 0)
            # mark end indices for items in batch
            word_idx = word_idx.view(-1)
            for k in range(word_idx.size(0)):
                if not done[k]:
                    if int(word_idx[k]) == self.END_IDX:
                        done[k] = True
                        total_done += 1
            token_list.append(word_idx.view(bsz, 1))
            i += 1

        return torch.cat(token_list, 1)

    def predict(self, data, hidden, targets=None, is_training=True,
                y_lens=None):
        """Produce a prediction from our model.

        In training mode one optimization step is taken and ``predictions``
        stays ``None``; otherwise predictions are generated and, if targets
        are given, the evaluation loss counters are updated.

        :returns: ``(output, hidden, predictions)``.
        """
        output = None
        predictions = None
        if is_training:
            self.model.train()
            self.zero_grad()
            output, hidden = self.model(data, hidden)
            loss = self.criterion(output.view(-1, len(self.dict)),
                                  targets.view(-1))
            # save loss to metrics
            target_tokens = targets.ne(self.NULL_IDX).float().sum().item()
            self.metrics['lmloss'] += loss.double().item()
            self.metrics['lm_num_tokens'] += target_tokens
            # average loss per token
            loss /= target_tokens
            loss.backward(retain_graph=True)
            self.update_params()
        else:
            self.model.eval()
            predictions = self.get_predictions(data)
            bsz = data.size(0)
            if bsz != self.batchsize:
                # hidden state must match the actual batch size
                self.hidden = self.model.init_hidden(bsz)
            if targets is not None:
                loss = self.get_target_loss(data, self.hidden, targets)
                self.metrics['loss'] += loss
                self.metrics['num_tokens'] += sum(y_lens)

        return output, hidden, predictions

    def vectorize(self, observations, seq_len, is_training):
        """Convert a list of observations into input & target tensors.

        :returns: ``(data_list, targets_list, labels, valid_inds, y_lens)``;
            five ``None`` values while training if not enough chunks are
            buffered yet.
        """
        labels = None
        valid_inds = None
        y_lens = None
        if is_training:
            for obs in observations:
                if obs:
                    if 'text2vec' in obs:
                        self.next_batch += obs['text2vec']
            if len(self.next_batch) <= self.batchsize:
                return None, None, None, None, None
            else:
                data_list = []
                targets_list = []
                # total is the number of batches
                total = len(self.next_batch) // self.batchsize
                for _ in range(total):
                    batch = self.next_batch[:self.batchsize]
                    self.next_batch = self.next_batch[self.batchsize:]

                    source = torch.LongTensor(batch).t().contiguous()
                    # targets are the inputs shifted by one token
                    data = Variable(source[:seq_len])
                    targets = Variable(source[1:])

                    if self.use_cuda:
                        data = data.cuda()
                        targets = targets.cuda()

                    data_list.append(data)
                    targets_list.append(targets)
        else:
            # here we get valid examples and pad them with zeros
            xs, ys, labels, valid_inds, _, y_lens = PaddingUtils.pad_text(
                observations, self.dict, end_idx=self.END_IDX,
                null_idx=self.NULL_IDX)

            if self.use_cuda:
                if xs is not None:
                    xs = Variable(torch.LongTensor(xs)).cuda()
                if ys is not None:
                    ys = Variable(torch.LongTensor(ys)).cuda()
            else:
                if xs is not None:
                    xs = Variable(torch.LongTensor(xs))
                if ys is not None:
                    ys = Variable(torch.LongTensor(ys))
            data_list = [xs]
            targets_list = [ys]

        return data_list, targets_list, labels, valid_inds, y_lens

    def batch_act(self, observations):
        """Train on or predict for a batch of observations.

        :returns: list of reply dicts.
        """
        batch_reply = [{'id': self.getID()} for _ in range(len(observations))]
        if any('labels' in obs for obs in observations):
            # if we are starting a new training epoch, reinitialize hidden
            if not self.is_training:
                self.hidden = self.model.init_hidden(self.batchsize)
            self.is_training = True
            data_list, targets_list, _c, _v, y_lens = self.vectorize(
                observations, self.opt['seq_len'], self.is_training)
        else:
            # if we just finished training, reinitialize hidden
            if self.is_training:
                self.hidden = self.model.init_hidden(self.batchsize)
                self.is_training = False
            data_list, targets_list, labels, valid_inds, y_lens = \
                self.vectorize(observations, self.opt['seq_len'],
                               self.is_training)

        if data_list is None:
            # not enough data to batch act yet, return empty responses
            return batch_reply

        batch_reply = []
        # during evaluation, len(data_list) is always 1
        # during training, len(data_list) >= 0: vectorize returns a list
        # containing all batches available at the time it is called
        for i in range(len(data_list)):
            temp_dicts = [{
                'id': self.getID()
            } for _ in range(len(observations))]
            # ignore case when we do not return any valid indices
            if data_list[i] is not None:
                output, hidden, predictions = self.predict(
                    data_list[i], self.hidden, targets_list[i],
                    self.is_training, y_lens)
                self.hidden = self.repackage_hidden(hidden)

                if predictions is not None:
                    # map predictions back to the right order
                    PaddingUtils.map_predictions(
                        predictions.cpu(), valid_inds, temp_dicts,
                        observations, self.dict, self.END_IDX,
                        report_freq=self.opt['report_freq'])

            batch_reply += temp_dicts

        # for prediction metrics computations, we get rid of PERSON1 and
        # PERSON2 tokens
        if not self.is_training:
            for reply in batch_reply:
                if 'text' in reply:
                    reply['text'] = reply['text'].replace('PERSON1 ', '')
                    reply['text'] = reply['text'].replace('PERSON2 ', '')

        return batch_reply

    def act(self):
        """Act on the stored observation as a batch of one."""
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def save(self, path=None):
        """Save model parameters if model_file is set."""
        path = self.opt.get('model_file', None) if path is None else path

        if path and hasattr(self, 'model'):
            model = {}
            model['model'] = self.model.state_dict()
            model['opt'] = self.opt
            model['best_val_loss'] = self.best_val_loss

            with open(path, 'wb') as write:
                torch.save(model, write)
            # save opt file
            with open(path + ".opt", 'wb') as handle:
                pickle.dump(self.opt, handle,
                            protocol=pickle.HIGHEST_PROTOCOL)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def receive_metrics(self, metrics_dict):
        """Step the learning-rate scheduler, if any, with the given loss."""
        if 'loss' in metrics_dict and self.scheduler is not None:
            self.scheduler.step(metrics_dict['loss'])

    def load_opt(self, path):
        """Return the opt stored in the model file at ``path``."""
        states = torch.load(path, map_location=lambda cpu, _: cpu)
        return states['opt']

    def load(self, path):
        """Load model states."""
        if os.path.isfile(path):
            # load model parameters if available
            print('[ Loading existing model params from {} ]'.format(path))
            self.states = torch.load(path, map_location=lambda cpu, _: cpu)
            self.model.load_state_dict(self.states['model'])
class IrBaselineAgent(Agent):
    """Retrieval baseline that ranks the observed label candidates.

    The query is the concatenation of the most recent utterances of the
    current episode (``history_size`` of them); candidates are scored by
    ``rank_candidates`` using the agent's dictionary.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Add the dictionary arguments and this agent's own options."""
        DictionaryAgent.add_cmdline_args(parser)
        parser.add_argument(
            '-lp', '--length_penalty', type=float, default=0.5,
            help='length penalty for responses')
        parser.add_argument(
            '-hsz', '--history_size', type=int, default=1,
            help='number of utterances from the dialogue history to take use as the query')

    def __init__(self, opt, shared=None):
        """Build the dictionary and start with an empty dialogue history."""
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt
        self.history = []
        self.episodeDone = True

    def reset(self):
        """Forget the last observation and the dialogue history."""
        self.observation = None
        self.history = []
        self.episodeDone = True

    def observe(self, obs):
        """Store the observation and append its text to the history.

        The history is cleared first when the previous observation ended
        its episode. Returns ``obs`` unchanged.
        """
        self.observation = obs
        self.dictionary.observe(obs)
        if self.episodeDone:
            self.history = []
        if 'text' in obs:
            self.history.append(obs.get('text', ''))
        self.episodeDone = obs.get('episode_done', False)
        return obs

    def act(self):
        """Return a reply dict with the best-ranked candidate as ``text``.

        When the observation carries no (or empty) ``label_candidates`` the
        reply text is the fixed string "I don't know.".
        """
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {'id': self.getID()}

        # Rank candidates
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            # query on the last `history_size` utterances of the episode
            hist_sz = self.opt.get('history_size', 1)
            left_idx = max(0, len(self.history) - hist_sz)
            text = ' '.join(self.history[left_idx:])
            rep = self.build_query_representation(text)
            reply['text_candidates'] = rank_candidates(
                rep, obs['label_candidates'],
                self.length_penalty, self.dictionary)
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save the dictionary to ``<fname>.dict``.

        ``fname`` defaults to ``opt['model_file']``; nothing is written when
        neither is set.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary from ``<fname>.dict``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build a weighted bag-of-words representation of ``query``.

        Returns a dict with:
          * ``'words'``: token -> weight. When the dictionary has frequency
            counts the weight is ``1 / (1 + log(1 + freq))``; otherwise every
            token not in ``stopwords`` gets weight 1.
          * ``'norm'``: square root of the number of tokens in the query.
        """
        words = list(self.dictionary.tokenize(query.lower()))
        # the frequency table does not change while we iterate, so fetch it
        # (and decide which weighting scheme applies) once up front
        freqs = self.dictionary.freqs()
        use_freqs = len(freqs) > 0
        weights = {}
        for w in words:
            if use_freqs:
                weights[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
            elif w not in stopwords:
                weights[w] = 1
        return {'words': weights, 'norm': math.sqrt(len(words))}
class IrBaselineAgent(Agent):
    """Retrieval baseline that ranks the observed label candidates.

    The text of the latest observation is used as the query; candidates
    are scored by ``rank_candidates`` using the agent's dictionary.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Add the dictionary arguments and the length-penalty option."""
        DictionaryAgent.add_cmdline_args(parser)
        # type=float so a value given on the command line is parsed as a
        # number rather than stored as a string
        parser.add_argument(
            '-lp', '--length_penalty', type=float, default=0.5,
            help='length penalty for responses')

    def __init__(self, opt, shared=None):
        """Build the dictionary and read the length penalty from ``opt``."""
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt

    def observe(self, obs):
        """Store the observation, pass it to the dictionary, and return it."""
        self.observation = obs
        self.dictionary.observe(obs)
        return obs

    def act(self):
        """Return a reply dict with the best-ranked candidate as ``text``.

        When the observation carries no (or empty) ``label_candidates`` the
        reply text is the fixed string "I don't know.".
        """
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {'id': self.getID()}

        # Rank candidates
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            rep = self.build_query_representation(obs['text'])
            reply['text_candidates'] = rank_candidates(
                rep, obs['label_candidates'],
                self.length_penalty, self.dictionary)
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save the dictionary to ``<fname>.dict``.

        ``fname`` defaults to ``opt['model_file']``; nothing is written when
        neither is set.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary from ``<fname>.dict``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build a weighted bag-of-words representation of ``query``.

        Returns a dict with:
          * ``'words'``: token -> weight. When the dictionary has frequency
            counts the weight is ``1 / (1 + log(1 + freq))``; otherwise every
            token not in ``stopwords`` gets weight 1.
          * ``'norm'``: square root of the number of tokens in the query.
        """
        words = list(self.dictionary.tokenize(query.lower()))
        # the frequency table does not change while we iterate, so fetch it
        # (and decide which weighting scheme applies) once up front
        freqs = self.dictionary.freqs()
        use_freqs = len(freqs) > 0
        weights = {}
        for w in words:
            if use_freqs:
                weights[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
            elif w not in stopwords:
                weights[w] = 1
        return {'words': weights, 'norm': math.sqrt(len(words))}
class FairseqAgent(Agent):
    """Agent which takes an input sequence and produces an output sequence.

    For more information, see Convolutional Sequence to Sequence Learning
    `(Gehring et al. 2017) <https://arxiv.org/abs/1705.03122>`_.
    """

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        DictionaryAgent.add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Fairseq Arguments')
        agent.add_argument(
            '-tr', '--truncate', type=int, default=-1,
            help='truncate input & output lengths to speed up training (may '
                 'reduce accuracy). This fixes all input and output to have a '
                 'maximum length. This reduces the total amount of padding in '
                 'the batches.')
        agent.add_argument(
            '--max-positions', default=1024, type=int, metavar='N',
            help='max number of tokens in the sequence')
        agent.add_argument(
            '--seed', default=1, type=int, metavar='N',
            help='pseudo random number generator seed')
        options.add_optimization_args(argparser)
        options.add_generation_args(argparser)
        options.add_model_args(argparser)

    def __init__(self, opt, shared=None):
        """Build dictionaries, model, criterion and trainer.

        When ``shared`` is given none of these are built; only ``reset`` is
        called. When ``opt['model_file']`` exists on disk, its stored options
        and weights are loaded.
        """
        # initialize defaults first
        super().__init__(opt, shared)
        if not shared:
            # this is not a shared instance of this class, so do full
            # initialization. if shared is set, only set up shared members.
            saved_state = None
            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, saved_state = self.load(opt['model_file'])
                # override options with stored ones
                opt = self._override_opt(new_opt)

            self.args = OptWrapper(opt)
            self.parlai_dict = DictionaryAgent(opt)
            self.fairseq_dict = _make_fairseq_dict(self.parlai_dict)
            self.id = 'Fairseq'
            # None means "no truncation" (unbounded deques in batchify)
            self.truncate = opt['truncate'] if opt['truncate'] > 0 else None

            self.EOS = self.fairseq_dict[self.fairseq_dict.eos()]
            self.EOS_TENSOR = (torch.LongTensor(1, 1)
                               .fill_(self.fairseq_dict.eos()))
            self.NULL_IDX = self.fairseq_dict.pad()

            # NOTE(review): the layer/attention specs are option strings
            # turned into Python objects with eval(); this is only safe when
            # the options (and any loaded model file) come from a trusted
            # source.
            encoder = fconv.FConvEncoder(
                self.fairseq_dict,
                embed_dim=self.args.encoder_embed_dim,
                convolutions=eval(self.args.encoder_layers),
                dropout=self.args.dropout,
                max_positions=self.args.max_positions)
            decoder = fconv.FConvDecoder(
                self.fairseq_dict,
                embed_dim=self.args.decoder_embed_dim,
                convolutions=eval(self.args.decoder_layers),
                out_embed_dim=self.args.decoder_out_embed_dim,
                attention=eval(self.args.decoder_attention),
                dropout=self.args.dropout,
                max_positions=self.args.max_positions)
            self.model = fconv.FConvModel(encoder, decoder)

            # from fairseq's build_criterion()
            if self.args.label_smoothing > 0:
                self.criterion = criterions.LabelSmoothedCrossEntropyCriterion(
                    self.args.label_smoothing, self.NULL_IDX)
            else:
                self.criterion = criterions.CrossEntropyCriterion(
                    self.args, self.fairseq_dict)

            self.trainer = MultiprocessingTrainer(self.args, self.model,
                                                  self.criterion)
            if saved_state is not None:
                self.set_states(saved_state)
        self.reset()

    def _override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overriden key.
        Only override args specific to the model.
        """
        # NOTE(review): these names are hyphenated while the rest of the
        # class reads the same settings with underscores (e.g.
        # self.args.encoder_embed_dim) -- confirm which spelling the stored
        # opt actually uses, otherwise nothing here ever matches.
        model_args = {
            'arch',
            'encoder-embed-dim',
            'encoder-layers',
            'decoder-embed-dim',
            'decoder-layers',
            'decoder-out-embed-dim',
            'decoder-attention',
        }
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v}]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def reset(self):
        """Reset observation and episode_done."""
        self.observation = None
        self.episode_done = True

    def observe(self, observation):
        """Store the observation, prepending earlier text of the episode.

        Returns the (shallow-copied, possibly extended) observation.
        """
        # shallow copy observation (deep copy can be expensive)
        observation = observation.copy()
        if not self.episode_done and not observation.get('preprocessed', False):
            # if the last example wasn't the end of an episode, then we need to
            # recall what was said in that example
            prev_dialogue = self.observation['text']
            observation['text'] = prev_dialogue + '\n' + observation['text']
        self.observation = observation
        self.episode_done = observation['episode_done']
        return observation

    def act(self):
        """Act on the stored observation as a batch of one."""
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def batch_act(self, observations):
        """Train on, or generate replies for, a batch of observations.

        Returns one reply dict per observation. When any sub-batch has
        targets a training step is run and its metrics are attached to the
        first reply; otherwise generated text is written into the replies of
        the observations that had a 'text' field.
        """
        bsz = len(observations)
        # initialize a table of replies with this agent's id
        batch_reply = [{'id': self.getID()} for _ in range(bsz)]

        # convert the observations into batches of inputs and targets
        # valid_inds tells us the indices of all valid examples
        # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1]
        # since the other three elements had no 'text' field

        # also, split observations into sub-batches based on number of gpus
        obs_split = np.array_split(observations, self.trainer.num_replicas)
        samples = []
        # offsets[j] is the position in `observations` where the sub-batch
        # behind samples[j] starts. valid_inds are relative to the sub-batch,
        # so the offset must count every observation of the preceding
        # sub-batches, including ones without text and sub-batches that are
        # dropped below.
        offsets = []
        start = 0
        for obs in obs_split:
            sample = self.batchify(obs)
            if sample[0] is not None:
                samples.append(sample)
                offsets.append(start)
            start += len(obs)
        any_valid = any(len(s[0]) > 0 for s in samples)

        if not any_valid:
            # no valid examples, just return the empty responses we set up
            return batch_reply

        # produce predictions if testing; otherwise, train
        has_targets = any(s[1] is not None for s in samples)
        if not has_targets:
            for offset, s in zip(offsets, samples):
                xs = s[0]
                valid_inds = s[2]
                predictions = self._generate(self.args, xs)
                for i, prediction in enumerate(predictions):
                    # map the predictions back to non-empty examples in the
                    # batch
                    batch_reply[offset + valid_inds[i]]['text'] = prediction
                    if i == 0:
                        print('prediction:', prediction)
        else:
            loss = self._train(samples)
            batch_reply[0]['metrics'] = {}
            for k, v in loss.items():
                batch_reply[0]['metrics'][k] = v * bsz
                if k == 'loss':
                    try:
                        perplexity = 2 ** v * bsz
                    except OverflowError:
                        perplexity = float('inf')
                    batch_reply[0]['metrics']['perplexity'] = perplexity

        return batch_reply

    def parse(self, string):
        """Tokenize ``string`` and map each token to its fairseq index."""
        return [self.fairseq_dict.index(word)
                for word in self.parlai_dict.tokenize(string)]

    def batchify(self, observations):
        """Convert a list of observations into input & target tensors.

        Returns ``(xs, ys, valid_inds)``; all three are None when no
        observation has a 'text' field, and ``ys`` is None when the first
        valid observation has no 'labels'.
        """
        # valid examples
        exs = [ex for ex in observations if 'text' in ex]
        # the indices of the valid (non-empty) tensors
        valid_inds = [i for i, ex in enumerate(observations) if 'text' in ex]

        # set up the input tensors
        batchsize = len(exs)
        if batchsize == 0:
            return None, None, None
        # tokenize the text; a bounded deque keeps only the last
        # `truncate` tokens
        parsed_x = [deque(maxlen=self.truncate) for _ in exs]
        for dq, ex in zip(parsed_x, exs):
            dq += self.parse(ex['text'])
        max_x_len = max(len(x) for x in parsed_x)
        for x in parsed_x:
            # left pad with the padding index
            x.extendleft([self.fairseq_dict.pad()] * (max_x_len - len(x)))
        xs = torch.LongTensor(parsed_x)

        # set up the target tensors
        ys = None
        if 'labels' in exs[0]:
            # randomly select one of the labels to update on, if multiple
            labels = [random.choice(ex.get('labels', [''])) for ex in exs]
            parsed_y = [deque(maxlen=self.truncate) for _ in labels]
            for dq, y in zip(parsed_y, labels):
                # filling from the left in reverse keeps the first
                # `truncate` tokens of the label
                dq.extendleft(reversed(self.parse(y)))
            for y in parsed_y:
                # append EOS to each label
                y.append(self.fairseq_dict.eos())
            max_y_len = max(len(y) for y in parsed_y)
            for y in parsed_y:
                # right pad with the padding index
                y += [self.fairseq_dict.pad()] * (max_y_len - len(y))
            ys = torch.LongTensor(parsed_y)
        return xs, ys, valid_inds

    def _positions_for_tokens(self, tokens):
        """Return a tensor of position indices matching ``tokens``.

        Padding positions keep the padding index; other positions count up
        from the entry before them.
        """
        size = tokens.size()
        not_pad = tokens.ne(self.fairseq_dict.pad()).long()
        new_pos = tokens.new(size).fill_(self.fairseq_dict.pad())
        new_pos += not_pad
        for i in range(1, size[1]):
            new_pos[:, i] += new_pos[:, i-1] - 1
        return new_pos

    def _right_shifted_ys(self, ys):
        """Return ``ys`` shifted one step right with EOS in column 0."""
        result = torch.LongTensor(ys.size())
        result[:, 0] = self.fairseq_dict.index(self.EOS)
        result[:, 1:] = ys[:, :-1]
        return result

    def _generate(self, opt, src_tokens):
        """Generate one output string per row of ``src_tokens``.

        The SequenceGenerator is created lazily on first use and cached on
        the instance.
        """
        if not hasattr(self, 'translator'):
            self.translator = SequenceGenerator(
                [self.trainer.get_model()],
                beam_size=opt.beam,
                stop_early=(not opt.no_early_stop),
                normalize_scores=(not opt.unnormalized),
                len_penalty=opt.lenpen)
            self.translator.cuda()
        # `async` became a reserved keyword in Python 3.7, so the old
        # `cuda(async=True)` spelling no longer parses; `non_blocking` is
        # the replacement keyword.
        tokens = src_tokens.cuda(non_blocking=True)
        translations = self.translator.generate(Variable(tokens))
        # keep the first hypothesis of each example and drop its last token
        return [
            ' '.join(self.fairseq_dict[idx] for idx in t[0]['tokens'][:-1])
            for t in translations
        ]

    def _train(self, samples):
        """Update the model using the targets.

        Each ``(xs, ys, valid_inds)`` tuple in ``samples`` is replaced in
        place by the dict passed to the trainer.
        """
        for i, sample in enumerate(samples):
            # add extra info to samples
            sample = {
                'src_tokens': sample[0],
                'input_tokens': self._right_shifted_ys(sample[1]),
                'target': sample[1],
                'id': None
            }
            sample['ntokens'] = sum(len(t) for t in sample['target'])
            sample['src_positions'] = self._positions_for_tokens(
                sample['src_tokens'])
            sample['input_positions'] = self._positions_for_tokens(
                sample['input_tokens'])
            samples[i] = sample
        return self.trainer.train_step(samples)

    def save(self, path=None):
        """Save model weights and opt to ``path``.

        ``path`` defaults to ``opt['model_file']``; nothing is written when
        no path is set or no trainer was built.
        """
        path = self.opt.get('model_file', None) if path is None else path
        if path and hasattr(self, 'trainer'):
            model = {}
            model['state_dict'] = self.trainer.get_model().state_dict()
            model['opt'] = self.opt
            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            model = torch.load(read)
        return model['opt'], model['state_dict']

    def set_states(self, state_dict):
        """Set the state dict of the model from saved states."""
        self.trainer.get_model().load_state_dict(state_dict)
class IrBaselineAgent(Agent):
    """Retrieval baseline that ranks the observed label candidates.

    The text of the latest observation is used as the query; candidates
    are scored by ``rank_candidates`` using the agent's dictionary.
    """

    @staticmethod
    def add_cmdline_args(parser):
        """Add the dictionary arguments and the length-penalty option."""
        DictionaryAgent.add_cmdline_args(parser)
        # type=float so a value given on the command line is parsed as a
        # number rather than stored as a string
        parser.add_argument('-lp', '--length_penalty', type=float,
                            default=0.5,
                            help='length penalty for responses')

    def __init__(self, opt, shared=None):
        """Build the dictionary and read the length penalty from ``opt``."""
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt

    def observe(self, obs):
        """Store the observation, pass it to the dictionary, and return it."""
        self.observation = obs
        self.dictionary.observe(obs)
        return obs

    def act(self):
        """Return a reply dict with the best-ranked candidate as ``text``.

        When the observation carries no (or empty) ``label_candidates`` the
        reply text is the fixed string "I don't know.".
        """
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()

        obs = self.observation
        reply = {'id': self.getID()}

        # Rank candidates
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            rep = self.build_query_representation(obs['text'])
            reply['text_candidates'] = rank_candidates(
                rep, obs['label_candidates'],
                self.length_penalty, self.dictionary)
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname=None):
        """Save the dictionary to ``<fname>.dict``.

        ``fname`` defaults to ``opt['model_file']``; nothing is written when
        neither is set.
        """
        fname = self.opt.get('model_file', None) if fname is None else fname
        if fname:
            self.dictionary.save(fname + '.dict')

    def load(self, fname):
        """Load the dictionary from ``<fname>.dict``."""
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build a weighted bag-of-words representation of ``query``.

        Returns a dict with:
          * ``'words'``: token -> weight. When the dictionary has frequency
            counts the weight is ``1 / (1 + log(1 + freq))``; otherwise every
            token not in ``stopwords`` gets weight 1.
          * ``'norm'``: square root of the number of tokens in the query.
        """
        words = list(self.dictionary.tokenize(query.lower()))
        # the frequency table does not change while we iterate, so fetch it
        # (and decide which weighting scheme applies) once up front
        freqs = self.dictionary.freqs()
        use_freqs = len(freqs) > 0
        weights = {}
        for w in words:
            if use_freqs:
                weights[w] = 1.0 / (1.0 + math.log(1.0 + freqs[w]))
            elif w not in stopwords:
                weights[w] = 1
        return {'words': weights, 'norm': math.sqrt(len(words))}
class LanguageModelAgent(Agent):
    """Agent which trains an RNN on a language modeling task.

    It is adapted from the language model featured in Pytorch's examples repo
    here: <https://github.com/pytorch/examples/tree/master/word_language_model>.
    """

    @staticmethod
    def dictionary_class():
        """Return the dictionary class used by this agent."""
        return DictionaryAgent

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        argparser.set_defaults(batch_sort=False)
        LanguageModelAgent.dictionary_class().add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Language Model Arguments')
        agent.add_argument('-hs', '--hiddensize', type=int, default=200,
                           help='size of the hidden layers')
        agent.add_argument('-esz', '--embeddingsize', type=int, default=200,
                           help='size of the token embeddings')
        agent.add_argument('-nl', '--numlayers', type=int, default=2,
                           help='number of hidden layers')
        agent.add_argument('-lr', '--learningrate', type=float, default=20,
                           help='initial learning rate')
        agent.add_argument('-dr', '--dropout', type=float, default=0.2,
                           help='dropout rate')
        agent.add_argument('-clip', '--gradient-clip', type=float,
                           default=0.25, help='gradient clipping')
        agent.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
        agent.add_argument(
            '-rnn', '--rnn-class', default='LSTM',
            help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
        agent.add_argument('-sl', '--seq-len', type=int, default=35,
                           help='sequence length')
        agent.add_argument('-tied', '--emb-tied', action='store_true',
                           help='tie the word embedding and softmax weights')
        agent.add_argument('-seed', '--random-seed', type=int, default=1111,
                           help='random seed')
        agent.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
        agent.add_argument('-tr', '--truncate-pred', type=int, default=50,
                           help='truncate predictions')
        agent.add_argument('-rf', '--report-freq', type=float, default=0.1,
                           help='report frequency of prediction during eval')
        # NOTE(review): with plain argparse, type=bool turns any non-empty
        # string (including "False") into True -- confirm the parser in use
        # handles this, or switch to a proper string-to-bool converter.
        agent.add_argument('-pt', '--person-tokens', type=bool, default=True,
                           help='append person1 and person2 tokens to text')
        agent.add_argument(
            '-lrf', '--lr-factor', type=float, default=0.5,
            help='multiply learning rate by this factor when the '
                 'validation loss does not decrease')

    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init
        self.states = {}
        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
        self.batchsize = opt.get('batchsize', 1)
        self.use_person_tokens = opt.get('person_tokens', True)

        if shared:
            # set up shared properties
            self.dict = shared['dict']

            if 'model' in shared:
                # model is shared during hogwild
                self.model = shared['model']
                self.states = shared['states']

            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]

            if self.use_person_tokens:
                # add person1 and person2 tokens
                self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
                self.dict.add_to_dict(self.dict.tokenize("PERSON2"))

        else:
            # this is not a shared instance of this class, so do full init
            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, self.states = self.load(opt['model_file'])
                # override model-specific options with stored ones
                opt = self.override_opt(new_opt)

            if opt['dict_file'] is None and opt.get('model_file'):
                # set default dict-file if not set
                opt['dict_file'] = opt['model_file'] + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'LanguageModel'

            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]

            if self.use_person_tokens:
                # add person1 and person2 tokens
                self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
                self.dict.add_to_dict(self.dict.tokenize("PERSON2"))

            # set model
            self.model = RNNModel(opt, len(self.dict))

            if self.states:
                # set loaded states if applicable
                self.model.load_state_dict(self.states['model'])

            if self.use_cuda:
                self.model.cuda()

        # token ids waiting to be cut into seq_len + 1 sized chunks (observe)
        self.next_observe = []
        # chunks waiting to be grouped into batches (vectorize)
        self.next_batch = []

        self.is_training = True

        if hasattr(self, 'model'):
            # if model was built, do more setup
            self.clip = opt.get('gradient_clip', 0.25)
            # set up criteria
            self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX)
            if self.use_cuda:
                # push to cuda
                self.criterion.cuda()
            # set up criterion for eval: we do not want to average over size
            self.eval_criterion = nn.CrossEntropyLoss(
                ignore_index=self.NULL_IDX, size_average=False)
            if self.use_cuda:
                # push to cuda
                self.eval_criterion.cuda()
            # init hidden state
            self.hidden = self.model.init_hidden(self.batchsize)
            # init tensor of end tokens
            self.ends = torch.LongTensor(
                [self.END_IDX for _ in range(self.batchsize)])
            if self.use_cuda:
                self.ends = self.ends.cuda()
            # set up optimizer
            self.lr = opt['learningrate']
            self.lr_factor = opt['lr_factor']
            self.best_val_loss = None

        self.reset()

    def override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overriden key.
        Only override args specific to the model.
        """
        model_args = {
            'hiddensize', 'embeddingsize', 'numlayers', 'dropout', 'seq_len',
            'emb_tied'
        }
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v}]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def parse(self, text):
        """Convert string to token indices."""
        return self.dict.txt2vec(text)

    def zero_grad(self):
        """Zero out optimizer."""
        self.model.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        torch.nn.utils.clip_grad_norm(self.model.parameters(), self.clip)
        # plain SGD: p <- p - lr * grad
        for p in self.model.parameters():
            p.data.add_(-self.lr, p.grad.data)

    def reset(self):
        """Reset observation and episode_done."""
        self.observation = None

    def share(self):
        """Share internal states between parent and child instances."""
        shared = super().share()
        shared['dict'] = self.dict
        shared['NULL_IDX'] = self.NULL_IDX
        shared['END_IDX'] = self.END_IDX
        if self.opt.get('numthreads', 1) > 1:
            shared['model'] = self.model
            self.model.share_memory()
            shared['states'] = self.states
        return shared

    def observe(self, observation):
        """Save observation for act.

        If multiple observations are from the same episode, concatenate them.

        An observation with 'labels' is treated as training data: its text
        and first label are turned into token ids and buffered, and the
        stored observation carries the complete ``seq_len + 1`` chunks as
        'text2vec' (or only ``{'labels': ''}`` while the buffer is too
        short). Otherwise the copied observation itself is stored.
        """
        # shallow copy observation (deep copy can be expensive)
        obs = observation.copy()
        seq_len = self.opt['seq_len']
        is_training = 'labels' in obs

        if is_training:
            if 'text' in obs:
                if self.use_person_tokens:
                    obs['text'] = 'PERSON1 ' + obs['text']
                vec = self.parse(obs['text'])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if 'labels' in obs:
                if self.use_person_tokens:
                    labels = [
                        'PERSON2 ' + label
                        for label in obs['labels']
                        if label != ''
                    ]
                    obs['labels'] = tuple(labels)
                vec = self.parse(obs['labels'][0])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if len(self.next_observe) < (seq_len + 1):
                # not enough to return to make a batch
                # we handle this case in vectorize
                # labels indicates that we are training
                self.observation = {'labels': ''}
                return self.observation
            else:
                vecs_to_return = []
                total = len(self.next_observe) // (seq_len + 1)
                for _ in range(total):
                    observe = self.next_observe[:(seq_len + 1)]
                    self.next_observe = self.next_observe[(seq_len + 1):]
                    vecs_to_return.append(observe)
                dict_to_return = {
                    'text': '',
                    'labels': '',
                    'text2vec': vecs_to_return
                }
                self.observation = dict_to_return
                return dict_to_return
        else:
            if 'text' in obs:
                if self.use_person_tokens:
                    obs['text'] = 'PERSON1 ' + obs['text']
            if 'eval_labels' in obs:
                if self.use_person_tokens:
                    eval_labels = [
                        'PERSON2 ' + label
                        for label in obs['eval_labels']
                        if label != ''
                    ]
                    obs['eval_labels'] = tuple(eval_labels)
            self.observation = obs
            return obs

    def repackage_hidden(self, h):
        """Wrap hidden states in new Variables to detach them from history.

        ``h`` is either a single Variable or a (possibly nested) iterable of
        them; iterables are returned as tuples.
        """
        # isinstance (rather than an exact type comparison) also recognises
        # plain tensors on PyTorch versions where Tensor and Variable are
        # merged, so we never try to iterate over a tensor's elements.
        if isinstance(h, Variable):
            return Variable(h.data)
        return tuple(self.repackage_hidden(v) for v in h)

    def get_target_loss(self, data, hidden, targets, y_lens):
        """Calculate the loss with respect to the targets, token by token.

        Each output token is conditioned on either the input or the previous
        target token. Returns 0.0 when ``targets`` is None; otherwise the
        summed loss divided by ``sum(y_lens)``.
        """
        loss = 0.0
        bsz = data.size(0)

        # during interactive mode, when no targets exist, we return 0
        if targets is None:
            return loss

        # feed in inputs without end token
        output, hidden = self.model(data.transpose(0, 1), hidden)
        self.hidden = self.repackage_hidden(hidden)
        # feed in end tokens
        output, hidden = self.model(
            Variable(self.ends[:bsz].view(1, bsz)), self.hidden)
        self.hidden = self.repackage_hidden(hidden)
        output_flat = output.view(-1, len(self.dict))
        loss += self.eval_criterion(
            output_flat, targets.select(1, 0).view(-1)).data

        for i in range(1, targets.size(1)):
            output, hidden = self.model(
                targets.select(1, i - 1).view(1, bsz), self.hidden,
                no_pack=True)
            self.hidden = self.repackage_hidden(hidden)
            output_flat = output.view(-1, len(self.dict))
            loss += self.eval_criterion(
                output_flat, targets.select(1, i).view(-1)).data

        return loss / float(sum(y_lens))

    def get_predictions(self, data):
        """Generate predictions word by word.

        Generation stops when every item in the batch has produced the end
        token or the step count exceeds ``opt['truncate_pred']``. Returns
        the predicted indices concatenated along dim 1, or None when no
        step was taken.
        """
        token_list = []
        bsz = data.size(0)
        done = [False for _ in range(bsz)]
        total_done = 0
        hidden = self.model.init_hidden(bsz)

        i = 0
        while total_done < bsz and i <= self.opt['truncate_pred']:
            if i == 0:
                # feed in input without end tokens
                output, hidden = self.model(data.transpose(0, 1), hidden)
                hidden = self.repackage_hidden(hidden)
                # feed in end tokens
                output, hidden = self.model(
                    Variable(self.ends[:bsz].view(1, bsz)), hidden)
            else:
                # feed the previous step's prediction back in
                output, hidden = self.model(
                    Variable(word_idx.view(1, bsz)), hidden, no_pack=True)
            hidden = self.repackage_hidden(hidden)
            word_weights = output.squeeze().data.exp()
            # squeeze() drops the batch dimension when bsz == 1
            if bsz > 1:
                value, word_idx = torch.max(word_weights, 1)
            else:
                value, word_idx = torch.max(word_weights, 0)
            # mark end indices for items in batch
            for k in range(word_idx.size(0)):
                if not done[k]:
                    if int(word_idx[k]) == self.END_IDX:
                        done[k] = True
                        total_done += 1
            token_list.append(word_idx.view(bsz, 1))
            i += 1

        if token_list:
            return torch.cat(token_list, 1)
        return None

    def predict(self, data, hidden, targets=None, is_training=True,
                y_lens=None):
        """Produce a prediction from our model.

        In training mode one optimization step is run and the loss dict has
        'lmloss' and 'lmppl'; ``predictions`` stays None. In eval mode
        predictions are generated and the loss dict has 'loss' and 'ppl';
        ``output`` stays None.

        Returns ``(output, hidden, loss_dict, predictions)``.
        """
        loss_dict = None
        output = None
        predictions = None
        if is_training:
            self.model.train()
            self.zero_grad()
            output, hidden = self.model(data, hidden)
            loss = self.criterion(
                output.view(-1, len(self.dict)), targets.view(-1))
            loss.backward(retain_graph=True)
            self.update_params()
            loss_dict = {'lmloss': loss.data}
            loss_dict['lmppl'] = math.exp(loss.data)
        else:
            self.model.eval()
            predictions = self.get_predictions(data)
            loss_dict = {}
            bsz = data.size(0)
            if bsz != self.batchsize:
                # the stored hidden state was sized for the full batch
                self.hidden = self.model.init_hidden(bsz)
            loss = self.get_target_loss(data, self.hidden, targets, y_lens)
            loss_dict['loss'] = loss
            loss_dict['ppl'] = math.exp(loss)

        return output, hidden, loss_dict, predictions

    def vectorize(self, observations, seq_len, is_training):
        """Convert a list of observations into input & target tensors.

        Returns ``(data_list, targets_list, labels, valid_inds, y_lens)``.
        In training mode the 'text2vec' chunks of the observations are
        buffered and every full batch available is returned; all five values
        are None while the buffer holds no more than ``batchsize`` chunks.
        In eval mode the lists hold a single padded batch.
        """
        labels = None
        valid_inds = None
        y_lens = None
        if is_training:
            for obs in observations:
                if obs:
                    if 'text2vec' in obs:
                        self.next_batch += obs['text2vec']
            if len(self.next_batch) <= self.batchsize:
                return None, None, None, None, None
            else:
                data_list = []
                targets_list = []
                # total is the number of batches
                total = len(self.next_batch) // self.batchsize
                for _ in range(total):
                    batch = self.next_batch[:self.batchsize]
                    self.next_batch = self.next_batch[self.batchsize:]

                    source = torch.LongTensor(batch).t().contiguous()
                    # targets are the inputs shifted by one token
                    data = Variable(source[:seq_len])
                    targets = Variable(source[1:])

                    if self.use_cuda:
                        data = data.cuda()
                        targets = targets.cuda()

                    data_list.append(data)
                    targets_list.append(targets)
        else:
            # here we get valid examples and pad them with zeros
            xs, ys, labels, valid_inds, _, y_lens = PaddingUtils.pad_text(
                observations, self.dict, self.END_IDX, self.NULL_IDX)
            if self.use_cuda:
                xs = Variable(xs).cuda()
                if ys is not None:
                    ys = Variable(ys).cuda()
            else:
                xs = Variable(xs)
                if ys is not None:
                    ys = Variable(ys)
            data_list = [xs]
            targets_list = [ys]

        return data_list, targets_list, labels, valid_inds, y_lens

    def batch_act(self, observations):
        """Train on or evaluate a batch of observations.

        The batch is treated as training data when any observation has
        'labels'. Returns a list of reply dicts; loss metrics, when
        available, are attached to the first reply of each processed batch.
        """
        batch_reply = [{'id': self.getID()} for _ in range(len(observations))]
        if any('labels' in obs for obs in observations):
            # if we are starting a new training epoch, reinitialize hidden
            if not self.is_training:
                self.hidden = self.model.init_hidden(self.batchsize)
            self.is_training = True
            data_list, targets_list, _, _, y_lens = self.vectorize(
                observations, self.opt['seq_len'], self.is_training)
        else:
            # if we just finished training, reinitialize hidden
            if self.is_training:
                self.hidden = self.model.init_hidden(self.batchsize)
                self.is_training = False
            data_list, targets_list, labels, valid_inds, y_lens = \
                self.vectorize(observations, self.opt['seq_len'],
                               self.is_training)

        if data_list is None:
            # not enough data to batch act yet, return empty responses
            return batch_reply

        batch_reply = []
        # during evaluation, len(data_list) is always 1
        # during training, len(data_list) >= 0: vectorize returns a list
        # containing all batches available at the time it is called
        for i in range(len(data_list)):
            temp_dicts = [{
                'id': self.getID()
            } for _ in range(len(observations))]
            output, hidden, loss_dict, predictions = self.predict(
                data_list[i], self.hidden, targets_list[i], self.is_training,
                y_lens)
            self.hidden = self.repackage_hidden(hidden)

            if predictions is not None:
                # map predictions back to the right order
                PaddingUtils.map_predictions(
                    predictions,
                    valid_inds,
                    temp_dicts,
                    observations,
                    self.dict,
                    self.END_IDX,
                    report_freq=self.opt['report_freq'])

            if loss_dict is not None:
                if 'metrics' in temp_dicts[0]:
                    for k, v in loss_dict.items():
                        temp_dicts[0]['metrics'][k] = v
                else:
                    temp_dicts[0]['metrics'] = loss_dict

            batch_reply += temp_dicts

        return batch_reply

    def act(self):
        """Act on the stored observation as a batch of one."""
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def save(self, path=None):
        """Save model parameters if model_file is set."""
        path = self.opt.get('model_file', None) if path is None else path

        if path and hasattr(self, 'model'):
            model = {}
            model['model'] = self.model.state_dict()
            model['opt'] = self.opt

            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def receive_metrics(self, metrics_dict):
        """Decay the learning rate when the validation loss gets worse.

        The best loss seen so far is tracked; a reported 'loss' above it
        multiplies the learning rate by ``lr_factor``. Metrics without a
        'loss' entry are ignored.
        """
        if 'loss' not in metrics_dict:
            return
        loss = metrics_dict['loss']
        if self.best_val_loss is None or loss < self.best_val_loss:
            # remember improvements so later comparisons are made against
            # the best loss seen, not just the first one reported
            self.best_val_loss = loss
        elif loss > self.best_val_loss:
            self.lr *= self.lr_factor
            print("Updating learning rate: lr =", self.lr)

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            states = torch.load(read)
        return states['opt'], states