def build_dict(opt):
    if not opt.get('dict_file'):
        print('Tried to build dictionary but `--dict-file` is not set. Set ' +
              'this param so the dictionary can be saved.')
        return
    print('[ setting up dictionary. ]')
    if os.path.isfile(opt['dict_file']):
        # Dictionary already built
        print('[ dictionary already built. ]')
        return
    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)
    ordered_opt = copy.deepcopy(opt)
    cnt = 0
    # we use the train set to build the dictionary
    ordered_opt['datatype'] = 'train:ordered'
    if 'stream' in opt['datatype']:
        ordered_opt['datatype'] += ':stream'
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    world_dict = create_task(ordered_opt, dictionary)
    # pass examples to dictionary
    for _ in world_dict:
        cnt += 1
        if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] > 0:
            print('Processed {} exs, moving on.'.format(opt['dict_maxexs']))
            # don't wait too long...
            break
        world_dict.parley()
    print('[ dictionary built. ]')
    dictionary.save(opt['dict_file'], sort=True)
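# A minimal usage sketch, assuming the standard ParlAI flags (-t/--task,
# --dict-file) and that the parser below registers --dict-maxexs; the task
# name and path are placeholders, adjust them to your setup.
if __name__ == '__main__':
    from parlai.core.params import ParlaiParser

    parser = ParlaiParser()
    DictionaryAgent.add_cmdline_args(parser)
    opt = parser.parse_args(['-t', 'babi:task10k:1',
                             '--dict-file', '/tmp/babi.dict'])
    build_dict(opt)  # streams the train set once, then saves the sorted dict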
def add_cmdline_args(argparser):
    DictionaryAgent.add_cmdline_args(argparser)
    arg_group = argparser.add_argument_group('MemNN Arguments')
    arg_group.add_argument('-lr', '--learning-rate', type=float, default=0.01,
                           help='learning rate')
    arg_group.add_argument('--embedding-size', type=int, default=128,
                           help='size of token embeddings')
    arg_group.add_argument('--hops', type=int, default=3,
                           help='number of memory hops')
    arg_group.add_argument('--mem-size', type=int, default=100,
                           help='size of memory')
    arg_group.add_argument('--time-features', type='bool', default=True,
                           help='use time features for memory embeddings')
    arg_group.add_argument('--position-encoding', type='bool', default=False,
                           help='use position encoding instead of bag of '
                                'words embedding')
    arg_group.add_argument('--output', type=str, default='rank',
                           help='type of output (rank|generate)')
    arg_group.add_argument('--rnn-layers', type=int, default=2,
                           help='number of hidden layers in RNN decoder for '
                                'generative output')
    arg_group.add_argument('--dropout', type=float, default=0.1,
                           help='dropout probability for RNN decoder training')
    arg_group.add_argument('--optimizer', default='adam',
                           help='optimizer type (sgd|adam)')
    arg_group.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
    arg_group.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
def add_cmdline_args(argparser):
    """Add command-line arguments specifically for this agent."""
    DictionaryAgent.add_cmdline_args(argparser)
    agent = argparser.add_argument_group('Fairseq Arguments')
    agent.add_argument(
        '-tr', '--truncate', type=int, default=-1,
        help='truncate input & output lengths to speed up training (may '
             'reduce accuracy). This fixes all input and output to have a '
             'maximum length. This reduces the total amount of padding in '
             'the batches.')
    agent.add_argument(
        '--max-positions', default=1024, type=int, metavar='N',
        help='max number of tokens in the sequence')
    agent.add_argument(
        '--seed', default=1, type=int, metavar='N',
        help='pseudo random number generator seed')
    options.add_optimization_args(argparser)
    options.add_generation_args(argparser)
    options.add_model_args(argparser)
def test_basic_parse(self):
    """Check that the dictionary is correctly adding and parsing a short
    sentence.
    """
    from parlai.core.dict import DictionaryAgent
    from parlai.core.params import ParlaiParser

    argparser = ParlaiParser()
    DictionaryAgent.add_cmdline_args(argparser)
    opt = argparser.parse_args()
    dictionary = DictionaryAgent(opt)
    num_builtin = len(dictionary)

    dictionary.observe({'text': 'hello world'})
    dictionary.act()
    assert len(dictionary) - num_builtin == 2

    vec = dictionary.parse('hello world')
    assert len(vec) == 2
    assert vec[0] == num_builtin
    assert vec[1] == num_builtin + 1

    vec = dictionary.parse('hello world', vec_type=list)
    assert len(vec) == 2
    assert vec[0] == num_builtin
    assert vec[1] == num_builtin + 1

    vec = dictionary.parse('hello world', vec_type=tuple)
    assert len(vec) == 2
    assert vec[0] == num_builtin
    assert vec[1] == num_builtin + 1
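def test_parse_roundtrip(self):
    """Companion sketch to the test above, assuming vec2txt is the inverse
    of parse/txt2vec (the v2t helpers in the agents below rely on this).
    """
    from parlai.core.dict import DictionaryAgent
    from parlai.core.params import ParlaiParser

    argparser = ParlaiParser()
    DictionaryAgent.add_cmdline_args(argparser)
    opt = argparser.parse_args()
    dictionary = DictionaryAgent(opt)
    dictionary.observe({'text': 'hello world'})
    dictionary.act()  # adds 'hello' and 'world' to the dictionary
    vec = dictionary.parse('hello world')
    assert dictionary.vec2txt(vec) == 'hello world'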
def add_cmdline_args(argparser):
    """Add command-line arguments specifically for this agent."""
    DictionaryAgent.add_cmdline_args(argparser)
    group = argparser.add_argument_group('Cooperative Game Agent Arguments')
    group.add_argument('--optimizer', default='adam',
                       choices=CooperativeGameAgent.OPTIM_OPTS.keys(),
                       help='Choose between pytorch optimizers. Any member '
                            'of torch.optim is valid and will be used with '
                            'default params except learning rate (as '
                            'specified by -lr).')
    group.add_argument('--learning-rate', default=1e-2, type=float,
                       help='Initial learning rate')
    group.add_argument('--no-cuda', action='store_true', default=False,
                       help='disable GPUs even if available')
    group.add_argument('--gpuid', type=int, default=-1,
                       help='which GPU device to use (defaults to cpu)')
def add_cmdline_args(argparser):
    DictionaryAgent.add_cmdline_args(argparser)
    agent = argparser.add_argument_group('Seq2Seq Arguments')
    agent.add_argument('-hs', '--hiddensize', type=int, default=64,
                       help='size of the hidden layers and embeddings')
    agent.add_argument('-nl', '--numlayers', type=int, default=2,
                       help='number of hidden layers')
    agent.add_argument('-lr', '--learningrate', type=float, default=0.5,
                       help='learning rate')
    agent.add_argument('-dr', '--dropout', type=float, default=0.1,
                       help='dropout rate')
    agent.add_argument('--no-cuda', action='store_true', default=False,
                       help='disable GPUs even if available')
    agent.add_argument('--gpu', type=int, default=-1,
                       help='which GPU device to use')
def add_cmdline_args(argparser):
    group = DictionaryAgent.add_cmdline_args(argparser)
    group.add_argument(
        '--pretrained_words', type='bool', default=True,
        help='Use only words found in provided embedding_file')
    group.set_defaults(dict_tokenizer='spacy')
def add_cmdline_args(argparser):
    """Add command-line arguments specifically for this agent."""
    DictionaryAgent.add_cmdline_args(argparser)
    agent = argparser.add_argument_group('Seq2Seq Arguments')
    agent.add_argument('-hs', '--hiddensize', type=int, default=128,
                       help='size of the hidden layers')
    agent.add_argument('-emb', '--embeddingsize', type=int, default=128,
                       help='size of the token embeddings')
    agent.add_argument('-nl', '--numlayers', type=int, default=2,
                       help='number of hidden layers')
    agent.add_argument('-lr', '--learningrate', type=float, default=0.5,
                       help='learning rate')
    agent.add_argument('-dr', '--dropout', type=float, default=0.1,
                       help='dropout rate')
    agent.add_argument('-att', '--attention', type=int, default=0,
                       help='if greater than 0, use attention of specified'
                            ' length while decoding')
    agent.add_argument('--no-cuda', action='store_true', default=False,
                       help='disable GPUs even if available')
    agent.add_argument('--gpu', type=int, default=-1,
                       help='which GPU device to use')
    agent.add_argument('-rc', '--rank-candidates', type='bool',
                       default=False,
                       help='rank candidates if available. this is done by'
                            ' computing the mean score per token for each '
                            'candidate and selecting the highest scoring.')
    agent.add_argument('-tr', '--truncate', type='bool', default=True,
                       help='truncate input & output lengths to speed up '
                            'training (may reduce accuracy). This fixes all '
                            'input and output to have a maximum length and to '
                            'be similar in length to one another by throwing '
                            'away extra tokens. This reduces the total amount '
                            'of padding in the batches.')
    agent.add_argument('-enc', '--encoder', default='gru',
                       choices=Seq2seqAgent.ENC_OPTS.keys(),
                       help='Choose between different encoder modules.')
    agent.add_argument('-dec', '--decoder', default='same',
                       choices=['same', 'shared'] +
                               list(Seq2seqAgent.ENC_OPTS.keys()),
                       help='Choose between different decoder modules. '
                            'Default "same" uses same class as encoder, '
                            'while "shared" also uses the same weights.')
    agent.add_argument('-opt', '--optimizer', default='sgd',
                       choices=Seq2seqAgent.OPTIM_OPTS.keys(),
                       help='Choose between pytorch optimizers. '
                            'Any member of torch.optim is valid and will '
                            'be used with default params except learning '
                            'rate (as specified by -lr).')
def add_cmdline_args(argparser):
    """Add command-line arguments specifically for this agent.

    Default values are set according to (Kottur et al. 2017).
    """
    DictionaryAgent.add_cmdline_args(argparser)
    group = argparser.add_argument_group('Questioner Agent Arguments')
    group.add_argument('--q-in-vocab', default=13, type=int,
                       help='Input vocabulary for questioner. Usually '
                            'includes total distinct words spoken by '
                            'answerer, questioner itself, and words by '
                            'which the goal is described.')
    group.add_argument('--q-embed-size', default=20, type=int,
                       help='Size of word embeddings for questioner')
    group.add_argument('--q-state-size', default=100, type=int,
                       help='Size of hidden state of questioner')
    group.add_argument('--q-out-vocab', default=3, type=int,
                       help='Output vocabulary for questioner')
    group.add_argument('--q-num-pred', default=12, type=int,
                       help='Size of output to be predicted (for goal).')
    super().add_cmdline_args(argparser)
def __init__(self, opt, shared=None):
    opt['cuda'] = not opt['no_cuda'] and torch.cuda.is_available()
    if opt['cuda']:
        print('[ Using CUDA ]')
        torch.cuda.set_device(opt['gpu'])

    if not shared:
        self.opt = opt
        self.id = 'MemNN'
        self.dict = DictionaryAgent(opt)
        self.answers = [None] * opt['batchsize']

        self.model = MemNN(opt, self.dict)
        self.mem_size = opt['mem_size']
        self.loss_fn = CrossEntropyLoss()

        self.decoder = None
        self.longest_label = 1
        self.END = self.dict.end_token
        self.END_TENSOR = torch.LongTensor(self.dict.parse(self.END))
        self.START = self.dict.start_token
        self.START_TENSOR = torch.LongTensor(self.dict.parse(self.START))
        if opt['output'] == 'generate' or opt['output'] == 'g':
            self.decoder = Decoder(opt['embedding_size'],
                                   opt['embedding_size'],
                                   opt['rnn_layers'], opt, self.dict)
        elif opt['output'] != 'rank' and opt['output'] != 'r':
            raise NotImplementedError('Output type not supported.')

        optim_params = [p for p in self.model.parameters()
                        if p.requires_grad]
        lr = opt['learning_rate']
        if opt['optimizer'] == 'sgd':
            self.optimizers = {'memnn': optim.SGD(optim_params, lr=lr)}
            if self.decoder is not None:
                self.optimizers['decoder'] = optim.SGD(
                    self.decoder.parameters(), lr=lr)
        elif opt['optimizer'] == 'adam':
            self.optimizers = {'memnn': optim.Adam(optim_params, lr=lr)}
            if self.decoder is not None:
                self.optimizers['decoder'] = optim.Adam(
                    self.decoder.parameters(), lr=lr)
        else:
            raise NotImplementedError('Optimizer not supported.')

        if opt['cuda']:
            self.model.share_memory()
            if self.decoder is not None:
                self.decoder.cuda()

        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            print('Loading existing model parameters from ' +
                  opt['model_file'])
            self.load(opt['model_file'])
    else:
        self.answers = shared['answers']

    self.episode_done = True
    self.last_cands, self.last_cands_list = None, None
    super().__init__(opt, shared)
def __init__(self, opt, shared=None):
    # initialize defaults first
    super().__init__(opt, shared)
    if not shared:
        # this is not a shared instance of this class, so do full
        # initialization. if shared is set, only set up shared members.
        saved_state = None
        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            # load model parameters if available
            print('Loading existing model params from ' + opt['model_file'])
            new_opt, saved_state = self.load(opt['model_file'])
            # override options with stored ones
            opt = self._override_opt(new_opt)

        self.args = OptWrapper(opt)
        self.parlai_dict = DictionaryAgent(opt)
        self.fairseq_dict = _make_fairseq_dict(self.parlai_dict)
        self.id = 'Fairseq'
        self.truncate = opt['truncate'] if opt['truncate'] > 0 else None

        self.EOS = self.fairseq_dict[self.fairseq_dict.eos()]
        self.EOS_TENSOR = (torch.LongTensor(1, 1)
                           .fill_(self.fairseq_dict.eos()))
        self.NULL_IDX = self.fairseq_dict.pad()

        encoder = fconv.FConvEncoder(
            self.fairseq_dict,
            embed_dim=self.args.encoder_embed_dim,
            convolutions=eval(self.args.encoder_layers),
            dropout=self.args.dropout,
            max_positions=self.args.max_positions)
        decoder = fconv.FConvDecoder(
            self.fairseq_dict,
            embed_dim=self.args.decoder_embed_dim,
            convolutions=eval(self.args.decoder_layers),
            out_embed_dim=self.args.decoder_out_embed_dim,
            attention=eval(self.args.decoder_attention),
            dropout=self.args.dropout,
            max_positions=self.args.max_positions)
        self.model = fconv.FConvModel(encoder, decoder)

        # from fairseq's build_criterion()
        if self.args.label_smoothing > 0:
            self.criterion = criterions.LabelSmoothedCrossEntropyCriterion(
                self.args.label_smoothing, self.NULL_IDX)
        else:
            self.criterion = criterions.CrossEntropyCriterion(
                self.args, self.fairseq_dict)

        self.trainer = MultiprocessingTrainer(self.args, self.model,
                                              self.criterion)
        if saved_state is not None:
            self.set_states(saved_state)
    self.reset()
def build_dict(opt):
    if not opt.get('dict_file'):
        print('Tried to build dictionary but `--dict-file` is not set. Set ' +
              'this param so the dictionary can be saved.')
        return
    print('[ setting up dictionary. ]')
    if os.path.isfile(opt['dict_file']):
        # Dictionary already built
        print('[ dictionary already built. ]')
        return
    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)
    ordered_opt = copy.deepcopy(opt)
    cnt = 0
    # we use the train set to build the dictionary
    ordered_opt['datatype'] = 'train:ordered:stream'
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    ordered_opt['image_mode'] = 'none'
    if (ordered_opt['task'] == 'pytorch_teacher'
            and ordered_opt.get('pytorch_preprocess', False)):
        pytorch_buildteacher_task = ordered_opt.get('pytorch_buildteacher', '')
        if pytorch_buildteacher_task != '':
            ordered_opt['task'] = pytorch_buildteacher_task
    world_dict = create_task(ordered_opt, dictionary)
    # pass examples to dictionary
    while not world_dict.epoch_done():
        cnt += 1
        if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] > 0:
            print('Processed {} exs, moving on.'.format(opt['dict_maxexs']))
            # don't wait too long...
            break
        world_dict.parley()
    print('[ dictionary built. ]')
    dictionary.save(opt['dict_file'], sort=True)
def __init__(self, opt, shared=None):
    super().__init__(opt, shared)
    opt['cuda'] = not opt['no_cuda'] and torch.cuda.is_available()
    if opt['cuda']:
        print('[ Using CUDA ]')
        torch.cuda.set_device(opt['gpu'])

    if not shared:
        # don't enter this branch for shared (i.e. batch) instantiations
        self.dict = DictionaryAgent(opt)
        self.id = 'Seq2Seq'
        hsz = opt['hiddensize']
        self.EOS = self.dict.eos_token
        self.observation = {'text': self.EOS, 'episode_done': True}
        self.EOS_TENSOR = torch.LongTensor(self.dict.parse(self.EOS))
        self.hidden_size = hsz
        self.num_layers = opt['numlayers']
        self.learning_rate = opt['learningrate']
        self.use_cuda = opt.get('cuda', False)
        self.longest_label = 1

        self.criterion = nn.NLLLoss()
        self.lt = nn.Embedding(len(self.dict), hsz, padding_idx=0,
                               scale_grad_by_freq=True)
        self.encoder = nn.GRU(hsz, hsz, opt['numlayers'])
        self.decoder = nn.GRU(hsz, hsz, opt['numlayers'])
        self.d2o = nn.Linear(hsz, len(self.dict))
        self.dropout = nn.Dropout(opt['dropout'])
        self.softmax = nn.LogSoftmax()

        lr = opt['learningrate']
        self.optims = {
            'lt': optim.SGD(self.lt.parameters(), lr=lr),
            'encoder': optim.SGD(self.encoder.parameters(), lr=lr),
            'decoder': optim.SGD(self.decoder.parameters(), lr=lr),
            'd2o': optim.SGD(self.d2o.parameters(), lr=lr),
        }
        if self.use_cuda:
            self.cuda()
        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            print('Loading existing model parameters from ' +
                  opt['model_file'])
            self.load(opt['model_file'])

    self.episode_done = True
def __init__(self, opt, shared=None):
    """Set up model if shared params not set, otherwise no work to do."""
    super().__init__(opt, shared)
    opt = self.opt
    self.reset_metrics()
    self.id = 'Starspace'
    self.NULL_IDX = 0
    self.cands = torch.LongTensor(1, 1, 1)
    self.ys_cache = []
    self.ys_cache_sz = opt['cache_size']
    self.truncate = opt['truncate'] if opt['truncate'] > 0 else None
    self.history = {}
    self.debugMode = False
    if shared:
        self.threadindex = shared['threadindex']
        print("[ creating Starspace thread " + str(self.threadindex) + " ]")
        # set up shared properties
        self.dict = shared['dict']
        self.model = shared['model']  # Starspace(opt, len(self.dict))
    else:
        print("[ creating StarspaceAgent ]")
        # this is not a shared instance of this class, so do full init
        if opt['dict_file'] is None and opt.get('model_file'):
            # set default dict-file if not set
            opt['dict_file'] = opt['model_file'] + '.dict'
        # load dictionary and basic tokens & vectors
        self.dict = DictionaryAgent(opt)
        self.model = Starspace(opt, len(self.dict), self.dict)
        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            self.load(opt['model_file'])
        self.model.share_memory()

    # set up modules
    self.criterion = torch.nn.CosineEmbeddingLoss(margin=opt['margin'],
                                                  size_average=False)
    self.reset()
    self.fixedCands = False
    if self.opt.get('fixed-candidates-file'):
        self.fixedCands = load_cands(self.opt.get('fixed-candidates-file'))
class Seq2seqAgent(Agent):
    """Agent which takes an input sequence and produces an output sequence.

    This model supports encoding the input and decoding the output via one of
    several flavors of RNN. It then uses a linear layer (whose weights can
    be shared with the embedding layer) to convert RNN output states into
    output tokens. This model currently uses greedy decoding, selecting the
    highest probability token at each time step.

    For more information, see Sequence to Sequence Learning with Neural
    Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_.
    """

    OPTIM_OPTS = {
        'adadelta': optim.Adadelta,
        'adagrad': optim.Adagrad,
        'adam': optim.Adam,
        'adamax': optim.Adamax,
        'asgd': optim.ASGD,
        'lbfgs': optim.LBFGS,
        'rmsprop': optim.RMSprop,
        'rprop': optim.Rprop,
        'sgd': optim.SGD,
    }

    @staticmethod
    def dictionary_class():
        return DictionaryAgent

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        Seq2seqAgent.dictionary_class().add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Seq2Seq Arguments')
        agent.add_argument('-hs', '--hiddensize', type=int, default=128,
                           help='size of the hidden layers')
        agent.add_argument('-esz', '--embeddingsize', type=int, default=128,
                           help='size of the token embeddings')
        agent.add_argument('-nl', '--numlayers', type=int, default=2,
                           help='number of hidden layers')
        agent.add_argument('-lr', '--learningrate', type=float, default=0.005,
                           help='learning rate')
        agent.add_argument('-dr', '--dropout', type=float, default=0.1,
                           help='dropout rate')
        agent.add_argument('-clip', '--gradient-clip', type=float,
                           default=0.2,
                           help='gradient clipping using l2 norm')
        agent.add_argument('-bi', '--bidirectional', type='bool',
                           default=False,
                           help='whether to encode the context with a '
                                'bidirectional rnn')
        agent.add_argument('-att', '--attention', default='none',
                           choices=['none', 'concat', 'general', 'dot',
                                    'local'],
                           help='Choices: none, concat, general, local. '
                                'If set local, also set attention-length. '
                                'For more details see: '
                                'https://arxiv.org/pdf/1508.04025.pdf')
        agent.add_argument('-attl', '--attention-length', default=48,
                           type=int,
                           help='Length of local attention.')
        agent.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
        agent.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
        agent.add_argument('-rc', '--rank-candidates', type='bool',
                           default=False,
                           help='rank candidates if available. this is done '
                                'by computing the mean score per token for '
                                'each candidate and selecting the highest '
                                'scoring.')
        agent.add_argument('-tr', '--truncate', type=int, default=-1,
                           help='truncate input & output lengths to speed up '
                                'training (may reduce accuracy). This fixes '
                                'all input and output to have a maximum '
                                'length. This reduces the total amount of '
                                'padding in the batches.')
        agent.add_argument('-rnn', '--rnn-class', default='lstm',
                           choices=Seq2seq.RNN_OPTS.keys(),
                           help='Choose between different types of RNNs.')
        agent.add_argument('-dec', '--decoder', default='same',
                           choices=['same', 'shared'],
                           help='Choose between different decoder modules. '
                                'Default "same" uses same class as encoder, '
                                'while "shared" also uses the same weights. '
                                'Note that shared disables some encoder '
                                'options--in particular, bidirectionality.')
        agent.add_argument('-lt', '--lookuptable', default='all',
                           choices=['unique', 'enc_dec', 'dec_out', 'all'],
                           help='The encoder, decoder, and output modules '
                                'can share weights, or not. '
                                'Unique has independent embeddings for each. '
                                'Enc_dec shares the embedding for the '
                                'encoder and decoder. '
                                'Dec_out shares decoder embedding and output '
                                'weights. '
                                'All shares all three weights.')
        agent.add_argument('-opt', '--optimizer', default='adam',
                           choices=Seq2seqAgent.OPTIM_OPTS.keys(),
                           help='Choose between pytorch optimizers. '
                                'Any member of torch.optim is valid and will '
                                'be used with default params except learning '
                                'rate (as specified by -lr).')
        agent.add_argument('-emb', '--embedding-type', default='random',
                           choices=['random', 'glove', 'glove-fixed',
                                    'fasttext', 'fasttext-fixed'],
                           help='Choose between different strategies for '
                                'word embeddings. Default is random, but can '
                                'also preinitialize from Glove or Fasttext. '
                                'Preinitialized embeddings can also be fixed '
                                'so they are not updated during training.')
        agent.add_argument('-hist', '--history-length', default=100000,
                           type=int,
                           help='Number of past tokens to remember. '
                                'Default remembers 100000 tokens.')
        agent.add_argument('-histr', '--history-replies', default='none',
                           type=str, choices=['none', 'model', 'label'],
                           help='Keep replies in the history, or not.')

    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init
        # all instances may need some params
        self.truncate = opt['truncate'] if opt['truncate'] > 0 else None
        self.history = {}
        self.states = {}
        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()

        if shared:
            # set up shared properties
            self.dict = shared['dict']
            self.START_IDX = shared['START_IDX']
            self.END_IDX = shared['END_IDX']
            self.NULL_IDX = shared['NULL_IDX']
            # answers contains a batch_size list of the last answer produced
            self.answers = shared['answers']
            if 'model' in shared:
                # model is shared during hogwild
                self.model = shared['model']
                self.states = shared['states']
        else:
            # this is not a shared instance of this class, so do full init
            # answers contains a batch_size list of the last answer produced
            self.answers = [None] * opt['batchsize']

            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, self.states = self.load(opt['model_file'])
                # override model-specific options with stored ones
                opt = self.override_opt(new_opt)

            if opt['dict_file'] is None and opt.get('model_file'):
                # set default dict-file if not set
                opt['dict_file'] = opt['model_file'] + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'Seq2Seq'
            # we use START markers to start our output
            self.START_IDX = self.dict[self.dict.start_token]
            # we use END markers to end our output
            self.END_IDX = self.dict[self.dict.end_token]
            # get index of null token from dictionary (probably 0)
            self.NULL_IDX = self.dict[self.dict.null_token]

            self.model = Seq2seq(
                opt, len(self.dict),
                padding_idx=self.NULL_IDX,
                start_idx=self.START_IDX,
                end_idx=self.END_IDX,
                longest_label=self.states.get('longest_label', 1))

            if opt['embedding_type'] != 'random':
                # set up preinitialized embeddings
                try:
                    import torchtext.vocab as vocab
                except ModuleNotFoundError as ex:
                    print('Please install torchtext with '
                          '`pip install torchtext`')
                    raise ex
                if opt['embedding_type'].startswith('glove'):
                    init = 'glove'
                    embs = vocab.GloVe(name='840B', dim=300)
                elif opt['embedding_type'].startswith('fasttext'):
                    init = 'fasttext'
                    embs = vocab.FastText(language='en')
                else:
                    raise RuntimeError('embedding type not implemented')

                if opt['embeddingsize'] != 300:
                    rp = torch.Tensor(300, opt['embeddingsize']).normal_()
                    t = lambda x: torch.mm(x.unsqueeze(0), rp)
                else:
                    t = lambda x: x
                cnt = 0
                for w, i in self.dict.tok2ind.items():
                    if w in embs.stoi:
                        vec = t(embs.vectors[embs.stoi[w]])
                        self.model.decoder.lt.weight.data[i] = vec
                        cnt += 1
                        if opt['lookuptable'] in ['unique', 'dec_out']:
                            # also set encoder lt, since it's not shared
                            self.model.encoder.lt.weight.data[i] = vec
                print('Seq2seq: initialized embeddings for {} tokens from '
                      '{}.'.format(cnt, init))

            if self.states:
                # set loaded states if applicable
                self.model.load_state_dict(self.states['model'])

            if self.use_cuda:
                self.model.cuda()

        if hasattr(self, 'model'):
            # if model was built, do more setup
            self.clip = opt.get('gradient_clip', 0.2)
            self.rank = opt['rank_candidates']

            # set up tensors once
            self.xs = torch.LongTensor(1, 1)
            self.ys = torch.LongTensor(1, 1)
            if self.rank:
                self.cands = torch.LongTensor(1, 1, 1)

            # set up criteria
            self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX)

            if self.use_cuda:
                # push to cuda
                self.xs = self.xs.cuda(non_blocking=True)
                self.ys = self.ys.cuda(non_blocking=True)
                if self.rank:
                    self.cands = self.cands.cuda(non_blocking=True)
                self.criterion.cuda()

            # set up optimizer
            lr = opt['learningrate']
            optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']]
            kwargs = {'lr': lr}
            if opt['optimizer'] == 'sgd':
                kwargs['momentum'] = 0.95
                kwargs['nesterov'] = True

            if opt['embedding_type'].endswith('fixed'):
                print('Seq2seq: fixing embedding weights.')
                self.model.decoder.lt.weight.requires_grad = False
                self.model.encoder.lt.weight.requires_grad = False
                if opt['lookuptable'] in ['dec_out', 'all']:
                    self.model.decoder.e2s.weight.requires_grad = False
            self.optimizer = optim_class(
                [p for p in self.model.parameters() if p.requires_grad],
                **kwargs)
            if self.states:
                if self.states['optimizer_type'] != opt['optimizer']:
                    print('WARNING: not loading optim state since optim '
                          'class changed.')
                else:
                    self.optimizer.load_state_dict(self.states['optimizer'])

        self.reset()

    def override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overridden key.
        Only override args specific to the model.
        """
        model_args = {'hiddensize', 'embeddingsize', 'numlayers', 'optimizer',
                      'encoder', 'decoder', 'lookuptable', 'attention',
                      'attention_length'}
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v}]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def parse(self, text):
        """Convert string to token indices."""
        return self.dict.txt2vec(text)

    def v2t(self, vec):
        """Convert token indices to string of tokens."""
        if type(vec) == Variable:
            vec = vec.data
        new_vec = []
        for i in vec:
            if i == self.END_IDX:
                break
            elif i != self.START_IDX:
                new_vec.append(i)
        return self.dict.vec2txt(new_vec)

    def zero_grad(self):
        """Zero out optimizer."""
        self.optimizer.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        torch.nn.utils.clip_grad_norm(self.model.parameters(), self.clip)
        self.optimizer.step()

    def reset(self):
        """Reset observation and episode_done."""
        self.observation = None

    def share(self):
        """Share internal states between parent and child instances."""
        shared = super().share()
        shared['answers'] = self.answers
        shared['dict'] = self.dict
        shared['START_IDX'] = self.START_IDX
        shared['END_IDX'] = self.END_IDX
        shared['NULL_IDX'] = self.NULL_IDX
        if self.opt.get('numthreads', 1) > 1:
            shared['model'] = self.model
            self.model.share_memory()
            shared['states'] = self.states
        return shared

    def observe(self, observation):
        """Save observation for act.

        If multiple observations are from the same episode, concatenate them.
        """
        # shallow copy observation (deep copy can be expensive)
        obs = observation.copy()
        batch_idx = self.opt.get('batchindex', 0)
        if not obs.get('preprocessed', False):
            obs['text2vec'] = maintain_dialog_history(
                self.history, obs,
                reply=self.answers[batch_idx],
                historyLength=self.opt['history_length'],
                useReplies=self.opt['history_replies'],
                dict=self.dict,
                useStartEndIndices=False)
        else:
            obs['text2vec'] = deque(obs['text2vec'],
                                    self.opt['history_length'])
        self.observation = obs
        self.answers[batch_idx] = None
        return obs

    def predict(self, xs, ys=None, cands=None, valid_cands=None):
        """Produce a prediction from our model.

        Update the model using the targets if available, otherwise rank
        candidates as well if they are available and param is set.
        """
        is_training = ys is not None
        text_cand_inds, loss_dict = None, None
        if is_training:
            self.model.train()
            self.zero_grad()
            loss = 0
            predictions, scores, _ = self.model(xs, ys)
            loss += self.criterion(scores.view(-1, scores.size(-1)),
                                   ys.view(-1))
            loss.backward()
            self.update_params()
            loss_dict = {'loss': loss.mul(len(xs)).data[0]}
            loss_dict['ppl'] = (math.e**loss).mul(len(xs)).data[0]
        else:
            self.model.eval()
            predictions, scores, text_cand_inds = self.model(
                xs, ys, cands, valid_cands)

        return predictions, text_cand_inds, loss_dict

    def vectorize(self, observations):
        """Convert a list of observations into input & target tensors."""
        ys = None
        xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text(
            observations, self.dict, self.END_IDX, self.NULL_IDX, dq=True,
            eval_labels=False, truncate=self.truncate)
        if xs is None:
            return None, None, None, None, None, None
        if self.use_cuda:
            # copy to gpu
            self.xs.resize_(xs.size())
            self.xs.copy_(xs, non_blocking=True)
            xs = Variable(self.xs)
            if ys is not None:
                self.ys.resize_(ys.size())
                self.ys.copy_(ys, non_blocking=True)
                ys = Variable(self.ys)
        else:
            xs = Variable(xs)
            if ys is not None:
                ys = Variable(ys)

        # set up candidates
        cands = None
        valid_cands = None
        if ys is None and self.rank:
            # only do ranking when no targets available and ranking flag set
            parsed_cs = []
            valid_cands = []
            for i, v in enumerate(valid_inds):
                if 'label_candidates' in observations[v]:
                    # each candidate tuple is a pair of the parsed version
                    # and the original full string
                    cs = list(observations[v]['label_candidates'])
                    curr_dqs = [deque(maxlen=self.truncate) for _ in cs]
                    for dq, c in zip(curr_dqs, cs):
                        dq.extendleft(reversed(self.parse(c)))
                    parsed_cs.append(curr_dqs)
                    valid_cands.append((i, v, cs))
            if len(parsed_cs) > 0:
                # TODO: store lengths of cands separately, so don't have zero
                # padding for varying number of cands per example
                # found cands, pack them into tensor
                max_c_len = max(max(len(c) for c in cs) for cs in parsed_cs)
                max_c_cnt = max(len(cs) for cs in parsed_cs)
                for cs in parsed_cs:
                    for c in cs:
                        c += [self.NULL_IDX] * (max_c_len - len(c))
                    cs += [self.NULL_IDX] * (max_c_cnt - len(cs))
                cands = torch.LongTensor(parsed_cs)
                if self.use_cuda:
                    # copy to gpu
                    self.cands.resize_(cands.size())
                    self.cands.copy_(cands, non_blocking=True)
                    cands = Variable(self.cands)
                else:
                    cands = Variable(cands)

        return xs, ys, labels, valid_inds, cands, valid_cands

    def batch_act(self, observations):
        batchsize = len(observations)
        # initialize a table of replies with this agent's id
        batch_reply = [{'id': self.getID()} for _ in range(batchsize)]

        # convert the observations into batches of inputs and targets
        # valid_inds tells us the indices of all valid examples
        # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1]
        # since the other three elements had no 'text' field
        xs, ys, labels, valid_inds, cands, valid_cands = self.vectorize(
            observations)

        if xs is None:
            # no valid examples, just return empty responses
            return batch_reply

        # produce predictions, train on targets if available
        predictions, text_cand_inds, loss = self.predict(xs, ys, cands,
                                                         valid_cands)
        if loss is not None:
            if 'metrics' in batch_reply[0]:
                for k, v in loss.items():
                    batch_reply[0]['metrics'][k] = v
            else:
                batch_reply[0]['metrics'] = loss

        if ys is not None:
            report_freq = 0
        else:
            report_freq = 0.1
        PaddingUtils.map_predictions(
            predictions, valid_inds, batch_reply, observations, self.dict,
            self.END_IDX, report_freq=report_freq, labels=labels,
            answers=self.answers, ys=ys)

        if text_cand_inds is not None:
            text_cand_inds = text_cand_inds.cpu().data
            for i in range(len(valid_cands)):
                order = text_cand_inds[i]
                _, batch_idx, curr_cands = valid_cands[i]
                curr = batch_reply[batch_idx]
                curr['text_candidates'] = [curr_cands[idx] for idx in order
                                           if idx < len(curr_cands)]

        return batch_reply

    def act(self):
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def save(self, path=None):
        """Save model parameters if model_file is set."""
        path = self.opt.get('model_file', None) if path is None else path
        if path and hasattr(self, 'model'):
            model = {}
            model['model'] = self.model.state_dict()
            model['longest_label'] = self.model.longest_label
            model['optimizer'] = self.optimizer.state_dict()
            model['optimizer_type'] = self.opt['optimizer']
            model['opt'] = self.opt

            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            states = torch.load(read)
        return states['opt'], states
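# A minimal sketch of wiring Seq2seqAgent into a ParlAI world for a single
# training step, assuming the standard -t/--task and -bs/--batchsize flags
# and create_task(opt, agent) as used in build_dict above; the task name is
# a placeholder.
if __name__ == '__main__':
    from parlai.core.params import ParlaiParser
    from parlai.core.worlds import create_task

    parser = ParlaiParser()
    Seq2seqAgent.add_cmdline_args(parser)
    opt = parser.parse_args(['-t', 'babi:task10k:1', '-bs', '1'])
    agent = Seq2seqAgent(opt)
    world = create_task(opt, agent)
    world.parley()  # one observe/act exchange; loop this to train
    print(world.display())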
def add_cmdline_args(parser):
    DictionaryAgent.add_cmdline_args(parser)
    parser.add_argument('-lp', '--length_penalty', type=float, default=0.5,
                        help='length penalty for responses')
def __init__(self, opt, shared=None):
    """Set up model if shared params not set, otherwise no work to do."""
    super().__init__(opt, shared)
    opt = self.opt  # there is a deepcopy in the init
    self.states = {}
    # check for cuda
    self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
    self.batchsize = opt.get('batchsize', 1)
    self.use_person_tokens = opt.get('person_tokens', True)

    if shared:
        # set up shared properties
        self.dict = shared['dict']

        if 'model' in shared:
            # model is shared during hogwild
            self.model = shared['model']
            self.states = shared['states']

        # get NULL token and END token
        self.NULL_IDX = self.dict[self.dict.null_token]
        self.END_IDX = self.dict[self.dict.end_token]

        if self.use_person_tokens:
            # add person1 and person2 tokens
            self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
            self.dict.add_to_dict(self.dict.tokenize("PERSON2"))
    else:
        # this is not a shared instance of this class, so do full init
        if self.use_cuda:
            print('[ Using CUDA ]')
            torch.cuda.set_device(opt['gpu'])

        # check first for 'init_model' for loading model from file
        if opt.get('init_model') and os.path.isfile(opt['init_model']):
            init_model = opt['init_model']
        # next check for 'model_file'
        elif opt.get('model_file') and os.path.isfile(opt['model_file']):
            init_model = opt['model_file']
        else:
            init_model = None

        if init_model is not None:
            # load model parameters if available
            print('Loading existing model params from ' + init_model)
            new_opt, self.states = self.load(init_model)
            # override model-specific options with stored ones
            opt = self.override_opt(new_opt)

        if opt['dict_file'] is None:
            if init_model is not None and os.path.isfile(init_model + '.dict'):
                # check first to see if a dictionary exists
                opt['dict_file'] = init_model + '.dict'
            elif opt.get('model_file'):
                # otherwise, set default dict-file if it is not set
                opt['dict_file'] = opt['model_file'] + '.dict'

        # load dictionary and basic tokens & vectors
        self.dict = DictionaryAgent(opt)
        self.id = 'LanguageModel'

        # get NULL token and END token
        self.NULL_IDX = self.dict[self.dict.null_token]
        self.END_IDX = self.dict[self.dict.end_token]

        if self.use_person_tokens:
            # add person1 and person2 tokens
            self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
            self.dict.add_to_dict(self.dict.tokenize("PERSON2"))

        # set model
        self.model = RNNModel(opt, len(self.dict))

        if self.states:
            # set loaded states if applicable
            self.model.load_state_dict(self.states['model'])

        if self.use_cuda:
            self.model.cuda()

    self.next_observe = []
    self.next_batch = []

    self.is_training = True

    if hasattr(self, 'model'):
        # if model was built, do more setup
        self.clip = opt.get('gradient_clip', 0.25)

        # set up criteria
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX)
        if self.use_cuda:
            # push to cuda
            self.criterion.cuda()

        # set up criterion for eval: we do not want to average over size
        self.eval_criterion = nn.CrossEntropyLoss(
            ignore_index=self.NULL_IDX, size_average=False)
        if self.use_cuda:
            # push to cuda
            self.eval_criterion.cuda()

        # init hidden state
        self.hidden = self.model.init_hidden(self.batchsize)

        # init tensor of end tokens
        self.ends = torch.LongTensor(
            [self.END_IDX for _ in range(self.batchsize)])
        if self.use_cuda:
            self.ends = self.ends.cuda()

        # set up model and learning rate scheduler parameters
        self.lr = opt['learningrate']
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr)
        self.best_val_loss = self.states.get('best_val_loss', None)
        self.lr_factor = opt['lr_factor']
        if self.lr_factor < 1.0:
            self.lr_patience = opt['lr_patience']
            self.lr_min = opt['lr_minimum']
            self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer, factor=self.lr_factor, verbose=True,
                patience=self.lr_patience, min_lr=self.lr_min)
            # initial step for scheduler if self.best_val_loss is initialized
            if self.best_val_loss is not None:
                self.scheduler.step(self.best_val_loss)
        else:
            self.scheduler = None

    self.reset()
def add_cmdline_args(argparser):
    """Add command-line arguments specifically for this agent."""
    DictionaryAgent.add_cmdline_args(argparser)
    agent = argparser.add_argument_group('Fairseq Arguments')
    agent.add_argument(
        '--max-positions', default=1024, type=int, metavar='N',
        help='max number of tokens in the sequence')
    agent.add_argument(
        '--seed', default=1, type=int, metavar='N',
        help='pseudo random number generator seed')
    agent.add_argument(
        '--lr', '--learning-rate', default=0.25, type=float, metavar='LR',
        help='initial learning rate')
    agent.add_argument(
        '--momentum', default=0.99, type=float, metavar='M',
        help='momentum factor')
    agent.add_argument(
        '--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
        help='weight decay')
    agent.add_argument(
        '--force-anneal', '--fa', default=0, type=int, metavar='N',
        help='force annealing at specified epoch')
    agent.add_argument(
        '--beam', default=5, type=int, metavar='N',
        help='beam size')
    agent.add_argument(
        '--no-early-stop', action='store_true',
        help=('continue searching even after finalizing k=beam '
              'hypotheses; this is more correct, but increases '
              'generation time by 50%%'))
    agent.add_argument(
        '--unnormalized', action='store_true',
        help='compare unnormalized hypothesis scores')
    agent.add_argument(
        '--lenpen', default=1, type=float,
        help='length penalty: <1.0 favors shorter, '
             '>1.0 favors longer sentences')
    agent.add_argument(
        '--clip-norm', default=25, type=float, metavar='NORM',
        help='clip threshold of gradients')
    agent.add_argument(
        '--arch', '-a', default='fconv', metavar='ARCH',
        choices=models.arch_model_map.keys(),
        help='model architecture ({})'.format(
            ', '.join(models.arch_model_map.keys())))
    agent.add_argument(
        '--encoder-embed-dim', type=int, metavar='N',
        help='encoder embedding dimension')
    agent.add_argument(
        '--encoder-layers', type=str, metavar='EXPR',
        help='encoder layers [(dim, kernel_size), ...]')
    agent.add_argument(
        '--decoder-embed-dim', type=int, metavar='N',
        help='decoder embedding dimension')
    agent.add_argument(
        '--decoder-layers', type=str, metavar='EXPR',
        help='decoder layers [(dim, kernel_size), ...]')
    agent.add_argument(
        '--decoder-out-embed-dim', type=int, metavar='N',
        help='decoder output embedding dimension')
    agent.add_argument(
        '--decoder-attention', type=str, metavar='EXPR',
        help='decoder attention [True, ...]')
    # These arguments have default values independent of the model:
    agent.add_argument(
        '--dropout', default=0.1, type=float, metavar='D',
        help='dropout probability')
    agent.add_argument(
        '--label-smoothing', default=0, type=float, metavar='D',
        help='epsilon for label smoothing, 0 means no label smoothing')
class ScoringNetAgent(Agent):
    """Agent which takes an input sequence and produces an output sequence.

    For more information, see Sequence to Sequence Learning with Neural
    Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_.
    """

    OPTIM_OPTS = {
        'adadelta': optim.Adadelta,
        'adagrad': optim.Adagrad,
        'adam': optim.Adam,
        'adamax': optim.Adamax,
        'asgd': optim.ASGD,
        'lbfgs': optim.LBFGS,
        'rmsprop': optim.RMSprop,
        'rprop': optim.Rprop,
        'sgd': optim.SGD,
    }

    ENC_OPTS = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM}

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        DictionaryAgent.add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Seq2Seq Arguments')
        agent.add_argument('-hs', '--hiddensize', type=int, default=128,
                           help='size of the hidden layers')
        agent.add_argument('-emb', '--embeddingsize', type=int, default=128,
                           help='size of the token embeddings')
        agent.add_argument('-nl', '--numlayers', type=int, default=2,
                           help='number of hidden layers')
        agent.add_argument('-lr', '--learning_rate', type=float, default=0.5,
                           help='learning rate')
        agent.add_argument('-wd', '--weight_decay', type=float, default=0,
                           help='weight decay')
        agent.add_argument('-dr', '--dropout', type=float, default=0.2,
                           help='dropout rate')
        agent.add_argument('-att', '--attention', default=False, type='bool',
                           help='if True, use attention')
        agent.add_argument('-attType', '--attn-type', default='general',
                           choices=['general', 'concat', 'dot'],
                           help='general=bilinear dot product, '
                                'concat=Bahdanau\'s implementation')
        agent.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
        agent.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
        agent.add_argument('-rc', '--rank-candidates', type='bool',
                           default=False,
                           help='rank candidates if available. this is done '
                                'by computing the mean score per token for '
                                'each candidate and selecting the highest '
                                'scoring.')
        agent.add_argument('-tr', '--truncate', type='bool', default=True,
                           help='truncate input & output lengths to speed up '
                                'training (may reduce accuracy). This fixes '
                                'all input and output to have a maximum '
                                'length and to be similar in length to one '
                                'another by throwing away extra tokens. This '
                                'reduces the total amount of padding in the '
                                'batches.')
        agent.add_argument('-enc', '--encoder', default='gru',
                           choices=ScoringNetAgent.ENC_OPTS.keys(),
                           help='Choose between different encoder modules.')
        agent.add_argument('-bi', '--bi-encoder', default=True, type='bool',
                           help='use a bidirectional encoder')
        agent.add_argument('-dec', '--decoder', default='same',
                           choices=['same', 'shared'] +
                                   list(ScoringNetAgent.ENC_OPTS.keys()),
                           help='Choose between different decoder modules. '
                                'Default "same" uses same class as encoder, '
                                'while "shared" also uses the same weights.')
        agent.add_argument('-opt', '--optimizer', default='sgd',
                           choices=ScoringNetAgent.OPTIM_OPTS.keys(),
                           help='Choose between pytorch optimizers. '
                                'Any member of torch.optim is valid and will '
                                'be used with default params except learning '
                                'rate (as specified by -lr).')
        agent.add_argument('-gradClip', '--grad-clip', type=float,
                           default=-1,
                           help='gradient clip, default = -1 (no clipping)')
        agent.add_argument('-epi', '--episode-concat', type='bool',
                           default=False,
                           help='If multiple observations are from the same '
                                'episode, concatenate them.')
        agent.add_argument('--beam_size', type=int, default=0,
                           help='Beam size for beam search (only for '
                                'generation mode); set 0 for greedy search')
        agent.add_argument('--max_seq_len', type=int, default=50,
                           help='The maximum sequence length, default = 50')
        agent.add_argument('-ptrmodel', '--ptr_model', default='',
                           help='The pretrained model directory')

    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        if not shared:
            # this is not a shared instance of this class, so do full
            # initialization. if shared is set, only set up shared members.

            # check for cuda
            self.use_cuda = (not opt.get('no_cuda')
                             and torch.cuda.is_available())
            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            # if opt.get('model_file') and os.path.isfile(opt['model_file']):
            #     # load model parameters if available
            #     print('Loading existing model params from ' +
            #           opt['model_file'])
            #     new_opt, self.states = self.load(opt['model_file'])
            #     # override options with stored ones
            #     opt = self.override_opt(new_opt)

            if opt.get('ptr_model') and os.path.isfile(opt['ptr_model']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['ptr_model'])
                # TODO: load what?
                new_opt, self.states = self.load(opt['ptr_model'])
                # override options with stored ones
                # opt = self.override_opt(new_opt)

            self.dict = DictionaryAgent(opt)
            self.id = 'ScoringNet'
            # we use START markers to start our output
            self.START = self.dict.start_token
            self.START_TENSOR = torch.LongTensor(self.dict.parse(self.START))
            # we use END markers to end our output
            self.END = self.dict.end_token
            self.END_TENSOR = torch.LongTensor(self.dict.parse(self.END))
            # get index of null token from dictionary (probably 0)
            self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0]

            # store important params directly
            hsz = opt['hiddensize']
            emb = opt['embeddingsize']
            self.hidden_size = hsz
            self.emb_size = emb
            self.num_layers = opt['numlayers']
            self.learning_rate = opt['learning_rate']
            self.rank = opt['rank_candidates']
            self.longest_label = 1
            self.truncate = opt['truncate']
            self.attention = opt['attention']

            # set up tensors
            if self.opt['bi_encoder']:
                self.zeros = torch.zeros(2 * self.num_layers, 1, hsz)
            else:
                self.zeros = torch.zeros(self.num_layers, 1, hsz)
            self.zeros_dec = torch.zeros(self.num_layers, 1, hsz)

            self.xs = torch.LongTensor(1, 1)
            self.ys = torch.LongTensor(1, 1)
            self.neg_ys = torch.LongTensor(1, 1)

            # set up modules
            # self.criterion = nn.NLLLoss(size_average=False, ignore_index=0)
            self.criterion = nn.BCELoss()

            # lookup table stores word embeddings
            self.lt = nn.Embedding(len(self.dict), emb,
                                   padding_idx=self.NULL_IDX)
                                   # scale_grad_by_freq=True)

            # encoder captures the input text
            enc_class = ScoringNetAgent.ENC_OPTS[opt['encoder']]
            self.encoder = enc_class(emb, hsz, opt['numlayers'],
                                     bidirectional=opt['bi_encoder'],
                                     dropout=opt['dropout'])

            # decoder produces our output states
            dec_isz = hsz
            if opt['bi_encoder']:
                dec_isz += hsz

            # linear layer helps us produce outputs from final decoder state
            self.h2o = nn.Linear(dec_isz, dec_isz, bias=False)

            # dropout on the linear layer helps us generalize
            self.dropout = nn.Dropout(opt['dropout'])

            self.use_attention = False
            self.attn = None
            # if attention is enabled, set up additional members
            if self.attention:
                self.use_attention = True
                self.att_type = opt['attn_type']
                input_size = hsz
                if opt['bi_encoder']:
                    input_size += hsz

                if self.att_type == 'concat':
                    self.attn = nn.Linear(input_size + hsz, 1, bias=False)
                elif self.att_type == 'dot':
                    assert not opt['bi_encoder']
                elif self.att_type == 'general':
                    self.attn = nn.Linear(hsz, input_size, bias=False)

            # set up optims for each module
            self.lr = opt['learning_rate']
            self.wd = opt['weight_decay'] != 0

            optim_class = ScoringNetAgent.OPTIM_OPTS[opt['optimizer']]
            self.optims = {
                'lt': optim_class(self.lt.parameters(), lr=self.lr),
                'encoder': optim_class(self.encoder.parameters(),
                                       lr=self.lr),
                'h2o': optim_class(self.h2o.parameters(), lr=self.lr,
                                   weight_decay=self.wd),
            }
            if self.attention and self.attn is not None:
                self.optims.update({
                    'attn': optim_class(self.attn.parameters(), lr=self.lr,
                                        weight_decay=self.wd)
                })

            if hasattr(self, 'states'):
                # set loaded states if applicable
                if opt.get('ptr_model'):
                    self.init_pretrain(self.states)
                else:
                    self.set_states(self.states)

            if self.use_cuda:
                self.cuda()

        self.loss = 0
        self.ndata = 0
        self.loss_valid = 0
        self.ndata_valid = 0

        if opt['beam_size'] > 0:
            self.beamsize = opt['beam_size']

        self.episode_concat = opt['episode_concat']
        self.training = True
        self.generating = False
        self.local_human = False

        self.max_seq_len = opt['max_seq_len']
        self.reset()

    def set_lrate(self, lr):
        self.lr = lr
        for key in self.optims:
            self.optims[key].param_groups[0]['lr'] = self.lr

    def override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overridden key.
        Only override args specific to the model.
        """
        model_args = {'hiddensize', 'embeddingsize', 'numlayers',
                      'optimizer', 'encoder'}
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v}]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def parse(self, text):
        """Convert string to token indices."""
        return self.dict.txt2vec(text)

    def v2t(self, vec):
        """Convert token indices to string of tokens."""
        return self.dict.vec2txt(vec)

    def cuda(self):
        """Push parameters to the GPU."""
        self.START_TENSOR = self.START_TENSOR.cuda(non_blocking=True)
        self.END_TENSOR = self.END_TENSOR.cuda(non_blocking=True)
        self.zeros = self.zeros.cuda(non_blocking=True)
        self.zeros_dec = self.zeros_dec.cuda(non_blocking=True)
        self.xs = self.xs.cuda(non_blocking=True)
        self.ys = self.ys.cuda(non_blocking=True)
        self.neg_ys = self.neg_ys.cuda(non_blocking=True)
        self.criterion.cuda()
        self.lt.cuda()
        self.encoder.cuda()
        self.h2o.cuda()
        self.dropout.cuda()
        if self.use_attention:
            self.attn.cuda()

    def hidden_to_idx(self, hidden, dropout=False):
        """Convert hidden state vectors into indices into the dictionary."""
        if hidden.size(0) > 1:
            raise RuntimeError('bad dimensions of tensor:', hidden)
        hidden = hidden.squeeze(0)
        if dropout:
            hidden = self.dropout(hidden)  # dropout over the last hidden
        scores = self.h2o(hidden)
        scores = F.log_softmax(scores)
        _max_score, idx = scores.max(1)
        return idx, scores

    def zero_grad(self):
        """Zero out optimizers."""
        for optimizer in self.optims.values():
            optimizer.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        for optimizer in self.optims.values():
            optimizer.step()

    def reset(self):
        """Reset observation and episode_done."""
        self.observation = None
        self.episode_done = True

    def preprocess(self, reply_text):
        # preprocess for opensub
        reply_text = reply_text.replace('\\n', '\n')  # TODO: pre-processing
        reply_text = reply_text.replace("'m", " 'm")
        reply_text = reply_text.replace("'ve", " 've")
        reply_text = reply_text.replace("'s", " 's")
        reply_text = reply_text.replace("'t", " 't")
        reply_text = reply_text.replace("'il", " 'il")
        reply_text = reply_text.replace("'d", " 'd")
        reply_text = reply_text.replace("'re", " 're")
        reply_text = reply_text.lower().strip()
        return reply_text

    def observe(self, observation):
        """Save observation for act.

        If multiple observations are from the same episode, concatenate them.
        """
        if self.local_human:
            observation = {}
            observation['id'] = self.getID()
            reply_text = input("Enter Your Message: ")
            reply_text = self.preprocess(reply_text)
            observation['episode_done'] = True  # TODO: for history
            observation['text'] = reply_text
            reply_text = input("Enter a label: ")
            observation['labels'] = self.preprocess(reply_text)
            reply_text = input("Enter a candidate: ")
            observation['cands'] = self.preprocess(reply_text)
        else:
            # shallow copy observation (deep copy can be expensive)
            observation = observation.copy()
            if not self.episode_done and self.episode_concat:
                # if the last example wasn't the end of an episode, then we
                # need to recall what was said in that example
                prev_dialogue = self.observation['text']
                # TODO: the data is concatenated!
                observation['text'] = (prev_dialogue + '\n' +
                                       observation['text'])
        self.observation = observation
        self.episode_done = observation['episode_done']
        return observation

    def _encode(self, xs, xlen, dropout=False, packed=True):
        """Call encoder and return output and hidden states."""
        batchsize = len(xs)

        # first encode context
        xes = self.lt(xs).transpose(0, 1)
        # if dropout:
        #     xes = self.dropout(xes)

        # initial hidden
        if self.zeros.size(1) != batchsize:
            if self.opt['bi_encoder']:
                self.zeros.resize_(2 * self.num_layers, batchsize,
                                   self.hidden_size).fill_(0)
            else:
                self.zeros.resize_(self.num_layers, batchsize,
                                   self.hidden_size).fill_(0)

        h0 = Variable(self.zeros.fill_(0))

        # forward
        if packed:
            xes = torch.nn.utils.rnn.pack_padded_sequence(xes, xlen)

        if type(self.encoder) == nn.LSTM:
            # note: we can put None instead of (h0, h0)
            encoder_output, _ = self.encoder(xes, (h0, h0))
        else:
            encoder_output, _ = self.encoder(xes, h0)

        if packed:
            encoder_output, _ = torch.nn.utils.rnn.pad_packed_sequence(
                encoder_output)

        encoder_output = encoder_output.transpose(0, 1)  # batch first

        # if self.use_attention:
        #     if encoder_output.size(1) > self.max_length:
        #         offset = encoder_output.size(1) - self.max_length
        #         encoder_output = encoder_output.narrow(1, offset,
        #                                                self.max_length)

        return encoder_output

    def _apply_attention(self, word_input, encoder_output, last_hidden, xs):
        """Apply attention to encoder hidden layer."""
        batch_size = encoder_output.size(0)
        enc_length = encoder_output.size(1)
        mask = Variable(xs.data.eq(0).eq(0).float())

        # encoder_output: B x T x 2H
        # last_hidden: B x H

        if self.att_type == 'concat':
            last_hidden = last_hidden.unsqueeze(1).expand(
                batch_size, encoder_output.size(1),
                self.hidden_size)  # B x T x H
            attn_weights = F.tanh(self.attn(
                torch.cat((encoder_output, last_hidden), 2)
                .view(batch_size * enc_length, -1))
                .view(batch_size, enc_length))
        elif self.att_type == 'dot':
            attn_weights = F.tanh(torch.bmm(
                encoder_output, last_hidden.unsqueeze(2)).squeeze())
        elif self.att_type == 'general':
            attn_weights = F.tanh(torch.bmm(
                encoder_output,
                self.attn(last_hidden).unsqueeze(2)).squeeze())

        # attn_weights = F.softmax(attn_weights.view(batch_size, enc_length))
        attn_weights = attn_weights.exp().mul(mask)
        denom = attn_weights.sum(1).unsqueeze(1).expand_as(attn_weights)
        attn_weights = attn_weights.div(denom)
        context = torch.bmm(attn_weights.unsqueeze(1),
                            encoder_output).squeeze(1)
        output = torch.cat((word_input, context.unsqueeze(0)), 2)
        return output

    def _get_context(self, batchsize, xlen_t, encoder_output):
        """Return initial hidden of decoder and encoder context
        (last_state)."""
        # the initial state of the decoder is the hidden (last states) of
        # the encoder --> put zero!
        if self.zeros_dec.size(1) != batchsize:
            self.zeros_dec.resize_(self.num_layers, batchsize,
                                   self.hidden_size).fill_(0)
        hidden = Variable(self.zeros_dec.fill_(0))

        last_state = None
        if not self.use_attention:
            last_state = torch.gather(
                encoder_output, 1,
                xlen_t.view(-1, 1, 1).expand(encoder_output.size(0), 1,
                                             encoder_output.size(2)))
            if self.opt['bi_encoder']:
                last_state = torch.cat(
                    (encoder_output[:, 0, self.hidden_size:],
                     last_state[:, 0, :self.hidden_size]), 1)

        return hidden, last_state

    def predict(self, xs, xlen, x_idx, ys, ylen, y_idx,
                nys=None, nylen=None, ny_idx=None):
        """Produce a prediction from our model.

        Update the model using the targets if available, otherwise rank
        candidates as well if they are available.
        """
        self._training(self.training)

        self.zero_grad()
        batchsize = len(xs)
        # text_cand_inds = None
        # target_exist = ys is not None

        xlen_t = Variable(torch.LongTensor(xlen) - 1)
        ylen_t = Variable(torch.LongTensor(ylen) - 1)
        if self.use_cuda:
            xlen_t = xlen_t.cuda()
            ylen_t = ylen_t.cuda()

        _, x_idx_t = torch.LongTensor(x_idx).sort(0)
        _, y_idx_t = torch.LongTensor(y_idx).sort(0)
        if self.use_cuda:
            x_idx_t = x_idx_t.cuda()
            y_idx_t = y_idx_t.cuda()

        if ny_idx is not None:
            nylen_t = Variable(torch.LongTensor(nylen) - 1)
            _, ny_idx_t = torch.LongTensor(ny_idx).sort(0)
            if self.use_cuda:
                nylen_t = nylen_t.cuda()
                ny_idx_t = ny_idx_t.cuda()

        # Encoding
        _, enc_x = self._get_context(
            batchsize, xlen_t,
            self._encode(xs, xlen, dropout=self.training))
        _, enc_y = self._get_context(
            batchsize, ylen_t,
            self._encode(ys, ylen, dropout=self.training))

        # Permute
        enc_x = enc_x[x_idx_t, :]
        enc_y = enc_y[y_idx_t, :]

        target = Variable(torch.Tensor(batchsize).zero_())

        if ny_idx is not None:
            _, enc_ny = self._get_context(
                batchsize, nylen_t,
                self._encode(nys, nylen, dropout=self.training))
            enc_ny = enc_ny[ny_idx_t, :]

            # make batch
            enc_x = torch.cat((enc_x, enc_x), 0)
            enc_y = torch.cat((enc_y, enc_ny), 0)
            target = torch.cat((target, target + 1), 0)

        if self.use_cuda:
            target = target.cuda()

        # calculate the score
        output = F.sigmoid(torch.bmm(
            enc_y.unsqueeze(1),
            self.h2o(enc_x).unsqueeze(1).transpose(1, 2)))

        # loss
        loss = self.criterion(output.squeeze(), target)

        if self.training:
            self.ndata += batchsize
            self.loss = loss
        else:
            self.ndata_valid += batchsize
            self.loss_valid += loss.data[0] * batchsize

        # list of output tokens for each example in the batch
        if self.training:
            self.loss.backward()
            if self.opt['grad_clip'] > 0:
                torch.nn.utils.clip_grad_norm(self.lt.parameters(),
                                              self.opt['grad_clip'])
                torch.nn.utils.clip_grad_norm(self.h2o.parameters(),
                                              self.opt['grad_clip'])
                torch.nn.utils.clip_grad_norm(self.encoder.parameters(),
                                              self.opt['grad_clip'])
            self.update_params()

        self.display_predict(xs[x_idx_t[0], :], ys[y_idx_t[0], :],
                             nys[ny_idx_t[0], :], target, output, batchsize,
                             freq=0.05)

        return self.loss, output.squeeze()

    def display_predict(self, xs, ys, nys, target, output, batchsize,
                        freq=0.01):
        if random.random() < freq:
            # sometimes output a prediction for debugging
            print('\n    input:',
                  self.dict.vec2txt(xs.data.cpu()).replace(
                      self.dict.null_token + ' ', ''),
                  '\n    positive:',
                  ' {0:.2e} '.format(output[0].data.cpu()[0, 0]),
                  self.dict.vec2txt(ys.data.cpu()).replace(
                      self.dict.null_token + ' ', ''),
                  '\n    negative:',
                  ' {0:.2e} '.format(output[batchsize].data.cpu()[0, 0]),
                  self.dict.vec2txt(nys.data.cpu()).replace(
                      self.dict.null_token + ' ', ''),
                  '\n')

    def txt2tensor(self, parsed, batchsize):
        max_x_len = max([len(x) for x in parsed])
        if self.truncate:
            # shrink xs to limit batch computation
            max_x_len = min(max_x_len, self.max_seq_len)
            parsed = [x[-max_x_len:] for x in parsed]

        # sorting for unpack in encoder
        parsed_x = sorted(enumerate(parsed), key=lambda p: len(p[1]),
                          reverse=True)
        x_idx, parsed_x = zip(*parsed_x)
        x_idx = list(x_idx)
        xlen = [len(x) for x in parsed_x]
        xs = torch.LongTensor(batchsize, max_x_len).fill_(0)
        for i, x in enumerate(parsed_x):
            for j, idx in enumerate(x):
                xs[i][j] = idx
        if self.use_cuda:
            # copy to gpu
            self.xs.resize_(xs.size())
            self.xs.copy_(xs, non_blocking=True)
            xs = Variable(self.xs)
        else:
            xs = Variable(xs)

        return xs, xlen, x_idx

    def batchify(self, observations):
        """Convert a list of observations into input & target tensors."""
        # valid examples
        exs = [ex for ex in observations if 'text' in ex]
        # the indices of the valid (non-empty) tensors
        valid_inds = [i for i, ex in enumerate(observations)
                      if 'text' in ex]

        # set up the input tensors
        batchsize = len(exs)

        # tokenize the text
        xs = None
        xlen = None
        x_idx = None
        if batchsize > 0:
            parsed = [self.dict.parse(self.START) + self.parse(ex['text']) +
                      self.dict.parse(self.END) for ex in exs]
            xs, xlen, x_idx = self.txt2tensor(parsed, batchsize)

        # set up the target tensors (positive examples)
        ys = None
        ylen = None
        y_idx = None
        if batchsize > 0 and (any(['labels' in ex for ex in exs]) or
                              any(['eval_labels' in ex for ex in exs])):
            # randomly select one of the labels to update on, if multiple
            # append END to each label
            if any(['labels' in ex for ex in exs]):
                labels = [self.START + ' ' +
                          random.choice(ex.get('labels', [''])) +
                          ' ' + self.END for ex in exs]
            else:
                labels = [self.START + ' ' +
                          random.choice(ex.get('eval_labels', [''])) +
                          ' ' + self.END for ex in exs]
            parsed_y = [self.parse(y) for y in labels]
            ys, ylen, y_idx = self.txt2tensor(parsed_y, batchsize)

        # set up candidates (negative samples, randomly selected!)
        neg_ys = None
        neg_ylen = None
        ny_idx = None
        if batchsize > 0:
            cands = None
            for i in range(len(exs)):
                if exs[i].get('label_candidates') is not None:
                    cands = list(exs[i]['label_candidates'])
                    break
            if cands is None:
                # TODO: the same index should not be selected
                if any(['labels' in ex for ex in exs]):
                    cands = [ex['labels'][0] for ex in exs]
                else:
                    cands = [ex['eval_labels'][0] for ex in exs]
            # randomly select one of the labels to update on, if multiple
            # append END to each label
            parsed_ny = [self.dict.parse(self.START) +
                         self.parse(random.choice(cands)) +
                         self.dict.parse(self.END) for ex in exs]
            neg_ys, neg_ylen, ny_idx = self.txt2tensor(parsed_ny, batchsize)

        return (xs, xlen, x_idx, ys, ylen, y_idx, valid_inds,
                neg_ys, neg_ylen, ny_idx)

    def batch_act(self, observations):
        batchsize = len(observations)
        # initialize a table of replies with this agent's id
        batch_reply = [{'id': self.getID()} for _ in range(batchsize)]

        # convert the observations into batches of inputs and targets
        # valid_inds tells us the indices of all valid examples
        # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1]
        # since the other three elements had no 'text' field
        (xs, xlen, x_idx, ys, ylen, y_idx, valid_inds,
         neg_ys, neg_ylen, ny_idx) = self.batchify(observations)

        if xs is None:
            # no valid examples, just return the empty responses we set up
            return batch_reply

        # separate: test code / train code
        loss = self.predict(xs, xlen, x_idx, ys, ylen, y_idx,
                            neg_ys, neg_ylen, ny_idx)

        return batch_reply

    def act(self):
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def act_scoring_test(self):
        # see ../../bot_code/CC_scoring.py
        x = self.observation['text']
        y = self.observation['labels']
        batchsize = len(x)
        parsed = [self.dict.parse(self.START) + self.parse(ex) +
                  self.dict.parse(self.END) for ex in x]
        xs, xlen, x_idx = self.txt2tensor(parsed, batchsize)
        labels = [self.dict.parse(self.START) + self.parse(ex) +
                  self.dict.parse(self.END) for ex in y]
        ys, ylen, y_idx = self.txt2tensor(labels, batchsize)
        loss, output = self.predict(xs, xlen, x_idx, ys, ylen, y_idx)
        return output.data

    def save(self, path=None):
        path = self.opt.get('model_file', None) if path is None else path

        if path and hasattr(self, 'lt'):
            model = {}
            model['lt'] = self.lt.state_dict()
            model['encoder'] = self.encoder.state_dict()
            model['h2o'] = self.h2o.state_dict()
            if self.use_attention:
                model['attn'] = self.attn.state_dict()
            model['optims'] = {k: v.state_dict()
                               for k, v in self.optims.items()}
            model['longest_label'] = self.longest_label
            model['opt'] = self.opt

            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            model = torch.load(read)
        return model['opt'], model

    def set_states(self, states):
        """Set the state dicts of the modules from saved states."""
        self.lt.load_state_dict(states['lt'])
        self.encoder.load_state_dict(states['encoder'])
        # self.h2o.load_state_dict(states['h2o'])
        if self.use_attention:
            self.attn.load_state_dict(states['attn'])
        for k, v in states['optims'].items():
            self.optims[k].load_state_dict(v)
        self.longest_label = states['longest_label']

    def init_pretrain(self, states):
        """Set the state dicts of the modules from saved states."""
        self.lt.load_state_dict(states['lt'])
        self.encoder.load_state_dict(states['encoder'])
        # self.h2o.load_state_dict(states['h2o'])
        # if self.use_attention:
        #     self.attn.load_state_dict(states['attn'])
        # for k, v in states['optims'].items():
        #     self.optims[k].load_state_dict(v)
        # self.longest_label = states['longest_label']

    def report(self):
        m = {}
        if not self.generating:
            if self.training:
                m['loss'] = self.loss.data[0]
                m['ndata'] = self.ndata
            else:
                m['loss'] = self.loss_valid / self.ndata_valid
                m['ndata'] = self.ndata_valid

            m['lr'] = self.lr
            self.print_weight_state()

        return m

    def reset_valid_report(self):
        self.ndata_valid = 0
        self.loss_valid = 0

    def print_weight_state(self):
        self._print_grad_weight(getattr(self, 'lt').weight, 'lookup')
        for module in {'encoder'}:
            layer = getattr(self, module)
            for weights in layer._all_weights:
                for weight_name in weights:
                    self._print_grad_weight(getattr(layer, weight_name),
                                            module + ' ' + weight_name)
        self._print_grad_weight(getattr(self, 'h2o').weight, 'h2o')
        if self.use_attention:
            self._print_grad_weight(getattr(self, 'attn').weight, 'attn')

    def _print_grad_weight(self, weight, module_name):
        if weight.dim()
== 2: nparam = weight.size(0) * weight.size(1) norm_w = weight.norm(2).pow(2) norm_dw = weight.grad.norm(2).pow(2) print('{:30}'.format(module_name) + ' {:5} x{:5}'.format(weight.size(0), weight.size(1)) + ' : w {0:.2e} | '.format((norm_w / nparam).sqrt().data[0]) + 'dw {0:.2e}'.format((norm_dw / nparam).sqrt().data[0])) def _training(self, training=True): for module in {'encoder', 'lt', 'h2o', 'attn'}: layer = getattr(self, module) if layer is not None: layer.training = training
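# --- Illustrative sketch, not part of the original agent ---
# The manual normalization in `_apply_attention` above (exponentiate the
# scores, zero out padded positions with a mask, divide by the row sum) is a
# softmax restricted to real tokens. A minimal standalone version of that
# idea; tensor values here are made up:
import torch

def masked_softmax(scores, mask):
    """Softmax over the last dim that gives zero weight where mask == 0."""
    weights = scores.exp() * mask            # padding contributes nothing
    return weights / weights.sum(dim=-1, keepdim=True)

scores = torch.tensor([[0.5, 1.2, -0.3, 0.0]])
mask = torch.tensor([[1.0, 1.0, 1.0, 0.0]])  # last position is padding
print(masked_softmax(scores, mask))          # rows sum to 1 over real tokens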
class LanguageModelAgent(Agent): """ Agent which trains an RNN on a language modeling task. It is adapted from the language model featured in Pytorch's examples repo here: <https://github.com/pytorch/examples/tree/master/word_language_model>. """ @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" argparser.set_defaults(batch_sort=False) agent = argparser.add_argument_group('Language Model Arguments') agent.add_argument( '--init-model', type=str, default=None, help='load dict/features/weights/opts from this file') agent.add_argument('-hs', '--hiddensize', type=int, default=200, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=200, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-dr', '--dropout', type=float, default=0.2, help='dropout rate') agent.add_argument('-clip', '--gradient-clip', type=float, default=0.25, help='gradient clipping') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument( '-rnn', '--rnn-class', default='LSTM', help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)') agent.add_argument('-sl', '--seq-len', type=int, default=35, help='sequence length') agent.add_argument('-tied', '--emb-tied', action='store_true', help='tie the word embedding and softmax weights') agent.add_argument('-seed', '--random-seed', type=int, default=1111, help='random seed') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-tr', '--truncate-pred', type=int, default=50, help='truncate predictions') agent.add_argument('-rf', '--report-freq', type=float, default=0.1, help='report frequency of prediction during eval') agent.add_argument('-pt', '--person-tokens', type='bool', default=True, help='append person1 and person2 tokens to text') # learning rate parameters agent.add_argument('-lr', '--learningrate', type=float, default=20, help='initial learning rate') agent.add_argument( '-lrf', '--lr-factor', type=float, default=1.0, help='mutliply learning rate by this factor when the \ validation loss does not decrease') agent.add_argument('-lrp', '--lr-patience', type=int, default=10, help='wait before decreasing learning rate') agent.add_argument('-lrm', '--lr-minimum', type=float, default=0.1, help='minimum learning rate') agent.add_argument( '-sm', '--sampling-mode', type='bool', default=False, help='sample when generating tokens instead of taking \ the max and do not produce UNK token (when bs=1)') LanguageModelAgent.dictionary_class().add_cmdline_args(argparser) return agent def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init self.metrics = { 'loss': 0, 'num_tokens': 0, 'lmloss': 0, 'lm_num_tokens': 0 } self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() self.batchsize = opt.get('batchsize', 1) self.use_person_tokens = opt.get('person_tokens', True) self.sampling_mode = opt.get('sampling_mode', False) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] 
self.metrics = shared['metrics'] # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] if self.use_person_tokens: # add person1 and person2 tokens self.dict.add_to_dict(self.dict.tokenize("PERSON1")) self.dict.add_to_dict(self.dict.tokenize("PERSON2")) else: # this is not a shared instance of this class, so do full init if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) init_model = None # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file', this would override init_model if opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] # for backwards compatibility: will only be called for older models # for which .opt file does not exist if (init_model is not None and not os.path.isfile(init_model + '.opt')): new_opt = self.load_opt(init_model) # load model parameters if available print('[ Setting opt from {} ]'.format(init_model)) # since .opt file does not exist, save one for future use print("Saving opt file at:", init_model + ".opt") with open(init_model + ".opt", 'wb') as handle: pickle.dump(new_opt, handle, protocol=pickle.HIGHEST_PROTOCOL) opt = self.override_opt(new_opt) if ((init_model is not None and os.path.isfile(init_model + '.dict')) or opt['dict_file'] is None): opt['dict_file'] = init_model + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'LanguageModel' # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] if self.use_person_tokens: # add person1 and person2 tokens self.dict.add_to_dict(self.dict.tokenize("PERSON1")) self.dict.add_to_dict(self.dict.tokenize("PERSON2")) # set model self.model = RNNModel(opt, len(self.dict)) if init_model is not None: self.load(init_model) if self.use_cuda: self.model.cuda() self.next_observe = [] self.next_batch = [] self.is_training = True if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt.get('gradient_clip', 0.25) # set up criteria self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: # push to cuda self.criterion.cuda() # init hidden state self.hidden = self.model.init_hidden(self.batchsize) # init tensor of end tokens self.ends = torch.LongTensor( [self.END_IDX for _ in range(self.batchsize)]) if self.use_cuda: self.ends = self.ends.cuda() # set up model and learning rate scheduler parameters self.lr = opt['learningrate'] self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr) self.best_val_loss = self.states.get('best_val_loss', None) self.lr_factor = opt['lr_factor'] if self.lr_factor < 1.0: self.lr_patience = opt['lr_patience'] self.lr_min = opt['lr_minimum'] self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, factor=self.lr_factor, verbose=True, patience=self.lr_patience, min_lr=self.lr_min) # initial step for scheduler if self.best_val_loss is initialized if self.best_val_loss is not None: self.scheduler.step(self.best_val_loss) else: self.scheduler = None self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overriden key. Only override args specific to the model. 
""" model_args = { 'hiddensize', 'embeddingsize', 'numlayers', 'dropout', 'seq_len', 'emb_tied', 'truncate_pred', 'report_freq', 'person_tokens', 'learningrate' } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v}]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def zero_grad(self): """Zero out optimizer.""" self.optimizer.zero_grad() def update_params(self): """Do one optimization step.""" torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip) self.optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.reset_metrics() def reset_metrics(self): self.metrics.clear() self.metrics['loss'] = 0 self.metrics['lmloss'] = 0 self.metrics['num_tokens'] = 0 self.metrics['lm_num_tokens'] = 0 def report(self): m = {} if self.metrics['num_tokens'] > 0: m['loss'] = self.metrics['loss'] / self.metrics['num_tokens'] m['ppl'] = math.exp(m['loss']) if self.metrics['lm_num_tokens'] > 0: m['lmloss'] = self.metrics['lmloss'] / self.metrics['lm_num_tokens'] m['lmppl'] = math.exp(m['lmloss']) for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats m[k] = round_sigfigs(v, 4) return m def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['opt'] = self.opt shared['dict'] = self.dict shared['NULL_IDX'] = self.NULL_IDX shared['END_IDX'] = self.END_IDX shared['metrics'] = self.metrics shared['model'] = self.model self.model.share_memory() shared['states'] = { # only need to pass optimizer states 'optimizer': self.optimizer.state_dict(), } return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. 
""" #shallow copy observation (deep copy can be expensive) obs = observation.copy() seq_len = self.opt['seq_len'] is_training = True if 'labels' not in obs: is_training = False if is_training: if 'text' in obs: if self.use_person_tokens: obs['text'] = 'PERSON1 ' + obs['text'] vec = self.parse(obs['text']) vec.append(self.END_IDX) self.next_observe += vec if 'labels' in obs: if self.use_person_tokens: labels = [ 'PERSON2 ' + label for label in obs['labels'] if label != '' ] obs['labels'] = tuple(labels) vec = self.parse(obs['labels'][0]) vec.append(self.END_IDX) self.next_observe += vec if len(self.next_observe) < (seq_len + 1): # not enough to return to make a batch # we handle this case in vectorize # labels indicates that we are training self.observation = {'labels': ''} return self.observation else: vecs_to_return = [] total = len(self.next_observe) // (seq_len + 1) for _ in range(total): observe = self.next_observe[:(seq_len + 1)] self.next_observe = self.next_observe[(seq_len + 1):] vecs_to_return.append(observe) dict_to_return = { 'text': '', 'labels': '', 'text2vec': vecs_to_return } self.observation = dict_to_return return dict_to_return else: if 'text' in obs: if self.use_person_tokens: obs['text'] = 'PERSON1 ' + obs['text'] if 'eval_labels' in obs: if self.use_person_tokens: eval_labels = [ 'PERSON2 ' + label for label in obs['eval_labels'] if label != '' ] obs['eval_labels'] = tuple(eval_labels) self.observation = obs return obs def repackage_hidden(self, h): """Wraps hidden states in new Variables, to detach them from their history.""" if isinstance(h, Variable): return Variable(h.data) else: return tuple(self.repackage_hidden(v) for v in h) def get_target_loss(self, data, hidden, targets): """Calculates the loss with respect to the targets, token by token, where each output token is conditioned on either the input or the previous target token. """ loss = 0.0 bsz = data.size(0) # during interactive mode, when no targets exist, we return 0 if targets is None: return loss # feed in inputs without end token output, hidden = self.model(data.transpose(0, 1), hidden) self.hidden = self.repackage_hidden(hidden) # feed in end tokens output, hidden = self.model(Variable(self.ends[:bsz].view(1, bsz)), self.hidden) self.hidden = self.repackage_hidden(hidden) output_flat = output.view(-1, len(self.dict)) loss += self.criterion(output_flat, targets.select(1, 0).view(-1)).data for i in range(1, targets.size(1)): output, hidden = self.model(targets.select(1, i - 1).view(1, bsz), self.hidden, no_pack=True) self.hidden = self.repackage_hidden(hidden) output_flat = output.view(-1, len(self.dict)) loss += self.criterion(output_flat, targets.select(1, i).view(-1)).data return loss def get_predictions(self, data): """Generates predictions word by word until we either reach the end token or some max length (opt['truncate_pred']). 
""" token_list = [] bsz = data.size(0) done = [False for _ in range(bsz)] total_done = 0 hidden = self.model.init_hidden(bsz) i = 0 while total_done < bsz and i <= self.opt['truncate_pred']: if i == 0: # feed in input without end tokens output, hidden = self.model(data.transpose(0, 1), hidden) hidden = self.repackage_hidden(hidden) # feed in end tokens output, hidden = self.model( Variable(self.ends[:bsz].view(1, bsz)), hidden) else: output, hidden = self.model(Variable(word_idx.view(1, bsz)), hidden, no_pack=True) hidden = self.repackage_hidden(hidden) word_weights = output.squeeze().data.exp() if bsz > 1: _, word_idx = torch.max(word_weights, 1) else: if self.sampling_mode: unk_idx = self.dict[self.dict.unk_token] # make word_weights have smaller norm so that calculated # norm does not blow up word_weights = word_weights.div(1e10) # make word_weights have L2 norm 1 ww_norm = torch.norm(word_weights, p=2) word_weights = word_weights.div(ww_norm) # square distribution word_weights = torch.mul(word_weights, word_weights) # sample distribution word_idx = torch.multinomial(word_weights, 1) # do not produce UNK token while word_idx == unk_idx: word_idx = torch.multinomial(word_weights, 1) else: _, word_idx = torch.max(word_weights, 0) # mark end indices for items in batch word_idx = word_idx.view(-1) for k in range(word_idx.size(0)): if not done[k]: if int(word_idx[k]) == self.END_IDX: done[k] = True total_done += 1 token_list.append(word_idx.view(bsz, 1)) i += 1 return torch.cat(token_list, 1) def predict(self, data, hidden, targets=None, is_training=True, y_lens=None): """Produce a prediction from our model.""" output = None predictions = None if is_training: self.model.train() self.zero_grad() output, hidden = self.model(data, hidden) loss = self.criterion(output.view(-1, len(self.dict)), targets.view(-1)) # save loss to metrics target_tokens = targets.ne(self.NULL_IDX).float().sum().item() self.metrics['lmloss'] += loss.double().item() self.metrics['lm_num_tokens'] += target_tokens # average loss per token loss /= target_tokens loss.backward(retain_graph=True) self.update_params() else: self.model.eval() predictions = self.get_predictions(data) bsz = data.size(0) if bsz != self.batchsize: self.hidden = self.model.init_hidden(bsz) if targets is not None: loss = self.get_target_loss(data, self.hidden, targets) self.metrics['loss'] += loss self.metrics['num_tokens'] += sum(y_lens) return output, hidden, predictions def vectorize(self, observations, seq_len, is_training): """Convert a list of observations into input & target tensors.""" labels = None valid_inds = None y_lens = None if is_training: for obs in observations: if obs: if 'text2vec' in obs: self.next_batch += obs['text2vec'] if len(self.next_batch) <= self.batchsize: return None, None, None, None, None else: data_list = [] targets_list = [] # total is the number of batches total = len(self.next_batch) // self.batchsize for i in range(total): batch = self.next_batch[:self.batchsize] self.next_batch = self.next_batch[self.batchsize:] source = torch.LongTensor(batch).t().contiguous() data = Variable(source[:seq_len]) targets = Variable(source[1:]) if self.use_cuda: data = data.cuda() targets = targets.cuda() data_list.append(data) targets_list.append(targets) else: # here we get valid examples and pad them with zeros xs, ys, labels, valid_inds, _, y_lens = PaddingUtils.pad_text( observations, self.dict, end_idx=self.END_IDX, null_idx=self.NULL_IDX) if self.use_cuda: if xs is not None: xs = Variable(torch.LongTensor(xs)).cuda() if ys is 
not None: ys = Variable(torch.LongTensor(ys)).cuda() else: if xs is not None: xs = Variable(torch.LongTensor(xs)) if ys is not None: ys = Variable(torch.LongTensor(ys)) data_list = [xs] targets_list = [ys] return data_list, targets_list, labels, valid_inds, y_lens def batch_act(self, observations): batch_reply = [{'id': self.getID()} for _ in range(len(observations))] if any(['labels' in obs for obs in observations]): # if we are starting a new training epoch, reinitialize hidden if self.is_training == False: self.hidden = self.model.init_hidden(self.batchsize) self.is_training = True data_list, targets_list, _, _, y_lens = self.vectorize( observations, self.opt['seq_len'], self.is_training) else: # if we just finished training, reinitialize hidden if self.is_training == True: self.hidden = self.model.init_hidden(self.batchsize) self.is_training = False data_list, targets_list, labels, valid_inds, y_lens = self.vectorize( observations, self.opt['seq_len'], self.is_training) if data_list is None: # not enough data to batch act yet, return empty responses return batch_reply batch_reply = [] # during evaluation, len(data_list) is always 1 # during training, len(dat_list) >= 0: vectorize returns a list containing all batches available at the time it is called for i in range(len(data_list)): temp_dicts = [{ 'id': self.getID() } for _ in range(len(observations))] # ignore case when we do not return any valid indices if data_list[i] is not None: output, hidden, predictions = self.predict( data_list[i], self.hidden, targets_list[i], self.is_training, y_lens) self.hidden = self.repackage_hidden(hidden) if predictions is not None: # map predictions back to the right order PaddingUtils.map_predictions( predictions.cpu(), valid_inds, temp_dicts, observations, self.dict, self.END_IDX, report_freq=self.opt['report_freq']) batch_reply += temp_dicts # for prediction metrics computations, we get rid of PERSON1 and PERSON2 tokens if not self.is_training: for reply in batch_reply: if 'text' in reply: reply['text'] = reply['text'].replace('PERSON1 ', '') reply['text'] = reply['text'].replace('PERSON2 ', '') return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): model = {} model['model'] = self.model.state_dict() model['opt'] = self.opt model['best_val_loss'] = self.best_val_loss with open(path, 'wb') as write: torch.save(model, write) # save opt file with open(path + ".opt", 'wb') as handle: pickle.dump(self.opt, handle, protocol=pickle.HIGHEST_PROTOCOL) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def receive_metrics(self, metrics_dict): if 'loss' in metrics_dict and self.scheduler is not None: self.scheduler.step(metrics_dict['loss']) def load_opt(self, path): """Return opt, states.""" states = torch.load(path, map_location=lambda cpu, _: cpu) return states['opt'] def load(self, path): """Load model states.""" if os.path.isfile(path): # load model parameters if available print('[ Loading existing model params from {} ]'.format(path)) self.states = torch.load(path, map_location=lambda cpu, _: cpu) self.model.load_state_dict(self.states['model'])
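# --- Illustrative sketch, not part of the original agent ---
# `report()` above turns the accumulated token-level cross-entropy (summed,
# since the criterion is built with size_average=False) into perplexity via
# exp(total_loss / num_tokens). The bookkeeping in isolation, with made-up
# numbers:
import math

metrics = {'loss': 412.7, 'num_tokens': 100}  # summed NLL over 100 tokens
avg_nll = metrics['loss'] / metrics['num_tokens']
ppl = math.exp(avg_nll)
print(round(ppl, 2))  # ~61.98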
    def __init__(self, opt, shared=None):
        # pass `shared` through so shared instances are set up correctly
        super().__init__(opt, shared)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt
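# --- Hypothetical sketch, not the agent's actual scoring function ---
# `length_penalty` above is used when the IR baseline ranks candidate
# replies; the real scoring lives elsewhere. This made-up helper only shows
# how such a penalty discounts longer candidates:
def penalized_score(raw_score, candidate_tokens, length_penalty):
    # illustrative: damp the raw match score by candidate length
    return raw_score / (len(candidate_tokens) ** length_penalty)

print(penalized_score(3.0, 'hi there'.split(), 0.5))              # ~2.12
print(penalized_score(3.0, 'hi there how are you'.split(), 0.5))  # ~1.34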
class KvmemnnAgent(Agent): """ Simple implementation of the memnn algorithm with 1 hop. """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, # type: ignore 'adagrad': optim.Adagrad, # type: ignore 'adam': optim.Adam, 'adamax': optim.Adamax, # type: ignore 'asgd': optim.ASGD, # type: ignore 'lbfgs': optim.LBFGS, # type: ignore 'rmsprop': optim.RMSprop, # type: ignore 'rprop': optim.Rprop, # type: ignore 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """ Add command-line arguments specifically for this agent. """ KvmemnnAgent.dictionary_class().add_cmdline_args(argparser) agent = argparser.add_argument_group('Kvmemnn Arguments') agent.add_argument('--hops', type=int, default=1, help='num hops') agent.add_argument('--lins', type=int, default=0, help='num lins projecting after hops') agent.add_argument( '-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings', ) agent.add_argument( '-enorm', '--embeddingnorm', type=float, default=10, help='max norm of word embeddings', ) agent.add_argument( '-shareEmb', '--share-embeddings', type='bool', default=True, help='whether LHS and RHS share embeddings', ) agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-margin', '--margin', type=float, default=0.3, help='margin') agent.add_argument('-loss', '--loss', default='cosine', choices={'cosine', 'nll'}) agent.add_argument( '-opt', '--optimizer', default='sgd', choices=KvmemnnAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).', ) agent.add_argument( '-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length.', ) agent.add_argument( '-k', '--neg-samples', type=int, default=10, help='number k of negative samples per example', ) agent.add_argument('--parrot-neg', type=int, default=0, help='include query as a negative') agent.add_argument('--take-next-utt', type='bool', default=False, help='take next utt') agent.add_argument( '--twohop-range', type=int, default=100, help='2 hop range constraint for num rescored utterances', ) agent.add_argument( '--twohop-blend', type=float, default=0, help='2 hop blend in the first hop scores if > 0', ) agent.add_argument( '--kvmemnn-debug', type='bool', default=False, help='print debug information', ) agent.add_argument( '--tfidf', type='bool', default=False, help='Use frequency based normalization for embeddings.', ) agent.add_argument( '-cs', '--cache-size', type=int, default=1000, help='size of negative sample cache to draw from', ) agent.add_argument( '-hist', '--history-length', default=100, type=int, help='Number of past tokens to remember. ', ) agent.add_argument( '-histr', '--history-replies', default='label', type=str, choices=['none', 'model', 'label'], help='Keep replies in the history, or not.', ) agent.add_argument('--interactive-mode', default=False, type='bool', choices=[True, False]) agent.add_argument( '--loadcands', type='bool', default=True, help='Load candidates to rank from .candspair files, or not.', ) def __init__(self, opt, shared=None): """ Set up model if shared params not set, otherwise no work to do. 
""" super().__init__(opt, shared) opt = self.opt if opt.get('batchsize', 1) > 1: raise RuntimeError('Kvmemnn model does not support batchsize > 1, ' 'try training with numthreads > 1 instead.') self.reset_metrics() # all instances needs truncate param self.id = 'Kvmemnn' self.NULL_IDX = 0 self.start2 = 99 # set up tensors once self.cands = torch.LongTensor(1, 1, 1) self.ys_cache = [] self.ys_cache_sz = opt['cache_size'] self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.history = {} if shared: torch.set_num_threads(1) if 'threadindex' in shared: self.threadindex = shared['threadindex'] else: self.threadindex = 1 # set up shared properties self.dict = shared['dict'] # answers contains a batch_size list of the last answer produced self.model = shared['model'] # Kvmemnn(opt, len(self.dict)) if 'fixedX' in shared: self.fixedX = shared['fixedX'] self.fixedCands = shared['fixedCands'] self.fixedCands_txt = shared['fixedCands_txt'] self.fixedCands2 = shared['fixedCands2'] self.fixedCands_txt2 = shared['fixedCands_txt2'] else: print("[ creating KvmemnnAgent ]") # this is not a shared instance of this class, so do full init self.threadindex = -1 torch.set_num_threads(1) if (opt['dict_file'] is None and opt.get('model_file') ) or os.path.isfile(opt['model_file'] + '.dict'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) if 'loss' not in opt: opt['loss'] = 'cosine' self.model = Kvmemnn(opt, len(self.dict), self.dict) if opt.get('model_file') and os.path.isfile(opt['model_file']): self.load(opt['model_file']) self.model.share_memory() self.fixedCands = False self.fixedX = None path = opt['model_file'] + '.candspair' if os.path.isfile(path) and opt.get('loadcands') is not False: print("[loading candidates: " + path + "*]") fc = load_cands(path) fcs = [] for c in fc: fcs.append( Variable(torch.LongTensor(self.parse(c)).unsqueeze(0))) self.fixedCands = fcs self.fixedCands_txt = fc fc2 = load_cands(path + "2") fcs2 = [] for c2 in fc2: fcs2.append( Variable( torch.LongTensor(self.parse(c2)).unsqueeze(0))) self.fixedCands2 = fcs2 self.fixedCands_txt2 = fc2 print("[caching..]") xsq = Variable(torch.LongTensor([self.parse('nothing')])) xe, ye = self.model(xsq, [], None, self.fixedCands) self.fixedX = ye print("=init done=") if self.opt['loss'] == 'cosine': self.criterion = torch.nn.CosineEmbeddingLoss(margin=opt['margin'], size_average=False) elif self.opt['loss'] == 'nll': self.criterion = nn.CrossEntropyLoss(ignore_index=-100) else: raise RuntimeError('unspecified loss') # self.criterion = torch.nn.MultiMarginLoss(p=1, margin=0.1) self.reset() # can be used to look at embeddings: # self.dict_neighbors('coffee') self.take_next_utt = True self.cands_done = [] if 'interactive_mode' in opt: self.interactiveMode = self.opt['interactive_mode'] else: self.interactiveMode = False if self.interactiveMode: print("[ Interactive mode ]") def override_opt(self, new_opt): """ Set overridable opts from loaded opt file. Print out each added key and each overriden key. Only override args specific to the model. 
""" model_args = { 'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'encoder', 'decoder', 'lookuptable', 'attention', 'attention_length', 'fixed_candidates_file', } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v}]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """ Convert string to token indices. """ text = text.lower() text = text.replace("n't", " not") vec = self.dict.txt2vec(text) if vec == []: vec = [self.dict[self.dict.null_token]] return vec def t2v(self, text): p = self.dict.txt2vec(text) return Variable(torch.LongTensor(p).unsqueeze(1)) def v2t(self, vec): """ Convert token indices to string of tokens. """ if type(vec) == Variable: vec = vec.data if type(vec) == torch.LongTensor and vec.dim() == 2: vec = vec.squeeze(0) if type(vec) == torch.Tensor and vec.dim() == 2: vec = vec.squeeze(0) new_vec = [] for i in vec: new_vec.append(i) return self.dict.vec2txt(new_vec) def zero_grad(self): """ Zero out optimizer. """ self.optimizer.zero_grad() def update_params(self): """ Do one optimization step. """ self.optimizer.step() def reset(self): """ Reset observation and episode_done. """ self.observation = None self.episode_done = True self.cands_done = [] self.history = {} # set up optimizer lr = self.opt['learningrate'] optim_class = KvmemnnAgent.OPTIM_OPTS[self.opt['optimizer']] kwargs = {'lr': lr} self.optimizer = optim_class(self.model.parameters(), **kwargs) def share(self): """ Share internal states between parent and child instances. """ shared = super().share() shared['dict'] = self.dict shared['model'] = self.model if self.fixedX is not None: shared['fixedX'] = self.fixedX shared['fixedCands'] = self.fixedCands shared['fixedCands_txt'] = self.fixedCands_txt shared['fixedCands2'] = self.fixedCands2 shared['fixedCands_txt2'] = self.fixedCands_txt2 return shared def observe(self, observation): self.episode_done = observation['episode_done'] # shallow copy observation (deep copy can be expensive) obs = observation.copy() obs['query'], obs['mem'] = maintain_dialog_history( self.history, obs, historyLength=self.opt['history_length'], useReplies=self.opt['history_replies'], dict=self.dict, useStartEndIndices=False, ) self.observation = obs return obs def report2(self): def clip(f): return round_sigfigs(f) metrics = self.metrics if metrics['exs'] == 0: report = {'mean_rank': self.opt['neg_samples']} else: maxn = 0 for _ in range(100): n = self.model.lt.weight[5].norm(2)[0].item() if n > maxn: maxn = n report = { 'exs': clip(metrics['total_total']), 'loss': clip(metrics['loss'] / metrics['exs']), 'mean_rank': clip(metrics['mean_rank'] / metrics['exs']), 'mlp_time': clip(metrics['mlp_time'] / metrics['exs']), 'tot_time': clip(metrics['tot_time'] / metrics['exs']), 'max_norm': clip(n), } return report def reset_metrics(self, keep_total=False): if keep_total: self.metrics = { 'exs': 0, 'mean_rank': 0, 'loss': 0, 'total_total': self.metrics['total_total'], 'mlp_time': 0, 'tot_time': 0, 'max_weight': 0, 'mean_weight': 0, } else: self.metrics = { 'total_total': 0, 'mean_rank': 0, 'exs': 0, 'mlp_time': 0, 'tot_time': 0, 'loss': 0, 'max_weight': 0, 'mean_weight': 0, } def compute_metrics(self, loss, scores, mlp_time, non_mlp_time): metrics = {} pos = scores[0] cnt = 0 for i in range(1, len(scores)): if scores[i] >= pos: cnt += 1 metrics['mean_rank'] = cnt 
metrics['loss'] = loss metrics['tot_time'] = mlp_time + non_mlp_time metrics['mlp_time'] = mlp_time return metrics def same(self, y1, y2): """ Check if two tensors are the same, within small margin of error. """ if len(y1) != len(y2): return False if abs((y1 - y2).sum().data.sum()) > 0.00001: return False return True def get_negs(self, xs, ys): negs = [] # for neg in self.ys_cache: cache_sz = len(self.ys_cache) - 1 if cache_sz < 1: return negs k = self.opt['neg_samples'] for _ in range(1, k * 3): index = random.randint(0, cache_sz) neg = self.ys_cache[index] if not self.same(ys.squeeze(0), neg.squeeze(0)): negs.append(neg) if len(negs) >= k: break if self.opt['parrot_neg'] > 0: utt = self.history['last_utterance'] if len(utt) > 2: query = Variable(torch.LongTensor(utt).unsqueeze(0)) negs.append(query) return negs def dict_neighbors(self, word, useRHS=False): input = self.t2v(word) W = self.model.encoder.lt.weight q = W[input[0].item()] if useRHS: W = self.model.encoder2.lt.weight score = torch.Tensor(W.size(0)) for i in range(W.size(0)): score[i] = torch.nn.functional.cosine_similarity(q, W[i], dim=0)[0].item() val, ind = score.sort(descending=True) for i in range(20): print( str(ind[i]) + " [" + str(val[i]) + "]: " + self.v2t(torch.Tensor([ind[i]]))) def predict(self, xs, ys=None, cands=None, cands_txt=None, obs=None): """ Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ self.start = time.time() if xs is None: return [{}] is_training = ys is not None if is_training: # negs = self.get_negs(xs, ys) if len(negs) > 0: self.model.train() self.zero_grad() if self.opt['loss'] == 'cosine': xe, ye = self.model(xs, obs[0]['mem'], ys, negs) y = Variable(-torch.ones(xe.size(0))) y[0] = 1 loss = self.criterion(xe, ye, y) else: x = self.model(xs, obs[0]['mem'], ys, negs) y = Variable(torch.LongTensor([0])) loss = self.criterion(x.unsqueeze(0), y) loss.backward() self.update_params() rest = 0 if self.start2 != 99: rest = self.start - self.start2 self.start2 = time.time() if self.opt['loss'] == 'cosine': pred = nn.CosineSimilarity().forward(xe, ye) else: pred = x metrics = self.compute_metrics(loss.item(), pred.squeeze(0), self.start2 - self.start, rest) return [{'metrics': metrics}] else: fixed = False if hasattr(self, 'fixedCands') and self.fixedCands: self.take_next_utt = True self.twohoputt = True self.tricks = True else: self.take_next_utt = False self.twohoputt = False self.tricks = False if cands is None or cands[0] is None or self.take_next_utt: # cannot predict without candidates. 
if self.fixedCands or self.take_next_utt: cands_txt2 = [self.fixedCands_txt2] fixed = True else: return [{}] # test set prediction uses candidates self.model.eval() if fixed: if obs[0]['episode_done']: self.cands_done = [] if xs is None: xs = Variable(torch.LongTensor([self.parse('nothing')])) xs = xs.clone() if self.tricks: vv = self.history['last_utterance'] if len(vv) == 0: xsq = Variable( torch.LongTensor([self.parse('nothing')])) else: xsq = Variable(torch.LongTensor([vv])) else: xsq = xs mems = obs[0]['mem'] if self.tricks: mems = [] if self.fixedX is None: xe, ye = self.model(xsq, mems, ys, self.fixedCands) self.fixedX = ye else: # fixed cand embed vectors are cached, dont't recompute blah = Variable(torch.LongTensor([1])) xe, ye = self.model(xsq, mems, ys, [blah]) ye = self.fixedX pred = nn.CosineSimilarity().forward(xe, ye) origxe = xe origpred = pred val, ind = pred.sort(descending=True) ypred = cands_txt2[0][ind[0].item()] # reply to match if self.opt.get('kvmemnn_debug', False): print("twohop-range:", self.opt.get('twohop_range', 100)) for i in range(10): txt1 = self.fixedCands_txt[ind[i].item()] txt2 = cands_txt2[0][ind[i].item()] print(i, txt1, '\n ', txt2) tc = [ypred] if self.twohoputt: # now we rerank original cands against this prediction zq = [] z = [] ztxt = [] newwords = {} r = self.opt.get('twohop_range', 100) for i in range(r): c = self.fixedCands2[ind[i].item()] ctxt = self.fixedCands_txt2[ind[i].item()] if i < 10: zq.append(c) z.append(c) ztxt.append(ctxt) for w in c[0]: newwords[w.item()] = True xs2 = torch.cat(zq, 1) if (self.interactiveMode and self.twohoputt) or cands[0] is None: # used for nextutt alg in demo mode, get 2nd hop blah = Variable(torch.LongTensor([1])) if self.tricks: xe, ye = self.model(xs2, obs[0]['mem'], ys, z) else: xe, ye = self.model(xs2, obs[0]['mem'], ys, [blah]) ye = self.fixedX blend = self.opt.get('twohop_blend', 0) if blend > 0: xe = (1 - blend) * xe + blend * origxe pred = nn.CosineSimilarity().forward(xe, ye) for c in self.cands_done: for i in range(len(ztxt)): if ztxt[i] == c: # interactive heuristic: don't repeat yourself pred[i] = -1000 val, ind = pred.sort(descending=True) # predict the highest scoring candidate, and return it. # print(" [query: " + self.v2t(xsq) + "]") ps = [] for c in obs[0]['mem']: ps.append(self.v2t(c)) # print(" [persona: " + '|'.join(ps) + "]") # print(" [1st hop qmatch: " + ypredorig + "]") # print(" [1st hop nextut: " + ypred + "]") if self.tricks: ypred = ztxt[ind[0].item()] # match self.cands_done.append(ypred) else: ypred = self.fixedCands_txt[ind[0].item()] # match self.cands_done.append(ind[0].item()) # print(" [2nd hop nextut: " + ypred2 + "]") tc = [ypred] self.history['labels'] = [ypred] # print(" [final pred: " + ypred + "]") ret = [{'text': ypred, 'text_candidates': tc}] return ret elif self.take_next_utt and not self.interactiveMode: xe, ye = self.model(xs2, obs[0]['mem'], ys, cands[0]) pred = nn.CosineSimilarity().forward(xe, ye) xe, ye = self.model(xs, obs[0]['mem'], ys, cands[0]) origpred = nn.CosineSimilarity().forward(xe, ye) if 'alpha' not in self.opt: alpha = 0.1 else: alpha = self.opt['alpha'] pred = alpha * pred + 1 * origpred val, ind = pred.sort(descending=True) # predict the highest scoring candidate, and return it. 
ypred = cands_txt[0][ind[0].item()] # match tc = [] for i in range(len(ind)): tc.append(cands_txt[0][ind[i].item()]) else: if self.opt['loss'] == 'cosine': xe, ye = self.model(xs, obs[0]['mem'], ys, cands[0]) pred = nn.CosineSimilarity().forward(xe, ye) else: x = self.model(xs, obs[0]['mem'], ys, cands[0]) pred = x # .squeeze() val, ind = pred.sort(descending=True) ypred = cands_txt[0][ind[0].item()] # match tc = [] for i in range(min(100, ind.size(0))): tc.append(cands_txt[0][ind[i].item()]) ret = [{'text': ypred, 'text_candidates': tc}] return ret return [{}] * xs.size(0) def batchify(self, observations): """ Convert a list of observations into input & target tensors. """ def valid(obs): # check if this is an example our model should actually process return 'query' in obs and len(obs['query']) > 0 try: # valid examples and their indices valid_inds, exs = zip(*[(i, ex) for i, ex in enumerate(observations) if valid(ex)]) except ValueError: # zero examples to process in this batch, so zip failed to unpack return None, None, None, None # `x` text is already tokenized and truncated # sort by length so we can use pack_padded parsed_x = [ex['query'] for ex in exs] x_lens = [len(x) for x in parsed_x] ind_sorted = sorted(range(len(x_lens)), key=lambda k: -x_lens[k]) exs = [exs[k] for k in ind_sorted] valid_inds = [valid_inds[k] for k in ind_sorted] parsed_x = [parsed_x[k] for k in ind_sorted] labels_avail = any(['labels' in ex for ex in exs]) max_x_len = max([len(x) for x in parsed_x]) for x in parsed_x: x += [self.NULL_IDX] * (max_x_len - len(x)) xs = torch.LongTensor(parsed_x) xs = Variable(xs) # set up the target tensors ys = None labels = None if labels_avail: # randomly select one of the labels to update on, if multiple labels = [random.choice(ex.get('labels', [''])) for ex in exs] # parse each label and append END parsed_y = [deque(maxlen=self.truncate) for _ in labels] for dq, y in zip(parsed_y, labels): dq.extendleft(reversed(self.parse(y))) max_y_len = max(len(y) for y in parsed_y) for y in parsed_y: y += [self.NULL_IDX] * (max_y_len - len(y)) if len(parsed_y[0]) == 0: return None, None, None, None else: ys = torch.LongTensor(parsed_y) ys = Variable(ys) cands = [] cands_txt = [] if ys is None: # only build candidates in eval mode. for o in observations: if 'label_candidates' in o and o[ 'label_candidates'] is not None: cs = [] ct = [] for c in o['label_candidates']: cs.append( Variable( torch.LongTensor(self.parse(c)).unsqueeze(0))) ct.append(c) cands.append(cs) cands_txt.append(ct) else: cands.append(None) cands_txt.append(None) return xs, ys, cands, cands_txt def add_to_ys_cache(self, ys): if ys is None or len(ys) == 0: return if len(self.ys_cache) < self.ys_cache_sz: self.ys_cache.append(copy.deepcopy(ys)) else: ind = random.randint(0, self.ys_cache_sz - 1) self.ys_cache[ind] = copy.deepcopy(ys) def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] if batchsize == 0 or 'text' not in observations[0]: return [{'text': 'dunno'}] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. 
for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, cands, cands_txt = self.batchify(observations) batch_reply = self.predict(xs, ys, cands, cands_txt, observations) self.add_to_ys_cache(ys) return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def shutdown(self): # """Save the state of the model when shutdown.""" super().shutdown() def save(self, path=None): """ Save model parameters if model_file is set. """ path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): data = {} data['model'] = self.model.state_dict() data['optimizer'] = self.optimizer.state_dict() data['opt'] = self.opt with open(path, 'wb') as handle: torch.save(data, handle) with open(path + ".opt", 'wb') as handle: pickle.dump(self.opt, handle, protocol=pickle.HIGHEST_PROTOCOL) def load(self, path): """ Return opt and model states. """ with open(path, 'rb') as read: print('Loading existing model params from ' + path) data = torch.load(read) self.model.load_state_dict(data['model']) self.reset() self.optimizer.load_state_dict(data['optimizer']) self.opt = self.override_opt(data['opt'])
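# --- Illustrative sketch, not part of the original agent ---
# In `predict` above, the cosine training loss is driven by a target vector
# that is +1 for the gold reply and -1 for every sampled negative. The same
# setup with random embeddings (using the modern `reduction='sum'` in place
# of the deprecated `size_average=False`):
import torch
import torch.nn as nn

emb_dim, num_negs = 8, 3
query = torch.randn(1 + num_negs, emb_dim)  # query row repeated per candidate
cands = torch.randn(1 + num_negs, emb_dim)  # row 0 = gold, rows 1.. = negatives

target = -torch.ones(1 + num_negs)
target[0] = 1  # pull the gold pair together, push negatives below the margin

criterion = nn.CosineEmbeddingLoss(margin=0.3, reduction='sum')
print(criterion(query, cands, target))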
def build_dict(opt, skip_if_built=False): if isinstance(opt, ParlaiParser): print('[ Deprecated Warning: should be passed opt not Parser ]') opt = opt.parse_args() if not opt.get('dict_file'): print('Tried to build dictionary but `--dict-file` is not set. Set ' + 'this param so the dictionary can be saved.') return if skip_if_built and os.path.isfile(opt['dict_file']): # Dictionary already built, skip all loading or setup print("[ dictionary already built .]") return None if opt.get('dict_class'): # Custom dictionary class dictionary = str2class(opt['dict_class'])(opt) else: # Default dictionary class dictionary = DictionaryAgent(opt) if os.path.isfile(opt['dict_file']): # Dictionary already built, return loaded dictionary agent print("[ dictionary already built .]") return dictionary ordered_opt = copy.deepcopy(opt) cnt = 0 # we use train set to build dictionary ordered_opt['numthreads'] = 1 ordered_opt['batchsize'] = 1 ordered_opt['image_mode'] = 'none' if ordered_opt['task'] == 'pytorch_teacher': pytorch_buildteacher_task = ordered_opt.get('pytorch_buildteacher', '') if pytorch_buildteacher_task != '': ordered_opt['task'] = pytorch_buildteacher_task datatypes = ['train:ordered:stream'] if opt.get('dict_include_valid'): datatypes.append('valid:stream') if opt.get('dict_include_test'): datatypes.append('test:stream') cnt = 0 for dt in datatypes: ordered_opt['datatype'] = dt world_dict = create_task(ordered_opt, dictionary) # pass examples to dictionary print('[ running dictionary over data.. ]') log_every_n_secs = opt.get('log_every_n_secs', -1) if log_every_n_secs <= 0: log_every_n_secs = float('inf') log_time = TimeLogger() while not world_dict.epoch_done(): cnt += 1 if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] > 0: print('Processed {} exs, moving on.'.format( opt['dict_maxexs'])) # don't wait too long... break world_dict.parley() if log_time.time() > log_every_n_secs: sys.stdout.write('\r') text, _log = log_time.log( cnt, max(opt.get('dict_maxexs', 0), world_dict.num_examples())) sys.stdout.write(text) sys.stdout.flush() dictionary.save(opt['dict_file'], sort=True) print('[ dictionary built with {} tokens ]'.format(len(dictionary))) return dictionary
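# --- Illustrative sketch, not ParlAI's actual implementation ---
# `--dict-class` lets a task plug in a custom dictionary; `str2class`
# (defined elsewhere in ParlAI) resolves a string to a class object. A rough
# re-implementation of that lookup, assuming the 'module.path:ClassName'
# string format:
import importlib

def resolve_class(name):
    """Resolve 'module.path:ClassName' to the class object (sketch)."""
    module_name, class_name = name.split(':')
    return getattr(importlib.import_module(module_name), class_name)

# e.g. resolve_class('parlai.core.dict:DictionaryAgent')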
def main():
    # Get command-line arguments.
    argparser = ParlaiParser()
    DictionaryAgent.add_cmdline_args(argparser)
    opt = argparser.parse_args()
    build_dict(opt)
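# --- Illustrative usage, not part of the original file ---
# `main()` builds and saves a dictionary from command-line flags. The same
# thing can be done programmatically; the task name and path below are
# made-up examples:
from parlai.core.dict import DictionaryAgent
from parlai.core.params import ParlaiParser

argparser = ParlaiParser()
DictionaryAgent.add_cmdline_args(argparser)
opt = argparser.parse_args(
    ['--task', 'babi:task10k:1', '--dict-file', '/tmp/babi.dict'])
build_dict(opt)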
def build_dict(opt, skip_if_built=False): if isinstance(opt, ParlaiParser): print('[ Deprecated Warning: should be passed opt not Parser ]') opt = opt.parse_args() if not opt.get('dict_file'): print( 'Tried to build dictionary but `--dict-file` is not set. Set ' + 'this param so the dictionary can be saved.' ) return if skip_if_built and os.path.isfile(opt['dict_file']): # Dictionary already built, skip all loading or setup print("[ dictionary already built .]") return None if is_distributed(): raise ValueError('Dictionaries should be pre-built before distributed train.') if opt.get('dict_class'): # Custom dictionary class dictionary = str2class(opt['dict_class'])(opt) else: # Default dictionary class dictionary = DictionaryAgent(opt) if os.path.isfile(opt['dict_file']): # Dictionary already built, return loaded dictionary agent print("[ dictionary already built .]") return dictionary ordered_opt = copy.deepcopy(opt) cnt = 0 # we use train set to build dictionary ordered_opt['numthreads'] = 1 ordered_opt['batchsize'] = 1 # Set this to none so that image features are not calculated when Teacher is # instantiated while building the dict ordered_opt['image_mode'] = 'no_image_model' datatypes = ['train:ordered:stream'] if opt.get('dict_include_valid'): datatypes.append('valid:stream') if opt.get('dict_include_test'): datatypes.append('test:stream') cnt = 0 for dt in datatypes: ordered_opt['datatype'] = dt world_dict = create_task(ordered_opt, dictionary) # pass examples to dictionary print('[ running dictionary over data.. ]') log_time = TimeLogger() total = world_dict.num_examples() if opt['dict_maxexs'] >= 0: total = min(total, opt['dict_maxexs']) log_every_n_secs = opt.get('log_every_n_secs', None) if log_every_n_secs: pbar = tqdm.tqdm( total=total, desc='Building dictionary', unit='ex', unit_scale=True ) else: pbar = None while not world_dict.epoch_done(): cnt += 1 if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] >= 0: print('Processed {} exs, moving on.'.format(opt['dict_maxexs'])) # don't wait too long... break world_dict.parley() if pbar: pbar.update(1) if pbar: pbar.close() dictionary.save(opt['dict_file'], sort=True) print( '[ dictionary built with {} tokens in {}s ]'.format( len(dictionary), round(log_time.total_time(), 2) ) ) return dictionary
def verify(opt, printargs=None, print_parser=None): if opt['datatype'] == 'train': print("[ note: changing datatype from train to train:ordered ]") opt['datatype'] = 'train:ordered' # create repeat label agent and assign it to the specified task agent = RepeatLabelAgent(opt) world = create_task(opt, agent) log_every_n_secs = opt.get('log_every_n_secs', -1) if log_every_n_secs <= 0: log_every_n_secs = float('inf') log_time = TimeLogger() dictionary = DictionaryAgent(opt) ignore_tokens = opt.get('ignore_tokens').split(',') counts = {} for t in {'input', 'labels', 'both'}: counts['tokens_in_' + t] = 0 counts['utterances_in_' + t] = 0 counts['avg_utterance_length_in_' + t] = 0 counts['unique_tokens_in_' + t] = 0 counts['unique_utterances_in_' + t] = 0 # for counting the stats.. counts['token_dict_' + t] = {} counts['utterance_dict_' + t] = {} def tokenize(txt): return dictionary.tokenize(txt) def keep_token(t): for s in ignore_tokens: if s != '' and s in t: return False return True # Show some example dialogs. while not world.epoch_done(): world.parley() act = world.get_acts()[opt.get('agent')] for itype in {'input', 'labels'}: if itype == 'input': if opt.get('new_line_new_utt'): txts = act.get('text').split('\n') else: txts = [act.get('text')] else: txts = act.get('labels', act.get('eval_labels', [''])) for txt in txts: tokens = tokenize(txt) retxt = [] for t in tokens: if keep_token(t): retxt.append(t) counts['tokens_in_' + itype] += len(retxt) counts['tokens_in_' + 'both'] += len(retxt) counts['utterances_in_' + itype] += 1 counts['utterances_in_' + 'both'] += 1 counts['avg_utterance_length_in_' + itype] = ( counts['tokens_in_' + itype] / counts['utterances_in_' + itype] ) counts['avg_utterance_length_in_' + 'both'] = ( counts['tokens_in_' + 'both'] / counts['utterances_in_' + 'both'] ) for t in retxt: if t not in counts['token_dict_' + itype]: counts['unique_tokens_in_' + itype] += 1 counts['token_dict_' + itype][t] = True if t not in counts['token_dict_' + 'both']: counts['unique_tokens_in_' + 'both'] += 1 counts['token_dict_' + 'both'][t] = True retxt = ' '.join(retxt) if retxt not in counts['utterance_dict_' + itype]: counts['unique_utterances_in_' + itype] += 1 counts['utterance_dict_' + itype][retxt] = True if retxt not in counts['utterance_dict_' + 'both']: counts['unique_utterances_in_' + 'both'] += 1 counts['utterance_dict_' + 'both'][retxt] = True if log_time.time() > log_every_n_secs: text, log = report(world, counts, log_time) if print_parser: print(text) try: # print dataset size if available print( '[ loaded {} episodes with a total of {} examples ]'.format( world.num_episodes(), world.num_examples() ) ) except Exception: pass return report(world, counts, log_time)
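# --- Illustrative sketch, not part of the original file ---
# The statistics in `verify` above boil down to dict-based deduplication: a
# token or utterance is "unique" the first time its key appears. The same
# bookkeeping on toy data:
counts = {'tokens': 0, 'unique_tokens': 0, 'token_dict': {}}
for txt in ['hello world', 'hello again']:
    for tok in txt.split():
        counts['tokens'] += 1
        if tok not in counts['token_dict']:
            counts['unique_tokens'] += 1
            counts['token_dict'][tok] = True
print(counts['tokens'], counts['unique_tokens'])  # 4 3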
def eval_wordstat(opt, print_parser=None): """ Evaluates a model. :param opt: tells the evaluation function how to run :param print_parser: if provided, prints the options that are set within the model after loading the model """ random.seed(42) # Create model and assign it to the specified task agent = create_agent(opt, requireModelExists=True) world = create_task(opt, agent) if opt.get('external_dict'): print('[ Using external dictionary from: {} ]'.format( opt['external_dict'])) dict_opt = copy.deepcopy(opt) dict_opt['dict_file'] = opt['external_dict'] dictionary = DictionaryAgent(dict_opt) else: print('[ Using model-bundled dictionary ]') dictionary = agent.dict batch_size = opt['batchsize'] if print_parser: # Show arguments after loading model print_parser.opt = agent.opt print_parser.print_args() log_every_n_secs = opt.get('log_every_n_secs', -1) if log_every_n_secs <= 0: log_every_n_secs = float('inf') log_time = TimeLogger() cnt = 0 max_cnt = opt['num_examples'] if opt['num_examples'] > 0 else float('inf') word_statistics = { 'mean_wlength': [], 'mean_clength': [], 'freqs_cnt': Counter(), 'word_cnt': 0, 'pred_list': [], 'pure_pred_list': [], 'context_list': [], 'unique_words': set(), } bins = [int(i) for i in opt['freq_bins'].split(',')] def process_prediction(prediction, word_statistics): normalized = normalize_answer(prediction) word_statistics['pred_list'].append(normalized) freqs, _cnt, wlength, clength = get_word_stats(prediction, dictionary, bins=bins) word_statistics['word_cnt'] += _cnt word_statistics['mean_wlength'].append(wlength) word_statistics['mean_clength'].append(clength) word_statistics['freqs_cnt'] += Counter(freqs) word_statistics['unique_words'] |= set(normalized.split(" ")) return word_statistics while not world.epoch_done(): world.parley() if batch_size == 1: cnt += 1 prediction = world.acts[-1]['text'] word_statistics['context_list'].append(world.acts[0]['text']) word_statistics['pure_pred_list'].append(prediction) word_statistics = process_prediction(prediction, word_statistics) else: for w in world.worlds: try: if 'text' not in w.acts[-1]: continue prediction = w.acts[-1]['text'] word_statistics['context_list'].append(w.acts[0]['text']) word_statistics['pure_pred_list'].append(prediction) except IndexError: continue cnt += 1 word_statistics = process_prediction(prediction, word_statistics) if log_time.time() > log_every_n_secs: report = world.report() text, report = log_time.log(report['exs'], min(max_cnt, world.num_examples()), report) print(text) stat_str = 'total_words: {}, '.format(word_statistics['word_cnt']) stat_str += ', '.join([ '<{}:{} ({:.{prec}f}%)'.format( b, word_statistics['freqs_cnt'].get(b, 0), (word_statistics['freqs_cnt'].get(b, 0) / word_statistics['word_cnt']) * 100, prec=2, ) for b in bins ]) print("Word statistics: {}, avg_word_length: {:.{prec}f}, " "avg_char_length: {:.{prec}f}".format( stat_str, numpy.array(word_statistics['mean_wlength']).mean(), numpy.array(word_statistics['mean_clength']).mean(), prec=2, )) if cnt >= max_cnt: break if world.epoch_done(): print("EPOCH DONE") if opt['compute_unique']: unique_list = [] cntr = Counter(word_statistics['pred_list']) for k, v in cntr.items(): if v == 1: unique_list.append(k) print("Unique responses: {:.{prec}f}%".format( len(unique_list) / len(word_statistics['pred_list']) * 100, prec=2)) print("Total unique tokens:", len(word_statistics['unique_words'])) if opt['dump_predictions_path'] is not None: with open(opt['dump_predictions_path'], 'w') as f: f.writelines([ 'CONTEXT: {}\nPREDICTION:{}\n\n'.format(c, p) for c, p in zip( word_statistics['context_list'], word_statistics['pure_pred_list'], ) ]) if opt['compute_unique']: with open(opt['dump_predictions_path'] + '_unique', 'w') as f: f.writelines(['{}\n'.format(i) for i in unique_list]) stat_str = 'total_words: {}, '.format(word_statistics['word_cnt']) stat_str += ', '.join([ '<{}:{} ({:.{prec}f}%)'.format( b, word_statistics['freqs_cnt'].get(b, 0), (word_statistics['freqs_cnt'].get(b, 0) / word_statistics['word_cnt']) * 100, prec=2, ) for b in bins ]) print("Word statistics: {}, avg_word_length: {:.{prec}f}, " "avg_char_length: {:.{prec}f}".format( stat_str, numpy.array(word_statistics['mean_wlength']).mean(), numpy.array(word_statistics['mean_clength']).mean(), prec=2, )) report = world.report() print(report) return report
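The `--freq-bins` option drives the `<bin: count (pct%)` lines printed above: each predicted word falls into the first bin whose threshold exceeds its training-set frequency. `get_word_stats` itself is not shown in this excerpt, so the sketch below only approximates its bucketing scheme, with invented frequencies and bins assumed to be sorted ascending.

from collections import Counter

def bucket_counts(word_freqs, bins):
    """word_freqs: training-set frequency of each token in a prediction."""
    counts = Counter()
    for f in word_freqs:
        for b in bins:
            if f < b:           # first bin that fits wins
                counts[b] += 1
                break
    return counts

print(bucket_counts([3, 250, 70, 12000], bins=[100, 1000, 10000]))
# Counter({100: 2, 1000: 1})  -> 12000 falls outside every bin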
class StarspaceAgent(Agent): """Simple implementation of the starspace algorithm: https://arxiv.org/abs/1709.03856 """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" StarspaceAgent.dictionary_class().add_cmdline_args(argparser) agent = argparser.add_argument_group('StarSpace Arguments') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-enorm', '--embeddingnorm', type=float, default=10, help='max norm of word embeddings') agent.add_argument('-shareEmb', '--share-embeddings', type='bool', default=True, help='whether LHS and RHS share embeddings') agent.add_argument('-lr', '--learningrate', type=float, default=0.1, help='learning rate') agent.add_argument('-margin', '--margin', type=float, default=0.1, help='margin') agent.add_argument('-opt', '--optimizer', default='sgd', choices=StarspaceAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length.') agent.add_argument('-k', '--neg-samples', type=int, default=10, help='number k of negative samples per example') agent.add_argument('--parrot-neg', type=int, default=0, help='include query as a negative') agent.add_argument('--tfidf', type='bool', default=False, help='Use frequency based normalization for embeddings.') agent.add_argument('-cs', '--cache-size', type=int, default=1000, help='size of negative sample cache to draw from') agent.add_argument('-hist', '--history-length', default=10000, type=int, help='Number of past tokens to remember.') agent.add_argument('-histr', '--history-replies', default='label', type=str, choices=['none', 'model', 'label'], help='Keep replies in the history, or not.') agent.add_argument('-fixedCands', '--fixed-candidates-file', default=None, type=str, help='File of cands to use for prediction') def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt self.reset_metrics() self.id = 'Starspace' self.NULL_IDX = 0 self.cands = torch.LongTensor(1, 1, 1) self.ys_cache = [] self.ys_cache_sz = opt['cache_size'] self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.history = {} self.debugMode = False if shared: self.threadindex = shared['threadindex'] print("[ creating Starspace thread " + str(self.threadindex) + " ]") # set up shared properties self.dict = shared['dict'] self.model = shared['model'] else: print("[ creating StarspaceAgent ]") # this is not a shared instance of this class, so do full init if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.model = Starspace(opt, len(self.dict), self.dict) if opt.get('model_file') and os.path.isfile(opt['model_file']): self.load(opt['model_file']) self.model.share_memory() # set up modules self.criterion = torch.nn.CosineEmbeddingLoss(margin=opt['margin'], size_average=False) self.reset() self.fixedCands = False if self.opt.get('fixed_candidates_file'): # argparse stores --fixed-candidates-file under this underscored key self.fixedCands = load_cands(self.opt.get('fixed_candidates_file')) def reset(self): """Reset observation and episode_done.""" self.observation = None self.episode_done = True # set up optimizer lr = self.opt['learningrate'] optim_class = StarspaceAgent.OPTIM_OPTS[self.opt['optimizer']] kwargs = {'lr': lr} self.optimizer = optim_class(self.model.parameters(), **kwargs) def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['dict'] = self.dict shared['model'] = self.model return shared def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overridden key. Only override args specific to the model. """ model_args = {'embeddingsize', 'optimizer'} for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v} ]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def t2v(self, text): p = self.dict.txt2vec(text) return Variable(torch.LongTensor(p).unsqueeze(1)) def v2t(self, vec): """Convert token indices to string of tokens.""" if type(vec) == Variable: vec = vec.data new_vec = [] for i in vec: new_vec.append(i) return self.dict.vec2txt(new_vec) def observe(self, observation): self.episode_done = observation['episode_done'] # shallow copy observation (deep copy can be expensive) obs = observation.copy() obs['text2vec'] = maintain_dialog_history( self.history, obs, historyLength=self.opt['history_length'], useReplies=self.opt['history_replies'], dict=self.dict, useStartEndIndices=False) self.observation = obs return obs def same(self, y1, y2): if len(y1.squeeze()) != len(y2.squeeze()): return False if abs((y1.squeeze() - y2.squeeze()).sum().data.sum()) > 0.00001: return False return True def get_negs(self, xs, ys): negs = [] cache_sz = len(self.ys_cache) - 1 if cache_sz < 1: return negs k = self.opt['neg_samples'] for i in range(1, k * 3): index = random.randint(0, cache_sz) neg = self.ys_cache[index] if not self.same(ys, neg): negs.append(neg) if len(negs) >= k: break if self.opt['parrot_neg'] > 0: utt = self.history['last_utterance'] if len(utt) > 2: query = Variable(torch.LongTensor(utt).unsqueeze(0)) negs.append(query) return negs def dict_neighbors(self, word, useRHS=False): input = self.t2v(word) W = self.model.encoder.lt.weight q = W[input.data[0][0]] if useRHS: W = self.model.encoder2.lt.weight score = torch.Tensor(W.size(0)) for i in range(W.size(0)): score[i] = torch.nn.functional.cosine_similarity(q, W[i], dim=0).data[0] val, ind = score.sort(descending=True) for i in range(20): print(str(ind[i]) + " [" + str(val[i]) + "]: " + self.v2t(torch.Tensor([ind[i]]))) def compute_metrics(self, loss, scores): metrics = {} pos = scores[0] cnt = 0 for i in range(1, len(scores)): if scores[i] >= pos: cnt += 1 metrics['mean_rank'] = cnt metrics['loss'] = loss return metrics def predict(self, xs, ys=None, cands=None, cands_txt=None, obs=None): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ is_training = ys is not None if is_training: negs = self.get_negs(xs, ys) if is_training and len(negs) > 0: self.model.train() self.optimizer.zero_grad() xe, ye = self.model(xs, ys, negs) if self.debugMode: # print example print("inp: " + self.v2t(xs.squeeze())) print("pos: " + self.v2t(ys.squeeze())) for c in negs: print("neg: " + self.v2t(c.squeeze())) print("---") y = Variable(-torch.ones(xe.size(0))) y[0] = 1 loss = self.criterion(xe, ye, y) loss.backward() self.optimizer.step() pred = nn.CosineSimilarity().forward(xe, ye) metrics = self.compute_metrics(loss.data[0], pred.data.squeeze()) return [{'metrics': metrics}] else: if cands is None or cands[0] is None: # cannot predict without candidates. if self.fixedCands: cands = [self.fixedCands] else: return [{}] # test set prediction uses candidates self.model.eval() xe, ye = self.model(xs, ys, cands[0]) pred = nn.CosineSimilarity().forward(xe, ye) # This is somewhat costly; we could avoid it if we did not evaluate ranking, # i.e. by only doing: val, ind = pred.max(0) val, ind = pred.sort(descending=True) # predict the highest scoring candidate, and return it. ypred = cands_txt[0][ind.data[0]] tc = [] for i in range(min(100, ind.size(0))): tc.append(cands_txt[0][ind.data[i]]) ret = [{'text': ypred, 'text_candidates': tc}] return ret return [{}] def vectorize(self, observations): """Convert a list of observations into input & target tensors.""" def valid(obs): # check if this is an example our model should actually process return 'text2vec' in obs and len(obs['text2vec']) > 0 try: # valid examples and their indices valid_inds, exs = zip(*[(i, ex) for i, ex in enumerate(observations) if valid(ex)]) except ValueError: # zero examples to process in this batch, so zip failed to unpack return None, None, None, None # set up the input tensors # `x` text is already tokenized and truncated # sort by length so we can use pack_padded parsed_x = [ex['text2vec'] for ex in exs] x_lens = [len(x) for x in parsed_x] ind_sorted = sorted(range(len(x_lens)), key=lambda k: -x_lens[k]) exs = [exs[k] for k in ind_sorted] valid_inds = [valid_inds[k] for k in ind_sorted] parsed_x = [parsed_x[k] for k in ind_sorted] labels_avail = any(['labels' in ex for ex in exs]) max_x_len = max([len(x) for x in parsed_x]) for x in parsed_x: x += [self.NULL_IDX] * (max_x_len - len(x)) xs = torch.LongTensor(parsed_x) xs = Variable(xs) # set up the target tensors ys = None labels = None if labels_avail: # randomly select one of the labels to update on, if multiple labels = [random.choice(ex.get('labels', [''])) for ex in exs] # parse each label and append END parsed_y = [deque(maxlen=self.truncate) for _ in labels] for dq, y in zip(parsed_y, labels): dq.extendleft(reversed(self.parse(y))) max_y_len = max(len(y) for y in parsed_y) for y in parsed_y: y += [self.NULL_IDX] * (max_y_len - len(y)) ys = torch.LongTensor(parsed_y) ys = Variable(ys) cands = [] cands_txt = [] if ys is None: # only build candidates in eval mode. for o in observations: if 'label_candidates' in o: cs = [] ct = [] for c in o['label_candidates']: cs.append(Variable(torch.LongTensor(self.parse(c)).unsqueeze(0))) ct.append(c) cands.append(cs) cands_txt.append(ct) else: cands.append(None) cands_txt.append(None) return xs, ys, cands, cands_txt def add_to_ys_cache(self, ys): if ys is None or len(ys) == 0: return if len(self.ys_cache) < self.ys_cache_sz: self.ys_cache.append(copy.deepcopy(ys)) else: ind = random.randint(0, self.ys_cache_sz - 1) self.ys_cache[ind] = copy.deepcopy(ys) def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, cands, cands_txt = self.vectorize(observations) batch_reply = self.predict(xs, ys, cands, cands_txt, observations) self.add_to_ys_cache(ys) return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def shutdown(self): super().shutdown() def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): data = {} data['model'] = self.model.state_dict() data['optimizer'] = self.optimizer.state_dict() data['opt'] = self.opt with open(path, 'wb') as handle: torch.save(data, handle) with open(path + ".opt", 'wb') as handle: pickle.dump(self.opt, handle, protocol=pickle.HIGHEST_PROTOCOL) def load(self, path): """Load opt and model states from file.""" with open(path, 'rb') as read: print('Loading existing model params from ' + path) data = torch.load(read) self.model.load_state_dict(data['model']) self.reset() self.optimizer.load_state_dict(data['optimizer']) self.opt = self.override_opt(data['opt'])
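A hedged end-to-end sketch of driving this agent by hand. In real runs `opt` is built by `ParlaiParser` together with a task; here the dict-file path is a placeholder (with no file present, `DictionaryAgent` simply starts from its built-in tokens), and the message keys ('text', 'labels', 'episode_done') are the standard ParlAI fields the agent reads.

from parlai.core.params import ParlaiParser

parser = ParlaiParser()
StarspaceAgent.add_cmdline_args(parser)       # registers all options used in __init__
opt = parser.parse_args(['--dict-file', 'model.dict'])  # 'model.dict' is illustrative
agent = StarspaceAgent(opt)
agent.observe({'text': 'hello', 'labels': ['hi there'], 'episode_done': True})
reply = agent.act()
# Note: the very first example produces no parameter update, because the
# negative-sample cache (ys_cache) is still empty at that point.
print(reply)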
class IrBaselineAgent(Agent): @staticmethod def add_cmdline_args(parser): DictionaryAgent.add_cmdline_args(parser) parser.add_argument( '-lp', '--length_penalty', type=float, default=0.5, help='length penalty for responses') def __init__(self, opt, shared=None): super().__init__(opt) self.id = 'IRBaselineAgent' self.length_penalty = float(opt['length_penalty']) self.dictionary = DictionaryAgent(opt) self.opt = opt def observe(self, obs): self.observation = obs self.dictionary.observe(obs) return obs def act(self): if self.opt.get('datatype', '').startswith('train'): self.dictionary.act() obs = self.observation reply = {} reply['id'] = self.getID() # Rank candidates if 'label_candidates' in obs and len(obs['label_candidates']) > 0: rep = self.build_query_representation(obs['text']) reply['text_candidates'] = ( rank_candidates(rep, obs['label_candidates'], self.length_penalty, self.dictionary)) reply['text'] = reply['text_candidates'][0] else: reply['text'] = "I don't know." return reply def save(self, fname=None): fname = self.opt.get('model_file', None) if fname is None else fname if fname: self.dictionary.save(fname + '.dict') def load(self, fname): self.dictionary.load(fname + '.dict') def build_query_representation(self, query): """Build representation of query, e.g. words or n-grams.""" rep = {} rep['words'] = {} words = self.dictionary.tokenize(query.lower()) rw = rep['words'] used = {} for w in words: if len(self.dictionary.freqs()) > 0: rw[w] = 1.0 / (1.0 + math.log(1.0 + self.dictionary.freqs()[w])) else: if w not in stopwords: rw[w] = 1 used[w] = True rep['norm'] = math.sqrt(len(words)) return rep
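The term weighting in `build_query_representation` is 1 / (1 + log(1 + freq)), so rare words dominate the query representation while frequent words are damped logarithmically. A standalone illustration of that arithmetic, with invented frequencies:

import math

freqs = {'the': 50000, 'giraffe': 12}   # made-up training-set counts
for w, f in freqs.items():
    print(w, 1.0 / (1.0 + math.log(1.0 + f)))
# the      ~0.0846  (common word, small weight)
# giraffe  ~0.2806  (rare word, larger weight)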
def __init__(self, opt, shared=None): """ Set up model if shared params not set, otherwise no work to do. """ super().__init__(opt, shared) opt = self.opt if opt.get('batchsize', 1) > 1: raise RuntimeError('Kvmemnn model does not support batchsize > 1, ' 'try training with numthreads > 1 instead.') self.reset_metrics() # all instances need the truncate param self.id = 'Kvmemnn' self.NULL_IDX = 0 self.start2 = 99 # set up tensors once self.cands = torch.LongTensor(1, 1, 1) self.ys_cache = [] self.ys_cache_sz = opt['cache_size'] self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.history = {} if shared: torch.set_num_threads(1) if 'threadindex' in shared: self.threadindex = shared['threadindex'] else: self.threadindex = 1 # set up shared properties self.dict = shared['dict'] self.model = shared['model'] if 'fixedX' in shared: self.fixedX = shared['fixedX'] self.fixedCands = shared['fixedCands'] self.fixedCands_txt = shared['fixedCands_txt'] self.fixedCands2 = shared['fixedCands2'] self.fixedCands_txt2 = shared['fixedCands_txt2'] else: print("[ creating KvmemnnAgent ]") # this is not a shared instance of this class, so do full init self.threadindex = -1 torch.set_num_threads(1) if (opt['dict_file'] is None and opt.get('model_file')) or os.path.isfile(opt['model_file'] + '.dict'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) if 'loss' not in opt: opt['loss'] = 'cosine' self.model = Kvmemnn(opt, len(self.dict), self.dict) if opt.get('model_file') and os.path.isfile(opt['model_file']): self.load(opt['model_file']) self.model.share_memory() self.fixedCands = False self.fixedX = None path = opt['model_file'] + '.candspair' if os.path.isfile(path) and opt.get('loadcands') is not False: print("[loading candidates: " + path + "*]") fc = load_cands(path) fcs = [] for c in fc: fcs.append(Variable(torch.LongTensor(self.parse(c)).unsqueeze(0))) self.fixedCands = fcs self.fixedCands_txt = fc fc2 = load_cands(path + "2") fcs2 = [] for c2 in fc2: fcs2.append(Variable(torch.LongTensor(self.parse(c2)).unsqueeze(0))) self.fixedCands2 = fcs2 self.fixedCands_txt2 = fc2 print("[caching..]") xsq = Variable(torch.LongTensor([self.parse('nothing')])) xe, ye = self.model(xsq, [], None, self.fixedCands) self.fixedX = ye print("=init done=") if self.opt['loss'] == 'cosine': self.criterion = torch.nn.CosineEmbeddingLoss(margin=opt['margin'], size_average=False) elif self.opt['loss'] == 'nll': self.criterion = nn.CrossEntropyLoss(ignore_index=-100) else: raise RuntimeError('unknown loss: ' + str(self.opt['loss'])) # self.criterion = torch.nn.MultiMarginLoss(p=1, margin=0.1) self.reset() # can be used to look at embeddings: # self.dict_neighbors('coffee') self.take_next_utt = True self.cands_done = [] self.interactiveMode = self.opt.get('interactive_mode', False) if self.interactiveMode: print("[ Interactive mode ]")
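One design point worth noting: the `.candspair` branch above encodes every fixed candidate once at init and caches the result in `self.fixedX`, so later ranking only has to encode the incoming query. A generic sketch of that cache-then-score pattern; the names are illustrative, not the Kvmemnn API:

import torch

def rank_against_cache(query_emb, cached_cand_embs):
    # cosine similarity of one query embedding of shape (d,) against a
    # cached (num_candidates, d) matrix; returns scores sorted best-first
    sims = torch.nn.functional.cosine_similarity(
        query_emb.unsqueeze(0), cached_cand_embs, dim=1)
    return sims.sort(descending=True)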
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.history = {} self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, self.states = self.load(opt['model_file']) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] self.model = Seq2seq(opt, len(self.dict), padding_idx=self.NULL_IDX, start_idx=self.START_IDX, end_idx=self.END_IDX, longest_label=self.states.get('longest_label', 1)) if opt['embedding_type'] != 'random': # set up preinitialized embeddings try: import torchtext.vocab as vocab except ModuleNotFoundError as ex: print('Please install torchtext with `pip install torchtext`') raise ex if opt['embedding_type'].startswith('glove'): init = 'glove' embs = vocab.GloVe(name='840B', dim=300) elif opt['embedding_type'].startswith('fasttext'): init = 'fasttext' embs = vocab.FastText(language='en') else: raise RuntimeError('embedding type not implemented') if opt['embeddingsize'] != 300: rp = torch.Tensor(300, opt['embeddingsize']).normal_() t = lambda x: torch.mm(x.unsqueeze(0), rp) else: t = lambda x: x cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = t(embs.vectors[embs.stoi[w]]) self.model.decoder.lt.weight.data[i] = vec cnt += 1 if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self.model.encoder.lt.weight.data[i] = vec print('Seq2seq: initialized embeddings for {} tokens from {}.'.format(cnt, init)) if self.states: # set loaded states if applicable self.model.load_state_dict(self.states['model']) if self.use_cuda: self.model.cuda() if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt.get('gradient_clip', 0.2) self.rank = opt['rank_candidates'] # set up tensors once self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) if self.rank: self.cands = torch.LongTensor(1, 1, 1) # set up criteria self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX) if self.use_cuda: # push to cuda; `non_blocking` replaces the old `async` kwarg, # which is a syntax error on Python 3.7+ self.xs = self.xs.cuda(non_blocking=True) self.ys = self.ys.cuda(non_blocking=True) if self.rank: self.cands = self.cands.cuda(non_blocking=True) self.criterion.cuda() # set up optimizer lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True if opt['embedding_type'].endswith('fixed'): print('Seq2seq: fixing embedding weights.') self.model.decoder.lt.weight.requires_grad = False self.model.encoder.lt.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False self.optimizer = optim_class([p for p in self.model.parameters() if p.requires_grad], **kwargs) if self.states: if self.states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: self.optimizer.load_state_dict(self.states['optimizer']) self.reset()
class DefaultDataset(Dataset): """A Pytorch Dataset utilizing streaming.""" def __init__(self, opt, version='2014'): self.opt = opt self.use_hdf5 = opt.get('use_hdf5', False) self.datatype = self.opt.get('datatype') self.training = self.datatype.startswith('train') self.num_epochs = self.opt.get('num_epochs', 0) self.image_loader = ImageLoader(opt) test_info_path, annotation_path, self.image_path = _path(opt, version) self._setup_data(test_info_path, annotation_path, opt.get('unittest', False)) if self.use_hdf5: try: import h5py self.h5py = h5py except ImportError: raise ImportError('Need to install h5py - `pip install h5py`') self._setup_image_data() self.dict_agent = DictionaryAgent(opt) def __getitem__(self, index): index %= self.num_episodes() image_id = None if not self.datatype.startswith('test'): anno = self.annotation['annotations'][index] image_id = anno['image_id'] else: image_id = self.test_info['images'][index]['id'] ep = { 'text': self.dict_agent.txt2vec(QUESTION), 'image': self.get_image(image_id), 'episode_done': True, } if self.opt.get('extract_image', False): ep['image_id'] = image_id return ep if not self.datatype.startswith('test'): anno = self.annotation['annotations'][index] ep['labels'] = [anno['caption']] ep['valid'] = True else: ep['valid'] = True ep['use_hdf5'] = self.use_hdf5 return (index, ep) def __len__(self): num_epochs = self.num_epochs if self.num_epochs > 0 else 100 num_iters = num_epochs if self.training else 1 return int(num_iters * self.num_episodes()) def _load_lens(self): with open(self.length_datafile) as length: lengths = json.load(length) self.num_eps = lengths['num_eps'] self.num_exs = lengths['num_exs'] def _setup_data(self, test_info_path, annotation_path, unittest): if not self.datatype.startswith('test'): with open(annotation_path) as data_file: self.annotation = json.load(data_file) else: with open(test_info_path) as data_file: self.test_info = json.load(data_file) if unittest: if not self.datatype.startswith('test'): self.annotation['annotations'] = self.annotation[ 'annotations'][:10] else: self.test_info['images'] = self.test_info['images'][:10] self.image_paths = set() # Depending on whether we are using the train/val/test set, we need to # find the image IDs in annotations or test image info if not self.datatype.startswith('test'): for anno in self.annotation['annotations']: self.image_paths.add(self.image_path + '%012d.jpg' % (anno['image_id'])) else: for info in self.test_info['images']: self.image_paths.add(self.image_path + '%012d.jpg' % (info['id'])) def _setup_image_data(self): '''hdf5 image dataset''' extract_feats(self.opt) im = self.opt.get('image_mode') hdf5_path = os.path.join(self.image_path, 'mode_{}_noatt.hdf5'.format(im)) hdf5_file = self.h5py.File(hdf5_path, 'r') self.image_dataset = hdf5_file['images'] image_id_to_idx_path = os.path.join(self.image_path, 'mode_{}_id_to_idx.txt'.format(im)) with open(image_id_to_idx_path, 'r') as f: self.image_id_to_idx = json.load(f) def get_image(self, image_id): if not self.use_hdf5: im_path = os.path.join(self.image_path, '%012d.jpg' % (image_id)) return self.image_loader.load(im_path) else: img_idx = self.image_id_to_idx[str(image_id)] return torch.Tensor(self.image_dataset[img_idx]) def num_examples(self): if not self.datatype.startswith('test'): return len(self.annotation['annotations']) else: return len(self.test_info['images']) def num_episodes(self): return self.num_examples() def num_images(self): if not hasattr(self, 'num_imgs'): return self.num_examples() return self.num_imgs
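Since `DefaultDataset` is a standard `torch.utils.data.Dataset`, it can be fed to a `DataLoader`. A hedged sketch: because `__getitem__` returns an `(index, episode-dict)` tuple, a pass-through `collate_fn` is used so the default collation does not try to stack the dicts; `opt` here stands in for options built elsewhere.

from torch.utils.data import DataLoader

dataset = DefaultDataset(opt)                  # opt as constructed elsewhere
loader = DataLoader(dataset, batch_size=32, shuffle=dataset.training,
                    collate_fn=lambda batch: batch)  # keep (index, ep) tuples intact
for batch in loader:
    for index, ep in batch:
        pass  # ep carries 'text', 'image', 'episode_done', optionally 'labels'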
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) if not shared: # this is not a shared instance of this class, so do full # initialization. if shared is set, only set up shared members. # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) """ if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, self.states = self.load(opt['model_file']) # override options with stored ones opt = self.override_opt(new_opt) """ if opt.get('ptr_model') and os.path.isfile(opt['ptr_model']): # load pretrained model parameters if available print('Loading existing model params from ' + opt['ptr_model']) new_opt, self.states = self.load(opt['ptr_model'])  # TODO: load what? # override options with stored ones # opt = self.override_opt(new_opt) self.dict = DictionaryAgent(opt) self.id = 'ScoringNet' # we use START markers to start our output self.START = self.dict.start_token self.START_TENSOR = torch.LongTensor(self.dict.parse(self.START)) # we use END markers to end our output self.END = self.dict.end_token self.END_TENSOR = torch.LongTensor(self.dict.parse(self.END)) # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0] # store important params directly hsz = opt['hiddensize'] emb = opt['embeddingsize'] self.hidden_size = hsz self.emb_size = emb self.num_layers = opt['numlayers'] self.learning_rate = opt['learning_rate'] self.rank = opt['rank_candidates'] self.longest_label = 1 self.truncate = opt['truncate'] self.attention = opt['attention'] # set up tensors if self.opt['bi_encoder']: self.zeros = torch.zeros(2 * self.num_layers, 1, hsz) else: self.zeros = torch.zeros(self.num_layers, 1, hsz) self.zeros_dec = torch.zeros(self.num_layers, 1, hsz) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) self.neg_ys = torch.LongTensor(1, 1) # set up modules # self.criterion = nn.NLLLoss(size_average=False, ignore_index=0) self.criterion = nn.BCELoss() # lookup table stores word embeddings self.lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX)  # scale_grad_by_freq=True # encoder captures the input text enc_class = ScoringNetAgent.ENC_OPTS[opt['encoder']] self.encoder = enc_class(emb, hsz, opt['numlayers'], bidirectional=opt['bi_encoder'], dropout=opt['dropout']) # decoder produces our output states dec_isz = hsz if opt['bi_encoder']: dec_isz += hsz # linear layer helps us produce outputs from final decoder state self.h2o = nn.Linear(dec_isz, dec_isz, bias=False) # dropout on the linear layer helps us generalize self.dropout = nn.Dropout(opt['dropout']) self.use_attention = False self.attn = None # if attention is greater than 0, set up additional members if self.attention: self.use_attention = True self.att_type = opt['attn_type'] input_size = hsz if opt['bi_encoder']: input_size += hsz if self.att_type == 'concat': self.attn = nn.Linear(input_size + hsz, 1, bias=False) elif self.att_type == 'dot': assert not opt['bi_encoder'] elif self.att_type == 'general': self.attn = nn.Linear(hsz, input_size, bias=False) # set up optims for each module self.lr = opt['learning_rate'] self.wd = opt['weight_decay'] != 0  # value equality; the original 'is not 0' compared identity optim_class = ScoringNetAgent.OPTIM_OPTS[opt['optimizer']] self.optims = { 'lt': optim_class(self.lt.parameters(), lr=self.lr), 'encoder': optim_class(self.encoder.parameters(), lr=self.lr), 'h2o': optim_class(self.h2o.parameters(), lr=self.lr, weight_decay=self.wd), } if self.attention and self.attn is not None: self.optims.update({ 'attn': optim_class(self.attn.parameters(), lr=self.lr, weight_decay=self.wd) }) if hasattr(self, 'states'): # set loaded states if applicable if opt.get('ptr_model'): self.init_pretrain(self.states) else: self.set_states(self.states) if self.use_cuda: self.cuda() self.loss = 0 self.ndata = 0 self.loss_valid = 0 self.ndata_valid = 0 if opt['beam_size'] > 0: self.beamsize = opt['beam_size'] self.episode_concat = opt['episode_concat'] self.training = True self.generating = False self.local_human = False self.max_seq_len = opt['max_seq_len'] self.reset()
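Unlike the single-optimizer agents elsewhere in this file, this one keeps a dict of per-module optimizers (`self.optims`). A training step under that design zeroes and steps every entry together; the sketch below shows the pattern in general form, not code from the class itself.

def step_all(optims, loss):
    # zero all per-module gradients, backprop once, then step each optimizer
    for o in optims.values():
        o.zero_grad()
    loss.backward()
    for o in optims.values():
        o.step()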
class Seq2seqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. This model supports encoding the input and decoding the output via one of several flavors of RNN. It then uses a linear layer (whose weights can be shared with the embedding layer) to convert RNN output states into output tokens. This model currently uses greedy decoding, selecting the highest probability token at each time step. For more information, see the following papers: - Neural Machine Translation by Jointly Learning to Align and Translate `(Bahdanau et al. 2014) <arxiv.org/abs/1409.0473>`_ - Sequence to Sequence Learning with Neural Networks `(Sutskever et al. 2014) <arxiv.org/abs/1409.3215>`_ - Effective Approaches to Attention-based Neural Machine Translation `(Luong et al. 2015) <arxiv.org/abs/1508.04025>`_ """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument( '--init-model', type=str, default=None, help='load dict/features/weights/opts from this file') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=1, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-clip', '--gradient-clip', type=float, default=0.1, help='gradient clipping using l2 norm') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument( '-att', '--attention', default='none', choices=['none', 'concat', 'general', 'dot', 'local'], help='Choices: none, concat, general, local. ' 'If set local, also set attention-length. ' '(see arxiv.org/abs/1508.04025)') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--attention-time', default='post', choices=['pre', 'post'], help='Whether to apply attention before or after ' 'decoding.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('-gpu', '--gpu', type=int, default=-1, help='which GPU device to use') # ranking arguments agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='rank candidates if available. this is done by' ' computing the prob score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length. This ' 'reduces the total amount ' 'of padding in the batches.') agent.add_argument('-rnn', '--rnn-class', default='lstm', choices=Seq2seq.RNN_OPTS.keys(), help='Choose between different types of RNNs.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'], help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. ' 'Note that shared disables some encoder ' 'options--in particular, bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='unique', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. ' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='sgd', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-mom', '--momentum', default=-1, type=float, help='if applicable, momentum value for optimizer. ' 'if > 0, sgd uses nesterov momentum.') agent.add_argument('-emb', '--embedding-type', default='random', choices=[ 'random', 'glove', 'glove-fixed', 'fasttext', 'fasttext-fixed', 'glove-twitter' ], help='Choose between different strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove or ' 'Fasttext. ' 'Preinitialized embeddings can also be fixed ' 'so they are not updated during training.') agent.add_argument('-soft', '--numsoftmax', default=1, type=int, help='default 1, if greater then uses mixture of ' 'softmax (see arxiv.org/abs/1711.03953).') agent.add_argument('-rf', '--report-freq', type=float, default=0.001, help='Report frequency of prediction during eval.') agent.add_argument( '-histr', '--history-replies', default='label_else_model', type=str, choices=['none', 'model', 'label', 'label_else_model'], help='Keep replies in the history, or not.') agent.add_argument('-pt', '--person-tokens', type='bool', default=False, help='use special tokens before each speaker') agent.add_argument('--beam-size', type=int, default=1, help='Beam size, if 1 then greedy search') agent.add_argument( '--beam-log-freq', type=float, default=0.0, help='The portion of beams to dump from minibatch into model_name.beam_dump folder') agent.add_argument( '--topk', type=int, default=1, help='Top k sampling from renormalized softmax in test/valid time, default 1 means simple greedy max output') agent.add_argument( '--softmax-layer-bias', type='bool', default=False, help='Put True if you want to include the bias in decoder.e2s layer') Seq2seqAgent.dictionary_class().add_cmdline_args(argparser) return agent def __init__(self, opt, shared=None): """Set up model.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = { 'loss': 0.0, 'num_tokens': 0, 'correct_tokens': 0, 'total_skipped_batches': 0 } self.history = {} self.report_freq = opt.get('report_freq', 0.001) self.use_person_tokens = opt.get('person_tokens', False) self.batch_idx = shared and shared.get('batchindex') or 0 self.rank = opt['rank_candidates'] self.beam_size = opt.get('beam_size', 1) self.topk = opt.get('topk', 1) states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if opt.get('numthreads', 1) > 1: torch.set_num_threads(1) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] self.model = shared['model'] self.metrics = shared['metrics'] states = shared.get('states', {}) else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) init_model = None # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file', this would override init_model if opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] if init_model is not None: # load model parameters if available print('[ Loading existing model params from {} ]'.format(init_model)) states = self.load(init_model) if os.path.isfile(init_model + '.dict') or opt['dict_file'] is None: opt['dict_file'] = init_model + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] if not hasattr(self, 'model_class'): # this allows child classes to override this but inherit init self.model_class = Seq2seq self.model = self.model_class(opt, len(self.dict), padding_idx=self.NULL_IDX, start_idx=self.START_IDX, end_idx=self.END_IDX, longest_label=states.get('longest_label', 1)) if opt.get('dict_tokenizer') == 'bpe' and opt['embedding_type'] != 'random': print('skipping preinitialization of embeddings for bpe') elif not states and opt['embedding_type'] != 'random': # set up preinitialized embeddings try: import torchtext.vocab as vocab except ImportError as ex: print('Please install torchtext with `pip install torchtext`') raise ex pretrained_dim = 300 if opt['embedding_type'].startswith('glove'): if 'twitter' in opt['embedding_type']: init = 'glove-twitter' name = 'twitter.27B' pretrained_dim = 200 else: init = 'glove' name = '840B' embs = vocab.GloVe(name=name, dim=pretrained_dim, cache=modelzoo_path(self.opt.get('datapath'), 'models:glove_vectors')) elif opt['embedding_type'].startswith('fasttext'): init = 'fasttext' embs = vocab.FastText(language='en', cache=modelzoo_path(self.opt.get('datapath'), 'models:fasttext_vectors')) else: raise RuntimeError('embedding type not implemented') if opt['embeddingsize'] != pretrained_dim: rp = torch.Tensor(pretrained_dim, opt['embeddingsize']).normal_() t = lambda x: torch.mm(x.unsqueeze(0), rp) else: t = lambda x: x cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = t(embs.vectors[embs.stoi[w]]) self.model.decoder.lt.weight.data[i] = vec cnt += 1 if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self.model.encoder.lt.weight.data[i] = vec print('Seq2seq: initialized embeddings for {} tokens from {}.'.format(cnt, init)) if states: # set loaded states if applicable self.model.load_state_dict(states['model']) if self.use_cuda: self.model.cuda() # set up criteria if opt.get('numsoftmax', 1) > 1: self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX, size_average=False) else: self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: self.criterion.cuda() if 'train' in opt.get('datatype', ''): # we only set up optimizers when training # we only set this up for the original instance or hogwild ones self.clip = opt.get('gradient_clip', -1) # set up optimizer lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt.get('momentum') > 0 and opt['optimizer'] in ['sgd', 'rmsprop']: kwargs['momentum'] = opt['momentum'] if opt['optimizer'] == 'sgd': kwargs['nesterov'] = True if opt['optimizer'] == 'adam': # https://openreview.net/forum?id=ryQu7f-RZ kwargs['amsgrad'] = True if opt['embedding_type'].endswith('fixed'): print('Seq2seq: fixing embedding weights.') self.model.decoder.lt.weight.requires_grad = False self.model.encoder.lt.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False self.optimizer = optim_class([p for p in self.model.parameters() if p.requires_grad], **kwargs) if states.get('optimizer'): if states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: try: self.optimizer.load_state_dict(states['optimizer']) except ValueError: print('WARNING: not loading optim state since model ' 'params changed.') if self.use_cuda: for state in self.optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overridden key. Only override args specific to the model. """ model_args = { 'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'encoder', 'decoder', 'lookuptable', 'attention', 'attention_length', 'rnn_class' } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('[ Adding new option: | {k}: {v} | ]'.format(k=k, v=v)) elif self.opt[k] != v: print('[ Overriding option: | {k}: {old} => {v} | ]'.format(k=k, old=self.opt[k], v=v)) self.opt[k] = v if 'dict_file' in new_opt and not self.opt.get('dict_file'): print('[ No dictionary path detected, trying to load previous ' 'path {} ]'.format(new_opt['dict_file'])) self.opt['dict_file'] = new_opt['dict_file'] return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def v2t(self, vec): """Convert token indices to string of tokens.""" new_vec = [] for i in vec: if i == self.END_IDX: break elif i != self.START_IDX: new_vec.append(i) return self.dict.vec2txt(new_vec) def zero_grad(self): """Zero out optimizer.""" self.optimizer.zero_grad() def update_params(self): """Do one optimization step.""" if self.clip > 0: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip) self.optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.history.clear() for i in range(len(self.answers)): self.answers[i] = None self.reset_metrics() def reset_metrics(self): """Reset metrics for reporting loss and perplexity.""" self.metrics['loss'] = 0.0 self.metrics['num_tokens'] = 0 self.metrics['correct_tokens'] = 0 def report(self): """Report loss and perplexity from model's perspective. Note that this includes predicting __END__ and __UNK__ tokens and may differ from a truly independent measurement. """ m = {} num_tok = self.metrics['num_tokens'] if num_tok > 0: if self.metrics['correct_tokens'] > 0: m['token_acc'] = self.metrics['correct_tokens'] / num_tok m['loss'] = self.metrics['loss'] / num_tok try: m['ppl'] = math.exp(m['loss']) except OverflowError: m['ppl'] = float('inf') if self.metrics['total_skipped_batches'] > 0: m['total_skipped_batches'] = self.metrics['total_skipped_batches'] for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats m[k] = round_sigfigs(v, 4) return m def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['opt'] = self.opt shared['answers'] = self.answers shared['dict'] = self.dict shared['START_IDX'] = self.START_IDX shared['END_IDX'] = self.END_IDX shared['NULL_IDX'] = self.NULL_IDX shared['model'] = self.model if self.opt.get('numthreads', 1) > 1: # we're doing hogwild so share the model too if isinstance(self.metrics, dict): # move metrics and model to shared memory self.metrics = SharedTable(self.metrics) self.model.share_memory() shared['states'] = { # don't share optimizer states 'optimizer_type': self.opt['optimizer'], } shared['metrics'] = self.metrics # do after numthreads check return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. """ # shallow copy observation (deep copy can be expensive) obs = observation.copy() if not obs.get('preprocessed', False) or 'text2vec' not in obs: obs['text2vec'] = maintain_dialog_history( self.history, obs, reply=self.answers[self.batch_idx], historyLength=self.truncate, useReplies=self.opt.get('history_replies'), dict=self.dict, useStartEndIndices=self.use_person_tokens) else: obs['text2vec'] = deque(obs['text2vec'], maxlen=self.truncate) self.observation = obs self.answers[self.batch_idx] = None return obs def predict(self, xs, ys=None, cands=None, valid_cands=None, is_training=False): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ predictions, cand_preds = None, None if is_training: self.model.train() self.zero_grad() out = None try: out = self.model(xs, ys, rank_during_training=cands is not None) # generated response _preds, scores, cand_preds = out[0], out[1], out[2] score_view = scores.view(-1, scores.size(-1)) loss = self.criterion(score_view, ys.view(-1)) # save loss to metrics y_ne = ys.ne(self.NULL_IDX) target_tokens = y_ne.long().sum().item() correct = ((ys == _preds) * y_ne).sum().item() self.metrics['correct_tokens'] += correct self.metrics['loss'] += loss.item() self.metrics['num_tokens'] += target_tokens loss /= target_tokens # average loss per token loss.backward() except RuntimeError as e: # catch out of memory exceptions during fwd/bck (skip batch) if 'out of memory' in str(e): print('| WARNING: ran out of memory, skipping batch. ' 'if this happens frequently, decrease batchsize or ' 'truncate the inputs to the model.') self.metrics['total_skipped_batches'] += 1 return predictions, cand_preds else: raise e self.update_params() else: self.model.eval() if valid_cands: out = self.model(xs, ys=None, cands=cands, valid_cands=valid_cands, beam_size=self.beam_size, topk=self.topk) else: out = self.model(xs, ys=None, cands=cands, beam_size=self.beam_size, topk=self.topk) predictions, cand_preds = out[0], out[2] if ys is not None: # calculate loss on targets out = self.model(xs, ys) scores = out[1] score_view = scores.view(-1, scores.size(-1)) loss = self.criterion(score_view, ys.view(-1)) # save loss to metrics target_tokens = ys.ne(self.NULL_IDX).long().sum().item() self.metrics['loss'] += loss.item() self.metrics['num_tokens'] += target_tokens return predictions, cand_preds def vectorize(self, observations): """Convert a list of observations into input & target tensors.""" is_training = any(['labels' in obs for obs in observations]) xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text( observations, self.dict, end_idx=self.END_IDX, null_idx=self.NULL_IDX, dq=True, eval_labels=True, truncate=self.truncate) if xs is None: return None, None, None, None, None, None, None xs = torch.LongTensor(xs) if ys is not None: ys = torch.LongTensor(ys) if self.use_cuda: # copy to gpu xs = xs.cuda() if ys is not None: ys = ys.cuda() cands = None valid_cands = None if not is_training and self.rank: # set up candidates cands = [] valid_cands = [] for i, v in enumerate(valid_inds): if 'label_candidates' in observations[v]: curr_lcs = list(observations[v]['label_candidates']) curr_cands = [{'text': c} for c in curr_lcs] cs, _, _, valid_c_inds, *_ = PaddingUtils.pad_text(curr_cands, self.dict, null_idx=self.NULL_IDX, dq=True, truncate=self.truncate) valid_cands.append((i, v, [curr_lcs[j] for j in valid_c_inds])) cs = torch.LongTensor(cs) if self.use_cuda: cs = cs.cuda() cands.append(cs) return xs, ys, labels, valid_inds, cands, valid_cands, is_training def init_cuda_buffer(self, batchsize): if self.use_cuda and not hasattr(self, 'buffer_initialized'): try: print('preinitializing pytorch cuda buffer') bsz = self.opt.get('batchsize', batchsize) maxlen = self.truncate or 180 dummy = torch.ones(bsz, maxlen).long().cuda() sc = self.model(dummy, dummy)[1] loss = self.criterion(sc.view(-1, sc.size(-1)), dummy.view(-1)) loss.backward() self.buffer_initialized = True except RuntimeError as e: if 'out of memory' in str(e): m = ('CUDA OOM: Lower batch size (-bs) from {} or lower max' ' sequence length (-tr) from {}'.format(bsz, maxlen)) raise RuntimeError(m) else: raise e def batch_act(self, observations): batchsize = len(observations) self.init_cuda_buffer(batchsize) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, labels, valid_inds, cands, valid_cands, is_training = self.vectorize(observations) if xs is None: # no valid examples, just return empty responses return batch_reply # produce predictions, train on targets if available cand_inds = [i[0] for i in valid_cands] if valid_cands is not None else None predictions, cand_preds = self.predict(xs, ys, cands, cand_inds, is_training) if is_training: report_freq = 0 else: report_freq = self.report_freq if predictions is not None: PaddingUtils.map_predictions( predictions, valid_inds, batch_reply, observations, self.dict, self.END_IDX, report_freq=report_freq, labels=labels, answers=self.answers, ys=ys.data if ys is not None else None) if cand_preds is not None: if valid_cands is None: valid_cands = [(None, i, labels) for i in valid_inds] for i in range(len(valid_cands)): order = cand_preds[i] _, batch_idx, curr_cands = valid_cands[i] curr = batch_reply[batch_idx] curr['text_candidates'] = [curr_cands[idx] for idx in order if idx < len(curr_cands)] return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): model = {} model['model'] = self.model.state_dict() model['longest_label'] = self.model.longest_label model['optimizer'] = self.optimizer.state_dict() model['optimizer_type'] = self.opt['optimizer'] with open(path, 'wb') as write: torch.save(model, write) # save opt file with open(path + ".opt", 'wb') as handle: pickle.dump(self.opt, handle, protocol=pickle.HIGHEST_PROTOCOL) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None and hasattr(self, 'optimizer'): self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return model and optimizer states loaded from a checkpoint file.""" states = torch.load(path, map_location=lambda cpu, _: cpu) return states def receive_metrics(self, metrics_dict): """Use the metrics to decide when to adjust LR schedule.""" if 'loss' in metrics_dict: self.scheduler.step(metrics_dict['loss'])
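The `report()` method above derives perplexity as exp(total loss / total tokens). A worked example of that arithmetic with round numbers:

import math

metrics = {'loss': 460.5, 'num_tokens': 100}       # accumulated during training
per_token_loss = metrics['loss'] / metrics['num_tokens']  # 4.605
print(math.exp(per_token_loss))                    # ~100.0, the reported ppl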
class StarspaceAgent(Agent): """Simple implementation of the starspace algorithm: https://arxiv.org/abs/1709.03856 """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" agent = argparser.add_argument_group('StarSpace Arguments') agent.add_argument( '-emb', '--embedding-type', default='random', choices=[ 'random', 'glove', 'glove-fixed', 'glove-twitter-fixed', 'fasttext', 'fasttext-fixed', 'fasttext_cc', 'fasttext_cc-fixed' ], help='Choose between different strategies for initializing word ' 'embeddings. Default is random, but can also preinitialize ' 'from Glove or Fasttext. Preinitialized embeddings can also ' 'be fixed so they are not updated during training.') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-enorm', '--embeddingnorm', type=float, default=10, help='max norm of word embeddings') agent.add_argument('-shareEmb', '--share-embeddings', type='bool', default=True, help='whether LHS and RHS share embeddings') agent.add_argument( '--lins', default=0, type=int, help='If set to 1, add a linear layer between lhs and rhs.') agent.add_argument('-lr', '--learningrate', type=float, default=0.1, help='learning rate') agent.add_argument('-margin', '--margin', type=float, default=0.1, help='margin') agent.add_argument( '--input_dropout', type=float, default=0, help='fraction of input/output features to dropout during training') agent.add_argument('-opt', '--optimizer', default='sgd', choices=StarspaceAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length.') agent.add_argument('-k', '--neg-samples', type=int, default=10, help='number k of negative samples per example') agent.add_argument('--parrot-neg', type=int, default=0, help='include query as a negative') agent.add_argument( '--tfidf', type='bool', default=False, help='Use frequency based normalization for embeddings.') agent.add_argument('-cs', '--cache-size', type=int, default=1000, help='size of negative sample cache to draw from') agent.add_argument('-hist', '--history-length', default=10000, type=int, help='Number of past tokens to remember.') agent.add_argument( '-histr', '--history-replies', default='label_else_model', type=str, choices=['none', 'model', 'label', 'label_else_model'], help='Keep replies in the history, or not.') agent.add_argument('-fixedCands', '--fixed-candidates-file', default=None, type=str, help='File of cands to use for prediction') StarspaceAgent.dictionary_class().add_cmdline_args(argparser) def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt self.reset_metrics() self.id = 'Starspace' self.NULL_IDX = 0 self.cands = torch.LongTensor(1, 1, 1) self.ys_cache = [] self.ys_cache_sz = opt['cache_size'] self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.history = {} self.debugMode = False if shared: torch.set_num_threads(1) # set up shared properties self.dict = shared['dict'] self.model = shared['model'] else: print("[ creating StarspaceAgent ]") # this is not a shared instance of this class, so do full init if (opt.get('model_file') and (os.path.isfile(opt.get('model_file') + '.dict') or (opt['dict_file'] is None))): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.model = Starspace(opt, len(self.dict), self.dict) if opt.get('model_file') and os.path.isfile(opt['model_file']): self.load(opt['model_file']) else: self._init_embeddings() self.model.share_memory() # set up modules self.criterion = torch.nn.CosineEmbeddingLoss(margin=opt['margin'], size_average=False) self.reset() self.fixedCands = False self.fixedX = None if self.opt.get('fixed_candidates_file'): self.fixedCands_txt = load_cands(self.opt.get('fixed_candidates_file')) fcs = [] for c in self.fixedCands_txt: fcs.append(torch.LongTensor(self.parse(c)).unsqueeze(0)) self.fixedCands = fcs print("[loaded candidates]") def _init_embeddings(self, log=True): """Copy pretrained embeddings into the model's lookup table. The embedding source is selected by --embedding-type; with 'random' this is a no-op. :param log: print a summary of how many tokens were initialized. """ weight = self.model.lt.weight emb_type = self.opt.get('embedding_type', 'random') if emb_type == 'random': return embs, name = TorchAgent._get_embtype(self, emb_type) cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = TorchAgent._project_vec(self, embs.vectors[embs.stoi[w]], weight.size(1)) weight.data[i] = vec cnt += 1 if log: print('Initialized embeddings for {} tokens ({}%) from {}.' ''.format(cnt, round(cnt * 100 / len(self.dict), 1), name)) def reset(self): """Reset observation and episode_done.""" self.observation = None self.episode_done = True # set up optimizer lr = self.opt['learningrate'] optim_class = StarspaceAgent.OPTIM_OPTS[self.opt['optimizer']] kwargs = {'lr': lr} self.optimizer = optim_class(self.model.parameters(), **kwargs) def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['dict'] = self.dict shared['model'] = self.model return shared def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overridden key. Only override args specific to the model. """ model_args = {'embeddingsize', 'optimizer'} for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v} ]'.format(k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" vec = self.dict.txt2vec(text) if vec == []: vec = [self.dict[self.dict.null_token]] return vec def t2v(self, text): p = self.dict.txt2vec(text) return torch.LongTensor(p).unsqueeze(1) def v2t(self, vec): """Convert token indices to string of tokens.""" new_vec = [] for i in vec: new_vec.append(i) return self.dict.vec2txt(new_vec) def observe(self, observation): self.episode_done = observation['episode_done'] # shallow copy observation (deep copy can be expensive) obs = observation.copy() obs['text2vec'] = maintain_dialog_history( self.history, obs, historyLength=self.opt['history_length'], useReplies=self.opt['history_replies'], dict=self.dict, useStartEndIndices=False) self.observation = obs return obs def same(self, y1, y2): if len(y1.squeeze(0)) != len(y2.squeeze(0)): return False if abs((y1.squeeze(0) - y2.squeeze(0)).sum().data.sum()) > 0.00001: return False return True def get_negs(self, xs, ys): negs = [] cache_sz = len(self.ys_cache) - 1 if cache_sz < 1: return negs k = self.opt['neg_samples'] for i in range(1, k * 3): index = random.randint(0, cache_sz) neg = self.ys_cache[index] if not self.same(ys, neg): negs.append(neg) if len(negs) >= k: break if self.opt['parrot_neg'] > 0: utt = self.history['last_utterance'] if len(utt) > 2: query = torch.LongTensor(utt).unsqueeze(0) negs.append(query) return negs def dict_neighbors(self, word, useRHS=False): input = self.t2v(word) W = self.model.encoder.lt.weight q = W[input.data[0][0]] if useRHS: W = self.model.encoder2.lt.weight score = torch.Tensor(W.size(0)) for i in range(W.size(0)): score[i] = torch.nn.functional.cosine_similarity(q, W[i], dim=0).data[0] val, ind = score.sort(descending=True) for i in range(20): print(str(ind[i]) + " [" + str(val[i]) + "]: " + self.v2t(torch.Tensor([ind[i]]))) def compute_metrics(self, loss, scores): metrics = {} pos = scores[0] cnt = 0 for i in range(1, len(scores)): if scores[i] >= pos: cnt += 1 metrics['mean_rank'] = cnt metrics['loss'] = loss return metrics def input_dropout(self, xs, ys, negs): def dropout(x, rate): xd = [] for i in x[0]: if random.uniform(0, 1) > rate: xd.append(i) if len(xd) == 0: # pick one random thing to put in xd xd.append(x[0][random.randint(0, x.size(1) - 1)]) return torch.LongTensor(xd).unsqueeze(0) rate = self.opt.get('input_dropout') xs2 = dropout(xs, rate) ys2 = dropout(ys, rate) negs2 = [] for n in negs: negs2.append(dropout(n, rate)) return xs2, ys2, negs2 def predict(self, xs, ys=None, cands=None, cands_txt=None, obs=None): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ is_training = ys is not None if is_training: negs = self.get_negs(xs, ys) if is_training and len(negs) > 0: self.model.train() self.optimizer.zero_grad() if self.opt.get('input_dropout', 0) > 0: xs, ys, negs = self.input_dropout(xs, ys, negs) xe, ye = self.model(xs, ys, negs) if self.debugMode: # print example print("inp: " + self.v2t(xs.squeeze())) print("pos: " + self.v2t(ys.squeeze())) for c in negs: print("neg: " + self.v2t(c.squeeze())) print("---") y = -torch.ones(xe.size(0)) y[0] = 1 loss = self.criterion(xe, ye, y) loss.backward() self.optimizer.step() pred = nn.CosineSimilarity().forward(xe, ye) metrics = self.compute_metrics(loss.item(), pred.data.squeeze()) return [{'metrics': metrics}] else: self.model.eval() if cands is None or cands[0] is None: # cannot predict without candidates. if self.fixedCands: cands = [self.fixedCands] cands_txt = [self.fixedCands_txt] else: return [{'text': 'I dunno.'}] # test set prediction uses fixed candidates if self.fixedX is None: xe, ye = self.model(xs, ys, self.fixedCands) self.fixedX = ye else: # fixed candidate embedding vectors are cached; don't recompute dummy = torch.LongTensor([1]) xe, ye = self.model(xs, ys, [dummy]) ye = self.fixedX else: # test set prediction uses candidates xe, ye = self.model(xs, ys, cands[0]) pred = nn.CosineSimilarity().forward(xe, ye) # This is somewhat costly; we could avoid it if we did not evaluate ranking, # i.e. by only doing: val, ind = pred.max(0) val, ind = pred.sort(descending=True) # predict the highest scoring candidate, and return it. ypred = cands_txt[0][ind.data[0]] tc = [] for i in range(min(100, ind.size(0))): tc.append(cands_txt[0][ind.data[i]]) ret = [{'text': ypred, 'text_candidates': tc}] return ret return [{'id': self.getID()}] def vectorize(self, observations): """Convert a list of observations into input & target tensors.""" def valid(obs): # check if this is an example our model should actually process return 'text2vec' in obs and len(obs['text2vec']) > 0 try: # valid examples and their indices valid_inds, exs = zip(*[(i, ex) for i, ex in enumerate(observations) if valid(ex)]) except ValueError: # zero examples to process in this batch, so zip failed to unpack return None, None, None, None # `x` text is already tokenized and truncated # sort by length so we can use pack_padded parsed_x = [ex['text2vec'] for ex in exs] x_lens = [len(x) for x in parsed_x] ind_sorted = sorted(range(len(x_lens)), key=lambda k: -x_lens[k]) exs = [exs[k] for k in ind_sorted] valid_inds = [valid_inds[k] for k in ind_sorted] parsed_x = [parsed_x[k] for k in ind_sorted] labels_avail = any(['labels' in ex for ex in exs]) max_x_len = max([len(x) for x in parsed_x]) for x in parsed_x: x += [self.NULL_IDX] * (max_x_len - len(x)) xs = torch.LongTensor(parsed_x) # set up the target tensors ys = None labels = None if labels_avail: # randomly select one of the labels to update on, if multiple labels = [random.choice(ex.get('labels', [''])) for ex in exs] # parse each label and append END parsed_y = [deque(maxlen=self.truncate) for _ in labels] for dq, y in zip(parsed_y, labels): dq.extendleft(reversed(self.parse(y))) max_y_len = max(len(y) for y in parsed_y) for y in parsed_y: y += [self.NULL_IDX] * (max_y_len - len(y)) ys = torch.LongTensor(parsed_y) cands = [] cands_txt = [] if ys is None: # only build candidates in eval mode.
for o in observations: if o.get('label_candidates', False): cs = [] ct = [] for c in o['label_candidates']: cs.append(torch.LongTensor(self.parse(c)).unsqueeze(0)) ct.append(c) cands.append(cs) cands_txt.append(ct) else: cands.append(None) cands_txt.append(None) return xs, ys, cands, cands_txt def add_to_ys_cache(self, ys): if ys is None or len(ys) == 0: return if len(self.ys_cache) < self.ys_cache_sz: self.ys_cache.append(copy.deepcopy(ys)) else: ind = random.randint(0, self.ys_cache_sz - 1) self.ys_cache[ind] = copy.deepcopy(ys) def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, cands, cands_txt = self.vectorize(observations) batch_reply = self.predict(xs, ys, cands, cands_txt, observations) while len(batch_reply) < batchsize: batch_reply.append({'id': self.getID()}) self.add_to_ys_cache(ys) return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def shutdown(self): # """Save the state of the model when shutdown.""" super().shutdown() def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): data = {} data['model'] = self.model.state_dict() data['optimizer'] = self.optimizer.state_dict() data['opt'] = self.opt with open(path, 'wb') as handle: torch.save(data, handle) with open(path + '.opt', 'w') as handle: json.dump(self.opt, handle) def load(self, path): """Return opt and model states.""" print('Loading existing model params from ' + path) data = torch.load(path, map_location=lambda cpu, _: cpu) self.model.load_state_dict(data['model']) self.reset() self.optimizer.load_state_dict(data['optimizer']) self.opt = self.override_opt(data['opt'])
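# Illustrative sketch (not part of the agent): the training branch of
# predict() above reduces to a cosine-embedding objective: the query
# embedding should be close to its label embedding and at least `margin`
# away from each sampled negative. Shapes and the negative count below are
# assumptions for the example; reduction='sum' is the modern spelling of
# the size_average=False used in the constructor.
import torch
import torch.nn as nn

xe = torch.randn(1 + 10, 128)  # query embedding, repeated for each pair
ye = torch.randn(1 + 10, 128)  # label embedding followed by 10 negatives

# +1 for the positive pair, -1 for every negative pair, mirroring
# `y = -torch.ones(xe.size(0)); y[0] = 1` in predict()
y = -torch.ones(xe.size(0))
y[0] = 1

criterion = nn.CosineEmbeddingLoss(margin=0.1, reduction='sum')
loss = criterion(xe, ye, y)

# the mean_rank metric counts negatives scoring at least as high as the label
sims = nn.functional.cosine_similarity(xe, ye)
mean_rank = (sims[1:] >= sims[0]).sum().item()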
def __init__(self, opt, shared=None): """Set up model.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = { 'loss': 0.0, 'num_tokens': 0, 'correct_tokens': 0, 'total_skipped_batches': 0 } self.history = {} self.report_freq = opt.get('report_freq', 0.001) self.use_person_tokens = opt.get('person_tokens', False) self.batch_idx = shared and shared.get('batchindex') or 0 self.rank = opt['rank_candidates'] self.beam_size = opt.get('beam_size', 1) self.topk = opt.get('topk', 1) states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if opt.get('numthreads', 1) > 1: torch.set_num_threads(1) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] self.model = shared['model'] self.metrics = shared['metrics'] states = shared.get('states', {}) else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) init_model = None # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file', this would override init_model if opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] if init_model is not None: # load model parameters if available print('[ Loading existing model params from {} ]'.format( init_model)) states = self.load(init_model) if os.path.isfile(init_model + '.dict') or opt['dict_file'] is None: opt['dict_file'] = init_model + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] if not hasattr(self, 'model_class'): # this allows child classes to override this but inherit init self.model_class = Seq2seq self.model = self.model_class(opt, len(self.dict), padding_idx=self.NULL_IDX, start_idx=self.START_IDX, end_idx=self.END_IDX, longest_label=states.get( 'longest_label', 1)) if opt.get('dict_tokenizer' ) == 'bpe' and opt['embedding_type'] != 'random': print('skipping preinitialization of embeddings for bpe') elif not states and opt['embedding_type'] != 'random': # set up preinitialized embeddings try: import torchtext.vocab as vocab except ImportError as ex: print( 'Please install torch text with `pip install torchtext`' ) raise ex pretrained_dim = 300 if opt['embedding_type'].startswith('glove'): if 'twitter' in opt['embedding_type']: init = 'glove-twitter' name = 'twitter.27B' pretrained_dim = 200 else: init = 'glove' name = '840B' embs = vocab.GloVe(name=name, dim=pretrained_dim, cache=modelzoo_path( self.opt.get('datapath'), 'models:glove_vectors')) elif opt['embedding_type'].startswith('fasttext'): init = 'fasttext' embs = vocab.FastText(language='en', cache=modelzoo_path( 
self.opt.get('datapath'), 'models:fasttext_vectors')) else: raise RuntimeError('embedding type not implemented') if opt['embeddingsize'] != pretrained_dim: rp = torch.Tensor(pretrained_dim, opt['embeddingsize']).normal_() t = lambda x: torch.mm(x.unsqueeze(0), rp) else: t = lambda x: x cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = t(embs.vectors[embs.stoi[w]]) self.model.decoder.lt.weight.data[i] = vec cnt += 1 if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self.model.encoder.lt.weight.data[i] = vec print('Seq2seq: initialized embeddings for {} tokens from {}.' ''.format(cnt, init)) if states: # set loaded states if applicable self.model.load_state_dict(states['model']) if self.use_cuda: self.model.cuda() # set up criteria if opt.get('numsoftmax', 1) > 1: self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX, size_average=False) else: self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: self.criterion.cuda() if 'train' in opt.get('datatype', ''): # we only set up optimizers when training # we only set this up for the original instance or hogwild ones self.clip = opt.get('gradient_clip', -1) # set up optimizer lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt.get('momentum') > 0 and opt['optimizer'] in [ 'sgd', 'rmsprop' ]: kwargs['momentum'] = opt['momentum'] if opt['optimizer'] == 'sgd': kwargs['nesterov'] = True if opt['optimizer'] == 'adam': # https://openreview.net/forum?id=ryQu7f-RZ kwargs['amsgrad'] = True if opt['embedding_type'].endswith('fixed'): print('Seq2seq: fixing embedding weights.') self.model.decoder.lt.weight.requires_grad = False self.model.encoder.lt.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False self.optimizer = optim_class( [p for p in self.model.parameters() if p.requires_grad], **kwargs) if states.get('optimizer'): if states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: try: self.optimizer.load_state_dict(states['optimizer']) except ValueError: print('WARNING: not loading optim state since model ' 'params changed.') if self.use_cuda: for state in self.optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset()
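# Illustrative sketch (not part of the agent): when the pretrained vectors do
# not match `embeddingsize`, the constructor above multiplies each 300-dim
# vector by a fixed random matrix `rp`. The same projection in isolation; the
# names here are stand-ins for the example.
import torch

pretrained_dim, emb_size = 300, 128
# as in `rp = torch.Tensor(pretrained_dim, opt['embeddingsize']).normal_()`
rp = torch.randn(pretrained_dim, emb_size)

def project(vec):
    """Map a (pretrained_dim,) vector to (emb_size,)."""
    return vec.unsqueeze(0).mm(rp).squeeze(0)

glove_vec = torch.randn(pretrained_dim)  # stand-in for embs.vectors[embs.stoi[w]]
weight_row = project(glove_vec)          # would be copied into lt.weight.data[i]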
class FairseqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. For more information, see Convolutional Sequence to Sequence Learning `(Gehring et al. 2017) <https://arxiv.org/abs/1705.03122>`_. """ @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Fairseq Arguments') agent.add_argument( '-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up training (may ' 'reduce accuracy). This fixes all input and output to have a ' 'maximum length. This reduces the total amount of padding in ' 'the batches.') agent.add_argument( '--max-positions', default=1024, type=int, metavar='N', help='max number of tokens in the sequence') agent.add_argument( '--seed', default=1, type=int, metavar='N', help='pseudo random number generator seed') options.add_optimization_args(argparser) options.add_generation_args(argparser) options.add_model_args(argparser) def __init__(self, opt, shared=None): # initialize defaults first super().__init__(opt, shared) if not shared: # this is not a shared instance of this class, so do full # initialization. if shared is set, only set up shared members. saved_state = None if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, saved_state = self.load(opt['model_file']) # override options with stored ones opt = self._override_opt(new_opt) self.args = OptWrapper(opt) self.parlai_dict = DictionaryAgent(opt) self.fairseq_dict = _make_fairseq_dict(self.parlai_dict) self.id = 'Fairseq' self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.EOS = self.fairseq_dict[self.fairseq_dict.eos()] self.EOS_TENSOR = (torch.LongTensor(1, 1) .fill_(self.fairseq_dict.eos())) self.NULL_IDX = self.fairseq_dict.pad() encoder = fconv.FConvEncoder( self.fairseq_dict, embed_dim=self.args.encoder_embed_dim, convolutions=eval(self.args.encoder_layers), dropout=self.args.dropout, max_positions=self.args.max_positions) decoder = fconv.FConvDecoder( self.fairseq_dict, embed_dim=self.args.decoder_embed_dim, convolutions=eval(self.args.decoder_layers), out_embed_dim=self.args.decoder_out_embed_dim, attention=eval(self.args.decoder_attention), dropout=self.args.dropout, max_positions=self.args.max_positions) self.model = fconv.FConvModel(encoder, decoder) # from fairseq's build_criterion() if self.args.label_smoothing > 0: self.criterion = criterions.LabelSmoothedCrossEntropyCriterion( self.args.label_smoothing, self.NULL_IDX) else: self.criterion = criterions.CrossEntropyCriterion( self.args, self.fairseq_dict) self.trainer = MultiprocessingTrainer(self.args, self.model, self.criterion) if saved_state is not None: self.set_states(saved_state) self.reset() def _override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overriden key. Only override args specific to the model. 
""" model_args = { 'arch', 'encoder-embed-dim', 'encoder-layers', 'decoder-embed-dim', 'decoder-layers', 'decoder-out-embed-dim', 'decoder-attention', } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v}]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def reset(self): """Reset observation and episode_done.""" self.observation = None self.episode_done = True def observe(self, observation): # shallow copy observation (deep copy can be expensive) observation = observation.copy() if not self.episode_done and not observation.get('preprocessed', False): # if the last example wasn't the end of an episode, then we need to # recall what was said in that example prev_dialogue = self.observation['text'] observation['text'] = prev_dialogue + '\n' + observation['text'] self.observation = observation self.episode_done = observation['episode_done'] return observation def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def batch_act(self, observations): bsz = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(bsz)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field # also, split observations into sub-batches based on number of gpus obs_split = np.array_split(observations, self.trainer.num_replicas) samples = [self.batchify(obs) for obs in obs_split] samples = [s for s in samples if s[0] is not None] any_valid = any(len(s[0]) > 0 for s in samples) if not any_valid: # no valid examples, just return the empty responses we set up return batch_reply # produce predictions if testing; otherwise, train has_targets = any(s[1] is not None for s in samples) if not has_targets: offset = 0 for s in samples: xs = s[0] valid_inds = s[2] predictions = self._generate(self.args, xs) for i in range(len(predictions)): # map the predictions back to non-empty examples in the batch batch_reply[valid_inds[i] + offset]['text'] = predictions[i] if i == 0: print('prediction:', predictions[i]) offset += len(valid_inds) else: loss = self._train(samples) batch_reply[0]['metrics'] = {} for k, v in loss.items(): batch_reply[0]['metrics'][k] = v * bsz if k == 'loss': try: perplexity = 2 ** v * bsz except OverflowError: perplexity = float('inf') batch_reply[0]['metrics']['perplexity'] = perplexity return batch_reply def parse(self, string): return [self.fairseq_dict.index(word) for word in self.parlai_dict.tokenize(string)] def batchify(self, observations): """Convert a list of observations into input & target tensors.""" # valid examples exs = [ex for ex in observations if 'text' in ex] # the indices of the valid (non-empty) tensors valid_inds = [i for i, ex in enumerate(observations) if 'text' in ex] # set up the input tensors batchsize = len(exs) if batchsize == 0: return None, None, None # tokenize the text parsed_x = [deque(maxlen=self.truncate) for _ in exs] for dq, ex in zip(parsed_x, exs): dq += self.parse(ex['text']) # parsed = [self.parse(ex['text']) for ex in exs] max_x_len = max((len(x) for x in parsed_x)) for x in parsed_x: # left pad with zeros x.extendleft([self.fairseq_dict.pad()] * (max_x_len - len(x))) xs = 
torch.LongTensor(parsed_x) # set up the target tensors ys = None if 'labels' in exs[0]: # randomly select one of the labels to update on, if multiple labels = [random.choice(ex.get('labels', [''])) for ex in exs] parsed_y = [deque(maxlen=self.truncate) for _ in labels] for dq, y in zip(parsed_y, labels): dq.extendleft(reversed(self.parse(y))) for y in parsed_y: y.append(self.fairseq_dict.eos()) # append EOS to each label max_y_len = max(len(y) for y in parsed_y) for y in parsed_y: y += [self.fairseq_dict.pad()] * (max_y_len - len(y)) ys = torch.LongTensor(parsed_y) return xs, ys, valid_inds def _positions_for_tokens(self, tokens): size = tokens.size() not_pad = tokens.ne(self.fairseq_dict.pad()).long() new_pos = tokens.new(size).fill_(self.fairseq_dict.pad()) new_pos += not_pad for i in range(1, size[1]): new_pos[:, i] += new_pos[:, i-1] - 1 return new_pos def _right_shifted_ys(self, ys): result = torch.LongTensor(ys.size()) result[:, 0] = self.fairseq_dict.index(self.EOS) result[:, 1:] = ys[:, :-1] return result def _generate(self, opt, src_tokens): if not hasattr(self, 'translator'): self.translator = SequenceGenerator( [self.trainer.get_model()], beam_size=opt.beam, stop_early=(not opt.no_early_stop), normalize_scores=(not opt.unnormalized), len_penalty=opt.lenpen) self.translator.cuda() tokens = src_tokens.cuda(async=True) translations = self.translator.generate(Variable(tokens)) results = [t[0] for t in translations] output_lines = [[] for _ in range(len(results))] for i in range(len(results)): output_lines[i] = ' '.join(self.fairseq_dict[idx] for idx in results[i]['tokens'][:-1]) return output_lines def _train(self, samples): """Update the model using the targets.""" for i, sample in enumerate(samples): # add extra info to samples sample = { 'src_tokens': sample[0], 'input_tokens': self._right_shifted_ys(sample[1]), 'target': sample[1], 'id': None } sample['ntokens'] = sum(len(t) for t in sample['target']) sample['src_positions'] = self._positions_for_tokens( sample['src_tokens']) sample['input_positions'] = self._positions_for_tokens( sample['input_tokens']) samples[i] = sample return self.trainer.train_step(samples) def save(self, path=None): path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'trainer'): model = {} model['state_dict'] = self.trainer.get_model().state_dict() model['opt'] = self.opt with open(path, 'wb') as write: torch.save(model, write) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return opt and model states.""" with open(path, 'rb') as read: model = torch.load(read) return model['opt'], model['state_dict'] def set_states(self, state_dict): """Set the state dict of the model from saved states.""" self.trainer.get_model().load_state_dict(state_dict)
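# Illustrative sketch (not part of the agent): batchify() above clips each
# example to the `truncate` most recent tokens with deque(maxlen=...) and
# then left-pads to the batch maximum. The same pattern standalone; PAD=0 and
# the toy batch are assumptions for the example.
from collections import deque

PAD, truncate = 0, 5
batch = [[3, 4, 5, 6, 7, 8, 9], [3, 4]]

clipped = []
for toks in batch:
    dq = deque(maxlen=truncate)  # keeps only the last `truncate` tokens
    dq += toks
    clipped.append(dq)

max_len = max(len(dq) for dq in clipped)
for dq in clipped:
    dq.extendleft([PAD] * (max_len - len(dq)))  # left padding, as in batchify()

assert [list(dq) for dq in clipped] == [[5, 6, 7, 8, 9], [0, 0, 0, 3, 4]]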
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = {'loss': 0, 'num_tokens': 0} self.history = {} self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file' elif opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] else: init_model = None if init_model is not None: # load model parameters if available print('Loading existing model params from ' + init_model) new_opt, self.states = self.load(init_model) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None: if init_model is not None and os.path.isfile(init_model + '.dict'): # check first to see if a dictionary exists opt['dict_file'] = init_model + '.dict' elif opt.get('model_file'): # otherwise, set default dict-file if it is not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] encoder = EncoderRNN(len(self.dict), opt['maxlength_in'], opt['hiddensize'], dropout_p=opt['dropout'], input_dropout_p=opt['dropout'], n_layers=opt['numlayers'], rnn_cell=opt['rnncell'], bidirectional=opt['bidirectional'], variable_lengths=True) decoder = DecoderRNN( len(self.dict), opt['maxlength_out'], opt['hiddensize'] * 2 if opt['bidirectional'] else opt['hiddensize'], dropout_p=opt['dropout'], input_dropout_p=opt['dropout'], n_layers=opt['numlayers'], rnn_cell=opt['rnncell'], bidirectional=opt['bidirectional'], sos_id=self.START_IDX, eos_id=self.END_IDX, use_attention=opt['attention']) self.model = Seq2seq(encoder, decoder) if self.states: # set loaded states if applicable self.model.load_state_dict(self.states['model']) if self.use_cuda: self.model.cuda() if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt['gradient_clip'] # set up tensors once self.START = torch.LongTensor([self.START_IDX]) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) # set up criteria self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: # push to cuda self.START = self.START.cuda() self.xs = self.xs.cuda() self.ys = self.ys.cuda() 
self.criterion.cuda() # set up optimizer lr = opt['learningrate'] optim_class = IbmSeq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True self.optimizer = optim_class( [p for p in self.model.parameters() if p.requires_grad], **kwargs) if self.states: if self.states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: self.optimizer.load_state_dict(self.states['optimizer']) self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset()
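# Illustrative sketch (not part of the agent): this constructor and the
# Seq2seq one above both attach a ReduceLROnPlateau scheduler with
# factor=0.5, patience=3. It only takes effect if a validation metric is
# passed to step(); a minimal usage loop with a dummy model:
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(4, 2)  # stand-in for the seq2seq model
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 'min', factor=0.5, patience=3)

for epoch in range(10):
    valid_loss = 1.0  # placeholder: compute the real validation loss here
    # halves the lr after `patience` epochs without improvement in valid_loss
    scheduler.step(valid_loss)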
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) # all instances needs truncate param self.truncate = opt['truncate'] if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) states = None if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, states = self.load(opt['model_file']) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START = self.dict.start_token self.START_IDX = self.dict[self.START] self.START_TENSOR = torch.LongTensor([self.START_IDX]) # we use END markers to end our output self.END = self.dict.end_token self.END_IDX = self.dict[self.END] self.END_TENSOR = torch.LongTensor([self.END_IDX]) # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0] # store important params in self hsz = opt['hiddensize'] emb = opt['embeddingsize'] self.hidden_size = hsz self.emb_size = emb self.num_layers = opt['numlayers'] self.learning_rate = opt['learningrate'] self.rank = opt['rank_candidates'] self.longest_label = 1 self.attention = opt['attention'] self.bidirectional = opt['bidirectional'] self.num_dirs = 2 if self.bidirectional else 1 self.dropout = opt['dropout'] self.lm = opt['language_model'] # set up tensors once self.zeros = torch.zeros(self.num_layers * self.num_dirs, 1, hsz) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) if self.rank: self.cands = torch.LongTensor(1, 1, 1) self.cand_scores = torch.FloatTensor(1) self.cand_lengths = torch.LongTensor(1) # set up modules self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX) # lookup table stores word embeddings self.enc_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if opt['lookuptable'] in ['enc_dec', 'all']: # share this with the encoder self.dec_lt = self.enc_lt else: self.dec_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if not states and opt['embedding_type'].startswith('glove'): # set up pre-initialized vectors from GloVe try: import torchtext.vocab as vocab except ImportError: raise ImportError('Please install torchtext from' 'github.com/pytorch/text.') Glove = vocab.GloVe(name='840B', dim=300) # do better than uniform random proj = torch.FloatTensor(emb, 300).uniform_(-0.057735, 0.057735) if emb != 300 else None for w in self.dict.freq: if w in Glove.stoi: vec = Glove.vectors[Glove.stoi[w]] if emb != 300: vec = torch.mm(proj, vec.unsqueeze(1)).squeeze() self.enc_lt.weight.data[self.dict[w]] = vec self.dec_lt.weight.data[self.dict[w]] = vec # encoder captures 
the input text enc_class = Seq2seqAgent.ENC_OPTS[opt['encoder']] # decoder produces our output states if opt['decoder'] in ['same', 'shared']: # use same class as encoder self.decoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) else: # use set class dec_class = Seq2seqAgent.ENC_OPTS[opt['decoder']] self.decoder = dec_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) if opt['decoder'] == 'shared': # shared weights: use the decoder to encode if self.bidirectional: raise RuntimeError('Cannot share enc/dec and do ' 'bidirectional encoding.') self.encoder = self.decoder else: self.encoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True, bidirectional=self.bidirectional) # linear layers help us produce outputs from final decoder state hszXdirs = hsz * self.num_dirs # hidden to embedding self.h2e = nn.Linear(hsz, emb) # embedding to output. note that this CAN predict NULL self.e2o = nn.Linear(emb, len(self.dict)) if opt['lookuptable'] in ['dec_out', 'all']: # share these weights with the decoder lookup table self.e2o.weight = self.dec_lt.weight if self.attention != 'none': # we'll need this for all attention types self.attn_combine = nn.Linear(hszXdirs + emb, emb) if self.attention == 'local': # local attention over fixed set of output states if opt['attention_length'] < 0: raise RuntimeError('Set attention length to > 0.') self.max_length = opt['attention_length'] # combines input and previous hidden output layer self.attn = nn.Linear(hsz + emb, self.max_length) # combines attention weights with encoder outputs elif self.attention == 'concat': self.attn = nn.Linear(hsz + hszXdirs, hsz) self.attn_v = nn.Linear(hsz, 1) elif self.attention == 'general': # equivalent to dot if attn is identity self.attn = nn.Linear(hsz, hszXdirs) # set up optims for each module lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True self.optims = { 'decoder': optim_class(self.decoder.parameters(), **kwargs), 'h2e': optim_class(self.h2e.parameters(), **kwargs), } if opt['decoder'] != 'shared': # update the encoder as well self.optims['encoder'] = optim_class( self.encoder.parameters(), **kwargs) if not opt['embedding_type'].endswith('-fixed'): # update embeddings during training self.optims['enc_lt'] = optim_class( self.enc_lt.parameters(), **kwargs) self.optims['e2o'] = optim_class( self.e2o.parameters(), **kwargs) if opt['lookuptable'] not in ['enc_dec', 'all']: # only add dec if it's separate from enc self.optims['dec_lt'] = optim_class( self.dec_lt.parameters(), **kwargs) elif opt['lookuptable'] not in ['dec_out', 'all']: # embeddings are fixed, so only update e2o if it's not shared self.optims['e2o'] = optim_class( self.e2o.parameters(), **kwargs) # add attention parameters into optims if available for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): self.optims[attn_name] = optim_class( getattr(self, attn_name).parameters(), **kwargs) if states is not None: # set loaded states if applicable self.set_states(states) if self.use_cuda: self.cuda() self.reset()
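# Illustrative sketch (not part of the agent): instead of one optimizer over
# all parameters, the constructor above keeps a dict of per-module optimizers
# so fixed embeddings can be skipped and shared modules are not stepped
# twice. The zero_grad()/update_params() methods later in the file just loop
# over that dict; a reduced version:
import torch.nn as nn
import torch.optim as optim

enc = nn.GRU(8, 16, batch_first=True)
dec = nn.GRU(8, 16, batch_first=True)
optims = {
    'encoder': optim.SGD(enc.parameters(), lr=0.005, momentum=0.95, nesterov=True),
    'decoder': optim.SGD(dec.parameters(), lr=0.005, momentum=0.95, nesterov=True),
}

def zero_grad():
    for opt in optims.values():
        opt.zero_grad()

def update_params():
    for opt in optims.values():
        opt.step()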
class IrBaselineAgent(Agent):

    @staticmethod
    def add_cmdline_args(parser):
        DictionaryAgent.add_cmdline_args(parser)
        parser.add_argument('-lp', '--length_penalty', default=0.5,
                            help='length penalty for responses')

    def __init__(self, opt, shared=None):
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt

    def observe(self, obs):
        self.observation = obs
        self.dictionary.observe(obs)
        return obs

    def act(self):
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()
        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # rank candidates if available
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            rep = self.build_query_representation(obs['text'])
            reply['text_candidates'] = rank_candidates(
                rep, obs['label_candidates'], self.length_penalty)
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname):
        self.dictionary.save(fname + '.dict')

    def load(self, fname):
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build representation of query, e.g. words or n-grams."""
        rep = {}
        rep['words'] = {}
        words = query.lower().split(' ')
        rw = rep['words']
        used = {}
        for w in words:
            if len(self.dictionary.freqs()) > 0:
                rw[w] = 1.0 / (1.0 + math.log(1.0 + self.dictionary.freqs()[w]))
            else:
                if w not in stopwords:
                    rw[w] = 1
            used[w] = True
        norm = len(used)
        rep['norm'] = math.sqrt(len(words))
        return rep
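# Illustrative sketch (not part of the agent): the inverse-log-frequency
# weights built in build_query_representation() give rare words more say when
# ranking candidates. A toy scorer in the same spirit; the freqs dict and the
# scoring rule below are assumptions, not the real rank_candidates().
import math

freqs = {'the': 1000, 'movie': 120, 'soundtrack': 3}

def word_weight(w):
    # mirrors rw[w] = 1.0 / (1.0 + log(1.0 + freq))
    return 1.0 / (1.0 + math.log(1.0 + freqs.get(w, 0)))

def score(query, candidate, length_penalty=0.5):
    q = set(query.lower().split())
    c = candidate.lower().split()
    overlap = sum(word_weight(w) for w in c if w in q)
    # damp long candidates, analogous to the -lp/--length_penalty option
    return overlap / (len(c) ** length_penalty)

cands = ['the movie', 'the movie soundtrack was great']
query = 'what did you think of the soundtrack'
best = max(cands, key=lambda c: score(query, c))  # picks the rarer-word match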
def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', choices=['none', 'concat', 'general', 'dot', 'local'], help='Choices: none, concat, general, dot, local. ' 'If set local, also set attention-length. ' 'For more details see: ' 'https://arxiv.org/pdf/1508.04025.pdf') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='Rank candidates if available. This is done by' ' computing the mean score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length and to ' 'be similar in length to one another by throwing ' 'away extra tokens. This reduces the total amount ' 'of padding in the batches.') agent.add_argument('-enc', '--encoder', default='gru', choices=Seq2seqAgent.ENC_OPTS.keys(), help='Choose between different encoder modules.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'] + list(Seq2seqAgent.ENC_OPTS.keys()), help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. ' 'Note that shared disables some encoder ' 'options, in particular bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='all', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. ' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='adam', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-emb', '--embedding-type', default='random', choices=['random', 'glove', 'glove-fixed'], help='Choose between different strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove. '
'Preinitialized embeddings can also be fixed ' 'so they are not updated during training.') agent.add_argument('-lm', '--language-model', default='none', choices=['none', 'only', 'both'], help='Enable language modeling training on the ' 'concatenated input and label data.')
def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', choices=['none', 'concat', 'general', 'local'], help='Choices: none, concat, general, local. ' 'If set local, also set attention-length. ' 'For more details see: ' 'https://arxiv.org/pdf/1508.04025.pdf') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='Rank candidates if available. This is done by' ' computing the mean score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length and to ' 'be similar in length to one another by throwing ' 'away extra tokens. This reduces the total amount ' 'of padding in the batches.') agent.add_argument('-enc', '--encoder', default='gru', choices=Seq2seqAgent.ENC_OPTS.keys(), help='Choose between different encoder modules.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'] + list(Seq2seqAgent.ENC_OPTS.keys()), help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. ' 'Note that shared disables some encoder ' 'options, in particular bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='all', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. ' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='adam', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-emb', '--embedding-init', default='random', choices=['random', 'glove'], help='Choose between initialization strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove') agent.add_argument('-lm', '--language-model', type='bool', default=False, help='enable language modeling training on the ' 'concatenated input and label data')
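# Illustrative sketch (not part of the agents): both add_cmdline_args
# variants above pass type='bool' to add_argument, which plain argparse
# rejects unless a converter is registered under that name. The snippet below
# is roughly what ParlaiParser does internally; the converter is a stand-in,
# not ParlAI's exact code.
import argparse

def str2bool(value):
    if value.lower() in ('yes', 'true', '1'):
        return True
    if value.lower() in ('no', 'false', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value')

parser = argparse.ArgumentParser()
parser.register('type', 'bool', str2bool)   # makes type='bool' resolvable
group = parser.add_argument_group('Seq2Seq Arguments')
group.add_argument('-bi', '--bidirectional', type='bool', default=False)
assert parser.parse_args(['-bi', 'true']).bidirectional is True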
class Seq2seqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. This model supports encoding the input and decoding the output via one of several flavors of RNN. It then uses a linear layer (whose weights can be shared with the embedding layer) to convert RNN output states into output tokens. This model currently uses greedy decoding, selecting the highest probability token at each time step. For more information, see Sequence to Sequence Learning with Neural Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_. """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } ENC_OPTS = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM} @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', choices=['none', 'concat', 'general', 'local'], help='Choices: none, concat, general, local. ' 'If set local, also set attention-length. ' 'For more details see: ' 'https://arxiv.org/pdf/1508.04025.pdf') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='rank candidates if available. this is done by' ' computing the mean score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length and to ' 'be similar in length to one another by throwing ' 'away extra tokens. This reduces the total amount ' 'of padding in the batches.') agent.add_argument('-enc', '--encoder', default='gru', choices=Seq2seqAgent.ENC_OPTS.keys(), help='Choose between different encoder modules.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'] + list(Seq2seqAgent.ENC_OPTS.keys()), help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. ' 'Note that shared disabled some encoder ' 'options--in particular, bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='all', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. 
' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='adam', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-emb', '--embedding-init', default='random', choices=['random', 'glove'], help='Choose between initialization strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove') agent.add_argument('-lm', '--language-model', type='bool', default=False, help='enabled language modeling training on the ' 'concatenated input and label data') def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) # all instances needs truncate param self.truncate = opt['truncate'] if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available( ) if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) states = None if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, states = self.load(opt['model_file']) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START = self.dict.start_token self.START_IDX = self.dict[self.START] self.START_TENSOR = torch.LongTensor([self.START_IDX]) # we use END markers to end our output self.END = self.dict.end_token self.END_IDX = self.dict[self.END] self.END_TENSOR = torch.LongTensor([self.END_IDX]) # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0] # store important params in self hsz = opt['hiddensize'] emb = opt['embeddingsize'] self.hidden_size = hsz self.emb_size = emb self.num_layers = opt['numlayers'] self.learning_rate = opt['learningrate'] self.rank = opt['rank_candidates'] self.longest_label = 1 self.attention = opt['attention'] self.bidirectional = opt['bidirectional'] self.num_dirs = 2 if self.bidirectional else 1 self.dropout = opt['dropout'] self.lm = opt['language_model'] # set up tensors once self.zeros = torch.zeros(self.num_layers * self.num_dirs, 1, hsz) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) if self.rank: self.cands = torch.LongTensor(1, 1, 1) self.cand_scores = torch.FloatTensor(1) self.cand_lengths = torch.LongTensor(1) # set up modules self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX) # lookup table stores word embeddings self.enc_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if opt['lookuptable'] in 
['enc_dec', 'all']: # share this with the encoder self.dec_lt = self.enc_lt else: self.dec_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if not states and opt['embedding_init'] == 'glove': # set up pre-initialized vectors from GloVe try: import torchtext.vocab as vocab except ImportError: raise ImportError('Please install torchtext from' 'github.com/pytorch/text.') Glove = vocab.GloVe(name='840B', dim=300) # do better than uniform random proj = torch.FloatTensor(emb, 300).uniform_( -0.057735, 0.057735) if emb != 300 else None for w in self.dict.freq: if w in Glove.stoi: vec = Glove.vectors[Glove.stoi[w]] if emb != 300: vec = torch.mm(proj, vec.unsqueeze(1)).squeeze() self.enc_lt.weight.data[self.dict[w]] = vec self.dec_lt.weight.data[self.dict[w]] = vec # encoder captures the input text enc_class = Seq2seqAgent.ENC_OPTS[opt['encoder']] # decoder produces our output states if opt['decoder'] in ['same', 'shared']: # use same class as encoder self.decoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) else: # use set class dec_class = Seq2seqAgent.ENC_OPTS[opt['decoder']] self.decoder = dec_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) if opt['decoder'] == 'shared': # shared weights: use the decoder to encode if self.bidirectional: raise RuntimeError('Cannot share enc/dec and do ' 'bidirectional encoding.') self.encoder = self.decoder else: self.encoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True, bidirectional=self.bidirectional) # linear layers help us produce outputs from final decoder state hszXdirs = hsz * self.num_dirs # hidden to embedding self.h2e = nn.Linear(hsz, emb) # embedding to output. note that this CAN predict NULL self.e2o = nn.Linear(emb, len(self.dict)) if opt['lookuptable'] in ['dec_out', 'all']: # share these weights with the decoder lookup table self.e2o.weight = self.dec_lt.weight if self.attention == 'local': # local attention over fixed set of output states if opt['attention_length'] < 0: raise RuntimeError('Set attention length to > 0.') self.max_length = opt['attention_length'] # combines input and previous hidden output layer self.attn = nn.Linear(hsz + emb, self.max_length) # combines attention weights with encoder outputs self.attn_combine = nn.Linear(hszXdirs + emb, emb) elif self.attention == 'concat': self.attn = nn.Linear(hsz + hszXdirs, hsz) self.attn_v = nn.Linear(hsz, 1) self.attn_combine = nn.Linear(hszXdirs + emb, emb) elif self.attention == 'general': self.attn = nn.Linear(hsz, hszXdirs) self.attn_combine = nn.Linear(hszXdirs + emb, emb) # set up optims for each module lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True self.optims = { 'enc_lt': optim_class(self.enc_lt.parameters(), **kwargs), 'decoder': optim_class(self.decoder.parameters(), **kwargs), 'h2e': optim_class(self.h2e.parameters(), **kwargs), 'e2o': optim_class(self.e2o.parameters(), **kwargs), } if opt['decoder'] != 'shared': self.optims['encoder'] = optim_class(self.encoder.parameters(), **kwargs) if opt['lookuptable'] not in ['enc_dec', 'all']: # only add dec if it's separate from enc self.optims['dec_lt'] = optim_class(self.dec_lt.parameters(), **kwargs) # add attention parameters into optims if available for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): self.optims[attn_name] = optim_class( 
getattr(self, attn_name).parameters(), **kwargs) if states is not None: # set loaded states if applicable self.set_states(states) if self.use_cuda: self.cuda() self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overriden key. Only override args specific to the model. """ model_args = { 'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'encoder', 'decoder', 'lookuptable', 'attention', 'attention_length' } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v}]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def v2t(self, vec): """Convert token indices to string of tokens.""" new_vec = [] for i in vec: if i == self.END_IDX: break elif i not in [self.NULL_IDX, self.START_IDX]: new_vec.append(i) return self.dict.vec2txt(new_vec) def cuda(self): """Push parameters to the GPU.""" self.START_TENSOR = self.START_TENSOR.cuda(async=True) self.END_TENSOR = self.END_TENSOR.cuda(async=True) self.zeros = self.zeros.cuda(async=True) self.xs = self.xs.cuda(async=True) self.ys = self.ys.cuda(async=True) if self.rank: self.cands = self.cands.cuda(async=True) self.cand_scores = self.cand_scores.cuda(async=True) self.cand_lengths = self.cand_lengths.cuda(async=True) self.criterion.cuda() self.enc_lt.cuda() self.dec_lt.cuda() self.encoder.cuda() self.decoder.cuda() self.h2e.cuda() self.e2o.cuda() if self.attention != 'none': for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): getattr(self, attn_name).cuda() def hidden_to_idx(self, hidden, is_training=False): """Convert hidden state vectors into indices into the dictionary.""" # dropout at each step e = F.dropout(self.h2e(hidden), p=self.dropout, training=is_training) scores = F.dropout(self.e2o(e), p=self.dropout, training=is_training) _max_score, idx = scores.max(2) return idx, scores def zero_grad(self): """Zero out optimizers.""" for optimizer in self.optims.values(): optimizer.zero_grad() def update_params(self): """Do one optimization step.""" for optimizer in self.optims.values(): optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.episode_done = True def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['answers'] = self.answers shared['dict'] = self.dict shared['START_IDX'] = self.START_IDX shared['END_IDX'] = self.END_IDX return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. 
""" # shallow copy observation (deep copy can be expensive) observation = observation.copy() if 'text' in observation: # put START and END around text parsed_x = [self.START_IDX] parsed_x.extend(self.parse(observation['text'])) parsed_x.append(self.END_IDX) if self.truncate > 0: parsed_x = parsed_x[-self.truncate:] observation['text'] = parsed_x if not self.episode_done: prev_dialog = self.observation['text'] # get last y batch_idx = self.opt.get('batchindex', 0) if self.answers[batch_idx] is not None: # use our last answer, which is the label during training lastY = self.answers[batch_idx] prev_dialog.append(self.START_IDX) prev_dialog.extend(lastY) prev_dialog.append(self.END_IDX) self.answers[batch_idx] = None # forget last y prev_dialog.extend(parsed_x) if self.truncate > 0: prev_dialog = prev_dialog[-self.truncate:] observation['text'] = prev_dialog self.observation = observation self.episode_done = observation['episode_done'] return observation def _encode(self, xs, is_training=False): """Call encoder and return output and hidden states.""" self.lastxs = xs batchsize = len(xs) # first encode context xes = F.dropout(self.enc_lt(xs), p=self.dropout, training=is_training) # project from emb_size to hidden_size dimensions x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] xes_packed = pack_padded_sequence(xes, x_lens, batch_first=True) if self.zeros.size(1) != batchsize: self.zeros.resize_(self.num_layers * self.num_dirs, batchsize, self.hidden_size).fill_(0) h0 = Variable(self.zeros, requires_grad=False) if type(self.encoder) == nn.LSTM: encoder_output_packed, hidden = self.encoder(xes_packed, (h0, h0)) # take elementwise max between forward and backward hidden states hidden = (hidden[0].view(-1, self.num_dirs, hidden[0].size(1), hidden[0].size(2)).max(1)[0], hidden[1].view(-1, self.num_dirs, hidden[1].size(1), hidden[1].size(2)).max(1)[0]) if type(self.decoder) != nn.LSTM: hidden = hidden[0] else: encoder_output_packed, hidden = self.encoder(xes_packed, h0) # take elementwise max between forward and backward hidden states hidden = hidden.view(-1, self.num_dirs, hidden.size(1), hidden.size(2)).max(1)[0] if type(self.decoder) == nn.LSTM: hidden = (hidden, h0.narrow(0, 0, 2)) encoder_output, _ = pad_packed_sequence(encoder_output_packed, batch_first=True) encoder_output = encoder_output if self.attention == 'local': # if using local attention, narrow encoder_output to max_length if encoder_output.size(1) > self.max_length: offset = encoder_output.size(1) - self.max_length encoder_output = encoder_output.narrow(1, offset, self.max_length) return encoder_output, hidden def _apply_attention(self, xes, encoder_output, hidden, attn_mask=None): """Apply attention to encoder hidden layer.""" last_hidden = hidden[-1] # select hidden from last RNN layer if self.attention == 'concat': hidden_expand = last_hidden.unsqueeze(1).expand( last_hidden.size(0), encoder_output.size(1), last_hidden.size(1)) attn_w_premask = self.attn_v( F.tanh(self.attn(torch.cat((encoder_output, hidden_expand), 2)))).squeeze(2) attn_weights = F.softmax(attn_w_premask * attn_mask - (1 - attn_mask) * 1e20) elif self.attention == 'general': hidden_expand = last_hidden.unsqueeze(1) attn_w_premask = torch.bmm(self.attn(hidden_expand), encoder_output.transpose(1, 2)).squeeze(1) attn_weights = F.softmax(attn_w_premask * attn_mask - (1 - attn_mask) * 1e20) elif self.attention == 'local': attn_weights = F.softmax( self.attn(torch.cat((xes.squeeze(1), last_hidden), 1))) if attn_weights.size(1) > encoder_output.size(1): 
attn_weights = attn_weights.narrow(1, 0, encoder_output.size(1)) attn_applied = torch.bmm(attn_weights.unsqueeze(1), encoder_output).squeeze(1) output = torch.cat((xes.squeeze(1), attn_applied), 1) output = self.attn_combine(output).unsqueeze(1) output = F.tanh(output) return output def _decode_and_train(self, batchsize, xes, ys, encoder_output, hidden, attn_mask, lm=False): """Update the model based on the labels.""" self.zero_grad() loss = 0 predictions = [] # keep track of longest label we've ever seen # we'll never produce longer ones than that during prediction if not lm: self.longest_label = max(self.longest_label, ys.size(1)) if self.attention != 'none': # using attention, produce one token at a time for i in range(ys.size(1)): h_att = hidden[0] if type(self.decoder) == nn.LSTM else hidden output = self._apply_attention(xes, encoder_output, h_att, attn_mask) output, hidden = self.decoder(output, hidden) preds, scores = self.hidden_to_idx(output, is_training=True) y = ys.select(1, i) loss += self.criterion(scores.squeeze(1), y) # use the true token as the next input instead of predicted xes = self.dec_lt(y).unsqueeze(1) xes = F.dropout(xes, p=self.dropout, training=True) predictions.append(preds) else: # force the entire sequence at once by feeding in START + y[:-2] y_in = ys.narrow(1, 0, ys.size(1) - 1) xes = torch.cat([xes, self.dec_lt(y_in)], 1) output, hidden = self.decoder(xes, hidden) preds, scores = self.hidden_to_idx(output, is_training=True) for i in range(ys.size(1)): # sum loss per-token score = scores.select(1, i) y = ys.select(1, i) loss += self.criterion(score, y) predictions.append(preds) loss.backward() self.update_params() predictions = torch.cat(predictions, 1) if random.random() < 0.1: # sometimes output a prediction for debugging # print('prediction:', ' '.join(output_lines[0])) # print('label:', self.v2t(ys.data[0])) print('lm' if lm else ' ', 'loss:', loss.data[0]) return predictions def _decode_only(self, batchsize, xes, ys, encoder_output, hidden, attn_mask): """Just produce a prediction without training the model.""" done = [False for _ in range(batchsize)] total_done = 0 max_len = 0 predictions = [] # generate a response from scratch while (total_done < batchsize) and max_len < self.longest_label: # keep producing tokens until we hit END or max length for each # example in the batch if self.attention == 'none': output = xes else: h_att = hidden[0] if type(self.decoder) == nn.LSTM else hidden output = self._apply_attention(xes, encoder_output, h_att, attn_mask) output, hidden = self.decoder(output, hidden) preds, _scores = self.hidden_to_idx(output, is_training=False) predictions.append(preds) xes = self.dec_lt(preds) max_len += 1 for b in range(batchsize): if not done[b]: # only add more tokens for examples that aren't done yet if preds.data[b][0] == self.END_IDX: # if we produced END, we're done done[b] = True total_done += 1 predictions = torch.cat(predictions, 1) if random.random() < 0.2: # sometimes output a prediction for debugging print('\nprediction:', self.v2t(predictions.data[0])) return predictions def _score_candidates(self, cands, cand_inds, start, encoder_output, hidden, attn_mask): """Rank candidates by their likelihood according to the decoder.""" if type(self.decoder) == nn.LSTM: hidden, cell = hidden # score each candidate separately # cands are exs_with_cands x cands_per_ex x words_per_cand # cview is total_cands x words_per_cand cview = cands.view(-1, cands.size(2)) c_xes = start.expand(cview.size(0), start.size(0), start.size(1)) if 
len(cand_inds) != hidden.size(1): # only use hidden state from inputs with associated candidates cand_indices = torch.LongTensor([i for i, _, _ in cand_inds]) if self.use_cuda: cand_indices = cand_indices.cuda() cand_indices = Variable(cand_indices) hidden = hidden.index_select(1, cand_indices) sz = hidden.size() cands_hn = (hidden.view(sz[0], sz[1], 1, sz[2]).expand( sz[0], sz[1], cands.size(1), sz[2]).contiguous().view(sz[0], -1, sz[2])) if type(self.decoder) == nn.LSTM: if len(cand_inds) != cell.size(1): # only use cell state from inputs with associated candidates cell = cell.index_select(1, cand_indices) cands_hn = (cands_hn, cell.view(sz[0], sz[1], 1, sz[2]).expand( sz[0], sz[1], cands.size(1), sz[2]).contiguous().view(sz[0], -1, sz[2])) cand_scores = Variable( self.cand_scores.resize_(cview.size(0)).fill_(0)) cand_lengths = Variable( self.cand_lengths.resize_(cview.size(0)).fill_(0)) if self.attention != 'none': # using attention sz = encoder_output.size() cands_encoder_output = (encoder_output.contiguous().view( sz[0], 1, sz[1], sz[2]).expand(sz[0], cands.size(1), sz[1], sz[2]).contiguous().view(-1, sz[1], sz[2])) msz = attn_mask.size() cands_attn_mask = (attn_mask.contiguous().view( msz[0], 1, msz[1]).expand(msz[0], cands.size(1), msz[1]).contiguous().view(-1, msz[1])) for i in range(cview.size(1)): # process one token at a time h_att = cands_hn[0] if type( self.decoder) == nn.LSTM else cands_hn output = self._apply_attention(c_xes, cands_encoder_output, h_att, cands_attn_mask) output, cands_hn = self.decoder(output, cands_hn) _preds, scores = self.hidden_to_idx(output, is_training=False) cs = cview.select(1, i) non_nulls = cs.ne(self.NULL_IDX) cand_lengths += non_nulls.long() score_per_cand = torch.gather(scores.select(1, i), 1, cs.unsqueeze(1)) cand_scores += score_per_cand.squeeze() * non_nulls.float() c_xes = self.dec_lt(cs).unsqueeze(1) else: # process entire sequence at once if cview.size(1) > 1: # feed in START + cands[:-2] cands_in = cview.narrow(1, 0, cview.size(1) - 1) c_xes = torch.cat([c_xes, self.dec_lt(cands_in)], 1) output, cands_hn = self.decoder(c_xes, cands_hn) _preds, scores = self.hidden_to_idx(output, is_training=False) for i in range(cview.size(1)): # calculate score at each token cs = cview.select(1, i) non_nulls = cs.ne(self.NULL_IDX) cand_lengths += non_nulls.long() score_per_cand = torch.gather(scores.select(1, i), 1, cs.unsqueeze(1)) cand_scores += score_per_cand.squeeze() * non_nulls.float() # set empty scores to -1, so when divided by 0 they become -inf cand_scores -= cand_lengths.eq(0).float() # average the scores per token cand_scores /= cand_lengths.float() cand_scores = cand_scores.view(cands.size(0), cands.size(1)) srtd_scores, text_cand_inds = cand_scores.sort(1, True) text_cand_inds = text_cand_inds.data return text_cand_inds def predict(self, xs, ys=None, cands=None, valid_cands=None, lm=False): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. 
""" batchsize = len(xs) text_cand_inds = None is_training = ys is not None self.encoder.train(mode=is_training) self.decoder.train(mode=is_training) encoder_output, hidden = self._encode(xs, is_training) # next we use START as an input to kick off our decoder if not lm: x = Variable(self.START_TENSOR, requires_grad=False) xe = self.dec_lt(x) xe = F.dropout(xe, p=self.dropout, training=is_training) xes = xe.expand(batchsize, 1, xe.size(1)) else: # during language_model mode, just start with zeros xes = Variable(self.zeros[0].narrow(1, 0, self.emb_size).unsqueeze(1), requires_grad=False) if self.attention == 'none': attn_mask = None else: attn_mask = xs.ne(0).float() if is_training: predictions = self._decode_and_train(batchsize, xes, ys, encoder_output, hidden, attn_mask, lm=lm) else: if cands is not None: text_cand_inds = self._score_candidates( cands, valid_cands, xe, encoder_output, hidden, attn_mask) predictions = self._decode_only(batchsize, xes, ys, encoder_output, hidden, attn_mask) return predictions, text_cand_inds def batchify(self, observations): """Convert a list of observations into input & target tensors.""" def valid(obs): # check if this is an example our model should actually process return 'text' in obs and ('labels' in obs or 'eval_labels' in obs) # valid examples and their indices valid_inds, exs = zip(*[(i, ex) for i, ex in enumerate(observations) if valid(ex)]) # set up the input tensors batchsize = len(exs) if batchsize == 0: return None, None, None, None, None, None # `x` text is already tokenized and truncated parsed = [ex['text'] for ex in exs] x_lens = [len(x) for x in parsed] ind_sorted = sorted(range(len(x_lens)), key=lambda k: -x_lens[k]) exs = [exs[k] for k in ind_sorted] valid_inds = [valid_inds[k] for k in ind_sorted] parsed = [parsed[k] for k in ind_sorted] max_x_len = max([len(x) for x in parsed]) xs = torch.LongTensor(batchsize, max_x_len).fill_(self.NULL_IDX) # right-padded with zeros for i, x in enumerate(parsed): for j, idx in enumerate(x): xs[i][j] = idx if self.use_cuda: # copy to gpu self.xs.resize_(xs.size()) self.xs.copy_(xs, async=True) xs = Variable(self.xs) else: xs = Variable(xs) # set up the target tensors ys = None labels = None if any(['labels' in ex for ex in exs]): # randomly select one of the labels to update on, if multiple # append END to each label labels = [random.choice(ex.get('labels', [''])) for ex in exs] parsed = [self.parse(y + ' ' + self.END) for y in labels if y] max_y_len = max(len(y) for y in parsed) if self.truncate > 0: # shrink ys to to limit batch computation max_y_len = min(max_y_len, self.truncate) parsed = [y[:max_y_len] for y in parsed] ys = torch.LongTensor(batchsize, max_y_len).fill_(self.NULL_IDX) for i, y in enumerate(parsed): for j, idx in enumerate(y): ys[i][j] = idx if self.use_cuda: # copy to gpu self.ys.resize_(ys.size()) self.ys.copy_(ys, async=True) ys = Variable(self.ys) else: ys = Variable(ys) # set up candidates cands = None valid_cands = None if ys is None and self.rank: # only do ranking when no targets available and ranking flag set parsed = [] valid_cands = [] for i, v in enumerate(valid_inds): if 'label_candidates' in observations[i]: # each candidate tuple is a pair of the parsed version and # the original full string cs = list(observations[i]['label_candidates']) parsed.append([self.parse(c) for c in cs]) valid_cands.append((i, v, cs)) if len(parsed) > 0: # TODO: store lengths of cands separately, so don't have zero # padding for varying number of cands per example # found cands, pack them into 
tensor max_c_len = max(max(len(c) for c in cs) for cs in parsed) max_c_cnt = max(len(cs) for cs in parsed) cands = torch.LongTensor(len(parsed), max_c_cnt, max_c_len).fill_(self.NULL_IDX) for i, cs in enumerate(parsed): for j, c in enumerate(cs): for k, idx in enumerate(c): cands[i][j][k] = idx if self.use_cuda: # copy to gpu self.cands.resize_(cands.size()) self.cands.copy_(cands, async=True) cands = Variable(self.cands) else: cands = Variable(cands) return xs, ys, labels, valid_inds, cands, valid_cands def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, labels, valid_inds, cands, valid_cands = self.batchify( observations) if xs is None: # no valid examples, just return empty responses return batch_reply # produce predictions, train on targets if available predictions, text_cand_inds = self.predict(xs, ys, cands, valid_cands) if self.lm and ys is not None: # also train on lm task: given [START], predict [x y] # (regular task is given [x START] produce [y]) new_obs = [ { 'text': [self.START_IDX], 'labels': [ '{x} {s} {y}'.format( x=self.v2t(obs['text'][1:]), # skip START token s=self.START, y=random.choice(obs.get('labels', ['']))) ] } for obs in observations ] xs, ys, _, _, _, _ = self.batchify(new_obs) _, _ = self.predict(xs, ys, lm=True) predictions = predictions.cpu() for i in range(len(predictions)): # map the predictions back to non-empty examples in the batch # we join with spaces since we produce tokens one at a time curr = batch_reply[valid_inds[i]] output_tokens = [] for c in predictions.data[i]: if c == self.END_IDX or c == self.NULL_IDX: break else: output_tokens.append(c) curr_pred = self.v2t(output_tokens) curr['text'] = curr_pred if labels is not None: y = [] for c in ys.data[i]: if c == self.END_IDX or c == self.NULL_IDX: break else: y.append(c) self.answers[valid_inds[i]] = y else: self.answers[valid_inds[i]] = output_tokens if text_cand_inds is not None: for i in range(len(valid_cands)): order = text_cand_inds[i] _, batch_idx, curr_cands = valid_cands[i] curr = batch_reply[valid_inds[batch_idx]] curr['text_candidates'] = [ curr_cands[idx] for idx in order if idx < len(curr_cands) ] return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'optims'): model = {} model['enc_lt'] = self.enc_lt.state_dict() if self.opt['lookuptable'] not in ['enc_dec', 'all']: # dec_lt is enc_lt raise RuntimeError() # model['dec_lt'] = self.dec_lt.state_dict() if self.opt['decoder'] != 'shared': model['encoder'] = self.encoder.state_dict() model['decoder'] = self.decoder.state_dict() model['h2e'] = self.h2e.state_dict() model['e2o'] = self.e2o.state_dict() model['optims'] = { k: v.state_dict() for k, v in self.optims.items() } model['longest_label'] = self.longest_label model['opt'] = self.opt for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): model[attn_name] = getattr(self, attn_name).state_dict() with open(path, 'wb') as write: torch.save(model, write) def 
shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return opt and model states.""" with open(path, 'rb') as read: model = torch.load(read) return model['opt'], model def set_states(self, states): """Set the state dicts of the modules from saved states.""" self.enc_lt.load_state_dict(states['enc_lt']) if self.opt['lookuptable'] not in ['enc_dec', 'all']: # dec_lt is enc_lt raise RuntimeError( 'dec_lt state should not exist--it is same as enc_lt.') if self.opt['decoder'] != 'shared': self.encoder.load_state_dict(states['encoder']) self.decoder.load_state_dict(states['decoder']) self.h2e.load_state_dict(states['h2e']) self.e2o.load_state_dict(states['e2o']) for attn_name in ['attn', 'attn_v', 'attn_combine']: if attn_name in states: getattr(self, attn_name).load_state_dict(states[attn_name]) for k, v in states['optims'].items(): self.optims[k].load_state_dict(v) self.longest_label = states['longest_label']
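# A note on the ranking scheme implemented by _score_candidates above: each
# candidate's per-token scores are summed with NULL padding masked out, then
# divided by the candidate's length, so the ranking compares mean token
# likelihood rather than favoring short candidates. A minimal standalone
# sketch of the same idea -- `rank_candidates`, `log_probs`, and `cands` are
# hypothetical names for illustration, not part of the agent:
import torch

def rank_candidates(log_probs, cands, null_idx=0):
    """Sort candidates by mean token log-probability.

    log_probs: FloatTensor of shape (num_cands, seq_len, vocab_size)
    cands: LongTensor of shape (num_cands, seq_len), NULL-padded
    """
    # pick out the score the model assigned to each reference token
    token_scores = log_probs.gather(2, cands.unsqueeze(2)).squeeze(2)
    mask = cands.ne(null_idx).float()  # 1 for real tokens, 0 for padding
    lengths = mask.sum(dim=1)
    scores = (token_scores * mask).sum(dim=1) / lengths.clamp(min=1)
    scores = scores - lengths.eq(0).float() * 1e20  # empty cands rank last
    _sorted_scores, indices = scores.sort(descending=True)
    return indices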
def main(): # Get command line arguments argparser = ParlaiParser() DictionaryAgent.add_cmdline_args(argparser) ParsedRemoteAgent.add_cmdline_args(argparser) argparser.add_argument('--num-examples', default=1000, type=int) argparser.add_argument('--num-its', default=100, type=int) argparser.add_argument('--dict-max-exs', default=10000, type=int) parlai_home = os.environ['PARLAI_HOME'] if '--remote-cmd' not in sys.argv: if os.system('which luajit') != 0: raise RuntimeError('Could not detect torch luajit installed: ' + 'please install torch from http://torch.ch ' + 'or manually set --remote-cmd for this example.') sys.argv.append('--remote-cmd') sys.argv.append('luajit {}/parlai/agents/'.format(parlai_home) + 'memnn_luatorch_cpu/memnn_zmq_parsed.lua') if '--remote-args' not in sys.argv: sys.argv.append('--remote-args') sys.argv.append('{}/examples/'.format(parlai_home) + 'memnn_luatorch_cpu/params_default.lua') opt = argparser.parse_args() # set up dictionary print('Setting up dictionary.') dictionary = DictionaryAgent(opt) if not opt.get('dict_file'): # build dictionary since we didn't load it ordered_opt = copy.deepcopy(opt) ordered_opt['datatype'] = 'train:ordered' ordered_opt['numthreads'] = 1 world_dict = create_task(ordered_opt, dictionary) print('Dictionary building on training data.') cnt = 0 # pass examples to dictionary for _ in world_dict: cnt += 1 if cnt > opt['dict_max_exs'] and opt['dict_max_exs'] > 0: print('Processed {} exs, moving on.'.format( opt['dict_max_exs'])) # don't wait too long... break world_dict.parley() # we need to save the dictionary to load it in memnn (sort it by freq) dictionary.sort() dictionary.save('/tmp/dict.txt', sort=True) print('Dictionary ready, moving on to training.') opt['datatype'] = 'train' agent = ParsedRemoteAgent(opt, {'dictionary_shared': dictionary.share()}) world_train = create_task(opt, agent) opt['datatype'] = 'valid' world_valid = create_task(opt, agent) start = time.time() with world_train: for _ in range(opt['num_its']): print('[ training ]') for _ in range(opt['num_examples'] * opt.get('numthreads', 1)): world_train.parley() world_train.synchronize() print('[ validating ]') world_valid.reset() for _ in world_valid: # check valid accuracy world_valid.parley() print('[ validation summary. ]') report_valid = world_valid.report() print(report_valid) if report_valid['accuracy'] > 0.95: break # show some example dialogs after training: world_valid = create_task(opt, agent) for _k in range(3): world_valid.parley() print(world_valid.display()) print('finished in {} s'.format(round(time.time() - start, 2)))
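# The example above uses the usual ParlAI loop: parley() over the training
# world for a fixed number of examples, then sweep the validation world and
# stop early once accuracy clears a threshold. A stripped-down sketch of just
# that skeleton (assumes `create_task` is imported as in main();
# `train_until` is a hypothetical helper, not part of the example):
def train_until(opt, agent, max_its=100, num_exs=1000, stop_acc=0.95):
    opt['datatype'] = 'train'
    world_train = create_task(opt, agent)
    opt['datatype'] = 'valid'
    world_valid = create_task(opt, agent)
    report = None
    for _ in range(max_its):
        for _ in range(num_exs):
            world_train.parley()  # process one training example
        world_valid.reset()
        for _ in world_valid:  # one full pass over the validation set
            world_valid.parley()
        report = world_valid.report()
        if report.get('accuracy', 0) > stop_acc:
            break  # good enough, stop training
    return report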
def add_cmdline_args(parser):
    DictionaryAgent.add_cmdline_args(parser)
    parser.add_argument(
        '-lp', '--length_penalty', type=float, default=0.5,
        help='length penalty for responses')
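# For context on --length_penalty: ranking agents commonly rescale a
# candidate's score by a power of its length so that longer responses are
# not unduly punished (or favored). The exact formula this agent uses is not
# shown here; the helper below is a hedged sketch of one common choice
# (`apply_length_penalty` is a hypothetical name):
def apply_length_penalty(total_logprob, num_tokens, length_penalty=0.5):
    """Length-normalize a summed log-probability.

    length_penalty=0 keeps the raw sum; 1.0 is a per-token average;
    values in between trade off the two.
    """
    return total_logprob / (max(num_tokens, 1) ** length_penalty)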
class Seq2seqAgent(Agent): """Simple agent which uses an LSTM to process incoming text observations.""" @staticmethod def add_cmdline_args(argparser): DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('-hs', '--hiddensize', type=int, default=64, help='size of the hidden layers and embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.5, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') def __init__(self, opt, shared=None): super().__init__(opt, shared) opt['cuda'] = not opt['no_cuda'] and torch.cuda.is_available() if opt['cuda']: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) if not shared: # don't enter this loop for shared (ie batch) instantiations self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' hsz = opt['hiddensize'] self.EOS = self.dict.eos_token self.observation = {'text': self.EOS, 'episode_done': True} self.EOS_TENSOR = torch.LongTensor(self.dict.parse(self.EOS)) self.hidden_size = hsz self.num_layers = opt['numlayers'] self.learning_rate = opt['learningrate'] self.use_cuda = opt.get('cuda', False) self.longest_label = 1 self.criterion = nn.NLLLoss() self.lt = nn.Embedding(len(self.dict), hsz, padding_idx=0, scale_grad_by_freq=True) self.encoder = nn.GRU(hsz, hsz, opt['numlayers']) self.decoder = nn.GRU(hsz, hsz, opt['numlayers']) self.d2o = nn.Linear(hsz, len(self.dict)) self.dropout = nn.Dropout(opt['dropout']) self.softmax = nn.LogSoftmax() lr = opt['learningrate'] self.optims = { 'lt': optim.SGD(self.lt.parameters(), lr=lr), 'encoder': optim.SGD(self.encoder.parameters(), lr=lr), 'decoder': optim.SGD(self.decoder.parameters(), lr=lr), 'd2o': optim.SGD(self.d2o.parameters(), lr=lr), } if self.use_cuda: self.cuda() if opt.get('model_file') and os.path.isfile(opt['model_file']): print('Loading existing model parameters from ' + opt['model_file']) self.load(opt['model_file']) self.episode_done = True def parse(self, text): return torch.LongTensor(self.dict.txt2vec(text)) def v2t(self, vec): return self.dict.vec2txt(vec) def cuda(self): self.criterion.cuda() self.lt.cuda() self.encoder.cuda() self.decoder.cuda() self.d2o.cuda() self.dropout.cuda() self.softmax.cuda() def hidden_to_idx(self, hidden, drop=False): if hidden.size(0) > 1: raise RuntimeError('bad dimensions of tensor:', hidden) hidden = hidden.squeeze(0) scores = self.d2o(hidden) if drop: scores = self.dropout(scores) scores = self.softmax(scores) _max_score, idx = scores.max(1) return idx, scores def zero_grad(self): for optimizer in self.optims.values(): optimizer.zero_grad() def update_params(self): for optimizer in self.optims.values(): optimizer.step() def init_zeros(self, bsz=1): t = torch.zeros(self.num_layers, bsz, self.hidden_size) if self.use_cuda: t = t.cuda(async=True) return Variable(t) def init_rand(self, bsz=1): t = torch.FloatTensor(self.num_layers, bsz, self.hidden_size) t.uniform_(0.05) if self.use_cuda: t = t.cuda(async=True) return Variable(t) def observe(self, observation): observation = copy.deepcopy(observation) if not self.episode_done: # if the last example wasn't the end of an episode, then we need to # recall what was said in that example prev_dialogue 
= self.observation['text'] observation['text'] = prev_dialogue + '\n' + observation['text'] self.observation = observation self.episode_done = observation['episode_done'] return observation def update(self, xs, ys): batchsize = len(xs) # first encode context xes = self.lt(xs).t() h0 = self.init_zeros(batchsize) _output, hn = self.encoder(xes, h0) # start with EOS tensor for all x = self.EOS_TENSOR if self.use_cuda: x = x.cuda(async=True) x = Variable(x) xe = self.lt(x).unsqueeze(1) xes = xe.expand(xe.size(0), batchsize, xe.size(2)) output_lines = [[] for _ in range(batchsize)] self.zero_grad() # update model loss = 0 self.longest_label = max(self.longest_label, ys.size(1)) for i in range(ys.size(1)): output, hn = self.decoder(xes, hn) preds, scores = self.hidden_to_idx(output, drop=True) y = ys.select(1, i) loss += self.criterion(scores, y) # use the true token as the next input xes = self.lt(y).unsqueeze(0) # hn = self.dropout(hn) for j in range(preds.size(0)): token = self.v2t([preds.data[j][0]]) output_lines[j].append(token) loss.backward() self.update_params() if random.random() < 0.1: true = self.v2t(ys.data[0]) #print('loss:', round(loss.data[0], 2), # ' '.join(output_lines[0]), '(true: {})'.format(true)) return output_lines def predict(self, xs): batchsize = len(xs) # first encode context xes = self.lt(xs).t() h0 = self.init_zeros(batchsize) _output, hn = self.encoder(xes, h0) # start with EOS tensor for all x = self.EOS_TENSOR if self.use_cuda: x = x.cuda(async=True) x = Variable(x) xe = self.lt(x).unsqueeze(1) xes = xe.expand(xe.size(0), batchsize, xe.size(2)) done = [False for _ in range(batchsize)] total_done = 0 max_len = 0 output_lines = [[] for _ in range(batchsize)] while(total_done < batchsize) and max_len < self.longest_label: output, hn = self.decoder(xes, hn) preds, scores = self.hidden_to_idx(output, drop=False) xes = self.lt(preds.t()) max_len += 1 for i in range(preds.size(0)): if not done[i]: token = self.v2t(preds.data[i]) if token == self.EOS: done[i] = True total_done += 1 else: output_lines[i].append(token) if random.random() < 0.1: print('prediction:', ' '.join(output_lines[0])) return output_lines def batchify(self, obs): exs = [ex for ex in obs if 'text' in ex] valid_inds = [i for i, ex in enumerate(obs) if 'text' in ex] batchsize = len(exs) parsed = [self.parse(ex['text']) for ex in exs] max_x_len = max([len(x) for x in parsed]) xs = torch.LongTensor(batchsize, max_x_len).fill_(0) for i, x in enumerate(parsed): offset = max_x_len - len(x) for j, idx in enumerate(x): xs[i][j + offset] = idx if self.use_cuda: xs = xs.cuda(async=True) xs = Variable(xs) ys = None if 'labels' in exs[0]: labels = [random.choice(ex['labels']) + ' ' + self.EOS for ex in exs] parsed = [self.parse(y) for y in labels] max_y_len = max(len(y) for y in parsed) ys = torch.LongTensor(batchsize, max_y_len).fill_(0) for i, y in enumerate(parsed): for j, idx in enumerate(y): ys[i][j] = idx if self.use_cuda: ys = ys.cuda(async=True) ys = Variable(ys) return xs, ys, valid_inds def batch_act(self, observations): batchsize = len(observations) batch_reply = [{'id': self.getID()} for _ in range(batchsize)] xs, ys, valid_inds = self.batchify(observations) if len(xs) == 0: return batch_reply # Either train or predict if ys is not None: predictions = self.update(xs, ys) else: predictions = self.predict(xs) for i in range(len(predictions)): batch_reply[valid_inds[i]]['text'] = ' '.join( c for c in predictions[i] if c != self.EOS) return batch_reply def act(self): return 
self.batch_act([self.observation])[0] def save(self, path=None): path = self.opt.get('model_file', None) if path is None else path if path: model = {} model['lt'] = self.lt.state_dict() model['encoder'] = self.encoder.state_dict() model['decoder'] = self.decoder.state_dict() model['d2o'] = self.d2o.state_dict() model['longest_label'] = self.longest_label with open(path, 'wb') as write: torch.save(model, write) def load(self, path): with open(path, 'rb') as read: model = torch.load(read) self.lt.load_state_dict(model['lt']) self.encoder.load_state_dict(model['encoder']) self.decoder.load_state_dict(model['decoder']) self.d2o.load_state_dict(model['d2o']) self.longest_label = model['longest_label']
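# One detail of the agent above worth noting: batchify() left-pads its
# inputs (tokens are written at offset max_x_len - len(x)), so every row
# ends with a real token right where the encoder finishes reading. A minimal
# sketch of that padding scheme (`left_pad` is a hypothetical helper; pad
# index 0, as in the code above):
import torch

def left_pad(sequences, pad_idx=0):
    """Pack lists of token ids into a right-aligned LongTensor."""
    max_len = max(len(s) for s in sequences)
    out = torch.LongTensor(len(sequences), max_len).fill_(pad_idx)
    for i, seq in enumerate(sequences):
        offset = max_len - len(seq)
        for j, idx in enumerate(seq):
            out[i][j + offset] = idx
    return out

# left_pad([[5, 6, 7], [8]]) gives [[5, 6, 7], [0, 0, 8]]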
class Seq2seqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. This model supports encoding the input and decoding the output via one of several flavors of RNN. It then uses a linear layer (whose weights can be shared with the embedding layer) to convert RNN output states into output tokens. This model currently uses greedy decoding, selecting the highest probability token at each time step. For more information, see Sequence to Sequence Learning with Neural Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_. """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('--init-model', type=str, default=None, help='load dict/features/weights/opts from this file') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=1, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-clip', '--gradient-clip', type=float, default=-1, help='gradient clipping using l2 norm') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', choices=['none', 'concat', 'general', 'dot', 'local'], help='Choices: none, concat, general, local. ' 'If set local, also set attention-length. ' 'For more details see: ' 'https://arxiv.org/abs/1508.04025') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--attention-time', default='post', choices=['pre', 'post'], help='Whether to apply attention before or after ' 'decoding.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='rank candidates if available. this is done by' ' computing the mean score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length. This ' 'reduces the total amount ' 'of padding in the batches.') agent.add_argument('-rnn', '--rnn-class', default='lstm', choices=Seq2seq.RNN_OPTS.keys(), help='Choose between different types of RNNs.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'], help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. 
' 'Note that shared disabled some encoder ' 'options--in particular, bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='unique', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. ' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='sgd', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-mom', '--momentum', default=-1, type=float, help='if applicable, momentum value for optimizer. ' 'if > 0, sgd uses nesterov momentum.') agent.add_argument('-emb', '--embedding-type', default='random', choices=['random', 'glove', 'glove-fixed', 'fasttext', 'fasttext-fixed'], help='Choose between different strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove or ' 'Fasttext.' 'Preinitialized embeddings can also be fixed ' 'so they are not updated during training.') agent.add_argument('-rf', '--report-freq', type=float, default=0.001, help='Report frequency of prediction during eval.') Seq2seqAgent.dictionary_class().add_cmdline_args(argparser) return agent def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = {'loss': 0.0, 'num_tokens': 0} self.history = {} self.report_freq = opt.get('report_freq', 0.001) states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if opt.get('numthreads') > 1: torch.set_num_threads(1) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.metrics = shared['metrics'] states = shared['states'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file' elif opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] else: init_model = None if init_model is not None: # load model parameters if available print('[ Loading existing model params from {} ]'.format(init_model)) new_opt, states = self.load(init_model) # override model-specific options with stored ones opt = self.override_opt(new_opt) self.opt = opt if opt['dict_file'] is None: if init_model is not None and os.path.isfile(init_model + '.dict'): # check first to see if a dictionary exists opt['dict_file'] = init_model + '.dict' elif opt.get('model_file'): # otherwise, set default dict-file if 
it is not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] if not hasattr(self, 'model_class'): # this allows child classes to override this but inherit init self.model_class = Seq2seq self.model = self.model_class( opt, len(self.dict), padding_idx=self.NULL_IDX, start_idx=self.START_IDX, end_idx=self.END_IDX, longest_label=states.get('longest_label', 1)) if opt['embedding_type'] != 'random': # set up preinitialized embeddings try: import torchtext.vocab as vocab except ModuleNotFoundError as ex: print('Please install torch text with `pip install torchtext`') raise ex if opt['embedding_type'].startswith('glove'): init = 'glove' embs = vocab.GloVe(name='840B', dim=300, cache=os.path.join(opt['parlai_home'], '.vector_cache')) elif opt['embedding_type'].startswith('fasttext'): init = 'fasttext' embs = vocab.FastText(language='en', cache=os.path.join(opt['parlai_home'], '.vector_cache')) else: raise RuntimeError('embedding type not implemented') if opt['embeddingsize'] != 300: rp = torch.Tensor(300, opt['embeddingsize']).normal_() t = lambda x: torch.mm(x.unsqueeze(0), rp) else: t = lambda x: x cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = t(embs.vectors[embs.stoi[w]]) self.model.decoder.lt.weight.data[i] = vec cnt += 1 if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self.model.encoder.lt.weight.data[i] = vec print('Seq2seq: initialized embeddings for {} tokens from {}.' 
''.format(cnt, init)) if states: # set loaded states if applicable self.model.load_state_dict(states['model']) if self.use_cuda: self.model.cuda() if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt.get('gradient_clip', -1) self.rank = opt['rank_candidates'] # set up tensors once self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) if self.rank: self.cands = torch.LongTensor(1, 1, 1) # set up criteria self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: # push to cuda self.xs = self.xs.cuda() self.ys = self.ys.cuda() if self.rank: self.cands = self.cands.cuda() self.criterion.cuda() # set up optimizer lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt.get('momentum') > 0 and opt['optimizer'] in ['sgd', 'rmsprop']: kwargs['momentum'] = opt['momentum'] if opt['optimizer'] == 'sgd': kwargs['nesterov'] = True if opt['embedding_type'].endswith('fixed'): print('Seq2seq: fixing embedding weights.') self.model.decoder.lt.weight.requires_grad = False self.model.encoder.lt.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False self.optimizer = optim_class([p for p in self.model.parameters() if p.requires_grad], **kwargs) if states.get('optimizer'): if states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: self.optimizer.load_state_dict(states['optimizer']) if self.use_cuda: for state in self.optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overriden key. Only override args specific to the model. 
""" model_args = {'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'encoder', 'decoder', 'lookuptable', 'attention', 'attention_length', 'rnn_class'} for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('[ Adding new option: | {k}: {v} | ]'.format(k=k, v=v)) elif self.opt[k] != v: print('[ Overriding option: | {k}: {old} => {v} | ]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v if 'dict_file' in new_opt and not self.opt.get('dict_file'): print('[ No dictionary path detected, trying to load previous ' 'path {} ]'.format(new_opt['dict_file'])) self.opt['dict_file'] = new_opt['dict_file'] return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def v2t(self, vec): """Convert token indices to string of tokens.""" if isinstance(vec, Variable): vec = vec.data new_vec = [] for i in vec: if i == self.END_IDX: break elif i != self.START_IDX: new_vec.append(i) return self.dict.vec2txt(new_vec) def zero_grad(self): """Zero out optimizer.""" self.optimizer.zero_grad() def update_params(self): """Do one optimization step.""" if self.clip > 0: torch.nn.utils.clip_grad_norm(self.model.parameters(), self.clip) self.optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.history.clear() self.reset_metrics() def reset_metrics(self): self.metrics['loss'] = 0.0 self.metrics['num_tokens'] = 0 def report(self): m = {} if self.metrics['num_tokens'] > 0: m['loss'] = self.metrics['loss'] / self.metrics['num_tokens'] m['ppl'] = math.exp(m['loss']) for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats m[k] = round_sigfigs(v, 4) return m def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['opt'] = self.opt shared['answers'] = self.answers shared['dict'] = self.dict shared['START_IDX'] = self.START_IDX shared['END_IDX'] = self.END_IDX shared['NULL_IDX'] = self.NULL_IDX if self.opt.get('numthreads', 1) > 1: if type(self.metrics) == dict: self.metrics = SharedTable(self.metrics) self.model.share_memory() shared['metrics'] = self.metrics shared['model'] = self.model shared['states'] = { # only need to pass optimizer states 'optimizer': self.optimizer.state_dict(), 'optimizer_type': self.opt['optimizer'], } return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. """ # shallow copy observation (deep copy can be expensive) obs = observation.copy() batch_idx = self.opt.get('batchindex', 0) if not obs.get('preprocessed', False) or 'text2vec' not in obs: obs['text2vec'] = maintain_dialog_history( self.history, obs, reply=self.answers[batch_idx], historyLength=self.truncate, useReplies=self.opt['include_labels'], dict=self.dict, useStartEndIndices=False) else: obs['text2vec'] = deque(obs['text2vec'], maxlen=self.truncate) self.observation = obs self.answers[batch_idx] = None return obs def predict(self, xs, ys=None, cands=None, valid_cands=None, is_training=False): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. 
""" text_cand_inds, loss_dict = None, None if is_training: self.model.train() self.zero_grad() out = self.model(xs, ys) predictions, scores = out[0], out[1] loss = self.criterion(scores.view(-1, scores.size(-1)), ys.view(-1)) # save loss to metrics target_tokens = ys.ne(self.NULL_IDX).long().sum().data[0] self.metrics['loss'] += loss.double().data[0] self.metrics['num_tokens'] += target_tokens loss /= target_tokens # average loss per token # loss /= xs.size(0) # average loss per sentence loss.backward() self.update_params() else: self.model.eval() out = self.model(xs, ys=None, cands=cands, valid_cands=valid_cands) predictions, text_cand_inds = out[0], out[2] if ys is not None: # calculate loss on targets out = self.model(xs, ys) scores = out[1] loss = self.criterion(scores.view(-1, scores.size(-1)), ys.view(-1)) target_tokens = ys.ne(self.NULL_IDX).long().sum().data[0] self.metrics['loss'] += loss.double().data[0] self.metrics['num_tokens'] += target_tokens return predictions, text_cand_inds def vectorize(self, observations): """Convert a list of observations into input & target tensors.""" is_training = any(['labels' in obs for obs in observations]) xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text( observations, self.dict, end_idx=self.END_IDX, null_idx=self.NULL_IDX, dq=True, eval_labels=True, truncate=self.truncate) if xs is None: return None, None, None, None, None, None, None xs = torch.LongTensor(xs) if ys is not None: ys = torch.LongTensor(ys) if self.use_cuda: # copy to gpu self.xs.resize_(xs.size()) self.xs.copy_(xs) xs = Variable(self.xs) if ys is not None: self.ys.resize_(ys.size()) self.ys.copy_(ys) ys = Variable(self.ys) else: xs = Variable(xs) if ys is not None: ys = Variable(ys) # set up candidates cands = None valid_cands = None if not is_training and self.rank: # only do ranking when no targets available and ranking flag set parsed_cs = [] valid_cands = [] for i, v in enumerate(valid_inds): if 'label_candidates' in observations[v]: # each candidate tuple is a pair of the parsed version and # the original full string cs = list(observations[v]['label_candidates']) curr_dqs = [deque(maxlen=self.truncate) for _ in cs] for dq, c in zip(curr_dqs, cs): dq.extendleft(reversed(self.parse(c))) parsed_cs.append(curr_dqs) valid_cands.append((i, v, cs)) if len(parsed_cs) > 0: # TODO: store lengths of cands separately, so don't have zero # padding for varying number of cands per example # found cands, pack them into tensor max_c_len = max(max(len(c) for c in cs) for cs in parsed_cs) max_c_cnt = max(len(cs) for cs in parsed_cs) for cs in parsed_cs: for c in cs: c += [self.NULL_IDX] * (max_c_len - len(c)) cs += [[self.NULL_IDX] * max_c_len] * (max_c_cnt - len(cs)) cands = torch.LongTensor(parsed_cs) if self.use_cuda: # copy to gpu self.cands.resize_(cands.size()) self.cands.copy_(cands) cands = Variable(self.cands) else: cands = Variable(cands) return xs, ys, labels, valid_inds, cands, valid_cands, is_training def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. 
for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, labels, valid_inds, cands, valid_cands, is_training = self.vectorize(observations) if xs is None: # no valid examples, just return empty responses return batch_reply # produce predictions, train on targets if available predictions, text_cand_inds = self.predict(xs, ys, cands, valid_cands, is_training) if is_training: report_freq = 0 else: report_freq = self.report_freq PaddingUtils.map_predictions( predictions.cpu().data, valid_inds, batch_reply, observations, self.dict, self.END_IDX, report_freq=report_freq, labels=labels, answers=self.answers, ys=ys.data if ys is not None else None) if text_cand_inds is not None: text_cand_inds = text_cand_inds.cpu().data for i in range(len(valid_cands)): order = text_cand_inds[i] _, batch_idx, curr_cands = valid_cands[i] curr = batch_reply[batch_idx] curr['text_candidates'] = [curr_cands[idx] for idx in order if idx < len(curr_cands)] return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): model = {} model['model'] = self.model.state_dict() model['longest_label'] = self.model.longest_label model['optimizer'] = self.optimizer.state_dict() model['optimizer_type'] = self.opt['optimizer'] model['opt'] = self.opt with open(path, 'wb') as write: torch.save(model, write) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return opt and model states.""" states = torch.load(path, map_location=lambda cpu, _: cpu) return states['opt'], states def receive_metrics(self, metrics_dict): """Use the metrics to decide when to adjust LR schedule.""" if 'loss' in metrics_dict: self.scheduler.step(metrics_dict['loss'])
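# --- Illustrative sketch (standalone; not part of the agent above) ---
# report() divides the accumulated loss by the accumulated token count and
# exponentiates to get perplexity. The loss appears to be summed per token
# before the average is taken (predict() divides by target_tokens before the
# gradient step), so this mirrors that bookkeeping with made-up numbers.
import math

def _example_report(metrics):
    m = {}
    if metrics['num_tokens'] > 0:
        m['loss'] = metrics['loss'] / metrics['num_tokens']
        m['ppl'] = math.exp(m['loss'])
    return m

# e.g. 460.5 nats summed over 100 target tokens -> avg loss 4.605, ppl ~= 100
assert round(_example_report({'loss': 460.5, 'num_tokens': 100})['ppl']) == 100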
class IbmSeq2seqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. For more information, see IBM's repository at https://github.com/IBM/pytorch-seq2seq. """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" IbmSeq2seqAgent.dictionary_class().add_cmdline_args(argparser) agent = argparser.add_argument_group('IBM Seq2Seq Arguments') agent.add_argument( '--init-model', type=str, default=None, help='load dict/features/weights/opts from this file') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.5, help='dropout rate') agent.add_argument('-clip', '--gradient-clip', type=float, default=-1, help='gradient clipping using l2 norm') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', type='bool', default=True, help='Enable/disable attention over encoded state.') agent.add_argument('--maxlength-in', type=int, default=50, help='Maximum input token length.') agent.add_argument('--maxlength-out', type=int, default=50, help='Maximum output token length.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length. This ' 'reduces the total amount ' 'of padding in the batches.') agent.add_argument('-rnn', '--rnncell', default='gru', help='Choose between different types of RNNs.') agent.add_argument('-opt', '--optimizer', default='adam', choices=IbmSeq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. 
' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = {'loss': 0, 'num_tokens': 0} self.history = {} self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file' elif opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] else: init_model = None if init_model is not None: # load model parameters if available print('Loading existing model params from ' + init_model) new_opt, self.states = self.load(init_model) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None: if init_model is not None and os.path.isfile(init_model + '.dict'): # check first to see if a dictionary exists opt['dict_file'] = init_model + '.dict' elif opt.get('model_file'): # otherwise, set default dict-file if it is not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] encoder = EncoderRNN(len(self.dict), opt['maxlength_in'], opt['hiddensize'], dropout_p=opt['dropout'], input_dropout_p=opt['dropout'], n_layers=opt['numlayers'], rnn_cell=opt['rnncell'], bidirectional=opt['bidirectional'], variable_lengths=True) decoder = DecoderRNN( len(self.dict), opt['maxlength_out'], opt['hiddensize'] * 2 if opt['bidirectional'] else opt['hiddensize'], dropout_p=opt['dropout'], input_dropout_p=opt['dropout'], n_layers=opt['numlayers'], rnn_cell=opt['rnncell'], bidirectional=opt['bidirectional'], sos_id=self.START_IDX, eos_id=self.END_IDX, use_attention=opt['attention']) self.model = Seq2seq(encoder, decoder) if self.states: # set loaded states if applicable self.model.load_state_dict(self.states['model']) if self.use_cuda: self.model.cuda() if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt['gradient_clip'] # set up tensors once self.START = torch.LongTensor([self.START_IDX]) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) # set up criteria self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX, 
size_average=False) if self.use_cuda: # push to cuda self.START = self.START.cuda() self.xs = self.xs.cuda() self.ys = self.ys.cuda() self.criterion.cuda() # set up optimizer lr = opt['learningrate'] optim_class = IbmSeq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True self.optimizer = optim_class( [p for p in self.model.parameters() if p.requires_grad], **kwargs) if self.states: if self.states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: self.optimizer.load_state_dict(self.states['optimizer']) self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overridden key. Only override args specific to the model. """ model_args = { 'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'attention', 'maxlength_in', 'maxlength_out' } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v} ]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def v2t(self, vec): """Convert token indices to string of tokens.""" if isinstance(vec, Variable): vec = vec.data new_vec = [] for i in vec: if i == self.END_IDX: break elif i != self.START_IDX: new_vec.append(i) return self.dict.vec2txt(new_vec) def zero_grad(self): """Zero out optimizer.""" self.optimizer.zero_grad() def update_params(self): """Do one optimization step.""" if self.clip > 0: torch.nn.utils.clip_grad_norm(self.model.parameters(), self.clip) self.optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.history.clear() self.reset_metrics() def reset_metrics(self): self.metrics.clear() self.metrics['loss'] = 0 self.metrics['num_tokens'] = 0 def report(self): m = {} if self.metrics['num_tokens'] > 0: m['loss'] = self.metrics['loss'] / self.metrics['num_tokens'] m['ppl'] = math.exp(m['loss']) for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats m[k] = round_sigfigs(v, 4) return m def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['answers'] = self.answers shared['dict'] = self.dict shared['START_IDX'] = self.START_IDX shared['END_IDX'] = self.END_IDX shared['NULL_IDX'] = self.NULL_IDX if self.opt.get('numthreads', 1) > 1: shared['model'] = self.model self.model.share_memory() shared['states'] = self.states return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. 
""" # shallow copy observation (deep copy can be expensive) obs = observation.copy() batch_idx = self.opt.get('batchindex', 0) if not obs.get('preprocessed', False): obs['text2vec'] = maintain_dialog_history( self.history, obs, reply=self.answers[batch_idx], historyLength=self.truncate, useReplies=self.opt['include_labels'], dict=self.dict, useStartEndIndices=False) else: obs['text2vec'] = deque(obs['text2vec'], maxlen=self.truncate) self.observation = obs self.answers[batch_idx] = None return obs def predict(self, xs, ys=None, is_training=False): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ # import pdb; pdb.set_trace() loss_dict = None, None x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] start = Variable(self.START, requires_grad=False) starts = start.expand(len(xs), 1) if is_training: self.model.train() self.zero_grad() y_in = torch.cat([starts, ys], 1) out, hid, result = self.model(xs, x_lens, y_in, teacher_forcing_ratio=True) scores = torch.cat(out) loss = self.criterion(scores.view(-1, scores.size(-1)), ys.view(-1)) # save loss to metrics target_tokens = ys.ne(self.NULL_IDX).long().sum().data[0] self.metrics['loss'] += loss.double().data[0] self.metrics['num_tokens'] += target_tokens # average loss per token loss /= target_tokens loss.backward() self.update_params() else: self.model.eval() out, hid, result = self.model(xs, x_lens) if ys is not None: # calculate loss on targets y_in = torch.cat([starts, ys], 1) out, hid, result = self.model(xs, x_lens, y_in, teacher_forcing_ratio=False) scores = torch.cat(out) loss = self.criterion(scores.view(-1, scores.size(-1)), ys.view(-1)) target_tokens = ys.ne(self.NULL_IDX).long().sum().data[0] self.metrics['loss'] += loss.double().data[0] self.metrics['num_tokens'] += target_tokens predictions = torch.cat(result['sequence'], 1) return predictions def vectorize(self, observations): """Convert a list of observations into input & target tensors.""" is_training = any(['labels' in obs for obs in observations]) xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text( observations, self.dict, end_idx=None, null_idx=self.NULL_IDX, dq=True, eval_labels=True, truncate=self.truncate) if xs is None: return None, None, None, None, None, None, None xs = torch.LongTensor(xs) ys = torch.LongTensor(ys) if self.use_cuda: # copy to gpu self.xs.resize_(xs.size()) self.xs.copy_(xs) xs = Variable(self.xs) if ys is not None: self.ys.resize_(ys.size()) self.ys.copy_(ys) ys = Variable(self.ys) else: xs = Variable(xs) if ys is not None: ys = Variable(ys) return xs, ys, labels, valid_inds, is_training def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. 
for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, labels, valid_inds, is_training = self.vectorize(observations) if xs is None: # no valid examples, just return empty responses return batch_reply # produce predictions, train on targets if available predictions = self.predict(xs, ys, is_training) if is_training: report_freq = 0 else: report_freq = 0.01 PaddingUtils.map_predictions(predictions, valid_inds, batch_reply, observations, self.dict, self.END_IDX, report_freq=report_freq, labels=labels, answers=self.answers, ys=ys.data if ys is not None else None) return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): model = {} model['model'] = self.model.state_dict() model['optimizer'] = self.optimizer.state_dict() model['optimizer_type'] = self.opt['optimizer'] model['opt'] = self.opt with open(path, 'wb') as write: torch.save(model, write) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return opt and model states.""" with open(path, 'rb') as read: states = torch.load(read) return states['opt'], states def receive_metrics(self, metrics_dict): """Use the metrics to decide when to adjust LR schedule.""" if 'loss' in metrics_dict: self.scheduler.step(metrics_dict['loss'])
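# --- Illustrative sketch (hypothetical helper; not part of the agent) ---
# batch_act above only batches observations that carry a 'text' field, and
# valid_inds records where each one came from so that predictions can be
# mapped back onto the right slots of batch_reply. A minimal stand-in for
# that filtering step, using the example from the comment above:
def _example_valid_inds(observations):
    # keep the original index of every observation that has text to process
    return [i for i, obs in enumerate(observations) if 'text' in obs]

assert _example_valid_inds([{}, {'text': 'hello'}, {}, {}]) == [1]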
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init self.metrics = { 'loss': 0, 'num_tokens': 0, 'lmloss': 0, 'lm_num_tokens': 0 } self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() self.batchsize = opt.get('batchsize', 1) self.use_person_tokens = opt.get('person_tokens', True) self.sampling_mode = opt.get('sampling_mode', False) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] self.metrics = shared['metrics'] # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] if self.use_person_tokens: # add person1 and person2 tokens self.dict.add_to_dict(self.dict.tokenize("PERSON1")) self.dict.add_to_dict(self.dict.tokenize("PERSON2")) else: # this is not a shared instance of this class, so do full init if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) init_model = None # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file', this would override init_model if opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] # for backwards compatibility: will only be called for older models # for which .opt file does not exist if (init_model is not None and not os.path.isfile(init_model + '.opt')): new_opt = self.load_opt(init_model) # load model parameters if available print('[ Setting opt from {} ]'.format(init_model)) # since .opt file does not exist, save one for future use print("Saving opt file at:", init_model + ".opt") with open(init_model + ".opt", 'wb') as handle: pickle.dump(new_opt, handle, protocol=pickle.HIGHEST_PROTOCOL) opt = self.override_opt(new_opt) if (init_model is not None and (os.path.isfile(init_model + '.dict') or opt['dict_file'] is None)): opt['dict_file'] = init_model + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'LanguageModel' # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] if self.use_person_tokens: # add person1 and person2 tokens self.dict.add_to_dict(self.dict.tokenize("PERSON1")) self.dict.add_to_dict(self.dict.tokenize("PERSON2")) # set model self.model = RNNModel(opt, len(self.dict)) if init_model is not None: self.load(init_model) if self.use_cuda: self.model.cuda() self.next_observe = [] self.next_batch = [] self.is_training = True if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt.get('gradient_clip', 0.25) # set up criteria self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: # push to cuda self.criterion.cuda() # init hidden state self.hidden = self.model.init_hidden(self.batchsize) # init tensor of end tokens self.ends = torch.LongTensor( [self.END_IDX for _ in range(self.batchsize)]) if self.use_cuda: self.ends = self.ends.cuda() # set up model and learning rate scheduler parameters self.lr = opt['learningrate'] self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr) self.best_val_loss = self.states.get('best_val_loss', None) self.lr_factor = opt['lr_factor'] if self.lr_factor < 1.0: self.lr_patience = opt['lr_patience'] self.lr_min = opt['lr_minimum'] self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, factor=self.lr_factor, verbose=True, patience=self.lr_patience, min_lr=self.lr_min) # initial step for scheduler if self.best_val_loss is initialized if self.best_val_loss is not None: self.scheduler.step(self.best_val_loss) else: self.scheduler = None self.reset()
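# --- Illustrative sketch (standalone, with assumed toy values) ---
# The scheduler wired up above multiplies the learning rate by `factor`
# once the validation loss passed to step() stops improving for `patience`
# consecutive checks, bottoming out at min_lr. A tiny demonstration:
import torch

_params = [torch.nn.Parameter(torch.zeros(1))]
_optimizer = torch.optim.SGD(_params, lr=1.0)
_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    _optimizer, factor=0.5, patience=1, min_lr=0.1)
for _val_loss in [2.0, 2.0, 2.0, 2.0]:  # a plateaued validation loss
    _scheduler.step(_val_loss)
assert _optimizer.param_groups[0]['lr'] < 1.0  # lr was reduced on the plateau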
class Seq2seqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. This model supports encoding the input and decoding the output via one of several flavors of RNN. It then uses a linear layer (whose weights can be shared with the embedding layer) to convert RNN output states into output tokens. This model currently uses greedy decoding, selecting the highest probability token at each time step. For more information, see Sequence to Sequence Learning with Neural Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_. """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } ENC_OPTS = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM} @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', choices=['none', 'concat', 'general', 'dot', 'local'], help='Choices: none, concat, general, dot, local. ' 'If set to local, also set attention-length. ' 'For more details see: ' 'https://arxiv.org/pdf/1508.04025.pdf') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='rank candidates if available. this is done by' ' computing the mean score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length and to ' 'be similar in length to one another by throwing ' 'away extra tokens. This reduces the total amount ' 'of padding in the batches.') agent.add_argument('-enc', '--encoder', default='gru', choices=Seq2seqAgent.ENC_OPTS.keys(), help='Choose between different encoder modules.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'] + list(Seq2seqAgent.ENC_OPTS.keys()), help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. ' 'Note that shared disables some encoder ' 'options--in particular, bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='all', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. 
' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='adam', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-emb', '--embedding-type', default='random', choices=['random', 'glove', 'glove-fixed'], help='Choose between different strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove. ' 'Preinitialized embeddings can also be fixed ' 'so they are not updated during training.') agent.add_argument('-lm', '--language-model', default='none', choices=['none', 'only', 'both'], help='Enables language modeling training on the ' 'concatenated input and label data.') def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) # all instances need the truncate param self.truncate = opt['truncate'] if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) states = None if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, states = self.load(opt['model_file']) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START = self.dict.start_token self.START_IDX = self.dict[self.START] self.START_TENSOR = torch.LongTensor([self.START_IDX]) # we use END markers to end our output self.END = self.dict.end_token self.END_IDX = self.dict[self.END] self.END_TENSOR = torch.LongTensor([self.END_IDX]) # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0] # store important params in self hsz = opt['hiddensize'] emb = opt['embeddingsize'] self.hidden_size = hsz self.emb_size = emb self.num_layers = opt['numlayers'] self.learning_rate = opt['learningrate'] self.rank = opt['rank_candidates'] self.longest_label = 1 self.attention = opt['attention'] self.bidirectional = opt['bidirectional'] self.num_dirs = 2 if self.bidirectional else 1 self.dropout = opt['dropout'] self.lm = opt['language_model'] # set up tensors once self.zeros = torch.zeros(self.num_layers * self.num_dirs, 1, hsz) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) if self.rank: self.cands = torch.LongTensor(1, 1, 1) self.cand_scores = torch.FloatTensor(1) self.cand_lengths = torch.LongTensor(1) # set up modules self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX) # lookup table stores word embeddings 
self.enc_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if opt['lookuptable'] in ['enc_dec', 'all']: # share this with the encoder self.dec_lt = self.enc_lt else: self.dec_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if not states and opt['embedding_type'].startswith('glove'): # set up pre-initialized vectors from GloVe try: import torchtext.vocab as vocab except ImportError: raise ImportError('Please install torchtext from ' 'github.com/pytorch/text.') Glove = vocab.GloVe(name='840B', dim=300) # do better than uniform random proj = torch.FloatTensor(emb, 300).uniform_(-0.057735, 0.057735) if emb != 300 else None for w in self.dict.freq: if w in Glove.stoi: vec = Glove.vectors[Glove.stoi[w]] if emb != 300: vec = torch.mm(proj, vec.unsqueeze(1)).squeeze() self.enc_lt.weight.data[self.dict[w]] = vec self.dec_lt.weight.data[self.dict[w]] = vec # encoder captures the input text enc_class = Seq2seqAgent.ENC_OPTS[opt['encoder']] # decoder produces our output states if opt['decoder'] in ['same', 'shared']: # use same class as encoder self.decoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) else: # use the specified class self.decoder = Seq2seqAgent.ENC_OPTS[opt['decoder']](emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) if opt['decoder'] == 'shared': # shared weights: use the decoder to encode if self.bidirectional: raise RuntimeError('Cannot share enc/dec and do ' 'bidirectional encoding.') self.encoder = self.decoder else: self.encoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True, bidirectional=self.bidirectional) # linear layers help us produce outputs from final decoder state hszXdirs = hsz * self.num_dirs # hidden to embedding self.h2e = nn.Linear(hsz, emb) # embedding to output. 
note that this CAN predict NULL self.e2o = nn.Linear(emb, len(self.dict)) if opt['lookuptable'] in ['dec_out', 'all']: # share these weights with the decoder lookup table self.e2o.weight = self.dec_lt.weight if self.attention != 'none': # we'll need this for all attention types self.attn_combine = nn.Linear(hszXdirs + emb, emb) if self.attention == 'local': # local attention over fixed set of output states if opt['attention_length'] < 0: raise RuntimeError('Set attention length to > 0.') self.max_length = opt['attention_length'] # combines input and previous hidden output layer self.attn = nn.Linear(hsz + emb, self.max_length) # combines attention weights with encoder outputs elif self.attention == 'concat': self.attn = nn.Linear(hsz + hszXdirs, hsz) self.attn_v = nn.Linear(hsz, 1) elif self.attention == 'general': # equivalent to dot if attn is identity self.attn = nn.Linear(hsz, hszXdirs) # set up optims for each module lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True self.optims = { 'decoder': optim_class(self.decoder.parameters(), **kwargs), 'h2e': optim_class(self.h2e.parameters(), **kwargs), } if opt['decoder'] != 'shared': # update the encoder as well self.optims['encoder'] = optim_class( self.encoder.parameters(), **kwargs) if not opt['embedding_type'].endswith('-fixed'): # update embeddings during training self.optims['enc_lt'] = optim_class( self.enc_lt.parameters(), **kwargs) self.optims['e2o'] = optim_class( self.e2o.parameters(), **kwargs) if opt['lookuptable'] not in ['enc_dec', 'all']: # only add dec if it's separate from enc self.optims['dec_lt'] = optim_class( self.dec_lt.parameters(), **kwargs) elif opt['lookuptable'] not in ['dec_out', 'all']: # embeddings are fixed, so only update e2o if it's not shared self.optims['e2o'] = optim_class( self.e2o.parameters(), **kwargs) # add attention parameters into optims if available for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): self.optims[attn_name] = optim_class( getattr(self, attn_name).parameters(), **kwargs) if states is not None: # set loaded states if applicable self.set_states(states) if self.use_cuda: self.cuda() self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overridden key. Only override args specific to the model. 
""" model_args = {'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'encoder', 'decoder', 'lookuptable', 'attention', 'attention_length'} for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v}]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def v2t(self, vec): """Convert token indices to string of tokens.""" if type(vec) == Variable: vec = vec.data new_vec = [] for i in vec: if i == self.END_IDX: break elif i != self.START_IDX: new_vec.append(i) return self.dict.vec2txt(new_vec) def cuda(self): """Push parameters to the GPU.""" self.START_TENSOR = self.START_TENSOR.cuda(async=True) self.END_TENSOR = self.END_TENSOR.cuda(async=True) self.zeros = self.zeros.cuda(async=True) self.xs = self.xs.cuda(async=True) self.ys = self.ys.cuda(async=True) if self.rank: self.cands = self.cands.cuda(async=True) self.cand_scores = self.cand_scores.cuda(async=True) self.cand_lengths = self.cand_lengths.cuda(async=True) self.criterion.cuda() self.enc_lt.cuda() self.dec_lt.cuda() self.encoder.cuda() self.decoder.cuda() self.h2e.cuda() self.e2o.cuda() if self.attention != 'none': for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): getattr(self, attn_name).cuda() def hidden_to_idx(self, hidden, is_training=False): """Convert hidden state vectors into indices into the dictionary.""" # dropout at each step e = F.dropout(self.h2e(hidden), p=self.dropout, training=is_training) scores = F.dropout(self.e2o(e), p=self.dropout, training=is_training) # skip zero (null_idx) when selecting a score _max_score, idx = scores.narrow(2, 1, scores.size(2) - 1).max(2) # add one back to index since we removed first option return idx.add_(1), scores def zero_grad(self): """Zero out optimizers.""" for optimizer in self.optims.values(): optimizer.zero_grad() def update_params(self): """Do one optimization step.""" for optimizer in self.optims.values(): optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.episode_done = True def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['answers'] = self.answers shared['dict'] = self.dict shared['START_IDX'] = self.START_IDX shared['END_IDX'] = self.END_IDX return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. 
""" # shallow copy observation (deep copy can be expensive) observation = observation.copy() if 'text' in observation: if observation['text'] == '': observation.pop('text') else: # put START and END around text parsed_x = [self.START_IDX] parsed_x.extend(self.parse(observation['text'])) parsed_x.append(self.END_IDX) if self.truncate > 0: parsed_x = parsed_x[-self.truncate:] observation['text'] = parsed_x if not self.episode_done: # remember past dialog prev_dialog = self.observation['text'] # get last y batch_idx = self.opt.get('batchindex', 0) if self.answers[batch_idx] is not None: # use our last answer, which is the label during training lastY = self.answers[batch_idx] prev_dialog.append(self.START_IDX) prev_dialog.extend(lastY) prev_dialog.append(self.END_IDX) self.answers[batch_idx] = None # forget last y prev_dialog.extend(parsed_x) if self.truncate > 0: prev_dialog = prev_dialog[-self.truncate:] observation['text'] = prev_dialog self.observation = observation self.episode_done = observation['episode_done'] return observation def _encode(self, xs, is_training=False): """Call encoder and return output and hidden states.""" self.lastxs = xs batchsize = len(xs) # first encode context xes = F.dropout(self.enc_lt(xs), p=self.dropout, training=is_training) # project from emb_size to hidden_size dimensions x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] xes_packed = pack_padded_sequence(xes, x_lens, batch_first=True) if self.zeros.size(1) != batchsize: self.zeros.resize_(self.num_layers * self.num_dirs, batchsize, self.hidden_size).fill_(0) h0 = Variable(self.zeros, requires_grad=False) if type(self.encoder) == nn.LSTM: encoder_output_packed, hidden = self.encoder(xes_packed, (h0, h0)) # take elementwise max between forward and backward hidden states hidden = (hidden[0].view(-1, self.num_dirs, hidden[0].size(1), hidden[0].size(2)).max(1)[0], hidden[1].view(-1, self.num_dirs, hidden[1].size(1), hidden[1].size(2)).max(1)[0]) if type(self.decoder) != nn.LSTM: hidden = hidden[0] else: encoder_output_packed, hidden = self.encoder(xes_packed, h0) # take elementwise max between forward and backward hidden states hidden = hidden.view(-1, self.num_dirs, hidden.size(1), hidden.size(2)).max(1)[0] if type(self.decoder) == nn.LSTM: hidden = (hidden, h0.narrow(0, 0, 2)) encoder_output, _ = pad_packed_sequence(encoder_output_packed, batch_first=True) encoder_output = encoder_output if self.attention == 'local': # if using local attention, narrow encoder_output to max_length if encoder_output.size(1) > self.max_length: offset = encoder_output.size(1) - self.max_length encoder_output = encoder_output.narrow( 1, offset, self.max_length) return encoder_output, hidden def _apply_attention(self, xes, encoder_output, hidden, attn_mask=None): """Apply attention to encoder hidden layer.""" last_hidden = hidden[-1] # select hidden from last RNN layer if self.attention == 'concat': hidden_expand = last_hidden.unsqueeze(1).expand( last_hidden.size(0), encoder_output.size(1), last_hidden.size(1)) attn_w_premask = self.attn_v(F.tanh(self.attn( torch.cat((encoder_output, hidden_expand), 2)))).squeeze(2) attn_weights = F.softmax(attn_w_premask * attn_mask - (1 - attn_mask) * 1e20) elif self.attention == 'dot': hidden_expand = last_hidden.unsqueeze(1) attn_w_premask = torch.bmm(hidden_expand, encoder_output.transpose(1, 2) ).squeeze(1) attn_weights = F.softmax(attn_w_premask * attn_mask - (1 - attn_mask) * 1e20) elif self.attention == 'general': hidden_expand = last_hidden.unsqueeze(1) attn_w_premask = 
torch.bmm(self.attn(hidden_expand), encoder_output.transpose(1, 2) ).squeeze(1) attn_weights = F.softmax(attn_w_premask * attn_mask - (1 - attn_mask) * 1e20) elif self.attention == 'local': attn_weights = F.softmax(self.attn( torch.cat((xes.squeeze(1), last_hidden), 1))) if attn_weights.size(1) > encoder_output.size(1): attn_weights = attn_weights.narrow( 1, 0, encoder_output.size(1)) attn_applied = torch.bmm( attn_weights.unsqueeze(1), encoder_output).squeeze(1) output = torch.cat((xes.squeeze(1), attn_applied), 1) output = self.attn_combine(output).unsqueeze(1) output = F.tanh(output) self.attn_weights = attn_weights return output def _decode_and_train(self, batchsize, xes, ys, encoder_output, hidden, attn_mask, lm=False): """Update the model based on the labels.""" self.zero_grad() loss = 0 predictions = [] if self.attention != 'none': # using attention, produce one token at a time for i in range(ys.size(1)): h_att = hidden[0] if type(self.decoder) == nn.LSTM else hidden output = self._apply_attention(xes, encoder_output, h_att, attn_mask) output, hidden = self.decoder(output, hidden) preds, scores = self.hidden_to_idx(output, is_training=True) y = ys.select(1, i) loss += self.criterion(scores.squeeze(1), y) # use the true token as the next input instead of predicted xes = self.dec_lt(y).unsqueeze(1) xes = F.dropout(xes, p=self.dropout, training=True) predictions.append(preds) else: # force the entire sequence at once by feeding in START + y[:-1] y_in = ys.narrow(1, 0, ys.size(1) - 1) xes = torch.cat([xes, self.dec_lt(y_in)], 1) output, hidden = self.decoder(xes, hidden) preds, scores = self.hidden_to_idx(output, is_training=True) for i in range(ys.size(1)): # sum loss per-token score = scores.select(1, i) y = ys.select(1, i) loss += self.criterion(score, y) predictions.append(preds) loss.backward() self.update_params() predictions = torch.cat(predictions, 1) return predictions, {('lm' if lm else '') + 'loss': loss.mul_(batchsize).data} def _decode_only(self, batchsize, xes, ys, encoder_output, hidden, attn_mask): """Just produce a prediction without training the model.""" done = [False for _ in range(batchsize)] total_done = 0 max_len = 0 predictions = [] # generate a response from scratch while total_done < batchsize and max_len < self.longest_label: # keep producing tokens until we hit END or max length for each # example in the batch if self.attention == 'none': output = xes else: h_att = hidden[0] if type(self.decoder) == nn.LSTM else hidden output = self._apply_attention(xes, encoder_output, h_att, attn_mask) output, hidden = self.decoder(output, hidden) preds, _scores = self.hidden_to_idx(output, is_training=False) predictions.append(preds) xes = self.dec_lt(preds) max_len += 1 for b in range(batchsize): if not done[b]: # only add more tokens for examples that aren't done yet if preds.data[b][0] == self.END_IDX: # if we produced END, we're done done[b] = True total_done += 1 predictions = torch.cat(predictions, 1) if random.random() < 0.2: # sometimes output a prediction for debugging print('\nprediction:', self.v2t(predictions.data[0])) return predictions def _score_candidates(self, cands, cand_inds, start, encoder_output, hidden, attn_mask): """Rank candidates by their likelihood according to the decoder.""" if type(self.decoder) == nn.LSTM: hidden, cell = hidden # 
score each candidate separately # cands are exs_with_cands x cands_per_ex x words_per_cand # cview is total_cands x words_per_cand cview = cands.view(-1, cands.size(2)) c_xes = start.expand(cview.size(0), start.size(0), start.size(1)) if len(cand_inds) != hidden.size(1): # only use hidden state from inputs with associated candidates cand_indices = torch.LongTensor([i for i, _, _ in cand_inds]) if self.use_cuda: cand_indices = cand_indices.cuda() cand_indices = Variable(cand_indices) hidden = hidden.index_select(1, cand_indices) sz = hidden.size() cands_hn = ( hidden.view(sz[0], sz[1], 1, sz[2]) .expand(sz[0], sz[1], cands.size(1), sz[2]) .contiguous() .view(sz[0], -1, sz[2]) ) if type(self.decoder) == nn.LSTM: if len(cand_inds) != cell.size(1): # only use cell state from inputs with associated candidates cell = cell.index_select(1, cand_indices) cands_hn = (cands_hn, cell.view(sz[0], sz[1], 1, sz[2]) .expand(sz[0], sz[1], cands.size(1), sz[2]) .contiguous() .view(sz[0], -1, sz[2])) cand_scores = Variable( self.cand_scores.resize_(cview.size(0)).fill_(0)) cand_lengths = Variable( self.cand_lengths.resize_(cview.size(0)).fill_(0)) if self.attention != 'none': # using attention # select only encoder output matching xs we want if len(cand_inds) != len(encoder_output): indices = torch.LongTensor([i[0] for i in cand_inds]) if self.use_cuda: indices = indices.cuda() indices = Variable(indices) encoder_output = encoder_output.index_select(0, indices) attn_mask = attn_mask.index_select(0, indices) sz = encoder_output.size() cands_encoder_output = ( encoder_output.contiguous() .view(sz[0], 1, sz[1], sz[2]) .expand(sz[0], cands.size(1), sz[1], sz[2]) .contiguous() .view(-1, sz[1], sz[2]) ) msz = attn_mask.size() cands_attn_mask = ( attn_mask.contiguous() .view(msz[0], 1, msz[1]) .expand(msz[0], cands.size(1), msz[1]) .contiguous() .view(-1, msz[1]) ) for i in range(cview.size(1)): # process one token at a time h_att = cands_hn[0] if type(self.decoder) == nn.LSTM else cands_hn output = self._apply_attention(c_xes, cands_encoder_output, h_att, cands_attn_mask) output, cands_hn = self.decoder(output, cands_hn) _preds, scores = self.hidden_to_idx(output, is_training=False) cs = cview.select(1, i) non_nulls = cs.ne(self.NULL_IDX) cand_lengths += non_nulls.long() score_per_cand = torch.gather(scores.squeeze(), 1, cs.unsqueeze(1)) cand_scores += score_per_cand.squeeze() * non_nulls.float() c_xes = self.dec_lt(cs).unsqueeze(1) else: # process entire sequence at once if cview.size(1) > 1: # feed in START + cands[:-2] cands_in = cview.narrow(1, 0, cview.size(1) - 1) c_xes = torch.cat([c_xes, self.dec_lt(cands_in)], 1) output, cands_hn = self.decoder(c_xes, cands_hn) _preds, scores = self.hidden_to_idx(output, is_training=False) for i in range(cview.size(1)): # calculate score at each token cs = cview.select(1, i) non_nulls = cs.ne(self.NULL_IDX) cand_lengths += non_nulls.long() score_per_cand = torch.gather(scores.select(1, i), 1, cs.unsqueeze(1)) cand_scores += score_per_cand.squeeze() * non_nulls.float() # set empty scores to -1, so when divided by 0 they become -inf cand_scores -= cand_lengths.eq(0).float() # average the scores per token cand_scores /= cand_lengths.float() cand_scores = cand_scores.view(cands.size(0), cands.size(1)) srtd_scores, text_cand_inds = cand_scores.sort(1, True) return text_cand_inds def predict(self, xs, ys=None, cands=None, valid_cands=None, lm=False): """Produce a prediction from our model. 
Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ batchsize = len(xs) text_cand_inds = None is_training = ys is not None self.encoder.train(mode=is_training) self.decoder.train(mode=is_training) encoder_output, hidden = self._encode(xs, is_training) # next we use START as an input to kick off our decoder if not lm: x = Variable(self.START_TENSOR, requires_grad=False) xe = self.dec_lt(x) xe = F.dropout(xe, p=self.dropout, training=is_training) xes = xe.expand(batchsize, 1, xe.size(1)) else: # during language_model mode, just start with zeros xes = Variable( self.zeros[0].narrow(1, 0, self.emb_size).unsqueeze(1), requires_grad=False ) if self.attention == 'none': attn_mask = None else: attn_mask = xs.ne(0).float() loss = None if is_training: predictions, loss = self._decode_and_train(batchsize, xes, ys, encoder_output, hidden, attn_mask, lm=lm) else: if cands is not None: text_cand_inds = self._score_candidates(cands, valid_cands, xe, encoder_output, hidden, attn_mask) predictions = self._decode_only(batchsize, xes, ys, encoder_output, hidden, attn_mask) return predictions, text_cand_inds, loss def batchify(self, observations, lm=False): """Convert a list of observations into input & target tensors.""" def valid(obs): # check if this is an example our model should actually process return 'text' in obs # valid examples and their indices try: valid_inds, exs = zip(*[(i, ex) for i, ex in enumerate(observations) if valid(ex)]) except ValueError: # zero examples to process in this batch, so zip failed to unpack return None, None, None, None, None, None # set up the input tensors batchsize = len(exs) # `x` text is already tokenized and truncated parsed_x = [ex['text'] for ex in exs] x_lens = [len(x) for x in parsed_x] ind_sorted = sorted(range(len(x_lens)), key=lambda k: -x_lens[k]) exs = [exs[k] for k in ind_sorted] valid_inds = [valid_inds[k] for k in ind_sorted] parsed_x = [parsed_x[k] for k in ind_sorted] if lm: self.xs.resize_(batchsize, 1) self.xs.fill_(self.START_IDX) xs = Variable(self.xs) else: max_x_len = max([len(x) for x in parsed_x]) xs = torch.LongTensor(batchsize, max_x_len).fill_(self.NULL_IDX) # right-padded with zeros for i, x in enumerate(parsed_x): for j, idx in enumerate(x): xs[i][j] = idx if self.use_cuda: # copy to gpu self.xs.resize_(xs.size()) self.xs.copy_(xs, async=True) xs = Variable(self.xs) else: xs = Variable(xs) # set up the target tensors ys = None labels = None if any(['labels' in ex for ex in exs]): # randomly select one of the labels to update on, if multiple # append END to each label labels = [random.choice(ex.get('labels', [''])) for ex in exs] parsed_y = [self.parse(y + ' ' + self.END) for y in labels] if lm: parsed_y = [parsed_x[i] + parsed_y[i] for i in range(batchsize)] max_y_len = max(len(y) for y in parsed_y) if self.truncate > 0 and max_y_len > self.truncate: parsed_y = [y[:self.truncate] for y in parsed_y] max_y_len = self.truncate ys = torch.LongTensor(batchsize, max_y_len).fill_(self.NULL_IDX) for i, y in enumerate(parsed_y): for j, idx in enumerate(y): ys[i][j] = idx if self.use_cuda: # copy to gpu self.ys.resize_(ys.size()) self.ys.copy_(ys, async=True) ys = Variable(self.ys) else: ys = Variable(ys) # set up candidates cands = None valid_cands = None if ys is None and self.rank: # only do ranking when no targets available and ranking flag set parsed_cs = [] valid_cands = [] for i, v in enumerate(valid_inds): if 'label_candidates' in observations[v]: # each candidate tuple 
is a pair of the parsed version and # the original full string cs = list(observations[v]['label_candidates']) parsed_cs.append([self.parse(c) for c in cs]) valid_cands.append((i, v, cs)) if len(parsed_cs) > 0: # TODO: store lengths of cands separately, so don't have zero # padding for varying number of cands per example # found cands, pack them into tensor max_c_len = max(max(len(c) for c in cs) for cs in parsed_cs) max_c_cnt = max(len(cs) for cs in parsed_cs) cands = torch.LongTensor(len(parsed_cs), max_c_cnt, max_c_len).fill_(self.NULL_IDX) for i, cs in enumerate(parsed_cs): for j, c in enumerate(cs): for k, idx in enumerate(c): cands[i][j][k] = idx if self.use_cuda: # copy to gpu self.cands.resize_(cands.size()) self.cands.copy_(cands, async=True) cands = Variable(self.cands) else: cands = Variable(cands) return xs, ys, labels, valid_inds, cands, valid_cands def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, labels, valid_inds, cands, valid_cands = self.batchify(observations) if ys is not None: # keep track of longest label we've ever seen # we'll never produce longer ones than that during prediction self.longest_label = max(self.longest_label, ys.size(1)) if xs is None: # no valid examples, just return empty responses return batch_reply if self.lm != 'none' and ys is not None: # train on lm task: given [START], predict [x y] # (regular task is given [x START] produce [y]) xs, ys, _, _, _, _ = self.batchify(observations, lm=True) _, _, loss = self.predict(xs, ys, lm=True) if loss is not None: batch_reply[0]['metrics'] = loss if self.lm != 'only' or ys is None: # produce predictions, train on targets if available predictions, text_cand_inds, loss = self.predict(xs, ys, cands, valid_cands) if loss is not None: if 'metrics' in batch_reply[0]: for k, v in loss.items(): batch_reply[0]['metrics'][k] = v else: batch_reply[0]['metrics'] = loss predictions = predictions.cpu() for i in range(len(predictions)): # map the predictions back to non-empty examples in the batch # we join with spaces since we produce tokens one at a time curr = batch_reply[valid_inds[i]] output_tokens = [] for c in predictions.data[i]: if c == self.END_IDX: break else: output_tokens.append(c) curr_pred = self.v2t(output_tokens) curr['text'] = curr_pred if labels is not None: y = [] for c in ys.data[i]: if c == self.END_IDX: break else: y.append(c) self.answers[valid_inds[i]] = y else: self.answers[valid_inds[i]] = output_tokens if self.NULL_IDX in self.answers[valid_inds[i]]: raise RuntimeError("This shouldn't happen, but it might.") if text_cand_inds is not None: text_cand_inds = text_cand_inds.cpu().data for i in range(len(valid_cands)): order = text_cand_inds[i] _, batch_idx, curr_cands = valid_cands[i] curr = batch_reply[batch_idx] curr['text_candidates'] = [curr_cands[idx] for idx in order if idx < len(curr_cands)] return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'optims'): model = {} model['enc_lt'] = 
self.enc_lt.state_dict() if self.opt['lookuptable'] not in ['enc_dec', 'all']: # dec_lt is not shared with enc_lt, so save it model['dec_lt'] = self.dec_lt.state_dict() if self.opt['decoder'] != 'shared': model['encoder'] = self.encoder.state_dict() model['decoder'] = self.decoder.state_dict() model['h2e'] = self.h2e.state_dict() model['e2o'] = self.e2o.state_dict() model['optims'] = {k: v.state_dict() for k, v in self.optims.items()} model['longest_label'] = self.longest_label model['opt'] = self.opt for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): model[attn_name] = getattr(self, attn_name).state_dict() with open(path, 'wb') as write: torch.save(model, write) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return opt and model states.""" with open(path, 'rb') as read: model = torch.load(read) return model['opt'], model def set_states(self, states): """Set the state dicts of the modules from saved states.""" self.enc_lt.load_state_dict(states['enc_lt']) if self.opt['lookuptable'] not in ['enc_dec', 'all']: # dec_lt is not shared with enc_lt, so load it self.dec_lt.load_state_dict(states['dec_lt']) if self.opt['decoder'] != 'shared': self.encoder.load_state_dict(states['encoder']) self.decoder.load_state_dict(states['decoder']) self.h2e.load_state_dict(states['h2e']) self.e2o.load_state_dict(states['e2o']) for attn_name in ['attn', 'attn_v', 'attn_combine']: if attn_name in states: getattr(self, attn_name).load_state_dict(states[attn_name]) for k, optimizer in self.optims.items(): if k in states['optims']: optimizer.load_state_dict(states['optims'][k]) else: print('WARNING: loaded other optims, but none found for ' + k + '. Using default initialization instead.') self.longest_label = states['longest_label']
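# --- Illustrative sketch (hypothetical stand-in module; toy checkpoint) ---
# save()/load()/set_states() above follow one pattern: collect each
# submodule's state_dict plus the opt dict into a single dictionary,
# serialize it with torch.save, then on load feed the stored dicts back
# through load_state_dict. A self-contained round trip of that pattern:
import tempfile

import torch
import torch.nn as nn

_layer = nn.Linear(4, 4)  # stands in for a submodule such as h2e
_ckpt = {'h2e': _layer.state_dict(), 'opt': {'hiddensize': 4}}
with tempfile.NamedTemporaryFile(suffix='.pt') as _f:
    torch.save(_ckpt, _f.name)
    _loaded = torch.load(_f.name)
_layer.load_state_dict(_loaded['h2e'])   # analogous to set_states()
assert _loaded['opt']['hiddensize'] == 4  # opt travels with the weights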
def test_basic_parse(self): """Check that the dictionary is correctly adding and parsing short sentence. """ from parlai.core.dict import DictionaryAgent from parlai.core.params import ParlaiParser argparser = ParlaiParser() DictionaryAgent.add_cmdline_args(argparser) opt = argparser.parse_args(print_args=False) dictionary = DictionaryAgent(opt) num_builtin = len(dictionary) dictionary.observe({'text': 'hello world'}) dictionary.act() assert len(dictionary) - num_builtin == 2 vec = dictionary.parse('hello world') assert len(vec) == 2 assert vec[0] == num_builtin assert vec[1] == num_builtin + 1 vec = dictionary.parse('hello world', vec_type=list) assert len(vec) == 2 assert vec[0] == num_builtin assert vec[1] == num_builtin + 1 vec = dictionary.parse('hello world', vec_type=tuple) assert len(vec) == 2 assert vec[0] == num_builtin assert vec[1] == num_builtin + 1
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() self.batchsize = opt.get('batchsize', 1) if shared: # set up shared properties self.dict = shared['dict'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] else: # this is not a shared instance of this class, so do full init if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, self.states = self.load(opt['model_file']) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'LanguageModel' # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] # set model self.model = RNNModel(opt, len(self.dict)) if self.states: # set loaded states if applicable self.model.load_state_dict(self.states['model']) if self.use_cuda: self.model.cuda() self.next_observe = [] self.next_batch = [] self.is_training = True if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt.get('gradient_clip', 0.25) # set up criteria self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX) if self.use_cuda: # push to cuda self.criterion.cuda() # set up criterion for eval: we do not want to average over size self.eval_criterion = nn.CrossEntropyLoss( ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: # push to cuda self.eval_criterion.cuda() # init hidden state self.hidden = self.model.init_hidden(self.batchsize) # init tensor of end tokens self.ends = torch.LongTensor( [self.END_IDX for _ in range(self.batchsize)]) if self.use_cuda: self.ends = self.ends.cuda() # set up optimizer self.lr = opt['learningrate'] self.best_val_loss = None self.reset()
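# --- Illustrative sketch (toy tensors, made up for illustration) ---
# Two criteria are built above because training wants a per-batch average
# while evaluation accumulates raw sums across batches (size_average=False;
# spelled reduction='sum' in current PyTorch) and divides by the true token
# count at reporting time. The two differ only by that normalization:
import torch
import torch.nn as nn

_logits = torch.randn(6, 10)            # 6 target tokens, vocab of 10
_targets = torch.randint(0, 10, (6,))
_avg = nn.CrossEntropyLoss()(_logits, _targets)
_summed = nn.CrossEntropyLoss(reduction='sum')(_logits, _targets)
assert torch.allclose(_summed / 6, _avg)  # sum / token count == average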
class Seq2seqAgent(Agent):
    """Agent which takes an input sequence and produces an output sequence.

    For more information, see Sequence to Sequence Learning with Neural
    Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_.
    """

    OPTIM_OPTS = {
        'adadelta': optim.Adadelta,
        'adagrad': optim.Adagrad,
        'adam': optim.Adam,
        'adamax': optim.Adamax,
        'asgd': optim.ASGD,
        'lbfgs': optim.LBFGS,
        'rmsprop': optim.RMSprop,
        'rprop': optim.Rprop,
        'sgd': optim.SGD,
    }

    ENC_OPTS = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM}

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        DictionaryAgent.add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Seq2Seq Arguments')
        agent.add_argument('-hs', '--hiddensize', type=int, default=128,
                           help='size of the hidden layers')
        agent.add_argument('-emb', '--embeddingsize', type=int, default=128,
                           help='size of the token embeddings')
        agent.add_argument('-nl', '--numlayers', type=int, default=2,
                           help='number of hidden layers')
        agent.add_argument('-lr', '--learningrate', type=float, default=0.5,
                           help='learning rate')
        agent.add_argument('-dr', '--dropout', type=float, default=0.1,
                           help='dropout rate')
        agent.add_argument('-att', '--attention', type=int, default=0,
                           help='if greater than 0, use attention of specified'
                                ' length while decoding')
        agent.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
        agent.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
        agent.add_argument('-rc', '--rank-candidates', type='bool',
                           default=False,
                           help='rank candidates if available. this is done by'
                                ' computing the mean score per token for each '
                                'candidate and selecting the highest scoring.')
        agent.add_argument('-tr', '--truncate', type='bool', default=True,
                           help='truncate input & output lengths to speed up '
                                'training (may reduce accuracy). This fixes '
                                'all input and output to have a maximum '
                                'length and to be similar in length to one '
                                'another by throwing away extra tokens. This '
                                'reduces the total amount of padding in the '
                                'batches.')
        agent.add_argument('-enc', '--encoder', default='gru',
                           choices=Seq2seqAgent.ENC_OPTS.keys(),
                           help='Choose between different encoder modules.')
        agent.add_argument('-dec', '--decoder', default='same',
                           choices=['same', 'shared'] +
                                   list(Seq2seqAgent.ENC_OPTS.keys()),
                           help='Choose between different decoder modules. '
                                'Default "same" uses same class as encoder, '
                                'while "shared" also uses the same weights.')
        agent.add_argument('-opt', '--optimizer', default='sgd',
                           choices=Seq2seqAgent.OPTIM_OPTS.keys(),
                           help='Choose between pytorch optimizers. '
                                'Any member of torch.optim is valid and will '
                                'be used with default params except learning '
                                'rate (as specified by -lr).')

    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        if not shared:
            # this is not a shared instance of this class, so do full
            # initialization. if shared is set, only set up shared members.

            # check for cuda
            self.use_cuda = (not opt.get('no_cuda') and
                             torch.cuda.is_available())
            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, self.states = self.load(opt['model_file'])
                # override options with stored ones
                opt = self.override_opt(new_opt)

            self.dict = DictionaryAgent(opt)
            self.id = 'Seq2Seq'
            # we use START markers to start our output
            self.START = self.dict.start_token
            self.START_TENSOR = torch.LongTensor(self.dict.parse(self.START))
            # we use END markers to end our output
            self.END = self.dict.end_token
            self.END_TENSOR = torch.LongTensor(self.dict.parse(self.END))
            # get index of null token from dictionary (probably 0)
            self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0]

            # store important params directly
            hsz = opt['hiddensize']
            emb = opt['embeddingsize']
            self.hidden_size = hsz
            self.emb_size = emb
            self.num_layers = opt['numlayers']
            self.learning_rate = opt['learningrate']
            self.rank = opt['rank_candidates']
            self.longest_label = 1
            self.truncate = opt['truncate']
            self.attention = opt['attention']

            # set up tensors
            self.zeros = torch.zeros(self.num_layers, 1, hsz)
            self.xs = torch.LongTensor(1, 1)
            self.ys = torch.LongTensor(1, 1)
            self.cands = torch.LongTensor(1, 1, 1)
            self.cand_scores = torch.FloatTensor(1)
            self.cand_lengths = torch.LongTensor(1)

            # set up modules
            self.criterion = nn.NLLLoss()
            # lookup table stores word embeddings
            self.lt = nn.Embedding(len(self.dict), emb,
                                   padding_idx=self.NULL_IDX,
                                   scale_grad_by_freq=True)
            self.lt2enc = nn.Linear(emb, hsz)
            self.lt2dec = nn.Linear(emb, hsz)
            # encoder captures the input text
            enc_class = Seq2seqAgent.ENC_OPTS[opt['encoder']]
            self.encoder = enc_class(hsz, hsz, opt['numlayers'])
            # decoder produces our output states
            if opt['decoder'] == 'shared':
                self.decoder = self.encoder
            elif opt['decoder'] == 'same':
                self.decoder = enc_class(hsz, hsz, opt['numlayers'])
            else:
                dec_class = Seq2seqAgent.ENC_OPTS[opt['decoder']]
                self.decoder = dec_class(hsz, hsz, opt['numlayers'])
            # linear layer helps us produce outputs from final decoder state
            self.h2o = nn.Linear(hsz, len(self.dict))
            # dropout on the linear layer helps us generalize
            self.dropout = nn.Dropout(opt['dropout'])

            self.use_attention = False
            # if attention is greater than 0, set up additional members
            if self.attention > 0:
                self.use_attention = True
                self.max_length = self.attention
                # combines input and previous hidden output layer
                self.attn = nn.Linear(hsz * 2, self.max_length)
                # combines attention weights with encoder outputs
                self.attn_combine = nn.Linear(hsz * 2, hsz)

            # set up optims for each module
            lr = opt['learningrate']
            optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']]
            self.optims = {
                'lt': optim_class(self.lt.parameters(), lr=lr),
                'lt2enc': optim_class(self.lt2enc.parameters(), lr=lr),
                'lt2dec': optim_class(self.lt2dec.parameters(), lr=lr),
                'encoder': optim_class(self.encoder.parameters(), lr=lr),
                'decoder': optim_class(self.decoder.parameters(), lr=lr),
                'h2o': optim_class(self.h2o.parameters(), lr=lr),
            }

            if hasattr(self, 'states'):
                # set loaded states if applicable
                self.set_states(self.states)

            if self.use_cuda:
                self.cuda()

        self.reset()

    def override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overridden key.
        Only override args specific to the model.
        """
        model_args = {'hiddensize', 'embeddingsize', 'numlayers', 'optimizer',
                      'encoder', 'decoder'}
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v} ]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def parse(self, text):
        """Convert string to token indices."""
        return self.dict.txt2vec(text)

    def v2t(self, vec):
        """Convert token indices to string of tokens."""
        return self.dict.vec2txt(vec)

    def cuda(self):
        """Push parameters to the GPU."""
        self.START_TENSOR = self.START_TENSOR.cuda(non_blocking=True)
        self.END_TENSOR = self.END_TENSOR.cuda(non_blocking=True)
        self.zeros = self.zeros.cuda(non_blocking=True)
        self.xs = self.xs.cuda(non_blocking=True)
        self.ys = self.ys.cuda(non_blocking=True)
        self.cands = self.cands.cuda(non_blocking=True)
        self.cand_scores = self.cand_scores.cuda(non_blocking=True)
        self.cand_lengths = self.cand_lengths.cuda(non_blocking=True)
        self.criterion.cuda()
        self.lt.cuda()
        self.lt2enc.cuda()
        self.lt2dec.cuda()
        self.encoder.cuda()
        self.decoder.cuda()
        self.h2o.cuda()
        self.dropout.cuda()
        if self.use_attention:
            self.attn.cuda()
            self.attn_combine.cuda()

    def hidden_to_idx(self, hidden, dropout=False):
        """Convert hidden state vectors into indices into the dictionary."""
        if hidden.size(0) > 1:
            raise RuntimeError('bad dimensions of tensor:', hidden)
        hidden = hidden.squeeze(0)
        scores = self.h2o(hidden)
        if dropout:
            scores = self.dropout(scores)
        scores = F.log_softmax(scores, dim=1)
        _max_score, idx = scores.max(1)
        return idx, scores

    def zero_grad(self):
        """Zero out optimizers."""
        for optimizer in self.optims.values():
            optimizer.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        for optimizer in self.optims.values():
            optimizer.step()

    def reset(self):
        """Reset observation and episode_done."""
        self.observation = None
        self.episode_done = True

    def observe(self, observation):
        """Save observation for act.

        If multiple observations are from the same episode, concatenate them.
        """
        # shallow copy observation (deep copy can be expensive)
        observation = observation.copy()
        if not self.episode_done:
            # if the last example wasn't the end of an episode, then we need
            # to recall what was said in that example
            prev_dialogue = self.observation['text']
            observation['text'] = prev_dialogue + '\n' + observation['text']
        self.observation = observation
        self.episode_done = observation['episode_done']
        return observation

    def _encode(self, xs, dropout=False):
        """Call encoder and return output and hidden states."""
        batchsize = len(xs)

        # first encode context
        xes = self.lt(xs)
        if dropout:
            xes = self.dropout(xes)
        # project from emb_size to hidden_size dimensions
        xes = self.lt2enc(xes).transpose(0, 1)

        if self.zeros.size(1) != batchsize:
            self.zeros.resize_(self.num_layers, batchsize,
                               self.hidden_size).fill_(0)
        h0 = Variable(self.zeros)
        if type(self.encoder) == nn.LSTM:
            encoder_output, hidden = self.encoder(xes, (h0, h0))
            if type(self.decoder) != nn.LSTM:
                hidden = hidden[0]
        else:
            encoder_output, hidden = self.encoder(xes, h0)
            if type(self.decoder) == nn.LSTM:
                hidden = (hidden, h0)
        encoder_output = encoder_output.transpose(0, 1)

        if self.use_attention:
            if encoder_output.size(1) > self.max_length:
                offset = encoder_output.size(1) - self.max_length
                encoder_output = encoder_output.narrow(1, offset,
                                                       self.max_length)

        return encoder_output, hidden

    def _apply_attention(self, xes, encoder_output, encoder_hidden):
        """Apply attention to encoder hidden layer."""
        attn_weights = F.softmax(
            self.attn(torch.cat((xes[0], encoder_hidden[-1]), 1)), dim=1)

        if attn_weights.size(1) > encoder_output.size(1):
            attn_weights = attn_weights.narrow(1, 0, encoder_output.size(1))

        attn_applied = torch.bmm(attn_weights.unsqueeze(1),
                                 encoder_output).squeeze(1)

        output = torch.cat((xes[0], attn_applied), 1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)

        return output

    def _decode_and_train(self, batchsize, xes, ys, encoder_output, hidden):
        # update the model based on the labels
        self.zero_grad()
        loss = 0
        output_lines = [[] for _ in range(batchsize)]

        # keep track of longest label we've ever seen
        self.longest_label = max(self.longest_label, ys.size(1))
        for i in range(ys.size(1)):
            output = (self._apply_attention(xes, encoder_output, hidden)
                      if self.use_attention else xes)
            output, hidden = self.decoder(output, hidden)
            preds, scores = self.hidden_to_idx(output, dropout=True)
            y = ys.select(1, i)
            loss += self.criterion(scores, y)
            # use the true token as the next input instead of predicted
            # this produces a biased prediction but better training
            xes = self.lt2dec(self.lt(y).unsqueeze(0))
            for b in range(batchsize):
                # convert the output scores to tokens
                token = self.v2t([preds.data[b]])
                output_lines[b].append(token)

        loss.backward()
        self.update_params()

        if random.random() < 0.1:
            # sometimes output a prediction for debugging
            print('prediction:', ' '.join(output_lines[0]),
                  '\nlabel:', self.dict.vec2txt(ys.data[0]))

        return output_lines

    def _decode_only(self, batchsize, xes, ys, encoder_output, hidden):
        # just produce a prediction without training the model
        done = [False for _ in range(batchsize)]
        total_done = 0
        max_len = 0
        output_lines = [[] for _ in range(batchsize)]

        # now, generate a response from scratch
        while total_done < batchsize and max_len < self.longest_label:
            # keep producing tokens until we hit END or max length for each
            # example in the batch
            output = (self._apply_attention(xes, encoder_output, hidden)
                      if self.use_attention else xes)
            output, hidden = self.decoder(output, hidden)
            preds, scores = self.hidden_to_idx(output, dropout=False)
            xes = self.lt2dec(self.lt(preds.unsqueeze(0)))
            max_len += 1
            for b in range(batchsize):
                if not done[b]:
                    # only add more tokens for examples that aren't done yet
                    token = self.v2t([preds.data[b]])
                    if token == self.END:
                        # if we produced END, we're done
                        done[b] = True
                        total_done += 1
                    else:
                        output_lines[b].append(token)

        if random.random() < 0.1:
            # sometimes output a prediction for debugging
            print('prediction:', ' '.join(output_lines[0]))

        return output_lines

    def _score_candidates(self, cands, xe, encoder_output, hidden):
        # score each candidate separately
        # cands are exs_with_cands x cands_per_ex x words_per_cand
        # cview is total_cands x words_per_cand
        cview = cands.view(-1, cands.size(2))
        cands_xes = xe.expand(xe.size(0), cview.size(0), xe.size(2))
        sz = hidden.size()
        cands_hn = (
            hidden.view(sz[0], sz[1], 1, sz[2])
            .expand(sz[0], sz[1], cands.size(1), sz[2])
            .contiguous()
            .view(sz[0], -1, sz[2])
        )

        sz = encoder_output.size()
        cands_encoder_output = (
            encoder_output.contiguous()
            .view(sz[0], 1, sz[1], sz[2])
            .expand(sz[0], cands.size(1), sz[1], sz[2])
            .contiguous()
            .view(-1, sz[1], sz[2])
        )

        cand_scores = Variable(
            self.cand_scores.resize_(cview.size(0)).fill_(0))
        cand_lengths = Variable(
            self.cand_lengths.resize_(cview.size(0)).fill_(0))

        for i in range(cview.size(1)):
            output = (self._apply_attention(cands_xes, cands_encoder_output,
                                            cands_hn)
                      if self.use_attention else cands_xes)
            output, cands_hn = self.decoder(output, cands_hn)
            preds, scores = self.hidden_to_idx(output, dropout=False)
            cs = cview.select(1, i)
            non_nulls = cs.ne(self.NULL_IDX)
            cand_lengths += non_nulls.long()
            score_per_cand = torch.gather(scores, 1, cs.unsqueeze(1))
            cand_scores += score_per_cand.squeeze() * non_nulls.float()
            cands_xes = self.lt2dec(self.lt(cs).unsqueeze(0))

        # set empty scores to -1, so when divided by 0 they become -inf
        cand_scores -= cand_lengths.eq(0).float()
        # average the scores per token
        cand_scores /= cand_lengths.float()

        cand_scores = cand_scores.view(cands.size(0), cands.size(1))
        srtd_scores, text_cand_inds = cand_scores.sort(1, True)
        text_cand_inds = text_cand_inds.data

        return text_cand_inds

    def predict(self, xs, ys=None, cands=None):
        """Produce a prediction from our model.

        Update the model using the targets if available, otherwise rank
        candidates as well if they are available.
        """
        batchsize = len(xs)
        text_cand_inds = None
        is_training = ys is not None
        encoder_output, hidden = self._encode(xs, dropout=is_training)

        # next we use START as an input to kick off our decoder
        x = Variable(self.START_TENSOR)
        xe = self.lt2dec(self.lt(x).unsqueeze(1))
        xes = xe.expand(xe.size(0), batchsize, xe.size(2))

        # list of output tokens for each example in the batch
        output_lines = None

        if is_training:
            output_lines = self._decode_and_train(batchsize, xes, ys,
                                                  encoder_output, hidden)
        else:
            if cands is not None:
                text_cand_inds = self._score_candidates(cands, xe,
                                                        encoder_output,
                                                        hidden)
            output_lines = self._decode_only(batchsize, xes, ys,
                                             encoder_output, hidden)

        return output_lines, text_cand_inds

    def batchify(self, observations):
        """Convert a list of observations into input & target tensors."""
        # valid examples
        exs = [ex for ex in observations if 'text' in ex]
        # the indices of the valid (non-empty) tensors
        valid_inds = [i for i, ex in enumerate(observations) if 'text' in ex]

        # set up the input tensors
        batchsize = len(exs)
        # tokenize the text
        xs = None
        if batchsize > 0:
            parsed = [self.parse(ex['text']) for ex in exs]
            max_x_len = max([len(x) for x in parsed])
            if self.truncate:
                # shrink xs to limit batch computation
                min_x_len = min([len(x) for x in parsed])
                max_x_len = min(min_x_len + 12, max_x_len, 48)
                parsed = [x[-max_x_len:] for x in parsed]
            xs = torch.LongTensor(batchsize, max_x_len).fill_(0)
            # pack the data to the right side of the tensor for this model
            for i, x in enumerate(parsed):
                offset = max_x_len - len(x)
                for j, idx in enumerate(x):
                    xs[i][j + offset] = idx
            if self.use_cuda:
                # copy to gpu
                self.xs.resize_(xs.size())
                self.xs.copy_(xs, non_blocking=True)
                xs = Variable(self.xs)
            else:
                xs = Variable(xs)

        # set up the target tensors
        ys = None
        if batchsize > 0 and any(['labels' in ex for ex in exs]):
            # randomly select one of the labels to update on, if multiple
            # append END to each label
            labels = [random.choice(ex.get('labels', [''])) + ' ' + self.END
                      for ex in exs]
            parsed = [self.parse(y) for y in labels]
            max_y_len = max(len(y) for y in parsed)
            if self.truncate:
                # shrink ys to limit batch computation
                min_y_len = min(len(y) for y in parsed)
                max_y_len = min(min_y_len + 12, max_y_len, 48)
                parsed = [y[:max_y_len] for y in parsed]
            ys = torch.LongTensor(batchsize, max_y_len).fill_(0)
            for i, y in enumerate(parsed):
                for j, idx in enumerate(y):
                    ys[i][j] = idx
            if self.use_cuda:
                # copy to gpu
                self.ys.resize_(ys.size())
                self.ys.copy_(ys, non_blocking=True)
                ys = Variable(self.ys)
            else:
                ys = Variable(ys)

        # set up candidates
        cands = None
        valid_cands = None
        if ys is None and self.rank:
            # only do ranking when no targets available and ranking flag set
            parsed = []
            valid_cands = []
            for i in valid_inds:
                if 'label_candidates' in observations[i]:
                    # each candidate tuple is a pair of the parsed version
                    # and the original full string
                    cs = list(observations[i]['label_candidates'])
                    parsed.append([self.parse(c) for c in cs])
                    valid_cands.append((i, cs))
            if len(parsed) > 0:
                # TODO: store lengths of cands separately, so don't have zero
                # padding for varying number of cands per example
                # found cands, pack them into tensor
                max_c_len = max(max(len(c) for c in cs) for cs in parsed)
                max_c_cnt = max(len(cs) for cs in parsed)
                cands = torch.LongTensor(len(parsed), max_c_cnt,
                                         max_c_len).fill_(0)
                for i, cs in enumerate(parsed):
                    for j, c in enumerate(cs):
                        for k, idx in enumerate(c):
                            cands[i][j][k] = idx
                if self.use_cuda:
                    # copy to gpu
                    self.cands.resize_(cands.size())
                    self.cands.copy_(cands, non_blocking=True)
                    cands = Variable(self.cands)
                else:
                    cands = Variable(cands)

        return xs, ys, valid_inds, cands, valid_cands

    def batch_act(self, observations):
        batchsize = len(observations)
        # initialize a table of replies with this agent's id
        batch_reply = [{'id': self.getID()} for _ in range(batchsize)]

        # convert the observations into batches of inputs and targets
        # valid_inds tells us the indices of all valid examples
        # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1]
        # since the other three elements had no 'text' field
        xs, ys, valid_inds, cands, valid_cands = self.batchify(observations)

        if xs is None:
            # no valid examples, just return the empty responses we set up
            return batch_reply

        # produce predictions either way, but use the targets if available
        predictions, text_cand_inds = self.predict(xs, ys, cands)

        for i in range(len(predictions)):
            # map the predictions back to non-empty examples in the batch
            # we join with spaces since we produce tokens one at a time
            curr = batch_reply[valid_inds[i]]
            curr['text'] = ' '.join(c for c in predictions[i]
                                    if c != self.END
                                    and c != self.dict.null_token)

        if text_cand_inds is not None:
            for i in range(len(valid_cands)):
                order = text_cand_inds[i]
                batch_idx, curr_cands = valid_cands[i]
                curr = batch_reply[batch_idx]
                curr['text_candidates'] = [curr_cands[idx] for idx in order
                                           if idx < len(curr_cands)]

        return batch_reply

    def act(self):
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def save(self, path=None):
        """Save model parameters if model_file is set."""
        path = self.opt.get('model_file', None) if path is None else path

        if path and hasattr(self, 'lt'):
            model = {}
            model['lt'] = self.lt.state_dict()
            model['lt2enc'] = self.lt2enc.state_dict()
            model['lt2dec'] = self.lt2dec.state_dict()
            model['encoder'] = self.encoder.state_dict()
            model['decoder'] = self.decoder.state_dict()
            model['h2o'] = self.h2o.state_dict()
            model['optims'] = {k: v.state_dict()
                               for k, v in self.optims.items()}
            model['longest_label'] = self.longest_label
            model['opt'] = self.opt

            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            model = torch.load(read)
        return model['opt'], model

    def set_states(self, states):
        """Set the state dicts of the modules from saved states."""
        self.lt.load_state_dict(states['lt'])
        self.lt2enc.load_state_dict(states['lt2enc'])
        self.lt2dec.load_state_dict(states['lt2dec'])
        self.encoder.load_state_dict(states['encoder'])
        self.decoder.load_state_dict(states['decoder'])
        self.h2o.load_state_dict(states['h2o'])
        for k, v in states['optims'].items():
            self.optims[k].load_state_dict(v)
        self.longest_label = states['longest_label']
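# A minimal sketch of driving Seq2seqAgent through one observe/act exchange.
# It assumes ParlaiParser.parse_args accepts `args` and `print_args` (as the
# dictionary test elsewhere in this file suggests); the agent is untrained
# here, so act() returns arbitrary tokens.
from parlai.core.params import ParlaiParser

argparser = ParlaiParser()
Seq2seqAgent.add_cmdline_args(argparser)
opt = argparser.parse_args(args=[], print_args=False)
agent = Seq2seqAgent(opt)
agent.observe({'text': 'hello world', 'episode_done': True})
reply = agent.act()  # batch_act on a batch of one
print(reply['text'])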
class LanguageModelAgent(Agent):
    """Agent which trains an RNN on a language modeling task.

    It is adapted from the language model featured in Pytorch's examples repo
    here: <https://github.com/pytorch/examples/tree/master/word_language_model>.
    """

    @staticmethod
    def dictionary_class():
        return DictionaryAgent

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        argparser.set_defaults(batch_sort=False)
        LanguageModelAgent.dictionary_class().add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Language Model Arguments')
        agent.add_argument('-hs', '--hiddensize', type=int, default=200,
                           help='size of the hidden layers')
        agent.add_argument('-esz', '--embeddingsize', type=int, default=200,
                           help='size of the token embeddings')
        agent.add_argument('-nl', '--numlayers', type=int, default=2,
                           help='number of hidden layers')
        agent.add_argument('-lr', '--learningrate', type=float, default=20,
                           help='initial learning rate')
        agent.add_argument('-dr', '--dropout', type=float, default=0.2,
                           help='dropout rate')
        agent.add_argument('-clip', '--gradient-clip', type=float,
                           default=0.25, help='gradient clipping')
        agent.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
        agent.add_argument('-rnn', '--rnn-class', default='LSTM',
                           help='type of recurrent net '
                                '(RNN_TANH, RNN_RELU, LSTM, GRU)')
        agent.add_argument('-sl', '--seq-len', type=int, default=35,
                           help='sequence length')
        agent.add_argument('-tied', '--emb-tied', action='store_true',
                           help='tie the word embedding and softmax weights')
        agent.add_argument('-seed', '--random-seed', type=int, default=1111,
                           help='random seed')
        agent.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
        agent.add_argument('-tr', '--truncate-pred', type=int, default=50,
                           help='truncate predictions')
        agent.add_argument('-rf', '--report-freq', type=float, default=0.1,
                           help='report frequency of prediction during eval')

    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init
        self.states = {}
        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
        self.batchsize = opt.get('batchsize', 1)

        if shared:
            # set up shared properties
            self.dict = shared['dict']
            if 'model' in shared:
                # model is shared during hogwild
                self.model = shared['model']
                self.states = shared['states']
            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]
        else:
            # this is not a shared instance of this class, so do full init
            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, self.states = self.load(opt['model_file'])
                # override model-specific options with stored ones
                opt = self.override_opt(new_opt)

            if opt['dict_file'] is None and opt.get('model_file'):
                # set default dict-file if not set
                opt['dict_file'] = opt['model_file'] + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'LanguageModel'
            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]

            # set model
            self.model = RNNModel(opt, len(self.dict))

            if self.states:
                # set loaded states if applicable
                self.model.load_state_dict(self.states['model'])

            if self.use_cuda:
                self.model.cuda()

        self.next_observe = []
        self.next_batch = []

        self.is_training = True

        if hasattr(self, 'model'):
            # if model was built, do more setup
            self.clip = opt.get('gradient_clip', 0.25)
            # set up criteria
            self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX)
            if self.use_cuda:
                # push to cuda
                self.criterion.cuda()
            # set up criterion for eval: we do not want to average over size
            self.eval_criterion = nn.CrossEntropyLoss(
                ignore_index=self.NULL_IDX, reduction='sum')
            if self.use_cuda:
                # push to cuda
                self.eval_criterion.cuda()
            # init hidden state
            self.hidden = self.model.init_hidden(self.batchsize)
            # init tensor of end tokens
            self.ends = torch.LongTensor(
                [self.END_IDX for _ in range(self.batchsize)])
            if self.use_cuda:
                self.ends = self.ends.cuda()
            # set up optimizer
            self.lr = opt['learningrate']
            best_val_loss = None

        self.reset()

    def override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overridden key.
        Only override args specific to the model.
        """
        model_args = {'hiddensize', 'embeddingsize', 'numlayers', 'dropout',
                      'seq_len', 'emb_tied'}
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v} ]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def parse(self, text):
        """Convert string to token indices."""
        return self.dict.txt2vec(text)

    def zero_grad(self):
        """Zero out optimizer."""
        self.model.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
        for p in self.model.parameters():
            p.data.add_(p.grad.data, alpha=-self.lr)

    def reset(self):
        """Reset observation."""
        self.observation = None

    def share(self):
        """Share internal states between parent and child instances."""
        shared = super().share()
        shared['dict'] = self.dict
        shared['NULL_IDX'] = self.NULL_IDX
        shared['END_IDX'] = self.END_IDX
        if self.opt.get('numthreads', 1) > 1:
            shared['model'] = self.model
            self.model.share_memory()
            shared['states'] = self.states
        return shared

    def observe(self, observation):
        """Save observation for act.

        If multiple observations are from the same episode, concatenate them.
        """
        # shallow copy observation (deep copy can be expensive)
        obs = observation.copy()
        seq_len = self.opt['seq_len']
        is_training = True
        if 'eval_labels' in obs:
            is_training = False

        if is_training:
            if 'text' in obs:
                vec = self.parse(obs['text'])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if 'labels' in obs:
                vec = self.parse(obs['labels'][0])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if len(self.next_observe) < (seq_len + 1):
                # not enough to return to make a batch
                # we handle this case in vectorize
                # labels indicates that we are training
                self.observation = {'labels': ''}
                return self.observation
            else:
                vecs_to_return = []
                total = len(self.next_observe) // (seq_len + 1)
                for _ in range(total):
                    observe = self.next_observe[:(seq_len + 1)]
                    self.next_observe = self.next_observe[(seq_len + 1):]
                    vecs_to_return.append(observe)
                dict_to_return = {'text': '', 'labels': '',
                                  'text2vec': vecs_to_return}
                self.observation = dict_to_return
                return dict_to_return
        else:
            self.observation = obs
            return obs

    def repackage_hidden(self, h):
        """Wrap hidden states in new Variables to detach them from their
        history.
        """
        if type(h) == Variable:
            return Variable(h.data)
        else:
            return tuple(self.repackage_hidden(v) for v in h)

    def get_target_loss(self, data, hidden, targets, y_lens):
        """Calculate the loss with respect to the targets, token by token,
        where each output token is conditioned on either the input or the
        previous target token.
        """
        loss = 0.0
        bsz = data.size(0)

        # feed in inputs without end token
        output, hidden = self.model(data.transpose(0, 1), hidden)
        self.hidden = self.repackage_hidden(hidden)
        # feed in end tokens
        output, hidden = self.model(Variable(self.ends[:bsz].view(1, bsz)),
                                    self.hidden)
        self.hidden = self.repackage_hidden(hidden)
        output_flat = output.view(-1, len(self.dict))
        loss += self.eval_criterion(output_flat,
                                    targets.select(1, 0).view(-1)).data

        for i in range(1, targets.size(1)):
            output, hidden = self.model(targets.select(1, i - 1).view(1, bsz),
                                        self.hidden, no_pack=True)
            self.hidden = self.repackage_hidden(hidden)
            output_flat = output.view(-1, len(self.dict))
            loss += self.eval_criterion(output_flat,
                                        targets.select(1, i).view(-1)).data

        return loss / float(sum(y_lens))

    def get_predictions(self, data):
        """Generate predictions word by word until we either reach the end
        token or some maximum length (opt['truncate_pred']).
        """
        token_list = []
        bsz = data.size(0)
        done = [False for _ in range(bsz)]
        total_done = 0
        hidden = self.model.init_hidden(bsz)

        i = 0
        while total_done < bsz and i <= self.opt['truncate_pred']:
            if i == 0:
                # feed in input without end tokens
                output, hidden = self.model(data.transpose(0, 1), hidden)
                hidden = self.repackage_hidden(hidden)
                # feed in end tokens
                output, hidden = self.model(
                    Variable(self.ends[:bsz].view(1, bsz)), hidden)
            else:
                output, hidden = self.model(Variable(word_idx.view(1, bsz)),
                                            hidden, no_pack=True)
            hidden = self.repackage_hidden(hidden)
            word_weights = output.squeeze().data.exp()
            if bsz > 1:
                value, word_idx = torch.max(word_weights, 1)
            else:
                value, word_idx = torch.max(word_weights, 0)
            # mark end indices for items in batch
            for k in range(word_idx.size(0)):
                if not done[k]:
                    if int(word_idx[k]) == self.END_IDX:
                        done[k] = True
                        total_done += 1
            token_list.append(word_idx.view(bsz, 1))
            i += 1

        return torch.cat(token_list, 1)

    def predict(self, data, hidden, targets=None, is_training=True,
                y_lens=None):
        """Produce a prediction from our model."""
        loss_dict = None
        output = None
        predictions = None
        if is_training:
            self.model.train()
            self.zero_grad()
            output, hidden = self.model(data, hidden)
            loss = self.criterion(output.view(-1, len(self.dict)),
                                  targets.view(-1))
            loss.backward(retain_graph=True)
            self.update_params()
            loss_dict = {'lmloss': loss.data}
            loss_dict['lmppl'] = math.exp(loss.data)
        else:
            self.model.eval()
            predictions = self.get_predictions(data)
            loss_dict = {}
            bsz = data.size(0)
            if bsz != self.batchsize:
                self.hidden = self.model.init_hidden(bsz)
            loss = self.get_target_loss(data, self.hidden, targets, y_lens)
            loss_dict['loss'] = loss
            loss_dict['ppl'] = math.exp(loss)

        return output, hidden, loss_dict, predictions

    def vectorize(self, observations, seq_len, is_training):
        """Convert a list of observations into input & target tensors."""
        labels = None
        valid_inds = None
        y_lens = None
        if is_training:
            for obs in observations:
                if obs:
                    if 'text2vec' in obs:
                        self.next_batch += obs['text2vec']
            if len(self.next_batch) <= self.batchsize:
                return None, None, None, None, None
            else:
                data_list = []
                targets_list = []
                # total is the number of batches
                total = len(self.next_batch) // self.batchsize
                for i in range(total):
                    batch = self.next_batch[:self.batchsize]
                    self.next_batch = self.next_batch[self.batchsize:]
                    source = torch.LongTensor(batch).t().contiguous()
                    data = Variable(source[:seq_len])
                    targets = Variable(source[1:])
                    if self.use_cuda:
                        data = data.cuda()
                        targets = targets.cuda()
                    data_list.append(data)
                    targets_list.append(targets)
        else:
            # here we get valid examples and pad them with zeros
            xs, ys, labels, valid_inds, _, y_lens = PaddingUtils.pad_text(
                observations, self.dict, self.END_IDX, self.NULL_IDX)
            if self.use_cuda:
                xs = Variable(xs).cuda()
                ys = Variable(ys).cuda()
            else:
                xs = Variable(xs)
                ys = Variable(ys)
            data_list = [xs]
            targets_list = [ys]

        return data_list, targets_list, labels, valid_inds, y_lens

    def batch_act(self, observations):
        batch_reply = [{'id': self.getID()} for _ in range(len(observations))]

        if any(['labels' in obs for obs in observations]):
            # if we are starting a new training epoch, reinitialize hidden
            if not self.is_training:
                self.hidden = self.model.init_hidden(self.batchsize)
            self.is_training = True
            data_list, targets_list, _, _, y_lens = self.vectorize(
                observations, self.opt['seq_len'], self.is_training)
        else:
            # if we just finished training, reinitialize hidden
            if self.is_training:
                self.hidden = self.model.init_hidden(self.batchsize)
            self.is_training = False
            data_list, targets_list, labels, valid_inds, y_lens = (
                self.vectorize(observations, self.opt['seq_len'],
                               self.is_training))

        if data_list is None:
            # not enough data to batch act yet, return empty responses
            return batch_reply

        batch_reply = []
        # during evaluation, len(data_list) is always 1
        # during training, len(data_list) >= 0: vectorize returns a list
        # containing all batches available at the time it is called
        for i in range(len(data_list)):
            temp_dicts = [{'id': self.getID()}
                          for _ in range(len(observations))]
            output, hidden, loss_dict, predictions = self.predict(
                data_list[i], self.hidden, targets_list[i],
                self.is_training, y_lens)
            self.hidden = self.repackage_hidden(hidden)

            if predictions is not None:
                # map predictions back to the right order
                PaddingUtils.map_predictions(
                    predictions, valid_inds, temp_dicts, observations,
                    self.dict, self.END_IDX,
                    report_freq=self.opt['report_freq'])

            if loss_dict is not None:
                if 'metrics' in temp_dicts[0]:
                    for k, v in loss_dict.items():
                        temp_dicts[0]['metrics'][k] = v
                else:
                    temp_dicts[0]['metrics'] = loss_dict

            batch_reply += temp_dicts

        return batch_reply

    def act(self):
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def save(self, path=None):
        """Save model parameters if model_file is set."""
        path = self.opt.get('model_file', None) if path is None else path

        if path and hasattr(self, 'model'):
            model = {}
            model['model'] = self.model.state_dict()
            model['opt'] = self.opt

            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            states = torch.load(read)
        return states['opt'], states
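# The observe()/vectorize() pair above buffers the token stream and cuts it
# into (seq_len + 1)-length chunks; each chunk then supplies an input of
# seq_len tokens and a target shifted one position right. A toy demo of that
# slicing with made-up token ids (no ParlAI or torch required):
seq_len = 4
stream = [10, 11, 12, 13, 14, 20, 21, 22, 23, 24]  # buffered token ids
chunks = [stream[i:i + seq_len + 1]
          for i in range(0, len(stream) - seq_len, seq_len + 1)]
for chunk in chunks:
    data, targets = chunk[:seq_len], chunk[1:]
    print('input:', data, '-> targets:', targets)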
class FlickrDataset(Dataset):
    """A Pytorch Dataset utilizing streaming."""

    def __init__(self, opt):
        self.opt = opt
        self.use_hdf5 = opt.get('use_hdf5', False)
        self.datatype = self.opt.get('datatype')
        self.training = self.datatype.startswith('train')
        self.num_epochs = self.opt.get('num_epochs', 0)
        self.image_loader = ImageLoader(opt)
        caption_path, self.image_path = _path(opt)
        self._setup_data(caption_path, opt.get('unittest', False))
        if self.use_hdf5:
            try:
                import h5py
                self.h5py = h5py
            except ModuleNotFoundError:
                raise ModuleNotFoundError(
                    'Need to install h5py - `pip install h5py`')
            self._setup_image_data()
        self.dict_agent = DictionaryAgent(opt)

    def __getitem__(self, index):
        index %= self.num_episodes()
        cap = self.caption[index]
        ep = {
            'text': self.dict_agent.txt2vec(QUESTION),
            'image': self.get_image(cap['image_id']),
            'episode_done': True,
        }
        if self.opt.get('extract_image', False):
            ep['image_id'] = cap['image_id']
            return ep
        ep['labels'] = [self.dict_agent.txt2vec(cc)
                        for cc in cap['captions']]
        ep['valid'] = True
        ep['use_hdf5'] = self.use_hdf5
        return (index, ep)

    def __len__(self):
        num_epochs = self.num_epochs if self.num_epochs > 0 else 100
        num_iters = num_epochs if self.training else 1
        return int(num_iters * self.num_episodes())

    def _load_lens(self):
        with open(self.length_datafile) as length:
            lengths = json.load(length)
            self.num_eps = lengths['num_eps']
            self.num_exs = lengths['num_exs']

    def _setup_data(self, caption_path, unittest):
        with open(caption_path) as data_file:
            self.caption = []
            prev_img_id = None
            for line in data_file:
                img_id = line.split('#')[0][:-4]
                caption = line.split('\t')[1]
                if img_id != prev_img_id:
                    prev_img_id = img_id
                    to_add = {}
                    to_add['image_id'] = int(img_id)
                    to_add['captions'] = [caption]
                    self.caption.append(to_add)
                else:
                    self.caption[-1]['captions'].append(caption)
        if unittest:
            self.caption = self.caption[:10]
        self.image_paths = set()
        for cap in self.caption:
            self.image_paths.add(
                os.path.join(self.image_path, '%d.jpg' % (cap['image_id'])))

    def _setup_image_data(self):
        """Load the hdf5 image dataset."""
        extract_feats(self.opt)
        im = self.opt.get('image_mode')
        hdf5_path = self.image_path + 'mode_{}_noatt.hdf5'.format(im)
        hdf5_file = self.h5py.File(hdf5_path, 'r')
        self.image_dataset = hdf5_file['images']

        image_id_to_idx_path = (self.image_path +
                                'mode_{}_id_to_idx.txt'.format(im))
        with open(image_id_to_idx_path, 'r') as f:
            self.image_id_to_idx = json.load(f)

    def get_image(self, image_id):
        if not self.use_hdf5:
            im_path = os.path.join(self.image_path, '%d.jpg' % (image_id))
            return self.image_loader.load(im_path)
        else:
            img_idx = self.image_id_to_idx[str(image_id)]
            return torch.Tensor(self.image_dataset[img_idx])

    def num_episodes(self):
        return len(self.caption)

    def num_examples(self):
        return self.num_episodes()

    def num_images(self):
        return self.num_episodes()
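# Hypothetical usage sketch for FlickrDataset. Because __getitem__ returns
# (index, episode_dict) pairs whose fields vary in length, a pass-through
# collate_fn is needed; `opt` is assumed to be a ParlAI opt dict with the
# Flickr30k data paths already configured, and batch_size is arbitrary.
from torch.utils.data import DataLoader

def collate_episodes(batch):
    # keep the list of (index, episode) pairs as-is; no tensor stacking
    return batch

loader = DataLoader(FlickrDataset(opt), batch_size=4,
                    collate_fn=collate_episodes, shuffle=False)
for indexed_episodes in loader:
    for index, episode in indexed_episodes:
        pass  # feed each episode to an agent or teacher here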