def build_dict(opt):
    if not opt.get('dict_file'):
        print('Tried to build dictionary but `--dict-file` is not set. Set ' +
              'this param so the dictionary can be saved.')
        return
    print('[ setting up dictionary. ]')
    if os.path.isfile(opt['dict_file']):
        # Dictionary already built
        print('[ dictionary already built. ]')
        return
    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)
    ordered_opt = copy.deepcopy(opt)
    cnt = 0
    # we use the train set to build the dictionary
    ordered_opt['datatype'] = 'train:ordered'
    if 'stream' in opt['datatype']:
        ordered_opt['datatype'] += ':stream'
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    world_dict = create_task(ordered_opt, dictionary)
    # pass examples to dictionary
    for _ in world_dict:
        cnt += 1
        if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] > 0:
            print('Processed {} exs, moving on.'.format(opt['dict_maxexs']))
            # don't wait too long...
            break
        world_dict.parley()
    print('[ dictionary built. ]')
    dictionary.save(opt['dict_file'], sort=True)
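# A minimal usage sketch, assuming the standard ParlAI flags (-t/--task,
# --dict-file) and that the parser below registers --dict-maxexs; the task
# name and path are placeholders, adjust them to your setup.
if __name__ == '__main__':
    from parlai.core.params import ParlaiParser

    parser = ParlaiParser()
    DictionaryAgent.add_cmdline_args(parser)
    opt = parser.parse_args(['-t', 'babi:task10k:1',
                             '--dict-file', '/tmp/babi.dict'])
    build_dict(opt)  # streams the train set once, then saves the sorted dict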
def add_cmdline_args(argparser):
    DictionaryAgent.add_cmdline_args(argparser)
    arg_group = argparser.add_argument_group('MemNN Arguments')
    arg_group.add_argument('-lr', '--learning-rate', type=float, default=0.01,
                           help='learning rate')
    arg_group.add_argument('--embedding-size', type=int, default=128,
                           help='size of token embeddings')
    arg_group.add_argument('--hops', type=int, default=3,
                           help='number of memory hops')
    arg_group.add_argument('--mem-size', type=int, default=100,
                           help='size of memory')
    arg_group.add_argument('--time-features', type='bool', default=True,
                           help='use time features for memory embeddings')
    arg_group.add_argument('--position-encoding', type='bool', default=False,
                           help='use position encoding instead of bag of '
                                'words embedding')
    arg_group.add_argument('--output', type=str, default='rank',
                           help='type of output (rank|generate)')
    arg_group.add_argument('--rnn-layers', type=int, default=2,
                           help='number of hidden layers in RNN decoder for '
                                'generative output')
    arg_group.add_argument('--dropout', type=float, default=0.1,
                           help='dropout probability for RNN decoder training')
    arg_group.add_argument('--optimizer', default='adam',
                           help='optimizer type (sgd|adam)')
    arg_group.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
    arg_group.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
def add_cmdline_args(argparser):
    """Add command-line arguments specifically for this agent."""
    DictionaryAgent.add_cmdline_args(argparser)
    agent = argparser.add_argument_group('Fairseq Arguments')
    agent.add_argument(
        '-tr', '--truncate', type=int, default=-1,
        help='truncate input & output lengths to speed up training (may '
             'reduce accuracy). This fixes all input and output to have a '
             'maximum length. This reduces the total amount of padding in '
             'the batches.')
    agent.add_argument(
        '--max-positions', default=1024, type=int, metavar='N',
        help='max number of tokens in the sequence')
    agent.add_argument(
        '--seed', default=1, type=int, metavar='N',
        help='pseudo random number generator seed')
    options.add_optimization_args(argparser)
    options.add_generation_args(argparser)
    options.add_model_args(argparser)
def test_basic_parse(self):
    """Check that the dictionary is correctly adding and parsing a short
    sentence.
    """
    from parlai.core.dict import DictionaryAgent
    from parlai.core.params import ParlaiParser

    argparser = ParlaiParser()
    DictionaryAgent.add_cmdline_args(argparser)
    opt = argparser.parse_args()
    dictionary = DictionaryAgent(opt)
    num_builtin = len(dictionary)

    dictionary.observe({'text': 'hello world'})
    dictionary.act()
    assert len(dictionary) - num_builtin == 2

    vec = dictionary.parse('hello world')
    assert len(vec) == 2
    assert vec[0] == num_builtin
    assert vec[1] == num_builtin + 1

    vec = dictionary.parse('hello world', vec_type=list)
    assert len(vec) == 2
    assert vec[0] == num_builtin
    assert vec[1] == num_builtin + 1

    vec = dictionary.parse('hello world', vec_type=tuple)
    assert len(vec) == 2
    assert vec[0] == num_builtin
    assert vec[1] == num_builtin + 1
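def test_parse_roundtrip(self):
    """Companion sketch to the test above, assuming vec2txt is the inverse
    of parse/txt2vec (the v2t helpers in the agents below rely on this).
    """
    from parlai.core.dict import DictionaryAgent
    from parlai.core.params import ParlaiParser

    argparser = ParlaiParser()
    DictionaryAgent.add_cmdline_args(argparser)
    opt = argparser.parse_args()
    dictionary = DictionaryAgent(opt)
    dictionary.observe({'text': 'hello world'})
    dictionary.act()  # adds 'hello' and 'world' to the dictionary
    vec = dictionary.parse('hello world')
    assert dictionary.vec2txt(vec) == 'hello world'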
def add_cmdline_args(argparser):
    """Add command-line arguments specifically for this agent."""
    DictionaryAgent.add_cmdline_args(argparser)
    group = argparser.add_argument_group('Cooperative Game Agent Arguments')
    group.add_argument('--optimizer', default='adam',
                       choices=CooperativeGameAgent.OPTIM_OPTS.keys(),
                       help='Choose between pytorch optimizers. Any member '
                            'of torch.optim is valid and will be used with '
                            'default params except learning rate (as '
                            'specified by -lr).')
    group.add_argument('--learning-rate', default=1e-2, type=float,
                       help='Initial learning rate')
    group.add_argument('--no-cuda', action='store_true', default=False,
                       help='disable GPUs even if available')
    group.add_argument('--gpuid', type=int, default=-1,
                       help='which GPU device to use (defaults to cpu)')
def add_cmdline_args(argparser):
    DictionaryAgent.add_cmdline_args(argparser)
    agent = argparser.add_argument_group('Seq2Seq Arguments')
    agent.add_argument('-hs', '--hiddensize', type=int, default=64,
                       help='size of the hidden layers and embeddings')
    agent.add_argument('-nl', '--numlayers', type=int, default=2,
                       help='number of hidden layers')
    agent.add_argument('-lr', '--learningrate', type=float, default=0.5,
                       help='learning rate')
    agent.add_argument('-dr', '--dropout', type=float, default=0.1,
                       help='dropout rate')
    agent.add_argument('--no-cuda', action='store_true', default=False,
                       help='disable GPUs even if available')
    agent.add_argument('--gpu', type=int, default=-1,
                       help='which GPU device to use')
def add_cmdline_args(argparser):
    group = DictionaryAgent.add_cmdline_args(argparser)
    group.add_argument(
        '--pretrained_words', type='bool', default=True,
        help='Use only words found in provided embedding_file')
    group.set_defaults(dict_tokenizer='spacy')
def add_cmdline_args(argparser):
    """Add command-line arguments specifically for this agent."""
    DictionaryAgent.add_cmdline_args(argparser)
    agent = argparser.add_argument_group('Seq2Seq Arguments')
    agent.add_argument('-hs', '--hiddensize', type=int, default=128,
                       help='size of the hidden layers')
    agent.add_argument('-emb', '--embeddingsize', type=int, default=128,
                       help='size of the token embeddings')
    agent.add_argument('-nl', '--numlayers', type=int, default=2,
                       help='number of hidden layers')
    agent.add_argument('-lr', '--learningrate', type=float, default=0.5,
                       help='learning rate')
    agent.add_argument('-dr', '--dropout', type=float, default=0.1,
                       help='dropout rate')
    agent.add_argument('-att', '--attention', type=int, default=0,
                       help='if greater than 0, use attention of specified'
                            ' length while decoding')
    agent.add_argument('--no-cuda', action='store_true', default=False,
                       help='disable GPUs even if available')
    agent.add_argument('--gpu', type=int, default=-1,
                       help='which GPU device to use')
    agent.add_argument('-rc', '--rank-candidates', type='bool',
                       default=False,
                       help='rank candidates if available. this is done by'
                            ' computing the mean score per token for each '
                            'candidate and selecting the highest scoring.')
    agent.add_argument('-tr', '--truncate', type='bool', default=True,
                       help='truncate input & output lengths to speed up '
                            'training (may reduce accuracy). This fixes all '
                            'input and output to have a maximum length and to '
                            'be similar in length to one another by throwing '
                            'away extra tokens. This reduces the total amount '
                            'of padding in the batches.')
    agent.add_argument('-enc', '--encoder', default='gru',
                       choices=Seq2seqAgent.ENC_OPTS.keys(),
                       help='Choose between different encoder modules.')
    agent.add_argument('-dec', '--decoder', default='same',
                       choices=['same', 'shared'] +
                               list(Seq2seqAgent.ENC_OPTS.keys()),
                       help='Choose between different decoder modules. '
                            'Default "same" uses same class as encoder, '
                            'while "shared" also uses the same weights.')
    agent.add_argument('-opt', '--optimizer', default='sgd',
                       choices=Seq2seqAgent.OPTIM_OPTS.keys(),
                       help='Choose between pytorch optimizers. '
                            'Any member of torch.optim is valid and will '
                            'be used with default params except learning '
                            'rate (as specified by -lr).')
def add_cmdline_args(argparser):
    """Add command-line arguments specifically for this agent.

    Default values are set according to (Kottur et al. 2017).
    """
    DictionaryAgent.add_cmdline_args(argparser)
    group = argparser.add_argument_group('Questioner Agent Arguments')
    group.add_argument('--q-in-vocab', default=13, type=int,
                       help='Input vocabulary for questioner. Usually '
                            'includes total distinct words spoken by '
                            'answerer, questioner itself, and words by '
                            'which the goal is described.')
    group.add_argument('--q-embed-size', default=20, type=int,
                       help='Size of word embeddings for questioner')
    group.add_argument('--q-state-size', default=100, type=int,
                       help='Size of hidden state of questioner')
    group.add_argument('--q-out-vocab', default=3, type=int,
                       help='Output vocabulary for questioner')
    group.add_argument('--q-num-pred', default=12, type=int,
                       help='Size of output to be predicted (for goal).')
    super().add_cmdline_args(argparser)
def __init__(self, opt, shared=None):
    opt['cuda'] = not opt['no_cuda'] and torch.cuda.is_available()
    if opt['cuda']:
        print('[ Using CUDA ]')
        torch.cuda.set_device(opt['gpu'])

    if not shared:
        self.opt = opt
        self.id = 'MemNN'
        self.dict = DictionaryAgent(opt)
        self.answers = [None] * opt['batchsize']

        self.model = MemNN(opt, self.dict)
        self.mem_size = opt['mem_size']
        self.loss_fn = CrossEntropyLoss()

        self.decoder = None
        self.longest_label = 1
        self.END = self.dict.end_token
        self.END_TENSOR = torch.LongTensor(self.dict.parse(self.END))
        self.START = self.dict.start_token
        self.START_TENSOR = torch.LongTensor(self.dict.parse(self.START))
        if opt['output'] == 'generate' or opt['output'] == 'g':
            self.decoder = Decoder(opt['embedding_size'],
                                   opt['embedding_size'],
                                   opt['rnn_layers'], opt, self.dict)
        elif opt['output'] != 'rank' and opt['output'] != 'r':
            raise NotImplementedError('Output type not supported.')

        optim_params = [p for p in self.model.parameters()
                        if p.requires_grad]
        lr = opt['learning_rate']
        if opt['optimizer'] == 'sgd':
            self.optimizers = {'memnn': optim.SGD(optim_params, lr=lr)}
            if self.decoder is not None:
                self.optimizers['decoder'] = optim.SGD(
                    self.decoder.parameters(), lr=lr)
        elif opt['optimizer'] == 'adam':
            self.optimizers = {'memnn': optim.Adam(optim_params, lr=lr)}
            if self.decoder is not None:
                self.optimizers['decoder'] = optim.Adam(
                    self.decoder.parameters(), lr=lr)
        else:
            raise NotImplementedError('Optimizer not supported.')

        if opt['cuda']:
            self.model.share_memory()
            if self.decoder is not None:
                self.decoder.cuda()

        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            print('Loading existing model parameters from ' +
                  opt['model_file'])
            self.load(opt['model_file'])
    else:
        self.answers = shared['answers']

    self.episode_done = True
    self.last_cands, self.last_cands_list = None, None
    super().__init__(opt, shared)
def __init__(self, opt, shared=None):
    # initialize defaults first
    super().__init__(opt, shared)
    if not shared:
        # this is not a shared instance of this class, so do full
        # initialization. if shared is set, only set up shared members.
        saved_state = None
        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            # load model parameters if available
            print('Loading existing model params from ' + opt['model_file'])
            new_opt, saved_state = self.load(opt['model_file'])
            # override options with stored ones
            opt = self._override_opt(new_opt)

        self.args = OptWrapper(opt)
        self.parlai_dict = DictionaryAgent(opt)
        self.fairseq_dict = _make_fairseq_dict(self.parlai_dict)
        self.id = 'Fairseq'
        self.truncate = opt['truncate'] if opt['truncate'] > 0 else None

        self.EOS = self.fairseq_dict[self.fairseq_dict.eos()]
        self.EOS_TENSOR = (torch.LongTensor(1, 1)
                           .fill_(self.fairseq_dict.eos()))
        self.NULL_IDX = self.fairseq_dict.pad()

        encoder = fconv.FConvEncoder(
            self.fairseq_dict,
            embed_dim=self.args.encoder_embed_dim,
            convolutions=eval(self.args.encoder_layers),
            dropout=self.args.dropout,
            max_positions=self.args.max_positions)
        decoder = fconv.FConvDecoder(
            self.fairseq_dict,
            embed_dim=self.args.decoder_embed_dim,
            convolutions=eval(self.args.decoder_layers),
            out_embed_dim=self.args.decoder_out_embed_dim,
            attention=eval(self.args.decoder_attention),
            dropout=self.args.dropout,
            max_positions=self.args.max_positions)
        self.model = fconv.FConvModel(encoder, decoder)

        # from fairseq's build_criterion()
        if self.args.label_smoothing > 0:
            self.criterion = criterions.LabelSmoothedCrossEntropyCriterion(
                self.args.label_smoothing, self.NULL_IDX)
        else:
            self.criterion = criterions.CrossEntropyCriterion(
                self.args, self.fairseq_dict)

        self.trainer = MultiprocessingTrainer(self.args, self.model,
                                              self.criterion)
        if saved_state is not None:
            self.set_states(saved_state)
    self.reset()
def build_dict(opt):
    if not opt.get('dict_file'):
        print('Tried to build dictionary but `--dict-file` is not set. Set ' +
              'this param so the dictionary can be saved.')
        return
    print('[ setting up dictionary. ]')
    if os.path.isfile(opt['dict_file']):
        # Dictionary already built
        print('[ dictionary already built. ]')
        return
    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)
    ordered_opt = copy.deepcopy(opt)
    cnt = 0
    # we use the train set to build the dictionary
    ordered_opt['datatype'] = 'train:ordered:stream'
    ordered_opt['numthreads'] = 1
    ordered_opt['batchsize'] = 1
    ordered_opt['image_mode'] = 'none'
    if (ordered_opt['task'] == 'pytorch_teacher'
            and ordered_opt.get('pytorch_preprocess', False)):
        pytorch_buildteacher_task = ordered_opt.get('pytorch_buildteacher', '')
        if pytorch_buildteacher_task != '':
            ordered_opt['task'] = pytorch_buildteacher_task
    world_dict = create_task(ordered_opt, dictionary)
    # pass examples to dictionary
    while not world_dict.epoch_done():
        cnt += 1
        if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] > 0:
            print('Processed {} exs, moving on.'.format(opt['dict_maxexs']))
            # don't wait too long...
            break
        world_dict.parley()
    print('[ dictionary built. ]')
    dictionary.save(opt['dict_file'], sort=True)
def __init__(self, opt, shared=None):
    super().__init__(opt, shared)
    opt['cuda'] = not opt['no_cuda'] and torch.cuda.is_available()
    if opt['cuda']:
        print('[ Using CUDA ]')
        torch.cuda.set_device(opt['gpu'])

    if not shared:
        # don't enter this branch for shared (i.e. batch) instantiations
        self.dict = DictionaryAgent(opt)
        self.id = 'Seq2Seq'
        hsz = opt['hiddensize']
        self.EOS = self.dict.eos_token
        self.observation = {'text': self.EOS, 'episode_done': True}
        self.EOS_TENSOR = torch.LongTensor(self.dict.parse(self.EOS))
        self.hidden_size = hsz
        self.num_layers = opt['numlayers']
        self.learning_rate = opt['learningrate']
        self.use_cuda = opt.get('cuda', False)
        self.longest_label = 1

        self.criterion = nn.NLLLoss()
        self.lt = nn.Embedding(len(self.dict), hsz, padding_idx=0,
                               scale_grad_by_freq=True)
        self.encoder = nn.GRU(hsz, hsz, opt['numlayers'])
        self.decoder = nn.GRU(hsz, hsz, opt['numlayers'])
        self.d2o = nn.Linear(hsz, len(self.dict))
        self.dropout = nn.Dropout(opt['dropout'])
        self.softmax = nn.LogSoftmax()

        lr = opt['learningrate']
        self.optims = {
            'lt': optim.SGD(self.lt.parameters(), lr=lr),
            'encoder': optim.SGD(self.encoder.parameters(), lr=lr),
            'decoder': optim.SGD(self.decoder.parameters(), lr=lr),
            'd2o': optim.SGD(self.d2o.parameters(), lr=lr),
        }
        if self.use_cuda:
            self.cuda()
        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            print('Loading existing model parameters from ' +
                  opt['model_file'])
            self.load(opt['model_file'])

    self.episode_done = True
def __init__(self, opt, shared=None):
    """Set up model if shared params not set, otherwise no work to do."""
    super().__init__(opt, shared)
    opt = self.opt
    self.reset_metrics()
    self.id = 'Starspace'
    self.NULL_IDX = 0
    self.cands = torch.LongTensor(1, 1, 1)
    self.ys_cache = []
    self.ys_cache_sz = opt['cache_size']
    self.truncate = opt['truncate'] if opt['truncate'] > 0 else None
    self.history = {}
    self.debugMode = False
    if shared:
        self.threadindex = shared['threadindex']
        print("[ creating Starspace thread " + str(self.threadindex) + " ]")
        # set up shared properties
        self.dict = shared['dict']
        self.model = shared['model']  # Starspace(opt, len(self.dict))
    else:
        print("[ creating StarspaceAgent ]")
        # this is not a shared instance of this class, so do full init
        if opt['dict_file'] is None and opt.get('model_file'):
            # set default dict-file if not set
            opt['dict_file'] = opt['model_file'] + '.dict'
        # load dictionary and basic tokens & vectors
        self.dict = DictionaryAgent(opt)
        self.model = Starspace(opt, len(self.dict), self.dict)
        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            self.load(opt['model_file'])
        self.model.share_memory()

    # set up modules
    self.criterion = torch.nn.CosineEmbeddingLoss(margin=opt['margin'],
                                                  size_average=False)
    self.reset()
    self.fixedCands = False
    if self.opt.get('fixed-candidates-file'):
        self.fixedCands = load_cands(self.opt.get('fixed-candidates-file'))
class Seq2seqAgent(Agent):
    """Agent which takes an input sequence and produces an output sequence.

    This model supports encoding the input and decoding the output via one of
    several flavors of RNN. It then uses a linear layer (whose weights can
    be shared with the embedding layer) to convert RNN output states into
    output tokens. This model currently uses greedy decoding, selecting the
    highest probability token at each time step.

    For more information, see Sequence to Sequence Learning with Neural
    Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_.
    """

    OPTIM_OPTS = {
        'adadelta': optim.Adadelta,
        'adagrad': optim.Adagrad,
        'adam': optim.Adam,
        'adamax': optim.Adamax,
        'asgd': optim.ASGD,
        'lbfgs': optim.LBFGS,
        'rmsprop': optim.RMSprop,
        'rprop': optim.Rprop,
        'sgd': optim.SGD,
    }

    @staticmethod
    def dictionary_class():
        return DictionaryAgent

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        Seq2seqAgent.dictionary_class().add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Seq2Seq Arguments')
        agent.add_argument('-hs', '--hiddensize', type=int, default=128,
                           help='size of the hidden layers')
        agent.add_argument('-esz', '--embeddingsize', type=int, default=128,
                           help='size of the token embeddings')
        agent.add_argument('-nl', '--numlayers', type=int, default=2,
                           help='number of hidden layers')
        agent.add_argument('-lr', '--learningrate', type=float, default=0.005,
                           help='learning rate')
        agent.add_argument('-dr', '--dropout', type=float, default=0.1,
                           help='dropout rate')
        agent.add_argument('-clip', '--gradient-clip', type=float,
                           default=0.2,
                           help='gradient clipping using l2 norm')
        agent.add_argument('-bi', '--bidirectional', type='bool',
                           default=False,
                           help='whether to encode the context with a '
                                'bidirectional rnn')
        agent.add_argument('-att', '--attention', default='none',
                           choices=['none', 'concat', 'general', 'dot',
                                    'local'],
                           help='Choices: none, concat, general, local. '
                                'If set local, also set attention-length. '
                                'For more details see: '
                                'https://arxiv.org/pdf/1508.04025.pdf')
        agent.add_argument('-attl', '--attention-length', default=48,
                           type=int,
                           help='Length of local attention.')
        agent.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
        agent.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
        agent.add_argument('-rc', '--rank-candidates', type='bool',
                           default=False,
                           help='rank candidates if available. this is done '
                                'by computing the mean score per token for '
                                'each candidate and selecting the highest '
                                'scoring.')
        agent.add_argument('-tr', '--truncate', type=int, default=-1,
                           help='truncate input & output lengths to speed up '
                                'training (may reduce accuracy). This fixes '
                                'all input and output to have a maximum '
                                'length. This reduces the total amount of '
                                'padding in the batches.')
        agent.add_argument('-rnn', '--rnn-class', default='lstm',
                           choices=Seq2seq.RNN_OPTS.keys(),
                           help='Choose between different types of RNNs.')
        agent.add_argument('-dec', '--decoder', default='same',
                           choices=['same', 'shared'],
                           help='Choose between different decoder modules. '
                                'Default "same" uses same class as encoder, '
                                'while "shared" also uses the same weights. '
                                'Note that shared disables some encoder '
                                'options--in particular, bidirectionality.')
        agent.add_argument('-lt', '--lookuptable', default='all',
                           choices=['unique', 'enc_dec', 'dec_out', 'all'],
                           help='The encoder, decoder, and output modules '
                                'can share weights, or not. '
                                'Unique has independent embeddings for each. '
                                'Enc_dec shares the embedding for the '
                                'encoder and decoder. '
                                'Dec_out shares decoder embedding and output '
                                'weights. '
                                'All shares all three weights.')
        agent.add_argument('-opt', '--optimizer', default='adam',
                           choices=Seq2seqAgent.OPTIM_OPTS.keys(),
                           help='Choose between pytorch optimizers. '
                                'Any member of torch.optim is valid and will '
                                'be used with default params except learning '
                                'rate (as specified by -lr).')
        agent.add_argument('-emb', '--embedding-type', default='random',
                           choices=['random', 'glove', 'glove-fixed',
                                    'fasttext', 'fasttext-fixed'],
                           help='Choose between different strategies for '
                                'word embeddings. Default is random, but can '
                                'also preinitialize from Glove or Fasttext. '
                                'Preinitialized embeddings can also be fixed '
                                'so they are not updated during training.')
        agent.add_argument('-hist', '--history-length', default=100000,
                           type=int,
                           help='Number of past tokens to remember. '
                                'Default remembers 100000 tokens.')
        agent.add_argument('-histr', '--history-replies', default='none',
                           type=str, choices=['none', 'model', 'label'],
                           help='Keep replies in the history, or not.')

    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init
        # all instances may need some params
        self.truncate = opt['truncate'] if opt['truncate'] > 0 else None
        self.history = {}
        self.states = {}
        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()

        if shared:
            # set up shared properties
            self.dict = shared['dict']
            self.START_IDX = shared['START_IDX']
            self.END_IDX = shared['END_IDX']
            self.NULL_IDX = shared['NULL_IDX']
            # answers contains a batch_size list of the last answer produced
            self.answers = shared['answers']
            if 'model' in shared:
                # model is shared during hogwild
                self.model = shared['model']
                self.states = shared['states']
        else:
            # this is not a shared instance of this class, so do full init
            # answers contains a batch_size list of the last answer produced
            self.answers = [None] * opt['batchsize']

            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, self.states = self.load(opt['model_file'])
                # override model-specific options with stored ones
                opt = self.override_opt(new_opt)

            if opt['dict_file'] is None and opt.get('model_file'):
                # set default dict-file if not set
                opt['dict_file'] = opt['model_file'] + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'Seq2Seq'
            # we use START markers to start our output
            self.START_IDX = self.dict[self.dict.start_token]
            # we use END markers to end our output
            self.END_IDX = self.dict[self.dict.end_token]
            # get index of null token from dictionary (probably 0)
            self.NULL_IDX = self.dict[self.dict.null_token]

            self.model = Seq2seq(
                opt, len(self.dict),
                padding_idx=self.NULL_IDX,
                start_idx=self.START_IDX,
                end_idx=self.END_IDX,
                longest_label=self.states.get('longest_label', 1))

            if opt['embedding_type'] != 'random':
                # set up preinitialized embeddings
                try:
                    import torchtext.vocab as vocab
                except ModuleNotFoundError as ex:
                    print('Please install torchtext with '
                          '`pip install torchtext`')
                    raise ex
                if opt['embedding_type'].startswith('glove'):
                    init = 'glove'
                    embs = vocab.GloVe(name='840B', dim=300)
                elif opt['embedding_type'].startswith('fasttext'):
                    init = 'fasttext'
                    embs = vocab.FastText(language='en')
                else:
                    raise RuntimeError('embedding type not implemented')

                if opt['embeddingsize'] != 300:
                    rp = torch.Tensor(300, opt['embeddingsize']).normal_()
                    t = lambda x: torch.mm(x.unsqueeze(0), rp)
                else:
                    t = lambda x: x
                cnt = 0
                for w, i in self.dict.tok2ind.items():
                    if w in embs.stoi:
                        vec = t(embs.vectors[embs.stoi[w]])
                        self.model.decoder.lt.weight.data[i] = vec
                        cnt += 1
                        if opt['lookuptable'] in ['unique', 'dec_out']:
                            # also set encoder lt, since it's not shared
                            self.model.encoder.lt.weight.data[i] = vec
                print('Seq2seq: initialized embeddings for {} tokens from '
                      '{}.'.format(cnt, init))

            if self.states:
                # set loaded states if applicable
                self.model.load_state_dict(self.states['model'])

            if self.use_cuda:
                self.model.cuda()

        if hasattr(self, 'model'):
            # if model was built, do more setup
            self.clip = opt.get('gradient_clip', 0.2)
            self.rank = opt['rank_candidates']

            # set up tensors once
            self.xs = torch.LongTensor(1, 1)
            self.ys = torch.LongTensor(1, 1)
            if self.rank:
                self.cands = torch.LongTensor(1, 1, 1)

            # set up criteria
            self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX)

            if self.use_cuda:
                # push to cuda
                self.xs = self.xs.cuda(non_blocking=True)
                self.ys = self.ys.cuda(non_blocking=True)
                if self.rank:
                    self.cands = self.cands.cuda(non_blocking=True)
                self.criterion.cuda()

            # set up optimizer
            lr = opt['learningrate']
            optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']]
            kwargs = {'lr': lr}
            if opt['optimizer'] == 'sgd':
                kwargs['momentum'] = 0.95
                kwargs['nesterov'] = True

            if opt['embedding_type'].endswith('fixed'):
                print('Seq2seq: fixing embedding weights.')
                self.model.decoder.lt.weight.requires_grad = False
                self.model.encoder.lt.weight.requires_grad = False
                if opt['lookuptable'] in ['dec_out', 'all']:
                    self.model.decoder.e2s.weight.requires_grad = False
            self.optimizer = optim_class(
                [p for p in self.model.parameters() if p.requires_grad],
                **kwargs)
            if self.states:
                if self.states['optimizer_type'] != opt['optimizer']:
                    print('WARNING: not loading optim state since optim '
                          'class changed.')
                else:
                    self.optimizer.load_state_dict(self.states['optimizer'])

        self.reset()

    def override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overridden key.
        Only override args specific to the model.
        """
        model_args = {'hiddensize', 'embeddingsize', 'numlayers', 'optimizer',
                      'encoder', 'decoder', 'lookuptable', 'attention',
                      'attention_length'}
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v}]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def parse(self, text):
        """Convert string to token indices."""
        return self.dict.txt2vec(text)

    def v2t(self, vec):
        """Convert token indices to string of tokens."""
        if type(vec) == Variable:
            vec = vec.data
        new_vec = []
        for i in vec:
            if i == self.END_IDX:
                break
            elif i != self.START_IDX:
                new_vec.append(i)
        return self.dict.vec2txt(new_vec)

    def zero_grad(self):
        """Zero out optimizer."""
        self.optimizer.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        torch.nn.utils.clip_grad_norm(self.model.parameters(), self.clip)
        self.optimizer.step()

    def reset(self):
        """Reset observation and episode_done."""
        self.observation = None

    def share(self):
        """Share internal states between parent and child instances."""
        shared = super().share()
        shared['answers'] = self.answers
        shared['dict'] = self.dict
        shared['START_IDX'] = self.START_IDX
        shared['END_IDX'] = self.END_IDX
        shared['NULL_IDX'] = self.NULL_IDX
        if self.opt.get('numthreads', 1) > 1:
            shared['model'] = self.model
            self.model.share_memory()
            shared['states'] = self.states
        return shared

    def observe(self, observation):
        """Save observation for act.

        If multiple observations are from the same episode, concatenate them.
        """
        # shallow copy observation (deep copy can be expensive)
        obs = observation.copy()
        batch_idx = self.opt.get('batchindex', 0)
        if not obs.get('preprocessed', False):
            obs['text2vec'] = maintain_dialog_history(
                self.history, obs,
                reply=self.answers[batch_idx],
                historyLength=self.opt['history_length'],
                useReplies=self.opt['history_replies'],
                dict=self.dict,
                useStartEndIndices=False)
        else:
            obs['text2vec'] = deque(obs['text2vec'],
                                    self.opt['history_length'])
        self.observation = obs
        self.answers[batch_idx] = None
        return obs

    def predict(self, xs, ys=None, cands=None, valid_cands=None):
        """Produce a prediction from our model.

        Update the model using the targets if available, otherwise rank
        candidates as well if they are available and param is set.
        """
        is_training = ys is not None
        text_cand_inds, loss_dict = None, None
        if is_training:
            self.model.train()
            self.zero_grad()
            loss = 0
            predictions, scores, _ = self.model(xs, ys)
            loss += self.criterion(scores.view(-1, scores.size(-1)),
                                   ys.view(-1))
            loss.backward()
            self.update_params()
            loss_dict = {'loss': loss.mul(len(xs)).data[0]}
            loss_dict['ppl'] = (math.e**loss).mul(len(xs)).data[0]
        else:
            self.model.eval()
            predictions, scores, text_cand_inds = self.model(
                xs, ys, cands, valid_cands)

        return predictions, text_cand_inds, loss_dict

    def vectorize(self, observations):
        """Convert a list of observations into input & target tensors."""
        ys = None
        xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text(
            observations, self.dict, self.END_IDX, self.NULL_IDX, dq=True,
            eval_labels=False, truncate=self.truncate)
        if xs is None:
            return None, None, None, None, None, None
        if self.use_cuda:
            # copy to gpu
            self.xs.resize_(xs.size())
            self.xs.copy_(xs, non_blocking=True)
            xs = Variable(self.xs)
            if ys is not None:
                self.ys.resize_(ys.size())
                self.ys.copy_(ys, non_blocking=True)
                ys = Variable(self.ys)
        else:
            xs = Variable(xs)
            if ys is not None:
                ys = Variable(ys)

        # set up candidates
        cands = None
        valid_cands = None
        if ys is None and self.rank:
            # only do ranking when no targets available and ranking flag set
            parsed_cs = []
            valid_cands = []
            for i, v in enumerate(valid_inds):
                if 'label_candidates' in observations[v]:
                    # each candidate tuple is a pair of the parsed version
                    # and the original full string
                    cs = list(observations[v]['label_candidates'])
                    curr_dqs = [deque(maxlen=self.truncate) for _ in cs]
                    for dq, c in zip(curr_dqs, cs):
                        dq.extendleft(reversed(self.parse(c)))
                    parsed_cs.append(curr_dqs)
                    valid_cands.append((i, v, cs))
            if len(parsed_cs) > 0:
                # TODO: store lengths of cands separately, so don't have zero
                # padding for varying number of cands per example
                # found cands, pack them into tensor
                max_c_len = max(max(len(c) for c in cs) for cs in parsed_cs)
                max_c_cnt = max(len(cs) for cs in parsed_cs)
                for cs in parsed_cs:
                    for c in cs:
                        c += [self.NULL_IDX] * (max_c_len - len(c))
                    cs += [self.NULL_IDX] * (max_c_cnt - len(cs))
                cands = torch.LongTensor(parsed_cs)
                if self.use_cuda:
                    # copy to gpu
                    self.cands.resize_(cands.size())
                    self.cands.copy_(cands, non_blocking=True)
                    cands = Variable(self.cands)
                else:
                    cands = Variable(cands)

        return xs, ys, labels, valid_inds, cands, valid_cands

    def batch_act(self, observations):
        batchsize = len(observations)
        # initialize a table of replies with this agent's id
        batch_reply = [{'id': self.getID()} for _ in range(batchsize)]

        # convert the observations into batches of inputs and targets
        # valid_inds tells us the indices of all valid examples
        # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1]
        # since the other three elements had no 'text' field
        xs, ys, labels, valid_inds, cands, valid_cands = self.vectorize(
            observations)

        if xs is None:
            # no valid examples, just return empty responses
            return batch_reply

        # produce predictions, train on targets if available
        predictions, text_cand_inds, loss = self.predict(xs, ys, cands,
                                                         valid_cands)
        if loss is not None:
            if 'metrics' in batch_reply[0]:
                for k, v in loss.items():
                    batch_reply[0]['metrics'][k] = v
            else:
                batch_reply[0]['metrics'] = loss

        if ys is not None:
            report_freq = 0
        else:
            report_freq = 0.1
        PaddingUtils.map_predictions(
            predictions, valid_inds, batch_reply, observations, self.dict,
            self.END_IDX, report_freq=report_freq, labels=labels,
            answers=self.answers, ys=ys)

        if text_cand_inds is not None:
            text_cand_inds = text_cand_inds.cpu().data
            for i in range(len(valid_cands)):
                order = text_cand_inds[i]
                _, batch_idx, curr_cands = valid_cands[i]
                curr = batch_reply[batch_idx]
                curr['text_candidates'] = [curr_cands[idx] for idx in order
                                           if idx < len(curr_cands)]

        return batch_reply

    def act(self):
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def save(self, path=None):
        """Save model parameters if model_file is set."""
        path = self.opt.get('model_file', None) if path is None else path
        if path and hasattr(self, 'model'):
            model = {}
            model['model'] = self.model.state_dict()
            model['longest_label'] = self.model.longest_label
            model['optimizer'] = self.optimizer.state_dict()
            model['optimizer_type'] = self.opt['optimizer']
            model['opt'] = self.opt

            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            states = torch.load(read)
        return states['opt'], states
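# A minimal sketch of wiring Seq2seqAgent into a ParlAI world for a single
# training step, assuming the standard -t/--task and -bs/--batchsize flags
# and create_task(opt, agent) as used in build_dict above; the task name is
# a placeholder.
if __name__ == '__main__':
    from parlai.core.params import ParlaiParser
    from parlai.core.worlds import create_task

    parser = ParlaiParser()
    Seq2seqAgent.add_cmdline_args(parser)
    opt = parser.parse_args(['-t', 'babi:task10k:1', '-bs', '1'])
    agent = Seq2seqAgent(opt)
    world = create_task(opt, agent)
    world.parley()  # one observe/act exchange; loop this to train
    print(world.display())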
def add_cmdline_args(parser):
    DictionaryAgent.add_cmdline_args(parser)
    parser.add_argument('-lp', '--length_penalty', type=float, default=0.5,
                        help='length penalty for responses')
def __init__(self, opt, shared=None):
    """Set up model if shared params not set, otherwise no work to do."""
    super().__init__(opt, shared)
    opt = self.opt  # there is a deepcopy in the init
    self.states = {}
    # check for cuda
    self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
    self.batchsize = opt.get('batchsize', 1)
    self.use_person_tokens = opt.get('person_tokens', True)

    if shared:
        # set up shared properties
        self.dict = shared['dict']

        if 'model' in shared:
            # model is shared during hogwild
            self.model = shared['model']
            self.states = shared['states']

        # get NULL token and END token
        self.NULL_IDX = self.dict[self.dict.null_token]
        self.END_IDX = self.dict[self.dict.end_token]

        if self.use_person_tokens:
            # add person1 and person2 tokens
            self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
            self.dict.add_to_dict(self.dict.tokenize("PERSON2"))
    else:
        # this is not a shared instance of this class, so do full init
        if self.use_cuda:
            print('[ Using CUDA ]')
            torch.cuda.set_device(opt['gpu'])

        # check first for 'init_model' for loading model from file
        if opt.get('init_model') and os.path.isfile(opt['init_model']):
            init_model = opt['init_model']
        # next check for 'model_file'
        elif opt.get('model_file') and os.path.isfile(opt['model_file']):
            init_model = opt['model_file']
        else:
            init_model = None

        if init_model is not None:
            # load model parameters if available
            print('Loading existing model params from ' + init_model)
            new_opt, self.states = self.load(init_model)
            # override model-specific options with stored ones
            opt = self.override_opt(new_opt)

        if opt['dict_file'] is None:
            if init_model is not None and os.path.isfile(init_model + '.dict'):
                # check first to see if a dictionary exists
                opt['dict_file'] = init_model + '.dict'
            elif opt.get('model_file'):
                # otherwise, set default dict-file if it is not set
                opt['dict_file'] = opt['model_file'] + '.dict'

        # load dictionary and basic tokens & vectors
        self.dict = DictionaryAgent(opt)
        self.id = 'LanguageModel'

        # get NULL token and END token
        self.NULL_IDX = self.dict[self.dict.null_token]
        self.END_IDX = self.dict[self.dict.end_token]

        if self.use_person_tokens:
            # add person1 and person2 tokens
            self.dict.add_to_dict(self.dict.tokenize("PERSON1"))
            self.dict.add_to_dict(self.dict.tokenize("PERSON2"))

        # set model
        self.model = RNNModel(opt, len(self.dict))

        if self.states:
            # set loaded states if applicable
            self.model.load_state_dict(self.states['model'])

        if self.use_cuda:
            self.model.cuda()

    self.next_observe = []
    self.next_batch = []

    self.is_training = True

    if hasattr(self, 'model'):
        # if model was built, do more setup
        self.clip = opt.get('gradient_clip', 0.25)

        # set up criteria
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX)
        if self.use_cuda:
            # push to cuda
            self.criterion.cuda()

        # set up criterion for eval: we do not want to average over size
        self.eval_criterion = nn.CrossEntropyLoss(
            ignore_index=self.NULL_IDX, size_average=False)
        if self.use_cuda:
            # push to cuda
            self.eval_criterion.cuda()

        # init hidden state
        self.hidden = self.model.init_hidden(self.batchsize)

        # init tensor of end tokens
        self.ends = torch.LongTensor(
            [self.END_IDX for _ in range(self.batchsize)])
        if self.use_cuda:
            self.ends = self.ends.cuda()

        # set up model and learning rate scheduler parameters
        self.lr = opt['learningrate']
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr)
        self.best_val_loss = self.states.get('best_val_loss', None)
        self.lr_factor = opt['lr_factor']
        if self.lr_factor < 1.0:
            self.lr_patience = opt['lr_patience']
            self.lr_min = opt['lr_minimum']
            self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer, factor=self.lr_factor, verbose=True,
                patience=self.lr_patience, min_lr=self.lr_min)
            # initial step for scheduler if self.best_val_loss is initialized
            if self.best_val_loss is not None:
                self.scheduler.step(self.best_val_loss)
        else:
            self.scheduler = None

    self.reset()
def add_cmdline_args(argparser):
    """Add command-line arguments specifically for this agent."""
    DictionaryAgent.add_cmdline_args(argparser)
    agent = argparser.add_argument_group('Fairseq Arguments')
    agent.add_argument(
        '--max-positions', default=1024, type=int, metavar='N',
        help='max number of tokens in the sequence')
    agent.add_argument(
        '--seed', default=1, type=int, metavar='N',
        help='pseudo random number generator seed')
    agent.add_argument(
        '--lr', '--learning-rate', default=0.25, type=float, metavar='LR',
        help='initial learning rate')
    agent.add_argument(
        '--momentum', default=0.99, type=float, metavar='M',
        help='momentum factor')
    agent.add_argument(
        '--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
        help='weight decay')
    agent.add_argument(
        '--force-anneal', '--fa', default=0, type=int, metavar='N',
        help='force annealing at specified epoch')
    agent.add_argument(
        '--beam', default=5, type=int, metavar='N',
        help='beam size')
    agent.add_argument(
        '--no-early-stop', action='store_true',
        help=('continue searching even after finalizing k=beam '
              'hypotheses; this is more correct, but increases '
              'generation time by 50%%'))
    agent.add_argument(
        '--unnormalized', action='store_true',
        help='compare unnormalized hypothesis scores')
    agent.add_argument(
        '--lenpen', default=1, type=float,
        help='length penalty: <1.0 favors shorter, '
             '>1.0 favors longer sentences')
    agent.add_argument(
        '--clip-norm', default=25, type=float, metavar='NORM',
        help='clip threshold of gradients')
    agent.add_argument(
        '--arch', '-a', default='fconv', metavar='ARCH',
        choices=models.arch_model_map.keys(),
        help='model architecture ({})'.format(
            ', '.join(models.arch_model_map.keys())))
    agent.add_argument(
        '--encoder-embed-dim', type=int, metavar='N',
        help='encoder embedding dimension')
    agent.add_argument(
        '--encoder-layers', type=str, metavar='EXPR',
        help='encoder layers [(dim, kernel_size), ...]')
    agent.add_argument(
        '--decoder-embed-dim', type=int, metavar='N',
        help='decoder embedding dimension')
    agent.add_argument(
        '--decoder-layers', type=str, metavar='EXPR',
        help='decoder layers [(dim, kernel_size), ...]')
    agent.add_argument(
        '--decoder-out-embed-dim', type=int, metavar='N',
        help='decoder output embedding dimension')
    agent.add_argument(
        '--decoder-attention', type=str, metavar='EXPR',
        help='decoder attention [True, ...]')
    # These arguments have default values independent of the model:
    agent.add_argument(
        '--dropout', default=0.1, type=float, metavar='D',
        help='dropout probability')
    agent.add_argument(
        '--label-smoothing', default=0, type=float, metavar='D',
        help='epsilon for label smoothing, 0 means no label smoothing')
class ScoringNetAgent(Agent):
    """Agent which takes an input sequence and produces an output sequence.

    For more information, see Sequence to Sequence Learning with Neural
    Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_.
    """

    OPTIM_OPTS = {
        'adadelta': optim.Adadelta,
        'adagrad': optim.Adagrad,
        'adam': optim.Adam,
        'adamax': optim.Adamax,
        'asgd': optim.ASGD,
        'lbfgs': optim.LBFGS,
        'rmsprop': optim.RMSprop,
        'rprop': optim.Rprop,
        'sgd': optim.SGD,
    }

    ENC_OPTS = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM}

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        DictionaryAgent.add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Seq2Seq Arguments')
        agent.add_argument('-hs', '--hiddensize', type=int, default=128,
                           help='size of the hidden layers')
        agent.add_argument('-emb', '--embeddingsize', type=int, default=128,
                           help='size of the token embeddings')
        agent.add_argument('-nl', '--numlayers', type=int, default=2,
                           help='number of hidden layers')
        agent.add_argument('-lr', '--learning_rate', type=float, default=0.5,
                           help='learning rate')
        agent.add_argument('-wd', '--weight_decay', type=float, default=0,
                           help='weight decay')
        agent.add_argument('-dr', '--dropout', type=float, default=0.2,
                           help='dropout rate')
        agent.add_argument('-att', '--attention', default=False, type='bool',
                           help='if True, use attention')
        agent.add_argument('-attType', '--attn-type', default='general',
                           choices=['general', 'concat', 'dot'],
                           help='general=bilinear dot product, '
                                'concat=Bahdanau\'s implementation')
        agent.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
        agent.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
        agent.add_argument('-rc', '--rank-candidates', type='bool',
                           default=False,
                           help='rank candidates if available. this is done '
                                'by computing the mean score per token for '
                                'each candidate and selecting the highest '
                                'scoring.')
        agent.add_argument('-tr', '--truncate', type='bool', default=True,
                           help='truncate input & output lengths to speed up '
                                'training (may reduce accuracy). This fixes '
                                'all input and output to have a maximum '
                                'length and to be similar in length to one '
                                'another by throwing away extra tokens. This '
                                'reduces the total amount of padding in the '
                                'batches.')
        agent.add_argument('-enc', '--encoder', default='gru',
                           choices=ScoringNetAgent.ENC_OPTS.keys(),
                           help='Choose between different encoder modules.')
        agent.add_argument('-bi', '--bi-encoder', default=True, type='bool',
                           help='use a bidirectional encoder')
        agent.add_argument('-dec', '--decoder', default='same',
                           choices=['same', 'shared'] +
                                   list(ScoringNetAgent.ENC_OPTS.keys()),
                           help='Choose between different decoder modules. '
                                'Default "same" uses same class as encoder, '
                                'while "shared" also uses the same weights.')
        agent.add_argument('-opt', '--optimizer', default='sgd',
                           choices=ScoringNetAgent.OPTIM_OPTS.keys(),
                           help='Choose between pytorch optimizers. '
                                'Any member of torch.optim is valid and will '
                                'be used with default params except learning '
                                'rate (as specified by -lr).')
        agent.add_argument('-gradClip', '--grad-clip', type=float,
                           default=-1,
                           help='gradient clip, default = -1 (no clipping)')
        agent.add_argument('-epi', '--episode-concat', type='bool',
                           default=False,
                           help='If multiple observations are from the same '
                                'episode, concatenate them.')
        agent.add_argument('--beam_size', type=int, default=0,
                           help='Beam size for beam search (only for '
                                'generation mode); set 0 for greedy search')
        agent.add_argument('--max_seq_len', type=int, default=50,
                           help='The maximum sequence length, default = 50')
        agent.add_argument('-ptrmodel', '--ptr_model', default='',
                           help='The pretrained model directory')

    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        if not shared:
            # this is not a shared instance of this class, so do full
            # initialization. if shared is set, only set up shared members.

            # check for cuda
            self.use_cuda = (not opt.get('no_cuda')
                             and torch.cuda.is_available())
            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            # if opt.get('model_file') and os.path.isfile(opt['model_file']):
            #     # load model parameters if available
            #     print('Loading existing model params from ' +
            #           opt['model_file'])
            #     new_opt, self.states = self.load(opt['model_file'])
            #     # override options with stored ones
            #     opt = self.override_opt(new_opt)

            if opt.get('ptr_model') and os.path.isfile(opt['ptr_model']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['ptr_model'])
                # TODO: load what?
                new_opt, self.states = self.load(opt['ptr_model'])
                # override options with stored ones
                # opt = self.override_opt(new_opt)

            self.dict = DictionaryAgent(opt)
            self.id = 'ScoringNet'
            # we use START markers to start our output
            self.START = self.dict.start_token
            self.START_TENSOR = torch.LongTensor(self.dict.parse(self.START))
            # we use END markers to end our output
            self.END = self.dict.end_token
            self.END_TENSOR = torch.LongTensor(self.dict.parse(self.END))
            # get index of null token from dictionary (probably 0)
            self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0]

            # store important params directly
            hsz = opt['hiddensize']
            emb = opt['embeddingsize']
            self.hidden_size = hsz
            self.emb_size = emb
            self.num_layers = opt['numlayers']
            self.learning_rate = opt['learning_rate']
            self.rank = opt['rank_candidates']
            self.longest_label = 1
            self.truncate = opt['truncate']
            self.attention = opt['attention']

            # set up tensors
            if self.opt['bi_encoder']:
                self.zeros = torch.zeros(2 * self.num_layers, 1, hsz)
            else:
                self.zeros = torch.zeros(self.num_layers, 1, hsz)
            self.zeros_dec = torch.zeros(self.num_layers, 1, hsz)

            self.xs = torch.LongTensor(1, 1)
            self.ys = torch.LongTensor(1, 1)
            self.neg_ys = torch.LongTensor(1, 1)

            # set up modules
            # self.criterion = nn.NLLLoss(size_average=False, ignore_index=0)
            self.criterion = nn.BCELoss()

            # lookup table stores word embeddings
            self.lt = nn.Embedding(len(self.dict), emb,
                                   padding_idx=self.NULL_IDX)
                                   # scale_grad_by_freq=True)

            # encoder captures the input text
            enc_class = ScoringNetAgent.ENC_OPTS[opt['encoder']]
            self.encoder = enc_class(emb, hsz, opt['numlayers'],
                                     bidirectional=opt['bi_encoder'],
                                     dropout=opt['dropout'])

            # decoder produces our output states
            dec_isz = hsz
            if opt['bi_encoder']:
                dec_isz += hsz

            # linear layer helps us produce outputs from final decoder state
            self.h2o = nn.Linear(dec_isz, dec_isz, bias=False)

            # dropout on the linear layer helps us generalize
            self.dropout = nn.Dropout(opt['dropout'])

            self.use_attention = False
            self.attn = None
            # if attention is enabled, set up additional members
            if self.attention:
                self.use_attention = True
                self.att_type = opt['attn_type']
                input_size = hsz
                if opt['bi_encoder']:
                    input_size += hsz

                if self.att_type == 'concat':
                    self.attn = nn.Linear(input_size + hsz, 1, bias=False)
                elif self.att_type == 'dot':
                    assert not opt['bi_encoder']
                elif self.att_type == 'general':
                    self.attn = nn.Linear(hsz, input_size, bias=False)

            # set up optims for each module
            self.lr = opt['learning_rate']
            self.wd = opt['weight_decay'] != 0

            optim_class = ScoringNetAgent.OPTIM_OPTS[opt['optimizer']]
            self.optims = {
                'lt': optim_class(self.lt.parameters(), lr=self.lr),
                'encoder': optim_class(self.encoder.parameters(),
                                       lr=self.lr),
                'h2o': optim_class(self.h2o.parameters(), lr=self.lr,
                                   weight_decay=self.wd),
            }
            if self.attention and self.attn is not None:
                self.optims.update({
                    'attn': optim_class(self.attn.parameters(), lr=self.lr,
                                        weight_decay=self.wd)
                })

            if hasattr(self, 'states'):
                # set loaded states if applicable
                if opt.get('ptr_model'):
                    self.init_pretrain(self.states)
                else:
                    self.set_states(self.states)

            if self.use_cuda:
                self.cuda()

        self.loss = 0
        self.ndata = 0
        self.loss_valid = 0
        self.ndata_valid = 0

        if opt['beam_size'] > 0:
            self.beamsize = opt['beam_size']

        self.episode_concat = opt['episode_concat']
        self.training = True
        self.generating = False
        self.local_human = False

        self.max_seq_len = opt['max_seq_len']
        self.reset()

    def set_lrate(self, lr):
        self.lr = lr
        for key in self.optims:
            self.optims[key].param_groups[0]['lr'] = self.lr

    def override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overridden key.
        Only override args specific to the model.
        """
        model_args = {'hiddensize', 'embeddingsize', 'numlayers',
                      'optimizer', 'encoder'}
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v}]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def parse(self, text):
        """Convert string to token indices."""
        return self.dict.txt2vec(text)

    def v2t(self, vec):
        """Convert token indices to string of tokens."""
        return self.dict.vec2txt(vec)

    def cuda(self):
        """Push parameters to the GPU."""
        self.START_TENSOR = self.START_TENSOR.cuda(non_blocking=True)
        self.END_TENSOR = self.END_TENSOR.cuda(non_blocking=True)
        self.zeros = self.zeros.cuda(non_blocking=True)
        self.zeros_dec = self.zeros_dec.cuda(non_blocking=True)
        self.xs = self.xs.cuda(non_blocking=True)
        self.ys = self.ys.cuda(non_blocking=True)
        self.neg_ys = self.neg_ys.cuda(non_blocking=True)
        self.criterion.cuda()
        self.lt.cuda()
        self.encoder.cuda()
        self.h2o.cuda()
        self.dropout.cuda()
        if self.use_attention:
            self.attn.cuda()

    def hidden_to_idx(self, hidden, dropout=False):
        """Convert hidden state vectors into indices into the dictionary."""
        if hidden.size(0) > 1:
            raise RuntimeError('bad dimensions of tensor:', hidden)
        hidden = hidden.squeeze(0)
        if dropout:
            hidden = self.dropout(hidden)  # dropout over the last hidden
        scores = self.h2o(hidden)
        scores = F.log_softmax(scores)
        _max_score, idx = scores.max(1)
        return idx, scores

    def zero_grad(self):
        """Zero out optimizers."""
        for optimizer in self.optims.values():
            optimizer.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        for optimizer in self.optims.values():
            optimizer.step()

    def reset(self):
        """Reset observation and episode_done."""
        self.observation = None
        self.episode_done = True

    def preprocess(self, reply_text):
        # preprocess for opensub
        reply_text = reply_text.replace('\\n', '\n')  # TODO: pre-processing
        reply_text = reply_text.replace("'m", " 'm")
        reply_text = reply_text.replace("'ve", " 've")
        reply_text = reply_text.replace("'s", " 's")
        reply_text = reply_text.replace("'t", " 't")
        reply_text = reply_text.replace("'il", " 'il")
        reply_text = reply_text.replace("'d", " 'd")
        reply_text = reply_text.replace("'re", " 're")
        reply_text = reply_text.lower().strip()
        return reply_text

    def observe(self, observation):
        """Save observation for act.

        If multiple observations are from the same episode, concatenate them.
        """
        if self.local_human:
            observation = {}
            observation['id'] = self.getID()
            reply_text = input("Enter Your Message: ")
            reply_text = self.preprocess(reply_text)
            observation['episode_done'] = True  # TODO: for history
            observation['text'] = reply_text
            reply_text = input("Enter a label: ")
            observation['labels'] = self.preprocess(reply_text)
            reply_text = input("Enter a candidate: ")
            observation['cands'] = self.preprocess(reply_text)
        else:
            # shallow copy observation (deep copy can be expensive)
            observation = observation.copy()
            if not self.episode_done and self.episode_concat:
                # if the last example wasn't the end of an episode, then we
                # need to recall what was said in that example
                prev_dialogue = self.observation['text']
                # TODO: the data is concatenated!
                observation['text'] = (prev_dialogue + '\n' +
                                       observation['text'])
        self.observation = observation
        self.episode_done = observation['episode_done']
        return observation

    def _encode(self, xs, xlen, dropout=False, packed=True):
        """Call encoder and return output and hidden states."""
        batchsize = len(xs)

        # first encode context
        xes = self.lt(xs).transpose(0, 1)
        # if dropout:
        #     xes = self.dropout(xes)

        # initial hidden
        if self.zeros.size(1) != batchsize:
            if self.opt['bi_encoder']:
                self.zeros.resize_(2 * self.num_layers, batchsize,
                                   self.hidden_size).fill_(0)
            else:
                self.zeros.resize_(self.num_layers, batchsize,
                                   self.hidden_size).fill_(0)

        h0 = Variable(self.zeros.fill_(0))

        # forward
        if packed:
            xes = torch.nn.utils.rnn.pack_padded_sequence(xes, xlen)

        if type(self.encoder) == nn.LSTM:
            # note: we can put None instead of (h0, h0)
            encoder_output, _ = self.encoder(xes, (h0, h0))
        else:
            encoder_output, _ = self.encoder(xes, h0)

        if packed:
            encoder_output, _ = torch.nn.utils.rnn.pad_packed_sequence(
                encoder_output)

        encoder_output = encoder_output.transpose(0, 1)  # batch first

        # if self.use_attention:
        #     if encoder_output.size(1) > self.max_length:
        #         offset = encoder_output.size(1) - self.max_length
        #         encoder_output = encoder_output.narrow(1, offset,
        #                                                self.max_length)

        return encoder_output

    def _apply_attention(self, word_input, encoder_output, last_hidden, xs):
        """Apply attention to encoder hidden layer."""
        batch_size = encoder_output.size(0)
        enc_length = encoder_output.size(1)
        mask = Variable(xs.data.eq(0).eq(0).float())

        # encoder_output: B x T x 2H
        # last_hidden: B x H

        if self.att_type == 'concat':
            last_hidden = last_hidden.unsqueeze(1).expand(
                batch_size, encoder_output.size(1),
                self.hidden_size)  # B x T x H
            attn_weights = F.tanh(self.attn(
                torch.cat((encoder_output, last_hidden), 2)
                .view(batch_size * enc_length, -1))
                .view(batch_size, enc_length))
        elif self.att_type == 'dot':
            attn_weights = F.tanh(torch.bmm(
                encoder_output, last_hidden.unsqueeze(2)).squeeze())
        elif self.att_type == 'general':
            attn_weights = F.tanh(torch.bmm(
                encoder_output,
                self.attn(last_hidden).unsqueeze(2)).squeeze())

        # attn_weights = F.softmax(attn_weights.view(batch_size, enc_length))
        attn_weights = attn_weights.exp().mul(mask)
        denom = attn_weights.sum(1).unsqueeze(1).expand_as(attn_weights)
        attn_weights = attn_weights.div(denom)
        context = torch.bmm(attn_weights.unsqueeze(1),
                            encoder_output).squeeze(1)
        output = torch.cat((word_input, context.unsqueeze(0)), 2)
        return output

    def _get_context(self, batchsize, xlen_t, encoder_output):
        """Return initial hidden of decoder and encoder context
        (last_state)."""
        # the initial state of the decoder is the hidden (last states) of
        # the encoder --> put zero!
        if self.zeros_dec.size(1) != batchsize:
            self.zeros_dec.resize_(self.num_layers, batchsize,
                                   self.hidden_size).fill_(0)
        hidden = Variable(self.zeros_dec.fill_(0))

        last_state = None
        if not self.use_attention:
            last_state = torch.gather(
                encoder_output, 1,
                xlen_t.view(-1, 1, 1).expand(encoder_output.size(0), 1,
                                             encoder_output.size(2)))
            if self.opt['bi_encoder']:
                last_state = torch.cat(
                    (encoder_output[:, 0, self.hidden_size:],
                     last_state[:, 0, :self.hidden_size]), 1)

        return hidden, last_state

    def predict(self, xs, xlen, x_idx, ys, ylen, y_idx,
                nys=None, nylen=None, ny_idx=None):
        """Produce a prediction from our model.

        Update the model using the targets if available, otherwise rank
        candidates as well if they are available.
        """
        self._training(self.training)

        self.zero_grad()
        batchsize = len(xs)
        # text_cand_inds = None
        # target_exist = ys is not None

        xlen_t = Variable(torch.LongTensor(xlen) - 1)
        ylen_t = Variable(torch.LongTensor(ylen) - 1)
        if self.use_cuda:
            xlen_t = xlen_t.cuda()
            ylen_t = ylen_t.cuda()

        _, x_idx_t = torch.LongTensor(x_idx).sort(0)
        _, y_idx_t = torch.LongTensor(y_idx).sort(0)
        if self.use_cuda:
            x_idx_t = x_idx_t.cuda()
            y_idx_t = y_idx_t.cuda()

        if ny_idx is not None:
            nylen_t = Variable(torch.LongTensor(nylen) - 1)
            _, ny_idx_t = torch.LongTensor(ny_idx).sort(0)
            if self.use_cuda:
                nylen_t = nylen_t.cuda()
                ny_idx_t = ny_idx_t.cuda()

        # Encoding
        _, enc_x = self._get_context(
            batchsize, xlen_t,
            self._encode(xs, xlen, dropout=self.training))
        _, enc_y = self._get_context(
            batchsize, ylen_t,
            self._encode(ys, ylen, dropout=self.training))

        # Permute
        enc_x = enc_x[x_idx_t, :]
        enc_y = enc_y[y_idx_t, :]

        target = Variable(torch.Tensor(batchsize).zero_())

        if ny_idx is not None:
            _, enc_ny = self._get_context(
                batchsize, nylen_t,
                self._encode(nys, nylen, dropout=self.training))
            enc_ny = enc_ny[ny_idx_t, :]

            # make batch
            enc_x = torch.cat((enc_x, enc_x), 0)
            enc_y = torch.cat((enc_y, enc_ny), 0)
            target = torch.cat((target, target + 1), 0)

        if self.use_cuda:
            target = target.cuda()

        # calculate the score
        output = F.sigmoid(torch.bmm(
            enc_y.unsqueeze(1),
            self.h2o(enc_x).unsqueeze(1).transpose(1, 2)))

        # loss
        loss = self.criterion(output.squeeze(), target)

        if self.training:
            self.ndata += batchsize
            self.loss = loss
        else:
            self.ndata_valid += batchsize
            self.loss_valid += loss.data[0] * batchsize

        # list of output tokens for each example in the batch
        if self.training:
            self.loss.backward()
            if self.opt['grad_clip'] > 0:
                torch.nn.utils.clip_grad_norm(self.lt.parameters(),
                                              self.opt['grad_clip'])
                torch.nn.utils.clip_grad_norm(self.h2o.parameters(),
                                              self.opt['grad_clip'])
                torch.nn.utils.clip_grad_norm(self.encoder.parameters(),
                                              self.opt['grad_clip'])
            self.update_params()

        self.display_predict(xs[x_idx_t[0], :], ys[y_idx_t[0], :],
                             nys[ny_idx_t[0], :], target, output, batchsize,
                             freq=0.05)

        return self.loss, output.squeeze()

    def display_predict(self, xs, ys, nys, target, output, batchsize,
                        freq=0.01):
        if random.random() < freq:
            # sometimes output a prediction for debugging
            print('\n    input:',
                  self.dict.vec2txt(xs.data.cpu()).replace(
                      self.dict.null_token + ' ', ''),
                  '\n    positive:',
                  ' {0:.2e} '.format(output[0].data.cpu()[0, 0]),
                  self.dict.vec2txt(ys.data.cpu()).replace(
                      self.dict.null_token + ' ', ''),
                  '\n    negative:',
                  ' {0:.2e} '.format(output[batchsize].data.cpu()[0, 0]),
                  self.dict.vec2txt(nys.data.cpu()).replace(
                      self.dict.null_token + ' ', ''),
                  '\n')

    def txt2tensor(self, parsed, batchsize):
        max_x_len = max([len(x) for x in parsed])
        if self.truncate:
            # shrink xs to limit batch computation
            max_x_len = min(max_x_len, self.max_seq_len)
            parsed = [x[-max_x_len:] for x in parsed]

        # sorting for unpack in encoder
        parsed_x = sorted(enumerate(parsed), key=lambda p: len(p[1]),
                          reverse=True)
        x_idx, parsed_x = zip(*parsed_x)
        x_idx = list(x_idx)
        xlen = [len(x) for x in parsed_x]
        xs = torch.LongTensor(batchsize, max_x_len).fill_(0)
        for i, x in enumerate(parsed_x):
            for j, idx in enumerate(x):
                xs[i][j] = idx
        if self.use_cuda:
            # copy to gpu
            self.xs.resize_(xs.size())
            self.xs.copy_(xs, non_blocking=True)
            xs = Variable(self.xs)
        else:
            xs = Variable(xs)

        return xs, xlen, x_idx

    def batchify(self, observations):
        """Convert a list of observations into input & target tensors."""
        # valid examples
        exs = [ex for ex in observations if 'text' in ex]
        # the indices of the valid (non-empty) tensors
        valid_inds = [i for i, ex in enumerate(observations)
                      if 'text' in ex]

        # set up the input tensors
        batchsize = len(exs)

        # tokenize the text
        xs = None
        xlen = None
        x_idx = None
        if batchsize > 0:
            parsed = [self.dict.parse(self.START) + self.parse(ex['text']) +
                      self.dict.parse(self.END) for ex in exs]
            xs, xlen, x_idx = self.txt2tensor(parsed, batchsize)

        # set up the target tensors (positive examples)
        ys = None
        ylen = None
        y_idx = None
        if batchsize > 0 and (any(['labels' in ex for ex in exs]) or
                              any(['eval_labels' in ex for ex in exs])):
            # randomly select one of the labels to update on, if multiple
            # append END to each label
            if any(['labels' in ex for ex in exs]):
                labels = [self.START + ' ' +
                          random.choice(ex.get('labels', [''])) +
                          ' ' + self.END for ex in exs]
            else:
                labels = [self.START + ' ' +
                          random.choice(ex.get('eval_labels', [''])) +
                          ' ' + self.END for ex in exs]
            parsed_y = [self.parse(y) for y in labels]
            ys, ylen, y_idx = self.txt2tensor(parsed_y, batchsize)

        # set up candidates (negative samples, randomly selected!)
        neg_ys = None
        neg_ylen = None
        ny_idx = None
        if batchsize > 0:
            cands = None
            for i in range(len(exs)):
                if exs[i].get('label_candidates') is not None:
                    cands = list(exs[i]['label_candidates'])
                    break
            if cands is None:
                # TODO: the same index should not be selected
                if any(['labels' in ex for ex in exs]):
                    cands = [ex['labels'][0] for ex in exs]
                else:
                    cands = [ex['eval_labels'][0] for ex in exs]
            # randomly select one of the labels to update on, if multiple
            # append END to each label
            parsed_ny = [self.dict.parse(self.START) +
                         self.parse(random.choice(cands)) +
                         self.dict.parse(self.END) for ex in exs]
            neg_ys, neg_ylen, ny_idx = self.txt2tensor(parsed_ny, batchsize)

        return (xs, xlen, x_idx, ys, ylen, y_idx, valid_inds,
                neg_ys, neg_ylen, ny_idx)

    def batch_act(self, observations):
        batchsize = len(observations)
        # initialize a table of replies with this agent's id
        batch_reply = [{'id': self.getID()} for _ in range(batchsize)]

        # convert the observations into batches of inputs and targets
        # valid_inds tells us the indices of all valid examples
        # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1]
        # since the other three elements had no 'text' field
        (xs, xlen, x_idx, ys, ylen, y_idx, valid_inds,
         neg_ys, neg_ylen, ny_idx) = self.batchify(observations)

        if xs is None:
            # no valid examples, just return the empty responses we set up
            return batch_reply

        # separate: test code / train code
        loss = self.predict(xs, xlen, x_idx, ys, ylen, y_idx,
                            neg_ys, neg_ylen, ny_idx)

        return batch_reply

    def act(self):
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def act_scoring_test(self):
        # see ../../bot_code/CC_scoring.py
        x = self.observation['text']
        y = self.observation['labels']
        batchsize = len(x)
        parsed = [self.dict.parse(self.START) + self.parse(ex) +
                  self.dict.parse(self.END) for ex in x]
        xs, xlen, x_idx = self.txt2tensor(parsed, batchsize)
        labels = [self.dict.parse(self.START) + self.parse(ex) +
                  self.dict.parse(self.END) for ex in y]
        ys, ylen, y_idx = self.txt2tensor(labels, batchsize)
        loss, output = self.predict(xs, xlen, x_idx, ys, ylen, y_idx)
        return output.data

    def save(self, path=None):
        path = self.opt.get('model_file', None) if path is None else path

        if path and hasattr(self, 'lt'):
            model = {}
            model['lt'] = self.lt.state_dict()
            model['encoder'] = self.encoder.state_dict()
            model['h2o'] = self.h2o.state_dict()
            if self.use_attention:
                model['attn'] = self.attn.state_dict()
            model['optims'] = {k: v.state_dict()
                               for k, v in self.optims.items()}
            model['longest_label'] = self.longest_label
            model['opt'] = self.opt

            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            model = torch.load(read)
        return model['opt'], model

    def set_states(self, states):
        """Set the state dicts of the modules from saved states."""
        self.lt.load_state_dict(states['lt'])
        self.encoder.load_state_dict(states['encoder'])
        # self.h2o.load_state_dict(states['h2o'])
        if self.use_attention:
            self.attn.load_state_dict(states['attn'])
        for k, v in states['optims'].items():
            self.optims[k].load_state_dict(v)
        self.longest_label = states['longest_label']

    def init_pretrain(self, states):
        """Set the state dicts of the modules from saved states."""
        self.lt.load_state_dict(states['lt'])
        self.encoder.load_state_dict(states['encoder'])
        # self.h2o.load_state_dict(states['h2o'])
        # if self.use_attention:
        #     self.attn.load_state_dict(states['attn'])
        # for k, v in states['optims'].items():
        #     self.optims[k].load_state_dict(v)
        # self.longest_label = states['longest_label']

    def report(self):
        m = {}
        if not self.generating:
            if self.training:
                m['loss'] = self.loss.data[0]
                m['ndata'] = self.ndata
            else:
                m['loss'] = self.loss_valid / self.ndata_valid
                m['ndata'] = self.ndata_valid

            m['lr'] = self.lr
            self.print_weight_state()

        return m

    def reset_valid_report(self):
        self.ndata_valid = 0
        self.loss_valid = 0

    def print_weight_state(self):
        self._print_grad_weight(getattr(self, 'lt').weight, 'lookup')
        for module in {'encoder'}:
            layer = getattr(self, module)
            for weights in layer._all_weights:
                for weight_name in weights:
                    self._print_grad_weight(getattr(layer, weight_name),
                                            module + ' ' + weight_name)
        self._print_grad_weight(getattr(self, 'h2o').weight, 'h2o')
        if self.use_attention:
            self._print_grad_weight(getattr(self, 'attn').weight, 'attn')

    def _print_grad_weight(self, weight, module_name):
        if weight.dim()
== 2: nparam = weight.size(0) * weight.size(1) norm_w = weight.norm(2).pow(2) norm_dw = weight.grad.norm(2).pow(2) print('{:30}'.format(module_name) + ' {:5} x{:5}'.format(weight.size(0), weight.size(1)) + ' : w {0:.2e} | '.format((norm_w / nparam).sqrt().data[0]) + 'dw {0:.2e}'.format((norm_dw / nparam).sqrt().data[0])) def _training(self, training=True): for module in {'encoder', 'lt', 'h2o', 'attn'}: layer = getattr(self, module) if layer is not None: layer.training = training
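# --- Illustrative sketch, not part of the original agent ---
# The manual normalization in `_apply_attention` above (exponentiate the
# scores, zero out padded positions with a mask, divide by the row sum) is a
# softmax restricted to real tokens. A minimal standalone version of that
# idea; tensor values here are made up:
import torch

def masked_softmax(scores, mask):
    """Softmax over the last dim that gives zero weight where mask == 0."""
    weights = scores.exp() * mask            # padding contributes nothing
    return weights / weights.sum(dim=-1, keepdim=True)

scores = torch.tensor([[0.5, 1.2, -0.3, 0.0]])
mask = torch.tensor([[1.0, 1.0, 1.0, 0.0]])  # last position is padding
print(masked_softmax(scores, mask))          # rows sum to 1 over real tokens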
class LanguageModelAgent(Agent): """ Agent which trains an RNN on a language modeling task. It is adapted from the language model featured in Pytorch's examples repo here: <https://github.com/pytorch/examples/tree/master/word_language_model>. """ @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" argparser.set_defaults(batch_sort=False) agent = argparser.add_argument_group('Language Model Arguments') agent.add_argument( '--init-model', type=str, default=None, help='load dict/features/weights/opts from this file') agent.add_argument('-hs', '--hiddensize', type=int, default=200, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=200, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-dr', '--dropout', type=float, default=0.2, help='dropout rate') agent.add_argument('-clip', '--gradient-clip', type=float, default=0.25, help='gradient clipping') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument( '-rnn', '--rnn-class', default='LSTM', help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)') agent.add_argument('-sl', '--seq-len', type=int, default=35, help='sequence length') agent.add_argument('-tied', '--emb-tied', action='store_true', help='tie the word embedding and softmax weights') agent.add_argument('-seed', '--random-seed', type=int, default=1111, help='random seed') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-tr', '--truncate-pred', type=int, default=50, help='truncate predictions') agent.add_argument('-rf', '--report-freq', type=float, default=0.1, help='report frequency of prediction during eval') agent.add_argument('-pt', '--person-tokens', type='bool', default=True, help='append person1 and person2 tokens to text') # learning rate parameters agent.add_argument('-lr', '--learningrate', type=float, default=20, help='initial learning rate') agent.add_argument( '-lrf', '--lr-factor', type=float, default=1.0, help='mutliply learning rate by this factor when the \ validation loss does not decrease') agent.add_argument('-lrp', '--lr-patience', type=int, default=10, help='wait before decreasing learning rate') agent.add_argument('-lrm', '--lr-minimum', type=float, default=0.1, help='minimum learning rate') agent.add_argument( '-sm', '--sampling-mode', type='bool', default=False, help='sample when generating tokens instead of taking \ the max and do not produce UNK token (when bs=1)') LanguageModelAgent.dictionary_class().add_cmdline_args(argparser) return agent def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init self.metrics = { 'loss': 0, 'num_tokens': 0, 'lmloss': 0, 'lm_num_tokens': 0 } self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() self.batchsize = opt.get('batchsize', 1) self.use_person_tokens = opt.get('person_tokens', True) self.sampling_mode = opt.get('sampling_mode', False) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] 
self.metrics = shared['metrics'] # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] if self.use_person_tokens: # add person1 and person2 tokens self.dict.add_to_dict(self.dict.tokenize("PERSON1")) self.dict.add_to_dict(self.dict.tokenize("PERSON2")) else: # this is not a shared instance of this class, so do full init if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) init_model = None # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file', this would override init_model if opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] # for backwards compatibility: will only be called for older models # for which .opt file does not exist if (init_model is not None and not os.path.isfile(init_model + '.opt')): new_opt = self.load_opt(init_model) # load model parameters if available print('[ Setting opt from {} ]'.format(init_model)) # since .opt file does not exist, save one for future use print("Saving opt file at:", init_model + ".opt") with open(init_model + ".opt", 'wb') as handle: pickle.dump(new_opt, handle, protocol=pickle.HIGHEST_PROTOCOL) opt = self.override_opt(new_opt) if ((init_model is not None and os.path.isfile(init_model + '.dict')) or opt['dict_file'] is None): opt['dict_file'] = init_model + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'LanguageModel' # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] if self.use_person_tokens: # add person1 and person2 tokens self.dict.add_to_dict(self.dict.tokenize("PERSON1")) self.dict.add_to_dict(self.dict.tokenize("PERSON2")) # set model self.model = RNNModel(opt, len(self.dict)) if init_model is not None: self.load(init_model) if self.use_cuda: self.model.cuda() self.next_observe = [] self.next_batch = [] self.is_training = True if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt.get('gradient_clip', 0.25) # set up criteria self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: # push to cuda self.criterion.cuda() # init hidden state self.hidden = self.model.init_hidden(self.batchsize) # init tensor of end tokens self.ends = torch.LongTensor( [self.END_IDX for _ in range(self.batchsize)]) if self.use_cuda: self.ends = self.ends.cuda() # set up model and learning rate scheduler parameters self.lr = opt['learningrate'] self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr) self.best_val_loss = self.states.get('best_val_loss', None) self.lr_factor = opt['lr_factor'] if self.lr_factor < 1.0: self.lr_patience = opt['lr_patience'] self.lr_min = opt['lr_minimum'] self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, factor=self.lr_factor, verbose=True, patience=self.lr_patience, min_lr=self.lr_min) # initial step for scheduler if self.best_val_loss is initialized if self.best_val_loss is not None: self.scheduler.step(self.best_val_loss) else: self.scheduler = None self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overriden key. Only override args specific to the model. 
""" model_args = { 'hiddensize', 'embeddingsize', 'numlayers', 'dropout', 'seq_len', 'emb_tied', 'truncate_pred', 'report_freq', 'person_tokens', 'learningrate' } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v}]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def zero_grad(self): """Zero out optimizer.""" self.optimizer.zero_grad() def update_params(self): """Do one optimization step.""" torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip) self.optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.reset_metrics() def reset_metrics(self): self.metrics.clear() self.metrics['loss'] = 0 self.metrics['lmloss'] = 0 self.metrics['num_tokens'] = 0 self.metrics['lm_num_tokens'] = 0 def report(self): m = {} if self.metrics['num_tokens'] > 0: m['loss'] = self.metrics['loss'] / self.metrics['num_tokens'] m['ppl'] = math.exp(m['loss']) if self.metrics['lm_num_tokens'] > 0: m['lmloss'] = self.metrics['lmloss'] / self.metrics['lm_num_tokens'] m['lmppl'] = math.exp(m['lmloss']) for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats m[k] = round_sigfigs(v, 4) return m def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['opt'] = self.opt shared['dict'] = self.dict shared['NULL_IDX'] = self.NULL_IDX shared['END_IDX'] = self.END_IDX shared['metrics'] = self.metrics shared['model'] = self.model self.model.share_memory() shared['states'] = { # only need to pass optimizer states 'optimizer': self.optimizer.state_dict(), } return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. 
""" #shallow copy observation (deep copy can be expensive) obs = observation.copy() seq_len = self.opt['seq_len'] is_training = True if 'labels' not in obs: is_training = False if is_training: if 'text' in obs: if self.use_person_tokens: obs['text'] = 'PERSON1 ' + obs['text'] vec = self.parse(obs['text']) vec.append(self.END_IDX) self.next_observe += vec if 'labels' in obs: if self.use_person_tokens: labels = [ 'PERSON2 ' + label for label in obs['labels'] if label != '' ] obs['labels'] = tuple(labels) vec = self.parse(obs['labels'][0]) vec.append(self.END_IDX) self.next_observe += vec if len(self.next_observe) < (seq_len + 1): # not enough to return to make a batch # we handle this case in vectorize # labels indicates that we are training self.observation = {'labels': ''} return self.observation else: vecs_to_return = [] total = len(self.next_observe) // (seq_len + 1) for _ in range(total): observe = self.next_observe[:(seq_len + 1)] self.next_observe = self.next_observe[(seq_len + 1):] vecs_to_return.append(observe) dict_to_return = { 'text': '', 'labels': '', 'text2vec': vecs_to_return } self.observation = dict_to_return return dict_to_return else: if 'text' in obs: if self.use_person_tokens: obs['text'] = 'PERSON1 ' + obs['text'] if 'eval_labels' in obs: if self.use_person_tokens: eval_labels = [ 'PERSON2 ' + label for label in obs['eval_labels'] if label != '' ] obs['eval_labels'] = tuple(eval_labels) self.observation = obs return obs def repackage_hidden(self, h): """Wraps hidden states in new Variables, to detach them from their history.""" if isinstance(h, Variable): return Variable(h.data) else: return tuple(self.repackage_hidden(v) for v in h) def get_target_loss(self, data, hidden, targets): """Calculates the loss with respect to the targets, token by token, where each output token is conditioned on either the input or the previous target token. """ loss = 0.0 bsz = data.size(0) # during interactive mode, when no targets exist, we return 0 if targets is None: return loss # feed in inputs without end token output, hidden = self.model(data.transpose(0, 1), hidden) self.hidden = self.repackage_hidden(hidden) # feed in end tokens output, hidden = self.model(Variable(self.ends[:bsz].view(1, bsz)), self.hidden) self.hidden = self.repackage_hidden(hidden) output_flat = output.view(-1, len(self.dict)) loss += self.criterion(output_flat, targets.select(1, 0).view(-1)).data for i in range(1, targets.size(1)): output, hidden = self.model(targets.select(1, i - 1).view(1, bsz), self.hidden, no_pack=True) self.hidden = self.repackage_hidden(hidden) output_flat = output.view(-1, len(self.dict)) loss += self.criterion(output_flat, targets.select(1, i).view(-1)).data return loss def get_predictions(self, data): """Generates predictions word by word until we either reach the end token or some max length (opt['truncate_pred']). 
""" token_list = [] bsz = data.size(0) done = [False for _ in range(bsz)] total_done = 0 hidden = self.model.init_hidden(bsz) i = 0 while total_done < bsz and i <= self.opt['truncate_pred']: if i == 0: # feed in input without end tokens output, hidden = self.model(data.transpose(0, 1), hidden) hidden = self.repackage_hidden(hidden) # feed in end tokens output, hidden = self.model( Variable(self.ends[:bsz].view(1, bsz)), hidden) else: output, hidden = self.model(Variable(word_idx.view(1, bsz)), hidden, no_pack=True) hidden = self.repackage_hidden(hidden) word_weights = output.squeeze().data.exp() if bsz > 1: _, word_idx = torch.max(word_weights, 1) else: if self.sampling_mode: unk_idx = self.dict[self.dict.unk_token] # make word_weights have smaller norm so that calculated # norm does not blow up word_weights = word_weights.div(1e10) # make word_weights have L2 norm 1 ww_norm = torch.norm(word_weights, p=2) word_weights = word_weights.div(ww_norm) # square distribution word_weights = torch.mul(word_weights, word_weights) # sample distribution word_idx = torch.multinomial(word_weights, 1) # do not produce UNK token while word_idx == unk_idx: word_idx = torch.multinomial(word_weights, 1) else: _, word_idx = torch.max(word_weights, 0) # mark end indices for items in batch word_idx = word_idx.view(-1) for k in range(word_idx.size(0)): if not done[k]: if int(word_idx[k]) == self.END_IDX: done[k] = True total_done += 1 token_list.append(word_idx.view(bsz, 1)) i += 1 return torch.cat(token_list, 1) def predict(self, data, hidden, targets=None, is_training=True, y_lens=None): """Produce a prediction from our model.""" output = None predictions = None if is_training: self.model.train() self.zero_grad() output, hidden = self.model(data, hidden) loss = self.criterion(output.view(-1, len(self.dict)), targets.view(-1)) # save loss to metrics target_tokens = targets.ne(self.NULL_IDX).float().sum().item() self.metrics['lmloss'] += loss.double().item() self.metrics['lm_num_tokens'] += target_tokens # average loss per token loss /= target_tokens loss.backward(retain_graph=True) self.update_params() else: self.model.eval() predictions = self.get_predictions(data) bsz = data.size(0) if bsz != self.batchsize: self.hidden = self.model.init_hidden(bsz) if targets is not None: loss = self.get_target_loss(data, self.hidden, targets) self.metrics['loss'] += loss self.metrics['num_tokens'] += sum(y_lens) return output, hidden, predictions def vectorize(self, observations, seq_len, is_training): """Convert a list of observations into input & target tensors.""" labels = None valid_inds = None y_lens = None if is_training: for obs in observations: if obs: if 'text2vec' in obs: self.next_batch += obs['text2vec'] if len(self.next_batch) <= self.batchsize: return None, None, None, None, None else: data_list = [] targets_list = [] # total is the number of batches total = len(self.next_batch) // self.batchsize for i in range(total): batch = self.next_batch[:self.batchsize] self.next_batch = self.next_batch[self.batchsize:] source = torch.LongTensor(batch).t().contiguous() data = Variable(source[:seq_len]) targets = Variable(source[1:]) if self.use_cuda: data = data.cuda() targets = targets.cuda() data_list.append(data) targets_list.append(targets) else: # here we get valid examples and pad them with zeros xs, ys, labels, valid_inds, _, y_lens = PaddingUtils.pad_text( observations, self.dict, end_idx=self.END_IDX, null_idx=self.NULL_IDX) if self.use_cuda: if xs is not None: xs = Variable(torch.LongTensor(xs)).cuda() if ys is 
not None: ys = Variable(torch.LongTensor(ys)).cuda() else: if xs is not None: xs = Variable(torch.LongTensor(xs)) if ys is not None: ys = Variable(torch.LongTensor(ys)) data_list = [xs] targets_list = [ys] return data_list, targets_list, labels, valid_inds, y_lens def batch_act(self, observations): batch_reply = [{'id': self.getID()} for _ in range(len(observations))] if any(['labels' in obs for obs in observations]): # if we are starting a new training epoch, reinitialize hidden if self.is_training == False: self.hidden = self.model.init_hidden(self.batchsize) self.is_training = True data_list, targets_list, _, _, y_lens = self.vectorize( observations, self.opt['seq_len'], self.is_training) else: # if we just finished training, reinitialize hidden if self.is_training == True: self.hidden = self.model.init_hidden(self.batchsize) self.is_training = False data_list, targets_list, labels, valid_inds, y_lens = self.vectorize( observations, self.opt['seq_len'], self.is_training) if data_list is None: # not enough data to batch act yet, return empty responses return batch_reply batch_reply = [] # during evaluation, len(data_list) is always 1 # during training, len(dat_list) >= 0: vectorize returns a list containing all batches available at the time it is called for i in range(len(data_list)): temp_dicts = [{ 'id': self.getID() } for _ in range(len(observations))] # ignore case when we do not return any valid indices if data_list[i] is not None: output, hidden, predictions = self.predict( data_list[i], self.hidden, targets_list[i], self.is_training, y_lens) self.hidden = self.repackage_hidden(hidden) if predictions is not None: # map predictions back to the right order PaddingUtils.map_predictions( predictions.cpu(), valid_inds, temp_dicts, observations, self.dict, self.END_IDX, report_freq=self.opt['report_freq']) batch_reply += temp_dicts # for prediction metrics computations, we get rid of PERSON1 and PERSON2 tokens if not self.is_training: for reply in batch_reply: if 'text' in reply: reply['text'] = reply['text'].replace('PERSON1 ', '') reply['text'] = reply['text'].replace('PERSON2 ', '') return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): model = {} model['model'] = self.model.state_dict() model['opt'] = self.opt model['best_val_loss'] = self.best_val_loss with open(path, 'wb') as write: torch.save(model, write) # save opt file with open(path + ".opt", 'wb') as handle: pickle.dump(self.opt, handle, protocol=pickle.HIGHEST_PROTOCOL) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def receive_metrics(self, metrics_dict): if 'loss' in metrics_dict and self.scheduler is not None: self.scheduler.step(metrics_dict['loss']) def load_opt(self, path): """Return opt, states.""" states = torch.load(path, map_location=lambda cpu, _: cpu) return states['opt'] def load(self, path): """Load model states.""" if os.path.isfile(path): # load model parameters if available print('[ Loading existing model params from {} ]'.format(path)) self.states = torch.load(path, map_location=lambda cpu, _: cpu) self.model.load_state_dict(self.states['model'])
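# --- Illustrative sketch, not part of the original agent ---
# `report()` above turns the accumulated token-level cross-entropy (summed,
# since the criterion is built with size_average=False) into perplexity via
# exp(total_loss / num_tokens). The bookkeeping in isolation, with made-up
# numbers:
import math

metrics = {'loss': 412.7, 'num_tokens': 100}  # summed NLL over 100 tokens
avg_nll = metrics['loss'] / metrics['num_tokens']
ppl = math.exp(avg_nll)
print(round(ppl, 2))  # ~61.98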
    def __init__(self, opt, shared=None):
        # pass `shared` through so shared instances are set up correctly
        super().__init__(opt, shared)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt
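# --- Hypothetical sketch, not the agent's actual scoring function ---
# `length_penalty` above is used when the IR baseline ranks candidate
# replies; the real scoring lives elsewhere. This made-up helper only shows
# how such a penalty discounts longer candidates:
def penalized_score(raw_score, candidate_tokens, length_penalty):
    # illustrative: damp the raw match score by candidate length
    return raw_score / (len(candidate_tokens) ** length_penalty)

print(penalized_score(3.0, 'hi there'.split(), 0.5))              # ~2.12
print(penalized_score(3.0, 'hi there how are you'.split(), 0.5))  # ~1.34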
class KvmemnnAgent(Agent): """ Simple implementation of the memnn algorithm with 1 hop. """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, # type: ignore 'adagrad': optim.Adagrad, # type: ignore 'adam': optim.Adam, 'adamax': optim.Adamax, # type: ignore 'asgd': optim.ASGD, # type: ignore 'lbfgs': optim.LBFGS, # type: ignore 'rmsprop': optim.RMSprop, # type: ignore 'rprop': optim.Rprop, # type: ignore 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """ Add command-line arguments specifically for this agent. """ KvmemnnAgent.dictionary_class().add_cmdline_args(argparser) agent = argparser.add_argument_group('Kvmemnn Arguments') agent.add_argument('--hops', type=int, default=1, help='num hops') agent.add_argument('--lins', type=int, default=0, help='num lins projecting after hops') agent.add_argument( '-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings', ) agent.add_argument( '-enorm', '--embeddingnorm', type=float, default=10, help='max norm of word embeddings', ) agent.add_argument( '-shareEmb', '--share-embeddings', type='bool', default=True, help='whether LHS and RHS share embeddings', ) agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-margin', '--margin', type=float, default=0.3, help='margin') agent.add_argument('-loss', '--loss', default='cosine', choices={'cosine', 'nll'}) agent.add_argument( '-opt', '--optimizer', default='sgd', choices=KvmemnnAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).', ) agent.add_argument( '-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length.', ) agent.add_argument( '-k', '--neg-samples', type=int, default=10, help='number k of negative samples per example', ) agent.add_argument('--parrot-neg', type=int, default=0, help='include query as a negative') agent.add_argument('--take-next-utt', type='bool', default=False, help='take next utt') agent.add_argument( '--twohop-range', type=int, default=100, help='2 hop range constraint for num rescored utterances', ) agent.add_argument( '--twohop-blend', type=float, default=0, help='2 hop blend in the first hop scores if > 0', ) agent.add_argument( '--kvmemnn-debug', type='bool', default=False, help='print debug information', ) agent.add_argument( '--tfidf', type='bool', default=False, help='Use frequency based normalization for embeddings.', ) agent.add_argument( '-cs', '--cache-size', type=int, default=1000, help='size of negative sample cache to draw from', ) agent.add_argument( '-hist', '--history-length', default=100, type=int, help='Number of past tokens to remember. ', ) agent.add_argument( '-histr', '--history-replies', default='label', type=str, choices=['none', 'model', 'label'], help='Keep replies in the history, or not.', ) agent.add_argument('--interactive-mode', default=False, type='bool', choices=[True, False]) agent.add_argument( '--loadcands', type='bool', default=True, help='Load candidates to rank from .candspair files, or not.', ) def __init__(self, opt, shared=None): """ Set up model if shared params not set, otherwise no work to do. 
""" super().__init__(opt, shared) opt = self.opt if opt.get('batchsize', 1) > 1: raise RuntimeError('Kvmemnn model does not support batchsize > 1, ' 'try training with numthreads > 1 instead.') self.reset_metrics() # all instances needs truncate param self.id = 'Kvmemnn' self.NULL_IDX = 0 self.start2 = 99 # set up tensors once self.cands = torch.LongTensor(1, 1, 1) self.ys_cache = [] self.ys_cache_sz = opt['cache_size'] self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.history = {} if shared: torch.set_num_threads(1) if 'threadindex' in shared: self.threadindex = shared['threadindex'] else: self.threadindex = 1 # set up shared properties self.dict = shared['dict'] # answers contains a batch_size list of the last answer produced self.model = shared['model'] # Kvmemnn(opt, len(self.dict)) if 'fixedX' in shared: self.fixedX = shared['fixedX'] self.fixedCands = shared['fixedCands'] self.fixedCands_txt = shared['fixedCands_txt'] self.fixedCands2 = shared['fixedCands2'] self.fixedCands_txt2 = shared['fixedCands_txt2'] else: print("[ creating KvmemnnAgent ]") # this is not a shared instance of this class, so do full init self.threadindex = -1 torch.set_num_threads(1) if (opt['dict_file'] is None and opt.get('model_file') ) or os.path.isfile(opt['model_file'] + '.dict'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) if 'loss' not in opt: opt['loss'] = 'cosine' self.model = Kvmemnn(opt, len(self.dict), self.dict) if opt.get('model_file') and os.path.isfile(opt['model_file']): self.load(opt['model_file']) self.model.share_memory() self.fixedCands = False self.fixedX = None path = opt['model_file'] + '.candspair' if os.path.isfile(path) and opt.get('loadcands') is not False: print("[loading candidates: " + path + "*]") fc = load_cands(path) fcs = [] for c in fc: fcs.append( Variable(torch.LongTensor(self.parse(c)).unsqueeze(0))) self.fixedCands = fcs self.fixedCands_txt = fc fc2 = load_cands(path + "2") fcs2 = [] for c2 in fc2: fcs2.append( Variable( torch.LongTensor(self.parse(c2)).unsqueeze(0))) self.fixedCands2 = fcs2 self.fixedCands_txt2 = fc2 print("[caching..]") xsq = Variable(torch.LongTensor([self.parse('nothing')])) xe, ye = self.model(xsq, [], None, self.fixedCands) self.fixedX = ye print("=init done=") if self.opt['loss'] == 'cosine': self.criterion = torch.nn.CosineEmbeddingLoss(margin=opt['margin'], size_average=False) elif self.opt['loss'] == 'nll': self.criterion = nn.CrossEntropyLoss(ignore_index=-100) else: raise RuntimeError('unspecified loss') # self.criterion = torch.nn.MultiMarginLoss(p=1, margin=0.1) self.reset() # can be used to look at embeddings: # self.dict_neighbors('coffee') self.take_next_utt = True self.cands_done = [] if 'interactive_mode' in opt: self.interactiveMode = self.opt['interactive_mode'] else: self.interactiveMode = False if self.interactiveMode: print("[ Interactive mode ]") def override_opt(self, new_opt): """ Set overridable opts from loaded opt file. Print out each added key and each overriden key. Only override args specific to the model. 
""" model_args = { 'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'encoder', 'decoder', 'lookuptable', 'attention', 'attention_length', 'fixed_candidates_file', } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v}]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """ Convert string to token indices. """ text = text.lower() text = text.replace("n't", " not") vec = self.dict.txt2vec(text) if vec == []: vec = [self.dict[self.dict.null_token]] return vec def t2v(self, text): p = self.dict.txt2vec(text) return Variable(torch.LongTensor(p).unsqueeze(1)) def v2t(self, vec): """ Convert token indices to string of tokens. """ if type(vec) == Variable: vec = vec.data if type(vec) == torch.LongTensor and vec.dim() == 2: vec = vec.squeeze(0) if type(vec) == torch.Tensor and vec.dim() == 2: vec = vec.squeeze(0) new_vec = [] for i in vec: new_vec.append(i) return self.dict.vec2txt(new_vec) def zero_grad(self): """ Zero out optimizer. """ self.optimizer.zero_grad() def update_params(self): """ Do one optimization step. """ self.optimizer.step() def reset(self): """ Reset observation and episode_done. """ self.observation = None self.episode_done = True self.cands_done = [] self.history = {} # set up optimizer lr = self.opt['learningrate'] optim_class = KvmemnnAgent.OPTIM_OPTS[self.opt['optimizer']] kwargs = {'lr': lr} self.optimizer = optim_class(self.model.parameters(), **kwargs) def share(self): """ Share internal states between parent and child instances. """ shared = super().share() shared['dict'] = self.dict shared['model'] = self.model if self.fixedX is not None: shared['fixedX'] = self.fixedX shared['fixedCands'] = self.fixedCands shared['fixedCands_txt'] = self.fixedCands_txt shared['fixedCands2'] = self.fixedCands2 shared['fixedCands_txt2'] = self.fixedCands_txt2 return shared def observe(self, observation): self.episode_done = observation['episode_done'] # shallow copy observation (deep copy can be expensive) obs = observation.copy() obs['query'], obs['mem'] = maintain_dialog_history( self.history, obs, historyLength=self.opt['history_length'], useReplies=self.opt['history_replies'], dict=self.dict, useStartEndIndices=False, ) self.observation = obs return obs def report2(self): def clip(f): return round_sigfigs(f) metrics = self.metrics if metrics['exs'] == 0: report = {'mean_rank': self.opt['neg_samples']} else: maxn = 0 for _ in range(100): n = self.model.lt.weight[5].norm(2)[0].item() if n > maxn: maxn = n report = { 'exs': clip(metrics['total_total']), 'loss': clip(metrics['loss'] / metrics['exs']), 'mean_rank': clip(metrics['mean_rank'] / metrics['exs']), 'mlp_time': clip(metrics['mlp_time'] / metrics['exs']), 'tot_time': clip(metrics['tot_time'] / metrics['exs']), 'max_norm': clip(n), } return report def reset_metrics(self, keep_total=False): if keep_total: self.metrics = { 'exs': 0, 'mean_rank': 0, 'loss': 0, 'total_total': self.metrics['total_total'], 'mlp_time': 0, 'tot_time': 0, 'max_weight': 0, 'mean_weight': 0, } else: self.metrics = { 'total_total': 0, 'mean_rank': 0, 'exs': 0, 'mlp_time': 0, 'tot_time': 0, 'loss': 0, 'max_weight': 0, 'mean_weight': 0, } def compute_metrics(self, loss, scores, mlp_time, non_mlp_time): metrics = {} pos = scores[0] cnt = 0 for i in range(1, len(scores)): if scores[i] >= pos: cnt += 1 metrics['mean_rank'] = cnt 
metrics['loss'] = loss metrics['tot_time'] = mlp_time + non_mlp_time metrics['mlp_time'] = mlp_time return metrics def same(self, y1, y2): """ Check if two tensors are the same, within small margin of error. """ if len(y1) != len(y2): return False if abs((y1 - y2).sum().data.sum()) > 0.00001: return False return True def get_negs(self, xs, ys): negs = [] # for neg in self.ys_cache: cache_sz = len(self.ys_cache) - 1 if cache_sz < 1: return negs k = self.opt['neg_samples'] for _ in range(1, k * 3): index = random.randint(0, cache_sz) neg = self.ys_cache[index] if not self.same(ys.squeeze(0), neg.squeeze(0)): negs.append(neg) if len(negs) >= k: break if self.opt['parrot_neg'] > 0: utt = self.history['last_utterance'] if len(utt) > 2: query = Variable(torch.LongTensor(utt).unsqueeze(0)) negs.append(query) return negs def dict_neighbors(self, word, useRHS=False): input = self.t2v(word) W = self.model.encoder.lt.weight q = W[input[0].item()] if useRHS: W = self.model.encoder2.lt.weight score = torch.Tensor(W.size(0)) for i in range(W.size(0)): score[i] = torch.nn.functional.cosine_similarity(q, W[i], dim=0)[0].item() val, ind = score.sort(descending=True) for i in range(20): print( str(ind[i]) + " [" + str(val[i]) + "]: " + self.v2t(torch.Tensor([ind[i]]))) def predict(self, xs, ys=None, cands=None, cands_txt=None, obs=None): """ Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ self.start = time.time() if xs is None: return [{}] is_training = ys is not None if is_training: # negs = self.get_negs(xs, ys) if len(negs) > 0: self.model.train() self.zero_grad() if self.opt['loss'] == 'cosine': xe, ye = self.model(xs, obs[0]['mem'], ys, negs) y = Variable(-torch.ones(xe.size(0))) y[0] = 1 loss = self.criterion(xe, ye, y) else: x = self.model(xs, obs[0]['mem'], ys, negs) y = Variable(torch.LongTensor([0])) loss = self.criterion(x.unsqueeze(0), y) loss.backward() self.update_params() rest = 0 if self.start2 != 99: rest = self.start - self.start2 self.start2 = time.time() if self.opt['loss'] == 'cosine': pred = nn.CosineSimilarity().forward(xe, ye) else: pred = x metrics = self.compute_metrics(loss.item(), pred.squeeze(0), self.start2 - self.start, rest) return [{'metrics': metrics}] else: fixed = False if hasattr(self, 'fixedCands') and self.fixedCands: self.take_next_utt = True self.twohoputt = True self.tricks = True else: self.take_next_utt = False self.twohoputt = False self.tricks = False if cands is None or cands[0] is None or self.take_next_utt: # cannot predict without candidates. 
if self.fixedCands or self.take_next_utt: cands_txt2 = [self.fixedCands_txt2] fixed = True else: return [{}] # test set prediction uses candidates self.model.eval() if fixed: if obs[0]['episode_done']: self.cands_done = [] if xs is None: xs = Variable(torch.LongTensor([self.parse('nothing')])) xs = xs.clone() if self.tricks: vv = self.history['last_utterance'] if len(vv) == 0: xsq = Variable( torch.LongTensor([self.parse('nothing')])) else: xsq = Variable(torch.LongTensor([vv])) else: xsq = xs mems = obs[0]['mem'] if self.tricks: mems = [] if self.fixedX is None: xe, ye = self.model(xsq, mems, ys, self.fixedCands) self.fixedX = ye else: # fixed cand embed vectors are cached, dont't recompute blah = Variable(torch.LongTensor([1])) xe, ye = self.model(xsq, mems, ys, [blah]) ye = self.fixedX pred = nn.CosineSimilarity().forward(xe, ye) origxe = xe origpred = pred val, ind = pred.sort(descending=True) ypred = cands_txt2[0][ind[0].item()] # reply to match if self.opt.get('kvmemnn_debug', False): print("twohop-range:", self.opt.get('twohop_range', 100)) for i in range(10): txt1 = self.fixedCands_txt[ind[i].item()] txt2 = cands_txt2[0][ind[i].item()] print(i, txt1, '\n ', txt2) tc = [ypred] if self.twohoputt: # now we rerank original cands against this prediction zq = [] z = [] ztxt = [] newwords = {} r = self.opt.get('twohop_range', 100) for i in range(r): c = self.fixedCands2[ind[i].item()] ctxt = self.fixedCands_txt2[ind[i].item()] if i < 10: zq.append(c) z.append(c) ztxt.append(ctxt) for w in c[0]: newwords[w.item()] = True xs2 = torch.cat(zq, 1) if (self.interactiveMode and self.twohoputt) or cands[0] is None: # used for nextutt alg in demo mode, get 2nd hop blah = Variable(torch.LongTensor([1])) if self.tricks: xe, ye = self.model(xs2, obs[0]['mem'], ys, z) else: xe, ye = self.model(xs2, obs[0]['mem'], ys, [blah]) ye = self.fixedX blend = self.opt.get('twohop_blend', 0) if blend > 0: xe = (1 - blend) * xe + blend * origxe pred = nn.CosineSimilarity().forward(xe, ye) for c in self.cands_done: for i in range(len(ztxt)): if ztxt[i] == c: # interactive heuristic: don't repeat yourself pred[i] = -1000 val, ind = pred.sort(descending=True) # predict the highest scoring candidate, and return it. # print(" [query: " + self.v2t(xsq) + "]") ps = [] for c in obs[0]['mem']: ps.append(self.v2t(c)) # print(" [persona: " + '|'.join(ps) + "]") # print(" [1st hop qmatch: " + ypredorig + "]") # print(" [1st hop nextut: " + ypred + "]") if self.tricks: ypred = ztxt[ind[0].item()] # match self.cands_done.append(ypred) else: ypred = self.fixedCands_txt[ind[0].item()] # match self.cands_done.append(ind[0].item()) # print(" [2nd hop nextut: " + ypred2 + "]") tc = [ypred] self.history['labels'] = [ypred] # print(" [final pred: " + ypred + "]") ret = [{'text': ypred, 'text_candidates': tc}] return ret elif self.take_next_utt and not self.interactiveMode: xe, ye = self.model(xs2, obs[0]['mem'], ys, cands[0]) pred = nn.CosineSimilarity().forward(xe, ye) xe, ye = self.model(xs, obs[0]['mem'], ys, cands[0]) origpred = nn.CosineSimilarity().forward(xe, ye) if 'alpha' not in self.opt: alpha = 0.1 else: alpha = self.opt['alpha'] pred = alpha * pred + 1 * origpred val, ind = pred.sort(descending=True) # predict the highest scoring candidate, and return it. 
ypred = cands_txt[0][ind[0].item()] # match tc = [] for i in range(len(ind)): tc.append(cands_txt[0][ind[i].item()]) else: if self.opt['loss'] == 'cosine': xe, ye = self.model(xs, obs[0]['mem'], ys, cands[0]) pred = nn.CosineSimilarity().forward(xe, ye) else: x = self.model(xs, obs[0]['mem'], ys, cands[0]) pred = x # .squeeze() val, ind = pred.sort(descending=True) ypred = cands_txt[0][ind[0].item()] # match tc = [] for i in range(min(100, ind.size(0))): tc.append(cands_txt[0][ind[i].item()]) ret = [{'text': ypred, 'text_candidates': tc}] return ret return [{}] * xs.size(0) def batchify(self, observations): """ Convert a list of observations into input & target tensors. """ def valid(obs): # check if this is an example our model should actually process return 'query' in obs and len(obs['query']) > 0 try: # valid examples and their indices valid_inds, exs = zip(*[(i, ex) for i, ex in enumerate(observations) if valid(ex)]) except ValueError: # zero examples to process in this batch, so zip failed to unpack return None, None, None, None # `x` text is already tokenized and truncated # sort by length so we can use pack_padded parsed_x = [ex['query'] for ex in exs] x_lens = [len(x) for x in parsed_x] ind_sorted = sorted(range(len(x_lens)), key=lambda k: -x_lens[k]) exs = [exs[k] for k in ind_sorted] valid_inds = [valid_inds[k] for k in ind_sorted] parsed_x = [parsed_x[k] for k in ind_sorted] labels_avail = any(['labels' in ex for ex in exs]) max_x_len = max([len(x) for x in parsed_x]) for x in parsed_x: x += [self.NULL_IDX] * (max_x_len - len(x)) xs = torch.LongTensor(parsed_x) xs = Variable(xs) # set up the target tensors ys = None labels = None if labels_avail: # randomly select one of the labels to update on, if multiple labels = [random.choice(ex.get('labels', [''])) for ex in exs] # parse each label and append END parsed_y = [deque(maxlen=self.truncate) for _ in labels] for dq, y in zip(parsed_y, labels): dq.extendleft(reversed(self.parse(y))) max_y_len = max(len(y) for y in parsed_y) for y in parsed_y: y += [self.NULL_IDX] * (max_y_len - len(y)) if len(parsed_y[0]) == 0: return None, None, None, None else: ys = torch.LongTensor(parsed_y) ys = Variable(ys) cands = [] cands_txt = [] if ys is None: # only build candidates in eval mode. for o in observations: if 'label_candidates' in o and o[ 'label_candidates'] is not None: cs = [] ct = [] for c in o['label_candidates']: cs.append( Variable( torch.LongTensor(self.parse(c)).unsqueeze(0))) ct.append(c) cands.append(cs) cands_txt.append(ct) else: cands.append(None) cands_txt.append(None) return xs, ys, cands, cands_txt def add_to_ys_cache(self, ys): if ys is None or len(ys) == 0: return if len(self.ys_cache) < self.ys_cache_sz: self.ys_cache.append(copy.deepcopy(ys)) else: ind = random.randint(0, self.ys_cache_sz - 1) self.ys_cache[ind] = copy.deepcopy(ys) def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] if batchsize == 0 or 'text' not in observations[0]: return [{'text': 'dunno'}] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. 
for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, cands, cands_txt = self.batchify(observations) batch_reply = self.predict(xs, ys, cands, cands_txt, observations) self.add_to_ys_cache(ys) return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def shutdown(self): # """Save the state of the model when shutdown.""" super().shutdown() def save(self, path=None): """ Save model parameters if model_file is set. """ path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): data = {} data['model'] = self.model.state_dict() data['optimizer'] = self.optimizer.state_dict() data['opt'] = self.opt with open(path, 'wb') as handle: torch.save(data, handle) with open(path + ".opt", 'wb') as handle: pickle.dump(self.opt, handle, protocol=pickle.HIGHEST_PROTOCOL) def load(self, path): """ Return opt and model states. """ with open(path, 'rb') as read: print('Loading existing model params from ' + path) data = torch.load(read) self.model.load_state_dict(data['model']) self.reset() self.optimizer.load_state_dict(data['optimizer']) self.opt = self.override_opt(data['opt'])
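# --- Illustrative sketch, not part of the original agent ---
# In `predict` above, the cosine training loss is driven by a target vector
# that is +1 for the gold reply and -1 for every sampled negative. The same
# setup with random embeddings (using the modern `reduction='sum'` in place
# of the deprecated `size_average=False`):
import torch
import torch.nn as nn

emb_dim, num_negs = 8, 3
query = torch.randn(1 + num_negs, emb_dim)  # query row repeated per candidate
cands = torch.randn(1 + num_negs, emb_dim)  # row 0 = gold, rows 1.. = negatives

target = -torch.ones(1 + num_negs)
target[0] = 1  # pull the gold pair together, push negatives below the margin

criterion = nn.CosineEmbeddingLoss(margin=0.3, reduction='sum')
print(criterion(query, cands, target))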
def build_dict(opt, skip_if_built=False): if isinstance(opt, ParlaiParser): print('[ Deprecated Warning: should be passed opt not Parser ]') opt = opt.parse_args() if not opt.get('dict_file'): print('Tried to build dictionary but `--dict-file` is not set. Set ' + 'this param so the dictionary can be saved.') return if skip_if_built and os.path.isfile(opt['dict_file']): # Dictionary already built, skip all loading or setup print("[ dictionary already built .]") return None if opt.get('dict_class'): # Custom dictionary class dictionary = str2class(opt['dict_class'])(opt) else: # Default dictionary class dictionary = DictionaryAgent(opt) if os.path.isfile(opt['dict_file']): # Dictionary already built, return loaded dictionary agent print("[ dictionary already built .]") return dictionary ordered_opt = copy.deepcopy(opt) cnt = 0 # we use train set to build dictionary ordered_opt['numthreads'] = 1 ordered_opt['batchsize'] = 1 ordered_opt['image_mode'] = 'none' if ordered_opt['task'] == 'pytorch_teacher': pytorch_buildteacher_task = ordered_opt.get('pytorch_buildteacher', '') if pytorch_buildteacher_task != '': ordered_opt['task'] = pytorch_buildteacher_task datatypes = ['train:ordered:stream'] if opt.get('dict_include_valid'): datatypes.append('valid:stream') if opt.get('dict_include_test'): datatypes.append('test:stream') cnt = 0 for dt in datatypes: ordered_opt['datatype'] = dt world_dict = create_task(ordered_opt, dictionary) # pass examples to dictionary print('[ running dictionary over data.. ]') log_every_n_secs = opt.get('log_every_n_secs', -1) if log_every_n_secs <= 0: log_every_n_secs = float('inf') log_time = TimeLogger() while not world_dict.epoch_done(): cnt += 1 if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] > 0: print('Processed {} exs, moving on.'.format( opt['dict_maxexs'])) # don't wait too long... break world_dict.parley() if log_time.time() > log_every_n_secs: sys.stdout.write('\r') text, _log = log_time.log( cnt, max(opt.get('dict_maxexs', 0), world_dict.num_examples())) sys.stdout.write(text) sys.stdout.flush() dictionary.save(opt['dict_file'], sort=True) print('[ dictionary built with {} tokens ]'.format(len(dictionary))) return dictionary
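# --- Illustrative sketch, not ParlAI's actual implementation ---
# `--dict-class` lets a task plug in a custom dictionary; `str2class`
# (defined elsewhere in ParlAI) resolves a string to a class object. A rough
# re-implementation of that lookup, assuming the 'module.path:ClassName'
# string format:
import importlib

def resolve_class(name):
    """Resolve 'module.path:ClassName' to the class object (sketch)."""
    module_name, class_name = name.split(':')
    return getattr(importlib.import_module(module_name), class_name)

# e.g. resolve_class('parlai.core.dict:DictionaryAgent')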
def main():
    # Get command-line arguments.
    argparser = ParlaiParser()
    DictionaryAgent.add_cmdline_args(argparser)
    opt = argparser.parse_args()
    build_dict(opt)
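# --- Illustrative usage, not part of the original file ---
# `main()` builds and saves a dictionary from command-line flags. The same
# thing can be done programmatically; the task name and path below are
# made-up examples:
from parlai.core.dict import DictionaryAgent
from parlai.core.params import ParlaiParser

argparser = ParlaiParser()
DictionaryAgent.add_cmdline_args(argparser)
opt = argparser.parse_args(
    ['--task', 'babi:task10k:1', '--dict-file', '/tmp/babi.dict'])
build_dict(opt)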
def build_dict(opt, skip_if_built=False): if isinstance(opt, ParlaiParser): print('[ Deprecated Warning: should be passed opt not Parser ]') opt = opt.parse_args() if not opt.get('dict_file'): print( 'Tried to build dictionary but `--dict-file` is not set. Set ' + 'this param so the dictionary can be saved.' ) return if skip_if_built and os.path.isfile(opt['dict_file']): # Dictionary already built, skip all loading or setup print("[ dictionary already built .]") return None if is_distributed(): raise ValueError('Dictionaries should be pre-built before distributed train.') if opt.get('dict_class'): # Custom dictionary class dictionary = str2class(opt['dict_class'])(opt) else: # Default dictionary class dictionary = DictionaryAgent(opt) if os.path.isfile(opt['dict_file']): # Dictionary already built, return loaded dictionary agent print("[ dictionary already built .]") return dictionary ordered_opt = copy.deepcopy(opt) cnt = 0 # we use train set to build dictionary ordered_opt['numthreads'] = 1 ordered_opt['batchsize'] = 1 # Set this to none so that image features are not calculated when Teacher is # instantiated while building the dict ordered_opt['image_mode'] = 'no_image_model' datatypes = ['train:ordered:stream'] if opt.get('dict_include_valid'): datatypes.append('valid:stream') if opt.get('dict_include_test'): datatypes.append('test:stream') cnt = 0 for dt in datatypes: ordered_opt['datatype'] = dt world_dict = create_task(ordered_opt, dictionary) # pass examples to dictionary print('[ running dictionary over data.. ]') log_time = TimeLogger() total = world_dict.num_examples() if opt['dict_maxexs'] >= 0: total = min(total, opt['dict_maxexs']) log_every_n_secs = opt.get('log_every_n_secs', None) if log_every_n_secs: pbar = tqdm.tqdm( total=total, desc='Building dictionary', unit='ex', unit_scale=True ) else: pbar = None while not world_dict.epoch_done(): cnt += 1 if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] >= 0: print('Processed {} exs, moving on.'.format(opt['dict_maxexs'])) # don't wait too long... break world_dict.parley() if pbar: pbar.update(1) if pbar: pbar.close() dictionary.save(opt['dict_file'], sort=True) print( '[ dictionary built with {} tokens in {}s ]'.format( len(dictionary), round(log_time.total_time(), 2) ) ) return dictionary
def verify(opt, printargs=None, print_parser=None): if opt['datatype'] == 'train': print("[ note: changing datatype from train to train:ordered ]") opt['datatype'] = 'train:ordered' # create repeat label agent and assign it to the specified task agent = RepeatLabelAgent(opt) world = create_task(opt, agent) log_every_n_secs = opt.get('log_every_n_secs', -1) if log_every_n_secs <= 0: log_every_n_secs = float('inf') log_time = TimeLogger() dictionary = DictionaryAgent(opt) ignore_tokens = opt.get('ignore_tokens').split(',') counts = {} for t in {'input', 'labels', 'both'}: counts['tokens_in_' + t] = 0 counts['utterances_in_' + t] = 0 counts['avg_utterance_length_in_' + t] = 0 counts['unique_tokens_in_' + t] = 0 counts['unique_utterances_in_' + t] = 0 # for counting the stats.. counts['token_dict_' + t] = {} counts['utterance_dict_' + t] = {} def tokenize(txt): return dictionary.tokenize(txt) def keep_token(t): for s in ignore_tokens: if s != '' and s in t: return False return True # Show some example dialogs. while not world.epoch_done(): world.parley() act = world.get_acts()[opt.get('agent')] for itype in {'input', 'labels'}: if itype == 'input': if opt.get('new_line_new_utt'): txts = act.get('text').split('\n') else: txts = [act.get('text')] else: txts = act.get('labels', act.get('eval_labels', [''])) for txt in txts: tokens = tokenize(txt) retxt = [] for t in tokens: if keep_token(t): retxt.append(t) counts['tokens_in_' + itype] += len(retxt) counts['tokens_in_' + 'both'] += len(retxt) counts['utterances_in_' + itype] += 1 counts['utterances_in_' + 'both'] += 1 counts['avg_utterance_length_in_' + itype] = ( counts['tokens_in_' + itype] / counts['utterances_in_' + itype] ) counts['avg_utterance_length_in_' + 'both'] = ( counts['tokens_in_' + 'both'] / counts['utterances_in_' + 'both'] ) for t in retxt: if t not in counts['token_dict_' + itype]: counts['unique_tokens_in_' + itype] += 1 counts['token_dict_' + itype][t] = True if t not in counts['token_dict_' + 'both']: counts['unique_tokens_in_' + 'both'] += 1 counts['token_dict_' + 'both'][t] = True retxt = ' '.join(retxt) if retxt not in counts['utterance_dict_' + itype]: counts['unique_utterances_in_' + itype] += 1 counts['utterance_dict_' + itype][retxt] = True if retxt not in counts['utterance_dict_' + 'both']: counts['unique_utterances_in_' + 'both'] += 1 counts['utterance_dict_' + 'both'][retxt] = True if log_time.time() > log_every_n_secs: text, log = report(world, counts, log_time) if print_parser: print(text) try: # print dataset size if available print( '[ loaded {} episodes with a total of {} examples ]'.format( world.num_episodes(), world.num_examples() ) ) except Exception: pass return report(world, counts, log_time)
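# --- Illustrative sketch, not part of the original file ---
# The statistics in `verify` above boil down to dict-based deduplication: a
# token or utterance is "unique" the first time its key appears. The same
# bookkeeping on toy data:
counts = {'tokens': 0, 'unique_tokens': 0, 'token_dict': {}}
for txt in ['hello world', 'hello again']:
    for tok in txt.split():
        counts['tokens'] += 1
        if tok not in counts['token_dict']:
            counts['unique_tokens'] += 1
            counts['token_dict'][tok] = True
print(counts['tokens'], counts['unique_tokens'])  # 4 3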
def eval_wordstat(opt, print_parser=None): """ Evaluates a model. :param opt: tells the evaluation function how to run :param print_parser: if provided, prints the options that are set within the model after loading the model """ random.seed(42) # Create model and assign it to the specified task agent = create_agent(opt, requireModelExists=True) world = create_task(opt, agent) if opt.get('external_dict'): print('[ Using external dictionary from: {} ]'.format( opt['external_dict'])) dict_opt = copy.deepcopy(opt) dict_opt['dict_file'] = opt['external_dict'] dictionary = DictionaryAgent(dict_opt) else: print('[ Using model-bundled dictionary ]') dictionary = agent.dict batch_size = opt['batchsize'] if print_parser: # Show arguments after loading model print_parser.opt = agent.opt print_parser.print_args() log_every_n_secs = opt.get('log_every_n_secs', -1) if log_every_n_secs <= 0: log_every_n_secs = float('inf') log_time = TimeLogger() cnt = 0 max_cnt = opt['num_examples'] if opt['num_examples'] > 0 else float('inf') word_statistics = { 'mean_wlength': [], 'mean_clength': [], 'freqs_cnt': Counter(), 'word_cnt': 0, 'pred_list': [], 'pure_pred_list': [], 'context_list': [], 'unique_words': set(), } bins = [int(i) for i in opt['freq_bins'].split(',')] def process_prediction(prediction, word_statistics): normalized = normalize_answer(prediction) word_statistics['pred_list'].append(normalized) freqs, _cnt, wlength, clength = get_word_stats(prediction, dictionary, bins=bins) word_statistics['word_cnt'] += _cnt word_statistics['mean_wlength'].append(wlength) word_statistics['mean_clength'].append(clength) word_statistics['freqs_cnt'] += Counter(freqs) word_statistics['unique_words'] |= set(normalized.split(" ")) return word_statistics while not world.epoch_done(): world.parley() if batch_size == 1: cnt += 1 prediction = world.acts[-1]['text'] word_statistics['context_list'].append(world.acts[0]['text']) word_statistics['pure_pred_list'].append(prediction) word_statistics = process_prediction(prediction, word_statistics) else: for w in world.worlds: try: if 'text' not in w.acts[-1]: continue prediction = w.acts[-1]['text'] word_statistics['context_list'].append(w.acts[0]['text']) word_statistics['pure_pred_list'].append(prediction) except IndexError: continue cnt += 1 word_statistics = process_prediction(prediction, word_statistics) if log_time.time() > log_every_n_secs: report = world.report() text, report = log_time.log(report['exs'], min(max_cnt, world.num_examples()), report) print(text) stat_str = 'total_words: {}, '.format(word_statistics['word_cnt']) stat_str += ', '.join([ '<{}:{} ({:.{prec}f}%)'.format( b, word_statistics['freqs_cnt'].get(b, 0), (word_statistics['freqs_cnt'].get(b, 0) / word_statistics['word_cnt']) * 100, prec=2, ) for b in bins ]) print("Word statistics: {}, avg_word_length: {:.{prec}f}, " "avg_char_length: {:.{prec}f}".format( stat_str, numpy.array(word_statistics['mean_wlength']).mean(), numpy.array(word_statistics['mean_clength']).mean(), prec=2, )) if cnt >= max_cnt: break if world.epoch_done(): print("EPOCH DONE") if opt['compute_unique']: unique_list = [] cntr = Counter(word_statistics['pred_list']) for k, v in cntr.items(): if v == 1: unique_list.append(k) print("Unique responses: {:.{prec}f}%".format( len(unique_list) / len(word_statistics['pred_list']) * 100, prec=2)) print("Total unique tokens:", len(word_statistics['unique_words'])) if opt['dump_predictions_path'] is not None: with open(opt['dump_predictions_path'], 'w') as f: f.writelines([ 'CONTEXT: {}\nPREDICTION:{}\n\n'.format(c, p) for c, p in zip( word_statistics['context_list'], word_statistics['pure_pred_list'], ) ]) if opt['compute_unique']: with open(opt['dump_predictions_path'] + '_unique', 'w') as f: f.writelines(['{}\n'.format(i) for i in unique_list]) stat_str = 'total_words: {}, '.format(word_statistics['word_cnt']) stat_str += ', '.join([ '<{}:{} ({:.{prec}f}%)'.format( b, word_statistics['freqs_cnt'].get(b, 0), (word_statistics['freqs_cnt'].get(b, 0) / word_statistics['word_cnt']) * 100, prec=2, ) for b in bins ]) print("Word statistics: {}, avg_word_length: {:.{prec}f}, " "avg_char_length: {:.{prec}f}".format( stat_str, numpy.array(word_statistics['mean_wlength']).mean(), numpy.array(word_statistics['mean_clength']).mean(), prec=2, )) report = world.report() print(report) return report
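The `--freq-bins` option drives the `<bin: count (pct%)` lines printed above: each predicted word falls into the first bin whose threshold exceeds its training-set frequency. `get_word_stats` itself is not shown in this excerpt, so the sketch below only approximates its bucketing scheme, with invented frequencies and bins assumed to be sorted ascending.

from collections import Counter

def bucket_counts(word_freqs, bins):
    """word_freqs: training-set frequency of each token in a prediction."""
    counts = Counter()
    for f in word_freqs:
        for b in bins:
            if f < b:           # first bin that fits wins
                counts[b] += 1
                break
    return counts

print(bucket_counts([3, 250, 70, 12000], bins=[100, 1000, 10000]))
# Counter({100: 2, 1000: 1})  -> 12000 falls outside every bin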
class StarspaceAgent(Agent): """Simple implementation of the starspace algorithm: https://arxiv.org/abs/1709.03856 """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" StarspaceAgent.dictionary_class().add_cmdline_args(argparser) agent = argparser.add_argument_group('StarSpace Arguments') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-enorm', '--embeddingnorm', type=float, default=10, help='max norm of word embeddings') agent.add_argument('-shareEmb', '--share-embeddings', type='bool', default=True, help='whether LHS and RHS share embeddings') agent.add_argument('-lr', '--learningrate', type=float, default=0.1, help='learning rate') agent.add_argument('-margin', '--margin', type=float, default=0.1, help='margin') agent.add_argument('-opt', '--optimizer', default='sgd', choices=StarspaceAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length.') agent.add_argument('-k', '--neg-samples', type=int, default=10, help='number k of negative samples per example') agent.add_argument('--parrot-neg', type=int, default=0, help='include query as a negative') agent.add_argument('--tfidf', type='bool', default=False, help='Use frequency based normalization for embeddings.') agent.add_argument('-cs', '--cache-size', type=int, default=1000, help='size of negative sample cache to draw from') agent.add_argument('-hist', '--history-length', default=10000, type=int, help='Number of past tokens to remember.') agent.add_argument('-histr', '--history-replies', default='label', type=str, choices=['none', 'model', 'label'], help='Keep replies in the history, or not.') agent.add_argument('-fixedCands', '--fixed-candidates-file', default=None, type=str, help='File of cands to use for prediction') def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt self.reset_metrics() self.id = 'Starspace' self.NULL_IDX = 0 self.cands = torch.LongTensor(1, 1, 1) self.ys_cache = [] self.ys_cache_sz = opt['cache_size'] self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.history = {} self.debugMode = False if shared: self.threadindex = shared['threadindex'] print("[ creating Starspace thread " + str(self.threadindex) + " ]") # set up shared properties self.dict = shared['dict'] self.model = shared['model'] else: print("[ creating StarspaceAgent ]") # this is not a shared instance of this class, so do full init if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.model = Starspace(opt, len(self.dict), self.dict) if opt.get('model_file') and os.path.isfile(opt['model_file']): self.load(opt['model_file']) self.model.share_memory() # set up modules self.criterion = torch.nn.CosineEmbeddingLoss(margin=opt['margin'], size_average=False) self.reset() self.fixedCands = False if self.opt.get('fixed_candidates_file'): # argparse stores --fixed-candidates-file under this underscored key self.fixedCands = load_cands(self.opt.get('fixed_candidates_file')) def reset(self): """Reset observation and episode_done.""" self.observation = None self.episode_done = True # set up optimizer lr = self.opt['learningrate'] optim_class = StarspaceAgent.OPTIM_OPTS[self.opt['optimizer']] kwargs = {'lr': lr} self.optimizer = optim_class(self.model.parameters(), **kwargs) def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['dict'] = self.dict shared['model'] = self.model return shared def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overridden key. Only override args specific to the model. """ model_args = {'embeddingsize', 'optimizer'} for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v} ]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def t2v(self, text): p = self.dict.txt2vec(text) return Variable(torch.LongTensor(p).unsqueeze(1)) def v2t(self, vec): """Convert token indices to string of tokens.""" if type(vec) == Variable: vec = vec.data new_vec = [] for i in vec: new_vec.append(i) return self.dict.vec2txt(new_vec) def observe(self, observation): self.episode_done = observation['episode_done'] # shallow copy observation (deep copy can be expensive) obs = observation.copy() obs['text2vec'] = maintain_dialog_history( self.history, obs, historyLength=self.opt['history_length'], useReplies=self.opt['history_replies'], dict=self.dict, useStartEndIndices=False) self.observation = obs return obs def same(self, y1, y2): if len(y1.squeeze()) != len(y2.squeeze()): return False if abs((y1.squeeze() - y2.squeeze()).sum().data.sum()) > 0.00001: return False return True def get_negs(self, xs, ys): negs = [] cache_sz = len(self.ys_cache) - 1 if cache_sz < 1: return negs k = self.opt['neg_samples'] for i in range(1, k * 3): index = random.randint(0, cache_sz) neg = self.ys_cache[index] if not self.same(ys, neg): negs.append(neg) if len(negs) >= k: break if self.opt['parrot_neg'] > 0: utt = self.history['last_utterance'] if len(utt) > 2: query = Variable(torch.LongTensor(utt).unsqueeze(0)) negs.append(query) return negs def dict_neighbors(self, word, useRHS=False): input = self.t2v(word) W = self.model.encoder.lt.weight q = W[input.data[0][0]] if useRHS: W = self.model.encoder2.lt.weight score = torch.Tensor(W.size(0)) for i in range(W.size(0)): score[i] = torch.nn.functional.cosine_similarity(q, W[i], dim=0).data[0] val, ind = score.sort(descending=True) for i in range(20): print(str(ind[i]) + " [" + str(val[i]) + "]: " + self.v2t(torch.Tensor([ind[i]]))) def compute_metrics(self, loss, scores): metrics = {} pos = scores[0] cnt = 0 for i in range(1, len(scores)): if scores[i] >= pos: cnt += 1 metrics['mean_rank'] = cnt metrics['loss'] = loss return metrics def predict(self, xs, ys=None, cands=None, cands_txt=None, obs=None): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ is_training = ys is not None if is_training: negs = self.get_negs(xs, ys) if is_training and len(negs) > 0: self.model.train() self.optimizer.zero_grad() xe, ye = self.model(xs, ys, negs) if self.debugMode: # print example print("inp: " + self.v2t(xs.squeeze())) print("pos: " + self.v2t(ys.squeeze())) for c in negs: print("neg: " + self.v2t(c.squeeze())) print("---") y = Variable(-torch.ones(xe.size(0))) y[0] = 1 loss = self.criterion(xe, ye, y) loss.backward() self.optimizer.step() pred = nn.CosineSimilarity().forward(xe, ye) metrics = self.compute_metrics(loss.data[0], pred.data.squeeze()) return [{'metrics': metrics}] else: if cands is None or cands[0] is None: # cannot predict without candidates. if self.fixedCands: cands = [self.fixedCands] else: return [{}] # test set prediction uses candidates self.model.eval() xe, ye = self.model(xs, ys, cands[0]) pred = nn.CosineSimilarity().forward(xe, ye) # This is somewhat costly; we could avoid it if we did not evaluate ranking, # i.e. by only doing: val, ind = pred.max(0) val, ind = pred.sort(descending=True) # predict the highest scoring candidate, and return it. ypred = cands_txt[0][ind.data[0]] tc = [] for i in range(min(100, ind.size(0))): tc.append(cands_txt[0][ind.data[i]]) ret = [{'text': ypred, 'text_candidates': tc}] return ret return [{}] def vectorize(self, observations): """Convert a list of observations into input & target tensors.""" def valid(obs): # check if this is an example our model should actually process return 'text2vec' in obs and len(obs['text2vec']) > 0 try: # valid examples and their indices valid_inds, exs = zip(*[(i, ex) for i, ex in enumerate(observations) if valid(ex)]) except ValueError: # zero examples to process in this batch, so zip failed to unpack return None, None, None, None # set up the input tensors # `x` text is already tokenized and truncated # sort by length so we can use pack_padded parsed_x = [ex['text2vec'] for ex in exs] x_lens = [len(x) for x in parsed_x] ind_sorted = sorted(range(len(x_lens)), key=lambda k: -x_lens[k]) exs = [exs[k] for k in ind_sorted] valid_inds = [valid_inds[k] for k in ind_sorted] parsed_x = [parsed_x[k] for k in ind_sorted] labels_avail = any(['labels' in ex for ex in exs]) max_x_len = max([len(x) for x in parsed_x]) for x in parsed_x: x += [self.NULL_IDX] * (max_x_len - len(x)) xs = torch.LongTensor(parsed_x) xs = Variable(xs) # set up the target tensors ys = None labels = None if labels_avail: # randomly select one of the labels to update on, if multiple labels = [random.choice(ex.get('labels', [''])) for ex in exs] # parse each label and append END parsed_y = [deque(maxlen=self.truncate) for _ in labels] for dq, y in zip(parsed_y, labels): dq.extendleft(reversed(self.parse(y))) max_y_len = max(len(y) for y in parsed_y) for y in parsed_y: y += [self.NULL_IDX] * (max_y_len - len(y)) ys = torch.LongTensor(parsed_y) ys = Variable(ys) cands = [] cands_txt = [] if ys is None: # only build candidates in eval mode. for o in observations: if 'label_candidates' in o: cs = [] ct = [] for c in o['label_candidates']: cs.append(Variable(torch.LongTensor(self.parse(c)).unsqueeze(0))) ct.append(c) cands.append(cs) cands_txt.append(ct) else: cands.append(None) cands_txt.append(None) return xs, ys, cands, cands_txt def add_to_ys_cache(self, ys): if ys is None or len(ys) == 0: return if len(self.ys_cache) < self.ys_cache_sz: self.ys_cache.append(copy.deepcopy(ys)) else: ind = random.randint(0, self.ys_cache_sz - 1) self.ys_cache[ind] = copy.deepcopy(ys) def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, cands, cands_txt = self.vectorize(observations) batch_reply = self.predict(xs, ys, cands, cands_txt, observations) self.add_to_ys_cache(ys) return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def shutdown(self): super().shutdown() def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): data = {} data['model'] = self.model.state_dict() data['optimizer'] = self.optimizer.state_dict() data['opt'] = self.opt with open(path, 'wb') as handle: torch.save(data, handle) with open(path + ".opt", 'wb') as handle: pickle.dump(self.opt, handle, protocol=pickle.HIGHEST_PROTOCOL) def load(self, path): """Load opt and model states from file.""" with open(path, 'rb') as read: print('Loading existing model params from ' + path) data = torch.load(read) self.model.load_state_dict(data['model']) self.reset() self.optimizer.load_state_dict(data['optimizer']) self.opt = self.override_opt(data['opt'])
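A hedged end-to-end sketch of driving this agent by hand. In real runs `opt` is built by `ParlaiParser` together with a task; here the dict-file path is a placeholder (with no file present, `DictionaryAgent` simply starts from its built-in tokens), and the message keys ('text', 'labels', 'episode_done') are the standard ParlAI fields the agent reads.

from parlai.core.params import ParlaiParser

parser = ParlaiParser()
StarspaceAgent.add_cmdline_args(parser)       # registers all options used in __init__
opt = parser.parse_args(['--dict-file', 'model.dict'])  # 'model.dict' is illustrative
agent = StarspaceAgent(opt)
agent.observe({'text': 'hello', 'labels': ['hi there'], 'episode_done': True})
reply = agent.act()
# Note: the very first example produces no parameter update, because the
# negative-sample cache (ys_cache) is still empty at that point.
print(reply)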
class IrBaselineAgent(Agent): @staticmethod def add_cmdline_args(parser): DictionaryAgent.add_cmdline_args(parser) parser.add_argument( '-lp', '--length_penalty', type=float, default=0.5, help='length penalty for responses') def __init__(self, opt, shared=None): super().__init__(opt) self.id = 'IRBaselineAgent' self.length_penalty = float(opt['length_penalty']) self.dictionary = DictionaryAgent(opt) self.opt = opt def observe(self, obs): self.observation = obs self.dictionary.observe(obs) return obs def act(self): if self.opt.get('datatype', '').startswith('train'): self.dictionary.act() obs = self.observation reply = {} reply['id'] = self.getID() # Rank candidates if 'label_candidates' in obs and len(obs['label_candidates']) > 0: rep = self.build_query_representation(obs['text']) reply['text_candidates'] = ( rank_candidates(rep, obs['label_candidates'], self.length_penalty, self.dictionary)) reply['text'] = reply['text_candidates'][0] else: reply['text'] = "I don't know." return reply def save(self, fname=None): fname = self.opt.get('model_file', None) if fname is None else fname if fname: self.dictionary.save(fname + '.dict') def load(self, fname): self.dictionary.load(fname + '.dict') def build_query_representation(self, query): """Build representation of query, e.g. words or n-grams.""" rep = {} rep['words'] = {} words = self.dictionary.tokenize(query.lower()) rw = rep['words'] used = {} for w in words: if len(self.dictionary.freqs()) > 0: rw[w] = 1.0 / (1.0 + math.log(1.0 + self.dictionary.freqs()[w])) else: if w not in stopwords: rw[w] = 1 used[w] = True rep['norm'] = math.sqrt(len(words)) return rep
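The term weighting in `build_query_representation` is 1 / (1 + log(1 + freq)), so rare words dominate the query representation while frequent words are damped logarithmically. A standalone illustration of that arithmetic, with invented frequencies:

import math

freqs = {'the': 50000, 'giraffe': 12}   # made-up training-set counts
for w, f in freqs.items():
    print(w, 1.0 / (1.0 + math.log(1.0 + f)))
# the      ~0.0846  (common word, small weight)
# giraffe  ~0.2806  (rare word, larger weight)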
def __init__(self, opt, shared=None): """ Set up model if shared params not set, otherwise no work to do. """ super().__init__(opt, shared) opt = self.opt if opt.get('batchsize', 1) > 1: raise RuntimeError('Kvmemnn model does not support batchsize > 1, ' 'try training with numthreads > 1 instead.') self.reset_metrics() # all instances need the truncate param self.id = 'Kvmemnn' self.NULL_IDX = 0 self.start2 = 99 # set up tensors once self.cands = torch.LongTensor(1, 1, 1) self.ys_cache = [] self.ys_cache_sz = opt['cache_size'] self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.history = {} if shared: torch.set_num_threads(1) if 'threadindex' in shared: self.threadindex = shared['threadindex'] else: self.threadindex = 1 # set up shared properties self.dict = shared['dict'] self.model = shared['model'] if 'fixedX' in shared: self.fixedX = shared['fixedX'] self.fixedCands = shared['fixedCands'] self.fixedCands_txt = shared['fixedCands_txt'] self.fixedCands2 = shared['fixedCands2'] self.fixedCands_txt2 = shared['fixedCands_txt2'] else: print("[ creating KvmemnnAgent ]") # this is not a shared instance of this class, so do full init self.threadindex = -1 torch.set_num_threads(1) if (opt['dict_file'] is None and opt.get('model_file')) or os.path.isfile(opt['model_file'] + '.dict'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) if 'loss' not in opt: opt['loss'] = 'cosine' self.model = Kvmemnn(opt, len(self.dict), self.dict) if opt.get('model_file') and os.path.isfile(opt['model_file']): self.load(opt['model_file']) self.model.share_memory() self.fixedCands = False self.fixedX = None path = opt['model_file'] + '.candspair' if os.path.isfile(path) and opt.get('loadcands') is not False: print("[loading candidates: " + path + "*]") fc = load_cands(path) fcs = [] for c in fc: fcs.append(Variable(torch.LongTensor(self.parse(c)).unsqueeze(0))) self.fixedCands = fcs self.fixedCands_txt = fc fc2 = load_cands(path + "2") fcs2 = [] for c2 in fc2: fcs2.append(Variable(torch.LongTensor(self.parse(c2)).unsqueeze(0))) self.fixedCands2 = fcs2 self.fixedCands_txt2 = fc2 print("[caching..]") xsq = Variable(torch.LongTensor([self.parse('nothing')])) xe, ye = self.model(xsq, [], None, self.fixedCands) self.fixedX = ye print("=init done=") if self.opt['loss'] == 'cosine': self.criterion = torch.nn.CosineEmbeddingLoss(margin=opt['margin'], size_average=False) elif self.opt['loss'] == 'nll': self.criterion = nn.CrossEntropyLoss(ignore_index=-100) else: raise RuntimeError('unknown loss: ' + str(self.opt['loss'])) # self.criterion = torch.nn.MultiMarginLoss(p=1, margin=0.1) self.reset() # can be used to look at embeddings: # self.dict_neighbors('coffee') self.take_next_utt = True self.cands_done = [] self.interactiveMode = self.opt.get('interactive_mode', False) if self.interactiveMode: print("[ Interactive mode ]")
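One design point worth noting: the `.candspair` branch above encodes every fixed candidate once at init and caches the result in `self.fixedX`, so later ranking only has to encode the incoming query. A generic sketch of that cache-then-score pattern; the names are illustrative, not the Kvmemnn API:

import torch

def rank_against_cache(query_emb, cached_cand_embs):
    # cosine similarity of one query embedding of shape (d,) against a
    # cached (num_candidates, d) matrix; returns scores sorted best-first
    sims = torch.nn.functional.cosine_similarity(
        query_emb.unsqueeze(0), cached_cand_embs, dim=1)
    return sims.sort(descending=True)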
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.history = {} self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, self.states = self.load(opt['model_file']) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] self.model = Seq2seq(opt, len(self.dict), padding_idx=self.NULL_IDX, start_idx=self.START_IDX, end_idx=self.END_IDX, longest_label=self.states.get('longest_label', 1)) if opt['embedding_type'] != 'random': # set up preinitialized embeddings try: import torchtext.vocab as vocab except ModuleNotFoundError as ex: print('Please install torchtext with `pip install torchtext`') raise ex if opt['embedding_type'].startswith('glove'): init = 'glove' embs = vocab.GloVe(name='840B', dim=300) elif opt['embedding_type'].startswith('fasttext'): init = 'fasttext' embs = vocab.FastText(language='en') else: raise RuntimeError('embedding type not implemented') if opt['embeddingsize'] != 300: rp = torch.Tensor(300, opt['embeddingsize']).normal_() t = lambda x: torch.mm(x.unsqueeze(0), rp) else: t = lambda x: x cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = t(embs.vectors[embs.stoi[w]]) self.model.decoder.lt.weight.data[i] = vec cnt += 1 if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self.model.encoder.lt.weight.data[i] = vec print('Seq2seq: initialized embeddings for {} tokens from {}.'.format(cnt, init)) if self.states: # set loaded states if applicable self.model.load_state_dict(self.states['model']) if self.use_cuda: self.model.cuda() if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt.get('gradient_clip', 0.2) self.rank = opt['rank_candidates'] # set up tensors once self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) if self.rank: self.cands = torch.LongTensor(1, 1, 1) # set up criteria self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX) if self.use_cuda: # push to cuda; `non_blocking` replaces the old `async` kwarg, # which is a syntax error on Python 3.7+ self.xs = self.xs.cuda(non_blocking=True) self.ys = self.ys.cuda(non_blocking=True) if self.rank: self.cands = self.cands.cuda(non_blocking=True) self.criterion.cuda() # set up optimizer lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True if opt['embedding_type'].endswith('fixed'): print('Seq2seq: fixing embedding weights.') self.model.decoder.lt.weight.requires_grad = False self.model.encoder.lt.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False self.optimizer = optim_class([p for p in self.model.parameters() if p.requires_grad], **kwargs) if self.states: if self.states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: self.optimizer.load_state_dict(self.states['optimizer']) self.reset()
class DefaultDataset(Dataset): """A Pytorch Dataset utilizing streaming.""" def __init__(self, opt, version='2014'): self.opt = opt self.use_hdf5 = opt.get('use_hdf5', False) self.datatype = self.opt.get('datatype') self.training = self.datatype.startswith('train') self.num_epochs = self.opt.get('num_epochs', 0) self.image_loader = ImageLoader(opt) test_info_path, annotation_path, self.image_path = _path(opt, version) self._setup_data(test_info_path, annotation_path, opt.get('unittest', False)) if self.use_hdf5: try: import h5py self.h5py = h5py except ImportError: raise ImportError('Need to install h5py - `pip install h5py`') self._setup_image_data() self.dict_agent = DictionaryAgent(opt) def __getitem__(self, index): index %= self.num_episodes() image_id = None if not self.datatype.startswith('test'): anno = self.annotation['annotations'][index] image_id = anno['image_id'] else: image_id = self.test_info['images'][index]['id'] ep = { 'text': self.dict_agent.txt2vec(QUESTION), 'image': self.get_image(image_id), 'episode_done': True, } if self.opt.get('extract_image', False): ep['image_id'] = image_id return ep if not self.datatype.startswith('test'): anno = self.annotation['annotations'][index] ep['labels'] = [anno['caption']] ep['valid'] = True else: ep['valid'] = True ep['use_hdf5'] = self.use_hdf5 return (index, ep) def __len__(self): num_epochs = self.num_epochs if self.num_epochs > 0 else 100 num_iters = num_epochs if self.training else 1 return int(num_iters * self.num_episodes()) def _load_lens(self): with open(self.length_datafile) as length: lengths = json.load(length) self.num_eps = lengths['num_eps'] self.num_exs = lengths['num_exs'] def _setup_data(self, test_info_path, annotation_path, unittest): if not self.datatype.startswith('test'): with open(annotation_path) as data_file: self.annotation = json.load(data_file) else: with open(test_info_path) as data_file: self.test_info = json.load(data_file) if unittest: if not self.datatype.startswith('test'): self.annotation['annotations'] = self.annotation[ 'annotations'][:10] else: self.test_info['images'] = self.test_info['images'][:10] self.image_paths = set() # Depending on whether we are using the train/val/test set, we need to # find the image IDs in annotations or test image info if not self.datatype.startswith('test'): for anno in self.annotation['annotations']: self.image_paths.add(self.image_path + '%012d.jpg' % (anno['image_id'])) else: for info in self.test_info['images']: self.image_paths.add(self.image_path + '%012d.jpg' % (info['id'])) def _setup_image_data(self): '''hdf5 image dataset''' extract_feats(self.opt) im = self.opt.get('image_mode') hdf5_path = os.path.join(self.image_path, 'mode_{}_noatt.hdf5'.format(im)) hdf5_file = self.h5py.File(hdf5_path, 'r') self.image_dataset = hdf5_file['images'] image_id_to_idx_path = os.path.join(self.image_path, 'mode_{}_id_to_idx.txt'.format(im)) with open(image_id_to_idx_path, 'r') as f: self.image_id_to_idx = json.load(f) def get_image(self, image_id): if not self.use_hdf5: im_path = os.path.join(self.image_path, '%012d.jpg' % (image_id)) return self.image_loader.load(im_path) else: img_idx = self.image_id_to_idx[str(image_id)] return torch.Tensor(self.image_dataset[img_idx]) def num_examples(self): if not self.datatype.startswith('test'): return len(self.annotation['annotations']) else: return len(self.test_info['images']) def num_episodes(self): return self.num_examples() def num_images(self): if not hasattr(self, 'num_imgs'): return self.num_examples() return self.num_imgs
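Since `DefaultDataset` is a standard `torch.utils.data.Dataset`, it can be fed to a `DataLoader`. A hedged sketch: because `__getitem__` returns an `(index, episode-dict)` tuple, a pass-through `collate_fn` is used so the default collation does not try to stack the dicts; `opt` here stands in for options built elsewhere.

from torch.utils.data import DataLoader

dataset = DefaultDataset(opt)                  # opt as constructed elsewhere
loader = DataLoader(dataset, batch_size=32, shuffle=dataset.training,
                    collate_fn=lambda batch: batch)  # keep (index, ep) tuples intact
for batch in loader:
    for index, ep in batch:
        pass  # ep carries 'text', 'image', 'episode_done', optionally 'labels'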
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) if not shared: # this is not a shared instance of this class, so do full # initialization. if shared is set, only set up shared members. # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) """ if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, self.states = self.load(opt['model_file']) # override options with stored ones opt = self.override_opt(new_opt) """ if opt.get('ptr_model') and os.path.isfile(opt['ptr_model']): # load pretrained model parameters if available print('Loading existing model params from ' + opt['ptr_model']) new_opt, self.states = self.load(opt['ptr_model'])  # TODO: load what? # override options with stored ones # opt = self.override_opt(new_opt) self.dict = DictionaryAgent(opt) self.id = 'ScoringNet' # we use START markers to start our output self.START = self.dict.start_token self.START_TENSOR = torch.LongTensor(self.dict.parse(self.START)) # we use END markers to end our output self.END = self.dict.end_token self.END_TENSOR = torch.LongTensor(self.dict.parse(self.END)) # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0] # store important params directly hsz = opt['hiddensize'] emb = opt['embeddingsize'] self.hidden_size = hsz self.emb_size = emb self.num_layers = opt['numlayers'] self.learning_rate = opt['learning_rate'] self.rank = opt['rank_candidates'] self.longest_label = 1 self.truncate = opt['truncate'] self.attention = opt['attention'] # set up tensors if self.opt['bi_encoder']: self.zeros = torch.zeros(2 * self.num_layers, 1, hsz) else: self.zeros = torch.zeros(self.num_layers, 1, hsz) self.zeros_dec = torch.zeros(self.num_layers, 1, hsz) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) self.neg_ys = torch.LongTensor(1, 1) # set up modules # self.criterion = nn.NLLLoss(size_average=False, ignore_index=0) self.criterion = nn.BCELoss() # lookup table stores word embeddings self.lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX)  # scale_grad_by_freq=True # encoder captures the input text enc_class = ScoringNetAgent.ENC_OPTS[opt['encoder']] self.encoder = enc_class(emb, hsz, opt['numlayers'], bidirectional=opt['bi_encoder'], dropout=opt['dropout']) # decoder produces our output states dec_isz = hsz if opt['bi_encoder']: dec_isz += hsz # linear layer helps us produce outputs from final decoder state self.h2o = nn.Linear(dec_isz, dec_isz, bias=False) # dropout on the linear layer helps us generalize self.dropout = nn.Dropout(opt['dropout']) self.use_attention = False self.attn = None # if attention is greater than 0, set up additional members if self.attention: self.use_attention = True self.att_type = opt['attn_type'] input_size = hsz if opt['bi_encoder']: input_size += hsz if self.att_type == 'concat': self.attn = nn.Linear(input_size + hsz, 1, bias=False) elif self.att_type == 'dot': assert not opt['bi_encoder'] elif self.att_type == 'general': self.attn = nn.Linear(hsz, input_size, bias=False) # set up optims for each module self.lr = opt['learning_rate'] self.wd = opt['weight_decay'] != 0  # value equality; the original 'is not 0' compared identity optim_class = ScoringNetAgent.OPTIM_OPTS[opt['optimizer']] self.optims = { 'lt': optim_class(self.lt.parameters(), lr=self.lr), 'encoder': optim_class(self.encoder.parameters(), lr=self.lr), 'h2o': optim_class(self.h2o.parameters(), lr=self.lr, weight_decay=self.wd), } if self.attention and self.attn is not None: self.optims.update({ 'attn': optim_class(self.attn.parameters(), lr=self.lr, weight_decay=self.wd) }) if hasattr(self, 'states'): # set loaded states if applicable if opt.get('ptr_model'): self.init_pretrain(self.states) else: self.set_states(self.states) if self.use_cuda: self.cuda() self.loss = 0 self.ndata = 0 self.loss_valid = 0 self.ndata_valid = 0 if opt['beam_size'] > 0: self.beamsize = opt['beam_size'] self.episode_concat = opt['episode_concat'] self.training = True self.generating = False self.local_human = False self.max_seq_len = opt['max_seq_len'] self.reset()
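Unlike the single-optimizer agents elsewhere in this file, this one keeps a dict of per-module optimizers (`self.optims`). A training step under that design zeroes and steps every entry together; the sketch below shows the pattern in general form, not code from the class itself.

def step_all(optims, loss):
    # zero all per-module gradients, backprop once, then step each optimizer
    for o in optims.values():
        o.zero_grad()
    loss.backward()
    for o in optims.values():
        o.step()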
class Seq2seqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. This model supports encoding the input and decoding the output via one of several flavors of RNN. It then uses a linear layer (whose weights can be shared with the embedding layer) to convert RNN output states into output tokens. This model currently uses greedy decoding, selecting the highest probability token at each time step. For more information, see the following papers: - Neural Machine Translation by Jointly Learning to Align and Translate `(Bahdanau et al. 2014) <arxiv.org/abs/1409.0473>`_ - Sequence to Sequence Learning with Neural Networks `(Sutskever et al. 2014) <arxiv.org/abs/1409.3215>`_ - Effective Approaches to Attention-based Neural Machine Translation `(Luong et al. 2015) <arxiv.org/abs/1508.04025>`_ """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument( '--init-model', type=str, default=None, help='load dict/features/weights/opts from this file') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=1, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-clip', '--gradient-clip', type=float, default=0.1, help='gradient clipping using l2 norm') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument( '-att', '--attention', default='none', choices=['none', 'concat', 'general', 'dot', 'local'], help='Choices: none, concat, general, local. ' 'If set local, also set attention-length. ' '(see arxiv.org/abs/1508.04025)') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--attention-time', default='post', choices=['pre', 'post'], help='Whether to apply attention before or after ' 'decoding.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('-gpu', '--gpu', type=int, default=-1, help='which GPU device to use') # ranking arguments agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='rank candidates if available. this is done by' ' computing the prob score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length. This ' 'reduces the total amount ' 'of padding in the batches.') agent.add_argument('-rnn', '--rnn-class', default='lstm', choices=Seq2seq.RNN_OPTS.keys(), help='Choose between different types of RNNs.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'], help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. ' 'Note that shared disables some encoder ' 'options--in particular, bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='unique', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. ' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='sgd', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-mom', '--momentum', default=-1, type=float, help='if applicable, momentum value for optimizer. ' 'if > 0, sgd uses nesterov momentum.') agent.add_argument('-emb', '--embedding-type', default='random', choices=[ 'random', 'glove', 'glove-fixed', 'fasttext', 'fasttext-fixed', 'glove-twitter' ], help='Choose between different strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove or ' 'Fasttext. ' 'Preinitialized embeddings can also be fixed ' 'so they are not updated during training.') agent.add_argument('-soft', '--numsoftmax', default=1, type=int, help='default 1, if greater then uses mixture of ' 'softmax (see arxiv.org/abs/1711.03953).') agent.add_argument('-rf', '--report-freq', type=float, default=0.001, help='Report frequency of prediction during eval.') agent.add_argument( '-histr', '--history-replies', default='label_else_model', type=str, choices=['none', 'model', 'label', 'label_else_model'], help='Keep replies in the history, or not.') agent.add_argument('-pt', '--person-tokens', type='bool', default=False, help='use special tokens before each speaker') agent.add_argument('--beam-size', type=int, default=1, help='Beam size, if 1 then greedy search') agent.add_argument( '--beam-log-freq', type=float, default=0.0, help='The portion of beams to dump from minibatch into model_name.beam_dump folder') agent.add_argument( '--topk', type=int, default=1, help='Top k sampling from renormalized softmax in test/valid time, default 1 means simple greedy max output') agent.add_argument( '--softmax-layer-bias', type='bool', default=False, help='Put True if you want to include the bias in decoder.e2s layer') Seq2seqAgent.dictionary_class().add_cmdline_args(argparser) return agent def __init__(self, opt, shared=None): """Set up model.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = { 'loss': 0.0, 'num_tokens': 0, 'correct_tokens': 0, 'total_skipped_batches': 0 } self.history = {} self.report_freq = opt.get('report_freq', 0.001) self.use_person_tokens = opt.get('person_tokens', False) self.batch_idx = shared and shared.get('batchindex') or 0 self.rank = opt['rank_candidates'] self.beam_size = opt.get('beam_size', 1) self.topk = opt.get('topk', 1) states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if opt.get('numthreads', 1) > 1: torch.set_num_threads(1) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] self.model = shared['model'] self.metrics = shared['metrics'] states = shared.get('states', {}) else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) init_model = None # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file', this would override init_model if opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] if init_model is not None: # load model parameters if available print('[ Loading existing model params from {} ]'.format(init_model)) states = self.load(init_model) if os.path.isfile(init_model + '.dict') or opt['dict_file'] is None: opt['dict_file'] = init_model + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] if not hasattr(self, 'model_class'): # this allows child classes to override this but inherit init self.model_class = Seq2seq self.model = self.model_class(opt, len(self.dict), padding_idx=self.NULL_IDX, start_idx=self.START_IDX, end_idx=self.END_IDX, longest_label=states.get('longest_label', 1)) if opt.get('dict_tokenizer') == 'bpe' and opt['embedding_type'] != 'random': print('skipping preinitialization of embeddings for bpe') elif not states and opt['embedding_type'] != 'random': # set up preinitialized embeddings try: import torchtext.vocab as vocab except ImportError as ex: print('Please install torchtext with `pip install torchtext`') raise ex pretrained_dim = 300 if opt['embedding_type'].startswith('glove'): if 'twitter' in opt['embedding_type']: init = 'glove-twitter' name = 'twitter.27B' pretrained_dim = 200 else: init = 'glove' name = '840B' embs = vocab.GloVe(name=name, dim=pretrained_dim, cache=modelzoo_path(self.opt.get('datapath'), 'models:glove_vectors')) elif opt['embedding_type'].startswith('fasttext'): init = 'fasttext' embs = vocab.FastText(language='en', cache=modelzoo_path(self.opt.get('datapath'), 'models:fasttext_vectors')) else: raise RuntimeError('embedding type not implemented') if opt['embeddingsize'] != pretrained_dim: rp = torch.Tensor(pretrained_dim, opt['embeddingsize']).normal_() t = lambda x: torch.mm(x.unsqueeze(0), rp) else: t = lambda x: x cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = t(embs.vectors[embs.stoi[w]]) self.model.decoder.lt.weight.data[i] = vec cnt += 1 if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self.model.encoder.lt.weight.data[i] = vec print('Seq2seq: initialized embeddings for {} tokens from {}.'.format(cnt, init)) if states: # set loaded states if applicable self.model.load_state_dict(states['model']) if self.use_cuda: self.model.cuda() # set up criteria if opt.get('numsoftmax', 1) > 1: self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX, size_average=False) else: self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: self.criterion.cuda() if 'train' in opt.get('datatype', ''): # we only set up optimizers when training # we only set this up for the original instance or hogwild ones self.clip = opt.get('gradient_clip', -1) # set up optimizer lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt.get('momentum') > 0 and opt['optimizer'] in ['sgd', 'rmsprop']: kwargs['momentum'] = opt['momentum'] if opt['optimizer'] == 'sgd': kwargs['nesterov'] = True if opt['optimizer'] == 'adam': # https://openreview.net/forum?id=ryQu7f-RZ kwargs['amsgrad'] = True if opt['embedding_type'].endswith('fixed'): print('Seq2seq: fixing embedding weights.') self.model.decoder.lt.weight.requires_grad = False self.model.encoder.lt.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False self.optimizer = optim_class([p for p in self.model.parameters() if p.requires_grad], **kwargs) if states.get('optimizer'): if states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: try: self.optimizer.load_state_dict(states['optimizer']) except ValueError: print('WARNING: not loading optim state since model ' 'params changed.') if self.use_cuda: for state in self.optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overridden key. Only override args specific to the model. """ model_args = { 'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'encoder', 'decoder', 'lookuptable', 'attention', 'attention_length', 'rnn_class' } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('[ Adding new option: | {k}: {v} | ]'.format(k=k, v=v)) elif self.opt[k] != v: print('[ Overriding option: | {k}: {old} => {v} | ]'.format(k=k, old=self.opt[k], v=v)) self.opt[k] = v if 'dict_file' in new_opt and not self.opt.get('dict_file'): print('[ No dictionary path detected, trying to load previous ' 'path {} ]'.format(new_opt['dict_file'])) self.opt['dict_file'] = new_opt['dict_file'] return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def v2t(self, vec): """Convert token indices to string of tokens.""" new_vec = [] for i in vec: if i == self.END_IDX: break elif i != self.START_IDX: new_vec.append(i) return self.dict.vec2txt(new_vec) def zero_grad(self): """Zero out optimizer.""" self.optimizer.zero_grad() def update_params(self): """Do one optimization step.""" if self.clip > 0: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip) self.optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.history.clear() for i in range(len(self.answers)): self.answers[i] = None self.reset_metrics() def reset_metrics(self): """Reset metrics for reporting loss and perplexity.""" self.metrics['loss'] = 0.0 self.metrics['num_tokens'] = 0 self.metrics['correct_tokens'] = 0 def report(self): """Report loss and perplexity from model's perspective. Note that this includes predicting __END__ and __UNK__ tokens and may differ from a truly independent measurement. """ m = {} num_tok = self.metrics['num_tokens'] if num_tok > 0: if self.metrics['correct_tokens'] > 0: m['token_acc'] = self.metrics['correct_tokens'] / num_tok m['loss'] = self.metrics['loss'] / num_tok try: m['ppl'] = math.exp(m['loss']) except OverflowError: m['ppl'] = float('inf') if self.metrics['total_skipped_batches'] > 0: m['total_skipped_batches'] = self.metrics['total_skipped_batches'] for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats m[k] = round_sigfigs(v, 4) return m def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['opt'] = self.opt shared['answers'] = self.answers shared['dict'] = self.dict shared['START_IDX'] = self.START_IDX shared['END_IDX'] = self.END_IDX shared['NULL_IDX'] = self.NULL_IDX shared['model'] = self.model if self.opt.get('numthreads', 1) > 1: # we're doing hogwild so share the model too if isinstance(self.metrics, dict): # move metrics and model to shared memory self.metrics = SharedTable(self.metrics) self.model.share_memory() shared['states'] = { # don't share optimizer states 'optimizer_type': self.opt['optimizer'], } shared['metrics'] = self.metrics # do after numthreads check return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. """ # shallow copy observation (deep copy can be expensive) obs = observation.copy() if not obs.get('preprocessed', False) or 'text2vec' not in obs: obs['text2vec'] = maintain_dialog_history( self.history, obs, reply=self.answers[self.batch_idx], historyLength=self.truncate, useReplies=self.opt.get('history_replies'), dict=self.dict, useStartEndIndices=self.use_person_tokens) else: obs['text2vec'] = deque(obs['text2vec'], maxlen=self.truncate) self.observation = obs self.answers[self.batch_idx] = None return obs def predict(self, xs, ys=None, cands=None, valid_cands=None, is_training=False): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ predictions, cand_preds = None, None if is_training: self.model.train() self.zero_grad() out = None try: out = self.model(xs, ys, rank_during_training=cands is not None) # generated response _preds, scores, cand_preds = out[0], out[1], out[2] score_view = scores.view(-1, scores.size(-1)) loss = self.criterion(score_view, ys.view(-1)) # save loss to metrics y_ne = ys.ne(self.NULL_IDX) target_tokens = y_ne.long().sum().item() correct = ((ys == _preds) * y_ne).sum().item() self.metrics['correct_tokens'] += correct self.metrics['loss'] += loss.item() self.metrics['num_tokens'] += target_tokens loss /= target_tokens # average loss per token loss.backward() except RuntimeError as e: # catch out of memory exceptions during fwd/bck (skip batch) if 'out of memory' in str(e): print('| WARNING: ran out of memory, skipping batch. ' 'if this happens frequently, decrease batchsize or ' 'truncate the inputs to the model.') self.metrics['total_skipped_batches'] += 1 return predictions, cand_preds else: raise e self.update_params() else: self.model.eval() if valid_cands: out = self.model(xs, ys=None, cands=cands, valid_cands=valid_cands, beam_size=self.beam_size, topk=self.topk) else: out = self.model(xs, ys=None, cands=cands, beam_size=self.beam_size, topk=self.topk) predictions, cand_preds = out[0], out[2] if ys is not None: # calculate loss on targets out = self.model(xs, ys) scores = out[1] score_view = scores.view(-1, scores.size(-1)) loss = self.criterion(score_view, ys.view(-1)) # save loss to metrics target_tokens = ys.ne(self.NULL_IDX).long().sum().item() self.metrics['loss'] += loss.item() self.metrics['num_tokens'] += target_tokens return predictions, cand_preds def vectorize(self, observations): """Convert a list of observations into input & target tensors.""" is_training = any(['labels' in obs for obs in observations]) xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text( observations, self.dict, end_idx=self.END_IDX, null_idx=self.NULL_IDX, dq=True, eval_labels=True, truncate=self.truncate) if xs is None: return None, None, None, None, None, None, None xs = torch.LongTensor(xs) if ys is not None: ys = torch.LongTensor(ys) if self.use_cuda: # copy to gpu xs = xs.cuda() if ys is not None: ys = ys.cuda() cands = None valid_cands = None if not is_training and self.rank: # set up candidates cands = [] valid_cands = [] for i, v in enumerate(valid_inds): if 'label_candidates' in observations[v]: curr_lcs = list(observations[v]['label_candidates']) curr_cands = [{'text': c} for c in curr_lcs] cs, _, _, valid_c_inds, *_ = PaddingUtils.pad_text(curr_cands, self.dict, null_idx=self.NULL_IDX, dq=True, truncate=self.truncate) valid_cands.append((i, v, [curr_lcs[j] for j in valid_c_inds])) cs = torch.LongTensor(cs) if self.use_cuda: cs = cs.cuda() cands.append(cs) return xs, ys, labels, valid_inds, cands, valid_cands, is_training def init_cuda_buffer(self, batchsize): if self.use_cuda and not hasattr(self, 'buffer_initialized'): try: print('preinitializing pytorch cuda buffer') bsz = self.opt.get('batchsize', batchsize) maxlen = self.truncate or 180 dummy = torch.ones(bsz, maxlen).long().cuda() sc = self.model(dummy, dummy)[1] loss = self.criterion(sc.view(-1, sc.size(-1)), dummy.view(-1)) loss.backward() self.buffer_initialized = True except RuntimeError as e: if 'out of memory' in str(e): m = ('CUDA OOM: Lower batch size (-bs) from {} or lower max' ' sequence length (-tr) from {}'.format(bsz, maxlen)) raise RuntimeError(m) else: raise e def batch_act(self, observations): batchsize = len(observations) self.init_cuda_buffer(batchsize) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, labels, valid_inds, cands, valid_cands, is_training = self.vectorize(observations) if xs is None: # no valid examples, just return empty responses return batch_reply # produce predictions, train on targets if available cand_inds = [i[0] for i in valid_cands] if valid_cands is not None else None predictions, cand_preds = self.predict(xs, ys, cands, cand_inds, is_training) if is_training: report_freq = 0 else: report_freq = self.report_freq if predictions is not None: PaddingUtils.map_predictions( predictions, valid_inds, batch_reply, observations, self.dict, self.END_IDX, report_freq=report_freq, labels=labels, answers=self.answers, ys=ys.data if ys is not None else None) if cand_preds is not None: if valid_cands is None: valid_cands = [(None, i, labels) for i in valid_inds] for i in range(len(valid_cands)): order = cand_preds[i] _, batch_idx, curr_cands = valid_cands[i] curr = batch_reply[batch_idx] curr['text_candidates'] = [curr_cands[idx] for idx in order if idx < len(curr_cands)] return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): model = {} model['model'] = self.model.state_dict() model['longest_label'] = self.model.longest_label model['optimizer'] = self.optimizer.state_dict() model['optimizer_type'] = self.opt['optimizer'] with open(path, 'wb') as write: torch.save(model, write) # save opt file with open(path + ".opt", 'wb') as handle: pickle.dump(self.opt, handle, protocol=pickle.HIGHEST_PROTOCOL) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None and hasattr(self, 'optimizer'): self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return model and optimizer states loaded from a checkpoint file.""" states = torch.load(path, map_location=lambda cpu, _: cpu) return states def receive_metrics(self, metrics_dict): """Use the metrics to decide when to adjust LR schedule.""" if 'loss' in metrics_dict: self.scheduler.step(metrics_dict['loss'])
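The `report()` method above derives perplexity as exp(total loss / total tokens). A worked example of that arithmetic with round numbers:

import math

metrics = {'loss': 460.5, 'num_tokens': 100}       # accumulated during training
per_token_loss = metrics['loss'] / metrics['num_tokens']  # 4.605
print(math.exp(per_token_loss))                    # ~100.0, the reported ppl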
class StarspaceAgent(Agent): """Simple implementation of the starspace algorithm: https://arxiv.org/abs/1709.03856 """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" agent = argparser.add_argument_group('StarSpace Arguments') agent.add_argument( '-emb', '--embedding-type', default='random', choices=[ 'random', 'glove', 'glove-fixed', 'glove-twitter-fixed', 'fasttext', 'fasttext-fixed', 'fasttext_cc', 'fasttext_cc-fixed' ], help='Choose between different strategies for initializing word ' 'embeddings. Default is random, but can also preinitialize ' 'from Glove or Fasttext. Preinitialized embeddings can also ' 'be fixed so they are not updated during training.') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-enorm', '--embeddingnorm', type=float, default=10, help='max norm of word embeddings') agent.add_argument('-shareEmb', '--share-embeddings', type='bool', default=True, help='whether LHS and RHS share embeddings') agent.add_argument( '--lins', default=0, type=int, help='If set to 1, add a linear layer between lhs and rhs.') agent.add_argument('-lr', '--learningrate', type=float, default=0.1, help='learning rate') agent.add_argument('-margin', '--margin', type=float, default=0.1, help='margin') agent.add_argument( '--input_dropout', type=float, default=0, help='fraction of input/output features to dropout during training') agent.add_argument('-opt', '--optimizer', default='sgd', choices=StarspaceAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length.') agent.add_argument('-k', '--neg-samples', type=int, default=10, help='number k of negative samples per example') agent.add_argument('--parrot-neg', type=int, default=0, help='include query as a negative') agent.add_argument( '--tfidf', type='bool', default=False, help='Use frequency based normalization for embeddings.') agent.add_argument('-cs', '--cache-size', type=int, default=1000, help='size of negative sample cache to draw from') agent.add_argument('-hist', '--history-length', default=10000, type=int, help='Number of past tokens to remember.') agent.add_argument( '-histr', '--history-replies', default='label_else_model', type=str, choices=['none', 'model', 'label', 'label_else_model'], help='Keep replies in the history, or not.') agent.add_argument('-fixedCands', '--fixed-candidates-file', default=None, type=str, help='File of cands to use for prediction') StarspaceAgent.dictionary_class().add_cmdline_args(argparser) def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt self.reset_metrics() self.id = 'Starspace' self.NULL_IDX = 0 self.cands = torch.LongTensor(1, 1, 1) self.ys_cache = [] self.ys_cache_sz = opt['cache_size'] self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.history = {} self.debugMode = False if shared: torch.set_num_threads(1) # set up shared properties self.dict = shared['dict'] self.model = shared['model'] else: print("[ creating StarspaceAgent ]") # this is not a shared instance of this class, so do full init if (opt.get('model_file') and (os.path.isfile(opt.get('model_file') + '.dict') or (opt['dict_file'] is None))): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.model = Starspace(opt, len(self.dict), self.dict) if opt.get('model_file') and os.path.isfile(opt['model_file']): self.load(opt['model_file']) else: self._init_embeddings() self.model.share_memory() # set up modules self.criterion = torch.nn.CosineEmbeddingLoss(margin=opt['margin'], size_average=False) self.reset() self.fixedCands = False self.fixedX = None if self.opt.get('fixed_candidates_file'): self.fixedCands_txt = load_cands(self.opt.get('fixed_candidates_file')) fcs = [] for c in self.fixedCands_txt: fcs.append(torch.LongTensor(self.parse(c)).unsqueeze(0)) self.fixedCands = fcs print("[loaded candidates]") def _init_embeddings(self, log=True): """Copy pretrained embeddings into the model's lookup table. The embedding source is selected by --embedding-type; with 'random' this is a no-op. :param log: print a summary of how many tokens were initialized. """ weight = self.model.lt.weight emb_type = self.opt.get('embedding_type', 'random') if emb_type == 'random': return embs, name = TorchAgent._get_embtype(self, emb_type) cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = TorchAgent._project_vec(self, embs.vectors[embs.stoi[w]], weight.size(1)) weight.data[i] = vec cnt += 1 if log: print('Initialized embeddings for {} tokens ({}%) from {}.' ''.format(cnt, round(cnt * 100 / len(self.dict), 1), name)) def reset(self): """Reset observation and episode_done.""" self.observation = None self.episode_done = True # set up optimizer lr = self.opt['learningrate'] optim_class = StarspaceAgent.OPTIM_OPTS[self.opt['optimizer']] kwargs = {'lr': lr} self.optimizer = optim_class(self.model.parameters(), **kwargs) def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['dict'] = self.dict shared['model'] = self.model return shared def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overridden key. Only override args specific to the model. """ model_args = {'embeddingsize', 'optimizer'} for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v} ]'.format(k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" vec = self.dict.txt2vec(text) if vec == []: vec = [self.dict[self.dict.null_token]] return vec def t2v(self, text): p = self.dict.txt2vec(text) return torch.LongTensor(p).unsqueeze(1) def v2t(self, vec): """Convert token indices to string of tokens.""" new_vec = [] for i in vec: new_vec.append(i) return self.dict.vec2txt(new_vec) def observe(self, observation): self.episode_done = observation['episode_done'] # shallow copy observation (deep copy can be expensive) obs = observation.copy() obs['text2vec'] = maintain_dialog_history( self.history, obs, historyLength=self.opt['history_length'], useReplies=self.opt['history_replies'], dict=self.dict, useStartEndIndices=False) self.observation = obs return obs def same(self, y1, y2): if len(y1.squeeze(0)) != len(y2.squeeze(0)): return False if abs((y1.squeeze(0) - y2.squeeze(0)).sum().data.sum()) > 0.00001: return False return True def get_negs(self, xs, ys): negs = [] cache_sz = len(self.ys_cache) - 1 if cache_sz < 1: return negs k = self.opt['neg_samples'] for i in range(1, k * 3): index = random.randint(0, cache_sz) neg = self.ys_cache[index] if not self.same(ys, neg): negs.append(neg) if len(negs) >= k: break if self.opt['parrot_neg'] > 0: utt = self.history['last_utterance'] if len(utt) > 2: query = torch.LongTensor(utt).unsqueeze(0) negs.append(query) return negs def dict_neighbors(self, word, useRHS=False): input = self.t2v(word) W = self.model.encoder.lt.weight q = W[input.data[0][0]] if useRHS: W = self.model.encoder2.lt.weight score = torch.Tensor(W.size(0)) for i in range(W.size(0)): score[i] = torch.nn.functional.cosine_similarity(q, W[i], dim=0).data[0] val, ind = score.sort(descending=True) for i in range(20): print(str(ind[i]) + " [" + str(val[i]) + "]: " + self.v2t(torch.Tensor([ind[i]]))) def compute_metrics(self, loss, scores): metrics = {} pos = scores[0] cnt = 0 for i in range(1, len(scores)): if scores[i] >= pos: cnt += 1 metrics['mean_rank'] = cnt metrics['loss'] = loss return metrics def input_dropout(self, xs, ys, negs): def dropout(x, rate): xd = [] for i in x[0]: if random.uniform(0, 1) > rate: xd.append(i) if len(xd) == 0: # pick one random thing to put in xd xd.append(x[0][random.randint(0, x.size(1) - 1)]) return torch.LongTensor(xd).unsqueeze(0) rate = self.opt.get('input_dropout') xs2 = dropout(xs, rate) ys2 = dropout(ys, rate) negs2 = [] for n in negs: negs2.append(dropout(n, rate)) return xs2, ys2, negs2 def predict(self, xs, ys=None, cands=None, cands_txt=None, obs=None): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ is_training = ys is not None if is_training: negs = self.get_negs(xs, ys) if is_training and len(negs) > 0: self.model.train() self.optimizer.zero_grad() if self.opt.get('input_dropout', 0) > 0: xs, ys, negs = self.input_dropout(xs, ys, negs) xe, ye = self.model(xs, ys, negs) if self.debugMode: # print example print("inp: " + self.v2t(xs.squeeze())) print("pos: " + self.v2t(ys.squeeze())) for c in negs: print("neg: " + self.v2t(c.squeeze())) print("---") y = -torch.ones(xe.size(0)) y[0] = 1 loss = self.criterion(xe, ye, y) loss.backward() self.optimizer.step() pred = nn.CosineSimilarity().forward(xe, ye) metrics = self.compute_metrics(loss.item(), pred.data.squeeze()) return [{'metrics': metrics}] else: self.model.eval() if cands is None or cands[0] is None: # cannot predict without candidates. if self.fixedCands: cands = [self.fixedCands] cands_txt = [self.fixedCands_txt] else: return [{'text': 'I dunno.'}] # test set prediction uses fixed candidates if self.fixedX is None: xe, ye = self.model(xs, ys, self.fixedCands) self.fixedX = ye else: # fixed candidate embedding vectors are cached; don't recompute dummy = torch.LongTensor([1]) xe, ye = self.model(xs, ys, [dummy]) ye = self.fixedX else: # test set prediction uses candidates xe, ye = self.model(xs, ys, cands[0]) pred = nn.CosineSimilarity().forward(xe, ye) # This is somewhat costly; we could avoid it if we did not evaluate ranking, # i.e. by only doing: val, ind = pred.max(0) val, ind = pred.sort(descending=True) # predict the highest scoring candidate, and return it. ypred = cands_txt[0][ind.data[0]] tc = [] for i in range(min(100, ind.size(0))): tc.append(cands_txt[0][ind.data[i]]) ret = [{'text': ypred, 'text_candidates': tc}] return ret return [{'id': self.getID()}] def vectorize(self, observations): """Convert a list of observations into input & target tensors.""" def valid(obs): # check if this is an example our model should actually process return 'text2vec' in obs and len(obs['text2vec']) > 0 try: # valid examples and their indices valid_inds, exs = zip(*[(i, ex) for i, ex in enumerate(observations) if valid(ex)]) except ValueError: # zero examples to process in this batch, so zip failed to unpack return None, None, None, None # `x` text is already tokenized and truncated # sort by length so we can use pack_padded parsed_x = [ex['text2vec'] for ex in exs] x_lens = [len(x) for x in parsed_x] ind_sorted = sorted(range(len(x_lens)), key=lambda k: -x_lens[k]) exs = [exs[k] for k in ind_sorted] valid_inds = [valid_inds[k] for k in ind_sorted] parsed_x = [parsed_x[k] for k in ind_sorted] labels_avail = any(['labels' in ex for ex in exs]) max_x_len = max([len(x) for x in parsed_x]) for x in parsed_x: x += [self.NULL_IDX] * (max_x_len - len(x)) xs = torch.LongTensor(parsed_x) # set up the target tensors ys = None labels = None if labels_avail: # randomly select one of the labels to update on, if multiple labels = [random.choice(ex.get('labels', [''])) for ex in exs] # parse each label and append END parsed_y = [deque(maxlen=self.truncate) for _ in labels] for dq, y in zip(parsed_y, labels): dq.extendleft(reversed(self.parse(y))) max_y_len = max(len(y) for y in parsed_y) for y in parsed_y: y += [self.NULL_IDX] * (max_y_len - len(y)) ys = torch.LongTensor(parsed_y) cands = [] cands_txt = [] if ys is None: # only build candidates in eval mode.
for o in observations: if o.get('label_candidates', False): cs = [] ct = [] for c in o['label_candidates']: cs.append(torch.LongTensor(self.parse(c)).unsqueeze(0)) ct.append(c) cands.append(cs) cands_txt.append(ct) else: cands.append(None) cands_txt.append(None) return xs, ys, cands, cands_txt def add_to_ys_cache(self, ys): if ys is None or len(ys) == 0: return if len(self.ys_cache) < self.ys_cache_sz: self.ys_cache.append(copy.deepcopy(ys)) else: ind = random.randint(0, self.ys_cache_sz - 1) self.ys_cache[ind] = copy.deepcopy(ys) def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, cands, cands_txt = self.vectorize(observations) batch_reply = self.predict(xs, ys, cands, cands_txt, observations) while len(batch_reply) < batchsize: batch_reply.append({'id': self.getID()}) self.add_to_ys_cache(ys) return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def shutdown(self): # """Save the state of the model when shutdown.""" super().shutdown() def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): data = {} data['model'] = self.model.state_dict() data['optimizer'] = self.optimizer.state_dict() data['opt'] = self.opt with open(path, 'wb') as handle: torch.save(data, handle) with open(path + '.opt', 'w') as handle: json.dump(self.opt, handle) def load(self, path): """Return opt and model states.""" print('Loading existing model params from ' + path) data = torch.load(path, map_location=lambda cpu, _: cpu) self.model.load_state_dict(data['model']) self.reset() self.optimizer.load_state_dict(data['optimizer']) self.opt = self.override_opt(data['opt'])
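# Illustrative sketch (not part of the agent): the training branch of
# predict() above reduces to a cosine-embedding objective: the query
# embedding should be close to its label embedding and at least `margin`
# away from each sampled negative. Shapes and the negative count below are
# assumptions for the example; reduction='sum' is the modern spelling of
# the size_average=False used in the constructor.
import torch
import torch.nn as nn

xe = torch.randn(1 + 10, 128)  # query embedding, repeated for each pair
ye = torch.randn(1 + 10, 128)  # label embedding followed by 10 negatives

# +1 for the positive pair, -1 for every negative pair, mirroring
# `y = -torch.ones(xe.size(0)); y[0] = 1` in predict()
y = -torch.ones(xe.size(0))
y[0] = 1

criterion = nn.CosineEmbeddingLoss(margin=0.1, reduction='sum')
loss = criterion(xe, ye, y)

# the mean_rank metric counts negatives scoring at least as high as the label
sims = nn.functional.cosine_similarity(xe, ye)
mean_rank = (sims[1:] >= sims[0]).sum().item()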
def __init__(self, opt, shared=None): """Set up model.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = { 'loss': 0.0, 'num_tokens': 0, 'correct_tokens': 0, 'total_skipped_batches': 0 } self.history = {} self.report_freq = opt.get('report_freq', 0.001) self.use_person_tokens = opt.get('person_tokens', False) self.batch_idx = shared and shared.get('batchindex') or 0 self.rank = opt['rank_candidates'] self.beam_size = opt.get('beam_size', 1) self.topk = opt.get('topk', 1) states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if opt.get('numthreads', 1) > 1: torch.set_num_threads(1) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] self.model = shared['model'] self.metrics = shared['metrics'] states = shared.get('states', {}) else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) init_model = None # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file', this would override init_model if opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] if init_model is not None: # load model parameters if available print('[ Loading existing model params from {} ]'.format( init_model)) states = self.load(init_model) if os.path.isfile(init_model + '.dict') or opt['dict_file'] is None: opt['dict_file'] = init_model + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] if not hasattr(self, 'model_class'): # this allows child classes to override this but inherit init self.model_class = Seq2seq self.model = self.model_class(opt, len(self.dict), padding_idx=self.NULL_IDX, start_idx=self.START_IDX, end_idx=self.END_IDX, longest_label=states.get( 'longest_label', 1)) if opt.get('dict_tokenizer' ) == 'bpe' and opt['embedding_type'] != 'random': print('skipping preinitialization of embeddings for bpe') elif not states and opt['embedding_type'] != 'random': # set up preinitialized embeddings try: import torchtext.vocab as vocab except ImportError as ex: print( 'Please install torch text with `pip install torchtext`' ) raise ex pretrained_dim = 300 if opt['embedding_type'].startswith('glove'): if 'twitter' in opt['embedding_type']: init = 'glove-twitter' name = 'twitter.27B' pretrained_dim = 200 else: init = 'glove' name = '840B' embs = vocab.GloVe(name=name, dim=pretrained_dim, cache=modelzoo_path( self.opt.get('datapath'), 'models:glove_vectors')) elif opt['embedding_type'].startswith('fasttext'): init = 'fasttext' embs = vocab.FastText(language='en', cache=modelzoo_path( 
self.opt.get('datapath'), 'models:fasttext_vectors')) else: raise RuntimeError('embedding type not implemented') if opt['embeddingsize'] != pretrained_dim: rp = torch.Tensor(pretrained_dim, opt['embeddingsize']).normal_() t = lambda x: torch.mm(x.unsqueeze(0), rp) else: t = lambda x: x cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = t(embs.vectors[embs.stoi[w]]) self.model.decoder.lt.weight.data[i] = vec cnt += 1 if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self.model.encoder.lt.weight.data[i] = vec print('Seq2seq: initialized embeddings for {} tokens from {}.' ''.format(cnt, init)) if states: # set loaded states if applicable self.model.load_state_dict(states['model']) if self.use_cuda: self.model.cuda() # set up criteria if opt.get('numsoftmax', 1) > 1: self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX, size_average=False) else: self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: self.criterion.cuda() if 'train' in opt.get('datatype', ''): # we only set up optimizers when training # we only set this up for the original instance or hogwild ones self.clip = opt.get('gradient_clip', -1) # set up optimizer lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt.get('momentum') > 0 and opt['optimizer'] in [ 'sgd', 'rmsprop' ]: kwargs['momentum'] = opt['momentum'] if opt['optimizer'] == 'sgd': kwargs['nesterov'] = True if opt['optimizer'] == 'adam': # https://openreview.net/forum?id=ryQu7f-RZ kwargs['amsgrad'] = True if opt['embedding_type'].endswith('fixed'): print('Seq2seq: fixing embedding weights.') self.model.decoder.lt.weight.requires_grad = False self.model.encoder.lt.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False self.optimizer = optim_class( [p for p in self.model.parameters() if p.requires_grad], **kwargs) if states.get('optimizer'): if states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: try: self.optimizer.load_state_dict(states['optimizer']) except ValueError: print('WARNING: not loading optim state since model ' 'params changed.') if self.use_cuda: for state in self.optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset()
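# Illustrative sketch (not part of the agent): when the pretrained vectors do
# not match `embeddingsize`, the constructor above multiplies each 300-dim
# vector by a fixed random matrix `rp`. The same projection in isolation; the
# names here are stand-ins for the example.
import torch

pretrained_dim, emb_size = 300, 128
# as in `rp = torch.Tensor(pretrained_dim, opt['embeddingsize']).normal_()`
rp = torch.randn(pretrained_dim, emb_size)

def project(vec):
    """Map a (pretrained_dim,) vector to (emb_size,)."""
    return vec.unsqueeze(0).mm(rp).squeeze(0)

glove_vec = torch.randn(pretrained_dim)  # stand-in for embs.vectors[embs.stoi[w]]
weight_row = project(glove_vec)          # would be copied into lt.weight.data[i]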
class FairseqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. For more information, see Convolutional Sequence to Sequence Learning `(Gehring et al. 2017) <https://arxiv.org/abs/1705.03122>`_. """ @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Fairseq Arguments') agent.add_argument( '-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up training (may ' 'reduce accuracy). This fixes all input and output to have a ' 'maximum length. This reduces the total amount of padding in ' 'the batches.') agent.add_argument( '--max-positions', default=1024, type=int, metavar='N', help='max number of tokens in the sequence') agent.add_argument( '--seed', default=1, type=int, metavar='N', help='pseudo random number generator seed') options.add_optimization_args(argparser) options.add_generation_args(argparser) options.add_model_args(argparser) def __init__(self, opt, shared=None): # initialize defaults first super().__init__(opt, shared) if not shared: # this is not a shared instance of this class, so do full # initialization. if shared is set, only set up shared members. saved_state = None if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, saved_state = self.load(opt['model_file']) # override options with stored ones opt = self._override_opt(new_opt) self.args = OptWrapper(opt) self.parlai_dict = DictionaryAgent(opt) self.fairseq_dict = _make_fairseq_dict(self.parlai_dict) self.id = 'Fairseq' self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.EOS = self.fairseq_dict[self.fairseq_dict.eos()] self.EOS_TENSOR = (torch.LongTensor(1, 1) .fill_(self.fairseq_dict.eos())) self.NULL_IDX = self.fairseq_dict.pad() encoder = fconv.FConvEncoder( self.fairseq_dict, embed_dim=self.args.encoder_embed_dim, convolutions=eval(self.args.encoder_layers), dropout=self.args.dropout, max_positions=self.args.max_positions) decoder = fconv.FConvDecoder( self.fairseq_dict, embed_dim=self.args.decoder_embed_dim, convolutions=eval(self.args.decoder_layers), out_embed_dim=self.args.decoder_out_embed_dim, attention=eval(self.args.decoder_attention), dropout=self.args.dropout, max_positions=self.args.max_positions) self.model = fconv.FConvModel(encoder, decoder) # from fairseq's build_criterion() if self.args.label_smoothing > 0: self.criterion = criterions.LabelSmoothedCrossEntropyCriterion( self.args.label_smoothing, self.NULL_IDX) else: self.criterion = criterions.CrossEntropyCriterion( self.args, self.fairseq_dict) self.trainer = MultiprocessingTrainer(self.args, self.model, self.criterion) if saved_state is not None: self.set_states(saved_state) self.reset() def _override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overriden key. Only override args specific to the model. 
""" model_args = { 'arch', 'encoder-embed-dim', 'encoder-layers', 'decoder-embed-dim', 'decoder-layers', 'decoder-out-embed-dim', 'decoder-attention', } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v}]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def reset(self): """Reset observation and episode_done.""" self.observation = None self.episode_done = True def observe(self, observation): # shallow copy observation (deep copy can be expensive) observation = observation.copy() if not self.episode_done and not observation.get('preprocessed', False): # if the last example wasn't the end of an episode, then we need to # recall what was said in that example prev_dialogue = self.observation['text'] observation['text'] = prev_dialogue + '\n' + observation['text'] self.observation = observation self.episode_done = observation['episode_done'] return observation def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def batch_act(self, observations): bsz = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(bsz)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field # also, split observations into sub-batches based on number of gpus obs_split = np.array_split(observations, self.trainer.num_replicas) samples = [self.batchify(obs) for obs in obs_split] samples = [s for s in samples if s[0] is not None] any_valid = any(len(s[0]) > 0 for s in samples) if not any_valid: # no valid examples, just return the empty responses we set up return batch_reply # produce predictions if testing; otherwise, train has_targets = any(s[1] is not None for s in samples) if not has_targets: offset = 0 for s in samples: xs = s[0] valid_inds = s[2] predictions = self._generate(self.args, xs) for i in range(len(predictions)): # map the predictions back to non-empty examples in the batch batch_reply[valid_inds[i] + offset]['text'] = predictions[i] if i == 0: print('prediction:', predictions[i]) offset += len(valid_inds) else: loss = self._train(samples) batch_reply[0]['metrics'] = {} for k, v in loss.items(): batch_reply[0]['metrics'][k] = v * bsz if k == 'loss': try: perplexity = 2 ** v * bsz except OverflowError: perplexity = float('inf') batch_reply[0]['metrics']['perplexity'] = perplexity return batch_reply def parse(self, string): return [self.fairseq_dict.index(word) for word in self.parlai_dict.tokenize(string)] def batchify(self, observations): """Convert a list of observations into input & target tensors.""" # valid examples exs = [ex for ex in observations if 'text' in ex] # the indices of the valid (non-empty) tensors valid_inds = [i for i, ex in enumerate(observations) if 'text' in ex] # set up the input tensors batchsize = len(exs) if batchsize == 0: return None, None, None # tokenize the text parsed_x = [deque(maxlen=self.truncate) for _ in exs] for dq, ex in zip(parsed_x, exs): dq += self.parse(ex['text']) # parsed = [self.parse(ex['text']) for ex in exs] max_x_len = max((len(x) for x in parsed_x)) for x in parsed_x: # left pad with zeros x.extendleft([self.fairseq_dict.pad()] * (max_x_len - len(x))) xs = 
torch.LongTensor(parsed_x) # set up the target tensors ys = None if 'labels' in exs[0]: # randomly select one of the labels to update on, if multiple labels = [random.choice(ex.get('labels', [''])) for ex in exs] parsed_y = [deque(maxlen=self.truncate) for _ in labels] for dq, y in zip(parsed_y, labels): dq.extendleft(reversed(self.parse(y))) for y in parsed_y: y.append(self.fairseq_dict.eos()) # append EOS to each label max_y_len = max(len(y) for y in parsed_y) for y in parsed_y: y += [self.fairseq_dict.pad()] * (max_y_len - len(y)) ys = torch.LongTensor(parsed_y) return xs, ys, valid_inds def _positions_for_tokens(self, tokens): size = tokens.size() not_pad = tokens.ne(self.fairseq_dict.pad()).long() new_pos = tokens.new(size).fill_(self.fairseq_dict.pad()) new_pos += not_pad for i in range(1, size[1]): new_pos[:, i] += new_pos[:, i-1] - 1 return new_pos def _right_shifted_ys(self, ys): result = torch.LongTensor(ys.size()) result[:, 0] = self.fairseq_dict.index(self.EOS) result[:, 1:] = ys[:, :-1] return result def _generate(self, opt, src_tokens): if not hasattr(self, 'translator'): self.translator = SequenceGenerator( [self.trainer.get_model()], beam_size=opt.beam, stop_early=(not opt.no_early_stop), normalize_scores=(not opt.unnormalized), len_penalty=opt.lenpen) self.translator.cuda() tokens = src_tokens.cuda(async=True) translations = self.translator.generate(Variable(tokens)) results = [t[0] for t in translations] output_lines = [[] for _ in range(len(results))] for i in range(len(results)): output_lines[i] = ' '.join(self.fairseq_dict[idx] for idx in results[i]['tokens'][:-1]) return output_lines def _train(self, samples): """Update the model using the targets.""" for i, sample in enumerate(samples): # add extra info to samples sample = { 'src_tokens': sample[0], 'input_tokens': self._right_shifted_ys(sample[1]), 'target': sample[1], 'id': None } sample['ntokens'] = sum(len(t) for t in sample['target']) sample['src_positions'] = self._positions_for_tokens( sample['src_tokens']) sample['input_positions'] = self._positions_for_tokens( sample['input_tokens']) samples[i] = sample return self.trainer.train_step(samples) def save(self, path=None): path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'trainer'): model = {} model['state_dict'] = self.trainer.get_model().state_dict() model['opt'] = self.opt with open(path, 'wb') as write: torch.save(model, write) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return opt and model states.""" with open(path, 'rb') as read: model = torch.load(read) return model['opt'], model['state_dict'] def set_states(self, state_dict): """Set the state dict of the model from saved states.""" self.trainer.get_model().load_state_dict(state_dict)
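# Illustrative sketch (not part of the agent): batchify() above clips each
# example to the `truncate` most recent tokens with deque(maxlen=...) and
# then left-pads to the batch maximum. The same pattern standalone; PAD=0 and
# the toy batch are assumptions for the example.
from collections import deque

PAD, truncate = 0, 5
batch = [[3, 4, 5, 6, 7, 8, 9], [3, 4]]

clipped = []
for toks in batch:
    dq = deque(maxlen=truncate)  # keeps only the last `truncate` tokens
    dq += toks
    clipped.append(dq)

max_len = max(len(dq) for dq in clipped)
for dq in clipped:
    dq.extendleft([PAD] * (max_len - len(dq)))  # left padding, as in batchify()

assert [list(dq) for dq in clipped] == [[5, 6, 7, 8, 9], [0, 0, 0, 3, 4]]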
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = {'loss': 0, 'num_tokens': 0} self.history = {} self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file' elif opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] else: init_model = None if init_model is not None: # load model parameters if available print('Loading existing model params from ' + init_model) new_opt, self.states = self.load(init_model) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None: if init_model is not None and os.path.isfile(init_model + '.dict'): # check first to see if a dictionary exists opt['dict_file'] = init_model + '.dict' elif opt.get('model_file'): # otherwise, set default dict-file if it is not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] encoder = EncoderRNN(len(self.dict), opt['maxlength_in'], opt['hiddensize'], dropout_p=opt['dropout'], input_dropout_p=opt['dropout'], n_layers=opt['numlayers'], rnn_cell=opt['rnncell'], bidirectional=opt['bidirectional'], variable_lengths=True) decoder = DecoderRNN( len(self.dict), opt['maxlength_out'], opt['hiddensize'] * 2 if opt['bidirectional'] else opt['hiddensize'], dropout_p=opt['dropout'], input_dropout_p=opt['dropout'], n_layers=opt['numlayers'], rnn_cell=opt['rnncell'], bidirectional=opt['bidirectional'], sos_id=self.START_IDX, eos_id=self.END_IDX, use_attention=opt['attention']) self.model = Seq2seq(encoder, decoder) if self.states: # set loaded states if applicable self.model.load_state_dict(self.states['model']) if self.use_cuda: self.model.cuda() if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt['gradient_clip'] # set up tensors once self.START = torch.LongTensor([self.START_IDX]) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) # set up criteria self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: # push to cuda self.START = self.START.cuda() self.xs = self.xs.cuda() self.ys = self.ys.cuda() 
self.criterion.cuda() # set up optimizer lr = opt['learningrate'] optim_class = IbmSeq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True self.optimizer = optim_class( [p for p in self.model.parameters() if p.requires_grad], **kwargs) if self.states: if self.states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: self.optimizer.load_state_dict(self.states['optimizer']) self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset()
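# Illustrative sketch (not part of the agent): this constructor and the
# Seq2seq one above both attach a ReduceLROnPlateau scheduler with
# factor=0.5, patience=3. It only takes effect if a validation metric is
# passed to step(); a minimal usage loop with a dummy model:
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(4, 2)  # stand-in for the seq2seq model
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 'min', factor=0.5, patience=3)

for epoch in range(10):
    valid_loss = 1.0  # placeholder: compute the real validation loss here
    # halves the lr after `patience` epochs without improvement in valid_loss
    scheduler.step(valid_loss)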
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) # all instances needs truncate param self.truncate = opt['truncate'] if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) states = None if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, states = self.load(opt['model_file']) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START = self.dict.start_token self.START_IDX = self.dict[self.START] self.START_TENSOR = torch.LongTensor([self.START_IDX]) # we use END markers to end our output self.END = self.dict.end_token self.END_IDX = self.dict[self.END] self.END_TENSOR = torch.LongTensor([self.END_IDX]) # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0] # store important params in self hsz = opt['hiddensize'] emb = opt['embeddingsize'] self.hidden_size = hsz self.emb_size = emb self.num_layers = opt['numlayers'] self.learning_rate = opt['learningrate'] self.rank = opt['rank_candidates'] self.longest_label = 1 self.attention = opt['attention'] self.bidirectional = opt['bidirectional'] self.num_dirs = 2 if self.bidirectional else 1 self.dropout = opt['dropout'] self.lm = opt['language_model'] # set up tensors once self.zeros = torch.zeros(self.num_layers * self.num_dirs, 1, hsz) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) if self.rank: self.cands = torch.LongTensor(1, 1, 1) self.cand_scores = torch.FloatTensor(1) self.cand_lengths = torch.LongTensor(1) # set up modules self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX) # lookup table stores word embeddings self.enc_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if opt['lookuptable'] in ['enc_dec', 'all']: # share this with the encoder self.dec_lt = self.enc_lt else: self.dec_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if not states and opt['embedding_type'].startswith('glove'): # set up pre-initialized vectors from GloVe try: import torchtext.vocab as vocab except ImportError: raise ImportError('Please install torchtext from' 'github.com/pytorch/text.') Glove = vocab.GloVe(name='840B', dim=300) # do better than uniform random proj = torch.FloatTensor(emb, 300).uniform_(-0.057735, 0.057735) if emb != 300 else None for w in self.dict.freq: if w in Glove.stoi: vec = Glove.vectors[Glove.stoi[w]] if emb != 300: vec = torch.mm(proj, vec.unsqueeze(1)).squeeze() self.enc_lt.weight.data[self.dict[w]] = vec self.dec_lt.weight.data[self.dict[w]] = vec # encoder captures 
the input text enc_class = Seq2seqAgent.ENC_OPTS[opt['encoder']] # decoder produces our output states if opt['decoder'] in ['same', 'shared']: # use same class as encoder self.decoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) else: # use set class dec_class = Seq2seqAgent.ENC_OPTS[opt['decoder']] self.decoder = dec_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) if opt['decoder'] == 'shared': # shared weights: use the decoder to encode if self.bidirectional: raise RuntimeError('Cannot share enc/dec and do ' 'bidirectional encoding.') self.encoder = self.decoder else: self.encoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True, bidirectional=self.bidirectional) # linear layers help us produce outputs from final decoder state hszXdirs = hsz * self.num_dirs # hidden to embedding self.h2e = nn.Linear(hsz, emb) # embedding to output. note that this CAN predict NULL self.e2o = nn.Linear(emb, len(self.dict)) if opt['lookuptable'] in ['dec_out', 'all']: # share these weights with the decoder lookup table self.e2o.weight = self.dec_lt.weight if self.attention != 'none': # we'll need this for all attention types self.attn_combine = nn.Linear(hszXdirs + emb, emb) if self.attention == 'local': # local attention over fixed set of output states if opt['attention_length'] < 0: raise RuntimeError('Set attention length to > 0.') self.max_length = opt['attention_length'] # combines input and previous hidden output layer self.attn = nn.Linear(hsz + emb, self.max_length) # combines attention weights with encoder outputs elif self.attention == 'concat': self.attn = nn.Linear(hsz + hszXdirs, hsz) self.attn_v = nn.Linear(hsz, 1) elif self.attention == 'general': # equivalent to dot if attn is identity self.attn = nn.Linear(hsz, hszXdirs) # set up optims for each module lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True self.optims = { 'decoder': optim_class(self.decoder.parameters(), **kwargs), 'h2e': optim_class(self.h2e.parameters(), **kwargs), } if opt['decoder'] != 'shared': # update the encoder as well self.optims['encoder'] = optim_class( self.encoder.parameters(), **kwargs) if not opt['embedding_type'].endswith('-fixed'): # update embeddings during training self.optims['enc_lt'] = optim_class( self.enc_lt.parameters(), **kwargs) self.optims['e2o'] = optim_class( self.e2o.parameters(), **kwargs) if opt['lookuptable'] not in ['enc_dec', 'all']: # only add dec if it's separate from enc self.optims['dec_lt'] = optim_class( self.dec_lt.parameters(), **kwargs) elif opt['lookuptable'] not in ['dec_out', 'all']: # embeddings are fixed, so only update e2o if it's not shared self.optims['e2o'] = optim_class( self.e2o.parameters(), **kwargs) # add attention parameters into optims if available for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): self.optims[attn_name] = optim_class( getattr(self, attn_name).parameters(), **kwargs) if states is not None: # set loaded states if applicable self.set_states(states) if self.use_cuda: self.cuda() self.reset()
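# Illustrative sketch (not part of the agent): instead of one optimizer over
# all parameters, the constructor above keeps a dict of per-module optimizers
# so fixed embeddings can be skipped and shared modules are not stepped
# twice. The zero_grad()/update_params() methods later in the file just loop
# over that dict; a reduced version:
import torch.nn as nn
import torch.optim as optim

enc = nn.GRU(8, 16, batch_first=True)
dec = nn.GRU(8, 16, batch_first=True)
optims = {
    'encoder': optim.SGD(enc.parameters(), lr=0.005, momentum=0.95, nesterov=True),
    'decoder': optim.SGD(dec.parameters(), lr=0.005, momentum=0.95, nesterov=True),
}

def zero_grad():
    for opt in optims.values():
        opt.zero_grad()

def update_params():
    for opt in optims.values():
        opt.step()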
class IrBaselineAgent(Agent):

    @staticmethod
    def add_cmdline_args(parser):
        DictionaryAgent.add_cmdline_args(parser)
        parser.add_argument('-lp', '--length_penalty', default=0.5,
                            help='length penalty for responses')

    def __init__(self, opt, shared=None):
        super().__init__(opt)
        self.id = 'IRBaselineAgent'
        self.length_penalty = float(opt['length_penalty'])
        self.dictionary = DictionaryAgent(opt)
        self.opt = opt

    def observe(self, obs):
        self.observation = obs
        self.dictionary.observe(obs)
        return obs

    def act(self):
        if self.opt.get('datatype', '').startswith('train'):
            self.dictionary.act()
        obs = self.observation
        reply = {}
        reply['id'] = self.getID()

        # rank candidates if available
        if 'label_candidates' in obs and len(obs['label_candidates']) > 0:
            rep = self.build_query_representation(obs['text'])
            reply['text_candidates'] = rank_candidates(
                rep, obs['label_candidates'], self.length_penalty)
            reply['text'] = reply['text_candidates'][0]
        else:
            reply['text'] = "I don't know."
        return reply

    def save(self, fname):
        self.dictionary.save(fname + '.dict')

    def load(self, fname):
        self.dictionary.load(fname + '.dict')

    def build_query_representation(self, query):
        """Build representation of query, e.g. words or n-grams."""
        rep = {}
        rep['words'] = {}
        words = query.lower().split(' ')
        rw = rep['words']
        used = {}
        for w in words:
            if len(self.dictionary.freqs()) > 0:
                rw[w] = 1.0 / (1.0 + math.log(1.0 + self.dictionary.freqs()[w]))
            else:
                if w not in stopwords:
                    rw[w] = 1
            used[w] = True
        norm = len(used)
        rep['norm'] = math.sqrt(len(words))
        return rep
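# Illustrative sketch (not part of the agent): the inverse-log-frequency
# weights built in build_query_representation() give rare words more say when
# ranking candidates. A toy scorer in the same spirit; the freqs dict and the
# scoring rule below are assumptions, not the real rank_candidates().
import math

freqs = {'the': 1000, 'movie': 120, 'soundtrack': 3}

def word_weight(w):
    # mirrors rw[w] = 1.0 / (1.0 + log(1.0 + freq))
    return 1.0 / (1.0 + math.log(1.0 + freqs.get(w, 0)))

def score(query, candidate, length_penalty=0.5):
    q = set(query.lower().split())
    c = candidate.lower().split()
    overlap = sum(word_weight(w) for w in c if w in q)
    # damp long candidates, analogous to the -lp/--length_penalty option
    return overlap / (len(c) ** length_penalty)

cands = ['the movie', 'the movie soundtrack was great']
query = 'what did you think of the soundtrack'
best = max(cands, key=lambda c: score(query, c))  # picks the rarer-word match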
def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', choices=['none', 'concat', 'general', 'dot', 'local'], help='Choices: none, concat, general, dot, local. ' 'If set local, also set attention-length. ' 'For more details see: ' 'https://arxiv.org/pdf/1508.04025.pdf') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='Rank candidates if available. This is done by' ' computing the mean score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length and to ' 'be similar in length to one another by throwing ' 'away extra tokens. This reduces the total amount ' 'of padding in the batches.') agent.add_argument('-enc', '--encoder', default='gru', choices=Seq2seqAgent.ENC_OPTS.keys(), help='Choose between different encoder modules.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'] + list(Seq2seqAgent.ENC_OPTS.keys()), help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. ' 'Note that shared disables some encoder ' 'options, in particular bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='all', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. ' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='adam', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-emb', '--embedding-type', default='random', choices=['random', 'glove', 'glove-fixed'], help='Choose between different strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove. '
'Preinitialized embeddings can also be fixed ' 'so they are not updated during training.') agent.add_argument('-lm', '--language-model', default='none', choices=['none', 'only', 'both'], help='Enable language modeling training on the ' 'concatenated input and label data.')
def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', choices=['none', 'concat', 'general', 'local'], help='Choices: none, concat, general, local. ' 'If set local, also set attention-length. ' 'For more details see: ' 'https://arxiv.org/pdf/1508.04025.pdf') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='Rank candidates if available. This is done by' ' computing the mean score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length and to ' 'be similar in length to one another by throwing ' 'away extra tokens. This reduces the total amount ' 'of padding in the batches.') agent.add_argument('-enc', '--encoder', default='gru', choices=Seq2seqAgent.ENC_OPTS.keys(), help='Choose between different encoder modules.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'] + list(Seq2seqAgent.ENC_OPTS.keys()), help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. ' 'Note that shared disables some encoder ' 'options, in particular bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='all', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. ' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='adam', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-emb', '--embedding-init', default='random', choices=['random', 'glove'], help='Choose between initialization strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove') agent.add_argument('-lm', '--language-model', type='bool', default=False, help='enable language modeling training on the ' 'concatenated input and label data')
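# Illustrative sketch (not part of the agents): both add_cmdline_args
# variants above pass type='bool' to add_argument, which plain argparse
# rejects unless a converter is registered under that name. The snippet below
# is roughly what ParlaiParser does internally; the converter is a stand-in,
# not ParlAI's exact code.
import argparse

def str2bool(value):
    if value.lower() in ('yes', 'true', '1'):
        return True
    if value.lower() in ('no', 'false', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value')

parser = argparse.ArgumentParser()
parser.register('type', 'bool', str2bool)   # makes type='bool' resolvable
group = parser.add_argument_group('Seq2Seq Arguments')
group.add_argument('-bi', '--bidirectional', type='bool', default=False)
assert parser.parse_args(['-bi', 'true']).bidirectional is True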
class Seq2seqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. This model supports encoding the input and decoding the output via one of several flavors of RNN. It then uses a linear layer (whose weights can be shared with the embedding layer) to convert RNN output states into output tokens. This model currently uses greedy decoding, selecting the highest probability token at each time step. For more information, see Sequence to Sequence Learning with Neural Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_. """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } ENC_OPTS = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM} @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', choices=['none', 'concat', 'general', 'local'], help='Choices: none, concat, general, local. ' 'If set local, also set attention-length. ' 'For more details see: ' 'https://arxiv.org/pdf/1508.04025.pdf') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='rank candidates if available. this is done by' ' computing the mean score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length and to ' 'be similar in length to one another by throwing ' 'away extra tokens. This reduces the total amount ' 'of padding in the batches.') agent.add_argument('-enc', '--encoder', default='gru', choices=Seq2seqAgent.ENC_OPTS.keys(), help='Choose between different encoder modules.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'] + list(Seq2seqAgent.ENC_OPTS.keys()), help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. ' 'Note that shared disabled some encoder ' 'options--in particular, bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='all', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. 
' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='adam', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-emb', '--embedding-init', default='random', choices=['random', 'glove'], help='Choose between initialization strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove') agent.add_argument('-lm', '--language-model', type='bool', default=False, help='enabled language modeling training on the ' 'concatenated input and label data') def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) # all instances needs truncate param self.truncate = opt['truncate'] if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available( ) if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) states = None if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, states = self.load(opt['model_file']) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START = self.dict.start_token self.START_IDX = self.dict[self.START] self.START_TENSOR = torch.LongTensor([self.START_IDX]) # we use END markers to end our output self.END = self.dict.end_token self.END_IDX = self.dict[self.END] self.END_TENSOR = torch.LongTensor([self.END_IDX]) # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0] # store important params in self hsz = opt['hiddensize'] emb = opt['embeddingsize'] self.hidden_size = hsz self.emb_size = emb self.num_layers = opt['numlayers'] self.learning_rate = opt['learningrate'] self.rank = opt['rank_candidates'] self.longest_label = 1 self.attention = opt['attention'] self.bidirectional = opt['bidirectional'] self.num_dirs = 2 if self.bidirectional else 1 self.dropout = opt['dropout'] self.lm = opt['language_model'] # set up tensors once self.zeros = torch.zeros(self.num_layers * self.num_dirs, 1, hsz) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) if self.rank: self.cands = torch.LongTensor(1, 1, 1) self.cand_scores = torch.FloatTensor(1) self.cand_lengths = torch.LongTensor(1) # set up modules self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX) # lookup table stores word embeddings self.enc_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if opt['lookuptable'] in 
['enc_dec', 'all']: # share this with the encoder self.dec_lt = self.enc_lt else: self.dec_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if not states and opt['embedding_init'] == 'glove': # set up pre-initialized vectors from GloVe try: import torchtext.vocab as vocab except ImportError: raise ImportError('Please install torchtext from' 'github.com/pytorch/text.') Glove = vocab.GloVe(name='840B', dim=300) # do better than uniform random proj = torch.FloatTensor(emb, 300).uniform_( -0.057735, 0.057735) if emb != 300 else None for w in self.dict.freq: if w in Glove.stoi: vec = Glove.vectors[Glove.stoi[w]] if emb != 300: vec = torch.mm(proj, vec.unsqueeze(1)).squeeze() self.enc_lt.weight.data[self.dict[w]] = vec self.dec_lt.weight.data[self.dict[w]] = vec # encoder captures the input text enc_class = Seq2seqAgent.ENC_OPTS[opt['encoder']] # decoder produces our output states if opt['decoder'] in ['same', 'shared']: # use same class as encoder self.decoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) else: # use set class dec_class = Seq2seqAgent.ENC_OPTS[opt['decoder']] self.decoder = dec_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) if opt['decoder'] == 'shared': # shared weights: use the decoder to encode if self.bidirectional: raise RuntimeError('Cannot share enc/dec and do ' 'bidirectional encoding.') self.encoder = self.decoder else: self.encoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True, bidirectional=self.bidirectional) # linear layers help us produce outputs from final decoder state hszXdirs = hsz * self.num_dirs # hidden to embedding self.h2e = nn.Linear(hsz, emb) # embedding to output. note that this CAN predict NULL self.e2o = nn.Linear(emb, len(self.dict)) if opt['lookuptable'] in ['dec_out', 'all']: # share these weights with the decoder lookup table self.e2o.weight = self.dec_lt.weight if self.attention == 'local': # local attention over fixed set of output states if opt['attention_length'] < 0: raise RuntimeError('Set attention length to > 0.') self.max_length = opt['attention_length'] # combines input and previous hidden output layer self.attn = nn.Linear(hsz + emb, self.max_length) # combines attention weights with encoder outputs self.attn_combine = nn.Linear(hszXdirs + emb, emb) elif self.attention == 'concat': self.attn = nn.Linear(hsz + hszXdirs, hsz) self.attn_v = nn.Linear(hsz, 1) self.attn_combine = nn.Linear(hszXdirs + emb, emb) elif self.attention == 'general': self.attn = nn.Linear(hsz, hszXdirs) self.attn_combine = nn.Linear(hszXdirs + emb, emb) # set up optims for each module lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True self.optims = { 'enc_lt': optim_class(self.enc_lt.parameters(), **kwargs), 'decoder': optim_class(self.decoder.parameters(), **kwargs), 'h2e': optim_class(self.h2e.parameters(), **kwargs), 'e2o': optim_class(self.e2o.parameters(), **kwargs), } if opt['decoder'] != 'shared': self.optims['encoder'] = optim_class(self.encoder.parameters(), **kwargs) if opt['lookuptable'] not in ['enc_dec', 'all']: # only add dec if it's separate from enc self.optims['dec_lt'] = optim_class(self.dec_lt.parameters(), **kwargs) # add attention parameters into optims if available for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): self.optims[attn_name] = optim_class( 
getattr(self, attn_name).parameters(), **kwargs) if states is not None: # set loaded states if applicable self.set_states(states) if self.use_cuda: self.cuda() self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overriden key. Only override args specific to the model. """ model_args = { 'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'encoder', 'decoder', 'lookuptable', 'attention', 'attention_length' } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v}]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def v2t(self, vec): """Convert token indices to string of tokens.""" new_vec = [] for i in vec: if i == self.END_IDX: break elif i not in [self.NULL_IDX, self.START_IDX]: new_vec.append(i) return self.dict.vec2txt(new_vec) def cuda(self): """Push parameters to the GPU.""" self.START_TENSOR = self.START_TENSOR.cuda(async=True) self.END_TENSOR = self.END_TENSOR.cuda(async=True) self.zeros = self.zeros.cuda(async=True) self.xs = self.xs.cuda(async=True) self.ys = self.ys.cuda(async=True) if self.rank: self.cands = self.cands.cuda(async=True) self.cand_scores = self.cand_scores.cuda(async=True) self.cand_lengths = self.cand_lengths.cuda(async=True) self.criterion.cuda() self.enc_lt.cuda() self.dec_lt.cuda() self.encoder.cuda() self.decoder.cuda() self.h2e.cuda() self.e2o.cuda() if self.attention != 'none': for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): getattr(self, attn_name).cuda() def hidden_to_idx(self, hidden, is_training=False): """Convert hidden state vectors into indices into the dictionary.""" # dropout at each step e = F.dropout(self.h2e(hidden), p=self.dropout, training=is_training) scores = F.dropout(self.e2o(e), p=self.dropout, training=is_training) _max_score, idx = scores.max(2) return idx, scores def zero_grad(self): """Zero out optimizers.""" for optimizer in self.optims.values(): optimizer.zero_grad() def update_params(self): """Do one optimization step.""" for optimizer in self.optims.values(): optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.episode_done = True def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['answers'] = self.answers shared['dict'] = self.dict shared['START_IDX'] = self.START_IDX shared['END_IDX'] = self.END_IDX return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. 
""" # shallow copy observation (deep copy can be expensive) observation = observation.copy() if 'text' in observation: # put START and END around text parsed_x = [self.START_IDX] parsed_x.extend(self.parse(observation['text'])) parsed_x.append(self.END_IDX) if self.truncate > 0: parsed_x = parsed_x[-self.truncate:] observation['text'] = parsed_x if not self.episode_done: prev_dialog = self.observation['text'] # get last y batch_idx = self.opt.get('batchindex', 0) if self.answers[batch_idx] is not None: # use our last answer, which is the label during training lastY = self.answers[batch_idx] prev_dialog.append(self.START_IDX) prev_dialog.extend(lastY) prev_dialog.append(self.END_IDX) self.answers[batch_idx] = None # forget last y prev_dialog.extend(parsed_x) if self.truncate > 0: prev_dialog = prev_dialog[-self.truncate:] observation['text'] = prev_dialog self.observation = observation self.episode_done = observation['episode_done'] return observation def _encode(self, xs, is_training=False): """Call encoder and return output and hidden states.""" self.lastxs = xs batchsize = len(xs) # first encode context xes = F.dropout(self.enc_lt(xs), p=self.dropout, training=is_training) # project from emb_size to hidden_size dimensions x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] xes_packed = pack_padded_sequence(xes, x_lens, batch_first=True) if self.zeros.size(1) != batchsize: self.zeros.resize_(self.num_layers * self.num_dirs, batchsize, self.hidden_size).fill_(0) h0 = Variable(self.zeros, requires_grad=False) if type(self.encoder) == nn.LSTM: encoder_output_packed, hidden = self.encoder(xes_packed, (h0, h0)) # take elementwise max between forward and backward hidden states hidden = (hidden[0].view(-1, self.num_dirs, hidden[0].size(1), hidden[0].size(2)).max(1)[0], hidden[1].view(-1, self.num_dirs, hidden[1].size(1), hidden[1].size(2)).max(1)[0]) if type(self.decoder) != nn.LSTM: hidden = hidden[0] else: encoder_output_packed, hidden = self.encoder(xes_packed, h0) # take elementwise max between forward and backward hidden states hidden = hidden.view(-1, self.num_dirs, hidden.size(1), hidden.size(2)).max(1)[0] if type(self.decoder) == nn.LSTM: hidden = (hidden, h0.narrow(0, 0, 2)) encoder_output, _ = pad_packed_sequence(encoder_output_packed, batch_first=True) encoder_output = encoder_output if self.attention == 'local': # if using local attention, narrow encoder_output to max_length if encoder_output.size(1) > self.max_length: offset = encoder_output.size(1) - self.max_length encoder_output = encoder_output.narrow(1, offset, self.max_length) return encoder_output, hidden def _apply_attention(self, xes, encoder_output, hidden, attn_mask=None): """Apply attention to encoder hidden layer.""" last_hidden = hidden[-1] # select hidden from last RNN layer if self.attention == 'concat': hidden_expand = last_hidden.unsqueeze(1).expand( last_hidden.size(0), encoder_output.size(1), last_hidden.size(1)) attn_w_premask = self.attn_v( F.tanh(self.attn(torch.cat((encoder_output, hidden_expand), 2)))).squeeze(2) attn_weights = F.softmax(attn_w_premask * attn_mask - (1 - attn_mask) * 1e20) elif self.attention == 'general': hidden_expand = last_hidden.unsqueeze(1) attn_w_premask = torch.bmm(self.attn(hidden_expand), encoder_output.transpose(1, 2)).squeeze(1) attn_weights = F.softmax(attn_w_premask * attn_mask - (1 - attn_mask) * 1e20) elif self.attention == 'local': attn_weights = F.softmax( self.attn(torch.cat((xes.squeeze(1), last_hidden), 1))) if attn_weights.size(1) > encoder_output.size(1): 
attn_weights = attn_weights.narrow(1, 0, encoder_output.size(1)) attn_applied = torch.bmm(attn_weights.unsqueeze(1), encoder_output).squeeze(1) output = torch.cat((xes.squeeze(1), attn_applied), 1) output = self.attn_combine(output).unsqueeze(1) output = F.tanh(output) return output def _decode_and_train(self, batchsize, xes, ys, encoder_output, hidden, attn_mask, lm=False): """Update the model based on the labels.""" self.zero_grad() loss = 0 predictions = [] # keep track of longest label we've ever seen # we'll never produce longer ones than that during prediction if not lm: self.longest_label = max(self.longest_label, ys.size(1)) if self.attention != 'none': # using attention, produce one token at a time for i in range(ys.size(1)): h_att = hidden[0] if type(self.decoder) == nn.LSTM else hidden output = self._apply_attention(xes, encoder_output, h_att, attn_mask) output, hidden = self.decoder(output, hidden) preds, scores = self.hidden_to_idx(output, is_training=True) y = ys.select(1, i) loss += self.criterion(scores.squeeze(1), y) # use the true token as the next input instead of predicted xes = self.dec_lt(y).unsqueeze(1) xes = F.dropout(xes, p=self.dropout, training=True) predictions.append(preds) else: # force the entire sequence at once by feeding in START + y[:-2] y_in = ys.narrow(1, 0, ys.size(1) - 1) xes = torch.cat([xes, self.dec_lt(y_in)], 1) output, hidden = self.decoder(xes, hidden) preds, scores = self.hidden_to_idx(output, is_training=True) for i in range(ys.size(1)): # sum loss per-token score = scores.select(1, i) y = ys.select(1, i) loss += self.criterion(score, y) predictions.append(preds) loss.backward() self.update_params() predictions = torch.cat(predictions, 1) if random.random() < 0.1: # sometimes output a prediction for debugging # print('prediction:', ' '.join(output_lines[0])) # print('label:', self.v2t(ys.data[0])) print('lm' if lm else ' ', 'loss:', loss.data[0]) return predictions def _decode_only(self, batchsize, xes, ys, encoder_output, hidden, attn_mask): """Just produce a prediction without training the model.""" done = [False for _ in range(batchsize)] total_done = 0 max_len = 0 predictions = [] # generate a response from scratch while (total_done < batchsize) and max_len < self.longest_label: # keep producing tokens until we hit END or max length for each # example in the batch if self.attention == 'none': output = xes else: h_att = hidden[0] if type(self.decoder) == nn.LSTM else hidden output = self._apply_attention(xes, encoder_output, h_att, attn_mask) output, hidden = self.decoder(output, hidden) preds, _scores = self.hidden_to_idx(output, is_training=False) predictions.append(preds) xes = self.dec_lt(preds) max_len += 1 for b in range(batchsize): if not done[b]: # only add more tokens for examples that aren't done yet if preds.data[b][0] == self.END_IDX: # if we produced END, we're done done[b] = True total_done += 1 predictions = torch.cat(predictions, 1) if random.random() < 0.2: # sometimes output a prediction for debugging print('\nprediction:', self.v2t(predictions.data[0])) return predictions def _score_candidates(self, cands, cand_inds, start, encoder_output, hidden, attn_mask): """Rank candidates by their likelihood according to the decoder.""" if type(self.decoder) == nn.LSTM: hidden, cell = hidden # score each candidate separately # cands are exs_with_cands x cands_per_ex x words_per_cand # cview is total_cands x words_per_cand cview = cands.view(-1, cands.size(2)) c_xes = start.expand(cview.size(0), start.size(0), start.size(1)) if 
len(cand_inds) != hidden.size(1): # only use hidden state from inputs with associated candidates cand_indices = torch.LongTensor([i for i, _, _ in cand_inds]) if self.use_cuda: cand_indices = cand_indices.cuda() cand_indices = Variable(cand_indices) hidden = hidden.index_select(1, cand_indices) sz = hidden.size() cands_hn = (hidden.view(sz[0], sz[1], 1, sz[2]).expand( sz[0], sz[1], cands.size(1), sz[2]).contiguous().view(sz[0], -1, sz[2])) if type(self.decoder) == nn.LSTM: if len(cand_inds) != cell.size(1): # only use cell state from inputs with associated candidates cell = cell.index_select(1, cand_indices) cands_hn = (cands_hn, cell.view(sz[0], sz[1], 1, sz[2]).expand( sz[0], sz[1], cands.size(1), sz[2]).contiguous().view(sz[0], -1, sz[2])) cand_scores = Variable( self.cand_scores.resize_(cview.size(0)).fill_(0)) cand_lengths = Variable( self.cand_lengths.resize_(cview.size(0)).fill_(0)) if self.attention != 'none': # using attention sz = encoder_output.size() cands_encoder_output = (encoder_output.contiguous().view( sz[0], 1, sz[1], sz[2]).expand(sz[0], cands.size(1), sz[1], sz[2]).contiguous().view(-1, sz[1], sz[2])) msz = attn_mask.size() cands_attn_mask = (attn_mask.contiguous().view( msz[0], 1, msz[1]).expand(msz[0], cands.size(1), msz[1]).contiguous().view(-1, msz[1])) for i in range(cview.size(1)): # process one token at a time h_att = cands_hn[0] if type( self.decoder) == nn.LSTM else cands_hn output = self._apply_attention(c_xes, cands_encoder_output, h_att, cands_attn_mask) output, cands_hn = self.decoder(output, cands_hn) _preds, scores = self.hidden_to_idx(output, is_training=False) cs = cview.select(1, i) non_nulls = cs.ne(self.NULL_IDX) cand_lengths += non_nulls.long() score_per_cand = torch.gather(scores.select(1, i), 1, cs.unsqueeze(1)) cand_scores += score_per_cand.squeeze() * non_nulls.float() c_xes = self.dec_lt(cs).unsqueeze(1) else: # process entire sequence at once if cview.size(1) > 1: # feed in START + cands[:-2] cands_in = cview.narrow(1, 0, cview.size(1) - 1) c_xes = torch.cat([c_xes, self.dec_lt(cands_in)], 1) output, cands_hn = self.decoder(c_xes, cands_hn) _preds, scores = self.hidden_to_idx(output, is_training=False) for i in range(cview.size(1)): # calculate score at each token cs = cview.select(1, i) non_nulls = cs.ne(self.NULL_IDX) cand_lengths += non_nulls.long() score_per_cand = torch.gather(scores.select(1, i), 1, cs.unsqueeze(1)) cand_scores += score_per_cand.squeeze() * non_nulls.float() # set empty scores to -1, so when divided by 0 they become -inf cand_scores -= cand_lengths.eq(0).float() # average the scores per token cand_scores /= cand_lengths.float() cand_scores = cand_scores.view(cands.size(0), cands.size(1)) srtd_scores, text_cand_inds = cand_scores.sort(1, True) text_cand_inds = text_cand_inds.data return text_cand_inds def predict(self, xs, ys=None, cands=None, valid_cands=None, lm=False): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. 
""" batchsize = len(xs) text_cand_inds = None is_training = ys is not None self.encoder.train(mode=is_training) self.decoder.train(mode=is_training) encoder_output, hidden = self._encode(xs, is_training) # next we use START as an input to kick off our decoder if not lm: x = Variable(self.START_TENSOR, requires_grad=False) xe = self.dec_lt(x) xe = F.dropout(xe, p=self.dropout, training=is_training) xes = xe.expand(batchsize, 1, xe.size(1)) else: # during language_model mode, just start with zeros xes = Variable(self.zeros[0].narrow(1, 0, self.emb_size).unsqueeze(1), requires_grad=False) if self.attention == 'none': attn_mask = None else: attn_mask = xs.ne(0).float() if is_training: predictions = self._decode_and_train(batchsize, xes, ys, encoder_output, hidden, attn_mask, lm=lm) else: if cands is not None: text_cand_inds = self._score_candidates( cands, valid_cands, xe, encoder_output, hidden, attn_mask) predictions = self._decode_only(batchsize, xes, ys, encoder_output, hidden, attn_mask) return predictions, text_cand_inds def batchify(self, observations): """Convert a list of observations into input & target tensors.""" def valid(obs): # check if this is an example our model should actually process return 'text' in obs and ('labels' in obs or 'eval_labels' in obs) # valid examples and their indices valid_inds, exs = zip(*[(i, ex) for i, ex in enumerate(observations) if valid(ex)]) # set up the input tensors batchsize = len(exs) if batchsize == 0: return None, None, None, None, None, None # `x` text is already tokenized and truncated parsed = [ex['text'] for ex in exs] x_lens = [len(x) for x in parsed] ind_sorted = sorted(range(len(x_lens)), key=lambda k: -x_lens[k]) exs = [exs[k] for k in ind_sorted] valid_inds = [valid_inds[k] for k in ind_sorted] parsed = [parsed[k] for k in ind_sorted] max_x_len = max([len(x) for x in parsed]) xs = torch.LongTensor(batchsize, max_x_len).fill_(self.NULL_IDX) # right-padded with zeros for i, x in enumerate(parsed): for j, idx in enumerate(x): xs[i][j] = idx if self.use_cuda: # copy to gpu self.xs.resize_(xs.size()) self.xs.copy_(xs, async=True) xs = Variable(self.xs) else: xs = Variable(xs) # set up the target tensors ys = None labels = None if any(['labels' in ex for ex in exs]): # randomly select one of the labels to update on, if multiple # append END to each label labels = [random.choice(ex.get('labels', [''])) for ex in exs] parsed = [self.parse(y + ' ' + self.END) for y in labels if y] max_y_len = max(len(y) for y in parsed) if self.truncate > 0: # shrink ys to to limit batch computation max_y_len = min(max_y_len, self.truncate) parsed = [y[:max_y_len] for y in parsed] ys = torch.LongTensor(batchsize, max_y_len).fill_(self.NULL_IDX) for i, y in enumerate(parsed): for j, idx in enumerate(y): ys[i][j] = idx if self.use_cuda: # copy to gpu self.ys.resize_(ys.size()) self.ys.copy_(ys, async=True) ys = Variable(self.ys) else: ys = Variable(ys) # set up candidates cands = None valid_cands = None if ys is None and self.rank: # only do ranking when no targets available and ranking flag set parsed = [] valid_cands = [] for i, v in enumerate(valid_inds): if 'label_candidates' in observations[i]: # each candidate tuple is a pair of the parsed version and # the original full string cs = list(observations[i]['label_candidates']) parsed.append([self.parse(c) for c in cs]) valid_cands.append((i, v, cs)) if len(parsed) > 0: # TODO: store lengths of cands separately, so don't have zero # padding for varying number of cands per example # found cands, pack them into 
tensor max_c_len = max(max(len(c) for c in cs) for cs in parsed) max_c_cnt = max(len(cs) for cs in parsed) cands = torch.LongTensor(len(parsed), max_c_cnt, max_c_len).fill_(self.NULL_IDX) for i, cs in enumerate(parsed): for j, c in enumerate(cs): for k, idx in enumerate(c): cands[i][j][k] = idx if self.use_cuda: # copy to gpu self.cands.resize_(cands.size()) self.cands.copy_(cands, async=True) cands = Variable(self.cands) else: cands = Variable(cands) return xs, ys, labels, valid_inds, cands, valid_cands def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, labels, valid_inds, cands, valid_cands = self.batchify( observations) if xs is None: # no valid examples, just return empty responses return batch_reply # produce predictions, train on targets if available predictions, text_cand_inds = self.predict(xs, ys, cands, valid_cands) if self.lm and ys is not None: # also train on lm task: given [START], predict [x y] # (regular task is given [x START] produce [y]) new_obs = [ { 'text': [self.START_IDX], 'labels': [ '{x} {s} {y}'.format( x=self.v2t(obs['text'][1:]), # skip START token s=self.START, y=random.choice(obs.get('labels', ['']))) ] } for obs in observations ] xs, ys, _, _, _, _ = self.batchify(new_obs) _, _ = self.predict(xs, ys, lm=True) predictions = predictions.cpu() for i in range(len(predictions)): # map the predictions back to non-empty examples in the batch # we join with spaces since we produce tokens one at a time curr = batch_reply[valid_inds[i]] output_tokens = [] for c in predictions.data[i]: if c == self.END_IDX or c == self.NULL_IDX: break else: output_tokens.append(c) curr_pred = self.v2t(output_tokens) curr['text'] = curr_pred if labels is not None: y = [] for c in ys.data[i]: if c == self.END_IDX or c == self.NULL_IDX: break else: y.append(c) self.answers[valid_inds[i]] = y else: self.answers[valid_inds[i]] = output_tokens if text_cand_inds is not None: for i in range(len(valid_cands)): order = text_cand_inds[i] _, batch_idx, curr_cands = valid_cands[i] curr = batch_reply[valid_inds[batch_idx]] curr['text_candidates'] = [ curr_cands[idx] for idx in order if idx < len(curr_cands) ] return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'optims'): model = {} model['enc_lt'] = self.enc_lt.state_dict() if self.opt['lookuptable'] not in ['enc_dec', 'all']: # dec_lt is enc_lt raise RuntimeError() # model['dec_lt'] = self.dec_lt.state_dict() if self.opt['decoder'] != 'shared': model['encoder'] = self.encoder.state_dict() model['decoder'] = self.decoder.state_dict() model['h2e'] = self.h2e.state_dict() model['e2o'] = self.e2o.state_dict() model['optims'] = { k: v.state_dict() for k, v in self.optims.items() } model['longest_label'] = self.longest_label model['opt'] = self.opt for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): model[attn_name] = getattr(self, attn_name).state_dict() with open(path, 'wb') as write: torch.save(model, write) def 
shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return opt and model states.""" with open(path, 'rb') as read: model = torch.load(read) return model['opt'], model def set_states(self, states): """Set the state dicts of the modules from saved states.""" self.enc_lt.load_state_dict(states['enc_lt']) if self.opt['lookuptable'] not in ['enc_dec', 'all']: # dec_lt is enc_lt raise RuntimeError( 'dec_lt state should not exist--it is same as enc_lt.') if self.opt['decoder'] != 'shared': self.encoder.load_state_dict(states['encoder']) self.decoder.load_state_dict(states['decoder']) self.h2e.load_state_dict(states['h2e']) self.e2o.load_state_dict(states['e2o']) for attn_name in ['attn', 'attn_v', 'attn_combine']: if attn_name in states: getattr(self, attn_name).load_state_dict(states[attn_name]) for k, v in states['optims'].items(): self.optims[k].load_state_dict(v) self.longest_label = states['longest_label']
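# A note on the ranking scheme implemented by _score_candidates above: each
# candidate's per-token scores are summed with NULL padding masked out, then
# divided by the candidate's length, so the ranking compares mean token
# likelihood rather than favoring short candidates. A minimal standalone
# sketch of the same idea -- `rank_candidates`, `log_probs`, and `cands` are
# hypothetical names for illustration, not part of the agent:
import torch

def rank_candidates(log_probs, cands, null_idx=0):
    """Sort candidates by mean token log-probability.

    log_probs: FloatTensor of shape (num_cands, seq_len, vocab_size)
    cands: LongTensor of shape (num_cands, seq_len), NULL-padded
    """
    # pick out the score the model assigned to each reference token
    token_scores = log_probs.gather(2, cands.unsqueeze(2)).squeeze(2)
    mask = cands.ne(null_idx).float()  # 1 for real tokens, 0 for padding
    lengths = mask.sum(dim=1)
    scores = (token_scores * mask).sum(dim=1) / lengths.clamp(min=1)
    scores = scores - lengths.eq(0).float() * 1e20  # empty cands rank last
    _sorted_scores, indices = scores.sort(descending=True)
    return indices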
def main(): # Get command line arguments argparser = ParlaiParser() DictionaryAgent.add_cmdline_args(argparser) ParsedRemoteAgent.add_cmdline_args(argparser) argparser.add_argument('--num-examples', default=1000, type=int) argparser.add_argument('--num-its', default=100, type=int) argparser.add_argument('--dict-max-exs', default=10000, type=int) parlai_home = os.environ['PARLAI_HOME'] if '--remote-cmd' not in sys.argv: if os.system('which luajit') != 0: raise RuntimeError('Could not detect torch luajit installed: ' + 'please install torch from http://torch.ch ' + 'or manually set --remote-cmd for this example.') sys.argv.append('--remote-cmd') sys.argv.append('luajit {}/parlai/agents/'.format(parlai_home) + 'memnn_luatorch_cpu/memnn_zmq_parsed.lua') if '--remote-args' not in sys.argv: sys.argv.append('--remote-args') sys.argv.append('{}/examples/'.format(parlai_home) + 'memnn_luatorch_cpu/params_default.lua') opt = argparser.parse_args() # set up dictionary print('Setting up dictionary.') dictionary = DictionaryAgent(opt) if not opt.get('dict_file'): # build dictionary since we didn't load it ordered_opt = copy.deepcopy(opt) ordered_opt['datatype'] = 'train:ordered' ordered_opt['numthreads'] = 1 world_dict = create_task(ordered_opt, dictionary) print('Dictionary building on training data.') cnt = 0 # pass examples to dictionary for _ in world_dict: cnt += 1 if cnt > opt['dict_max_exs'] and opt['dict_max_exs'] > 0: print('Processed {} exs, moving on.'.format( opt['dict_max_exs'])) # don't wait too long... break world_dict.parley() # we need to save the dictionary to load it in memnn (sort it by freq) dictionary.sort() dictionary.save('/tmp/dict.txt', sort=True) print('Dictionary ready, moving on to training.') opt['datatype'] = 'train' agent = ParsedRemoteAgent(opt, {'dictionary_shared': dictionary.share()}) world_train = create_task(opt, agent) opt['datatype'] = 'valid' world_valid = create_task(opt, agent) start = time.time() with world_train: for _ in range(opt['num_its']): print('[ training ]') for _ in range(opt['num_examples'] * opt.get('numthreads', 1)): world_train.parley() world_train.synchronize() print('[ validating ]') world_valid.reset() for _ in world_valid: # check valid accuracy world_valid.parley() print('[ validation summary. ]') report_valid = world_valid.report() print(report_valid) if report_valid['accuracy'] > 0.95: break # show some example dialogs after training: world_valid = create_task(opt, agent) for _k in range(3): world_valid.parley() print(world_valid.display()) print('finished in {} s'.format(round(time.time() - start, 2)))
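# The example above uses the usual ParlAI loop: parley() over the training
# world for a fixed number of examples, then sweep the validation world and
# stop early once accuracy clears a threshold. A stripped-down sketch of just
# that skeleton (assumes `create_task` is imported as in main();
# `train_until` is a hypothetical helper, not part of the example):
def train_until(opt, agent, max_its=100, num_exs=1000, stop_acc=0.95):
    opt['datatype'] = 'train'
    world_train = create_task(opt, agent)
    opt['datatype'] = 'valid'
    world_valid = create_task(opt, agent)
    report = None
    for _ in range(max_its):
        for _ in range(num_exs):
            world_train.parley()  # process one training example
        world_valid.reset()
        for _ in world_valid:  # one full pass over the validation set
            world_valid.parley()
        report = world_valid.report()
        if report.get('accuracy', 0) > stop_acc:
            break  # good enough, stop training
    return report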
def add_cmdline_args(parser):
    DictionaryAgent.add_cmdline_args(parser)
    parser.add_argument(
        '-lp', '--length_penalty', type=float, default=0.5,
        help='length penalty for responses')
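# For context on --length_penalty: ranking agents commonly rescale a
# candidate's score by a power of its length so that longer responses are
# not unduly punished (or favored). The exact formula this agent uses is not
# shown here; the helper below is a hedged sketch of one common choice
# (`apply_length_penalty` is a hypothetical name):
def apply_length_penalty(total_logprob, num_tokens, length_penalty=0.5):
    """Length-normalize a summed log-probability.

    length_penalty=0 keeps the raw sum; 1.0 is a per-token average;
    values in between trade off the two.
    """
    return total_logprob / (max(num_tokens, 1) ** length_penalty)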
class Seq2seqAgent(Agent): """Simple agent which uses an LSTM to process incoming text observations.""" @staticmethod def add_cmdline_args(argparser): DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('-hs', '--hiddensize', type=int, default=64, help='size of the hidden layers and embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.5, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') def __init__(self, opt, shared=None): super().__init__(opt, shared) opt['cuda'] = not opt['no_cuda'] and torch.cuda.is_available() if opt['cuda']: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) if not shared: # don't enter this loop for shared (ie batch) instantiations self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' hsz = opt['hiddensize'] self.EOS = self.dict.eos_token self.observation = {'text': self.EOS, 'episode_done': True} self.EOS_TENSOR = torch.LongTensor(self.dict.parse(self.EOS)) self.hidden_size = hsz self.num_layers = opt['numlayers'] self.learning_rate = opt['learningrate'] self.use_cuda = opt.get('cuda', False) self.longest_label = 1 self.criterion = nn.NLLLoss() self.lt = nn.Embedding(len(self.dict), hsz, padding_idx=0, scale_grad_by_freq=True) self.encoder = nn.GRU(hsz, hsz, opt['numlayers']) self.decoder = nn.GRU(hsz, hsz, opt['numlayers']) self.d2o = nn.Linear(hsz, len(self.dict)) self.dropout = nn.Dropout(opt['dropout']) self.softmax = nn.LogSoftmax() lr = opt['learningrate'] self.optims = { 'lt': optim.SGD(self.lt.parameters(), lr=lr), 'encoder': optim.SGD(self.encoder.parameters(), lr=lr), 'decoder': optim.SGD(self.decoder.parameters(), lr=lr), 'd2o': optim.SGD(self.d2o.parameters(), lr=lr), } if self.use_cuda: self.cuda() if opt.get('model_file') and os.path.isfile(opt['model_file']): print('Loading existing model parameters from ' + opt['model_file']) self.load(opt['model_file']) self.episode_done = True def parse(self, text): return torch.LongTensor(self.dict.txt2vec(text)) def v2t(self, vec): return self.dict.vec2txt(vec) def cuda(self): self.criterion.cuda() self.lt.cuda() self.encoder.cuda() self.decoder.cuda() self.d2o.cuda() self.dropout.cuda() self.softmax.cuda() def hidden_to_idx(self, hidden, drop=False): if hidden.size(0) > 1: raise RuntimeError('bad dimensions of tensor:', hidden) hidden = hidden.squeeze(0) scores = self.d2o(hidden) if drop: scores = self.dropout(scores) scores = self.softmax(scores) _max_score, idx = scores.max(1) return idx, scores def zero_grad(self): for optimizer in self.optims.values(): optimizer.zero_grad() def update_params(self): for optimizer in self.optims.values(): optimizer.step() def init_zeros(self, bsz=1): t = torch.zeros(self.num_layers, bsz, self.hidden_size) if self.use_cuda: t = t.cuda(async=True) return Variable(t) def init_rand(self, bsz=1): t = torch.FloatTensor(self.num_layers, bsz, self.hidden_size) t.uniform_(0.05) if self.use_cuda: t = t.cuda(async=True) return Variable(t) def observe(self, observation): observation = copy.deepcopy(observation) if not self.episode_done: # if the last example wasn't the end of an episode, then we need to # recall what was said in that example prev_dialogue 
= self.observation['text'] observation['text'] = prev_dialogue + '\n' + observation['text'] self.observation = observation self.episode_done = observation['episode_done'] return observation def update(self, xs, ys): batchsize = len(xs) # first encode context xes = self.lt(xs).t() h0 = self.init_zeros(batchsize) _output, hn = self.encoder(xes, h0) # start with EOS tensor for all x = self.EOS_TENSOR if self.use_cuda: x = x.cuda(async=True) x = Variable(x) xe = self.lt(x).unsqueeze(1) xes = xe.expand(xe.size(0), batchsize, xe.size(2)) output_lines = [[] for _ in range(batchsize)] self.zero_grad() # update model loss = 0 self.longest_label = max(self.longest_label, ys.size(1)) for i in range(ys.size(1)): output, hn = self.decoder(xes, hn) preds, scores = self.hidden_to_idx(output, drop=True) y = ys.select(1, i) loss += self.criterion(scores, y) # use the true token as the next input xes = self.lt(y).unsqueeze(0) # hn = self.dropout(hn) for j in range(preds.size(0)): token = self.v2t([preds.data[j][0]]) output_lines[j].append(token) loss.backward() self.update_params() if random.random() < 0.1: true = self.v2t(ys.data[0]) #print('loss:', round(loss.data[0], 2), # ' '.join(output_lines[0]), '(true: {})'.format(true)) return output_lines def predict(self, xs): batchsize = len(xs) # first encode context xes = self.lt(xs).t() h0 = self.init_zeros(batchsize) _output, hn = self.encoder(xes, h0) # start with EOS tensor for all x = self.EOS_TENSOR if self.use_cuda: x = x.cuda(async=True) x = Variable(x) xe = self.lt(x).unsqueeze(1) xes = xe.expand(xe.size(0), batchsize, xe.size(2)) done = [False for _ in range(batchsize)] total_done = 0 max_len = 0 output_lines = [[] for _ in range(batchsize)] while(total_done < batchsize) and max_len < self.longest_label: output, hn = self.decoder(xes, hn) preds, scores = self.hidden_to_idx(output, drop=False) xes = self.lt(preds.t()) max_len += 1 for i in range(preds.size(0)): if not done[i]: token = self.v2t(preds.data[i]) if token == self.EOS: done[i] = True total_done += 1 else: output_lines[i].append(token) if random.random() < 0.1: print('prediction:', ' '.join(output_lines[0])) return output_lines def batchify(self, obs): exs = [ex for ex in obs if 'text' in ex] valid_inds = [i for i, ex in enumerate(obs) if 'text' in ex] batchsize = len(exs) parsed = [self.parse(ex['text']) for ex in exs] max_x_len = max([len(x) for x in parsed]) xs = torch.LongTensor(batchsize, max_x_len).fill_(0) for i, x in enumerate(parsed): offset = max_x_len - len(x) for j, idx in enumerate(x): xs[i][j + offset] = idx if self.use_cuda: xs = xs.cuda(async=True) xs = Variable(xs) ys = None if 'labels' in exs[0]: labels = [random.choice(ex['labels']) + ' ' + self.EOS for ex in exs] parsed = [self.parse(y) for y in labels] max_y_len = max(len(y) for y in parsed) ys = torch.LongTensor(batchsize, max_y_len).fill_(0) for i, y in enumerate(parsed): for j, idx in enumerate(y): ys[i][j] = idx if self.use_cuda: ys = ys.cuda(async=True) ys = Variable(ys) return xs, ys, valid_inds def batch_act(self, observations): batchsize = len(observations) batch_reply = [{'id': self.getID()} for _ in range(batchsize)] xs, ys, valid_inds = self.batchify(observations) if len(xs) == 0: return batch_reply # Either train or predict if ys is not None: predictions = self.update(xs, ys) else: predictions = self.predict(xs) for i in range(len(predictions)): batch_reply[valid_inds[i]]['text'] = ' '.join( c for c in predictions[i] if c != self.EOS) return batch_reply def act(self): return 
self.batch_act([self.observation])[0] def save(self, path=None): path = self.opt.get('model_file', None) if path is None else path if path: model = {} model['lt'] = self.lt.state_dict() model['encoder'] = self.encoder.state_dict() model['decoder'] = self.decoder.state_dict() model['d2o'] = self.d2o.state_dict() model['longest_label'] = self.longest_label with open(path, 'wb') as write: torch.save(model, write) def load(self, path): with open(path, 'rb') as read: model = torch.load(read) self.lt.load_state_dict(model['lt']) self.encoder.load_state_dict(model['encoder']) self.decoder.load_state_dict(model['decoder']) self.d2o.load_state_dict(model['d2o']) self.longest_label = model['longest_label']
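# One detail of the agent above worth noting: batchify() left-pads its
# inputs (tokens are written at offset max_x_len - len(x)), so every row
# ends with a real token right where the encoder finishes reading. A minimal
# sketch of that padding scheme (`left_pad` is a hypothetical helper; pad
# index 0, as in the code above):
import torch

def left_pad(sequences, pad_idx=0):
    """Pack lists of token ids into a right-aligned LongTensor."""
    max_len = max(len(s) for s in sequences)
    out = torch.LongTensor(len(sequences), max_len).fill_(pad_idx)
    for i, seq in enumerate(sequences):
        offset = max_len - len(seq)
        for j, idx in enumerate(seq):
            out[i][j + offset] = idx
    return out

# left_pad([[5, 6, 7], [8]]) gives [[5, 6, 7], [0, 0, 8]]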
class Seq2seqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. This model supports encoding the input and decoding the output via one of several flavors of RNN. It then uses a linear layer (whose weights can be shared with the embedding layer) to convert RNN output states into output tokens. This model currently uses greedy decoding, selecting the highest probability token at each time step. For more information, see Sequence to Sequence Learning with Neural Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_. """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('--init-model', type=str, default=None, help='load dict/features/weights/opts from this file') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=1, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-clip', '--gradient-clip', type=float, default=-1, help='gradient clipping using l2 norm') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', choices=['none', 'concat', 'general', 'dot', 'local'], help='Choices: none, concat, general, local. ' 'If set local, also set attention-length. ' 'For more details see: ' 'https://arxiv.org/abs/1508.04025') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--attention-time', default='post', choices=['pre', 'post'], help='Whether to apply attention before or after ' 'decoding.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='rank candidates if available. this is done by' ' computing the mean score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length. This ' 'reduces the total amount ' 'of padding in the batches.') agent.add_argument('-rnn', '--rnn-class', default='lstm', choices=Seq2seq.RNN_OPTS.keys(), help='Choose between different types of RNNs.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'], help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. 
' 'Note that shared disabled some encoder ' 'options--in particular, bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='unique', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. ' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='sgd', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-mom', '--momentum', default=-1, type=float, help='if applicable, momentum value for optimizer. ' 'if > 0, sgd uses nesterov momentum.') agent.add_argument('-emb', '--embedding-type', default='random', choices=['random', 'glove', 'glove-fixed', 'fasttext', 'fasttext-fixed'], help='Choose between different strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove or ' 'Fasttext.' 'Preinitialized embeddings can also be fixed ' 'so they are not updated during training.') agent.add_argument('-rf', '--report-freq', type=float, default=0.001, help='Report frequency of prediction during eval.') Seq2seqAgent.dictionary_class().add_cmdline_args(argparser) return agent def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = {'loss': 0.0, 'num_tokens': 0} self.history = {} self.report_freq = opt.get('report_freq', 0.001) states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if opt.get('numthreads') > 1: torch.set_num_threads(1) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.metrics = shared['metrics'] states = shared['states'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file' elif opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] else: init_model = None if init_model is not None: # load model parameters if available print('[ Loading existing model params from {} ]'.format(init_model)) new_opt, states = self.load(init_model) # override model-specific options with stored ones opt = self.override_opt(new_opt) self.opt = opt if opt['dict_file'] is None: if init_model is not None and os.path.isfile(init_model + '.dict'): # check first to see if a dictionary exists opt['dict_file'] = init_model + '.dict' elif opt.get('model_file'): # otherwise, set default dict-file if 
it is not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] if not hasattr(self, 'model_class'): # this allows child classes to override this but inherit init self.model_class = Seq2seq self.model = self.model_class( opt, len(self.dict), padding_idx=self.NULL_IDX, start_idx=self.START_IDX, end_idx=self.END_IDX, longest_label=states.get('longest_label', 1)) if opt['embedding_type'] != 'random': # set up preinitialized embeddings try: import torchtext.vocab as vocab except ModuleNotFoundError as ex: print('Please install torch text with `pip install torchtext`') raise ex if opt['embedding_type'].startswith('glove'): init = 'glove' embs = vocab.GloVe(name='840B', dim=300, cache=os.path.join(opt['parlai_home'], '.vector_cache')) elif opt['embedding_type'].startswith('fasttext'): init = 'fasttext' embs = vocab.FastText(language='en', cache=os.path.join(opt['parlai_home'], '.vector_cache')) else: raise RuntimeError('embedding type not implemented') if opt['embeddingsize'] != 300: rp = torch.Tensor(300, opt['embeddingsize']).normal_() t = lambda x: torch.mm(x.unsqueeze(0), rp) else: t = lambda x: x cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = t(embs.vectors[embs.stoi[w]]) self.model.decoder.lt.weight.data[i] = vec cnt += 1 if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self.model.encoder.lt.weight.data[i] = vec print('Seq2seq: initialized embeddings for {} tokens from {}.' 
''.format(cnt, init)) if states: # set loaded states if applicable self.model.load_state_dict(states['model']) if self.use_cuda: self.model.cuda() if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt.get('gradient_clip', -1) self.rank = opt['rank_candidates'] # set up tensors once self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) if self.rank: self.cands = torch.LongTensor(1, 1, 1) # set up criteria self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: # push to cuda self.xs = self.xs.cuda() self.ys = self.ys.cuda() if self.rank: self.cands = self.cands.cuda() self.criterion.cuda() # set up optimizer lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt.get('momentum') > 0 and opt['optimizer'] in ['sgd', 'rmsprop']: kwargs['momentum'] = opt['momentum'] if opt['optimizer'] == 'sgd': kwargs['nesterov'] = True if opt['embedding_type'].endswith('fixed'): print('Seq2seq: fixing embedding weights.') self.model.decoder.lt.weight.requires_grad = False self.model.encoder.lt.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False self.optimizer = optim_class([p for p in self.model.parameters() if p.requires_grad], **kwargs) if states.get('optimizer'): if states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: self.optimizer.load_state_dict(states['optimizer']) if self.use_cuda: for state in self.optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overriden key. Only override args specific to the model. 
""" model_args = {'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'encoder', 'decoder', 'lookuptable', 'attention', 'attention_length', 'rnn_class'} for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('[ Adding new option: | {k}: {v} | ]'.format(k=k, v=v)) elif self.opt[k] != v: print('[ Overriding option: | {k}: {old} => {v} | ]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v if 'dict_file' in new_opt and not self.opt.get('dict_file'): print('[ No dictionary path detected, trying to load previous ' 'path {} ]'.format(new_opt['dict_file'])) self.opt['dict_file'] = new_opt['dict_file'] return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def v2t(self, vec): """Convert token indices to string of tokens.""" if isinstance(vec, Variable): vec = vec.data new_vec = [] for i in vec: if i == self.END_IDX: break elif i != self.START_IDX: new_vec.append(i) return self.dict.vec2txt(new_vec) def zero_grad(self): """Zero out optimizer.""" self.optimizer.zero_grad() def update_params(self): """Do one optimization step.""" if self.clip > 0: torch.nn.utils.clip_grad_norm(self.model.parameters(), self.clip) self.optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.history.clear() self.reset_metrics() def reset_metrics(self): self.metrics['loss'] = 0.0 self.metrics['num_tokens'] = 0 def report(self): m = {} if self.metrics['num_tokens'] > 0: m['loss'] = self.metrics['loss'] / self.metrics['num_tokens'] m['ppl'] = math.exp(m['loss']) for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats m[k] = round_sigfigs(v, 4) return m def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['opt'] = self.opt shared['answers'] = self.answers shared['dict'] = self.dict shared['START_IDX'] = self.START_IDX shared['END_IDX'] = self.END_IDX shared['NULL_IDX'] = self.NULL_IDX if self.opt.get('numthreads', 1) > 1: if type(self.metrics) == dict: self.metrics = SharedTable(self.metrics) self.model.share_memory() shared['metrics'] = self.metrics shared['model'] = self.model shared['states'] = { # only need to pass optimizer states 'optimizer': self.optimizer.state_dict(), 'optimizer_type': self.opt['optimizer'], } return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. """ # shallow copy observation (deep copy can be expensive) obs = observation.copy() batch_idx = self.opt.get('batchindex', 0) if not obs.get('preprocessed', False) or 'text2vec' not in obs: obs['text2vec'] = maintain_dialog_history( self.history, obs, reply=self.answers[batch_idx], historyLength=self.truncate, useReplies=self.opt['include_labels'], dict=self.dict, useStartEndIndices=False) else: obs['text2vec'] = deque(obs['text2vec'], maxlen=self.truncate) self.observation = obs self.answers[batch_idx] = None return obs def predict(self, xs, ys=None, cands=None, valid_cands=None, is_training=False): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. 
""" text_cand_inds, loss_dict = None, None if is_training: self.model.train() self.zero_grad() out = self.model(xs, ys) predictions, scores = out[0], out[1] loss = self.criterion(scores.view(-1, scores.size(-1)), ys.view(-1)) # save loss to metrics target_tokens = ys.ne(self.NULL_IDX).long().sum().data[0] self.metrics['loss'] += loss.double().data[0] self.metrics['num_tokens'] += target_tokens loss /= target_tokens # average loss per token # loss /= xs.size(0) # average loss per sentence loss.backward() self.update_params() else: self.model.eval() out = self.model(xs, ys=None, cands=cands, valid_cands=valid_cands) predictions, text_cand_inds = out[0], out[2] if ys is not None: # calculate loss on targets out = self.model(xs, ys) scores = out[1] loss = self.criterion(scores.view(-1, scores.size(-1)), ys.view(-1)) target_tokens = ys.ne(self.NULL_IDX).long().sum().data[0] self.metrics['loss'] += loss.double().data[0] self.metrics['num_tokens'] += target_tokens return predictions, text_cand_inds def vectorize(self, observations): """Convert a list of observations into input & target tensors.""" is_training = any(['labels' in obs for obs in observations]) xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text( observations, self.dict, end_idx=self.END_IDX, null_idx=self.NULL_IDX, dq=True, eval_labels=True, truncate=self.truncate) if xs is None: return None, None, None, None, None, None, None xs = torch.LongTensor(xs) if ys is not None: ys = torch.LongTensor(ys) if self.use_cuda: # copy to gpu self.xs.resize_(xs.size()) self.xs.copy_(xs) xs = Variable(self.xs) if ys is not None: self.ys.resize_(ys.size()) self.ys.copy_(ys) ys = Variable(self.ys) else: xs = Variable(xs) if ys is not None: ys = Variable(ys) # set up candidates cands = None valid_cands = None if not is_training and self.rank: # only do ranking when no targets available and ranking flag set parsed_cs = [] valid_cands = [] for i, v in enumerate(valid_inds): if 'label_candidates' in observations[v]: # each candidate tuple is a pair of the parsed version and # the original full string cs = list(observations[v]['label_candidates']) curr_dqs = [deque(maxlen=self.truncate) for _ in cs] for dq, c in zip(curr_dqs, cs): dq.extendleft(reversed(self.parse(c))) parsed_cs.append(curr_dqs) valid_cands.append((i, v, cs)) if len(parsed_cs) > 0: # TODO: store lengths of cands separately, so don't have zero # padding for varying number of cands per example # found cands, pack them into tensor max_c_len = max(max(len(c) for c in cs) for cs in parsed_cs) max_c_cnt = max(len(cs) for cs in parsed_cs) for cs in parsed_cs: for c in cs: c += [self.NULL_IDX] * (max_c_len - len(c)) cs += [[self.NULL_IDX] * max_c_len] * (max_c_cnt - len(cs)) cands = torch.LongTensor(parsed_cs) if self.use_cuda: # copy to gpu self.cands.resize_(cands.size()) self.cands.copy_(cands) cands = Variable(self.cands) else: cands = Variable(cands) return xs, ys, labels, valid_inds, cands, valid_cands, is_training def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. 
for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, labels, valid_inds, cands, valid_cands, is_training = self.vectorize(observations) if xs is None: # no valid examples, just return empty responses return batch_reply # produce predictions, train on targets if available predictions, text_cand_inds = self.predict(xs, ys, cands, valid_cands, is_training) if is_training: report_freq = 0 else: report_freq = self.report_freq PaddingUtils.map_predictions( predictions.cpu().data, valid_inds, batch_reply, observations, self.dict, self.END_IDX, report_freq=report_freq, labels=labels, answers=self.answers, ys=ys.data if ys is not None else None) if text_cand_inds is not None: text_cand_inds = text_cand_inds.cpu().data for i in range(len(valid_cands)): order = text_cand_inds[i] _, batch_idx, curr_cands = valid_cands[i] curr = batch_reply[batch_idx] curr['text_candidates'] = [curr_cands[idx] for idx in order if idx < len(curr_cands)] return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): model = {} model['model'] = self.model.state_dict() model['longest_label'] = self.model.longest_label model['optimizer'] = self.optimizer.state_dict() model['optimizer_type'] = self.opt['optimizer'] model['opt'] = self.opt with open(path, 'wb') as write: torch.save(model, write) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return opt and model states.""" states = torch.load(path, map_location=lambda cpu, _: cpu) return states['opt'], states def receive_metrics(self, metrics_dict): """Use the metrics to decide when to adjust LR schedule.""" if 'loss' in metrics_dict: self.scheduler.step(metrics_dict['loss'])
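# --- Illustrative sketch (standalone; not part of the agent above) ---
# report() divides the accumulated loss by the accumulated token count and
# exponentiates to get perplexity. The loss appears to be summed per token
# before the average is taken (predict() divides by target_tokens before the
# gradient step), so this mirrors that bookkeeping with made-up numbers.
import math

def _example_report(metrics):
    m = {}
    if metrics['num_tokens'] > 0:
        m['loss'] = metrics['loss'] / metrics['num_tokens']
        m['ppl'] = math.exp(m['loss'])
    return m

# e.g. 460.5 nats summed over 100 target tokens -> avg loss 4.605, ppl ~= 100
assert round(_example_report({'loss': 460.5, 'num_tokens': 100})['ppl']) == 100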
class IbmSeq2seqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. For more information, see IBM's repository at https://github.com/IBM/pytorch-seq2seq. """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } @staticmethod def dictionary_class(): return DictionaryAgent @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" IbmSeq2seqAgent.dictionary_class().add_cmdline_args(argparser) agent = argparser.add_argument_group('IBM Seq2Seq Arguments') agent.add_argument( '--init-model', type=str, default=None, help='load dict/features/weights/opts from this file') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.5, help='dropout rate') agent.add_argument('-clip', '--gradient-clip', type=float, default=-1, help='gradient clipping using l2 norm') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', type='bool', default=True, help='Enable/disable attention over encoded state.') agent.add_argument('--maxlength-in', type=int, default=50, help='Maximum input token length.') agent.add_argument('--maxlength-out', type=int, default=50, help='Maximum output token length.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length. This ' 'reduces the total amount ' 'of padding in the batches.') agent.add_argument('-rnn', '--rnncell', default='gru', help='Choose between different types of RNNs.') agent.add_argument('-opt', '--optimizer', default='adam', choices=IbmSeq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. 
' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init # all instances may need some params self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = {'loss': 0, 'num_tokens': 0} self.history = {} self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file' elif opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] else: init_model = None if init_model is not None: # load model parameters if available print('Loading existing model params from ' + init_model) new_opt, self.states = self.load(init_model) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None: if init_model is not None and os.path.isfile(init_model + '.dict'): # check first to see if a dictionary exists opt['dict_file'] = init_model + '.dict' elif opt.get('model_file'): # otherwise, set default dict-file if it is not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] encoder = EncoderRNN(len(self.dict), opt['maxlength_in'], opt['hiddensize'], dropout_p=opt['dropout'], input_dropout_p=opt['dropout'], n_layers=opt['numlayers'], rnn_cell=opt['rnncell'], bidirectional=opt['bidirectional'], variable_lengths=True) decoder = DecoderRNN( len(self.dict), opt['maxlength_out'], opt['hiddensize'] * 2 if opt['bidirectional'] else opt['hiddensize'], dropout_p=opt['dropout'], input_dropout_p=opt['dropout'], n_layers=opt['numlayers'], rnn_cell=opt['rnncell'], bidirectional=opt['bidirectional'], sos_id=self.START_IDX, eos_id=self.END_IDX, use_attention=opt['attention']) self.model = Seq2seq(encoder, decoder) if self.states: # set loaded states if applicable self.model.load_state_dict(self.states['model']) if self.use_cuda: self.model.cuda() if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt['gradient_clip'] # set up tensors once self.START = torch.LongTensor([self.START_IDX]) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) # set up criteria self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX, 
size_average=False) if self.use_cuda: # push to cuda self.START = self.START.cuda() self.xs = self.xs.cuda() self.ys = self.ys.cuda() self.criterion.cuda() # set up optimizer lr = opt['learningrate'] optim_class = IbmSeq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True self.optimizer = optim_class( [p for p in self.model.parameters() if p.requires_grad], **kwargs) if self.states: if self.states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: self.optimizer.load_state_dict(self.states['optimizer']) self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overridden key. Only override args specific to the model. """ model_args = { 'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'attention', 'maxlength_in', 'maxlength_out' } for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v} ]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def v2t(self, vec): """Convert token indices to string of tokens.""" if isinstance(vec, Variable): vec = vec.data new_vec = [] for i in vec: if i == self.END_IDX: break elif i != self.START_IDX: new_vec.append(i) return self.dict.vec2txt(new_vec) def zero_grad(self): """Zero out optimizer.""" self.optimizer.zero_grad() def update_params(self): """Do one optimization step.""" if self.clip > 0: torch.nn.utils.clip_grad_norm(self.model.parameters(), self.clip) self.optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.history.clear() self.reset_metrics() def reset_metrics(self): self.metrics.clear() self.metrics['loss'] = 0 self.metrics['num_tokens'] = 0 def report(self): m = {} if self.metrics['num_tokens'] > 0: m['loss'] = self.metrics['loss'] / self.metrics['num_tokens'] m['ppl'] = math.exp(m['loss']) for k, v in m.items(): # clean up: rounds to sigfigs and converts tensors to floats m[k] = round_sigfigs(v, 4) return m def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['answers'] = self.answers shared['dict'] = self.dict shared['START_IDX'] = self.START_IDX shared['END_IDX'] = self.END_IDX shared['NULL_IDX'] = self.NULL_IDX if self.opt.get('numthreads', 1) > 1: shared['model'] = self.model self.model.share_memory() shared['states'] = self.states return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. 
""" # shallow copy observation (deep copy can be expensive) obs = observation.copy() batch_idx = self.opt.get('batchindex', 0) if not obs.get('preprocessed', False): obs['text2vec'] = maintain_dialog_history( self.history, obs, reply=self.answers[batch_idx], historyLength=self.truncate, useReplies=self.opt['include_labels'], dict=self.dict, useStartEndIndices=False) else: obs['text2vec'] = deque(obs['text2vec'], maxlen=self.truncate) self.observation = obs self.answers[batch_idx] = None return obs def predict(self, xs, ys=None, is_training=False): """Produce a prediction from our model. Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ # import pdb; pdb.set_trace() loss_dict = None, None x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] start = Variable(self.START, requires_grad=False) starts = start.expand(len(xs), 1) if is_training: self.model.train() self.zero_grad() y_in = torch.cat([starts, ys], 1) out, hid, result = self.model(xs, x_lens, y_in, teacher_forcing_ratio=True) scores = torch.cat(out) loss = self.criterion(scores.view(-1, scores.size(-1)), ys.view(-1)) # save loss to metrics target_tokens = ys.ne(self.NULL_IDX).long().sum().data[0] self.metrics['loss'] += loss.double().data[0] self.metrics['num_tokens'] += target_tokens # average loss per token loss /= target_tokens loss.backward() self.update_params() else: self.model.eval() out, hid, result = self.model(xs, x_lens) if ys is not None: # calculate loss on targets y_in = torch.cat([starts, ys], 1) out, hid, result = self.model(xs, x_lens, y_in, teacher_forcing_ratio=False) scores = torch.cat(out) loss = self.criterion(scores.view(-1, scores.size(-1)), ys.view(-1)) target_tokens = ys.ne(self.NULL_IDX).long().sum().data[0] self.metrics['loss'] += loss.double().data[0] self.metrics['num_tokens'] += target_tokens predictions = torch.cat(result['sequence'], 1) return predictions def vectorize(self, observations): """Convert a list of observations into input & target tensors.""" is_training = any(['labels' in obs for obs in observations]) xs, ys, labels, valid_inds, _, _ = PaddingUtils.pad_text( observations, self.dict, end_idx=None, null_idx=self.NULL_IDX, dq=True, eval_labels=True, truncate=self.truncate) if xs is None: return None, None, None, None, None, None, None xs = torch.LongTensor(xs) ys = torch.LongTensor(ys) if self.use_cuda: # copy to gpu self.xs.resize_(xs.size()) self.xs.copy_(xs) xs = Variable(self.xs) if ys is not None: self.ys.resize_(ys.size()) self.ys.copy_(ys) ys = Variable(self.ys) else: xs = Variable(xs) if ys is not None: ys = Variable(ys) return xs, ys, labels, valid_inds, is_training def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. 
for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, labels, valid_inds, is_training = self.vectorize(observations) if xs is None: # no valid examples, just return empty responses return batch_reply # produce predictions, train on targets if available predictions = self.predict(xs, ys, is_training) if is_training: report_freq = 0 else: report_freq = 0.01 PaddingUtils.map_predictions(predictions, valid_inds, batch_reply, observations, self.dict, self.END_IDX, report_freq=report_freq, labels=labels, answers=self.answers, ys=ys.data if ys is not None else None) return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'model'): model = {} model['model'] = self.model.state_dict() model['optimizer'] = self.optimizer.state_dict() model['optimizer_type'] = self.opt['optimizer'] model['opt'] = self.opt with open(path, 'wb') as write: torch.save(model, write) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return opt and model states.""" with open(path, 'rb') as read: states = torch.load(read) return states['opt'], states def receive_metrics(self, metrics_dict): """Use the metrics to decide when to adjust LR schedule.""" if 'loss' in metrics_dict: self.scheduler.step(metrics_dict['loss'])
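# --- Illustrative sketch (hypothetical helper; not part of the agent) ---
# batch_act above only batches observations that carry a 'text' field, and
# valid_inds records where each one came from so that predictions can be
# mapped back onto the right slots of batch_reply. A minimal stand-in for
# that filtering step, using the example from the comment above:
def _example_valid_inds(observations):
    # keep the original index of every observation that has text to process
    return [i for i, obs in enumerate(observations) if 'text' in obs]

assert _example_valid_inds([{}, {'text': 'hello'}, {}, {}]) == [1]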
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init self.metrics = { 'loss': 0, 'num_tokens': 0, 'lmloss': 0, 'lm_num_tokens': 0 } self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() self.batchsize = opt.get('batchsize', 1) self.use_person_tokens = opt.get('person_tokens', True) self.sampling_mode = opt.get('sampling_mode', False) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] self.metrics = shared['metrics'] # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] if self.use_person_tokens: # add person1 and person2 tokens self.dict.add_to_dict(self.dict.tokenize("PERSON1")) self.dict.add_to_dict(self.dict.tokenize("PERSON2")) else: # this is not a shared instance of this class, so do full init if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) init_model = None # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file', this would override init_model if opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] # for backwards compatibility: will only be called for older models # for which .opt file does not exist if (init_model is not None and not os.path.isfile(init_model + '.opt')): new_opt = self.load_opt(init_model) # load model parameters if available print('[ Setting opt from {} ]'.format(init_model)) # since .opt file does not exist, save one for future use print("Saving opt file at:", init_model + ".opt") with open(init_model + ".opt", 'wb') as handle: pickle.dump(new_opt, handle, protocol=pickle.HIGHEST_PROTOCOL) opt = self.override_opt(new_opt) if (init_model is not None and (os.path.isfile(init_model + '.dict') or opt['dict_file'] is None)): opt['dict_file'] = init_model + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'LanguageModel' # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] if self.use_person_tokens: # add person1 and person2 tokens self.dict.add_to_dict(self.dict.tokenize("PERSON1")) self.dict.add_to_dict(self.dict.tokenize("PERSON2")) # set model self.model = RNNModel(opt, len(self.dict)) if init_model is not None: self.load(init_model) if self.use_cuda: self.model.cuda() self.next_observe = [] self.next_batch = [] self.is_training = True if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt.get('gradient_clip', 0.25) # set up criteria self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: # push to cuda self.criterion.cuda() # init hidden state self.hidden = self.model.init_hidden(self.batchsize) # init tensor of end tokens self.ends = torch.LongTensor( [self.END_IDX for _ in range(self.batchsize)]) if self.use_cuda: self.ends = self.ends.cuda() # set up model and learning rate scheduler parameters self.lr = opt['learningrate'] self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr) self.best_val_loss = self.states.get('best_val_loss', None) self.lr_factor = opt['lr_factor'] if self.lr_factor < 1.0: self.lr_patience = opt['lr_patience'] self.lr_min = opt['lr_minimum'] self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, factor=self.lr_factor, verbose=True, patience=self.lr_patience, min_lr=self.lr_min) # initial step for scheduler if self.best_val_loss is initialized if self.best_val_loss is not None: self.scheduler.step(self.best_val_loss) else: self.scheduler = None self.reset()
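# --- Illustrative sketch (standalone, with assumed toy values) ---
# The scheduler wired up above multiplies the learning rate by `factor`
# once the validation loss passed to step() stops improving for `patience`
# consecutive checks, bottoming out at min_lr. A tiny demonstration:
import torch

_params = [torch.nn.Parameter(torch.zeros(1))]
_optimizer = torch.optim.SGD(_params, lr=1.0)
_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    _optimizer, factor=0.5, patience=1, min_lr=0.1)
for _val_loss in [2.0, 2.0, 2.0, 2.0]:  # a plateaued validation loss
    _scheduler.step(_val_loss)
assert _optimizer.param_groups[0]['lr'] < 1.0  # lr was reduced on the plateau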
class Seq2seqAgent(Agent): """Agent which takes an input sequence and produces an output sequence. This model supports encoding the input and decoding the output via one of several flavors of RNN. It then uses a linear layer (whose weights can be shared with the embedding layer) to convert RNN output states into output tokens. This model currently uses greedy decoding, selecting the highest probability token at each time step. For more information, see Sequence to Sequence Learning with Neural Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_. """ OPTIM_OPTS = { 'adadelta': optim.Adadelta, 'adagrad': optim.Adagrad, 'adam': optim.Adam, 'adamax': optim.Adamax, 'asgd': optim.ASGD, 'lbfgs': optim.LBFGS, 'rmsprop': optim.RMSprop, 'rprop': optim.Rprop, 'sgd': optim.SGD, } ENC_OPTS = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM} @staticmethod def add_cmdline_args(argparser): """Add command-line arguments specifically for this agent.""" DictionaryAgent.add_cmdline_args(argparser) agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, help='size of the token embeddings') agent.add_argument('-nl', '--numlayers', type=int, default=2, help='number of hidden layers') agent.add_argument('-lr', '--learningrate', type=float, default=0.005, help='learning rate') agent.add_argument('-dr', '--dropout', type=float, default=0.1, help='dropout rate') agent.add_argument('-bi', '--bidirectional', type='bool', default=False, help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', choices=['none', 'concat', 'general', 'dot', 'local'], help='Choices: none, concat, general, dot, local. ' 'If set to local, also set attention-length. ' 'For more details see: ' 'https://arxiv.org/pdf/1508.04025.pdf') agent.add_argument('-attl', '--attention-length', default=48, type=int, help='Length of local attention.') agent.add_argument('--no-cuda', action='store_true', default=False, help='disable GPUs even if available') agent.add_argument('--gpu', type=int, default=-1, help='which GPU device to use') agent.add_argument('-rc', '--rank-candidates', type='bool', default=False, help='rank candidates if available. this is done by' ' computing the mean score per token for each ' 'candidate and selecting the highest scoring.') agent.add_argument('-tr', '--truncate', type=int, default=-1, help='truncate input & output lengths to speed up ' 'training (may reduce accuracy). This fixes all ' 'input and output to have a maximum length and to ' 'be similar in length to one another by throwing ' 'away extra tokens. This reduces the total amount ' 'of padding in the batches.') agent.add_argument('-enc', '--encoder', default='gru', choices=Seq2seqAgent.ENC_OPTS.keys(), help='Choose between different encoder modules.') agent.add_argument('-dec', '--decoder', default='same', choices=['same', 'shared'] + list(Seq2seqAgent.ENC_OPTS.keys()), help='Choose between different decoder modules. ' 'Default "same" uses same class as encoder, ' 'while "shared" also uses the same weights. ' 'Note that shared disables some encoder ' 'options--in particular, bidirectionality.') agent.add_argument('-lt', '--lookuptable', default='all', choices=['unique', 'enc_dec', 'dec_out', 'all'], help='The encoder, decoder, and output modules can ' 'share weights, or not. ' 'Unique has independent embeddings for each. 
' 'Enc_dec shares the embedding for the encoder ' 'and decoder. ' 'Dec_out shares decoder embedding and output ' 'weights. ' 'All shares all three weights.') agent.add_argument('-opt', '--optimizer', default='adam', choices=Seq2seqAgent.OPTIM_OPTS.keys(), help='Choose between pytorch optimizers. ' 'Any member of torch.optim is valid and will ' 'be used with default params except learning ' 'rate (as specified by -lr).') agent.add_argument('-emb', '--embedding-type', default='random', choices=['random', 'glove', 'glove-fixed'], help='Choose between different strategies ' 'for word embeddings. Default is random, ' 'but can also preinitialize from Glove. ' 'Preinitialized embeddings can also be fixed ' 'so they are not updated during training.') agent.add_argument('-lm', '--language-model', default='none', choices=['none', 'only', 'both'], help='Enables language modeling training on the ' 'concatenated input and label data.') def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) # all instances need the truncate param self.truncate = opt['truncate'] if shared: # set up shared properties self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) states = None if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, states = self.load(opt['model_file']) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Seq2Seq' # we use START markers to start our output self.START = self.dict.start_token self.START_IDX = self.dict[self.START] self.START_TENSOR = torch.LongTensor([self.START_IDX]) # we use END markers to end our output self.END = self.dict.end_token self.END_IDX = self.dict[self.END] self.END_TENSOR = torch.LongTensor([self.END_IDX]) # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0] # store important params in self hsz = opt['hiddensize'] emb = opt['embeddingsize'] self.hidden_size = hsz self.emb_size = emb self.num_layers = opt['numlayers'] self.learning_rate = opt['learningrate'] self.rank = opt['rank_candidates'] self.longest_label = 1 self.attention = opt['attention'] self.bidirectional = opt['bidirectional'] self.num_dirs = 2 if self.bidirectional else 1 self.dropout = opt['dropout'] self.lm = opt['language_model'] # set up tensors once self.zeros = torch.zeros(self.num_layers * self.num_dirs, 1, hsz) self.xs = torch.LongTensor(1, 1) self.ys = torch.LongTensor(1, 1) if self.rank: self.cands = torch.LongTensor(1, 1, 1) self.cand_scores = torch.FloatTensor(1) self.cand_lengths = torch.LongTensor(1) # set up modules self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX) # lookup table stores word embeddings 
self.enc_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if opt['lookuptable'] in ['enc_dec', 'all']: # share this with the encoder self.dec_lt = self.enc_lt else: self.dec_lt = nn.Embedding(len(self.dict), emb, padding_idx=self.NULL_IDX, max_norm=10) if not states and opt['embedding_type'].startswith('glove'): # set up pre-initialized vectors from GloVe try: import torchtext.vocab as vocab except ImportError: raise ImportError('Please install torchtext from ' 'github.com/pytorch/text.') Glove = vocab.GloVe(name='840B', dim=300) # do better than uniform random proj = torch.FloatTensor(emb, 300).uniform_(-0.057735, 0.057735) if emb != 300 else None for w in self.dict.freq: if w in Glove.stoi: vec = Glove.vectors[Glove.stoi[w]] if emb != 300: vec = torch.mm(proj, vec.unsqueeze(1)).squeeze() self.enc_lt.weight.data[self.dict[w]] = vec self.dec_lt.weight.data[self.dict[w]] = vec # encoder captures the input text enc_class = Seq2seqAgent.ENC_OPTS[opt['encoder']] # decoder produces our output states if opt['decoder'] in ['same', 'shared']: # use same class as encoder self.decoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) else: # use the specified class self.decoder = Seq2seqAgent.ENC_OPTS[opt['decoder']](emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True) if opt['decoder'] == 'shared': # shared weights: use the decoder to encode if self.bidirectional: raise RuntimeError('Cannot share enc/dec and do ' 'bidirectional encoding.') self.encoder = self.decoder else: self.encoder = enc_class(emb, hsz, opt['numlayers'], dropout=self.dropout, batch_first=True, bidirectional=self.bidirectional) # linear layers help us produce outputs from final decoder state hszXdirs = hsz * self.num_dirs # hidden to embedding self.h2e = nn.Linear(hsz, emb) # embedding to output. 
note that this CAN predict NULL self.e2o = nn.Linear(emb, len(self.dict)) if opt['lookuptable'] in ['dec_out', 'all']: # share these weights with the decoder lookup table self.e2o.weight = self.dec_lt.weight if self.attention != 'none': # we'll need this for all attention types self.attn_combine = nn.Linear(hszXdirs + emb, emb) if self.attention == 'local': # local attention over fixed set of output states if opt['attention_length'] < 0: raise RuntimeError('Set attention length to > 0.') self.max_length = opt['attention_length'] # combines input and previous hidden output layer self.attn = nn.Linear(hsz + emb, self.max_length) # combines attention weights with encoder outputs elif self.attention == 'concat': self.attn = nn.Linear(hsz + hszXdirs, hsz) self.attn_v = nn.Linear(hsz, 1) elif self.attention == 'general': # equivalent to dot if attn is identity self.attn = nn.Linear(hsz, hszXdirs) # set up optims for each module lr = opt['learningrate'] optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt['optimizer'] == 'sgd': kwargs['momentum'] = 0.95 kwargs['nesterov'] = True self.optims = { 'decoder': optim_class(self.decoder.parameters(), **kwargs), 'h2e': optim_class(self.h2e.parameters(), **kwargs), } if opt['decoder'] != 'shared': # update the encoder as well self.optims['encoder'] = optim_class( self.encoder.parameters(), **kwargs) if not opt['embedding_type'].endswith('-fixed'): # update embeddings during training self.optims['enc_lt'] = optim_class( self.enc_lt.parameters(), **kwargs) self.optims['e2o'] = optim_class( self.e2o.parameters(), **kwargs) if opt['lookuptable'] not in ['enc_dec', 'all']: # only add dec if it's separate from enc self.optims['dec_lt'] = optim_class( self.dec_lt.parameters(), **kwargs) elif opt['lookuptable'] not in ['dec_out', 'all']: # embeddings are fixed, so only update e2o if it's not shared self.optims['e2o'] = optim_class( self.e2o.parameters(), **kwargs) # add attention parameters into optims if available for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): self.optims[attn_name] = optim_class( getattr(self, attn_name).parameters(), **kwargs) if states is not None: # set loaded states if applicable self.set_states(states) if self.use_cuda: self.cuda() self.reset() def override_opt(self, new_opt): """Set overridable opts from loaded opt file. Print out each added key and each overridden key. Only override args specific to the model. 
""" model_args = {'hiddensize', 'embeddingsize', 'numlayers', 'optimizer', 'encoder', 'decoder', 'lookuptable', 'attention', 'attention_length'} for k, v in new_opt.items(): if k not in model_args: # skip non-model args continue if k not in self.opt: print('Adding new option [ {k}: {v} ]'.format(k=k, v=v)) elif self.opt[k] != v: print('Overriding option [ {k}: {old} => {v}]'.format( k=k, old=self.opt[k], v=v)) self.opt[k] = v return self.opt def parse(self, text): """Convert string to token indices.""" return self.dict.txt2vec(text) def v2t(self, vec): """Convert token indices to string of tokens.""" if type(vec) == Variable: vec = vec.data new_vec = [] for i in vec: if i == self.END_IDX: break elif i != self.START_IDX: new_vec.append(i) return self.dict.vec2txt(new_vec) def cuda(self): """Push parameters to the GPU.""" self.START_TENSOR = self.START_TENSOR.cuda(async=True) self.END_TENSOR = self.END_TENSOR.cuda(async=True) self.zeros = self.zeros.cuda(async=True) self.xs = self.xs.cuda(async=True) self.ys = self.ys.cuda(async=True) if self.rank: self.cands = self.cands.cuda(async=True) self.cand_scores = self.cand_scores.cuda(async=True) self.cand_lengths = self.cand_lengths.cuda(async=True) self.criterion.cuda() self.enc_lt.cuda() self.dec_lt.cuda() self.encoder.cuda() self.decoder.cuda() self.h2e.cuda() self.e2o.cuda() if self.attention != 'none': for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): getattr(self, attn_name).cuda() def hidden_to_idx(self, hidden, is_training=False): """Convert hidden state vectors into indices into the dictionary.""" # dropout at each step e = F.dropout(self.h2e(hidden), p=self.dropout, training=is_training) scores = F.dropout(self.e2o(e), p=self.dropout, training=is_training) # skip zero (null_idx) when selecting a score _max_score, idx = scores.narrow(2, 1, scores.size(2) - 1).max(2) # add one back to index since we removed first option return idx.add_(1), scores def zero_grad(self): """Zero out optimizers.""" for optimizer in self.optims.values(): optimizer.zero_grad() def update_params(self): """Do one optimization step.""" for optimizer in self.optims.values(): optimizer.step() def reset(self): """Reset observation and episode_done.""" self.observation = None self.episode_done = True def share(self): """Share internal states between parent and child instances.""" shared = super().share() shared['answers'] = self.answers shared['dict'] = self.dict shared['START_IDX'] = self.START_IDX shared['END_IDX'] = self.END_IDX return shared def observe(self, observation): """Save observation for act. If multiple observations are from the same episode, concatenate them. 
""" # shallow copy observation (deep copy can be expensive) observation = observation.copy() if 'text' in observation: if observation['text'] == '': observation.pop('text') else: # put START and END around text parsed_x = [self.START_IDX] parsed_x.extend(self.parse(observation['text'])) parsed_x.append(self.END_IDX) if self.truncate > 0: parsed_x = parsed_x[-self.truncate:] observation['text'] = parsed_x if not self.episode_done: # remember past dialog prev_dialog = self.observation['text'] # get last y batch_idx = self.opt.get('batchindex', 0) if self.answers[batch_idx] is not None: # use our last answer, which is the label during training lastY = self.answers[batch_idx] prev_dialog.append(self.START_IDX) prev_dialog.extend(lastY) prev_dialog.append(self.END_IDX) self.answers[batch_idx] = None # forget last y prev_dialog.extend(parsed_x) if self.truncate > 0: prev_dialog = prev_dialog[-self.truncate:] observation['text'] = prev_dialog self.observation = observation self.episode_done = observation['episode_done'] return observation def _encode(self, xs, is_training=False): """Call encoder and return output and hidden states.""" self.lastxs = xs batchsize = len(xs) # first encode context xes = F.dropout(self.enc_lt(xs), p=self.dropout, training=is_training) # project from emb_size to hidden_size dimensions x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] xes_packed = pack_padded_sequence(xes, x_lens, batch_first=True) if self.zeros.size(1) != batchsize: self.zeros.resize_(self.num_layers * self.num_dirs, batchsize, self.hidden_size).fill_(0) h0 = Variable(self.zeros, requires_grad=False) if type(self.encoder) == nn.LSTM: encoder_output_packed, hidden = self.encoder(xes_packed, (h0, h0)) # take elementwise max between forward and backward hidden states hidden = (hidden[0].view(-1, self.num_dirs, hidden[0].size(1), hidden[0].size(2)).max(1)[0], hidden[1].view(-1, self.num_dirs, hidden[1].size(1), hidden[1].size(2)).max(1)[0]) if type(self.decoder) != nn.LSTM: hidden = hidden[0] else: encoder_output_packed, hidden = self.encoder(xes_packed, h0) # take elementwise max between forward and backward hidden states hidden = hidden.view(-1, self.num_dirs, hidden.size(1), hidden.size(2)).max(1)[0] if type(self.decoder) == nn.LSTM: hidden = (hidden, h0.narrow(0, 0, 2)) encoder_output, _ = pad_packed_sequence(encoder_output_packed, batch_first=True) encoder_output = encoder_output if self.attention == 'local': # if using local attention, narrow encoder_output to max_length if encoder_output.size(1) > self.max_length: offset = encoder_output.size(1) - self.max_length encoder_output = encoder_output.narrow( 1, offset, self.max_length) return encoder_output, hidden def _apply_attention(self, xes, encoder_output, hidden, attn_mask=None): """Apply attention to encoder hidden layer.""" last_hidden = hidden[-1] # select hidden from last RNN layer if self.attention == 'concat': hidden_expand = last_hidden.unsqueeze(1).expand( last_hidden.size(0), encoder_output.size(1), last_hidden.size(1)) attn_w_premask = self.attn_v(F.tanh(self.attn( torch.cat((encoder_output, hidden_expand), 2)))).squeeze(2) attn_weights = F.softmax(attn_w_premask * attn_mask - (1 - attn_mask) * 1e20) elif self.attention == 'dot': hidden_expand = last_hidden.unsqueeze(1) attn_w_premask = torch.bmm(hidden_expand, encoder_output.transpose(1, 2) ).squeeze(1) attn_weights = F.softmax(attn_w_premask * attn_mask - (1 - attn_mask) * 1e20) elif self.attention == 'general': hidden_expand = last_hidden.unsqueeze(1) attn_w_premask = 
torch.bmm(self.attn(hidden_expand), encoder_output.transpose(1, 2) ).squeeze(1) attn_weights = F.softmax(attn_w_premask * attn_mask - (1 - attn_mask) * 1e20) elif self.attention == 'local': attn_weights = F.softmax(self.attn( torch.cat((xes.squeeze(1), last_hidden), 1))) if attn_weights.size(1) > encoder_output.size(1): attn_weights = attn_weights.narrow( 1, 0, encoder_output.size(1)) attn_applied = torch.bmm( attn_weights.unsqueeze(1), encoder_output).squeeze(1) output = torch.cat((xes.squeeze(1), attn_applied), 1) output = self.attn_combine(output).unsqueeze(1) output = F.tanh(output) self.attn_weights = attn_weights return output def _decode_and_train(self, batchsize, xes, ys, encoder_output, hidden, attn_mask, lm=False): """Update the model based on the labels.""" self.zero_grad() loss = 0 predictions = [] if self.attention != 'none': # using attention, produce one token at a time for i in range(ys.size(1)): h_att = hidden[0] if type(self.decoder) == nn.LSTM else hidden output = self._apply_attention(xes, encoder_output, h_att, attn_mask) output, hidden = self.decoder(output, hidden) preds, scores = self.hidden_to_idx(output, is_training=True) y = ys.select(1, i) loss += self.criterion(scores.squeeze(1), y) # use the true token as the next input instead of predicted xes = self.dec_lt(y).unsqueeze(1) xes = F.dropout(xes, p=self.dropout, training=True) predictions.append(preds) else: # force the entire sequence at once by feeding in START + y[:-1] y_in = ys.narrow(1, 0, ys.size(1) - 1) xes = torch.cat([xes, self.dec_lt(y_in)], 1) output, hidden = self.decoder(xes, hidden) preds, scores = self.hidden_to_idx(output, is_training=True) for i in range(ys.size(1)): # sum loss per-token score = scores.select(1, i) y = ys.select(1, i) loss += self.criterion(score, y) predictions.append(preds) loss.backward() self.update_params() predictions = torch.cat(predictions, 1) return predictions, {('lm' if lm else '') + 'loss': loss.mul_(batchsize).data} def _decode_only(self, batchsize, xes, ys, encoder_output, hidden, attn_mask): """Just produce a prediction without training the model.""" done = [False for _ in range(batchsize)] total_done = 0 max_len = 0 predictions = [] # generate a response from scratch while total_done < batchsize and max_len < self.longest_label: # keep producing tokens until we hit END or max length for each # example in the batch if self.attention == 'none': output = xes else: h_att = hidden[0] if type(self.decoder) == nn.LSTM else hidden output = self._apply_attention(xes, encoder_output, h_att, attn_mask) output, hidden = self.decoder(output, hidden) preds, _scores = self.hidden_to_idx(output, is_training=False) predictions.append(preds) xes = self.dec_lt(preds) max_len += 1 for b in range(batchsize): if not done[b]: # only add more tokens for examples that aren't done yet if preds.data[b][0] == self.END_IDX: # if we produced END, we're done done[b] = True total_done += 1 predictions = torch.cat(predictions, 1) if random.random() < 0.2: # sometimes output a prediction for debugging print('\nprediction:', self.v2t(predictions.data[0])) return predictions def _score_candidates(self, cands, cand_inds, start, encoder_output, hidden, attn_mask): """Rank candidates by their likelihood according to the decoder.""" if type(self.decoder) == nn.LSTM: hidden, cell = hidden # 
score each candidate separately # cands are exs_with_cands x cands_per_ex x words_per_cand # cview is total_cands x words_per_cand cview = cands.view(-1, cands.size(2)) c_xes = start.expand(cview.size(0), start.size(0), start.size(1)) if len(cand_inds) != hidden.size(1): # only use hidden state from inputs with associated candidates cand_indices = torch.LongTensor([i for i, _, _ in cand_inds]) if self.use_cuda: cand_indices = cand_indices.cuda() cand_indices = Variable(cand_indices) hidden = hidden.index_select(1, cand_indices) sz = hidden.size() cands_hn = ( hidden.view(sz[0], sz[1], 1, sz[2]) .expand(sz[0], sz[1], cands.size(1), sz[2]) .contiguous() .view(sz[0], -1, sz[2]) ) if type(self.decoder) == nn.LSTM: if len(cand_inds) != cell.size(1): # only use cell state from inputs with associated candidates cell = cell.index_select(1, cand_indices) cands_hn = (cands_hn, cell.view(sz[0], sz[1], 1, sz[2]) .expand(sz[0], sz[1], cands.size(1), sz[2]) .contiguous() .view(sz[0], -1, sz[2])) cand_scores = Variable( self.cand_scores.resize_(cview.size(0)).fill_(0)) cand_lengths = Variable( self.cand_lengths.resize_(cview.size(0)).fill_(0)) if self.attention != 'none': # using attention # select only encoder output matching xs we want if len(cand_inds) != len(encoder_output): indices = torch.LongTensor([i[0] for i in cand_inds]) if self.use_cuda: indices = indices.cuda() indices = Variable(indices) encoder_output = encoder_output.index_select(0, indices) attn_mask = attn_mask.index_select(0, indices) sz = encoder_output.size() cands_encoder_output = ( encoder_output.contiguous() .view(sz[0], 1, sz[1], sz[2]) .expand(sz[0], cands.size(1), sz[1], sz[2]) .contiguous() .view(-1, sz[1], sz[2]) ) msz = attn_mask.size() cands_attn_mask = ( attn_mask.contiguous() .view(msz[0], 1, msz[1]) .expand(msz[0], cands.size(1), msz[1]) .contiguous() .view(-1, msz[1]) ) for i in range(cview.size(1)): # process one token at a time h_att = cands_hn[0] if type(self.decoder) == nn.LSTM else cands_hn output = self._apply_attention(c_xes, cands_encoder_output, h_att, cands_attn_mask) output, cands_hn = self.decoder(output, cands_hn) _preds, scores = self.hidden_to_idx(output, is_training=False) cs = cview.select(1, i) non_nulls = cs.ne(self.NULL_IDX) cand_lengths += non_nulls.long() score_per_cand = torch.gather(scores.squeeze(), 1, cs.unsqueeze(1)) cand_scores += score_per_cand.squeeze() * non_nulls.float() c_xes = self.dec_lt(cs).unsqueeze(1) else: # process entire sequence at once if cview.size(1) > 1: # feed in START + cands[:-2] cands_in = cview.narrow(1, 0, cview.size(1) - 1) c_xes = torch.cat([c_xes, self.dec_lt(cands_in)], 1) output, cands_hn = self.decoder(c_xes, cands_hn) _preds, scores = self.hidden_to_idx(output, is_training=False) for i in range(cview.size(1)): # calculate score at each token cs = cview.select(1, i) non_nulls = cs.ne(self.NULL_IDX) cand_lengths += non_nulls.long() score_per_cand = torch.gather(scores.select(1, i), 1, cs.unsqueeze(1)) cand_scores += score_per_cand.squeeze() * non_nulls.float() # set empty scores to -1, so when divided by 0 they become -inf cand_scores -= cand_lengths.eq(0).float() # average the scores per token cand_scores /= cand_lengths.float() cand_scores = cand_scores.view(cands.size(0), cands.size(1)) srtd_scores, text_cand_inds = cand_scores.sort(1, True) return text_cand_inds def predict(self, xs, ys=None, cands=None, valid_cands=None, lm=False): """Produce a prediction from our model. 
Update the model using the targets if available, otherwise rank candidates as well if they are available and param is set. """ batchsize = len(xs) text_cand_inds = None is_training = ys is not None self.encoder.train(mode=is_training) self.decoder.train(mode=is_training) encoder_output, hidden = self._encode(xs, is_training) # next we use START as an input to kick off our decoder if not lm: x = Variable(self.START_TENSOR, requires_grad=False) xe = self.dec_lt(x) xe = F.dropout(xe, p=self.dropout, training=is_training) xes = xe.expand(batchsize, 1, xe.size(1)) else: # during language_model mode, just start with zeros xes = Variable( self.zeros[0].narrow(1, 0, self.emb_size).unsqueeze(1), requires_grad=False ) if self.attention == 'none': attn_mask = None else: attn_mask = xs.ne(0).float() loss = None if is_training: predictions, loss = self._decode_and_train(batchsize, xes, ys, encoder_output, hidden, attn_mask, lm=lm) else: if cands is not None: text_cand_inds = self._score_candidates(cands, valid_cands, xe, encoder_output, hidden, attn_mask) predictions = self._decode_only(batchsize, xes, ys, encoder_output, hidden, attn_mask) return predictions, text_cand_inds, loss def batchify(self, observations, lm=False): """Convert a list of observations into input & target tensors.""" def valid(obs): # check if this is an example our model should actually process return 'text' in obs # valid examples and their indices try: valid_inds, exs = zip(*[(i, ex) for i, ex in enumerate(observations) if valid(ex)]) except ValueError: # zero examples to process in this batch, so zip failed to unpack return None, None, None, None, None, None # set up the input tensors batchsize = len(exs) # `x` text is already tokenized and truncated parsed_x = [ex['text'] for ex in exs] x_lens = [len(x) for x in parsed_x] ind_sorted = sorted(range(len(x_lens)), key=lambda k: -x_lens[k]) exs = [exs[k] for k in ind_sorted] valid_inds = [valid_inds[k] for k in ind_sorted] parsed_x = [parsed_x[k] for k in ind_sorted] if lm: self.xs.resize_(batchsize, 1) self.xs.fill_(self.START_IDX) xs = Variable(self.xs) else: max_x_len = max([len(x) for x in parsed_x]) xs = torch.LongTensor(batchsize, max_x_len).fill_(self.NULL_IDX) # right-padded with zeros for i, x in enumerate(parsed_x): for j, idx in enumerate(x): xs[i][j] = idx if self.use_cuda: # copy to gpu self.xs.resize_(xs.size()) self.xs.copy_(xs, async=True) xs = Variable(self.xs) else: xs = Variable(xs) # set up the target tensors ys = None labels = None if any(['labels' in ex for ex in exs]): # randomly select one of the labels to update on, if multiple # append END to each label labels = [random.choice(ex.get('labels', [''])) for ex in exs] parsed_y = [self.parse(y + ' ' + self.END) for y in labels] if lm: parsed_y = [parsed_x[i] + parsed_y[i] for i in range(batchsize)] max_y_len = max(len(y) for y in parsed_y) if self.truncate > 0 and max_y_len > self.truncate: parsed_y = [y[:self.truncate] for y in parsed_y] max_y_len = self.truncate ys = torch.LongTensor(batchsize, max_y_len).fill_(self.NULL_IDX) for i, y in enumerate(parsed_y): for j, idx in enumerate(y): ys[i][j] = idx if self.use_cuda: # copy to gpu self.ys.resize_(ys.size()) self.ys.copy_(ys, async=True) ys = Variable(self.ys) else: ys = Variable(ys) # set up candidates cands = None valid_cands = None if ys is None and self.rank: # only do ranking when no targets available and ranking flag set parsed_cs = [] valid_cands = [] for i, v in enumerate(valid_inds): if 'label_candidates' in observations[v]: # each candidate tuple 
is a pair of the parsed version and # the original full string cs = list(observations[v]['label_candidates']) parsed_cs.append([self.parse(c) for c in cs]) valid_cands.append((i, v, cs)) if len(parsed_cs) > 0: # TODO: store lengths of cands separately, so don't have zero # padding for varying number of cands per example # found cands, pack them into tensor max_c_len = max(max(len(c) for c in cs) for cs in parsed_cs) max_c_cnt = max(len(cs) for cs in parsed_cs) cands = torch.LongTensor(len(parsed_cs), max_c_cnt, max_c_len).fill_(self.NULL_IDX) for i, cs in enumerate(parsed_cs): for j, c in enumerate(cs): for k, idx in enumerate(c): cands[i][j][k] = idx if self.use_cuda: # copy to gpu self.cands.resize_(cands.size()) self.cands.copy_(cands, async=True) cands = Variable(self.cands) else: cands = Variable(cands) return xs, ys, labels, valid_inds, cands, valid_cands def batch_act(self, observations): batchsize = len(observations) # initialize a table of replies with this agent's id batch_reply = [{'id': self.getID()} for _ in range(batchsize)] # convert the observations into batches of inputs and targets # valid_inds tells us the indices of all valid examples # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1] # since the other three elements had no 'text' field xs, ys, labels, valid_inds, cands, valid_cands = self.batchify(observations) if ys is not None: # keep track of longest label we've ever seen # we'll never produce longer ones than that during prediction self.longest_label = max(self.longest_label, ys.size(1)) if xs is None: # no valid examples, just return empty responses return batch_reply if self.lm != 'none' and ys is not None: # train on lm task: given [START], predict [x y] # (regular task is given [x START] produce [y]) xs, ys, _, _, _, _ = self.batchify(observations, lm=True) _, _, loss = self.predict(xs, ys, lm=True) if loss is not None: batch_reply[0]['metrics'] = loss if self.lm != 'only' or ys is None: # produce predictions, train on targets if available predictions, text_cand_inds, loss = self.predict(xs, ys, cands, valid_cands) if loss is not None: if 'metrics' in batch_reply[0]: for k, v in loss.items(): batch_reply[0]['metrics'][k] = v else: batch_reply[0]['metrics'] = loss predictions = predictions.cpu() for i in range(len(predictions)): # map the predictions back to non-empty examples in the batch # we join with spaces since we produce tokens one at a time curr = batch_reply[valid_inds[i]] output_tokens = [] for c in predictions.data[i]: if c == self.END_IDX: break else: output_tokens.append(c) curr_pred = self.v2t(output_tokens) curr['text'] = curr_pred if labels is not None: y = [] for c in ys.data[i]: if c == self.END_IDX: break else: y.append(c) self.answers[valid_inds[i]] = y else: self.answers[valid_inds[i]] = output_tokens if self.NULL_IDX in self.answers[valid_inds[i]]: raise RuntimeError("This shouldn't happen, but it might.") if text_cand_inds is not None: text_cand_inds = text_cand_inds.cpu().data for i in range(len(valid_cands)): order = text_cand_inds[i] _, batch_idx, curr_cands = valid_cands[i] curr = batch_reply[batch_idx] curr['text_candidates'] = [curr_cands[idx] for idx in order if idx < len(curr_cands)] return batch_reply def act(self): # call batch_act with this batch of one return self.batch_act([self.observation])[0] def save(self, path=None): """Save model parameters if model_file is set.""" path = self.opt.get('model_file', None) if path is None else path if path and hasattr(self, 'optims'): model = {} model['enc_lt'] = 
self.enc_lt.state_dict() if self.opt['lookuptable'] not in ['enc_dec', 'all']: # dec_lt is not shared with enc_lt, so save it model['dec_lt'] = self.dec_lt.state_dict() if self.opt['decoder'] != 'shared': model['encoder'] = self.encoder.state_dict() model['decoder'] = self.decoder.state_dict() model['h2e'] = self.h2e.state_dict() model['e2o'] = self.e2o.state_dict() model['optims'] = {k: v.state_dict() for k, v in self.optims.items()} model['longest_label'] = self.longest_label model['opt'] = self.opt for attn_name in ['attn', 'attn_v', 'attn_combine']: if hasattr(self, attn_name): model[attn_name] = getattr(self, attn_name).state_dict() with open(path, 'wb') as write: torch.save(model, write) def shutdown(self): """Save the state of the model when shutdown.""" path = self.opt.get('model_file', None) if path is not None: self.save(path + '.shutdown_state') super().shutdown() def load(self, path): """Return opt and model states.""" with open(path, 'rb') as read: model = torch.load(read) return model['opt'], model def set_states(self, states): """Set the state dicts of the modules from saved states.""" self.enc_lt.load_state_dict(states['enc_lt']) if self.opt['lookuptable'] not in ['enc_dec', 'all']: # dec_lt is not shared with enc_lt, so load it self.dec_lt.load_state_dict(states['dec_lt']) if self.opt['decoder'] != 'shared': self.encoder.load_state_dict(states['encoder']) self.decoder.load_state_dict(states['decoder']) self.h2e.load_state_dict(states['h2e']) self.e2o.load_state_dict(states['e2o']) for attn_name in ['attn', 'attn_v', 'attn_combine']: if attn_name in states: getattr(self, attn_name).load_state_dict(states[attn_name]) for k, optimizer in self.optims.items(): if k in states['optims']: optimizer.load_state_dict(states['optims'][k]) else: print('WARNING: loaded other optims, but none found for ' + k + '. Using default initialization instead.') self.longest_label = states['longest_label']
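# --- Illustrative sketch (hypothetical stand-in module; toy checkpoint) ---
# save()/load()/set_states() above follow one pattern: collect each
# submodule's state_dict plus the opt dict into a single dictionary,
# serialize it with torch.save, then on load feed the stored dicts back
# through load_state_dict. A self-contained round trip of that pattern:
import tempfile

import torch
import torch.nn as nn

_layer = nn.Linear(4, 4)  # stands in for a submodule such as h2e
_ckpt = {'h2e': _layer.state_dict(), 'opt': {'hiddensize': 4}}
with tempfile.NamedTemporaryFile(suffix='.pt') as _f:
    torch.save(_ckpt, _f.name)
    _loaded = torch.load(_f.name)
_layer.load_state_dict(_loaded['h2e'])   # analogous to set_states()
assert _loaded['opt']['hiddensize'] == 4  # opt travels with the weights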
def test_basic_parse(self): """Check that the dictionary is correctly adding and parsing short sentence. """ from parlai.core.dict import DictionaryAgent from parlai.core.params import ParlaiParser argparser = ParlaiParser() DictionaryAgent.add_cmdline_args(argparser) opt = argparser.parse_args(print_args=False) dictionary = DictionaryAgent(opt) num_builtin = len(dictionary) dictionary.observe({'text': 'hello world'}) dictionary.act() assert len(dictionary) - num_builtin == 2 vec = dictionary.parse('hello world') assert len(vec) == 2 assert vec[0] == num_builtin assert vec[1] == num_builtin + 1 vec = dictionary.parse('hello world', vec_type=list) assert len(vec) == 2 assert vec[0] == num_builtin assert vec[1] == num_builtin + 1 vec = dictionary.parse('hello world', vec_type=tuple) assert len(vec) == 2 assert vec[0] == num_builtin assert vec[1] == num_builtin + 1
def __init__(self, opt, shared=None): """Set up model if shared params not set, otherwise no work to do.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init self.states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() self.batchsize = opt.get('batchsize', 1) if shared: # set up shared properties self.dict = shared['dict'] if 'model' in shared: # model is shared during hogwild self.model = shared['model'] self.states = shared['states'] # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] else: # this is not a shared instance of this class, so do full init if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) if opt.get('model_file') and os.path.isfile(opt['model_file']): # load model parameters if available print('Loading existing model params from ' + opt['model_file']) new_opt, self.states = self.load(opt['model_file']) # override model-specific options with stored ones opt = self.override_opt(new_opt) if opt['dict_file'] is None and opt.get('model_file'): # set default dict-file if not set opt['dict_file'] = opt['model_file'] + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'LanguageModel' # get NULL token and END token self.NULL_IDX = self.dict[self.dict.null_token] self.END_IDX = self.dict[self.dict.end_token] # set model self.model = RNNModel(opt, len(self.dict)) if self.states: # set loaded states if applicable self.model.load_state_dict(self.states['model']) if self.use_cuda: self.model.cuda() self.next_observe = [] self.next_batch = [] self.is_training = True if hasattr(self, 'model'): # if model was built, do more setup self.clip = opt.get('gradient_clip', 0.25) # set up criteria self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX) if self.use_cuda: # push to cuda self.criterion.cuda() # set up criterion for eval: we do not want to average over size self.eval_criterion = nn.CrossEntropyLoss( ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: # push to cuda self.eval_criterion.cuda() # init hidden state self.hidden = self.model.init_hidden(self.batchsize) # init tensor of end tokens self.ends = torch.LongTensor( [self.END_IDX for _ in range(self.batchsize)]) if self.use_cuda: self.ends = self.ends.cuda() # set up optimizer self.lr = opt['learningrate'] self.best_val_loss = None self.reset()
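# --- Illustrative sketch (toy tensors, made up for illustration) ---
# Two criteria are built above because training wants a per-batch average
# while evaluation accumulates raw sums across batches (size_average=False;
# spelled reduction='sum' in current PyTorch) and divides by the true token
# count at reporting time. The two differ only by that normalization:
import torch
import torch.nn as nn

_logits = torch.randn(6, 10)            # 6 target tokens, vocab of 10
_targets = torch.randint(0, 10, (6,))
_avg = nn.CrossEntropyLoss()(_logits, _targets)
_summed = nn.CrossEntropyLoss(reduction='sum')(_logits, _targets)
assert torch.allclose(_summed / 6, _avg)  # sum / token count == average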
class Seq2seqAgent(Agent):
    """Agent which takes an input sequence and produces an output sequence.

    For more information, see Sequence to Sequence Learning with Neural
    Networks `(Sutskever et al. 2014) <https://arxiv.org/abs/1409.3215>`_.
    """

    OPTIM_OPTS = {
        'adadelta': optim.Adadelta,
        'adagrad': optim.Adagrad,
        'adam': optim.Adam,
        'adamax': optim.Adamax,
        'asgd': optim.ASGD,
        'lbfgs': optim.LBFGS,
        'rmsprop': optim.RMSprop,
        'rprop': optim.Rprop,
        'sgd': optim.SGD,
    }

    ENC_OPTS = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM}

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        DictionaryAgent.add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Seq2Seq Arguments')
        agent.add_argument('-hs', '--hiddensize', type=int, default=128,
                           help='size of the hidden layers')
        agent.add_argument('-emb', '--embeddingsize', type=int, default=128,
                           help='size of the token embeddings')
        agent.add_argument('-nl', '--numlayers', type=int, default=2,
                           help='number of hidden layers')
        agent.add_argument('-lr', '--learningrate', type=float, default=0.5,
                           help='learning rate')
        agent.add_argument('-dr', '--dropout', type=float, default=0.1,
                           help='dropout rate')
        agent.add_argument('-att', '--attention', type=int, default=0,
                           help='if greater than 0, use attention of specified'
                                ' length while decoding')
        agent.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
        agent.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
        agent.add_argument('-rc', '--rank-candidates', type='bool',
                           default=False,
                           help='rank candidates if available. this is done by'
                                ' computing the mean score per token for each '
                                'candidate and selecting the highest scoring.')
        agent.add_argument('-tr', '--truncate', type='bool', default=True,
                           help='truncate input & output lengths to speed up '
                                'training (may reduce accuracy). This fixes '
                                'all input and output to have a maximum '
                                'length and to be similar in length to one '
                                'another by throwing away extra tokens. This '
                                'reduces the total amount of padding in the '
                                'batches.')
        agent.add_argument('-enc', '--encoder', default='gru',
                           choices=Seq2seqAgent.ENC_OPTS.keys(),
                           help='Choose between different encoder modules.')
        agent.add_argument('-dec', '--decoder', default='same',
                           choices=['same', 'shared'] +
                                   list(Seq2seqAgent.ENC_OPTS.keys()),
                           help='Choose between different decoder modules. '
                                'Default "same" uses same class as encoder, '
                                'while "shared" also uses the same weights.')
        agent.add_argument('-opt', '--optimizer', default='sgd',
                           choices=Seq2seqAgent.OPTIM_OPTS.keys(),
                           help='Choose between pytorch optimizers. '
                                'Any member of torch.optim is valid and will '
                                'be used with default params except learning '
                                'rate (as specified by -lr).')

    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        if not shared:
            # this is not a shared instance of this class, so do full
            # initialization. if shared is set, only set up shared members.

            # check for cuda
            self.use_cuda = (not opt.get('no_cuda') and
                             torch.cuda.is_available())
            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, self.states = self.load(opt['model_file'])
                # override options with stored ones
                opt = self.override_opt(new_opt)

            self.dict = DictionaryAgent(opt)
            self.id = 'Seq2Seq'
            # we use START markers to start our output
            self.START = self.dict.start_token
            self.START_TENSOR = torch.LongTensor(self.dict.parse(self.START))
            # we use END markers to end our output
            self.END = self.dict.end_token
            self.END_TENSOR = torch.LongTensor(self.dict.parse(self.END))
            # get index of null token from dictionary (probably 0)
            self.NULL_IDX = self.dict.txt2vec(self.dict.null_token)[0]

            # store important params directly
            hsz = opt['hiddensize']
            emb = opt['embeddingsize']
            self.hidden_size = hsz
            self.emb_size = emb
            self.num_layers = opt['numlayers']
            self.learning_rate = opt['learningrate']
            self.rank = opt['rank_candidates']
            self.longest_label = 1
            self.truncate = opt['truncate']
            self.attention = opt['attention']

            # set up tensors
            self.zeros = torch.zeros(self.num_layers, 1, hsz)
            self.xs = torch.LongTensor(1, 1)
            self.ys = torch.LongTensor(1, 1)
            self.cands = torch.LongTensor(1, 1, 1)
            self.cand_scores = torch.FloatTensor(1)
            self.cand_lengths = torch.LongTensor(1)

            # set up modules
            self.criterion = nn.NLLLoss()
            # lookup table stores word embeddings
            self.lt = nn.Embedding(len(self.dict), emb,
                                   padding_idx=self.NULL_IDX,
                                   scale_grad_by_freq=True)
            self.lt2enc = nn.Linear(emb, hsz)
            self.lt2dec = nn.Linear(emb, hsz)
            # encoder captures the input text
            enc_class = Seq2seqAgent.ENC_OPTS[opt['encoder']]
            self.encoder = enc_class(hsz, hsz, opt['numlayers'])
            # decoder produces our output states
            if opt['decoder'] == 'shared':
                self.decoder = self.encoder
            elif opt['decoder'] == 'same':
                self.decoder = enc_class(hsz, hsz, opt['numlayers'])
            else:
                dec_class = Seq2seqAgent.ENC_OPTS[opt['decoder']]
                self.decoder = dec_class(hsz, hsz, opt['numlayers'])
            # linear layer helps us produce outputs from final decoder state
            self.h2o = nn.Linear(hsz, len(self.dict))
            # dropout on the linear layer helps us generalize
            self.dropout = nn.Dropout(opt['dropout'])

            self.use_attention = False
            # if attention is greater than 0, set up additional members
            if self.attention > 0:
                self.use_attention = True
                self.max_length = self.attention
                # combines input and previous hidden output layer
                self.attn = nn.Linear(hsz * 2, self.max_length)
                # combines attention weights with encoder outputs
                self.attn_combine = nn.Linear(hsz * 2, hsz)

            # set up optims for each module
            lr = opt['learningrate']
            optim_class = Seq2seqAgent.OPTIM_OPTS[opt['optimizer']]
            self.optims = {
                'lt': optim_class(self.lt.parameters(), lr=lr),
                'lt2enc': optim_class(self.lt2enc.parameters(), lr=lr),
                'lt2dec': optim_class(self.lt2dec.parameters(), lr=lr),
                'encoder': optim_class(self.encoder.parameters(), lr=lr),
                'decoder': optim_class(self.decoder.parameters(), lr=lr),
                'h2o': optim_class(self.h2o.parameters(), lr=lr),
            }

            if hasattr(self, 'states'):
                # set loaded states if applicable
                self.set_states(self.states)

            if self.use_cuda:
                self.cuda()

        self.reset()

    def override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overridden key.
        Only override args specific to the model.
        """
        model_args = {'hiddensize', 'embeddingsize', 'numlayers', 'optimizer',
                      'encoder', 'decoder'}
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v} ]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def parse(self, text):
        """Convert string to token indices."""
        return self.dict.txt2vec(text)

    def v2t(self, vec):
        """Convert token indices to string of tokens."""
        return self.dict.vec2txt(vec)

    def cuda(self):
        """Push parameters to the GPU."""
        self.START_TENSOR = self.START_TENSOR.cuda(non_blocking=True)
        self.END_TENSOR = self.END_TENSOR.cuda(non_blocking=True)
        self.zeros = self.zeros.cuda(non_blocking=True)
        self.xs = self.xs.cuda(non_blocking=True)
        self.ys = self.ys.cuda(non_blocking=True)
        self.cands = self.cands.cuda(non_blocking=True)
        self.cand_scores = self.cand_scores.cuda(non_blocking=True)
        self.cand_lengths = self.cand_lengths.cuda(non_blocking=True)
        self.criterion.cuda()
        self.lt.cuda()
        self.lt2enc.cuda()
        self.lt2dec.cuda()
        self.encoder.cuda()
        self.decoder.cuda()
        self.h2o.cuda()
        self.dropout.cuda()
        if self.use_attention:
            self.attn.cuda()
            self.attn_combine.cuda()

    def hidden_to_idx(self, hidden, dropout=False):
        """Convert hidden state vectors into indices into the dictionary."""
        if hidden.size(0) > 1:
            raise RuntimeError('bad dimensions of tensor:', hidden)
        hidden = hidden.squeeze(0)
        scores = self.h2o(hidden)
        if dropout:
            scores = self.dropout(scores)
        scores = F.log_softmax(scores, dim=1)
        _max_score, idx = scores.max(1)
        return idx, scores

    def zero_grad(self):
        """Zero out optimizers."""
        for optimizer in self.optims.values():
            optimizer.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        for optimizer in self.optims.values():
            optimizer.step()

    def reset(self):
        """Reset observation and episode_done."""
        self.observation = None
        self.episode_done = True

    def observe(self, observation):
        """Save observation for act.

        If multiple observations are from the same episode, concatenate them.
        """
        # shallow copy observation (deep copy can be expensive)
        observation = observation.copy()
        if not self.episode_done:
            # if the last example wasn't the end of an episode, then we need
            # to recall what was said in that example
            prev_dialogue = self.observation['text']
            observation['text'] = prev_dialogue + '\n' + observation['text']
        self.observation = observation
        self.episode_done = observation['episode_done']
        return observation

    def _encode(self, xs, dropout=False):
        """Call encoder and return output and hidden states."""
        batchsize = len(xs)

        # first encode context
        xes = self.lt(xs)
        if dropout:
            xes = self.dropout(xes)
        # project from emb_size to hidden_size dimensions
        xes = self.lt2enc(xes).transpose(0, 1)

        if self.zeros.size(1) != batchsize:
            self.zeros.resize_(self.num_layers, batchsize,
                               self.hidden_size).fill_(0)
        h0 = Variable(self.zeros)
        if type(self.encoder) == nn.LSTM:
            encoder_output, hidden = self.encoder(xes, (h0, h0))
            if type(self.decoder) != nn.LSTM:
                hidden = hidden[0]
        else:
            encoder_output, hidden = self.encoder(xes, h0)
            if type(self.decoder) == nn.LSTM:
                hidden = (hidden, h0)
        encoder_output = encoder_output.transpose(0, 1)

        if self.use_attention:
            if encoder_output.size(1) > self.max_length:
                offset = encoder_output.size(1) - self.max_length
                encoder_output = encoder_output.narrow(1, offset,
                                                       self.max_length)

        return encoder_output, hidden

    def _apply_attention(self, xes, encoder_output, encoder_hidden):
        """Apply attention to encoder hidden layer."""
        attn_weights = F.softmax(
            self.attn(torch.cat((xes[0], encoder_hidden[-1]), 1)), dim=1)

        if attn_weights.size(1) > encoder_output.size(1):
            attn_weights = attn_weights.narrow(1, 0, encoder_output.size(1))

        attn_applied = torch.bmm(attn_weights.unsqueeze(1),
                                 encoder_output).squeeze(1)

        output = torch.cat((xes[0], attn_applied), 1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)

        return output

    def _decode_and_train(self, batchsize, xes, ys, encoder_output, hidden):
        # update the model based on the labels
        self.zero_grad()
        loss = 0
        output_lines = [[] for _ in range(batchsize)]

        # keep track of longest label we've ever seen
        self.longest_label = max(self.longest_label, ys.size(1))
        for i in range(ys.size(1)):
            output = (self._apply_attention(xes, encoder_output, hidden)
                      if self.use_attention else xes)
            output, hidden = self.decoder(output, hidden)
            preds, scores = self.hidden_to_idx(output, dropout=True)
            y = ys.select(1, i)
            loss += self.criterion(scores, y)
            # use the true token as the next input instead of predicted
            # this produces a biased prediction but better training
            xes = self.lt2dec(self.lt(y).unsqueeze(0))
            for b in range(batchsize):
                # convert the output scores to tokens
                token = self.v2t([preds.data[b]])
                output_lines[b].append(token)

        loss.backward()
        self.update_params()

        if random.random() < 0.1:
            # sometimes output a prediction for debugging
            print('prediction:', ' '.join(output_lines[0]),
                  '\nlabel:', self.dict.vec2txt(ys.data[0]))

        return output_lines

    def _decode_only(self, batchsize, xes, ys, encoder_output, hidden):
        # just produce a prediction without training the model
        done = [False for _ in range(batchsize)]
        total_done = 0
        max_len = 0
        output_lines = [[] for _ in range(batchsize)]

        # now, generate a response from scratch
        while total_done < batchsize and max_len < self.longest_label:
            # keep producing tokens until we hit END or max length for each
            # example in the batch
            output = (self._apply_attention(xes, encoder_output, hidden)
                      if self.use_attention else xes)
            output, hidden = self.decoder(output, hidden)
            preds, scores = self.hidden_to_idx(output, dropout=False)
            xes = self.lt2dec(self.lt(preds.unsqueeze(0)))
            max_len += 1
            for b in range(batchsize):
                if not done[b]:
                    # only add more tokens for examples that aren't done yet
                    token = self.v2t([preds.data[b]])
                    if token == self.END:
                        # if we produced END, we're done
                        done[b] = True
                        total_done += 1
                    else:
                        output_lines[b].append(token)

        if random.random() < 0.1:
            # sometimes output a prediction for debugging
            print('prediction:', ' '.join(output_lines[0]))

        return output_lines

    def _score_candidates(self, cands, xe, encoder_output, hidden):
        # score each candidate separately
        # cands are exs_with_cands x cands_per_ex x words_per_cand
        # cview is total_cands x words_per_cand
        cview = cands.view(-1, cands.size(2))
        cands_xes = xe.expand(xe.size(0), cview.size(0), xe.size(2))
        sz = hidden.size()
        cands_hn = (
            hidden.view(sz[0], sz[1], 1, sz[2])
            .expand(sz[0], sz[1], cands.size(1), sz[2])
            .contiguous()
            .view(sz[0], -1, sz[2])
        )

        sz = encoder_output.size()
        cands_encoder_output = (
            encoder_output.contiguous()
            .view(sz[0], 1, sz[1], sz[2])
            .expand(sz[0], cands.size(1), sz[1], sz[2])
            .contiguous()
            .view(-1, sz[1], sz[2])
        )

        cand_scores = Variable(
            self.cand_scores.resize_(cview.size(0)).fill_(0))
        cand_lengths = Variable(
            self.cand_lengths.resize_(cview.size(0)).fill_(0))

        for i in range(cview.size(1)):
            output = (self._apply_attention(cands_xes, cands_encoder_output,
                                            cands_hn)
                      if self.use_attention else cands_xes)
            output, cands_hn = self.decoder(output, cands_hn)
            preds, scores = self.hidden_to_idx(output, dropout=False)
            cs = cview.select(1, i)
            non_nulls = cs.ne(self.NULL_IDX)
            cand_lengths += non_nulls.long()
            score_per_cand = torch.gather(scores, 1, cs.unsqueeze(1))
            cand_scores += score_per_cand.squeeze() * non_nulls.float()
            cands_xes = self.lt2dec(self.lt(cs).unsqueeze(0))

        # set empty scores to -1, so when divided by 0 they become -inf
        cand_scores -= cand_lengths.eq(0).float()
        # average the scores per token
        cand_scores /= cand_lengths.float()

        cand_scores = cand_scores.view(cands.size(0), cands.size(1))
        srtd_scores, text_cand_inds = cand_scores.sort(1, True)
        text_cand_inds = text_cand_inds.data

        return text_cand_inds

    def predict(self, xs, ys=None, cands=None):
        """Produce a prediction from our model.

        Update the model using the targets if available, otherwise rank
        candidates as well if they are available.
        """
        batchsize = len(xs)
        text_cand_inds = None
        is_training = ys is not None
        encoder_output, hidden = self._encode(xs, dropout=is_training)

        # next we use START as an input to kick off our decoder
        x = Variable(self.START_TENSOR)
        xe = self.lt2dec(self.lt(x).unsqueeze(1))
        xes = xe.expand(xe.size(0), batchsize, xe.size(2))

        # list of output tokens for each example in the batch
        output_lines = None

        if is_training:
            output_lines = self._decode_and_train(batchsize, xes, ys,
                                                  encoder_output, hidden)
        else:
            if cands is not None:
                text_cand_inds = self._score_candidates(cands, xe,
                                                        encoder_output,
                                                        hidden)
            output_lines = self._decode_only(batchsize, xes, ys,
                                             encoder_output, hidden)

        return output_lines, text_cand_inds

    def batchify(self, observations):
        """Convert a list of observations into input & target tensors."""
        # valid examples
        exs = [ex for ex in observations if 'text' in ex]
        # the indices of the valid (non-empty) tensors
        valid_inds = [i for i, ex in enumerate(observations) if 'text' in ex]

        # set up the input tensors
        batchsize = len(exs)
        # tokenize the text
        xs = None
        if batchsize > 0:
            parsed = [self.parse(ex['text']) for ex in exs]
            max_x_len = max([len(x) for x in parsed])
            if self.truncate:
                # shrink xs to limit batch computation
                min_x_len = min([len(x) for x in parsed])
                max_x_len = min(min_x_len + 12, max_x_len, 48)
                parsed = [x[-max_x_len:] for x in parsed]
            xs = torch.LongTensor(batchsize, max_x_len).fill_(0)
            # pack the data to the right side of the tensor for this model
            for i, x in enumerate(parsed):
                offset = max_x_len - len(x)
                for j, idx in enumerate(x):
                    xs[i][j + offset] = idx
            if self.use_cuda:
                # copy to gpu
                self.xs.resize_(xs.size())
                self.xs.copy_(xs, non_blocking=True)
                xs = Variable(self.xs)
            else:
                xs = Variable(xs)

        # set up the target tensors
        ys = None
        if batchsize > 0 and any(['labels' in ex for ex in exs]):
            # randomly select one of the labels to update on, if multiple
            # append END to each label
            labels = [random.choice(ex.get('labels', [''])) + ' ' + self.END
                      for ex in exs]
            parsed = [self.parse(y) for y in labels]
            max_y_len = max(len(y) for y in parsed)
            if self.truncate:
                # shrink ys to limit batch computation
                min_y_len = min(len(y) for y in parsed)
                max_y_len = min(min_y_len + 12, max_y_len, 48)
                parsed = [y[:max_y_len] for y in parsed]
            ys = torch.LongTensor(batchsize, max_y_len).fill_(0)
            for i, y in enumerate(parsed):
                for j, idx in enumerate(y):
                    ys[i][j] = idx
            if self.use_cuda:
                # copy to gpu
                self.ys.resize_(ys.size())
                self.ys.copy_(ys, non_blocking=True)
                ys = Variable(self.ys)
            else:
                ys = Variable(ys)

        # set up candidates
        cands = None
        valid_cands = None
        if ys is None and self.rank:
            # only do ranking when no targets available and ranking flag set
            parsed = []
            valid_cands = []
            for i in valid_inds:
                if 'label_candidates' in observations[i]:
                    # each candidate tuple is a pair of the parsed version
                    # and the original full string
                    cs = list(observations[i]['label_candidates'])
                    parsed.append([self.parse(c) for c in cs])
                    valid_cands.append((i, cs))
            if len(parsed) > 0:
                # TODO: store lengths of cands separately, so don't have zero
                # padding for varying number of cands per example
                # found cands, pack them into tensor
                max_c_len = max(max(len(c) for c in cs) for cs in parsed)
                max_c_cnt = max(len(cs) for cs in parsed)
                cands = torch.LongTensor(len(parsed), max_c_cnt,
                                         max_c_len).fill_(0)
                for i, cs in enumerate(parsed):
                    for j, c in enumerate(cs):
                        for k, idx in enumerate(c):
                            cands[i][j][k] = idx
                if self.use_cuda:
                    # copy to gpu
                    self.cands.resize_(cands.size())
                    self.cands.copy_(cands, non_blocking=True)
                    cands = Variable(self.cands)
                else:
                    cands = Variable(cands)

        return xs, ys, valid_inds, cands, valid_cands

    def batch_act(self, observations):
        batchsize = len(observations)
        # initialize a table of replies with this agent's id
        batch_reply = [{'id': self.getID()} for _ in range(batchsize)]

        # convert the observations into batches of inputs and targets
        # valid_inds tells us the indices of all valid examples
        # e.g. for input [{}, {'text': 'hello'}, {}, {}], valid_inds is [1]
        # since the other three elements had no 'text' field
        xs, ys, valid_inds, cands, valid_cands = self.batchify(observations)

        if xs is None:
            # no valid examples, just return the empty responses we set up
            return batch_reply

        # produce predictions either way, but use the targets if available
        predictions, text_cand_inds = self.predict(xs, ys, cands)

        for i in range(len(predictions)):
            # map the predictions back to non-empty examples in the batch
            # we join with spaces since we produce tokens one at a time
            curr = batch_reply[valid_inds[i]]
            curr['text'] = ' '.join(c for c in predictions[i]
                                    if c != self.END
                                    and c != self.dict.null_token)

        if text_cand_inds is not None:
            for i in range(len(valid_cands)):
                order = text_cand_inds[i]
                batch_idx, curr_cands = valid_cands[i]
                curr = batch_reply[batch_idx]
                curr['text_candidates'] = [curr_cands[idx] for idx in order
                                           if idx < len(curr_cands)]

        return batch_reply

    def act(self):
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def save(self, path=None):
        """Save model parameters if model_file is set."""
        path = self.opt.get('model_file', None) if path is None else path

        if path and hasattr(self, 'lt'):
            model = {}
            model['lt'] = self.lt.state_dict()
            model['lt2enc'] = self.lt2enc.state_dict()
            model['lt2dec'] = self.lt2dec.state_dict()
            model['encoder'] = self.encoder.state_dict()
            model['decoder'] = self.decoder.state_dict()
            model['h2o'] = self.h2o.state_dict()
            model['optims'] = {k: v.state_dict()
                               for k, v in self.optims.items()}
            model['longest_label'] = self.longest_label
            model['opt'] = self.opt

            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            model = torch.load(read)
        return model['opt'], model

    def set_states(self, states):
        """Set the state dicts of the modules from saved states."""
        self.lt.load_state_dict(states['lt'])
        self.lt2enc.load_state_dict(states['lt2enc'])
        self.lt2dec.load_state_dict(states['lt2dec'])
        self.encoder.load_state_dict(states['encoder'])
        self.decoder.load_state_dict(states['decoder'])
        self.h2o.load_state_dict(states['h2o'])
        for k, v in states['optims'].items():
            self.optims[k].load_state_dict(v)
        self.longest_label = states['longest_label']
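# A minimal sketch of driving Seq2seqAgent through one observe/act exchange.
# It assumes ParlaiParser.parse_args accepts `args` and `print_args` (as the
# dictionary test elsewhere in this file suggests); the agent is untrained
# here, so act() returns arbitrary tokens.
from parlai.core.params import ParlaiParser

argparser = ParlaiParser()
Seq2seqAgent.add_cmdline_args(argparser)
opt = argparser.parse_args(args=[], print_args=False)
agent = Seq2seqAgent(opt)
agent.observe({'text': 'hello world', 'episode_done': True})
reply = agent.act()  # batch_act on a batch of one
print(reply['text'])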
class LanguageModelAgent(Agent):
    """Agent which trains an RNN on a language modeling task.

    It is adapted from the language model featured in Pytorch's examples repo
    here: <https://github.com/pytorch/examples/tree/master/word_language_model>.
    """

    @staticmethod
    def dictionary_class():
        return DictionaryAgent

    @staticmethod
    def add_cmdline_args(argparser):
        """Add command-line arguments specifically for this agent."""
        argparser.set_defaults(batch_sort=False)
        LanguageModelAgent.dictionary_class().add_cmdline_args(argparser)
        agent = argparser.add_argument_group('Language Model Arguments')
        agent.add_argument('-hs', '--hiddensize', type=int, default=200,
                           help='size of the hidden layers')
        agent.add_argument('-esz', '--embeddingsize', type=int, default=200,
                           help='size of the token embeddings')
        agent.add_argument('-nl', '--numlayers', type=int, default=2,
                           help='number of hidden layers')
        agent.add_argument('-lr', '--learningrate', type=float, default=20,
                           help='initial learning rate')
        agent.add_argument('-dr', '--dropout', type=float, default=0.2,
                           help='dropout rate')
        agent.add_argument('-clip', '--gradient-clip', type=float,
                           default=0.25, help='gradient clipping')
        agent.add_argument('--no-cuda', action='store_true', default=False,
                           help='disable GPUs even if available')
        agent.add_argument('-rnn', '--rnn-class', default='LSTM',
                           help='type of recurrent net '
                                '(RNN_TANH, RNN_RELU, LSTM, GRU)')
        agent.add_argument('-sl', '--seq-len', type=int, default=35,
                           help='sequence length')
        agent.add_argument('-tied', '--emb-tied', action='store_true',
                           help='tie the word embedding and softmax weights')
        agent.add_argument('-seed', '--random-seed', type=int, default=1111,
                           help='random seed')
        agent.add_argument('--gpu', type=int, default=-1,
                           help='which GPU device to use')
        agent.add_argument('-tr', '--truncate-pred', type=int, default=50,
                           help='truncate predictions')
        agent.add_argument('-rf', '--report-freq', type=float, default=0.1,
                           help='report frequency of prediction during eval')

    def __init__(self, opt, shared=None):
        """Set up model if shared params not set, otherwise no work to do."""
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init
        self.states = {}
        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
        self.batchsize = opt.get('batchsize', 1)

        if shared:
            # set up shared properties
            self.dict = shared['dict']
            if 'model' in shared:
                # model is shared during hogwild
                self.model = shared['model']
                self.states = shared['states']
            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]
        else:
            # this is not a shared instance of this class, so do full init
            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                # load model parameters if available
                print('Loading existing model params from ' +
                      opt['model_file'])
                new_opt, self.states = self.load(opt['model_file'])
                # override model-specific options with stored ones
                opt = self.override_opt(new_opt)

            if opt['dict_file'] is None and opt.get('model_file'):
                # set default dict-file if not set
                opt['dict_file'] = opt['model_file'] + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'LanguageModel'
            # get NULL token and END token
            self.NULL_IDX = self.dict[self.dict.null_token]
            self.END_IDX = self.dict[self.dict.end_token]

            # set model
            self.model = RNNModel(opt, len(self.dict))

            if self.states:
                # set loaded states if applicable
                self.model.load_state_dict(self.states['model'])

            if self.use_cuda:
                self.model.cuda()

        self.next_observe = []
        self.next_batch = []

        self.is_training = True

        if hasattr(self, 'model'):
            # if model was built, do more setup
            self.clip = opt.get('gradient_clip', 0.25)
            # set up criteria
            self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX)
            if self.use_cuda:
                # push to cuda
                self.criterion.cuda()
            # set up criterion for eval: we do not want to average over size
            self.eval_criterion = nn.CrossEntropyLoss(
                ignore_index=self.NULL_IDX, reduction='sum')
            if self.use_cuda:
                # push to cuda
                self.eval_criterion.cuda()
            # init hidden state
            self.hidden = self.model.init_hidden(self.batchsize)
            # init tensor of end tokens
            self.ends = torch.LongTensor(
                [self.END_IDX for _ in range(self.batchsize)])
            if self.use_cuda:
                self.ends = self.ends.cuda()
            # set up optimizer
            self.lr = opt['learningrate']
            best_val_loss = None

        self.reset()

    def override_opt(self, new_opt):
        """Set overridable opts from loaded opt file.

        Print out each added key and each overridden key.
        Only override args specific to the model.
        """
        model_args = {'hiddensize', 'embeddingsize', 'numlayers', 'dropout',
                      'seq_len', 'emb_tied'}
        for k, v in new_opt.items():
            if k not in model_args:
                # skip non-model args
                continue
            if k not in self.opt:
                print('Adding new option [ {k}: {v} ]'.format(k=k, v=v))
            elif self.opt[k] != v:
                print('Overriding option [ {k}: {old} => {v} ]'.format(
                    k=k, old=self.opt[k], v=v))
            self.opt[k] = v
        return self.opt

    def parse(self, text):
        """Convert string to token indices."""
        return self.dict.txt2vec(text)

    def zero_grad(self):
        """Zero out optimizer."""
        self.model.zero_grad()

    def update_params(self):
        """Do one optimization step."""
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
        for p in self.model.parameters():
            p.data.add_(p.grad.data, alpha=-self.lr)

    def reset(self):
        """Reset observation."""
        self.observation = None

    def share(self):
        """Share internal states between parent and child instances."""
        shared = super().share()
        shared['dict'] = self.dict
        shared['NULL_IDX'] = self.NULL_IDX
        shared['END_IDX'] = self.END_IDX
        if self.opt.get('numthreads', 1) > 1:
            shared['model'] = self.model
            self.model.share_memory()
            shared['states'] = self.states
        return shared

    def observe(self, observation):
        """Save observation for act.

        If multiple observations are from the same episode, concatenate them.
        """
        # shallow copy observation (deep copy can be expensive)
        obs = observation.copy()
        seq_len = self.opt['seq_len']
        is_training = True
        if 'eval_labels' in obs:
            is_training = False

        if is_training:
            if 'text' in obs:
                vec = self.parse(obs['text'])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if 'labels' in obs:
                vec = self.parse(obs['labels'][0])
                vec.append(self.END_IDX)
                self.next_observe += vec
            if len(self.next_observe) < (seq_len + 1):
                # not enough to return to make a batch
                # we handle this case in vectorize
                # labels indicates that we are training
                self.observation = {'labels': ''}
                return self.observation
            else:
                vecs_to_return = []
                total = len(self.next_observe) // (seq_len + 1)
                for _ in range(total):
                    observe = self.next_observe[:(seq_len + 1)]
                    self.next_observe = self.next_observe[(seq_len + 1):]
                    vecs_to_return.append(observe)
                dict_to_return = {'text': '', 'labels': '',
                                  'text2vec': vecs_to_return}
                self.observation = dict_to_return
                return dict_to_return
        else:
            self.observation = obs
            return obs

    def repackage_hidden(self, h):
        """Wrap hidden states in new Variables to detach them from their
        history.
        """
        if type(h) == Variable:
            return Variable(h.data)
        else:
            return tuple(self.repackage_hidden(v) for v in h)

    def get_target_loss(self, data, hidden, targets, y_lens):
        """Calculate the loss with respect to the targets, token by token,
        where each output token is conditioned on either the input or the
        previous target token.
        """
        loss = 0.0
        bsz = data.size(0)

        # feed in inputs without end token
        output, hidden = self.model(data.transpose(0, 1), hidden)
        self.hidden = self.repackage_hidden(hidden)
        # feed in end tokens
        output, hidden = self.model(Variable(self.ends[:bsz].view(1, bsz)),
                                    self.hidden)
        self.hidden = self.repackage_hidden(hidden)
        output_flat = output.view(-1, len(self.dict))
        loss += self.eval_criterion(output_flat,
                                    targets.select(1, 0).view(-1)).data

        for i in range(1, targets.size(1)):
            output, hidden = self.model(targets.select(1, i - 1).view(1, bsz),
                                        self.hidden, no_pack=True)
            self.hidden = self.repackage_hidden(hidden)
            output_flat = output.view(-1, len(self.dict))
            loss += self.eval_criterion(output_flat,
                                        targets.select(1, i).view(-1)).data

        return loss / float(sum(y_lens))

    def get_predictions(self, data):
        """Generate predictions word by word until we either reach the end
        token or some maximum length (opt['truncate_pred']).
        """
        token_list = []
        bsz = data.size(0)
        done = [False for _ in range(bsz)]
        total_done = 0
        hidden = self.model.init_hidden(bsz)

        i = 0
        while total_done < bsz and i <= self.opt['truncate_pred']:
            if i == 0:
                # feed in input without end tokens
                output, hidden = self.model(data.transpose(0, 1), hidden)
                hidden = self.repackage_hidden(hidden)
                # feed in end tokens
                output, hidden = self.model(
                    Variable(self.ends[:bsz].view(1, bsz)), hidden)
            else:
                output, hidden = self.model(Variable(word_idx.view(1, bsz)),
                                            hidden, no_pack=True)
            hidden = self.repackage_hidden(hidden)
            word_weights = output.squeeze().data.exp()
            if bsz > 1:
                value, word_idx = torch.max(word_weights, 1)
            else:
                value, word_idx = torch.max(word_weights, 0)
            # mark end indices for items in batch
            for k in range(word_idx.size(0)):
                if not done[k]:
                    if int(word_idx[k]) == self.END_IDX:
                        done[k] = True
                        total_done += 1
            token_list.append(word_idx.view(bsz, 1))
            i += 1

        return torch.cat(token_list, 1)

    def predict(self, data, hidden, targets=None, is_training=True,
                y_lens=None):
        """Produce a prediction from our model."""
        loss_dict = None
        output = None
        predictions = None
        if is_training:
            self.model.train()
            self.zero_grad()
            output, hidden = self.model(data, hidden)
            loss = self.criterion(output.view(-1, len(self.dict)),
                                  targets.view(-1))
            loss.backward(retain_graph=True)
            self.update_params()
            loss_dict = {'lmloss': loss.data}
            loss_dict['lmppl'] = math.exp(loss.data)
        else:
            self.model.eval()
            predictions = self.get_predictions(data)
            loss_dict = {}
            bsz = data.size(0)
            if bsz != self.batchsize:
                self.hidden = self.model.init_hidden(bsz)
            loss = self.get_target_loss(data, self.hidden, targets, y_lens)
            loss_dict['loss'] = loss
            loss_dict['ppl'] = math.exp(loss)

        return output, hidden, loss_dict, predictions

    def vectorize(self, observations, seq_len, is_training):
        """Convert a list of observations into input & target tensors."""
        labels = None
        valid_inds = None
        y_lens = None
        if is_training:
            for obs in observations:
                if obs:
                    if 'text2vec' in obs:
                        self.next_batch += obs['text2vec']
            if len(self.next_batch) <= self.batchsize:
                return None, None, None, None, None
            else:
                data_list = []
                targets_list = []
                # total is the number of batches
                total = len(self.next_batch) // self.batchsize
                for i in range(total):
                    batch = self.next_batch[:self.batchsize]
                    self.next_batch = self.next_batch[self.batchsize:]
                    source = torch.LongTensor(batch).t().contiguous()
                    data = Variable(source[:seq_len])
                    targets = Variable(source[1:])
                    if self.use_cuda:
                        data = data.cuda()
                        targets = targets.cuda()
                    data_list.append(data)
                    targets_list.append(targets)
        else:
            # here we get valid examples and pad them with zeros
            xs, ys, labels, valid_inds, _, y_lens = PaddingUtils.pad_text(
                observations, self.dict, self.END_IDX, self.NULL_IDX)
            if self.use_cuda:
                xs = Variable(xs).cuda()
                ys = Variable(ys).cuda()
            else:
                xs = Variable(xs)
                ys = Variable(ys)
            data_list = [xs]
            targets_list = [ys]

        return data_list, targets_list, labels, valid_inds, y_lens

    def batch_act(self, observations):
        batch_reply = [{'id': self.getID()} for _ in range(len(observations))]

        if any(['labels' in obs for obs in observations]):
            # if we are starting a new training epoch, reinitialize hidden
            if not self.is_training:
                self.hidden = self.model.init_hidden(self.batchsize)
            self.is_training = True
            data_list, targets_list, _, _, y_lens = self.vectorize(
                observations, self.opt['seq_len'], self.is_training)
        else:
            # if we just finished training, reinitialize hidden
            if self.is_training:
                self.hidden = self.model.init_hidden(self.batchsize)
            self.is_training = False
            data_list, targets_list, labels, valid_inds, y_lens = (
                self.vectorize(observations, self.opt['seq_len'],
                               self.is_training))

        if data_list is None:
            # not enough data to batch act yet, return empty responses
            return batch_reply

        batch_reply = []
        # during evaluation, len(data_list) is always 1
        # during training, len(data_list) >= 0: vectorize returns a list
        # containing all batches available at the time it is called
        for i in range(len(data_list)):
            temp_dicts = [{'id': self.getID()}
                          for _ in range(len(observations))]
            output, hidden, loss_dict, predictions = self.predict(
                data_list[i], self.hidden, targets_list[i],
                self.is_training, y_lens)
            self.hidden = self.repackage_hidden(hidden)

            if predictions is not None:
                # map predictions back to the right order
                PaddingUtils.map_predictions(
                    predictions, valid_inds, temp_dicts, observations,
                    self.dict, self.END_IDX,
                    report_freq=self.opt['report_freq'])

            if loss_dict is not None:
                if 'metrics' in temp_dicts[0]:
                    for k, v in loss_dict.items():
                        temp_dicts[0]['metrics'][k] = v
                else:
                    temp_dicts[0]['metrics'] = loss_dict

            batch_reply += temp_dicts

        return batch_reply

    def act(self):
        # call batch_act with this batch of one
        return self.batch_act([self.observation])[0]

    def save(self, path=None):
        """Save model parameters if model_file is set."""
        path = self.opt.get('model_file', None) if path is None else path

        if path and hasattr(self, 'model'):
            model = {}
            model['model'] = self.model.state_dict()
            model['opt'] = self.opt

            with open(path, 'wb') as write:
                torch.save(model, write)

    def shutdown(self):
        """Save the state of the model when shutdown."""
        path = self.opt.get('model_file', None)
        if path is not None:
            self.save(path + '.shutdown_state')
        super().shutdown()

    def load(self, path):
        """Return opt and model states."""
        with open(path, 'rb') as read:
            states = torch.load(read)
        return states['opt'], states
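# The observe()/vectorize() pair above buffers the token stream and cuts it
# into (seq_len + 1)-length chunks; each chunk then supplies an input of
# seq_len tokens and a target shifted one position right. A toy demo of that
# slicing with made-up token ids (no ParlAI or torch required):
seq_len = 4
stream = [10, 11, 12, 13, 14, 20, 21, 22, 23, 24]  # buffered token ids
chunks = [stream[i:i + seq_len + 1]
          for i in range(0, len(stream) - seq_len, seq_len + 1)]
for chunk in chunks:
    data, targets = chunk[:seq_len], chunk[1:]
    print('input:', data, '-> targets:', targets)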
class FlickrDataset(Dataset):
    """A Pytorch Dataset utilizing streaming."""

    def __init__(self, opt):
        self.opt = opt
        self.use_hdf5 = opt.get('use_hdf5', False)
        self.datatype = self.opt.get('datatype')
        self.training = self.datatype.startswith('train')
        self.num_epochs = self.opt.get('num_epochs', 0)
        self.image_loader = ImageLoader(opt)
        caption_path, self.image_path = _path(opt)
        self._setup_data(caption_path, opt.get('unittest', False))
        if self.use_hdf5:
            try:
                import h5py
                self.h5py = h5py
            except ModuleNotFoundError:
                raise ModuleNotFoundError(
                    'Need to install h5py - `pip install h5py`')
            self._setup_image_data()
        self.dict_agent = DictionaryAgent(opt)

    def __getitem__(self, index):
        index %= self.num_episodes()
        cap = self.caption[index]
        ep = {
            'text': self.dict_agent.txt2vec(QUESTION),
            'image': self.get_image(cap['image_id']),
            'episode_done': True,
        }
        if self.opt.get('extract_image', False):
            ep['image_id'] = cap['image_id']
            return ep
        ep['labels'] = [self.dict_agent.txt2vec(cc)
                        for cc in cap['captions']]
        ep['valid'] = True
        ep['use_hdf5'] = self.use_hdf5
        return (index, ep)

    def __len__(self):
        num_epochs = self.num_epochs if self.num_epochs > 0 else 100
        num_iters = num_epochs if self.training else 1
        return int(num_iters * self.num_episodes())

    def _load_lens(self):
        with open(self.length_datafile) as length:
            lengths = json.load(length)
            self.num_eps = lengths['num_eps']
            self.num_exs = lengths['num_exs']

    def _setup_data(self, caption_path, unittest):
        with open(caption_path) as data_file:
            self.caption = []
            prev_img_id = None
            for line in data_file:
                img_id = line.split('#')[0][:-4]
                caption = line.split('\t')[1]
                if img_id != prev_img_id:
                    prev_img_id = img_id
                    to_add = {}
                    to_add['image_id'] = int(img_id)
                    to_add['captions'] = [caption]
                    self.caption.append(to_add)
                else:
                    self.caption[-1]['captions'].append(caption)
        if unittest:
            self.caption = self.caption[:10]
        self.image_paths = set()
        for cap in self.caption:
            self.image_paths.add(
                os.path.join(self.image_path, '%d.jpg' % (cap['image_id'])))

    def _setup_image_data(self):
        """Load the hdf5 image dataset."""
        extract_feats(self.opt)
        im = self.opt.get('image_mode')
        hdf5_path = self.image_path + 'mode_{}_noatt.hdf5'.format(im)
        hdf5_file = self.h5py.File(hdf5_path, 'r')
        self.image_dataset = hdf5_file['images']

        image_id_to_idx_path = (self.image_path +
                                'mode_{}_id_to_idx.txt'.format(im))
        with open(image_id_to_idx_path, 'r') as f:
            self.image_id_to_idx = json.load(f)

    def get_image(self, image_id):
        if not self.use_hdf5:
            im_path = os.path.join(self.image_path, '%d.jpg' % (image_id))
            return self.image_loader.load(im_path)
        else:
            img_idx = self.image_id_to_idx[str(image_id)]
            return torch.Tensor(self.image_dataset[img_idx])

    def num_episodes(self):
        return len(self.caption)

    def num_examples(self):
        return self.num_episodes()

    def num_images(self):
        return self.num_episodes()
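# Hypothetical usage sketch for FlickrDataset. Because __getitem__ returns
# (index, episode_dict) pairs whose fields vary in length, a pass-through
# collate_fn is needed; `opt` is assumed to be a ParlAI opt dict with the
# Flickr30k data paths already configured, and batch_size is arbitrary.
from torch.utils.data import DataLoader

def collate_episodes(batch):
    # keep the list of (index, episode) pairs as-is; no tensor stacking
    return batch

loader = DataLoader(FlickrDataset(opt), batch_size=4,
                    collate_fn=collate_episodes, shuffle=False)
for indexed_episodes in loader:
    for index, episode in indexed_episodes:
        pass  # feed each episode to an agent or teacher here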