Example #1
def create_agent(opt: Opt, requireModelExists=False):
    """
    Create an agent from the options ``model``, ``model_params`` and ``model_file``.

    The input is either of the form
    ``parlai.agents.ir_baseline.agents:IrBaselineAgent`` (i.e. the path
    followed by the class name) or else just ``ir_baseline`` which
    assumes the path above, and a class name suffixed with 'Agent'.

    If ``model_file`` is available in the options, this function can also
    attempt to load the model from that location instead. This avoids having to
    specify all the other options necessary to set up the model, including its
    name, as they are all loaded from the options file if it exists (the file
    opt['model_file'] + '.opt' must exist and contain a pickled or json dict
    containing the model's options).
    """
    if opt.get('datapath', None) is None:
        add_datapath_and_model_args(opt)

    if opt.get('model_file'):
        opt['model_file'] = modelzoo_path(opt.get('datapath'), opt['model_file'])
        if requireModelExists and not PathManager.exists(opt['model_file']):
            raise RuntimeError(
                'WARNING: Model file does not exist, check to make '
                'sure it is correct: {}'.format(opt['model_file'])
            )
        # Attempt to load the model from the model file first (this way we do
        # not even have to specify the model name as a parameter)
        model = create_agent_from_opt_file(opt)
        if model is not None:
            return model
        else:
            logging.info(f"No model with opt yet at: {opt['model_file']}(.opt)")

    if opt.get('model'):
        model_class = load_agent_module(opt['model'])
        # if we want to load weights from --init-model, compare opts with
        # loaded ones
        compare_init_model_opts(opt, opt)
        model = model_class(opt)
        if requireModelExists and hasattr(model, 'load') and not opt.get('model_file'):
            # double check that we didn't forget to set model_file on loadable model
            logging.warn('model_file unset but model has a `load` function.')
        return model
    else:
        raise RuntimeError('Need to set `model` argument to use create_agent.')
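As a usage note: because create_agent can bootstrap everything from the model file alone, a minimal call needs only 'model_file'. A sketch, reusing the blender_90M zoo path that appears elsewhere on this page:

from parlai.core.agents import create_agent
from parlai.core.opt import Opt

# 'zoo:' paths are resolved against the datapath by modelzoo_path;
# the sidecar 'model.opt' supplies the remaining options.
opt = Opt({'model_file': 'zoo:blender/blender_90M/model'})
agent = create_agent(opt, requireModelExists=True)
agent.observe({'text': 'hello', 'episode_done': False})
print(agent.act())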
Example #2
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Index words in embedding file
        if (self.opt['pretrained_words'] and self.opt.get('embedding_file')
                and not self.opt.get('trained', False)):
            print('[ Indexing words with embeddings... ]')
            self.embedding_words = set()
            self.opt['embedding_file'] = modelzoo_path(
                self.opt.get('datapath'), self.opt['embedding_file'])
            with open(self.opt['embedding_file']) as f:
                for line in f:
                    w = normalize_text(line.rstrip().split(' ')[0])
                    self.embedding_words.add(w)
            print('[ Num words in set = %d ]' % len(self.embedding_words))
        else:
            self.embedding_words = None
Example #3
    def __init__(self, opt: Opt):
        self.opt = opt
        self.agents = []
        self.agent_dict = None
        self.generations = []
        self.input_type = 'Search'
        self.knowledge_access_method = KnowledgeAccessMethod(
            opt['knowledge_access_method'])
        model_file = modelzoo_path(opt['datapath'],
                                   opt['query_generator_model_file'])
        if (self.knowledge_access_method is KnowledgeAccessMethod.SEARCH_ONLY
                and 'blenderbot2/query_generator/model' in model_file):
            raise ValueError(
                'You cannot use the blenderbot2 query generator with search_only. Please '
                'consider setting --query-generator-model-file zoo:sea/bart_sq_gen/model '
                'instead.')
        if model_file and os.path.exists(model_file):
            logging.info(f'Building Query Generator from file: {model_file}')
            logging.disable()
            overrides: Dict[str, Any] = {'skip_generation': False}
            overrides['inference'] = opt['query_generator_inference']
            overrides['beam_size'] = opt.get('query_generator_beam_size', 3)
            overrides['beam_min_length'] = opt.get(
                'query_generator_beam_min_length', 2)
            overrides['model_parallel'] = opt['model_parallel']
            overrides['no_cuda'] = opt['no_cuda']
            if self.opt['query_generator_truncate'] > 0:
                overrides['text_truncate'] = self.opt[
                    'query_generator_truncate']
                overrides['truncate'] = self.opt['query_generator_truncate']
            base_agent = create_agent_from_model_file(model_file,
                                                      opt_overrides=overrides)
            assert isinstance(base_agent, TorchAgent)
            self.agents = [base_agent]
            bsz = max(
                opt.get('batchsize') or 1,
                opt.get('eval_batchsize') or 1)
            rag_turn_n_turns = opt.get('rag_turn_n_turns', 1)
            if bsz > 1 or rag_turn_n_turns > 1:
                self.agents += [
                    create_agent_from_shared(self.agents[0].share())
                    for _ in range((bsz * rag_turn_n_turns) - 1)
                ]
            self.agent_dict = self.agents[0].build_dictionary()
            logging.enable()
Example #4
    def __init__(self, opt, shared=None):
        # initialize fields
        self.opt = copy.deepcopy(opt)
        self.minfreq = opt.get('dict_minfreq', DictionaryAgent.default_minfreq)
        self.null_token = '__PAD__'
        self.end_token = '__SOC__'
        self.unk_token = '__UNK__'
        self.start_token = '__SOC__'
        self.tokenizer = opt.get('dict_tokenizer', 'whitespace')
        self.lower = opt.get('dict_lower', DictionaryAgent.default_lower)
        self.maxtokens = opt.get('dict_maxtokens',
                                 DictionaryAgent.default_maxtokens)
        self.textfields = opt.get(
            'dict_textfields', DictionaryAgent.default_textfields).split(",")

        if shared:
            self.freq = shared.get('freq', {})
            self.tok2ind = shared.get('tok2ind', {})
            self.ind2tok = shared.get('ind2tok', {})
        else:
            self.freq = defaultdict(int)
            self.tok2ind = {}
            self.ind2tok = {}

            if opt.get('dict_file') and os.path.isfile(opt['dict_file']):
                # load pre-existing dictionary
                self.load(opt['dict_file'])
            elif opt.get('dict_initpath'):
                # load seed dictionary
                opt['dict_initpath'] = modelzoo_path(opt.get('datapath'),
                                                     opt['dict_initpath'])
                self.load(opt['dict_initpath'])

            self.add_token(self.null_token)
            self.add_token(self.start_token)
            self.add_token(self.end_token)
            self.add_token(self.unk_token)

        if not shared:
            if opt.get('dict_file'):
                self.save_path = opt['dict_file']

        # cache unk token for later
        self._unk_token_idx = self.tok2ind.get(self.unk_token)
Example #5
def get_model_name(opt):
    model = opt.get('model', None)
    if model is None:
        # try to get model name from model opt file
        model_file = opt.get('model_file', None)
        if model_file is not None:
            model_file = modelzoo_path(opt.get('datapath'), model_file)
            optfile = model_file + '.opt'
            if os.path.isfile(optfile):
                try:
                    # try json first
                    with open(optfile, 'r', encoding='utf-8') as handle:
                        new_opt = json.load(handle)
                        model = new_opt.get('model', None)
                except UnicodeDecodeError:
                    # oops it's pickled
                    with open(optfile, 'rb') as handle:
                        new_opt = pickle.load(handle)
                        model = new_opt.get('model', None)
    return model
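The function above leans on ParlAI's convention that a trained model file carries a '.opt' sidecar holding its options, serialized as JSON for newer models and as pickle for older ones (hence the UnicodeDecodeError fallback). A self-contained sketch of that convention, with toy paths:

import json
import os
import tempfile

tmpdir = tempfile.mkdtemp()
model_file = os.path.join(tmpdir, 'model')
# write the sidecar the way newer ParlAI versions do: plain JSON
with open(model_file + '.opt', 'w', encoding='utf-8') as handle:
    json.dump({'model': 'transformer/generator'}, handle)

with open(model_file + '.opt', 'r', encoding='utf-8') as handle:
    print(json.load(handle).get('model'))  # -> transformer/generator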
Example #6
    def set_options(self, name, device):
        option = {
            "n_image_tokens": 1,
            "n_image_channels": 1,
            "image_fusion_type": "late",
        }

        add_datapath_and_model_args(option)
        datapath = option.get("datapath")
        option['model_file'] = modelzoo_path(datapath, name)
        option["override"] = {
            "no_cuda": False if "cuda" in device else True,
        }

        if "cuda:" in device:
            option["override"]["gpu"] = int(device.split(":")[1])
        elif "cuda" in device:
            option["override"]["gpu"] = 0

        return option
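The device handling above translates a torch-style device string into ParlAI's 'no_cuda'/'gpu' override keys. The same mapping as a standalone helper (hypothetical name, for illustration only):

def device_to_overrides(device: str) -> dict:
    # 'cpu' -> CPU only; 'cuda' -> GPU 0; 'cuda:1' -> GPU 1
    override = {'no_cuda': 'cuda' not in device}
    if 'cuda:' in device:
        override['gpu'] = int(device.split(':')[1])
    elif 'cuda' in device:
        override['gpu'] = 0
    return override

assert device_to_overrides('cuda:1') == {'no_cuda': False, 'gpu': 1}
assert device_to_overrides('cpu') == {'no_cuda': True}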
Example #7
    def load_passages(self) -> List[Tuple[str, str, str]]:
        """
        Load passages from tsv file.

        Limit passages returned according to shard number.

        :return passages:
            return a list of (doc_id, doc_text, doc_title) tuples
        """
        logging.info(f"Loading {self.opt['passages_file']}")
        rows = load_passages_list(
            modelzoo_path(self.opt['datapath'],
                          self.opt['passages_file'])  # type: ignore
        )
        shard_id, num_shards = self.opt['shard_id'], self.opt['num_shards']
        shard_size = int(len(rows) / num_shards)
        start_idx = shard_id * shard_size
        end_idx = start_idx + shard_size
        logging.info(f'Shard {shard_id} of {num_shards} encoding psg index '
                     f'{start_idx} to {end_idx}, out of {len(rows)}')
        return rows[start_idx:end_idx]
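Note that the shard arithmetic floors len(rows) / num_shards, so when the row count is not evenly divisible, up to num_shards - 1 trailing rows fall outside every shard. A quick standalone check of the bounds:

def shard_bounds(n_rows: int, shard_id: int, num_shards: int):
    shard_size = n_rows // num_shards
    start_idx = shard_id * shard_size
    return start_idx, start_idx + shard_size

# 1000 rows over 3 shards: 333 each; row 999 belongs to no shard.
print([shard_bounds(1000, i, 3) for i in range(3)])
# -> [(0, 333), (333, 666), (666, 999)]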
Example #8
    def __init__(self, opt: Opt):
        self.opt = opt
        self.agents = []
        self.agent_dict = None
        self.generations = []
        self.input_type = 'Memory'
        self.delimiter = opt.get('memory_decoder_delimiter', '\n')
        self.one_line_memories = opt.get('memory_decoder_one_line_memories',
                                         False)
        model_file = modelzoo_path(opt['datapath'],
                                   opt['memory_decoder_model_file'])
        if model_file and os.path.exists(model_file):
            logging.info(f'Building Memory Decoder from file: {model_file}')
            logging.disable()
            overrides = {
                'skip_generation': False,
                'inference': 'beam',
                'beam_size': opt.get('memory_decoder_beam_size', 3),
                'beam_min_length': opt.get('memory_decoder_beam_min_length',
                                           10),
                'beam_block_ngram': 3,
                'no_cuda': opt.get('no_cuda', False),
            }
            if self.opt.get('memory_decoder_truncate', -1) > 0:
                overrides['text_truncate'] = self.opt[
                    'memory_decoder_truncate']
                overrides['truncate'] = self.opt['memory_decoder_truncate']
            base_agent = create_agent_from_model_file(model_file,
                                                      opt_overrides=overrides)
            assert isinstance(base_agent, TorchAgent)
            self.agents = [base_agent]
            assert isinstance(self.agents[0], TorchAgent)
            copies = max(100,
                         (opt['batchsize'] * opt.get('rag_turn_n_turns', 1)))
            self.agents += [
                create_agent_from_shared(self.agents[0].share())
                for _ in range(copies)
            ]
            self.agent_dict = self.agents[0].build_dictionary()
            logging.enable()
Example #9
    def test_set_model_file_without_dict_file(self):
        """Check that moving a model without moving the dictfile raises an error."""
        # Download model, move to a new location
        datapath = ParlaiParser().parse_args(print_args=False)['datapath']
        try:
            # remove unittest models if there before
            shutil.rmtree(os.path.join(datapath, 'models/unittest'))
        except FileNotFoundError:
            pass
        testing_utils.download_unittest_models()

        zoo_path = 'zoo:unittest/seq2seq/model'
        model_path = modelzoo_path(datapath, zoo_path)
        os.remove(model_path + '.dict')
        # Test that eval model fails
        with self.assertRaises(RuntimeError):
            testing_utils.eval_model(dict(task='babi:task1k:1', model_file=model_path))
        try:
            # remove unittest models if there after
            shutil.rmtree(os.path.join(datapath, 'models/unittest'))
        except FileNotFoundError:
            pass
Example #10
    def _get_subagent_opt(
        self,
        filename: str,
        specific_override_args: Dict[str, Any],
        general_override_args: Dict[str, Any],
    ) -> Opt:
        """
        Given an agent opt, construct the new opt for the agent.

        :param filename:
            opt path
        :param specific_override_args:
            args for the specific agent
        :param general_override_args:
            args specified for all agents
        """
        if not filename.endswith('.opt'):
            filename += '.opt'
        opt = Opt.load(modelzoo_path(self.opt['datapath'], filename))
        opt['override'] = {}
        blocklist_general = ['model', 'model_file', 'init_model']
        general_override_args['skip_generation'] = False

        # Remove the prefix for the model for the specific override args.
        specific_override_args = {
            '_'.join(k.split('_')[1:]): v for k, v in specific_override_args.items()
        }

        override_args = {**general_override_args, **specific_override_args}

        for k, v in override_args.items():
            if k not in blocklist_general and k in opt:
                logging.warning(f'Overriding {k} to {v} (old val: {opt[k]})')
                opt['override'][k] = v
            elif k in specific_override_args:
                logging.warning(f'Key {k} not originally in opt, setting to {v}')
                opt['override'][k] = v

        return opt
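Two details above are easy to miss: the dict unpacking gives specific_override_args precedence over general_override_args on key collisions, and '_'.join(k.split('_')[1:]) strips the leading agent-name prefix from each specific key. A toy illustration (the 'knowledge_' prefix is an assumed example):

specific = {'knowledge_beam_size': 5}  # per-agent arg, prefixed
general = {'beam_size': 3, 'inference': 'beam'}

stripped = {'_'.join(k.split('_')[1:]): v for k, v in specific.items()}
print(stripped)                 # {'beam_size': 5}
print({**general, **stripped})  # {'beam_size': 5, 'inference': 'beam'} -- specific wins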
Example #11
def load_embeddings(opt, word_dict):
    """Initialize embeddings from file of pretrained vectors."""
    embeddings = torch.Tensor(len(word_dict), opt['embedding_dim'])
    embeddings.normal_(0, 1)
    opt['embedding_file'] = modelzoo_path(opt.get('datapath'),
                                          opt['embedding_file'])
    # Fill in embeddings
    if not opt.get('embedding_file'):
        raise RuntimeError('Tried to load embeddings with no embedding file.')
    with open(opt['embedding_file']) as f:
        for line in f:
            parsed = line.rstrip().split(' ')
            if len(parsed) > 2:
                assert (len(parsed) == opt['embedding_dim'] + 1)
                w = normalize_text(parsed[0])
                if w in word_dict:
                    vec = torch.Tensor([float(i) for i in parsed[1:]])
                    embeddings[word_dict[w]].copy_(vec)

    # Zero NULL token
    embeddings[word_dict['__NULL__']].fill_(0)

    return embeddings
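load_embeddings assumes the GloVe/word2vec text layout: one token per line followed by embedding_dim space-separated floats (the len(parsed) > 2 guard skips header or otherwise short lines). A toy parse of one such line, with dimension 3 instead of a realistic 300:

import torch

line = 'hello 0.1 0.2 0.3\n'  # token + embedding_dim floats
parsed = line.rstrip().split(' ')
assert len(parsed) == 3 + 1   # embedding_dim + 1
vec = torch.Tensor([float(i) for i in parsed[1:]])
print(parsed[0], vec)         # hello tensor([0.1000, 0.2000, 0.3000])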
Example #12
    def set_options(self, name, path, class_name, device):
        option = {
            "n_image_tokens": 1,
            "n_image_channels": 1,
            "image_fusion_type": "late",
            "image_features_dim": 2048,
            "image_encoder_num_layers": 1,
        }
        add_datapath_and_model_args(option)
        datapath = option.get('datapath')
        option['model_file'] = modelzoo_path(datapath, name)
        option["override"] = {
            "no_cuda": False if "cuda" in device else True,
        }

        if "cuda:" in device:
            option["override"]["gpu"] = int(device.split(":")[1])
        elif "cuda" in device:
            option["override"]["gpu"] = 0

        my_module = importlib.import_module(path)
        model_class = getattr(my_module, class_name)
        return option, model_class
Example #13
def set_defaults(opt):
    init_model = None
    # check first for 'init_model' for loading model from file
    if opt.get('init_model') and os.path.isfile(opt['init_model']):
        init_model = opt['init_model']
    # next check for 'model_file', this would override init_model
    if opt.get('model_file') and os.path.isfile(opt['model_file']):
        init_model = opt['model_file']

    if init_model is None:
        # Embeddings options
        opt['embedding_file'] = modelzoo_path(
            opt.get('datapath'), opt['embedding_file']
        )
        if opt.get('embedding_file'):
            if not os.path.isfile(opt['embedding_file']):
                raise IOError('No such file: %s' % opt['embedding_file'])
            with open(opt['embedding_file']) as f:
                dim = len(f.readline().strip().split(' ')) - 1
                if dim == 1:
                    # first line was a dud
                    dim = len(f.readline().strip().split(' ')) - 1
            opt['embedding_dim'] = dim
        elif not opt.get('embedding_dim'):
            raise RuntimeError(
                ('Either embedding_file or embedding_dim ' 'needs to be specified.')
            )

        # Make sure tune_partial and fix_embeddings are consistent
        if opt['tune_partial'] > 0 and opt['fix_embeddings']:
            print('Setting fix_embeddings to False as tune_partial > 0.')
            opt['fix_embeddings'] = False

        # Make sure fix_embeddings and embedding_file are consistent
        if opt['fix_embeddings'] and not opt.get('embedding_file'):
            print('Setting fix_embeddings to False as embeddings are random.')
            opt['fix_embeddings'] = False
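The 'first line was a dud' branch exists because fastText-style .vec files begin with a two-token header (vocabulary size and dimension); measured against that line, dim comes out as 1, so the code re-measures on the first real vector row. The same inference on a toy in-memory file:

import io

vec_file = io.StringIO('2000000 300\nthe 0.1 0.2 0.3\n')  # header + toy row
dim = len(vec_file.readline().strip().split(' ')) - 1
if dim == 1:
    # first line was a header, not a vector row
    dim = len(vec_file.readline().strip().split(' ')) - 1
print(dim)  # -> 3 for this toy row; 300 for a real fastText file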
Example #14
    def test_set_model_file_without_dict_file(self):
        """
        Check that moving a model without moving the dictfile raises an error.
        """
        # Download model, move to a new location
        with testing_utils.tempdir() as datapath:
            try:
                # remove unittest models if there before
                shutil.rmtree(os.path.join(datapath, 'models/unittest'))
            except FileNotFoundError:
                pass

            zoo_path = 'zoo:unittest/seq2seq/model'
            model_path = modelzoo_path(datapath, zoo_path)
            PathManager.rm(model_path + '.dict')
            # Test that eval model fails
            with self.assertRaises(RuntimeError):
                testing_utils.eval_model(
                    dict(task='babi:task1k:1', model_file=model_path))
            try:
                # remove unittest models if there after
                shutil.rmtree(os.path.join(datapath, 'models/unittest'))
            except FileNotFoundError:
                pass
Example #15
def set_defaults(opt):
    # Embeddings options
    opt['embedding_file'] = modelzoo_path(opt.get('datapath'),
                                          opt['embedding_file'])
    if opt.get('embedding_file'):
        if not os.path.isfile(opt['embedding_file']):
            raise IOError('No such file: %s' % opt['embedding_file'])
        with open(opt['embedding_file']) as f:
            dim = len(f.readline().strip().split(' ')) - 1
        opt['embedding_dim'] = dim
    elif not opt.get('embedding_dim'):
        raise RuntimeError(('Either embedding_file or embedding_dim '
                            'needs to be specified.'))

    # Make sure tune_partial and fix_embeddings are consistent
    if opt['tune_partial'] > 0 and opt['fix_embeddings']:
        print('Setting fix_embeddings to False as tune_partial > 0.')
        opt['fix_embeddings'] = False

    # Make sure fix_embeddings and embedding_file are consistent
    if opt['fix_embeddings']:
        if not opt.get('embedding_file') and not opt.get('init_model'):
            print('Setting fix_embeddings to False as embeddings are random.')
            opt['fix_embeddings'] = False
Example #16
    def __init__(self, opt, shared=None):
        """Set up model."""
        super().__init__(opt, shared)
        opt = self.opt  # there is a deepcopy in the init
        #self.opt = opt
        # all instances may need some params
        opt['label_smoothing'] = False
        opt['src_tgt_weight_share'] = False
        opt['tgt_prj_weight_share'] = False
        self.truncate = opt['truncate'] if opt['truncate'] > 0 else None
        self.metrics = {
            'loss': 0.0,
            'num_tokens': 0,
            'correct_tokens': 0,
            'total_skipped_batches': 0
        }
        self.history = {}
        self.report_freq = opt.get('report_freq', 0.001)
        self.use_person_tokens = opt.get('person_tokens', False)
        self.batch_idx = shared and shared.get('batchindex') or 0
        self.rank = opt['rank_candidates']
        self.beam_size = opt.get('beam_size', 1)
        self.topk = opt.get('topk', 1)
        states = {}

        # check for cuda
        self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available()
        if opt.get('numthreads', 1) > 1:
            torch.set_num_threads(1)

        if shared:
            # set up shared properties
            self.opt = shared['opt']
            opt = self.opt
            self.dict = shared['dict']
            self.START_IDX = shared['START_IDX']
            self.END_IDX = shared['END_IDX']
            self.NULL_IDX = shared['NULL_IDX']
            # answers contains a batch_size list of the last answer produced
            self.answers = shared['answers']
            self.model = shared['model']
            self.metrics = shared['metrics']
            states = shared.get('states', {})

        else:
            # this is not a shared instance of this class, so do full init
            # answers contains a batch_size list of the last answer produced
            self.answers = [None] * opt['batchsize']

            if self.use_cuda:
                print('[ Using CUDA ]')
                torch.cuda.set_device(opt['gpu'])

            init_model = None
            # check first for 'init_model' for loading model from file
            if opt.get('init_model') and os.path.isfile(opt['init_model']):
                init_model = opt['init_model']
            # next check for 'model_file', this would override init_model
            if opt.get('model_file') and os.path.isfile(opt['model_file']):
                init_model = opt['model_file']

            if init_model is not None:
                # load model parameters if available
                print('[ Loading existing model params from {} ]'.format(
                    init_model))
                states = self.load(init_model)

                if os.path.isfile(init_model +
                                  '.dict') or opt['dict_file'] is None:
                    opt['dict_file'] = init_model + '.dict'

            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)
            self.id = 'Transformer'
            # we use START markers to start our output
            self.START_IDX = self.dict[self.dict.start_token]
            # we use END markers to end our output
            self.END_IDX = self.dict[self.dict.end_token]
            # get index of null token from dictionary (probably 0)
            self.NULL_IDX = self.dict[self.dict.null_token]

            if not hasattr(self, 'model_class'):
                # this allows child classes to override this but inherit init
                self.model_class = Transformer
            # self.model = self.model_class(
            #     opt, len(self.dict), padding_idx=self.NULL_IDX,
            #     start_idx=self.START_IDX, end_idx=self.END_IDX,
            #     longest_label=states.get('longest_label', 1))
            self.model = self.model_class(len(self.dict), opt)

            if opt.get('dict_tokenizer'
                       ) == 'bpe' and opt['embedding_type'] != 'random':
                print('skipping preinitialization of embeddings for bpe')
            elif not states and opt['embedding_type'] != 'random':
                # set up preinitialized embeddings
                try:
                    import torchtext.vocab as vocab
                except ImportError as ex:
                    print(
                        'Please install torchtext with `pip install torchtext`'
                    )
                    raise ex
                pretrained_dim = 300
                if opt['embedding_type'].startswith('glove'):
                    if 'twitter' in opt['embedding_type']:
                        init = 'glove-twitter'
                        name = 'twitter.27B'
                        pretrained_dim = 200
                    else:
                        init = 'glove'
                        name = '840B'
                    embs = vocab.GloVe(name=name,
                                       dim=pretrained_dim,
                                       cache=modelzoo_path(
                                           self.opt.get('datapath'),
                                           'models:glove_vectors'))
                elif opt['embedding_type'].startswith('fasttext'):
                    init = 'fasttext'
                    embs = vocab.FastText(language='en',
                                          cache=modelzoo_path(
                                              self.opt.get('datapath'),
                                              'models:fasttext_vectors'))
                else:
                    raise RuntimeError('embedding type not implemented')

                if opt['embeddingsize'] != pretrained_dim:
                    rp = torch.Tensor(pretrained_dim,
                                      opt['embeddingsize']).normal_()
                    t = lambda x: torch.mm(x.unsqueeze(0), rp)
                else:
                    t = lambda x: x
                cnt = 0
                for w, i in self.dict.tok2ind.items():
                    if w in embs.stoi:
                        vec = t(embs.vectors[embs.stoi[w]])
                        self.model.decoder.tgt_word_emb.weight.data[i] = vec
                        cnt += 1
                        if opt['lookuptable'] in ['unique', 'dec_out']:
                            # also set encoder lt, since it's not shared
                            self.model.encoder.src_word_emb.weight.data[
                                i] = vec
                print(
                    'Transformer: initialized embeddings for {} tokens from {}.'
                    ''.format(cnt, init))

            if states:
                # set loaded states if applicable
                self.model.load_state_dict(states['model'])

            if self.use_cuda:
                self.model.cuda()

        # set up criteria
        if opt.get('numsoftmax', 1) > 1:
            self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX,
                                        size_average=False)
        else:
            self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX,
                                                 size_average=False)

        if self.use_cuda:
            self.criterion.cuda()

        if 'train' in opt.get('datatype', ''):
            # we only set up optimizers when training
            # we only set this up for the original instance or hogwild ones
            self.clip = opt.get('gradient_clip', -1)

            # set up optimizer
            lr = opt['learningrate']
            optim_class = TransformerAgent.OPTIM_OPTS[opt['optimizer']]
            kwargs = {'lr': lr}
            if opt.get('momentum') > 0 and opt['optimizer'] in [
                    'sgd', 'rmsprop'
            ]:
                kwargs['momentum'] = opt['momentum']
                if opt['optimizer'] == 'sgd':
                    kwargs['nesterov'] = True
            if opt['optimizer'] == 'adam':
                # https://openreview.net/forum?id=ryQu7f-RZ
                kwargs['amsgrad'] = True

            if opt['embedding_type'].endswith('fixed'):
                print('Transformer: fixing embedding weights.')
                self.model.decoder.tgt_word_emb.weight.requires_grad = False
                self.model.encoder.src_word_emb.weight.requires_grad = False
                if opt['lookuptable'] in ['dec_out', 'all']:
                    # self.model.decoder.e2s.weight.requires_grad = False
                    self.model.tgt_word_prj.weight.requires_grad = False
            self.optimizer = optim_class(
                [p for p in self.model.parameters() if p.requires_grad],
                **kwargs)
            if states.get('optimizer'):
                if states['optimizer_type'] != opt['optimizer']:
                    print('WARNING: not loading optim state since optim class '
                          'changed.')
                else:
                    try:
                        self.optimizer.load_state_dict(states['optimizer'])
                    except ValueError:
                        print('WARNING: not loading optim state since model '
                              'params changed.')
                    if self.use_cuda:
                        for state in self.optimizer.state.values():
                            for k, v in state.items():
                                if isinstance(v, torch.Tensor):
                                    state[k] = v.cuda()
            self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self.optimizer, 'min', factor=0.5, patience=3, verbose=True)

        self.reset()
Example #17
def download(datapath):
    return vocab.Vectors(
        name='wiki.en.vec',
        url=URL,
        cache=modelzoo_path(datapath, 'models:fasttext_vectors'),
    )
Example #18
    def parse_args(self, args=None, namespace=None, print_args=True):
        """Parses the provided arguments and returns a dictionary of the
        ``args``. We specifically remove items with ``None`` as values in order
        to support the style ``opt.get(key, default)``, which would otherwise
        return ``None``.
        """
        self.add_extra_args(args)
        self.args = super().parse_args(args=args)
        self.opt = vars(self.args)

        # custom post-parsing
        self.opt['parlai_home'] = self.parlai_home
        if 'batchsize' in self.opt and self.opt['batchsize'] <= 1:
            # hide batch options
            self.opt.pop('batch_sort', None)
            self.opt.pop('context_length', None)

        # set environment variables
        if self.opt.get('download_path'):
            os.environ['PARLAI_DOWNPATH'] = self.opt['download_path']
        if self.opt.get('datapath'):
            os.environ['PARLAI_DATAPATH'] = self.opt['datapath']

        # set all arguments specified in commandline as overridable
        option_strings_dict = {}
        store_true = []
        store_false = []
        for group in self._action_groups:
            for a in group._group_actions:
                if hasattr(a, 'option_strings'):
                    for option in a.option_strings:
                        option_strings_dict[option] = a.dest
                        if '_StoreTrueAction' in str(type(a)):
                            store_true.append(option)
                        elif '_StoreFalseAction' in str(type(a)):
                            store_false.append(option)

        for i in range(len(self.cli_args)):
            if self.cli_args[i] in option_strings_dict:
                if self.cli_args[i] in store_true:
                    self.overridable[option_strings_dict[self.cli_args[i]]] = \
                        True
                elif self.cli_args[i] in store_false:
                    self.overridable[option_strings_dict[self.cli_args[i]]] = \
                        False
                elif i < len(
                        self.cli_args) - 1 and self.cli_args[i + 1][:1] != '-':
                    key = option_strings_dict[self.cli_args[i]]
                    self.overridable[key] = self.opt[key]
        self.opt['override'] = self.overridable

        # map filenames that start with 'models:' to point to the model zoo dir
        if self.opt.get('model_file') is not None:
            self.opt['model_file'] = modelzoo_path(self.opt.get('datapath'),
                                                   self.opt['model_file'])
        if self.opt['override'].get('model_file') is not None:
            # also check override
            self.opt['override']['model_file'] = modelzoo_path(
                self.opt.get('datapath'), self.opt['override']['model_file'])
        if self.opt.get('dict_file') is not None:
            self.opt['dict_file'] = modelzoo_path(self.opt.get('datapath'),
                                                  self.opt['dict_file'])
        if self.opt['override'].get('dict_file') is not None:
            # also check override
            self.opt['override']['dict_file'] = modelzoo_path(
                self.opt.get('datapath'), self.opt['override']['dict_file'])

        # add start time of an experiment
        self.opt['starttime'] = datetime.datetime.today().strftime(
            '%b%d_%H-%M')

        if print_args:
            self.print_args()

        return self.opt
Example #19
    def __init__(self, opt, shared=None):
        """Initialize DictionaryAgent."""
        self.opt = copy.deepcopy(opt)
        self.minfreq = opt.get('dict_minfreq', DictionaryAgent.default_minfreq)
        self.null_token = opt.get('dict_nulltoken',
                                  DictionaryAgent.default_null)
        self.end_token = opt.get('dict_endtoken', DictionaryAgent.default_end)
        self.unk_token = opt.get('dict_unktoken', DictionaryAgent.default_unk)
        self.start_token = opt.get('dict_starttoken',
                                   DictionaryAgent.default_start)
        self.max_ngram_size = opt.get('dict_max_ngram_size',
                                      DictionaryAgent.default_maxngram)
        self.tokenizer = opt.get('dict_tokenizer', DictionaryAgent.default_tok)
        self.lower = opt.get('dict_lower', DictionaryAgent.default_lower)
        self.maxtokens = opt.get('dict_maxtokens',
                                 DictionaryAgent.default_maxtokens)
        self.textfields = opt.get(
            'dict_textfields', DictionaryAgent.default_textfields).split(",")

        try:
            self.tokenizer_fun = getattr(self, self.tokenizer + '_tokenize')
        except AttributeError:
            raise AttributeError('tokenizer type {} not yet supported'.format(
                self.tokenizer))

        if shared:
            self.freq = shared.get('freq', {})
            self.tok2ind = shared.get('tok2ind', {})
            self.ind2tok = shared.get('ind2tok', {})
        else:
            self.freq = defaultdict(int)
            self.tok2ind = {}
            self.ind2tok = {}

            if self.null_token:
                self.add_token(self.null_token)

            if self.start_token:
                # set special start of sentence word token
                self.add_token(self.start_token)

            if self.end_token:
                # set special end of sentence word token
                self.add_token(self.end_token)

            if self.unk_token:
                # set special unknown word token
                self.add_token(self.unk_token)

            loaded = False
            # If data built via pytorch data teacher, we need to load prebuilt dict
            if opt.get('pytorch_teacher_task'):
                from parlai.scripts.build_pytorch_data import get_pyt_dict_file

                opt['dict_file'] = get_pyt_dict_file(opt)
            if opt.get('dict_file'):
                opt['dict_file'] = modelzoo_path(opt.get('datapath'),
                                                 opt['dict_file'])
                if os.path.isfile(opt['dict_file']):
                    # load pre-existing dictionary
                    self.load(opt['dict_file'])
                    loaded = True

            if not loaded and opt.get('dict_initpath'):
                # load seed dictionary
                opt['dict_initpath'] = modelzoo_path(opt.get('datapath'),
                                                     opt['dict_initpath'])
                # don't check isfile first, should fail if file not found
                self.load(opt['dict_initpath'])

        # initialize tokenizers
        if self.tokenizer == 'nltk':
            try:
                import nltk
            except ImportError:
                raise ImportError('Please install nltk (pip install nltk)')
            # nltk-specific setup
            st_path = 'tokenizers/punkt/{0}.pickle'.format(
                opt['dict_language'])
            try:
                self.sent_tok = nltk.data.load(st_path)
            except LookupError:
                nltk.download('punkt')
                self.sent_tok = nltk.data.load(st_path)
            self.word_tok = nltk.tokenize.treebank.TreebankWordTokenizer()
        elif self.tokenizer == 'spacy':
            try:
                import spacy
            except ImportError:
                raise ImportError('Please install spacy and spacy "en" model: '
                                  '`pip install -U spacy && '
                                  'python -m spacy download en` '
                                  'or find alternative installation options '
                                  'at spacy.io')
            self.NLP = spacy.load('en')
        elif self.tokenizer == 'bpe':
            if not opt.get('dict_file'):
                raise RuntimeError('--dict-file is mandatory.')
            self.bpehelper = _BPEHelper(opt.get('dict_file') + '.codecs')
        elif self.tokenizer == 'gpt2':
            if self.lower:
                raise ValueError(
                    'Only use --dict-lower false with --dict-tokenizer gpt2')
            if self.maxtokens > 0 or self.minfreq > 0:
                raise ValueError(
                    'You should not filter vocabulary when using --dict-tokenizer gpt2'
                    ' (no --dict-minfreq or --dict-maxtokens).')

            self.gpt2_bpe = Gpt2BpeHelper(opt)
            for each_token in self.gpt2_bpe.list_tokens():
                self.add_token(each_token)
                self.freq[each_token] = 1
        if not shared:
            if self.null_token:
                # fix count for null token to one billion and three
                self.freq[self.null_token] = 1000000003

            if self.start_token:
                # fix count for start of sentence token to one billion and two
                self.freq[self.start_token] = 1000000002

            if self.end_token:
                # fix count for end of sentence token to one billion and one
                self.freq[self.end_token] = 1000000001

            if self.unk_token:
                # fix count for unknown token to one billion
                self.freq[self.unk_token] = 1000000000

            if opt.get('dict_file'):
                self.save_path = opt['dict_file']
Example #20
    def _process_args_to_opts(self, args_that_override: Optional[List[str]] = None):
        self.opt = Opt(vars(self.args))
        extra_ag = []

        if '_subparser' in self.opt:
            # if using the super command, we need to be aware of the subcommand's
            # arguments when identifying things manually set by the user
            self.overridable.update(self.opt['_subparser'].overridable)
            extra_ag = self.opt.pop('_subparser')._action_groups

        # custom post-parsing
        self.opt['parlai_home'] = self.parlai_home
        self.opt = self._infer_datapath(self.opt)

        # set all arguments specified in command line as overridable
        option_strings_dict = {}
        store_true = []
        store_false = []
        for group in self._action_groups + extra_ag:
            for a in group._group_actions:
                if hasattr(a, 'option_strings'):
                    for option in a.option_strings:
                        option_strings_dict[option] = a.dest
                        if isinstance(a, argparse._StoreTrueAction):
                            store_true.append(option)
                        elif isinstance(a, argparse._StoreFalseAction):
                            store_false.append(option)

        if args_that_override is None:
            args_that_override = _sys.argv[1:]

        args_that_override = fix_underscores(args_that_override)

        for i in range(len(args_that_override)):
            if args_that_override[i] in option_strings_dict:
                if args_that_override[i] in store_true:
                    self.overridable[option_strings_dict[args_that_override[i]]] = True
                elif args_that_override[i] in store_false:
                    self.overridable[option_strings_dict[args_that_override[i]]] = False
                elif (
                    i < len(args_that_override) - 1
                    and args_that_override[i + 1] not in option_strings_dict
                ):
                    key = option_strings_dict[args_that_override[i]]
                    self.overridable[key] = self.opt[key]
        self.opt['override'] = self.overridable

        # load opts if a file is provided.
        if self.opt.get('init_opt', None) is not None:
            self._load_opts(self.opt)

        # map filenames that start with 'zoo:' to point to the model zoo dir
        options_to_change = {'model_file', 'dict_file', 'bpe_vocab', 'bpe_merge'}
        for each_key in options_to_change:
            if self.opt.get(each_key) is not None:
                self.opt[each_key] = modelzoo_path(
                    self.opt.get('datapath'), self.opt[each_key]
                )
            if self.opt['override'].get(each_key) is not None:
                # also check override
                self.opt['override'][each_key] = modelzoo_path(
                    self.opt.get('datapath'), self.opt['override'][each_key]
                )

        # add start time of an experiment
        self.opt['starttime'] = datetime.datetime.today().strftime('%b%d_%H-%M')
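Across these examples, modelzoo_path leaves ordinary filesystem paths untouched and maps zoo-prefixed names into the data directory, downloading the model on first use. Judging from the unittest examples above, the resolution behaves roughly like the sketch below (an approximation, not the actual implementation, which also handles the older 'models:' prefix and triggers downloads):

import os

def sketch_modelzoo_path(datapath, path):
    # sketch only: 'zoo:unittest/seq2seq/model'
    # -> <datapath>/models/unittest/seq2seq/model
    if path is None or not path.startswith('zoo:'):
        return path
    return os.path.join(datapath, 'models', path[len('zoo:'):])

print(sketch_modelzoo_path('/data', 'zoo:unittest/seq2seq/model'))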
Example #21
    def build_regret_model(self) -> RagModel:
        """
        Build and return regret RagModel.
        """
        model_file = modelzoo_path(self.opt['datapath'],
                                   self.opt['regret_model_file'])
        if model_file:
            assert os.path.exists(
                model_file
            ), f'specify correct path for --regret-model-file (currently {model_file})'
            regret_opt = Opt.load(f'{model_file}.opt')
            regret_opt['n_docs'] = self.opt[
                'n_docs']  # Urgent that this is the same
            # add keys that were not in this model when originally trained
            regret_opt.update(
                {k: v
                 for k, v in self.opt.items() if k not in regret_opt})
            retriever_shared = None
            if all([
                    regret_opt[k] == self.opt[k] for k in [
                        'rag_retriever_type',
                        'path_to_index',
                        'path_to_dpr_passages',
                    ]
            ]):
                logging.warning(
                    'Sharing retrievers between model and regret model!')
                retriever_shared = self.model.retriever.share()
            elif self.opt['regret_override_index']:
                # Sharing Index Path & Passages only; not the full retriever
                logging.warning('Overriding initial ReGReT model index')
                regret_opt['path_to_index'] = self.opt['path_to_index']
                regret_opt['path_to_dpr_passages'] = self.opt[
                    'path_to_dpr_passages']

            if self.opt['regret_dict_file']:
                regret_opt['dict_file'] = self.opt['regret_dict_file']

            regret_dict = self.dictionary_class()(regret_opt)
            model = RagModel(regret_opt,
                             regret_dict,
                             retriever_shared=retriever_shared)
            with PathManager.open(model_file, 'rb') as f:
                states = torch.load(
                    f,
                    map_location=lambda cpu, _: cpu,
                    pickle_module=parlai.utils.pickle,
                )
            assert 'model' in states
            model.load_state_dict(states['model'])
            if self.model_parallel:
                ph = PipelineHelper()
                ph.check_compatibility(self.opt)
                model = ph.make_parallel(model)
            elif self.use_cuda:
                model.cuda()
            if self.fp16:
                model = model.half()

            sync_parameters(model)
            train_params = trainable_parameters(model)
            total_params = total_parameters(model)
            logging.info(
                f"Total regret parameters: {total_params:,d} ({train_params:,d} trainable)"
            )
        else:
            model = self.model

        return model
Example #22
def _fairseq_opt_wrapper(opt, skip_pretrained_embedding_loading=False):
    """
    Marshals from a dict to an argparse.Namespace object for API compatibility.

    Also does some necessary post-processing needed for fairseq. Optionally can
    override pretrained embedding options, which is useful if we're just loading
    a model from a checkpoint.

    :param opt: dict. ParlAI options passed around from everywhere.
    :param skip_pretrained_embedding_loading: bool. Don't preload word embeddings.
    :return: an argparse.Namespace object for use in fairseq-py.
    """
    args = argparse.Namespace()

    # first set args according to ParlAI options
    for key in opt:
        if opt[key] is not None:
            setattr(args, key, opt[key])

    # at this point the user *must* have specified an arch
    if not hasattr(args, "arch"):
        raise ValueError("--arch/-a must be specified")
    # fill in default options from the model
    models.ARCH_CONFIG_REGISTRY[args.arch](args)

    # post processing of args. See
    # https://github.com/pytorch/fairseq/blob/v0.5.0/fairseq/options.py#L95
    if hasattr(args, "lr"):
        args.lr = options.eval_str_list(args.lr, type=float)
    if hasattr(args, "update_freq"):
        args.update_freq = options.eval_str_list(args.update_freq, int)
    if hasattr(args, "max_sentences_valid"):
        args.max_sentences_valid = args.max_sentences
    if getattr(args, "truncate") == -1:
        # some torch agents use positional embeddings, which must have a max length
        setattr(args, "truncate", 1024)
    if not hasattr(args, "max_source_positions"):
        # fairseq uses a different name for this CLI parameter
        # Sometimes it's set in model defaults, but not for all models
        setattr(args, "max_source_positions", getattr(args, "truncate"))
        # if we don't have source lengths, we don't have target lengths
        setattr(args, "max_target_positions", getattr(args, "truncate"))

    # handle modelzoo if possible
    for k in ("encoder_embed_path", "decoder_embed_path"):
        if getattr(args, k, None) is None:
            # not an argument for this model, pretrained embeddings don't matter
            continue
        elif skip_pretrained_embedding_loading:
            # if we want to skip pretrained, then hide the option from fairseq
            setattr(args, k, None)
        else:
            # otherwise we may need to modelzoo adjust the path for fairseq
            setattr(args, k,
                    modelzoo_path(opt.get("datapath"), getattr(args, k)))

    # Here we hardcode a few options that we currently do not support
    # turn off distributed training
    args.distributed_world_size = 1
    args.distributed_rank = 0

    return args, vars(args)
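The marshalling step at the top of this function is plain setattr onto an empty Namespace, skipping None values so that fairseq's own defaults can fill them in later. In miniature:

import argparse

opt = {'arch': 'transformer', 'lr': '0.25', 'unused': None}
args = argparse.Namespace()
for key in opt:
    if opt[key] is not None:
        setattr(args, key, opt[key])
print(args)  # Namespace(arch='transformer', lr='0.25') -- 'unused' dropped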
Example #23
    def test_load_dpr(self):
        opt = ParlaiParser(True, True).parse_args([])
        # First, we'll load up a DPR model from the zoo dpr file.
        default_query_encoder = DprQueryEncoder(opt,
                                                dpr_model='bert',
                                                pretrained_path=DPR_ZOO_MODEL)
        rag_sequence_query_encoder = DprQueryEncoder(
            opt,
            dpr_model='bert_from_parlai_rag',
            pretrained_path=RAG_SEQUENCE_ZOO_MODEL,
        )
        assert not torch.allclose(
            default_query_encoder.embeddings.weight.float().cpu(),
            rag_sequence_query_encoder.embeddings.weight.float().cpu(),
        )
        # 1. Create a zoo RAG Agent, which involves a trained DPR model
        rag = create_agent(
            Opt({
                'model_file':
                modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {
                    'retriever_debug_index': 'compressed',
                    'fp16': False
                },
            }))
        # The default rag token model should have different query encoders
        # from both the RAG_SEQUENCE_ZOO_MODEL, and the default DPR_ZOO_MODEL
        assert not torch.allclose(
            rag_sequence_query_encoder.embeddings.weight.float().cpu(),
            rag.model.retriever.query_encoder.embeddings.weight.float().cpu(),
        )
        assert not torch.allclose(
            default_query_encoder.embeddings.weight.float().cpu(),
            rag.model.retriever.query_encoder.embeddings.weight.float().cpu(),
        )

        # 2. create a RAG Agent with the rag_sequence_zoo_model DPR model
        rag = create_agent(
            Opt({
                'model_file':
                modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {
                    'retriever_debug_index':
                    'compressed',
                    'dpr_model_file':
                    modelzoo_path(opt['datapath'], RAG_SEQUENCE_ZOO_MODEL),
                    'query_model':
                    'bert_from_parlai_rag',
                    'fp16':
                    False,
                },
            }))
        # If we override the DPR Model file, we should now have the same
        # weights as the query encoder from above.
        assert torch.allclose(
            rag_sequence_query_encoder.embeddings.weight.float().cpu(),
            rag.model.retriever.query_encoder.embeddings.weight.float().cpu(),
        )

        # 3. Create a RAG Agent with the default DPR zoo model
        rag = create_agent(
            Opt({
                'model_file':
                modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {
                    'retriever_debug_index': 'compressed',
                    'dpr_model_file': modelzoo_path(opt['datapath'],
                                                    DPR_ZOO_MODEL),
                    'fp16': False,
                },
            }))

        # This model was trained with the DPR_ZOO_MODEL, and yet now should have the same weights
        # as we explicitly specified it.
        assert torch.allclose(
            default_query_encoder.embeddings.weight.float().cpu(),
            rag.model.retriever.query_encoder.embeddings.weight.float().cpu(),
        )
Example #24
    def _process_args_to_opts(self,
                              args_that_override: Optional[List[str]] = None):
        self.opt = Opt(vars(self.args))

        # custom post-parsing
        self.opt['parlai_home'] = self.parlai_home
        self.opt = self._infer_datapath(self.opt)

        # set all arguments specified in command line as overridable
        option_strings_dict = {}
        store_true = []
        store_false = []
        for group in self._action_groups:
            for a in group._group_actions:
                if hasattr(a, 'option_strings'):
                    for option in a.option_strings:
                        option_strings_dict[option] = a.dest
                        if '_StoreTrueAction' in str(type(a)):
                            store_true.append(option)
                        elif '_StoreFalseAction' in str(type(a)):
                            store_false.append(option)

        if args_that_override is None:
            args_that_override = _sys.argv[1:]

        for i in range(len(args_that_override)):
            if args_that_override[i] in option_strings_dict:
                if args_that_override[i] in store_true:
                    self.overridable[option_strings_dict[
                        args_that_override[i]]] = True
                elif args_that_override[i] in store_false:
                    self.overridable[option_strings_dict[
                        args_that_override[i]]] = False
                elif (i < len(args_that_override) - 1
                      and args_that_override[i + 1][:1] != '-'):
                    key = option_strings_dict[args_that_override[i]]
                    self.overridable[key] = self.opt[key]
        self.opt['override'] = self.overridable

        # load opts if a file is provided.
        if self.opt.get('init_opt', None) is not None:
            self._load_opts(self.opt)

        # map filenames that start with 'zoo:' to point to the model zoo dir
        if self.opt.get('model_file') is not None:
            self.opt['model_file'] = modelzoo_path(self.opt.get('datapath'),
                                                   self.opt['model_file'])
        if self.opt['override'].get('model_file') is not None:
            # also check override
            self.opt['override']['model_file'] = modelzoo_path(
                self.opt.get('datapath'), self.opt['override']['model_file'])
        if self.opt.get('dict_file') is not None:
            self.opt['dict_file'] = modelzoo_path(self.opt.get('datapath'),
                                                  self.opt['dict_file'])
        if self.opt['override'].get('dict_file') is not None:
            # also check override
            self.opt['override']['dict_file'] = modelzoo_path(
                self.opt.get('datapath'), self.opt['override']['dict_file'])

        # add start time of an experiment
        self.opt['starttime'] = datetime.datetime.today().strftime(
            '%b%d_%H-%M')
Example #25
def learn_arora(opt):
    """
    Go through the ConvAI2 data and collect word counts, from which we compute
    the unigram probability distribution. Use those probabilities to compute
    weighted sentence embeddings for all utterances, and from those compute the
    first principal component.

    Save all info to arora.pkl file.
    """
    arora_file = os.path.join(opt['datapath'], 'controllable_dialogue',
                              'arora.pkl')

    opt['task'] = 'fromfile:parlaiformat'
    opt['log_every_n_secs'] = 2

    print('Getting word counts from ConvAI2 train set...')
    opt['datatype'] = 'train:ordered'
    opt['fromfile_datapath'] = os.path.join(opt['datapath'],
                                            'controllable_dialogue',
                                            'ConvAI2_parlaiformat',
                                            'train.txt')
    # Don't include inputs because the ConvAI2 train set reverses every convo:
    word_counter_train, total_count_train, all_utts_train = get_word_counts(
        opt, count_inputs=False)

    print('Getting word counts from ConvAI2 val set...')
    opt['datatype'] = 'valid'
    opt['fromfile_datapath'] = os.path.join(opt['datapath'],
                                            'controllable_dialogue',
                                            'ConvAI2_parlaiformat',
                                            'valid.txt')
    # Do include inputs because the ConvAI2 val set doesn't reverse convos:
    word_counter_valid, total_count_valid, all_utts_valid = get_word_counts(
        opt, count_inputs=True)

    # Merge word counts
    word_counter = word_counter_train
    for word, count in word_counter_valid.items():
        word_counter[word] += count
    total_count = total_count_train + total_count_valid

    # Merge all_utts
    all_utts = all_utts_train + all_utts_valid

    # Compute unigram prob for every word
    print("Computing unigram probs for all words...")
    word2prob = {w: c / total_count for w, c in word_counter.items()}

    # Settings for sentence embedder
    arora_a = 0.0001
    glove_name = '840B'
    glove_dim = 300
    glove_cache = modelzoo_path(opt['datapath'], 'models:glove_vectors')

    # Embed every sentence, without removing first singular value
    print('Embedding all sentences...')
    sent_embedder = SentenceEmbedder(
        word2prob,
        arora_a,
        glove_name,
        glove_dim,
        first_sv=None,
        glove_cache=glove_cache,
    )
    utt_embs = []
    log_timer = TimeLogger()
    for n, utt in enumerate(all_utts):
        utt_emb = sent_embedder.embed_sent(utt.split(), rem_first_sv=False)
        utt_embs.append(utt_emb)
        if log_timer.time() > opt['log_every_n_secs']:
            text, _log = log_timer.log(n, len(all_utts))
            print(text)

    # Use SVD to calculate singular vector
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.svd.html
    print('Calculating SVD...')
    utt_embs = np.stack(utt_embs, axis=0)  # shape (num_embs, glove_dim)
    U, s, V = np.linalg.svd(utt_embs, full_matrices=False)
    first_sv = V[0, :]  # first row of V. shape (glove_dim)
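    # rows of V are unit-norm, so first_sv needs no extra normalization
    # before its projection is removed below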

    # Remove singular vector from all embs to get complete Arora-style sent embs
    print('Removing singular vec from all sentence embeddings...')
    utt_embs_adj = [
        remove_first_sv(torch.Tensor(emb), torch.Tensor(first_sv)).numpy()
        for emb in utt_embs
    ]  # list of np arrays shape (glove_dim)

    # Make dict mapping ConvAI2 dataset utterances to Arora sent emb
    # We save this to file for convenience (e.g. if you want to inspect)
    utt2emb = {utt: emb for (utt, emb) in zip(all_utts, utt_embs_adj)}

    # Save unigram distribution, first singular value, hyperparameter value for a,
    # info about GloVe vectors used, and full dict of utt->emb to file
    print("Saving Arora embedding info to %s..." % arora_file)
    with open(arora_file, "wb") as f:
        pickle.dump(
            {
                'word2prob': word2prob,  # dict: string to float between 0 and 1
                'first_sv': first_sv,  # np array shape (glove_dim)
                'arora_a': arora_a,  # float, 0.0001
                'glove_name': glove_name,  # string, '840B'
                'glove_dim': glove_dim,  # int, 300
                'utt2emb': utt2emb,  # dict: string to np array shape (glove_dim)
            },
            f,
        )
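
The weighting above is the SIF ('smooth inverse frequency') scheme of Arora et al.: scale each word vector by a / (a + p(w)), average the results, then subtract the projection onto the corpus-wide first singular vector. A minimal NumPy sketch of the two steps, assuming word2vec is a hypothetical dict of GloVe vectors (SentenceEmbedder and remove_first_sv are the helpers the example actually uses):

import numpy as np

def sif_embed(tokens, word2vec, word2prob, a=0.0001):
    # Rare words (small p(w)) receive larger weights.
    # (a sketch: empty or all-OOV sentences are not handled)
    vecs = [a / (a + word2prob.get(w, 0.0)) * word2vec[w]
            for w in tokens if w in word2vec]
    return np.mean(vecs, axis=0)

def remove_projection(emb, first_sv):
    # Subtract the component of emb along the (unit-norm) first singular vector.
    return emb - np.dot(emb, first_sv) * first_sv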
Example #26
    def test_chat_world(self):
        """
        Test functionality of the chat world.
        """

        with testing_utils.tempdir() as tmpdir:
            save_folder = tmpdir

            # Params
            model_name = 'blender_90M'
            zoo_model_file = 'zoo:blender/blender_90M/model'
            model = 'TransformerGenerator'
            num_turn_pairs = 6
            config_folder = os.path.join(
                os.path.dirname(os.path.realpath(run.__file__)), 'config'
            )
            datapath = os.path.join(tmpdir, 'data')

            # Download zoo model file
            model_file = modelzoo_path(datapath, zoo_model_file)

            # Define opt
            # Get the folder that encloses the innermost model folder
            base_model_folder = os.path.dirname(os.path.dirname(model_file))
            with open(os.path.join(config_folder, 'left_pane_text.html')) as f:
                left_pane_text = f.readlines()
            with open(os.path.join(config_folder, 'annotations_config.json')) as f:
                annotations_config = json.load(f)
            opt = Opt(
                {
                    'annotations_config': annotations_config,
                    'annotations_intro': ANNOTATIONS_INTRO_TEXT,
                    'base_model_folder': base_model_folder,
                    'check_acceptability': False,
                    'conversation_start_mode': 'hi',
                    'final_rating_question': 'Please rate your partner on a scale of 1-5.',
                    'include_persona': False,
                    'is_sandbox': True,
                    'left_pane_text': left_pane_text,
                    'save_folder': save_folder,
                    'task': 'turn_annotations',
                    'task_model_parallel': False,
                }
            )

            # Construct desired dialog
            human_agent_id = "Person1"
            bot_utterances = [
                "Hello, how are you today? I just got back from a long day at work, so I'm nervous.",
                "I just don't know what to do. I've never been so nervous in my life.",
                "Yes, I'll probably go to the movies. What about you? What do you like to do?",
                "That's great! What kind of restaurant do you usually go to? I love italian food.",
                "I love thai as well. What's your favorite kind of thai food? I like thai food the best.",
                'Oh, I love peanuts! I love all kinds of peanuts. Do you eat a lot of peanuts?',
                "I eat peanuts a lot, but only a few times a week. It's good for you.",
            ]
            human_utterances = [
                "What are you nervous about?",
                "Do you have any plans for the weekend?",
                "Yeah that sounds great! I like to bike and try new restaurants.",
                "Oh, Italian food is great. I also love Thai and Indian.",
                "Hmmm - anything with peanuts? Or I like when they have spicy licorice-like herbs.",
                "Ha, a decent amount, probably. What about you?",
            ]
            # Arbitrarily chosen buckets. The +1 is for the final model
            # response at the end.
            bucket_assignments = [
                {
                    'bucket_0': False,
                    'bucket_1': False,
                    'bucket_2': True,
                    'bucket_3': False,
                    'bucket_4': True,
                }
            ] * (num_turn_pairs + 1)
            final_rating = 3
            fake_first_human_turn = {
                "left_pane_text": left_pane_text,
                "episode_done": False,
                "id": "Person1",
                "text": "Hi!",
                "fake_start": True,
                "agent_idx": 0,
                "config": {
                    "min_num_turns": num_turn_pairs,
                    "annotations_config": annotations_config,
                },
            }
            final_bot_turn = {
                "agent_idx": 1,
                "text": bot_utterances[num_turn_pairs],
                "id": model,
                "problem_data": {
                    "turn_idx": num_turn_pairs * 2 + 1,
                    **bucket_assignments[num_turn_pairs],
                    "final_rating": str(final_rating),
                },
            }
            dialog = [fake_first_human_turn]
            for turn_pair_idx in range(num_turn_pairs):
                bot_turn = {
                    "agent_idx": 1,
                    "text": bot_utterances[turn_pair_idx],
                    "id": model,
                    "problem_data": {
                        "turn_idx": turn_pair_idx * 2 + 1,
                        **bucket_assignments[turn_pair_idx],
                    },
                }
                human_turn = {
                    "agent_idx": 0,
                    "text": human_utterances[turn_pair_idx],
                    "id": human_agent_id,
                }
                dialog += [bot_turn, human_turn]
            dialog += [final_bot_turn]

            # Construct desired output
            desired_results = {
                "personas": None,
                "context_dataset": None,
                "person1_seed_utterance": None,
                "person2_seed_utterance": None,
                "additional_context": None,
                "dialog": dialog,
                "workers": [HUMAN_LIKE_AGENT_WORKER_ID, model_name],
                "bad_workers": [],
                "acceptability_violations": [None],
                "hit_ids": [HUMAN_LIKE_AGENT_HIT_ID, "none"],
                "assignment_ids": [HUMAN_LIKE_AGENT_ASSIGNMENT_ID, "none"],
                "task_description": {
                    "annotations_config": annotations_config,
                    "model_nickname": model_name,
                    "model_file": model_file,
                },
            }

            # Set up semaphore
            max_concurrent_responses = 1
            semaphore = threading.Semaphore(max_concurrent_responses)

            # Set up human agent
            human_worker = HumanLikeChatAgent(
                agent_id=human_agent_id,
                human_utterances=human_utterances,
                bucket_assignments=bucket_assignments,
                final_rating=final_rating,
            )

            # Set up bot agent
            shared_bot_agents = TurkLikeAgent.get_bot_agents(
                opt=opt, active_models=[model_name]
            )

            # Get a bot and add it to the "human" worker
            print(f'Choosing the "{model_name}" model for the bot.')
            agent = create_agent_from_shared(shared_bot_agents[model_name])
            bot_worker = TurkLikeAgent(
                opt,
                model_name=model_name,
                model_agent=agent,
                num_turns=num_turn_pairs,
                semaphore=semaphore,
            )
            workers_including_bot = [human_worker, bot_worker]

            # Define world
            conv_idx = 0
            world = TurnAnnotationsChatWorld(
                opt=opt,
                agents=workers_including_bot,
                num_turns=num_turn_pairs,
                max_resp_time=180,
                tag='conversation t_{}'.format(conv_idx),
                context_info=None,
            )

            # Run conversation
            while not world.episode_done():
                print('About to parley')
                world.parley()

            # Check the output data
            model_nickname, worker_is_unacceptable, convo_finished = world.save_data()
            self.assertEqual(model_nickname, model_name)
            self.assertFalse(worker_is_unacceptable)
            self.assertTrue(convo_finished)

            # Check the final results file saved by the world
            results_path = list(glob.glob(os.path.join(tmpdir, '*_*_sandbox.json')))[0]
            with open(results_path) as f:
                actual_results = json.load(f)
            for k, v in desired_results.items():
                if k == 'task_description':
                    for k2, v2 in desired_results[k].items():
                        self.assertEqual(actual_results[k].get(k2), v2)
                else:
                    self.assertEqual(actual_results.get(k), v)
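
Worth noting in this test is the agent-sharing pattern: TurkLikeAgent.get_bot_agents loads each model once and returns its shared state, and create_agent_from_shared then builds a cheap per-conversation copy. Stripped down to ParlAI's core API, the pattern is roughly:

from parlai.core.agents import create_agent, create_agent_from_shared

base_agent = create_agent(opt)            # loads model weights once
shared = base_agent.share()               # lightweight dict of shared state
clone = create_agent_from_shared(shared)  # per-conversation copy, no reload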
Example #27
    def __init__(self, opt: Opt, shared=None):
        """
        Initialize DictionaryAgent.
        """
        self.opt = copy.deepcopy(opt)
        self.minfreq = opt.get('dict_minfreq', DictionaryAgent.default_minfreq)
        self.null_token = opt.get('dict_nulltoken',
                                  DictionaryAgent.default_null)
        self.end_token = opt.get('dict_endtoken', DictionaryAgent.default_end)
        self.unk_token = opt.get('dict_unktoken', DictionaryAgent.default_unk)
        self.start_token = opt.get('dict_starttoken',
                                   DictionaryAgent.default_start)
        self.max_ngram_size = opt.get('dict_max_ngram_size',
                                      DictionaryAgent.default_maxngram)
        self.tokenizer = opt.get('dict_tokenizer', DictionaryAgent.default_tok)
        self.lower = opt.get('dict_lower', DictionaryAgent.default_lower)
        self.maxtokens = opt.get('dict_maxtokens',
                                 DictionaryAgent.default_maxtokens)
        self.textfields = opt.get(
            'dict_textfields', DictionaryAgent.default_textfields).split(",")

        # used to signal whether we should use training-time tricks, like BPE dropout
        self._tokenization_mode = TokenizationMode.TEST_TIME_LABEL

        try:
            self.tokenizer_fun = getattr(self, self.tokenizer + '_tokenize')
        except AttributeError:
            raise AttributeError('tokenizer type {} not yet supported'.format(
                self.tokenizer))

        if shared:
            self.freq = shared.get('freq', {})
            self.tok2ind = shared.get('tok2ind', {})
            self.ind2tok = shared.get('ind2tok', {})
        else:
            self.additional_special_tokens: List[str] = []
            self.freq = defaultdict(int)
            self.tok2ind = {}
            self.ind2tok = {}

            if self.null_token:
                self.add_token(self.null_token)

            if self.start_token:
                # set special start of sentence word token
                self.add_token(self.start_token)

            if self.end_token:
                # set special end of sentence word token
                self.add_token(self.end_token)

            if self.unk_token:
                # set special unknown word token
                self.add_token(self.unk_token)

            loaded = False
            # If the data was built via the pytorch data teacher, we need to
            # load the prebuilt dict
            if opt.get('dict_file'):
                opt['dict_file'] = modelzoo_path(opt.get('datapath'),
                                                 opt['dict_file'])
                if PathManager.exists(opt['dict_file']):
                    # load pre-existing dictionary
                    self.load(opt['dict_file'])
                    loaded = True

            if not loaded and opt.get('dict_initpath'):
                # load seed dictionary
                opt['dict_initpath'] = modelzoo_path(opt.get('datapath'),
                                                     opt['dict_initpath'])
                # don't check isfile first, should fail if file not found
                self.load(opt['dict_initpath'])
            opt['dict_loaded'] = loaded

        # cache unk token for later
        self._unk_token_idx = self.tok2ind.get(self.unk_token)

        # initialize tokenizers
        if self.tokenizer == 'nltk':
            try:
                import nltk
            except ImportError:
                raise ImportError('Please install nltk (pip install nltk)')
            # nltk-specific setup
            st_path = 'tokenizers/punkt/{0}.pickle'.format(
                opt['dict_language'])
            try:
                self.sent_tok = nltk.data.load(st_path)
            except LookupError:
                nltk.download('punkt')
                self.sent_tok = nltk.data.load(st_path)
            self.word_tok = nltk.tokenize.treebank.TreebankWordTokenizer()
        elif self.tokenizer in [
                'bpe', 'gpt2', 'bytelevelbpe', 'slow_bytelevel_bpe'
        ]:
            self.bpe = bpe_factory(opt, shared)
            self.bpe.sync_with_dict(self)

        if not shared:
            if self.null_token:
                # fix count for null token to one billion and three
                self.freq[self.null_token] = 1000000003

            if self.start_token:
                # fix count for start of sentence token to one billion and two
                self.freq[self.start_token] = 1000000002

            if self.end_token:
                # fix count for end of sentence token to one billion and one
                self.freq[self.end_token] = 1000000001

            if self.unk_token:
                # fix count for unknown token to one billion
                self.freq[self.unk_token] = 1000000000

            if opt.get('dict_file'):
                self.save_path = opt['dict_file']
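
The 'one billion' frequencies are not arbitrary: the dictionary may be sorted by count and truncated to dict_maxtokens, and the inflated counts guarantee that the special tokens both survive truncation and keep a stable relative order (null, then start, end, unk). A toy illustration, using ParlAI's default token strings as an assumption:

freq = {'__null__': 1000000003, '__start__': 1000000002,
        '__end__': 1000000001, '__unk__': 1000000000,
        'hello': 42, 'world': 7}
kept = sorted(freq, key=freq.get, reverse=True)[:4]
# -> ['__null__', '__start__', '__end__', '__unk__']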
Example #28
def compare_init_model_opts(opt, curr_opt):
    """Print loud warning when `init_model` opts differ from previous configuration."""
    if opt.get('init_model') is None:
        return
    opt['init_model'] = modelzoo_path(opt['datapath'], opt['init_model'])
    optfile = opt['init_model'] + '.opt'
    if not os.path.isfile(optfile):
        return
    init_model_opt = load_opt_file(optfile)

    extra_opts = {}
    different_opts = {}
    exempt_opts = [
        'model_file',
        'dict_file',
        'override',
        'starttime',
        'init_model',
        'batchindex',
    ]

    # search through init model opts
    for k, v in init_model_opt.items():
        if (
            k not in exempt_opts
            and k in init_model_opt
            and init_model_opt[k] != curr_opt.get(k)
        ):
            if isinstance(v, list):
                if init_model_opt[k] != list(curr_opt[k]):
                    different_opts[k] = ','.join([str(x) for x in v])
            else:
                different_opts[k] = v

    # search through opts to load
    for k, v in curr_opt.items():
        if k not in exempt_opts and k not in init_model_opt:
            if isinstance(v, list):
                extra_opts[k] = ','.join([str(x) for x in v])
            else:
                extra_opts[k] = v

    # print warnings
    extra_strs = ['{}: {}'.format(k, v) for k, v in extra_opts.items()]
    if extra_strs:
        print('\n' + '*' * 75)
        print(
            '[ WARNING ] : your model is being loaded with opts that do not '
            'exist in the model you are initializing the weights with: '
            '{}'.format(','.join(extra_strs))
        )

    different_strs = [
        '--{} {}'.format(k, v).replace('_', '-') for k, v in different_opts.items()
    ]
    if different_strs:
        print('\n' + '*' * 75)
        print(
            '[ WARNING ] : your model is being loaded with opts that differ '
            'from the model you are initializing the weights with. Add the '
            'following args to your run command to change this: \n'
            '\n{}'.format(' '.join(different_strs))
        )
        print('*' * 75)
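
As a hypothetical illustration of the second warning: if the .opt file saved next to the init model pinned embedding_size to 300 while the current run uses 512, the diff is reported as a ready-to-paste flag:

init_model_opt = {'embedding_size': 300, 'n_layers': 2}
curr_opt = {'embedding_size': 512, 'n_layers': 2}
# different_opts -> {'embedding_size': 300}
# printed suggestion: --embedding-size 300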
Example #29
    def __init__(self, opt, shared=None):
        # initialize fields
        self.opt = copy.deepcopy(opt)
        self.minfreq = opt.get('dict_minfreq', DictionaryAgent.default_minfreq)
        self.null_token = opt.get('dict_nulltoken',
                                  DictionaryAgent.default_null)
        self.end_token = opt.get('dict_endtoken', DictionaryAgent.default_end)
        self.unk_token = opt.get('dict_unktoken', DictionaryAgent.default_unk)
        self.start_token = opt.get('dict_starttoken',
                                   DictionaryAgent.default_start)
        self.max_ngram_size = opt.get('dict_max_ngram_size',
                                      DictionaryAgent.default_maxngram)
        self.tokenizer = opt.get('dict_tokenizer', DictionaryAgent.default_tok)
        self.lower = opt.get('dict_lower', DictionaryAgent.default_lower)
        self.maxtokens = opt.get('dict_maxtokens',
                                 DictionaryAgent.default_maxtokens)
        self.textfields = opt.get(
            'dict_textfields', DictionaryAgent.default_textfields).split(",")

        try:
            self.tokenizer_fun = getattr(self, self.tokenizer + '_tokenize')
        except AttributeError:
            raise AttributeError('tokenizer type {} not yet supported'.format(
                self.tokenizer))

        if shared:
            self.freq = shared.get('freq', {})
            self.tok2ind = shared.get('tok2ind', {})
            self.ind2tok = shared.get('ind2tok', {})
        else:
            self.freq = defaultdict(int)
            self.tok2ind = {}
            self.ind2tok = {}

            if self.null_token:
                self.add_token(self.null_token)

            if self.start_token:
                # set special start of sentence word token
                self.add_token(self.start_token)

            if self.end_token:
                # set special end of sentence word token
                self.add_token(self.end_token)

            if self.unk_token:
                # set special unknown word token
                self.add_token(self.unk_token)

            if opt.get('dict_file') and os.path.isfile(opt['dict_file']):
                # load pre-existing dictionary
                self.load(opt['dict_file'])
            elif opt.get('dict_initpath'):
                # load seed dictionary
                opt['dict_initpath'] = modelzoo_path(opt.get('datapath'),
                                                     opt['dict_initpath'])
                self.load(opt['dict_initpath'])

        # initialize tokenizers
        if self.tokenizer == 'nltk':
            try:
                import nltk
            except ImportError:
                raise ImportError('Please install nltk (pip install nltk)')
            # nltk-specific setup
            st_path = 'tokenizers/punkt/{0}.pickle'.format(
                opt['dict_language'])
            try:
                self.sent_tok = nltk.data.load(st_path)
            except LookupError:
                nltk.download('punkt')
                self.sent_tok = nltk.data.load(st_path)
            self.word_tok = nltk.tokenize.treebank.TreebankWordTokenizer()
        elif self.tokenizer == 'spacy':
            try:
                import spacy
            except ImportError:
                raise ImportError('Please install spacy and spacy "en" model: '
                                  '`pip install -U spacy && '
                                  'python -m spacy download en` '
                                  'or find alternative installation options '
                                  'at spacy.io')
            self.NLP = spacy.load('en')
        elif self.tokenizer == 'bpe':
            if not opt.get('dict_file'):
                raise RuntimeError('--dict-file is mandatory.')
            self.bpehelper = _BPEHelper(opt.get('dict_file') + '.codecs')

        if not shared:
            if self.null_token:
                # fix count for null token to one billion and three
                self.freq[self.null_token] = 1000000003

            if self.start_token:
                # fix count for start of sentence token to one billion and two
                self.freq[self.start_token] = 1000000002

            if self.end_token:
                # fix count for end of sentence token to one billion and one
                self.freq[self.end_token] = 1000000001

            if self.unk_token:
                # fix count for unknown token to one billion
                self.freq[self.unk_token] = 1000000000

            if opt.get('dict_file'):
                self.save_path = opt['dict_file']
Example #30
    def parse_args(self, args=None, namespace=None, print_args=True):
        """
        Parses the provided arguments and returns a dictionary of the ``args``.

        We specifically remove items with ``None`` as values in order
        to support the style ``opt.get(key, default)``, which would otherwise
        return ``None``.
        """
        self.add_extra_args(args)
        self.args = super().parse_args(args=args)
        self.opt = vars(self.args)

        # custom post-parsing
        self.opt['parlai_home'] = self.parlai_home

        # set environment variables
        # Priority for setting the datapath (same applies for download_path):
        # --datapath -> os.environ['PARLAI_DATAPATH'] -> <self.parlai_home>/data
        if self.opt.get('download_path'):
            os.environ['PARLAI_DOWNPATH'] = self.opt['download_path']
        elif os.environ.get('PARLAI_DOWNPATH') is None:
            os.environ['PARLAI_DOWNPATH'] = os.path.join(self.parlai_home, 'downloads')
        if self.opt.get('datapath'):
            os.environ['PARLAI_DATAPATH'] = self.opt['datapath']
        elif os.environ.get('PARLAI_DATAPATH') is None:
            os.environ['PARLAI_DATAPATH'] = os.path.join(self.parlai_home, 'data')

        self.opt['download_path'] = os.environ['PARLAI_DOWNPATH']
        self.opt['datapath'] = os.environ['PARLAI_DATAPATH']

        # set all arguments specified in commandline as overridable
        option_strings_dict = {}
        store_true = []
        store_false = []
        for group in self._action_groups:
            for a in group._group_actions:
                if hasattr(a, 'option_strings'):
                    for option in a.option_strings:
                        option_strings_dict[option] = a.dest
                        if '_StoreTrueAction' in str(type(a)):
                            store_true.append(option)
                        elif '_StoreFalseAction' in str(type(a)):
                            store_false.append(option)

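        # Scan the raw CLI tokens: any flag the user explicitly passed is
        # recorded in self.overridable, so it later takes precedence over
        # values restored from a saved .opt file.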
        for i in range(len(self.cli_args)):
            if self.cli_args[i] in option_strings_dict:
                if self.cli_args[i] in store_true:
                    self.overridable[option_strings_dict[self.cli_args[i]]] = True
                elif self.cli_args[i] in store_false:
                    self.overridable[option_strings_dict[self.cli_args[i]]] = False
                elif i < len(self.cli_args) - 1 and self.cli_args[i + 1][:1] != '-':
                    key = option_strings_dict[self.cli_args[i]]
                    self.overridable[key] = self.opt[key]
        self.opt['override'] = self.overridable

        # map filenames that start with 'models:' to point to the model zoo dir
        if self.opt.get('model_file') is not None:
            self.opt['model_file'] = modelzoo_path(self.opt.get('datapath'),
                                                   self.opt['model_file'])
        if self.opt['override'].get('model_file') is not None:
            # also check override
            self.opt['override']['model_file'] = modelzoo_path(
                self.opt.get('datapath'), self.opt['override']['model_file'])
        if self.opt.get('dict_file') is not None:
            self.opt['dict_file'] = modelzoo_path(self.opt.get('datapath'),
                                                  self.opt['dict_file'])
        if self.opt['override'].get('dict_file') is not None:
            # also check override
            self.opt['override']['dict_file'] = modelzoo_path(
                self.opt.get('datapath'), self.opt['override']['dict_file'])

        # add start time of an experiment
        self.opt['starttime'] = datetime.datetime.today().strftime('%b%d_%H-%M')

        if print_args:
            self.print_args()
            print("\n".join([
                "",
                "*" * 80,
                "Thank you for using ParlAI! We are conducting a user survey.",
                "Please consider filling it out at https://forms.gle/uEFbYGP7w6hiuGQT9",
                "*" * 80,
                ""
            ]))

        return self.opt
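
The opt['override'] dict assembled above is what lets explicit command-line values win when a saved .opt file is merged back in elsewhere in ParlAI. A hypothetical invocation makes the effect concrete:

# e.g. a run with: --model-file zoo:blender/blender_90M/model --beam-size 10
# after parse_args:
#   opt['beam_size']             == 10
#   opt['override']['beam_size'] == 10  # consulted when the saved .opt is merged in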