def create_agent(opt: Opt, requireModelExists=False):
    """
    Create an agent from the options ``model``, ``model_params`` and ``model_file``.

    The input is either of the form
    ``parlai.agents.ir_baseline.agents:IrBaselineAgent`` (i.e. the path followed by
    the class name) or else just ``ir_baseline`` which assumes the path above, and a
    class name suffixed with 'Agent'.

    If ``model_file`` is available in the options, this function can also attempt to
    load the model from that location instead. This avoids having to specify all the
    other options necessary to set up the model, including its name, as they are all
    loaded from the options file if it exists (the file opt['model_file'] + '.opt'
    must exist and contain a pickled or json dict containing the model's options).
    """
    if opt.get('datapath', None) is None:
        add_datapath_and_model_args(opt)

    if opt.get('model_file'):
        opt['model_file'] = modelzoo_path(opt.get('datapath'), opt['model_file'])
        if requireModelExists and not PathManager.exists(opt['model_file']):
            raise RuntimeError(
                'WARNING: Model file does not exist, check to make '
                'sure it is correct: {}'.format(opt['model_file'])
            )
        # Attempt to load the model from the model file first (this way we do
        # not even have to specify the model name as a parameter)
        model = create_agent_from_opt_file(opt)
        if model is not None:
            return model
        else:
            logging.info(f"No model with opt yet at: {opt['model_file']}(.opt)")

    if opt.get('model'):
        model_class = load_agent_module(opt['model'])
        # if we want to load weights from --init-model, compare opts with
        # loaded ones
        compare_init_model_opts(opt, opt)
        model = model_class(opt)
        if requireModelExists and hasattr(model, 'load') and not opt.get('model_file'):
            # double check that we didn't forget to set model_file on loadable model
            logging.warning('model_file unset but model has a `load` function.')
        return model
    else:
        raise RuntimeError('Need to set `model` argument to use create_agent.')
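# --- Illustrative usage sketch (not part of the original file). Assumes a
# working ParlAI install; the zoo path below is an example and triggers a
# download the first time it is resolved.
from parlai.core.agents import create_agent
from parlai.core.params import ParlaiParser

opt = ParlaiParser(True, True).parse_args(
    ['--model-file', 'zoo:unittest/seq2seq/model']
)
agent = create_agent(opt, requireModelExists=True)
agent.observe({'text': 'hello', 'episode_done': True})
print(agent.act()['text'])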
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    # Index words in embedding file
    if (
        self.opt['pretrained_words']
        and self.opt.get('embedding_file')
        and not self.opt.get('trained', False)
    ):
        print('[ Indexing words with embeddings... ]')
        self.embedding_words = set()
        self.opt['embedding_file'] = modelzoo_path(
            self.opt.get('datapath'), self.opt['embedding_file']
        )
        with open(self.opt['embedding_file']) as f:
            for line in f:
                w = normalize_text(line.rstrip().split(' ')[0])
                self.embedding_words.add(w)
        print('[ Num words in set = %d ]' % len(self.embedding_words))
    else:
        self.embedding_words = None
def __init__(self, opt: Opt):
    self.opt = opt
    self.agents = []
    self.agent_dict = None
    self.generations = []
    self.input_type = 'Search'
    self.knowledge_access_method = KnowledgeAccessMethod(
        opt['knowledge_access_method']
    )
    model_file = modelzoo_path(opt['datapath'], opt['query_generator_model_file'])
    if (
        self.knowledge_access_method is KnowledgeAccessMethod.SEARCH_ONLY
        and 'blenderbot2/query_generator/model' in model_file
    ):
        raise ValueError(
            'You cannot use the blenderbot2 query generator with search_only. Please '
            'consider setting --query-generator-model-file zoo:sea/bart_sq_gen/model '
            'instead.'
        )
    if model_file and os.path.exists(model_file):
        logging.info(f'Building Query Generator from file: {model_file}')
        logging.disable()
        overrides: Dict[str, Any] = {'skip_generation': False}
        overrides['inference'] = opt['query_generator_inference']
        overrides['beam_size'] = opt.get('query_generator_beam_size', 3)
        overrides['beam_min_length'] = opt.get('query_generator_beam_min_length', 2)
        overrides['model_parallel'] = opt['model_parallel']
        overrides['no_cuda'] = opt['no_cuda']
        if self.opt['query_generator_truncate'] > 0:
            overrides['text_truncate'] = self.opt['query_generator_truncate']
            overrides['truncate'] = self.opt['query_generator_truncate']
        base_agent = create_agent_from_model_file(model_file, opt_overrides=overrides)
        assert isinstance(base_agent, TorchAgent)
        self.agents = [base_agent]
        bsz = max(opt.get('batchsize') or 1, opt.get('eval_batchsize') or 1)
        rag_turn_n_turns = opt.get('rag_turn_n_turns', 1)
        if bsz > 1 or rag_turn_n_turns > 1:
            self.agents += [
                create_agent_from_shared(self.agents[0].share())
                for _ in range((bsz * rag_turn_n_turns) - 1)
            ]
        self.agent_dict = self.agents[0].build_dictionary()
        logging.enable()
def __init__(self, opt, shared=None):
    # initialize fields
    self.opt = copy.deepcopy(opt)
    self.minfreq = opt.get('dict_minfreq', DictionaryAgent.default_minfreq)
    self.null_token = '__PAD__'
    self.end_token = '__SOC__'
    self.unk_token = '__UNK__'
    self.start_token = '__SOC__'
    self.tokenizer = opt.get('dict_tokenizer', 'whitespace')
    self.lower = opt.get('dict_lower', DictionaryAgent.default_lower)
    self.maxtokens = opt.get('dict_maxtokens', DictionaryAgent.default_maxtokens)
    self.textfields = opt.get(
        'dict_textfields', DictionaryAgent.default_textfields
    ).split(",")

    if shared:
        self.freq = shared.get('freq', {})
        self.tok2ind = shared.get('tok2ind', {})
        self.ind2tok = shared.get('ind2tok', {})
    else:
        self.freq = defaultdict(int)
        self.tok2ind = {}
        self.ind2tok = {}

        if opt.get('dict_file') and os.path.isfile(opt['dict_file']):
            # load pre-existing dictionary
            self.load(opt['dict_file'])
        elif opt.get('dict_initpath'):
            # load seed dictionary
            opt['dict_initpath'] = modelzoo_path(
                opt.get('datapath'), opt['dict_initpath']
            )
            self.load(opt['dict_initpath'])

    self.add_token(self.null_token)
    self.add_token(self.start_token)
    self.add_token(self.end_token)
    self.add_token(self.unk_token)

    if not shared:
        if opt.get('dict_file'):
            self.save_path = opt['dict_file']

    # cache unk token for later
    self._unk_token_idx = self.tok2ind.get(self.unk_token)
def get_model_name(opt):
    model = opt.get('model', None)
    if model is None:
        # try to get model name from model opt file
        model_file = opt.get('model_file', None)
        if model_file is not None:
            model_file = modelzoo_path(opt.get('datapath'), model_file)
            optfile = model_file + '.opt'
            if os.path.isfile(optfile):
                try:
                    # try json first
                    with open(optfile, 'r', encoding='utf-8') as handle:
                        new_opt = json.load(handle)
                        model = new_opt.get('model', None)
                except UnicodeDecodeError:
                    # oops, it's pickled
                    with open(optfile, 'rb') as handle:
                        new_opt = pickle.load(handle)
                        model = new_opt.get('model', None)
    return model
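# --- Hypothetical illustration of the fallback above: when 'model' is unset,
# get_model_name reads the model name out of '<model_file>.opt'. The file
# names here are toy values.
import json
import os
import tempfile

tmp = tempfile.mkdtemp()
model_file = os.path.join(tmp, 'model')
with open(model_file + '.opt', 'w', encoding='utf-8') as f:
    json.dump({'model': 'seq2seq'}, f)

opt = {'model_file': model_file, 'datapath': tmp}
# get_model_name(opt) -> 'seq2seq'
# (plain, non-zoo paths pass through modelzoo_path unchanged)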
def set_options(self, name, device):
    option = {
        "n_image_tokens": 1,
        "n_image_channels": 1,
        "image_fusion_type": "late",
    }
    add_datapath_and_model_args(option)
    datapath = option.get("datapath")
    option['model_file'] = modelzoo_path(datapath, name)
    option["override"] = {
        "no_cuda": "cuda" not in device,
    }
    if "cuda:" in device:
        option["override"]["gpu"] = int(device.split(":")[1])
    elif "cuda" in device:
        option["override"]["gpu"] = 0
    return option
def load_passages(self) -> List[Tuple[str, str, str]]:
    """
    Load passages from tsv file.

    Limit passages returned according to shard number.

    :return passages:
        return a list of (doc_id, doc_text, doc_title) tuples
    """
    logging.info(f"Loading {self.opt['passages_file']}")
    rows = load_passages_list(
        modelzoo_path(self.opt['datapath'], self.opt['passages_file'])  # type: ignore
    )
    shard_id, num_shards = self.opt['shard_id'], self.opt['num_shards']
    shard_size = int(len(rows) / num_shards)
    start_idx = shard_id * shard_size
    end_idx = start_idx + shard_size
    logging.info(
        f'Shard {shard_id} of {num_shards} encoding psg index '
        f'{start_idx} to {end_idx}, out of {len(rows)}'
    )
    return rows[start_idx:end_idx]
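# --- Toy illustration of the sharding arithmetic above. Note that with
# integer division, the last len(rows) % num_shards rows are silently dropped
# unless a caller extends the final shard to len(rows).
rows = list(range(10))  # stand-ins for (doc_id, doc_text, doc_title) tuples
num_shards, shard_id = 3, 1
shard_size = int(len(rows) / num_shards)  # 3
start_idx = shard_id * shard_size         # 3
end_idx = start_idx + shard_size          # 6
assert rows[start_idx:end_idx] == [3, 4, 5]  # row 9 belongs to no shard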
def __init__(self, opt: Opt):
    self.opt = opt
    self.agents = []
    self.agent_dict = None
    self.generations = []
    self.input_type = 'Memory'
    self.delimiter = opt.get('memory_decoder_delimiter', '\n')
    self.one_line_memories = opt.get('memory_decoder_one_line_memories', False)
    model_file = modelzoo_path(opt['datapath'], opt['memory_decoder_model_file'])
    if model_file and os.path.exists(model_file):
        logging.info(f'Building Memory Decoder from file: {model_file}')
        logging.disable()
        overrides = {
            'skip_generation': False,
            'inference': 'beam',
            'beam_size': opt.get('memory_decoder_beam_size', 3),
            'beam_min_length': opt.get('memory_decoder_beam_min_length', 10),
            'beam_block_ngram': 3,
            'no_cuda': opt.get('no_cuda', False),
        }
        if self.opt.get('memory_decoder_truncate', -1) > 0:
            overrides['text_truncate'] = self.opt['memory_decoder_truncate']
            overrides['truncate'] = self.opt['memory_decoder_truncate']
        base_agent = create_agent_from_model_file(model_file, opt_overrides=overrides)
        assert isinstance(base_agent, TorchAgent)
        self.agents = [base_agent]
        assert isinstance(self.agents[0], TorchAgent)
        copies = max(100, (opt['batchsize'] * opt.get('rag_turn_n_turns', 1)))
        self.agents += [
            create_agent_from_shared(self.agents[0].share()) for _ in range(copies)
        ]
        self.agent_dict = self.agents[0].build_dictionary()
        logging.enable()
def test_set_model_file_without_dict_file(self): """Check that moving a model without moving the dictfile raises an error.""" # Download model, move to a new location datapath = ParlaiParser().parse_args(print_args=False)['datapath'] try: # remove unittest models if there before shutil.rmtree(os.path.join(datapath, 'models/unittest')) except FileNotFoundError: pass testing_utils.download_unittest_models() zoo_path = 'zoo:unittest/seq2seq/model' model_path = modelzoo_path(datapath, zoo_path) os.remove(model_path + '.dict') # Test that eval model fails with self.assertRaises(RuntimeError): testing_utils.eval_model(dict(task='babi:task1k:1', model_file=model_path)) try: # remove unittest models if there after shutil.rmtree(os.path.join(datapath, 'models/unittest')) except FileNotFoundError: pass
def _get_subagent_opt(
    self,
    filename: str,
    specific_override_args: Dict[str, Any],
    general_override_args: Dict[str, Any],
) -> Opt:
    """
    Given an agent opt, construct the new opt for the agent.

    :param filename:
        opt path
    :param specific_override_args:
        args for the specific agent
    :param general_override_args:
        args specified for all agents
    """
    if not filename.endswith('.opt'):
        filename += '.opt'
    opt = Opt.load(modelzoo_path(self.opt['datapath'], filename))
    opt['override'] = {}
    blocklist_general = ['model', 'model_file', 'init_model']
    general_override_args['skip_generation'] = False
    # Remove the prefix for the model for the specific override args.
    specific_override_args = {
        '_'.join(k.split('_')[1:]): v for k, v in specific_override_args.items()
    }
    override_args = {**general_override_args, **specific_override_args}
    for k, v in override_args.items():
        if k not in blocklist_general and k in opt:
            logging.warning(f'Overriding {k} to {v} (old val: {opt[k]})')
            opt['override'][k] = v
        elif k in specific_override_args:
            logging.warning(f'Key {k} not originally in opt, setting to {v}')
            opt['override'][k] = v
    return opt
def load_embeddings(opt, word_dict):
    """Initialize embeddings from file of pretrained vectors."""
    embeddings = torch.Tensor(len(word_dict), opt['embedding_dim'])
    embeddings.normal_(0, 1)

    # Fill in embeddings (check for the file before resolving the zoo path,
    # so a missing key raises a clear error instead of a KeyError)
    if not opt.get('embedding_file'):
        raise RuntimeError('Tried to load embeddings with no embedding file.')
    opt['embedding_file'] = modelzoo_path(opt.get('datapath'), opt['embedding_file'])
    with open(opt['embedding_file']) as f:
        for line in f:
            parsed = line.rstrip().split(' ')
            if len(parsed) > 2:
                assert len(parsed) == opt['embedding_dim'] + 1
                w = normalize_text(parsed[0])
                if w in word_dict:
                    vec = torch.Tensor([float(i) for i in parsed[1:]])
                    embeddings[word_dict[w]].copy_(vec)

    # Zero NULL token
    embeddings[word_dict['__NULL__']].fill_(0)
    return embeddings
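# --- Sketch of the whitespace-delimited, GloVe-style format load_embeddings
# expects: one token followed by embedding_dim floats per line. A toy file
# with embedding_dim=3 (file name and contents are illustrative):
with open('toy_vectors.txt', 'w') as f:
    f.write('the 0.1 0.2 0.3\n')
    f.write('cat 0.4 0.5 0.6\n')
# load_embeddings({'embedding_dim': 3, 'embedding_file': 'toy_vectors.txt'},
# word_dict) would copy these rows into the matrix for any token present in
# word_dict, leave other rows random-normal, and zero the '__NULL__' row.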
def set_options(self, name, path, class_name, device):
    option = {
        "n_image_tokens": 1,
        "n_image_channels": 1,
        "image_fusion_type": "late",
        "image_features_dim": 2048,
        "image_encoder_num_layers": 1,
    }
    add_datapath_and_model_args(option)
    datapath = option.get('datapath')
    option['model_file'] = modelzoo_path(datapath, name)
    option["override"] = {
        "no_cuda": "cuda" not in device,
    }
    if "cuda:" in device:
        option["override"]["gpu"] = int(device.split(":")[1])
    elif "cuda" in device:
        option["override"]["gpu"] = 0
    my_module = importlib.import_module(path)
    model_class = getattr(my_module, class_name)
    return option, model_class
def set_defaults(opt):
    init_model = None
    # check first for 'init_model' for loading model from file
    if opt.get('init_model') and os.path.isfile(opt['init_model']):
        init_model = opt['init_model']
    # next check for 'model_file', this would override init_model
    if opt.get('model_file') and os.path.isfile(opt['model_file']):
        init_model = opt['model_file']

    if init_model is None:
        # Embeddings options
        opt['embedding_file'] = modelzoo_path(
            opt.get('datapath'), opt['embedding_file']
        )
        if opt.get('embedding_file'):
            if not os.path.isfile(opt['embedding_file']):
                raise IOError('No such file: %s' % opt['embedding_file'])
            with open(opt['embedding_file']) as f:
                dim = len(f.readline().strip().split(' ')) - 1
                if dim == 1:
                    # first line was a dud
                    dim = len(f.readline().strip().split(' ')) - 1
            opt['embedding_dim'] = dim
        elif not opt.get('embedding_dim'):
            raise RuntimeError(
                'Either embedding_file or embedding_dim needs to be specified.'
            )

    # Make sure tune_partial and fix_embeddings are consistent
    if opt['tune_partial'] > 0 and opt['fix_embeddings']:
        print('Setting fix_embeddings to False as tune_partial > 0.')
        opt['fix_embeddings'] = False

    # Make sure fix_embeddings and embedding_file are consistent
    if opt['fix_embeddings'] and not opt.get('embedding_file'):
        print('Setting fix_embeddings to False as embeddings are random.')
        opt['fix_embeddings'] = False
def test_set_model_file_without_dict_file(self): """ Check that moving a model without moving the dictfile raises an error. """ # Download model, move to a new location with testing_utils.tempdir() as datapath: try: # remove unittest models if there before shutil.rmtree(os.path.join(datapath, 'models/unittest')) except FileNotFoundError: pass zoo_path = 'zoo:unittest/seq2seq/model' model_path = modelzoo_path(datapath, zoo_path) PathManager.rm(model_path + '.dict') # Test that eval model fails with self.assertRaises(RuntimeError): testing_utils.eval_model( dict(task='babi:task1k:1', model_file=model_path)) try: # remove unittest models if there after shutil.rmtree(os.path.join(datapath, 'models/unittest')) except FileNotFoundError: pass
def set_defaults(opt):
    # Embeddings options
    opt['embedding_file'] = modelzoo_path(opt.get('datapath'), opt['embedding_file'])
    if opt.get('embedding_file'):
        if not os.path.isfile(opt['embedding_file']):
            raise IOError('No such file: %s' % opt['embedding_file'])
        with open(opt['embedding_file']) as f:
            dim = len(f.readline().strip().split(' ')) - 1
        opt['embedding_dim'] = dim
    elif not opt.get('embedding_dim'):
        raise RuntimeError(
            'Either embedding_file or embedding_dim needs to be specified.'
        )

    # Make sure tune_partial and fix_embeddings are consistent
    if opt['tune_partial'] > 0 and opt['fix_embeddings']:
        print('Setting fix_embeddings to False as tune_partial > 0.')
        opt['fix_embeddings'] = False

    # Make sure fix_embeddings and embedding_file are consistent
    if opt['fix_embeddings']:
        if not opt.get('embedding_file') and not opt.get('init_model'):
            print('Setting fix_embeddings to False as embeddings are random.')
            opt['fix_embeddings'] = False
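# --- Worked example of the dimension inference above: a GloVe-style line
# 'the 0.1 0.2 0.3' splits into 4 fields, so dim = 4 - 1 = 3.
line = 'the 0.1 0.2 0.3'
dim = len(line.strip().split(' ')) - 1
assert dim == 3
# The earlier variant of set_defaults also re-reads the second line whenever
# the first yields dim == 1, which handles word2vec-style "<count> <dim>"
# header lines (two fields -> dim 1 -> "first line was a dud").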
def __init__(self, opt, shared=None): """Set up model.""" super().__init__(opt, shared) opt = self.opt # there is a deepcopy in the init #self.opt = opt # all instances may need some params opt['label_smoothing'] = False opt['src_tgt_weight_share'] = False opt['tgt_prj_weight_share'] = False self.truncate = opt['truncate'] if opt['truncate'] > 0 else None self.metrics = { 'loss': 0.0, 'num_tokens': 0, 'correct_tokens': 0, 'total_skipped_batches': 0 } self.history = {} self.report_freq = opt.get('report_freq', 0.001) self.use_person_tokens = opt.get('person_tokens', False) self.batch_idx = shared and shared.get('batchindex') or 0 self.rank = opt['rank_candidates'] self.beam_size = opt.get('beam_size', 1) self.topk = opt.get('topk', 1) states = {} # check for cuda self.use_cuda = not opt.get('no_cuda') and torch.cuda.is_available() if opt.get('numthreads', 1) > 1: torch.set_num_threads(1) if shared: # set up shared properties self.opt = shared['opt'] opt = self.opt self.dict = shared['dict'] self.START_IDX = shared['START_IDX'] self.END_IDX = shared['END_IDX'] self.NULL_IDX = shared['NULL_IDX'] # answers contains a batch_size list of the last answer produced self.answers = shared['answers'] self.model = shared['model'] self.metrics = shared['metrics'] states = shared.get('states', {}) else: # this is not a shared instance of this class, so do full init # answers contains a batch_size list of the last answer produced self.answers = [None] * opt['batchsize'] if self.use_cuda: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) init_model = None # check first for 'init_model' for loading model from file if opt.get('init_model') and os.path.isfile(opt['init_model']): init_model = opt['init_model'] # next check for 'model_file', this would override init_model if opt.get('model_file') and os.path.isfile(opt['model_file']): init_model = opt['model_file'] if init_model is not None: # load model parameters if available print('[ Loading existing model params from {} ]'.format( init_model)) states = self.load(init_model) if os.path.isfile(init_model + '.dict') or opt['dict_file'] is None: opt['dict_file'] = init_model + '.dict' # load dictionary and basic tokens & vectors self.dict = DictionaryAgent(opt) self.id = 'Transformer' # we use START markers to start our output self.START_IDX = self.dict[self.dict.start_token] # we use END markers to end our output self.END_IDX = self.dict[self.dict.end_token] # get index of null token from dictionary (probably 0) self.NULL_IDX = self.dict[self.dict.null_token] if not hasattr(self, 'model_class'): # this allows child classes to override this but inherit init self.model_class = Transformer # self.model = self.model_class( # opt, len(self.dict), padding_idx=self.NULL_IDX, # start_idx=self.START_IDX, end_idx=self.END_IDX, # longest_label=states.get('longest_label', 1)) self.model = self.model_class(len(self.dict), opt) if opt.get('dict_tokenizer' ) == 'bpe' and opt['embedding_type'] != 'random': print('skipping preinitialization of embeddings for bpe') elif not states and opt['embedding_type'] != 'random': # set up preinitialized embeddings try: import torchtext.vocab as vocab except ImportError as ex: print( 'Please install torch text with `pip install torchtext`' ) raise ex pretrained_dim = 300 if opt['embedding_type'].startswith('glove'): if 'twitter' in opt['embedding_type']: init = 'glove-twitter' name = 'twitter.27B' pretrained_dim = 200 else: init = 'glove' name = '840B' embs = vocab.GloVe(name=name, dim=pretrained_dim, cache=modelzoo_path( 
self.opt.get('datapath'), 'models:glove_vectors')) elif opt['embedding_type'].startswith('fasttext'): init = 'fasttext' embs = vocab.FastText(language='en', cache=modelzoo_path( self.opt.get('datapath'), 'models:fasttext_vectors')) else: raise RuntimeError('embedding type not implemented') if opt['embeddingsize'] != pretrained_dim: rp = torch.Tensor(pretrained_dim, opt['embeddingsize']).normal_() t = lambda x: torch.mm(x.unsqueeze(0), rp) else: t = lambda x: x cnt = 0 for w, i in self.dict.tok2ind.items(): if w in embs.stoi: vec = t(embs.vectors[embs.stoi[w]]) self.model.decoder.tgt_word_emb.weight.data[i] = vec cnt += 1 if opt['lookuptable'] in ['unique', 'dec_out']: # also set encoder lt, since it's not shared self.model.encoder.src_word_emb.weight.data[ i] = vec print( 'Transformer: initialized embeddings for {} tokens from {}.' ''.format(cnt, init)) if states: # set loaded states if applicable self.model.load_state_dict(states['model']) if self.use_cuda: self.model.cuda() # set up criteria if opt.get('numsoftmax', 1) > 1: self.criterion = nn.NLLLoss(ignore_index=self.NULL_IDX, size_average=False) else: self.criterion = nn.CrossEntropyLoss(ignore_index=self.NULL_IDX, size_average=False) if self.use_cuda: self.criterion.cuda() if 'train' in opt.get('datatype', ''): # we only set up optimizers when training # we only set this up for the original instance or hogwild ones self.clip = opt.get('gradient_clip', -1) # set up optimizer lr = opt['learningrate'] optim_class = TransformerAgent.OPTIM_OPTS[opt['optimizer']] kwargs = {'lr': lr} if opt.get('momentum') > 0 and opt['optimizer'] in [ 'sgd', 'rmsprop' ]: kwargs['momentum'] = opt['momentum'] if opt['optimizer'] == 'sgd': kwargs['nesterov'] = True if opt['optimizer'] == 'adam': # https://openreview.net/forum?id=ryQu7f-RZ kwargs['amsgrad'] = True if opt['embedding_type'].endswith('fixed'): print('Transformer: fixing embedding weights.') self.model.decoder.tgt_word_emb.weight.requires_grad = False self.model.encoder.src_word_emb.weight.requires_grad = False if opt['lookuptable'] in ['dec_out', 'all']: # self.model.decoder.e2s.weight.requires_grad = False self.model.tgt_word_prj.weight.requires_grad = False self.optimizer = optim_class( [p for p in self.model.parameters() if p.requires_grad], **kwargs) if states.get('optimizer'): if states['optimizer_type'] != opt['optimizer']: print('WARNING: not loading optim state since optim class ' 'changed.') else: try: self.optimizer.load_state_dict(states['optimizer']) except ValueError: print('WARNING: not loading optim state since model ' 'params changed.') if self.use_cuda: for state in self.optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, 'min', factor=0.5, patience=3, verbose=True) self.reset()
def download(datapath):
    return vocab.Vectors(
        name='wiki.en.vec',
        url=URL,
        cache=modelzoo_path(datapath, 'models:fasttext_vectors'),
    )
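# --- Illustrative call (assumes torchtext is installed and URL is defined in
# this module). Downloads and caches the fastText wiki.en vectors under
# <datapath>/models/fasttext_vectors on first use; the datapath is a toy value.
vectors = download('/path/to/data')
print(vectors['hello'].shape)  # torch.Size([300])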
def parse_args(self, args=None, namespace=None, print_args=True): """Parses the provided arguments and returns a dictionary of the ``args``. We specifically remove items with ``None`` as values in order to support the style ``opt.get(key, default)``, which would otherwise return ``None``. """ self.add_extra_args(args) self.args = super().parse_args(args=args) self.opt = vars(self.args) # custom post-parsing self.opt['parlai_home'] = self.parlai_home if 'batchsize' in self.opt and self.opt['batchsize'] <= 1: # hide batch options self.opt.pop('batch_sort', None) self.opt.pop('context_length', None) # set environment variables if self.opt.get('download_path'): os.environ['PARLAI_DOWNPATH'] = self.opt['download_path'] if self.opt.get('datapath'): os.environ['PARLAI_DATAPATH'] = self.opt['datapath'] # set all arguments specified in commandline as overridable option_strings_dict = {} store_true = [] store_false = [] for group in self._action_groups: for a in group._group_actions: if hasattr(a, 'option_strings'): for option in a.option_strings: option_strings_dict[option] = a.dest if '_StoreTrueAction' in str(type(a)): store_true.append(option) elif '_StoreFalseAction' in str(type(a)): store_false.append(option) for i in range(len(self.cli_args)): if self.cli_args[i] in option_strings_dict: if self.cli_args[i] in store_true: self.overridable[option_strings_dict[self.cli_args[i]]] = \ True elif self.cli_args[i] in store_false: self.overridable[option_strings_dict[self.cli_args[i]]] = \ False elif i < len( self.cli_args) - 1 and self.cli_args[i + 1][:1] != '-': key = option_strings_dict[self.cli_args[i]] self.overridable[key] = self.opt[key] self.opt['override'] = self.overridable # map filenames that start with 'models:' to point to the model zoo dir if self.opt.get('model_file') is not None: self.opt['model_file'] = modelzoo_path(self.opt.get('datapath'), self.opt['model_file']) if self.opt['override'].get('model_file') is not None: # also check override self.opt['override']['model_file'] = modelzoo_path( self.opt.get('datapath'), self.opt['override']['model_file']) if self.opt.get('dict_file') is not None: self.opt['dict_file'] = modelzoo_path(self.opt.get('datapath'), self.opt['dict_file']) if self.opt['override'].get('dict_file') is not None: # also check override self.opt['override']['dict_file'] = modelzoo_path( self.opt.get('datapath'), self.opt['override']['dict_file']) # add start time of an experiment self.opt['starttime'] = datetime.datetime.today().strftime( '%b%d_%H-%M') if print_args: self.print_args() return self.opt
def __init__(self, opt, shared=None): """Initialize DictionaryAgent.""" self.opt = copy.deepcopy(opt) self.minfreq = opt.get('dict_minfreq', DictionaryAgent.default_minfreq) self.null_token = opt.get('dict_nulltoken', DictionaryAgent.default_null) self.end_token = opt.get('dict_endtoken', DictionaryAgent.default_end) self.unk_token = opt.get('dict_unktoken', DictionaryAgent.default_unk) self.start_token = opt.get('dict_starttoken', DictionaryAgent.default_start) self.max_ngram_size = opt.get('dict_max_ngram_size', DictionaryAgent.default_maxngram) self.tokenizer = opt.get('dict_tokenizer', DictionaryAgent.default_tok) self.lower = opt.get('dict_lower', DictionaryAgent.default_lower) self.maxtokens = opt.get('dict_maxtokens', DictionaryAgent.default_maxtokens) self.textfields = opt.get( 'dict_textfields', DictionaryAgent.default_textfields).split(",") try: self.tokenizer_fun = getattr(self, self.tokenizer + '_tokenize') except AttributeError: raise AttributeError('tokenizer type {} not yet supported'.format( self.tokenizer)) if shared: self.freq = shared.get('freq', {}) self.tok2ind = shared.get('tok2ind', {}) self.ind2tok = shared.get('ind2tok', {}) else: self.freq = defaultdict(int) self.tok2ind = {} self.ind2tok = {} if self.null_token: self.add_token(self.null_token) if self.start_token: # set special start of sentence word token self.add_token(self.start_token) if self.end_token: # set special end of sentence word token self.add_token(self.end_token) if self.unk_token: # set special unknown word token self.add_token(self.unk_token) loaded = False # If data built via pytorch data teacher, we need to load prebuilt dict if opt.get('pytorch_teacher_task'): from parlai.scripts.build_pytorch_data import get_pyt_dict_file opt['dict_file'] = get_pyt_dict_file(opt) if opt.get('dict_file'): opt['dict_file'] = modelzoo_path(opt.get('datapath'), opt['dict_file']) if os.path.isfile(opt['dict_file']): # load pre-existing dictionary self.load(opt['dict_file']) loaded = True if not loaded and opt.get('dict_initpath'): # load seed dictionary opt['dict_initpath'] = modelzoo_path(opt.get('datapath'), opt['dict_initpath']) # don't check isfile first, should fail if file not found self.load(opt['dict_initpath']) # initialize tokenizers if self.tokenizer == 'nltk': try: import nltk except ImportError: raise ImportError('Please install nltk (pip install nltk)') # nltk-specific setup st_path = 'tokenizers/punkt/{0}.pickle'.format( opt['dict_language']) try: self.sent_tok = nltk.data.load(st_path) except LookupError: nltk.download('punkt') self.sent_tok = nltk.data.load(st_path) self.word_tok = nltk.tokenize.treebank.TreebankWordTokenizer() elif self.tokenizer == 'spacy': try: import spacy except ImportError: raise ImportError('Please install spacy and spacy "en" model: ' '`pip install -U spacy && ' 'python -m spacy download en` ' 'or find alternative installation options ' 'at spacy.io') self.NLP = spacy.load('en') elif self.tokenizer == 'bpe': if not opt.get('dict_file'): raise RuntimeError('--dict-file is mandatory.') self.bpehelper = _BPEHelper(opt.get('dict_file') + '.codecs') elif self.tokenizer == 'gpt2': if self.lower: raise ValueError( 'Only use --dict-lower false with --dict-tokenizer gpt2') if self.maxtokens > 0 or self.minfreq > 0: raise ValueError( 'You should not filter vocabulary with using --dict-tokenizer gpt2' ' (no --dict-minfreq or --dict-maxtokens).') self.gpt2_bpe = Gpt2BpeHelper(opt) for each_token in self.gpt2_bpe.list_tokens(): self.add_token(each_token) self.freq[each_token] = 1 if 
not shared: if self.null_token: # fix count for null token to one billion and three self.freq[self.null_token] = 1000000003 if self.start_token: # fix count for start of sentence token to one billion and two self.freq[self.start_token] = 1000000002 if self.end_token: # fix count for end of sentence token to one billion and one self.freq[self.end_token] = 1000000001 if self.unk_token: # fix count for unknown token to one billion self.freq[self.unk_token] = 1000000000 if opt.get('dict_file'): self.save_path = opt['dict_file']
def _process_args_to_opts(self, args_that_override: Optional[List[str]] = None):
    self.opt = Opt(vars(self.args))
    extra_ag = []

    if '_subparser' in self.opt:
        # if using the super command, we need to be aware of the subcommand's
        # arguments when identifying things manually set by the user
        self.overridable.update(self.opt['_subparser'].overridable)
        extra_ag = self.opt.pop('_subparser')._action_groups

    # custom post-parsing
    self.opt['parlai_home'] = self.parlai_home
    self.opt = self._infer_datapath(self.opt)

    # set all arguments specified in command line as overridable
    option_strings_dict = {}
    store_true = []
    store_false = []
    for group in self._action_groups + extra_ag:
        for a in group._group_actions:
            if hasattr(a, 'option_strings'):
                for option in a.option_strings:
                    option_strings_dict[option] = a.dest
                    if isinstance(a, argparse._StoreTrueAction):
                        store_true.append(option)
                    elif isinstance(a, argparse._StoreFalseAction):
                        store_false.append(option)

    if args_that_override is None:
        args_that_override = _sys.argv[1:]
    args_that_override = fix_underscores(args_that_override)

    for i in range(len(args_that_override)):
        if args_that_override[i] in option_strings_dict:
            if args_that_override[i] in store_true:
                self.overridable[option_strings_dict[args_that_override[i]]] = True
            elif args_that_override[i] in store_false:
                self.overridable[option_strings_dict[args_that_override[i]]] = False
            elif (
                i < len(args_that_override) - 1
                and args_that_override[i + 1] not in option_strings_dict
            ):
                key = option_strings_dict[args_that_override[i]]
                self.overridable[key] = self.opt[key]
    self.opt['override'] = self.overridable

    # load opts if a file is provided.
    if self.opt.get('init_opt', None) is not None:
        self._load_opts(self.opt)

    # map filenames that start with 'zoo:' to point to the model zoo dir
    options_to_change = {'model_file', 'dict_file', 'bpe_vocab', 'bpe_merge'}
    for each_key in options_to_change:
        if self.opt.get(each_key) is not None:
            self.opt[each_key] = modelzoo_path(
                self.opt.get('datapath'), self.opt[each_key]
            )
        if self.opt['override'].get(each_key) is not None:
            # also check override
            self.opt['override'][each_key] = modelzoo_path(
                self.opt.get('datapath'), self.opt['override'][each_key]
            )

    # add start time of an experiment
    self.opt['starttime'] = datetime.datetime.today().strftime('%b%d_%H-%M')
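# --- Illustrative behavior of modelzoo_path as used above: plain paths pass
# through unchanged, while 'zoo:'/'models:' prefixes resolve under
# <datapath>/models (and may trigger a download for known zoo models, so the
# zoo example is left commented out).
from parlai.core.build_data import modelzoo_path

assert modelzoo_path('/data', '/tmp/my_model') == '/tmp/my_model'
# modelzoo_path('/data', 'zoo:unittest/seq2seq/model')
#   -> '/data/models/unittest/seq2seq/model'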
def build_regret_model(self) -> RagModel:
    """
    Build and return regret RagModel.
    """
    model_file = modelzoo_path(self.opt['datapath'], self.opt['regret_model_file'])
    if model_file:
        assert os.path.exists(
            model_file
        ), f'specify correct path for --regret-model-file (currently {model_file})'
        regret_opt = Opt.load(f'{model_file}.opt')
        regret_opt['n_docs'] = self.opt['n_docs']  # Urgent that this is the same
        # add keys that were not in this model when originally trained
        regret_opt.update({k: v for k, v in self.opt.items() if k not in regret_opt})
        retriever_shared = None
        if all(
            regret_opt[k] == self.opt[k]
            for k in ['rag_retriever_type', 'path_to_index', 'path_to_dpr_passages']
        ):
            logging.warning('Sharing retrievers between model and regret model!')
            retriever_shared = self.model.retriever.share()
        elif self.opt['regret_override_index']:
            # Sharing Index Path & Passages only; not the full retriever
            logging.warning('Overriding initial ReGReT model index')
            regret_opt['path_to_index'] = self.opt['path_to_index']
            regret_opt['path_to_dpr_passages'] = self.opt['path_to_dpr_passages']

        if self.opt['regret_dict_file']:
            regret_opt['dict_file'] = self.opt['regret_dict_file']
        regret_dict = self.dictionary_class()(regret_opt)
        model = RagModel(regret_opt, regret_dict, retriever_shared=retriever_shared)
        with PathManager.open(model_file, 'rb') as f:
            states = torch.load(
                f,
                map_location=lambda cpu, _: cpu,
                pickle_module=parlai.utils.pickle,
            )
        assert 'model' in states
        model.load_state_dict(states['model'])
        if self.model_parallel:
            ph = PipelineHelper()
            ph.check_compatibility(self.opt)
            model = ph.make_parallel(model)
        elif self.use_cuda:
            model.cuda()
        if self.fp16:
            model = model.half()

        sync_parameters(model)
        train_params = trainable_parameters(model)
        total_params = total_parameters(model)
        logging.info(
            f"Total regret parameters: {total_params:,d} ({train_params:,d} trainable)"
        )
    else:
        model = self.model

    return model
def _fairseq_opt_wrapper(opt, skip_pretrained_embedding_loading=False):
    """
    Marshal a ParlAI options dict into an argparse.Namespace for API compatibility.

    Also does some necessary post-processing needed for fairseq. Optionally can
    override pretrained embedding options, which is useful if we're just loading a
    model from a checkpoint.

    :param opt: dict. ParlAI options passed around from everywhere.
    :param skip_pretrained_embedding_loading: bool. Don't preload word embeddings.

    :return: an argparse.Namespace object for use in fairseq-py.
    """
    args = argparse.Namespace()

    # first set args according to ParlAI options
    for key in opt:
        if opt[key] is not None:
            setattr(args, key, opt[key])

    # at this point the user *must* have specified an arch
    if not hasattr(args, "arch"):
        raise ValueError("--arch/-a must be specified")
    # fill in default options from the model
    models.ARCH_CONFIG_REGISTRY[args.arch](args)

    # post processing of args. See
    # https://github.com/pytorch/fairseq/blob/v0.5.0/fairseq/options.py#L95
    if hasattr(args, "lr"):
        args.lr = options.eval_str_list(args.lr, type=float)
    if hasattr(args, "update_freq"):
        args.update_freq = options.eval_str_list(args.update_freq, int)
    if hasattr(args, "max_sentences_valid"):
        args.max_sentences_valid = args.max_sentences
    if getattr(args, "truncate") == -1:
        # some torch agents use positional embeddings, which must have a max length
        setattr(args, "truncate", 1024)
    if not hasattr(args, "max_source_positions"):
        # fairseq uses a different name for this CLI parameter
        # Sometimes it's set in model defaults, but not for all models
        setattr(args, "max_source_positions", getattr(args, "truncate"))
        # if we don't have source lengths, we don't have target lengths
        setattr(args, "max_target_positions", getattr(args, "truncate"))

    # handle modelzoo if possible
    for k in ("encoder_embed_path", "decoder_embed_path"):
        if getattr(args, k, None) is None:
            # not an argument for this model, pretrained embeddings don't matter
            continue
        elif skip_pretrained_embedding_loading:
            # if we want to skip pretrained, then hide the option from fairseq
            setattr(args, k, None)
        else:
            # otherwise we may need to modelzoo adjust the path for fairseq
            setattr(args, k, modelzoo_path(opt.get("datapath"), getattr(args, k)))

    # Here we hardcode a few options that we currently do not support
    # turn off distributed training
    args.distributed_world_size = 1
    args.distributed_rank = 0

    return args, vars(args)
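# --- Minimal standalone sketch of the dict -> argparse.Namespace marshalling
# used above (no fairseq required; the opt values are toy examples):
import argparse

opt = {'arch': 'transformer', 'lr': 0.25, 'momentum': None}
args = argparse.Namespace()
for key in opt:
    if opt[key] is not None:  # None values are dropped, as in the wrapper
        setattr(args, key, opt[key])
assert args.arch == 'transformer' and not hasattr(args, 'momentum')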
def test_load_dpr(self):
    opt = ParlaiParser(True, True).parse_args([])
    # First, we'll load up a DPR model from the zoo dpr file.
    default_query_encoder = DprQueryEncoder(
        opt, dpr_model='bert', pretrained_path=DPR_ZOO_MODEL
    )
    rag_sequence_query_encoder = DprQueryEncoder(
        opt,
        dpr_model='bert_from_parlai_rag',
        pretrained_path=RAG_SEQUENCE_ZOO_MODEL,
    )
    assert not torch.allclose(
        default_query_encoder.embeddings.weight.float().cpu(),
        rag_sequence_query_encoder.embeddings.weight.float().cpu(),
    )
    # 1. Create a zoo RAG Agent, which involves a trained DPR model
    rag = create_agent(
        Opt(
            {
                'model_file': modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {'retriever_debug_index': 'compressed', 'fp16': False},
            }
        )
    )
    # The default rag token model should have different query encoders
    # from both the RAG_SEQUENCE_ZOO_MODEL, and the default DPR_ZOO_MODEL
    assert not torch.allclose(
        rag_sequence_query_encoder.embeddings.weight.float().cpu(),
        rag.model.retriever.query_encoder.embeddings.weight.float().cpu(),
    )
    assert not torch.allclose(
        default_query_encoder.embeddings.weight.float().cpu(),
        rag.model.retriever.query_encoder.embeddings.weight.float().cpu(),
    )
    # 2. create a RAG Agent with the rag_sequence_zoo_model DPR model
    rag = create_agent(
        Opt(
            {
                'model_file': modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {
                    'retriever_debug_index': 'compressed',
                    'dpr_model_file': modelzoo_path(
                        opt['datapath'], RAG_SEQUENCE_ZOO_MODEL
                    ),
                    'query_model': 'bert_from_parlai_rag',
                    'fp16': False,
                },
            }
        )
    )
    # If we override the DPR Model file, we should now have the same
    # weights as the query encoder from above.
    assert torch.allclose(
        rag_sequence_query_encoder.embeddings.weight.float().cpu(),
        rag.model.retriever.query_encoder.embeddings.weight.float().cpu(),
    )
    # 3. Create a RAG Agent with the default DPR zoo model
    rag = create_agent(
        Opt(
            {
                'model_file': modelzoo_path(opt['datapath'], RAG_TOKEN_ZOO_MODEL),
                'override': {
                    'retriever_debug_index': 'compressed',
                    'dpr_model_file': modelzoo_path(opt['datapath'], DPR_ZOO_MODEL),
                    'fp16': False,
                },
            }
        )
    )
    # This model was trained with the DPR_ZOO_MODEL, and yet now should have the
    # same weights as we explicitly specified it.
    assert torch.allclose(
        default_query_encoder.embeddings.weight.float().cpu(),
        rag.model.retriever.query_encoder.embeddings.weight.float().cpu(),
    )
def _process_args_to_opts(self, args_that_override: Optional[List[str]] = None):
    self.opt = Opt(vars(self.args))

    # custom post-parsing
    self.opt['parlai_home'] = self.parlai_home
    self.opt = self._infer_datapath(self.opt)

    # set all arguments specified in command line as overridable
    option_strings_dict = {}
    store_true = []
    store_false = []
    for group in self._action_groups:
        for a in group._group_actions:
            if hasattr(a, 'option_strings'):
                for option in a.option_strings:
                    option_strings_dict[option] = a.dest
                    if '_StoreTrueAction' in str(type(a)):
                        store_true.append(option)
                    elif '_StoreFalseAction' in str(type(a)):
                        store_false.append(option)

    if args_that_override is None:
        args_that_override = _sys.argv[1:]

    for i in range(len(args_that_override)):
        if args_that_override[i] in option_strings_dict:
            if args_that_override[i] in store_true:
                self.overridable[option_strings_dict[args_that_override[i]]] = True
            elif args_that_override[i] in store_false:
                self.overridable[option_strings_dict[args_that_override[i]]] = False
            elif (
                i < len(args_that_override) - 1
                and args_that_override[i + 1][:1] != '-'
            ):
                key = option_strings_dict[args_that_override[i]]
                self.overridable[key] = self.opt[key]
    self.opt['override'] = self.overridable

    # load opts if a file is provided.
    if self.opt.get('init_opt', None) is not None:
        self._load_opts(self.opt)

    # map filenames that start with 'zoo:' to point to the model zoo dir
    if self.opt.get('model_file') is not None:
        self.opt['model_file'] = modelzoo_path(
            self.opt.get('datapath'), self.opt['model_file']
        )
    if self.opt['override'].get('model_file') is not None:
        # also check override
        self.opt['override']['model_file'] = modelzoo_path(
            self.opt.get('datapath'), self.opt['override']['model_file']
        )
    if self.opt.get('dict_file') is not None:
        self.opt['dict_file'] = modelzoo_path(
            self.opt.get('datapath'), self.opt['dict_file']
        )
    if self.opt['override'].get('dict_file') is not None:
        # also check override
        self.opt['override']['dict_file'] = modelzoo_path(
            self.opt.get('datapath'), self.opt['override']['dict_file']
        )

    # add start time of an experiment
    self.opt['starttime'] = datetime.datetime.today().strftime('%b%d_%H-%M')
def learn_arora(opt):
    """
    Go through ConvAI2 data and collect word counts, thus compute the unigram
    probability distribution. Use those probs to compute weighted sentence
    embeddings for all utterances, thus compute first principal component.

    Save all info to arora.pkl file.
    """
    arora_file = os.path.join(opt['datapath'], 'controllable_dialogue', 'arora.pkl')

    opt['task'] = 'fromfile:parlaiformat'
    opt['log_every_n_secs'] = 2

    print('Getting word counts from ConvAI2 train set...')
    opt['datatype'] = 'train:ordered'
    opt['fromfile_datapath'] = os.path.join(
        opt['datapath'], 'controllable_dialogue', 'ConvAI2_parlaiformat', 'train.txt'
    )
    # Don't include inputs: the ConvAI2 train set reverses every convo, so every
    # utterance already appears as a label:
    word_counter_train, total_count_train, all_utts_train = get_word_counts(
        opt, count_inputs=False
    )

    print('Getting word counts from ConvAI2 val set...')
    opt['datatype'] = 'valid'
    opt['fromfile_datapath'] = os.path.join(
        opt['datapath'], 'controllable_dialogue', 'ConvAI2_parlaiformat', 'valid.txt'
    )
    # Do include inputs: the ConvAI2 val set doesn't reverse convos:
    word_counter_valid, total_count_valid, all_utts_valid = get_word_counts(
        opt, count_inputs=True
    )

    # Merge word counts
    word_counter = word_counter_train
    for word, count in word_counter_valid.items():
        word_counter[word] += count
    total_count = total_count_train + total_count_valid

    # Merge all_utts
    all_utts = all_utts_train + all_utts_valid

    # Compute unigram prob for every word
    print("Computing unigram probs for all words...")
    word2prob = {w: c / total_count for w, c in word_counter.items()}

    # Settings for sentence embedder
    arora_a = 0.0001
    glove_name = '840B'
    glove_dim = 300
    glove_cache = modelzoo_path(opt['datapath'], 'models:glove_vectors')

    # Embed every sentence, without removing first singular value
    print('Embedding all sentences...')
    sent_embedder = SentenceEmbedder(
        word2prob,
        arora_a,
        glove_name,
        glove_dim,
        first_sv=None,
        glove_cache=glove_cache,
    )
    utt_embs = []
    log_timer = TimeLogger()
    for n, utt in enumerate(all_utts):
        utt_emb = sent_embedder.embed_sent(utt.split(), rem_first_sv=False)
        utt_embs.append(utt_emb)
        if log_timer.time() > opt['log_every_n_secs']:
            text, _log = log_timer.log(n, len(all_utts))
            print(text)

    # Use SVD to calculate singular vector
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.svd.html
    print('Calculating SVD...')
    utt_embs = np.stack(utt_embs, axis=0)  # shape (num_embs, glove_dim)
    U, s, V = np.linalg.svd(utt_embs, full_matrices=False)
    first_sv = V[0, :]  # first row of V. shape (glove_dim)

    # Remove singular vector from all embs to get complete Arora-style sent embs
    print('Removing singular vec from all sentence embeddings...')
    utt_embs_adj = [
        remove_first_sv(torch.Tensor(emb), torch.Tensor(first_sv)).numpy()
        for emb in utt_embs
    ]  # list of np arrays shape (glove_dim)

    # Make dict mapping ConvAI2 dataset utterances to Arora sent emb.
    # We save this to file for convenience (e.g. if you want to inspect)
    utt2emb = {utt: emb for (utt, emb) in zip(all_utts, utt_embs_adj)}

    # Save unigram distribution, first singular value, hyperparameter value for a,
    # info about GloVe vectors used, and full dict of utt->emb to file
    print("Saving Arora embedding info to %s..." % arora_file)
    with open(arora_file, "wb") as f:
        pickle.dump(
            {
                'word2prob': word2prob,  # dict: string to float between 0 and 1
                'first_sv': first_sv,  # np array shape (glove_dim)
                'arora_a': arora_a,  # float, 0.0001
                'glove_name': glove_name,  # string, '840B'
                'glove_dim': glove_dim,  # int, 300
                'utt2emb': utt2emb,  # dict: string to np array shape (glove_dim)
            },
            f,
        )
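# --- Minimal sketch of the Arora et al. (2017) SIF weighting that
# SentenceEmbedder presumably implements: a weighted average of word vectors
# with weight a / (a + p(w)), then removal of the projection onto the first
# singular vector. Function and variable names here are illustrative.
import numpy as np

def sif_embed(tokens, word2prob, word2vec, a=0.0001, first_sv=None):
    # weighted average of the word vectors; rare words get weight near 1
    vecs = [word2vec[w] * (a / (a + word2prob[w])) for w in tokens if w in word2vec]
    emb = np.mean(vecs, axis=0)
    if first_sv is not None:
        # subtract the component along the first principal direction
        emb = emb - np.dot(emb, first_sv) * first_sv
    return emb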
def test_chat_world(self):
    """
    Test functionality of the chat world.
    """
    with testing_utils.tempdir() as tmpdir:
        save_folder = tmpdir

        # Params
        model_name = 'blender_90M'
        zoo_model_file = 'zoo:blender/blender_90M/model'
        model = 'TransformerGenerator'
        num_turn_pairs = 6
        config_folder = os.path.join(
            os.path.dirname(os.path.realpath(run.__file__)), 'config'
        )
        datapath = os.path.join(tmpdir, 'data')

        # Download zoo model file
        model_file = modelzoo_path(datapath, zoo_model_file)

        # Define opt
        base_model_folder = os.path.dirname(os.path.dirname(model_file))
        # Get the folder that encloses the innermost model folder
        with open(os.path.join(config_folder, 'left_pane_text.html')) as f:
            left_pane_text = f.readlines()
        with open(os.path.join(config_folder, 'annotations_config.json')) as f:
            annotations_config = json.load(f)
        opt = Opt(
            {
                'annotations_config': annotations_config,
                'annotations_intro': ANNOTATIONS_INTRO_TEXT,
                'base_model_folder': base_model_folder,
                'check_acceptability': False,
                'conversation_start_mode': 'hi',
                'final_rating_question': 'Please rate your partner on a scale of 1-5.',
                'include_persona': False,
                'is_sandbox': True,
                'left_pane_text': left_pane_text,
                'save_folder': save_folder,
                'task': 'turn_annotations',
                'task_model_parallel': False,
            }
        )

        # Construct desired dialog
        human_agent_id = "Person1"
        bot_utterances = [
            "Hello, how are you today? I just got back from a long day at work, "
            "so I'm nervous.",
            "I just don't know what to do. I've never been so nervous in my life.",
            "Yes, I'll probably go to the movies. What about you? What do you like to do?",
            "That's great! What kind of restaurant do you usually go to? I love italian food.",
            "I love thai as well. What's your favorite kind of thai food? I like thai food the best.",
            'Oh, I love peanuts! I love all kinds of peanuts. Do you eat a lot of peanuts?',
            "I eat peanuts a lot, but only a few times a week. It's good for you.",
        ]
        human_utterances = [
            "What are you nervous about?",
            "Do you have any plans for the weekend?",
            "Yeah that sounds great! I like to bike and try new restaurants.",
            "Oh, Italian food is great. I also love Thai and Indian.",
            "Hmmm - anything with peanuts? Or I like when they have spicy licorice-like herbs.",
            "Ha, a decent amount, probably. What about you?",
        ]
        # Arbitrarily chosen buckets. The +1 is for the final model response
        # at the end
        bucket_assignments = [
            {
                'bucket_0': False,
                'bucket_1': False,
                'bucket_2': True,
                'bucket_3': False,
                'bucket_4': True,
            }
        ] * (num_turn_pairs + 1)
        final_rating = 3
        fake_first_human_turn = {
            "left_pane_text": left_pane_text,
            "episode_done": False,
            "id": "Person1",
            "text": "Hi!",
            "fake_start": True,
            "agent_idx": 0,
            "config": {
                "min_num_turns": num_turn_pairs,
                "annotations_config": annotations_config,
            },
        }
        final_bot_turn = {
            "agent_idx": 1,
            "text": bot_utterances[num_turn_pairs],
            "id": model,
            "problem_data": {
                "turn_idx": num_turn_pairs * 2 + 1,
                **bucket_assignments[num_turn_pairs],
                "final_rating": str(final_rating),
            },
        }
        dialog = [fake_first_human_turn]
        for turn_pair_idx in range(num_turn_pairs):
            bot_turn = {
                "agent_idx": 1,
                "text": bot_utterances[turn_pair_idx],
                "id": model,
                "problem_data": {
                    "turn_idx": turn_pair_idx * 2 + 1,
                    **bucket_assignments[turn_pair_idx],
                },
            }
            human_turn = {
                "agent_idx": 0,
                "text": human_utterances[turn_pair_idx],
                "id": human_agent_id,
            }
            dialog += [bot_turn, human_turn]
        dialog += [final_bot_turn]

        # Construct desired output
        desired_results = {
            "personas": None,
            "context_dataset": None,
            "person1_seed_utterance": None,
            "person2_seed_utterance": None,
            "additional_context": None,
            "dialog": dialog,
            "workers": [HUMAN_LIKE_AGENT_WORKER_ID, model_name],
            "bad_workers": [],
            "acceptability_violations": [None],
            "hit_ids": [HUMAN_LIKE_AGENT_HIT_ID, "none"],
            "assignment_ids": [HUMAN_LIKE_AGENT_ASSIGNMENT_ID, "none"],
            "task_description": {
                "annotations_config": annotations_config,
                "model_nickname": model_name,
                "model_file": model_file,
            },
        }

        # Set up semaphore
        max_concurrent_responses = 1
        semaphore = threading.Semaphore(max_concurrent_responses)

        # Set up human agent
        human_worker = HumanLikeChatAgent(
            agent_id=human_agent_id,
            human_utterances=human_utterances,
            bucket_assignments=bucket_assignments,
            final_rating=final_rating,
        )

        # Set up bot agent
        shared_bot_agents = TurkLikeAgent.get_bot_agents(
            opt=opt, active_models=[model_name]
        )

        # Get a bot and add it to the "human" worker
        print(f'Choosing the "{model_name}" model for the bot.')
        agent = create_agent_from_shared(shared_bot_agents[model_name])
        bot_worker = TurkLikeAgent(
            opt,
            model_name=model_name,
            model_agent=agent,
            num_turns=num_turn_pairs,
            semaphore=semaphore,
        )
        workers_including_bot = [human_worker, bot_worker]

        # Define world
        conv_idx = 0
        world = TurnAnnotationsChatWorld(
            opt=opt,
            agents=workers_including_bot,
            num_turns=num_turn_pairs,
            max_resp_time=180,
            tag='conversation t_{}'.format(conv_idx),
            context_info=None,
        )

        # Run conversation
        while not world.episode_done():
            print('About to parley')
            world.parley()

        # Check the output data
        model_nickname, worker_is_unacceptable, convo_finished = world.save_data()
        self.assertEqual(model_nickname, model_name)
        self.assertFalse(worker_is_unacceptable)
        self.assertTrue(convo_finished)

        # Check the final results file saved by the world
        results_path = list(glob.glob(os.path.join(tmpdir, '*_*_sandbox.json')))[0]
        with open(results_path) as f:
            actual_results = json.load(f)
        for k, v in desired_results.items():
            if k == 'task_description':
                for k2, v2 in desired_results[k].items():
                    self.assertEqual(actual_results[k].get(k2), v2)
            else:
                self.assertEqual(actual_results.get(k), v)
def __init__(self, opt: Opt, shared=None):
    """
    Initialize DictionaryAgent.
    """
    self.opt = copy.deepcopy(opt)
    self.minfreq = opt.get('dict_minfreq', DictionaryAgent.default_minfreq)
    self.null_token = opt.get('dict_nulltoken', DictionaryAgent.default_null)
    self.end_token = opt.get('dict_endtoken', DictionaryAgent.default_end)
    self.unk_token = opt.get('dict_unktoken', DictionaryAgent.default_unk)
    self.start_token = opt.get('dict_starttoken', DictionaryAgent.default_start)
    self.max_ngram_size = opt.get(
        'dict_max_ngram_size', DictionaryAgent.default_maxngram
    )
    self.tokenizer = opt.get('dict_tokenizer', DictionaryAgent.default_tok)
    self.lower = opt.get('dict_lower', DictionaryAgent.default_lower)
    self.maxtokens = opt.get('dict_maxtokens', DictionaryAgent.default_maxtokens)
    self.textfields = opt.get(
        'dict_textfields', DictionaryAgent.default_textfields
    ).split(",")

    # used to signal whether we should use training time tricks, like bpe dropout
    self._tokenization_mode = TokenizationMode.TEST_TIME_LABEL

    try:
        self.tokenizer_fun = getattr(self, self.tokenizer + '_tokenize')
    except AttributeError:
        raise AttributeError(
            'tokenizer type {} not yet supported'.format(self.tokenizer)
        )

    if shared:
        self.freq = shared.get('freq', {})
        self.tok2ind = shared.get('tok2ind', {})
        self.ind2tok = shared.get('ind2tok', {})
    else:
        self.additional_special_tokens: List[str] = []
        self.freq = defaultdict(int)
        self.tok2ind = {}
        self.ind2tok = {}

        if self.null_token:
            self.add_token(self.null_token)
        if self.start_token:
            # set special start of sentence word token
            self.add_token(self.start_token)
        if self.end_token:
            # set special end of sentence word token
            self.add_token(self.end_token)
        if self.unk_token:
            # set special unknown word token
            self.add_token(self.unk_token)

        loaded = False
        if opt.get('dict_file'):
            opt['dict_file'] = modelzoo_path(opt.get('datapath'), opt['dict_file'])
            if PathManager.exists(opt['dict_file']):
                # load pre-existing dictionary
                self.load(opt['dict_file'])
                loaded = True

        if not loaded and opt.get('dict_initpath'):
            # load seed dictionary
            opt['dict_initpath'] = modelzoo_path(
                opt.get('datapath'), opt['dict_initpath']
            )
            # don't check isfile first, should fail if file not found
            self.load(opt['dict_initpath'])
        opt['dict_loaded'] = loaded

    # cache unk token for later
    self._unk_token_idx = self.tok2ind.get(self.unk_token)

    # initialize tokenizers
    if self.tokenizer == 'nltk':
        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (pip install nltk)')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format(opt['dict_language'])
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path)
        self.word_tok = nltk.tokenize.treebank.TreebankWordTokenizer()
    elif self.tokenizer in ['bpe', 'gpt2', 'bytelevelbpe', 'slow_bytelevel_bpe']:
        self.bpe = bpe_factory(opt, shared)
        self.bpe.sync_with_dict(self)

    if not shared:
        if self.null_token:
            # fix count for null token to one billion and three
            self.freq[self.null_token] = 1000000003
        if self.start_token:
            # fix count for start of sentence token to one billion and two
            self.freq[self.start_token] = 1000000002
        if self.end_token:
            # fix count for end of sentence token to one billion and one
            self.freq[self.end_token] = 1000000001
        if self.unk_token:
            # fix count for unknown token to one billion
            self.freq[self.unk_token] = 1000000000

        if opt.get('dict_file'):
            self.save_path = opt['dict_file']
def compare_init_model_opts(opt, curr_opt):
    """Print loud warning when `init_model` opts differ from previous configuration."""
    if opt.get('init_model') is None:
        return
    opt['init_model'] = modelzoo_path(opt['datapath'], opt['init_model'])
    optfile = opt['init_model'] + '.opt'
    if not os.path.isfile(optfile):
        return
    init_model_opt = load_opt_file(optfile)

    extra_opts = {}
    different_opts = {}
    exempt_opts = [
        'model_file',
        'dict_file',
        'override',
        'starttime',
        'init_model',
        'batchindex',
    ]

    # search through init model opts
    for k, v in init_model_opt.items():
        if (
            k not in exempt_opts
            and k in init_model_opt
            and init_model_opt[k] != curr_opt.get(k)
        ):
            if isinstance(v, list):
                if init_model_opt[k] != list(curr_opt[k]):
                    different_opts[k] = ','.join([str(x) for x in v])
            else:
                different_opts[k] = v

    # search through opts to load
    for k, v in curr_opt.items():
        if k not in exempt_opts and k not in init_model_opt:
            if isinstance(v, list):
                extra_opts[k] = ','.join([str(x) for x in v])
            else:
                extra_opts[k] = v

    # print warnings
    extra_strs = ['{}: {}'.format(k, v) for k, v in extra_opts.items()]
    if extra_strs:
        print('\n' + '*' * 75)
        print(
            '[ WARNING ] : your model is being loaded with opts that do not '
            'exist in the model you are initializing the weights with: '
            '{}'.format(','.join(extra_strs))
        )
    different_strs = [
        '--{} {}'.format(k, v).replace('_', '-') for k, v in different_opts.items()
    ]
    if different_strs:
        print('\n' + '*' * 75)
        print(
            '[ WARNING ] : your model is being loaded with opts that differ '
            'from the model you are initializing the weights with. Add the '
            'following args to your run command to change this: \n'
            '\n{}'.format(' '.join(different_strs))
        )
        print('*' * 75)
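# --- Hypothetical illustration of the comparison logic above on toy opts:
# one key differs between the init model and the current run, and one key is
# new in the current run.
init_model_opt = {'embeddingsize': 300, 'optimizer': 'adam'}
curr_opt = {'embeddingsize': 512, 'optimizer': 'adam', 'new_flag': True}
different = {k: v for k, v in init_model_opt.items() if curr_opt.get(k) != v}
extra = {k: v for k, v in curr_opt.items() if k not in init_model_opt}
assert different == {'embeddingsize': 300}  # would warn: --embeddingsize 300
assert extra == {'new_flag': True}          # would warn: new_flag: True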
def __init__(self, opt, shared=None):
    # initialize fields
    self.opt = copy.deepcopy(opt)
    self.minfreq = opt.get('dict_minfreq', DictionaryAgent.default_minfreq)
    self.null_token = opt.get('dict_nulltoken', DictionaryAgent.default_null)
    self.end_token = opt.get('dict_endtoken', DictionaryAgent.default_end)
    self.unk_token = opt.get('dict_unktoken', DictionaryAgent.default_unk)
    self.start_token = opt.get('dict_starttoken', DictionaryAgent.default_start)
    self.max_ngram_size = opt.get(
        'dict_max_ngram_size', DictionaryAgent.default_maxngram
    )
    self.tokenizer = opt.get('dict_tokenizer', DictionaryAgent.default_tok)
    self.lower = opt.get('dict_lower', DictionaryAgent.default_lower)
    self.maxtokens = opt.get('dict_maxtokens', DictionaryAgent.default_maxtokens)
    self.textfields = opt.get(
        'dict_textfields', DictionaryAgent.default_textfields
    ).split(",")

    try:
        self.tokenizer_fun = getattr(self, self.tokenizer + '_tokenize')
    except AttributeError:
        raise AttributeError(
            'tokenizer type {} not yet supported'.format(self.tokenizer)
        )

    if shared:
        self.freq = shared.get('freq', {})
        self.tok2ind = shared.get('tok2ind', {})
        self.ind2tok = shared.get('ind2tok', {})
    else:
        self.freq = defaultdict(int)
        self.tok2ind = {}
        self.ind2tok = {}

        if self.null_token:
            self.add_token(self.null_token)
        if self.start_token:
            # set special start of sentence word token
            self.add_token(self.start_token)
        if self.end_token:
            # set special end of sentence word token
            self.add_token(self.end_token)
        if self.unk_token:
            # set special unknown word token
            self.add_token(self.unk_token)

        if opt.get('dict_file') and os.path.isfile(opt['dict_file']):
            # load pre-existing dictionary
            self.load(opt['dict_file'])
        elif opt.get('dict_initpath'):
            # load seed dictionary
            opt['dict_initpath'] = modelzoo_path(
                opt.get('datapath'), opt['dict_initpath']
            )
            self.load(opt['dict_initpath'])

    # initialize tokenizers
    if self.tokenizer == 'nltk':
        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (pip install nltk)')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format(opt['dict_language'])
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path)
        self.word_tok = nltk.tokenize.treebank.TreebankWordTokenizer()
    elif self.tokenizer == 'spacy':
        try:
            import spacy
        except ImportError:
            raise ImportError(
                'Please install spacy and spacy "en" model: '
                '`pip install -U spacy && '
                'python -m spacy download en` '
                'or find alternative installation options '
                'at spacy.io'
            )
        self.NLP = spacy.load('en')
    elif self.tokenizer == 'bpe':
        if not opt.get('dict_file'):
            raise RuntimeError('--dict-file is mandatory.')
        self.bpehelper = _BPEHelper(opt.get('dict_file') + '.codecs')

    if not shared:
        if self.null_token:
            # fix count for null token to one billion and three
            self.freq[self.null_token] = 1000000003
        if self.start_token:
            # fix count for start of sentence token to one billion and two
            self.freq[self.start_token] = 1000000002
        if self.end_token:
            # fix count for end of sentence token to one billion and one
            self.freq[self.end_token] = 1000000001
        if self.unk_token:
            # fix count for unknown token to one billion
            self.freq[self.unk_token] = 1000000000

        if opt.get('dict_file'):
            self.save_path = opt['dict_file']
def parse_args(self, args=None, namespace=None, print_args=True): """ Parses the provided arguments and returns a dictionary of the ``args``. We specifically remove items with ``None`` as values in order to support the style ``opt.get(key, default)``, which would otherwise return ``None``. """ self.add_extra_args(args) self.args = super().parse_args(args=args) self.opt = vars(self.args) # custom post-parsing self.opt['parlai_home'] = self.parlai_home # set environment variables # Priority for setting the datapath (same applies for download_path): # --datapath -> os.environ['PARLAI_DATAPATH'] -> <self.parlai_home>/data if self.opt.get('download_path'): os.environ['PARLAI_DOWNPATH'] = self.opt['download_path'] elif os.environ.get('PARLAI_DOWNPATH') is None: os.environ['PARLAI_DOWNPATH'] = os.path.join(self.parlai_home, 'downloads') if self.opt.get('datapath'): os.environ['PARLAI_DATAPATH'] = self.opt['datapath'] elif os.environ.get('PARLAI_DATAPATH') is None: os.environ['PARLAI_DATAPATH'] = os.path.join(self.parlai_home, 'data') self.opt['download_path'] = os.environ['PARLAI_DOWNPATH'] self.opt['datapath'] = os.environ['PARLAI_DATAPATH'] # set all arguments specified in commandline as overridable option_strings_dict = {} store_true = [] store_false = [] for group in self._action_groups: for a in group._group_actions: if hasattr(a, 'option_strings'): for option in a.option_strings: option_strings_dict[option] = a.dest if '_StoreTrueAction' in str(type(a)): store_true.append(option) elif '_StoreFalseAction' in str(type(a)): store_false.append(option) for i in range(len(self.cli_args)): if self.cli_args[i] in option_strings_dict: if self.cli_args[i] in store_true: self.overridable[option_strings_dict[self.cli_args[i]]] = \ True elif self.cli_args[i] in store_false: self.overridable[option_strings_dict[self.cli_args[i]]] = \ False elif i < len(self.cli_args) - 1 and self.cli_args[i + 1][:1] != '-': key = option_strings_dict[self.cli_args[i]] self.overridable[key] = self.opt[key] self.opt['override'] = self.overridable # map filenames that start with 'models:' to point to the model zoo dir if self.opt.get('model_file') is not None: self.opt['model_file'] = modelzoo_path(self.opt.get('datapath'), self.opt['model_file']) if self.opt['override'].get('model_file') is not None: # also check override self.opt['override']['model_file'] = modelzoo_path( self.opt.get('datapath'), self.opt['override']['model_file']) if self.opt.get('dict_file') is not None: self.opt['dict_file'] = modelzoo_path(self.opt.get('datapath'), self.opt['dict_file']) if self.opt['override'].get('dict_file') is not None: # also check override self.opt['override']['dict_file'] = modelzoo_path( self.opt.get('datapath'), self.opt['override']['dict_file']) # add start time of an experiment self.opt['starttime'] = datetime.datetime.today().strftime('%b%d_%H-%M') if print_args: self.print_args() print("\n".join([ "", "*" * 80, "Thank you for using ParlAI! We are conducting a user survey.", "Please consider filling it out at https://forms.gle/uEFbYGP7w6hiuGQT9", "*" * 80, "" ])) return self.opt