def __init__(self, opt, datatype: str = 'train', seed: Optional[int] = None):
    """
    Initialize the context generator.

    opt: only a 'datapath' key is required, to specify the ParlAI data folder
    """
    if seed is not None:
        self.rng = random.Random(seed)
    else:
        self.rng = random.Random()
    convai2_opt = Opt({'datapath': opt['datapath'], 'datatype': datatype})
    self.convai2_teacher = BothTeacher(convai2_opt)
    ed_opt = Opt(
        {
            'datapath': opt['datapath'],
            'datatype': datatype,
            'train_experiencer_only': True,
        }
    )
    # Specify train_experiencer_only = True because we want to ensure that the text
    # will correspond to a Speaker utterance and the label to a Listener response
    self.ed_teacher = EmpatheticDialoguesTeacher(ed_opt)
    wow_opt = Opt({'datapath': opt['datapath'], 'datatype': datatype})
    self.wow_teacher = WizardDialogKnowledgeTeacher(wow_opt)
    self.topic_to_persona_path = _topic_to_persona_path(opt)
    self.wow_topics_to_episode_idxes = self._setup_topics_to_episodes()
    self.persona_strings_to_wow_topics = self._setup_personas_to_topics()
def cl_build_ref_agent(self):
    ref_model_file = self.opt['ref_model_file']
    if ref_model_file is None or ref_model_file.lower() == "none":
        raise RuntimeError("CL training requires a reference model!")
    else:
        from parlai.core.agents import create_agent_from_opt_file

        ref_agent = create_agent_from_opt_file(Opt({'model_file': ref_model_file}))
        eval_ref_agent = create_agent_from_opt_file(Opt({'model_file': ref_model_file}))
        if ref_agent is None:
            raise RuntimeError(
                "Building the reference model failed! Check your `ref_model_file`: {}".format(
                    ref_model_file
                )
            )
        if self.id == ref_agent.id and dict_same(self, ref_agent):
            self.use_external_ref_model = False
        else:
            self.use_external_ref_model = True
        # No need to check the dicts here:
        # if self.dict.tok2ind != ref_agent.dict.tok2ind or self.dict.ind2tok != ref_agent.dict.ind2tok:
        #     raise RuntimeError("Reference model is using a different dict!")
        self.eval_ref_agent = eval_ref_agent
        self.ref_agent = ref_agent
def test_beamsearch_contextblocking(self):
    """
    Test beamsearch context blocking.
    """
    agent = create_agent_from_model_file('zoo:unittest/context_blocking/model')
    agent.observe({'text': '5 4 3 2', 'episode_done': True})
    assert agent.act()['text'] == '5 4 3 2'

    agent = create_agent_from_model_file(
        'zoo:unittest/context_blocking/model', Opt(beam_context_block_ngram=1)
    )
    agent.observe({'text': '5 4 3 2', 'episode_done': True})
    text = agent.act()['text']
    assert '5' not in text
    assert '4' not in text
    assert '3' not in text
    assert '2' not in text

    agent = create_agent_from_model_file(
        'zoo:unittest/context_blocking/model', Opt(beam_context_block_ngram=2)
    )
    agent.observe({'text': '5 4 3 2', 'episode_done': True})
    text = agent.act()['text']
    assert '5' in text
    assert '5 4' not in text
    assert '4 3' not in text
    assert '3 2' not in text
def get_tl(tmpdir):
    final_opt = Opt(
        {
            'task': 'integration_tests',
            'datatype': 'valid',
            'validation_max_exs': 30,
            'short_final_eval': True,
        }
    )
    final_opt.save(os.path.join(tmpdir, "final_opt.opt"))

    opt = Opt(
        {
            'task': 'integration_tests',
            'validation_max_exs': 10,
            'model': 'repeat_label',
            'model_file': os.path.join(tmpdir, 'model'),
            'short_final_eval': True,
            'num_epochs': 1.0,
            'final_extra_opt': str(os.path.join(tmpdir, "final_opt.opt")),
        }
    )
    parser = tms.setup_args()
    parser.set_params(**opt)
    popt = parser.parse_args([])
    for k, v in opt.items():
        popt[k] = v
    return tms.TrainLoop(popt)
def test_counts(self):
    with testing_utils.tempdir() as tmpdir:
        data_path = tmpdir

        # Check EmpatheticDialoguesTeacher, with multiple examples per episode
        opts_episodes_and_examples = [
            (
                {'datatype': 'train'},
                EPISODE_COUNTS['train_both_sides'],
                EXAMPLE_COUNTS['train_both_sides'],
            ),  # Test the default mode
            (
                {'datatype': 'train', 'train_experiencer_only': True},
                EPISODE_COUNTS['train_experiencer_only'],
                EXAMPLE_COUNTS['train_experiencer_only'],
            ),
            (
                {'datatype': 'train', 'train_experiencer_only': False},
                EPISODE_COUNTS['train_both_sides'],
                EXAMPLE_COUNTS['train_both_sides'],
            ),
            (
                {'datatype': 'valid'},
                EPISODE_COUNTS['valid'],
                EXAMPLE_COUNTS['valid'],
            ),
            ({'datatype': 'test'}, EPISODE_COUNTS['test'], EXAMPLE_COUNTS['test']),
        ]
        for teacher_class in [EmpatheticDialoguesTeacher]:
            for opt, num_episodes, num_examples in opts_episodes_and_examples:
                full_opt = Opt({**opt, 'datapath': data_path})
                teacher = teacher_class(full_opt)
                self.assertEqual(teacher.num_episodes(), num_episodes)
                self.assertEqual(teacher.num_examples(), num_examples)

        # Check EmotionClassificationSituationTeacher, with one example per episode
        train_episode_count = EPISODE_COUNTS['train_experiencer_only']
        # For the situation classifier, we only want to have one episode per train
        # conversation
        opts_episodes = [
            ({'datatype': 'train'}, train_episode_count),  # Test the default mode
            (
                {'datatype': 'train', 'train_experiencer_only': True},
                train_episode_count,
            ),
            (
                {'datatype': 'train', 'train_experiencer_only': False},
                train_episode_count,
            ),
            ({'datatype': 'valid'}, EPISODE_COUNTS['valid']),
            ({'datatype': 'test'}, EPISODE_COUNTS['test']),
        ]
        for teacher_class in [EmotionClassificationSituationTeacher]:
            for opt, num_episodes in opts_episodes:
                full_opt = Opt({**opt, 'datapath': data_path})
                teacher = teacher_class(full_opt)
                self.assertEqual(teacher.num_episodes(), num_episodes)
                self.assertEqual(teacher.num_examples(), num_episodes)
def _test_iterate(self, teacher_class):
    for dt in [
        'train:ordered',
        'train:stream:ordered',
        'valid',
        'test',
        'valid:stream',
        'test:stream',
    ]:
        opt = Opt({'datatype': dt, 'datapath': '/tmp', 'task': 'test'})
        teacher = teacher_class(opt)

        self._verify_act(teacher.act(), 1, 2, False)
        self._verify_act(teacher.act(), 2, 4, False)
        self._verify_act(teacher.act(), 3, 6, True)

        self._verify_act(teacher.act(), 1, 2, False)
        self._verify_act(teacher.act(), 2, 4, False)
        self._verify_act(teacher.act(), 3, 6, True)

        self._verify_act(teacher.act(), 1, 2, False)
        self._verify_act(teacher.act(), 2, 4, False)
        self._verify_act(teacher.act(), 3, 6, True)

        assert teacher.epoch_done()
def test_save_load(self):
    o = Opt({'a': 3, 'b': 'foo'})
    with testing_utils.tempdir() as tmpdir:
        fn = os.path.join(tmpdir, "opt")
        o.save(fn)
        o2 = Opt.load(fn)
        assert o == o2
def test_init_from_from_checkpoint(self):
    with testing_utils.tempdir() as temp_dir:
        opt_from_file = {
            'datapath': 'dummy_path',
            'model': 'repeat_label',
            'init_model': os.path.join(temp_dir, 'something'),
            'model_file': os.path.join(temp_dir, 'something_else'),
        }
        opt = Opt(
            {
                'datapath': 'dummy_path',
                'model': 'repeat_label',
                'init_model': os.path.join(temp_dir, 'something_else.checkpoint'),
                'model_file': os.path.join(temp_dir, 'something_else'),
                'load_from_checkpoint': True,
            }
        )

        with open(os.path.join(temp_dir, 'something_else.opt'), 'w') as f:
            f.write(json.dumps(opt_from_file))

        agent = create_agent_from_opt_file(opt)
        init_model = agent.opt['init_model']
        # assert that the model was loaded with the correct checkpoint
        assert '.checkpoint' in init_model
def test_allow_missing_init_opts(self):
    """
    Test --allow-missing-init-opts.
    """
    with testing_utils.tempdir() as temp_dir:

        init_opt_path = os.path.join(temp_dir, 'init_opt.opt')

        # Save a test opt file with an argument that doesn't exist
        init_opt = Opt({'made_up_arg': 'foo'})
        init_opt.save(init_opt_path)

        # Assert that the opt file normally can't be loaded in
        with self.assertRaises(RuntimeError):
            _ = ParlaiParser(True, True).parse_kwargs(init_opt=init_opt_path)

        # Assert that the opt file *can* be loaded in if we set
        # --allow-missing-init-opts, and assert that the made-up arg does not exist
        # in the opt
        opt = ParlaiParser(True, True).parse_kwargs(
            init_opt=init_opt_path, allow_missing_init_opts=True
        )
        self.assertFalse(hasattr(opt, 'made_up_arg'))
def add_extra_args(self, args=None):
    super().add_extra_args(args)
    parsed = vars(self.parse_known_args(args, nohelp=True)[0])
    # Also load extra args options if a file is given.
    if parsed.get("init_opt") is not None:
        try:
            self._load_known_opts(parsed.get("init_opt"), parsed)
        except FileNotFoundError:
            # don't die if -o isn't found here. See comment in second call
            # later on.
            pass
    parsed = self._infer_datapath(parsed)
    partial = Opt(parsed)

    for model in [
        "system_model",
        "user_model",
        "api_schema_grounding_model",
        "goal_grounding_model",
        "api_resp_model",
    ]:
        if model in partial and partial[model] is not None and len(partial[model]) > 0:
            self.add_model_subargs(partial[model], partial)

    for model_file_prefix in ["system", "user"]:
        key = model_file_prefix + "_model_file"
        if key in partial and partial[key] and len(partial[key]) > 0:
            model_name = self._get_model_name_from_model_file(key, partial)
            self.add_model_subargs(model_name, partial)
def test_gpt2_bpe_tokenize(self):
    with testing_utils.capture_output():
        opt = Opt({'dict_tokenizer': 'gpt2', 'datapath': './data'})
        agent = DictionaryAgent(opt)
    self.assertEqual(
        # grinning face emoji
        agent.gpt2_tokenize(u'Hello, ParlAI! \U0001f600'),
        [
            'Hello',
            ',',
            r'\xc4\xa0Par',
            'l',
            'AI',
            '!',
            r'\xc4\xa0\xc3\xb0\xc5\x81\xc4\xba',
            r'\xc4\xa2',
        ],
    )
    self.assertEqual(
        agent.vec2txt(
            agent.tok2ind[w]
            for w in [
                'Hello',
                ',',
                r'\xc4\xa0Par',
                'l',
                'AI',
                '!',
                r'\xc4\xa0\xc3\xb0\xc5\x81\xc4\xba',
                r'\xc4\xa2',
            ]
        ),
        # grinning face emoji
        u'Hello, ParlAI! \U0001f600',
    )
def run(self):
    """
    1) Load model.
    2) Generate embeddings.
    3) Save embeddings.
    """
    self.use_cuda = not self.opt.get('no_cuda') and torch.cuda.is_available()
    overrides = {'interactive_mode': True, 'interactive_candidates': 'inline'}
    if self.opt['dpr_model']:
        overrides.update(
            {
                'model': 'dpr_agent',
                'model_file': self.opt['model_file'],
                'override': {
                    'model': 'dpr_agent',
                    'interactive_candidates': 'inline',
                },
            }
        )
        agent = create_agent(Opt(overrides))
    else:
        agent = create_agent_from_model_file(self.opt['model_file'], overrides)
    model = agent.model.module if hasattr(agent.model, 'module') else agent.model
    assert hasattr(model, 'encoder_cand') or hasattr(model, 'cand_encoder')
    assert isinstance(agent, TorchRankerAgent)
    passages = self.load_passages()
    data = self.encode_passages(agent, passages)
    self.save_data(data)
def _build_model(self, opt: Opt) -> Tuple[PolyEncoderModule, DictionaryAgent]:
    """
    Build poly-encoder module.

    :param opt:
        options from base RAG Model

    :return (model, dict):
        the dropout poly-encoder module and its dictionary agent.
    """
    model_file = modelzoo_path(opt['datapath'], opt['poly_faiss_model_file'])
    model_opt = Opt.load(f'{model_file}.opt')

    create_model_opt = {
        **{k: model_opt[k] for k in TRANSFORMER_RANKER_BASE_OPT},
        **{k: model_opt[k] for k in POLYENCODER_OPT_KEYS},
        'model': 'transformer/dropout_poly',
        'init_model': model_file,
        'dict_file': f'{model_file}.dict',
        # necessary opt args
        'multitask_weights': [1],
        # dropout_poly args
        'poly_dropout_reduction_type': model_opt['poly_dropout_reduction_type'],
        'poly_dropout_use_codes': model_opt.get('poly_dropout_use_codes', True),
    }
    logging.disable()
    agent = create_agent(Opt(create_model_opt))
    logging.enable()
    assert isinstance(agent, DropoutPolyAgent)
    return agent.model, agent.dict
def add_extra_args(self, args=None):
    """
    Add more args depending on how known args are set.
    """
    parsed = vars(self.parse_known_args(args, nohelp=True)[0])
    # Also load extra args options if a file is given.
    if parsed.get('init_opt') is not None:
        try:
            self._load_known_opts(parsed.get('init_opt'), parsed)
        except FileNotFoundError:
            # don't die if -o isn't found here. See comment in second call
            # later on.
            pass
    parsed = self._infer_datapath(parsed)
    partial = Opt(parsed)

    # find which image mode specified if any, and add additional arguments
    image_mode = parsed.get('image_mode', None)
    if image_mode is not None and image_mode != 'no_image_model':
        self.add_image_args(image_mode)

    # find which task specified if any, and add its specific arguments
    task = parsed.get('task', None)
    if task is not None:
        self.add_task_args(task, partial)
    evaltask = parsed.get('evaltask', None)
    if evaltask is not None:
        self.add_task_args(evaltask, partial)

    # find which model specified if any, and add its specific arguments
    model = get_model_name(parsed)
    if model is not None:
        self.add_model_subargs(model, partial)

    # add world args, if we know a priori which world is being used
    if task is not None:
        self.add_world_args(
            task,
            parsed.get('interactive_task', False),
            parsed.get('selfchat_task', False),
            partial,
        )

    # reparse args now that we've inferred some things. specifically helps
    # with a misparse of `-opt` as `-o pt`, which causes opt loading to
    # try to load the file "pt" which doesn't exist.
    # After adding model arguments, -opt becomes known (it's in TorchAgent),
    # and we parse the `-opt` value correctly.
    parsed = vars(self.parse_known_args(args, nohelp=True)[0])
    if parsed.get('init_opt') is not None:
        self._load_known_opts(parsed.get('init_opt'), parsed)

    # reset parser-level defaults over any model-level defaults
    try:
        self.set_defaults(**self._defaults)
    except AttributeError:
        raise RuntimeError(
            'Please file an issue on github that argparse '
            'got an attribute error when parsing.'
        )
def _process_args_to_opts(self, args_that_override: Optional[List[str]] = None):
    self.opt = Opt(vars(self.args))

    # custom post-parsing
    self.opt['parlai_home'] = self.parlai_home
    self.opt = self._infer_datapath(self.opt)

    # set all arguments specified in command line as overridable
    option_strings_dict = {}
    store_true = []
    store_false = []
    for group in self._action_groups:
        for a in group._group_actions:
            if hasattr(a, 'option_strings'):
                for option in a.option_strings:
                    option_strings_dict[option] = a.dest
                    if '_StoreTrueAction' in str(type(a)):
                        store_true.append(option)
                    elif '_StoreFalseAction' in str(type(a)):
                        store_false.append(option)

    if args_that_override is None:
        args_that_override = _sys.argv[1:]

    for i in range(len(args_that_override)):
        if args_that_override[i] in option_strings_dict:
            if args_that_override[i] in store_true:
                self.overridable[option_strings_dict[args_that_override[i]]] = True
            elif args_that_override[i] in store_false:
                self.overridable[option_strings_dict[args_that_override[i]]] = False
            elif (
                i < len(args_that_override) - 1
                and args_that_override[i + 1] not in option_strings_dict
            ):
                key = option_strings_dict[args_that_override[i]]
                self.overridable[key] = self.opt[key]
    self.opt['override'] = self.overridable

    # load opts if a file is provided.
    if self.opt.get('init_opt', None) is not None:
        self._load_opts(self.opt)

    # map filenames that start with 'zoo:' to point to the model zoo dir
    options_to_change = {'model_file', 'dict_file', 'bpe_vocab', 'bpe_merge'}
    for each_key in options_to_change:
        if self.opt.get(each_key) is not None:
            self.opt[each_key] = modelzoo_path(
                self.opt.get('datapath'), self.opt[each_key]
            )
        if self.opt['override'].get(each_key) is not None:
            # also check override
            self.opt['override'][each_key] = modelzoo_path(
                self.opt.get('datapath'), self.opt['override'][each_key]
            )

    # add start time of an experiment
    self.opt['starttime'] = datetime.datetime.today().strftime('%b%d_%H-%M')
def test_save_withignore(self):
    o = Opt({'a': 3, 'b': 'foo', 'override': {'a': 3}})
    with testing_utils.tempdir() as tmpdir:
        fn = os.path.join(tmpdir, "opt")
        o.save(fn)
        o2 = Opt.load(fn)
        assert o != o2
        assert 'override' not in o2
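# Illustrative sketch (not from the source) of the round-trip behavior that
# test_save_load and test_save_withignore above exercise: Opt.save() writes the
# options to disk, Opt.load() reads them back, and the 'override' key is dropped
# on save. The file path and option keys here are hypothetical.
from parlai.core.opt import Opt

opt = Opt({'task': 'integration_tests', 'batchsize': 4, 'override': {'batchsize': 4}})
opt.save('/tmp/example.opt')
restored = Opt.load('/tmp/example.opt')
assert restored['task'] == 'integration_tests'
assert 'override' not in restored  # stripped during save, as test_save_withignore checks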
def train(epochs=5):
    opt = Opt({'num_epochs': epochs, 'datapath': datapath, 'datatype': datatype})

    # set up timers
    train_time = Timer()
    validate_time = Timer()
    log_time = Timer()
    save_time = Timer()

    parleys = 0
def test_iter(self):
    opt = Opt({'datatype': 'valid', 'datapath': '/tmp', 'task': 'test'})
    teacher = TupleTeacher(opt)
    # twice to ensure we reset iterators correctly
    examples = list(teacher)
    assert len(examples) == 9
    examples = list(teacher)
    assert len(examples) == 9
def test_no_truncate(self):
    with self.assertRaises(ValueError):
        testing_utils.train_model(Opt({**_DEFAULT_OPTIONS, **{'truncate': -1}}))
def get_dictionary(PATH: str) -> DictionaryAgent:
    """
    Load a dictionary.

    :param PATH: path to the saved dictionary file
    :return: the loaded dictionary
    """
    opt = Opt()
    dictionary = DictionaryAgent(opt=opt)
    dictionary.load(PATH)
    return dictionary
def __init__(
    self,
    opt: Opt,
    dpr_model: str = 'bert',
    pretrained_path: str = DPR_ZOO_MODEL,
    encoder_type: str = 'query',
):
    # Override options
    try:
        config: BertConfig = BertConfig.from_pretrained('bert-base-uncased')
    except OSError:
        config_path = PathManager.get_local_path(
            os.path.join(opt['datapath'], "bert_base_uncased", self.CONFIG_PATH)
        )
        config: BertConfig = BertConfig.from_pretrained(config_path)

    pretrained_path = modelzoo_path(opt['datapath'], pretrained_path)  # type: ignore
    if not os.path.exists(pretrained_path):
        # when initializing from parlai rag models, the pretrained path
        # may no longer exist. This is fine if we've already trained
        # the model.
        assert dpr_model == 'bert_from_parlai_rag'
        logging.error(f'Pretrained Path does not exist: {pretrained_path}')
        pretrained_path = modelzoo_path(opt['datapath'], DPR_ZOO_MODEL)  # type: ignore
        dpr_model = 'bert'
        logging.error(f'Setting to zoo model: {pretrained_path}')

    enc_opt = {
        "n_heads": config.num_attention_heads,
        "n_layers": config.num_hidden_layers,
        "embedding_size": config.hidden_size,
        "ffn_size": config.intermediate_size,
        "dropout": config.hidden_dropout_prob,
        "attention_dropout": config.attention_probs_dropout_prob,
        "activation": config.hidden_act,
        "variant": 'xlm',
        "reduction_type": 'first',
        "n_positions": config.max_position_embeddings,
        "n_segments": config.type_vocab_size,
    }
    embedding = torch.nn.Embedding(
        config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
    )
    super().__init__(
        Opt(enc_opt),
        vocabulary_size=config.vocab_size,
        padding_idx=config.pad_token_id,
        embedding=embedding,
        reduction_type='first',
    )
    self._load_state(opt['datapath'], dpr_model, pretrained_path, encoder_type)
def test_safe_personas(self):
    base_kwargs = Opt({'datatype': 'train', 'task': 'blended_skill_talk'})
    safe_personas_only_to_count = {False: 4819, True: 3890}
    for safe_personas_only, count in safe_personas_only_to_count.items():
        full_kwargs = {**base_kwargs, 'safe_personas_only': safe_personas_only}
        parser = setup_args()
        parser.set_defaults(**full_kwargs)
        opt = parser.parse_args([])
        personas = _load_personas(opt)
        self.assertEqual(len(personas), count)
def _opt(self, **kwargs):
    return Opt(
        batchsize=4,
        optimizer='adam',
        n_layers=1,
        n_heads=4,
        ffn_size=16,
        embedding_size=16,
        skip_generation=True,
        **kwargs,
    )
def test_beamsearch_blocking(self):
    """
    Test beamsearch blocking.
    """
    with testing_utils.tempdir() as tmpdir:
        agent = create_agent_from_model_file('zoo:unittest/beam_blocking/model')
        agent.observe({'text': '5 5 5 5 5 5 5', 'episode_done': True})
        assert agent.act()['text'] == '5 5 5 5 5 5 5'

        agent = create_agent_from_model_file(
            'zoo:unittest/beam_blocking/model', Opt(beam_block_ngram=1)
        )
        agent.observe({'text': '5 5 5 5 5 5 5', 'episode_done': True})
        assert '5 5' not in agent.act()['text']

        agent = create_agent_from_model_file(
            'zoo:unittest/beam_blocking/model', Opt(beam_block_ngram=2)
        )
        agent.observe({'text': '5 5 5 5 5 5 5', 'episode_done': True})
        assert '5 5 5' not in agent.act()['text']

        with open(os.path.join(tmpdir, 'blocklist.txt'), 'w') as f:
            f.write("38\n62\n34 34\n")

        agent = create_agent_from_model_file(
            'zoo:unittest/beam_blocking/model',
            Opt(beam_block_list_filename=os.path.join(tmpdir, 'blocklist.txt')),
        )
        agent.observe({'text': '4 4 4', 'episode_done': True})
        assert agent.act()['text'] == '4 4 4'
        agent.observe({'text': '38 38 38', 'episode_done': True})
        assert '38' not in agent.act()['text']
        agent.observe({'text': '62 62 62', 'episode_done': True})
        assert '62' not in agent.act()['text']
        agent.observe({'text': '34 34 34', 'episode_done': True})
        text = agent.act()['text']
        assert '34' in text
        assert '34 34' not in text
def test_multitask(self):
    """
    Test that the model correctly handles multiple inputs.

    Random chance is 10%, so this should be able to get much better than that very
    quickly.
    """
    args = Opt({**self.base_args, **self.multitask_args})
    valid, test = testing_utils.train_model(args)
    assert (
        valid['accuracy'] > 0.2
    ), f'ImagePolyencoderAgent val-set accuracy on a simple task was {valid["accuracy"].value():0.2f}.'
def test_nodatafile(self):
    for dt in [
        'train:ordered',
        'train:stream:ordered',
        'valid',
        'test',
        'valid:stream',
        'test:stream',
    ]:
        opt = Opt({'datatype': dt, 'datapath': '/tmp', 'task': 'test'})
        with self.assertRaises(KeyError):
            NoDatafileTeacher(opt)
def test_opt(self):
    opt = {'x': 0}
    opt = Opt(opt)
    opt['x'] += 1
    opt['x'] = 10
    history = opt.history['x']
    self.assertEqual(history[0][1], 1, 'History not set properly')
    self.assertEqual(history[1][1], 10, 'History not set properly')

    opt_copy = deepcopy(opt)
    history = opt_copy.history['x']
    self.assertEqual(history[0][1], 1, 'Deepcopy history not set properly')
    self.assertEqual(history[1][1], 10, 'Deepcopy history not set properly')
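# Illustrative sketch (not from the source): per the test above, Opt records each
# assignment to a key in opt.history, with the assigned value stored at index 1 of
# each history entry, and the history survives deepcopy. The key name 'lr' is
# hypothetical.
from copy import deepcopy
from parlai.core.opt import Opt

opt = Opt({'lr': 0.1})
opt['lr'] = 0.01
assert opt.history['lr'][-1][1] == 0.01
assert deepcopy(opt).history['lr'][-1][1] == 0.01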
def test_gpt2_bpe_tokenize(self):
    opt = Opt({'dict_tokenizer': 'gpt2', 'datapath': './data'})
    agent = DictionaryAgent(opt)
    self.assertEqual(
        # grinning face emoji
        agent.gpt2_tokenize(u'Hello, ParlAI! \U0001f600'),
        GPT2_BPE_RESULT,
    )
    self.assertEqual(
        agent.vec2txt(agent.tok2ind[w] for w in GPT2_BPE_RESULT),
        # grinning face emoji
        u'Hello, ParlAI! \U0001f600',
    )
def test_asymmetry(self):
    opt = Opt({'model': 'transformer/generator', 'n_layers': 1})
    agent = create_agent(opt)
    self.assertEqual(agent.model.encoder.n_layers, 1)
    self.assertEqual(agent.model.decoder.n_layers, 1)

    opt = Opt({'model': 'transformer/generator', 'n_layers': 1, 'n_encoder_layers': 2})
    agent = create_agent(opt)
    self.assertEqual(agent.model.encoder.n_layers, 2)
    self.assertEqual(agent.model.decoder.n_layers, 1)

    opt = Opt(
        {
            'model': 'transformer/generator',
            'n_layers': 1,
            'n_encoder_layers': 2,
            'n_decoder_layers': 4,
        }
    )
    agent = create_agent(opt)
    self.assertEqual(agent.model.encoder.n_layers, 2)
    self.assertEqual(agent.model.decoder.n_layers, 4)

    opt = Opt({'model': 'transformer/generator', 'n_layers': 1, 'n_decoder_layers': 4})
    agent = create_agent(opt)
    self.assertEqual(agent.model.encoder.n_layers, 1)
    self.assertEqual(agent.model.decoder.n_layers, 4)

    opt = Opt({'model': 'transformer/generator'})
    agent = create_agent(opt)
    self.assertEqual(agent.model.encoder.n_layers, 2)
    self.assertEqual(agent.model.decoder.n_layers, 2)
def test_gpt2_bpe_tokenize(self):
    datapath = ParlaiParser().parse_args([], print_args=False)['datapath']
    opt = Opt({'dict_tokenizer': 'gpt2', 'datapath': datapath})
    agent = DictionaryAgent(opt)
    self.assertEqual(
        # grinning face emoji
        agent.gpt2_tokenize(u'Hello, ParlAI! \U0001f600'),
        GPT2_BPE_RESULT,
    )
    self.assertEqual(
        agent.vec2txt(agent.tok2ind[w] for w in GPT2_BPE_RESULT),
        # grinning face emoji
        u'Hello, ParlAI! \U0001f600',
    )