def test_convert(self):
    from parlai.scripts.convert_data_to_parlai_format import (
        ConvertDataToParlaiFormat,
    )

    with testing_utils.tempdir() as tmpdir:
        fn = os.path.join(tmpdir, 'parlai.txt')
        ConvertDataToParlaiFormat.main(
            task='integration_tests:nocandidate', outfile=fn
        )
        with open(fn) as f:
            assert (
                f.readline() == 'text:4 1 3 2\tlabels:4 1 3 2\tepisode_done:True\n'
            )
            assert f.readline() == '\n'
            assert (
                f.readline() == 'text:3 0 4 1\tlabels:3 0 4 1\tepisode_done:True\n'
            )
            assert f.readline() == '\n'
            assert (
                f.readline() == 'text:5 1 6 3\tlabels:5 1 6 3\tepisode_done:True\n'
            )
            assert f.readline() == '\n'
            assert (
                f.readline() == 'text:4 5 6 2\tlabels:4 5 6 2\tepisode_done:True\n'
            )
            assert f.readline() == '\n'
            assert (
                f.readline() == 'text:0 5 3 1\tlabels:0 5 3 1\tepisode_done:True\n'
            )
            assert f.readline() == '\n'
def test_vacuum(self):
    with testing_utils.tempdir() as tmpdir:
        from parlai.scripts.vacuum import Vacuum

        model_file = os.path.join(tmpdir, 'model')
        valid, test = testing_utils.train_model(
            {
                'task': 'integration_tests',
                'optimizer': 'adam',
                'learningrate': 0.01,
                'model_file': model_file,
                'num_epochs': 0.05,
                'skip_generation': True,
                'batchsize': 8,
                # TODO: switch to test_agents/unigram
                'model': 'transformer/generator',
                'ffn_size': 32,
                'embedding_size': 32,
                'n_layers': 1,
            }
        )
        size_before = os.stat(model_file).st_size
        Vacuum.main(model_file=model_file)
        size_after = os.stat(model_file).st_size
        assert size_after < size_before
        assert os.path.exists(model_file + '.unvacuumed')
        valid2, test2 = testing_utils.eval_model(
            {'task': 'integration_tests', 'model_file': model_file, 'batchsize': 8}
        )
        for key in ['loss', 'exs', 'ppl', 'token_acc']:
            assert valid2[key] == valid[key], f"{key} score doesn't match"
            assert test2[key] == test[key], f"{key} score doesn't match"
def test_torchscript_agent(self):
    """
    Test exporting a model to TorchScript and then testing it on sample data.
    """
    from parlai.scripts.torchscript import TorchScript

    test_phrase = "Don't have a cow, man!"  # From test_bart.py
    with testing_utils.tempdir() as tmpdir:
        scripted_model_file = os.path.join(tmpdir, 'scripted_model.pt')

        # Export the BART model
        export_opt = TorchScript.setup_args().parse_kwargs(
            model='bart', scripted_model_file=scripted_model_file
        )
        TorchScript(export_opt).run()

        # Test the scripted BART model
        scripted_opt = ParlaiParser(True, True).parse_kwargs(
            model='parlai.torchscript.agents:TorchScriptAgent',
            model_file=scripted_model_file,
        )
        bart = create_agent(scripted_opt)
        bart.observe({'text': test_phrase, 'episode_done': True})
        act = bart.act()
        self.assertEqual(act['text'], test_phrase)
def test_compare_opts(self):
    """
    Compare opts by loading them with Opt.load().

    Will not compare the override field.
    """
    with testing_utils.tempdir() as tmpdir:
        # Write test opts
        opt_dir = tmpdir
        opt_path_1 = os.path.join(opt_dir, '1.opt')
        opt_path_2 = os.path.join(opt_dir, '2.opt')
        with open(opt_path_1, 'w') as f1:
            json.dump(self.compare_opt_1, f1)
        with open(opt_path_2, 'w') as f2:
            json.dump(self.compare_opt_2, f2)

        # Compare opts
        output = compare_opts(opt_path_1=opt_path_1, opt_path_2=opt_path_2)
        desired_output = """
Args only found in opt 1:
key2: a

Args only found in opt 2:
key3: b

Args that are different in both opts:
key1:
\tIn opt 1: 0
\tIn opt 2: 1"""
        self.assertEqual(output, desired_output)
def test_eval_fixed(self):
    args = self._get_args()
    args['eval_candidates'] = 'fixed'
    args['encode_candidate_vecs'] = True
    args['ignore_bad_candidates'] = True
    valid, test = testing_utils.train_model(args)

    # none of the train candidates appear in evaluation, so should have
    # zero accuracy: this tests whether the fixed candidates were built
    # properly (i.e., only using candidates from the train set)
    self.assertEqual(valid['hits@1'], 0)

    # now try again with a fixed candidate file that includes all possible
    # candidates
    teacher = CandidateTeacher({'datatype': 'train'})
    all_cands = teacher.train + teacher.val + teacher.test
    all_cands_str = '\n'.join([' '.join(x) for x in all_cands])

    with testing_utils.tempdir() as tmpdir:
        tmp_cands_file = os.path.join(tmpdir, 'all_cands.text')
        with open(tmp_cands_file, 'w') as f:
            f.write(all_cands_str)
        args['fixed_candidates_path'] = tmp_cands_file
        args['encode_candidate_vecs'] = False  # don't encode before training
        args['ignore_bad_candidates'] = False
        args['num_epochs'] = 4
        valid, test = testing_utils.train_model(args)
        self.assertGreaterEqual(valid['hits@100'], 0.1)
def test_deduped_split_distributions(self):
    with testing_utils.tempdir() as tmpdir:
        data_path = tmpdir

        def _split_type_teacher(
            split_type: str
        ) -> CMUDocumentGroundedConversationsTeacher:
            kwargs = {
                'task': 'cmu_dog',
                'datatype': 'valid',
                'cmu_dog_split_type': split_type,
                'datapath': data_path,
            }
            parser = setup_args()
            parser.set_defaults(**kwargs)
            opt = parser.parse_args([])
            agents = create_task_agent_from_taskname(opt)
            assert isinstance(agents, List)
            task = agents[0]
            assert isinstance(task, CMUDocumentGroundedConversationsTeacher)
            return task

        og_teacher = _split_type_teacher('deduped')
        sn_teacher = _split_type_teacher('seen')
        self.assertEqual(
            len(og_teacher.rare_word_f1.freq_dist),
            len(sn_teacher.rare_word_f1.freq_dist),
        )
def test_save_load(self):
    o = Opt({'a': 3, 'b': 'foo'})
    with testing_utils.tempdir() as tmpdir:
        fn = os.path.join(tmpdir, "opt")
        o.save(fn)
        o2 = Opt.load(fn)
        assert o == o2
def test_both_label(self):
    with testing_utils.tempdir() as tmpdir:
        fp = os.path.join(tmpdir, "goodfile.jsonl")
        with PathManager.open(fp, "w") as f:
            f.write(
                '{"dialog": [[{"text": "Hi.", "id": "speaker1"}, {"text": "Hello.", "id": "speaker2"}]]}\n'
            )
        opt = {
            'task': 'jsonfile',
            'jsonfile_datapath': fp,
            'verbose': True,
            'label_turns': 'both',
        }
        train_out, valid_out, test_out = testing_utils.display_data(opt)
        texts = [
            l.split(':', 1)[-1].strip()
            for l in train_out.split('\n')
            if l in train_out
            if 'text' in l
        ]
        labels = [
            l.split(':', 1)[-1].strip()
            for l in train_out.split('\n')
            if l in train_out
            if 'labels' in l
        ]
        num_episodes = train_out.count("END OF EPISODE")
        self.assertEqual(texts[0], '__SILENCE__')
        self.assertEqual(labels[0], 'Hi.')
        self.assertEqual(texts[1], 'Hi.')
        self.assertEqual(labels[1], 'Hello.')
        self.assertEqual(num_episodes, 2)
def test_allow_missing_init_opts(self):
    """
    Test --allow-missing-init-opts.
    """
    with testing_utils.tempdir() as temp_dir:
        init_opt_path = os.path.join(temp_dir, 'init_opt.opt')

        # Save a test opt file with an argument that doesn't exist
        init_opt = Opt({'made_up_arg': 'foo'})
        init_opt.save(init_opt_path)

        # Assert that the opt file normally can't be loaded in
        with self.assertRaises(RuntimeError):
            _ = ParlaiParser(True, True).parse_kwargs(init_opt=init_opt_path)

        # Assert that the opt file *can* be loaded in if we set
        # --allow-missing-init-opts, and assert that the made-up arg does not exist
        # in the opt
        opt = ParlaiParser(True, True).parse_kwargs(
            init_opt=init_opt_path, allow_missing_init_opts=True
        )
        self.assertFalse(hasattr(opt, 'made_up_arg'))
def test_counts(self):
    with testing_utils.tempdir() as tmpdir:
        data_path = tmpdir

        opts_episodes_and_examples = [
            ({'datatype': 'train'}, 4819, 27018),
            ({'datatype': 'valid'}, 1009, 5651),
            ({'datatype': 'test'}, 980, 5482),
        ]
        for kwargs, num_episodes, num_examples in opts_episodes_and_examples:
            all_kwargs = {
                **kwargs,
                'task': 'blended_skill_talk',
                'datapath': data_path,
            }
            parser = setup_args()
            parser.set_defaults(**all_kwargs)
            opt = parser.parse_args([])
            agent = RepeatLabelAgent(opt)
            teacher = create_task(opt, agent).get_task_agent()
            self.assertEqual(teacher.num_episodes(), num_episodes)
            self.assertEqual(teacher.num_examples(), num_examples)
def test_one_episode(self):
    with testing_utils.tempdir() as tmpdir:
        fp = os.path.join(tmpdir, "badfile.txt")
        with open(fp, "w") as f:
            for _ in range(1000):
                f.write('id:test_file\ttext:placeholder\tlabels:placeholder\n\n')
        opt = {'task': 'fromfile', 'fromfile_datapath': fp, 'display_verbose': True}
        with self.assertLogs(logger=logging.logger, level='DEBUG') as cm:
            testing_utils.display_data(opt)
            print("\n".join(cm.output))
            assert any('long episode' in l for l in cm.output)

        # invert the logic of the assertion
        with self.assertRaises(self.failureException):
            fp = os.path.join(tmpdir, "goodfile.txt")
            with open(fp, "w") as f:
                for _ in range(1000):
                    f.write(
                        'id:test_file\ttext:placeholder\tlabels:placeholder\tepisode_done:True\n\n'
                    )
            opt = {'task': 'fromfile', 'fromfile_datapath': fp, 'display_verbose': True}
            with self.assertLogs(logger=logging.logger, level='DEBUG') as cm:
                testing_utils.display_data(opt)
                assert any('long episode' in l for l in cm.output)
def test_one_episode(self):
    with testing_utils.tempdir() as tmpdir:
        fp = os.path.join(tmpdir, "badfile.txt")
        with open(fp, "w") as f:
            for _ in range(1000):
                f.write('id:test_file\ttext:placeholder\tlabels:placeholder\n\n')
        opt = {'task': 'fromfile', 'fromfile_datapath': fp, 'display_verbose': True}
        with self.assertWarnsRegex(UserWarning, "long episode"):
            testing_utils.display_data(opt)

        # invert the logic of the assertion
        with self.assertRaises(self.failureException):
            fp = os.path.join(tmpdir, "goodfile.txt")
            with open(fp, "w") as f:
                for _ in range(1000):
                    f.write(
                        'id:test_file\ttext:placeholder\tlabels:placeholder\tepisode_done:True\n\n'
                    )
            opt = {'task': 'fromfile', 'fromfile_datapath': fp, 'display_verbose': True}
            with self.assertWarnsRegex(UserWarning, "long episode"):
                testing_utils.display_data(opt)
def test_gpt2standin(self):
    with testing_utils.tempdir() as tmpdir:
        # we need to build the dict file
        hf_bpe_opt = self._get_dict_opt('bytelevelbpe')
        slow_bytelevel_bpe_opt = self._get_dict_opt('slow_bytelevel_bpe')

        dict_file = os.path.join(tmpdir, "dict")
        pp = build_dict.setup_args()
        pp.set_defaults(**hf_bpe_opt)
        pp.set_defaults(task='babi')
        popt = pp.parse_args([])
        popt['dict_file'] = dict_file
        build_dict.build_dict(popt)

        hf_bpe_opt['dict_file'] = dict_file
        hf_bpe = DictionaryAgent(hf_bpe_opt)

        slow_bytelevel_bpe_opt['dict_file'] = dict_file
        slow_bytelevel_bpe = DictionaryAgent(slow_bytelevel_bpe_opt)

        self._run_test(slow_bytelevel_bpe, hf_bpe)

        slow_bytelevel_bpe_opt['bpe_add_prefix_space'] = True
        slow_bytelevel_bpe = DictionaryAgent(slow_bytelevel_bpe_opt)

        self._run_prefix_space_test(slow_bytelevel_bpe)
def test_save_reload(self):
    """
    Save and reload an existing BL-BPE dictionary.
    """
    pp = ParlaiParser()
    DictionaryAgent.add_cmdline_args(pp, partial_opt=None)
    da = DictionaryAgent(
        pp.parse_args(
            [
                '--dict-tokenizer',
                'bytelevelbpe',
                '--bpe-merge',
                DEFAULT_BYTELEVEL_BPE_MERGE,
                '--bpe-vocab',
                DEFAULT_BYTELEVEL_BPE_VOCAB,
            ]
        )
    )
    # poor behavior if we failed to load
    assert da.txt2vec("hello") != []

    with testing_utils.tempdir() as tmpdir:
        newdf = os.path.join(tmpdir, "dict")
        da.save(newdf)

        # now load it
        da2 = DictionaryAgent(
            pp.parse_args(['--dict-tokenizer', 'bytelevelbpe', '--dict-file', newdf])
        )
        assert da2.txt2vec("hello") == da.txt2vec("hello")
def test_fixed_label(self):
    with testing_utils.tempdir() as tmpdir:
        testing_utils.train_model(
            {
                'task': 'integration_tests',
                'model': 'ir_baseline',
                'batchsize': 1,
                'datatype': 'train:ordered',
                'num_epochs': 1,
                'model_file': os.path.join(tmpdir, 'model'),
            }
        )
        with open(os.path.join(tmpdir, 'cands.txt'), 'w') as f:
            f.write("1 2 3 4\n")
            f.write("4 5 6 7\n")
        valid, test = testing_utils.eval_model(
            {
                'task': 'integration_tests',
                'model': 'ir_baseline',
                'model_file': os.path.join(tmpdir, 'model'),
                'label_candidates_file': os.path.join(tmpdir, 'cands.txt'),
            }
        )
        assert valid['f1'] == 0.6175
        assert test['f1'] == 0.625
def test_init_from_from_checkpoint(self):
    with testing_utils.tempdir() as temp_dir:
        opt_from_file = {
            'datapath': 'dummy_path',
            'model': 'repeat_label',
            'init_model': os.path.join(temp_dir, 'something'),
            'model_file': os.path.join(temp_dir, 'something_else'),
        }
        opt = Opt(
            {
                'datapath': 'dummy_path',
                'model': 'repeat_label',
                'init_model': os.path.join(temp_dir, 'something_else.checkpoint'),
                'model_file': os.path.join(temp_dir, 'something_else'),
                'load_from_checkpoint': True,
            }
        )

        with open(os.path.join(temp_dir, 'something_else.opt'), 'w') as f:
            f.write(json.dumps(opt_from_file))

        agent = create_agent_from_opt_file(opt)
        init_model = agent.opt['init_model']
        # assert that the model was loaded with the correct checkpoint
        assert '.checkpoint' in init_model
def test_fixed_label2(self):
    with testing_utils.tempdir() as tmpdir:
        testing_utils.train_model(
            {
                'task': 'integration_tests',
                'model': 'ir_baseline',
                'batchsize': 1,
                'datatype': 'train:ordered',
                'num_epochs': 1,
                'model_file': os.path.join(tmpdir, 'model'),
            }
        )
        cand = os.path.join(tmpdir, 'cands.txt')
        BuildCandidates.main(task='integration_tests', outfile=cand)
        valid, test = testing_utils.eval_model(
            {
                'task': 'integration_tests',
                'model': 'ir_baseline',
                'model_file': os.path.join(tmpdir, 'model'),
                'label_candidates_file': os.path.join(tmpdir, 'cands.txt'),
            }
        )
        assert valid['f1'] == 1.0
        assert test['f1'] == 1.0
        assert valid['accuracy'] == 0.0
        assert test['accuracy'] == 0.0
def test_token_splitter(self):
    """
    Test TorchScriptable code for splitting tokens against reference GPT-2 version.
    """
    from parlai.scripts.torchscript import TorchScript
    from parlai.torchscript.modules import ScriptableGpt2BpeHelper

    # Params
    tasks = ['taskmaster2', 'convai2']
    compiled_pattern = regex.compile(Gpt2BpeHelper.PATTERN)

    with testing_utils.tempdir() as tmpdir:
        for task in tasks:
            opt = TorchScript.setup_args().parse_kwargs(
                task=task, datatype='train:ordered'
            )
            agent = RepeatLabelAgent(opt)
            # TODO(roller): make a proper create_teacher helper
            teacher = create_task(opt, agent).get_task_agent()
            num_examples = teacher.num_examples()
            print(
                f'\nStarting to test {num_examples:d} examples for the '
                f'{task} task.'
            )
            for idx, message in enumerate(teacher):
                if idx % 10000 == 0:
                    print(f'Testing example #{idx:d}.')
                text = message['text']
                canonical_tokens = regex.findall(compiled_pattern, text)
                scriptable_tokens = ScriptableGpt2BpeHelper.findall(text)
                self.assertEqual(canonical_tokens, scriptable_tokens)
                if idx + 1 == num_examples:
                    break
def _test_learning_rate_resuming(self, args):
    """
    Test learning rate resumes correctly.
    """
    with testing_utils.tempdir() as tmpdir:
        model_file = os.path.join(tmpdir, 'model')
        valid1, test1 = testing_utils.train_model(
            dict(model_file=model_file, lr_scheduler='invsqrt', **args)
        )
        valid2, test2 = testing_utils.train_model(
            dict(model_file=model_file, lr_scheduler='invsqrt', **args)
        )
        # make sure the number of updates is being tracked correctly
        self.assertGreater(
            valid2['total_train_updates'],
            valid1['total_train_updates'],
            'Number of updates is not increasing',
        )
        # make sure the learning rate is decreasing
        self.assertLess(
            valid2['lr'], valid1['lr'], 'Learning rate is not decreasing'
        )
        # but make sure we're not loading the scheduler if we're fine-tuning
        valid3, test3 = testing_utils.train_model(
            dict(
                init_model=os.path.join(tmpdir, 'model'),
                model_file=os.path.join(tmpdir, 'newmodel'),
                lr_scheduler='invsqrt',
                **args,
            )
        )
        self.assertEqual(
            valid3['total_train_updates'],
            valid1['total_train_updates'],
            'Finetuning LR scheduler reset failed (total_train_updates).',
        )
        self.assertEqual(
            valid3['lr'],
            valid1['lr'],
            'Finetuning LR scheduler reset failed (lr).',
        )
        # and make sure we're not loading the scheduler if it changes
        valid4, test4 = testing_utils.train_model(
            dict(
                init_model=os.path.join(tmpdir, 'model'),
                model_file=os.path.join(tmpdir, 'newmodel2'),
                lr_scheduler='reduceonplateau',
                **args,
            )
        )
        self.assertEqual(
            valid4['total_train_updates'],
            valid1['total_train_updates'],
            'LR scheduler change reset failed (total_train_updates).',
        )
        self.assertEqual(
            valid4['lr'], 1e-3, 'LR is not correct in final resume.'
        )
def test_counts(self):
    with testing_utils.tempdir() as tmpdir:
        data_path = tmpdir

        # Check EmpatheticDialoguesTeacher, with multiple examples per episode
        opts_episodes_and_examples = [
            (
                {'datatype': 'train'},
                EPISODE_COUNTS['train_both_sides'],
                EXAMPLE_COUNTS['train_both_sides'],
            ),  # Test the default mode
            (
                {'datatype': 'train', 'train_experiencer_only': True},
                EPISODE_COUNTS['train_experiencer_only'],
                EXAMPLE_COUNTS['train_experiencer_only'],
            ),
            (
                {'datatype': 'train', 'train_experiencer_only': False},
                EPISODE_COUNTS['train_both_sides'],
                EXAMPLE_COUNTS['train_both_sides'],
            ),
            (
                {'datatype': 'valid'},
                EPISODE_COUNTS['valid'],
                EXAMPLE_COUNTS['valid'],
            ),
            ({'datatype': 'test'}, EPISODE_COUNTS['test'], EXAMPLE_COUNTS['test']),
        ]
        for teacher_class in [EmpatheticDialoguesTeacher]:
            for opt, num_episodes, num_examples in opts_episodes_and_examples:
                full_opt = Opt({**opt, 'datapath': data_path})
                teacher = teacher_class(full_opt)
                self.assertEqual(teacher.num_episodes(), num_episodes)
                self.assertEqual(teacher.num_examples(), num_examples)

        # Check EmotionClassificationSituationTeacher, with one example per episode
        train_episode_count = EPISODE_COUNTS['train_experiencer_only']
        # For the situation classifier, we only want to have one episode per train
        # conversation
        opts_episodes = [
            ({'datatype': 'train'}, train_episode_count),  # Test the default mode
            (
                {'datatype': 'train', 'train_experiencer_only': True},
                train_episode_count,
            ),
            (
                {'datatype': 'train', 'train_experiencer_only': False},
                train_episode_count,
            ),
            ({'datatype': 'valid'}, EPISODE_COUNTS['valid']),
            ({'datatype': 'test'}, EPISODE_COUNTS['test']),
        ]
        for teacher_class in [EmotionClassificationSituationTeacher]:
            for opt, num_episodes in opts_episodes:
                full_opt = Opt({**opt, 'datapath': data_path})
                teacher = teacher_class(full_opt)
                self.assertEqual(teacher.num_episodes(), num_episodes)
                self.assertEqual(teacher.num_examples(), num_episodes)
def test_final_extra_eval_and_save_json(self):
    """
    Test "final_extra_valid_opt_filepath".

    Happens to test that saving reports as json works too. We copy train_model
    from testing_utils to directly access the train loop.
    """
    import parlai.scripts.train_model as tms

    def get_tl(tmpdir):
        final_opt = Opt(
            {
                'task': 'integration_tests',
                'datatype': 'valid',
                'validation_max_exs': 30,
                'short_final_eval': True,
            }
        )
        final_opt.save(os.path.join(tmpdir, "final_opt.opt"))

        opt = Opt(
            {
                'task': 'integration_tests',
                'validation_max_exs': 10,
                'model': 'repeat_label',
                'model_file': os.path.join(tmpdir, 'model'),
                'short_final_eval': True,
                'num_epochs': 1.0,
                'final_extra_opt': str(os.path.join(tmpdir, "final_opt.opt")),
            }
        )
        parser = tms.setup_args()
        parser.set_params(**opt)
        popt = parser.parse_args([])
        for k, v in opt.items():
            popt[k] = v
        return tms.TrainLoop(popt)

    with testing_utils.capture_output(), testing_utils.tempdir() as tmpdir:
        tl = get_tl(tmpdir)
        _, _ = tl.train()

        with open(os.path.join(tmpdir, 'model.trainstats')) as f:
            data = json.load(f)
            print(data)
            self.assertEqual(
                data["final_valid_report"]["exs"],
                10,
                "Validation exs saved incorrectly",
            )
            self.assertEqual(
                data["final_extra_valid_report"]["exs"],
                30,
                "Final validation exs saved incorrectly",
            )
def setup_teardown(self):
    """
    Call code to set up and tear down tests.

    Run this only once because we'll be running all analysis code before
    checking any results.
    """
    outputs = {}

    for case, flag_string in self.CASES.items():
        # Paths
        analysis_samples_folder = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'analysis_samples', case
        )
        analysis_outputs_folder = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'test_model_chat_analysis',
        )
        outputs[f'{case}__expected_stdout_path'] = os.path.join(
            analysis_outputs_folder, f'{case}__test_stdout.txt'
        )
        prefixes = ['results', 'worker_results']

        with testing_utils.tempdir() as tmpdir:
            # Run analysis
            with testing_utils.capture_output() as output:
                arg_string = f"""\
--results-folders {analysis_samples_folder}
--output-folder {tmpdir} \
{flag_string}
"""
                parser_ = ModelChatResultsCompiler.setup_args()
                args_ = parser_.parse_args(arg_string.split())
                ModelChatResultsCompiler(vars(args_)).compile_and_save_results()
                stdout = output.getvalue()

            # Define output structure
            filtered_stdout = '\n'.join(
                [line for line in stdout.split('\n') if not line.endswith('.csv')]
            )
            # Don't track lines that record where a file was saved to, because
            # filenames are timestamped
            outputs[f'{case}__stdout'] = filtered_stdout
            for prefix in prefixes:
                results_path = list(
                    glob.glob(os.path.join(tmpdir, f'{prefix}_*'))
                )[0]
                with open(results_path) as f:
                    outputs[f'{case}__{prefix}'] = f.read()

    yield outputs
def test_no_labels(self):
    with testing_utils.tempdir() as tmpdir:
        fp = os.path.join(tmpdir, "badfile.txt")
        with PathManager.open(fp, "w") as f:
            f.write('id:test_file\ttext:bad text\n\n')
        opt = {'task': 'fromfile', 'fromfile_datapath': fp, 'display_verbose': True}
        with self.assertRaises(ValueError):
            testing_utils.display_data(opt)
def test_save_withignore(self):
    o = Opt({'a': 3, 'b': 'foo', 'override': {'a': 3}})
    with testing_utils.tempdir() as tmpdir:
        fn = os.path.join(tmpdir, "opt")
        o.save(fn)
        o2 = Opt.load(fn)
        assert o != o2
        assert 'override' not in o2
def test_byte_level_bpe_tokenize(self):
    """
    Tests a bytelevel bpe tokenizer inside ParlAI.
    """
    parser = ParlaiParser()
    parser.set_params(
        dict_tokenizer='bytelevelbpe',
        bpe_vocab=DEFAULT_BYTELEVEL_BPE_VOCAB,
        bpe_merge=DEFAULT_BYTELEVEL_BPE_MERGE,
        bpe_add_prefix_space=False,
    )
    opt = parser.parse_args([], print_args=False)
    agent = DictionaryAgent(opt)
    self.assertEqual(
        # grinning face emoji
        agent.bytelevelbpe_tokenize(u'Hello, ParlAI! \U0001f600'),
        BYTELEVEL_BPE_RESULT,
    )
    self.assertEqual(
        agent.vec2txt([agent.tok2ind[w] for w in BYTELEVEL_BPE_RESULT]),
        # grinning face emoji
        u'Hello, ParlAI! \U0001f600',
    )
    self.assertEqual(
        agent.txt2vec(u'Hello, ParlAI! \U0001f600'),
        [agent.tok2ind[w] for w in BYTELEVEL_BPE_RESULT],
    )
    vocab_size = agent.byte_level_bpe.tokenizer.get_vocab_size()
    with testing_utils.tempdir() as tmpdir:
        path = os.path.join(tmpdir, 'dict-checkpoint')
        agent.save(filename=path)
        agent.load(filename=path)
    # Test loading / saving
    self.assertEqual(vocab_size, agent.byte_level_bpe.tokenizer.get_vocab_size())
    self.assertEqual(
        # grinning face emoji
        agent.bytelevelbpe_tokenize(u'Hello, ParlAI! \U0001f600'),
        BYTELEVEL_BPE_RESULT,
    )
    self.assertEqual(
        agent.vec2txt([agent.tok2ind[w] for w in BYTELEVEL_BPE_RESULT]),
        # grinning face emoji
        u'Hello, ParlAI! \U0001f600',
    )
    self.assertEqual(
        agent.txt2vec(u'Hello, ParlAI! \U0001f600'),
        [agent.tok2ind[w] for w in BYTELEVEL_BPE_RESULT],
    )
    # Test special token ids are mapped correctly:
    # 4 special tokens are added at the beginning of the ParlAI dict and at the
    # end for Hugging Face; the null token would be 0 in the ParlAI dict and
    # original_vocab in Hugging Face
    assert agent.txt2vec("__null__") == [0]
    assert agent.txt2vec("__start__") == [1]
    assert agent.txt2vec("__end__") == [2]
    assert agent.txt2vec("__unk__") == [3]
def test_connectionerror_download(self):
    with unittest.mock.patch('requests.Session.get') as Session:
        Session.side_effect = requests.exceptions.ConnectTimeout
        with testing_utils.tempdir() as tmpdir:
            with self.assertRaises(RuntimeError):
                build_data.download(
                    'http://test.com/bad', tmpdir, 'foo', num_retries=3
                )
            assert Session.call_count == 3
def test_good_fileformat(self):
    """
    Checks that we can load a well-formatted file where the user specified labels.
    """
    with testing_utils.tempdir() as tmpdir:
        fp = os.path.join(tmpdir, "goodfile.txt")
        with PathManager.open(fp, "w") as f:
            f.write('id:test_file\ttext:input\tlabels:good label\n\n')
        opt = {'task': 'fromfile', 'fromfile_datapath': fp, 'display_verbose': True}
        testing_utils.display_data(opt)
def test_no_text(self):
    with testing_utils.tempdir() as tmpdir:
        fp = os.path.join(tmpdir, "badfile.jsonl")
        with PathManager.open(fp, "w") as f:
            f.write(
                '{"dialog": [[{"id": "speaker1"}, {"text": "Hello.", "id": "speaker2"}]]}\n'
            )
        opt = {'task': 'jsonfile', 'jsonfile_datapath': fp, 'display_verbose': True}
        with self.assertRaises(AttributeError):
            testing_utils.display_data(opt)
def _run_test(self, opt):
    with testing_utils.tempdir() as tmpdir:
        dict_file = os.path.join(tmpdir, "dict")
        pp = build_dict.setup_args()
        pp.set_defaults(**opt)
        pp.set_defaults(task='babi')
        popt = pp.parse_args([])
        popt['dict_file'] = dict_file
        for k, v in opt.items():
            popt[k] = v
def test_cprofile(self):
    from parlai.scripts.profile_train import ProfileTrain

    with testing_utils.tempdir() as tmpdir:
        ProfileTrain.main(
            task='integration_tests:overfit',
            model='test_agents/unigram',
            model_file=os.path.join(tmpdir, 'model'),
            skip_generation=True,
        )