def test_batched(self):
    valid, test = testing_utils.eval_model(
        dict(
            task='integration_tests:chunky',
            model='parlai.agents.test_agents.test_agents:MockTorchAgent',
            batchsize=32,
        ),
        valid_datatype='valid:stream',
        test_datatype='test:stream',
    )
    assert valid['exs'] == 100
    assert test['exs'] == 100
def test_stream_only(self):
    # the chunky teacher only supports streaming, so non-stream datatypes
    # ('valid', 'test') should raise
    with self.assertRaises(ValueError):
        valid, test = testing_utils.eval_model(
            dict(
                task='integration_tests:chunky',
                model='parlai.agents.test_agents.test_agents:MockTorchAgent',
                batchsize=32,
            ),
            valid_datatype='valid',
        )
    with self.assertRaises(ValueError):
        valid, test = testing_utils.eval_model(
            dict(
                task='integration_tests:chunky',
                model='parlai.agents.test_agents.test_agents:MockTorchAgent',
                batchsize=32,
            ),
            valid_datatype='valid:stream',
            test_datatype='test',
        )
def test_multitasking_metrics_macro(self):
    valid, test = testing_utils.eval_model(
        {
            'task': 'integration_tests:candidate,'
            'integration_tests:multiturnCandidate',
            'model': 'random_candidate',
            'aggregate_micro': False,
        }
    )
    task1_acc = valid['integration_tests:candidate/accuracy']
    task2_acc = valid['integration_tests:multiturnCandidate/accuracy']
    total_acc = valid['accuracy']
    # task 2 is 4 times the size of task 1, but macro aggregation weights
    # the two tasks equally
    self.assertEqual(
        total_acc.value(),
        (task1_acc.value() + task2_acc.value()) * 0.5,
        'Task accuracy is averaged incorrectly',
    )
def test_multitasking_metrics(self):
    valid, test = testing_utils.eval_model(
        {
            'task': 'integration_tests:candidate,'
            'integration_tests:multiturnCandidate',
            'model': 'random_candidate',
            'num_epochs': 0.5,
        }
    )
    task1_acc = valid['integration_tests:candidate/accuracy']
    task2_acc = valid['integration_tests:multiturnCandidate/accuracy']
    total_acc = valid['accuracy']
    # task 2 is 4 times the size of task 1; summing AverageMetrics pools
    # numerators and denominators, so this checks micro-averaged accuracy
    self.assertEqual(
        total_acc,
        task1_acc + task2_acc,
        'Task accuracy is averaged incorrectly',
    )
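# --- Illustrative sketch (not part of the original tests) ---
# Assuming ParlAI's AverageMetric semantics (parlai.core.metrics): summing
# metrics pools numerators and denominators (micro average), while averaging
# the .value() results weights each task equally (macro average). This is the
# distinction the two multitasking tests above assert.
def _demo_micro_vs_macro_averaging():
    from parlai.core.metrics import AverageMetric

    task1 = AverageMetric(1, 2)  # 1 correct out of 2 examples -> 0.5
    task2 = AverageMetric(6, 8)  # 6 correct out of 8 examples -> 0.75
    micro = (task1 + task2).value()  # (1 + 6) / (2 + 8) = 0.7
    macro = (task1.value() + task2.value()) * 0.5  # (0.5 + 0.75) / 2 = 0.625
    assert micro == 0.7 and macro == 0.625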
def test_short_pacer_pretrain(self):
    opt = {
        'model': 'projects.light_whoami.agents.pacer:PacerPartialOnlyAgent',
        'model_file': VANILLA_128,
        'predictor_model_file': RPA_RERANKER,
        'pacer_n_tokens': 10,
        'pacer_frequency_ratio': 0.1,
        'beam_min_length': 10,
        **COMMON_OPT,
    }
    testing_utils.eval_model(opt, skip_test=True)
    opt = {
        'model': 'projects.light_whoami.agents.pacer:PacerAgent',
        'model_file': VANILLA_128,
        'predictor_model_file': RPA_RERANKER,
        'pacer_n_tokens': 10,
        'pacer_frequency_ratio': 0.1,
        'beam_min_length': 10,
        **COMMON_OPT,
    }
    testing_utils.eval_model(opt, skip_test=True)
def test_multi_task(self):
    """
    Test model trained multi-task on dialogue datasets.
    """
    valid, _ = testing_utils.eval_model(
        opt={
            **SHARED_OPTS,
            'model_file': 'zoo:blended_skill_talk/multi_task/model',
        },
        skip_test=True,
    )
    self.assertAlmostEqual(valid['accuracy'], 0.9062, delta=0.005)
def test_multi_task_bst_tuned(self):
    """
    Test model trained multi-task and then tuned on BlendedSkillTalk.
    """
    valid, _ = testing_utils.eval_model(
        opt={
            **SHARED_OPTS,
            'model_file': 'zoo:blended_skill_talk/multi_task_bst_tuned/model',
        },
        skip_test=True,
    )
    self.assertAlmostEqual(valid['accuracy'], 0.9219, delta=0.005)
def test_wizard_single_task(self):
    """
    Test model trained single-task on Wizard of Wikipedia.
    """
    valid, _ = testing_utils.eval_model(
        opt={
            **SHARED_OPTS,
            'model_file': 'zoo:blended_skill_talk/wizard_single_task/model',
        },
        skip_test=True,
    )
    self.assertAlmostEqual(valid['accuracy'], 0.7500, delta=0.005)
def test_ed_single_task(self):
    """
    Test model trained single-task on EmpatheticDialogues.
    """
    valid, _ = testing_utils.eval_model(
        opt={
            **SHARED_OPTS,
            'model_file': 'zoo:blended_skill_talk/ed_single_task/model',
        },
        skip_test=True,
    )
    self.assertAlmostEqual(valid['accuracy'], 0.7656, delta=0.005)
def test_convai2_single_task(self):
    """
    Test model trained single-task on ConvAI2.
    """
    valid, _ = testing_utils.eval_model(
        opt={
            **SHARED_OPTS,
            'model_file': 'zoo:blended_skill_talk/convai2_single_task/model',
        },
        skip_test=True,
    )
    self.assertAlmostEqual(valid['accuracy'], 0.8438, delta=0.005)
def test_long_pacer(self):
    opt = {
        'model': 'projects.light_whoami.agents.pacer:LongPacerPartialOnlyAgent',
        'model_file': TEST_TGA,
        'predictor_model_file': TEST_RPA_RERANKER,
        'pacer_n_tokens': 10,
        'pacer_frequency_ratio': 0.1,
        'beam_min_length': 10,
        **COMMON_OPT,
    }
    testing_utils.eval_model(opt, skip_test=True)
    opt = {
        'model': 'projects.light_whoami.agents.pacer:LongPacerAgent',
        'model_file': TEST_TGA,
        'predictor_model_file': TEST_RPA_RERANKER,
        'pacer_n_tokens': 10,
        'pacer_frequency_ratio': 0.1,
        'beam_min_length': 10,
        **COMMON_OPT,
    }
    testing_utils.eval_model(opt, skip_test=True)
def test_world_logging(self):
    with testing_utils.tempdir() as tmpdir:
        save_report = os.path.join(tmpdir, 'report')
        testing_utils.eval_model(
            dict(
                model_file='zoo:unittest/transformer_generator2/model',
                task='integration_tests:multiturn_candidate',
                world_logs=save_report,
                report_filename=save_report,
                truncate=1024,
                dynamic_batching='full',
                batchsize=4,
            )
        )
        convo_file = str(save_report) + '.jsonl'
        convos = Conversations(convo_file)
        for convo in convos:
            # each episode is 4 turns, with a teacher and a model message per turn
            self.assertEqual(len(convo), 2 * 4)
            # now assert that all turns come from the same dynamic batch index
            dyn_batch_idx = convo[0]['dyn_batch_idx']
            for i, turn in enumerate(convo):
                if i % 2 == 0 and i > 0:
                    # we log the batch index in the teacher acts only
                    self.assertEqual(dyn_batch_idx, turn['dyn_batch_idx'])
def test_ppl(self):
    valid, _ = testing_utils.eval_model(
        {
            'model': 'transformer/generator',
            'model_file': 'zoo:tutorial_transformer_generator/model',
            'task': 'dailydialog',
            'skip_generation': 'true',
            'num_examples': 512,
            'batchsize': 32,
        },
        skip_test=True,
    )
    self.assertAlmostEqual(valid['ppl'], 19.59, places=2)
    self.assertAlmostEqual(valid['token_acc'], 0.4234, places=4)
def test_hogwild_eval(self):
    """
    Test eval with numthreads > 1 and batchsize in [1,2,3].
    """
    opt = dict(
        task='integration_tests:repeat:{}'.format(NUM_EXS), model='repeat_label'
    )
    for nt in NUM_THREADS_CHOICES:
        for bs in BATCHSIZE_CHOICES:
            opt['numthreads'] = nt
            opt['batchsize'] = bs
            valid, test = testing_utils.eval_model(opt)
            self.assertEqual(valid['exs'], NUM_EXS)
            self.assertEqual(test['exs'], NUM_EXS)
def test_hogwild_eval(self):
    """Test eval with numthreads > 1 and batchsize in [1,2,3]."""
    opt = dict(
        task='tasks.repeat:RepeatTeacher:{}'.format(NUM_EXS), model='repeat_label'
    )
    for nt in NUM_THREADS_CHOICES:
        for bs in BATCHSIZE_CHOICES:
            opt['numthreads'] = nt
            opt['batchsize'] = bs
            stdout, valid, test = testing_utils.eval_model(opt)
            self.assertEqual(valid['exs'], NUM_EXS, 'LOG:\n{}'.format(stdout))
            self.assertEqual(test['exs'], NUM_EXS, 'LOG:\n{}'.format(stdout))
def test_set_model_file_without_dict_file(self):
    """Check that moving a model without moving the dictfile raises an error."""
    # Download model, then remove its dict file
    datapath = ParlaiParser().parse_args(print_args=False)['datapath']
    try:
        # remove unittest models if there before
        shutil.rmtree(os.path.join(datapath, 'models/unittest'))
    except FileNotFoundError:
        pass
    testing_utils.download_unittest_models()
    zoo_path = 'zoo:unittest/seq2seq/model'
    model_path = modelzoo_path(datapath, zoo_path)
    os.remove(model_path + '.dict')
    # Test that eval model fails
    with self.assertRaises(RuntimeError):
        testing_utils.eval_model(dict(task='babi:task1k:1', model_file=model_path))
    try:
        # remove unittest models if there after
        shutil.rmtree(os.path.join(datapath, 'models/unittest'))
    except FileNotFoundError:
        pass
def test_set_model_file_without_dict_file(self):
    """
    Check that moving a model without moving the dictfile raises an error.
    """
    # Download model into a temporary datapath, then remove its dict file
    with testing_utils.tempdir() as datapath:
        try:
            # remove unittest models if there before
            shutil.rmtree(os.path.join(datapath, 'models/unittest'))
        except FileNotFoundError:
            pass
        zoo_path = 'zoo:unittest/seq2seq/model'
        model_path = modelzoo_path(datapath, zoo_path)
        PathManager.rm(model_path + '.dict')
        # Test that eval model fails
        with self.assertRaises(RuntimeError):
            testing_utils.eval_model(
                dict(task='babi:task1k:1', model_file=model_path)
            )
        try:
            # remove unittest models if there after
            shutil.rmtree(os.path.join(datapath, 'models/unittest'))
        except FileNotFoundError:
            pass
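# --- Illustrative sketch (not from the original test files) ---
# modelzoo_path (parlai.core.build_data) resolves a 'zoo:' alias to a local
# path under the datapath (downloading the model on first use), which is why
# both tests above clean up <datapath>/models/unittest. A minimal demo,
# assuming the standard '<datapath>/models/...' layout:
def _demo_modelzoo_path(datapath: str) -> str:
    from parlai.core.build_data import modelzoo_path

    # e.g. '/tmp/data' -> '/tmp/data/models/unittest/seq2seq/model'
    return modelzoo_path(datapath, 'zoo:unittest/seq2seq/model')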
def test_backwards_compatibility(self):
    valid, test = testing_utils.eval_model(
        dict(
            task='integration_tests:multiturn_candidate',
            model='seq2seq',
            model_file='zoo:unittest/seq2seq/model',
            dict_file='zoo:unittest/seq2seq/model.dict',
        )
    )
    self.assertLessEqual(valid['ppl'], 1.01)
    self.assertGreaterEqual(valid['accuracy'], 0.999)
    self.assertGreaterEqual(valid['f1'], 0.999)
    self.assertLessEqual(test['ppl'], 1.01)
    self.assertGreaterEqual(test['accuracy'], 0.999)
    self.assertGreaterEqual(test['f1'], 0.999)
def _test_zoo_file(mf: str, fid: bool = False, fid_rag: bool = False):
    opt = copy.deepcopy(common_opt)
    if fid:
        opt['model'] = 'fid'
    if fid_rag:
        opt['dpr_model_file'] = RAG_TOKEN_ZOO_MODEL
    opt.update(GENERATION_OPTS['bart'])
    opt['model_file'] = mf
    opt['generation_model'] = 'bart'
    opt['task'] = 'wizard_of_wikipedia'
    opt['label_truncate'] = 10
    valid, _ = testing_utils.eval_model(opt, skip_test=True)
    assert valid['ppl'] < 15.0
    assert (100 * float(valid['f1'])) > 10.0
    torch.cuda.empty_cache()
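# Hypothetical invocations of the helper above (the model-file constants here
# are placeholders for illustration, not names taken from the original file):
#
#     _test_zoo_file(RAG_ZOO_MODEL)
#     _test_zoo_file(FID_ZOO_MODEL, fid=True)
#     _test_zoo_file(FID_RAG_ZOO_MODEL, fid=True, fid_rag=True)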
def test_generator_backcomp(self):
    """
    Tests that the generator model files work over time.
    """
    testing_utils.download_unittest_models()
    stdout, valid, test = testing_utils.eval_model(
        dict(
            task='integration_tests:multipass',
            model='transformer/generator',
            model_file='zoo:unittest/transformer_generator2/model',
            dict_file='zoo:unittest/transformer_generator2/model.dict',
            rank_candidates=True,
            batchsize=64,
        )
    )
    self.assertGreaterEqual(
        valid['hits@1'],
        0.95,
        'valid hits@1 = {}\nLOG:\n{}'.format(valid['hits@1'], stdout),
    )
    self.assertLessEqual(
        valid['ppl'], 1.01, 'valid ppl = {}\nLOG:\n{}'.format(valid['ppl'], stdout)
    )
    self.assertGreaterEqual(
        valid['accuracy'],
        0.99,
        'valid accuracy = {}\nLOG:\n{}'.format(valid['accuracy'], stdout),
    )
    self.assertGreaterEqual(
        valid['f1'], 0.99, 'valid f1 = {}\nLOG:\n{}'.format(valid['f1'], stdout)
    )
    self.assertGreaterEqual(
        test['hits@1'],
        0.95,
        'test hits@1 = {}\nLOG:\n{}'.format(test['hits@1'], stdout),
    )
    self.assertLessEqual(
        test['ppl'], 1.01, 'test ppl = {}\nLOG:\n{}'.format(test['ppl'], stdout)
    )
    self.assertGreaterEqual(
        test['accuracy'],
        0.99,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertGreaterEqual(
        test['f1'], 0.99, 'test f1 = {}\nLOG:\n{}'.format(test['f1'], stdout)
    )
def test_save_report(self):
    """
    Test that we can save a report from eval model.
    """
    with testing_utils.tempdir() as tmpdir:
        save_report = os.path.join(tmpdir, 'report')
        opt = dict(
            task='integration_tests',
            model='repeat_label',
            datatype='valid',
            num_examples=5,
            display_examples=False,
            save_world_logs=True,
            report_filename=save_report,
        )
        valid, test = testing_utils.eval_model(opt)
def test_released_model(self):
    """
    Check the pretrained model produces correct results.
    """
    _, test = testing_utils.eval_model(
        {
            'model_file': 'zoo:self_feeding/hh131k_hb60k_fb60k_st1k/model',
            'task': 'self_feeding:all',
            'batchsize': 20,
        },
        skip_valid=True,
    )
    self.assertAlmostEqual(test['dia_acc'], 0.506, delta=0.001)
    self.assertAlmostEqual(test['fee_acc'], 0.744, delta=0.001)
    self.assertAlmostEqual(test['sat_f1'], 0.8343, delta=0.0001)
def test_beamsearch(self):
    """
    Ensures beam search can generate the correct response.
    """
    valid, test = testing_utils.eval_model(
        dict(
            task='integration_tests:multiturn_nocandidate',
            model='seq2seq',
            model_file='zoo:unittest/seq2seq/model',
            dict_file='zoo:unittest/seq2seq/model.dict',
            skip_generation=False,
            inference='beam',
            beam_size=5,
        )
    )
    self.assertGreater(valid['accuracy'], 0.95)
    self.assertGreater(test['accuracy'], 0.95)
def test_convai2_finetuned_greedy(self):
    """
    Check the greedy model produces correct results.
    """
    valid, _ = testing_utils.eval_model(
        {
            'model_file': 'zoo:controllable_dialogue/convai2_finetuned_baseline',
            'task': 'projects.controllable_dialogue.tasks.agents',
            'beam_size': 1,
            'batchsize': 64,
        },
        skip_test=True,
    )
    self.assertAlmostEqual(valid['ppl'], 22.86, delta=0.1)
    self.assertAlmostEqual(valid['f1'], 0.1702, delta=0.0002)
def test_greedy(self):
    """
    Test a simple multiturn task.
    """
    valid, test = testing_utils.eval_model(
        dict(
            task="integration_tests:multiturn_candidate",
            model="hred",
            model_file="zoo:unittest/hred_model/model",
            dict_file="zoo:unittest/hred_model/model.dict",
            skip_generation=False,
            batchsize=32,
        )
    )
    self.assertLess(valid["ppl"], 1.2)
    self.assertLess(test["ppl"], 1.2)
def _check_losses(
    self, opt: Opt, test_name: str, data_regression: DataRegressionFixture
):
    """
    Calculate and check distillation loss terms.

    Given the input opt, run eval and check each of the loss terms to make sure
    that they match what is expected.
    """
    valid, _ = testing_utils.eval_model(opt, skip_test=True)
    losses = {}
    loss_types = self.LOSS_TYPES[test_name]
    for loss_type in loss_types:
        losses[loss_type] = round_sig(valid[loss_type].value(), sig=6)
    basename = self._get_model_identifier() + '_' + test_name
    data_regression.check(losses, basename=basename)
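# --- Illustrative sketch (not from the original test file) ---
# `round_sig` above is a helper from the distillation test module; a plausible
# implementation that rounds to `sig` significant figures (an assumption for
# illustration, not necessarily the original) looks like this:
import math

def _round_sig_sketch(x: float, sig: int = 6) -> float:
    if x == 0:
        return 0.0
    # shift the rounding position so that `sig` significant digits survive
    return round(x, sig - int(math.floor(math.log10(abs(x)))) - 1)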
def test_transresnet(self):
    """
    Test pretrained model.
    """
    stdout, _, test = testing_utils.eval_model(MODEL_OPTIONS, skip_valid=True)
    # Overall
    self.assertEqual(
        test['accuracy'],
        0.3667,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertEqual(
        test['hits@5'],
        0.633,
        'test hits@5 = {}\nLOG:\n{}'.format(test['hits@5'], stdout),
    )
    self.assertEqual(
        test['hits@10'],
        0.767,
        'test hits@10 = {}\nLOG:\n{}'.format(test['hits@10'], stdout),
    )
    # First round
    self.assertEqual(
        test['first_round']['hits@1/100'],
        0.2,
        'test first round hits@1/100 = {}\nLOG:\n{}'.format(
            test['first_round']['hits@1/100'], stdout
        ),
    )
    # Second round
    self.assertEqual(
        test['second_round']['hits@1/100'],
        0.5,
        'test second round hits@1/100 = {}\nLOG:\n{}'.format(
            test['second_round']['hits@1/100'], stdout
        ),
    )
    # Third round
    self.assertEqual(
        test['third_round+']['hits@1/100'],
        0.4,
        'test third round hits@1/100 = {}\nLOG:\n{}'.format(
            test['third_round+']['hits@1/100'], stdout
        ),
    )
def test_retrieval(self):
    stdout, _, test = testing_utils.eval_model(RETRIEVAL_OPTIONS)
    self.assertGreaterEqual(
        test['accuracy'],
        0.86,
        'test acc = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertGreaterEqual(
        test['hits@5'],
        0.98,
        'test hits@5 = {}\nLOG:\n{}'.format(test['hits@5'], stdout),
    )
    self.assertGreaterEqual(
        test['hits@10'],
        0.99,
        'test hits@10 = {}\nLOG:\n{}'.format(test['hits@10'], stdout),
    )
def test_backcomp(self):
    """
    Tests that the memnn model files continue to work over time.
    """
    valid, test = testing_utils.eval_model(
        dict(
            task='integration_tests',
            model='memnn',
            model_file='zoo:unittest/memnn/model',
            dict_file='zoo:unittest/memnn/model.dict',
            batchsize=16,
        )
    )
    self.assertGreaterEqual(valid['accuracy'], 0.88)
    self.assertGreaterEqual(valid['f1'], 0.999)
    self.assertGreaterEqual(test['accuracy'], 0.84)
    self.assertGreaterEqual(test['f1'], 0.999)
def test_generation(self):
    """
    This test uses a multi-turn sequence repetition task.
    """
    valid, test = testing_utils.eval_model(
        dict(
            task='integration_tests:multiturn_nocandidate',
            model='seq2seq',
            model_file='zoo:unittest/seq2seq/model',
            dict_file='zoo:unittest/seq2seq/model.dict',
            skip_generation=False,
            inference='greedy',
            batchsize=8,
            num_examples=32,
        )
    )
    self.assertLess(valid['ppl'], 1.2)
    self.assertLess(test['ppl'], 1.2)