Example #1
 def test_batched(self):
     valid, test = testing_utils.eval_model(
         dict(
             task='integration_tests:chunky',
             model='parlai.agents.test_agents.test_agents:MockTorchAgent',
             batchsize=32,
         ),
         valid_datatype='valid:stream',
         test_datatype='test:stream',
     )
     assert valid['exs'] == 100
     assert test['exs'] == 100
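The snippets on this page are test methods lifted out of their classes, but the core pattern is the same everywhere: build an opt dict, call eval_model, and unpack the (valid, test) report dicts. A minimal standalone sketch of that pattern, assuming the parlai.utils.testing import path and reusing the repeat_label model and integration_tests task that also appear in Example #21:

import parlai.utils.testing as testing_utils

# Minimal sketch: evaluate a trivial model and read a couple of common
# metrics out of the returned valid/test report dicts.
valid, test = testing_utils.eval_model(
    dict(
        task='integration_tests',
        model='repeat_label',
        batchsize=1,
    )
)
print(valid['exs'], valid['accuracy'])
print(test['exs'], test['accuracy'])
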
Example #2
    def test_stream_only(self):
        with self.assertRaises(ValueError):
            valid, test = testing_utils.eval_model(
                dict(
                    task='integration_tests:chunky',
                    model='parlai.agents.test_agents.test_agents:MockTorchAgent',
                    batchsize=32,
                ),
                valid_datatype='valid',
            )

        with self.assertRaises(ValueError):
            valid, test = testing_utils.eval_model(
                dict(
                    task='integration_tests:chunky',
                    model='parlai.agents.test_agents.test_agents:MockTorchAgent',
                    batchsize=32,
                ),
                valid_datatype='valid:stream',
                test_datatype='test',
            )
Example #3
    def test_multitasking_metrics_macro(self):
        valid, test = testing_utils.eval_model({
            'task':
            'integration_tests:candidate,'
            'integration_tests:multiturnCandidate',
            'model':
            'random_candidate',
            'aggregate_micro':
            False,
        })

        task1_acc = valid['integration_tests:candidate/accuracy']
        task2_acc = valid['integration_tests:multiturnCandidate/accuracy']
        total_acc = valid['accuracy']
        # task 2 is 4 times the size of task 1
        self.assertEqual(
            total_acc,
            (task1_acc.value() + task2_acc.value()) * 0.5,
            'Task accuracy is averaged incorrectly',
        )

        valid, test = testing_utils.eval_model({
            'task':
            'integration_tests:candidate,'
            'integration_tests:multiturnCandidate',
            'model':
            'random_candidate',
            'aggregate_micro':
            False,
        })
        task1_acc = valid['integration_tests:candidate/accuracy']
        task2_acc = valid['integration_tests:multiturnCandidate/accuracy']
        total_acc = valid['accuracy']

        # metrics are combined correctly
        self.assertEqual(
            total_acc,
            (task1_acc.value() + task2_acc.value()) * 0.5,
            'Task accuracy is averaged incorrectly',
        )
Example #4
    def test_multitasking_metrics(self):
        valid, test = testing_utils.eval_model({
            'task':
            'integration_tests:candidate,'
            'integration_tests:multiturnCandidate',
            'model':
            'random_candidate',
            'num_epochs':
            0.5,
        })

        task1_acc = valid['integration_tests:candidate/accuracy']
        task2_acc = valid['integration_tests:multiturnCandidate/accuracy']
        total_acc = valid['accuracy']
        # task 2 is 4 times the size of task 1
        self.assertEqual(
            total_acc,
            task1_acc + task2_acc,
            'Task accuracy is averaged incorrectly',
        )

        valid, test = testing_utils.eval_model({
            'task':
            'integration_tests:candidate,'
            'integration_tests:multiturnCandidate',
            'model':
            'random_candidate',
            'num_epochs':
            0.5,
        })
        task1_acc = valid['integration_tests:candidate/accuracy']
        task2_acc = valid['integration_tests:multiturnCandidate/accuracy']
        total_acc = valid['accuracy']

        # metrics are combined correctly
        self.assertEqual(
            total_acc,
            (task1_acc + task2_acc),
            'Task accuracy is averaged incorrectly',
        )
Example #5
 def test_short_pacer_pretrain(self):
     opt = {
         'model':
         'projects.light_whoami.agents.pacer:PacerPartialOnlyAgent',
         'model_file': VANILLA_128,
         'predictor_model_file': RPA_RERANKER,
         'pacer_n_tokens': 10,
         'pacer_frequency_ratio': 0.1,
         'beam_min_length': 10,
         **COMMON_OPT,
     }
     testing_utils.eval_model(opt, skip_test=True)
     opt = {
         'model': 'projects.light_whoami.agents.pacer:PacerAgent',
         'model_file': VANILLA_128,
         'predictor_model_file': RPA_RERANKER,
         'pacer_n_tokens': 10,
         'pacer_frequency_ratio': 0.1,
         'beam_min_length': 10,
         **COMMON_OPT,
     }
     testing_utils.eval_model(opt, skip_test=True)
 def test_multi_task(self):
     """
     Test model trained multi-task on dialogue datasets.
     """
     valid, _ = testing_utils.eval_model(
         opt={
             **SHARED_OPTS,
             'model_file':
             f'zoo:blended_skill_talk/multi_task/model',
         },
         skip_test=True,
     )
     self.assertAlmostEqual(valid['accuracy'], 0.9062, delta=0.005)
Example #7
 def test_multi_task_bst_tuned(self):
     """
     Test model trained multi-task and then tuned on BlendedSkillTalk.
     """
     valid, _ = testing_utils.eval_model(
         opt={
             **SHARED_OPTS,
             'model_file':
             f'zoo:blended_skill_talk/multi_task_bst_tuned/model',
         },
         skip_test=True,
     )
     self.assertAlmostEqual(valid['accuracy'], 0.9219, delta=0.005)
Example #8
 def test_wizard_single_task(self):
     """
     Test model trained single-task on Wizard of Wikipedia.
     """
     valid, _ = testing_utils.eval_model(
         opt={
             **SHARED_OPTS,
             'model_file':
             f'zoo:blended_skill_talk/wizard_single_task/model',
         },
         skip_test=True,
     )
     self.assertAlmostEqual(valid['accuracy'], 0.7500, delta=0.005)
Example #9
 def test_ed_single_task(self):
     """
     Test model trained single-task on EmpatheticDialogues.
     """
     valid, _ = testing_utils.eval_model(
         opt={
             **SHARED_OPTS,
             'model_file':
             f'zoo:blended_skill_talk/ed_single_task/model',
         },
         skip_test=True,
     )
     self.assertAlmostEqual(valid['accuracy'], 0.7656, delta=0.005)
Example #10
 def test_convai2_single_task(self):
     """
     Test model trained single-task on ConvAI2.
     """
     valid, _ = testing_utils.eval_model(
         opt={
             **SHARED_OPTS,
             'model_file':
             f'zoo:blended_skill_talk/convai2_single_task/model',
         },
         skip_test=True,
     )
     self.assertAlmostEqual(valid['accuracy'], 0.8438, delta=0.005)
Example #11
 def test_long_pacer(self):
     opt = {
         'model':
         'projects.light_whoami.agents.pacer:LongPacerPartialOnlyAgent',
         'model_file': TEST_TGA,
         'predictor_model_file': TEST_RPA_RERANKER,
         'pacer_n_tokens': 10,
         'pacer_frequency_ratio': 0.1,
         'beam_min_length': 10,
         **COMMON_OPT,
     }
     testing_utils.eval_model(opt, skip_test=True)
     opt = {
         'model': 'projects.light_whoami.agents.pacer:LongPacerAgent',
         'model_file': TEST_TGA,
         'predictor_model_file': TEST_RPA_RERANKER,
         'pacer_n_tokens': 10,
         'pacer_frequency_ratio': 0.1,
         'beam_min_length': 10,
         **COMMON_OPT,
     }
     testing_utils.eval_model(opt, skip_test=True)
Example #12
 def test_world_logging(self):
     with testing_utils.tempdir() as tmpdir:
         save_report = os.path.join(tmpdir, 'report')
         testing_utils.eval_model(
             dict(
                 model_file='zoo:unittest/transformer_generator2/model',
                 task='integration_tests:multiturn_candidate',
                 world_logs=save_report,
                 report_filename=save_report,
                 truncate=1024,
                 dynamic_batching='full',
                 batchsize=4,
             ))
         convo_file = str(save_report) + '.jsonl'
         convos = Conversations(convo_file)
         for convo in convos:
             self.assertEqual(len(convo), 2 * 4)  # 4 turns per episode, two messages per turn
             # now assert that they are all from the same dynamic batch index
             dyn_batch_idx = convo[0]['dyn_batch_idx']
             for i, turn in enumerate(convo):
                 if i % 2 == 0 and i > 0:
                     # we log the batch index in the teacher acts only
                     self.assertEqual(dyn_batch_idx, turn['dyn_batch_idx'])
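Example #12 reads the saved log back through ParlAI's Conversations helper. For quick ad-hoc inspection, the .jsonl file can also be read directly with the standard library; a sketch, assuming the usual JSON Lines convention of one JSON object per line (the exact record schema is not shown above):

import json

# Hypothetical path; in the test above it is str(save_report) + '.jsonl'.
convo_file = '/tmp/report.jsonl'

with open(convo_file) as f:
    for line in f:
        record = json.loads(line)
        # Print whatever top-level keys each logged record actually contains.
        print(sorted(record.keys()))
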
Example #13
 def test_ppl(self):
     valid, _ = testing_utils.eval_model(
         {
             'model': 'transformer/generator',
             'model_file': 'zoo:tutorial_transformer_generator/model',
             'task': 'dailydialog',
             'skip_generation': 'true',
             'num_examples': 512,
             'batchsize': 32,
         },
         skip_test=True,
     )
     self.assertAlmostEqual(valid['ppl'], 19.59, places=2)
     self.assertAlmostEqual(valid['token_acc'], 0.4234, places=4)
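Example #13 pins its metrics with assertAlmostEqual(..., places=N), while the zoo model checks in Examples #6-#10, #22, and #24 use delta=...; the two tolerance styles are not interchangeable. A tiny stdlib-only sketch of the difference:

import unittest

class _ToleranceDemo(unittest.TestCase):
    def runTest(self):
        # places=2: the difference is rounded to 2 decimal places and must be 0.
        self.assertAlmostEqual(19.591, 19.59, places=2)
        # delta=0.005: the absolute difference must be at most 0.005.
        self.assertAlmostEqual(0.9062, 0.9060, delta=0.005)

_ToleranceDemo().runTest()
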
Example #14
    def test_hogwild_eval(self):
        """
        Test eval with numthreads > 1 and batchsize in [1,2,3].
        """
        opt = dict(task='integration_tests:repeat:{}'.format(NUM_EXS),
                   model='repeat_label')
        for nt in NUM_THREADS_CHOICES:
            for bs in BATCHSIZE_CHOICES:
                opt['numthreads'] = nt
                opt['batchsize'] = bs

                valid, test = testing_utils.eval_model(opt)
                self.assertEqual(valid['exs'], NUM_EXS)
                self.assertEqual(test['exs'], NUM_EXS)
Example #15
    def test_hogwild_eval(self):
        """Test eval with numthreads > 1 and batchsize in [1,2,3]."""
        opt = dict(task='tasks.repeat:RepeatTeacher:{}'.format(NUM_EXS),
                   model='repeat_label')
        for nt in NUM_THREADS_CHOICES:
            for bs in BATCHSIZE_CHOICES:
                opt['numthreads'] = nt
                opt['batchsize'] = bs

                stdout, valid, test = testing_utils.eval_model(opt)
                self.assertEqual(valid['exs'], NUM_EXS,
                                 'LOG:\n{}'.format(stdout))
                self.assertEqual(test['exs'], NUM_EXS,
                                 'LOG:\n{}'.format(stdout))
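Examples #14 and #15 sweep the same grid of thread counts and batch sizes with nested loops; itertools.product expresses the same sweep more compactly. A sketch with placeholder values for NUM_EXS, NUM_THREADS_CHOICES, and BATCHSIZE_CHOICES, which are module-level constants not shown above (numthreads only has an effect on ParlAI versions that still support Hogwild, as in the original examples):

import itertools

import parlai.utils.testing as testing_utils

# Placeholder values; the originals are defined elsewhere in the test modules.
NUM_EXS = 10
NUM_THREADS_CHOICES = [1, 2, 8]
BATCHSIZE_CHOICES = [1, 2, 3]

for nt, bs in itertools.product(NUM_THREADS_CHOICES, BATCHSIZE_CHOICES):
    valid, test = testing_utils.eval_model(
        dict(
            task='integration_tests:repeat:{}'.format(NUM_EXS),
            model='repeat_label',
            numthreads=nt,
            batchsize=bs,
        )
    )
    assert valid['exs'] == NUM_EXS and test['exs'] == NUM_EXS
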
Example #16
    def test_set_model_file_without_dict_file(self):
        """Check that moving a model without moving the dictfile raises an error."""
        # Download model, move to a new location
        datapath = ParlaiParser().parse_args(print_args=False)['datapath']
        try:
            # remove unittest models if there before
            shutil.rmtree(os.path.join(datapath, 'models/unittest'))
        except FileNotFoundError:
            pass
        testing_utils.download_unittest_models()

        zoo_path = 'zoo:unittest/seq2seq/model'
        model_path = modelzoo_path(datapath, zoo_path)
        os.remove(model_path + '.dict')
        # Test that eval model fails
        with self.assertRaises(RuntimeError):
            testing_utils.eval_model(
                dict(task='babi:task1k:1', model_file=model_path))
        try:
            # remove unittest models if there after
            shutil.rmtree(os.path.join(datapath, 'models/unittest'))
        except FileNotFoundError:
            pass
Example #17
    def test_set_model_file_without_dict_file(self):
        """
        Check that moving a model without moving the dictfile raises an error.
        """
        # Download model, move to a new location
        with testing_utils.tempdir() as datapath:
            try:
                # remove unittest models if there before
                shutil.rmtree(os.path.join(datapath, 'models/unittest'))
            except FileNotFoundError:
                pass

            zoo_path = 'zoo:unittest/seq2seq/model'
            model_path = modelzoo_path(datapath, zoo_path)
            PathManager.rm(model_path + '.dict')
            # Test that eval model fails
            with self.assertRaises(RuntimeError):
                testing_utils.eval_model(
                    dict(task='babi:task1k:1', model_file=model_path))
            try:
                # remove unittest models if there after
                shutil.rmtree(os.path.join(datapath, 'models/unittest'))
            except FileNotFoundError:
                pass
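Examples #16 and #17 wrap shutil.rmtree in try/except FileNotFoundError so a missing directory is tolerated; shutil's ignore_errors flag gives the same tolerance in one line (at the cost of also swallowing other removal errors). A sketch with a hypothetical stand-in for the ParlAI datapath:

import os
import shutil
import tempfile

datapath = tempfile.mkdtemp()  # hypothetical stand-in for the real datapath

# Equivalent in effect to the try/except FileNotFoundError pattern above.
shutil.rmtree(os.path.join(datapath, 'models/unittest'), ignore_errors=True)
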
Example #18
    def test_backwards_compatibility(self):
        valid, test = testing_utils.eval_model(
            dict(
                task='integration_tests:multiturn_candidate',
                model='seq2seq',
                model_file='zoo:unittest/seq2seq/model',
                dict_file='zoo:unittest/seq2seq/model.dict',
            ))

        self.assertLessEqual(valid['ppl'], 1.01)
        self.assertGreaterEqual(valid['accuracy'], 0.999)
        self.assertGreaterEqual(valid['f1'], 0.999)
        self.assertLessEqual(test['ppl'], 1.01)
        self.assertGreaterEqual(test['accuracy'], 0.999)
        self.assertGreaterEqual(test['f1'], 0.999)
Example #19
def _test_zoo_file(mf: str, fid: bool = False, fid_rag: bool = False):
    opt = copy.deepcopy(common_opt)
    if fid:
        opt['model'] = 'fid'
    if fid_rag:
        opt['dpr_model_file'] = RAG_TOKEN_ZOO_MODEL
    opt.update(GENERATION_OPTS['bart'])
    opt['model_file'] = mf
    opt['generation_model'] = 'bart'
    opt['task'] = 'wizard_of_wikipedia'
    opt['label_truncate'] = 10
    valid, _ = testing_utils.eval_model(opt, skip_test=True)
    assert valid['ppl'] < 15.0
    assert (100 * float(valid['f1'])) > 10.0
    torch.cuda.empty_cache()
Example #20
    def test_generator_backcomp(self):
        """
        Tests that the generator model files work over time.
        """
        testing_utils.download_unittest_models()

        stdout, valid, test = testing_utils.eval_model(
            dict(
                task='integration_tests:multipass',
                model='transformer/generator',
                model_file='zoo:unittest/transformer_generator2/model',
                dict_file='zoo:unittest/transformer_generator2/model.dict',
                rank_candidates=True,
                batch_size=64,
            ))

        self.assertGreaterEqual(
            valid['hits@1'],
            0.95,
            'valid hits@1 = {}\nLOG:\n{}'.format(valid['hits@1'], stdout),
        )
        self.assertLessEqual(
            valid['ppl'], 1.01,
            'valid ppl = {}\nLOG:\n{}'.format(valid['ppl'], stdout))
        self.assertGreaterEqual(
            valid['accuracy'],
            0.99,
            'valid accuracy = {}\nLOG:\n{}'.format(valid['accuracy'], stdout),
        )
        self.assertGreaterEqual(
            valid['f1'], 0.99,
            'valid f1 = {}\nLOG:\n{}'.format(valid['f1'], stdout))
        self.assertGreaterEqual(
            test['hits@1'],
            0.95,
            'test hits@1 = {}\nLOG:\n{}'.format(test['hits@1'], stdout),
        )
        self.assertLessEqual(
            test['ppl'], 1.01,
            'test ppl = {}\nLOG:\n{}'.format(test['ppl'], stdout))
        self.assertGreaterEqual(
            test['accuracy'],
            0.99,
            'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
        )
        self.assertGreaterEqual(
            test['f1'], 0.99,
            'test f1 = {}\nLOG:\n{}'.format(test['f1'], stdout))
Example #21
 def test_save_report(self):
     """
     Test that we can save report from eval model.
     """
     with testing_utils.tempdir() as tmpdir:
         save_report = os.path.join(tmpdir, 'report')
         opt = dict(
             task='integration_tests',
             model='repeat_label',
             datatype='valid',
             num_examples=5,
             display_examples=False,
             save_world_logs=True,
             report_filename=save_report,
         )
         valid, test = testing_utils.eval_model(opt)
Example #22
    def test_released_model(self):
        """
        Check the pretrained model produces correct results.
        """
        _, test = testing_utils.eval_model(
            {
                'model_file': 'zoo:self_feeding/hh131k_hb60k_fb60k_st1k/model',
                'task': 'self_feeding:all',
                'batchsize': 20,
            },
            skip_valid=True,
        )

        self.assertAlmostEqual(test['dia_acc'], 0.506, delta=0.001)
        self.assertAlmostEqual(test['fee_acc'], 0.744, delta=0.001)
        self.assertAlmostEqual(test['sat_f1'], 0.8343, delta=0.0001)
Example #23
 def test_beamsearch(self):
     """
     Ensures beam search can generate the correct response.
     """
     valid, test = testing_utils.eval_model(
         dict(
             task='integration_tests:multiturn_nocandidate',
             model='seq2seq',
             model_file='zoo:unittest/seq2seq/model',
             dict_file='zoo:unittest/seq2seq/model.dict',
             skip_generation=False,
             inference='beam',
             beam_size=5,
         ))
     self.assertGreater(valid['accuracy'], 0.95)
     self.assertGreater(test['accuracy'], 0.95)
Example #24
    def test_convai2_finetuned_greedy(self):
        """
        Check the greedy model produces correct results.
        """
        valid, _ = testing_utils.eval_model(
            {
                'model_file': 'zoo:controllable_dialogue/convai2_finetuned_baseline',
                'task': 'projects.controllable_dialogue.tasks.agents',
                'beam_size': 1,
                'batchsize': 64,
            },
            skip_test=True,
        )

        self.assertAlmostEqual(valid['ppl'], 22.86, delta=0.1)
        self.assertAlmostEqual(valid['f1'], 0.1702, delta=0.0002)
Example #25
    def test_greedy(self):
        """
        Test a simple multiturn task.
        """
        valid, test = testing_utils.eval_model(
            dict(
                task="integration_tests:multiturn_candidate",
                model="hred",
                model_file="zoo:unittest/hred_model/model",
                dict_file="zoo:unittest/hred_model/model.dict",
                skip_generation=False,
                batchsize=32,
            ))

        self.assertLess(valid["ppl"], 1.2)
        self.assertLess(test["ppl"], 1.2)
Example #26
    def _check_losses(
        self, opt: Opt, test_name: str, data_regression: DataRegressionFixture
    ):
        """
        Calculate and check distillation loss terms.

        Given the input opt, run eval and check each of the loss terms to make sure that
        they match what is expected.
        """
        valid, _ = testing_utils.eval_model(opt, skip_test=True)
        losses = {}
        loss_types = self.LOSS_TYPES[test_name]
        for loss_type in loss_types:
            losses[loss_type] = round_sig(valid[loss_type].value(), sig=6)
        basename = self._get_model_identifier() + '_' + test_name
        data_regression.check(losses, basename=basename)
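Example #26 rounds each loss to six significant figures with a round_sig helper defined elsewhere in the test module. A plausible stand-in implementation, shown only as a sketch (the real helper may differ):

import math

def round_sig(x: float, sig: int = 6) -> float:
    """Round x to sig significant figures."""
    if x == 0:
        return 0.0
    return round(x, sig - int(math.floor(math.log10(abs(x)))) - 1)

print(round_sig(0.0123456789, sig=6))  # 0.0123457
print(round_sig(12345.6789, sig=6))    # 12345.7
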
Example #27
    def test_transresnet(self):
        """
        Test pretrained model.
        """
        stdout, _, test = testing_utils.eval_model(MODEL_OPTIONS,
                                                   skip_valid=True)

        # Overall
        self.assertEqual(
            test['accuracy'],
            0.3667,
            'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
        )
        self.assertEqual(
            test['hits@5'],
            0.633,
            'test hits@5 = {}\nLOG:\n{}'.format(test['hits@5'], stdout),
        )
        self.assertEqual(
            test['hits@10'],
            0.767,
            'test hits@10 = {}\nLOG:\n{}'.format(test['hits@10'], stdout),
        )

        # First round
        self.assertEqual(
            test['first_round']['hits@1/100'],
            0.2,
            'test first round hits@1/100 = {}\nLOG:\n{}'.format(
                test['first_round']['hits@1/100'], stdout),
        )

        # Second round
        self.assertEqual(
            test['second_round']['hits@1/100'],
            0.5,
            'test second round hits@1/100 = {}\nLOG:\n{}'.format(
                test['second_round']['hits@1/100'], stdout),
        )

        # Third round
        self.assertEqual(
            test['third_round+']['hits@1/100'],
            0.4,
            'test third round hits@1/100 = {}\nLOG:\n{}'.format(
                test['third_round+']['hits@1/100'], stdout),
        )
Example #28
 def test_retrieval(self):
     stdout, _, test = testing_utils.eval_model(RETRIEVAL_OPTIONS)
     self.assertGreaterEqual(
         test['accuracy'],
         0.86,
         'test acc = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
     )
     self.assertGreaterEqual(
         test['hits@5'],
         0.98,
         'test hits@5 = {}\nLOG:\n{}'.format(test['hits@5'], stdout),
     )
     self.assertGreaterEqual(
         test['hits@10'],
         0.99,
         'test hits@10 = {}\nLOG:\n{}'.format(test['hits@10'], stdout),
     )
Example #29
    def test_backcomp(self):
        """
        Tests that the memnn model files continue to work over time.
        """
        valid, test = testing_utils.eval_model(
            dict(
                task='integration_tests',
                model='memnn',
                model_file='zoo:unittest/memnn/model',
                dict_file='zoo:unittest/memnn/model.dict',
                batch_size=16,
            ))

        self.assertGreaterEqual(valid['accuracy'], 0.88)
        self.assertGreaterEqual(valid['f1'], 0.999)
        self.assertGreaterEqual(test['accuracy'], 0.84)
        self.assertGreaterEqual(test['f1'], 0.999)
Example #30
    def test_generation(self):
        """
        This test uses a single-turn sequence repetition task.
        """
        valid, test = testing_utils.eval_model(
            dict(
                task='integration_tests:multiturn_nocandidate',
                model='seq2seq',
                model_file='zoo:unittest/seq2seq/model',
                dict_file='zoo:unittest/seq2seq/model.dict',
                skip_generation=False,
                inference='greedy',
                batchsize=8,
                num_examples=32,
            ))

        self.assertLess(valid['ppl'], 1.2)
        self.assertLess(test['ppl'], 1.2)