def test_learning_rate_resuming(self, args):
    mdl = args['model']
    with testing_utils.tempdir() as tmpdir:
        model_file = os.path.join(tmpdir, 'model')
        stdout1, valid1, test1 = testing_utils.train_model(
            dict(model_file=model_file, lr_scheduler='invsqrt', **args)
        )
        stdout2, valid2, test2 = testing_utils.train_model(
            dict(model_file=model_file, lr_scheduler='invsqrt', **args)
        )
        # make sure the number of updates is being tracked correctly
        self.assertGreater(
            valid2['num_updates'],
            valid1['num_updates'],
            '({}) Number of updates is not increasing'.format(mdl),
        )
        # make sure the learning rate is decreasing
        self.assertLess(
            valid2['lr'],
            valid1['lr'],
            '({}) Learning rate is not decreasing'.format(mdl),
        )
        # but make sure we're not loading the scheduler if we're fine-tuning
        stdout3, valid3, test3 = testing_utils.train_model(
            dict(
                init_model=os.path.join(tmpdir, 'model'),
                model_file=os.path.join(tmpdir, 'newmodel'),
                lr_scheduler='invsqrt',
                **args,
            )
        )
        self.assertEqual(
            valid3['num_updates'],
            valid1['num_updates'],
            '({}) Finetuning LR scheduler reset failed (num_updates).'.format(mdl),
        )
        self.assertEqual(
            valid3['lr'],
            valid1['lr'],
            '({}) Finetuning LR scheduler reset failed (lr).'.format(mdl),
        )
        # and make sure we're not loading the scheduler if it changes
        stdout4, valid4, test4 = testing_utils.train_model(
            dict(
                init_model=os.path.join(tmpdir, 'model'),
                model_file=os.path.join(tmpdir, 'newmodel2'),
                lr_scheduler='reduceonplateau',
                **args,
            )
        )
        self.assertEqual(
            valid4['num_updates'],
            valid1['num_updates'],
            '({}) LR scheduler change reset failed (num_updates).'
            '\n{}'.format(mdl, stdout4),
        )
        self.assertEqual(
            valid4['lr'],
            1e-3,
            '({}) LR is not correct in final resume.\n{}'.format(mdl, stdout4),
        )
def test_resuming(self):
    with testing_utils.tempdir() as tmpdir:
        model_file = os.path.join(tmpdir, 'model')
        stdout1, valid1, test1 = testing_utils.train_model(
            dict(
                model_file=model_file,
                task='integration_tests:candidate',
                model='transformer/ranker',
                optimizer='adamax',
                learningrate=7e-3,
                batchsize=32,
                num_epochs=1,
                n_layers=1,
                n_heads=1,
                ffn_size=32,
                embedding_size=32,
                warmup_updates=1,
                lr_scheduler='invsqrt',
            )
        )
        stdout2, valid2, test2 = testing_utils.train_model(
            dict(
                model_file=model_file,
                task='integration_tests:candidate',
                model='transformer/ranker',
                num_epochs=1,
            )
        )
        # make sure the number of updates is being tracked correctly
        self.assertGreater(
            valid2['num_updates'],
            valid1['num_updates'],
            'Number of updates is not increasing',
        )
        # make sure the learning rate is decreasing
        self.assertLess(
            valid2['lr'], valid1['lr'], 'Learning rate is not decreasing'
        )
def test_generation(self):
    """This test uses a single-turn sequence repetition task."""
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='integration_tests:nocandidate',
            model='seq2seq',
            learningrate=LR,
            batchsize=BATCH_SIZE,
            num_epochs=NUM_EPOCHS,
            numthreads=1,
            no_cuda=True,
            embeddingsize=16,
            hiddensize=16,
            rnn_class='gru',
            attention='general',
            gradient_clip=1.0,
            dropout=0.0,
            lookuptable='all',
        )
    )
    self.assertTrue(
        valid['ppl'] < 1.2,
        "valid ppl = {}\nLOG:\n{}".format(valid['ppl'], stdout),
    )
    self.assertTrue(
        test['ppl'] < 1.2,
        "test ppl = {}\nLOG:\n{}".format(test['ppl'], stdout),
    )
def test_beamsearch(self):
    """Ensures beam search can generate the correct response."""
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='integration_tests:nocandidate',
            model='seq2seq',
            learningrate=LR,
            batchsize=BATCH_SIZE,
            num_epochs=NUM_EPOCHS,
            numthreads=1,
            no_cuda=True,
            embeddingsize=16,
            hiddensize=16,
            rnn_class='gru',
            attention='general',
            gradient_clip=1.0,
            dropout=0.0,
            lookuptable='all',
            inference='beam',
            beam_size=4,
        )
    )
    self.assertTrue(
        valid['bleu'] > 0.95,
        "valid bleu = {}\nLOG:\n{}".format(valid['bleu'], stdout),
    )
    self.assertTrue(
        test['bleu'] > 0.95,
        "test bleu = {}\nLOG:\n{}".format(test['bleu'], stdout),
    )
    self.assertTrue(
        valid['ppl'] < 1.2,
        "valid ppl = {}\nLOG:\n{}".format(valid['ppl'], stdout),
    )
    self.assertTrue(
        test['ppl'] < 1.2,
        "test ppl = {}\nLOG:\n{}".format(test['ppl'], stdout),
    )
def test_labelcands_nomemnn(self):
    """This test uses a single-turn task, so doesn't test memories."""
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='integration_tests:candidate',
            model='memnn',
            lr=LR,
            batchsize=BATCH_SIZE,
            num_epochs=NUM_EPOCHS,
            numthreads=1,
            no_cuda=True,
            embedding_size=32,
            gradient_clip=1.0,
            hops=1,
            position_encoding=True,
            use_time_features=False,
            memsize=0,
            rank_candidates=True,
        )
    )
    self.assertTrue(
        valid['hits@1'] > 0.95,
        "valid hits@1 = {}\nLOG:\n{}".format(valid['hits@1'], stdout),
    )
    self.assertTrue(
        test['hits@1'] > 0.95,
        "test hits@1 = {}\nLOG:\n{}".format(test['hits@1'], stdout),
    )
def test_hogwild_train(self):
    """Test the trainer eval with numthreads > 1 and batchsize in [1, 2, 3]."""
    opt = dict(
        task='tasks.repeat:RepeatTeacher:{}'.format(1),
        evaltask='tasks.repeat:RepeatTeacher:{}'.format(NUM_EXS),
        model='repeat_label',
        display_examples=False,
        num_epochs=10,
    )
    for nt in NUM_THREADS_CHOICES:
        for bs in BATCHSIZE_CHOICES:
            opt['num_threads'] = nt
            opt['batchsize'] = bs
            stdout, valid, test = testing_utils.train_model(opt)
            self.assertEqual(
                valid['exs'],
                NUM_EXS,
                'LOG:\n{}'.format(stdout),
            )
            self.assertEqual(
                test['exs'],
                NUM_EXS,
                'LOG:\n{}'.format(stdout),
            )
def test_alt_reduction(self):
    """Test a transformer ranker reduction method other than `mean`."""
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='integration_tests:candidate',
            model='transformer/ranker',
            optimizer='adamax',
            learningrate=7e-3,
            batchsize=16,
            validation_every_n_epochs=5,
            validation_patience=2,
            n_layers=1,
            n_heads=4,
            ffn_size=64,
            embedding_size=32,
            candidates='batch',
            eval_candidates='inline',
            gradient_clip=0.5,
            variant='xlm',
            activation='gelu',
            reduction_type='first',  # this is really what we're trying to test for
        )
    )
    self.assertGreaterEqual(
        valid['hits@1'],
        0.90,
        "valid hits@1 = {}\nLOG:\n{}".format(valid['hits@1'], stdout),
    )
    self.assertGreaterEqual(
        test['hits@1'],
        0.90,
        "test hits@1 = {}\nLOG:\n{}".format(test['hits@1'], stdout),
    )
def test_repeater(self):
    """Test a basic transformer ranker reaches high hits@1 on the candidate task."""
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='integration_tests:candidate',
            model='transformer/ranker',
            optimizer='adamax',
            learningrate=7e-3,
            batchsize=16,
            validation_every_n_epochs=5,
            validation_patience=2,
            n_layers=1,
            n_heads=4,
            ffn_size=64,
            embedding_size=32,
            candidates='batch',
            eval_candidates='inline',
            gradient_clip=0.5,
        )
    )
    self.assertGreaterEqual(
        valid['hits@1'],
        0.90,
        "valid hits@1 = {}\nLOG:\n{}".format(valid['hits@1'], stdout),
    )
    self.assertGreaterEqual(
        test['hits@1'],
        0.90,
        "test hits@1 = {}\nLOG:\n{}".format(test['hits@1'], stdout),
    )
def test_xlm(self):
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='integration_tests:nocandidate',
            model='transformer/generator',
            optimizer='adamax',
            learningrate=7e-3,
            batchsize=32,
            num_epochs=20,
            n_layers=1,
            n_heads=1,
            ffn_size=32,
            embedding_size=32,
            beam_size=1,
            variant='xlm',
            activation='gelu',
            n_segments=8,  # doesn't do anything but still good to test
        )
    )
    self.assertLessEqual(
        valid['ppl'],
        1.30,
        "valid ppl = {}\nLOG:\n{}".format(valid['ppl'], stdout),
    )
    self.assertGreaterEqual(
        valid['bleu'],
        0.90,
        "valid bleu = {}\nLOG:\n{}".format(valid['bleu'], stdout),
    )
    self.assertLessEqual(
        test['ppl'],
        1.30,
        "test ppl = {}\nLOG:\n{}".format(test['ppl'], stdout),
    )
    self.assertGreaterEqual(
        test['bleu'],
        0.90,
        "test bleu = {}\nLOG:\n{}".format(test['bleu'], stdout),
    )
def test_beamsearch(self):
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='integration_tests:nocandidate',
            model='transformer/generator',
            optimizer='adamax',
            learningrate=7e-3,
            batchsize=32,
            num_epochs=20,
            n_layers=1,
            n_heads=1,
            ffn_size=32,
            embedding_size=32,
            beam_size=5,
        )
    )
    self.assertLessEqual(
        valid['ppl'],
        1.20,
        "valid ppl = {}\nLOG:\n{}".format(valid['ppl'], stdout),
    )
    self.assertGreaterEqual(
        valid['bleu'],
        0.95,
        "valid bleu = {}\nLOG:\n{}".format(valid['bleu'], stdout),
    )
    self.assertLessEqual(
        test['ppl'],
        1.20,
        "test ppl = {}\nLOG:\n{}".format(test['ppl'], stdout),
    )
    self.assertGreaterEqual(
        test['bleu'],
        0.95,
        "test bleu = {}\nLOG:\n{}".format(test['bleu'], stdout),
    )
def test_labelcands_multi(self):
    """This test uses a multi-turn task and multithreading."""
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='integration_tests:multiturn_candidate',
            model='memnn',
            lr=LR,
            batchsize=BATCH_SIZE,
            num_epochs=NUM_EPOCHS * 3,
            numthreads=4,
            no_cuda=True,
            embedding_size=32,
            gradient_clip=1.0,
            hops=2,
            position_encoding=False,
            use_time_features=True,
            memsize=5,
            rank_candidates=True,
        )
    )
    self.assertTrue(
        valid['hits@1'] > 0.95,
        "valid hits@1 = {}\nLOG:\n{}".format(valid['hits@1'], stdout),
    )
    self.assertTrue(
        test['hits@1'] > 0.95,
        "test hits@1 = {}\nLOG:\n{}".format(test['hits@1'], stdout),
    )
def test_crossencoder(self):
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='convai2',
            model='bert_ranker/cross_encoder_ranker',
            num_epochs=0.002,
            batchsize=1,
            candidates="inline",
            type_optimization="all_encoder_layers",
            warmup_updates=100,
            text_truncate=32,
            label_truncate=32,
            validation_max_exs=20,
            short_final_eval=True,
        )
    )
    # The cross encoder reaches an interesting state MUCH faster.
    # Accuracy should be present and somewhere between 0.03 and 0.8
    # (a large interval so that the test doesn't flake).
    self.assertGreaterEqual(
        test['accuracy'],
        0.03,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertLessEqual(
        test['accuracy'],
        0.8,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
def test_pyt_batchsort_train(self):
    """
    Tests the functionality of training with batchsort under the
    following conditions:

    1. -dt train --pytorch_preprocess False
    2. -dt train:stream --pytorch_preprocess False
    3. -dt train --pytorch_preprocess True --batch_sort_field text_vec
    """
    # check that training works for each combination of datatype and
    # preprocessing
    dt_and_preprocess = [
        ('train', False),
        ('train:stream', False),
        ('train', True),
    ]
    for dt, preprocess in dt_and_preprocess:
        defaults = parser_defaults.copy()
        defaults['datatype'] = dt
        defaults['pytorch_preprocess'] = preprocess
        defaults['pytorch_teacher_batch_sort'] = True
        defaults['batchsize'] = 32
        if preprocess:
            defaults['batch_sort_field'] = 'text_vec'
        str_output, _, _ = testing_utils.train_model(defaults)
        self.assertTrue(
            solved_task(str_output),
            'Teacher could not teach seq2seq with batch sort '
            'and args {} and output {}'.format((dt, preprocess), str_output),
        )
def test_multitasking_metrics(self):
    stdout, valid, test = testing_utils.train_model(
        {
            'task': 'integration_tests:candidate,'
            'integration_tests:multiturnCandidate',
            'model': 'random_candidate',
            'num_epochs': 0.5,
            'aggregate_micro': True,
        }
    )
    task1_acc = valid['tasks']['integration_tests:candidate']['accuracy']
    task2_acc = valid['tasks']['integration_tests:multiturnCandidate']['accuracy']
    total_acc = valid['accuracy']
    # task 2 is 4 times the size of task 1
    self.assertAlmostEqual(
        total_acc,
        (task1_acc + 4 * task2_acc) / 5,
        4,
        'Task accuracy is averaged incorrectly',
    )

    stdout, valid, test = testing_utils.train_model(
        {
            'task': 'integration_tests:candidate,'
            'integration_tests:multiturnCandidate',
            'model': 'random_candidate',
            'num_epochs': 0.5,
            'aggregate_micro': False,
        }
    )
    task1_acc = valid['tasks']['integration_tests:candidate']['accuracy']
    task2_acc = valid['tasks']['integration_tests:multiturnCandidate']['accuracy']
    total_acc = valid['accuracy']
    # metrics should be averaged equally across tasks
    self.assertAlmostEqual(
        total_acc,
        (task1_acc + task2_acc) / 2,
        4,
        'Task accuracy is averaged incorrectly',
    )
def test_fast_final_eval(self):
    stdout, valid, test = testing_utils.train_model(
        {
            'task': 'integration_tests',
            'validation_max_exs': 10,
            'model': 'repeat_label',
            'short_final_eval': True,
            'num_epochs': 1.0,
        }
    )
    self.assertEqual(valid['exs'], 10, 'Validation exs is wrong')
    self.assertEqual(test['exs'], 10, 'Test exs is wrong')
def test_train_batch_all(self):
    args = self._get_args()
    args['candidates'] = 'batch-all-cands'
    stdout, valid, test = testing_utils.train_model(args)
    threshold = self._get_threshold()
    self.assertGreaterEqual(
        valid['hits@1'],
        threshold,
        "valid hits@1 = {}\nLOG:\n{}".format(valid['hits@1'], stdout),
    )
def test_topk(self):
    """Test topk generation."""
    # Topk is inherently stochastic, just ensure no crash.
    testing_utils.train_model(
        dict(
            task='integration_tests:nocandidate',
            model='transformer/generator',
            optimizer='adamax',
            learningrate=7e-3,
            batchsize=32,
            num_epochs=20,
            n_layers=1,
            n_heads=1,
            ffn_size=32,
            embedding_size=32,
            inference='topk',
            topk=5,
            beam_size=5,
        )
    )
def test_eval_inline(self):
    args = self._get_args()
    args['eval_candidates'] = 'inline'
    stdout, valid, test = testing_utils.train_model(args)
    threshold = self._get_threshold()
    self.assertGreaterEqual(
        valid['hits@1'],
        threshold,
        "valid hits@1 = {}\nLOG:\n{}".format(valid['hits@1'], stdout),
    )
def test_train_fixed(self):
    args = self._get_args()
    args['candidates'] = 'fixed'
    args['encode_candidate_vecs'] = False
    stdout, valid, test = testing_utils.train_model(args)
    threshold = self._get_threshold()
    self.assertGreaterEqual(
        valid['hits@1'],
        threshold,
        "valid hits@1 = {}\nLOG:\n{}".format(valid['hits@1'], stdout),
    )
def test_eval_vocab(self):
    args = self._get_args()
    args['eval_candidates'] = 'vocab'
    args['encode_candidate_vecs'] = True
    stdout, valid, test = testing_utils.train_model(args)
    # accuracy should be zero, since none of the vocab candidates is the
    # correct label
    self.assertEqual(
        valid['hits@100'],
        0,
        "valid hits@100 = {}\nLOG:\n{}".format(valid['hits@100'], stdout),
    )
def test_resuming_reduce_on_plateau(self):
    """
    Reduce on Plateau can be tricky when combined with warmup.

    See: https://github.com/facebookresearch/ParlAI/pull/1812
    """
    with testing_utils.tempdir() as tmpdir:
        model_file = os.path.join(tmpdir, 'model')
        stdout1, valid1, test1 = testing_utils.train_model(
            dict(
                model_file=model_file,
                task='integration_tests:candidate',
                model='transformer/ranker',
                optimizer='adamax',
                learningrate=7e-3,
                batchsize=32,
                num_epochs=1,
                n_layers=1,
                n_heads=1,
                ffn_size=32,
                embedding_size=32,
                warmup_updates=1,
                lr_scheduler='reduceonplateau',
            )
        )
        stdout2, valid2, test2 = testing_utils.train_model(
            dict(
                model_file=model_file,
                task='integration_tests:candidate',
                model='transformer/ranker',
                num_epochs=1,
                lr_scheduler='reduceonplateau',
            )
        )
        # make sure the learning rate was not reset to a tiny value on resume
        self.assertGreater(
            valid2['lr'],
            1e-5,
            'Learning rate should not be that low when resuming',
        )
def test_eval_fixed(self):
    args = self._get_args()
    args['eval_candidates'] = 'fixed'
    args['encode_candidate_vecs'] = True
    args['ignore_bad_candidates'] = True
    stdout, valid, test = testing_utils.train_model(args)

    # none of the train candidates appear in evaluation, so we should have
    # zero accuracy: this tests whether the fixed candidates were built
    # properly (i.e., only using candidates from the train set)
    self.assertEqual(
        valid['hits@1'],
        0,
        "valid hits@1 = {}\nLOG:\n{}".format(valid['hits@1'], stdout),
    )

    # now try again with a fixed candidate file that includes all possible
    # candidates
    teacher = CandidateTeacher({'datatype': 'train'})
    all_cands = teacher.train + teacher.val + teacher.test
    all_cands_str = '\n'.join([' '.join(x) for x in all_cands])

    with testing_utils.tempdir() as tmpdir:
        tmp_cands_file = os.path.join(tmpdir, 'all_cands.text')
        with open(tmp_cands_file, 'w') as f:
            f.write(all_cands_str)
        args['fixed_candidates_path'] = tmp_cands_file
        args['encode_candidate_vecs'] = False  # don't encode before training
        args['ignore_bad_candidates'] = False
        args['num_epochs'] = 20
        stdout, valid, test = testing_utils.train_model(args)
        self.assertGreaterEqual(
            valid['hits@100'],
            0.1,
            "valid hits@100 = {}\nLOG:\n{}".format(valid['hits@100'], stdout),
        )
def test_badinput(self):
    """Ensures model doesn't crash on malformed inputs."""
    stdout, _, _ = testing_utils.train_model(
        dict(
            task='integration_tests:bad_example',
            model='seq2seq',
            learningrate=LR,
            batchsize=10,
            datatype='train:ordered:stream',
            num_epochs=1,
            numthreads=1,
            no_cuda=True,
            embeddingsize=16,
            hiddensize=16,
        )
    )
    self.assertIn('valid:{', stdout)
    self.assertIn('test:{', stdout)
def test_pyt_preprocess_train(self):
    """
    Test that the preprocess functionality works with the
    PytorchDataTeacher with a sample TorchAgent (here, the Seq2seq model).

    This tests whether an agent can train to completion with these
    preprocessed examples.
    """
    defaults = integration_test_parser_defaults.copy()
    defaults['datatype'] = 'train'
    defaults['pytorch_preprocess'] = True
    str_output, valid, test = testing_utils.train_model(defaults)
    self.assertTrue(
        solved_task(str_output, valid, test),
        'Teacher could not teach seq2seq with preprocessed obs, output: {}'.format(
            str_output
        ),
    )
def _pyt_train(self, datatype):
    """
    Integration test: ensure that the PytorchDataTeacher can successfully
    teach the Seq2Seq model to fully solve the babi:task10k:1 task.

    The Seq2Seq model can solve babi:task10k:1 with the normal ParlAI
    setup, and thus should be able to do so with a PytorchDataTeacher as
    well.
    """
    defaults = integration_test_parser_defaults.copy()
    defaults['datatype'] = datatype
    defaults['shuffle'] = True  # for train:stream
    str_output, valid, test = testing_utils.train_model(defaults)
    self.assertTrue(
        solved_task(str_output, valid, test),
        'Teacher could not teach seq2seq with args: {}; here is str_output: {}'.format(
            defaults, str_output
        ),
    )
def test_biencoder(self):
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='convai2:LimitedSelfOriginalTeacher',
            model='bert_ranker/bi_encoder_ranker',
            num_epochs=1.0,
            batchsize=8,
            text_truncate=32,
        )
    )
    # we can't conclude much from the bi-encoder after so few iterations:
    # it's still basically a random classifier, so accuracy should be
    # present and somewhere between 0.01 and 0.2
    self.assertGreaterEqual(
        test['accuracy'],
        0.01,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertLessEqual(
        test['accuracy'],
        0.2,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
def test_biencoder(self):
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='convai2',
            model='bert_ranker/bi_encoder_ranker',
            num_epochs=0.1,
            batchsize=8,
            learningrate=3e-4,
            text_truncate=32,
            validation_max_exs=20,
            short_final_eval=True,
        )
    )
    # we can't conclude much from the bi-encoder after so few iterations;
    # this test just makes sure it hasn't crashed and the accuracy isn't
    # too high
    self.assertLessEqual(
        test['accuracy'],
        0.5,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
def test_generation(self):
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='integration_tests:nocandidate',
            model='fairseq',
            arch='lstm_wiseman_iwslt_de_en',
            lr=LR,
            batchsize=BATCH_SIZE,
            num_epochs=NUM_EPOCHS,
            rank_candidates=False,
            skip_generation=False,
        )
    )
    self.assertTrue(
        valid['ppl'] < 1.2,
        "valid ppl = {}\nLOG:\n{}".format(valid['ppl'], stdout),
    )
    self.assertTrue(
        test['ppl'] < 1.2,
        "test ppl = {}\nLOG:\n{}".format(test['ppl'], stdout),
    )
def test_labelcands(self):
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='integration_tests:candidate',
            model='fairseq',
            arch='lstm_wiseman_iwslt_de_en',
            lr=LR,
            batchsize=BATCH_SIZE,
            num_epochs=NUM_EPOCHS,
            rank_candidates=True,
            skip_generation=True,
        )
    )
    self.assertTrue(
        valid['hits@1'] > 0.95,
        "valid hits@1 = {}\nLOG:\n{}".format(valid['hits@1'], stdout),
    )
    self.assertTrue(
        test['hits@1'] > 0.95,
        "test hits@1 = {}\nLOG:\n{}".format(test['hits@1'], stdout),
    )
def test_biencoder(self):
    stdout, valid, test = testing_utils.train_model(
        dict(
            task='convai2',
            model='bert_ranker/bi_encoder_ranker',
            num_epochs=0.1,
            batchsize=8,
            learningrate=3e-4,
            text_truncate=32,
            validation_max_exs=20,
            short_final_eval=True,
        )
    )
    # we can't conclude much from the bi-encoder after so few iterations:
    # it's still basically a random classifier, so accuracy should be
    # present and somewhere between 0.01 and 0.5
    self.assertGreaterEqual(
        test['accuracy'],
        0.01,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )
    self.assertLessEqual(
        test['accuracy'],
        0.5,
        'test accuracy = {}\nLOG:\n{}'.format(test['accuracy'], stdout),
    )