def test_sort(self):
    """Batches built with sort_key="tokens" come out sorted by token length."""
    data = Data(
        self.data_source,
        self.tensorizers,
        Batcher(train_batch_size=5),
        sort_key="tokens",
    )

    def check_descending(tensor_batch):
        # seq_lens is the middle element of the "tokens" tensor tuple.
        _, lengths, _ = tensor_batch["tokens"]
        lengths = lengths.tolist()
        for left, right in zip(lengths, lengths[1:]):
            self.assertTrue(left >= right)

    batch_iter = iter(list(data.batches(Stage.TRAIN)))

    raw_one, tensors_one = next(batch_iter)
    check_descending(tensors_one)
    # Labels must have been permuted together with the sorted tokens.
    self.assertEqual(
        self.tensorizers["labels"].vocab[tensors_one["labels"][1]],
        "alarm/set_alarm",
    )
    self.assertEqual(raw_one[1][RawExampleFieldName.ROW_INDEX], 1)

    raw_two, tensors_two = next(batch_iter)
    check_descending(tensors_two)
    self.assertEqual(
        self.tensorizers["labels"].vocab[tensors_two["labels"][1]],
        "alarm/time_left_on_alarm",
    )
    self.assertEqual(raw_two[0][RawExampleFieldName.ROW_INDEX], 6)
    self.assertEqual(raw_two[1][RawExampleFieldName.ROW_INDEX], 5)
def test_create_data_no_batcher_provided(self):
    """Data falls back to a default batcher when none is given."""
    data = Data(self.data_source, self.tensorizers)
    all_batches = list(data.batches(Stage.TRAIN))
    # At least one non-empty batch should have been produced.
    self.assertTrue(all_batches)
    _raw, tensor_dict = all_batches[0]
    self.assertTrue(tensor_dict)
def test_create_batches_different_tensorizers(self):
    """The tensorizer dict passed in determines which keys each batch has."""
    only_tokens = {"tokens": WordTensorizer(column="text")}
    data = Data(self.data_source, only_tokens, Batcher(train_batch_size=16))
    all_batches = list(data.batches(Stage.TRAIN))
    self.assertEqual(1, len(all_batches))
    batch = all_batches[0]
    self.assertEqual({"tokens"}, set(batch))
    token_tensor, seq_lens = batch["tokens"]
    self.assertEqual((10,), seq_lens.size())
    self.assertEqual(10, len(token_tensor))
def test_create_batches(self):
    """A batch size of 16 packs all 10 examples into a single batch."""
    data = Data(self.data_source, self.tensorizers, Batcher(train_batch_size=16))
    all_batches = list(data.batches(Stage.TRAIN))
    self.assertEqual(1, len(all_batches))
    batch = all_batches[0]
    self.assertEqual(set(self.tensorizers), set(batch))
    token_tensor, seq_lens = batch["tokens"]
    self.assertEqual((10,), seq_lens.size())
    self.assertEqual((10,), batch["labels"].size())
    self.assertEqual({"tokens", "labels"}, set(batch))
    self.assertEqual(10, len(token_tensor))
def test_create_batches(self):
    """RawBatcher yields (raw examples, tensor dict) pairs per batch."""
    data = Data(self.data_source, self.tensorizers, RawBatcher(batch_size=16))
    all_batches = list(data.batches(Stage.TRAIN))
    self.assertEqual(1, len(all_batches))
    raw_examples, tensor_dict = all_batches[0]
    self.assertEqual(set(self.tensorizers), set(tensor_dict))
    _tokens, seq_lens = tensor_dict["tokens"]
    self.assertEqual((10,), seq_lens.size())
    self.assertEqual((10,), tensor_dict["labels"].size())
    self.assertEqual(10, len(raw_examples))
    first_example = next(iter(raw_examples))
    self.assertEqual({"text", "label"}, set(first_example))
def test_data_iterate_multiple_times(self):
    """batches() returns a re-iterable object; repeated passes match."""
    data = Data(self.data_source, self.tensorizers)
    batch_source = data.batches(Stage.TRAIN)
    first_pass = list(batch_source)
    second_pass = list(batch_source)
    # Both passes should have produced at least one non-empty batch.
    self.assertTrue(first_pass)
    self.assertTrue(second_pass)
    _, (tensors_a, _) = first_pass[0]
    _, (tensors_b, _) = second_pass[0]
    # pytorch tensors don't have equals comparisons, so comparing the tensor
    # dicts is non-trivial, but they should also be equal
    self.assertEqual(tensors_a, tensors_b)
def test_create_batches(self):
    """Each batch pairs raw rows (carrying a row index) with tensor data."""
    data = Data(self.data_source, self.tensorizers, Batcher(train_batch_size=16))
    all_batches = list(data.batches(Stage.TRAIN))
    self.assertEqual(1, len(all_batches))
    raw_rows, tensor_dict = all_batches[0]
    self.assertEqual(set(self.tensorizers), set(tensor_dict))
    token_tensor, seq_lens, _ = tensor_dict["tokens"]
    self.assertEqual(10, len(raw_rows))
    self.assertEqual(
        {"text", "label", RawExampleFieldName.ROW_INDEX}, set(raw_rows[0])
    )
    self.assertEqual((10,), seq_lens.size())
    self.assertEqual((10,), tensor_dict["labels"].size())
    self.assertEqual({"tokens", "labels"}, set(tensor_dict))
    self.assertEqual(10, len(token_tensor))
def _get_config_with_export_list(
    self,
    task_class: Type[NewTask],
    model_class: Type[Model],
    test_file_metadata: TestFileMetadata,
) -> PyTextConfig:
    """Build a one-epoch PyTextConfig that also requests TorchScript export."""
    # The same file is reused for train/eval/test splits.
    source_config = TSVDataSource.Config(
        train_filename=test_file_metadata.filename,
        eval_filename=test_file_metadata.filename,
        test_filename=test_file_metadata.filename,
        field_names=test_file_metadata.field_names,
    )
    dense_config = FloatListTensorizer.Config(
        column=test_file_metadata.dense_col_name,
        error_check=True,
        dim=test_file_metadata.dense_feat_dim,
    )
    task_config = task_class.Config(
        data=Data.Config(
            source=source_config,
            batcher=PoolingBatcher.Config(train_batch_size=1, test_batch_size=1),
        ),
        trainer=TaskTrainer.Config(epochs=1),
        model=model_class.Config(
            inputs=type(model_class.Config.inputs)(dense=dense_config)
        ),
    )
    return PyTextConfig(
        task=task_config,
        use_tensorboard=False,
        use_cuda_if_available=False,
        export=ExportConfig(export_torchscript_path="/tmp/model_torchscript.pt"),
        version=LATEST_VERSION,
    )
def _get_pytext_config(
    self,
    test_file_name: TestFileName,
    task_class: Type[NewTask],
    model_class: Type[Model],
) -> PyTextConfig:
    """Build a minimal one-epoch PyTextConfig for the given test file."""
    meta = get_test_file_metadata(test_file_name)
    source_config = TSVDataSource.Config(
        train_filename=meta.filename,
        eval_filename=meta.filename,
        test_filename=meta.filename,
        field_names=meta.field_names,
    )
    dense_config = FloatListTensorizer.Config(
        column=meta.dense_col_name,
        dim=meta.dense_feat_dim,
    )
    return PyTextConfig(
        task=task_class.Config(
            data=Data.Config(
                source=source_config,
                # Use Batcher to avoid shuffling.
                batcher=Batcher.Config(),
            ),
            trainer=TaskTrainer.Config(epochs=1),
            model=model_class.Config(
                inputs=type(model_class.Config.inputs)(dense=dense_config)
            ),
        ),
        use_tensorboard=False,
        use_cuda_if_available=False,
        version=LATEST_VERSION,
    )
def test_batch_predict_caffe2_model(self):
    """An exported caffe2 model yields one prediction per test-set row."""
    with tempfile.NamedTemporaryFile() as snapshot_file, \
            tempfile.NamedTemporaryFile() as caffe2_model_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        test_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=snapshot_file.name,
            export_caffe2_path=caffe2_model_file.name,
        )
        task = create_task(config.task)
        task.export(task.model, caffe2_model_file.name)
        trained_model = task.model
        save(config, trained_model, meta=None, tensorizers=task.data.tensorizers)
        predictions = batch_predict_caffe2_model(
            snapshot_file.name, caffe2_model_file.name
        )
        self.assertEqual(4, len(predictions))
def test_load_saved_model(self):
    """A model saved to a snapshot loads back with identical config/weights."""
    with tempfile.NamedTemporaryFile() as snapshot_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        data_config = Data.Config(
            source=TSVDataSource.Config(
                train_filename=train_data,
                eval_filename=eval_data,
                field_names=["label", "slots", "text"],
            )
        )
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(data=data_config),
            version=LATEST_VERSION,
            save_snapshot_path=snapshot_file.name,
        )
        task = create_task(config.task)
        model = task.model
        save(config, model, meta=None, tensorizers=task.data.tensorizers)

        restored_task, restored_config = load(snapshot_file.name)
        self.assertEqual(config, restored_config)
        self.assertModulesEqual(model, restored_task.model)

        # Both copies must agree on a forward pass in eval mode.
        model.eval()
        restored_task.model.eval()
        inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
        self.assertEqual(
            model(*inputs).tolist(), restored_task.model(*inputs).tolist()
        )
def test_batch_predict_caffe2_model(self):
    """Caffe2 batch prediction matches PyTorch scores at every cache size."""
    with tempfile.NamedTemporaryFile() as snapshot_file, \
            tempfile.NamedTemporaryFile() as caffe2_model_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        model_inputs = DocModel.Config.ModelInput(
            tokens=TokenTensorizer.Config(),
            dense=FloatListTensorizer.Config(
                column="dense", dim=1, error_check=True
            ),
            labels=LabelTensorizer.Config(),
        )
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                model=DocModel.Config(inputs=model_inputs),
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        test_filename=eval_data,
                        field_names=["label", "slots", "text", "dense"],
                    )
                ),
            ),
            version=21,
            save_snapshot_path=snapshot_file.name,
            export_caffe2_path=caffe2_model_file.name,
        )
        task = create_task(config.task)
        task.export(task.model, caffe2_model_file.name)
        model = task.model
        save(config, model, meta=None, tensorizers=task.data.tensorizers)
        pt_results = task.predict(task.data.data_source.test)

        def assert_caffe2_results_correct(caffe2_results):
            # Each caffe2 result must score-match the PyTorch prediction.
            for pt_res, res in zip(pt_results, caffe2_results):
                np.testing.assert_array_almost_equal(
                    pt_res["score"].tolist()[0],
                    [score[0] for score in res.values()],
                )

        # Exercise the default cache, a small bounded cache, and no cache.
        for cache_kwargs in ({}, {"cache_size": 2}, {"cache_size": -1}):
            results = batch_predict_caffe2_model(
                snapshot_file.name, caffe2_model_file.name, **cache_kwargs
            )
            self.assertEqual(4, len(results))
            assert_caffe2_results_correct(results)
def test_load_checkpoint(self):
    """Saving then loading a checkpoint restores optimizer and scheduler."""
    with tempfile.NamedTemporaryFile() as checkpoint_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=checkpoint_file.name,
        )
        task = create_task(config.task)
        model = task.model
        # test checkpoint saving and loading
        optimizer = create_optimizer(Adam.Config(), model)
        scheduler = create_scheduler(Scheduler.Config(), optimizer)
        training_state = TrainingState(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            start_time=0,
            epoch=0,
            rank=0,
            stage=Stage.TRAIN,
            epochs_since_last_improvement=0,
            best_model_state=None,
            best_model_metric=None,
            tensorizers=None,
        )
        checkpoint_path = checkpoint_file.name
        # NOTE(review): sibling tests pass an identifier string as the last
        # argument to save(); this one passes the file object — confirm
        # save() accepts both forms.
        save(
            config,
            model,
            None,
            task.data.tensorizers,
            training_state,
            checkpoint_file,
        )
        task_restored, config_restored, training_state_restored = load(
            checkpoint_path
        )
        optimizer_restored = training_state_restored.optimizer
        scheduler_restored = training_state_restored.scheduler
        self.assertOptimizerEqual(optimizer, optimizer_restored)
        # Bug fix: unittest defines assertIsNotNone, not assertNotNone.
        self.assertIsNotNone(scheduler_restored)
        self.assertEqual(config, config_restored)
        self.assertModulesEqual(model, task_restored.model)
        model.eval()
        task_restored.model.eval()
        inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
        self.assertEqual(
            model(*inputs).tolist(), task_restored.model(*inputs).tolist()
        )
def test_data_initializes_tensorsizers(self):
    """Constructing Data initializes each tensorizer's vocab from the source."""
    tensorizers = {
        "tokens": TokenTensorizer(text_column="text"),
        "labels": LabelTensorizer(label_column="label"),
    }
    # Bug fix: use a real assertion method instead of a bare `assert`,
    # which is silently stripped when tests run under `python -O`.
    self.assertIsNone(tensorizers["tokens"].vocab)
    Data(self.data_source, tensorizers)
    # Tensorizers should have been initialized
    self.assertEqual(49, len(tensorizers["tokens"].vocab))
    self.assertEqual(7, len(tensorizers["labels"].vocab))
def test_sort(self):
    """With sort_key="tokens", a batch is ordered by descending length."""
    data = Data(
        self.data_source,
        self.tensorizers,
        Batcher(train_batch_size=16),
        sort_key="tokens",
    )
    batch = list(data.batches(Stage.TRAIN))[0]
    _, lengths, _ = batch["tokens"]
    lengths = lengths.tolist()
    # Every adjacent pair must be non-increasing.
    for left, right in zip(lengths, lengths[1:]):
        self.assertTrue(left >= right)
    # make sure labels are also in the same order of sorted tokens
    label_vocab = self.tensorizers["labels"].vocab
    self.assertEqual(label_vocab[batch["labels"][1]], "reminder/set_reminder")
    self.assertEqual(label_vocab[batch["labels"][8]], "alarm/snooze_alarm")
def test_data_initializes_tensorsizers(self):
    """Constructing Data populates each tensorizer's vocabulary."""
    tensorizers = {
        "tokens": WordTensorizer(column="text"),
        "labels": LabelTensorizer(column="label"),
    }
    # Before Data() runs, the vocab attribute should not even exist yet.
    with self.assertRaises(AttributeError):
        tensorizers["tokens"].vocab
    Data(self.data_source, tensorizers)
    # Tensorizers should have been initialized
    self.assertEqual(49, len(tensorizers["tokens"].vocab))
    self.assertEqual(7, len(tensorizers["labels"].labels))
def test_load_checkpoint_in_dist_training(self):
    """Loading a checkpoint can rebind rank/world_size for distributed runs."""
    with tempfile.NamedTemporaryFile() as checkpoint_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=BlockShardedTSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=checkpoint_file.name,
        )
        task = create_task(config.task)
        model = task.model
        # test checkpoint saving and loading
        optimizer = create_optimizer(Adam.Config(), model)
        scheduler = create_scheduler(Scheduler.Config(), optimizer)
        training_state = TrainingState(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            start_time=0,
            epoch=0,
            rank=0,
            stage=Stage.TRAIN,
            epochs_since_last_improvement=0,
            best_model_state=None,
            best_model_metric=None,
            tensorizers=task.data.tensorizers,
        )
        # Renamed from `id`, which shadowed the builtin of the same name.
        checkpoint_id = "epoch-1"
        saved_path = save(
            config, model, None, task.data.tensorizers, training_state, checkpoint_id
        )
        new_rank = 2
        new_world_size = 4
        task_restored, config_restored, training_state_restored = load(
            saved_path, rank=new_rank, world_size=new_world_size
        )
        self.assertCheckpointEqual(
            model,
            config,
            training_state,
            task_restored.model,
            config_restored,
            training_state_restored,
        )
        # The restored data source must pick up the new distributed identity.
        self.assertEqual(task_restored.data.data_source.rank, new_rank)
        self.assertEqual(task_restored.data.data_source.world_size, new_world_size)
def test_create_batches_with_cache(self):
    """in_memory caching fills the numberized cache; concurrent iteration fails."""

    def make_cached_data():
        # One example per batch so every row lands in the cache.
        return Data(
            self.data_source,
            self.tensorizers,
            Batcher(train_batch_size=1),
            in_memory=True,
        )

    data = make_cached_data()
    list(data.batches(Stage.TRAIN))
    self.assertEqual(10, len(data.numberized_cache[Stage.TRAIN]))

    other = make_cached_data()
    with self.assertRaises(Exception):
        # Concurrent iteration not supported
        outer = other.batches(Stage.TRAIN)
        inner = other.batches(Stage.TRAIN)
        for _ in outer:
            for _ in inner:
                continue
def test_load_saved_model(self):
    """Loading a plain snapshot restores config/weights and no training state."""
    with tempfile.NamedTemporaryFile() as snapshot_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=snapshot_file.name,
        )
        task = create_task(config.task)
        model = task.model
        save(config, model, meta=None, tensorizers=task.data.tensorizers)
        task2, config2, training_state_none = load(snapshot_file.name)
        self.assertEqual(config, config2)
        self.assertModulesEqual(model, task2.model)
        # A snapshot (as opposed to a checkpoint) carries no training state.
        self.assertIsNone(training_state_none)
        model.eval()
        task2.model.eval()
        inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
        self.assertEqual(model(*inputs).tolist(), task2.model(*inputs).tolist())

def assertOptimizerEqual(self, optim_1, optim_2, msg=None):
    """Assert two optimizers have entry-for-entry equal state dicts."""
    # Bug fix: `optim_1 is Optimizer` compared identity against the class
    # object itself (always False); isinstance is the intended type check.
    self.assertTrue(
        isinstance(optim_1, Optimizer) and isinstance(optim_2, Optimizer), msg
    )
    state_dict_1 = optim_1.state_dict()
    state_dict_2 = optim_2.state_dict()
    self.assertEqual(len(state_dict_1), len(state_dict_2))
    for key_1, val_1 in state_dict_1.items():
        # Bug fix: `self.assertEqualt` was a typo that would raise
        # AttributeError the moment this comparison ran.
        self.assertEqual(val_1, state_dict_2[key_1], msg)

def test_load_checkpoint(self):
    """A training checkpoint round-trips model, optimizer and scheduler."""
    with tempfile.NamedTemporaryFile() as checkpoint_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=checkpoint_file.name,
        )
        task = create_task(config.task)
        model = task.model
        # test checkpoint saving and loading
        optimizer = create_optimizer(Adam.Config(), model)
        scheduler = create_scheduler(Scheduler.Config(), optimizer)
        training_state = TrainingState(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            start_time=0,
            epoch=0,
            rank=0,
            stage=Stage.TRAIN,
            epochs_since_last_improvement=0,
            best_model_state=None,
            best_model_metric=None,
            tensorizers=task.data.tensorizers,
        )
        checkpoint_path = checkpoint_file.name
        save(
            config,
            model,
            None,
            task.data.tensorizers,
            training_state,
            "epoch-1",
        )
        task_restored, config_restored, training_state_restored = load(
            checkpoint_path
        )
        optimizer_restored = training_state_restored.optimizer
        scheduler_restored = training_state_restored.scheduler
        self.assertOptimizerEqual(optimizer, optimizer_restored)
        # Bug fix: unittest defines assertIsNotNone, not assertNotNone.
        self.assertIsNotNone(scheduler_restored)
        self.assertEqual(config, config_restored)
        self.assertModulesEqual(model, task_restored.model)
        model.eval()
        task_restored.model.eval()
        inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
        self.assertEqual(
            model(*inputs).tolist(), task_restored.model(*inputs).tolist()
        )