def preprocess_task(config: PyTextConfig):
    if hasattr(config.task, "data") and hasattr(config.task.data, "numberized_dir"):
        if config.load_snapshot_path and os.path.isfile(config.load_snapshot_path):
            task = load(config.load_snapshot_path)
        else:
            task = create_task(config.task)
        task.data.initialize_numberized_data()
def prepare_task(
    config: PyTextConfig,
    dist_init_url: str = None,
    device_id: int = 0,
    rank: int = 0,
    world_size: int = 1,
    metric_channels: Optional[List[Channel]] = None,
    metadata: CommonMetadata = None,
) -> Task:
    if dist_init_url and world_size > 1:
        assert metadata is not None

    print("\nParameters: {}\n".format(config))
    _set_cuda(config.use_cuda_if_available, device_id, world_size)
    _set_fp16(config.use_fp16)
    if config.random_seed is not None:
        set_random_seeds(config.random_seed)

    if config.load_snapshot_path and os.path.isfile(config.load_snapshot_path):
        task = load(config.load_snapshot_path)
    else:
        task = create_task(config.task, metadata=metadata)

    for mc in metric_channels or []:
        task.metric_reporter.add_channel(mc)

    return task
def prepare_task(
    config: PyTextConfig,
    dist_init_url: str = None,
    device_id: int = 0,
    rank: int = 0,
    world_size: int = 1,
    metric_channels: Optional[List[Channel]] = None,
    metadata: CommonMetadata = None,
) -> Tuple[Task_Deprecated, TrainingState]:
    if world_size > 1 and config.random_seed is None:
        msg = (
            "Must set random seed when using world_size > 1, so that parameters have "
            "same initialization across workers."
        )
        raise ValueError(msg)

    if rank == 0:
        print("\nParameters: {}\n".format(config), flush=True)

    _set_cuda(config.use_cuda_if_available, device_id, world_size)
    _set_fp16(config.use_fp16, rank)
    _set_distributed(
        rank,
        world_size,
        dist_init_url,
        device_id,
        config.gpu_streams_for_distributed_training,
    )

    if config.random_seed is not None:
        set_random_seeds(config.random_seed, config.use_deterministic_cudnn)

    training_state = None

    if config.auto_resume_from_snapshot:
        # if there are existing checkpoints, resume from the latest one
        latest_snapshot_path = get_latest_checkpoint_path(
            os.path.dirname(config.save_snapshot_path)
        )
        if latest_snapshot_path:
            config.load_snapshot_path = latest_snapshot_path

    if config.load_snapshot_path:
        assert PathManager.isfile(config.load_snapshot_path)
        if config.use_config_from_snapshot:
            task, _, training_state = load(config.load_snapshot_path)
        else:
            task, _, training_state = load(
                config.load_snapshot_path, overwrite_config=config
            )
        if training_state:
            training_state.rank = rank
    else:
        task = create_task(
            config.task, metadata=metadata, rank=rank, world_size=world_size
        )

    for mc in metric_channels or []:
        task.metric_reporter.add_channel(mc)

    return task, training_state
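A minimal sketch of how this prepare_task variant might be driven for a single-process run; my_config is a hypothetical PyTextConfig built elsewhere, and the distributed arguments keep their defaults:

# Hedged usage sketch (assumes a PyTextConfig instance `my_config` built elsewhere).
# With world_size=1 no dist_init_url is needed; training_state comes back None
# unless a snapshot was loaded or auto-resume found an existing checkpoint.
task, training_state = prepare_task(
    my_config,
    dist_init_url=None,  # single process: no process-group init required
    device_id=0,
    rank=0,
    world_size=1,
    metric_channels=None,  # optionally a list of Channel instances
)
if training_state is not None:
    print("resuming from epoch", training_state.epoch)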
def test_load_saved_model(self):
    with tempfile.NamedTemporaryFile() as snapshot_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=snapshot_file.name,
        )
        task = create_task(config.task)
        model = task.model
        save(config, model, meta=None, tensorizers=task.data.tensorizers)
        task2, config2 = load(snapshot_file.name)

        self.assertEqual(config, config2)
        self.assertModulesEqual(model, task2.model)

        model.eval()
        task2.model.eval()

        inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
        self.assertEqual(model(*inputs).tolist(), task2.model(*inputs).tolist())
def test_batch_predict_caffe2_model(self):
    with tempfile.NamedTemporaryFile() as snapshot_file, tempfile.NamedTemporaryFile() as caffe2_model_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        test_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=snapshot_file.name,
            export_caffe2_path=caffe2_model_file.name,
        )
        task = create_task(config.task)
        task.export(task.model, caffe2_model_file.name)
        model = task.model
        save(config, model, meta=None, tensorizers=task.data.tensorizers)

        results = batch_predict_caffe2_model(
            snapshot_file.name, caffe2_model_file.name
        )
        self.assertEqual(4, len(results))
def prepare_task(
    config: PyTextConfig,
    dist_init_url: str = None,
    device_id: int = 0,
    rank: int = 0,
    world_size: int = 1,
    metric_channels: Optional[List[Channel]] = None,
    metadata: CommonMetadata = None,
) -> Tuple[Task_Deprecated, TrainingState]:
    print("\nParameters: {}\n".format(config))
    _set_cuda(config.use_cuda_if_available, device_id, world_size)
    _set_fp16(config.use_fp16)
    _set_distributed(rank, world_size, dist_init_url, device_id)

    if config.random_seed is not None:
        set_random_seeds(config.random_seed, config.use_deterministic_cudnn)

    training_state = None

    if config.load_snapshot_path and os.path.isfile(config.load_snapshot_path):
        task, _config, training_state = load(config.load_snapshot_path)
        if training_state and training_state.model is None and task.model:
            training_state.model = task.model
    else:
        task = create_task(
            config.task, metadata=metadata, rank=rank, world_size=world_size
        )

    for mc in metric_channels or []:
        task.metric_reporter.add_channel(mc)

    return task, training_state
def prepare_task(
    config: PyTextConfig,
    dist_init_url: str = None,
    device_id: int = 0,
    rank: int = 0,
    world_size: int = 1,
    summary_writer: Optional[SummaryWriter] = None,
    metadata: CommonMetadata = None,
) -> Task:
    if dist_init_url and world_size > 1:
        assert metadata is not None
        dist_init(rank, world_size, dist_init_url)

    print("\nParameters: {}\n".format(config))
    _set_cuda(config.use_cuda_if_available, device_id, world_size)

    if config.load_snapshot_path and os.path.isfile(config.load_snapshot_path):
        task = load(config.load_snapshot_path)
    else:
        task = create_task(config.task, metadata=metadata)

    if summary_writer:
        task.metric_reporter.add_channel(
            TensorBoardChannel(summary_writer=summary_writer)
        )

    return task
def test_batch_predict_caffe2_model(self):
    with tempfile.NamedTemporaryFile() as snapshot_file, tempfile.NamedTemporaryFile() as caffe2_model_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                model=DocModel.Config(
                    inputs=DocModel.Config.ModelInput(
                        tokens=TokenTensorizer.Config(),
                        dense=FloatListTensorizer.Config(
                            column="dense", dim=1, error_check=True
                        ),
                        labels=LabelTensorizer.Config(),
                    )
                ),
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        test_filename=eval_data,
                        field_names=["label", "slots", "text", "dense"],
                    )
                ),
            ),
            version=21,
            save_snapshot_path=snapshot_file.name,
            export_caffe2_path=caffe2_model_file.name,
        )
        task = create_task(config.task)
        task.export(task.model, caffe2_model_file.name)
        model = task.model
        save(config, model, meta=None, tensorizers=task.data.tensorizers)

        pt_results = task.predict(task.data.data_source.test)

        def assert_caffe2_results_correct(caffe2_results):
            for pt_res, res in zip(pt_results, caffe2_results):
                np.testing.assert_array_almost_equal(
                    pt_res["score"].tolist()[0],
                    [score[0] for score in res.values()],
                )

        results = batch_predict_caffe2_model(
            snapshot_file.name, caffe2_model_file.name
        )
        self.assertEqual(4, len(results))
        assert_caffe2_results_correct(results)

        results = batch_predict_caffe2_model(
            snapshot_file.name, caffe2_model_file.name, cache_size=2
        )
        self.assertEqual(4, len(results))
        assert_caffe2_results_correct(results)

        results = batch_predict_caffe2_model(
            snapshot_file.name, caffe2_model_file.name, cache_size=-1
        )
        self.assertEqual(4, len(results))
        assert_caffe2_results_correct(results)
def prepare_task_metadata(config: PyTextConfig) -> CommonMetadata:
    """
    Loading the whole dataset into CPU memory on every single process could
    cause OOMs for data parallel distributed training. To avoid that, we move
    the operations that require loading the whole dataset out of spawn, and
    pass the resulting context to every single process.
    """
    return create_task(config.task).data_handler.metadata
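Illustrative only: the docstring above suggests computing metadata once in the parent process and handing it to each spawned worker. The run_worker and main wiring below are hypothetical names, not part of the source:

# Hypothetical wiring: build metadata once in the parent, then pass it into each
# spawned worker so no worker has to load the full dataset itself.
import torch.multiprocessing as mp

def run_worker(rank, world_size, config, metadata):
    # each worker builds its task from the shared, pre-computed metadata
    task = prepare_task(
        config, device_id=rank, rank=rank, world_size=world_size, metadata=metadata
    )
    # ... run training with `task` here

def main(config, world_size):
    metadata = prepare_task_metadata(config)  # done once, outside spawn
    # mp.spawn passes the worker rank as the first positional argument
    mp.spawn(run_worker, args=(world_size, config, metadata), nprocs=world_size)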
def test_load_checkpoint(self):
    with tempfile.NamedTemporaryFile() as checkpoint_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=checkpoint_file.name,
        )
        task = create_task(config.task)
        model = task.model

        # test checkpoint saving and loading
        optimizer = create_optimizer(Adam.Config(), model)
        scheduler = create_scheduler(Scheduler.Config(), optimizer)
        training_state = TrainingState(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            start_time=0,
            epoch=0,
            rank=0,
            stage=Stage.TRAIN,
            epochs_since_last_improvement=0,
            best_model_state=None,
            best_model_metric=None,
            tensorizers=None,
        )
        checkpoint_path = checkpoint_file.name
        save(
            config,
            model,
            None,
            task.data.tensorizers,
            training_state,
            checkpoint_file,
        )
        task_restored, config_restored, training_state_restored = load(
            checkpoint_path
        )
        optimizer_restored = training_state_restored.optimizer
        scheduler_restored = training_state_restored.scheduler

        self.assertOptimizerEqual(optimizer, optimizer_restored)
        self.assertIsNotNone(scheduler_restored)
        self.assertEqual(config, config_restored)
        self.assertModulesEqual(model, task_restored.model)

        model.eval()
        task_restored.model.eval()

        inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
        self.assertEqual(
            model(*inputs).tolist(), task_restored.model(*inputs).tolist()
        )
def test_load_checkpoint_in_dist_training(self):
    with tempfile.NamedTemporaryFile() as checkpoint_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=BlockShardedTSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=checkpoint_file.name,
        )
        task = create_task(config.task)
        model = task.model

        # test checkpoint saving and loading
        optimizer = create_optimizer(Adam.Config(), model)
        scheduler = create_scheduler(Scheduler.Config(), optimizer)
        training_state = TrainingState(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            start_time=0,
            epoch=0,
            rank=0,
            stage=Stage.TRAIN,
            epochs_since_last_improvement=0,
            best_model_state=None,
            best_model_metric=None,
            tensorizers=task.data.tensorizers,
        )
        id = "epoch-1"
        saved_path = save(
            config, model, None, task.data.tensorizers, training_state, id
        )

        new_rank = 2
        new_world_size = 4
        task_restored, config_restored, training_state_restored = load(
            saved_path, rank=new_rank, world_size=new_world_size
        )
        self.assertCheckpointEqual(
            model,
            config,
            training_state,
            task_restored.model,
            config_restored,
            training_state_restored,
        )
        self.assertEqual(task_restored.data.data_source.rank, new_rank)
        self.assertEqual(task_restored.data.data_source.world_size, new_world_size)
def prepare_task(
    config: PyTextConfig,
    dist_init_url: str = None,
    device_id: int = 0,
    rank: int = 0,
    world_size: int = 1,
) -> Task:
    if dist_init_url and world_size > 1:
        dist_init(rank, world_size, dist_init_url)

    print("\nParameters: {}\n".format(config))
    _set_cuda(config.use_cuda_if_available, device_id, world_size)

    if config.load_snapshot_path and os.path.isfile(config.load_snapshot_path):
        return load(config.load_snapshot_path)

    return create_task(config.task)
def test_load_saved_model(self):
    with tempfile.NamedTemporaryFile() as snapshot_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=snapshot_file.name,
        )
        task = create_task(config.task)
        model = task.model
        save(config, model, meta=None, tensorizers=task.data.tensorizers)
        task2, config2, training_state_none = load(snapshot_file.name)

        self.assertEqual(config, config2)
        self.assertModulesEqual(model, task2.model)
        self.assertIsNone(training_state_none)

        model.eval()
        task2.model.eval()

        inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
        self.assertEqual(model(*inputs).tolist(), task2.model(*inputs).tolist())

def assertOptimizerEqual(self, optim_1, optim_2, msg=None):
    # both arguments must be Optimizer instances with matching state dicts
    self.assertTrue(
        isinstance(optim_1, Optimizer) and isinstance(optim_2, Optimizer), msg
    )
    state_dict_1 = optim_1.state_dict()
    state_dict_2 = optim_2.state_dict()
    self.assertEqual(len(state_dict_1), len(state_dict_2))
    for key_1, val_1 in state_dict_1.items():
        self.assertEqual(val_1, state_dict_2[key_1], msg)

def test_load_checkpoint(self):
    with tempfile.NamedTemporaryFile() as checkpoint_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=checkpoint_file.name,
        )
        task = create_task(config.task)
        model = task.model

        # test checkpoint saving and loading
        optimizer = create_optimizer(Adam.Config(), model)
        scheduler = create_scheduler(Scheduler.Config(), optimizer)
        training_state = TrainingState(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            start_time=0,
            epoch=0,
            rank=0,
            stage=Stage.TRAIN,
            epochs_since_last_improvement=0,
            best_model_state=None,
            best_model_metric=None,
            tensorizers=task.data.tensorizers,
        )
        checkpoint_path = checkpoint_file.name
        save(
            config,
            model,
            None,
            task.data.tensorizers,
            training_state,
            "epoch-1",
        )
        task_restored, config_restored, training_state_restored = load(
            checkpoint_path
        )
        optimizer_restored = training_state_restored.optimizer
        scheduler_restored = training_state_restored.scheduler

        self.assertOptimizerEqual(optimizer, optimizer_restored)
        self.assertIsNotNone(scheduler_restored)
        self.assertEqual(config, config_restored)
        self.assertModulesEqual(model, task_restored.model)

        model.eval()
        task_restored.model.eval()

        inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
        self.assertEqual(
            model(*inputs).tolist(), task_restored.model(*inputs).tolist()
        )
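The tests above rely on an assertModulesEqual helper that is not shown. A minimal sketch of what such a helper could look like, comparing module state dicts tensor by tensor (hypothetical, not the source's definition):

def assertModulesEqual(self, mod1, mod2, msg=None):
    # sketch: two modules are "equal" if their state dicts hold identical tensors
    state_1, state_2 = mod1.state_dict(), mod2.state_dict()
    self.assertEqual(state_1.keys(), state_2.keys(), msg)
    for name, tensor_1 in state_1.items():
        self.assertTrue(torch.equal(tensor_1, state_2[name]), msg or name)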