class Config(ConfigBase):
    #: Training epochs
    epochs: int = 10
    #: Stop training after this many epochs without improvement in the eval metric
    early_stop_after: int = 0
    #: Clip gradient norm if set
    max_clip_norm: Optional[float] = None
    #: Whether metrics on training data should be computed and reported.
    report_train_metrics: bool = True
    #: Target time limit for training; the default (None) means no time limit.
    target_time_limit_seconds: Optional[int] = None
    #: Whether to do evaluation and model selection based on it.
    do_eval: bool = True
    #: Number of samples for logging training progress.
    num_samples_to_log_progress: int = 1000
    #: Number of forward & backward passes per batch before updating gradients;
    #: the effective batch size is batch_size x num_accumulated_batches.
    num_accumulated_batches: int = 1
    #: Define an epoch as a fixed number of batches. Subsequent epochs will
    #: continue to iterate through the data, cycling back when they reach the
    #: end. If not set, exactly one pass through the dataset is one epoch.
    #: This configuration only affects training epochs; test and eval always
    #: run over their entire datasets.
    num_batches_per_epoch: Optional[int] = None
    #: Config for the optimizer used in parameter updates.
    optimizer: Optimizer.Config = Adam.Config()
    scheduler: Optional[Scheduler.Config] = None
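To illustrate the num_accumulated_batches arithmetic above, here is a minimal standalone sketch of a gradient-accumulation loop. The model, loss function, and batch list are hypothetical placeholders for illustration, not PyText API; only the accumulation pattern itself is the point.

# Sketch of gradient accumulation; `model`, `loss_fn`, and `batches` are
# hypothetical stand-ins, not PyText API.
import torch

model = torch.nn.Linear(4, 2)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
batches = [(torch.randn(8, 4), torch.randint(0, 2, (8,))) for _ in range(4)]
num_accumulated_batches = 2  # effective batch size = 8 x 2 = 16

optimizer.zero_grad()
for i, (inputs, targets) in enumerate(batches):
    # Scale the loss so the accumulated gradient matches a single large batch.
    loss = loss_fn(model(inputs), targets) / num_accumulated_batches
    loss.backward()  # gradients accumulate across backward() calls
    if (i + 1) % num_accumulated_batches == 0:
        optimizer.step()  # one update per num_accumulated_batches batches
        optimizer.zero_grad()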
class Config(ConfigBase):
    data: Data.Config = Data.Config()
    model: Model.Config
    trainer: NewTaskTrainer.Config = NewTaskTrainer.Config()
    optimizer: Optimizer.Config = Adam.Config()
    scheduler: Scheduler.Config = Scheduler.Config()
    exporter: Optional[ModelExporter.Config] = None
class Config(ConfigBase):
    features: FeatureConfig = FeatureConfig()
    featurizer: Featurizer.Config = SimpleFeaturizer.Config()
    data_handler: DataHandler.Config
    trainer: Trainer.Config = Trainer.Config()
    optimizer: Optimizer.Config = Adam.Config()
    scheduler: Optional[Scheduler.Config] = Scheduler.Config()
    exporter: Optional[ModelExporter.Config] = None
def test_load_checkpoint(self):
    with tempfile.NamedTemporaryFile() as checkpoint_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=checkpoint_file.name,
        )
        task = create_task(config.task)
        model = task.model
        # Test checkpoint saving and loading.
        optimizer = create_optimizer(Adam.Config(), model)
        scheduler = create_scheduler(Scheduler.Config(), optimizer)
        training_state = TrainingState(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            start_time=0,
            epoch=0,
            rank=0,
            stage=Stage.TRAIN,
            epochs_since_last_improvement=0,
            best_model_state=None,
            best_model_metric=None,
            tensorizers=None,
        )
        checkpoint_path = checkpoint_file.name
        save(
            config,
            model,
            None,
            task.data.tensorizers,
            training_state,
            checkpoint_file,
        )
        task_restored, config_restored, training_state_restored = load(
            checkpoint_path
        )
        optimizer_restored = training_state_restored.optimizer
        scheduler_restored = training_state_restored.scheduler
        self.assertOptimizerEqual(optimizer, optimizer_restored)
        self.assertIsNotNone(scheduler_restored)
        self.assertEqual(config, config_restored)
        self.assertModulesEqual(model, task_restored.model)
        model.eval()
        task_restored.model.eval()
        inputs = torch.LongTensor([[1, 2, 3]]), torch.LongTensor([3])
        self.assertEqual(
            model(*inputs).tolist(), task_restored.model(*inputs).tolist()
        )
def test_load_checkpoint_in_dist_training(self):
    with tempfile.NamedTemporaryFile() as checkpoint_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                data=Data.Config(
                    source=BlockShardedTSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        field_names=["label", "slots", "text"],
                    )
                )
            ),
            version=LATEST_VERSION,
            save_snapshot_path=checkpoint_file.name,
        )
        task = create_task(config.task)
        model = task.model
        # Test checkpoint saving and loading.
        optimizer = create_optimizer(Adam.Config(), model)
        scheduler = create_scheduler(Scheduler.Config(), optimizer)
        training_state = TrainingState(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            start_time=0,
            epoch=0,
            rank=0,
            stage=Stage.TRAIN,
            epochs_since_last_improvement=0,
            best_model_state=None,
            best_model_metric=None,
            tensorizers=task.data.tensorizers,
        )
        checkpoint_id = "epoch-1"  # avoid shadowing the builtin `id`
        saved_path = save(
            config, model, None, task.data.tensorizers, training_state, checkpoint_id
        )
        new_rank = 2
        new_world_size = 4
        task_restored, config_restored, training_state_restored = load(
            saved_path, rank=new_rank, world_size=new_world_size
        )
        self.assertCheckpointEqual(
            model,
            config,
            training_state,
            task_restored.model,
            config_restored,
            training_state_restored,
        )
        self.assertEqual(task_restored.data.data_source.rank, new_rank)
        self.assertEqual(
            task_restored.data.data_source.world_size, new_world_size
        )
class Config(ConfigBase):
    #: Training epochs
    epochs: int = 10
    #: Stop training after this many epochs without improvement in the eval metric
    early_stop_after: int = 0
    #: Clip gradient norm if set
    max_clip_norm: Optional[float] = None
    #: Whether metrics on training data should be computed and reported.
    report_train_metrics: bool = True
    #: Target time limit for training; the default (None) means no time limit.
    target_time_limit_seconds: Optional[int] = None
    #: Whether to do evaluation and model selection based on it.
    do_eval: bool = True
    #: If do_eval, whether to load the best model state dict after training
    #: or just use the latest model state.
    load_best_model_after_train: bool = True
    #: Number of samples for logging training progress.
    num_samples_to_log_progress: int = 1000
    #: Number of forward & backward passes per batch before updating gradients;
    #: the effective batch size is batch_size x num_accumulated_batches.
    num_accumulated_batches: int = 1
    #: Define an epoch as a fixed number of batches. Subsequent epochs will
    #: continue to iterate through the data, cycling back when they reach the
    #: end. If not set, exactly one pass through the dataset is one epoch.
    #: This configuration only affects training epochs; test and eval always
    #: run over their entire datasets.
    num_batches_per_epoch: Optional[int] = None
    #: Config for the optimizer used in parameter updates.
    optimizer: Optimizer.Config = Adam.Config()
    scheduler: Optional[Scheduler.Config] = None
    sparsifier: Optional[Sparsifier.Config] = None
    #: Define arguments for fp16 training. An fp16_optimizer will be created
    #: to wrap the original optimizer; it scales the loss during backward,
    #: while master weights are maintained on the original optimizer.
    #: https://arxiv.org/abs/1710.03740
    fp16_args: FP16Optimizer.Config = FP16OptimizerFairseq.Config()
    use_tensorboard: bool = False
    find_unused_parameters: bool = True
    #: Set a discriminative learning rate for some of the parameters in the
    #: model. If None, all parameters will have the same lr.
    discriminative_lr: Optional[float] = None
    #: Model parameters matching any pattern in the list will have
    #: discriminative_lr; parameters matching no pattern keep the default lr.
    #: E.g. ["decoder.mlp.0", "decoder.mlp.3"]
    discriminative_lr_params_pattern: Optional[List[str]] = None
    #: Model parameters matching any pattern in the list will have lr = 0.0
    freeze_params_pattern: Optional[List[str]] = None
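A sketch of how name patterns like discriminative_lr_params_pattern and freeze_params_pattern can be turned into optimizer parameter groups. The prefix-matching rule and the build_param_groups helper are assumptions for illustration, not PyText's actual matching logic.

# Hypothetical sketch: route parameters to lr buckets by name pattern.
import torch

def build_param_groups(model, base_lr, discriminative_lr,
                       discriminative_lr_params_pattern, freeze_params_pattern):
    # Prefix match is an assumed rule for this sketch.
    def matches(name, patterns):
        return any(name.startswith(p) for p in patterns or [])

    default, discriminative = [], []
    for name, param in model.named_parameters():
        if matches(name, freeze_params_pattern):
            param.requires_grad = False  # frozen: effectively lr = 0.0
        elif matches(name, discriminative_lr_params_pattern):
            discriminative.append(param)
        else:
            default.append(param)
    return [
        {"params": default, "lr": base_lr},
        {"params": discriminative, "lr": discriminative_lr},
    ]

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 2))
groups = build_param_groups(
    model, base_lr=1e-3, discriminative_lr=1e-5,
    discriminative_lr_params_pattern=["1."], freeze_params_pattern=None,
)
optimizer = torch.optim.Adam(groups)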
class Config(ConfigBase):
    #: Training epochs
    epochs: int = 10
    #: Stop training after this many epochs without improvement in the eval metric
    early_stop_after: int = 0
    #: Clip gradient norm if set
    max_clip_norm: Optional[float] = None
    #: Whether metrics on training data should be computed and reported.
    report_train_metrics: bool = True
    #: Target time limit for training; the default (None) means no time limit.
    target_time_limit_seconds: Optional[int] = None
    #: Whether to do evaluation and model selection based on it.
    do_eval: bool = True
    #: Number of samples for logging training progress.
    num_samples_to_log_progress: int = 1000
    #: Config for the optimizer used in parameter updates.
    optimizer: Optimizer.Config = Adam.Config()
    scheduler: Optional[Scheduler.Config] = None
def __init__(
    self,
    data: Data,
    model: Model,
    metric_reporter: Optional[MetricReporter] = None,
    trainer: Optional[NewTaskTrainer] = None,
    optimizer: Optional[Optimizer] = None,
    scheduler: Optional[Scheduler] = None,
    exporter: Optional[ModelExporter] = None,
):
    self.data = data
    self.model = model
    # Attempt to build a default metric reporter.
    self.metric_reporter = metric_reporter or self.create_metric_reporter(
        self.Config.metric_reporter, model
    )
    self.trainer = trainer or NewTaskTrainer()
    self.optimizer = optimizer or Adam(
        model.parameters(), **Adam.Config()._asdict()
    )
    self.scheduler = scheduler
    self.exporter = exporter
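The Adam(model.parameters(), **Adam.Config()._asdict()) fallback works because config instances expose their fields as a dict. A self-contained sketch of the same pattern, using a plain NamedTuple as a stand-in for ConfigBase; AdamConfig and its fields here are hypothetical, not PyText internals.

# Sketch of the config-to-kwargs pattern; AdamConfig is a hypothetical
# NamedTuple stand-in for a ConfigBase subclass.
from typing import NamedTuple
import torch

class AdamConfig(NamedTuple):
    lr: float = 0.001
    weight_decay: float = 0.00001

model = torch.nn.Linear(4, 2)
# _asdict() expands the config's fields into keyword arguments, keeping
# config defaults and the optimizer constructor's signature in sync.
optimizer = torch.optim.Adam(model.parameters(), **AdamConfig()._asdict())
print(optimizer.defaults["lr"])  # 0.001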