def __init__(self, num_train_data, max_updates, max_epochs):
    self.training_config = OmegaConf.create(
        {"detect_anomaly": False, "evaluation_interval": 10000}
    )
    if max_updates is not None:
        self.training_config["max_updates"] = max_updates
    if max_epochs is not None:
        self.training_config["max_epochs"] = max_epochs

    self.model = SimpleModel(1)
    if torch.cuda.is_available():
        self.model = self.model.cuda()

    self.dataset_loader = MagicMock()
    self.dataset_loader.seed_sampler = MagicMock(return_value=None)
    self.dataset_loader.prepare_batch = lambda x: SampleList(x)

    self.optimizer = MagicMock()
    self.optimizer.step = MagicMock(return_value=None)
    self.optimizer.zero_grad = MagicMock(return_value=None)

    dataset = NumbersDataset(num_train_data)
    self.train_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        drop_last=False,
    )

    self.on_batch_start = MagicMock(return_value=None)
    self.logistics_callback = MagicMock(return_value=None)
    self.logistics_callback.log_interval = MagicMock(return_value=None)
    self.on_batch_end = MagicMock(return_value=None)
    self.meter = MagicMock(return_value=None)
    self.after_training_loop = MagicMock(return_value=None)

def _train_with_condition(
    self,
    num_train_data,
    max_updates,
    max_epochs,
    update_frequency,
    batch_size,
    on_update_end_fn=None,
):
    torch.random.manual_seed(2)
    model = SimpleModel(1)
    opt = torch.optim.SGD(model.parameters(), lr=0.01)
    trainer = TrainerTrainingLoopMock(
        num_train_data,
        max_updates,
        max_epochs,
        optimizer=opt,
        update_frequency=update_frequency,
        batch_size=batch_size,
        on_update_end_fn=on_update_end_fn,
    )
    model.to(trainer.device)
    trainer.model = model
    trainer.training_loop()
    return trainer

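# A hedged usage sketch, not part of the original suite: a test built on
# _train_with_condition might check that gradient accumulation reduces the
# number of optimizer updates. The num_updates attribute is assumed from MMF's
# trainer bookkeeping; the argument values are illustrative.
def test_update_frequency(self):
    trainer = self._train_with_condition(
        num_train_data=20,
        max_updates=2,
        max_epochs=None,
        update_frequency=2,  # accumulate gradients over two batches per update
        batch_size=2,
    )
    self.assertEqual(trainer.num_updates, 2)
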
def __init__(
    self, num_train_data, max_updates, max_epochs, device="cuda", fp16_model=False
):
    config = get_config_with_defaults(
        {
            "training": {
                "max_updates": max_updates,
                "max_epochs": max_epochs,
                "evaluation_interval": 10000,
                "fp16": True,
            },
            "run_type": "train",
        }
    )
    super().__init__(num_train_data, config=config)

    if fp16_model:
        assert torch.cuda.is_available(), "MMFTrainerMock fp16 requires cuda enabled"
        model = SimpleModelWithFp16Assert({"in_dim": 1})
        model.build()
        model = model.cuda()
    else:
        model = SimpleModel({"in_dim": 1})
        model.build()
        model.train()
        model.to(self.device)

    self.model = model
    self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1e-3)

def get_mmf_trainer(
    model_size=1,
    num_data_size=100,
    max_updates=5,
    max_epochs=None,
    on_update_end_fn=None,
    fp16=False,
    scheduler_config=None,
    grad_clipping_config=None,
):
    torch.random.manual_seed(2)
    model = SimpleModel({"in_dim": model_size})
    model.build()
    model.train()
    trainer_config = get_trainer_config()
    optimizer = build_optimizer(model, trainer_config)
    trainer = TrainerTrainingLoopMock(
        num_data_size,
        max_updates,
        max_epochs,
        config=trainer_config,
        optimizer=optimizer,
        on_update_end_fn=on_update_end_fn,
        fp16=fp16,
        scheduler_config=scheduler_config,
        grad_clipping_config=grad_clipping_config,
    )
    trainer.load_datasets()
    model.to(trainer.device)
    trainer.model = model
    return trainer

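# Illustrative call sequence (a sketch only: it assumes OmegaConf is imported
# and that a "multi_step" scheduler is registered with MMF, with made-up
# parameter values):
trainer = get_mmf_trainer(
    max_updates=8,
    scheduler_config=OmegaConf.create(
        {
            "type": "multi_step",
            "params": {"use_warmup": False, "lr_steps": [4], "lr_ratio": 0.1},
        }
    ),
)
trainer.training_loop()
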
def __init__(
    self,
    config,
    num_train_data,
    max_updates,
    max_epochs,
    device="cuda",
    fp16_model=False,
):
    super().__init__(num_train_data, max_updates, max_epochs, fp16=True)
    self.device = torch.device(device)
    self.config = config
    self.model = SimpleModel(1)
    self.model = self.model.cuda()
    self.optimizer = build_optimizer(self.model, self.config)
    self.distributed = True
    self.local_rank = 0
    self.parallelize_model()
    self.load_fp16_scaler()

def setUp(self):
    self.trainer = argparse.Namespace()
    self.config = load_yaml(os.path.join("configs", "defaults.yaml"))
    self.config = OmegaConf.merge(
        self.config,
        {
            "model": "simple",
            "model_config": {},
            "training": {
                "lr_scheduler": True,
                "lr_ratio": 0.1,
                "lr_steps": [1, 2],
                "use_warmup": False,
                "callbacks": [{"type": "test_callback", "params": {}}],
            },
        },
    )
    # Keep an original copy of the config for testing purposes
    self.trainer.config = deepcopy(self.config)
    registry.register("config", self.trainer.config)

    model = SimpleModel(SimpleModel.Config())
    model.build()
    self.trainer.model = model
    self.trainer.val_loader = torch.utils.data.DataLoader(
        NumbersDataset(2), batch_size=self.config.training.batch_size
    )
    self.trainer.optimizer = torch.optim.Adam(
        self.trainer.model.parameters(), lr=0.1
    )
    self.trainer.lr_scheduler_callback = LRSchedulerCallback(
        self.config, self.trainer
    )

    self.trainer.callbacks = []
    for callback in self.config.training.get("callbacks", []):
        callback_type = callback.type
        callback_param = callback.params
        callback_cls = registry.get_callback_class(callback_type)
        self.trainer.callbacks.append(
            callback_cls(self.trainer.config, self.trainer, **callback_param)
        )

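# For the "test_callback" entry above to resolve via get_callback_class, a
# class must be registered under that name. A minimal sketch, assuming MMF's
# registry exposes a register_callback decorator and a Callback base class:
@registry.register_callback("test_callback")
class TestCallback(Callback):
    def __init__(self, config, trainer, **params):
        super().__init__(config, trainer)
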
def __init__(self, config, num_train_data, max_updates, max_epochs, device="cuda"):
    config.training.max_updates = max_updates
    config.training.max_epochs = max_epochs
    config.training.fp16 = True
    config = get_config_with_defaults(config)

    super().__init__(num_train_data, config=config)
    self.device = torch.device(device)
    self.config = config
    self.model = SimpleModel({"in_dim": 1})
    self.model.build()
    self.model = self.model.cuda()
    self.optimizer = build_optimizer(self.model, self.config)
    self.distributed = True
    self.local_rank = 0
    self.parallelize_model()
    self.load_fp16_scaler()

def test_batch_size_per_device(self, a):
    # Patch the world size in mmf.utils.general, not mmf.utils.distributed,
    # since the former is the one get_batch_size actually uses.
    with patch("mmf.utils.general.get_world_size", return_value=2):
        config = self._get_config(max_updates=2, max_epochs=None, batch_size=4)
        trainer = TrainerTrainingLoopMock(config=config)
        add_model(trainer, SimpleModel({"in_dim": 1}))
        add_optimizer(trainer, config)
        registry.register("config", trainer.config)
        batch_size = get_batch_size()
        trainer.config.training.batch_size = batch_size
        trainer.load_datasets()
        # The train loader uses the per-device batch size: for a global batch
        # size of 4 with world size 2, it should be 4 // 2 = 2.
        self.assertEqual(trainer.train_loader.current_loader.batch_size, 2)

        # batch_size_per_device is already per device, so it should stay the same.
        config = self._get_config(
            max_updates=2, max_epochs=None, batch_size_per_device=4
        )
        trainer = TrainerTrainingLoopMock(config=config)
        add_model(trainer, SimpleModel({"in_dim": 1}))
        add_optimizer(trainer, config)
        registry.register("config", trainer.config)
        batch_size = get_batch_size()
        trainer.config.training.batch_size = batch_size
        trainer.load_datasets()
        self.assertEqual(trainer.train_loader.current_loader.batch_size, 4)

        max_updates = trainer._calculate_max_updates()
        self.assertEqual(max_updates, 2)

        self.check_values(trainer, 0, 0, 0)
        trainer.training_loop()
        self.check_values(trainer, 2, 1, 2)

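# The sharding rule exercised above, as a standalone sketch. This mirrors the
# assumed behavior of mmf.utils.general.get_batch_size; the helper name is
# illustrative, not part of MMF's API.
def per_device_batch_size(global_batch_size: int, world_size: int) -> int:
    # Each rank gets an equal shard of the global batch, e.g. 4 // 2 = 2.
    assert global_batch_size % world_size == 0, "batch size must divide world size"
    return global_batch_size // world_size
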
def get_mmf_trainer(
    config=None, model_size=1, num_data_size=100, load_model_from_config=False, seed=2
):
    torch.random.manual_seed(seed)
    trainer = TrainerTrainingLoopMock(num_data_size, config=config)
    if load_model_from_config:
        trainer.load_model()
    else:
        add_model(trainer, SimpleModel({"in_dim": model_size}))
    add_optimizer(trainer, config)
    trainer.load_datasets()
    return trainer

def get_mmf_trainer(
    model_size=1,
    num_data_size=100,
    max_updates=5,
    max_epochs=None,
    on_update_end_fn=None,
    fp16=False,
    scheduler_config=None,
    grad_clipping_config=None,
    evaluation_interval=4,
    log_interval=1,
    batch_size=1,
    tensorboard=False,
):
    torch.random.manual_seed(2)
    model = SimpleModel({"in_dim": model_size})
    model.build()
    model.train()
    trainer_config = get_trainer_config()
    trainer_config.training.evaluation_interval = evaluation_interval
    trainer_config.training.log_interval = log_interval
    optimizer = build_optimizer(model, trainer_config)
    trainer = TrainerTrainingLoopMock(
        num_data_size,
        max_updates,
        max_epochs,
        config=trainer_config,
        optimizer=optimizer,
        fp16=fp16,
        on_update_end_fn=on_update_end_fn,
        scheduler_config=scheduler_config,
        grad_clipping_config=grad_clipping_config,
        batch_size=batch_size,
        tensorboard=tensorboard,
    )
    trainer.load_datasets()
    model.to(trainer.device)
    trainer.model = model
    return trainer

def __init__(
    self,
    num_train_data,
    max_updates,
    max_epochs,
    config=None,
    optimizer=None,
    update_frequency=1,
    batch_size=1,
    batch_size_per_device=None,
    fp16=False,
    on_update_end_fn=None,
    scheduler_config=None,
    grad_clipping_config=None,
):
    if config is None:
        self.config = OmegaConf.create(
            {
                "training": {
                    "detect_anomaly": False,
                    "evaluation_interval": 10000,
                    "update_frequency": update_frequency,
                    "fp16": fp16,
                    "batch_size": batch_size,
                    "batch_size_per_device": batch_size_per_device,
                }
            }
        )
        self.training_config = self.config.training
    else:
        self.training_config = config.training
        self.config = config

    # Compute the batch size with the custom config registered, then restore
    # the original config so other tests are unaffected.
    original_config = registry.get("config")
    registry.register("config", self.config)
    batch_size = get_batch_size()
    registry.register("config", original_config)

    if max_updates is not None:
        self.training_config["max_updates"] = max_updates
    if max_epochs is not None:
        self.training_config["max_epochs"] = max_epochs

    self.model = SimpleModel({"in_dim": 1})
    self.model.build()
    if torch.cuda.is_available():
        self.model = self.model.cuda()
        self.device = "cuda"
    else:
        self.device = "cpu"
    self.distributed = False

    self.dataset_loader = MagicMock()
    self.dataset_loader.seed_sampler = MagicMock(return_value=None)
    self.dataset_loader.prepare_batch = lambda x: SampleList(x)

    if optimizer is None:
        self.optimizer = MagicMock()
        self.optimizer.step = MagicMock(return_value=None)
        self.optimizer.zero_grad = MagicMock(return_value=None)
    else:
        self.optimizer = optimizer

    if scheduler_config:
        config.training.lr_scheduler = True
        config.scheduler = scheduler_config
        self.lr_scheduler_callback = LRSchedulerCallback(config, self)
        self.callbacks.append(self.lr_scheduler_callback)
        on_update_end_fn = (
            on_update_end_fn
            if on_update_end_fn
            else self.lr_scheduler_callback.on_update_end
        )

    if grad_clipping_config:
        self.training_config.clip_gradients = True
        self.training_config.max_grad_l2_norm = grad_clipping_config[
            "max_grad_l2_norm"
        ]
        self.training_config.clip_norm_mode = grad_clipping_config["clip_norm_mode"]

    dataset = NumbersDataset(num_train_data)
    self.train_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=1,
        drop_last=False,
    )
    self.train_loader.current_dataset = dataset

    self.on_batch_start = MagicMock(return_value=None)
    self.on_update_start = MagicMock(return_value=None)
    self.logistics_callback = MagicMock(return_value=None)
    self.logistics_callback.log_interval = MagicMock(return_value=None)
    self.on_batch_end = MagicMock(return_value=None)
    self.on_update_end = (
        on_update_end_fn if on_update_end_fn else MagicMock(return_value=None)
    )
    self.meter = Meter()
    self.after_training_loop = MagicMock(return_value=None)
    self.on_validation_start = MagicMock(return_value=None)
    self.evaluation_loop = MagicMock(return_value=(None, None))
    self.scaler = torch.cuda.amp.GradScaler(enabled=False)
    self.val_loader = MagicMock(return_value=None)
    self.early_stop_callback = MagicMock(return_value=None)
    self.on_validation_end = MagicMock(return_value=None)
    self.metrics = MagicMock(return_value=None)

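# A minimal sketch of how this mock is typically driven: with the mocked
# optimizer and callbacks, only the loop bookkeeping is real, so a test can run
# the loop on CPU and assert on counters (num_updates is assumed from MMF's
# trainer bookkeeping).
trainer = TrainerTrainingLoopMock(num_train_data=100, max_updates=2, max_epochs=None)
trainer.training_loop()
assert trainer.num_updates == 2
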
def test_build_optimizer_simple_model(self):
    model = SimpleModel({"in_dim": 1})
    model.build()
    optimizer = build_optimizer(model, self.config)
    self.assertTrue(isinstance(optimizer, torch.optim.Optimizer))
    self.assertEqual(len(optimizer.param_groups), 1)

def __init__(
    self,
    num_train_data,
    max_updates,
    max_epochs,
    config=None,
    optimizer=None,
    update_frequency=1,
    batch_size=1,
    batch_size_per_device=None,
    fp16=False,
    on_update_end_fn=None,
    scheduler_config=None,
    grad_clipping_config=None,
    tensorboard=False,
):
    if config is None:
        self.config = OmegaConf.create(
            {
                "training": {
                    "detect_anomaly": False,
                    "evaluation_interval": 10000,
                    "update_frequency": update_frequency,
                    "fp16": fp16,
                    "batch_size": batch_size,
                    "batch_size_per_device": batch_size_per_device,
                    "tensorboard": tensorboard,
                }
            }
        )
        self.training_config = self.config.training
    else:
        config.training.batch_size = batch_size
        config.training.fp16 = fp16
        config.training.update_frequency = update_frequency
        config.training.tensorboard = tensorboard
        self.training_config = config.training
        self.config = config

    registry.register("config", self.config)

    if max_updates is not None:
        self.training_config["max_updates"] = max_updates
    if max_epochs is not None:
        self.training_config["max_epochs"] = max_epochs

    self.model = SimpleModel({"in_dim": 1})
    self.model.build()
    if torch.cuda.is_available():
        self.model = self.model.cuda()
        self.device = "cuda"
    else:
        self.device = "cpu"
    self.distributed = False

    if optimizer is None:
        self.optimizer = MagicMock()
        self.optimizer.step = MagicMock(return_value=None)
        self.optimizer.zero_grad = MagicMock(return_value=None)
    else:
        self.optimizer = optimizer

    if scheduler_config:
        config.training.lr_scheduler = True
        config.scheduler = scheduler_config
        self.lr_scheduler_callback = LRSchedulerCallback(config, self)
        self.callbacks.append(self.lr_scheduler_callback)
        on_update_end_fn = (
            on_update_end_fn
            if on_update_end_fn
            else self.lr_scheduler_callback.on_update_end
        )

    if grad_clipping_config:
        self.training_config.clip_gradients = True
        self.training_config.max_grad_l2_norm = grad_clipping_config[
            "max_grad_l2_norm"
        ]
        self.training_config.clip_norm_mode = grad_clipping_config["clip_norm_mode"]

    self.on_batch_start = MagicMock(return_value=None)
    self.on_update_start = MagicMock(return_value=None)
    self.logistics_callback = MagicMock(return_value=None)
    self.logistics_callback.log_interval = MagicMock(return_value=None)
    self.on_batch_end = MagicMock(return_value=None)
    self.on_update_end = (
        on_update_end_fn if on_update_end_fn else MagicMock(return_value=None)
    )
    self.after_training_loop = MagicMock(return_value=None)
    self.on_validation_start = MagicMock(return_value=None)
    self.scaler = torch.cuda.amp.GradScaler(enabled=False)
    self.early_stop_callback = MagicMock(return_value=None)
    self.on_validation_end = MagicMock(return_value=None)
    self.metrics = MagicMock(return_value={})
    self.num_data = num_train_data

class TrainerTrainingLoopMock(MMFTrainer):
    def __init__(
        self,
        num_train_data,
        max_updates,
        max_epochs,
        config=None,
        optimizer=None,
        update_frequency=1,
        batch_size=1,
        batch_size_per_device=None,
        fp16=False,
        on_update_end_fn=None,
        scheduler_config=None,
        grad_clipping_config=None,
        tensorboard=False,
    ):
        if config is None:
            self.config = OmegaConf.create(
                {
                    "training": {
                        "detect_anomaly": False,
                        "evaluation_interval": 10000,
                        "update_frequency": update_frequency,
                        "fp16": fp16,
                        "batch_size": batch_size,
                        "batch_size_per_device": batch_size_per_device,
                        "tensorboard": tensorboard,
                        "run_type": "train",
                    },
                    "evaluation": {"use_cpu": False},
                }
            )
            self.training_config = self.config.training
        else:
            config.training.batch_size = batch_size
            config.training.fp16 = fp16
            config.training.update_frequency = update_frequency
            config.training.tensorboard = tensorboard
            self.training_config = config.training
            self.config = config

        registry.register("config", self.config)

        if max_updates is not None:
            self.training_config["max_updates"] = max_updates
        if max_epochs is not None:
            self.training_config["max_epochs"] = max_epochs

        self.model = SimpleModel({"in_dim": 1})
        self.model.build()
        if torch.cuda.is_available():
            self.model = self.model.cuda()
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.distributed = False

        if optimizer is None:
            self.optimizer = MagicMock()
            self.optimizer.step = MagicMock(return_value=None)
            self.optimizer.zero_grad = MagicMock(return_value=None)
        else:
            self.optimizer = optimizer

        if scheduler_config:
            config.training.lr_scheduler = True
            config.scheduler = scheduler_config
            self.lr_scheduler_callback = LRSchedulerCallback(config, self)
            self.callbacks.append(self.lr_scheduler_callback)
            on_update_end_fn = (
                on_update_end_fn
                if on_update_end_fn
                else self.lr_scheduler_callback.on_update_end
            )

        if grad_clipping_config:
            self.training_config.clip_gradients = True
            self.training_config.max_grad_l2_norm = grad_clipping_config[
                "max_grad_l2_norm"
            ]
            self.training_config.clip_norm_mode = grad_clipping_config[
                "clip_norm_mode"
            ]

        self.on_batch_start = MagicMock(return_value=None)
        self.on_update_start = MagicMock(return_value=None)
        self.logistics_callback = MagicMock(return_value=None)
        self.logistics_callback.log_interval = MagicMock(return_value=None)
        self.on_batch_end = MagicMock(return_value=None)
        self.on_update_end = (
            on_update_end_fn if on_update_end_fn else MagicMock(return_value=None)
        )
        self.on_validation_start = MagicMock(return_value=None)
        self.scaler = torch.cuda.amp.GradScaler(enabled=False)
        self.early_stop_callback = MagicMock(return_value=None)
        self.on_validation_end = MagicMock(return_value=None)
        self.metrics = MagicMock(return_value={})
        self.num_data = num_train_data
        self.run_type = self.config.get("run_type", "train")

    def load_datasets(self):
        self.dataset_loader = MultiDataModuleNumbersTestObject(
            num_data=self.num_data, batch_size=self.config.training.batch_size
        )
        self.dataset_loader.seed_sampler = MagicMock(return_value=None)
        self.dataset_loader.prepare_batch = lambda x: SampleList(x)

        self.train_loader = self.dataset_loader.train_dataloader()
        self.val_loader = self.dataset_loader.val_dataloader()
        self.test_loader = self.dataset_loader.test_dataloader()

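# Hedged sketch of the dataset path: after load_datasets(), the loaders are
# real DataLoaders over NumbersDataset (via MultiDataModuleNumbersTestObject),
# so a test can pull a batch directly. Illustrative only.
trainer = TrainerTrainingLoopMock(num_train_data=4, max_updates=1, max_epochs=None)
trainer.load_datasets()
batch = next(iter(trainer.train_loader))
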
def __init__(
    self,
    num_train_data,
    max_updates,
    max_epochs,
    config=None,
    optimizer=None,
    update_frequency=1,
    batch_size=1,
    fp16=False,
    on_update_end_fn=None,
):
    if config is None:
        self.training_config = OmegaConf.create(
            {
                "detect_anomaly": False,
                "evaluation_interval": 10000,
                "update_frequency": update_frequency,
                "fp16": fp16,
                "batch_size": batch_size,
            }
        )
    else:
        self.training_config = config.training

    if max_updates is not None:
        self.training_config["max_updates"] = max_updates
    if max_epochs is not None:
        self.training_config["max_epochs"] = max_epochs

    self.model = SimpleModel(1)
    if torch.cuda.is_available():
        self.model = self.model.cuda()
        self.device = "cuda"
    else:
        self.device = "cpu"

    self.dataset_loader = MagicMock()
    self.dataset_loader.seed_sampler = MagicMock(return_value=None)
    self.dataset_loader.prepare_batch = lambda x: SampleList(x)

    if optimizer is None:
        self.optimizer = MagicMock()
        self.optimizer.step = MagicMock(return_value=None)
        self.optimizer.zero_grad = MagicMock(return_value=None)
    else:
        self.optimizer = optimizer

    dataset = NumbersDataset(num_train_data)
    self.train_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=1,
        drop_last=False,
    )
    self.train_loader.current_dataset = dataset

    self.on_batch_start = MagicMock(return_value=None)
    self.on_update_start = MagicMock(return_value=None)
    self.logistics_callback = MagicMock(return_value=None)
    self.logistics_callback.log_interval = MagicMock(return_value=None)
    self.on_batch_end = MagicMock(return_value=None)
    self.on_update_end = (
        on_update_end_fn if on_update_end_fn else MagicMock(return_value=None)
    )
    self.meter = Meter()
    self.after_training_loop = MagicMock(return_value=None)
    self.on_validation_start = MagicMock(return_value=None)
    self.evaluation_loop = MagicMock(return_value=(None, None))
    self.scaler = torch.cuda.amp.GradScaler(enabled=False)
    self.val_loader = MagicMock(return_value=None)
    self.early_stop_callback = MagicMock(return_value=None)
    self.on_validation_end = MagicMock(return_value=None)
    self.metrics = MagicMock(return_value=None)