# Common imports for the snippets below. The neural-pipeline import paths are
# assumptions and may differ between versions of the library.
import os

import numpy as np
import torch
import torchvision

from neural_pipeline import AbstractDataset, DataProducer, FileStructManager, \
    MetricsProcessor, TrainConfig, TrainStage, Trainer, ValidationStage
from neural_pipeline.builtin.monitors.tensorboard import TensorboardMonitor
from neural_pipeline.monitoring import LogMonitor


def test_base_ops(self):
    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    trainer = Trainer(TrainConfig(model, [], torch.nn.L1Loss(),
                                  torch.optim.SGD(model.parameters(), lr=1)), fsm)
    with self.assertRaises(Trainer.TrainerException):
        trainer.train()
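The tests in this section reference SimpleModel, SimpleLoss and TestDataProducer without defining them. Below is a minimal sketch consistent with the (1, 3) inputs and (1,) targets used throughout; these are assumptions, not the test suite's actual fixtures:

# Assumed fixtures (sketches, not the real test-suite definitions)
class SimpleModel(torch.nn.Module):
    """A single linear layer mapping the (1, 3) inputs to scalar outputs."""

    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(3, 1)

    def forward(self, data):
        return self.fc(data)


class SimpleLoss(torch.nn.Module):
    """A trivial differentiable loss over (output, target) pairs."""

    def forward(self, output, target):
        return torch.mean((output - target) ** 2)


class TestDataProducer(DataProducer):
    """A plain DataProducer over the in-memory lists of
    {'data', 'target'} dicts built inline in the tests."""
    pass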
def test_lr_decaying(self):
    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    metrics_processor = MetricsProcessor()
    stages = [TrainStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                            for _ in range(20)]]), metrics_processor),
              ValidationStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                                 for _ in range(20)]]), metrics_processor)]
    trainer = Trainer(model, TrainConfig(stages, SimpleLoss(),
                                         torch.optim.SGD(model.parameters(), lr=0.1)),
                      fsm).set_epoch_num(10)

    def target_value_clbk() -> float:
        return 1

    trainer.enable_lr_decaying(0.5, 3, target_value_clbk)
    trainer.train()

    # The callback returns a constant, so the monitored value never improves and
    # the lr is halved every `patience` (3) epochs: three decays over 10 epochs.
    self.assertAlmostEqual(trainer.data_processor().get_lr(), 0.1 * (0.5 ** 3), delta=1e-6)
def test_train(self):
    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    metrics_processor = MetricsProcessor()
    stages = [TrainStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                            for _ in range(20)]]), metrics_processor),
              ValidationStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                                 for _ in range(20)]]), metrics_processor)]
    Trainer(TrainConfig(model, stages, SimpleLoss(), torch.optim.SGD(model.parameters(), lr=1)), fsm) \
        .set_epoch_num(1).train()
def test_train_stage(self):
    data_producer = DataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                   for _ in range(20)]])
    metrics_processor = FakeMetricsProcessor()
    train_stage = TrainStage(data_producer, metrics_processor).enable_hard_negative_mining(0.1)

    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    Trainer(TrainConfig(model, [train_stage], SimpleLoss(), torch.optim.SGD(model.parameters(), lr=1)), fsm) \
        .set_epoch_num(1).train()

    self.assertEqual(metrics_processor.call_num, len(data_producer))
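The assertion above counts metric-calculation calls through FakeMetricsProcessor, which is also not shown. A plausible sketch, assuming MetricsProcessor exposes a calc_metrics(output, target) hook (an assumption about the library's API, not a confirmed signature):

class FakeMetricsProcessor(MetricsProcessor):
    """Assumed fixture: counts how many times metrics are calculated."""

    def __init__(self):
        super().__init__()
        self.call_num = 0

    def calc_metrics(self, output, target):
        # Only count invocations; compute no actual metrics
        self.call_num += 1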
def test_saving_best_states(self):
    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    metrics_processor = MetricsProcessor()
    stages = [TrainStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                            for _ in range(20)]]), metrics_processor)]
    trainer = Trainer(TrainConfig(model, stages, SimpleLoss(), torch.optim.SGD(model.parameters(), lr=0.1)), fsm) \
        .set_epoch_num(3).enable_best_states_saving(lambda: np.mean(stages[0].get_losses()))

    checkpoint_file = os.path.join(self.base_dir, 'checkpoints', 'last', 'last_checkpoint.zip')
    best_checkpoint_file = os.path.join(self.base_dir, 'checkpoints', 'best', 'best_checkpoint.zip')

    class Val:
        def __init__(self):
            self.v = None

    first_val = Val()

    def on_epoch_end(val):
        if val.v is not None and np.mean(stages[0].get_losses()) < val.v:
            # The loss improved, so a 'best' checkpoint must have been written
            self.assertTrue(os.path.exists(best_checkpoint_file))
            os.remove(best_checkpoint_file)
            val.v = np.mean(stages[0].get_losses())
            return

        val.v = np.mean(stages[0].get_losses())
        self.assertTrue(os.path.exists(checkpoint_file))
        self.assertFalse(os.path.exists(best_checkpoint_file))
        os.remove(checkpoint_file)

    trainer.add_on_epoch_end_callback(lambda: on_epoch_end(first_val))
    trainer.train()
def test_saving_states(self):
    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    metrics_processor = MetricsProcessor()
    stages = [TrainStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                            for _ in range(20)]]), metrics_processor)]
    trainer = Trainer(model, TrainConfig(stages, SimpleLoss(), torch.optim.SGD(model.parameters(), lr=0.1)),
                      fsm).set_epoch_num(3)

    checkpoint_file = os.path.join(self.base_dir, 'checkpoints', 'last', 'last_checkpoint.zip')

    def on_epoch_end():
        self.assertTrue(os.path.exists(checkpoint_file))
        os.remove(checkpoint_file)

    trainer.add_on_epoch_end_callback(on_epoch_end)
    trainer.train()
def train(num_epochs=5):
    fsm = FileStructManager(base_dir="models/UoI/", is_continue=False)
    model = HumanBBox()

    train_dataset = DataProducer(
        [BBoxDataset("coco/train2017_one_human.csv", size=SZ, type="train", fastai_out=False)],
        batch_size=8, num_workers=5)
    validation_dataset = DataProducer(
        [BBoxDataset("coco/val2017_one_human_train.csv", size=SZ, type="val", fastai_out=False)],
        batch_size=4, num_workers=2)

    train_config = TrainConfig(
        [TrainStage(train_dataset), ValidationStage(validation_dataset)],
        # torch.nn.L1Loss(),
        IoU,
        torch.optim.SGD(model.parameters(), lr=5e-3, momentum=0.8))

    trainer = Trainer(model, train_config, fsm, torch.device("cuda:0")) \
        .set_epoch_num(num_epochs)
    # .enable_lr_decaying(0.97, 1000)
    trainer.monitor_hub.add_monitor(TensorboardMonitor(fsm, is_continue=True))
    # .add_monitor(LogMonitor(fsm))
    # .resume(from_best_checkpoint=False)
    trainer.train()
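The IoU passed to TrainConfig above is defined elsewhere. For reference, a minimal sketch of an IoU-based bounding-box loss, under the assumption that predictions and targets are (N, 4) tensors of (x1, y1, x2, y2) corners; the project's actual IoU implementation may differ:

class IoULoss(torch.nn.Module):
    """Sketch of a loss that minimizes 1 - IoU over predicted/target boxes."""

    def forward(self, output: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        # Intersection rectangle corners
        x1 = torch.max(output[:, 0], target[:, 0])
        y1 = torch.max(output[:, 1], target[:, 1])
        x2 = torch.min(output[:, 2], target[:, 2])
        y2 = torch.min(output[:, 3], target[:, 3])
        inter = (x2 - x1).clamp(min=0) * (y2 - y1).clamp(min=0)

        # Areas of both boxes (clamped so degenerate predictions stay non-negative)
        area_out = (output[:, 2] - output[:, 0]).clamp(min=0) * (output[:, 3] - output[:, 1]).clamp(min=0)
        area_tgt = (target[:, 2] - target[:, 0]).clamp(min=0) * (target[:, 3] - target[:, 1]).clamp(min=0)

        union = (area_out + area_tgt - inter).clamp(min=1e-6)
        return (1 - inter / union).mean()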
def train(config_type: BaseTrainConfig):
    fsm = FileStructManager(base_dir=config_type.experiment_dir, is_continue=False)
    config = config_type({'train': ['train.npy'], 'val': 'val.npy'})

    trainer = Trainer(config, fsm, device=torch.device('cuda'))
    tensorboard = TensorboardMonitor(fsm, is_continue=False)
    trainer.monitor_hub.add_monitor(tensorboard)

    trainer.set_epoch_num(300)
    trainer.enable_lr_decaying(coeff=0.5, patience=10,
                               target_val_clbk=lambda: np.mean(config.val_stage.get_losses()))
    trainer.add_on_epoch_end_callback(
        lambda: tensorboard.update_scalar('params/lr', trainer.data_processor().get_lr()))
    trainer.enable_best_states_saving(lambda: np.mean(config.val_stage.get_losses()))
    trainer.add_stop_rule(lambda: trainer.data_processor().get_lr() < 1e-6)
    trainer.train()
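This train() only assumes a contract on config_type: a class-level experiment_dir, a constructor that takes the dataset-path dict, and a val_stage attribute. A hypothetical subclass illustrating that contract; every name inside it (MyTrainConfig, MyModel, build_producer) is a placeholder, and the super().__init__ call is assumed to mirror TrainConfig's signature:

class MyTrainConfig(BaseTrainConfig):
    experiment_dir = 'experiments/my_model'  # hypothetical path

    def __init__(self, data_paths: dict):
        model = MyModel()  # hypothetical model
        train_stage = TrainStage(build_producer(data_paths['train']))        # build_producer is a
        self.val_stage = ValidationStage(build_producer(data_paths['val']))  # hypothetical helper
        super().__init__([train_stage, self.val_stage], torch.nn.L1Loss(),
                         torch.optim.SGD(model.parameters(), lr=0.1))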
# Reconstructed head of the dataset wrapper used below; the MNIST source and the
# normalization transforms are assumptions consistent with the methods that follow.
class MNISTDataset(AbstractDataset):
    transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
                                                 torchvision.transforms.Normalize((0.1307,), (0.3081,))])

    def __init__(self, data_dir: str, is_train: bool):
        self.dataset = torchvision.datasets.MNIST(data_dir, train=is_train, download=True)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        data, target = self.dataset[item]
        return {'data': self.transforms(data), 'target': target}


if __name__ == '__main__':
    fsm = FileStructManager(base_dir='data', is_continue=False)
    model = Net()

    train_dataset = DataProducer([MNISTDataset('data/dataset', True)], batch_size=4, num_workers=2)
    validation_dataset = DataProducer([MNISTDataset('data/dataset', False)], batch_size=4, num_workers=2)

    train_config = TrainConfig(model, [TrainStage(train_dataset), ValidationStage(validation_dataset)],
                               torch.nn.NLLLoss(),
                               torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.5))

    trainer = Trainer(train_config, fsm, torch.device('cuda:0')).set_epoch_num(5)
    trainer.monitor_hub.add_monitor(TensorboardMonitor(fsm, is_continue=False))
    trainer.train()
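Net above is defined elsewhere. Since the config uses torch.nn.NLLLoss(), the model has to emit log-probabilities; a minimal sketch along the lines of the classic PyTorch MNIST example (an assumption, not this example's actual network):

import torch.nn.functional as F

class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = torch.nn.Conv2d(10, 20, kernel_size=5)
        self.fc1 = torch.nn.Linear(320, 50)   # 20 channels * 4 * 4 after two pools
        self.fc2 = torch.nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        # Log-probabilities, as required by NLLLoss
        return F.log_softmax(self.fc2(x), dim=1)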
def train():
    train_config = PoseNetTrainConfig()
    file_struct_manager = FileStructManager(base_dir=PoseNetTrainConfig.experiment_dir, is_continue=False)

    trainer = Trainer(train_config, file_struct_manager, torch.device('cuda'))
    trainer.set_epoch_num(EPOCH_NUM)

    tensorboard = TensorboardMonitor(file_struct_manager, is_continue=False)
    log = LogMonitor(file_struct_manager).write_final_metrics()
    trainer.monitor_hub.add_monitor(tensorboard).add_monitor(log)

    trainer.enable_best_states_saving(lambda: np.mean(train_config.val_stage.get_losses()))
    trainer.enable_lr_decaying(coeff=0.5, patience=10,
                               target_val_clbk=lambda: np.mean(train_config.val_stage.get_losses()))
    trainer.add_on_epoch_end_callback(
        lambda: tensorboard.update_scalar('params/lr', trainer.data_processor().get_lr()))
    trainer.train()
def train():
    model = resnet18(classes_num=1, in_channels=3, pretrained=True)
    train_config = TrainConfig(model, [train_stage, val_stage], torch.nn.BCEWithLogitsLoss(),
                               torch.optim.Adam(model.parameters(), lr=1e-4))

    file_struct_manager = FileStructManager(base_dir='data', is_continue=False)
    trainer = Trainer(train_config, file_struct_manager, torch.device('cuda:0')).set_epoch_num(2)

    tensorboard = TensorboardMonitor(file_struct_manager, is_continue=False, network_name='PortraitSegmentation')
    log = LogMonitor(file_struct_manager).write_final_metrics()
    trainer.monitor_hub.add_monitor(tensorboard).add_monitor(log)

    trainer.enable_best_states_saving(lambda: np.mean(train_stage.get_losses()))
    trainer.enable_lr_decaying(coeff=0.5, patience=10,
                               target_val_clbk=lambda: np.mean(train_stage.get_losses()))
    trainer.add_on_epoch_end_callback(
        lambda: tensorboard.update_scalar('params/lr', trainer.data_processor().get_lr()))
    trainer.train()
def continue_training():
    ########################################################
    # Create the needed parameters again
    ########################################################
    model = resnet18(classes_num=1, in_channels=3, pretrained=True)
    train_config = TrainConfig([train_stage, val_stage], torch.nn.BCEWithLogitsLoss(),
                               torch.optim.Adam(model.parameters(), lr=1e-4))

    ########################################################
    # When FileStructManager is created again, just set the 'is_continue' parameter to True
    ########################################################
    file_struct_manager = FileStructManager(base_dir='data', is_continue=True)

    trainer = Trainer(model, train_config, file_struct_manager, torch.device('cuda:0')).set_epoch_num(10)

    tensorboard = TensorboardMonitor(file_struct_manager, is_continue=False, network_name='PortraitSegmentation')
    log = LogMonitor(file_struct_manager).write_final_metrics()
    trainer.monitor_hub.add_monitor(tensorboard).add_monitor(log)

    trainer.enable_best_states_saving(lambda: np.mean(train_stage.get_losses()))
    trainer.enable_lr_decaying(coeff=0.5, patience=10,
                               target_val_clbk=lambda: np.mean(train_stage.get_losses()))
    trainer.add_on_epoch_end_callback(
        lambda: tensorboard.update_scalar('params/lr', trainer.data_processor().get_lr()))

    ########################################################
    # To put the Trainer into resume mode, just call the 'resume' method
    ########################################################
    trainer.resume(from_best_checkpoint=False).train()