def __init__(self, fsm: FileStructManager, is_continue: bool, network_name: str = None):
    super().__init__()
    self.__writer = None
    self.__txt_log_file = None

    fsm.register_dir(self)
    directory = fsm.get_path(self)
    if directory is None:
        return

    directory = os.path.join(directory, network_name) if network_name is not None else directory

    # For a fresh run, never overwrite an existing run directory: pick the first free
    # "_v<idx>" suffix instead. In continue mode the existing directory is reused.
    if not (fsm.in_continue_mode() or is_continue) and os.path.exists(directory) and os.path.isdir(directory):
        idx = 0
        tmp_dir = directory + "_v{}".format(idx)
        while os.path.exists(tmp_dir) and os.path.isdir(tmp_dir):
            idx += 1
            tmp_dir = directory + "_v{}".format(idx)
        directory = tmp_dir

    os.makedirs(directory, exist_ok=True)
    self.__writer = SummaryWriter(directory)
    # Append to the text log when continuing a run, otherwise start a new one.
    self.__txt_log_file = open(os.path.join(directory, "log.txt"), 'a' if is_continue else 'w')
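# The directory-versioning rule above, factored out as a standalone sketch for clarity.
# `_versioned_dir` is a hypothetical helper, not part of the monitor's API: an existing
# run directory is never overwritten; a fresh run gets the first free "_v<idx>" suffix.
import os


def _versioned_dir(path: str) -> str:
    if not os.path.isdir(path):
        return path
    idx = 0
    while os.path.isdir(path + "_v{}".format(idx)):
        idx += 1
    return path + "_v{}".format(idx)


# e.g. 'runs/MyNet' on the first run, then 'runs/MyNet_v0', 'runs/MyNet_v1', ...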
def test_savig_best_states(self):
    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    metrics_processor = MetricsProcessor()
    stages = [TrainStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                             for _ in range(20)]]), metrics_processor)]
    trainer = Trainer(TrainConfig(model, stages, SimpleLoss(), torch.optim.SGD(model.parameters(), lr=0.1)), fsm) \
        .set_epoch_num(3).enable_best_states_saving(lambda: np.mean(stages[0].get_losses()))

    checkpoint_file = os.path.join(self.base_dir, 'checkpoints', 'last', 'last_checkpoint.zip')
    best_checkpoint_file = os.path.join(self.base_dir, 'checkpoints', 'best', 'best_checkpoint.zip')

    class Val:
        def __init__(self):
            self.v = None

    first_val = Val()

    def on_epoch_end(val):
        # The 'best' checkpoint must appear when the mean loss improves;
        # otherwise only the 'last' checkpoint should be present.
        if val.v is not None and np.mean(stages[0].get_losses()) < val.v:
            self.assertTrue(os.path.exists(best_checkpoint_file))
            os.remove(best_checkpoint_file)
            val.v = np.mean(stages[0].get_losses())
            return

        val.v = np.mean(stages[0].get_losses())

        self.assertTrue(os.path.exists(checkpoint_file))
        self.assertFalse(os.path.exists(best_checkpoint_file))
        os.remove(checkpoint_file)

    trainer.add_on_epoch_end_callback(lambda: on_epoch_end(first_val))
    trainer.train()
def test_base_ops(self):
    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    trainer = Trainer(TrainConfig(model, [], torch.nn.L1Loss(), torch.optim.SGD(model.parameters(), lr=1)), fsm)
    with self.assertRaises(Trainer.TrainerException):
        trainer.train()
def test_train(self):
    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    metrics_processor = MetricsProcessor()
    stages = [TrainStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                             for _ in range(20)]]), metrics_processor),
              ValidationStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                                 for _ in range(20)]]), metrics_processor)]
    Trainer(TrainConfig(model, stages, SimpleLoss(), torch.optim.SGD(model.parameters(), lr=1)), fsm) \
        .set_epoch_num(1).train()
def test_train_stage(self):
    data_producer = DataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                   for _ in range(20)]])
    metrics_processor = FakeMetricsProcessor()
    # Mine hard negatives from a 0.1 fraction of the data.
    train_stage = TrainStage(data_producer, metrics_processor).enable_hard_negative_mining(0.1)

    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    Trainer(TrainConfig(model, [train_stage], SimpleLoss(), torch.optim.SGD(model.parameters(), lr=1)), fsm) \
        .set_epoch_num(1).train()

    # The number of metrics-processor calls must match the length of the data producer.
    self.assertEqual(metrics_processor.call_num, len(data_producer))
def test_savig_states(self):
    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    metrics_processor = MetricsProcessor()
    stages = [TrainStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                             for _ in range(20)]]), metrics_processor)]
    trainer = Trainer(TrainConfig(model, stages, SimpleLoss(), torch.optim.SGD(model.parameters(), lr=0.1)), fsm) \
        .set_epoch_num(3)

    checkpoint_file = os.path.join(self.base_dir, 'checkpoints', 'last', 'last_checkpoint.zip')

    def on_epoch_end():
        # The 'last' checkpoint must be written at the end of every epoch.
        self.assertTrue(os.path.exists(checkpoint_file))
        os.remove(checkpoint_file)

    trainer.add_on_epoch_end_callback(on_epoch_end)
    trainer.train()
def train():
    model = resnet18(classes_num=1, in_channels=3, pretrained=True)
    # `train_stage` and `val_stage` are defined elsewhere in this example module.
    train_config = TrainConfig(model, [train_stage, val_stage], torch.nn.BCEWithLogitsLoss(),
                               torch.optim.Adam(model.parameters(), lr=1e-4))

    file_struct_manager = FileStructManager(base_dir='data', is_continue=False)

    trainer = Trainer(train_config, file_struct_manager, torch.device('cuda:0')).set_epoch_num(2)

    tensorboard = TensorboardMonitor(file_struct_manager, is_continue=False, network_name='PortraitSegmentation')
    log = LogMonitor(file_struct_manager).write_final_metrics()
    trainer.monitor_hub.add_monitor(tensorboard).add_monitor(log)

    # Track the best state and decay the learning rate based on the mean training loss.
    trainer.enable_best_states_saving(lambda: np.mean(train_stage.get_losses()))
    trainer.enable_lr_decaying(coeff=0.5, patience=10,
                               target_val_clbk=lambda: np.mean(train_stage.get_losses()))

    # Log the current learning rate to TensorBoard after every epoch.
    trainer.add_on_epoch_end_callback(lambda: tensorboard.update_scalar('params/lr',
                                                                        trainer.data_processor().get_lr()))
    trainer.train()
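# `train_stage` and `val_stage` used by train() above are built elsewhere in the example.
# A hypothetical sketch of that setup, reusing only the classes shown in this section;
# the random {'data': ..., 'target': ...} items mirror the format from the tests above and
# are placeholders for the real portrait-segmentation dataset.
metrics_processor = MetricsProcessor()
train_stage = TrainStage(DataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                         for _ in range(20)]]), metrics_processor)
val_stage = ValidationStage(DataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                           for _ in range(20)]]), metrics_processor)

if __name__ == '__main__':
    train()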
def test_lr_decaying(self):
    fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
    model = SimpleModel()
    metrics_processor = MetricsProcessor()
    stages = [TrainStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                             for _ in range(20)]]), metrics_processor),
              ValidationStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                                 for _ in range(20)]]), metrics_processor)]
    trainer = Trainer(TrainConfig(model, stages, SimpleLoss(), torch.optim.SGD(model.parameters(), lr=0.1)), fsm) \
        .set_epoch_num(10)

    def target_value_clbk() -> float:
        return 1

    trainer.enable_lr_decaying(0.5, 3, target_value_clbk)
    trainer.train()

    # The target value never improves, so the learning rate should be decayed three times
    # over 10 epochs with patience=3: lr = 0.1 * 0.5 ** 3.
    self.assertAlmostEqual(trainer.data_processor().get_lr(), 0.1 * (0.5 ** 3), delta=1e-6)
def train():
    train_config = PoseNetTrainConfig()

    file_struct_manager = FileStructManager(base_dir=PoseNetTrainConfig.experiment_dir, is_continue=False)

    trainer = Trainer(train_config, file_struct_manager, torch.device('cuda'))
    trainer.set_epoch_num(EPOCH_NUM)

    tensorboard = TensorboardMonitor(file_struct_manager, is_continue=False)
    log = LogMonitor(file_struct_manager).write_final_metrics()
    trainer.monitor_hub.add_monitor(tensorboard).add_monitor(log)

    # Track the best state and decay the learning rate based on the mean validation loss.
    trainer.enable_best_states_saving(lambda: np.mean(train_config.val_stage.get_losses()))
    trainer.enable_lr_decaying(coeff=0.5, patience=10,
                               target_val_clbk=lambda: np.mean(train_config.val_stage.get_losses()))

    # Log the current learning rate to TensorBoard after every epoch.
    trainer.add_on_epoch_end_callback(lambda: tensorboard.update_scalar('params/lr',
                                                                        trainer.data_processor().get_lr()))
    trainer.train()
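# PoseNetTrainConfig and EPOCH_NUM are defined elsewhere in this example. The skeleton
# below is a hypothetical reconstruction of the shape implied by the usage above
# (a class-level experiment_dir, a val_stage attribute, and the TrainConfig signature used
# throughout this section); SimpleModel, SimpleLoss, MetricsProcessor and TestDataProducer
# from the tests are used purely as stand-ins for the real PoseNet model, loss and data.
EPOCH_NUM = 100  # assumed value


class PoseNetTrainConfig(TrainConfig):
    experiment_dir = 'experiments/posenet'  # assumed path

    def __init__(self):
        model = SimpleModel()
        metrics_processor = MetricsProcessor()
        train_data = TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                        for _ in range(20)]])
        val_data = TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                      for _ in range(20)]])
        self.train_stage = TrainStage(train_data, metrics_processor)
        self.val_stage = ValidationStage(val_data, metrics_processor)
        super().__init__(model, [self.train_stage, self.val_stage], SimpleLoss(),
                         torch.optim.SGD(model.parameters(), lr=0.1))


if __name__ == '__main__':
    train()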