def fit(self, train_loader, epochs, val_loader=None, send_weixin=False,
        save_per_epochs=None, callbacks=None):
    """Train for ``epochs`` epochs and return this run's metric history.

    Args:
        train_loader: Training data handed to ``engine.run``.
        epochs: Number of epochs handed to ``engine.run``; also the number
            of trailing history entries returned per metric.
        val_loader: Optional validation data. When given,
            ``self._evaluate`` is registered to run after every epoch.
        send_weixin: When true, ``self._enable_send_weixin()`` is called
            before training starts.
        save_per_epochs: When truthy, a ``ModelCheckpoint`` handler is
            registered and receives this value as its third positional
            argument (presumably the save interval -- confirm against the
            ignite version in use).
        callbacks: Optional iterable of callables. Each one is wrapped with
            ``_callback_wrapper`` and registered on ``EPOCH_COMPLETED`` with
            ``self`` as the extra handler argument. ``None`` means no
            callbacks.

    Returns:
        A dict mapping each metric name in ``self.metric_history`` to its
        last ``epochs`` entries. Keys starting with ``"val_"`` are removed
        when no ``val_loader`` was given.
    """
    validate = val_loader is not None

    # Weixin
    if send_weixin:
        self._enable_send_weixin()

    # Create engine
    engine = self._create_engine()

    # Register events (handlers for the same event run in registration order)
    engine.add_event_handler(Events.EPOCH_STARTED, self._log_epochs, epochs)
    if validate:
        engine.add_event_handler(
            Events.EPOCH_COMPLETED, self._evaluate, val_loader)
    engine.add_event_handler(
        Events.EPOCH_COMPLETED, self._log_results, validate)

    # Set checkpoint
    if save_per_epochs:
        checkpoint_handler = ModelCheckpoint(
            self.save_path, self.name, save_per_epochs,
            save_as_state_dict=True, require_empty=False)
        # Start the handler's internal counter at the value reported by
        # self.epochs() instead of zero.
        checkpoint_handler._iteration = self.epochs()
        engine.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler, {"trainer": self})

    # A None default avoids sharing one mutable list object across calls.
    for callback in callbacks or ():
        engine.add_event_handler(
            Events.EPOCH_COMPLETED, _callback_wrapper(callback), self)

    # Run; the teardown must happen even when training raises.
    try:
        engine.run(train_loader, epochs)
    finally:
        # Destroy
        self._disable_send_weixin()

    # Return history
    history = {
        metric: values[-epochs:]
        for metric, values in self.metric_history.items()
    }
    if not validate:
        history = keyfilter(lambda k: not k.startswith("val_"), history)
    return history
def do_train(cfg, model, train_loader, val_loader, optimizer, scheduler,
             loss_fn, swriter, resume_iter=0):
    """Run the supervised training loop with checkpointing and logging.

    Args:
        cfg: Configuration object. Reads ``cfg.OUTPUT_DIR``,
            ``cfg.SOLVER.LOG_PERIOD``, ``cfg.SOLVER.MAX_EPOCHS``,
            ``cfg.SOLVER.COARSE_STAGE`` and ``cfg.SOLVER.BUNCH``.
        model: Model passed to the trainer, the evaluator and the
            checkpointer.
        train_loader: Training data; must support ``len()``.
        val_loader: Validation data, or ``None`` to disable validation.
        optimizer: Optimizer; its last param group's ``lr`` is logged.
        scheduler: Learning-rate scheduler, stepped after every iteration.
        loss_fn: Loss passed to the trainer and the evaluator.
        swriter: Summary writer; receives ``Loss/val_loss`` scalars.
        resume_iter: Global iteration to resume from. When positive, the
            engine's iteration/epoch counters and the checkpointer's
            internal counter start from it.
    """
    log_period = cfg.SOLVER.LOG_PERIOD
    output_dir = cfg.OUTPUT_DIR
    epochs = cfg.SOLVER.MAX_EPOCHS
    iters_per_epoch = len(train_loader)
    # In-epoch validation fires on iterations 1, 1001, 2001, ... of an epoch.
    val_vis_period = 1000

    logger = logging.getLogger(
        "RFRender.%s.train" % output_dir.split('/')[-1])
    logger.info("Start training")

    trainer = create_supervised_trainer(
        model, optimizer, loss_fn,
        coarse_stage=cfg.SOLVER.COARSE_STAGE, swriter=swriter)

    checkpointer = ModelCheckpoint(
        output_dir, 'rfnr', n_saved=10, require_empty=False)
    # Start the checkpointer's internal counter at the resume point.
    checkpointer._iteration = resume_iter
    timer = Timer(average=True)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {
        'model': model,
        'optimizer': optimizer,
        'scheduler': scheduler,
    })
    # Only the span between ITERATION_STARTED and ITERATION_COMPLETED is timed.
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED,
                 step=Events.ITERATION_COMPLETED)

    RunningAverage(output_transform=lambda x: x).attach(trainer, 'avg_loss')

    def val_vis(engine):
        """Run the evaluator once, then log and record the validation loss."""
        avg_loss = evaluator(
            val_loader, model, loss_fn, swriter, engine.state.iteration)
        logger.info("Validation Results - Epoch: {} Avg Loss: {:.3f}".format(
            engine.state.epoch, avg_loss))
        swriter.add_scalar('Loss/val_loss', avg_loss, engine.state.epoch)
        # Disabled debug dump, kept for reference:
        #xyz, density = vis_density(model)
        #res = torch.cat([xyz[0],density[0]],dim=1).detach().cpu().numpy()
        #np.savetxt(os.path.join(output_dir,'voxels_%d.txt' % engine.state.epoch),res)

    @trainer.on(Events.STARTED)
    def resume_training(engine):
        """Fast-forward the engine counters when resuming a previous run."""
        if resume_iter > 0:
            engine.state.iteration = resume_iter
            engine.state.epoch = resume_iter // iters_per_epoch

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        """Log progress, periodically validate, and step the scheduler."""
        # 1-based position of this iteration inside the current epoch.
        batch_idx = (engine.state.iteration - 1) % iters_per_epoch + 1

        if batch_idx % log_period == 0:
            # Report the learning rate of the last param group.
            lr = optimizer.param_groups[-1]['lr']
            logger.info(
                "Epoch[{}] Iteration[{}/{}] Loss: {:.3e} Lr: {:.2e} Speed: {:.1f}[rays/s]"
                .format(engine.state.epoch, batch_idx, iters_per_epoch,
                        engine.state.metrics['avg_loss'], lr,
                        float(cfg.SOLVER.BUNCH) / timer.value()))

        # End-of-epoch validation is only registered when a val_loader is
        # given; apply the same guard here so the evaluator is never invoked
        # with val_loader=None.
        if val_loader is not None and batch_idx % val_vis_period == 1:
            val_vis(engine)

        # The scheduler is stepped per iteration, not per epoch.
        scheduler.step()

    # adding handlers using `trainer.on` decorator API
    @trainer.on(Events.EPOCH_COMPLETED)
    def print_times(engine):
        """Log the epoch timing summary and reset the timer."""
        # NOTE(review): with Timer(average=True), value() * step_count looks
        # like the accumulated time rather than a per-batch time -- confirm
        # that the "Time per batch" label is what is intended.
        logger.info(
            'Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[rays/s]'.
            format(engine.state.epoch, timer.value() * timer.step_count,
                   float(cfg.SOLVER.BUNCH) / timer.value()))
        timer.reset()

    if val_loader is not None:

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(engine):
            """Validate once at the end of every epoch."""
            val_vis(engine)

    trainer.run(train_loader, max_epochs=epochs)