def _write_msg_to_logbook(self, msg):
    # Note: this sometimes fails when the previous write has not finished yet.
    # Hence, we wrap it in a loop that waits for success.
    while True:
        try:
            logging.logbook(msg)
        except IOError as e:
            if e.errno != errno.EINTR:
                raise ValueError("Logbook TQDM IOError")
            else:
                time.sleep(0.2)
        else:
            break
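# ----------------------------------------------------------------------------------------------
# Editor's sketch (not part of the original code): the loop above is the common EINTR-retry
# idiom for I/O that can be interrupted by signals. A minimal standalone version, assuming a
# hypothetical `write_fn` that may raise IOError with errno.EINTR, could look like this:
# ----------------------------------------------------------------------------------------------
import errno
import time


def _retry_on_eintr(write_fn, *args, delay=0.2, **kwargs):
    # Keep calling write_fn until it succeeds; only EINTR is retried, everything else re-raises.
    while True:
        try:
            return write_fn(*args, **kwargs)
        except IOError as e:
            if e.errno != errno.EINTR:
                raise
            time.sleep(delay)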
def update(self, n=1):
    if self.logging_on_update:
        msg = self.__repr__()
        logging.logbook(msg)
    res = super().update(n=n)
    # remember ETA if tracking is enabled
    if self.track_eta:
        if self.total is not None:
            rate = 1 / self.avg_time if self.avg_time else None
            if rate is None:
                elapsed = self._time() - self.start_t
                rate = self.n / elapsed
            self.eta = TqdmToLogger.format_eta((self.total - self.n) / rate)
    return res
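# ----------------------------------------------------------------------------------------------
# Editor's sketch (assumption): `TqdmToLogger.format_eta` is not shown in this excerpt. A
# plausible helper that turns the remaining seconds computed above into a human-readable string
# might look like the following; the project's real implementation may differ.
# ----------------------------------------------------------------------------------------------
def _format_eta_sketch(seconds_remaining):
    # Convert a float number of seconds into an "H:MM:SS" string.
    seconds = int(round(seconds_remaining))
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return "%d:%02d:%02d" % (hours, minutes, secs)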
def close(self):
    if self._logging_on_close and not self._closed:
        msg = self.__repr__()
        logging.logbook(msg)
        self._closed = True
    return super(TqdmToLogger, self).close()
def update(self, n=1):
    if self._logging_on_update:
        msg = self.__repr__()
        logging.logbook(msg)
    return super(TqdmToLogger, self).update(n=n)
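# ----------------------------------------------------------------------------------------------
# Editor's sketch (assumption): the update()/close() overrides above appear to belong to a tqdm
# subclass that mirrors the progress-bar string into a logger. A minimal self-contained version
# of such a wrapper, using the standard `tqdm` and `logging` modules instead of the project's
# `logging.logbook` hook, might look like this:
# ----------------------------------------------------------------------------------------------
import logging as _std_logging

from tqdm import tqdm


class _TqdmToLoggerSketch(tqdm):
    def __init__(self, *args, logging_on_update=False, logging_on_close=True, **kwargs):
        # Strip our custom flags before handing the remaining kwargs to tqdm.
        self._logging_on_update = logging_on_update
        self._logging_on_close = logging_on_close
        self._closed = False
        super().__init__(*args, **kwargs)

    def update(self, n=1):
        if self._logging_on_update:
            _std_logging.info(self.__repr__())
        return super().update(n)

    def close(self):
        if self._logging_on_close and not self._closed:
            _std_logging.info(self.__repr__())
            self._closed = True
        return super().close()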
def exec_runtime(args,
                 checkpoint_saver,
                 model_and_loss,
                 optimizer,
                 lr_scheduler,
                 train_loader,
                 validation_loader,
                 inference_loader,
                 training_augmentation,
                 validation_augmentation):

    # ----------------------------------------------------------------------------------------------
    # Validation schedulers are a bit special:
    # They want to be called with a validation loss..
    # ----------------------------------------------------------------------------------------------
    validation_scheduler = (lr_scheduler is not None and args.lr_scheduler == "ReduceLROnPlateau")

    # --------------------------------------------------------
    # Log some runtime info
    # --------------------------------------------------------
    with logger.LoggingBlock("Runtime", emph=True):
        logging.info("start_epoch: %i" % args.start_epoch)
        logging.info("total_epochs: %i" % args.total_epochs)

    # ---------------------------------------
    # Total progress bar arguments
    # ---------------------------------------
    progressbar_args = {
        "desc": "Progress",
        "initial": args.start_epoch - 1,
        "invert_iterations": True,
        "iterable": range(1, args.total_epochs + 1),
        "logging_on_close": True,
        "logging_on_update": True,
        "postfix": False,
        "unit": "ep"
    }

    # --------------------------------------------------------
    # Total progress bar
    # --------------------------------------------------------
    print(''), logging.logbook('')
    total_progress = create_progressbar(**progressbar_args)
    print("\n")

    # --------------------------------------------------------
    # Remember validation loss
    # --------------------------------------------------------
    best_validation_loss = float("inf") if args.validation_key_minimize else -float("inf")
    store_as_best = False

    for epoch in range(args.start_epoch, args.total_epochs + 1):
        with logger.LoggingBlock("Epoch %i/%i" % (epoch, args.total_epochs), emph=True):

            # --------------------------------------------------------
            # Update standard learning scheduler
            # --------------------------------------------------------
            if lr_scheduler is not None and not validation_scheduler:
                lr_scheduler.step(epoch)

            # --------------------------------------------------------
            # Always report learning rate
            # --------------------------------------------------------
            if lr_scheduler is None:
                logging.info("lr: %s" % format_learning_rate(args.optimizer_lr))
            else:
                logging.info("lr: %s" % format_learning_rate(lr_scheduler.get_lr()))

            # -------------------------------------------
            # Create and run a training epoch
            # -------------------------------------------
            if train_loader is not None:
                avg_loss_dict = TrainingEpoch(
                    args,
                    desc=" Train",
                    model_and_loss=model_and_loss,
                    optimizer=optimizer,
                    loader=train_loader,
                    augmentation=training_augmentation,
                    checkpoint_saver=checkpoint_saver,
                    checkpoint_args={
                        'directory': args.save,
                        'model_and_loss': model_and_loss,
                        'stats_dict': dict({'epe': 0, 'F1': 0}, epoch=epoch),
                        'store_as_best': False
                    }).run()

            # -------------------------------------------
            # Create and run a validation epoch
            # -------------------------------------------
            if validation_loader is not None:

                # ---------------------------------------------------
                # Construct holistic recorder for epoch
                # ---------------------------------------------------
                avg_loss_dict = EvaluationEpoch(
                    args,
                    desc="Validate",
                    model_and_loss=model_and_loss,
                    loader=validation_loader,
                    augmentation=validation_augmentation).run()

                # ----------------------------------------------------------------
                # Evaluate whether this is the best validation_loss
                # ----------------------------------------------------------------
                validation_loss = avg_loss_dict[args.validation_key]
                if args.validation_key_minimize:
                    store_as_best = validation_loss < best_validation_loss
                else:
                    store_as_best = validation_loss > best_validation_loss
                if store_as_best:
                    best_validation_loss = validation_loss

                # ----------------------------------------------------------------
                # Update validation scheduler, if one is in place
                # ----------------------------------------------------------------
                if lr_scheduler is not None and validation_scheduler:
                    lr_scheduler.step(validation_loss, epoch=epoch)

                # ----------------------------------------------------------------
                # Also show best loss on total_progress
                # ----------------------------------------------------------------
                total_progress_stats = {
                    "best_" + args.validation_key + "_avg": "%1.4f" % best_validation_loss
                }
                total_progress.set_postfix(total_progress_stats)

            # ----------------------------------------------------------------
            # Bump total progress
            # ----------------------------------------------------------------
            total_progress.update()
            print('')

            # ----------------------------------------------------------------
            # Store checkpoint
            # ----------------------------------------------------------------
            if checkpoint_saver is not None:
                checkpoint_saver.save_latest(
                    directory=args.save,
                    model_and_loss=model_and_loss,
                    stats_dict=dict(avg_loss_dict, epoch=epoch),
                    store_as_best=store_as_best)

            # ----------------------------------------------------------------
            # Vertical space between epochs
            # ----------------------------------------------------------------
            print(''), logging.logbook('')

            # quit after completing epoch
            quit()

    # ----------------------------------------------------------------
    # Finish
    # ----------------------------------------------------------------
    total_progress.close()
    logging.info("Finished.")
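# ----------------------------------------------------------------------------------------------
# Editor's sketch (not part of the original code): the `validation_scheduler` flag above
# distinguishes ReduceLROnPlateau, which must be stepped with a validation metric, from ordinary
# epoch-based schedulers. A minimal standalone illustration of that distinction using stock
# PyTorch; the model/optimizer wiring in the usage comment is hypothetical.
# ----------------------------------------------------------------------------------------------
import torch


def _step_scheduler_sketch(scheduler, validation_loss=None):
    # ReduceLROnPlateau consumes a metric; every other scheduler is stepped once per epoch.
    if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        scheduler.step(validation_loss)
    else:
        scheduler.step()

# Example wiring (hypothetical names):
#   optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
#   scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=5)
#   _step_scheduler_sketch(scheduler, validation_loss=val_loss)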
def exec_runtime(args,
                 checkpoint_saver,
                 model_and_loss,
                 optimizer,
                 lr_scheduler,
                 train_loader,
                 validation_loader,
                 inference_loader,
                 training_augmentation,
                 validation_augmentation):

    # ----------------------------------------------------------------------------------------------
    # Tensorboard writer
    # ----------------------------------------------------------------------------------------------
    if args.evaluation is False:
        tensorBoardWriter = SummaryWriter(args.save + '/writer')
    else:
        tensorBoardWriter = None

    if train_loader is not None:
        training_module = TrainingEpoch(
            args,
            desc=" Train",
            loader=train_loader,
            augmentation=training_augmentation,
            tbwriter=tensorBoardWriter)

    if validation_loader is not None:
        evaluation_module = EvaluationEpoch(
            args,
            desc="Validate",
            loader=validation_loader,
            augmentation=validation_augmentation,
            tbwriter=tensorBoardWriter)

    # --------------------------------------------------------
    # Log some runtime info
    # --------------------------------------------------------
    with logger.LoggingBlock("Runtime", emph=True):
        logging.info("start_epoch: %i" % args.start_epoch)
        logging.info("total_epochs: %i" % args.total_epochs)

    # ---------------------------------------
    # Total progress bar arguments
    # ---------------------------------------
    progressbar_args = {
        "desc": "Progress",
        "initial": args.start_epoch - 1,
        "invert_iterations": True,
        "iterable": range(1, args.total_epochs + 1),
        "logging_on_close": True,
        "logging_on_update": True,
        "postfix": False,
        "unit": "ep"
    }

    # --------------------------------------------------------
    # Total progress bar
    # --------------------------------------------------------
    print(''), logging.logbook('')
    total_progress = create_progressbar(**progressbar_args)
    print("\n")

    # --------------------------------------------------------
    # Remember validation loss
    # --------------------------------------------------------
    best_validation_loss = float("inf") if args.validation_key_minimize else -float("inf")
    store_as_best = False

    for epoch in range(args.start_epoch, args.total_epochs + 1):
        with logger.LoggingBlock("Epoch %i/%i" % (epoch, args.total_epochs), emph=True):

            # --------------------------------------------------------
            # Always report learning rate
            # --------------------------------------------------------
            if lr_scheduler is None:
                logging.info("lr: %s" % format_learning_rate(args.optimizer_lr))
            else:
                logging.info("lr: %s" % format_learning_rate(lr_scheduler.get_lr()))

            # -------------------------------------------
            # Create and run a training epoch
            # -------------------------------------------
            if train_loader is not None:
                avg_loss_dict, _ = training_module.run(model_and_loss=model_and_loss, optimizer=optimizer)
                if args.evaluation is False:
                    tensorBoardWriter.add_scalar('Train/Loss', avg_loss_dict[args.training_key], epoch)

            # -------------------------------------------
            # Create and run a validation epoch
            # -------------------------------------------
            if validation_loader is not None:

                # ---------------------------------------------------
                # Construct holistic recorder for epoch
                # ---------------------------------------------------
                avg_loss_dict, output_dict = evaluation_module.run(model_and_loss=model_and_loss, epoch=epoch)

                # --------------------------------------------------------
                # Tensorboard X writing
                # --------------------------------------------------------
                if args.evaluation is False:
                    tensorBoardWriter.add_scalar('Val/Metric', avg_loss_dict[args.validation_key], epoch)

                # ----------------------------------------------------------------
                # Evaluate whether this is the best validation_loss
                # ----------------------------------------------------------------
                validation_loss = avg_loss_dict[args.validation_key]
                if args.validation_key_minimize:
                    store_as_best = validation_loss < best_validation_loss
                else:
                    store_as_best = validation_loss > best_validation_loss
                if store_as_best:
                    best_validation_loss = validation_loss

                # --------------------------------------------------------
                # Update standard learning scheduler
                # --------------------------------------------------------
                if lr_scheduler is not None:
                    lr_scheduler.step(epoch)

                # ----------------------------------------------------------------
                # Also show best loss on total_progress
                # ----------------------------------------------------------------
                total_progress_stats = {
                    "best_" + args.validation_key + "_avg": "%1.4f" % best_validation_loss
                }
                total_progress.set_postfix(total_progress_stats)

            # ----------------------------------------------------------------
            # Bump total progress
            # ----------------------------------------------------------------
            total_progress.update()
            print('')

            # ----------------------------------------------------------------
            # Store checkpoint
            # ----------------------------------------------------------------
            if checkpoint_saver is not None:
                checkpoint_saver.save_latest(
                    directory=args.save,
                    model_and_loss=model_and_loss,
                    stats_dict=dict(avg_loss_dict, epoch=epoch),
                    store_as_best=store_as_best)

            # ----------------------------------------------------------------
            # Vertical space between epochs
            # ----------------------------------------------------------------
            print(''), logging.logbook('')

    # ----------------------------------------------------------------
    # Finish
    # ----------------------------------------------------------------
    if args.evaluation is False:
        tensorBoardWriter.close()
    total_progress.close()
    logging.info("Finished.")
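# ----------------------------------------------------------------------------------------------
# Editor's sketch (not part of the original code): the variant above logs per-epoch scalars
# through a SummaryWriter opened once per run. The same pattern with torch.utils.tensorboard
# (tensorboardX exposes an equivalent interface); the log directory and tag names here are
# placeholders, not the project's actual paths.
# ----------------------------------------------------------------------------------------------
from torch.utils.tensorboard import SummaryWriter


def _log_epoch_scalars_sketch(writer, train_loss, val_metric, epoch):
    # `writer` is a SummaryWriter created once, e.g. SummaryWriter(log_dir=save_dir + "/writer").
    writer.add_scalar("Train/Loss", train_loss, epoch)   # one point per epoch
    writer.add_scalar("Val/Metric", val_metric, epoch)

# Usage (hypothetical values):
#   writer = SummaryWriter(log_dir="./runs/example")
#   _log_epoch_scalars_sketch(writer, train_loss=0.42, val_metric=1.37, epoch=1)
#   writer.close()  # flush events to disk at the end of the run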
def exec_runtime(args,
                 checkpoint_saver,
                 model_and_loss,
                 optimizer,
                 lr_scheduler,
                 param_scheduler,
                 train_loader,
                 validation_loader,
                 training_augmentation,
                 validation_augmentation,
                 visualizer):

    # --------------------------------------------------------------------------------
    # Validation schedulers are a bit special:
    # They need special treatment as they want to be called with a validation loss..
    # --------------------------------------------------------------------------------
    validation_scheduler = (lr_scheduler is not None and args.lr_scheduler == "ReduceLROnPlateau")

    # --------------------------------------------------------
    # Log some runtime info
    # --------------------------------------------------------
    with logging.block("Runtime", emph=True):
        logging.value("start_epoch: ", args.start_epoch)
        logging.value("total_epochs: ", args.total_epochs)

    # ---------------------------------------
    # Total progress bar arguments
    # ---------------------------------------
    progressbar_args = {
        "desc": "Total",
        "initial": args.start_epoch - 1,
        "invert_iterations": True,
        "iterable": range(1, args.total_epochs + 1),
        "logging_on_close": True,
        "logging_on_update": True,
        "unit": "ep",
        "track_eta": True
    }

    # --------------------------------------------------------
    # Total progress bar
    # --------------------------------------------------------
    print(''), logging.logbook('')
    total_progress = create_progressbar(**progressbar_args)
    total_progress_stats = {}
    print("\n")

    # --------------------------------------------------------
    # Remember validation losses
    # --------------------------------------------------------
    best_validation_losses = None
    store_as_best = None
    if validation_loader is not None:
        num_validation_losses = len(args.validation_keys)
        best_validation_losses = [
            float("inf") if args.validation_modes[i] == 'min' else -float("inf")
            for i in range(num_validation_losses)
        ]
        store_as_best = [False for _ in range(num_validation_losses)]

    # ----------------------------------------------------------------
    # Send Telegram message
    # ----------------------------------------------------------------
    logging.telegram(format_telegram_status_update(args, epoch=0))

    avg_loss_dict = {}
    for epoch in range(args.start_epoch, args.total_epochs + 1):

        # --------------------------------
        # Make "Epoch %i/%i" header message
        # --------------------------------
        epoch_header = "Epoch {}/{}{}{}".format(
            epoch, args.total_epochs, " " * 24,
            format_epoch_header_machine_stats(args))

        with logger.LoggingBlock(epoch_header, emph=True):

            # -------------------------------------------------------------------------------
            # Let TensorBoard know where we are..
            # -------------------------------------------------------------------------------
            summary.set_global_step(epoch)

            # -----------------------------------------------------------------
            # Update standard learning scheduler and get current learning rate
            # -----------------------------------------------------------------
            # Starting with PyTorch 1.1 the expected order is:
            #   optimize(...)
            #   validate(...)
            #   scheduler.step()

            # ---------------------------------------------------------------------
            # Update parameter schedule before the epoch
            # Note: Parameter schedulers are tuples of (optimizer, schedule)
            # ---------------------------------------------------------------------
            if param_scheduler is not None:
                param_scheduler.step(epoch=epoch)

            # -----------------------------------------------------------------
            # Get current learning rate from either optimizer or scheduler
            # -----------------------------------------------------------------
            lr = args.optimizer_lr if args.optimizer is not None else "None"
            if lr_scheduler is not None:
                lr = [group['lr'] for group in optimizer.param_groups] \
                    if args.optimizer is not None else "None"

            # --------------------------------------------------------
            # Current Epoch header stats
            # --------------------------------------------------------
            logging.info(format_epoch_header_stats(args, lr))

            # -------------------------------------------
            # Create and run a training epoch
            # -------------------------------------------
            if train_loader is not None:
                if visualizer is not None:
                    visualizer.on_epoch_init(lr, train=True, epoch=epoch, total_epochs=args.total_epochs)

                ema_loss_dict = RuntimeEpoch(
                    args,
                    desc="Train",
                    augmentation=training_augmentation,
                    loader=train_loader,
                    model_and_loss=model_and_loss,
                    optimizer=optimizer,
                    visualizer=visualizer).run(train=True)

                if visualizer is not None:
                    visualizer.on_epoch_finished(
                        ema_loss_dict, train=True, epoch=epoch, total_epochs=args.total_epochs)

            # -------------------------------------------
            # Create and run a validation epoch
            # -------------------------------------------
            if validation_loader is not None:
                if visualizer is not None:
                    visualizer.on_epoch_init(lr, train=False, epoch=epoch, total_epochs=args.total_epochs)

                # ---------------------------------------------------
                # Construct holistic recorder for epoch
                # ---------------------------------------------------
                epoch_recorder = configure_holistic_epoch_recorder(
                    args, epoch=epoch, loader=validation_loader)

                with torch.no_grad():
                    avg_loss_dict = RuntimeEpoch(
                        args,
                        desc="Valid",
                        augmentation=validation_augmentation,
                        loader=validation_loader,
                        model_and_loss=model_and_loss,
                        recorder=epoch_recorder,
                        visualizer=visualizer).run(train=False)

                    try:
                        epoch_recorder.add_scalars("evaluation_losses", avg_loss_dict)
                    except Exception:
                        pass

                if visualizer is not None:
                    visualizer.on_epoch_finished(
                        avg_loss_dict, train=False, epoch=epoch, total_epochs=args.total_epochs)

                # ----------------------------------------------------------------
                # Evaluate validation losses
                # ----------------------------------------------------------------
                validation_losses = [avg_loss_dict[vkey] for vkey in args.validation_keys]
                for i, (vkey, vmode) in enumerate(zip(args.validation_keys, args.validation_modes)):
                    if vmode == 'min':
                        store_as_best[i] = validation_losses[i] < best_validation_losses[i]
                    else:
                        store_as_best[i] = validation_losses[i] > best_validation_losses[i]
                    if store_as_best[i]:
                        best_validation_losses[i] = validation_losses[i]

                # ----------------------------------------------------------------
                # Update validation scheduler, if one is in place
                # We use the first key in validation keys as the relevant one
                # ----------------------------------------------------------------
                if lr_scheduler is not None:
                    if validation_scheduler:
                        lr_scheduler.step(validation_losses[0], epoch=epoch)
                    else:
                        lr_scheduler.step(epoch=epoch)

                # ----------------------------------------------------------------
                # Also show best loss on total_progress
                # ----------------------------------------------------------------
                total_progress_stats = {
                    "best_" + vkey + "_avg": "%1.4f" % best_validation_losses[i]
                    for i, vkey in enumerate(args.validation_keys)
                }
                total_progress.set_postfix(total_progress_stats)

            # ----------------------------------------------------------------
            # Bump total progress
            # ----------------------------------------------------------------
            total_progress.update()
            print('')

            # ----------------------------------------------------------------
            # Get ETA string for display in loggers
            # ----------------------------------------------------------------
            eta_str = total_progress.eta_str()

            # ----------------------------------------------------------------
            # Send Telegram status update
            # ----------------------------------------------------------------
            total_progress_stats['lr'] = format_learning_rate(lr)
            logging.telegram(
                format_telegram_status_update(
                    args,
                    eta_str=eta_str,
                    epoch=epoch,
                    total_progress_stats=total_progress_stats))

            # ----------------------------------------------------------------
            # Update ETA in progress title
            # ----------------------------------------------------------------
            eta_proctitle = "{} finishes in {}".format(args.proctitle, eta_str)
            proctitles.setproctitle(eta_proctitle)

            # ----------------------------------------------------------------
            # Store checkpoint
            # ----------------------------------------------------------------
            if checkpoint_saver is not None and validation_loader is not None:
                checkpoint_saver.save_latest(
                    directory=args.save,
                    model_and_loss=model_and_loss,
                    stats_dict=dict(avg_loss_dict, epoch=epoch),
                    store_as_best=store_as_best,
                    store_prefixes=args.validation_keys)

            # ----------------------------------------------------------------
            # Vertical space between epochs
            # ----------------------------------------------------------------
            print(''), logging.logbook('')

    # ----------------------------------------------------------------
    # Finish up
    # ----------------------------------------------------------------
    logging.telegram_flush()
    total_progress.close()
    logging.info("Finished.")
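# ----------------------------------------------------------------------------------------------
# Editor's sketch (not part of the original code): the variant above tracks one "best so far"
# value per validation key with a per-key 'min'/'max' mode. The same bookkeeping as a small
# standalone helper; names are illustrative only.
# ----------------------------------------------------------------------------------------------
def _update_best_losses_sketch(validation_losses, best_losses, modes):
    # Returns (new_best_losses, store_as_best), where store_as_best[i] says whether key i improved.
    store_as_best = []
    new_best = list(best_losses)
    for i, (value, mode) in enumerate(zip(validation_losses, modes)):
        improved = value < new_best[i] if mode == "min" else value > new_best[i]
        store_as_best.append(improved)
        if improved:
            new_best[i] = value
    return new_best, store_as_best

# Usage (hypothetical values for two keys tracked with 'min' and 'max' modes):
#   best, flags = _update_best_losses_sketch([1.2, 0.8], [float("inf"), -float("inf")], ["min", "max"])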
def exec_runtime(args,
                 device,
                 checkpoint_saver,
                 model_and_loss,
                 optimizer,
                 attack,
                 lr_scheduler,
                 train_loader,
                 validation_loader,
                 inference_loader,
                 training_augmentation,
                 validation_augmentation):

    # ----------------------------------------------------------------------------------------------
    # Validation schedulers are a bit special:
    # They want to be called with a validation loss..
    # ----------------------------------------------------------------------------------------------
    validation_scheduler = (lr_scheduler is not None and args.lr_scheduler == "ReduceLROnPlateau")

    # --------------------------------------------------------
    # Log some runtime info
    # --------------------------------------------------------
    with logger.LoggingBlock("Runtime", emph=True):
        logging.info("start_epoch: %i" % args.start_epoch)
        logging.info("total_epochs: %i" % args.total_epochs)

    # ---------------------------------------
    # Total progress bar arguments
    # ---------------------------------------
    progressbar_args = {
        "desc": "Progress",
        "initial": args.start_epoch - 1,
        "invert_iterations": True,
        "iterable": range(1, args.total_epochs + 1),
        "logging_on_close": True,
        "logging_on_update": True,
        "postfix": False,
        "unit": "ep"
    }

    # --------------------------------------------------------
    # Total progress bar
    # --------------------------------------------------------
    print(''), logging.logbook('')
    total_progress = create_progressbar(**progressbar_args)
    print("\n")

    # --------------------------------------------------------
    # Remember validation losses
    # --------------------------------------------------------
    num_validation_losses = len(args.validation_keys)
    best_validation_losses = [
        float("inf") if args.validation_keys_minimize[i] else -float("inf")
        for i in range(num_validation_losses)
    ]
    store_as_best = [False for i in range(num_validation_losses)]

    # --------------------------------------------------------
    # Transfer model to device once before training/evaluation
    # --------------------------------------------------------
    model_and_loss = model_and_loss.to(device)

    avg_loss_dict = {}
    for epoch in range(args.start_epoch, args.total_epochs + 1):
        with logger.LoggingBlock("Epoch %i/%i" % (epoch, args.total_epochs), emph=True):

            # --------------------------------------------------------
            # Update standard learning scheduler
            # --------------------------------------------------------
            if lr_scheduler is not None and not validation_scheduler:
                lr_scheduler.step(epoch)

            # --------------------------------------------------------
            # Always report learning rate and model
            # --------------------------------------------------------
            if lr_scheduler is None:
                logging.info(
                    "model: %s lr: %s" % (args.model, format_learning_rate(args.optimizer_lr)))
            else:
                logging.info(
                    "model: %s lr: %s" % (args.model, format_learning_rate(lr_scheduler.get_lr())))

            # -------------------------------------------
            # Create and run a training epoch
            # -------------------------------------------
            if train_loader is not None:
                avg_loss_dict, _ = TrainingEpoch(
                    args,
                    desc=" Train",
                    device=device,
                    model_and_loss=model_and_loss,
                    optimizer=optimizer,
                    loader=train_loader,
                    augmentation=training_augmentation).run()

            # -------------------------------------------
            # Create and run a validation epoch
            # -------------------------------------------
            if validation_loader is not None:

                # ---------------------------------------------------
                # Construct holistic recorder for epoch
                # ---------------------------------------------------
                epoch_recorder = configure_holistic_epoch_recorder(
                    args, epoch=epoch, loader=validation_loader)

                with torch.no_grad():
                    avg_loss_dict, output_dict = EvaluationEpoch(
                        args,
                        desc="Validate",
                        device=device,
                        model_and_loss=model_and_loss,
                        attack=attack,
                        loader=validation_loader,
                        recorder=epoch_recorder,
                        augmentation=validation_augmentation).run()

                # ----------------------------------------------------------------
                # Evaluate validation losses
                # ----------------------------------------------------------------
                validation_losses = [avg_loss_dict[vkey] for vkey in args.validation_keys]
                for i, (vkey, vminimize) in enumerate(
                        zip(args.validation_keys, args.validation_keys_minimize)):
                    if vminimize:
                        store_as_best[i] = validation_losses[i] < best_validation_losses[i]
                    else:
                        store_as_best[i] = validation_losses[i] > best_validation_losses[i]
                    if store_as_best[i]:
                        best_validation_losses[i] = validation_losses[i]

                # ----------------------------------------------------------------
                # Update validation scheduler, if one is in place
                # We use the first key in validation keys as the relevant one
                # ----------------------------------------------------------------
                if lr_scheduler is not None and validation_scheduler:
                    lr_scheduler.step(validation_losses[0], epoch=epoch)

                # ----------------------------------------------------------------
                # Also show best loss on total_progress
                # ----------------------------------------------------------------
                total_progress_stats = {
                    "best_" + vkey + "_avg": "%1.4f" % best_validation_losses[i]
                    for i, vkey in enumerate(args.validation_keys)
                }
                total_progress.set_postfix(total_progress_stats)

            # ----------------------------------------------------------------
            # Bump total progress
            # ----------------------------------------------------------------
            total_progress.update()
            print('')

            # ----------------------------------------------------------------
            # Store checkpoint
            # ----------------------------------------------------------------
            if checkpoint_saver is not None:
                checkpoint_saver.save_latest(
                    directory=args.save,
                    model_and_loss=model_and_loss,
                    stats_dict=dict(avg_loss_dict, epoch=epoch),
                    store_as_best=store_as_best,
                    store_prefixes=args.validation_keys)

            # ----------------------------------------------------------------
            # Vertical space between epochs
            # ----------------------------------------------------------------
            print(''), logging.logbook('')

    # ----------------------------------------------------------------
    # Finish
    # ----------------------------------------------------------------
    total_progress.close()
    logging.info("Finished.")
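# ----------------------------------------------------------------------------------------------
# Editor's sketch (assumption): the CheckpointSaver used above is project-specific and not shown
# here. A rough illustration of the "latest + best-per-prefix" saving idea with plain torch.save;
# file names, layout, and the `model.state_dict()` payload are hypothetical, not the project's
# actual checkpoint format.
# ----------------------------------------------------------------------------------------------
import os
import shutil

import torch


def _save_latest_sketch(directory, model, stats_dict, store_as_best, store_prefixes):
    os.makedirs(directory, exist_ok=True)
    latest_path = os.path.join(directory, "checkpoint_latest.ckpt")
    torch.save({"state_dict": model.state_dict(), "stats": stats_dict}, latest_path)
    # For every validation key that improved this epoch, keep a separate copy of the latest checkpoint.
    for improved, prefix in zip(store_as_best, store_prefixes):
        if improved:
            shutil.copyfile(latest_path, os.path.join(directory, "checkpoint_best_%s.ckpt" % prefix))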