def run(self, train_set, valid_set, epochs: int, batch_size: int,
        num_workers: int = 0, device: str = 'cuda', **kwargs):  # pylint: disable=unused-argument
    """Train & evaluate the model, keeping the best checkpoint by validation loss.

    Args:
        train_set: training dataset (``torch.utils.data.Dataset``).
        valid_set: validation dataset (``torch.utils.data.Dataset``).
        epochs: number of training epochs (must be >= 1).
        batch_size: mini-batch size for both loaders.
        num_workers: dataloader worker processes.
        device: 'cpu' or a 'cuda' device string.
        **kwargs: optional extras — 'logger' (logging.Logger), 'save_every'
            (bool; checkpoint every epoch), 'test_set' (dataset to test on
            after training).
    """
    assert isinstance(train_set, torch.utils.data.Dataset)
    assert isinstance(valid_set, torch.utils.data.Dataset)
    assert isinstance(epochs, int)
    assert isinstance(batch_size, int)
    assert isinstance(num_workers, int)
    assert device.startswith('cuda') or device == 'cpu'
    # Guard: with epochs < 1 the loop below never runs and `epoch` /
    # `epoch_history` would be undefined at the final save_checkpoint.
    assert epochs >= 1

    logger = kwargs.get('logger', None)

    self.backbone = self.backbone.to(device)
    self.projector = self.projector.to(device)

    train_loader = get_dataloader(train_set, batch_size, num_workers=num_workers)
    valid_loader = get_dataloader(valid_set, batch_size, num_workers=num_workers)

    with tqdm.tqdm(**get_tqdm_config(total=epochs, leave=True, color='blue')) as pbar:

        best_valid_loss = float('inf')
        best_epoch = 0

        for epoch in range(1, epochs + 1):

            # 0. Train & evaluate
            train_history = self.train(train_loader, device=device)
            valid_history = self.evaluate(valid_loader, device=device)

            # 1. Epoch history (loss)
            epoch_history = {
                'loss': {
                    'train': train_history.get('loss'),
                    'valid': valid_history.get('loss'),
                }
            }

            # 2. Epoch history (other metrics if provided).
            # Mirrors the classifier variant of `run`: each metric name is
            # expected in both train & valid histories.
            if self.metrics is not None:
                if not isinstance(self.metrics, dict):
                    raise NotImplementedError
                for metric_name in self.metrics:
                    epoch_history[metric_name] = {
                        'train': train_history[metric_name],
                        'valid': valid_history[metric_name],
                    }

            # 3. TensorBoard
            if self.writer is not None:
                for metric_name, metric_dict in epoch_history.items():
                    self.writer.add_scalars(
                        main_tag=metric_name,
                        tag_scalar_dict=metric_dict,
                        global_step=epoch
                    )
                if self.scheduler is not None:
                    self.writer.add_scalar(
                        tag='lr',
                        scalar_value=self.scheduler.get_last_lr()[0],
                        global_step=epoch
                    )

            # 4. Save model if it is the current best
            valid_loss = epoch_history['loss']['valid']
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                best_epoch = epoch
                self.save_checkpoint(self.best_ckpt, epoch=epoch, **epoch_history)
            if kwargs.get('save_every', False):
                new_ckpt = os.path.join(
                    self.checkpoint_dir,
                    f'epoch_{epoch:04d}.loss_{valid_loss:.4f}.pt'
                )
                self.save_checkpoint(new_ckpt, epoch=epoch, **epoch_history)

            # 5. Update learning rate scheduler
            if self.scheduler is not None:
                self.scheduler.step()

            # 6. Logging
            desc = make_epoch_description(
                history=epoch_history,
                current=epoch,
                total=epochs,
                best=best_epoch
            )
            pbar.set_description_str(desc)
            pbar.update(1)
            if logger is not None:
                logger.info(desc)

    # 7. Save last model
    self.save_checkpoint(self.last_ckpt, epoch=epoch, **epoch_history)

    # 8. Test model (optional)
    if 'test_set' in kwargs:
        test_loader = get_dataloader(kwargs.get('test_set'),
                                     batch_size=batch_size,
                                     num_workers=num_workers)
        self.test(test_loader, device=device, logger=logger)
def run(self, train_set, valid_set, epochs: int, batch_size: int,
        num_workers: int = 0, device: str = 'cuda', **kwargs):
    """Train, evaluate and optionally test.

    Selects the best checkpoint by ``kwargs['eval_metric']`` (default 'loss';
    loss is minimized, all other supported metrics are maximized).

    Args:
        train_set: training dataset (``torch.utils.data.Dataset``).
        valid_set: validation dataset (``torch.utils.data.Dataset``).
        epochs: number of training epochs (must be >= 1).
        batch_size: mini-batch size for both loaders.
        num_workers: dataloader worker processes.
        device: 'cpu' or a 'cuda' device string.
        **kwargs: optional extras — 'logger', 'disable_mixup' (bool),
            'balance' (bool; class-balanced sampling instead of shuffling),
            'eval_metric' (str), 'test_set' (dataset to test on afterwards).

    Raises:
        NotImplementedError: if ``eval_metric`` is not supported.
    """
    assert isinstance(train_set, torch.utils.data.Dataset)
    assert isinstance(valid_set, torch.utils.data.Dataset)
    assert isinstance(epochs, int)
    assert isinstance(batch_size, int)
    assert isinstance(num_workers, int)
    assert device.startswith('cuda') or device == 'cpu'
    # Guard: with epochs < 1 the loop below never runs and `epoch` /
    # `epoch_history` would be undefined at the final save_checkpoint.
    assert epochs >= 1

    logger = kwargs.get('logger', None)
    disable_mixup = kwargs.get('disable_mixup', False)

    self.backbone = self.backbone.to(device)
    self.classifier = self.classifier.to(device)

    # Class-balanced sampling is mutually exclusive with plain shuffling.
    balance = kwargs.get('balance', False)
    if logger is not None:
        logger.info(f"Class balance: {balance}")
    shuffle = not balance

    train_loader = get_dataloader(train_set, batch_size,
                                  num_workers=num_workers,
                                  shuffle=shuffle,
                                  balance=balance)
    valid_loader = get_dataloader(valid_set, batch_size,
                                  num_workers=num_workers,
                                  balance=False)

    with tqdm.tqdm(**get_tqdm_config(total=epochs, leave=True, color='blue')) as pbar:

        # Determine model selection metric. Defaults to 'loss'.
        # NOTE: one shared tuple is used both here and in the best-model
        # check below, so every metric accepted here is also handled there
        # (previously 'precision'/'recall' passed validation but crashed
        # with NotImplementedError at checkpoint time).
        eval_metric = kwargs.get('eval_metric', 'loss')
        maximize_metrics = ('accuracy', 'precision', 'recall', 'f1', 'auroc', 'auprc')
        if eval_metric == 'loss':
            best_metric_val = float('inf')
        elif eval_metric in maximize_metrics:
            best_metric_val = 0
        else:
            raise NotImplementedError

        best_epoch = 0
        for epoch in range(1, epochs + 1):

            # 0. Train & evaluate
            if disable_mixup:
                train_history = self.train(train_loader, device)
            else:
                train_history = self.train_with_mixup(train_loader, device)
            valid_history = self.evaluate(valid_loader, device)

            # 1. Epoch history (loss)
            epoch_history = {
                'loss': {
                    'train': train_history.get('loss'),
                    'valid': valid_history.get('loss')
                }
            }

            # 2. Epoch history (other metrics if provided)
            if isinstance(self.metrics, dict):
                for metric_name in self.metrics:
                    epoch_history[metric_name] = {
                        'train': train_history[metric_name],
                        'valid': valid_history[metric_name],
                    }

            # 3. TensorBoard
            if self.writer is not None:
                for metric_name, metric_dict in epoch_history.items():
                    self.writer.add_scalars(main_tag=metric_name,
                                            tag_scalar_dict=metric_dict,
                                            global_step=epoch)
                if self.scheduler is not None:
                    self.writer.add_scalar(
                        tag='lr',
                        scalar_value=self.scheduler.get_last_lr()[0],
                        global_step=epoch)

            # 4. Save model if it is the current best
            # (loss improves downward; every other supported metric upward).
            metric_val = epoch_history[eval_metric]['valid']
            if eval_metric == 'loss':
                improved = metric_val <= best_metric_val
            else:
                improved = metric_val >= best_metric_val
            if improved:
                best_metric_val = metric_val
                best_epoch = epoch
                self.save_checkpoint(self.best_ckpt, epoch=epoch, **epoch_history)

            # 5. Update learning rate scheduler (optional)
            if self.scheduler is not None:
                self.scheduler.step()

            # 6. Logging
            desc = make_epoch_description(
                history=epoch_history,
                current=epoch,
                total=epochs,
                best=best_epoch,
            )
            pbar.set_description_str(desc)
            pbar.update(1)
            if logger is not None:
                logger.info(desc)

    # 7. Save last model
    self.save_checkpoint(self.last_ckpt, epoch=epoch, **epoch_history)

    # 8. Test model (optional)
    if 'test_set' in kwargs:
        test_loader = get_dataloader(kwargs.get('test_set'), batch_size,
                                     num_workers=num_workers)
        self.test(test_loader, device=device, logger=logger)