def _test(n_epochs, metric_device):
    n_iters = 100
    s = 16
    n_classes = 10

    offset = n_iters * s
    y_true = torch.randint(0, n_classes, size=(offset * idist.get_world_size(),)).to(device)
    y_preds = torch.rand(offset * idist.get_world_size(), n_classes).to(device)

    def update(engine, i):
        return (
            y_preds[i * s + rank * offset : (i + 1) * s + rank * offset, :],
            y_true[i * s + rank * offset : (i + 1) * s + rank * offset],
        )

    engine = Engine(update)

    k = 5
    acc = TopKCategoricalAccuracy(k=k, device=metric_device)
    acc.attach(engine, "acc")

    data = list(range(n_iters))
    engine.run(data=data, max_epochs=n_epochs)

    assert "acc" in engine.state.metrics
    res = engine.state.metrics["acc"]
    if isinstance(res, torch.Tensor):
        res = res.cpu().numpy()

    true_res = top_k_accuracy(y_true.cpu().numpy(), y_preds.cpu().numpy(), k=k)
    assert pytest.approx(res) == true_res
def test_zero_div():
    acc = TopKCategoricalAccuracy(2)
    with pytest.raises(
        NotComputableError,
        match=r"TopKCategoricalAccuracy must have at least one example before it can be computed",
    ):
        acc.compute()
def test_compute():
    acc = TopKCategoricalAccuracy(2)

    y_pred = torch.FloatTensor([[0.2, 0.4, 0.6, 0.8], [0.8, 0.6, 0.4, 0.2]])
    y = torch.ones(2).type(torch.LongTensor)
    acc.update((y_pred, y))
    assert acc.compute() == 0.5

    acc.reset()
    y_pred = torch.FloatTensor([[0.4, 0.8, 0.2, 0.6], [0.8, 0.6, 0.4, 0.2]])
    y = torch.ones(2).type(torch.LongTensor)
    acc.update((y_pred, y))
    assert acc.compute() == 1.0
def create_supervised_classification_trainer(model, loss_fn, optimizer, val_loader,
                                             learning_rate_scheduler, callback=None,
                                             use_cuda=None):
    """Create a trainer (and evaluator) for supervised classification.

    :param model: model to train
    :param loss_fn: loss function
    :param optimizer: optimizer
    :param val_loader: data loader used for validation
    :param learning_rate_scheduler: optional scheduler stepped at each epoch start
    :param callback: optional handler called after every iteration
    :param use_cuda: whether to run on CUDA if available
    :return: a (trainer, evaluator) tuple
    """
    if use_cuda and not torch.cuda.is_available():
        raise RuntimeError('Trying to run using cuda, while cuda is not available')

    if use_cuda and torch.cuda.is_available():
        device = torch.device('cuda:0')
        torch.backends.cudnn.benchmark = True
        if torch.cuda.device_count() > 1 and not isinstance(model, nn.DataParallel):
            model = nn.DataParallel(model)
            print("Using {} gpus for training".format(torch.cuda.device_count()))
    else:
        device = torch.device('cpu')

    trainer = create_trainer(model=model,
                             optimizer=optimizer,
                             loss_fn=loss_fn,
                             metrics={
                                 'top_1_accuracy': CategoricalAccuracy(),
                                 'top_5_accuracy': TopKCategoricalAccuracy(),
                                 'loss': Loss(loss_fn),
                             },
                             device=device)
    evaluator = create_supervised_classification_evaluator(model, loss_fn, use_cuda)

    if learning_rate_scheduler:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  lambda _: learning_rate_scheduler.step())

    if callback is not None:
        trainer.add_event_handler(Events.ITERATION_COMPLETED, callback, model)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results, optimizer)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, run_evaluation, evaluator, val_loader)

    return trainer, evaluator
def create_supervised_classification_evaluator(model, loss_fn, use_cuda):
    """Create an evaluator for supervised classification.

    :param model: model to evaluate
    :param loss_fn: loss function
    :param use_cuda: whether to run on CUDA if available
    :return: an evaluator engine
    """
    if use_cuda and torch.cuda.is_available():
        device = torch.device('cuda:0')
        # multiple GPUs, we can remove this as well
        torch.backends.cudnn.benchmark = True
        if torch.cuda.device_count() > 1 and not isinstance(model, nn.DataParallel):
            model = nn.DataParallel(model)
            logger.info("Using %d gpus for training", torch.cuda.device_count())
    else:
        device = torch.device('cpu')

    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'top_1_accuracy': CategoricalAccuracy(),
                                                'top_5_accuracy': TopKCategoricalAccuracy(),
                                                'loss': Loss(loss_fn)
                                            },
                                            device=device)
    return evaluator
def create_classification_evaluator(model, device, non_blocking=True):
    from ignite.metrics import Accuracy, TopKCategoricalAccuracy
    from ignite.engine import create_supervised_evaluator

    def _prepare_batch(batch, device, non_blocking):
        image = batch['image'].to(device, non_blocking=non_blocking)
        label = batch['label'].to(device, non_blocking=non_blocking)
        return image, label

    metrics = {
        'accuracy': Accuracy(),
        'top5': TopKCategoricalAccuracy(k=5),
    }
    evaluator = create_supervised_evaluator(model,
                                            metrics,
                                            device,
                                            non_blocking=non_blocking,
                                            prepare_batch=_prepare_batch)
    return evaluator
def metrics_selector(mode, loss):
    mode = mode.lower()
    if mode == "classification":
        metrics = {
            "loss": loss,
            "accuracy": Accuracy(),
            "accuracy_topk": TopKCategoricalAccuracy(),
            "precision": Precision(average=True),
            "recall": Recall(average=True)
        }
    elif mode == "multiclass-multilabel":
        metrics = {
            "loss": loss,
            "accuracy": Accuracy(),
        }
    elif mode == "regression":
        metrics = {
            "loss": loss,
            "mse": MeanSquaredError(),
            "mae": MeanAbsoluteError()
        }
    else:
        raise RuntimeError(
            "Invalid task mode, select classification, multiclass-multilabel or regression")
    return metrics
def accuracy_metrics(ks: Iterable[int],
                     output_transform=lambda x: x,
                     prefix="") -> Dict[str, Metric]:
    return {
        f"{prefix}accuracy@{k}": TopKCategoricalAccuracy(k=k, output_transform=output_transform)
        for k in ks
    }
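# Usage sketch (not from the original sources): one way to attach the metrics returned by
# accuracy_metrics() to an ignite evaluator. The function name, loader, and device argument
# below are illustrative assumptions.
from ignite.engine import create_supervised_evaluator


def example_attach_topk_metrics(model, val_loader, device="cpu"):
    # build {"val_accuracy@1": ..., "val_accuracy@5": ...} and attach them via the evaluator
    metrics = accuracy_metrics(ks=(1, 5), prefix="val_")
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    state = evaluator.run(val_loader)
    # e.g. state.metrics["val_accuracy@5"] then holds the top-5 accuracy
    return state.metrics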
def __init__(self, prefix, loss_type: str, threshold=0.5, top_k=[1, 5, 10],
             n_classes: int = None, multilabel: bool = None,
             metrics=["precision", "recall", "top_k", "accuracy"]):
    super().__init__()
    self.loss_type = loss_type.upper()
    self.threshold = threshold
    self.n_classes = n_classes
    self.multilabel = multilabel
    self.top_ks = top_k
    self.prefix = prefix

    self.metrics = {}
    for metric in metrics:
        if "precision" == metric:
            self.metrics[metric] = Precision(average=True, is_multilabel=multilabel)
        elif "recall" == metric:
            self.metrics[metric] = Recall(average=True, is_multilabel=multilabel)
        elif "top_k" in metric:
            if n_classes:
                top_k = [k for k in top_k if k < n_classes]
            if multilabel:
                self.metrics[metric] = TopKMultilabelAccuracy(k_s=top_k)
            else:
                self.metrics[metric] = TopKCategoricalAccuracy(
                    k=max(int(np.log(n_classes)), 1), output_transform=None)
        elif "macro_f1" in metric:
            self.metrics[metric] = F1(num_classes=n_classes, average="macro",
                                      multilabel=multilabel)
        elif "micro_f1" in metric:
            self.metrics[metric] = F1(num_classes=n_classes, average="micro",
                                      multilabel=multilabel)
        elif "mse" == metric:
            self.metrics[metric] = MeanSquaredError()
        elif "auroc" == metric:
            self.metrics[metric] = AUROC(num_classes=n_classes)
        elif "avg_precision" in metric:
            self.metrics[metric] = AveragePrecision(num_classes=n_classes)
        elif "accuracy" in metric:
            self.metrics[metric] = Accuracy(
                top_k=int(metric.split("@")[-1]) if "@" in metric else None)
        elif "ogbn" in metric:
            self.metrics[metric] = OGBNodeClfMetrics(NodeEvaluator(metric))
        elif "ogbg" in metric:
            self.metrics[metric] = OGBNodeClfMetrics(GraphEvaluator(metric))
        elif "ogbl" in metric:
            self.metrics[metric] = OGBLinkPredMetrics(LinkEvaluator(metric))
        else:
            print(f"WARNING: metric {metric} doesn't exist")

        # Needed to add the PyTorch Geometric methods as Modules, so they'll be on the
        # correct CUDA device during training
        if isinstance(self.metrics[metric], torchmetrics.metric.Metric):
            setattr(self, metric, self.metrics[metric])

    self.reset_metrics()
def _test_distrib_accumulator_device(device):
    metric_devices = [torch.device("cpu")]
    if device.type != "xla":
        metric_devices.append(idist.device())
    for metric_device in metric_devices:
        acc = TopKCategoricalAccuracy(2, device=metric_device)
        assert acc._device == metric_device
        assert acc._num_correct.device == metric_device, "{}:{} vs {}:{}".format(
            type(acc._num_correct.device), acc._num_correct.device,
            type(metric_device), metric_device)

        y_pred = torch.tensor([[0.2, 0.4, 0.6, 0.8], [0.8, 0.6, 0.4, 0.2]])
        y = torch.ones(2).long()
        acc.update((y_pred, y))

        assert acc._num_correct.device == metric_device, "{}:{} vs {}:{}".format(
            type(acc._num_correct.device), acc._num_correct.device,
            type(metric_device), metric_device)
def _create_evaluator_engine(self):
    """Create an evaluator engine with accuracy, loss, recall and top-k accuracy metrics."""
    return create_hog_gcn_evaluator(
        self.model,
        device=self.device,
        metrics={
            "Accuracy": Accuracy(),
            "Loss": Loss(self.loss),
            "Recall": Recall(average=True),
            "Top K Categorical Accuracy": TopKCategoricalAccuracy(k=10),
        },
    )
def run(self, logging_dir=None, best_model_only=True):
    # assert self.model is not None, '[ERROR] No model object loaded. Please load a PyTorch model torch.nn object into the class object.'
    # assert (self.train_loader is not None) or (self.val_loader is not None), '[ERROR] You must specify data loaders.'
    for key in self.trainer_status.keys():
        assert self.trainer_status[key], \
            '[ERROR] The {} has not been generated and you cannot proceed.'.format(key)
    print('[INFO] Trainer pass OK for training.')

    # TRAIN ENGINE
    # Create the objects for training
    self.train_engine = self.create_trainer()

    # METRICS AND EVALUATION
    # Metrics - running average
    RunningAverage(output_transform=lambda x: x).attach(self.train_engine, 'loss')
    # Metrics - epochs
    metrics = {
        'accuracy': Accuracy(),
        'recall': Recall(average=True),
        'precision': Precision(average=True),
        'f1': Fbeta(beta=1),
        'topKCatAcc': TopKCategoricalAccuracy(k=5),
        'loss': Loss(self.criterion)
    }

    # Create evaluators
    self.evaluator = self.create_evaluator(metrics=metrics)
    self.train_evaluator = self.create_evaluator(metrics=metrics, tag='train')

    # LOGGING
    # Create logging to terminal
    self.add_logging()
    # Create Tensorboard logging
    self.add_tensorboard_logging(logging_dir=logging_dir)

    ## CALLBACKS
    self.create_callbacks(best_model_only=best_model_only)

    ## TRAIN
    # Train the model
    print('[INFO] Executing model training...')
    self.train_engine.run(self.train_loader, max_epochs=self.config.TRAIN.NUM_EPOCHS)
    print('[INFO] Model training is complete.')
def _create_evaluator_engine(self):
    """Create an evaluator engine with accuracy, loss, recall and top-k accuracy metrics."""
    return create_classification_gcn_evaluator(
        self.model,
        self.dataset.classes_dataframe,
        device=self.device,
        processes=self.processes,
        metrics={
            "Accuracy": Accuracy(),
            "Loss": Loss(self.loss),
            "Recall": Recall(average=True),
            "Top K Categorical Accuracy": TopKCategoricalAccuracy(k=10),
        },
    )
def test_zero_div():
    acc = TopKCategoricalAccuracy(2)
    with pytest.raises(NotComputableError):
        acc.compute()
def test_compute():
    acc = TopKCategoricalAccuracy(2)

    y_pred = torch.FloatTensor([[0.2, 0.4, 0.6, 0.8], [0.8, 0.6, 0.4, 0.2]])
    y = torch.ones(2).long()
    acc.update((y_pred, y))
    assert isinstance(acc.compute(), float)
    assert acc.compute() == 0.5

    acc.reset()
    y_pred = torch.FloatTensor([[0.4, 0.8, 0.2, 0.6], [0.8, 0.6, 0.4, 0.2]])
    y = torch.ones(2).long()
    acc.update((y_pred, y))
    assert isinstance(acc.compute(), float)
    assert acc.compute() == 1.0
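# Minimal lifecycle sketch (added for illustration, not taken from the original tests):
# TopKCategoricalAccuracy accumulates over successive update() calls and is cleared with
# reset(). The model/batches names below are illustrative assumptions.
import torch
from ignite.metrics import TopKCategoricalAccuracy


def example_manual_topk_loop(model, batches, k=5):
    metric = TopKCategoricalAccuracy(k=k)
    metric.reset()
    with torch.no_grad():
        for x, y in batches:
            y_pred = model(x)            # expected shape: (batch_size, n_classes)
            metric.update((y_pred, y))   # y: class indices, shape (batch_size,)
    # fraction of samples whose true class is among the k highest-scoring predictions
    return metric.compute()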
engine.state.iteration = resume_epoch * len(engine.state.dataloader)
engine.state.epoch = resume_epoch
print('The iteration count is')
print(engine.state.iteration)
print('The current epoch is')
print(engine.state.epoch)

# trainer.add_event_handler(Events.STARTED, resume_training)

metrics = {
    'Loss': Loss(criterion),
    'Accuracy': Accuracy(),
    'Precision': Precision(average=True),
    'Recall': Recall(average=True),
    'Top-5 Accuracy': TopKCategoricalAccuracy(k=5)
}

evaluator = create_supervised_evaluator(model,
                                        metrics=metrics,
                                        device=device,
                                        prepare_batch=zca_prepare_batch,
                                        non_blocking=True)
train_evaluator = create_supervised_evaluator(model,
                                              metrics=metrics,
                                              device=device,
                                              prepare_batch=zca_prepare_batch,
                                              non_blocking=True)

cpe = CustomPeriodicEvent(n_epochs=3)
cpe.attach(trainer)
def evaluate(net, test_dataloader):
    with torch.no_grad():
        net.eval()
        preds_all = torch.empty((len(test_dataloader), 256))
        top_1 = TopKCategoricalAccuracy(k=1)
        top_5 = TopKCategoricalAccuracy(k=5)
        top_10 = TopKCategoricalAccuracy(k=10)
        for i, data in enumerate(test_dataloader):
            lidar, beams = data
            lidar = lidar.cuda()
            beams = beams.cuda()

            preds = net(lidar)
            preds = F.softmax(preds, dim=1)
            preds_all[i, :] = preds

            top_1.update((preds, torch.argmax(beams)))
            top_5.update((preds, torch.argmax(beams)))
            top_10.update((preds, torch.argmax(beams)))

        net.train()
        print("Top-1: {:.4f} Top-5: {:.4f} Top-10: {:.4f}".format(
            top_1.compute(), top_5.compute(), top_10.compute()))
    return preds_all
def run(args, use_gpu=True):
    # saving
    save_path = os.path.join(os.getcwd(), 'models')
    if not os.path.isdir(save_path):
        os.mkdir(save_path)

    model = lipnext(inputDim=256, hiddenDim=512, nClasses=args.nClasses,
                    frameLen=29, alpha=args.alpha)
    model = reload_model(model, args.path).to(device)

    dset_loaders, dset_sizes = data_loader(args)
    train_loader = dset_loaders['train']
    val_loader = dset_loaders['test']
    train_size = dset_sizes['train']
    val_size = dset_sizes['val']

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.)
    scheduler = AdjustLR(optimizer, [args.lr], sleep_epochs=5, half=5, verbose=1)

    # TQDM
    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))

    # Ignite trainer
    trainer = create_supervised_trainer(model, optimizer, F.cross_entropy,
                                        device=device,
                                        prepare_batch=prepare_train_batch)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'cross_entropy': Loss(F.cross_entropy),
                                                'top-3': TopKCategoricalAccuracy(3)
                                            },
                                            device=device,
                                            prepare_batch=prepare_val_batch)

    # call backs
    @evaluator.on(Events.EPOCH_STARTED)
    def start_val(engine):
        tqdm.write("Evaluation in progress")

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % args.interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(args.interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['cross_entropy']
        top_acc = metrics['top-3']
        tqdm.write(
            "Training Results - Epoch: {} Avg accuracy: {:.2f}, Top3: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, top_acc, avg_loss))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        # large dataset so saving often
        tqdm.write("saving model ..")
        torch.save(model.state_dict(),
                   os.path.join(save_path, 'epoch' + str(engine.state.epoch + 1) + '.pt'))
        # saving to ONNX format
        dummy_input = torch.randn(args.batch_size, 1, 29, 88, 88)
        torch.onnx.export(model, dummy_input, "lipnext.onnx")

        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        top_acc = metrics['top-3']
        avg_loss = metrics['cross_entropy']
        tqdm.write(
            "Validation Results - Epoch: {} Avg accuracy: {:.2f}, Top3: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, top_acc, avg_loss))
        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED)
    def update_lr(engine):
        scheduler.step(engine.state.epoch)

    trainer.run(train_loader, max_epochs=args.epochs)
    pbar.close()
def training(config, local_rank=None, with_mlflow_logging=False, with_plx_logging=False):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = 'cuda'

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert train_sampler is not None, "Train loader of type '{}' " \
        "should have attribute 'sampler'".format(type(train_loader))
    assert hasattr(train_sampler, 'set_epoch') and callable(train_sampler.set_epoch), \
        "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(model, optimizer,
                                      opt_level=getattr(config, "fp16_opt_level", "O2"),
                                      num_losses=1)
    model = DDP(model, delay_allreduce=True)

    criterion = config.criterion.to(device)

    prepare_batch = getattr(config, "prepare_batch", _prepare_batch)
    non_blocking = getattr(config, "non_blocking", True)

    # Setup trainer
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    def train_update_function(engine, batch):
        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y) / accumulation_steps

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return {
            'supervised batch loss': loss.item(),
        }

    trainer = Engine(train_update_function)

    common.setup_common_distrib_training_handlers(
        trainer, train_sampler,
        to_save={'model': model, 'optimizer': optimizer},
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        with_gpu_stats=True,
        output_names=['supervised batch loss', ],
        with_pbars=True,
        with_pbar_on_iters=with_mlflow_logging,
        log_every_iters=1)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config, "benchmark_dataflow_num_iters", 1000)
        DataflowBenchmark(benchmark_dataflow_num_iters,
                          prepare_batch=prepare_batch,
                          device=device).attach(trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(device=device),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5, device=device),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    evaluator_args = dict(
        model=model,
        metrics=val_metrics,
        device=device,
        non_blocking=non_blocking,
        prepare_batch=prepare_batch,
        output_transform=lambda x, y, y_pred: (model_output_transform(y_pred), y,))
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_mlflow_logging:
        ProgressBar(persist=False, desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(_):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)),
        run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator, trainer,
                                               metric_name=score_metric_name)

    if dist.get_rank() == 0:
        tb_logger = common.setup_tb_logging(config.output_path.as_posix(),
                                            trainer, optimizer,
                                            evaluators={
                                                "training": train_evaluator,
                                                "validation": evaluator
                                            })
        if with_mlflow_logging:
            common.setup_mlflow_logging(trainer, optimizer,
                                        evaluators={
                                            "training": train_evaluator,
                                            "validation": evaluator
                                        })
        if with_plx_logging:
            common.setup_plx_logging(trainer, optimizer,
                                     evaluators={
                                         "training": train_evaluator,
                                         "validation": evaluator
                                     })

        common.save_best_model_by_val_score(config.output_path.as_posix(),
                                            evaluator, model,
                                            metric_name=score_metric_name,
                                            trainer=trainer)

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2))

        tb_logger.attach(
            train_evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="training"),
            event_name=Events.ITERATION_COMPLETED(once=len(train_eval_loader) // 2))

    trainer.run(train_loader, max_epochs=config.num_epochs)
def run_training(model, train, valid, optimizer, loss, lr_find=False):
    print_file(f'Experiment: {rcp.experiment}\nDescription:{rcp.description}',
               f'{rcp.base_path}description.txt')
    print_file(model, f'{rcp.models_path}model.txt')
    print_file(get_transforms(), f'{rcp.models_path}transform_{rcp.stage}.txt')

    # Data
    train.transform = get_transforms()
    valid.transform = get_transforms()
    train.save_csv(f'{rcp.base_path}train_df_{rcp.stage}.csv')
    valid.save_csv(f'{rcp.base_path}valid_df_{rcp.stage}.csv')
    train_loader = DataLoader(train, batch_size=rcp.bs, num_workers=8, shuffle=rcp.shuffle_batch)
    valid_loader = DataLoader(valid, batch_size=rcp.bs, num_workers=8, shuffle=rcp.shuffle_batch)

    if lr_find:
        lr_finder(model, optimizer, loss, train_loader, valid_loader)

    one_batch = next(iter(train_loader))
    dot = make_dot(model(one_batch[0].to(cfg.device)), params=dict(model.named_parameters()))
    dot.render(f'{rcp.models_path}graph', './', format='png', cleanup=True)
    summary(model, one_batch[0].shape[-3:], batch_size=rcp.bs, device=cfg.device,
            to_file=f'{rcp.models_path}summary_{rcp.stage}.txt')

    # Engines
    trainer = create_supervised_trainer(model, optimizer, loss, device=cfg.device)
    t_evaluator = create_supervised_evaluator(model,
                                              metrics={
                                                  'accuracy': Accuracy(),
                                                  'nll': Loss(loss),
                                                  'precision': Precision(average=True),
                                                  'recall': Recall(average=True),
                                                  'topK': TopKCategoricalAccuracy()
                                              },
                                              device=cfg.device)
    v_evaluator = create_supervised_evaluator(
        model,
        metrics={
            'accuracy': Accuracy(),
            'nll': Loss(loss),
            'precision_avg': Precision(average=True),
            'recall_avg': Recall(average=True),
            'topK': TopKCategoricalAccuracy(),
            'conf_mat': ConfusionMatrix(num_classes=len(valid.classes), average=None),
        },
        device=cfg.device)

    # Tensorboard
    tb_logger = TensorboardLogger(log_dir=f'{rcp.tb_log_path}{rcp.stage}')
    tb_writer = tb_logger.writer
    tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer, "lr"),
                     event_name=Events.EPOCH_STARTED)
    tb_logger.attach(trainer, log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(trainer, log_handler=WeightsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer, log_handler=GradsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer, log_handler=GradsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED)

    @trainer.on(Events.EPOCH_COMPLETED)
    def tb_and_log_training_stats(engine):
        t_evaluator.run(train_loader)
        v_evaluator.run(valid_loader)
        tb_and_log_train_valid_stats(engine, t_evaluator, v_evaluator, tb_writer)

    @trainer.on(Events.ITERATION_COMPLETED(every=int(1 + len(train_loader) / 100)))
    def print_dash(engine):
        print('-', sep='', end='', flush=True)

    if cfg.show_batch_images:
        @trainer.on(Events.STARTED)
        def show_batch_images(engine):
            imgs, lbls = next(iter(train_loader))
            denormalize = DeNormalize(**rcp.transforms.normalize)
            for i in range(len(imgs)):
                imgs[i] = denormalize(imgs[i])
            imgs = imgs.to(cfg.device)
            grid = thv.utils.make_grid(imgs)
            tb_writer.add_image('images', grid, 0)
            tb_writer.add_graph(model, imgs)
            tb_writer.flush()

    if cfg.show_top_losses:
        @trainer.on(Events.COMPLETED)
        def show_top_losses(engine, k=6):
            nll_loss = nn.NLLLoss(reduction='none')
            df = predict_dataset(model, valid, nll_loss, transform=None,
                                 bs=rcp.bs, device=cfg.device)
            df.sort_values('loss', ascending=False, inplace=True)
            df.reset_index(drop=True, inplace=True)
            for i, row in df.iterrows():
                img = cv2.imread(str(row['fname']))
                img = th.as_tensor(img.transpose(2, 0, 1))  # CHW
                tag = f'TopLoss_{engine.state.epoch}/{row.loss:.4f}/{row.target}/{row.pred}/{row.pred2}'
                tb_writer.add_image(tag, img, 0)
                if i >= k - 1:
                    break
            tb_writer.flush()

    if cfg.tb_projector:
        images, labels = train.select_n_random(250)
        # get the class labels for each image
        class_labels = [train.classes[lab] for lab in labels]
        # log embeddings
        features = images.view(-1, images.shape[-1] * images.shape[-2])
        tb_writer.add_embedding(features, metadata=class_labels, label_img=images)

    if cfg.log_pr_curve:
        @trainer.on(Events.COMPLETED)
        def log_pr_curve(engine):
            """
            1. gets the probability predictions in a test_size x num_classes Tensor
            2. gets the preds in a test_size Tensor
            takes ~10 seconds to run
            """
            class_probs = []
            class_preds = []
            with th.no_grad():
                for data in valid_loader:
                    imgs, lbls = data
                    imgs, lbls = imgs.to(cfg.device), lbls.to(cfg.device)
                    output = model(imgs)
                    class_probs_batch = [th.softmax(el, dim=0) for el in output]
                    _, class_preds_batch = th.max(output, 1)
                    class_probs.append(class_probs_batch)
                    class_preds.append(class_preds_batch)
            test_probs = th.cat([th.stack(batch) for batch in class_probs])
            test_preds = th.cat(class_preds)

            for i in range(len(valid.classes)):
                # takes a "class_index" from 0 to 9 and plots the corresponding
                # precision-recall curve
                tensorboard_preds = test_preds == i
                tensorboard_probs = test_probs[:, i]
                tb_writer.add_pr_curve(f'{rcp.stage}/{valid.classes[i]}',
                                       tensorboard_preds,
                                       tensorboard_probs,
                                       global_step=engine.state.epoch,
                                       num_thresholds=127)
                tb_writer.flush()
            print()

    if cfg.lr_scheduler:
        # lr_scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, factor=.5, min_lr=1e-7, verbose=True)
        # v_evaluator.add_event_handler(Events.EPOCH_COMPLETED, lambda engine: lr_scheduler.step(v_evaluator.state.metrics['nll']))
        lr_scheduler = DelayedCosineAnnealingLR(optimizer, 10, 5)
        trainer.add_event_handler(Events.EPOCH_COMPLETED,
                                  lambda engine: lr_scheduler.step(trainer.state.epoch))

    if cfg.early_stopping:
        def score_function(engine):
            score = -1 * round(engine.state.metrics['nll'], 5)
            # score = engine.state.metrics['accuracy']
            return score

        es_handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
        v_evaluator.add_event_handler(Events.COMPLETED, es_handler)

    if cfg.save_last_checkpoint:
        @trainer.on(Events.EPOCH_COMPLETED(every=1))
        def save_last_checkpoint(engine):
            checkpoint = {}
            objects = {'model': model, 'optimizer': optimizer}
            if cfg.lr_scheduler:
                objects['lr_scheduler'] = lr_scheduler
            for k, obj in objects.items():
                checkpoint[k] = obj.state_dict()
            th.save(checkpoint, f'{rcp.models_path}last_{rcp.stage}_checkpoint.pth')

    if cfg.save_best_checkpoint:
        def score_function(engine):
            score = -1 * round(engine.state.metrics['nll'], 5)
            # score = engine.state.metrics['accuracy']
            return score

        objects = {'model': model, 'optimizer': optimizer}
        if cfg.lr_scheduler:
            objects['lr_scheduler'] = lr_scheduler
        save_best = Checkpoint(
            objects,
            DiskSaver(f'{rcp.models_path}', require_empty=False, create_dir=True),
            n_saved=4,
            filename_prefix=f'best_{rcp.stage}',
            score_function=score_function,
            score_name='val_loss',
            global_step_transform=global_step_from_engine(trainer))
        v_evaluator.add_event_handler(Events.EPOCH_COMPLETED(every=1), save_best)

    load_checkpoint = False
    if load_checkpoint:
        resume_epoch = 6
        cp = f'{rcp.models_path}last_{rcp.stage}_checkpoint.pth'
        obj = th.load(f'{cp}')
        Checkpoint.load_objects(objects, obj)

        @trainer.on(Events.STARTED)
        def resume_training(engine):
            engine.state.iteration = (resume_epoch - 1) * len(engine.state.dataloader)
            engine.state.epoch = resume_epoch - 1

    if cfg.save_confusion_matrix:
        @trainer.on(Events.STARTED)
        def init_best_loss(engine):
            engine.state.metrics['best_loss'] = 1e99

        @trainer.on(Events.EPOCH_COMPLETED)
        def confusion_matrix(engine):
            if engine.state.metrics['best_loss'] > v_evaluator.state.metrics['nll']:
                engine.state.metrics['best_loss'] = v_evaluator.state.metrics['nll']
                cm = v_evaluator.state.metrics['conf_mat']
                cm_df = pd.DataFrame(cm.numpy(), index=valid.classes, columns=valid.classes)
                pretty_plot_confusion_matrix(
                    cm_df,
                    f'{rcp.results_path}cm_{rcp.stage}_{trainer.state.epoch}.png',
                    False)

    if cfg.log_stats:
        class Hook:
            def __init__(self, module):
                self.name = module[0]
                self.hook = module[1].register_forward_hook(self.hook_fn)
                self.stats_mean = 0
                self.stats_std = 0

            def hook_fn(self, module, input, output):
                self.stats_mean = output.mean()
                self.stats_std = output.std()

            def close(self):
                self.hook.remove()

        hookF = [Hook(layer) for layer in list(model.cnn.named_children())]

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_stats(engine):
            for hook in hookF:
                tb_writer.add_scalar(f'std/{hook.name}', hook.stats_std,
                                     engine.state.iteration)
                tb_writer.add_scalar(f'mean/{hook.name}', hook.stats_mean,
                                     engine.state.iteration)

    cfg.save_yaml()
    rcp.save_yaml()
    print(f'# batches: train: {len(train_loader)}, valid: {len(valid_loader)}')
    trainer.run(data=train_loader, max_epochs=rcp.max_epochs)
    tb_writer.close()
    tb_logger.close()
    return model
def multiclass_train_lstm(
    model: LstmClassifier,
    dataloader_train: DataLoader,
    dataloader_val: DataLoader,
    filename_prefix: str,
):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-3)
    criterion = CrossEntropyLossOneHot()

    def process_function(_engine, batch):
        model.train()
        optimizer.zero_grad()
        x, y = batch
        x = x.to(device)
        y = y.to(device)
        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        return y_pred, y, loss.item()

    def eval_function(_engine, batch):
        model.eval()
        with torch.no_grad():
            x, y = batch
            y = y.to(device)
            x = x.to(device)
            y_pred = model(x)
            return y_pred, y

    def score_function(engine):
        return engine.state.metrics['top3-accuracy']

    model.to(device)

    trainer = Engine(process_function)
    train_evaluator = Engine(eval_function)
    validation_evaluator = Engine(eval_function)

    accuracy_top1 = Accuracy(output_transform=lambda x: (x[0], x[1]),
                             device=device, is_multilabel=True)
    accuracy_top3 = TopKCategoricalAccuracy(output_transform=lambda x: (x[0], x[1]),
                                            k=3, device=device)

    RunningAverage(accuracy_top1).attach(trainer, 'accuracy')
    RunningAverage(accuracy_top3).attach(trainer, 'top3-accuracy')
    RunningAverage(output_transform=lambda x: x[2]).attach(trainer, 'loss')

    accuracy_top1.attach(train_evaluator, 'accuracy')
    accuracy_top3.attach(train_evaluator, 'top3-accuracy')
    Loss(criterion).attach(train_evaluator, 'loss')

    accuracy_top1.attach(validation_evaluator, 'accuracy')
    accuracy_top3.attach(validation_evaluator, 'top3-accuracy')
    Loss(criterion).attach(validation_evaluator, 'loss')

    pbar = ProgressBar(persist=True, bar_format="")
    pbar.attach(engine=trainer, metric_names='all')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        train_evaluator.run(dataloader_train)
        message = f'Training results - Epoch: {engine.state.epoch}.'
        for metric_name, score in train_evaluator.state.metrics.items():
            message += f' {metric_name}: {score:.2f}.'
        pbar.log_message(message)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        validation_evaluator.run(dataloader_val)
        message = f'Validation results - Epoch: {engine.state.epoch}.'
        # report the validation evaluator's metrics (the original read the train
        # evaluator here, which looked like a copy-paste slip)
        for metric_name, score in validation_evaluator.state.metrics.items():
            message += f' {metric_name}: {score:.2f}.'
        pbar.log_message(message)
        pbar.n = pbar.last_print_n = 0

    validation_evaluator.add_event_handler(
        Events.COMPLETED,
        EarlyStopping(patience=5, score_function=score_function, trainer=trainer))

    checkpointer = ModelCheckpoint(dirname=DIR_MODELS,
                                   filename_prefix=filename_prefix,
                                   score_function=score_function,
                                   score_name='top3-accuracy',
                                   n_saved=2,
                                   create_dir=True,
                                   save_as_state_dict=True,
                                   require_empty=False)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'v2': model})

    trainer.run(dataloader_train, max_epochs=20)
)
print('done')

## SETUP TRAINER AND EVALUATOR
# Setup model trainer and evaluator
print('[INFO] Creating Ignite training, evaluation objects and logging...', end='')
trainer = create_trainer(model=model, optimizer=optimizer, criterion=criterion,
                         lr_scheduler=lr_scheduler)

# Metrics - running average
RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')
# Metrics - epochs
metrics = {
    'accuracy': Accuracy(),
    'recall': Recall(average=True),
    'precision': Precision(average=True),
    'f1': Fbeta(beta=1),
    'topKCatAcc': TopKCategoricalAccuracy(k=5),
    'loss': Loss(criterion)
}

# Create evaluators
evaluator = create_evaluator(model, metrics=metrics)
train_evaluator = create_evaluator(model, metrics=metrics, tag='train')

# Add validation logging
trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1), evaluate_model)

# Add step length update at the end of each epoch
trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: lr_scheduler.step())

# Add TensorBoard logging
tb_logger = TensorboardLogger(log_dir=os.path.join(working_dir, 'tb_logs'))
def training(local_rank, config, logger=None):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses by default fp16 AMP")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = \
        config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    if not hasattr(config, "prepare_batch"):
        config.prepare_batch = _prepare_batch

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config, "benchmark_dataflow_num_iters", 1000)
        DataflowBenchmark(benchmark_dataflow_num_iters,
                          prepare_batch=config.prepare_batch).attach(trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    @trainer.on(Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1))
                | Events.COMPLETED)
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator, trainer,
                                               metric_name=score_metric_name)

    # Store 3 best models by validation accuracy:
    common.save_best_model_by_val_score(
        config.output_path.as_posix(),
        evaluator,
        model=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    if idist.get_rank() == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={"training": train_evaluator, "validation": evaluator},
        )
        exp_tracking_logger = exp_tracking.setup_logging(
            trainer, optimizer,
            evaluators={"training": train_evaluator, "validation": evaluator})

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2),
        )
        tb_logger.attach(
            train_evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="training"),
            event_name=Events.ITERATION_COMPLETED(once=len(train_eval_loader) // 2),
        )

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
        exp_tracking_logger.close()
def test_topk_accuracy(self, k: int, y_pred: Tensor, y_true: Tensor, score: float):
    accuracy = TopKCategoricalAccuracy(k=k)
    accuracy.update((y_pred, y_true))
    self.assertEqual(score, accuracy.compute())
def __init__(self, prefix, loss_type: str, threshold=0.5, top_k=[1, 5, 10],
             n_classes: int = None, multilabel: bool = None,
             metrics=["precision", "recall", "top_k", "accuracy"]):
    self.loss_type = loss_type.upper()
    self.threshold = threshold
    self.n_classes = n_classes
    self.multilabel = multilabel
    self.top_ks = top_k
    self.prefix = prefix

    add_f1_metric = False
    if n_classes:
        top_k = [k for k in top_k if k < n_classes]

    self.metrics = {}
    for metric in metrics:
        if "precision" == metric:
            self.metrics[metric] = Precision(average=False, is_multilabel=multilabel,
                                             output_transform=None)
            if "micro_f1" in metrics:
                self.metrics["precision_avg"] = Precision(average=True,
                                                          is_multilabel=multilabel,
                                                          output_transform=None)
        elif "recall" == metric:
            self.metrics[metric] = Recall(average=False, is_multilabel=multilabel,
                                          output_transform=None)
            if "micro_f1" in metrics:
                self.metrics["recall_avg"] = Recall(average=True,
                                                    is_multilabel=multilabel,
                                                    output_transform=None)
        elif "top_k" in metric:
            if multilabel:
                self.metrics[metric] = TopKMultilabelAccuracy(k_s=top_k)
            else:
                self.metrics[metric] = TopKCategoricalAccuracy(
                    k=max(int(np.log(n_classes)), 1), output_transform=None)
        elif "f1" in metric:
            add_f1_metric = True
            continue
        elif "accuracy" in metric:
            self.metrics[metric] = Accuracy(is_multilabel=multilabel, output_transform=None)
        elif "ogbn" in metric:
            self.metrics[metric] = NodeClfEvaluator(NodeEvaluator(metric))
        elif "ogbg" in metric:
            self.metrics[metric] = NodeClfEvaluator(GraphEvaluator(metric))
        elif "ogbl" in metric:
            self.metrics[metric] = LinkPredEvaluator(LinkEvaluator(metric))
        else:
            print(f"WARNING: metric {metric} doesn't exist")

    if add_f1_metric:
        assert "precision" in self.metrics and "recall" in self.metrics

        def macro_f1(precision, recall):
            return (precision * recall * 2 / (precision + recall + 1e-12)).mean()

        self.metrics["macro_f1"] = MetricsLambda(macro_f1,
                                                 self.metrics["precision"],
                                                 self.metrics["recall"])

        if "micro_f1" in metrics:
            def micro_f1(precision, recall):
                return precision * recall * 2 / (precision + recall + 1e-12)

            self.metrics["micro_f1"] = MetricsLambda(micro_f1,
                                                     self.metrics["precision_avg"],
                                                     self.metrics["recall_avg"])

    self.reset_metrics()