Example #1
def test_integration():

    n_iters = 100
    batch_size = 10
    n_classes = 10
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    loss_values = iter(range(n_iters))

    def update_fn(engine, batch):
        loss_value = next(loss_values)
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return (
            loss_value,
            torch.from_numpy(y_pred_batch),
            torch.from_numpy(y_true_batch),
        )

    trainer = Engine(update_fn)
    alpha = 0.98

    acc_metric = RunningAverage(
        Accuracy(output_transform=lambda x: [x[1], x[2]]), alpha=alpha)
    acc_metric.attach(trainer, "running_avg_accuracy")

    avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha)
    avg_output.attach(trainer, "running_avg_output")

    running_avg_acc = [
        None,
    ]

    @trainer.on(Events.ITERATION_COMPLETED)
    def manual_running_avg_acc(engine):
        _, y_pred, y = engine.state.output
        indices = torch.max(y_pred, 1)[1]
        correct = torch.eq(indices, y).view(-1)
        num_correct = torch.sum(correct).item()
        num_examples = correct.shape[0]
        batch_acc = num_correct * 1.0 / num_examples
        if running_avg_acc[0] is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = running_avg_acc[0] * alpha + (
                1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]

    @trainer.on(Events.EPOCH_STARTED)
    def running_avg_output_init(engine):
        engine.state.running_avg_output = None

    @trainer.on(Events.ITERATION_COMPLETED)
    def running_avg_output_update(engine):
        if engine.state.running_avg_output is None:
            engine.state.running_avg_output = engine.state.output[0]
        else:
            engine.state.running_avg_output = (
                engine.state.running_avg_output * alpha +
                (1.0 - alpha) * engine.state.output[0])

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_acc_values(engine):
        assert (engine.state.running_avg_acc == engine.state.
                metrics["running_avg_accuracy"]), "{} vs {}".format(
                    engine.state.running_avg_acc,
                    engine.state.metrics["running_avg_accuracy"])

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_output_values(engine):
        assert (engine.state.running_avg_output ==
                engine.state.metrics["running_avg_output"]), "{} vs {}".format(
                    engine.state.running_avg_output,
                    engine.state.metrics["running_avg_output"])

    np.random.seed(10)
    running_avg_acc = [
        None,
    ]
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)

    running_avg_acc = [
        None,
    ]
    n_iters = 10
    batch_size = 10
    n_classes = 10
    data = list(range(n_iters))
    loss_values = iter(range(n_iters))
    y_true_batch_values = iter(
        np.random.randint(0, n_classes, size=(n_iters, batch_size)))
    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
    trainer.run(data, max_epochs=1)
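
A minimal sketch (not part of the original test) of the exponential moving average that RunningAverage(alpha=0.98) maintains and that the manual handlers above re-implement, on a plain stream of values:

alpha = 0.98
running = None
for value in [3.0, 1.0, 4.0, 1.0, 5.0]:
    # same recurrence as in manual_running_avg_acc above
    running = value if running is None else running * alpha + (1.0 - alpha) * value
    print(running)
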
Example #2
def main(dataset_path, batch_size=256, max_epochs=10):
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "torch.cuda.amp requires the cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"

    train_loader, test_loader, eval_train_loader = get_train_eval_loaders(dataset_path, batch_size=batch_size)

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)

    scaler = GradScaler()

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device, non_blocking=True)
        y = convert_tensor(batch[1], device, non_blocking=True)

        optimizer.zero_grad()

        # Runs the forward pass with autocasting.
        with autocast():
            y_pred = model(x)
            loss = criterion(y_pred, y)

        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same precision that autocast used for corresponding forward ops.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

        return loss.item()

    trainer = Engine(train_step)
    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}

    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def log_metrics(engine, title):
        for name in metrics:
            print(f"\t{title} {name}: {engine.state.metrics[name]:.2f}")

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        print(f"- Mean elapsed time for 1 epoch: {timer.value()}")
        print("- Metrics:")
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Train"):
            evaluator.run(eval_train_loader)

        with evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Test"):
            evaluator.run(test_loader)

    trainer.run(train_loader, max_epochs=max_epochs)
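
The helper get_train_eval_loaders is defined elsewhere in the example's repository. As an illustration only, a stand-in built on torchvision's CIFAR100 (an assumption based on num_classes=100; names and normalization values are placeholders) could look like:

from torch.utils.data import DataLoader, Subset
from torchvision import datasets
from torchvision.transforms import Compose, Normalize, Pad, RandomCrop, RandomHorizontalFlip, ToTensor


def get_train_eval_loaders(path, batch_size=256):
    # Hypothetical stand-in, not the original helper.
    train_transform = Compose([
        Pad(4),
        RandomCrop(32),
        RandomHorizontalFlip(),
        ToTensor(),
        Normalize((0.5, 0.5, 0.5), (0.25, 0.25, 0.25)),
    ])
    test_transform = Compose([
        ToTensor(),
        Normalize((0.5, 0.5, 0.5), (0.25, 0.25, 0.25)),
    ])
    train_ds = datasets.CIFAR100(root=path, train=True, transform=train_transform, download=True)
    test_ds = datasets.CIFAR100(root=path, train=False, transform=test_transform, download=True)
    eval_train_ds = Subset(train_ds, range(len(test_ds)))  # train subset of comparable size

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
    eval_train_loader = DataLoader(eval_train_ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
    return train_loader, test_loader, eval_train_loader
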
Example #3
def fit_model_multiclass(model,
                         train_loader,
                         test_loader,
                         lr,
                         max_epochs=5,
                         number_of_classes=2):

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    def threshold_output_transform(output):
        y_pred, y = output
        print('got in here threshold ')
        y_pred = torch.heaviside(y_pred, values=torch.zeros(1))
        # print(f'y_pred size : {y_pred.size()}')
        # print(f'y size : {y.size()}')
        return y_pred, y

    def prepare_batch(batch, device, non_blocking):
        x, y = batch
        # print(f'x type: {x.dtype}')
        # print(f'y type: {y.dtype}')
        x = x.to(dtype=torch.long)
        y = y.to(dtype=torch.long)
        # print(f'x type: {x.dtype}')
        # print(f'y type: {y.dtype}')
        # y = torch.unsqueeze(y, 1)
        y = y.squeeze()
        return (x, y)

    def squeeze_y_dims(output):
        prediction, target = output
        print('got in here squeeze y dims')
        # print(f'prediction size: {prediction.size()}')
        # print(f'target size: {target.size()}')
        return prediction.long(), target.squeeze().long()

    def correct_shape(output):
        y_pred, y = output
        print('got in here squeeze correct shape')
        one_hot_y = torch.nn.functional.one_hot(y,
                                                num_classes=number_of_classes)
        one_hot_y = one_hot_y.squeeze(1)

        argmax = y_pred.argmax(1)
        m = torch.zeros(y_pred.shape).scatter(1, argmax.unsqueeze(1), 1.0)

        return m, one_hot_y

    def trainer_output_shape(output):
        print('got here output transform trainer ')
        x, y, y_pred, loss = output

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch=prepare_batch)

    val_metrics = {
        "accuracy": Accuracy(output_transform=correct_shape,
                             is_multilabel=True),
        "loss": Loss(criterion, output_transform=squeeze_y_dims)
        # "precision" : Precision(threshold_output_transform, average=False),
        # "recall": Recall(threshold_output_transform, average=False)
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=val_metrics,
                                            prepare_batch=prepare_batch)

    # @trainer.on(Events.ITERATION_COMPLETED(every=10))
    # def log_training_loss(trainer):
    #     evaluator.run(train_loader)
    #     metrics = evaluator.state.metrics
    #     print(f"Training Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}")

    # @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        # print(f"Training Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f} Avg precision : {metrics['precision']:.2f} Avg recall: {metrics['recall']:.2f}")
        print(
            f"Training Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}"
        )

    @trainer.on(Events.EPOCH_COMPLETED(every=10))
    def log_validation_results(trainer):
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        # print(f"Validation Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f} Avg precision : {metrics['precision']:.2f} Avg recall: {metrics['recall']:.2f}")
        print(
            f"Validation Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}"
        )

    trainer.run(train_loader, max_epochs=max_epochs)

    return model
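
For clarity (an added illustration, not part of the original code), this is what correct_shape does on a tiny batch: logits become one-hot predictions via argmax + scatter, and integer labels become one-hot targets, which is the format Accuracy(is_multilabel=True) expects:

import torch
import torch.nn.functional as F

y_pred = torch.tensor([[0.2, 1.5], [2.0, -1.0]])  # logits, shape (N, C)
y = torch.tensor([[1], [0]])                      # integer labels, shape (N, 1)

argmax = y_pred.argmax(1)
m = torch.zeros(y_pred.shape).scatter(1, argmax.unsqueeze(1), 1.0)
one_hot_y = F.one_hot(y, num_classes=2).squeeze(1)
print(m)          # tensor([[0., 1.], [1., 0.]])
print(one_hot_y)  # tensor([[0, 1], [1, 0]])
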
Example #4
def run_inference_test(root_dir, model_file, device=torch.device("cuda:0")):
    images = sorted(glob(os.path.join(root_dir, "im*.nii.gz")))
    segs = sorted(glob(os.path.join(root_dir, "seg*.nii.gz")))
    val_files = [{
        "image": img,
        "label": seg
    } for img, seg in zip(images, segs)]

    # define transforms for image and segmentation
    val_transforms = Compose([
        LoadNiftid(keys=["image", "label"]),
        AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
        ScaleIntensityd(keys=["image", "label"]),
        ToTensord(keys=["image", "label"]),
    ])

    # create a validation data loader
    val_ds = monai.data.Dataset(data=val_files, transform=val_transforms)
    val_loader = monai.data.DataLoader(val_ds, batch_size=1, num_workers=4)

    # create UNet, DiceLoss and Adam optimizer
    net = monai.networks.nets.UNet(
        dimensions=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)

    val_post_transforms = Compose([
        Activationsd(keys="pred", sigmoid=True),
        AsDiscreted(keys="pred", threshold_values=True),
        KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
    ])
    val_handlers = [
        StatsHandler(output_transform=lambda x: None),
        CheckpointLoader(load_path=f"{model_file}", load_dict={"net": net}),
        SegmentationSaver(
            output_dir=root_dir,
            batch_transform=lambda batch: batch["image_meta_dict"],
            output_transform=lambda output: output["pred"],
        ),
    ]

    evaluator = SupervisedEvaluator(
        device=device,
        val_data_loader=val_loader,
        network=net,
        inferer=SlidingWindowInferer(roi_size=(96, 96, 96),
                                     sw_batch_size=4,
                                     overlap=0.5),
        post_transform=val_post_transforms,
        key_val_metric={
            "val_mean_dice":
            MeanDice(include_background=True,
                     output_transform=lambda x: (x["pred"], x["label"]))
        },
        additional_metrics={
            "val_acc":
            Accuracy(output_transform=lambda x: (x["pred"], x["label"]))
        },
        val_handlers=val_handlers,
    )
    evaluator.run()

    return evaluator.state.best_metric
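
A minimal invocation sketch (the directory and checkpoint path below are placeholders, not taken from the original code):

if __name__ == "__main__":
    # root_dir must contain matching im*.nii.gz / seg*.nii.gz pairs.
    best_dice = run_inference_test(root_dir="./val_data", model_file="./runs/net_final.pt")
    print("best mean Dice:", best_dice)
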
Example #5
def train():
    # parser = ArgumentParser()
    # parser.add_argument("--dataset_path", type=str, default="",
    #                     help="Path or url of the dataset. If empty download from S3.")
    # parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    # parser.add_argument("--model_checkpoint", type=str,
    #                     default="/home/rohola/codes/transfer-learning-conv-ai/model/bert_large_uncased",
    #                     help="Path, url or short name of the model")
    # # parser.add_argument("--model_checkpoint", type=str, default="/home/rohola/codes/transfer-learning-conv-ai/logs/logs4", help="Path, url or short name of the model")
    # # parser.add_argument("--model_checkpoint", type=str, default="bert-base-uncased", help="Path, url or short name of the model")
    # parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
    # parser.add_argument("--do_lower_case", default='True', action='store_true')
    # parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history")
    # parser.add_argument("--train_batch_size", type=int, default=3, help="Batch size for training")
    # parser.add_argument("--valid_batch_size", type=int, default=1, help="Batch size for validation")
    # parser.add_argument("--gradient_accumulation_steps", type=int, default=8,
    #                     help="Accumulate gradients on several steps")
    # parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate")
    # parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup Proportion")
    # parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    # parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    # parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    # parser.add_argument("--n_epochs", type=int, default=4, help="Number of training epochs")
    # parser.add_argument("--personality_permutations", type=int, default=1,
    #                     help="Number of permutations of personality sentences")
    # parser.add_argument("--eval_before_start", action='store_true',
    #                     help="If true start with a first evaluation before training")
    # parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu",
    #                     help="Device (cuda or cpu)")
    # parser.add_argument("--fp16", type=str, default="",
    #                     help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    # parser.add_argument("--local_rank", type=int, default=-1,
    #                     help="Local rank for distributed training (-1: not distributed)")
    # parser.add_argument("--log_dir", type=str, default="",
    #                     help="Local rank for distributed training (-1: not distributed)")
    # config = parser.parse_args()

    config_file = "configs/train_using_bert_config.json"
    config = Config.from_json_file(config_file)

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", config.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    tokenizer = BertTokenizer.from_pretrained(
        config.model_checkpoint, do_lower_case=config.do_lower_case)
    model = BertDoubleHeadsModel.from_pretrained(config.model_checkpoint)
    model.to(config.device)
    optimizer = BertAdam(model.parameters(),
                         lr=config.lr,
                         warmup=config.warmup_proportion)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[config.local_rank],
                                        output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        config, tokenizer)

    # Training function and trainer

    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(config.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, input_mask = batch
        lm_loss, mc_loss = model(input_ids, mc_token_ids, input_mask,
                                 lm_labels, mc_labels, token_type_ids)
        loss = (lm_loss * config.lm_coef +
                mc_loss * config.mc_coef) / config.gradient_accumulation_steps
        # mc_loss = model(input_ids, mc_token_ids, input_mask, lm_labels=None, mc_labels=mc_labels, token_type_ids=token_type_ids)
        # loss = mc_loss[0] / config.gradient_accumulation_steps
        if config.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           config.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
        if engine.state.iteration % config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(config.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, input_mask = batch
            model_outputs = model(input_ids,
                                  mc_token_ids,
                                  input_mask,
                                  token_type_ids=token_type_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            # loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-1)
            # l = loss_fct(lm_logits_flat_shifted, lm_labels_flat_shifted)
            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if config.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if config.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if config.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, config.lr),
                                 (config.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], config),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if config.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=config.log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(config,
                   tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=config.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if config.local_rank in [-1, 0] and config.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
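
As an aside (added sketch, not from the original script), the gradient-accumulation pattern used in update() above, shown standalone: the loss is divided by the number of accumulation steps, and optimizer.step()/zero_grad() only run every accumulation_steps iterations.

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accumulation_steps = 8

for iteration in range(1, 33):
    x = torch.randn(16, 4)
    y = torch.randint(0, 2, (16,))
    loss = torch.nn.functional.cross_entropy(model(x), y) / accumulation_steps
    loss.backward()  # gradients accumulate across iterations
    if iteration % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
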
Example #6
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(
            lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
        best_model_handler)

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        logger.exception("")
        raise e

    if rank == 0:
        tb_logger.close()
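
A sketch of how a training(local_rank, config) function like the one above is usually launched with ignite.distributed. Only the config keys referenced in the snippet above are shown, with placeholder values; helpers such as get_dataflow/initialize/create_trainer may require additional keys.

import ignite.distributed as idist

config = {
    "seed": 543,
    "model": "resnet18",
    "batch_size": 512,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "num_epochs": 24,
    "learning_rate": 0.4,
    "num_warmup_epochs": 4,
    "validate_every": 3,
    "output_path": "./output-cifar10",
    "stop_iteration": None,
    "with_clearml": False,
}

with idist.Parallel(backend="nccl", nproc_per_node=2) as parallel:
    parallel.run(training, config)
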
Example #7
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="ImageNet-Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = "stop-on-{}".format(config["stop_iteration"])

        folder_name = "{}_backend-{}-{}_{}".format(config["model"],
                                                   idist.backend(),
                                                   idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_imagenet_dataloader(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_supervised_trainer(model, optimizer, criterion,
                                        lr_scheduler, train_loader.sampler,
                                        config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same roles:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info("Stop training on {} iteration".format(
                trainer.state.iteration))
            trainer.terminate()

    @trainer.on(Events.ITERATION_COMPLETED(every=20))
    def print_acc(engine):
        if rank == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}"\
                    .format(engine.state.epoch, engine.state.iteration, len(train_loader),
                            engine.state.saved_batch_loss
                            ))

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
Example #8
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    if sys.version_info > (3, ):
        from ignite.contrib.metrics.gpu_info import GpuInfo

        try:
            GpuInfo().attach(trainer)
        except RuntimeError:
            print(
                "INFO: By default, in this example it is possible to log GPU information (used memory, utilization). "
                "As there is no pynvml python package installed, GPU information won't be logged. Otherwise, please "
                "install it : `pip install pynvml`")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    tb_logger = TensorboardLogger(log_dir=log_dir)

    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
        metric_names="all",
    )

    for tag, evaluator in [("training", train_evaluator),
                           ("validation", validation_evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    tb_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    tb_logger.attach(trainer,
                     log_handler=WeightsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    tb_logger.attach(trainer,
                     log_handler=WeightsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED(every=100))

    tb_logger.attach(trainer,
                     log_handler=GradsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    tb_logger.attach(trainer,
                     log_handler=GradsHistHandler(model),
                     event_name=Events.EPOCH_COMPLETED(every=100))

    def score_function(engine):
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        log_dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    tb_logger.close()
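
A hypothetical invocation (the original script presumably parses these values from the command line; the numbers below are placeholders):

if __name__ == "__main__":
    run(train_batch_size=64, val_batch_size=100, epochs=10, lr=0.01, momentum=0.5,
        log_dir="./tensorboard_logs")
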
Example #9
def metrics_estimating(tester, criterion):
    Accuracy(output_transform=thresholded_output_transform).attach(tester, 'accuracy')
    Recall(output_transform=thresholded_output_transform, average=True).attach(tester, 'recall')
    Precision(output_transform=thresholded_output_transform, average=True).attach(tester, 'precision')
    Loss(criterion).attach(tester, 'loss')
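
thresholded_output_transform is not shown in this example. A common definition for binary outputs (an assumption, not the original code) rounds sigmoid probabilities to 0/1 before the metrics are computed:

import torch

def thresholded_output_transform(output):
    # output is the (y_pred, y) pair produced by the engine's process function
    y_pred, y = output
    y_pred = torch.round(torch.sigmoid(y_pred))
    return y_pred, y
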
Example #10
    def run(self, train_loader, val_loader, test_loader):
        """Perform model training and evaluation on holdout dataset."""

        ## attach certain metrics to trainer ##
        CpuInfo().attach(self.trainer, "cpu_util")
        Loss(self.loss).attach(self.trainer, "loss")

        ###### configure evaluator settings ######
        def get_output_transform(target: str, collapse_y: bool = False):
            return lambda out: metric_output_transform(
                out, self.loss, target, collapse_y=collapse_y)

        graph_num_classes = len(self.graph_classes)
        node_num_classes = len(self.node_classes)
        node_num_classes = 2 if node_num_classes == 1 else node_num_classes

        node_output_transform = get_output_transform("node")
        node_output_transform_collapsed = get_output_transform("node",
                                                               collapse_y=True)
        graph_output_transform = get_output_transform("graph")
        graph_output_transform_collapsed = get_output_transform(
            "graph", collapse_y=True)

        # metrics we are interested in
        base_metrics: dict = {
            'loss':
            Loss(self.loss),
            "cpu_util":
            CpuInfo(),
            'node_accuracy_avg':
            Accuracy(output_transform=node_output_transform,
                     is_multilabel=False),
            'node_accuracy':
            LabelwiseAccuracy(output_transform=node_output_transform,
                              is_multilabel=False),
            "node_recall":
            Recall(output_transform=node_output_transform_collapsed,
                   is_multilabel=False,
                   average=False),
            "node_precision":
            Precision(output_transform=node_output_transform_collapsed,
                      is_multilabel=False,
                      average=False),
            "node_f1_score":
            Fbeta(1,
                  output_transform=node_output_transform_collapsed,
                  average=False),
            "node_c_matrix":
            ConfusionMatrix(node_num_classes,
                            output_transform=node_output_transform_collapsed,
                            average=None)
        }

        metrics = dict(**base_metrics)

        # settings for the evaluator
        evaluator_settings = {
            "device": self.device,
            "loss_fn": self.loss,
            "node_classes": self.node_classes,
            "graph_classes": self.graph_classes,
            "non_blocking": True,
            "metrics": OrderedDict(sorted(metrics.items(),
                                          key=lambda m: m[0])),
            "pred_collector_function": self._pred_collector_function
        }

        ## configure evaluators ##
        val_evaluator = None
        if len(val_loader):
            val_evaluator = create_supervised_evaluator(
                self.model, **evaluator_settings)
            # configure behavior for early stopping
            if self.stopper:
                val_evaluator.add_event_handler(Events.COMPLETED, self.stopper)
            # configure behavior for checkpoint saving
            val_evaluator.add_event_handler(Events.COMPLETED,
                                            self.best_checkpoint_handler)
            val_evaluator.add_event_handler(Events.COMPLETED,
                                            self.latest_checkpoint_handler)
        else:
            self.trainer.add_event_handler(Events.COMPLETED,
                                           self.latest_checkpoint_handler)

        test_evaluator = None
        if len(test_loader):
            test_evaluator = create_supervised_evaluator(
                self.model, **evaluator_settings)
        #############################

        @self.trainer.on(Events.STARTED)
        def log_training_start(trainer):
            self.custom_print("Start training...")

        @self.trainer.on(Events.EPOCH_COMPLETED)
        def compute_metrics(trainer):
            """Compute evaluation metric values after each epoch."""

            epoch = trainer.state.epoch

            self.custom_print(f"Finished epoch {epoch:03d}!")

            if len(val_loader):
                self.persist_collection = True
                val_evaluator.run(val_loader)
                self._save_collected_predictions(
                    prefix=f"validation_epoch{epoch:03}")
                # write metrics to file
                self.write_metrics(trainer, val_evaluator, suffix="validation")

        @self.trainer.on(Events.COMPLETED)
        def log_training_complete(trainer):
            """Trigger evaluation on test set if training is completed."""

            epoch = trainer.state.epoch
            suffix = "(Early Stopping)" if epoch < self.epochs else ""

            self.custom_print("Finished after {:03d} epochs! {}".format(
                epoch, suffix))

            # load best model for evaluation
            self.custom_print("Load best model for final evaluation...")
            last_checkpoint: str = self.best_checkpoint_handler.last_checkpoint or self.latest_checkpoint_handler.last_checkpoint
            best_checkpoint_path = os.path.join(self.save_path,
                                                last_checkpoint)
            checkpoint_path_dict: dict = {
                "latest_checkpoint_path":
                best_checkpoint_path  # we want to load states from the best checkpoint as "latest" configuration for testing
            }
            self.model, self.optimizer, self.trainer, _, _, _ = self._load_checkpoint(
                self.model,
                self.optimizer,
                self.trainer,
                None,
                None,
                None,
                checkpoint_path_dict=checkpoint_path_dict)

            if len(test_loader):
                self.persist_collection = True
                test_evaluator.run(test_loader)
                self._save_collected_predictions(prefix="test_final")
                # write metrics to file
                self.write_metrics(trainer, test_evaluator, suffix="test")

        # terminate training if Nan values are produced
        self.trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                       TerminateOnNan())

        # start the actual training
        self.custom_print(f"Train for a maximum of {self.epochs} epochs...")
        self.trainer.run(train_loader, max_epochs=self.epochs)
Example #11
def training(local_rank, config, logger=None):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses fp16 AMP by default")

    torch.backends.cudnn.benchmark = True

    set_seed(config.seed + local_rank)

    train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader

    # Setup model, optimizer, criterion
    model, optimizer, criterion = initialize(config)

    if not hasattr(config, "prepare_batch"):
        config.prepare_batch = _prepare_batch

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler,
                             config, logger)

    if getattr(config, "benchmark_dataflow", False):
        benchmark_dataflow_num_iters = getattr(config,
                                               "benchmark_dataflow_num_iters",
                                               1000)
        DataflowBenchmark(benchmark_dataflow_num_iters,
                          prepare_batch=config.prepare_batch).attach(
                              trainer, train_loader)

    # Setup evaluators
    val_metrics = {
        "Accuracy": Accuracy(),
        "Top-5 Accuracy": TopKCategoricalAccuracy(k=5),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator, train_evaluator = create_evaluators(model, val_metrics, config)

    @trainer.on(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1))
        | Events.COMPLETED)
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(val_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)

    score_metric_name = "Accuracy"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    # Store 3 best models by validation accuracy:
    common.save_best_model_by_val_score(
        config.output_path.as_posix(),
        evaluator,
        model=model,
        metric_name=score_metric_name,
        n_saved=3,
        trainer=trainer,
        tag="val",
    )

    if idist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={
                "training": train_evaluator,
                "validation": evaluator
            },
        )

        exp_tracking_logger = exp_tracking.setup_logging(trainer,
                                                         optimizer,
                                                         evaluators={
                                                             "training":
                                                             train_evaluator,
                                                             "validation":
                                                             evaluator
                                                         })

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation"),
            event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2),
        )

        tb_logger.attach(
            train_evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="training"),
            event_name=Events.ITERATION_COMPLETED(
                once=len(train_eval_loader) // 2),
        )

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
        exp_tracking_logger.close()
Example #12
def train(model, train_loader, eval_loaders, optimizer, loss_fn,
          n_it_max, patience, split_names, select_metric='Val accuracy_0',
          select_mode='max', viz=None, device='cpu', lr_scheduler=None, name=None, log_steps=None,
          log_epoch=False, _run=None, prepare_batch=_prepare_batch,
          single_pass=False, n_ep_max=None):

    # print(model)

    if not log_steps and not log_epoch:
        logger.warning('/!\\ No logging during training /!\\')

    if log_steps is None:
        log_steps = []

    epoch_steps = len(train_loader)
    if log_epoch:
        log_steps.append(epoch_steps)

    if single_pass:
        max_epoch = 1
    elif n_ep_max is None:
        assert n_it_max is not None
        max_epoch = int(n_it_max / epoch_steps) + 1
    else:
        assert n_it_max is None
        max_epoch = n_ep_max

    all_metrics = defaultdict(dict)
    trainer = create_supervised_trainer(model, optimizer, loss_fn,
                                        device=device,
                                        prepare_batch=prepare_batch)

    if hasattr(model, 'new_epoch_hook'):
        trainer.add_event_handler(Events.EPOCH_STARTED, model.new_epoch_hook)
    if hasattr(model, 'new_iter_hook'):
        trainer.add_event_handler(Events.ITERATION_STARTED,
                                     model.new_iter_hook)

    trainer.logger.setLevel(logging.WARNING)

    # trainer output is in the format (x, y, y_pred, loss, optionals)
    train_loss = RunningAverage(output_transform=lambda out: out[3].item(),
                                epoch_bound=True)
    train_loss.attach(trainer, 'Trainer loss')
    if hasattr(model, 's'):
        met = Average(output_transform=lambda _: float('nan') if model.s is None else model.s)
        met.attach(trainer, 'cur_s')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, met.completed, 'cur_s')

    if hasattr(model, 'arch_sampler') and model.arch_sampler.distrib_dim > 0:
        met = Average(output_transform=lambda _: float('nan') if model.cur_split is None else model.cur_split)
        met.attach(trainer, 'Trainer split')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, met.completed, 'Trainer split')
        # trainer.add_event_handler(Events.EPOCH_STARTED, met.started)
        all_ent = Average(
            output_transform=lambda out: out[-1]['arch_entropy_avg'].item())
        all_ent.attach(trainer, 'Trainer all entropy')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, all_ent.completed, 'Trainer all entropy')
        train_ent = Average(
            output_transform=lambda out: out[-1]['arch_entropy_sample'].item())
        train_ent.attach(trainer, 'Trainer sampling entropy')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, train_ent.completed, 'Trainer sampling entropy')
        trainer.add_event_handler(Events.EPOCH_COMPLETED,
                                  lambda engine: model.check_arch_freezing(
                                      ent=train_ent.compute(),
                                      epoch=engine.state.iteration/(epoch_steps*max_epoch))
                                  )
        def log_always(engine, name):
            val = engine.state.output[-1][name]
            all_metrics[name][engine.state.iteration/epoch_steps] = val.mean().item()

        def log_always_dict(engine, name):
            for node, val in engine.state.output[-1][name].items():
                all_metrics['node {} {}'.format(node, name)][engine.state.iteration/epoch_steps] = val.mean().item()
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always_dict, name='arch_grads')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always_dict, name='arch_probas')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always_dict, name='node_grads')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always, name='task all_loss')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always, name='arch all_loss')
        trainer.add_event_handler(Events.ITERATION_COMPLETED, log_always, name='entropy all_loss')

    if n_it_max is not None:
        StopAfterIterations([n_it_max]).attach(trainer)
    # epoch_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                          persist=True, disable=not (_run or viz))
    # epoch_pbar.attach(trainer, metric_names=['Train loss'])
    #
    # training_pbar = ProgressBar(bar_format='{l_bar}{bar}{r_bar}', desc=name,
    #                             persist=True, disable=not (_run or viz))
    # training_pbar.attach(trainer, event_name=Events.EPOCH_COMPLETED,
    #                      closing_event_name=Events.COMPLETED)
    total_time = Timer(average=False)
    eval_time = Timer(average=False)
    eval_time.pause()
    data_time = Timer(average=False)
    forward_time = Timer(average=False)
    forward_time.attach(trainer, start=Events.EPOCH_STARTED,
                        pause=Events.ITERATION_COMPLETED,
                        resume=Events.ITERATION_STARTED,
                        step=Events.ITERATION_COMPLETED)
    epoch_time = Timer(average=False)
    epoch_time.attach(trainer, start=Events.EPOCH_STARTED,
                      pause=Events.EPOCH_COMPLETED,
                      resume=Events.EPOCH_STARTED,
                      step=Events.EPOCH_COMPLETED)

    def get_loss(y_pred, y):
        l = loss_fn(y_pred, y)
        if not torch.is_tensor(l):
            l, *l_details = l
        return l.mean()

    def get_member(x, n=0):
        if isinstance(x, (list, tuple)):
            return x[n]
        return x

    eval_metrics = {'loss': Loss(get_loss)}

    for i in range(model.n_out):
        out_trans = get_attr_transform(i)

        def extract_ys(out, out_trans=out_trans):
            # Bind out_trans as a default argument so each metric keeps the
            # transform for its own output index (avoids the late-binding
            # closure issue inside this loop).
            x, y, y_pred, loss, _ = out
            return out_trans((y_pred, y))

        train_acc = Accuracy(extract_ys)
        train_acc.attach(trainer, 'Trainer accuracy_{}'.format(i))
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  train_acc.completed, 'Trainer accuracy_{}'.format(i))
        eval_metrics['accuracy_{}'.format(i)] = \
            Accuracy(output_transform=out_trans)
        # if isinstance(model, SSNWrapper):
        #     model.arch_sampler.entropy().mean()

    evaluator = create_supervised_evaluator(model, metrics=eval_metrics,
                                            device=device,
                                            prepare_batch=prepare_batch)
    last_iteration = 0
    patience_counter = 0

    best = {'value': float('inf') * 1 if select_mode == 'min' else -1,
            'iter': -1,
            'state_dict': None
            }

    def is_better(new, old):
        if select_mode == 'min':
            return new < old
        else:
            return new > old

    def log_results(evaluator, data_loader, iteration, split_name):
        evaluator.run(data_loader)
        metrics = evaluator.state.metrics

        log_metrics = {}

        for metric_name, metric_val in metrics.items():
            log_name = '{} {}'.format(split_name, metric_name)
            if viz:
                first = iteration == 0 and split_name == split_names[0]
                viz.line([metric_val], X=[iteration], win=metric_name,
                         name=log_name,
                         update=None if first else 'append',
                         opts={'title': metric_name,
                               'showlegend': True,
                               'width': 500, 'xlabel': 'iterations'})
                viz.line([metric_val], X=[iteration/epoch_steps],
                         win='{}epoch'.format(metric_name),
                         name=log_name,
                         update=None if first else 'append',
                         opts={'title': metric_name,
                               'showlegend': True,
                               'width': 500, 'xlabel': 'epoch'})
            if _run:
                _run.log_scalar(log_name, metric_val, iteration)
            log_metrics[log_name] = metric_val
            all_metrics[log_name][iteration] = metric_val

        return log_metrics

    if lr_scheduler is not None:
        @trainer.on(Events.EPOCH_COMPLETED)
        def step(_):
            lr_scheduler.step()
            # logger.warning('current lr {:.5e}'.format(
            #     optimizer.param_groups[0]['lr']))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_event(trainer):
        iteration = trainer.state.iteration if trainer.state else 0
        nonlocal last_iteration, patience_counter, best

        if not log_steps or not \
                (iteration in log_steps or iteration % log_steps[-1] == 0):
            return
        epoch_time.pause()
        eval_time.resume()
        all_metrics['training_epoch'][iteration] = iteration / epoch_steps
        all_metrics['training_iteration'][iteration] = iteration
        if hasattr(model, 'arch_sampler'):
            all_metrics['training_archs'][iteration] = \
                model.arch_sampler().squeeze().detach()
        # if hasattr(model, 'distrib_gen'):
        #     entropy = model.distrib_gen.entropy()
        #     all_metrics['entropy'][iteration] = entropy.mean().item()
        # if trainer.state and len(trainer.state.metrics) > 1:
        #     raise ValueError(trainer.state.metrics)
        all_metrics['data time'][iteration] = data_time.value()
        all_metrics['data time_ps'][iteration] = data_time.value() / max(data_time.step_count, 1.)
        all_metrics['forward time'][iteration] = forward_time.value()
        all_metrics['forward time_ps'][iteration] = forward_time.value() / max(forward_time.step_count, 1.)
        all_metrics['epoch time'][iteration] = epoch_time.value()
        all_metrics['epoch time_ps'][iteration] = epoch_time.value() / max(epoch_time.step_count, 1.)

        if trainer.state:
            # logger.warning(trainer.state.metrics)
            for metric, value in trainer.state.metrics.items():
                all_metrics[metric][iteration] = value
                if viz:
                    viz.line([value], X=[iteration], win=metric.split()[-1],
                             name=metric,
                             update=None if iteration == 0 else 'append',
                             opts={'title': metric,
                                   'showlegend': True,
                                   'width': 500, 'xlabel': 'iterations'})

        iter_this_step = iteration - last_iteration
        for d_loader, name in zip(eval_loaders, split_names):
            if name == 'Train':
                if iteration == 0:
                    all_metrics['Trainer loss'][iteration] = float('nan')
                    all_metrics['Trainer accuracy_0'][iteration] = float('nan')
                    if hasattr(model, 'arch_sampler'):
                        all_metrics['Trainer all entropy'][iteration] = float('nan')
                        all_metrics['Trainer sampling entropy'][iteration] = float('nan')
                    # if hasattr(model, 'cur_split'):
                        all_metrics['Trainer split'][iteration] = float('nan')
                continue
            split_metrics = log_results(evaluator, d_loader, iteration, name)
            if select_metric not in split_metrics:
                continue
            if is_better(split_metrics[select_metric], best['value']):
                best['value'] = split_metrics[select_metric]
                best['iter'] = iteration
                best['state_dict'] = copy.deepcopy(model.state_dict())
                if patience > 0:
                    patience_counter = 0
            elif patience > 0:
                patience_counter += iter_this_step
                if patience_counter >= patience:
                    logger.info('#####')
                    logger.info('# Early stopping Run')
                    logger.info('#####')
                    trainer.terminate()
        last_iteration = iteration
        eval_time.pause()
        eval_time.step()
        all_metrics['eval time'][iteration] = eval_time.value()
        all_metrics['eval time_ps'][iteration] = eval_time.value() / eval_time.step_count
        all_metrics['total time'][iteration] = total_time.value()
        epoch_time.resume()

    log_event(trainer)

    #
    # @trainer.on(Events.EPOCH_COMPLETED)
    # def log_epoch(trainer):
    #     iteration = trainer.state.iteration if trainer.state else 0
    #     epoch = iteration/epoch_steps
    #     fw_t = forward_time.value()
    #     fw_t_ps = fw_t / forward_time.step_count
    #     d_t = data_time.value()
    #     d_t_ps = d_t / data_time.step_count
    #     e_t = epoch_time.value()
    #     e_t_ps = e_t / epoch_time.step_count
    #     ev_t = eval_time.value()
    #     ev_t_ps = ev_t / eval_time.step_count
    #     logger.warning('<{}> Epoch {}/{} finished (Forward: {:.3f}s({:.3f}), '
    #                    'data: {:.3f}s({:.3f}), epoch: {:.3f}s({:.3f}),'
    #                    ' Eval: {:.3f}s({:.3f}), Total: '
    #                    '{:.3f}s)'.format(type(model).__name__, epoch,
    #                                      max_epoch, fw_t, fw_t_ps, d_t, d_t_ps,
    #                                      e_t, e_t_ps, ev_t, ev_t_ps,
    #                                      total_time.value()))

    data_time.attach(trainer, start=Events.STARTED,
                     pause=Events.ITERATION_STARTED,
                     resume=Events.ITERATION_COMPLETED,
                     step=Events.ITERATION_STARTED)

    if hasattr(model, 'iter_per_epoch'):
        model.iter_per_epoch = len(train_loader)
    trainer.run(train_loader, max_epochs=max_epoch)
    return trainer.state.iteration, all_metrics, best
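The Timer handlers above (forward_time, data_time, epoch_time) all follow the same ignite attachment pattern; a minimal, self-contained sketch of it (assumes pytorch-ignite is installed; `_step` and `report` are illustrative names):

from ignite.engine import Engine, Events
from ignite.handlers import Timer


def _step(engine, batch):
    return batch  # placeholder update function


trainer = Engine(_step)

iter_timer = Timer(average=True)
iter_timer.attach(trainer,
                  start=Events.EPOCH_STARTED,
                  resume=Events.ITERATION_STARTED,
                  pause=Events.ITERATION_COMPLETED,
                  step=Events.ITERATION_COMPLETED)


@trainer.on(Events.EPOCH_COMPLETED)
def report(engine):
    # Timer(average=True) divides the accumulated time by the number of step() calls
    print("avg iteration time: {:.6f}s".format(iter_timer.value()))


trainer.run(range(10), max_epochs=1)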
Example #13
0
def task_diagnostics(tasks, train_data, val_data, vocabulary, model, args):
    devicea = -1
    if torch.cuda.is_available():
        devicea = 0
    train_activations = []
    test_activations = []
    train_labels = []
    test_labels = []

    for tid,task in enumerate(tasks):
        train_act, train_lab = evaluate_get_dataset(model, task, vocabulary[task],
                                                   train_data[task], 1000, tid)
        test_act, test_lab = evaluate_get_dataset(model, task, vocabulary[task],
                                                  val_data[task], 500, tid)
        train_activations.append(train_act)
        test_activations.append(test_act)
        train_labels.append(train_lab)
        test_labels.append(test_lab)

    train_activations = torch.cat(train_activations, dim=0)
    test_activations = torch.cat(test_activations, dim=0)
    train_labels = torch.cat(train_labels, dim=0)
    test_labels = torch.cat(test_labels, dim=0)
    print("Activations ", train_activations.shape, test_activations.shape, train_labels.shape, test_labels.shape)
    # Datasets
    train_ds = torch.utils.data.TensorDataset(train_activations, train_labels)
    test_ds = torch.utils.data.TensorDataset(test_activations, test_labels)
    train_dl = torch.utils.data.DataLoader(train_ds, batch_size=32)
    test_dl = torch.utils.data.DataLoader(test_ds, batch_size=2100)

    # Models and Optimizer
    diag_model = DiagnositicClassifier(train_activations.size(1), 128, len(tasks))
    if devicea != -1:
        diag_model.cuda(devicea)
    optimizer = utils.get_optimizer(args.opt_alg, diag_model.parameters(), args.lr, args.wdecay)
    criterion = nn.CrossEntropyLoss()

    # ignite training loops
    if devicea == -1:
        trainer = create_supervised_trainer(diag_model, optimizer, criterion)
        evaluator = create_supervised_evaluator(diag_model, {"accuracy": Accuracy(), "loss": Loss(criterion)})
        val_evaluator = create_supervised_evaluator(diag_model, {"accuracy": Accuracy(), "loss": Loss(criterion)})
    else:
        trainer = create_supervised_trainer(diag_model, optimizer, diag_model.loss_function, device=devicea)
        evaluator = create_supervised_evaluator(diag_model, metrics={'accuracy': Accuracy()}, device=devicea)
        val_evaluator = create_supervised_evaluator(diag_model, metrics={'accuracy': Accuracy()}, device=devicea)
    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        evaluator.run(train_dl)
        val_evaluator.run(test_dl)

    def score_function(engine):
        return engine.state.metrics['accuracy']
    
    early_stop_metric = EarlyStopping(patience=20, score_function=score_function, trainer=trainer)
    val_evaluator.add_event_handler(Events.COMPLETED, early_stop_metric)
    trainer.run(train_dl, max_epochs=1000)
    logits, test_labels = val_evaluator.state.output

    _, predicted = torch.max(logits, 1)
    correct_ones = (predicted == test_labels).sum()
    metrics = {}
    for i,task in enumerate(tasks):
        start = i*500
        end = (i+1)*500
        correct_this = (predicted[start:end] == test_labels[start:end]).sum()
        metrics[task] = correct_this.item()/500
        #print("Task based accuracy", start, end , task, correct_this)

    metrics["overall"] = val_evaluator.state.metrics["accuracy"]
    print("Diagnostics metric", metrics)
    return metrics
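A note on the EarlyStopping handler used above: it terminates the trainer after `patience` validation runs without improvement, and it always treats a higher score as better, so a loss would have to be negated. A minimal sketch (illustrative names; the engines here are placeholders and nothing is run):

from ignite.engine import Engine, Events
from ignite.handlers import EarlyStopping

trainer = Engine(lambda engine, batch: None)
evaluator = Engine(lambda engine, batch: None)


def neg_loss_score(engine):
    # EarlyStopping maximizes the score, so return the negated loss
    return -engine.state.metrics["loss"]


handler = EarlyStopping(patience=5, score_function=neg_loss_score, trainer=trainer)
# read the score each time a validation run completes
evaluator.add_event_handler(Events.COMPLETED, handler)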
Example #14
0

trainer = Engine(process_function)
train_evaluator = Engine(eval_function)
validator_evaluator = Engine(eval_function)

RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')


def thresholded_output_transform(output):
    y_pred, y = output
    y_pred = torch.round(y_pred)
    return y_pred, y


Accuracy(output_transform=thresholded_output_transform).attach(train_evaluator, 'accuracy')
Loss(loss_function).attach(train_evaluator, 'loss_train')  # binary cross entropy

Accuracy(output_transform=thresholded_output_transform).attach(validator_evaluator, 'accuracy')
Loss(loss_function).attach(validator_evaluator, 'loss_val')

pbar = ProgressBar(persist=True, bar_format="")
pbar.attach(trainer, ['loss'])


@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
    train_evaluator.run(train_iter)
    metrics = train_evaluator.state.metrics
    avg_accuracy = metrics['accuracy']
    avg_loss = metrics['loss_train']
Example #15
0
def test_binary_input():

    acc = Accuracy()

    def _test(y_pred, y, batch_size):
        acc.reset()
        if batch_size > 1:
            n_iters = y.shape[0] // batch_size + 1
            for i in range(n_iters):
                idx = i * batch_size
                acc.update(
                    (y_pred[idx:idx + batch_size], y[idx:idx + batch_size]))
        else:
            acc.update((y_pred, y))

        np_y = y.numpy().ravel()
        np_y_pred = y_pred.numpy().ravel()

        assert acc._type == "binary"
        assert isinstance(acc.compute(), float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(acc.compute())

    def get_test_cases():

        test_cases = [
            # Binary accuracy on input of shape (N, 1) or (N, )
            (torch.randint(0, 2, size=(10, )).long(),
             torch.randint(0, 2, size=(10, )).long(), 1),
            (torch.randint(0, 2, size=(10, 1)).long(),
             torch.randint(0, 2, size=(10, 1)).long(), 1),
            # updated batches
            (torch.randint(0, 2, size=(50, )).long(),
             torch.randint(0, 2, size=(50, )).long(), 16),
            (torch.randint(0, 2, size=(50, 1)).long(),
             torch.randint(0, 2, size=(50, 1)).long(), 16),
            # Binary accuracy on input of shape (N, L)
            (torch.randint(0, 2, size=(10, 5)).long(),
             torch.randint(0, 2, size=(10, 5)).long(), 1),
            (torch.randint(0, 2, size=(10, 8)).long(),
             torch.randint(0, 2, size=(10, 8)).long(), 1),
            # updated batches
            (torch.randint(0, 2, size=(50, 5)).long(),
             torch.randint(0, 2, size=(50, 5)).long(), 16),
            (torch.randint(0, 2, size=(50, 8)).long(),
             torch.randint(0, 2, size=(50, 8)).long(), 16),
            # Binary accuracy on input of shape (N, H, W, ...)
            (torch.randint(0, 2, size=(4, 1, 12, 10)).long(),
             torch.randint(0, 2, size=(4, 1, 12, 10)).long(), 1),
            (torch.randint(0, 2, size=(15, 1, 20, 10)).long(),
             torch.randint(0, 2, size=(15, 1, 20, 10)).long(), 1),
            # updated batches
            (torch.randint(0, 2, size=(50, 1, 12, 10)).long(),
             torch.randint(0, 2, size=(50, 1, 12, 10)).long(), 16),
            (torch.randint(0, 2, size=(50, 1, 20, 10)).long(),
             torch.randint(0, 2, size=(50, 1, 20, 10)).long(), 16),
        ]

        return test_cases

    for _ in range(5):
        # check multiple random inputs as random exact occurencies are rare
        test_cases = get_test_cases()
        for y_pred, y, batch_size in test_cases:
            _test(y_pred, y, batch_size)
Example #16
0
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    wandb_logger = WandBLogger(
        project="pytorch-ignite-integration",
        name="ignite-mnist-example",
        config={
            "train_batch_size": train_batch_size,
            "val_batch_size": val_batch_size,
            "epochs": epochs,
            "lr": lr,
            "momentum": momentum,
        },
    )

    wandb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    for tag, evaluator in [("training", train_evaluator),
                           ("validation", validation_evaluator)]:
        wandb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=lambda *_: trainer.state.iteration,
        )

    wandb_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)
    wandb_logger.watch(model, log="all")

    def score_function(engine):
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        wandb_logger.run.dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    wandb_logger.close()
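The `Events.ITERATION_COMPLETED(every=100)` arguments passed to the WandB handlers above use ignite's event filtering; a minimal sketch of how it behaves (assumes a recent pytorch-ignite version):

from ignite.engine import Engine, Events

trainer = Engine(lambda engine, batch: None)


@trainer.on(Events.ITERATION_COMPLETED(every=100))
def every_100_iterations(engine):
    print("iteration", engine.state.iteration)


trainer.run(range(300), max_epochs=1)  # fires at iterations 100, 200 and 300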
Example #17
0
                        num_workers=num_workers)

test_dataset = TrainValTestDataset(image_dataset, mode="test")
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=args.batchsize,
                         num_workers=num_workers)

model = Model(number_of_classes=number_of_classes)

optimizer = optim.Adam(model.parameters(), lr=args.learningrate)

trainer = create_supervised_trainer(model, optimizer, criterion, device=device)

metrics = {
    "accuracy":
    Accuracy(),
    "MAE":
    MeanAbsoluteError(
        output_transform=lambda out: (torch.max(out[0], dim=1)[1], out[1])),
    "MSE":
    MeanSquaredError(
        output_transform=lambda out: (torch.max(out[0], dim=1)[1], out[1])),
    "loss":
    Loss(loss_fn=criterion)
}

evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)


@trainer.on(Events.ITERATION_COMPLETED)
def log_training_loss(trainer):
Example #18
0
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
path = '/mnt/ssd1/datasets/Recursion_class'
path_data = path
device = 'cuda'
batch_size = 64
warnings.filterwarnings('ignore')


criterion = nn.CrossEntropyLoss()
criterion = FocalLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
# optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0003, momentum=0.9)

metrics = {
    'loss': Loss(criterion),
    'accuracy': Accuracy(),
}


trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)


@trainer.on(Events.EPOCH_COMPLETED)
def compute_and_display_val_metrics(engine):
    epoch = engine.state.epoch
    metrics = val_evaluator.run(val_loader).metrics
    print("Validation Results - Epoch: {}  Average Loss: {:.4f} | Accuracy: {:.4f}"
          .format(epoch,
                  metrics['loss'],
                  metrics['accuracy']))
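The chained call `val_evaluator.run(val_loader).metrics` above works because `Engine.run()` returns the engine's `State` object; a tiny sketch:

from ignite.engine import Engine

evaluator = Engine(lambda engine, batch: None)
state = evaluator.run([0, 1, 2])
assert state is evaluator.state  # run() returns the engine's own State
print(state.metrics)             # same dict as evaluator.state.metrics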
Example #19
0
# Create DenseNet121
net = monai.networks.nets.densenet3d.densenet121(
    in_channels=1,
    out_channels=2,
)
device = torch.device("cuda:0")


def prepare_batch(batch, device=None, non_blocking=False):
    return _prepare_batch((batch['img'], batch['label']), device, non_blocking)


metric_name = 'Accuracy'
# add evaluation metric to the evaluator engine
val_metrics = {metric_name: Accuracy()}
# ignite evaluator expects batch=(img, label) and returns output=(y_pred, y) at every iteration,
# user can add output_transform to return other values
evaluator = create_supervised_evaluator(net,
                                        val_metrics,
                                        device,
                                        True,  # non_blocking
                                        prepare_batch=prepare_batch)

# Add stats event handler to print validation stats via evaluator
val_stats_handler = StatsHandler(
    name='evaluator',
    # no need to print loss value, so disable per-iteration output
    output_transform=lambda x: None,
)
val_stats_handler.attach(evaluator)
Example #20
0
def adv_prune_train_loop(model, params, ds, dset, min_y, base_data, model_id, prune_type, device, batch_size, tpa, max_epochs=5):
    #assert prune_type in ['global_unstructured', 'structured']
    total_prune_amount = tpa
    remove_amount = tpa
    ds_train, ds_valid = ds
    train_set, valid_set = dset
    min_y_train, min_y_val = min_y
    original_model = copy.deepcopy(model)
    original_model.eval()
    model_id = f'{model_id}_{prune_type}_pruning_{tpa}_l1'
    valid_freq = 200 * 500 // batch_size // 3

    conv_layers = [model.conv1]
    for sequential in [model.layer1, model.layer2, model.layer3, model.layer4]:
        for bottleneck in sequential:
            conv_layers.extend([bottleneck.conv1, bottleneck.conv2, bottleneck.conv3])
    conv_layers = conv_layers[:22]
    def prune_model(model):
        print(f'pruned model by {total_prune_amount}')
        if prune_type == 'global_unstructured':
            parameters_to_prune = [(layer, 'weight') for layer in conv_layers]
            prune.global_unstructured(
                parameters_to_prune,
                pruning_method=prune.L1Unstructured,
                amount=total_prune_amount,
            )
        else:
            for layer in conv_layers:
                # structured branch: the original loop body was missing here; this call
                # is a reconstructed guess using PyTorch's standard L1 structured pruning
                prune.ln_structured(layer, name='weight',
                                    amount=total_prune_amount, n=1, dim=0)

    prune_model(model)

    def valid_eval(model, dataset, dataloader, device, label):
        right = 0
        total = 0
        model.eval()
        with torch.no_grad():
            for i, data in tqdm(enumerate(dataloader), total=len(dataset) / dataloader.batch_size):
                data, y = data
                data = data.to(device)
                y = y.to(device) - label
                ans = model.forward(data)
                right += torch.sum(torch.eq(torch.argmax(ans, dim=1), y))
                total += y.shape[0]
        return right/total
    valid_acc = valid_eval(model, valid_set, ds_valid, device, min_y_val)
    print('initial accuracy:', valid_acc.item())
    
    with create_summary_writer(model, ds_train, base_data, model_id, device=device) as writer:
        lr = params['lr']
        mom = params['momentum']
        wd = params['l2_wd']
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=mom, weight_decay=wd)
        sched = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
        funcs = {'accuracy': Accuracy(), 'loss': Loss(F.cross_entropy)}
        loss = funcs['loss']._loss_fn

        acc_metric = Accuracy(device=device)
        loss_metric = Loss(F.cross_entropy, device=device)

        acc_val_metric = Accuracy(device=device)
        loss_val_metric = Loss(F.cross_entropy, device=device)

#         attack = GradientSignAttack(original_model, loss_fn=loss, eps=0.2)

        def train_step(engine, batch):
            model.train()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
#             with ctx_noparamgrad_and_eval(model):
#                 x_adv = attack.perturb(x, y)
#             optimizer.zero_grad()
#             x = torch.cat((x, x_adv))
#             y = torch.cat((y, y))
            ans = model.forward(x)
            l = loss(ans, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            with torch.no_grad():
                for layer in conv_layers:
                    layer.weight *= layer.weight_mask
            return l.item()

        trainer = Engine(train_step)

        def train_eval_step(engine, batch):
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
#             x_adv = attack.perturb(x, y)
#             x = torch.cat((x, x_adv))
#             y = torch.cat((y, y))
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        train_evaluator = Engine(train_eval_step)
        acc_metric.attach(train_evaluator, "accuracy")
        loss_metric.attach(train_evaluator, 'loss')

        def validation_step(engine, batch):
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_val
#             x_adv = attack.perturb(x, y)
#             x = torch.cat((x, x_adv))
#             y = torch.cat((y, y))
            with torch.no_grad():
                ans = model.forward(x)
            return ans, y

        valid_evaluator = Engine(validation_step)
        acc_val_metric.attach(valid_evaluator, "accuracy")
        loss_val_metric.attach(valid_evaluator, 'loss')

        @trainer.on(Events.ITERATION_COMPLETED(every=valid_freq))
        def log_validation_results(engine):
            valid_evaluator.run(ds_valid)
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            avg_nll = metrics['loss']
            print("Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
                  .format(engine.state.epoch, valid_avg_accuracy, avg_nll))
            writer.add_scalar("validation/avg_loss", avg_nll, engine.state.epoch)
            writer.add_scalar("validation/avg_accuracy", valid_avg_accuracy, engine.state.epoch)
            writer.add_scalar("validation/avg_error", 1. - valid_avg_accuracy, engine.state.epoch)

        @trainer.on(Events.EPOCH_COMPLETED)
        def lr_scheduler(engine):
            metrics = valid_evaluator.state.metrics
            val_accuracy = metrics['accuracy']
            # ReduceLROnPlateau defaults to mode='min'; it is stepped here with the
            # validation accuracy
            sched.step(val_accuracy)

        @trainer.on(Events.ITERATION_COMPLETED(every=100))
        def log_training_loss(engine):
            batch = engine.state.batch
            ds = DataLoader(TensorDataset(*batch), batch_size=batch_size)
            train_evaluator.run(ds)
            metrics = train_evaluator.state.metrics
            accuracy = metrics['accuracy']
            nll = metrics['loss']
            it = (engine.state.iteration - 1) % len(ds_train) + 1
            if (it % 50) == 0:
                print("Epoch[{}] Iter[{}/{}] Accuracy: {:.2f} Loss: {:.2f}"
                      .format(engine.state.epoch, it, len(ds_train), accuracy, nll))
            writer.add_scalar("batchtraining/detloss", nll, engine.state.epoch)
            writer.add_scalar("batchtraining/accuracy", accuracy, engine.state.iteration)
            writer.add_scalar("batchtraining/error", 1. - accuracy, engine.state.iteration)
            writer.add_scalar("batchtraining/loss", engine.state.output, engine.state.iteration)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_lr(engine):
            writer.add_scalar("lr", optimizer.param_groups[0]['lr'], engine.state.epoch)

        @trainer.on(Events.ITERATION_COMPLETED(every=valid_freq))
        def validation_value(engine):
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            return valid_avg_accuracy

        to_save = {'model': model}
        handler = Checkpoint(to_save, DiskSaver(os.path.join(base_data, model_id),
                                                create_dir=True),
                             score_function=validation_value, score_name="val_acc",
                             global_step_transform=global_step_from_engine(trainer),
                             n_saved=None)

        # kick everything off
        trainer.add_event_handler(Events.ITERATION_COMPLETED(every=valid_freq), handler)
        trainer.run(ds_train, max_epochs=max_epochs)
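Because the structured branch of prune_model above had to be partly reconstructed, here is a small sketch of the two torch.nn.utils.prune calls involved (standard PyTorch API; the layer names are illustrative):

import torch.nn as nn
import torch.nn.utils.prune as prune

conv_a = nn.Conv2d(3, 8, kernel_size=3)
conv_b = nn.Conv2d(3, 8, kernel_size=3)

# global unstructured pruning: 30% of all listed weights, ranked by |w| across layers
prune.global_unstructured([(conv_a, 'weight')],
                          pruning_method=prune.L1Unstructured,
                          amount=0.3)

# per-layer structured pruning: remove 30% of output channels by L1 norm
prune.ln_structured(conv_b, name='weight', amount=0.3, n=1, dim=0)

# both variants register a weight_mask buffer, which the training loop above
# re-applies to the weights after every optimizer step
print(hasattr(conv_a, 'weight_mask'), hasattr(conv_b, 'weight_mask'))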
Example #21
0
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument

    arg('clf_gt', help='segmentation predictions')
    # Dataset params
    arg('--test-height', type=int, default=2528)
    arg('--crop-height', type=int, default=768)
    arg('--crop-width', type=int, default=512)
    arg('--scale-aug', type=float, default=0.3)
    arg('--color-hue-aug', type=int, default=7)
    arg('--color-sat-aug', type=int, default=30)
    arg('--color-val-aug', type=int, default=30)
    arg('--n-tta', type=int, default=1)
    arg('--pseudolabels',
        nargs='+',
        help='path to pseudolabels to be added to train')
    arg('--pseudolabels-oversample', type=int, default=1)
    arg('--test-book', help='use only this book for testing and pseudolabels')
    arg('--fold', type=int, default=0)
    arg('--n-folds', type=int, default=5)
    arg('--train-limit', type=int)
    arg('--test-limit', type=int)
    # Model params
    arg('--base', default='resnet50')
    arg('--use-sequences', type=int, default=0)
    arg('--head-dropout', type=float, default=0.5)
    arg('--frozen-start', type=int)
    arg('--head', type=str, default='Head')
    # Training params
    arg('--device', default='cuda', help='device')
    arg('--opt-level', help='pass O1 to use fp16 training with apex')
    arg('--benchmark', type=int)
    arg('--batch-size', default=10, type=int)
    arg('--max-targets', type=int)
    arg('--workers',
        default=8,
        type=int,
        help='number of data loading workers')
    arg('--lr', default=14e-3, type=float, help='initial learning rate')
    arg('--wd', default=1e-4, type=float, help='weight decay')
    arg('--optimizer', default='sgd')
    arg('--accumulation-steps', type=int, default=1)
    arg('--epochs', default=50, type=int, help='number of total epochs to run')
    arg('--repeat-train', type=int, default=6)
    arg('--drop-lr-epoch',
        default=0,
        type=int,
        help='epoch at which to drop lr')
    arg('--cosine', type=int, default=1, help='cosine lr schedule')
    # Misc. params
    arg('--output-dir', help='path where to save')
    arg('--resume', help='resume from checkpoint')
    arg('--test-only', help='Only test the model', action='store_true')
    arg('--submission', help='Create submission', action='store_true')
    arg('--detailed-postfix', default='', help='postfix of detailed file name')
    arg('--print-model', default=1, type=int)
    arg('--dump-features', default=0, type=int)  # for knn, unused
    args = parser.parse_args()
    if args.test_only and args.submission:
        parser.error('pass one of --test-only and --submission')
    print(args)

    output_dir = Path(args.output_dir) if args.output_dir else None
    if output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)
        if not args.resume:
            (output_dir / 'params.json').write_text(
                json.dumps(vars(args), indent=4))

    print('Loading data')
    df_train_gt, df_valid_gt = load_train_valid_df(args.fold, args.n_folds)
    df_clf_gt = load_train_df(args.clf_gt)[['labels', 'image_id']]
    if args.submission:
        df_valid = df_train = df_clf_gt
        empty_index = df_valid['labels'] == ''
        empty_pages = df_valid[empty_index]['image_id'].values
        df_valid = df_valid[~empty_index]
    else:
        df_train, df_valid = [
            df_clf_gt[df_clf_gt['image_id'].isin(set(df['image_id']))]
            for df in [df_train_gt, df_valid_gt]
        ]
        df_valid = df_valid[df_valid['labels'] != '']
    if args.pseudolabels:
        df_ps = pd.concat(
            [pd.read_csv(p)[df_train.columns] for p in args.pseudolabels])
        if args.test_book:
            df_ps = df_ps[df_ps['image_id'].apply(
                lambda x: get_book_id(x) == args.test_book)]
        df_train = (
            pd.concat([df_train] +
                      [df_ps] * args.pseudolabels_oversample).reset_index(
                          drop=True))
    if args.test_book:
        df_valid = df_valid[df_valid['image_id'].apply(
            lambda x: get_book_id(x) == args.test_book)]
    if args.train_limit:
        df_train = df_train.sample(n=args.train_limit, random_state=42)
    if args.test_limit:
        df_valid = df_valid.sample(n=args.test_limit, random_state=42)
    gt_by_image_id = {item.image_id: item for item in df_valid_gt.itertuples()}
    print(f'{len(df_train):,} in train, {len(df_valid):,} in valid')
    classes = get_encoded_classes()

    def _get_transforms(*, train: bool):
        if not train and args.n_tta > 1:
            test_heights = [
                args.test_height * (1 + s)
                for s in np.linspace(0, args.scale_aug, args.n_tta)
            ]
            print('TTA test heights:', list(map(int, test_heights)))
        else:
            test_heights = [args.test_height]
        return [
            get_transform(
                train=train,
                test_height=test_height,
                crop_width=args.crop_width,
                crop_height=args.crop_height,
                scale_aug=args.scale_aug,
                color_hue_aug=args.color_hue_aug,
                color_sat_aug=args.color_sat_aug,
                color_val_aug=args.color_val_aug,
            ) for test_height in test_heights
        ]

    def make_test_data_loader(df):
        return DataLoader(
            Dataset(
                df=df,
                transforms=_get_transforms(train=False),
                resample_empty=False,
                classes=classes,
            ),
            batch_size=1,
            collate_fn=collate_fn,
            num_workers=args.workers,
        )

    data_loader_test = make_test_data_loader(df_valid)
    if args.dump_features:  # unused
        df_train = df_train[df_train['labels'] != '']
        data_loader_train = make_test_data_loader(df_train)
    else:
        data_loader_train = DataLoader(
            Dataset(
                df=pd.concat([df_train] * args.repeat_train),
                transforms=_get_transforms(train=True),
                resample_empty=True,
                classes=classes,
            ),
            num_workers=args.workers,
            shuffle=True,
            collate_fn=partial(collate_fn, max_targets=args.max_targets),
            batch_size=args.batch_size,
        )

    print('Creating model')
    fp16 = bool(args.opt_level)
    model: nn.Module = build_model(
        base=args.base,
        head=args.head,
        frozen_start=args.frozen_start,
        fp16=fp16,
        n_classes=len(classes),
        head_dropout=args.head_dropout,
        use_sequences=bool(args.use_sequences),
    )
    if args.print_model:
        print(model)
    device = torch.device(args.device)
    model.to(device)
    if args.benchmark:
        torch.backends.cudnn.benchmark = True

    parameters = model.parameters()
    if args.optimizer == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(parameters,
                              lr=args.lr,
                              weight_decay=args.wd,
                              momentum=0.9)
    else:
        parser.error(f'Unexpected optimizer {args.optimizer}')

    if fp16:
        from apex import amp
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level)
    loss = nn.CrossEntropyLoss()
    step = epoch = 0
    best_f1 = 0

    if args.resume:
        state = torch.load(args.resume, map_location='cpu')
        if 'optimizer' in state:
            optimizer.load_state_dict(state['optimizer'])
            model.load_state_dict(state['model'])
            step = state['step']
            epoch = state['epoch']
            best_f1 = state['best_f1']
        else:
            model.load_state_dict(state)
        del state

    @contextmanager
    def no_benchmark():
        torch.backends.cudnn.benchmark = False
        yield
        if args.benchmark:
            torch.backends.cudnn.benchmark = True

    if args.dump_features and not args.submission:  # unused
        if not output_dir:
            parser.error('set --output-dir with --dump-features')
        # We also dump test features below
        feature_evaluator = create_supervised_evaluator(
            model,
            device=device,
            prepare_batch=_prepare_batch,
            metrics={'features': GetFeatures(n_tta=args.n_tta)},
        )
        with no_benchmark():
            run_with_pbar(feature_evaluator,
                          data_loader_train,
                          desc='train features')
        torch.save(feature_evaluator.state.metrics['features'],
                   output_dir / 'train_features.pth')

    def get_y_pred_y(output):
        y_pred, y = output
        return get_output(y_pred), get_labels(y)

    metrics = {
        'accuracy': Accuracy(output_transform=get_y_pred_y),
        'loss': Loss(loss, output_transform=get_y_pred_y),
        'predictions': GetPredictions(n_tta=args.n_tta, classes=classes),
        'detailed': GetDetailedPrediction(n_tta=args.n_tta, classes=classes),
    }
    if args.dump_features:
        metrics['features'] = GetFeatures(n_tta=args.n_tta)
    evaluator = create_supervised_evaluator(model,
                                            device=device,
                                            prepare_batch=_prepare_batch,
                                            metrics=metrics)

    def evaluate():
        with no_benchmark():
            run_with_pbar(evaluator, data_loader_test, desc='evaluate')
        metrics = {
            'valid_loss': evaluator.state.metrics['loss'],
            'accuracy': evaluator.state.metrics['accuracy'],
        }
        scores = []
        for prediction, meta in evaluator.state.metrics['predictions']:
            item = gt_by_image_id[meta['image_id']]
            target_boxes, target_labels = get_target_boxes_labels(item)
            target_boxes = torch.from_numpy(target_boxes)
            pred_centers = np.array([p['center'] for p in prediction])
            pred_labels = [p['cls'] for p in prediction]
            scores.append(
                dict(score_boxes(
                    truth_boxes=from_coco(target_boxes).numpy(),
                    truth_label=target_labels,
                    preds_center=pred_centers,
                    preds_label=np.array(pred_labels),
                ),
                     image_id=item.image_id))
        metrics.update(get_metrics(scores))
        if output_dir:
            pd.DataFrame(evaluator.state.metrics['detailed']).to_csv(
                output_dir / f'detailed{args.detailed_postfix}.csv.gz',
                index=None)
        if args.dump_features:
            f_name = 'test' if args.submission else 'valid'
            torch.save(evaluator.state.metrics['features'],
                       output_dir / f'{f_name}_features.pth')
        return metrics

    def make_submission():
        with no_benchmark():
            run_with_pbar(evaluator, data_loader_test, desc='evaluate')
        submission = []
        for prediction, meta in tqdm.tqdm(
                evaluator.state.metrics['predictions']):
            submission.append(submission_item(meta['image_id'], prediction))
        submission.extend(
            submission_item(image_id, []) for image_id in empty_pages)
        pd.DataFrame(submission).to_csv(output_dir /
                                        f'submission_{output_dir.name}.csv.gz',
                                        index=None)
        pd.DataFrame(evaluator.state.metrics['detailed']).to_csv(
            output_dir / f'test_detailed{args.detailed_postfix}.csv.gz',
            index=None)
        if args.dump_features:
            torch.save(evaluator.state.metrics['features'],
                       output_dir / 'test_features.pth')

    if args.test_only or args.submission:
        if not args.resume:
            parser.error('please pass --resume when running with --test-only '
                         'or --submission')
        if args.test_only:
            print_metrics(evaluate())
        elif args.submission:
            if not output_dir:
                parser.error('--output-dir required with --submission')
            make_submission()
        return

    trainer = create_supervised_trainer(
        model,
        optimizer,
        loss_fn=lambda y_pred, y: loss(get_output(y_pred), get_labels(y)),
        device=device,
        prepare_batch=_prepare_batch,
        accumulation_steps=args.accumulation_steps,
        fp16=fp16,
    )

    epochs_left = args.epochs - epoch
    epochs_pbar = tqdm.trange(epochs_left)
    epoch_pbar = tqdm.trange(len(data_loader_train))
    train_losses = deque(maxlen=20)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(_):
        nonlocal step
        train_losses.append(trainer.state.output)
        smoothed_loss = np.mean(train_losses)
        epoch_pbar.set_postfix(loss=f'{smoothed_loss:.4f}')
        epoch_pbar.update(1)
        step += 1
        if step % 20 == 0 and output_dir:
            json_log_plots.write_event(output_dir,
                                       step=step * args.batch_size,
                                       loss=smoothed_loss)

    @trainer.on(Events.EPOCH_COMPLETED)
    def checkpoint(_):
        if output_dir:
            torch.save(
                {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'step': step,
                    'epoch': epoch,
                    'best_f1': best_f1,
                }, output_dir / 'checkpoint.pth')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(_):
        nonlocal best_f1
        metrics = evaluate()
        if output_dir:
            json_log_plots.write_event(output_dir,
                                       step=step * args.batch_size,
                                       **metrics)
        if metrics['f1'] > best_f1:
            best_f1 = metrics['f1']
            if output_dir:
                torch.save(model.state_dict(), output_dir / 'model_best.pth')
        epochs_pbar.set_postfix(
            {k: format_value(v)
             for k, v in metrics.items()})

    @trainer.on(Events.EPOCH_COMPLETED)
    def update_pbars_on_epoch_completion(_):
        nonlocal epoch
        epochs_pbar.update(1)
        epoch_pbar.reset()
        epoch += 1

    scheduler = None
    if args.drop_lr_epoch and args.cosine:
        parser.error('Choose only one schedule')
    if args.drop_lr_epoch:
        scheduler = StepLR(optimizer, step_size=args.drop_lr_epoch, gamma=0.1)
    if args.cosine:
        scheduler = CosineAnnealingLR(optimizer, epochs_left)
    if scheduler is not None:
        trainer.on(Events.EPOCH_COMPLETED)(lambda _: scheduler.step())

    trainer.run(data_loader_train, max_epochs=epochs_left)
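The create_supervised_trainer used above accepts `accumulation_steps` and `fp16`, so it is a project-specific factory rather than ignite's stock one; a hedged sketch of what gradient accumulation inside a custom ignite update function might look like (`make_accumulating_trainer` is an illustrative name):

from ignite.engine import Engine


def make_accumulating_trainer(model, optimizer, loss_fn, device, accumulation_steps=1):
    def update(engine, batch):
        model.train()
        x, y = batch
        x, y = x.to(device), y.to(device)
        loss = loss_fn(model(x), y) / accumulation_steps
        loss.backward()
        # only step and clear gradients every `accumulation_steps` iterations
        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item() * accumulation_steps

    return Engine(update)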
Example #22
0
def test_multilabel_input():
    acc = Accuracy(is_multilabel=True)

    def _test(y_pred, y, batch_size):
        acc.reset()
        if batch_size > 1:
            n_iters = y.shape[0] // batch_size + 1
            for i in range(n_iters):
                idx = i * batch_size
                acc.update(
                    (y_pred[idx:idx + batch_size], y[idx:idx + batch_size]))
        else:
            acc.update((y_pred, y))

        np_y_pred = to_numpy_multilabel(y_pred)
        np_y = to_numpy_multilabel(y)

        assert acc._type == "multilabel"
        assert isinstance(acc.compute(), float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(acc.compute())

    def get_test_cases():

        test_cases = [
            # Multilabel input data of shape (N, C) and (N, C)
            (torch.randint(0, 2, size=(10, 4)).long(),
             torch.randint(0, 2, size=(10, 4)).long(), 1),
            (torch.randint(0, 2, size=(10, 7)).long(),
             torch.randint(0, 2, size=(10, 7)).long(), 1),
            # updated batches
            (torch.randint(0, 2, size=(50, 4)).long(),
             torch.randint(0, 2, size=(50, 4)).long(), 16),
            (torch.randint(0, 2, size=(50, 7)).long(),
             torch.randint(0, 2, size=(50, 7)).long(), 16),
            # Multilabel input data of shape (N, H, W)
            (torch.randint(0, 2, size=(10, 5, 10)).long(),
             torch.randint(0, 2, size=(10, 5, 10)).long(), 1),
            (torch.randint(0, 2, size=(10, 4, 10)).long(),
             torch.randint(0, 2, size=(10, 4, 10)).long(), 1),
            # updated batches
            (torch.randint(0, 2, size=(50, 5, 10)).long(),
             torch.randint(0, 2, size=(50, 5, 10)).long(), 16),
            (torch.randint(0, 2, size=(50, 4, 10)).long(),
             torch.randint(0, 2, size=(50, 4, 10)).long(), 16),
            # Multilabel input data of shape (N, C, H, W, ...) and (N, C, H, W, ...)
            (torch.randint(0, 2, size=(4, 5, 12, 10)).long(),
             torch.randint(0, 2, size=(4, 5, 12, 10)).long(), 1),
            (torch.randint(0, 2, size=(4, 10, 12, 8)).long(),
             torch.randint(0, 2, size=(4, 10, 12, 8)).long(), 1),
            # updated batches
            (torch.randint(0, 2, size=(50, 5, 12, 10)).long(),
             torch.randint(0, 2, size=(50, 5, 12, 10)).long(), 16),
            (torch.randint(0, 2, size=(50, 10, 12, 8)).long(),
             torch.randint(0, 2, size=(50, 10, 12, 8)).long(), 16),
        ]
        return test_cases

    for _ in range(5):
        # check multiple random inputs as random exact occurencies are rare
        test_cases = get_test_cases()
        for y_pred, y, batch_size in test_cases:
            _test(y_pred, y, batch_size)
Example #23
0
def run_training_test(root_dir, device=torch.device("cuda:0")):
    images = sorted(glob(os.path.join(root_dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(root_dir, "seg*.nii.gz")))
    train_files = [{
        "image": img,
        "label": seg
    } for img, seg in zip(images[:20], segs[:20])]
    val_files = [{
        "image": img,
        "label": seg
    } for img, seg in zip(images[-20:], segs[-20:])]

    # define transforms for image and segmentation
    train_transforms = Compose([
        LoadNiftid(keys=["image", "label"]),
        AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
        ScaleIntensityd(keys=["image", "label"]),
        RandCropByPosNegLabeld(keys=["image", "label"],
                               label_key="label",
                               spatial_size=[96, 96, 96],
                               pos=1,
                               neg=1,
                               num_samples=4),
        RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]),
        ToTensord(keys=["image", "label"]),
    ])
    val_transforms = Compose([
        LoadNiftid(keys=["image", "label"]),
        AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
        ScaleIntensityd(keys=["image", "label"]),
        ToTensord(keys=["image", "label"]),
    ])

    # create a training data loader
    train_ds = monai.data.CacheDataset(data=train_files,
                                       transform=train_transforms,
                                       cache_rate=0.5)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    train_loader = monai.data.DataLoader(train_ds,
                                         batch_size=2,
                                         shuffle=True,
                                         num_workers=4)
    # create a validation data loader
    val_ds = monai.data.CacheDataset(data=val_files,
                                     transform=val_transforms,
                                     cache_rate=1.0)
    val_loader = monai.data.DataLoader(val_ds, batch_size=1, num_workers=4)

    # create UNet, DiceLoss and Adam optimizer
    net = monai.networks.nets.UNet(
        dimensions=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    loss = monai.losses.DiceLoss(sigmoid=True)
    opt = torch.optim.Adam(net.parameters(), 1e-3)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1)

    val_post_transforms = Compose([
        Activationsd(keys="pred", sigmoid=True),
        AsDiscreted(keys="pred", threshold_values=True),
        KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
    ])
    val_handlers = [
        StatsHandler(output_transform=lambda x: None),
        TensorBoardStatsHandler(log_dir=root_dir,
                                output_transform=lambda x: None),
        TensorBoardImageHandler(log_dir=root_dir,
                                batch_transform=lambda x:
                                (x["image"], x["label"]),
                                output_transform=lambda x: x["pred"]),
        CheckpointSaver(save_dir=root_dir,
                        save_dict={"net": net},
                        save_key_metric=True),
    ]

    evaluator = SupervisedEvaluator(
        device=device,
        val_data_loader=val_loader,
        network=net,
        inferer=SlidingWindowInferer(roi_size=(96, 96, 96),
                                     sw_batch_size=4,
                                     overlap=0.5),
        post_transform=val_post_transforms,
        key_val_metric={
            "val_mean_dice":
            MeanDice(include_background=True,
                     output_transform=lambda x: (x["pred"], x["label"]))
        },
        additional_metrics={
            "val_acc":
            Accuracy(output_transform=lambda x: (x["pred"], x["label"]))
        },
        val_handlers=val_handlers,
    )

    train_post_transforms = Compose([
        Activationsd(keys="pred", sigmoid=True),
        AsDiscreted(keys="pred", threshold_values=True),
        KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
    ])
    train_handlers = [
        LrScheduleHandler(lr_scheduler=lr_scheduler, print_lr=True),
        ValidationHandler(validator=evaluator, interval=2, epoch_level=True),
        StatsHandler(tag_name="train_loss",
                     output_transform=lambda x: x["loss"]),
        TensorBoardStatsHandler(log_dir=root_dir,
                                tag_name="train_loss",
                                output_transform=lambda x: x["loss"]),
        CheckpointSaver(save_dir=root_dir,
                        save_dict={
                            "net": net,
                            "opt": opt
                        },
                        save_interval=2,
                        epoch_level=True),
    ]

    trainer = SupervisedTrainer(
        device=device,
        max_epochs=5,
        train_data_loader=train_loader,
        network=net,
        optimizer=opt,
        loss_function=loss,
        inferer=SimpleInferer(),
        amp=False,
        post_transform=train_post_transforms,
        key_train_metric={
            "train_acc":
            Accuracy(output_transform=lambda x: (x["pred"], x["label"]))
        },
        train_handlers=train_handlers,
    )
    trainer.run()

    return evaluator.state.best_metric
Example #24
0
    def _test(metric_device):
        metric_device = torch.device(metric_device)
        acc = Accuracy(is_multilabel=True, device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
        y = torch.randint(0, 2, size=(4, 5, 8, 10), device=device).long()
        acc.update((y_pred, y))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(
            y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        np_y = to_numpy_multilabel(
            y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

        acc.reset()
        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
        y = torch.randint(0, 2, size=(4, 7, 10, 8), device=device).long()
        acc.update((y_pred, y))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(
            y_pred.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)
        np_y = to_numpy_multilabel(
            y.cpu())  # (N, C, H, W, ...) -> (N * H * W ..., C)

        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)
        # check that result is not changed
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)

        # Batched Updates
        acc.reset()
        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()
        y = torch.randint(0, 2, size=(80, 5, 8, 10), device=device).long()

        batch_size = 16
        n_iters = y.shape[0] // batch_size + 1

        for i in range(n_iters):
            idx = i * batch_size
            acc.update((y_pred[idx:idx + batch_size], y[idx:idx + batch_size]))

        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = to_numpy_multilabel(
            y_pred.cpu())  # (N, C, L, ...) -> (N * L * ..., C)
        np_y = to_numpy_multilabel(y.cpu())  # (N, C, L, ...) -> (N * L ..., C)

        assert acc._type == "multilabel"
        n = acc._num_examples
        res = acc.compute()
        assert n * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_y, np_y_pred) == pytest.approx(res)
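A condensed, single-process sketch of the distributed pieces exercised in the test above (assumes pytorch-ignite >= 0.4, where metrics accept a `device` argument and `idist.all_gather` is available):

import torch
import ignite.distributed as idist
from ignite.metrics import Accuracy

acc = Accuracy(device=torch.device("cpu"))   # accumulators are kept on this device
y_pred = torch.randint(0, 2, size=(16,)).long()
y = torch.randint(0, 2, size=(16,)).long()
acc.update((y_pred, y))

# all_gather concatenates tensors from every process; with a single process
# (world_size == 1) it simply returns the input tensor
y_pred_all = idist.all_gather(y_pred)
print(acc.compute(), y_pred_all.shape)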
Example #25
0
def train():
    config_file = "configs/train_full_config.json"
    config = Config.from_json_file(config_file)

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", config.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(config.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(config.device)
    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[config.local_rank],
                                        output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        config, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = tuple(
            input_tensor.to(config.device) for input_tensor in batch)
        lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels,
                                 token_type_ids, token_emotion_ids)
        loss = (lm_loss * config.lm_coef +
                mc_loss * config.mc_coef) / config.gradient_accumulation_steps
        if config.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           config.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
        if engine.state.iteration % config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(config.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids,
                                  mc_token_ids,
                                  token_type_ids=token_type_ids,
                                  token_emotion_ids=token_emotion_ids)
            # So we can also use GPT2 outputs
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if config.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if config.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if config.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, config.lr),
                                 (config.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
                    output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1])),
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar,
                                     metrics["nll"], config),
        "average_accuracy": MetricsLambda(average_distributed_scalar,
                                          metrics["accuracy"], config),
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if config.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=config.log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)}
        )  # "getattr" takes care of distributed encapsulation

        torch.save(config,
                   tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=config.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if config.local_rank in [-1, 0] and config.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
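Example #25 also relies on an `average_distributed_scalar` helper that is not shown. A minimal sketch consistent with how it is used above (averaging a scalar metric over distributed processes); the exact implementation in the original project may differ:

import torch


def average_distributed_scalar(scalar, config):
    # With a single process (local_rank == -1) return the value unchanged;
    # otherwise average it across all processes with an all_reduce.
    if config.local_rank == -1:
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device)
    scalar_t /= torch.distributed.get_world_size()
    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
    return scalar_t.item()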
Example #26
0
def test__check_type():
    acc = Accuracy()

    with pytest.raises(RuntimeError, match=r"Invalid shapes of y"):
        acc._check_type((torch.rand([1, 1, 1]), torch.rand([1])))
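For contrast with the invalid (1, 1, 1) vs (1,) pair above, a short sketch of shapes that Accuracy.update does accept for the multiclass and multilabel cases:

import torch
from ignite.metrics import Accuracy

# Multiclass: y_pred is (N, C) scores, y is (N,) integer class labels.
acc_multiclass = Accuracy()
acc_multiclass.update((torch.rand(8, 4),
                       torch.randint(0, 4, size=(8,)).long()))

# Multilabel: both y_pred and y are (N, C) tensors of 0/1 values.
acc_multilabel = Accuracy(is_multilabel=True)
acc_multilabel.update((torch.randint(0, 2, size=(8, 4)).long(),
                       torch.randint(0, 2, size=(8, 4)).long()))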
Example #27
0
def fit_model(model, train_loader, test_loader, lr, max_epochs=5):

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    criterion = torch.nn.BCEWithLogitsLoss()

    def threshold_output_transform(output):
        y_pred, y = output
        y_pred = torch.heaviside(y_pred, values=torch.zeros(1))
        # print(f'y_pred size : {y_pred.size()}')
        # print(f'y size : {y.size()}')
        return y_pred, y

    def prepare_batch(batch, device, non_blocking):
        x, y = batch
        x = x.float()
        y = y.float()
        y = torch.unsqueeze(y, 1)
        return (x, y)

    def squeeze_y_dims(output):
        # Pass-through output transform: returns prediction and target unchanged
        prediction, target = output
        # print(f'prediction size: {prediction.size()}')
        # print(f'target size: {target.size()}')
        return prediction, target

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch=prepare_batch)

    val_metrics = {
        "accuracy": Accuracy(threshold_output_transform),
        "bce": Loss(criterion, output_transform=squeeze_y_dims)
        # "precision" : Precision(threshold_output_transform, average=False),
        # "recall": Recall(threshold_output_transform, average=False)
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=val_metrics,
                                            prepare_batch=prepare_batch)

    @trainer.on(Events.ITERATION_COMPLETED(every=10))
    def log_training_loss(trainer):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        print(
            f"Training Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f}"
        )

    # @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        # print(f"Training Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f} Avg precision : {metrics['precision']:.2f} Avg recall: {metrics['recall']:.2f}")
        print(
            f"Training Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f}"
        )

    @trainer.on(Events.EPOCH_COMPLETED(every=10))
    def log_validation_results(trainer):
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        # print(f"Validation Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f} Avg precision : {metrics['precision']:.2f} Avg recall: {metrics['recall']:.2f}")
        print(
            f"Validation Results - Epoch: {trainer.state.epoch}  Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['bce']:.2f}"
        )

    trainer.run(train_loader, max_epochs=max_epochs)

    return model
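A usage sketch for fit_model on a toy binary-classification problem; the data and model below are illustrative and not part of the original example:

import torch
from torch.utils.data import DataLoader, TensorDataset

# Toy separable data: label is 1 when the feature sum is positive.
X = torch.randn(256, 20)
y = (X.sum(dim=1) > 0).long()
loader = DataLoader(TensorDataset(X, y), batch_size=32, shuffle=True)

model = torch.nn.Linear(20, 1)  # single logit, matching BCEWithLogitsLoss
model = fit_model(model, loader, loader, lr=0.1, max_epochs=20)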
Example #28
0
def test_binary_wrong_inputs():
    acc = Accuracy()

    with pytest.raises(
            ValueError,
            match=r"For binary cases, y must be comprised of 0's and 1's"):
        # y has not only 0 or 1 values
        acc.update((torch.randint(0, 2, size=(10,)).long(),
                    torch.arange(0, 10).long()))

    with pytest.raises(
            ValueError,
            match=r"For binary cases, y_pred must be comprised of 0's and 1's"
    ):
        # y_pred values are not thresholded to 0, 1 values
        acc.update((
            torch.rand(10, ),
            torch.randint(0, 2, size=(10, )).long(),
        ))

    with pytest.raises(ValueError, match=r"y must have shape of "):
        # incompatible shapes
        acc.update((torch.randint(0, 2, size=(10, )).long(),
                    torch.randint(0, 2, size=(10, 5)).long()))

    with pytest.raises(ValueError, match=r"y must have shape of "):
        # incompatible shapes
        acc.update((torch.randint(0, 2, size=(10, 5, 6)).long(),
                    torch.randint(0, 2, size=(10, )).long()))

    with pytest.raises(ValueError, match=r"y must have shape of "):
        # incompatible shapes
        acc.update((torch.randint(0, 2, size=(10, )).long(),
                    torch.randint(0, 2, size=(10, 5, 6)).long()))
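A companion sketch of an update call that satisfies all the constraints tested above: both tensors hold only 0's and 1's and their shapes match.

import torch
from ignite.metrics import Accuracy

acc = Accuracy()
y_pred = torch.randint(0, 2, size=(10,)).long()  # already thresholded to 0/1
y = torch.randint(0, 2, size=(10,)).long()
acc.update((y_pred, y))
print(acc.compute())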
Example #29
0
        create_dataloader,
        imdb_dataset(directory='../data/', train=True, test=True))

    model = Classifier(
        WordRNN(256,
                embeddings,
                bidirectional=True,
                merge_bi='cat',
                packed_sequence=True,
                attention=True,
                device=DEVICE), 512, 3)

    optimizer = Adam([p for p in model.parameters() if p.requires_grad],
                     lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    metrics = {'accuracy': Accuracy(), 'loss': Loss(criterion)}
    trainer = SequentialTrainer(
        model,
        optimizer,
        checkpoint_dir='../checkpoints' if not DEBUG else None,
        metrics=metrics,
        non_blocking=True,
        retain_graph=True,
        patience=5,
        loss_fn=criterion,
        device=DEVICE)

    if DEBUG:
        log.info('Starting end to end test')
        print('--------------------------------------------------------------')
        trainer.fit_debug(train_loader, dev_loader)
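The metrics dict built above is not tied to SequentialTrainer; as a sketch (with a toy model standing in for the Classifier/WordRNN stack), the same dict attaches to a plain ignite evaluator:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from ignite.engine import create_supervised_evaluator
from ignite.metrics import Accuracy, Loss

toy_model = nn.Linear(8, 3)
criterion = nn.CrossEntropyLoss()
metrics = {'accuracy': Accuracy(), 'loss': Loss(criterion)}
evaluator = create_supervised_evaluator(toy_model, metrics=metrics)

X, y = torch.randn(64, 8), torch.randint(0, 3, size=(64,))
state = evaluator.run(DataLoader(TensorDataset(X, y), batch_size=16))
print(state.metrics)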
Example #30
0
def _test_distrib_on_metric(device):
    import torch.distributed as dist

    rank = dist.get_rank()
    n_iters = 10
    n_epochs = 3
    batch_size = 10
    n_classes = 10

    data = list(range(n_iters))
    np.random.seed(12)
    all_y_true_batch_values = np.random.randint(0,
                                                n_classes,
                                                size=(dist.get_world_size(),
                                                      n_epochs * n_iters,
                                                      batch_size))
    all_y_pred_batch_values = np.random.rand(dist.get_world_size(),
                                             n_epochs * n_iters, batch_size,
                                             n_classes)

    y_true_batch_values = iter(all_y_true_batch_values[rank, ...])
    y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...])

    def update_fn(engine, batch):
        y_true_batch = next(y_true_batch_values)
        y_pred_batch = next(y_pred_batch_values)
        return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)

    trainer = Engine(update_fn)
    alpha = 0.98

    acc_metric = RunningAverage(
        Accuracy(output_transform=lambda x: [x[0], x[1]], device=device),
        alpha=alpha,
        epoch_bound=False,
    )
    acc_metric.attach(trainer, "running_avg_accuracy")

    running_avg_acc = [
        None,
    ]
    true_acc_metric = Accuracy(device=device)

    @trainer.on(Events.ITERATION_COMPLETED)
    def manual_running_avg_acc(engine):
        i = engine.state.iteration - 1

        true_acc_metric.reset()
        for j in range(dist.get_world_size()):
            output = (
                torch.from_numpy(all_y_pred_batch_values[j, i, :, :]),
                torch.from_numpy(all_y_true_batch_values[j, i, :]),
            )
            true_acc_metric.update(output)

        batch_acc = true_acc_metric._num_correct * 1.0 / true_acc_metric._num_examples

        if running_avg_acc[0] is None:
            running_avg_acc[0] = batch_acc
        else:
            running_avg_acc[0] = running_avg_acc[0] * alpha + (
                1.0 - alpha) * batch_acc
        engine.state.running_avg_acc = running_avg_acc[0]

    @trainer.on(Events.ITERATION_COMPLETED)
    def assert_equal_running_avg_acc_values(engine):
        assert (engine.state.running_avg_acc == engine.state.
                metrics["running_avg_accuracy"]), "{} vs {}".format(
                    engine.state.running_avg_acc,
                    engine.state.metrics["running_avg_accuracy"])

    trainer.run(data, max_epochs=3)
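A _test_distrib_* check like the one above needs an initialized process group before it runs. A minimal launcher sketch, assuming a gloo backend and an arbitrary free port, that exercises it on two CPU processes:

import os

import torch.distributed as dist
import torch.multiprocessing as mp


def _run(rank, world_size):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    _test_distrib_on_metric(device="cpu")
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(_run, args=(2,), nprocs=2)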