def run_training(model, optimizer, scheduler, output_path,
                 train_loader, val_loader, epochs, patience,
                 epochs_pretrain, mixed_precision, classes_weights):

    # trainer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if classes_weights is not None:
        classes_weights = classes_weights.to(device)
    crit = nn.CrossEntropyLoss(weight=classes_weights)
    metrics = {"accuracy": Accuracy(), "loss": Loss(crit)}
    trainer = create_supervised_trainer_with_pretraining(
        model, optimizer, crit, device=device, epochs_pretrain=epochs_pretrain,
        mixed_precision=mixed_precision)
    train_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)

    # Out paths
    path_ckpt = os.path.join(output_path, "model_ckpt")
    log_dir = os.path.join(output_path, "log_dir")
    os.makedirs(log_dir, exist_ok=True)

    # tensorboard
    tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_logger.attach(train_evaluator, log_handler=OutputHandler(tag="training", metric_names=[
        "accuracy", "loss"], another_engine=trainer), event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(val_evaluator, log_handler=OutputHandler(tag="validation", metric_names=[
        "accuracy", "loss"], another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

    # training progress
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names="all")

    # @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        train_evaluator.run(train_loader)
        val_evaluator.run(val_loader)
        train_loss = train_evaluator.state.metrics["loss"]
        val_loss = val_evaluator.state.metrics["loss"]
        train_acc = train_evaluator.state.metrics["accuracy"]
        val_acc = val_evaluator.state.metrics["accuracy"]
        pbar.log_message(
            "Training Results - Epoch: {}  Loss: {:.6f}  Accuracy: {:.6f}".format(engine.state.epoch, train_loss, train_acc))
        pbar.log_message(
            "Validation Results - Epoch: {}  Loss: {:.6f}  Accuracy: {:.6f}".format(engine.state.epoch, val_loss, val_acc))

        pbar.n = pbar.last_print_n = 0

    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results)

    # def get_val_loss(engine):
    # 	return -engine.state.metrics['loss']
    def get_val_acc(engine):
        return engine.state.metrics['accuracy']

    # checkpoint and early stopping
    checkpointer = ModelCheckpoint(
        path_ckpt, "model", score_function=get_val_acc, score_name="accuracy", require_empty=False)
    early_stopper = EarlyStopping(patience, get_val_acc, trainer)

    to_save = {'optimizer': optimizer, 'model': model}
    if scheduler is not None:
        to_save["scheduler"] = scheduler
    val_evaluator.add_event_handler(Events.COMPLETED, checkpointer, to_save)
    val_evaluator.add_event_handler(Events.COMPLETED, early_stopper)
    if scheduler is not None:
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # free resources
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())
    train_evaluator.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())
    val_evaluator.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())

    trainer.run(train_loader, max_epochs=epochs)
    tb_logger.close()

    # Evaluation with best model
    model.load_state_dict(torch.load(
        glob.glob(os.path.join(path_ckpt, "*.pth"))[0])["model"])
    train_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)

    train_evaluator.run(train_loader)
    val_evaluator.run(val_loader)

    _pretty_print("Evaluating best model")
    pbar.log_message(
        "Best model on training set - Loss: {:.6f}  Accuracy: {:.6f}"
        .format(train_evaluator.state.metrics["loss"], train_evaluator.state.metrics["accuracy"]))
    pbar.log_message(
        "Best model on validation set - Loss: {:.6f}  Accuracy: {:.6f}"
        .format(val_evaluator.state.metrics["loss"], val_evaluator.state.metrics["accuracy"]))

    return model, train_evaluator.state.metrics, val_evaluator.state.metrics
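
# `_empty_cache` is called above but not defined in this snippet; a minimal
# sketch (an assumption, not the original helper) that releases cached CUDA
# memory between iterations:
def _empty_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()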
Example #2
def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
    """function to be run by idist.Parallel context manager."""

    # ----------------------
    # make a certain seed
    # ----------------------
    rank = idist.get_rank()
    manual_seed(config.seed + rank)

    # -----------------------
    # create output folder
    # -----------------------

    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        name = f"{config.model}-backend-{idist.backend()}-{now}"
        path = Path(config.output_dir, name)
        path.mkdir(parents=True, exist_ok=True)
        config.output_dir = path.as_posix()

    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))

    # -----------------------------
    # datasets and dataloaders
    # -----------------------------
    # TODO : PLEASE provide your custom datasets and dataloaders configurations
    # we can use `idist.auto_dataloader` to handle distributed configurations
    # TODO : PLEASE replace `kwargs` with your desirable DataLoader arguments
    # See : https://pytorch.org/ignite/distributed.html#ignite.distributed.auto.auto_dataloader

    train_dataset, eval_dataset = get_datasets(path=config.data_path)

    train_dataloader = idist.auto_dataloader(
        train_dataset,
        batch_size=config.train_batch_size,
        num_workers=config.num_workers,
        shuffle=True,
        {% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
        {% endif %}
    )
    eval_dataloader = idist.auto_dataloader(
        eval_dataset,
        batch_size=config.eval_batch_size,
        num_workers=config.num_workers,
        shuffle=False,
        {% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
        {% endif %}
    )
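
    # `idist.auto_dataloader` returns an ordinary DataLoader in a single-process
    # run; when launched in a distributed configuration it injects a
    # DistributedSampler and adapts batch size / num_workers to the number of
    # processes.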

    # ------------------------------------------
    # model, optimizer, loss function, device
    # ------------------------------------------

    device = idist.device()
    config.num_iters_per_epoch = len(train_dataloader)
    model, optimizer, loss_fn, lr_scheduler = initialize(config=config)

    # -----------------------------
    # trainer and evaluator
    # -----------------------------

    trainer, evaluator = create_trainers(
        config=config,
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        device=device,
    )

    # ---------------------------------
    # attach metrics to evaluator
    # ---------------------------------
    accuracy = Accuracy(device=device)
    metrics = {
        "eval_accuracy": accuracy,
        "eval_loss": Loss(loss_fn, device=device),
        "eval_error": (1.0 - accuracy) * 100,
    }
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # -------------------------------------------
    # setup engines logger with python logging
    # print training configurations
    # -------------------------------------------

    logger = setup_logging(config)
    log_basic_info(logger, config)
    trainer.logger = logger
    evaluator.logger = logger

    # -------------------------------------
    # ignite handlers and ignite loggers
    # -------------------------------------

    to_save = {"model": model, "optimizer": optimizer, "trainer": trainer, "lr_scheduler": lr_scheduler}
    best_model_handler, es_handler, timer_handler = get_handlers(
        config=config,
        model=model,
        trainer=trainer,
        evaluator=evaluator,
        metric_name="eval_accuracy",
        es_metric_name="eval_accuracy",
        to_save=to_save,
        lr_scheduler=lr_scheduler,
        output_names=None,
    )

    # setup ignite logger only on rank 0
    if rank == 0:
        logger_handler = get_logger(
            config=config, trainer=trainer, evaluator=evaluator, optimizers=optimizer
        )

    # -----------------------------------
    # resume from the saved checkpoints
    # -----------------------------------

    if config.resume_from:
        resume_from(to_load=to_save, checkpoint_fp=config.resume_from)

    # --------------------------------
    # print metrics to the stderr
    # with `add_event_handler` API
    # for training stats
    # --------------------------------

    trainer.add_event_handler(Events.ITERATION_COMPLETED(every=config.log_every_iters), log_metrics, tag="train")

    # ---------------------------------------------
    # run evaluation at every training epoch end
    # with shortcut `on` decorator API and
    # print metrics to the stderr
    # again with `add_event_handler` API
    # for evaluation stats
    # ---------------------------------------------

    @trainer.on(Events.EPOCH_COMPLETED(every=1))
    def _():
        evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length)
        log_metrics(evaluator, "eval")

    # --------------------------------------------------
    # let's try run evaluation first as a sanity check
    # --------------------------------------------------

    @trainer.on(Events.STARTED)
    def _():
        evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length)

    # ------------------------------------------
    # setup if done. let's run the training
    # ------------------------------------------

    trainer.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.train_epoch_length)

    # ------------------------------------------------------------
    # close the logger after the training completed / terminated
    # ------------------------------------------------------------

    if rank == 0:
        from ignite.contrib.handlers.wandb_logger import WandBLogger

        if isinstance(logger_handler, WandBLogger):
            # why handle differently for wandb ?
            # See : https://github.com/pytorch/ignite/issues/1894
            logger_handler.finish()
        elif logger_handler:
            logger_handler.close()

    # -----------------------------------------
    # where is my best and last checkpoint ?
    # -----------------------------------------

    if best_model_handler is not None:
        logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
Example #3
trainer = Engine(process_function)
train_evaluator = Engine(eval_function)
validator_evaluator = Engine(eval_function)

RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')


def thresholded_output_transform(output):
    y_pred, y = output
    y_pred = torch.round(y_pred)
    return y_pred, y


Accuracy(output_transform=thresholded_output_transform).attach(train_evaluator, 'accuracy')
Loss(loss_function).attach(train_evaluator, 'loss_train')  # binary cross entropy

Accuracy(output_transform=thresholded_output_transform).attach(validator_evaluator, 'accuracy')
Loss(loss_function).attach(validator_evaluator, 'loss_val')

pbar = ProgressBar(persist=True, bar_format="")
pbar.attach(trainer, ['loss'])


@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
    train_evaluator.run(train_iter)
    metrics = train_evaluator.state.metrics
    avg_accuracy = metrics['accuracy']
    avg_loss = metrics['loss_train']
    pbar.log_message(
        "Training Results - Epoch: {}  Avg accuracy: {:.2f}  Avg loss: {:.2f}"
        .format(engine.state.epoch, avg_accuracy, avg_loss))
Example #4
def run(*options, cfg=None, debug=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options loaded from default.py will be overridden by those loaded from cfg file
        Options passed in via options argument will override those loaded from cfg file
    
    Args:
        *options (str, int, optional): Options used to override what is loaded from the
                                    config. To see what options are available consult
                                    default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
        debug (bool): Places scripts in debug/test mode and only executes a few iterations
    """

    update_config(config, options=options, config_file=cfg)

    # we will write the model under outputs / config_file_name / model_dir
    config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)
    epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"

    # Setup Augmentations
    basic_aug = Compose(
        [
            Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=config.TRAIN.MAX,),
            PadIfNeeded(
                min_height=config.TRAIN.PATCH_SIZE,
                min_width=config.TRAIN.PATCH_SIZE,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=mask_value,
                value=0,
            ),
            Resize(
                config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True,
            ),
            PadIfNeeded(
                min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
                min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=mask_value,
                value=0,
            ),
        ]
    )
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    PenobscotDataset = get_patch_dataset(config)

    train_set = PenobscotDataset(
        config.DATASET.ROOT,
        config.TRAIN.PATCH_SIZE,
        config.TRAIN.STRIDE,
        split="train",
        transforms=train_aug,
        n_channels=config.MODEL.IN_CHANNELS,
        complete_patches_only=config.TRAIN.COMPLETE_PATCHES_ONLY,
    )

    val_set = PenobscotDataset(
        config.DATASET.ROOT,
        config.TRAIN.PATCH_SIZE,
        config.TRAIN.STRIDE,
        split="val",
        transforms=val_aug,
        n_channels=config.MODEL.IN_CHANNELS,
        complete_patches_only=config.VALIDATION.COMPLETE_PATCHES_ONLY,
    )
    logger.info(train_set)
    logger.info(val_set)
    n_classes = train_set.n_classes

    train_loader = data.DataLoader(
        train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True,
    )

    if debug:
        val_set = data.Subset(val_set, range(3))

    val_loader = data.DataLoader(val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS)

    model = getattr(models, config.MODEL.NAME).get_seg_model(config)

    model = model.to(device)  # Send to GPU

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    try:
        output_dir = generate_path(
            config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),
        )
    except TypeError:
        output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),)

    summary_writer = create_summary_writer(log_dir=path.join(output_dir, config.LOG_DIR))
    snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader)
    scheduler = CosineAnnealingScheduler(
        optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration
    )

    # weights are inversely proportional to the frequency of the classes in
    # the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False)

    criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=mask_value, reduction="mean")

    trainer = create_supervised_trainer(model, optimizer, criterion, _prepare_batch, device=device)

    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.TRAIN.BATCH_SIZE_PER_GPU),
    )
    trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer))
    trainer.add_event_handler(
        Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"),
    )
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer),
    )

    def _select_pred_and_mask(model_out_dict):
        return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze())

    evaluator = create_supervised_evaluator(
        model,
        _prepare_batch,
        metrics={
            "pixacc": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask),
            "nll": Loss(criterion, output_transform=_select_pred_and_mask),
            "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask),
            "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask),
            "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask),
            "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask),
        },
        device=device,
    )

    # Set the validation run to start on the epoch completion of the training run
    trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader))

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        logging_handlers.log_metrics(
            "Validation results",
            metrics_dict={
                "nll": "Avg loss :",
                "pixacc": "Pixelwise Accuracy :",
                "mca": "Avg Class Accuracy :",
                "mIoU": "Avg Class IoU :",
            },
        ),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        tensorboard_handlers.log_metrics(
            summary_writer,
            trainer,
            "epoch",
            metrics_dict={
                "mIoU": "Validation/mIoU",
                "nll": "Validation/Loss",
                "mca": "Validation/MCA",
                "pixacc": "Validation/Pixel_Acc",
            },
        ),
    )

    def _select_max(pred_tensor):
        return pred_tensor.max(1)[1]

    def _tensor_to_numpy(pred_tensor):
        return pred_tensor.squeeze().cpu().numpy()

    transform_func = compose(np_to_tb, decode_segmap, _tensor_to_numpy,)

    transform_pred = compose(transform_func, _select_max)

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Image", "image"),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer, "Validation/Mask", "mask", transform_func=transform_func),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer, "Validation/Pred", "y_pred", transform_func=transform_pred),
    )

    def snapshot_function():
        return (trainer.state.iteration % snapshot_duration) == 0

    checkpoint_handler = SnapshotHandler(output_dir, config.MODEL.NAME, extract_metric_from("mIoU"), snapshot_function,)
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model})

    logger.info("Starting training")
    if debug:
        trainer.run(
            train_loader,
            max_epochs=config.TRAIN.END_EPOCH,
            epoch_length=config.TRAIN.BATCH_SIZE_PER_GPU,
            seed=config.SEED,
        )
    else:
        trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)
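
# Minimal sketch (an assumption; the real handler is imported elsewhere) of the
# `Evaluator` callable attached to EPOCH_COMPLETED above: it simply runs the
# ignite evaluator over the validation loader at the end of every epoch.
class Evaluator:
    def __init__(self, evaluator, loader):
        self.evaluator = evaluator
        self.loader = loader

    def __call__(self, engine):
        self.evaluator.run(self.loader)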
Example #5
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = 'cpu'

    if torch.cuda.is_available():
        device = 'cuda'

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    # define a trainer
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    # define an evaluator
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)
    # Print
    desc = "ITERATION - loss: {:.2f}"  # the loss of each iteration while training
    pbar = tqdm(
        initial=0, leave=False, total=len(train_loader), desc=desc.format(
            0))  # Progress of the current iteration in the entire epoch

    @trainer.on(Events.ITERATION_COMPLETED)  # call this function when an iteration is completed
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1

        if iter % log_interval == 0:
            pbar.desc = desc.format(
                engine.state.output)  # update the training loss
            pbar.update(log_interval)  # update the progress bar

    @trainer.on(Events.EPOCH_COMPLETED)  # call this function when an epoch is completed
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)  # call this function when an epoch is completed
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

        pbar.n = pbar.last_print_n = 0

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
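
# `get_data_loaders` is not shown in these snippets; a minimal MNIST-style
# sketch (an assumption) that matches the signature used above:
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

def get_data_loaders(train_batch_size, val_batch_size):
    tfm = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.1307,), (0.3081,))])
    train_loader = DataLoader(datasets.MNIST(".", train=True, download=True, transform=tfm),
                              batch_size=train_batch_size, shuffle=True)
    val_loader = DataLoader(datasets.MNIST(".", train=False, download=True, transform=tfm),
                            batch_size=val_batch_size, shuffle=False)
    return train_loader, val_loader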
Example #6
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = FushionNet()
    #model=torch.load(SAVE_PATH+"350-0.908.pth")

    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    optimizer = optim.SGD(model.parameters(),
                          lr=lr,
                          momentum=momentum,
                          weight_decay=2e-6,
                          nesterov=False)
    #optimizer = optim.Adamax(model.parameters(),lr,(0.9,0.999),1e-8,1e-6)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               [50, 100, 150, 200, 250, 300],
                                               0.1)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1

        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        current_lr = optimizer.param_groups[0]['lr']
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f} Current lr: {:.6f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll, current_lr))
        scheduler.step()

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        pbar.n = pbar.last_print_n = 0
        if (engine.state.epoch % 10 == 0):
            torch.save(
                model, SAVE_PATH + str(engine.state.epoch) + "-" +
                str(avg_accuracy) + ".pth")

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
Example #7
def run(train_batch_size, val_batch_size, epochs, learning_rate, weight_decay,
        log_interval, log_dir):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    test_loader = get_test_loader(val_batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("Pytorch Version:", torch.__version__)
    print('device={}'.format(device))

    model = CP_MixedNet()
    writer = create_summary_writer(model, train_loader, log_dir)
    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)
    evaluator_val = create_supervised_evaluator(model,
                                                metrics={
                                                    'accuracy': Accuracy(),
                                                    'nll': Loss(F.nll_loss)
                                                },
                                                device=device)

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1

        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)
            writer.add_scalar("training/loss", engine.state.output,
                              engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_test_results(engine):
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Test Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}".
            format(engine.state.epoch, avg_accuracy, avg_nll))

        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("test/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("test/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    handler = EarlyStopping(patience=400,
                            score_function=score_function,
                            trainer=trainer)
    evaluator_val.add_event_handler(Events.COMPLETED, handler)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):

        evaluator_val.run(val_loader)
        metrics = evaluator_val.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("val/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("val/avg_accuracy", avg_accuracy, engine.state.epoch)

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
    writer.close()

    save_model = True
    if (save_model):
        torch.save(model.state_dict(), "weights_BCI.pt")
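
# Reloading the weights saved above (a sketch; CP_MixedNet constructor arguments
# are assumed to match the training run):
#   model = CP_MixedNet()
#   model.load_state_dict(torch.load("weights_BCI.pt", map_location="cpu"))
#   model.eval()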
Example #8
def main(dataset_path, batch_size=256, max_epochs=10):
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"

    train_loader, test_loader, eval_train_loader = get_train_eval_loaders(
        dataset_path, batch_size=batch_size)

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)

    scaler = GradScaler()

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device, non_blocking=True)
        y = convert_tensor(batch[1], device, non_blocking=True)

        optimizer.zero_grad()

        # Runs the forward pass with autocasting.
        with autocast():
            y_pred = model(x)
            loss = criterion(y_pred, y)

        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same precision that autocast used for corresponding forward ops.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

        return loss.item()

    trainer = Engine(train_step)
    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(
        trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)

    def log_metrics(engine, title):
        for name in metrics:
            print(f"\t{title} {name}: {engine.state.metrics[name]:.2f}")

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        print(f"- Mean elapsed time for 1 epoch: {timer.value()}")
        print("- Metrics:")
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics,
                                         "Train"):
            evaluator.run(eval_train_loader)

        with evaluator.add_event_handler(Events.COMPLETED, log_metrics,
                                         "Test"):
            evaluator.run(test_loader)

    trainer.run(train_loader, max_epochs=max_epochs)
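
    # Note: add_event_handler returns a RemovableEventHandle, so the
    # `with evaluator.add_event_handler(...)` blocks above attach `log_metrics`
    # only for the enclosed evaluator.run(...) call and detach it afterwards.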
Example #9
def run(tb, vb, lr, epochs, writer):
    device = os.environ['main-device']
    logging.info('Training program start!')
    logging.info('Configuration:')
    logging.info('\n' + json.dumps(INFO, indent=2))

    # ------------------------------------
    # 1. Define dataloader
    train_loader, train4val_loader, val_loader, num_of_images, mapping, _ = get_dataloaders(
        tb, vb)
    # train_loader, train4val_loader, val_loader, num_of_images = get_dataloaders(tb, vb)
    # Adjust weights of unknown
    num_of_images[6] += int(sum(num_of_images) / len(num_of_images))
    weights = (1 / num_of_images) / ((1 / num_of_images).sum().item())
    # weights = (1/num_of_images)/(1/num_of_images + 1/(num_of_images.sum().item()-num_of_images))
    weights = weights.to(device=device)

    # ------------------------------------
    # 2. Define model
    model = EfficientNet.from_pretrained(
        'efficientnet-b0', num_classes=INFO['dataset-info']['num-of-classes'])
    model = carrier(model)

    # ------------------------------------
    # 3. Define optimizer
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
    ignite_scheduler = LRScheduler(scheduler)

    # ------------------------------------
    # 4. Define metrics

    class SoftCrossEntropyLoss(nn.Module):
        def __init__(self, weight=None):
            super(SoftCrossEntropyLoss, self).__init__()
            self.class_weights = weight

        def forward(self, input, target):
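            # Note (added comment): exponentiating and then taking the log can
            # under/overflow; torch.nn.functional.log_softmax(input, dim=1) is
            # the numerically stable equivalent of log(softmax(input)).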
            softmax = torch.exp(input) / torch.exp(input).sum(1)[:, None]
            onehot_labels = to_onehot(target, input.shape[1])
            soft_labels = torch.zeros_like(onehot_labels)
            soft_labels = torch.where(
                onehot_labels.cpu() == 1, torch.tensor([0.9]),
                torch.tensor([0.1 / (input.shape[1] - 1)])).to(device=device)
            if self.class_weights is not None:
                # print(soft_labels.shape, softmax.shape)
                loss = -torch.sum(
                    torch.log(softmax) * soft_labels * self.class_weights *
                    input.shape[1])
            else:
                loss = -torch.sum(torch.log(softmax) * soft_labels)
            return loss

    class EntropyPrediction(metric.Metric):
        def __init__(self, threshold=1.0):
            super(EntropyPrediction, self).__init__()
            self.threshold = threshold
            self.prediction = torch.tensor([], dtype=torch.int)
            self.y = torch.tensor([], dtype=torch.int)

        def reset(self):
            # self.threshold = 0.3
            self.prediction = torch.tensor([])
            self.y = torch.tensor([])
            super(EntropyPrediction, self).reset()

        def update(self, output):
            y_pred, y = output
            softmax = torch.exp(y_pred) / torch.exp(y_pred).sum(1)[:, None]
            entropy_base = math.log(y_pred.shape[1])
            entropy = (-softmax * torch.log(softmax)).sum(1) / entropy_base
            values, inds = softmax.max(1)
            prediction = torch.where(entropy > self.threshold, inds,
                                     torch.tensor([-1]).to(device=device))
            self.prediction = torch.cat(
                (self.prediction.type(torch.LongTensor).to(device=device),
                 torch.tensor([mapping[x.item()]
                               for x in prediction]).to(device=device)))
            self.y = torch.cat(
                (self.y.type(torch.LongTensor).to(device=device),
                 y.to(device=device)))
            # return self.prediction, self.y

        def compute(self):
            return self.prediction, self.y

    train_metrics = {
        'accuracy':
        Accuracy(),
        'loss':
        Loss(nn.CrossEntropyLoss(weight=weights)),
        'precision_recall':
        MetricsLambda(PrecisionRecallTable, Precision(), Recall(),
                      train_loader.dataset.classes),
        'cmatrix':
        MetricsLambda(CMatrixTable,
                      ConfusionMatrix(INFO['dataset-info']['num-of-classes']),
                      train_loader.dataset.classes)
    }

    val_metrics = {
        'accuracy':
        MetricsLambda(Labels2Acc, EntropyPrediction()),
        'precision_recall':
        MetricsLambda(Labels2PrecisionRecall, EntropyPrediction(),
                      val_loader.dataset.classes),
        'cmatrix':
        MetricsLambda(Labels2CMatrix, EntropyPrediction(),
                      val_loader.dataset.classes)
    }

    # ------------------------------------
    # 5. Create trainer
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        nn.CrossEntropyLoss(weight=weights),
                                        device=device)

    # ------------------------------------
    # 6. Create evaluator
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=train_metrics,
                                                  device=device)
    val_evaluator = create_supervised_evaluator(model,
                                                metrics=val_metrics,
                                                device=device)

    desc = 'ITERATION - loss: {:.4f}'
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    # ------------------------------------
    # 7. Create event hooks

    # Update process bar on each iteration completed.
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        log_interval = 1
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_STARTED)
    def refresh_pbar(engine):
        pbar.refresh()
        pbar.n = pbar.last_print_n = 0

    # Compute metrics on train data on each epoch completed.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        print('Checking on training set.')
        train_evaluator.run(train4val_loader)
        metrics = train_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
      Training Results - Epoch: {}
      Avg accuracy: {:.4f}
      Avg loss: {:.4f}
      precision_recall: \n{}
      confusion matrix: \n{}
      """.format(engine.state.epoch, avg_accuracy, avg_loss,
                 precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Train Acc': avg_accuracy},
                           engine.state.epoch)
        writer.add_scalars('Aggregate/Loss', {'Train Loss': avg_loss},
                           engine.state.epoch)

    # Compute metrics on val data on each epoch completed.
    cpe = CustomPeriodicEvent(n_epochs=50)
    cpe.attach(trainer)

    @trainer.on(cpe.Events.EPOCHS_50_COMPLETED)
    def log_validation_results(engine):
        pbar.clear()
        print('* - * - * - * - * - * - * - * - * - * - * - * - *')
        print('Checking on validation set.')
        val_evaluator.run(val_loader)
        metrics = val_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
      Validating Results - Epoch: {}
      Avg accuracy: {:.4f}
      precision_recall: \n{}
      confusion matrix: \n{}
      """.format(engine.state.epoch, avg_accuracy, precision_recall['pretty'],
                 cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Val Acc': avg_accuracy},
                           engine.state.epoch)
        writer.add_scalars(
            'Aggregate/Score', {
                'Val avg precision': precision_recall['data'][0, -1],
                'Val avg recall': precision_recall['data'][1, -1]
            }, engine.state.epoch)

    # Save the model every N epochs.
    save_model_handler = ModelCheckpoint(os.environ['savedir'],
                                         '',
                                         save_interval=10,
                                         n_saved=2)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, save_model_handler,
                              {'model': model})

    # Update learning-rate due to scheduler.
    trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler)

    # ------------------------------------
    # Run
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
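
    # Note: CustomPeriodicEvent is deprecated in recent ignite releases; the
    # same every-50-epochs cadence can be written as
    # @trainer.on(Events.EPOCH_COMPLETED(every=50)).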
Example #10
def main(batch_size, epochs):

    # 1. Device setup (in PyTorch the device must be specified explicitly)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)

    train_loader, test_loader = get_data_loaders(batch_size)

    # 2. Build the model
    #    model = net.CNN(num_classes=num_classes).to(device)
    model = net.Net(1000, 10).to(device)
    print(model)  # print the network architecture for inspection

    # 3. Define the loss function
    criterion = nn.CrossEntropyLoss()

    # 4. Define the optimizer (Adam is chosen here as an example)
    #    optimizer = optim.Adam(model.parameters(), lr=0.001)
    optimizer = optim.Adam(model.parameters())
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics={
                                                      'accuracy': Accuracy(),
                                                      'loss': Loss(criterion)
                                                  },
                                                  device=device)
    test_evaluator = create_supervised_evaluator(model,
                                                 metrics={
                                                     'accuracy': Accuracy(),
                                                     'loss': Loss(criterion)
                                                 },
                                                 device=device)

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    log_interval = 10

    # 5. Logging
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        i = (engine.state.iteration - 1) % len(train_loader) + 1
        if i % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        train_evaluator.run(train_loader)
        metrics = train_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.3f} Avg loss: {:.4f}"
            .format(engine.state.epoch, avg_accuracy, avg_loss))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        test_evaluator.run(test_loader)
        metrics = test_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.3f} Avg loss: {:.4f}"
            .format(engine.state.epoch, avg_accuracy, avg_loss))
        pbar.n = pbar.last_print_n = 0

    def score_function(engine):
        val_loss = engine.state.metrics['loss']
        return -val_loss

    # 6. Checkpoint setup
    best_handler = ModelCheckpoint(dirname='./checkpoints',
                                   filename_prefix='best',
                                   n_saved=3,
                                   score_name='loss',
                                   score_function=score_function,
                                   create_dir=True,
                                   require_empty=False)
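    # ModelCheckpoint (with a score_function) and EarlyStopping both treat a
    # larger score as better, which is why score_function returns -val_loss.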
    test_evaluator.add_event_handler(Events.EPOCH_COMPLETED, best_handler,
                                     {'mymodel': model})

    early_handler = EarlyStopping(patience=5,
                                  score_function=score_function,
                                  trainer=trainer)
    # Note: the handler is attached to an *Evaluator* (runs one epoch on validation dataset)
    test_evaluator.add_event_handler(Events.COMPLETED, early_handler)

    # 7. Run training
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
Example #11
def train_with_ignite(networks, dataset, data_dir, batch_size, img_size,
                      epochs, lr, momentum, num_workers, optimizer, logger):

    from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
    from ignite.metrics import Loss
    from utils.metrics import MultiThresholdMeasures, Accuracy, IoU, F1score

    # device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # build model
    model = get_network(networks)

    # log model summary
    input_size = (3, img_size, img_size)
    summarize_model(model.to(device), input_size, logger, batch_size, device)

    # build loss
    loss = torch.nn.BCEWithLogitsLoss()

    # build optimizer and scheduler
    model_optimizer = get_optimizer(optimizer, model, lr, momentum)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer)

    # transforms on both image and mask
    train_joint_transforms = jnt_trnsf.Compose([
        jnt_trnsf.RandomCrop(img_size),
        jnt_trnsf.RandomRotate(5),
        jnt_trnsf.RandomHorizontallyFlip()
    ])

    # transforms only on images
    train_image_transforms = std_trnsf.Compose([
        std_trnsf.ColorJitter(0.05, 0.05, 0.05, 0.05),
        std_trnsf.ToTensor(),
        std_trnsf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    test_joint_transforms = jnt_trnsf.Compose([jnt_trnsf.Safe32Padding()])

    test_image_transforms = std_trnsf.Compose([
        std_trnsf.ToTensor(),
        std_trnsf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # transforms only on mask
    mask_transforms = std_trnsf.Compose([std_trnsf.ToTensor()])

    # build train / test loader
    train_loader = get_loader(dataset=dataset,
                              data_dir=data_dir,
                              train=True,
                              joint_transforms=train_joint_transforms,
                              image_transforms=train_image_transforms,
                              mask_transforms=mask_transforms,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=num_workers)

    test_loader = get_loader(dataset=dataset,
                             data_dir=data_dir,
                             train=False,
                             joint_transforms=test_joint_transforms,
                             image_transforms=test_image_transforms,
                             mask_transforms=mask_transforms,
                             batch_size=1,
                             shuffle=False,
                             num_workers=num_workers)

    # build trainer / evaluator with ignite
    trainer = create_supervised_trainer(model,
                                        model_optimizer,
                                        loss,
                                        device=device)
    measure = MultiThresholdMeasures()
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                '': measure,
                                                'pix-acc': Accuracy(measure),
                                                'iou': IoU(measure),
                                                'loss': Loss(loss),
                                                'f1': F1score(measure),
                                            },
                                            device=device)

    # initialize state variable for checkpoint
    state = update_state(model.state_dict(), 0, 0, 0, 0, 0)

    # make ckpt path
    ckpt_root = './ckpt/'
    filename = '{network}_{optimizer}_lr_{lr}_epoch_{epoch}.pth'
    ckpt_path = os.path.join(ckpt_root, filename)

    # execution after every training iteration
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        num_iter = (trainer.state.iteration - 1) % len(train_loader) + 1
        if num_iter % 20 == 0:
            logger.info("Epoch[{}] Iter[{:03d}] Loss: {:.2f}".format(
                trainer.state.epoch, num_iter, trainer.state.output))

    # execution after every training epoch
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        # evaluate on training set
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        logger.info(
            "Training Results - Epoch: {} Avg-loss: {:.3f}\n Pix-acc: {}\n IOU: {}\n F1: {}\n"
            .format(trainer.state.epoch, metrics['loss'],
                    str(metrics['pix-acc']), str(metrics['iou']),
                    str(metrics['f1'])))

        # update state
        update_state(weight=model.state_dict(),
                     train_loss=metrics['loss'],
                     val_loss=state['val_loss'],
                     val_pix_acc=state['val_pix_acc'],
                     val_iou=state['val_iou'],
                     val_f1=state['val_f1'])

    # execution after every epoch
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        # evaluate test(validation) set
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        logger.info(
            "Validation Results - Epoch: {} Avg-loss: {:.3f}\n Pix-acc: {}\n IOU: {}\n F1: {}\n"
            .format(trainer.state.epoch, metrics['loss'],
                    str(metrics['pix-acc']), str(metrics['iou']),
                    str(metrics['f1'])))

        # update scheduler
        lr_scheduler.step(metrics['loss'])

        # update and save state
        update_state(weight=model.state_dict(),
                     train_loss=state['train_loss'],
                     val_loss=metrics['loss'],
                     val_pix_acc=metrics['pix-acc'],
                     val_iou=metrics['iou'],
                     val_f1=metrics['f1'])

        path = ckpt_path.format(network=networks,
                                optimizer=optimizer,
                                lr=lr,
                                epoch=trainer.state.epoch)
        save_ckpt_file(path, state)

    trainer.run(train_loader, max_epochs=epochs)
Example #12
    def log_training_acc(engine):
        metrics = engine.state.metrics
        sim_acc = metrics['sim_acc']
        clsf_acc = metrics['clsf_acc']
        print("Epoch[{}] sim_acc: {:.2f}; clsf_acc {:.2f}".format(
            engine.state.epoch, sim_acc, clsf_acc))

    from ignite.engine import create_supervised_evaluator
    from ignite.metrics import Loss
    from utils import extract_embeddings
    from trainer.metrics import SiameseNetSimilarityAccuracy as SimilarityAccuracy
    siamese_evaluator = create_supervised_evaluator(
        siamese_net, device=device, non_blocking=pin_memory, metrics={
            # not a good approach
            'accuracy': SimilarityAccuracy(margin, l2_normalize=True),
            'loss': Loss(con_loss_fn)
        })
    pbar = ProgressBar()
    pbar.attach(siamese_evaluator)
    clsf_evaluator = create_supervised_evaluator(
        clsf_net, device=device, non_blocking=pin_memory, metrics={
            'accuracy': Accuracy(),
            'loss': Loss(CrossEntropyLoss())
        })

    @engine.on(Events.EPOCH_COMPLETED)
    def run_validation(engine):
        # loader_kwargs = {
        #     'pin_memory': True,
        #     'num_workers': 4,
        #     'batch_size': 100,
Example #13
def run(
    train_batch_size,
    val_batch_size,
    epochs,
    lr,
    momentum,
    log_interval,
    log_dir,
    checkpoint_every,
    resume_from,
    crash_iteration=1000,
):

    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    writer = SummaryWriter(logdir=log_dir)
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    criterion = nn.NLLLoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                "accuracy": Accuracy(),
                                                "nll": Loss(criterion)
                                            },
                                            device=device)

    @trainer.on(Events.EPOCH_COMPLETED)
    def lr_step(engine):
        lr_scheduler.step()

    desc = "ITERATION - loss: {:.4f} - lr: {:.4f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0, lr))

    if log_interval is None:
        e = Events.ITERATION_COMPLETED
        log_interval = 1
    else:
        e = Events.ITERATION_COMPLETED(every=log_interval)

    @trainer.on(e)
    def log_training_loss(engine):
        lr = optimizer.param_groups[0]["lr"]
        pbar.desc = desc.format(engine.state.output, lr)
        pbar.update(log_interval)
        writer.add_scalar("training/loss", engine.state.output,
                          engine.state.iteration)
        writer.add_scalar("lr", lr, engine.state.iteration)

    if resume_from is None:

        @trainer.on(Events.ITERATION_COMPLETED(once=crash_iteration))
        def _(engine):
            raise Exception("STOP at {}".format(engine.state.iteration))

    else:

        @trainer.on(Events.STARTED)
        def _(engine):
            pbar.n = engine.state.iteration

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("valdation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("valdation/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler
    }
    training_checkpoint = Checkpoint(to_save=objects_to_checkpoint,
                                     save_handler=DiskSaver(
                                         log_dir, require_empty=False))

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=checkpoint_every),
        training_checkpoint)

    if resume_from is not None:
        tqdm.write("Resume from a checkpoint: {}".format(resume_from))
        checkpoint = torch.load(resume_from)
        Checkpoint.load_objects(to_load=objects_to_checkpoint,
                                checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=epochs)
    except Exception:
        import traceback

        print(traceback.format_exc())

    pbar.close()
    writer.close()
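
# A minimal usage sketch (an assumption, not part of the original example): the first
# call crashes at `crash_iteration`; the second resumes from the checkpoint that
# DiskSaver wrote into log_dir (the exact filename, e.g. "checkpoint_1000.pt", depends
# on `checkpoint_every` and on the iteration reached before the crash).
if __name__ == "__main__":
    run(train_batch_size=64, val_batch_size=1000, epochs=10, lr=0.01, momentum=0.5,
        log_interval=10, log_dir="/tmp/mnist_logs", checkpoint_every=500,
        resume_from=None, crash_iteration=1000)
    run(train_batch_size=64, val_batch_size=1000, epochs=10, lr=0.01, momentum=0.5,
        log_interval=10, log_dir="/tmp/mnist_logs", checkpoint_every=500,
        resume_from="/tmp/mnist_logs/checkpoint_1000.pt")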
Example #14
    def _train(
        self,
        train_data,
        val_data,
        test_data,
        writer,
        experiment,
        dry_run: bool = False,
    ) -> None:

        use_cuda = torch.cuda.is_available()

        # Preprocess all datasets.
        logger.info("Preprocessing datasets...")
        train_loader = self._preprocess_for_training("train", train_data, use_cuda)
        val_loader = self._preprocess_for_training("val", val_data, use_cuda)
        test_loader = self._preprocess_for_training("test", test_data, use_cuda)
        logger.info("")

        # Set up model and move it to device.
        logger.info("Creating model...")
        self._create_model(self.num_classes)
        device = torch.device("cuda" if use_cuda else "cpu")
        logger.info(f"    device: {device}")
        self.model = self.model.to(device)

        # Set up optimizer and loss.
        optimizer = self._create_optimizer()
        loss_func = nn.CrossEntropyLoss()
        logger.info(f"    loss function: cross-entropy")
        logger.info("")

        # Dedicate a few images that will be plotted as samples to tensorboard.
        num_samples_to_plot = self.config.get("num_samples_to_plot", 5)

        def get_samples(loader):
            if loader is None:
                return None, None
            else:
                return next(
                    iter(DataLoader(loader.dataset, batch_size=num_samples_to_plot))
                )

        train_sample_images, train_sample_labels = get_samples(train_loader)
        val_sample_images, val_sample_labels = get_samples(val_loader)
        test_sample_images, test_sample_labels = get_samples(test_loader)

        # Configure trainer and metrics.
        accumulate_train_metrics = self.config.get("accumulate_train_metrics", True)

        # We need to transform the output of the trainer and metrics here to accumulate
        # metrics during training (otherwise, we have to re-evaluate on the complete
        # train set which takes a long time). By default, the trainer outputs
        # `loss.item()` and the metrics expect `y_pred, y` (which is what the evaluator
        # outputs). We are now outputting `y_pred, y, loss` from the trainer and then
        # slicing off the `loss` before it goes into the metric.
        # See also the footnote here but note that it's a bit wrong:
        # https://pytorch.org/ignite/quickstart.html#
        def trainer_output_transform(x, y, y_pred, loss):
            return y_pred, y, loss.item()

        def metrics_output_transform(output):
            return output[:2]  # use only y_pred, y

        trainer = create_supervised_trainer(
            self.model,
            optimizer,
            loss_func,
            device=device,
            output_transform=trainer_output_transform,
        )
        if accumulate_train_metrics:
            # TODO: Maybe put train_metrics and val_metrics into one dict.
            train_metrics = {
                "accuracy": Accuracy(output_transform=metrics_output_transform),
                "loss": Loss(loss_func, output_transform=metrics_output_transform),
                # "confusion_matrix": ConfusionMatrix(num_classes),
            }
            for name, metric in train_metrics.items():
                # Attach metrics to trainer to accumulate them during training.
                metric.attach(trainer, name)
        val_metrics = {
            "accuracy": Accuracy(),
            "loss": Loss(loss_func),
            # "confusion_matrix": ConfusionMatrix(num_classes),
        }
        evaluator = create_supervised_evaluator(
            self.model, metrics=val_metrics, device=device
        )

        @trainer.on(
            Events.ITERATION_COMPLETED(every=self.config.get("print_every", 100))
        )
        def log_batch(trainer):
            batch = (trainer.state.iteration - 1) % trainer.state.epoch_length + 1
            logger.info(
                f"Epoch {trainer.state.epoch} / {num_epochs}, "
                f"batch {batch} / {trainer.state.epoch_length}: "
                f"Loss: {trainer.state.output[2]:.3f}"
                # f"Loss: {trainer.state.output:.3f}"
            )

        def log_results(name, metrics, epoch):
            """Log results of an epoch to stdout, tensorboard and comet."""
            logger.info(
                f"{name}: Average loss: {metrics['loss']:.3f}, "
                f"Average accuracy: {metrics['accuracy']:.3f}"
            )
            experiment.log_metric(f"{name}_loss", metrics["loss"])
            experiment.log_metric(f"{name}_accuracy", metrics["accuracy"])
            writer.add_scalar(f"{name}_loss", metrics["loss"], epoch)
            writer.add_scalar(f"{name}_accuracy", metrics["accuracy"], epoch)

        # TODO: This iterates over complete train set again, maybe accumulate as in the
        #   example in the footnote here: https://pytorch.org/ignite/quickstart.html#
        @trainer.on(Events.EPOCH_COMPLETED)
        def log_epoch(trainer):
            logger.info("")
            logger.info(f"Epoch {trainer.state.epoch} / {num_epochs} results: ")

            # Train data.
            if accumulate_train_metrics:
                log_results("train", trainer.state.metrics, trainer.state.epoch)
                logger.info("(train metrics are accumulated during training; "
                            "to re-evaluate on the complete train set after training, "
                            "use config parameter 'accumulate_train_metrics': False)")
            else:
                evaluator.run(train_loader)
                log_results("train", evaluator.state.metrics, trainer.state.epoch)

            # Val data.
            if val_loader:
                evaluator.run(val_loader)
                log_results("val", evaluator.state.metrics, trainer.state.epoch)

            # Test data.
            if test_loader:
                evaluator.run(test_loader)
                log_results("test", evaluator.state.metrics, trainer.state.epoch)

            logger.info("")

        @trainer.on(Events.EPOCH_COMPLETED)
        def checkpoint_model(trainer):
            # TODO: Do not checkpoint at every step.
            checkpoint_dir = (
                self.out_dir / "checkpoints" / f"epoch{trainer.state.epoch}"
            )
            checkpoint_dir.mkdir(parents=True, exist_ok=True)
            torch.save(self.model, checkpoint_dir / "model.pt")

        @trainer.on(Events.EPOCH_COMPLETED)
        def plot_samples(trainer):
            """Plot a few sample images and probabilites to tensorboard."""

            def write_samples_plot(name, sample_images, sample_labels):
                # TODO: This can be improved by just using the outputs already
                #   calculated in evaluator.state.output in the functions above.
                #   Problem: At least in the train evaluator, the batches are not equal,
                #   so the plotted images will differ from run to run.
                if sample_images is None:
                    return

                with torch.no_grad():
                    sample_output = self.model(sample_images.to(device))
                    sample_pred = torch.softmax(sample_output, dim=1)

                visualization.plot_samples(
                    writer,
                    f"{name}-samples",
                    trainer.state.epoch,
                    sample_images.to("cpu").numpy(),
                    sample_labels.to("cpu").numpy(),
                    sample_pred.to("cpu").numpy(),
                )

            write_samples_plot("train", train_sample_images, train_sample_labels)
            write_samples_plot("val", val_sample_images, val_sample_labels)
            write_samples_plot("test", test_sample_images, test_sample_labels)

        # Start training.
        num_epochs = 1 if dry_run else self.config.get("num_epochs", 5)
        if dry_run:
            num_batches = 1
            logger.info(f"Training model on device {device}... (DRY RUN, only 1 batch)")
        elif "num_samples" in self.config:
            # TODO: Make sure batch_size doesn't differ from the value extracted during
            #   preprocessing.
            batch_size = self.config.get("batch_size", 128)
            # TODO: This always uses a few more samples than num_samples. Maybe get it
            #   to the correct value.
            num_batches = int(self.config["num_samples"] / batch_size) + 1
            logger.info(
                f"Training model on device {device}... (using "
                f"{self.config['num_samples']} of {len(train_loader.dataset)} samples)"
            )
        else:
            num_batches = None  # all batches
            logger.info(f"Training model on device {device}...")
            logger.info(
                "(if this takes too long, train on less data with the config "
                "parameter 'num_samples')"
            )
        logger.info("(show more steps by setting the config parameter 'print_every')")
        logger.info("")
        trainer.run(train_loader, max_epochs=num_epochs, epoch_length=num_batches)
        logger.info("Training finished!")

        # Save the trained model.
        torch.save(self.model, self.out_dir / "model.pt")
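
# A self-contained sketch (assumption: toy model and random data, purely illustrative)
# of the metric-accumulation trick used in _train above: the trainer is made to output
# (y_pred, y, loss) so that Accuracy/Loss can be attached to it directly and accumulated
# during training, instead of re-running an evaluator over the whole train set.
def _accumulated_train_metrics_sketch():
    import torch
    from torch import nn
    from torch.utils.data import DataLoader, TensorDataset
    from ignite.engine import create_supervised_trainer
    from ignite.metrics import Accuracy, Loss

    model = nn.Linear(10, 3)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    data = TensorDataset(torch.randn(64, 10), torch.randint(0, 3, (64,)))
    loader = DataLoader(data, batch_size=16)

    trainer = create_supervised_trainer(
        model, optimizer, loss_fn,
        # trainer now outputs (y_pred, y, loss) instead of the default loss.item()
        output_transform=lambda x, y, y_pred, loss: (y_pred, y, loss.item()),
    )
    # the metrics only need (y_pred, y), so slice the loss off before it reaches them
    Accuracy(output_transform=lambda out: out[:2]).attach(trainer, "accuracy")
    Loss(loss_fn, output_transform=lambda out: out[:2]).attach(trainer, "loss")

    trainer.run(loader, max_epochs=1)
    return trainer.state.metrics  # accumulated over the epoch's training batches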
Example #15
def train():
    config_file = "configs/train_daily_dialog_emotion_action_topic_config.json"
    config = Config.from_json_file(config_file)


    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", config.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(config.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(config.device)
    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(input_tensor.to(config.device) for input_tensor in batch)
        lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids)
        loss = (lm_loss * config.lm_coef + mc_loss * config.mc_coef) / config.gradient_accumulation_steps
        if config.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
        if engine.state.iteration % config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(config.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = batch
            #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids,
                                  token_emotion_ids=token_emotion_ids,
                                  token_action_ids=token_action_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]  # So we can also use GPT2 outputs
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if config.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if config.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if config.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
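    # `inference` returns ((lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels));
    # the output_transforms below pick the LM pair for the NLL metric and the MC pair for accuracy.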
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config),
                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if config.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=config.log_dir)
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=config.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if config.local_rank in [-1, 0] and config.n_epochs > 0:
        os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
Example #16
def run(config, plx_experiment):

    set_seed(config['seed'])

    device = "cuda"
    batch_size = config['batch_size']

    train_transforms = [DynamicCrop(32, 32), FlipLR()]
    cutout_size = config['cutout_size']
    if cutout_size is not None:
        train_transforms.append(DynamicCutout(cutout_size, cutout_size))

    train_loader, test_loader = get_fast_train_test_loaders(
        path=config["data_path"],
        batch_size=batch_size,
        num_workers=config['num_workers'],
        device=device,
        train_transforms=train_transforms)

    bn_kwargs = config['bn_kwargs']
    conv_kwargs = config['conv_kwargs']

    model = config["model"](conv_kwargs=conv_kwargs,
                            bn_kwargs=bn_kwargs,
                            final_weight=config['final_weight'])
    model = model.to(device)
    model = model.half()
    model_name = model.__class__.__name__

    criterion = nn.CrossEntropyLoss(reduction='sum').to(device)
    criterion = criterion.half()
    eval_criterion = criterion

    if config["enable_mixup"]:
        criterion = MixupCriterion(criterion)

    weight_decay = config['weight_decay']

    if not config['use_adamw']:
        opt_kwargs = [("lr", 0.0), ("momentum", config['momentum']),
                      ("weight_decay", weight_decay), ("nesterov", True)]
        optimizer_cls = optim.SGD
    else:
        opt_kwargs = [
            ("lr", 0.0),
            ("betas", (0.9, 0.999)),
            ("eps", 1e-08),
        ]
        optimizer_cls = optim.Adam

    optimizer = optimizer_cls([
        # conv + bn
        dict([("params", model.prep.parameters())] + opt_kwargs),
        # conv + bn
        dict([("params", model.layer1[0].parameters())] + opt_kwargs),
        # identity residual block
        dict([("params", model.layer1[-1].conv1.parameters())] + opt_kwargs),
        dict([("params", model.layer1[-1].conv2.parameters())] + opt_kwargs),
        # conv + bn
        dict([("params", model.layer2.parameters())] + opt_kwargs),
        # conv + bn
        dict([("params", model.layer3[0].parameters())] + opt_kwargs),
        # identity residual block
        dict([("params", model.layer3[-1].conv1.parameters())] + opt_kwargs),
        dict([("params", model.layer3[-1].conv2.parameters())] + opt_kwargs),
        # linear
        dict([("params", model.classifier.parameters())] + opt_kwargs),
    ])

    num_iterations_per_epoch = len(train_loader)
    num_iterations = num_iterations_per_epoch * config['num_epochs']
    layerwise_milestones_lr_values = []
    for i in range(len(optimizer.param_groups)):
        key = "lr_param_group_{}".format(i)
        assert key in config, "{} not in config".format(key)
        milestones_values = config[key]
        layerwise_milestones_lr_values.append([
            (int(m * num_iterations_per_epoch), v / batch_size)
            for m, v in milestones_values
        ])

    lr_scheduler = get_layerwise_lr_scheduler(optimizer,
                                              layerwise_milestones_lr_values)

    momentum_scheduler = None
    if config["momentum_scheduling"] is not None:
        milestones_values = config["momentum_scheduling"]
        layerwise_milestones_mtm_values = []
        for i in range(len(optimizer.param_groups)):
            layerwise_milestones_mtm_values.append([
                (int(m * num_iterations_per_epoch), v)
                for m, v in milestones_values
            ])
        momentum_scheduler = get_layerwise_scheduler(
            optimizer,
            param_name="momentum",
            milestones_values=layerwise_milestones_mtm_values)

    def _prepare_batch_fp16(batch, device, non_blocking):
        x, y = batch
        return (convert_tensor(x, device=device,
                               non_blocking=non_blocking).half(),
                convert_tensor(y, device=device,
                               non_blocking=non_blocking).long())

    def process_function(engine, batch):
        x, y = _prepare_batch_fp16(batch, device=device, non_blocking=True)

        if config['enable_mixup']:
            x, y = mixup_data(x, y, config['mixup_alpha'],
                              config['mixup_proba'])

        optimizer.zero_grad()
        y_pred = model(x)

        loss = criterion(y_pred, y)
        loss.backward()

        if config["clip_gradients"] is not None:
            clip_grad_norm_(model.parameters(), config["clip_gradients"])

        if config['use_adamw']:
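            # AdamW-style decoupled weight decay: shrink the parameters directly
            # (scaled by the group's lr and the batch size) instead of adding the
            # decay term to the gradients before the optimizer step.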
            for group in optimizer.param_groups:
                for param in group['params']:
                    param.data.add_(-weight_decay / batch_size * group['lr'])

        optimizer.step()
        loss = loss.item()

        return loss

    trainer = Engine(process_function)

    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(eval_criterion) / len(test_loader)
    }
    evaluator = create_supervised_evaluator(model,
                                            metrics,
                                            prepare_batch=_prepare_batch_fp16,
                                            device=device,
                                            non_blocking=True)

    train_evaluator = create_supervised_evaluator(
        model,
        metrics,
        prepare_batch=_prepare_batch_fp16,
        device=device,
        non_blocking=True)

    total_timer = Timer(average=False)
    train_timer = Timer(average=False)
    test_timer = Timer(average=False)

    table_logger = TableLogger()

    if config["use_tb_logger"]:
        path = "experiments/tb_logs" if "TB_LOGGER_PATH" not in os.environ else os.environ[
            "TB_LOGGER_PATH"]
        tb_logger = SummaryWriter(log_dir=path)

    test_timer.attach(evaluator, start=Events.EPOCH_STARTED)

    @trainer.on(Events.STARTED)
    def on_training_started(engine):
        print("Warming up cudnn on random inputs")
        for _ in range(5):
            for size in [batch_size, len(test_loader.dataset) % batch_size]:
                warmup_cudnn(model, criterion, size, config)

        total_timer.reset()

    @trainer.on(Events.EPOCH_STARTED)
    def on_epoch_started(engine):
        model.train()
        train_timer.reset()

        # Warm-up on small images
        if config['warmup_on_small_images']:
            if engine.state.epoch < config['warmup_duration']:
                train_loader.dataset.transforms[0].h = 20
                train_loader.dataset.transforms[0].w = 20
            elif engine.state.epoch == config['warmup_duration']:
                train_loader.dataset.transforms[0].h = 32
                train_loader.dataset.transforms[0].w = 32

        train_loader.dataset.set_random_choices()

        if config['reduce_cutout']:
            # shrink cutout during epochs 14-15, then remove it entirely at epoch 16
            if 14 <= engine.state.epoch < 16:
                train_loader.dataset.transforms[-1].h -= 1
                train_loader.dataset.transforms[-1].w -= 1
            elif engine.state.epoch == 16:
                train_loader.dataset.transforms.pop()

        if config['enable_mixup'] and config['mixup_max_epochs'] == engine.state.epoch - 1:
            config['mixup_proba'] = 0.0

    if config["use_tb_logger"]:

        @trainer.on(Events.ITERATION_COMPLETED)
        def on_iteration_completed(engine):
            # log learning rate
            param_name = "lr"
            if len(optimizer.param_groups) == 1:
                param = float(optimizer.param_groups[0][param_name])
                tb_logger.add_scalar(param_name, param * batch_size,
                                     engine.state.iteration)
            else:
                for i, param_group in enumerate(optimizer.param_groups):
                    param = float(param_group[param_name])
                    tb_logger.add_scalar(
                        "{}/{}/group_{}".format(param_name, model_name, i),
                        param * batch_size, engine.state.iteration)

            # log training loss
            tb_logger.add_scalar("training/loss_vs_iterations",
                                 engine.state.output / batch_size,
                                 engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def on_epoch_completed(engine):
        trainer.state.train_time = train_timer.value()

        if config["use_tb_logger"]:
            # Log |w|^2 and gradients
            for i, p in enumerate(model.parameters()):
                tb_logger.add_scalar(
                    "w2/{}/{}_{}".format(model_name, i, list(p.data.shape)),
                    torch.norm(p.data), engine.state.epoch)
                tb_logger.add_scalar(
                    "mean_grad/{}/{}_{}".format(model_name, i,
                                                list(p.grad.shape)),
                    torch.mean(p.grad), engine.state.epoch)

        for i, p in enumerate(model.parameters()):
            plx_experiment.log_metrics(
                step=engine.state.epoch,
                **{
                    "w2/{}/{}_{}".format(model_name, i, list(p.data.shape)):
                    torch.norm(p.data).item()
                })

        evaluator.run(test_loader)

    trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
    if momentum_scheduler is not None:
        trainer.add_event_handler(Events.ITERATION_STARTED, momentum_scheduler)

    @evaluator.on(Events.COMPLETED)
    def log_results(engine):
        evaluator.state.test_time = test_timer.value()
        metrics = evaluator.state.metrics
        output = [("epoch", trainer.state.epoch)]
        output += [(key, trainer.state.param_history[key][-1][0] * batch_size)
                   for key in trainer.state.param_history if "lr" in key]
        output += [(key, trainer.state.param_history[key][-1][0])
                   for key in trainer.state.param_history if "lr" not in key]
        output += [("train time", trainer.state.train_time),
                   ("train loss", trainer.state.output / batch_size),
                   ("test time", evaluator.state.test_time),
                   ("test loss", metrics['loss'] / batch_size),
                   ("test acc", metrics['accuracy']),
                   ("total time", total_timer.value())]
        output = OrderedDict(output)
        table_logger.append(output)

        plx_experiment.log_metrics(step=trainer.state.epoch, **output)

        if config["use_tb_logger"]:
            tb_logger.add_scalar("training/total_time", total_timer.value(),
                                 trainer.state.epoch)
            tb_logger.add_scalar("test/loss", metrics['loss'] / batch_size,
                                 trainer.state.epoch)
            tb_logger.add_scalar("test/accuracy", metrics['accuracy'],
                                 trainer.state.epoch)

    @trainer.on(Events.COMPLETED)
    def on_training_completed(engine):
        train_evaluator.run(train_loader)
        metrics = train_evaluator.state.metrics

        if config["use_tb_logger"]:
            tb_logger.add_scalar("training/loss", metrics['loss'] / batch_size,
                                 0)
            tb_logger.add_scalar("training/loss", metrics['loss'] / batch_size,
                                 engine.state.epoch)

            tb_logger.add_scalar("training/accuracy", metrics['accuracy'], 0)
            tb_logger.add_scalar("training/accuracy", metrics['accuracy'],
                                 engine.state.epoch)

        output = {
            "train acc": metrics['accuracy'],
            "train loss": metrics['loss'] / batch_size
        }
        plx_experiment.log_metrics(step=engine.state.epoch, **output)

    trainer.run(train_loader, max_epochs=config['num_epochs'])

    if config["use_tb_logger"]:
        tb_logger.close()
Example #17
        out = self.fc(out)

        return out


model = EfficientNetTwoInputs()


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=10e-4)


metrics = {
    'loss': Loss(criterion),
    'accuracy': Accuracy(),
}

trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
val_evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device)

# EarlyStopping


# handler = EarlyStopping(patience=30, score_function=lambda engine: engine.state.metrics['accuracy'], trainer=trainer)
# val_evaluator.add_event_handler(Events.COMPLETED, handler)
def run(train_batch_size, test_batch_size, epochs, lr, log_interval, log_dir,
        no_cuda, sub_spectrogram_size, sub_spectrogram_mel_hop, n_mel_bins,
        seed, root_dir, train_dir, eval_dir):
    """
	Model runner

	Parameters
	----------
	train_batch_size : int
		Size of the training batch. Default: 16

	test_batch_size : int
		size of the testing batch. Default: 16

	epochs : int
		Number of training epochs. Default: 200

	lr : float
		Learning rate for the ADAM optimizer. Default: 0.001

	log_interval : int
		Interval for logging data: Default: 10

	log_dir : str
		Directory to save the logs

	no_cuda : Bool
		Should you NOT use cuda? Default: False

	sub_spectrogram_size : int
		Size of the SubSpectrogram. Default 20
		
	sub_spectrogram_mel_hop : int
		Mel-bin hop size of the SubSpectrogram. Default 10

	n_mel_bins : int
		Number of mel-bins of the Spectrogram extracted. Default: 40.

	seed : int
		Torch random seed value, for reproducible results. Default: 1

	root_dir : str
		Directory of the folder which contains the dataset (has 'audio' and 'evaluation_setup' folders inside)

	train_dir : str
		Set as default: 'evaluation_setup/train_fold1.txt'

	eval_dir : str
		Set as default: 'evaluation_setup/evaluate_fold1.txt'
	"""

    # check if possible to use CUDA
    use_cuda = not no_cuda and torch.cuda.is_available()

    # set seed
    torch.manual_seed(seed)

    # Map to GPU
    device = torch.device("cuda" if use_cuda else "cpu")

    # Load the data loaders
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                test_batch_size,
                                                sub_spectrogram_size,
                                                sub_spectrogram_mel_hop,
                                                n_mel_bins, use_cuda, root_dir,
                                                train_dir, eval_dir)

    # Get the model
    model = SubSpectralNet(sub_spectrogram_size, sub_spectrogram_mel_hop,
                           n_mel_bins, use_cuda).to(device)

    # Init the TensorBoard summary writer
    writer = create_summary_writer(model, train_loader, log_dir)

    # Init the optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Use GPU if possible
    if device:
        model.to(device)

    def update_model(engine, batch):
        """Prepare batch for training: pass to a device with options.

		"""
        model.train()
        optimizer.zero_grad()

        inputs, label = prepare_batch(batch, device=device)
        output = model(inputs)
        losses = []
        for ite in range(output.shape[1]):
            losses.append(F.nll_loss(output[:, ite, :], label))
        loss = sum(losses)
        loss.backward()
        optimizer.step()
        return losses, output

    # get the trainer module
    trainer = Engine(update_model)

    def evaluate(engine, batch):
        """Prepare batch for training: pass to a device with options.
		"""
        model.eval()
        with torch.no_grad():
            inputs, label = prepare_batch(batch, device=device)
            output = model(inputs)
            losses = []
            for ite in range(output.shape[1]):
                losses.append(
                    F.nll_loss(output[:, ite, :], label,
                               reduction='sum').item())
        return losses, output, label

    # get the evaluator module
    evaluator = Engine(evaluate)

    # define output transforms for multiple outputs.
    def output_transform1(output):
        # `output` variable is returned by above `process_function`
        losses, correct, label = output
        return correct[:, 0, :], label

    metric = Accuracy(output_transform=output_transform1)
    metric.attach(evaluator, "acc_highband")
    metric = Loss(F.nll_loss, output_transform=output_transform1)
    metric.attach(evaluator, "loss_highband")

    def output_transform2(output):
        # `output` variable is returned by above `process_function`
        losses, correct, label = output
        return correct[:, 1, :], label

    metric = Accuracy(output_transform=output_transform2)
    metric.attach(evaluator, "acc_midband")
    metric = Loss(F.nll_loss, output_transform=output_transform2)
    metric.attach(evaluator, "loss_midband")

    def output_transform3(output):
        # `output` variable is returned by above `process_function`
        losses, correct, label = output
        return correct[:, 2, :], label

    metric = Accuracy(output_transform=output_transform3)
    metric.attach(evaluator, "acc_lowband")
    metric = Loss(F.nll_loss, output_transform=output_transform3)
    metric.attach(evaluator, "loss_lowband")

    def output_transform(output):
        # `output` variable is returned by above `process_function`
        losses, correct, label = output
        return correct[:, 3, :], label

    metric = Accuracy(output_transform=output_transform)
    metric.attach(evaluator, "acc_globalclassifier")
    metric = Loss(F.nll_loss, output_transform=output_transform)
    metric.attach(evaluator, "loss_globalclassifier")

    # Log the events in Ignite: EVERY ITERATION
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            losses, output = engine.state.output
            epoch = engine.state.epoch
            print(
                'Train Epoch: {} [{}/{}]\tLosses: {:.6f} (Top Band), {:.6f} (Mid Band), {:.6f} (Low Band), {:.6f} (Global Classifier)'
                .format(epoch, iter, len(train_loader), losses[0].item(),
                        losses[1].item(), losses[2].item(), losses[3].item()))
            # TensorBoard Logs
            writer.add_scalar("training/loss_topband_itr", losses[0].item(),
                              engine.state.iteration)
            writer.add_scalar("training/loss_midband_itr", losses[1].item(),
                              engine.state.iteration)
            writer.add_scalar("training/loss_lowband_itr", losses[2].item(),
                              engine.state.iteration)
            writer.add_scalar("training/loss_global_itr", losses[3].item(),
                              engine.state.iteration)

    # Log the events in Ignite: Test the training data on EVERY EPOCH
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        print(
            "Training Results - Epoch: {}  Global accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch,
                    evaluator.state.metrics['acc_globalclassifier'],
                    evaluator.state.metrics['loss_globalclassifier']))
        # TensorBoard Logs
        writer.add_scalar("training/global_loss",
                          evaluator.state.metrics['loss_globalclassifier'],
                          engine.state.epoch)
        writer.add_scalar("training/lowband_loss",
                          evaluator.state.metrics['loss_lowband'],
                          engine.state.epoch)
        writer.add_scalar("training/midband_loss",
                          evaluator.state.metrics['loss_midband'],
                          engine.state.epoch)
        writer.add_scalar("training/highband_loss",
                          evaluator.state.metrics['loss_highband'],
                          engine.state.epoch)
        writer.add_scalar("training/global_acc",
                          evaluator.state.metrics['acc_globalclassifier'],
                          engine.state.epoch)
        writer.add_scalar("training/lowband_acc",
                          evaluator.state.metrics['acc_lowband'],
                          engine.state.epoch)
        writer.add_scalar("training/midband_acc",
                          evaluator.state.metrics['acc_midband'],
                          engine.state.epoch)
        writer.add_scalar("training/highband_acc",
                          evaluator.state.metrics['acc_highband'],
                          engine.state.epoch)

    # Log the events in Ignite: Test the validation data on EVERY EPOCH
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        print(
            "Validation Results - Epoch: {}  Global accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch,
                    evaluator.state.metrics['acc_globalclassifier'],
                    evaluator.state.metrics['loss_globalclassifier']))
        # TensorBoard Logs
        writer.add_scalar("validation/global_loss",
                          evaluator.state.metrics['loss_globalclassifier'],
                          engine.state.epoch)
        writer.add_scalar("validation/lowband_loss",
                          evaluator.state.metrics['loss_lowband'],
                          engine.state.epoch)
        writer.add_scalar("validation/midband_loss",
                          evaluator.state.metrics['loss_midband'],
                          engine.state.epoch)
        writer.add_scalar("validation/highband_loss",
                          evaluator.state.metrics['loss_highband'],
                          engine.state.epoch)
        writer.add_scalar("validation/global_acc",
                          evaluator.state.metrics['acc_globalclassifier'],
                          engine.state.epoch)
        writer.add_scalar("validation/lowband_acc",
                          evaluator.state.metrics['acc_lowband'],
                          engine.state.epoch)
        writer.add_scalar("validation/midband_acc",
                          evaluator.state.metrics['acc_midband'],
                          engine.state.epoch)
        writer.add_scalar("validation/highband_acc",
                          evaluator.state.metrics['acc_highband'],
                          engine.state.epoch)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    # close the writer
    writer.close()

    # return the model
    return model
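
# A compact alternative (a sketch, not part of the original example) to the four
# near-identical output_transform functions above: attach one Accuracy/Loss pair per
# classifier head in a loop, capturing the head index with a default argument.
def attach_per_band_metrics(evaluator,
                            bands=("highband", "midband", "lowband", "globalclassifier")):
    import torch.nn.functional as F
    from ignite.metrics import Accuracy, Loss

    for idx, band in enumerate(bands):
        # evaluator output is (losses, output, label); pick head `idx` of the outputs
        transform = lambda out, i=idx: (out[1][:, i, :], out[2])
        Accuracy(output_transform=transform).attach(evaluator, "acc_{}".format(band))
        Loss(F.nll_loss, output_transform=transform).attach(evaluator, "loss_{}".format(band))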
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="gpt2", help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=16, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation")
    # parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Accumulate gradients on several steps")
    # parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")

    parser.add_argument(
        "--init_model",
        default="model/pytorch_kogpt2_676e9bcfa7.params",
        type=str,
        help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
    )


    args = parser.parse_args()
    
    

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    
    config = GPT2Config(vocab_size=50000)
    model = GPT2DoubleHeadsModel(config)
    if args.init_model:
        print("Load model from ", args.init_model)
        model.load_state_dict(torch.load(args.init_model), strict=False)

    model.to(args.device)
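    # NOTE: `tokenizer` is assumed to be created earlier in the original module (e.g. a
    # KoGPT2 tokenizer matching the 50000-token vocab above); it is not defined in this snippet.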
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        (lm_loss), (mc_loss), *_ = model(
            input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
            mc_labels=mc_labels, lm_labels=lm_labels
        )
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we dont send labels to model, it doesnt return losses
            lm_logits, mc_logits, *_ = model(
                input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
            )
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.init_model)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        # tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
Example #20
def train(epochs: int, model: nn.Module, train_loader: DataLoader,
          valid_loader: DataLoader, criterion: Callable, device: str,
          lr: float, patience: int, lr_decay: float, lr_scheduler: str,
          lr_scheduler_kwargs: Dict[str, Any]):

    model.to(torch.device(device))
    optimizer = optim.Adam(
        [param for param in model.parameters() if param.requires_grad], lr=lr)

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)

    scheduler = LRScheduler(
        getattr(optim.lr_scheduler, lr_scheduler)(optimizer,
                                                  **lr_scheduler_kwargs))
    trainer.add_event_handler(Events.ITERATION_COMPLETED, scheduler)

    pbar = ProgressBar(False)
    pbar.attach(trainer)

    train_evaluator = create_supervised_evaluator(
        model,
        metrics={
            'ACC': Accuracy(discreted_output_transform),
            'BCE': Loss(criterion),
            'AP': AveragePrecision(probability_output_transform)
        },
        device=device)
    valid_evaluator = create_supervised_evaluator(
        model,
        metrics={
            'ACC': Accuracy(discreted_output_transform),
            'BCE': Loss(criterion),
            'AP': AveragePrecision(probability_output_transform)
        },
        device=device)

    history = {
        col: list()
        for col in [
            'epoch', 'elapsed time', 'iterations', 'lr', 'train BCE',
            'valid BCE', 'train ACC', 'valid ACC', 'train AP', 'valid AP'
        ]
    }

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        train_evaluator.run(train_loader)

        history['train BCE'] += [train_evaluator.state.metrics['BCE']]
        history['train ACC'] += [train_evaluator.state.metrics['ACC']]
        history['train AP'] += [train_evaluator.state.metrics['AP']]

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        valid_evaluator.run(valid_loader)

        history['epoch'] += [valid_evaluator.state.epoch]
        history['iterations'] += [valid_evaluator.state.epoch_length]
        history['elapsed time'] += [
            0 if len(history['elapsed time']) == 0 else
            history['elapsed time'][-1] +
            valid_evaluator.state.times['COMPLETED']
        ]
        history['lr'] += [scheduler.get_param()]

        history['valid BCE'] += [valid_evaluator.state.metrics['BCE']]
        history['valid ACC'] += [valid_evaluator.state.metrics['ACC']]
        history['valid AP'] += [valid_evaluator.state.metrics['AP']]

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_progress_bar(engine):
        pbar.log_message(
            f"train BCE: {history['train BCE'][-1]:.2f} " \
            + f"train ACC: {history['train ACC'][-1]:.2f} " \
            + f"train AP: {history['train AP'][-1]:.2f} " \
            + f"valid BCE: {history['valid BCE'][-1]:.2f} " \
            + f"valid ACC: {history['valid ACC'][-1]:.2f} " \
            + f"valid AP: {history['valid AP'][-1]:.2f}"
        )

    # Early stopping
    handler = EarlyStopping(patience=patience,
                            score_function=score_function,
                            trainer=trainer)
    valid_evaluator.add_event_handler(Events.EPOCH_COMPLETED, handler)

    trainer.run(train_loader, max_epochs=epochs)
    return pd.DataFrame(history)
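
# `score_function` is defined elsewhere in the original module; a typical choice for this
# setup (an assumption, not the original definition) returns the negative validation BCE
# so that EarlyStopping treats a lower loss as an improvement, e.g.:
#
#     def score_function(engine):
#         return -engine.state.metrics['BCE']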
Example #21
def run(args):
    train_loader, val_loader = get_data_loaders(args.dataset_dir,
                                                args.batch_size,
                                                args.val_batch_size,
                                                args.num_workers)

    if args.seed is not None:
        torch.manual_seed(args.seed)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    num_classes = KITTI.num_classes()
    model = LiLaNet(num_classes)

    device_count = torch.cuda.device_count()
    if device_count > 1:
        print("Using %d GPU(s)" % device_count)
        model = nn.DataParallel(model)
        args.batch_size = device_count * args.batch_size
        args.val_batch_size = device_count * args.val_batch_size

    model = model.to(device)

    criterion = nn.CrossEntropyLoss(weight=KITTI.class_weights()).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    if args.resume:
        if os.path.isfile(args.resume):
            print("Loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("Loaded checkpoint '{}' (Epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("No checkpoint found at '{}'".format(args.resume))

    def _prepare_batch(batch, non_blocking=True):
        distance, reflectivity, target = batch

        return (convert_tensor(distance,
                               device=device,
                               non_blocking=non_blocking),
                convert_tensor(reflectivity,
                               device=device,
                               non_blocking=non_blocking),
                convert_tensor(target,
                               device=device,
                               non_blocking=non_blocking))

    def _update(engine, batch):
        model.train()
        optimizer.zero_grad()
        distance, reflectivity, target = _prepare_batch(batch)
        pred = model(distance, reflectivity)
        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()

        return loss.item()

    trainer = Engine(_update)

    # attach running average metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

    # attach progress bar
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=['loss'])

    def _inference(engine, batch):
        model.eval()
        with torch.no_grad():
            distance, reflectivity, target = _prepare_batch(batch)
            pred = model(distance, reflectivity)

            return pred, target

    evaluator = Engine(_inference)
    cm = ConfusionMatrix(num_classes)
    IoU(cm, ignore_index=0).attach(evaluator, 'IoU')
    Loss(criterion).attach(evaluator, 'loss')

    pbar2 = ProgressBar(persist=True, desc='Eval Epoch')
    pbar2.attach(evaluator)

    def _global_step_transform(engine, event_name):
        if trainer.state is not None:
            return trainer.state.iteration
        else:
            return 1

    tb_logger = TensorboardLogger(args.log_dir)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag='training',
                                               metric_names=['loss']),
                     event_name=Events.ITERATION_COMPLETED)

    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(
                         tag='validation',
                         metric_names=['loss', 'IoU'],
                         global_step_transform=_global_step_transform),
                     event_name=Events.EPOCH_COMPLETED)

    @trainer.on(Events.STARTED)
    def initialize(engine):
        engine.state.exception_raised = False
        if args.resume:
            engine.state.epoch = args.start_epoch

    @evaluator.on(Events.EPOCH_COMPLETED)
    def save_checkpoint(engine):
        epoch = trainer.state.epoch if trainer.state is not None else 1
        iou = engine.state.metrics['IoU'] * 100.0
        mean_iou = iou.mean()

        name = 'epoch{}_mIoU={:.1f}.pth'.format(epoch, mean_iou)
        file = {
            'model': model.state_dict(),
            'epoch': epoch,
            'optimizer': optimizer.state_dict(),
            'args': args
        }

        save(file, args.output_dir, 'checkpoint_{}'.format(name))
        save(model.state_dict(), args.output_dir, 'model_{}'.format(name))

    @trainer.on(Events.EPOCH_COMPLETED)
    def run_validation(engine):
        pbar.log_message("Start Validation - Epoch: [{}/{}]".format(
            engine.state.epoch, engine.state.max_epochs))
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        loss = metrics['loss']
        iou = metrics['IoU'] * 100.0
        mean_iou = iou.mean()

        iou_text = ', '.join([
            '{}: {:.1f}'.format(KITTI.classes[i + 1].name, v)
            for i, v in enumerate(iou.tolist())
        ])
        pbar.log_message(
            "Validation results - Epoch: [{}/{}]: Loss: {:.2e}\n IoU: {}\n mIoU: {:.1f}"
            .format(engine.state.epoch, engine.state.max_epochs, loss,
                    iou_text, mean_iou))

    @trainer.on(Events.EXCEPTION_RAISED)
    def handle_exception(engine, e):
        engine.state.exception_raised = True
        if isinstance(e, KeyboardInterrupt) and (engine.state.iteration > 1):
            engine.terminate()
            warnings.warn("KeyboardInterrupt caught. Exiting gracefully.")

            name = 'epoch{}_exception.pth'.format(trainer.state.epoch)
            file = {
                'model': model.state_dict(),
                'epoch': trainer.state.epoch,
                'optimizer': optimizer.state_dict()
            }

            save(file, args.output_dir, 'checkpoint_{}'.format(name))
            save(model.state_dict(), args.output_dir, 'model_{}'.format(name))
        else:
            raise e

    if args.eval_on_start:
        print("Start validation")
        evaluator.run(val_loader, max_epochs=1)

    print("Start training")
    trainer.run(train_loader, max_epochs=args.epochs)
    tb_logger.close()
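
run(args) expects a namespace carrying the hyper-parameters accessed above. A minimal, hypothetical launcher (argument names are inferred from the attribute accesses in the snippet; defaults are illustrative):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='LiLaNet training on KITTI')
    parser.add_argument('--dataset-dir', default='data/kitti')
    parser.add_argument('--batch-size', type=int, default=8)
    parser.add_argument('--val-batch-size', type=int, default=8)
    parser.add_argument('--num-workers', type=int, default=4)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--resume', default='')
    parser.add_argument('--start-epoch', type=int, default=0)
    parser.add_argument('--log-dir', default='logs')
    parser.add_argument('--output-dir', default='checkpoints')
    parser.add_argument('--eval-on-start', action='store_true')
    run(parser.parse_args())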
Example #22
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    clearml_logger = ClearMLLogger(project_name="examples", task_name="ignite")

    clearml_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    for tag, evaluator in [("training metrics", train_evaluator),
                           ("validation metrics", validation_evaluator)]:
        clearml_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    clearml_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    clearml_logger.attach(
        trainer,
        log_handler=WeightsScalarHandler(model, whitelist=["fc1"]),
        event_name=Events.ITERATION_COMPLETED(every=100),
    )

    def is_conv(n, _):
        return "conv" in n

    clearml_logger.attach(
        trainer,
        log_handler=WeightsHistHandler(model, whitelist=is_conv),
        event_name=Events.ITERATION_COMPLETED(every=100),
    )

    clearml_logger.attach(trainer,
                          log_handler=GradsScalarHandler(model),
                          event_name=Events.ITERATION_COMPLETED(every=100))

    clearml_logger.attach(
        trainer,
        log_handler=GradsHistHandler(model, whitelist=["fc2.weight"]),
        event_name=Events.ITERATION_COMPLETED(every=100),
    )

    handler = Checkpoint(
        {"model": model},
        ClearMLSaver(),
        n_saved=1,
        score_function=lambda e: e.state.metrics["accuracy"],
        score_name="val_acc",
        filename_prefix="best",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.EPOCH_COMPLETED, handler)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    clearml_logger.close()
Example #23
#test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=True)

# Initialize the GPU device
gpu = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Use a pretrained ResNet model for transfer learning
model = torchvision.models.resnet50(pretrained=True)
model.to(gpu)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

trainer = create_supervised_trainer(model, optimizer, criterion, device=gpu)
metrics = {
    "accuracy": Accuracy(),
    "loss": Loss(criterion)
}
train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=gpu)
val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=gpu)
training_history = {"accuracy":[], "loss":[]}
validation_history = {"accuracy":[], "loss":[]}
last_epoch = []

# RunningAverage metrics
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")

# EarlyStopping Callbacks
handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
val_evaluator.add_event_handler(Events.COMPLETED, handler)

# Create custom functions
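
The fragment above breaks off where its custom callbacks would be defined, and the EarlyStopping handler references a score_function that is not shown. A hedged sketch of what those helpers might look like (val_loader is assumed to have been created earlier in the truncated example):

def score_function(engine):
    # EarlyStopping treats higher scores as better, so return validation accuracy.
    return engine.state.metrics['accuracy']

@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(trainer):
    # Run validation and record the metrics for later inspection.
    val_evaluator.run(val_loader)
    metrics = val_evaluator.state.metrics
    validation_history['accuracy'].append(metrics['accuracy'])
    validation_history['loss'].append(metrics['loss'])
    last_epoch.append(trainer.state.epoch)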
Example #24
def main(dataset_path, batch_size=256, max_epochs=10, opt="O1"):
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"

    train_loader, test_loader, eval_train_loader = get_train_eval_loaders(
        dataset_path, batch_size=batch_size)

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)

    model, optimizer = amp.initialize(model, optimizer, opt_level=opt)

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device, non_blocking=True)
        y = convert_tensor(batch[1], device, non_blocking=True)

        optimizer.zero_grad()

        y_pred = model(x)
        loss = criterion(y_pred, y)

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        optimizer.step()

        return loss.item()

    trainer = Engine(train_step)
    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(
        trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)

    def log_metrics(engine, title):
        for name in metrics:
            print(f"\t{title} {name}: {engine.state.metrics[name]:.2f}")

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        print(f"- Mean elapsed time for 1 epoch: {timer.value()}")
        print("- Metrics:")
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics,
                                         "Train"):
            evaluator.run(eval_train_loader)

        with evaluator.add_event_handler(Events.COMPLETED, log_metrics,
                                         "Test"):
            evaluator.run(test_loader)

    trainer.run(train_loader, max_epochs=max_epochs)
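
A detail worth noting above: Engine.add_event_handler returns a RemovableEventHandle, which is why it can be used as a context manager; log_metrics is attached only for the duration of each with block, so it is tagged "Train" for the first run and "Test" for the second. The same handle can also be removed explicitly, as in this sketch:

handle = evaluator.add_event_handler(Events.COMPLETED, log_metrics, "Test")
evaluator.run(test_loader)
handle.remove()  # detach the handler once it is no longer needed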
Example #25
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    vd_logger = VisdomLogger(env="mnist_training")

    vd_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    for tag, evaluator in [("training", train_evaluator),
                           ("validation", validation_evaluator)]:
        vd_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    vd_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    vd_logger.attach(trainer,
                     log_handler=WeightsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    vd_logger.attach(trainer,
                     log_handler=GradsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    def score_function(engine):
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        log_dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    vd_logger.close()
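
VisdomLogger expects a Visdom server to be reachable (by default at localhost:8097). A hedged launch sketch with illustrative parameter values:

# In a separate shell, start the server first:
#   python -m visdom.server
run(train_batch_size=64, val_batch_size=1000, epochs=10,
    lr=0.01, momentum=0.5, log_dir='/tmp/mnist_visdom')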
Example #26
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval, log_dir):
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    writer = SummaryWriter(log_dir=log_dir)

    # Use TPU device
    device = xm.xla_device()

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.NLLLoss()

    # Create trainer and evaluator
    trainer = create_supervised_trainer(
        model, optimizer, criterion, device=device, output_transform=lambda x, y, y_pred, loss: [loss.item(),]
    )

    val_metrics = {"accuracy": Accuracy(), "nll": Loss(criterion)}
    evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device)

    tracker = xm.RateTracker()

    # Add RateTracker as an output of the training step
    @trainer.on(Events.ITERATION_COMPLETED)
    def add_rate_tracker(engine):
        tracker.add(len(engine.state.batch))
        engine.state.output.append(tracker.global_rate())

    # Setup output values of the training step as EMA metrics
    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "batch_loss")
    RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "global_rate")

    # Let's log the EMA metrics every `log_interval` iterations
    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        writer.add_scalar("training/batch_loss", engine.state.metrics["batch_loss"], engine.state.iteration)
        writer.add_scalar("training/global_rate", engine.state.metrics["global_rate"], engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        print(
            f"Training Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        print(
            f"Validation Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )
        writer.add_scalar("valdation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("valdation/avg_accuracy", avg_accuracy, engine.state.epoch)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    writer.close()
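
The function above drives a single XLA device directly. A hedged sketch of how it might be invoked (parameter values are illustrative); multi-core TPU runs would typically wrap the same body in a per-process function launched via torch_xla.distributed.xla_multiprocessing.spawn, which is outside this snippet:

if __name__ == '__main__':
    run(train_batch_size=64,
        val_batch_size=1000,
        epochs=10,
        lr=0.01,
        momentum=0.5,
        log_interval=10,
        log_dir='/tmp/tb_logs')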
Example #27
def main(
    architecture,
    batch_size,
    length_scale,
    centroid_size,
    learning_rate,
    l_gradient_penalty,
    gamma,
    weight_decay,
    final_model,
    output_dir,
):
    writer = SummaryWriter(log_dir=f"runs/{output_dir}")

    ds = all_datasets["CIFAR10"]()
    input_size, num_classes, dataset, test_dataset = ds

    # Split up training set
    idx = list(range(len(dataset)))
    random.shuffle(idx)

    if final_model:
        train_dataset = dataset
        val_dataset = test_dataset
    else:
        train_size = int(len(dataset) * 0.8)
        train_dataset = torch.utils.data.Subset(dataset, idx[:train_size])
        val_dataset = torch.utils.data.Subset(dataset, idx[train_size:])

        # Test-time preprocessing for validation
        val_dataset.transform = test_dataset.transform

    if architecture == "WRN":
        model_output_size = 640
        epochs = 200
        milestones = [60, 120, 160]
        feature_extractor = WideResNet()
    elif architecture == "ResNet18":
        model_output_size = 512
        epochs = 200
        milestones = [60, 120, 160]
        feature_extractor = resnet18()
    elif architecture == "ResNet50":
        model_output_size = 2048
        epochs = 200
        milestones = [60, 120, 160]
        feature_extractor = resnet50()
    elif architecture == "ResNet110":
        model_output_size = 2048
        epochs = 200
        milestones = [60, 120, 160]
        feature_extractor = resnet110()
    elif architecture == "DenseNet121":
        model_output_size = 1024
        epochs = 200
        milestones = [60, 120, 160]
        feature_extractor = densenet121()

        # Adapted resnet from:
        # https://github.com/kuangliu/pytorch-cifar/blob/master/models/resnet.py
        feature_extractor.conv1 = torch.nn.Conv2d(3,
                                                  64,
                                                  kernel_size=3,
                                                  stride=1,
                                                  padding=1,
                                                  bias=False)
        feature_extractor.maxpool = torch.nn.Identity()
        feature_extractor.fc = torch.nn.Identity()

    if centroid_size is None:
        centroid_size = model_output_size

    model = ResNet_DUQ(
        feature_extractor,
        num_classes,
        centroid_size,
        model_output_size,
        length_scale,
        gamma,
    )
    model = model.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=learning_rate,
                                momentum=0.9,
                                weight_decay=weight_decay)

    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=milestones,
                                                     gamma=0.2)

    def calc_gradients_input(x, y_pred):
        gradients = torch.autograd.grad(
            outputs=y_pred,
            inputs=x,
            grad_outputs=torch.ones_like(y_pred),
            create_graph=True,
        )[0]

        gradients = gradients.flatten(start_dim=1)

        return gradients

    def calc_gradient_penalty(x, y_pred):
        gradients = calc_gradients_input(x, y_pred)

        # L2 norm
        grad_norm = gradients.norm(2, dim=1)

        # Two sided penalty
        gradient_penalty = ((grad_norm - 1)**2).mean()

        return gradient_penalty

    def step(engine, batch):
        model.train()

        optimizer.zero_grad()

        x, y = batch
        x, y = x.cuda(), y.cuda()

        x.requires_grad_(True)

        y_pred = model(x)

        y = F.one_hot(y, num_classes).float()

        loss = F.binary_cross_entropy(y_pred, y, reduction="mean")

        if l_gradient_penalty > 0:
            gp = calc_gradient_penalty(x, y_pred)
            loss += l_gradient_penalty * gp

        loss.backward()
        optimizer.step()

        x.requires_grad_(False)

        with torch.no_grad():
            model.eval()
            model.update_embeddings(x, y)

        return loss.item()

    def eval_step(engine, batch):
        model.eval()

        x, y = batch
        x, y = x.cuda(), y.cuda()

        x.requires_grad_(True)

        y_pred = model(x)

        return {"x": x, "y": y, "y_pred": y_pred}

    trainer = Engine(step)
    evaluator = Engine(eval_step)

    metric = Average()
    metric.attach(trainer, "loss")

    metric = Accuracy(output_transform=lambda out: (out["y_pred"], out["y"]))
    metric.attach(evaluator, "accuracy")

    def bce_output_transform(out):
        return (out["y_pred"], F.one_hot(out["y"], num_classes).float())

    metric = Loss(F.binary_cross_entropy,
                  output_transform=bce_output_transform)
    metric.attach(evaluator, "bce")

    metric = Loss(calc_gradient_penalty,
                  output_transform=lambda out: (out["x"], out["y_pred"]))
    metric.attach(evaluator, "gradient_penalty")

    pbar = ProgressBar(dynamic_ncols=True)
    pbar.attach(trainer)

    kwargs = {"num_workers": 4, "pin_memory": True}

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               drop_last=True,
                                               **kwargs)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             **kwargs)

    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              **kwargs)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_results(trainer):
        metrics = trainer.state.metrics
        loss = metrics["loss"]

        print(f"Train - Epoch: {trainer.state.epoch} Loss: {loss:.2f}")

        writer.add_scalar("Loss/train", loss, trainer.state.epoch)

        if trainer.state.epoch > (epochs - 5):
            accuracy, auroc = get_cifar_svhn_ood(model)
            print(f"Test Accuracy: {accuracy}, AUROC: {auroc}")
            writer.add_scalar("OoD/test_accuracy", accuracy,
                              trainer.state.epoch)
            writer.add_scalar("OoD/roc_auc", auroc, trainer.state.epoch)

            accuracy, auroc = get_auroc_classification(val_dataset, model)
            print(f"AUROC - uncertainty: {auroc}")
            writer.add_scalar("OoD/val_accuracy", accuracy,
                              trainer.state.epoch)
            writer.add_scalar("OoD/roc_auc_classification", auroc,
                              trainer.state.epoch)

        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        acc = metrics["accuracy"]
        bce = metrics["bce"]
        GP = metrics["gradient_penalty"]
        loss = bce + l_gradient_penalty * GP

        print((f"Valid - Epoch: {trainer.state.epoch} "
               f"Acc: {acc:.4f} "
               f"Loss: {loss:.2f} "
               f"BCE: {bce:.2f} "
               f"GP: {GP:.2f} "))

        writer.add_scalar("Loss/valid", loss, trainer.state.epoch)
        writer.add_scalar("BCE/valid", bce, trainer.state.epoch)
        writer.add_scalar("GP/valid", GP, trainer.state.epoch)
        writer.add_scalar("Accuracy/valid", acc, trainer.state.epoch)

        scheduler.step()

    trainer.run(train_loader, max_epochs=epochs)
    evaluator.run(test_loader)
    acc = evaluator.state.metrics["accuracy"]

    print(f"Test - Accuracy {acc:.4f}")

    torch.save(model.state_dict(), f"runs/{output_dir}/model.pt")
    writer.close()
Example #28
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(
            lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
        best_model_handler)

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        logger.exception("")
        raise e

    if rank == 0:
        tb_logger.close()
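
training(local_rank, config) follows ignite.distributed conventions, so it is normally launched through idist.Parallel. A hedged sketch of such an entry point (backend and configuration values are illustrative; the keys mirror the lookups in the function above):

import ignite.distributed as idist

def run_training_job(backend=None, **spawn_kwargs):
    config = {
        "seed": 543,
        "model": "resnet18",
        "batch_size": 512,
        "momentum": 0.9,
        "weight_decay": 1e-4,
        "num_epochs": 24,
        "learning_rate": 0.4,
        "num_warmup_epochs": 4,
        "validate_every": 3,
        "output_path": "/tmp/output-cifar10",
        "stop_iteration": None,
        "with_clearml": False,
    }
    # idist.Parallel spawns/joins the distributed processes and calls
    # training(local_rank, config) in each of them.
    with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:
        parallel.run(training, config)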
Example #29
    def setup_training(self, base_model, classifier, setops_model):

        #
        # Create the train and test dataset.
        #
        train_loader, train_subset_loader, val_loader = self.setup_datasets()

        logging.info("Setup logging and controls.")

        #
        # Setup metrics plotters.
        #
        mlflow_logger = MlflowLogger()

        #
        # Setup the optimizer.
        #
        logging.info("Setup optimizers and losses.")

        parameters = list(base_model.parameters())
        parameters += list(setops_model.parameters())
        if self.train_classifier:
            parameters += list(classifier.parameters())

        if self.optimizer_cls == "SGD":
            optimizer = torch.optim.SGD(parameters,
                                        lr=self.lr1,
                                        momentum=0.9,
                                        weight_decay=self.weight_decay)
        else:
            optimizer = torch.optim.Adam(parameters,
                                         lr=self.lr1,
                                         weight_decay=self.weight_decay)

        if self.focal_loss:
            attr_loss = FocalLoss().cuda()
        else:
            attr_loss = torch.nn.MultiLabelSoftMarginLoss().cuda()

        recon_loss = torch.nn.MSELoss(
        ) if self.recon_loss == "mse" else torch.nn.L1Loss()

        #
        # Setup the trainer object and its logging.
        #
        logging.info("Setup trainer")
        trainer = create_setops_trainer(base_model,
                                        classifier,
                                        setops_model,
                                        optimizer,
                                        criterion1=attr_loss,
                                        criterion2=recon_loss.cuda(),
                                        params_object=self,
                                        device=self.device)
        ProgressBar(bar_format=None).attach(trainer)

        mlflow_logger.attach(engine=trainer,
                             prefix="Train ",
                             plot_event=Events.ITERATION_COMPLETED,
                             update_period=LOG_INTERVAL,
                             output_transform=lambda x: x)

        #
        # Define the evaluation metrics.
        #
        logging.info("Setup evaluator")
        evaluation_losses = {
            'real class loss':
                Loss(torch.nn.MultiLabelSoftMarginLoss().cuda(), lambda o: (o["outputs"]["real class a"], o["targets"]["class a"])) + \
                Loss(torch.nn.MultiLabelSoftMarginLoss().cuda(), lambda o: (o["outputs"]["real class b"], o["targets"]["class b"])),
            'fake class loss':
                Loss(torch.nn.MultiLabelSoftMarginLoss().cuda(), lambda o: (o["outputs"]["fake class a"], o["targets"]["class a"])) + \
                Loss(torch.nn.MultiLabelSoftMarginLoss().cuda(), lambda o: (o["outputs"]["fake class b"], o["targets"]["class b"])),
            '{} fake loss'.format(self.recon_loss):
                (Loss(recon_loss.cuda(), lambda o: (o["outputs"]["fake embed a"], o["targets"]["embed a"])) +
                Loss(recon_loss.cuda(), lambda o: (o["outputs"]["fake embed b"], o["targets"]["embed b"]))) / 2,
        }
        labels_list = train_loader.dataset.labels_list
        mask = labels_list_to_1hot(labels_list, labels_list).astype(bool)
        evaluation_accuracies = {
            'real class acc':
            (MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][
                "real class a"], o["targets"]["class a"])) +
             MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][
                 "real class b"], o["targets"]["class b"]))) / 2,
            'fake class acc':
            (MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][
                "fake class a"], o["targets"]["class a"])) +
             MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][
                 "fake class b"], o["targets"]["class b"]))) / 2,
            'S class acc':
            (MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][
                "a_S_b class"], o["targets"]["a_S_b class"])) +
             MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][
                 "b_S_a class"], o["targets"]["b_S_a class"]))) / 2,
            'I class acc':
            (MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][
                "a_I_b class"], o["targets"]["a_I_b class"])) +
             MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][
                 "b_I_a class"], o["targets"]["a_I_b class"]))) / 2,
            'U class acc':
            (MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][
                "a_U_b class"], o["targets"]["a_U_b class"])) +
             MultiLabelSoftMarginIOUaccuracy(lambda o: (o["outputs"][
                 "b_U_a class"], o["targets"]["a_U_b class"]))) / 2,
            'MSE fake acc':
            (EWMeanSquaredError(lambda o: (o["outputs"]["fake embed a"], o[
                "targets"]["embed a"])) + EWMeanSquaredError(lambda o: (o[
                    "outputs"]["fake embed b"], o["targets"]["embed b"]))) / 2,
            'real mAP':
            mAP(mask=mask,
                output_transform=lambda o:
                (o["outputs"]["real class a"], o["targets"]["class a"])),
            'fake mAP':
            mAP(mask=mask,
                output_transform=lambda o:
                (o["outputs"]["fake class a"], o["targets"]["class a"])),
            'S mAP':
            mAP(mask=mask,
                output_transform=lambda o:
                (o["outputs"]["a_S_b class"], o["targets"]["a_S_b class"])),
            'I mAP':
            mAP(mask=mask,
                output_transform=lambda o:
                (o["outputs"]["a_I_b class"], o["targets"]["a_I_b class"])),
            'U mAP':
            mAP(mask=mask,
                output_transform=lambda o:
                (o["outputs"]["a_U_b class"], o["targets"]["a_U_b class"])),
        }

        #
        # Setup the training evaluator object and its logging.
        #
        train_evaluator = create_setops_evaluator(
            base_model,
            classifier,
            setops_model,
            metrics=evaluation_accuracies.copy(),
            device=self.device)

        mlflow_logger.attach(engine=train_evaluator,
                             prefix="Train Eval ",
                             plot_event=Events.EPOCH_COMPLETED,
                             metric_names=list(evaluation_accuracies.keys()))
        ProgressBar(bar_format=None).attach(train_evaluator)

        #
        # Setup the evaluator object and its logging.
        #
        evaluator = create_setops_evaluator(base_model,
                                            classifier,
                                            setops_model,
                                            metrics={
                                                **evaluation_losses,
                                                **evaluation_accuracies
                                            },
                                            device=self.device)

        mlflow_logger.attach(engine=evaluator,
                             prefix="Eval ",
                             plot_event=Events.EPOCH_COMPLETED,
                             metric_names=list({
                                 **evaluation_losses,
                                 **evaluation_accuracies
                             }.keys()))
        ProgressBar(bar_format=None).attach(evaluator)

        #
        # Checkpoint of the model
        #
        self.setup_checkpoint(base_model, classifier, setops_model, evaluator)

        logging.info("Setup schedulers.")

        #
        # Update learning rate manually using the Visdom interface.
        #
        one_cycle_size = len(train_loader) * self.warmup_epochs * 2

        scheduler_1 = LinearCyclicalScheduler(optimizer,
                                              "lr",
                                              start_value=self.lr1,
                                              end_value=self.lr2,
                                              cycle_size=one_cycle_size)
        scheduler_2 = ReduceLROnPlateau(optimizer,
                                        factor=0.5,
                                        patience=4 * len(train_loader),
                                        cooldown=len(train_loader),
                                        output_transform=lambda x: x["main"])
        lr_scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2],
                                       durations=[one_cycle_size // 2],
                                       save_history=True)
        trainer.add_event_handler(Events.ITERATION_COMPLETED, lr_scheduler)

        #
        # Evaluation
        #
        @trainer.on(Events.EPOCH_COMPLETED)
        def epoch_completed(engine):
            #
            # Re-randomize the indices of the training dataset.
            #
            train_loader.dataset.calc_indices()

            #
            # Run the evaluator on a subset of the training dataset.
            #
            logging.info("Evaluation on a subset of the training data.")
            train_evaluator.run(train_subset_loader)

            #
            # Run the evaluator on the validation set.
            #
            logging.info("Evaluation on the eval data.")
            evaluator.run(val_loader)

        return trainer, train_loader
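
The method hands back the configured trainer together with its training loader; a hedged sketch of a call site inside the same class (the attribute self.epochs is illustrative):

# inside another method of the same class
trainer, train_loader = self.setup_training(base_model, classifier, setops_model)
trainer.run(train_loader, max_epochs=self.epochs)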
Example #30
def test_zero_div():
    loss = Loss(nll_loss)
    with pytest.raises(NotComputableError):
        loss.compute()
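
For contrast, a hedged companion test showing the metric computing once it has been updated (values are illustrative; nll_loss expects log-probabilities):

import math

import pytest
import torch
from torch.nn.functional import nll_loss

from ignite.metrics import Loss


def test_compute_after_update():
    loss = Loss(nll_loss)
    y_pred = torch.log(torch.tensor([[0.9, 0.1], [0.2, 0.8]]))
    y = torch.tensor([0, 1])
    loss.update((y_pred, y))
    expected = -(math.log(0.9) + math.log(0.8)) / 2
    assert loss.compute() == pytest.approx(expected, abs=1e-5)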