Example #1
from ignite.engine import Events
from ignite.handlers import Timer
from ignite.metrics import Accuracy, RunningAverage
from ignite.contrib.handlers import ProgressBar


def setup_timer(engine):
    timer = Timer(average=True)
    timer.attach(engine,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED)
    return timer


def add_progress_bar_eval(evaluator, validation_loader):
    """
    "I can't believe it's not Keras"
    Running average accuracy and loss metrics + TQDM progressbar
    """
    validation_history = {'accuracy': [], 'loss': []}
    last_epoch = []

    RunningAverage(output_transform=lambda x: x[0]).attach(evaluator, 'loss')
    RunningAverage(Accuracy(output_transform=lambda x: (x[0], x[1]))).attach(
        evaluator, 'accuracy')

    prog_bar = ProgressBar()
    prog_bar.attach(evaluator, ['accuracy'])
    #     prog_bar.pbar_cls=tqdm.tqdm

    timer = Timer(average=True)
    timer.attach(evaluator,
                 start=Events.EPOCH_STARTED,
                 resume=Events.EPOCH_STARTED,
                 pause=Events.EPOCH_COMPLETED,
                 step=Events.EPOCH_COMPLETED)

    @evaluator.on(Events.EPOCH_COMPLETED)
    def log_validation_results(evaluator):
        metrics = evaluator.state.metrics
        accuracy = metrics['accuracy'] * 100
        loss = metrics['loss']
        validation_history['accuracy'].append(accuracy)
        validation_history['loss'].append(loss)
        val_msg = "Valid Epoch {}:  acc: {:.2f}% loss: {:.2f}, eval time: {:.2f}s".format(
            evaluator.state.epoch, accuracy, loss, timer.value())
        prog_bar.log_message(val_msg)
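
A minimal usage sketch for the two helpers above; `model` and `validation_loader` are hypothetical stand-ins for objects from the surrounding code:

from ignite.engine import create_supervised_evaluator

# Hypothetical wiring: build an evaluator, attach the timer and the
# progress bar, then run one pass over the validation data.
evaluator = create_supervised_evaluator(model)
timer = setup_timer(evaluator)  # average time per iteration
add_progress_bar_eval(evaluator, validation_loader)
evaluator.run(validation_loader)
print("avg iteration time: {:.4f}s".format(timer.value()))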
Example #3
import ignite.distributed as idist
from ignite.engine import Engine, Events


class DataflowBenchmark:
    def __init__(self, num_iters=100, prepare_batch=None):

        from ignite.handlers import Timer

        device = idist.device()

        def upload_to_gpu(engine, batch):
            # Only the host-to-device transfer is measured; outputs are discarded.
            if prepare_batch is not None:
                x, y = prepare_batch(batch, device=device, non_blocking=False)

        self.num_iters = num_iters
        self.benchmark_dataflow = Engine(upload_to_gpu)

        @self.benchmark_dataflow.on(Events.ITERATION_COMPLETED(once=num_iters))
        def stop_benchmark_dataflow(engine):
            engine.terminate()

        if idist.get_rank() == 0:

            @self.benchmark_dataflow.on(
                Events.ITERATION_COMPLETED(every=max(num_iters // 100, 1)))
            def show_progress_benchmark_dataflow(engine):
                print(".", end=" ")

        self.timer = Timer(average=False)
        self.timer.attach(
            self.benchmark_dataflow,
            start=Events.EPOCH_STARTED,
            resume=Events.ITERATION_STARTED,
            pause=Events.ITERATION_COMPLETED,
            step=Events.ITERATION_COMPLETED,
        )

    def attach(self, trainer, train_loader):

        from torch.utils.data import DataLoader

        @trainer.on(Events.STARTED)
        def run_benchmark(_):
            if idist.get_rank() == 0:
                print("-" * 50)
                print(" - Dataflow benchmark")

            self.benchmark_dataflow.run(train_loader)
            t = self.timer.value()

            if idist.get_rank() == 0:
                print(" ")
                print(" Total time ({} iterations) : {:.5f} seconds".format(
                    self.num_iters, t))
                print(" time per iteration         : {} seconds".format(
                    t / self.num_iters))

                if isinstance(train_loader, DataLoader):
                    num_images = train_loader.batch_size * self.num_iters
                    print(" number of images / s       : {}".format(
                        num_images / t))

                print("-" * 50)
Example #4
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torchvision.models import wide_resnet50_2

from ignite.engine import Engine, Events, create_supervised_evaluator
from ignite.handlers import Timer
from ignite.metrics import Accuracy, Loss
from ignite.contrib.handlers import ProgressBar
from ignite.utils import convert_tensor

# `get_train_eval_loaders` is a project-local helper assumed to be defined in
# the surrounding codebase.


def main(dataset_path, batch_size=256, max_epochs=10):
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    torch.backends.cudnn.benchmark = True

    device = "cuda"

    train_loader, test_loader, eval_train_loader = get_train_eval_loaders(
        dataset_path, batch_size=batch_size)

    model = wide_resnet50_2(num_classes=100).to(device)
    optimizer = SGD(model.parameters(), lr=0.01)
    criterion = CrossEntropyLoss().to(device)

    def train_step(engine, batch):
        x = convert_tensor(batch[0], device, non_blocking=True)
        y = convert_tensor(batch[1], device, non_blocking=True)

        optimizer.zero_grad()

        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss.backward()

        optimizer.step()

        return loss.item()

    trainer = Engine(train_step)
    timer = Timer(average=True)
    timer.attach(trainer, step=Events.EPOCH_COMPLETED)
    ProgressBar(persist=True).attach(
        trainer, output_transform=lambda out: {"batch loss": out})

    metrics = {"Accuracy": Accuracy(), "Loss": Loss(criterion)}

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)

    def log_metrics(engine, title):
        for name in metrics:
            print("\t{} {}: {:.2f}".format(title, name,
                                           engine.state.metrics[name]))

    @trainer.on(Events.COMPLETED)
    def run_validation(_):
        print("- Mean elapsed time for 1 epoch: {}".format(timer.value()))
        print("- Metrics:")
        with evaluator.add_event_handler(Events.COMPLETED, log_metrics,
                                         "Train"):
            evaluator.run(eval_train_loader)

        with evaluator.add_event_handler(Events.COMPLETED, log_metrics,
                                         "Test"):
            evaluator.run(test_loader)

    trainer.run(train_loader, max_epochs=max_epochs)
Example #5
from ignite.engine import Events
from ignite.handlers import ModelCheckpoint, Timer
from ignite.metrics import RunningAverage

# `Run`, `logger` and `eval_multi_dataset` are project-local names assumed to
# be defined in the surrounding module (see the sketch after this example for
# a plausible `Run`).


def run(cfg, train_loader, tr_comp, saver, trainer, valid_dict):
    # TODO resume

    # trainer = Engine(...)
    # trainer.load_state_dict(state_dict)
    # trainer.run(data)
    # checkpoint
    handler = ModelCheckpoint(saver.model_dir, 'train', n_saved=3, create_dir=True)
    checkpoint_params = tr_comp.state_dict()
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              handler,
                              checkpoint_params)

    timer = Timer(average=True)
    timer.attach(trainer,
                 start=Events.EPOCH_STARTED,
                 resume=Events.ITERATION_STARTED,
                 pause=Events.ITERATION_COMPLETED,
                 step=Events.ITERATION_COMPLETED)
    # average metric to attach on trainer
    names = ["Acc", "Loss"]
    names.extend(tr_comp.loss_function_map.keys())
    for n in names:
        RunningAverage(output_transform=Run(n)).attach(trainer, n)

    @trainer.on(Events.EPOCH_COMPLETED)
    def adjust_learning_rate(engine):
        tr_comp.scheduler.step()

    @trainer.on(Events.ITERATION_COMPLETED(every=cfg.TRAIN.LOG_ITER_PERIOD))
    def log_training_loss(engine):
        message = f"Epoch[{engine.state.epoch}], " + \
                  f"Iteration[{engine.state.iteration}/{len(train_loader)}], " + \
                  f"Base Lr: {tr_comp.scheduler.get_last_lr()[0]:.2e}, "

        for loss_name in engine.state.metrics.keys():
            message += f"{loss_name}: {engine.state.metrics[loss_name]:.4f}, "

        if tr_comp.xent and tr_comp.xent.learning_weight:
            message += f"xentWeight: {tr_comp.xent.uncertainty.mean().item():.4f}, "

        logger.info(message)

    # adding handlers using `trainer.on` decorator API
    @trainer.on(Events.EPOCH_COMPLETED)
    def print_times(engine):
        logger.info('Epoch {} done. Epoch time: {:.3f}[s] Speed: {:.1f}[samples/s]'
                    .format(engine.state.epoch, timer.value() * timer.step_count,
                            train_loader.batch_size / timer.value()))
        logger.info('-' * 80)
        timer.reset()

    @trainer.on(Events.EPOCH_COMPLETED(every=cfg.EVAL.EPOCH_PERIOD))
    def log_validation_results(engine):
        logger.info(f"Valid - Epoch: {engine.state.epoch}")
        eval_multi_dataset(cfg, valid_dict, tr_comp)

    trainer.run(train_loader, max_epochs=cfg.TRAIN.MAX_EPOCHS)
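
`Run` above is a project-local output transform; a plausible minimal sketch, assuming the trainer's step function returns a dict keyed by metric name:

class Run:
    # Hypothetical stand-in: extracts one named value from the engine's
    # per-iteration output dict for use with RunningAverage.
    def __init__(self, name):
        self.name = name

    def __call__(self, output):
        return output[self.name]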
Example #6
import functools

from ignite.engine import Engine, Events
from ignite.handlers import Timer

# `eu` (evaluation utilities), `prepare_config_eval`, `get_dataloader_eval_geomreg`,
# `get_models` and `step_eval_geomreg` are project-local names assumed to be
# importable from the surrounding codebase.


def engine_eval_geomreg(cfg, mode):
    prepare_config_eval(cfg)

    ckpt_path = cfg.eval.general.ckpt_path
    gpu = cfg.general.gpu
    root_path = cfg.log.root_path
    seed = cfg.general.seed

    eu.redirect_stdout(root_path, 'eval_geomreg-{}'.format(mode))
    eu.print_config(cfg)

    eu.seed_random(seed)

    device = eu.get_device(gpu)

    dataloader = get_dataloader_eval_geomreg(cfg, mode)
    num_batches = len(dataloader)

    render_model, desc_model = get_models(cfg)
    render_model.to(device)
    render_model.eval_mode()
    render_model.print_params('render_model')
    desc_model.to(device)
    desc_model.eval_mode()
    desc_model.print_params('desc_model')

    assert eu.is_not_empty(ckpt_path)
    render_model.load(ckpt_path)
    desc_model.load(ckpt_path)

    engine = Engine(
        functools.partial(step_eval_geomreg,
                          render_model=render_model,
                          desc_model=desc_model,
                          device=device,
                          cfg=cfg))

    timer = Timer(average=True)
    timer.attach(engine,
                 start=Events.EPOCH_STARTED,
                 pause=Events.EPOCH_COMPLETED,
                 resume=Events.ITERATION_STARTED,
                 step=Events.ITERATION_COMPLETED)

    engine.add_event_handler(Events.ITERATION_COMPLETED,
                             eu.print_eval_log,
                             timer=timer,
                             num_batches=num_batches)

    engine.add_event_handler(Events.EXCEPTION_RAISED, eu.handle_exception)

    engine.run(dataloader, max_epochs=1)

    return root_path
Example #7
import datetime

import numpy as np
import visdom

from ignite.engine import Events
from ignite.handlers import ModelCheckpoint, Timer

# `trainer`, `environment` and `description` are assumed to be module-level
# globals defined elsewhere in the surrounding codebase.


def visdom_loss_handler(modules_dict, model_name):
    """
    Attaches plots and metrics to the trainer.
    This handler creates (or connects to) an environment on a running Visdom
    dashboard and adds a line plot that tracks the training loss as a function
    of the iteration count. It can be attached to an Ignite Engine; the training
    closure must include 'loss' as one of the keys in its return dict for the
    plot to be drawn.
    See the Ignite (https://github.com/pytorch/ignite) and Visdom
    (https://github.com/facebookresearch/visdom) documentation for details.
    """

    tim = Timer()
    tim.attach(
        trainer,
        start=Events.STARTED,
        step=Events.ITERATION_COMPLETED,
    )

    vis = visdom.Visdom(env=environment)

    def create_plot_window(vis, xlabel, ylabel, title):
        return vis.line(X=np.array([1]),
                        Y=np.array([np.nan]),
                        opts=dict(xlabel=xlabel, ylabel=ylabel, title=title))

    train_loss_window = create_plot_window(vis, '#Iterations', 'Loss',
                                           description)
    log_interval = 10

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iteration = engine.state.iteration - 1
        # The training closure returns a dict, so pull out the 'loss' entry
        # (see the docstring above).
        loss = engine.state.output['loss']
        if iteration % log_interval == 0:
            print("Epoch[{}] Iteration: {} Time: {} Loss: {:.2f}".format(
                engine.state.epoch, iteration,
                str(datetime.timedelta(seconds=int(tim.value()))),
                loss))
        vis.line(X=np.array([engine.state.iteration]),
                 Y=np.array([loss]),
                 update='append',
                 win=train_loss_window)

    save_interval = 50
    # Note: `save_interval` was deprecated and later removed from Ignite's
    # ModelCheckpoint; on recent versions, attach the handler with
    # Events.ITERATION_COMPLETED(every=save_interval) instead.
    handler = ModelCheckpoint('/tmp/models',
                              model_name,
                              save_interval=save_interval,
                              n_saved=5,
                              create_dir=True,
                              require_empty=False)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, handler,
                              modules_dict)
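
Because `visdom_loss_handler` reads `trainer`, `environment` and `description` from module scope, the caller must define them first. A sketch of the assumed setup; `train_step`, `model` and `train_loader` are hypothetical:

from ignite.engine import Engine

environment = "main"            # Visdom environment to connect to
description = "training loss"   # title for the loss plot
trainer = Engine(train_step)    # train_step must return {'loss': ...}

visdom_loss_handler({"model": model}, "my_model")
trainer.run(train_loader, max_epochs=5)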