Example 1
    def setup_training(self):
        assert self.batch_size is not None
        trainer = Engine(lambda e, b: self.train_step(b))
        trainer.register_events("EVAL_DONE")
        Average(lambda o: o['loss']).attach(trainer, 'avg_loss')
        state_vars = dict(model=self.model, opt=self.opt, trainer=trainer)
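        # Keep the two best checkpoints ranked by validation accuracy, and resume
        # from the most recent one if it already exists.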
        checkpoint_handler = ModelCheckpoint(
            self.run_path, '',
            score_function=lambda e: e.state.metrics['val_accuracy'],
            score_name='val_accuracy',
            n_saved=2,
            global_step_transform=lambda e, evt_name: e.state.epoch)
        if checkpoint_handler.last_checkpoint:
            checkpoint_handler.load_objects(state_vars, self.run_path / checkpoint_handler.last_checkpoint)
        trainer.add_event_handler("EVAL_DONE", lambda e: checkpoint_handler(e, state_vars))
        if self.use_lr_decay:
            trainer.add_event_handler(Events.ITERATION_COMPLETED, lambda e: self.lr_decay.step(e.state.iteration * self.batch_size))

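        # Show a running average of the training loss in the progress bar.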
        RunningAverage(output_transform=lambda o: o['loss']).attach(trainer, 'running_avg_loss')
        ProgressBar().attach(trainer, ['running_avg_loss'])
        logger.setup_logger(self.run_path, trainer, self.model)

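        # Evaluate at the end of every epoch, expose the metrics on the trainer
        # state, and fire the custom "EVAL_DONE" event used above for
        # checkpointing and below for early stopping.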
        @trainer.on(Events.EPOCH_COMPLETED)
        def eval_and_log(e: Engine):
            eval_results = self.eval()
            e.state.metrics['val_accuracy'] = eval_results['val'].metrics['accuracy'] 
            e.state.metrics['val_loss'] = eval_results['val'].metrics['avg_loss']
            e.state.eval_results = eval_results
            e.fire_event("EVAL_DONE")

        if self.use_early_stop:
            es = self.make_early_stopper(trainer)
            trainer.add_event_handler("EVAL_DONE", es)

        return trainer
Example 2
def cluster(train_batch_size, val_batch_size):
    device = "cuda"
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    checkpointer = ModelCheckpoint("mnist-cluster",
                                   "resnet50",
                                   n_saved=1,
                                   require_empty=False)
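    # Load the previously saved checkpoint, restore the weights into the model,
    # and switch to inference mode on the GPU.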
    checkpoint = torch.load("mnist-cluster/resnet50_model_4690.pth")
    checkpointer.load_objects({'model': model}, checkpoint)
    model.cuda()
    model.eval()

    # evaluator = create_supervised_evaluator(
    #     module, metrics={"accuracy": Accuracy()}, device=device)
    # evaluator.run(val_loader)
    # metrics = evaluator.state.metrics
    # print(f"Validation Results Avg accuracy: {metrics['accuracy']:.2f}")
    features = []
    labels = []
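    # Extract a feature embedding for every validation image; gradients are not needed here.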
    for image, label in val_loader:
        with torch.no_grad():
            image = image.cuda()
            label = label.cuda()
            feature = model(image)
            features.append(feature)
            labels.append(label)

    features = torch.cat(features)
    target = torch.cat(labels)

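    # Cluster the features: build a distance matrix, derive pseudo-labels,
    # and compare them against the ground-truth targets.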
    dict_matrix = compute_dist(features, if_re_ranking=False)
    class_num, labels = generate_self_label(dict_matrix)
    print(f"class_num {class_num}")
    for i in range(target.size()[0]):
        print(f"{target[i]}  :  {labels[i]}")
Example 3
def run(args, seed):
    config.make_paths()

    torch.random.manual_seed(seed)
    train_loader, val_loader, shape = get_data_loaders(
        config.Training.batch_size,
        proportion=config.Training.proportion,
        test_batch_size=config.Training.batch_size * 2,
    )
    n, d, t = shape
    model = models.ConvNet(d, seq_len=t)

    writer = tb.SummaryWriter(log_dir=config.TENSORBOARD)

    model.to(config.device)  # Move model before creating optimizer
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.MSELoss()

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=config.device)
    trainer.logger = setup_logger("trainer")

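    # Save the model state dict every `save_every` epochs, keeping the two most recent checkpoints.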
    checkpointer = ModelCheckpoint(
        config.MODEL,
        model.__class__.__name__,
        n_saved=2,
        create_dir=True,
        save_as_state_dict=True,
    )
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config.Training.save_every),
        checkpointer,
        {"model": model},
    )

    val_metrics = {
        "mse": Loss(criterion),
        "mae": MeanAbsoluteError(),
        "rmse": RootMeanSquaredError(),
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=val_metrics,
                                            device=config.device)
    evaluator.logger = setup_logger("evaluator")

    ar_evaluator = create_ar_evaluator(model,
                                       metrics=val_metrics,
                                       device=config.device)
    ar_evaluator.logger = setup_logger("ar")

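    # Periodically run the autoregressive evaluator and log a prediction plot to TensorBoard.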
    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.save_every))
    def log_ar(engine):
        ar_evaluator.run(val_loader)
        y_pred, y = ar_evaluator.state.output
        fig = plot_output(y, y_pred)
        writer.add_figure("eval/ar", fig, engine.state.epoch)
        plt.close()

    # desc = "ITERATION - loss: {:.2f}"
    # pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=config.Training.log_every))
    def log_training_loss(engine):
        # pbar.desc = desc.format(engine.state.output)
        # pbar.update(log_interval)
        if args.verbose:
            grad_norm = torch.stack(
                [p.grad.norm() for p in model.parameters()]).sum()
            writer.add_scalar("train/grad_norm", grad_norm,
                              engine.state.iteration)
        writer.add_scalar("train/loss", engine.state.output,
                          engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.eval_every))
    def log_training_results(engine):
        # pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        for k, v in metrics.items():
            writer.add_scalar(f"train/{k}", v, engine.state.epoch)
        # tqdm.write(
        #    f"Training Results - Epoch: {engine.state.epoch}  Avg mse: {evaluator.state.metrics['mse']:.2f}"
        # )

    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.eval_every))
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics

        for k, v in metrics.items():
            writer.add_scalar(f"eval/{k}", v, engine.state.epoch)
        # tqdm.write(
        #    f"Validation Results - Epoch: {engine.state.epoch}  Avg mse: {evaluator.state.metrics['mse']:.2f}"
        # )

        # pbar.n = pbar.last_print_n = 0

        y_pred, y = evaluator.state.output

        fig = plot_output(y, y_pred)
        writer.add_figure("eval/preds", fig, engine.state.epoch)
        plt.close()

    # @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    # def log_time(engine):
    #    #tqdm.write(
    #    #    f"{trainer.last_event_name.name} took {trainer.state.times[trainer.last_event_name.name]} seconds"
    #    #)
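    # Optionally warm-start the model from a checkpoint supplied on the command line.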
    if args.ckpt is not None:
        ckpt = torch.load(args.ckpt)
        ModelCheckpoint.load_objects({"model": model}, ckpt)

    try:
        trainer.run(train_loader, max_epochs=config.Training.max_epochs)
    except Exception:
        import traceback

        print(traceback.format_exc())

    # pbar.close()
    writer.close()

Example 4
def build_trainer(experiment_dir: DirPath,
                  train_data_loader: data.DataLoader,
                  test_data_loader: data.DataLoader,
                  train_params: dict,
                  net_params: dict,
                  image_size: int,
                  optimizer_params: dict,
                  is_tabolar_mode: str,
                  runmode: str,
                  cpugpu: str = 'gpu') -> ignite.engine.Engine:

    checkpoint_dir = osp.join(experiment_dir,
                              train_params['checkpoint_relative_path'])
    logging_dir = osp.join(
        experiment_dir, train_params['logging_dir_relative_path'])  # the log
    tb_dir = osp.join(logging_dir, 'tensorboard')

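    # Build the network: the clinical/tabular variant when is_tabolar_mode == 'yes',
    # otherwise a plain image model from ModelFactory.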
    if is_tabolar_mode == 'yes':
        mfctr = FeClinicNet(net_params['net_name'],
                            net_params[net_params['net_name']],
                            net_params['classifier_fc_size'],
                            net_params['pretrained'], image_size)
    else:
        mfctr = ModelFactory.create_model(net_params['net_name'],
                                          net_params[net_params['net_name']],
                                          net_params['pretrained'],
                                          net_params['classifier_layer_size'],
                                          image_size)

    if cpugpu == 'gpu':
        model = mfctr.cuda()
        device = 'cuda'
    else:
        model = mfctr.cpu()
        device = None

    loss = nn.CrossEntropyLoss()
    optimizer = OptimizerFactory.create_optimizer(
        optimizer_name=train_params['optimizer_type'],
        net_params=model.parameters(),
        optimizer_params=optimizer_params)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                          gamma=0.9261)
    summary_writer = SummaryWriter(log_dir=tb_dir)

    trainer = create_supervised_trainer(
        model=model,
        optimizer=optimizer,
        device=device,
        is_tabolar_mode=is_tabolar_mode,
        prepare_batch=choose_prepare_batch(is_tabolar_mode),
        non_blocking=True,
        loss_fn=loss)
    metrics = build_metrics()
    evaluator = create_supervised_evaluator(
        model=model,
        metrics=metrics,
        device=device,
        non_blocking=True,
        prepare_batch=choose_prepare_batch(is_tabolar_mode),
        is_tabolar_mode=is_tabolar_mode)
    eval_dir = create_logging_dir_for_eval(log_dir=logging_dir)

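    # Checkpoint handler: writes state dicts into checkpoint_dir and keeps the 30 most recent files.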
    checkpoint_handler = ModelCheckpoint(dirname=checkpoint_dir,
                                         filename_prefix='checkpoint',
                                         save_interval=1,
                                         n_saved=30,
                                         atomic=True,
                                         require_empty=False,
                                         create_dir=True,
                                         save_as_state_dict=True)

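    # 'new' starts from scratch and clears old TensorBoard logs; 'resume' reloads
    # trainer, model and optimizer state from the most recent checkpoint.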
    if runmode == 'new':
        shutil.rmtree(tb_dir, ignore_errors=True)
        starting_epoch = 0
    elif runmode == 'resume':
        to_load = {'trainer': trainer, 'model': model, 'optimizer': optimizer}
        checkpoint = torch.load(get_last_checkpoint(checkpoint_dir))
        ModelCheckpoint.load_objects(to_load=to_load, checkpoint=checkpoint)
        starting_epoch = trainer.state.epoch
    else:
        raise ValueError("Unknown runmode, shouldn't reach this")

    attach_trainer_events(trainer=trainer,
                          evaluator=evaluator,
                          train_data_loader=train_data_loader,
                          test_data_loader=test_data_loader,
                          checkpoint_handler=checkpoint_handler,
                          model=model,
                          summary_writer=summary_writer,
                          eval_freq=train_params["eval_freq"],
                          starting_epoch=starting_epoch,
                          optimizer=optimizer,
                          eval_dir=eval_dir,
                          lr_scheduler=lr_scheduler)

    return trainer