Example #1
def trainval(exp_dict, savedir, args):
    """
    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """
    # -- Datasets
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=args.datadir)

    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=args.datadir)

    # -- Model
    model = models.Model(exp_dict, device=torch.device('cuda'))

    # -- Train & Val Loop
    score_list = []
    for e in range(0, 50):
        # Compute metrics
        score_dict = {"epoch": e}
        score_dict["train_loss"] = model.val_on_dataset(
            val_set, metric_name='softmax_loss')
        score_dict["val_acc"] = model.val_on_dataset(val_set,
                                                     metric_name='softmax_acc')
        score_list += [score_dict]

        # Train model for one epoch
        model.train_on_dataset(train_set)

        # Visualize
        images = model.vis_on_dataset(val_set,
                                      fname=os.path.join(
                                          savedir, 'images', 'results.png'))

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.save_pkl(os.path.join(savedir, 'score_list.pkl'), score_list)
        hu.torch_save(os.path.join(savedir, 'model.pth'), model.state_dict())
        print("Checkpoint Saved: %s" % savedir)

    print('Experiment completed at epoch %d' % e)
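A minimal launcher sketch for the trainval above (the exp_dict contents, the "mnist" dataset name, and the CLI flags are illustrative assumptions, not the project's actual interface):

import argparse
import os

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--datadir", default="./data")
    parser.add_argument("--savedir_base", default="./results")
    args = parser.parse_args()

    exp_dict = {"dataset": "mnist"}  # hypothetical hyperparameter dictionary
    savedir = os.path.join(args.savedir_base, "exp0")
    os.makedirs(savedir, exist_ok=True)
    trainval(exp_dict, savedir, args)  # the function defined above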
Example #2
def trainval():
    print("train")
    num_epochs = 50
    results = {}
    train_dl = datasets.get_dataset(dataroot="data", image_size=64, batch_size=32, num_workers=2)
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = models.Model(device)
    
    score_list = []
    for epoch in range(0, num_epochs):
        print(f'epoch {epoch} of {num_epochs}')
        lossD, lossG = model.train_on_dataset(train_dl)
        results["lossD"] = lossD
        results["lossG"] = lossG
        model.vis_on_dataset(fname=os.path.join('training_image_results', f'epoch{epoch}_results.png'))
Example #3
def get_stats(exp_dict):

    dataset_name = exp_dict['dataset']['name']
    n_classes = exp_dict['dataset']['n_classes']

    stat_list = []

    print('')
    print(dataset_name, '-', 'n_classes: %d' % n_classes)
    print('===========')

    fname = '.tmp/covid_stats/%s_c%d.csv' % (dataset_name, n_classes)
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    if not os.path.exists(fname):
        for split in ['train', 'val', 'test']:
            dataset = datasets.get_dataset(dataset_dict={'name': dataset_name},
                                           datadir=None,
                                           split=split,
                                           exp_dict=exp_dict)
            loader = torch.utils.data.DataLoader(dataset,
                                                 batch_size=1,
                                                 num_workers=100,
                                                 collate_fn=ut.collate_fn)

            for i, b in enumerate(tqdm.tqdm(loader)):
                u_list = np.unique(b['masks'])
                stat_dict = {'split': split}
                b['points'][b['points'] == 0] = 255
                for c in range(n_classes):
                    if c in u_list:
                        stat_dict['class_%d' % c] = 1
                    else:
                        stat_dict['class_%d' % c] = 0
                for c in range(n_classes):
                    if c == 0:
                        continue
                    stat_dict['n_regions_c%d' % c] = (b['points'] == c).sum().item()
                # stat_dict['n_regions_2'] = (b['points'] == 2).sum().item()
                stat_list += [stat_dict]
        stats = pd.DataFrame(stat_list).groupby('split').sum()
        stats.to_csv(fname)
    else:
        stats = pd.read_csv(fname)

    return stats
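A hypothetical call to get_stats; the nested layout mirrors the keys the function reads ('name' and 'n_classes' under 'dataset'), and the dataset name here is only a placeholder:

exp_dict = {'dataset': {'name': 'some_covid_dataset', 'n_classes': 2}}
stats = get_stats(exp_dict)  # per-split class presence and region counts, cached as CSV
print(stats)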
Example #4
def train(cfg):
    print(cfg.pretty())
    train_config_validator(cfg)
    fix_seed(cfg.seed)

    writer = SummaryWriter(log_dir='logs')
    controller = load_pretrained_weights(
        NAO(**cfg.controller).to(0), cfg.pretrained_model_path)
    dataset = get_dataset(writer=writer, seed=cfg.seed, **cfg.dataset)
    optimizer = get_optimizer(parameters=_get_target_parameters(
        controller, cfg.freeze_encoder_decoder),
                              **cfg.optimizer)
    lr_scheduler = get_scheduler(optimizer=optimizer, **cfg.scheduler)
    end_of_epoch_hook = ModelSaverHook().end_of_epoch_hook

    get_trainer(
        controller=controller,
        dataset=dataset,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        writer=writer,
        end_of_epoch_hook=end_of_epoch_hook,
        **cfg.trainer,
    ).train()
Example #5
def pretrain(cfg):
    print(cfg.pretty())
    pretrain_config_validator(cfg)
    fix_seed(cfg.seed)

    controller = load_pretrained_weights(
        NAO(**cfg.controller).to(0), cfg.pretrained_model_path)
    models = {'trunk': controller}
    dataset = get_dataset(seed=cfg.seed, **cfg.dataset)
    optimizers = {
        'trunk_optimizer':
        get_optimizer(parameters=models['trunk'].parameters(), **cfg.optimizer)
    }
    lr_schedulers = {
        'trunk_scheduler_by_iteration':
        get_scheduler(optimizer=optimizers['trunk_optimizer'], **cfg.scheduler)
    }
    loss_funcs = {
        'reconstruction_loss': torch.nn.NLLLoss(),
        'metric_loss': get_loss(**cfg.loss)
    }
    mining_funcs = {"tuple_miner": get_miner(**cfg.miner)}
    visualizers = [umap.UMAP(**params) for params in cfg.visualizers]
    end_of_iteration_hook = TensorboardHook(visualizers).end_of_iteration_hook
    end_of_epoch_hook = ModelSaverHook().end_of_epoch_hook
    get_trainer(
        models=models,
        optimizers=optimizers,
        lr_schedulers=lr_schedulers,
        loss_funcs=loss_funcs,
        mining_funcs=mining_funcs,
        dataset=dataset,
        end_of_iteration_hook=end_of_iteration_hook,
        end_of_epoch_hook=end_of_epoch_hook,
        **cfg.trainer,
    ).train()
Example #6
        "model": {
            'name': 'lcfcn',
            'base': "fcn8_vgg16"
        },
        "batch_size": 1,
        "max_epoch": 100,
        'dataset_size': {
            'train': 1,
            'val': 1
        },
        'optimizer': 'adam',
        'lr': 1e-5
    }

    train_set = datasets.get_dataset(dataset_dict=exp_dict['dataset'],
                                     datadir='/mnt/public/datasets/Trancos',
                                     split="test",
                                     exp_dict=exp_dict)
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()
    batch = train_set[0]
    batch['images'] = batch['images'][None]
    batch['points'] = batch['points'][None]

    # train for several iterations
    for i in range(1000):
        loss = model.train_on_batch(batch)
        print(i, '- loss:', float(loss['train_loss']))

    # visualize blobs and heatmap
    model.vis_on_batch(batch, savedir_image='result.png')
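Training on a single batch for many iterations, as above, is a common overfitting sanity check: if the loss does not approach zero on one batch, something is wrong with the model or the loss. A generic sketch of what a train_on_batch step usually does (an assumption for illustration, with a placeholder loss, not LCFCN's actual point-level loss):

import torch
import torch.nn.functional as F

def train_on_batch_sketch(model, opt, batch):
    """One optimization step on a single batch (generic pattern)."""
    model.train()
    opt.zero_grad()
    logits = model(batch['images'])                   # forward pass
    loss = F.cross_entropy(logits, batch['points'])   # placeholder loss term
    loss.backward()                                   # backprop
    opt.step()                                        # parameter update
    return {'train_loss': loss.item()}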
Example #7
def trainval(exp_dict, savedir_base, datadir_base, reset=False):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================

    # load train and active set
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     split="train",
                                     datadir_base=datadir_base,
                                     exp_dict=exp_dict)

    active_set = ActiveLearningDataset(train_set, random_state=42)

    # val set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   split="val",
                                   datadir_base=datadir_base,
                                   exp_dict=exp_dict)
    val_loader = DataLoader(val_set, batch_size=exp_dict["batch_size"])

    # Model
    # ==================
    model = models.get_model(model_name=exp_dict['model']['name'],
                             exp_dict=exp_dict).cuda()

    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.set_state_dict(hu.torch_load(model_path))
        active_set.load_state_dict(
            hu.load_pkl(os.path.join(savedir, "active_set.pkl")))
        score_list = hu.load_pkl(score_list_path)
        inner_s_epoch = score_list[-1]['inner_epoch'] + 1
        s_cycle = score_list[-1]['cycle']
    else:
        # restart experiment
        score_list = []
        inner_s_epoch = 0
        s_cycle = 0

    # Train & Val
    # ==================
    print("Starting experiment at cycle %d epoch %d" %
          (s_cycle, inner_s_epoch))

    for c in range(s_cycle, exp_dict['max_cycle']):
        # Set seed
        np.random.seed(c)
        torch.manual_seed(c)
        torch.cuda.manual_seed_all(c)

        if inner_s_epoch == 0:
            active_set.label_next_batch(model)
            hu.save_pkl(os.path.join(savedir, "active_set.pkl"),
                        active_set.state_dict())

        train_loader = DataLoader(active_set,
                                  sampler=samplers.get_sampler(
                                      exp_dict['sampler']['train'],
                                      active_set),
                                  batch_size=exp_dict["batch_size"])
        # Visualize the model on a few validation samples
        # (vis_loader was undefined in the original snippet; a small
        # val-set loader is a reasonable stand-in)
        vis_loader = DataLoader(val_set, batch_size=1)
        model.vis_on_loader(vis_loader,
                            savedir=os.path.join(savedir, "images"))

        for e in range(inner_s_epoch, exp_dict['max_epoch']):
            # Validate only at the start of each cycle
            score_dict = {}
            if e == 0:
                score_dict.update(model.val_on_loader(val_loader))

            # Train the model
            score_dict.update(model.train_on_loader(train_loader))

            # Validate the model
            score_dict["epoch"] = len(score_list)
            score_dict["inner_epoch"] = e
            score_dict["cycle"] = c
            score_dict['n_ratio'] = active_set.n_labelled_ratio
            score_dict["n_train"] = len(train_loader.dataset)
            score_dict["n_pool"] = len(train_loader.dataset.pool)

            # Add to score_list and save checkpoint
            score_list += [score_dict]

            # Report & Save
            score_df = pd.DataFrame(score_list)
            print("\n", score_df.tail(), "\n")
            hu.torch_save(model_path, model.get_state_dict())
            hu.save_pkl(score_list_path, score_list)
            print("Checkpoint Saved: %s" % savedir)

        inner_s_epoch = 0
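The loop above alternates labelling cycles with inner training epochs: each cycle asks the model which pool samples to label next, then retrains on the enlarged labelled set. A minimal sketch of that pool-selection step, with a hypothetical uncertainty score standing in for whatever label_next_batch computes internally:

import numpy as np

def select_next_batch_sketch(pool_indices, scores, k=16):
    """Return the k pool indices with the highest uncertainty scores.

    scores: one uncertainty value per entry of pool_indices (hypothetical;
    the real ActiveLearningDataset derives these from the model).
    """
    order = np.argsort(-np.asarray(scores))            # most uncertain first
    picked = [pool_indices[i] for i in order[:k]]
    remaining = [idx for idx in pool_indices if idx not in picked]
    return picked, remaining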
Example #8
def trainval(exp_dict, savedir, datadir, metrics_flag=True):
    # TODO: Do we get similar results with different seeds?
    # Set seed
    np.random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)

    pprint.pprint(exp_dict)

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = DataLoader(train_set,
                              drop_last=True,
                              shuffle=True,
                              batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Load model
    model = models.get_model(exp_dict["model"], train_set=train_set).cuda()

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Resume from last saved state_dict
    if (not os.path.exists(savedir + "/run_dict.pkl")
            or not os.path.exists(savedir + "/score_list.pkl")):
        ut.save_pkl(savedir + "/run_dict.pkl", {"running": 1})
        score_list = []
        s_epoch = 0
    else:
        score_list = ut.load_pkl(savedir + "/score_list.pkl")
        model.load_state_dict(torch.load(savedir + "/model_state_dict.pth"))
        opt.load_state_dict(torch.load(savedir + "/opt_state_dict.pth"))
        s_epoch = score_list[-1]["epoch"] + 1

    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        # Set seed
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name=exp_dict["loss_func"])

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..." %
              (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            images, labels = images.cuda(), labels.cuda()

            opt.zero_grad()

            if exp_dict["opt"]["name"] in exp_configs.ours_opt_list + ["l4"]:
                closure = lambda: loss_function(
                    model, images, labels, backwards=False)
                opt.step(closure)

            else:
                loss = loss_function(model, images, labels)
                loss.backward()
                opt.step()

        e_time = time.time()

        # Record step size and batch size
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        ut.save_pkl(savedir + "/score_list.pkl", score_list)
        ut.torch_save(savedir + "/model_state_dict.pth", model.state_dict())
        ut.torch_save(savedir + "/opt_state_dict.pth", opt.state_dict())
        print("Saved: %s" % savedir)

    return score_list
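The closure branch above exists because line-search-style optimizers need to re-evaluate the loss at trial step sizes, so they take a zero-argument function instead of a precomputed gradient; note that the example's closure passes backwards=False because that optimizer runs the backward pass itself. Stock PyTorch uses the same protocol for LBFGS, where the closure must also call backward; a small self-contained illustration:

import torch
import torch.nn.functional as F

model = torch.nn.Linear(4, 1)
opt = torch.optim.LBFGS(model.parameters(), lr=0.1)
x, y = torch.randn(8, 4), torch.randn(8, 1)

def closure():
    # LBFGS may call this several times per step; it must recompute the
    # loss (and, for LBFGS, the gradients) on every call.
    opt.zero_grad()
    loss = F.mse_loss(model(x), y)
    loss.backward()
    return loss

opt.step(closure)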
Example #9
    datadir = '/mnt/public/datasets/DeepFish/'

    score_list = []
    for hash_id in hash_list:
        fname = '/mnt/public/predictions/habitat/%s.pkl' % hash_id
        exp_dict = hu.load_json(
            os.path.join(savedir_base, hash_id, 'exp_dict.json'))
        if os.path.exists(fname):
            print('FOUND:', fname)
            val_dict = hu.load_pkl(fname)
        else:

            train_set = datasets.get_dataset(
                dataset_dict=exp_dict["dataset"],
                split='train',
                datadir=datadir,
                exp_dict=exp_dict,
                dataset_size=exp_dict['dataset_size'])

            test_set = datasets.get_dataset(
                dataset_dict=exp_dict["dataset"],
                split='test',
                datadir=datadir,
                exp_dict=exp_dict,
                dataset_size=exp_dict['dataset_size'])

            test_loader = DataLoader(test_set,
                                     batch_size=1,
                                     collate_fn=ut.collate_fn,
                                     num_workers=0)
            pprint.pprint(exp_dict)
Example #10
def trainval(exp_dict, savedir, args):
    """
    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """

    # set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)
    if args.use_cuda:
        device = 'cuda'
        torch.cuda.manual_seed_all(seed)
        assert torch.cuda.is_available(), 'cuda is not available, please run with "-c 0"'
    else:
        device = 'cpu'

    print('Running on device: %s' % device)

    # Dataset
    # Load val set and train set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   split="val",
                                   transform=exp_dict.get("transform"),
                                   datadir=args.datadir)
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     split="train",
                                     transform=exp_dict.get("transform"),
                                     datadir=args.datadir)

    # Load train loader, val loader, and vis loader
    train_loader = DataLoader(train_set,
                              sampler=RandomSampler(
                                  train_set,
                                  replacement=True,
                                  num_samples=max(min(500, len(train_set)),
                                                  len(val_set))),
                              batch_size=exp_dict["batch_size"])

    val_loader = DataLoader(val_set,
                            shuffle=False,
                            batch_size=exp_dict["batch_size"])
    vis_loader = DataLoader(val_set,
                            sampler=ut.SubsetSampler(val_set,
                                                     indices=[0, 1, 2]),
                            batch_size=1)

    # Create model, opt, wrapper on the selected device
    # (so the cpu path above also works)
    model_original = models.get_model(exp_dict["model"],
                                      exp_dict=exp_dict).to(device)
    opt = torch.optim.Adam(model_original.parameters(),
                           lr=1e-5,
                           weight_decay=0.0005)

    model = wrappers.get_wrapper(exp_dict["wrapper"],
                                 model=model_original,
                                 opt=opt).to(device)

    score_list = []

    # Checkpointing
    # =============
    score_list_path = os.path.join(savedir, "score_list.pkl")
    model_path = os.path.join(savedir, "model_state_dict.pth")
    opt_path = os.path.join(savedir, "opt_state_dict.pth")

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = hu.load_pkl(score_list_path)
        model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]["epoch"] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Run training and validation
    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}

        # visualize
        model.vis_on_loader(vis_loader,
                            savedir=os.path.join(savedir, "images"))
        # validate
        score_dict.update(model.val_on_loader(val_loader))

        # train
        score_dict.update(model.train_on_loader(train_loader))

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved in %s" % savedir)
Example #11
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # resolve the model class named on the command line; eval() on CLI input is fragile
    model = eval(args.model)(dataset=args.dataset, device=args.device)

    # for fine-tuning a pre-trained model, we strip out the last fc layer
    if args.save_path:
        saved_dict = torch.load(args.save_path)
        del saved_dict["model.module.fc.weight"]
        del saved_dict["model.module.fc.bias"]
        model.load_state_dict(saved_dict, strict=False)

    model.train()

    train_loader = DataLoader(get_dataset(args.dataset, "train"),
                              shuffle=True,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              pin_memory=False)

    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=1e-4,
                          nesterov=True)
    annealer = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_epochs)

    loss_meter = meter.AverageValueMeter()
    time_meter = meter.TimeMeter(unit=False)
Example #12
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])

    # test set
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split="test",
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])

    # val_sampler = torch.utils.data.SequentialSampler(val_set)
    val_loader = DataLoader(
        val_set,
        # sampler=val_sampler,
        batch_size=1,
        collate_fn=ut.collate_fn,
        num_workers=num_workers)
    test_loader = DataLoader(
        test_set,
        # sampler=val_sampler,
        batch_size=1,
        collate_fn=ut.collate_fn,
        num_workers=num_workers)

    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    # model.opt = optimizers.get_optim(exp_dict['opt'], model)
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))
    model.waiting = 0
    model.val_score_best = -np.inf

    train_sampler = torch.utils.data.RandomSampler(train_set,
                                                   replacement=True,
                                                   num_samples=2 * len(test_set))

    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              collate_fn=ut.collate_fn,
                              batch_size=exp_dict["batch_size"],
                              drop_last=True,
                              num_workers=num_workers)

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Evaluate on the test set each epoch (with a few saved images)
        score_dict = {}
        test_dict = model.val_on_loader(test_loader,
                                        savedir_images=os.path.join(
                                            savedir, "images"),
                                        n_images=3)
        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate the model
        val_dict = model.val_on_loader(val_loader)
        score_dict["val_score"] = val_dict["val_score"]

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = e
        score_dict["waiting"] = model.waiting

        model.waiting += 1

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Save Best Checkpoint
        score_df = pd.DataFrame(score_list)
        if score_dict["val_score"] >= model.val_score_best:
            test_dict = model.val_on_loader(test_loader,
                                            savedir_images=os.path.join(
                                                savedir, "images"),
                                            n_images=3)
            score_dict.update(test_dict)
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            # score_df.to_csv(os.path.join(savedir, "score_best_df.csv"))
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            model.waiting = 0
            model.val_score_best = score_dict["val_score"]
            print("Saved Best: %s" % savedir)

        # Report & Save
        score_df = pd.DataFrame(score_list)
        # score_df.to_csv(os.path.join(savedir, "score_df.csv"))
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        if model.waiting > 100:
            break

    print('Experiment completed at epoch %d' % e)
Example #13
def trainval_svrg(exp_dict, savedir, datadir, metrics_flag=True):
    '''
        SVRG-specific training and validation loop.
    '''
    pprint.pprint(exp_dict)

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = DataLoader(train_set,
                              drop_last=False,
                              shuffle=True,
                              batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Load model
    model = models.get_model(exp_dict["model"], train_set=train_set).cuda()

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # lookup the learning rate
    lr = get_svrg_step_size(exp_dict)

    # Load Optimizer
    opt = get_svrg_optimizer(model,
                             loss_function,
                             train_loader=train_loader,
                             lr=lr)

    # Resume from last saved state_dict
    if (not os.path.exists(savedir + "/run_dict.pkl")
            or not os.path.exists(savedir + "/score_list.pkl")):
        ut.save_pkl(savedir + "/run_dict.pkl", {"running": 1})
        score_list = []
        s_epoch = 0
    else:
        score_list = ut.load_pkl(savedir + "/score_list.pkl")
        model.load_state_dict(torch.load(savedir + "/model_state_dict.pth"))
        opt.load_state_dict(torch.load(savedir + "/opt_state_dict.pth"))
        s_epoch = score_list[-1]["epoch"] + 1

    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name=exp_dict["loss_func"])

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..." %
              (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            images, labels = images.cuda(), labels.cuda()

            opt.zero_grad()
            closure = lambda svrg_model: loss_function(
                svrg_model, images, labels, backwards=True)
            opt.step(closure)

        e_time = time.time()

        # Record step size and batch size
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        ut.save_pkl(savedir + "/score_list.pkl", score_list)
        ut.torch_save(savedir + "/model_state_dict.pth", model.state_dict())
        ut.torch_save(savedir + "/opt_state_dict.pth", opt.state_dict())
        print("Saved: %s" % savedir)

    return score_list
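For reference, SVRG reduces gradient variance by keeping a snapshot of the weights, computing the full gradient at the snapshot once per outer loop, and correcting each stochastic gradient with the difference between the sample gradient at the current point and at the snapshot. A generic NumPy sketch of one outer iteration (not the project's get_svrg_optimizer):

import numpy as np

def svrg_outer_step(w, grad_i, n, lr, inner_steps, rng=np.random):
    """One SVRG outer iteration for a finite sum of n sample losses.

    grad_i(w, i) returns the gradient of the i-th sample's loss at w.
    """
    snapshot = w.copy()
    mu = np.mean([grad_i(snapshot, i) for i in range(n)], axis=0)  # full gradient
    for _ in range(inner_steps):
        i = rng.randint(n)
        g = grad_i(w, i) - grad_i(snapshot, i) + mu  # variance-reduced gradient
        w = w - lr * g
    return w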
Example #14
def trainval(exp_dict, savedir_base, datadir, reset=False, num_workers=0):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        hc.delete_and_backup_experiment(savedir)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])

    val_sampler = torch.utils.data.SequentialSampler(val_set)
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=1,
                            num_workers=num_workers)
    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    # model.opt = optimizers.get_optim(exp_dict['opt'], model)
    model_path = os.path.join(savedir, "model.pth")
    score_list_path = os.path.join(savedir, "score_list.pkl")

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d" % (s_epoch))

    train_sampler = torch.utils.data.RandomSampler(train_set,
                                                   replacement=True,
                                                   num_samples=2 * len(val_set))

    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              batch_size=exp_dict["batch_size"],
                              drop_last=True,
                              num_workers=num_workers)

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Train, then validate and visualize, each epoch
        score_dict = {}

        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate and Visualize the model
        val_dict = model.val_on_loader(val_loader,
                                       savedir_images=os.path.join(
                                           savedir, "images"),
                                       n_images=3)
        score_dict.update(val_dict)
        # model.vis_on_loader(
        #     vis_loader, savedir=os.path.join(savedir, "images"))

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = len(score_list)

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Save Best Checkpoint
        if e == 0 or (score_dict.get("val_score", 0) >
                      score_df["val_score"][:-1].fillna(0).max()):
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

    print('Experiment completed at epoch %d' % e)
Example #15
def trainval(exp_dict, savedir, args):
    """
    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """
    datadir = args.datadir 
    # set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # ==================
    # train set
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir=datadir,
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])
    # val set
    val_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                   split="val",
                                   datadir=datadir,
                                   exp_dict=exp_dict,
                                   dataset_size=exp_dict['dataset_size'])

    # test set
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split="test",
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])


    # val_sampler = torch.utils.data.SequentialSampler(val_set)
    val_loader = DataLoader(val_set,
                            # sampler=val_sampler,
                            batch_size=exp_dict["batch_size"],
                            collate_fn=ut.collate_fn,
                            num_workers=args.num_workers,
                            drop_last=False)

    test_loader = DataLoader(test_set,
                            # sampler=val_sampler,
                            batch_size=1,
                            collate_fn=ut.collate_fn,
                            num_workers=args.num_workers)

    # Model 
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=train_set).cuda()

    chk_dict = hw.get_checkpoint(savedir)
    score_list = chk_dict['score_list']

    # Train & Val
    # ==================
    model.waiting = 0
    model.val_score_best = -np.inf
    
    sampler = exp_dict['dataset'].get('sampler', 'random')
    if sampler == 'random':
        train_sampler = torch.utils.data.RandomSampler(
            train_set, replacement=True, num_samples=len(val_set))
    elif sampler == 'balanced':
        train_sampler = samplers.BalancedSampler(
            train_set, n_samples=len(val_set))
    else:
        raise ValueError('unknown sampler: %s' % sampler)
    train_loader = DataLoader(train_set,
                            sampler=train_sampler,
                            collate_fn=ut.collate_fn,
                            batch_size=exp_dict["batch_size"], 
                            drop_last=True, 
                            num_workers=args.num_workers)
    
    for e in range(chk_dict['epoch'], exp_dict['max_epoch']):
        # Train and validate each epoch
        score_dict = {}
        # Train the model
        train_dict = model.train_on_loader(train_loader)

        # Validate the model
        val_dict = model.val_on_loader(val_loader, 
                                       savedir_images=os.path.join(savedir, "images"), n_images=5)
        score_dict.update(val_dict)

        # Get new score_dict
        score_dict.update(train_dict)
        score_dict["epoch"] = e
        score_dict["waiting"] = model.waiting

        model.waiting += 1

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Save Best Checkpoint
        score_df = pd.DataFrame(score_list)
        if score_dict["val_score"] >= model.val_score_best:
            test_dict = model.val_on_loader(test_loader,
                                    savedir_images=os.path.join(savedir, "images"),
                                    n_images=3)  
            score_dict.update(test_dict)

            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"), score_list)
            # score_df.to_csv(os.path.join(savedir, "score_best_df.csv"))
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                        model.get_state_dict())
            model.waiting = 0
            model.val_score_best = score_dict["val_score"]
            print("Saved Best: %s" % savedir)

        # Report & Save
        hw.save_checkpoint(savedir, score_list=score_list)

        if model.waiting > 100:
            break

    print('Experiment completed at epoch %d' % e)
Example #16
def train(exp_dict, savedir_base, reset, compute_fid=False):
    # Book keeping
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)
    if reset:
        ut.rmtree(savedir)
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    print('Experiment saved in %s' % savedir)

    device = \
        torch.device('cuda:' + exp_dict['gpu'] if torch.cuda.is_available() else 'cpu')

    # 1. Load dataset and loader
    train_set, test_set, num_channels, num_train_classes, num_test_classes = \
        datasets.get_dataset(exp_dict['dataset'],
                             dataset_path=savedir_base,
                             image_size=exp_dict['image_size'])
    train_loader, test_loader = \
            dataloaders.get_dataloader(exp_dict['dataloader'],
                                       train_set, test_set, exp_dict)

    # 2. Fetch model to train
    model = models.get_model(exp_dict['model'], num_train_classes,
                             num_test_classes, num_channels, device, exp_dict)

    # 3. Resume experiment or start from scratch
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    if os.path.exists(score_list_path):
        # Resume experiment if it exists
        model_path = os.path.join(savedir, 'model_state_dict.pth')
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        meta_dict_path = os.path.join(savedir, 'meta_dict.pkl')
        meta_dict = hu.load_pkl(meta_dict_path)
        print('Resuming experiment at episode %d epoch %d' %
              (meta_dict['episode'], meta_dict['epoch']))
    else:
        # Start experiment from scratch
        meta_dict = {'episode': 1, 'epoch': 1}
        score_list = []

        # Remove TensorBoard logs from previous runs
        ut.rmtree(os.path.join(savedir, 'tensorboard_logs'))

        print('Starting experiment at episode %d epoch %d' %
              (meta_dict['episode'], meta_dict['epoch']))

    # 4. Train and eval loop
    s_epoch = meta_dict['epoch']
    for e in range(s_epoch, exp_dict['num_epochs'] + 1):
        # 0. Initialize dicts
        score_dict = {'epoch': e}
        meta_dict['epoch'] = e

        # 1. Train on loader
        train_dict = model.train_on_loader(train_loader)

        # 1b. Compute FID
        if compute_fid:
            if e % 20 == 0 or e == 1 or e == exp_dict['num_epochs']:
                print('Starting FID computation...')
                train_dict['fid'] = fid(model, train_loader.dataset,
                                        train_loader.sampler, savedir)

        score_dict.update(train_dict)

        # 2. Eval on loader
        eval_dict = model.val_on_loader(test_loader, savedir, e)
        score_dict.update(eval_dict)

        # 3. Report and save model state, optimizer state, and scores
        score_list += [score_dict]
        score_df = pd.DataFrame(score_list)
        print('\n', score_df.tail(), '\n')
        if e % 10 == 0:
            hu.torch_save(os.path.join(savedir, 'model_state_dict.pth'),
                          model.get_state_dict())
        hu.save_pkl(os.path.join(savedir, 'score_list.pkl'), score_list)
        hu.save_pkl(os.path.join(savedir, 'meta_dict.pkl'), meta_dict)
Example #17
def trainval(exp_dict, savedir, args):
    """
    exp_dict: dictionary defining the hyperparameters of the experiment
    savedir: the directory where the experiment will be saved
    args: arguments passed through the command line
    """

    # set seed
    # ==================
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)

    # helen: commented out the following lines and hard-coded the device to
    # 'cpu' to resolve errors
    # if args.use_cuda:
    #     device = 'cuda'
    #     torch.cuda.manual_seed_all(seed)
    #     assert torch.cuda.is_available(), 'cuda is not available, please run with "-c 0"'
    # else:
    device = 'cpu'

    print('Running on device: %s' % device)

    # Dataset
    # Load val set and train set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   split="val",
                                   transform=exp_dict.get("transform"),
                                   datadir=args.datadir)
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     split="train",
                                     transform=exp_dict.get("transform"),
                                     datadir=args.datadir)

    # Load train loader, val loader, and vis loader
    train_loader = DataLoader(train_set,
                              sampler=RandomSampler(
                                  train_set,
                                  replacement=True,
                                  num_samples=max(min(500, len(train_set)),
                                                  len(val_set))),
                              batch_size=exp_dict["batch_size"])

    val_loader = DataLoader(val_set,
                            shuffle=False,
                            batch_size=exp_dict["batch_size"])
    vis_loader = DataLoader(val_set,
                            sampler=ut.SubsetSampler(val_set,
                                                     indices=[0, 1, 2]),
                            batch_size=1)

    # Create model, opt, wrapper on the selected device (cpu here)
    model_original = models.get_model(exp_dict["model"],
                                      exp_dict=exp_dict).to(device)
    opt = torch.optim.Adam(model_original.parameters(),
                           lr=1e-5,
                           weight_decay=0.0005)

    model = wrappers.get_wrapper(exp_dict["wrapper"],
                                 model=model_original,
                                 opt=opt).to(device)

    score_list = []

    # Checkpointing
    # =============
    # helen: commented out these three lines and hard-coded the model and opt
    # paths to resolve errors
    # score_list_path = os.path.join(savedir, "score_list.pkl")
    # model_path = os.path.join(savedir, "model_state_dict.pth")
    # opt_path = os.path.join(savedir, "opt_state_dict.pth")
    score_list_path = '/Users/helenpropson/Documents/git/marepesca/results/testresults/score_list.pkl'
    model_path = '/Users/helenpropson/Documents/git/marepesca/results/testresults/model_state_dict.pth'
    opt_path = '/Users/helenpropson/Documents/git/marepesca/results/testresults/opt_state_dict.pth'

    # helen: hard-coded that the experiment resumes instead of restarting
    # from epoch 0; changed ut.load_pkl to hu.load_pkl to resolve an error
    # if os.path.exists(score_list_path):
    # resume experiment
    score_list = hu.load_pkl(score_list_path)
    model.load_state_dict(torch.load(model_path))
    opt.load_state_dict(torch.load(opt_path))
    s_epoch = score_list[-1]["epoch"] + 1

    # else:
    #     # restart experiment
    #     score_list = []
    #     s_epoch = 0

    # ***************  helen added this code
    im = Image.open("/Users/helenpropson/Documents/git/marepesca/tank.jpg")
    # im.show()  # uncomment to display the image the model runs on

    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    normalize_transform = transforms.Normalize(mean=mean, std=std)

    # transformations applied to the image
    data_transform = transforms.Compose(
        [transforms.ToTensor(), normalize_transform])
    im_new = data_transform(im)  # converts the image to a normalized tensor
    im_final = im_new.unsqueeze(0)  # adds a batch dimension so the image matches the model's expected shape
    print("now trying helen's code")  # debugging print
    # model.vis_on_batch_helen(im_final, f'im_new')  # uncomment to run the model on the image

    # ***************  end of helen's code

    # Run training and validation
    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}

        # visualize
        model.vis_on_loader(vis_loader,
                            savedir=os.path.join(savedir, "images"))

        print("after vis_on_loader"
              )  #helen add this print statement as an update while iterating

        # validate
        score_dict.update(model.val_on_loader(val_loader))

        print("after validate")

        # train
        score_dict.update(model.train_on_loader(train_loader))

        print("after train")

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved in %s" % savedir)
Example #19
    # lcfcn loss with_affinity=True
    # hash_dir = '84ced18cf5c1fb3ad5820cc1b55a38fa'

    # point level
    # hash_dir = 'd7040c9534b08e765f48c6cb034b26b2'

    # LCFCN
    hash_dir = 'bcba046296675e9e3af5cd9f353d217b'

    savedir = '/mnt/public/predictions'
    datadir = '/mnt/public/datasets/DeepFish/'

    split = 'test'
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split=split,
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])
    test_loader = DataLoader(
        test_set,
        # sampler=val_sampler,
        batch_size=1,
        collate_fn=ut.collate_fn,
        num_workers=0)

    # Model
    # ==================
    model = models.get_model(model_dict=exp_dict['model'],
                             exp_dict=exp_dict,
                             train_set=test_set).cuda()
Example #20
def newminimum(exp_id,
               savedir_base,
               datadir,
               name,
               exp_dict,
               metrics_flag=True):
    # bookkeeping
    # ---------------

    # get experiment directory
    old_modeldir = os.path.join(savedir_base, exp_id)
    savedir = os.path.join(savedir_base, exp_id, name)

    old_exp_dict = hu.load_json(os.path.join(old_modeldir, 'exp_dict.json'))

    # TODO: compare exp dict for possible errors:
    # optimizer have to be the same
    # same network, dataset

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ---------------
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # -----------

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = torch.utils.data.DataLoader(
        train_set,
        drop_last=True,
        shuffle=True,
        batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Model
    # -----------
    model = models.get_model(exp_dict["model"], train_set=train_set)

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    opt_path = os.path.join(savedir, 'opt_state_dict.pth')

    old_model_path = os.path.join(old_modeldir, 'model.pth')
    old_score_list_path = os.path.join(old_modeldir, 'score_list.pkl')
    old_opt_path = os.path.join(old_modeldir, 'opt_state_dict.pth')

    score_list = hu.load_pkl(old_score_list_path)
    model.load_state_dict(torch.load(old_model_path))
    opt.load_state_dict(torch.load(old_opt_path))
    s_epoch = score_list[-1]['epoch'] + 1

    # save current model state for comparison
    minimum = []

    for param in model.parameters():
        minimum.append(param.clone())

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d/%d' %
          (s_epoch, exp_dict['max_epoch']))

    for epoch in range(s_epoch, exp_dict['max_epoch']):
        # Set seed
        np.random.seed(exp_dict['runs'] + epoch)
        torch.manual_seed(exp_dict['runs'] + epoch)
        # torch.cuda.manual_seed_all(exp_dict['runs']+epoch) not needed since no cuda available

        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name='softmax_loss')
            #                                    metric_name=exp_dict["loss_func"])
            # TODO: which loss should be used? (normal or with regularizer?)

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..." %
              (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            # images, labels = images.cuda(), labels.cuda() no cuda available

            opt.zero_grad()
            loss = loss_function(model, images, labels, minimum,
                                 0.1)  # only works with the custom loss function
            loss.backward()
            opt.step()

        e_time = time.time()

        # Record metrics
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

        with torch.no_grad():
            print('Current distance: %f' %
                  metrics.computedistance(minimum, model))

    print('Experiment completed')
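The "custom loss function" this example assumes takes the stored minimum and a coefficient (0.1 above), which suggests a proximity penalty that keeps the new solution near the old one. A hedged sketch of such a loss (the cross-entropy base term and the signature are assumptions, not the project's definition):

import torch
import torch.nn.functional as F

def loss_with_proximity(model, images, labels, minimum, coeff):
    """Task loss plus a squared-distance penalty to stored parameters."""
    ce = F.cross_entropy(model(images), labels)
    prox = sum(((p - m) ** 2).sum()
               for p, m in zip(model.parameters(), minimum))
    return ce + coeff * prox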
Example #21
def test(exp_dict,
         savedir_base,
         datadir,
         num_workers=0,
         model_path=None,
         scan_id=None):
    # bookkeeping stuff
    # ==================
    pprint.pprint(exp_dict)
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)
    print("Experiment saved in %s" % savedir)

    # Dataset
    # ==================
    # use the val split as the held-out test set
    test_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                    split="val",
                                    datadir=datadir,
                                    exp_dict=exp_dict,
                                    dataset_size=exp_dict['dataset_size'])
    if str(scan_id) != 'None':
        test_set.active_data = test_set.get_scan(scan_id)
    test_sampler = torch.utils.data.SequentialSampler(test_set)
    test_loader = DataLoader(test_set,
                             sampler=test_sampler,
                             batch_size=1,
                             collate_fn=ut.collate_fn,
                             num_workers=num_workers)

    # Model
    # ==================
    # chk = torch.load('best_model.ckpt')
    model = models.get_model_for_onnx_export(model_dict=exp_dict['model'],
                                             exp_dict=exp_dict,
                                             train_set=test_set).cuda()
    epoch = -1

    if str(model_path) != 'None':
        model.load_state_dict(hu.torch_load(model_path))
    else:
        try:
            exp_dict_train = copy.deepcopy(exp_dict)
            del exp_dict_train['test_mode']
            savedir_train = os.path.join(savedir_base,
                                         hu.hash_dict(exp_dict_train))
            model_path = os.path.join(savedir_train, "model_best.pth")
            score_list = hu.load_pkl(
                os.path.join(savedir_train, 'score_list_best.pkl'))
            epoch = score_list[-1]['epoch']
            print('Loaded model from epoch %d' % epoch)
            model.load_state_dict(hu.torch_load(model_path))
        except Exception:
            # fall back to the freshly initialized model if no checkpoint is found
            pass

    s_time = time.time()
    savedir_images = os.path.join(savedir, 'images')

    # delete image folder if exists
    if os.path.exists(savedir_images):
        shutil.rmtree(savedir_images)

    os.makedirs(savedir_images, exist_ok=True)
    # for i in range(20):
    #     score_dict = model.train_on_loader(test_loader)
    score_dict = model.val_on_loader(test_loader,
                                     savedir_images=savedir_images,
                                     n_images=30000,
                                     save_preds=True)

    score_dict['epoch'] = epoch
    score_dict["time"] = time.time() - s_time
    score_dict["saved_at"] = hu.time_to_montreal()
    # save test_score_list
    test_path = os.path.join(savedir, "score_list.pkl")
    if os.path.exists(test_path):
        test_score_list = [
            sd for sd in hu.load_pkl(test_path) if sd['epoch'] != epoch
        ]
    else:
        test_score_list = []

    # append score_dict to last result
    test_score_list += [score_dict]
    hu.save_pkl(test_path, test_score_list)
    print('Final Score is %s\n' % score_dict["val_score"])
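The bookkeeping above derives the experiment directory from `hu.hash_dict(exp_dict)`, so identical hyperparameter dictionaries always resolve to the same folder. A plausible sketch of that convention, assuming a sorted-JSON/MD5 scheme (the actual haven implementation may differ):

import hashlib
import json
import os

def hash_dict(exp_dict):
    # Serialize with sorted keys so key order never changes the hash.
    payload = json.dumps(exp_dict, sort_keys=True).encode('utf-8')
    return hashlib.md5(payload).hexdigest()

exp_dict = {'dataset': 'mnist', 'lr': 1e-6}  # toy example
savedir = os.path.join('/tmp/results', hash_dict(exp_dict))
os.makedirs(savedir, exist_ok=True)  # idempotent: safe to rerun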
Ejemplo n.º 22
        'lr': 1e-06,
        'max_epoch': 100,
        'model': {
            'base': 'fcn8_vgg16',
            'loss': 'point_level',
            'n_channels': 3,
            'n_classes': 2,
            'name': 'semseg'
        },
        'num_channels': 1,
        'optimizer': 'adam'
    }
    pprint.pprint(exp_dict)
    train_set = datasets.get_dataset(dataset_dict=exp_dict["dataset"],
                                     split="train",
                                     datadir='/mnt/public/datasets/DeepFish',
                                     exp_dict=exp_dict,
                                     dataset_size=exp_dict['dataset_size'])

    model_seam = resnet38_SEAM.Net().cuda()
    model_seam.load_state_dict(
        torch.load(os.path.join('/mnt/public/weights', 'resnet38_SEAM.pth')))

    model_aff = resnet38_aff.Net().cuda()
    model_aff.load_state_dict(torch.load(
        os.path.join('/mnt/public/weights', 'resnet38_aff_SEAM.pth')),
                              strict=False)

    # ut.generate_seam_segmentation(train_set,
    #                               path_base='/mnt/datasets/public/issam/seam',
    #                             #   path_base='D:/Issam/SEAM_model/'
Ejemplo n.º 23
def trainval(exp_dict, savedir_base, datadir_base, reset=False,
             num_workers=0, pin_memory=False, ngpu=1, cuda_deterministic=False):
    # bookkeeping
    # ==================

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    if DEVICE.type == "cuda":
        if cuda_deterministic:
            cudnn.benchmark = False
            cudnn.deterministic = True
        else:
            cudnn.benchmark = True

    # Dataset
    # ==================
    trainset = get_dataset(exp_dict['dataset'], 'train',
                           exp_dict=exp_dict, datadir_base=datadir_base,
                           n_samples=exp_dict['dataset_size']['train'],
                           transform_lvl=exp_dict['dataset']['transform_lvl'],
                           colorjitter=exp_dict['dataset'].get('colorjitter')
                           )

    valset = get_dataset(exp_dict['dataset'], 'validation',
                         exp_dict=exp_dict, datadir_base=datadir_base,
                         n_samples=exp_dict['dataset_size']['train'],
                         transform_lvl=0,
                         val_transform=exp_dict['dataset']['val_transform'])

    testset = get_dataset(exp_dict['dataset'], 'test',
                          exp_dict=exp_dict, datadir_base=datadir_base,
                          n_samples=exp_dict['dataset_size']['test'],
                          transform_lvl=0,
                          val_transform=exp_dict['dataset']['val_transform'])
    print("Dataset defined.")

    # define dataloaders
    if exp_dict['dataset']['name'] == 'bach':
        testloader = torch.utils.data.DataLoader(testset, batch_size=1,
                                                 shuffle=False,
                                                 num_workers=num_workers,
                                                 pin_memory=pin_memory)
    else:
        testloader = torch.utils.data.DataLoader(testset, batch_size=exp_dict['batch']['size'],
                                                 shuffle=False,
                                                 num_workers=num_workers,
                                                 pin_memory=pin_memory)

    print("Testloader  defined.")

    # Model
    # ==================
    model = get_model(exp_dict, trainset, device=DEVICE)

    print("Model loaded")

    model_path = os.path.join(savedir, 'model.pth')
    model_best_path = os.path.join(savedir, 'model_best.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')

    # checkpoint management
    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(model_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = len(score_list)
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # define and log random seed for reproducibility
    assert('fixedSeed' in exp_dict)
    seed = exp_dict['fixedSeed']

    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    print("Seed defined.")

    # Train & Val
    # ==================
    print("Starting experiment at epoch %d/%d" % (s_epoch, exp_dict['niter']))

    for epoch in range(s_epoch, exp_dict['niter']):
        s_time = time.time()
        # Sample new train val
        trainloader, valloader = get_train_val_dataloader(exp_dict,
                                                          trainset, valset,
                                                          mixtrainval=exp_dict['mixTrainVal'],
                                                          num_workers=num_workers,
                                                          pin_memory=pin_memory)
        # Train & validate
        train_dict = model.train_on_loader(trainloader, valloader, epoch=epoch,
                                           exp_dict=exp_dict)

        # Test phase
        train_dict_2 = model.test_on_loader(trainloader)
        val_dict = model.test_on_loader(valloader)
        test_dict = model.test_on_loader(testloader)

        # Vis phase
        model.vis_on_loader('train', trainset, savedir_images=os.path.join(
            savedir, 'images'), epoch=epoch)

        score_dict = {}
        score_dict["epoch"] = epoch
        score_dict["test_acc"] = test_dict['acc']
        score_dict["val_acc"] = val_dict['acc']
        score_dict["train_acc"] = train_dict_2['acc']
        score_dict["train_loss"] = train_dict['loss']
        score_dict["time_taken"] = time.time() - s_time
        score_dict["netC_lr"] = train_dict['netC_lr']

        if exp_dict['model']['netA'] is not None:
            if 'transformations_mean' in train_dict:
                for i, t in enumerate(train_dict['transformations_mean']):
                    score_dict['%d_mean' % i] = t.item()
            if 'transformations_std' in train_dict:
                for i, t in enumerate(train_dict['transformations_std']):
                    score_dict['%d_std' % i] = t.item()

        # Add to score_list and save checkpoint
        score_list += [score_dict]

        # Report & Save
        score_df = pd.DataFrame(score_list)
        print("\n", score_df.tail(), "\n")
        hu.torch_save(model_path, model.get_state_dict())
        hu.save_pkl(score_list_path, score_list)
        print("Checkpoint Saved: %s" % savedir)

        # Update best score
        if epoch == 0 or (score_dict["test_acc"] >= score_df["test_acc"][:-1].max()):
            hu.save_pkl(os.path.join(
                savedir, "score_list_best.pkl"), score_list)
            hu.torch_save(os.path.join(savedir, "model_best.pth"),
                          model.get_state_dict())

            print("Saved Best: %s" % savedir)

    print('Experiment completed')
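The loop above redraws the train/val split on every epoch via `get_train_val_dataloader(..., mixtrainval=...)`. A minimal sketch of that idea with stock PyTorch, assuming a simple fractional split (the helper name below is hypothetical, not the repository's function):

import torch
from torch.utils.data import DataLoader, random_split

def sample_train_val_loaders(dataset, val_fraction=0.1, batch_size=64, seed=0):
    # Re-partition the same dataset into train/val subsets; calling this
    # once per epoch with a fresh seed mimics 'mixTrainVal' resampling.
    n_val = int(len(dataset) * val_fraction)
    generator = torch.Generator().manual_seed(seed)
    train_subset, val_subset = random_split(
        dataset, [len(dataset) - n_val, n_val], generator=generator)
    return (DataLoader(train_subset, batch_size=batch_size, shuffle=True),
            DataLoader(val_subset, batch_size=batch_size, shuffle=False))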
Ejemplo n.º 24
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        '--datadir',
                        type=str,
                        default='/mnt/public/datasets/DeepFish')
    parser.add_argument("-e", "--exp_config", default='loc')
    parser.add_argument("-uc", "--use_cuda", type=int, default=0)
    args = parser.parse_args()

    device = torch.device('cuda' if args.use_cuda else 'cpu')

    exp_dict = exp_configs.EXP_GROUPS[args.exp_config][0]
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     split="train",
                                     transform=exp_dict.get("transform"),
                                     datadir=args.datadir)

    # Create model, opt, wrapper
    model_original = models.get_model(exp_dict["model"],
                                      exp_dict=exp_dict).to(device)
    opt = torch.optim.Adam(model_original.parameters(),
                           lr=1e-5,
                           weight_decay=0.0005)

    model = wrappers.get_wrapper(exp_dict["wrapper"],
                                 model=model_original,
                                 opt=opt).to(device)

    if args.exp_config == 'loc':
        batch = torch.utils.data.dataloader.default_collate([train_set[3]])
Ejemplo n.º 25
    'pascal': '/mnt/datasets/public/issam',
    'kitti': '/mnt/datasets/public/issam'
}

if __name__ == "__main__":

    for exp_group in [
            'weakly_covid19_v1_c2', 'weakly_covid19_v2_mixed_c2',
            'weakly_covid19_v2_sep_c2', 'weakly_covid19_v2_mixed_c3',
            'weakly_covid19_v2_sep_c3', 'weakly_covid19_v3_mixed_c2'
    ]:
        exp_dict = exp_configs.EXP_GROUPS[exp_group][0]
        dataset_name = exp_dict['dataset']['name']
        n_classes = exp_dict['dataset']['n_classes']
        train_set = datasets.get_dataset(dataset_dict={'name': dataset_name},
                                         datadir=None,
                                         split="test",
                                         exp_dict=exp_dict)
        for i, b in enumerate(train_set):
            if b['masks'].sum() == 0:
                print(i)
                continue
            break
        batch = ut.collate_fn([b])

        image = batch['images']
        gt = np.asarray(batch['masks'], np.float32)
        gt /= (gt.max() + 1e-8)

        image = F.interpolate(image,
                              size=gt.shape[-2:],
                              mode='bilinear',
Ejemplo n.º 26
    argparser.add_argument("--data-parallel", action="store_true")
    argparser.add_argument("--use-val-set", action="store_true")
    argparser.add_argument("--focal", action="store_true")
    argparser.add_argument('--output-dir', type=str, default=os.getenv("PT_OUTPUT_DIR"))
    args = argparser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    model = eval(args.model)(dataset=args.dataset, device=args.device, precision=args.precision,
                             norm_layer=args.norm_layer, focal=args.focal)
    model = DataParallelWrapper(model) if args.data_parallel else model

    if not args.use_val_set:

        train_dataset = get_dataset(args.dataset, "train", args.precision)
        train_loader = get_dataloader(train_dataset, True, args.batch_size, args.num_workers)

        #_, subset_idxs = split_hold_out_set(train_dataset.targets, 10000)
        subset_idxs = np.random.choice(len(train_dataset), 10000, replace=False)
        train_subset_dataset = Subset(train_dataset, list(subset_idxs))
        train_subset_loader = get_dataloader(train_subset_dataset, False,
                                             args.batch_size, args.num_workers)

        test_dataset = get_dataset(args.eval_dataset or args.dataset, "test", args.precision)
        test_loader = get_dataloader(test_dataset, False, args.batch_size, args.num_workers)

        eval_loaders_and_datasets = ((train_subset_loader, len(train_subset_dataset), "train"),
                                     (test_loader, len(test_dataset), "test"))

    else:
Ejemplo n.º 27
def trainval(exp_dict, savedir_base, reset=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ---------------
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # -----------

    # train loader
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=savedir_base,
                                     exp_dict=exp_dict)

    train_loader = torch.utils.data.DataLoader(
        train_set,
        drop_last=True,
        shuffle=True,
        batch_size=exp_dict["batch_size"])

    # val set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=savedir_base,
                                   exp_dict=exp_dict)

    # Model
    # -----------
    model = models.get_model(exp_dict["model"], train_set=train_set).cuda()
    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Compute fstar
    # -------------
    if exp_dict['opt'].get('fstar_flag'):
        ut.compute_fstar(train_set, loss_function, savedir_base, exp_dict)

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt_dict=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    opt_path = os.path.join(savedir, 'opt_state_dict.pth')

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = hu.load_pkl(score_list_path)
        model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d/%d' %
          (s_epoch, exp_dict['max_epoch']))

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Set seed
        seed = e + exp_dict['runs']
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        score_dict = {}

        # Compute train loss over train set
        score_dict["train_loss"] = metrics.compute_metric_on_dataset(
            model, train_set, metric_name=exp_dict["loss_func"])

        # Compute val acc over val set
        score_dict["val_acc"] = metrics.compute_metric_on_dataset(
            model, val_set, metric_name=exp_dict["acc_func"])

        # Train over train loader
        model.train()
        print("%d - Training model with %s..." % (e, exp_dict["loss_func"]))

        # train and validate
        s_time = time.time()
        for batch in tqdm.tqdm(train_loader):
            images, labels = batch["images"].cuda(), batch["labels"].cuda()

            opt.zero_grad()

            # closure
            def closure():
                return loss_function(model, images, labels, backwards=True)

            opt.step(closure)

        e_time = time.time()

        # Record metrics
        score_dict["epoch"] = e
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["step_size_avg"] = opt.state["step_size_avg"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["grad_norm"] = opt.state["grad_norm"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

    print('Experiment completed')
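This example passes a closure to `opt.step`, letting line-search-style optimizers re-evaluate the loss as many times as they need within a single update. The same protocol is used by stock `torch.optim.LBFGS`; a self-contained sketch (toy data, not the snippet's models):

import torch
import torch.nn.functional as F

model = torch.nn.Linear(10, 1)
opt = torch.optim.LBFGS(model.parameters(), lr=0.1)
x, y = torch.randn(32, 10), torch.randn(32, 1)

def closure():
    # The optimizer may call this several times per step, so it must
    # clear gradients, recompute the loss, backprop, and return the loss.
    opt.zero_grad()
    loss = F.mse_loss(model(x), y)
    loss.backward()
    return loss

opt.step(closure)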
Ejemplo n.º 28
def trainval(exp_dict,
             savedir_base,
             reset,
             metrics_flag=True,
             datadir=None,
             cuda=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ==================
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        device = 'cuda'
        torch.cuda.manual_seed_all(seed)
    else:
        device = 'cpu'

    print('Running on device: %s' % device)

    # Dataset
    # ==================
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = DataLoader(train_set,
                              drop_last=True,
                              shuffle=True,
                              sampler=None,
                              batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Model
    # ==================
    use_backpack = exp_dict['opt'].get("backpack", False)

    model = models.get_model(exp_dict["model"],
                             train_set=train_set,
                             backpack=use_backpack).to(device=device)
    if use_backpack:
        assert exp_dict['opt']['name'] in ['nus_wrapper', 'adaptive_second']
        from backpack import extend
        model = extend(model)

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    # ==============
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch,
                                   n_train=len(train_set),
                                   train_loader=train_loader,
                                   model=model,
                                   loss_function=loss_function,
                                   exp_dict=exp_dict,
                                   batch_size=exp_dict["batch_size"])

    # Checkpointing
    # =============
    score_list_path = os.path.join(savedir, "score_list.pkl")
    model_path = os.path.join(savedir, "model_state_dict.pth")
    opt_path = os.path.join(savedir, "opt_state_dict.pth")

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = ut.load_pkl(score_list_path)
        if use_backpack:
            model.load_state_dict(torch.load(model_path), strict=False)
        else:
            model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]["epoch"] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Start Training
    # ==============
    n_train = len(train_loader.dataset)
    n_batches = len(train_loader)
    batch_size = train_loader.batch_size

    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        # Set seed
        seed = epoch + exp_dict['runs']
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        score_dict = {"epoch": epoch}

        # Validate
        # --------
        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model,
                train_set,
                metric_name=exp_dict["loss_func"],
                batch_size=exp_dict['batch_size'])

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model,
                val_set,
                metric_name=exp_dict["acc_func"],
                batch_size=exp_dict['batch_size'])

        # Train
        # -----
        model.train()
        print("%d - Training model with %s..." %
              (epoch, exp_dict["loss_func"]))

        s_time = time.time()

        train_on_loader(model, train_set, train_loader, opt, loss_function,
                        epoch, use_backpack)

        e_time = time.time()

        # Record step size and batch size
        score_dict["step"] = opt.state.get("step",
                                           0) / int(n_batches_per_epoch)
        score_dict["step_size"] = opt.state.get("step_size", {})
        score_dict["step_size_avg"] = opt.state.get("step_size_avg", {})
        score_dict["n_forwards"] = opt.state.get("n_forwards", {})
        score_dict["n_backwards"] = opt.state.get("n_backwards", {})
        score_dict["grad_norm"] = opt.state.get("grad_norm", {})
        score_dict["batch_size"] = batch_size
        score_dict["train_epoch_time"] = e_time - s_time
        score_dict.update(opt.state["gv_stats"])

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        ut.save_pkl(score_list_path, score_list)
        ut.torch_save(model_path, model.state_dict())
        ut.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

    return score_list
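The checkpointing block above is a reusable pattern: persist the score list plus model/optimizer state every epoch, and on restart resume from the last recorded epoch. A minimal sketch with plain pickle/torch, assuming the same file names as the snippet (the `ut.save_pkl`/`ut.load_pkl` helpers presumably wrap equivalent calls):

import os
import pickle
import torch

def load_checkpoint(savedir, model, opt):
    # Returns (score_list, start_epoch), restoring state if a checkpoint exists.
    score_path = os.path.join(savedir, 'score_list.pkl')
    if not os.path.exists(score_path):
        return [], 0  # fresh run
    with open(score_path, 'rb') as f:
        score_list = pickle.load(f)
    model.load_state_dict(
        torch.load(os.path.join(savedir, 'model_state_dict.pth')))
    opt.load_state_dict(
        torch.load(os.path.join(savedir, 'opt_state_dict.pth')))
    return score_list, score_list[-1]['epoch'] + 1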
Ejemplo n.º 29
    argparser.add_argument("--sigma", default=0.0, type=float)
    argparser.add_argument("--noise", default="Clean", type=str)
    argparser.add_argument("--k", default=None, type=int)
    argparser.add_argument("--j", default=None, type=int)
    argparser.add_argument("--a", default=None, type=int)
    argparser.add_argument("--lambd", default=None, type=float)
    argparser.add_argument("--adv", default=2, type=int)
    argparser.add_argument("--experiment-name", default="cifar", type=str)
    argparser.add_argument("--dataset", default="cifar", type=str)
    argparser.add_argument("--model", default="WideResNet", type=str)
    argparser.add_argument("--output-dir",
                           type=str,
                           default=os.getenv("PT_OUTPUT_DIR"))
    args = argparser.parse_args()

    test_dataset = get_dataset(args.dataset, "test")
    test_loader = DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=args.batch_size,  # todo: fix
        num_workers=args.num_workers)

    save_path = f"{args.output_dir}/{args.experiment_name}/model_ckpt.torch"
    model = eval(args.model)(dataset=args.dataset, device=args.device)
    model.load_state_dict(torch.load(save_path))
    model.eval()

    noise = parse_noise_from_args(args,
                                  device=args.device,
                                  dim=get_dim(args.dataset))
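With the model restored and `--noise`/`--sigma` parsed, a typical next step is measuring accuracy on noise-perturbed inputs. A hedged sketch for the Gaussian case only, assuming the loader yields (images, labels) pairs (the helper name is hypothetical; `parse_noise_from_args` evidently supports more noise families):

import torch

def accuracy_under_gaussian_noise(model, loader, sigma, device='cpu'):
    # Accuracy when each input is perturbed with isotropic Gaussian noise,
    # as in randomized-smoothing style evaluation.
    correct, total = 0, 0
    model.eval()
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            noisy = images + sigma * torch.randn_like(images)
            preds = model(noisy).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.numel()
    return correct / total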
Ejemplo n.º 30
def trainval(exp_dict,
             savedir_base,
             reset=False,
             num_workers=0,
             run_ssl=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # load datasets
    # ==========================
    train_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_train"],
        data_root=exp_dict["dataset_train_root"],
        split="train",
        transform=exp_dict["transform_train"],
        classes=exp_dict["classes_train"],
        support_size=exp_dict["support_size_train"],
        query_size=exp_dict["query_size_train"],
        n_iters=exp_dict["train_iters"],
        unlabeled_size=exp_dict["unlabeled_size_train"])

    val_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_val"],
        data_root=exp_dict["dataset_val_root"],
        split="val",
        transform=exp_dict["transform_val"],
        classes=exp_dict["classes_val"],
        support_size=exp_dict["support_size_val"],
        query_size=exp_dict["query_size_val"],
        n_iters=exp_dict["val_iters"],
        unlabeled_size=exp_dict["unlabeled_size_val"])

    test_set = datasets.get_dataset(
        dataset_name=exp_dict["dataset_test"],
        data_root=exp_dict["dataset_test_root"],
        split="test",
        transform=exp_dict["transform_val"],
        classes=exp_dict["classes_test"],
        support_size=exp_dict["support_size_test"],
        query_size=exp_dict["query_size_test"],
        n_iters=exp_dict["test_iters"],
        unlabeled_size=exp_dict["unlabeled_size_test"])

    # get dataloaders
    # ==========================
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=exp_dict["batch_size"],
        shuffle=True,
        num_workers=num_workers,
        collate_fn=ut.get_collate(exp_dict["collate_fn"]),
        drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=num_workers,
                                             collate_fn=lambda x: x,
                                             drop_last=True)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=num_workers,
                                              collate_fn=lambda x: x,
                                              drop_last=True)

    # create model and trainer
    # ==========================

    # Create model, opt, wrapper
    backbone = backbones.get_backbone(
        backbone_name=exp_dict['model']["backbone"], exp_dict=exp_dict)
    model = models.get_model(model_name=exp_dict["model"]['name'],
                             backbone=backbone,
                             n_classes=exp_dict["n_classes"],
                             exp_dict=exp_dict)

    if run_ssl:
        # runs the SSL experiments
        score_list_path = os.path.join(savedir, 'score_list.pkl')
        if not os.path.exists(score_list_path):
            test_dict = model.test_on_loader(test_loader, max_iter=None)
            hu.save_pkl(score_list_path, [test_dict])
        return

    # Checkpoint
    # -----------
    checkpoint_path = os.path.join(savedir, 'checkpoint.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')

    if os.path.exists(score_list_path):
        # resume experiment
        model.load_state_dict(hu.torch_load(checkpoint_path))
        score_list = hu.load_pkl(score_list_path)
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Run training and validation
    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}
        score_dict.update(model.get_lr())

        # train
        score_dict.update(model.train_on_loader(train_loader))

        # validate
        score_dict.update(model.val_on_loader(val_loader))
        score_dict.update(model.test_on_loader(test_loader))

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report
        score_df = pd.DataFrame(score_list)
        print(score_df.tail())

        # Save checkpoint
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(checkpoint_path, model.get_state_dict())
        print("Saved: %s" % savedir)

        if "accuracy" in exp_dict["target_loss"]:
            is_best = score_dict[exp_dict["target_loss"]] >= score_df[
                exp_dict["target_loss"]][:-1].max()
        else:
            is_best = score_dict[exp_dict["target_loss"]] <= score_df[
                exp_dict["target_loss"]][:-1].min()

        # Save best checkpoint
        if is_best:
            hu.save_pkl(os.path.join(savedir, "score_list_best.pkl"),
                        score_list)
            hu.torch_save(os.path.join(savedir, "checkpoint_best.pth"),
                          model.get_state_dict())
            print("Saved Best: %s" % savedir)

        # Check for end of training conditions
        if model.is_end_of_training():
            break
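The best-checkpoint rule above flips direction depending on whether the target metric is an accuracy (higher is better) or a loss (lower is better). The same logic as a small standalone helper with an explicit first-epoch guard (the name is hypothetical, not part of the repository):

def is_best_score(score_df, target_key, value):
    # score_df is the pandas DataFrame built from score_list; the last row
    # is the current epoch, so compare against everything before it.
    previous = score_df[target_key][:-1]
    if len(previous) == 0:
        return True  # nothing to compare against on the first epoch
    if 'accuracy' in target_key:
        return value >= previous.max()
    return value <= previous.min()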