Example #1
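A generic train/validation loop: it hashes exp_dict into a results directory, seeds NumPy and PyTorch, builds the dataset, model, loss, and optimizer from the experiment dictionary, resumes from a checkpoint when one exists, and drives the optimizer through a closure. All five examples assume the usual imports (os, time, pprint, numpy as np, pandas as pd, torch, tqdm) plus project modules such as datasets, models, metrics, and optimizers, and the haven-style helpers hu/hc (or ut). A hypothetical invocation sketch follows the function.
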
def trainval(exp_dict, savedir_base, reset=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ---------------
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # -----------

    # train loader
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=savedir_base,
                                     exp_dict=exp_dict)

    train_loader = torch.utils.data.DataLoader(
        train_set,
        drop_last=True,
        shuffle=True,
        batch_size=exp_dict["batch_size"])

    # val set
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=savedir_base,
                                   exp_dict=exp_dict)

    # Model
    # -----------
    model = models.get_model(exp_dict["model"], train_set=train_set).cuda()
    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Compute fstar
    # -------------
    if exp_dict['opt'].get('fstar_flag'):
        ut.compute_fstar(train_set, loss_function, savedir_base, exp_dict)

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt_dict=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    opt_path = os.path.join(savedir, 'opt_state_dict.pth')

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = hu.load_pkl(score_list_path)
        model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]['epoch'] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d/%d' %
          (s_epoch, exp_dict['max_epoch']))

    for e in range(s_epoch, exp_dict['max_epoch']):
        # Set seed
        seed = e + exp_dict['runs']
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        score_dict = {}

        # Compute train loss over train set
        score_dict["train_loss"] = metrics.compute_metric_on_dataset(
            model, train_set, metric_name=exp_dict["loss_func"])

        # Compute val acc over val set
        score_dict["val_acc"] = metrics.compute_metric_on_dataset(
            model, val_set, metric_name=exp_dict["acc_func"])

        # Train over train loader
        model.train()
        print("%d - Training model with %s..." % (e, exp_dict["loss_func"]))

        # train and validate
        s_time = time.time()
        for batch in tqdm.tqdm(train_loader):
            images, labels = batch["images"].cuda(), batch["labels"].cuda()

            opt.zero_grad()

            # closure
            def closure():
                return loss_function(model, images, labels, backwards=True)

            opt.step(closure)

        e_time = time.time()

        # Record metrics
        score_dict["epoch"] = e
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["step_size_avg"] = opt.state["step_size_avg"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["grad_norm"] = opt.state["grad_norm"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

    print('Experiment completed')
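
A minimal invocation sketch for the function above; every key below is read inside trainval, but the values are hypothetical:

# Hypothetical configuration; the values are illustrative only.
exp_dict = {
    "dataset": "mnist",              # passed to datasets.get_dataset
    "model": "mlp",                  # passed to models.get_model
    "loss_func": "softmax_loss",     # resolved by metrics.get_metric_function
    "acc_func": "softmax_accuracy",  # metric computed on the validation set
    "opt": {"name": "sgd_armijo"},   # consumed by optimizers.get_optimizer
    "batch_size": 128,
    "max_epoch": 100,
    "runs": 0,                       # offsets the random seed
}
trainval(exp_dict, savedir_base="/tmp/results", reset=False)
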
Example #2
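An SVRG-specific variant of the same loop; resumption is keyed on the presence of run_dict.pkl and score_list.pkl rather than on score_list.pkl alone. Note that the closure here takes the model as an argument; the sketch after the function shows why.
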
def trainval_svrg(exp_dict, savedir, datadir, metrics_flag=True):
    '''
        SVRG-specific training and validation loop.
    '''
    pprint.pprint(exp_dict)

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = DataLoader(train_set,
                              drop_last=False,
                              shuffle=True,
                              batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Load model
    model = models.get_model(exp_dict["model"], train_set=train_set).cuda()

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # lookup the learning rate
    lr = get_svrg_step_size(exp_dict)

    # Load Optimizer
    opt = get_svrg_optimizer(model,
                             loss_function,
                             train_loader=train_loader,
                             lr=lr)

    # Resume from last saved state_dict
    if (not os.path.exists(savedir + "/run_dict.pkl")
            or not os.path.exists(savedir + "/score_list.pkl")):
        ut.save_pkl(savedir + "/run_dict.pkl", {"running": 1})
        score_list = []
        s_epoch = 0
    else:
        score_list = ut.load_pkl(savedir + "/score_list.pkl")
        model.load_state_dict(torch.load(savedir + "/model_state_dict.pth"))
        opt.load_state_dict(torch.load(savedir + "/opt_state_dict.pth"))
        s_epoch = score_list[-1]["epoch"] + 1

    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name=exp_dict["loss_func"])

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..." %
              (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            images, labels = images.cuda(), labels.cuda()

            opt.zero_grad()
            closure = lambda svrg_model: loss_function(
                svrg_model, images, labels, backwards=True)
            opt.step(closure)

        e_time = time.time()

        # Record step size and batch size
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        ut.save_pkl(savedir + "/score_list.pkl", score_list)
        ut.torch_save(savedir + "/model_state_dict.pth", model.state_dict())
        ut.torch_save(savedir + "/opt_state_dict.pth", opt.state_dict())
        print("Saved: %s" % savedir)

    return score_list
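
For context, a minimal sketch of the variance-reduced update a closure-taking SVRG step performs. This is textbook SVRG under stated assumptions, not necessarily the body of get_svrg_optimizer; snapshot and full_grad are hypothetical names:

import torch

def svrg_inner_step(model, snapshot, full_grad, closure, lr):
    # snapshot is typically copy.deepcopy(model) taken when full_grad (the
    # full-batch gradient at the snapshot weights) was last recomputed.
    model.zero_grad()
    snapshot.zero_grad()
    closure(model)     # minibatch gradient at the current weights
    closure(snapshot)  # same minibatch gradient at the snapshot weights
    with torch.no_grad():
        for p, q, mu in zip(model.parameters(), snapshot.parameters(),
                            full_grad):
            # w <- w - lr * (g_i(w) - g_i(w_snap) + full_grad)
            p -= lr * (p.grad - q.grad + mu)

This is why the closure receives svrg_model as an argument: the optimizer must evaluate the same minibatch on two different sets of weights.
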
Example #3
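Continues training from a previously saved experiment: it snapshots the loaded parameters as minimum, trains with a custom loss that penalizes the distance to that snapshot, and reports the distance after every epoch. This variant runs on CPU only. A hypothetical sketch of the distance metric follows the function.
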
def newminimum(exp_id,
               savedir_base,
               datadir,
               name,
               exp_dict,
               metrics_flag=True):
    # bookkeeping
    # ---------------

    # get experiment directory
    old_modeldir = os.path.join(savedir_base, exp_id)
    savedir = os.path.join(savedir_base, exp_id, name)

    old_exp_dict = hu.load_json(os.path.join(old_modeldir, 'exp_dict.json'))

    # TODO: compare exp dict for possible errors:
    # optimizer have to be the same
    # same network, dataset

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ---------------
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Dataset
    # -----------

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = torch.utils.data.DataLoader(
        train_set,
        drop_last=True,
        shuffle=True,
        batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Model
    # -----------
    model = models.get_model(exp_dict["model"], train_set=train_set)

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Checkpoint
    # -----------
    model_path = os.path.join(savedir, 'model.pth')
    score_list_path = os.path.join(savedir, 'score_list.pkl')
    opt_path = os.path.join(savedir, 'opt_state_dict.pth')

    old_model_path = os.path.join(old_modeldir, 'model.pth')
    old_score_list_path = os.path.join(old_modeldir, 'score_list.pkl')
    old_opt_path = os.path.join(old_modeldir, 'opt_state_dict.pth')

    score_list = hu.load_pkl(old_score_list_path)
    model.load_state_dict(torch.load(old_model_path))
    opt.load_state_dict(torch.load(old_opt_path))
    s_epoch = score_list[-1]['epoch'] + 1

    # save current model state for comparison
    minimum = []

    for param in model.parameters():
        minimum.append(param.clone())

    # Train & Val
    # ------------
    print('Starting experiment at epoch %d/%d' %
          (s_epoch, exp_dict['max_epoch']))

    for epoch in range(s_epoch, exp_dict['max_epoch']):
        # Set seed
        np.random.seed(exp_dict['runs'] + epoch)
        torch.manual_seed(exp_dict['runs'] + epoch)
        # torch.cuda.manual_seed_all(exp_dict['runs'] + epoch)  # not needed: no CUDA available

        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name='softmax_loss')
            #                                    metric_name=exp_dict["loss_func"])
            # TODO: which loss should be used? (plain or with regularizer?)

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..." %
              (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            # images, labels = images.cuda(), labels.cuda()  # no CUDA available

            opt.zero_grad()
            loss = loss_function(model, images, labels, minimum,
                                 0.1)  # only works with the custom loss function
            loss.backward()
            opt.step()

        e_time = time.time()

        # Record metrics
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        hu.save_pkl(score_list_path, score_list)
        hu.torch_save(model_path, model.state_dict())
        hu.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

        with torch.no_grad():
            print('Current distance: %f' %
                  metrics.computedistance(minimum, model))

    print('Experiment completed')
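
metrics.computedistance is not shown in this example; a plausible stand-in, assuming it measures the Euclidean distance between the saved snapshot and the current parameters:

import torch

def compute_distance(minimum, model):
    # Hypothetical stand-in for metrics.computedistance: Euclidean distance
    # between the saved parameter snapshot and the current parameters.
    with torch.no_grad():
        total = sum((p - q).pow(2).sum()
                    for p, q in zip(model.parameters(), minimum))
    return total.sqrt().item()
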
Example #4
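The most configurable variant: it selects CPU or GPU via the cuda flag, optionally instruments the model with BackPACK for per-sample quantities, and reads optimizer statistics defensively with opt.state.get. A standalone BackPACK sketch follows the function.
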
def trainval(exp_dict,
             savedir_base,
             reset,
             metrics_flag=True,
             datadir=None,
             cuda=False):
    # bookkeeping
    # ---------------

    # get experiment directory
    exp_id = hu.hash_dict(exp_dict)
    savedir = os.path.join(savedir_base, exp_id)

    if reset:
        # delete and backup experiment
        hc.delete_experiment(savedir, backup_flag=True)

    # create folder and save the experiment dictionary
    os.makedirs(savedir, exist_ok=True)
    hu.save_json(os.path.join(savedir, 'exp_dict.json'), exp_dict)
    pprint.pprint(exp_dict)
    print('Experiment saved in %s' % savedir)

    # set seed
    # ==================
    seed = 42 + exp_dict['runs']
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        device = 'cuda'
        torch.cuda.manual_seed_all(seed)
    else:
        device = 'cpu'

    print('Running on device: %s' % device)

    # Dataset
    # ==================
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = DataLoader(train_set,
                              drop_last=True,
                              shuffle=True,
                              sampler=None,
                              batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Model
    # ==================
    use_backpack = exp_dict['opt'].get("backpack", False)

    model = models.get_model(exp_dict["model"],
                             train_set=train_set,
                             backpack=use_backpack).to(device=device)
    if use_backpack:
        assert exp_dict['opt']['name'] in ['nus_wrapper', 'adaptive_second']
        from backpack import extend
        model = extend(model)

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    # ==============
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch,
                                   n_train=len(train_set),
                                   train_loader=train_loader,
                                   model=model,
                                   loss_function=loss_function,
                                   exp_dict=exp_dict,
                                   batch_size=exp_dict["batch_size"])

    # Checkpointing
    # =============
    score_list_path = os.path.join(savedir, "score_list.pkl")
    model_path = os.path.join(savedir, "model_state_dict.pth")
    opt_path = os.path.join(savedir, "opt_state_dict.pth")

    if os.path.exists(score_list_path):
        # resume experiment
        score_list = ut.load_pkl(score_list_path)
        if use_backpack:
            model.load_state_dict(torch.load(model_path), strict=False)
        else:
            model.load_state_dict(torch.load(model_path))
        opt.load_state_dict(torch.load(opt_path))
        s_epoch = score_list[-1]["epoch"] + 1
    else:
        # restart experiment
        score_list = []
        s_epoch = 0

    # Start Training
    # ==============
    n_train = len(train_loader.dataset)
    n_batches = len(train_loader)
    batch_size = train_loader.batch_size

    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        # Set seed
        seed = epoch + exp_dict['runs']
        np.random.seed(seed)
        torch.manual_seed(seed)
        if cuda:
            torch.cuda.manual_seed_all(seed)

        score_dict = {"epoch": epoch}

        # Validate
        # --------
        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model,
                train_set,
                metric_name=exp_dict["loss_func"],
                batch_size=exp_dict['batch_size'])

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model,
                val_set,
                metric_name=exp_dict["acc_func"],
                batch_size=exp_dict['batch_size'])

        # Train
        # -----
        model.train()
        print("%d - Training model with %s..." %
              (epoch, exp_dict["loss_func"]))

        s_time = time.time()

        train_on_loader(model, train_set, train_loader, opt, loss_function,
                        epoch, use_backpack)

        e_time = time.time()

        # Record step size and batch size
        score_dict["step"] = opt.state.get("step",
                                           0) / int(n_batches_per_epoch)
        score_dict["step_size"] = opt.state.get("step_size", {})
        score_dict["step_size_avg"] = opt.state.get("step_size_avg", {})
        score_dict["n_forwards"] = opt.state.get("n_forwards", {})
        score_dict["n_backwards"] = opt.state.get("n_backwards", {})
        score_dict["grad_norm"] = opt.state.get("grad_norm", {})
        score_dict["batch_size"] = batch_size
        score_dict["train_epoch_time"] = e_time - s_time
        score_dict.update(opt.state["gv_stats"])

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        ut.save_pkl(score_list_path, score_list)
        ut.torch_save(model_path, model.state_dict())
        ut.torch_save(opt_path, opt.state_dict())
        print("Saved: %s" % savedir)

    return score_list
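
A self-contained BackPACK sketch, independent of this repo, showing what extend enables (per-sample gradients exposed through the grad_batch attribute after a backward pass inside the backpack context):

import torch
from backpack import backpack, extend
from backpack.extensions import BatchGrad

# extend() instruments the model and the loss so that backward() can also
# populate per-sample quantities.
model = extend(torch.nn.Linear(10, 2))
lossfunc = extend(torch.nn.CrossEntropyLoss())

X, y = torch.randn(8, 10), torch.randint(0, 2, (8,))
loss = lossfunc(model(X), y)
with backpack(BatchGrad()):
    loss.backward()

print(model.weight.grad_batch.shape)  # torch.Size([8, 2, 10]): one gradient per sample
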
Example #5
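Dispatches on the optimizer name: optimizers listed in exp_configs.ours_opt_list (plus "l4") receive a closure, while the rest use the standard loss.backward() / opt.step() sequence. The sketch after the function spells out the loss-function contract this implies.
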
def trainval(exp_dict, savedir, datadir, metrics_flag=True):
    # TODO: Do we get similar results with different seeds?
    # Set seed
    np.random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)

    pprint.pprint(exp_dict)

    # Load Train Dataset
    train_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                     train_flag=True,
                                     datadir=datadir,
                                     exp_dict=exp_dict)

    train_loader = DataLoader(train_set,
                              drop_last=True,
                              shuffle=True,
                              batch_size=exp_dict["batch_size"])

    # Load Val Dataset
    val_set = datasets.get_dataset(dataset_name=exp_dict["dataset"],
                                   train_flag=False,
                                   datadir=datadir,
                                   exp_dict=exp_dict)

    # Load model
    model = models.get_model(exp_dict["model"], train_set=train_set).cuda()

    # Choose loss and metric function
    loss_function = metrics.get_metric_function(exp_dict["loss_func"])

    # Load Optimizer
    n_batches_per_epoch = len(train_set) / float(exp_dict["batch_size"])
    opt = optimizers.get_optimizer(opt=exp_dict["opt"],
                                   params=model.parameters(),
                                   n_batches_per_epoch=n_batches_per_epoch)

    # Resume from last saved state_dict
    if (not os.path.exists(savedir + "/run_dict.pkl")
            or not os.path.exists(savedir + "/score_list.pkl")):
        ut.save_pkl(savedir + "/run_dict.pkl", {"running": 1})
        score_list = []
        s_epoch = 0
    else:
        score_list = ut.load_pkl(savedir + "/score_list.pkl")
        model.load_state_dict(torch.load(savedir + "/model_state_dict.pth"))
        opt.load_state_dict(torch.load(savedir + "/opt_state_dict.pth"))
        s_epoch = score_list[-1]["epoch"] + 1

    for epoch in range(s_epoch, exp_dict["max_epoch"]):
        # Set seed
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        score_dict = {"epoch": epoch}

        if metrics_flag:
            # 1. Compute train loss over train set
            score_dict["train_loss"] = metrics.compute_metric_on_dataset(
                model, train_set, metric_name=exp_dict["loss_func"])

            # 2. Compute val acc over val set
            score_dict["val_acc"] = metrics.compute_metric_on_dataset(
                model, val_set, metric_name=exp_dict["acc_func"])

        # 3. Train over train loader
        model.train()
        print("%d - Training model with %s..." %
              (epoch, exp_dict["loss_func"]))

        s_time = time.time()
        for images, labels in tqdm.tqdm(train_loader):
            images, labels = images.cuda(), labels.cuda()

            opt.zero_grad()

            if exp_dict["opt"]["name"] in exp_configs.ours_opt_list + ["l4"]:
                closure = lambda: loss_function(
                    model, images, labels, backwards=False)
                opt.step(closure)

            else:
                loss = loss_function(model, images, labels)
                loss.backward()
                opt.step()

        e_time = time.time()

        # Record step size and batch size
        score_dict["step_size"] = opt.state["step_size"]
        score_dict["n_forwards"] = opt.state["n_forwards"]
        score_dict["n_backwards"] = opt.state["n_backwards"]
        score_dict["batch_size"] = train_loader.batch_size
        score_dict["train_epoch_time"] = e_time - s_time

        # Add score_dict to score_list
        score_list += [score_dict]

        # Report and save
        print(pd.DataFrame(score_list).tail())
        ut.save_pkl(savedir + "/score_list.pkl", score_list)
        ut.torch_save(savedir + "/model_state_dict.pth", model.state_dict())
        ut.torch_save(savedir + "/opt_state_dict.pth", opt.state_dict())
        print("Saved: %s" % savedir)

    return score_list
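
The two branches above imply a contract for the metric functions: they take (model, images, labels) and an optional backwards flag that, when set, also runs the backward pass so closure-based optimizers can re-evaluate the loss inside opt.step(). A minimal sketch of a function satisfying that contract (softmax_loss here is illustrative):

import torch.nn.functional as F

def softmax_loss(model, images, labels, backwards=False):
    # Computes the minibatch loss; when backwards=True, also populates the
    # gradients so the caller can skip the explicit loss.backward().
    loss = F.cross_entropy(model(images), labels)
    if backwards and loss.requires_grad:
        loss.backward()
    return loss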