Example 1
def train(dataset, base_model, niter, j):
    preference = np.array([1. - j, j])
    n_tasks = 2
    print("Preference Vector = {}".format(preference))

    # LOAD DATASET
    # ------------
    # MultiMNIST: multi_mnist.pickle
    if dataset == 'mnist':
        with open('data/multi_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # MultiFashionMNIST: multi_fashion.pickle
    if dataset == 'fashion':
        with open('data/multi_fashion.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # Multi-(Fashion+MNIST): multi_fashion_and_mnist.pickle
    if dataset == 'fashion_and_mnist':
        with open('data/multi_fashion_and_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    trainX = torch.from_numpy(trainX.reshape(120000, 1, 36, 36)).float()
    trainLabel = torch.from_numpy(trainLabel).long()
    testX = torch.from_numpy(testX.reshape(20000, 1, 36, 36)).float()
    testLabel = torch.from_numpy(testLabel).long()

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=batch_size,
                                              shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))
    # ---------***---------

    # DEFINE MODEL
    # ---------------------
    if base_model == 'lenet':
        model = RegressionTrain(RegressionModel(n_tasks), preference)
    if base_model == 'resnet18':
        model = RegressionTrainResNet(MnistResNet(n_tasks), preference)

    if torch.cuda.is_available():
        model.cuda()
    # ---------***---------

    # DEFINE OPTIMIZERS
    # -----------------
    # Choose different optimizers for different base model
    if base_model == 'lenet':
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.0)
        # scheduler = torch.optim.lr_scheduler.MultiStepLR(
        #     optimizer, milestones=[15, 30, 45, 60, 75, 90], gamma=0.5)

    if base_model == 'resnet18':
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=[10, 20],
                                                         gamma=0.1)

    # ---------***---------

    # CONTAINERS FOR KEEPING TRACK OF PROGRESS
    # ----------------------------------------
    weights = []
    task_train_losses = []
    train_accs = []
    # ---------***---------

    # TRAIN
    # -----
    for t in range(niter):

        # The scheduler is only defined for the resnet18 base model
        if base_model == 'resnet18':
            scheduler.step()
        n_manual_adjusts = 0
        model.train()
        for (it, batch) in enumerate(train_loader):

            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # Update using only the j-th task's loss
            optimizer.zero_grad()
            task_j_loss = model(X, ts, j)
            task_j_loss.backward()
            optimizer.step()

        if n_manual_adjusts > 0:
            print(f"\t # manual tweek={n_manual_adjusts}")
        # Calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = []
                train_acc = []

                correct1_train = 0
                correct2_train = 0

                for (it, batch) in enumerate(test_loader):

                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()

                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                    output1 = model.model(X).max(2, keepdim=True)[1][:, 0]
                    output2 = model.model(X).max(2, keepdim=True)[1][:, 1]
                    correct1_train += output1.eq(
                        ts[:, 0].view_as(output1)).sum().item()
                    correct2_train += output2.eq(
                        ts[:, 1].view_as(output2)).sum().item()

                train_acc = np.stack([
                    1.0 * correct1_train / len(test_loader.dataset),
                    1.0 * correct2_train / len(test_loader.dataset)
                ])

                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():

                task_train_losses.append(average_train_loss.data.cpu().numpy())
                train_accs.append(train_acc)

                print('{}/{}: train_loss={}, train_acc={}'.format(
                    t + 1, niter, task_train_losses[-1], train_accs[-1]))

                # weights.append(weight_vec.cpu().numpy())
                # print('{}/{}: weights={}, train_loss={}, train_acc={}'.format(
                #     t + 1, niter, weights[-1], task_train_losses[-1], train_accs[-1]))
    # torch.save(model.model.state_dict(), './saved_model/%s_%s_niter_%d.pickle' %
    #            (dataset, base_model, niter, npref))
    torch.save(model.model.state_dict(),
               f'./saved_model/{dataset}_{base_model}_niter_{niter}.pickle')

    result = {
        "training_losses": task_train_losses,
        "training_accuracies": train_accs
    }

    return result
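A minimal driver sketch for Example 1, assuming `train` above is importable and that `j` selects which of the two tasks is optimized (the preference vector [1 - j, j] with j in {0, 1} gives the two single-task extremes); the dataset, iteration count, and output path below are illustrative.

import pickle

# Hypothetical driver: train one model per single-task preference endpoint.
results = {}
for j in [0, 1]:  # j = 0 optimizes only task 1, j = 1 optimizes only task 2
    results[j] = train(dataset='mnist', base_model='lenet', niter=100, j=j)

with open('individual_task_results.pickle', 'wb') as f:  # illustrative path
    pickle.dump(results, f)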
Example 2
def train(dataset, base_model, niter, npref, init_weight, pref_idx, leak):

    # generate #npref preference vectors
    n_tasks = 2

    # load dataset

    # MultiMNIST: multi_mnist.pickle
    if dataset == 'mnist':
        with open('data/multi_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # MultiFashionMNIST: multi_fashion.pickle
    if dataset == 'fashion':
        with open('data/multi_fashion.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # Multi-(Fashion+MNIST): multi_fashion_and_mnist.pickle
    if dataset == 'fashion_and_mnist':
        with open('data/multi_fashion_and_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    trainX = torch.from_numpy(trainX.reshape(120000, 1, 36, 36)).float()
    trainLabel = torch.from_numpy(trainLabel).long()
    testX = torch.from_numpy(testX.reshape(20000, 1, 36, 36)).float()
    testLabel = torch.from_numpy(testLabel).long()


    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set  = torch.utils.data.TensorDataset(testX, testLabel)


    batch_size = 256
    train_loader = torch.utils.data.DataLoader(
                     dataset=train_set,
                     batch_size=batch_size,
                     shuffle=True)
    test_loader = torch.utils.data.DataLoader(
                    dataset=test_set,
                    batch_size=batch_size,
                    shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))


    # define the base model for ParetoMTL
    if base_model == "lenet":
        model = RegressionTrain(RegressionModel(n_tasks), init_weight)
    if base_model == "resnet18":
        model = RegressionTrainResNet(MnistResNet(n_tasks), init_weight)

    if torch.cuda.is_available():
        model.cuda()


    # choose different optimizer for different base model
    if base_model == 'lenet':
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.)
        # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[15,30,45,60,75,90], gamma=0.5)

    if base_model == 'resnet18':
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10,20], gamma=0.1)


    # store information during optimization
    task_train_losses = []
    train_accs = []

    # run niter epochs of GradDrop
    for t in range(niter):

        # scheduler.step()

        model.train()
        for (it, batch) in enumerate(train_loader):

            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # obtain and store the gradient
            flat_grads = {}
            for i in range(n_tasks):
                optimizer.zero_grad()
                task_loss = model(X, ts)
                task_loss[i].backward()
                flat_grads[i] = flatten_grad(model.parameters())

            grads = [flat_grads[i]["grad"] for i in range(len(flat_grads))]
            grads = torch.stack(grads)

            # calculate the gradient
            grads = get_d_graddrop(grads, leak)
            grads = recover_flattened(grads, flat_grads[0]["indices"], flat_grads[0]["shapes"])

            # optimization step
            optimizer.zero_grad()
            for i, params in enumerate(model.parameters()):
                if grads[i] is not None:
                    params.grad = grads[i]
            optimizer.step()


        # calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:

            model.eval()
            with torch.no_grad():

                total_train_loss = []
                train_acc = []

                correct1_train = 0
                correct2_train = 0

                for (it, batch) in enumerate(test_loader):

                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()

                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                    output1 = model.model(X).max(2, keepdim=True)[1][:,0]
                    output2 = model.model(X).max(2, keepdim=True)[1][:,1]
                    correct1_train += output1.eq(ts[:,0].view_as(output1)).sum().item()
                    correct2_train += output2.eq(ts[:,1].view_as(output2)).sum().item()


                train_acc = np.stack([
                    1.0 * correct1_train / len(test_loader.dataset),
                    1.0 * correct2_train / len(test_loader.dataset)
                ])

                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():
                task_train_losses.append(average_train_loss.data.cpu().numpy())
                train_accs.append(train_acc)
                print('{}/{}: train_loss={}, train_acc={}'.format(
                        t + 1, niter,  task_train_losses[-1],train_accs[-1]))

    result = {"training_losses": task_train_losses,
              "training_accuracies": train_accs}
    return result, model
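Example 2 relies on `flatten_grad` and `recover_flattened` helpers that are not shown. A minimal sketch of the interface the code above assumes (a dict with "grad", "indices", and "shapes" entries, and per-parameter tensors on the way back) follows; the real helpers in the source repository may differ, and this version assumes every parameter has a gradient.

import torch

def flatten_grad(parameters):
    # Concatenate all parameter gradients into one flat vector and remember the
    # slice boundaries and shapes needed to undo the flattening later.
    grads, shapes, indices = [], [], []
    start = 0
    for p in parameters:
        g = p.grad.detach().clone().flatten()
        grads.append(g)
        shapes.append(p.grad.shape)
        indices.append((start, start + g.numel()))
        start += g.numel()
    return {"grad": torch.cat(grads), "indices": indices, "shapes": shapes}

def recover_flattened(flat_grad, indices, shapes):
    # Split a flat gradient vector back into per-parameter tensors.
    return [flat_grad[s:e].view(shape) for (s, e), shape in zip(indices, shapes)]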
Example 3
def train(dataset, base_model, niter, preference):

    n_tasks = 2
    print("Preference Vector = {}".format(preference))

    # LOAD DATASET
    # ------------
    # MultiMNIST: multi_mnist.pickle
    if dataset == 'mnist':
        with open('data/multi_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # MultiFashionMNIST: multi_fashion.pickle
    if dataset == 'fashion':
        with open('data/multi_fashion.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # Multi-(Fashion+MNIST): multi_fashion_and_mnist.pickle
    if dataset == 'fashion_and_mnist':
        with open('data/multi_fashion_and_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    trainX = torch.from_numpy(trainX.reshape(120000, 1, 36, 36)).float()
    trainLabel = torch.from_numpy(trainLabel).long()
    testX = torch.from_numpy(testX.reshape(20000, 1, 36, 36)).float()
    testLabel = torch.from_numpy(testLabel).long()

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=batch_size,
                                              shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))
    # ---------***---------

    # DEFINE MODEL
    # ---------------------
    if base_model == 'lenet':
        model = RegressionTrain(RegressionModel(n_tasks), preference)
    if base_model == 'resnet18':
        model = RegressionTrainResNet(MnistResNet(n_tasks), preference)
    # model.randomize()
    if torch.cuda.is_available():
        model.cuda()
    # ---------***---------

    # DEFINE OPTIMIZERS
    # -----------------
    # Choose different optimizers for different base model
    if base_model == 'lenet':
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.)
        # scheduler = torch.optim.lr_scheduler.MultiStepLR(
        #     optimizer, milestones=[15, 30, 45, 60, 75, 90], gamma=0.8)

    if base_model == 'resnet18':
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=[10, 20],
                                                         gamma=0.1)

    # Instantiate the EPO Linear Program solver
    _, n_params = getNumParams(model.parameters())
    print(f"# params={n_params}")
    epo_lp = EPO_LP(m=n_tasks, n=n_params, r=preference)
    # ---------***---------

    # CONTAINERS FOR KEEPING TRACK OF PROGRESS
    # ----------------------------------------
    task_train_losses = []
    train_accs = []
    # ---------***---------

    # TRAIN
    # -----
    for t in range(niter):
        # scheduler.step()

        n_manual_adjusts = 0
        descent = 0.
        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # Obtain losses and gradients
            grads = {}
            losses = []
            for i in range(n_tasks):
                optimizer.zero_grad()
                task_loss = model(X, ts)
                losses.append(task_loss[i].data.cpu().numpy())
                task_loss[i].backward()

                # One can use the scalable method proposed in the MOO-MTL paper
                # for large-scale problems, but we use the gradients of all
                # parameters in this experiment.
                grads[i] = []
                for param in model.parameters():
                    if param.grad is not None:
                        grads[i].append(
                            Variable(param.grad.data.clone().flatten(),
                                     requires_grad=False))

            grads_list = [torch.cat(grads[i]) for i in range(len(grads))]
            G = torch.stack(grads_list)
            GG = G @ G.T
            losses = np.stack(losses)

            try:
                # Calculate the alphas from the LP solver
                alpha = epo_lp.get_alpha(losses, G=GG.cpu().numpy(), C=True)
                if epo_lp.last_move == "dom":
                    descent += 1
            except Exception as e:
                print(e)
                alpha = None
            if alpha is None:  # A patch for the issue in cvxpy
                alpha = preference / preference.sum()
                n_manual_adjusts += 1

            if torch.cuda.is_available():
                alpha = n_tasks * torch.from_numpy(alpha).cuda()
            else:
                alpha = n_tasks * torch.from_numpy(alpha)
            # Optimization step
            optimizer.zero_grad()
            task_losses = model(X, ts)
            weighted_loss = torch.sum(task_losses *
                                      alpha)  # * 5. * max(epo_lp.mu_rl, 0.2)
            weighted_loss.backward()
            optimizer.step()

        print(f"\tdescent={descent/len(train_loader)}")
        if n_manual_adjusts > 0:
            print(f"\t # manual tweek={n_manual_adjusts}")

        # Calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = []
                train_acc = []

                correct1_train = 0
                correct2_train = 0

                for (it, batch) in enumerate(test_loader):

                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()

                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                    output1 = model.model(X).max(2, keepdim=True)[1][:, 0]
                    output2 = model.model(X).max(2, keepdim=True)[1][:, 1]
                    correct1_train += output1.eq(
                        ts[:, 0].view_as(output1)).sum().item()
                    correct2_train += output2.eq(
                        ts[:, 1].view_as(output2)).sum().item()

                train_acc = np.stack([
                    1.0 * correct1_train / len(test_loader.dataset),
                    1.0 * correct2_train / len(test_loader.dataset)
                ])

                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():

                task_train_losses.append(average_train_loss.data.cpu().numpy())
                train_accs.append(train_acc)

                print('{}/{}: train_loss={}, train_acc={}'.format(
                    t + 1, niter, task_train_losses[-1], train_accs[-1]))

    torch.save(model.model.state_dict(),
               f'./saved_model/{dataset}_{base_model}_niter_{niter}.pickle')

    result = {
        "training_losses": task_train_losses,
        "training_accuracies": train_accs
    }

    return result
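A minimal driver sketch for Example 3, assuming `train` is importable and that `preference` is a positive numpy vector (it is normalized internally and passed to EPO_LP as the preference ray r); the datasets, values, and iteration count below are illustrative.

import numpy as np

# Hypothetical driver: trace part of the front with a few EPO preference rays.
preferences = [np.array([0.2, 0.8]), np.array([0.5, 0.5]), np.array([0.8, 0.2])]
for p in preferences:
    res = train(dataset='fashion', base_model='lenet', niter=100, preference=p)
    print(p, res["training_accuracies"][-1])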
Example 4
def train(dataset, base_model, niter, npref, init_weight, pref_idx):

    # generate #npref preference vectors
    n_tasks = 2
    # ref_vec = torch.tensor(circle_points([1], [npref])[0]).cuda().float()
    rvecs = circle_points(npref,
                          min_angle=0.0001 * np.pi / 2,
                          max_angle=0.9999 * np.pi / 2)
    ref_vec = torch.tensor(rvecs).cuda().float()

    # load dataset

    # MultiMNIST: multi_mnist.pickle
    if dataset == 'mnist':
        with open('data/multi_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # MultiFashionMNIST: multi_fashion.pickle
    if dataset == 'fashion':
        with open('data/multi_fashion.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # Multi-(Fashion+MNIST): multi_fashion_and_mnist.pickle
    if dataset == 'fashion_and_mnist':
        with open('data/multi_fashion_and_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    trainX = torch.from_numpy(trainX.reshape(120000, 1, 36, 36)).float()
    trainLabel = torch.from_numpy(trainLabel).long()
    testX = torch.from_numpy(testX.reshape(20000, 1, 36, 36)).float()
    testLabel = torch.from_numpy(testLabel).long()

    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set = torch.utils.data.TensorDataset(testX, testLabel)

    batch_size = 256
    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=batch_size,
                                              shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))

    # define the base model for ParetoMTL
    model = RegressionTrain(RegressionModel(n_tasks), init_weight)

    if torch.cuda.is_available():
        model.cuda()

    # choose different optimizer for different base model
    if base_model == 'lenet':
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.)
        # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[15,30,45,60,75,90], gamma=0.5)

    if base_model == 'resnet18':
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                         milestones=[10, 20],
                                                         gamma=0.1)

    # store information during optimization
    weights = []
    task_train_losses = []
    train_accs = []

    # print the current preference vector
    print('Preference Vector ({}/{}):'.format(pref_idx + 1, npref))
    print(ref_vec[pref_idx].cpu().numpy())

    # run at most 2 epochs to find an initial solution;
    # stop early once a feasible solution is found
    # (it can usually be found within a few steps)
    for t in range(2):

        model.train()
        for (it, batch) in enumerate(train_loader):
            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            grads = {}
            losses_vec = []

            # obtain and store the gradient value
            for i in range(n_tasks):
                optimizer.zero_grad()
                task_loss = model(X, ts)
                losses_vec.append(task_loss[i].data)

                task_loss[i].backward()

                grads[i] = []

                # One can use the scalable method proposed in the MOO-MTL paper for
                # large-scale problems, but we use the gradients of all parameters here.
                for param in model.parameters():
                    if param.grad is not None:
                        grads[i].append(
                            Variable(param.grad.data.clone().flatten(),
                                     requires_grad=False))

            grads_list = [torch.cat(grads[i]) for i in range(len(grads))]
            grads = torch.stack(grads_list)

            # calculate the weights
            losses_vec = torch.stack(losses_vec)
            flag, weight_vec = get_d_paretomtl_init(grads, losses_vec, ref_vec,
                                                    pref_idx)

            # early stop once a feasible solution is obtained
            if flag:
                print("feasible solution obtained.")
                break

            # optimization step
            optimizer.zero_grad()
            for i in range(len(task_loss)):
                task_loss = model(X, ts)
                if i == 0:
                    loss_total = weight_vec[i] * task_loss[i]
                else:
                    loss_total = loss_total + weight_vec[i] * task_loss[i]

            loss_total.backward()
            optimizer.step()

        else:
            # continue if no feasible solution is found
            continue
        # break the loop once a feasible solutions is found
        break

    # run niter epochs of ParetoMTL
    for t in range(niter):

        # scheduler.step()

        model.train()
        for (it, batch) in enumerate(train_loader):

            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # obtain and store the gradient
            grads = {}
            losses_vec = []

            for i in range(n_tasks):
                optimizer.zero_grad()
                task_loss = model(X, ts)
                losses_vec.append(task_loss[i].data)

                task_loss[i].backward()

                # One can use the scalable method proposed in the MOO-MTL paper for
                # large-scale problems, but we use the gradients of all parameters here.
                grads[i] = []
                for param in model.parameters():
                    if param.grad is not None:
                        grads[i].append(
                            Variable(param.grad.data.clone().flatten(),
                                     requires_grad=False))

            grads_list = [torch.cat(grads[i]) for i in range(len(grads))]
            grads = torch.stack(grads_list)

            # calculate the weights
            losses_vec = torch.stack(losses_vec)
            weight_vec = get_d_paretomtl(grads, losses_vec, ref_vec, pref_idx)

            # normalize_coeff = n_tasks / torch.sum(torch.abs(weight_vec))
            normalize_coeff = 1. / torch.sum(torch.abs(weight_vec))
            weight_vec = weight_vec * normalize_coeff

            # optimization step
            optimizer.zero_grad()
            for i in range(len(task_loss)):
                task_loss = model(X, ts)
                if i == 0:
                    loss_total = weight_vec[i] * task_loss[i]
                else:
                    loss_total = loss_total + weight_vec[i] * task_loss[i]

            loss_total.backward()
            optimizer.step()

        # calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:

            model.eval()
            with torch.no_grad():

                total_train_loss = []
                train_acc = []

                correct1_train = 0
                correct2_train = 0

                for (it, batch) in enumerate(test_loader):

                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()

                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                    output1 = model.model(X).max(2, keepdim=True)[1][:, 0]
                    output2 = model.model(X).max(2, keepdim=True)[1][:, 1]
                    correct1_train += output1.eq(
                        ts[:, 0].view_as(output1)).sum().item()
                    correct2_train += output2.eq(
                        ts[:, 1].view_as(output2)).sum().item()

                train_acc = np.stack([
                    1.0 * correct1_train / len(test_loader.dataset),
                    1.0 * correct2_train / len(test_loader.dataset)
                ])

                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():

                task_train_losses.append(average_train_loss.data.cpu().numpy())
                train_accs.append(train_acc)

                weights.append(weight_vec.cpu().numpy())

                print('{}/{}: weights={}, train_loss={}, train_acc={}'.format(
                    t + 1, niter, weights[-1], task_train_losses[-1],
                    train_accs[-1]))

    result = {
        "training_losses": task_train_losses,
        "training_accuracies": train_accs
    }
    return result, model
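Example 4 assumes a `circle_points` helper that is not shown. A plausible sketch matching the call above (npref evenly spaced unit-norm preference rays between min_angle and max_angle in the positive quadrant) follows; the actual helper in the source repository may be implemented differently.

import numpy as np

def circle_points(n, min_angle=0.0001 * np.pi / 2, max_angle=0.9999 * np.pi / 2):
    # n evenly spaced rays on the unit circle, returned as an (n, 2) array of
    # [cos(theta), sin(theta)] preference vectors.
    angles = np.linspace(min_angle, max_angle, n)
    return np.stack([np.cos(angles), np.sin(angles)], axis=1)

With this sketch, torch.tensor(circle_points(npref, ...)) yields the (npref, 2) reference-vector tensor that the training loop indexes with pref_idx.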
Example 5
def train(dataset, base_model, niter, npref, init_weight, pref_idx, alpha=0.0):

    # generate #npref preference vectors
    n_tasks = 2

    # load dataset
    print(f"loading dataset {dataset}")

    # MultiMNIST: multi_mnist.pickle
    if dataset == 'mnist':
        with open('data/multi_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # MultiFashionMNIST: multi_fashion.pickle
    if dataset == 'fashion':
        with open('data/multi_fashion.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    # Multi-(Fashion+MNIST): multi_fashion_and_mnist.pickle
    if dataset == 'fashion_and_mnist':
        with open('data/multi_fashion_and_mnist.pickle', 'rb') as f:
            trainX, trainLabel, testX, testLabel = pickle.load(f)

    trainX = torch.from_numpy(trainX.reshape(120000, 1, 36, 36)).float()
    trainLabel = torch.from_numpy(trainLabel).long()
    testX = torch.from_numpy(testX.reshape(20000, 1, 36, 36)).float()
    testLabel = torch.from_numpy(testLabel).long()


    train_set = torch.utils.data.TensorDataset(trainX, trainLabel)
    test_set  = torch.utils.data.TensorDataset(testX, testLabel)


    batch_size = 256
    train_loader = torch.utils.data.DataLoader(
                     dataset=train_set,
                     batch_size=batch_size,
                     shuffle=True)
    test_loader = torch.utils.data.DataLoader(
                    dataset=test_set,
                    batch_size=batch_size,
                    shuffle=False)

    print('==>>> total training batch number: {}'.format(len(train_loader)))
    print('==>>> total testing batch number: {}'.format(len(test_loader)))


    # define the base model for ParetoMTL
    if base_model == "lenet":
        model = RegressionTrain(RegressionModel(n_tasks), init_weight)
    if base_model == "resnet18":
        model = RegressionTrainResNet(MnistResNet(n_tasks), init_weight)

    if torch.cuda.is_available():
        model.cuda()

    # initialize weights
    weights = torch.ones(n_tasks)
    if torch.cuda.is_available():
        weights = weights.cuda()
    weights.requires_grad_()

    # choose different optimizer for different base model
    if base_model == 'lenet':
        optimizer = torch.optim.SGD(list(model.parameters()) + [weights], lr=1e-3, momentum=0.)
        # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[15,30,45,60,75,90], gamma=0.5)

    if base_model == 'resnet18':
        optimizer = torch.optim.Adam(list(model.parameters()) + [weights], lr=1e-3)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10,20], gamma=0.1)


    # store information during optimization
    task_train_losses = []
    train_accs = []

    # initialize loss
    initial_task_loss = None

    # run niter epochs of GradNorm
    for t in range(niter):

        # scheduler.step()

        model.train()
        for (it, batch) in enumerate(train_loader):

            X = batch[0]
            ts = batch[1]
            if torch.cuda.is_available():
                X = X.cuda()
                ts = ts.cuda()

            # compute loss
            optimizer.zero_grad()
            task_loss = model(X, ts)
            if initial_task_loss is None:
                initial_task_loss = task_loss.detach()

            # compute parameter gradients
            weighted_loss = torch.sum(task_loss * weights)
            weighted_loss.backward(retain_graph=True)
            weights.grad.data = weights.grad.data * 0.

            # compute per-task gradient norms
            grad_norms = []
            for i in range(len(task_loss)):
                grad = torch.autograd.grad(task_loss[i], model.model.parameters(), retain_graph=True)
                grad = torch.cat([torch.flatten(x) for x in grad])
                grad_norms.append(torch.linalg.norm(weights[i] * grad))
            grad_norms = torch.stack(grad_norms)

            mean_grad_norm = torch.mean(grad_norms.detach())
            loss_ratio = task_loss.detach() / initial_task_loss
            inverse_loss_ratio = loss_ratio / torch.mean(loss_ratio)
            weight_loss = torch.sum(torch.abs(grad_norms - mean_grad_norm * (inverse_loss_ratio ** alpha)))
            weights.grad.data = torch.autograd.grad(weight_loss, weights)[0]

            # SGD step
            optimizer.step()

            # normalize weights
            weights.data = weights.data / torch.norm(weights.data)

        # calculate and record performance
        if t == 0 or (t + 1) % 2 == 0:

            model.eval()
            with torch.no_grad():

                total_train_loss = []
                train_acc = []

                correct1_train = 0
                correct2_train = 0

                for (it, batch) in enumerate(test_loader):

                    X = batch[0]
                    ts = batch[1]
                    if torch.cuda.is_available():
                        X = X.cuda()
                        ts = ts.cuda()

                    valid_train_loss = model(X, ts)
                    total_train_loss.append(valid_train_loss)
                    output1 = model.model(X).max(2, keepdim=True)[1][:,0]
                    output2 = model.model(X).max(2, keepdim=True)[1][:,1]
                    correct1_train += output1.eq(ts[:,0].view_as(output1)).sum().item()
                    correct2_train += output2.eq(ts[:,1].view_as(output2)).sum().item()


                train_acc = np.stack([
                    1.0 * correct1_train / len(test_loader.dataset),
                    1.0 * correct2_train / len(test_loader.dataset)
                ])

                total_train_loss = torch.stack(total_train_loss)
                average_train_loss = torch.mean(total_train_loss, dim=0)

            # record and print
            if torch.cuda.is_available():
                task_train_losses.append(average_train_loss.data.cpu().numpy())
                train_accs.append(train_acc)
                print('{}/{}: weight={} train_loss={}, train_acc={}'.format(
                        t + 1, niter, weights, task_train_losses[-1],train_accs[-1]))

    result = {"training_losses": task_train_losses,
              "training_accuracies": train_accs}
    return result, model
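A minimal driver sketch for Example 5, assuming `train` is importable; `alpha` is the GradNorm asymmetry hyper-parameter (alpha = 0 pulls every task's gradient norm toward the common mean, larger values favor tasks that are training more slowly). The `init_weight` value and the other arguments below are illustrative.

import numpy as np

# Hypothetical driver: compare a few GradNorm asymmetry settings.
for alpha in [0.0, 0.5, 1.5]:
    result, model = train(dataset='mnist', base_model='lenet', niter=100,
                          npref=5, init_weight=np.array([0.5, 0.5]),
                          pref_idx=0, alpha=alpha)
    print(alpha, result["training_accuracies"][-1])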