Code example #1
def aggregation_thread():
    # Get the test loader
    test_loader = getTestLoader(serverargs)
    # Select the aggregation function
    agg_func = selectAggregator(serverargs)
    model_data = []
    node_model = None
    for node in node_details:
        # Load each node's model and pair it with that node's sample count
        node_model = torch.load(serverargs['aggregated_model_location'] + node['node_name'] + '.pt') \
            .to(serverargs['device'])
        model_data.append((node_model, node['no_of_samples']))
    if serverargs['smpc']:
        # SMPC enabled: apply layer sharing to the collected node models
        model_data = layer_sharing(model_data, serverargs)
    agg_model = agg_func(model_data, serverargs)
    # Sanity check: compare the aggregate against the last node model
    print("ModelCheck: ", checkModelEqual(node_model, agg_model))
    compare_models(node_model, agg_model)
    torch.save(agg_model, serverargs['aggregated_model_location'] + 'agg_model.pt')
    print("---Aggregation Done---")
    # Evaluate the aggregated model
    test(serverargs, agg_model, test_loader, logger=logger)
    serverargs['current_agg_epoch'] += 1

    # Aggregate the same node models again with coordinate-wise median ('comed')
    serverargs['aggregator'] = 'comed'
    agg_func = selectAggregator(serverargs)  # re-select, since the aggregator changed
    agg_model = agg_func(model_data, serverargs)
    print("---Aggregation Done---")
    test(serverargs, agg_model, test_loader, logger=logger)
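
The helpers used above (selectAggregator, layer_sharing, checkModelEqual, compare_models, test) live elsewhere in the project and are not shown on this page. As a rough guide only, here is a minimal sketch of what an equality check like checkModelEqual could look like, assuming it compares the two models' state dicts tensor by tensor:

import torch

def checkModelEqual(model_a, model_b):
    # Sketch (assumption): True iff both models hold identical parameter
    # and buffer tensors under the same keys.
    state_a, state_b = model_a.state_dict(), model_b.state_dict()
    if state_a.keys() != state_b.keys():
        return False
    return all(torch.equal(state_a[k], state_b[k]) for k in state_a)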
Code example #2
def sendmodel():
    # Persist the uploaded node model in the aggregation directory
    file = request.files['file']
    path = os.path.join(serverargs['aggregated_model_location'], file.filename)
    file.save(path)

    count_done = 0
    # Advance the sender's aggregation epoch and count nodes done with this round
    for node in node_details:
        if node['node_name'] == file.filename.split('.')[0]:
            node['agg_epoch'] += 1
        if (node['agg_epoch'] - 1) == serverargs['current_agg_epoch']:
            count_done += 1
    if count_done == serverargs['num_of_nodes']:
        # All nodes have reported for this round: aggregate
        test_loader = getTestLoader(serverargs)
        agg_func = selectAggregator(serverargs)
        model_data = []
        node_model = None
        for node in node_details:
            node_model = torch.load(serverargs['aggregated_model_location'] + node['node_name'] + '.pt') \
                .to(serverargs['device'])
            model_data.append((node_model, node['no_of_samples']))
        agg_model = agg_func(model_data, serverargs)
        print("ModelCheck: ", checkModelEqual(node_model, agg_model))
        compare_models(node_model, agg_model)
        torch.save(agg_model,
                   serverargs['aggregated_model_location'] + 'agg_model.pt')
        print("---Aggregation Done---")
        # Evaluate the last node model and the aggregate for comparison
        test(serverargs, node_model, test_loader, logger=logger)
        test(serverargs, agg_model, test_loader, logger=logger)
        serverargs['current_agg_epoch'] += 1
    return json.dumps({"status": "model sent successfully!"})
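
selectAggregator returns one of the project's aggregation functions. Since each entry of model_data is a (model, no_of_samples) tuple, the default is presumably sample-weighted federated averaging; here is a minimal FedAvg sketch under that assumption (the name fedavg and its placement are hypothetical, not the project's actual code):

import copy
import torch

def fedavg(model_data, serverargs):
    # Sketch (assumption): average node parameters, weighting each node
    # by its share of the total training samples.
    total = sum(samples for _, samples in model_data)
    agg_model = copy.deepcopy(model_data[0][0])
    agg_state = agg_model.state_dict()
    with torch.no_grad():
        for key in agg_state:
            agg_state[key] = sum(m.state_dict()[key].float() * (samples / total)
                                 for m, samples in model_data)
    agg_model.load_state_dict(agg_state)
    return agg_model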
Code example #3
# Standard imports assumed by this excerpt; dataloader, model, the x0
# solvers (x0SolverWithMult, x0SolverNoMult), the augmented-Lagrangian
# helpers, and str2bool/send/receive are project-local.
import argparse
import os
import socket
import sys

import matplotlib.pyplot as plt
import torch
import torch.distributed as dist


def main():

    assert "WORLD_SIZE" in os.environ, "WORLD_SIZE not set"
    assert "RANK" in os.environ, "RANK not set"
    assert "MASTER_ADDR" in os.environ, "MASTER_ADDR not set"
    assert "MASTER_PORT" in os.environ, "MASTER_PORT not set"

    world_size = int(os.environ['WORLD_SIZE'])
    number_nodes = world_size - 1
    rank = int(os.environ['RANK'])

    assert rank == 0, "Master does not have rank 0"
    assert world_size > 1, "World size is smaller than 2 (no workers)"

    print(socket.gethostbyname(socket.gethostname()))
    sys.stdout.flush()

    parser = argparse.ArgumentParser(description='Hong\'s ADMM')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--max-iterations',
                        type=int,
                        default=10,
                        help='How many iterations t? (default: 10)')
    parser.add_argument('--rho',
                        type=float,
                        default=1,
                        help='Rho for all nodes (default: 1)')
    parser.add_argument('--multiplier',
                        type=str2bool,
                        default=True,
                        help='Use lag. multipliers?')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        help='Learning rate for node (default: 0.01)')
    parser.add_argument('--split', type=str2bool, default=True, help='split?')
    args = parser.parse_args()

    filename = f'sync_mult{args.multiplier}_split{args.split}_r{args.rho}_lr{str(args.lr)}_n{number_nodes}.pdf'
    print(filename)

    # Do not use CUDA, to avoid unnecessary transfers between GPU and CPU
    use_cuda = False  # would otherwise be: not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    test_dataloader = dataloader.getTestLoader(kwargs)
    progress_dataloader = dataloader.getProgressLoader(kwargs)

    # TODO: receive rhos from workers
    rhos = [args.rho] * number_nodes

    x0_model = model.Net().to(device)
    xk_models, yk_models = [], []
    for k in range(number_nodes):
        xk_dummy = model.Net().to(device)
        xk_models.append(xk_dummy)
        yk_dummy = model.Net().to(device)
        yk_models.append(yk_dummy)

    augmented_lagrangians, progress_losses, progress_accs = [], [], []

    dist.init_process_group(backend='gloo')
    print("init_process_group done")

    for t in range(args.max_iterations):

        # receive models from workers
        for w in range(1, world_size):
            xk_models[w - 1] = receive(xk_models[w - 1], src=w, tag=1)
            if args.multiplier:
                yk_models[w - 1] = receive(yk_models[w - 1], src=w, tag=2)

        # x0 update
        x0_model.train()
        if args.multiplier:
            x0_model = x0SolverWithMult.solve(x0_model, xk_models, yk_models,
                                              rhos)
        else:
            x0_model = x0SolverNoMult.solve(x0_model, xk_models, rhos)

        # evaluation
        x0_model.eval()
        for k in range(number_nodes):
            xk_models[k].eval()
            yk_models[k].eval()
        if args.multiplier:
            aug_lagrangian, progress_loss, progress_acc = augLagrangianWithMult.get(
                progress_dataloader, device, x0_model, xk_models, yk_models,
                rhos)
        else:
            aug_lagrangian, progress_loss, progress_acc = augLagrangianNoMult.get(
                progress_dataloader, device, x0_model, xk_models, rhos)
        augmented_lagrangians.append(aug_lagrangian)
        progress_losses.append(progress_loss)
        progress_accs.append(progress_acc)
        print(
            f"[{t}] Augmented Lagrangian: {aug_lagrangian}, Loss: {progress_loss}, Acc: {(progress_acc * 100):.1f}%"
        )

        # send out x0 model to workers
        for w in range(1, world_size):
            send(x0_model, dst=w, tag=0)

    print("DONE")

    # Plot the training curves
    fig, ax = plt.subplots(3, figsize=(10, 20))
    ax[0].set_title('Augmented Lagrangian')
    ax[0].plot(augmented_lagrangians)
    ax[1].set_title('x0 Cross Entropy Loss')
    ax[1].plot(progress_losses)
    ax[2].set_title('Accuracy')
    ax[2].plot(progress_accs)
    fig.savefig(f"graphs/{filename}", bbox_inches='tight')
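
The send/receive helpers and str2bool are defined elsewhere in the project file. A minimal sketch of what they might look like, assuming one point-to-point transfer per parameter tensor with the same tag scheme as the asynchronous variant further below (tag * 1000 + i):

import argparse
import torch.distributed as dist

def send(model, dst, tag):
    # Sketch (assumption): ship each parameter tensor to `dst`.
    for i, param in enumerate(model.parameters()):
        dist.send(tensor=param.data, dst=dst, tag=tag * 1000 + i)

def receive(model, src, tag):
    # Sketch (assumption): overwrite the model's parameters in place
    # with tensors received from `src`.
    for i, param in enumerate(model.parameters()):
        dist.recv(tensor=param.data, src=src, tag=tag * 1000 + i)
    return model

def str2bool(v):
    # Sketch (assumption): the usual argparse boolean-flag parser.
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')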
Code example #4
import torch
import os

import dataloader
import models
import parallel_run

# PySyft, used to simulate the federated nodes
import syft as sy

# Get configuration parameters
import config
args = config.Arguments()

# Load and simulate distribution of data
FLdataloaders, datasample_count, nodelist = dataloader.getDataLoaders(args, sy)
testloader = dataloader.getTestLoader(args)

parallel_run.runTrainParallel(nodelist, datasample_count, args, FLdataloaders,
                              testloader)
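
config.Arguments is the project's configuration container. A minimal sketch of the kind of object this driver expects; every attribute name and value below is an assumption, not the project's actual configuration:

class Arguments:
    # Sketch (assumption): a plain attribute container for run settings.
    def __init__(self):
        self.batch_size = 64         # per-node training batch size
        self.test_batch_size = 1000  # evaluation batch size
        self.epochs = 10             # local epochs per round
        self.lr = 0.01               # node learning rate
        self.num_of_nodes = 3        # simulated federated nodes
        self.device = 'cpu'          # device for training tensors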
Code example #5
args['architecture'] = serverargs['architecture']
model_path = os.path.join(args['model_location'], args['node_name'] + '.pt')

# Initialize Weights & Biases logging if enabled
if args['wandb']:
    logger = log.initialize_wandb(args['node_name'])
else:
    logger = None

while True:  # TODO: bound this by a maximum number of aggregation epochs
    # Fetch the current global model from the aggregator and save it locally
    getModel(agg_url, model_path, local_agg_epoch)

    # Creating train loader and test loader
    train_loader = getTrainLoader(args)
    test_loader = getTestLoader(args)
    # Train loop
    local_model = loadModel(model_path).to(args['device'])

    agg_model = torch.load('../../aggregated_model/agg_model.pt').to(
        args['device'])
    print("ModelCheck: ", checkModelEqual(local_model, agg_model))

    optimizer = optim.Adam(local_model.parameters(), lr=args['lr'])
    for epoch in range(1, args['epochs'] + 1):
        if args['byzantine'] == 'FAIL':
            # Byzantine failure mode: submit a randomly initialized model
            local_model = createRandomInitializedModel(args).to(args['device'])
        else:
            train.train(logger=logger, args=args, model=local_model,
                        train_loader=train_loader, optimizer=optimizer,
                        epoch=epoch)
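
getModel fetches the aggregator's current global model over HTTP and saves it under model_path. A minimal sketch under that assumption; the endpoint, its query parameter, and the retry policy are all hypothetical:

import time
import requests

def getModel(agg_url, model_path, local_agg_epoch):
    # Sketch (assumption): poll the aggregator until the model for the
    # requested aggregation epoch is ready, then write it to disk.
    while True:
        resp = requests.get(agg_url, params={'agg_epoch': local_agg_epoch})
        if resp.status_code == 200:
            with open(model_path, 'wb') as f:
                f.write(resp.content)
            return
        time.sleep(5)  # not ready yet; try again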
Code example #6
# Standard imports assumed by this excerpt; dataloader, model, the x0
# solvers, the augmented-Lagrangian helpers, and str2bool/wait_thread
# are project-local.
import argparse
import os
import socket
import sys
import threading
import time

import matplotlib.pyplot as plt
import torch
import torch.distributed as dist


def main():

    assert "WORLD_SIZE" in os.environ, "WORLD_SIZE not set"
    assert "RANK" in os.environ, "RANK not set"
    assert "MASTER_ADDR" in os.environ, "MASTER_ADDR not set"
    assert "MASTER_PORT" in os.environ, "MASTER_PORT not set"

    world_size = int(os.environ['WORLD_SIZE'])
    number_nodes = world_size - 1
    rank = int(os.environ['RANK'])

    assert rank == 0, "Master does not have rank 0"
    assert world_size > 1, "World size is smaller than 2 (no workers)"

    print(socket.gethostbyname(socket.gethostname()))
    sys.stdout.flush()

    parser = argparse.ArgumentParser(description='Hong\'s ADMM')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--max-iterations',
                        type=int,
                        default=10,
                        help='How many iterations t? (default: 10)')
    parser.add_argument('--rho',
                        type=float,
                        default=1,
                        help='Rho for all nodes (default: 1)')
    parser.add_argument('--multiplier',
                        type=str2bool,
                        default=False,
                        help='Use lag. multipliers?')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        help='Learning rate for node (default: 0.01)')
    parser.add_argument('--split', type=str2bool, default=False, help='split?')
    parser.add_argument('--barrier',
                        type=int,
                        default=1,
                        help='Partial Barrier')
    parser.add_argument('--experiment',
                        type=str,
                        default="admm",
                        help='Experiment identifier')
    args = parser.parse_args()

    stem = f'async_{args.experiment}_mult{args.multiplier}_split{args.split}_b{args.barrier}_r{args.rho}_lr{str(args.lr)}_n{number_nodes}'
    filename = f'{stem}.pdf'
    loss_file = open(f'data/loss_{stem}.csv', 'w+')
    acc_file = open(f'data/acc_{stem}.csv', 'w+')
    delay_file = open(f'data/delays_{stem}.csv', 'w+')
    time_file = open(f'data/time_{stem}.csv', 'w+')
    print(filename)

    torch.manual_seed(args.seed)

    # Do not use CUDA, to avoid unnecessary transfers between GPU and CPU
    use_cuda = False  # would otherwise be: not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    test_dataloader = dataloader.getTestLoader(kwargs)
    progress_dataloader = dataloader.getProgressLoader(kwargs)

    # TODO: receive rhos from workers
    rhos = [args.rho] * number_nodes

    x0_model = model.Net().to(device)
    xk_models, yk_models = [], []
    for k in range(number_nodes):
        xk_dummy = model.Net().to(device)
        xk_models.append(xk_dummy)
        yk_dummy = model.Net().to(device)
        yk_models.append(yk_dummy)

    augmented_lagrangians, progress_losses, progress_accs = [], [], []

    dist.init_process_group(backend='gloo')
    print("init_process_group done")

    wait_threads_for_nodes = []
    node_xk_weights = [None] * number_nodes  # xk_weights
    node_yk_weights = [None] * number_nodes  # yk_weights
    for k in range(number_nodes):
        node_xk_weights[k] = model.save(xk_models[k])
        node_yk_weights[k] = model.save(yk_models[k])

    # start all receiving jobs at the beginning
    for w in range(1, world_size):
        wait_threads_for_weights = []
        for i in range(len(node_xk_weights[w - 1])):
            req = dist.irecv(tensor=node_xk_weights[w - 1][i],
                             src=w,
                             tag=1 * 1000 + i)
            thr = threading.Thread(target=wait_thread,
                                   args=(req, ),
                                   daemon=True)
            thr.start()
            wait_threads_for_weights.append(thr)
        if args.multiplier:
            for i in range(len(node_yk_weights[w - 1])):
                req = dist.irecv(tensor=node_yk_weights[w - 1][i],
                                 src=w,
                                 tag=2 * 1000 + i)
                thr = threading.Thread(target=wait_thread,
                                       args=(req, ),
                                       daemon=True)
                thr.start()
                wait_threads_for_weights.append(thr)
        wait_threads_for_nodes.append(wait_threads_for_weights)

    node_iterations = [0] * number_nodes

    tic = time.time()
    t = 0
    while True:
        # check which nodes have finished receiving all their tensors
        iteration_done = []
        for k in range(number_nodes):
            done = all(not thr.is_alive() for thr in wait_threads_for_nodes[k])
            if done:
                iteration_done.append(k)

        if len(iteration_done) < args.barrier:
            print(
                f"Not enough nodes ready: {len(iteration_done)}/{args.barrier}. Sleep..."
            )
            time.sleep(1)
            continue

        print(f"Perform x0 update using nodes {iteration_done}")
        for k in iteration_done:
            node_iterations[k] = node_iterations[k] + 1
        delay_file.write(', '.join(str(e) for e in iteration_done))
        delay_file.write("\r\n")
        print(node_iterations)

        for k in iteration_done:
            xk_models[k] = model.load(node_xk_weights[k], xk_models[k])
            if args.multiplier:
                yk_models[k] = model.load(node_yk_weights[k], yk_models[k])

        # x0 update
        t = t + 1
        if args.multiplier:
            x0_model = x0SolverWithMult.solve(x0_model, xk_models, yk_models,
                                              rhos)
        else:
            x0_model = x0SolverNoMult.solve(x0_model, xk_models, rhos)

        # send out new x0 model to iteration_done
        x0_weights = model.save(x0_model)
        reqs = []
        for k in iteration_done:
            for i, x0_weight in enumerate(x0_weights):
                req = dist.isend(tensor=x0_weight, dst=k + 1, tag=0 * 1000 + i)
                print(f"x0_weight {i} sending out to {k+1}. Tag: {0*1000+i}")
                reqs.append(req)
        # TODO: wait?
        for req in reqs:
            req.wait()

        # start receiving from nodes again for iteration_done
        for k in iteration_done:
            node_xk_weights[k] = model.save(xk_models[k])
            node_yk_weights[k] = model.save(yk_models[k])
            wait_threads_for_weights = []
            for i in range(len(node_xk_weights[k])):
                req = dist.irecv(tensor=node_xk_weights[k][i],
                                 src=k + 1,
                                 tag=1 * 1000 + i)
                thr = threading.Thread(target=wait_thread,
                                       args=(req, ),
                                       daemon=True)
                thr.start()
                wait_threads_for_weights.append(thr)
            if args.multiplier:
                for i in range(len(node_yk_weights[k])):
                    req = dist.irecv(tensor=node_yk_weights[k][i],
                                     src=k + 1,
                                     tag=2 * 1000 + i)
                    thr = threading.Thread(target=wait_thread,
                                           args=(req, ),
                                           daemon=True)
                    thr.start()
                    wait_threads_for_weights.append(thr)
            wait_threads_for_nodes[k] = wait_threads_for_weights

        # evaluation
        x0_model.eval()
        for k in range(number_nodes):
            xk_models[k].eval()
            yk_models[k].eval()
        if args.multiplier:
            aug_lagrangian, progress_loss, progress_acc = augLagrangianWithMult.get(
                progress_dataloader, device, x0_model, xk_models, yk_models,
                rhos)
        else:
            aug_lagrangian, progress_loss, progress_acc = augLagrangianNoMult.get(
                progress_dataloader, device, x0_model, xk_models, rhos)
        augmented_lagrangians.append(aug_lagrangian)
        progress_losses.append(progress_loss)
        progress_accs.append(progress_acc)
        current_time = time.time() - tic
        loss_file.write(f"{current_time}; {progress_loss}\r\n")
        acc_file.write(f"{current_time}; {progress_acc}\r\n")
        print(
            f"Augmented Lagrangian: {aug_lagrangian}, Loss: {progress_loss}, Acc: {(progress_acc * 100):.1f}%"
        )
        x0_model.train()
        for k in range(number_nodes):
            xk_models[k].train()
            yk_models[k].train()

        # stop condition
        if all(it > args.max_iterations for it in node_iterations):
            break

    toc = time.time()
    tic_toc = toc - tic
    time_file.write(str(tic_toc))

    print("DONE")

    # Plot the training curves
    fig, ax = plt.subplots(3, figsize=(10, 20))
    ax[0].set_title('Augmented Lagrangian')
    ax[0].plot(augmented_lagrangians)
    ax[1].set_title('x0 Cross Entropy Loss')
    ax[1].plot(progress_losses)
    ax[2].set_title('Accuracy')
    ax[2].plot(progress_accs)
    fig.savefig(f"graphs/{filename}", bbox_inches='tight')
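
wait_thread and the model.save/model.load helpers are project code. Sketches under plausible assumptions: wait_thread presumably just blocks on the request (so thr.is_alive() doubles as a completion flag), while save/load snapshot and restore parameters as lists of tensors that dist.irecv can write into:

import torch

def wait_thread(req):
    # Sketch (assumption): block until the isend/irecv request completes.
    req.wait()

def save(net):
    # Sketch (assumption): snapshot parameters as detached tensor copies,
    # usable as receive buffers for dist.irecv.
    return [p.data.clone() for p in net.parameters()]

def load(weights, net):
    # Sketch (assumption): copy received tensors back into the model.
    with torch.no_grad():
        for p, w in zip(net.parameters(), weights):
            p.copy_(w)
    return net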
Code example #7
File: admm.py Project: filipre/master-experiments
# Standard imports assumed by this excerpt; dataloader, model, delay,
# the solvers, and the augmented-Lagrangian helpers are project-local
# modules of filipre/master-experiments.
import argparse
from collections import deque

import matplotlib.pyplot as plt
import torch


def main():
    parser = argparse.ArgumentParser(description='Hong\'s ADMM')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--max-delay',
        type=int,
        default=1,
        metavar='D',
        help='maximal gradient delay (default: 1, i.e. no delay)')
    parser.add_argument(
        '--number-nodes',
        type=int,
        default=3,
        help='How many nodes should we simulate? (default: 3)')
    parser.add_argument('--node-batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size within node (default: 64)')
    parser.add_argument('--node-epoch',
                        type=int,
                        default=1,
                        metavar='N',
                        help='number of epoch in node (default: 1)')
    parser.add_argument('--max-iterations',
                        type=int,
                        default=10,
                        help='How many iterations t? (default: 10)')
    parser.add_argument('--rho',
                        type=float,
                        default=1,
                        help='Rho for all nodes (default: 1)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='Learning rate for all nodes (default: 0.001)')
    parser.add_argument('--delay-method',
                        type=str,
                        default='constant',
                        help='constant, uniform, ...')
    parser.add_argument('--multiplier',
                        type=str2bool,
                        default=False,
                        help='Use lag. multipliers?')
    parser.add_argument('--split', type=str2bool, default=False, help='split?')
    parser.add_argument('--partial',
                        type=int,
                        default=None,
                        help='partial? (default: None)')
    args = parser.parse_args()

    stem = f'dm{args.delay_method}_d{args.max_delay}_mult{args.multiplier}_split{args.split}_r{args.rho}_lr{str(args.lr)}_n{args.number_nodes}'
    loss_file = open(f'data/loss_{stem}.csv', 'w+')
    acc_file = open(f'data/acc_{stem}.csv', 'w+')
    filename = f'{stem}.pdf'
    print(filename)

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    # Report the selected device (GPU if available, else CPU)
    print('Using device:', device)
    print()

    if args.split:
        train_dataloaders = dataloader.getSplittedTrainingLoaders(
            args.number_nodes,
            args.node_batch_size,
            kwargs,
            partial=args.partial)
    else:
        train_dataloaders = dataloader.getSameTrainingLoaders(
            args.number_nodes,
            args.node_batch_size,
            kwargs,
            partial=args.partial)
    test_dataloader = dataloader.getTestLoader(kwargs)
    progress_dataloader = dataloader.getProgressLoader(kwargs)

    # Initialization
    x0_model = model.Net().to(device)  #x0
    x0_model_queue = deque([x0_model])
    xk_models_queue, yk_models_queue = [], []
    for node in range(args.number_nodes):
        # every node starts from its own randomly initialized copy
        xk_model = model.Net().to(device)
        xk_models_queue.append(deque([xk_model]))
        yk_model = model.Net().to(device)
        yk_models_queue.append(deque([yk_model]))

    rhos = [args.rho] * args.number_nodes
    lrs = [args.lr] * args.number_nodes

    augmented_lagrangians, progress_losses, progress_accs = [], [], []
    node_scores, node_losses, node_residuals = [], [], []
    for node in range(args.number_nodes):
        node_scores.append([])
        node_losses.append([])
        node_residuals.append([])

    # Algorithm
    for t in range(args.max_iterations):

        x0_model = model.Net().to(device)
        x0_model.train()
        model.copyWeights(x0_model_queue[0], x0_model)
        xk_models = delay.forMaster(xk_models_queue, args.max_delay,
                                    args.delay_method)
        yk_models = delay.forMaster(yk_models_queue, args.max_delay,
                                    args.delay_method)

        # x0 update
        if args.multiplier:
            x0_model = x0SolverWithMult.solve(x0_model, xk_models, yk_models,
                                              rhos)
        else:
            x0_model = x0SolverNoMult.solve(x0_model, xk_models, rhos)

        # push new model to queue
        x0_model_queue.appendleft(x0_model)
        if len(x0_model_queue) > args.max_delay:
            x0_model_queue.pop()  # remove oldest model

        for k in range(args.number_nodes):

            xk_model = model.Net().to(device)
            xk_model.train()
            model.copyWeights(xk_models_queue[k][0], xk_model)
            yk_model = model.Net().to(device)
            yk_model.train()
            model.copyWeights(yk_models_queue[k][0], yk_model)
            x0_model = delay.forWorker(x0_model_queue, args.max_delay,
                                       args.delay_method)

            # xk update
            if args.multiplier:
                xk_model, scores, losses, residuals = xkSolverWithMult.solve(
                    xk_model, train_dataloaders[k], device, x0_model, yk_model,
                    rhos[k], lrs[k], args.node_epoch)
            else:
                xk_model, scores, losses, residuals = xkSolverNoMult.solve(
                    xk_model, train_dataloaders[k], device, x0_model, rhos[k],
                    lrs[k], args.node_epoch)
            node_scores[k] = node_scores[k] + scores
            node_losses[k] = node_losses[k] + losses
            node_residuals[k] = node_residuals[k] + residuals

            # yk update
            if args.multiplier:
                yk_model = ykSolver.solve(yk_model, x0_model, xk_model,
                                          rhos[k])

            xk_models_queue[k].appendleft(xk_model)
            yk_models_queue[k].appendleft(yk_model)
            assert len(xk_models_queue[k]) == len(
                yk_models_queue[k]), "something is wrong with the queues"
            if len(xk_models_queue[k]) > args.max_delay:
                xk_models_queue[k].pop()
                yk_models_queue[k].pop()

        # evaluation
        x0_model = x0_model_queue[0]
        x0_model.eval()
        xk_models, yk_models = [], []
        for k in range(args.number_nodes):
            xk_model = xk_models_queue[k][0]
            xk_model.eval()
            xk_models.append(xk_model)
            yk_model = yk_models_queue[k][0]
            yk_model.eval()
            yk_models.append(yk_model)

        if args.multiplier:
            aug_lagrangian, progress_loss, progress_acc = augLagrangianWithMult.get(
                progress_dataloader, device, x0_model, xk_models, yk_models,
                rhos)
        else:
            aug_lagrangian, progress_loss, progress_acc = augLagrangianNoMult.get(
                progress_dataloader, device, x0_model, xk_models, rhos)
        augmented_lagrangians.append(aug_lagrangian)
        progress_losses.append(progress_loss)
        progress_accs.append(progress_acc)
        loss_file.write(f"{progress_loss}\r\n")
        acc_file.write(f"{progress_acc}\r\n")
        print(
            f"[{t}] Augmented Lagrangian: {aug_lagrangian}, Loss: {progress_loss}, Acc: {(progress_acc * 100):.1f}%"
        )

    print("DONE")

    # Create graphs
    fig, ax = plt.subplots(1, figsize=(10, 5))
    ax.set_title('Augmented Lagrangian')
    ax.set_yscale('log')
    ax.plot(augmented_lagrangians)
    fig.savefig(f"graphs/auglag_{filename}", bbox_inches='tight')

    fig, ax = plt.subplots(1, figsize=(10, 5))
    ax.set_title('x0 Cross Entropy Loss')
    ax.set_yscale('log')
    ax.plot(progress_losses)
    fig.savefig(f"graphs/xentrop_{filename}", bbox_inches='tight')

    # detailed graph
    fig, ax = plt.subplots(6, figsize=(10, 20))
    ax[0].set_title('Augmented Lagrangian')
    ax[0].plot(augmented_lagrangians)
    ax[1].set_title('x0 Cross Entropy Loss')
    ax[1].plot(progress_losses)
    ax[2].set_title('Accuracy')
    ax[2].plot(progress_accs)
    ax[3].set_title('Node Objective Function Scores')
    for k in range(args.number_nodes):
        ax[3].plot(node_scores[k])
    ax[4].set_title('Node Losses')
    for k in range(args.number_nodes):
        ax[4].plot(node_losses[k])
    ax[5].set_title('L1 Residuals')
    for k in range(args.number_nodes):
        ax[5].plot(node_residuals[k])
    fig.savefig(f"graphs/details_{filename}", bbox_inches='tight')
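
The delay module simulates stale updates by reading older entries from the model queues (index 0 is the newest). A minimal sketch of delay.forMaster/delay.forWorker under that assumption; the staleness policy for each delay_method is guessed:

import random

def forMaster(model_queues, max_delay, delay_method):
    # Sketch (assumption): pick one (possibly stale) model per node.
    picked = []
    for queue in model_queues:
        if delay_method == 'constant':
            idx = min(max_delay - 1, len(queue) - 1)  # maximally stale
        else:  # e.g. 'uniform': random staleness up to max_delay
            idx = random.randrange(min(max_delay, len(queue)))
        picked.append(queue[idx])
    return picked

def forWorker(model_queue, max_delay, delay_method):
    # Sketch (assumption): same policy for the single x0 queue.
    return forMaster([model_queue], max_delay, delay_method)[0]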
Code example #8
File: sgd.py Project: filipre/master-experiments
# Standard imports assumed by this excerpt; dataloader, model, delay,
# and solver are project-local modules of filipre/master-experiments.
import argparse
from collections import deque

import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F


def main():
    # defaults: training batch 64, test batch 1000, 10 epochs, lr 0.01
    parser = argparse.ArgumentParser(description='SGD')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument('--max-delay', type=int, default=1, metavar='D',
                        help='maximal gradient delay (default: 1, i.e. no delay)')
    parser.add_argument('--check-progress', type=int, default=1, help='Check progress every n iterations (default: 1)')
    parser.add_argument('--delay-method', type=str, default='constant', help='constant, uniform, ...')
    parser.add_argument('--partial', type=int, default=None, help='partial? (default: None)')
    args = parser.parse_args()
    print(args)

    filename = f'dm{args.delay_method}_d{args.max_delay}_lr{str(args.lr)}'
    graph_filename = f'graphs/graph_{filename}.pdf'
    loss_file = open(f'data/loss_{filename}.csv', 'w+')
    acc_file = open(f'data/acc_{filename}.csv', 'w+')
    print(filename)

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    # Report the selected device (GPU if available, else CPU)
    print('Using device:', device)
    print()

    train_dataloader = dataloader.getSameTrainingLoader(args.batch_size, kwargs, partial=args.partial)
    print(len(train_dataloader))
    test_dataloader = dataloader.getTestLoader(kwargs)
    progress_dataloader = dataloader.getProgressLoader(kwargs)

    x_model = model.Net().to(device) # This is the "master" model on which we update the parameters
    x_model.train()
    x_model_queue = deque([x_model])

    progress_losses, progress_accs = [], []

    for epoch in range(args.epochs):

        # Training
        for batch_idx, (data, target) in enumerate(train_dataloader):

            # TRAIN step
            x_model = model.Net().to(device)
            x_model.train()
            model.copyWeights(x_model_queue[0], x_model) # get most recent parameters
            delayed_model = delay.delayModel(x_model_queue, args.max_delay, args.delay_method)
            delayed_model.train()

            x_model, loss = solver.solve(x_model, data, target, device, delayed_model, args.lr)

            x_model_queue.appendleft(x_model)
            if len(x_model_queue) > args.max_delay:
                x_model_queue.pop() # if we have more models than we want, remove the oldest

        # Evaluation after each epoch
        x_model_queue[0].eval()
        progress_loss = 0
        progress_correct = 0
        with torch.no_grad():
            for data, target in progress_dataloader:
                data, target = data.to(device), target.to(device)
                output = x_model_queue[0](data)
                progress_loss += F.nll_loss(output, target, reduction='sum').item()
                pred = output.argmax(dim=1, keepdim=True)
                progress_correct += pred.eq(target.view_as(pred)).sum().item()
        progress_loss /= len(progress_dataloader.dataset)
        progress_losses.append(progress_loss)
        progress_acc = progress_correct / len(progress_dataloader.dataset)
        progress_accs.append(progress_acc)
        loss_file.write(f"{progress_loss}\r\n")
        acc_file.write(f"{progress_acc}\r\n")
        print(f"[{epoch}] Progress Loss: {progress_loss}, Acc: {(progress_acc * 100):.1f}%")

    fig, ax = plt.subplots(2, figsize=(10,20))
    ax[0].set_title('Cross-entropy loss')
    ax[0].plot(progress_losses)
    ax[1].set_title('Accuracy')
    ax[1].plot(progress_accs)
    fig.savefig(graph_filename, bbox_inches='tight')
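
solver.solve appears to perform one delayed-gradient SGD step: the loss and gradient are evaluated on the delayed snapshot, and the resulting gradient is applied to the up-to-date model. A minimal sketch under that assumption:

import torch
import torch.nn.functional as F

def solve(x_model, data, target, device, delayed_model, lr):
    # Sketch (assumption): gradient at the stale parameters, applied to
    # the current parameters (the classic delayed-SGD update).
    data, target = data.to(device), target.to(device)
    delayed_model.zero_grad()
    loss = F.nll_loss(delayed_model(data), target)
    loss.backward()
    with torch.no_grad():
        for p_cur, p_old in zip(x_model.parameters(),
                                delayed_model.parameters()):
            p_cur -= lr * p_old.grad
    return x_model, loss.item()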