Example #1
def train(model, criterion, optimizer, loader, epoch):

    model.train()

    losses = util.Meter(ptag='Loss')
    top1 = util.Meter(ptag='Prec@1')

    for batch_idx, (data, target) in enumerate(loader):
        # data loading
        data = data.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # forward pass
        output = model(data)
        loss = criterion(output, target)

        # backward pass
        loss.backward()

        # gradient step
        optimizer.step()
        optimizer.zero_grad()

        # write log files
        train_acc = util.comp_accuracy(output, target)

        losses.update(loss.item(), data.size(0))
        top1.update(train_acc[0].item(), data.size(0))

        if batch_idx % args.print_freq == 0 and args.save:
            # `rank` and `args` are module-level globals in the source script
            logging.debug(
                'epoch {} itr {}, '
                'rank {}, loss value {:.4f}, train accuracy {:.3f}'.format(
                    epoch, batch_idx, rank, losses.avg, top1.avg))

            with open(args.out_fname, 'a+') as f:
                print('{ep},{itr},'
                      '{loss.val:.4f},{loss.avg:.4f},'
                      '{top1.val:.3f},{top1.avg:.3f},-1'.format(ep=epoch,
                                                                itr=batch_idx,
                                                                loss=losses,
                                                                top1=top1),
                      file=f)

    with open(args.out_fname, 'a+') as f:
        print('{ep},{itr},'
              '{loss.val:.4f},{loss.avg:.4f},'
              '{top1.val:.3f},{top1.avg:.3f},-1'.format(ep=epoch,
                                                        itr=batch_idx,
                                                        loss=losses,
                                                        top1=top1),
              file=f)
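These examples rely on a small `util` module that the listing does not show. Below is a minimal sketch of what `util.Meter` plausibly looks like, inferred from the call sites above (`Meter(ptag=...)`, `update(value, batch_size)`, `.val`, `.avg`, `.reset()`); the repo's actual implementation may differ.

class Meter:
    """Track the most recent value and a running, size-weighted average.

    Sketch inferred from usage; not the repo's actual code.
    """

    def __init__(self, ptag=''):
        self.ptag = ptag  # label for the tracked quantity, e.g. 'Loss'
        self.reset()

    def reset(self):
        self.val = 0.0   # last value passed to update()
        self.sum = 0.0   # batch-size-weighted sum of values
        self.count = 0   # total samples seen
        self.avg = 0.0   # running average, sum / count

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # lets a meter be dropped straight into a format string
        return '{:.3f}'.format(self.avg)

The `util.AverageMeter` used in Examples #3 and #4 would presumably be the same class without the `ptag` label.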
Example #2
def evaluate(model, test_loader):
    model.eval()
    top1 = util.Meter(ptag='Acc')

    with torch.no_grad():
        for data, target in test_loader:
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            outputs = model(data)
            acc1 = util.comp_accuracy(outputs, target)
            top1.update(acc1[0].item(), data.size(0))

    return top1.avg
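`util.comp_accuracy` is also external. Its call pattern (`acc1 = util.comp_accuracy(outputs, target)` followed by `acc1[0].item()`) matches the standard top-k accuracy helper from the PyTorch ImageNet example, so a plausible sketch, assuming that convention:

import torch

def comp_accuracy(output, target, topk=(1,)):
    """Compute precision@k for each k; returns a list of 1-element tensors."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # top-k predicted class indices, transposed to shape (maxk, batch)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))  # percentage
        return res  # res[0] is top-1 accuracy, hence acc1[0].item()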
Example #3
def evaluate(model, test_loader):
    model.eval()
    top1 = util.AverageMeter()

    with torch.no_grad():
        for data, target in test_loader:
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            outputs = model(data)
            acc1 = util.comp_accuracy(outputs, target)
            top1.update(acc1[0].item(), data.size(0))

    return top1.avg
Example #4
def evaluate(model, test_loader, criterion):
    model.eval()
    top1 = util.AverageMeter()
    losses = util.AverageMeter()

    with torch.no_grad():
        for data, target in test_loader:
            # inputs stay on the CPU here; this three-argument evaluate()
            # matches the single-process simulation in Example #5
            outputs = model(data)
            loss = criterion(outputs, target)
            acc1 = util.comp_accuracy(outputs, target)
            top1.update(acc1[0].item(), data.size(0))
            losses.update(loss.item(), data.size(0))

    model.train()

    return top1.avg, losses.avg
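For context, a hypothetical driver loop showing how train() and this evaluate() typically interleave; the epoch count and loader names are assumptions, not from the source:

for epoch in range(args.epochs):
    train(model, criterion, optimizer, train_loader, epoch)
    # evaluate() switches to eval mode and, in this Example #4 variant,
    # restores train mode itself before returning
    test_acc, test_loss = evaluate(model, test_loader, criterion)
    print('epoch {}: test acc {:.3f}, test loss {:.4f}'.format(
        epoch, test_acc, test_loss))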
Example #5
def run(size):
    models = []
    anchor_models = []
    optimizers = []
    ratios = []
    iters = []
    cps = args.cp
    save_names = []
    loss_Meters = []
    top1_Meters = []
    best_test_accs = []

    if args.constant_cp:
        cps = [args.cp] * args.size  # same local-update count for every worker
    elif args.persistent:
        cps = [5, 5, 5, 5, 5, 5, 5, 20, 20, 20]  # hard-coded counts (assumes size == 10)
    else:
        local_cps = args.cp * np.ones(size, dtype=int)
        num_slow_nodes = int(size * args.slowRatio)
        np.random.seed(2020)
        # slow workers draw their local-update count from N(5, 2^2), rounded
        random_cps = np.rint(5 + np.random.randn(num_slow_nodes) * 2).astype(int)
        local_cps[:num_slow_nodes] = random_cps
        cps = local_cps

    for rank in range(args.size):
        # initiate experiments folder
        save_path = 'new_results/'
        folder_name = save_path + args.name
        if rank == 0 and not os.path.isdir(folder_name) and args.save:
            os.mkdir(folder_name)
        # initiate log files
        tag = '{}/lr{:.3f}_bs{:d}_cr{:d}_avgcp{:.3f}_e{}_r{}_n{}.csv'
        saveFileName = tag.format(folder_name, args.lr, args.bs, args.cr,
                                  np.mean(args.cp), args.seed, rank, size)
        args.out_fname = saveFileName
        save_names.append(saveFileName)
        with open(args.out_fname, 'w+') as f:
            print('BEGIN-TRAINING\n'
                  'World-Size,{ws}\n'
                  'Batch-Size,{bs}\n'
                  'itr,'
                  'Loss,avg:Loss,Prec@1,avg:Prec@1,val'.format(ws=args.size,
                                                               bs=args.bs),
                  file=f)

        globalCp = args.globalCp
        total_size = args.total_size

        # seed for reproducibility
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.deterministic = True

        # load datasets
        train_loader, test_loader, dataRatio, x, y = partition_dataset(
            rank, total_size, 1, args.alpha, args.beta, args)
        ratios.append(dataRatio)
        print(sum(len(i) for i in x))  # total samples across partitions
        data_iter = iter(train_loader)
        iters.append(data_iter)

        # define neural nets model, criterion, and optimizer
        model = util.select_model(args.model, args)
        anchor_model = util.select_model(args.model, args)

        models.append(model)
        anchor_models.append(anchor_model)

        criterion = nn.CrossEntropyLoss()
        if args.FedProx:
            optimizer = FedProx.FedProxSGD(model.parameters(),
                                           lr=args.lr,
                                           momentum=0,
                                           nesterov=False,
                                           weight_decay=1e-4)
        else:
            optimizer = optim.SGD(model.parameters(),
                                  lr=args.lr,
                                  momentum=0,
                                  nesterov=False,
                                  weight_decay=1e-4)
        optimizers.append(optimizer)

        batch_idx = 0
        best_test_accuracy = 0
        best_test_accs.append(best_test_accuracy)

        losses = util.Meter(ptag='Loss')
        top1 = util.Meter(ptag='Prec@1')
        loss_Meters.append(losses)
        top1_Meters.append(top1)

        model.train()
        tic = time.time()
        print(dataRatio, len(train_loader), len(test_loader))

    round_communicated = 0
    while round_communicated < args.cr:
        for rank in range(args.size):
            model = models[rank]
            anchor_model = anchor_models[rank]
            data_iter = iters[rank]
            optimizer = optimizers[rank]
            losses = loss_Meters[rank]
            top1 = top1_Meters[rank]

            for cp in range(cps[rank]):
                try:
                    data, target = next(data_iter)
                except StopIteration:
                    # restart from the most recently assigned train loader
                    data_iter = iter(train_loader)
                    data, target = next(data_iter)

                # data and target stay on the CPU in this single-process run

                # forward pass
                output = model(data)
                loss = criterion(output, target)

                # backward pass
                loss.backward()
                if args.FedProx:
                    optimizer.step(anchor_model, args.mu)
                else:
                    optimizer.step()
                optimizer.zero_grad()

                train_acc = util.comp_accuracy(output, target)
                losses.update(loss.item(), data.size(0))
                top1.update(train_acc[0].item(), data.size(0))

            # rotate this worker to its next data partition
            train_loader, dataRatio = get_next_trainloader(
                round_communicated, x, y, rank, args)
            data_iter = iter(train_loader)
            iters[rank] = data_iter
            ratios[rank] = dataRatio

        if args.NSGD:
            NormalSGDALLreduce(models, anchor_models, cps, globalCp, ratios)
        elif args.FedProx:
            FedProx_SyncAllreduce(models, ratios, anchor_models)
        else:
            unbalanced_SyncAllreduce(models, ratios)
        round_communicated += 1
        # update_lr(optimizer, round_communicated)

        if round_communicated % 4 == 0:
            for rank in range(args.size):
                name = save_names[rank]
                losses = loss_Meters[rank]
                top1 = top1_Meters[rank]

                with open(name, 'a+') as f:
                    print('{itr},'
                          '{loss.val:.4f},{loss.avg:.4f},'
                          '{top1.val:.3f},{top1.avg:.3f},-1'.format(
                              itr=round_communicated, loss=losses, top1=top1),
                          file=f)

        if round_communicated % 12 == 0:
            for rank in range(args.size):
                name = save_names[rank]
                model = models[rank]
                losses = loss_Meters[rank]
                top1 = top1_Meters[rank]

                test_acc, global_loss = evaluate(model, test_loader, criterion)

                if test_acc > best_test_accs[rank]:
                    best_test_accs[rank] = test_acc

                print('itr {}, '
                      'rank {}, loss value {:.4f}, '
                      'train accuracy {:.3f}, test accuracy {:.3f}, '
                      'elapsed time {:.3f}'.format(round_communicated, rank,
                                                   losses.avg, top1.avg,
                                                   test_acc,
                                                   time.time() - tic))

                with open(name, 'a+') as f:
                    print('{itr},{filler},{filler},'
                          '{filler},{loss:.4f},'
                          '{val:.4f}'.format(itr=-1,
                                             filler=-1,
                                             loss=global_loss,
                                             val=test_acc),
                          file=f)

                losses.reset()
                top1.reset()
                tic = time.time()
                # return

    for rank in range(args.size):
        name = save_names[rank]
        with open(name, 'a+') as f:
            print('{itr} best test accuracy: {val:.4f}'.format(
                itr=-2, val=best_test_accs[rank]),
                  file=f)
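The aggregation routines (`unbalanced_SyncAllreduce`, `FedProx_SyncAllreduce`, `NormalSGDALLreduce`) are not shown. Judging from its call site, `unbalanced_SyncAllreduce(models, ratios)` performs a FedAvg-style step: every worker's parameters are replaced by an average weighted by that worker's share of the data. A minimal single-process sketch under that assumption (the real code may instead use torch.distributed collectives):

import torch

def unbalanced_SyncAllreduce(models, ratios):
    """Weighted-average corresponding parameters of all models, in place.

    Sketch only: assumes `ratios` sums to 1 and that all models share one
    architecture; the original repo's implementation may differ.
    """
    with torch.no_grad():
        # walk corresponding parameters across all local models
        for params in zip(*[m.parameters() for m in models]):
            avg = sum(r * p.data for r, p in zip(ratios, params))
            for p in params:
                p.data.copy_(avg)

For the FedProx branch, `optimizer.step(anchor_model, args.mu)` suggests the usual FedProx proximal update, w <- w - lr * (grad + mu * (w - w_anchor)), with `anchor_model` holding the parameters from the last communication round.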
Example #6
def train(model, criterion, optimizer, batch_meter, comm_meter, loader, epoch,
          req):

    model.train()

    losses = util.Meter(ptag='Loss')
    top1 = util.Meter(ptag='Prec@1')
    weights = [1 / args.size] * args.size  # uniform weights (unused in this snippet)

    iter_time = time.time()
    for batch_idx, (data, target) in enumerate(loader):
        # data loading
        data = data.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # forward pass
        output = model(data)
        loss = criterion(output, target)

        # backward pass
        loss.backward()
        update_learning_rate(optimizer,
                             epoch,
                             itr=batch_idx,
                             itr_per_epoch=len(loader))
        optimizer.step()
        optimizer.zero_grad()

        torch.cuda.synchronize()
        comm_start = time.time()

        ## CoCoD-SGD
        # optimizer.async_CoCoD_SGD_step(batch_idx, args.cp, req)

        ## Local SGD
        # if batch_idx != 0 and batch_idx % args.cp == 0:
        #     SyncAllreduce(model, rank, size)

        optimizer.OverlapLocalSGD_step(batch_idx, args.cp, req)

        ## EASGD
        # optimizer.elastic_average(batch_idx, args.cp)

        if not (epoch == 0 and batch_idx == 0):
            torch.cuda.synchronize()
            comm_meter.update(time.time() - comm_start)
            batch_meter.update(time.time() - iter_time)

        # write log files
        train_acc = util.comp_accuracy(output, target)
        losses.update(loss.item(), data.size(0))
        top1.update(train_acc[0].item(), data.size(0))

        if batch_idx % args.print_freq == 0 and args.save:
            print('epoch {} itr {}, '
                  'rank {}, loss value {:.4f}, train accuracy {:.3f}'.format(
                      epoch, batch_idx, rank, losses.avg, top1.avg))

            with open(args.out_fname, 'a+') as f:
                print('{ep},{itr},{bt},{ct},'
                      '{loss.val:.4f},{loss.avg:.4f},'
                      '{top1.val:.3f},{top1.avg:.3f},-1'.format(ep=epoch,
                                                                itr=batch_idx,
                                                                bt=batch_meter,
                                                                ct=comm_meter,
                                                                loss=losses,
                                                                top1=top1),
                      file=f)

        torch.cuda.synchronize()
        iter_time = time.time()

    with open(args.out_fname, 'a+') as f:
        print('{ep},{itr},{bt},{ct},'
              '{loss.val:.4f},{loss.avg:.4f},'
              '{top1.val:.3f},{top1.avg:.3f},-1'.format(ep=epoch,
                                                        itr=batch_idx,
                                                        bt=batch_meter,
                                                        ct=comm_meter,
                                                        loss=losses,
                                                        top1=top1),
              file=f)
    return req
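`update_learning_rate(optimizer, epoch, itr, itr_per_epoch)` is another repo helper. The signature suggests a per-iteration schedule with warmup, as is common in distributed-SGD codebases; a hedged sketch in which the warmup length and decay milestones are assumptions, not values from the source:

def update_learning_rate(optimizer, epoch, itr=None, itr_per_epoch=None,
                         warmup_epochs=5):
    """Sketch: linear warmup to args.lr, then step decay (assumed milestones)."""
    if epoch < warmup_epochs and itr is not None:
        # ramp the lr linearly from 0 up to args.lr over the warmup epochs
        count = epoch * itr_per_epoch + itr + 1
        lr = args.lr * count / (warmup_epochs * itr_per_epoch)
    else:
        # assumed step decay at epochs 81 and 122 (CIFAR-style schedule)
        lr = args.lr
        for milestone in (81, 122):
            if epoch >= milestone:
                lr *= 0.1
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr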