def train(model, criterion, optimizer, loader, epoch):
    model.train()

    losses = util.Meter(ptag='Loss')
    top1 = util.Meter(ptag='Prec@1')

    for batch_idx, (data, target) in enumerate(loader):
        # data loading
        data = data.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # forward pass
        output = model(data)
        loss = criterion(output, target)

        # backward pass
        loss.backward()

        # gradient step
        optimizer.step()
        optimizer.zero_grad()

        # write log files
        train_acc = util.comp_accuracy(output, target)
        losses.update(loss.item(), data.size(0))
        top1.update(train_acc[0].item(), data.size(0))

        if batch_idx % args.print_freq == 0 and args.save:
            logging.debug(
                'epoch {} itr {}, '
                'rank {}, loss value {:.4f}, train accuracy {:.3f}'.format(
                    epoch, batch_idx, rank, losses.avg, top1.avg))

            with open(args.out_fname, '+a') as f:
                print('{ep},{itr},'
                      '{loss.val:.4f},{loss.avg:.4f},'
                      '{top1.val:.3f},{top1.avg:.3f},-1'.format(
                          ep=epoch, itr=batch_idx,
                          loss=losses, top1=top1),
                      file=f)

    # log the state at the end of the epoch as well
    with open(args.out_fname, '+a') as f:
        print('{ep},{itr},'
              '{loss.val:.4f},{loss.avg:.4f},'
              '{top1.val:.3f},{top1.avg:.3f},-1'.format(
                  ep=epoch, itr=batch_idx,
                  loss=losses, top1=top1),
              file=f)
def evaluate(model, test_loader):
    model.eval()
    top1 = util.Meter(ptag='Acc')

    with torch.no_grad():
        for data, target in test_loader:
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            outputs = model(data)
            acc1 = util.comp_accuracy(outputs, target)
            top1.update(acc1[0].item(), data.size(0))

    return top1.avg
def evaluate(model, test_loader):
    model.eval()
    top1 = util.AverageMeter()

    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            outputs = model(data)
            acc1 = util.comp_accuracy(outputs, target)
            top1.update(acc1[0].item(), data.size(0))

    return top1.avg
def evaluate(model, test_loader, criterion):
    model.eval()
    top1 = util.AverageMeter()
    losses = util.AverageMeter()

    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            outputs = model(data)
            loss = criterion(outputs, target)

            acc1 = util.comp_accuracy(outputs, target)
            top1.update(acc1[0].item(), data.size(0))
            losses.update(loss.item(), data.size(0))

    model.train()
    return top1.avg, losses.avg
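# The functions above only assume that util.Meter / util.AverageMeter expose
# .val, .avg, .update(value, n) and .reset(), and that util.comp_accuracy
# returns a sequence whose first element is the batch top-1 accuracy.
# The class below is a minimal, hypothetical sketch of that meter interface;
# the real implementation lives in util.py and is not shown in this file.
class _AverageMeterSketch:
    """Tracks the latest value and a running batch-size-weighted average."""

    def __init__(self, ptag=''):
        self.ptag = ptag
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # `val` is a per-batch statistic, `n` is the batch size it came from
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count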
def run(size):
    models = []
    anchor_models = []
    optimizers = []
    ratios = []
    iters = []
    cps = args.cp
    save_names = []
    loss_Meters = []
    top1_Meters = []
    best_test_accs = []

    # decide the number of local update steps (cp) for every worker
    if args.constant_cp:
        cps = args.cp * args.size
    elif args.persistent:
        cps = [5, 5, 5, 5, 5, 5, 5, 20, 20, 20]
    else:
        local_cps = args.cp * np.ones(size, dtype=int)
        num_slow_nodes = int(size * args.slowRatio)
        np.random.seed(2020)
        random_cps = 5 + np.random.randn(num_slow_nodes) * 2
        for i in range(len(random_cps)):
            random_cps[i] = round(random_cps[i])
        local_cps[:num_slow_nodes] = random_cps
        # local_iterations = local_cps[rank]
        cps = local_cps

    # per-worker setup: log files, data partitions, models, and optimizers
    for rank in range(args.size):
        # initiate experiments folder
        save_path = 'new_results/'
        folder_name = save_path + args.name
        if rank == 0 and not os.path.isdir(folder_name) and args.save:
            os.mkdir(folder_name)

        # initiate log files
        tag = '{}/lr{:.3f}_bs{:d}_cr{:d}_avgcp{:.3f}_e{}_r{}_n{}.csv'
        saveFileName = tag.format(folder_name, args.lr, args.bs, args.cr,
                                  np.mean(args.cp), args.seed, rank, size)
        args.out_fname = saveFileName
        save_names.append(saveFileName)

        with open(args.out_fname, 'w+') as f:
            print('BEGIN-TRAINING\n'
                  'World-Size,{ws}\n'
                  'Batch-Size,{bs}\n'
                  'itr,'
                  'Loss,avg:Loss,Prec@1,avg:Prec@1,val'.format(
                      ws=args.size, bs=args.bs),
                  file=f)

        globalCp = args.globalCp
        total_size = args.total_size

        # seed for reproducibility
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.deterministic = True

        # load datasets
        train_loader, test_loader, dataRatio, x, y = partition_dataset(
            rank, total_size, 1, args.alpha, args.beta, args)
        ratios.append(dataRatio)
        print(sum([len(i) for i in x]))
        data_iter = iter(train_loader)
        iters.append(data_iter)

        # define neural nets model, criterion, and optimizer
        model = util.select_model(args.model, args)
        anchor_model = util.select_model(args.model, args)
        models.append(model)
        anchor_models.append(anchor_model)

        criterion = nn.CrossEntropyLoss()
        if args.FedProx:
            optimizer = FedProx.FedProxSGD(model.parameters(),
                                           lr=args.lr,
                                           momentum=0,
                                           nesterov=False,
                                           weight_decay=1e-4)
        else:
            optimizer = optim.SGD(model.parameters(),
                                  lr=args.lr,
                                  momentum=0,
                                  nesterov=False,
                                  weight_decay=1e-4)
        optimizers.append(optimizer)

        batch_idx = 0
        best_test_accuracy = 0
        best_test_accs.append(best_test_accuracy)

        losses = util.Meter(ptag='Loss')
        top1 = util.Meter(ptag='Prec@1')
        loss_Meters.append(losses)
        top1_Meters.append(top1)

        model.train()
        tic = time.time()
        print(dataRatio, len(train_loader), len(test_loader))

    round_communicated = 0
    while round_communicated < args.cr:
        # each worker runs its own number of local update steps
        for rank in range(args.size):
            model = models[rank]
            anchor_model = anchor_models[rank]
            data_iter = iters[rank]
            optimizer = optimizers[rank]
            losses = loss_Meters[rank]
            top1 = top1_Meters[rank]

            for cp in range(cps[rank]):
                try:
                    data, target = next(data_iter)
                except StopIteration:
                    data_iter = iter(train_loader)
                    data, target = next(data_iter)

                # data loading (kept on the CPU in this loop)
                # forward pass
                output = model(data)
                loss = criterion(output, target)

                # backward pass
                loss.backward()
                if args.FedProx:
                    optimizer.step(anchor_model, args.mu)
                else:
                    optimizer.step()
                optimizer.zero_grad()

                train_acc = util.comp_accuracy(output, target)
                losses.update(loss.item(), data.size(0))
                top1.update(train_acc[0].item(), data.size(0))
                # batch_idx += 1

            # change the worker's data partition for the next round
            train_loader, dataRatio = get_next_trainloader(
                round_communicated, x, y, rank, args)
            data_iter = iter(train_loader)
            iters[rank] = data_iter
            ratios[rank] = dataRatio

        # communication round: aggregate the local models
        # (a hedged sketch of the averaging assumed here appears after this function)
        if args.NSGD:
            NormalSGDALLreduce(models, anchor_models, cps, globalCp, ratios)
        elif args.FedProx:
            FedProx_SyncAllreduce(models, ratios, anchor_models)
        else:
            unbalanced_SyncAllreduce(models, ratios)
        round_communicated += 1
        # update_lr(optimizer, round_communicated)

        # log training statistics every 4 communication rounds
        if round_communicated % 4 == 0:
            for rank in range(args.size):
                name = save_names[rank]
                losses = loss_Meters[rank]
                top1 = top1_Meters[rank]
                with open(name, '+a') as f:
                    print('{itr},'
                          '{loss.val:.4f},{loss.avg:.4f},'
                          '{top1.val:.3f},{top1.avg:.3f},-1'.format(
                              itr=round_communicated,
                              loss=losses, top1=top1),
                          file=f)

        # evaluate on the test set every 12 communication rounds
        if round_communicated % 12 == 0:
            for rank in range(args.size):
                name = save_names[rank]
                model = models[rank]
                losses = loss_Meters[rank]
                top1 = top1_Meters[rank]

                test_acc, global_loss = evaluate(model, test_loader, criterion)
                if test_acc > best_test_accs[rank]:
                    best_test_accs[rank] = test_acc

                print('itr {}, '
                      'rank {}, loss value {:.4f}, '
                      'train accuracy {:.3f}, test accuracy {:.3f}, '
                      'elapsed time {:.3f}'.format(
                          round_communicated, rank, losses.avg, top1.avg,
                          test_acc, time.time() - tic))

                with open(name, '+a') as f:
                    print('{itr},{filler},{filler},'
                          '{filler},{loss:.4f},'
                          '{val:.4f}'.format(itr=-1, filler=-1,
                                             loss=global_loss,
                                             val=test_acc),
                          file=f)

                losses.reset()
                top1.reset()
            tic = time.time()
    # return

    # record the best test accuracy for every worker
    for rank in range(args.size):
        name = save_names[rank]
        with open(name, '+a') as f:
            print('{itr} best test accuracy: {val:.4f}'.format(
                itr=-2, val=best_test_accs[rank]), file=f)
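# A hypothetical sketch of what an aggregation step such as
# unbalanced_SyncAllreduce(models, ratios) is assumed to do in run() above:
# replace every local model with the data-ratio-weighted average of all local
# models (FedAvg-style), assuming the ratios are non-negative and sum to 1.
# The real implementations (unbalanced_SyncAllreduce, FedProx_SyncAllreduce,
# NormalSGDALLreduce) are imported from elsewhere in the repository and may
# differ in detail; buffers such as BatchNorm running stats are ignored here.
def _weighted_average_sketch(models, ratios):
    import torch  # local import so the sketch is self-contained

    with torch.no_grad():
        # accumulate the weighted sum of all workers' parameters
        avg_state = {name: torch.zeros_like(param)
                     for name, param in models[0].named_parameters()}
        for model, ratio in zip(models, ratios):
            for name, param in model.named_parameters():
                avg_state[name] += ratio * param
        # broadcast the averaged parameters back to every worker
        for model in models:
            for name, param in model.named_parameters():
                param.copy_(avg_state[name])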
def train(model, criterion, optimizer, batch_meter, comm_meter,
          loader, epoch, req):
    model.train()

    losses = util.Meter(ptag='Loss')
    top1 = util.Meter(ptag='Prec@1')
    weights = [1 / args.size for i in range(args.size)]

    iter_time = time.time()
    for batch_idx, (data, target) in enumerate(loader):
        # data loading
        data = data.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # forward pass
        output = model(data)
        loss = criterion(output, target)

        # backward pass
        loss.backward()
        update_learning_rate(optimizer, epoch, itr=batch_idx,
                             itr_per_epoch=len(loader))
        optimizer.step()
        optimizer.zero_grad()

        torch.cuda.synchronize()
        comm_start = time.time()

        ## CoCoD-SGD
        # optimizer.async_CoCoD_SGD_step(batch_idx, args.cp, req)

        ## Local SGD (a sketch of this periodic averaging appears after this function)
        # if batch_idx != 0 and batch_idx % args.cp == 0:
        #     SyncAllreduce(model, rank, size)

        optimizer.OverlapLocalSGD_step(batch_idx, args.cp, req)

        ## EASGD
        # optimizer.elastic_average(batch_idx, args.cp)

        if not (epoch == 0 and batch_idx == 0):
            torch.cuda.synchronize()
            comm_meter.update(time.time() - comm_start)
            batch_meter.update(time.time() - iter_time)

        # write log files
        train_acc = util.comp_accuracy(output, target)
        losses.update(loss.item(), data.size(0))
        top1.update(train_acc[0].item(), data.size(0))

        if batch_idx % args.print_freq == 0 and args.save:
            print('epoch {} itr {}, '
                  'rank {}, loss value {:.4f}, train accuracy {:.3f}'.format(
                      epoch, batch_idx, rank, losses.avg, top1.avg))

            with open(args.out_fname, '+a') as f:
                print('{ep},{itr},{bt},{ct},'
                      '{loss.val:.4f},{loss.avg:.4f},'
                      '{top1.val:.3f},{top1.avg:.3f},-1'.format(
                          ep=epoch, itr=batch_idx,
                          bt=batch_meter, ct=comm_meter,
                          loss=losses, top1=top1),
                      file=f)

        torch.cuda.synchronize()
        iter_time = time.time()

    # log the state at the end of the epoch as well
    with open(args.out_fname, '+a') as f:
        print('{ep},{itr},{bt},{ct},'
              '{loss.val:.4f},{loss.avg:.4f},'
              '{top1.val:.3f},{top1.avg:.3f},-1'.format(
                  ep=epoch, itr=batch_idx,
                  bt=batch_meter, ct=comm_meter,
                  loss=losses, top1=top1),
              file=f)

    return req
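# A minimal sketch of the periodic model averaging that the commented-out
# Local SGD branch in train() above relies on (SyncAllreduce(model, rank, size)).
# This is an assumption about its behavior, not the repository's actual
# implementation; it requires torch.distributed to be initialized, and `rank`
# is unused in this simplified version.
def _sync_allreduce_sketch(model, rank, size):
    import torch  # local imports so the sketch is self-contained
    import torch.distributed as dist

    with torch.no_grad():
        for param in model.parameters():
            # sum the parameter across all workers, then divide by world size
            dist.all_reduce(param.data, op=dist.ReduceOp.SUM)
            param.data.div_(size)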