def main(args):
    """Train an image-classification model with an (asynchronous) parameter server.

    Loads data and model from ``args``, optionally resumes from a checkpoint,
    runs ``args.epochs`` epochs of train/validate, logs per-epoch metrics and
    periodically saves checkpoints.

    Args:
        args: parsed argument namespace (model, dataset, batch_size, resume,
              bar, epochs, workers_num, grad_clip, save, name, id, client, ...).

    Returns:
        (train_statistics, val_statistics) Statistics objects.
    """
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available():
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')
    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)
    # Gradient accumulation: very large ImageNet/ResNet batches are processed
    # as multiple 256-sample micro-batches per optimizer step.
    if args.batch_size > 256 and args.dataset == 'imagenet' and args.model == 'resnet':
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1
    # Derive iteration counts from dataset size; val_len is scaled by 1024
    # (presumably the validation batch size — TODO confirm against load_data).
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // args.batch_size
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        args.iterations_per_epoch = len(train_loader.dataset.train_labels) // args.batch_size
        val_len = len(val_loader.dataset.test_labels) // 1024
    log_str = 'Number of model parameters: {}'.format(
        sum(p.data.nelement() for p in model.parameters()))
    logging.info(log_str, extra=args.client)
    print(log_str)
    # Wrap for training on multiple GPUs.
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)
    # Optionally resume from a checkpoint (restores server and statistics too).
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Define loss function (criterion).
    criterion = nn.CrossEntropyLoss().cuda()
    if args.bar:
        train_bar = IncrementalBar('Training ', max=args.iterations_per_epoch,
                                   suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=val_len, suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None
    log_str = '{}: Training neural network for {} epochs with {} workers'.format(
        args.id, args.epochs, args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # Train for one epoch.
        train_loss, train_error = train(train_loader, model, criterion, server, epoch,
                                        args.workers_num, args.grad_clip,
                                        batch_accumulate_num, train_bar,
                                        train_statistics, args.client)
        train_time = time.time() - train_time
        if args.bar:
            train_bar.finish()
            train_bar.index = 0
        # Evaluate on the validation set.
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion, server,
                                           val_statistics, val_bar)
        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())
        val_time = time.time() - val_time
        if args.bar:
            val_bar.finish()
            val_bar.index = 0
        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
                  'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(
                      epoch + 1, train_time, train_loss, train_error,
                      val_time, val_loss, val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)
        # Periodic checkpoint (skip epoch 0).
        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'val_stats': val_statistics,
                    'train_stats': train_statistics,
                    'server': server
                },
                sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()
    return train_statistics, val_statistics
def main(args):
    """Train an image-classification model with an (asynchronous) parameter server.

    Variant with optional label smoothing and a model-supplied criterion.
    Loads data and model from ``args``, optionally resumes from a checkpoint,
    runs ``args.epochs`` epochs of train/validate, logs per-epoch metrics and
    periodically saves checkpoints.

    Args:
        args: parsed argument namespace (model, dataset, batch_size, resume,
              label_smoothing, bar, epochs, workers_num, grad_clip, save, ...).

    Returns:
        (train_statistics, val_statistics) Statistics objects.
    """
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available():
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')
    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)
    # Gradient accumulation: batches above 256 samples are processed as
    # multiple 256-sample micro-batches per optimizer step.
    if args.batch_size > 256:  # and args.dataset == 'imagenet' and args.model == 'resnet':
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1
    # Derive iteration counts from dataset size; val_len is scaled by 1024
    # (presumably the validation batch size — TODO confirm against load_data).
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // args.batch_size
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // args.batch_size
        val_len = len(val_loader.dataset.test_labels) // 1024
    log_str = 'Number of model parameters: {}'.format(
        sum(p.data.nelement() for p in model.parameters()))
    logging.info(log_str, extra=args.client)
    print(log_str)
    # Wrap for training on multiple GPUs.
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)
    # Optionally resume from a checkpoint (restores server and statistics too).
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Define loss function (criterion): prefer a model-provided criterion,
    # fall back to CrossEntropyLoss, optionally with label smoothing.
    # NOTE(review): `model` is the DataParallel wrapper here, so a `criterion`
    # attribute defined on the underlying module is likely never found and the
    # fallback is always used — consider querying `model.module`; confirm intent.
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params).cuda()
    if args.bar:
        train_bar = IncrementalBar('Training ', max=args.iterations_per_epoch,
                                   suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=val_len, suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None
    log_str = '{}: Training neural network for {} epochs with {} workers'.format(
        args.id, args.epochs, args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # Train for one epoch.
        train_loss, train_error = train(train_loader, model, criterion, server, epoch,
                                        args.workers_num, args.grad_clip,
                                        batch_accumulate_num, train_bar,
                                        train_statistics, args.client)
        train_time = time.time() - train_time
        if args.bar:
            train_bar.finish()
            train_bar.index = 0
        # Evaluate on the validation set.
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion, server,
                                           val_statistics, val_bar)
        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())
        val_time = time.time() - val_time
        if args.bar:
            val_bar.finish()
            val_bar.index = 0
        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
                  'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(
                      epoch + 1, train_time, train_loss, train_error,
                      val_time, val_loss, val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)
        # Periodic checkpoint (skip epoch 0).
        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'val_stats': val_statistics,
                    'train_stats': train_statistics,
                    'server': server
                },
                sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()
    return train_statistics, val_statistics
def main(args):
    """Train AlexNet (ImageNet) or a WideResNet (CIFAR) via a parameter server.

    Older, synchronous variant of the training driver: builds the model from
    ``args.dataset``, optionally resumes, then trains and evaluates for
    ``args.epochs`` epochs, printing per-epoch metrics.

    Args:
        args: parsed argument namespace (dataset, layers, widen_factor,
              droprate, resume, gbn, batch_size, bar, epochs, workers_num,
              grad_clip, sim_num, start_epoch, iterations_per_epoch, ...).

    Returns:
        (train_statistics, val_statistics) Statistics objects.
    """
    if torch.cuda.is_available():
        print('Utilizing GPU')
        # torch.cuda.set_device(args.gpu_num)
    train_loader, val_loader = load_data(args)
    # Create the model.
    # NOTE(review): this variant tests 'image_net' while the other drivers use
    # 'imagenet' — confirm which spelling the argument parser produces.
    if args.dataset == 'image_net':
        model = alexnet()
        top_k = (1, 5)
        val_len = len(val_loader.dataset.imgs)
    else:
        model = WideResNet(args.layers, args.dataset == 'cifar10' and 10 or 100,
                           args.widen_factor, dropRate=args.droprate)
        top_k = (1,)
        val_len = len(val_loader.dataset.test_labels)
    print('Number of model parameters: {}'.format(
        sum(p.data.nelement() for p in model.parameters())))
    # Wrap for training on multiple GPUs (already moved to GPU by .cuda()).
    model = torch.nn.DataParallel(model).cuda()
    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)
    # Optionally resume from a checkpoint.
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # NOTE(review): best_prec1 is loaded but never used below; kept so
            # a missing 'best_prec1' key still fails loudly as before.
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Define loss function (criterion).
    criterion = nn.CrossEntropyLoss().cuda()
    # Ghost batch normalization (128 samples as the baseline micro-batch).
    repeat = args.batch_size // 128 if args.gbn == 1 else 1
    total_iterations = args.iterations_per_epoch + val_len // args.batch_size
    if args.bar:
        train_bar = IncrementalBar('Training ', max=args.iterations_per_epoch,
                                   suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=total_iterations,
                                 suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None
    print('{}: Training neural network for {} epochs with {} workers'.format(
        args.sim_num, args.epochs, args.workers_num))
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # Train for one epoch (metrics are gathered by the validate passes below).
        train(train_loader, model, criterion, server, epoch, args.workers_num,
              args.grad_clip, repeat, train_bar)
        train_time = time.time() - train_time
        if args.bar:
            train_bar.finish()
            train_bar.index = 0
        # Evaluate on validation set, then re-evaluate on the training set to
        # collect train loss/error and weight norms.
        val_time = time.time()
        val_loss, val_error = validate(val_loader, model, criterion, server,
                                       val_statistics, top_k, val_bar)
        train_loss, train_error = validate(train_loader, model, criterion, server,
                                           train_statistics, top_k, val_bar,
                                           save_norm=True)
        val_time = time.time() - val_time
        if args.bar:
            val_bar.finish()
            val_bar.index = 0
        print('Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] |'
              ' Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'
              .format(epoch, train_time, train_loss, train_error,
                      val_time, val_loss, val_error))
        train_time = time.time()
    return train_statistics, val_statistics