def init(args):
    log.set_rank(args.rank)
    if args.output_dir is not None:
        log.set_directory(args.output_dir)
    log.set_level(args.verbosity)
    args = validate_args(args)
    if args.apex:
        # Fail fast if apex is requested but not installed
        from apex import amp
    log.info('Configurations:\n' + pformat(args.__dict__))
    log.info('world_size = %d, batch_size = %d, device = %s, backend = %s',
             args.world_size, args.batch_size, args.device, args.backend)
    if not args.cpu:
        torch.cuda.set_device(args.local_rank)
        torch.backends.cudnn.benchmark = True
    if args.deterministic:
        torch.manual_seed(args.rank)
        np.random.seed(args.rank)
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed(args.rank)
    dist.init_process_group(backend=args.backend)
def save_data(train_res, val_res, fname, output_dir='data'):
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    if output_dir.endswith('/'):
        output_dir = output_dir[:-1]
    header = 'iterations train_time run_time loss'
    if len(train_res[0]) == 5:
        header += ' accuracy'

    def _save(res, name):
        res = np.array(res)
        np.savetxt(output_dir + '/full_' + fname + name,
                   res,
                   header=header,
                   comments='')
        # Downsample if needed: keep the first 50 points, every 5th point up
        # to 500, then roughly 100 evenly spaced points from the rest
        if len(res) > 500:
            idx = np.r_[:50, 50:500:5, 500:len(res):int(len(res) / 100)]
            res = res[idx]
        np.savetxt(output_dir + '/' + fname + name,
                   res,
                   header=header,
                   comments='')

    _save(train_res, '_train.txt')
    _save(val_res, '_val.txt')
    log.info('Data saved to %s/[full_]%s_[train/val].txt', output_dir, fname)
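# Worked example of the downsampling above (illustrative): for a run with
# len(res) == 1000, the tail step is int(1000 / 100) == 10, so idx keeps rows
# 0..49 (50 points), 50..495 in steps of 5 (90 points) and 500..990 in steps
# of 10 (50 points) -- 190 rows in the downsampled file, while the 'full_'
# file keeps all 1000 rows.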
def __init__(self, *args, sync_freq=1, fp16_grads=False, **kwargs):
    r"""Init function.

    Args:
        module: The module to be wrapped.
        sync_freq: Number of steps between communications.
        fp16_grads: Whether to use fp16 gradients.
        kwargs: Other args torch.nn.parallel.DistributedDataParallel requires.
    """
    log.info('Using %s', self.__class__.__name__)
    # Check the PyTorch version (a plain string comparison breaks for 1.10+)
    version = tuple(int(v) for v in torch.__version__.split('+')[0].split('.')[:2])
    if version < (1, 7):
        log.FATAL(
            'Please install PyTorch v1.7.0-rc1 or later to use DistributedGradientParallel!'
        )
    if dist.get_backend() != 'nccl':
        log.warn('DistributedGradientParallel performs better with NCCL')
    super().__init__(*args, **kwargs)
    self.sync_freq = sync_freq
    self.fp16_grads = fp16_grads
    self._iter_counter = 0
    if self.fp16_grads:
        log.info('Using fp16 gradients')
        if dist.get_backend() != 'nccl':
            self._register_comm_hook(state=None, hook=fp16_compress_hook_gloo)
        else:
            self._register_comm_hook(state=None, hook=fp16_compress_hook_nccl)

    def _forward_pre_hook(*args, **kwargs):
        if self.training:
            # Update iteration counter
            self._iter_counter += 1
            self._iter_counter %= self.sync_freq
            log.debug('_forward_pre_hook called on %s, _iter_counter %d',
                      self.device, self._iter_counter)
            # Only synchronize gradients on every sync_freq-th step
            self.require_backward_grad_sync = self._iter_counter == 0

    self.register_forward_pre_hook(_forward_pre_hook)
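# Usage sketch (illustrative, not from the original source): assuming the
# process group has already been initialized (e.g. via init(args) above) and
# that `Net` and `local_rank` are defined, wrapping a model so gradients are
# only all-reduced on every 4th step, in fp16, might look like this.
def _example_wrap_with_dgp(local_rank):
    torch.cuda.set_device(local_rank)
    model = Net().cuda()
    # sync_freq=4: require_backward_grad_sync is only enabled on every 4th
    # forward pass, so gradients accumulate locally in between.
    # fp16_grads=True: gradients are compressed to fp16 before the all-reduce.
    return DistributedGradientParallel(model,
                                       sync_freq=4,
                                       fp16_grads=True,
                                       device_ids=[local_rank])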
def wrap_model(model, args, optimizer=None):
    if args.apex:
        log.info('Apex wrapping')
        from apex import amp
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level=args.opt_level,
            keep_batchnorm_fp32=args.keep_batchnorm_fp32,
            loss_scale=args.loss_scale)
    if args.ddp == 'DistributedDataParallel':
        model = ceddl.parallel.DistributedDataParallel(model, **args.__dict__)
    elif args.ddp == 'SparseDistributedDataParallel':
        model = ceddl.parallel.SparseDistributedDataParallel(
            model, **args.__dict__)
    elif args.ddp == 'NetworkDataParallel':
        model = ceddl.parallel.NetworkDataParallel(model, **args.__dict__)
    else:
        device_ids = None if args.cpu else [args.rank]
        if args.ddp == 'pytorch':
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=device_ids)
        elif args.ddp == 'DistributedGradientParallel':
            model = ceddl.parallel.DistributedGradientParallel(
                model,
                fp16_grads=args.fp16_grads,
                sync_freq=args.sync_freq,
                device_ids=device_ids)
    return model, optimizer
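# Example (illustrative): a minimal argparse.Namespace that would send
# wrap_model down the DistributedGradientParallel branch on GPU. Only the
# attributes wrap_model reads in that branch are shown; real runs use the
# full validated args produced by init(args).
import argparse

example_args = argparse.Namespace(apex=False,
                                  ddp='DistributedGradientParallel',
                                  cpu=False,
                                  rank=0,
                                  fp16_grads=True,
                                  sync_freq=4)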
                                               args.batch_size)
model = Net().to(args.device)
args.use_ref_module = True
args.tracking_loader = train_loader
args.criterion = nn.CrossEntropyLoss()
model, optimizer = utils.wrap_model(model, args)
optimizer = ceddl.optim.NetworkSVRG(model, lr=args.lr)
# optimizer = torch.optim.SGD(model.parameters(),
#                             lr=args.lr,
#                             weight_decay=args.weight_decay,
#                             momentum=args.momentum)
log.info('Model is on %s', next(model.parameters()).device)

classes = [int(i) for i in range(10)]
criterion = nn.CrossEntropyLoss()
train_res, val_res = utils.train(model,
                                 criterion,
                                 optimizer,
                                 train_loader,
                                 args,
                                 exp_name='mnist',
                                 val_loader=val_loader,
                                 classes=classes)
log.info('Process %d exited', args.rank)
utils.init(args)
# log.set_allowed_ranks(list(range(args.world_size)))

# Local data
local_n_samples = int(args.n_samples / args.world_size)
X = torch.zeros(local_n_samples, args.dim)
Y = torch.zeros(local_n_samples, 1)
if args.rank == 0:
    # Set the random seed so the data is the same in every run
    np.random.seed(0)
    # Generate random data at node 0,
    X_total, Y_total, w_0, loss_0 = generate_data(args.n_samples, args.dim,
                                                  args.condition_number,
                                                  args.noise_variance)
    log.info('Data generated, the best loss is %.7f' % loss_0)
    # then send to all other processes
    dist.scatter(X, list(X_total.split(local_n_samples)))
    dist.scatter(Y, list(Y_total.split(local_n_samples)))
else:
    dist.scatter(X)
    dist.scatter(Y)

dataset = torch.utils.data.TensorDataset(X, Y)
train_loader = torch.utils.data.DataLoader(dataset,
                                           batch_size=args.batch_size,
                                           shuffle=False)
val_loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=local_n_samples,
                                         shuffle=False)
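# Note on the scatter above (illustrative): torch.distributed.scatter delivers
# chunk k of the scatter list to rank k, so with e.g. n_samples = 8 and
# world_size = 4 each worker ends up holding the two rows
# X_total[2 * rank : 2 * (rank + 1)] in its local X.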
def validate(model, val_loader, criterion, classes=None, device=None):
    log.info('Validating model')
    losses = []
    if classes is not None:
        confusion_matrix = np.zeros((len(classes), len(classes)))
    for data, target in val_loader:
        target = target.to(device=device, non_blocking=True)
        data = data.to(device=device, non_blocking=True)
        output = model(data)
        loss = criterion(output, target)
        losses.append(loss.cpu().item())
        if classes is not None:
            _, predicted = torch.max(output, 1)
            for i in range(len(target)):
                l = target[i]
                p = predicted[i]
                confusion_matrix[l][p] += 1
    # Average the local mean loss across all workers
    loss = np.mean(losses) / dist.get_world_size()
    loss = torch.Tensor([loss]).to(device)
    dist.all_reduce(loss)
    loss = loss.cpu().item()
    if classes is not None:
        confusion_matrix = torch.from_numpy(confusion_matrix).to(device)
        dist.all_reduce(confusion_matrix)
        confusion_matrix = confusion_matrix.cpu().numpy()
    log.debug('Synchronized from other workers')
    if classes is not None:
        acc = np.diag(confusion_matrix).sum() / confusion_matrix.sum()
        # Normalize each row so the diagonal holds per-class accuracy
        confusion_matrix /= confusion_matrix.sum(axis=1, keepdims=True)
        # log.debug(confusion_matrix)
        max_len = str(max([len(str(c)) for c in classes]))
        if len(classes) > 10:
            log.info('Accuracy of first 5 classes')
            for i in range(5):
                log.info('%-' + max_len + 's: %8.5f%%', classes[i],
                         100 * confusion_matrix[i, i])
            log.info('Accuracy of last 5 classes')
            for i in range(len(classes) - 5, len(classes)):
                log.info('%-' + max_len + 's: %8.5f%%', classes[i],
                         100 * confusion_matrix[i, i])
        else:
            log.info('Accuracy of each class')
            for i in range(len(classes)):
                log.info('%-' + max_len + 's: %8.5f%%', classes[i],
                         100 * confusion_matrix[i, i])
        log.info('Validation loss %.5f, accuracy %.5f%%', loss, acc * 100)
        return loss, acc
    else:
        log.info('Validation loss %.5f', loss)
        return [loss]
def train(model,
          criterion,
          optimizer,
          train_loader,
          args,
          val_loader=None,
          exp_name=None,
          classes=None,
          scheduler=None):
    if args.apex:
        from apex import amp

    def _val():
        if args.val_interval is not None:
            val_start = time()
            model.eval()
            val_res.append([
                i, train_time, run_time,
                *validate(model,
                          val_loader,
                          criterion,
                          classes=classes,
                          device=args.device)
            ])
            model.train()
            val_end = time()
            return val_end - val_start
        else:
            return 0

    def _save():
        if args.rank == 0:
            fname = get_fname(args, exp_name=exp_name)
            save_data(train_res, val_res, fname, output_dir=args.output_dir)
            log.debug('Data saved to %s', fname)

    def _eta():
        _time = train_time / i * (total_batches - i)
        if args.val_interval is not None:
            _time += val_time / (i // args.val_interval + 1) * (
                (total_batches - i) // args.val_interval + 1)
        h = _time / 3600
        if h > 1:
            return '%.2fh' % h
        m = _time / 60
        if m > 1:
            return '%.2fm' % m
        return '%.2fs' % _time

    total_batches = len(train_loader) * args.epochs
    train_res = []
    val_res = []
    running_loss = []
    running_acc = []
    i = 0
    val_time = run_time = train_time = 0
    train_start = time()
    printed = False
    val_time += _val()

    log.info('Training started')
    model.train()
    optimizer.zero_grad()
    if args.gradient_accumulation and args.ddp == 'pytorch':
        model.require_backward_grad_sync = False
    for epoch in range(1, args.epochs + 1):
        for _, (data, target) in enumerate(train_loader):
            i += 1
            target = target.to(device=args.device, non_blocking=True)
            data = data.to(device=args.device, non_blocking=True)
            if args.ddp == 'pytorch':
                # Only synchronize gradients on the last accumulation step
                if args.gradient_accumulation and i % args.sync_freq != 0:
                    model.require_backward_grad_sync = False
                else:
                    model.require_backward_grad_sync = True

            # ==== Step begin ====
            output = model(data)
            loss = criterion(output, target)
            if args.gradient_accumulation:
                loss /= args.sync_freq
            if args.apex:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if args.ddp == 'DistributedGradientParallel' and not printed:
                for n, p in model.named_parameters():
                    log.warn(
                        '%s.grad.dtype = %s, max difference between original grad and half precision grad is %f',
                        n, p.grad.dtype,
                        (p.grad - p.grad.clone().half()).abs().max())
                printed = True

            if not args.gradient_accumulation or i % args.sync_freq == 0:
                log.debug('[%d/%d, %5d/%d] optimizer step', epoch, args.epochs,
                          i, total_batches)
                optimizer.step()
                optimizer.zero_grad()

            loss = loss.item()
            running_loss.append(loss)
            if classes is not None:
                acc = accuracy(output, target).item()
                running_acc.append(acc)
            # ==== Step done ====

            current_time = time()
            run_time = current_time - train_start
            train_time = run_time - val_time
            if args.gradient_accumulation:
                tmp_res = [i, train_time, run_time, loss * args.sync_freq]
            else:
                tmp_res = [i, train_time, run_time, loss]
            if classes is not None:
                tmp_res += [acc]
            train_res.append(tmp_res)

            if i % args.disp_interval == 0:
                log.info(
                    '[%d/%d, %5d/%d] local running loss %.5f, local running acc %.5f%%, average train time %.4f seconds per batch, eta %s',
                    epoch, args.epochs, i, total_batches,
                    np.mean(running_loss),
                    np.mean(running_acc) * 100, train_time / i, _eta())
                running_loss = []
                running_acc = []

            if args.val_interval is not None and i % args.val_interval == 0:
                val_time += _val()
                # Update saved data after every validation
                _save()
        # end for

        current_time = time()
        run_time = current_time - train_start
        train_time = run_time - val_time
        log.info(
            'Training epoch %d ends, total run time %.4f seconds, average train time %.4f seconds per batch',
            epoch, run_time, train_time / i)
        if scheduler is not None:
            log.debug('scheduler.step() called')
            scheduler.step()

    if args.val_interval is not None and i % args.val_interval != 0:
        val_time += _val()
    current_time = time()
    run_time = current_time - train_start
    train_time = run_time - val_time
    _save()

    if classes is not None:
        best_acc = max([x[-1] for x in val_res])
        log.info(
            'Training finished, %d epochs, final val loss %.5f, final val acc %.5f%%, best val acc %.5f%%',
            epoch, val_res[-1][-2], val_res[-1][-1] * 100, best_acc * 100)
    else:
        log.info('Training finished, %d epochs, final val loss %.5f', epoch,
                 val_res[-1][-1])
    return train_res, val_res
def __init__(self,
             module,
             world_local_size=None,
             node_rank=None,
             local_rank=None,
             sync_freq=1,
             num_streams=1,
             premultiplier=None,
             **kwargs):
    r"""Init function.

    Args:
        module: The module to be wrapped.
        sync_freq: Number of steps between communications.
        num_streams: Number of CUDA streams to use for communication.
        premultiplier: The multiplier to be applied before communication.
            If not None, parameters will be multiplied by the premultiplier
            before communication, then divided by it after communication.
    """
    super().__init__()
    log.info('Using %s', self.__class__.__name__)
    self.module = module
    self.device = next(self.module.parameters()).device

    # Assume torch.distributed is already initialized
    self.rank = dist.get_rank()
    self.world_size = dist.get_world_size()
    self.local_rank = local_rank if local_rank is not None else self.rank
    self.node_rank = node_rank if node_rank is not None else 0
    self.world_local_size = (world_local_size
                             if world_local_size is not None else 1)

    # When the counter reaches sync_freq, perform communication and reset
    self.premultiplier = premultiplier
    self.sync_freq = sync_freq
    self._iter_counter = 0

    self.param_info = [{
        'numel': param.numel(),
        'shape': param.shape
    } for param in self.parameters()]
    self.flat_parameters, self.flat_indexes = self.flatten_tensors(
        list(self.parameters()))
    self.assign_unflattened_tensors(self.parameters(), self.flat_parameters)

    # Make sure all workers start from identical parameters
    log.debug('Broadcasting init params')
    for param in self.flat_parameters:
        dist.broadcast(param, 0)
    log.debug('Broadcasting init params done')

    self.num_streams = num_streams
    if self.device.type == 'cuda':
        self.streams = [torch.cuda.Stream() for _ in range(self.num_streams)]
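# Illustrative sketch (not part of the original source) of the init-time
# synchronization performed above by the class's own flatten_tensors /
# assign_unflattened_tensors helpers: flatten the parameters into a contiguous
# buffer, broadcast it from rank 0, and copy the values back so every worker
# starts from identical weights. The same pattern with stock PyTorch
# utilities would look roughly like this.
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors


def _broadcast_initial_parameters(module):
    params = [p.data for p in module.parameters()]
    flat = _flatten_dense_tensors(params)  # one contiguous buffer
    dist.broadcast(flat, 0)                # rank 0 is the source of truth
    for p, synced in zip(params, _unflatten_dense_tensors(flat, params)):
        p.copy_(synced)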
def __init__(self, world_size, **kwargs):
    cycle = int(np.log(world_size - 1) / np.log(2))
    super().__init__(world_size, cycle=cycle, **kwargs)
    log.info('Exponential graph initialized with cycle %d', self.cycle)
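# Worked example of the cycle computation above (illustrative; `cycle` is
# presumably the period of the time-varying peer pattern, with the static
# CompleteGraph below being the cycle = 1 special case):
#   world_size = 6   ->  cycle = int(np.log(5)  / np.log(2)) = int(2.32...) = 2
#   world_size = 13  ->  cycle = int(np.log(12) / np.log(2)) = int(3.58...) = 3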
import numpy as np

from .communication_graph import CommunicationGraph
from ceddl import log


class CompleteGraph(CommunicationGraph):
    def __init__(self, world_size, **kwargs):
        super().__init__(world_size, cycle=1, **kwargs)
        log.info('Complete graph initialized')

    def update_graph(self):
        # The complete graph is static, so there is nothing to update
        pass

    def generate_adjacency_matrix(self, t):
        # Every worker is connected to every other worker
        return np.ones((self.world_size, self.world_size))


if __name__ == '__main__':
    a = CompleteGraph(5, n_peers=1)
    log.info(str(a.adjacency_matrix))
    log.info(a.cycle)