def main():
    # is_chief indicates this machine will do shared tasks for the cluster,
    # such as logging and checkpointing.
    # is_chief must be true for at most 1 process in the training cluster.
    # $RANK is set by pytorch.distributed.launch
    # https://github.com/pytorch/pytorch/blob/db6e4576dab097abf01d032c3326e4b285eb8499/torch/distributed/launch.py#L193
    global is_chief, event_writer, global_example_count, last_recv_bytes, last_transmit_bytes, last_log_time

    is_chief = (not args.distributed) or (int(os.environ['RANK']) == 0)
    global_example_count = 0
    if is_chief:
        print(f"Logging to {args.logdir}")
        event_writer = SummaryWriter(args.logdir)
        log_tb("first", time.time())
    else:
        event_writer = NoOp()

    # baseline number for network bytes
    last_recv_bytes, last_transmit_bytes = network_bytes()
    last_log_time = time.time()

    print(args)
    print("~~epoch\thours\ttop1Accuracy\n")

    # need to index validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/validation')

    global reduce_function
    if args.c10d:
        print('Distributed: loading c10d process group')
        # https://github.com/pytorch/pytorch/blob/master/torch/lib/c10d/TCPStore.hpp
        torch.cuda.set_device(args.local_rank)
        rank = int(os.environ['RANK'])
        store = c10d.TCPStore(os.environ['MASTER_ADDR'], int(os.environ['MASTER_PORT']), rank == 0)  # (masterAddr, masterPort, isServer)
        process_group = c10d.ProcessGroupNCCL(store, rank, args.world_size)  # (store, rank, size)
        reduce_function = lambda t: process_group.allreduce(t, c10d.AllreduceOptions().reduceOp)
    elif args.distributed:
        print('Distributed: initializing process group')
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
        assert args.world_size == dist.get_world_size()
        reduce_function = lambda t: dist.all_reduce(t, op=dist.reduce_op.SUM)
        print("Distributed: success (%d/%d)" % (args.local_rank, args.world_size))

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    print("Loading model")
    if args.factorized_resnet:
        model = resnet.resnet50factorized(pretrained=args.pretrained)
    else:
        model = resnet.resnet50(pretrained=args.pretrained)

    model = model.cuda()
    if args.init_bn0:
        resnet.init_dist_weights(model)  # Sets batchnorm std to 0
    if args.fp16:
        model = network_to_half(model)

    best_prec5 = 93  # only save models with top-5 accuracy above 93%; otherwise a checkpoint would be saved every epoch
    # Load model from checkpoint. This must happen before the model is wrapped for
    # distributed training, because the checkpoint is saved without the DDP wrapper.
    if args.resume:
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_prec5 = checkpoint['best_prec5']

    if args.c10d:
        model = distributed_c10d._DistributedDataParallelC10d(model, process_group, device_ids=[args.local_rank], output_device=args.local_rank)
        c10d_sanity_check()
    elif args.distributed:
        model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    global model_params, master_params
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        model_params = master_params = model.parameters()

    optim_params = experimental_utils.bnwd_optim_params(model, model_params, master_params) if args.no_bn_wd else master_params

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(optim_params, 0, momentum=args.momentum, weight_decay=args.weight_decay)  # start with 0 lr; the scheduler changes it later

    if args.resume:  # we must resume the optimizer state separately
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda(args.local_rank))
        optimizer.load_state_dict(checkpoint['optimizer'])

    # Load data manager and lr scheduler from phases
    phases = eval(args.phases)
    print("Creating data loaders (this could take 6-12 minutes)")
    dm = DataManager([p for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer, [p for p in phases if 'lr' in p], args.scale_lr)

    start_time = datetime.now()  # start the clock only after everything is loaded
    if args.evaluate:
        return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        print('Syncing machines before training')
        sum_tensor(torch.tensor([1.0]).float().cuda())

    print("Begin training")
    estart = time.time()
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        estart = time.time()
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        if args.prof:
            break
        prec5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        is_best = prec5 > best_prec5
        best_prec5 = max(prec5, best_prec5)
        if args.local_rank == 0:
            if is_best:
                save_checkpoint(epoch, model, best_prec5, optimizer, is_best=True, filename='model_best.pth.tar')
            phase = dm.get_phase(epoch)
            if phase:
                save_checkpoint(epoch, model, best_prec5, optimizer, filename=f'sz{phase["bs"]}_checkpoint.path.tar')

    event_writer.export_scalars_to_json(args.logdir + '/scalars.json')
    event_writer.close()
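
# A minimal sketch (not part of the original script) of how a reduce_function like the
# one set up in main() is typically used to average gradients across workers. The
# function name `average_gradients`, the call site, and the division by world_size are
# assumptions for illustration only; the real training loop may do this differently.
def average_gradients(model, world_size):
    for p in model.parameters():
        if p.grad is not None:
            reduce_function(p.grad.data)   # sum this gradient across all ranks
            p.grad.data.div_(world_size)   # turn the sum into an average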
def allreduce(x, op):
    opts = c10d.AllreduceOptions()
    opts.reduceOp = op
    work = pg.allreduce([x], opts)
    work.wait()
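
# Example usage of the helper above; a sketch, assuming `pg` is the global
# ProcessGroupNCCL created in main() and `rank`/`world_size` come from the
# environment. Every worker contributes its rank, and after the allreduce
# all workers hold the same sum 0 + 1 + ... + (world_size - 1).
t = torch.tensor([float(rank)]).cuda()
allreduce(t, c10d.AllreduceOptions().reduceOp)  # default reduce op is SUM
assert t.item() == world_size * (world_size - 1) / 2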
def __init__(self, module, device_ids=None,
             output_device=None, dim=0, broadcast_buffers=True,
             process_group=None, bucket_cap_mb=25):
    super(_DistributedDataParallelC10d, self).__init__()

    # Use all devices by default
    if device_ids is None:
        device_ids = list(range(torch.cuda.device_count()))

    if output_device is None:
        output_device = device_ids[0]

    if process_group is None:
        self.process_group = c10d.get_default_group()
    else:
        self.process_group = process_group

    self.dim = dim
    self.module = module
    self.device_ids = device_ids
    self.output_device = output_device
    self.broadcast_buffers = broadcast_buffers

    self.allreduce_opts = c10d.AllreduceOptions()

    MB = 1024 * 1024
    # used for intra-node param sync and inter-node sync as well
    self.broadcast_bucket_size = 25 * MB

    # Sync params and buffers
    module_states = list(self.module.state_dict().values())
    if len(module_states) > 0:
        self._dist_broadcast_coalesced(module_states, self.broadcast_bucket_size)

    if len(device_ids) > 1:
        # TODO: we don't need to replicate params in here. they're always going to
        # be broadcasted using larger blocks in broadcast_coalesced, so it might be
        # better to not pollute the caches with these small blocks
        self._module_copies = replicate(self.module, self.device_ids, detach=True)
        self._module_copies[0] = self.module

        for module_copy in self._module_copies[1:]:
            for param, copy_param in zip(self.module.parameters(), module_copy.parameters()):
                copy_param.requires_grad = param.requires_grad
    else:
        self._module_copies = [self.module]

    # .data() of each parameter for each model replica
    self.modules_params_data = [[] for _ in range(len(self.device_ids))]
    # .data() of each buffer for each model replica
    self.modules_buffers_data = [[] for _ in range(len(self.device_ids))]

    for dev_idx, module in enumerate(self._module_copies):
        self.modules_params_data[dev_idx] = [p.data for p in module.parameters()]
        self.modules_buffers_data[dev_idx] = [b.data for b in module.buffers()]

    bucket_bytes_cap = bucket_cap_mb * MB

    # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems
    param_buckets = []
    # Split the parameters into buckets and by types as well
    param_buckets = [list(_take_tensors(m.parameters(), bucket_bytes_cap)) for m in self._module_copies]

    self.bucket_sizes = []
    self.bucket_map = {}

    # We transpose param_buckets, so the loop is over buckets.
    # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems
    for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
        self.bucket_sizes.append(0)
        # Now, we transpose again, so we iterate over bucket_elems, but getting tuples
        # of params from each device.
        for idx, param_tuple in enumerate(zip(*param_buckets_tuple)):
            if not param_tuple[0].requires_grad:
                continue
            for p in param_tuple:
                self.bucket_map[p] = (bucket_idx, idx)
            self.bucket_sizes[bucket_idx] += 1

    self.buckets = [[[None for _ in range(self.bucket_sizes[i])]
                     for _ in range(len(self.device_ids))]
                    for i in range(len(self.bucket_sizes))]
    # The number of params ready in each bucket
    self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))]
                               for i in range(len(self.bucket_sizes))]

    # coalesced bucket for only device 0
    self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]

    # We will always reduce the buckets in reverse order,
    # that is, following the order: n - 1, n - 2, ..., 0
    self.next_bucket = len(self.bucket_sizes) - 1
    self.ready_buckets_not_reduced = set()
    self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
    self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]

    # default stream tracking to launch nccl reduce kernels
    self.default_streams = []
    for dev_id in self.device_ids:
        with torch.cuda.device(dev_id):
            self.default_streams.append(torch.cuda.current_stream())

    self._register_grad_hooks()
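
# A small standalone illustration (not part of the original class) of the bucketing
# primitive used above: torch._utils._take_tensors groups tensors of the same dtype
# into chunks of at most `size_limit` bytes, so each bucket can later be coalesced
# and all-reduced in a single collective call. The sizes below are arbitrary.
import torch
from torch._utils import _take_tensors

params = [torch.zeros(1024, 1024) for _ in range(10)]    # ~4 MB each in fp32
buckets = list(_take_tensors(params, 8 * 1024 * 1024))   # cap each bucket at 8 MB
print([len(b) for b in buckets])                         # expected: [2, 2, 2, 2, 2]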
def allreduce(tensors, op):
    opts = c10d.AllreduceOptions()
    opts.reduceOp = op
    work = pg.allreduce(tensors, opts)
    work.wait()
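
# Sketch of how a list of same-dtype gradients could be reduced in one call using the
# helper above; the function name `allreduce_coalesced` is hypothetical and `pg` is
# again assumed to be a c10d process group. It mirrors what the bucketed DDP does per
# bucket: coalesce into one flat buffer, all-reduce it, then copy the results back.
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def allreduce_coalesced(grads, op):
    flat = _flatten_dense_tensors(grads)   # one contiguous buffer holding all grads
    allreduce([flat], op)                  # single collective call for the whole bucket
    for g, synced in zip(grads, _unflatten_dense_tensors(flat, grads)):
        g.copy_(synced)                    # scatter the reduced values back in place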