def flat_dist_call(tensors, call, extra_args=None):
    flat_dist_call.warn_on_half = True
    buckets = {}
    for tensor in tensors:
        tp = tensor.type()
        if tp not in buckets:
            buckets[tp] = []
        buckets[tp].append(tensor)

    if flat_dist_call.warn_on_half:
        # tensor.type() returns a string, so compare against the type name.
        if 'torch.cuda.HalfTensor' in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow."
                  " It is recommended to use the NCCL backend in this case.")
            flat_dist_call.warn_on_half = False

    for tp in buckets:
        bucket = buckets[tp]
        coalesced = _flatten_dense_tensors(bucket)
        if extra_args is not None:
            call(coalesced, *extra_args)
        else:
            call(coalesced)
        coalesced /= dist.get_world_size()
        for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
            buf.copy_(synced)
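A minimal usage sketch (hedged; the `model` variable and an already-initialized default process group are assumptions, not part of the snippet above). Because the helper divides each coalesced bucket by the world size after the collective, passing `dist.all_reduce` turns the per-rank sums into averages:

import torch.distributed as dist

# Assumed: dist.init_process_group(...) has been called and `model` is an nn.Module.
grads = [p.grad.data for p in model.parameters() if p.grad is not None]
flat_dist_call(grads, dist.all_reduce)  # sums across ranks, then averages and copies back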
def _process_batch():
    dev_grad_batch, dev_events, job_event = queue.get()
    dev_coalesced = []
    # Coalesce the tensors on all devices and start a local reduction
    for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch,
                                                 dev_events, reduction_streams):
        with torch.cuda.device(dev_id), torch.cuda.stream(stream):
            stream.wait_event(event)
            coalesced = _flatten_tensors(grad_batch)
            dev_coalesced.append(coalesced)
    # Wait for all copies to complete before starting the NCCL kernel
    for stream in reduction_streams:
        stream.synchronize()
    nccl.reduce(dev_coalesced, root=device_ids[0], streams=nccl_streams)

    # From now on we're only going to work on the first device (from device_ids)
    grad_batch = dev_grad_batch[0]
    coalesced = dev_coalesced[0]
    reduce_stream = reduction_streams[0]
    with torch.cuda.stream(reduce_stream):
        reduce_stream.wait_stream(nccl_streams[0])
        coalesced /= dist.get_world_size()
        dist.all_reduce(coalesced, group=group_id)
        for grad, reduced in zip(grad_batch, _unflatten_tensors(coalesced, grad_batch)):
            grad.copy_(reduced)
    job_event.set()
def test_send_recv(self):
    rank = dist.get_rank()
    tensor = _build_tensor(rank + 1)
    for dest in range(0, dist.get_world_size()):
        if dest == rank:
            continue
        dist.send(tensor, dest)

    for src in range(0, dist.get_world_size()):
        if src == rank:
            continue
        tensor = _build_tensor(src + 1, value=-1)
        expected_tensor = _build_tensor(src + 1)
        dist.recv(tensor, src)
        self.assertEqual(tensor, expected_tensor)

    self._barrier()
def __init__(self, num_replicas=None, rank=None):
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.extra = 0
def test_send_recv_any_source(self):
    rank = dist.get_rank()
    tensor = _build_tensor(10, rank)
    for dest in range(0, dist.get_world_size()):
        if dest == rank:
            continue
        dist.send(tensor, dest)

    recv_ranks = set()
    for src in range(0, dist.get_world_size()):
        if src == rank:
            continue
        tensor = _build_tensor(10, value=-1)
        dist.recv(tensor)
        recv_ranks.add(tensor.resize_(1)[0])

    self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
    self._barrier()
def reduction_fn_nccl():
    # This function only needs to be called once
    if not self.need_reduction:
        return

    self.need_reduction = False
    all_grads = [[] for _ in range(len(self._module_copies))]
    all_grads_buckets_iters = []

    # Bucketing all the gradients
    for dev_idx, module in enumerate(self._module_copies):
        for param in module.parameters():
            if not param.requires_grad or param.grad is None:
                continue
            if param.grad.requires_grad:
                raise RuntimeError("DistributedDataParallel only works "
                                   "with gradients that don't require "
                                   "grad")
            # Adding the gradients for reduction
            all_grads[dev_idx].append(param.grad.data)

        # Now bucketing the parameters
        dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                          self.nccl_reduce_bucket_size)
        all_grads_buckets_iters.append(dev_grads_buckets)

    # Now reduce each bucket one after another
    for grads_batch in zip(*all_grads_buckets_iters):
        grads_batch_coalesced = []
        # Coalesce each bucket
        for dev_idx, dev_grads_batch in enumerate(grads_batch):
            dev_id = self.device_ids[dev_idx]
            with torch.cuda.device(dev_id):
                dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
                grads_batch_coalesced.append(dev_grads_batch_coalesced)

        # We will only use device 0's results, but this single op should be
        # faster than doing the following two operation sequentially:
        # (1) intra-node reduce to lead GPU, followed by
        # (2) inter-node allreduce for all the first lead GPUs in all nodes
        dist.all_reduce_multigpu(grads_batch_coalesced,
                                 group=self.nccl_reduction_group_id)

        # Now only work on the first device of self.device_ids, uncoalesce
        # the gradients for each bucket
        grads_batch_coalesced[0] /= dist.get_world_size()
        grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0],
                                                       grads_batch[0])
        for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
            grad.copy_(reduced)

    # clear the gradients and save memory for replicas
    for module in self._module_copies[1:]:
        for param in module.parameters():
            if param.requires_grad:
                param.grad = None
                param.data.set_()
def __init__(self, dataset, num_replicas=None, rank=None):
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.dataset = dataset
    self.num_replicas = num_replicas
    self.rank = rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
def test_mpi():
    dist.init_process_group('mpi')
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    vector = [0] * world_size
    vector[rank] = 1
    vector = torch.DoubleTensor(vector)
    dist.all_reduce(vector, op=dist.reduce_op.SUM)
    print("Host {} : Rank {} : {}".format(get_hostname(), rank, vector))
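To make the collective above concrete, a small worked illustration (assuming a world size of 4): each rank contributes a one-hot vector, so the element-wise SUM is the all-ones vector on every rank.

# rank 0 contributes [1, 0, 0, 0]
# rank 1 contributes [0, 1, 0, 0]
# rank 2 contributes [0, 0, 1, 0]
# rank 3 contributes [0, 0, 0, 1]
# after dist.all_reduce(vector, op=dist.reduce_op.SUM), every rank holds [1, 1, 1, 1]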
def allreduce_params():
    if self.needs_reduction:
        self.needs_reduction = False
        buckets = defaultdict(list)
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = type(param.data)
                buckets[tp].append(param)

        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None):
    """
    Samples batches assuming they are in order of size to batch similarly sized samples together.
    """
    super(DistributedBucketingSampler, self).__init__(data_source)
    if num_replicas is None:
        num_replicas = get_world_size()
    if rank is None:
        rank = get_rank()
    self.data_source = data_source
    self.ids = list(range(0, len(data_source)))
    self.batch_size = batch_size
    self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)]
    self.num_replicas = num_replicas
    self.rank = rank
    self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
def config_pytorch(options):
    """Config pytorch packages.

    Fix random number for packages and initialize distributed environment for pytorch.
    Setup cuda environment for pytorch.

    :param options: A global object containing specified options.
    :type options: argparse.Namespace
    """
    # Setting `cudnn.deterministic = True` will turn on
    # CUDNN deterministic setting which can slow down training considerably.
    # Unexpected behavior may also be observed from checkpoint.
    # See: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    if options.cudnn_deterministic:
        cudnn.deterministic = True
        log.warning('You have chosen to seed training. '
                    'This will turn on the CUDNN deterministic setting, '
                    'which can slow down your training considerably! '
                    'You may see unexpected behavior when restarting '
                    'from checkpoints.', 0)

    if options.seed is not None:
        random.seed(options.seed)
        torch.manual_seed(options.seed)

    # define the graph for the computation.
    if options.use_cuda:
        assert torch.cuda.is_available()

    options.rank = dist.get_rank()
    options.world_size = dist.get_world_size()
    options.graph = FCGraph(options)

    # enable cudnn accelerator if we are using cuda.
    if options.use_cuda:
        options.graph.assigned_gpu_id()
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True
        if torch.backends.cudnn.version() is None:
            log.warning("CUDNN not found on device.")

    log.info("World size={}, Rank={}, hostname={}, cuda_available={}, cuda_device={}".format(
        options.world_size, options.rank, socket.gethostname(),
        torch.cuda.is_available(), torch.cuda.current_device()))
def _init_multigpu_helper(self):
    """Multigpu tests are designed to simulate the multi nodes with multi
    GPUs on each node. Nccl backend requires equal #GPUs in each process.
    On a single node, all visible GPUs are evenly divided to subsets,
    each process only uses a subset.
    """
    nGPUs = torch.cuda.device_count()
    world_size = dist.get_world_size()
    visible_devices = range(nGPUs)

    if BACKEND == 'nccl':
        apply_hack_for_nccl()

    nGPUs_per_process = nGPUs // world_size
    rank_to_GPU = {
        i: list(visible_devices[i * nGPUs_per_process: (i + 1) * nGPUs_per_process])
        for i in range(world_size)
    }
    return rank_to_GPU
def test_isend(self):
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank == 0:
        requests = [
            dist.isend(_build_tensor(dest, 10), dest)
            for dest in range(1, world_size)
        ]
        for request in requests:
            request.wait()
            self.assertTrue(request.is_completed())
    else:
        tensor = _build_tensor(rank, -1)
        dist.recv(tensor, 0)
        self.assertEqual(tensor, _build_tensor(rank, 10))

    self._barrier()
def test_irecv(self):
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank == 0:
        expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
        requests = [
            dist.irecv(expected_tensors[src - 1], src)
            for src in range(1, world_size)
        ]
        for src in range(1, world_size):
            requests[src - 1].wait()
            self.assertTrue(requests[src - 1].is_completed())
            self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
    else:
        tensor = _build_tensor(rank, 10)
        dist.send(tensor, 0)

    self._barrier()
def test_get_rank(self):
    test_dir = os.path.join(TEMP_DIR, 'test_dir')
    pid = str(os.getpid())
    num_processes = dist.get_world_size()
    with open(os.path.join(test_dir, pid), 'w') as f:
        f.write(str(dist.get_rank()))

    self._barrier()

    all_ranks = set()
    for f_name in os.listdir(test_dir):
        with open(os.path.join(test_dir, f_name), 'r') as f:
            all_ranks.add(int(f.read()))
    self.assertEqual(len(all_ranks), num_processes)

    self._barrier()

    if dist.get_rank() == 0:
        for f_name in os.listdir(test_dir):
            os.unlink(os.path.join(test_dir, f_name))

    self._barrier()
def sync(cls, timeout=5):
    cls.barrier_id += 1
    barrier_dir = os.path.join(TEMP_DIR, 'barrier')
    pid = str(os.getpid())
    barrier_file = os.path.join(barrier_dir, pid)
    with _lock():
        with open(barrier_file, 'w') as f:
            f.write(str(cls.barrier_id))

    start_time = time.time()
    while True:
        arrived = 0
        with _lock():
            for f_name in os.listdir(barrier_dir):
                with open(os.path.join(barrier_dir, f_name), 'r') as f:
                    data = f.read()
                    if int(data) >= cls.barrier_id:
                        arrived += 1
        if arrived == dist.get_world_size():
            break

        if time.time() - start_time > timeout:
            raise RuntimeError("barrier timeout")
        time.sleep(0.1)
def _init_multigpu_helper(self):
    """Multigpu tests are designed to simulate the multi nodes with multi
    GPUs on each node. Nccl backend requires equal #GPUs in each process.
    On a single node, all visible GPUs are evenly divided to subsets,
    each process only uses a subset.
    """
    nGPUs = torch.cuda.device_count()
    world_size = dist.get_world_size()
    visible_devices = range(nGPUs)

    # This is a hack for a known NCCL issue using multiprocess
    # in conjunction with multiple threads to manage different GPUs which
    # may cause ncclCommInitRank to fail.
    # http://docs.nvidia.com/deeplearning/sdk/nccl-release-notes/rel_2.1.4.html#rel_2.1.4
    # It slows down the performance of collective operations.
    # Without this setting NCCL might throw unhandled error.
    os.environ['NCCL_MAX_NRINGS'] = '1'

    nGPUs_per_process = int(nGPUs / world_size)
    rankToGPUMapping = {}
    for i in range(world_size):
        rankToGPUMapping[i] = visible_devices[
            i * nGPUs_per_process: (i + 1) * nGPUs_per_process]
    return rankToGPUMapping
def allreduce_params():
    if self.needs_reduction:
        self.needs_reduction = False
        buckets = {}
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = type(param.data)
                if tp not in buckets:
                    buckets[tp] = []
                buckets[tp].append(param)

        if self.warn_on_half:
            if torch.cuda.HalfTensor in buckets:
                print("WARNING: gloo dist backend for half parameters may be extremely slow."
                      " It is recommended to use the NCCL backend in this case.")
                self.warn_on_half = False

        for tp in buckets:
            bucket = buckets[tp]
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
def train(args):
    is_distributed = len(args.hosts) > 1 and args.backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))
    use_cuda = (args.processor == 'gpu') or (args.num_gpus > 0)
    logger.debug("Number of gpus available - {}".format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        os.environ['RANK'] = str(host_rank)
        dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
                args.backend, dist.get_world_size()) +
            'Current host rank is {}. Number of gpus: {}'.format(
                dist.get_rank(), args.num_gpus))

    # set the seed for generating random numbers
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed, **kwargs)
    test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs)

    # TODO: assert the logs when we move to the SDK local mode
    logger.debug("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)))
    logger.debug("Processes {}/{} ({:.0f}%) of test data".format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)))

    model = Net()
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        logger.debug("Multi-machine multi-gpu: using DistributedDataParallel.")
        # establish host rank and set device on this node
        torch.cuda.set_device(host_rank)
        model.cuda(host_rank)
        # for multiprocessing distributed, the DDP constructor should always set
        # the single device scope. otherwise, DDP will use all available devices.
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[host_rank], output_device=host_rank)
    elif use_cuda:
        # single-machine multi-gpu case
        logger.debug("Single-machine multi-gpu: using DataParallel().cuda().")
        model = model.to(device)
        model = torch.nn.DataParallel(model).to(device)
    else:
        # single-machine or multi-machine cpu case
        logger.debug("Single-machine/multi-machine cpu: using DataParallel.")
        model = model.to(device)
        model = torch.nn.DataParallel(model)

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            if is_distributed and use_cuda:
                # multi-machine multi-gpu case - allow asynchronous GPU copies of the data
                data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
            else:
                data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                logger.debug(
                    'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch, batch_idx * len(data), len(train_loader.sampler),
                        100. * batch_idx / len(train_loader), loss.item()))
        test(model, test_loader, device)

    save_model(model, args.model_dir)

    if (is_distributed and host_rank == 0) or not is_distributed:
        assert_can_track_sagemaker_experiments()
def train(args):
    is_distributed = len(args.hosts) > 1 and args.backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))
    use_cuda = args.num_gpus > 0
    logger.debug("Number of gpus available - {}".format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        os.environ['RANK'] = str(host_rank)
        dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size)
        logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
            args.backend, dist.get_world_size()) +
            'Current host rank is {}. Number of gpus: {}'.format(
            dist.get_rank(), args.num_gpus))

    # set the seed for generating random numbers
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed, **kwargs)
    test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs)

    logger.debug("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)
    ))
    logger.debug("Processes {}/{} ({:.0f}%) of test data".format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)
    ))

    model = Net().to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # single-machine multi-gpu case or single-machine or multi-machine cpu case
        model = torch.nn.DataParallel(model)

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                logger.info('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
        test(model, test_loader, device)

    save_model(model, args.model_dir)
def average_tensors(tensors):
    world_size = distributed.get_world_size()
    for tensor in tensors:
        distributed.all_reduce(tensor)
        tensor /= world_size
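A short usage sketch (hedged; `model`, `loss`, and `optimizer` are placeholders and an initialized process group is assumed): averaging gradients in place between the backward pass and the optimizer step.

# Assumed: every rank has computed `loss` on its own shard of the batch.
loss.backward()
average_tensors([p.grad for p in model.parameters() if p.grad is not None])
optimizer.step()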
def retrieve(self, combined_hidden_states: np.ndarray, current_hidden_states: np.ndarray,
             history_hidden_states: np.ndarray, n_docs: int,
             dialog_lengths: List[Tuple] = None) -> \
        Tuple[np.ndarray, np.ndarray, np.ndarray, List[dict]]:
    """
    Retrieves documents for the specified query hidden states.

    The main process, which has access to the index stored in memory, gathers queries
    from all the processes in the main training process group, performs the retrieval
    and scatters back the results.

    Args:
        combined_hidden_states (:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`):
            A batch of query vectors to retrieve with.
        current_hidden_states (:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`):
            A batch of query vectors for the current turn.
        history_hidden_states (:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`):
            A batch of query vectors for the dialog history.
        n_docs (:obj:`int`):
            The number of docs retrieved per query.

    Output:
        retrieved_doc_embeds (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`):
            The retrieval embeddings of the retrieved docs per query.
        doc_ids (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`):
            The ids of the documents in the index.
        doc_dicts (:obj:`List[dict]`):
            The retrieved_doc_embeds examples per query.
    """
    # single GPU training
    if not dist.is_initialized():
        doc_ids, retrieved_doc_embeds, doc_scores = self._main_retrieve(
            combined_hidden_states, current_hidden_states, history_hidden_states,
            n_docs, dialog_lengths)
        return retrieved_doc_embeds, doc_ids, doc_scores, self.index.get_doc_dicts(doc_ids)

    # distributed training
    world_size = dist.get_world_size(group=self.process_group)

    # gather logic
    gather_list_1 = None
    gather_list_2 = None
    gather_list_3 = None
    if self._is_main():
        gather_list_1 = [torch.empty(combined_hidden_states.shape, dtype=torch.float32)
                         for _ in range(world_size)]
        gather_list_2 = [torch.empty(current_hidden_states.shape, dtype=torch.float32)
                         for _ in range(world_size)]
        gather_list_3 = [torch.empty(history_hidden_states.shape, dtype=torch.float32)
                         for _ in range(world_size)]
    dist.gather(torch.tensor(combined_hidden_states), dst=0,
                gather_list=gather_list_1, group=self.process_group)
    dist.gather(torch.tensor(current_hidden_states), dst=0,
                gather_list=gather_list_2, group=self.process_group)
    dist.gather(torch.tensor(history_hidden_states), dst=0,
                gather_list=gather_list_3, group=self.process_group)

    # scatter logic
    n_queries = combined_hidden_states.shape[0]
    scatter_ids = []
    scatter_vectors = []
    scatter_scores = []
    if self._is_main():
        assert len(gather_list_1) == len(gather_list_2) == len(gather_list_3) == world_size
        comb_h_s = torch.cat(gather_list_1).numpy()
        curr_h_s = torch.cat(gather_list_2).numpy()
        hist_h_s = torch.cat(gather_list_3).numpy()
        ids, vectors, scores = self._main_retrieve(comb_h_s, curr_h_s, hist_h_s,
                                                   n_docs, dialog_lengths)
        ids, vectors, scores = torch.tensor(ids), torch.tensor(vectors), torch.tensor(scores)
        scatter_ids = self._chunk_tensor(ids, n_queries)
        scatter_vectors = self._chunk_tensor(vectors, n_queries)
        scatter_scores = self._chunk_tensor(scores, n_queries)
    doc_ids = self._scattered(scatter_ids, [n_queries, n_docs], target_type=torch.int64)
    retrieved_doc_embeds = self._scattered(scatter_vectors,
                                           [n_queries, n_docs, combined_hidden_states.shape[1]])
    doc_scores = self._scattered(scatter_scores, [n_queries, n_docs], torch.float64)

    return (retrieved_doc_embeds.numpy(), doc_ids.numpy(), doc_scores.numpy(),
            self.index.get_doc_dicts(doc_ids))
# ... (the preceding all_reduce benchmark loop for rank 0 is not shown) ...
            end = timer()
            print_stats(bytes, num_tensors, end - start)
        print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.all_reduce(tensor)

dist.barrier()

if rank == 0:
    print_header("scatter")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        tensors = [tensor for n in range(0, dist.get_world_size())]
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.scatter(tensor, scatter_list=tensors)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
        print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.scatter(tensor, src=0)

dist.barrier()
def losses(self, indices, gt_instances, anchors, pred_class_logits, pred_anchor_deltas):
    pred_class_logits = cat(pred_class_logits, dim=1).view(-1, self.num_classes)
    pred_anchor_deltas = cat(pred_anchor_deltas, dim=1).view(-1, 4)

    anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]  # list[Tensor(R, 4)], one for each image
    N = len(anchors)
    all_anchors = Boxes.cat(anchors).tensor  # Boxes(Tensor(N*R, 4))
    predicted_boxes = self.box2box_transform.apply_deltas(pred_anchor_deltas, all_anchors)
    predicted_boxes = predicted_boxes.reshape(N, -1, 4)

    ious = []
    pos_ious = []
    for i in range(N):
        src_idx, tgt_idx = indices[i]
        iou, _ = box_iou(predicted_boxes[i, ...], gt_instances[i].gt_boxes.tensor)
        if iou.numel() == 0:
            max_iou = iou.new_full((iou.size(0),), 0)
        else:
            max_iou = iou.max(dim=1)[0]
        a_iou, _ = box_iou(anchors[i].tensor, gt_instances[i].gt_boxes.tensor)
        if a_iou.numel() == 0:
            pos_iou = a_iou.new_full((0,), 0)
        else:
            pos_iou = a_iou[src_idx, tgt_idx]
        ious.append(max_iou)
        pos_ious.append(pos_iou)
    ious = torch.cat(ious)
    ignore_idx = ious > self.neg_ignore_thresh
    pos_ious = torch.cat(pos_ious)
    pos_ignore_idx = pos_ious < self.pos_ignore_thresh

    src_idx = torch.cat([
        src + idx * anchors[0].tensor.shape[0]
        for idx, (src, _) in enumerate(indices)
    ])
    gt_classes = torch.full(pred_class_logits.shape[:1],
                            self.num_classes,
                            dtype=torch.int64,
                            device=pred_class_logits.device)
    gt_classes[ignore_idx] = -1
    target_classes_o = torch.cat(
        [t.gt_classes[J] for t, (_, J) in zip(gt_instances, indices)])
    target_classes_o[pos_ignore_idx] = -1
    gt_classes[src_idx] = target_classes_o

    valid_idxs = gt_classes >= 0
    foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
    num_foreground = foreground_idxs.sum()

    gt_classes_target = torch.zeros_like(pred_class_logits)
    gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1

    dist.all_reduce(num_foreground)
    num_foreground = num_foreground * 1.0 / dist.get_world_size()

    # cls loss
    loss_cls = sigmoid_focal_loss_jit(
        pred_class_logits[valid_idxs],
        gt_classes_target[valid_idxs],
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
        reduction="sum",
    )
    # reg loss
    target_boxes = torch.cat(
        [t.gt_boxes.tensor[i] for t, (_, i) in zip(gt_instances, indices)],
        dim=0)
    target_boxes = target_boxes[~pos_ignore_idx]
    matched_predicted_boxes = predicted_boxes.reshape(-1, 4)[src_idx[~pos_ignore_idx]]
    loss_box_reg = (1 - torch.diag(
        generalized_box_iou(matched_predicted_boxes, target_boxes))).sum()

    return {
        "loss_cls": loss_cls / max(1, num_foreground),
        "loss_box_reg": loss_box_reg / max(1, num_foreground),
    }
def __init__(self, latent_size, generator, discriminator, inverter,
             generator_optimizer, discriminator_optimizer, inverter_optimizer,
             train_data_loader, val_data_loader,
             generator_lr_scheduler=None, discriminator_lr_scheduler=None,
             inverter_lr_scheduler=None, train_sampler=None, val_sampler=None,
             divergence_loss_weight=0.1, real_gradient_penalty_weight=0.0,
             fake_gradient_penalty_weight=0.0, log_steps=100, log_dir='log'):
    self.latent_size = latent_size
    self.generator = generator
    self.discriminator = discriminator
    self.inverter = inverter
    self.generator_optimizer = generator_optimizer
    self.discriminator_optimizer = discriminator_optimizer
    self.inverter_optimizer = inverter_optimizer
    self.train_data_loader = train_data_loader
    self.val_data_loader = val_data_loader
    self.generator_lr_scheduler = generator_lr_scheduler
    self.discriminator_lr_scheduler = discriminator_lr_scheduler
    self.train_sampler = train_sampler
    self.val_sampler = val_sampler
    self.divergence_loss_weight = divergence_loss_weight
    self.real_gradient_penalty_weight = real_gradient_penalty_weight
    self.fake_gradient_penalty_weight = fake_gradient_penalty_weight
    self.log_steps = log_steps
    self.summary_dir = os.path.join(log_dir, 'summaries')
    self.checkpoint_dir = os.path.join(log_dir, 'checkpoints')
    self.epoch = 0
    self.global_step = 0
    self.rank = distributed.get_rank()
    self.world_size = distributed.get_world_size()

    os.makedirs(self.summary_dir, exist_ok=True)
    os.makedirs(self.checkpoint_dir, exist_ok=True)

    self.summary_writer = None
    if not self.rank:
        self.summary_writer = SummaryWriter(self.summary_dir)

    for tensor in self.generator.state_dict().values():
        if tensor.numel():
            distributed.broadcast(tensor, 0)
    for tensor in self.discriminator.state_dict().values():
        if tensor.numel():
            distributed.broadcast(tensor, 0)
    for tensor in self.inverter.state_dict().values():
        if tensor.numel():
            distributed.broadcast(tensor, 0)

    # NOTE: Without doing this, all gradients are initialized to None.
    # NOTE: That means gradients of the same parameter can be None on some devices and
    # NOTE: cannot be reduced if they don't contribute to the loss because of path sampling.
    for parameter in self.generator.parameters():
        if parameter.requires_grad:
            parameter.grad = torch.zeros_like(parameter)
    for parameter in self.discriminator.parameters():
        if parameter.requires_grad:
            parameter.grad = torch.zeros_like(parameter)
    for parameter in self.inverter.parameters():
        if parameter.requires_grad:
            parameter.grad = torch.zeros_like(parameter)
def train(args):
    world_size = len(args.hosts)
    is_distributed = world_size > 1
    logger.debug('Number of hosts {}. Distributed training - {}'.format(world_size, is_distributed))
    use_cuda = args.num_gpus > 0
    logger.debug('Number of gpus available - {}'.format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device('cuda' if use_cuda else 'cpu')

    if is_distributed:
        # Initialize the distributed environment.
        backend = 'gloo'
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=backend, rank=host_rank, world_size=world_size)
        logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
            backend, dist.get_world_size()) +
            'Current host rank is {}. Is cuda available: {}. Number of gpus: {}'.format(
            dist.get_rank(), torch.cuda.is_available(), args.num_gpus))

    # set the seed for generating random numbers
    seed = 1
    torch.manual_seed(seed)
    if use_cuda:
        torch.cuda.manual_seed(seed)

    train_sampler, train_loader = _get_train_data_loader(args.data_dir, is_distributed,
                                                         args.batch_size, **kwargs)
    test_loader = _get_test_data_loader(args.data_dir, **kwargs)

    logger.debug('Processes {}/{} ({:.0f}%) of train data'.format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)
    ))
    logger.debug('Processes {}/{} ({:.0f}%) of test data'.format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)
    ))

    model = Net().to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        logger.debug('Multi-machine multi-gpu: using DistributedDataParallel.')
        model = torch.nn.parallel.DistributedDataParallel(model)
    elif use_cuda:
        # single-machine multi-gpu case
        logger.debug('Single-machine multi-gpu: using DataParallel().cuda().')
        model = torch.nn.DataParallel(model)
    else:
        # single-machine or multi-machine cpu case
        logger.debug('Single-machine/multi-machine cpu: using DataParallel.')
        model = torch.nn.DataParallel(model)

    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.5)
    log_interval = 100
    for epoch in range(1, args.epochs + 1):
        if is_distributed:
            train_sampler.set_epoch(epoch)
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % log_interval == 0:
                logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
        accuracy = test(model, test_loader, device)
    save_model(model, args.model_dir)

    logger.debug('Overall test accuracy: {}'.format(accuracy))
def get_world_size():
    return dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1
def do_epoch(args: argparse.Namespace,
             train_loader: torch.utils.data.DataLoader,
             model: DDP,
             optimizer: torch.optim.Optimizer,
             scheduler: torch.optim.lr_scheduler,
             epoch: int,
             callback: VisdomLogger,
             iter_per_epoch: int,
             log_iter: int) -> Tuple[torch.tensor, torch.tensor]:
    loss_meter = AverageMeter()
    train_losses = torch.zeros(log_iter).to(dist.get_rank())
    train_mIous = torch.zeros(log_iter).to(dist.get_rank())

    iterable_train_loader = iter(train_loader)

    if main_process(args):
        bar = tqdm(range(iter_per_epoch))
    else:
        bar = range(iter_per_epoch)

    for i in bar:
        model.train()
        current_iter = epoch * len(train_loader) + i + 1

        images, gt = next(iterable_train_loader)
        images = images.to(dist.get_rank(), non_blocking=True)
        gt = gt.to(dist.get_rank(), non_blocking=True)

        loss = compute_loss(args=args,
                            model=model,
                            images=images,
                            targets=gt.long(),
                            num_classes=args.num_classes_tr)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if args.scheduler == 'cosine':
            scheduler.step()

        if i % args.log_freq == 0:
            model.eval()
            logits = model(images)
            intersection, union, target = intersectionAndUnionGPU(
                logits.argmax(1), gt, args.num_classes_tr, 255)
            if args.distributed:
                dist.all_reduce(loss)
                dist.all_reduce(intersection)
                dist.all_reduce(union)
                dist.all_reduce(target)

            allAcc = (intersection.sum() / (target.sum() + 1e-10))  # scalar
            mAcc = (intersection / (target + 1e-10)).mean()
            mIoU = (intersection / (union + 1e-10)).mean()
            loss_meter.update(loss.item() / dist.get_world_size())

            if main_process(args):
                if callback is not None:
                    t = current_iter / len(train_loader)
                    callback.scalar('loss_train_batch', t, loss_meter.avg, title='Loss')
                    callback.scalars(['mIoU', 'mAcc', 'allAcc'], t,
                                     [mIoU, mAcc, allAcc],
                                     title='Training metrics')
                    for index, param_group in enumerate(optimizer.param_groups):
                        lr = param_group['lr']
                        callback.scalar('lr', t, lr, title='Learning rate')
                        break

            train_losses[int(i / args.log_freq)] = loss_meter.avg
            train_mIous[int(i / args.log_freq)] = mIoU

    if args.scheduler != 'cosine':
        scheduler.step()

    return train_mIous, train_losses
def get_world_size():
    if not dist.is_available():
        return 1
    if not dist.is_initialized():
        return 1
    return dist.get_world_size()
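A small usage sketch (the learning-rate scaling rule and the names below are illustrative assumptions): the guard lets the same code run unmodified whether or not torch.distributed has been initialized.

world_size = get_world_size()
lr = 0.1 * world_size              # linear LR scaling with the number of ranks
mean_loss = loss_sum / world_size  # divides by 1 in the single-process case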
def forward(ctx, x):
    if (dist.is_available() and dist.is_initialized()
            and (dist.get_world_size() > 1)):
        x = x.contiguous() / dist.get_world_size()
        dist.all_reduce(x)
    return x
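For context, a hedged sketch of how a forward like this can be wrapped in a complete torch.autograd.Function; the class name and the backward rule are assumptions rather than part of the source, which only shows the forward.

import torch
import torch.distributed as dist

class AllReduceMean(torch.autograd.Function):
    """Averages a tensor across ranks and stays differentiable."""

    @staticmethod
    def forward(ctx, x):
        if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
            x = x.contiguous() / dist.get_world_size()
            dist.all_reduce(x)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        # Every rank's output depends on every rank's input, so the incoming
        # gradients are averaged the same way (an assumption of this sketch).
        if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
            grad_output = grad_output.contiguous() / dist.get_world_size()
            dist.all_reduce(grad_output)
        return grad_output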
def train(self, train, dev, test, buckets=32, batch_size=5000, lr=2e-3, mu=.9, nu=.9,
          epsilon=1e-12, clip=5.0, decay=.75, decay_steps=5000, epochs=5000,
          patience=100, verbose=True, **kwargs):
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.train()
    if dist.is_initialized():
        args.batch_size = args.batch_size // dist.get_world_size()

    logger.info("Loading the data")
    train = Dataset(self.transform, args.train, **args)
    dev = Dataset(self.transform, args.dev)
    test = Dataset(self.transform, args.test)

    logger.info("Building the datasets")
    train.build(args.batch_size, args.buckets, True, dist.is_initialized())
    logger.info("train built")
    dev.build(args.batch_size, args.buckets)
    logger.info("dev built")
    test.build(args.batch_size, args.buckets)
    logger.info(f"\n{'train:':6} {train}\n{'dev:':6} {dev}\n{'test:':6} {test}\n")

    logger.info(f"{self.model}\n")
    if dist.is_initialized():
        self.model = DDP(self.model,
                         device_ids=[args.local_rank],
                         find_unused_parameters=True)

    self.optimizer = Adam(self.model.parameters(), args.lr, (args.mu, args.nu), args.epsilon)
    self.scheduler = ExponentialLR(self.optimizer, args.decay**(1 / args.decay_steps))

    elapsed = timedelta()
    best_e, best_metric = 1, Metric()

    for epoch in range(1, args.epochs + 1):
        start = datetime.now()

        logger.info(f"Epoch {epoch} / {args.epochs}:")
        self._train(train.loader)
        loss, dev_metric = self._evaluate(dev.loader)
        logger.info(f"{'dev:':6} - loss: {loss:.4f} - {dev_metric}")
        loss, test_metric = self._evaluate(test.loader)
        logger.info(f"{'test:':6} - loss: {loss:.4f} - {test_metric}")

        t = datetime.now() - start
        # save the model if it is the best so far
        if dev_metric > best_metric:
            best_e, best_metric = epoch, dev_metric
            if is_master():
                self.save(args.path)
            logger.info(f"{t}s elapsed (saved)\n")
        else:
            logger.info(f"{t}s elapsed\n")
        elapsed += t
        if epoch - best_e >= args.patience:
            break

    loss, metric = self.load(**args)._evaluate(test.loader)

    logger.info(f"Epoch {best_e} saved")
    logger.info(f"{'dev:':6} - {best_metric}")
    logger.info(f"{'test:':6} - {metric}")
    logger.info(f"{elapsed}s elapsed, {elapsed / epoch}s/epoch")
def test_synchronize_sgd():
    torch.manual_seed(42)
    dist.init_process_group('mpi')
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    device = torch.device('cpu')
    # device = torch.device('cuda')  # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold input and outputs
    x = torch.randn(N, D_in, device=device)
    y = torch.randn(N, D_out, device=device)
    x = x[rank::world_size]
    y = y[rank::world_size]

    # Create random Tensors for weights; setting requires_grad=True means that we
    # want to compute gradients for these Tensors during the backward pass.
    w1 = torch.randn(D_in, H, device=device, requires_grad=True)
    w2 = torch.randn(H, D_out, device=device, requires_grad=True)

    learning_rate = 1e-6
    for t in range(500):
        # Forward pass: compute predicted y using operations on Tensors. Since w1 and
        # w2 have requires_grad=True, operations involving these Tensors will cause
        # PyTorch to build a computational graph, allowing automatic computation of
        # gradients. Since we are no longer implementing the backward pass by hand we
        # don't need to keep references to intermediate values.
        y_pred = x.mm(w1).clamp(min=0).mm(w2)

        # Compute and print loss. Loss is a Tensor of shape (), and loss.item()
        # is a Python number giving its value.
        loss = (y_pred - y).pow(2).sum()
        if rank == 0:
            print("Iter {} : {:10.3e}".format(t, loss.item()))

        # Use autograd to compute the backward pass. This call will compute the
        # gradient of loss with respect to all Tensors with requires_grad=True.
        # After this call w1.grad and w2.grad will be Tensors holding the gradient
        # of the loss with respect to w1 and w2 respectively.
        loss.backward()

        # Update weights using gradient descent. For this step we just want to mutate
        # the values of w1 and w2 in-place; we don't want to build up a computational
        # graph for the update steps, so we use the torch.no_grad() context manager
        # to prevent PyTorch from building a computational graph for the updates
        with torch.no_grad():
            w1 -= learning_rate * w1.grad
            w2 -= learning_rate * w2.grad

            # Manually zero the gradients after running the backward pass
            w1.grad.zero_()
            w2.grad.zero_()

            # Synchronize weights
            dist.all_reduce(w1, op=dist.reduce_op.SUM)
            dist.all_reduce(w2, op=dist.reduce_op.SUM)
            w1 /= world_size
            w2 /= world_size
def _train(args):
    is_distributed = len(args.hosts) > 1 and args.dist_backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=args.dist_backend, rank=host_rank, world_size=world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
                args.dist_backend, dist.get_world_size()) +
            'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format(
                dist.get_rank(), torch.cuda.is_available(), args.num_gpus))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info("Device Type: {}".format(device))

    logger.info("Loading Cifar10 dataset")
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    trainset = torchvision.datasets.CIFAR10(root=args.data_dir, train=True,
                                            download=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers)

    testset = torchvision.datasets.CIFAR10(root=args.data_dir, train=False,
                                           download=False, transform=transform)
    test_loader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size,
                                              shuffle=False, num_workers=args.workers)

    logger.info("Model loaded")
    model = Net()

    if torch.cuda.device_count() > 1:
        logger.info("Gpu count: {}".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)

    model = model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(0, args.epochs):
        running_loss = 0.0
        for i, data in enumerate(train_loader):
            # get the inputs
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
    print('Finished Training')
    return _save_model(model, args.model_dir)
def _average_gradients(model):
    # Gradient averaging.
    size = float(dist.get_world_size())
    for param in model.parameters():
        dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM, group=0)
        param.grad.data /= size
def get_world_size():
    return dist.get_world_size()
def all_gather_list(data, max_size=16384):
    """
    Gather arbitrary data from all nodes into a list.

    Similar to `~torch.distributed.all_gather` but for arbitrary Python
    data. Note that *data* must be picklable.

    :param data:
        data from the local worker to be gathered on other workers
    :param int max_size:
        maximum size of the data to be gathered across workers

    :returns: a list containing [data1, data2, ...] of all workers
    """
    if not is_distributed():
        # fall back to just keeping things basic if we're not distributed
        return [data]

    # stolen shamelessly from fairseq
    # https://github.com/pytorch/fairseq/blob/c37250ab1c845919af721cd3f5c4cec2993aefe1/fairseq/distributed_utils.py#L116-L170
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    buffer_size = max_size * world_size
    if (
        not hasattr(all_gather_list, '_buffer')
        or all_gather_list._buffer.numel() < buffer_size
    ):
        all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size)

    buffer = all_gather_list._buffer
    buffer.zero_()

    enc = pickle.dumps(data)
    enc_size = len(enc)
    if enc_size + 2 > max_size:
        raise ValueError('encoded data exceeds max_size: {}'.format(enc_size + 2))
    assert max_size < 255 * 256

    buffer_rank = buffer[rank * max_size: (rank + 1) * max_size]
    buffer_rank[0] = enc_size // 255  # this encoding works for max_size < 65k
    buffer_rank[1] = enc_size % 255
    buffer_rank[2: enc_size + 2] = torch.ByteTensor(list(enc))

    dist.all_reduce(buffer)

    result = []
    for i in range(world_size):
        out_buffer = buffer[i * max_size: (i + 1) * max_size]
        size = (255 * out_buffer[0].item()) + out_buffer[1].item()
        if size > 0:
            try:
                result.append(pickle.loads(bytes(out_buffer[2: size + 2].tolist())))
            except pickle.UnpicklingError:
                raise RuntimeError(
                    'There was an unpickling error in all_gather_list. This likely '
                    'means your workers got out of synchronization (e.g. one is '
                    'expecting to sync and another is not.)'
                )

    return result
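A brief usage sketch (hedged; the statistics dictionary below is a made-up example, and an initialized process group with CUDA available is assumed): gathering a small picklable object from every worker and aggregating it locally.

# Assumed: is_distributed() is True and dist.init_process_group(...) was called.
stats = {'rank': dist.get_rank(), 'n_examples': 1234}
all_stats = all_gather_list(stats)                    # one dict per worker, on every worker
total_examples = sum(s['n_examples'] for s in all_stats)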
def sharded_embedding_bag(types, args, kwargs, pg):
    """
    Handles ``__torch_function__`` dispatch for ``torch.nn.functional.embedding_bag``.
    This method computes a sharded embedding bag aggregation and has the
    following limitations:

    1. Supports only sharding of ``weight``.
    2. Supports only ``ChunkShardingSpec``.
    3. Supports only a single local shard per rank.
    4. Supports only limited specs like offsets and per_sample_weights, excluding
       specs like padding_idx, max_norm, etc.

    Based on the dimension that the weight is sharded on, there are two algorithms:

    ROWWISE SHARDING
    ================
    For row-wise sharding the weight is sharded on dimension 0.

    The overall algorithm can be best explained with an example. Let's assume
    the dims for input are (4 x 6) and W are (16 x 17), and W is sharded across
    4 GPUs creating 4 shards of (4 x 17). The algorithm is as follows:

    1. First, if the input is a 2D tensor, we sort by row. (If it's a 1D tensor,
       we sort the tensor per interval defined by offset.)
       For example, if the given input is generated within [1, 9] like
       tensor([[ 3,  7,  7,  9,  2,  1],
               [ 0,  0, 14,  5,  3, 12],
               [ 4,  5,  5,  9,  5, 13],
               [10,  3,  0,  7, 13,  9]])
       then we have the sorted 2D tensor:
       tensor([[ 1,  2,  3,  7,  7,  9],
               [ 0,  0,  3,  5, 12, 14],
               [ 4,  5,  5,  5,  9, 13],
               [ 0,  3,  7,  9, 10, 13]])
       Note that if a placement is not equal to its rank we rearrange accordingly.
    2. Based on the sorted result, we now have offsets like the following:
       [tensor([0, 3, 5, 6]), tensor([0, 3, 4, 4]),
        tensor([0, 0, 4, 5]), tensor([0, 2, 3, 5])]
       Note that embedding bag does allow an offset idx equal to the length of the
       input, or repetitive offsets. For these cases, it returns a zero tensor.
    3. Next, we rearrange the sorted tensor into different ranks by first flattening
       it and grouping by ranks. Finally, we get a list of 1D tensors. So the given
       tensor now becomes:
       [tensor([1, 2, 3, 0, 0, 3, 0, 3]), tensor([7, 7, 5, 4, 5, 5, 5, 7]),
        tensor([9, 9, 9, 10]), tensor([12, 14, 13, 13])]
       We sync offsets with IDs. Offsets now become:
       [tensor([0, 3, 6, 6]), tensor([0, 2, 3, 7]),
        tensor([0, 1, 1, 2]), tensor([0, 0, 2, 3])]
    4. Before we send out the arrays to other ranks, we need to apply the modulo
       operation so that each rank can use the result for its embedding look-up.
       The above ID tensor list looks like the below after taking the modulus by 4:
       [tensor([1, 2, 3, 0, 0, 3, 0, 3]), tensor([3, 3, 1, 0, 1, 1, 1, 3]),
        tensor([1, 1, 1, 2]), tensor([0, 2, 1, 1])]
    5. The example above only happens on one rank, and each rank does a very similar
       thing with its own rearranged IDs and offsets list. We then send IDs and
       offsets to the corresponding rank. Each rank does the look-up and aggregation
       on its local shard. We then use reduce_scatter to send the result back to each
       rank and perform the aggregation simultaneously.
    6. For "Mean" mode we need to divide by either the column size (2D) or the
       interval length defined by the offsets. We also need to mask the non-existing
       rows to negative Inf so that negative values do not get wiped out in the
       "Max" mode.

    COLWISE SHARDING
    ================
    For col-wise sharding the weight is sharded on dimension 1.

    The overall algorithm can be best explained with an example. Let's assume
    the dims for input are (4 x 6) and W are (16 x 17), and W is sharded across
    4 GPUs creating 3 shards of (16 x 5) and 1 shard of (16 x 2). The algorithm
    is as follows:

    1. First the input is broadcast to all ranks. Since this is SPMD we actually do
       an all_gather for all the inputs, resulting in 4 (4 x 6) inputs on each rank.
    2. Next we perform a local embedding bag operation under the given mode by
       applying each input (4 x 6) to the local shard (16 x 5) ((16 x 2) for the
       last). This results in 4 (5 x 4) ((2 x 4) for the last) matrices on each rank.
       We transpose the aggregation result.
    3. Next, we concatenate these 4 matrices and perform an all2all to share the
       appropriate (5 x 4) or (2 x 4) matrices with each rank.
    4. Now, each rank receives a (17 x 4) matrix, which is basically the size of the
       result we need.
    5. If placements are not in order, any appropriate rearrangement of columns is
       done for the (17 x 4) matrix and finally we transpose the output again.
    """
    # Validate input params
    _validate_embedding_bag_param(args, kwargs)

    input = args[0]
    weight = args[1]
    offsets = kwargs["offsets"]
    per_sample_weights = kwargs["per_sample_weights"]
    mode = kwargs["mode"]

    local_shard = weight.local_shards()[0].tensor.contiguous()
    sharding_dim = weight._sharding_spec.dim
    world_size = dist.get_world_size(pg)

    if sharding_dim == 1:
        return _handle_col_wise_sharding(
            input,
            world_size,
            weight,
            local_shard,
            offsets,
            per_sample_weights,
            mode,
            pg,
        )
    elif sharding_dim == 0:
        return _handle_row_wise_sharding(
            input,
            world_size,
            weight,
            local_shard,
            offsets,
            per_sample_weights,
            mode,
            pg,
        )
    else:
        raise RuntimeError(
            f"nn.EmbeddingBag weight sharded on dim {sharding_dim} not supported!"
        )
def __init__(
    self,
    module: nn.Module,
    sharded_optimizer: Union[OSS, List[OSS]],
    process_group: Any = None,
    broadcast_buffers: bool = True,
    sync_models_at_startup: bool = True,
):
    super().__init__()

    self.module = module
    self.sharded_optimizers = [sharded_optimizer] if isinstance(sharded_optimizer, OSS) else sharded_optimizer
    self.enable_broadcast_buffers = broadcast_buffers

    # Handle a no_sync() context which prevents the gradient synchronization,
    # accumulate in place
    self.should_accumulate_grads = False

    # Communication related attributes
    self.process_group = process_group if process_group is not None else dist.group.WORLD
    self.world_size = dist.get_world_size(self.process_group)
    self.reference_global_rank = OSS.get_global_rank(self.process_group, 0)  # picking rank 0 as the reference
    self.rank = dist.get_rank(self.process_group)
    self.global_rank = OSS.get_global_rank(self.process_group, self.rank)

    # Expose some of the PytorchDDP attributes, some frameworks rely on them.
    # See https://pytorch.org/docs/stable/_modules/torch/nn/parallel/distributed.html#DistributedDataParallel
    # device_id related logic is not present, this is not handled
    devices = {p.device for p in self.module.parameters()}
    self.is_multi_device_module = len(devices) > 1
    self.device = list(devices)[0]

    distinct_device_types = {p.device.type for p in self.module.parameters()}
    assert len(distinct_device_types) == 1, (
        "ShardedDataParallel's input module must be on "
        "the same type of devices, but input module parameters are located on {} different device types."
    ).format(distinct_device_types)
    self.device_type = list(distinct_device_types)[0]

    # Scaffolding to be able to reduce the grads during the BW pass.
    # Several optimizers can be present, each working on separate parameter sets;
    # we build an iterator which goes through all the parameters involved globally.
    self._param_iterator = chain(*[optim.should_bucket_param.keys() for optim in self.sharded_optimizers])
    self._grad_to_be_reduced = [True for _ in self._param_iterator]
    self._reduced_grads: Dict[OSS, int] = {}
    self._reduced_grads_max = {o: len(o.param_to_rank.values()) for o in self.sharded_optimizers}
    self._clear_counters()

    self._grad_accs: List[Callable] = []
    self._setup_backward_hooks()

    # Make sure that all ranks start with the same model
    if sync_models_at_startup:
        self._sync_params_and_buffers()
# ... (the preceding all_reduce benchmark loop for rank 0 is not shown) ...
            end = timer()
            print_stats(bytes, num_tensors, end - start)
        print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.all_reduce(tensor)

dist.barrier()

if rank == 0:
    print_header("scatter")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        tensors = [tensor for n in range(0, dist.get_world_size())]
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.scatter_send(tensors, tensor)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
        print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.scatter_recv(tensor, 0)

dist.barrier()
def main():
    args = create_argparser().parse_args()

    dist_util.setup_dist()
    logger.configure()

    logger.log("creating model...")
    model, diffusion = sr_create_model_and_diffusion(
        **args_to_dict(args, sr_model_and_diffusion_defaults().keys()))
    model.load_state_dict(
        dist_util.load_state_dict(args.model_path, map_location="cpu"))
    # matched, not_in_model, not_in_checkpoint = load_tolerant(model, args.model_path)
    # assert not not_in_checkpoint, f"missing keys in checkpoint {not_in_checkpoint}"
    # assert len(not_in_model) in (0, 1), f"multiple keys missing in model {not_in_model}"
    model.to(dist_util.dev())
    if args.use_fp16:
        model.convert_to_fp16()
    model.eval()

    logger.log("loading data...")
    data = load_data_for_worker(args.base_samples, args.batch_size, args.class_cond)

    logger.log("creating samples...")
    all_images = []
    while len(all_images) * args.batch_size < args.num_samples:
        model_kwargs = next(data)
        model_kwargs = {k: v.to(dist_util.dev()) for k, v in model_kwargs.items()}
        sample = diffusion.p_sample_loop(
            model,
            (args.batch_size, 3, args.large_size, args.large_size),
            clip_denoised=args.clip_denoised,
            model_kwargs=model_kwargs,
        )
        sample = ((sample + 1) * 127.5).clamp(0, 255).to(th.uint8)
        sample = sample.permute(0, 2, 3, 1)
        sample = sample.contiguous()

        all_samples = [th.zeros_like(sample) for _ in range(dist.get_world_size())]
        dist.all_gather(all_samples, sample)  # gather not supported with NCCL
        for sample in all_samples:
            all_images.append(sample.cpu().numpy())
        logger.log(f"created {len(all_images) * args.batch_size} samples")

    arr = np.concatenate(all_images, axis=0)
    arr = arr[:args.num_samples]
    if dist.get_rank() == 0:
        shape_str = "x".join([str(x) for x in arr.shape])
        out_path = os.path.join(logger.get_dir(), f"samples_{shape_str}.npz")
        logger.log(f"saving to {out_path}")
        np.savez(out_path, arr)

    dist.barrier()
    logger.log("sampling complete")
def _init_global_test(self):
    group = [i for i in range(0, dist.get_world_size())]
    group_id = dist.group.WORLD
    rank = dist.get_rank()
    return (group, group_id, rank)
def optimize(self, num_traces, dataset, dataset_valid, batch_size=64, valid_every=None,
             optimizer_type=Optimizer.ADAM, learning_rate=0.0001, momentum=0.9,
             weight_decay=1e-5, save_file_name_prefix=None, save_every_sec=600,
             distributed_backend=None, distributed_params_sync_every=10000,
             distributed_loss_update_every=None, dataloader_offline_num_workers=0,
             *args, **kwargs):
    if not self._layers_initialized:
        self._init_layers_observe_embedding(self._observe_embeddings,
                                            example_trace=dataset.__getitem__(0))
        self._init_layers()
        self._layers_initialized = True

    if distributed_backend is None:
        distributed_world_size = 1
        distributed_rank = 0
    else:
        dist.init_process_group(backend=distributed_backend)
        distributed_world_size = dist.get_world_size()
        distributed_rank = dist.get_rank()
        util.init_distributed_print(distributed_rank, distributed_world_size, False)
        print(colored('Distributed synchronous training', 'yellow', attrs=['bold']))
        print(colored('Distributed backend : {}'.format(distributed_backend),
                      'yellow', attrs=['bold']))
        print(colored('Distributed world size : {}'.format(distributed_world_size),
                      'yellow', attrs=['bold']))
        print(colored('Distributed minibatch size: {} (global), {} (per node)'.format(
            batch_size * distributed_world_size, batch_size), 'yellow', attrs=['bold']))
        print(colored('Distributed learning rate : {} (global), {} (base)'.format(
            learning_rate * distributed_world_size, learning_rate), 'yellow', attrs=['bold']))
        print(colored('Distributed optimizer : {}'.format(str(optimizer_type)),
                      'yellow', attrs=['bold']))

    self._distributed_backend = distributed_backend
    self._distributed_world_size = distributed_world_size
    self._optimizer_type = optimizer_type
    self._batch_size = batch_size
    self._learning_rate = learning_rate * distributed_world_size
    self._momentum = momentum

    self.train()
    prev_total_train_seconds = self._total_train_seconds
    time_start = time.time()
    time_loss_min = time.time()
    time_last_batch = time.time()
    if valid_every is None:
        valid_every = max(100, num_traces / 1000)
    if distributed_loss_update_every is None:
        distributed_loss_update_every = valid_every
    last_validation_trace = -valid_every + 1
    epoch = 0
    iteration = 0
    trace = 0
    stop = False
    print('Train. time | Epoch| Trace | Init. loss| Min. loss | Curr. loss| T.since min | Traces/sec')
    max_print_line_len = 0
    loss_min_str = ''
    time_since_loss_min_str = ''
    last_auto_save_time = time.time() - save_every_sec

    num_workers = 0
    if isinstance(dataset, OfflineDataset):  # and (distributed_world_size == 1):
        num_workers = dataloader_offline_num_workers
    # print('num_workers', num_workers)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                            num_workers=num_workers, collate_fn=lambda x: Batch(x))
    if dataset_valid is not None:
        dataloader_valid = DataLoader(dataset_valid, batch_size=batch_size,
                                      num_workers=num_workers, collate_fn=lambda x: Batch(x))

    while not stop:
        epoch += 1
        for i_batch, batch in enumerate(dataloader):
            # Important, a self._distributed_sync_parameters() needs to happen
            # at the very beginning of a training
            if (distributed_world_size > 1) and (iteration % distributed_params_sync_every == 0):
                self._distributed_sync_parameters()

            if self._layers_pre_generated:  # and (distributed_world_size > 1):
                layers_changed = False
            else:
                layers_changed = self._polymorph(batch)

            if (self._optimizer is None) or layers_changed:
                if optimizer_type == Optimizer.ADAM:
                    self._optimizer = optim.Adam(self.parameters(),
                                                 lr=learning_rate * distributed_world_size,
                                                 weight_decay=weight_decay)
                else:  # optimizer_type == Optimizer.SGD
                    self._optimizer = optim.SGD(self.parameters(),
                                                lr=learning_rate * distributed_world_size,
                                                momentum=momentum, nesterov=True,
                                                weight_decay=weight_decay)

            # self._optimizer.zero_grad()
            if distributed_world_size > 1:
                self._distributed_zero_grad()
            else:
                self._optimizer.zero_grad()
            success, loss = self._loss(batch)
            if not success:
                print(colored('Cannot compute loss, skipping batch. Loss: {}'.format(loss),
                              'red', attrs=['bold']))
            else:
                loss.backward()
                if distributed_world_size > 1:
                    self._distributed_sync_grad(distributed_world_size)
                self._optimizer.step()
                loss = float(loss)

                if self._loss_initial is None:
                    self._loss_initial = loss
                    self._loss_max = loss
                loss_initial_str = '{:+.2e}'.format(self._loss_initial)
                # loss_max_str = '{:+.3e}'.format(self._loss_max)
                if loss < self._loss_min:
                    self._loss_min = loss
                    loss_str = colored('{:+.2e}'.format(loss), 'green', attrs=['bold'])
                    loss_min_str = colored('{:+.2e}'.format(self._loss_min), 'green', attrs=['bold'])
                    time_loss_min = time.time()
                    time_since_loss_min_str = colored(util.days_hours_mins_secs_str(0),
                                                      'green', attrs=['bold'])
                elif loss > self._loss_max:
                    self._loss_max = loss
                    loss_str = colored('{:+.2e}'.format(loss), 'red', attrs=['bold'])
                    # loss_max_str = colored('{:+.3e}'.format(self._loss_max), 'red', attrs=['bold'])
                else:
                    if loss < self._loss_previous:
                        loss_str = colored('{:+.2e}'.format(loss), 'green')
                    elif loss > self._loss_previous:
                        loss_str = colored('{:+.2e}'.format(loss), 'red')
                    else:
                        loss_str = '{:+.2e}'.format(loss)
                    loss_min_str = '{:+.2e}'.format(self._loss_min)
                    # loss_max_str = '{:+.3e}'.format(self._loss_max)
                    time_since_loss_min_str = util.days_hours_mins_secs_str(time.time() - time_loss_min)

                self._loss_previous = loss
                self._total_train_iterations += 1
                trace += batch.size
                self._total_train_traces += batch.size * distributed_world_size
                total_train_traces_str = '{:9}'.format('{:,}'.format(self._total_train_traces))
                epoch_str = '{:4}'.format('{:,}'.format(epoch))
                self._total_train_seconds = prev_total_train_seconds + (time.time() - time_start)
                total_training_seconds_str = util.days_hours_mins_secs_str(self._total_train_seconds)
                traces_per_second_str = '{:,.1f}'.format(
                    int(batch.size * distributed_world_size / (time.time() - time_last_batch)))
                time_last_batch = time.time()

                if num_traces is not None:
                    if trace >= num_traces:
                        stop = True

                self._history_train_loss.append(loss)
                self._history_train_loss_trace.append(self._total_train_traces)
                if dataset_valid is not None:
                    if trace - last_validation_trace > valid_every:
                        print('\rComputing validation loss... ', end='\r')
                        valid_loss = 0
                        with torch.no_grad():
                            for i_batch, batch in enumerate(dataloader_valid):
                                _, v = self._loss(batch)
                                valid_loss += v
                        valid_loss = float(valid_loss / len(dataset_valid))
                        self._history_valid_loss.append(valid_loss)
                        self._history_valid_loss_trace.append(self._total_train_traces)
                        last_validation_trace = trace - 1
                        if distributed_world_size > 1:
                            self._distributed_update_train_loss(loss, distributed_world_size)
                            self._distributed_update_valid_loss(valid_loss, distributed_world_size)

                if (distributed_world_size > 1) and (iteration % distributed_loss_update_every == 0):
                    self._distributed_update_train_loss(loss, distributed_world_size)

                if (distributed_rank == 0) and (save_file_name_prefix is not None):
                    if time.time() - last_auto_save_time > save_every_sec:
                        last_auto_save_time = time.time()
                        file_name = '{}_{}_traces_{}.network'.format(
                            save_file_name_prefix, util.get_time_stamp(), self._total_train_traces)
                        print('\rSaving to disk... ', end='\r')
                        self._save(file_name)

                print_line = '{} | {} | {} | {} | {} | {} | {} | {}'.format(
                    total_training_seconds_str, epoch_str, total_train_traces_str,
                    loss_initial_str, loss_min_str, loss_str, time_since_loss_min_str,
                    traces_per_second_str)
                max_print_line_len = max(len(print_line), max_print_line_len)
                print(print_line.ljust(max_print_line_len), end='\r')
                sys.stdout.flush()
            if stop:
                break
            iteration += 1
    print()

    if (distributed_rank == 0) and (save_file_name_prefix is not None):
        file_name = '{}_{}_traces_{}.network'.format(save_file_name_prefix,
                                                     util.get_time_stamp(),
                                                     self._total_train_traces)
        print('\rSaving to disk... ', end='\r')
        self._save(file_name)
def average_gradients(parameters):
    world_size = distributed.get_world_size()
    for parameter in parameters:
        if parameter.requires_grad:
            distributed.all_reduce(parameter.grad)
            parameter.grad /= world_size
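# Hedged usage sketch (not part of the snippet above): where average_gradients would sit
# in a data-parallel training step, i.e. between backward() and the optimizer update.
# The model, criterion, optimizer and data are placeholders, and an initialized process
# group is assumed.
import torch.distributed as distributed

def train_step_with_manual_averaging(model, criterion, optimizer, inputs, targets):
    optimizer.zero_grad()
    loss = criterion(model(inputs), targets)
    loss.backward()
    average_gradients(model.parameters())  # all-reduce each grad, then divide by world size
    optimizer.step()
    return loss.item()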
def _multi_worker_islice(
    self,
    iterable: Iterable[Any],
    transform: Optional[Callable[[Any], Instance]] = None,
    ensure_lazy: bool = False,
) -> Iterable[Instance]:
    """
    Helper method that determines which raw instances to skip based on the current
    node rank (for distributed training) and worker ID (for multi-process data loading).

    # Parameters

    iterable : `Iterable[Any]`
        An iterable that yields raw data that can be transformed into `Instance`s
        through the `transform` function.
    transform : `Optional[Callable[[Any], Instance]]`, optional (default = `None`)
        An optional function that will be applied to the raw data generated by
        `iterable` to create `Instance`s. This is used, e.g., when reading cached data.
    ensure_lazy : `bool`, optional (default = `False`)
        If `True`, a `ConfigurationError` error will be raised if `iterable`
        is a list instead of a lazy generator type.

    # Returns

    `Iterable[Instance]`
    """
    if ensure_lazy and isinstance(iterable, (list, tuple)):
        raise ConfigurationError("For a lazy dataset reader, _read() must return a generator")

    wrap_with_tqdm = True
    start_index = 0
    step_size = 1
    if not self.manual_distributed_sharding and util.is_distributed():
        start_index = dist.get_rank()
        step_size = dist.get_world_size()
    worker_info = None if self.manual_multi_process_sharding else get_worker_info()
    if worker_info:
        warnings.warn(
            "Using multi-process data loading without setting "
            "DatasetReader.manual_multi_process_sharding to True.\n"
            "Did you forget to set this?\n"
            "If you're not handling the multi-process sharding logic within your "
            "_read() method, there is probably no benefit to using more than one "
            "worker.",
            UserWarning,
        )
        # Scale `start_index` by `num_workers`, then shift by worker `id`.
        start_index *= worker_info.num_workers
        start_index += worker_info.id
        # Scale `step_size` by `num_workers`.
        step_size *= worker_info.num_workers
        if worker_info.id > 0:
            # We only want to log with tqdm from the main loader process.
            wrap_with_tqdm = False

    islice = itertools.islice(iterable, start_index, self.max_instances, step_size)
    if wrap_with_tqdm:
        islice = Tqdm.tqdm(islice, desc="reading instances")

    if transform is not None:
        return (transform(x) for x in islice)
    return islice
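# Hedged illustration (standalone, not part of the class above): the rank/worker striding
# that _multi_worker_islice computes. With world_size ranks and num_workers loader
# processes per rank, each of the world_size * num_workers consumers reads a disjoint
# stride of the raw stream. The numbers below are made up.
import itertools

def strided_slice(data, rank, world_size, worker_id, num_workers):
    start = rank * num_workers + worker_id   # shift by the global consumer index
    step = world_size * num_workers          # one global stride per consumer
    return list(itertools.islice(data, start, None, step))

print(strided_slice(range(8), rank=0, world_size=2, worker_id=1, num_workers=2))  # [1, 5]
print(strided_slice(range(8), rank=1, world_size=2, worker_id=0, num_workers=2))  # [2, 6]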
def train(args):
    is_distributed = len(args.hosts) > 1 and args.backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))
    use_cuda = args.num_gpus > 0
    logger.debug("Number of gpus available - {}".format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size)
        logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
            args.backend, dist.get_world_size()) + 'Current host rank is {}. Number of gpus: {}'.format(
            dist.get_rank(), args.num_gpus))

    # set the seed for generating random numbers
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed, **kwargs)
    test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs)

    logger.debug("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)))
    logger.debug("Processes {}/{} ({:.0f}%) of test data".format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)))

    model = Net().to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # single-machine multi-gpu case or single-machine or multi-machine cpu case
        model = torch.nn.DataParallel(model)

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                logger.info('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
        test(model, test_loader, device)
    save_model(model, args.model_dir)
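# Hedged sketch of a _get_train_data_loader-style helper like the one referenced above
# (its real body is not shown in this snippet). It assumes an MNIST-style dataset; the
# point is that, in the distributed case, a DistributedSampler replaces shuffle=True so
# each rank draws a disjoint shard of the training set.
import torch
import torch.utils.data
import torch.utils.data.distributed
from torchvision import datasets, transforms

def _get_train_data_loader_sketch(batch_size, training_dir, is_distributed, **kwargs):
    dataset = datasets.MNIST(training_dir, train=True, download=False,
                             transform=transforms.Compose([
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.1307,), (0.3081,))]))
    sampler = torch.utils.data.distributed.DistributedSampler(dataset) if is_distributed else None
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                       shuffle=(sampler is None),
                                       sampler=sampler, **kwargs)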
def reduce_mean(tensor):
    if not (dist.is_available() and dist.is_initialized()):
        return tensor
    tensor = tensor.float().clone()
    dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM)
    return tensor
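# Hedged usage sketch: averaging a per-rank scalar metric before logging it once.
# The loss value is a placeholder; reduce_mean returns its input unchanged when
# torch.distributed has not been initialized, so this also runs on a single process.
import torch
import torch.distributed as dist

local_loss = torch.tensor(0.73)        # per-rank value (placeholder)
mean_loss = reduce_mean(local_loss)    # identical on every rank after the all_reduce
if not (dist.is_available() and dist.is_initialized()) or dist.get_rank() == 0:
    print(f"mean loss across ranks: {mean_loss.item():.4f}")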
def _average_gradients(model):
    # Gradient averaging: sum every gradient across ranks, then divide by the world size.
    size = float(dist.get_world_size())
    for param in model.parameters():
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= size
    validate(epoch, model, validation_data, optimizer, normStd, num_batches_val, valF)

    # Save the most recent version of the model
    save_model({'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()},
               output_dir, rank, epoch)

    # Close the training and testing log files
    trainF.close()
    testF.close()


if __name__ == "__main__":
    # Initiate the MPI process group
    dist.init_process_group(backend='mpi')
    # Gather the size and rank of the MPI call
    size = dist.get_world_size()
    rank = dist.get_rank()
    # Initialize the printing for each node
    init_print(rank, size)
    # Initiate the main function
    main(rank, size)
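# Hedged sketch of an init_print-style helper like the one referenced above (its real
# body is not shown here). A common pattern is to prefix every print with the rank, or
# to silence all ranks except rank 0.
import builtins

def init_print_sketch(rank, size, prefix_ranks=True):
    builtin_print = builtins.print

    def rank_print(*args, **kwargs):
        if prefix_ranks:
            builtin_print(f"[rank {rank}/{size}]", *args, **kwargs)
        elif rank == 0:
            builtin_print(*args, **kwargs)

    builtins.print = rank_print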
def __init__(self,
             init_optimizer,
             static_loss_scale=1.0,
             dynamic_loss_scale=False,
             dynamic_loss_args=None,
             verbose=True,
             dp_process_group=None,
             partition_size=None,
             mpu=None,
             all_gather_partitions=True,
             allgather_size=500000000,
             clip_grad=0.0):

    if dp_process_group is not None and partition_size is not None:
        raise ValueError("Cannot specify both dp_process_group and partition size")

    if dp_process_group is None:
        dp_process_group = _initialize_parameter_parallel_groups(partition_size)

    if not torch.cuda.is_available():
        raise SystemError("Cannot use fp16 without CUDA.")

    self.optimizer = init_optimizer
    self.verbose = verbose
    self.dp_process_group = dp_process_group

    # TODO: automatically turn off if #params > some_limit
    self.all_gather_partitions = all_gather_partitions
    self.allgather_size = allgather_size

    # Parameters flattened by group
    self.fp16_groups = []
    self.fp16_groups_flat = []

    # Parameters partitioned by data-parallel degree.
    # This will contain a list of equal-sized tensors,
    # each of which will be updated by a different process.
    self.parallel_partitioned_fp16_groups = []

    # A single 32-bit partition of the parallel-partitioned parameters
    # that this process will update.
    self.single_partition_of_fp32_groups = []

    # Partition info:
    # the parameters in each group that will NOT be updated by this process directly
    self.params_not_in_partition = []
    # the parameters that WILL be updated by this process directly
    self.params_in_partition = []
    # Offset from the first parameter in self.params_in_partition.
    # The parameter boundaries may not align with partition boundaries,
    # so we need to keep track of the offset.
    self.first_offset = []
    # number of elements per partition in each group
    self.partition_size = []

    partition_id = dist.get_rank(group=self.dp_process_group)

    # Loop over the parameter groups
    for i, param_group in enumerate(self.optimizer.param_groups):
        # Push this group to the list before modifying it
        self.fp16_groups.append(param_group['params'])
        self.fp16_groups_flat.append(
            flatten_dense_tensors_aligned(
                self.fp16_groups[i],
                dist.get_world_size(group=self.dp_process_group),
                self.dp_process_group))

        # Set the model fp16 weights to slices of the flattened buffer
        updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data = q.data

        # Divide the flat weights into near-equal partitions, one per data-parallel rank;
        # each process will compute on a different part of the partition.
        data_parallel_partitions = self.get_data_parallel_partitions(self.fp16_groups_flat[i])
        self.parallel_partitioned_fp16_groups.append(data_parallel_partitions)

        # A partition of the fp32 master weights that will be updated by this process
        self.single_partition_of_fp32_groups.append(
            self.parallel_partitioned_fp16_groups[i][partition_id].clone().float().detach())

        # Modify the optimizer to use the flat master weight partition.
        # requires_grad is kept in case the internal optimizer uses it.
        self.single_partition_of_fp32_groups[i].requires_grad = True
        param_group['params'] = [self.single_partition_of_fp32_groups[i]]

        partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size(
            group=self.dp_process_group)
        params_in_partition, params_not_in_partition, first_offset = self.get_partition_info(
            self.fp16_groups[i], partition_size, partition_id)

        self.partition_size.append(partition_size)
        self.params_in_partition.append(params_in_partition)
        self.params_not_in_partition.append(params_not_in_partition)
        self.first_offset.append(first_offset)

    # We may have a way of fusing dynamic scale; do not support it for now.
    if dynamic_loss_scale:
        self.dynamic_loss_scale = True
        if dynamic_loss_args is None:
            self.loss_scaler = DynamicLossScaler()
        else:
            self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
    else:
        self.dynamic_loss_scale = False
        self.loss_scaler = LossScaler(scale=static_loss_scale)

    self.cur_iter = 0
    self.mpu = mpu
    self.clip_grad = clip_grad
    self.overflow = False
    self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu)
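# Hedged illustration (standalone): how a flattened parameter group is split across the
# data-parallel ranks, and why first_offset is needed when a parameter straddles a
# partition boundary. The sizes are made up.
import torch
from torch._utils import _flatten_dense_tensors

params = [torch.zeros(3), torch.zeros(5)]     # 8 elements total
flat = _flatten_dense_tensors(params)
world_size = 2
partition_size = flat.numel() // world_size   # 4 elements per rank
partitions = [flat.narrow(0, r * partition_size, partition_size) for r in range(world_size)]
# Rank 1's partition starts at flat offset 4, i.e. one element into the second
# parameter (which occupies offsets 3..7), so its first_offset would be 1.
print(partition_size, [p.numel() for p in partitions])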
def reduce_tensor(tensor):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= dist.get_world_size()
    return rt
debug = False
seed = 12
device = "cuda"
fp16_opt_level = "O2"
num_classes = 21
batch_size = 9  # ~9GB GPU RAM
val_batch_size = 24
non_blocking = True
num_workers = 12 // dist.get_world_size()
val_interval = 1
accumulation_steps = 4
val_img_size = 513
train_img_size = 480

# ##############################
# Setup Dataflow
# ##############################

mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()
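# Hedged sketch of the is_dist_avail_and_initialized helper used above (its body is not
# shown in this snippet); the usual form simply guards against calling dist.* collectives
# when running as a single, non-distributed process.
import torch.distributed as dist

def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True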
def step(self, closure=None):
    """
    Not supporting closure.
    """
    # First compute the norm for all groups so we know if there is an overflow
    self.overflow = self.overflow_checker.check()

    prev_scale = self.loss_scale
    self._update_scale(self.overflow)
    if self.overflow:
        self.zero_grad()
        if self.verbose:
            print("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
                  "scale: {}, reducing to {}".format(prev_scale, self.loss_scale))
        return self.overflow

    norm_groups = []
    single_partition_grad_groups = []

    partition_id = dist.get_rank(group=self.dp_process_group)
    for i, group in enumerate(self.fp16_groups):
        norm_groups.append(get_grad_norm(group, mpu=self.mpu))

        # Free the gradients of all parameters that are not updated by this process
        self.free_grad_in_param_list(self.params_not_in_partition[i])

        # Create a flat gradient for the parameters updated by this process
        single_grad_partition = self.get_flat_partition(
            self.params_in_partition[i],
            self.first_offset[i],
            self.partition_size[i],
            dtype=self.single_partition_of_fp32_groups[i].dtype)
        self.single_partition_of_fp32_groups[i].grad = single_grad_partition

        # Release all the gradients since we have already created the necessary copy
        # in dp_grad_partition
        self.free_grad_in_param_list(self.params_in_partition[i])

        single_partition_grad_groups.append(single_grad_partition)

    self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups)

    self.optimizer.step()

    # Get rid of the fp32 gradients. Not needed anymore.
    for group in self.single_partition_of_fp32_groups:
        group.grad = None

    for i in range(len(norm_groups)):
        for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups,
                                                   self.single_partition_of_fp32_groups):
            fp16_partitions[partition_id].data.copy_(fp32_partition.data)

    dp_world_size = dist.get_world_size(group=self.dp_process_group)

    # Gather the updated weights from everyone
    for _, partitioned_params in enumerate(self.parallel_partitioned_fp16_groups):
        if self.all_gather_partitions:
            # Controllable memory-time tradeoff
            num_shards = max(1,
                             partitioned_params[partition_id].numel() * dp_world_size //
                             self.allgather_size)
            shard_size = partitioned_params[partition_id].numel() // num_shards
            num_elements = shard_size
            for shard_id in range(num_shards + 1):
                if shard_id == num_shards:
                    if shard_size * num_shards >= partitioned_params[partition_id].numel():
                        break
                    else:
                        num_elements = partitioned_params[partition_id].numel() - shard_id * shard_size
                shard_list = []
                for dp_id in range(dp_world_size):
                    curr_shard = partitioned_params[dp_id].narrow(0, shard_id * shard_size, num_elements)
                    shard_list.append(curr_shard)
                dist.all_gather(shard_list, shard_list[partition_id], group=self.dp_process_group)
        else:
            # This should require less memory but should be faster
            for src, partitioned_param in enumerate(partitioned_params):
                global_src = _get_global_rank(self.dp_process_group, src)
                dist.broadcast(partitioned_param, global_src, group=self.dp_process_group)

    # TODO: we probably don't need this? Just to be safe.
    for i in range(len(norm_groups)):
        updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data = q.data

    return self.overflow
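# Hedged illustration (standalone): the sharding arithmetic behind the all_gather
# memory/time tradeoff in step() above, with made-up sizes.
def allgather_shards(partition_numel, dp_world_size, allgather_size):
    num_shards = max(1, partition_numel * dp_world_size // allgather_size)
    shard_size = partition_numel // num_shards
    # A final, smaller shard exists only if shard_size * num_shards < partition_numel.
    tail = partition_numel - shard_size * num_shards
    return num_shards, shard_size, tail

print(allgather_shards(partition_numel=1_000_000, dp_world_size=8, allgather_size=500_000_000))
# -> (1, 1000000, 0): the whole partition fits into a single all_gather call
print(allgather_shards(partition_numel=400_000_000, dp_world_size=8, allgather_size=500_000_000))
# -> (6, 66666666, 4): six full shards plus a 4-element tail shard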
def __init__(self,
             layers,
             num_stages=None,
             topology=None,
             loss_fn=None,
             seed_layers=False,
             seed_fn=None,
             base_seed=1234,
             partition_method='parameters',
             activation_checkpoint_interval=0,
             activation_checkpoint_func=checkpointing.checkpoint,
             checkpointable_layers=None):
    """Modules to be parallelized with pipeline parallelism.

    The key constraint that enables pipeline parallelism is the representation of the
    forward pass as a sequence of layers and the enforcement of a simple interface
    between them. The forward pass is implicitly defined by the module ``layers``. The
    key assumption is that the output of each layer can be directly fed as input to the
    next, like a ``torch.nn.Sequential``. The forward pass is implicitly:

    .. code-block:: python

        def forward(self, inputs):
            x = inputs
            for layer in self.layers:
                x = layer(x)
            return x

    .. note::
        Pipeline parallelism is not compatible with ZeRO-2 and ZeRO-3.

    Args:
        layers (Iterable): A sequence of layers defining pipeline structure. Can be a ``torch.nn.Sequential`` module.
        num_stages (int, optional): The degree of pipeline parallelism. If not specified, ``topology`` must be provided.
        topology (``deepspeed.pipe.ProcessTopology``, optional): Defines the axes of parallelism for training. Must be provided if ``num_stages`` is ``None``.
        loss_fn (callable, optional): Loss is computed ``loss = loss_fn(outputs, label)``
        base_seed (int, optional): [description]. Defaults to 1234.
        partition_method (str, optional): [description]. Defaults to 'parameters'.
        activation_checkpoint_interval (int, optional): The granularity of activation checkpointing in terms of number of layers. 0 disables activation checkpointing.
        activation_checkpoint_func (callable, optional): The function to use for activation checkpointing. Defaults to ``deepspeed.checkpointing.checkpoint``.
    """

    super().__init__()

    if num_stages is None and topology is None:
        raise RuntimeError('must provide num_stages or topology')

    self.micro_offset = 0

    self.loss_fn = loss_fn

    self.checkpointable_layers = checkpointable_layers
    if checkpointable_layers is not None:
        assert isinstance(checkpointable_layers, list), \
            "param `checkpointable_layers` must be type of list."

    self.seed_layers = seed_layers
    self.seed_fn = seed_fn
    self.base_seed = base_seed
    if dist.get_rank() == 0:
        try:
            seed_str = self.seed_fn.__name__
        except AttributeError:
            seed_str = None
        print(f'SEED_LAYERS={self.seed_layers} BASE_SEED={self.base_seed} SEED_FN={seed_str}')

    # Setup world info
    self.world_group = dist.new_group(ranks=range(dist.get_world_size()))
    self.global_rank = dist.get_rank(group=self.world_group)
    self.world_size = dist.get_world_size(group=self.world_group)
    self.local_rank = int(os.environ.get("LOCAL_RANK", None))
    assert self.local_rank is not None

    if topology:
        self._topo = topology
        self.num_stages = self._topo.get_dim('pipe')
    else:
        self.num_stages = num_stages
        if topology is None:
            if self.world_size % self.num_stages != 0:
                raise RuntimeError(
                    f'num_stages ({self.num_stages}) must divide distributed world size ({self.world_size})')
            dp = self.world_size // num_stages
            topology = PipeDataParallelTopology(num_pp=num_stages, num_dp=dp)
            self._topo = topology

    # Construct communicators for the pipeline topology
    self._grid = PipelineParallelGrid(process_group=self.world_group, topology=self._topo)

    self.stage_id = self._topo.get_coord(self.global_rank).pipe

    # Initialize partition information
    self._layer_specs = list(layers)
    self._num_layers = len(self._layer_specs)
    self._local_start = 0
    self._local_stop = None
    self._partition_layers(method=partition_method)

    self.forward_funcs = []
    self.tied_modules = nn.ModuleDict()
    self.tied_weight_attrs = {}

    # Offset the random seed by the stage ID.
    # newseed = torch.cuda.initial_seed() + self._grid.get_stage_id()
    # ds_utils.set_random_seed(newseed)

    # with torch.random.fork_rng(devices=[torch.cuda.current_device()]):
    self._build()
    self.to(f'cuda:{self.local_rank}')

    self.tied_comms = self._index_tied_modules()
    self._synchronize_tied_weights()

    self.activation_checkpoint_interval = activation_checkpoint_interval
    self.activation_checkpoint_func = activation_checkpoint_func
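# Hedged usage sketch: building a small pipeline-parallel model with the constructor
# above. The layer sizes and num_stages are placeholders; an initialized distributed
# environment (and LOCAL_RANK) is assumed before the constructor runs.
import torch.nn as nn

def build_pipeline_model():
    layers = nn.Sequential(
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10),
    )
    # Splits the layers across two pipeline stages and moves each partition to its
    # local GPU, as done in __init__ above.
    return PipelineModule(layers=layers, num_stages=2, loss_fn=nn.CrossEntropyLoss())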
def reduce_loss(self, loss):
    # dist.reduce only produces the summed result on master_rank; the value returned
    # on other ranks is not the global average.
    dist.reduce(loss, dst=self.master_rank, op=dist.ReduceOp.SUM)
    return loss.item() / dist.get_world_size()