def step_maybe_fp16_maybe_distributed(optim):
    if args.use_fp16:
        if args.distributed:
            for flat_master, allreduce_buffer in zip(flat_master_buckets,
                                                     ssd300.allreduce_buffers):
                if allreduce_buffer is None:
                    raise RuntimeError("allreduce_buffer is None")
                flat_master.grad = allreduce_buffer.float()
                flat_master.grad.data.mul_(1. / static_loss_scale)
        else:
            for flat_master, model_bucket in zip(flat_master_buckets,
                                                 model_buckets):
                flat_grad = apex_C.flatten(
                    [m.grad.data for m in model_bucket])
                flat_master.grad = flat_grad.float()
                flat_master.grad.data.mul_(1. / static_loss_scale)
    optim.step()
    if args.use_fp16:
        # Use multi-tensor scale instead of loop & individual parameter copies
        for model_bucket, flat_master in zip(model_buckets,
                                             flat_master_buckets):
            multi_tensor_applier(
                amp_C.multi_tensor_scale, dummy_overflow_buf,
                [apex_C.unflatten(flat_master.data, model_bucket),
                 model_bucket], 1.0)

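# The free variables used above (model_buckets, flat_master_buckets,
# dummy_overflow_buf, static_loss_scale) are created elsewhere. Below is a hedged
# sketch of one plausible setup for the fp32 master buckets and the overflow flag,
# using torch._utils in place of apex_C for portability; the construction details
# are an assumption, not the original repo's code.
import torch
from torch._utils import _flatten_dense_tensors as flatten_tensors


def build_master_buckets(model_buckets, device="cuda"):
    flat_master_buckets = []
    for bucket in model_buckets:
        # One flat fp32 "master" copy per bucket of fp16 model parameters.
        flat_master = flatten_tensors(
            [p.detach().clone().float() for p in bucket])
        flat_master.requires_grad_(True)
        flat_master_buckets.append(flat_master)
    # Int flag tensor passed to multi_tensor_applier as its no-op/overflow buffer.
    dummy_overflow_buf = torch.zeros(1, dtype=torch.int, device=device)
    return flat_master_buckets, dummy_overflow_buf
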
def synchronize(self):
    synced = False
    if self.count_down == 0:
        missing_p = self._requires_update - set(self._handles.keys())
        for p in missing_p:
            self._allreduce_tensor(p)
        if self._multi_node:
            for p, value in self._handles.items():
                handle, ctx = value
                output = synchronize(handle)
                p.grad.set_(
                    self._compression.decompress(output, ctx) /
                    self.accumulation_step)
        else:
            buckets = OrderedDict()
            for tensor in self._handles.values():
                tp = tensor.type()
                if tp not in buckets:
                    buckets[tp] = []
                buckets[tp].append(tensor)
            for tp in buckets:
                bucket = buckets[tp]
                coalesced = flatten(
                    bucket) / self.world_size / self.accumulation_step
                torch.distributed.all_reduce_multigpu([coalesced])
                for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
                    buf.copy_(synced)
        self._handles.clear()
        synced = True
        self.count_down = self.accumulation_step
    self.count_down -= 1
    return synced

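# Hedged usage sketch for the accumulation-aware synchronize() above; the loop and
# the names (model, wrapped_opt, loader, loss_fn) are placeholders, not from the
# source. The wrapper already divides the coalesced gradients by accumulation_step,
# so the loss is not pre-divided here, and the underlying optimizer only steps on
# the iteration where synchronize() actually all-reduces and returns True.
def train_epoch(model, wrapped_opt, loader, loss_fn):
    for inputs, targets in loader:
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        if wrapped_opt.synchronize():  # True once per accumulation window
            wrapped_opt.step()         # assumes the wrapper forwards step()
            wrapped_opt.zero_grad()    # and zero_grad() to the inner optimizer
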
def take_optimizer_step(args, optimizer, model, overflow_buf, global_step):
    global skipped_steps
    if args.allreduce_post_accumulation:
        # manually allreduce gradients after all accumulation steps
        # check for Inf/NaN
        # 1. allocate an uninitialized buffer for flattened gradient
        loss_scale = _amp_state.loss_scalers[0].loss_scale() if args.fp16 else 1
        master_grads = [p.grad for p in amp.master_params(optimizer)
                        if p.grad is not None]
        flat_grad_size = sum(p.numel() for p in master_grads)
        allreduce_dtype = torch.float16 if args.allreduce_post_accumulation_fp16 \
            else torch.float32
        flat_raw = torch.empty(flat_grad_size, device='cuda', dtype=allreduce_dtype)
        # 2. combine unflattening and predivision of unscaled 'raw' gradient
        allreduced_views = apex_C.unflatten(flat_raw, master_grads)
        overflow_buf.zero_()
        amp_C.multi_tensor_scale(
            65536, overflow_buf, [master_grads, allreduced_views],
            loss_scale / (get_world_size() * args.gradient_accumulation_steps))
        # 3. sum gradient across ranks. Because of the predivision, this averages the gradient
        torch.distributed.all_reduce(flat_raw)
        # 4. combine unscaling and unflattening of allreduced gradient
        overflow_buf.zero_()
        amp_C.multi_tensor_scale(65536, overflow_buf,
                                 [allreduced_views, master_grads],
                                 1. / loss_scale)
        # 5. update loss scale
        if args.fp16:
            scaler = _amp_state.loss_scalers[0]
            old_overflow_buf = scaler._overflow_buf
            scaler._overflow_buf = overflow_buf
            had_overflow = scaler.update_scale()
            # restore the scaler's own buffer (the original assigned to a
            # misspelled '_overfloat_buf' attribute and never restored it)
            scaler._overflow_buf = old_overflow_buf
        else:
            had_overflow = 0
        # 6. call optimizer step function
        if had_overflow == 0:
            optimizer.step()
            global_step += 1
        else:
            # Overflow detected, print message and clear gradients
            skipped_steps += 1
            if is_main_process():
                scaler = _amp_state.loss_scalers[0]
                dllogger.log(step="PARAMETER",
                             data={"loss_scale": scaler.loss_scale()})
            if _amp_state.opt_properties.master_weights:
                for param in optimizer._amp_stash.all_fp32_from_fp16_params:
                    param.grad = None
        for param in model.parameters():
            param.grad = None
    else:
        if args.apply_optimizer > 0:
            optimizer.step()
        # optimizer.zero_grad()
        for param in model.parameters():
            param.grad = None
        global_step += 1
    return global_step

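# A hedged, single-process illustration of the scaling algebra in steps 2-4 above
# (the all_reduce sum over ranks is simulated; all values are made up). Rescaling by
# loss_scale keeps a possible fp16 allreduce buffer away from underflow, predividing
# by world_size * accumulation_steps turns the later sum into an average, and the
# final 1/loss_scale removes the scale again.
import torch

world_size, accum_steps, loss_scale = 4, 2, 2.0 ** 14

# Unscaled gradients accumulated over accum_steps micro-batches on each rank.
per_rank_grads = [torch.full((3,), 0.01 * (r + 1)) * accum_steps
                  for r in range(world_size)]

# Step 2 equivalent: scale and predivide while casting into the allreduce dtype.
buffers = [(g * (loss_scale / (world_size * accum_steps))).half()
           for g in per_rank_grads]

# Step 3 equivalent: all_reduce(SUM) across ranks.
summed = torch.stack([b.float() for b in buffers]).sum(dim=0)

# Step 4 equivalent: remove the loss scale, leaving the averaged gradient.
averaged = summed / loss_scale

reference = torch.stack(per_rank_grads).mean(dim=0) / accum_steps
print(torch.allclose(averaged, reference, atol=1e-3))  # True up to fp16 rounding
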
def all_reduce(self, overflow_buf, accum=1):
    scaler = amp.scaler.LossScaler(1.0)
    # 1. allocate an uninitialized buffer for flattened gradient
    master_grads = [
        p.grad for p in amp.master_params(self.optimizer)
        if p.grad is not None
    ]
    flat_grad_size = sum(p.numel() for p in master_grads)
    allreduce_dtype = torch.float32
    flat_raw = torch.empty(flat_grad_size, device='cuda',
                           dtype=allreduce_dtype)
    # 2. combine unflattening and predivision of unscaled 'raw' gradient
    allreduced_views = apex_C.unflatten(flat_raw, master_grads)
    overflow_buf.zero_()
    amp_C.multi_tensor_scale(
        65536, overflow_buf, [master_grads, allreduced_views],
        scaler.loss_scale() / (self.team_size * accum))
    # 3. sum gradient across ranks. Because of the predivision,
    # this averages the gradient
    torch.distributed.all_reduce(flat_raw, group=self.local_group)
    # 4. combine unscaling and unflattening of allreduced gradient
    overflow_buf.zero_()
    amp_C.multi_tensor_scale(65536, overflow_buf,
                             [allreduced_views, master_grads],
                             1. / scaler.loss_scale())

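# Hedged sketch of how the team_size / local_group used above might be set up: ranks
# are partitioned into teams of team_size, each with its own process group, so the
# all_reduce only averages within a team. The partitioning scheme is an assumption
# for illustration; new_group/get_rank/get_world_size are standard torch.distributed
# calls.
import torch.distributed as dist


def build_local_group(team_size):
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    local_group = None
    for start in range(0, world_size, team_size):
        ranks = list(range(start, min(start + team_size, world_size)))
        # new_group must be called by every rank for every group, even the
        # groups this rank does not belong to.
        group = dist.new_group(ranks=ranks)
        if rank in ranks:
            local_group = group
    return local_group
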
def allreduce_maybe_retain(self, bucket, bucket_idx=-1):
    allreduced = self.allreduce_bucket(bucket)
    if self.retain_allreduce_buffers:
        if self.allreduce_buffers[bucket_idx] is not None:
            raise RuntimeError(
                "The backward pass is attempting to replace an already-filled "
                "allreduce buffer. This is almost certainly an error.")
        self.allreduce_buffers[bucket_idx] = allreduced
    else:
        for buf, synced in zip(bucket, unflatten(allreduced, bucket)):
            buf.copy_(synced)

def apply_flat_dist_call(bucket, call, extra_args=None):
    coalesced = flatten(bucket)
    # print("Rank", dist.get_rank(), "Broadcasting ", coalesced.device, " Size", coalesced.size())
    if extra_args is not None:
        call(coalesced, *extra_args)
    else:
        call(coalesced)
    if call is dist.all_reduce:
        coalesced /= dist.get_world_size()
    for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
        buf.copy_(synced)

def apply_flat_dist_call(bucket, call, extra_args=None):
    coalesced = flatten(bucket)
    if extra_args is not None:
        call(coalesced, *extra_args)
    else:
        call(coalesced)
    if call is dist.all_reduce:
        coalesced /= dist.get_world_size()
    for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
        buf.copy_(synced)

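# Hedged usage sketch for apply_flat_dist_call above. flatten/unflatten are assumed
# to be torch._utils._flatten_dense_tensors / _unflatten_dense_tensors (as in apex),
# and the tensors in a bucket must share a dtype. The helper names below are
# placeholders for illustration.
import torch.distributed as dist


def broadcast_params(param_list, src_rank=0):
    # One coalesced broadcast from src_rank instead of one broadcast per tensor.
    apply_flat_dist_call(param_list, dist.broadcast, extra_args=(src_rank,))


def average_grads(param_list):
    grads = [p.grad.data for p in param_list if p.grad is not None]
    # all_reduce is special-cased inside apply_flat_dist_call: the coalesced sum is
    # divided by world_size before being copied back, yielding the average.
    apply_flat_dist_call(grads, dist.all_reduce)
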
def _step_distributed_fp16(self) -> None:
    # manually allreduce gradients after all accumulation steps
    # check for Inf/NaN
    # 1. allocate an uninitialized buffer for flattened gradient
    scaler = _amp_state.loss_scalers[0]
    master_grads = [
        p.grad for p in amp.master_params(self.optimizer)
        if p.grad is not None
    ]
    flat_grad_size = sum(p.numel() for p in master_grads)
    # allreduce_dtype = torch.float16 if args.allreduce_post_accumulation_fp16 else \
    #     torch.float32
    allreduce_dtype = torch.float16
    flat_raw = torch.empty(flat_grad_size, device='cuda',
                           dtype=allreduce_dtype)
    # 2. combine unflattening and predivision of unscaled 'raw' gradient
    allreduced_views = apex_C.unflatten(flat_raw, master_grads)
    self._overflow_buf.zero_()
    amp_C.multi_tensor_scale(
        65536, self._overflow_buf, [master_grads, allreduced_views],
        scaler.loss_scale() /
        (torch.distributed.get_world_size() *
         self.gradient_accumulation_steps))
    # 3. sum gradient across ranks. Because of the predivision, this averages the gradient
    torch.distributed.all_reduce(flat_raw)
    # 4. combine unscaling and unflattening of allreduced gradient
    self._overflow_buf.zero_()
    amp_C.multi_tensor_scale(65536, self._overflow_buf,
                             [allreduced_views, master_grads],
                             1. / scaler.loss_scale())
    # 5. update loss scale
    scaler = _amp_state.loss_scalers[0]
    old_overflow_buf = scaler._overflow_buf
    scaler._overflow_buf = self._overflow_buf
    had_overflow = scaler.update_scale()
    # restore the scaler's own buffer (the original assigned to a misspelled
    # '_overfloat_buf' attribute and never restored it)
    scaler._overflow_buf = old_overflow_buf
    # 6. call optimizer step function
    if had_overflow == 0:
        self._step()
    else:
        # Overflow detected, print message and clear gradients
        logger.info(
            f"Gradient overflow. Skipping step, reducing loss scale to "
            f"{scaler.loss_scale()}")
        if _amp_state.opt_properties.master_weights:
            for param in self.optimizer._amp_stash.all_fp32_from_fp16_params:
                param.grad = None
    for param in self.model.parameters():
        param.grad = None

def take_optimizer_step(args, optimizer, grad_scaler, model, overflow_buf,
                        global_step):
    global skipped_steps
    if args.allreduce_post_accumulation:
        # manually allreduce gradients after all accumulation steps
        # check for Inf/NaN
        # 1. allocate an uninitialized buffer for flattened gradient
        loss_scale = grad_scaler._get_scale_async() if args.fp16 else 1.
        master_grads = [
            p.grad for p in model.parameters() if p.grad is not None
        ]
        flat_grad_size = sum(p.numel() for p in master_grads)
        allreduce_dtype = torch.float16 if args.allreduce_post_accumulation_fp16 \
            else torch.float32
        flat_raw = torch.empty(flat_grad_size, device='cuda',
                               dtype=allreduce_dtype)
        # 2. combine unflattening and predivision of unscaled 'raw' gradient
        allreduced_views = apex_C.unflatten(flat_raw, master_grads)
        overflow_buf.zero_()
        amp_C.multi_tensor_scale(
            65536, overflow_buf, [master_grads, allreduced_views],
            loss_scale / (get_world_size() * args.gradient_accumulation_steps))
        # 3. sum gradient across ranks. Because of the predivision, this averages the gradient
        torch.distributed.all_reduce(flat_raw)
        # 4. combine unscaling and unflattening of allreduced gradient
        overflow_buf.zero_()
        amp_C.multi_tensor_scale(65536, overflow_buf,
                                 [allreduced_views, master_grads], 1.)
        # 5. update loss scale
        if args.fp16:
            had_overflow = overflow_buf.item()
        else:
            had_overflow = 0
        # 6. call optimizer step function
        if had_overflow == 0:
            global_step += 1
        else:
            # Overflow detected, print message and clear gradients
            skipped_steps += 1
    else:
        global_step += 1
    grad_scaler.step(optimizer)
    grad_scaler.update()
    optimizer.zero_grad(set_to_none=True)
    return global_step

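# Hedged sketch of the native-AMP training loop this variant assumes: losses are
# scaled with torch.cuda.amp.GradScaler before backward, and take_optimizer_step is
# called once per accumulation window. The loop structure and names (loader,
# criterion) are placeholders; note that the allreduce_post_accumulation path above
# already divides by gradient_accumulation_steps, so the loss is not pre-divided.
import torch


def training_loop(args, model, optimizer, loader, criterion, overflow_buf):
    grad_scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)
    global_step = 0
    for step, (inputs, targets) in enumerate(loader, start=1):
        with torch.cuda.amp.autocast(enabled=args.fp16):
            loss = criterion(model(inputs), targets)
        grad_scaler.scale(loss).backward()  # accumulate scaled grads in p.grad
        if step % args.gradient_accumulation_steps == 0:
            global_step = take_optimizer_step(args, optimizer, grad_scaler,
                                              model, overflow_buf, global_step)
    return global_step
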
def step_maybe_fp16_maybe_distributed(optim):
    if args.use_fp16:
        if args.distributed:
            for flat_master, allreduce_buffer in zip(flat_master_buckets,
                                                     ssd300.allreduce_buffers):
                if allreduce_buffer is None:
                    raise RuntimeError("allreduce_buffer is None")
                flat_master.grad = allreduce_buffer.float()
                flat_master.grad.data.mul_(1. / static_loss_scale)
        else:
            for flat_master, model_bucket in zip(flat_master_buckets,
                                                 model_buckets):
                flat_grad = apex_C.flatten(
                    [m.grad.data for m in model_bucket])
                flat_master.grad = flat_grad.float()
                flat_master.grad.data.mul_(1. / static_loss_scale)
    optim.step()
    if args.use_fp16:
        for model_bucket, flat_master in zip(model_buckets,
                                             flat_master_buckets):
            for model, master in zip(
                    model_bucket,
                    apex_C.unflatten(flat_master.data, model_bucket)):
                model.data.copy_(master.data)

def allreduce_and_copy(self, small_bucket):
    allreduced = self.allreduce_bucket(small_bucket)
    for buf, synced in zip(small_bucket,
                           unflatten(allreduced, small_bucket)):
        buf.copy_(synced)

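# allreduce_bucket is called above (and in allreduce_maybe_retain) but not shown.
# A minimal free-function sketch of what it plausibly does, assuming flatten comes
# from torch._utils and gradients are averaged across all ranks; this is an
# illustration, not the library's implementation.
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors as flatten


def allreduce_bucket(bucket):
    # Coalesce the bucket into one flat tensor, sum it across ranks, and divide by
    # world size so the callers copy back an averaged gradient.
    coalesced = flatten(bucket)
    dist.all_reduce(coalesced)
    coalesced /= dist.get_world_size()
    return coalesced
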