def backward_step(optimizer, model, loss, args, timers):
    """Backward step."""

    # Backward pass.
    if args.deepspeed:
        model.backward(loss)
    else:
        optimizer.zero_grad()
        if args.fp16:
            optimizer.backward(loss, update_master_grads=False)
        else:
            loss.backward()

    # Update master gradients.
    if not args.deepspeed:
        if args.fp16:
            optimizer.update_master_grads()

        # Clipping gradients helps prevent the exploding gradient.
        if args.clip_grad > 0:
            if not args.fp16:
                mpu.clip_grad_norm(model.parameters(), args.clip_grad)
            else:
                optimizer.clip_master_grads(args.clip_grad)

    return loss
def backward_step(optimizer, model, lm_loss, args, timers):
    """Backward step."""

    # Total loss.
    loss = lm_loss

    # Backward pass.
    if args.deepspeed:
        model.backward(loss)
    else:
        optimizer.zero_grad()
        if args.fp16:
            optimizer.backward(loss, update_master_grads=False)
        else:
            loss.backward()

    # Reduce across processes.
    lm_loss_reduced = lm_loss

    reduced_losses = lm_loss.view(1)

    if args.deepspeed:
        # DeepSpeed's backward pass already handles the all-reduce
        # communication. Reset the timer to avoid breaking the timer logs below.
        timers('allreduce').reset()
    else:
        torch.distributed.all_reduce(reduced_losses.data)
        reduced_losses.data = reduced_losses.data / args.world_size
        if not USE_TORCH_DDP:
            timers('allreduce').start()
            model.allreduce_params(reduce_after=False,
                                   fp32_allreduce=args.fp32_allreduce)
            timers('allreduce').stop()

    lm_loss_reduced = reduced_losses

    # Update master gradients.
    if not args.deepspeed:
        if args.fp16:
            optimizer.update_master_grads()

        # Clipping gradients helps prevent the exploding gradient.
        if args.clip_grad > 0:
            if not args.fp16:
                mpu.clip_grad_norm(model.parameters(), args.clip_grad)
            else:
                optimizer.clip_master_grads(args.clip_grad)

    return lm_loss_reduced
def backward_step(optimizer, model, lm_loss, args, timers, zero_grad=True):
    """Backward step."""

    # Total loss.
    loss = lm_loss  # + nsp_loss

    # Backward pass.
    if zero_grad:
        optimizer.zero_grad()
    if args.fp16:
        optimizer.backward(loss, update_master_grads=False)
    else:
        loss.backward()

    # Reduce across processes.
    lm_loss_reduced = lm_loss
    # nsp_loss_reduced = nsp_loss

    reduced_losses = lm_loss.view(1)  # torch.cat((lm_loss.view(1), nsp_loss.view(1)))
    torch.distributed.all_reduce(reduced_losses.data)
    reduced_losses.data = reduced_losses.data / args.world_size
    if args.DDP_impl == 'local':
        timers('allreduce').start()
        model.allreduce_params(reduce_after=False,
                               fp32_allreduce=args.fp32_allreduce)
        timers('allreduce').stop()
    lm_loss_reduced = reduced_losses[0]
    # nsp_loss_reduced = reduced_losses[1]

    # Update master gradients.
    if args.fp16:
        optimizer.update_master_grads()

    # Clipping gradients helps prevent the exploding gradient.
    if args.clip_grad > 0:
        if not args.fp16:
            mpu.clip_grad_norm(model.parameters(), args.clip_grad)
        else:
            optimizer.clip_master_grads(args.clip_grad)

    return lm_loss_reduced  # , nsp_loss_reduced
def backward_step(optimizer, model, lm_loss, args, timers):
    """Backward step."""

    # Total loss.
    loss = lm_loss

    # Backward pass.
    if args.deepspeed:
        model.backward(loss)
    else:
        # optimizer.zero_grad()
        if args.fp16:
            optimizer.backward(loss, update_master_grads=False)
        else:
            loss.backward()

    if args.deepspeed or args.DDP_impl == 'torch':
        # DeepSpeed's backward pass (and torch DDP) already handle the
        # all-reduce communication. Reset the timer to avoid breaking
        # the timer logs below.
        timers('allreduce').reset()
    else:
        timers('allreduce').start()
        model.allreduce_params(reduce_after=False,
                               fp32_allreduce=args.fp32_allreduce)
        timers('allreduce').stop()

    # Update master gradients.
    if not args.deepspeed:
        if args.fp16:
            optimizer.update_master_grads()

        # Clipping gradients helps prevent the exploding gradient.
        if args.clip_grad > 0:
            if not args.fp16:
                mpu.clip_grad_norm(model.parameters(), args.clip_grad)
            else:
                optimizer.clip_master_grads(args.clip_grad)

    return lm_loss
def backward_step(optimizer, model, lm_loss, nsp_loss, args):
    """Backward step."""

    # Total loss.
    loss = lm_loss + nsp_loss

    # Backward pass.
    optimizer.zero_grad()
    if args.fp16:
        optimizer.backward(loss, update_master_grads=False)
    else:
        loss.backward()

    # Reduce across processes.
    lm_loss_reduced = lm_loss
    nsp_loss_reduced = nsp_loss

    reduced_losses = torch.cat((lm_loss.view(1), nsp_loss.view(1)))
    torch.distributed.all_reduce(reduced_losses.data)
    reduced_losses.data = reduced_losses.data / args.world_size
    if not USE_TORCH_DDP:
        model.allreduce_params(reduce_after=False,
                               fp32_allreduce=args.fp32_allreduce)
    lm_loss_reduced = reduced_losses[0]
    nsp_loss_reduced = reduced_losses[1]

    # Update master gradients.
    if args.fp16:
        optimizer.update_master_grads()

    # Clipping gradients helps prevent the exploding gradient.
    if args.clip_grad > 0:
        if not args.fp16:
            mpu.clip_grad_norm(model.parameters(), args.clip_grad)
        else:
            optimizer.clip_master_grads(args.clip_grad)

    return lm_loss_reduced, nsp_loss_reduced
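# A minimal usage sketch, not part of the variants above: how a training step
# might drive the last backward_step variant (the lm_loss/nsp_loss signature).
# The forward_step helper, optimizer.step(), and lr_scheduler.step() calls are
# assumptions for illustration; only backward_step and its arguments come from
# the code above.
def train_step(data_iterator, model, optimizer, lr_scheduler, args):
    """Single training step: forward, backward, and parameter update."""
    # Forward pass (hypothetical helper returning the two losses).
    lm_loss, nsp_loss = forward_step(data_iterator, model, args)

    # Backward pass, gradient all-reduce, and gradient clipping.
    lm_loss_reduced, nsp_loss_reduced = backward_step(
        optimizer, model, lm_loss, nsp_loss, args)

    # Update parameters and learning rate.
    optimizer.step()
    lr_scheduler.step()

    return lm_loss_reduced, nsp_loss_reduced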