def fetch_hostfile(hostfile_path):
    """Parse a hostfile mapping hostnames to GPU slot counts.

    Each non-empty line is expected to look like: ``worker-0 slots=16``.
    Blank lines (e.g. a trailing newline at end of file) are skipped
    instead of being treated as a format error.

    Args:
        hostfile_path: Path to the hostfile.

    Returns:
        ``collections.OrderedDict`` mapping hostname -> slot count,
        preserving file order, or ``None`` when the hostfile does not exist
        (training then proceeds with local resources only).

    Raises:
        ValueError: on a malformed line or a duplicate hostname.
    """
    if not os.path.isfile(hostfile_path):
        logger.warning("Unable to find hostfile, will proceed with training "
                       "with local resources only.")
        return None

    # e.g., worker-0 slots=16
    resource_pool = collections.OrderedDict()
    with open(hostfile_path, 'r') as fd:
        for line in fd:
            line = line.strip()
            # Fix: a blank line used to fail tuple-unpacking below and abort
            # training with a misleading "not formatted correctly" error.
            if not line:
                continue
            try:
                hostname, slots = line.split()
                _, slot_count = slots.split("=")
                slot_count = int(slot_count)
            except ValueError as err:
                logger.error("Hostfile is not formatted correctly, unable to "
                             "proceed with training.")
                raise err
            if hostname in resource_pool:
                logger.error("Hostfile contains duplicate hosts, unable to "
                             "proceed with training.")
                raise ValueError("host {} is already defined".format(hostname))
            resource_pool[hostname] = slot_count

    return resource_pool
def _configure_optimizer(self, client_optimizer, model_parameters):
    """Pick the basic optimizer, then wrap it for ZeRO or FP16 as configured.

    A client-supplied optimizer takes precedence; otherwise one is built
    from the DeepSpeed configuration. The result is stored on
    ``self.optimizer``.
    """
    if client_optimizer is None:
        basic_optimizer = self._configure_basic_optimizer(model_parameters)
        logger.info(
            'Using DeepSpeed Optimizer param name {} as basic optimizer'.format(
                self.optimizer_name()))
    else:
        basic_optimizer = client_optimizer
        logger.info('Using client Optimizer as basic optimizer')

    logger.info('DeepSpeed Basic Optimizer = {}'.format(basic_optimizer))

    if self.zero_optimization():
        # ZeRO is only validated with Adam; anything else requires an
        # explicit opt-in flag in the config.
        if self.optimizer_name() != ADAM_OPTIMIZER:
            assert self.zero_allow_untested_optimizer(), \
                'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.'
            logger.warning(
                "**** You are using ZeRO with an untested optimizer, proceed with caution *****"
            )
        self.optimizer = self._configure_zero_optimizer(basic_optimizer)
    elif self.fp16_enabled():
        self.optimizer = self._configure_fp16_optimizer(basic_optimizer)
    else:
        self.optimizer = basic_optimizer
def get_lr(self):
    """Return the current per-group learning rates.

    Before the scheduler has stepped (``last_batch_iteration < 0``) a
    warning is logged and ``[0.0]`` is returned.
    """
    if self.last_batch_iteration < 0:
        logger.warning(
            "Attempting to get learning rate from scheduler before it has started"
        )
        return [0.0]
    scale = self._get_gamma()
    lrs = []
    # Each group's LR interpolates from its minimum across its delta range.
    for floor_lr, span_lr in zip(self.min_lrs, self.delta_lrs):
        lrs.append(floor_lr + (span_lr * scale))
    return lrs
def _configure_lr_scheduler(self, client_lr_scheduler):
    """Install the LR scheduler on ``self.lr_scheduler``.

    A scheduler declared in the JSON configuration wins over the one the
    client passed in.
    """
    # First check for scheduler in json configuration
    configured = self._scheduler_from_config(self.optimizer)
    if not configured:
        logger.warning('DeepSpeed using client LR scheduler')
        self.lr_scheduler = client_lr_scheduler
    else:
        logger.info(
            f'DeepSpeed using configured LR scheduler = {self.scheduler_name()}')
        self.lr_scheduler = configured
    logger.info(f'DeepSpeed LR Scheduler = {self.lr_scheduler}')
def read_zero_config_deprecated(self, param_dict):
    """Translate the deprecated boolean-style ZeRO setting into a config dict.

    The old format was a single truthy flag; it maps to stage 1 (or 0 when
    disabled), carrying over the allgather bucket size when enabled. Always
    logs a deprecation warning.
    """
    stage = 1 if param_dict[ZERO_OPTIMIZATION] else 0
    zero_config_dict = {ZERO_OPTIMIZATION_STAGE: stage}
    if stage > 0:
        zero_config_dict[ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE] = get_scalar_param(
            param_dict,
            ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED,
            ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT)

    logger.warning(
        'DeepSpeedConfig: this format of ZeRO optimization setup is deprecated. Please use the following format: {}'
        .format(ZERO_FORMAT))

    return zero_config_dict
def _do_args_sanity_check(self, args):
    """Validate the command-line arguments DeepSpeed depends on.

    Handles the deprecated ``--deepscale_config`` alias (copying it onto
    ``args.deepspeed_config``), requires an integer ``--local_rank``, and —
    only when no ``config_params`` dict was given at construction — requires
    ``--deepspeed_config`` to name an existing file.

    Raises:
        AssertionError: when any required argument is missing or invalid.
    """
    if hasattr(args, 'deepscale_config') and args.deepscale_config is not None:
        logger.warning(
            "************ --deepscale_config is deprecated, please use --deepspeed_config ************"
        )
        if hasattr(args, 'deepspeed_config'):
            # Refuse ambiguous input: both the old and the new flag set.
            assert args.deepspeed_config is None, "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config"
        args.deepspeed_config = args.deepscale_config

    # isinstance is the idiomatic type check; type(x) == int rejects
    # subclasses and is flagged by linters.
    assert hasattr(args, 'local_rank') and isinstance(args.local_rank, int), \
        'DeepSpeed requires integer command line parameter --local_rank'

    if self.config_params is None:
        assert hasattr(args, 'deepspeed_config') and args.deepspeed_config is not None, \
            'DeepSpeed requires --deepspeed_config to specify configuration file'

        assert os.path.isfile(args.deepspeed_config), \
            'DeepSpeed configuration file: {} is not an existing file'.format(args.deepspeed_config)
def _scale_loss(self, prescaled_loss):
    """Scale the loss for gradient accumulation.

    A tensor (or each tensor element of a tuple/list) is divided by
    ``gradient_accumulation_steps()``; non-tensor values pass through
    unchanged, with a one-time warning for wholly unscalable inputs.
    """
    if isinstance(prescaled_loss, torch.Tensor):
        return prescaled_loss / self.gradient_accumulation_steps()

    if isinstance(prescaled_loss, (tuple, list)):
        # Scale only the tensor entries; leave everything else as-is.
        return [
            entry / self.gradient_accumulation_steps()
            if isinstance(entry, torch.Tensor) else entry
            for entry in prescaled_loss
        ]

    # Unknown type: pass through, warning only once per engine instance.
    if self.warn_unscaled_loss:
        logger.warning(
            f'DeepSpeed unable to scale loss because of type: {type(prescaled_loss)}'
        )
        self.warn_unscaled_loss = False
    return prescaled_loss
def _do_warning_check(self):
    """Emit configuration warnings for settings that hurt performance or conflict.

    Warns when the vocabulary size is not tensor-core aligned, and when
    ``max_grad_norm`` > 0: in FP16 mode it is forwarded to the FP16 wrapper;
    in FP32 mode it is not supported and is reset to zero here (this method
    mutates ``self.optimizer_params`` in that case).
    """
    fp16_enabled = self.fp16_enabled or self.zero_enabled

    vocabulary_size = self._param_dict.get(VOCABULARY_SIZE, VOCABULARY_SIZE_DEFAULT)
    if vocabulary_size and vocabulary_size % TENSOR_CORE_ALIGN_SIZE != 0:
        # Fix: message said "may import tensor core utilization" — typo for
        # "impact".
        logger.warning(
            "DeepSpeedConfig: vocabulary size {} is not aligned to {}, may impact tensor core utilization."
            .format(vocabulary_size, TENSOR_CORE_ALIGN_SIZE))

    if self.optimizer_params is not None and \
        MAX_GRAD_NORM in self.optimizer_params.keys() and \
            self.optimizer_params[MAX_GRAD_NORM] > 0:
        if fp16_enabled:
            logger.warning(
                'DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper'
                .format(MAX_GRAD_NORM, self.optimizer_params[MAX_GRAD_NORM]))
        else:
            logger.warning(
                'DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zero'
                .format(self.optimizer_params[MAX_GRAD_NORM]))
            self.optimizer_params[MAX_GRAD_NORM] = 0.0
def __init__(self,
             args,
             model,
             optimizer=None,
             model_parameters=None,
             training_data=None,
             lr_scheduler=None,
             mpu=None,
             dist_init_required=None,
             collate_fn=None,
             config_params=None):
    """Build the DeepSpeed engine around *model*.

    Initialization order matters: distributed init -> argument/config
    sanity checks -> distributed model wrapping -> optimizer/scheduler ->
    sparse-gradient bookkeeping -> checkpointing setup.

    Args:
        args: command-line namespace; must carry local_rank and (unless
            config_params is given) deepspeed_config.
        model: the client model to wrap.
        optimizer: optional client optimizer; takes precedence over the
            config-defined one.
        model_parameters: parameters handed to a config-built optimizer.
        training_data: optional dataset; a dataloader is built from it.
        lr_scheduler: optional client LR scheduler (config wins if both).
        mpu: optional model-parallel unit.
        dist_init_required: force/skip torch.distributed init; when None it
            is derived from dist.is_initialized().
        collate_fn: optional collate function for the dataloader.
        config_params: optional config dict replacing --deepspeed_config.
    """
    super(DeepSpeedLight, self).__init__()

    # Stash client-provided objects before any configuration runs.
    self.client_optimizer = optimizer
    self.client_model_parameters = model_parameters
    self.client_lr_scheduler = lr_scheduler
    self.training_data = training_data
    self.collate_fn = collate_fn
    self.mpu = mpu
    self.data_parallel_group = None
    self.global_steps = 0
    self.micro_steps = 0
    self.skipped_steps = 0
    self.gradient_average = True
    # One-shot flag: _scale_loss warns at most once about unscalable types.
    self.warn_unscaled_loss = True
    self.config_params = config_params

    # Only initialize torch.distributed ourselves if nobody has yet.
    if dist_init_required is None:
        dist_init_required = not dist.is_initialized()

    self._mpi_check(args, dist_init_required)

    self.dist_backend = "nccl"
    if dist_init_required:
        if not dist.is_initialized():
            logger.info("Initializing torch distributed with backend: {}".format(
                self.dist_backend))
            dist.init_process_group(backend=self.dist_backend)
        else:
            logger.warning(
                "Was given dist_init_required=True but detected that torch"
                "distributed was already initialized, cannot initialize twice.")

    # Validate args, then merge them with the JSON/dict configuration.
    self._do_args_sanity_check(args)
    self._configure_with_arguments(args, mpu)
    self._do_sanity_check()

    self.sample_count = 0
    if self.tensorboard_enabled():
        self.summary_writer = self.get_summary_writer()

    self._init_distributed(dist_init_required)

    # Configure distributed model
    self._configure_distributed_model(model)

    # Configure wall clock timer
    self.timers = SynchronizedWallClockTimer()

    # Throughput timer
    self.tput_timer = ThroughputTimer(
        batch_size=self.train_micro_batch_size_per_gpu(),
        num_workers=self.dp_world_size,
        monitor_memory=False)

    self.training_dataloader = self.deepspeed_io(
        training_data) if training_data else None

    # Configure optimizer and scheduler
    self.optimizer = None
    self.lr_scheduler = None
    # Skip optimizer setup entirely for inference-style usage where neither
    # parameters nor a client optimizer were supplied.
    if model_parameters or optimizer:
        self._configure_optimizer(optimizer, model_parameters)
        self._configure_lr_scheduler(lr_scheduler)
        self._report_progress(0)

    # Bookkeeping for csr support: record embedding weights whose gradients
    # will be converted to sparse (CSR) tensors during training.
    self.csr_tensor_module_names = set()
    if self.sparse_gradients_enabled():
        for name, module in self.module.named_modules():
            if isinstance(module, torch.nn.Embedding):
                self.csr_tensor_module_names.add(name + ".weight")
                logger.info("Will convert {} to sparse (csr) "
                            "tensor during training".format(name))

    self.save_non_zero_checkpoint = False
    self.save_zero_checkpoint = False
    self._configure_checkpointing(dist_init_required)

    # Only rank 0 prints the resolved configuration.
    if self.global_rank == 0:
        self._config.print('DeepSpeedLight configuration')
        if self.dump_state():
            print_configuration(self, 'DeepSpeedLight')
import deepspeed.pt.deepspeed_lr_schedules as lr_schedules from deepspeed.pt.deepspeed_csr_tensor import CSRTensor MEMORY_OPT_ALLREDUCE_SIZE = 500000000 SUMMARY_WRITER_DIR_NAME = "JobId" try: from apex_C import flatten from apex_C import unflatten except ImportError: try: _ = warned_flatten except NameError: logger.warning( "Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten." ) warned_flatten = True from torch._utils import _flatten_dense_tensors as flatten from torch._utils import _unflatten_dense_tensors as unflatten def split_half_float_double_csr(tensors): dtypes = [ "torch.cuda.HalfTensor", "torch.cuda.FloatTensor", "torch.cuda.DoubleTensor", CSRTensor.type() ] buckets = [] for i, dtype in enumerate(dtypes):