def partition_activations_in_checkpoint(partition_activation):
    global PARTITION_ACTIVATIONS
    PARTITION_ACTIVATIONS = partition_activation
    if dist.get_rank() == 0:
        logger.info(
            f"**************Partition Activations {PARTITION_ACTIVATIONS}************")
def _save_checkpoint(self, save_dir, tag, client_state={}):
    save_path = self._get_ckpt_name(save_dir, tag)
    #self._ensure_directory_exists(save_path)

    state = {
        'module': self.module_state_dict(),
        'optimizer': self.optimizer.state_dict()
        if self.optimizer and not self.zero_optimization() else None,
        'lr_scheduler': self.lr_scheduler.state_dict()
        if self.lr_scheduler is not None else None,
        'csr_tensor_module_names': self.csr_tensor_module_names,
        'skipped_steps': self.skipped_steps,
        'global_steps': self.global_steps,
    }
    state.update(client_state)

    logger.info('Saving model checkpoint: {}'.format(save_path))
    torch.save(state, save_path)
def _report_progress(self, step):
    lr = self.get_lr()
    mom = self.get_mom()
    logger.info('rank:{} step={}, skipped={}, lr={}, mom={}'.format(
        self.global_rank, step, self.skipped_steps, lr, mom))
def _configure_lr_scheduler(self, client_lr_scheduler):
    # First check for scheduler in json configuration
    lr_scheduler = self._scheduler_from_config(self.optimizer)
    if lr_scheduler:
        logger.info(
            f'DeepSpeed using configured LR scheduler = {self.scheduler_name()}')
        self.lr_scheduler = lr_scheduler
    else:
        logger.warning('DeepSpeed using client LR scheduler')
        self.lr_scheduler = client_lr_scheduler
    logger.info(f'DeepSpeed LR Scheduler = {self.lr_scheduler}')
def _init_distributed(self, dist_init_required):
    if self.local_rank >= 0:
        torch.cuda.set_device(self.local_rank)
        self.device = torch.device("cuda", self.local_rank)
        self.world_size = dist.get_world_size()
        self.global_rank = dist.get_rank()
        logger.info("Set device to local rank {} within node.".format(
            self.local_rank))
    else:
        self.world_size = 1
        self.global_rank = 0
        self.device = torch.device("cuda")
def _configure_using_config_file(deepspeed_config):
    global num_layers, PARTITION_ACTIVATIONS, CONTIGUOUS_CHECKPOINTING, \
        PA_TO_CPU, SYNCHRONIZE, PROFILE_TIME

    config = DeepSpeedConfig(deepspeed_config).activation_checkpointing_config
    logger.info(config.repr())
    PARTITION_ACTIVATIONS = config.partition_activations
    CONTIGUOUS_CHECKPOINTING = config.contiguous_memory_optimization
    num_layers = config.number_checkpoints
    PA_TO_CPU = config.cpu_checkpointing
    SYNCHRONIZE = config.synchronize_checkpoint_boundary
    PROFILE_TIME = config.profile
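# For reference, a representative JSON fragment the loader above would
# consume. This is a sketch assuming the config keys mirror the attribute
# names read from DeepSpeedConfig(...).activation_checkpointing_config,
# not a verbatim schema:
#
#   "activation_checkpointing": {
#       "partition_activations": true,
#       "contiguous_memory_optimization": false,
#       "number_checkpoints": null,
#       "cpu_checkpointing": false,
#       "synchronize_checkpoint_boundary": false,
#       "profile": false
#   }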
def _handle_overflow(cpu_sum, x, i):
    import math
    rank = torch.distributed.get_rank()
    if rank == 0:
        t_i = -1
        for v_i, v in enumerate(x.data.contiguous().view(-1)):
            if not math.isfinite(float(v)):
                t_i = v_i
                break
        logger.info(
            f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}")
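# Self-contained illustration of the scan in _handle_overflow: find the index
# of the first non-finite element in a flattened tensor (values made up here).
import math
import torch

_x = torch.tensor([[1.0, 2.0], [float('inf'), 4.0]])
_first_bad = next((v_i for v_i, v in enumerate(_x.data.contiguous().view(-1))
                   if not math.isfinite(float(v))), -1)
assert _first_bad == 2  # row-major flattening places the inf at index 2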
def step(self, closure=None):
    """
    Not supporting closure.
    """
    if self.fused_adam_legacy:
        return self.step_fused_adam()

    # First compute norm for all groups so we know if there is overflow
    grads_groups_flat = []
    norm_groups = []
    for i, group in enumerate(self.fp16_groups):
        data_type = self.fp32_groups_flat[i].dtype
        grads_groups_flat.append(
            _flatten_dense_tensors([
                torch.zeros(p.size(), dtype=data_type, device=p.device)
                if p.grad is None else p.grad.to(data_type) for p in group
            ]))
        self.fp32_groups_flat[i].grad = grads_groups_flat[i]
        norm_groups.append(get_grad_norm(self.fp32_groups_flat, mpu=self.mpu))

    self.overflow = self.overflow_checker.check_using_norm(norm_groups)
    prev_scale = self.cur_scale
    self._update_scale(self.overflow)

    if self.overflow:
        if self.verbose:
            logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
                        "scale: {}, reducing to {}".format(
                            prev_scale, self.cur_scale))
        return self.overflow

    self.unscale_and_clip_grads(grads_groups_flat, norm_groups)

    self.optimizer.step()

    # Get rid of the fp32 gradients; not needed anymore
    for group in self.fp32_groups_flat:
        group.grad = None

    for i in range(len(norm_groups)):
        updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data.copy_(q.data)

    return self.overflow
def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True):
    zero_checkpoint_name = self._get_zero_ckpt_name(load_dir, tag)
    if not os.path.exists(zero_checkpoint_name):
        logger.warning(
            'Client provided checkpoint load path: {} does not exist ... skip checkpoint load'
            .format(zero_checkpoint_name))
        return None

    zero_sd = torch.load(zero_checkpoint_name, map_location='cpu')
    self.optimizer.load_state_dict(zero_sd['optimizer_state_dict'],
                                   load_optimizer_states=load_optimizer_states)
    logger.info('loaded zero checkpoint {}'.format(zero_checkpoint_name))
def _set_batch_related_parameters(self):
    train_batch = self.train_batch_size
    micro_batch = self.train_micro_batch_size_per_gpu
    grad_acc = self.gradient_accumulation_steps

    # All values are provided, nothing needs to be set
    if train_batch is not None and \
        micro_batch is not None and \
        grad_acc is not None:
        return

    # gradient_accumulation_steps needs to be set
    elif train_batch is not None and \
        micro_batch is not None:
        grad_acc = train_batch // micro_batch
        grad_acc //= self.world_size
        self.gradient_accumulation_steps = grad_acc

    # micro_batch_per_gpu needs to be set
    elif train_batch is not None and \
        grad_acc is not None:
        micro_batch = train_batch // self.world_size
        micro_batch //= grad_acc
        self.train_micro_batch_size_per_gpu = micro_batch

    # train_batch_size needs to be set
    elif micro_batch is not None and \
        grad_acc is not None:
        train_batch_size = micro_batch * grad_acc
        train_batch_size *= self.world_size
        self.train_batch_size = train_batch_size

    # Only train_batch_size is provided: derive micro batch and grad accumulation
    elif train_batch is not None:
        self.gradient_accumulation_steps = 1
        self.train_micro_batch_size_per_gpu = train_batch // self.world_size

    # Only micro_batch is provided: derive train batch size and grad accumulation
    elif micro_batch is not None:
        self.train_batch_size = micro_batch * self.world_size
        self.gradient_accumulation_steps = 1

    # Either none of the three parameters is provided or only gradient_accumulation_steps is provided
    else:
        assert False, \
            'Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided'

    logger.info(f'After batch configuration: train_batch_size={self.train_batch_size}, '
                f'train_micro_batch_size_per_gpu={self.train_micro_batch_size_per_gpu}, '
                f'gradient_accumulation_steps={self.gradient_accumulation_steps}')
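# Worked example of the batch arithmetic above (hypothetical values). The
# invariant the branches maintain is:
#   train_batch_size == micro_batch * gradient_accumulation_steps * world_size
_world_size = 4
_train_batch = 32
_micro_batch = 4
_grad_acc = _train_batch // _micro_batch // _world_size  # -> 2
assert _micro_batch * _grad_acc * _world_size == _train_batch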
def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
    data_parallel_size = int(dist.get_world_size())
    parameter_parallel_size = parameter_parallel_size or data_parallel_size
    logger.info("data_parallel_size: %s, parameter_parallel_size: %s",
                data_parallel_size,
                parameter_parallel_size)
    assert data_parallel_size % parameter_parallel_size == 0, \
        'world size should be divisible by parameter parallel size'
    rank = dist.get_rank()
    my_group = None
    for i in range(data_parallel_size // parameter_parallel_size):
        ranks = range(i * parameter_parallel_size, (i + 1) * parameter_parallel_size)
        group = torch.distributed.new_group(ranks)
        if rank in ranks:
            my_group = group
    return my_group
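# Illustration of the grouping above: with a world size of 4 and a
# parameter_parallel_size of 2, ranks split into [0, 1] and [2, 3]; each
# rank keeps only the group that contains it.
_data_parallel_size, _parameter_parallel_size = 4, 2
_groups = [
    list(range(i * _parameter_parallel_size, (i + 1) * _parameter_parallel_size))
    for i in range(_data_parallel_size // _parameter_parallel_size)
]
assert _groups == [[0, 1], [2, 3]]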
def get_group_alignment_padding(tensor_list, sub_partition_size, sub_partition_count):
    group_paddings = []
    flattened_size = sum([tensor.numel() for tensor in tensor_list])
    for i in range(sub_partition_count):
        padding = get_alignment_padding(flattened_size, i, sub_partition_size)
        group_paddings.append(padding)

    logger.info("****Padding information*****")
    logger.info(f"tensor_size = {flattened_size}")
    logger.info(f"sub_partition_size = {sub_partition_size}")
    logger.info(f"sub_partition_count = {sub_partition_count}")
    for i, padding in enumerate(group_paddings):
        logger.info(f"padding[{i}] = {padding}")

    return group_paddings
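# A minimal sketch of what `get_alignment_padding` above is assumed to
# compute (hypothetical reconstruction, not the actual DeepSpeed helper):
# sub-partition i covers elements [i * size, (i + 1) * size) of the
# flattened group, and any part of that span past the end is padding.
def _sketch_get_alignment_padding(flattened_size, sub_partition_id, sub_partition_size):
    sub_partition_end = (sub_partition_id + 1) * sub_partition_size
    overhang = sub_partition_end - flattened_size
    return min(max(overhang, 0), sub_partition_size)

# e.g. 10 elements split into 3 sub-partitions of 4: only the last one pads.
assert [_sketch_get_alignment_padding(10, i, 4) for i in range(3)] == [0, 0, 2]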
def step(self, closure=None):
    """
    Not supporting closure.
    """
    if self.fused_lamb_legacy:
        return self.step_fused_lamb()

    self.overflow = self.overflow_checker.check()
    prev_scale = self.cur_scale

    self._update_scale(self.overflow)
    if self.overflow:
        if self.verbose:
            logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
                        "scale: {}, reducing to {}".format(
                            prev_scale, self.cur_scale))
        return self.overflow

    norm_groups = []
    for i, group in enumerate(self.fp16_groups):
        norm_groups.append(get_grad_norm(group, mpu=self.mpu))

        # copying gradients to fp32 to work with fp32 parameters
        for fp32_param, fp16_param in zip(self.fp32_groups[i], self.fp16_groups[i]):
            if fp16_param.grad is None:
                fp32_param.grad = torch.zeros(fp16_param.size(),
                                              dtype=fp32_param.dtype,
                                              device=fp32_param.device)
            else:
                fp32_param.grad = fp16_param.grad.to(fp32_param.dtype)

    self.unscale_and_clip_grads(norm_groups)

    self.optimizer.step()

    for fp32_group, fp16_group in zip(self.fp32_groups, self.fp16_groups):
        for fp32_param, fp16_param in zip(fp32_group, fp16_group):
            # remove the fp32 grad
            fp32_param.grad = None

            # copy data from fp32 to fp16
            fp16_param.data.copy_(fp32_param.data)

    return self.overflow
def _configure_distributed_model(self, model):
    self.module = model
    if self.fp16_enabled():
        self.module.half()
    self.module.to(self.device)

    if self.mpu is None:
        self.data_parallel_group = _initialize_parameter_parallel_groups()
        self.dp_world_size = dist.get_world_size()
        src_rank = 0
    else:
        self.data_parallel_group = self.mpu.get_data_parallel_group()
        self.dp_world_size = self.mpu.get_data_parallel_world_size()
        src_rank = _get_global_rank(self.mpu.get_data_parallel_group(), 0)
        logger.info(f"global src_rank={src_rank}")

    for p in self.module.parameters():
        if torch.is_tensor(p):
            dist.broadcast(p, src_rank, group=self.data_parallel_group)
def _load_checkpoint(self,
                     load_dir,
                     tag,
                     load_module_strict=True,
                     load_optimizer_states=True,
                     load_lr_scheduler_states=True):

    load_path = self._get_ckpt_name(load_dir, tag)

    if not os.path.exists(load_path):
        logger.warning(
            'Client provided checkpoint load path: {} does not exist ... skip checkpoint load'
            .format(load_path))
        return None, None

    logger.info('Loading checkpoint: {}'.format(load_path))
    checkpoint = torch.load(load_path, map_location=lambda storage, loc: storage)

    self.load_module_state_dict(state_dict=checkpoint['module'],
                                strict=load_module_strict)
    if not self.zero_optimization():
        self.optimizer.load_state_dict(checkpoint['optimizer'],
                                       load_optimizer_states=load_optimizer_states)

    if load_lr_scheduler_states and self.lr_scheduler is not None:
        self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

    self.csr_tensor_module_names = checkpoint['csr_tensor_module_names']
    self.global_steps = checkpoint['global_steps']
    self.skipped_steps = checkpoint['skipped_steps']
    deepspeed_states = [
        'module', 'optimizer', 'lr_scheduler', 'csr_tensor_module_names',
        'skipped_steps', 'global_steps'
    ]
    client_state = {
        key: value
        for key, value in checkpoint.items() if key not in deepspeed_states
    }

    return load_path, client_state
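# For illustration: how engine state separates from user state on load.
# Given a checkpoint dict with both kinds of keys ('epoch' and 'best_loss'
# are made-up client keys), only the client keys survive into client_state.
_checkpoint = {
    'module': {}, 'optimizer': {}, 'lr_scheduler': {},
    'csr_tensor_module_names': [], 'skipped_steps': 0, 'global_steps': 100,
    'epoch': 3, 'best_loss': 0.42
}
_deepspeed_states = [
    'module', 'optimizer', 'lr_scheduler', 'csr_tensor_module_names',
    'skipped_steps', 'global_steps'
]
_client_state = {k: v for k, v in _checkpoint.items() if k not in _deepspeed_states}
assert _client_state == {'epoch': 3, 'best_loss': 0.42}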
def step_fused_adam(self, closure=None):
    """
    Not supporting closure.
    """
    # First compute norm for all groups so we know if there is overflow
    grads_groups_flat = []
    norm_groups = []
    for i, group in enumerate(self.fp16_groups):
        grads_groups_flat.append(
            _flatten_dense_tensors([
                torch.zeros(p.size(), dtype=p.dtype, device=p.device)
                if p.grad is None else p.grad for p in group
            ]))
        norm_groups.append(get_weight_norm(grads_groups_flat[i], mpu=self.mpu))

    self.overflow = self.overflow_checker.check_using_norm(norm_groups)
    prev_scale = self.cur_scale
    self._update_scale(self.overflow)

    if self.overflow:
        if self.verbose:
            logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
                        "scale: {}, reducing to {}".format(
                            prev_scale, self.cur_scale))
        return self.overflow

    combined_scale = self.unscale_and_clip_grads(grads_groups_flat,
                                                 norm_groups,
                                                 apply_scale=False)
    # norm is in fact norm*cur_scale
    self.optimizer.step(grads=[[g] for g in grads_groups_flat],
                        output_params=[[p] for p in self.fp16_groups_flat],
                        scale=combined_scale,
                        grad_norms=norm_groups)
    # TODO: we probably don't need this? just to be safe
    for i in range(len(norm_groups)):
        updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data = q.data
    return self.overflow
def _configure_zero_optimizer(self, optimizer):
    zero_stage = self.zero_optimization_stage()
    logger.info('Creating fp16 ZeRO stage {} optimizer'.format(zero_stage))

    if zero_stage == ZERO_OPTIMIZATION_OPTIMIZER_STATES:
        assert self.zero_reduce_scatter(), 'Stage 1 only supports reduce scatter mode'
        optimizer = FP16_DeepSpeedZeroOptimizer_Stage1(
            optimizer,
            static_loss_scale=self.loss_scale(),
            dynamic_loss_scale=self.dynamic_loss_scale(),
            dynamic_loss_args=self.dynamic_loss_scale_args(),
            clip_grad=self.gradient_clipping(),
            all_gather_partitions=self.zero_allgather_partitions(),
            allgather_size=self.zero_allgather_bucket_size(),
            max_elements_per_comm=self.zero_reduce_bucket_size(),
            dp_process_group=self.data_parallel_group,
            mpu=self.mpu)
    elif zero_stage == ZERO_OPTIMIZATION_GRADIENTS:
        assert self.gradient_accumulation_steps() == 1, \
            "ZeRO stage 2 does not support gradient accumulation; if you need gradient accumulation please use stage 1"
        optimizer = FP16_DeepSpeedZeroOptimizer(
            optimizer,
            timers=self.timers,
            static_loss_scale=self.loss_scale(),
            dynamic_loss_scale=self.dynamic_loss_scale(),
            dynamic_loss_args=self.dynamic_loss_scale_args(),
            clip_grad=self.gradient_clipping(),
            contiguous_gradients=self.zero_contiguous_gradients(),
            reduce_bucket_size=self.zero_reduce_bucket_size(),
            allgather_bucket_size=self.zero_allgather_bucket_size(),
            dp_process_group=self.data_parallel_group,
            reduce_scatter=self.zero_reduce_scatter(),
            overlap_comm=self.zero_overlap_comm(),
            mpu=self.mpu,
            postscale_gradients=self.postscale_gradients(),
            gradient_predivide_factor=self.gradient_predivide_factor())
    else:
        raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage))

    return optimizer
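# Representative config fragment (assuming the standard DeepSpeed JSON keys)
# that selects between the two stages constructed above; treat it as a
# sketch rather than a complete schema:
#
#   "zero_optimization": {
#       "stage": 2,
#       "contiguous_gradients": true,
#       "reduce_scatter": true,
#       "reduce_bucket_size": 500000000,
#       "allgather_bucket_size": 500000000,
#       "overlap_comm": false
#   }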
def see_memory_usage(message):
    # Print message except when distributed but not rank 0
    logger.info(message)
    logger.info("Memory Allocated %s GigaBytes",
                torch.cuda.memory_allocated() / (1024 * 1024 * 1024))
    logger.info("Max Memory Allocated %s GigaBytes",
                torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024))
    logger.info("Cache Allocated %s GigaBytes",
                torch.cuda.memory_cached() / (1024 * 1024 * 1024))
    logger.info("Max cache Allocated %s GigaBytes",
                torch.cuda.max_memory_cached() / (1024 * 1024 * 1024))
def _configure_optimizer(self, client_optimizer, model_parameters):
    if client_optimizer is not None:
        basic_optimizer = client_optimizer
        logger.info('Using client Optimizer as basic optimizer')
    else:
        basic_optimizer = self._configure_basic_optimizer(model_parameters)
        logger.info('Using DeepSpeed Optimizer param name {} as basic optimizer'.format(
            self.optimizer_name()))

    logger.info('DeepSpeed Basic Optimizer = {}'.format(basic_optimizer))

    if self.zero_optimization():
        if self.optimizer_name() != ADAM_OPTIMIZER:
            assert self.zero_allow_untested_optimizer(), \
                'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.'
            logger.warning(
                "**** You are using ZeRO with an untested optimizer, proceed with caution *****"
            )
        self.optimizer = self._configure_zero_optimizer(basic_optimizer)
    elif self.fp16_enabled():
        self.optimizer = self._configure_fp16_optimizer(basic_optimizer)
    else:
        self.optimizer = basic_optimizer
def _mpi_check(self, args, dist_init_required):
    if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
        from mpi4py import MPI
        import subprocess
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        world_size = comm.Get_size()

        master_addr = None
        if rank == 0:
            hostname_cmd = ["hostname -I"]
            result = subprocess.check_output(hostname_cmd, shell=True)
            master_addr = result.decode('utf-8').split()[0]
        master_addr = comm.bcast(master_addr, root=0)

        # Determine local rank by assuming hostnames are unique
        proc_name = MPI.Get_processor_name()
        all_procs = comm.allgather(proc_name)
        local_rank = sum([i == proc_name for i in all_procs[:rank]])

        os.environ['RANK'] = str(rank)
        os.environ['WORLD_SIZE'] = str(world_size)
        args.local_rank = local_rank
        os.environ['MASTER_ADDR'] = master_addr
        os.environ['MASTER_PORT'] = TORCH_DISTRIBUTED_DEFAULT_PORT

        logger.info(
            "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
            .format(os.environ['RANK'],
                    args.local_rank,
                    os.environ['WORLD_SIZE'],
                    os.environ['MASTER_ADDR'],
                    os.environ['MASTER_PORT']))

        if not dist_init_required and dist.is_initialized():
            assert dist.get_rank() == rank, \
                "MPI rank {} does not match torch rank {}".format(rank, dist.get_rank())
            assert dist.get_world_size() == world_size, \
                "MPI world size {} does not match torch world size {}".format(
                    world_size, dist.get_world_size())
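# Illustration of the local-rank derivation above: local rank is the count
# of earlier global ranks that report the same hostname (hostnames made up).
_all_procs = ['node-a', 'node-a', 'node-b', 'node-b']  # indexed by global rank
_rank = 3
_local_rank = sum([name == _all_procs[_rank] for name in _all_procs[:_rank]])
assert _local_rank == 1  # second process on node-b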
def model_parallel_cuda_manual_seed(seed):
    """Initialize model parallel cuda seed.

    This function should be called after the model parallel is
    initialized. Also, no torch.cuda.manual_seed should be called
    after this function. Basically, this is a replacement for that
    function.

    Two sets of RNG states are tracked:
        default state: This is for data parallelism and is the same among a
                       set of model parallel GPUs but different across
                       different model parallel groups. This is used for
                       example for dropout in the non-model-parallel regions.
        model-parallel state: This state is different among a set of model
                              parallel GPUs, but the same across data parallel
                              groups. This is used for example for dropout in
                              model parallel regions.
    """
    global mpu
    # 2718 is just for fun and any POSITIVE value will work.
    offset = seed + 2718
    model_parallel_seed = offset + mpu.get_model_parallel_rank()
    # Data parallel gets the original seed.
    data_parallel_seed = seed

    if torch.distributed.get_rank() == 0:
        logger.info(
            '> initializing model parallel cuda seeds on global rank {}, '
            'model parallel rank {}, and data parallel rank {} with '
            'model parallel seed: {} and data parallel seed: {}'.format(
                torch.distributed.get_rank(),
                mpu.get_model_parallel_rank(),
                mpu.get_data_parallel_rank(),
                model_parallel_seed,
                data_parallel_seed))
    _CUDA_RNG_STATE_TRACKER.reset()
    # Set the default state.
    torch.cuda.manual_seed(data_parallel_seed)
    # and model parallel state.
    _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME,
                                model_parallel_seed)
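# Worked example of the seed bookkeeping above: with base seed 1234, each
# model-parallel rank gets a distinct seed offset by 2718, while the
# data-parallel ("default") seed stays at the original value.
_seed = 1234
_model_parallel_seeds = [_seed + 2718 + mp_rank for mp_rank in range(4)]
assert _model_parallel_seeds == [3952, 3953, 3954, 3955]
_data_parallel_seed = _seed  # identical across model-parallel ranks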
def see_memory_usage(message, force=False):
    if not force:
        return
    #dist.barrier()
    if dist.get_rank() == 0:
        logger.info(message)
        logger.info("Memory Allocated %s GigaBytes",
                    torch.cuda.memory_allocated() / (1024 * 1024 * 1024))
        logger.info("Max Memory Allocated %s GigaBytes",
                    torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024))
        logger.info("Cache Allocated %s GigaBytes",
                    torch.cuda.memory_cached() / (1024 * 1024 * 1024))
        logger.info("Max cache Allocated %s GigaBytes",
                    torch.cuda.max_memory_cached() / (1024 * 1024 * 1024))
def step_fused_lamb(self, closure=None):
    """
    Not supporting closure.
    """
    # First compute norm for all groups so we know if there is overflow
    grads_groups_flat = []
    grads_groups = []
    norm_groups = []
    for i, group in enumerate(self.fp16_groups):
        grads = [
            torch.zeros(p.size(), dtype=p.dtype, device=p.device)
            if p.grad is None else p.grad for p in group
        ]
        grads_groups.append(grads)
        grads_groups_flat.append(_flatten_dense_tensors(grads))
        norm_groups.append(get_weight_norm(grads_groups_flat[i], mpu=self.mpu))

    self.overflow = self.overflow_checker.check_using_norm(norm_groups)
    prev_scale = self.cur_scale
    self._update_scale(self.overflow)

    if self.overflow:
        if self.verbose:
            logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
                        "scale: {}, reducing to {}".format(
                            prev_scale, self.cur_scale))
        return self.overflow

    combined_scale = self.unscale_and_clip_grads(norm_groups, apply_scale=False)
    self.optimizer.step(grads=grads_groups,
                        output_params=self.fp16_groups,
                        scale=combined_scale)
    return self.overflow
def see_memory_usage(message):
    # Reporting is disabled via this early return; remove it to re-enable
    # the memory report below.
    return
    if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0:
        return

    # Print message except when distributed but not rank 0
    logger.info(message)
    logger.info("Memory Allocated %s GigaBytes",
                torch.cuda.memory_allocated() / (1024 * 1024 * 1024))
    logger.info("Max Memory Allocated %s GigaBytes",
                torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024))
    logger.info("Cache Allocated %s GigaBytes",
                torch.cuda.memory_cached() / (1024 * 1024 * 1024))
    logger.info("Max cache Allocated %s GigaBytes",
                torch.cuda.max_memory_cached() / (1024 * 1024 * 1024))
def print(self, name):
    logger.info('{}:'.format(name))
    for arg in sorted(vars(self)):
        if arg != '_param_dict':
            dots = '.' * (29 - len(arg))
            logger.info('  {} {} {}'.format(arg, dots, getattr(self, arg)))

    logger.info('  json = {}'.format(
        json.dumps(self._param_dict,
                   sort_keys=True,
                   indent=4,
                   separators=(',', ':'))))
def _configure_fp16_optimizer(self, optimizer):
    initial_dynamic_scale = self.initial_dynamic_scale()
    dynamic_loss_args = self.dynamic_loss_scale_args()
    clip_grad = self.gradient_clipping()
    if self.optimizer_name() == ADAM_OPTIMIZER:
        if self.dynamic_loss_scale():
            logger.info('Creating fp16 optimizer with dynamic loss scale')
            timers = self.timers if self.wall_clock_breakdown() else None
            optimizer = FP16_Optimizer(
                optimizer,
                dynamic_loss_scale=True,
                initial_dynamic_scale=initial_dynamic_scale,
                dynamic_loss_args=dynamic_loss_args,
                mpu=self.mpu,
                clip_grad=clip_grad,
                fused_adam_legacy=self.optimizer_legacy_fusion(),
                timers=timers)
        else:
            logger.info('Creating fp16 optimizer with static loss scale: {}'.format(
                self.loss_scale()))
            optimizer = FP16_Optimizer(
                optimizer,
                static_loss_scale=self.loss_scale(),
                mpu=self.mpu,
                clip_grad=clip_grad,
                fused_adam_legacy=self.optimizer_legacy_fusion())
    else:
        logger.info('Creating fp16 unfused optimizer with dynamic loss scale')
        optimizer = FP16_UnfusedOptimizer(
            optimizer,
            dynamic_loss_scale=self.dynamic_loss_scale(),
            dynamic_loss_args=dynamic_loss_args,
            mpu=self.mpu,
            clip_grad=clip_grad,
            fused_lamb_legacy=self.optimizer_name() == LAMB_OPTIMIZER)

    return optimizer
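# Representative fp16 config fragment (assuming the standard DeepSpeed
# schema) that drives the branches above; "loss_scale": 0 requests dynamic
# loss scaling, while a positive value requests a static scale:
#
#   "fp16": {
#       "enabled": true,
#       "loss_scale": 0,
#       "initial_scale_power": 16,
#       "loss_scale_window": 1000,
#       "min_loss_scale": 1
#   }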
def main(args=None):
    args = parse_args(args)

    if args.num_nodes >= 0 or args.num_gpus >= 0:
        if args.include != "" or args.exclude != "":
            raise ValueError("Cannot specify num_nodes/gpus with include/exclude")

    multi_node_exec = True
    resource_pool = fetch_hostfile(args.hostfile)
    if not resource_pool:
        resource_pool = {}
        device_count = torch.cuda.device_count()
        if device_count == 0:
            raise RuntimeError("Unable to proceed, no GPU resources available")
        resource_pool['localhost'] = device_count
        args.master_addr = "127.0.0.1"
        multi_node_exec = False

    if not multi_node_exec and args.num_nodes > 1:
        raise ValueError("Num nodes is >1 but no extra nodes available via hostfile")

    active_resources = parse_inclusion_exclusion(resource_pool,
                                                 args.include,
                                                 args.exclude)
    env = os.environ.copy()

    if not args.master_addr:
        first_host = list(active_resources.keys())[0]
        hostname_cmd = ["ssh {} hostname -I".format(first_host)]
        result = subprocess.check_output(hostname_cmd, shell=True)
        args.master_addr = result.decode('utf-8').split()[0]
        logger.info("Using IP address of {} for node {}".format(
            args.master_addr, first_host))

    if args.num_nodes > 0:
        updated_active_resources = collections.OrderedDict()
        for count, hostname in enumerate(active_resources.keys()):
            if args.num_nodes == count:
                break
            updated_active_resources[hostname] = active_resources[hostname]
        active_resources = updated_active_resources

    if args.num_gpus > 0:
        updated_active_resources = collections.OrderedDict()
        for hostname in active_resources.keys():
            updated_active_resources[hostname] = list(range(args.num_gpus))
        active_resources = updated_active_resources

    # encode world info as base64 to make it easier to pass via command line
    world_info_base64 = encode_world_info(active_resources)

    multi_node_exec = len(active_resources) > 1
    if multi_node_exec and not shutil.which('pdsh'):
        raise RuntimeError("pdsh is not installed, unable to proceed")

    if not multi_node_exec:
        deepspeed_launch = [
            sys.executable, "-u", "-m", "deepspeed.pt.deepspeed_launch",
            "--world_info={}".format(world_info_base64),
            "--master_addr={}".format(args.master_addr),
            "--master_port={}".format(args.master_port)
        ]
        cmd = deepspeed_launch + [args.user_script] + args.user_args
    else:
        env['PDSH_RCMD_TYPE'] = 'ssh'

        active_workers = ",".join(active_resources.keys())
        logger.info("Running on the following workers: %s" % active_workers)

        # PDSH flags for max node fan out and specific hosts to launch on
        # See https://linux.die.net/man/1/pdsh for flag details
        pdsh_cmd_args = ['pdsh', '-f', str(PDSH_MAX_FAN_OUT), '-w', active_workers]

        num_nodes = len(active_resources.keys())
        num_gpus_per_node = None

        curr_path = os.path.abspath('.')
        if 'PYTHONPATH' in env:
            env['PYTHONPATH'] = curr_path + ":" + env['PYTHONPATH']
        else:
            env['PYTHONPATH'] = curr_path

        exports = ""
        for var in env.keys():
            if any(map(lambda name: var.startswith(name), EXPORT_ENVS)):
                exports += "export {}={}; ".format(var, env[var])

        for environ_path in DEEPSPEED_ENVIRONMENT_PATHS:
            environ_file = os.path.join(environ_path, DEEPSPEED_ENVIRONMENT_NAME)
            if os.path.isfile(environ_file):
                with open(environ_file, 'r') as fd:
                    for var in fd.readlines():
                        exports += "export {}; ".format(var.strip())

        deepspeed_launch = [
            exports, "cd {};".format(curr_path), sys.executable, "-u", "-m",
            "deepspeed.pt.deepspeed_launch",
            '--world_info={}'.format(world_info_base64), "--node_rank=%n",
            "--master_addr={}".format(args.master_addr),
            "--master_port={}".format(args.master_port)
        ]
        user_args = list(
            map(lambda x: x if x.startswith("-") else "'{}'".format(x),
                args.user_args))
        cmd = pdsh_cmd_args + deepspeed_launch + [args.user_script] + user_args

    logger.info("cmd={}".format(cmd))
    result = subprocess.Popen(cmd, env=env)
    result.wait()
def parse_resource_filter(host_info, include_str="", exclude_str=""):
    '''Parse an inclusion or exclusion string and filter a hostfile dictionary.

    String format is NODE_SPEC[@NODE_SPEC ...], where
        NODE_SPEC = NAME[:SLOT[,SLOT ...]].
    If :SLOT is omitted, include/exclude all slots on that host.

    Examples:
        include_str="worker-0@worker-1:0,2" will use all slots on worker-0
          and slots [0, 2] on worker-1.
        exclude_str="worker-1:0" will use all available resources except
          slot 0 on worker-1.
    '''
    # Constants that define our syntax
    NODE_SEP = '@'
    SLOT_LIST_START = ':'
    SLOT_SEP = ','

    # Ensure include/exclude are mutually exclusive
    if (include_str != "") and (exclude_str != ""):
        raise ValueError('include_str and exclude_str are mutually exclusive.')

    # no-op
    if (include_str == "") and (exclude_str == ""):
        return host_info

    # Either build from scratch or remove items
    filtered_hosts = dict()
    if include_str:
        parse_str = include_str
    if exclude_str != "":
        filtered_hosts = deepcopy(host_info)
        parse_str = exclude_str

    # foreach node in the list
    for node_config in parse_str.split(NODE_SEP):
        # Node can either be alone or node:slot,slot,slot
        if SLOT_LIST_START in node_config:
            hostname, slots = node_config.split(SLOT_LIST_START)
            slots = [int(x) for x in slots.split(SLOT_SEP)]

            # sanity checks
            if hostname not in host_info:
                raise ValueError("Hostname '{}' not found in hostfile".format(hostname))
            for s in slots:
                if s not in host_info[hostname]:
                    raise ValueError("No slot '{}' specified on host '{}'".format(
                        s, hostname))

            # If include string, build the list from here
            if include_str:
                filtered_hosts[hostname] = slots
            elif exclude_str:
                for s in slots:
                    logger.info('removing {} from {}'.format(s, hostname))
                    filtered_hosts[hostname].remove(s)

        # User just specified the whole node
        else:
            hostname = node_config
            # sanity check hostname
            if hostname not in host_info:
                raise ValueError("Hostname '{}' not found in hostfile".format(hostname))

            if include_str:
                filtered_hosts[hostname] = host_info[hostname]
            elif exclude_str:
                filtered_hosts[hostname] = []

    # Post-processing to remove duplicates and empty nodes
    del_keys = []
    for hostname in filtered_hosts:
        # Remove duplicates
        filtered_hosts[hostname] = list(set(filtered_hosts[hostname]))
        # Remove empty hosts
        if len(filtered_hosts[hostname]) == 0:
            del_keys.append(hostname)
    for name in del_keys:
        del filtered_hosts[name]

    # Lastly, go over filtered_hosts and convert to an OrderedDict() to ensure
    # we map ranks to nodes correctly by maintaining host_info ordering.
    ordered_hosts = collections.OrderedDict()
    for host in host_info:
        if host in filtered_hosts:
            ordered_hosts[host] = filtered_hosts[host]

    return ordered_hosts
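# Usage example for the filter above (hypothetical hostfile contents). Slot
# order after de-duplication may vary, so we compare sorted slot lists.
_host_info = collections.OrderedDict([('worker-0', [0, 1, 2, 3]),
                                      ('worker-1', [0, 1, 2, 3])])
# Keep all of worker-0 plus slots 0 and 2 of worker-1:
_included = parse_resource_filter(_host_info, include_str="worker-0@worker-1:0,2")
assert {h: sorted(s) for h, s in _included.items()} == \
    {'worker-0': [0, 1, 2, 3], 'worker-1': [0, 2]}
# Drop slot 0 of worker-1, keep everything else:
_excluded = parse_resource_filter(_host_info, exclude_str="worker-1:0")
assert {h: sorted(s) for h, s in _excluded.items()} == \
    {'worker-0': [0, 1, 2, 3], 'worker-1': [1, 2, 3]}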
def _update_scale(self, skip):
    if self.dynamic_loss_scale:
        prev_scale = self.cur_scale
        if skip:
            self.cur_scale = max(self.cur_scale / self.scale_factor,
                                 self.min_loss_scale)
            self.last_overflow_iter = self.cur_iter
            if self.verbose:
                logger.info("Grad overflow on iteration: %s", self.cur_iter)
                logger.info(
                    f"Reducing dynamic loss scale from {prev_scale} to {self.cur_scale}")
        else:
            # Ensure self.scale_window updates since last overflow
            stable_interval = (self.cur_iter - self.last_overflow_iter) - 1
            if (stable_interval > 0) and (stable_interval % self.scale_window == 0):
                self.cur_scale *= self.scale_factor
                if self.verbose:
                    logger.info(f"No Grad overflow for {self.scale_window} iterations")
                    logger.info(
                        f"Increasing dynamic loss scale from {prev_scale} to {self.cur_scale}")
    else:
        if skip:
            logger.info("Grad overflow on iteration %s", self.cur_iter)
            logger.info("Using static loss scale of %s", self.cur_scale)
    self.cur_iter += 1
    return
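# Standalone trace of the dynamic loss-scale policy above (toy numbers):
# halve on overflow, double again after scale_window consecutive stable steps.
_scale, _scale_factor, _scale_window, _min_scale = 2.0**16, 2.0, 4, 1.0
_last_overflow = -1
for _it, _overflow in enumerate([False, True] + [False] * 5):
    if _overflow:
        _scale = max(_scale / _scale_factor, _min_scale)
        _last_overflow = _it
    else:
        _stable = (_it - _last_overflow) - 1
        if _stable > 0 and _stable % _scale_window == 0:
            _scale *= _scale_factor
assert _scale == 2.0**16  # halved at iteration 1, doubled back at iteration 6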
def __init__(self,
             init_optimizer,
             static_loss_scale=1.0,
             dynamic_loss_scale=False,
             dynamic_loss_args=None,
             verbose=True,
             mpu=None,
             clip_grad=0.0,
             fused_lamb_legacy=False):

    self.fused_lamb_legacy = fused_lamb_legacy

    if torch.distributed.get_rank() == 0:
        logger.info(f'Fused Lamb Legacy : {self.fused_lamb_legacy}')

    if not torch.cuda.is_available():
        raise SystemError("Cannot use fp16 without CUDA.")
    self.optimizer = init_optimizer

    # param groups
    self.fp16_groups = []
    self.fp32_groups = []

    # loop to deal with groups
    for i, param_group in enumerate(self.optimizer.param_groups):
        # fp16 weights that represent the actual model weights
        self.fp16_groups.append(param_group['params'])

        # Create an fp32 copy of the weights that will be updated first,
        # then copied to the fp16 weights.
        fp32_group = [p.clone().float().detach() for p in param_group['params']]

        # in case the internal optimizer needs it
        for p in fp32_group:
            p.requires_grad = True

        # Set the param groups in the optimizer to point to fp32.
        # Note these are not the weights used by the model; the model uses
        # the fp16 versions that we added to fp16_groups.
        self.fp32_groups.append(fp32_group)
        param_group['params'] = self.fp32_groups[i]

    # we may have a way of fusing dynamic scale. Do not support for now
    if dynamic_loss_scale:
        self.dynamic_loss_scale = True
        self.cur_iter = 0
        self.last_overflow_iter = -1
        self.scale_factor = 2.0

        if dynamic_loss_args is None:
            self.cur_scale = 1.0 * 2**16
            self.scale_window = 1000
            self.min_loss_scale = 0.25
        else:
            self.cur_scale = dynamic_loss_args[INITIAL_LOSS_SCALE]
            self.scale_window = dynamic_loss_args[SCALE_WINDOW]
            self.min_loss_scale = dynamic_loss_args[MIN_LOSS_SCALE]
    else:
        self.dynamic_loss_scale = False
        self.cur_iter = 0
        self.cur_scale = static_loss_scale

    self.verbose = verbose

    self.clip_grad = clip_grad
    self.norm_type = 2

    TORCH_MAJOR = int(torch.__version__.split('.')[0])
    TORCH_MINOR = int(torch.__version__.split('.')[1])
    if TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
        self.clip_grad_norm = torch.nn.utils.clip_grad_norm
    else:
        self.clip_grad_norm = torch.nn.utils.clip_grad_norm_

    # Keep the mpu handle so norm computations and overflow checks are
    # model-parallel aware (the original discarded the argument).
    self.mpu = mpu

    self.overflow = False
    self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu)