def tpu_train(self, tpu_core_idx, model):
    # put model on tpu
    model.to(xm.xla_device())

    # get the appropriate tpu ranks
    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()

    # avoid duplicating progress bar
    self.progress_bar_refresh_rate = self.progress_bar_refresh_rate if self.tpu_global_core_rank == 0 else 0

    # track current tpu
    self.current_tpu_idx = tpu_core_idx
    self.proc_rank = self.tpu_local_core_rank
    set_proc_rank(self.proc_rank)

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # init 16 bit for TPU
    if self.precision == 16:
        os.environ['XLA_USE_BF16'] = str(1)

    log.info(f'INIT TPU local core: {self.tpu_local_core_rank},'
             f' global rank: {self.tpu_global_core_rank}')

    # continue training routine
    self.run_pretrain_routine(model)

    self.save_spawn_weights(model)
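# Usage sketch (illustrative, not part of the trainer source): `tpu_train` is a
# per-core entry point, so it is normally launched once per TPU core through
# torch_xla's multiprocessing helper, which passes the core index as the first
# positional argument. `trainer` and `model` below stand for a configured Trainer
# and LightningModule and are assumptions for the example.
def _spawn_tpu_training(trainer, model, num_cores=8):
    import torch_xla.distributed.xla_multiprocessing as xmp

    # each child process runs trainer.tpu_train(core_idx, model)
    xmp.spawn(trainer.tpu_train, args=(model,), nprocs=num_cores, start_method='fork')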
def ddp_train(self, process_idx, model):
    """
    Entry point into a DDP process (one per GPU / per node process).

    :param process_idx: index of this process on the current node
    :param model: the LightningModule to train
    :return:
    """
    # node rank using relative slurm id
    # otherwise default to node rank 0
    try:
        node_id = os.environ['SLURM_NODEID']
        self.node_rank = int(node_id)
    except Exception:
        self.node_rank = 0

    # show progressbar only on progress_rank 0
    self.progress_bar_refresh_rate = (
        self.progress_bar_refresh_rate if self.node_rank == 0 and process_idx == 0 else 0
    )

    # determine which process we are and world size
    if self.use_ddp:
        self.proc_rank = self.node_rank * self.num_processes + process_idx
        self.world_size = self.num_nodes * self.num_processes

    elif self.use_ddp2:
        self.proc_rank = self.node_rank
        self.world_size = self.num_nodes

    # set warning rank
    set_proc_rank(self.proc_rank)

    # let the exp know the rank to avoid overwriting logs
    if self.logger is not None:
        self.logger.rank = self.proc_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    model.trainer = self
    model.init_ddp_connection(self.proc_rank, self.world_size)

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # MODEL
    # copy model to each gpu
    if self.on_gpu:
        self.root_gpu = self.data_parallel_device_ids[process_idx]
        torch.cuda.set_device(self.root_gpu)
        model.cuda(self.root_gpu)

    # set model properties before going into wrapper
    self.copy_trainer_model_properties(model)

    # AMP
    # run through amp wrapper before going to distributed DP
    if self.use_amp:
        # wrap the model and optimizers with apex amp
        model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
        self.optimizers = optimizers

    # DDP2 uses all GPUs on the machine
    if self.distributed_backend == 'ddp':
        device_ids = [self.root_gpu]
    elif self.use_ddp2:
        device_ids = self.data_parallel_device_ids
    else:  # includes ddp_cpu
        device_ids = None

    # allow user to configure ddp
    model = model.configure_ddp(model, device_ids)

    # continue training routine
    self.run_pretrain_routine(model)

    # when ddp ends, we save the model
    self.save_spawn_weights(model)
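# Sketch of what `model.init_ddp_connection(...)` typically amounts to (an
# assumption for illustration; the real hook lives on the LightningModule):
# rank 0's address and port are shared via the MASTER_ADDR / MASTER_PORT
# environment variables and every process joins the same torch.distributed
# process group. `_init_ddp_connection_sketch` is a hypothetical helper name.
def _init_ddp_connection_sketch(proc_rank, world_size, backend='nccl'):
    import os
    import torch.distributed as dist

    # fall back to localhost / a fixed port when no cluster environment sets them
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '12910')

    # every rank calls this with the same world_size; it blocks until all ranks join
    dist.init_process_group(backend, rank=proc_rank, world_size=world_size)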
def ddp_train(self, process_idx, model):
    """
    Entry point into a DDP process (one per GPU / per node process).

    :param process_idx: index of this process on the current node
    :param model: the LightningModule to train
    :return:
    """
    # node rank using relative slurm id if under slurm management
    # otherwise use given node rank or default to node rank 0
    try:
        node_id = os.environ['SLURM_NODEID'] if self.is_slurm_managing_tasks else os.environ['NODE_RANK']
        self.node_rank = int(node_id)
    except KeyError:
        log.warning("SLURM_NODEID or NODE_RANK environment variable is not defined. Defaulting to node rank 0.")
        self.node_rank = 0

    # show progressbar only on progress_rank 0
    if (self.node_rank != 0 or process_idx != 0) and self.progress_bar_callback is not None:
        self.progress_bar_callback.disable()

    # determine which process we are and world size
    if self.use_ddp:
        self.proc_rank = self.node_rank * self.num_processes + process_idx
        self.world_size = self.num_nodes * self.num_processes

    elif self.use_ddp2:
        self.proc_rank = self.node_rank
        self.world_size = self.num_nodes

    # set warning rank
    set_proc_rank(self.proc_rank)

    # let the exp know the rank to avoid overwriting logs
    if self.logger is not None:
        self.logger.rank = self.proc_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    model.trainer = self
    model.init_ddp_connection(self.proc_rank, self.world_size, self.is_slurm_managing_tasks)

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # MODEL
    # copy model to each gpu
    if self.on_gpu:
        self.root_gpu = process_idx
        torch.cuda.set_device(self.root_gpu)
        model.cuda(self.root_gpu)

    # set model properties before going into wrapper
    self.copy_trainer_model_properties(model)

    # AMP
    # run through amp wrapper before going to distributed DP
    # TODO: remove in v0.8.0
    if self.use_amp and not self.use_native_amp:
        model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
        self.optimizers = optimizers

    # DDP2 uses all GPUs on the machine
    if self.distributed_backend == 'ddp':
        device_ids = [self.root_gpu]
    elif self.use_ddp2:
        device_ids = self.data_parallel_device_ids
    else:  # includes ddp_cpu
        device_ids = None

    # allow user to configure ddp
    model = model.configure_ddp(model, device_ids)

    # continue training routine
    self.run_pretrain_routine(model)

    # when ddp ends, we save the model
    self.save_spawn_weights(model)
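# Usage sketch (illustrative, not part of the trainer source): when DDP is run by
# spawning subprocesses rather than by an external launcher, an entry point like
# `ddp_train` is handed to torch.multiprocessing.spawn, which passes the process
# index as the first positional argument. `trainer` and `model` are assumptions
# for the example, as is the helper name below.
def _spawn_ddp_training(trainer, model):
    import torch.multiprocessing as mp

    # one child process per requested local process;
    # each child runs trainer.ddp_train(process_idx, model)
    mp.spawn(trainer.ddp_train, nprocs=trainer.num_processes, args=(model,))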
def horovod_train(self, model):
    # Horovod: initialize library
    hvd.init()

    if torch.cuda.is_available() and self.on_gpu:
        # Horovod: pin GPU to local rank
        torch.cuda.set_device(hvd.local_rank())
        model.cuda(hvd.local_rank())

    # Only show progress bar from the first worker
    self.progress_bar_refresh_rate = self.progress_bar_refresh_rate if hvd.rank() == 0 else 0

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # Horovod: scale the learning rate by the number of workers to account for
    # increased total batch size
    for optimizer in self.optimizers:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= hvd.size()

    if self.use_amp:
        # wrap the model and optimizers with apex amp
        model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
        self.optimizers = optimizers

    # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for optimizer in self.optimizers:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    def filter_named_parameters(model, optimizer):
        opt_params = set([p for group in optimizer.param_groups for p in group.get('params', [])])
        return [(name, p) for name, p in model.named_parameters() if p in opt_params]

    # Horovod: wrap optimizers to perform gradient aggregation via allreduce
    self.optimizers = [
        hvd.DistributedOptimizer(optimizer, named_parameters=filter_named_parameters(model, optimizer))
        for optimizer in self.optimizers
    ]

    # Update logger rank info from Horovod to avoid race conditions from different ranks
    # creating directories / writing files in the same locations.
    self.proc_rank = hvd.rank()
    set_proc_rank(self.proc_rank)
    if self.logger:
        self.logger.rank = self.proc_rank
    if model.logger:
        model.logger.rank = self.proc_rank

    with ExitStack() as stack:
        for optimizer in self.optimizers:
            # Synchronization will be performed explicitly following backward()
            stack.enter_context(optimizer.skip_synchronize())

        self.run_pretrain_routine(model)
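# Usage sketch (illustrative): `horovod_train` assumes the script was started by a
# Horovod launcher, e.g. one process per GPU across two 4-GPU hosts:
#
#     horovodrun -np 8 -H host1:4,host2:4 python train.py
#
# hvd.init() then picks up rank/size from the launcher's environment, so no
# explicit process-group setup is needed inside the function.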