def test_detect():
    """Test the detection of a SLURM environment configuration."""
    with mock.patch.dict(os.environ, {}, clear=True):
        assert not SLURMEnvironment.detect()
    with mock.patch.dict(os.environ, {"SLURM_NTASKS": "2"}):
        assert SLURMEnvironment.detect()
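# A minimal usage sketch (not from the sources above): SLURMEnvironment.detect() is a classmethod,
# so it can gate whether the SLURM plugin is passed to the Trainer explicitly; the Trainer(plugins=...)
# form is the one used in the other snippets in this collection.
from pytorch_lightning import Trainer
from pytorch_lightning.plugins.environments import SLURMEnvironment

plugins = [SLURMEnvironment()] if SLURMEnvironment.detect() else []
trainer = Trainer(plugins=plugins)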
def _is_slurm_managing_tasks(self) -> bool:
    """Used when choosing the cluster environment."""
    if not SLURMEnvironment.detect() or SLURMEnvironment.job_name() == "bash":
        return False
    total_requested_devices = len(self._parallel_devices) * self._num_nodes_flag
    num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0)
    return num_slurm_tasks == total_requested_devices
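# Illustration of the condition checked above, with hypothetical numbers: requesting 4 devices per node
# on 2 nodes means SLURM must have launched exactly 8 tasks (e.g. srun --nodes=2 --ntasks-per-node=4)
# for SLURM to be treated as managing the processes. Interactive sessions are excluded via the
# job_name() == "bash" check above.
total_requested_devices = 4 * 2  # len(self._parallel_devices) * self._num_nodes_flag
num_slurm_tasks = 8              # int(os.environ["SLURM_NTASKS"])
assert num_slurm_tasks == total_requested_devices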
def test_default_attributes():
    """Test the default attributes when no environment variables are set."""
    env = SLURMEnvironment()
    assert env.creates_children()
    assert env.master_address() == "127.0.0.1"
    assert env.master_port() == 12910
    assert env.world_size() is None
    with pytest.raises(KeyError):
        # local rank is required to be passed as env variable
        env.local_rank()
    with pytest.raises(KeyError):
        # node_rank is required to be passed as env variable
        env.node_rank()
def environment_combinations():
    expected = dict(global_rank=3, local_rank=1, node_rank=1, world_size=4)

    # Lightning
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "LOCAL_RANK": "1",
        "NODE_RANK": "1",
        "WORLD_SIZE": "8",
    }
    environment = LightningEnvironment()
    yield environment, variables, expected

    # SLURM
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_LOCALID": "1",
        "SLURM_NODEID": "1",
        "SLURM_PROCID": "3",
        "SLURM_NTASKS": "4",
    }
    environment = SLURMEnvironment()
    yield environment, variables, expected

    # TorchElastic
    variables = {
        "CUDA_VISIBLE_DEVICES": "0,1,2,4",
        "LOCAL_RANK": "1",
        "GROUP_RANK": "1",
        "RANK": "3",
        "WORLD_SIZE": "4",
        "LOCAL_WORLD_SIZE": "2",
    }
    environment = TorchElasticEnvironment()
    yield environment, variables, expected
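# A sketch (not the original consumer) of how one of the yielded triples above could be exercised:
# under the patched variables, the SLURM accessors read the SLURM_* variables directly. The
# LightningEnvironment triple relies on the Trainer/strategy to propagate ranks, so it is skipped here.
# The test name is made up for illustration.
import os
from unittest import mock

def test_slurm_combination_sketch():
    gen = environment_combinations()
    next(gen)  # skip the LightningEnvironment triple
    environment, variables, expected = next(gen)  # the SLURM triple
    with mock.patch.dict(os.environ, variables):
        assert environment.global_rank() == expected["global_rank"]  # from SLURM_PROCID
        assert environment.local_rank() == expected["local_rank"]    # from SLURM_LOCALID
        assert environment.node_rank() == expected["node_rank"]      # from SLURM_NODEID
        assert environment.world_size() == expected["world_size"]    # from SLURM_NTASKS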
def test_default_attributes():
    """Test the default attributes when no environment variables are set."""
    env = SLURMEnvironment()
    assert env.creates_processes_externally
    assert env.main_address == "127.0.0.1"
    assert env.main_port == 12910
    assert env.job_name() is None
    assert env.job_id() is None

    with pytest.raises(KeyError):
        # world size is required to be passed as env variable
        env.world_size()
    with pytest.raises(KeyError):
        # local rank is required to be passed as env variable
        env.local_rank()
    with pytest.raises(KeyError):
        # node_rank is required to be passed as env variable
        env.node_rank()
def test_attributes_from_environment_variables():
    """Test that the SLURM cluster environment takes the attributes from the environment variables."""
    env = SLURMEnvironment()
    assert env.master_address() == "1.1.1.1"
    assert env.master_port() == 15000 + 1234
    assert env.world_size() is None
    assert env.local_rank() == 2
    assert env.node_rank() == 3
def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
    if isinstance(self._cluster_environment_flag, ClusterEnvironment):
        return self._cluster_environment_flag
    if self._is_slurm_managing_tasks():
        rank_zero_info("Multiprocessing is handled by SLURM.")
        return SLURMEnvironment()
    for env_type in (BaguaEnvironment, TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment):
        if env_type.detect():
            return env_type()
    return LightningEnvironment()
def select_cluster_environment(self) -> ClusterEnvironment:
    if self._cluster_environment is not None:
        return self._cluster_environment
    if self.is_slurm_managing_tasks:
        env = SLURMEnvironment()
    elif TorchElasticEnvironment.is_using_torchelastic():
        env = TorchElasticEnvironment()
    else:
        env = LightningEnvironment()
    return env
def _is_slurm_managing_tasks(self) -> bool:
    """Returns whether we let SLURM manage the processes or not.

    Returns ``True`` if and only if these conditions match:

        - A SLURM cluster is detected
        - A distributed plugin is being used
        - The process is not launching in interactive mode
        - The number of tasks in SLURM matches the requested number of devices and nodes in the Trainer
    """
    if (
        (not self.use_ddp and not self.use_ddp2)
        or not SLURMEnvironment.detect()
        or SLURMEnvironment.job_name() == "bash"  # in interactive mode we don't manage tasks
    ):
        return False
    total_requested_devices = (self.num_gpus or self.num_processes) * self.num_nodes
    num_slurm_tasks = int(os.environ["SLURM_NTASKS"], 0)
    return num_slurm_tasks == total_requested_devices
def test_signal_handlers_restored_in_teardown():
    """Test that the SignalConnector restores the previously configured handler on teardown."""
    assert signal.getsignal(signal.SIGTERM) is signal.SIG_DFL

    trainer = Trainer(plugins=SLURMEnvironment())
    connector = SignalConnector(trainer)
    connector.register_signal_handlers()

    assert signal.getsignal(signal.SIGTERM) is not signal.SIG_DFL
    connector.teardown()
    assert signal.getsignal(signal.SIGTERM) is signal.SIG_DFL
def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
    if isinstance(self._cluster_environment_flag, ClusterEnvironment):
        return self._cluster_environment_flag
    if self._is_slurm_managing_tasks():
        rank_zero_info("Multiprocessing is handled by SLURM.")
        return SLURMEnvironment()
    for env_type in (BaguaEnvironment, TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment):
        if env_type.detect():
            # Ignore type error because it is a false positive: https://github.com/python/mypy/issues/13044
            return env_type()  # type: ignore[abstract]
    return LightningEnvironment()
def select_cluster_environment(self) -> ClusterEnvironment:
    if self._cluster_environment is not None:
        return self._cluster_environment
    if self._is_slurm_managing_tasks():
        rank_zero_info("Multiprocessing is handled by SLURM.")
        return SLURMEnvironment()
    for env_type in (TorchElasticEnvironment, KubeflowEnvironment, LSFEnvironment):
        if env_type.detect():
            return env_type()
    return LightningEnvironment()
def select_cluster_environment(self):
    if self.cluster_environment is not None:
        return self.cluster_environment
    if self.is_slurm_managing_tasks:
        env = SLURMEnvironment()
    elif self.is_using_torchelastic:
        env = TorchElasticEnvironment()
        # TODO: decouple DDP from TE
        #   maybe introduce a DefaultEnvironment?
        os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
    else:
        # TODO: maybe introduce a DefaultEnvironment?
        env = TorchElasticEnvironment()
    return env
def select_cluster_environment(self) -> ClusterEnvironment:
    if self._cluster_environment is not None:
        return self._cluster_environment
    if self._is_slurm_managing_tasks():
        env = SLURMEnvironment()
        rank_zero_info("Multiprocessing is handled by SLURM.")
    elif TorchElasticEnvironment.is_using_torchelastic():
        env = TorchElasticEnvironment()
    elif KubeflowEnvironment.is_using_kubeflow():
        env = KubeflowEnvironment()
    elif LSFEnvironment.is_using_lsf():
        env = LSFEnvironment()
    else:
        env = LightningEnvironment()
    return env
def select_cluster_environment(self) -> ClusterEnvironment:
    if self._cluster_environment is not None:
        return self._cluster_environment
    if self.is_slurm_managing_tasks:
        env = SLURMEnvironment()
        # TODO: decouple DDP from SLURM
        #   refactor and let generic cluster env hold the information about who spawns the processes
        os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
    elif self.is_using_torchelastic:
        env = TorchElasticEnvironment()
        # TODO: decouple DDP from TE
        #   refactor and let generic cluster env hold the information about who spawns the processes
        os.environ["PL_IN_DDP_SUBPROCESS"] = "1"
    else:
        # TODO: maybe introduce a DefaultEnvironment?
        env = TorchElasticEnvironment()
    return env
def test_auto_requeue_flag(auto_requeue):
    trainer = Trainer(plugins=[SLURMEnvironment(auto_requeue=auto_requeue)])
    connector = SignalConnector(trainer)
    connector.register_signal_handlers()

    if auto_requeue:
        sigterm_handlers = signal.getsignal(signal.SIGTERM).signal_handlers
        assert len(sigterm_handlers) == 1
        assert sigterm_handlers[0].__qualname__ == "SignalConnector.sigterm_handler_fn"

        sigusr1_handlers = signal.getsignal(signal.SIGUSR1).signal_handlers
        assert len(sigusr1_handlers) == 1
        assert sigusr1_handlers[0].__qualname__ == "SignalConnector.slurm_sigusr1_handler_fn"
    else:
        assert signal.getsignal(signal.SIGTERM) is signal.SIG_DFL
        assert signal.getsignal(signal.SIGUSR1) is signal.SIG_DFL

    connector.teardown()
def process_args(args, comm=None):
    """
    Process arguments for running inference
    """
    conf_args = process_config(args.config)
    for k, v in vars(conf_args).items():
        if not hasattr(args, k):
            setattr(args, k, v)

    logger = args.logger
    # set up logger
    if logger is None:
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
    if args.debug:
        logger.setLevel(logging.DEBUG)
    args.logger = logger

    rank = 0
    size = 1
    local_rank = 0
    env = None
    if args.slurm:
        env = SLURMEnvironment()
    elif args.lsf:
        env = LSFEnvironment()

    if env is not None:
        local_rank = env.local_rank()
        rank = env.global_rank()
        size = env.world_size()

    # Figure out the checkpoint file to read from
    # and where to save outputs to
    if args.output is None:
        if os.path.isdir(args.checkpoint):
            ckpt = list(glob.glob(f"{args.checkpoint}/*.ckpt"))
            if len(ckpt) == 0:
                print(f'No checkpoint file found in {args.checkpoint}', file=sys.stderr)
                sys.exit(1)
            elif len(ckpt) > 1:
                print(f'More than one checkpoint file found in {args.checkpoint}. '
                      'Please specify checkpoint with -c', file=sys.stderr)
                sys.exit(1)
            args.checkpoint = ckpt[0]
        outdir = args.checkpoint
        if outdir.endswith('.ckpt'):
            outdir = outdir[:-5]
        if not os.path.isdir(outdir):
            os.mkdir(outdir)
        args.output = os.path.join(outdir, 'outputs.h5')

    # setting classify so that we can get labels when we load data.
    # We do this here because we assume that the network is going to output
    # features, and we want to use the labels for downstream analysis
    args.classify = True

    # load the model and override batch size
    model = process_model(args, inference=True)
    model.set_inference(True)
    if args.batch_size is not None:
        model.hparams.batch_size = args.batch_size

    args.n_outputs = model.hparams.n_outputs
    args.save_seq_ids = model.hparams.window is not None

    # remove ResNet features
    if args.resnet_features:
        if 'ResNet' not in model.__class__.__name__:
            raise ValueError("Cannot use -f without ResNet model - got %s" % model.__class__.__name__)
        from .models.resnet import ResNetFeatures
        args.n_outputs = model.fc.in_features if isinstance(model.fc, nn.Linear) else model.fc[0].in_features
        model = ResNetFeatures(model)
        args.features = True

    if size > 1:
        dataset = LazySeqDataset(path=args.input, hparams=argparse.Namespace(**model.hparams),
                                 keep_open=True, comm=comm, size=size, rank=rank)
    else:
        dataset = LazySeqDataset(path=args.input, hparams=argparse.Namespace(**model.hparams), keep_open=True)

    tot_bases = dataset.orig_difile.get_seq_lengths().sum()
    args.logger.info(f'rank {rank} - processing {tot_bases} bases across {len(dataset)} samples')

    tmp_dset = dataset

    kwargs = dict(batch_size=args.batch_size, shuffle=False)
    if args.num_workers > 0:
        kwargs['num_workers'] = args.num_workers
        kwargs['multiprocessing_context'] = 'spawn'
        kwargs['worker_init_fn'] = dataset.worker_init
        kwargs['persistent_workers'] = True
    loader = get_loader(tmp_dset, inference=True, **kwargs)

    args.difile = dataset.difile

    # return the model, any arguments, and Lightning Trainer args just in case
    # we want to use them down the line when we figure out how to use Lightning
    # for inference
    if not args.features:
        model = nn.Sequential(model, nn.Softmax(dim=1))
    model.eval()

    ret = [model, dataset, loader, args, env]

    if size > 1:
        args.device = torch.device('cuda:%d' % local_rank)
    else:
        args.device = torch.device('cuda')

    return tuple(ret)
for key in sd.keys():
    if not (key in this_sd):
        missing_params.append(key)
excessive_params = []
for key in this_sd.keys():
    if not (key in sd):
        excessive_params.append(key)
print('Loaded chkpt key not in this model:', missing_params[:10])
print('This model key not in chkpt:', excessive_params[:10])
model.af2.load_state_dict(sd)

if args.precision == 'bf16':
    model = model.to(dtype=torch.bfloat16)

if "SLURM_JOB_ID" in os.environ:
    cluster_environment = SLURMEnvironment()
else:
    cluster_environment = None

trainer = pl.Trainer(
    accelerator="gpu",
    gpus=args.num_gpus,
    logger=logger,
    max_steps=args.max_iter,
    num_nodes=args.num_nodes,
    # strategy=CustomDDPPlugin(find_unused_parameters=False),
    strategy=DeepSpeedPlugin(config=args.deepspeed_config_path, load_full_weights=True),
    accumulate_grad_batches=args.num_accum,
    gradient_clip_val=0.1,
    gradient_clip_algorithm='norm',
def test_main_address_from_slurm_node_list(slurm_node_list, expected):
    """Test extracting the main node from different formats for the SLURM_NODELIST."""
    with mock.patch.dict(os.environ, {"SLURM_NODELIST": slurm_node_list}):
        env = SLURMEnvironment()
        assert env.main_address == expected
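# The parametrization driving the test above is not part of this excerpt. A hypothetical sketch,
# assuming the first node in SLURM_NODELIST becomes the main address and bracketed ranges resolve
# to their first member (the test name and case values below are illustrative, not the originals):
import os
from unittest import mock

import pytest

@pytest.mark.parametrize(
    "slurm_node_list,expected",
    [
        ("alpha,beta,gamma", "alpha"),
        ("alpha beta gamma", "alpha"),
        ("1.2.3.[100-110]", "1.2.3.100"),
    ],
)
def test_main_address_from_slurm_node_list_sketch(slurm_node_list, expected):
    with mock.patch.dict(os.environ, {"SLURM_NODELIST": slurm_node_list}):
        assert SLURMEnvironment().main_address == expected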
def test_attributes_from_environment_variables(caplog):
    """Test that the SLURM cluster environment takes the attributes from the environment variables."""
    env = SLURMEnvironment()
    assert env.auto_requeue is True
    assert env.main_address == "1.1.1.1"
    assert env.main_port == 15000 + 1234
    assert env.job_id() == int("0001234")
    assert env.world_size() == 20
    assert env.global_rank() == 1
    assert env.local_rank() == 2
    assert env.node_rank() == 3
    assert env.job_name() == "JOB"

    # setter should be no-op
    with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"):
        env.set_global_rank(100)
    assert env.global_rank() == 1
    assert "setting global rank is not allowed" in caplog.text

    caplog.clear()

    with caplog.at_level(logging.DEBUG, logger="pytorch_lightning.plugins.environments"):
        env.set_world_size(100)
    assert env.world_size() == 20
    assert "setting world size is not allowed" in caplog.text
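# The assertions above imply the environment variables the test presumably runs under (e.g. via
# mock.patch.dict). Reconstructed here as a sketch: the variable names are taken from the other
# snippets in this collection, and the values are inferred from the asserted results -- main_port
# looks like 15000 + the trailing digits of SLURM_JOB_ID, and main_address like the first host in
# SLURM_NODELIST.
_presumed_slurm_environ = {
    "SLURM_NODELIST": "1.1.1.1, 1.1.1.2",  # any list whose first host is 1.1.1.1 would match
    "SLURM_JOB_ID": "0001234",
    "SLURM_NTASKS": "20",
    "SLURM_PROCID": "1",
    "SLURM_LOCALID": "2",
    "SLURM_NODEID": "3",
    "SLURM_JOB_NAME": "JOB",
}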
@RunIf(skip_windows=True)
def test_sync_batchnorm_set_in_custom_strategy(tmpdir):
    """Tests if layer_sync is automatically set for custom strategy."""

    class CustomParallelStrategy(DDPStrategy):
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            # Set to None so it will be overwritten by the accelerator connector.
            self._layer_sync = None

    strategy = CustomParallelStrategy()
    assert strategy._layer_sync is None
    Trainer(strategy=strategy, sync_batchnorm=True)
    assert isinstance(strategy._layer_sync, NativeSyncBatchNorm)


@pytest.mark.parametrize(
    ["plugins", "expected"],
    [
        ([LightningEnvironment(), SLURMEnvironment()], "ClusterEnvironment"),
        ([TorchCheckpointIO(), TorchCheckpointIO()], "CheckpointIO"),
        (
            [PrecisionPlugin(), DoublePrecisionPlugin(), LightningEnvironment(), SLURMEnvironment()],
            "PrecisionPlugin, ClusterEnvironment",
        ),
    ],
)
def test_plugin_only_one_instance_for_one_type(plugins, expected):
    with pytest.raises(MisconfigurationException, match=f"Received multiple values for {expected}"):
        Trainer(plugins=plugins)
def process_args(args=None, return_io=False):
    """
    Process arguments for running training
    """
    if not isinstance(args, argparse.Namespace):
        args = parse_args(args)

    args.loader_kwargs = dict()

    targs = dict(max_epochs=args.epochs)
    targs['accumulate_grad_batches'] = args.accumulate

    env = None

    if args.ipu:
        targs['accelerator'] = 'ipu'
        targs['devices'] = process_gpus(args.gpus)
    else:
        targs['gpus'] = process_gpus(args.gpus)
        targs['num_nodes'] = args.num_nodes
        if args.lsf:
            ##########################################################################################
            # Currently coding against pytorch-lightning 1.4.3
            ##########################################################################################
            if args.num_workers > 4:
                print0("num_workers (-k) > 4 can lead to hanging on Summit -- setting to 4", file=sys.stderr)
                args.num_workers = 4
            args.loader_kwargs['num_workers'] = 1  # Set as a default. This will get overridden elsewhere
            args.loader_kwargs['multiprocessing_context'] = 'spawn'
            env = LSFEnvironment()
        elif args.slurm:
            env = SLURMEnvironment()

        if env is not None:
            global RANK
            global SIZE
            try:
                RANK = env.global_rank()
                SIZE = env.world_size()
            except:
                print(">>> Could not get global rank -- setting RANK to 0 and SIZE to 1", file=sys.stderr)
                RANK = 0
                SIZE = 1

        if targs['gpus'] is not None:
            targs['accelerator'] = 'gpu'
            if targs['gpus'] == 1:
                targs['devices'] = 1
            else:
                if env is None:
                    raise ValueError('Please specify environment (--lsf or --slurm) if using more than one GPU')
                # parallel_devices = [torch.device(i) for i in range(torch.cuda.device_count()) if i < targs['gpus']]
                # precision_plugin = NativeMixedPrecisionPlugin(16, 'cuda')
                torch.cuda.set_device(env.local_rank())
                targs['devices'] = targs['gpus']
                targs['strategy'] = DDPStrategy(
                    find_unused_parameters=False,
                    cluster_environment=env,
                    # accelerator=GPUAccelerator(),
                    # parallel_devices=parallel_devices,
                    # precision_plugin=precision_plugin,
                )
                print("---- Rank %s - Using GPUAccelerator with DDPStrategy" % env.global_rank(), file=sys.stderr)
        else:
            targs['accelerator'] = 'cpu'

    del args.gpus

    if args.sanity:
        if isinstance(args.sanity, str):
            args.sanity = int(args.sanity)
        else:
            args.sanity = 4000
        targs['limit_train_batches'] = args.sanity
        targs['limit_val_batches'] = args.sanity // 4

    if args.lr_find:
        targs['auto_lr_find'] = True
    del args.lr_find

    if args.checkpoint is not None:
        if os.path.exists(args.checkpoint):
            targs['resume_from_checkpoint'] = args.checkpoint
        else:
            warnings.warn(f"Ignoring -c/--checkpoint argument because {args.checkpoint} does not exist.")
            args.checkpoint = None

    if args.cuda_profile:
        targs['profiler'] = PyTorchProfiler(filename=f'pytorch_prof.{RANK:0{len(str(SIZE))}}', emit_nvtx=True)

    targs['replace_sampler_ddp'] = False

    args.loader_kwargs = dict()

    # make sure we are classifying if we are adding classifier layers
    # to a resnet features model
    if args.features_checkpoint is not None:
        if args.manifold:
            raise ValueError('Cannot use manifold loss (i.e. -M) if adding classifier (i.e. -F)')
        args.classify = True

    data_mod = DeepIndexDataModule(args, keep_open=True, seed=args.seed + RANK, rank=RANK, size=SIZE)

    # if classification problem, use the number of taxa as the number of outputs
    if args.classify:
        args.n_outputs = data_mod.dataset.n_outputs

    args.input_nc = 136 if args.tnf else len(data_mod.dataset.vocab)

    model = process_model(args, taxa_table=data_mod.dataset.difile.taxa_table)

    if args.num_workers > 0:
        data_mod.dataset.close()

    ret = [model, args, targs]
    if return_io:
        ret.append(io)

    ret.append(data_mod)

    return tuple(ret)