def run_distributed_test(
    device_ids: List[int] = [-1, -1],
    func: Callable = None,
    *args,
    **kwargs,
):
    """
    This runs the `func` in a simulated distributed environment.

    # Parameters

    device_ids: `List[int]`
        List of devices. There need to be at least 2 devices. Default is [-1, -1].
    func: `Callable`
        `func` needs to be global for spawning the processes, so that it can be pickled.
    """
    check_for_gpu(device_ids)
    nprocs = world_size = len(device_ids)
    mp.start_processes(
        init_process,
        args=(device_ids, world_size, func, args, kwargs),
        nprocs=nprocs,
        start_method="fork",
    )
def run_distributed_test(
    device_ids: List[int] = None,
    func: Callable = None,
    *args,
    **kwargs,
):
    """
    This runs the `func` in a simulated distributed environment.

    # Parameters

    device_ids: `List[int]`
        List of devices. There need to be at least 2 devices. Default is [-1, -1].
    func: `Callable`
        `func` needs to be global for spawning the processes, so that it can be pickled.
    """
    device_ids = device_ids or [-1, -1]
    check_for_gpu(device_ids)
    # "fork" start method is the default and should be preferred, except when we're
    # running the tests on GPU, in which case we need to use "spawn".
    start_method = "spawn" if any(x >= 0 for x in device_ids) else "fork"
    nprocs = world_size = len(device_ids)
    mp.start_processes(
        init_process,
        args=(world_size, device_ids, func, args, kwargs),
        nprocs=nprocs,
        start_method=start_method,
    )
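# A minimal usage sketch for `run_distributed_test` above (hypothetical names, not
# from the source): the worker must live at module level so it can be pickled, and
# the exact arguments it receives are determined by `init_process`.

def _my_distributed_worker(*args, **kwargs):
    # Hypothetical module-level worker; its body would exercise torch.distributed
    # collectives once the process group has been initialized by `init_process`.
    pass

# Run on two CPU-only "devices" (the default), simulating a 2-process group.
run_distributed_test([-1, -1], func=_my_distributed_worker)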
def test_exception_all(self):
    with self.assertRaisesRegex(
        Exception,
        "\nValueError: legitimate exception from process (0|1)$",
    ):
        mp.start_processes(
            test_exception_all_func,
            nprocs=2,
            start_method=self.start_method,
        )
def test_first_argument_index(self):
    context = mp.get_context(self.start_method)
    queue = context.SimpleQueue()
    mp.start_processes(
        test_success_single_arg_func,
        args=(queue,),
        nprocs=2,
        start_method=self.start_method,
    )
    self.assertEqual([0, 1], sorted([queue.get(), queue.get()]))
def launch(
    main_func,
    num_gpus_per_machine,
    num_machines=1,
    machine_rank=0,
    backend="nccl",
    dist_url=None,
    args=(),
    timeout=DEFAULT_TIMEOUT,
):
    """
    Args:
        main_func: a function that will be called by `main_func(*args)`
        num_gpus_per_machine (int): number of GPUs to use per machine
        num_machines (int): the total number of machines
        machine_rank (int): the rank of this machine (one per machine)
        backend (str): the distributed backend to use, e.g. "nccl"
        dist_url (str): url to connect to for distributed training, including protocol
                        e.g. "tcp://127.0.0.1:8686".
                        Can be set to "auto" to automatically select a free port on localhost
        args (tuple): arguments passed to main_func
    """
    world_size = num_machines * num_gpus_per_machine
    if world_size > 1:
        # https://github.com/pytorch/pytorch/pull/14391
        # TODO prctl in spawned processes

        if dist_url == "auto":
            assert (
                num_machines == 1
            ), "dist_url=auto cannot work with distributed training."
            port = _find_free_port()
            dist_url = f"tcp://127.0.0.1:{port}"

        start_method = "spawn"
        cache = vars(args[1]).get("cache", False)

        # To use numpy memmap for caching images into RAM, we have to use the fork method.
        if cache:
            assert sys.platform != "win32", (
                "As the Windows platform doesn't support the fork method, "
                "do not add --cache to your training command."
            )
            start_method = "fork"

        mp.start_processes(
            _distributed_worker,
            nprocs=num_gpus_per_machine,
            args=(
                main_func,
                world_size,
                num_gpus_per_machine,
                machine_rank,
                backend,
                dist_url,
                args,
            ),
            daemon=False,
            start_method=start_method,
        )
    else:
        main_func(*args)
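# `_find_free_port` is referenced above but not shown. A common implementation
# (a sketch, not necessarily the one used in this project) asks the OS for an
# unused port by binding to port 0:

import socket

def _find_free_port():
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Binding to port 0 lets the OS pick an available port for us.
    sock.bind(("", 0))
    port = sock.getsockname()[1]
    sock.close()
    # Note: the port could still be taken by another process before it is used.
    return port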
def test_terminate_exit(self):
    exitcode = 123
    with self.assertRaisesRegex(
        Exception,
        "process 0 terminated with exit code %d" % exitcode,
    ):
        mp.start_processes(
            test_terminate_exit_func,
            args=(exitcode,),
            nprocs=2,
            start_method=self.start_method,
        )
def test_success_first_then_exception(self):
    exitcode = 123
    with self.assertRaisesRegex(
        Exception,
        "ValueError: legitimate exception",
    ):
        mp.start_processes(
            test_success_first_then_exception_func,
            args=(exitcode,),
            nprocs=2,
            start_method=self.start_method,
        )
def test_exception_single(self):
    nprocs = 2
    for i in range(nprocs):
        with self.assertRaisesRegex(
            Exception,
            "\nValueError: legitimate exception from process %d$" % i,
        ):
            mp.start_processes(
                test_exception_single_func,
                args=(i,),
                nprocs=nprocs,
                start_method=self.start_method,
            )
def test_nested(self):
    context = mp.get_context(self.start_method)
    pids_queue = context.Queue()
    nested_child_sleep = 20.0
    mp_context = mp.start_processes(
        fn=test_nested,
        args=(pids_queue, nested_child_sleep, self.start_method),
        nprocs=1,
        join=False,
        daemon=False,
        start_method=self.start_method,
    )

    # Wait for nested children to terminate in time
    pids = pids_queue.get()
    start = time.time()
    while len(pids) > 0:
        for pid in pids:
            try:
                os.kill(pid, 0)
            except ProcessLookupError:
                pids.remove(pid)
                break

        # This assert fails if any nested child process is still
        # alive after (nested_child_sleep / 2) seconds. By
        # extension, this test times out with an assertion error
        # after (nested_child_sleep / 2) seconds.
        self.assertLess(time.time() - start, nested_child_sleep / 2)

        time.sleep(0.1)
def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
    spec = worker_group.spec
    store = worker_group.store
    master_addr, master_port = super()._get_master_addr_port(store)
    restart_count = spec.max_restarts - self._remaining_restarts

    dist_infos: Dict[int, _DistInfo] = {}
    for worker in worker_group.workers:
        local_rank = worker.local_rank
        dist_infos[local_rank] = _DistInfo(
            worker.global_rank,
            worker_group.group_rank,
            worker.world_size,
            master_addr,
            master_port,
            restart_count,
            spec.max_restarts,
        )

    self._ret_vals.clear()
    self._process_context = mp.start_processes(
        fn=_wrap,
        args=(self._ret_vals, dist_infos, spec.fn, spec.args),
        nprocs=spec.local_world_size,
        join=False,
        daemon=False,
        start_method=self._start_method,
    )

    return {
        local_rank: pid
        for local_rank, pid in enumerate(self._process_context.pids())
    }
def test_terminate_signal(self):
    # SIGABRT is aliased with SIGIOT
    message = "process 0 terminated with signal (SIGABRT|SIGIOT)"

    # Termination through a signal is expressed as a negative exit code
    # in multiprocessing, so we know it was a signal that caused the exit.
    # This doesn't appear to exist on Windows, where the exit code is always
    # positive, and therefore results in a different exception message.
    # Exit code 22 means "ERROR_BAD_COMMAND".
    if IS_WINDOWS:
        message = "process 0 terminated with exit code 22"

    with self.assertRaisesRegex(Exception, message):
        mp.start_processes(
            test_terminate_signal_func,
            nprocs=2,
            start_method=self.start_method,
        )
def test_success_non_blocking(self):
    mp_context = mp.start_processes(
        test_success_func,
        nprocs=2,
        join=False,
        start_method=self.start_method,
    )

    # After all processes (nprocs=2) have joined it must return True
    mp_context.join(timeout=None)
    mp_context.join(timeout=None)
    self.assertTrue(mp_context.join(timeout=None))
def spawn(
    fn: Callable,
    args: Tuple,
    kwargs_dict: Optional[Mapping] = None,
    nproc_per_node: int = 1,
    nnodes: int = 1,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 2222,
    backend: str = "nccl",
    **kwargs,
):
    world_size = nnodes * nproc_per_node

    spawn_kwargs = {
        "join": kwargs.get("join", True),
        "daemon": kwargs.get("daemon", False),
    }
    # start_method in pytorch >= 1.5
    if LooseVersion(torch.__version__) >= LooseVersion("1.5.0"):
        spawn_kwargs["start_method"] = kwargs.get("start_method", "spawn")

    mp.start_processes(
        _NativeDistModel._dist_worker_task_fn,
        nprocs=nproc_per_node,
        args=(
            backend,
            fn,
            args,
            kwargs_dict,
            world_size,
            nproc_per_node,
            node_rank,
            master_addr,
            master_port,
            kwargs,
        ),
        **spawn_kwargs,
    )
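# A hedged usage sketch for the `spawn` helper above (names are illustrative, not
# from the source): `train_fn` must be a module-level function so it can be
# pickled by the default "spawn" start method, and process-creation options such
# as `join` or `start_method` are forwarded through **kwargs.

spawn(
    train_fn,                  # hypothetical module-level training function
    args=(batch_size,),        # positional args forwarded to train_fn
    kwargs_dict={"lr": 1e-3},  # keyword args forwarded to train_fn
    nproc_per_node=4,
    nnodes=1,
    node_rank=0,
    backend="nccl",
)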
def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] = None, **kwargs: Any) -> Any:
    """Launches processes that run the given function in parallel.

    The function is allowed to have a return value. However, when all processes join, only the return value
    of worker process 0 gets returned from this `launch` method in the main process.

    Arguments:
        function: The entry point for all launched processes.
        *args: Optional positional arguments to be passed to the given function.
        trainer: Optional reference to the :class:`~pytorch_lightning.trainer.trainer.Trainer` for which
            a selected set of attributes get restored in the main process after processes join.
        **kwargs: Optional keyword arguments to be passed to the given function.
    """
    # The default cluster environment in Lightning chooses a random free port number.
    # This needs to be done in the main process here before starting processes to ensure
    # each rank will connect through the same port.
    os.environ["MASTER_PORT"] = str(self._strategy.cluster_environment.main_port)
    context = mp.get_context(self._start_method)
    return_queue = context.SimpleQueue()
    mp.start_processes(
        self._wrapping_function,
        args=(trainer, function, args, kwargs, return_queue),
        nprocs=self._strategy.num_processes,
        start_method=self._start_method,
    )
    worker_output = return_queue.get()
    if trainer is None:
        return worker_output

    self._recover_results_in_main_process(worker_output, trainer)
    return worker_output.trainer_results
def main():
    mp.set_start_method('spawn')
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=4, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--node_rank', default=0, type=int,
                        help='ranking within the nodes')
    args = parser.parse_args()
    #########################################################
    args.world_size = args.gpus * args.nodes
    # os.environ['MASTER_ADDR'] = 'frost-6.las.iastate.edu'
    # os.environ['MASTER_PORT'] = '8888'
    #
    # you must fork, so that environments can be forked again. Do not spawn
    mp.start_processes(single_process_main, nprocs=args.gpus, args=(args,),
                       start_method="spawn")
def _start_processes(self):
    import torch.multiprocessing as mp

    return mp.start_processes(
        fn=BackgroundWorkerDynamicBatchWorld.launch_process,
        nprocs=self._num_workers,
        # note that index is an implied argument added by start_processes
        args=(self.opt, self.get_model_agent(), self._process_queue),
        join=False,
        # launch in fork mode so that we can share the model agent easily
        # note that this prevents us from using ANY threads in ANY of the
        # subprocesses! (See ChunkTeacher for one example). Fortunately, we
        # CAN use threads in the MAIN process, and we exploit this at
        # times.
        start_method='fork',
    )
def _start(self):
    if self._pc:
        raise ValueError(
            "The process context already initialized."
            " Most likely the start method got called twice."
        )
    self._pc = mp.start_processes(
        fn=_wrap,
        args=(
            self.entrypoint,
            self.args,
            self.envs,
            self.stdouts,
            self.stderrs,
            self._ret_vals,
            self._worker_finished_event,
        ),
        nprocs=self.nprocs,
        join=False,
        daemon=False,
        start_method=self.start_method,
    )
def start_processes(params: List[MpParameters], start_method: str = "spawn") -> MpProcessContext:
    r"""
    Launches processes and returns a context object that users can use
    to wait for process results.
    """
    nprocs = len(params)
    ret_val_queues: Dict[int, mp.SimpleQueue] = {
        i: mp.get_context(start_method).SimpleQueue() for i in range(0, nprocs)
    }
    mp_proc_context = mp.start_processes(
        nprocs=nprocs,
        fn=_wrap,
        args=(params, ret_val_queues),
        join=False,
        daemon=False,
        start_method=start_method,
    )
    return MpProcessContext(mp_proc_context, ret_val_queues)
def test_nested(i, pids_queue, nested_child_sleep, start_method):
    context = mp.get_context(start_method)
    nested_child_ready_queue = context.Queue()
    nprocs = 2
    mp_context = mp.start_processes(
        fn=test_nested_child_body,
        args=(nested_child_ready_queue, nested_child_sleep),
        nprocs=nprocs,
        join=False,
        daemon=False,
        start_method=start_method,
    )
    pids_queue.put(mp_context.pids())

    # Wait for both children to have started, to ensure that they
    # have called prctl(2) to register a parent death signal.
    for _ in range(nprocs):
        nested_child_ready_queue.get()

    # Kill self. This should take down the child processes as well.
    os.kill(os.getpid(), signal.SIGTERM)
def test_wrapper_fn_kill_script_process(self):
    """
    tests that the wrapper_fn properly terminates the script process
    (the script process is the sub_sub_process of the agent)
    """
    nprocs = 2
    sleep = 300

    # wraps wrapper_fn to be torch.multiprocessing compatible,
    # which requires rank to be passed as the first argument
    def wrap_wrap(rank, *args):
        launch.wrapper_fn(*args)

    context = start_processes(
        fn=wrap_wrap,
        args=(None, (path("bin/sleep_script.py"), "--sleep", f"{sleep}")),
        nprocs=nprocs,
        join=False,
        start_method="fork",
    )
    # quick check to see that the wrapper_fn started running;
    # without this join() call we don't see an exception on typos
    # and other silly mistakes (silently fails)
    context.join(timeout=-1)

    script_pids = []
    for wrapper_fn_pid in context.pids():
        script_pid = get_child_pids(wrapper_fn_pid)
        # there should only be one child of wrapper_fn
        self.assertEqual(1, len(script_pid))
        script_pids.append(script_pid[0])

    for wrapper_fn_proc in context.processes:
        wrapper_fn_proc.terminate()
        wrapper_fn_proc.join()

    for script_pid in script_pids:
        self.assertFalse(pid_exists(script_pid))
def PrepareMultiprocessing(args, func):
    if args.slurm:
        args.node_rank = int(os.environ["SLURM_NODEID"])
        args.master_addr = os.environ["SLURM_SRUN_COMM_HOST"]
        args.nnodes = int(os.environ["SLURM_STEP_NUM_NODES"])
        args.nproc_per_node = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))

    gpuList = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
    args.available_gpu = [int(gpu) for gpu in gpuList]

    world_size = args.nproc_per_node * args.nnodes
    local_ranks = args.nproc_per_node
    node_rank = args.node_rank

    os.environ["MASTER_ADDR"] = args.master_addr
    os.environ["MASTER_PORT"] = str(args.master_port)
    os.environ["WORLD_SIZE"] = str(world_size)
    # os.environ["NCCL_SOCKET_IFNAME"] = "ib"

    processes = []

    if 'OMP_NUM_THREADS' not in os.environ and args.nproc_per_node > 1:
        os.environ["OMP_NUM_THREADS"] = str(1)
        print("*****************************************\n"
              "Setting OMP_NUM_THREADS environment variable for each process "
              "to be {} in default, to avoid your system being overloaded, "
              "please further tune the variable for optimal performance in "
              "your application as needed. \n"
              "*****************************************".format(os.environ["OMP_NUM_THREADS"]))

    # Spawn worker processes
    if world_size > 1:
        args.distributed = True
        globalLock = mp.Lock()
        prcs = mp.start_processes(func, args=[args, globalLock], nprocs=local_ranks,
                                  start_method='forkserver')
    else:
        args.distributed = False
        func(0, args, mp.Lock())
def main(cfg: DictConfig):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '11337'
    torchmp.start_processes(
        train,
        nprocs=len(cfg.distributed.gpus),
        args=(cfg, Path(hydra.utils.get_original_cwd())),
        start_method="spawn",
    )
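# Because torch.multiprocessing.start_processes invokes its target as fn(i, *args),
# the `train` worker referenced above receives the process index before the args
# tuple. A sketch of the implied signature (not the project's actual implementation):

def train(process_index: int, cfg: DictConfig, original_cwd: Path):
    # process_index would typically be used as the local rank when initializing
    # the torch.distributed process group.
    ...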
def _mp_fork(*args, **kwargs):
    return mp.start_processes(*args, **kwargs, join=False, start_method="fork")
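# Since `_mp_fork` passes join=False, it returns a ProcessContext immediately
# instead of blocking. A minimal usage sketch (hypothetical worker and queue names):

context = _mp_fork(background_worker, args=(task_queue,), nprocs=2)
# ... do other work in the parent process ...
context.join()  # blocks until all forked workers have exited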
def test_success(self):
    mp.start_processes(test_success_func, nprocs=2, start_method=self.start_method)
parser.add_argument("--nproc_per_node", type=int, default=2) parser.add_argument("--log_interval", type=int, default=4) parser.add_argument("--nb_samples", type=int, default=128) parser.add_argument("--batch_size", type=int, default=16) args_parsed = parser.parse_args() assert dist.is_available() if args_parsed.backend == "nccl": assert torch.cuda.is_available() assert dist.is_nccl_available() elif args_parsed.backend == "gloo": assert dist.is_gloo_available() else: raise ValueError( f"unvalid backend `{args_parsed.backend}` (valid: `gloo` or `nccl`)" ) config = { "log_interval": args_parsed.log_interval, "batch_size": args_parsed.batch_size, "nb_samples": args_parsed.nb_samples, } args = (args_parsed.nproc_per_node, args_parsed.backend, config) # Specific torch.distributed start_processes(training, args=args, nprocs=args_parsed.nproc_per_node, start_method="spawn")
def notebook_launcher(function, args=(), num_processes=None, use_fp16=False, use_port="29500"):
    """
    Launches a training function, using several processes if it's possible in the current environment
    (TPU with multiple cores for instance).

    Args:
        function (:obj:`Callable`):
            The training function to execute. If it accepts arguments, the first argument should be the
            index of the process run.
        args (:obj:`Tuple`):
            Tuple of arguments to pass to the function (it will receive :obj:`*args`).
        num_processes (:obj:`int`, `optional`):
            The number of processes to use for training. Will default to 8 in Colab/Kaggle if a TPU is
            available, to the number of GPUs available otherwise.
        use_fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If :obj:`True`, will use mixed precision training on multi-GPU.
        use_port (:obj:`str`, `optional`, defaults to :obj:`"29500"`):
            The port to use to communicate between processes when launching a multi-GPU training.
    """
    # Are we in a google colab or a Kaggle Kernel?
    if "IPython" in sys.modules:
        in_colab_or_kaggle = "google.colab" in str(sys.modules["IPython"].get_ipython())
    elif any(key.startswith("KAGGLE") for key in os.environ.keys()):
        in_colab_or_kaggle = True
    else:
        in_colab_or_kaggle = False

    if in_colab_or_kaggle:
        if os.environ.get("TPU_NAME", None) is not None:
            # TPU launch
            import torch_xla.distributed.xla_multiprocessing as xmp

            if len(AcceleratorState._shared_state) > 0:
                raise ValueError(
                    "To train on TPU in Colab or Kaggle Kernel, the `Accelerator` should only be initialized inside "
                    "your training function. Restart your notebook and make sure no cells initializes an "
                    "`Accelerator`."
                )
            if num_processes is None:
                num_processes = 8

            launcher = PrepareForLaunch(function, distributed_type="TPU")
            print(f"Launching a training on {num_processes} TPU cores.")
            xmp.spawn(launcher, args=args, nprocs=num_processes, start_method="fork")
        else:
            # No need for a distributed launch otherwise as it's either CPU or one GPU.
            if torch.cuda.is_available():
                print("Launching training on one GPU.")
            else:
                print("Launching training on CPU.")
            function(*args)

    else:
        if num_processes is None:
            raise ValueError(
                "You have to specify the number of GPUs you would like to use, add `num_process=...` to your call."
            )

        if num_processes > 1:
            # Multi-GPU launch
            if len(AcceleratorState._shared_state) > 0:
                raise ValueError(
                    "To launch a multi-GPU training from your notebook, the `Accelerator` should only be initialized "
                    "inside your training function. Restart your notebook and make sure no cells initializes an "
                    "`Accelerator`."
                )

            if torch.cuda.is_initialized():
                raise ValueError(
                    "To launch a multi-GPU training from your notebook, you need to avoid running any instruction "
                    "using `torch.cuda` in any cell. Restart your notebook and make sure no cells use any CUDA "
                    "function."
                )

            # torch.distributed will expect a few environment variables to be set. We set the ones common to each
            # process here (the other ones will be set by the launcher).
            os.environ["WORLD_SIZE"] = str(num_processes)
            os.environ["MASTER_ADDR"] = "127.0.0.1"
            os.environ["MASTER_PORT"] = str(use_port)
            os.environ["USE_FP16"] = str(use_fp16)

            launcher = PrepareForLaunch(function, distributed_type="MULTI_GPU")

            try:
                print(f"Launching a training on {num_processes} GPUs.")
                start_processes(launcher, nprocs=num_processes, start_method="fork")
            finally:
                # Clean up the environment variables set.
                del os.environ["WORLD_SIZE"]
                del os.environ["MASTER_ADDR"]
                del os.environ["MASTER_PORT"]

        else:
            # No need for a distributed launch otherwise as it's either CPU or one GPU.
            if torch.cuda.is_available():
                print("Launching training on one GPU.")
            else:
                print("Launching training on CPU.")
            function(*args)