Example #1
def run_distributed_test(
    device_ids: List[int] = [-1, -1],
    func: Callable = None,
    *args,
    **kwargs,
):
    """
    This runs the `func` in a simulated distributed environment.

    # Parameters

    device_ids: `List[int]`
        List of devices. There need to be at least 2 devices. Default is [-1, -1].

    func: `Callable`
        `func` needs to be global for spawning the processes, so that it can be pickled.
    """

    check_for_gpu(device_ids)
    nprocs = world_size = len(device_ids)
    mp.start_processes(
        init_process,
        args=(device_ids, world_size, func, args, kwargs),
        nprocs=nprocs,
        start_method="fork",
    )
Example #2
def run_distributed_test(
    device_ids: List[int] = None,
    func: Callable = None,
    *args,
    **kwargs,
):
    """
    This runs the `func` in a simulated distributed environment.

    # Parameters

    device_ids: `List[int]`
        List of devices. There need to be at least 2 devices. Default is [-1, -1].

    func: `Callable`
        `func` needs to be global for spawning the processes, so that it can be pickled.
    """
    device_ids = device_ids or [-1, -1]
    check_for_gpu(device_ids)
    # "fork" start method is the default and should be preferred, except when we're
    # running the tests on GPU, in which case we need to use "spawn".
    start_method = "spawn" if any(x >= 0 for x in device_ids) else "fork"
    nprocs = world_size = len(device_ids)
    mp.start_processes(
        init_process,
        args=(world_size, device_ids, func, args, kwargs),
        nprocs=nprocs,
        start_method=start_method,
    )
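Both variants above delegate per-process setup to an `init_process` target and a `check_for_gpu` helper that are not shown. Below is a minimal, hypothetical sketch of such a worker, assuming the `(world_size, device_ids, func, args, kwargs)` argument layout of the second variant and a fixed local TCP rendezvous; the real implementation may differ.

import torch
import torch.distributed as dist

def init_process(process_rank, world_size, device_ids, func, args, kwargs):
    # start_processes prepends the process index, so process_rank arrives first
    device_id = device_ids[process_rank]
    backend = "nccl" if device_id >= 0 else "gloo"
    if device_id >= 0:
        torch.cuda.set_device(device_id)
    # the address/port here are placeholders for whatever rendezvous the
    # surrounding test harness actually uses
    dist.init_process_group(
        backend=backend,
        init_method="tcp://127.0.0.1:29500",
        world_size=world_size,
        rank=process_rank,
    )
    try:
        func(*args, **kwargs)
    finally:
        dist.destroy_process_group()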
Example #3
 def test_exception_all(self):
     with self.assertRaisesRegex(
             Exception,
             "\nValueError: legitimate exception from process (0|1)$",
     ):
         mp.start_processes(test_exception_all_func,
                            nprocs=2,
                            start_method=self.start_method)
Example #4
 def test_first_argument_index(self):
     context = mp.get_context(self.start_method)
     queue = context.SimpleQueue()
     mp.start_processes(test_success_single_arg_func,
                        args=(queue, ),
                        nprocs=2,
                        start_method=self.start_method)
     self.assertEqual([0, 1], sorted([queue.get(), queue.get()]))
Example #5
def launch(
        main_func,
        num_gpus_per_machine,
        num_machines=1,
        machine_rank=0,
        backend="nccl",
        dist_url=None,
        args=(),
        timeout=DEFAULT_TIMEOUT,
):
    """
    Args:
        main_func: a function that will be called by `main_func(*args)`
        num_gpus_per_machine (int): the number of GPUs per machine
        num_machines (int): the total number of machines
        machine_rank (int): the rank of this machine (one per machine)
        dist_url (str): url to connect to for distributed training, including protocol
                       e.g. "tcp://127.0.0.1:8686".
                       Can be set to "auto" to automatically select a free port on localhost.
        args (tuple): arguments passed to main_func
    """
    world_size = num_machines * num_gpus_per_machine
    if world_size > 1:
        # https://github.com/pytorch/pytorch/pull/14391
        # TODO prctl in spawned processes

        if dist_url == "auto":
            assert (num_machines == 1
                    ), "dist_url=auto not supported in multi-machine jobs."
            port = _find_free_port()
            dist_url = f"tcp://127.0.0.1:{port}"

        start_method = "spawn"
        cache = vars(args[1]).get("cache", False)

        # To use numpy memmap for caching image into RAM, we have to use fork method
        if cache:
            assert sys.platform != "win32", (
                "As Windows platform doesn't support fork method, "
                "do not add --cache in your training command.")
            start_method = "fork"

        mp.start_processes(
            _distributed_worker,
            nprocs=num_gpus_per_machine,
            args=(
                main_func,
                world_size,
                num_gpus_per_machine,
                machine_rank,
                backend,
                dist_url,
                args,
            ),
            daemon=False,
            start_method=start_method,
        )
    else:
        main_func(*args)
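The launcher above relies on a `_find_free_port` helper and a `_distributed_worker` target that are not shown. A hedged sketch of one common way to implement the port lookup, by letting the OS assign an ephemeral port:

import socket

def _find_free_port():
    # Bind to port 0 so the OS picks any currently unused port.
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(("", 0))
    port = sock.getsockname()[1]
    sock.close()
    # NOTE: the port is only guaranteed to be free at the time of the check;
    # another process could grab it before the workers bind to it.
    return port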
Example #6
 def test_terminate_exit(self):
     exitcode = 123
     with self.assertRaisesRegex(
             Exception,
             "process 0 terminated with exit code %d" % exitcode,
     ):
         mp.start_processes(test_terminate_exit_func,
                            args=(exitcode, ),
                            nprocs=2,
                            start_method=self.start_method)
Example #7
 def test_success_first_then_exception(self):
     exitcode = 123
     with self.assertRaisesRegex(
             Exception,
             "ValueError: legitimate exception",
     ):
         mp.start_processes(test_success_first_then_exception_func,
                            args=(exitcode, ),
                            nprocs=2,
                            start_method=self.start_method)
Example #8
 def test_exception_single(self):
     nprocs = 2
     for i in range(nprocs):
         with self.assertRaisesRegex(
                 Exception,
                 "\nValueError: legitimate exception from process %d$" % i,
         ):
             mp.start_processes(test_exception_single_func,
                                args=(i, ),
                                nprocs=nprocs,
                                start_method=self.start_method)
Example #9
    def test_nested(self):
        context = mp.get_context(self.start_method)
        pids_queue = context.Queue()
        nested_child_sleep = 20.0
        mp_context = mp.start_processes(
            fn=test_nested,
            args=(pids_queue, nested_child_sleep, self.start_method),
            nprocs=1,
            join=False,
            daemon=False,
            start_method=self.start_method,
        )

        # Wait for nested children to terminate in time
        pids = pids_queue.get()
        start = time.time()
        while len(pids) > 0:
            for pid in pids:
                try:
                    os.kill(pid, 0)
                except ProcessLookupError:
                    pids.remove(pid)
                    break

            # This assert fails if any nested child process is still
            # alive after (nested_child_sleep / 2) seconds. By
            # extension, this test times out with an assertion error
            # after (nested_child_sleep / 2) seconds.
            self.assertLess(time.time() - start, nested_child_sleep / 2)
            time.sleep(0.1)
Example #10
    def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
        spec = worker_group.spec
        store = worker_group.store
        master_addr, master_port = super()._get_master_addr_port(store)
        restart_count = spec.max_restarts - self._remaining_restarts

        dist_infos: Dict[int, _DistInfo] = {}
        for worker in worker_group.workers:
            local_rank = worker.local_rank
            dist_infos[local_rank] = _DistInfo(
                worker.global_rank,
                worker_group.group_rank,
                worker.world_size,
                master_addr,
                master_port,
                restart_count,
                spec.max_restarts,
            )

        self._ret_vals.clear()
        self._process_context = mp.start_processes(
            fn=_wrap,
            args=(self._ret_vals, dist_infos, spec.fn, spec.args),
            nprocs=spec.local_world_size,
            join=False,
            daemon=False,
            start_method=self._start_method,
        )

        return {
            local_rank: pid
            for local_rank, pid in enumerate(self._process_context.pids())
        }
Example #11
    def test_terminate_signal(self):
        # SIGABRT is aliased with SIGIOT
        message = "process 0 terminated with signal (SIGABRT|SIGIOT)"

        # Termination through a signal is expressed as a negative exit code
        # in multiprocessing, so we know it was a signal that caused the exit.
        # This doesn't appear to exist on Windows, where the exit code is always
        # positive, and therefore results in a different exception message.
        # Exit code 22 means "ERROR_BAD_COMMAND".
        if IS_WINDOWS:
            message = "process 0 terminated with exit code 22"

        with self.assertRaisesRegex(Exception, message):
            mp.start_processes(test_terminate_signal_func,
                               nprocs=2,
                               start_method=self.start_method)
Example #12
    def test_success_non_blocking(self):
        mp_context = mp.start_processes(test_success_func,
                                        nprocs=2,
                                        join=False,
                                        start_method=self.start_method)

        # After all processes (nprocs=2) have joined, it must return True
        mp_context.join(timeout=None)
        mp_context.join(timeout=None)
        self.assertTrue(mp_context.join(timeout=None))
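A non-blocking launch returns a process context whose `join` accepts a timeout and keeps returning `False` until every worker has exited. A hedged usage sketch of polling for completion instead of blocking (the `test_success_func` worker and the "spawn" start method are assumptions for illustration):

import time
import torch.multiprocessing as mp

context = mp.start_processes(test_success_func, nprocs=2, join=False,
                             start_method="spawn")
# join() returns False when the timeout elapses and True once all
# processes have been joined.
while not context.join(timeout=1):
    time.sleep(0.1)  # placeholder for other work in the parent process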
Example #13
        def spawn(fn: Callable,
                  args: Tuple,
                  kwargs_dict: Optional[Mapping] = None,
                  nproc_per_node: int = 1,
                  nnodes: int = 1,
                  node_rank: int = 0,
                  master_addr: str = "127.0.0.1",
                  master_port: int = 2222,
                  backend: str = "nccl",
                  **kwargs):
            world_size = nnodes * nproc_per_node

            spawn_kwargs = {
                "join": kwargs.get("join", True),
                "daemon": kwargs.get("daemon", False),
            }
            # start_method in pytorch >= 1.5
            if LooseVersion(torch.__version__) >= LooseVersion("1.5.0"):
                spawn_kwargs["start_method"] = kwargs.get(
                    "start_method", "spawn")

            mp.start_processes(
                _NativeDistModel._dist_worker_task_fn,
                nprocs=nproc_per_node,
                args=(
                    backend,
                    fn,
                    args,
                    kwargs_dict,
                    world_size,
                    nproc_per_node,
                    node_rank,
                    master_addr,
                    master_port,
                    kwargs,
                ),
                **spawn_kwargs,
            )
Example #14
    def launch(self,
               function: Callable,
               *args: Any,
               trainer: Optional["pl.Trainer"] = None,
               **kwargs: Any) -> Any:
        """Launches processes that run the given function in parallel.

        The function is allowed to have a return value. However, when all processes join, only the return value
        of worker process 0 gets returned from this `launch` method in the main process.

        Arguments:
            function: The entry point for all launched processes.
            *args: Optional positional arguments to be passed to the given function.
            trainer: Optional reference to the :class:`~pytorch_lightning.trainer.trainer.Trainer` for which
                a selected set of attributes get restored in the main process after processes join.
            **kwargs: Optional keyword arguments to be passed to the given function.
        """
        # The default cluster environment in Lightning chooses a random free port number
        # This needs to be done in the main process here before starting processes to ensure each rank will connect
        # through the same port
        os.environ["MASTER_PORT"] = str(
            self._strategy.cluster_environment.main_port)
        context = mp.get_context(self._start_method)
        return_queue = context.SimpleQueue()
        mp.start_processes(
            self._wrapping_function,
            args=(trainer, function, args, kwargs, return_queue),
            nprocs=self._strategy.num_processes,
            start_method=self._start_method,
        )
        worker_output = return_queue.get()
        if trainer is None:
            return worker_output

        self._recover_results_in_main_process(worker_output, trainer)
        return worker_output.trainer_results
Example #15
def main():
    mp.set_start_method('spawn')
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
    parser.add_argument('-g',
                        '--gpus',
                        default=4,
                        type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr',
                        '--node_rank',
                        default=0,
                        type=int,
                        help='ranking within the nodes')
    args = parser.parse_args()
    #########################################################
    args.world_size = args.gpus * args.nodes
    os.environ['MASTER_ADDR'] = 'frost-6.las.iastate.edu'
    os.environ['MASTER_PORT'] = '8888'
    # start one worker per GPU, using the 'spawn' start method set above
    mp.start_processes(single_process_main,
                       nprocs=args.gpus,
                       args=(args, ),
                       start_method="spawn")
Example #16
    def _start_processes(self):
        import torch.multiprocessing as mp

        return mp.start_processes(
            fn=BackgroundWorkerDynamicBatchWorld.launch_process,
            nprocs=self._num_workers,
            # note that index is an implied argument added by start_processes
            args=(self.opt, self.get_model_agent(), self._process_queue),
            join=False,
            # launch in fork mode so that we can share the model agent easily
            # note that this prevents us from using ANY threads in ANY of the
            # subprocesses! (See ChunkTeacher for one example). Fortunately, we
            # CAN use threads in the MAIN process, and we exploit this at
            # times.
            start_method='fork',
        )
Example #17
 def _start(self):
     if self._pc:
         raise ValueError("The process context already initialized."
                          " Most likely the start method got called twice.")
     self._pc = mp.start_processes(
         fn=_wrap,
         args=(
             self.entrypoint,
             self.args,
             self.envs,
             self.stdouts,
             self.stderrs,
             self._ret_vals,
             self._worker_finished_event,
         ),
         nprocs=self.nprocs,
         join=False,
         daemon=False,
         start_method=self.start_method,
     )
Example #18
def start_processes(params: List[MpParameters],
                    start_method: str = "spawn") -> MpProcessContext:
    r"""
    Launches processes and returns context object. Users can use that object
    to wait for processes results.
    """
    nprocs = len(params)
    ret_val_queues: Dict[int, mp.SimpleQueue] = {
        i: mp.get_context(start_method).SimpleQueue()
        for i in range(0, nprocs)
    }
    mp_proc_context = mp.start_processes(
        nprocs=nprocs,
        fn=_wrap,
        args=(params, ret_val_queues),
        join=False,
        daemon=False,
        start_method=start_method,
    )
    return MpProcessContext(mp_proc_context, ret_val_queues)
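The `_wrap` target above is not shown; here is a hypothetical sketch, under the assumption that each `MpParameters` entry carries a `fn` callable and an `args` tuple for its rank (the real field names may differ):

def _wrap(local_rank, params, ret_val_queues):
    # start_processes supplies local_rank as the first argument
    param = params[local_rank]
    result = param.fn(*param.args)
    # Hand the per-rank return value back to the parent through its queue.
    ret_val_queues[local_rank].put(result)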
Example #19
def test_nested(i, pids_queue, nested_child_sleep, start_method):
    context = mp.get_context(start_method)
    nested_child_ready_queue = context.Queue()
    nprocs = 2
    mp_context = mp.start_processes(
        fn=test_nested_child_body,
        args=(nested_child_ready_queue, nested_child_sleep),
        nprocs=nprocs,
        join=False,
        daemon=False,
        start_method=start_method,
    )
    pids_queue.put(mp_context.pids())

    # Wait for both children to have started, to ensure that they
    # have called prctl(2) to register a parent death signal.
    for _ in range(nprocs):
        nested_child_ready_queue.get()

    # Kill self. This should take down the child processes as well.
    os.kill(os.getpid(), signal.SIGTERM)
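The `test_nested_child_body` target is not shown; a hedged sketch of what it is assumed to do, signalling readiness and then sleeping until the parent-death signal takes it down with its parent:

import time

def test_nested_child_body(i, nested_child_ready_queue, nested_child_sleep):
    nested_child_ready_queue.put(None)   # tell the parent we have started
    time.sleep(nested_child_sleep)       # stay alive until killed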
Example #20
    def test_wrapper_fn_kill_script_process(self):
        """
        tests that the wrapper_fn properly terminates
        the script process (the script process is the sub_sub_process of
        the agent)
        """
        nprocs = 2
        sleep = 300

        # wraps wrapper_fn to be torch.multiprocessing compatible
        # which requires rank to be passed as the first argument
        def wrap_wrap(rank, *args):
            launch.wrapper_fn(*args)

        context = start_processes(
            fn=wrap_wrap,
            args=(None, (path("bin/sleep_script.py"), "--sleep", f"{sleep}")),
            nprocs=nprocs,
            join=False,
            start_method="fork",
        )
        # quick check to see that the wrapper_fn started running
        # without this join() call we don't see an exception on typos
        # and other silly mistakes (silently fails)
        context.join(timeout=-1)

        script_pids = []
        for wrapper_fn_pid in context.pids():
            script_pid = get_child_pids(wrapper_fn_pid)
            # there should only be one child of wrapper_fn
            self.assertEqual(1, len(script_pid))
            script_pids.append(script_pid[0])

        for wrapper_fn_proc in context.processes:
            wrapper_fn_proc.terminate()
            wrapper_fn_proc.join()

        for script_pid in script_pids:
            self.assertFalse(pid_exists(script_pid))
Example #21
def PrepareMultiprocessing(args, func):
    if args.slurm:
        args.node_rank = int(os.environ["SLURM_NODEID"])
        args.master_addr = os.environ["SLURM_SRUN_COMM_HOST"]
        args.nnodes = int(os.environ["SLURM_STEP_NUM_NODES"])
        args.nproc_per_node = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
        gpuList = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
        args.available_gpu = [int(gpu) for gpu in gpuList]

    world_size = args.nproc_per_node * args.nnodes
    local_ranks = args.nproc_per_node
    node_rank = args.node_rank
    os.environ["MASTER_ADDR"] = args.master_addr
    os.environ["MASTER_PORT"] = str(args.master_port)
    os.environ["WORLD_SIZE"] = str(world_size)
    #os.environ["NCCL_SOCKET_IFNAME"] = "ib"

    processes = []
    if 'OMP_NUM_THREADS' not in os.environ and args.nproc_per_node > 1:
        os.environ["OMP_NUM_THREADS"] = str(1)
        print("*****************************************\n"
              "Setting OMP_NUM_THREADS environment variable for each process "
              "to be {} in default, to avoid your system being overloaded, "
              "please further tune the variable for optimal performance in "
              "your application as needed. \n"
              "*****************************************".format(os.environ["OMP_NUM_THREADS"]))

    # Spawn one process per local GPU when running distributed
    if world_size > 1:
        args.distributed = True
        globalLock = mp.Lock()
        prcs = mp.start_processes(func, args=(args, globalLock), nprocs=local_ranks, start_method='forkserver')
        
    else:
        args.distributed = False
        func(0,args,mp.Lock())
Example #22
def main(cfg: DictConfig):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '11337'
    torchmp.start_processes(train, nprocs=len(cfg.distributed.gpus), args=(cfg, Path(hydra.utils.get_original_cwd())),
                            start_method="spawn")
Example #23
def _mp_fork(*args, **kwargs):
    return mp.start_processes(*args, **kwargs, join=False, start_method="fork")
Example #24
 def test_success(self):
     mp.start_processes(test_success_func,
                        nprocs=2,
                        start_method=self.start_method)
Example #25
    parser.add_argument("--nproc_per_node", type=int, default=2)
    parser.add_argument("--log_interval", type=int, default=4)
    parser.add_argument("--nb_samples", type=int, default=128)
    parser.add_argument("--batch_size", type=int, default=16)
    args_parsed = parser.parse_args()

    assert dist.is_available()
    if args_parsed.backend == "nccl":
        assert torch.cuda.is_available()
        assert dist.is_nccl_available()
    elif args_parsed.backend == "gloo":
        assert dist.is_gloo_available()
    else:
        raise ValueError(
            f"unvalid backend `{args_parsed.backend}` (valid: `gloo` or `nccl`)"
        )

    config = {
        "log_interval": args_parsed.log_interval,
        "batch_size": args_parsed.batch_size,
        "nb_samples": args_parsed.nb_samples,
    }

    args = (args_parsed.nproc_per_node, args_parsed.backend, config)

    # Specific torch.distributed
    start_processes(training,
                    args=args,
                    nprocs=args_parsed.nproc_per_node,
                    start_method="spawn")
Example #26
def notebook_launcher(function,
                      args=(),
                      num_processes=None,
                      use_fp16=False,
                      use_port="29500"):
    """
    Launches a training function, using several processes if it's possible in the current environment (TPU with
    multiple cores for instance).

    Args:
        function (:obj:`Callable`):
            The training function to execute. If it accepts arguments, the first argument should be the index of the
            process run.
        args (:obj:`Tuple`):
            Tuple of arguments to pass to the function (it will receive :obj:`*args`).
        num_processes (:obj:`int`, `optional`):
            The number of processes to use for training. Will default to 8 in Colab/Kaggle if a TPU is available, to
            the number of GPUs available otherwise.
        use_fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If :obj:`True`, will use mixed precision training on multi-GPU.
        use_port (:obj:`str`, `optional`, defaults to :obj:`"29500"`):
            The port to use to communicate between processes when launching a multi-GPU training.
    """
    # Are we in a google colab or a Kaggle Kernel?
    if "IPython" in sys.modules:
        in_colab_or_kaggle = "google.colab" in str(
            sys.modules["IPython"].get_ipython())
    elif any(key.startswith("KAGGLE") for key in os.environ.keys()):
        in_colab_or_kaggle = True
    else:
        in_colab_or_kaggle = False

    if in_colab_or_kaggle:
        if os.environ.get("TPU_NAME", None) is not None:
            # TPU launch
            import torch_xla.distributed.xla_multiprocessing as xmp

            if len(AcceleratorState._shared_state) > 0:
                raise ValueError(
                    "To train on TPU in Colab or Kaggle Kernel, the `Accelerator` should only be initialized inside "
                    "your training function. Restart your notebook and make sure no cells initializes an "
                    "`Accelerator`.")
            if num_processes is None:
                num_processes = 8

            launcher = PrepareForLaunch(function, distributed_type="TPU")
            print(f"Launching a training on {num_processes} TPU cores.")
            xmp.spawn(launcher,
                      args=args,
                      nprocs=num_processes,
                      start_method="fork")
        else:
            # No need for a distributed launch otherwise as it's either CPU or one GPU.
            if torch.cuda.is_available():
                print("Launching training on one GPU.")
            else:
                print("Launching training on CPU.")
            function(*args)

    else:
        if num_processes is None:
            raise ValueError(
                "You have to specify the number of GPUs you would like to use, add `num_process=...` to your call."
            )

        if num_processes > 1:
            # Multi-GPU launch
            if len(AcceleratorState._shared_state) > 0:
                raise ValueError(
                    "To launch a multi-GPU training from your notebook, the `Accelerator` should only be initialized "
                    "inside your training function. Restart your notebook and make sure no cells initializes an "
                    "`Accelerator`.")

            if torch.cuda.is_initialized():
                raise ValueError(
                    "To launch a multi-GPU training from your notebook, you need to avoid running any instruction "
                    "using `torch.cuda` in any cell. Restart your notebook and make sure no cells use any CUDA "
                    "function.")

            # torch.distributed expects a few environment variables to be set. We set the ones common to each
            # process here (the remaining ones will be set by the launcher).
            os.environ["WORLD_SIZE"] = str(num_processes)
            os.environ["MASTER_ADDR"] = "127.0.0.1"
            os.environ["MASTER_PORT"] = str(use_port)
            os.environ["USE_FP16"] = str(use_fp16)

            launcher = PrepareForLaunch(function, distributed_type="MULTI_GPU")
            try:
                print(f"Launching a training on {num_processes} GPUs.")
                start_processes(launcher,
                                nprocs=num_processes,
                                start_method="fork")
            finally:
                # Clean up the environment variables set.
                del os.environ["WORLD_SIZE"]
                del os.environ["MASTER_ADDR"]
                del os.environ["MASTER_PORT"]

        else:
            # No need for a distributed launch otherwise as it's either CPU or one GPU.
            if torch.cuda.is_available():
                print("Launching training on one GPU.")
            else:
                print("Launching training on CPU.")
            function(*args)
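A hedged usage sketch of `notebook_launcher`, assuming a no-argument `training_function` defined in the notebook that builds its own `Accelerator` internally:

def training_function():
    # construct the Accelerator, model, and training loop inside this function
    ...

# Outside Colab/Kaggle, num_processes must be given explicitly; here we
# assume a machine with two GPUs.
notebook_launcher(training_function, num_processes=2)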