Beispiel #1
0
        def test_function_raise(self):
            """
            Start two ``echo2`` workers, telling rank 0 to raise, and inspect
            the single failure that ``wait`` reports.
            """
            raise_flag = True

            for method in self._start_methods:
                with self.subTest(start_method=method):
                    log_dir = self.log_dir()
                    rank_args = {0: ("hello", raise_flag), 1: ("world", )}
                    rank_envs = {0: {}, 1: {}}
                    pc = start_processes(
                        name="echo",
                        entrypoint=echo2,
                        args=rank_args,
                        envs=rank_envs,
                        log_dir=log_dir,
                        start_method=method,
                    )

                    outcome = pc.wait(period=0.1)

                    self.assert_pids_noexist(pc.pids())
                    # exactly one rank failed and nothing was returned
                    self.assertEqual(1, len(outcome.failures))
                    self.assertFalse(outcome.return_values)

                    rank0_failure = outcome.failures[0]
                    reported_error_file = rank0_failure.error_file
                    error_payload = rank0_failure.error_file_data

                    self.assertEqual(1, rank0_failure.exitcode)
                    self.assertEqual("<N/A>", rank0_failure.signal_name())
                    self.assertEqual(pc.pids()[0], rank0_failure.pid)

                    # the error file is expected under <log_dir>/0/error.json
                    expected_error_file = os.path.join(log_dir, "0",
                                                       "error.json")
                    self.assertEqual(expected_error_file, reported_error_file)

                    # timestamp stored in the error file must match the
                    # failure object (compared at whole-number precision)
                    extra_info = error_payload["message"]["extraInfo"]
                    self.assertEqual(
                        int(extra_info["timestamp"]),
                        int(rank0_failure.timestamp),
                    )
                    self.assertTrue(pc._stderr_tail.stopped())
                    self.assertTrue(pc._stdout_tail.stopped())
Beispiel #2
0
        def test_binary_raises(self):
            """
            Run ``echo2.py`` as a binary on two ranks, passing ``--raises true``
            to rank 0, and check the single failure that ``wait`` reports.
            """
            rank_args = {0: ("--raises", "true", "foo"), 1: ("bar",)}
            rank_envs = {0: {"RANK": "0"}, 1: {"RANK": "1"}}
            pc = start_processes(
                name="echo",
                entrypoint=bin("echo2.py"),
                args=rank_args,
                envs=rank_envs,
                log_dir=self.log_dir(),
            )

            outcome = pc.wait(period=0.1)

            self.assert_pids_noexist(pc.pids())
            self.assertTrue(outcome.is_failed())
            self.assertEqual(1, len(outcome.failures))

            first_failure = outcome.failures[0]
            self.assertEqual(1, first_failure.exitcode)
            # the test expects the placeholder message for a binary failure
            self.assertEqual("<NONE>",
                             first_failure.error_file_data["message"])
            self.assertEqual("<N/A>", first_failure.signal_name())
Beispiel #3
0
        def test_function(self):
            """
            Run ``echo1`` on two ranks for every (start method, redirect)
            combination; check the return values and the per-rank
            stdout/stderr entries exposed on the results.
            """
            for method, redirs in product(self._start_methods,
                                          redirects_all()):
                with self.subTest(start_method=method, redirs=redirs):
                    rank_args = {0: ("hello", ), 1: ("hello", )}
                    rank_envs = {0: {"RANK": "0"}, 1: {"RANK": "1"}}
                    pc = start_processes(
                        name="echo",
                        entrypoint=echo1,
                        args=rank_args,
                        envs=rank_envs,
                        log_dir=self.log_dir(),
                        start_method=method,
                        redirects=redirs,
                    )

                    results = pc.wait(period=0.1)
                    nprocs = pc.nprocs

                    self.assert_pids_noexist(pc.pids())
                    expected_returns = {
                        rank: f"hello_{rank}"
                        for rank in range(nprocs)
                    }
                    self.assertEqual(expected_returns, results.return_values)

                    # which streams does this redirect setting include?
                    out_redirected = redirs & Std.OUT == Std.OUT
                    err_redirected = redirs & Std.ERR == Std.ERR

                    for rank in range(nprocs):
                        # non-redirected streams must have a falsy entry
                        if not out_redirected:
                            self.assertFalse(results.stdouts[rank])
                        if not err_redirected:
                            self.assertFalse(results.stderrs[rank])
                        # redirected streams are checked with assert_in_file
                        if out_redirected:
                            self.assert_in_file(
                                [f"hello stdout from {rank}"],
                                results.stdouts[rank])
                        if err_redirected:
                            self.assert_in_file(
                                [f"hello stderr from {rank}"],
                                results.stderrs[rank])
Beispiel #4
0
        def test_void_function(self):
            """
            Run ``echo0`` on two ranks for each start method and expect every
            rank to report ``None`` as its return value.
            """
            for method in self._start_methods:
                with self.subTest(start_method=method):
                    rank_args = {0: ("hello", ), 1: ("world", )}
                    rank_envs = {0: {}, 1: {}}
                    pc = start_processes(
                        name="echo",
                        entrypoint=echo0,
                        args=rank_args,
                        envs=rank_envs,
                        log_dir=self.log_dir(),
                        start_method=method,
                    )

                    outcome = pc.wait(period=0.1)
                    expected_returns = {0: None, 1: None}
                    self.assertEqual(expected_returns, outcome.return_values)
Beispiel #5
0
        def test_binary(self):
            """
            Run ``echo1.py`` as a binary on two ranks for each redirect setting
            from ``redirects_oss_test`` and check results and log entries.
            """
            for redirs in redirects_oss_test():
                with self.subTest(redirs=redirs):
                    rank_args = {0: ("hello", ), 1: ("hello", )}
                    rank_envs = {0: {"RANK": "0"}, 1: {"RANK": "1"}}
                    pc = start_processes(
                        name="echo",
                        entrypoint=bin("echo1.py"),
                        args=rank_args,
                        envs=rank_envs,
                        log_dir=self.log_dir(),
                        redirects=redirs,
                    )

                    results = pc.wait(period=0.1)

                    self.assert_pids_noexist(pc.pids())
                    # currently binaries return {rank: None}, so only the
                    # number of entries is checked
                    self.assertEqual(2, len(results.return_values))
                    self.assertFalse(results.is_failed())

                    # which streams does this redirect setting include?
                    out_redirected = redirs & Std.OUT == Std.OUT
                    err_redirected = redirs & Std.ERR == Std.ERR

                    nprocs = pc.nprocs
                    for rank in range(nprocs):
                        # non-redirected streams must have a falsy entry
                        if not out_redirected:
                            self.assertFalse(results.stdouts[rank])
                        if not err_redirected:
                            self.assertFalse(results.stderrs[rank])
                        # redirected streams are checked with assert_in_file
                        if out_redirected:
                            self.assert_in_file(
                                [f"hello stdout from {rank}"],
                                results.stdouts[rank])
                        if err_redirected:
                            self.assert_in_file(
                                [f"hello stderr from {rank}"],
                                results.stderrs[rank])
Beispiel #6
0
        def test_binary_redirect_and_tee(self):
            """
            Run ``echo1.py`` as a binary on two ranks with different
            ``redirects``/``tee`` settings per rank and check which
            stdout/stderr entries the process context exposes.
            """
            rank_args = {0: ("hello",), 1: ("world",)}
            rank_envs = {0: {"RANK": "0"}, 1: {"RANK": "1"}}
            # rank 0: redirect stderr, tee stdout; rank 1: tee stderr only
            rank_redirects = {0: Std.ERR, 1: Std.NONE}
            rank_tee = {0: Std.OUT, 1: Std.ERR}

            pc = start_processes(
                name="trainer",
                entrypoint=bin("echo1.py"),
                args=rank_args,
                envs=rank_envs,
                log_dir=self.log_dir(),
                start_method="fork",
                redirects=rank_redirects,
                tee=rank_tee,
            )

            outcome = pc.wait()

            self.assertFalse(outcome.is_failed())
            self.assert_in_file(["hello stdout from 0"], pc.stdouts[0])
            self.assert_in_file(["hello stderr from 0"], pc.stderrs[0])
            self.assert_in_file(["world stderr from 1"], pc.stderrs[1])
            # rank 1 stdout is neither redirected nor tee'd
            self.assertFalse(pc.stdouts[1])
            self.assertTrue(pc._stderr_tail.stopped())
            self.assertTrue(pc._stdout_tail.stopped())
Beispiel #7
0
        def test_function_redirect_and_tee(self):
            """
            Run ``echo1`` on two ranks with different ``redirects``/``tee``
            settings per rank, once per configured start method, and check
            which stdout/stderr entries the process context exposes.
            """
            for start_method in self._start_methods:
                with self.subTest(start_method=start_method):
                    log_dir = self.log_dir()
                    pc = start_processes(
                        name="trainer",
                        entrypoint=echo1,
                        args={
                            0: ("hello", ),
                            1: ("world", )
                        },
                        envs={
                            0: {
                                "RANK": "0"
                            },
                            1: {
                                "RANK": "1"
                            }
                        },
                        log_dir=log_dir,
                        # Use the loop variable: the sub-test is labelled with
                        # this start method, so it must actually run with it
                        # (previously "spawn" was hard-coded here).
                        start_method=start_method,
                        # rank 0: redirect stderr, tee stdout
                        # rank 1: tee stderr only
                        redirects={
                            0: Std.ERR,
                            1: Std.NONE
                        },
                        tee={
                            0: Std.OUT,
                            1: Std.ERR
                        },
                    )

                    result = pc.wait()

                    self.assertFalse(result.is_failed())
                    self.assert_in_file(["hello stdout from 0"], pc.stdouts[0])
                    self.assert_in_file(["hello stderr from 0"], pc.stderrs[0])
                    self.assert_in_file(["world stderr from 1"], pc.stderrs[1])
                    # rank 1 stdout is neither redirected nor tee'd
                    self.assertFalse(pc.stdouts[1])
                    self.assertTrue(pc._stderr_tail.stopped())
                    self.assertTrue(pc._stdout_tail.stopped())
Beispiel #8
0
        def test_function_large_ret_val(self):
            """
            Run ``echo_large`` on four ranks and check that each rank's return
            value has the requested length.
            """
            # Per the original author's note: multiprocessing queues are
            # backed by pipes, so a writer returning an object larger than the
            # pipe size blocks until the reader starts draining the pipe.
            # This test makes each worker return a large value to exercise
            # that situation.
            size = 200000
            ranks = range(4)

            for method in self._start_methods:
                with self.subTest(start_method=method):
                    pc = start_processes(
                        name="echo",
                        entrypoint=echo_large,
                        args={rank: (size,) for rank in ranks},
                        envs={rank: {} for rank in ranks},
                        log_dir=self.log_dir(),
                        start_method=method,
                    )

                    outcome = pc.wait(period=0.1)
                    for rank in range(pc.nprocs):
                        returned = outcome.return_values[rank]
                        self.assertEqual(size, len(returned))
Beispiel #9
0
    def test_function_signal(self):
        """
        Run 2x copies of echo3 and induce a segfault on the first, for every
        (start method, redirect) combination, then check the reported failure.
        """
        SEGFAULT = True
        for start_method, redirs in product(self._start_methods, redirects()):
            # Include redirs in the sub-test parameters so that a failure
            # identifies the exact combination (start_method alone is
            # ambiguous because several redirect settings share it).
            with self.subTest(start_method=start_method, redirs=redirs):
                log_dir = self.log_dir()
                pc = start_processes(
                    name="echo",
                    entrypoint=echo3,
                    args={
                        0: ("hello", SEGFAULT),
                        1: ("world", )
                    },
                    envs={
                        0: {},
                        1: {}
                    },
                    log_dir=log_dir,
                    start_method=start_method,
                    redirects=redirs,
                )

                results = pc.wait(period=0.1)

                self.assert_pids_noexist(pc.pids())
                # exactly one rank failed and nothing was returned
                self.assertEqual(1, len(results.failures))
                self.assertFalse(results.return_values)

                failure = results.failures[0]
                error_file = failure.error_file

                # a process killed by a signal reports the negated signal
                # number as its exit code
                self.assertEqual(-signal.SIGSEGV, failure.exitcode)
                self.assertEqual("SIGSEGV", failure.signal_name())
                self.assertEqual(pc.pids()[0], failure.pid)
                self.assertEqual(os.path.join(log_dir, "0", "error.json"),
                                 error_file)
Beispiel #10
0
        def test_binary_signal(self):
            """
            Run ``echo3.py`` as a binary on two ranks, passing
            ``--segfault true`` to rank 0, and check the reported failure.
            """
            rank_args = {0: ("--segfault", "true", "foo"), 1: ("bar",)}
            rank_envs = {0: {"RANK": "0"}, 1: {"RANK": "1"}}
            pc = start_processes(
                name="echo",
                entrypoint=bin("echo3.py"),
                args=rank_args,
                envs=rank_envs,
                log_dir=self.log_dir(),
            )

            outcome = pc.wait(period=0.1)

            self.assert_pids_noexist(pc.pids())
            self.assertTrue(outcome.is_failed())
            self.assertEqual(1, len(outcome.failures))

            first_failure = outcome.failures[0]
            self.assertNotEqual(signal.SIGSEGV, first_failure.exitcode)
            # ASAN exit code is 1, so no signal name is expected under ASAN.
            expected_signal = "<N/A>" if TEST_WITH_ASAN else "SIGSEGV"
            self.assertEqual(expected_signal, first_failure.signal_name())
            self.assertEqual("<NONE>",
                             first_failure.error_file_data["message"])
Beispiel #11
0
        def test_binary_exit(self):
            """
            Run ``echo1.py`` as a binary on two ranks where rank 0 is asked to
            exit with a non-zero code; check the failure and the log entries.
            """
            fail_code = 138
            rank_args = {
                0: ("--exitcode", fail_code, "foo"),
                1: ("--exitcode", 0, "bar"),
            }
            rank_envs = {0: {"RANK": "0"}, 1: {"RANK": "1"}}

            pc = start_processes(
                name="echo",
                entrypoint=bin("echo1.py"),
                args=rank_args,
                envs=rank_envs,
                log_dir=self.log_dir(),
                # only rank 0 has its streams redirected
                redirects={0: Std.ALL},
            )

            outcome = pc.wait(period=0.1)

            self.assertTrue(outcome.is_failed())
            self.assertEqual(1, len(outcome.failures))

            first_failure = outcome.failures[0]
            self.assertEqual(fail_code, first_failure.exitcode)
            self.assertEqual("<N/A>", first_failure.signal_name())
            self.assertEqual("<NONE>",
                             first_failure.error_file_data["message"])
            self.assert_in_file([f"exit {fail_code} from 0"],
                                outcome.stderrs[0])
            self.assert_in_file([], outcome.stdouts[0])
            # rank 1 was not redirected, so its entries are falsy
            self.assertFalse(outcome.stderrs[1])
            self.assertFalse(outcome.stdouts[1])
            self.assertTrue(pc._stderr_tail.stopped())
            self.assertTrue(pc._stdout_tail.stopped())
Beispiel #12
0
    def launch(self, args):
        """
        Decide which CPU cores each instance gets, build one command line per
        instance (prefixed with ``numactl`` unless disabled), start them all
        through ``start_processes`` and call ``wait()`` on the result.

        Args:
            args: parsed launcher options. Several fields are overwritten in
                place while resolving the configuration (``ninstances``,
                ``ncores_per_instance``, ``throughput_mode``).

        Raises:
            RuntimeError: if ``--core_list`` is given without
                ``--ncores_per_instance``, or if the requested number of
                instances / cores exceeds the cores available.
            SystemExit: if ``--skip_cross_node_cores`` is used with an
                ``ncores_per_instance`` larger than the core count of node 0.
        """
        cores = []
        set_kmp_affinity = True
        if args.core_list:  # user specify what cores will be used by params
            cores = [int(x) for x in args.core_list.split(",")]
            if args.ncores_per_instance == -1:
                raise RuntimeError(
                    "please specify the \"--ncores_per_instance\" if you have pass the --core_list params"
                )
            elif (args.ninstances > 1
                  and args.ncores_per_instance * args.ninstances < len(cores)):
                logger.warning(
                    f"only first {args.ncores_per_instance * args.ninstances} cores will be used, "
                    f"but you specify {len(cores)} cores in core_list")
            else:
                args.ninstances = len(cores) // args.ncores_per_instance

        else:
            if args.use_logical_core:
                if args.node_id != -1:
                    cores = self.cpuinfo.get_node_logical_cores(args.node_id)
                else:
                    cores = self.cpuinfo.get_all_logical_cores()
                    # When using all cores on all nodes, including logical cores,
                    # setting KMP_AFFINITY disables logical cores. Thus, KMP_AFFINITY should not be set.
                    set_kmp_affinity = False
            else:
                if args.node_id != -1:
                    cores = self.cpuinfo.get_node_physical_cores(args.node_id)
                else:
                    cores = self.cpuinfo.get_all_physical_cores()

            # Resolve ninstances / ncores_per_instance from whichever of the
            # two the user left at the -1 "unset" sentinel.
            if not args.multi_instance and args.ninstances == -1 and args.ncores_per_instance == -1:
                args.ninstances = 1
                args.ncores_per_instance = len(cores)
            elif args.multi_instance and args.ninstances == -1 and args.ncores_per_instance == -1:
                args.throughput_mode = True
            elif args.ncores_per_instance == -1 and args.ninstances != -1:
                if args.ninstances > len(cores):
                    raise RuntimeError(
                        f"there are {len(cores)} total cores but you specify {args.ninstances} ninstances; "
                        "please make sure ninstances <= total_cores)")
                else:
                    args.ncores_per_instance = len(cores) // args.ninstances
            elif args.ncores_per_instance != -1 and args.ninstances == -1:
                if not args.skip_cross_node_cores:
                    args.ninstances = len(cores) // args.ncores_per_instance
                else:
                    ncore_per_node = len(self.cpuinfo.node_physical_cores[0])
                    num_leftover_cores = ncore_per_node % args.ncores_per_instance
                    if args.ncores_per_instance > ncore_per_node:
                        # too many ncores_per_instance to skip cross-node cores
                        logger.warning(
                            f"there are {ncore_per_node} core(s) per socket, "
                            f"but you specify {args.ncores_per_instance} ncores_per_instance and "
                            "skip_cross_node_cores. Please make sure --ncores_per_instance < core(s) per "
                            "socket")
                        # sys.exit rather than the builtin exit(): the latter
                        # is injected by the site module and is not guaranteed
                        # to exist (e.g. under ``python -S``).
                        sys.exit(-1)
                    elif num_leftover_cores == 0:
                        # aren't any cross-node cores
                        logger.info(
                            '--skip_cross_node_cores is set, but there are no cross-node cores.'
                        )
                        args.ninstances = len(
                            cores) // args.ncores_per_instance
                    else:
                        # skip cross-node cores
                        # NOTE(review): a warning about --ninstances being
                        # ignored used to live here, guarded by
                        # ``args.ninstances != -1``. The enclosing elif only
                        # runs when ninstances == -1, so it could never fire
                        # and has been removed.

                        # Drop the last ``num_leftover_cores`` cores of every
                        # node-sized slice so no instance straddles two nodes.
                        i = 1
                        leftover_cores = set()
                        while ncore_per_node * i <= len(cores):
                            leftover_cores.update(
                                cores[ncore_per_node * i -
                                      num_leftover_cores:ncore_per_node * i])
                            i += 1
                        cores = list(set(cores) - leftover_cores)
                        assert len(cores) % args.ncores_per_instance == 0
                        args.ninstances = len(
                            cores) // args.ncores_per_instance
            else:
                if args.ninstances * args.ncores_per_instance > len(cores):
                    raise RuntimeError(
                        "Please make sure ninstances * ncores_per_instance <= total_cores"
                    )
            if args.latency_mode:
                logger.warning(
                    "--latency_mode is exclusive to --ninstances, --ncores_per_instance, --node_id and "
                    "--use_logical_core. They won't take effect even they are set explicitly.")
                args.ncores_per_instance = 4
                cores = self.cpuinfo.get_all_physical_cores()
                args.ninstances = len(cores) // args.ncores_per_instance

            if args.throughput_mode:
                logger.warning(
                    "--throughput_mode is exclusive to --ninstances, --ncores_per_instance, --node_id and "
                    "--use_logical_core. They won't take effect even they are set explicitly.")
                args.ninstances = self.cpuinfo.node_nums
                cores = self.cpuinfo.get_all_physical_cores()
                args.ncores_per_instance = len(cores) // args.ninstances

        if args.ninstances > 1 and args.rank != -1:
            logger.info(
                f"assigning {args.ncores_per_instance} cores for instance {args.rank}"
            )

        self.set_multi_thread_and_allocator(args.ncores_per_instance,
                                            args.disable_iomp,
                                            set_kmp_affinity,
                                            args.enable_tcmalloc,
                                            args.enable_jemalloc,
                                            args.use_default_allocator)
        entrypoint = ""
        launch_args = {}
        launch_envs: Dict[int, Dict] = {}
        launch_tee = {}

        # The core list is only consumed when building numactl arguments;
        # sort it once here instead of on every loop iteration.
        if not args.disable_numactl:
            cores = sorted(cores)

        for i in range(args.ninstances):
            cmd = []
            if not args.disable_numactl:
                cmd = ["numactl"]
                if args.rank == -1:  # sequentially assign ncores_per_instance to ninstances
                    core_list = cores[i * args.ncores_per_instance:(i + 1) *
                                      args.ncores_per_instance]
                else:  # assign ncores_per_instance from rank
                    core_list = cores[args.rank *
                                      args.ncores_per_instance:(args.rank +
                                                                1) *
                                      args.ncores_per_instance]

                # Collapse consecutive core ids into {"start", "end"} ranges.
                core_ranges: List[Dict] = []
                for core in core_list:
                    if core_ranges and core - core_ranges[-1]["end"] == 1:
                        core_ranges[-1]["end"] = core
                    else:
                        core_ranges.append({"start": core, "end": core})
                cur_process_cores = ",".join(
                    f"{r['start']}-{r['end']}" for r in core_ranges)

                numa_params = f"-C {cur_process_cores} "
                numa_ids = ",".join([
                    str(numa_id)
                    for numa_id in self.cpuinfo.numa_aware_check(core_list)
                ])
                numa_params += f"-m {numa_ids}"
                cmd.extend(numa_params.split())
            with_python = not args.no_python
            if with_python:
                cmd.append(sys.executable)
                cmd.append("-u")
            if args.module:
                cmd.append("-m")
            cmd.append(args.program)
            cmd.extend(args.program_args)
            cmd_s = " ".join(cmd)
            logger.info(cmd_s)
            # The first token of the first command becomes the shared
            # entrypoint; every instance passes the remaining tokens as args.
            if entrypoint == "":
                entrypoint = cmd[0]
            del cmd[0]
            launch_args[i] = tuple(cmd)
            launch_envs[i] = {}
            launch_tee[i] = Std.ALL

            if args.rank != -1:  # launches single instance, rank, only
                break

        ctx = start_processes(name=args.log_file_prefix,
                              entrypoint=entrypoint,
                              args=launch_args,
                              envs=launch_envs,
                              log_dir=args.log_path,
                              tee=launch_tee)
        ctx.wait()
    def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
        """
        Build the per-local-rank arguments and environment variables for every
        worker in ``worker_group``, recreate the log directory of the current
        attempt, and start the workers through ``start_processes``.

        The new process context is stored on ``self._pcontext``; the value of
        its ``pids()`` call is returned.
        """
        spec = worker_group.spec
        store = worker_group.store
        assert store is not None
        master_addr, master_port = super()._get_master_addr_port(store)
        restart_count = spec.max_restarts - self._remaining_restarts

        use_agent_store = spec.rdzv_handler.get_backend() == "static"

        args: Dict[int, Tuple] = {}
        envs: Dict[int, Dict[str, str]] = {}
        for worker in worker_group.workers:
            rank = worker.local_rank
            env = {
                "LOCAL_RANK": str(rank),
                "RANK": str(worker.global_rank),
                "GROUP_RANK": str(worker_group.group_rank),
                "ROLE_RANK": str(worker.role_rank),
                "ROLE_NAME": spec.role,
                "LOCAL_WORLD_SIZE": str(spec.local_world_size),
                "WORLD_SIZE": str(worker.world_size),
                "GROUP_WORLD_SIZE": str(worker_group.group_world_size),
                "ROLE_WORLD_SIZE": str(worker.role_world_size),
                "MASTER_ADDR": master_addr,
                "MASTER_PORT": str(master_port),
                "TORCHELASTIC_RESTART_COUNT": str(restart_count),
                "TORCHELASTIC_MAX_RESTARTS": str(spec.max_restarts),
                "TORCHELASTIC_RUN_ID": spec.rdzv_handler.get_run_id(),
                "TORCHELASTIC_USE_AGENT_STORE": str(use_agent_store),
                # defaults to "1" unless already set in this process
                "NCCL_ASYNC_ERROR_HANDLING":
                os.getenv("NCCL_ASYNC_ERROR_HANDLING", str(1)),
            }
            # forward OMP_NUM_THREADS only when it is set in this process
            omp_num_threads = os.environ.get("OMP_NUM_THREADS")
            if omp_num_threads is not None:
                env["OMP_NUM_THREADS"] = omp_num_threads

            envs[rank] = env
            # run the spec's arguments through macros.substitute with this
            # worker's local rank
            args[rank] = tuple(
                macros.substitute(list(spec.args), str(rank)))

        # scaling events do not count towards restarts (gets same attempt #)
        # remove existing log dir if this restart is due to a scaling event
        attempt_dir_name = f"attempt_{restart_count}"
        attempt_log_dir = os.path.join(self._log_dir, attempt_dir_name)
        shutil.rmtree(attempt_log_dir, ignore_errors=True)
        os.makedirs(attempt_log_dir)

        assert spec.entrypoint is not None
        pcontext = start_processes(
            name=spec.role,
            entrypoint=spec.entrypoint,
            args=args,
            envs=envs,
            log_dir=attempt_log_dir,
            start_method=self._start_method,
            redirects=spec.redirects,
            tee=spec.tee,
        )
        self._pcontext = pcontext

        return self._pcontext.pids()