Example #1
0
    def __call__(self, parser, args, values, option_string=None):
        """Print a summary of the Horovod build and terminate the process.

        The report lists the Horovod version and marks with an ``X`` every
        framework, controller and tensor operation that the corresponding
        ``extension_available`` / ``*_built`` helper reports as available.
        The process then exits with status 0 via ``os._exit``, so this method
        never returns to its caller.

        The signature follows the ``argparse.Action`` call convention; none
        of the arguments are used.

        Args:
            parser: Parser that triggered the action (unused).
            args: Namespace being populated (unused).
            values: Values attached to the option (unused).
            option_string: Option string that triggered the action (unused).
        """
        # Imported locally so the block does not depend on a module-level import.
        import sys

        output = '''\
        Horovod v{version}:

        Available Frameworks:
          [{tensorflow}] TensorFlow
          [{torch}] PyTorch
          [{mxnet}] MXNet

        Available Controllers:
          [{mpi}] MPI
          [{gloo}] Gloo

        Available Tensor Operations:
          [{nccl_ops}] NCCL
          [{ddl_ops}] DDL
          [{mlsl_ops}] MLSL
          [{mpi_ops}] MPI
          [{gloo_ops}] Gloo\
        '''.format(
            version=horovod.__version__,
            tensorflow=CheckBuildAction.get_check(
                extension_available('tensorflow')),
            torch=CheckBuildAction.get_check(extension_available('torch')),
            mxnet=CheckBuildAction.get_check(extension_available('mxnet')),
            mpi=CheckBuildAction.get_check(mpi_built()),
            gloo=CheckBuildAction.get_check(gloo_built()),
            nccl_ops=CheckBuildAction.get_check(nccl_built()),
            ddl_ops=CheckBuildAction.get_check(ddl_built()),
            mpi_ops=CheckBuildAction.get_check(mpi_built()),
            mlsl_ops=CheckBuildAction.get_check(mlsl_built()),
            gloo_ops=CheckBuildAction.get_check(gloo_built()))
        print(textwrap.dedent(output))
        # os._exit() terminates immediately WITHOUT flushing stdio buffers.
        # Flush explicitly, otherwise the report can be lost whenever stdout
        # is block-buffered (e.g. redirected to a pipe or a file).
        sys.stdout.flush()
        os._exit(0)
Example #2
0
def run_controller(use_gloo, gloo_run, use_mpi, mpi_run, use_jsrun, js_run, verbosity):
    """Invoke the launcher callback that matches the selected controller.

    An explicitly requested controller (Gloo, then MPI, then jsrun) is
    validated and started.  Without an explicit request MPI is preferred
    (through jsrun when running under LSF with jsrun installed), falling
    back to Gloo.

    Args:
        use_gloo: Truthy to force the Gloo controller.
        gloo_run: Zero-argument callable that starts a Gloo run.
        use_mpi: Truthy to force the MPI controller.
        mpi_run: Zero-argument callable that starts an MPI run.
        use_jsrun: Truthy to force the jsrun launcher.
        js_run: Zero-argument callable that starts a jsrun run.
        verbosity: Verbosity level or None; build checks are verbose at >= 2.

    Raises:
        ValueError: If the requested controller has not been built, if jsrun
            is requested outside of an LSF job, or if neither MPI nor Gloo
            support is available.
    """
    # keep logic in sync with is_gloo_used(...)
    chatty = verbosity is not None and verbosity >= 2

    gloo_missing = ('Gloo support has not been built.  If this is not expected, ensure CMake is installed '
                    'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')
    mpi_missing = ('MPI support has not been built.  If this is not expected, ensure MPI is installed '
                   'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.')

    if use_gloo:
        if not gloo_built(verbose=chatty):
            raise ValueError(gloo_missing)
        gloo_run()
        return

    if use_mpi:
        if not mpi_built(verbose=chatty):
            raise ValueError(mpi_missing)
        mpi_run()
        return

    if use_jsrun:
        if not mpi_built(verbose=chatty):
            raise ValueError(mpi_missing)
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job.  The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run()
        return

    # Nothing was requested explicitly: pick whatever is available.
    if mpi_built(verbose=chatty):
        launcher = js_run if lsf.LSFUtils.using_lsf() and is_jsrun_installed() else mpi_run
        launcher()
    elif gloo_built(verbose=chatty):
        gloo_run()
    else:
        raise ValueError('Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                         'either MPI is installed (MPI) or CMake is installed (Gloo).')
Example #3
0
    def test_failed_run(self):
        """A crashing worker must abort the run well before the other worker finishes sleeping."""

        def crash_on_rank_one():
            hvd.init()
            if hvd.rank() == 1:
                raise RuntimeError()
            # Every other worker lingers for two minutes before exiting.
            time.sleep(120)

        assert gloo_built() or mpi_built()

        began = time.time()

        if gloo_built():
            gloo_failure = 'Horovod detected that one or more processes exited'
            with pytest.raises(RuntimeError, match=gloo_failure):
                run(crash_on_rank_one, np=2, use_gloo=True)

        if mpi_built():
            mpi_failure = 'mpirun failed'
            with pytest.raises(RuntimeError, match=mpi_failure):
                run(crash_on_rank_one, np=2, use_mpi=True)

        # The controller should be terminating workers way before the 2-minute delay.
        elapsed = time.time() - began
        assert elapsed < 60
Example #4
0
def _launch_job(args, remote_host_names, settings, common_intfs, command):
    """Start the job with the requested controller, or the first one available.

    The environment handed to the launcher is a copy of ``os.environ``
    updated through ``config_parser.set_env_from_args``.  Without an explicit
    ``args.use_gloo`` / ``args.use_mpi`` request, MPI is preferred over Gloo.

    Args:
        args: Parsed launcher arguments (reads ``use_gloo`` and ``use_mpi``).
        remote_host_names: Remote hosts, forwarded to the Gloo launcher.
        settings: Run settings; ``settings.verbose >= 2`` makes the build
            checks verbose.
        common_intfs: Network interfaces shared by the hosts.
        command: Command forwarded to the launcher.

    Raises:
        ValueError: If the requested controller, or any controller at all,
            has not been built.
    """
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    driver_ip = _get_driver_ip(common_intfs)
    chatty = settings.verbose >= 2

    def start_gloo():
        gloo_run(settings, remote_host_names, common_intfs, env, driver_ip, command)

    def start_mpi():
        mpi_run(settings, common_intfs, env, command)

    if args.use_gloo:
        if not gloo_built(verbose=chatty):
            raise ValueError(
                'Gloo support has not been built.  If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')
        start_gloo()
        return

    if args.use_mpi:
        if not mpi_built(verbose=chatty):
            raise ValueError(
                'MPI support has not been built.  If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.')
        start_mpi()
        return

    # No explicit choice: MPI first, Gloo second.
    if mpi_built(verbose=chatty):
        start_mpi()
    elif gloo_built(verbose=chatty):
        start_gloo()
    else:
        raise ValueError(
            'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
            'either MPI is installed (MPI) or CMake is installed (Gloo).')
Example #5
0
    def test_happy_run(self):
        """Run a function on 1 and 3 processes with each built controller and check the per-rank results."""

        def worker(a, b, c, d):
            hvd.init()
            rank = hvd.rank()
            total = a + b + c + d
            gathered = hvd.allgather(torch.tensor([rank, total])).tolist()
            if rank == 0:
                return gathered
            return "ret_val_of_rank_1" if rank == 1 else None

        assert gloo_built() or mpi_built()
        for use_gloo, use_mpi in ((True, False), (False, True)):
            # Skip controller combinations that this build cannot provide.
            if (use_mpi and not mpi_built()) or (use_gloo and not gloo_built()):
                continue

            def launch(num_workers):
                return run(worker, (1, 20), {"c": 300, "d": 4000},
                           np=num_workers, use_gloo=use_gloo, use_mpi=use_mpi)

            single = launch(1)
            self.assertListEqual([[0, 4321]], single)

            triple = launch(3)
            expected_triple = [[0, 4321, 1, 4321, 2, 4321], "ret_val_of_rank_1", None]
            self.assertListEqual(expected_triple, triple)
Example #6
0
    def test_happy_run_elastic_fault_tolerant_fails(self):
        """Elastic Spark run with as many injected failures as Spark tolerates (currently always skipped)."""
        self.skipTest(
            'elastic horovod does not support shutdown from the spark driver '
            'while elastic driver is waiting for hosts to come up')

        if not gloo_built():
            self.skipTest("Gloo is not available")

        # these files make training function fail in given rank, epoch and batch
        # we have as many failures as Spark has max_failures (per task / index)
        failure_markers = ('rank_1_epoch_2_batch_4_fail',
                           'rank_1_epoch_3_batch_1_fail')
        expected = [([0, 4, 0, 4, 1, 4, 0, 4], 0),
                    ([0, 4, 0, 4, 1, 4, 0, 4], 1)]

        with spark_session('test_happy_run_elastic_fault_tolerant_fails', max_failures=2):
            with tempdir() as work_dir:
                for marker in failure_markers:
                    # Creating the empty file is all that is needed.
                    with open(os.path.sep.join([work_dir, marker]), 'w'):
                        pass
                outcome = horovod.spark.run_elastic(
                    fn,
                    args=(2, 5, 5, work_dir),
                    env={'HOROVOD_LOG_LEVEL': 'DEBUG'},
                    num_proc=2,
                    min_num_proc=2,
                    max_num_proc=2,
                    start_timeout=5,
                    verbose=2)
                self.assertListEqual(expected, outcome)
Example #7
0
    def test_happy_run_elastic_fault_tolerant(self):
        """Elastic Spark run that survives three injected training failures."""
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        if not gloo_built():
            self.skipTest("Gloo is not available")

        # these files make training function fail in given rank, epoch and batch
        failure_markers = ('rank_1_epoch_2_batch_4_fail',
                           'rank_0_epoch_3_batch_1_fail',
                           'rank_1_epoch_4_batch_2_fail')
        expected = [([0, 4, 0, 4, 1, 4, 0, 4], 0),
                    ([0, 4, 0, 4, 1, 4, 0, 4], 1)]

        with spark_session('test_happy_run_elastic_fault_tolerant', max_failures=3):
            with tempdir() as work_dir:
                for marker in failure_markers:
                    # Creating the empty file is all that is needed.
                    with open(os.path.sep.join([work_dir, marker]), 'w'):
                        pass
                outcome = horovod.spark.run_elastic(
                    fn,
                    args=(2, 5, 5, work_dir),
                    env={'HOROVOD_LOG_LEVEL': 'DEBUG'},
                    num_proc=2,
                    min_num_proc=2,
                    max_num_proc=2,
                    start_timeout=5,
                    verbose=2)
                self.assertListEqual(expected, outcome)
Example #8
0
    def test_failed_run(self):
        """A worker raising on rank 1 must surface as RuntimeError from run() for each built controller."""

        def crash_on_rank_one():
            hvd.init()
            if hvd.rank() == 1:
                raise RuntimeError()

        assert gloo_built() or mpi_built()

        if gloo_built():
            gloo_failure = 'Horovod detected that one or more processes exited'
            with pytest.raises(RuntimeError, match=gloo_failure):
                run(crash_on_rank_one, np=2, use_gloo=True)

        if mpi_built():
            mpi_failure = 'mpirun failed'
            with pytest.raises(RuntimeError, match=mpi_failure):
                run(crash_on_rank_one, np=2, use_mpi=True)
Example #9
0
def _run_elastic(args):
    """Build elastic settings from the parsed arguments and start an elastic Gloo run.

    Host discovery comes from ``args.host_discovery_script`` when given,
    otherwise from the fixed ``args.hosts`` list.  The start timeout is taken
    from ``args.start_timeout`` or, failing that, from the
    ``HOROVOD_START_TIMEOUT`` environment variable (default 30).

    Args:
        args: Parsed launcher arguments.

    Returns:
        Whatever ``gloo_run_elastic`` returns.

    Raises:
        ValueError: If no host source is given, if fewer than two fixed hosts
            are supplied, or if Gloo support has not been built.
    """
    # construct host discovery component
    if args.host_discovery_script:
        discover_hosts = discovery.HostDiscoveryScript(args.host_discovery_script,
                                                       args.slots)
    elif not args.hosts:
        raise ValueError(
            'One of --host-discovery-script, --hosts, or --hostnames must be provided'
        )
    else:
        _, available_host_slots = hosts.parse_hosts_and_slots(args.hosts)
        if len(available_host_slots) < 2:
            raise ValueError(
                'Cannot run in fault tolerance mode with fewer than 2 hosts.')
        discover_hosts = discovery.FixedHosts(available_host_slots)

    # horovodrun has to finish all the checks before this timeout runs out.
    # Fall back to the environment variable when no explicit value was given.
    start_timeout = args.start_timeout or int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    timeout_message = ('Timed out waiting for {activity}. Please '
                       'check connectivity between servers. You '
                       'may need to increase the --start-timeout '
                       'parameter if you have too many servers.')
    tmout = timeout.Timeout(start_timeout, message=timeout_message)

    settings = elastic_settings.ElasticSettings(
        discovery=discover_hosts,
        min_num_proc=args.min_num_proc or args.num_proc,
        max_num_proc=args.max_num_proc,
        elastic_timeout=args.elastic_timeout,
        reset_limit=args.reset_limit,
        cooldown_range=args.cooldown_range,
        num_proc=args.num_proc,
        verbose=2 if args.verbose else 0,
        ssh_port=args.ssh_port,
        ssh_identity_file=args.ssh_identity_file,
        extra_mpi_args=args.mpi_args,
        key=secret.make_secret_key(),
        start_timeout=tmout,
        output_filename=args.output_filename,
        run_func_mode=args.run_func is not None,
        nics=args.nics,
        prefix_output_with_timestamp=args.prefix_output_with_timestamp)

    # Elastic training is only implemented on top of Gloo.
    gloo_available = gloo_built(verbose=(settings.verbose >= 2))
    if not gloo_available:
        raise ValueError(
            'Gloo support is required to use elastic training, but has not been built.  Ensure CMake is '
            'installed and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
        )

    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    executable = args.executable or sys.executable
    target = args.run_func or args.command
    return gloo_run_elastic(settings, env, target, executable)
Example #10
0
    def test_run_failure(self, controller, mode, run):
        """Run the controller-failure scenario, skipping when the controller is unavailable."""
        gloo_missing = controller == 'gloo' and not gloo_built()
        if gloo_missing:
            self.skipTest("Gloo is not available")

        mpi_missing = controller == 'mpi' and (not mpi_built() or not mpi_available())
        if mpi_missing:
            self.skipTest("MPI is not available")

        self.do_test_run_with_controller_failure(controller, mode, run)
Example #11
0
def _launch_job(args, remote_host_names, settings, nics, command):
    """Start the job with the requested launcher, or the first one available.

    The environment handed to the launcher is a copy of ``os.environ``
    updated through ``config_parser.set_env_from_args``.  Without an explicit
    request, MPI is preferred (through jsrun when inside an LSF job with
    jsrun installed), then Gloo.

    Args:
        args: Parsed launcher arguments (reads ``use_gloo``, ``use_mpi`` and
            ``use_jsrun``).
        remote_host_names: Remote hosts, forwarded to the Gloo launcher.
        settings: Run settings; ``settings.verbose >= 2`` makes the build
            checks verbose.
        nics: Network interfaces to use.
        command: Command forwarded to the launcher.

    Raises:
        ValueError: If the requested controller has not been built, if jsrun
            is requested outside of an LSF job, or if neither MPI nor Gloo
            support is available.
    """
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    chatty = settings.verbose >= 2

    mpi_missing = (
        'MPI support has not been built.  If this is not expected, ensure MPI is installed '
        'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
    )

    def start_gloo():
        # The driver IP is only resolved when Gloo is actually used.
        gloo_run(settings, remote_host_names, nics, env,
                 network._get_driver_ip(nics), command)

    if args.use_gloo:
        if not gloo_built(verbose=chatty):
            raise ValueError(
                'Gloo support has not been built.  If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        start_gloo()
        return

    if args.use_mpi:
        if not mpi_built(verbose=chatty):
            raise ValueError(mpi_missing)
        mpi_run(settings, nics, env, command)
        return

    if args.use_jsrun:
        if not mpi_built(verbose=chatty):
            raise ValueError(mpi_missing)
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job.  The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run(settings, nics, env, command)
        return

    # Nothing was requested explicitly: pick whatever is available.
    if mpi_built(verbose=chatty):
        launcher = js_run if lsf.LSFUtils.using_lsf() and is_jsrun_installed() else mpi_run
        launcher(settings, nics, env, command)
    elif gloo_built(verbose=chatty):
        start_gloo()
    else:
        raise ValueError(
            'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
            'either MPI is installed (MPI) or CMake is installed (Gloo).')
Example #12
0
    def test_happy_run_elastic(self):
        """Elastic Spark run with two processes returns the expected per-rank results."""
        if not gloo_built():
            self.skipTest("Gloo is not available")

        with spark_session('test_happy_run_elastic'):
            outcome = horovod.spark.run_elastic(
                fn,
                args=(2, 5, 4),
                num_proc=2,
                min_np=2,
                max_np=2,
                start_timeout=10,
                verbose=2)
            expected = [([0, 3, 0, 1, 1, 3, 0, 1], 0),
                        ([0, 3, 0, 1, 1, 3, 0, 1], 1)]
            self.assertListEqual(expected, outcome)
Example #13
0
    def test_run_success(self, controller, mode, run):
        """Run the controller-success scenario, skipping when the controller cannot be tested."""
        if controller == 'gloo' and not gloo_built():
            self.skipTest("Gloo is not available")

        if controller == 'mpi':
            # Checked in order; the first condition that holds skips the test.
            mpi_blockers = (
                (lambda: not (mpi_built() and mpi_available()), "MPI is not available"),
                (is_mpich, "MPICH is not testable"),
            )
            for is_blocked, reason in mpi_blockers:
                if is_blocked():
                    self.skipTest(reason)

        self.do_test_run_with_controller_success(controller, mode, run)
Example #14
0
    def test_run_failure(self, controller, mode, run):
        """Run the controller-failure scenario, skipping when the controller cannot be tested."""
        if controller == 'gloo' and not gloo_built():
            self.skipTest("Gloo is not available")

        if controller == 'mpi':
            # Checked in order; the first condition that holds skips the test.
            mpi_blockers = (
                (lambda: not (mpi_built() and mpi_available()), "MPI is not available"),
                (is_mpich, "MPICH is not testable"),
                (is_intel_mpi, "Intel(R) MPI is not testable because it is based on MPICH"),
            )
            for is_blocked, reason in mpi_blockers:
                if is_blocked():
                    self.skipTest(reason)

        self.do_test_run_with_controller_failure(controller, mode, run)
Example #15
0
    def test_happy_run_elastic(self):
        """Elastic Spark run with two processes returns the expected per-rank results."""
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        if not gloo_built():
            self.skipTest("Gloo is not available")

        with spark_session('test_happy_run_elastic'):
            outcome = horovod.spark.run_elastic(
                fn,
                args=(2, 5, 4),
                num_proc=2,
                min_np=2,
                max_np=2,
                start_timeout=10,
                verbose=2)
            expected = [([0, 3, 0, 1, 1, 3, 0, 1], 0),
                        ([0, 3, 0, 1, 1, 3, 0, 1], 1)]
            self.assertListEqual(expected, outcome)
Example #16
0
def check_build(verbose):
    """Print a summary of the Horovod build and terminate the process.

    The report lists the Horovod version and marks with an ``X`` every
    framework, controller and tensor operation that the corresponding
    ``extension_available`` / ``*_built`` helper reports as available.
    The process then exits with status 0 via ``os._exit``, so this function
    never returns to its caller.

    Args:
        verbose: Forwarded to every availability helper; when truthy the
            report is additionally preceded by an empty line.
    """
    # Imported locally so the block does not depend on a module-level import.
    import sys

    def get_check(value):
        # Checkbox content: 'X' for available, blank otherwise.
        return 'X' if value else ' '

    output = '''{verbose_newline}\
    Horovod v{version}:

    Available Frameworks:
        [{tensorflow}] TensorFlow
        [{torch}] PyTorch
        [{mxnet}] MXNet

    Available Controllers:
        [{mpi}] MPI
        [{gloo}] Gloo

    Available Tensor Operations:
        [{nccl_ops}] NCCL
        [{ddl_ops}] DDL
        [{ccl_ops}] CCL
        [{mpi_ops}] MPI
        [{gloo_ops}] Gloo\
    '''.format(verbose_newline='\n' if verbose else '',
               version=horovod.__version__,
               tensorflow=get_check(
                   extension_available('tensorflow', verbose=verbose)),
               torch=get_check(extension_available('torch', verbose=verbose)),
               mxnet=get_check(extension_available('mxnet', verbose=verbose)),
               mpi=get_check(mpi_built(verbose=verbose)),
               gloo=get_check(gloo_built(verbose=verbose)),
               nccl_ops=get_check(nccl_built(verbose=verbose)),
               ddl_ops=get_check(ddl_built(verbose=verbose)),
               mpi_ops=get_check(mpi_built(verbose=verbose)),
               ccl_ops=get_check(ccl_built(verbose=verbose)),
               gloo_ops=get_check(gloo_built(verbose=verbose)))
    print(textwrap.dedent(output))
    # os._exit() terminates immediately WITHOUT flushing stdio buffers.
    # Flush explicitly, otherwise the report can be lost whenever stdout is
    # block-buffered (e.g. redirected to a pipe or a file).
    sys.stdout.flush()
    os._exit(0)
Example #17
0
    def test_happy_run_elastic(self):
        """Smoke test: an elastic run over two local host aliases completes without raising."""
        if not gloo_built():
            self.skipTest("Gloo is not available")

        args = _HorovodArgs()
        overrides = (
            # we need two different hosts here, otherwise would need to give args.nics
            ('hosts', 'localhost:2,127.0.0.1:2'),
            ('command', [sys.executable, '-V']),
            ('num_proc', 2),
            ('min_num_proc', 2),
            ('verbose', True),
        )
        for attribute, value in overrides:
            setattr(args, attribute, value)

        # no assertions, we are happy when there are no exceptions
        # TODO: call into run() when elastic supports args.run_func (#1873)
        #       we can assert the returned result then
        _run(args)
Example #18
0
def run():
    """Entry point of the launcher: validate the environment and start the job.

    Steps, in order:
      1. Parse the command line; with ``--check-build`` delegate to
         ``check_build``.
      2. Resolve the host list (explicit hosts, hostfile, or localhost) and
         validate its ``name:slots`` format.
      3. Build the start timeout and the run settings.
      4. Optionally set up the cache for the initialization checks.
      5. Check SSH access to remote hosts and determine the common network
         interfaces (or the loopback interfaces when all hosts are local).
      6. Launch through Gloo or MPI, as requested or as available.

    Raises:
        ValueError: On a malformed host entry, when no interface carries
            127.0.0.1 in the all-local case, or when the requested
            controller (or any controller) has not been built.
    """
    args = parse_args()

    if args.check_build:
        # NOTE(review): presumably check_build() terminates the process;
        # otherwise execution simply continues below -- confirm.
        check_build(args.verbose)

    # if hosts are not specified, either parse from hostfile, or default as
    # localhost
    if not args.hosts:
        if args.hostfile:
            args.hosts = parse_host_files(args.hostfile)
        else:
            # Set hosts to localhost if not specified
            args.hosts = 'localhost:{np}'.format(np=args.np)

    host_list = args.hosts.split(',')
    all_host_names = []
    # Each entry must look like "<host>:<slots>".
    pattern = re.compile(r'^[\w.-]+:\d+$')
    for host in host_list:
        if not pattern.match(host.strip()):
            raise ValueError('Invalid host input, please make sure it has '
                             'format as : worker-0:2,worker-1:2.')
        all_host_names.append(host.strip().split(':')[0])

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                            'check connectivity between servers. You '
                            'may need to increase the --start-timeout '
                            'parameter if you have too many servers.')
    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     key=secret.make_secret_key(),
                                     timeout=tmout,
                                     num_hosts=len(all_host_names),
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     command=args.command)

    # This cache stores the results of checks performed by horovodrun
    # during the initialization step. It can be disabled by setting
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        # The cache is keyed by a hash of np, hosts and ssh port.
        params = ''
        if args.np:
            params += str(args.np) + ' '
        if args.hosts:
            params += str(args.hosts) + ' '
        if args.ssh_port:
            params += str(args.ssh_port)
        parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        _check_all_hosts_ssh_successful(remote_host_names,
                                        args.ssh_port,
                                        fn_cache=fn_cache)
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Testing interfaces on all the hosts.')

        local_host_names = set(all_host_names) - set(remote_host_names)
        # Find the set of common, routed interfaces on all the hosts (remote
        # and local) and specify it in the args to be used by NCCL. It is
        # expected that the following function will find at least one interface
        # otherwise, it will raise an exception.
        common_intfs = _driver_fn(all_host_names,
                                  local_host_names,
                                  settings,
                                  fn_cache=fn_cache)

        if settings.verbose >= 2:
            print('Interfaces on all the hosts were successfully checked.')
            print('Common interface found: ' + ' '.join(common_intfs))

    else:
        if settings.verbose >= 2:
            print('All hosts are local, finding the interfaces '
                  'with address 127.0.0.1')
        # If all the given hosts are local, find the interfaces with address
        # 127.0.0.1
        common_intfs = set()
        for iface, addrs in net_if_addrs().items():
            for addr in addrs:
                if addr.family == AF_INET and addr.address == '127.0.0.1':
                    common_intfs.add(iface)
                    # One matching address is enough for this interface.
                    break

        if len(common_intfs) == 0:
            raise ValueError('No interface is found for address 127.0.0.1.')

        if settings.verbose >= 2:
            print('Local interface found ' + ' '.join(common_intfs))

    # The launcher receives a copy of the current environment updated from
    # the parsed arguments.
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    # An explicitly requested controller must have been built; without an
    # explicit request MPI is preferred over Gloo.
    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built.  If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, common_intfs, env)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built.  If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, common_intfs, env)
    else:
        if mpi_built(verbose=(settings.verbose >= 2)):
            mpi_run(settings, common_intfs, env)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, common_intfs, env)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
Example #19
0
    actor.update_env_vars.remote({"TEST": DUMMY_VALUE})
    assert ray.get(actor.env_vars.remote())["TEST"] == str(DUMMY_VALUE)


def test_local(ray_start_4_cpus):
    """All four slots of a single-host executor report one node IP, and resources are restored afterwards."""
    resources_before = ray.available_resources()

    executor = RayExecutor(RayExecutor.create_settings(timeout_s=30),
                           num_hosts=1,
                           num_slots=4)
    executor.start()
    node_ips = executor.execute(lambda _: ray.services.get_node_ip_address())
    distinct_ips = set(node_ips)
    assert len(distinct_ips) == 1, node_ips
    executor.shutdown()

    assert check_resources(resources_before)


@pytest.mark.skipif(not gloo_built(), reason='Gloo is required for Ray integration')
def test_ray_init(ray_start_4_cpus):
    """Four workers on one host initialise Horovod and report four distinct ranks."""
    original_resources = ray.available_resources()

    def report_rank(worker):
        import horovod.torch as hvd
        hvd.init()
        return hvd.rank()

    executor = RayExecutor(RayExecutor.create_settings(timeout_s=30),
                           num_hosts=1,
                           num_slots=4,
                           use_gpu=torch.cuda.is_available())
    executor.start()
    ranks = executor.execute(report_rank)
    distinct_ranks = set(ranks)
    assert len(distinct_ranks) == 4
    executor.shutdown()
Example #20
0
# ==============================================================================

import os
import unittest
from tempfile import NamedTemporaryFile

import horovod
from horovod.common.util import gloo_built
from horovod.runner.common.util.env import get_env_rank_and_size


def train():
    """Return whatever ``get_env_rank_and_size()`` reports for this process."""
    rank_and_size = get_env_rank_and_size()
    return rank_and_size


@unittest.skipIf(not gloo_built(), "Gloo is not available")
class ElasticRunTests(unittest.TestCase):
    """
    Tests for run api with elastic config.
    """
    def test_run_with_hosts(self):
        """Happy path: two usable hosts offering two slots each."""
        run_options = dict(num_proc=2,
                           min_num_proc=2,
                           max_num_proc=2,
                           hosts='localhost:2,127.0.0.1:2')
        outcome = horovod.run(train, **run_options)
        expected = [(0, 2), (1, 2)]
        self.assertEqual(expected, outcome)

    def test_run_with_discovery_script(self):
Example #21
0
    return logger, training_fn


@contextmanager
def fault_tolerance_patches():
    """Context manager patching the discovery frequency to 0.1 and ``get_driver_ip`` to this host's resolved address."""
    with mock.patch('horovod.runner.elastic.driver.DISCOVER_HOSTS_FREQUENCY_SECS', 0.1), \
         mock.patch("horovod.runner.util.network.get_driver_ip",
                    return_value=socket.gethostbyname(socket.gethostname())):
        yield


@pytest.mark.skipif(not gloo_built(),
                    reason='Gloo is required for Ray integration')
def test_fault_tolerance_hosts_added_and_removed(ray_8_cpus):
    # Sets up an elastic Ray executor driven by a scripted discovery schedule.
    # NOTE(review): the visible part of this test stops after the executor is
    # constructed; the remainder is not shown here.
    with fault_tolerance_patches():
        # Each entry pairs a value (or None) with a list of "host:slots"
        # strings.
        # NOTE(review): the first element is presumably a duration or a
        # time offset, with None meaning "until the end" -- confirm against
        # SimpleTestDiscovery.
        discovery_schedule = [
            (20, ['host-1:2']),
            (60, ['host-1:2', 'host-2:1', 'host-3:1']),
            (None, ['host-2:1']),
        ]
        # Despite the plural name this is a single interface name: the first
        # one reported by psutil.
        nics = list(psutil.net_if_addrs().keys())[0]

        settings = ElasticRayExecutor.create_settings(min_np=1, nics={nics})
        # Replace the discovery on the created settings with the scripted one.
        settings.discovery = SimpleTestDiscovery(discovery_schedule)
        executor = ElasticRayExecutor(settings,
                                      cpus_per_slot=1,
                                      override_discovery=False)
Example #22
0
def run_elastic(
        fn,
        args=(),
        kwargs={},
        num_proc=None,
        min_num_proc=None,
        max_num_proc=None,
        start_timeout=None,
        elastic_timeout=None,
        reset_limit=None,
        env=None,
        stdout=None,
        stderr=None,
        verbose=1,
        nics=None,
        prefix_output_with_timestamp=False,
        # min_np is deprecated, use min_num_proc instead
        min_np=None,
        # max_np is deprecated, use max_num_proc instead
        max_np=None):
    """
    Runs Elastic Horovod on Spark.  Runs `num_proc` processes executing `fn` using the same amount of Spark tasks.

    Args:
        fn: Function to run.
        args: Arguments to pass to `fn`.
        kwargs: Keyword arguments to pass to `fn`.
        num_proc: Number of Horovod processes.  Defaults to `spark.default.parallelism`.
        min_num_proc: Minimum number of processes running for training to continue.
                      If number of available processes dips below this threshold,
                      then training will wait for more instances to become available.
                      Defaults to `num_proc`.
        max_num_proc: Maximum number of training processes,
                      beyond which no additional processes will be created.
                      Defaults to `num_proc`.
        start_timeout: Timeout for Spark tasks to spawn, register and start running the code, in seconds.
                       If not set, falls back to `HOROVOD_SPARK_START_TIMEOUT` environment variable value.
                       If it is not set as well, defaults to 600 seconds.
        elastic_timeout: Timeout for elastic initialisation after re-scaling the cluster.
                       If not set, falls back to `HOROVOD_ELASTIC_TIMEOUT` environment variable value.
                       If it is not set as well, defaults to 600 seconds.
        reset_limit: Maximum number of resets after which the job is terminated.
        env: Environment dictionary to use in Horovod run.  Defaults to `os.environ`.
        stdout: Horovod stdout is redirected to this stream.
        stderr: Horovod stderr is redirected to this stream.
        verbose: Debug output verbosity (0-2). Defaults to 1.
        nics: List of NICs for tcp network communication.
        prefix_output_with_timestamp: shows timestamp in stdout/stderr forwarding on the driver
        min_np: Deprecated alias of `min_num_proc`.  When given, it overrides `min_num_proc`
                and a `DeprecationWarning` is issued.
        max_np: Deprecated alias of `max_num_proc`.  When given, it overrides `max_num_proc`
                and a `DeprecationWarning` is issued.

    Returns:
        List of results returned by running `fn` on each rank.

    Raises:
        ValueError: If Horovod has been built without Gloo support.
        Exception: If there is no active SparkContext.
    """
    # stacklevel=2 attributes the warnings to the caller's line rather than to
    # this module; Python's default filters only display DeprecationWarning
    # when it is attributed to code running in __main__.
    if min_np is not None:
        min_num_proc = min_np
        warnings.warn('min_np is deprecated, use min_num_proc instead',
                      DeprecationWarning,
                      stacklevel=2)
    if max_np is not None:
        max_num_proc = max_np
        warnings.warn('max_np is deprecated, use max_num_proc instead',
                      DeprecationWarning,
                      stacklevel=2)

    if not gloo_built(verbose=(verbose >= 2)):
        raise ValueError(
            'Gloo support is required to use elastic training, but has not been built.  Ensure CMake is '
            'installed and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
        )

    spark_context = pyspark.SparkContext._active_spark_context
    if spark_context is None:
        raise Exception('Could not find an active SparkContext, are you '
                        'running in a PySpark session?')

    if start_timeout is None:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_SPARK_START_TIMEOUT', '600'))

    # nics needs to be a set
    if nics and not isinstance(nics, set):
        nics = set(nics)

    if num_proc is None:
        # TODO: #2023 try spark.dynamicAllocation.initialExecutors
        num_proc = spark_context.defaultParallelism
        if verbose >= 1:
            logging.info(
                'Running %d processes (inferred from spark.default.parallelism)...',
                num_proc)
    else:
        if verbose >= 1:
            logging.info('Running %d processes...', num_proc)

    # Both bounds fall back to num_proc, i.e. a fixed-size job by default.
    if min_num_proc is None:
        # TODO: #2023 try spark.dynamicAllocation.minExecutors
        min_num_proc = num_proc
    if max_num_proc is None:
        # TODO: #2023 try spark.dynamicAllocation.maxExecutors
        max_num_proc = num_proc

    # start Spark driver service and launch settings.num_proc Spark tasks
    key = secret.make_secret_key()
    spark_job_group = 'horovod.spark.run.%d' % job_id.next_job_id()
    driver = driver_service.SparkDriverService(num_proc, max_num_proc, fn,
                                               args, kwargs, key, nics)

    discovery = host_discovery.SparkDriverHostDiscovery(driver)

    tmout = timeout.Timeout(
        start_timeout,
        message='Timed out waiting for {activity}. Please check that you have '
        'enough resources to run all Horovod processes. Each Horovod '
        'process runs in a Spark task. You may need to increase the '
        'start_timeout parameter to a larger value if your Spark resources '
        'are allocated on-demand.')
    settings = hvd_elastic_settings.ElasticSettings(
        discovery=discovery,
        min_num_proc=min_num_proc,
        max_num_proc=max_num_proc,
        elastic_timeout=elastic_timeout,
        reset_limit=reset_limit,
        num_proc=num_proc,
        verbose=verbose,
        key=key,
        start_timeout=tmout,
        nics=nics,
        run_func_mode=True,
        prefix_output_with_timestamp=prefix_output_with_timestamp)

    # Holds at most one item: the results handed back by the Spark thread.
    result_queue = queue.Queue(1)

    # launch settings.num_proc / settings.max_num_proc Spark tasks
    spark_thread = _make_spark_thread(spark_context,
                                      spark_job_group,
                                      driver,
                                      result_queue,
                                      settings,
                                      use_gloo=True,
                                      is_elastic=True)
    try:
        # Register task addresses of initial num_proc tasks
        _register_task_addresses(driver, settings)

        # Run the job
        gloo_run_elastic(settings, driver, env, stdout, stderr)
    except BaseException:
        # Deliberately catches everything (including KeyboardInterrupt) so the
        # Spark job group is cancelled on any failure; the error is re-raised.
        spark_context.cancelJobGroup(spark_job_group)

        # Re-raise exception.
        raise
    finally:
        # Always wait for the Spark thread and stop the driver service.
        spark_thread.join()
        driver.shutdown()

    # Make sure Spark Job did not fail.
    driver.check_for_spark_job_failure()

    # get ranks from driver
    indices_in_rank_order = _get_indices_in_rank_order(driver)

    # If there's no exception, execution results are in this queue.
    results = result_queue.get_nowait()
    return [results[index] for index in indices_in_rank_order]
Example #23
0
    return training_fn


@contextmanager
def fault_tolerance_patches():
    """Context manager installing the patches used by fault-tolerance tests.

    For the duration of the context,
    `horovod.runner.elastic.driver.DISCOVER_HOSTS_FREQUENCY_SECS` is set to
    0.1 and `horovod.runner.util.network.get_driver_ip` is mocked so that it
    returns the address this machine's hostname resolves to.  Both patches
    are reverted on exit.
    """
    # One `with` statement carrying both managers behaves like nesting them:
    # the second patch is created and entered only after the first is active.
    with mock.patch(
            'horovod.runner.elastic.driver.DISCOVER_HOSTS_FREQUENCY_SECS',
            0.1), \
            mock.patch(
                "horovod.runner.util.network.get_driver_ip",
                return_value=socket.gethostbyname(socket.gethostname())):
        yield


@pytest.mark.skipif(
    not gloo_built(), reason='Gloo is required for Ray integration')
@pytest.mark.skip(reason='https://github.com/horovod/horovod/issues/3197')
def test_fault_tolerance_hosts_added_and_removed(ray_8_cpus):
    with fault_tolerance_patches():
        discovery_schedule = [
            (10, ['host-1:2']),
            (30, ['host-1:2', 'host-2:1', 'host-3:1']),
            (None, ['host-2:1']),
        ]
        nics = list(psutil.net_if_addrs().keys())[0]

        settings = RayExecutor.create_settings(nics={nics})
        settings.discovery = SimpleTestDiscovery(discovery_schedule)
        executor = RayExecutor(
            settings,
            min_workers=1,
Example #24
0
@pytest.mark.skipif(not gloo_built(),
                    reason='Gloo is required for Ray integration')
class TestRayDiscoverySuite:
    """Checks the host-to-slot mapping produced by `RayHostDiscovery`.

    The skip marker is applied once at class level, so every test method in
    the suite is skipped when Gloo has not been built.
    """

    @staticmethod
    def _fake_ray_nodes(resources):
        """Return a zero-argument replacement for `ray.nodes`.

        Args:
            resources: Resource dictionary (e.g. ``{"GPU": 2, "CPU": 8}``)
                reported by every fake node; each node gets its own copy.

        Returns:
            A callable producing one alive node entry for each of the hosts
            ``host-1``, ``host-2`` and ``host-3``.
        """
        host_names = ["host-1", "host-2", "host-3"]

        def create_node_entry(hostname):
            return {
                "NodeManagerAddress": hostname,
                "Resources": resources.copy(),
                "alive": True
            }

        def nodes():
            # A fresh iterator on every call, so each lookup sees all hosts.
            return map(create_node_entry, host_names)

        return nodes

    def test_cpu_discovery(self, ray_shutdown):
        """4 CPUs at one CPU per slot give a single host with 4 slots."""
        ray.init(num_cpus=4, num_gpus=1)
        discovery = RayHostDiscovery(cpus_per_slot=1)
        mapping = discovery.find_available_hosts_and_slots()
        assert len(mapping) == 1
        assert list(mapping.values()) == [4]

    def test_gpu_discovery(self, ray_shutdown):
        """With `use_gpu`, the single GPU gives a single host with 1 slot."""
        ray.init(num_cpus=4, num_gpus=1)
        discovery = RayHostDiscovery(use_gpu=True, cpus_per_slot=1)
        mapping = discovery.find_available_hosts_and_slots()
        assert len(mapping) == 1
        assert list(mapping.values()) == [1]

    def test_gpu_slot_discovery(self, ray_shutdown):
        """4 GPUs at two GPUs per slot give a single host with 2 slots."""
        ray.init(num_cpus=4, num_gpus=4)
        discovery = RayHostDiscovery(use_gpu=True,
                                     cpus_per_slot=1,
                                     gpus_per_slot=2)
        mapping = discovery.find_available_hosts_and_slots()
        assert len(mapping) == 1
        assert list(mapping.values()) == [2]

    def test_multinode(self, monkeypatch):
        """Three hosts with 2 GPUs each give 2 slots per host."""
        monkeypatch.setattr(ray, "nodes",
                            self._fake_ray_nodes({"GPU": 2, "CPU": 8}))
        discovery = RayHostDiscovery(use_gpu=True, cpus_per_slot=1)
        mapping = discovery.find_available_hosts_and_slots()
        assert len(mapping) == 3
        assert list(mapping.values()) == [2, 2, 2]

    def test_multinode_gpus_per_slot(self, monkeypatch):
        """Three hosts with 2 GPUs each at two GPUs per slot give 1 slot per host."""
        monkeypatch.setattr(ray, "nodes",
                            self._fake_ray_nodes({"GPU": 2, "CPU": 8}))
        discovery = RayHostDiscovery(use_gpu=True, gpus_per_slot=2)
        mapping = discovery.find_available_hosts_and_slots()
        assert len(mapping) == 3
        assert list(mapping.values()) == [1, 1, 1]

    def test_multinode_mismatch(self, monkeypatch):
        """GPU discovery over nodes that report no GPUs yields zero slots in total."""
        monkeypatch.setattr(ray, "nodes", self._fake_ray_nodes({"CPU": 8}))
        discovery = RayHostDiscovery(use_gpu=True, cpus_per_slot=1)
        mapping = discovery.find_available_hosts_and_slots()
        assert sum(mapping.values()) == 0
Example #25
0
 def test_gloo_built(self):
     """Assert Gloo is built when HOROVOD_RANK holds a non-negative integer.

     When the variable is unset (treated as -1) or negative, the test makes
     no assertion.
     """
     rank_value = int(os.getenv('HOROVOD_RANK', -1))
     if rank_value < 0:
         return
     self.assertTrue(gloo_built())