Beispiel #1
0
def _launch_job(args, remote_host_names, settings, common_intfs, command):
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    driver_ip = _get_driver_ip(common_intfs)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built.  If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, common_intfs, env, driver_ip,
                 command)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built.  If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, common_intfs, env, command)
    else:
        if mpi_built(verbose=(settings.verbose >= 2)):
            mpi_run(settings, common_intfs, env, command)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, common_intfs, env, driver_ip,
                     command)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
Beispiel #2
0
    def __call__(self, parser, args, values, option_string=None):
        output = '''\
        Horovod v{version}:

        Available Frameworks:
          [{tensorflow}] TensorFlow
          [{torch}] PyTorch
          [{mxnet}] MXNet

        Available Controllers:
          [{mpi}] MPI
          [{gloo}] Gloo

        Available Tensor Operations:
          [{nccl_ops}] NCCL
          [{ddl_ops}] DDL
          [{mlsl_ops}] MLSL
          [{mpi_ops}] MPI
          [{gloo_ops}] Gloo\
        '''.format(
            version=horovod.__version__,
            tensorflow=CheckBuildAction.get_check(
                extension_available('tensorflow')),
            torch=CheckBuildAction.get_check(extension_available('torch')),
            mxnet=CheckBuildAction.get_check(extension_available('mxnet')),
            mpi=CheckBuildAction.get_check(mpi_built()),
            gloo=CheckBuildAction.get_check(gloo_built()),
            nccl_ops=CheckBuildAction.get_check(nccl_built()),
            ddl_ops=CheckBuildAction.get_check(ddl_built()),
            mpi_ops=CheckBuildAction.get_check(mpi_built()),
            mlsl_ops=CheckBuildAction.get_check(mlsl_built()),
            gloo_ops=CheckBuildAction.get_check(gloo_built()))
        print(textwrap.dedent(output))
        os._exit(0)
Beispiel #3
0
def run_controller(use_gloo, gloo_run, use_mpi, mpi_run, use_jsrun, js_run, verbosity):
    # keep logic in sync with is_gloo_used(...)
    verbose = verbosity is not None and verbosity >= 2
    if use_gloo:
        if not gloo_built(verbose=verbose):
            raise ValueError('Gloo support has not been built.  If this is not expected, ensure CMake is installed '
                             'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')
        gloo_run()
    elif use_mpi:
        if not mpi_built(verbose=verbose):
            raise ValueError('MPI support has not been built.  If this is not expected, ensure MPI is installed '
                             'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.')
        mpi_run()
    elif use_jsrun:
        if not mpi_built(verbose=verbose):
            raise ValueError('MPI support has not been built.  If this is not expected, ensure MPI is installed '
                             'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.')
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job.  The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run()
    else:
        if mpi_built(verbose=verbose):
            if lsf.LSFUtils.using_lsf() and is_jsrun_installed():
                js_run()
            else:
                mpi_run()
        elif gloo_built(verbose=verbose):
            gloo_run()
        else:
            raise ValueError('Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                             'either MPI is installed (MPI) or CMake is installed (Gloo).')
    def test_failed_run(self):
        def fn():
            hvd.init()
            rank = hvd.rank()
            if rank == 1:
                raise RuntimeError()
            # The other worker waits a while before exiting.
            time.sleep(120)

        assert gloo_built() or mpi_built()

        start = time.time()

        if gloo_built():
            with pytest.raises(
                    RuntimeError,
                    match='Horovod detected that one or more processes exited'
            ):
                run(fn, np=2, use_gloo=True)

        if mpi_built():
            with pytest.raises(RuntimeError, match='mpirun failed'):
                run(fn, np=2, use_mpi=True)

        # The controller should be terminating workers way before the 2-minute delay.
        assert time.time() - start < 60
Beispiel #5
0
    def test_happy_run(self):
        def fn(a, b, c, d):
            hvd.init()
            rank = hvd.rank()
            v = a + b + c + d
            res = hvd.allgather(torch.tensor([rank, v])).tolist()
            if rank == 0:
                return res
            elif rank == 1:
                return "ret_val_of_rank_1"
            else:
                return None

        assert gloo_built() or mpi_built()
        for use_gloo, use_mpi in [(True, False), (False, True)]:
            if use_mpi and not mpi_built():
                continue

            if use_gloo and not gloo_built():
                continue

            res1 = run(fn, (1, 20), {"c": 300, "d": 4000}, np=1, use_gloo=use_gloo, use_mpi=use_mpi)
            self.assertListEqual([[0, 4321]], res1)
            res2 = run(fn, (1, 20), {"c": 300, "d": 4000}, np=3, use_gloo=use_gloo, use_mpi=use_mpi)
            self.assertListEqual([[0, 4321, 1, 4321, 2, 4321],
                                  "ret_val_of_rank_1",
                                  None], res2)
Beispiel #6
0
    def test_failed_run(self):
        def fn():
            hvd.init()
            rank = hvd.rank()
            if rank == 1:
                raise RuntimeError()

        assert gloo_built() or mpi_built()

        if gloo_built():
            with pytest.raises(RuntimeError, match='Horovod detected that one or more processes exited'):
                run(fn, np=2, use_gloo=True)

        if mpi_built():
            with pytest.raises(RuntimeError, match='mpirun failed'):
                run(fn, np=2, use_mpi=True)
Beispiel #7
0
    def test_run_failure(self, controller, mode, run):
        if controller == 'gloo' and not gloo_built():
            self.skipTest("Gloo is not available")
        if controller == 'mpi':
            if not (mpi_built() and mpi_available()):
                self.skipTest("MPI is not available")

        self.do_test_run_with_controller_failure(controller, mode, run)
Beispiel #8
0
    def test_run_success(self, controller, mode, run):
        if controller == 'gloo' and not gloo_built():
            self.skipTest("Gloo is not available")
        if controller == 'mpi':
            if not (mpi_built() and mpi_available()):
                self.skipTest("MPI is not available")
            if is_mpich():
                self.skipTest("MPICH is not testable")

        self.do_test_run_with_controller_success(controller, mode, run)
Beispiel #9
0
def _launch_job(args, remote_host_names, settings, nics, command):
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built.  If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, nics, env,
                 network._get_driver_ip(nics), command)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built.  If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, nics, env, command)
    elif args.use_jsrun:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built.  If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job.  The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        js_run(settings, nics, env, command)
    else:
        if mpi_built(verbose=(settings.verbose >= 2)):
            if lsf.LSFUtils.using_lsf() and is_jsrun_installed():
                js_run(settings, nics, env, command)
            else:
                mpi_run(settings, nics, env, command)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, nics, env,
                     network._get_driver_ip(nics), command)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
    def test_run_failure(self, controller, mode, run):
        if controller == 'gloo' and not gloo_built():
            self.skipTest("Gloo is not available")
        if controller == 'mpi':
            if not (mpi_built() and mpi_available()):
                self.skipTest("MPI is not available")
            if is_mpich():
                self.skipTest("MPICH is not testable")
            if is_intel_mpi():
                self.skipTest(
                    "Intel(R) MPI is not testable because it is based on MPICH"
                )

        self.do_test_run_with_controller_failure(controller, mode, run)
Beispiel #11
0
def check_build(verbose):
    def get_check(value):
        return 'X' if value else ' '

    output = '''{verbose_newline}\
    Horovod v{version}:

    Available Frameworks:
        [{tensorflow}] TensorFlow
        [{torch}] PyTorch
        [{mxnet}] MXNet

    Available Controllers:
        [{mpi}] MPI
        [{gloo}] Gloo

    Available Tensor Operations:
        [{nccl_ops}] NCCL
        [{ddl_ops}] DDL
        [{ccl_ops}] CCL
        [{mpi_ops}] MPI
        [{gloo_ops}] Gloo\
    '''.format(verbose_newline='\n' if verbose else '',
               version=horovod.__version__,
               tensorflow=get_check(
                   extension_available('tensorflow', verbose=verbose)),
               torch=get_check(extension_available('torch', verbose=verbose)),
               mxnet=get_check(extension_available('mxnet', verbose=verbose)),
               mpi=get_check(mpi_built(verbose=verbose)),
               gloo=get_check(gloo_built(verbose=verbose)),
               nccl_ops=get_check(nccl_built(verbose=verbose)),
               ddl_ops=get_check(ddl_built(verbose=verbose)),
               mpi_ops=get_check(mpi_built(verbose=verbose)),
               ccl_ops=get_check(ccl_built(verbose=verbose)),
               gloo_ops=get_check(gloo_built(verbose=verbose)))
    print(textwrap.dedent(output))
    os._exit(0)
Beispiel #12
0
def is_gloo_used(use_gloo=None, use_mpi=None, use_jsrun=None):
    # determines whether run_controller will run gloo
    # for the given (use_gloo, _, use_mpi, _, use_jsrun, _, _)
    return use_gloo or (not use_mpi and not use_jsrun and not mpi_built())
Beispiel #13
0
def run():
    args = parse_args()

    if args.check_build:
        check_build(args.verbose)

    # if hosts are not specified, either parse from hostfile, or default as
    # localhost
    if not args.hosts:
        if args.hostfile:
            args.hosts = parse_host_files(args.hostfile)
        else:
            # Set hosts to localhost if not specified
            args.hosts = 'localhost:{np}'.format(np=args.np)

    host_list = args.hosts.split(',')
    all_host_names = []
    pattern = re.compile(r'^[\w.-]+:\d+$')
    for host in host_list:
        if not pattern.match(host.strip()):
            raise ValueError('Invalid host input, please make sure it has '
                             'format as : worker-0:2,worker-1:2.')
        all_host_names.append(host.strip().split(':')[0])

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                            'check connectivity between servers. You '
                            'may need to increase the --start-timeout '
                            'parameter if you have too many servers.')
    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     key=secret.make_secret_key(),
                                     timeout=tmout,
                                     num_hosts=len(all_host_names),
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     command=args.command)

    # This cache stores the results of checks performed by horovodrun
    # during the initialization step. It can be disabled by setting
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        params = ''
        if args.np:
            params += str(args.np) + ' '
        if args.hosts:
            params += str(args.hosts) + ' '
        if args.ssh_port:
            params += str(args.ssh_port)
        parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        _check_all_hosts_ssh_successful(remote_host_names,
                                        args.ssh_port,
                                        fn_cache=fn_cache)
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Testing interfaces on all the hosts.')

        local_host_names = set(all_host_names) - set(remote_host_names)
        # Find the set of common, routed interfaces on all the hosts (remote
        # and local) and specify it in the args to be used by NCCL. It is
        # expected that the following function will find at least one interface
        # otherwise, it will raise an exception.
        common_intfs = _driver_fn(all_host_names,
                                  local_host_names,
                                  settings,
                                  fn_cache=fn_cache)

        if settings.verbose >= 2:
            print('Interfaces on all the hosts were successfully checked.')
            print('Common interface found: ' + ' '.join(common_intfs))

    else:
        if settings.verbose >= 2:
            print('All hosts are local, finding the interfaces '
                  'with address 127.0.0.1')
        # If all the given hosts are local, find the interfaces with address
        # 127.0.0.1
        common_intfs = set()
        for iface, addrs in net_if_addrs().items():
            for addr in addrs:
                if addr.family == AF_INET and addr.address == '127.0.0.1':
                    common_intfs.add(iface)
                    break

        if len(common_intfs) == 0:
            raise ValueError('No interface is found for address 127.0.0.1.')

        if settings.verbose >= 2:
            print('Local interface found ' + ' '.join(common_intfs))

    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    if args.use_gloo:
        if not gloo_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'Gloo support has not been built.  If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        gloo_run(settings, remote_host_names, common_intfs, env)
    elif args.use_mpi:
        if not mpi_built(verbose=(settings.verbose >= 2)):
            raise ValueError(
                'MPI support has not been built.  If this is not expected, ensure MPI is installed '
                'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
            )
        mpi_run(settings, common_intfs, env)
    else:
        if mpi_built(verbose=(settings.verbose >= 2)):
            mpi_run(settings, common_intfs, env)
        elif gloo_built(verbose=(settings.verbose >= 2)):
            gloo_run(settings, remote_host_names, common_intfs, env)
        else:
            raise ValueError(
                'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
                'either MPI is installed (MPI) or CMake is installed (Gloo).')
Beispiel #14
0
 def test_mpi_built(self):
     """Test that MPI has been built if env is set."""
     gloo_rank = int(os.getenv('HOROVOD_RANK', -1))
     if gloo_rank == -1:
         self.assertTrue(mpi_built())