Beispiel #1
0
def multiple_machines_launch(args, env: Dict[str, str], hosts_arg: str,
                             all_host_names: List[str],
                             remote_host_names: List[str]):
    """Start a local ipcontroller and launch ipengines on all hosts via mpirun.

    The function verifies ssh connectivity, picks the network interface(s) to
    use, starts ``ipcontroller`` locally, copies the files returned by
    ``_wait_engine_file_ready`` / ``_wait_client_file_ready`` to every remote
    host, and finally starts the engines through ``mpirun``. It then blocks
    until either the controller or the engine launcher process has exited.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``ssh_port``, ``nic``, ``verbose``, ``use_infiniband``,
            ``extra_flags``, ``profile`` and ``np``.
        env: Environment mapping handed to every subprocess started here.
        hosts_arg: Host specification inserted verbatim into the mpirun
            command line.
        all_host_names: Names of all participating hosts (local and remote).
        remote_host_names: The subset of ``all_host_names`` that is remote.

    Raises:
        AssertionError: If the ssh check fails or a file cannot be
            transmitted to the remote hosts.
    """
    # 1. Check if we can ssh into all remote hosts successfully.
    # Checked explicitly (not via `assert`) so the check still runs under
    # `python -O`; AssertionError is kept so the failure type is unchanged.
    if not network_util.check_all_hosts_ssh_successful(remote_host_names,
                                                       args.ssh_port):
        raise AssertionError(
            "Unable to ssh into all of the remote hosts.")

    if not args.nic:
        # 2. Find the set of common, routed interfaces on all the hosts (remote
        # and local) and specify it in the args. It is expected that the
        # following function will find at least one interface; otherwise, it
        # will raise an exception.
        # So far, we just use horovodrun to do this job since the tasks are
        # the same.
        local_host_names = set(all_host_names) - set(remote_host_names)
        common_intfs = horovod_driver.driver_fn(all_host_names,
                                                local_host_names,
                                                args.ssh_port, args.verbose)
    else:
        common_intfs = [args.nic]

    if common_intfs:
        joined_intfs = ','.join(common_intfs)
        tcp_intf_arg = '-mca btl_tcp_if_include {common_intfs}'.format(
            common_intfs=joined_intfs)
        nccl_socket_intf_arg = '-x NCCL_SOCKET_IFNAME={common_intfs}'.format(
            common_intfs=joined_intfs)
    else:
        tcp_intf_arg = ''
        nccl_socket_intf_arg = ''

    if args.use_infiniband:
        ib_arg = "-mca btl openib,self"
    else:
        ib_arg = "-mca btl ^openib"

    if args.ssh_port:
        ssh_port_arg = "-mca plm_rsh_args \"-p {ssh_port}\"".format(
            ssh_port=args.ssh_port)
    else:
        ssh_port_arg = ""

    extra_flags = args.extra_flags if args.extra_flags else ''

    ipcontroller_command = "ipcontroller --profile {profile} --ip='*'".format(
        profile=args.profile)

    # Maybe kill a leftover controller process from a previous run.
    if _maybe_kill_ipcontroller_process(args.profile):
        print("Found and killed the unfinished ipcontroller process.")
    subprocess.run('ipcluster nbextension enable --user', shell=True, env=env)
    print("Starting the controller.")
    # When not verbose the controller output is discarded. DEVNULL is used
    # rather than PIPE because nothing ever reads the pipe: once the OS pipe
    # buffer fills up, the controller would block on its next write.
    stdout = None if args.verbose else subprocess.DEVNULL
    p_controller = subprocess.Popen(ipcontroller_command,
                                    shell=True,
                                    env=env,
                                    stdout=stdout,
                                    stderr=subprocess.STDOUT)
    engine_file = _wait_engine_file_ready(args.profile)
    client_file = _wait_client_file_ready(args.profile)
    # Copy the engine and client files to all remote hosts. The calls are
    # made outside of `assert` so the copy is not skipped under `python -O`.
    for file_to_send in (engine_file, client_file):
        if not network_util.scp_transmit_file(file_to_send, remote_host_names,
                                              args.ssh_port):
            raise AssertionError(
                "Failed to transmit {} to the remote hosts.".format(
                    file_to_send))

    print("Starting the engines.")
    ipengine_command = "ipengine start --profile {profile}".format(
        profile=args.profile)

    # TODO(ybc) Cannot carry the env variable. May encounter:
    # FORCE-TERMINATE AT Data unpack would read past end of buffer:-26 - error grpcomm_direct.c(359)?
    # For that reason no per-variable `-x KEY` export list is added to the
    # mpirun command line below; `env` is only given to the mpirun process
    # itself.

    # Use mpirun to start ipengines
    mpi_ipengine_command = ('mpirun --allow-run-as-root '
                            '-np {num_proc} {hosts_arg} '
                            '-bind-to none -map-by slot '
                            '-mca pml ob1 {ib_arg} '
                            '{ssh_port_arg} {tcp_intf_arg} '
                            '{extra_flags} {nccl_socket_intf_arg} '
                            '{command}'.format(
                                num_proc=args.np,
                                hosts_arg=hosts_arg,
                                ssh_port_arg=ssh_port_arg,
                                tcp_intf_arg=tcp_intf_arg,
                                ib_arg=ib_arg,
                                nccl_socket_intf_arg=nccl_socket_intf_arg,
                                extra_flags=extra_flags,
                                command=ipengine_command))
    p_engine = subprocess.Popen(mpi_ipengine_command, shell=True, env=env)
    # poll() returns None while the process is alive and the exit status
    # (possibly 0) once it has finished, so compare against None explicitly;
    # a truthiness test would treat a clean exit (0) as "still running".
    while p_controller.poll() is None and p_engine.poll() is None:
        time.sleep(600)
Beispiel #2
0
def main():
    """Entry point: build an Open MPI ``mpirun`` command line and exec it.

    Parses the command-line arguments, optionally prints the Bluefog version
    and exits, validates ssh access to remote hosts, selects the network
    interface(s), and finally replaces the current process with
    ``/bin/sh -c <mpirun command>``.

    Raises:
        SystemExit: With status 0 after printing the version when
            ``args.version`` is set.
        AssertionError: If not all remote hosts are reachable over ssh.
        Exception: If Open MPI is not detected by
            ``env_util.is_open_mpi_installed``.
    """
    args = parse_args()

    if args.version:
        print(bluefog.__version__)
        # `exit` is a helper injected by the `site` module and is not
        # guaranteed to exist (e.g. `python -S`); raise SystemExit directly.
        raise SystemExit(0)

    hosts_arg, all_host_names = network_util.get_hosts_arg_and_hostnames(args)
    remote_host_names = network_util.filter_local_addresses(all_host_names)

    common_intfs = set()
    if remote_host_names:
        # 1. Check if we can ssh into all remote hosts successfully.
        # Checked explicitly (not via `assert`) so the check still runs under
        # `python -O`; AssertionError is kept so the failure type is unchanged.
        if not network_util.check_all_hosts_ssh_successful(remote_host_names,
                                                           args.ssh_port):
            raise AssertionError(
                "Unable to ssh into all of the remote hosts.")
        if not args.nic:
            # 2. Find the set of common, routed interfaces on all the hosts
            # (remote and local) and specify it in the args. It is expected
            # that the following function will find at least one interface;
            # otherwise, it will raise an exception.
            # So far, we just use horovodrun to do this job since the tasks
            # are the same.
            local_host_names = set(all_host_names) - set(remote_host_names)
            common_intfs = horovod_driver.driver_fn(all_host_names, local_host_names,
                                                    args.ssh_port, args.verbose)
        else:
            common_intfs = [args.nic]

    if common_intfs:
        joined_intfs = ','.join(common_intfs)
        tcp_intf_arg = '-mca btl_tcp_if_include {common_intfs}'.format(
            common_intfs=joined_intfs)
        nccl_socket_intf_arg = '-x NCCL_SOCKET_IFNAME={common_intfs}'.format(
            common_intfs=joined_intfs)
    else:
        tcp_intf_arg = ''
        nccl_socket_intf_arg = ''

    if args.ssh_port:
        ssh_port_arg = "-mca plm_rsh_args \"-p {ssh_port}\"".format(
            ssh_port=args.ssh_port)
    else:
        ssh_port_arg = ""

    if args.use_infiniband:
        ib_arg = "-mca btl openib,self"
    else:
        ib_arg = "-mca btl ^openib"

    # The prefix is placed directly in front of "mpirun" with no separator.
    mpi_prefix = args.prefix or ""

    if not env_util.is_open_mpi_installed():
        raise Exception(
            'bfrun convenience script currently only supports Open MPI.\n\n'
            'Choose one of:\n'
            '1. Install Open MPI 4.0.0+ and re-install Bluefog.\n'
            '2. Run distributed '
            'training script using the standard way provided by your'
            ' MPI distribution (usually mpirun, srun, or jsrun).')

    extra_flags = args.extra_flags if args.extra_flags else ''
    # Pass all the exportable env variables to the mpirun command.
    env = os.environ.copy()
    env = env_util.set_env_from_args(env, args)
    exported_env = ' '.join(
        '-x %s' % key for key in env.keys() if env_util.is_exportable(key))
    # Each token of the user command is shell-quoted because the final
    # command line is interpreted by /bin/sh.
    user_command = ' '.join(shlex.quote(par) for par in args.command)
    mpirun_command = (
        '{prefix}mpirun --allow-run-as-root '
        '-np {num_proc} {hosts_arg} '
        '-bind-to none -map-by slot '
        '-mca pml ob1 {ib_arg} '
        '{ssh_port_arg} {tcp_intf_arg} '
        '{nccl_socket_intf_arg} '
        '{extra_flags} {env} {command}'
        .format(prefix=mpi_prefix,
                num_proc=args.np,
                hosts_arg=hosts_arg,
                ib_arg=ib_arg,
                ssh_port_arg=ssh_port_arg,
                tcp_intf_arg=tcp_intf_arg,
                nccl_socket_intf_arg=nccl_socket_intf_arg,
                extra_flags=extra_flags,
                env=exported_env,
                command=user_command)
    )

    if args.verbose:
        print(mpirun_command)
    # Execute the mpirun command; execve replaces the current process, so
    # nothing after this line runs.
    os.execve('/bin/sh', ['/bin/sh', '-c', mpirun_command], env)