Ejemplo n.º 1
0
 def _exec_command(command, alloc_info, event):
     """Launch ``command`` through ``rsh`` on the host described by ``alloc_info``.

     :param command: the command handed to ``rsh``
     :param alloc_info: object providing ``hostname`` and ``local_rank``
     :param event: accepted for interface compatibility; not used in this body
     :return: tuple ``(0, timestamp)`` where ``timestamp`` is ``time.time()``
              taken after ``rsh`` has returned
     """
     target_host, rank_on_host = alloc_info.hostname, alloc_info.local_rank
     rsh(driver_addresses, settings, target_host, command, env, rank_on_host)

     # The constant 0 only signals that the command was executed successfully;
     # it is not the result of the executed command. The result of each task is
     # collected through Spark at the end of horovod.spark.run.run().
     finished_at = time.time()
     return 0, finished_at
Ejemplo n.º 2
0
 def _exec_command(command, slot_info, events):
     """Run ``command`` via ``rsh`` on the slot's host and timestamp its completion.

     :param command: the command handed to ``rsh``
     :param slot_info: object providing ``hostname`` and ``local_rank``
     :param events: forwarded unchanged as the last argument of ``rsh``
     :return: tuple ``(result, timestamp)`` where ``result`` is whatever ``rsh``
              returned and ``timestamp`` is ``time.time()`` taken afterwards
     """
     target_host, rank_on_host, be_verbose = (
         slot_info.hostname,
         slot_info.local_rank,
         settings.verbose,
     )
     # NOTE(review): the meaning of the positional ``False`` is defined by
     # ``rsh``'s signature, which is not visible here -- confirm before changing.
     exit_status = rsh(
         driver_addresses, key, target_host, command, env, rank_on_host,
         be_verbose, False, events,
     )
     return exit_status, time.time()
Ejemplo n.º 3
0
 def _exec_command(command, slot_info, events):
     """Run ``command`` via ``rsh`` on the slot's host and timestamp its completion.

     The driver addresses are obtained from ``driver.addresses()`` at call time,
     and the enclosing scope's ``stdout``, ``stderr`` and
     ``prefix_output_with_timestamp`` are passed straight through to ``rsh``.

     :param command: the command handed to ``rsh``
     :param slot_info: object providing ``hostname`` and ``local_rank``
     :param events: forwarded unchanged as the last argument of ``rsh``
     :return: tuple ``(result, timestamp)`` where ``result`` is whatever ``rsh``
              returned and ``timestamp`` is ``time.time()`` taken afterwards
     """
     target_host, rank_on_host, be_verbose = (
         slot_info.hostname,
         slot_info.local_rank,
         settings.verbose,
     )
     # NOTE(review): the meaning of the positional ``False`` is defined by
     # ``rsh``'s signature, which is not visible here -- confirm before changing.
     exit_status = rsh(
         driver.addresses(), key, target_host, command, env, rank_on_host,
         be_verbose, stdout, stderr, prefix_output_with_timestamp,
         False, events,
     )
     return exit_status, time.time()
Ejemplo n.º 4
0
if __name__ == '__main__':
    # Entry point run by MPI to connect to a host hash and execute the given command.
    #
    # The command is usually `orted` to set up the MPI cluster. That `orted` process
    # is then used to spin up the actual remote process, the Horovod user's Python
    # method. The `orted` process runs on the lowest task index; all other tasks
    # with the same host hash are expected to no-op (see `horovod.spark._task_fn`)
    # and wait for the first task to terminate.
    #
    # Command line arguments, in order:
    #   1. driver addresses: all IP addresses of the driver, base64 encoded
    #   2. settings: all settings, base64 encoded
    #   3. host hash: the host hash to connect to
    #   4+. command: the command and arguments to execute remotely
    cli_args = sys.argv
    if len(cli_args) < 5:
        usage = ('Usage: %s <service addresses> <settings> <host hash> '
                 '<command...>' % cli_args[0])
        print(usage)
        sys.exit(1)

    addresses = codec.loads_base64(cli_args[1])
    # NOTE(review): os.environ.get() yields None when the variable is unset;
    # how codec.loads_base64 reacts to None is not visible here -- confirm.
    encoded_key = os.environ.get(secret.HOROVOD_SECRET_KEY)
    key = codec.loads_base64(encoded_key)
    settings = codec.loads_base64(cli_args[2])
    host_hash = cli_args[3]
    # Everything after the host hash is the remote command line.
    command = " ".join(cli_args[4:])
    # orted does not need any env vars, the target training code gets env from mpirun
    env = {}

    # Tasks sharing a host hash also share memory, so a single orted process
    # is run, on the first task (local rank 0).
    rsh(addresses, key, settings, host_hash, command, env, 0)
Ejemplo n.º 5
0
if __name__ == '__main__':
    # Entry point run by MPI to connect to a host hash and execute the given command.
    #
    # The command is usually `orted` to set up the MPI cluster. That `orted` process
    # is then used to spin up the actual remote process, the Horovod user's Python
    # method. The `orted` process runs on the lowest task index; all other tasks
    # with the same host hash are expected to no-op (see `horovod.spark._task_fn`)
    # and wait for the first task to terminate.
    #
    # Command line arguments, in order:
    #   1. driver addresses: all IP addresses of the driver, base64 encoded
    #   2. settings: all settings, base64 encoded
    #   3. host hash: the host hash to connect to
    #   4+. command: the command and arguments to execute remotely
    cli_args = sys.argv
    if len(cli_args) < 5:
        usage = ('Usage: %s <service addresses> <settings> <host hash> '
                 '<command...>' % cli_args[0])
        print(usage)
        sys.exit(1)

    addresses = codec.loads_base64(cli_args[1])
    # NOTE(review): os.environ.get() yields None when the variable is unset;
    # how codec.loads_base64 reacts to None is not visible here -- confirm.
    encoded_key = os.environ.get(secret.HOROVOD_SECRET_KEY)
    key = codec.loads_base64(encoded_key)
    settings = codec.loads_base64(cli_args[2])
    host_hash = cli_args[3]
    # Everything after the host hash is the remote command line.
    command = " ".join(cli_args[4:])
    # orted does not need any env vars, the target training code gets env from mpirun
    env = {}

    # Tasks sharing a host hash also share memory, so a single orted process
    # is run, on the first task (local rank 0). Only the verbosity flag of the
    # decoded settings is handed on to rsh.
    rsh(addresses, key, host_hash, command, env, 0, settings.verbose)
Ejemplo n.º 6
0
 def _exec_command(command, alloc_info, event):
     """Run ``command`` via ``rsh`` on the allocated host and timestamp its completion.

     :param command: the command handed to ``rsh``
     :param alloc_info: object providing ``hostname`` and ``local_rank``
     :param event: forwarded unchanged as the last argument of ``rsh``
     :return: tuple ``(result, timestamp)`` where ``result`` is whatever ``rsh``
              returned and ``timestamp`` is ``time.time()`` taken afterwards
     """
     target_host, rank_on_host = alloc_info.hostname, alloc_info.local_rank
     # NOTE(review): the meaning of the positional ``False`` is defined by
     # ``rsh``'s signature, which is not visible here -- confirm before changing.
     exit_status = rsh(
         driver_addresses, key, settings, target_host, command, env,
         rank_on_host, False, event,
     )
     return exit_status, time.time()