def _exec_command(command, alloc_info, event):
    """Dispatch *command* to the allocated host via rsh.

    The returned exit code of 0 only indicates that the command was
    dispatched successfully, not the outcome of the executed command;
    per-task results are collected through Spark at the end of
    horovod.spark.run.run().
    """
    target_host = alloc_info.hostname
    rank_on_host = alloc_info.local_rank
    rsh(driver_addresses, settings, target_host, command, env, rank_on_host)
    return 0, time.time()
def _exec_command(command, slot_info, events):
    """Run *command* on the slot's host through rsh.

    Returns a tuple of (rsh result, completion timestamp).
    """
    verbosity = settings.verbose
    target_host = slot_info.hostname
    rank_on_host = slot_info.local_rank
    exit_code = rsh(driver_addresses, key, target_host, command, env,
                    rank_on_host, verbosity, False, events)
    return exit_code, time.time()
def _exec_command(command, slot_info, events):
    """Run *command* on the slot's host through rsh, forwarding output streams.

    Returns a tuple of (rsh result, completion timestamp).
    """
    verbosity = settings.verbose
    target_host = slot_info.hostname
    rank_on_host = slot_info.local_rank
    exit_code = rsh(driver.addresses(), key, target_host, command, env,
                    rank_on_host, verbosity, stdout, stderr,
                    prefix_output_with_timestamp, False, events)
    return exit_code, time.time()
if __name__ == '__main__':
    # Entry point invoked by MPI to reach a host hash and execute the
    # given command. The command is typically `orted`, which sets up the
    # MPI cluster; that `orted` process then spins up the actual remote
    # process, the Horovod user's Python method. `orted` runs on the
    # lowest task index; all other tasks with the same host hash no-op
    # (see `horovod.spark._task_fn`) and wait for the first task to end.
    #
    # argv[1]: all IP addresses of the driver, base64 encoded
    # argv[2]: all settings, base64 encoded
    # argv[3]: the host hash to connect to
    # argv[4:]: the command and arguments to execute remotely
    if len(sys.argv) < 5:
        print('Usage: %s <service addresses> <settings> <host hash> '
              '<command...>' % sys.argv[0])
        sys.exit(1)

    driver_addrs = codec.loads_base64(sys.argv[1])
    secret_key = codec.loads_base64(os.environ.get(secret.HOROVOD_SECRET_KEY))
    run_settings = codec.loads_base64(sys.argv[2])
    target_host_hash = sys.argv[3]
    remote_command = " ".join(sys.argv[4:])
    # orted needs no env vars; the training code gets its env from mpirun.
    remote_env = {}

    # Tasks with the same host hash share memory, so only one orted
    # process is launched, on the first task (index 0).
    rsh(driver_addrs, secret_key, run_settings, target_host_hash,
        remote_command, remote_env, 0)
if __name__ == '__main__':
    # Entry point invoked by MPI to reach a host hash and execute the
    # given command. The command is typically `orted`, which sets up the
    # MPI cluster; that `orted` process then spins up the actual remote
    # process, the Horovod user's Python method. `orted` runs on the
    # lowest task index; all other tasks with the same host hash no-op
    # (see `horovod.spark._task_fn`) and wait for the first task to end.
    #
    # argv[1]: all IP addresses of the driver, base64 encoded
    # argv[2]: all settings, base64 encoded
    # argv[3]: the host hash to connect to
    # argv[4:]: the command and arguments to execute remotely
    if len(sys.argv) < 5:
        print('Usage: %s <service addresses> <settings> <host hash> '
              '<command...>' % sys.argv[0])
        sys.exit(1)

    driver_addrs = codec.loads_base64(sys.argv[1])
    secret_key = codec.loads_base64(os.environ.get(secret.HOROVOD_SECRET_KEY))
    run_settings = codec.loads_base64(sys.argv[2])
    target_host_hash = sys.argv[3]
    remote_command = " ".join(sys.argv[4:])
    # orted needs no env vars; the training code gets its env from mpirun.
    remote_env = {}

    # Tasks with the same host hash share memory, so only one orted
    # process is launched, on the first task (index 0).
    rsh(driver_addrs, secret_key, target_host_hash, remote_command,
        remote_env, 0, run_settings.verbose)
def _exec_command(command, alloc_info, event):
    """Run *command* on the allocated host through rsh.

    Returns a tuple of (rsh result, completion timestamp).
    """
    target_host = alloc_info.hostname
    rank_on_host = alloc_info.local_rank
    exit_code = rsh(driver_addresses, key, settings, target_host, command,
                    env, rank_on_host, False, event)
    return exit_code, time.time()