Example 1
    def _create_model(self, run_results, run_id, metadata):
        history, serialized_model, serialized_optimizer = run_results[0]
        model = codec.loads_base64(serialized_model)

        # torch.load correctly moves all the optimizer state values to cpu
        # before creating the object.
        # Wrap the decoded bytes in a file-like object for torch.load.
        optimizer_bio = io.BytesIO(codec.loads_base64(serialized_optimizer))
        opt = torch.load(optimizer_bio, map_location=torch.device('cpu'))

        return self.get_model_class()(
            **self._get_model_kwargs(model, history, opt, run_id, metadata))
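All of these snippets funnel through `codec.loads_base64`. For reference, here is a minimal sketch of the codec pair, assuming a cloudpickle-plus-base64 implementation like Horovod's `codec` utility module (the exact signatures are an assumption):

import base64

import cloudpickle


def dumps_base64(obj):
    # Pickle an arbitrary object and wrap it in ASCII-safe base64 so it can
    # travel through argv and environment variables.
    return base64.b64encode(cloudpickle.dumps(obj)).decode('ascii')


def loads_base64(encoded):
    # Inverse of dumps_base64: decode the base64 text, then unpickle.
    return cloudpickle.loads(base64.b64decode(encoded))

The base64 wrapping is what lets the snippets below pass pickled objects through environment variables and command-line arguments without escaping issues.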
Example 2
        def _param_deserializer_fn(name, param_val, keras_utils, custom_objects):
            if param_val is None:
                return param_val

            if name == EstimatorParams.model.name:
                def load_model_fn(x):
                    with keras_utils.keras().utils.custom_object_scope(custom_objects):
                        return keras_utils.keras().models.load_model(x, compile=True)

                return keras_utils.deserialize_model(param_val,
                                                     load_model_fn=load_model_fn)
            elif name == KerasEstimator.optimizer.name:
                opt_base64_encoded = codec.loads_base64(param_val)
                return keras_utils.deserialize_optimizer(opt_base64_encoded)
            else:
                return codec.loads_base64(param_val)
Example 3
def task_exec(driver_addresses, settings, rank_env, local_rank_env):
    # Die if parent process terminates
    in_thread(target=_parent_process_monitor, args=(os.getppid(), ))

    key = codec.loads_base64(os.environ[secret.HOROVOD_SECRET_KEY])
    rank = int(os.environ[rank_env])
    local_rank = int(os.environ[local_rank_env])
    driver_client = driver_service.SparkDriverClient(driver_addresses,
                                                     key,
                                                     verbose=settings.verbose)

    # tell driver about local rank and rank
    # in elastic mode the driver already knows this mapping
    # for simplicity we keep code paths the same for elastic and static mode
    host_hash = os.environ['HOROVOD_HOSTNAME']
    task_index = driver_client.set_local_rank_to_rank(host_hash, local_rank,
                                                      rank)

    # gather available resources from task service
    task_addresses = driver_client.all_task_addresses(task_index)
    task_client = task_service.SparkTaskClient(task_index,
                                               task_addresses,
                                               key,
                                               verbose=settings.verbose)
    task_info.set_resources(task_client.resources())

    fn, args, kwargs = driver_client.code()
    result = fn(*args, **kwargs)
    task_client.register_code_result(result)
Example 4
def main(driver_addresses, settings, host_hash, command):
    """
    Method to run `orted` remotely given a host hash and driver addresses.

    This method connects to the SparkDriverService running on the Spark driver,
    retrieves all the information required to connect to the task with the lowest
    task index of that host hash, and invokes the command there.
    All other tasks with the same host hash are expected to no-op (see
    `horovod.spark._task_fn`) and wait for the first task to terminate.

    :param driver_addresses: driver's addresses
    :param settings: settings
    :param host_hash: host hash to connect to
    :param command: command and arguments to invoke
    """
    if ':' in host_hash:
        raise Exception(
            'Illegal host hash provided. Are you using Open MPI 4.0.0+?')

    key = codec.loads_base64(os.environ[secret.HOROVOD_SECRET_KEY])
    driver_client = driver_service.SparkDriverClient(driver_addresses,
                                                     key,
                                                     verbose=settings.verbose)
    task_indices = driver_client.task_host_hash_indices(host_hash)
    # Since tasks with the same host hash have shared memory, we will run only
    # one ORTED process on the first task.
    first_task_index = task_indices[0]
    task_addresses = driver_client.all_task_addresses(first_task_index)
    task_client = task_service.SparkTaskClient(first_task_index,
                                               task_addresses,
                                               key,
                                               verbose=settings.verbose)
    task_client.run_command(command, os.environ)
Example 5
def rsh(driver_addresses, settings, host_hash, command, env, local_rank):
    """
    Method to run a command remotely given a host hash, local rank and driver addresses.

    This method connects to the SparkDriverService running on the Spark driver,
    retrieves all the information required to connect to the task with the given
    local rank of that host hash, and invokes the command there.

    :param driver_addresses: driver's addresses
    :param settings: settings
    :param host_hash: host hash to connect to
    :param command: command and arguments to invoke
    :param env: environment to use
    :param local_rank: local rank on the host of task to run the command in
    """
    if ':' in host_hash:
        raise Exception('Illegal host hash provided. Are you using Open MPI 4.0.0+?')

    key = codec.loads_base64(env[secret.HOROVOD_SECRET_KEY])
    driver_client = driver_service.SparkDriverClient(driver_addresses, key,
                                                     verbose=settings.verbose)
    task_indices = driver_client.task_host_hash_indices(host_hash)
    task_index = task_indices[local_rank]
    task_addresses = driver_client.all_task_addresses(task_index)
    task_client = task_service.SparkTaskClient(task_index, task_addresses,
                                               key, verbose=settings.verbose)
    task_client.run_command(command, env)
Example 6
        def _put_value(self, scope, key, value):
            if scope == PUT_WORKER_ADDRESSES:
                host, local_rank = key.split(':')
                addresses, secret_key = codec.loads_base64(value)
                self._put_worker_addresses(host, int(local_rank), addresses, secret_key)

            super(RendezvousHandler, self)._put_value(scope, key, value)
Example 7
    def _deserialize_dict(self, dict_values):
        deserialized_dict = dict()
        for key, val in dict_values.items():
            if val is None:
                deserialized_dict[key] = None
            else:
                deserialized_dict[key] = codec.loads_base64(val)
        return deserialized_dict
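The serializing side is presumably the mirror image; a sketch under that assumption (the name `_serialize_dict` is hypothetical):

    def _serialize_dict(self, dict_values):
        # Hypothetical inverse of _deserialize_dict: None passes through,
        # everything else is pickled and base64-encoded.
        serialized_dict = dict()
        for key, val in dict_values.items():
            serialized_dict[key] = None if val is None else codec.dumps_base64(val)
        return serialized_dict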
Example 8
    def _deserialize(model_bytes_base64):
        """Deserialize model from byte array encoded in base 64."""
        if is_module_available('torch'):
            import torch
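            # Workaround: alias the private torch._C._nn module so that
            # unpickling models which reference it can resolve the import.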
            sys.modules["torch._C._nn"] = torch.nn.functional

        obj = codec.loads_base64(model_bytes_base64)
        return obj
Example 9
def _load_metadata_from_fs(fs, path):
    with fs.open(path, 'rb') as train_meta_file:
        meta = train_meta_file.read()
        meta = codec.loads_base64(meta.decode())
        data_schema = meta['schema']
        rows = meta['rows']
        total_byte_size = meta['total_byte_size']

    return data_schema, rows, total_byte_size
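The writer counterpart would pack the same three fields into a dict and base64-encode it before writing; a sketch (the function name `_save_metadata_to_fs` is hypothetical):

def _save_metadata_to_fs(fs, path, data_schema, rows, total_byte_size):
    # Hypothetical inverse of _load_metadata_from_fs: pack the fields into a
    # dict, base64-encode it, and write it out as bytes.
    meta = {'schema': data_schema,
            'rows': rows,
            'total_byte_size': total_byte_size}
    with fs.open(path, 'wb') as train_meta_file:
        train_meta_file.write(codec.dumps_base64(meta).encode())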
Example 10
def read_data_from_kvstore(addr, port, scope, key):
    try:
        url = "http://{addr}:{port}/{scope}/{key}".format(addr=addr,
                                                          port=str(port),
                                                          scope=scope,
                                                          key=key)
        req = Request(url)
        resp = urlopen(req)
        # TODO: remove base64 encoding because base64 is not efficient
        return codec.loads_base64(resp.read())
    except (HTTPError, URLError) as e:
        raise RuntimeError("Read data from KVStore server failed.", e)
Example 11
def main(driver_addresses, settings, host_hash, command):
    if ':' in host_hash:
        raise Exception('Illegal host hash provided. Are you using Open MPI 4.0.0+?')

    key = codec.loads_base64(os.environ[secret.HOROVOD_SECRET_KEY])
    driver_client = driver_service.SparkDriverClient(driver_addresses, key,
                                                     verbose=settings.verbose)
    task_indices = driver_client.task_host_hash_indices(host_hash)
    # Since tasks with the same host hash have shared memory, we will run only
    # one ORTED process on the first task.
    first_task_index = task_indices[0]
    task_addresses = driver_client.all_task_addresses(first_task_index)
    task_client = task_service.SparkTaskClient(first_task_index, task_addresses,
                                               key, verbose=settings.verbose)
    task_client.run_command(command, os.environ)
Example 12
def main(driver_addresses):
    # Die if parent process terminates
    bg = threading.Thread(target=parent_process_monitor, args=(os.getppid(), ))
    bg.daemon = True
    bg.start()

    key = codec.loads_base64(os.environ[secret.HOROVOD_SECRET_KEY])
    rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
    driver_client = driver_service.SparkDriverClient(driver_addresses, key)
    task_index = driver_client.task_index_by_rank(rank)
    task_addresses = driver_client.all_task_addresses(task_index)
    task_client = task_service.SparkTaskClient(task_index, task_addresses, key)
    fn, args, kwargs = driver_client.code()
    result = fn(*args, **kwargs)
    task_client.register_code_result(result)
Example 13
def task_exec(driver_addresses, settings, rank_env):
    # Die if parent process terminates
    in_thread(target=_parent_process_monitor, args=(os.getppid(), ))

    key = codec.loads_base64(os.environ[secret.HOROVOD_SECRET_KEY])
    rank = int(os.environ[rank_env])
    driver_client = driver_service.SparkDriverClient(driver_addresses,
                                                     key,
                                                     verbose=settings.verbose)
    task_index = driver_client.task_index_by_rank(rank)
    task_addresses = driver_client.all_task_addresses(task_index)
    task_client = task_service.SparkTaskClient(task_index,
                                               task_addresses,
                                               key,
                                               verbose=settings.verbose)
    task_info.set_resources(task_client.resources())

    fn, args, kwargs = driver_client.code()
    result = fn(*args, **kwargs)
    task_client.register_code_result(result)
Example 14
    def test_settings_dump_drops_key(self):
        settings = hvd_settings.Settings(verbose=2, key="a secret key")
        clone = codec.loads_base64(codec.dumps_base64(settings))
        self.assertEqual(settings.verbose, clone.verbose)
        self.assertIsNotNone(settings.key)
        self.assertIsNone(clone.key)
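The test pins down the behaviour, not the mechanism. One plausible way for `Settings` to drop the key is to blank it during pickling; a minimal sketch, not the actual Horovod implementation:

class Settings(object):
    def __init__(self, verbose=0, key=None):
        self.verbose = verbose
        self.key = key

    def __getstate__(self):
        # Strip the secret key from the pickled state so that
        # dumps_base64/loads_base64 round trips never leak it.
        state = self.__dict__.copy()
        state['key'] = None
        return state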
Example 15
if __name__ == '__main__':
    """
    Method run by MPI to connect to a host hash and execute the given command.

    The command is usually `orted`, which sets up the MPI cluster. That `orted`
    process is then used to spin up the actual remote process, the Horovod user's
    Python method. The `orted` process will run on the lowest task index; all other
    tasks with the same host hash are expected to no-op (see `horovod.spark._task_fn`)
    and wait for the first task to terminate.

    :param driver_addresses: all IP addresses of the driver, base64 encoded
    :param settings: all settings, base64 encoded
    :param host_hash: the host hash to connect to
    :param command: the command and arguments to execute remotely
    """
    if len(sys.argv) < 5:
        print('Usage: %s <service addresses> <settings> <host hash> '
              '<command...>' % sys.argv[0])
        sys.exit(1)

    addresses = codec.loads_base64(sys.argv[1])
    settings = codec.loads_base64(sys.argv[2])
    host_hash = sys.argv[3]
    command = " ".join(sys.argv[4:])
    env = os.environ

    # Since tasks with the same host hash have shared memory,
    # we will run only one ORTED process on the first task.
    rsh(addresses, settings, host_hash, command, env, 0)
Example 16
                                                      next_task_addresses,
                                                      settings.key,
                                                      settings.verbose,
                                                      match_intf=True,
                                                      attempts=10)
        driver.register_task_to_task_addresses(next_task_index,
                                               next_task.addresses())
        # Notify the next task that the address checks are completed.
        next_task.task_to_task_address_check_completed()
        # Wait to get a notification from previous task that its address checks
        # are completed as well.
        task.wait_for_task_to_task_address_check_finish_signal(
            settings.start_timeout)

    finally:
        task.shutdown()


if __name__ == '__main__':
    if len(sys.argv) != 5:
        print('Usage: {} <index> <num_hosts> <driver_addresses> <settings>'.
              format(sys.argv[0]))
        sys.exit(1)

    index = codec.loads_base64(sys.argv[1])
    num_hosts = codec.loads_base64(sys.argv[2])
    driver_addresses = codec.loads_base64(sys.argv[3])
    settings = codec.loads_base64(sys.argv[4])

    _task_fn(index, num_hosts, driver_addresses, settings)
Example 17
def deserialize_keras_model(model_bytes, load_model_fn):
    """Deserialize model from byte array encoded in base 64."""
    model_bytes = codec.loads_base64(model_bytes)
    bio = io.BytesIO(model_bytes)
    with h5py.File(bio, 'r') as f:
        return load_model_fn(f)
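The save path is the mirror image: write the model into an in-memory HDF5 file, then base64-encode its bytes. A sketch, assuming a `save_model_fn` callback symmetric to `load_model_fn`:

def serialize_keras_model(model, save_model_fn):
    """Serialize model into a byte array encoded in base 64."""
    bio = io.BytesIO()
    with h5py.File(bio, 'w') as f:
        save_model_fn(model, f)
    return codec.dumps_base64(bio.getvalue())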
Example 18
if __name__ == '__main__':
    """
    Method run by MPI to connect to a host hash and execute the given command.

    The command is usually `orted`, which sets up the MPI cluster. That `orted`
    process is then used to spin up the actual remote process, the Horovod user's
    Python method. The `orted` process will run on the lowest task index; all other
    tasks with the same host hash are expected to no-op (see `horovod.spark._task_fn`)
    and wait for the first task to terminate.

    :param driver_addresses: all IP addresses of the driver, base64 encoded
    :param settings: all settings, base64 encoded
    :param host_hash: the host hash to connect to
    :param command: the command and arguments to execute remotely
    """
    if len(sys.argv) < 5:
        print('Usage: %s <service addresses> <settings> <host hash> '
              '<command...>' % sys.argv[0])
        sys.exit(1)

    addresses = codec.loads_base64(sys.argv[1])
    key = codec.loads_base64(os.environ.get(secret.HOROVOD_SECRET_KEY))
    settings = codec.loads_base64(sys.argv[2])
    host_hash = sys.argv[3]
    command = " ".join(sys.argv[4:])
    env = {}  # orted does not need any env vars, the target training code gets env from mpirun

    # Since tasks with the same host hash have shared memory,
    # we will run only one orted process on the first task.
    rsh(addresses, key, settings, host_hash, command, env, 0)
Example 19
import os
import sys

from horovod.spark.task import task_service
from horovod.spark.driver import driver_service
from horovod.run.common.util import codec, secret


def main(driver_addresses, host_hash, command):
    if ':' in host_hash:
        raise Exception(
            'Illegal host hash provided. Are you using Open MPI 4.0.0+?')

    key = codec.loads_base64(os.environ[secret.HOROVOD_SECRET_KEY])
    driver_client = driver_service.SparkDriverClient(driver_addresses, key)
    task_indices = driver_client.task_host_hash_indices(host_hash)
    # Since tasks with the same host hash have shared memory, we will run only
    # one ORTED process on the first task.
    first_task_index = task_indices[0]
    task_addresses = driver_client.all_task_addresses(first_task_index)
    task_client = task_service.SparkTaskClient(first_task_index,
                                               task_addresses, key)
    task_client.run_command(command, os.environ)


if __name__ == '__main__':
    if len(sys.argv) < 4:
        print('Usage: %s <service addresses> <host hash> <command...>' %
              sys.argv[0])
        sys.exit(1)
    main(codec.loads_base64(sys.argv[1]), sys.argv[2], " ".join(sys.argv[3:]))
Example 20
        time.sleep(1)


def main(driver_addresses, settings):
    # Die if parent process terminates
    bg = threading.Thread(target=parent_process_monitor, args=(os.getppid(), ))
    bg.daemon = True
    bg.start()

    key = codec.loads_base64(os.environ[secret.HOROVOD_SECRET_KEY])
    rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
    driver_client = driver_service.SparkDriverClient(driver_addresses,
                                                     key,
                                                     verbose=settings.verbose)
    task_index = driver_client.task_index_by_rank(rank)
    task_addresses = driver_client.all_task_addresses(task_index)
    task_client = task_service.SparkTaskClient(task_index,
                                               task_addresses,
                                               key,
                                               verbose=settings.verbose)
    fn, args, kwargs = driver_client.code()
    result = fn(*args, **kwargs)
    task_client.register_code_result(result)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('Usage: %s <driver addresses> <settings>' % sys.argv[0])
        sys.exit(1)
    main(codec.loads_base64(sys.argv[1]), codec.loads_base64(sys.argv[2]))
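The `__main__` block implies the driver launches this script with two base64-encoded argv entries. A sketch of how that command line could be assembled on the driver side (the module path, helper name, and variable names are illustrative, not Horovod's actual API):

def build_exec_command(python, driver_addresses, settings):
    # Hypothetical driver-side helper: both arguments are pickled and
    # base64-encoded so they survive being passed through argv.
    return '{python} -m horovod.spark.task.mpirun_exec_fn {addrs} {settings}'.format(
        python=python,
        addrs=codec.dumps_base64(driver_addresses),
        settings=codec.dumps_base64(settings))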
Example 21
                                                      next_task_addresses,
                                                      key,
                                                      match_intf=True,
                                                      retries=10)
        driver.register_task_to_task_addresses(next_task_index,
                                               next_task.addresses())
        # Notify the next task that the address checks are completed.
        next_task.task_to_task_address_check_completed()
        # Wait to get a notification from previous task that its address checks
        # are completed as well.
        task.wait_for_task_to_task_address_check_finish_signal(tmout)

    finally:
        task.shutdown()


if __name__ == '__main__':
    if len(sys.argv) != 6:
        print(
            'Usage: %s <index> <service addresses> <num_hosts> <tmout> <key>' %
            sys.argv[0])
        sys.exit(1)

    index = codec.loads_base64(sys.argv[1])
    driver_addresses = codec.loads_base64(sys.argv[2])
    num_hosts = codec.loads_base64(sys.argv[3])
    tmout = codec.loads_base64(sys.argv[4])
    key = codec.loads_base64(sys.argv[5])

    _task_fn(index, driver_addresses, num_hosts, tmout, key)