Example #1
def _driver_fn(client, net_if):
    cluster_tasks = _task_commons._get_cluster_tasks(client)
    # Worker discovery
    worker_list = [f"{net_if[1]}:{N_PROCESS_PER_WORKER}"]
    n_workers = 1
    for cluster_task in cluster_tasks:
        if 'worker' in cluster_task:
            worker_addr = event.wait(client, f"{cluster_task}/addr")
            logger.info(f"{cluster_task}: {worker_addr}")
            worker_list.append(f"{worker_addr}:{N_PROCESS_PER_WORKER}")
            n_workers += 1

    # Worker task allocation to workers
    hosts = gloo_run.parse_hosts(','.join(worker_list))
    host_alloc_plan = gloo_run.get_host_assignments(hosts, n_workers)
    for host in host_alloc_plan:
        host_info = f"""\
            {host.rank},{host.size},{host.local_rank},\
            {host.local_size},{host.cross_rank},{host.cross_size}\
            """
        event.broadcast(client, f"{get_task()}/{host.hostname}", host_info)

    global_rendezv = RendezvousServer(verbose=1)
    global_rendezv_port = global_rendezv.start_server()
    global_rendezv.httpd.init(host_alloc_plan)
    event.broadcast(client, f"{get_task()}/sock_addr",
                    f"{net_if[1]}:{global_rendezv_port}")
    return global_rendezv.listen_thread
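
The driver returns the rendezvous server's listen thread so the caller can keep the rendezvous alive while training runs. A minimal sketch of how a chief entry point might wire this together; _chief_fn below is an assumption for illustration, not part of the library excerpt:

def _chief_fn(client, net_if):
    # hypothetical wiring: start the gloo rendezvous driver, train as
    # rank 0 via _worker_fn (Example #2), and keep the rendezvous alive
    listen_thread = _driver_fn(client, net_if)
    _worker_fn(client, "chief:0", net_if)
    listen_thread.join()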
Example #2
def _worker_fn(client, task, net_if):
    event.broadcast(client, f"{task}/addr", net_if[1])

    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')

    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]

    hvd.init()

    experiment = _task_commons._get_experiment(client)

    if isinstance(experiment, Experiment):
        if not is_chief(get_task_type(task)):
            # Overwrite config to do nothing but training to improve training speed
            experiment.estimator._model_dir = "."
            new_config = experiment.estimator.config.replace(
                save_summary_steps=None,
                save_checkpoints_steps=None,
                save_checkpoints_secs=None,
                log_step_count_steps=None)
            experiment.estimator._config = new_config

        logger.info("start training..")

        experiment.estimator.train(experiment.train_spec.input_fn,
                                   hooks=experiment.train_spec.hooks,
                                   max_steps=experiment.train_spec.max_steps)
    elif isinstance(experiment, KerasExperiment):
        if not is_chief(get_task_type(task)):
            if experiment.train_params['callbacks'] is not None:
                callbacks_to_keep = []
                for callback in experiment.train_params['callbacks']:
                    if not isinstance(callback,
                                      tf.keras.callbacks.ModelCheckpoint):
                        callbacks_to_keep.append(callback)
                experiment.train_params['callbacks'] = callbacks_to_keep
        if experiment.input_data_fn is not None:
            experiment.train_params['x'] = experiment.input_data_fn()
        if experiment.target_data_fn is not None:
            experiment.train_params['y'] = experiment.target_data_fn()
        logger.info("start training..")
        experiment.model.fit(**experiment.train_params)
    else:
        raise ValueError(
            "experiment must be an Experiment or a KerasExperiment")
Example #3
def _setup_master(client: skein.ApplicationClient, rank: int) -> None:
    if rank == 0:
        with _internal.reserve_sock_addr() as host_port:
            event.broadcast(client, MASTER_ADDR, host_port[0])
            event.broadcast(client, MASTER_PORT, str(host_port[1]))
            os.environ[MASTER_ADDR] = host_port[0]
            os.environ[MASTER_PORT] = str(host_port[1])
    else:
        master_addr = event.wait(client, MASTER_ADDR)
        master_port = event.wait(client, MASTER_PORT)
        os.environ[MASTER_ADDR] = master_addr
        os.environ[MASTER_PORT] = master_port
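
MASTER_ADDR/MASTER_PORT is the convention PyTorch's env:// rendezvous reads. A hedged sketch of the follow-up step, assuming those constants hold the literal "MASTER_ADDR"/"MASTER_PORT" names that torch.distributed expects (_bootstrap is made up for illustration):

import skein
import torch.distributed as dist

def _bootstrap(client: skein.ApplicationClient,
               rank: int, world_size: int) -> None:
    # hypothetical follow-up: once every task knows the master's address,
    # join the default process group over the env:// rendezvous
    _setup_master(client, rank)
    dist.init_process_group(backend="gloo", init_method="env://",
                            rank=rank, world_size=world_size)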
Example #4
def _start_tracker(client, n_workers: int):
    tf.logging.info(f"Starting tracker with {n_workers} workers")
    rabit_context = tracker.RabitTracker(
        hostIP=tracker.get_host_ip(),
        nslave=n_workers,
        # will do bind(0) -> choose a random port
        port=0,
        port_end=1)
    rabit_context.start(n_workers)
    thread = Thread(target=rabit_context.join, daemon=True)
    thread.start()

    event.broadcast(
        client,
        f"{cluster.get_task()}/tracker",
        f"{rabit_context.hostIP}:{rabit_context.port}"
    )
    return thread
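
The tracker's socket is broadcast under the "<task>/tracker" key; each worker can wait on it and join the rabit ring. A sketch, assuming the tracker runs on chief:0 and an xgboost version whose rabit.init accepts DMLC_* settings as a list of encoded strings (the event helper is reused from the excerpts above; _join_tracker is hypothetical):

from xgboost import rabit

def _join_tracker(client):
    # hypothetical counterpart: read the tracker socket broadcast by the
    # chief, then register this worker with the rabit ring
    host, port = event.wait(client, "chief:0/tracker").split(':')
    rabit.init([f"DMLC_TRACKER_URI={host}".encode(),
                f"DMLC_TRACKER_PORT={port}".encode()])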
Example #5
def client_tf(client):
    spec = create_skein_app()
    app = client.submit_and_connect(spec)
    x = tf.placeholder(tf.float32, 100)

    with tf.device(f"/job:{NODE_NAME}/task:1"):
        first_batch = tf.slice(x, [0], [50])
        mean1 = tf.reduce_mean(first_batch)

    with tf.device(f"/job:{NODE_NAME}/task:0"):
        second_batch = tf.slice(x, [50], [-1])
        mean2 = tf.reduce_mean(second_batch)
        mean = (mean1 + mean2) / 2

    first_task = event.wait(app, f"{NODE_NAME}:0/init")
    with tf.Session(f"grpc://{first_task}") as sess:
        result = sess.run(mean, feed_dict={x: np.random.random(100)})
        print(f"mean = {result}")
    event.broadcast(app, "stop", "1")
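
client_tf expects every task to have started a TF server and published its address under "<task>/init". A sketch of that server-side counterpart, assuming TF 1.x and that the "stop" broadcast is what lets each container exit (server_fn is made up for illustration):

def server_fn(cluster_spec: tf.train.ClusterSpec, task_index: int):
    client = skein.ApplicationClient.from_current()
    server = tf.train.Server(cluster_spec, job_name=NODE_NAME,
                             task_index=task_index)
    # publish the gRPC endpoint (host:port) so client_tf can connect ...
    event.broadcast(client, f"{NODE_NAME}:{task_index}/init",
                    server.target.replace("grpc://", ""))
    # ... then block until client_tf broadcasts "stop"
    event.wait(client, "stop")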
Example #6
def _worker_fn(client, task, net_if):
    event.broadcast(client, f"{task}/addr", net_if[1])

    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')

    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]

    hvd.init()

    experiment = _task_commons._get_experiment(client)

    if task != 'chief:0':
        # Overwrite config to do nothing but training to improve training speed
        experiment.estimator._model_dir = "."
        new_config = experiment.estimator.config.replace(
            save_summary_steps=None,
            save_checkpoints_steps=None,
            save_checkpoints_secs=None,
            log_step_count_steps=None)
        experiment.estimator._config = new_config

    logger.info("start training..")

    experiment.estimator.train(experiment.train_spec.input_fn,
                               hooks=experiment.train_spec.hooks,
                               max_steps=experiment.train_spec.max_steps)
Example #7
def standalone_client_mode(pyenv_zip_path: Union[str, Dict[NodeLabel, str]],
                           task_specs: Dict[str, TaskSpec] = None,
                           tf_session_config: Optional[tf.ConfigProto] = None,
                           *,
                           skein_client: skein.Client = None,
                           files: Dict[str, str] = None,
                           env: Dict[str, str] = {},
                           queue: str = "default",
                           acls: ACLs = None,
                           file_systems: List[str] = None,
                           log_conf_file: str = None):
    """
    https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow \
            /contrib/distribute/README.md#standalone-client-mode
    Standalone client mode means starting the TF servers on the cluster,
    launching everything from the client and letting TF take care of the rest.
    This is not limited to the Estimator API; it also works with low-level TF
    (see session_run_example.py).

    Parameters
    ----------

    pyenv_zip_path
        Path to an archive of a Python environment to be deployed.
        It can be a zipped conda env or a pex archive.
        For a mixed GPU/CPU cluster, provide a dictionary with both
        environments.

    skein_client
        Skein client used to submit YARN jobs.

    task_specs
        Resources to allocate for each task type. The keys
        must be a subset of ``"chief"``, ``"worker"``, ``"ps"``, and
        ``"evaluator"``. The minimal spec must contain at least
        ``"chief"``.

    tf_session_config
        tf.ConfigProto to be provided to each started TFServer

    files
        Local files or directories to upload to the container.
        The keys are the target locations of the resources relative
        to the container root, while the values -- their
        corresponding local sources. Note that container root is
        appended to ``PYTHONPATH``. Therefore, any listed Python
        module or package is automatically importable.

    env
        Environment variables to forward to the containers.

    queue
        YARN queue to use.

    acls
        Configures the application-level Access Control Lists (ACLs).
        Optional, defaults to no ACLs.

        See `ACLs <https://jcrist.github.io/skein/specification.html#id16>`_ for details.

    file_systems
        A list of namenode URIs to acquire delegation tokens for
        in addition to ``fs.defaultFS``.

    log_conf_file
        Optional file with the log config. By default logging is set up
        with INFO verbosity. If you specify a file here, don't forget to
        also ship it to the containers via the ``files`` arg.
    """
    cluster = None
    try:
        pyenvs = _setup_pyenvs(pyenv_zip_path, standalone_client_mode=True)
        cluster = _setup_skein_cluster(pyenvs=pyenvs,
                                       skein_client=skein_client,
                                       task_specs=StaticDefaultDict(
                                           task_specs, default=TASK_SPEC_NONE),
                                       files=files,
                                       env=env,
                                       queue=queue,
                                       acls=acls,
                                       file_systems=file_systems,
                                       log_conf_file=log_conf_file,
                                       standalone_client_mode=True)
        _send_config_proto(cluster, tf_session_config)

        yield cluster.cluster_spec
    finally:
        if cluster:
            broadcast(cluster.app, "stop", "1")
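
Since the function yields the cluster spec, it is presumably wrapped with @contextlib.contextmanager. A hedged usage sketch, with TaskSpec as in the signature above; the archive path and TaskSpec arguments below are made up for illustration:

with standalone_client_mode(
        pyenv_zip_path="hdfs:///tmp/env.pex",        # hypothetical path
        task_specs={"chief": TaskSpec(memory="2 GiB", vcores=2)}
) as cluster_spec:
    # the TF servers listed in cluster_spec stay up for the body of the
    # block; "stop" is broadcast when the context exits
    print(cluster_spec)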
Example #8
0
def broadcast(self, key: str, value: str):
    broadcast(self.client, f'{self.task}/{key}', value)
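
A usage sketch of this wrapper, assuming self.task holds a task id such as "worker:1" (task_client is a hypothetical instance of the enclosing class):

# hypothetical: publishes the value under the key "worker:1/addr"
task_client.broadcast("addr", "10.0.0.4")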
Example #9
def standalone_client_mode(
        pyenv_zip_path: Union[str, Dict[topologies.NodeLabel, str]],
        task_specs: Dict[str, topologies.TaskSpec] = TASK_SPEC_NONE,
        tf_session_config: Optional[tf.ConfigProto] = None,
        *,
        skein_client: skein.Client = None,
        files: Dict[str, str] = None,
        env: Dict[str, str] = {},
        queue: str = "default",
        acls: ACLs = _default_acls_all_access(),
        file_systems: List[str] = None,
        name: str = "RunOnYarn",
        pre_script_hook: Optional[str] = None
):
    """
    https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow \
            /contrib/distribute/README.md#standalone-client-mode
    Standalone client mode means starting the TF servers on the cluster,
    launching everything from the client and letting TF take care of the rest.
    This is not limited to the Estimator API; it also works with low-level TF
    (see session_run_example.py).

    Parameters
    ----------

    pyenv_zip_path
        Path to an archive of a Python environment to be deployed.
        It can be a zipped conda env or a pex archive.
        For a mixed GPU/CPU cluster, provide a dictionary with both
        environments.

    skein_client
        Skein client used to submit YARN jobs.

    task_specs
        Resources to allocate for each task type. The keys
        must be a subset of ``"chief"``, ``"worker"``, ``"ps"``, and
        ``"evaluator"``. The minimal spec must contain at least
        ``"chief"``.

    tf_session_config
        tf.ConfigProto to be provided to each started TFServer

    files
        Local files or directories to upload to the container.
        The keys are the target locations of the resources relative
        to the container root, while the values -- their
        corresponding local sources. Note that container root is
        appended to ``PYTHONPATH``. Therefore, any listed Python
        module or package is automatically importable.

    env
        Environment variables to forward to the containers.

    queue
        YARN queue to use.

    acls
        Configures the application-level Access Control Lists (ACLs).
        Optional, defaults to ACLs granting all access.

        See `ACLs <https://jcrist.github.io/skein/specification.html#acls>`_ for details.

    file_systems
        A list of namenode URIs to acquire delegation tokens for
        in addition to ``fs.defaultFS``.

    name
        Name of the yarn application

    pre_script_hook
        Bash command to prepare the Hadoop environment.
    """
    skein_cluster = None
    try:
        pyenvs = _setup_pyenvs(pyenv_zip_path)
        skein_cluster = _setup_skein_cluster(
            pyenvs=pyenvs,
            task_specs=task_specs,
            standalone_client_mode=True,
            skein_client=skein_client,
            files=files,
            env=env,
            queue=queue,
            acls=acls,
            file_systems=file_systems,
            name=name,
            pre_script_hook=pre_script_hook
        )

        with _shutdown_on_exception(skein_cluster.app):
            cluster_spec = _setup_cluster_spec(skein_cluster.tasks, skein_cluster.app, True)

            _send_config_proto(skein_cluster, tf_session_config)

            yield cluster_spec
    finally:
        if skein_cluster:
            event.broadcast(skein_cluster.app, "stop", "1")