Example #1
def _evaluator_fn(client):
    def _evaluate(stop):
        experiment = _task_commons._get_experiment(client)
        time.sleep(experiment.eval_spec.start_delay_secs)
        evaluated_checkpoints = set()
        while True:
            latest_checkpoint = experiment.estimator.latest_checkpoint()
            latest_eval_result = None
            if latest_checkpoint and latest_checkpoint not in evaluated_checkpoints:
                latest_eval_result = experiment.estimator.evaluate(
                    experiment.eval_spec.input_fn,
                    steps=experiment.eval_spec.steps,
                    hooks=experiment.eval_spec.hooks,
                    name=experiment.eval_spec.name
                )
                # Record the checkpoint so it is not evaluated again next loop
                evaluated_checkpoints.add(latest_checkpoint)

            if experiment.train_spec.max_steps:
                if latest_eval_result and latest_eval_result.status == _EvalStatus.EVALUATED:
                    global_step = latest_eval_result.metrics.get(ops.GraphKeys.GLOBAL_STEP)
                    if global_step and global_step >= experiment.train_spec.max_steps:
                        break
            else:
                if stop():
                    break

            time.sleep(experiment.eval_spec.throttle_secs)

    stop_evaluation = False
    thread = Thread(target=_evaluate, args=(lambda: stop_evaluation,), daemon=True)
    thread.start()

    # Block until the chief signals shutdown, then flip the flag that the
    # lambda above exposes to the evaluation loop.
    event.wait(client, "chief:0/stop")
    stop_evaluation = True
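The evaluator above blocks on "chief:0/stop" until the chief announces shutdown. A minimal sketch of that chief-side signal, assuming the same event helper used throughout these examples (the import path and payload value are assumptions):

# Hypothetical chief-side counterpart: broadcasting "chief:0/stop" releases
# the evaluator's event.wait() above so it can stop the evaluation loop.
from tf_yarn import event  # assumed import path

def _signal_stop(client):
    event.broadcast(client, "chief:0/stop", "stop")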
Example #2
def wait_for_connected_tasks(client,
                             all_tasks,
                             device_filters,
                             message='stop'):
    for task in all_tasks:
        if matches_device_filters(task, device_filters):
            event.wait(client, f"{task}/{message}")
Example #3
def create_cluster():
    client = skein.ApplicationClient.from_current()
    cluster_spec = cluster.start_cluster(client,
                                         [f'{NODE_NAME}:0', f'{NODE_NAME}:1'])
    cluster.setup_tf_config(cluster_spec)
    cluster.start_tf_server(cluster_spec)
    event.wait(client, "stop")
Example #4
def main() -> None:
    _task_commons._log_sys_info()
    task_type, task_id = cluster.get_task_description()
    task = cluster.get_task()
    client = skein.ApplicationClient.from_current()

    _task_commons._setup_container_logs(client)
    cluster_tasks = _task_commons._get_cluster_tasks(client)

    model_dir = os.getenv('TB_MODEL_DIR', "")
    if not model_dir:
        _logger.info("Read model_dir from estimator config")
        experiment = _task_commons._get_experiment(client)
        model_dir = experiment.estimator.config.model_dir

    _logger.info(f"Starting tensorboard on {model_dir}")

    thread = _internal.MonitoredThread(name=f"{task_type}:{task_id}",
                                       target=tensorboard.start_tf_board,
                                       args=(client, model_dir),
                                       daemon=True)
    thread.start()

    # Keep TensorBoard alive until every task in the cluster has stopped
    for cluster_task in cluster_tasks:
        event.wait(client, f"{cluster_task}/stop")

    timeout = tensorboard.get_termination_timeout()
    thread.join(timeout)

    event.stop_event(client, task, thread.exception)
    event.broadcast_container_stop_time(client, task)
Example #5
def _worker_fn(client, task, net_if):
    event.broadcast(client, f"{task}/addr", net_if[1])

    # Rank layout and Gloo rendezvous address broadcast by the driver on chief:0
    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')

    # Hand rendezvous and rank information to Horovod through its env vars
    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]

    hvd.init()

    experiment = _task_commons._get_experiment(client)

    if isinstance(experiment, Experiment):
        if not is_chief(get_task_type(task)):
            # Overwrite config to do nothing but training to improve training speed
            experiment.estimator._model_dir = "."
            new_config = experiment.estimator.config.replace(
                save_summary_steps=None,
                save_checkpoints_steps=None,
                save_checkpoints_secs=None,
                log_step_count_steps=None)
            experiment.estimator._config = new_config

        logger.info("start training..")

        experiment.estimator.train(experiment.train_spec.input_fn,
                                   hooks=experiment.train_spec.hooks,
                                   max_steps=experiment.train_spec.max_steps)
    elif isinstance(experiment, KerasExperiment):
        if not is_chief(get_task_type(task)):
            if experiment.train_params['callbacks'] is not None:
                callbacks_to_keep = []
                for callback in experiment.train_params['callbacks']:
                    if not isinstance(callback,
                                      tf.keras.callbacks.ModelCheckpoint):
                        callbacks_to_keep.append(callback)
                experiment.train_params['callbacks'] = callbacks_to_keep
        if experiment.input_data_fn is not None:
            experiment.train_params['x'] = experiment.input_data_fn()
        if experiment.target_data_fn is not None:
            experiment.train_params['y'] = experiment.target_data_fn()
        logger.info("start training..")
        experiment.model.fit(**experiment.train_params)
    else:
        raise ValueError(
            "experiment must be an Experiment or a KerasExperiment")
Example #6
def main() -> None:
    _task_commons._log_sys_info()
    task_type, task_id = cluster.get_task_description()
    with _internal.reserve_sock_addr() as host_port:
        client, cluster_spec, cluster_tasks = _task_commons._prepare_container(host_port)
        cluster.setup_tf_config(cluster_spec)
        tf_session_config = cloudpickle.loads(client.kv.wait(constants.KV_TF_SESSION_CONFIG))
        _logger.info(f"tf_server_conf {tf_session_config}")

    tf.contrib.distribute.run_standard_tensorflow_server()
    event.wait(client, "stop")
Example #7
def _setup_master(client: skein.ApplicationClient, rank: int) -> None:
    if rank == 0:
        with _internal.reserve_sock_addr() as host_port:
            event.broadcast(client, MASTER_ADDR, host_port[0])
            event.broadcast(client, MASTER_PORT, str(host_port[1]))
            os.environ[MASTER_ADDR] = host_port[0]
            os.environ[MASTER_PORT] = str(host_port[1])
    else:
        master_addr = event.wait(client, MASTER_ADDR)
        master_port = event.wait(client, MASTER_PORT)
        os.environ[MASTER_ADDR] = master_addr
        os.environ[MASTER_PORT] = master_port
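With MASTER_ADDR and MASTER_PORT exported on every task, joining the PyTorch process group is a one-liner. A usage sketch (the gloo backend and the world_size argument are assumptions; init_process_group defaults to the env:// rendezvous, which reads exactly these two variables):

import torch.distributed as dist

def _join_process_group(client, rank: int, world_size: int) -> None:
    _setup_master(client, rank)
    # init_method defaults to "env://", i.e. the MASTER_ADDR/MASTER_PORT set above
    dist.init_process_group(backend="gloo", rank=rank, world_size=world_size)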
Example #8
def _driver_fn(client, net_if):
    cluster_tasks = _task_commons._get_cluster_tasks(client)
    # Worker discovery
    worker_list = [f"{net_if[1]}:{N_PROCESS_PER_WORKER}"]
    n_workers = 1
    for cluster_task in cluster_tasks:
        if 'worker' in cluster_task:
            worker_addr = event.wait(client, f"{cluster_task}/addr")
            logger.info(f"{cluster_task}: {worker_addr}")
            worker_list.append(f"{worker_addr}:{N_PROCESS_PER_WORKER}")
            n_workers += 1

    # Worker task allocation to workers
    hosts = gloo_run.parse_hosts(','.join(worker_list))
    host_alloc_plan = gloo_run.get_host_assignments(hosts, n_workers)
    for host in host_alloc_plan:
        # One comma-separated line; a triple-quoted string here would leak the
        # source indentation into the payload that workers later split(',') on.
        host_info = (f"{host.rank},{host.size},{host.local_rank},"
                     f"{host.local_size},{host.cross_rank},{host.cross_size}")
        event.broadcast(client, f"{get_task()}/{host.hostname}", host_info)

    global_rendezv = RendezvousServer(verbose=1)
    global_rendezv_port = global_rendezv.start_server()
    global_rendezv.httpd.init(host_alloc_plan)
    event.broadcast(client, f"{get_task()}/sock_addr",
                    f"{net_if[1]}:{global_rendezv_port}")
    return global_rendezv.listen_thread
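This driver is the chief-side counterpart of the _worker_fn shown in Examples #5 and #10: the per-host rank layout it broadcasts under {task}/{hostname} and the rendezvous socket it publishes under {task}/sock_addr are exactly the keys those workers wait on before calling hvd.init().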
Example #9
def aggregate_spec(
        client: skein.ApplicationClient,
        all_tasks: typing.List[str]) -> typing.Dict[str, typing.List[str]]:
    spec: typing.Dict[str, typing.List[str]] = {}
    # Visit tasks in task-id order so each type's address list is rank-ordered
    for task in sorted(all_tasks, key=lambda x: int(x.split(':', 1)[1])):
        sock_addr = event.wait(client, f"{task}/init")
        task_type, _task_id = task.split(":", 1)
        spec.setdefault(task_type, []).append(sock_addr)
    return spec
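The returned dict maps each task type to a rank-ordered address list, which is the same shape as the "cluster" entry of TF_CONFIG. A usage sketch showing that hand-off (the helper name is hypothetical):

import json
import os

def export_tf_config(spec, task_type: str, task_id: int) -> None:
    # TF_CONFIG is how TensorFlow's distributed runtime discovers its peers
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": spec,
        "task": {"type": task_type, "index": task_id},
    })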
Example #10
def _worker_fn(client, task, net_if):
    event.broadcast(client, f"{task}/addr", net_if[1])

    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')

    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]

    hvd.init()

    experiment = _task_commons._get_experiment(client)

    if task != 'chief:0':
        # Overwrite config to do nothing but training to improve training speed
        experiment.estimator._model_dir = "."
        new_config = experiment.estimator.config.replace(
            save_summary_steps=None,
            save_checkpoints_steps=None,
            save_checkpoints_secs=None,
            log_step_count_steps=None)
        experiment.estimator._config = new_config

    logger.info("start training..")

    experiment.estimator.train(experiment.train_spec.input_fn,
                               hooks=experiment.train_spec.hooks,
                               max_steps=experiment.train_spec.max_steps)
Example #11
def client_tf(client):
    spec = create_skein_app()
    app = client.submit_and_connect(spec)
    x = tf.placeholder(tf.float32, 100)

    with tf.device(f"/job:{NODE_NAME}/task:1"):
        first_batch = tf.slice(x, [0], [50])
        mean1 = tf.reduce_mean(first_batch)

    with tf.device(f"/job:{NODE_NAME}/task:0"):
        second_batch = tf.slice(x, [50], [-1])
        mean2 = tf.reduce_mean(second_batch)
        mean = (mean1 + mean2) / 2

    first_task = event.wait(app, f"{NODE_NAME}:0/init")
    with tf.Session(f"grpc://{first_task}") as sess:
        result = sess.run(mean, feed_dict={x: np.random.random(100)})
        print(f"mean = {result}")
    event.broadcast(app, "stop", "1")
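create_skein_app is not shown here. A rough sketch of what such a helper could look like, assuming a single skein service with two instances whose script starts the create_cluster server from Example #3 (the service name, script, and resources are all assumptions):

def create_skein_app() -> skein.ApplicationSpec:
    service = skein.Service(
        script="python -m distributed_server",  # hypothetical entry point
        resources=skein.Resources(memory="1 GiB", vcores=1),
        instances=2,
        env={"PYTHONPATH": "."},
    )
    return skein.ApplicationSpec(services={NODE_NAME: service})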
Example #12
def _setup_tracker(client):
    host_port = event.wait(client, "chief:0/tracker")
    tf.logging.info(f"Got tracker url {host_port}")
    host, port = host_port.split(":")
    os.environ['DMLC_TRACKER_URI'] = host
    os.environ['DMLC_TRACKER_PORT'] = port
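The "chief:0/tracker" address waited on here has to be broadcast by the chief after it starts the DMLC tracker. A hypothetical chief-side sketch using dmlc-core's RabitTracker (the import path, constructor arguments, and helper name are assumptions):

from dmlc_tracker.tracker import RabitTracker  # assumed import path

def _start_tracker(client, host_ip: str, n_workers: int):
    tracker = RabitTracker(hostIP=host_ip, nslave=n_workers)
    tracker.start(n_workers)
    # Publish host:port so each worker's _setup_tracker can configure DMLC
    event.broadcast(client, "chief:0/tracker", f"{host_ip}:{tracker.port}")
    return tracker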