Example 1
def test__get_experiment_exception():
    with contextlib.ExitStack() as stack:
        stack.enter_context(patch(f'{MODULE_TO_TEST}.cluster'))
        mocked_event = stack.enter_context(patch(f'{MODULE_TO_TEST}.event'))
        mocked_client = mock.MagicMock(spec=skein.ApplicationClient)

        def experiment_f():
            raise Exception()

        mocked_client.kv.wait.return_value = cloudpickle.dumps(experiment_f)
        with pytest.raises(Exception):
            _get_experiment(mocked_client)
        mocked_event.start_event.assert_called_once()
        mocked_event.stop_event.assert_called_once()
Example 2
    def _evaluate(stop):
        experiment = _task_commons._get_experiment(client)
        time.sleep(experiment.eval_spec.start_delay_secs)
        evaluated_checkpoints = set()
        while True:
            latest_checkpoint = experiment.estimator.latest_checkpoint()
            latest_eval_result = None
            if latest_checkpoint and latest_checkpoint not in evaluated_checkpoints:
                latest_eval_result = experiment.estimator.evaluate(
                    experiment.eval_spec.input_fn,
                    steps=experiment.eval_spec.steps,
                    hooks=experiment.eval_spec.hooks,
                    name=experiment.eval_spec.name
                )

            if experiment.train_spec.max_steps:
                if latest_eval_result and latest_eval_result.status == _EvalStatus.EVALUATED:
                    global_step = latest_eval_result.metrics.get(ops.GraphKeys.GLOBAL_STEP)
                    if global_step and global_step >= experiment.train_spec.max_steps:
                        break
            else:
                if stop():
                    break

            time.sleep(experiment.eval_spec.throttle_secs)
Example 3
def main() -> None:
    _task_commons._log_sys_info()
    task_type, task_id = cluster.get_task_description()
    with _internal.reserve_sock_addr() as host_port:
        client, cluster_spec, cluster_tasks = _task_commons._prepare_container(
            host_port)
        # Variable TF_CONFIG must be set before instantiating
        # the estimator to train in a distributed way
        cluster.setup_tf_config(cluster_spec)
        experiment = _task_commons._get_experiment(client)
        if isinstance(experiment, Experiment):
            session_config = experiment.config.session_config
        elif isinstance(experiment, KerasExperiment):
            raise ValueError(
                "KerasExperiment using parameter strategy is unsupported")
        else:
            raise ValueError(
                "experiment must be an Experiment or a KerasExperiment")
        _logger.info(f"Starting server {task_type}:{task_id}")

    cluster.start_tf_server(cluster_spec, session_config)
    thread = _task_commons._execute_dispatched_function(client, experiment)

    # "ps" tasks do not terminate by themselves. See
    # https://github.com/tensorflow/tensorflow/issues/4713.
    if task_type not in ['ps']:
        thread.join()
        _logger.info(f"{task_type}:{task_id} {thread.state}")

    _task_commons._shutdown_container(client, cluster_tasks, session_config,
                                      thread)
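
The TF_CONFIG comment above refers to TensorFlow's standard mechanism for distributed training: before the estimator is instantiated, each task reads a JSON description of the cluster and of its own role from the TF_CONFIG environment variable. The sketch below is only illustrative of what cluster.setup_tf_config presumably serializes; the helper's internals are not shown in these examples, and the function name here is hypothetical.

import json
import os

def setup_tf_config_sketch(cluster_spec: dict, task_type: str, task_id: int) -> None:
    # Illustrative only: the TF_CONFIG layout TensorFlow expects, carrying
    # the cluster description and this task's own role.
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": cluster_spec,  # e.g. {"chief": [...], "worker": [...], "ps": [...]}
        "task": {"type": task_type, "index": task_id},
    })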
Example 4
def main() -> None:
    _log_sys_info()
    task_type, task_id = get_task_description()

    client = skein.ApplicationClient.from_current()
    experiment = _get_experiment(client)
    assert isinstance(experiment, PytorchExperiment)
    cluster_tasks = _get_cluster_tasks(client)
    n_workers_per_executor = experiment.n_workers_per_executor

    world_size = (len([t for t in cluster_tasks if "worker" in t])
                  * n_workers_per_executor)
    _logger.info(f"Task type: {task_type}; Task id: {task_id}; "
                 f"World size: {world_size}; Cluster tasks: {cluster_tasks}")

    if n_workers_per_executor > 1:
        workers = list()
        mp.set_start_method("spawn", force=True)
        for n in range(n_workers_per_executor):
            worker = mp.Process(
                target=_train,
                args=(_get_device(n), (task_id * n_workers_per_executor) + n,
                      world_size,
                      _get_collective_ops_backend(n_workers_per_executor)))
            worker.start()
            workers.append(worker)

        for worker in workers:
            worker.join()
    else:
        _train(0, task_id, world_size, "nccl")
Example 5
def _worker_fn(task_type, task_id, client):
    os.environ['DMLC_RANK'] = "0" if task_type == 'chief' else f"{task_id + 1}"
    os.environ['DMLC_ROLE'] = "worker"

    cluster_tasks = _task_commons._get_cluster_tasks(client)

    logger.info(cluster_tasks)

    if task_type == 'chief':
        _start_tracker(client, len(cluster_tasks))

    _setup_tracker(client)

    rabit.init()

    experiment = _task_commons._get_experiment(client)

    if task_type != 'chief':
        # Overwrite config to do nothing but training to improve training speed
        experiment.estimator._model_dir = "."
        new_config = experiment.estimator.config.replace(
            save_summary_steps=None,
            save_checkpoints_steps=None,
            save_checkpoints_secs=None,
            log_step_count_steps=None
        )
        experiment.estimator._config = new_config

    logger.info("start training..")

    experiment.estimator.train(
        experiment.train_spec.input_fn,
        hooks=experiment.train_spec.hooks,
        max_steps=experiment.train_spec.max_steps)
Example 6
def main() -> None:
    _task_commons._log_sys_info()
    task_type, task_id = cluster.get_task_description()
    task = cluster.get_task()
    client = skein.ApplicationClient.from_current()

    _task_commons._setup_container_logs(client)
    cluster_tasks = _task_commons._get_cluster_tasks(client)

    model_dir = os.getenv('TB_MODEL_DIR', "")
    if not model_dir:
        _logger.info("Read model_dir from estimator config")
        experiment = _task_commons._get_experiment(client)
        model_dir = experiment.estimator.config.model_dir

    _logger.info(f"Starting tensorboard on {model_dir}")

    thread = _internal.MonitoredThread(name=f"{task_type}:{task_id}",
                                       target=tensorboard.start_tf_board,
                                       args=(client, model_dir),
                                       daemon=True)
    thread.start()

    for cluster_task in cluster_tasks:
        event.wait(client, f"{cluster_task}/stop")

    timeout = tensorboard.get_termination_timeout()
    thread.join(timeout)

    event.stop_event(client, task, thread.exception)
    event.broadcast_container_stop_time(client, task)
Example 7
def evaluator_fn(client):
    experiment = _task_commons._get_experiment(client)
    if isinstance(experiment, Experiment):
        evaluate(experiment, timeout_in_secs=1200)  # Timeout after 20min
    elif isinstance(experiment, KerasExperiment):
        keras_evaluate(experiment, timeout_in_secs=1200)  # Timeout after 20min
    else:
        raise ValueError(
            "experiment must be an Experiment or a KerasExperiment")
Example 8
def _worker_fn(client, task, net_if):
    event.broadcast(client, f"{task}/addr", net_if[1])

    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')

    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]

    hvd.init()

    experiment = _task_commons._get_experiment(client)

    if isinstance(experiment, Experiment):
        if not is_chief(get_task_type(task)):
            # Overwrite config to do nothing but training to improve training speed
            experiment.estimator._model_dir = "."
            new_config = experiment.estimator.config.replace(
                save_summary_steps=None,
                save_checkpoints_steps=None,
                save_checkpoints_secs=None,
                log_step_count_steps=None)
            experiment.estimator._config = new_config

        logger.info("start training..")

        experiment.estimator.train(experiment.train_spec.input_fn,
                                   hooks=experiment.train_spec.hooks,
                                   max_steps=experiment.train_spec.max_steps)
    elif isinstance(experiment, KerasExperiment):
        if not is_chief(get_task_type(task)):
            if experiment.train_params['callbacks'] is not None:
                callbacks_to_keep = []
                for callback in experiment.train_params['callbacks']:
                    if not isinstance(callback,
                                      tf.keras.callbacks.ModelCheckpoint):
                        callbacks_to_keep.append(callback)
                experiment.train_params['callbacks'] = callbacks_to_keep
        if experiment.input_data_fn is not None:
            experiment.train_params['x'] = experiment.input_data_fn()
        if experiment.target_data_fn is not None:
            experiment.train_params['y'] = experiment.target_data_fn()
        logger.info("start training..")
        experiment.model.fit(**experiment.train_params)
    else:
        raise ValueError(
            "experiment must be an Experiment or a KerasExperiment")
Example 9
def test__get_experiment_object():
    mocked_client = mock.MagicMock(spec=skein.ApplicationClient)
    experiment_obj = 'obj'

    def experiment_f():
        return experiment_obj

    mocked_client.kv.wait.return_value = cloudpickle.dumps(experiment_f)
    returned_object = _get_experiment(mocked_client)
    assert returned_object == experiment_obj
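
Taken together, the tests in Examples 1 and 9 pin down the expected contract of _get_experiment: it blocks on the skein key-value store until a cloudpickle-serialized experiment function is published, deserializes and calls it, and signals failures through the event helpers. A minimal sketch under those assumptions follows; the key name "experiment_fn" and the exact error handling are guesses rather than the library's confirmed implementation, and cluster and event refer to the helper modules used throughout these examples.

import cloudpickle

def _get_experiment_sketch(client):
    try:
        # client.kv.wait blocks until the key is published, then returns the
        # serialized experiment function (see the mocked return value above).
        experiment_fn = cloudpickle.loads(client.kv.wait("experiment_fn"))
        return experiment_fn()
    except Exception as exception:
        # Example 1 asserts that both events fire when the function raises.
        task = cluster.get_task()
        event.start_event(client, task)
        event.stop_event(client, task, exception)
        raise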
Example 10
def main() -> None:
    task_type, task_id = cluster.get_task_description()
    client, cluster_spec, cluster_tasks = _prepare_container()

    # Variable TF_CONFIG must be set before instantiating
    # the estimator to train in a distributed way
    cluster.setup_tf_config(cluster_spec)
    experiment = _get_experiment(client)
    run_config = experiment.config

    tf.logging.info(f"Starting server {task_type}:{task_id}")
    cluster.start_tf_server(cluster_spec, run_config.session_config)
    thread = _execute_dispatched_function(client, experiment)

    # "ps" tasks do not terminate by themselves. See
    # https://github.com/tensorflow/tensorflow/issues/4713.
    # Tensorboard is terminated after all other tasks in _shutdown_container
    if task_type not in ['ps', 'tensorboard']:
        thread.join()
        tf.logging.info(f"{task_type}:{task_id} {thread.state}")

    _shutdown_container(client, cluster_tasks, run_config, thread)
Example 11
def _worker_fn(client, task, net_if):
    event.broadcast(client, f"{task}/addr", net_if[1])

    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')

    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]

    hvd.init()

    experiment = _task_commons._get_experiment(client)

    if task != 'chief:0':
        # Overwrite config to do nothing but training to improve training speed
        experiment.estimator._model_dir = "."
        new_config = experiment.estimator.config.replace(
            save_summary_steps=None,
            save_checkpoints_steps=None,
            save_checkpoints_secs=None,
            log_step_count_steps=None)
        experiment.estimator._config = new_config

    logger.info("start training..")

    experiment.estimator.train(experiment.train_spec.input_fn,
                               hooks=experiment.train_spec.hooks,
                               max_steps=experiment.train_spec.max_steps)
Example 12
def _train(device: int, rank: int, world_size: int,
           collective_ops_backend: str) -> None:
    os.environ["NCCL_DEBUG"] = "INFO"
    _logger.info(f"[{os.getpid()}] device: {device}; rank: {rank}")
    os.environ[PYTORCH_DPP_RANK] = str(rank)

    client = skein.ApplicationClient.from_current()
    _setup_master(client, rank)

    dist.init_process_group(collective_ops_backend,
                            rank=rank,
                            world_size=world_size)

    experiment = _get_experiment(client)
    assert isinstance(experiment, PytorchExperiment)
    model = experiment.model.to(device)
    ddp_kwargs = experiment.ddp_args._asdict() if experiment.ddp_args else {}
    ddp_model = DDP(model, device_ids=[device], **ddp_kwargs)

    trainloader = _create_dataloader(experiment.train_dataset,
                                     experiment.dataloader_args)

    with tempfile.TemporaryDirectory() as tmp:
        tb_writer = SummaryWriter(tmp)
        experiment.main_fn(ddp_model, trainloader, f"cuda:{device}", rank,
                           tb_writer)
        tb_writer.flush()
        tb_writer.close()

        if experiment.tensorboard_hdfs_dir:
            worker_tb_dir = os.path.join(experiment.tensorboard_hdfs_dir,
                                         f"worker{rank}")
            _upload_tensorboard_on_hdfs(tmp, worker_tb_dir)

    dist.destroy_process_group()
    _logger.info("Done training")
Example 13
def evaluator_fn(client):
    experiment = _task_commons._get_experiment(client)
    evaluate(experiment, timeout_in_secs=1200)  # Timeout after 20min