Example #1
def test_upload_env():
    with contextlib.ExitStack() as stack:
        # Mock all objects
        mock_is_archive = stack.enter_context(
            mock.patch(f"{MODULE_TO_TEST}._is_archive_up_to_date"))
        mock_get_packages = stack.enter_context(
            mock.patch(
                f"{MODULE_TO_TEST}.packaging.get_non_editable_requirements"))

        mock_resolve_fs = stack.enter_context(
            mock.patch(
                f"{MODULE_TO_TEST}.filesystem.resolve_filesystem_and_path"))
        mock_fs = mock.MagicMock()
        mock_resolve_fs.return_value = mock_fs, ""

        stack.enter_context(
            mock.patch(f"{MODULE_TO_TEST}._dump_archive_metadata"))
        stack.enter_context(mock.patch(f"{MODULE_TO_TEST}.shutil.rmtree"))
        mock_packer = stack.enter_context(
            mock.patch(f"{MODULE_TO_TEST}.packaging.pack_in_pex"))

        # Regenerate archive
        mock_is_archive.return_value = False
        mock_get_packages.return_value = [{
            "name": "a",
            "version": "1.0"
        }, {
            "name": "b",
            "version": "2.0"
        }]

        mock_packer.return_value = MYARCHIVE_FILENAME

        cluster_pack.upload_env(MYARCHIVE_FILENAME, cluster_pack.PEX_PACKER)
        mock_packer.assert_called_once_with({
            "a": "1.0",
            "b": "2.0"
        }, Any(str), [])
        mock_fs.put.assert_called_once_with(MYARCHIVE_FILENAME,
                                            MYARCHIVE_FILENAME)

        mock_packer.reset_mock()
        cluster_pack.upload_env(MYARCHIVE_FILENAME,
                                cluster_pack.PEX_PACKER,
                                additional_packages={"c": "3.0"},
                                ignored_packages=["a"])
        mock_packer.assert_called_once_with({
            "c": "3.0",
            "b": "2.0"
        }, Any(str), ["a"])
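The Any(str) matcher used in the assertions above is not part of ``unittest.mock``; it is presumably a small test helper. A minimal sketch of such a matcher, which compares equal to any instance of a given type:

class Any:
    # Hypothetical helper: compares equal to any value of the expected type.
    def __init__(self, expected_type):
        self.expected_type = expected_type

    def __eq__(self, other):
        return isinstance(other, self.expected_type)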
Example #2
def launch_remote_check(file: str) -> Tuple[bool, str]:
    logging.info('Launching remote check')
    zip_hdfs, _ = cluster_pack.upload_env(packer=cluster_pack.PEX_PACKER)
    archive_name = os.path.basename(zip_hdfs)
    with skein.Client() as client:
        files = {
            archive_name: zip_hdfs,
            'check_hadoop_env.py': __file__,
        }
        editable_packages = cluster_pack.get_editable_requirements()
        if 'tf_yarn' in editable_packages:
            tf_yarn_zip = cluster_pack.zip_path(editable_packages['tf_yarn'],
                                                False)
            logger.info(f"zip path for editable tf_yarn is {tf_yarn_zip}")
            files.update({'tf_yarn': tf_yarn_zip})
        service = skein.Service(
            script=f'./{archive_name} check_hadoop_env.py --file {file}',
            resources=skein.Resources(2 * 1024, 1),
            env={
                'PEX_ROOT': f'/tmp/{uuid.uuid4()}/',
                'PYTHONPATH': '.:',
            },
            files=files,
            instances=1)
        spec = skein.ApplicationSpec(
            {'HADOOP_ENV_CHECKER': service},
            acls=skein.model.ACLs(enable=True, view_users=['*']),
        )
        app = client.submit_and_connect(spec)

        logging.info('Remote check started')
        result = app.kv.wait('result').decode()
        app_id = app.id
        app.shutdown()
        return result == "True", app_id
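A hypothetical driver for the check above, using only its documented return value (the HDFS file path is a placeholder):

if __name__ == "__main__":
    logging.basicConfig(level="INFO")
    success, app_id = launch_remote_check("hdfs:///tmp/sample.csv")
    logging.info(f"remote check passed={success} (application {app_id})")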
Example #3
def main():
    # forcing call to model_to_estimator._save_first_checkpoint l457
    # https://github.com/tensorflow/estimator/blob/ \
    # 1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py
    # otherwise there is a race condition
    # when all workers try to save the first checkpoint at the same time
    experiment_fn(HDFS_DIR)

    pyenv_zip_path, env_name = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(pyenv_zip_path,
                get_safe_exp_fn(),
                task_specs={
                    "chief": TaskSpec(memory="2 GiB", vcores=4),
                    "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
                    "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
                    "evaluator": TaskSpec(memory="2 GiB", vcores=1)
                },
                files={
                    **editable_requirements,
                    os.path.basename(winequality.__file__):
                    winequality.__file__,
                    os.path.basename(__file__):
                    __file__
                })
Example #4
def upload_env_to_hdfs(
        archive_on_hdfs: str = None,
        packer=None,
        additional_packages: Dict[str, str] = {},
        ignored_packages: Collection[str] = []) -> Tuple[str, str]:
    return cluster_pack.upload_env(archive_on_hdfs, packer,
                                   additional_packages, ignored_packages)
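A usage sketch for this wrapper; the target path and package names are placeholders:

archive_path, env_name = upload_env_to_hdfs(
    "hdfs:///user/j.doe/envs/myenv.pex",
    packer=cluster_pack.PEX_PACKER,
    additional_packages={"numpy": "1.21.0"},
    ignored_packages=["pandas"])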
Example #5
def launch_pyspark():
    from pyspark.sql import SparkSession
    import cluster_pack
    from cluster_pack.spark import spark_config_builder
    archive, _ = cluster_pack.upload_env()
    ssb = SparkSession.builder.master("yarn").config("spark.submit.deployMode", "client")
    spark_config_builder.add_packaged_environment(ssb, archive)
    sc = ssb.getOrCreate().sparkContext
    hdfs_cat_res = sc.parallelize([1], numSlices=1).map(env).collect()[0]
    print(f"pyspark result:{hdfs_cat_res}")
Example #6
def test_upload_env_in_a_pex():
    home_path = '/home/j.doe'
    home_fs_path = '/user/j.doe'
    with contextlib.ExitStack() as stack:
        mock_running_from_pex = stack.enter_context(
            mock.patch(f"{MODULE_TO_TEST}.packaging._running_from_pex"))
        mock_running_from_pex.return_value = True
        mock_pex_filepath = stack.enter_context(
            mock.patch(f"{MODULE_TO_TEST}.packaging.get_current_pex_filepath"))
        mock_pex_filepath.return_value = f"{home_path}/myapp.pex"

        mock_resolve_fs = stack.enter_context(
            mock.patch(f"{MODULE_TO_TEST}.filesystem.resolve_filesystem_and_path"))
        mock_fs = mock.MagicMock()
        mock_resolve_fs.return_value = mock_fs, ""

        mock__get_archive_metadata_path = stack.enter_context(
            mock.patch(f"{MODULE_TO_TEST}._get_archive_metadata_path")
        )
        mock__get_archive_metadata_path.return_value = f"{home_fs_path}/blah.json"

        # metadata & pex already exists on fs
        mock_fs.exists.return_value = True

        mock_pex_info = stack.enter_context(
            mock.patch(f"{MODULE_TO_TEST}.PexInfo")
        )

        def _from_pex(arg):
            if arg == f'{home_path}/myapp.pex':
                return PexInfo({"code_hash": 1})
            else:
                return PexInfo({"code_hash": 2})

        mock_pex_info.from_pex.side_effect = _from_pex

        result = cluster_pack.upload_env(f'{home_fs_path}/blah.pex')

        # Check copy pex to remote
        mock_fs.put.assert_any_call(
            f'{home_path}/myapp.pex',
            f'{home_fs_path}/blah.pex')
        # Check metadata has been cleaned
        mock_fs.rm.assert_called_once_with(f'{home_fs_path}/blah.json')
        # check envname
        assert 'myapp' == result[1]
Example #7
def main():
    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(
        pyenv_zip_path,
        get_safe_exp_fn(),
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
            os.path.basename(__file__): __file__,
        },
        custom_task_module="tf_yarn.tasks.gloo_allred_task"
    )
Example #8
import logging
import skein
import tempfile

import cluster_pack
from cluster_pack.skein import skein_config_builder, skein_launcher


if __name__ == "__main__":

    logging.basicConfig(level="INFO")

    package_path, _ = cluster_pack.upload_env()

    with tempfile.TemporaryDirectory() as tmp_dir:
        skein_config = skein_config_builder.build(
            module_name="skein_project.worker",
            package_path=package_path,
            tmp_dir=tmp_dir
        )

        with skein.Client() as client:
            service = skein.Service(
                resources=skein.model.Resources("1 GiB", 1),
                files=skein_config.files,
                script=skein_config.script
            )
            spec = skein.ApplicationSpec(services={"service": service})
            app_id = client.submit(spec)

            skein_launcher.wait_for_finished(client, app_id)
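            # Sketch: inspect the terminal state once the application has
            # finished (assumes skein's Client.application_report API).
            report = client.application_report(app_id)
            logging.info(f"application {app_id} finished: {report.final_status}")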
Example #9
def main():
    def experiment_fn() -> Experiment:
        # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
        import tensorflow as tf

        def convert_to_tensor(x, y):
            return (tf.convert_to_tensor(value=list(x.values()),
                                         dtype=tf.float32),
                    tf.convert_to_tensor(value=y, dtype=tf.int32))

        def train_input_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE,
                                              split="train")
            return (dataset.map(convert_to_tensor).shuffle(1000).batch(
                128).repeat())

        def eval_input_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
            return (dataset.map(convert_to_tensor).shuffle(1000).batch(128))

        model = keras.Sequential()
        model.add(
            keras.layers.Dense(units=300,
                               activation="relu",
                               input_shape=(11, )))
        model.add(keras.layers.Dense(units=100, activation="relu"))
        model.add(keras.layers.Dense(units=10, activation="softmax"))
        model.summary()
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer="sgd",
                      metrics=['accuracy'])

        config = tf.estimator.RunConfig(model_dir=HDFS_DIR)
        estimator = tf.keras.estimator.model_to_estimator(model, config=config)
        return Experiment(
            estimator, tf.estimator.TrainSpec(train_input_fn, max_steps=1000),
            tf.estimator.EvalSpec(eval_input_fn,
                                  steps=10,
                                  start_delay_secs=0,
                                  throttle_secs=30))

    # forcing call to model_to_estimator._save_first_checkpoint l457
    # https://github.com/tensorflow/estimator/blob/ \
    # 1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py
    # otherwise there is a race condition
    # when all workers try to save the first checkpoint at the same time
    experiment_fn()

    pyenv_zip_path, env_name = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(pyenv_zip_path,
                experiment_fn,
                task_specs={
                    "chief": TaskSpec(memory="2 GiB", vcores=4),
                    "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
                    "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
                    "evaluator": TaskSpec(memory="2 GiB", vcores=1)
                },
                files={
                    **editable_requirements,
                    os.path.basename(winequality.__file__):
                    winequality.__file__,
                })
Example #10
def test_upload_env_should_throw_error_if_wrong_extension():
    with pytest.raises(ValueError):
        cluster_pack.upload_env("myarchive.tar.gz", packer=cluster_pack.CONDA_PACKER)
Example #11
        n_classes=winequality.get_n_classes(),
        optimizer=lambda: hvd.DistributedOptimizer(tf.train.AdamOptimizer()))

    return Experiment(
        estimator,
        tf.estimator.TrainSpec(train_input_fn,
                               max_steps=10,
                               hooks=[hvd.BroadcastGlobalVariablesHook(0)]),
        tf.estimator.EvalSpec(eval_input_fn,
                              steps=10,
                              start_delay_secs=0,
                              throttle_secs=30))


if __name__ == "__main__":
    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()

    run_on_yarn(pyenv_zip_path,
                experiment_fn,
                task_specs={
                    "chief":
                    TaskSpec(memory="2 GiB", vcores=4),
                    "worker":
                    TaskSpec(memory="2 GiB", vcores=4, instances=1),
                    "evaluator":
                    TaskSpec(memory="2 GiB", vcores=1),
                    "tensorboard":
                    TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR)
                },
                files={
Example #12
        return (dataset.shuffle(1000).batch(128))

    estimator = tf.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=HDFS_DIR,
        n_classes=winequality.get_n_classes())
    return Experiment(
        estimator, tf.estimator.TrainSpec(train_input_fn, max_steps=100),
        tf.estimator.EvalSpec(eval_input_fn,
                              steps=10,
                              start_delay_secs=0,
                              throttle_secs=30))


if __name__ == "__main__":
    pyenv_zip_path, env_name = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(pyenv_zip_path,
                experiment_fn,
                task_specs={
                    "chief":
                    TaskSpec(memory="2 GiB", vcores=4),
                    "evaluator":
                    TaskSpec(memory="2 GiB", vcores=1),
                    "tensorboard":
                    TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR)
                },
                files={
                    **editable_requirements,
                    os.path.basename(winequality.__file__):
                    winequality.__file__,
Example #13
def run_on_yarn(
        experiment_fn: ExperimentFn,
        task_specs: Dict[str, topologies.TaskSpec],
        *,
        pyenv_zip_path: Union[str, Dict[topologies.NodeLabel, str]] = None,
        skein_client: skein.Client = None,
        files: Dict[str, str] = None,
        env: Dict[str, str] = {},
        queue: str = "default",
        acls: ACLs = _default_acls_all_access(),
        file_systems: List[str] = None,
        eval_monitor_log_thresholds: Dict[str, Tuple[float, float]] = None,
        nb_retries: int = 0,
        custom_task_module: Optional[str] = None,
        name: str = "RunOnYarn",
        pre_script_hook: Optional[str] = None) -> Optional[metrics.Metrics]:
    """Run an experiment on YARN.

    The implementation allocates a service with the requested number
    of instances for each distributed task type. Each
    instance runs ``_dispatch_task`` which roughly does the following.

    1. Reserve a TCP port and communicate the resulting socket address
        (host/port pair) to other instances using the "init" barrier.
    2. Spawn ``train_and_evaluate`` in a separate thread.
    3. Synchronize the "ps" tasks on the "stop" barrier.
        The barrier compensates for the fact that "ps" tasks never
        terminate, and therefore should be killed once all other
        tasks are finished.

    Parameters
    ----------

    experiment_fn
        A function constructing the estimator alongside the train
        and eval specs.

    task_specs
        Resources to allocate for each task type. The keys
        must be a subset of ``"chief"``, ``"worker"``, ``"ps"``, and
        ``"evaluator"``. The minimal spec must contain at least
        ``"chief"``.

    pyenv_zip_path
        Path to an archive of a Python environment to be deployed.
        It can be a zipped conda env or a pex archive.
        For a mixed GPU/CPU cluster, provide a dictionary with both
        environments. If none is provided, the current Python environment
        will be packaged in a pex.

    skein_client
        Skein client used to submit yarn jobs

    files
        Local files or directories to upload to the container.
        The keys are the target locations of the resources relative
        to the container root, while the values are their
        corresponding local sources. Note that the container root is
        appended to ``PYTHONPATH``. Therefore, any listed Python
        module or package is automatically importable.

    env
        Environment variables to forward to the containers.

    queue
        YARN queue to use.

    acls
        Configures the application-level Access Control Lists (ACLs).
        Optional; defaults to ACLs granting all access.

        See `ACLs <https://jcrist.github.io/skein/specification.html#acls>`_ for details.

    file_systems
        A list of namenode URIs to acquire delegation tokens for
        in addition to ``fs.defaultFS``.

    eval_monitor_log_thresholds
        Optional dictionary from string to (float 1, float 2).
        Each (key, value) pair corresponds to a monitored evaluation
        metric and an associated range. The monitored metric is logged
        if it lies in [float 1; float 2]. If the lower bound is None, it is set to 0.
        If the upper bound is None, it is set to the maximum value.
        A monitored metric with no range is always logged. List of monitored metrics:
        'awake_time_ratio': 'Awake/idle ratio',
        'eval_step_mean_duration': 'Eval step mean duration (in sec)',
        'last_training_step': 'Training step of last checkpoint',
        'nb_eval_steps': 'Number of evaluation steps done'

    nb_retries
        Number of times the YARN application is retried in case of failure

    custom_task_module
        Full module name of a custom task to execute on each worker;
        None by default.
        (The module is invoked with ``python -m {custom_task_module}`` on the cluster.)
        Only for advanced use cases; can be useful, for example,
        to bypass/tweak the existing ``estimator.train_and_evaluate`` pattern.

    name
        Name of the YARN application

    pre_script_hook
        Bash command to prepare the Hadoop environment

    Raises
    ------
    RunFailed
        If the final status of the YARN application is ``"FAILED"``.
    """
    updated_files = _add_editable_requirements(files)
    _pyenv_zip_path = (pyenv_zip_path if pyenv_zip_path
                       else cluster_pack.upload_env()[0])

    if nb_retries < 0:
        raise ValueError(
            f'nb_retries must be greater or equal to 0. Got {nb_retries}')

    pyenvs = _setup_pyenvs(_pyenv_zip_path)

    n_try = 0
    while True:
        try:
            skein_cluster = _setup_skein_cluster(
                pyenvs=pyenvs,
                task_specs=task_specs,
                skein_client=skein_client,
                files=updated_files,
                env=env,
                queue=queue,
                acls=acls,
                file_systems=file_systems,
                name=name,
                n_try=n_try,
                custom_task_module=custom_task_module,
                pre_script_hook=pre_script_hook)
            with _shutdown_on_exception(skein_cluster.app):
                _setup_cluster_spec(skein_cluster.tasks, skein_cluster.app)

                return _run_on_cluster(experiment_fn, skein_cluster,
                                       eval_monitor_log_thresholds, n_try)
        except Exception:
            n_try += 1
            if n_try == nb_retries + 1:
                raise
            logger.exception(f"Retrying user application ... "
                             f"{nb_retries + 1 - n_try} remaining attempts")

    # Necessary for type checking
    return None
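A minimal invocation of the API documented above; my_experiment_fn stands in for a user-supplied experiment builder and all values are placeholders:

metrics = run_on_yarn(
    my_experiment_fn,
    task_specs={"chief": topologies.TaskSpec(memory="2 GiB", vcores=4)},
    nb_retries=2,
    eval_monitor_log_thresholds={"awake_time_ratio": (0.5, None)})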
Example #14
def main():
    def experiment_fn() -> KerasExperiment:
        def convert_to_tensor(x, y):
            return (tf.convert_to_tensor(value=list(x.values()),
                                         dtype=tf.float32),
                    tf.convert_to_tensor(value=y, dtype=tf.int32))

        def input_data_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE,
                                              split="train")
            return (dataset.map(convert_to_tensor).shuffle(1000).batch(
                128).repeat())

        def validation_data_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
            return (dataset.map(convert_to_tensor).shuffle(1000).batch(128))

        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Dense(units=300,
                                  activation="relu",
                                  input_shape=(11, )))
        model.add(tf.keras.layers.Dense(units=100, activation="relu"))
        model.add(tf.keras.layers.Dense(units=10, activation="softmax"))
        model.summary()
        opt = tf.keras.optimizers.Adadelta(1.0 * HVD_SIZE)
        opt = hvd.DistributedOptimizer(opt)
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])
        path_to_checkpoint = f"{HDFS_DIR}" + "/checkpoint-{epoch}"
        my_callbacks = [
            tf.keras.callbacks.ModelCheckpoint(path_to_checkpoint),
            hvd.keras.callbacks.BroadcastGlobalVariablesCallback(0),
        ]
        train_params = {"steps_per_epoch": 1000, "callbacks": my_callbacks}
        return KerasExperiment(model=model,
                               model_dir=HDFS_DIR,
                               train_params=train_params,
                               input_data_fn=input_data_fn,
                               target_data_fn=None,
                               validation_data_fn=validation_data_fn)

    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(pyenv_zip_path,
                experiment_fn,
                task_specs={
                    "chief":
                    TaskSpec(memory="2 GiB", vcores=4),
                    "worker":
                    TaskSpec(memory="2 GiB",
                             vcores=4,
                             instances=(HVD_SIZE - 1)),
                    "evaluator":
                    TaskSpec(memory="2 GiB", vcores=1)
                },
                files={
                    **editable_requirements,
                    os.path.basename(winequality.__file__):
                    winequality.__file__,
                },
                custom_task_module="tf_yarn.tasks.gloo_allred_task")
Example #15
logging.basicConfig(level="INFO")

_logger = logging.getLogger(__name__)

if __name__ == "__main__":

    # use local minio S3 instance
    # allowed parameters are here:
    # https://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem
    s3_args = {
        "use_ssl": False,
        "client_kwargs": {
            'endpoint_url': "http://*****:*****"
        }
    }

    @pandas_udf("double", PandasUDFType.GROUPED_AGG)
    def mean_udf(v: pd.Series) -> float:
        return v.mean()
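    # Hypothetical usage, assuming a SparkSession `spark` built with the
    # packaged environment (as in the pyspark example above):
    df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ["id", "v"])
    df.groupby("id").agg(mean_udf(df["v"])).show()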