Esempio n. 1
0
def main():
    """Run the experiment function once locally, then submit the job to YARN.

    The submission ships the uploaded Python environment, the editable
    requirements, the ``winequality`` module file and this script itself.
    """
    # Call experiment_fn up front to force
    # model_to_estimator._save_first_checkpoint (l457 of
    # https://github.com/tensorflow/estimator/blob/ \
    # 1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py).
    # Without this there is a race condition when all workers try to save
    # the first checkpoint at the same time.
    experiment_fn(HDFS_DIR)

    # Only the path of the uploaded environment is needed below.
    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()

    safe_experiment_fn = get_safe_exp_fn()
    cluster_layout = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
        "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }
    # Files are keyed by base name and map to their local paths.
    shipped_files = {
        **editable_requirements,
        os.path.basename(winequality.__file__): winequality.__file__,
        os.path.basename(__file__): __file__,
    }

    run_on_yarn(pyenv_zip_path,
                safe_experiment_fn,
                task_specs=cluster_layout,
                files=shipped_files)
Esempio n. 2
0
def test_retry_run_on_yarn(nb_retries, nb_failures):
    """Check how often run_on_yarn re-runs the cluster when runs fail.

    The mocked ``tf_yarn._run_on_cluster`` raises on its first
    ``nb_failures`` calls and succeeds afterwards. The test then expects
    ``min(nb_retries, nb_failures) + 1`` calls to both ``_run_on_cluster``
    and ``_setup_skein_cluster``.

    Args:
        nb_retries: value forwarded to ``run_on_yarn(nb_retries=...)``.
        nb_failures: number of initial ``_run_on_cluster`` calls that raise.
    """
    cpt = 0  # number of simulated failures raised so far

    def fail(*args, **kwargs):
        # The counter must be advanced on each simulated failure; without
        # this the mock would keep failing forever as soon as
        # nb_failures > 0 and the "fail N times, then succeed" scenario
        # would never be exercised.
        nonlocal cpt
        if cpt < nb_failures:
            cpt += 1
            raise Exception("")

    with mock.patch('tf_yarn._setup_pyenvs'), \
            mock.patch('tf_yarn._setup_skein_cluster') as mock_setup_skein_cluster, \
            mock.patch('tf_yarn._run_on_cluster') as mock_run_on_cluster:
        mock_run_on_cluster.side_effect = fail

        gb = 2**10

        try:
            run_on_yarn("path/to/env",
                        lambda: Experiment(None, None, None),
                        task_specs={
                            "chief":
                            TaskSpec(memory=16 * gb, vcores=16),
                            "worker":
                            TaskSpec(memory=16 * gb, vcores=16, instances=1),
                            "ps":
                            TaskSpec(memory=16 * gb, vcores=16, instances=1)
                        },
                        nb_retries=nb_retries)
        except Exception:
            # Deliberately ignored: presumably run_on_yarn re-raises once the
            # retries are exhausted; only the call counts are checked here.
            pass

        nb_calls = min(nb_retries, nb_failures) + 1
        assert mock_run_on_cluster.call_count == nb_calls
        assert mock_setup_skein_cluster.call_count == nb_calls
def main():
    """Upload the Python environment and submit the experiment to YARN.

    The job uses ``tf_yarn.tasks.gloo_allred_task`` as its custom task
    module and ships the editable requirements, the ``winequality`` module
    file and this script.
    """
    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()

    safe_experiment_fn = get_safe_exp_fn()
    # One chief plus (HVD_SIZE - 1) workers, and a separate evaluator.
    cluster_layout = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }
    # Files are keyed by base name and map to their local paths.
    shipped_files = {
        **editable_requirements,
        os.path.basename(winequality.__file__): winequality.__file__,
        os.path.basename(__file__): __file__,
    }

    run_on_yarn(pyenv_zip_path,
                safe_experiment_fn,
                task_specs=cluster_layout,
                files=shipped_files,
                custom_task_module="tf_yarn.tasks.gloo_allred_task")
    # NOTE(review): by indentation these statements belong to the function
    # above, but they read like the body of a separate experiment-building
    # function whose `def` line is missing here (`fs`, `run_id`,
    # `train_input_fn` and `eval_input_fn` are not defined anywhere in this
    # view) — TODO confirm and restore the missing header.

    # Login name of the user running this process; used in the model directory.
    user = pwd.getpwuid(os.getuid()).pw_name
    config = tf.estimator.RunConfig(
        tf_random_seed=42, model_dir=f"{fs}/user/{user}/examples/{run_id}")
    # Linear classifier over the winequality feature columns and classes.
    estimator = tf.estimator.LinearClassifier(
        winequality.get_feature_columns(),
        n_classes=winequality.get_n_classes(),
        config=config)
    # Bundle the estimator with its train spec (max_steps=10) and eval spec.
    return Experiment(
        estimator, tf.estimator.TrainSpec(train_input_fn, max_steps=10),
        tf.estimator.EvalSpec(eval_input_fn,
                              steps=10,
                              start_delay_secs=0,
                              throttle_secs=30))


if __name__ == "__main__":
    # Exactly one command-line argument (the dataset path) is expected;
    # otherwise exit with the winequality module docstring as the message.
    if len(sys.argv) != 2:
        sys.exit(winequality.__doc__)
    dataset_path = sys.argv[1]

    logging.basicConfig(level="INFO")

    # run_on_yarn(..., num_cores=num_cores) can also be used, where
    # num_cores is a user-specified value; otherwise the default is 1.
    run_on_yarn(
        partial(experiment_fn, dataset_path),
        task_specs={
            "chief": TaskSpec(memory=2 * 2**10, vcores=4),
            "evaluator": TaskSpec(memory=2**10, vcores=1),
        },
        files={os.path.basename(winequality.__file__): winequality.__file__},
    )
Esempio n. 5
0
def main():
    """Define a Keras experiment on the wine-quality data and run it on YARN.

    The nested ``experiment_fn`` builds the model, optimizer, callbacks and
    input pipelines. The outer body uploads the Python environment and
    submits the job with the ``tf_yarn.tasks.gloo_allred_task`` task module.
    """

    def experiment_fn() -> KerasExperiment:
        """Build the KerasExperiment: model, training params and data fns."""

        def convert_to_tensor(x, y):
            # x is expected to expose .values(); they become one float32
            # tensor, and y becomes an int32 tensor.
            features = tf.convert_to_tensor(value=list(x.values()),
                                            dtype=tf.float32)
            label = tf.convert_to_tensor(value=y, dtype=tf.int32)
            return (features, label)

        def input_data_fn():
            # Training pipeline: map -> shuffle -> batch -> repeat.
            train_set = winequality.get_dataset(WINE_EQUALITY_FILE,
                                                split="train")
            train_set = train_set.map(convert_to_tensor)
            train_set = train_set.shuffle(1000)
            train_set = train_set.batch(128)
            return train_set.repeat()

        def validation_data_fn():
            # Validation pipeline: same steps as training, without repeat.
            test_set = winequality.get_dataset(WINE_EQUALITY_FILE,
                                               split="test")
            test_set = test_set.map(convert_to_tensor)
            test_set = test_set.shuffle(1000)
            return test_set.batch(128)

        # Dense stack: 11 inputs -> 300 (relu) -> 100 (relu) -> 10 (softmax).
        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Dense(units=300,
                                  activation="relu",
                                  input_shape=(11, )))
        for units, activation in ((100, "relu"), (10, "softmax")):
            model.add(tf.keras.layers.Dense(units=units,
                                            activation=activation))
        model.summary()

        # Adadelta is constructed with 1.0 * HVD_SIZE (presumably the
        # learning rate scaled by the Horovod size) and then wrapped by
        # Horovod's distributed optimizer.
        opt = hvd.DistributedOptimizer(
            tf.keras.optimizers.Adadelta(1.0 * HVD_SIZE))
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])

        # "{epoch}" stays a literal placeholder in the path handed to the
        # checkpoint callback.
        path_to_checkpoint = f"{HDFS_DIR}" + "/checkpoint-{epoch}"
        my_callbacks = [
            tf.keras.callbacks.ModelCheckpoint(path_to_checkpoint),
            hvd.keras.callbacks.BroadcastGlobalVariablesCallback(0),
        ]
        train_params = {"steps_per_epoch": 1000, "callbacks": my_callbacks}

        return KerasExperiment(model=model,
                               model_dir=HDFS_DIR,
                               train_params=train_params,
                               input_data_fn=input_data_fn,
                               target_data_fn=None,
                               validation_data_fn=validation_data_fn)

    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()

    # One chief plus (HVD_SIZE - 1) workers, and a separate evaluator.
    cluster_layout = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB",
                           vcores=4,
                           instances=(HVD_SIZE - 1)),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }
    # Files are keyed by base name and map to their local paths.
    shipped_files = {
        **editable_requirements,
        os.path.basename(winequality.__file__): winequality.__file__,
    }

    run_on_yarn(pyenv_zip_path,
                experiment_fn,
                task_specs=cluster_layout,
                files=shipped_files,
                custom_task_module="tf_yarn.tasks.gloo_allred_task")