Example #1
def main():
    # Force a call to model_to_estimator._save_first_checkpoint (line 457 of
    # https://github.com/tensorflow/estimator/blob/1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py);
    # otherwise there is a race condition when all workers try to save the
    # first checkpoint at the same time.
    experiment_fn(HDFS_DIR)

    pyenv_zip_path, env_name = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(pyenv_zip_path,
                get_safe_exp_fn(),
                task_specs={
                    "chief": TaskSpec(memory="2 GiB", vcores=4),
                    "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
                    "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
                    "evaluator": TaskSpec(memory="2 GiB", vcores=1)
                },
                files={
                    **editable_requirements,
                    os.path.basename(winequality.__file__): winequality.__file__,
                    os.path.basename(__file__): __file__
                })
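Note: the snippets in this listing refer to a few module-level names (HDFS_DIR, WINE_EQUALITY_FILE, HVD_SIZE, NB_WORKERS, NODE_NAME) whose definitions are not shown. The values below are only a hypothetical sketch of what the original example scripts might set up; the exact paths and sizes are assumptions.

import getpass
import time

USER = getpass.getuser()
# Assumed HDFS locations; adjust to your cluster.
WINE_EQUALITY_FILE = f"hdfs:///user/{USER}/tf_yarn_test/winequality-red.csv"
HDFS_DIR = f"hdfs:///user/{USER}/tf_yarn_test/{int(time.time())}"
# Assumed cluster sizing for the Horovod / all-reduce examples.
HVD_SIZE = 2          # 1 chief + (HVD_SIZE - 1) workers
NB_WORKERS = 2        # workers for the collective all-reduce example
NODE_NAME = "worker"  # job name used in the standalone client mode example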
Example #2
def main():
    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(
        pyenv_zip_path,
        get_safe_exp_fn(),
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
            os.path.basename(__file__): __file__,
        },
        custom_task_module="tf_yarn.tasks.gloo_allred_task"
    )
Example #3
def main():
    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()
    session_config = tf.ConfigProto(operation_timeout_in_ms=300000)
    with standalone_client_mode(
            pyenv_zip_path,
            task_specs={
                NODE_NAME: TaskSpec(memory="2 GiB", vcores=4, instances=2)
            },
            tf_session_config=session_config,
            files={
                **editable_requirements,
            },
            acls=skein.model.ACLs(
                enable=True,
                view_users=['*']
            )) as cluster_spec:
        size = 10000
        x = tf.placeholder(tf.float32, size)

        with tf.device(f"/job:{NODE_NAME}/task:1"):
            with tf.name_scope("scope_of_task1"):
                first_batch = tf.slice(x, [5000], [-1])
                mean1 = tf.reduce_mean(first_batch)

        with tf.device(f"/job:{NODE_NAME}/task:0"):
            with tf.name_scope("scope_of_task0"):
                second_batch = tf.slice(x, [0], [5000])
                mean2 = tf.reduce_mean(second_batch)
                mean = (mean1 + mean2) / 2

        cluster_spec_dict = cluster_spec.as_dict()
        first_task = next(iter(cluster_spec_dict.values()))[0]
        logger.info("cluster_spec:" + str(cluster_spec_dict))
        logger.info("connecting to target:" + first_task)

        with tf.Session(f"grpc://{first_task}", config=session_config) as sess:
            result = sess.run(mean, feed_dict={x: np.random.random(size)})
            print(f"mean = {result}")
Example #4
def main():
    def experiment_fn() -> Experiment:
        train_data, test_data = winequality.get_train_eval_datasets(WINE_EQUALITY_FILE)

        def convert_to_tensor(x, y):
            return (tf.convert_to_tensor(list(x.values()), dtype=tf.float32),
                    tf.convert_to_tensor(y, dtype=tf.int32))

        def train_input_fn():
            return (train_data.map(convert_to_tensor)
                    .shuffle(1000)
                    .batch(128)
                    .repeat()
                    .make_one_shot_iterator()
                    .get_next())

        def eval_input_fn():
            return (test_data.map(convert_to_tensor)
                    .shuffle(1000)
                    .batch(128)
                    .make_one_shot_iterator()
                    .get_next())

        model = keras.Sequential()
        model.add(keras.layers.Dense(units=300, activation="relu", input_shape=(11,)))
        model.add(keras.layers.Dense(units=100, activation="relu"))
        model.add(keras.layers.Dense(units=10, activation="softmax"))
        model.summary()
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer="sgd",
                      metrics=['accuracy'])

        config = tf.estimator.RunConfig(model_dir=HDFS_DIR)
        estimator = tf.keras.estimator.model_to_estimator(model, config=config)
        return Experiment(
            estimator,
            tf.estimator.TrainSpec(
                train_input_fn,
                max_steps=1000),
            tf.estimator.EvalSpec(
                eval_input_fn,
                steps=10,
                start_delay_secs=0,
                throttle_secs=30))

    # Force a call to model_to_estimator._save_first_checkpoint (line 457 of
    # https://github.com/tensorflow/estimator/blob/1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py);
    # otherwise there is a race condition when all workers try to save the
    # first checkpoint at the same time.
    experiment_fn()

    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()
    run_on_yarn(
        pyenv_zip_path,
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
            "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
        },
        acls=skein.model.ACLs(
            enable=True,
            view_users=['*']
        )
    )
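Several of these examples ship a winequality helper module next to the job (via the files= argument) and call winequality.get_train_eval_datasets, winequality.get_dataset and winequality.get_n_classes. The module itself is not part of this listing; the sketch below only illustrates the interface the snippets rely on, assuming the UCI wine quality CSV layout (11 numeric features, ';'-delimited, quality label in the last column). The actual helper shipped with tf_yarn's examples may differ.

import tensorflow as tf

FEATURES = ["fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar",
            "chlorides", "free_sulfur_dioxide", "total_sulfur_dioxide", "density",
            "pH", "sulphates", "alcohol"]


def get_n_classes():
    # Quality scores are integers in [0, 10).
    return 10


def get_dataset(path, split="train"):
    # Parse each CSV line into ({feature_name: value}, label).
    def parse(line):
        fields = tf.io.decode_csv(line, record_defaults=[[0.0]] * 11 + [[0]],
                                  field_delim=";")
        return dict(zip(FEATURES, fields[:-1])), fields[-1]

    dataset = tf.data.TextLineDataset(path).skip(1).map(parse)
    # Crude deterministic split for illustration: even lines train, odd lines test.
    return dataset.shard(2, 0) if split == "train" else dataset.shard(2, 1)


def get_train_eval_datasets(path):
    return get_dataset(path, split="train"), get_dataset(path, split="test")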
Example #5
def model_fn(features, labels, mode):
    x = features["x"]
    loss = tf.losses.mean_squared_error(x, labels)
    train_op = tf.assign_add(tf.train.get_global_step(), 1)
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      predictions={"x": x},
                                      eval_metric_ops={})


def experiment_fn() -> Experiment:
    def input_fn():
        x = tf.constant([[1.0], [2.0], [3.0], [4.0]])
        return {"x": x}, x

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    train_spec = tf.estimator.TrainSpec(input_fn, max_steps=1)
    eval_spec = tf.estimator.EvalSpec(input_fn, steps=1)
    return Experiment(estimator, train_spec, eval_spec)


if __name__ == "__main__":
    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()
    # A shared skein.Client is useful when multiple training jobs run in parallel
    # and share a single skein Java process
    with skein.Client() as client:
        run_on_yarn(pyenv_zip_path,
                    experiment_fn,
                    task_specs={"chief": TaskSpec(memory=64, vcores=1)},
                    files={
                        **editable_requirements,
                    },
                    acls=skein.model.ACLs(enable=True, view_users=['*']),
                    skein_client=client)
Example #6
def model_fn(features, labels, mode):
    x = features["x"]
    loss = tf.losses.mean_squared_error(x, labels)
    train_op = tf.assign_add(tf.train.get_global_step(), 1)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        predictions={"x": x},
        eval_metric_ops={})


def experiment_fn() -> Experiment:
    def input_fn():
        x = tf.constant([[1.0], [2.0], [3.0], [4.0]])
        return {"x": x}, x

    config = tf.estimator.RunConfig()
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
    train_spec = tf.estimator.TrainSpec(input_fn, max_steps=1)
    eval_spec = tf.estimator.EvalSpec(input_fn, steps=1)
    return Experiment(estimator, train_spec, eval_spec)


if __name__ == "__main__":
    logging.basicConfig(level="INFO")

    with TFYarnExecutor() as tf_yarn_executor:
        tf_yarn_executor.run_on_yarn(experiment_fn, task_specs={
            "chief": TaskSpec(memory=64, vcores=1)
        })
Example #7
def eval_input_fn():
    train_data, test_data = winequality.get_train_eval_datasets(
        WINE_EQUALITY_FILE)
    return (test_data.shuffle(1000).batch(128).repeat())


if __name__ == "__main__":
    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()

    with standalone_client_mode(
            pyenv_zip_path,
            task_specs={
                "worker": TaskSpec(memory="2 GiB", vcores=4, instances=2)
            },
            files=editable_requirements,
            acls=skein.model.ACLs(enable=True,
                                  view_users=['*'])) as cluster_spec:

        distrib_config = tf.contrib.distribute.DistributeConfig(
            train_distribute=tf.contrib.distribute.CollectiveAllReduceStrategy(
            ),
            eval_distribute=tf.contrib.distribute.CollectiveAllReduceStrategy(
            ),
            remote_cluster=cluster_spec)
        run_config = tf.estimator.RunConfig(
            experimental_distribute=distrib_config)

        estimator = tf.estimator.LinearClassifier(
Example #8
def model_fn(features, labels, mode):
    x = features["x"]
    loss = tf.losses.mean_squared_error(x, labels)
    train_op = tf.assign_add(tf.train.get_global_step(), 1)
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      predictions={"x": x},
                                      eval_metric_ops={})


def experiment_fn() -> Experiment:
    def input_fn():
        x = tf.constant([[1.0], [2.0], [3.0], [4.0]])
        return {"x": x}, x

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    train_spec = tf.estimator.TrainSpec(input_fn, max_steps=1)
    eval_spec = tf.estimator.EvalSpec(input_fn, steps=1)
    return Experiment(estimator, train_spec, eval_spec)


if __name__ == "__main__":
    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()
    # A shared skein.Client is useful when multiple training jobs run in parallel
    # and share a single skein Java process
    with skein.Client() as client:
        run_on_yarn(pyenv_zip_path,
                    experiment_fn,
                    task_specs={"chief": TaskSpec(memory="1 GiB", vcores=1)},
                    files={
                        **editable_requirements,
                    },
                    acls=skein.model.ACLs(enable=True, view_users=['*']),
                    skein_client=client)
Example #9
                               max_steps=10,
                               hooks=[hvd.BroadcastGlobalVariablesHook(0)]),
        tf.estimator.EvalSpec(eval_input_fn,
                              steps=10,
                              start_delay_secs=0,
                              throttle_secs=30))


if __name__ == "__main__":
    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()

    run_on_yarn(pyenv_zip_path,
                experiment_fn,
                task_specs={
                    "chief": TaskSpec(memory="2 GiB", vcores=4),
                    "worker": TaskSpec(memory="2 GiB", vcores=4, instances=1),
                    "evaluator": TaskSpec(memory="2 GiB", vcores=1),
                    "tensorboard": TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR)
                },
                files={
                    **editable_requirements,
                    os.path.basename(winequality.__file__): winequality.__file__,
                },
                custom_task_module="tf_yarn.tasks._gloo_allred_task")
Example #10

if __name__ == "__main__":

    # You need to install mlflow (`pip install mlflow`)
    # and set the MLflow tracking URI
    mlflow.set_tracking_uri(os.getenv("CRITEO_MLFLOW_TRACKING_URI", ""))
    run_id = mlflow.start_run(experiment_id=77).info.run_id

    pyenv_zip_path, env_name = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()

    run_on_yarn(pyenv_zip_path,
                experiment_fn,
                task_specs={
                    "chief": TaskSpec(memory="1 GiB", vcores=1),
                    "evaluator": TaskSpec(memory="1 GiB", vcores=1)
                },
                files={
                    **editable_requirements,
                    os.path.basename(winequality.__file__): winequality.__file__,
                })

    mlflow.end_run()

    # Check that the run has been registered in MLflow
    run_json = requests.get(
        f"{mlflow.get_tracking_uri()}/api/2.0/mlflow/runs/get",
        params={'run_id': run_id})
Example #11
    # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
    import tensorflow as tf

    def input_fn():
        x = tf.constant([[1.0], [2.0], [3.0], [4.0]])
        return {"x": x}, x

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    train_spec = tf.estimator.TrainSpec(input_fn, max_steps=1)
    eval_spec = tf.estimator.EvalSpec(input_fn, steps=1)
    return Experiment(estimator, train_spec, eval_spec)


if __name__ == "__main__":
    pyenv_zip_path, env_name = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    # A shared skein.Client is useful when multiple training jobs run in parallel
    # and share a single skein Java process
    with skein.Client() as client:
        run_on_yarn(
            pyenv_zip_path,
            experiment_fn,
            task_specs={
                "chief": TaskSpec(memory="1 GiB", vcores=1)
            },
            files={
                **editable_requirements,
            },
            skein_client=client
        )
Example #12
        n_classes=winequality.get_n_classes())
    return Experiment(
        estimator,
        tf.estimator.TrainSpec(train_input_fn, max_steps=10),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30))


if __name__ == "__main__":
    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()
    run_on_yarn(
        pyenv_zip_path,
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory=2 * 2 ** 10, vcores=4),
            "evaluator": TaskSpec(memory=2 ** 10, vcores=1)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
        },
        acls=skein.model.ACLs(
            enable=True,
            view_users=['*']
        )
    )
Example #13
        n_classes=winequality.get_n_classes())

    train_spec = tf.estimator.TrainSpec(
        train_input_fn,
        max_steps=1000,
        hooks=[BroadcastGlobalVariablesHook()]
    )
    return Experiment(estimator, train_spec, tf.estimator.EvalSpec(lambda: True))


if __name__ == "__main__":

    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()

    with skein.Client() as client:
        run_on_yarn(
            pyenv_zip_path,
            experiment_fn,
            task_specs={
                "chief": TaskSpec(memory="1 GiB", vcores=1),
                "worker": TaskSpec(memory="1 GiB", vcores=1, instances=NB_WORKERS)
            },
            files={
                **editable_requirements,
                os.path.basename(winequality.__file__): winequality.__file__,
            },
            skein_client=client,
            custom_task_module="tf_collective_all_reduce.python.tf_yarn._rabit_allred_task"
        )
Example #14
def main():
    def experiment_fn() -> KerasExperiment:
        def convert_to_tensor(x, y):
            return (tf.convert_to_tensor(value=list(x.values()),
                                         dtype=tf.float32),
                    tf.convert_to_tensor(value=y, dtype=tf.int32))

        def input_data_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE,
                                              split="train")
            return (dataset.map(convert_to_tensor)
                    .shuffle(1000)
                    .batch(128)
                    .repeat())

        def validation_data_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
            return (dataset.map(convert_to_tensor).shuffle(1000).batch(128))

        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Dense(units=300,
                                  activation="relu",
                                  input_shape=(11, )))
        model.add(tf.keras.layers.Dense(units=100, activation="relu"))
        model.add(tf.keras.layers.Dense(units=10, activation="softmax"))
        model.summary()
        opt = tf.keras.optimizers.Adadelta(1.0 * HVD_SIZE)
        opt = hvd.DistributedOptimizer(opt)
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])
        path_to_checkpoint = f"{HDFS_DIR}" + "/checkpoint-{epoch}"
        my_callbacks = [
            tf.keras.callbacks.ModelCheckpoint(path_to_checkpoint),
            hvd.keras.callbacks.BroadcastGlobalVariablesCallback(0),
        ]
        train_params = {"steps_per_epoch": 1000, "callbacks": my_callbacks}
        return KerasExperiment(model=model,
                               model_dir=HDFS_DIR,
                               train_params=train_params,
                               input_data_fn=input_data_fn,
                               target_data_fn=None,
                               validation_data_fn=validation_data_fn)

    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(pyenv_zip_path,
                experiment_fn,
                task_specs={
                    "chief": TaskSpec(memory="2 GiB", vcores=4),
                    "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
                    "evaluator": TaskSpec(memory="2 GiB", vcores=1)
                },
                files={
                    **editable_requirements,
                    os.path.basename(winequality.__file__): winequality.__file__,
                },
                custom_task_module="tf_yarn.tasks.gloo_allred_task")
Example #15
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]
        ),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30
        )
    )


if __name__ == "__main__":
    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()

    run_on_yarn(
        pyenv_zip_path,
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=1),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1),
            "tensorboard": TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
        },
        custom_task_module="tf_yarn.tasks.gloo_allred_task"
    )