def main():
    fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    if not fs.exists(WINE_EQUALITY_FILE):
        raise Exception(f"{WINE_EQUALITY_FILE} not found")

    run_on_yarn(
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            # the chief counts towards HVD_SIZE, hence HVD_SIZE - 1 extra workers
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1)
        },
        files={
            os.path.basename(winequality.__file__): winequality.__file__,
            os.path.basename(__file__): __file__,
        },
        custom_task_module="tf_yarn.tensorflow.tasks.gloo_allred_task"
    )
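# The example above assumes a few module-level constants defined earlier in the
# script; the values below are illustrative placeholders, not the original ones.
import getpass

USER = getpass.getuser()
WINE_EQUALITY_FILE = f"hdfs:///user/{USER}/tf_yarn_test/winequality-red.csv"  # hypothetical path
HDFS_DIR = f"hdfs:///user/{USER}/tf_yarn_test/model"  # hypothetical model dir, used by the TensorBoard examples below
HVD_SIZE = 2  # total number of Horovod processes (chief included)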
def main():
    fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    if not fs.exists(WINE_EQUALITY_FILE):
        raise Exception(f"{WINE_EQUALITY_FILE} not found")

    # Force a call to model_to_estimator._save_first_checkpoint (l. 457 of
    # https://github.com/tensorflow/estimator/blob/1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py),
    # otherwise there is a race condition when all workers try to save the
    # first checkpoint at the same time.
    experiment_fn()

    run_on_yarn(
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
            "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1)
        },
        files={
            os.path.basename(winequality.__file__): winequality.__file__,
            os.path.basename(__file__): __file__
        }
    )
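# A minimal sketch of the kind of experiment_fn the workaround above targets: a
# Keras model converted with tf.keras.estimator.model_to_estimator, which writes
# its first checkpoint lazily. The model, the synthetic input_fn and the model_dir
# below are illustrative assumptions, not the original example's definitions.
import tensorflow as tf
from tf_yarn.tensorflow import Experiment


def keras_experiment_fn() -> Experiment:
    def input_fn():
        x = tf.random.uniform([32, 11])
        y = tf.zeros([32], dtype=tf.int32)
        return {"features": x}, y

    inputs = tf.keras.Input(shape=(11,), name="features")
    hidden = tf.keras.layers.Dense(32, activation="relu")(inputs)
    outputs = tf.keras.layers.Dense(10, activation="softmax")(hidden)
    model = tf.keras.Model(inputs, outputs)
    model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd")

    estimator = tf.keras.estimator.model_to_estimator(
        keras_model=model, model_dir="hdfs:///tmp/tf_yarn_keras_sketch")  # hypothetical dir
    return Experiment(
        estimator,
        tf.estimator.TrainSpec(input_fn, max_steps=100),
        tf.estimator.EvalSpec(input_fn, steps=10, start_delay_secs=0, throttle_secs=30))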
experiment_name = "tf-yarn-tests"
exp = mlflow.get_experiment_by_name(experiment_name)
if not exp:
    experiment_id = mlflow.create_experiment(
        experiment_name,
        f"{_get_fs_for_tests()}/user/{USER}/mlflow_artifacts")
else:
    experiment_id = exp.experiment_id
run_id = mlflow.start_run(experiment_id=experiment_id).info.run_id

run_on_yarn(
    experiment_fn,
    task_specs={
        "chief": TaskSpec(memory="1 GiB", vcores=1),
        "evaluator": TaskSpec(memory="1 GiB", vcores=1)
    },
    files={
        os.path.basename(winequality.__file__): winequality.__file__,
    })

mlflow.end_run()

# check that the run has been registered in MLflow via the REST API
run_json = requests.get(
    f"{mlflow.get_tracking_uri()}/api/2.0/mlflow/runs/get",
    params={"run_id": run_id}).json()
logger.info(f"created run: {run_json}")
    return Experiment(
        estimator,
        tf.estimator.TrainSpec(
            train_input_fn,
            max_steps=10,
            # broadcast the initial variables from rank 0 to all Horovod workers
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]
        ),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30
        )
    )


if __name__ == "__main__":
    run_on_yarn(
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=1),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1),
            "tensorboard": TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR)
        },
        files={
            os.path.basename(winequality.__file__): winequality.__file__,
        },
        custom_task_module="tf_yarn.tensorflow.tasks.gloo_allred_task"
    )
    estimator = tf.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=HDFS_DIR,
        n_classes=winequality.get_n_classes())
    return Experiment(
        estimator,
        tf.estimator.TrainSpec(train_input_fn, max_steps=100),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30))


if __name__ == "__main__":
    fs, _ = filesystem.resolve_filesystem_and_path(WINE_EQUALITY_FILE)
    if not fs.exists(WINE_EQUALITY_FILE):
        raise Exception(f"{WINE_EQUALITY_FILE} not found")

    run_on_yarn(
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1),
            "tensorboard": TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR)
        },
        files={
            os.path.basename(winequality.__file__): winequality.__file__,
        })
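# The experiment above relies on train_input_fn / eval_input_fn defined elsewhere
# in the example. A hypothetical sketch of such input functions, assuming the
# winequality CSV is semicolon-separated with a "quality" label column (the
# original example builds them from its winequality helper module instead):
import tensorflow as tf


def make_input_fn(batch_size=128, shuffle=True):
    def input_fn():
        return tf.data.experimental.make_csv_dataset(
            WINE_EQUALITY_FILE,
            batch_size=batch_size,
            label_name="quality",
            field_delim=";",
            shuffle=shuffle,
            num_epochs=None if shuffle else 1)
    return input_fn


train_input_fn = make_input_fn()
eval_input_fn = make_input_fn(shuffle=False)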
    loss = tf.compat.v1.losses.mean_squared_error(x, labels)
    train_op = tf.compat.v1.assign_add(tf.compat.v1.train.get_global_step(), 1)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        predictions={"x": x},
        eval_metric_ops={})


def experiment_fn() -> Experiment:
    # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15,
    # import tensorflow inside the function rather than at module level.
    import tensorflow as tf

    def input_fn():
        x = tf.constant([[1.0], [2.0], [3.0], [4.0]])
        return {"x": x}, x

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    train_spec = tf.estimator.TrainSpec(input_fn, max_steps=1)
    eval_spec = tf.estimator.EvalSpec(input_fn, steps=1)
    return Experiment(estimator, train_spec, eval_spec)


if __name__ == "__main__":
    # skein.Client is useful when multiple trainings run in parallel
    # and share a single skein Java process.
    with skein.Client() as client:
        run_on_yarn(
            experiment_fn,
            task_specs={"chief": TaskSpec(memory="1 GiB", vcores=1)},
            skein_client=client)
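# Hypothetical imports assumed by the snippet above (the tf_yarn import path
# matches the other examples here); the opening fragment is the body of a
# model_fn(features, labels, mode) where, presumably, x = features["x"].
import skein
import tensorflow as tf
from tf_yarn.tensorflow import Experiment, TaskSpec, run_on_yarn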