def main():
    # Force a call to model_to_estimator._save_first_checkpoint (l457 in
    # https://github.com/tensorflow/estimator/blob/1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py);
    # otherwise there is a race condition when all workers try to save the
    # first checkpoint at the same time.
    experiment_fn(HDFS_DIR)

    pyenv_zip_path, env_name = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(
        pyenv_zip_path,
        get_safe_exp_fn(),
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
            "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
            os.path.basename(__file__): __file__
        })
def main():
    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(
        pyenv_zip_path,
        get_safe_exp_fn(),
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
            os.path.basename(__file__): __file__,
        },
        custom_task_module="tf_yarn.tasks.gloo_allred_task"
    )
def main():
    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()
    session_config = tf.ConfigProto(operation_timeout_in_ms=300000)
    with standalone_client_mode(
            pyenv_zip_path,
            task_specs={
                NODE_NAME: TaskSpec(memory="2 GiB", vcores=4, instances=2)
            },
            tf_session_config=session_config,
            files={
                **editable_requirements,
            },
            acls=skein.model.ACLs(
                enable=True,
                view_users=['*']
            )) as cluster_spec:
        size = 10000
        x = tf.placeholder(tf.float32, size)

        with tf.device(f"/job:{NODE_NAME}/task:1"):
            with tf.name_scope("scope_of_task1"):
                first_batch = tf.slice(x, [5000], [-1])
                mean1 = tf.reduce_mean(first_batch)

        with tf.device(f"/job:{NODE_NAME}/task:0"):
            with tf.name_scope("scope_of_task0"):
                second_batch = tf.slice(x, [0], [5000])
                mean2 = tf.reduce_mean(second_batch)

        mean = (mean1 + mean2) / 2

        cluster_spec_dict = cluster_spec.as_dict()
        first_task = next(iter(cluster_spec_dict.values()))[0]
        logger.info("cluster_spec:" + str(cluster_spec_dict))
        logger.info("connecting to target:" + first_task)
        with tf.Session(f"grpc://{first_task}", config=session_config) as sess:
            result = sess.run(mean, feed_dict={x: np.random.random(size)})
            print(f"mean = {result}")
def main():
    def experiment_fn() -> Experiment:
        train_data, test_data = winequality.get_train_eval_datasets(WINE_EQUALITY_FILE)

        def convert_to_tensor(x, y):
            return (tf.convert_to_tensor(list(x.values()), dtype=tf.float32),
                    tf.convert_to_tensor(y, dtype=tf.int32))

        def train_input_fn():
            return (train_data.map(convert_to_tensor)
                    .shuffle(1000)
                    .batch(128)
                    .repeat()
                    .make_one_shot_iterator()
                    .get_next())

        def eval_input_fn():
            return (test_data.map(convert_to_tensor)
                    .shuffle(1000)
                    .batch(128)
                    .make_one_shot_iterator()
                    .get_next())

        model = keras.Sequential()
        model.add(keras.layers.Dense(units=300, activation="relu", input_shape=(11,)))
        model.add(keras.layers.Dense(units=100, activation="relu"))
        model.add(keras.layers.Dense(units=10, activation="softmax"))
        model.summary()
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer="sgd",
                      metrics=['accuracy'])

        config = tf.estimator.RunConfig(model_dir=HDFS_DIR)
        estimator = tf.keras.estimator.model_to_estimator(model, config=config)

        return Experiment(
            estimator,
            tf.estimator.TrainSpec(
                train_input_fn,
                max_steps=1000),
            tf.estimator.EvalSpec(
                eval_input_fn,
                steps=10,
                start_delay_secs=0,
                throttle_secs=30))

    # Force a call to model_to_estimator._save_first_checkpoint (l457 in
    # https://github.com/tensorflow/estimator/blob/1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py);
    # otherwise there is a race condition when all workers try to save the
    # first checkpoint at the same time.
    experiment_fn()

    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()
    run_on_yarn(
        pyenv_zip_path,
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
            "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
        },
        acls=skein.model.ACLs(
            enable=True,
            view_users=['*']
        )
    )
        predictions={"x": x},
        eval_metric_ops={})


def experiment_fn() -> Experiment:
    def input_fn():
        x = tf.constant([[1.0], [2.0], [3.0], [4.0]])
        return {"x": x}, x

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    train_spec = tf.estimator.TrainSpec(input_fn, max_steps=1)
    eval_spec = tf.estimator.EvalSpec(input_fn, steps=1)
    return Experiment(estimator, train_spec, eval_spec)


if __name__ == "__main__":
    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()

    # skein.Client is useful when multiple training runs execute in parallel
    # and share a single Skein Java process.
    with skein.Client() as client:
        run_on_yarn(
            pyenv_zip_path,
            experiment_fn,
            task_specs={"chief": TaskSpec(memory=64, vcores=1)},
            files={
                **editable_requirements,
            },
            acls=skein.model.ACLs(enable=True, view_users=['*']),
            skein_client=client)
    loss = tf.losses.mean_squared_error(x, labels)
    train_op = tf.assign_add(tf.train.get_global_step(), 1)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        predictions={"x": x},
        eval_metric_ops={})


def experiment_fn() -> Experiment:
    def input_fn():
        x = tf.constant([[1.0], [2.0], [3.0], [4.0]])
        return {"x": x}, x

    config = tf.estimator.RunConfig()
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
    train_spec = tf.estimator.TrainSpec(input_fn, max_steps=1)
    eval_spec = tf.estimator.EvalSpec(input_fn, steps=1)
    return Experiment(estimator, train_spec, eval_spec)


if __name__ == "__main__":
    logging.basicConfig(level="INFO")
    with TFYarnExecutor() as tf_yarn_executor:
        tf_yarn_executor.run_on_yarn(
            experiment_fn,
            task_specs={
                "chief": TaskSpec(memory=64, vcores=1)
            })
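# The two snippets above each begin partway through their toy model_fn. For
# reference, a minimal complete version in the same style might look like the
# sketch below; the def line and the features["x"] lookup are assumptions, only
# the loss/train_op/EstimatorSpec lines come from the fragment above (TF 1.x API).
def model_fn(features, labels, mode):
    # "Identity" model: echo the input back as the prediction and use a
    # dummy train_op that just increments the global step.
    x = features["x"]
    loss = tf.losses.mean_squared_error(x, labels)
    train_op = tf.assign_add(tf.train.get_global_step(), 1)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        predictions={"x": x},
        eval_metric_ops={})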
def eval_input_fn():
    train_data, test_data = winequality.get_train_eval_datasets(WINE_EQUALITY_FILE)
    return (test_data.shuffle(1000).batch(128).repeat())


if __name__ == "__main__":
    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()
    with standalone_client_mode(
            pyenv_zip_path,
            task_specs={
                "worker": TaskSpec(memory="2 GiB", vcores=4, instances=2)
            },
            files=editable_requirements,
            acls=skein.model.ACLs(enable=True, view_users=['*'])) as cluster_spec:
        distrib_config = tf.contrib.distribute.DistributeConfig(
            train_distribute=tf.contrib.distribute.CollectiveAllReduceStrategy(),
            eval_distribute=tf.contrib.distribute.CollectiveAllReduceStrategy(),
            remote_cluster=cluster_spec)
        run_config = tf.estimator.RunConfig(
            experimental_distribute=distrib_config)
        estimator = tf.estimator.LinearClassifier(
        predictions={"x": x},
        eval_metric_ops={})


def experiment_fn() -> Experiment:
    def input_fn():
        x = tf.constant([[1.0], [2.0], [3.0], [4.0]])
        return {"x": x}, x

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    train_spec = tf.estimator.TrainSpec(input_fn, max_steps=1)
    eval_spec = tf.estimator.EvalSpec(input_fn, steps=1)
    return Experiment(estimator, train_spec, eval_spec)


if __name__ == "__main__":
    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()

    # skein.Client is useful when multiple training runs execute in parallel
    # and share a single Skein Java process.
    with skein.Client() as client:
        run_on_yarn(
            pyenv_zip_path,
            experiment_fn,
            task_specs={"chief": TaskSpec(memory="1 GiB", vcores=1)},
            files={
                **editable_requirements,
            },
            acls=skein.model.ACLs(enable=True, view_users=['*']),
            skein_client=client)
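# A minimal sketch of the pattern described by the comment above: several
# trainings submitted through one shared skein.Client (sequentially here, for
# brevity), so they reuse a single Skein Java process instead of each spawning
# their own. experiment_fn_a and experiment_fn_b are hypothetical experiment
# builders standing in for two different trainings.
pyenv_zip_path, _ = packaging.upload_env_to_hdfs()
with skein.Client() as client:
    for exp_fn in (experiment_fn_a, experiment_fn_b):
        run_on_yarn(
            pyenv_zip_path,
            exp_fn,
            task_specs={"chief": TaskSpec(memory="1 GiB", vcores=1)},
            skein_client=client)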
            max_steps=10,
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30))


if __name__ == "__main__":
    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(
        pyenv_zip_path,
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=1),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1),
            "tensorboard": TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
        },
        custom_task_module="tf_yarn.tasks._gloo_allred_task")
if __name__ == "__main__":
    # You need to install MLflow (`pip install mlflow`)
    # and set the MLflow tracking URI.
    mlflow.set_tracking_uri(os.getenv("CRITEO_MLFLOW_TRACKING_URI", ""))
    run_id = mlflow.start_run(experiment_id=77).info.run_id

    pyenv_zip_path, env_name = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(
        pyenv_zip_path,
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="1 GiB", vcores=1),
            "evaluator": TaskSpec(memory="1 GiB", vcores=1)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
        })

    mlflow.end_run()

    # Check that the run has been registered in MLflow.
    run_json = requests.get(
        f"{mlflow.get_tracking_uri()}/api/2.0/mlflow/runs/get",
        params={
            'run_id': run_id
    # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
    import tensorflow as tf

    def input_fn():
        x = tf.constant([[1.0], [2.0], [3.0], [4.0]])
        return {"x": x}, x

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    train_spec = tf.estimator.TrainSpec(input_fn, max_steps=1)
    eval_spec = tf.estimator.EvalSpec(input_fn, steps=1)
    return Experiment(estimator, train_spec, eval_spec)


if __name__ == "__main__":
    pyenv_zip_path, env_name = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()

    # skein.Client is useful when multiple training runs execute in parallel
    # and share a single Skein Java process.
    with skein.Client() as client:
        run_on_yarn(
            pyenv_zip_path,
            experiment_fn,
            task_specs={
                "chief": TaskSpec(memory="1 GiB", vcores=1)
            },
            files={
                **editable_requirements,
            },
            skein_client=client
        )
        n_classes=winequality.get_n_classes())

    return Experiment(
        estimator,
        tf.estimator.TrainSpec(train_input_fn, max_steps=10),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30))


if __name__ == "__main__":
    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()
    run_on_yarn(
        pyenv_zip_path,
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory=2 * 2 ** 10, vcores=4),
            "evaluator": TaskSpec(memory=2 ** 10, vcores=1)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
        },
        acls=skein.model.ACLs(
            enable=True,
            view_users=['*']
        )
    )
        n_classes=winequality.get_n_classes())
    train_spec = tf.estimator.TrainSpec(
        train_input_fn,
        max_steps=1000,
        hooks=[BroadcastGlobalVariablesHook()]
    )
    return Experiment(estimator, train_spec, tf.estimator.EvalSpec(lambda: True))


if __name__ == "__main__":
    pyenv_zip_path, env_name = packaging.upload_env_to_hdfs()
    editable_requirements = packaging.get_editable_requirements_from_current_venv()
    with skein.Client() as client:
        run_on_yarn(
            pyenv_zip_path,
            experiment_fn,
            task_specs={
                "chief": TaskSpec(memory="1 GiB", vcores=1),
                "worker": TaskSpec(memory="1 GiB", vcores=1, instances=NB_WORKERS)
            },
            files={
                **editable_requirements,
                os.path.basename(winequality.__file__): winequality.__file__,
            },
            skein_client=client,
            custom_task_module="tf_collective_all_reduce.python.tf_yarn._rabit_allred_task"
        )
def main():
    def experiment_fn() -> KerasExperiment:
        def convert_to_tensor(x, y):
            return (tf.convert_to_tensor(value=list(x.values()), dtype=tf.float32),
                    tf.convert_to_tensor(value=y, dtype=tf.int32))

        def input_data_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
            return (dataset.map(convert_to_tensor)
                    .shuffle(1000)
                    .batch(128)
                    .repeat())

        def validation_data_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
            return (dataset.map(convert_to_tensor)
                    .shuffle(1000)
                    .batch(128))

        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(units=300, activation="relu", input_shape=(11,)))
        model.add(tf.keras.layers.Dense(units=100, activation="relu"))
        model.add(tf.keras.layers.Dense(units=10, activation="softmax"))
        model.summary()

        opt = tf.keras.optimizers.Adadelta(1.0 * HVD_SIZE)
        opt = hvd.DistributedOptimizer(opt)
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])

        path_to_checkpoint = f"{HDFS_DIR}" + "/checkpoint-{epoch}"
        my_callbacks = [
            tf.keras.callbacks.ModelCheckpoint(path_to_checkpoint),
            hvd.keras.callbacks.BroadcastGlobalVariablesCallback(0),
        ]
        train_params = {"steps_per_epoch": 1000, "callbacks": my_callbacks}

        return KerasExperiment(
            model=model,
            model_dir=HDFS_DIR,
            train_params=train_params,
            input_data_fn=input_data_fn,
            target_data_fn=None,
            validation_data_fn=validation_data_fn)

    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(
        pyenv_zip_path,
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
        },
        custom_task_module="tf_yarn.tasks.gloo_allred_task")
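# Hypothetical module-level constants assumed by the Horovod examples above
# and below; the names match the snippets, but the values here are
# illustrative only and must be adapted to your cluster.
import getpass
import time

USER = getpass.getuser()
WINE_EQUALITY_FILE = f"hdfs:///user/{USER}/tf_yarn_test/winequality-red.csv"
HDFS_DIR = f"hdfs:///user/{USER}/tf_yarn_test/tf_yarn_{int(time.time())}"
HVD_SIZE = 2  # total number of Horovod workers (chief + workers)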
            hooks=[hvd.BroadcastGlobalVariablesHook(0)]
        ),
        tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30
        )
    )


if __name__ == "__main__":
    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(
        pyenv_zip_path,
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=1),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1),
            "tensorboard": TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR)
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
        },
        custom_task_module="tf_yarn.tasks.gloo_allred_task"
    )