def main():
    """Call ``experiment_fn`` once, then pass the experiment to ``run_on_yarn``.

    The task layout requested is one chief, four workers, two parameter
    servers ("ps") and one evaluator.
    """
    # Calling experiment_fn here forces the call to
    # model_to_estimator._save_first_checkpoint (l457), see
    # https://github.com/tensorflow/estimator/blob/ \
    # 1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py
    # Otherwise there is a race condition when all workers try to save the
    # first checkpoint at the same time.
    experiment_fn(HDFS_DIR)

    # Only the first element returned by upload_env() is used below.
    packed_env_path, _ = cluster_pack.upload_env()
    editable_packages = cluster_pack.get_editable_requirements()
    safe_experiment_fn = get_safe_exp_fn()

    cluster_layout = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
        "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }

    # Editable requirements first, then the winequality module and this
    # script itself, each keyed by its base file name.
    shipped_files = {**editable_packages}
    shipped_files[os.path.basename(winequality.__file__)] = winequality.__file__
    shipped_files[os.path.basename(__file__)] = __file__

    run_on_yarn(
        packed_env_path,
        safe_experiment_fn,
        task_specs=cluster_layout,
        files=shipped_files,
    )
def test_retry_run_on_yarn(nb_retries, nb_failures):
    """Check how often ``run_on_yarn`` re-runs the cluster after failures.

    ``tf_yarn._run_on_cluster`` is patched so that its first ``nb_failures``
    calls raise and every later call returns normally. The test then asserts
    that both ``_run_on_cluster`` and ``_setup_skein_cluster`` were called
    ``min(nb_retries, nb_failures) + 1`` times.

    Args:
        nb_retries: value forwarded to ``run_on_yarn(nb_retries=...)``.
        nb_failures: number of initial ``_run_on_cluster`` calls that raise.
            (Both arguments are presumably supplied by a pytest
            parametrization outside this view.)
    """
    failures_raised = 0

    def fail(*args, **kwargs):
        # The counter must be advanced here; without ``nonlocal`` + increment
        # the mock would raise on every call and never simulate a recovery.
        nonlocal failures_raised
        if failures_raised < nb_failures:
            failures_raised += 1
            raise Exception("")

    with mock.patch('tf_yarn._setup_pyenvs'), \
            mock.patch('tf_yarn._setup_skein_cluster') as mock_setup_skein_cluster, \
            mock.patch('tf_yarn._run_on_cluster') as mock_run_on_cluster:
        mock_run_on_cluster.side_effect = fail

        gb = 2**10

        try:
            run_on_yarn(
                "path/to/env",
                lambda: Experiment(None, None, None),
                task_specs={
                    "chief": TaskSpec(memory=16 * gb, vcores=16),
                    "worker": TaskSpec(memory=16 * gb, vcores=16, instances=1),
                    "ps": TaskSpec(memory=16 * gb, vcores=16, instances=1)
                },
                nb_retries=nb_retries
            )
        except Exception:
            # Deliberately ignored: when failures outnumber retries the last
            # simulated error propagates, and only the call counts matter.
            pass

        nb_calls = min(nb_retries, nb_failures) + 1
        assert mock_run_on_cluster.call_count == nb_calls
        assert mock_setup_skein_cluster.call_count == nb_calls
def main():
    """Pass the experiment to ``run_on_yarn`` using the gloo all-reduce task.

    The task layout requested is one chief, ``HVD_SIZE - 1`` workers and one
    evaluator, run through ``tf_yarn.tasks.gloo_allred_task``.
    """
    # Only the first element returned by upload_env() is needed.
    env_archive, _ = cluster_pack.upload_env()
    local_packages = cluster_pack.get_editable_requirements()
    experiment_factory = get_safe_exp_fn()

    resources = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }

    # Editable requirements plus the winequality module and this script,
    # each keyed by its base file name.
    extra_files = {**local_packages}
    extra_files[os.path.basename(winequality.__file__)] = winequality.__file__
    extra_files[os.path.basename(__file__)] = __file__

    run_on_yarn(
        env_archive,
        experiment_factory,
        task_specs=resources,
        files=extra_files,
        custom_task_module="tf_yarn.tasks.gloo_allred_task",
    )
# NOTE(review): the statements down to the `return` look like the tail of a
# function whose `def` line lies outside this view (`fs`, `run_id`,
# `train_input_fn` and `eval_input_fn` are not defined here) -- confirm and
# restore the enclosing indentation when merging.

# Login name of the user running this process; used in the model directory.
user = pwd.getpwuid(os.getuid()).pw_name
config = tf.estimator.RunConfig(
    tf_random_seed=42,
    model_dir=f"{fs}/user/{user}/examples/{run_id}")
estimator = tf.estimator.LinearClassifier(
    winequality.get_feature_columns(),
    n_classes=winequality.get_n_classes(),
    config=config)
# Bundle the estimator with its train spec (at most 10 steps) and eval spec.
return Experiment(
    estimator,
    tf.estimator.TrainSpec(train_input_fn, max_steps=10),
    tf.estimator.EvalSpec(
        eval_input_fn,
        steps=10,
        start_delay_secs=0,
        throttle_secs=30))


if __name__ == "__main__":
    # Exactly one command-line argument (the dataset path) is expected; any
    # other count makes the unpacking raise ValueError, in which case the
    # script exits with the winequality module docstring as its message.
    try:
        [dataset_path] = sys.argv[1:]
    except ValueError:
        sys.exit(winequality.__doc__)

    logging.basicConfig(level="INFO")

    # One can also call run_on_yarn(..., num_cores=num_cores), where num_cores
    # is a user-specified value; otherwise the default is 1.
    run_on_yarn(
        # Bind the dataset path now so the callable needs no arguments.
        partial(experiment_fn, dataset_path),
        task_specs={
            "chief": TaskSpec(memory=2 * 2**10, vcores=4),
            "evaluator": TaskSpec(memory=2**10, vcores=1)
        },
        # Pass the winequality module file, keyed by its base file name.
        files={os.path.basename(winequality.__file__): winequality.__file__})
def main():
    """Define a Horovod-wrapped Keras experiment and pass it to ``run_on_yarn``.

    The task layout requested is one chief, ``HVD_SIZE - 1`` workers and one
    evaluator, run through ``tf_yarn.tasks.gloo_allred_task``.
    """

    def experiment_fn() -> KerasExperiment:
        """Build the model, its callbacks and the train/validation data functions."""

        def convert_to_tensor(x, y):
            # The values of x become one float32 tensor; y becomes an int32 tensor.
            features = tf.convert_to_tensor(value=list(x.values()), dtype=tf.float32)
            labels = tf.convert_to_tensor(value=y, dtype=tf.int32)
            return (features, labels)

        def input_data_fn():
            # Training pipeline: map -> shuffle -> batch -> repeat.
            train_set = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
            train_set = train_set.map(convert_to_tensor)
            train_set = train_set.shuffle(1000)
            train_set = train_set.batch(128)
            return train_set.repeat()

        def validation_data_fn():
            # Validation pipeline: same as training but without repeat().
            test_set = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
            test_set = test_set.map(convert_to_tensor)
            test_set = test_set.shuffle(1000)
            return test_set.batch(128)

        # Three dense layers, created and added one after the other.
        dense_layer_kwargs = [
            {"units": 300, "activation": "relu", "input_shape": (11, )},
            {"units": 100, "activation": "relu"},
            {"units": 10, "activation": "softmax"},
        ]
        model = tf.keras.Sequential()
        for layer_kwargs in dense_layer_kwargs:
            model.add(tf.keras.layers.Dense(**layer_kwargs))
        model.summary()

        # Adadelta with a learning rate of 1.0 * HVD_SIZE, wrapped by Horovod.
        optimizer = hvd.DistributedOptimizer(
            tf.keras.optimizers.Adadelta(1.0 * HVD_SIZE))
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy'],
        )

        # "{epoch}" is in the plain (non-f) string, so it stays a literal
        # placeholder in the path handed to ModelCheckpoint.
        checkpoint_template = f"{HDFS_DIR}" + "/checkpoint-{epoch}"
        checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(checkpoint_template)
        broadcast_cb = hvd.keras.callbacks.BroadcastGlobalVariablesCallback(0)
        fit_kwargs = {
            "steps_per_epoch": 1000,
            "callbacks": [checkpoint_cb, broadcast_cb],
        }

        return KerasExperiment(
            model=model,
            model_dir=HDFS_DIR,
            train_params=fit_kwargs,
            input_data_fn=input_data_fn,
            target_data_fn=None,
            validation_data_fn=validation_data_fn,
        )

    # Only the first element returned by upload_env() is needed.
    env_archive, _ = cluster_pack.upload_env()
    local_packages = cluster_pack.get_editable_requirements()

    resources = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }

    # Editable requirements plus the winequality module file.
    extra_files = {**local_packages}
    extra_files[os.path.basename(winequality.__file__)] = winequality.__file__

    run_on_yarn(
        env_archive,
        experiment_fn,
        task_specs=resources,
        files=extra_files,
        custom_task_module="tf_yarn.tasks.gloo_allred_task",
    )