def launch_remote_check(file: str) -> Tuple[bool, str]:
    """Run check_hadoop_env.py on the cluster inside a one-container skein app.

    Uploads the current environment as a PEX archive, submits a skein
    application whose single service executes ``check_hadoop_env.py --file
    <file>``, then blocks until the remote script publishes its result in the
    skein key-value store.

    Args:
        file: path (forwarded verbatim to the remote script) of the file to
            check on the cluster.

    Returns:
        A ``(success, app_id)`` tuple: ``success`` is True iff the remote
        script stored the string ``"True"`` under the ``result`` key, and
        ``app_id`` is the YARN application id of the check.
    """
    import uuid  # local import: needed for the PEX_ROOT fix below; harmless if already imported

    logger.info('Launching remote check')
    zip_hdfs, _ = cluster_pack.upload_env(packer=cluster_pack.PEX_PACKER)
    archive_name = os.path.basename(zip_hdfs)
    with skein.Client() as client:
        files = {
            archive_name: zip_hdfs,
            'check_hadoop_env.py': __file__,
        }
        editable_packages = cluster_pack.get_editable_requirements()
        if 'tf_yarn' in editable_packages:
            tf_yarn_zip = cluster_pack.zip_path(editable_packages['tf_yarn'], False)
            logger.info(f"zip path for editable tf_yarn is {tf_yarn_zip}")
            files.update({'tf_yarn': tf_yarn_zip})
        service = skein.Service(
            script=f'./{archive_name} check_hadoop_env.py --file {file}',
            resources=skein.Resources(2 * 1024, 1),
            env={
                # BUG FIX: the original string lacked the f prefix, so every
                # run shared the literal directory '/tmp/{uuid.uuid4()}/'
                # instead of a unique per-run PEX root.
                'PEX_ROOT': f'/tmp/{uuid.uuid4()}/',
                'PYTHONPATH': '.:',
            },
            files=files,
            instances=1)
        spec = skein.ApplicationSpec(
            {'HADOOP_ENV_CHECKER': service},
            acls=skein.model.ACLs(enable=True, view_users=['*']),
        )
        app = client.submit_and_connect(spec)
        logger.info('Remote check started')
        # Blocks until the remote script sets the 'result' key.
        result = app.kv.wait('result').decode()
        app_id = app.id
        app.shutdown()
        return result == "True", app_id
def main():
    # Forcing a call to model_to_estimator._save_first_checkpoint (l457)
    # https://github.com/tensorflow/estimator/blob/ \
    # 1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py
    # otherwise there is a race condition when all workers try to save the
    # first checkpoint at the same time.
    experiment_fn(HDFS_DIR)

    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()

    # Cluster topology: one chief, four workers, two parameter servers,
    # one evaluator.
    task_specs = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
        "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }
    # Ship editable packages plus the dataset helper and this script itself.
    files_to_ship = {
        **editable_requirements,
        os.path.basename(winequality.__file__): winequality.__file__,
        os.path.basename(__file__): __file__,
    }
    run_on_yarn(
        pyenv_zip_path,
        get_safe_exp_fn(),
        task_specs=task_specs,
        files=files_to_ship)
def _add_editable_requirements(files: Optional[Dict[str, str]]):
    """Merge the editable requirements into *files* and return the result.

    Mutates and returns the given mapping (or a fresh dict when *files* is
    None). Entries already present in *files* are never overridden.
    """
    merged = dict() if files is None else files
    for package_name, package_path in cluster_pack.get_editable_requirements().items():
        merged.setdefault(package_name, package_path)
    return merged
def main():
    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()

    # Horovod-style topology: the chief plus (HVD_SIZE - 1) workers form the
    # allreduce ring; a single evaluator runs on the side.
    task_specs = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }
    files_to_ship = {
        **editable_requirements,
        os.path.basename(winequality.__file__): winequality.__file__,
        os.path.basename(__file__): __file__,
    }
    run_on_yarn(
        pyenv_zip_path,
        get_safe_exp_fn(),
        task_specs=task_specs,
        files=files_to_ship,
        custom_task_module="tf_yarn.tasks.gloo_allred_task"
    )
def main():
    def experiment_fn() -> Experiment:
        # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159
        # for tf >= 1.15 the import is done inside the experiment function.
        import tensorflow as tf

        def convert_to_tensor(x, y):
            return (tf.convert_to_tensor(value=list(x.values()), dtype=tf.float32),
                    tf.convert_to_tensor(value=y, dtype=tf.int32))

        def train_input_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
            return (dataset.map(convert_to_tensor)
                    .shuffle(1000)
                    .batch(128)
                    .repeat())

        def eval_input_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
            return (dataset.map(convert_to_tensor)
                    .shuffle(1000)
                    .batch(128))

        # Small dense classifier over the 11 wine features.
        model = keras.Sequential()
        model.add(keras.layers.Dense(units=300, activation="relu", input_shape=(11,)))
        model.add(keras.layers.Dense(units=100, activation="relu"))
        model.add(keras.layers.Dense(units=10, activation="softmax"))
        model.summary()
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer="sgd",
                      metrics=['accuracy'])

        config = tf.estimator.RunConfig(model_dir=HDFS_DIR)
        estimator = tf.keras.estimator.model_to_estimator(model, config=config)
        return Experiment(
            estimator,
            tf.estimator.TrainSpec(train_input_fn, max_steps=1000),
            tf.estimator.EvalSpec(eval_input_fn,
                                  steps=10,
                                  start_delay_secs=0,
                                  throttle_secs=30))

    # Forcing a call to model_to_estimator._save_first_checkpoint (l457)
    # https://github.com/tensorflow/estimator/blob/ \
    # 1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py
    # otherwise there is a race condition when all workers try to save the
    # first checkpoint at the same time.
    experiment_fn()

    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(
        pyenv_zip_path,
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
            "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1),
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
        })
optimizer=lambda: hvd.DistributedOptimizer(tf.train.AdamOptimizer())) return Experiment( estimator, tf.estimator.TrainSpec(train_input_fn, max_steps=10, hooks=[hvd.BroadcastGlobalVariablesHook(0)]), tf.estimator.EvalSpec(eval_input_fn, steps=10, start_delay_secs=0, throttle_secs=30)) if __name__ == "__main__": pyenv_zip_path, _ = cluster_pack.upload_env() editable_requirements = cluster_pack.get_editable_requirements() run_on_yarn(pyenv_zip_path, experiment_fn, task_specs={ "chief": TaskSpec(memory="2 GiB", vcores=4), "worker": TaskSpec(memory="2 GiB", vcores=4, instances=1), "evaluator": TaskSpec(memory="2 GiB", vcores=1), "tensorboard": TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR) }, files={ **editable_requirements,
def main():
    def experiment_fn() -> KerasExperiment:
        def convert_to_tensor(x, y):
            return (tf.convert_to_tensor(value=list(x.values()), dtype=tf.float32),
                    tf.convert_to_tensor(value=y, dtype=tf.int32))

        def input_data_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
            return (dataset.map(convert_to_tensor)
                    .shuffle(1000)
                    .batch(128)
                    .repeat())

        def validation_data_fn():
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
            return (dataset.map(convert_to_tensor)
                    .shuffle(1000)
                    .batch(128))

        # Small dense classifier over the 11 wine features.
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(units=300, activation="relu", input_shape=(11,)))
        model.add(tf.keras.layers.Dense(units=100, activation="relu"))
        model.add(tf.keras.layers.Dense(units=10, activation="softmax"))
        model.summary()

        # Scale the learning rate by the number of horovod workers and wrap
        # the optimizer so gradients are averaged across the ring.
        opt = tf.keras.optimizers.Adadelta(1.0 * HVD_SIZE)
        opt = hvd.DistributedOptimizer(opt)
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])

        # '{epoch}' is a literal template expanded by ModelCheckpoint.
        path_to_checkpoint = f"{HDFS_DIR}" + "/checkpoint-{epoch}"
        my_callbacks = [
            tf.keras.callbacks.ModelCheckpoint(path_to_checkpoint),
            hvd.keras.callbacks.BroadcastGlobalVariablesCallback(0),
        ]
        train_params = {"steps_per_epoch": 1000, "callbacks": my_callbacks}

        return KerasExperiment(
            model=model,
            model_dir=HDFS_DIR,
            train_params=train_params,
            input_data_fn=input_data_fn,
            target_data_fn=None,
            validation_data_fn=validation_data_fn)

    pyenv_zip_path, _ = cluster_pack.upload_env()
    editable_requirements = cluster_pack.get_editable_requirements()
    run_on_yarn(
        pyenv_zip_path,
        experiment_fn,
        task_specs={
            "chief": TaskSpec(memory="2 GiB", vcores=4),
            "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
            "evaluator": TaskSpec(memory="2 GiB", vcores=1),
        },
        files={
            **editable_requirements,
            os.path.basename(winequality.__file__): winequality.__file__,
        },
        custom_task_module="tf_yarn.tasks.gloo_allred_task")
def get_editable_requirements_from_current_venv(
        executable: str = sys.executable,
        editable_packages_dir: str = os.getcwd()):
    """Thin wrapper delegating to ``cluster_pack.get_editable_requirements``.

    NOTE(review): the ``os.getcwd()`` default is evaluated once at import
    time, not per call — confirm this is intended.
    """
    requirements = cluster_pack.get_editable_requirements(
        executable, editable_packages_dir)
    return requirements
def get_editable_requirements(executable: str = sys.executable):
    """Thin wrapper delegating to ``cluster_pack.get_editable_requirements``."""
    requirements = cluster_pack.get_editable_requirements(executable)
    return requirements