def test_retry_run_on_yarn(nb_retries, nb_failures):
    cpt = 0

    def fail(*args, **kwargs):
        # Fail the first nb_failures calls, then succeed.
        nonlocal cpt
        if cpt < nb_failures:
            cpt += 1
            raise Exception("")
        else:
            pass

    with mock.patch('tf_yarn.client._setup_pyenvs'), \
            mock.patch('tf_yarn.client._setup_skein_cluster') as mock_setup_skein_cluster, \
            mock.patch('tf_yarn.client._run_on_cluster') as mock_run_on_cluster:
        mock_run_on_cluster.side_effect = fail
        gb = 2**10  # 1 GiB expressed in MiB

        try:
            run_on_yarn(
                "path/to/env",
                lambda: Experiment(None, None, None),
                task_specs={
                    "chief": TaskSpec(memory=16 * gb, vcores=16),
                    "worker": TaskSpec(memory=16 * gb, vcores=16, instances=1),
                    "ps": TaskSpec(memory=16 * gb, vcores=16, instances=1)
                },
                nb_retries=nb_retries
            )
        except Exception:
            pass

        # One cluster setup and one run attempt per failed try (capped at
        # nb_retries), plus the final attempt.
        nb_calls = min(nb_retries, nb_failures) + 1
        assert mock_run_on_cluster.call_count == nb_calls
        assert mock_setup_skein_cluster.call_count == nb_calls
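
# For context on the assertions above: the retry behaviour boils down to a loop
# of roughly this shape, so a run that fails nb_failures times is attempted
# min(nb_retries, nb_failures) + 1 times in total. This is a simplified sketch,
# not tf_yarn's actual implementation; the function name and the attempt()
# callable are illustrative.
def _sketch_retry_loop(nb_retries, attempt):
    """Call attempt(); on failure retry up to nb_retries times, then re-raise."""
    for remaining in range(nb_retries, -1, -1):
        try:
            return attempt()
        except Exception:
            if remaining == 0:
                raise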
def _experiment_fn(model_dir):
    print(f"create experiment with model_dir={model_dir}")

    def model_fn():
        return tf.estimator.EstimatorSpec()

    def train_fn():
        return None

    def eval_fn():
        return None

    return Experiment(
        tf.estimator.LinearClassifier(feature_columns=[], model_dir=model_dir),
        tf.estimator.TrainSpec(train_fn),
        tf.estimator.EvalSpec(eval_fn))
def add_monitor_to_experiment(experiment: Experiment):
    # Experiment and EvalSpec are namedtuples, hence _replace: prepend an
    # EvalMonitorHook to the existing eval hooks and return a new Experiment.
    monitored_eval_spec = experiment.eval_spec._replace(
        hooks=(EvalMonitorHook(), *experiment.eval_spec.hooks))
    experiment = experiment._replace(eval_spec=monitored_eval_spec)
    return experiment
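
# A hypothetical usage sketch tying the helpers above together: build the
# experiment for a given model directory, wrap its eval spec with the monitor
# hook, and hand the resulting no-argument callable to run_on_yarn. The wrapper
# name and the model_dir value are illustrative, not part of the original code.
def _monitored_experiment_fn():
    return add_monitor_to_experiment(_experiment_fn("hdfs://path/to/model_dir"))

# e.g. run_on_yarn("path/to/env", _monitored_experiment_fn, task_specs=...)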