def test_default_is_chief_without_tf_config_or_job_name(self):
  # Legacy path: is_chief is omitted and neither TF_CONFIG nor job_name is
  # supplied. The test expects task 0 to be chief and task 1 not to be.
  first_task_config = run_config.RunConfig(task=0)
  self.assertTrue(first_task_config.is_chief)

  second_task_config = run_config.RunConfig(task=1)
  self.assertFalse(second_task_config.is_chief)
def test_bad_is_chief_combinations_raise(self):
  # Every RunConfig call below passes an is_chief value that conflicts with
  # its task / job_name and must raise a ValueError matching the regexp.
  with self.assertRaisesRegexp(
      ValueError, "Task is 1, but only task 0 may be chief"):
    run_config.RunConfig(is_chief=True, task=1)

  ps_chief_regexp = (
      "job_name is \'ps\', but only masters or workers may be chiefs")
  with self.assertRaisesRegexp(ValueError, ps_chief_regexp):
    run_config.RunConfig(is_chief=True, task=0, job_name="ps")

  master_regexp = "Master task 0 must be chief"
  with self.assertRaisesRegexp(ValueError, master_regexp):
    run_config.RunConfig(is_chief=False, task=0, job_name="master")
def test_default_is_chief_without_tf_config_but_has_job_name(self):
  # is_chief is omitted and there is no TF_CONFIG, but a job_name is given.
  # The expected default is chief only for job_name "worker" with task 0.
  expectations = (
      ("worker", 0, True),
      ("worker", 1, False),
      ("ps", 0, False),
      ("ps", 1, False),
  )
  for job_name, task, expect_chief in expectations:
    config = run_config.RunConfig(job_name=job_name, task=task)
    if expect_chief:
      self.assertTrue(config.is_chief)
    else:
      self.assertFalse(config.is_chief)
def test_explicitly_specified_values(self):
  # master and evaluation_master are passed to the constructor explicitly;
  # the cluster and task description are supplied through TF_CONFIG.
  custom_job_hosts = ["localhost:9991", "localhost:9992", "localhost:0"]
  cluster_spec = {
      run_config_lib.TaskType.PS: ["localhost:9990"],
      "my_job_name": custom_job_hosts,
  }
  task_spec = {"type": run_config_lib.TaskType.WORKER, "index": 2}
  tf_config = {"cluster": cluster_spec, "task": task_spec}
  patched_env = {"TF_CONFIG": json.dumps(tf_config)}

  with patch.dict("os.environ", patched_env):
    config = run_config.RunConfig(
        master="localhost:0",
        evaluation_master="localhost:9991")

    self.assertEqual(config.master, "localhost:0")
    self.assertEqual(config.task_id, 2)
    self.assertEqual(config.num_ps_replicas, 1)
    # The cluster has no job keyed by the worker task type, so the test
    # expects zero worker replicas.
    self.assertEqual(config.num_worker_replicas, 0)
    self.assertEqual(config.cluster_spec,
                     server_lib.ClusterSpec(cluster_spec))
    self.assertEqual(config.task_type, run_config_lib.TaskType.WORKER)
    self.assertFalse(config.is_chief)
    self.assertEqual(config.evaluation_master, "localhost:9991")
def test_fail_job_name_with_no_default_schedule(self):
  # learn_runner.run is expected to reject a job_name that has no default
  # schedule with a ValueError.
  config = run_config.RunConfig(
      job_name="foo_has_no_default_schedule",
      cluster_spec=build_distributed_cluster_spec())

  def create_experiment_fn(output_dir):
    return TestExperiment(config=config)

  self.assertRaisesRegexp(
      ValueError, "No default schedule",
      learn_runner.run, create_experiment_fn, "/tmp")
def test_no_schedule_and_non_distributed_runs_local_run(self):
  # No schedule is passed to learn_runner.run and the cluster spec is the
  # non-distributed one; the test expects the "local_run" result.
  non_distributed_spec = build_non_distributed_cluster_spec()
  config = run_config.RunConfig(cluster_spec=non_distributed_spec)

  def experiment_fn(output_dir):
    return TestExperiment(config=config)

  schedule_result = learn_runner.run(experiment_fn, output_dir="/tmp")
  self.assertEqual("local_run", schedule_result)
def test_schedule_from_config_runs_train_on_worker(self):
  # With job_name "worker" and a distributed cluster spec, the test expects
  # learn_runner.run to return "train".
  config = run_config.RunConfig(
      job_name="worker",
      cluster_spec=build_distributed_cluster_spec())

  def experiment_fn(output_dir):
    return TestExperiment(config=config)

  schedule_result = learn_runner.run(experiment_fn, output_dir="/tmp")
  self.assertEqual("train", schedule_result)
def test_no_schedule_and_non_distributed_runs_train_and_evaluate(self):
  # TF_CONFIG carries only a non-distributed cluster and no schedule is
  # passed; the test expects the "train_and_evaluate" result.
  tf_config = {"cluster": build_non_distributed_cluster_spec()}
  patched_env = {"TF_CONFIG": json.dumps(tf_config)}

  with patch.dict("os.environ", patched_env):
    config = run_config.RunConfig()

    def experiment_fn(output_dir):
      return TestExperiment(config=config)

    schedule_result = learn_runner.run(experiment_fn, output_dir="/tmp")
    self.assertEqual("train_and_evaluate", schedule_result)
def __init__(self, config=None, max_evals=5):
  # Call counters all start at zero.
  self.eval_count = 0
  self.fit_count = 0
  self._max_evals = max_evals
  self.export_count = 0
  self.monitors = []
  # Any falsy `config` (not just None) is replaced by a default RunConfig.
  if config:
    self._config = config
  else:
    self._config = run_config.RunConfig()
  # A new temporary directory is created for every instance.
  self._model_dir = tempfile.mkdtemp()
def test_schedule_from_tf_config(self):
  # TF_CONFIG declares a "worker" task type. The RunConfig constructor
  # sets job_name from TF_CONFIG, and the test expects learn_runner.run to
  # return "train" for it.
  saved_tf_config = os.environ.get("TF_CONFIG")
  os.environ["TF_CONFIG"] = json.dumps({"task": {"type": "worker"}})
  try:
    config = run_config.RunConfig()
    self.assertEqual(
        "train",
        learn_runner.run(lambda output_dir: TestExperiment(config=config),
                         output_dir="/tmp"))
  finally:
    # Put the environment back exactly as it was so that TF_CONFIG does not
    # leak into tests that run after this one.
    if saved_tf_config is None:
      os.environ.pop("TF_CONFIG", None)
    else:
      os.environ["TF_CONFIG"] = saved_tf_config
def test_fail_schedule_from_config_with_no_job_name(self):
  # With job_name=None and no schedule argument, learn_runner.run is
  # expected to raise a ValueError asking for a schedule.
  config = run_config.RunConfig(job_name=None)

  def experiment_fn(output_dir):
    return TestExperiment(config=config)

  with self.assertRaisesRegexp(ValueError, "Must specify a schedule"):
    learn_runner.run(experiment_fn, output_dir="/tmp")
def test_defaults_with_no_tf_config(self):
  # Checks the attribute values of a RunConfig built with no arguments.
  # Uses assertEqual: the assertEquals alias is deprecated and was removed
  # from unittest in Python 3.12.
  config = run_config.RunConfig()
  self.assertEqual(config.master, "")
  self.assertEqual(config.task, 0)
  self.assertEqual(config.num_ps_replicas, 0)
  self.assertIsNone(config.cluster_spec)
  self.assertIsNone(config.job_name)
  self.assertIsNone(config.is_chief)
def test_invalid_job_name_raises(self):
  # job_name names a job that is absent from the cluster spec; RunConfig is
  # expected to reject it with a ValueError.
  hosts_by_job = {
      "ps": ["host1:1", "host2:2"],
      "worker": ["host3:3", "host4:4", "host5:5"],
  }
  cluster_spec = tf.train.ClusterSpec(hosts_by_job)

  with self.assertRaisesRegexp(
      ValueError, "not_in_cluster_spec is not a valid task"):
    run_config.RunConfig(
        cluster_spec=cluster_spec,
        job_name="not_in_cluster_spec")
def test_defaults_with_no_tf_config(self):
  # Checks the attribute values of a RunConfig built with no arguments.
  default_config = run_config.RunConfig()

  # Master addresses are expected to default to the empty string.
  self.assertEqual(default_config.master, "")

  # Task / cluster description.
  self.assertEqual(default_config.task_id, 0)
  self.assertEqual(default_config.num_ps_replicas, 0)
  self.assertEqual(default_config.cluster_spec, {})
  self.assertIsNone(default_config.task_type)
  self.assertTrue(default_config.is_chief)

  self.assertEqual(default_config.evaluation_master, "")
def test_no_job_name_produces_empty_master(self):
  # A cluster spec is supplied without a job_name; the test expects master
  # to be the empty string.
  cluster_spec = tf.train.ClusterSpec({
      "ps": ["host1:1", "host2:2"],
      "worker": ["host3:3", "host4:4", "host5:5"]
  })
  # NB: job_name is omitted rather than explicitly set to None, as this
  # better mimics client behavior.
  config = run_config.RunConfig(cluster_spec=cluster_spec)
  # assertEqual replaces the deprecated assertEquals alias (removed from
  # unittest in Python 3.12).
  self.assertEqual(config.master, "")
def test_fail_schedule_from_config_with_no_task_type(self):
  # TF_CONFIG describes a distributed cluster but has no "task" entry, and
  # no schedule is passed; learn_runner.run is expected to raise.
  tf_config = {"cluster": build_distributed_cluster_spec()}
  patched_env = {"TF_CONFIG": json.dumps(tf_config)}

  with patch.dict("os.environ", patched_env):
    config = run_config.RunConfig()

    def experiment_fn(output_dir):
      return TestExperiment(config=config)

    with self.assertRaisesRegexp(ValueError, "Must specify a schedule"):
      learn_runner.run(experiment_fn, output_dir="/tmp")
def test_schedule_from_tf_config_runs_train_on_worker(self):
  # TF_CONFIG describes a distributed cluster and a WORKER task type. The
  # RunConfig constructor sets job_name from TF_CONFIG, and the test
  # expects learn_runner.run to return "train".
  saved_tf_config = os.environ.get("TF_CONFIG")
  os.environ["TF_CONFIG"] = json.dumps(
      {"cluster": build_distributed_cluster_spec(),
       "task": {"type": tf.contrib.learn.TaskType.WORKER}})
  try:
    config = run_config.RunConfig()
    self.assertEqual(
        "train",
        learn_runner.run(lambda output_dir: TestExperiment(config=config),
                         output_dir="/tmp"))
  finally:
    # Put the environment back exactly as it was so that TF_CONFIG does not
    # leak into tests that run after this one.
    if saved_tf_config is None:
      os.environ.pop("TF_CONFIG", None)
    else:
      os.environ["TF_CONFIG"] = saved_tf_config
def test_bad_is_chief_combinations_raise(self):
  # Every RunConfig call below passes an is_chief value that conflicts with
  # its task / job_name and must raise a ValueError matching the regexp.
  with self.assertRaisesRegexp(
      ValueError, "Task is 1, but only task 0 may be chief"):
    run_config.RunConfig(is_chief=True, task=1)

  ps_chief_regexp = (
      "job_name is \'ps\', but only masters or workers may be chiefs")
  with self.assertRaisesRegexp(ValueError, ps_chief_regexp):
    run_config.RunConfig(is_chief=True, task=0, job_name="ps")

  # The master check is exercised with TF_CONFIG's environment set to
  # "cloud".
  cloud_env = {"TF_CONFIG": json.dumps({"environment": "cloud"})}
  with self.assertRaisesRegexp(
      ValueError, "Master task 0 must be chief for cloud"):
    with patch.dict("os.environ", cloud_env):
      run_config.RunConfig(is_chief=False, task=0, job_name="master")

  with self.assertRaisesRegexp(ValueError, "Worker task 0 must be chief"):
    run_config.RunConfig(is_chief=False, task=0, job_name="worker")
def test_schedule_from_config_runs_local_run_on_master(self):
  # A chief "master" job at task 0 in a distributed cluster; the test
  # expects learn_runner.run to return "local_run".
  config = run_config.RunConfig(
      job_name="master",
      cluster_spec=build_distributed_cluster_spec(),
      task=0,
      is_chief=True)

  def experiment_fn(output_dir):
    return TestExperiment(config=config)

  schedule_result = learn_runner.run(experiment_fn, output_dir="/tmp")
  self.assertEqual("local_run", schedule_result)
def test_uid(self):
  original = run_config.RunConfig(
      tf_random_seed=RANDOM_SEED, model_dir=TEST_DIR)
  expected_uid = original.uid()

  # uid() is expected to return the same value on repeated calls against
  # the same config; ten calls are checked.
  remaining_checks = 10
  while remaining_checks > 0:
    self.assertEqual(expected_uid, original.uid())
    remaining_checks -= 1

  # Replacing model_dir is expected to produce a different uid.
  replaced = original.replace(model_dir=ANOTHER_TEST_DIR)
  self.assertNotEqual(expected_uid, replaced.uid())
def test_num_ps_replicas_and_cluster_spec_are_mutually_exclusive(self):
  # Passing both num_ps_replicas and cluster_spec is expected to be
  # rejected with a ValueError.
  hosts_by_job = {
      "ps": ["host1:1", "host2:2"],
      "worker": ["host3:3", "host4:4", "host5:5"],
  }
  cluster_spec = tf.train.ClusterSpec(hosts_by_job)

  with self.assertRaisesRegexp(
      ValueError, "Cannot specify both num_ps_replicas and cluster_spec"):
    run_config.RunConfig(num_ps_replicas=2, cluster_spec=cluster_spec)
def test_is_chief_from_noncloud_tf_config(self):
  # TF_CONFIG's top-level "environment" is "random" (i.e. not "cloud").
  # In that case the test expects the WORKER task with index 0 to be chief,
  # while index 0 of the MASTER job is expected not to be.
  task_types = tf.contrib.learn.TaskType
  cases = (
      (task_types.WORKER, True),
      (task_types.MASTER, False),
  )
  for task_type, expect_chief in cases:
    tf_config = {
        "cluster": {
            task_types.PS: ["host1:1", "host2:2"],
            task_types.MASTER: ["host3:3"],
            task_types.WORKER: ["host4:4", "host5:5", "host6:6"]
        },
        "task": {
            "type": task_type,
            "index": 0
        },
        "environment": "random"
    }
    patched_env = {"TF_CONFIG": json.dumps(tf_config)}
    with patch.dict("os.environ", patched_env):
      config = run_config.RunConfig()
      if expect_chief:
        self.assertTrue(config.is_chief)
      else:
        self.assertFalse(config.is_chief)
def test_no_task_type_produces_empty_master(self):
  # TF_CONFIG lists the cluster but deliberately has no "task" entry (such
  # as {"type": "worker"}); the test expects master to be the empty string.
  cluster = {
      run_config_lib.TaskType.PS: ["host1:1", "host2:2"],
      run_config_lib.TaskType.WORKER: ["host3:3", "host4:4", "host5:5"]
  }
  tf_config = {"cluster": cluster}
  patched_env = {"TF_CONFIG": json.dumps(tf_config)}

  with patch.dict("os.environ", patched_env):
    config = run_config.RunConfig()
    self.assertEqual(config.master, "")
def test_train_default_delay(self):
  # Measures real wall-clock time around train(): for each task number the
  # test expects the call to take about task * 5 seconds (within 0.5).
  config = run_config.RunConfig()
  estimator = TestEstimator(config)
  ex = tf.contrib.learn.Experiment(
      estimator,
      train_input_fn='train_input',
      eval_input_fn='eval_input')

  for task in [0, 1, 3]:
    started_at = time.time()
    # The same config object is mutated in place for every iteration.
    config.task = task
    ex.train()
    elapsed = time.time() - started_at
    self.assertAlmostEqual(elapsed, task * 5, delta=0.5)
def test_fail_task_type_with_no_default_schedule(self):
  # The task type in TF_CONFIG has no default schedule; learn_runner.run is
  # expected to raise a ValueError.
  task_spec = {"type": "foo_has_no_default_schedule"}
  tf_config = {
      "cluster": build_distributed_cluster_spec(),
      "task": task_spec,
  }
  patched_env = {"TF_CONFIG": json.dumps(tf_config)}

  with patch.dict("os.environ", patched_env):
    config = run_config.RunConfig()

    def create_experiment_fn(output_dir):
      return TestExperiment(config=config)

    self.assertRaisesRegexp(
        ValueError, "No default schedule",
        learn_runner.run, create_experiment_fn, "/tmp")
def test_schedule_from_tf_config_runs_serve_on_ps(self):
  # TF_CONFIG declares a PS task in a distributed cluster; the test expects
  # learn_runner.run to return "run_std_server".
  task_spec = {"type": tf.contrib.learn.TaskType.PS}
  tf_config = {
      "cluster": build_distributed_cluster_spec(),
      "task": task_spec,
  }
  patched_env = {"TF_CONFIG": json.dumps(tf_config)}

  with patch.dict("os.environ", patched_env):
    config = run_config.RunConfig()

    def experiment_fn(output_dir):
      return TestExperiment(config=config)

    schedule_result = learn_runner.run(experiment_fn, output_dir="/tmp")
    self.assertEqual("run_std_server", schedule_result)
def test_schedule_from_tf_config_runs_train_and_evaluate_on_master(self):
  # TF_CONFIG declares a MASTER task in a distributed cluster; the test
  # expects learn_runner.run to return "train_and_evaluate".
  task_spec = {"type": run_config_lib.TaskType.MASTER}
  tf_config = {
      "cluster": build_distributed_cluster_spec(),
      "task": task_spec,
  }
  patched_env = {"TF_CONFIG": json.dumps(tf_config)}

  with patch.dict("os.environ", patched_env):
    config = run_config.RunConfig()

    def experiment_fn(output_dir):
      return TestExperiment(config=config)

    schedule_result = learn_runner.run(experiment_fn, output_dir="/tmp")
    self.assertEqual("train_and_evaluate", schedule_result)
def test_train_default_delay(self):
  # For each task index taken from TF_CONFIG, train() runs with time.time
  # and time.sleep replaced by a SheepCounter; the test expects the
  # counter's clock to read about task_id * 5 afterwards.
  for task_id in [0, 1, 3]:
    patched_env = {'TF_CONFIG': json.dumps({'task': {'index': task_id}})}
    with test.mock.patch.dict('os.environ', patched_env):
      config = run_config.RunConfig()
      for estimator in self._estimators_for_tests(config):
        ex = experiment.Experiment(
            estimator,
            train_input_fn='train_input',
            eval_input_fn='eval_input')

        # A fresh counter per estimator so readings do not accumulate.
        sheep = SheepCounter()
        fake_time = test.mock.patch.object(time, 'time', sheep.time)
        fake_sleep = test.mock.patch.object(time, 'sleep', sheep.sleep)
        with fake_time:
          with fake_sleep:
            ex.train()
            self.assertAlmostEqual(task_id * 5, sheep.time(), delta=1e-4)
def test_train_default_delay(self):
  # Measures real wall-clock time around train(): for each task index taken
  # from TF_CONFIG the test expects the call to take about task_id * 5
  # seconds (within 1.0).
  for task_id in [0, 1, 3]:
    patched_env = {'TF_CONFIG': json.dumps({'task': {'index': task_id}})}
    with patch.dict('os.environ', patched_env):
      config = run_config.RunConfig()
      estimator = TestEstimator(config)
      ex = tf.contrib.learn.Experiment(
          estimator,
          train_input_fn='train_input',
          eval_input_fn='eval_input')

      started_at = time.time()
      ex.train()
      elapsed = time.time() - started_at
      self.assertAlmostEqual(elapsed, task_id * 5, delta=1.0)
def test_train_default_delay(self):
  # For each task index taken from TF_CONFIG, train() runs with time.sleep
  # replaced by a SheepCounter; the test expects the counter's total_time
  # to be about task_id * 5 afterwards.
  for task_id in [0, 1, 3]:
    patched_env = {'TF_CONFIG': json.dumps({'task': {'index': task_id}})}
    with test.mock.patch.dict('os.environ', patched_env):
      config = run_config.RunConfig()
      estimator = TestEstimator(config)
      ex = experiment.Experiment(
          estimator,
          train_input_fn='train_input',
          eval_input_fn='eval_input')

      # patch() used as a context manager yields the replacement object,
      # so `sheep` is the SheepCounter installed as time.sleep.
      sleep_patch = test.mock.patch('time.sleep', SheepCounter())
      with sleep_patch as sheep:
        ex.train()
        self.assertAlmostEqual(task_id * 5, sheep.total_time, delta=0.1)