Example #1
0
    def testEvaluationService(self):
        """Walk an EvaluationService through one complete evaluation job.

        Checks that no job can be created up front, that adding an
        evaluation task yields 8 pending evaluation tasks, and that the
        job is cleared after ``complete_task()`` has been called 8 times.
        """
        task_d = create_task_manager([("f1", 0, 10), ("f2", 0, 10)],
                                     [("f1", 0, 10), ("f2", 0, 10)])

        # Evaluation metrics will not be accepted if no evaluation ongoing
        evaluation_service = EvaluationService(
            task_d.create_evaluation_tasks,
            0,
            False,
            _eval_metrics_fn,
        )

        master = Mock(
            task_d=task_d,
            instance_manager=None,
            distribution_strategy=None,
        )
        # The servicer is only constructed here; the object is not used again.
        _ = MasterServicer(master.task_d, master.instance_manager, None,
                           evaluation_service)

        # No checkpoint available
        self.assertFalse(evaluation_service.try_to_create_new_job())

        # Add an evaluation task and we can start evaluation
        expected_task_count = 8
        self.assertEqual(expected_task_count, len(task_d._todo))
        evaluation_service.add_evaluation_task(False)
        self.assertEqual(expected_task_count, len(task_d._eval_todo))
        self.assertFalse(evaluation_service._eval_job.finished())

        # The job must stay unfinished until the last task is completed.
        for _ in range(expected_task_count):
            self.assertFalse(evaluation_service._eval_job.finished())
            evaluation_service.complete_task()

        # assertIsNone reports the leftover job object if this ever fails.
        self.assertIsNone(evaluation_service._eval_job)
        self.assertFalse(evaluation_service.try_to_create_new_job())
Example #2
0
    def testEvaluationOnly(self):
        """Run a job that only has evaluation shards until it is finished.

        The task manager gets no training shards, and EvaluationService is
        built with ``True`` as its third constructor argument. After the 8
        evaluation tasks are completed the job must report ``finished()``.
        """
        eval_shards = [("f1", 0, 10), ("f2", 0, 10)]
        task_d = create_task_manager([], eval_shards)
        task_d.create_tasks(elasticai_api_pb2.EVALUATION)

        evaluation_service = EvaluationService(
            task_d.create_evaluation_tasks,
            0,
            True,
            _eval_metrics_fn,
        )
        task_d.set_evaluation_service(evaluation_service)

        master = Mock(
            task_d=task_d,
            instance_manager=None,
            distribution_strategy=None,
        )

        # The servicer is only constructed here; the object is not used again.
        _ = MasterServicer(master.task_d, master.instance_manager, None,
                           evaluation_service)

        pending_eval_tasks = 8
        self.assertEqual(pending_eval_tasks, len(task_d._eval_todo))
        for _ in range(pending_eval_tasks):
            # Still running before each completion ...
            self.assertFalse(evaluation_service._eval_job.finished())
            evaluation_service.complete_task()
        # ... and finished once every task has been completed.
        self.assertTrue(evaluation_service._eval_job.finished())
Example #3
0
    def testNeedEvaluation(self):
        """Check which model versions add_evaluation_task_if_needed queues.

        The service is created with 10 as its second constructor argument
        and is offered versions 10, 10, 20, 10 and 30 in turn. After each
        call the pending ``_eval_checkpoint_versions`` list is verified.
        """
        task_d = create_task_manager([("f1", 0, 10), ("f2", 0, 10)],
                                     [("f1", 0, 10), ("f2", 0, 10)])

        evaluation_service = EvaluationService(
            task_d.create_evaluation_tasks,
            10,
            False,
            _eval_metrics_fn,
        )

        # Version 10: an eval job is created and nothing is left pending.
        evaluation_service.add_evaluation_task_if_needed(model_version=10)
        self.assertIsNotNone(evaluation_service._eval_job)
        self.assertEqual(evaluation_service._eval_checkpoint_versions, [])

        # Version 10 again: must be ignored, the pending list stays empty.
        evaluation_service.add_evaluation_task_if_needed(model_version=10)
        self.assertEqual(evaluation_service._eval_checkpoint_versions, [])

        # Version 20: must be appended to the pending list.
        evaluation_service.add_evaluation_task_if_needed(model_version=20)
        self.assertEqual(evaluation_service._eval_checkpoint_versions, [20])

        # Version 10 once more: must be ignored, the pending list stays [20].
        evaluation_service.add_evaluation_task_if_needed(model_version=10)
        self.assertEqual(evaluation_service._eval_checkpoint_versions, [20])

        # Version 30: must be appended after 20.
        evaluation_service.add_evaluation_task_if_needed(model_version=30)
        self.assertEqual(evaluation_service._eval_checkpoint_versions,
                         [20, 30])
Example #4
0
    def test_epoch(self):
        """Check that the same 8 training tasks are handed out in 2 rounds.

        The task manager is created with ``2`` as its third argument; the
        full set of tasks is then fetched twice and compared each time.
        """
        task_d = create_task_manager([("f1", 0, 10), ("f2", 0, 10)], [], 2)

        # Each shard is expected to be split into these (start, end) ranges.
        ranges = ((0, 3), (3, 6), (6, 9), (9, 10))
        epoch_tasks = [
            (shard_name, start, end, elasticai_api_pb2.TRAINING, -1)
            for shard_name in ("f1", "f2")
            for start, end in ranges
        ]

        # First pass fetches the first epoch, second pass the second epoch.
        for _ in range(2):
            got_tasks = [task_d.get(i // 2) for i in range(8)]
            got_infos = sorted(task._info() for _, task in got_tasks)
            self.assertEqual(got_infos, epoch_tasks)
Example #5
0
 def test_set_training_params(self):
     """Check the shards and todo tasks left by set_training_params."""
     manager = create_task_manager([], [])
     manager.set_training_params(1, 1, 10, False, False, 3)

     expected_shards = [("", 0, 3), ("", 3, 3), ("", 6, 3), ("", 9, 1)]
     self.assertEqual(manager._training_shards, expected_shards)
     self.assertEqual(len(manager._todo), 4)
Example #6
0
 def test_invoke_train_end_callback(self):
     """Check that the deferred callback queues one TRAIN_END_CALLBACK task."""
     manager = create_task_manager([("f1", 0, 10), ("f2", 0, 10)], [])
     manager._add_deferred_callback_create_train_end_task()
     # Empty the queue so only tasks added by the callback remain.
     manager._todo.clear()
     manager.invoke_deferred_callback()

     queued = manager._todo
     self.assertEqual(len(queued), 1)
     self.assertEqual(queued[0].type, elasticai_api_pb2.TRAIN_END_CALLBACK)
Example #7
0
    def test_create_tasks_with_zero_start_ind(self):
        """Exercise get/report/recover on shards that both start at index 0.

        All 8 tasks are fetched, 6 are reported as successes, one as a
        failure and one is recovered from its worker. The two re-queued
        tasks are then fetched and completed, after which the manager must
        report ``finished()``.
        """
        manager = create_task_manager([("f1", 0, 10), ("f2", 0, 10)], [])

        ranges = ((0, 3), (3, 6), (6, 9), (9, 10))
        all_tasks = [
            (shard_name, start, end, elasticai_api_pb2.TRAINING, -1)
            for shard_name in ("f1", "f2")
            for start, end in ranges
        ]

        # Drain the queue: worker ids 0..3 each receive 2 tasks.
        got_tasks = [
            manager.get(worker_id)
            for worker_id in range(4)
            for _ in range(2)
        ]

        # Task ids must run from 1 to 8.
        got_ids = [task_id for task_id, _ in got_tasks]
        self.assertEqual(list(range(1, 9)), got_ids)

        # The task payloads must match the expected split.
        got_infos = sorted(task._info() for _, task in got_tasks)
        self.assertEqual(got_infos, all_tasks)

        # Nothing left to hand out: the (-1, None) pair is returned.
        self.assertEqual((-1, None), manager.get(10))

        request = elasticai_api_pb2.ReportTaskResultRequest()
        # Report 6 tasks as successful.
        for succeeded_id in (1, 3, 5, 7, 2, 8):
            request.task_id = succeeded_id
            manager.report(request, True)

        # Two tasks are still being worked on.
        self.assertEqual(2, len(manager._doing))

        # Report the first remaining doing task as failed.
        request.task_id = next(iter(manager._doing))
        manager.report(request, False)
        self.assertEqual(1, len(manager._doing))

        # Recover the last doing task from its (dead) worker; the worker id
        # is the first element of the doing entry.
        dead_worker_id = next(iter(manager._doing.values()))[0]
        manager.recover_tasks(dead_worker_id)
        self.assertEqual(0, len(manager._doing))

        self.assertEqual(2, len(manager._todo))

        # Fetch both re-queued tasks first, then report them as successful.
        retried_ids = [
            task_id
            for task_id, _ in (manager.get(worker_id) for worker_id in (11, 12))
        ]
        for retried_id in retried_ids:
            request.task_id = retried_id
            manager.report(request, True)

        self.assertTrue(manager.finished())
Example #8
0
    def test_get_max_task_completed_time(self):
        """Check that recorded completion times show up per task type."""
        training = elasticai_api_pb2.TRAINING
        evaluation = elasticai_api_pb2.EVALUATION
        manager = create_task_manager([("f1", 0, 10), ("f2", 0, 10)], [])

        # Both task types start at 0.
        self.assertEqual(
            manager._max_task_completed_times,
            {training: 0, evaluation: 0},
        )

        manager.record_task_completed_time(training, 10)
        manager.record_task_completed_time(evaluation, 5)

        # Each type now holds the value that was recorded for it.
        self.assertEqual(
            manager._max_task_completed_times,
            {training: 10, evaluation: 5},
        )
Example #9
0
    def test_check_and_reassign_timeout_tasks(self):
        """Expect a task that started long ago to return to the todo queue.

        One task is moved from ``_todo`` to ``_doing`` with a start time
        1000 seconds in the past. The timeout checker then runs in a daemon
        thread, and the todo queue is expected to regain its original
        length.
        """
        task_manager = create_task_manager([("f1", 0, 10), ("f2", 0, 10)], [])
        task_manager.create_tasks(elasticai_api_pb2.TRAINING)
        task_count = len(task_manager._todo)
        task_start_time = time.time() - 1000
        task_manager._worker_start_task_time[0] = task_start_time
        # Move one and the same task from todo to doing. Registering
        # ``_todo[0]`` while popping the last element would leave the doing
        # task queued as well and silently drop a different task.
        timed_out_task = task_manager._todo.pop()
        task_manager._doing[0] = (0, timed_out_task, task_start_time)

        # Daemon thread: the checker is not expected to return on its own.
        threading.Thread(
            target=task_manager._check_and_reassign_timeout_tasks,
            name="check_timeout_tasks",
            daemon=True,
        ).start()
        time.sleep(1)  # Give the checker 1s to reassign the timed-out task
        self.assertEqual(len(task_manager._todo), task_count)
Example #10
0
    def test_failed_worker_pod(self):
        """
        Launch 3 worker pods whose command is bound to fail, with
        restart_policy="Never", then remove them and verify that
        recover_tasks was called for worker ids 0, 1 and 2.
        """
        task_manager = create_task_manager([("f", 0, 10)], [])
        task_manager.recover_tasks = MagicMock()

        # Timestamp plus random suffix in the job name.
        # NOTE(review): presumably to avoid name clashes between runs; confirm.
        job_name = "test-failed-worker-pod-%d-%d" % (
            int(time.time()),
            random.randint(1, 101),
        )
        pod_manager = PodManager(
            job_name=job_name,
            image_name="ubuntu:18.04",
            namespace="default",
            num_workers=3,
            restart_policy="Never",
            envs=[],
        )
        pod_manager.set_up(
            worker_command=["/bin/bash"],
            worker_args=["-c", "badcommand"],
        )
        reschedule_callback = TaskRescheduleCallback(task_manager=task_manager)
        pod_manager.add_pod_event_callback(reschedule_callback)
        pod_manager._k8s_client.start_watch_events()
        pod_manager.start_workers()

        max_check_num = 20

        def three_workers_reach(status):
            # Poll every 3s, at most max_check_num times, until the worker
            # counter for `status` equals 3. Returns whether that happened.
            for _ in range(max_check_num):
                time.sleep(3)
                counters = pod_manager.get_pod_counter(
                    pod_type=PodType.WORKER
                )
                if counters[status] == 3:
                    return True
            return False

        # NOTE(review): the outcome of this wait is not checked, so the test
        # continues even if fewer than 3 pods reached FAILED - confirm that
        # this is intended.
        three_workers_reach(PodStatus.FAILED)

        pod_manager.stop_relaunch_and_remove_pods(pod_type=PodType.WORKER)
        if not three_workers_reach(PodStatus.DELETED):
            self.fail("Cannot get 3 deleted worker pods as expected.")

        task_manager.recover_tasks.assert_has_calls(
            [call(0), call(1), call(2)], any_order=True
        )
Example #11
0
    def test_report_task_result(self):
        """Count how often each shard range is handed out when some fail.

        Tasks are fetched through the servicer until one with an empty
        shard name comes back. The first run of every range starting at 0
        is reported with an error message, and the per-range run counts
        are compared with the expected totals.
        """
        self.master.task_manager = create_task_manager(
            [("shard_1", 0, 10), ("shard_2", 0, 9)], [], 2
        )
        servicer = MasterServicer(
            self.master.task_manager,
            self.master.instance_manager,
            None,
            None,
        )

        # Maps (shard name, start, end) to the number of times it was run.
        run_counts = defaultdict(int)
        while True:
            get_request = elasticai_api_pb2.GetTaskRequest()
            get_request.worker_id = random.randint(1, 10)
            task = servicer.get_task(get_request, None)
            if not task.shard.name:
                # An empty shard name ends the loop.
                break

            # The doing entry must record the worker that asked for the task.
            doing_entry = self.master.task_manager._doing[task.task_id]
            self.assertEqual(doing_entry[0], get_request.worker_id)

            shard_key = (task.shard.name, task.shard.start, task.shard.end)
            run_counts[shard_key] += 1

            result = elasticai_api_pb2.ReportTaskResultRequest()
            result.task_id = task.task_id
            first_run = run_counts[shard_key] == 1
            if task.shard.start == 0 and first_run:
                # Simulate error reports.
                result.err_message = "Worker error"
            servicer.report_task_result(result, None)

        expected_counts = {
            ("shard_1", 0, 3): 3,
            ("shard_1", 3, 6): 2,
            ("shard_1", 6, 9): 2,
            ("shard_1", 9, 10): 2,
            ("shard_2", 0, 3): 3,
            ("shard_2", 3, 6): 2,
            ("shard_2", 6, 9): 2,
        }
        self.assertDictEqual(expected_counts, run_counts)
Example #12
0
    def test_get_empty_task(self):
        """Check the reply of get_task when the task manager has no shards.

        The returned task must carry an empty shard name, and its
        ``model_version`` must match the servicer's ``_version`` once that
        has been set to 1.
        """
        self.master.task_manager = create_task_manager([], [])
        servicer = MasterServicer(
            self.master.task_manager,
            self.master.instance_manager,
            None,
            None,
        )

        request = elasticai_api_pb2.GetTaskRequest()
        request.worker_id = 1

        # No task yet, make sure the returned versions are as expected.
        first_reply = servicer.get_task(request, None)
        self.assertEqual("", first_reply.shard.name)
        self.assertEqual(0, first_reply.model_version)

        # After bumping the version, the reply must carry the new value.
        servicer._version = 1
        second_reply = servicer.get_task(request, None)
        self.assertEqual("", second_reply.shard.name)
        self.assertEqual(1, second_reply.model_version)
Example #13
0
    def test_create_tasks_with_non_zero_start_ind(self):
        """Check task ranges when the second shard starts at index 10."""
        manager = create_task_manager([("f1", 0, 10), ("f2", 10, 10)], [])

        # Expected split relative to each shard's start index.
        relative_ranges = ((0, 3), (3, 6), (6, 9), (9, 10))
        all_tasks = [
            (shard_name, offset + start, offset + end,
             elasticai_api_pb2.TRAINING, -1)
            for shard_name, offset in (("f1", 0), ("f2", 10))
            for start, end in relative_ranges
        ]

        # Drain the queue: worker ids 0..3 each receive 2 tasks.
        got_tasks = [
            manager.get(worker_id)
            for worker_id in range(4)
            for _ in range(2)
        ]

        # Task ids must run from 1 to 8.
        got_ids = [task_id for task_id, _ in got_tasks]
        self.assertEqual(list(range(1, 9)), got_ids)

        # The task payloads must match the expected split.
        got_infos = sorted(task._info() for _, task in got_tasks)
        self.assertEqual(got_infos, all_tasks)