Example #1
    def __init__(self):
        """Constructor
        """

        self._running_job_exes = {}  # {ID: RunningJobExecution}
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())
Example #2
    def setUp(self):
        django.setup()

        self.alg_error = error_test_utils.create_error(category='ALGORITHM')
        self.data_error = error_test_utils.create_error(category='DATA')
        self.system_error = error_test_utils.create_error(category='SYSTEM')

        self.metrics = TotalJobExeMetrics(now())
Example #3
    def __init__(self):
        """Constructor
        """

        self._job_exe_end_models = []  # Holds job_exe_end models to send in next messages
        self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}
        self._running_job_messages = []  # Holds running job messages to send
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())
Example #4
    def __init__(self):
        """Constructor
        """

        # Execution information to be sent in command messages
        self._finished_job_exes = []  # Holds finished executions
        self._job_exe_end_models = []  # Holds job_exe_end models to create
        self._running_job_messages = []  # Holds running job messages

        # Current running state
        self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())
Example #5
class TestTotalJobExeMetrics(TestCase):
    """Tests the TotalJobExeMetrics class"""
    def setUp(self):
        django.setup()

        self.alg_error = error_test_utils.create_error(category='ALGORITHM')
        self.data_error = error_test_utils.create_error(category='DATA')
        self.system_error = error_test_utils.create_error(category='SYSTEM')

        self.metrics = TotalJobExeMetrics(now())

    def test_init_with_database(self):
        """Tests calling init_with_database() successfully to load in job executions from the database"""

        # First block of job executions
        end_time_1 = (now() - FinishedJobExeMetricsOverTime.BLOCK_LENGTH -
                      FinishedJobExeMetricsOverTime.BLOCK_LENGTH)
        node_model_1 = node_test_utils.create_node()
        job_type_1 = job_test_utils.create_seed_job_type()
        job_type_2 = job_test_utils.create_seed_job_type()
        job_exe_model_1 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='COMPLETED',
                                                        ended=end_time_1,
                                                        node=node_model_1)
        job_exe_model_2 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='COMPLETED',
                                                        ended=end_time_1,
                                                        node=node_model_1)
        job_exe_model_3 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='FAILED',
                                                        ended=end_time_1,
                                                        error=self.alg_error,
                                                        node=node_model_1)
        job_exe_model_4 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='FAILED',
                                                        ended=end_time_1,
                                                        error=self.alg_error,
                                                        node=node_model_1)
        job_exe_model_5 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='FAILED',
                                                        ended=end_time_1,
                                                        error=self.alg_error,
                                                        node=node_model_1)
        job_exe_model_6 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='FAILED',
                                                        ended=end_time_1,
                                                        error=self.data_error,
                                                        node=node_model_1)
        job_exe_model_7 = job_test_utils.create_job_exe(
            job_type=job_type_1,
            status='FAILED',
            ended=end_time_1,
            error=self.system_error,
            node=node_model_1)
        job_exe_model_8 = job_test_utils.create_job_exe(
            job_type=job_type_2,
            status='FAILED',
            ended=end_time_1,
            error=self.system_error,
            node=node_model_1)
        node_model_2 = node_test_utils.create_node()
        job_exe_model_9 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='COMPLETED',
                                                        ended=end_time_1,
                                                        node=node_model_2)
        job_exe_model_10 = job_test_utils.create_job_exe(job_type=job_type_2,
                                                         status='COMPLETED',
                                                         ended=end_time_1,
                                                         node=node_model_2)
        job_exe_model_11 = job_test_utils.create_job_exe(job_type=job_type_2,
                                                         status='FAILED',
                                                         ended=end_time_1,
                                                         error=self.data_error,
                                                         node=node_model_2)
        # Second block of job executions (one time block over from first set of executions)
        end_time_2 = end_time_1 + FinishedJobExeMetricsOverTime.BLOCK_LENGTH
        job_exe_model_12 = job_test_utils.create_job_exe(
            job_type=job_type_2,
            status='FAILED',
            ended=end_time_2,
            error=self.system_error,
            node=node_model_1)
        job_exe_model_13 = job_test_utils.create_job_exe(
            job_type=job_type_2,
            status='FAILED',
            ended=end_time_2,
            error=self.system_error,
            node=node_model_1)
        job_exe_model_14 = job_test_utils.create_job_exe(job_type=job_type_2,
                                                         status='COMPLETED',
                                                         ended=end_time_2,
                                                         node=node_model_2)
        # Load all initial executions from database
        self.metrics.init_with_database()

        # Generate JSON which should include both sets of job executions
        right_now = end_time_2 + datetime.timedelta(seconds=30)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, right_now)

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 2)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 8)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 3)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            4)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 3)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 1)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

        # Generate JSON which should include only second set of job executions (first set rolled off by time)
        later = end_time_1 + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD + datetime.timedelta(
            seconds=1)
        later += FinishedJobExeMetricsOverTime.BLOCK_LENGTH
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, later)

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 2)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            2)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 1)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

        # Generate JSON where all job executions should have rolled off by time
        later = later + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, later)

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

    def test_running_executions(self):
        """Tests the metrics with running executions that complete"""

        node_model_1 = node_test_utils.create_node()
        node_model_2 = node_test_utils.create_node()
        job_type_1 = job_test_utils.create_seed_job_type()
        job_type_2 = job_test_utils.create_seed_job_type()
        job_exe_1 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_1,
                                                          node=node_model_1)
        job_exe_2 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_1,
                                                          node=node_model_1)
        job_exe_3 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_1,
                                                          node=node_model_1)
        job_exe_4 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_2,
                                                          node=node_model_1)
        job_exe_5 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_1,
                                                          node=node_model_2)
        job_exe_6 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_1,
                                                          node=node_model_2)
        job_exe_7 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_2,
                                                          node=node_model_2)
        job_exe_8 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_2,
                                                          node=node_model_2)
        job_exe_9 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_2,
                                                          node=node_model_2)
        job_exe_10 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                           job_type=job_type_2,
                                                           node=node_model_2)
        job_exe_11 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                           job_type=job_type_2,
                                                           node=node_model_2)

        # NOTE: This unit test is about to get CRAZY. I apologize for the complexity, but this is needed for a
        # thorough testing
        self.metrics.add_running_job_exes([
            job_exe_1, job_exe_2, job_exe_3, job_exe_4, job_exe_5, job_exe_6,
            job_exe_7, job_exe_8, job_exe_9, job_exe_10, job_exe_11
        ])
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, now())

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 4)
        for job_type_dict in node_list_dict[0]['job_executions']['running'][
                'by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 3)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 1)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['running'][
                'by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

        # Finish some job executions
        end_time_1 = now()
        job_exe_1._set_final_status('COMPLETED', end_time_1)
        job_exe_2._set_final_status('FAILED',
                                    end_time_1,
                                    error=self.data_error)
        job_exe_4._set_final_status('FAILED', end_time_1, error=self.alg_error)
        self.metrics.job_exe_finished(job_exe_1)
        self.metrics.job_exe_finished(job_exe_2)
        self.metrics.job_exe_finished(job_exe_4)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(
            node_list_dict, end_time_1 + datetime.timedelta(seconds=1))

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['running']['by_job_type']),
            1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['completed']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
            ['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
            ['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 2)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['failed']['algorithm']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['by_job_type'][0]['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['by_job_type'][0]['job_type_id'], job_type_2.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['failed']['data']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']
            ['by_job_type'][0]['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']
            ['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['running'][
                'by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

        # Finish some job executions (all executions still on node 2)
        end_time_2 = end_time_1 + FinishedJobExeMetricsOverTime.BLOCK_LENGTH
        job_exe_5._set_final_status('COMPLETED', end_time_2)
        job_exe_6._set_final_status('COMPLETED', end_time_2)
        job_exe_7._set_final_status('COMPLETED', end_time_2)
        job_exe_8._set_final_status('COMPLETED', end_time_2)
        job_exe_9._set_final_status('COMPLETED', end_time_2)
        job_exe_10._set_final_status('COMPLETED', end_time_2)
        job_exe_11._set_final_status('COMPLETED', end_time_2)
        self.metrics.job_exe_finished(job_exe_5)
        self.metrics.job_exe_finished(job_exe_6)
        self.metrics.job_exe_finished(job_exe_7)
        self.metrics.job_exe_finished(job_exe_8)
        self.metrics.job_exe_finished(job_exe_9)
        self.metrics.job_exe_finished(job_exe_10)
        self.metrics.job_exe_finished(job_exe_11)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, end_time_2)

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['running']['by_job_type']),
            1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['completed']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
            ['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
            ['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 2)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['failed']['algorithm']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['by_job_type'][0]['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['by_job_type'][0]['job_type_id'], job_type_2.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['failed']['data']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']
            ['by_job_type'][0]['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']
            ['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['completed'][
                'by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

        # Let all finished job executions roll off by time, only running remaining
        end_time_3 = end_time_2 + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD
        end_time_3 += FinishedJobExeMetricsOverTime.BLOCK_LENGTH + datetime.timedelta(
            seconds=1)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, end_time_3)

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['running']['by_job_type']),
            1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)
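
The test above hinges on finished executions aging out of the metrics once they fall outside FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD. A minimal standalone sketch of that roll-off idea, using an illustrative window size and a plain deque rather than the project's block-based implementation:

import datetime
from collections import deque


class RollingExeCounter(object):
    """Counts finished executions inside a sliding time window, dropping ones that age out."""

    TOTAL_TIME_PERIOD = datetime.timedelta(hours=3)  # illustrative window, not Scale's value

    def __init__(self):
        self._ended_times = deque()  # end times of finished executions, appended in time order

    def add_finished(self, ended):
        """Records an execution that finished at the given time."""
        self._ended_times.append(ended)

    def total(self, when):
        """Returns how many executions finished within the window ending at when."""
        cutoff = when - self.TOTAL_TIME_PERIOD
        while self._ended_times and self._ended_times[0] < cutoff:
            self._ended_times.popleft()  # rolled off by time
        return len(self._ended_times)

In miniature this mirrors the assertions above: total(ended) counts a freshly finished execution, while total(ended + TOTAL_TIME_PERIOD + 1 second) no longer does.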
Example #6
    def clear(self):
        """Clears all data from the manager. This method is intended for testing only.
        """

        self._running_job_exes = {}
        self._metrics = TotalJobExeMetrics(now())
Example #7
class JobExecutionManager(object):
    """This class manages all running and finished job executions. This class is thread-safe."""
    def __init__(self):
        """Constructor
        """

        # Execution information to be sent in command messages
        self._finished_job_exes = []  # Holds finished executions
        self._job_exe_end_models = []  # Holds job_exe_end models to create
        self._running_job_messages = []  # Holds running job messages

        # Current running state
        self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())

    def add_canceled_job_exes(self, job_exe_ends):
        """Adds the given job_exe_end models for job executions canceled off of the queue

        :param job_exe_ends: The job_exe_end models to add
        :type job_exe_ends: list
        """

        with self._lock:
            self._job_exe_end_models.extend(job_exe_ends)

    def check_for_starvation(self, when):
        """Checks all of the currently running job executions for resource starvation. If any starved executions are
        found, they are failed and returned.

        :param when: The current time
        :type when: :class:`datetime.datetime`
        :returns: A list of the starved job executions
        :rtype: list
        """

        finished_job_exes = []
        with self._lock:
            for job_exe in list(self._running_job_exes.values()):  # copy; entries may be removed below
                if job_exe.check_for_starvation(when):
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        return finished_job_exes

    def clear(self):
        """Clears all data from the manager. This method is intended for testing only.
        """

        self._running_job_exes = {}
        self._metrics = TotalJobExeMetrics(now())

    def generate_status_json(self, nodes_list, when):
        """Generates the portion of the status JSON that describes the job execution metrics

        :param nodes_list: The list of nodes within the status JSON
        :type nodes_list: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        """

        with self._lock:
            self._metrics.generate_status_json(nodes_list, when)

    def get_messages(self):
        """Returns all messages related to jobs and executions that need to be sent

        :returns: The list of job-related messages to send
        :rtype: list
        """

        running_job_messages = None
        job_exe_end_models = None
        finished_job_exes = None

        with self._lock:
            finished_job_exes = self._finished_job_exes
            job_exe_end_models = self._job_exe_end_models
            running_job_messages = self._running_job_messages
            self._finished_job_exes = []
            self._job_exe_end_models = []
            self._running_job_messages = []

        # Start with running job messages
        messages = running_job_messages

        # Add messages for creating job_exe_end models
        messages.extend(create_job_exe_end_messages(job_exe_end_models))

        # Add messages for finished job executions
        messages.extend(
            self._create_finished_job_exe_messages(finished_job_exes))

        return messages

    def get_running_job_exe(self, cluster_id):
        """Returns the running job execution with the given cluster ID, or None if the job execution does not exist

        :param cluster_id: The cluster ID of the job execution to return
        :type cluster_id: int
        :returns: The running job execution with the given cluster ID, possibly None
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        with self._lock:
            if cluster_id in self._running_job_exes:
                return self._running_job_exes[cluster_id]
            return None

    def get_running_job_exes(self):
        """Returns all currently running job executions

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        with self._lock:
            return list(self._running_job_exes.values())

    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the time out occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task.id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    # We do not remove the failed job execution at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.
                    job_exe.execution_timed_out(task, when)

    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task_update.task_id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        return job_exe

        return None

    def init_with_database(self):
        """Initializes the job execution metrics with the execution history from the database
        """

        with self._lock:
            self._metrics.init_with_database()

    def lost_job_exes(self, job_exe_ids, when):
        """Informs the manager that the job executions with the given IDs were lost

        :param job_exe_ids: The IDs of the lost job executions
        :type job_exe_ids: list
        :param when: The time that the executions were lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the finished job executions
        :rtype: list
        """

        lost_job_exe_ids = set(job_exe_ids)

        finished_job_exes = []
        with self._lock:
            for job_exe in list(self._running_job_exes.values()):  # copy; entries may be removed below
                if job_exe.id in lost_job_exe_ids:
                    job_exe.execution_lost(when)
                    task = job_exe.current_task
                    if task:
                        # Node could be deprecated, so force kill the current task
                        task.force_kill()
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        return finished_job_exes

    def lost_node(self, node_id, when):
        """Informs the manager that the node with the given ID was lost and has gone offline

        :param node_id: The ID of the lost node
        :type node_id: int
        :param when: The time that the node was lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the finished job executions
        :rtype: list
        """

        finished_job_exes = []
        with self._lock:
            for job_exe in list(self._running_job_exes.values()):  # copy; entries may be removed below
                if job_exe.node_id == node_id:
                    job_exe.execution_lost(when)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        return finished_job_exes

    def schedule_job_exes(self, job_exes, messages):
        """Adds newly scheduled running job executions to the manager

        :param job_exes: A list of the running job executions to add
        :type job_exes: list
        :param messages: The messages for the running jobs
        :type messages: list
        """

        with self._lock:
            for job_exe in job_exes:
                self._running_job_exes[job_exe.cluster_id] = job_exe
            self._running_job_messages.extend(messages)
            self._metrics.add_running_job_exes(job_exes)

    def sync_with_database(self):
        """Syncs with the database to handle any canceled executions. Any job executions that are now finished are
        returned.

        :returns: A list of the finished job executions
        :rtype: list
        """

        job_ids = []
        running_job_exes = []
        with self._lock:
            for running_job_exe in self._running_job_exes.values():
                job_ids.append(running_job_exe.job_id)
                running_job_exes.append(running_job_exe)

        # Query job models from database to check if any running executions have been canceled
        job_models = {}
        for job in Job.objects.filter(id__in=job_ids):
            job_models[job.id] = job

        finished_job_exes = []
        when_canceled = now()
        with self._lock:
            for running_job_exe in running_job_exes:
                job_model = job_models[running_job_exe.job_id]
                # If the job has been canceled or the job has a newer execution, this execution must be canceled
                if job_model.status == 'CANCELED' or job_model.num_exes > running_job_exe.exe_num:
                    running_job_exe.execution_canceled(when_canceled)
                    if running_job_exe.is_finished():
                        self._handle_finished_job_exe(running_job_exe)
                        finished_job_exes.append(running_job_exe)

        return finished_job_exes

    def _create_finished_job_exe_messages(self, finished_job_exes):
        """Creates messages for finished job executions

        :param finished_job_exes: The finished job executions
        :type finished_job_exes: list
        :returns: The messages
        :rtype: list
        """

        when = now()

        completed_jobs = []
        failed_jobs = []
        for job_exe in finished_job_exes:
            if job_exe.status == 'COMPLETED':
                completed_jobs.append(
                    CompletedJob(job_exe.job_id, job_exe.exe_num))
            elif job_exe.status == 'FAILED':
                failed_jobs.append(
                    FailedJob(job_exe.job_id, job_exe.exe_num,
                              job_exe.error.id))

        messages = create_completed_jobs_messages(completed_jobs, when)
        messages.extend(create_failed_jobs_messages(failed_jobs, when))

        return messages

    def _handle_finished_job_exe(self, running_job_exe):
        """Handles the finished job execution. Caller must have obtained the manager lock.

        :param running_job_exe: The finished job execution
        :type running_job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        # Create job_exe_end model for the finished job execution and send it in a future message
        self._job_exe_end_models.append(
            running_job_exe.create_job_exe_end_model())

        # Collect finished job execution to send a future job update message
        self._finished_job_exes.append(running_job_exe)

        # Remove the finished job execution and update the metrics
        del self._running_job_exes[running_job_exe.cluster_id]
        self._metrics.job_exe_finished(running_job_exe)
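
get_messages() above drains the pending lists by swapping them out while the lock is held and only then builds the outgoing messages. A stripped-down sketch of that drain-and-swap pattern, with generic names rather than the project's message API:

import threading


class PendingWork(object):
    """Accumulates items from many threads and lets one caller drain them atomically."""

    def __init__(self):
        self._lock = threading.Lock()
        self._pending = []

    def add(self, item):
        with self._lock:
            self._pending.append(item)

    def drain(self):
        # Swap the list out under the lock so producers never wait on whatever
        # slow processing happens after the drain
        with self._lock:
            items = self._pending
            self._pending = []
        return items

The same swap keeps the critical section in get_messages() short: message construction happens on the local copies after the lock is released.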
Example #8
class JobExecutionManager(object):
    """This class manages all running and finished job executions. This class is thread-safe."""
    def __init__(self):
        """Constructor
        """

        self._job_exe_end_models = []  # Holds job_exe_end models to send in next messages
        self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}
        self._running_job_messages = []  # Holds running job messages to send
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())

    def add_canceled_job_exes(self, job_exe_ends):
        """Adds the given job_exe_end models for job executions canceled off of the queue

        :param job_exe_ends: The job_exe_end models to add
        :type job_exe_ends: list
        """

        with self._lock:
            self._job_exe_end_models.extend(job_exe_ends)

    def clear(self):
        """Clears all data from the manager. This method is intended for testing only.
        """

        self._running_job_exes = {}
        self._metrics = TotalJobExeMetrics(now())

    def generate_status_json(self, nodes_list, when):
        """Generates the portion of the status JSON that describes the job execution metrics

        :param nodes_list: The list of nodes within the status JSON
        :type nodes_list: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        """

        with self._lock:
            self._metrics.generate_status_json(nodes_list, when)

    def get_messages(self):
        """Returns all messages related to jobs and executions that need to be sent

        :returns: The list of job-related messages to send
        :rtype: list
        """

        with self._lock:
            messages = self._running_job_messages
            self._running_job_messages = []

            message = None
            for job_exe_end in self._job_exe_end_models:
                if not message:
                    message = CreateJobExecutionEnd()
                elif not message.can_fit_more():
                    messages.append(message)
                    message = CreateJobExecutionEnd()
                message.add_job_exe_end(job_exe_end)
            if message:
                messages.append(message)
            self._job_exe_end_models = []

        return messages

    def get_running_job_exe(self, cluster_id):
        """Returns the running job execution with the given cluster ID, or None if the job execution does not exist

        :param cluster_id: The cluster ID of the job execution to return
        :type cluster_id: int
        :returns: The running job execution with the given cluster ID, possibly None
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        with self._lock:
            if cluster_id in self._running_job_exes:
                return self._running_job_exes[cluster_id]
            return None

    def get_running_job_exes(self):
        """Returns all currently running job executions

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        with self._lock:
            return list(self._running_job_exes.values())

    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the time out occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task.id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    # We do not remove the failed job execution at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.
                    job_exe.execution_timed_out(task, when)

    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        finished_job_exe = None
        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task_update.task_id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exe = job_exe
                        # return job_exe

        # TODO: this can be removed once database operations move to messaging backend
        if finished_job_exe:
            self._handle_finished_job_exe_in_database(finished_job_exe)
            return finished_job_exe

        return None

    def init_with_database(self):
        """Initializes the job execution metrics with the execution history from the database
        """

        with self._lock:
            self._metrics.init_with_database()

    def lost_node(self, node_id, when):
        """Informs the manager that the node with the given ID was lost and has gone offline

        :param node_id: The ID of the lost node
        :type node_id: int
        :param when: The time that the node was lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the lost job executions that had been running on the node
        :rtype: list
        """

        lost_exes = []
        finished_job_exes = []
        with self._lock:
            for job_exe in list(self._running_job_exes.values()):  # copy; entries may be removed below
                if job_exe.node_id == node_id:
                    lost_exes.append(job_exe)
                    job_exe.execution_lost(when)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        # TODO: this can be removed once database operations move to messaging backend
        for finished_job_exe in finished_job_exes:
            self._handle_finished_job_exe_in_database(finished_job_exe)

        return lost_exes

    def schedule_job_exes(self, job_exes, messages):
        """Adds newly scheduled running job executions to the manager

        :param job_exes: A list of the running job executions to add
        :type job_exes: list
        :param messages: The messages for the running jobs
        :type messages: list
        """

        with self._lock:
            for job_exe in job_exes:
                self._running_job_exes[job_exe.cluster_id] = job_exe
            self._running_job_messages.extend(messages)
            self._metrics.add_running_job_exes(job_exes)

    def sync_with_database(self):
        """Syncs with the database to handle any canceled executions. The current task of each canceled job execution is
        returned so the tasks may be killed.

        :returns: A list of the canceled tasks to kill
        :rtype: [:class:`job.tasks.base_task.Task`]
        """

        job_ids = []
        running_job_exes = []
        with self._lock:
            for running_job_exe in self._running_job_exes.values():
                job_ids.append(running_job_exe.job_id)
                running_job_exes.append(running_job_exe)

        # Query job models from database to check if any running executions have been canceled
        job_models = {}
        for job in Job.objects.filter(id__in=job_ids):
            job_models[job.id] = job

        canceled_tasks = []
        finished_job_exes = []
        when_canceled = now()
        with self._lock:
            for running_job_exe in running_job_exes:
                job_model = job_models[running_job_exe.job_id]
                # If the job has been canceled or the job has a newer execution, this execution must be canceled
                if job_model.status == 'CANCELED' or job_model.num_exes > running_job_exe.exe_num:
                    task = running_job_exe.execution_canceled(when_canceled)
                    if task:
                        # Since it has an outstanding task, we do not remove the canceled job execution at this point.
                        # We wait for the status update of the killed task to come back so that job execution cleanup
                        # occurs after the task is dead.
                        canceled_tasks.append(task)
                    else:
                        if running_job_exe.is_finished():
                            self._handle_finished_job_exe(running_job_exe)
                            finished_job_exes.append(running_job_exe)

        # TODO: this can be removed once database operations move to messaging backend
        for finished_job_exe in finished_job_exes:
            self._handle_finished_job_exe_in_database(finished_job_exe)

        return canceled_tasks

    def _handle_finished_job_exe(self, running_job_exe):
        """Handles the finished job execution. Caller must have obtained the manager lock.

        :param running_job_exe: The finished job execution
        :type running_job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        # Create job_exe_end model for the finished job execution and send it in next messages
        self._job_exe_end_models.append(
            running_job_exe.create_job_exe_end_model())

        # Remove the finished job execution and update the metrics
        del self._running_job_exes[running_job_exe.cluster_id]
        self._metrics.job_exe_finished(running_job_exe)

    def _handle_finished_job_exe_in_database(self, running_job_exe):
        """Handles the finished job execution by performing any needed database operations. This is a stop gap until
        these database operations move to the messaging backend.

        :param running_job_exe: The finished job execution
        :type running_job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        # TODO: handling job completion and failure here for now, later these will be sent via messaging backend in a
        # background thread
        from queue.models import Queue
        job_id = running_job_exe.job_id
        exe_num = running_job_exe.exe_num
        when = running_job_exe.finished
        if running_job_exe.status == 'COMPLETED':
            Queue.objects.handle_job_completion(job_id, exe_num, when)
        elif running_job_exe.status == 'FAILED':
            Queue.objects.handle_job_failure(job_id, exe_num, when,
                                             running_job_exe.error)
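
The job_exe_end loop in get_messages() above packs models into CreateJobExecutionEnd messages until each one reports it cannot fit more. A generic sketch of that batching loop, with a hypothetical Batch class standing in for the real command message:

class Batch(object):
    """Hypothetical fixed-capacity container standing in for a command message."""

    MAX_ITEMS = 100  # illustrative capacity

    def __init__(self):
        self.items = []

    def can_fit_more(self):
        return len(self.items) < self.MAX_ITEMS

    def add(self, item):
        self.items.append(item)


def pack_into_batches(items):
    """Packs items into as few batches as possible, mirroring the loop in get_messages()."""
    batches = []
    batch = None
    for item in items:
        if not batch:
            batch = Batch()
        elif not batch.can_fit_more():
            batches.append(batch)
            batch = Batch()
        batch.add(item)
    if batch:
        batches.append(batch)
    return batches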
Example #9
class JobExecutionManager(object):
    """This class manages all running and finished job executions. This class is thread-safe."""
    def __init__(self):
        """Constructor
        """

        self._running_job_exes = {}  # {ID: RunningJobExecution}
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())

    def generate_status_json(self, nodes_list, when):
        """Generates the portion of the status JSON that describes the job execution metrics

        :param nodes_list: The list of nodes within the status JSON
        :type nodes_list: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        """

        with self._lock:
            self._metrics.generate_status_json(nodes_list, when)

    def get_ready_job_exes(self):
        """Returns all running job executions that are ready to execute their next task

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        ready_exes = []
        with self._lock:
            for job_exe_id in self._running_job_exes:
                job_exe = self._running_job_exes[job_exe_id]
                if job_exe.is_next_task_ready():
                    ready_exes.append(job_exe)
        return ready_exes

    def get_running_job_exe(self, job_exe_id):
        """Returns the running job execution with the given ID, or None if the job execution does not exist

        :param job_exe_id: The ID of the job execution to return
        :type job_exe_id: int
        :returns: The running job execution with the given ID, possibly None
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        with self._lock:
            if job_exe_id in self._running_job_exes:
                return self._running_job_exes[job_exe_id]
            return None

    def get_running_job_exes(self):
        """Returns all currently running job executions

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        running_job_exes = []
        with self._lock:
            for job_exe_id in self._running_job_exes:
                running_job_exes.append(self._running_job_exes[job_exe_id])
        return running_job_exes

    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the time out occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            job_exe_id = JobExecution.get_job_exe_id(task.id)
            with self._lock:
                if job_exe_id in self._running_job_exes:
                    job_exe = self._running_job_exes[job_exe_id]
                    try:
                        job_exe.execution_timed_out(task, when)
                    except DatabaseError:
                        logger.exception(
                            'Error failing timed out job execution %i',
                            job_exe_id)
                    # We do not remove timed out job executions at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.

    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            job_exe_id = JobExecution.get_job_exe_id(task_update.task_id)
            with self._lock:
                if job_exe_id in self._running_job_exes:
                    job_exe = self._running_job_exes[job_exe_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        return job_exe

        return None

    def init_with_database(self):
        """Initializes the job execution metrics with the execution history from the database
        """

        with self._lock:
            self._metrics.init_with_database()

    def lost_node(self, node_id, when):
        """Informs the manager that the node with the given ID was lost and has gone offline

        :param node_id: The ID of the lost node
        :type node_id: int
        :param when: The time that the node was lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the lost job executions that had been running on the node
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        lost_exes = []
        with self._lock:
            for job_exe_id in list(self._running_job_exes.keys()):  # copy; entries may be removed below
                job_exe = self._running_job_exes[job_exe_id]
                if job_exe.node_id == node_id:
                    lost_exes.append(job_exe)
                    try:
                        job_exe.execution_lost(when)
                    except DatabaseError:
                        logger.exception(
                            'Error failing lost job execution: %s', job_exe.id)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
        return lost_exes

    def schedule_job_exes(self, job_exes):
        """Adds newly scheduled running job executions to the manager

        :param job_exes: A list of the running job executions to add
        :type job_exes: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        with self._lock:
            for job_exe in job_exes:
                self._running_job_exes[job_exe.id] = job_exe
            self._metrics.add_running_job_exes(job_exes)

    def sync_with_database(self):
        """Syncs with the database to handle any canceled executions. The current task of each canceled job execution is
        returned so the tasks may be killed.

        :returns: A list of the canceled tasks to kill
        :rtype: [:class:`job.tasks.base_task.Task`]
        """

        with self._lock:
            job_exe_ids = list(self._running_job_exes.keys())

        canceled_tasks = []
        canceled_models = list(
            JobExecution.objects.filter(id__in=job_exe_ids,
                                        status='CANCELED').iterator())

        with self._lock:
            for job_exe_model in canceled_models:
                if job_exe_model.id in self._running_job_exes:
                    canceled_job_exe = self._running_job_exes[job_exe_model.id]
                    try:
                        task = canceled_job_exe.execution_canceled()
                        if task:
                            canceled_tasks.append(task)
                    except DatabaseError:
                        logger.exception('Error canceling job execution %i',
                                         job_exe_model.id)
                    # We do not remove canceled job executions at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.

        return canceled_tasks

    def _handle_finished_job_exe(self, job_exe):
        """Handles the finished job execution. Caller must have obtained the manager lock.

        :param job_exe: The finished job execution
        :type job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        del self._running_job_exes[job_exe.id]
        self._metrics.job_exe_finished(job_exe)
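
Every version of handle_task_timeout() and handle_task_update() above follows the same shape: check the JOB_TASK_ID_PREFIX, pull the execution ID out of the task ID, then look the execution up under the lock. A minimal sketch of that dispatch shape; the prefix value and the ID parsing here are made up for illustration and do not reflect Scale's real task-ID format:

import threading

JOB_TASK_ID_PREFIX = 'job_'  # illustrative prefix, not the project's actual constant


class TaskDispatcher(object):
    """Routes task updates to the running execution they belong to."""

    def __init__(self):
        self._lock = threading.Lock()
        self._running = {}  # {execution ID: execution object}

    def register(self, exe_id, job_exe):
        with self._lock:
            self._running[exe_id] = job_exe

    def handle_task_update(self, task_id, update):
        """Returns the execution that received the update, or None if it is not a job task."""
        if not task_id.startswith(JOB_TASK_ID_PREFIX):
            return None
        exe_id = task_id[len(JOB_TASK_ID_PREFIX):]  # made-up parsing for this sketch
        with self._lock:
            job_exe = self._running.get(exe_id)
            if job_exe is not None:
                job_exe.task_update(update)
            return job_exe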