Example #1
    def __init__(self):
        """Constructor
        """

        self._running_job_exes = {}  # {ID: RunningJobExecution}
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())
Example #2
    def setUp(self):
        django.setup()

        self.alg_error = error_test_utils.create_error(category='ALGORITHM')
        self.data_error = error_test_utils.create_error(category='DATA')
        self.system_error = error_test_utils.create_error(category='SYSTEM')

        self.metrics = TotalJobExeMetrics(now())
Example #3
    def __init__(self):
        """Constructor
        """

        self._job_exe_end_models = []  # Holds job_exe_end models to send in next messages
        self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}
        self._running_job_messages = []  # Holds running job messages to send
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())
Example #4
    def __init__(self):
        """Constructor
        """

        # Execution information to be sent in command messages
        self._finished_job_exes = []  # Holds finished executions
        self._job_exe_end_models = []  # Holds job_exe_end models to create
        self._running_job_messages = []  # Holds running job messages

        # Current running state
        self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())
Example #5
class TestTotalJobExeMetrics(TestCase):
    """Tests the TotalJobExeMetrics class"""
    def setUp(self):
        django.setup()

        self.alg_error = error_test_utils.create_error(category='ALGORITHM')
        self.data_error = error_test_utils.create_error(category='DATA')
        self.system_error = error_test_utils.create_error(category='SYSTEM')

        self.metrics = TotalJobExeMetrics(now())

    def test_init_with_database(self):
        """Tests calling init_with_database() successfully to load in job executions from the database"""

        # First block of job executions
        end_time_1 = (now() - FinishedJobExeMetricsOverTime.BLOCK_LENGTH -
                      FinishedJobExeMetricsOverTime.BLOCK_LENGTH)
        node_model_1 = node_test_utils.create_node()
        job_type_1 = job_test_utils.create_seed_job_type()
        job_type_2 = job_test_utils.create_seed_job_type()
        job_exe_model_1 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='COMPLETED',
                                                        ended=end_time_1,
                                                        node=node_model_1)
        job_exe_model_2 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='COMPLETED',
                                                        ended=end_time_1,
                                                        node=node_model_1)
        job_exe_model_3 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='FAILED',
                                                        ended=end_time_1,
                                                        error=self.alg_error,
                                                        node=node_model_1)
        job_exe_model_4 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='FAILED',
                                                        ended=end_time_1,
                                                        error=self.alg_error,
                                                        node=node_model_1)
        job_exe_model_5 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='FAILED',
                                                        ended=end_time_1,
                                                        error=self.alg_error,
                                                        node=node_model_1)
        job_exe_model_6 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='FAILED',
                                                        ended=end_time_1,
                                                        error=self.data_error,
                                                        node=node_model_1)
        job_exe_model_7 = job_test_utils.create_job_exe(
            job_type=job_type_1,
            status='FAILED',
            ended=end_time_1,
            error=self.system_error,
            node=node_model_1)
        job_exe_model_8 = job_test_utils.create_job_exe(
            job_type=job_type_2,
            status='FAILED',
            ended=end_time_1,
            error=self.system_error,
            node=node_model_1)
        node_model_2 = node_test_utils.create_node()
        job_exe_model_9 = job_test_utils.create_job_exe(job_type=job_type_1,
                                                        status='COMPLETED',
                                                        ended=end_time_1,
                                                        node=node_model_2)
        job_exe_model_10 = job_test_utils.create_job_exe(job_type=job_type_2,
                                                         status='COMPLETED',
                                                         ended=end_time_1,
                                                         node=node_model_2)
        job_exe_model_11 = job_test_utils.create_job_exe(job_type=job_type_2,
                                                         status='FAILED',
                                                         ended=end_time_1,
                                                         error=self.data_error,
                                                         node=node_model_2)
        # Second block of job executions (one time block over from first set of executions)
        end_time_2 = end_time_1 + FinishedJobExeMetricsOverTime.BLOCK_LENGTH
        job_exe_model_12 = job_test_utils.create_job_exe(
            job_type=job_type_2,
            status='FAILED',
            ended=end_time_2,
            error=self.system_error,
            node=node_model_1)
        job_exe_model_13 = job_test_utils.create_job_exe(
            job_type=job_type_2,
            status='FAILED',
            ended=end_time_2,
            error=self.system_error,
            node=node_model_1)
        job_exe_model_14 = job_test_utils.create_job_exe(job_type=job_type_2,
                                                         status='COMPLETED',
                                                         ended=end_time_2,
                                                         node=node_model_2)
        # Load all initial executions from database
        self.metrics.init_with_database()

        # Generate JSON which should include both sets of job executions
        right_now = end_time_2 + datetime.timedelta(seconds=30)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, right_now)

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 2)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 8)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 3)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            4)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 3)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 1)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

        # Generate JSON which should include only second set of job executions (first set rolled off by time)
        later = end_time_1 + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD + datetime.timedelta(
            seconds=1)
        later += FinishedJobExeMetricsOverTime.BLOCK_LENGTH
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, later)

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 2)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            2)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 1)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

        # Generate JSON where all job executions should have rolled off by time
        later = later + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, later)

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

    def test_running_executions(self):
        """Tests the metrics with running executions that complete"""

        node_model_1 = node_test_utils.create_node()
        node_model_2 = node_test_utils.create_node()
        job_type_1 = job_test_utils.create_seed_job_type()
        job_type_2 = job_test_utils.create_seed_job_type()
        job_exe_1 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_1,
                                                          node=node_model_1)
        job_exe_2 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_1,
                                                          node=node_model_1)
        job_exe_3 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_1,
                                                          node=node_model_1)
        job_exe_4 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_2,
                                                          node=node_model_1)
        job_exe_5 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_1,
                                                          node=node_model_2)
        job_exe_6 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_1,
                                                          node=node_model_2)
        job_exe_7 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_2,
                                                          node=node_model_2)
        job_exe_8 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_2,
                                                          node=node_model_2)
        job_exe_9 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                          job_type=job_type_2,
                                                          node=node_model_2)
        job_exe_10 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                           job_type=job_type_2,
                                                           node=node_model_2)
        job_exe_11 = job_test_utils.create_running_job_exe(agent_id='agent',
                                                           job_type=job_type_2,
                                                           node=node_model_2)

        # NOTE: This unit test is about to get CRAZY. I apologize for the complexity, but this is needed for a
        # thorough testing
        self.metrics.add_running_job_exes([
            job_exe_1, job_exe_2, job_exe_3, job_exe_4, job_exe_5, job_exe_6,
            job_exe_7, job_exe_8, job_exe_9, job_exe_10, job_exe_11
        ])
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, now())

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 4)
        for job_type_dict in node_list_dict[0]['job_executions']['running'][
                'by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 3)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 1)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['running'][
                'by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

        # Finish some job executions
        end_time_1 = now()
        job_exe_1._set_final_status('COMPLETED', end_time_1)
        job_exe_2._set_final_status('FAILED',
                                    end_time_1,
                                    error=self.data_error)
        job_exe_4._set_final_status('FAILED', end_time_1, error=self.alg_error)
        self.metrics.job_exe_finished(job_exe_1)
        self.metrics.job_exe_finished(job_exe_2)
        self.metrics.job_exe_finished(job_exe_4)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(
            node_list_dict, end_time_1 + datetime.timedelta(seconds=1))

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['running']['by_job_type']),
            1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['completed']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
            ['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
            ['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 2)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['failed']['algorithm']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['by_job_type'][0]['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['by_job_type'][0]['job_type_id'], job_type_2.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['failed']['data']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']
            ['by_job_type'][0]['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']
            ['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['running'][
                'by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

        # Finish some job executions (all executions still on node 2)
        end_time_2 = end_time_1 + FinishedJobExeMetricsOverTime.BLOCK_LENGTH
        job_exe_5._set_final_status('COMPLETED', end_time_2)
        job_exe_6._set_final_status('COMPLETED', end_time_2)
        job_exe_7._set_final_status('COMPLETED', end_time_2)
        job_exe_8._set_final_status('COMPLETED', end_time_2)
        job_exe_9._set_final_status('COMPLETED', end_time_2)
        job_exe_10._set_final_status('COMPLETED', end_time_2)
        job_exe_11._set_final_status('COMPLETED', end_time_2)
        self.metrics.job_exe_finished(job_exe_5)
        self.metrics.job_exe_finished(job_exe_6)
        self.metrics.job_exe_finished(job_exe_7)
        self.metrics.job_exe_finished(job_exe_8)
        self.metrics.job_exe_finished(job_exe_9)
        self.metrics.job_exe_finished(job_exe_10)
        self.metrics.job_exe_finished(job_exe_11)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, end_time_2)

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['running']['by_job_type']),
            1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['completed']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
            ['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['by_job_type'][0]
            ['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 2)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['failed']['algorithm']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['by_job_type'][0]['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['by_job_type'][0]['job_type_id'], job_type_2.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['failed']['data']
                ['by_job_type']), 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']
            ['by_job_type'][0]['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']
            ['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['completed'][
                'by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)

        # Let all finished job executions roll off by time, only running remaining
        end_time_3 = end_time_2 + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD
        end_time_3 += FinishedJobExeMetricsOverTime.BLOCK_LENGTH + datetime.timedelta(
            seconds=1)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, end_time_3)

        # Check expected totals
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(
            len(node_list_dict[0]['job_executions']['running']['by_job_type']),
            1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['count'], 1)
        self.assertEqual(
            node_list_dict[0]['job_executions']['running']['by_job_type'][0]
            ['job_type_id'], job_type_1.id)
        self.assertEqual(
            node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[0]['job_executions']['failed']['system']['total'],
            0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['algorithm']
            ['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(
            node_list_dict[1]['job_executions']['failed']['system']['total'],
            0)
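
The test above hinges on finished executions aging out of the metrics once they fall outside FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD. A minimal standalone sketch of that roll-off idea, using an illustrative window size and a plain deque rather than the project's block-based implementation:

import datetime
from collections import deque


class RollingExeCounter(object):
    """Counts finished executions inside a sliding time window, dropping ones that age out."""

    TOTAL_TIME_PERIOD = datetime.timedelta(hours=3)  # illustrative window, not Scale's value

    def __init__(self):
        self._ended_times = deque()  # end times of finished executions, appended in time order

    def add_finished(self, ended):
        """Records an execution that finished at the given time."""
        self._ended_times.append(ended)

    def total(self, when):
        """Returns how many executions finished within the window ending at when."""
        cutoff = when - self.TOTAL_TIME_PERIOD
        while self._ended_times and self._ended_times[0] < cutoff:
            self._ended_times.popleft()  # rolled off by time
        return len(self._ended_times)

In miniature this mirrors the assertions above: total(ended) counts a freshly finished execution, while total(ended + TOTAL_TIME_PERIOD + 1 second) no longer does.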
Example #6
    def clear(self):
        """Clears all data from the manager. This method is intended for testing only.
        """

        self._running_job_exes = {}
        self._metrics = TotalJobExeMetrics(now())
Example #7
class JobExecutionManager(object):
    """This class manages all running and finished job executions. This class is thread-safe."""
    def __init__(self):
        """Constructor
        """

        # Execution information to be sent in command messages
        self._finished_job_exes = []  # Holds finished executions
        self._job_exe_end_models = []  # Holds job_exe_end models to create
        self._running_job_messages = []  # Holds running job messages

        # Current running state
        self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())

    def add_canceled_job_exes(self, job_exe_ends):
        """Adds the given job_exe_end models for job executions canceled off of the queue

        :param job_exe_ends: The job_exe_end models to add
        :type job_exe_ends: list
        """

        with self._lock:
            self._job_exe_end_models.extend(job_exe_ends)

    def check_for_starvation(self, when):
        """Checks all of the currently running job executions for resource starvation. If any starved executions are
        found, they are failed and returned.

        :param when: The current time
        :type when: :class:`datetime.datetime`
        :returns: A list of the starved job executions
        :rtype: list
        """

        finished_job_exes = []
        with self._lock:
            for job_exe in list(self._running_job_exes.values()):  # copy; entries may be removed below
                if job_exe.check_for_starvation(when):
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        return finished_job_exes

    def clear(self):
        """Clears all data from the manager. This method is intended for testing only.
        """

        self._running_job_exes = {}
        self._metrics = TotalJobExeMetrics(now())

    def generate_status_json(self, nodes_list, when):
        """Generates the portion of the status JSON that describes the job execution metrics

        :param nodes_list: The list of nodes within the status JSON
        :type nodes_list: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        """

        with self._lock:
            self._metrics.generate_status_json(nodes_list, when)

    def get_messages(self):
        """Returns all messages related to jobs and executions that need to be sent

        :returns: The list of job-related messages to send
        :rtype: list
        """

        running_job_messages = None
        job_exe_end_models = None
        finished_job_exes = None

        with self._lock:
            finished_job_exes = self._finished_job_exes
            job_exe_end_models = self._job_exe_end_models
            running_job_messages = self._running_job_messages
            self._finished_job_exes = []
            self._job_exe_end_models = []
            self._running_job_messages = []

        # Start with running job messages
        messages = running_job_messages

        # Add messages for creating job_exe_end models
        messages.extend(create_job_exe_end_messages(job_exe_end_models))

        # Add messages for finished job executions
        messages.extend(
            self._create_finished_job_exe_messages(finished_job_exes))

        return messages

    def get_running_job_exe(self, cluster_id):
        """Returns the running job execution with the given cluster ID, or None if the job execution does not exist

        :param cluster_id: The cluster ID of the job execution to return
        :type cluster_id: int
        :returns: The running job execution with the given cluster ID, possibly None
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        with self._lock:
            if cluster_id in self._running_job_exes:
                return self._running_job_exes[cluster_id]
            return None

    def get_running_job_exes(self):
        """Returns all currently running job executions

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        with self._lock:
            return list(self._running_job_exes.values())

    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the time out occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task.id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    # We do not remove the failed job execution at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.
                    job_exe.execution_timed_out(task, when)

    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task_update.task_id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        return job_exe

        return None

    def init_with_database(self):
        """Initializes the job execution metrics with the execution history from the database
        """

        with self._lock:
            self._metrics.init_with_database()

    def lost_job_exes(self, job_exe_ids, when):
        """Informs the manager that the job executions with the given IDs were lost

        :param job_exe_ids: The IDs of the lost job executions
        :type job_exe_ids: list
        :param when: The time that the executions were lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the finished job executions
        :rtype: list
        """

        lost_job_exe_ids = set(job_exe_ids)

        finished_job_exes = []
        with self._lock:
            for job_exe in list(self._running_job_exes.values()):  # copy; entries may be removed below
                if job_exe.id in lost_job_exe_ids:
                    job_exe.execution_lost(when)
                    task = job_exe.current_task
                    if task:
                        # Node could be deprecated, so force kill the current task
                        task.force_kill()
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        return finished_job_exes

    def lost_node(self, node_id, when):
        """Informs the manager that the node with the given ID was lost and has gone offline

        :param node_id: The ID of the lost node
        :type node_id: int
        :param when: The time that the node was lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the finished job executions
        :rtype: list
        """

        finished_job_exes = []
        with self._lock:
            for job_exe in list(self._running_job_exes.values()):  # copy; entries may be removed below
                if job_exe.node_id == node_id:
                    job_exe.execution_lost(when)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        return finished_job_exes

    def schedule_job_exes(self, job_exes, messages):
        """Adds newly scheduled running job executions to the manager

        :param job_exes: A list of the running job executions to add
        :type job_exes: list
        :param messages: The messages for the running jobs
        :type messages: list
        """

        with self._lock:
            for job_exe in job_exes:
                self._running_job_exes[job_exe.cluster_id] = job_exe
            self._running_job_messages.extend(messages)
            self._metrics.add_running_job_exes(job_exes)

    def sync_with_database(self):
        """Syncs with the database to handle any canceled executions. Any job executions that are now finished are
        returned.

        :returns: A list of the finished job executions
        :rtype: list
        """

        job_ids = []
        running_job_exes = []
        with self._lock:
            for running_job_exe in self._running_job_exes.values():
                job_ids.append(running_job_exe.job_id)
                running_job_exes.append(running_job_exe)

        # Query job models from database to check if any running executions have been canceled
        job_models = {}
        for job in Job.objects.filter(id__in=job_ids):
            job_models[job.id] = job

        finished_job_exes = []
        when_canceled = now()
        with self._lock:
            for running_job_exe in running_job_exes:
                job_model = job_models[running_job_exe.job_id]
                # If the job has been canceled or the job has a newer execution, this execution must be canceled
                if job_model.status == 'CANCELED' or job_model.num_exes > running_job_exe.exe_num:
                    running_job_exe.execution_canceled(when_canceled)
                    if running_job_exe.is_finished():
                        self._handle_finished_job_exe(running_job_exe)
                        finished_job_exes.append(running_job_exe)

        return finished_job_exes

    def _create_finished_job_exe_messages(self, finished_job_exes):
        """Creates messages for finished job executions

        :param finished_job_exes: The finished job executions
        :type finished_job_exes: list
        :returns: The messages
        :rtype: list
        """

        when = now()

        completed_jobs = []
        failed_jobs = []
        for job_exe in finished_job_exes:
            if job_exe.status == 'COMPLETED':
                completed_jobs.append(
                    CompletedJob(job_exe.job_id, job_exe.exe_num))
            elif job_exe.status == 'FAILED':
                failed_jobs.append(
                    FailedJob(job_exe.job_id, job_exe.exe_num,
                              job_exe.error.id))

        messages = create_completed_jobs_messages(completed_jobs, when)
        messages.extend(create_failed_jobs_messages(failed_jobs, when))

        return messages

    def _handle_finished_job_exe(self, running_job_exe):
        """Handles the finished job execution. Caller must have obtained the manager lock.

        :param running_job_exe: The finished job execution
        :type running_job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        # Create job_exe_end model for the finished job execution and send it in a future message
        self._job_exe_end_models.append(
            running_job_exe.create_job_exe_end_model())

        # Collect finished job execution to send a future job update message
        self._finished_job_exes.append(running_job_exe)

        # Remove the finished job execution and update the metrics
        del self._running_job_exes[running_job_exe.cluster_id]
        self._metrics.job_exe_finished(running_job_exe)
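
get_messages() above drains the pending lists by swapping them out while the lock is held and only then builds the outgoing messages. A stripped-down sketch of that drain-and-swap pattern, with generic names rather than the project's message API:

import threading


class PendingWork(object):
    """Accumulates items from many threads and lets one caller drain them atomically."""

    def __init__(self):
        self._lock = threading.Lock()
        self._pending = []

    def add(self, item):
        with self._lock:
            self._pending.append(item)

    def drain(self):
        # Swap the list out under the lock so producers never wait on whatever
        # slow processing happens after the drain
        with self._lock:
            items = self._pending
            self._pending = []
        return items

The same swap keeps the critical section in get_messages() short: message construction happens on the local copies after the lock is released.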
Example #8
class JobExecutionManager(object):
    """This class manages all running and finished job executions. This class is thread-safe."""
    def __init__(self):
        """Constructor
        """

        self._job_exe_end_models = []  # Holds job_exe_end models to send in next messages
        self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}
        self._running_job_messages = []  # Holds running job messages to send
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())

    def add_canceled_job_exes(self, job_exe_ends):
        """Adds the given job_exe_end models for job executions canceled off of the queue

        :param job_exe_ends: The job_exe_end models to add
        :type job_exe_ends: list
        """

        with self._lock:
            self._job_exe_end_models.extend(job_exe_ends)

    def clear(self):
        """Clears all data from the manager. This method is intended for testing only.
        """

        self._running_job_exes = {}
        self._metrics = TotalJobExeMetrics(now())

    def generate_status_json(self, nodes_list, when):
        """Generates the portion of the status JSON that describes the job execution metrics

        :param nodes_list: The list of nodes within the status JSON
        :type nodes_list: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        """

        with self._lock:
            self._metrics.generate_status_json(nodes_list, when)

    def get_messages(self):
        """Returns all messages related to jobs and executions that need to be sent

        :returns: The list of job-related messages to send
        :rtype: list
        """

        with self._lock:
            messages = self._running_job_messages
            self._running_job_messages = []

            message = None
            for job_exe_end in self._job_exe_end_models:
                if not message:
                    message = CreateJobExecutionEnd()
                elif not message.can_fit_more():
                    messages.append(message)
                    message = CreateJobExecutionEnd()
                message.add_job_exe_end(job_exe_end)
            if message:
                messages.append(message)
            self._job_exe_end_models = []

        return messages

    def get_running_job_exe(self, cluster_id):
        """Returns the running job execution with the given cluster ID, or None if the job execution does not exist

        :param cluster_id: The cluster ID of the job execution to return
        :type cluster_id: int
        :returns: The running job execution with the given cluster ID, possibly None
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        with self._lock:
            if cluster_id in self._running_job_exes:
                return self._running_job_exes[cluster_id]
            return None

    def get_running_job_exes(self):
        """Returns all currently running job executions

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        with self._lock:
            return list(self._running_job_exes.values())

    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the time out occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task.id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    # We do not remove the failed job execution at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.
                    job_exe.execution_timed_out(task, when)

    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        finished_job_exe = None
        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task_update.task_id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exe = job_exe
                        # return job_exe

        # TODO: this can be removed once database operations move to messaging backend
        if finished_job_exe:
            self._handle_finished_job_exe_in_database(finished_job_exe)
            return finished_job_exe

        return None

    def init_with_database(self):
        """Initializes the job execution metrics with the execution history from the database
        """

        with self._lock:
            self._metrics.init_with_database()

    def lost_node(self, node_id, when):
        """Informs the manager that the node with the given ID was lost and has gone offline

        :param node_id: The ID of the lost node
        :type node_id: int
        :param when: The time that the node was lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the lost job executions that had been running on the node
        :rtype: list
        """

        lost_exes = []
        finished_job_exes = []
        with self._lock:
            for job_exe in list(self._running_job_exes.values()):  # copy; entries may be removed below
                if job_exe.node_id == node_id:
                    lost_exes.append(job_exe)
                    job_exe.execution_lost(when)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        # TODO: this can be removed once database operations move to messaging backend
        for finished_job_exe in finished_job_exes:
            self._handle_finished_job_exe_in_database(finished_job_exe)

        return lost_exes

    def schedule_job_exes(self, job_exes, messages):
        """Adds newly scheduled running job executions to the manager

        :param job_exes: A list of the running job executions to add
        :type job_exes: list
        :param messages: The messages for the running jobs
        :type messages: list
        """

        with self._lock:
            for job_exe in job_exes:
                self._running_job_exes[job_exe.cluster_id] = job_exe
            self._running_job_messages.extend(messages)
            self._metrics.add_running_job_exes(job_exes)

    def sync_with_database(self):
        """Syncs with the database to handle any canceled executions. The current task of each canceled job execution is
        returned so the tasks may be killed.

        :returns: A list of the canceled tasks to kill
        :rtype: [:class:`job.tasks.base_task.Task`]
        """

        job_ids = []
        running_job_exes = []
        with self._lock:
            for running_job_exe in self._running_job_exes.values():
                job_ids.append(running_job_exe.job_id)
                running_job_exes.append(running_job_exe)

        # Query job models from database to check if any running executions have been canceled
        job_models = {}
        for job in Job.objects.filter(id__in=job_ids):
            job_models[job.id] = job

        canceled_tasks = []
        finished_job_exes = []
        when_canceled = now()
        with self._lock:
            for running_job_exe in running_job_exes:
                job_model = job_models[running_job_exe.job_id]
                # If the job has been canceled or the job has a newer execution, this execution must be canceled
                if job_model.status == 'CANCELED' or job_model.num_exes > running_job_exe.exe_num:
                    task = running_job_exe.execution_canceled(when_canceled)
                    if task:
                        # Since it has an outstanding task, we do not remove the canceled job execution at this point.
                        # We wait for the status update of the killed task to come back so that job execution cleanup
                        # occurs after the task is dead.
                        canceled_tasks.append(task)
                    else:
                        if running_job_exe.is_finished():
                            self._handle_finished_job_exe(running_job_exe)
                            finished_job_exes.append(running_job_exe)

        # TODO: this can be removed once database operations move to messaging backend
        for finished_job_exe in finished_job_exes:
            self._handle_finished_job_exe_in_database(finished_job_exe)

        return canceled_tasks

    def _handle_finished_job_exe(self, running_job_exe):
        """Handles the finished job execution. Caller must have obtained the manager lock.

        :param running_job_exe: The finished job execution
        :type running_job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        # Create job_exe_end model for the finished job execution and send it in next messages
        self._job_exe_end_models.append(
            running_job_exe.create_job_exe_end_model())

        # Remove the finished job execution and update the metrics
        del self._running_job_exes[running_job_exe.cluster_id]
        self._metrics.job_exe_finished(running_job_exe)

    def _handle_finished_job_exe_in_database(self, running_job_exe):
        """Handles the finished job execution by performing any needed database operations. This is a stop gap until
        these database operations move to the messaging backend.

        :param running_job_exe: The finished job execution
        :type running_job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        # TODO: handling job completion and failure here for now, later these will be sent via messaging backend in a
        # background thread
        from queue.models import Queue
        job_id = running_job_exe.job_id
        exe_num = running_job_exe.exe_num
        when = running_job_exe.finished
        if running_job_exe.status == 'COMPLETED':
            Queue.objects.handle_job_completion(job_id, exe_num, when)
        elif running_job_exe.status == 'FAILED':
            Queue.objects.handle_job_failure(job_id, exe_num, when,
                                             running_job_exe.error)
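
The job_exe_end loop in get_messages() above packs models into CreateJobExecutionEnd messages until each one reports it cannot fit more. A generic sketch of that batching loop, with a hypothetical Batch class standing in for the real command message:

class Batch(object):
    """Hypothetical fixed-capacity container standing in for a command message."""

    MAX_ITEMS = 100  # illustrative capacity

    def __init__(self):
        self.items = []

    def can_fit_more(self):
        return len(self.items) < self.MAX_ITEMS

    def add(self, item):
        self.items.append(item)


def pack_into_batches(items):
    """Packs items into as few batches as possible, mirroring the loop in get_messages()."""
    batches = []
    batch = None
    for item in items:
        if not batch:
            batch = Batch()
        elif not batch.can_fit_more():
            batches.append(batch)
            batch = Batch()
        batch.add(item)
    if batch:
        batches.append(batch)
    return batches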
Example #9
class JobExecutionManager(object):
    """This class manages all running and finished job executions. This class is thread-safe."""
    def __init__(self):
        """Constructor
        """

        self._running_job_exes = {}  # {ID: RunningJobExecution}
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())

    def generate_status_json(self, nodes_list, when):
        """Generates the portion of the status JSON that describes the job execution metrics

        :param nodes_list: The list of nodes within the status JSON
        :type nodes_list: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        """

        with self._lock:
            self._metrics.generate_status_json(nodes_list, when)

    def get_ready_job_exes(self):
        """Returns all running job executions that are ready to execute their next task

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        ready_exes = []
        with self._lock:
            for job_exe_id in self._running_job_exes:
                job_exe = self._running_job_exes[job_exe_id]
                if job_exe.is_next_task_ready():
                    ready_exes.append(job_exe)
        return ready_exes

    def get_running_job_exe(self, job_exe_id):
        """Returns the running job execution with the given ID, or None if the job execution does not exist

        :param job_exe_id: The ID of the job execution to return
        :type job_exe_id: int
        :returns: The running job execution with the given ID, possibly None
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        with self._lock:
            if job_exe_id in self._running_job_exes:
                return self._running_job_exes[job_exe_id]
            return None

    def get_running_job_exes(self):
        """Returns all currently running job executions

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        running_job_exes = []
        with self._lock:
            for job_exe_id in self._running_job_exes:
                running_job_exes.append(self._running_job_exes[job_exe_id])
        return running_job_exes

    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the time out occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            job_exe_id = JobExecution.get_job_exe_id(task.id)
            with self._lock:
                if job_exe_id in self._running_job_exes:
                    job_exe = self._running_job_exes[job_exe_id]
                    try:
                        job_exe.execution_timed_out(task, when)
                    except DatabaseError:
                        logger.exception(
                            'Error failing timed out job execution %i',
                            job_exe_id)
                    # We do not remove timed out job executions at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.

    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            job_exe_id = JobExecution.get_job_exe_id(task_update.task_id)
            with self._lock:
                if job_exe_id in self._running_job_exes:
                    job_exe = self._running_job_exes[job_exe_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        return job_exe

        return None

    def init_with_database(self):
        """Initializes the job execution metrics with the execution history from the database
        """

        with self._lock:
            self._metrics.init_with_database()

    def lost_node(self, node_id, when):
        """Informs the manager that the node with the given ID was lost and has gone offline

        :param node_id: The ID of the lost node
        :type node_id: int
        :param when: The time that the node was lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the lost job executions that had been running on the node
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        lost_exes = []
        with self._lock:
            for job_exe_id in list(self._running_job_exes.keys()):  # copy; entries may be removed below
                job_exe = self._running_job_exes[job_exe_id]
                if job_exe.node_id == node_id:
                    lost_exes.append(job_exe)
                    try:
                        job_exe.execution_lost(when)
                    except DatabaseError:
                        logger.exception(
                            'Error failing lost job execution: %s', job_exe.id)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
        return lost_exes

    def schedule_job_exes(self, job_exes):
        """Adds newly scheduled running job executions to the manager

        :param job_exes: A list of the running job executions to add
        :type job_exes: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        with self._lock:
            for job_exe in job_exes:
                self._running_job_exes[job_exe.id] = job_exe
            self._metrics.add_running_job_exes(job_exes)

    def sync_with_database(self):
        """Syncs with the database to handle any canceled executions. The current task of each canceled job execution is
        returned so the tasks may be killed.

        :returns: A list of the canceled tasks to kill
        :rtype: [:class:`job.tasks.base_task.Task`]
        """

        with self._lock:
            job_exe_ids = list(self._running_job_exes.keys())

        canceled_tasks = []
        canceled_models = list(
            JobExecution.objects.filter(id__in=job_exe_ids,
                                        status='CANCELED').iterator())

        with self._lock:
            for job_exe_model in canceled_models:
                if job_exe_model.id in self._running_job_exes:
                    canceled_job_exe = self._running_job_exes[job_exe_model.id]
                    try:
                        task = canceled_job_exe.execution_canceled()
                        if task:
                            canceled_tasks.append(task)
                    except DatabaseError:
                        logger.exception('Error canceling job execution %i',
                                         job_exe_model.id)
                    # We do not remove canceled job executions at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.

        return canceled_tasks

    def _handle_finished_job_exe(self, job_exe):
        """Handles the finished job execution. Caller must have obtained the manager lock.

        :param job_exe: The finished job execution
        :type job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        del self._running_job_exes[job_exe.id]
        self._metrics.job_exe_finished(job_exe)
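
Every version of handle_task_timeout() and handle_task_update() above follows the same shape: check the JOB_TASK_ID_PREFIX, pull the execution ID out of the task ID, then look the execution up under the lock. A minimal sketch of that dispatch shape; the prefix value and the ID parsing here are made up for illustration and do not reflect Scale's real task-ID format:

import threading

JOB_TASK_ID_PREFIX = 'job_'  # illustrative prefix, not the project's actual constant


class TaskDispatcher(object):
    """Routes task updates to the running execution they belong to."""

    def __init__(self):
        self._lock = threading.Lock()
        self._running = {}  # {execution ID: execution object}

    def register(self, exe_id, job_exe):
        with self._lock:
            self._running[exe_id] = job_exe

    def handle_task_update(self, task_id, update):
        """Returns the execution that received the update, or None if it is not a job task."""
        if not task_id.startswith(JOB_TASK_ID_PREFIX):
            return None
        exe_id = task_id[len(JOB_TASK_ID_PREFIX):]  # made-up parsing for this sketch
        with self._lock:
            job_exe = self._running.get(exe_id)
            if job_exe is not None:
                job_exe.task_update(update)
            return job_exe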