def __init__(self):
    """Constructor
    """

    self._running_job_exes = {}  # {ID: RunningJobExecution}
    self._lock = threading.Lock()
    self._metrics = TotalJobExeMetrics(now())


def setUp(self):
    django.setup()

    self.alg_error = error_test_utils.create_error(category='ALGORITHM')
    self.data_error = error_test_utils.create_error(category='DATA')
    self.system_error = error_test_utils.create_error(category='SYSTEM')

    self.metrics = TotalJobExeMetrics(now())


def __init__(self):
    """Constructor
    """

    self._job_exe_end_models = []  # Holds job_exe_end models to send in next messages
    self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}
    self._running_job_messages = []  # Holds running job messages to send
    self._lock = threading.Lock()
    self._metrics = TotalJobExeMetrics(now())


def __init__(self):
    """Constructor
    """

    # Execution information to be sent in command messages
    self._finished_job_exes = []  # Holds finished executions
    self._job_exe_end_models = []  # Holds job_exe_end models to create
    self._running_job_messages = []  # Holds running job messages

    # Current running state
    self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}

    self._lock = threading.Lock()
    self._metrics = TotalJobExeMetrics(now())
class TestTotalJobExeMetrics(TestCase):
    """Tests the TotalJobExeMetrics class"""

    def setUp(self):
        django.setup()

        self.alg_error = error_test_utils.create_error(category='ALGORITHM')
        self.data_error = error_test_utils.create_error(category='DATA')
        self.system_error = error_test_utils.create_error(category='SYSTEM')

        self.metrics = TotalJobExeMetrics(now())

    def test_init_with_database(self):
        """Tests calling init_with_database() successfully to load in job executions from the database"""

        # First block of job executions
        end_time_1 = now() - FinishedJobExeMetricsOverTime.BLOCK_LENGTH - FinishedJobExeMetricsOverTime.BLOCK_LENGTH
        node_model_1 = node_test_utils.create_node()
        job_type_1 = job_test_utils.create_seed_job_type()
        job_type_2 = job_test_utils.create_seed_job_type()
        job_exe_model_1 = job_test_utils.create_job_exe(job_type=job_type_1, status='COMPLETED', ended=end_time_1,
                                                        node=node_model_1)
        job_exe_model_2 = job_test_utils.create_job_exe(job_type=job_type_1, status='COMPLETED', ended=end_time_1,
                                                        node=node_model_1)
        job_exe_model_3 = job_test_utils.create_job_exe(job_type=job_type_1, status='FAILED', ended=end_time_1,
                                                        error=self.alg_error, node=node_model_1)
        job_exe_model_4 = job_test_utils.create_job_exe(job_type=job_type_1, status='FAILED', ended=end_time_1,
                                                        error=self.alg_error, node=node_model_1)
        job_exe_model_5 = job_test_utils.create_job_exe(job_type=job_type_1, status='FAILED', ended=end_time_1,
                                                        error=self.alg_error, node=node_model_1)
        job_exe_model_6 = job_test_utils.create_job_exe(job_type=job_type_1, status='FAILED', ended=end_time_1,
                                                        error=self.data_error, node=node_model_1)
        job_exe_model_7 = job_test_utils.create_job_exe(job_type=job_type_1, status='FAILED', ended=end_time_1,
                                                        error=self.system_error, node=node_model_1)
        job_exe_model_8 = job_test_utils.create_job_exe(job_type=job_type_2, status='FAILED', ended=end_time_1,
                                                        error=self.system_error, node=node_model_1)
        node_model_2 = node_test_utils.create_node()
        job_exe_model_9 = job_test_utils.create_job_exe(job_type=job_type_1, status='COMPLETED', ended=end_time_1,
                                                        node=node_model_2)
        job_exe_model_10 = job_test_utils.create_job_exe(job_type=job_type_2, status='COMPLETED', ended=end_time_1,
                                                         node=node_model_2)
        job_exe_model_11 = job_test_utils.create_job_exe(job_type=job_type_2, status='FAILED', ended=end_time_1,
                                                         error=self.data_error, node=node_model_2)

        # Second block of job executions (one time block over from first set of executions)
        end_time_2 = end_time_1 + FinishedJobExeMetricsOverTime.BLOCK_LENGTH
        job_exe_model_12 = job_test_utils.create_job_exe(job_type=job_type_2, status='FAILED', ended=end_time_2,
                                                         error=self.system_error, node=node_model_1)
        job_exe_model_13 = job_test_utils.create_job_exe(job_type=job_type_2, status='FAILED', ended=end_time_2,
                                                         error=self.system_error, node=node_model_1)
        job_exe_model_14 = job_test_utils.create_job_exe(job_type=job_type_2, status='COMPLETED', ended=end_time_2,
                                                         node=node_model_2)

        # Load all initial executions from database
        self.metrics.init_with_database()

        # Generate JSON which should include both sets of job executions
        right_now = end_time_2 + datetime.timedelta(seconds=30)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, right_now)

        # Check expected totals
        self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 2)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 8)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 3)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 4)
        self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 3)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 1)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

        # Generate JSON which should include only second set of job executions (first set rolled off by time)
        later = end_time_1 + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD + datetime.timedelta(seconds=1)
        later += FinishedJobExeMetricsOverTime.BLOCK_LENGTH
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, later)

        # Check expected totals
        self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 2)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 2)
        self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 1)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

        # Generate JSON where all job executions should have rolled off by time
        later = later + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, later)

        # Check expected totals
        self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

    def test_running_executions(self):
        """Tests the metrics with running executions that complete"""

        node_model_1 = node_test_utils.create_node()
        node_model_2 = node_test_utils.create_node()
        job_type_1 = job_test_utils.create_seed_job_type()
        job_type_2 = job_test_utils.create_seed_job_type()
        job_exe_1 = job_test_utils.create_running_job_exe(agent_id='agent', job_type=job_type_1, node=node_model_1)
        job_exe_2 = job_test_utils.create_running_job_exe(agent_id='agent', job_type=job_type_1, node=node_model_1)
        job_exe_3 = job_test_utils.create_running_job_exe(agent_id='agent', job_type=job_type_1, node=node_model_1)
        job_exe_4 = job_test_utils.create_running_job_exe(agent_id='agent', job_type=job_type_2, node=node_model_1)
        job_exe_5 = job_test_utils.create_running_job_exe(agent_id='agent', job_type=job_type_1, node=node_model_2)
        job_exe_6 = job_test_utils.create_running_job_exe(agent_id='agent', job_type=job_type_1, node=node_model_2)
        job_exe_7 = job_test_utils.create_running_job_exe(agent_id='agent', job_type=job_type_2, node=node_model_2)
        job_exe_8 = job_test_utils.create_running_job_exe(agent_id='agent', job_type=job_type_2, node=node_model_2)
        job_exe_9 = job_test_utils.create_running_job_exe(agent_id='agent', job_type=job_type_2, node=node_model_2)
        job_exe_10 = job_test_utils.create_running_job_exe(agent_id='agent', job_type=job_type_2, node=node_model_2)
        job_exe_11 = job_test_utils.create_running_job_exe(agent_id='agent', job_type=job_type_2, node=node_model_2)

        # NOTE: This unit test is about to get CRAZY. I apologize for the complexity, but this is needed for a
        # thorough testing
        self.metrics.add_running_job_exes([job_exe_1, job_exe_2, job_exe_3, job_exe_4, job_exe_5, job_exe_6,
                                           job_exe_7, job_exe_8, job_exe_9, job_exe_10, job_exe_11])
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, now())

        # Check expected totals
        self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 4)
        for job_type_dict in node_list_dict[0]['job_executions']['running']['by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 3)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 1)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['running']['by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

        # Finish some job executions
        end_time_1 = now()
        job_exe_1._set_final_status('COMPLETED', end_time_1)
        job_exe_2._set_final_status('FAILED', end_time_1, error=self.data_error)
        job_exe_4._set_final_status('FAILED', end_time_1, error=self.alg_error)
        self.metrics.job_exe_finished(job_exe_1)
        self.metrics.job_exe_finished(job_exe_2)
        self.metrics.job_exe_finished(job_exe_4)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, end_time_1 + datetime.timedelta(seconds=1))

        # Check expected totals
        self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['running']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['completed']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 2)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['job_type_id'], job_type_2.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['data']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['running']['by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

        # Finish some job executions (all executions still on node 2)
        end_time_2 = end_time_1 + FinishedJobExeMetricsOverTime.BLOCK_LENGTH
        job_exe_5._set_final_status('COMPLETED', end_time_2)
        job_exe_6._set_final_status('COMPLETED', end_time_2)
        job_exe_7._set_final_status('COMPLETED', end_time_2)
        job_exe_8._set_final_status('COMPLETED', end_time_2)
        job_exe_9._set_final_status('COMPLETED', end_time_2)
        job_exe_10._set_final_status('COMPLETED', end_time_2)
        job_exe_11._set_final_status('COMPLETED', end_time_2)
        self.metrics.job_exe_finished(job_exe_5)
        self.metrics.job_exe_finished(job_exe_6)
        self.metrics.job_exe_finished(job_exe_7)
        self.metrics.job_exe_finished(job_exe_8)
        self.metrics.job_exe_finished(job_exe_9)
        self.metrics.job_exe_finished(job_exe_10)
        self.metrics.job_exe_finished(job_exe_11)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, end_time_2)

        # Check expected totals
        self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['running']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['completed']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 2)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['job_type_id'], job_type_2.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['data']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['completed']['by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

        # Let all finished job executions roll off by time, only running remaining
        end_time_3 = end_time_2 + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD
        end_time_3 += FinishedJobExeMetricsOverTime.BLOCK_LENGTH + datetime.timedelta(seconds=1)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, end_time_3)

        # Check expected totals
        self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['running']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)
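# The assertions above imply the per-node structure that generate_status_json() fills in: the test passes a
# list of {'id': ...} dicts and each entry comes back with a 'job_executions' section. The sketch below is
# illustrative only (all values and IDs are hypothetical); it simply spells out the assumed shape so the long
# dictionary lookups in the test are easier to follow.
example_node_status = {
    'id': 1,  # hypothetical node ID
    'job_executions': {
        'running': {'total': 4, 'by_job_type': [{'job_type_id': 1, 'count': 3}, {'job_type_id': 2, 'count': 1}]},
        'completed': {'total': 2, 'by_job_type': [{'job_type_id': 1, 'count': 2}]},
        'failed': {
            'total': 8,
            'algorithm': {'total': 3, 'by_job_type': [{'job_type_id': 1, 'count': 3}]},
            'data': {'total': 1, 'by_job_type': [{'job_type_id': 1, 'count': 1}]},
            'system': {'total': 4, 'by_job_type': [{'job_type_id': 1, 'count': 3}, {'job_type_id': 2, 'count': 1}]},
        },
    },
}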
def clear(self):
    """Clears all data from the manager. This method is intended for testing only.
    """

    self._running_job_exes = {}
    self._metrics = TotalJobExeMetrics(now())
class JobExecutionManager(object):
    """This class manages all running and finished job executions. This class is thread-safe."""

    def __init__(self):
        """Constructor
        """

        # Execution information to be sent in command messages
        self._finished_job_exes = []  # Holds finished executions
        self._job_exe_end_models = []  # Holds job_exe_end models to create
        self._running_job_messages = []  # Holds running job messages

        # Current running state
        self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}

        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())

    def add_canceled_job_exes(self, job_exe_ends):
        """Adds the given job_exe_end models for job executions canceled off of the queue

        :param job_exe_ends: The job_exe_end models to add
        :type job_exe_ends: list
        """

        with self._lock:
            self._job_exe_end_models.extend(job_exe_ends)

    def check_for_starvation(self, when):
        """Checks all of the currently running job executions for resource starvation. If any starved executions are
        found, they are failed and returned.

        :param when: The current time
        :type when: :class:`datetime.datetime`
        :returns: A list of the starved job executions
        :rtype: list
        """

        finished_job_exes = []
        with self._lock:
            for job_exe in self._running_job_exes.values():
                if job_exe.check_for_starvation(when):
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        return finished_job_exes

    def clear(self):
        """Clears all data from the manager. This method is intended for testing only.
        """

        self._running_job_exes = {}
        self._metrics = TotalJobExeMetrics(now())

    def generate_status_json(self, nodes_list, when):
        """Generates the portion of the status JSON that describes the job execution metrics

        :param nodes_list: The list of nodes within the status JSON
        :type nodes_list: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        """

        with self._lock:
            self._metrics.generate_status_json(nodes_list, when)

    def get_messages(self):
        """Returns all messages related to jobs and executions that need to be sent

        :returns: The list of job-related messages to send
        :rtype: list
        """

        running_job_messages = None
        job_exe_end_models = None
        finished_job_exes = None

        with self._lock:
            finished_job_exes = self._finished_job_exes
            job_exe_end_models = self._job_exe_end_models
            running_job_messages = self._running_job_messages
            self._finished_job_exes = []
            self._job_exe_end_models = []
            self._running_job_messages = []

        # Start with running job messages
        messages = running_job_messages

        # Add messages for creating job_exe_end models
        messages.extend(create_job_exe_end_messages(job_exe_end_models))

        # Add messages for finished job executions
        messages.extend(self._create_finished_job_exe_messages(finished_job_exes))

        return messages

    def get_running_job_exe(self, cluster_id):
        """Returns the running job execution with the given cluster ID, or None if the job execution does not exist

        :param cluster_id: The cluster ID of the job execution to return
        :type cluster_id: int
        :returns: The running job execution with the given cluster ID, possibly None
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        with self._lock:
            if cluster_id in self._running_job_exes:
                return self._running_job_exes[cluster_id]
            return None

    def get_running_job_exes(self):
        """Returns all currently running job executions

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        with self._lock:
            return list(self._running_job_exes.values())

    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the time out occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task.id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    # We do not remove the failed job execution at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.
                    job_exe.execution_timed_out(task, when)

    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task_update.task_id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        return job_exe

        return None

    def init_with_database(self):
        """Initializes the job execution metrics with the execution history from the database
        """

        with self._lock:
            self._metrics.init_with_database()

    def lost_job_exes(self, job_exe_ids, when):
        """Informs the manager that the job executions with the given IDs were lost

        :param job_exe_ids: The IDs of the lost job executions
        :type job_exe_ids: list
        :param when: The time that the executions were lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the finished job executions
        :rtype: list
        """

        lost_job_exe_ids = set(job_exe_ids)
        finished_job_exes = []
        with self._lock:
            for job_exe in self._running_job_exes.values():
                if job_exe.id in lost_job_exe_ids:
                    job_exe.execution_lost(when)
                    task = job_exe.current_task
                    if task:
                        # Node could be deprecated, so force kill the current task
                        task.force_kill()
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        return finished_job_exes

    def lost_node(self, node_id, when):
        """Informs the manager that the node with the given ID was lost and has gone offline

        :param node_id: The ID of the lost node
        :type node_id: int
        :param when: The time that the node was lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the finished job executions
        :rtype: list
        """

        finished_job_exes = []
        with self._lock:
            for job_exe in self._running_job_exes.values():
                if job_exe.node_id == node_id:
                    job_exe.execution_lost(when)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        return finished_job_exes

    def schedule_job_exes(self, job_exes, messages):
        """Adds newly scheduled running job executions to the manager

        :param job_exes: A list of the running job executions to add
        :type job_exes: list
        :param messages: The messages for the running jobs
        :type messages: list
        """

        with self._lock:
            for job_exe in job_exes:
                self._running_job_exes[job_exe.cluster_id] = job_exe
            self._running_job_messages.extend(messages)
            self._metrics.add_running_job_exes(job_exes)

    def sync_with_database(self):
        """Syncs with the database to handle any canceled executions. Any job executions that are now finished are
        returned.

        :returns: A list of the finished job executions
        :rtype: list
        """

        job_ids = []
        running_job_exes = []
        with self._lock:
            for running_job_exe in self._running_job_exes.values():
                job_ids.append(running_job_exe.job_id)
                running_job_exes.append(running_job_exe)

        # Query job models from database to check if any running executions have been canceled
        job_models = {}
        for job in Job.objects.filter(id__in=job_ids):
            job_models[job.id] = job

        finished_job_exes = []
        when_canceled = now()
        with self._lock:
            for running_job_exe in running_job_exes:
                job_model = job_models[running_job_exe.job_id]
                # If the job has been canceled or the job has a newer execution, this execution must be canceled
                if job_model.status == 'CANCELED' or job_model.num_exes > running_job_exe.exe_num:
                    running_job_exe.execution_canceled(when_canceled)
                    if running_job_exe.is_finished():
                        self._handle_finished_job_exe(running_job_exe)
                        finished_job_exes.append(running_job_exe)

        return finished_job_exes

    def _create_finished_job_exe_messages(self, finished_job_exes):
        """Creates messages for finished job executions

        :param finished_job_exes: The finished job executions
        :type finished_job_exes: list
        :returns: The messages
        :rtype: list
        """

        when = now()
        completed_jobs = []
        failed_jobs = []
        for job_exe in finished_job_exes:
            if job_exe.status == 'COMPLETED':
                completed_jobs.append(CompletedJob(job_exe.job_id, job_exe.exe_num))
            elif job_exe.status == 'FAILED':
                failed_jobs.append(FailedJob(job_exe.job_id, job_exe.exe_num, job_exe.error.id))

        messages = create_completed_jobs_messages(completed_jobs, when)
        messages.extend(create_failed_jobs_messages(failed_jobs, when))

        return messages

    def _handle_finished_job_exe(self, running_job_exe):
        """Handles the finished job execution. Caller must have obtained the manager lock.

        :param running_job_exe: The finished job execution
        :type running_job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        # Create job_exe_end model for the finished job execution and send it in a future message
        self._job_exe_end_models.append(running_job_exe.create_job_exe_end_model())

        # Collect finished job execution to send a future job update message
        self._finished_job_exes.append(running_job_exe)

        # Remove the finished job execution and update the metrics
        del self._running_job_exes[running_job_exe.cluster_id]
        self._metrics.job_exe_finished(running_job_exe)
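# A minimal sketch of how a scheduler loop might drive the message-based manager above. The job_exe_mgr name,
# the scheduled_exes/run_messages arguments, the message_backend object, and its send() call are hypothetical
# stand-ins for illustration; only the JobExecutionManager calls themselves come from the class.
job_exe_mgr = JobExecutionManager()
job_exe_mgr.init_with_database()  # load metrics history once at startup


def on_schedule(scheduled_exes, run_messages):
    # Register newly scheduled executions along with their running-job messages
    job_exe_mgr.schedule_job_exes(scheduled_exes, run_messages)


def on_task_update(task_update, message_backend):
    # Route the task status update; a non-None return means that execution just finished
    finished = job_exe_mgr.handle_task_update(task_update)
    if finished:
        # Finished executions become job_exe_end and completed/failed job messages on the next flush
        for message in job_exe_mgr.get_messages():
            message_backend.send(message)  # hypothetical send API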
class JobExecutionManager(object):
    """This class manages all running and finished job executions. This class is thread-safe."""

    def __init__(self):
        """Constructor
        """

        self._job_exe_end_models = []  # Holds job_exe_end models to send in next messages
        self._running_job_exes = {}  # {Cluster ID: RunningJobExecution}
        self._running_job_messages = []  # Holds running job messages to send
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())

    def add_canceled_job_exes(self, job_exe_ends):
        """Adds the given job_exe_end models for job executions canceled off of the queue

        :param job_exe_ends: The job_exe_end models to add
        :type job_exe_ends: list
        """

        with self._lock:
            self._job_exe_end_models.extend(job_exe_ends)

    def clear(self):
        """Clears all data from the manager. This method is intended for testing only.
        """

        self._running_job_exes = {}
        self._metrics = TotalJobExeMetrics(now())

    def generate_status_json(self, nodes_list, when):
        """Generates the portion of the status JSON that describes the job execution metrics

        :param nodes_list: The list of nodes within the status JSON
        :type nodes_list: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        """

        with self._lock:
            self._metrics.generate_status_json(nodes_list, when)

    def get_messages(self):
        """Returns all messages related to jobs and executions that need to be sent

        :returns: The list of job-related messages to send
        :rtype: list
        """

        with self._lock:
            messages = self._running_job_messages
            self._running_job_messages = []

            message = None
            for job_exe_end in self._job_exe_end_models:
                if not message:
                    message = CreateJobExecutionEnd()
                elif not message.can_fit_more():
                    messages.append(message)
                    message = CreateJobExecutionEnd()
                message.add_job_exe_end(job_exe_end)
            if message:
                messages.append(message)
            self._job_exe_end_models = []

        return messages

    def get_running_job_exe(self, cluster_id):
        """Returns the running job execution with the given cluster ID, or None if the job execution does not exist

        :param cluster_id: The cluster ID of the job execution to return
        :type cluster_id: int
        :returns: The running job execution with the given cluster ID, possibly None
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        with self._lock:
            if cluster_id in self._running_job_exes:
                return self._running_job_exes[cluster_id]
            return None

    def get_running_job_exes(self):
        """Returns all currently running job executions

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        with self._lock:
            return list(self._running_job_exes.values())

    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the time out occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task.id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    # We do not remove the failed job execution at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.
                    job_exe.execution_timed_out(task, when)

    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        finished_job_exe = None
        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            cluster_id = JobExecution.parse_cluster_id(task_update.task_id)
            with self._lock:
                if cluster_id in self._running_job_exes:
                    job_exe = self._running_job_exes[cluster_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exe = job_exe
                    # return job_exe

        # TODO: this can be removed once database operations move to messaging backend
        if finished_job_exe:
            self._handle_finished_job_exe_in_database(finished_job_exe)
            return finished_job_exe

        return None

    def init_with_database(self):
        """Initializes the job execution metrics with the execution history from the database
        """

        with self._lock:
            self._metrics.init_with_database()

    def lost_node(self, node_id, when):
        """Informs the manager that the node with the given ID was lost and has gone offline

        :param node_id: The ID of the lost node
        :type node_id: int
        :param when: The time that the node was lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the lost job executions that had been running on the node
        :rtype: list
        """

        lost_exes = []
        finished_job_exes = []
        with self._lock:
            for job_exe in self._running_job_exes.values():
                if job_exe.node_id == node_id:
                    lost_exes.append(job_exe)
                    job_exe.execution_lost(when)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        finished_job_exes.append(job_exe)

        # TODO: this can be removed once database operations move to messaging backend
        for finished_job_exe in finished_job_exes:
            self._handle_finished_job_exe_in_database(finished_job_exe)

        return lost_exes

    def schedule_job_exes(self, job_exes, messages):
        """Adds newly scheduled running job executions to the manager

        :param job_exes: A list of the running job executions to add
        :type job_exes: list
        :param messages: The messages for the running jobs
        :type messages: list
        """

        with self._lock:
            for job_exe in job_exes:
                self._running_job_exes[job_exe.cluster_id] = job_exe
            self._running_job_messages.extend(messages)
            self._metrics.add_running_job_exes(job_exes)

    def sync_with_database(self):
        """Syncs with the database to handle any canceled executions. The current task of each canceled job execution
        is returned so the tasks may be killed.

        :returns: A list of the canceled tasks to kill
        :rtype: [:class:`job.tasks.base_task.Task`]
        """

        job_ids = []
        running_job_exes = []
        with self._lock:
            for running_job_exe in self._running_job_exes.values():
                job_ids.append(running_job_exe.job_id)
                running_job_exes.append(running_job_exe)

        # Query job models from database to check if any running executions have been canceled
        job_models = {}
        for job in Job.objects.filter(id__in=job_ids):
            job_models[job.id] = job

        canceled_tasks = []
        finished_job_exes = []
        when_canceled = now()
        with self._lock:
            for running_job_exe in running_job_exes:
                job_model = job_models[running_job_exe.job_id]
                # If the job has been canceled or the job has a newer execution, this execution must be canceled
                if job_model.status == 'CANCELED' or job_model.num_exes > running_job_exe.exe_num:
                    task = running_job_exe.execution_canceled(when_canceled)
                    if task:
                        # Since it has an outstanding task, we do not remove the canceled job execution at this point.
                        # We wait for the status update of the killed task to come back so that job execution cleanup
                        # occurs after the task is dead.
                        canceled_tasks.append(task)
                    else:
                        if running_job_exe.is_finished():
                            self._handle_finished_job_exe(running_job_exe)
                            finished_job_exes.append(running_job_exe)

        # TODO: this can be removed once database operations move to messaging backend
        for finished_job_exe in finished_job_exes:
            self._handle_finished_job_exe_in_database(finished_job_exe)

        return canceled_tasks

    def _handle_finished_job_exe(self, running_job_exe):
        """Handles the finished job execution. Caller must have obtained the manager lock.

        :param running_job_exe: The finished job execution
        :type running_job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        # Create job_exe_end model for the finished job execution and send it in next messages
        self._job_exe_end_models.append(running_job_exe.create_job_exe_end_model())

        # Remove the finished job execution and update the metrics
        del self._running_job_exes[running_job_exe.cluster_id]
        self._metrics.job_exe_finished(running_job_exe)

    def _handle_finished_job_exe_in_database(self, running_job_exe):
        """Handles the finished job execution by performing any needed database operations. This is a stop gap until
        these database operations move to the messaging backend.

        :param running_job_exe: The finished job execution
        :type running_job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        # TODO: handling job completion and failure here for now, later these will be sent via messaging backend in a
        # background thread
        from queue.models import Queue
        job_id = running_job_exe.job_id
        exe_num = running_job_exe.exe_num
        when = running_job_exe.finished
        if running_job_exe.status == 'COMPLETED':
            Queue.objects.handle_job_completion(job_id, exe_num, when)
        elif running_job_exe.status == 'FAILED':
            Queue.objects.handle_job_failure(job_id, exe_num, when, running_job_exe.error)
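# A minimal sketch of the periodic cancellation sync implied by sync_with_database() above: canceled
# executions hand back their outstanding task so it can be killed, and cleanup happens later when the killed
# task's final status update arrives. The cancellation_sync and kill_task names are hypothetical stand-ins,
# not part of the class.
def cancellation_sync(job_exe_mgr, kill_task):
    for task in job_exe_mgr.sync_with_database():
        kill_task(task)  # hypothetical kill hook; the real kill mechanism lives outside this class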
class JobExecutionManager(object):
    """This class manages all running and finished job executions. This class is thread-safe."""

    def __init__(self):
        """Constructor
        """

        self._running_job_exes = {}  # {ID: RunningJobExecution}
        self._lock = threading.Lock()
        self._metrics = TotalJobExeMetrics(now())

    def generate_status_json(self, nodes_list, when):
        """Generates the portion of the status JSON that describes the job execution metrics

        :param nodes_list: The list of nodes within the status JSON
        :type nodes_list: list
        :param when: The current time
        :type when: :class:`datetime.datetime`
        """

        with self._lock:
            self._metrics.generate_status_json(nodes_list, when)

    def get_ready_job_exes(self):
        """Returns all running job executions that are ready to execute their next task

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        ready_exes = []
        with self._lock:
            for job_exe_id in self._running_job_exes:
                job_exe = self._running_job_exes[job_exe_id]
                if job_exe.is_next_task_ready():
                    ready_exes.append(job_exe)
        return ready_exes

    def get_running_job_exe(self, job_exe_id):
        """Returns the running job execution with the given ID, or None if the job execution does not exist

        :param job_exe_id: The ID of the job execution to return
        :type job_exe_id: int
        :returns: The running job execution with the given ID, possibly None
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        with self._lock:
            if job_exe_id in self._running_job_exes:
                return self._running_job_exes[job_exe_id]
            return None

    def get_running_job_exes(self):
        """Returns all currently running job executions

        :returns: A list of running job executions
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        running_job_exes = []
        with self._lock:
            for job_exe_id in self._running_job_exes:
                running_job_exes.append(self._running_job_exes[job_exe_id])
        return running_job_exes

    def handle_task_timeout(self, task, when):
        """Handles the timeout of the given task

        :param task: The task
        :type task: :class:`job.tasks.base_task.Task`
        :param when: The time that the time out occurred
        :type when: :class:`datetime.datetime`
        """

        if task.id.startswith(JOB_TASK_ID_PREFIX):
            job_exe_id = JobExecution.get_job_exe_id(task.id)
            with self._lock:
                if job_exe_id in self._running_job_exes:
                    job_exe = self._running_job_exes[job_exe_id]
                    try:
                        job_exe.execution_timed_out(task, when)
                    except DatabaseError:
                        logger.exception('Error failing timed out job execution %i', job_exe_id)
                    # We do not remove timed out job executions at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.

    def handle_task_update(self, task_update):
        """Handles the given task update and returns the associated job execution if it has finished

        :param task_update: The task update
        :type task_update: :class:`job.tasks.update.TaskStatusUpdate`
        :returns: The job execution if it has finished, None otherwise
        :rtype: :class:`job.execution.job_exe.RunningJobExecution`
        """

        if task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
            job_exe_id = JobExecution.get_job_exe_id(task_update.task_id)
            with self._lock:
                if job_exe_id in self._running_job_exes:
                    job_exe = self._running_job_exes[job_exe_id]
                    job_exe.task_update(task_update)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
                        return job_exe

        return None

    def init_with_database(self):
        """Initializes the job execution metrics with the execution history from the database
        """

        with self._lock:
            self._metrics.init_with_database()

    def lost_node(self, node_id, when):
        """Informs the manager that the node with the given ID was lost and has gone offline

        :param node_id: The ID of the lost node
        :type node_id: int
        :param when: The time that the node was lost
        :type when: :class:`datetime.datetime`
        :returns: A list of the lost job executions that had been running on the node
        :rtype: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        lost_exes = []
        with self._lock:
            for job_exe_id in self._running_job_exes.keys():
                job_exe = self._running_job_exes[job_exe_id]
                if job_exe.node_id == node_id:
                    lost_exes.append(job_exe)
                    try:
                        job_exe.execution_lost(when)
                    except DatabaseError:
                        logger.exception('Error failing lost job execution: %s', job_exe.id)
                    if job_exe.is_finished():
                        self._handle_finished_job_exe(job_exe)
        return lost_exes

    def schedule_job_exes(self, job_exes):
        """Adds newly scheduled running job executions to the manager

        :param job_exes: A list of the running job executions to add
        :type job_exes: [:class:`job.execution.job_exe.RunningJobExecution`]
        """

        with self._lock:
            for job_exe in job_exes:
                self._running_job_exes[job_exe.id] = job_exe
            self._metrics.add_running_job_exes(job_exes)

    def sync_with_database(self):
        """Syncs with the database to handle any canceled executions. The current task of each canceled job execution
        is returned so the tasks may be killed.

        :returns: A list of the canceled tasks to kill
        :rtype: [:class:`job.tasks.base_task.Task`]
        """

        with self._lock:
            job_exe_ids = list(self._running_job_exes.keys())

        canceled_tasks = []
        canceled_models = list(JobExecution.objects.filter(id__in=job_exe_ids, status='CANCELED').iterator())
        with self._lock:
            for job_exe_model in canceled_models:
                if job_exe_model.id in self._running_job_exes:
                    canceled_job_exe = self._running_job_exes[job_exe_model.id]
                    try:
                        task = canceled_job_exe.execution_canceled()
                        if task:
                            canceled_tasks.append(task)
                    except DatabaseError:
                        logger.exception('Error canceling job execution %i', job_exe_model.id)
                    # We do not remove canceled job executions at this point. We wait for the status update of the
                    # killed task to come back so that job execution cleanup occurs after the task is dead.

        return canceled_tasks

    def _handle_finished_job_exe(self, job_exe):
        """Handles the finished job execution. Caller must have obtained the manager lock.

        :param job_exe: The finished job execution
        :type job_exe: :class:`job.execution.job_exe.RunningJobExecution`
        """

        del self._running_job_exes[job_exe.id]
        self._metrics.job_exe_finished(job_exe)
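# A brief illustration of the task-ID routing used by handle_task_timeout()/handle_task_update() in this
# older, ID-keyed version of the manager: only task IDs carrying JOB_TASK_ID_PREFIX belong to job executions,
# and the execution ID parsed from the task ID is used to look up the RunningJobExecution. The route_update
# name and the surrounding structure are hypothetical; the prefix check and manager call come from the class.
def route_update(manager, task_update):
    # Non-job tasks (for example, other framework tasks) fall through without touching the manager
    if not task_update.task_id.startswith(JOB_TASK_ID_PREFIX):
        return None
    # Returns the finished RunningJobExecution, or None if the execution is still running
    return manager.handle_task_update(task_update)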