def _create_finished_job_exe_messages(self, finished_job_exes):
    """Creates messages for finished job executions

    :param finished_job_exes: The finished job executions
    :type finished_job_exes: list
    :returns: The messages
    :rtype: list
    """

    timestamp = now()

    # Partition the executions by final status; anything else is ignored
    completed = [CompletedJob(exe.job_id, exe.exe_num)
                 for exe in finished_job_exes if exe.status == 'COMPLETED']
    failed = [FailedJob(exe.job_id, exe.exe_num, exe.error.id)
              for exe in finished_job_exes if exe.status == 'FAILED']

    messages = create_completed_jobs_messages(completed, timestamp)
    messages.extend(create_failed_jobs_messages(failed, timestamp))
    return messages
def execute(self):
    """See :meth:`messaging.messages.message.CommandMessage.execute`
    """

    lost_error = get_builtin_error('scheduler-lost')
    blank_results = TaskResults(do_validate=False)  # Blank results, no task info survived the restart
    jobs_to_fail = []
    exe_end_models = []

    # Any execution still unfinished that started before the scheduler restart was lost
    for exe in JobExecution.objects.get_unfinished_job_exes():
        if exe.started >= self.when:
            continue  # Started at/after the restart, leave it alone
        jobs_to_fail.append(FailedJob(exe.job_id, exe.exe_num, lost_error.id))
        exe_end_models.append(exe.create_job_exe_end_model(blank_results, 'FAILED', lost_error.id, self.when))

    # Create messages to fail unfinished jobs and executions
    if jobs_to_fail:
        logger.info('Failing %d job(s) that had started but not finished prior to scheduler restart',
                    len(jobs_to_fail))
        self.new_messages.extend(create_failed_jobs_messages(jobs_to_fail, self.when))
        self.new_messages.extend(create_job_exe_end_messages(exe_end_models))

    return True
def test_execute(self):
    """Tests calling FailedJobs.execute() successfully"""

    # error_1 is retryable, error_2 is not — drives retry-vs-fail behavior below
    error_1 = error_test_utils.create_error(should_be_retried=True)
    error_2 = error_test_utils.create_error(should_be_retried=False)
    data = JobData()
    job_1 = job_test_utils.create_job(num_exes=1, status='QUEUED', data=data.get_dict(), max_tries=2)
    job_2 = job_test_utils.create_job(num_exes=1, status='RUNNING', data=data.get_dict(), max_tries=2)
    job_3 = job_test_utils.create_job(num_exes=1, status='RUNNING', data=data.get_dict(), max_tries=1)
    job_4 = job_test_utils.create_job(num_exes=1, status='RUNNING', data=data.get_dict(), max_tries=2)
    job_5 = job_test_utils.create_job(num_exes=1, status='RUNNING', data=data.get_dict(), max_tries=2)
    job_6 = job_test_utils.create_job(num_exes=1, status='FAILED', data=data.get_dict(), max_tries=2)
    job_7 = job_test_utils.create_job(num_exes=0, status='CANCELED')
    job_ids = [job_1.id, job_2.id, job_3.id, job_4.id, job_5.id, job_6.id, job_7.id]
    from recipe.test import utils as recipe_test_utils
    # Jobs 3 and 4 belong to recipes, so failing them should trigger recipe updates
    recipe_1 = recipe_test_utils.create_recipe()
    recipe_test_utils.create_recipe_job(recipe=recipe_1, job=job_3)
    recipe_2 = recipe_test_utils.create_recipe()
    recipe_test_utils.create_recipe_job(recipe=recipe_2, job=job_4)
    when_ended = now()

    # Add jobs to message
    message = FailedJobs()
    message.ended = when_ended
    if message.can_fit_more():
        message.add_failed_job(FailedJob(job_1.id, job_1.num_exes, error_1.id))
    if message.can_fit_more():
        message.add_failed_job(FailedJob(job_2.id, job_2.num_exes, error_1.id))
    if message.can_fit_more():
        message.add_failed_job(FailedJob(job_3.id, job_3.num_exes, error_1.id))
    if message.can_fit_more():
        message.add_failed_job(FailedJob(job_4.id, job_4.num_exes, error_2.id))  # Error that cannot be retried
    if message.can_fit_more():
        message.add_failed_job(FailedJob(job_5.id, job_5.num_exes - 1, error_1.id))  # Mismatched exe_num
    if message.can_fit_more():
        message.add_failed_job(FailedJob(job_6.id, job_6.num_exes, error_1.id))
    if message.can_fit_more():
        message.add_failed_job(FailedJob(job_7.id, job_7.num_exes - 1, error_1.id))

    # Execute message
    result = message.execute()
    self.assertTrue(result)

    jobs = Job.objects.filter(id__in=job_ids).order_by('id')
    queued_jobs_msg = None
    update_recipes_msg = None
    self.assertEqual(len(message.new_messages), 2)
    for msg in message.new_messages:
        if msg.type == 'queued_jobs':
            queued_jobs_msg = msg
        elif msg.type == 'update_recipes':
            update_recipes_msg = msg
    self.assertEqual(len(queued_jobs_msg._queued_jobs), 2)  # 2 jobs should have been retried
    self.assertEqual(len(update_recipes_msg._recipe_ids), 2)  # 2 jobs should have been failed
    # Job 1 should be retried and put back on the queue
    self.assertEqual(jobs[0].status, 'QUEUED')
    self.assertEqual(jobs[0].num_exes, 1)
    self.assertEqual(queued_jobs_msg._queued_jobs[0].job_id, job_1.id)
    # Job 2 should be retried and put back on the queue
    self.assertEqual(jobs[1].status, 'RUNNING')
    self.assertEqual(jobs[1].num_exes, 1)
    self.assertEqual(queued_jobs_msg._queued_jobs[1].job_id, job_2.id)
    # Job 3 should be failed since max_tries is used up
    self.assertEqual(jobs[2].status, 'FAILED')
    self.assertEqual(jobs[2].num_exes, 1)
    self.assertEqual(jobs[2].error_id, error_1.id)
    self.assertEqual(jobs[2].ended, when_ended)
    self.assertTrue(recipe_1.id in update_recipes_msg._recipe_ids)
    # Job 4 should be failed since error cannot be retried
    self.assertEqual(jobs[3].status, 'FAILED')
    self.assertEqual(jobs[3].num_exes, 1)
    self.assertEqual(jobs[3].error_id, error_2.id)
    self.assertEqual(jobs[3].ended, when_ended)
    self.assertTrue(recipe_2.id in update_recipes_msg._recipe_ids)
    # Job 5 should be ignored since mismatched exe_num
    self.assertEqual(jobs[4].status, 'RUNNING')
    self.assertEqual(jobs[4].num_exes, 1)
    # Job 6 should be ignored since it is already failed
    self.assertEqual(jobs[5].status, 'FAILED')
    self.assertEqual(jobs[5].num_exes, 1)
    # Job 7 should be ignored since it is canceled
    self.assertEqual(jobs[6].status, 'CANCELED')
    self.assertEqual(jobs[6].num_exes, 0)

    # Test executing message again (simulates a redelivered message; must be idempotent)
    message_json_dict = message.to_json()
    message = FailedJobs.from_json(message_json_dict)
    result = message.execute()
    self.assertTrue(result)

    jobs = Job.objects.filter(id__in=job_ids).order_by('id')
    self.assertEqual(len(message.new_messages), 1)
    queued_jobs_msg = message.new_messages[0]
    self.assertEqual(queued_jobs_msg.type, 'queued_jobs')
    # The same 2 jobs should have been retried
    self.assertEqual(len(queued_jobs_msg._queued_jobs), 2)
    # Job 1 should be retried and put back on the queue
    self.assertEqual(jobs[0].status, 'QUEUED')
    self.assertEqual(jobs[0].num_exes, 1)
    self.assertEqual(queued_jobs_msg._queued_jobs[0].job_id, job_1.id)
    # Job 2 should be retried and put back on the queue
    self.assertEqual(jobs[1].status, 'RUNNING')
    self.assertEqual(jobs[1].num_exes, 1)
    self.assertEqual(queued_jobs_msg._queued_jobs[1].job_id, job_2.id)
    # Job 3 should be failed from first execution
    self.assertEqual(jobs[2].status, 'FAILED')
    self.assertEqual(jobs[2].num_exes, 1)
    self.assertEqual(jobs[2].error_id, error_1.id)
    # Job 4 should be failed from first execution
    self.assertEqual(jobs[3].status, 'FAILED')
    self.assertEqual(jobs[3].num_exes, 1)
    self.assertEqual(jobs[3].error_id, error_2.id)
    # Job 5 should be ignored since mismatched exe_num
    self.assertEqual(jobs[4].status, 'RUNNING')
    self.assertEqual(jobs[4].num_exes, 1)
    # Job 6 should be ignored since it is already failed
    self.assertEqual(jobs[5].status, 'FAILED')
    self.assertEqual(jobs[5].num_exes, 1)
    # Job 7 should be ignored since it is canceled
    self.assertEqual(jobs[6].status, 'CANCELED')
    self.assertEqual(jobs[6].num_exes, 0)
def test_json(self):
    """Tests converting a FailedJobs message to and from JSON"""

    error = error_test_utils.create_error(should_be_retried=True)
    data = JobData()
    job_1 = job_test_utils.create_job(num_exes=1, status='QUEUED', data=data.get_dict(), max_tries=2)
    job_2 = job_test_utils.create_job(num_exes=1, status='RUNNING', data=data.get_dict(), max_tries=1)
    job_3 = job_test_utils.create_job(num_exes=0, status='PENDING')
    job_ids = [job_1.id, job_2.id, job_3.id]
    from recipe.test import utils as recipe_test_utils
    # Job 2 belongs to a recipe, so failing it should trigger a recipe update
    recipe_1 = recipe_test_utils.create_recipe()
    recipe_test_utils.create_recipe_job(recipe=recipe_1, job=job_2)
    when_ended = now()

    # Add jobs to message
    message = FailedJobs()
    message.ended = when_ended
    if message.can_fit_more():
        message.add_failed_job(FailedJob(job_1.id, job_1.num_exes, error.id))
    if message.can_fit_more():
        message.add_failed_job(FailedJob(job_2.id, job_2.num_exes, error.id))
    if message.can_fit_more():
        message.add_failed_job(FailedJob(job_3.id, job_3.num_exes, error.id))

    # Convert message to JSON and back, and then execute
    message_json_dict = message.to_json()
    new_message = FailedJobs.from_json(message_json_dict)
    result = new_message.execute()

    self.assertTrue(result)
    jobs = Job.objects.filter(id__in=job_ids).order_by('id')
    queued_jobs_msg = None
    update_recipes_msg = None
    self.assertEqual(len(new_message.new_messages), 2)
    for msg in new_message.new_messages:
        if msg.type == 'queued_jobs':
            queued_jobs_msg = msg
        elif msg.type == 'update_recipes':
            update_recipes_msg = msg
    # Job 1 should be retried and put back on the queue
    self.assertEqual(jobs[0].status, 'QUEUED')
    self.assertEqual(jobs[0].num_exes, 1)
    self.assertEqual(len(queued_jobs_msg._queued_jobs), 1)
    self.assertEqual(queued_jobs_msg._queued_jobs[0].job_id, job_1.id)
    # Job 2 should be failed since max_tries is used up
    self.assertEqual(jobs[1].status, 'FAILED')
    self.assertEqual(jobs[1].num_exes, 1)
    self.assertEqual(jobs[1].error_id, error.id)
    self.assertEqual(jobs[1].ended, when_ended)
    self.assertEqual(len(update_recipes_msg._recipe_ids), 1)
    self.assertTrue(recipe_1.id in update_recipes_msg._recipe_ids)
    # Job 3 should ignore update
    self.assertEqual(jobs[2].status, 'PENDING')
    self.assertEqual(jobs[2].num_exes, 0)
def test_execute(self):
    """Tests calling RestartScheduler.execute() successfully"""

    started = now()
    scheduler_restarted = started + datetime.timedelta(seconds=30)
    started_later = scheduler_restarted + datetime.timedelta(seconds=30)
    running_job_exe_1 = job_test_utils.create_running_job_exe(started=started)
    running_job_exe_2 = job_test_utils.create_running_job_exe(started=started)
    running_job_exe_3 = job_test_utils.create_running_job_exe(started=started)
    running_job_exe_4 = job_test_utils.create_running_job_exe(started=started_later)  # After scheduler restart
    # Set job 1 so it is still QUEUED
    Job.objects.filter(id=running_job_exe_1.job_id).update(status='QUEUED')
    # Set job 3 to COMPLETED, so it should not be failed by scheduler restart
    Job.objects.filter(id=running_job_exe_3.job_id).update(status='COMPLETED')

    # Create message
    message = RestartScheduler()
    message.when = scheduler_restarted

    # Execute message
    result = message.execute()
    self.assertTrue(result)

    failed_jobs_msg = None
    job_exe_end_msg = None
    self.assertEqual(len(message.new_messages), 2)
    for msg in message.new_messages:
        if msg.type == 'failed_jobs':
            failed_jobs_msg = msg
        elif msg.type == 'create_job_exe_ends':
            job_exe_end_msg = msg
    error = get_builtin_error('scheduler-lost')
    # Jobs 1 and 2 should be in messages to be failed, Jobs 3 and 4 should not be included
    expected_failed_jobs = {FailedJob(running_job_exe_1.job_id, running_job_exe_1.exe_num, error.id),
                            FailedJob(running_job_exe_2.job_id, running_job_exe_2.exe_num, error.id)}
    expected_failed_job_exe_ids = {running_job_exe_1.id, running_job_exe_2.id}
    self.assertSetEqual(set(failed_jobs_msg._failed_jobs.values()[0]), expected_failed_jobs)
    failed_job_exe_ids = set()
    for job_exe_end_model in job_exe_end_msg._job_exe_ends:
        failed_job_exe_ids.add(job_exe_end_model.job_exe_id)
    self.assertSetEqual(failed_job_exe_ids, expected_failed_job_exe_ids)

    # Test executing message again, should get same result
    message_json_dict = message.to_json()
    message = RestartScheduler.from_json(message_json_dict)
    result = message.execute()
    self.assertTrue(result)

    # Re-extract the messages produced by the SECOND execution. The previous
    # version of this test re-asserted against the first execution's message
    # objects here, so the repeated-execution path was never actually verified.
    failed_jobs_msg = None
    job_exe_end_msg = None
    self.assertEqual(len(message.new_messages), 2)
    for msg in message.new_messages:
        if msg.type == 'failed_jobs':
            failed_jobs_msg = msg
        elif msg.type == 'create_job_exe_ends':
            job_exe_end_msg = msg
    # Jobs 1 and 2 should be in messages to be failed, Jobs 3 and 4 should not be included
    self.assertSetEqual(set(failed_jobs_msg._failed_jobs.values()[0]), expected_failed_jobs)
    failed_job_exe_ids = set()
    for job_exe_end_model in job_exe_end_msg._job_exe_ends:
        failed_job_exe_ids.add(job_exe_end_model.job_exe_id)
    self.assertSetEqual(failed_job_exe_ids, expected_failed_job_exe_ids)