def test_paused_node(self):
    """Tests adding job executions when the node is paused"""

    node_offers = NodeOffers(self.paused_node)
    offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
    node_offers.add_offer(offer_1)
    offer_2 = ResourceOffer('offer_2', self.node_agent_paused, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
    node_offers.add_offer(offer_2)
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    # Ensure it accepts new tasks for already running job executions
    job_exe_1 = RunningJobExecution(self.running_job_exe_1)
    result = node_offers.consider_next_task(job_exe_1)
    self.assertEqual(result, NodeOffers.ACCEPTED)
    job_exe_2 = RunningJobExecution(self.running_job_exe_2)
    result = node_offers.consider_next_task(job_exe_2)
    self.assertEqual(result, NodeOffers.ACCEPTED)

    # Don't accept new job executions while paused
    job_exe_new = QueuedJobExecution(self.queue_1)
    result = node_offers.consider_new_job_exe(job_exe_new)
    self.assertEqual(result, NodeOffers.NODE_NOT_READY)

    self.assertTrue(node_offers.has_accepted_job_exes())
    self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
    self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()), {job_exe_1, job_exe_2})
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    self.assertEqual(node_offers._available_cpus, 68.0)
    self.assertEqual(node_offers._available_mem, 1536.0)
    self.assertEqual(node_offers._available_disk, 2222.0)
def test_timed_out_system_job_task(self):
    """Tests running through a job execution where a system job task times out"""

    ingest_job_type = Ingest.objects.get_ingest_job_type()
    ingest_job_type.max_tries = 1
    ingest_job_type.save()
    job = job_test_utils.create_job(job_type=ingest_job_type, num_exes=1)
    job_exe = job_test_utils.create_job_exe(job=job)
    running_job_exe = RunningJobExecution(job_exe)

    # Start job-task and then task times out
    when_launched = now() + timedelta(seconds=1)
    job_task_started = when_launched + timedelta(seconds=1)
    when_timed_out = job_task_started + timedelta(seconds=1)
    job_task = running_job_exe.start_next_task()
    self.task_mgr.launch_tasks([job_task], when_launched)
    update = job_test_utils.create_task_status_update(job_task.id, 'agent', TaskStatusUpdate.RUNNING,
                                                      job_task_started)
    self.task_mgr.handle_task_update(update)
    running_job_exe.task_update(update)
    running_job_exe.execution_timed_out(job_task, when_timed_out)

    self.assertTrue(running_job_exe.is_finished())
    self.assertFalse(running_job_exe.is_next_task_ready())

    job_exe = JobExecution.objects.get(id=job_exe.id)
    self.assertEqual('FAILED', job_exe.status)
    self.assertEqual('ingest-timeout', job_exe.error.name)
    self.assertEqual(when_timed_out, job_exe.ended)
def init_with_database(self):
    """Initializes the job execution metrics with the execution history from the database"""

    oldest_time = self._finished_metrics_over_time.time_blocks[0].start
    blank_config = ExecutionConfiguration()
    for job_exe_end in JobExecutionEnd.objects.get_recent_job_exe_end_metrics(oldest_time):
        running_job_exe = RunningJobExecution('', job_exe_end.job_exe, job_exe_end.job_type, blank_config, 0)
        running_job_exe._set_final_status(job_exe_end.status, job_exe_end.ended, job_exe_end.error)
        self._finished_metrics.add_job_execution(running_job_exe)
        self._finished_metrics_over_time.add_job_execution(running_job_exe)
def test_job_exe_clean_task(self, mock_get_slaves):
    """Tests the NodeManager where a cleanup task is returned to clean up a job execution"""

    mock_get_slaves.return_value = self.slave_infos
    when = now()

    node_mgr = NodeManager()
    node_mgr.register_agent_ids([self.node_agent_1, self.node_agent_2])
    node_mgr.sync_with_database('master_host', 5050)
    cleanup_mgr = CleanupManager()
    cleanup_mgr.update_nodes(node_mgr.get_nodes())
    tasks = node_mgr.get_next_tasks(when)

    task_mgr = TaskManager()
    # Complete initial cleanup tasks
    for task in tasks:
        task_mgr.launch_tasks([task], now())
        update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
        task_mgr.handle_task_update(update)
        node_mgr.handle_task_update(update)

    # Mark image pull done to get rid of image tasks
    for node in node_mgr.get_nodes():
        node._image_pull_completed()
        node._update_state()

    job_exe = job_test_utils.create_job_exe(node=self.node_1)

    # Add a job execution to clean up and get the cleanup task for it
    cleanup_mgr.add_job_execution(RunningJobExecution(job_exe))
    tasks = node_mgr.get_next_tasks(when)
    self.assertEqual(len(tasks), 1)
    task = tasks[0]
    self.assertEqual(task.agent_id, self.node_agent_1)
    self.assertFalse(task.is_initial_cleanup)
    self.assertEqual(len(task.job_exes), 1)
def setUp(self):
    django.setup()

    # Clear error cache so test works correctly
    CACHED_BUILTIN_ERRORS.clear()

    self.node_model_1 = node_test_utils.create_node()
    self.job_exe_model_1 = job_test_utils.create_job_exe(status='RUNNING', node=self.node_model_1)
    self.job_exe_1 = RunningJobExecution(self.job_exe_model_1)
    self.node_model_2 = node_test_utils.create_node()
    self.job_exe_model_2 = job_test_utils.create_job_exe(status='RUNNING', node=self.node_model_2)
    self.job_exe_2 = RunningJobExecution(self.job_exe_model_2)

    self.job_exe_mgr = JobExecutionManager()
def test_handle_regular_cleanup_task(self):
    """Tests handling a regular cleanup task"""

    when = now()
    node = Node(self.node_agent, self.node)
    node._last_heath_task = when
    node._initial_cleanup_completed()
    node._image_pull_completed()
    node._update_state()

    # No task since there are no job executions to clean
    self.assertListEqual([], node.get_next_tasks(when))

    # Add job execution and complete task to clean it up
    job_exe = RunningJobExecution(self.job_exe)
    node.add_job_execution(job_exe)
    task = node.get_next_tasks(when)[0]
    self.assertTrue(task.id.startswith(CLEANUP_TASK_ID_PREFIX))
    self.assertFalse(task.is_initial_cleanup)
    self.assertListEqual(task.job_exes, [job_exe])

    self.task_mgr.launch_tasks([task], now())
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.RUNNING, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)
    update = job_test_utils.create_task_status_update(task.id, task.agent_id, TaskStatusUpdate.FINISHED, now())
    self.task_mgr.handle_task_update(update)
    node.handle_task_update(update)

    # No task since all job executions have been cleaned
    self.assertListEqual([], node.get_next_tasks(when))
def test_lost_node(self):
    """Tests accepting a running and queued job execution and then the node being lost"""

    offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
    offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
    manager = OfferManager()
    manager.add_new_offers([offer_1, offer_2])
    manager.update_nodes([self.node, self.paused_node])
    manager.ready_new_offers()

    job_exe_1 = QueuedJobExecution(self.queue_1)
    result = manager.consider_new_job_exe(job_exe_1)
    self.assertEqual(result, OfferManager.ACCEPTED)

    job_exe_2 = RunningJobExecution(self.running_job_exe_2)
    result = manager.consider_next_task(job_exe_2)
    self.assertEqual(result, OfferManager.ACCEPTED)

    manager.lost_node(self.node_agent)
    node_offers = manager.pop_offers_with_accepted_job_exes()
    self.assertEqual(len(node_offers), 0)
def test_lost_node(self):
    """Tests when the node is lost"""

    node_offers = NodeOffers(self.node)
    offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
    node_offers.add_offer(offer_1)
    offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
    node_offers.add_offer(offer_2)
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    # Accept a couple job executions
    job_exe_1 = RunningJobExecution(self.running_job_exe_1)
    result = node_offers.consider_next_task(job_exe_1)
    self.assertEqual(result, NodeOffers.ACCEPTED)

    job_exe_2 = QueuedJobExecution(self.queue_1)
    result = node_offers.consider_new_job_exe(job_exe_2)
    self.assertEqual(result, NodeOffers.ACCEPTED)

    self.assertTrue(node_offers.has_accepted_job_exes())
    self.assertGreater(node_offers._available_cpus, 0.0)
    self.assertGreater(node_offers._available_mem, 0.0)
    self.assertGreater(node_offers._available_disk, 0.0)

    # Node is lost
    node_offers.lost_node()
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertEqual(node_offers._available_cpus, 0.0)
    self.assertEqual(node_offers._available_mem, 0.0)
    self.assertEqual(node_offers._available_disk, 0.0)
def init_with_database(self):
    """Initializes the job execution metrics with the execution history from the database"""

    oldest_time = self._finished_metrics_over_time.time_blocks[0].start
    # TODO: this should be in the manager, but the JobExecution model is going to be completely re-worked anyway
    job_exe_query = JobExecution.objects.select_related('error')
    job_exe_query = job_exe_query.filter(status__in=['COMPLETED', 'FAILED'], ended__gte=oldest_time)
    for job_exe_model in job_exe_query:
        job_exe = RunningJobExecution(job_exe_model)
        job_exe._set_finished_status(job_exe_model.status, job_exe_model.ended, job_exe_model.error)
        self._finished_metrics.add_job_execution(job_exe)
        self._finished_metrics_over_time.add_job_execution(job_exe)
def create_running_job_exe(agent_id='agent_1', job_type=None, job=None, node=None, timeout=None,
                           input_file_size=10.0, queued=None, started=None, resources=None, priority=None,
                           num_exes=1):
    """Creates a running job execution for unit testing

    :returns: The running job execution
    :rtype: :class:`job.execution.job_exe.RunningJobExecution`
    """

    when = timezone.now()
    if not job:
        job = create_job(job_type=job_type, status='RUNNING', input_file_size=input_file_size, num_exes=num_exes)
    job_type = job.job_type

    # Configuration that occurs at queue time
    input_files = {}
    input_file_ids = job.get_job_data().get_input_file_ids()
    if input_file_ids:
        for input_file in ScaleFile.objects.get_files_for_queued_jobs(input_file_ids):
            input_files[input_file.id] = input_file
    exe_config = QueuedExecutionConfigurator(input_files).configure_queued_job(job)

    job_exe = JobExecution()
    job_exe.set_cluster_id('1234', job.id, job.num_exes)
    job_exe.job = job
    job_exe.job_type = job_type
    job_exe.exe_num = job.num_exes
    if not node:
        node = node_utils.create_node()
    job_exe.node = node
    if not timeout:
        timeout = job.timeout
    job_exe.timeout = timeout
    job_exe.input_file_size = input_file_size
    if not resources:
        resources = job.get_resources()
    job_exe.resources = resources.get_json().get_dict()
    job_exe.configuration = exe_config.get_dict()
    if not queued:
        queued = when
    job_exe.queued = queued
    if not started:
        started = when + datetime.timedelta(seconds=1)
    job_exe.started = started
    job_exe.save()

    if not priority:
        priority = job.priority

    # Configuration that occurs at schedule time
    workspaces = {}
    for workspace in Workspace.objects.all():
        workspaces[workspace.name] = workspace
    secret_config = ScheduledExecutionConfigurator(workspaces).configure_scheduled_job(
        job_exe, job_type, job_type.get_job_interface(), 'INFO')
    return RunningJobExecution(agent_id, job_exe, job_type, secret_config, priority)
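# A brief usage sketch for create_running_job_exe() above (hedged: assumes a populated
# Django test database, as in the tests that call this helper; the keyword values and
# the example function name are illustrative, not part of the real test suite):
def _example_create_running_job_exe_usage():
    # Create a running execution with default resources on a freshly created node
    running_job_exe = create_running_job_exe(agent_id='agent_2', priority=50)
    # The returned RunningJobExecution can then drive its task lifecycle in a test
    task = running_job_exe.start_next_task()  # first task of the new execution
    return task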
def test_consider_next_task(self):
    """Tests consider_next_task() and get_accepted_running_job_exes()"""

    node_offers = NodeOffers(self.node)
    offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
    node_offers.add_offer(offer_1)
    offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
    node_offers.add_offer(offer_2)
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    job_exe_1 = RunningJobExecution(self.running_job_exe_1)
    result = node_offers.consider_next_task(job_exe_1)
    self.assertEqual(result, NodeOffers.ACCEPTED)
    result = node_offers.consider_next_task(job_exe_1)  # Same job_exe, should have no effect
    self.assertEqual(result, NodeOffers.ACCEPTED)

    job_exe_high_cpus = RunningJobExecution(self.running_job_exe_high_cpus)
    result = node_offers.consider_next_task(job_exe_high_cpus)
    self.assertEqual(result, NodeOffers.NOT_ENOUGH_CPUS)

    job_exe_high_mem = RunningJobExecution(self.running_job_exe_high_mem)
    result = node_offers.consider_next_task(job_exe_high_mem)
    self.assertEqual(result, NodeOffers.NOT_ENOUGH_MEM)

    job_exe_high_disk = RunningJobExecution(self.running_job_exe_high_disk)
    result = node_offers.consider_next_task(job_exe_high_disk)
    self.assertEqual(result, NodeOffers.NOT_ENOUGH_DISK)

    job_exe_2 = RunningJobExecution(self.running_job_exe_2)
    result = node_offers.consider_next_task(job_exe_2)
    self.assertEqual(result, NodeOffers.ACCEPTED)

    self.assertTrue(node_offers.has_accepted_job_exes())
    self.assertEqual(len(node_offers.get_accepted_running_job_exes()), 2)
    self.assertSetEqual(set(node_offers.get_accepted_running_job_exes()), {job_exe_1, job_exe_2})
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    self.assertEqual(node_offers._available_cpus, 68.0)
    self.assertEqual(node_offers._available_mem, 1536.0)
    self.assertEqual(node_offers._available_disk, 2222.0)
def test_job_exe_canceled(self):
    """Tests adding a job execution that becomes canceled while scheduling"""

    node_offers = NodeOffers(self.node)
    offer_1 = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=24.0, mem=1024.0, disk=1024.0))
    node_offers.add_offer(offer_1)
    offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=50.0, mem=2048.0, disk=2048.0))
    node_offers.add_offer(offer_2)
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    job_exe_1 = RunningJobExecution(self.running_job_exe_1)
    job_exe_1.execution_canceled()
    result = node_offers.consider_next_task(job_exe_1)
    self.assertEqual(result, NodeOffers.TASK_INVALID)

    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])
def test_no_offers(self):
    """Tests adding job executions when there are no offers"""

    node_offers = NodeOffers(self.node)
    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])

    job_exe_1 = RunningJobExecution(self.running_job_exe_1)
    result = node_offers.consider_next_task(job_exe_1)
    self.assertEqual(result, NodeOffers.NO_OFFERS)

    job_exe_new = QueuedJobExecution(self.queue_1)
    result = node_offers.consider_new_job_exe(job_exe_new)
    self.assertEqual(result, NodeOffers.NO_OFFERS)

    self.assertFalse(node_offers.has_accepted_job_exes())
    self.assertListEqual(node_offers.get_accepted_running_job_exes(), [])
    self.assertListEqual(node_offers.get_accepted_new_job_exes(), [])
def test_no_ready_offers(self):
    """Tests considering job executions when no offers are ready"""

    offer_1 = ResourceOffer('offer_1', self.node_agent_paused, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
    offer_2 = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))
    manager = OfferManager()
    manager.add_new_offers([offer_1, offer_2])

    job_exe_1 = QueuedJobExecution(self.queue_1)
    result = manager.consider_new_job_exe(job_exe_1)
    self.assertEqual(result, OfferManager.NO_NODES_AVAILABLE)

    job_exe_2 = RunningJobExecution(self.running_job_exe_1)
    result = manager.consider_next_task(job_exe_2)
    self.assertEqual(result, OfferManager.NODE_NOT_READY)
def test_job_type_limit(self, mock_taskinfo):
    """Tests running the scheduling thread with a job type limit"""

    mock_taskinfo.return_value = MagicMock()

    Queue.objects.all().delete()
    job_type_with_limit = job_test_utils.create_job_type()
    job_type_with_limit.max_scheduled = 4
    job_type_with_limit.save()
    job_exe_1 = job_test_utils.create_job_exe(job_type=job_type_with_limit, status='RUNNING')
    queue_test_utils.create_queue(job_type=job_type_with_limit)
    queue_test_utils.create_queue(job_type=job_type_with_limit)
    queue_test_utils.create_queue(job_type=job_type_with_limit)
    queue_test_utils.create_queue(job_type=job_type_with_limit)
    queue_test_utils.create_queue(job_type=job_type_with_limit)
    queue_test_utils.create_queue(job_type=job_type_with_limit)
    job_type_mgr.sync_with_database()
    # One job of this type is already running
    job_exe_mgr.schedule_job_exes([RunningJobExecution(job_exe_1)])

    offer_1 = ResourceOffer('offer_1', self.node_agent_1, NodeResources(cpus=200.0, mem=102400.0, disk=102400.0))
    offer_2 = ResourceOffer('offer_2', self.node_agent_2, NodeResources(cpus=200.0, mem=204800.0, disk=204800.0))
    offer_mgr.add_new_offers([offer_1, offer_2])

    # Ignore Docker pull tasks
    for node in node_mgr.get_nodes():
        node._is_image_pulled = True

    # Ignore cleanup tasks
    for node in node_mgr.get_nodes():
        node._initial_cleanup_completed()
        node._update_state()

    num_tasks = self._scheduling_thread._perform_scheduling()
    self.assertEqual(num_tasks, 3)  # One is already running, should only be able to schedule 3 more
def schedule_job_executions(self, framework_id, job_executions, workspaces):
    """Schedules the given job executions on the provided nodes and resources. The corresponding queue models
    will be deleted from the database. All database changes occur in an atomic transaction.

    :param framework_id: The scheduling framework ID
    :type framework_id: string
    :param job_executions: A list of queued job executions that have been given nodes and resources on which to run
    :type job_executions: list[:class:`queue.job_exe.QueuedJobExecution`]
    :param workspaces: A dict of all workspaces stored by name
    :type workspaces: {string: :class:`storage.models.Workspace`}
    :returns: The scheduled job executions
    :rtype: list[:class:`job.execution.job_exe.RunningJobExecution`]
    """

    if not job_executions:
        return []

    job_exe_ids = []
    for job_execution in job_executions:
        job_exe_ids.append(job_execution.id)

    # Lock corresponding job executions
    job_exes = {}
    for job_exe in JobExecution.objects.select_for_update().filter(id__in=job_exe_ids).order_by('id'):
        job_exes[job_exe.id] = job_exe

    # Set up job executions to schedule
    executions_to_schedule = []
    for job_execution in job_executions:
        queue = job_execution.queue
        node_id = job_execution.provided_node_id
        resources = job_execution.provided_resources
        job_exe = job_exes[job_execution.id]

        # Ignore executions that are no longer queued (executions may have been changed since queue model was last
        # queried)
        if job_exe.status != 'QUEUED':
            continue

        # Check that resources are sufficient
        if resources.cpus < queue.cpus_required:
            msg = 'Job execution requires %s CPUs and only %s were provided'
            raise Exception(msg % (str(queue.cpus_required), str(resources.cpus)))
        if resources.mem < queue.mem_required:
            msg = 'Job execution requires %s MiB of memory and only %s MiB were provided'
            raise Exception(msg % (str(queue.mem_required), str(resources.mem)))
        if resources.disk_in < queue.disk_in_required:
            msg = 'Job execution requires %s MiB of input disk space and only %s MiB were provided'
            raise Exception(msg % (str(queue.disk_in_required), str(resources.disk_in)))
        if resources.disk_out < queue.disk_out_required:
            msg = 'Job execution requires %s MiB of output disk space and only %s MiB were provided'
            raise Exception(msg % (str(queue.disk_out_required), str(resources.disk_out)))
        if resources.disk_total < queue.disk_total_required:
            msg = 'Job execution requires %s MiB of total disk space and only %s MiB were provided'
            raise Exception(msg % (str(queue.disk_total_required), str(resources.disk_total)))

        executions_to_schedule.append((job_exe, node_id, resources))

    # Schedule job executions
    scheduled_job_exes = []
    job_exe_ids_scheduled = []
    for job_exe in JobExecution.objects.schedule_job_executions(framework_id, executions_to_schedule, workspaces):
        scheduled_job_exes.append(RunningJobExecution(job_exe))
        job_exe_ids_scheduled.append(job_exe.id)

    # Clear the scheduled job executions from the queue
    Queue.objects.filter(job_exe_id__in=job_exe_ids_scheduled).delete()

    return scheduled_job_exes
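# A hedged usage sketch for schedule_job_executions() above. The caller name
# 'queue_manager' and the inputs 'accepted_job_exes' and 'workspaces' are illustrative
# stand-ins for whatever the scheduler actually passes in after matching queued
# executions to nodes and resources:
def _example_schedule_job_executions(queue_manager, accepted_job_exes, workspaces):
    # Atomically convert accepted queued executions into running executions and
    # remove their queue models
    running_job_exes = queue_manager.schedule_job_executions('framework_1', accepted_job_exes, workspaces)
    for running_job_exe in running_job_exes:
        running_job_exe.start_next_task()  # each execution can now launch its first task
    return running_job_exes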
def test_running_executions(self):
    """Tests the metrics with running executions that complete"""

    node_model_1 = node_test_utils.create_node()
    node_model_2 = node_test_utils.create_node()
    job_type_1 = job_test_utils.create_job_type()
    job_type_2 = job_test_utils.create_job_type()
    job_exe_model_1 = job_test_utils.create_job_exe(job_type=job_type_1, status='RUNNING', node=node_model_1)
    job_exe_model_2 = job_test_utils.create_job_exe(job_type=job_type_1, status='RUNNING', node=node_model_1)
    job_exe_model_3 = job_test_utils.create_job_exe(job_type=job_type_1, status='RUNNING', node=node_model_1)
    job_exe_model_4 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_1)
    job_exe_model_5 = job_test_utils.create_job_exe(job_type=job_type_1, status='RUNNING', node=node_model_2)
    job_exe_model_6 = job_test_utils.create_job_exe(job_type=job_type_1, status='RUNNING', node=node_model_2)
    job_exe_model_7 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_2)
    job_exe_model_8 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_2)
    job_exe_model_9 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_2)
    job_exe_model_10 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_2)
    job_exe_model_11 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_2)
    job_exe_1 = RunningJobExecution(job_exe_model_1)
    job_exe_2 = RunningJobExecution(job_exe_model_2)
    job_exe_3 = RunningJobExecution(job_exe_model_3)
    job_exe_4 = RunningJobExecution(job_exe_model_4)
    job_exe_5 = RunningJobExecution(job_exe_model_5)
    job_exe_6 = RunningJobExecution(job_exe_model_6)
    job_exe_7 = RunningJobExecution(job_exe_model_7)
    job_exe_8 = RunningJobExecution(job_exe_model_8)
    job_exe_9 = RunningJobExecution(job_exe_model_9)
    job_exe_10 = RunningJobExecution(job_exe_model_10)
    job_exe_11 = RunningJobExecution(job_exe_model_11)

    # NOTE: This unit test is about to get CRAZY. I apologize for the complexity, but this is needed for a
    # thorough testing
    self.metrics.add_running_job_exes([job_exe_1, job_exe_2, job_exe_3, job_exe_4, job_exe_5, job_exe_6,
                                       job_exe_7, job_exe_8, job_exe_9, job_exe_10, job_exe_11])
    node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
    self.metrics.generate_status_json(node_list_dict, now())

    # Check expected totals
    self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 4)
    for job_type_dict in node_list_dict[0]['job_executions']['running']['by_job_type']:
        if job_type_dict['job_type_id'] == job_type_1.id:
            self.assertEqual(job_type_dict['count'], 3)
        elif job_type_dict['job_type_id'] == job_type_2.id:
            self.assertEqual(job_type_dict['count'], 1)
        else:
            self.fail('Unexpected job type ID')
    self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 0)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 0)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 0)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 7)
    for job_type_dict in node_list_dict[1]['job_executions']['running']['by_job_type']:
        if job_type_dict['job_type_id'] == job_type_1.id:
            self.assertEqual(job_type_dict['count'], 2)
        elif job_type_dict['job_type_id'] == job_type_2.id:
            self.assertEqual(job_type_dict['count'], 5)
        else:
            self.fail('Unexpected job type ID')
    self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

    # Finish some job executions
    end_time_1 = now()
    job_exe_1._set_finished_status('COMPLETED', end_time_1)
    job_exe_2._set_finished_status('FAILED', end_time_1, error=self.data_error)
    job_exe_4._set_finished_status('FAILED', end_time_1, error=self.alg_error)
    self.metrics.job_exe_finished(job_exe_1)
    self.metrics.job_exe_finished(job_exe_2)
    self.metrics.job_exe_finished(job_exe_4)
    node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
    self.metrics.generate_status_json(node_list_dict, end_time_1 + datetime.timedelta(seconds=1))

    # Check expected totals
    self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 1)
    self.assertEqual(len(node_list_dict[0]['job_executions']['running']['by_job_type']), 1)
    self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['count'], 1)
    self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['job_type_id'], job_type_1.id)
    self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 1)
    self.assertEqual(len(node_list_dict[0]['job_executions']['completed']['by_job_type']), 1)
    self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['count'], 1)
    self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['job_type_id'],
                     job_type_1.id)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 2)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 1)
    self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type']), 1)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['count'], 1)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['job_type_id'],
                     job_type_2.id)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
    self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['data']['by_job_type']), 1)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['count'], 1)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['job_type_id'],
                     job_type_1.id)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 7)
    for job_type_dict in node_list_dict[1]['job_executions']['running']['by_job_type']:
        if job_type_dict['job_type_id'] == job_type_1.id:
            self.assertEqual(job_type_dict['count'], 2)
        elif job_type_dict['job_type_id'] == job_type_2.id:
            self.assertEqual(job_type_dict['count'], 5)
        else:
            self.fail('Unexpected job type ID')
    self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

    # Finish some job executions (all executions still on node 2)
    end_time_2 = end_time_1 + FinishedJobExeMetricsOverTime.BLOCK_LENGTH
    job_exe_5._set_finished_status('COMPLETED', end_time_2)
    job_exe_6._set_finished_status('COMPLETED', end_time_2)
    job_exe_7._set_finished_status('COMPLETED', end_time_2)
    job_exe_8._set_finished_status('COMPLETED', end_time_2)
    job_exe_9._set_finished_status('COMPLETED', end_time_2)
    job_exe_10._set_finished_status('COMPLETED', end_time_2)
    job_exe_11._set_finished_status('COMPLETED', end_time_2)
    self.metrics.job_exe_finished(job_exe_5)
    self.metrics.job_exe_finished(job_exe_6)
    self.metrics.job_exe_finished(job_exe_7)
    self.metrics.job_exe_finished(job_exe_8)
    self.metrics.job_exe_finished(job_exe_9)
    self.metrics.job_exe_finished(job_exe_10)
    self.metrics.job_exe_finished(job_exe_11)
    node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
    self.metrics.generate_status_json(node_list_dict, end_time_2)

    # Check expected totals
    self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 1)
    self.assertEqual(len(node_list_dict[0]['job_executions']['running']['by_job_type']), 1)
    self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['count'], 1)
    self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['job_type_id'], job_type_1.id)
    self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 1)
    self.assertEqual(len(node_list_dict[0]['job_executions']['completed']['by_job_type']), 1)
    self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['count'], 1)
    self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['job_type_id'],
                     job_type_1.id)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 2)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 1)
    self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type']), 1)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['count'], 1)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['job_type_id'],
                     job_type_2.id)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
    self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['data']['by_job_type']), 1)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['count'], 1)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['job_type_id'],
                     job_type_1.id)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 7)
    for job_type_dict in node_list_dict[1]['job_executions']['completed']['by_job_type']:
        if job_type_dict['job_type_id'] == job_type_1.id:
            self.assertEqual(job_type_dict['count'], 2)
        elif job_type_dict['job_type_id'] == job_type_2.id:
            self.assertEqual(job_type_dict['count'], 5)
        else:
            self.fail('Unexpected job type ID')
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

    # Let all finished job executions roll off by time, only running remaining
    end_time_3 = end_time_2 + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD
    end_time_3 += FinishedJobExeMetricsOverTime.BLOCK_LENGTH + datetime.timedelta(seconds=1)
    node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
    self.metrics.generate_status_json(node_list_dict, end_time_3)

    # Check expected totals
    self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 1)
    self.assertEqual(len(node_list_dict[0]['job_executions']['running']['by_job_type']), 1)
    self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['count'], 1)
    self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['job_type_id'], job_type_1.id)
    self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 0)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 0)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 0)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
    self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)
def _process_scheduled_job_executions(self, framework_id, queued_job_executions, job_types, workspaces):
    """Processes the given queued job executions that have been scheduled and returns the new running job
    executions. All database updates occur in an atomic transaction.

    :param framework_id: The scheduling framework ID
    :type framework_id: string
    :param queued_job_executions: A list of queued job executions that have been scheduled
    :type queued_job_executions: list
    :param job_types: A dict of all job types stored by ID
    :type job_types: dict
    :param workspaces: A dict of all workspaces stored by name
    :type workspaces: dict
    :returns: The running job executions stored in lists by node ID
    :rtype: dict
    """

    started = now()
    running_job_exes = {}
    configurator = ScheduledExecutionConfigurator(workspaces)

    with transaction.atomic():
        # Bulk create the job execution models
        job_exe_models = []
        scheduled_models = {}  # {queue ID: (job_exe model, config)}
        canceled_models = {}  # {queue ID: job_exe model}
        for queued_job_exe in queued_job_executions:
            job_exe_model = queued_job_exe.create_job_exe_model(framework_id, started)
            job_exe_models.append(job_exe_model)
            if queued_job_exe.is_canceled:
                canceled_models[queued_job_exe.id] = job_exe_model
            else:
                job_type = job_types[job_exe_model.job_type_id]
                # The configuration stored in the job_exe model has been censored so it is safe to save in database
                # The returned configuration may contain secrets and should be passed to running job_exe for use
                config = configurator.configure_scheduled_job(job_exe_model, job_type, queued_job_exe.interface,
                                                              scheduler_mgr.config.system_logging_level)
                scheduled_models[queued_job_exe.id] = (job_exe_model, config)
        JobExecution.objects.bulk_create(job_exe_models)

        # Create running and canceled job executions
        queue_ids = []
        canceled_job_exe_end_models = []
        for queued_job_exe in queued_job_executions:
            queue_ids.append(queued_job_exe.id)
            if queued_job_exe.is_canceled:
                job_exe_model = canceled_models[queued_job_exe.id]
                canceled_job_exe_end_models.append(job_exe_model.create_canceled_job_exe_end_model(started))
            else:
                agent_id = queued_job_exe.scheduled_agent_id
                job_exe_model = scheduled_models[queued_job_exe.id][0]
                job_type = job_types[job_exe_model.job_type_id]
                config = scheduled_models[queued_job_exe.id][1]  # May contain secrets!
                priority = queued_job_exe.priority
                running_job_exe = RunningJobExecution(agent_id, job_exe_model, job_type, config, priority)
                if running_job_exe.node_id in running_job_exes:
                    running_job_exes[running_job_exe.node_id].append(running_job_exe)
                else:
                    running_job_exes[running_job_exe.node_id] = [running_job_exe]

        # Add canceled job execution end models to manager to be sent to messaging backend
        if canceled_job_exe_end_models:
            job_exe_mgr.add_canceled_job_exes(canceled_job_exe_end_models)

        # Delete queue models
        Queue.objects.filter(id__in=queue_ids).delete()

    duration = now() - started
    msg = 'Queries to process scheduled jobs took %.3f seconds'
    if duration > SCHEDULE_QUERY_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())

    return running_job_exes
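# The per-node grouping in the method above is the standard dict-of-lists pattern. A
# self-contained sketch of the same pattern, using setdefault() as an equivalent to the
# if/else branching above (the function name and input are illustrative; any objects
# with a node_id attribute will do):
def _group_by_node(job_exes):
    """Groups executions into lists keyed by node ID, mirroring the loop above."""
    grouped = {}
    for job_exe in job_exes:
        # setdefault() creates the list on first sight of a node ID, then appends
        grouped.setdefault(job_exe.node_id, []).append(job_exe)
    return grouped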
class TestJobExecutionManager(TransactionTestCase):
    """Tests the JobExecutionManager class"""

    fixtures = ['basic_errors.json', 'basic_job_errors.json']

    def setUp(self):
        django.setup()

        # Clear error cache so test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        self.node_model_1 = node_test_utils.create_node()
        self.job_exe_model_1 = job_test_utils.create_job_exe(status='RUNNING', node=self.node_model_1)
        self.job_exe_1 = RunningJobExecution(self.job_exe_model_1)
        self.node_model_2 = node_test_utils.create_node()
        self.job_exe_model_2 = job_test_utils.create_job_exe(status='RUNNING', node=self.node_model_2)
        self.job_exe_2 = RunningJobExecution(self.job_exe_model_2)

        self.job_exe_mgr = JobExecutionManager()

    def test_generate_status_json(self):
        """Tests calling generate_status_json() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2])
        json_dict = [{'id': self.node_model_1.id}, {'id': self.node_model_2.id}]
        self.job_exe_mgr.generate_status_json(json_dict, now())

        for node_dict in json_dict:
            self.assertEqual(node_dict['job_executions']['running']['total'], 1)

    def test_schedule_job_exes(self):
        """Tests calling schedule_job_exes() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2])

        # Both executions should be in the manager and ready
        self.assertEqual(len(self.job_exe_mgr.get_running_job_exes()), 2)
        self.assertEqual(len(self.job_exe_mgr.get_ready_job_exes()), 2)
        self.assertIsNotNone(self.job_exe_mgr.get_running_job_exe(self.job_exe_1.id))
        self.assertIsNotNone(self.job_exe_mgr.get_running_job_exe(self.job_exe_2.id))

    def test_handle_task_timeout(self):
        """Tests calling handle_task_timeout() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2])

        task = self.job_exe_1.start_next_task()
        self.job_exe_mgr.handle_task_timeout(task, now())

        self.assertEqual(self.job_exe_1.status, 'FAILED')

    def test_handle_task_update(self):
        """Tests calling handle_task_update() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2])

        # Start task
        task_1 = self.job_exe_1.start_next_task()
        task_1_started = now() - timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING,
                                                          task_1_started)

        # Job execution is not finished, so None should be returned
        result = self.job_exe_mgr.handle_task_update(update)
        self.assertIsNone(result)

        # Fail task
        task_1_failed = task_1_started + timedelta(seconds=1)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.FAILED,
                                                          task_1_failed, exit_code=1)

        # Job execution is finished, so it should be returned
        result = self.job_exe_mgr.handle_task_update(update)
        self.assertEqual(self.job_exe_1.id, result.id)

    def test_init_with_database(self):
        """Tests calling init_with_database() successfully"""

        self.job_exe_mgr.init_with_database()

    def test_lost_node(self):
        """Tests calling lost_node() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2])

        task_1 = self.job_exe_1.start_next_task()
        task_1_started = now() - timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING,
                                                          task_1_started)
        self.job_exe_mgr.handle_task_update(update)

        lost_job_exe = self.job_exe_mgr.lost_node(self.node_model_1.id, now())[0]
        self.assertEqual(lost_job_exe.id, self.job_exe_1.id)
        self.assertEqual(lost_job_exe.status, 'FAILED')
        self.assertEqual(lost_job_exe._error.name, 'node-lost')

    def test_sync_with_database(self):
        """Tests calling sync_with_database() successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2])

        task_1 = self.job_exe_1.start_next_task()
        task_1_started = now() - timedelta(minutes=5)
        update = job_test_utils.create_task_status_update(task_1.id, 'agent', TaskStatusUpdate.RUNNING,
                                                          task_1_started)
        self.job_exe_mgr.handle_task_update(update)

        # Cancel job_exe_1 and have manager sync with database
        JobExecution.objects.update_status([self.job_exe_model_1], 'CANCELED', now())
        tasks_to_kill = self.job_exe_mgr.sync_with_database()

        self.assertEqual(self.job_exe_1.status, 'CANCELED')
        self.assertEqual(len(tasks_to_kill), 1)
        self.assertEqual(tasks_to_kill[0].id, task_1.id)