def test_get_tasks_to_kill(self):
    """Tests calling get_tasks_to_kill() successfully

    Launches five message handler tasks, lowers the desired count and checks
    that the surplus is reported for killing, then raises the desired count
    and checks that nothing further is reported.
    """

    # Bring up 5 message handler tasks
    scheduler_mgr.config.num_message_handlers = 5
    service = MessagingService()
    launched = service.get_tasks_to_schedule()
    task_mgr.launch_tasks(launched, now())

    # With only 3 desired, the 2 surplus tasks should be offered up for killing
    scheduler_mgr.config.num_message_handlers = 3
    surplus = service.get_tasks_to_kill()
    self.assertEqual(len(surplus), 2)

    # Report each surplus task as KILLED to both the task manager and the service
    for doomed in surplus:
        killed_update = job_test_utils.create_task_status_update(
            doomed.id, doomed.agent_id, TaskStatusUpdate.KILLED, now())
        task_mgr.handle_task_update(killed_update)
        service.handle_task_update(killed_update)
    self.assertEqual(service.get_actual_task_count(), 3)

    # With 10 desired there is a shortfall rather than a surplus, so nothing to kill
    scheduler_mgr.config.num_message_handlers = 10
    surplus = service.get_tasks_to_kill()
    self.assertEqual(len(surplus), 0)
def test_get_tasks_to_schedule(self):
    """Tests calling get_tasks_to_schedule() successfully

    Checks that the service asks for as many tasks as are desired, and that
    it asks for none once more tasks are running than are desired.
    """

    # Ask for 5 message handlers
    scheduler_mgr.config.num_message_handlers = 5
    service = MessagingService()

    # Nothing is running yet, so all 5 should be requested
    to_schedule = service.get_tasks_to_schedule()
    self.assertEqual(len(to_schedule), 5)

    # Once launched, the service should report 5 actual tasks
    task_mgr.launch_tasks(to_schedule, now())
    self.assertEqual(service.get_actual_task_count(), 5)

    # Dropping the desired count to 3 must not produce any new tasks to schedule
    scheduler_mgr.config.num_message_handlers = 3
    to_schedule = service.get_tasks_to_schedule()
    self.assertEqual(len(to_schedule), 0)
def test_handle_task_update(self):
    """Tests calling handle_task_update() successfully

    Fails one of five launched tasks and checks that the actual task count
    drops by one and that exactly one replacement task is requested.
    """

    # Bring up 5 message handler tasks
    scheduler_mgr.config.num_message_handlers = 5
    service = MessagingService()
    launched = service.get_tasks_to_schedule()
    task_mgr.launch_tasks(launched, now())

    # Report the first launched task as FAILED
    failed_task = launched[0]
    failed_update = job_test_utils.create_task_status_update(
        failed_task.id, failed_task.agent_id, TaskStatusUpdate.FAILED, now())
    task_mgr.handle_task_update(failed_update)
    service.handle_task_update(failed_update)
    self.assertEqual(service.get_actual_task_count(), 4)

    # The service should now want exactly one replacement task
    replacements = service.get_tasks_to_schedule()
    self.assertEqual(len(replacements), 1)
def _launch_tasks(self, client, nodes):
    """Launches all of the tasks that have been scheduled on the given nodes

    The tasks are first registered with the task manager, then each node's
    allocated offers and tasks are handed to the Mesos client together. A
    failure to launch on one node is logged and does not stop the other nodes.

    :param client: The Mesos scheduler client
    :type client: :class:`mesoshttp.client.MesosClient`
    :param nodes: The dict of all scheduling nodes stored by node ID
    :type nodes: dict
    :returns: The number of tasks that were launched and the number of offers accepted
    :rtype: tuple
    """

    started = now()

    # Start the job execution tasks on every node and register them with the task manager
    tasks_for_manager = []
    for node in nodes.values():
        node.start_job_exe_tasks()
        tasks_for_manager.extend(node.allocated_tasks)
    task_mgr.launch_tasks(tasks_for_manager, started)

    # Running totals used for the summary log message and the return value
    nodes_with_tasks = 0
    nodes_with_offers = 0
    accepted_offer_count = 0
    launched_task_count = 0
    offered_resources = NodeResources()
    used_resources = NodeResources()

    # Hand each node's offers and tasks to Mesos
    for node in nodes.values():
        mesos_offers = []
        for offer in node.allocated_offers:
            accepted_offer_count += 1
            offered_resources.add(offer.resources)
            mesos_offers.append(offer.mesos_offer)

        node_tasks = node.allocated_tasks
        mesos_tasks = []
        for task in node_tasks:
            used_resources.add(task.get_resources())
            mesos_tasks.append(create_mesos_task(task))

        node_task_count = len(node_tasks)
        launched_task_count += node_task_count
        if node_task_count:
            nodes_with_tasks += 1

        # Nodes without any allocated offers have nothing to send to Mesos
        if not mesos_offers:
            continue
        nodes_with_offers += 1
        try:
            client.combine_offers(mesos_offers, mesos_tasks)
        except Exception:
            logger.exception('Error occurred while launching tasks on node %s', node.hostname)

    # Report how long launching took, escalating to a warning when over the threshold
    duration = now() - started
    msg = 'Launching tasks took %.3f seconds'
    log_duration = logger.warning if duration > LAUNCH_TASK_WARN_THRESHOLD else logger.debug
    log_duration(msg, duration.total_seconds())

    # Declined resources are whatever was offered minus what the tasks consume
    declined_resources = NodeResources()
    declined_resources.add(offered_resources)
    declined_resources.subtract(used_resources)
    if total_offers_accepted := accepted_offer_count:
        logger.info(
            'Accepted %d offer(s) from %d node(s), launched %d task(s) with %s on %d node(s), declined %s',
            total_offers_accepted, nodes_with_offers, launched_task_count, used_resources, nodes_with_tasks,
            declined_resources)
    return launched_task_count, accepted_offer_count
def _schedule_accepted_tasks(self):
    """Schedules all of the tasks that have been accepted

    Pops the node offers that have accepted job executions from the offer manager, collects the tasks to run on each
    node (accepted tasks, next tasks of running job executions, and first tasks of newly scheduled job executions),
    registers them with the task manager and launches them through the Mesos driver, one launch call per node.

    :returns: The number of Mesos tasks that were scheduled
    :rtype: int
    """

    # A single timestamp is passed to the task manager for every task collected below
    when = now()
    tasks = []
    tasks_to_launch = {}  # {Node ID: [Mesos Tasks]}
    queued_job_exes_to_schedule = []
    node_offers_list = offer_mgr.pop_offers_with_accepted_job_exes()
    for node_offers in node_offers_list:
        # Every popped node gets an entry (possibly an empty list) so the lookups further down cannot miss
        mesos_tasks = []
        tasks_to_launch[node_offers.node.id] = mesos_tasks
        # Add cleanup tasks
        for task in node_offers.get_accepted_tasks():
            tasks.append(task)
            mesos_tasks.append(create_mesos_task(task))
        # Start next task for already running job executions that were accepted
        for running_job_exe in node_offers.get_accepted_running_job_exes():
            task = running_job_exe.start_next_task()
            # start_next_task() may return a falsy value, in which case there is nothing to launch
            if task:
                tasks.append(task)
                mesos_tasks.append(create_mesos_task(task))
        # Gather up queued job executions that were accepted
        for queued_job_exe in node_offers.get_accepted_new_job_exes():
            queued_job_exes_to_schedule.append(queued_job_exe)

    try:
        # Schedule queued job executions and start their first tasks
        workspaces = workspace_mgr.get_workspaces()
        scheduled_job_exes = self._schedule_queued_job_executions(queued_job_exes_to_schedule, workspaces)
        job_exe_mgr.schedule_job_exes(scheduled_job_exes)
        for scheduled_job_exe in scheduled_job_exes:
            task = scheduled_job_exe.start_next_task()
            if task:
                tasks.append(task)
                # NOTE(review): assumes scheduled_job_exe.node_id is one of the node IDs registered in
                # tasks_to_launch above; otherwise this raises KeyError, which is not caught here - confirm
                tasks_to_launch[scheduled_job_exe.node_id].append(create_mesos_task(task))
    except OperationalError:
        # The error is logged and swallowed so that the tasks already collected are still launched below
        # NOTE(review): presumably this is the database OperationalError - confirm against the file's imports
        logger.exception('Failed to schedule queued job executions')

    # Launch tasks on Mesos
    task_mgr.launch_tasks(tasks, when)
    total_num_tasks = 0
    total_num_nodes = 0
    for node_offers in node_offers_list:
        task_list = tasks_to_launch[node_offers.node.id]
        num_tasks = len(task_list)
        total_num_tasks += num_tasks
        # Only nodes that actually received tasks count towards the logged node total
        if num_tasks:
            total_num_nodes += 1
        # Wrap each raw offer ID in the protobuf OfferID message passed to the driver
        mesos_offer_ids = []
        for offer_id in node_offers.offer_ids:
            mesos_offer_id = mesos_pb2.OfferID()
            mesos_offer_id.value = offer_id
            mesos_offer_ids.append(mesos_offer_id)
        # Called for every node, even when task_list is empty
        # NOTE(review): presumably an empty launch hands the unused offers back to Mesos - confirm
        self._driver.launchTasks(mesos_offer_ids, task_list)
    if total_num_tasks:
        logger.info('Launched %i Mesos task(s) on %i node(s)', total_num_tasks, total_num_nodes)
    return total_num_tasks