def setUp(self):
    """Builds the full scheduler fixture: Django, scheduler/managers state,
    two registered agents with their node state forced past startup tasks,
    and three queued job executions.

    NOTE: the statement order below matters — managers must be cleared and
    synced before nodes are registered, and node state must be forced before
    the queues are created.
    """
    django.setup()
    reset_error_cache()

    self.framework_id = '1234'
    Scheduler.objects.initialize_scheduler()
    Scheduler.objects.update(num_message_handlers=0)  # Prevent message handler tasks from scheduling
    self._client = MagicMock()

    scheduler_mgr.sync_with_database()
    scheduler_mgr.update_from_mesos(framework_id=self.framework_id)
    resource_mgr.clear()
    job_exe_mgr.clear()

    # agent_3 shares host_2 with agent_2 but is intentionally NOT registered
    self.agent_1 = Agent('agent_1', 'host_1')
    self.agent_2 = Agent('agent_2', 'host_2')
    self.agent_3 = Agent('agent_3', 'host_2')
    node_mgr.clear()
    node_mgr.register_agents([self.agent_1, self.agent_2])
    node_mgr.sync_with_database(scheduler_mgr.config)

    # Ignore initial cleanup, health check, and image pull tasks by forcing
    # each node's internal state to "already done"
    for node in node_mgr.get_nodes():
        node._last_health_task = now()
        node._initial_cleanup_completed()
        node._is_image_pulled = True
        node._update_state()
        if node.agent_id == 'agent_1':
            self.node_1_id = node.id
    cleanup_mgr.update_nodes(node_mgr.get_nodes())
    self.node_1 = Node.objects.get(id=self.node_1_id)

    # Ignore system tasks
    system_task_mgr._is_db_update_completed = True

    # Two queued executions with explicit resource requirements, plus one
    # whose requirements exceed any single agent's offer
    self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0, mem_required=1024.0,
                                                 disk_in_required=100.0, disk_out_required=200.0,
                                                 disk_total_required=300.0)
    self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0, mem_required=512.0,
                                                 disk_in_required=400.0, disk_out_required=45.0,
                                                 disk_total_required=445.0)
    self.queue_large = queue_test_utils.create_queue(
        resources=NodeResources([Cpus(125.0), Mem(12048.0), Disk(12048.0)]))

    job_type_mgr.sync_with_database()
def test_job_type_limit(self, mock_taskinfo):
    """Tests running the scheduling thread with a job type limit"""
    mock_taskinfo.return_value = MagicMock()

    Queue.objects.all().delete()
    limited_type = job_test_utils.create_job_type()
    limited_type.max_scheduled = 4
    limited_type.save()
    running_exe = job_test_utils.create_job_exe(job_type=limited_type, status='RUNNING')
    # Queue six executions of the limited type
    for _ in range(6):
        queue_test_utils.create_queue(job_type=limited_type)
    job_type_mgr.sync_with_database()
    # One job of this type is already running
    running_job_mgr.add_job_exes([RunningJobExecution(running_exe)])
    offers = [
        ResourceOffer('offer_1', self.node_agent_1,
                      NodeResources(cpus=200.0, mem=102400.0, disk=102400.0)),
        ResourceOffer('offer_2', self.node_agent_2,
                      NodeResources(cpus=200.0, mem=204800.0, disk=204800.0)),
    ]
    offer_mgr.add_new_offers(offers)
    # Ignore cleanup tasks
    for node in node_mgr.get_nodes():
        node.initial_cleanup_completed()

    scheduled_count = self._scheduling_thread._perform_scheduling()
    # One is already running, should only be able to schedule 3 more
    self.assertEqual(scheduled_count, 3)
def setUp(self):
    """Builds the scheduling-thread fixture: scheduler state, two registered
    agents synced from a patched Mesos slaves API, two queued executions, and
    the SchedulingThread under test.

    NOTE: statement order matters — managers must be cleared/synced before
    agents are registered, and node state forced before queues are created.
    """
    django.setup()

    Scheduler.objects.initialize_scheduler()
    self._driver = MagicMock()

    scheduler_mgr.sync_with_database()

    offer_mgr.clear()

    self.node_agent_1 = 'agent_1'
    self.node_agent_2 = 'agent_2'
    self.slave_infos = [SlaveInfo('host_1', slave_id=self.node_agent_1),
                        SlaveInfo('host_2', slave_id=self.node_agent_2)]
    node_mgr.clear()
    node_mgr.register_agent_ids([self.node_agent_1, self.node_agent_2])
    # Patch the Mesos API so node sync sees our two fake slaves
    with patch('scheduler.node.manager.api.get_slaves') as mock_get_slaves:
        mock_get_slaves.return_value = self.slave_infos
        node_mgr.sync_with_database('master_host', 5050)
    # Ignore initial cleanup tasks
    for node in node_mgr.get_nodes():
        node.initial_cleanup_completed()

    self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0, mem_required=1024.0,
                                                 disk_in_required=100.0, disk_out_required=200.0,
                                                 disk_total_required=300.0)
    self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0, mem_required=512.0,
                                                 disk_in_required=400.0, disk_out_required=45.0,
                                                 disk_total_required=445.0)

    job_type_mgr.sync_with_database()

    self._scheduling_thread = SchedulingThread(self._driver, '123')
def _perform_scheduling(self):
    """Performs a scheduling pass, scheduling any Mesos tasks that are ready

    :returns: The number of Mesos tasks that were scheduled
    :rtype: int
    """
    when = now()

    # Get updated node and job type models from managers
    nodes = node_mgr.get_nodes()
    cleanup_mgr.update_nodes(nodes)
    offer_mgr.update_nodes(nodes)
    offer_mgr.ready_new_offers()
    self._job_types = job_type_mgr.get_job_types()

    # Look at job type limits and determine number available to be scheduled;
    # each running execution of a limited type consumes one slot
    self._job_type_limit_available = {}
    for job_type in self._job_types.values():
        if job_type.max_scheduled:
            self._job_type_limit_available[job_type.id] = job_type.max_scheduled
    for running_job_exe in job_exe_mgr.get_running_job_exes():
        if running_job_exe.job_type_id in self._job_type_limit_available:
            self._job_type_limit_available[running_job_exe.job_type_id] -= 1

    self._consider_node_tasks(when)
    self._consider_running_job_exes()
    self._consider_new_job_exes()

    return self._schedule_accepted_tasks()
def _perform_scheduling(self): """Performs task reconciliation with the Mesos master :returns: The number of Mesos tasks that were scheduled :rtype: int """ # Get updated node and job type models from managers nodes = node_mgr.get_nodes() cleanup_mgr.update_nodes(nodes) offer_mgr.update_nodes(nodes) offer_mgr.ready_new_offers() self._job_types = job_type_mgr.get_job_types() # Look at job type limits and determine number available to be scheduled self._job_type_limit_available = {} for job_type in self._job_types.values(): if job_type.max_scheduled: self._job_type_limit_available[job_type.id] = job_type.max_scheduled for running_job_exe in running_job_mgr.get_all_job_exes(): if running_job_exe.job_type_id in self._job_type_limit_available: self._job_type_limit_available[running_job_exe.job_type_id] -= 1 self._send_tasks_for_reconciliation() self._consider_cleanup_tasks() self._consider_running_job_exes() self._consider_new_job_exes() return self._schedule_accepted_tasks()
def _prepare_nodes(self, tasks, running_job_exes, when):
    """Prepares the nodes to use for scheduling

    :param tasks: The currently running tasks
    :type tasks: list
    :param running_job_exes: The currently running job executions
    :type running_job_exes: list
    :param when: The current time
    :type when: :class:`datetime.datetime`
    :returns: The dict of scheduling nodes stored by node ID
    :rtype: dict
    """
    nodes = node_mgr.get_nodes()

    # Group tasks by agent ID
    tasks_by_agent_id = {}  # {Agent ID: List of tasks}
    for task in tasks:
        tasks_by_agent_id.setdefault(task.agent_id, []).append(task)

    # Group job executions by node ID
    exes_by_node_id = {}  # {Node ID: List of running job exes}
    for job_exe in running_job_exes:
        exes_by_node_id.setdefault(job_exe.node_id, []).append(job_exe)

    agent_resources = resource_mgr.refresh_agent_resources(tasks, when)

    prepared_nodes = {}  # {Node ID: SchedulingNode}
    for node in nodes:
        # Grab agent ID once since it could change while we are scheduling
        agent_id = node.agent_id
        node_tasks = tasks_by_agent_id.get(agent_id, [])
        node_exes = exes_by_node_id.get(node.id, [])
        resource_set = agent_resources[agent_id] if agent_id in agent_resources else ResourceSet()
        sched_node = SchedulingNode(agent_id, node, node_tasks, node_exes, resource_set)
        prepared_nodes[sched_node.node_id] = sched_node
    return prepared_nodes
def test_job_type_limit(self, mock_taskinfo):
    """Tests running the scheduling thread with a job type limit"""
    mock_taskinfo.return_value = MagicMock()

    Queue.objects.all().delete()
    limited_type = job_test_utils.create_job_type()
    limited_type.max_scheduled = 4
    limited_type.save()
    running_exe = job_test_utils.create_job_exe(job_type=limited_type, status='RUNNING')
    # Queue six executions of the limited type
    for _ in range(6):
        queue_test_utils.create_queue(job_type=limited_type)
    job_type_mgr.sync_with_database()
    # One job of this type is already running
    job_exe_mgr.schedule_job_exes([RunningJobExecution(running_exe)])
    offers = [
        ResourceOffer('offer_1', self.node_agent_1,
                      NodeResources(cpus=200.0, mem=102400.0, disk=102400.0)),
        ResourceOffer('offer_2', self.node_agent_2,
                      NodeResources(cpus=200.0, mem=204800.0, disk=204800.0)),
    ]
    offer_mgr.add_new_offers(offers)
    # Ignore Docker pull tasks
    for node in node_mgr.get_nodes():
        node._is_image_pulled = True
    # Ignore cleanup tasks
    for node in node_mgr.get_nodes():
        node._initial_cleanup_completed()
        node._update_state()

    scheduled_count = self._scheduling_thread._perform_scheduling()
    # One is already running, should only be able to schedule 3 more
    self.assertEqual(scheduled_count, 3)
def _execute(self):
    """See :meth:`scheduler.threads.base_thread.BaseSchedulerThread._execute`
    """
    scheduler_mgr.sync_with_database()
    job_type_mgr.sync_with_database()
    workspace_mgr.sync_with_database()

    node_mgr.sync_with_database(scheduler_mgr.config)
    cleanup_mgr.update_nodes(node_mgr.get_nodes())

    mesos_master = scheduler_mgr.mesos_address
    resource_mgr.sync_with_mesos(mesos_master.hostname, mesos_master.port)

    # Handle canceled job executions.
    # NOTE: job_exe_mgr.sync_with_database() must be called exactly once per
    # pass — the original code also called it (and discarded its result)
    # alongside the other manager syncs above, so the finished executions
    # returned by that first call never had cleanup scheduled and were not
    # reported again by this second call.
    for finished_job_exe in job_exe_mgr.sync_with_database():
        cleanup_mgr.add_job_execution(finished_job_exe)

    if settings.SECRETS_URL:
        secrets_mgr.sync_with_backend()
def _execute(self):
    """See :meth:`scheduler.threads.base_thread.BaseSchedulerThread._execute`
    """
    scheduler_mgr.sync_with_database()
    job_type_mgr.sync_with_database()
    workspace_mgr.sync_with_database()

    node_mgr.sync_with_database(scheduler_mgr.config)
    cleanup_mgr.update_nodes(node_mgr.get_nodes())

    master = scheduler_mgr.mesos_address
    resource_mgr.sync_with_mesos(master.hostname, master.port)

    # Kill running tasks for canceled job executions
    for canceled_task in job_exe_mgr.sync_with_database():
        task_id_pb = mesos_pb2.TaskID()
        task_id_pb.value = canceled_task.id
        logger.info('Killing task %s', canceled_task.id)
        self._driver.killTask(task_id_pb)

    if settings.SECRETS_URL:
        secrets_mgr.sync_with_backend()
def setUp(self):
    """Builds the scheduling-thread fixture: scheduler state, two registered
    agents synced from a patched Mesos slaves API, two queued executions, and
    the SchedulingThread under test.

    NOTE: statement order matters — managers must be cleared/synced before
    agents are registered, and node state forced before queues are created.
    """
    django.setup()

    Scheduler.objects.initialize_scheduler()
    self._driver = MagicMock()

    scheduler_mgr.sync_with_database()

    offer_mgr.clear()

    self.node_agent_1 = 'agent_1'
    self.node_agent_2 = 'agent_2'
    self.slave_infos = [SlaveInfo('host_1', slave_id=self.node_agent_1),
                        SlaveInfo('host_2', slave_id=self.node_agent_2)]
    node_mgr.clear()
    node_mgr.register_agent_ids([self.node_agent_1, self.node_agent_2])
    # Patch the Mesos API so node sync sees our two fake slaves
    with patch('scheduler.node.manager.api.get_slaves') as mock_get_slaves:
        mock_get_slaves.return_value = self.slave_infos
        node_mgr.sync_with_database('master_host', 5050)
    # Ignore initial cleanup tasks and health check tasks
    for node in node_mgr.get_nodes():
        # Fixed typo: was "node._last_heath_task", which set a useless
        # attribute and left health check tasks unsuppressed
        node._last_health_task = now()
        node._initial_cleanup_completed()
        node._update_state()

    self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0, mem_required=1024.0,
                                                 disk_in_required=100.0, disk_out_required=200.0,
                                                 disk_total_required=300.0)
    self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0, mem_required=512.0,
                                                 disk_in_required=400.0, disk_out_required=45.0,
                                                 disk_total_required=445.0)

    job_type_mgr.sync_with_database()

    self._scheduling_thread = SchedulingThread(self._driver, '123')
def test_generate_nodes_status(self):
    """Tests the _generate_nodes_status method"""
    # Setup nodes
    from scheduler.node.manager import node_mgr
    node_mgr.clear()

    # No agents registered -> status reports an error
    status = dependency_mgr._generate_nodes_status()
    self.assertDictEqual(status, {'OK': False,
                                  'detail': {'msg': 'No nodes reported'},
                                  'errors': [{'NODES_OFFLINE': 'No nodes reported.'}],
                                  'warnings': []})

    all_agents = [self.agent_1, self.agent_2, self.agent_3, self.agent_4, self.agent_5,
                  self.agent_6, self.agent_7, self.agent_8, self.agent_9, self.agent_10]
    node_mgr.register_agents(all_agents)
    node_mgr.sync_with_database(scheduler_mgr.config)
    self.assertEqual(len(node_mgr.get_nodes()), 10)

    # All ten nodes online -> healthy status
    status = dependency_mgr._generate_nodes_status()
    self.assertDictEqual(status, {'OK': True,
                                  'detail': {'msg': 'Enough nodes are online to function.'},
                                  'errors': [],
                                  'warnings': []})

    # Lose four of the ten nodes -> over a third errored
    for agent in (self.agent_1, self.agent_2, self.agent_3, self.agent_4):
        node_mgr.lost_node(agent.agent_id)
    status = dependency_mgr._generate_nodes_status()
    self.assertDictEqual(status, {'OK': False,
                                  'detail': {u'msg': u'Over a third of nodes are in an error state'},
                                  'errors': [{'NODES_ERRORED': 'Over a third of the nodes are offline or degraded.'}],
                                  'warnings': [{u'NODES_OFFLINE': u'4 nodes are offline'}]})