def test_job_type_limit(self):
    """Verifies that perform_scheduling() honors a job type's max_scheduled limit"""

    Queue.objects.all().delete()

    # Job type capped at four scheduled executions, one of which is already running on node 1
    limited_job_type = job_test_utils.create_seed_job_type()
    limited_job_type.max_scheduled = 4
    limited_job_type.save()
    already_running = job_test_utils.create_running_job_exe(agent_id=self.agent_1.agent_id,
                                                            job_type=limited_job_type, node=self.node_1)

    # Queue six jobs of the limited type
    for _index in range(6):
        queue_test_utils.create_queue(job_type=limited_job_type)
    job_type_mgr.sync_with_database()

    # One job of this type is already running
    job_exe_mgr.schedule_job_exes([already_running], [])

    agent_1_offer = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                                  NodeResources([Cpus(0.0), Mem(1024.0), Disk(1024.0)]), now(), None)
    agent_2_offer = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                                  NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
    resource_mgr.add_new_offers([agent_1_offer, agent_2_offer])

    manager = SchedulingManager()
    scheduled_count = manager.perform_scheduling(self._client, now())

    # One is already running, should only be able to schedule 3 more
    self.assertEqual(scheduled_count, 3)
def test_job_type_limit(self, mock_taskinfo):
    """Verifies that the scheduling thread honors a job type's max_scheduled limit"""

    mock_taskinfo.return_value = MagicMock()

    Queue.objects.all().delete()

    # Job type capped at four scheduled executions, with one execution already in RUNNING status
    limited_job_type = job_test_utils.create_job_type()
    limited_job_type.max_scheduled = 4
    limited_job_type.save()
    running_exe = job_test_utils.create_job_exe(job_type=limited_job_type, status='RUNNING')

    # Queue six jobs of the limited type
    for _index in range(6):
        queue_test_utils.create_queue(job_type=limited_job_type)
    job_type_mgr.sync_with_database()

    # One job of this type is already running
    running_job_mgr.add_job_exes([RunningJobExecution(running_exe)])

    agent_1_offer = ResourceOffer('offer_1', self.node_agent_1,
                                  NodeResources(cpus=200.0, mem=102400.0, disk=102400.0))
    agent_2_offer = ResourceOffer('offer_2', self.node_agent_2,
                                  NodeResources(cpus=200.0, mem=204800.0, disk=204800.0))
    offer_mgr.add_new_offers([agent_1_offer, agent_2_offer])

    # Ignore cleanup tasks
    for node in node_mgr.get_nodes():
        node.initial_cleanup_completed()

    scheduled_count = self._scheduling_thread._perform_scheduling()

    # One is already running, should only be able to schedule 3 more
    self.assertEqual(scheduled_count, 3)
def test_paused_job_type(self):
    """Verifies that perform_scheduling() only schedules the queued job whose job type is not paused"""

    agent_1_offer = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                                  NodeResources([Cpus(2.0), Mem(1024.0), Disk(1024.0)]), now(), None)
    agent_2_offer = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                                  NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
    resource_mgr.add_new_offers([agent_1_offer, agent_2_offer])

    # Pause the job type behind queue_1, then re-sync job types from the database
    self.queue_1.job_type.is_paused = True
    self.queue_1.job_type.save()
    job_type_mgr.sync_with_database()

    manager = SchedulingManager()
    scheduled_count = manager.perform_scheduling(self._client, now())

    # Schedule queued job execution that is not paused
    self.assertEqual(scheduled_count, 1)

    # queue_1's job has no execution, queue_2's job has exactly one
    paused_exe_count = JobExecution.objects.filter(job_id=self.queue_1.job_id).count()
    self.assertEqual(paused_exe_count, 0)
    active_exe_count = JobExecution.objects.filter(job_id=self.queue_2.job_id).count()
    self.assertEqual(active_exe_count, 1)

    # Exactly one of the two original queue entries remains
    queue_ids = [self.queue_1.id, self.queue_2.id]
    self.assertEqual(Queue.objects.filter(id__in=queue_ids).count(), 1)
def setUp(self):
    """Initializes the scheduler, registers two agents, and queues two jobs for the scheduling thread tests"""

    django.setup()

    Scheduler.objects.initialize_scheduler()
    self._driver = MagicMock()

    scheduler_mgr.sync_with_database()
    offer_mgr.clear()

    # Two agents, each backed by a SlaveInfo entry reported by the patched get_slaves() call below
    self.node_agent_1 = 'agent_1'
    self.node_agent_2 = 'agent_2'
    agent_ids = [self.node_agent_1, self.node_agent_2]
    self.slave_infos = [
        SlaveInfo('host_1', slave_id=self.node_agent_1),
        SlaveInfo('host_2', slave_id=self.node_agent_2),
    ]
    node_mgr.clear()
    node_mgr.register_agent_ids(agent_ids)
    with patch('scheduler.node.manager.api.get_slaves') as mock_get_slaves:
        mock_get_slaves.return_value = self.slave_infos
        node_mgr.sync_with_database('master_host', 5050)

    # Ignore initial cleanup tasks
    for node in node_mgr.get_nodes():
        node.initial_cleanup_completed()

    self.queue_1 = queue_test_utils.create_queue(
        cpus_required=4.0,
        mem_required=1024.0,
        disk_in_required=100.0,
        disk_out_required=200.0,
        disk_total_required=300.0,
    )
    self.queue_2 = queue_test_utils.create_queue(
        cpus_required=8.0,
        mem_required=512.0,
        disk_in_required=400.0,
        disk_out_required=45.0,
        disk_total_required=445.0,
    )
    job_type_mgr.sync_with_database()

    self._scheduling_thread = SchedulingThread(self._driver, '123')
def _perform_sync(self):
    """Synchronizes the scheduler's managers with the database
    """

    # scheduler_mgr is synced first; the Mesos master address used for the node sync is read from it afterwards
    for manager in (scheduler_mgr, job_type_mgr, workspace_mgr):
        manager.sync_with_database()

    master_address = scheduler_mgr.mesos_address
    node_mgr.sync_with_database(master_address.hostname, master_address.port)

    self._sync_running_job_executions()
def setUp(self):
    """Initializes the scheduler, registers two agents, and queues three jobs for the scheduling manager tests"""

    django.setup()

    reset_error_cache()

    self.framework_id = '1234'
    Scheduler.objects.initialize_scheduler()
    # Prevent message handler tasks from scheduling
    Scheduler.objects.update(num_message_handlers=0)
    self._client = MagicMock()

    scheduler_mgr.sync_with_database()
    scheduler_mgr.update_from_mesos(framework_id=self.framework_id)
    resource_mgr.clear()
    job_exe_mgr.clear()

    # Three agents are created, but only the first two are registered with the node manager
    self.agent_1 = Agent('agent_1', 'host_1')
    self.agent_2 = Agent('agent_2', 'host_2')
    self.agent_3 = Agent('agent_3', 'host_2')
    registered_agents = [self.agent_1, self.agent_2]
    node_mgr.clear()
    node_mgr.register_agents(registered_agents)
    node_mgr.sync_with_database(scheduler_mgr.config)

    # Ignore initial cleanup, health check, and image pull tasks
    for node in node_mgr.get_nodes():
        node._last_health_task = now()
        node._initial_cleanup_completed()
        node._is_image_pulled = True
        node._update_state()
        # Remember the ID of the node for agent_1 so its model can be loaded below
        if node.agent_id == 'agent_1':
            self.node_1_id = node.id
    cleanup_mgr.update_nodes(node_mgr.get_nodes())
    self.node_1 = Node.objects.get(id=self.node_1_id)

    # Ignore system tasks
    system_task_mgr._is_db_update_completed = True

    self.queue_1 = queue_test_utils.create_queue(
        cpus_required=4.0,
        mem_required=1024.0,
        disk_in_required=100.0,
        disk_out_required=200.0,
        disk_total_required=300.0,
    )
    self.queue_2 = queue_test_utils.create_queue(
        cpus_required=8.0,
        mem_required=512.0,
        disk_in_required=400.0,
        disk_out_required=45.0,
        disk_total_required=445.0,
    )
    large_resources = NodeResources([Cpus(125.0), Mem(12048.0), Disk(12048.0)])
    self.queue_large = queue_test_utils.create_queue(resources=large_resources)

    job_type_mgr.sync_with_database()
def registered(self, driver, frameworkId, masterInfo):
    """
    Callback invoked once the scheduler has registered with a Mesos master. Receives the frameworkId (the unique
    ID the master generated for this framework) and masterInfo (information describing the master).

    See documentation for :meth:`mesos_api.mesos.Scheduler.registered`.
    """

    def _start_daemon(worker):
        """Runs worker.run on a new daemon thread"""
        runner = threading.Thread(target=worker.run)
        runner.daemon = True
        runner.start()

    self._driver = driver
    self._framework_id = frameworkId.value
    self._master_hostname = masterInfo.hostname
    self._master_port = masterInfo.port
    logger.info('Scale scheduler registered as framework %s with Mesos master at %s:%i', self._framework_id,
                self._master_hostname, self._master_port)

    initialize_system()
    Scheduler.objects.update_master(self._master_hostname, self._master_port)
    master_address = HostAddress(self._master_hostname, self._master_port)
    scheduler_mgr.update_from_mesos(self._framework_id, master_address)
    recon_mgr.driver = self._driver

    # Initial database sync
    for manager in (job_type_mgr, scheduler_mgr, workspace_mgr):
        manager.sync_with_database()

    # Start up background threads
    self._db_sync_thread = DatabaseSyncThread(self._driver)
    _start_daemon(self._db_sync_thread)

    self._recon_thread = ReconciliationThread()
    _start_daemon(self._recon_thread)

    self._scheduling_thread = SchedulingThread(self._driver, self._framework_id)
    _start_daemon(self._scheduling_thread)

    self._task_thread = TaskUpdateThread()
    _start_daemon(self._task_thread)

    self._reconcile_running_jobs()
def _perform_sync(self):
    """Synchronizes the scheduler's managers with the database and kills tasks of canceled job executions
    """

    # scheduler_mgr is synced first; the Mesos master address used for the node sync is read from it afterwards
    for manager in (scheduler_mgr, job_type_mgr, workspace_mgr):
        manager.sync_with_database()

    master_address = scheduler_mgr.mesos_address
    node_mgr.sync_with_database(master_address.hostname, master_address.port)

    # Kill running tasks for canceled job executions
    for canceled_task in job_exe_mgr.sync_with_database():
        task_id_pb = mesos_pb2.TaskID()
        task_id_pb.value = canceled_task.id
        logger.info('Killing task %s', canceled_task.id)
        self._driver.killTask(task_id_pb)

    # The secrets backend is only synced when a secrets URL is configured
    if settings.SECRETS_URL:
        secrets_mgr.sync_with_backend()
def _execute(self):
    """See :meth:`scheduler.threads.base_thread.BaseSchedulerThread._execute`

    Syncs the scheduler, job type, workspace, and node managers with the database, refreshes the cleanup
    manager's nodes, syncs resources with the Mesos master, forwards the job executions returned by the job
    execution sync to the cleanup manager, and finally syncs secrets when a secrets URL is configured.
    """

    scheduler_mgr.sync_with_database()
    job_type_mgr.sync_with_database()
    workspace_mgr.sync_with_database()

    node_mgr.sync_with_database(scheduler_mgr.config)
    cleanup_mgr.update_nodes(node_mgr.get_nodes())
    mesos_master = scheduler_mgr.mesos_address
    resource_mgr.sync_with_mesos(mesos_master.hostname, mesos_master.port)

    # Handle canceled job executions. The job execution sync is performed exactly once per pass and its result is
    # always consumed here: calling it an extra time and discarding the return value risks dropping executions
    # that then never reach the cleanup manager.
    # NOTE(review): assumes the sync reports each finished execution only once - confirm against job_exe_mgr
    for finished_job_exe in job_exe_mgr.sync_with_database():
        cleanup_mgr.add_job_execution(finished_job_exe)

    # The secrets backend is only synced when a secrets URL is configured
    if settings.SECRETS_URL:
        secrets_mgr.sync_with_backend()
def test_job_type_limit(self, mock_taskinfo):
    """Verifies that the scheduling thread honors a job type's max_scheduled limit"""

    mock_taskinfo.return_value = MagicMock()

    Queue.objects.all().delete()

    # Job type capped at four scheduled executions, with one execution already in RUNNING status
    limited_job_type = job_test_utils.create_job_type()
    limited_job_type.max_scheduled = 4
    limited_job_type.save()
    running_exe = job_test_utils.create_job_exe(job_type=limited_job_type, status='RUNNING')

    # Queue six jobs of the limited type
    for _index in range(6):
        queue_test_utils.create_queue(job_type=limited_job_type)
    job_type_mgr.sync_with_database()

    # One job of this type is already running
    job_exe_mgr.schedule_job_exes([RunningJobExecution(running_exe)])

    agent_1_offer = ResourceOffer('offer_1', self.node_agent_1,
                                  NodeResources(cpus=200.0, mem=102400.0, disk=102400.0))
    agent_2_offer = ResourceOffer('offer_2', self.node_agent_2,
                                  NodeResources(cpus=200.0, mem=204800.0, disk=204800.0))
    offer_mgr.add_new_offers([agent_1_offer, agent_2_offer])

    # Ignore Docker pull tasks
    for node in node_mgr.get_nodes():
        node._is_image_pulled = True

    # Ignore cleanup tasks
    for node in node_mgr.get_nodes():
        node._initial_cleanup_completed()
        node._update_state()

    scheduled_count = self._scheduling_thread._perform_scheduling()

    # One is already running, should only be able to schedule 3 more
    self.assertEqual(scheduled_count, 3)
def _execute(self):
    """See :meth:`scheduler.threads.base_thread.BaseSchedulerThread._execute`
    """

    for manager in (scheduler_mgr, job_type_mgr, workspace_mgr):
        manager.sync_with_database()

    node_mgr.sync_with_database(scheduler_mgr.config)
    cleanup_mgr.update_nodes(node_mgr.get_nodes())

    master_address = scheduler_mgr.mesos_address
    resource_mgr.sync_with_mesos(master_address.hostname, master_address.port)

    # Kill running tasks for canceled job executions
    for canceled_task in job_exe_mgr.sync_with_database():
        task_id_pb = mesos_pb2.TaskID()
        task_id_pb.value = canceled_task.id
        logger.info('Killing task %s', canceled_task.id)
        self._driver.killTask(task_id_pb)

    # The secrets backend is only synced when a secrets URL is configured
    if settings.SECRETS_URL:
        secrets_mgr.sync_with_backend()
def setUp(self):
    """Initializes the scheduler, registers two agents, and queues two jobs for the scheduling thread tests"""

    django.setup()

    Scheduler.objects.initialize_scheduler()
    self._driver = MagicMock()

    scheduler_mgr.sync_with_database()
    offer_mgr.clear()

    # Two agents, each backed by a SlaveInfo entry reported by the patched get_slaves() call below
    self.node_agent_1 = 'agent_1'
    self.node_agent_2 = 'agent_2'
    agent_ids = [self.node_agent_1, self.node_agent_2]
    self.slave_infos = [
        SlaveInfo('host_1', slave_id=self.node_agent_1),
        SlaveInfo('host_2', slave_id=self.node_agent_2),
    ]
    node_mgr.clear()
    node_mgr.register_agent_ids(agent_ids)
    with patch('scheduler.node.manager.api.get_slaves') as mock_get_slaves:
        mock_get_slaves.return_value = self.slave_infos
        node_mgr.sync_with_database('master_host', 5050)

    # Ignore initial cleanup tasks and health check tasks
    for node in node_mgr.get_nodes():
        # NOTE(review): the attribute is spelled "_last_heath_task" (not "_last_health_task") - confirm this
        # matches the attribute name the node class actually reads, otherwise this assignment has no effect
        node._last_heath_task = now()
        node._initial_cleanup_completed()
        node._update_state()

    self.queue_1 = queue_test_utils.create_queue(
        cpus_required=4.0,
        mem_required=1024.0,
        disk_in_required=100.0,
        disk_out_required=200.0,
        disk_total_required=300.0,
    )
    self.queue_2 = queue_test_utils.create_queue(
        cpus_required=8.0,
        mem_required=512.0,
        disk_in_required=400.0,
        disk_out_required=45.0,
        disk_total_required=445.0,
    )
    job_type_mgr.sync_with_database()

    self._scheduling_thread = SchedulingThread(self._driver, '123')
def initialize(self):
    """Prepares the scheduler for connecting to Mesos: performs the initial database sync and starts the
    background threads. This method should only ever be called once.
    """

    def _start_daemon(worker):
        """Runs worker.run on a new daemon thread and records the thread in self._threads"""
        runner = threading.Thread(target=worker.run)
        runner.daemon = True
        runner.start()
        self._threads.append(runner)

    initialize_system()

    # Initial database sync
    logger.info('Performing initial sync with Scale database')
    logger.info('Retrieving errors...')
    reset_error_cache()
    logger.info('Retrieving job execution metrics...')
    job_exe_mgr.init_with_database()
    logger.info('Retrieving job types...')
    job_type_mgr.sync_with_database()
    logger.info('Retrieving workspaces...')
    workspace_mgr.sync_with_database()
    logger.info('Retrieving scheduler settings...')
    scheduler_mgr.sync_with_database()

    # Start up background threads
    self._threads = []
    logger.info('Starting up background threads')

    # The messaging thread is seeded with a restart message before it starts running
    self._messaging_thread = MessagingThread()
    restart_message = RestartScheduler()
    restart_message.when = now()
    self._messaging_thread.add_initial_messages([restart_message])
    _start_daemon(self._messaging_thread)

    self._recon_thread = ReconciliationThread()
    _start_daemon(self._recon_thread)

    self._scheduler_status_thread = SchedulerStatusThread()
    _start_daemon(self._scheduler_status_thread)

    self._scheduling_thread = SchedulingThread(self._client)
    _start_daemon(self._scheduling_thread)

    self._sync_thread = SyncThread(self._driver)
    _start_daemon(self._sync_thread)

    self._task_handling_thread = TaskHandlingThread(self._driver)
    _start_daemon(self._task_handling_thread)

    self._task_update_thread = TaskUpdateThread()
    _start_daemon(self._task_update_thread)
def registered(self, driver, frameworkId, masterInfo):
    """
    Callback invoked once the scheduler has registered with a Mesos master. Receives the frameworkId (the unique
    ID the master generated for this framework) and masterInfo (information describing the master).

    See documentation for :meth:`mesos_api.mesos.Scheduler.registered`.
    """

    def _start_daemon(worker):
        """Runs worker.run on a new daemon thread"""
        runner = threading.Thread(target=worker.run)
        runner.daemon = True
        runner.start()

    self._driver = driver
    self._framework_id = frameworkId.value
    self._master_hostname = masterInfo.hostname
    self._master_port = masterInfo.port
    logger.info('Scale scheduler registered as framework %s with Mesos master at %s:%i', self._framework_id,
                self._master_hostname, self._master_port)

    initialize_system()
    Scheduler.objects.update_master(self._master_hostname, self._master_port)
    master_address = HostAddress(self._master_hostname, self._master_port)
    scheduler_mgr.update_from_mesos(self._framework_id, master_address)
    recon_mgr.driver = self._driver

    # Initial database sync
    logger.info('Performing initial sync with Scale database')
    reset_error_cache()
    job_exe_mgr.init_with_database()
    for manager in (job_type_mgr, scheduler_mgr, workspace_mgr):
        manager.sync_with_database()

    # Start up background threads
    # The messaging thread is seeded with a restart message before it starts running
    self._messaging_thread = MessagingThread()
    restart_message = RestartScheduler()
    restart_message.when = now()
    self._messaging_thread.add_initial_messages([restart_message])
    _start_daemon(self._messaging_thread)

    self._recon_thread = ReconciliationThread()
    _start_daemon(self._recon_thread)

    self._scheduler_status_thread = SchedulerStatusThread()
    _start_daemon(self._scheduler_status_thread)

    self._scheduling_thread = SchedulingThread(self._driver)
    _start_daemon(self._scheduling_thread)

    self._sync_thread = SyncThread(self._driver)
    _start_daemon(self._sync_thread)

    self._task_handling_thread = TaskHandlingThread(self._driver)
    _start_daemon(self._task_handling_thread)

    self._task_update_thread = TaskUpdateThread()
    _start_daemon(self._task_update_thread)

    self._reconcile_running_jobs()