Example #1
0
    def test_job_type_limit(self):
        """Tests calling perform_scheduling() with a job type limit"""
        Queue.objects.all().delete()
        job_type_with_limit = job_test_utils.create_seed_job_type()
        job_type_with_limit.max_scheduled = 4
        job_type_with_limit.save()
        running_job_exe_1 = job_test_utils.create_running_job_exe(agent_id=self.agent_1.agent_id,
                                                                  job_type=job_type_with_limit, node=self.node_1)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        job_type_mgr.sync_with_database()
        # One job of this type is already running
        job_exe_mgr.schedule_job_exes([running_job_exe_1], [])

        offer_1 = ResourceOffer('offer_1', self.agent_1.agent_id, self.framework_id,
                                NodeResources([Cpus(0.0), Mem(1024.0), Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer('offer_2', self.agent_2.agent_id, self.framework_id,
                                NodeResources([Cpus(25.0), Mem(2048.0), Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())
        self.assertEqual(num_tasks, 3)  # One is already running, should only be able to schedule 3 more
Example #2
0
    def test_job_type_limit(self, mock_taskinfo):
        """Tests running the scheduling thread with a job type limit"""
        mock_taskinfo.return_value = MagicMock()

        Queue.objects.all().delete()
        job_type_with_limit = job_test_utils.create_job_type()
        job_type_with_limit.max_scheduled = 4
        job_type_with_limit.save()
        job_exe_1 = job_test_utils.create_job_exe(job_type=job_type_with_limit, status='RUNNING')
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        job_type_mgr.sync_with_database()
        # One job of this type is already running
        running_job_mgr.add_job_exes([RunningJobExecution(job_exe_1)])

        offer_1 = ResourceOffer('offer_1', self.node_agent_1, NodeResources(cpus=200.0, mem=102400.0, disk=102400.0))
        offer_2 = ResourceOffer('offer_2', self.node_agent_2, NodeResources(cpus=200.0, mem=204800.0, disk=204800.0))
        offer_mgr.add_new_offers([offer_1, offer_2])

        # Ignore cleanup tasks
        for node in node_mgr.get_nodes():
            node.initial_cleanup_completed()

        num_tasks = self._scheduling_thread._perform_scheduling()
        self.assertEqual(num_tasks, 3)  # One is already running, should only be able to schedule 3 more
Example #3
0
    def test_paused_job_type(self):
        """Tests calling perform_scheduling() when a job type is paused"""
        offer_1 = ResourceOffer(
            'offer_1', self.agent_1.agent_id, self.framework_id,
            NodeResources([Cpus(2.0), Mem(1024.0),
                           Disk(1024.0)]), now(), None)
        offer_2 = ResourceOffer(
            'offer_2', self.agent_2.agent_id, self.framework_id,
            NodeResources([Cpus(25.0), Mem(2048.0),
                           Disk(2048.0)]), now(), None)
        resource_mgr.add_new_offers([offer_1, offer_2])
        self.queue_1.job_type.is_paused = True
        self.queue_1.job_type.save()
        job_type_mgr.sync_with_database()

        scheduling_manager = SchedulingManager()
        num_tasks = scheduling_manager.perform_scheduling(self._client, now())

        self.assertEqual(num_tasks,
                         1)  # Schedule queued job execution that is not paused
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_1.job_id).count(), 0)
        self.assertEqual(
            JobExecution.objects.filter(job_id=self.queue_2.job_id).count(), 1)
        self.assertEqual(
            Queue.objects.filter(
                id__in=[self.queue_1.id, self.queue_2.id]).count(), 1)
Example #4
0
    def setUp(self):
        django.setup()

        Scheduler.objects.initialize_scheduler()
        self._driver = MagicMock()

        scheduler_mgr.sync_with_database()
        offer_mgr.clear()

        self.node_agent_1 = 'agent_1'
        self.node_agent_2 = 'agent_2'
        self.slave_infos = [SlaveInfo('host_1', slave_id=self.node_agent_1),
                            SlaveInfo('host_2', slave_id=self.node_agent_2)]
        node_mgr.clear()
        node_mgr.register_agent_ids([self.node_agent_1, self.node_agent_2])
        with patch('scheduler.node.manager.api.get_slaves') as mock_get_slaves:
            mock_get_slaves.return_value = self.slave_infos
            node_mgr.sync_with_database('master_host', 5050)
        # Ignore initial cleanup tasks
        for node in node_mgr.get_nodes():
            node.initial_cleanup_completed()

        self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0, mem_required=1024.0, disk_in_required=100.0,
                                                     disk_out_required=200.0, disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0, mem_required=512.0, disk_in_required=400.0,
                                                     disk_out_required=45.0, disk_total_required=445.0)
        job_type_mgr.sync_with_database()

        self._scheduling_thread = SchedulingThread(self._driver, '123')
Example #5
0
    def _perform_sync(self):
        """Performs the sync with the database
        """

        scheduler_mgr.sync_with_database()
        job_type_mgr.sync_with_database()
        workspace_mgr.sync_with_database()

        mesos_master = scheduler_mgr.mesos_address
        node_mgr.sync_with_database(mesos_master.hostname, mesos_master.port)

        self._sync_running_job_executions()
Example #6
0
    def setUp(self):
        django.setup()

        reset_error_cache()

        self.framework_id = '1234'
        Scheduler.objects.initialize_scheduler()
        Scheduler.objects.update(
            num_message_handlers=0
        )  # Prevent message handler tasks from scheduling
        self._client = MagicMock()

        scheduler_mgr.sync_with_database()
        scheduler_mgr.update_from_mesos(framework_id=self.framework_id)
        resource_mgr.clear()
        job_exe_mgr.clear()

        self.agent_1 = Agent('agent_1', 'host_1')
        self.agent_2 = Agent('agent_2', 'host_2')
        self.agent_3 = Agent('agent_3', 'host_2')
        node_mgr.clear()
        node_mgr.register_agents([self.agent_1, self.agent_2])
        node_mgr.sync_with_database(scheduler_mgr.config)
        # Ignore initial cleanup, health check, and image pull tasks
        for node in node_mgr.get_nodes():
            node._last_health_task = now()
            node._initial_cleanup_completed()
            node._is_image_pulled = True
            node._update_state()
            if node.agent_id == 'agent_1':
                self.node_1_id = node.id
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        self.node_1 = Node.objects.get(id=self.node_1_id)
        # Ignore system tasks
        system_task_mgr._is_db_update_completed = True

        self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0,
                                                     mem_required=1024.0,
                                                     disk_in_required=100.0,
                                                     disk_out_required=200.0,
                                                     disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0,
                                                     mem_required=512.0,
                                                     disk_in_required=400.0,
                                                     disk_out_required=45.0,
                                                     disk_total_required=445.0)
        self.queue_large = queue_test_utils.create_queue(
            resources=NodeResources([Cpus(
                125.0), Mem(12048.0), Disk(12048.0)]))

        job_type_mgr.sync_with_database()
Example #7
0
    def registered(self, driver, frameworkId, masterInfo):
        """
        Invoked when the scheduler successfully registers with a Mesos master.
        It is called with the frameworkId, a unique ID generated by the
        master, and the masterInfo which is information about the master
        itself.

        See documentation for :meth:`mesos_api.mesos.Scheduler.registered`.
        """

        self._driver = driver
        self._framework_id = frameworkId.value
        self._master_hostname = masterInfo.hostname
        self._master_port = masterInfo.port
        logger.info('Scale scheduler registered as framework %s with Mesos master at %s:%i',
                    self._framework_id, self._master_hostname, self._master_port)

        initialize_system()
        Scheduler.objects.update_master(self._master_hostname, self._master_port)
        scheduler_mgr.update_from_mesos(self._framework_id, HostAddress(self._master_hostname, self._master_port))
        recon_mgr.driver = self._driver

        # Initial database sync
        job_type_mgr.sync_with_database()
        scheduler_mgr.sync_with_database()
        workspace_mgr.sync_with_database()

        # Start up background threads
        self._db_sync_thread = DatabaseSyncThread(self._driver)
        db_sync_thread = threading.Thread(target=self._db_sync_thread.run)
        db_sync_thread.daemon = True
        db_sync_thread.start()

        self._recon_thread = ReconciliationThread()
        recon_thread = threading.Thread(target=self._recon_thread.run)
        recon_thread.daemon = True
        recon_thread.start()

        self._scheduling_thread = SchedulingThread(self._driver, self._framework_id)
        scheduling_thread = threading.Thread(target=self._scheduling_thread.run)
        scheduling_thread.daemon = True
        scheduling_thread.start()

        self._task_thread = TaskUpdateThread()
        task_thread = threading.Thread(target=self._task_thread.run)
        task_thread.daemon = True
        task_thread.start()

        self._reconcile_running_jobs()
Example #8
0
    def _perform_sync(self):
        """Performs the sync with the database
        """

        scheduler_mgr.sync_with_database()
        job_type_mgr.sync_with_database()
        workspace_mgr.sync_with_database()

        mesos_master = scheduler_mgr.mesos_address
        node_mgr.sync_with_database(mesos_master.hostname, mesos_master.port)

        # Kill running tasks for canceled job executions
        for task_to_kill in job_exe_mgr.sync_with_database():
            pb_task_to_kill = mesos_pb2.TaskID()
            pb_task_to_kill.value = task_to_kill.id
            logger.info('Killing task %s', task_to_kill.id)
            self._driver.killTask(pb_task_to_kill)
        
        if settings.SECRETS_URL:
            secrets_mgr.sync_with_backend()
Example #9
0
File: sync.py Project: Fizz11/scale
    def _execute(self):
        """See :meth:`scheduler.threads.base_thread.BaseSchedulerThread._execute`
        """

        scheduler_mgr.sync_with_database()
        job_type_mgr.sync_with_database()
        job_exe_mgr.sync_with_database()
        workspace_mgr.sync_with_database()

        node_mgr.sync_with_database(scheduler_mgr.config)
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        mesos_master = scheduler_mgr.mesos_address
        resource_mgr.sync_with_mesos(mesos_master.hostname, mesos_master.port)

        # Handle canceled job executions
        for finished_job_exe in job_exe_mgr.sync_with_database():
            cleanup_mgr.add_job_execution(finished_job_exe)

        if settings.SECRETS_URL:
            secrets_mgr.sync_with_backend()
Example #10
0
    def test_job_type_limit(self, mock_taskinfo):
        """Tests running the scheduling thread with a job type limit"""
        mock_taskinfo.return_value = MagicMock()

        Queue.objects.all().delete()
        job_type_with_limit = job_test_utils.create_job_type()
        job_type_with_limit.max_scheduled = 4
        job_type_with_limit.save()
        job_exe_1 = job_test_utils.create_job_exe(job_type=job_type_with_limit,
                                                  status='RUNNING')
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        queue_test_utils.create_queue(job_type=job_type_with_limit)
        job_type_mgr.sync_with_database()
        # One job of this type is already running
        job_exe_mgr.schedule_job_exes([RunningJobExecution(job_exe_1)])

        offer_1 = ResourceOffer(
            'offer_1', self.node_agent_1,
            NodeResources(cpus=200.0, mem=102400.0, disk=102400.0))
        offer_2 = ResourceOffer(
            'offer_2', self.node_agent_2,
            NodeResources(cpus=200.0, mem=204800.0, disk=204800.0))
        offer_mgr.add_new_offers([offer_1, offer_2])

        # Ignore Docker pull tasks
        for node in node_mgr.get_nodes():
            node._is_image_pulled = True

        # Ignore cleanup tasks
        for node in node_mgr.get_nodes():
            node._initial_cleanup_completed()
            node._update_state()

        num_tasks = self._scheduling_thread._perform_scheduling()
        self.assertEqual(
            num_tasks, 3
        )  # One is already running, should only be able to schedule 3 more
Example #11
0
    def _execute(self):
        """See :meth:`scheduler.threads.base_thread.BaseSchedulerThread._execute`
        """

        scheduler_mgr.sync_with_database()
        job_type_mgr.sync_with_database()
        workspace_mgr.sync_with_database()

        node_mgr.sync_with_database(scheduler_mgr.config)
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        mesos_master = scheduler_mgr.mesos_address
        resource_mgr.sync_with_mesos(mesos_master.hostname, mesos_master.port)

        # Kill running tasks for canceled job executions
        for task_to_kill in job_exe_mgr.sync_with_database():
            pb_task_to_kill = mesos_pb2.TaskID()
            pb_task_to_kill.value = task_to_kill.id
            logger.info('Killing task %s', task_to_kill.id)
            self._driver.killTask(pb_task_to_kill)

        if settings.SECRETS_URL:
            secrets_mgr.sync_with_backend()
Example #12
0
    def setUp(self):
        django.setup()

        Scheduler.objects.initialize_scheduler()
        self._driver = MagicMock()

        scheduler_mgr.sync_with_database()
        offer_mgr.clear()

        self.node_agent_1 = 'agent_1'
        self.node_agent_2 = 'agent_2'
        self.slave_infos = [
            SlaveInfo('host_1', slave_id=self.node_agent_1),
            SlaveInfo('host_2', slave_id=self.node_agent_2)
        ]
        node_mgr.clear()
        node_mgr.register_agent_ids([self.node_agent_1, self.node_agent_2])
        with patch('scheduler.node.manager.api.get_slaves') as mock_get_slaves:
            mock_get_slaves.return_value = self.slave_infos
            node_mgr.sync_with_database('master_host', 5050)
        # Ignore initial cleanup tasks and health check tasks
        for node in node_mgr.get_nodes():
            node._last_heath_task = now()
            node._initial_cleanup_completed()
            node._update_state()

        self.queue_1 = queue_test_utils.create_queue(cpus_required=4.0,
                                                     mem_required=1024.0,
                                                     disk_in_required=100.0,
                                                     disk_out_required=200.0,
                                                     disk_total_required=300.0)
        self.queue_2 = queue_test_utils.create_queue(cpus_required=8.0,
                                                     mem_required=512.0,
                                                     disk_in_required=400.0,
                                                     disk_out_required=45.0,
                                                     disk_total_required=445.0)
        job_type_mgr.sync_with_database()

        self._scheduling_thread = SchedulingThread(self._driver, '123')
Example #13
0
    def initialize(self):
        """Initializes the scheduler and gets it ready to connect to Mesos. This method should only ever be called once.
        """

        initialize_system()

        # Initial database sync
        logger.info('Performing initial sync with Scale database')
        logger.info('Retrieving errors...')
        reset_error_cache()
        logger.info('Retrieving job execution metrics...')
        job_exe_mgr.init_with_database()
        logger.info('Retrieving job types...')
        job_type_mgr.sync_with_database()
        logger.info('Retrieving workspaces...')
        workspace_mgr.sync_with_database()
        logger.info('Retrieving scheduler settings...')
        scheduler_mgr.sync_with_database()

        # Start up background threads
        self._threads = []

        logger.info('Starting up background threads')
        self._messaging_thread = MessagingThread()
        restart_msg = RestartScheduler()
        restart_msg.when = now()
        self._messaging_thread.add_initial_messages([restart_msg])
        messaging_thread = threading.Thread(target=self._messaging_thread.run)
        messaging_thread.daemon = True
        messaging_thread.start()
        self._threads.append(messaging_thread)

        self._recon_thread = ReconciliationThread()
        recon_thread = threading.Thread(target=self._recon_thread.run)
        recon_thread.daemon = True
        recon_thread.start()
        self._threads.append(recon_thread)

        self._scheduler_status_thread = SchedulerStatusThread()
        scheduler_status_thread = threading.Thread(target=self._scheduler_status_thread.run)
        scheduler_status_thread.daemon = True
        scheduler_status_thread.start()
        self._threads.append(scheduler_status_thread)

        self._scheduling_thread = SchedulingThread(self._client)
        scheduling_thread = threading.Thread(target=self._scheduling_thread.run)
        scheduling_thread.daemon = True
        scheduling_thread.start()
        self._threads.append(scheduling_thread)

        self._sync_thread = SyncThread(self._driver)
        sync_thread = threading.Thread(target=self._sync_thread.run)
        sync_thread.daemon = True
        sync_thread.start()
        self._threads.append(sync_thread)

        self._task_handling_thread = TaskHandlingThread(self._driver)
        task_handling_thread = threading.Thread(target=self._task_handling_thread.run)
        task_handling_thread.daemon = True
        task_handling_thread.start()
        self._threads.append(task_handling_thread)

        self._task_update_thread = TaskUpdateThread()
        task_update_thread = threading.Thread(target=self._task_update_thread.run)
        task_update_thread.daemon = True
        task_update_thread.start()
        self._threads.append(task_update_thread)
Example #14
0
    def registered(self, driver, frameworkId, masterInfo):
        """
        Invoked when the scheduler successfully registers with a Mesos master.
        It is called with the frameworkId, a unique ID generated by the
        master, and the masterInfo which is information about the master
        itself.

        See documentation for :meth:`mesos_api.mesos.Scheduler.registered`.
        """

        self._driver = driver
        self._framework_id = frameworkId.value
        self._master_hostname = masterInfo.hostname
        self._master_port = masterInfo.port
        logger.info(
            'Scale scheduler registered as framework %s with Mesos master at %s:%i',
            self._framework_id, self._master_hostname, self._master_port)

        initialize_system()
        Scheduler.objects.update_master(self._master_hostname,
                                        self._master_port)
        scheduler_mgr.update_from_mesos(
            self._framework_id,
            HostAddress(self._master_hostname, self._master_port))
        recon_mgr.driver = self._driver

        # Initial database sync
        logger.info('Performing initial sync with Scale database')
        reset_error_cache()
        job_exe_mgr.init_with_database()
        job_type_mgr.sync_with_database()
        scheduler_mgr.sync_with_database()
        workspace_mgr.sync_with_database()

        # Start up background threads
        self._messaging_thread = MessagingThread()
        restart_msg = RestartScheduler()
        restart_msg.when = now()
        self._messaging_thread.add_initial_messages([restart_msg])
        messaging_thread = threading.Thread(target=self._messaging_thread.run)
        messaging_thread.daemon = True
        messaging_thread.start()

        self._recon_thread = ReconciliationThread()
        recon_thread = threading.Thread(target=self._recon_thread.run)
        recon_thread.daemon = True
        recon_thread.start()

        self._scheduler_status_thread = SchedulerStatusThread()
        scheduler_status_thread = threading.Thread(
            target=self._scheduler_status_thread.run)
        scheduler_status_thread.daemon = True
        scheduler_status_thread.start()

        self._scheduling_thread = SchedulingThread(self._driver)
        scheduling_thread = threading.Thread(
            target=self._scheduling_thread.run)
        scheduling_thread.daemon = True
        scheduling_thread.start()

        self._sync_thread = SyncThread(self._driver)
        sync_thread = threading.Thread(target=self._sync_thread.run)
        sync_thread.daemon = True
        sync_thread.start()

        self._task_handling_thread = TaskHandlingThread(self._driver)
        task_handling_thread = threading.Thread(
            target=self._task_handling_thread.run)
        task_handling_thread.daemon = True
        task_handling_thread.start()

        self._task_update_thread = TaskUpdateThread()
        task_update_thread = threading.Thread(
            target=self._task_update_thread.run)
        task_update_thread.daemon = True
        task_update_thread.start()

        self._reconcile_running_jobs()