Ejemplo n.º 1
0
    def setUp(self):
        """Silence scheduler logging and rebuild a six-panda fixture.

        Creates the ``panda`` device type, devices ``panda01`` through
        ``panda06``, a user, and a master job source that is handed the
        hostnames ``panda01`` and ``panda02``.
        """
        super(DatabaseJobTest, self).setUp()

        # Disable the loggers named below for the duration of the tests.
        for logger_name in (
                'lava_scheduler_daemon.dbjobsource.DatabaseJobSource',
                'dispatcher-master',
                'lava_scheduler_app'):
            logging.getLogger(logger_name).disabled = True

        DeviceType.objects.all().delete()
        self.panda_type = self.factory.ensure_device_type(name='panda')

        # Start from empty device, job and tag tables.
        for model in (Device, TestJob, Tag):
            model.objects.all().delete()

        # The devices are intentionally not kept as attributes: per the
        # original note, tests should pick up changes from the database.
        for index in range(1, 7):
            self.factory.make_device(
                device_type=self.panda_type, hostname='panda%02d' % index)

        self.user = self.factory.make_user()
        self.master = DatabaseJobSource(lambda: ['panda01', 'panda02'])
Ejemplo n.º 2
0
    def test_multinode_job_across_different_workers(self):
        """A panda+arndale multinode job is split across two job sources."""
        # Not a real worker: just a second view onto the same database.
        arndale_source = DatabaseJobSource(lambda: ['arndale01'])
        # Take panda02 offline; the assertions below expect the master's
        # job to land on panda01.
        self.panda02.state_transition_to(Device.OFFLINE)

        roles = (('panda', 'client'), ('arndale', 'server'))
        self.submit_job(device_group=[
            {"device_type": device_type, "count": 1, "role": role}
            for device_type, role in roles
        ])

        panda_jobs = self.scheduler_tick(self.master)
        arndale_jobs = self.scheduler_tick(arndale_source)

        self.assertEqual(1, len(panda_jobs))
        self.assertEqual(panda_jobs[0].actual_device, self.panda01)

        self.assertEqual(1, len(arndale_jobs))
        self.assertEqual(arndale_jobs[0].actual_device, self.arndale01)
Ejemplo n.º 3
0
    def setUp(self):
        """Rebuild the panda/arndale/beaglebone fixture used by the tests.

        Creates three device types, two pandas, two arndales, three tags,
        three tagged beaglebone devices (``black01``..``black03``), a user
        and a master job source that is handed ``panda01`` and ``panda02``.
        """
        super(DatabaseJobSourceTest, self).setUp()
        factory = self.factory

        DeviceType.objects.all().delete()
        self.panda = factory.ensure_device_type(name='panda')
        self.beaglebone = factory.ensure_device_type(name='beaglebone')
        self.arndale = factory.ensure_device_type(name='arndale')

        # Start from empty device, job and tag tables.
        for model in (Device, TestJob, Tag):
            model.objects.all().delete()

        self.panda01 = factory.make_device(
            device_type=self.panda, hostname='panda01')
        self.panda02 = factory.make_device(
            device_type=self.panda, hostname='panda02')

        self.arndale01 = factory.make_device(
            device_type=self.arndale, hostname='arndale01')
        self.arndale02 = factory.make_device(
            device_type=self.arndale, hostname='arndale02')

        self.common_tag = factory.ensure_tag('common')
        self.unique_tag = factory.ensure_tag('unique')
        self.exclusion_tag = factory.ensure_tag('exclude')

        # black01: common; black02: common + unique; black03: exclude.
        self.black01 = factory.make_device(
            device_type=self.beaglebone,
            hostname='black01',
            tags=[self.common_tag])
        self.black02 = factory.make_device(
            device_type=self.beaglebone,
            hostname='black02',
            tags=[self.common_tag, self.unique_tag])
        self.black03 = factory.make_device(
            device_type=self.beaglebone,
            hostname='black03',
            tags=[self.exclusion_tag])

        self.user = factory.make_user()
        self.master = DatabaseJobSource(lambda: ['panda01', 'panda02'])
    def restart(self, who):
        """Report the start of *who* and rebuild the whole device fixture.

        Recreates the device types, devices, tags, user and master job
        source from scratch after wiping the corresponding tables.

        :param who: identifier passed through to ``self.report_start``.
        """
        self.report_start(who)
        DeviceType.objects.all().delete()

        # Each device type is stored on the test case under its own name.
        for type_name in ('panda', 'beaglebone', 'arndale'):
            setattr(self, type_name,
                    self.factory.ensure_device_type(name=type_name))

        # make sure the DB is in a clean state wrt devices and jobs
        Device.objects.all().delete()
        TestJob.objects.all().delete()
        Tag.objects.all().delete()

        # Untagged boards, stored under their hostname.
        untagged = (
            ('panda01', self.panda),
            ('panda02', self.panda),
            ('arndale01', self.arndale),
            ('arndale02', self.arndale),
        )
        for hostname, device_type in untagged:
            setattr(self, hostname, self.factory.make_device(
                device_type=device_type, hostname=hostname))

        self.common_tag = self.factory.ensure_tag('common')
        self.unique_tag = self.factory.ensure_tag('unique')
        self.exclusion_tag = self.factory.ensure_tag('exclude')

        # Tagged beaglebone boards, stored under their hostname.
        tagged = (
            ('black01', [self.common_tag]),
            ('black02', [self.common_tag, self.unique_tag]),
            ('black03', [self.exclusion_tag]),
        )
        for hostname, tags in tagged:
            setattr(self, hostname, self.factory.make_device(
                device_type=self.beaglebone, hostname=hostname, tags=tags))

        self.user = self.factory.make_user()

        self.master = DatabaseJobSource(lambda: ['panda01', 'panda02'])
Ejemplo n.º 5
0
    def handle(self, *args, **options):  # pylint: disable=too-many-locals
        """Configure the daemon and run the scheduler ``JobQueue`` service.

        Reads ``options['use_fake']`` to decide between the bundled
        ``fake-dispatcher`` and ``options['dispatcher']``, then starts the
        service on the Twisted reactor. ``reactor.run()`` is the last call.
        """
        import os

        from twisted.internet import reactor

        from lava_scheduler_daemon.service import JobQueue
        from lava_scheduler_daemon.dbjobsource import DatabaseJobSource

        daemon_options = self._configure(options)
        job_source = DatabaseJobSource()

        if not options['use_fake']:
            dispatcher = options['dispatcher']
        else:
            # The fake dispatcher sits one directory above the
            # lava_scheduler_app package directory.
            import lava_scheduler_app
            package_dir = os.path.dirname(
                os.path.abspath(lava_scheduler_app.__file__))
            dispatcher = os.path.join(
                os.path.dirname(package_dir), 'fake-dispatcher')

        # Start scheduler service.
        scheduler = JobQueue(
            job_source, dispatcher, reactor, daemon_options=daemon_options)
        reactor.callWhenRunning(scheduler.startService)  # pylint: disable=no-member
        reactor.run()  # pylint: disable=no-member
    def test_one_worker_does_not_mess_with_jobs_from_the_others(self):
        """A job source with no devices schedules nothing and changes nothing."""
        # A worker that has no devices configured at all.
        idle_worker = DatabaseJobSource(lambda: [])
        self.submit_job(device_type='panda')

        self.assertEqual([], self.scheduler_tick(idle_worker))

        # Every job must still be in the SUBMITTED state.
        statuses = [job.status for job in TestJob.objects.all()]
        self.assertTrue(
            all(status == TestJob.SUBMITTED for status in statuses))
Ejemplo n.º 7
0
    def test_two_multinode_and_multiworker_jobs_waiting_in_the_queue(self):
        """Two queued panda+arndale multinode jobs start in target_group order.

        All four boards are first occupied by single-node jobs. Once one
        panda job and one arndale job finish, the multinode pair whose
        target_group is the 'aaa...' string is expected to be scheduled.
        """
        self.report_start(self.whoami())
        master = self.master
        worker = DatabaseJobSource(lambda: ['arndale01', 'arndale02'])

        def panda_arndale_group():
            # Build a fresh device_group list for each submission.
            return [
                {"device_type": "panda", "count": 1, "role": "client"},
                {"device_type": "arndale", "count": 1, "role": "server"},
            ]

        # Occupy every board with a single-node job.
        for device_type in ('panda', 'panda', 'arndale', 'arndale'):
            self.submit_job(device_type=device_type)

        # Unpacking also checks that exactly two jobs came back per source.
        panda_job, _other_panda_job = self.scheduler_tick(master)
        arndale_job, _other_arndale_job = self.scheduler_tick(worker)

        first_panda, first_arndale = self.submit_job(
            device_group=panda_arndale_group())
        first_panda.target_group = first_arndale.target_group = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
        first_panda.save()
        first_arndale.save()

        second_panda, second_arndale = self.submit_job(
            device_group=panda_arndale_group())
        second_panda.target_group = second_arndale.target_group = 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'
        second_panda.save()
        second_arndale.save()

        # Nothing can be scheduled while all boards are busy.
        self.assertEqual([], self.scheduler_tick(master))
        self.assertEqual([], self.scheduler_tick(worker))

        self.job_finished(panda_job, master)
        self.job_finished(arndale_job, worker)

        self.report_status('bug', self.whoami())
        self.assertEqual([first_panda], self.scheduler_tick(master))
        self.assertEqual([first_arndale], self.scheduler_tick(worker))
        self.report_end(self.whoami())
Ejemplo n.º 8
0
    def handle(self, *args, **options):
        """Run a single job described by a JSON file on the Twisted reactor.

        ``args`` must contain exactly three positional values, in order:
        the dispatcher, the board name and the path of the JSON job file.
        Unpacking raises ``ValueError`` for any other count.

        The reactor is stopped from the job's success callback;
        ``reactor.run()`` is the last call in this method.
        """
        from twisted.internet import reactor
        from lava_scheduler_daemon.job import Job
        daemon_options = self._configure(options)
        source = DatabaseJobSource()
        dispatcher, board_name, json_file = args

        # Parse the job file inside a context manager so the handle is
        # closed straight away instead of being left for garbage collection.
        with open(json_file) as job_file:
            job_data = simplejson.load(job_file)

        job = Job(job_data,
                  dispatcher,
                  source,
                  board_name,
                  reactor,
                  daemon_options=daemon_options)

        def run():
            # NOTE(review): only the success path stops the reactor; if the
            # object returned by job.run() fails, nothing here calls
            # reactor.stop(). Confirm whether that is intended.
            job.run().addCallback(lambda result: reactor.stop())

        reactor.callWhenRunning(run)
        reactor.run()
    def handle(self, *args, **options):
        """Push a full worker heartbeat, then run the scheduler service.

        Reads ``options['use_fake']`` to choose between the bundled
        ``fake-dispatcher`` and ``options['dispatcher']``. A failed
        heartbeat update is logged and does not prevent the scheduler
        from starting. ``reactor.run()`` is the last call in this method.
        """
        import os

        from twisted.internet import reactor

        from lava_scheduler_daemon.service import JobQueue
        from lava_scheduler_daemon.worker import WorkerData
        from lava_scheduler_daemon.dbjobsource import DatabaseJobSource
        import xmlrpclib

        daemon_options = self._configure(options)

        source = DatabaseJobSource()

        if options['use_fake']:
            # The fake dispatcher sits one directory above the
            # lava_scheduler_app package directory.
            import lava_scheduler_app
            opd = os.path.dirname
            dispatcher = os.path.join(
                opd(opd(os.path.abspath(lava_scheduler_app.__file__))),
                'fake-dispatcher')
        else:
            dispatcher = options['dispatcher']

        # Update complete worker heartbeat data. This will be run once,
        # on every start/restart of the scheduler daemon.
        worker = WorkerData()
        try:
            worker.put_heartbeat_data(restart=True)
        except (xmlrpclib.Fault, xmlrpclib.ProtocolError) as err:
            # Best effort: keep starting the scheduler, but record why the
            # update failed rather than discarding the error details.
            worker.logger.error(
                "Complete heartbeat update failed! %s" % (err,))

        # Start scheduler service.
        service = JobQueue(source,
                           dispatcher,
                           reactor,
                           daemon_options=daemon_options)
        reactor.callWhenRunning(service.startService)
        reactor.run()
Ejemplo n.º 10
0
    def setUp(self):
        """Rebuild a fixture of six panda devices, a user and a master source.

        The master job source is handed the hostnames ``panda01`` and
        ``panda02``.
        """
        super(DatabaseJobTest, self).setUp()

        DeviceType.objects.all().delete()
        self.panda_type = self.factory.ensure_device_type(name='panda')

        # Wipe devices, jobs and tags so every test starts from a clean DB.
        Device.objects.all().delete()
        TestJob.objects.all().delete()
        Tag.objects.all().delete()

        # The devices are intentionally not kept as attributes: per the
        # original note, tests should pick up changes from the database.
        hostnames = [
            'panda01', 'panda02', 'panda03',
            'panda04', 'panda05', 'panda06',
        ]
        for hostname in hostnames:
            self.factory.make_device(
                device_type=self.panda_type, hostname=hostname)

        self.user = self.factory.make_user()
        self.master = DatabaseJobSource(lambda: ['panda01', 'panda02'])
Ejemplo n.º 11
0
    def test_failed_reservation_multinode(self):
        """Check queued multinode jobs stay unstartable after a bad reservation.

        Two panda+arndale multinode groups are submitted (job1/job2, then
        job3/job4) with panda02 in maintenance. The test then writes an
        inconsistent state directly to the database: job1 is marked
        INCOMPLETE while panda01 is OFFLINING with job1 as its current job,
        and job3 is forced onto panda01. It asserts that neither job3 nor
        job4 is ready to start, before and after further scheduler ticks.

        The statement order matters: each reload reads the state left
        behind by the preceding save or scheduler tick.
        """
        self.restart(self.whoami())
        master = self.master
        worker = DatabaseJobSource(lambda: ['arndale01'])
        # two pandas, two arndales, three blacks
        for panda in Device.objects.filter(device_type=self.panda):
            self.assertIsNone(panda.current_job)
        self.assertEqual(TestJob.objects.all().count(), 0)
        # Put panda02 into maintenance mode before anything is submitted.
        self.panda02.put_into_maintenance_mode(self.user, 'unit test', None)
        job1, job2 = self.submit_job(device_group=[
            {
                "device_type": "panda",
                "count": 1,
                "role": "client"
            },
            {
                "device_type": "arndale",
                "count": 1,
                "role": "server"
            },
        ])
        # create a queue
        job3, job4 = self.submit_job(device_group=[
            {
                "device_type": "panda",
                "count": 1,
                "role": "client"
            },
            {
                "device_type": "arndale",
                "count": 1,
                "role": "server"
            },
        ])
        # master_jobs = self.scheduler_tick(master)
        # Only the arndale worker ticks here; the returned list is not used.
        worker_jobs = self.scheduler_tick(worker)
        # self.scheduler_tick()
        # Write the failure state directly: job1 becomes INCOMPLETE while
        # panda01 is left OFFLINING with job1 still as its current job.
        job1 = TestJob.objects.get(id=job1.id)  # reload
        job1.status = TestJob.INCOMPLETE
        job1.save(update_fields=['status'])
        self.panda01 = Device.objects.get(
            hostname=self.panda01.hostname)  # reload
        self.panda01.status = Device.OFFLINING
        self.panda01.current_job = job1
        self.panda01.save(update_fields=['status', 'current_job'])

        # master_jobs = self.scheduler_tick(master)
        worker_jobs = self.scheduler_tick(worker)
        # self.scheduler_tick()
        # Cancel the other half of the first multinode group, then let the
        # master tick; the returned list is not used.
        job2 = TestJob.objects.get(id=job2.id)  # reload
        job2.cancel(self.user)
        master_jobs = self.scheduler_tick(master)
        # worker_jobs = self.scheduler_tick(worker)
        # self.scheduler_tick()

        self.panda01 = Device.objects.get(
            hostname=self.panda01.hostname)  # reload
        job3 = TestJob.objects.get(id=job3.id)  # reload

        # FORCE the buggy status
        # job3 is assigned panda01 even though panda01.current_job was set
        # to job1 above.
        job3.actual_device = self.panda01
        job3.save(update_fields=['actual_device'])

        job3 = TestJob.objects.get(id=job3.id)  # reload
        self.assertFalse(job3.is_ready_to_start)
        self.assertEqual(job3.actual_device, self.panda01)
        self.assertNotEqual(job3.actual_device.current_job, job3)

        job4 = TestJob.objects.get(id=job4.id)  # reload
        if job4.actual_device:
            self.assertNotEqual(job4.actual_device, self.panda01)
        # NOTE(review): this reloads job3, but the assertion that follows
        # checks job4 -- possibly a leftover; confirm the intent.
        job3 = TestJob.objects.get(id=job3.id)  # reload
        self.assertFalse(job4.is_ready_to_start)

        # Tick both sources again (results unused) and re-check both jobs.
        master_jobs = self.scheduler_tick(master)
        worker_jobs = self.scheduler_tick(worker)
        # self.scheduler_tick()

        job3 = TestJob.objects.get(id=job3.id)  # reload
        # if job3.actual_device:
        #     self.assertNotEqual(job3.actual_device, self.panda01)
        self.assertFalse(job3.is_ready_to_start)

        job4 = TestJob.objects.get(id=job4.id)  # reload
        if job4.actual_device:
            self.assertNotEqual(job4.actual_device, self.panda01)
        self.assertFalse(job4.is_ready_to_start)

        self.cleanup(self.whoami())
Ejemplo n.º 12
0
class DatabaseJobSourceTest(TestCaseWithFactory):
    """Scheduling tests that drive ``DatabaseJobSource`` against a fixture.

    The fixture built in ``setUp`` has two pandas, two arndales and three
    tagged beaglebone boards. ``self.master`` is a job source that is handed
    the hostnames ``panda01`` and ``panda02``.
    """

    def setUp(self):
        """Reset the database and create device types, devices, tags and user."""
        super(DatabaseJobSourceTest, self).setUp()

        DeviceType.objects.all().delete()

        self.panda = self.factory.ensure_device_type(name='panda')
        self.beaglebone = self.factory.ensure_device_type(name='beaglebone')
        self.arndale = self.factory.ensure_device_type(name='arndale')

        # make sure the DB is in a clean state wrt devices and jobs
        Device.objects.all().delete()
        TestJob.objects.all().delete()
        Tag.objects.all().delete()

        panda = self.panda
        self.panda01 = self.factory.make_device(device_type=panda, hostname='panda01')
        self.panda02 = self.factory.make_device(device_type=panda, hostname='panda02')

        arndale = self.arndale
        self.arndale01 = self.factory.make_device(device_type=arndale, hostname='arndale01')
        self.arndale02 = self.factory.make_device(device_type=arndale, hostname='arndale02')

        self.common_tag = self.factory.ensure_tag('common')
        self.unique_tag = self.factory.ensure_tag('unique')
        self.exclusion_tag = self.factory.ensure_tag('exclude')

        # black01: common; black02: common + unique; black03: exclude.
        self.black01 = self.factory.make_device(device_type=self.beaglebone, hostname='black01', tags=[self.common_tag])
        self.black02 = self.factory.make_device(device_type=self.beaglebone, hostname='black02', tags=[
            self.common_tag, self.unique_tag])
        self.black03 = self.factory.make_device(device_type=self.beaglebone, hostname='black03', tags=[
            self.exclusion_tag])

        self.user = self.factory.make_user()

        self.master = DatabaseJobSource(lambda: ['panda01', 'panda02'])

    def submit_job(self, **kw):
        """Build a job definition from *kw* and submit it as ``self.user``.

        Returns whatever ``TestJob.from_json_and_user`` returns; the tests
        below unpack two jobs from it when a ``device_group`` is submitted.
        """
        job_definition = self.factory.make_job_json(**kw)
        return TestJob.from_json_and_user(job_definition, self.user)

    def _print_scheduler_state(self, heading):
        """Print *heading*, then the master's job queue and available devices."""
        print(heading)
        print('        Job queue: %r' % self.master._get_job_queue())
        print('Available devices: %r' % self.master._get_available_devices())

    @contextmanager
    def log_scheduler_state(self, event):
        """Print scheduler state before and after the wrapped block.

        Output is produced only when ``DEBUG`` is present in the
        environment. The "after" dump is skipped if the block raises.
        """
        if 'DEBUG' in os.environ:
            print("##############################################")
            self._print_scheduler_state('# Before %s' % event)
        yield
        if 'DEBUG' in os.environ:
            self._print_scheduler_state('# After %s' % event)

    def scheduler_tick(self, worker=None):
        """Ask *worker* for its job list, start each job, return the list.

        :param worker: job source to tick; defaults to ``self.master``.
        """
        if worker is None:
            worker = self.master
        with self.log_scheduler_state("scheduler ticks"):
            jobs = worker.getJobList_impl()
        if 'DEBUG' in os.environ:
            print('Jobs ready to run: %r' % jobs)
            print('   Submitted jobs: %r' % TestJob.objects.filter(status=TestJob.SUBMITTED))
            print(' State of devices: %r' % Device.objects.all())
        for job in jobs:
            # simulates the actual daemon, which will start jobs just after it
            # gets them from the scheduler
            self.job_started(job, worker)
        return jobs

    def job_started(self, job, worker=None):
        """Tell *worker* (default: the master) that *job* has started."""
        if worker is None:
            worker = self.master
        worker.jobStarted_impl(job)

    def job_finished(self, job, worker=None):
        """Tell *worker* (default: the master) that *job* has completed.

        Completion is reported against the hostname of the job's
        ``actual_device`` with ``0`` and ``None`` as the remaining arguments.
        """
        if worker is None:
            worker = self.master
        with self.log_scheduler_state("job %d completes" % job.id):
            worker.jobCompleted_impl(job.actual_device.hostname, 0, None)

    def device_status(self, hostname, status=None, health_status=None):
        """Set ``status`` and/or ``health_status`` on a device and save it.

        :param hostname: primary key used to look the device up.
        :param status: new status, left unchanged when ``None``.
        :param health_status: new health status, left unchanged when ``None``.
        """
        device = Device.objects.get(pk=hostname)
        if status is not None:
            device.status = status
        if health_status is not None:
            device.health_status = health_status
        device.save()

    def test_simple_single_node_scheduling(self):
        """A single panda job is scheduled and gets a device."""
        submitted = self.submit_job(device_type='panda')
        scheduled = self.scheduler_tick()

        self.assertEqual([submitted], scheduled)
        job = scheduled[0]
        self.assertTrue(job.actual_device)

    def test_simple_multi_node_scheduler(self):
        """Both jobs of a two-panda multinode group are scheduled together."""
        submitted_jobs = self.submit_job(
            device_group=[
                {"device_type": "panda", "count": 1, "role": "client"},
                {"device_type": "panda", "count": 1, "role": "server"},
            ]
        )

        scheduled_jobs = self.scheduler_tick()

        for job in submitted_jobs:
            # assertIn reports both operands when the check fails.
            self.assertIn(job, scheduled_jobs)

    def test_single_node_and_multinode(self):
        """Multinode jobs get devices one by one as single-node jobs finish."""
        singlenode_job1 = self.submit_job(device_type='panda')
        singlenode_job2 = self.submit_job(device_type='panda')

        self.scheduler_tick()  # should schedule single node jobs and start running them

        singlenode_job1 = TestJob.objects.get(pk=singlenode_job1.id)  # reload
        singlenode_job2 = TestJob.objects.get(pk=singlenode_job2.id)  # reload

        # multinode jobs submitted
        multinode_job1, multinode_job2 = self.submit_job(
            device_group=[
                {"device_type": "panda", "count": 1, "role": "client"},
                {"device_type": "panda", "count": 1, "role": "server"},
            ]
        )

        # job on first device finishes
        self.job_finished(singlenode_job1)
        singlenode_job1 = TestJob.objects.get(pk=singlenode_job1.id)  # reload
        self.assertEqual(singlenode_job1.status, TestJob.COMPLETE)

        self.scheduler_tick()  # should reserve a device for one of jobs in the multinode group

        # one (and only one) of the multinode jobs gets a device assigned
        multinode_job1 = TestJob.objects.get(pk=multinode_job1.id)  # reload
        multinode_job2 = TestJob.objects.get(pk=multinode_job2.id)  # reload
        assigned = [
            job for job in (multinode_job1, multinode_job2)
            if job.actual_device is not None
        ]
        self.assertEqual(1, len(assigned))

        # job on second board finishes
        self.job_finished(singlenode_job2)
        singlenode_job2 = TestJob.objects.get(pk=singlenode_job2.id)  # reload
        self.assertEqual(singlenode_job2.status, TestJob.COMPLETE)

        self.scheduler_tick()  # should reserve a device for the other job in the multinode job

        multinode_job1 = TestJob.objects.get(pk=multinode_job1.id)  # reload
        multinode_job2 = TestJob.objects.get(pk=multinode_job2.id)  # reload
        self.assertTrue(all(
            job.actual_device is not None
            for job in (multinode_job1, multinode_job2)))

    def test_health_check(self):
        """Configuring a health check on the panda type yields panda jobs."""
        self.panda.health_check_job = self.factory.make_job_json(health_check='true')
        self.panda.save()

        jobs = self.scheduler_tick()

        panda_jobs = [j for j in jobs if j.actual_device.device_type == self.panda]

        self.assertGreater(len(panda_jobs), 0)
        self.assertTrue(all(job.actual_device is not None for job in panda_jobs))

    def test_one_worker_does_not_mess_with_jobs_from_the_others(self):
        """A job source with no devices schedules nothing and changes nothing."""
        # simulate a worker with no devices configured
        worker = DatabaseJobSource(lambda: [])

        self.submit_job(device_type='panda')

        scheduled_jobs = self.scheduler_tick(worker)

        self.assertEqual([], scheduled_jobs)
        self.assertTrue(all(
            job.status == TestJob.SUBMITTED for job in TestJob.objects.all()))

    def test_multinode_job_across_different_workers(self):
        """A panda+arndale multinode job is split across two job sources."""
        master = self.master
        worker = DatabaseJobSource(lambda: ['arndale01'])
        arndale01 = self.arndale01
        self.panda02.state_transition_to(Device.OFFLINE)
        panda01 = self.panda01

        self.submit_job(
            device_group=[
                {"device_type": "panda", "count": 1, "role": "client"},
                {"device_type": "arndale", "count": 1, "role": "server"},
            ]
        )

        master_jobs = self.scheduler_tick(master)
        worker_jobs = self.scheduler_tick(worker)

        self.assertEqual(1, len(master_jobs))
        self.assertEqual(master_jobs[0].actual_device, panda01)

        self.assertEqual(1, len(worker_jobs))
        self.assertEqual(worker_jobs[0].actual_device, arndale01)

    def test_two_multinode_jobs_plus_two_singlenode_jobs(self):
        """Single-node jobs run first; then the 'aaa...' multinode group."""
        single1 = self.submit_job(device_type='panda')
        single2 = self.submit_job(device_type='panda')

        multi1a, multi1b = self.submit_job(
            device_group=[
                {"device_type": "panda", "count": 1, "role": "client"},
                {"device_type": "panda", "count": 1, "role": "server"},
            ]
        )

        multi2a, multi2b = self.submit_job(
            device_group=[
                {"device_type": "panda", "count": 1, "role": "client"},
                {"device_type": "panda", "count": 1, "role": "server"},
            ]
        )

        # make it confusing by making both multinode jobs have the exact same
        # submit time
        # also set the target_group string to make the outcome predictable
        now = datetime.datetime.now()
        for job in [multi1a, multi1b]:
            job.submit_time = now
            job.target_group = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
            job.save()
        for job in [multi2a, multi2b]:
            job.submit_time = now
            job.target_group = 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'
            job.save()

        scheduled = sorted(self.scheduler_tick(), key=lambda job: job.id)
        self.assertEqual([single1, single2], scheduled)
        single1, single2 = scheduled  # reload locals

        self.job_finished(single1)

        # One free panda is not enough for a two-panda group.
        self.assertEqual([], self.scheduler_tick())

        self.job_finished(single2)

        scheduled = sorted(self.scheduler_tick(), key=lambda job: job.id)
        self.assertEqual([multi1a, multi1b], scheduled)

    def test_two_multinode_and_multiworker_jobs_waiting_in_the_queue(self):
        """Two queued panda+arndale groups start in target_group order."""
        master = self.master
        worker = DatabaseJobSource(lambda: ['arndale01', 'arndale02'])

        self.submit_job(device_type='panda')
        self.submit_job(device_type='panda')
        self.submit_job(device_type='arndale')
        self.submit_job(device_type='arndale')

        # Unpacking also checks that exactly two jobs came back per source;
        # only the first job of each pair is used below.
        p1, p2 = self.scheduler_tick(master)
        a1, a2 = self.scheduler_tick(worker)

        m1p, m1a = self.submit_job(
            device_group=[
                {"device_type": "panda", "count": 1, "role": "client"},
                {"device_type": "arndale", "count": 1, "role": "server"},
            ]
        )
        m1p.target_group = m1a.target_group = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
        m1p.save()
        m1a.save()
        m2p, m2a = self.submit_job(
            device_group=[
                {"device_type": "panda", "count": 1, "role": "client"},
                {"device_type": "arndale", "count": 1, "role": "server"},
            ]
        )
        m2p.target_group = m2a.target_group = 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'
        m2p.save()
        m2a.save()

        self.assertEqual([], self.scheduler_tick(master))
        self.assertEqual([], self.scheduler_tick(worker))

        self.job_finished(p1, master)
        self.job_finished(a1, worker)

        self.assertEqual([m1p], self.scheduler_tick(master))
        self.assertEqual([m1a], self.scheduler_tick(worker))

    def test_looping_mode(self):
        """A device in HEALTH_LOOPING gets a new health check after each one."""
        self.panda.health_check_job = self.factory.make_job_json(health_check='true')
        self.panda.save()
        self.device_status('panda01', health_status=Device.HEALTH_LOOPING)
        self.device_status('panda02', status=Device.OFFLINE)

        jobs = self.scheduler_tick()
        self.assertEqual(1, len(jobs))
        health_check = jobs[0]
        self.assertTrue(health_check.health_check)
        self.assertEqual(health_check.actual_device.hostname, 'panda01')

        # no new health check while the original one is running
        self.assertEqual(0, len(self.scheduler_tick()))

        self.job_finished(health_check)
        jobs = self.scheduler_tick()
        self.assertEqual(1, len(jobs))
        new_health_check = jobs[0]
        self.assertTrue(new_health_check.health_check)
        self.assertEqual(new_health_check.actual_device.hostname, 'panda01')

        # again just to be sure
        self.job_finished(new_health_check)
        jobs = self.scheduler_tick()
        self.assertEqual(1, len(jobs))
        third_health_check = jobs[0]
        self.assertTrue(third_health_check.health_check)
        self.assertEqual(third_health_check.actual_device.hostname, 'panda01')

    def test_find_device_for_job(self):
        """
        tests that find_device_for_job gives preference to matching by requested
        _device_ over matching by requested device _type_.
        """
        job = self.submit_job(target='panda01', device_type='panda')
        devices = [self.panda02, self.panda01]
        chosen_device = find_device_for_job(job, devices)
        self.assertEqual(self.panda01, chosen_device)

    def test_offline_health_check(self):
        """
        tests whether we are able to submit health check jobs for devices that
        are OFFLINE.
        """
        self.panda.health_check_job = self.factory.make_job_json(health_check='true')
        self.panda.save()

        self.panda01.state_transition_to(Device.OFFLINE)
        self.panda02.state_transition_to(Device.OFFLINE)

        Device.initiate_health_check_job(self.panda01)
        Device.initiate_health_check_job(self.panda02)

        jobs = self.scheduler_tick()

        self.assertEqual(2, len(jobs))
        self.assertTrue(all(job.actual_device is not None for job in jobs))
        # NOTE(review): these check the in-memory objects created in setUp,
        # not rows re-read from the database -- confirm that is intended.
        self.assertEqual(self.panda01.status, Device.OFFLINE)
        self.assertEqual(self.panda02.status, Device.OFFLINE)

    def test_find_device_for_job_with_tag(self):
        """
        test that tags are used to set which device is selected
        panda should be excluded by device_type
        black03 should be excluded as it does not have the common tag
        black02 would also match but is not included in the device check
        """
        job = self.submit_job(device_type='beaglebone', tags=[
            self.common_tag.name
        ])
        devices = [self.panda01, self.arndale02, self.black01, self.black03]
        chosen_device = find_device_for_job(job, devices)
        self.assertEqual(self.black01, chosen_device)

    def test_find_device_for_devices_without_tags(self):
        """
        ensure that tags do not interfere with finding devices of
        unrelated types
        """
        job = self.submit_job(device_type='arndale', tags=[])
        devices = [self.panda01, self.arndale02, self.black01, self.black03]
        chosen_device = find_device_for_job(job, devices)
        self.assertEqual(self.arndale02, chosen_device)
        # Submitting with a tag no arndale carries must be rejected.
        try:
            self.submit_job(device_type='arndale', tags=[
                self.common_tag.name
            ])
        except DevicesUnavailableException:
            pass
        else:
            self.fail("Offered an arndale when no arndale support the requested tags")

    def test_find_device_for_job_with_multiple_tags(self):
        """
        test that tags are used to set which device is selected
        choose black02 and never black01 due to the presence
        of both the common tag and the unique tag only with black02.
        """

        job = self.submit_job(device_type='beaglebone', tags=[
            self.common_tag.name, self.unique_tag.name
        ])
        devices = [self.panda01, self.black01, self.black02, self.black03]
        chosen_device = find_device_for_job(job, devices)
        self.assertEqual(self.black02, chosen_device)
        # The rejected submission is not bound to ``job``: the checks that
        # follow keep using the beaglebone job created above.
        try:
            self.submit_job(device_type='panda', tags=[
                self.common_tag.name, self.unique_tag.name
            ])
        except DevicesUnavailableException:
            pass
        else:
            self.fail("Offered a panda when no pandas support the requested tags")

        devices = [self.black01, self.black02, self.black03]
        chosen_device = find_device_for_job(job, devices)
        self.assertEqual(self.black02, chosen_device)

        devices = [self.arndale02, self.panda02, self.black02, self.black03]
        chosen_device = find_device_for_job(job, devices)
        self.assertEqual(self.black02, chosen_device)

    def test_find_device_with_single_job_tag(self):
        """
        tests handling of jobs with less tags than supported but still
        choosing one tag which only applies to one device in the set.
        """
        job = self.submit_job(device_type='beaglebone', tags=[
            self.unique_tag.name
        ])
        devices = [self.panda02, self.black02, self.black03]
        chosen_device = find_device_for_job(job, devices)
        self.assertEqual(self.black02, chosen_device)

        job = self.submit_job(device_type='beaglebone', tags=[
            self.exclusion_tag.name
        ])
        devices = [self.panda02, self.black02, self.black03]
        chosen_device = find_device_for_job(job, devices)
        self.assertEqual(self.black03, chosen_device)

    def _test_basic_vm_groups_scheduling(self):
        """Submit a vm_group job (arndale host, two kvm-arm VMs); expect 3 jobs.

        The leading underscore keeps this out of unittest's default
        ``test*`` method discovery, so it does not run as part of the suite.
        """
        self.factory.ensure_device_type(name='kvm-arm')
        self.factory.ensure_device_type(name='dynamic-vm')
        self.submit_job(vm_group={
            "host": {
                "device_type": "arndale",
                "role": "host"
            },
            "vms": [
                {
                    "device_type": "kvm-arm",
                    "role": "server"
                },
                {
                    "device_type": "kvm-arm",
                    "role": "client"
                }
            ]
        })
        jobs = self.scheduler_tick()
        self.assertEqual(3, len(jobs))
Ejemplo n.º 13
0
class DatabaseJobSourceTest(TestCaseWithFactory):
    def setUp(self):
        """
        Rebuild the scheduler fixtures from scratch: three device types, two
        pandas, two arndales, three tagged beaglebones, a user, and a master
        job source built with a callable returning panda01 and panda02.
        """
        super(DatabaseJobSourceTest, self).setUp()

        # Device types are wiped first and then recreated.
        DeviceType.objects.all().delete()
        self.panda = self.factory.ensure_device_type(name='panda')
        self.beaglebone = self.factory.ensure_device_type(name='beaglebone')
        self.arndale = self.factory.ensure_device_type(name='arndale')

        # Start every test from empty device, job and tag tables.
        Device.objects.all().delete()
        TestJob.objects.all().delete()
        Tag.objects.all().delete()

        make_device = self.factory.make_device

        self.panda01 = make_device(device_type=self.panda, hostname='panda01')
        self.panda02 = make_device(device_type=self.panda, hostname='panda02')

        self.arndale01 = make_device(device_type=self.arndale,
                                     hostname='arndale01')
        self.arndale02 = make_device(device_type=self.arndale,
                                     hostname='arndale02')

        self.common_tag = self.factory.ensure_tag('common')
        self.unique_tag = self.factory.ensure_tag('unique')
        self.exclusion_tag = self.factory.ensure_tag('exclude')

        # Beaglebones differ only by tags: black01 has the common tag,
        # black02 has common + unique, black03 has only the exclusion tag.
        self.black01 = make_device(device_type=self.beaglebone,
                                   hostname='black01',
                                   tags=[self.common_tag])
        self.black02 = make_device(device_type=self.beaglebone,
                                   hostname='black02',
                                   tags=[self.common_tag, self.unique_tag])
        self.black03 = make_device(device_type=self.beaglebone,
                                   hostname='black03',
                                   tags=[self.exclusion_tag])

        self.user = self.factory.make_user()

        self.master = DatabaseJobSource(lambda: ['panda01', 'panda02'])

    def submit_job(self, **kw):
        """
        Build a job definition from ``kw`` with the factory and submit it as
        ``self.user``.

        Returns whatever ``TestJob.from_json_and_user`` returns; the tests in
        this class use it as a single job for ``device_type`` submissions and
        unpack it as a sequence for ``device_group`` submissions.
        """
        return TestJob.from_json_and_user(
            self.factory.make_job_json(**kw), self.user)

    @contextmanager
    def log_scheduler_state(self, event):
        """
        Context manager that dumps the master's job queue and available
        devices before and after the wrapped block.

        Output is only produced when ``DEBUG`` is present in the environment;
        otherwise nothing is printed.  The "After" dump is emitted from a
        ``finally`` clause so it is still printed when the wrapped block
        raises, which is when the scheduler state is most useful.

        :param event: text interpolated into the "Before"/"After" headers.
        """
        def dump_state(moment):
            # The environment is consulted on every dump, so the two dumps
            # are independently switched by DEBUG.
            if 'DEBUG' not in os.environ:
                return
            print('# %s %s' % (moment, event))
            print('        Job queue: %r' % self.master._get_job_queue())
            print('Available devices: %r' %
                  self.master._get_available_devices())

        if 'DEBUG' in os.environ:
            print("##############################################")
        dump_state('Before')
        try:
            yield
        finally:
            dump_state('After')

    def scheduler_tick(self, worker=None):
        """
        Run one scheduling pass on ``worker`` (the master when omitted), mark
        every returned job as started on that same worker, and return the
        list of jobs.
        """
        source = self.master if worker is None else worker

        with self.log_scheduler_state("scheduler ticks"):
            ready = source.getJobList_impl()

        debugging = 'DEBUG' in os.environ
        if debugging:
            still_submitted = TestJob.objects.filter(status=TestJob.SUBMITTED)
            print('Jobs ready to run: %r' % ready)
            print('   Submitted jobs: %r' % still_submitted)
            print(' State of devices: %r' % Device.objects.all())

        # Mimic the real daemon, which starts jobs right after receiving
        # them from the scheduler.
        for job in ready:
            self.job_started(job, source)
        return ready

    def job_started(self, job, worker=None):
        """Tell ``worker`` (default: the master) that ``job`` has started."""
        target = self.master if worker is None else worker
        target.jobStarted_impl(job)

    def job_finished(self, job, worker=None):
        """
        Report ``job`` as completed on ``worker`` (default: the master),
        logging the scheduler state around the call.
        """
        target = self.master if worker is None else worker
        event = "job %d completes" % job.id
        with self.log_scheduler_state(event):
            hostname = job.actual_device.hostname
            # NOTE(review): 0 and None are passed positionally -- presumably
            # an exit code and a kill reason; confirm against
            # jobCompleted_impl.
            target.jobCompleted_impl(hostname, 0, None)

    def device_status(self, hostname, status=None, health_status=None):
        """
        Load the device whose primary key is ``hostname``, overwrite its
        ``status`` and/or ``health_status`` with any non-None value given,
        and save it.
        """
        device = Device.objects.get(pk=hostname)
        requested = (('status', status), ('health_status', health_status))
        for field, value in requested:
            if value is None:
                # None means "leave this field alone".
                continue
            setattr(device, field, value)
        device.save()

    def test_simple_single_node_scheduling(self):
        """
        A lone panda job is returned by the first scheduler tick and has a
        device assigned.
        """
        panda_job = self.submit_job(device_type='panda')
        ticked = self.scheduler_tick()

        self.assertEqual([panda_job], ticked)
        self.assertTrue(ticked[0].actual_device)

    def test_simple_multi_node_scheduler(self):
        """
        Submit a two-role multinode group (client and server, both panda) and
        check that a single scheduler tick returns every job of the group.
        """
        submitted_jobs = self.submit_job(device_group=[
            {
                "device_type": "panda",
                "count": 1,
                "role": "client"
            },
            {
                "device_type": "panda",
                "count": 1,
                "role": "server"
            },
        ])

        scheduled_jobs = self.scheduler_tick()

        for job in submitted_jobs:
            # assertIn reports the missing job and the scheduled list on
            # failure, where assertTrue(job in ...) only says "False".
            self.assertIn(job, scheduled_jobs)

    def test_single_node_and_multinode(self):
        """
        Two single node jobs are running on pandas while a two-job multinode
        group waits.  After each single node job completes, the next tick
        must give a device to one more job of the group.
        """
        def reload(job):
            # Fetch a fresh copy so status/device changes become visible.
            return TestJob.objects.get(pk=job.id)

        def devices_assigned(*jobs):
            return [job.actual_device is not None for job in jobs]

        first_single = self.submit_job(device_type='panda')
        second_single = self.submit_job(device_type='panda')

        # Should schedule both single node jobs and start running them.
        self.scheduler_tick()
        first_single = reload(first_single)
        second_single = reload(second_single)

        # The multinode group is submitted while both jobs are running.
        group_job_a, group_job_b = self.submit_job(device_group=[
            {"device_type": "panda", "count": 1, "role": "client"},
            {"device_type": "panda", "count": 1, "role": "server"},
        ])

        # The job on the first device finishes.
        self.job_finished(first_single)
        first_single = reload(first_single)
        self.assertEqual(first_single.status, TestJob.COMPLETE)

        # Should reserve a device for one of the jobs in the group.
        self.scheduler_tick()

        group_job_a = reload(group_job_a)
        group_job_b = reload(group_job_b)
        assigned = devices_assigned(group_job_a, group_job_b)
        # One (and only one) of the multinode jobs has a device by now.
        self.assertTrue(any(assigned))
        self.assertTrue(any(not has_device for has_device in assigned))

        # The job on the second device finishes.
        self.job_finished(second_single)
        second_single = reload(second_single)
        self.assertEqual(second_single.status, TestJob.COMPLETE)

        # Should reserve a device for the other job in the group.
        self.scheduler_tick()

        group_job_a = reload(group_job_a)
        group_job_b = reload(group_job_b)
        self.assertTrue(all(devices_assigned(group_job_a, group_job_b)))

    def test_health_check(self):
        """
        With a health check job defined on the panda device type, a scheduler
        tick must return at least one job that runs on a panda device, and
        every returned job must have a device assigned.
        """
        self.panda.health_check_job = self.factory.make_job_json(
            health_check='true')
        self.panda.save()

        jobs = self.scheduler_tick()

        # Verify the device assignment first: the filter below dereferences
        # ``actual_device``, so a job without one would otherwise abort the
        # test with an AttributeError instead of a proper assertion failure
        # (and checking it after the filter could never fail).
        for job in jobs:
            self.assertIsNotNone(job.actual_device)

        panda_jobs = [
            job for job in jobs
            if job.actual_device.device_type == self.panda
        ]
        self.assertTrue(panda_jobs)

    def test_one_worker_does_not_mess_with_jobs_from_the_others(self):
        """
        A job source with no devices configured must not schedule anything
        and must leave every job in the SUBMITTED state.
        """
        # Simulates a worker with no devices configured.
        idle_worker = DatabaseJobSource(lambda: [])

        self.submit_job(device_type='panda')

        self.assertEqual([], self.scheduler_tick(idle_worker))

        statuses = [job.status for job in TestJob.objects.all()]
        self.assertTrue(
            all(status == TestJob.SUBMITTED for status in statuses))

    def test_multinode_job_across_different_workers(self):
        """
        A multinode group needing one panda and one arndale is split between
        two job sources: the master must return the panda job on panda01
        (panda02 is taken offline first) and the second source, which only
        lists arndale01, must return the arndale job on that device.
        """
        arndale_source = DatabaseJobSource(lambda: ['arndale01'])
        # With panda02 offline the panda job is expected on panda01.
        self.panda02.state_transition_to(Device.OFFLINE)

        self.submit_job(device_group=[
            {"device_type": "panda", "count": 1, "role": "client"},
            {"device_type": "arndale", "count": 1, "role": "server"},
        ])

        from_master = self.scheduler_tick(self.master)
        from_worker = self.scheduler_tick(arndale_source)

        self.assertEqual(1, len(from_master))
        self.assertEqual(from_master[0].actual_device, self.panda01)

        self.assertEqual(1, len(from_worker))
        self.assertEqual(from_worker[0].actual_device, self.arndale01)

    def test_two_multinode_jobs_plus_two_singlenode_jobs(self):
        """
        Two single node panda jobs are submitted ahead of two panda-only
        multinode groups.  The single node jobs are scheduled first; nothing
        is scheduled while only one of them has finished, and once both are
        done the test expects the group named 'aaa...' to be scheduled.
        """
        def panda_pair():
            # A fresh client/server device_group description per submission.
            return [
                {"device_type": "panda", "count": 1, "role": role}
                for role in ("client", "server")
            ]

        def job_id(job):
            return job.id

        single1 = self.submit_job(device_type='panda')
        single2 = self.submit_job(device_type='panda')
        multi1a, multi1b = self.submit_job(device_group=panda_pair())
        multi2a, multi2b = self.submit_job(device_group=panda_pair())

        # Make it confusing by giving both multinode groups the exact same
        # submit time; the target_group strings are then set explicitly to
        # make the outcome predictable.
        now = datetime.datetime.now()
        groups = (
            ((multi1a, multi1b), 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'),
            ((multi2a, multi2b), 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'),
        )
        for members, group_name in groups:
            for job in members:
                job.submit_time = now
                job.target_group = group_name
                job.save()

        scheduled = sorted(self.scheduler_tick(), key=job_id)
        self.assertEqual([single1, single2], scheduled)
        # Continue with the instances the scheduler returned.
        single1, single2 = scheduled

        self.job_finished(single1)
        # Only one single node job has finished: nothing is scheduled yet.
        self.assertEqual([], self.scheduler_tick())

        self.job_finished(single2)
        scheduled = sorted(self.scheduler_tick(), key=job_id)
        self.assertEqual([multi1a, multi1b], scheduled)

    def test_two_multinode_and_multiworker_jobs_waiting_in_the_queue(self):
        """
        Two panda+arndale multinode groups queue up behind running single
        node jobs on two job sources.  After one panda job and one arndale
        job finish, each source must schedule its half of the group named
        'aaa...' and nothing else.
        """
        def pair():
            # A fresh panda-client / arndale-server description each time.
            return [
                {"device_type": "panda", "count": 1, "role": "client"},
                {"device_type": "arndale", "count": 1, "role": "server"},
            ]

        def submit_group(group_name):
            panda_job, arndale_job = self.submit_job(device_group=pair())
            panda_job.target_group = arndale_job.target_group = group_name
            panda_job.save()
            arndale_job.save()
            return panda_job, arndale_job

        master = self.master
        worker = DatabaseJobSource(lambda: ['arndale01', 'arndale02'])

        for device_type in ('panda', 'panda', 'arndale', 'arndale'):
            self.submit_job(device_type=device_type)

        # Unpacking doubles as a check that each tick returned two jobs.
        panda_running, _ = self.scheduler_tick(master)
        arndale_running, _ = self.scheduler_tick(worker)

        first_panda, first_arndale = submit_group(
            'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
        submit_group('bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb')

        # Both groups have to wait while the single node jobs are running.
        self.assertEqual([], self.scheduler_tick(master))
        self.assertEqual([], self.scheduler_tick(worker))

        self.job_finished(panda_running, master)
        self.job_finished(arndale_running, worker)

        self.assertEqual([first_panda], self.scheduler_tick(master))
        self.assertEqual([first_arndale], self.scheduler_tick(worker))

    def test_looping_mode(self):
        """
        With panda01 set to HEALTH_LOOPING and panda02 offline, every tick
        that follows a finished health check must schedule a new health
        check on panda01, and no second one while the first is running.
        """
        self.panda.health_check_job = self.factory.make_job_json(
            health_check='true')
        self.panda.save()
        self.device_status('panda01', health_status=Device.HEALTH_LOOPING)
        self.device_status('panda02', status=Device.OFFLINE)

        def tick_expecting_health_check():
            # One tick must yield exactly one health check job on panda01.
            jobs = self.scheduler_tick()
            self.assertEqual(1, len(jobs))
            job = jobs[0]
            self.assertTrue(job.health_check)
            self.assertEqual(job.actual_device.hostname, 'panda01')
            return job

        first = tick_expecting_health_check()

        # No new health check while the original one is running.
        self.assertEqual(0, len(self.scheduler_tick()))

        self.job_finished(first)
        second = tick_expecting_health_check()

        # And once more, just to be sure.
        self.job_finished(second)
        tick_expecting_health_check()

    def test_find_device_for_job(self):
        """
        A job that names a target device must get that device, even when
        another device of the requested type comes first in the candidates.
        """
        targeted_job = self.submit_job(target='panda01', device_type='panda')
        # panda02 is deliberately listed ahead of the requested panda01.
        candidates = [self.panda02, self.panda01]
        self.assertEqual(self.panda01,
                         find_device_for_job(targeted_job, candidates))

    def test_offline_health_check(self):
        """
        Health check jobs can be initiated for devices that are OFFLINE, and
        the next scheduler tick returns them with devices assigned.
        """
        self.panda.health_check_job = self.factory.make_job_json(
            health_check='true')
        self.panda.save()

        pandas = (self.panda01, self.panda02)
        for device in pandas:
            device.state_transition_to(Device.OFFLINE)
        for device in pandas:
            Device.initiate_health_check_job(device)

        jobs = self.scheduler_tick()

        self.assertEqual(2, len(jobs))
        self.assertTrue(all(job.actual_device is not None for job in jobs))
        # NOTE(review): these are the instances created in setUp; they are
        # not reloaded from the database after the tick.
        for device in pandas:
            self.assertEqual(device.status, Device.OFFLINE)

    def test_find_device_for_job_with_tag(self):
        """
        A beaglebone job requesting the common tag must be given black01.

        Among the candidates, the panda and the arndale have the wrong
        device type and black03 lacks the common tag.  black02 would match
        as well but is left out of the candidate list.
        """
        tagged_job = self.submit_job(device_type='beaglebone',
                                     tags=[self.common_tag.name])
        pool = [self.panda01, self.arndale02, self.black01, self.black03]
        self.assertEqual(self.black01, find_device_for_job(tagged_job, pool))

    def test_find_device_for_devices_without_tags(self):
        """
        Tags on other devices must not get in the way of an untagged job:
        an arndale job without tags is given arndale02.  Submitting an
        arndale job that requires the common tag must be refused, since no
        arndale carries it.
        """
        untagged_job = self.submit_job(device_type='arndale', tags=[])
        pool = [self.panda01, self.arndale02, self.black01, self.black03]
        self.assertEqual(self.arndale02,
                         find_device_for_job(untagged_job, pool))

        refused = False
        try:
            self.submit_job(device_type='arndale',
                            tags=[self.common_tag.name])
        except DevicesUnavailableException:
            refused = True
        if not refused:
            self.fail(
                "Offered an arndale when no arndale support the requested tags"
            )

    def test_find_device_for_job_with_multiple_tags(self):
        """
        A beaglebone job requesting both the common and the unique tag must
        always be given black02, the only device carrying both, whatever
        else is in the candidate list.  A panda job with the same tags must
        be refused at submission.
        """
        def both_tags():
            # A new list per submission, as in separate literal arguments.
            return [self.common_tag.name, self.unique_tag.name]

        job = self.submit_job(device_type='beaglebone', tags=both_tags())
        pool = [self.panda01, self.black01, self.black02, self.black03]
        self.assertEqual(self.black02, find_device_for_job(job, pool))

        refused = False
        try:
            self.submit_job(device_type='panda', tags=both_tags())
        except DevicesUnavailableException:
            refused = True
        if not refused:
            self.fail(
                "Offered a panda when no pandas support the requested tags")

        # The beaglebone job from above is matched against two more pools.
        remaining_pools = (
            [self.black01, self.black02, self.black03],
            [self.arndale02, self.panda02, self.black02, self.black03],
        )
        for pool in remaining_pools:
            self.assertEqual(self.black02, find_device_for_job(job, pool))

    def test_find_device_with_single_job_tag(self):
        """
        A job may request fewer tags than a device carries.  Asking only for
        the unique tag must select black02 (the sole holder of that tag),
        and asking only for the exclusion tag must select black03.
        """
        def pick_for(tag):
            # Submit a beaglebone job needing ``tag`` and choose among one
            # panda and two beaglebones.
            job = self.submit_job(device_type='beaglebone', tags=[tag.name])
            pool = [self.panda02, self.black02, self.black03]
            return find_device_for_job(job, pool)

        self.assertEqual(self.black02, pick_for(self.unique_tag))
        self.assertEqual(self.black03, pick_for(self.exclusion_tag))

    def _test_basic_vm_groups_scheduling(self):
        """
        Submit a VM group made of an arndale host and two kvm-arm VMs
        (server and client) and expect one scheduler tick to return three
        jobs.

        NOTE(review): the name starts with an underscore, so default
        unittest discovery will not run it -- presumably disabled on
        purpose; confirm before re-enabling.
        """
        self.factory.ensure_device_type(name='kvm-arm')
        self.factory.ensure_device_type(name='dynamic-vm')

        def vm(role):
            return {"device_type": "kvm-arm", "role": role}

        group = {
            "host": {"device_type": "arndale", "role": "host"},
            "vms": [vm("server"), vm("client")],
        }
        self.submit_job(vm_group=group)

        self.assertEqual(3, len(self.scheduler_tick()))