Esempio n. 1
0
 def test_rate_limit_while_creating_job(self):
     """Check that the job-submission rate limit cuts a user off, then lets them back in.

     The test is skipped when no submission rate limit is configured, when
     enforcement is turned off, or when the configured bucket size or
     replenishment rate exceeds the bounds this test is willing to work with.
     """
     cook_settings = util.settings(self.cook_url)
     submission_limit = cook_settings['rate-limit']['job-submission']
     if submission_limit is None:
         pytest.skip(
             "Can't test job submission rate limit without submission rate limit set."
         )
     if not submission_limit['enforce?']:
         pytest.skip("Enforcing must be on for test to run")

     user = self.user_factory.new_user()
     bucket_size = submission_limit['bucket-size']
     replenishment_rate = submission_limit['tokens-replenished-per-minute']
     # Overdraw by at least 100 jobs, even when the replenishment rate is lower.
     extra_size = 100 if replenishment_rate < 100 else replenishment_rate
     if bucket_size > 3000 or extra_size > 1000:
         pytest.skip(
             "Job submission rate limit test would require making too many or too few jobs to run the test."
         )

     with user:
         submitted_uuids = []
         try:
             # First, empty most but not all of the token bucket.
             first_uuids, first_resp = util.submit_jobs(
                 self.cook_url, {}, bucket_size - 60)
             submitted_uuids.extend(first_uuids)
             self.assertEqual(first_resp.status_code, 201)

             # Then submit extra_size + 60 more jobs to drive the balance
             # well below zero.
             overdraft_uuids, overdraft_resp = util.submit_jobs(
                 self.cook_url, {}, extra_size + 60)
             submitted_uuids.extend(overdraft_uuids)
             self.assertEqual(overdraft_resp.status_code, 201)

             # A further request is expected to be rejected.
             _, rejected_resp = util.submit_jobs(self.cook_url, {}, 10)
             self.assertEqual(rejected_resp.status_code, 400)
             # The message ends with a timestamp that varies, so only the
             # fixed prefix is compared.
             expected_prefix = f'User {user.name} is inserting too quickly. Not allowed to insert for'
             actual_error = rejected_resp.json()['error']
             self.assertEqual(actual_error[:len(expected_prefix)],
                              expected_prefix)

             # Sleep long enough that, at the configured per-minute rate,
             # roughly 70/60 of extra_size tokens should have been earned back.
             recovery_seconds = 70.0 * extra_size / replenishment_rate
             time.sleep(recovery_seconds)
             final_uuids, final_resp = util.submit_jobs(self.cook_url, {}, 10)
             submitted_uuids.extend(final_uuids)
             self.assertEqual(final_resp.status_code, 201)
         finally:
             # Always clean up whatever was successfully submitted.
             util.kill_jobs(self.cook_url, submitted_uuids)
Esempio n. 2
0
    def test_container_submit_no_image(self):
        """Submit a job that has a port mapping but no container image, then connect to it.

        The job publishes the output of ``hostname -I`` as its progress message
        and then listens with ``nc`` on the job port; the test connects to that
        address and sends a short message. Skipped when the settings expose no
        default containers.
        """
        cook_settings = util.settings(self.cook_url)
        has_default_containers = ('pools' in cook_settings
                                  and 'default-containers' in cook_settings['pools'])
        if not has_default_containers:
            self.skipTest("Test requires default containers")

        listen_port = 30030
        cook_url = type(self).cook_url
        progress_env_var = util.retrieve_progress_file_env(cook_url)
        # The progress percentage (50) is arbitrary; only the message matters.
        report_address_cmd = util.progress_line(cook_url,
                                                50,
                                                '$(hostname -I)',
                                                write_to_file=True)

        tcp_mapping = DockerPortMapping(host_port=0,
                                        container_port=listen_port,
                                        protocol='tcp')
        container = DockerContainer(port_mapping=[tcp_mapping])
        job_command = f'{report_address_cmd} && nc -l -p {listen_port} $(hostname -I)'
        job_uuid = self.client.submit(command=job_command,
                                      container=container,
                                      env={progress_env_var: 'progress.txt'},
                                      max_retries=5,
                                      pool=util.default_submit_pool())

        addr = None
        try:
            util.wait_for_instance_with_progress(cook_url, str(job_uuid), 50)
            job = self.client.query(job_uuid)
            addr = job.instances[0].progress_message

            self.assertIsNotNone(addr)

            payload = b"hello world!"
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                sock.connect((addr, listen_port))
                bytes_sent = sock.send(payload)
                self.assertEqual(bytes_sent, len(payload))
        except Exception as e:
            # Without an address there is nothing useful to add to the error.
            if addr is None:
                raise e
            raise Exception(f"Could not connect to {addr}: {e}") from e
        finally:
            self.client.kill(job_uuid)
Esempio n. 3
0
    def trigger_preemption(self, pool):
        """
        Provoke a rebalancer preemption in the given pool (``pool`` may be None).

        The sequence is:

        1. Pick a fresh user, X
        2. Set X's cpu share to 0.1 and cpu quota to 1.0
        3. Submit job J1 as X with 1.0 cpu at priority 99 (uses the whole cpu quota)
        4. Wait for J1 to be running
        5. Submit job J2 as X with 0.1 cpu at priority 100
        6. Poll until J1 has an instance marked as preempted by the rebalancer

        All submitted jobs are killed and X's share and quota are reset on exit,
        whether or not the preemption was observed.
        """
        admin = self.user_factory.admin()
        user = self.user_factory.new_user()
        submitted_uuids = []
        try:
            small_cpus = 0.1
            large_cpus = small_cpus * 10
            with admin:
                # Shrink the user's cpu share and quota
                for limit_type, cpus in (('share', small_cpus),
                                         ('quota', large_cpus)):
                    util.set_limit(self.cook_url,
                                   limit_type,
                                   user.name,
                                   cpus=cpus,
                                   pool=pool)

            with user:
                base_priority = 99
                command = 'sleep 600'

                # The large job consumes the user's entire cpu quota
                uuid_large, _ = util.submit_job(self.cook_url,
                                                priority=base_priority,
                                                cpus=large_cpus,
                                                command=command,
                                                pool=pool)
                submitted_uuids.append(uuid_large)
                util.wait_for_running_instance(self.cook_url, uuid_large)

                # The small, higher-priority job should force the large one out
                uuid_high_priority, _ = util.submit_job(
                    self.cook_url,
                    priority=base_priority + 1,
                    cpus=small_cpus,
                    command=command,
                    name='higher_priority_job',
                    pool=pool)
                submitted_uuids.append(uuid_high_priority)

                def low_priority_job():
                    # Load the large job and, for diagnostics, log the user's
                    # running and waiting jobs from the last hour.
                    job = util.load_job(self.cook_url, uuid_large)
                    start = util.current_milli_time() - 60 * 60 * 1000
                    end = util.current_milli_time()
                    window = dict(user=user.name, start=start, end=end)
                    running = util.jobs(self.cook_url,
                                        state='running',
                                        **window).json()
                    waiting = util.jobs(self.cook_url,
                                        state='waiting',
                                        **window).json()
                    self.logger.info(
                        f'Currently running jobs: {json.dumps(running, indent=2)}'
                    )
                    self.logger.info(
                        f'Currently waiting jobs: {json.dumps(waiting, indent=2)}'
                    )
                    return job

                def job_was_preempted(job):
                    # True as soon as any instance carries the rebalancer's reason.
                    for instance in job['instances']:
                        self.logger.debug(
                            f'Checking if instance was preempted: {instance}')
                        reason = instance.get('reason_string')
                        if reason == 'Preempted by rebalancer':
                            return True
                    self.logger.info(f'Job has not been preempted: {job}')
                    return False

                # Allow one and a half rebalancer intervals for the preemption
                rebalancer_settings = util.settings(self.cook_url)['rebalancer']
                max_wait_ms = rebalancer_settings['interval-seconds'] * 1000 * 1.5
                self.logger.info(
                    f'Waiting up to {max_wait_ms} milliseconds for preemption to happen'
                )
                util.wait_until(low_priority_job,
                                job_was_preempted,
                                max_wait_ms=max_wait_ms,
                                wait_interval_ms=5000)
        finally:
            with admin:
                util.kill_jobs(self.cook_url,
                               submitted_uuids,
                               assert_response=False)
                for limit_type in ('share', 'quota'):
                    util.reset_limit(self.cook_url,
                                     limit_type,
                                     user.name,
                                     reason=self.current_name(),
                                     pool=pool)
Esempio n. 4
0
    def test_rate_limit_launching_jobs(self):
        """Check that the job-launch rate limit holds jobs back once the token bucket is spent.

        An initial batch of ``bucket_size - 1`` jobs is submitted, then further
        batches are submitted through ``util.wait_until`` until at least a
        bucket's worth of jobs is running, fewer than
        ``bucket_size + token_rate / 2`` are running, and some jobs are still
        waiting. The final assertions check that same range.

        The user can be overridden with the ``COOK_LAUNCH_RATE_LIMIT_NAME``
        environment variable. The test is skipped when no launch rate limit is
        configured, when enforcement is off, or when the token rate or bucket
        size is outside the range the test has been validated for.
        """
        settings = util.settings(self.cook_url)
        if settings['rate-limit']['job-launch'] is None:
            pytest.skip(
                "Can't test job launch rate limit without launch rate limit set."
            )

        # Allow an environmental variable override.
        name = os.getenv('COOK_LAUNCH_RATE_LIMIT_NAME')
        if name is not None:
            user = self.user_factory.user_class(name)
        else:
            user = self.user_factory.new_user()

        if not settings['rate-limit']['job-launch']['enforce?']:
            pytest.skip("Enforcing must be on for test to run")
        bucket_size = settings['rate-limit']['job-launch']['bucket-size']
        token_rate = settings['rate-limit']['job-launch'][
            'tokens-replenished-per-minute']
        # In some environments, e.g., minimesos, we can only launch so many concurrent jobs.
        if token_rate < 5 or token_rate > 20:
            pytest.skip(
                "Job launch rate limit test is only validated to reliably work correctly with certain token rates."
            )
        if bucket_size < 10 or bucket_size > 20:
            pytest.skip(
                "Job launch rate limit test is only validated to reliably work correctly with certain token bucket sizes."
            )

        # Exclusive upper bound on running jobs: a full bucket plus some extra
        # launched from replenished tokens, but no more.
        max_running = bucket_size + token_rate / 2

        with user:
            job_uuids = []
            try:
                jobspec = {"command": "sleep 240", 'cpus': 0.03, 'mem': 32}

                self.logger.info(
                    f'Submitting initial batch of {bucket_size-1} jobs')
                initial_uuids, initial_response = util.submit_jobs(
                    self.cook_url, jobspec, bucket_size - 1)
                job_uuids.extend(initial_uuids)
                self.assertEqual(201,
                                 initial_response.status_code,
                                 msg=initial_response.content)

                def submit_jobs():
                    self.logger.info(
                        f'Submitting subsequent batch of {bucket_size-1} jobs')
                    subsequent_uuids, subsequent_response = util.submit_jobs(
                        self.cook_url, jobspec, bucket_size - 1)
                    job_uuids.extend(subsequent_uuids)
                    self.assertEqual(201,
                                     subsequent_response.status_code,
                                     msg=subsequent_response.content)

                def running_and_waiting_jobs():
                    # Query every job submitted so far and split by status.
                    jobs = util.query_jobs(self.cook_url,
                                           True,
                                           uuid=job_uuids).json()
                    running = [j for j in jobs if j['status'] == 'running']
                    waiting = [j for j in jobs if j['status'] == 'waiting']
                    return running, waiting

                def is_rate_limit_triggered(_):
                    running_jobs, waiting_jobs = running_and_waiting_jobs()
                    self.logger.debug(
                        f'There are {len(waiting_jobs)} waiting jobs')
                    # We should only see a bucket + some extra running. No more.
                    return (bucket_size <= len(running_jobs) < max_running
                            and len(waiting_jobs) > 0)

                util.wait_until(submit_jobs, is_rate_limit_triggered)

                # Assert the same range the predicate accepted. Requiring
                # exactly bucket_size running jobs would reject states the
                # predicate has just treated as success (for example one extra
                # job launched from a replenished token).
                running_jobs, _waiting_jobs = running_and_waiting_jobs()
                self.assertGreaterEqual(len(running_jobs), bucket_size)
                self.assertLess(len(running_jobs), max_running)
            finally:
                util.kill_jobs(self.cook_url, job_uuids)
Esempio n. 5
0
    def test_preemption(self):
        """Check that a higher-priority job causes the rebalancer to preempt a running one.

        A fresh user gets a cpu share of 0.1 and a cpu quota of 1.0. A 1.0-cpu
        job at priority 99 is started, then a 0.1-cpu job at priority 100 is
        submitted, and the test polls until the first job has an instance whose
        reason string is 'Preempted by rebalancer'. Jobs are killed and the
        user's share and quota are reset on exit.
        """
        admin = self.user_factory.admin()
        user = self.user_factory.new_user()
        submitted_uuids = []
        try:
            small_cpus = 0.1
            large_cpus = small_cpus * 10
            with admin:
                # Shrink the user's cpu share and quota
                for limit_type, cpus in (('share', small_cpus),
                                         ('quota', large_cpus)):
                    util.set_limit(self.cook_url,
                                   limit_type,
                                   user.name,
                                   cpus=cpus)

            with user:
                base_priority = 99
                command = 'sleep 600'

                # The large job consumes the user's entire cpu quota
                uuid_large, _ = util.submit_job(self.cook_url,
                                                priority=base_priority,
                                                cpus=large_cpus,
                                                command=command)
                submitted_uuids.append(uuid_large)
                util.wait_for_running_instance(self.cook_url, uuid_large)

                # The small, higher-priority job should force the large one out
                uuid_high_priority, _ = util.submit_job(
                    self.cook_url,
                    priority=base_priority + 1,
                    cpus=small_cpus,
                    command=command,
                    name='higher_priority_job')
                submitted_uuids.append(uuid_high_priority)

                def low_priority_job():
                    # Load the large job and, for diagnostics, log the user's
                    # running and waiting jobs from the last hour.
                    job = util.load_job(self.cook_url, uuid_large)
                    start = util.current_milli_time() - 60 * 60 * 1000
                    end = util.current_milli_time()
                    window = dict(user=user.name, start=start, end=end)
                    running = util.jobs(self.cook_url,
                                        state='running',
                                        **window).json()
                    waiting = util.jobs(self.cook_url,
                                        state='waiting',
                                        **window).json()
                    self.logger.info(
                        f'Currently running jobs: {json.dumps(running, indent=2)}'
                    )
                    self.logger.info(
                        f'Currently waiting jobs: {json.dumps(waiting, indent=2)}'
                    )
                    return job

                def job_was_preempted(job):
                    # True as soon as any instance carries the rebalancer's reason.
                    for instance in job['instances']:
                        self.logger.debug(
                            f'Checking if instance was preempted: {instance}')
                        reason = instance.get('reason_string')
                        if reason == 'Preempted by rebalancer':
                            return True
                    self.logger.info(f'Job has not been preempted: {job}')
                    return False

                # Allow one and a half rebalancer intervals for the preemption
                rebalancer_settings = util.settings(self.cook_url)['rebalancer']
                max_wait_ms = rebalancer_settings['interval-seconds'] * 1000 * 1.5
                self.logger.info(
                    f'Waiting up to {max_wait_ms} milliseconds for preemption to happen'
                )
                util.wait_until(low_priority_job,
                                job_was_preempted,
                                max_wait_ms=max_wait_ms,
                                wait_interval_ms=5000)
        finally:
            with admin:
                util.kill_jobs(self.cook_url,
                               submitted_uuids,
                               assert_response=False)
                for limit_type in ('share', 'quota'):
                    util.reset_limit(self.cook_url,
                                     limit_type,
                                     user.name,
                                     reason=self.current_name())