def test_cancel_job(self):
    """Delete a running job via the rawscheduler endpoint and check it is reported as failed."""
    job_uuid, _ = util.submit_job(self.cook_url, command='sleep 300')
    util.wait_for_job(self.cook_url, job_uuid, 'running')

    # The same URL is used both to delete the job and to read it back.
    delete_resp = util.session.delete('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid))
    self.assertEqual(204, delete_resp.status_code)

    fetched_jobs = util.session.get('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid)).json()
    cancelled_job = fetched_jobs[0]
    self.assertEqual('failed', cancelled_job['state'])
def gpu_submit_helper(self, pool_name, gpu_count, gpu_model):
    """Submit a GPU job to ``pool_name`` and verify it sees the requested GPUs.

    The job's command runs ``nvidia-smi -q``, counts the "Attached GPUs" value
    and the number of output lines matching the expected model name, and exits
    0 only when both equal ``gpu_count``.

    Args:
        pool_name: pool the job is submitted to.
        gpu_count: number of GPUs requested (also the count the command expects).
        gpu_model: GPU model identifier, e.g. ``nvidia-tesla-p100``; passed to
            the job through the ``COOK_GPU_MODEL`` environment variable.

    Raises:
        Exception: if waiting for the job or the success assertion fails; the
            original error is chained as the cause.
    """
    # Derive the human-readable model name grepped for in the nvidia-smi output.
    # str.lstrip('nvidia-') strips any leading run of the *characters*
    # n/v/i/d/a/- rather than the prefix, which mangles models such as
    # 'nvidia-a100' (-> '100'), so remove the prefix explicitly.
    vendor_prefix = 'nvidia-'
    if gpu_model.startswith(vendor_prefix):
        bare_model = gpu_model[len(vendor_prefix):]
    else:
        bare_model = gpu_model
    query_model_name = bare_model.replace('-', ' ').title()

    command = (
        '/usr/bin/nvidia-smi && /usr/bin/nvidia-smi -q > nvidia-smi-output && '
        f'cat nvidia-smi-output; expected_model="{query_model_name}"; '
        'num_gpus=$(grep "Attached GPUs" nvidia-smi-output | cut -d \':\' -f 2 | tr -d \'[:space:]\'); echo "num_gpus=$num_gpus"; '
        'num_expected_model=$(grep "$expected_model" nvidia-smi-output | wc -l); echo "num_expected_model=$num_expected_model"; '
        f'if [[ $num_gpus -eq {gpu_count} && $num_expected_model -eq {gpu_count} ]]; then exit 0; else exit 1; fi'
    )
    uuid = self.client.submit(command=command,
                              cpus=0.5,
                              mem=256.0,
                              pool=pool_name,
                              gpus=gpu_count,
                              env={'COOK_GPU_MODEL': gpu_model},
                              max_retries=5)
    try:
        util.wait_for_job(type(self).cook_url, uuid, 'completed')
        job = self.client.query(uuid)
        self.assertEqual(JobState.SUCCESS, job.state)
    except Exception as e:
        # Re-raise with the pool/model in the message so failures are attributable.
        raise Exception(
            f"Submitting job with GPU {gpu_model} to pool {pool_name} failed"
        ) from e
    finally:
        # Always clean up, whether or not the job finished.
        self.client.kill(uuid)
def test_cancel_instance(self):
    """Delete the first instance of a running job and check the job still ends in success.

    The job is submitted with max_retries=2.
    """
    job_uuid, _ = util.submit_job(self.cook_url, command='sleep 10', max_retries=2)
    running_job = util.wait_for_job(self.cook_url, job_uuid, 'running')
    first_instance = running_job['instances'][0]
    task_id = first_instance['task_id']

    instance_endpoint = '%s/rawscheduler?instance=%s' % (self.cook_url, task_id)
    delete_resp = util.session.delete(instance_endpoint)
    self.assertEqual(204, delete_resp.status_code)

    finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    self.assertEqual('success', finished_job['state'])
def test_basic_submit(self):
    """Submit a default job to each of the two Cook URLs and check both instances succeed."""
    first_uuid, submit_resp = util.submit_job(self.cook_url_1)
    self.assertEqual(submit_resp.status_code, 201)
    second_uuid, submit_resp = util.submit_job(self.cook_url_2)
    self.assertEqual(submit_resp.status_code, 201)

    # Both jobs are submitted before either one is awaited.
    first_job = util.wait_for_job(self.cook_url_1, first_uuid, 'completed')
    first_instance = first_job['instances'][0]
    self.assertEqual('success', first_instance['status'])

    second_job = util.wait_for_job(self.cook_url_2, second_uuid, 'completed')
    second_instance = second_job['instances'][0]
    self.assertEqual('success', second_instance['status'])
def test_change_retries(self):
    """Kill a running job, raise its retries through /retry, and check it runs to success."""
    job_uuid, _ = util.submit_job(self.cook_url, command='sleep 10')
    util.wait_for_job(self.cook_url, job_uuid, 'running')

    def fetch_job():
        # Read the job back from the rawscheduler endpoint.
        return util.session.get('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid)).json()[0]

    # Kill the job and confirm it is reported as failed.
    kill_resp = util.session.delete('%s/rawscheduler?job=%s' % (self.cook_url, job_uuid))
    self.assertEqual(204, kill_resp.status_code)
    killed_job = fetch_job()
    self.assertEqual('failed', killed_job['state'])

    # Update the retries; the job is expected to go back to waiting.
    retry_payload = {'retries': 2, 'jobs': [job_uuid]}
    retry_resp = util.session.put('%s/retry' % self.cook_url, json=retry_payload)
    self.assertEqual(201, retry_resp.status_code, retry_resp.text)
    requeued_job = fetch_job()
    self.assertEqual('waiting', requeued_job['status'])

    finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    self.assertEqual('success', finished_job['state'])
def test_allow_partial(self):
    """Check the ``partial`` query flag for job and instance lookups.

    Querying a mix of known and unknown uuids must return 404 (naming the
    unknown uuid in the error) unless ``partial='true'`` is passed, in which
    case the known jobs are returned with a 200.
    """
    def absent_uuids(response):
        # Pull every uuid-looking token out of the response's error message.
        return [part for part in response.json()['error'].split() if util.is_valid_uuid(part)]

    job_uuid_1, resp = util.submit_job(self.cook_url)
    self.assertEqual(201, resp.status_code)
    job_uuid_2, resp = util.submit_job(self.cook_url)
    self.assertEqual(201, resp.status_code)
    expected_job_uuids = sorted([job_uuid_1, job_uuid_2])

    # Only valid job uuids
    resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2])
    self.assertEqual(200, resp.status_code)

    # Mixed valid, invalid job uuids
    bogus_uuid = str(uuid.uuid4())
    resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid])
    self.assertEqual(404, resp.status_code)
    self.assertEqual([bogus_uuid], absent_uuids(resp))
    resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid], partial='false')
    self.assertEqual(404, resp.status_code, resp.json())
    self.assertEqual([bogus_uuid], absent_uuids(resp))

    # Partial results with mixed valid, invalid job uuids
    resp = util.query_jobs(self.cook_url, job=[job_uuid_1, job_uuid_2, bogus_uuid], partial='true')
    self.assertEqual(200, resp.status_code, resp.json())
    self.assertEqual(2, len(resp.json()))
    # list.sort() returns None, so comparing two .sort() results always passes;
    # sorted() compares the actual uuid lists regardless of response order.
    self.assertEqual(expected_job_uuids, sorted(job['uuid'] for job in resp.json()))

    # Only valid instance uuids
    job = util.wait_for_job(self.cook_url, job_uuid_1, 'completed')
    instance_uuid_1 = job['instances'][0]['task_id']
    job = util.wait_for_job(self.cook_url, job_uuid_2, 'completed')
    instance_uuid_2 = job['instances'][0]['task_id']
    resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2])
    self.assertEqual(200, resp.status_code)

    # Mixed valid, invalid instance uuids
    resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid])
    self.assertEqual(404, resp.status_code)
    self.assertEqual([bogus_uuid], absent_uuids(resp))
    resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid], partial='false')
    self.assertEqual(404, resp.status_code)
    self.assertEqual([bogus_uuid], absent_uuids(resp))

    # Partial results with mixed valid, invalid instance uuids
    resp = util.query_jobs(self.cook_url, instance=[instance_uuid_1, instance_uuid_2, bogus_uuid], partial='true')
    self.assertEqual(200, resp.status_code)
    self.assertEqual(2, len(resp.json()))
    self.assertEqual(expected_job_uuids, sorted(job['uuid'] for job in resp.json()))
def test_explicit_group(self):
    """Submit two jobs that name the same group and check both report that group."""
    group_spec = self.minimal_group()
    first_spec = util.minimal_job(group=group_spec["uuid"])
    second_spec = util.minimal_job(group=group_spec["uuid"])

    payload = {'jobs': [first_spec, second_spec], 'groups': [group_spec]}
    submit_resp = util.session.post('%s/rawscheduler' % self.cook_url, json=payload)
    self.assertEqual(submit_resp.status_code, 201)

    query_url = '%s/rawscheduler?job=%s&job=%s' % (self.cook_url, first_spec['uuid'], second_spec['uuid'])
    query_resp = util.session.get(query_url)
    self.assertEqual(200, query_resp.status_code)
    fetched = query_resp.json()
    # Each returned job must list the submitted group as its first group.
    for index in (0, 1):
        self.assertEqual(group_spec['uuid'], fetched[index]['groups'][0])

    # Let both jobs finish before the test returns.
    for spec in (first_spec, second_spec):
        util.wait_for_job(self.cook_url, spec['uuid'], 'completed')
def test_max_runtime_exceeded(self):
    """Submit a job that sleeps far longer than its max_runtime and expect one failed instance.

    The single instance must have status 'failed' and reason_code 2003.
    """
    timeout_interval_minutes = util.get_in(self.settings(), 'task-constraints', 'timeout-interval-minutes')
    # The sleep needs to be a little more than 2 times the configured timeout
    # interval to allow at least two runs of the lingering task killer.
    sleep_seconds = (2 * timeout_interval_minutes * 60) + 15
    sleep_command = 'sleep %s' % sleep_seconds

    job_uuid, submit_resp = util.submit_job(self.cook_url, command=sleep_command, max_runtime=5000)
    self.assertEqual(201, submit_resp.status_code)

    job = util.wait_for_job(self.cook_url, job_uuid, 'completed', sleep_seconds * 1000)
    instances = job['instances']
    self.assertEqual(1, len(instances))
    self.assertEqual('failed', job['instances'][0]['status'])
    self.assertEqual(2003, job['instances'][0]['reason_code'])
def test_expected_runtime_field(self):
    """Check expected_runtime is accepted and echoed back, and rejected when above max_runtime."""
    # Should support expected_runtime
    accepted_runtime = 1
    job_uuid, submit_resp = util.submit_job(self.cook_url, expected_runtime=accepted_runtime)
    self.assertEqual(submit_resp.status_code, 201)
    finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    first_instance = finished_job['instances'][0]
    self.assertEqual('success', first_instance['status'])
    self.assertEqual(accepted_runtime, finished_job['expected_runtime'])

    # Should disallow expected_runtime > max_runtime
    oversized_runtime = 2
    smaller_max_runtime = oversized_runtime - 1
    _, rejected_resp = util.submit_job(self.cook_url,
                                       expected_runtime=oversized_runtime,
                                       max_runtime=smaller_max_runtime)
    self.assertEqual(rejected_resp.status_code, 400)
def test_straggler_handling(self):
    """Submit a fast and a slow job in a group with quantile-deviation straggler handling.

    The fast job must succeed; the slow one must fail with reason_code 2004
    on its first instance.
    """
    handling_params = {'quantile': 0.5, 'multiplier': 2.0}
    straggler_handling = {'type': 'quantile-deviation', 'parameters': handling_params}
    group_spec = self.minimal_group(straggler_handling=straggler_handling)

    fast_spec = util.minimal_job(group=group_spec["uuid"])
    slow_spec = util.minimal_job(group=group_spec["uuid"], command='sleep 120')
    payload = {'jobs': [fast_spec, slow_spec], 'groups': [group_spec]}
    submit_resp = util.session.post('%s/rawscheduler' % self.cook_url, json=payload)
    self.assertEqual(submit_resp.status_code, 201)

    util.wait_for_job(self.cook_url, fast_spec['uuid'], 'completed')
    util.wait_for_job(self.cook_url, slow_spec['uuid'], 'completed')

    query_url = '%s/rawscheduler?job=%s&job=%s' % (self.cook_url, fast_spec['uuid'], slow_spec['uuid'])
    query_resp = util.session.get(query_url)
    self.assertEqual(200, query_resp.status_code)
    fetched = query_resp.json()
    self.logger.debug('Loaded jobs %s', fetched)

    # Results are checked by position: index 0 is treated as the fast job, index 1 as the slow one.
    self.assertEqual('success', fetched[0]['state'])
    self.assertEqual('failed', fetched[1]['state'])
    self.assertEqual(2004, fetched[1]['instances'][0]['reason_code'])
def test_constraints(self):
    """Check HOSTNAME EQUALS constraints pin jobs to hosts and an unmatched one keeps a job waiting."""
    mesos_state = util.get_mesos_state(self.mesos_url)
    hosts = []
    for agent in mesos_state['slaves']:
        hosts.append(agent['hostname'])

    # Submitted first, with a hostname that matches no agent.
    unsatisfiable = [["HOSTNAME", "EQUALS", "lol won't get scheduled"]]
    bad_job_uuid, submit_resp = util.submit_job(self.cook_url, constraints=unsatisfiable)
    self.assertEqual(submit_resp.status_code, 201, submit_resp.text)

    # One job per known host, each constrained to that host.
    host_to_job_uuid = {}
    for host in hosts:
        job_uuid, submit_resp = util.submit_job(self.cook_url, constraints=[["HOSTNAME", "EQUALS", host]])
        self.assertEqual(submit_resp.status_code, 201, submit_resp.text)
        host_to_job_uuid[host] = job_uuid

    for host, job_uuid in host_to_job_uuid.items():
        finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        ran_on = finished_job['instances'][0]['hostname']
        self.assertEqual(host, ran_on)
        self.assertEqual([["HOSTNAME", "EQUALS", host]], finished_job['constraints'])

    # This job should have been scheduled since the jobs submitted after it have completed;
    # however, its constraint means it won't get scheduled.
    util.wait_for_job(self.cook_url, bad_job_uuid, 'waiting', max_delay=3000)
def test_application_field(self):
    """Check the application field is accepted and echoed back, and that name and version are both required."""
    # Should support application
    full_application = {'name': 'foo-app', 'version': '0.1.0'}
    job_uuid, submit_resp = util.submit_job(self.cook_url, application=full_application)
    self.assertEqual(submit_resp.status_code, 201)
    finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    first_instance = finished_job['instances'][0]
    self.assertEqual('success', first_instance['status'])
    self.assertEqual(full_application, finished_job['application'])

    # Should require application name
    version_only = {'version': '0.1.0'}
    _, rejected_resp = util.submit_job(self.cook_url, application=version_only)
    self.assertEqual(rejected_resp.status_code, 400)

    # Should require application version
    name_only = {'name': 'foo-app'}
    _, rejected_resp = util.submit_job(self.cook_url, application=name_only)
    self.assertEqual(rejected_resp.status_code, 400)
def test_job_delete_permission(self):
    """Check a second user gets 403 when killing another user's job, while the submitter can kill it."""
    owner, other_user = self.user_factory.new_users(2)
    with owner:
        job_uuid, submit_resp = util.submit_job(self.cook_url, command='sleep 30')
    try:
        self.assertEqual(submit_resp.status_code, 201, submit_resp.text)
        target_jobs = [job_uuid]
        # The kill attempt by the non-submitting user is expected to be rejected.
        with other_user:
            util.kill_jobs(self.cook_url, target_jobs, expected_status_code=403)
        with owner:
            util.kill_jobs(self.cook_url, target_jobs)
        killed_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        self.assertEqual('failed', killed_job['state'])
    finally:
        # Cleanup: issue the kill again as the submitter regardless of the outcome above.
        with owner:
            util.kill_jobs(self.cook_url, [job_uuid])
def test_job_delete_permission(self):
    """Check a second user gets 403 (with the expected error text) when killing another user's job.

    The submitting user must then be able to kill the job, which ends as failed.
    """
    owner, other_user = self.user_factory.new_users(2)
    with owner:
        job_uuid, submit_resp = util.submit_job(self.cook_url, command='sleep 30')
    try:
        self.assertEqual(submit_resp.status_code, 201, submit_resp.text)
        # The kill attempt by the non-submitting user is expected to be rejected.
        with other_user:
            denied_resp = util.kill_jobs(self.cook_url, [job_uuid], expected_status_code=403)
            expected_error = f'You are not authorized to kill the following jobs: {job_uuid}'
            self.assertEqual(expected_error, denied_resp.json()['error'])
        with owner:
            util.kill_jobs(self.cook_url, [job_uuid])
        killed_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        self.assertEqual('failed', killed_job['state'])
    finally:
        # Cleanup kill as the submitter, without asserting on the response.
        with owner:
            util.kill_jobs(self.cook_url, [job_uuid], assert_response=False)
def test_group_delete_permission(self):
    """Check a second user gets 403 when killing another user's group, while the submitter can kill it."""
    owner, other_user = self.user_factory.new_users(2)
    with owner:
        group_spec = util.minimal_group()
        group_uuid = group_spec['uuid']
        job_uuid, submit_resp = util.submit_job(self.cook_url, command='sleep 30', group=group_uuid)
    try:
        self.assertEqual(submit_resp.status_code, 201, submit_resp.text)
        target_groups = [group_uuid]
        # The kill attempt by the non-submitting user is expected to be rejected.
        with other_user:
            util.kill_groups(self.cook_url, target_groups, expected_status_code=403)
        with owner:
            util.kill_groups(self.cook_url, target_groups)
        killed_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
        self.assertEqual('failed', killed_job['state'])
    finally:
        # Cleanup kills the job directly, without asserting on the response.
        with owner:
            util.kill_jobs(self.cook_url, [job_uuid], assert_response=False)
def test_get_job(self):
    """Submit a minimal job and check the completed job's JSON has the expected fields.

    Fields copied from the submitted spec must round-trip unchanged; the
    remaining job and instance fields are checked for presence or type.
    """
    # schedule a job
    job_spec = util.minimal_job()
    resp = util.session.post('%s/rawscheduler' % self.cook_url, json={'jobs': [job_spec]})
    self.assertEqual(201, resp.status_code)

    # query for the same job & ensure the response has what it's supposed to have
    job = util.wait_for_job(self.cook_url, job_spec['uuid'], 'completed')

    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    for field in ('mem', 'max_retries', 'name', 'priority', 'uuid', 'cpus'):
        self.assertEqual(job_spec[field], job[field], field)

    # 9223372036854775807 is MAX_LONG(ish), the default value for max_runtime
    self.assertEqual(9223372036854775807, job['max_runtime'])
    self.assertEqual('success', job['state'])
    self.assertEqual('completed', job['status'])
    self.assertIsInstance(job['submit_time'], int)
    # assertIn reports the missing key and the container on failure.
    for key in ('labels', 'env', 'framework_id', 'ports', 'instances', 'uris', 'retries_remaining'):
        self.assertIn(key, job)

    instance = job['instances'][0]
    for key in ('executor_id', 'hostname', 'slave_id', 'ports', 'task_id'):
        self.assertIn(key, instance)
    self.assertIsInstance(instance['start_time'], int)
    self.assertIsInstance(instance['end_time'], int)
    self.assertIsInstance(instance['preempted'], bool)
    self.assertIsInstance(instance['backfilled'], bool)
def test_show_running_job(self):
    """Submit a job through the CLI and check that `show` exits 0 while the job is running."""
    submit_cp, uuids = cli.submit('sleep 60', self.cook_url)
    self.assertEqual(0, submit_cp.returncode, submit_cp.stderr)

    submitted_uuid = uuids[0]
    util.wait_for_job(self.cook_url, submitted_uuid, 'running')

    show_cp = cli.show(uuids, self.cook_url)
    self.assertEqual(0, show_cp.returncode, show_cp.stderr)
def test_failing_submit(self):
    """Submit a job whose command is `exit 1` and expect exactly one instance, with status failed."""
    job_uuid, submit_resp = util.submit_job(self.cook_url, command='exit 1')
    self.assertEqual(201, submit_resp.status_code)

    finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    instances = finished_job['instances']
    self.assertEqual(1, len(instances))
    self.assertEqual('failed', finished_job['instances'][0]['status'])
def test_list_by_state(self):
    """Submit jobs that end up waiting, running, successful and failed, then list them by state.

    All jobs share a unique random name so that listing by name and user
    returns only the jobs created here.
    """
    name = str(uuid.uuid4())

    def submit_named(command, target_status):
        # Submit `command` under the shared name, wait for `target_status`, return the job uuid.
        cp, submitted = cli.submit(command, self.cook_url, submit_flags='--name %s' % name)
        self.assertEqual(0, cp.returncode, cp.stderr)
        util.wait_for_job(self.cook_url, submitted[0], target_status)
        return submitted[0]

    def expect_single(state, expected_uuid):
        # Listing by one state must return exactly the expected job.
        cp, jobs = self.list_jobs(name, user, state)
        self.assertEqual(0, cp.returncode, cp.stderr)
        self.assertEqual(1, len(jobs))
        self.assertEqual(expected_uuid, jobs[0]['uuid'])

    def expect_set(expected_uuids, *states):
        # Listing by the given states must return exactly len(expected_uuids) jobs, containing each one.
        cp, jobs = self.list_jobs(name, user, *states)
        listed = [j['uuid'] for j in jobs]
        self.assertEqual(0, cp.returncode, cp.stderr)
        self.assertEqual(len(expected_uuids), len(jobs))
        for expected_uuid in expected_uuids:
            self.assertIn(expected_uuid, listed)

    # waiting: a raw job whose hostname constraint cannot be satisfied
    raw_job = {
        'command': 'ls',
        'name': name,
        'constraints': [['HOSTNAME', 'EQUALS', 'will not get scheduled']],
    }
    cp, uuids = cli.submit(stdin=cli.encode(json.dumps(raw_job)), cook_url=self.cook_url, submit_flags='--raw')
    user = util.get_user(self.cook_url, uuids[0])
    self.assertEqual(0, cp.returncode, cp.stderr)
    waiting_uuid = uuids[0]
    util.wait_for_job(self.cook_url, waiting_uuid, 'waiting')
    expect_single('waiting', waiting_uuid)

    # running
    running_uuid = submit_named('sleep 60', 'running')
    expect_single('running', running_uuid)

    # completed
    success_uuid = submit_named('ls', 'completed')
    expect_single('completed', success_uuid)

    # success
    expect_single('success', success_uuid)

    # failed
    failed_uuid = submit_named('exit 1', 'completed')
    expect_single('failed', failed_uuid)

    # all
    expect_set([waiting_uuid, running_uuid, success_uuid, failed_uuid], 'all')

    # waiting+running
    expect_set([waiting_uuid, running_uuid], 'waiting', 'running')

    # completed+waiting
    expect_set([waiting_uuid, success_uuid, failed_uuid], 'completed', 'waiting')
def test_basic_submit(self):
    """Submit a default job and check it succeeds with disable_mea_culpa_retries equal to False."""
    job_uuid, submit_resp = util.submit_job(self.cook_url)
    self.assertEqual(submit_resp.status_code, 201)

    finished_job = util.wait_for_job(self.cook_url, job_uuid, 'completed')
    first_instance = finished_job['instances'][0]
    self.assertEqual('success', first_instance['status'])
    self.assertEqual(False, finished_job['disable_mea_culpa_retries'])