def test_container_port_submit(self):
    """Test submitting a job with a port specification."""
    JOB_PORT = 30030
    progress_file_env = util.retrieve_progress_file_env(type(self).cook_url)
    hostname_progress_cmd = util.progress_line(
        type(self).cook_url,
        50,  # Don't really care, we just need a val
        '$(hostname -I)',
        write_to_file=True)
    container = DockerContainer(
        util.docker_image(),
        port_mapping=[
            DockerPortMapping(host_port=0,
                              container_port=JOB_PORT,
                              protocol='tcp')
        ])
    uuid = self.client.submit(
        command=f'{hostname_progress_cmd} && nc -l -p {JOB_PORT} $(hostname -I)',
        container=container,
        env={progress_file_env: 'progress.txt'},
        max_retries=5,
        pool=util.default_submit_pool())
    addr = None
    try:
        util.wait_for_instance_with_progress(type(self).cook_url, str(uuid), 50)
        job = self.client.query(uuid)
        addr = job.instances[0].progress_message
        self.assertIsNotNone(addr)
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.connect((addr, JOB_PORT))
            message = b"hello world!"
            self.assertEqual(sock.send(message), len(message))
    except Exception as e:
        if addr is not None:
            raise Exception(f"Could not connect to {addr}: {e}") from e
        else:
            raise e
    finally:
        self.client.kill(uuid)
def test_bulk_submit_explicit_none(self):
    """Test that a bulk submit accepts a jobspec whose container is explicitly None."""
    jobspecs = [{
        'command': 'echo "Hello World!"',
        'mem': 256.0,
        'container': None
    }]
    uuids = self.client.submit_all(jobspecs, pool=util.default_submit_pool())
    try:
        jobs = self.client.query_all(uuids)
        self.assertEqual(jobs[0].uuid, uuids[0])
        self.assertEqual(jobs[0].command, jobspecs[0]['command'])
    finally:
        self.client.kill_all(uuids)
def test_instance_query(self):
    """Test that parsing an instance yielded from Cook works."""
    uuid = self.client.submit(command=f'sleep {util.DEFAULT_TEST_TIMEOUT_SECS}',
                              cpus=0.5,
                              mem=1.0,
                              max_retries=5,
                              pool=util.default_submit_pool())
    try:
        util.wait_for_instance(type(self).cook_url, uuid)
        job = self.client.query(uuid)
        self.assertNotEqual(job.instances, [])
        self.assertIsNotNone(job.instances[0])
    finally:
        self.client.kill(uuid)
def test_kill(self):
    """Test that killing a waiting or running job moves it to COMPLETED."""
    uuid = self.client.submit(command=f'sleep {util.DEFAULT_TEST_TIMEOUT_SECS}',
                              cpus=0.5,
                              mem=1.0,
                              max_retries=5,
                              pool=util.default_submit_pool())
    killed = False
    try:
        job = self.client.query(uuid)
        # Ensure the job is either waiting or running
        self.assertNotEqual(job.status, JobStatus.COMPLETED)
        self.client.kill(uuid)
        killed = True
        job = self.client.query(uuid)
        self.assertEqual(job.status, JobStatus.COMPLETED)
    finally:
        if not killed:
            self.client.kill(uuid)
def test_bulk_ops(self):
    """Test bulk submit, query, and kill operations."""
    jobspecs = [{
        'command': 'ls'
    }, {
        'command': 'echo "Hello World!"',
        'mem': 256.0
    }]
    uuids = self.client.submit_all(jobspecs, pool=util.default_submit_pool())
    try:
        jobs = self.client.query_all(uuids)
        self.assertEqual(jobs[0].uuid, uuids[0])
        self.assertEqual(jobs[0].command, jobspecs[0]['command'])
        self.assertEqual(jobs[1].uuid, uuids[1])
        self.assertEqual(jobs[1].command, jobspecs[1]['command'])
        self.assertEqual(jobs[1].mem, jobspecs[1]['mem'])
    finally:
        self.client.kill_all(uuids)
def submit(command=None, cook_url=None, flags=None, submit_flags=None, stdin=None):
    """Submits one job via the CLI"""
    default_pool = util.default_submit_pool()
    if default_pool:
        message = f'Submitting explicitly to the {default_pool} pool (set as default)'
        if not submit_flags:
            submit_flags = f'--pool {default_pool}'
            logger.info(message)
        elif '--pool' not in submit_flags:
            submit_flags += f' --pool {default_pool}'
            logger.info(message)
    args = 'submit %s%s' % (submit_flags + ' ' if submit_flags else '',
                            command if command else '')
    cp = cli(args, cook_url, flags, stdin)
    uuids = [s for s in stdout(cp).split() if len(s) == 36 and util.is_valid_uuid(s)]
    return cp, uuids
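# A minimal usage sketch (illustrative, not part of the source above): it assumes this
# helper is importable alongside `util` and that a Cook scheduler is reachable at
# `cook_url`; the function name below is hypothetical.
def example_cli_submit(cook_url):
    cp, uuids = submit('ls', cook_url)
    # The CLI should exit cleanly and print exactly one valid job UUID.
    assert cp.returncode == 0, cp.stderr
    assert len(uuids) == 1 and util.is_valid_uuid(uuids[0])
    return uuids[0]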
def test_checkpoint_locality(self):
    """
    Test that restored instances run in the same location as their checkpointed instances.
    """
    # Get the set of clusters that correspond to the pool under test and are running
    pool = util.default_submit_pool()
    clusters = util.compute_clusters(self.cook_url)
    running_clusters = [
        c for c in clusters['in-mem-configs']
        if pool in c['cluster-definition']['config']['synthetic-pods']['pools']
        and c['state'] == 'running'
    ]
    self.logger.info(f'Running clusters for pool {pool}: {running_clusters}')
    if len(running_clusters) == 0:
        self.skipTest(f'Requires at least 1 running compute cluster for pool {pool}')

    # Submit an initial canary job
    job_uuid, resp = util.submit_job(self.cook_url, pool=pool, command='true')
    self.assertEqual(201, resp.status_code, resp.content)
    util.wait_for_instance(self.cook_url, job_uuid, status='success', indent=None)

    # Submit a long-running job with checkpointing
    checkpoint_job_uuid, resp = util.submit_job(
        self.cook_url,
        pool=pool,
        command=f'sleep {util.DEFAULT_TEST_TIMEOUT_SECS}',
        max_retries=5,
        checkpoint={'mode': 'auto'})
    self.assertEqual(201, resp.status_code, resp.content)
    try:
        # Wait for the job to be running
        checkpoint_instance = util.wait_for_instance(self.cook_url,
                                                     checkpoint_job_uuid,
                                                     status='running',
                                                     indent=None)
        checkpoint_instance_uuid = checkpoint_instance['task_id']
        checkpoint_location = next(
            c['location'] for c in running_clusters
            if c['name'] == checkpoint_instance['compute-cluster']['name'])

        admin = self.user_factory.admin()
        try:
            # Force all clusters in the instance's location to have state = draining
            with admin:
                for cluster in running_clusters:
                    if cluster['location'] == checkpoint_location:
                        cluster_update = dict(cluster)
                        # Set state = draining
                        cluster_update['state'] = 'draining'
                        cluster_update['state-locked?'] = True
                        # The location, cluster-definition, and features fields cannot be sent in the update
                        cluster_update.pop('location', None)
                        cluster_update.pop('cluster-definition', None)
                        cluster_update.pop('features', None)
                        self.logger.info(f'Trying to update cluster to draining: {cluster_update}')
                        util.wait_until(
                            lambda: util.update_compute_cluster(self.cook_url, cluster_update)[1],
                            lambda response: response.status_code == 201 and len(response.json()) > 0)
                    else:
                        self.logger.info(
                            f'Not updating cluster - not in location {checkpoint_location}: {cluster}')

            # Kill the running checkpoint job instance
            util.kill_instance(self.cook_url, checkpoint_instance_uuid)

            # Submit another canary job
            job_uuid, resp = util.submit_job(self.cook_url, pool=pool, command='true')
            self.assertEqual(201, resp.status_code, resp.content)

            cluster_locations = set(c['location'] for c in running_clusters)
            if len(cluster_locations) > 1:
                # The canary job should run in the non-draining location
                self.logger.info(f'There are > 1 cluster locations under test: {cluster_locations}')
                util.wait_for_instance(self.cook_url, job_uuid, status='success', indent=None)
            else:
                self.logger.info(f'There is only 1 cluster location under test: {cluster_locations}')

            # The checkpoint job should be waiting
            util.wait_for_instance(self.cook_url, checkpoint_job_uuid, status='failed', indent=None)
            util.wait_for_job_in_statuses(self.cook_url, checkpoint_job_uuid, ['waiting'])
        finally:
            # Revert all clusters in the instance's location to state = running
            with admin:
                for cluster in running_clusters:
                    if cluster['location'] == checkpoint_location:
                        cluster_update = dict(cluster)
                        # Set state = running
                        cluster_update['state'] = 'running'
                        cluster_update['state-locked?'] = False
                        # The location, cluster-definition, and features fields cannot be sent in the update
                        cluster_update.pop('location', None)
                        cluster_update.pop('cluster-definition', None)
                        cluster_update.pop('features', None)
                        self.logger.info(f'Trying to update cluster to running: {cluster_update}')
                        util.wait_until(
                            lambda: util.update_compute_cluster(self.cook_url, cluster_update)[1],
                            lambda response: response.status_code == 201 and len(response.json()) > 0)
                    else:
                        self.logger.info(
                            f'Not updating cluster - not in location {checkpoint_location}: {cluster}')

        # Wait for the checkpoint job to be running again, in the same location as before
        checkpoint_instance = util.wait_for_instance(self.cook_url,
                                                     checkpoint_job_uuid,
                                                     status='running',
                                                     indent=None)
        self.assertEqual(
            checkpoint_location,
            next(c['location'] for c in running_clusters
                 if c['name'] == checkpoint_instance['compute-cluster']['name']))
    finally:
        # Kill the checkpoint job to not leave it running
        util.kill_jobs(self.cook_url, [checkpoint_job_uuid])