def terminate_job(cluster, job, log_write_url=None, girder_token=None):
    script_filepath = None
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        with get_connection(girder_token, cluster) as conn:
            if AbstractQueueAdapter.QUEUE_JOB_ID in job:
                queue_adapter = get_queue_adapter(cluster, conn)
                output = queue_adapter.terminate_job(job)
            else:
                r = requests.patch(status_url, headers=headers,
                                   json={'status': JobState.TERMINATED})
                check_status(r)

            if 'onTerminate' in job:
                commands = '\n'.join(job['onTerminate']['commands']) + '\n'
                commands = Template(commands) \
                    .render(cluster=cluster,
                            job=job,
                            base_url=cumulus.config.girder.baseUrl)

                on_terminate = _put_script(conn, commands + '\n')
                terminate_output = '%s.terminate.out' % job_id
                terminate_cmd = 'nohup %s &> %s &\n' % (on_terminate,
                                                        terminate_output)
                terminate_cmd = _put_script(conn, terminate_cmd)
                output = conn.execute(terminate_cmd)

                conn.remove(on_terminate)
                conn.remove(terminate_cmd)

                if len(output) != 1:
                    raise Exception('PID not returned by execute command')

                try:
                    pid = int(output[0])
                except ValueError:
                    raise Exception('Unable to extract PID from: %s' % output)

                output_message = 'onTerminate error: %s'
                monitor_process.delay(cluster, job, pid, terminate_output,
                                      log_write_url=log_write_url,
                                      output_message=output_message,
                                      girder_token=girder_token)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
    finally:
        if script_filepath and os.path.exists(script_filepath):
            os.remove(script_filepath)
def setUp(self):
    self._cluster_connection = mock.MagicMock()
    self._adapter = get_queue_adapter({
        'config': {
            'scheduler': {
                'type': QueueType.SLURM
            }
        },
        'type': 'trad'
    }, self._cluster_connection)
def setUp(self):
    self._cluster_connection = mock.MagicMock()
    self._adapter = get_queue_adapter({
        'config': {
            'scheduler': {
                'type': QueueType.PBS
            }
        },
        'type': 'trad'
    }, self._cluster_connection)
def _monitor_jobs(task, cluster, jobs, log_write_url=None, girder_token=None,
                  monitor_interval=5):
    headers = {'Girder-Token': girder_token}
    cluster_url = '%s/clusters/%s' % (cumulus.config.girder.baseUrl,
                                      cluster['_id'])
    try:
        with get_connection(girder_token, cluster) as conn:
            try:
                job_queue_states \
                    = get_queue_adapter(cluster, conn).job_statuses(jobs)

                new_states = set()
                for (job, state) in job_queue_states:
                    job_id = job['_id']
                    # First get the current status
                    status_url = '%s/jobs/%s/status' % (
                        cumulus.config.girder.baseUrl, job_id)
                    r = requests.get(status_url, headers=headers)
                    check_status(r)
                    current_status = r.json()['status']

                    if current_status == JobState.TERMINATED:
                        continue

                    job_status = from_string(current_status, task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = job_status.next(state)
                    job['status'] = str(job_status)
                    job_status.run()
                    json = {
                        'status': str(job_status),
                        'timings': job.get('timings', {}),
                        'output': job['output']
                    }
                    job_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl,
                                              job['_id'])
                    r = requests.patch(job_url, headers=headers, json=json)
                    check_status(r)

                    new_states.add(job['status'])

                # Now see if we still have jobs to monitor
                running_states = set([
                    JobState.CREATED,
                    JobState.QUEUED,
                    JobState.RUNNING,
                    JobState.TERMINATING
                ])

                # Do we have any job still in a running state?
                if new_states & running_states:
                    task.retry(countdown=monitor_interval)
            except EOFError:
                # Try again
                task.retry(countdown=5)
                return
            except paramiko.ssh_exception.NoValidConnectionsError:
                # Try again
                task.retry(countdown=5)
                return
    # Ensure that the Retry exception will get through
    except Retry:
        raise
    except paramiko.ssh_exception.NoValidConnectionsError as ex:
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(cluster_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_cluster_logger(cluster, girder_token).exception(str(ex))
        raise
def submit_job(cluster, job, log_write_url=None, girder_token=None,
               monitor=True):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # if terminating break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception('Unable to fetch users home directory.')

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                if slots == -1:
                    slots = int(get_queue_adapter(cluster, conn)
                                .number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.makedirs(job_dir)

            # put the script to master
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job,
                                                              script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the jobs progress
            if monitor:
                monitor_job.s(
                    cluster, job, log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        headers = {'Girder-Token': girder_token}
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
def test_unsupported(self):
    with self.assertRaises(Exception) as cm:
        get_queue_adapter({'config': {'scheduler': {'type': 'foo'}}}, None)

    self.assertIsNotNone(cm.exception)
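# The test_unsupported case above relies on get_queue_adapter raising when the
# scheduler type in the cluster config is not recognised. Below is a minimal,
# illustrative sketch of that dispatch pattern, not the cumulus implementation:
# the class names (_SgeAdapterSketch, _SlurmAdapterSketch, _PbsAdapterSketch),
# the lowercase type keys, and the 'sge' default are all assumptions made for
# the example.


class _BaseAdapterSketch(object):
    def __init__(self, cluster, cluster_connection):
        self._cluster = cluster
        self._conn = cluster_connection


class _SgeAdapterSketch(_BaseAdapterSketch):
    pass


class _SlurmAdapterSketch(_BaseAdapterSketch):
    pass


class _PbsAdapterSketch(_BaseAdapterSketch):
    pass


_ADAPTER_SKETCHES = {
    'sge': _SgeAdapterSketch,
    'slurm': _SlurmAdapterSketch,
    'pbs': _PbsAdapterSketch
}


def get_queue_adapter_sketch(cluster, cluster_connection):
    # Look up the scheduler type from the cluster configuration, falling back
    # to 'sge' when no scheduler section is present (an assumed default).
    scheduler_type = cluster.get('config', {}) \
        .get('scheduler', {}).get('type', 'sge')

    if scheduler_type not in _ADAPTER_SKETCHES:
        # Unknown types raise, which is what test_unsupported asserts.
        raise Exception('Unsupported scheduler type: %s' % scheduler_type)

    return _ADAPTER_SKETCHES[scheduler_type](cluster, cluster_connection)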