def _tail_output(self):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl,
                                  self.job['_id'])
    log = get_post_logger(self.job['_id'], self.girder_token, job_url)

    # Do we need to tail any output files?
    for output in self.job.get('output', []):
        if 'tail' in output and output['tail']:
            path = output['path']
            offset = 0
            if 'content' in output:
                offset = len(output['content'])
            else:
                output['content'] = []
            tail_path = os.path.join(self.job['dir'], path)
            # Start one line past what we have already fetched (tail's
            # +N addressing is 1-based, so +offset would refetch the
            # last line we already have).
            command = 'tail -n +%d %s' % (offset + 1, tail_path)
            try:
                # Only tail if the file exists
                if self.conn.isfile(tail_path):
                    stdout = self.conn.execute(command)
                    output['content'] = output['content'] + stdout
                else:
                    log.info('Skipping tail of %s as file doesn\'t '
                             'currently exist' % tail_path)
            except Exception as ex:
                get_job_logger(self.job,
                               self.girder_token).exception(str(ex))
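# A hedged illustration (not part of the module): the loop above expects
# each entry in job['output'] to carry a 'path' relative to the job
# directory and an opt-in 'tail' flag; 'content' accumulates the lines
# fetched so far and doubles as the offset for the next tail. A job
# document that opts a log file into tailing might look like:
#
#     job = {
#         '_id': '55c3a698f6571016dd3b6d2b',   # hypothetical id
#         'dir': '/home/user/55c3a698f6571016dd3b6d2b',
#         'output': [{'path': 'run.log', 'tail': True}],
#     }
#
# After the first pass output['content'] holds the fetched lines, so the
# next invocation only tails lines beyond that point.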
def terminate_job(cluster, job, log_write_url=None, girder_token=None):
    script_filepath = None
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        with get_connection(girder_token, cluster) as conn:
            if AbstractQueueAdapter.QUEUE_JOB_ID in job:
                queue_adapter = get_queue_adapter(cluster, conn)
                output = queue_adapter.terminate_job(job)
            else:
                r = requests.patch(status_url, headers=headers,
                                   json={'status': JobState.TERMINATED})
                check_status(r)

            if 'onTerminate' in job:
                commands = '\n'.join(job['onTerminate']['commands']) + '\n'
                commands = Template(commands) \
                    .render(cluster=cluster,
                            job=job,
                            base_url=cumulus.config.girder.baseUrl)
                on_terminate = _put_script(conn, commands + '\n')

                terminate_output = '%s.terminate.out' % job_id
                terminate_cmd = 'nohup %s &> %s &\n' % (on_terminate,
                                                        terminate_output)
                terminate_cmd = _put_script(conn, terminate_cmd)
                output = conn.execute(terminate_cmd)

                conn.remove(on_terminate)
                conn.remove(terminate_cmd)

                if len(output) != 1:
                    raise Exception('PID not returned by execute command')

                try:
                    pid = int(output[0])
                except ValueError:
                    raise Exception('Unable to extract PID from: %s'
                                    % output)

                output_message = 'onTerminate error: %s'
                monitor_process.delay(cluster, job, pid, terminate_output,
                                      log_write_url=log_write_url,
                                      output_message=output_message,
                                      girder_token=girder_token)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
    finally:
        if script_filepath and os.path.exists(script_filepath):
            os.remove(script_filepath)
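# A hedged sketch of the pattern used above and in the tasks below:
# _put_script is assumed to upload its content to the remote host as an
# executable script and return the remote path. The wrapper backgrounds
# the real command under nohup and is expected to print exactly one line,
# the PID of the background process (presumably via 'echo $!'), which is
# why the callers insist on len(output) == 1 before parsing it with int().
#
#     # Hypothetical wrapper content as executed on the cluster:
#     #   nohup ./<on_terminate_script> &> <job_id>.terminate.out &
#     #   echo $!
#
# monitor_process then polls that PID and reports anything written to the
# nohup output file back to Girder.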
def download_job_input_items(cluster, job, log_write_url=None,
                             girder_token=None):
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(fp, os.path.basename(path))

            r = requests.patch(status_url, json={'status': 'downloading'},
                               headers=headers)
            check_status(r)

            download_cmd = 'python girderclient.py --token %s --url "%s" ' \
                           'download --dir %s --job %s' \
                           % (girder_token, cumulus.config.girder.baseUrl,
                              job_directory(cluster, job), job_id)

            download_output = '%s.download.out' % job_id
            download_cmd = 'nohup %s &> %s &\n' % (download_cmd,
                                                   download_output)

            download_cmd = _put_script(conn, download_cmd)
            output = conn.execute(download_cmd)

            # Remove download script
            conn.remove(download_cmd)

            if len(output) != 1:
                raise Exception('PID not returned by execute command')

            try:
                pid = int(output[0])
            except ValueError:
                raise Exception('Unable to extract PID from: %s' % output)

            # When the download is complete, submit the job
            on_complete = submit_job.s(cluster, job,
                                       log_write_url=log_write_url,
                                       girder_token=girder_token)

            monitor_process.delay(cluster, job, pid, download_output,
                                  log_write_url=log_write_url,
                                  on_complete=on_complete,
                                  girder_token=girder_token)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': 'error'})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
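# A hedged usage note: these functions are assumed to be Celery tasks, so
# submit_job.s(...) above builds a signature that monitor_process fires
# with signature(on_complete).delay() once the remote download process
# exits cleanly. Kicking off the download-then-submit pipeline might look
# like:
#
#     download_job_input_items.delay(cluster, job,
#                                    log_write_url=log_write_url,
#                                    girder_token=girder_token)
#
# where cluster and job are the Girder documents for the target cluster
# and job, and girder_token authenticates the status PATCH requests.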
def monitor_process(task, cluster, job, pid, nohup_out_path,
                    log_write_url=None, on_complete=None,
                    output_message='Job download/upload error: %s',
                    girder_token=None):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # If the job is terminating, break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # See if the process is still running
            output = conn.execute('ps %s | grep %s' % (pid, pid),
                                  ignore_exit_status=True,
                                  source_profile=False)

            if len(output) > 0:
                # Process is still running, so schedule ourselves to run
                # again in about 5 seconds.
                # N.B. throw=False to prevent the Retry exception being
                # raised.
                task.retry(throw=False, countdown=5)
            else:
                # Guard the finally block below in case basename raises
                nohup_out_file_name = None
                try:
                    nohup_out_file_name = os.path.basename(nohup_out_path)

                    # Log the output
                    with conn.get(nohup_out_path) as fp:
                        output = fp.read()
                        if output.strip():
                            log.error(output_message % output)
                            # If we have output then set the error state
                            # on the job and return
                            r = requests.patch(
                                status_url, headers=headers,
                                json={'status': JobState.ERROR})
                            check_status(r)
                            return
                finally:
                    if nohup_out_file_name and \
                            os.path.exists(nohup_out_file_name):
                        os.remove(nohup_out_file_name)

                # Fire off the on_complete task if we have one
                if on_complete:
                    signature(on_complete).delay()

                # If we were uploading, move the job to the complete state
                if job['status'] == JobState.UPLOADING:
                    job_status = from_string(job['status'], task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = Complete(job_status)
                    job_status = job_status.next(JobQueueState.COMPLETE)
                    job_status.run()
                    r = requests.patch(status_url, headers=headers,
                                       json={'status': str(job_status)})
                    check_status(r)
    except EOFError:
        # Try again
        task.retry(throw=False, countdown=5)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
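# A hedged note on the polling loop above: monitor_process is assumed to
# be a bound Celery task (hence the task argument), so
# task.retry(throw=False, countdown=5) re-enqueues the same task in about
# five seconds without raising celery.exceptions.Retry in the caller. The
# liveness check is a plain 'ps <pid> | grep <pid>'; any output means the
# process still exists and the task keeps rescheduling itself. A caller
# typically starts the loop with something like:
#
#     monitor_process.delay(cluster, job, pid, '%s.download.out' % job_id,
#                           girder_token=girder_token)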
def upload_job_output_to_item(cluster, job, log_write_url=None, job_dir=None,
                              girder_token=None):
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # If the job is terminating, break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # First put girder client on master
            path = inspect.getsourcefile(cumulus.girderclient)
            with open(path, 'r') as fp:
                conn.put(
                    fp,
                    os.path.normpath(
                        os.path.join(job_dir, '..',
                                     os.path.basename(path))))

            cmds = ['cd %s' % job_dir]
            upload_cmd = 'python ../girderclient.py --token %s --url "%s" ' \
                         'upload --job %s' \
                         % (girder_token, cumulus.config.girder.baseUrl,
                            job['_id'])

            upload_output = '%s.upload.out' % job_id
            upload_output_path = os.path.normpath(
                os.path.join(job_dir, '..', upload_output))
            cmds.append('nohup %s &> ../%s &\n' % (upload_cmd,
                                                   upload_output))

            upload_cmd = _put_script(conn, '\n'.join(cmds))
            output = conn.execute(upload_cmd)

            # Remove upload script
            conn.remove(upload_cmd)

            if len(output) != 1:
                raise Exception('PID not returned by execute command')

            try:
                pid = int(output[0])
            except ValueError:
                raise Exception('Unable to extract PID from: %s' % output)

            on_complete = None
            if _get_on_complete(job) == 'terminate':
                cluster_log_url = '%s/clusters/%s/log' % \
                    (cumulus.config.girder.baseUrl, cluster['_id'])
                on_complete = signature(
                    'cumulus.tasks.cluster.terminate_cluster',
                    args=(cluster,),
                    kwargs={'log_write_url': cluster_log_url,
                            'girder_token': girder_token})

            monitor_process.delay(cluster, job, pid, upload_output_path,
                                  log_write_url=log_write_url,
                                  on_complete=on_complete,
                                  girder_token=girder_token)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
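# A hedged illustration: _get_on_complete is assumed to read an
# 'onComplete'-style field from the job document, so a job that should
# tear down its cluster once its output has been uploaded might carry:
#
#     job = {
#         # ... usual job fields ...
#         'onComplete': 'terminate',   # hypothetical field layout
#     }
#
# in which case the upload is chained to
# cumulus.tasks.cluster.terminate_cluster via the Celery signature above.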
def submit_job(cluster, job, log_write_url=None, girder_token=None,
               monitor=True):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # If the job is terminating, break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception('Unable to fetch user\'s home directory.')

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided, get the
                # number of slots from the parallel environment
                if slots == -1:
                    slots = int(get_queue_adapter(cluster, conn)
                                .number_of_slots(parallel_env))

            if slots > 0:
                job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.makedirs(job_dir)

            # Put the script on the master node
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id \
                = get_queue_adapter(cluster, conn).submit_job(job,
                                                              script_name)

            # Update the state and queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the job's progress
            if monitor:
                monitor_job.s(
                    cluster, job, log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

            # Now update the status of the job
            headers = {'Girder-Token': girder_token}
            r = requests.patch(status_url, headers=headers,
                               json={'status': JobState.QUEUED})
            check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
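# A hedged illustration of the slot-resolution order implemented above:
# job['params']['numberOfSlots'] wins, then
# cluster['config']['numberOfSlots'], and only if neither is set is the
# queue adapter asked for the slot count of the parallel environment.
#
#     job = {'name': 'sim', 'params': {'numberOfSlots': 16}}  # used first
#     cluster = {'config': {'numberOfSlots': 32}}              # fallback
#
# Both documents here are hypothetical minimal shapes; real documents
# also carry ids, connection details, and the commands to run.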