def _tail_output(self):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl,
                                  self.job['_id'])
    log = get_post_logger(self.job['_id'], self.girder_token, job_url)

    # Do we need to tail any output files?
    for output in self.job.get('output', []):
        if 'tail' in output and output['tail']:
            path = output['path']
            offset = 0
            if 'content' in output:
                offset = len(output['content'])
            else:
                output['content'] = []
            tail_path = os.path.join(self.job['dir'], path)
            # tail -n +K prints from line K, so start one line past what
            # we already have to avoid re-reading the last fetched line
            command = 'tail -n +%d %s' % (offset + 1, tail_path)
            try:
                # Only tail if the file exists
                if self.conn.isfile(tail_path):
                    stdout = self.conn.execute(command)
                    output['content'] = output['content'] + stdout
                else:
                    log.info('Skipping tail of %s as file doesn\'t '
                             'currently exist' % tail_path)
            except Exception as ex:
                get_job_logger(self.job,
                               self.girder_token).exception(str(ex))
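# To make the job document shape concrete, here is a minimal sketch of an
# output entry that opts in to tailing. The field names come from the code
# above; the id, directory, and file names are hypothetical. 'content'
# accumulates the lines fetched so far, so its length doubles as the resume
# offset for the next tail.
job = {
    '_id': 'abc123',                          # hypothetical job id
    'dir': '/home/user/jobs/abc123',          # hypothetical remote job dir
    'output': [
        {'path': 'sim.log', 'tail': True},    # tailed incrementally
        {'path': 'result.vtk'},               # not tailed
    ],
}

for output in job['output']:
    if output.get('tail'):
        offset = len(output.get('content', []))
        # The next fetch starts on the line after the ones we already have
        print('tail -n +%d %s/%s' % (offset + 1, job['dir'], output['path']))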
def upload_job_output(cluster, job, log_write_url=None, job_dir=None,
                      girder_token=None):
    job_name = job['name']
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)

    log.info('Uploading output for "%s"' % job_name)

    if parse('output.itemId').find(job):
        upload_job_output_to_item(cluster, job, log_write_url=log_write_url,
                                  job_dir=job_dir, girder_token=girder_token)
    else:
        upload_job_output_to_folder(cluster, job,
                                    log_write_url=log_write_url,
                                    job_dir=job_dir,
                                    girder_token=girder_token)
def download_job_input(cluster, job, log_write_url=None, girder_token=None):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)

    # Create the job directory on the cluster
    with get_connection(girder_token, cluster) as conn:
        conn.makedirs(job_directory(cluster, job))

    log.info('Downloading input for "%s"' % job['name'])

    if parse('input.itemId').find(job):
        download_job_input_items(cluster, job, log_write_url=log_write_url,
                                 girder_token=girder_token)
    else:
        download_job_input_folders(cluster, job,
                                   log_write_url=log_write_url,
                                   girder_token=girder_token)
def upload_job_output_to_folder(cluster, job, log_write_url=None,
                                job_dir=None, girder_token=None):
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job['_id'])
    headers = {'Girder-Token': girder_token}
    assetstore_base_url = get_assetstore_url_base(cluster)
    assetstore_id = get_assetstore_id(girder_token, cluster)
    if not job_dir:
        job_dir = job['dir']

    try:
        with get_connection(girder_token, cluster) as conn:
            for output in job['output']:
                if 'folderId' in output and 'path' in output:
                    folder_id = output['folderId']
                    path = os.path.join(job_dir, output['path'])
                    download_path(conn, girder_token, folder_id, path,
                                  assetstore_base_url, assetstore_id)
    except HttpError as e:
        job['status'] = JobState.ERROR
        url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
        logger = get_post_logger('job', girder_token, url)
        logger.exception(e.responseText)
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.ERROR})
        check_status(r)

    if _get_on_complete(job) == 'terminate':
        cluster_log_url = '%s/clusters/%s/log' % \
            (cumulus.config.girder.baseUrl, cluster['_id'])
        command.send_task(
            'cumulus.tasks.cluster.terminate_cluster',
            args=(cluster,),
            kwargs={'log_write_url': cluster_log_url,
                    'girder_token': girder_token})

    # If we were uploading, move the job to the complete state
    if job['status'] == JobState.UPLOADING:
        job_status = from_string(job['status'], task=None, cluster=cluster,
                                 job=job, log_write_url=log_write_url,
                                 girder_token=girder_token, conn=conn)
        job_status = Complete(job_status)
        job_status = job_status.next(JobQueueState.COMPLETE)
        job_status.run()
        r = requests.patch(status_url, headers=headers,
                           json={'status': str(job_status)})
        check_status(r)
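# A hedged sketch of the output spec consumed by upload_job_output_to_folder;
# the ids and paths are hypothetical. Only entries carrying both 'folderId'
# and 'path' are uploaded, so the log entry below is skipped by this task
# (it is handled by tailing instead).
import os

job = {
    '_id': 'abc123',
    'dir': '/home/user/jobs/abc123',
    'output': [
        {'folderId': '5f0c2a77', 'path': 'results/output.vtk'},  # uploaded
        {'path': 'stdout.log', 'tail': True},                    # skipped
    ],
}

for output in job['output']:
    if 'folderId' in output and 'path' in output:
        print('upload %s -> folder %s' % (
            os.path.join(job['dir'], output['path']), output['folderId']))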
def next(self, job_queue_status):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl,
                                  self.job['_id'])
    log = get_post_logger(self.job['_id'], self.girder_token, job_url)
    job_name = self.job['name']

    if 'runningTime' in self.job:
        running_time = time.time() - self.job['runningTime']
        # Use setdefault so the timing is actually recorded on the job;
        # get() with a default would write into a throwaway dict
        self.job.setdefault('timings', {})['running'] \
            = int(round(running_time * 1000))
        del self.job['runningTime']

    # Fire off task to upload the output
    log.info('Job "%s" complete' % job_name)

    # If the job has no output to upload, move straight to Complete
    if 'output' in self.job and len(self.job['output']) == 0:
        return Complete(self)

    return self
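# A quick illustration of the timing bookkeeping above: 'runningTime' holds
# the epoch timestamp recorded when the job started running, and it is
# folded into 'timings' as integer milliseconds. The values here are
# hypothetical.
import time

job = {'runningTime': time.time() - 12.345}  # pretend it started 12.345s ago

running_time = time.time() - job['runningTime']
job.setdefault('timings', {})['running'] = int(round(running_time * 1000))
del job['runningTime']

print(job['timings'])  # {'running': 12345} (approximately)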
def __init__(self):
    super(CallbackModule, self).__init__()
    self.current_task = None
    self.current_play = None
    self.logger = get_post_logger('cumulus_log', self.girder_token,
                                  self.log_write_url)
def monitor_process(task, cluster, job, pid, nohup_out_path,
                    log_write_url=None, on_complete=None,
                    output_message='Job download/upload error: %s',
                    girder_token=None):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # If the job is terminating, break out
        if _is_terminating(job, girder_token):
            return

        with get_connection(girder_token, cluster) as conn:
            # See if the process is still running
            output = conn.execute('ps %s | grep %s' % (pid, pid),
                                  ignore_exit_status=True,
                                  source_profile=False)

            if len(output) > 0:
                # Process is still running, so schedule self again in about
                # 5 secs
                # N.B. throw=False to prevent Retry exception being raised
                task.retry(throw=False, countdown=5)
            else:
                try:
                    nohup_out_file_name = os.path.basename(nohup_out_path)

                    # Log the output
                    with conn.get(nohup_out_path) as fp:
                        output = fp.read()
                        if output.strip():
                            log.error(output_message % output)
                            # If we have output then set the error state on
                            # the job and return
                            r = requests.patch(
                                status_url, headers=headers,
                                json={'status': JobState.ERROR})
                            check_status(r)
                            return
                finally:
                    if nohup_out_file_name and \
                            os.path.exists(nohup_out_file_name):
                        os.remove(nohup_out_file_name)

                # Fire off the on_complete task if we have one
                if on_complete:
                    signature(on_complete).delay()

                # If we were uploading, move the job to the complete state
                if job['status'] == JobState.UPLOADING:
                    job_status = from_string(job['status'], task=task,
                                             cluster=cluster, job=job,
                                             log_write_url=log_write_url,
                                             girder_token=girder_token,
                                             conn=conn)
                    job_status = Complete(job_status)
                    job_status = job_status.next(JobQueueState.COMPLETE)
                    job_status.run()
                    r = requests.patch(status_url, headers=headers,
                                       json={'status': str(job_status)})
                    check_status(r)
    except EOFError:
        # Try again
        task.retry(throw=False, countdown=5)
    except Exception as ex:
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
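# The liveness test above hinges on 'ps <pid> | grep <pid>' printing a line
# only while the process exists (the ps header contains no digits matching
# the pid, so grep filters it out). A minimal local sketch of the same
# check, using subprocess instead of the remote cluster connection, which
# is an assumption made purely for illustration:
import os
import subprocess

def process_alive(pid):
    out = subprocess.run('ps %s | grep %s' % (pid, pid), shell=True,
                         capture_output=True, text=True).stdout
    return len(out.strip()) > 0

print(process_alive(os.getpid()))   # True: this interpreter is running
print(process_alive(2 ** 22 - 1))   # almost certainly False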
def submit_job(cluster, job, log_write_url=None, girder_token=None,
               monitor=True):
    job_url = '%s/jobs/%s/log' % (cumulus.config.girder.baseUrl, job['_id'])
    log = get_post_logger(job['_id'], girder_token, job_url)
    headers = {'Girder-Token': girder_token}
    job_id = job['_id']
    status_url = '%s/jobs/%s' % (cumulus.config.girder.baseUrl, job_id)

    try:
        # If the job is terminating, break out
        if _is_terminating(job, girder_token):
            return

        script_name = job['name']

        with get_connection(girder_token, cluster) as conn:
            job_params = {}
            if 'params' in job:
                job_params = job['params']

            output = conn.execute('pwd')
            if len(output) != 1:
                raise Exception('Unable to fetch user\'s home directory.')

            user_home = output[0].strip()
            job_dir = job_directory(cluster, job, user_home=user_home)
            job['dir'] = job_dir

            slots = -1

            # Try job parameters first
            slots = int(job_params.get('numberOfSlots', slots))

            if slots == -1:
                # Try the cluster
                slots = int(cluster['config'].get('numberOfSlots', slots))

            parallel_env = _get_parallel_env(cluster, job)
            if parallel_env:
                job_params['parallelEnvironment'] = parallel_env

                # If the number of slots has not been provided we will get
                # the number of slots from the parallel environment
                if slots == -1:
                    slots = int(get_queue_adapter(cluster, conn)
                                .number_of_slots(parallel_env))
                    if slots > 0:
                        job_params['numberOfSlots'] = slots

            script = _generate_submission_script(job, cluster, job_params)

            conn.makedirs(job_dir)

            # Put the script on the master
            conn.put(StringIO(script), os.path.join(job_dir, script_name))

            if slots > -1:
                log.info('We have %s slots available' % slots)

            # Now submit the job
            queue_job_id = get_queue_adapter(cluster, conn).submit_job(
                job, script_name)

            # Update the state and the queue job id
            job[AbstractQueueAdapter.QUEUE_JOB_ID] = queue_job_id
            patch_data = {
                'status': JobState.QUEUED,
                AbstractQueueAdapter.QUEUE_JOB_ID: queue_job_id,
                'dir': job_dir
            }

            r = requests.patch(status_url, headers=headers, json=patch_data)
            check_status(r)
            job = r.json()
            job['queuedTime'] = time.time()

            # Now monitor the job's progress
            if monitor:
                monitor_job.s(
                    cluster, job, log_write_url=log_write_url,
                    girder_token=girder_token).apply_async(countdown=5)

        # Now update the status of the job
        headers = {'Girder-Token': girder_token}
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.QUEUED})
        check_status(r)
    except Exception as ex:
        traceback.print_exc()
        r = requests.patch(status_url, headers=headers,
                           json={'status': JobState.UNEXPECTEDERROR})
        check_status(r)
        get_job_logger(job, girder_token).exception(str(ex))
        raise
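# submit_job is driven as a Celery task elsewhere in cumulus; below is a
# hedged sketch of a typical invocation. The import path is assumed, and
# the cluster/job documents are stripped down to illustrative fields only.
from cumulus.tasks.job import submit_job  # assumed import path

cluster = {'_id': 'c1', 'config': {'numberOfSlots': 4}}   # hypothetical
job = {'_id': 'j1', 'name': 'my_job.sh', 'params': {}}    # hypothetical

submit_job.delay(cluster, job, log_write_url=None,
                 girder_token='<girder-token>', monitor=True)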