def sync(self, job):
    """!
    @brief Poll Grid Engine for job completion.

    Runs the qacct command for the job's Grid Engine id over SSH. If qacct
    exits with a code other than EXIT_OK, the job is considered not yet
    completed and the next poll is scheduled. Otherwise the job's exit
    code, stdout and stderr are collected, the accounting metrics are sent
    to statsd, the job status is set to COMPLETE or FAILED, and the remote
    job script, stdout and stderr files are removed (best effort).

    @param job The job to poll. Reads `pid`, `stdout_path`, `stderr_path`
               and `submit_script_path`; writes `exit_code`, `stdout`,
               `stderr` and the job status.
    @returns False if the job has not completed yet; otherwise no explicit
             value (None).
    @throws eva.exceptions.RetryException if connecting or running the
            qacct command fails with one of SSH_RETRY_EXCEPTIONS, or if
            stdout/stderr cannot be read from the remote host.
    """

    # Create SSH connection
    try:
        self.ensure_ssh_connection(job)
    except SSH_RETRY_EXCEPTIONS as e:
        raise eva.exceptions.RetryException(e)

    # Poll for job completion
    check_command = self.create_qacct_command(job.pid)
    try:
        job.logger.debug('Running: %s', check_command)
        exit_code, stdout, stderr = self.execute_ssh_command(check_command)
    except SSH_RETRY_EXCEPTIONS as e:
        raise eva.exceptions.RetryException(e)
    if exit_code != EXIT_OK:
        job.logger.debug('Job %d has not completed yet.', job.pid)
        job.set_next_poll_time(QACCT_CHECK_INTERVAL_MSECS)
        return False

    job.exit_code = get_exit_code_from_qacct_output(stdout)

    # Retrieve stdout and stderr
    try:
        with self.sftp_client.open(job.stdout_path, 'r') as f:
            job.stdout = eva.executor.strip_stdout_newlines(f.readlines())
        with self.sftp_client.open(job.stderr_path, 'r') as f:
            job.stderr = eva.executor.strip_stdout_newlines(f.readlines())
    except SSH_RETRY_EXCEPTIONS + (IOError,):
        raise eva.exceptions.RetryException(
            'Unable to retrieve stdout and stderr from finished Grid Engine job.'
        )

    # Submit job metrics. This happens after the output retrieval, which is
    # the last step in this method that explicitly raises RetryException, so
    # that a retried sync() does not send the same timings to statsd again.
    stats = parse_qacct_metrics(stdout.splitlines())
    for metric, value in stats['metrics'].items():
        self.statsd.timing(metric, value, stats['tags'])

    # Set job exit status
    if job.exit_code == EXIT_OK:
        job.set_status(eva.job.COMPLETE)
    else:
        job.set_status(eva.job.FAILED)

    # Print stdout and stderr
    eva.executor.log_stdout_stderr(job, job.stdout, job.stderr)

    # Remove job script, stdout, and stderr caches. Cleanup is best effort:
    # each file is removed independently so that a failure on one of them
    # does not leave the remaining files behind on the remote host.
    for path in (job.submit_script_path, job.stdout_path, job.stderr_path):
        try:
            self.sftp_client.unlink(path)
        except SSH_RETRY_EXCEPTIONS + (IOError,) as e:
            job.logger.warning('Could not remove job file %s: %s', path, e)
def execute_async(self, job):
    """!
    @brief Execute a job on Grid Engine.

    Queries qstat for a task named after this job's unique id. If one is
    found, its Grid Engine id is stored in `job.pid`, the job is marked as
    STARTED and nothing new is submitted. Otherwise the job command is
    written to a submit script over SFTP and handed to qsub, after which
    the job is marked as STARTED and the first poll is scheduled.

    In both cases `job.stdout_path`, `job.stderr_path` and
    `job.submit_script_path` are set.

    @param job The job to run. Reads `id` and `command`.
    @returns Nothing (None).
    @throws eva.exceptions.RetryException if an SSH/SFTP operation fails
            with one of SSH_RETRY_EXCEPTIONS, or if qsub exits with a code
            other than EXIT_OK.
    """

    # Make sure the SSH connection is available
    try:
        self.ensure_ssh_connection(job)
    except SSH_RETRY_EXCEPTIONS as exc:
        raise eva.exceptions.RetryException(exc)

    # A previous submission of this job may still be known to Grid Engine.
    # In that case we must not submit it a second time; polling with qacct
    # takes over from here.
    job.logger.info('Querying if job is already running.')
    unique_id = create_job_unique_id(self.group_id, job.id)
    qstat_command = 'qstat -j %s' % unique_id
    already_running = False
    try:
        qstat_status, qstat_out, _ = self.execute_ssh_command(qstat_command)
        already_running = qstat_status == 0
        if not already_running:
            job.logger.info('Job is not running, continuing with submission.')
        else:
            job.pid = get_job_id_from_qstat_output(qstat_out)
            job.logger.warning('Job is already running with JOB_ID %d, will not submit a new job.', job.pid)
            job.set_status(eva.job.STARTED)
    except SSH_RETRY_EXCEPTIONS as exc:
        raise eva.exceptions.RetryException(exc)

    # Remote file locations derived from the unique job id
    job.stdout_path = self.create_job_filename(unique_id, 'stdout')
    job.stderr_path = self.create_job_filename(unique_id, 'stderr')
    job.submit_script_path = self.create_job_filename(unique_id, 'sh')

    # Nothing more to do when the job already exists on Grid Engine.
    # NOTE(review): no next poll time is set on this path.
    if already_running:
        return

    # Upload the job command as the submit script
    try:
        with self.sftp_client.open(job.submit_script_path, 'w') as script_file:
            script_file.write(job.command)
    except SSH_RETRY_EXCEPTIONS as exc:
        raise eva.exceptions.RetryException(exc)

    # Print the job script to the log
    eva.executor.log_job_script(job)

    # Assemble the qsub invocation
    qsub_args = [
        'qsub',
        '-N', unique_id,
        '-b', 'n',
        '-sync', 'n',
        '-o', job.stdout_path,
        '-e', job.stderr_path,
    ]

    # An optional queue restriction comes from the environment configuration
    queue = self.env['EVA_GRIDENGINE_QUEUE']
    if queue:
        qsub_args.extend(['-q', queue])

    qsub_args.append(job.submit_script_path)

    qsub_command = ' '.join(qsub_args)
    job.logger.info('Submitting job to GridEngine: %s', qsub_command)

    # Submit, then record the Grid Engine job id reported on the first
    # line of the qsub output
    try:
        qsub_status, qsub_out, _ = self.execute_ssh_command(qsub_command)
        if qsub_status != EXIT_OK:
            raise eva.exceptions.RetryException(
                'Failed to submit the job to GridEngine, exit code %d' % qsub_status
            )
        first_line = eva.executor.get_std_lines(qsub_out)[0]
        job.pid = get_job_id_from_qsub_output(first_line)
        job.logger.info('Job has been submitted, JOB_ID = %d', job.pid)
        job.set_status(eva.job.STARTED)
        job.set_next_poll_time(QACCT_CHECK_INTERVAL_MSECS)
    except SSH_RETRY_EXCEPTIONS as exc:
        raise eva.exceptions.RetryException(exc)