Example #1
0
    def sync(self, job):
        """!
        @brief Poll Grid Engine for job completion.

        Runs the qacct command for `job.pid` over SSH. If that command exits
        with anything other than EXIT_OK, the job is treated as not yet
        completed and a new poll is scheduled. Otherwise the job exit code is
        read from the qacct output, stdout and stderr are fetched over SFTP,
        job metrics are submitted to statsd, the job status is set to COMPLETE
        or FAILED, the output is logged, and the remote job files are removed.

        @param job The job to poll. Its `pid`, `stdout_path`, `stderr_path`
            and `submit_script_path` attributes are read here.
        @returns False if the job has not completed yet, otherwise None.
        @throws eva.exceptions.RetryException if the SSH connection or the
            qacct command fails, or if stdout/stderr cannot be retrieved.
        """

        # Errors treated as transient for SFTP file operations.
        sftp_errors = SSH_RETRY_EXCEPTIONS + (IOError,)

        # Create SSH connection
        try:
            self.ensure_ssh_connection(job)
        except SSH_RETRY_EXCEPTIONS as e:
            raise eva.exceptions.RetryException(e)

        # Poll for job completion
        check_command = self.create_qacct_command(job.pid)
        job.logger.debug('Running: %s', check_command)
        try:
            exit_code, stdout, _stderr = self.execute_ssh_command(check_command)
        except SSH_RETRY_EXCEPTIONS as e:
            raise eva.exceptions.RetryException(e)
        if exit_code != EXIT_OK:
            job.logger.debug('Job %d has not completed yet.', job.pid)
            job.set_next_poll_time(QACCT_CHECK_INTERVAL_MSECS)
            return False
        job.exit_code = get_exit_code_from_qacct_output(stdout)

        # Retrieve stdout and stderr
        try:
            with self.sftp_client.open(job.stdout_path, 'r') as f:
                job.stdout = eva.executor.strip_stdout_newlines(f.readlines())
            with self.sftp_client.open(job.stderr_path, 'r') as f:
                job.stderr = eva.executor.strip_stdout_newlines(f.readlines())
        except sftp_errors as e:
            # Keep the underlying error in the message so that the reason for
            # the retry is visible to whoever handles the exception.
            raise eva.exceptions.RetryException(
                'Unable to retrieve stdout and stderr from finished '
                'Grid Engine job: %s' % e
            )

        # Submit job metrics. This is done only after the output has been
        # fetched, which is the last step here that can raise a
        # RetryException.
        # NOTE(review): presumably a RetryException makes sync() run again
        # for the same job, in which case submitting earlier would record the
        # same metrics more than once -- confirm against the caller.
        stats = parse_qacct_metrics(stdout.splitlines())
        for metric, value in stats['metrics'].items():
            self.statsd.timing(metric, value, stats['tags'])

        # Set job exit status
        if job.exit_code == EXIT_OK:
            job.set_status(eva.job.COMPLETE)
        else:
            job.set_status(eva.job.FAILED)

        # Print stdout and stderr
        eva.executor.log_stdout_stderr(job, job.stdout, job.stderr)

        # Remove job script, stdout, and stderr caches. Each file is removed
        # independently so that one failure does not leave the remaining
        # files behind; failures are logged but never fail the job.
        for path in (job.submit_script_path, job.stdout_path, job.stderr_path):
            try:
                self.sftp_client.unlink(path)
            except sftp_errors as e:
                job.logger.warning('Could not remove %s: %s', path, e)
Example #2
0
    def execute_async(self, job):
        """!
        @brief Execute a job on Grid Engine.

        First asks qstat whether a Grid Engine job with this job's unique id
        already exists. If it does, the existing JOB_ID is adopted and nothing
        is submitted. Otherwise `job.command` is written to a submit script
        over SFTP and handed to qsub, and the first poll is scheduled.

        In both cases `job.pid`, `job.stdout_path`, `job.stderr_path` and
        `job.submit_script_path` are set, and the job status becomes STARTED.

        @param job The job to execute. Its `id` and `command` attributes are
            read here.
        @throws eva.exceptions.RetryException if the SSH connection or an SSH
            command fails, if qsub exits with anything other than EXIT_OK, or
            if qsub prints no output to read the JOB_ID from.
        """

        # Create SSH connection
        try:
            self.ensure_ssh_connection(job)
        except SSH_RETRY_EXCEPTIONS as e:
            raise eva.exceptions.RetryException(e)

        # Check whether a GridEngine task is already running for this job. If
        # it is, we skip submitting the job and jump right to the qacct polling.
        job.logger.info('Querying if job is already running.')
        job_id = create_job_unique_id(self.group_id, job.id)
        command = 'qstat -j %s' % job_id
        try:
            exit_code, stdout, _stderr = self.execute_ssh_command(command)
        except SSH_RETRY_EXCEPTIONS as e:
            raise eva.exceptions.RetryException(e)

        already_running = exit_code == EXIT_OK
        if already_running:
            job.pid = get_job_id_from_qstat_output(stdout)
            job.logger.warning('Job is already running with JOB_ID %d, will not submit a new job.', job.pid)
            job.set_status(eva.job.STARTED)
        else:
            job.logger.info('Job is not running, continuing with submission.')

        # Generate paths. These are needed even when nothing is submitted,
        # because sync() reads them when the job completes.
        job.stdout_path = self.create_job_filename(job_id, 'stdout')
        job.stderr_path = self.create_job_filename(job_id, 'stderr')
        job.submit_script_path = self.create_job_filename(job_id, 'sh')

        # Skip submitting the job if it already exists
        if already_running:
            return

        # Create a submit script
        # NOTE(review): sync() also treats IOError from SFTP operations as
        # retryable; confirm whether the same should apply here.
        try:
            with self.sftp_client.open(job.submit_script_path, 'w') as submit_script:
                submit_script.write(job.command)
        except SSH_RETRY_EXCEPTIONS as e:
            raise eva.exceptions.RetryException(e)

        # Print the job script to the log
        eva.executor.log_job_script(job)

        # Submit the job using qsub
        qsub_args = [
            'qsub',
            '-N', job_id,
            '-b', 'n',
            '-sync', 'n',
            '-o', job.stdout_path,
            '-e', job.stderr_path,
        ]

        # Run jobs in a specified queue
        queue = self.env['EVA_GRIDENGINE_QUEUE']
        if queue:
            qsub_args += ['-q', queue]

        qsub_args.append(job.submit_script_path)

        command = ' '.join(qsub_args)
        job.logger.info('Submitting job to GridEngine: %s', command)

        # Execute command asynchronously
        try:
            exit_code, stdout, _stderr = self.execute_ssh_command(command)
        except SSH_RETRY_EXCEPTIONS as e:
            raise eva.exceptions.RetryException(e)
        if exit_code != EXIT_OK:
            raise eva.exceptions.RetryException(
                'Failed to submit the job to GridEngine, exit code %d' %
                exit_code
            )

        # The JOB_ID is read from the first line of the qsub output. With no
        # output there is nothing to parse; raising RetryException is safe
        # because on re-entry the qstat check above picks up the job if it
        # was in fact submitted.
        output_lines = eva.executor.get_std_lines(stdout)
        if not output_lines:
            raise eva.exceptions.RetryException(
                'qsub exited successfully but printed no output, '
                'unable to determine JOB_ID'
            )
        job.pid = get_job_id_from_qsub_output(output_lines[0])
        job.logger.info('Job has been submitted, JOB_ID = %d', job.pid)
        job.set_status(eva.job.STARTED)
        job.set_next_poll_time(QACCT_CHECK_INTERVAL_MSECS)